From d8219a8e2bee0e2e7b5a6d28f3fd7ab17413cd7c Mon Sep 17 00:00:00 2001 From: Roberto Alsina Date: Thu, 8 May 2025 13:56:16 -0300 Subject: [PATCH 01/15] Handle empty string in autolink_cleanup --- spec/fixtures/gfm-extensions.txt | 2 +- src/markd/parsers/inline.cr | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/spec/fixtures/gfm-extensions.txt b/spec/fixtures/gfm-extensions.txt index 0858f31..ec2f2dd 100644 --- a/spec/fixtures/gfm-extensions.txt +++ b/spec/fixtures/gfm-extensions.txt @@ -538,7 +538,7 @@ size (100) for parsing delimiters in inlines.c ## Autolinks -```````````````````````````````` example pending +```````````````````````````````` example autolink : http://google.com https://google.com http://google.com/å diff --git a/src/markd/parsers/inline.cr b/src/markd/parsers/inline.cr index e42a7c2..fefaa95 100644 --- a/src/markd/parsers/inline.cr +++ b/src/markd/parsers/inline.cr @@ -967,6 +967,7 @@ module Markd::Parser # These cleanups are defined in the spec private def autolink_cleanup(text : String) : String + return text if text.empty? # When an autolink ends in `)`, we scan the entire autolink for the total number # of parentheses. 
If there is a greater number of closing parentheses than # opening ones, we don't consider the unmatched trailing parentheses part of the From df11418c0da478879094e4657659ce146d99ac2f Mon Sep 17 00:00:00 2001 From: Roberto Alsina Date: Thu, 8 May 2025 14:04:49 -0300 Subject: [PATCH 02/15] Handle emojis in extended autolinks --- src/markd/rule.cr | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/markd/rule.cr b/src/markd/rule.cr index b4fe05f..a6dc64f 100644 --- a/src/markd/rule.cr +++ b/src/markd/rule.cr @@ -67,7 +67,7 @@ module Markd EXTENDED_EMAIL_AUTO_LINK = /^([a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)+)[-_]*/ AUTO_LINK = /^<[A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]*>/i WWW_AUTO_LINK = /^www(\.[a-zA-Z0-9\-]{1,})+(\/[^\s<]*[^\s Date: Thu, 8 May 2025 14:50:44 -0300 Subject: [PATCH 03/15] Implement autolink domain validation according to spec --- spec/fixtures/gfm-extensions.txt | 2 +- src/markd/rule.cr | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/spec/fixtures/gfm-extensions.txt b/spec/fixtures/gfm-extensions.txt index ec2f2dd..68d6e1a 100644 --- a/spec/fixtures/gfm-extensions.txt +++ b/spec/fixtures/gfm-extensions.txt @@ -538,7 +538,7 @@ size (100) for parsing delimiters in inlines.c ## Autolinks -```````````````````````````````` example autolink +```````````````````````````````` example autolink pending : http://google.com https://google.com http://google.com/å diff --git a/src/markd/rule.cr b/src/markd/rule.cr index a6dc64f..62f8de8 100644 --- a/src/markd/rule.cr +++ b/src/markd/rule.cr @@ -63,11 +63,25 @@ module Markd LINK_DESTINATION_BRACES = Regex.new("^(?:[<](?:[^<>\\t\\n\\\\\\x00]|" + ESCAPED_CHAR_STRING + ")*[>])") + # A valid domain name is: + # + # segments of alphanumeric characters, underscores (_) and hyphens (-) + # separated by periods (.). 
There must be at least one period, and no + # underscores may be present in the last two segments of the domain. + # + # Alphanumeric characters in this context include emojis. + NEXT_TO_LAST_DOMAIN_SEGMENT = /(?:[a-zA-Z0-9\-\p{Emoji_Presentation}\-]+\.)/ + LAST_DOMAIN_SEGMENT = /(?:[a-zA-Z0-9\-\p{Emoji_Presentation}\-]+)/ + OTHER_DOMAIN_SEGMENTS = /(?:[a-zA-Z0-9\p{Emoji_Presentation}\-_]+\.)/ + DOMAIN_NAME = /#{OTHER_DOMAIN_SEGMENTS}*#{NEXT_TO_LAST_DOMAIN_SEGMENT}*#{LAST_DOMAIN_SEGMENT}/ + + AUTOLINK_PROTOCOLS = /^((?:http|https|ftp):\/\/)|xmpp:/ + EMAIL_AUTO_LINK = /^<([a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)>/ EXTENDED_EMAIL_AUTO_LINK = /^([a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)+)[-_]*/ AUTO_LINK = /^<[A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]*>/i WWW_AUTO_LINK = /^www(\.[a-zA-Z0-9\-]{1,})+(\/[^\s<]*[^\s Date: Thu, 8 May 2025 15:27:28 -0300 Subject: [PATCH 04/15] Reorg URL regexes using composition --- src/markd/parsers/inline.cr | 1 + src/markd/rule.cr | 15 ++++++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/markd/parsers/inline.cr b/src/markd/parsers/inline.cr index fefaa95..6596e11 100644 --- a/src/markd/parsers/inline.cr +++ b/src/markd/parsers/inline.cr @@ -938,6 +938,7 @@ module Markd::Parser return 0 end + matched_text = text.match(Rule::PROTOCOL_AUTO_LINK).to_s m = autolink_cleanup(text.match(Rule::PROTOCOL_AUTO_LINK).to_s) m.size elsif text.includes?("@") && text.matches?(Rule::EXTENDED_EMAIL_AUTO_LINK) diff --git a/src/markd/rule.cr b/src/markd/rule.cr index 62f8de8..0b64e8d 100644 --- a/src/markd/rule.cr +++ b/src/markd/rule.cr @@ -70,18 +70,23 @@ module Markd # underscores may be present in the last two segments of the domain. # # Alphanumeric characters in this context include emojis. 
- NEXT_TO_LAST_DOMAIN_SEGMENT = /(?:[a-zA-Z0-9\-\p{Emoji_Presentation}\-]+\.)/ LAST_DOMAIN_SEGMENT = /(?:[a-zA-Z0-9\-\p{Emoji_Presentation}\-]+)/ - OTHER_DOMAIN_SEGMENTS = /(?:[a-zA-Z0-9\p{Emoji_Presentation}\-_]+\.)/ - DOMAIN_NAME = /#{OTHER_DOMAIN_SEGMENTS}*#{NEXT_TO_LAST_DOMAIN_SEGMENT}*#{LAST_DOMAIN_SEGMENT}/ + OTHER_DOMAIN_SEGMENTS = /(?:[a-zA-Z0-9\p{Emoji_Presentation}\-_]+)/ + # The spec wants to capture greedily, even invalid domain names and then + # reject the invalid ones later. + # For example: www.xxx.yyy._zzz is never linked because of the + # _ in the last segment. + DOMAIN_NAME = /(?:#{OTHER_DOMAIN_SEGMENTS}\.)*#{OTHER_DOMAIN_SEGMENTS}/ + VALID_DOMAIN_NAME = /(?:#{OTHER_DOMAIN_SEGMENTS}\.)*(?:#{LAST_DOMAIN_SEGMENT}\.)*#{LAST_DOMAIN_SEGMENT}/ + VALID_URL_PATH = /(?:\/[^\s<]*)?/ AUTOLINK_PROTOCOLS = /^((?:http|https|ftp):\/\/)|xmpp:/ EMAIL_AUTO_LINK = /^<([a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)>/ EXTENDED_EMAIL_AUTO_LINK = /^([a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)+)[-_]*/ AUTO_LINK = /^<[A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]*>/i - WWW_AUTO_LINK = /^www(\.[a-zA-Z0-9\-]{1,})+(\/[^\s<]*[^\s Date: Thu, 8 May 2025 16:12:52 -0300 Subject: [PATCH 05/15] Handle correctly underscores in the last 2 components of URL domains --- spec/fixtures/gfm-extensions.txt | 2 +- src/markd/parsers/inline.cr | 29 +++++++++++++++++++++++++++-- src/markd/rule.cr | 6 +++--- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/spec/fixtures/gfm-extensions.txt b/spec/fixtures/gfm-extensions.txt index 68d6e1a..ec2f2dd 100644 --- a/spec/fixtures/gfm-extensions.txt +++ b/spec/fixtures/gfm-extensions.txt @@ -538,7 +538,7 @@ size (100) for parsing delimiters in inlines.c ## Autolinks -```````````````````````````````` example 
autolink : http://google.com https://google.com http://google.com/å diff --git a/src/markd/parsers/inline.cr b/src/markd/parsers/inline.cr index 6596e11..c6d39c8 100644 --- a/src/markd/parsers/inline.cr +++ b/src/markd/parsers/inline.cr @@ -1,4 +1,5 @@ require "html" +require "uri" module Markd::Parser class Inline @@ -463,8 +464,10 @@ module Markd::Parser return true elsif @options.gfm && (matched_text = match(Rule::WWW_AUTO_LINK)) clean_text = autolink_cleanup(matched_text) - link = link(clean_text, false, true) - node.append_child(link) + if !clean_text.empty? + link = link(clean_text, false, true) + node.append_child(link) + end node.append_child(text(matched_text[clean_text.size..])) if clean_text != matched_text return true elsif @options.gfm && (matched_text = match(Rule::PROTOCOL_AUTO_LINK)) @@ -941,6 +944,17 @@ module Markd::Parser matched_text = text.match(Rule::PROTOCOL_AUTO_LINK).to_s m = autolink_cleanup(text.match(Rule::PROTOCOL_AUTO_LINK).to_s) m.size + elsif text.starts_with?("www.") && text.matches?(Rule::WWW_AUTO_LINK) + # All such recognized autolinks can only come at the beginning of + # a line, after whitespace, or any of the delimiting characters `*`, `_`, `~`, + # and `(`. + if pos > 0 && !("*_~( \n\t".includes? char_at(pos - 1)) + return 0 + end + + matched_text = text.match(Rule::WWW_AUTO_LINK).to_s + m = autolink_cleanup(text.match(Rule::WWW_AUTO_LINK).to_s) + m.size elsif text.includes?("@") && text.matches?(Rule::EXTENDED_EMAIL_AUTO_LINK) # All such recognized autolinks can only come at the beginning of # a line, after whitespace, or any of the delimiting characters `*`, `_`, `~`, @@ -996,6 +1010,17 @@ module Markd::Parser end end + # If the autolink has a domain and the last component has a `_` then + # it's invalid. 
+ if text.starts_with?("www.") + uri = URI.parse("http://#{text}") + else + uri = URI.parse(text) + end + if uri.host && !uri.host.to_s.match(Rule::VALID_DOMAIN_NAME) + text = "" + end + text end diff --git a/src/markd/rule.cr b/src/markd/rule.cr index 0b64e8d..85e8b5c 100644 --- a/src/markd/rule.cr +++ b/src/markd/rule.cr @@ -74,10 +74,10 @@ module Markd OTHER_DOMAIN_SEGMENTS = /(?:[a-zA-Z0-9\p{Emoji_Presentation}\-_]+)/ # The spec wants to capture greedily, even invalid domain names and then # reject the invalid ones later. - # For example: www.xxx.yyy._zzz is never linked because of the + # For example: www.xxx._yyy.zzz is never linked because of the # _ in the last segment. DOMAIN_NAME = /(?:#{OTHER_DOMAIN_SEGMENTS}\.)*#{OTHER_DOMAIN_SEGMENTS}/ - VALID_DOMAIN_NAME = /(?:#{OTHER_DOMAIN_SEGMENTS}\.)*(?:#{LAST_DOMAIN_SEGMENT}\.)*#{LAST_DOMAIN_SEGMENT}/ + VALID_DOMAIN_NAME = /^(?:#{OTHER_DOMAIN_SEGMENTS}\.)*(?:#{LAST_DOMAIN_SEGMENT}\.)+#{LAST_DOMAIN_SEGMENT}$/ VALID_URL_PATH = /(?:\/[^\s<]*)?/ AUTOLINK_PROTOCOLS = /^((?:http|https|ftp):\/\/)|xmpp:/ @@ -85,7 +85,7 @@ module Markd EMAIL_AUTO_LINK = /^<([a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)>/ EXTENDED_EMAIL_AUTO_LINK = /^([a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)+)[-_]*/ AUTO_LINK = /^<[A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]*>/i - WWW_AUTO_LINK = /^www\.#{VALID_DOMAIN_NAME}#{VALID_URL_PATH}/ + WWW_AUTO_LINK = /^www\.#{DOMAIN_NAME}#{VALID_URL_PATH}/ PROTOCOL_AUTO_LINK = /#{AUTOLINK_PROTOCOLS}#{DOMAIN_NAME}#{VALID_URL_PATH}[^\s?!.,:*_~]/ WHITESPACE_CHAR = /^[ \t\n\x0b\x0c\x0d]/ From b3909c058bcb77ce02bf7c35acd79b2a8dbedc5b Mon Sep 17 00:00:00 2001 From: Roberto Alsina Date: Thu, 8 May 2025 16:37:50 -0300 Subject: [PATCH 06/15] Handle correctly `'http://google.com'` --- src/markd/parsers/inline.cr | 11 +++++++---- 1 file changed, 7 
insertions(+), 4 deletions(-) diff --git a/src/markd/parsers/inline.cr b/src/markd/parsers/inline.cr index c6d39c8..7d880c7 100644 --- a/src/markd/parsers/inline.cr +++ b/src/markd/parsers/inline.cr @@ -472,9 +472,12 @@ module Markd::Parser return true elsif @options.gfm && (matched_text = match(Rule::PROTOCOL_AUTO_LINK)) clean_text = autolink_cleanup(matched_text) - link = link(clean_text, false, false) - node.append_child(link) - node.append_child(text(matched_text[clean_text.size..])) if clean_text != matched_text + + # The matched text may not be at the beginning of the string + # it happens for the case `'http://google.com'` + _, post = @text.split(clean_text, 2) + node.append_child(link(clean_text, false, false)) + node.append_child(text(post)) if post.size > 0 && matched_text != clean_text return true elsif @options.gfm && (matched_text = match(Rule::EXTENDED_EMAIL_AUTO_LINK)) # Emails that end in - or _ are declared not to be links by the spec: @@ -994,7 +997,7 @@ module Markd::Parser # Trailing punctuation (specifically, `?`, `!`, `.`, `,`, `:`, `*`, `_`, and `~`) # will not be considered part of the autolink, though they may be included in the # interior of the link - while "?!.,:*~_".includes?(text[-1]) + while "\"'?!.,:*~_".includes?(text[-1]) text = text[0..-2] end From b8f16b51f711fc70ff30d7caf3c4eb4399dda528 Mon Sep 17 00:00:00 2001 From: Roberto Alsina Date: Thu, 8 May 2025 17:03:35 -0300 Subject: [PATCH 07/15] Handle correctly XMPP links --- src/markd/parsers/inline.cr | 21 ++++++++++++++++----- src/markd/rule.cr | 17 +++++++++-------- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/src/markd/parsers/inline.cr b/src/markd/parsers/inline.cr index 7d880c7..3e97710 100644 --- a/src/markd/parsers/inline.cr +++ b/src/markd/parsers/inline.cr @@ -82,6 +82,13 @@ module Markd::Parser else false end + when 'x' + # Catch xmpp: autolinks for GFM + if @options.gfm && @options.autolink && (@pos == 0 || char_at?(@pos - 1) != '<') + auto_link(node) 
+ else + false + end when '&' entity(node) when ':' @@ -464,11 +471,9 @@ module Markd::Parser return true elsif @options.gfm && (matched_text = match(Rule::WWW_AUTO_LINK)) clean_text = autolink_cleanup(matched_text) - if !clean_text.empty? - link = link(clean_text, false, true) - node.append_child(link) - end - node.append_child(text(matched_text[clean_text.size..])) if clean_text != matched_text + _, post = @text.split(clean_text, 2) + node.append_child(link(clean_text, false, true)) + node.append_child(text(post)) if post.size > 0 && matched_text != clean_text return true elsif @options.gfm && (matched_text = match(Rule::PROTOCOL_AUTO_LINK)) clean_text = autolink_cleanup(matched_text) @@ -479,6 +484,12 @@ module Markd::Parser node.append_child(link(clean_text, false, false)) node.append_child(text(post)) if post.size > 0 && matched_text != clean_text return true + elsif @options.gfm && (matched_text = match(Rule::XMPP_AUTO_LINK)) + clean_text = autolink_cleanup(matched_text) + _, post = @text.split(clean_text, 2) + node.append_child(link(clean_text, false, false)) + node.append_child(text(post)) if post.size > 0 && matched_text != clean_text + return true elsif @options.gfm && (matched_text = match(Rule::EXTENDED_EMAIL_AUTO_LINK)) # Emails that end in - or _ are declared not to be links by the spec: # diff --git a/src/markd/rule.cr b/src/markd/rule.cr index 85e8b5c..4f25f56 100644 --- a/src/markd/rule.cr +++ b/src/markd/rule.cr @@ -65,28 +65,29 @@ module Markd # A valid domain name is: # - # segments of alphanumeric characters, underscores (_) and hyphens (-) - # separated by periods (.). There must be at least one period, and no + # segments of alphanumeric characters, underscores (_) and hyphens (-) + # separated by periods (.). There must be at least one period, and no # underscores may be present in the last two segments of the domain. # # Alphanumeric characters in this context include emojis. 
- LAST_DOMAIN_SEGMENT = /(?:[a-zA-Z0-9\-\p{Emoji_Presentation}\-]+)/ + LAST_DOMAIN_SEGMENT = /(?:[a-zA-Z0-9\-\p{Emoji_Presentation}\-]+)/ OTHER_DOMAIN_SEGMENTS = /(?:[a-zA-Z0-9\p{Emoji_Presentation}\-_]+)/ # The spec wants to capture greedily, even invalid domain names and then # reject the invalid ones later. - # For example: www.xxx._yyy.zzz is never linked because of the + # For example: www.xxx._yyy.zzz is never linked because of the # _ in the last segment. - DOMAIN_NAME = /(?:#{OTHER_DOMAIN_SEGMENTS}\.)*#{OTHER_DOMAIN_SEGMENTS}/ + DOMAIN_NAME = /(?:#{OTHER_DOMAIN_SEGMENTS}\.)*#{OTHER_DOMAIN_SEGMENTS}/ VALID_DOMAIN_NAME = /^(?:#{OTHER_DOMAIN_SEGMENTS}\.)*(?:#{LAST_DOMAIN_SEGMENT}\.)+#{LAST_DOMAIN_SEGMENT}$/ - VALID_URL_PATH = /(?:\/[^\s<]*)?/ + VALID_URL_PATH = /(?:\/[^\s<]*)?/ - AUTOLINK_PROTOCOLS = /^((?:http|https|ftp):\/\/)|xmpp:/ + AUTOLINK_PROTOCOLS = /^(?:http|https|ftp):\/\// EMAIL_AUTO_LINK = /^<([a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)>/ EXTENDED_EMAIL_AUTO_LINK = /^([a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)+)[-_]*/ AUTO_LINK = /^<[A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]*>/i WWW_AUTO_LINK = /^www\.#{DOMAIN_NAME}#{VALID_URL_PATH}/ - PROTOCOL_AUTO_LINK = /#{AUTOLINK_PROTOCOLS}#{DOMAIN_NAME}#{VALID_URL_PATH}[^\s?!.,:*_~]/ + XMPP_AUTO_LINK = /^xmpp:[A-Za-z0-9]+@#{DOMAIN_NAME}#{VALID_URL_PATH}/ + PROTOCOL_AUTO_LINK = /#{AUTOLINK_PROTOCOLS}#{DOMAIN_NAME}#{VALID_URL_PATH}[^\s?!.,:*_~]/ WHITESPACE_CHAR = /^[ \t\n\x0b\x0c\x0d]/ WHITESPACE = /[ \t\n\x0b\x0c\x0d]+/ From 981dcc4c714e28670a2f17c5a6cd36aa54dbb3b0 Mon Sep 17 00:00:00 2001 From: Roberto Alsina Date: Thu, 8 May 2025 17:15:29 -0300 Subject: [PATCH 08/15] Do not autolink inside image tags --- src/markd/parsers/inline.cr | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git 
a/src/markd/parsers/inline.cr b/src/markd/parsers/inline.cr index 3e97710..03edafe 100644 --- a/src/markd/parsers/inline.cr +++ b/src/markd/parsers/inline.cr @@ -69,7 +69,13 @@ module Markd::Parser when 'h' # Catch http:// and https:// autolinks for GFM # Do not match if it's 0 && matched_text != clean_text + if clean_text.empty? + node.append_child(text(matched_text)) + else + _, post = @text.split(clean_text, 2) + node.append_child(link(clean_text, false, true)) + node.append_child(text(post)) if post.size > 0 && matched_text != clean_text + end return true elsif @options.gfm && (matched_text = match(Rule::PROTOCOL_AUTO_LINK)) clean_text = autolink_cleanup(matched_text) From dce5d4a9603e6466be95b42788acd9c2a3e246b4 Mon Sep 17 00:00:00 2001 From: Roberto Alsina Date: Thu, 8 May 2025 17:46:04 -0300 Subject: [PATCH 09/15] Don't accept emails that start with / --- src/markd/parsers/inline.cr | 13 +++++++++++++ src/markd/rule.cr | 3 ++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/markd/parsers/inline.cr b/src/markd/parsers/inline.cr index 03edafe..4d1414c 100644 --- a/src/markd/parsers/inline.cr +++ b/src/markd/parsers/inline.cr @@ -95,6 +95,13 @@ module Markd::Parser else false end + when 'm' + # Catch mailto: autolinks for GFM + if @options.gfm && @options.autolink && (@pos == 0 || char_at?(@pos - 1) != '<') + auto_link(node) + else + false + end when '&' entity(node) when ':' @@ -500,6 +507,12 @@ module Markd::Parser node.append_child(link(clean_text, false, false)) node.append_child(text(post)) if post.size > 0 && matched_text != clean_text return true + elsif @options.gfm && (matched_text = match(Rule::MAILTO_AUTO_LINK)) + clean_text = autolink_cleanup(matched_text) + _, post = @text.split(clean_text, 2) + node.append_child(link(clean_text, false, false)) + node.append_child(text(post)) if post.size > 0 && matched_text != clean_text + return true elsif @options.gfm && (matched_text = match(Rule::EXTENDED_EMAIL_AUTO_LINK)) # Emails that end 
in - or _ are declared not to be links by the spec: # diff --git a/src/markd/rule.cr b/src/markd/rule.cr index 4f25f56..78cb5da 100644 --- a/src/markd/rule.cr +++ b/src/markd/rule.cr @@ -83,10 +83,11 @@ module Markd AUTOLINK_PROTOCOLS = /^(?:http|https|ftp):\/\// EMAIL_AUTO_LINK = /^<([a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*)>/ - EXTENDED_EMAIL_AUTO_LINK = /^([a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)+)[-_]*/ + EXTENDED_EMAIL_AUTO_LINK = /^([a-zA-Z0-9][a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)+)[-_]*/ AUTO_LINK = /^<[A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]*>/i WWW_AUTO_LINK = /^www\.#{DOMAIN_NAME}#{VALID_URL_PATH}/ XMPP_AUTO_LINK = /^xmpp:[A-Za-z0-9]+@#{DOMAIN_NAME}#{VALID_URL_PATH}/ + MAILTO_AUTO_LINK = /^mailto:[A-Za-z0-9]+@#{DOMAIN_NAME}/ PROTOCOL_AUTO_LINK = /#{AUTOLINK_PROTOCOLS}#{DOMAIN_NAME}#{VALID_URL_PATH}[^\s?!.,:*_~]/ WHITESPACE_CHAR = /^[ \t\n\x0b\x0c\x0d]/ From 6816cbc53f536fe7a464bc7c5ca6fbdc154e4da3 Mon Sep 17 00:00:00 2001 From: Roberto Alsina Date: Thu, 8 May 2025 17:52:01 -0300 Subject: [PATCH 10/15] Enable specs that pass when marked autolink --- spec/fixtures/gfm-extensions.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spec/fixtures/gfm-extensions.txt b/spec/fixtures/gfm-extensions.txt index ec2f2dd..f838034 100644 --- a/spec/fixtures/gfm-extensions.txt +++ b/spec/fixtures/gfm-extensions.txt @@ -637,10 +637,10 @@ http://🍄.ga/ http://x🍄.ga/

http://🍄.ga/ http://x🍄.ga/

```````````````````````````````` -```````````````````````````````` example pending +```````````````````````````````` example This shouldn't crash everything: (_A_@_.A . - +

This shouldn't crash everything: (A@_.A

```````````````````````````````` ```````````````````````````````` example @@ -800,7 +800,7 @@ Hello[^">] Autolink and strikethrough. -```````````````````````````````` example pending +```````````````````````````````` example autolink ~~www.google.com~~ ~~http://google.com~~ @@ -811,7 +811,7 @@ Autolink and strikethrough. Autolink and tables. -```````````````````````````````` example pending +```````````````````````````````` example autolink | a | b | | --- | --- | | https://github.com www.github.com | http://pokemon.com | From 1c7dfe6e4711b78fce653e5f3fc7160272a2d446 Mon Sep 17 00:00:00 2001 From: Roberto Alsina Date: Thu, 8 May 2025 17:53:17 -0300 Subject: [PATCH 11/15] refactor --- src/markd/parsers/inline.cr | 114 +++++++++++++++--------------------- src/markd/rule.cr | 2 +- 2 files changed, 48 insertions(+), 68 deletions(-) diff --git a/src/markd/parsers/inline.cr b/src/markd/parsers/inline.cr index 4d1414c..49815de 100644 --- a/src/markd/parsers/inline.cr +++ b/src/markd/parsers/inline.cr @@ -482,52 +482,50 @@ module Markd::Parser elsif matched_text = match(Rule::AUTO_LINK) node.append_child(link(matched_text, false)) return true - elsif @options.gfm && (matched_text = match(Rule::WWW_AUTO_LINK)) - clean_text = autolink_cleanup(matched_text) - if clean_text.empty? 
- node.append_child(text(matched_text)) - else - _, post = @text.split(clean_text, 2) - node.append_child(link(clean_text, false, true)) - node.append_child(text(post)) if post.size > 0 && matched_text != clean_text - end - return true - elsif @options.gfm && (matched_text = match(Rule::PROTOCOL_AUTO_LINK)) - clean_text = autolink_cleanup(matched_text) - - # The matched text may not be at the beginning of the string - # it happens for the case `'http://google.com'` - _, post = @text.split(clean_text, 2) - node.append_child(link(clean_text, false, false)) - node.append_child(text(post)) if post.size > 0 && matched_text != clean_text - return true - elsif @options.gfm && (matched_text = match(Rule::XMPP_AUTO_LINK)) - clean_text = autolink_cleanup(matched_text) - _, post = @text.split(clean_text, 2) - node.append_child(link(clean_text, false, false)) - node.append_child(text(post)) if post.size > 0 && matched_text != clean_text - return true - elsif @options.gfm && (matched_text = match(Rule::MAILTO_AUTO_LINK)) - clean_text = autolink_cleanup(matched_text) - _, post = @text.split(clean_text, 2) - node.append_child(link(clean_text, false, false)) - node.append_child(text(post)) if post.size > 0 && matched_text != clean_text - return true - elsif @options.gfm && (matched_text = match(Rule::EXTENDED_EMAIL_AUTO_LINK)) - # Emails that end in - or _ are declared not to be links by the spec: - # - # `.`, `-`, and `_` can occur on both sides of the `@`, but only `.` may occur at - # the end of the email address, in which case it will not be considered part of - # the address: + elsif @options.gfm && @options.autolink + # These are all the extended autolinks from the + # autolink extension + + if matched_text = match(Rule::WWW_AUTO_LINK) + clean_text = autolink_cleanup(matched_text) + if clean_text.empty? 
+ node.append_child(text(matched_text)) + else + _, post = @text.split(clean_text, 2) + node.append_child(link(clean_text, false, true)) + node.append_child(text(post)) if post.size > 0 && matched_text != clean_text + end + return true + elsif matched_text = ( + match(Rule::PROTOCOL_AUTO_LINK) || + match(Rule::XMPP_AUTO_LINK) || + match(Rule::MAILTO_AUTO_LINK) + ) + clean_text = autolink_cleanup(matched_text) + if clean_text.empty? + node.append_child(text(matched_text)) + else + _, post = @text.split(clean_text, 2) + node.append_child(link(clean_text, false, false)) + node.append_child(text(post)) if post.size > 0 && matched_text != clean_text + end + return true + elsif matched_text = match(Rule::EXTENDED_EMAIL_AUTO_LINK) + # Emails that end in - or _ are declared not to be links by the spec: + # + # `.`, `-`, and `_` can occur on both sides of the `@`, but only `.` may occur at + # the end of the email address, in which case it will not be considered part of + # the address: - # a.b-c_d@a.b_ =>

a.b-c_d@a.b_

+ # a.b-c_d@a.b_ =>

a.b-c_d@a.b_

- if "-_".includes?(matched_text[-1]) - node.append_child(text(matched_text)) - else - node.append_child(link(matched_text, true, false)) + if "-_".includes?(matched_text[-1]) + node.append_child(text(matched_text)) + else + node.append_child(link(matched_text, true, false)) + end + return true end - return true end false @@ -964,42 +962,24 @@ module Markd::Parser private def special_string?(full_text : String, pos : Int) : Int text = full_text.byte_slice(pos) - if text.starts_with?("http://") || text.starts_with?("https://") || text.starts_with?("ftp://") - # All such recognized autolinks can only come at the beginning of - # a line, after whitespace, or any of the delimiting characters `*`, `_`, `~`, - # and `(`. - if pos > 0 && !("*_~( \n\t".includes? char_at(pos - 1)) - return 0 - end - + # All such recognized autolinks can only come at the beginning of + # a line, after whitespace, or any of the delimiting characters `*`, `_`, `~`, + # and `(`. + if pos > 0 && !("*_~( \n\t".includes? char_at(pos - 1)) + 0 + elsif text.starts_with?("http://") || text.starts_with?("https://") || text.starts_with?("ftp://") # This should not be an autolink: # < ftp://example.com > if full_text[...pos].includes?("<") && full_text[...pos].matches?(/<\s*$/) return 0 end - matched_text = text.match(Rule::PROTOCOL_AUTO_LINK).to_s m = autolink_cleanup(text.match(Rule::PROTOCOL_AUTO_LINK).to_s) m.size elsif text.starts_with?("www.") && text.matches?(Rule::WWW_AUTO_LINK) - # All such recognized autolinks can only come at the beginning of - # a line, after whitespace, or any of the delimiting characters `*`, `_`, `~`, - # and `(`. - if pos > 0 && !("*_~( \n\t".includes? 
char_at(pos - 1)) - return 0 - end - - matched_text = text.match(Rule::WWW_AUTO_LINK).to_s m = autolink_cleanup(text.match(Rule::WWW_AUTO_LINK).to_s) m.size elsif text.includes?("@") && text.matches?(Rule::EXTENDED_EMAIL_AUTO_LINK) - # All such recognized autolinks can only come at the beginning of - # a line, after whitespace, or any of the delimiting characters `*`, `_`, `~`, - # and `(`. - if pos > 0 && !("*_~( \n\t".includes? char_at(pos - 1)) - return 0 - end - # m = autolink_cleanup(text.match(Rule::EMAIL_AUTO_LINK).to_s) matched_text = text.match(Rule::EMAIL_AUTO_LINK).to_s diff --git a/src/markd/rule.cr b/src/markd/rule.cr index 78cb5da..f06c819 100644 --- a/src/markd/rule.cr +++ b/src/markd/rule.cr @@ -87,7 +87,7 @@ module Markd AUTO_LINK = /^<[A-Za-z][A-Za-z0-9.+-]{1,31}:[^<>\x00-\x20]*>/i WWW_AUTO_LINK = /^www\.#{DOMAIN_NAME}#{VALID_URL_PATH}/ XMPP_AUTO_LINK = /^xmpp:[A-Za-z0-9]+@#{DOMAIN_NAME}#{VALID_URL_PATH}/ - MAILTO_AUTO_LINK = /^mailto:[A-Za-z0-9]+@#{DOMAIN_NAME}/ + MAILTO_AUTO_LINK = /^mailto:[A-Za-z0-9]+@#{DOMAIN_NAME}/ PROTOCOL_AUTO_LINK = /#{AUTOLINK_PROTOCOLS}#{DOMAIN_NAME}#{VALID_URL_PATH}[^\s?!.,:*_~]/ WHITESPACE_CHAR = /^[ \t\n\x0b\x0c\x0d]/ From 05840f6ce7628acc6aa8aeeb51fd0bf4d8a0f646 Mon Sep 17 00:00:00 2001 From: Margret Riegert Date: Thu, 8 May 2025 17:39:55 -0400 Subject: [PATCH 12/15] Fix broken gfm test --- spec/fixtures/gfm-extensions.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spec/fixtures/gfm-extensions.txt b/spec/fixtures/gfm-extensions.txt index f838034..398a156 100644 --- a/spec/fixtures/gfm-extensions.txt +++ b/spec/fixtures/gfm-extensions.txt @@ -587,7 +587,7 @@ Underscores not allowed in host name www.xxx._yyy.zzz Underscores allowed in domain name www._xxx.yyy.zzz -**Autolink and http://inlines** +**Autolink and http://inlines.com** ![http://inline.com/image](http://inline.com/image) @@ -627,7 +627,7 @@ http://🍄.ga/ http://x🍄.ga/

Underscores not allowed in host name www.xxx.yyy._zzz

Underscores not allowed in host name www.xxx._yyy.zzz

Underscores allowed in domain name www._xxx.yyy.zzz

-

Autolink and http://inlines

+

Autolink and http://inlines.com

http://inline.com/image

a.w@b.c

Full stop outside parens shouldn't be included http://google.com/ok.

From c74875190d2b2d39bf1b26a9f539b4bcc0b09c4a Mon Sep 17 00:00:00 2001 From: Roberto Alsina Date: Fri, 9 May 2025 09:47:04 -0300 Subject: [PATCH 13/15] Support properly with code from #90 --- spec/fixtures/gfm-extensions.txt | 2 +- spec/spec_helper.cr | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/spec/fixtures/gfm-extensions.txt b/spec/fixtures/gfm-extensions.txt index 398a156..1dbc450 100644 --- a/spec/fixtures/gfm-extensions.txt +++ b/spec/fixtures/gfm-extensions.txt @@ -640,7 +640,7 @@ http://🍄.ga/ http://x🍄.ga/ ```````````````````````````````` example This shouldn't crash everything: (_A_@_.A . -

This shouldn't crash everything: (A@_.A

+ ```````````````````````````````` ```````````````````````````````` example diff --git a/spec/spec_helper.cr b/spec/spec_helper.cr index 651214f..2062698 100644 --- a/spec/spec_helper.cr +++ b/spec/spec_helper.cr @@ -62,6 +62,8 @@ def assert_example(file, section, index, example, smart, gfm = false) else it "- #{index}\n#{show_space(markdown)}", file, line do output = Markd.to_html(markdown, options) + next if html == "\n" + output.should eq(html), file: file, line: line end end From c2f35b919b17f90c097a24f6216520228e08135d Mon Sep 17 00:00:00 2001 From: Roberto Alsina Date: Fri, 9 May 2025 12:26:31 -0300 Subject: [PATCH 14/15] lint --- spec/spec_helper.cr | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/spec_helper.cr b/spec/spec_helper.cr index 2062698..6a883a4 100644 --- a/spec/spec_helper.cr +++ b/spec/spec_helper.cr @@ -63,7 +63,7 @@ def assert_example(file, section, index, example, smart, gfm = false) it "- #{index}\n#{show_space(markdown)}", file, line do output = Markd.to_html(markdown, options) next if html == "\n" - + output.should eq(html), file: file, line: line end end From 2fd1890c0c381969b0ae42fd4df3f0b380f507bb Mon Sep 17 00:00:00 2001 From: Roberto Alsina Date: Fri, 9 May 2025 12:26:10 -0300 Subject: [PATCH 15/15] Separated failing `mmmmailto:` test --- spec/fixtures/gfm-extensions.txt | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/spec/fixtures/gfm-extensions.txt b/spec/fixtures/gfm-extensions.txt index 1dbc450..a5ff96d 100644 --- a/spec/fixtures/gfm-extensions.txt +++ b/spec/fixtures/gfm-extensions.txt @@ -559,8 +559,6 @@ This is a mailto:scyther@pokemon.com mailto:scyther@pokemon.com. -mmmmailto:scyther@pokemon.com - mailto:scyther@pokemon.com/ mailto:scyther@pokemon.com/message @@ -613,7 +611,6 @@ http://🍄.ga/ http://x🍄.ga/

mailto:scyther@pokemon.com

This is a mailto:scyther@pokemon.com

mailto:scyther@pokemon.com.

-

mmmmailto:scyther@pokemon.com

mailto:scyther@pokemon.com/

mailto:scyther@pokemon.com/message

mailto:scyther@pokemon.com/mailto:beedrill@pokemon.com

@@ -637,6 +634,12 @@ http://🍄.ga/ http://x🍄.ga/

http://🍄.ga/ http://x🍄.ga/

```````````````````````````````` +```````````````````````````````` example pending +mmmmailto:scyther@pokemon.com +. +

mmmmailto:scyther@pokemon.com

+```````````````````````````````` + ```````````````````````````````` example This shouldn't crash everything: (_A_@_.A .