about summary refs log tree commit diff
path: root/config/initializers
diff options
context:
space:
mode:
authorClaire <claire.github-309c@sitedethib.com>2021-07-15 15:56:58 +0200
committerGitHub <noreply@github.com>2021-07-15 15:56:58 +0200
commit211d5c3c300b5a54b60c7b0142158144f9b0d392 (patch)
treecc2232be6461db04a473a5765b3ec4f826be6d47 /config/initializers
parent3dcf3f2a3a059191668c4edf395e4ad0b19209c5 (diff)
Fix inefficiencies in auto-linking code (#16506)
The auto-linking code basically rewrote the whole string escaping non-ascii
characters in an inefficient way, and building a full character offset map
between the unescaped and escaped texts before sending the contents to
TwitterText's extractor.

Instead of doing that, this commit changes the TwitterText regexps to include
valid IRI characters in addition to valid URI characters.
Diffstat (limited to 'config/initializers')
-rw-r--r--config/initializers/twitter_regex.rb4
1 files changed, 4 insertions, 0 deletions
diff --git a/config/initializers/twitter_regex.rb b/config/initializers/twitter_regex.rb
index 3ff2aa9e5..84c09ff35 100644
--- a/config/initializers/twitter_regex.rb
+++ b/config/initializers/twitter_regex.rb
@@ -24,6 +24,10 @@ module Twitter::TwitterText
         )
       \)
     /iox
+    REGEXEN[:valid_iri_ucschar] = /[\u{A0}-\u{D7FF}\u{F900}-\u{FDCF}\u{FDF0}-\u{FFEF}\u{10000}-\u{1FFFD}\u{20000}-\u{2FFFD}\u{30000}-\u{3FFFD}\u{40000}-\u{4FFFD}\u{50000}-\u{5FFFD}\u{60000}-\u{6FFFD}\u{70000}-\u{7FFFD}\u{80000}-\u{8FFFD}\u{90000}-\u{9FFFD}\u{A0000}-\u{AFFFD}\u{B0000}-\u{BFFFD}\u{C0000}-\u{CFFFD}\u{D0000}-\u{DFFFD}\u{E1000}-\u{EFFFD}]/iou
+    REGEXEN[:valid_iri_iprivate] = /[\u{E000}-\u{F8FF}\u{F0000}-\u{FFFFD}\u{100000}-\u{10FFFD}]/iou
+    REGEXEN[:valid_url_query_chars] = /(?:#{REGEXEN[:valid_iri_ucschar]})|(?:#{REGEXEN[:valid_iri_iprivate]})|[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]/iou
+    REGEXEN[:valid_url_query_ending_chars] = /(?:#{REGEXEN[:valid_iri_ucschar]})|(?:#{REGEXEN[:valid_iri_iprivate]})|[a-z0-9_&=#\/\-]/iou
     REGEXEN[:valid_url_path] = /(?:
       (?:
         #{REGEXEN[:valid_general_url_path_chars]}*