Add support for linking XMPP URIs in toots (#12709)

* Fix wrong grouping in Twitter valid_url regex * Add support for xmpp URIs Fixes #9776 The difficult part is autolinking, because Twitter-text's extractor does some pretty ad-hoc stuff to find things that “look like” URLs, and XMPP URIs do not really match the assumptions of that lib, so it doesn't sound wise to try to shoehorn it into the existing regex. This is why I used a specific regex (very close, although slightly more permissive than the RFC), and a specific scan function (a simplified version of the generalized one from Twitter). * Remove leading “xmpp:” from auto-linked text
author: ThibG <thib@sitedethib.com> 2020-01-11 02:15:25 +0100
committer: Eugen Rochko <eugen@zeonfederated.com> 2020-01-11 02:15:25 +0100
commit: ea436b355bd844c86a4f4ddfd204b9bf15a1db6c (patch)
tree: c33fd5501bf295573665740481a6d749a57bab35
parent: e9ea09d17398763ead69ca845c7aefebf266d373 (diff)
4 files changed, 70 insertions, 4 deletions
diff --git a/app/lib/formatter.rb b/app/lib/formatter.rb
index 6ba327614..c771dcaaa 100644
--- a/app/lib/formatter.rb
+++ b/app/lib/formatter.rb
@@ -245,8 +245,9 @@ class Formatter
     end
 
     standard = Extractor.extract_entities_with_indices(text, options)
+    xmpp = Extractor.extract_xmpp_uris_with_indices(text, options)
 
-    Extractor.remove_overlapping_entities(special + standard)
+    Extractor.remove_overlapping_entities(special + standard + xmpp)
   end
 
   def link_to_url(entity, options = {})
@@ -284,7 +285,7 @@ class Formatter
 
   def link_html(url)
     url    = Addressable::URI.parse(url).to_s
-    prefix = url.match(/\Ahttps?:\/\/(www\.)?/).to_s
+    prefix = url.match(/\A(https?:\/\/(www\.)?|xmpp:)/).to_s
     text   = url[prefix.length, 30]
     suffix = url[prefix.length + 30..-1]
     cutoff = url[prefix.length..-1].length > 30
diff --git a/app/lib/sanitize_config.rb b/app/lib/sanitize_config.rb
index 77045155e..e2480376e 100644
--- a/app/lib/sanitize_config.rb
+++ b/app/lib/sanitize_config.rb
@@ -2,7 +2,7 @@
 
 class Sanitize
   module Config
-    HTTP_PROTOCOLS ||= ['http', 'https', 'dat', 'dweb', 'ipfs', 'ipns', 'ssb', 'gopher', :relative].freeze
+    HTTP_PROTOCOLS ||= ['http', 'https', 'dat', 'dweb', 'ipfs', 'ipns', 'ssb', 'gopher', 'xmpp', :relative].freeze
 
     CLASS_WHITELIST_TRANSFORMER = lambda do |env|
       node = env[:node]
diff --git a/config/initializers/twitter_regex.rb b/config/initializers/twitter_regex.rb
index 0ddbbee98..87815d458 100644
--- a/config/initializers/twitter_regex.rb
+++ b/config/initializers/twitter_regex.rb
@@ -29,7 +29,7 @@ module Twitter
       (                                                                                     #   $1 total match
         (#{REGEXEN[:valid_url_preceding_chars]})                                            #   $2 Preceding character
         (                                                                                   #   $3 URL
-          ((https?|dat|dweb|ipfs|ipns|ssb|gopher):\/\/)?                                    #   $4 Protocol (optional)
+          ((?:https?|dat|dweb|ipfs|ipns|ssb|gopher):\/\/)?                                  #   $4 Protocol (optional)
           (#{REGEXEN[:valid_domain]})                                                       #   $5 Domain(s)
           (?::(#{REGEXEN[:valid_port_number]}))?                                            #   $6 Port number (optional)
           (/#{REGEXEN[:valid_url_path]}*)?                                                  #   $7 URL Path and anchor
@@ -37,5 +37,54 @@ module Twitter
         )
       )
     }iox
+    REGEXEN[:validate_nodeid] = /(?:
+      #{REGEXEN[:validate_url_unreserved]}|
+      #{REGEXEN[:validate_url_pct_encoded]}|
+      [!$()*+,;=]
+    )/iox
+    REGEXEN[:validate_resid] = /(?:
+      #{REGEXEN[:validate_url_unreserved]}|
+      #{REGEXEN[:validate_url_pct_encoded]}|
+      #{REGEXEN[:validate_url_sub_delims]}
+    )/iox
+    REGEXEN[:valid_xmpp_uri] = %r{
+      (                                                                                     #   $1 total match
+        (#{REGEXEN[:valid_url_preceding_chars]})                                            #   $2 Preceding character
+        (                                                                                   #   $3 URL
+          ((?:xmpp):)                                                                       #   $4 Protocol
+          (//#{REGEXEN[:validate_nodeid]}+@#{REGEXEN[:valid_domain]}/)?                     #   $5 Authority (optional)
+          (#{REGEXEN[:validate_nodeid]}+@)?                                                 #   $6 Username in path (optional)
+          (#{REGEXEN[:valid_domain]})                                                       #   $7 Domain in path
+          (/#{REGEXEN[:validate_resid]}+)?                                                  #   $8 Resource in path (optional)
+          (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? #   $9 Query String
+        )
+      )
+    }iox
+  end
+
+  module Extractor
+    # Extracts a list of all XMPP URIs included in the Tweet <tt>text</tt> along
+    # with the indices. If the <tt>text</tt> is <tt>nil</tt> or contains no
+    # XMPP URIs an empty array will be returned.
+    #
+    # If a block is given then it will be called for each XMPP URI.
+    def extract_xmpp_uris_with_indices(text, options = {}) # :yields: uri, start, end
+      return [] unless text && text.index(":")
+      urls = []
+
+      text.to_s.scan(Twitter::Regex[:valid_xmpp_uri]) do
+        valid_uri_match_data = $~
+
+        start_position = valid_uri_match_data.char_begin(3)
+        end_position = valid_uri_match_data.char_end(3)
+
+        urls << {
+          :url => valid_uri_match_data[3],
+          :indices => [start_position, end_position]
+        }
+      end
+      urls.each{|url| yield url[:url], url[:indices].first, url[:indices].last} if block_given?
+      urls
+    end
   end
 end
diff --git a/spec/lib/formatter_spec.rb b/spec/lib/formatter_spec.rb
index b8108a247..83be0a588 100644
--- a/spec/lib/formatter_spec.rb
+++ b/spec/lib/formatter_spec.rb
@@ -242,6 +242,22 @@ RSpec.describe Formatter do
         is_expected.to include '/tags/hashtag%E3%82%BF%E3%82%B0" class="mention hashtag" rel="tag">#<span>hashtagタグ</span></a>'
       end
     end
+
+    context 'given a stand-alone xmpp: URI' do
+      let(:text) { 'xmpp:user@instance.com' }
+
+      it 'matches the full URI' do
+        is_expected.to include 'href="xmpp:user@instance.com"'
+      end
+    end
+
+    context 'given a an xmpp: URI with a query-string' do
+      let(:text) { 'please join xmpp:muc@instance.com?join right now' }
+
+      it 'matches the full URI' do
+        is_expected.to include 'href="xmpp:muc@instance.com?join"'
+      end
+    end
   end
 
   describe '#format_spoiler' do
author	ThibG <thib@sitedethib.com>	2020-01-11 02:15:25 +0100
committer	Eugen Rochko <eugen@zeonfederated.com>	2020-01-11 02:15:25 +0100
commit	ea436b355bd844c86a4f4ddfd204b9bf15a1db6c (patch)
tree	c33fd5501bf295573665740481a6d749a57bab35
parent	e9ea09d17398763ead69ca845c7aefebf266d373 (diff)