1 files changed, 72 insertions, 16 deletions
diff --git a/app/lib/extractor.rb b/app/lib/extractor.rb
index 8020aa916..aea60dae5 100644
--- a/app/lib/extractor.rb
+++ b/app/lib/extractor.rb
@@ -1,22 +1,44 @@
 # frozen_string_literal: true
 
 module Extractor
+  MAX_DOMAIN_LENGTH = 253
+
   extend Twitter::TwitterText::Extractor
 
   module_function
 
-  # :yields: username, list_slug, start, end
+  def extract_entities_with_indices(text, options = {}, &block)
+    entities = begin
+      extract_urls_with_indices(text, options) +
+        extract_hashtags_with_indices(text, check_url_overlap: false) +
+        extract_mentions_or_lists_with_indices(text) +
+        extract_extra_uris_with_indices(text)
+    end
+
+    return [] if entities.empty?
+
+    entities = remove_overlapping_entities(entities)
+    entities.each(&block) if block_given?
+    entities
+  end
+
   def extract_mentions_or_lists_with_indices(text)
-    return [] unless Twitter::TwitterText::Regex[:at_signs].match?(text)
+    return [] unless text && Twitter::TwitterText::Regex[:at_signs].match?(text)
 
     possible_entries = []
 
-    text.to_s.scan(Account::MENTION_RE) do |screen_name, _|
+    text.scan(Account::MENTION_RE) do |screen_name, _|
       match_data = $LAST_MATCH_INFO
-      after = $'
+      after      = $'
+
       unless Twitter::TwitterText::Regex[:end_mention_match].match?(after)
+        _, domain = screen_name.split('@')
+
+        next if domain.present? && domain.length > MAX_DOMAIN_LENGTH
+
         start_position = match_data.char_begin(1) - 1
-        end_position = match_data.char_end(1)
+        end_position   = match_data.char_end(1)
+
         possible_entries << {
           screen_name: screen_name,
           indices: [start_position, end_position],
@@ -29,36 +51,70 @@ module Extractor
         yield mention[:screen_name], mention[:indices].first, mention[:indices].last
       end
     end
+
     possible_entries
   end
 
-  def extract_hashtags_with_indices(text, **)
-    return [] unless /#/.match?(text)
+  def extract_hashtags_with_indices(text, _options = {})
+    return [] unless text&.index('#')
+
+    possible_entries = []
 
-    tags = []
     text.scan(Tag::HASHTAG_RE) do |hash_text, _|
-      match_data = $LAST_MATCH_INFO
+      match_data     = $LAST_MATCH_INFO
       start_position = match_data.char_begin(1) - 1
-      end_position = match_data.char_end(1)
-      after = $'
+      end_position   = match_data.char_end(1)
+      after          = $'
+
       if %r{\A://}.match?(after)
         hash_text.match(/(.+)(https?\Z)/) do |matched|
-          hash_text = matched[1]
+          hash_text     = matched[1]
           end_position -= matched[2].codepoint_length
         end
       end
 
-      tags << {
+      possible_entries << {
         hashtag: hash_text,
         indices: [start_position, end_position],
       }
     end
 
-    tags.each { |tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last } if block_given?
-    tags
+    if block_given?
+      possible_entries.each do |tag|
+        yield tag[:hashtag], tag[:indices].first, tag[:indices].last
+      end
+    end
+
+    possible_entries
   end
 
   def extract_cashtags_with_indices(_text)
-    [] # always returns empty array
+    []
+  end
+
+  def extract_extra_uris_with_indices(text)
+    return [] unless text&.index(':')
+
+    possible_entries = []
+
+    text.scan(Twitter::TwitterText::Regex[:valid_extended_uri]) do
+      valid_uri_match_data = $LAST_MATCH_INFO
+
+      start_position = valid_uri_match_data.char_begin(3)
+      end_position   = valid_uri_match_data.char_end(3)
+
+      possible_entries << {
+        url: valid_uri_match_data[3],
+        indices: [start_position, end_position],
+      }
+    end
+
+    if block_given?
+      possible_entries.each do |url|
+        yield url[:url], url[:indices].first, url[:indices].last
+      end
+    end
+
+    possible_entries
   end
 end