about summary refs log tree commit diff
path: root/app/lib/extractor.rb
diff options
context:
space:
mode:
authorClaire <claire.github-309c@sitedethib.com>2022-03-26 19:18:55 +0100
committerClaire <claire.github-309c@sitedethib.com>2022-03-26 19:18:55 +0100
commitaaa9ec340b7291bace3a899cfcfef7524ecdfe72 (patch)
treec4070b5c32efc4b6474bfb0ca5814c5779a5b6f8 /app/lib/extractor.rb
parent2287eebae0c1d699436a8cf3218d7cfe990a3605 (diff)
parentd7d049aab7578028492e73671769f0a350e34203 (diff)
Merge branch 'main' into glitch-soc/merge-upstream
Conflicts:
- `app/lib/formatter.rb`:
  Upstream completely refactored the formatting code and removed that file,
  while glitch-soc had code for Markdown and HTML toots.
  Took upstream code, glitch-soc changes will be re-implemented on top of the
  refactored classes in a later commit.
- `app/models/status.rb`:
  Upstream refactored status edit handling and moved code to
  `app/models/concerns/status_snapshot_concern.rb`.
  Applied glitch-soc's changes to that file.
- `app/serializers/activitypub/note_serializer.rb`:
  Not really a conflict, just a line added too close to one modified by
  glitch-soc.
  Applied upstream changes while keeping the glitch-soc-modified one.
- `app/services/update_status_service.rb`:
  Not really a conflict, upstream modified a line adjacent to one added by
  glitch-soc.
  Applied upstream changes while keeping the glitch-soc line.
- `app/views/statuses/_simple_status.html.haml`:
  Upstream refactored formatting, glitch-soc changed the markup slightly.
  Applied upstream changes.
- `spec/lib/formatter_spec.rb`:
  Upstream completely refactored the formatting code and removed that file,
  while glitch-soc had code for Markdown and HTML toots.
  Took upstream code, glitch-soc changes will be re-implemented on top of the
  refactored classes in a later commit.
Diffstat (limited to 'app/lib/extractor.rb')
-rw-r--r--app/lib/extractor.rb82
1 files changed, 66 insertions, 16 deletions
diff --git a/app/lib/extractor.rb b/app/lib/extractor.rb
index 8020aa916..ef9407864 100644
--- a/app/lib/extractor.rb
+++ b/app/lib/extractor.rb
@@ -5,18 +5,34 @@ module Extractor
 
   module_function
 
-  # :yields: username, list_slug, start, end
+  def extract_entities_with_indices(text, options = {}, &block)
+    entities = begin
+      extract_urls_with_indices(text, options) +
+        extract_hashtags_with_indices(text, check_url_overlap: false) +
+        extract_mentions_or_lists_with_indices(text) +
+        extract_extra_uris_with_indices(text)
+    end
+
+    return [] if entities.empty?
+
+    entities = remove_overlapping_entities(entities)
+    entities.each(&block) if block_given?
+    entities
+  end
+
   def extract_mentions_or_lists_with_indices(text)
-    return [] unless Twitter::TwitterText::Regex[:at_signs].match?(text)
+    return [] unless text && Twitter::TwitterText::Regex[:at_signs].match?(text)
 
     possible_entries = []
 
-    text.to_s.scan(Account::MENTION_RE) do |screen_name, _|
+    text.scan(Account::MENTION_RE) do |screen_name, _|
       match_data = $LAST_MATCH_INFO
-      after = $'
+      after      = $'
+
       unless Twitter::TwitterText::Regex[:end_mention_match].match?(after)
         start_position = match_data.char_begin(1) - 1
-        end_position = match_data.char_end(1)
+        end_position   = match_data.char_end(1)
+
         possible_entries << {
           screen_name: screen_name,
           indices: [start_position, end_position],
@@ -29,36 +45,70 @@ module Extractor
         yield mention[:screen_name], mention[:indices].first, mention[:indices].last
       end
     end
+
     possible_entries
   end
 
-  def extract_hashtags_with_indices(text, **)
-    return [] unless /#/.match?(text)
+  def extract_hashtags_with_indices(text, _options = {})
+    return [] unless text&.index('#')
+
+    possible_entries = []
 
-    tags = []
     text.scan(Tag::HASHTAG_RE) do |hash_text, _|
-      match_data = $LAST_MATCH_INFO
+      match_data     = $LAST_MATCH_INFO
       start_position = match_data.char_begin(1) - 1
-      end_position = match_data.char_end(1)
-      after = $'
+      end_position   = match_data.char_end(1)
+      after          = $'
+
       if %r{\A://}.match?(after)
         hash_text.match(/(.+)(https?\Z)/) do |matched|
-          hash_text = matched[1]
+          hash_text     = matched[1]
           end_position -= matched[2].codepoint_length
         end
       end
 
-      tags << {
+      possible_entries << {
         hashtag: hash_text,
         indices: [start_position, end_position],
       }
     end
 
-    tags.each { |tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last } if block_given?
-    tags
+    if block_given?
+      possible_entries.each do |tag|
+        yield tag[:hashtag], tag[:indices].first, tag[:indices].last
+      end
+    end
+
+    possible_entries
   end
 
   def extract_cashtags_with_indices(_text)
-    [] # always returns empty array
+    []
+  end
+
+  def extract_extra_uris_with_indices(text)
+    return [] unless text&.index(':')
+
+    possible_entries = []
+
+    text.scan(Twitter::TwitterText::Regex[:valid_extended_uri]) do
+      valid_uri_match_data = $LAST_MATCH_INFO
+
+      start_position = valid_uri_match_data.char_begin(3)
+      end_position   = valid_uri_match_data.char_end(3)
+
+      possible_entries << {
+        url: valid_uri_match_data[3],
+        indices: [start_position, end_position],
+      }
+    end
+
+    if block_given?
+      possible_entries.each do |url|
+        yield url[:url], url[:indices].first, url[:indices].last
+      end
+    end
+
+    possible_entries
   end
 end