From cefa526c6d3a45df2d0fcb7643ced828e2e87dea Mon Sep 17 00:00:00 2001 From: Eugen Rochko Date: Sat, 26 Mar 2022 02:53:34 +0100 Subject: Refactor formatter (#17828) * Refactor formatter * Move custom emoji pre-rendering logic to view helpers * Move more methods out of Formatter * Fix code style issues * Remove Formatter * Add inline poll options to RSS feeds * Remove unused helper method * Fix code style issues * Various fixes and improvements * Fix test --- app/lib/extractor.rb | 82 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 66 insertions(+), 16 deletions(-) (limited to 'app/lib/extractor.rb') diff --git a/app/lib/extractor.rb b/app/lib/extractor.rb index 8020aa916..ef9407864 100644 --- a/app/lib/extractor.rb +++ b/app/lib/extractor.rb @@ -5,18 +5,34 @@ module Extractor module_function - # :yields: username, list_slug, start, end + def extract_entities_with_indices(text, options = {}, &block) + entities = begin + extract_urls_with_indices(text, options) + + extract_hashtags_with_indices(text, check_url_overlap: false) + + extract_mentions_or_lists_with_indices(text) + + extract_extra_uris_with_indices(text) + end + + return [] if entities.empty? + + entities = remove_overlapping_entities(entities) + entities.each(&block) if block_given? + entities + end + def extract_mentions_or_lists_with_indices(text) - return [] unless Twitter::TwitterText::Regex[:at_signs].match?(text) + return [] unless text && Twitter::TwitterText::Regex[:at_signs].match?(text) possible_entries = [] - text.to_s.scan(Account::MENTION_RE) do |screen_name, _| + text.scan(Account::MENTION_RE) do |screen_name, _| match_data = $LAST_MATCH_INFO - after = $' + after = $' + unless Twitter::TwitterText::Regex[:end_mention_match].match?(after) start_position = match_data.char_begin(1) - 1 - end_position = match_data.char_end(1) + end_position = match_data.char_end(1) + possible_entries << { screen_name: screen_name, indices: [start_position, end_position], @@ -29,36 +45,70 @@ module Extractor yield mention[:screen_name], mention[:indices].first, mention[:indices].last end end + possible_entries end - def extract_hashtags_with_indices(text, **) - return [] unless /#/.match?(text) + def extract_hashtags_with_indices(text, _options = {}) + return [] unless text&.index('#') + + possible_entries = [] - tags = [] text.scan(Tag::HASHTAG_RE) do |hash_text, _| - match_data = $LAST_MATCH_INFO + match_data = $LAST_MATCH_INFO start_position = match_data.char_begin(1) - 1 - end_position = match_data.char_end(1) - after = $' + end_position = match_data.char_end(1) + after = $' + if %r{\A://}.match?(after) hash_text.match(/(.+)(https?\Z)/) do |matched| - hash_text = matched[1] + hash_text = matched[1] end_position -= matched[2].codepoint_length end end - tags << { + possible_entries << { hashtag: hash_text, indices: [start_position, end_position], } end - tags.each { |tag| yield tag[:hashtag], tag[:indices].first, tag[:indices].last } if block_given? - tags + if block_given? + possible_entries.each do |tag| + yield tag[:hashtag], tag[:indices].first, tag[:indices].last + end + end + + possible_entries end def extract_cashtags_with_indices(_text) - [] # always returns empty array + [] + end + + def extract_extra_uris_with_indices(text) + return [] unless text&.index(':') + + possible_entries = [] + + text.scan(Twitter::TwitterText::Regex[:valid_extended_uri]) do + valid_uri_match_data = $LAST_MATCH_INFO + + start_position = valid_uri_match_data.char_begin(3) + end_position = valid_uri_match_data.char_end(3) + + possible_entries << { + url: valid_uri_match_data[3], + indices: [start_position, end_position], + } + end + + if block_given? + possible_entries.each do |url| + yield url[:url], url[:indices].first, url[:indices].last + end + end + + possible_entries end end -- cgit