Use a tree‐based approach for advanced text formatting (#1907)

* Use a tree‐based approach for adv. text formatting

  Sanitizing HTML/Markdown means parsing the content into an HTML tree under‐the‐hood anyway, and it is more accurate to do mention/hashtag replacement on the text nodes in that tree than it is to try to hack it in with regexes et cetera.

  This undoes the overrides of `#entities` and `#rewrite` on `AdvancedTextFormatter` but also stops using them, instead keeping track of the parsed Nokogiri tree itself and using that in the `#to_s` method.

  Internally, this tree uses `<mastodon-entity>` nodes to keep track of hashtags, links, and mentions. Sanitization is moved to the beginning, so it should be known that these do not appear in the input.

* Also disallow entities inside of `<code>`

  I think this is generally expected behaviour, and people are annoyed when their code gets turned into links/hashtags/mentions.

* Minor cleanup to AdvancedTextFormatter

* Change AdvancedTextFormatter to rewrite entities in one pass and sanitize at the end

  Also, minor refactoring to better match how other formatters are organized.

* Add some tests

Co-authored-by: Claire <claire.github-309c@sitedethib.com>
author: kibigo! <marrus-sh@users.noreply.github.com> 2022-12-02 01:29:42 -0800
committer: GitHub <noreply@github.com> 2022-12-02 10:29:42 +0100
commit: 2cabc5d188ee5b5c7bea808c58500d2c74e4b087 (patch)
tree: 77166cfbba7bfe8ab0adb599413e85c0df9555ed
parent: 4ac660147643291458c38e7c00802de5a0b65b62 (diff)
2 files changed, 50 insertions, 44 deletions
diff --git a/app/lib/advanced_text_formatter.rb b/app/lib/advanced_text_formatter.rb
index dcaf34b91..21e81d4d1 100644
--- a/app/lib/advanced_text_formatter.rb
+++ b/app/lib/advanced_text_formatter.rb
@@ -19,6 +19,8 @@ class AdvancedTextFormatter < TextFormatter
     end
   end
 
+  attr_reader :content_type
+
   # @param [String] text
   # @param [Hash] options
   # @option options [Boolean] :multiline
@@ -27,7 +29,7 @@ class AdvancedTextFormatter < TextFormatter
   # @option options [Array<Account>] :preloaded_accounts
   # @option options [String] :content_type
   def initialize(text, options = {})
-    content_type = options.delete(:content_type)
+    @content_type = options.delete(:content_type)
     super(text, options)
 
     @text = format_markdown(text) if content_type == 'text/markdown'
@@ -50,50 +52,50 @@ class AdvancedTextFormatter < TextFormatter
     html.html_safe # rubocop:disable Rails/OutputSafety
   end
 
-  # Differs from `TextFormatter` by skipping HTML tags and entities
-  def entities
-    @entities ||= begin
-      gaps = []
-      total_offset = 0
-
-      escaped = text.gsub(/<[^>]*>|&#[0-9]+;/) do |match|
-        total_offset += match.length - 1
-        end_offset = Regexp.last_match.end(0)
-        gaps << [end_offset - total_offset, total_offset]
-        ' '
-      end
-
-      Extractor.extract_entities_with_indices(escaped, extract_url_without_protocol: false).map do |entity|
-        start_pos, end_pos = entity[:indices]
-        offset_idx = gaps.rindex { |gap| gap.first <= start_pos }
-        offset = offset_idx.nil? ? 0 : gaps[offset_idx].last
-        entity.merge(indices: [start_pos + offset, end_pos + offset])
+  # Differs from TextFormatter by operating on the parsed HTML tree
+  def rewrite
+    if @tree.nil?
+      src = text.gsub(Sanitize::REGEX_UNSUITABLE_CHARS, '')
+      @tree = Nokogiri::HTML5.fragment(src)
+      document = @tree.document
+
+      @tree.xpath('.//text()[not(ancestor::a | ancestor::code)]').each do |text_node|
+        # Iterate over text elements and build up their replacements.
+        content = text_node.content
+        replacement = Nokogiri::XML::NodeSet.new(document)
+        processed_index = 0
+        Extractor.extract_entities_with_indices(
+          content,
+          extract_url_without_protocol: false
+        ) do |entity|
+          # Iterate over entities in this text node.
+          advance = entity[:indices].first - processed_index
+          if advance.positive?
+            # Text node for content which precedes entity.
+            replacement << Nokogiri::XML::Text.new(
+              content[processed_index, advance],
+              document
+            )
+          end
+          replacement << Nokogiri::HTML5.fragment(yield(entity))
+          processed_index = entity[:indices].last
+        end
+        if processed_index < content.size
+          # Text node for remaining content.
+          replacement << Nokogiri::XML::Text.new(
+            content[processed_index, content.size - processed_index],
+            document
+          )
+        end
+        text_node.replace(replacement)
       end
     end
+
+    Sanitize.node!(@tree, Sanitize::Config::MASTODON_OUTGOING).to_html
   end
 
   private
 
-  # Differs from `TextFormatter` in that it keeps HTML; but it sanitizes at the end to remain safe
-  def rewrite
-    entities.sort_by! do |entity|
-      entity[:indices].first
-    end
-
-    result = ''.dup
-
-    last_index = entities.reduce(0) do |index, entity|
-      indices = entity[:indices]
-      result << text[index...indices.first]
-      result << yield(entity)
-      indices.last
-    end
-
-    result << text[last_index..-1]
-
-    Sanitize.fragment(result, Sanitize::Config::MASTODON_OUTGOING)
-  end
-
   def format_markdown(html)
     html = markdown_formatter.render(html)
     html.delete("\r").delete("\n")
diff --git a/spec/lib/advanced_text_formatter_spec.rb b/spec/lib/advanced_text_formatter_spec.rb
index 3255fc927..c1e469606 100644
--- a/spec/lib/advanced_text_formatter_spec.rb
+++ b/spec/lib/advanced_text_formatter_spec.rb
@@ -35,7 +35,7 @@ RSpec.describe AdvancedTextFormatter do
       end
 
       context 'given a block code' do
-        let(:text) { "test\n\n```\nint main(void) {\n  return 0;\n}\n```\n" }
+        let(:text) { "test\n\n```\nint main(void) {\n  return 0; // https://joinmastodon.org/foo\n}\n```\n" }
 
         it 'formats code using <pre> and <code>' do
           is_expected.to include '<pre><code>int main'
@@ -44,13 +44,17 @@ RSpec.describe AdvancedTextFormatter do
         it 'does not strip leading spaces' do
           is_expected.to include '>  return 0'
         end
+
+        it 'does not format links' do
+          is_expected.to include 'return 0; // https://joinmastodon.org/foo'
+        end
       end
 
-      context 'given some quote' do
-        let(:text) { "> foo\n\nbar" }
+      context 'given a link in inline code using backticks' do
+        let(:text) { 'test `https://foo.bar/bar` bar' }
 
-        it 'formats code using <code>' do
-          is_expected.to include '<blockquote><p>foo</p></blockquote>'
+        it 'does not rewrite the link' do
+          is_expected.to include 'test <code>https://foo.bar/bar</code> bar'
         end
       end
author	kibigo! <marrus-sh@users.noreply.github.com>	2022-12-02 01:29:42 -0800
committer	GitHub <noreply@github.com>	2022-12-02 10:29:42 +0100
commit	2cabc5d188ee5b5c7bea808c58500d2c74e4b087 (patch)
tree	77166cfbba7bfe8ab0adb599413e85c0df9555ed
parent	4ac660147643291458c38e7c00802de5a0b65b62 (diff)