diff options
author | kibigo! <marrus-sh@users.noreply.github.com> | 2022-12-02 01:29:42 -0800 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-12-02 10:29:42 +0100 |
commit | 2cabc5d188ee5b5c7bea808c58500d2c74e4b087 (patch) | |
tree | 77166cfbba7bfe8ab0adb599413e85c0df9555ed | |
parent | 4ac660147643291458c38e7c00802de5a0b65b62 (diff) |
Use a tree‐based approach for advanced text formatting (#1907)
* Use a tree‐based approach for adv. text formatting

  Sanitizing HTML/Markdown means parsing the content into an HTML tree under‐the‐hood anyway, and it is more accurate to do mention/hashtag replacement on the text nodes in that tree than it is to try to hack it in with regexes et cetera.

  This undoes the overrides of `#entities` and `#rewrite` on `AdvancedTextFormatter` but also stops using them, instead keeping track of the parsed Nokogiri tree itself and using that in the `#to_s` method.

  Internally, this tree uses `<mastodon-entity>` nodes to keep track of hashtags, links, and mentions. Sanitization is moved to the beginning, so it should be known that these do not appear in the input.

* Also disallow entities inside of `<code>`

  I think this is generally expected behaviour, and people are annoyed when their code gets turned into links/hashtags/mentions.

* Minor cleanup to AdvancedTextFormatter

* Change AdvancedTextFormatter to rewrite entities in one pass and sanitize at the end

  Also, minor refactoring to better match how other formatters are organized.

* Add some tests

Co-authored-by: Claire <claire.github-309c@sitedethib.com>
-rw-r--r-- | app/lib/advanced_text_formatter.rb | 80 | ||||
-rw-r--r-- | spec/lib/advanced_text_formatter_spec.rb | 14 |
2 files changed, 50 insertions, 44 deletions
diff --git a/app/lib/advanced_text_formatter.rb b/app/lib/advanced_text_formatter.rb index dcaf34b91..21e81d4d1 100644 --- a/app/lib/advanced_text_formatter.rb +++ b/app/lib/advanced_text_formatter.rb @@ -19,6 +19,8 @@ class AdvancedTextFormatter < TextFormatter end end + attr_reader :content_type + # @param [String] text # @param [Hash] options # @option options [Boolean] :multiline @@ -27,7 +29,7 @@ class AdvancedTextFormatter < TextFormatter # @option options [Array<Account>] :preloaded_accounts # @option options [String] :content_type def initialize(text, options = {}) - content_type = options.delete(:content_type) + @content_type = options.delete(:content_type) super(text, options) @text = format_markdown(text) if content_type == 'text/markdown' @@ -50,50 +52,50 @@ class AdvancedTextFormatter < TextFormatter html.html_safe # rubocop:disable Rails/OutputSafety end - # Differs from `TextFormatter` by skipping HTML tags and entities - def entities - @entities ||= begin - gaps = [] - total_offset = 0 - - escaped = text.gsub(/<[^>]*>|&#[0-9]+;/) do |match| - total_offset += match.length - 1 - end_offset = Regexp.last_match.end(0) - gaps << [end_offset - total_offset, total_offset] - ' ' - end - - Extractor.extract_entities_with_indices(escaped, extract_url_without_protocol: false).map do |entity| - start_pos, end_pos = entity[:indices] - offset_idx = gaps.rindex { |gap| gap.first <= start_pos } - offset = offset_idx.nil? ? 0 : gaps[offset_idx].last - entity.merge(indices: [start_pos + offset, end_pos + offset]) + # Differs from TextFormatter by operating on the parsed HTML tree + def rewrite + if @tree.nil? + src = text.gsub(Sanitize::REGEX_UNSUITABLE_CHARS, '') + @tree = Nokogiri::HTML5.fragment(src) + document = @tree.document + + @tree.xpath('.//text()[not(ancestor::a | ancestor::code)]').each do |text_node| + # Iterate over text elements and build up their replacements. 
+ content = text_node.content + replacement = Nokogiri::XML::NodeSet.new(document) + processed_index = 0 + Extractor.extract_entities_with_indices( + content, + extract_url_without_protocol: false + ) do |entity| + # Iterate over entities in this text node. + advance = entity[:indices].first - processed_index + if advance.positive? + # Text node for content which precedes entity. + replacement << Nokogiri::XML::Text.new( + content[processed_index, advance], + document + ) + end + replacement << Nokogiri::HTML5.fragment(yield(entity)) + processed_index = entity[:indices].last + end + if processed_index < content.size + # Text node for remaining content. + replacement << Nokogiri::XML::Text.new( + content[processed_index, content.size - processed_index], + document + ) + end + text_node.replace(replacement) end end + + Sanitize.node!(@tree, Sanitize::Config::MASTODON_OUTGOING).to_html end private - # Differs from `TextFormatter` in that it keeps HTML; but it sanitizes at the end to remain safe - def rewrite - entities.sort_by! 
do |entity| - entity[:indices].first - end - - result = ''.dup - - last_index = entities.reduce(0) do |index, entity| - indices = entity[:indices] - result << text[index...indices.first] - result << yield(entity) - indices.last - end - - result << text[last_index..-1] - - Sanitize.fragment(result, Sanitize::Config::MASTODON_OUTGOING) - end - def format_markdown(html) html = markdown_formatter.render(html) html.delete("\r").delete("\n") diff --git a/spec/lib/advanced_text_formatter_spec.rb b/spec/lib/advanced_text_formatter_spec.rb index 3255fc927..c1e469606 100644 --- a/spec/lib/advanced_text_formatter_spec.rb +++ b/spec/lib/advanced_text_formatter_spec.rb @@ -35,7 +35,7 @@ RSpec.describe AdvancedTextFormatter do end context 'given a block code' do - let(:text) { "test\n\n```\nint main(void) {\n return 0;\n}\n```\n" } + let(:text) { "test\n\n```\nint main(void) {\n return 0; // https://joinmastodon.org/foo\n}\n```\n" } it 'formats code using <pre> and <code>' do is_expected.to include '<pre><code>int main' @@ -44,13 +44,17 @@ RSpec.describe AdvancedTextFormatter do it 'does not strip leading spaces' do is_expected.to include '> return 0' end + + it 'does not format links' do + is_expected.to include 'return 0; // https://joinmastodon.org/foo' + end end - context 'given some quote' do - let(:text) { "> foo\n\nbar" } + context 'given a link in inline code using backticks' do + let(:text) { 'test `https://foo.bar/bar` bar' } - it 'formats code using <code>' do - is_expected.to include '<blockquote><p>foo</p></blockquote>' + it 'does not rewrite the link' do + is_expected.to include 'test <code>https://foo.bar/bar</code> bar' end end |