From 2da941beaab17923ec72af0c2e2d8de6c62cd20e Mon Sep 17 00:00:00 2001 From: multiple creatures Date: Tue, 10 Dec 2019 05:18:50 -0600 Subject: simplify normalizer & add headings --- app/helpers/text_helper.rb | 64 ++++++++++++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 22 deletions(-) (limited to 'app/helpers/text_helper.rb') diff --git a/app/helpers/text_helper.rb b/app/helpers/text_helper.rb index d85c6bbd8..ac49e56f0 100644 --- a/app/helpers/text_helper.rb +++ b/app/helpers/text_helper.rb @@ -4,35 +4,55 @@ require 'sixarm_ruby_unaccent' module TextHelper - def normalize_text(html) - t = html.downcase - - t.gsub!(/<(?:p|pre|blockquote|code|h[1-6]|li)\b[^>]*>/, "\n") - t.gsub!(/<[bh]r[\/ ]*>/, "\n") - t.gsub!(/<\/?[^>]*>/, '') - - t = HTMLEntities.new.decode(t) + def html2text(html) + html = html + .gsub(/<(?:p|pre|blockquote|code|h[1-6]|li)\b[^>]*>/, "\n") + .gsub(/<[bh]r[\/ ]*>/, "\n") + .gsub(/<\/?[^>]*>/, '') - t.gsub!(/[ \t]*\302\240+[ \t]*/, ' ') - t.gsub!(/ +/, ' ') + HTMLEntities.new.decode(html) + end - t.gsub!(/\r\n?/, "\n") - t.gsub!(/\n[ \t]+/, "\n") - t.gsub!(/[ \t]+\n/, "\n") - t.gsub!(/\n\n+/, "\n") + def normalize_text(html) + html.downcase + .gsub(/[ \t]*\302\240+[ \t]*/, ' ') + .gsub(/ +/, ' ') + .gsub(/\r\n?/, "\n") + .gsub(/\n[ \t]+/, "\n") + .gsub(/[ \t]+\n/, "\n") + .gsub(/\n\n+/, "\n") + .gsub(/^(?:#[\w:._·\-]+\s*)+/, '') + .gsub(/(?:#[\w:._·\-]+\s*)+$/, '') + .delete('#') + .strip + .unaccent_via_split_map + end - return t.strip.unaccent_via_split_map unless '#'.in?(t) + def normalize_status(status) + "#{_format_tags(status)}\n#{_format_spoiler(status)}\n#{_format_status(status)}\n#{_format_desc(status)}".delete("\n\n").strip + end - tags = Extractor.extract_hashtags(t).uniq - t.gsub!(/^(?:#[\w:._·\-]+\s*)+/, '') - t.gsub!(/(?:#[\w:._·\-]+\s*)+$/, '') + def _format_tags(status) + return unless status.tags.present? + "tags #{status.tags.pluck(:name).join("\ntags ")}" + end - t.delete!('#') + def _format_spoiler(status) + return if status.spoiler_text.blank? + "subj #{normalize_text(status.spoiler_text)}" + end - "#{tags.join(' ')}\n#{t.lstrip}".strip.unaccent_via_split_map + def _format_status(status) + text = status.local? ? Formatter.instance.format(status) : status.text + return if text.blank? + text = normalize_text(html2text(text)) + text.gsub!("\n", "\ntext ") + "text #{text}" end - def normalize_status(status) - normalize_text("tags #{status.tags.pluck(:name).join(' ')}\nsubj #{status.spoiler_text}\ntext #{status.local? ? Formatter.instance.format(status) : status.text}\ndesc #{status.media_attachments.pluck(:description).join("\n")}") + def _format_desc(status) + return unless status.media_attachments.present? + text = status.media_attachments.pluck(:description).join("\ndesc ") + "desc #{normalize_text(text)}" end end -- cgit