From a29fb04e7c1c7a719a29f40da275d4981ef2ebb5 Mon Sep 17 00:00:00 2001 From: multiple creatures Date: Fri, 10 Jan 2020 04:14:37 -0600 Subject: bon voyage to that shitty text normalization code --- app/helpers/text_helper.rb | 58 ---------------------------------------------- 1 file changed, 58 deletions(-) delete mode 100644 app/helpers/text_helper.rb (limited to 'app/helpers') diff --git a/app/helpers/text_helper.rb b/app/helpers/text_helper.rb deleted file mode 100644 index 16bb3f66e..000000000 --- a/app/helpers/text_helper.rb +++ /dev/null @@ -1,58 +0,0 @@ -# coding: utf-8 -require 'htmlentities' -require 'sixarm_ruby_unaccent' - -module TextHelper - - def html2text(html) - html = html - .gsub(/<(?:p|pre|blockquote|code|h[1-6]|li)\b[^>]*>/, "\n") - .gsub(/<[bh]r[\/ ]*>/, "\n") - .gsub(/<\/?[^>]*>/, '') - - HTMLEntities.new.decode(html) - end - - def normalize_text(text) - text.downcase - .gsub(Account::MENTION_RE, '') - .gsub(/^(?:#[\w:._·\-]+\s*)+|(?:#[\w:._·\-]+\s*)+$/, '') - .gsub(/\s*\302\240+\s*/, ' ') - .gsub(/\n\s+|\s+\n/, "\n") - .gsub(/\r\n?/, "\n") - .gsub(/\n\n+/, "\n") - .unaccent_via_split_map - .gsub(/(?:htt|ft)ps?:\/\//, '') - .gsub(/[^\n\p{Word} [:punct:]]/, '') - .gsub(/ +/, ' ') - .strip - end - - def normalize_status(status) - "#{_format_tags(status)}\n#{_format_spoiler(status)}\n#{_format_status(status)}\n#{_format_desc(status)}".strip - end - - def _format_tags(status) - return unless status.tags.present? - "tag #{status.tags.pluck(:name).join("\ntag ")}" - end - - def _format_spoiler(status) - return if status.spoiler_text.blank? - "subj #{normalize_text(status.spoiler_text)}" - end - - def _format_status(status) - text = status.local? ? Formatter.instance.format(status) : status.text - return if text.blank? - text = normalize_text(html2text(text)) - text.gsub!("\n", "\ntext ") - "text #{text}" - end - - def _format_desc(status) - return unless status.media_attachments.present? - text = status.media_attachments.pluck(:description).compact.join("\ndesc ") - "desc #{normalize_text(text)}" - end -end -- cgit