From a29fb04e7c1c7a719a29f40da275d4981ef2ebb5 Mon Sep 17 00:00:00 2001 From: multiple creatures Date: Fri, 10 Jan 2020 04:14:37 -0600 Subject: bon voyage to that shitty text normalization code --- app/helpers/text_helper.rb | 58 ---------------------------------------------- app/models/status.rb | 12 ---------- lib/tasks/monsterfork.rake | 42 --------------------------------- 3 files changed, 112 deletions(-) delete mode 100644 app/helpers/text_helper.rb diff --git a/app/helpers/text_helper.rb b/app/helpers/text_helper.rb deleted file mode 100644 index 16bb3f66e..000000000 --- a/app/helpers/text_helper.rb +++ /dev/null @@ -1,58 +0,0 @@ -# coding: utf-8 -require 'htmlentities' -require 'sixarm_ruby_unaccent' - -module TextHelper - - def html2text(html) - html = html - .gsub(/<(?:p|pre|blockquote|code|h[1-6]|li)\b[^>]*>/, "\n") - .gsub(/<[bh]r[\/ ]*>/, "\n") - .gsub(/<\/?[^>]*>/, '') - - HTMLEntities.new.decode(html) - end - - def normalize_text(text) - text.downcase - .gsub(Account::MENTION_RE, '') - .gsub(/^(?:#[\w:._·\-]+\s*)+|(?:#[\w:._·\-]+\s*)+$/, '') - .gsub(/\s*\302\240+\s*/, ' ') - .gsub(/\n\s+|\s+\n/, "\n") - .gsub(/\r\n?/, "\n") - .gsub(/\n\n+/, "\n") - .unaccent_via_split_map - .gsub(/(?:htt|ft)ps?:\/\//, '') - .gsub(/[^\n\p{Word} [:punct:]]/, '') - .gsub(/ +/, ' ') - .strip - end - - def normalize_status(status) - "#{_format_tags(status)}\n#{_format_spoiler(status)}\n#{_format_status(status)}\n#{_format_desc(status)}".strip - end - - def _format_tags(status) - return unless status.tags.present? - "tag #{status.tags.pluck(:name).join("\ntag ")}" - end - - def _format_spoiler(status) - return if status.spoiler_text.blank? - "subj #{normalize_text(status.spoiler_text)}" - end - - def _format_status(status) - text = status.local? ? Formatter.instance.format(status) : status.text - return if text.blank? - text = normalize_text(html2text(text)) - text.gsub!("\n", "\ntext ") - "text #{text}" - end - - def _format_desc(status) - return unless status.media_attachments.present? - text = status.media_attachments.pluck(:description).compact.join("\ndesc ") - "desc #{normalize_text(text)}" - end -end diff --git a/app/models/status.rb b/app/models/status.rb index d051d8962..f5f51779f 100644 --- a/app/models/status.rb +++ b/app/models/status.rb @@ -40,7 +40,6 @@ class Status < ApplicationRecord include Streamable include Cacheable include StatusThreadingConcern - include TextHelper # match both with and without U+FE0F (the emoji variation selector) LOCAL_ONLY_TOKENS = /(?:#!|\u{1f441}\ufe0f?)\u200b?\z/ @@ -358,7 +357,6 @@ class Status < ApplicationRecord after_save :update_sharekey, if: :local? after_save :update_origin, if: :local? - after_save :update_normalized_text after_save :process_bangtags, if: :local? class << self @@ -680,16 +678,6 @@ class Status < ApplicationRecord end end - def update_normalized_text - return if destroyed? || text.blank? || !(text_changed? || saved_change_to_text?) - normalized_text = normalize_status(self) - if self.normalized_status.nil? - self.create_normalized_status(text: normalized_text) - else - self.normalized_status.update_attributes(text: normalized_text) - end - end - def set_conversation self.thread = thread.reblog if thread&.reblog? diff --git a/lib/tasks/monsterfork.rake b/lib/tasks/monsterfork.rake index 14e001f4b..a5a5ea434 100644 --- a/lib/tasks/monsterfork.rake +++ b/lib/tasks/monsterfork.rake @@ -1,48 +1,6 @@ # frozen_string_literal: true -def index_statuses(statuses_query) - include TextHelper - - i = 0 - total = statuses_query.count - - statuses_query.find_in_batches do |statuses| - ActiveRecord::Base.logger.info("Indexing status #{1+i} of #{total}.") - ActiveRecord::Base.logger.silence do - i += statuses.count - statuses.each do |s| - begin - next if s.destroyed? - normalized_text = normalize_status(s) - if s.normalized_status.nil? - s.create_normalized_status(text: normalized_text) - elsif s.normalized_status.text != normalized_text - s.normalized_status.update_column(:text, normalized_text) - end - rescue ActiveRecord::RecordNotFound - true - end - end - end - end -end - namespace :monsterfork do - desc 'Index statuses for search that have not been indexed yet.' - task index_statuses: :environment do - index_statuses(Status.where(normalized_text: '')) - end - - desc 'Reindex all statuses for search.' - task reindex_statuses: :environment do - index_statuses(Status) - end - - desc 'Reindex statuses containing media with descriptions for search.' - task reindex_media_descs: :environment do - index_statuses(Status.left_outer_joins(:media_attachments).where('media_attachments.description IS NOT NULL')) - end - desc "Re-apply all users' filters to their home and list timelines." task reapply_filters: :environment do Account.local.find_each do |account| -- cgit