From 1132af15151713f52d8d1e320271185865a79633 Mon Sep 17 00:00:00 2001
From: multiple creatures
Date: Sat, 16 Nov 2019 21:01:07 -0600
Subject: Moved to using a normalized text column for searches. Admins using an
 FTS-enabled version of Monsterfork will need to apply the migration from
 `dist/search.sql` then run `bundle exec rails monsterfork:index_statuses`.

---
Review notes (text between the `---` line and the first `diff --git` is not
part of the commit message):

* The line structure of this patch was restored from a whitespace-collapsed
  copy. Indentation of context lines and any multi-space runs inside added
  lines (for example the `/ +/` pattern in text_helper.rb) are reconstructed
  and should be checked against the tree; `git apply --ignore-whitespace`
  may be needed.
* app/models/status.rb: `update_normalized_text` built its cache key from
  `status.id`, but `status` is not a local in that method (the record is
  `self`, as the following line shows). Changed to `id`.
* lib/tasks/monsterfork.rake: the progress message printed the batch size as
  the upper bound of the range (`#{1+i}-#{statuses.count}`), which is only
  right for the first batch. Changed to `#{i + statuses.count}`.
* Both fixes stay on their original `+` lines, so hunk line counts are
  unchanged. The commit id in the `From ` line and the post-image blob ids in
  the `index` lines of those two files predate the fixes.
* NOTE(review): `update_normalized_text` is also registered on `after_create`
  and `after_update`, where it only assigns `self.normalized_text`; that
  assignment does not appear to be saved, so edits to `text` may leave the
  stored column stale. Confirm and consider a `before_save` hook.

 Gemfile                                            |  2 ++
 Gemfile.lock                                       |  2 ++
 app/helpers/filter_helper.rb                       |  2 +-
 app/helpers/search_helper.rb                       |  5 ++--
 app/helpers/text_helper.rb                         | 31 ++++++++++++++++++++++
 app/lib/bangtags.rb                                |  2 +-
 app/models/status.rb                               | 15 +++++++++--
 ...191116233416_add_normalized_text_to_statuses.rb |  5 ++++
 db/schema.rb                                       |  9 ++++---
 dist/search.sql                                    |  8 ++++--
 lib/tasks/monsterfork.rake                         | 17 ++++++++++++
 11 files changed, 87 insertions(+), 11 deletions(-)
 create mode 100644 app/helpers/text_helper.rb
 create mode 100644 db/migrate/20191116233416_add_normalized_text_to_statuses.rb
 create mode 100644 lib/tasks/monsterfork.rake

diff --git a/Gemfile b/Gemfile
index b7bd864ed..288af79ee 100644
--- a/Gemfile
+++ b/Gemfile
@@ -153,3 +153,5 @@ gem 'concurrent-ruby', require: false
 
 gem "ruby-bbcode", "~> 2.0"
 gem "sun_calc", "~> 0.1.0"
+
+gem "sixarm_ruby_unaccent", "~> 1.2"
diff --git a/Gemfile.lock b/Gemfile.lock
index e2d951be2..74968fbd8 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -579,6 +579,7 @@ GEM
       json (>= 1.8, < 3)
       simplecov-html (~> 0.10.0)
     simplecov-html (0.10.2)
+    sixarm_ruby_unaccent (1.2.0)
     sprockets (3.7.2)
       concurrent-ruby (~> 1.0)
       rack (> 1, < 3)
@@ -763,6 +764,7 @@ DEPENDENCIES
   simple-navigation (~> 4.0)
   simple_form (~> 4.1)
   simplecov (~> 0.16)
+  sixarm_ruby_unaccent (~> 1.2)
   sprockets-rails (~> 3.2)
   stackprof
   stoplight (~> 2.1.3)
diff --git a/app/helpers/filter_helper.rb b/app/helpers/filter_helper.rb
index a4a86496f..77ffa98be 100644
--- a/app/helpers/filter_helper.rb
+++ b/app/helpers/filter_helper.rb
@@ -21,7 +21,7 @@ module FilterHelper
     return false if filters.empty?
 
     status = status.reblog if status.reblog?
-    status_text = Formatter.instance.plaintext(status)
+    status_text = status.normalized_text
     spoiler_text = status.spoiler_text
     tags = status.tags.pluck(:name).join("\n")
     descs = status.media_attachments.map { |a| a.description }.join("\n").strip
diff --git a/app/helpers/search_helper.rb b/app/helpers/search_helper.rb
index 96da161f1..0f3d09c36 100644
--- a/app/helpers/search_helper.rb
+++ b/app/helpers/search_helper.rb
@@ -1,6 +1,7 @@
-module SearchHelper
+require 'sixarm_ruby_unaccent'
 
+module SearchHelper
   def expand_search_query(query)
-    query.gsub(/"(.*)"/, '\\y\1\\y')
+    query.downcase.unaccent.gsub(/"(.*)"/, '\\y\1\\y')
   end
 end
diff --git a/app/helpers/text_helper.rb b/app/helpers/text_helper.rb
new file mode 100644
index 000000000..c042c0342
--- /dev/null
+++ b/app/helpers/text_helper.rb
@@ -0,0 +1,31 @@
+# coding: utf-8
+require 'htmlentities'
+require 'sixarm_ruby_unaccent'
+
+module TextHelper
+
+  def normalize_text(html)
+    t = html.downcase
+
+    t.gsub!(/<(?:p|pre|blockquote|code|h[1-6]|li)\b[^>]*>/, "\n")
+    t.gsub!(/<[bh]r[\/ ]*>/, "\n")
+    t.gsub!(/<\/?[^>]*>/, '')
+
+    t = HTMLEntities.new.decode(t)
+
+    t.gsub!(/[ \t]*\302\240+[ \t]*/, ' ')
+    t.gsub!(/ +/, ' ')
+
+    t.gsub!(/\r\n?/, "\n")
+    t.gsub!(/\n[ \t]+/, "\n")
+    t.gsub!(/[ \t]+\n/, "\n")
+    t.gsub!(/\n\n+/, "\n")
+
+    t.unaccent_via_split_map.strip
+  end
+
+  def normalize_status(status)
+    return normalize_text("#{status.spoiler_text}\n#{status.text}") unless status.local?
+    normalize_text("#{status.spoiler_text}\n#{Formatter.instance.format(status)}")
+  end
+end
diff --git a/app/lib/bangtags.rb b/app/lib/bangtags.rb
index 60fb426b3..d708683cb 100644
--- a/app/lib/bangtags.rb
+++ b/app/lib/bangtags.rb
@@ -720,7 +720,7 @@ class Bangtags
             q = cmd[1..-1].join.strip
             next if q.blank?
             begin
-              data = @account.statuses.where('text ~* ?', expand_search_query(q))
+              data = @account.statuses.where('normalized_text ~ ?', expand_search_query(q))
                 .reorder(:created_at)
                 .pluck(:created_at)
                 .map { |d| d.strftime('%Y-%m') }
diff --git a/app/models/status.rb b/app/models/status.rb
index 29c4f6bd1..a2d2a8f28 100644
--- a/app/models/status.rb
+++ b/app/models/status.rb
@@ -31,9 +31,9 @@
 #  edited                 :boolean
 #  imported               :boolean
 #  origin                 :string
-#  tsv                    :tsvector
 #  boostable              :boolean
 #  reject_replies         :boolean
+#  normalized_text        :text             default(""), not null
 #
 
 class Status < ApplicationRecord
@@ -43,6 +43,7 @@ class Status < ApplicationRecord
   include Streamable
   include Cacheable
   include StatusThreadingConcern
+  include TextHelper
 
   # match both with and without U+FE0F (the emoji variation selector)
   LOCAL_ONLY_TOKENS = /(?:#!|\u{1f441}\ufe0f?)\u200b?\z/
@@ -324,6 +325,7 @@ class Status < ApplicationRecord
   around_create Mastodon::Snowflake::Callbacks
 
   before_create :set_locality
+  before_create :update_normalized_text
 
   before_validation :prepare_contents, if: :local?
   before_validation :set_reblog
@@ -334,6 +336,9 @@ class Status < ApplicationRecord
 
   after_create :set_poll_id
   after_create :process_bangtags, if: :local?
+  after_create :update_normalized_text
+
+  after_update :update_normalized_text
 
   class << self
     include SearchHelper
@@ -350,7 +355,7 @@ class Status < ApplicationRecord
       end
       return none if term.blank? || term.length < 3
       query = query.without_reblogs
-                  .where('text ~* ?', expand_search_query(term))
+                  .where('normalized_text ~ ?', expand_search_query(term))
                   .offset(offset).limit(limit)
       apply_timeline_filters(query, account, true)
     rescue ActiveRecord::StatementInvalid
@@ -618,6 +623,12 @@ class Status < ApplicationRecord
     Bangtags.new(self).process
   end
 
+  def update_normalized_text
+    return unless (normalized_text.blank? && !text.blank?) || saved_change_to_text?
+    Rails.cache.delete("formatted_status:#{id}")
+    self.normalized_text = normalize_status(self)
+  end
+
   def set_conversation
     self.thread = thread.reblog if thread&.reblog?
 
diff --git a/db/migrate/20191116233416_add_normalized_text_to_statuses.rb b/db/migrate/20191116233416_add_normalized_text_to_statuses.rb
new file mode 100644
index 000000000..655408a32
--- /dev/null
+++ b/db/migrate/20191116233416_add_normalized_text_to_statuses.rb
@@ -0,0 +1,5 @@
+class AddNormalizedTextToStatuses < ActiveRecord::Migration[5.2]
+  def change
+    add_column :statuses, :normalized_text, :text, null: false, default: ''
+  end
+end
diff --git a/db/schema.rb b/db/schema.rb
index ee4778cbf..958a1dd49 100644
--- a/db/schema.rb
+++ b/db/schema.rb
@@ -10,10 +10,12 @@
 #
 # It's strongly recommended that you check this file into your version control system.
 
-ActiveRecord::Schema.define(version: 2019_10_27_182731) do
+ActiveRecord::Schema.define(version: 2019_11_16_233416) do
 
   # These are extensions that must be enabled in order to support this database
+  enable_extension "pg_trgm"
   enable_extension "plpgsql"
+  enable_extension "unaccent"
 
   create_table "account_conversations", force: :cascade do |t|
     t.bigint "account_id"
@@ -697,9 +699,9 @@ ActiveRecord::Schema.define(version: 2019_10_27_182731) do
     t.boolean "edited"
     t.boolean "imported"
     t.string "origin"
-    t.tsvector "tsv"
     t.boolean "boostable"
     t.boolean "reject_replies"
+    t.text "normalized_text", default: "", null: false
     t.index ["account_id", "id", "visibility", "updated_at"], name: "index_statuses_20180106", order: { id: :desc }
     t.index ["account_id", "id", "visibility"], name: "index_statuses_on_account_id_and_id_and_visibility", order: { id: :desc }, where: "(visibility = ANY (ARRAY[0, 1, 2, 4]))"
     t.index ["in_reply_to_account_id"], name: "index_statuses_on_in_reply_to_account_id"
@@ -707,7 +709,8 @@ ActiveRecord::Schema.define(version: 2019_10_27_182731) do
     t.index ["network"], name: "index_statuses_on_network", where: "network"
     t.index ["origin"], name: "index_statuses_on_origin", unique: true
     t.index ["reblog_of_id", "account_id"], name: "index_statuses_on_reblog_of_id_and_account_id"
-    t.index ["tsv"], name: "tsv_idx", using: :gin
+    t.index ["spoiler_text"], name: "index_statuses_on_spoiler_text_trgm", opclass: :gin_trgm_ops, using: :gin
+    t.index ["text"], name: "index_statuses_on_text_trgm", opclass: :gin_trgm_ops, using: :gin
     t.index ["uri"], name: "index_statuses_on_uri", unique: true
   end
 
diff --git a/dist/search.sql b/dist/search.sql
index eef191e60..e68b3c9b3 100644
--- a/dist/search.sql
+++ b/dist/search.sql
@@ -12,7 +12,11 @@ DROP TRIGGER IF EXISTS tsvectorupdate ON statuses;
 DROP FUNCTION IF EXISTS tsv_update_trigger;
 DROP INDEX IF EXISTS tsv_idx;
 ALTER TABLE statuses DROP COLUMN IF EXISTS tsv;
+DROP INDEX IF EXISTS index_statuses_on_text_trgm;
+DROP INDEX IF EXISTS index_statuses_on_spoiler_text_trgm;
 
 -- Create new trigram indexes --
-CREATE INDEX CONCURRENTLY IF NOT EXISTS index_statuses_on_text_trgm ON statuses USING GIN (text gin_trgm_ops);
-CREATE INDEX CONCURRENTLY IF NOT EXISTS index_statuses_on_spoiler_text_trgm ON statuses USING GIN (spoiler_text gin_trgm_ops);
+CREATE INDEX CONCURRENTLY IF NOT EXISTS index_statuses_on_normalized_text_trgm ON statuses USING GIN (normalized_text gin_trgm_ops);
+
+-- Compact tables ---
+VACUUM ANALYZE;
diff --git a/lib/tasks/monsterfork.rake b/lib/tasks/monsterfork.rake
new file mode 100644
index 000000000..e55bb1930
--- /dev/null
+++ b/lib/tasks/monsterfork.rake
@@ -0,0 +1,17 @@
+namespace :monsterfork do
+  desc '(Re-)Index statuses for search.'
+  task index_statuses: :environment do
+    include TextHelper
+
+    i = 0
+    total = Status.count
+
+    Status.find_in_batches do |statuses|
+      ActiveRecord::Base.logger.info("Indexing statuses #{1+i}-#{i + statuses.count} of #{total}.")
+      i += statuses.count
+      statuses.each do |s|
+        ActiveRecord::Base.logger.silence { s.update_column(:normalized_text, normalize_status(s)) }
+      end
+    end
+  end
+end
-- 
cgit