about summary refs log tree commit diff
diff options
context:
space:
mode:
authormultiple creatures <dev@multiple-creature.party>2019-11-16 21:01:07 -0600
committermultiple creatures <dev@multiple-creature.party>2019-11-16 21:01:07 -0600
commit1132af15151713f52d8d1e320271185865a79633 (patch)
treec7d5dd6afb7a60c840a0d3238124f01c747eada8
parent487c945d160e9349579bf541147c949f8bca3c46 (diff)
Moved to using a normalized text column for searches. Admins using an FTS-enabled version of Monsterfork will need to apply the migration from `dist/search.sql` then run `bundle exec rails monsterfork:index_statuses`.
-rw-r--r--Gemfile2
-rw-r--r--Gemfile.lock2
-rw-r--r--app/helpers/filter_helper.rb2
-rw-r--r--app/helpers/search_helper.rb5
-rw-r--r--app/helpers/text_helper.rb31
-rw-r--r--app/lib/bangtags.rb2
-rw-r--r--app/models/status.rb15
-rw-r--r--db/migrate/20191116233416_add_normalized_text_to_statuses.rb5
-rw-r--r--db/schema.rb9
-rw-r--r--dist/search.sql8
-rw-r--r--lib/tasks/monsterfork.rake17
11 files changed, 87 insertions, 11 deletions
diff --git a/Gemfile b/Gemfile
index b7bd864ed..288af79ee 100644
--- a/Gemfile
+++ b/Gemfile
@@ -153,3 +153,5 @@ gem 'concurrent-ruby', require: false
 gem "ruby-bbcode", "~> 2.0"
 
 gem "sun_calc", "~> 0.1.0"
+
+gem "sixarm_ruby_unaccent", "~> 1.2"
diff --git a/Gemfile.lock b/Gemfile.lock
index e2d951be2..74968fbd8 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -579,6 +579,7 @@ GEM
       json (>= 1.8, < 3)
       simplecov-html (~> 0.10.0)
     simplecov-html (0.10.2)
+    sixarm_ruby_unaccent (1.2.0)
     sprockets (3.7.2)
       concurrent-ruby (~> 1.0)
       rack (> 1, < 3)
@@ -763,6 +764,7 @@ DEPENDENCIES
   simple-navigation (~> 4.0)
   simple_form (~> 4.1)
   simplecov (~> 0.16)
+  sixarm_ruby_unaccent (~> 1.2)
   sprockets-rails (~> 3.2)
   stackprof
   stoplight (~> 2.1.3)
diff --git a/app/helpers/filter_helper.rb b/app/helpers/filter_helper.rb
index a4a86496f..77ffa98be 100644
--- a/app/helpers/filter_helper.rb
+++ b/app/helpers/filter_helper.rb
@@ -21,7 +21,7 @@ module FilterHelper
     return false if filters.empty?
 
     status = status.reblog if status.reblog?
-    status_text = Formatter.instance.plaintext(status)
+    status_text = status.normalized_text
     spoiler_text = status.spoiler_text
     tags = status.tags.pluck(:name).join("\n")
     descs = status.media_attachments.map { |a| a.description }.join("\n").strip
diff --git a/app/helpers/search_helper.rb b/app/helpers/search_helper.rb
index 96da161f1..0f3d09c36 100644
--- a/app/helpers/search_helper.rb
+++ b/app/helpers/search_helper.rb
@@ -1,6 +1,7 @@
-module SearchHelper
+require 'sixarm_ruby_unaccent'
 
+module SearchHelper
 	def expand_search_query(query)
-    query.gsub(/"(.*)"/, '\\y\1\\y')
+    query.downcase.unaccent.gsub(/"(.*)"/, '\\y\1\\y')
   end
 end
diff --git a/app/helpers/text_helper.rb b/app/helpers/text_helper.rb
new file mode 100644
index 000000000..c042c0342
--- /dev/null
+++ b/app/helpers/text_helper.rb
@@ -0,0 +1,31 @@
+# coding: utf-8
+require 'htmlentities'
+require 'sixarm_ruby_unaccent'
+
+module TextHelper
+  # Reduces markup to lowercase, accent-stripped plain text with collapsed whitespace.
+  def normalize_text(html)
+    t = html.downcase
+    # Opening block-level tags and <br>/<hr> become line breaks; every remaining tag is removed.
+    t.gsub!(/<(?:p|pre|blockquote|code|h[1-6]|li)\b[^>]*>/, "\n")
+    t.gsub!(/<[bh]r[\/ ]*>/, "\n")
+    t.gsub!(/<\/?[^>]*>/, '')
+    # Downcase again after decoding: numeric references such as &#65; decode to uppercase letters.
+    t = HTMLEntities.new.decode(t).downcase
+    # \302\240 is the UTF-8 byte pair C2 A0 (non-breaking space); fold it and space runs to one space.
+    t.gsub!(/[ \t]*\302\240+[ \t]*/, ' ')
+    t.gsub!(/  +/, ' ')
+    # Unify line endings, trim blanks around line breaks, and squeeze repeated line breaks.
+    t.gsub!(/\r\n?/, "\n")
+    t.gsub!(/\n[ \t]+/, "\n")
+    t.gsub!(/[ \t]+\n/, "\n")
+    t.gsub!(/\n\n+/, "\n")
+
+    t.unaccent_via_split_map.strip
+  end
+
+  def normalize_status(status) # normalizes "<spoiler_text>\n<body>" for the given status
+    return normalize_text("#{status.spoiler_text}\n#{status.text}") unless status.local? # non-local: stored text as-is
+    normalize_text("#{status.spoiler_text}\n#{Formatter.instance.format(status)}") # local: Formatter output
+  end
+end
diff --git a/app/lib/bangtags.rb b/app/lib/bangtags.rb
index 60fb426b3..d708683cb 100644
--- a/app/lib/bangtags.rb
+++ b/app/lib/bangtags.rb
@@ -720,7 +720,7 @@ class Bangtags
           q = cmd[1..-1].join.strip
           next if q.blank?
           begin
-            data = @account.statuses.where('text ~* ?', expand_search_query(q))
+            data = @account.statuses.where('normalized_text ~ ?', expand_search_query(q))
               .reorder(:created_at)
               .pluck(:created_at)
               .map { |d| d.strftime('%Y-%m') }
diff --git a/app/models/status.rb b/app/models/status.rb
index 29c4f6bd1..a2d2a8f28 100644
--- a/app/models/status.rb
+++ b/app/models/status.rb
@@ -31,9 +31,9 @@
 #  edited                 :boolean
 #  imported               :boolean
 #  origin                 :string
-#  tsv                    :tsvector
 #  boostable              :boolean
 #  reject_replies         :boolean
+#  normalized_text        :text             default(""), not null
 #
 
 class Status < ApplicationRecord
@@ -43,6 +43,7 @@ class Status < ApplicationRecord
   include Streamable
   include Cacheable
   include StatusThreadingConcern
+  include TextHelper
 
   # match both with and without U+FE0F (the emoji variation selector)
   LOCAL_ONLY_TOKENS = /(?:#!|\u{1f441}\ufe0f?)\u200b?\z/
@@ -324,6 +325,7 @@ class Status < ApplicationRecord
   around_create Mastodon::Snowflake::Callbacks
 
   before_create :set_locality
+  before_create :update_normalized_text
 
   before_validation :prepare_contents, if: :local?
   before_validation :set_reblog
@@ -334,6 +336,9 @@ class Status < ApplicationRecord
 
   after_create :set_poll_id
   after_create :process_bangtags, if: :local?
+  after_create :update_normalized_text
+
+  after_update :update_normalized_text
 
   class << self
     include SearchHelper
@@ -350,7 +355,7 @@ class Status < ApplicationRecord
       end
       return none if term.blank? || term.length < 3
       query = query.without_reblogs
-        .where('text ~* ?', expand_search_query(term))
+        .where('normalized_text ~ ?', expand_search_query(term))
         .offset(offset).limit(limit)
       apply_timeline_filters(query, account, true)
     rescue ActiveRecord::StatementInvalid
@@ -618,6 +623,12 @@ class Status < ApplicationRecord
     Bangtags.new(self).process
   end
 
+  def update_normalized_text
+    return unless (normalized_text.blank? && !text.blank?) || saved_change_to_text? # only when unset or text changed
+    Rails.cache.delete("formatted_status:#{id}") # key uses this record's own id; no `status` exists in this scope
+    self.normalized_text = normalize_status(self)
+  end
+
   def set_conversation
     self.thread = thread.reblog if thread&.reblog?
 
diff --git a/db/migrate/20191116233416_add_normalized_text_to_statuses.rb b/db/migrate/20191116233416_add_normalized_text_to_statuses.rb
new file mode 100644
index 000000000..655408a32
--- /dev/null
+++ b/db/migrate/20191116233416_add_normalized_text_to_statuses.rb
@@ -0,0 +1,5 @@
+class AddNormalizedTextToStatuses < ActiveRecord::Migration[5.2]
+  def change
+    add_column :statuses, :normalized_text, :text, null: false, default: ''
+  end
+end
diff --git a/db/schema.rb b/db/schema.rb
index ee4778cbf..958a1dd49 100644
--- a/db/schema.rb
+++ b/db/schema.rb
@@ -10,10 +10,12 @@
 #
 # It's strongly recommended that you check this file into your version control system.
 
-ActiveRecord::Schema.define(version: 2019_10_27_182731) do
+ActiveRecord::Schema.define(version: 2019_11_16_233416) do
 
   # These are extensions that must be enabled in order to support this database
+  enable_extension "pg_trgm"
   enable_extension "plpgsql"
+  enable_extension "unaccent"
 
   create_table "account_conversations", force: :cascade do |t|
     t.bigint "account_id"
@@ -697,9 +699,9 @@ ActiveRecord::Schema.define(version: 2019_10_27_182731) do
     t.boolean "edited"
     t.boolean "imported"
     t.string "origin"
-    t.tsvector "tsv"
     t.boolean "boostable"
     t.boolean "reject_replies"
+    t.text "normalized_text", default: "", null: false
     t.index ["account_id", "id", "visibility", "updated_at"], name: "index_statuses_20180106", order: { id: :desc }
     t.index ["account_id", "id", "visibility"], name: "index_statuses_on_account_id_and_id_and_visibility", order: { id: :desc }, where: "(visibility = ANY (ARRAY[0, 1, 2, 4]))"
     t.index ["in_reply_to_account_id"], name: "index_statuses_on_in_reply_to_account_id"
@@ -707,7 +709,8 @@ ActiveRecord::Schema.define(version: 2019_10_27_182731) do
     t.index ["network"], name: "index_statuses_on_network", where: "network"
     t.index ["origin"], name: "index_statuses_on_origin", unique: true
     t.index ["reblog_of_id", "account_id"], name: "index_statuses_on_reblog_of_id_and_account_id"
-    t.index ["tsv"], name: "tsv_idx", using: :gin
+    t.index ["spoiler_text"], name: "index_statuses_on_spoiler_text_trgm", opclass: :gin_trgm_ops, using: :gin
+    t.index ["text"], name: "index_statuses_on_text_trgm", opclass: :gin_trgm_ops, using: :gin
     t.index ["uri"], name: "index_statuses_on_uri", unique: true
   end
 
diff --git a/dist/search.sql b/dist/search.sql
index eef191e60..e68b3c9b3 100644
--- a/dist/search.sql
+++ b/dist/search.sql
@@ -12,7 +12,11 @@ DROP TRIGGER IF EXISTS tsvectorupdate ON statuses;
 DROP FUNCTION IF EXISTS tsv_update_trigger;
 DROP INDEX IF EXISTS tsv_idx;
 ALTER TABLE statuses DROP COLUMN IF EXISTS tsv;
+DROP INDEX IF EXISTS index_statuses_on_text_trgm;
+DROP INDEX IF EXISTS index_statuses_on_spoiler_text_trgm;
 
 -- Create new trigram indexes --
-CREATE INDEX CONCURRENTLY IF NOT EXISTS index_statuses_on_text_trgm ON statuses USING GIN (text gin_trgm_ops);
-CREATE INDEX CONCURRENTLY IF NOT EXISTS index_statuses_on_spoiler_text_trgm ON statuses USING GIN (spoiler_text gin_trgm_ops);
+CREATE INDEX CONCURRENTLY IF NOT EXISTS index_statuses_on_normalized_text_trgm ON statuses USING GIN (normalized_text gin_trgm_ops);
+
+-- Compact tables ---
+VACUUM ANALYZE;
diff --git a/lib/tasks/monsterfork.rake b/lib/tasks/monsterfork.rake
new file mode 100644
index 000000000..e55bb1930
--- /dev/null
+++ b/lib/tasks/monsterfork.rake
@@ -0,0 +1,17 @@
+namespace :monsterfork do
+  desc '(Re-)Index statuses for search.'
+  task index_statuses: :environment do
+    include TextHelper
+    # Running count of statuses handled by earlier batches.
+    i = 0
+    total = Status.count
+    # Log the cumulative range covered by each batch, then rewrite normalized_text row by row.
+    Status.find_in_batches do |statuses|
+      ActiveRecord::Base.logger.info("Indexing statuses #{i + 1}-#{i + statuses.count} of #{total}.")
+      i += statuses.count
+      statuses.each do |s|
+        ActiveRecord::Base.logger.silence { s.update_column(:normalized_text, normalize_status(s)) }
+      end
+    end
+  end
+end