about summary refs log tree commit diff
path: root/app
diff options
context:
space:
mode:
author multiple creatures <dev@multiple-creature.party> 2019-11-16 21:01:07 -0600
committer multiple creatures <dev@multiple-creature.party> 2019-11-16 21:01:07 -0600
commit 1132af15151713f52d8d1e320271185865a79633 (patch)
tree c7d5dd6afb7a60c840a0d3238124f01c747eada8 /app
parent 487c945d160e9349579bf541147c949f8bca3c46 (diff)
Moved to using a normalized text column for searches. Admins using an FTS-enabled version of Monsterfork will need to apply the migration from `dist/search.sql` then run `bundle exec rails monsterfork:index_statuses`.
Diffstat (limited to 'app')
-rw-r--r-- app/helpers/filter_helper.rb 2
-rw-r--r-- app/helpers/search_helper.rb 5
-rw-r--r-- app/helpers/text_helper.rb 31
-rw-r--r-- app/lib/bangtags.rb 2
-rw-r--r-- app/models/status.rb 15
5 files changed, 49 insertions, 6 deletions
diff --git a/app/helpers/filter_helper.rb b/app/helpers/filter_helper.rb
index a4a86496f..77ffa98be 100644
--- a/app/helpers/filter_helper.rb
+++ b/app/helpers/filter_helper.rb
@@ -21,7 +21,7 @@ module FilterHelper
     return false if filters.empty?
 
     status = status.reblog if status.reblog?
-    status_text = Formatter.instance.plaintext(status)
+    status_text = status.normalized_text
     spoiler_text = status.spoiler_text
     tags = status.tags.pluck(:name).join("\n")
     descs = status.media_attachments.map { |a| a.description }.join("\n").strip
diff --git a/app/helpers/search_helper.rb b/app/helpers/search_helper.rb
index 96da161f1..0f3d09c36 100644
--- a/app/helpers/search_helper.rb
+++ b/app/helpers/search_helper.rb
@@ -1,6 +1,7 @@
-module SearchHelper
+require 'sixarm_ruby_unaccent'
 
+module SearchHelper
 	def expand_search_query(query)
-    query.gsub(/"(.*)"/, '\\y\1\\y')
+    query.downcase.unaccent.gsub(/"(.*)"/, '\\y\1\\y') # fold case/accents like normalized_text; "phrase" => \yphrase\y (presumably a word-boundary match; confirm)
   end
 end
diff --git a/app/helpers/text_helper.rb b/app/helpers/text_helper.rb
new file mode 100644
index 000000000..c042c0342
--- /dev/null
+++ b/app/helpers/text_helper.rb
@@ -0,0 +1,31 @@
+# coding: utf-8
+require 'htmlentities'
+require 'sixarm_ruby_unaccent'
+
+module TextHelper
+  # Reduces HTML (or plain text) to lowercase, tag-free, accent-free text with tidy whitespace.
+  def normalize_text(html)
+    t = html.downcase # downcase returns a new string, so the gsub! calls below never touch the caller's argument
+    # Opening block-level tags and <br>/<hr> become line breaks; every remaining tag is dropped.
+    t.gsub!(/<(?:p|pre|blockquote|code|h[1-6]|li)\b[^>]*>/, "\n")
+    t.gsub!(/<[bh]r[\/ ]*>/, "\n")
+    t.gsub!(/<\/?[^>]*>/, '')
+    # Entities are decoded only after tag stripping, so an escaped "&lt;b&gt;" survives as literal text.
+    t = HTMLEntities.new.decode(t)
+    # \302\240 is the UTF-8 byte pair of U+00A0 (no-break space); it and adjacent spaces/tabs fold to one space.
+    t.gsub!(/[ \t]*\302\240+[ \t]*/, ' ')
+    t.gsub!(/  +/, ' ')
+    # Unify line endings, trim spaces/tabs around newlines, then squeeze consecutive newlines.
+    t.gsub!(/\r\n?/, "\n")
+    t.gsub!(/\n[ \t]+/, "\n")
+    t.gsub!(/[ \t]+\n/, "\n")
+    t.gsub!(/\n\n+/, "\n")
+    # Strip accents (presumably the String extension from sixarm_ruby_unaccent; confirm) and outer whitespace.
+    t.unaccent_via_split_map.strip
+  end
+  # Normalizes spoiler text plus body: raw status.text for non-local statuses, Formatter output for local ones.
+  def normalize_status(status)
+    return normalize_text("#{status.spoiler_text}\n#{status.text}") unless status.local?
+    normalize_text("#{status.spoiler_text}\n#{Formatter.instance.format(status)}")
+  end
+end
diff --git a/app/lib/bangtags.rb b/app/lib/bangtags.rb
index 60fb426b3..d708683cb 100644
--- a/app/lib/bangtags.rb
+++ b/app/lib/bangtags.rb
@@ -720,7 +720,7 @@ class Bangtags
           q = cmd[1..-1].join.strip
           next if q.blank?
           begin
-            data = @account.statuses.where('text ~* ?', expand_search_query(q))
+            data = @account.statuses.where('normalized_text ~ ?', expand_search_query(q))
               .reorder(:created_at)
               .pluck(:created_at)
               .map { |d| d.strftime('%Y-%m') }
diff --git a/app/models/status.rb b/app/models/status.rb
index 29c4f6bd1..a2d2a8f28 100644
--- a/app/models/status.rb
+++ b/app/models/status.rb
@@ -31,9 +31,9 @@
 #  edited                 :boolean
 #  imported               :boolean
 #  origin                 :string
-#  tsv                    :tsvector
 #  boostable              :boolean
 #  reject_replies         :boolean
+#  normalized_text        :text             default(""), not null
 #
 
 class Status < ApplicationRecord
@@ -43,6 +43,7 @@ class Status < ApplicationRecord
   include Streamable
   include Cacheable
   include StatusThreadingConcern
+  include TextHelper
 
   # match both with and without U+FE0F (the emoji variation selector)
   LOCAL_ONLY_TOKENS = /(?:#!|\u{1f441}\ufe0f?)\u200b?\z/
@@ -324,6 +325,7 @@ class Status < ApplicationRecord
   around_create Mastodon::Snowflake::Callbacks
 
   before_create :set_locality
+  before_create :update_normalized_text
 
   before_validation :prepare_contents, if: :local?
   before_validation :set_reblog
@@ -334,6 +336,9 @@ class Status < ApplicationRecord
 
   after_create :set_poll_id
   after_create :process_bangtags, if: :local?
+  after_create :update_normalized_text
+
+  after_update :update_normalized_text
 
   class << self
     include SearchHelper
@@ -350,7 +355,7 @@ class Status < ApplicationRecord
       end
       return none if term.blank? || term.length < 3
       query = query.without_reblogs
-        .where('text ~* ?', expand_search_query(term))
+        .where('normalized_text ~ ?', expand_search_query(term))
         .offset(offset).limit(limit)
       apply_timeline_filters(query, account, true)
     rescue ActiveRecord::StatementInvalid
@@ -618,6 +623,12 @@ class Status < ApplicationRecord
     Bangtags.new(self).process
   end
 
+  def update_normalized_text
+    return unless (normalized_text.blank? && !text.blank?) || saved_change_to_text?
+    Rails.cache.delete("formatted_status:#{status.id}")
+    self.normalized_text = normalize_status(self)
+  end
+
   def set_conversation
     self.thread = thread.reblog if thread&.reblog?