about summary refs log tree commit diff
path: root/app
diff options
context:
space:
mode:
authorEugen Rochko <eugen@zeonfederated.com>2019-08-18 03:45:51 +0200
committerGitHub <noreply@github.com>2019-08-18 03:45:51 +0200
commitcc0a55cf9aead00e1cb044649f84c2187e0e4a35 (patch)
tree57c0920e813d7c8df9a0f483f111d2808f5045d3 /app
parent3a77090d015203a1ae20376ed69ca699eed3976d (diff)
Add more accurate hashtag search (#11579)
* Add more accurate hashtag search

Using ElasticSearch to index hashtags with edge n-grams and score
them by usage within the last 7 days since last activity. Only
hashtags that have been reviewed and are listable can appear in
searches, unless they match the query exactly

* Fix search analyzer dropping non-ascii characters
Diffstat (limited to 'app')
-rw-r--r--app/chewy/tags_index.rb37
-rw-r--r--app/models/tag.rb14
-rw-r--r--app/models/trending_tags.rb3
-rw-r--r--app/services/account_search_service.rb2
-rw-r--r--app/services/search_service.rb8
-rw-r--r--app/services/tag_search_service.rb82
6 files changed, 137 insertions, 9 deletions
diff --git a/app/chewy/tags_index.rb b/app/chewy/tags_index.rb
new file mode 100644
index 000000000..300fc128f
--- /dev/null
+++ b/app/chewy/tags_index.rb
@@ -0,0 +1,37 @@
+# frozen_string_literal: true
+
+class TagsIndex < Chewy::Index
+  settings index: { refresh_interval: '15m' }, analysis: {
+    analyzer: {
+      content: {
+        tokenizer: 'keyword',
+        filter: %w(lowercase asciifolding cjk_width),
+      },
+
+      edge_ngram: {
+        tokenizer: 'edge_ngram',
+        filter: %w(lowercase asciifolding cjk_width),
+      },
+    },
+
+    tokenizer: {
+      edge_ngram: {
+        type: 'edge_ngram',
+        min_gram: 2,
+        max_gram: 15,
+      },
+    },
+  }
+
+  define_type ::Tag.listable, delete_if: ->(tag) { tag.destroyed? || !tag.listable? } do
+    root date_detection: false do
+      field :name, type: 'text', analyzer: 'content' do
+        field :edge_ngram, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'content'
+      end
+
+      field :reviewed, type: 'boolean', value: ->(tag) { tag.reviewed? }
+      field :usage, type: 'long', value: ->(tag) { tag.history.reduce(0) { |total, day| total + day[:accounts].to_i } }
+      field :last_status_at, type: 'date', value: ->(tag) { tag.last_status_at || tag.created_at }
+    end
+  end
+end
diff --git a/app/models/tag.rb b/app/models/tag.rb
index 1364d1dba..5094d973d 100644
--- a/app/models/tag.rb
+++ b/app/models/tag.rb
@@ -13,6 +13,8 @@
 #  listable            :boolean
 #  reviewed_at         :datetime
 #  requested_review_at :datetime
+#  last_status_at      :datetime
+#  last_trend_at       :datetime
 #
 
 class Tag < ApplicationRecord
@@ -33,7 +35,8 @@ class Tag < ApplicationRecord
   scope :unreviewed, -> { where(reviewed_at: nil) }
   scope :pending_review, -> { unreviewed.where.not(requested_review_at: nil) }
   scope :usable, -> { where(usable: [true, nil]) }
-  scope :discoverable, -> { where(listable: [true, nil]).joins(:account_tag_stat).where(AccountTagStat.arel_table[:accounts_count].gt(0)).order(Arel.sql('account_tag_stats.accounts_count desc')) }
+  scope :listable, -> { where(listable: [true, nil]) }
+  scope :discoverable, -> { listable.joins(:account_tag_stat).where(AccountTagStat.arel_table[:accounts_count].gt(0)).order(Arel.sql('account_tag_stats.accounts_count desc')) }
   scope :most_used, ->(account) { joins(:statuses).where(statuses: { account: account }).group(:id).order(Arel.sql('count(*) desc')) }
 
   delegate :accounts_count,
@@ -44,6 +47,8 @@ class Tag < ApplicationRecord
 
   after_save :save_account_tag_stat
 
+  update_index('tags#tag', :self) if Chewy.enabled?
+
   def account_tag_stat
     super || build_account_tag_stat
   end
@@ -121,9 +126,10 @@ class Tag < ApplicationRecord
       normalized_term = normalize(term.strip).mb_chars.downcase.to_s
       pattern         = sanitize_sql_like(normalized_term) + '%'
 
-      Tag.where(arel_table[:name].lower.matches(pattern))
-         .where(arel_table[:score].gt(0).or(arel_table[:name].lower.eq(normalized_term)))
-         .order(Arel.sql('length(name) ASC, score DESC, name ASC'))
+      Tag.listable
+         .where(arel_table[:name].lower.matches(pattern))
+         .where(arel_table[:name].lower.eq(normalized_term).or(arel_table[:reviewed_at].not_eq(nil)))
+         .order(Arel.sql('length(name) ASC, name ASC'))
          .limit(limit)
          .offset(offset)
     end
diff --git a/app/models/trending_tags.rb b/app/models/trending_tags.rb
index 3d60a7fea..e4ce988c1 100644
--- a/app/models/trending_tags.rb
+++ b/app/models/trending_tags.rb
@@ -17,6 +17,9 @@ class TrendingTags
       increment_historical_use!(tag.id, at_time)
       increment_unique_use!(tag.id, account.id, at_time)
       increment_vote!(tag, at_time)
+
+      tag.update(last_status_at: Time.now.utc) if tag.last_status_at.nil? || tag.last_status_at < 12.hours.ago
+      tag.update(last_trend_at: Time.now.utc)  if trending?(tag) && (tag.last_trend_at.nil? || tag.last_trend_at < 12.hours.ago)
     end
 
     def get(limit, filtered: true)
diff --git a/app/services/account_search_service.rb b/app/services/account_search_service.rb
index d7bccdfe0..01caaefa9 100644
--- a/app/services/account_search_service.rb
+++ b/app/services/account_search_service.rb
@@ -109,7 +109,7 @@ class AccountSearchService < BaseService
       field_value_factor: {
         field: 'followers_count',
         modifier: 'log2p',
-        missing: 1,
+        missing: 0,
       },
     }
   end
diff --git a/app/services/search_service.rb b/app/services/search_service.rb
index 786d34b15..fe601bbf4 100644
--- a/app/services/search_service.rb
+++ b/app/services/search_service.rb
@@ -57,10 +57,10 @@ class SearchService < BaseService
   end
 
   def perform_hashtags_search!
-    Tag.search_for(
-      @query.gsub(/\A#/, ''),
-      @limit,
-      @offset
+    TagSearchService.new.call(
+      @query,
+      limit: @limit,
+      offset: @offset
     )
   end
 
diff --git a/app/services/tag_search_service.rb b/app/services/tag_search_service.rb
new file mode 100644
index 000000000..64dd76bb7
--- /dev/null
+++ b/app/services/tag_search_service.rb
@@ -0,0 +1,82 @@
+# frozen_string_literal: true
+
+class TagSearchService < BaseService
+  def call(query, options = {})
+    @query  = query.strip.gsub(/\A#/, '')
+    @offset = options[:offset].to_i
+    @limit  = options[:limit].to_i
+
+    if Chewy.enabled?
+      from_elasticsearch
+    else
+      from_database
+    end
+  end
+
+  private
+
+  def from_elasticsearch
+    query = {
+      function_score: {
+        query: {
+          multi_match: {
+            query: @query,
+            fields: %w(name.edge_ngram name),
+            type: 'most_fields',
+            operator: 'and',
+          },
+        },
+
+        functions: [
+          {
+            field_value_factor: {
+              field: 'usage',
+              modifier: 'log2p',
+              missing: 0,
+            },
+          },
+
+          {
+            gauss: {
+              last_status_at: {
+                scale: '7d',
+                offset: '14d',
+                decay: 0.5,
+              },
+            },
+          },
+        ],
+
+        boost_mode: 'multiply',
+      },
+    }
+
+    filter = {
+      bool: {
+        should: [
+          {
+            term: {
+              reviewed: {
+                value: true,
+              },
+            },
+          },
+
+          {
+            term: {
+              name: {
+                value: @query,
+              },
+            },
+          },
+        ],
+      },
+    }
+
+    TagsIndex.query(query).filter(filter).limit(@limit).offset(@offset).objects.compact
+  end
+
+  def from_database
+    Tag.search_for(@query, @limit, @offset)
+  end
+end