From 8fdff2748ffbc4ce3d36c75f2bd33182b641e895 Mon Sep 17 00:00:00 2001 From: Eugen Rochko Date: Fri, 16 Aug 2019 01:24:03 +0200 Subject: Add more accurate account search (#11537) * Add more accurate account search When ElasticSearch is available, a more accurate search is implemented: - Using edge n-gram index for acct and display name - Using asciifolding and cjk width normalization on display names - Using Gaussian decay on account activity for additional scoring (recency) - Using followers/friends ratio for additional scoring (spamminess) - Using followers number for additional scoring (size) The exact match precedence only takes effect when the input conforms to the username format and the username part of it is complete, i.e. when the user started typing the domain part. * Support single-letter usernames * Fix tests * Fix not picking up account updates * Add weights and normalization for scores, skip zero terms queries * Use local counts for accounts index, adjust search parameters * Fix mistakes * Using updated_at of accounts is inadequate for remote accounts --- app/chewy/accounts_index.rb | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 app/chewy/accounts_index.rb (limited to 'app/chewy/accounts_index.rb') diff --git a/app/chewy/accounts_index.rb b/app/chewy/accounts_index.rb new file mode 100644 index 000000000..e11b80039 --- /dev/null +++ b/app/chewy/accounts_index.rb @@ -0,0 +1,36 @@ +# frozen_string_literal: true + +class AccountsIndex < Chewy::Index + settings index: { refresh_interval: '5m' }, analysis: { + analyzer: { + content: { + tokenizer: 'whitespace', + filter: %w(lowercase asciifolding cjk_width), + }, + + edge_ngram: { + tokenizer: 'edge_ngram', + filter: %w(lowercase asciifolding cjk_width), + }, + }, + + tokenizer: { + edge_ngram: { + type: 'edge_ngram', + min_gram: 1, + max_gram: 15, + }, + }, + } + + define_type ::Account.searchable.includes(:account_stat), delete_if: ->(account) { account.destroyed? || !account.searchable? } do + root date_detection: false do + field :id, type: 'long' + field :display_name, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'content' + field :acct, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'content', value: ->(account) { [account.username, account.domain].compact.join('@') } + field :following_count, type: 'long', value: ->(account) { account.active_relationships.count } + field :followers_count, type: 'long', value: ->(account) { account.passive_relationships.count } + field :last_status_at, type: 'date', value: ->(account) { account.last_status_at || account.created_at } + end + end +end -- cgit From 70da6d663078fb7d04aed387ac085afb2e9e2cd2 Mon Sep 17 00:00:00 2001 From: Eugen Rochko Date: Fri, 16 Aug 2019 13:00:30 +0200 Subject: Fix accounts search by full/partial display name and others (#11580) - Restrict followers counts to local users to minimize local advantage - Fix emoji shortcodes causing error in search - Fix search syntax parse errors not being caught --- app/chewy/accounts_index.rb | 15 +++++++++++---- app/lib/search_query_parser.rb | 19 ++++++++++--------- app/lib/search_query_transformer.rb | 2 ++ app/services/account_search_service.rb | 2 +- app/services/search_service.rb | 2 +- 5 files changed, 25 insertions(+), 15 deletions(-) (limited to 'app/chewy/accounts_index.rb') diff --git a/app/chewy/accounts_index.rb b/app/chewy/accounts_index.rb index e11b80039..b814e009e 100644 --- a/app/chewy/accounts_index.rb +++ b/app/chewy/accounts_index.rb @@ -26,10 +26,17 @@ class AccountsIndex < Chewy::Index define_type ::Account.searchable.includes(:account_stat), delete_if: ->(account) { account.destroyed? || !account.searchable? } do root date_detection: false do field :id, type: 'long' - field :display_name, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'content' - field :acct, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'content', value: ->(account) { [account.username, account.domain].compact.join('@') } - field :following_count, type: 'long', value: ->(account) { account.active_relationships.count } - field :followers_count, type: 'long', value: ->(account) { account.passive_relationships.count } + + field :display_name, type: 'text', analyzer: 'content' do + field :edge_ngram, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'content' + end + + field :acct, type: 'text', analyzer: 'content', value: ->(account) { [account.username, account.domain].compact.join('@') } do + field :edge_ngram, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'content' + end + + field :following_count, type: 'long', value: ->(account) { account.following.local.count } + field :followers_count, type: 'long', value: ->(account) { account.followers.local.count } field :last_status_at, type: 'date', value: ->(account) { account.last_status_at || account.created_at } end end diff --git a/app/lib/search_query_parser.rb b/app/lib/search_query_parser.rb index 405ad15b8..15956d4cf 100644 --- a/app/lib/search_query_parser.rb +++ b/app/lib/search_query_parser.rb @@ -1,14 +1,15 @@ # frozen_string_literal: true class SearchQueryParser < Parslet::Parser - rule(:term) { match('[^\s":]').repeat(1).as(:term) } - rule(:quote) { str('"') } - rule(:colon) { str(':') } - rule(:space) { match('\s').repeat(1) } - rule(:operator) { (str('+') | str('-')).as(:operator) } - rule(:prefix) { (term >> colon).as(:prefix) } - rule(:phrase) { (quote >> (term >> space.maybe).repeat >> quote).as(:phrase) } - rule(:clause) { (prefix.maybe >> operator.maybe >> (phrase | term)).as(:clause) } - rule(:query) { (clause >> space.maybe).repeat.as(:query) } + rule(:term) { match('[^\s":]').repeat(1).as(:term) } + rule(:quote) { str('"') } + rule(:colon) { str(':') } + rule(:space) { match('\s').repeat(1) } + rule(:operator) { (str('+') | str('-')).as(:operator) } + rule(:prefix) { (term >> colon).as(:prefix) } + rule(:shortcode) { (colon >> term >> colon.maybe).as(:shortcode) } + rule(:phrase) { (quote >> (term >> space.maybe).repeat >> quote).as(:phrase) } + rule(:clause) { (prefix.maybe >> operator.maybe >> (phrase | term | shortcode)).as(:clause) } + rule(:query) { (clause >> space.maybe).repeat.as(:query) } root(:query) end diff --git a/app/lib/search_query_transformer.rb b/app/lib/search_query_transformer.rb index 2c4144790..6a299f59d 100644 --- a/app/lib/search_query_transformer.rb +++ b/app/lib/search_query_transformer.rb @@ -75,6 +75,8 @@ class SearchQueryTransformer < Parslet::Transform if clause[:term] TermClause.new(prefix, operator, clause[:term].to_s) + elsif clause[:shortcode] + TermClause.new(prefix, operator, ":#{clause[:term]}:") elsif clause[:phrase] PhraseClause.new(prefix, operator, clause[:phrase].map { |p| p[:term].to_s }.join(' ')) else diff --git a/app/services/account_search_service.rb b/app/services/account_search_service.rb index 2d602a31d..d7bccdfe0 100644 --- a/app/services/account_search_service.rb +++ b/app/services/account_search_service.rb @@ -67,7 +67,7 @@ class AccountSearchService < BaseService end def from_elasticsearch - must_clauses = [{ multi_match: { query: terms_for_query, fields: likely_acct? ? %w(acct) : %w(acct^2 display_name), type: 'best_fields' } }] + must_clauses = [{ multi_match: { query: terms_for_query, fields: likely_acct? ? %w(acct.edge_ngram acct) : %w(acct.edge_ngram acct display_name.edge_ngram display_name), type: 'most_fields', operator: 'and' } }] should_clauses = [] if account diff --git a/app/services/search_service.rb b/app/services/search_service.rb index 769d1ac7a..786d34b15 100644 --- a/app/services/search_service.rb +++ b/app/services/search_service.rb @@ -52,7 +52,7 @@ class SearchService < BaseService preloaded_relations = relations_map_for_account(@account, account_ids, account_domains) results.reject { |status| StatusFilter.new(status, @account, preloaded_relations).filtered? } - rescue Faraday::ConnectionFailed + rescue Faraday::ConnectionFailed, Parslet::ParseFailed [] end -- cgit