From cb4ef24ac9d48e70648135f106fdc275dedf14fc Mon Sep 17 00:00:00 2001 From: David Yip Date: Wed, 15 Nov 2017 17:26:29 -0600 Subject: Match keyword mute filter on hashtags. #208. It is reasonable to expect someone to enter #foo to mute hashtag #foo. However, tags are recorded on statuses without the preceding #. To adjust for this, we build a separate tag matcher and use Tag::HASHTAG_RE to extract a hashtag from the hashtag syntax. --- app/models/glitch/keyword_mute.rb | 70 +++++++++++++++++++++++++++++---------- 1 file changed, 52 insertions(+), 18 deletions(-) (limited to 'app/models') diff --git a/app/models/glitch/keyword_mute.rb b/app/models/glitch/keyword_mute.rb index 009de1880..733dd0bc8 100644 --- a/app/models/glitch/keyword_mute.rb +++ b/app/models/glitch/keyword_mute.rb @@ -16,51 +16,85 @@ class Glitch::KeywordMute < ApplicationRecord validates_presence_of :keyword - after_commit :invalidate_cached_matcher + after_commit :invalidate_cached_matchers - def self.matcher_for(account_id) - Matcher.new(account_id) + def self.text_matcher_for(account_id) + TextMatcher.new(account_id) + end + + def self.tag_matcher_for(account_id) + TagMatcher.new(account_id) end private - def invalidate_cached_matcher - Rails.cache.delete("keyword_mutes:regex:#{account_id}") + def invalidate_cached_matchers + Rails.cache.delete(TextMatcher.cache_key(account_id)) + Rails.cache.delete(TagMatcher.cache_key(account_id)) end - class Matcher + class RegexpMatcher attr_reader :account_id attr_reader :regex def initialize(account_id) @account_id = account_id - regex_text = Rails.cache.fetch("keyword_mutes:regex:#{account_id}") { regex_text_for_account } + regex_text = Rails.cache.fetch(self.class.cache_key(account_id)) { make_regex_text } @regex = /#{regex_text}/ end + protected + + def keywords + Glitch::KeywordMute.where(account_id: account_id).pluck(:whole_word, :keyword) + end + + def boundary_regex_for_keyword(keyword) + sb = keyword =~ /\A[[:word:]]/ ? '\b' : '' + eb = keyword =~ /[[:word:]]\Z/ ? '\b' : '' + + /(?mix:#{sb}#{Regexp.escape(keyword)}#{eb})/ + end + end + + class TextMatcher < RegexpMatcher + def self.cache_key(account_id) + format('keyword_mutes:regex:%s', account_id) + end + def =~(str) regex =~ str end private - def keywords - Glitch::KeywordMute.where(account_id: account_id).select(:keyword, :id, :whole_word) - end - - def regex_text_for_account - kws = keywords.find_each.with_object([]) do |kw, a| - a << (kw.whole_word ? boundary_regex_for_keyword(kw.keyword) : kw.keyword) + def make_regex_text + kws = keywords.map! do |whole_word, keyword| + whole_word ? boundary_regex_for_keyword(keyword) : keyword end Regexp.union(kws).source end + end - def boundary_regex_for_keyword(keyword) - sb = keyword =~ /\A[[:word:]]/ ? '\b' : '' - eb = keyword =~ /[[:word:]]\Z/ ? '\b' : '' + class TagMatcher < RegexpMatcher + def self.cache_key(account_id) + format('keyword_mutes:tag:%s', account_id) + end - /(?mix:#{sb}#{Regexp.escape(keyword)}#{eb})/ + def =~(tags) + tags.pluck(:name).detect { |n| regex =~ n } + end + + private + + def make_regex_text + kws = keywords.map! do |whole_word, keyword| + term = (Tag::HASHTAG_RE =~ keyword) ? $1 : keyword + whole_word ? boundary_regex_for_keyword(term) : term + end + + Regexp.union(kws).source end end end -- cgit From 8fc54890e50de024683d9b763b8ed8f82d46433a Mon Sep 17 00:00:00 2001 From: David Yip Date: Wed, 15 Nov 2017 18:08:03 -0600 Subject: Prefix cache keys with the matcher type. #208. We already know about one regex limitation, which is that they cannot segment words in e.g. Japanese, Chinese, or Thai. It may also end up that regex matching is too slow compared to other methods. However, the regex is an implementation detail. We still want the ability to switch between "occurs anywhere" and "match whole word", and caching the matcher result is likely to still be important (since the matcher itself won't change nearly as often as status ingress rate). Therefore, we ought to be able to change the cache keys to reflect a change of data structure. (Old cache keys expire within minutes, so they shouldn't be too big of an issue. Old cache keys could also be explicitly removed by an instance administrator.) --- app/models/glitch/keyword_mute.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'app/models') diff --git a/app/models/glitch/keyword_mute.rb b/app/models/glitch/keyword_mute.rb index 733dd0bc8..001be9201 100644 --- a/app/models/glitch/keyword_mute.rb +++ b/app/models/glitch/keyword_mute.rb @@ -59,7 +59,7 @@ class Glitch::KeywordMute < ApplicationRecord class TextMatcher < RegexpMatcher def self.cache_key(account_id) - format('keyword_mutes:regex:%s', account_id) + format('keyword_mutes:regex:text:%s', account_id) end def =~(str) @@ -79,7 +79,7 @@ class Glitch::KeywordMute < ApplicationRecord class TagMatcher < RegexpMatcher def self.cache_key(account_id) - format('keyword_mutes:tag:%s', account_id) + format('keyword_mutes:regex:tag:%s', account_id) end def =~(tags) -- cgit From 08652baab018e65cfc4e1fbcfc909ff01f96ea60 Mon Sep 17 00:00:00 2001 From: David Yip Date: Wed, 15 Nov 2017 18:26:21 -0600 Subject: Replace =~ with #matches?. #208. =~ made sense when we were passing it through to a regex, but we're no longer doing that: TagMatcher looks at individual tags and returns a value that *looks* like what you get out of #=~ but really isn't that meaningful. Probably a good idea to not subvert convention like this and instead use a name with guessable intent. --- app/lib/feed_manager.rb | 14 +++++++------- app/models/glitch/keyword_mute.rb | 8 ++++---- spec/models/glitch/keyword_mute_spec.rb | 28 ++++++++++++++-------------- 3 files changed, 25 insertions(+), 25 deletions(-) (limited to 'app/models') diff --git a/app/lib/feed_manager.rb b/app/lib/feed_manager.rb index e12a5c016..a99606e1b 100644 --- a/app/lib/feed_manager.rb +++ b/app/lib/feed_manager.rb @@ -174,19 +174,19 @@ class FeedManager text_matcher = Glitch::KeywordMute.text_matcher_for(receiver_id) tag_matcher = Glitch::KeywordMute.tag_matcher_for(receiver_id) - should_filter = text_matcher =~ status.text - should_filter ||= text_matcher =~ status.spoiler_text - should_filter ||= tag_matcher =~ status.tags + should_filter = text_matcher.matches?(status.text) + should_filter ||= text_matcher.matches?(status.spoiler_text) + should_filter ||= tag_matcher.matches?(status.tags) if status.reblog? reblog = status.reblog - should_filter ||= text_matcher =~ reblog.text - should_filter ||= text_matcher =~ reblog.spoiler_text - should_filter ||= tag_matcher =~ status.tags + should_filter ||= text_matcher.matches?(reblog.text) + should_filter ||= text_matcher.matches?(reblog.spoiler_text) + should_filter ||= tag_matcher.matches?(status.tags) end - !!should_filter + should_filter end def filter_from_mentions?(status, receiver_id) diff --git a/app/models/glitch/keyword_mute.rb b/app/models/glitch/keyword_mute.rb index 001be9201..a2481308f 100644 --- a/app/models/glitch/keyword_mute.rb +++ b/app/models/glitch/keyword_mute.rb @@ -62,8 +62,8 @@ class Glitch::KeywordMute < ApplicationRecord format('keyword_mutes:regex:text:%s', account_id) end - def =~(str) - regex =~ str + def matches?(str) + !!(regex =~ str) end private @@ -82,8 +82,8 @@ class Glitch::KeywordMute < ApplicationRecord format('keyword_mutes:regex:tag:%s', account_id) end - def =~(tags) - tags.pluck(:name).detect { |n| regex =~ n } + def matches?(tags) + tags.pluck(:name).any? { |n| regex =~ n } end private diff --git a/spec/models/glitch/keyword_mute_spec.rb b/spec/models/glitch/keyword_mute_spec.rb index e14af0e6a..bab276c26 100644 --- a/spec/models/glitch/keyword_mute_spec.rb +++ b/spec/models/glitch/keyword_mute_spec.rb @@ -13,7 +13,7 @@ RSpec.describe Glitch::KeywordMute, type: :model do end it 'does not match' do - expect(matcher =~ 'This is a hot take').to be_falsy + expect(matcher.matches?('This is a hot take')).to be_falsy end end @@ -21,75 +21,75 @@ RSpec.describe Glitch::KeywordMute, type: :model do it 'does not match keywords set by a different account' do Glitch::KeywordMute.create!(account: bob, keyword: 'take') - expect(matcher =~ 'This is a hot take').to be_falsy + expect(matcher.matches?('This is a hot take')).to be_falsy end it 'does not match if no keywords match the status text' do Glitch::KeywordMute.create!(account: alice, keyword: 'cold') - expect(matcher =~ 'This is a hot take').to be_falsy + expect(matcher.matches?('This is a hot take')).to be_falsy end it 'considers word boundaries when matching' do Glitch::KeywordMute.create!(account: alice, keyword: 'bob', whole_word: true) - expect(matcher =~ 'bobcats').to be_falsy + expect(matcher.matches?('bobcats')).to be_falsy end it 'matches substrings if whole_word is false' do Glitch::KeywordMute.create!(account: alice, keyword: 'take', whole_word: false) - expect(matcher =~ 'This is a shiitake mushroom').to be_truthy + expect(matcher.matches?('This is a shiitake mushroom')).to be_truthy end it 'matches keywords at the beginning of the text' do Glitch::KeywordMute.create!(account: alice, keyword: 'take') - expect(matcher =~ 'Take this').to be_truthy + expect(matcher.matches?('Take this')).to be_truthy end it 'matches keywords at the end of the text' do Glitch::KeywordMute.create!(account: alice, keyword: 'take') - expect(matcher =~ 'This is a hot take').to be_truthy + expect(matcher.matches?('This is a hot take')).to be_truthy end it 'matches if at least one keyword case-insensitively matches the text' do Glitch::KeywordMute.create!(account: alice, keyword: 'hot') - expect(matcher =~ 'This is a HOT take').to be_truthy + expect(matcher.matches?('This is a HOT take')).to be_truthy end it 'maintains case-insensitivity when combining keywords into a single matcher' do Glitch::KeywordMute.create!(account: alice, keyword: 'hot') Glitch::KeywordMute.create!(account: alice, keyword: 'cold') - expect(matcher =~ 'This is a HOT take').to be_truthy + expect(matcher.matches?('This is a HOT take')).to be_truthy end it 'matches keywords surrounded by non-alphanumeric ornamentation' do Glitch::KeywordMute.create!(account: alice, keyword: 'hot') - expect(matcher =~ '(hot take)').to be_truthy + expect(matcher.matches?('(hot take)')).to be_truthy end it 'escapes metacharacters in keywords' do Glitch::KeywordMute.create!(account: alice, keyword: '(hot take)') - expect(matcher =~ '(hot take)').to be_truthy + expect(matcher.matches?('(hot take)')).to be_truthy end it 'uses case-folding rules appropriate for more than just English' do Glitch::KeywordMute.create!(account: alice, keyword: 'großeltern') - expect(matcher =~ 'besuch der grosseltern').to be_truthy + expect(matcher.matches?('besuch der grosseltern')).to be_truthy end it 'matches keywords that are composed of multiple words' do Glitch::KeywordMute.create!(account: alice, keyword: 'a shiitake') - expect(matcher =~ 'This is a shiitake').to be_truthy - expect(matcher =~ 'This is shiitake').to_not be_truthy + expect(matcher.matches?('This is a shiitake')).to be_truthy + expect(matcher.matches?('This is shiitake')).to_not be_truthy end end end -- cgit