From 4f6af87906175d9ea802ef0c6f050388eac890fa Mon Sep 17 00:00:00 2001
From: Eugen Rochko
Date: Wed, 18 Sep 2019 12:53:13 +0200
Subject: Change spam check to apply to local accounts and add a threshold (#11806)

Instead of detecting spam on first duplicate message, add a threshold
of 5 such messages to reduce false positives
---
 app/lib/activitypub/activity/create.rb | 10 +-------
 app/lib/spam_check.rb                  | 46 ++++++++++++++++++++++++++++------
 2 files changed, 39 insertions(+), 17 deletions(-)

(limited to 'app/lib')

diff --git a/app/lib/activitypub/activity/create.rb b/app/lib/activitypub/activity/create.rb
index dea7fd43c..e69193b71 100644
--- a/app/lib/activitypub/activity/create.rb
+++ b/app/lib/activitypub/activity/create.rb
@@ -408,15 +408,7 @@ class ActivityPub::Activity::Create < ActivityPub::Activity
   end
 
   def check_for_spam
-    spam_check = SpamCheck.new(@status)
-
-    return if spam_check.skip?
-
-    if spam_check.spam?
-      spam_check.flag!
-    else
-      spam_check.remember!
-    end
+    SpamCheck.perform(@status)
   end
 
   def forward_for_reply
diff --git a/app/lib/spam_check.rb b/app/lib/spam_check.rb
index 0cf1b8790..441697364 100644
--- a/app/lib/spam_check.rb
+++ b/app/lib/spam_check.rb
@@ -4,9 +4,25 @@ class SpamCheck
   include Redisable
   include ActionView::Helpers::TextHelper
 
+  # Threshold over which two Nilsimsa values are considered
+  # to refer to the same text
   NILSIMSA_COMPARE_THRESHOLD = 95
-  NILSIMSA_MIN_SIZE = 10
-  EXPIRE_SET_AFTER = 1.week.seconds
+
+  # Nilsimsa doesn't work well on small inputs, so below
+  # this size, we check only for exact matches with MD5
+  NILSIMSA_MIN_SIZE = 10
+
+  # How long to keep the trail of digests between updates,
+  # there is no reason to store it forever
+  EXPIRE_SET_AFTER = 1.week.seconds
+
+  # How many digests to keep in an account's trail. If it's
+  # too small, spam could rotate around different message templates
+  MAX_TRAIL_SIZE = 10
+
+  # How many detected duplicates to allow through before
+  # considering the message as spam
+  THRESHOLD = 5
 
   def initialize(status)
     @account = status.account
@@ -21,9 +37,9 @@ class SpamCheck
     if insufficient_data?
       false
     elsif nilsimsa?
-      any_other_digest?('nilsimsa') { |_, other_digest| nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD }
+      digests_over_threshold?('nilsimsa') { |_, other_digest| nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD }
     else
-      any_other_digest?('md5') { |_, other_digest| other_digest == digest }
+      digests_over_threshold?('md5') { |_, other_digest| other_digest == digest }
     end
   end
 
@@ -38,7 +54,7 @@ class SpamCheck
     # get the correct status ID back, we have to save it in the string value
     redis.zadd(redis_key, @status.id, digest_with_algorithm)
-    redis.zremrangebyrank(redis_key, '0', '-10')
+    redis.zremrangebyrank(redis_key, 0, -(MAX_TRAIL_SIZE + 1))
     redis.expire(redis_key, EXPIRE_SET_AFTER)
   end
 
@@ -78,6 +94,20 @@ class SpamCheck
     end
   end
 
+  class << self
+    def perform(status)
+      spam_check = new(status)
+
+      return if spam_check.skip?
+
+      if spam_check.spam?
+        spam_check.flag!
+      else
+        spam_check.remember!
+      end
+    end
+  end
+
   private
 
   def disabled?
@@ -149,14 +179,14 @@ class SpamCheck
     redis.zrange(redis_key, 0, -1)
   end
 
-  def any_other_digest?(filter_algorithm)
-    other_digests.any? do |record|
+  def digests_over_threshold?(filter_algorithm)
+    other_digests.select do |record|
       algorithm, other_digest, status_id = record.split(':')
 
       next unless algorithm == filter_algorithm
 
       yield algorithm, other_digest, status_id
-    end
+    end.size >= THRESHOLD
  end
 
   def matching_status_ids
--
cgit
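
To make the new behaviour concrete outside of Mastodon, here is a minimal standalone sketch of the same idea, assuming only the "redis" gem and a local Redis server: each account keeps a trail of message digests in a Redis sorted set, the trail is capped at the 10 most recent entries, and a message counts as spam only once at least 5 remembered digests match it. TrailSpamCheck, its constants and its key layout are illustrative stand-ins rather than Mastodon's SpamCheck, and the Nilsimsa fuzzy-matching path from the patch is left out.

require 'digest/md5'
require 'redis'

# Minimal sketch only: TrailSpamCheck is an illustrative stand-in for the
# patched SpamCheck class, covering just the MD5 exact-match path.
class TrailSpamCheck
  MAX_TRAIL_SIZE   = 10              # digests kept per account
  THRESHOLD        = 5               # matching digests tolerated before flagging
  EXPIRE_SET_AFTER = 7 * 24 * 3600   # keep the trail for one week (seconds)

  def initialize(redis: Redis.new)
    @redis = redis
  end

  # True once THRESHOLD or more digests already remembered for this account
  # match the digest of the new text.
  def spam?(account_id, text)
    digest = Digest::MD5.hexdigest(text)
    trail(account_id).count { |member| member.split(':').first == digest } >= THRESHOLD
  end

  # Store "digest:status_id" in the account's sorted set, scored by status ID,
  # trim it to the newest MAX_TRAIL_SIZE members and refresh its TTL,
  # mirroring remember! in the patch.
  def remember!(account_id, text, status_id)
    key = redis_key(account_id)
    @redis.zadd(key, status_id, "#{Digest::MD5.hexdigest(text)}:#{status_id}")
    @redis.zremrangebyrank(key, 0, -(MAX_TRAIL_SIZE + 1))
    @redis.expire(key, EXPIRE_SET_AFTER)
  end

  private

  def trail(account_id)
    @redis.zrange(redis_key(account_id), 0, -1)
  end

  def redis_key(account_id)
    "spam_check:#{account_id}"
  end
end

# Example flow, analogous to SpamCheck.perform:
#   check = TrailSpamCheck.new
#   if check.spam?(account_id, text)
#     # flag the status / silence the account
#   else
#     check.remember!(account_id, text, status_id)
#   end

A caller would follow the same flow as SpamCheck.perform in the patch: check spam? first, flag on a hit, and otherwise remember! the digest so it counts toward later checks. Storing "digest:status_id" as the set member with the status ID as the score is what lets identical digests accumulate as separate entries and be trimmed in posting order.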