diff options
author | Eugen Rochko <eugen@zeonfederated.com> | 2019-07-13 16:45:50 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-07-13 16:45:50 +0200 |
commit | 6ff67be0f6e79ec403e08c69717ee8c89451c70e (patch) | |
tree | 1746025bdf7b3113ddb373e9fb137877952d2503 /app | |
parent | 402302776c82c3853e723fe0c0c4dc99c69da3d9 (diff) |
Add a spam check (#11217)
* Add a spam check * Use Nilsimsa to generate locality-sensitive hashes and compare using Levenshtein distance * Add more tests * Add exemption when the message is a reply to something that mentions the sender * Use Nilsimsa Compare Value instead of Levenshtein distance * Use MD5 for messages shorter than 10 characters * Add message to automated report, do not add non-public statuses to automated report, add trust level to accounts and make unsilencing raise the trust level to prevent repeated spam checks on that account * Expire spam check data after 3 months * Add support for local statuses, reduce expiration to 1 week, always create a report * Add content warnings to the spam check and exempt empty statuses * Change Nilsimsa threshold to 95 and make sure removed statuses are removed from the spam check * Add all matched statuses into automatic report
Diffstat (limited to 'app')
-rw-r--r-- | app/lib/activitypub/activity/create.rb | 13 | ||||
-rw-r--r-- | app/lib/spam_check.rb | 169 | ||||
-rw-r--r-- | app/models/account.rb | 18 | ||||
-rw-r--r-- | app/services/remove_status_service.rb | 5 |
4 files changed, 200 insertions, 5 deletions
diff --git a/app/lib/activitypub/activity/create.rb b/app/lib/activitypub/activity/create.rb index 5849c20d7..56c24680a 100644 --- a/app/lib/activitypub/activity/create.rb +++ b/app/lib/activitypub/activity/create.rb @@ -41,6 +41,7 @@ class ActivityPub::Activity::Create < ActivityPub::Activity resolve_thread(@status) fetch_replies(@status) + check_for_spam distribute(@status) forward_for_reply if @status.distributable? end @@ -406,6 +407,18 @@ class ActivityPub::Activity::Create < ActivityPub::Activity Account.local.where(username: local_usernames).exists? end + def check_for_spam + spam_check = SpamCheck.new(@status) + + return if spam_check.skip? + + if spam_check.spam? + spam_check.flag! + else + spam_check.remember! + end + end + def forward_for_reply return unless @json['signature'].present? && reply_to_local? ActivityPub::RawDistributionWorker.perform_async(Oj.dump(@json), replied_to_status.account_id, [@account.preferred_inbox_url]) diff --git a/app/lib/spam_check.rb b/app/lib/spam_check.rb new file mode 100644 index 000000000..923d48a02 --- /dev/null +++ b/app/lib/spam_check.rb @@ -0,0 +1,169 @@ +# frozen_string_literal: true + +class SpamCheck + include Redisable + include ActionView::Helpers::TextHelper + + NILSIMSA_COMPARE_THRESHOLD = 95 + NILSIMSA_MIN_SIZE = 10 + EXPIRE_SET_AFTER = 1.week.seconds + + def initialize(status) + @account = status.account + @status = status + end + + def skip? + already_flagged? || trusted? || no_unsolicited_mentions? || solicited_reply? + end + + def spam? + if insufficient_data? + false + elsif nilsimsa? + any_other_digest?('nilsimsa') { |_, other_digest| nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD } + else + any_other_digest?('md5') { |_, other_digest| other_digest == digest } + end + end + + def flag! + auto_silence_account! + auto_report_status! + end + + def remember! + # The scores in sorted sets don't actually have enough bits to hold an exact + # value of our snowflake IDs, so we use it only for its ordering property. To + # get the correct status ID back, we have to save it in the string value + + redis.zadd(redis_key, @status.id, digest_with_algorithm) + redis.zremrangebyrank(redis_key, '0', '-10') + redis.expire(redis_key, EXPIRE_SET_AFTER) + end + + def reset! + redis.del(redis_key) + end + + def hashable_text + return @hashable_text if defined?(@hashable_text) + + @hashable_text = @status.text + @hashable_text = remove_mentions(@hashable_text) + @hashable_text = strip_tags(@hashable_text) unless @status.local? + @hashable_text = normalize_unicode(@status.spoiler_text + ' ' + @hashable_text) + @hashable_text = remove_whitespace(@hashable_text) + end + + def insufficient_data? + hashable_text.blank? + end + + def digest + @digest ||= begin + if nilsimsa? + Nilsimsa.new(hashable_text).hexdigest + else + Digest::MD5.hexdigest(hashable_text) + end + end + end + + def digest_with_algorithm + if nilsimsa? + ['nilsimsa', digest, @status.id].join(':') + else + ['md5', digest, @status.id].join(':') + end + end + + private + + def remove_mentions(text) + return text.gsub(Account::MENTION_RE, '') if @status.local? + + Nokogiri::HTML.fragment(text).tap do |html| + mentions = @status.mentions.map { |mention| ActivityPub::TagManager.instance.url_for(mention.account) } + + html.traverse do |element| + element.unlink if element.name == 'a' && mentions.include?(element['href']) + end + end.to_s + end + + def normalize_unicode(text) + text.unicode_normalize(:nfkc).downcase + end + + def remove_whitespace(text) + text.gsub(/\s+/, ' ').strip + end + + def auto_silence_account! + @account.silence! + end + + def auto_report_status! + status_ids = Status.where(visibility: %i(public unlisted)).where(id: matching_status_ids).pluck(:id) + [@status.id] if @status.distributable? + ReportService.new.call(Account.representative, @account, status_ids: status_ids, comment: I18n.t('spam_check.spam_detected_and_silenced')) + end + + def already_flagged? + @account.silenced? + end + + def trusted? + @account.trust_level > Account::TRUST_LEVELS[:untrusted] + end + + def no_unsolicited_mentions? + @status.mentions.all? { |mention| mention.silent? || (!@account.local? && !mention.account.local?) || mention.account.following?(@account) } + end + + def solicited_reply? + !@status.thread.nil? && @status.thread.mentions.where(account: @account).exists? + end + + def nilsimsa_compare_value(first, second) + first = [first].pack('H*') + second = [second].pack('H*') + bits = 0 + + 0.upto(31) do |i| + bits += Nilsimsa::POPC[255 & (first[i].ord ^ second[i].ord)].ord + end + + 128 - bits # -128 <= Nilsimsa Compare Value <= 128 + end + + def nilsimsa? + hashable_text.size > NILSIMSA_MIN_SIZE + end + + def other_digests + redis.zrange(redis_key, 0, -1) + end + + def any_other_digest?(filter_algorithm) + other_digests.any? do |record| + algorithm, other_digest, status_id = record.split(':') + + next unless algorithm == filter_algorithm + + yield algorithm, other_digest, status_id + end + end + + def matching_status_ids + if nilsimsa? + other_digests.select { |record| record.start_with?('nilsimsa') && nilsimsa_compare_value(digest, record.split(':')[1]) >= NILSIMSA_COMPARE_THRESHOLD }.map { |record| record.split(':')[2] }.compact + else + other_digests.select { |record| record.start_with?('md5') && record.split(':')[1] == digest }.map { |record| record.split(':')[2] }.compact + end + end + + def redis_key + @redis_key ||= "spam_check:#{@account.id}" + end +end diff --git a/app/models/account.rb b/app/models/account.rb index d6772eb98..a22b7fd7c 100644 --- a/app/models/account.rb +++ b/app/models/account.rb @@ -45,6 +45,7 @@ # also_known_as :string is an Array # silenced_at :datetime # suspended_at :datetime +# trust_level :integer # class Account < ApplicationRecord @@ -62,6 +63,11 @@ class Account < ApplicationRecord include AccountCounters include DomainNormalizable + TRUST_LEVELS = { + untrusted: 0, + trusted: 1, + }.freeze + enum protocol: [:ostatus, :activitypub] validates :username, presence: true @@ -163,6 +169,10 @@ class Account < ApplicationRecord last_webfingered_at.nil? || last_webfingered_at <= 1.day.ago end + def trust_level + self[:trust_level] || 0 + end + def refresh! ResolveAccountService.new.call(acct) unless local? end @@ -171,21 +181,19 @@ class Account < ApplicationRecord silenced_at.present? end - def silence!(date = nil) - date ||= Time.now.utc + def silence!(date = Time.now.utc) update!(silenced_at: date) end def unsilence! - update!(silenced_at: nil) + update!(silenced_at: nil, trust_level: trust_level == TRUST_LEVELS[:untrusted] ? TRUST_LEVELS[:trusted] : trust_level) end def suspended? suspended_at.present? end - def suspend!(date = nil) - date ||= Time.now.utc + def suspend!(date = Time.now.utc) transaction do user&.disable! if local? update!(suspended_at: date) diff --git a/app/services/remove_status_service.rb b/app/services/remove_status_service.rb index 6311971ff..a69fce8b8 100644 --- a/app/services/remove_status_service.rb +++ b/app/services/remove_status_service.rb @@ -23,6 +23,7 @@ class RemoveStatusService < BaseService remove_from_hashtags remove_from_public remove_from_media if status.media_attachments.any? + remove_from_spam_check @status.destroy! else @@ -142,6 +143,10 @@ class RemoveStatusService < BaseService redis.publish('timeline:public:local:media', @payload) if @status.local? end + def remove_from_spam_check + redis.zremrangebyscore("spam_check:#{@status.account_id}", @status.id, @status.id) + end + def lock_options { redis: Redis.current, key: "distribute:#{@status.id}" } end |