about summary refs log tree commit diff
path: root/app/lib/spam_check.rb
blob: 68e586d00bd739c4c653d7144704f818ffee0d29 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
# frozen_string_literal: true

class SpamCheck
  include Redisable
  include ActionView::Helpers::TextHelper

  # Threshold over which two Nilsimsa values are considered
  # to refer to the same text
  NILSIMSA_COMPARE_THRESHOLD = 95

  # Nilsimsa doesn't work well on small inputs, so below
  # this size, we check only for exact matches with MD5
  NILSIMSA_MIN_SIZE = 10

  # How long to keep the trail of digests between updates,
  # there is no reason to store it forever
  EXPIRE_SET_AFTER = 1.week.seconds

  # How many digests to keep in an account's trail. If it's
  # too small, spam could rotate around different message templates
  MAX_TRAIL_SIZE = 10

  # How many detected duplicates to allow through before
  # considering the message as spam
  THRESHOLD = 5

  def initialize(status)
    @account = status.account
    @status  = status
  end

  def skip?
    disabled? || already_flagged? || trusted? || no_unsolicited_mentions? || solicited_reply?
  end

  def spam?
    if insufficient_data?
      false
    elsif nilsimsa?
      digests_over_threshold?('nilsimsa') { |_, other_digest| nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD }
    else
      digests_over_threshold?('md5') { |_, other_digest| other_digest == digest }
    end
  end

  def flag!
    auto_report_status!
  end

  def remember!
    # The scores in sorted sets don't actually have enough bits to hold an exact
    # value of our snowflake IDs, so we use it only for its ordering property. To
    # get the correct status ID back, we have to save it in the string value

    redis.zadd(redis_key, @status.id, digest_with_algorithm)
    redis.zremrangebyrank(redis_key, 0, -(MAX_TRAIL_SIZE + 1))
    redis.expire(redis_key, EXPIRE_SET_AFTER)
  end

  def reset!
    redis.del(redis_key)
  end

  def hashable_text
    return @hashable_text if defined?(@hashable_text)

    @hashable_text = @status.text
    @hashable_text = remove_mentions(@hashable_text)
    @hashable_text = strip_tags(@hashable_text) unless @status.local?
    @hashable_text = normalize_unicode(@status.spoiler_text + ' ' + @hashable_text)
    @hashable_text = remove_whitespace(@hashable_text)
  end

  def insufficient_data?
    hashable_text.blank?
  end

  def digest
    @digest ||= begin
      if nilsimsa?
        Nilsimsa.new(hashable_text).hexdigest
      else
        Digest::MD5.hexdigest(hashable_text)
      end
    end
  end

  def digest_with_algorithm
    if nilsimsa?
      ['nilsimsa', digest, @status.id].join(':')
    else
      ['md5', digest, @status.id].join(':')
    end
  end

  class << self
    def perform(status)
      spam_check = new(status)

      return if spam_check.skip?

      if spam_check.spam?
        spam_check.flag!
      else
        spam_check.remember!
      end
    end
  end

  private

  def disabled?
    !Setting.spam_check_enabled
  end

  def remove_mentions(text)
    return text.gsub(Account::MENTION_RE, '') if @status.local?

    Nokogiri::HTML.fragment(text).tap do |html|
      mentions = @status.mentions.map { |mention| ActivityPub::TagManager.instance.url_for(mention.account) }

      html.traverse do |element|
        element.unlink if element.name == 'a' && mentions.include?(element['href'])
      end
    end.to_s
  end

  def normalize_unicode(text)
    text.unicode_normalize(:nfkc).downcase
  end

  def remove_whitespace(text)
    text.gsub(/\s+/, ' ').strip
  end

  def auto_report_status!
    status_ids = Status.where(visibility: %i(public unlisted)).where(id: matching_status_ids).pluck(:id) + [@status.id] if @status.distributable?
    ReportService.new.call(Account.representative, @account, status_ids: status_ids, comment: I18n.t('spam_check.spam_detected'))
  end

  def already_flagged?
    @account.silenced? || @account.targeted_reports.unresolved.where(account_id: -99).exists?
  end

  def trusted?
    @account.trust_level > Account::TRUST_LEVELS[:untrusted] || (@account.local? && @account.user_staff?)
  end

  def no_unsolicited_mentions?
    @status.mentions.all? { |mention| mention.silent? || (!@account.local? && !mention.account.local?) || mention.account.following?(@account) }
  end

  def solicited_reply?
    !@status.thread.nil? && @status.thread.mentions.where(account: @account).exists?
  end

  def nilsimsa_compare_value(first, second)
    first  = [first].pack('H*')
    second = [second].pack('H*')
    bits   = 0

    0.upto(31) do |i|
      bits += Nilsimsa::POPC[255 & (first[i].ord ^ second[i].ord)].ord
    end

    128 - bits # -128 <= Nilsimsa Compare Value <= 128
  end

  def nilsimsa?
    hashable_text.size > NILSIMSA_MIN_SIZE
  end

  def other_digests
    redis.zrange(redis_key, 0, -1)
  end

  def digests_over_threshold?(filter_algorithm)
    other_digests.select do |record|
      algorithm, other_digest, status_id = record.split(':')

      next unless algorithm == filter_algorithm

      yield algorithm, other_digest, status_id
    end.size >= THRESHOLD
  end

  def matching_status_ids
    if nilsimsa?
      other_digests.select { |record| record.start_with?('nilsimsa') && nilsimsa_compare_value(digest, record.split(':')[1]) >= NILSIMSA_COMPARE_THRESHOLD }.filter_map { |record| record.split(':')[2] }
    else
      other_digests.select { |record| record.start_with?('md5') && record.split(':')[1] == digest }.filter_map { |record| record.split(':')[2] }
    end
  end

  def redis_key
    @redis_key ||= "spam_check:#{@account.id}"
  end
end