about summary refs log tree commit diff
path: root/app/lib/spam_check.rb
blob: 923d48a02233cbc8c345bfbbd12699b1e5f725e8 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# frozen_string_literal: true

class SpamCheck
  include Redisable
  include ActionView::Helpers::TextHelper

  NILSIMSA_COMPARE_THRESHOLD = 95
  NILSIMSA_MIN_SIZE          = 10
  EXPIRE_SET_AFTER           = 1.week.seconds

  def initialize(status)
    @account = status.account
    @status  = status
  end

  def skip?
    already_flagged? || trusted? || no_unsolicited_mentions? || solicited_reply?
  end

  def spam?
    if insufficient_data?
      false
    elsif nilsimsa?
      any_other_digest?('nilsimsa') { |_, other_digest| nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD }
    else
      any_other_digest?('md5') { |_, other_digest| other_digest == digest }
    end
  end

  def flag!
    auto_silence_account!
    auto_report_status!
  end

  def remember!
    # The scores in sorted sets don't actually have enough bits to hold an exact
    # value of our snowflake IDs, so we use it only for its ordering property. To
    # get the correct status ID back, we have to save it in the string value

    redis.zadd(redis_key, @status.id, digest_with_algorithm)
    redis.zremrangebyrank(redis_key, '0', '-10')
    redis.expire(redis_key, EXPIRE_SET_AFTER)
  end

  def reset!
    redis.del(redis_key)
  end

  def hashable_text
    return @hashable_text if defined?(@hashable_text)

    @hashable_text = @status.text
    @hashable_text = remove_mentions(@hashable_text)
    @hashable_text = strip_tags(@hashable_text) unless @status.local?
    @hashable_text = normalize_unicode(@status.spoiler_text + ' ' + @hashable_text)
    @hashable_text = remove_whitespace(@hashable_text)
  end

  def insufficient_data?
    hashable_text.blank?
  end

  def digest
    @digest ||= begin
      if nilsimsa?
        Nilsimsa.new(hashable_text).hexdigest
      else
        Digest::MD5.hexdigest(hashable_text)
      end
    end
  end

  def digest_with_algorithm
    if nilsimsa?
      ['nilsimsa', digest, @status.id].join(':')
    else
      ['md5', digest, @status.id].join(':')
    end
  end

  private

  def remove_mentions(text)
    return text.gsub(Account::MENTION_RE, '') if @status.local?

    Nokogiri::HTML.fragment(text).tap do |html|
      mentions = @status.mentions.map { |mention| ActivityPub::TagManager.instance.url_for(mention.account) }

      html.traverse do |element|
        element.unlink if element.name == 'a' && mentions.include?(element['href'])
      end
    end.to_s
  end

  def normalize_unicode(text)
    text.unicode_normalize(:nfkc).downcase
  end

  def remove_whitespace(text)
    text.gsub(/\s+/, ' ').strip
  end

  def auto_silence_account!
    @account.silence!
  end

  def auto_report_status!
    status_ids = Status.where(visibility: %i(public unlisted)).where(id: matching_status_ids).pluck(:id) + [@status.id] if @status.distributable?
    ReportService.new.call(Account.representative, @account, status_ids: status_ids, comment: I18n.t('spam_check.spam_detected_and_silenced'))
  end

  def already_flagged?
    @account.silenced?
  end

  def trusted?
    @account.trust_level > Account::TRUST_LEVELS[:untrusted]
  end

  def no_unsolicited_mentions?
    @status.mentions.all? { |mention| mention.silent? || (!@account.local? && !mention.account.local?) || mention.account.following?(@account) }
  end

  def solicited_reply?
    !@status.thread.nil? && @status.thread.mentions.where(account: @account).exists?
  end

  def nilsimsa_compare_value(first, second)
    first  = [first].pack('H*')
    second = [second].pack('H*')
    bits   = 0

    0.upto(31) do |i|
      bits += Nilsimsa::POPC[255 & (first[i].ord ^ second[i].ord)].ord
    end

    128 - bits # -128 <= Nilsimsa Compare Value <= 128
  end

  def nilsimsa?
    hashable_text.size > NILSIMSA_MIN_SIZE
  end

  def other_digests
    redis.zrange(redis_key, 0, -1)
  end

  def any_other_digest?(filter_algorithm)
    other_digests.any? do |record|
      algorithm, other_digest, status_id = record.split(':')

      next unless algorithm == filter_algorithm

      yield algorithm, other_digest, status_id
    end
  end

  def matching_status_ids
    if nilsimsa?
      other_digests.select { |record| record.start_with?('nilsimsa') && nilsimsa_compare_value(digest, record.split(':')[1]) >= NILSIMSA_COMPARE_THRESHOLD }.map { |record| record.split(':')[2] }.compact
    else
      other_digests.select { |record| record.start_with?('md5') && record.split(':')[1] == digest }.map { |record| record.split(':')[2] }.compact
    end
  end

  def redis_key
    @redis_key ||= "spam_check:#{@account.id}"
  end
end