1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
|
# frozen_string_literal: true
# Detects repeated statuses from one account by comparing a digest of the
# status text against digests this class previously remembered for the same
# account in a per-account Redis sorted set.
#
# Texts longer than NILSIMSA_MIN_SIZE characters are compared by Nilsimsa
# digest similarity; shorter texts are compared by exact MD5 equality.
class SpamCheck
  # NOTE(review): `redis` is not defined in this class; presumably it comes
  # from Redisable -- confirm.
  include Redisable
  include ActionView::Helpers::TextHelper

  # Minimum value of #nilsimsa_compare_value for two digests to count as a match
  NILSIMSA_COMPARE_THRESHOLD = 95

  # Texts must be longer than this many characters to be hashed with Nilsimsa
  NILSIMSA_MIN_SIZE = 10

  # Time-to-live applied to the per-account set every time a digest is remembered
  EXPIRE_SET_AFTER = 1.week.seconds

  # @param status [Status] the status to check; its account is the one that
  #   gets silenced and reported by #flag!
  def initialize(status)
    @account = status.account
    @status  = status
  end

  # @return [Boolean] true when the check does not apply: the account is
  #   already silenced or trusted, the status has no unsolicited mentions, or
  #   it is a reply to a status that mentioned the author
  def skip?
    already_flagged? || trusted? || no_unsolicited_mentions? || solicited_reply?
  end

  # @return [Boolean] true when the digest of this status matches any digest
  #   of the same algorithm remembered for the account; always false when
  #   there is no text to hash
  def spam?
    return false if insufficient_data?

    any_other_digest?(digest_algorithm) { |_, other_digest| digests_match?(other_digest) }
  end

  # Silences the account and files a report about it.
  def flag!
    auto_silence_account!
    auto_report_status!
  end

  # Stores the digest of this status in the account's sorted set.
  def remember!
    # The scores in sorted sets don't actually have enough bits to hold an
    # exact value of our snowflake IDs, so the score is used only for its
    # ordering property. To get the correct status ID back, it is also saved
    # inside the string member (see #digest_with_algorithm)
    redis.zadd(redis_key, @status.id, digest_with_algorithm)

    # Drop everything from the lowest rank up to the 10th-highest, so that at
    # most the 9 highest-scored entries remain
    redis.zremrangebyrank(redis_key, '0', '-10')
    redis.expire(redis_key, EXPIRE_SET_AFTER)
  end

  # Deletes every digest remembered for the account.
  def reset!
    redis.del(redis_key)
  end

  # The text that gets hashed: the status text without mentions (and without
  # HTML tags for non-local statuses), prefixed with the spoiler text,
  # NFKC-normalized, downcased, and with whitespace collapsed.
  #
  # The value is built in a local variable and memoized only once every step
  # has succeeded, so an exception part-way through never leaves a
  # half-processed text cached for later calls.
  #
  # @return [String]
  def hashable_text
    return @hashable_text if defined?(@hashable_text)

    text = remove_mentions(@status.text)
    text = strip_tags(text) unless @status.local?
    text = normalize_unicode(@status.spoiler_text + ' ' + text)

    @hashable_text = remove_whitespace(text)
  end

  # @return [Boolean] true when there is nothing left to hash
  def insufficient_data?
    hashable_text.blank?
  end

  # @return [String] hex digest of #hashable_text, using Nilsimsa or MD5
  #   depending on the text length (see #nilsimsa?)
  def digest
    @digest ||= if nilsimsa?
                  Nilsimsa.new(hashable_text).hexdigest
                else
                  Digest::MD5.hexdigest(hashable_text)
                end
  end

  # @return [String] the sorted-set member for this status, in the form
  #   "<algorithm>:<digest>:<status id>"
  def digest_with_algorithm
    [digest_algorithm, digest, @status.id].join(':')
  end

  private

  # Strips mentions from the text. Local statuses are matched with
  # Account::MENTION_RE; other statuses are parsed as HTML and every link
  # whose href equals the URL of a mentioned account is removed.
  def remove_mentions(text)
    return text.gsub(Account::MENTION_RE, '') if @status.local?

    Nokogiri::HTML.fragment(text).tap do |html|
      mention_urls = @status.mentions.map { |mention| ActivityPub::TagManager.instance.url_for(mention.account) }

      html.traverse do |element|
        element.unlink if element.name == 'a' && mention_urls.include?(element['href'])
      end
    end.to_s
  end

  def normalize_unicode(text)
    text.unicode_normalize(:nfkc).downcase
  end

  # Collapses every run of whitespace into a single space and trims the ends.
  def remove_whitespace(text)
    text.gsub(/\s+/, ' ').strip
  end

  def auto_silence_account!
    @account.silence!
  end

  # Reports the account. Status IDs are attached only when the current status
  # is distributable; otherwise `status_ids` is passed as nil.
  def auto_report_status!
    status_ids = reportable_status_ids if @status.distributable?

    ReportService.new.call(Account.representative, @account, status_ids: status_ids, comment: I18n.t('spam_check.spam_detected_and_silenced'))
  end

  # IDs of the public/unlisted statuses whose remembered digests match this
  # one, followed by the ID of the current status.
  def reportable_status_ids
    Status.where(visibility: %i(public unlisted)).where(id: matching_status_ids).pluck(:id) + [@status.id]
  end

  def already_flagged?
    @account.silenced?
  end

  def trusted?
    @account.trust_level > Account::TRUST_LEVELS[:untrusted]
  end

  # True when every mention is either silent, between two non-local accounts,
  # or addressed to an account that follows the author.
  def no_unsolicited_mentions?
    @status.mentions.all? { |mention| mention.silent? || (!@account.local? && !mention.account.local?) || mention.account.following?(@account) }
  end

  # True when the status is a reply to a status that mentioned the author.
  def solicited_reply?
    thread = @status.thread

    !thread.nil? && thread.mentions.where(account: @account).exists?
  end

  # Compares two Nilsimsa hex digests byte by byte over 32 bytes.
  # NOTE(review): Nilsimsa::POPC is assumed to map a byte value to the number
  # of set bits in it -- confirm against the gem.
  #
  # @return [Integer] 128 minus the summed POPC values of the XORed bytes, so
  #   -128 <= Nilsimsa Compare Value <= 128
  def nilsimsa_compare_value(first, second)
    first  = [first].pack('H*')
    second = [second].pack('H*')

    bits = (0..31).sum do |i|
      Nilsimsa::POPC[255 & (first[i].ord ^ second[i].ord)].ord
    end

    128 - bits
  end

  def nilsimsa?
    hashable_text.size > NILSIMSA_MIN_SIZE
  end

  # Name of the algorithm used for this status' digest, as stored in the
  # first field of the sorted-set member.
  def digest_algorithm
    nilsimsa? ? 'nilsimsa' : 'md5'
  end

  # Whether a remembered digest (of the same algorithm as ours) counts as a
  # duplicate of this status' digest.
  def digests_match?(other_digest)
    if nilsimsa?
      nilsimsa_compare_value(digest, other_digest) >= NILSIMSA_COMPARE_THRESHOLD
    else
      other_digest == digest
    end
  end

  # @return [Array<String>] every member remembered for the account
  def other_digests
    redis.zrange(redis_key, 0, -1)
  end

  # Splits a sorted-set member into [algorithm, digest, status_id].
  def parse_record(record)
    record.split(':')
  end

  # Yields algorithm, digest and status ID of each remembered record that was
  # produced by `filter_algorithm`, and returns true as soon as the block
  # returns a truthy value.
  def any_other_digest?(filter_algorithm)
    other_digests.any? do |record|
      algorithm, other_digest, status_id = parse_record(record)

      next unless algorithm == filter_algorithm

      yield algorithm, other_digest, status_id
    end
  end

  # @return [Array<String>] status IDs of the remembered records whose digest
  #   matches the digest of this status
  def matching_status_ids
    other_digests.map do |record|
      algorithm, other_digest, status_id = parse_record(record)

      status_id if algorithm == digest_algorithm && digests_match?(other_digest)
    end.compact
  end

  def redis_key
    @redis_key ||= "spam_check:#{@account.id}"
  end
end
|