blob: 67ece4f3233f7f5ca900ff4acb6d06d75cd9cb88 (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
|
# coding: utf-8
require 'htmlentities'
require 'sixarm_ruby_unaccent'
# Helpers that reduce status HTML to a lowercase, accent-free plain-text form.
module TextHelper
  # Normalizes an HTML fragment to plain text.
  #
  # Steps, in order: lowercase; turn block-level opening tags and <br>/<hr>
  # into line breaks; drop all remaining tags; decode HTML entities; collapse
  # whitespace; and, when the text contains '#', hoist the hashtags reported
  # by Extractor.extract_hashtags onto a first line of their own. The result
  # is stripped and passed through String#unaccent_via_split_map.
  #
  # @param html [String, nil] markup to normalize; nil is treated as ''
  # @return [String] the normalized text
  def normalize_text(html)
    t = html.to_s.downcase

    # Opening block-level tags and <br>/<hr> (with or without attributes)
    # separate lines; every other tag is simply removed.
    t.gsub!(/<(?:p|pre|blockquote|code|h[1-6]|li)\b[^>]*>/, "\n")
    t.gsub!(/<[bh]r\b[^>]*>/, "\n")
    t.gsub!(/<\/?[^>]*>/, '')

    # Entities are decoded only after tag stripping, so text such as
    # "&lt;b&gt;" survives as literal characters. Decoding can reintroduce
    # uppercase letters (e.g. "&#65;"), hence the second downcase.
    t = HTMLEntities.new.decode(t).downcase

    # Runs of non-breaking spaces (plus adjacent blanks) become one space.
    t.gsub!(/[ \t]*\u00A0+[ \t]*/, ' ')
    t.gsub!(/ +/, ' ')
    t.gsub!(/\r\n?/, "\n")
    t.gsub!(/\n[ \t]+/, "\n")
    t.gsub!(/[ \t]+\n/, "\n")
    t.gsub!(/\n\n+/, "\n")

    return t.strip.unaccent_via_split_map unless t.include?('#')

    tags = Extractor.extract_hashtags(t).uniq

    # Remove hashtag runs at the start and end of each line (^ and $ are
    # per-line anchors in Ruby); hashtags in the middle of a line keep
    # their text and only lose the '#'.
    t.gsub!(/^(?:#[\w:._·\-]+\s*)+/, '')
    t.gsub!(/(?:#[\w:._·\-]+\s*)+$/, '')
    t.delete!('#')

    "#{tags.join(' ')}\n#{t.lstrip}".strip.unaccent_via_split_map
  end

  # Builds the normalized text of a status from its tag names, spoiler text,
  # body and media descriptions, joined by newlines in that order.
  #
  # The body of a local status is rendered with Formatter.instance.format;
  # otherwise status.text is used as is.
  #
  # @param status [#tags, #spoiler_text, #local?, #text, #media_attachments]
  # @return [String] the result of normalize_text on the combined text
  def normalize_status(status)
    sections = [
      status.tags.pluck(:name).join(' '),
      status.spoiler_text,
      status.local? ? Formatter.instance.format(status) : status.text,
      status.media_attachments.pluck(:description).join("\n"),
    ]

    # Array#join renders nil sections as '', just like string interpolation.
    normalize_text(sections.join("\n"))
  end
end
|