# coding: utf-8
require 'htmlentities'
require 'sixarm_ruby_unaccent'
# Helpers that reduce HTML, free-form text and statuses to a normalized,
# line-oriented plain-text representation.
module TextHelper
  # Converts an HTML fragment to plain text.
  #
  # Opening tags of p, pre, blockquote, code, h1-h6 and li, as well as
  # <br>/<hr> tags, are replaced with a newline; every remaining tag is
  # dropped. Entities are decoded last so that escaped markup such as
  # "&lt;b&gt;" is not mistaken for a real tag and stripped.
  #
  # @param html [String] HTML source
  # @return the value returned by HTMLEntities#decode for the stripped text
  def html2text(html)
    stripped = html
      .gsub(/<(?:p|pre|blockquote|code|h[1-6]|li)\b[^>]*>/, "\n")
      .gsub(/<[bh]r[\/ ]*>/, "\n")
      .gsub(/<\/?[^>]*>/, '')
    HTMLEntities.new.decode(stripped)
  end

  # Normalizes free-form text: lowercased, mentions removed, hashtag runs at
  # the start/end of a line removed, accents folded, URL schemes dropped,
  # characters other than word characters, spaces, punctuation and newlines
  # deleted, and whitespace collapsed.
  #
  # The returned string never starts or ends with whitespace, and no line in
  # it is empty or has leading/trailing spaces.
  #
  # @param text [String] raw text
  # @return [String] normalized text
  def normalize_text(text)
    text.downcase
      .gsub(Account::MENTION_RE, '')
      .gsub(/^(?:#[\w:._·\-]+\s*)+|(?:#[\w:._·\-]+\s*)+$/, '')
      # \302\240 is the UTF-8 byte sequence of U+00A0 (no-break space); the
      # run and any whitespace touching it collapse to one plain space.
      .gsub(/\s*\302\240+\s*/, ' ')
      # Convert CRLF and bare CR to LF before any newline handling so that all
      # three line-ending styles are treated identically.
      .gsub(/\r\n?/, "\n")
      .unaccent_via_split_map
      .gsub(/(?:htt|ft)ps?:\/\//, '')
      # Deletes everything else, including tabs and other control whitespace.
      .gsub(/[^\n\p{Word} [:punct:]]/, '')
      .gsub(/ +/, ' ')
      # Done after filtering: only "\n" and single spaces remain as
      # whitespace here, and filtering may have emptied whole lines. Collapse
      # each newline run together with the spaces on both sides of it.
      .gsub(/ ?\n[ \n]*/, "\n")
      .strip
  end

  # Builds the normalized representation of a status: the tag lines, the
  # spoiler line, the text lines and the media description lines, in that
  # order, separated by newlines and stripped. Sections that return nil
  # contribute an empty line before stripping.
  #
  # @param status [#tags, #spoiler_text, #text, #local?, #media_attachments]
  # @return [String]
  def normalize_status(status)
    [
      _format_tags(status),
      _format_spoiler(status),
      _format_status(status),
      _format_desc(status),
    ].join("\n").strip
  end

  # @return [String, nil] one "tag <name>" line per tag, or nil when the
  #   status has no tags
  def _format_tags(status)
    return if status.tags.blank?

    "tag #{status.tags.pluck(:name).join("\ntag ")}"
  end

  # @return [String, nil] "subj <normalized spoiler text>", or nil when the
  #   spoiler text is blank
  def _format_spoiler(status)
    return if status.spoiler_text.blank?

    "subj #{normalize_text(status.spoiler_text)}"
  end

  # Emits the status body with every line prefixed by "text ".
  #
  # Local statuses go through Formatter (presumably yielding HTML; confirm
  # against Formatter#format); other statuses use the stored text. Either way
  # the result passes through html2text and normalize_text.
  #
  # @return [String, nil] nil when the source text is blank
  def _format_status(status)
    text = status.local? ? Formatter.instance.format(status) : status.text
    return if text.blank?

    "text #{normalize_text(html2text(text)).gsub("\n", "\ntext ")}"
  end

  # Emits the media descriptions with every line prefixed by "desc ".
  #
  # Each description is normalized on its own and then split into lines, so
  # hashtag stripping applies equally to every description, blank
  # descriptions contribute nothing, and continuation lines of a multi-line
  # description also carry the prefix (matching _format_status).
  #
  # @return [String, nil] nil when the status has no media attachments; a
  #   bare "desc " when attachments exist but none has usable description
  #   text (kept for backward compatibility with the previous output)
  def _format_desc(status)
    return if status.media_attachments.blank?

    lines = status.media_attachments
      .pluck(:description)
      .compact
      .flat_map { |description| normalize_text(description).split("\n") }
    "desc #{lines.join("\ndesc ")}"
  end
end