diff options
author | Renato "Lond" Cerqueira <renato@lond.com.br> | 2017-11-16 10:51:38 -0200 |
---|---|---|
committer | Eugen Rochko <eugen@zeonfederated.com> | 2017-11-16 13:51:38 +0100 |
commit | ad207456d64c76d21c17b26a954b459fe2dc0f54 (patch) | |
tree | af1fcf001fec2883f963c772ece944f8e29d3ed3 /app | |
parent | 9e3d24a150acf118789461797735fc0e4a8a30ea (diff) |
Improve language filter (#5724)
* Scrub text of HTML before detecting language.
* Detect language on statuses coming from ActivityPub.
* Fix RuboCop comments.
* Remove custom emoji from text before language detection.
Diffstat (limited to 'app')
-rw-r--r-- | app/lib/activitypub/activity/create.rb | 2 | ||||
-rw-r--r-- | app/lib/language_detector.rb | 31 |
2 files changed, 26 insertions, 7 deletions
diff --git a/app/lib/activitypub/activity/create.rb b/app/lib/activitypub/activity/create.rb
index 376684c00..66e4f7c5e 100644
--- a/app/lib/activitypub/activity/create.rb
+++ b/app/lib/activitypub/activity/create.rb
@@ -173,7 +173,7 @@ class ActivityPub::Activity::Create < ActivityPub::Activity
   end
 
   def language_from_content
-    return nil unless language_map?
+    return LanguageDetector.instance.detect(text_from_content, @account) unless language_map?
     @object['contentMap'].keys.first
   end
 
diff --git a/app/lib/language_detector.rb b/app/lib/language_detector.rb
index a42460e10..c6f52f0c7 100644
--- a/app/lib/language_detector.rb
+++ b/app/lib/language_detector.rb
@@ -38,12 +38,31 @@ class LanguageDetector
   end
 
   def simplify_text(text)
-    text.dup.tap do |new_text|
-      new_text.gsub!(FetchLinkCardService::URL_PATTERN, '')
-      new_text.gsub!(Account::MENTION_RE, '')
-      new_text.gsub!(Tag::HASHTAG_RE, '')
-      new_text.gsub!(/\s+/, ' ')
-    end
+    new_text = remove_html(text)
+    new_text.gsub!(FetchLinkCardService::URL_PATTERN, '')
+    new_text.gsub!(Account::MENTION_RE, '')
+    new_text.gsub!(Tag::HASHTAG_RE, '')
+    new_text.gsub!(/:#{CustomEmoji::SHORTCODE_RE_FRAGMENT}:/, '')
+    new_text.gsub!(/\s+/, ' ')
+    new_text
+  end
+
+  def new_scrubber
+    scrubber = Rails::Html::PermitScrubber.new
+    scrubber.tags = %w(br p)
+    scrubber
+  end
+
+  def scrubber
+    @scrubber ||= new_scrubber
+  end
+
+  def remove_html(text)
+    text = Loofah.fragment(text).scrub!(scrubber).to_s
+    text.gsub!('<br>', "\n")
+    text.gsub!('</p><p>', "\n\n")
+    text.gsub!(/(^<p>|<\/p>$)/, '')
+    text
   end
 
   def default_locale(account)