about summary refs log tree commit diff
path: root/app/lib/language_detector.rb
diff options
context:
space:
mode:
authorRenato "Lond" Cerqueira <renato@lond.com.br>2017-11-16 10:51:38 -0200
committerEugen Rochko <eugen@zeonfederated.com>2017-11-16 13:51:38 +0100
commitad207456d64c76d21c17b26a954b459fe2dc0f54 (patch)
treeaf1fcf001fec2883f963c772ece944f8e29d3ed3 /app/lib/language_detector.rb
parent9e3d24a150acf118789461797735fc0e4a8a30ea (diff)
Improve language filter (#5724)
* Scrub text of html before detecting language.

* Detect language on statuses coming from activitypub.

* Fix rubocop comments.

* Remove custom emoji from text before language detection
Diffstat (limited to 'app/lib/language_detector.rb')
-rw-r--r--app/lib/language_detector.rb31
1 files changed, 25 insertions, 6 deletions
diff --git a/app/lib/language_detector.rb b/app/lib/language_detector.rb
index a42460e10..c6f52f0c7 100644
--- a/app/lib/language_detector.rb
+++ b/app/lib/language_detector.rb
@@ -38,12 +38,31 @@ class LanguageDetector
   end
 
   def simplify_text(text)
-    text.dup.tap do |new_text|
-      new_text.gsub!(FetchLinkCardService::URL_PATTERN, '')
-      new_text.gsub!(Account::MENTION_RE, '')
-      new_text.gsub!(Tag::HASHTAG_RE, '')
-      new_text.gsub!(/\s+/, ' ')
-    end
+    new_text = remove_html(text)
+    new_text.gsub!(FetchLinkCardService::URL_PATTERN, '')
+    new_text.gsub!(Account::MENTION_RE, '')
+    new_text.gsub!(Tag::HASHTAG_RE, '')
+    new_text.gsub!(/:#{CustomEmoji::SHORTCODE_RE_FRAGMENT}:/, '')
+    new_text.gsub!(/\s+/, ' ')
+    new_text
+  end
+
+  def new_scrubber
+    scrubber = Rails::Html::PermitScrubber.new
+    scrubber.tags = %w(br p)
+    scrubber
+  end
+
+  def scrubber
+    @scrubber ||= new_scrubber
+  end
+
+  def remove_html(text)
+    text = Loofah.fragment(text).scrub!(scrubber).to_s
+    text.gsub!('<br>', "\n")
+    text.gsub!('</p><p>', "\n\n")
+    text.gsub!(/(^<p>|<\/p>$)/, '')
+    text
   end
 
   def default_locale(account)