about summary refs log tree commit diff
path: root/app/lib/language_detector.rb
diff options
context:
space:
mode:
author Matt Jankowski <mjankowski@thoughtbot.com> 2017-06-01 09:29:14 -0400
committer GitHub <noreply@github.com> 2017-06-01 09:29:14 -0400
commit d010e270e613f6299397601289158bd2acedbe8e (patch)
tree 6bc639705b9e53be8efd6ff3b7fe791b5a9e13ad /app/lib/language_detector.rb
parent d1e08bd38c029f0b47dfd2f3ba61ca5bb3e414b8 (diff)
Remove usernames and hashtags from language detection (#3503)
* Add failing specs for hashtag and username extraction in language detector

* Remove usernames and hashtags from text before language detection

* Handle multiple instances of special case, and reduce whitespace
Diffstat (limited to 'app/lib/language_detector.rb')
-rw-r--r-- app/lib/language_detector.rb 11
1 files changed, 9 insertions, 2 deletions
diff --git a/app/lib/language_detector.rb b/app/lib/language_detector.rb
index 1c22a9ccc..0d087f8d1 100644
--- a/app/lib/language_detector.rb
+++ b/app/lib/language_detector.rb
@@ -13,6 +13,10 @@ class LanguageDetector
     detected_language_code || default_locale.to_sym
   end
 
+  def prepared_text
+    simplified_text.strip
+  end
+
   private
 
   def detected_language_code
@@ -20,18 +24,21 @@ class LanguageDetector
   end
 
   def result
-    @result ||= @identifier.find_language(text_without_urls)
+    @result ||= @identifier.find_language(prepared_text)
   end
 
   def detected_language_reliable?
     result.reliable?
   end
 
-  def text_without_urls
+  def simplified_text
     text.dup.tap do |new_text|
       URI.extract(new_text).each do |url|
         new_text.gsub!(url, '')
       end
+      new_text.gsub!(Account::MENTION_RE, '')
+      new_text.gsub!(Tag::HASHTAG_RE, '')
+      new_text.gsub!(/\s+/, ' ')
     end
   end