From d010e270e613f6299397601289158bd2acedbe8e Mon Sep 17 00:00:00 2001
From: Matt Jankowski
Date: Thu, 1 Jun 2017 09:29:14 -0400
Subject: Remove usernames and hashtags from language detection (#3503)

* Add failing specs for hashtag and username extraction in language detector
* Remove usernames and hashtags from text before language detection
* Handle multiple instances of special case, and reduce whitespace
---
 app/lib/language_detector.rb | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'app/lib/language_detector.rb')

diff --git a/app/lib/language_detector.rb b/app/lib/language_detector.rb
index 1c22a9ccc..0d087f8d1 100644
--- a/app/lib/language_detector.rb
+++ b/app/lib/language_detector.rb
@@ -13,6 +13,10 @@ class LanguageDetector
     detected_language_code || default_locale.to_sym
   end
 
+  def prepared_text
+    simplified_text.strip
+  end
+
   private
 
   def detected_language_code
@@ -20,18 +24,21 @@ class LanguageDetector
   end
 
   def result
-    @result ||= @identifier.find_language(text_without_urls)
+    @result ||= @identifier.find_language(prepared_text)
   end
 
   def detected_language_reliable?
     result.reliable?
   end
 
-  def text_without_urls
+  def simplified_text
     text.dup.tap do |new_text|
       URI.extract(new_text).each do |url|
         new_text.gsub!(url, '')
       end
+      new_text.gsub!(Account::MENTION_RE, '')
+      new_text.gsub!(Tag::HASHTAG_RE, '')
+      new_text.gsub!(/\s+/, ' ')
     end
   end
 
--
cgit