diff options
author | Matt Jankowski <mjankowski@thoughtbot.com> | 2017-06-01 09:29:14 -0400 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-06-01 09:29:14 -0400 |
commit | d010e270e613f6299397601289158bd2acedbe8e (patch) | |
tree | 6bc639705b9e53be8efd6ff3b7fe791b5a9e13ad /app/lib | |
parent | d1e08bd38c029f0b47dfd2f3ba61ca5bb3e414b8 (diff) |
Remove usernames and hashtags from language detection (#3503)
* Add failing specs for hashtag and username extraction in language detector
* Remove usernames and hashtags from text before language detection
* Handle multiple instances of special case, and reduce whitespace
Diffstat (limited to 'app/lib')
-rw-r--r-- | app/lib/language_detector.rb | 11 |
1 files changed, 9 insertions, 2 deletions
```diff
diff --git a/app/lib/language_detector.rb b/app/lib/language_detector.rb
index 1c22a9ccc..0d087f8d1 100644
--- a/app/lib/language_detector.rb
+++ b/app/lib/language_detector.rb
@@ -13,6 +13,10 @@ class LanguageDetector
     detected_language_code || default_locale.to_sym
   end
 
+  def prepared_text
+    simplified_text.strip
+  end
+
   private
 
   def detected_language_code
@@ -20,18 +24,21 @@ class LanguageDetector
   end
 
   def result
-    @result ||= @identifier.find_language(text_without_urls)
+    @result ||= @identifier.find_language(prepared_text)
   end
 
   def detected_language_reliable?
     result.reliable?
   end
 
-  def text_without_urls
+  def simplified_text
     text.dup.tap do |new_text|
       URI.extract(new_text).each do |url|
         new_text.gsub!(url, '')
       end
+      new_text.gsub!(Account::MENTION_RE, '')
+      new_text.gsub!(Tag::HASHTAG_RE, '')
+      new_text.gsub!(/\s+/, ' ')
     end
   end
 
```