From d010e270e613f6299397601289158bd2acedbe8e Mon Sep 17 00:00:00 2001
From: Matt Jankowski
Date: Thu, 1 Jun 2017 09:29:14 -0400
Subject: Remove usernames and hashtags from language detection (#3503)

* Add failing specs for hashtag and username extraction in language detector

* Remove usernames and hashtags from text before language detection

* Handle multiple instances of special case, and reduce whitespace
---
 spec/lib/language_detector_spec.rb | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

(limited to 'spec/lib/language_detector_spec.rb')

diff --git a/spec/lib/language_detector_spec.rb b/spec/lib/language_detector_spec.rb
index e543edd49..ace7a326a 100644
--- a/spec/lib/language_detector_spec.rb
+++ b/spec/lib/language_detector_spec.rb
@@ -1,7 +1,45 @@
 # frozen_string_literal: true
+
 require 'rails_helper'
 
 describe LanguageDetector do
+  describe 'prepared_text' do
+    it 'returns unmodified string without special cases' do
+      string = 'just a regular string'
+      result = described_class.new(string).prepared_text
+
+      expect(result).to eq string
+    end
+
+    it 'collapses spacing in strings' do
+      string = 'The formatting   in this is very        odd'
+
+      result = described_class.new(string).prepared_text
+      expect(result).to eq 'The formatting in this is very odd'
+    end
+
+    it 'strips usernames from strings before detection' do
+      string = '@username Yeah, very surreal...! also @friend'
+
+      result = described_class.new(string).prepared_text
+      expect(result).to eq 'Yeah, very surreal...! also'
+    end
+
+    it 'strips URLs from strings before detection' do
+      string = 'Our website is https://example.com and also http://localhost.dev'
+
+      result = described_class.new(string).prepared_text
+      expect(result).to eq 'Our website is and also'
+    end
+
+    it 'strips #hashtags from strings before detection' do
+      string = 'Hey look at all the #animals and #fish'
+
+      result = described_class.new(string).prepared_text
+      expect(result).to eq 'Hey look at all the and'
+    end
+  end
+
   describe 'to_iso_s' do
     it 'detects english language for basic strings' do
       strings = [
--
cgit