diff options
-rw-r--r-- | Gemfile | 2 | ||||
-rw-r--r-- | Gemfile.lock | 4 | ||||
-rw-r--r-- | app/helpers/settings_helper.rb | 2 | ||||
-rw-r--r-- | app/lib/language_detector.rb | 39 | ||||
-rw-r--r-- | app/services/post_status_service.rb | 6 | ||||
-rw-r--r-- | spec/lib/language_detector_spec.rb | 34 | ||||
-rw-r--r-- | spec/services/post_status_service_spec.rb | 9 |
7 files changed, 43 insertions, 53 deletions
diff --git a/Gemfile b/Gemfile index 051f09f78..4f4861913 100644 --- a/Gemfile +++ b/Gemfile @@ -25,7 +25,7 @@ gem 'bootsnap' gem 'browser' gem 'charlock_holmes', '~> 0.7.5' gem 'iso-639' -gem 'cld3', '~> 3.1' +gem 'cld3', '~> 3.2.0' gem 'devise', '~> 4.2' gem 'devise-two-factor', '~> 3.0' gem 'doorkeeper', '~> 4.2' diff --git a/Gemfile.lock b/Gemfile.lock index 31893dc3f..f080f15e5 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -110,7 +110,7 @@ GEM activesupport charlock_holmes (0.7.5) chunky_png (1.3.8) - cld3 (3.1.3) + cld3 (3.2.0) ffi (>= 1.1.0, < 1.10.0) climate_control (0.2.0) cocaine (0.5.8) @@ -541,7 +541,7 @@ DEPENDENCIES capistrano-yarn (~> 2.0) capybara (~> 2.14) charlock_holmes (~> 0.7.5) - cld3 (~> 3.1) + cld3 (~> 3.2.0) climate_control (~> 0.2) devise (~> 4.2) devise-two-factor (~> 3.0) diff --git a/app/helpers/settings_helper.rb b/app/helpers/settings_helper.rb index 369a45680..14776b354 100644 --- a/app/helpers/settings_helper.rb +++ b/app/helpers/settings_helper.rb @@ -41,7 +41,7 @@ module SettingsHelper end def filterable_languages - I18n.available_locales.map { |locale| locale.to_s.split('-').first.to_sym }.uniq + LanguageDetector.instance.language_names.select(&HUMAN_LOCALES.method(:key?)) end def hash_to_object(hash) diff --git a/app/lib/language_detector.rb b/app/lib/language_detector.rb index 1d9932b52..a42460e10 100644 --- a/app/lib/language_detector.rb +++ b/app/lib/language_detector.rb @@ -1,26 +1,31 @@ # frozen_string_literal: true class LanguageDetector - attr_reader :text, :account + include Singleton - def initialize(text, account = nil) - @text = text - @account = account + def initialize @identifier = CLD3::NNetLanguageIdentifier.new(1, 2048) end - def to_iso_s - detected_language_code || default_locale + def detect(text, account) + detect_language_code(text) || default_locale(account) end - def prepared_text - simplified_text.strip + def language_names + @language_names = + CLD3::TaskContextParams::LANGUAGE_NAMES.map { |name| iso6391(name.to_s).to_sym } + .uniq end private - def detected_language_code - iso6391(result.language).to_sym if detected_language_reliable? + def prepare_text(text) + simplify_text(text).strip + end + + def detect_language_code(text) + result = @identifier.find_language(prepare_text(text)) + iso6391(result.language.to_s).to_sym if result.reliable? end def iso6391(bcp47) @@ -32,15 +37,7 @@ class LanguageDetector ISO_639.find(iso639).alpha2 end - def result - @result ||= @identifier.find_language(prepared_text) - end - - def detected_language_reliable? - result.reliable? - end - - def simplified_text + def simplify_text(text) text.dup.tap do |new_text| new_text.gsub!(FetchLinkCardService::URL_PATTERN, '') new_text.gsub!(Account::MENTION_RE, '') @@ -49,7 +46,7 @@ class LanguageDetector end end - def default_locale - account&.user_locale&.to_sym || nil + def default_locale(account) + account.user_locale&.to_sym end end diff --git a/app/services/post_status_service.rb b/app/services/post_status_service.rb index 97c55c4b2..e37cd94df 100644 --- a/app/services/post_status_service.rb +++ b/app/services/post_status_service.rb @@ -28,7 +28,7 @@ class PostStatusService < BaseService sensitive: options[:sensitive], spoiler_text: options[:spoiler_text] || '', visibility: options[:visibility] || account.user&.setting_default_privacy, - language: detect_language_for(text, account), + language: LanguageDetector.instance.detect(text, account), application: options[:application]) attach_media(status, media) @@ -69,10 +69,6 @@ class PostStatusService < BaseService media.update(status_id: status.id) end - def detect_language_for(text, account) - LanguageDetector.new(text, account).to_iso_s - end - def process_mentions_service @process_mentions_service ||= ProcessMentionsService.new end diff --git a/spec/lib/language_detector_spec.rb b/spec/lib/language_detector_spec.rb index ec39cb6a0..d17026511 100644 --- a/spec/lib/language_detector_spec.rb +++ b/spec/lib/language_detector_spec.rb @@ -3,10 +3,10 @@ require 'rails_helper' describe LanguageDetector do - describe 'prepared_text' do + describe 'prepare_text' do it 'returns unmodified string without special cases' do string = 'just a regular string' - result = described_class.new(string).prepared_text + result = described_class.instance.send(:prepare_text, string) expect(result).to eq string end @@ -14,33 +14,35 @@ describe LanguageDetector do it 'collapses spacing in strings' do string = 'The formatting in this is very odd' - result = described_class.new(string).prepared_text + result = described_class.instance.send(:prepare_text, string) expect(result).to eq 'The formatting in this is very odd' end it 'strips usernames from strings before detection' do string = '@username Yeah, very surreal...! also @friend' - result = described_class.new(string).prepared_text + result = described_class.instance.send(:prepare_text, string) expect(result).to eq 'Yeah, very surreal...! also' end it 'strips URLs from strings before detection' do string = 'Our website is https://example.com and also http://localhost.dev' - result = described_class.new(string).prepared_text + result = described_class.instance.send(:prepare_text, string) expect(result).to eq 'Our website is and also' end it 'strips #hashtags from strings before detection' do string = 'Hey look at all the #animals and #fish' - result = described_class.new(string).prepared_text + result = described_class.instance.send(:prepare_text, string) expect(result).to eq 'Hey look at all the and' end end - describe 'to_iso_s' do + describe 'detect' do + let(:account_without_user_locale) { Fabricate(:user, locale: nil).account } + it 'detects english language for basic strings' do strings = [ "Hello and welcome to mastodon how are you today?", @@ -48,7 +50,7 @@ describe LanguageDetector do "a lot of people just want to feel righteous all the time and that's all that matters", ] strings.each do |string| - result = described_class.new(string).to_iso_s + result = described_class.instance.detect(string, account_without_user_locale) expect(result).to eq(:en), string end @@ -56,14 +58,14 @@ describe LanguageDetector do it 'detects spanish language' do string = 'Obtener un Hola y bienvenidos a Mastodon' - result = described_class.new(string).to_iso_s + result = described_class.instance.detect(string, account_without_user_locale) expect(result).to eq :es end describe 'when language can\'t be detected' do it 'uses nil when sent an empty document' do - result = described_class.new('').to_iso_s + result = described_class.instance.detect('', account_without_user_locale) expect(result).to eq nil end @@ -73,7 +75,7 @@ describe LanguageDetector do cld_result = CLD3::NNetLanguageIdentifier.new(0, 2048).find_language(string) expect(cld_result).not_to eq :en - result = described_class.new(string).to_iso_s + result = described_class.instance.detect(string, account_without_user_locale) expect(result).to eq nil end @@ -82,14 +84,13 @@ describe LanguageDetector do describe 'with an account' do it 'uses the account locale when present' do account = double(user_locale: 'fr') - result = described_class.new('', account).to_iso_s + result = described_class.instance.detect('', account) expect(result).to eq :fr end it 'uses nil when account is present but has no locale' do - account = double(user_locale: nil) - result = described_class.new('', account).to_iso_s + result = described_class.instance.detect('', account_without_user_locale) expect(result).to eq nil end @@ -97,8 +98,7 @@ describe LanguageDetector do describe 'with an `en` default locale' do it 'uses nil for undetectable string' do - string = '' - result = described_class.new(string).to_iso_s + result = described_class.instance.detect('', account_without_user_locale) expect(result).to eq nil end @@ -114,7 +114,7 @@ describe LanguageDetector do it 'uses nil for undetectable string' do string = '' - result = described_class.new(string).to_iso_s + result = described_class.instance.detect(string, account_without_user_locale) expect(result).to eq nil end diff --git a/spec/services/post_status_service_spec.rb b/spec/services/post_status_service_spec.rb index 4182c4e1f..91902ff69 100644 --- a/spec/services/post_status_service_spec.rb +++ b/spec/services/post_status_service_spec.rb @@ -65,15 +65,12 @@ RSpec.describe PostStatusService do end it 'creates a status with a language set' do - detector = double(to_iso_s: :en) - allow(LanguageDetector).to receive(:new).and_return(detector) - account = Fabricate(:account) - text = 'test status text' + text = 'This is an English text.' - subject.call(account, text) + status = subject.call(account, text) - expect(LanguageDetector).to have_received(:new).with(text, account) + expect(status.language).to eq 'en' end it 'processes mentions' do |