about summary refs log tree commit diff
path: root/app/lib
diff options
context:
space:
mode:
author Thibaut Girka <thib@sitedethib.com> 2019-03-16 13:52:55 +0100
committer Thibaut Girka <thib@sitedethib.com> 2019-03-16 14:14:20 +0100
commit 4df3b7cb067797982199c3e613ba32a9c7474348 (patch)
tree bf237d385411f73e670af4c744d8239ab4799434 /app/lib
parent c2fa0f7c40bcc4064e8baaa221665eadd391c001 (diff)
parent 1a0d3c9c65d663210494ec9b55912debad6331f5 (diff)
Merge branch 'master' into glitch-soc/merge-upstream
Conflicts:
- app/controllers/settings/follower_domains_controller.rb
  Removed upstream. Did the same here. Maybe we should not have?
- config/locales/en.yml
  Upstream removed the “Authorized followers” page and associated
  translations. This is too close in the file to our glitch-soc-specific
  “flavour” string. No actual conflict.
- config/locales/ja.yml
  Same as above.
- config/locales/pl.yml
  Same as above.
- config/navigation.rb
  No real conflict. New route added too close to the glitch-soc-specific
  “flavours” one.
- config/webpack/configuration.js
  Upstream refactored the webpack(er) configuration quite a bit.
  Tried to keep up.
- config/webpack/loaders/babel.js
  Upstream refactored the webpack(er) configuration quite a bit.
  Tried to keep up.
  The contents of this file have been moved to package.json.
- config/webpack/shared.js
  Upstream refactored the webpack(er) configuration quite a bit.
  Tried to keep up.
- config/webpacker.yml
  Upstream refactored the webpack(er) configuration quite a bit.
  Tried to keep up.
- jest.config.js
  The contents of this file have been moved to package.json.
- package.json
  Upstream refactored the webpack(er) configuration quite a bit.
  Tried to keep up.
- yarn.lock
  Upstream refactored the webpack(er) configuration quite a bit.
  Tried to keep up.
Diffstat (limited to 'app/lib')
-rw-r--r-- app/lib/language_detector.rb 31
1 files changed, 24 insertions, 7 deletions
diff --git a/app/lib/language_detector.rb b/app/lib/language_detector.rb
index 58c8e2069..70a9084d1 100644
--- a/app/lib/language_detector.rb
+++ b/app/lib/language_detector.rb
@@ -3,7 +3,8 @@
 class LanguageDetector
   include Singleton
 
-  CHARACTER_THRESHOLD = 140
+  CHARACTER_THRESHOLD    = 140
+  RELIABLE_CHARACTERS_RE = /[\p{Hebrew}\p{Arabic}\p{Syriac}\p{Thaana}\p{Nko}\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}]+/m
 
   def initialize
     @identifier = CLD3::NNetLanguageIdentifier.new(1, 2048)
@@ -11,15 +12,14 @@ class LanguageDetector
 
   def detect(text, account)
     input_text = prepare_text(text)
+
     return if input_text.blank?
 
     detect_language_code(input_text) || default_locale(account)
   end
 
   def language_names
-    @language_names =
-      CLD3::TaskContextParams::LANGUAGE_NAMES.map { |name| iso6391(name.to_s).to_sym }
-                                             .uniq
+    @language_names = CLD3::TaskContextParams::LANGUAGE_NAMES.map { |name| iso6391(name.to_s).to_sym }.uniq
   end
 
   private
@@ -29,12 +29,29 @@ class LanguageDetector
   end
 
   def unreliable_input?(text)
-    text.size < CHARACTER_THRESHOLD
+    !reliable_input?(text)
+  end
+
+  def reliable_input?(text)
+    sufficient_text_length?(text) || language_specific_character_set?(text)
+  end
+
+  def sufficient_text_length?(text)
+    text.size >= CHARACTER_THRESHOLD
+  end
+
+  def language_specific_character_set?(text)
+    words = text.scan(RELIABLE_CHARACTERS_RE)
+
+    if words.present?
+      words.reduce(0) { |acc, elem| acc + elem.size }.to_f / text.size.to_f > 0.3
+    else
+      false
+    end
   end
 
   def detect_language_code(text)
     return if unreliable_input?(text)
-
     result = @identifier.find_language(text)
     iso6391(result.language.to_s).to_sym if result.reliable?
   end
@@ -77,6 +94,6 @@ class LanguageDetector
   end
 
   def default_locale(account)
-    return account.user_locale&.to_sym || I18n.default_locale if account.local?
+    account.user_locale&.to_sym || I18n.default_locale if account.local?
   end
 end