From b6d7726ecbc833abd00f6a9d36b24d9776cfe623 Mon Sep 17 00:00:00 2001 From: Eugen Rochko Date: Tue, 8 Feb 2022 02:41:17 +0100 Subject: Remove language detection through cld3 (#17478) * Remove language detection through cld3 * Update app/helpers/languages_helper.rb Co-authored-by: Yamagishi Kazutoshi Co-authored-by: Yamagishi Kazutoshi --- Gemfile | 2 - Gemfile.lock | 5 - app/helpers/languages_helper.rb | 291 +++++++++++++++------ app/helpers/settings_helper.rb | 2 +- app/lib/activitypub/activity/create.rb | 6 +- app/lib/language_detector.rb | 101 ------- app/lib/link_details_extractor.rb | 9 +- app/models/user.rb | 4 + .../activitypub/process_status_update_service.rb | 6 +- app/services/post_status_service.rb | 7 +- app/validators/import_validator.rb | 2 + .../settings/preferences/other/show.html.haml | 4 +- config/locales/en.yml | 2 +- lib/tasks/repo.rake | 5 +- spec/helpers/languages_helper_spec.rb | 6 +- spec/lib/language_detector_spec.rb | 134 ---------- 16 files changed, 238 insertions(+), 348 deletions(-) delete mode 100644 app/lib/language_detector.rb delete mode 100644 spec/lib/language_detector_spec.rb diff --git a/Gemfile b/Gemfile index afed1ac94..e869e5f7a 100644 --- a/Gemfile +++ b/Gemfile @@ -29,9 +29,7 @@ gem 'addressable', '~> 2.8' gem 'bootsnap', '~> 1.10.2', require: false gem 'browser' gem 'charlock_holmes', '~> 0.7.7' -gem 'iso-639' gem 'chewy', '~> 7.2' -gem 'cld3', '~> 3.4.4' gem 'devise', '~> 4.8' gem 'devise-two-factor', '~> 4.0' diff --git a/Gemfile.lock b/Gemfile.lock index 5ecddec12..f7dd292dd 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -152,8 +152,6 @@ GEM elasticsearch (>= 7.12.0, < 7.14.0) elasticsearch-dsl chunky_png (1.4.0) - cld3 (3.4.4) - ffi (>= 1.1.0, < 1.16.0) climate_control (0.2.0) coderay (1.1.3) color_diff (0.1) @@ -301,7 +299,6 @@ GEM terminal-table (>= 1.5.1) idn-ruby (0.1.4) ipaddress (0.8.3) - iso-639 (0.3.5) jmespath (1.5.0) json (2.5.1) json-canonicalization (0.3.0) @@ -698,7 +695,6 @@ DEPENDENCIES capybara (~> 3.36) charlock_holmes (~> 0.7.7) chewy (~> 7.2) - cld3 (~> 3.4.4) climate_control (~> 0.2) color_diff (~> 0.1) concurrent-ruby @@ -725,7 +721,6 @@ DEPENDENCIES httplog (~> 1.5.0) i18n-tasks (~> 0.9) idn-ruby - iso-639 json-ld json-ld-preloaded (~> 3.2) kaminari (~> 1.2) diff --git a/app/helpers/languages_helper.rb b/app/helpers/languages_helper.rb index 730724208..f3ed7b314 100644 --- a/app/helpers/languages_helper.rb +++ b/app/helpers/languages_helper.rb @@ -1,94 +1,237 @@ # frozen_string_literal: true module LanguagesHelper - HUMAN_LOCALES = { - af: 'Afrikaans', - ar: 'العربية', - ast: 'Asturianu', - bg: 'Български', - bn: 'বাংলা', - br: 'Breton', - ca: 'Català', - co: 'Corsu', - cs: 'Čeština', - cy: 'Cymraeg', - da: 'Dansk', - de: 'Deutsch', - el: 'Ελληνικά', - en: 'English', - eo: 'Esperanto', + ISO_639_1 = { + aa: ['Afar', 'Afaraf'].freeze, + ab: ['Abkhaz', 'аҧсуа бызшәа'].freeze, + ae: ['Avestan', 'avesta'].freeze, + af: ['Afrikaans', 'Afrikaans'].freeze, + ak: ['Akan', 'Akan'].freeze, + am: ['Amharic', 'አማርኛ'].freeze, + an: ['Aragonese', 'aragonés'].freeze, + ar: ['Arabic', 'اللغة العربية'].freeze, + as: ['Assamese', 'অসমীয়া'].freeze, + av: ['Avaric', 'авар мацӀ'].freeze, + ay: ['Aymara', 'aymar aru'].freeze, + az: ['Azerbaijani', 'azərbaycan dili'].freeze, + ba: ['Bashkir', 'башҡорт теле'].freeze, + be: ['Belarusian', 'беларуская мова'].freeze, + bg: ['Bulgarian', 'български език'].freeze, + bh: ['Bihari', 'भोजपुरी'].freeze, + bi: ['Bislama', 'Bislama'].freeze, + bm: ['Bambara', 'bamanankan'].freeze, + bn: ['Bengali', 'বাংলা'].freeze, + bo: ['Tibetan', 'བོད་ཡིག'].freeze, + br: ['Breton', 'brezhoneg'].freeze, + bs: ['Bosnian', 'bosanski jezik'].freeze, + ca: ['Catalan', 'Català'].freeze, + ce: ['Chechen', 'нохчийн мотт'].freeze, + ch: ['Chamorro', 'Chamoru'].freeze, + co: ['Corsican', 'corsu'].freeze, + cr: ['Cree', 'ᓀᐦᐃᔭᐍᐏᐣ'].freeze, + cs: ['Czech', 'čeština'].freeze, + cu: ['Old Church Slavonic', 'ѩзыкъ словѣньскъ'].freeze, + cv: ['Chuvash', 'чӑваш чӗлхи'].freeze, + cy: ['Welsh', 'Cymraeg'].freeze, + da: ['Danish', 'dansk'].freeze, + de: ['German', 'Deutsch'].freeze, + dv: ['Divehi', 'Dhivehi'].freeze, + dz: ['Dzongkha', 'རྫོང་ཁ'].freeze, + ee: ['Ewe', 'Eʋegbe'].freeze, + el: ['Greek', 'Ελληνικά'].freeze, + en: ['English', 'English'].freeze, + eo: ['Esperanto', 'Esperanto'].freeze, + es: ['Spanish', 'Español'].freeze, + et: ['Estonian', 'eesti'].freeze, + eu: ['Basque', 'euskara'].freeze, + fa: ['Persian', 'فارسی'].freeze, + ff: ['Fula', 'Fulfulde'].freeze, + fi: ['Finnish', 'suomi'].freeze, + fj: ['Fijian', 'Vakaviti'].freeze, + fo: ['Faroese', 'føroyskt'].freeze, + fr: ['French', 'Français'].freeze, + fy: ['Western Frisian', 'Frysk'].freeze, + ga: ['Irish', 'Gaeilge'].freeze, + gd: ['Scottish Gaelic', 'Gàidhlig'].freeze, + gl: ['Galician', 'galego'].freeze, + gu: ['Gujarati', 'ગુજરાતી'].freeze, + gv: ['Manx', 'Gaelg'].freeze, + ha: ['Hausa', 'هَوُسَ'].freeze, + he: ['Hebrew', 'עברית'].freeze, + hi: ['Hindi', 'हिन्दी'].freeze, + ho: ['Hiri Motu', 'Hiri Motu'].freeze, + hr: ['Croatian', 'Hrvatski'].freeze, + ht: ['Haitian', 'Kreyòl ayisyen'].freeze, + hu: ['Hungarian', 'magyar'].freeze, + hy: ['Armenian', 'Հայերեն'].freeze, + hz: ['Herero', 'Otjiherero'].freeze, + ia: ['Interlingua', 'Interlingua'].freeze, + id: ['Indonesian', 'Bahasa Indonesia'].freeze, + ie: ['Interlingue', 'Interlingue'].freeze, + ig: ['Igbo', 'Asụsụ Igbo'].freeze, + ii: ['Nuosu', 'ꆈꌠ꒿ Nuosuhxop'].freeze, + ik: ['Inupiaq', 'Iñupiaq'].freeze, + io: ['Ido', 'Ido'].freeze, + is: ['Icelandic', 'Íslenska'].freeze, + it: ['Italian', 'Italiano'].freeze, + iu: ['Inuktitut', 'ᐃᓄᒃᑎᑐᑦ'].freeze, + ja: ['Japanese', '日本語'].freeze, + jv: ['Javanese', 'basa Jawa'].freeze, + ka: ['Georgian', 'ქართული'].freeze, + kg: ['Kongo', 'Kikongo'].freeze, + ki: ['Kikuyu', 'Gĩkũyũ'].freeze, + kj: ['Kwanyama', 'Kuanyama'].freeze, + kk: ['Kazakh', 'қазақ тілі'].freeze, + kl: ['Kalaallisut', 'kalaallisut'].freeze, + km: ['Khmer', 'ខេមរភាសា'].freeze, + kn: ['Kannada', 'ಕನ್ನಡ'].freeze, + ko: ['Korean', '한국어'].freeze, + kr: ['Kanuri', 'Kanuri'].freeze, + ks: ['Kashmiri', 'कश्मीरी'].freeze, + ku: ['Kurdish', 'Kurdî'].freeze, + kv: ['Komi', 'коми кыв'].freeze, + kw: ['Cornish', 'Kernewek'].freeze, + ky: ['Kyrgyz', 'Кыргызча'].freeze, + la: ['Latin', 'latine'].freeze, + lb: ['Luxembourgish', 'Lëtzebuergesch'].freeze, + lg: ['Ganda', 'Luganda'].freeze, + li: ['Limburgish', 'Limburgs'].freeze, + ln: ['Lingala', 'Lingála'].freeze, + lo: ['Lao', 'ພາສາ'].freeze, + lt: ['Lithuanian', 'lietuvių kalba'].freeze, + lu: ['Luba-Katanga', 'Tshiluba'].freeze, + lv: ['Latvian', 'latviešu valoda'].freeze, + mg: ['Malagasy', 'fiteny malagasy'].freeze, + mh: ['Marshallese', 'Kajin M̧ajeļ'].freeze, + mi: ['Māori', 'te reo Māori'].freeze, + mk: ['Macedonian', 'македонски јазик'].freeze, + ml: ['Malayalam', 'മലയാളം'].freeze, + mn: ['Mongolian', 'Монгол хэл'].freeze, + mr: ['Marathi', 'मराठी'].freeze, + ms: ['Malay', 'Bahasa Malaysia'].freeze, + mt: ['Maltese', 'Malti'].freeze, + my: ['Burmese', 'ဗမာစာ'].freeze, + na: ['Nauru', 'Ekakairũ Naoero'].freeze, + nb: ['Norwegian Bokmål', 'Norsk bokmål'].freeze, + nd: ['Northern Ndebele', 'isiNdebele'].freeze, + ne: ['Nepali', 'नेपाली'].freeze, + ng: ['Ndonga', 'Owambo'].freeze, + nl: ['Dutch', 'Nederlands'].freeze, + nn: ['Norwegian Nynorsk', 'Norsk nynorsk'].freeze, + no: ['Norwegian', 'Norsk'].freeze, + nr: ['Southern Ndebele', 'isiNdebele'].freeze, + nv: ['Navajo', 'Diné bizaad'].freeze, + ny: ['Chichewa', 'chiCheŵa'].freeze, + oc: ['Occitan', 'occitan'].freeze, + oj: ['Ojibwe', 'ᐊᓂᔑᓈᐯᒧᐎᓐ'].freeze, + om: ['Oromo', 'Afaan Oromoo'].freeze, + or: ['Oriya', 'ଓଡ଼ିଆ'].freeze, + os: ['Ossetian', 'ирон æвзаг'].freeze, + pa: ['Panjabi', 'ਪੰਜਾਬੀ'].freeze, + pi: ['Pāli', 'पाऴि'].freeze, + pl: ['Polish', 'Polski'].freeze, + ps: ['Pashto', 'پښتو'].freeze, + pt: ['Portuguese', 'Português'].freeze, + qu: ['Quechua', 'Runa Simi'].freeze, + rm: ['Romansh', 'rumantsch grischun'].freeze, + rn: ['Kirundi', 'Ikirundi'].freeze, + ro: ['Romanian', 'Română'].freeze, + ru: ['Russian', 'Русский'].freeze, + rw: ['Kinyarwanda', 'Ikinyarwanda'].freeze, + sa: ['Sanskrit', 'संस्कृतम्'].freeze, + sc: ['Sardinian', 'sardu'].freeze, + sd: ['Sindhi', 'सिन्धी'].freeze, + se: ['Northern Sami', 'Davvisámegiella'].freeze, + sg: ['Sango', 'yângâ tî sängö'].freeze, + si: ['Sinhala', 'සිංහල'].freeze, + sk: ['Slovak', 'slovenčina'].freeze, + sl: ['Slovenian', 'slovenščina'].freeze, + sn: ['Shona', 'chiShona'].freeze, + so: ['Somali', 'Soomaaliga'].freeze, + sq: ['Albanian', 'Shqip'].freeze, + sr: ['Serbian', 'српски језик'].freeze, + ss: ['Swati', 'SiSwati'].freeze, + st: ['Southern Sotho', 'Sesotho'].freeze, + su: ['Sundanese', 'Basa Sunda'].freeze, + sv: ['Swedish', 'Svenska'].freeze, + sw: ['Swahili', 'Kiswahili'].freeze, + ta: ['Tamil', 'தமிழ்'].freeze, + te: ['Telugu', 'తెలుగు'].freeze, + tg: ['Tajik', 'тоҷикӣ'].freeze, + th: ['Thai', 'ไทย'].freeze, + ti: ['Tigrinya', 'ትግርኛ'].freeze, + tk: ['Turkmen', 'Türkmen'].freeze, + tl: ['Tagalog', 'Wikang Tagalog'].freeze, + tn: ['Tswana', 'Setswana'].freeze, + to: ['Tonga', 'faka Tonga'].freeze, + tr: ['Turkish', 'Türkçe'].freeze, + ts: ['Tsonga', 'Xitsonga'].freeze, + tt: ['Tatar', 'татар теле'].freeze, + tw: ['Twi', 'Twi'].freeze, + ty: ['Tahitian', 'Reo Tahiti'].freeze, + ug: ['Uyghur', 'ئۇيغۇرچە‎'].freeze, + uk: ['Ukrainian', 'Українська'].freeze, + ur: ['Urdu', 'اردو'].freeze, + uz: ['Uzbek', 'Ўзбек'].freeze, + ve: ['Venda', 'Tshivenḓa'].freeze, + vi: ['Vietnamese', 'Tiếng Việt'].freeze, + vo: ['Volapük', 'Volapük'].freeze, + wa: ['Walloon', 'walon'].freeze, + wo: ['Wolof', 'Wollof'].freeze, + xh: ['Xhosa', 'isiXhosa'].freeze, + yi: ['Yiddish', 'ייִדיש'].freeze, + yo: ['Yoruba', 'Yorùbá'].freeze, + za: ['Zhuang', 'Saɯ cueŋƅ'].freeze, + zh: ['Chinese', '中文'].freeze, + zu: ['Zulu', 'isiZulu'].freeze, + }.freeze + + ISO_639_3 = { + ast: ['Asturian', 'Asturianu'].freeze, + kab: ['Kabyle', 'Taqbaylit'].freeze, + kmr: ['Northern Kurdish', 'Kurmancî'].freeze, + zgh: ['Standard Moroccan Tamazight', 'ⵜⴰⵎⴰⵣⵉⵖⵜ'].freeze, + }.freeze + + SUPPORTED_LOCALES = {}.merge(ISO_639_1).merge(ISO_639_3).freeze + + # For ISO-639-1 and ISO-639-3 language codes, we have their official + # names, but for some translations, we need the names of the + # regional variants specifically + REGIONAL_LOCALE_NAMES = { 'es-AR': 'Español (Argentina)', 'es-MX': 'Español (México)', - es: 'Español', - et: 'Eesti', - eu: 'Euskara', - fa: 'فارسی', - fi: 'Suomi', - fr: 'Français', - ga: 'Gaeilge', - gd: 'Gàidhlig', - gl: 'Galego', - he: 'עברית', - hi: 'हिन्दी', - hr: 'Hrvatski', - hu: 'Magyar', - hy: 'Հայերեն', - id: 'Bahasa Indonesia', - io: 'Ido', - is: 'Íslenska', - it: 'Italiano', - ja: '日本語', - ka: 'ქართული', - kab: 'Taqbaylit', - kk: 'Қазақша', - kmr: 'Kurmancî', - kn: 'ಕನ್ನಡ', - ko: '한국어', - ku: 'سۆرانی', - lt: 'Lietuvių', - lv: 'Latviešu', - mk: 'Македонски', - ml: 'മലയാളം', - mr: 'मराठी', - ms: 'Bahasa Melayu', - nl: 'Nederlands', - nn: 'Nynorsk', - no: 'Norsk', - oc: 'Occitan', - pl: 'Polski', 'pt-BR': 'Português (Brasil)', 'pt-PT': 'Português (Portugal)', - pt: 'Português', - ro: 'Română', - ru: 'Русский', - sa: 'संस्कृतम्', - sc: 'Sardu', - si: 'සිංහල', - sk: 'Slovenčina', - sl: 'Slovenščina', - sq: 'Shqip', 'sr-Latn': 'Srpski (latinica)', - sr: 'Српски', - sv: 'Svenska', - ta: 'தமிழ்', - te: 'తెలుగు', - th: 'ไทย', - tr: 'Türkçe', - uk: 'Українська', - ur: 'اُردُو', - vi: 'Tiếng Việt', - zgh: 'ⵜⴰⵎⴰⵣⵉⵖⵜ', 'zh-CN': '简体中文', 'zh-HK': '繁體中文(香港)', 'zh-TW': '繁體中文(臺灣)', - zh: '中文', }.freeze def human_locale(locale) if locale == 'und' I18n.t('generic.none') + elsif (supported_locale = SUPPORTED_LOCALES[locale.to_sym]) + supported_locale[1] + elsif (regional_locale = REGIONAL_LOCALE_NAMES[locale.to_sym]) + regional_locale else - HUMAN_LOCALES[locale.to_sym] || locale + locale end end + + def valid_locale_or_nil(str) + return if str.blank? + + code, = str.to_s.split(/[_-]/) # Strip out the region from e.g. en_US or ja-JP + + return unless valid_locale?(code) + + code + end + + def valid_locale?(locale) + SUPPORTED_LOCALES.key?(locale.to_sym) + end end diff --git a/app/helpers/settings_helper.rb b/app/helpers/settings_helper.rb index 23739d1cd..3d5592867 100644 --- a/app/helpers/settings_helper.rb +++ b/app/helpers/settings_helper.rb @@ -2,7 +2,7 @@ module SettingsHelper def filterable_languages - LanguageDetector.instance.language_names.select(&LanguagesHelper::HUMAN_LOCALES.method(:key?)) + LanguagesHelper::SUPPORTED_LOCALES.keys end def hash_to_object(hash) diff --git a/app/lib/activitypub/activity/create.rb b/app/lib/activitypub/activity/create.rb index 33998c477..ea8d146d4 100644 --- a/app/lib/activitypub/activity/create.rb +++ b/app/lib/activitypub/activity/create.rb @@ -112,7 +112,7 @@ class ActivityPub::Activity::Create < ActivityPub::Activity url: @status_parser.url || @status_parser.uri, account: @account, text: converted_object_type? ? converted_text : (@status_parser.text || ''), - language: @status_parser.language || detected_language, + language: @status_parser.language, spoiler_text: converted_object_type? ? '' : (@status_parser.spoiler_text || ''), created_at: @status_parser.created_at, edited_at: @status_parser.edited_at, @@ -370,10 +370,6 @@ class ActivityPub::Activity::Create < ActivityPub::Activity Formatter.instance.linkify([@status_parser.title.presence, @status_parser.spoiler_text.presence, @status_parser.url || @status_parser.uri].compact.join("\n\n")) end - def detected_language - LanguageDetector.instance.detect(@status_parser.text, @account) if supported_object_type? - end - def unsupported_media_type?(mime_type) mime_type.present? && !MediaAttachment.supported_mime_types.include?(mime_type) end diff --git a/app/lib/language_detector.rb b/app/lib/language_detector.rb deleted file mode 100644 index 40452eddc..000000000 --- a/app/lib/language_detector.rb +++ /dev/null @@ -1,101 +0,0 @@ -# frozen_string_literal: true - -class LanguageDetector - include Singleton - - WORDS_THRESHOLD = 4 - RELIABLE_CHARACTERS_RE = /[\p{Hebrew}\p{Arabic}\p{Syriac}\p{Thaana}\p{Nko}\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}\p{Thai}]+/m - - def initialize - @identifier = CLD3::NNetLanguageIdentifier.new(1, 2048) - end - - def detect(text, account) - input_text = prepare_text(text) - - return if input_text.blank? - - detect_language_code(input_text) || default_locale(account) - end - - def language_names - @language_names = CLD3::TaskContextParams::LANGUAGE_NAMES.map { |name| iso6391(name.to_s).to_sym }.uniq - end - - private - - def prepare_text(text) - simplify_text(text).strip - end - - def unreliable_input?(text) - !reliable_input?(text) - end - - def reliable_input?(text) - sufficient_text_length?(text) || language_specific_character_set?(text) - end - - def sufficient_text_length?(text) - text.split(/\s+/).size >= WORDS_THRESHOLD - end - - def language_specific_character_set?(text) - words = text.scan(RELIABLE_CHARACTERS_RE) - - if words.present? - words.reduce(0) { |acc, elem| acc + elem.size }.to_f / text.size > 0.3 - else - false - end - end - - def detect_language_code(text) - return if unreliable_input?(text) - - result = @identifier.find_language(text) - - iso6391(result.language.to_s).to_sym if result&.reliable? - end - - def iso6391(bcp47) - iso639 = bcp47.split('-').first - - # CLD3 returns grandfathered language code for Hebrew - return 'he' if iso639 == 'iw' - - ISO_639.find(iso639).alpha2 - end - - def simplify_text(text) - new_text = remove_html(text) - new_text.gsub!(FetchLinkCardService::URL_PATTERN, '\1') - new_text.gsub!(Account::MENTION_RE, '') - new_text.gsub!(Tag::HASHTAG_RE) { |string| string.gsub(/[#_]/, '#' => '', '_' => ' ').gsub(/[a-z][A-Z]|[a-zA-Z][\d]/) { |s| s.insert(1, ' ') }.downcase } - new_text.gsub!(/:#{CustomEmoji::SHORTCODE_RE_FRAGMENT}:/, '') - new_text.gsub!(/\s+/, ' ') - new_text - end - - def new_scrubber - scrubber = Rails::Html::PermitScrubber.new - scrubber.tags = %w(br p) - scrubber - end - - def scrubber - @scrubber ||= new_scrubber - end - - def remove_html(text) - text = Loofah.fragment(text).scrub!(scrubber).to_s - text.gsub!('
', "\n") - text.gsub!('

', "\n\n") - text.gsub!(/(^

|<\/p>$)/, '') - text - end - - def default_locale(account) - account.user_locale&.to_sym || I18n.default_locale if account.local? - end -end diff --git a/app/lib/link_details_extractor.rb b/app/lib/link_details_extractor.rb index d2bcf0c25..fabbd244d 100644 --- a/app/lib/link_details_extractor.rb +++ b/app/lib/link_details_extractor.rb @@ -2,6 +2,7 @@ class LinkDetailsExtractor include ActionView::Helpers::TagHelper + include LanguagesHelper # Some publications wrap their JSON-LD data in their