diff options
author | Eugen Rochko <eugen@zeonfederated.com> | 2022-07-13 15:03:28 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-07-13 15:03:28 +0200 |
commit | e7aa2be828f6a632dadd5c41e2364cea91ddbb2c (patch) | |
tree | f18390c05c4aa6ce5b15572b470db4bd4791129b /app/lib | |
parent | 12ed2d793b1b4823b0df047a47677bb0667bf43d (diff) |
Change how hashtags are normalized (#18795)
* Change how hashtags are normalized * Fix tests
Diffstat (limited to 'app/lib')
-rw-r--r-- | app/lib/ascii_folding.rb | 10 | ||||
-rw-r--r-- | app/lib/hashtag_normalizer.rb | 25 |
2 files changed, 35 insertions, 0 deletions
diff --git a/app/lib/ascii_folding.rb b/app/lib/ascii_folding.rb new file mode 100644 index 000000000..1798d3d0e --- /dev/null +++ b/app/lib/ascii_folding.rb @@ -0,0 +1,10 @@ +# frozen_string_literal: true + +class ASCIIFolding + NON_ASCII_CHARS = 'ÀÁÂÃÄÅàáâãäåĀāĂ㥹ÇçĆćĈĉĊċČčÐðĎďĐđÈÉÊËèéêëĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħÌÍÎÏìíîïĨĩĪīĬĭĮįİıĴĵĶķĸĹĺĻļĽľĿŀŁłÑñŃńŅņŇňʼnŊŋÒÓÔÕÖØòóôõöøŌōŎŏŐőŔŕŖŗŘřŚśŜŝŞşŠšſŢţŤťŦŧÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųŴŵÝýÿŶŷŸŹźŻżŽž' + EQUIVALENT_ASCII_CHARS = 'AAAAAAaaaaaaAaAaAaCcCcCcCcCcDdDdDdEEEEeeeeEeEeEeEeEeGgGgGgGgHhHhIIIIiiiiIiIiIiIiIiJjKkkLlLlLlLlLlNnNnNnNnnNnOOOOOOooooooOoOoOoRrRrRrSsSsSsSssTtTtTtUUUUuuuuUuUuUuUuUuUuWwYyyYyYZzZzZz' + + def fold(str) + str.tr(NON_ASCII_CHARS, EQUIVALENT_ASCII_CHARS) + end +end diff --git a/app/lib/hashtag_normalizer.rb b/app/lib/hashtag_normalizer.rb new file mode 100644 index 000000000..c1f99e163 --- /dev/null +++ b/app/lib/hashtag_normalizer.rb @@ -0,0 +1,25 @@ +# frozen_string_literal: true + +class HashtagNormalizer + def normalize(str) + remove_invalid_characters(ascii_folding(lowercase(cjk_width(str)))) + end + + private + + def remove_invalid_characters(str) + str.gsub(/[^[:alnum:]#{Tag::HASHTAG_SEPARATORS}]/, '') + end + + def ascii_folding(str) + ASCIIFolding.new.fold(str) + end + + def lowercase(str) + str.mb_chars.downcase.to_s + end + + def cjk_width(str) + str.unicode_normalize(:nfkc) + end +end |