about summary refs log tree commit diff
path: root/app/lib
diff options
context:
space:
mode:
author Eugen Rochko <eugen@zeonfederated.com> 2022-07-13 15:03:28 +0200
committer GitHub <noreply@github.com> 2022-07-13 15:03:28 +0200
commit e7aa2be828f6a632dadd5c41e2364cea91ddbb2c (patch)
tree f18390c05c4aa6ce5b15572b470db4bd4791129b /app/lib
parent 12ed2d793b1b4823b0df047a47677bb0667bf43d (diff)
Change how hashtags are normalized (#18795)
* Change how hashtags are normalized

* Fix tests
Diffstat (limited to 'app/lib')
-rw-r--r-- app/lib/ascii_folding.rb 10
-rw-r--r-- app/lib/hashtag_normalizer.rb 25
2 files changed, 35 insertions, 0 deletions
diff --git a/app/lib/ascii_folding.rb b/app/lib/ascii_folding.rb
new file mode 100644
index 000000000..1798d3d0e
--- /dev/null
+++ b/app/lib/ascii_folding.rb
@@ -0,0 +1,10 @@
+# frozen_string_literal: true
+
+class ASCIIFolding
+  NON_ASCII_CHARS        = 'ÀÁÂÃÄÅàáâãäåĀāĂăĄąÇçĆćĈĉĊċČčÐðĎďĐđÈÉÊËèéêëĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħÌÍÎÏìíîïĨĩĪīĬĭĮįİıĴĵĶķĸĹĺĻļĽľĿŀŁłÑñŃńŅņŇňŉŊŋÒÓÔÕÖØòóôõöøŌōŎŏŐőŔŕŖŗŘřŚśŜŝŞşŠšſŢţŤťŦŧÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųŴŵÝýÿŶŷŸŹźŻżŽž'
+  EQUIVALENT_ASCII_CHARS = 'AAAAAAaaaaaaAaAaAaCcCcCcCcCcDdDdDdEEEEeeeeEeEeEeEeEeGgGgGgGgHhHhIIIIiiiiIiIiIiIiIiJjKkkLlLlLlLlLlNnNnNnNnnNnOOOOOOooooooOoOoOoRrRrRrSsSsSsSssTtTtTtUUUUuuuuUuUuUuUuUuUuWwYyyYyYZzZzZz'
+
+  def fold(str)
+    str.tr(NON_ASCII_CHARS, EQUIVALENT_ASCII_CHARS)
+  end
+end
diff --git a/app/lib/hashtag_normalizer.rb b/app/lib/hashtag_normalizer.rb
new file mode 100644
index 000000000..c1f99e163
--- /dev/null
+++ b/app/lib/hashtag_normalizer.rb
@@ -0,0 +1,25 @@
+# frozen_string_literal: true
+
+class HashtagNormalizer
+  def normalize(str)
+    remove_invalid_characters(ascii_folding(lowercase(cjk_width(str))))
+  end
+
+  private
+
+  def remove_invalid_characters(str)
+    str.gsub(/[^[:alnum:]#{Tag::HASHTAG_SEPARATORS}]/, '')
+  end
+
+  def ascii_folding(str)
+    ASCIIFolding.new.fold(str)
+  end
+
+  def lowercase(str)
+    str.mb_chars.downcase.to_s
+  end
+
+  def cjk_width(str)
+    str.unicode_normalize(:nfkc)
+  end
+end