From e7aa2be828f6a632dadd5c41e2364cea91ddbb2c Mon Sep 17 00:00:00 2001 From: Eugen Rochko Date: Wed, 13 Jul 2022 15:03:28 +0200 Subject: Change how hashtags are normalized (#18795) * Change how hashtags are normalized * Fix tests --- streaming/index.js | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) (limited to 'streaming') diff --git a/streaming/index.js b/streaming/index.js index 792ec5a44..a55181bad 100644 --- a/streaming/index.js +++ b/streaming/index.js @@ -892,6 +892,34 @@ const startWorker = async (workerId) => { return arr; }; + /** + * See app/lib/ascii_folder.rb for the canon definitions + * of these constants + */ + const NON_ASCII_CHARS = 'ÀÁÂÃÄÅàáâãäåĀāĂ㥹ÇçĆćĈĉĊċČčÐðĎďĐđÈÉÊËèéêëĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħÌÍÎÏìíîïĨĩĪīĬĭĮįİıĴĵĶķĸĹĺĻļĽľĿŀŁłÑñŃńŅņŇňʼnŊŋÒÓÔÕÖØòóôõöøŌōŎŏŐőŔŕŖŗŘřŚśŜŝŞşŠšſŢţŤťŦŧÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųŴŵÝýÿŶŷŸŹźŻżŽž'; + const EQUIVALENT_ASCII_CHARS = 'AAAAAAaaaaaaAaAaAaCcCcCcCcCcDdDdDdEEEEeeeeEeEeEeEeEeGgGgGgGgHhHhIIIIiiiiIiIiIiIiIiJjKkkLlLlLlLlLlNnNnNnNnnNnOOOOOOooooooOoOoOoRrRrRrSsSsSsSssTtTtTtUUUUuuuuUuUuUuUuUuUuWwYyyYyYZzZzZz'; + + /** + * @param {string} str + * @return {string} + */ + const foldToASCII = str => { + const regex = new RegExp(NON_ASCII_CHARS.split('').join('|'), 'g'); + + return str.replace(regex, match => { + const index = NON_ASCII_CHARS.indexOf(match); + return EQUIVALENT_ASCII_CHARS[index]; + }); + }; + + /** + * @param {string} str + * @return {string} + */ + const normalizeHashtag = str => { + return foldToASCII(str.normalize('NFKC').toLowerCase()).replace(/[^\p{L}\p{N}_\u00b7\u200c]/gu, ''); + }; + /** * @param {any} req * @param {string} name @@ -968,7 +996,7 @@ const startWorker = async (workerId) => { reject('No tag for stream provided'); } else { resolve({ - channelIds: [`timeline:hashtag:${params.tag.toLowerCase()}`], + channelIds: [`timeline:hashtag:${normalizeHashtag(params.tag)}`], options: { needsFiltering: true }, }); } @@ -979,7 +1007,7 @@ const startWorker = async (workerId) => { reject('No tag for stream provided'); } else { resolve({ - channelIds: [`timeline:hashtag:${params.tag.toLowerCase()}:local`], + channelIds: [`timeline:hashtag:${normalizeHashtag(params.tag)}:local`], options: { needsFiltering: true }, }); } -- cgit