From 4b0612c11db1ff2f2206b23d16586f6c3c31e2af Mon Sep 17 00:00:00 2001
From: multiple creatures
Date: Fri, 13 Dec 2019 00:43:20 -0600
Subject: improve normalization

---
 app/helpers/search_helper.rb |  3 ++-
 app/helpers/text_helper.rb   | 21 ++++++++++-----------
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'app')

diff --git a/app/helpers/search_helper.rb b/app/helpers/search_helper.rb
index 9510abe99..8bddbe187 100644
--- a/app/helpers/search_helper.rb
+++ b/app/helpers/search_helper.rb
@@ -3,7 +3,8 @@ require 'sixarm_ruby_unaccent'
 module SearchHelper
   def expand_search_query(query)
     return '' if query.blank?
-    query = query.strip.downcase.unaccent
+    query = query.downcase.unaccent.gsub(/[^\p{Word} [:punct:]]/, '').gsub(/ +/, ' ').strip
+    return '' if query.blank?
 
     if query.include?(':')
       query_parts = query.split(':', 2)
diff --git a/app/helpers/text_helper.rb b/app/helpers/text_helper.rb
index b60eee22d..16bb3f66e 100644
--- a/app/helpers/text_helper.rb
+++ b/app/helpers/text_helper.rb
@@ -13,20 +13,19 @@ module TextHelper
     HTMLEntities.new.decode(html)
   end
 
-  def normalize_text(html)
-    html.downcase
+  def normalize_text(text)
+    text.downcase
       .gsub(Account::MENTION_RE, '')
-      .gsub(/[ \t]*\302\240+[ \t]*/, ' ')
-      .gsub(/ +/, ' ')
+      .gsub(/^(?:#[\w:._·\-]+\s*)+|(?:#[\w:._·\-]+\s*)+$/, '')
+      .gsub(/\s*\302\240+\s*/, ' ')
+      .gsub(/\n\s+|\s+\n/, "\n")
       .gsub(/\r\n?/, "\n")
-      .gsub(/\n[ \t]+/, "\n")
-      .gsub(/[ \t]+\n/, "\n")
       .gsub(/\n\n+/, "\n")
-      .gsub(/^(?:#[\w:._·\-]+\s*)+/, '')
-      .gsub(/(?:#[\w:._·\-]+\s*)+$/, '')
-      .delete('#')
-      .strip
       .unaccent_via_split_map
+      .gsub(/(?:htt|ft)ps?:\/\//, '')
+      .gsub(/[^\n\p{Word} [:punct:]]/, '')
+      .gsub(/ +/, ' ')
+      .strip
   end
 
   def normalize_status(status)
@@ -53,7 +52,7 @@ module TextHelper
   def _format_desc(status)
     return unless status.media_attachments.present?
 
-    text = status.media_attachments.pluck(:description).join("\ndesc ")
+    text = status.media_attachments.pluck(:description).compact.join("\ndesc ")
     "desc #{normalize_text(text)}"
   end
 end
-- 
cgit