diff options
author | multiple creatures <dev@multiple-creature.party> | 2019-12-13 00:43:20 -0600 |
---|---|---|
committer | multiple creatures <dev@multiple-creature.party> | 2019-12-13 00:43:20 -0600 |
commit | 4b0612c11db1ff2f2206b23d16586f6c3c31e2af (patch) | |
tree | 9240fa54cd581e6ae7856040bbf8466447937d25 | |
parent | a547f442e02562f33b26a4a473bcc847bde4e48e (diff) |
improve normalization
-rw-r--r-- | app/helpers/search_helper.rb | 3 | ||||
-rw-r--r-- | app/helpers/text_helper.rb | 21 |
2 files changed, 12 insertions, 12 deletions
```diff
diff --git a/app/helpers/search_helper.rb b/app/helpers/search_helper.rb
index 9510abe99..8bddbe187 100644
--- a/app/helpers/search_helper.rb
+++ b/app/helpers/search_helper.rb
@@ -3,7 +3,8 @@ require 'sixarm_ruby_unaccent'
 module SearchHelper
   def expand_search_query(query)
     return '' if query.blank?
-    query = query.strip.downcase.unaccent
+    query = query.downcase.unaccent.gsub(/[^\p{Word} [:punct:]]/, '').gsub(/ +/, ' ').strip
+    return '' if query.blank?
 
     if query.include?(':')
       query_parts = query.split(':', 2)
diff --git a/app/helpers/text_helper.rb b/app/helpers/text_helper.rb
index b60eee22d..16bb3f66e 100644
--- a/app/helpers/text_helper.rb
+++ b/app/helpers/text_helper.rb
@@ -13,20 +13,19 @@ module TextHelper
     HTMLEntities.new.decode(html)
   end
 
-  def normalize_text(html)
-    html.downcase
+  def normalize_text(text)
+    text.downcase
       .gsub(Account::MENTION_RE, '')
-      .gsub(/[ \t]*\302\240+[ \t]*/, ' ')
-      .gsub(/ +/, ' ')
+      .gsub(/^(?:#[\w:._·\-]+\s*)+|(?:#[\w:._·\-]+\s*)+$/, '')
+      .gsub(/\s*\302\240+\s*/, ' ')
+      .gsub(/\n\s+|\s+\n/, "\n")
       .gsub(/\r\n?/, "\n")
-      .gsub(/\n[ \t]+/, "\n")
-      .gsub(/[ \t]+\n/, "\n")
       .gsub(/\n\n+/, "\n")
-      .gsub(/^(?:#[\w:._·\-]+\s*)+/, '')
-      .gsub(/(?:#[\w:._·\-]+\s*)+$/, '')
-      .delete('#')
-      .strip
       .unaccent_via_split_map
+      .gsub(/(?:htt|ft)ps?:\/\//, '')
+      .gsub(/[^\n\p{Word} [:punct:]]/, '')
+      .gsub(/ +/, ' ')
+      .strip
   end
 
   def normalize_status(status)
@@ -53,7 +52,7 @@ module TextHelper
   def _format_desc(status)
     return unless status.media_attachments.present?
 
-    text = status.media_attachments.pluck(:description).join("\ndesc ")
+    text = status.media_attachments.pluck(:description).compact.join("\ndesc ")
     "desc #{normalize_text(text)}"
   end
 end
```