about summary refs log tree commit diff
path: root/app
diff options
context:
space:
mode:
author multiple creatures <dev@multiple-creature.party> 2019-12-10 05:18:50 -0600
committer multiple creatures <dev@multiple-creature.party> 2019-12-10 05:18:50 -0600
commit 2da941beaab17923ec72af0c2e2d8de6c62cd20e (patch)
tree c7d1b7752808a4787e94b6c83c0ee83a50c48152 /app
parent 533dd6d985430babe9cb8a1dc4dfc78d0e92b485 (diff)
simplify normalizer & add headings
Diffstat (limited to 'app')
-rw-r--r-- app/helpers/text_helper.rb 64
1 files changed, 42 insertions, 22 deletions
diff --git a/app/helpers/text_helper.rb b/app/helpers/text_helper.rb
index d85c6bbd8..ac49e56f0 100644
--- a/app/helpers/text_helper.rb
+++ b/app/helpers/text_helper.rb
@@ -4,35 +4,55 @@ require 'sixarm_ruby_unaccent'
 
 module TextHelper
 
-  def normalize_text(html)
-    t = html.downcase
-
-    t.gsub!(/<(?:p|pre|blockquote|code|h[1-6]|li)\b[^>]*>/, "\n")
-    t.gsub!(/<[bh]r[\/ ]*>/, "\n")
-    t.gsub!(/<\/?[^>]*>/, '')
-
-    t = HTMLEntities.new.decode(t)
+  def html2text(html)
+    html = html
+      .gsub(/<(?:p|pre|blockquote|code|h[1-6]|li)\b[^>]*>/, "\n")
+      .gsub(/<[bh]r[\/ ]*>/, "\n")
+      .gsub(/<\/?[^>]*>/, '')
 
-    t.gsub!(/[ \t]*\302\240+[ \t]*/, ' ')
-    t.gsub!(/  +/, ' ')
+    HTMLEntities.new.decode(html)
+  end
 
-    t.gsub!(/\r\n?/, "\n")
-    t.gsub!(/\n[ \t]+/, "\n")
-    t.gsub!(/[ \t]+\n/, "\n")
-    t.gsub!(/\n\n+/, "\n")
+  def normalize_text(html)
+    html.downcase
+      .gsub(/[ \t]*\302\240+[ \t]*/, ' ')
+      .gsub(/  +/, ' ')
+      .gsub(/\r\n?/, "\n")
+      .gsub(/\n[ \t]+/, "\n")
+      .gsub(/[ \t]+\n/, "\n")
+      .gsub(/\n\n+/, "\n")
+      .gsub(/^(?:#[\w:._·\-]+\s*)+/, '')
+      .gsub(/(?:#[\w:._·\-]+\s*)+$/, '')
+      .delete('#')
+      .strip
+      .unaccent_via_split_map
+  end
 
-    return t.strip.unaccent_via_split_map unless '#'.in?(t)
+  def normalize_status(status)
+    "#{_format_tags(status)}\n#{_format_spoiler(status)}\n#{_format_status(status)}\n#{_format_desc(status)}".delete("\n\n").strip
+  end
 
-    tags = Extractor.extract_hashtags(t).uniq
-    t.gsub!(/^(?:#[\w:._·\-]+\s*)+/, '')
-    t.gsub!(/(?:#[\w:._·\-]+\s*)+$/, '')
+  def _format_tags(status)
+    return unless status.tags.present?
+    "tags #{status.tags.pluck(:name).join("\ntags ")}"
+  end
 
-    t.delete!('#')
+  def _format_spoiler(status)
+    return if status.spoiler_text.blank?
+    "subj #{normalize_text(status.spoiler_text)}"
+  end
 
-    "#{tags.join(' ')}\n#{t.lstrip}".strip.unaccent_via_split_map
+  def _format_status(status)
+    text = status.local? ? Formatter.instance.format(status) : status.text
+    return if text.blank?
+    text = normalize_text(html2text(text))
+    text.gsub!("\n", "\ntext ")
+    "text #{text}"
   end
 
-  def normalize_status(status)
-    normalize_text("tags #{status.tags.pluck(:name).join(' ')}\nsubj #{status.spoiler_text}\ntext #{status.local? ? Formatter.instance.format(status) : status.text}\ndesc #{status.media_attachments.pluck(:description).join("\n")}")
+  def _format_desc(status)
+    return unless status.media_attachments.present?
+    text = status.media_attachments.pluck(:description).join("\ndesc ")
+    "desc #{normalize_text(text)}"
   end
 end