about summary refs log tree commit diff
path: root/app
diff options
context:
space:
mode:
Diffstat (limited to 'app')
-rw-r--r-- app/lib/sanitize_config.rb 48
1 files changed, 48 insertions, 0 deletions
diff --git a/app/lib/sanitize_config.rb b/app/lib/sanitize_config.rb
index 9b4160552..da38d70b7 100644
--- a/app/lib/sanitize_config.rb
+++ b/app/lib/sanitize_config.rb
@@ -21,6 +21,53 @@ class Sanitize
       node['class'] = class_list.join(' ')
     end
 
+    ANCHOR_SANITIZER = lambda do |env| # annotates <a> nodes whose text differs from their href
+      return unless env[:node_name] == 'a'
+      node = env[:node]
+      return if node['href'].blank? || node.text.blank?
+
+      # href matches link text verbatim? then leave the link untouched
+      href = node['href']
+      return if href == node.text.strip
+
+      # remove the first "?key=value"-style run from the link text
+      node.inner_html = node.inner_html.sub(/\?\S+=\S+/, '')
+
+      # href matches link text without query string?
+      text = node.text.strip
+      return if href == text
+
+      uri = Addressable::URI.parse(node['href'])
+      text.sub!(/ *(?:\u2026|\.\.\.)/, '') # drop first ellipsis and any spaces before it
+
+      # href starts with link text?
+      return if href.start_with?(text)
+      # shortened href (host + path) starts with link text? NOTE(review): host may be nil — confirm
+      return if (uri.host + uri.path).start_with?(text)
+      # shortened & normalized href starts with link text?
+      return if (uri.normalized_host + uri.normalized_path).start_with?(text)
+
+      # grab first domain-like token from lowercased link text (" dot " treated as ".")
+      text = text.downcase.gsub(' dot ', '.')
+      first_domain = text.scan(/[\w\-]+\.[\w\-]+(?:\.[\w\-]+)*/).first
+
+      # first domain in link text (if there is one) matches href domain?
+      if first_domain.nil? || uri.domain == first_domain
+        # link text customized by author: prefix with writing-hand emoji
+        node.inner_html = "\u270d\ufe0f #{node.inner_html}"
+        return
+      end
+
+      # possibly misleading link text: prefix with warning-sign emoji
+      node.inner_html = "\u26a0\ufe0f #{node.inner_html}"
+    rescue Addressable::URI::InvalidURIError, IDN::Idna::IdnaError
+      # strip malformed links: neutralize href, drop children, prefix with cross-mark emoji
+      node = env[:node]
+      node['href'] = '#'
+      node.children.remove # NOTE(review): inner_html below looks empty after this — confirm
+      node.inner_html = "\u274c #{node.inner_html}"
+    end
+
     QUERY_STRING_SANITIZER = lambda do |env|
       return unless %w(a blockquote embed iframe source).include?(env[:node_name])
       node = env[:node]
@@ -85,6 +132,7 @@ class Sanitize
       transformers: [
         CLASS_WHITELIST_TRANSFORMER,
         QUERY_STRING_SANITIZER,
+        ANCHOR_SANITIZER
       ]
     )