# frozen_string_literal: true class Sanitize module Config HTTP_PROTOCOLS ||= ['http', 'https', 'dat', 'dweb', 'ipfs', 'ipns', 'ssb', 'gopher', :relative].freeze CLASS_WHITELIST_TRANSFORMER = lambda do |env| node = env[:node] class_list = node['class']&.split(/[\t\n\f\r ]/) return unless class_list class_list.keep_if do |e| next true if e =~ /^(h|p|u|dt|e)-/ # microformats classes next true if e =~ /^(mention|hashtag)$/ # semantic classes next true if e =~ /^(ellipsis|invisible)$/ # link formatting classes next true if e =~ /^bbcode__([a-z1-6\-]+)$/ # bbcode next true if e == 'signature' end node['class'] = class_list.join(' ') end ANCHOR_SANITIZER = lambda do |env| return unless env[:node_name] == 'a' node = env[:node] return if node['href'].blank? || node.text.blank? class_list = node['class']&.split(/[\t\n\f\r ]/) return if class_list && (class_list.include?('mention') || class_list.include?('hashtag')) # href matches link text verbatim? href = node['href'] return if href == node.text.strip # remove query string from link text node.inner_html = node.inner_html.sub(/\?\S+=\S+/, '') # href matches link text without query string? text = node.text.strip return if href == text uri = Addressable::URI.parse(node['href']) text.sub!(/ *(?:\u2026|\.\.\.)/, '') # href starts with link text? return if href.start_with?(text) # shortened href starts with link text? return if (uri.host + uri.path).start_with?(text) # shorterned & normalized href starts with link text? return if (uri.normalized_host + uri.normalized_path).start_with?(text) # grab first domain from link text text = text.downcase.gsub(' dot ', '.') first_domain = text.scan(/[\w\-]+\.[\w\-]+(?:\.[\w\-]+)*/).first # first domain in link text (if there is one) matches href domain? if first_domain.nil? || uri.domain == first_domain # link text customized by author node.inner_html = "\u270d\ufe0f #{node.inner_html}" return end # possibly misleading link text node.inner_html = "\u26a0\ufe0f #{node.inner_html}" rescue Addressable::URI::InvalidURIError, IDN::Idna::IdnaError # strip malformed links node = env[:node] node['href'] = '#' node.children.remove node.inner_html = "\u274c #{node.inner_html}" end QUERY_STRING_SANITIZER = lambda do |env| return unless %w(a blockquote embed iframe source).include?(env[:node_name]) node = env[:node] ['href', 'src', 'cite'].each do |attr| next if node[attr].blank? url = Addressable::URI.parse(node[attr]) next if url.query.blank? params = CGI.parse(url.query) params.delete_if do |key| k = key.downcase next true if k.start_with?( '_hs', 'ic', 'mc_', 'mkt_', 'ns_', 'sr_', 'utm', 'vero_', 'nr_', 'ref', ) next true if 'track'.in?(k) next true if [ 'fbclid', 'gclid', 'ncid', 'ocid', 'r', 'spm', ].include?(k) false end url.query = URI.encode_www_form(params) node[attr] = url end end MASTODON_STRICT ||= freeze_config( elements: %w(p br span a abbr del pre sub sup blockquote code b strong u i em h1 h2 h3 h4 h5 h6 ul ol li hr), attributes: { 'a' => %w(href rel class title alt), 'span' => %w(class), 'abbr' => %w(title), 'blockquote' => %w(cite), 'p' => %w(class), }, add_attributes: { 'a' => { 'rel' => 'nofollow noopener', 'target' => '_blank', }, }, protocols: { 'a' => { 'href' => HTTP_PROTOCOLS }, 'blockquote' => { 'cite' => HTTP_PROTOCOLS }, }, transformers: [ CLASS_WHITELIST_TRANSFORMER, QUERY_STRING_SANITIZER, ANCHOR_SANITIZER ] ) MASTODON_OEMBED ||= freeze_config merge( RELAXED, elements: RELAXED[:elements] + %w(audio embed iframe source video), attributes: merge( RELAXED[:attributes], 'audio' => %w(controls), 'embed' => %w(height src type width), 'iframe' => %w(allowfullscreen frameborder height scrolling src width), 'source' => %w(src type), 'video' => %w(controls height loop width), 'div' => [:data] ), protocols: merge( RELAXED[:protocols], 'embed' => { 'src' => HTTP_PROTOCOLS }, 'iframe' => { 'src' => HTTP_PROTOCOLS }, 'source' => { 'src' => HTTP_PROTOCOLS } ), transformers: [QUERY_STRING_SANITIZER] ) end end