# frozen_string_literal: true

class Sanitize
  extend UrlHelper

  module Config
    HTTP_PROTOCOLS ||= ['http', 'https', 'dat', 'dweb', 'ipfs', 'ipns', 'ssb', 'gopher', :relative].freeze
    MEDIA_EXTENSIONS ||= %w(png apng jpg jpe jpeg mpg mpeg mpeg4 mp4 mp3 aac ogg oga ogv qt gif)

    CLASS_WHITELIST_TRANSFORMER = lambda do |env|
      node = env[:node]
      class_list = node['class']&.split(/[\t\n\f\r ]/)

      return unless class_list

      class_list.keep_if do |e|
        next true if e =~ /^(h|p|u|dt|e)-/ # microformats classes
        next true if e =~ /^(mention|hashtag)$/ # semantic classes
        next true if e =~ /^(ellipsis|invisible)$/ # link formatting classes
        next true if e =~ /^bbcode__([a-z1-6\-]+)$/ # bbcode
        next true if e == 'signature'
      end

      node['class'] = class_list.join(' ')
    end

    ANCHOR_SANITIZER = lambda do |env|
      return unless env[:node_name] == 'a'
      node = env[:node]
      return if node['href'].blank? || node.text.blank?

      class_list = node['class']&.split(/[\t\n\f\r ]/)
      return if class_list && (class_list.include?('mention') || class_list.include?('hashtag'))

      # href matches link text verbatim?
      href = node['href']
      return if href == node.text.strip

      # href matches link text with sanitized query string?
      text = Sanitize::sanitize_query_string(node.text.strip)
      return if text.blank?
      if href == text
        node.inner_html = "\u2728 #{node.inner_html}"
        return
      end

      # strip ellipse & replace keyword search obscuring
      text = text.sub(/ *(?:\u2026|\.\.\.)\Z/, '').gsub(/ dot /i, '.').gsub(/[\u200b-\u200d\ufeff\u200e\u200f]/, '')

      # href now matches text without obscuring?
      if href == text
        node.inner_html = "\u2728 #{node.inner_html}"
        return
      end

      # try to detect pseudomentions
      if text.start_with?('@') && text.match?(Account::MENTION_RE)
        username, domain = text[1..-1].split('@', 2)
        return if href == "https://#{domain}/@#{username}"
        return if href == "https://#{domain}/#{username}"
        return if href == "https://#{username}.#{domain}"
        return if href == "https://#{domain}/users/#{username}"
        return if href == "https://#{domain}/user/#{username}"
      end

      # try to detect filenames
      href_filename = '/'.in?(href) ? href.rpartition('/')[2] : nil
      unless href_filename.blank? || !('.'.in?(href_filename))
        # possibly linked media?
        ext = href_filename.rpartition('.')[2]
        if ext.downcase.in?(MEDIA_EXTENSIONS)
          node.inner_html = "\xf0\x9f\x96\xbc\xef\xb8\x8f #{node.inner_html}"
          return
        end
      end

      # grab first url from link text
      first_url = text.scan(/[\w\-]+\.[\w\-]+(?:\.[\w\-]+)*\S*/).first

      return if first_url.nil?

      # strip trailing punctuation
      text.sub!(/\p{Punct}+\Z/, '')

      # href starts with link text?
      return if href.start_with?(text)

      # split href into parts & grab shortened href
      uri = Addressable::URI.parse(href)
      short_href = "#{uri.host}#{uri.path}"
      normalized_short_href = "#{uri.normalized_host}#{uri.normalized_path}"

      # shortened href starts with link text?
      return if short_href.start_with?(text) || normalized_short_href.start_with?(text)

      # first domain in link text (if there is one) matches href domain?
      return if short_href == first_url || normalized_short_href == first_url

      # possibly misleading link text
      node.inner_html = "\u26a0\ufe0f #{node.inner_html}"
    rescue Addressable::URI::InvalidURIError, IDN::Idna::IdnaError
      # strip malformed links
      node = env[:node]
      node['href'] = '#'
      node.children.remove
      node.inner_html = "\u274c #{node.inner_html}"
    end

    QUERY_STRING_SANITIZER = lambda do |env|
      return unless %w(a blockquote embed iframe source).include?(env[:node_name])
      node = env[:node]
      ['href', 'src', 'cite'].each do |attr|
        next if node[attr].blank?
        url = Sanitize::sanitize_query_string(node[attr])
        next if url.blank?
        node[attr] = url
      end
    end

    MASTODON_STRICT ||= freeze_config(
      elements: %w(p br span a abbr del pre sub sup blockquote code b strong u i s em h1 h2 h3 h4 h5 h6 ul ol li hr),

      attributes: {
        'a'          => %w(href rel class title alt),
        'span'       => %w(class),
        'abbr'       => %w(title),
        'blockquote' => %w(cite),
        'p'          => %w(class),
        :all         => %w(aria-hidden aria-label lang),
      },

      add_attributes: {
        'a' => {
          'rel' => 'nofollow noopener tag',
          'target' => '_blank',
        },
      },

      protocols: {
        'a'          => { 'href' => HTTP_PROTOCOLS },
        'blockquote' => { 'cite' => HTTP_PROTOCOLS },
      },

      transformers: [
        CLASS_WHITELIST_TRANSFORMER,
        QUERY_STRING_SANITIZER,
        ANCHOR_SANITIZER
      ]
    )

    MASTODON_OEMBED ||= freeze_config merge(
      RELAXED,
      elements: RELAXED[:elements] + %w(audio embed iframe source video),

      attributes: merge(
        RELAXED[:attributes],
        'audio'  => %w(controls),
        'embed'  => %w(height src type width),
        'iframe' => %w(allowfullscreen frameborder height scrolling src width),
        'source' => %w(src type),
        'video'  => %w(controls height loop width),
        'div'    => [:data]
      ),

      protocols: merge(
        RELAXED[:protocols],
        'embed'  => { 'src' => HTTP_PROTOCOLS },
        'iframe' => { 'src' => HTTP_PROTOCOLS },
        'source' => { 'src' => HTTP_PROTOCOLS }
      ),

      transformers: [QUERY_STRING_SANITIZER]
    )
  end
end