about summary refs log tree commit diff
diff options
context:
space:
mode:
author abcang <abcang1015@gmail.com> 2017-04-19 21:52:18 +0900
committer Eugen <eugen@zeonfederated.com> 2017-04-19 14:52:18 +0200
commit 70891a99a97bc1ca14a8ded13a5cd45b648b92b3 (patch)
tree 227d9e18e035ec10d96bfbd9754d6b2dc4b261a0
parent 3572f4423f1b0a50c06129a106a0dc599cba0335 (diff)
Fix html escape characters in the URL (#2138)
* fix character escaping in URL

* add tests

* put a comma after the last item

* add HTML escape test
-rw-r--r-- app/lib/formatter.rb 26
-rw-r--r-- spec/lib/formatter_spec.rb 77
2 files changed, 84 insertions, 19 deletions
diff --git a/app/lib/formatter.rb b/app/lib/formatter.rb
index a44e5ed3e..43893915d 100644
--- a/app/lib/formatter.rb
+++ b/app/lib/formatter.rb
@@ -13,10 +13,9 @@ class Formatter
     return reformat(status.content) unless status.local?
 
     html = status.text
-    html = encode(html)
+    html = encode_and_link_urls(html)
     html = simple_format(html, {}, sanitize: false)
     html = html.delete("\n")
-    html = link_urls(html)
     html = link_mentions(html, status.mentions)
     html = link_hashtags(html)
 
@@ -35,8 +34,7 @@ class Formatter
   def simplified_format(account)
     return reformat(account.note) unless account.local?
 
-    html = encode(account.note)
-    html = link_urls(html)
+    html = encode_and_link_urls(account.note)
     html = link_accounts(html)
     html = link_hashtags(html)
 
@@ -49,6 +47,26 @@ class Formatter
     HTMLEntities.new.encode(html)
   end
 
+  def encode_and_link_urls(html)
+    entities = Twitter::Extractor.extract_urls_with_indices(html, extract_url_without_protocol: false)
+    entities = entities.sort_by { |entity| entity[:indices].first }
+
+    chars = html.to_s.to_char_a
+    html_attrs = {
+      target: '_blank',
+      rel: 'nofollow noopener',
+    }
+    result = ''
+
+    last_index = entities.reduce(0) do |index, entity|
+      indices = entity[:indices]
+      result += encode(chars[index...indices.first].join)
+      result += Twitter::Autolink.send(:link_to_text, entity, link_html(entity[:url]), entity[:url], html_attrs)
+      indices.last
+    end
+    result += encode(chars[last_index..-1].join)
+  end
+
   def link_urls(html)
     Twitter::Autolink.auto_link_urls(html, url_target: '_blank',
                                            link_attribute_block: lambda { |_, a| a[:rel] << ' noopener' },
diff --git a/spec/lib/formatter_spec.rb b/spec/lib/formatter_spec.rb
index 4b003b8e5..b70231d26 100644
--- a/spec/lib/formatter_spec.rb
+++ b/spec/lib/formatter_spec.rb
@@ -2,7 +2,8 @@ require 'rails_helper'
 
 RSpec.describe Formatter do
   let(:account)       { Fabricate(:account, username: 'alice') }
-  let(:local_status)  { Fabricate(:status, text: 'Hello world http://google.com', account: account) }
+  let(:local_text)    { 'Hello world http://google.com' }
+  let(:local_status)  { Fabricate(:status, text: local_text, account: account) }
   let(:remote_status) { Fabricate(:status, text: '<script>alert("Hello")</script> Beep boop', uri: 'beepboop', account: account) }
 
   describe '#format' do
@@ -20,35 +21,81 @@ RSpec.describe Formatter do
       expect(subject).to match('<a href="http://google.com" rel="nofollow noopener" target="_blank"><span class="invisible">http://</span><span class="">google.com</span><span class="invisible"></span></a>')
     end
 
-=begin
-    it 'matches a stand-alone medium URL' do
-      expect(subject.match('https://hackernoon.com/the-power-to-build-communities-a-response-to-mark-zuckerberg-3f2cac9148a4')[0]).to eq 'https://hackernoon.com/the-power-to-build-communities-a-response-to-mark-zuckerberg-3f2cac9148a4'
+    context 'matches a stand-alone medium URL' do
+      let(:local_text) { 'https://hackernoon.com/the-power-to-build-communities-a-response-to-mark-zuckerberg-3f2cac9148a4' }
+      it 'has valid url' do
+        expect(subject).to include('href="https://hackernoon.com/the-power-to-build-communities-a-response-to-mark-zuckerberg-3f2cac9148a4"')
+      end
     end
 
-    it 'matches a stand-alone google URL' do
-      expect(subject.match('http://google.com')[0]).to eq 'http://google.com'
+    context 'matches a stand-alone google URL' do
+      let(:local_text) { 'http://google.com' }
+      it 'has valid url' do
+        expect(subject).to include('href="http://google.com"')
+      end
     end
 
-    it 'matches a URL without trailing period' do
-      expect(subject.match('http://www.mcmansionhell.com/post/156408871451/50-states-of-mcmansion-hell-scottsdale-arizona. ')[0]).to eq 'http://www.mcmansionhell.com/post/156408871451/50-states-of-mcmansion-hell-scottsdale-arizona'
+    context 'matches a URL without trailing period' do
+      let(:local_text) { 'http://www.mcmansionhell.com/post/156408871451/50-states-of-mcmansion-hell-scottsdale-arizona. ' }
+      it 'has valid url' do
+        expect(subject).to include('href="http://www.mcmansionhell.com/post/156408871451/50-states-of-mcmansion-hell-scottsdale-arizona"')
+      end
     end
 
+=begin
     it 'matches a URL without closing paranthesis' do
       expect(subject.match('(http://google.com/)')[0]).to eq 'http://google.com'
     end
+=end
+
+    context 'matches a URL without exclamation point' do
+      let(:local_text) { 'http://www.google.com!' }
+      it 'has valid url' do
+        expect(subject).to include('href="http://www.google.com"')
+      end
+    end
 
-    it 'matches a URL without exclamation point' do
-      expect(subject.match('http://www.google.com! ')[0]).to eq 'http://www.google.com'
+    context 'matches a URL without single quote' do
+      let(:local_text) { "http://www.google.com'" }
+      it 'has valid url' do
+        expect(subject).to include('href="http://www.google.com"')
+      end
     end
 
-    it 'matches a URL with a query string' do
-      expect(subject.match('https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink')[0]).to eq 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink'
+    context 'matches a URL without angle brackets' do
+      let(:local_text) { 'http://www.google.com>' }
+      it 'has valid url' do
+        expect(subject).to include('href="http://www.google.com"')
+      end
     end
 
-    it 'matches a URL with parenthesis in it' do
-      expect(subject.match('https://en.wikipedia.org/wiki/Diaspora_(software)')[0]).to eq 'https://en.wikipedia.org/wiki/Diaspora_(software)'
+    context 'matches a URL with a query string' do
+      let(:local_text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink' }
+      it 'has valid url' do
+        expect(subject).to include('href="https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&amp;q=autolink"')
+      end
+    end
+
+    context 'matches a URL with parenthesis in it' do
+      let(:local_text) { 'https://en.wikipedia.org/wiki/Diaspora_(software)' }
+      it 'has valid url' do
+        expect(subject).to include('href="https://en.wikipedia.org/wiki/Diaspora_(software)"')
+      end
+    end
+
+    context 'contains html (script tag)' do
+        let(:local_text) { '<script>alert("Hello")</script>' }
+        it 'has valid url' do
+            expect(subject).to match '<p>&lt;script&gt;alert(&quot;Hello&quot;)&lt;/script&gt;</p>'
+        end
+    end
+
+    context 'contains html (xss attack)' do
+      let(:local_text) { %q{<img src="javascript:alert('XSS');">} }
+      it 'has valid url' do
+        expect(subject).to match '<p>&lt;img src=&quot;javascript:alert(&apos;XSS&apos;);&quot;&gt;</p>'
+      end
     end
-=end
   end
 
   describe '#reformat' do