From 6a5e3da6b044e50635d293c2716883cc5627e4c8 Mon Sep 17 00:00:00 2001 From: Jakub Mendyk Date: Sat, 2 Feb 2019 19:01:18 +0100 Subject: Allow most kinds of characters in URL query (fixes #8408) (#8447) * Allow unicode characters in URL query strings Fixes #8408 * Alternative approach to unicode support in urls Adds PoC/idea to approch this problem. --- spec/lib/formatter_spec.rb | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) (limited to 'spec/lib') diff --git a/spec/lib/formatter_spec.rb b/spec/lib/formatter_spec.rb index 0c1efe7c3..9872d3756 100644 --- a/spec/lib/formatter_spec.rb +++ b/spec/lib/formatter_spec.rb @@ -74,10 +74,36 @@ RSpec.describe Formatter do end context 'given a URL with a query string' do - let(:text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink' } + context 'with escaped unicode character' do + let(:text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink' } - it 'matches the full URL' do - is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink"' + it 'matches the full URL' do + is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink"' + end + end + + context 'with unicode character' do + let(:text) { 'https://www.ruby-toolbox.com/search?utf8=✓&q=autolink' } + + it 'matches the full URL' do + is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=✓&q=autolink"' + end + end + + context 'with unicode character at the end' do + let(:text) { 'https://www.ruby-toolbox.com/search?utf8=✓' } + + it 'matches the full URL' do + is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=✓"' + end + end + + context 'with escaped and not escaped unicode characters' do + let(:text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&utf81=✓&q=autolink' } + + it 'preserves escaped unicode characters' do + is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&utf81=✓&q=autolink"' + end end end -- cgit From 157d3af46c1a94dd0365b6e33e82919dd3c15fde Mon Sep 17 00:00:00 2001 From: Hinaloe Date: Sat, 9 Feb 2019 11:39:38 +0900 Subject: Only URLs extract with pre-escaped text (#9991) * [test] add japanese hashtag testcase * Only URLs extract with pre-escaped text ( https://github.com/tootsuite/mastodon/issues/9989 ) --- app/lib/formatter.rb | 2 +- spec/lib/formatter_spec.rb | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'spec/lib') diff --git a/app/lib/formatter.rb b/app/lib/formatter.rb index 2e3587169..6603b8df1 100644 --- a/app/lib/formatter.rb +++ b/app/lib/formatter.rb @@ -210,7 +210,7 @@ class Formatter # Note: I couldn't obtain list_slug with @user/list-name format # for mention so this requires additional check - special = Extractor.extract_entities_with_indices(escaped, options).map do |extract| + special = Extractor.extract_urls_with_indices(escaped, options).map do |extract| # exactly one of :url, :hashtag, :screen_name, :cashtag keys is present key = (extract.keys & [:url, :hashtag, :screen_name, :cashtag]).first diff --git a/spec/lib/formatter_spec.rb b/spec/lib/formatter_spec.rb index 9872d3756..8fb6695a9 100644 --- a/spec/lib/formatter_spec.rb +++ b/spec/lib/formatter_spec.rb @@ -194,6 +194,14 @@ RSpec.describe Formatter do is_expected.to include '/tags/hashtag" class="mention hashtag" rel="tag">#hashtag' end end + + context 'given text containing a hashtag with Unicode chars' do + let(:text) { '#hashtagタグ' } + + it 'creates a hashtag link' do + is_expected.to include '/tags/hashtag%E3%82%BF%E3%82%B0" class="mention hashtag" rel="tag">#hashtagタグ' + end + end end describe '#format_spoiler' do -- cgit From 016ad37bc8c9ca8bf8f872b8fee704a0388f575e Mon Sep 17 00:00:00 2001 From: Eugen Rochko Date: Sat, 9 Feb 2019 20:13:11 +0100 Subject: Fix URL linkifier grabbing full-width spaces and quotations (#9997) Fix #9993 Fix #5654 --- app/lib/formatter.rb | 12 +++++++++- config/initializers/twitter_regex.rb | 4 ++-- spec/lib/formatter_spec.rb | 44 ++++++++++++++++++++++++++++++++++-- 3 files changed, 55 insertions(+), 5 deletions(-) (limited to 'spec/lib') diff --git a/app/lib/formatter.rb b/app/lib/formatter.rb index 6603b8df1..0653214f5 100644 --- a/app/lib/formatter.rb +++ b/app/lib/formatter.rb @@ -199,12 +199,22 @@ class Formatter result.flatten.join end + UNICODE_ESCAPE_BLACKLIST_RE = /\p{Z}|\p{P}/ + def utf8_friendly_extractor(text, options = {}) old_to_new_index = [0] escaped = text.chars.map do |c| - output = c.ord.to_s(16).length > 2 ? CGI.escape(c) : c + output = begin + if c.ord.to_s(16).length > 2 && UNICODE_ESCAPE_BLACKLIST_RE.match(c).nil? + CGI.escape(c) + else + c + end + end + old_to_new_index << old_to_new_index.last + output.length + output end.join diff --git a/config/initializers/twitter_regex.rb b/config/initializers/twitter_regex.rb index 0e8f5bfeb..0ddbbee98 100644 --- a/config/initializers/twitter_regex.rb +++ b/config/initializers/twitter_regex.rb @@ -1,7 +1,7 @@ module Twitter class Regex - REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}\(\)\?]/iou - REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*';:=\,\.\$%\[\]~&\|@]|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou + REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}<>\(\)\?]/iou + REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*"'「」<>;:=\,\.\$%\[\]~&\|@]|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou REGEXEN[:valid_url_balanced_parens] = / \( (?: diff --git a/spec/lib/formatter_spec.rb b/spec/lib/formatter_spec.rb index 8fb6695a9..96d2fc7e0 100644 --- a/spec/lib/formatter_spec.rb +++ b/spec/lib/formatter_spec.rb @@ -115,6 +115,22 @@ RSpec.describe Formatter do end end + context 'given a URL in quotation marks' do + let(:text) { '"https://example.com/"' } + + it 'does not match the quotation marks' do + is_expected.to include 'href="https://example.com/"' + end + end + + context 'given a URL in angle brackets' do + let(:text) { '' } + + it 'does not match the angle brackets' do + is_expected.to include 'href="https://example.com/"' + end + end + context 'given a URL with Japanese path string' do let(:text) { 'https://ja.wikipedia.org/wiki/日本' } @@ -131,6 +147,22 @@ RSpec.describe Formatter do end end + context 'given a URL with a full-width space' do + let(:text) { 'https://example.com/ abc123' } + + it 'does not match the full-width space' do + is_expected.to include 'href="https://example.com/"' + end + end + + context 'given a URL in Japanese quotation marks' do + let(:text) { '「[https://example.org/」' } + + it 'does not match the quotation marks' do + is_expected.to include 'href="https://example.org/"' + end + end + context 'given a URL with Simplified Chinese path string' do let(:text) { 'https://baike.baidu.com/item/中华人民共和国' } @@ -150,7 +182,11 @@ RSpec.describe Formatter do context 'given a URL containing unsafe code (XSS attack, visible part)' do let(:text) { %q{http://example.com/bb} } - it 'escapes the HTML in the URL' do + it 'does not include the HTML in the URL' do + is_expected.to include '"http://example.com/b"' + end + + it 'escapes the HTML' do is_expected.to include '<del>b</del>' end end @@ -158,7 +194,11 @@ RSpec.describe Formatter do context 'given a URL containing unsafe code (XSS attack, invisible part)' do let(:text) { %q{http://example.com/blahblahblahblah/a} } - it 'escapes the HTML in the URL' do + it 'does not include the HTML in the URL' do + is_expected.to include '"http://example.com/blahblahblahblah/a"' + end + + it 'escapes the HTML' do is_expected.to include '<script>alert("Hello")</script>' end end -- cgit