about summary refs log tree commit diff
diff options
context:
space:
mode:
authorふぁぼ原 <ko_kurihara@yahoo.co.jp>2017-09-15 01:03:20 +0900
committerEugen Rochko <eugen@zeonfederated.com>2017-09-14 18:03:20 +0200
commit3816943e6b5e86b22c35f3c068521f7a9007deec (patch)
tree3a245238e9410a4dce71bba1accbb2bc0e456508
parentb39d512ade9f556ae29d60239102faf67ff6a89f (diff)
Enable to recognize most kinds of characters as URL paths (#4941)
-rw-r--r--app/lib/formatter.rb2
-rw-r--r--app/services/fetch_link_card_service.rb14
-rw-r--r--config/initializers/twitter_regex.rb42
-rw-r--r--spec/lib/formatter_spec.rb32
-rw-r--r--spec/services/fetch_link_card_service_spec.rb11
5 files changed, 96 insertions, 5 deletions
diff --git a/app/lib/formatter.rb b/app/lib/formatter.rb
index cacc0364f..d9f843f44 100644
--- a/app/lib/formatter.rb
+++ b/app/lib/formatter.rb
@@ -131,7 +131,7 @@ class Formatter
   end
 
   def link_html(url)
-    url    = Addressable::URI.parse(url).display_uri.to_s
+    url    = Addressable::URI.parse(url).to_s
     prefix = url.match(/\Ahttps?:\/\/(www\.)?/).to_s
     text   = url[prefix.length, 30]
     suffix = url[prefix.length + 30..-1]
diff --git a/app/services/fetch_link_card_service.rb b/app/services/fetch_link_card_service.rb
index 215c69fe4..4acbfae7a 100644
--- a/app/services/fetch_link_card_service.rb
+++ b/app/services/fetch_link_card_service.rb
@@ -1,9 +1,15 @@
 # frozen_string_literal: true
 
 class FetchLinkCardService < BaseService
-  include ActionView::Helpers::TagHelper
-
-  URL_PATTERN = %r{https?://\S+}
+  URL_PATTERN = %r{
+    (                                                                                                 #   $1 URL
+      (https?:\/\/)?                                                                                  #   $2 Protocol (optional)
+      (#{Twitter::Regex[:valid_domain]})                                                              #   $3 Domain(s)
+      (?::(#{Twitter::Regex[:valid_port_number]}))?                                                   #   $4 Port number (optional)
+      (/#{Twitter::Regex[:valid_url_path]}*)?                                                         #   $5 URL Path and anchor
+      (\?#{Twitter::Regex[:valid_url_query_chars]}*#{Twitter::Regex[:valid_url_query_ending_chars]})? #   $6 Query String
+    )
+  }iox
 
   def call(status)
     @status = status
@@ -42,7 +48,7 @@ class FetchLinkCardService < BaseService
 
   def parse_urls
     if @status.local?
-      urls = @status.text.match(URL_PATTERN).to_a.map { |uri| Addressable::URI.parse(uri).normalize }
+      urls = @status.text.scan(URL_PATTERN).map { |array| Addressable::URI.parse(array[0]).normalize }
     else
       html  = Nokogiri::HTML(@status.text)
       links = html.css('a')
diff --git a/config/initializers/twitter_regex.rb b/config/initializers/twitter_regex.rb
new file mode 100644
index 000000000..5a0723d24
--- /dev/null
+++ b/config/initializers/twitter_regex.rb
@@ -0,0 +1,42 @@
+module Twitter
+  class Regex
+
+    REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}\(\)\?]/iou
+    REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*';:=\,\.\$%\[\]\p{Pd}_~&\|@]|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou
+    REGEXEN[:valid_url_balanced_parens] = /
+      \(
+        (?:
+          #{REGEXEN[:valid_general_url_path_chars]}+
+          |
+          # allow one nested level of balanced parentheses
+          (?:
+            #{REGEXEN[:valid_general_url_path_chars]}*
+            \(
+              #{REGEXEN[:valid_general_url_path_chars]}+
+            \)
+            #{REGEXEN[:valid_general_url_path_chars]}*
+          )
+        )
+      \)
+    /iox
+    REGEXEN[:valid_url_path] = /(?:
+      (?:
+        #{REGEXEN[:valid_general_url_path_chars]}*
+        (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
+        #{REGEXEN[:valid_url_path_ending_chars]}
+      )|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
+    )/iox
+    REGEXEN[:valid_url] = %r{
+      (                                                                                     #   $1 total match
+        (#{REGEXEN[:valid_url_preceding_chars]})                                            #   $2 Preceeding chracter
+        (                                                                                   #   $3 URL
+          (https?:\/\/)?                                                                    #   $4 Protocol (optional)
+          (#{REGEXEN[:valid_domain]})                                                       #   $5 Domain(s)
+          (?::(#{REGEXEN[:valid_port_number]}))?                                            #   $6 Port number (optional)
+          (/#{REGEXEN[:valid_url_path]}*)?                                                  #   $7 URL Path and anchor
+          (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? #   $8 Query String
+        )
+      )
+    }iox
+  end
+end
diff --git a/spec/lib/formatter_spec.rb b/spec/lib/formatter_spec.rb
index ab04ccbab..f9b7efac5 100644
--- a/spec/lib/formatter_spec.rb
+++ b/spec/lib/formatter_spec.rb
@@ -89,6 +89,38 @@ RSpec.describe Formatter do
       end
     end
 
+    context 'matches a URL with Japanese path string' do
+      let(:text) { 'https://ja.wikipedia.org/wiki/日本' }
+
+      it 'has valid URL' do
+        is_expected.to include 'href="https://ja.wikipedia.org/wiki/%E6%97%A5%E6%9C%AC"'
+      end
+    end
+
+    context 'matches a URL with Korean path string' do
+      let(:text) { 'https://ko.wikipedia.org/wiki/대한민국' }
+
+      it 'has valid URL' do
+        is_expected.to include 'href="https://ko.wikipedia.org/wiki/%EB%8C%80%ED%95%9C%EB%AF%BC%EA%B5%AD"'
+      end
+    end
+
+    context 'matches a URL with Simplified Chinese path string' do
+      let(:text) { 'https://baike.baidu.com/item/中华人民共和国' }
+
+      it 'has valid URL' do
+        is_expected.to include 'href="https://baike.baidu.com/item/%E4%B8%AD%E5%8D%8E%E4%BA%BA%E6%B0%91%E5%85%B1%E5%92%8C%E5%9B%BD"'
+      end
+    end
+
+    context 'matches a URL with Traditional Chinese path string' do
+      let(:text) { 'https://zh.wikipedia.org/wiki/臺灣' }
+
+      it 'has valid URL' do
+        is_expected.to include 'href="https://zh.wikipedia.org/wiki/%E8%87%BA%E7%81%A3"'
+      end
+    end
+
     context 'contains HTML (script tag)' do
       let(:text) { '<script>alert("Hello")</script>' }
 
diff --git a/spec/services/fetch_link_card_service_spec.rb b/spec/services/fetch_link_card_service_spec.rb
index b0aa740ac..ba61d22c3 100644
--- a/spec/services/fetch_link_card_service_spec.rb
+++ b/spec/services/fetch_link_card_service_spec.rb
@@ -12,6 +12,8 @@ RSpec.describe FetchLinkCardService do
     stub_request(:get, 'http://example.com/sjis_with_wrong_charset').to_return(request_fixture('sjis_with_wrong_charset.txt'))
     stub_request(:head, 'http://example.com/koi8-r').to_return(status: 200, headers: { 'Content-Type' => 'text/html' })
     stub_request(:get, 'http://example.com/koi8-r').to_return(request_fixture('koi8-r.txt'))
+    stub_request(:head, 'http://example.com/日本語').to_return(status: 200, headers: { 'Content-Type' => 'text/html' })
+    stub_request(:get, 'http://example.com/日本語').to_return(request_fixture('sjis.txt'))
     stub_request(:head, 'https://github.com/qbi/WannaCry').to_return(status: 404)
 
     subject.call(status)
@@ -52,6 +54,15 @@ RSpec.describe FetchLinkCardService do
         expect(status.preview_cards.first.title).to eq("Московя начинаетъ только въ XVI ст. привлекать внимане иностранцевъ.")
       end
     end
+
+    context do
+      let(:status) { Fabricate(:status, text: 'テストhttp://example.com/日本語') }
+
+      it 'works with Japanese path string' do
+        expect(a_request(:get, 'http://example.com/日本語')).to have_been_made.at_least_once
+        expect(status.preview_cards.first.title).to eq("SJISのページ")
+      end
+    end
   end
 
   context 'in a remote status' do