From 3816943e6b5e86b22c35f3c068521f7a9007deec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E3=81=B5=E3=81=81=E3=81=BC=E5=8E=9F?=
Date: Fri, 15 Sep 2017 01:03:20 +0900
Subject: [PATCH] Allow most kinds of characters to be recognized in URL paths

 app/lib/formatter.rb                          |  2 +-
 app/services/fetch_link_card_service.rb       | 14 +++++--
 config/initializers/twitter_regex.rb          | 42 +++++++++++++++++++
 spec/lib/formatter_spec.rb                    | 32 ++++++++++++++
 spec/services/fetch_link_card_service_spec.rb | 11 +++++
 5 files changed, 96 insertions(+), 5 deletions(-)
 create mode 100644 config/initializers/twitter_regex.rb

diff --git a/app/lib/formatter.rb b/app/lib/formatter.rb
index cacc0364f..d9f843f44 100644
--- a/app/lib/formatter.rb
+++ b/app/lib/formatter.rb
@@ -131,7 +131,7 @@ class Formatter
   def link_html(url)
-    url    = Addressable::URI.parse(url).display_uri.to_s
+    url    = Addressable::URI.parse(url).to_s
     prefix = url.match(/\Ahttps?:\/\/(www\.)?/).to_s
     text   = url[prefix.length, 30]
     suffix = url[prefix.length + 30..-1]
diff --git a/app/services/fetch_link_card_service.rb b/app/services/fetch_link_card_service.rb
index 215c69fe4..4acbfae7a 100644
--- a/app/services/fetch_link_card_service.rb
+++ b/app/services/fetch_link_card_service.rb
@@ -1,9 +1,15 @@
 # frozen_string_literal: true
 class FetchLinkCardService < BaseService
-  include ActionView::Helpers::TagHelper
-  URL_PATTERN = %r{https?://\S+}
+  # Pattern that parse_urls scans a status text with to find URLs.
+  # It is assembled from the Twitter::Regex components for domain, port
+  # number, path and query string; the protocol part is optional.
+  # Flags: i (case-insensitive), o (interpolate the components only once),
+  # x (extended syntax, so the whitespace and trailing comments are ignored).
+  # Only the first capture group (the whole URL) is read by parse_urls.
+  URL_PATTERN = %r{
+    (                                                                                                 #   $1 URL
+      (https?:\/\/)?                                                                                  #   $2 Protocol (optional)
+      (#{Twitter::Regex[:valid_domain]})                                                              #   $3 Domain(s)
+      (?::(#{Twitter::Regex[:valid_port_number]}))?                                                   #   $4 Port number (optional)
+      (/#{Twitter::Regex[:valid_url_path]}*)?                                                         #   $5 URL Path and anchor
+      (\?#{Twitter::Regex[:valid_url_query_chars]}*#{Twitter::Regex[:valid_url_query_ending_chars]})? #   $6 Query String
+    )
+  }iox
   def call(status)
     @status = status
@@ -42,7 +48,7 @@ class FetchLinkCardService < BaseService
   def parse_urls
     if @status.local?
-      urls = @status.text.match(URL_PATTERN) { |uri| Addressable::URI.parse(uri).normalize }
+      urls = @status.text.scan(URL_PATTERN).map { |array| Addressable::URI.parse(array[0]).normalize }
       html  = Nokogiri::HTML(@status.text)
       links = html.css('a')
diff --git a/config/initializers/twitter_regex.rb b/config/initializers/twitter_regex.rb
new file mode 100644
index 000000000..5a0723d24
--- /dev/null
+++ b/config/initializers/twitter_regex.rb
@@ -0,0 +1,42 @@
+# Replaces the URL-path related entries of Twitter::Regex::REGEXEN so that
+# almost any non-whitespace character (for example CJK text) is accepted in a
+# URL path. Presumably these keys are first populated by the twitter-text gem;
+# :valid_url_preceding_chars, :valid_domain, :valid_port_number and the query
+# patterns are read below without being defined here.
+module Twitter
+  class Regex
+    # Any character except whitespace, parentheses and "?" may appear in a path.
+    REGEXEN[:valid_general_url_path_chars] = /[^\p{White_Space}\(\)\?]/iou
+    # Must be assigned before :valid_url_path_ending_chars, which interpolates
+    # it; otherwise the ending pattern embeds the previous value of this key
+    # and a path ending in "(...)" with non-Latin text is cut short.
+    REGEXEN[:valid_url_balanced_parens] = /
+      \(
+        (?:
+          #{REGEXEN[:valid_general_url_path_chars]}+
+          |
+          # allow one nested level of balanced parentheses
+          (?:
+            #{REGEXEN[:valid_general_url_path_chars]}*
+            \(
+              #{REGEXEN[:valid_general_url_path_chars]}+
+            \)
+            #{REGEXEN[:valid_general_url_path_chars]}*
+          )
+        )
+      \)
+    /iox
+    # A path may not end with punctuation that usually belongs to the
+    # surrounding sentence, but it may end with a balanced parenthesised group.
+    REGEXEN[:valid_url_path_ending_chars] = /[^\p{White_Space}\(\)\?!\*';:=\,\.\$%\[\]\p{Pd}_~&\|@]|(?:#{REGEXEN[:valid_url_balanced_parens]})/iou
+    # One path segment: either general characters (optionally with balanced
+    # parentheses) finished by a valid ending character, or characters
+    # terminated by a slash.
+    REGEXEN[:valid_url_path] = /(?:
+      (?:
+        #{REGEXEN[:valid_general_url_path_chars]}*
+        (?:#{REGEXEN[:valid_url_balanced_parens]} #{REGEXEN[:valid_general_url_path_chars]}*)*
+        #{REGEXEN[:valid_url_path_ending_chars]}
+      )|(?:#{REGEXEN[:valid_general_url_path_chars]}+\/)
+    )/iox
+    # Full URL matcher rebuilt so that it picks up the path pattern defined above.
+    REGEXEN[:valid_url] = %r{
+      (                                                                                     #   $1 total match
+        (#{REGEXEN[:valid_url_preceding_chars]})                                            #   $2 Preceding character
+        (                                                                                   #   $3 URL
+          (https?:\/\/)?                                                                    #   $4 Protocol (optional)
+          (#{REGEXEN[:valid_domain]})                                                       #   $5 Domain(s)
+          (?::(#{REGEXEN[:valid_port_number]}))?                                            #   $6 Port number (optional)
+          (/#{REGEXEN[:valid_url_path]}*)?                                                  #   $7 URL Path and anchor
+          (\?#{REGEXEN[:valid_url_query_chars]}*#{REGEXEN[:valid_url_query_ending_chars]})? #   $8 Query String
+        )
+      )
+    }iox
+  end
+end
diff --git a/spec/lib/formatter_spec.rb b/spec/lib/formatter_spec.rb
index ab04ccbab..f9b7efac5 100644
--- a/spec/lib/formatter_spec.rb
+++ b/spec/lib/formatter_spec.rb
@@ -89,6 +89,38 @@ RSpec.describe Formatter do
+    context 'matches a URL with Japanese path string' do
+      let(:text) { '日本' }
+      it 'has valid URL' do
+ include 'href=""'
+      end
+    end
+    context 'matches a URL with Korean path string' do
+      let(:text) { '대한민국' }
+      it 'has valid URL' do
+ include 'href=""'
+      end
+    end
+    context 'matches a URL with Simplified Chinese path string' do
+      let(:text) { '中华人民共和国' }
+      it 'has valid URL' do
+ include 'href=""'
+      end
+    end
+    context 'matches a URL with Traditional Chinese path string' do
+      let(:text) { '臺灣' }
+      it 'has valid URL' do
+ include 'href=""'
+      end
+    end
     context 'contains HTML (script tag)' do
       let(:text) { '<script>alert("Hello")</script>' }
diff --git a/spec/services/fetch_link_card_service_spec.rb b/spec/services/fetch_link_card_service_spec.rb
index b0aa740ac..ba61d22c3 100644
--- a/spec/services/fetch_link_card_service_spec.rb
+++ b/spec/services/fetch_link_card_service_spec.rb
@@ -12,6 +12,8 @@ RSpec.describe FetchLinkCardService do
     stub_request(:get, '').to_return(request_fixture('sjis_with_wrong_charset.txt'))
     stub_request(:head, '').to_return(status: 200, headers: { 'Content-Type' => 'text/html' })
     stub_request(:get, '').to_return(request_fixture('koi8-r.txt'))
+    stub_request(:head, '日本語').to_return(status: 200, headers: { 'Content-Type' => 'text/html' })
+    stub_request(:get, '日本語').to_return(request_fixture('sjis.txt'))
     stub_request(:head, '').to_return(status: 404)
@@ -52,6 +54,15 @@ RSpec.describe FetchLinkCardService do
         expect(status.preview_cards.first.title).to eq("Московя начинаетъ только въ XVI ст. привлекать внимане иностранцевъ.")
+    context do
+      let(:status) { Fabricate(:status, text: 'テスト日本語') }
+      it 'works with Japanese path string' do
+        expect(a_request(:get, '日本語')).to have_been_made.at_least_once
+        expect(status.preview_cards.first.title).to eq("SJISのページ")
+      end
+    end
   context 'in a remote status' do