diff --git a/app/lib/link_details_extractor.rb b/app/lib/link_details_extractor.rb index 72992d2f4..d81f4a306 100644 --- a/app/lib/link_details_extractor.rb +++ b/app/lib/link_details_extractor.rb @@ -274,7 +274,7 @@ class LinkDetailsExtractor end def detect_encoding_and_parse_document - [detect_encoding, nil, @html_charset].uniq.each do |encoding| + [detect_encoding, nil, header_encoding].uniq.each do |encoding| document = Nokogiri::HTML(@html, nil, encoding) return document if document.to_s.valid_encoding? end @@ -286,6 +286,13 @@ class LinkDetailsExtractor guess&.fetch(:confidence, 0).to_i > 60 ? guess&.fetch(:encoding, nil) : nil end + def header_encoding + Encoding.find(@html_charset).name if @html_charset + rescue ArgumentError + # Encoding from HTTP header is not recognized by ruby + nil + end + def detector @detector ||= CharlockHolmes::EncodingDetector.new.tap do |detector| detector.strip_tags = true diff --git a/spec/fixtures/requests/alternative_utf8_spelling_in_header.txt b/spec/fixtures/requests/alternative_utf8_spelling_in_header.txt new file mode 100644 index 000000000..7aaea370e --- /dev/null +++ b/spec/fixtures/requests/alternative_utf8_spelling_in_header.txt @@ -0,0 +1,18 @@ +HTTP/1.1 200 OK +server: nginx +date: Thu, 13 Jun 2024 14:33:13 GMT +content-type: text/html; charset=utf8 +content-length: 192 +accept-ranges: bytes + + + + + + Webserver Configs R Us + + +

Welcome

+

Sneaky non-UTF character: á

+ + diff --git a/spec/services/fetch_link_card_service_spec.rb b/spec/services/fetch_link_card_service_spec.rb index 342902cdb..2f64f4055 100644 --- a/spec/services/fetch_link_card_service_spec.rb +++ b/spec/services/fetch_link_card_service_spec.rb @@ -32,6 +32,7 @@ RSpec.describe FetchLinkCardService do stub_request(:get, 'http://example.com/aergerliche-umlaute').to_return(request_fixture('redirect_with_utf8_url.txt')) stub_request(:get, 'http://example.com/page_without_title').to_return(request_fixture('page_without_title.txt')) stub_request(:get, 'http://example.com/long_canonical_url').to_return(request_fixture('long_canonical_url.txt')) + stub_request(:get, 'http://example.com/alternative_utf8_spelling_in_header').to_return(request_fixture('alternative_utf8_spelling_in_header.txt')) Rails.cache.write('oembed_endpoint:example.com', oembed_cache) if oembed_cache @@ -292,6 +293,14 @@ RSpec.describe FetchLinkCardService do expect(status.preview_card).to be_nil end end + + context 'with a URL where the `Content-Type` header uses `utf8` instead of `utf-8`' do + let(:status) { Fabricate(:status, text: 'test http://example.com/alternative_utf8_spelling_in_header') } + + it 'does not create a preview card' do + expect(status.preview_card.title).to eq 'Webserver Configs R Us' + end + end end context 'with a remote status' do