diff --git a/app/lib/link_details_extractor.rb b/app/lib/link_details_extractor.rb index 2e49d3fb4..dbfdd33fc 100644 --- a/app/lib/link_details_extractor.rb +++ b/app/lib/link_details_extractor.rb @@ -269,16 +269,21 @@ class LinkDetailsExtractor end def document - @document ||= Nokogiri::HTML(@html, nil, encoding) + @document ||= detect_encoding_and_parse_document end - def encoding - @encoding ||= begin - guess = detector.detect(@html, @html_charset) - guess&.fetch(:confidence, 0).to_i > 60 ? guess&.fetch(:encoding, nil) : nil + def detect_encoding_and_parse_document + [detect_encoding, nil, @html_charset, 'UTF-8'].uniq.each do |encoding| + document = Nokogiri::HTML(@html, nil, encoding) + return document if document.to_s.valid_encoding? end end + def detect_encoding + guess = detector.detect(@html, @html_charset) + guess&.fetch(:confidence, 0).to_i > 60 ? guess&.fetch(:encoding, nil) : nil + end + def detector @detector ||= CharlockHolmes::EncodingDetector.new.tap do |detector| detector.strip_tags = true diff --git a/spec/fixtures/requests/low_confidence_latin1.txt b/spec/fixtures/requests/low_confidence_latin1.txt new file mode 100644 index 000000000..39c3e23d6 --- /dev/null +++ b/spec/fixtures/requests/low_confidence_latin1.txt @@ -0,0 +1,17 @@ +HTTP/1.1 200 OK +server: nginx +date: Thu, 13 Jun 2024 14:33:13 GMT +content-type: text/html; charset=ISO-8859-1 +content-length: 158 +accept-ranges: bytes + + + +
+ +