Detect and convert html body in the correct charset before parsing it
Signed-off-by: Thomas Citharel <tcit@tcit.fr>
This commit is contained in:
parent
c8735e5837
commit
fbe5a8d0c4
|
@ -185,6 +185,12 @@ config :phoenix, :filter_parameters, ["password", "token"]
|
||||||
config :absinthe, schema: Mobilizon.GraphQL.Schema
|
config :absinthe, schema: Mobilizon.GraphQL.Schema
|
||||||
config :absinthe, Absinthe.Logger, filter_variables: ["token", "password", "secret"]
|
config :absinthe, Absinthe.Logger, filter_variables: ["token", "password", "secret"]
|
||||||
|
|
||||||
|
# Encodings handed to the :codepagex application: plain ASCII, a case-insensitive
# regex on "iso8859", and the Windows CP1252 code page.
# NOTE(review): presumably the regex selects every encoding whose name matches it and the
# list is read when Codepagex is compiled; confirm against the Codepagex docs.
config :codepagex, :encodings, [
|
||||||
|
:ascii,
|
||||||
|
~r[iso8859]i,
|
||||||
|
:"VENDORS/MICSFT/WINDOWS/CP1252"
|
||||||
|
]
|
||||||
|
|
||||||
config :mobilizon, Mobilizon.Web.Gettext, split_module_by: [:locale, :domain]
|
config :mobilizon, Mobilizon.Web.Gettext, split_module_by: [:locale, :domain]
|
||||||
|
|
||||||
config :ex_cldr,
|
config :ex_cldr,
|
||||||
|
|
|
@ -74,6 +74,7 @@ defmodule Mobilizon.Service.RichMedia.Parser do
|
||||||
{:is_html, _response_headers, true} <-
|
{:is_html, _response_headers, true} <-
|
||||||
{:is_html, response_headers, is_html(response_headers)} do
|
{:is_html, response_headers, is_html(response_headers)} do
|
||||||
body
|
body
|
||||||
|
|> convert_utf8(response_headers)
|
||||||
|> maybe_parse()
|
|> maybe_parse()
|
||||||
|> Map.put(:url, url)
|
|> Map.put(:url, url)
|
||||||
|> maybe_add_favicon()
|
|> maybe_add_favicon()
|
||||||
|
@ -317,4 +318,78 @@ defmodule Mobilizon.Service.RichMedia.Parser do
|
||||||
defp default_user_agent(_url) do
|
defp default_user_agent(_url) do
|
||||||
Config.instance_user_agent()
|
Config.instance_user_agent()
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Re-encodes `body` according to the charset announced for the response.
#
# Looks up the "Content-Type" response header and lets `handle_charset/2`
# decide which charset (if any) the body has to be converted from.
defp convert_utf8(body, headers) do
  content_type = get_header(headers, "Content-Type")
  handle_charset(content_type, body)
end
|
||||||
|
|
||||||
|
# Picks the charset to convert `body` from and performs the conversion.
#
# First clause: no Content-Type header value is available, so the charset is
# looked up in the document's <meta http-equiv> declaration. When nothing
# usable is found (nil or an empty string) the body is returned untouched.
defp handle_charset(nil, body) do
  case detect_charset_from_meta(body) do
    charset when charset in [nil, ""] -> body
    charset -> convert_body(body, charset)
  end
end

# Second clause: a Content-Type header value is present. Its `charset`
# parameter wins; when it is missing or empty we fall back to the <meta>
# lookup above instead of handing an empty charset name to the converter.
defp handle_charset(content_type, body) do
  case charset_from_content_type(content_type) do
    charset when charset in [nil, ""] -> handle_charset(nil, body)
    charset -> convert_body(body, charset)
  end
end
|
||||||
|
|
||||||
|
# Extracts the `charset` parameter from a Content-Type value such as
# "text/html; charset=iso-8859-1".
#
# Returns the charset as found in the parsed parameters, or nil when the value
# has no ";"-separated parameter section or no "charset" key.
defp charset_from_content_type(content_type) do
  # :binary.split/2 only splits on the first ";", so everything after the
  # media type is passed to the parameter parser in one piece.
  case :binary.split(content_type, ";") do
    [_media_type, raw_params] ->
      case Utils.params(raw_params) do
        %{"charset" => charset} -> charset
        _other -> nil
      end

    _no_params ->
      nil
  end
end
|
||||||
|
|
||||||
|
# Looks for a <meta http-equiv="content-type"> element in the HTML body and
# returns the charset it declares, or nil when no such element exists.
#
# The lowercase spelling of the attribute value is tried first; the
# "Content-Type" spelling is only searched when the first lookup finds nothing.
defp detect_charset_from_meta(body) do
  Logger.debug("Trying to detect charset from meta")

  document = Floki.parse_document!(body)

  selectors = [
    "meta[http-equiv=\"content-type\"]",
    "meta[http-equiv=\"Content-Type\"]"
  ]

  # Enum.find_value/2 stops at the first selector that yields an element, so
  # the second query is skipped whenever the first one matches.
  first_meta =
    Enum.find_value(selectors, fn selector ->
      document
      |> Floki.find(selector)
      |> List.first()
    end)

  case first_meta do
    nil -> nil
    meta -> content_type_from_meta(meta)
  end
end
|
||||||
|
|
||||||
|
# Reads the `content` attribute of a <meta http-equiv> element and returns the
# charset it declares (via `charset_from_content_type/1`).
#
# Returns nil when the element carries no `content` attribute at all; the
# previous pipeline passed nil to String.trim/1 in that case, which raises.
defp content_type_from_meta(meta) do
  Logger.debug("Finding content-type into <meta> element")

  case Floki.attribute(meta, "content") do
    [content | _rest] ->
      content
      |> String.trim()
      |> charset_from_content_type()

    [] ->
      nil
  end
end
|
||||||
|
|
||||||
|
# Converts `body` from `charset` into an Elixir string using Codepagex.
#
# Charset labels are case-insensitive, and servers commonly send "UTF-8" in
# upper case, so the label is trimmed and downcased before it is inspected.
# A body already declared as UTF-8 is returned unchanged; anything else is
# mapped through `fix_charset/1` and handed to `Codepagex.to_string!/2`, which
# raises when the conversion fails.
defp convert_body(body, charset) do
  case charset |> String.trim() |> String.downcase() do
    # "utf8" is accepted alongside "utf-8" as a common spelling of the same label.
    utf8 when utf8 in ["utf-8", "utf8"] ->
      body

    normalized ->
      Logger.debug("Converting body from #{normalized}")
      Codepagex.to_string!(body, fix_charset(normalized))
  end
end
|
||||||
|
|
||||||
|
# Translates a charset label into the encoding identifier passed to Codepagex.
#
# "windows-1252" maps to the CP1252 encoding atom; every other label is
# returned as a string with its dashes replaced by underscores.
defp fix_charset(charset) do
  case charset do
    "windows-1252" -> :"VENDORS/MICSFT/WINDOWS/CP1252"
    other -> String.replace(other, "-", "_")
  end
end
|
||||||
end
|
end
|
||||||
|
|
1
mix.exs
1
mix.exs
|
@ -203,6 +203,7 @@ defmodule Mobilizon.Mixfile do
|
||||||
{:export, "~> 0.1.0"},
|
{:export, "~> 0.1.0"},
|
||||||
{:tz_world, "~> 1.0"},
|
{:tz_world, "~> 1.0"},
|
||||||
{:tzdata, "~> 1.1"},
|
{:tzdata, "~> 1.1"},
|
||||||
|
{:codepagex, "~> 0.1.6"},
|
||||||
# Dev and test dependencies
|
# Dev and test dependencies
|
||||||
{:phoenix_live_reload, "~> 1.2", only: [:dev, :e2e]},
|
{:phoenix_live_reload, "~> 1.2", only: [:dev, :e2e]},
|
||||||
{:ex_machina, "~> 2.3", only: [:dev, :test]},
|
{:ex_machina, "~> 2.3", only: [:dev, :test]},
|
||||||
|
|
1
mix.lock
1
mix.lock
|
@ -11,6 +11,7 @@
|
||||||
"cachex": {:hex, :cachex, "3.4.0", "868b2959ea4aeb328c6b60ff66c8d5123c083466ad3c33d3d8b5f142e13101fb", [:mix], [{:eternal, "~> 1.2", [hex: :eternal, repo: "hexpm", optional: false]}, {:jumper, "~> 1.0", [hex: :jumper, repo: "hexpm", optional: false]}, {:sleeplocks, "~> 1.1", [hex: :sleeplocks, repo: "hexpm", optional: false]}, {:unsafe, "~> 1.0", [hex: :unsafe, repo: "hexpm", optional: false]}], "hexpm", "370123b1ab4fba4d2965fb18f87fd758325709787c8c5fce35b3fe80645ccbe5"},
|
"cachex": {:hex, :cachex, "3.4.0", "868b2959ea4aeb328c6b60ff66c8d5123c083466ad3c33d3d8b5f142e13101fb", [:mix], [{:eternal, "~> 1.2", [hex: :eternal, repo: "hexpm", optional: false]}, {:jumper, "~> 1.0", [hex: :jumper, repo: "hexpm", optional: false]}, {:sleeplocks, "~> 1.1", [hex: :sleeplocks, repo: "hexpm", optional: false]}, {:unsafe, "~> 1.0", [hex: :unsafe, repo: "hexpm", optional: false]}], "hexpm", "370123b1ab4fba4d2965fb18f87fd758325709787c8c5fce35b3fe80645ccbe5"},
|
||||||
"certifi": {:hex, :certifi, "2.8.0", "d4fb0a6bb20b7c9c3643e22507e42f356ac090a1dcea9ab99e27e0376d695eba", [:rebar3], [], "hexpm", "6ac7efc1c6f8600b08d625292d4bbf584e14847ce1b6b5c44d983d273e1097ea"},
|
"certifi": {:hex, :certifi, "2.8.0", "d4fb0a6bb20b7c9c3643e22507e42f356ac090a1dcea9ab99e27e0376d695eba", [:rebar3], [], "hexpm", "6ac7efc1c6f8600b08d625292d4bbf584e14847ce1b6b5c44d983d273e1097ea"},
|
||||||
"cldr_utils": {:hex, :cldr_utils, "2.17.0", "05453797e5b89f936c54c5602ac881e46b1ba4423a803c27a414466f4b598c94", [:mix], [{:castore, "~> 0.1", [hex: :castore, repo: "hexpm", optional: true]}, {:certifi, "~> 2.5", [hex: :certifi, repo: "hexpm", optional: true]}, {:decimal, "~> 1.9 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}], "hexpm", "6077ddaaa155f27755638225617bdc00c004f39b3c9355b688e52a3fc98d57e8"},
|
"cldr_utils": {:hex, :cldr_utils, "2.17.0", "05453797e5b89f936c54c5602ac881e46b1ba4423a803c27a414466f4b598c94", [:mix], [{:castore, "~> 0.1", [hex: :castore, repo: "hexpm", optional: true]}, {:certifi, "~> 2.5", [hex: :certifi, repo: "hexpm", optional: true]}, {:decimal, "~> 1.9 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}], "hexpm", "6077ddaaa155f27755638225617bdc00c004f39b3c9355b688e52a3fc98d57e8"},
|
||||||
|
"codepagex": {:hex, :codepagex, "0.1.6", "49110d09a25ee336a983281a48ef883da4c6190481e0b063afe2db481af6117e", [:mix], [], "hexpm", "1521461097dde281edf084062f525a4edc6a5e49f4fd1f5ec41c9c4955d5bd59"},
|
||||||
"combine": {:hex, :combine, "0.10.0", "eff8224eeb56498a2af13011d142c5e7997a80c8f5b97c499f84c841032e429f", [:mix], [], "hexpm", "1b1dbc1790073076580d0d1d64e42eae2366583e7aecd455d1215b0d16f2451b"},
|
"combine": {:hex, :combine, "0.10.0", "eff8224eeb56498a2af13011d142c5e7997a80c8f5b97c499f84c841032e429f", [:mix], [], "hexpm", "1b1dbc1790073076580d0d1d64e42eae2366583e7aecd455d1215b0d16f2451b"},
|
||||||
"comeonin": {:hex, :comeonin, "5.3.2", "5c2f893d05c56ae3f5e24c1b983c2d5dfb88c6d979c9287a76a7feb1e1d8d646", [:mix], [], "hexpm", "d0993402844c49539aeadb3fe46a3c9bd190f1ecf86b6f9ebd71957534c95f04"},
|
"comeonin": {:hex, :comeonin, "5.3.2", "5c2f893d05c56ae3f5e24c1b983c2d5dfb88c6d979c9287a76a7feb1e1d8d646", [:mix], [], "hexpm", "d0993402844c49539aeadb3fe46a3c9bd190f1ecf86b6f9ebd71957534c95f04"},
|
||||||
"connection": {:hex, :connection, "1.1.0", "ff2a49c4b75b6fb3e674bfc5536451607270aac754ffd1bdfe175abe4a6d7a68", [:mix], [], "hexpm", "722c1eb0a418fbe91ba7bd59a47e28008a189d47e37e0e7bb85585a016b2869c"},
|
"connection": {:hex, :connection, "1.1.0", "ff2a49c4b75b6fb3e674bfc5536451607270aac754ffd1bdfe175abe4a6d7a68", [:mix], [], "hexpm", "722c1eb0a418fbe91ba7bd59a47e28008a189d47e37e0e7bb85585a016b2869c"},
|
||||||
|
|
Loading…
Reference in a new issue