Improve rich media parsers

Signed-off-by: Thomas Citharel <tcit@tcit.fr>
This commit is contained in:
Thomas Citharel 2022-01-18 12:52:45 +01:00
parent a66f19cc5d
commit 2134e7b152
No known key found for this signature in database
GPG key ID: A061B9DDE0CA0773
6 changed files with 16 additions and 5 deletions

View file

@@ -87,6 +87,10 @@ defmodule Mobilizon.Service.RichMedia.Parser do
           {:ok, data}
 
+        {:ok, err} ->
+          Logger.debug("HTTP error: #{inspect(err)}")
+          {:error, "HTTP error: #{inspect(err)}"}
+
         {:error, err} ->
           Logger.debug("HTTP error: #{inspect(err)}")
           {:error, "HTTP error: #{inspect(err)}"}
@@ -196,6 +200,8 @@ defmodule Mobilizon.Service.RichMedia.Parser do
   @spec maybe_parse(String.t()) :: map()
   defp maybe_parse(html) do
     Enum.reduce_while(parsers(), %{}, fn parser, acc ->
+      Logger.debug("Using #{inspect(parser)} to parse link")
+
       case parser.parse(html, acc) do
         {:ok, data} ->
           {:halt, data}

View file

@@ -35,7 +35,7 @@ defmodule Mobilizon.Service.RichMedia.Parsers.Fallback do
   defp get_page(html, :title) do
     html
     |> Floki.parse_document!()
-    |> Floki.find("html title")
+    |> Floki.find("title")
     |> List.first()
     |> Floki.text()
     |> String.trim()

View file

@@ -53,7 +53,10 @@ defmodule Mobilizon.Service.RichMedia.Parsers.MetaTagsParser do
       end)
 
     if data[to_string(key_name)] in Enum.map(allowed_attributes, &to_string/1) do
-      %{String.to_existing_atom(data[to_string(key_name)]) => data[to_string(value_name)]}
+      %{
+        String.to_existing_atom(data[to_string(key_name)]) =>
+          String.trim(data[to_string(value_name)])
+      }
     else
       %{}
     end
@@ -65,7 +68,7 @@ defmodule Mobilizon.Service.RichMedia.Parsers.MetaTagsParser do
   defp maybe_put_title(meta, html) when meta != %{} do
     case get_page_title(html) do
       "" -> meta
-      title -> Map.put_new(meta, :title, title)
+      title -> Map.put_new(meta, :title, String.trim(title))
     end
   end
@@ -80,7 +83,7 @@ defmodule Mobilizon.Service.RichMedia.Parsers.MetaTagsParser do
         meta
 
       description ->
-        Map.put_new(meta, :description, description)
+        Map.put_new(meta, :description, String.trim(description))
     end
   end

View file

@@ -67,7 +67,7 @@ defmodule Mobilizon.Service.RichMedia.Parsers.OEmbed do
          {:ok, data} <- Jason.decode(json),
          data <-
            data
-           |> Map.new(fn {k, v} -> {String.to_existing_atom(k), v} end)
+           |> Map.new(fn {k, v} -> {String.to_existing_atom(k), String.trim(v)} end)
            |> Map.take(@oembed_allowed_attributes) do
       {:ok, data}
     end

View file

@@ -54,6 +54,7 @@ defmodule Mobilizon.Service.RichMedia.Parsers.OGP do
   defp transform_tags(data) do
     data
     |> Enum.reject(fn {_, v} -> is_nil(v) end)
+    |> Enum.map(fn {k, v} -> {k, String.trim(v)} end)
     |> Map.new()
     |> Map.update(:image_remote_url, Map.get(data, :image), & &1)
     |> Map.update(:width, get_integer_value(data, :"image:width"), & &1)

View file

@@ -63,6 +63,7 @@ defmodule Mobilizon.Service.RichMedia.Parsers.TwitterCard do
   defp transform_tags(data) do
     data
     |> Enum.reject(fn {_, v} -> is_nil(v) end)
+    |> Enum.map(fn {k, v} -> {k, String.trim(v)} end)
     |> Map.new()
     |> Map.update(:image_remote_url, Map.get(data, :image), & &1)
   end