2020-02-18 08:57:00 +01:00
|
|
|
# Portions of this file are derived from Pleroma:
|
|
|
|
# Pleroma: A lightweight social networking server
|
|
|
|
# Copyright © 2017-2020 Pleroma Authors <https://pleroma.social/>
|
|
|
|
# SPDX-License-Identifier: AGPL-3.0-only
|
|
|
|
|
|
|
|
defmodule Mobilizon.Service.RichMedia.Parsers.MetaTagsParser do
|
|
|
|
@moduledoc """
|
|
|
|
Module to parse meta tags data in HTML pages
|
|
|
|
"""
|
2021-01-22 18:14:52 +01:00
|
|
|
|
2021-09-28 19:40:37 +02:00
|
|
|
@doc """
Extracts prefixed meta tags from an HTML page and merges them into `data`.

Every `<meta>` element whose `key_name` attribute starts with `"prefix:"` is
normalized into a map entry and merged on top of `data`. When the resulting
map is not empty, the page `<title>` and the `description` meta tag are used
to fill in `:title` and `:description` if those keys are still missing.

Returns `{:ok, map}` when at least one entry is available and
`{:error, error_message}` when the resulting map is empty.
"""
@spec parse(String.t(), map(), String.t(), String.t(), atom(), atom(), list(atom())) ::
        {:ok, map()} | {:error, String.t()}
def parse(
      html,
      data,
      prefix,
      error_message,
      key_name,
      value_name \\ :content,
      allowed_attributes \\ []
    ) do
  elements = get_elements(html, key_name, prefix)

  # Fold every matching element into the accumulator; later elements win on key clashes.
  collected =
    for element <- elements, reduce: data do
      merged ->
        Map.merge(
          merged,
          normalize_attributes(element, prefix, key_name, value_name, allowed_attributes)
        )
    end

  with_title = maybe_put_title(collected, html)
  result = maybe_put_description(with_title, html)

  if Enum.empty?(result), do: {:error, error_message}, else: {:ok, result}
end
|
|
|
|
|
2021-09-28 19:40:37 +02:00
|
|
|
# Returns every `<meta>` element whose `key_name` attribute value starts with "prefix:".
# Uses the raising `Floki.parse_document!/1`, so a parse failure raises here.
@spec get_elements(String.t(), atom(), String.t()) :: Floki.html_tree()
defp get_elements(html, key_name, prefix) do
  document = Floki.parse_document!(html)
  selector = "meta[#{key_name}^='#{prefix}:']"

  Floki.find(document, selector)
end
|
|
|
|
|
2021-09-28 19:40:37 +02:00
|
|
|
# Turns a single `<meta>` node into a one-entry map `%{key => value}`.
#
# * `html_node` - a `{tag, attributes, children}` tuple
# * `prefix` - namespace removed from the key attribute (e.g. "og" for "og:title")
# * `key_name` / `value_name` - names of the attributes holding the key and the value
# * `allowed_attributes` - atoms naming the keys that may be returned
#
# Returns `%{}` when the key is not in `allowed_attributes` or when the element
# has no key or no value attribute.
@spec normalize_attributes(Floki.html_node(), String.t(), atom(), atom(), list(atom())) :: map()
defp normalize_attributes(html_node, prefix, key_name, value_name, allowed_attributes) do
  {_tag, attributes, _children} = html_node
  attributes = Map.new(attributes)

  key =
    case Map.get(attributes, to_string(key_name)) do
      # Strip the namespace exactly once and only from the key attribute, so that a
      # value which itself starts with "prefix:" is left untouched.
      raw_key when is_binary(raw_key) -> String.replace_prefix(raw_key, "#{prefix}:", "")
      _ -> nil
    end

  value = Map.get(attributes, to_string(value_name))
  allowed_names = Enum.map(allowed_attributes, &to_string/1)

  # A missing value attribute is skipped instead of crashing on `String.trim(nil)`.
  if key in allowed_names and is_binary(value) do
    # Safe: the atom already exists because it is listed in `allowed_attributes`.
    %{String.to_existing_atom(key) => String.trim(value)}
  else
    %{}
  end
end
|
|
|
|
|
2021-09-28 19:40:37 +02:00
|
|
|
# Adds a `:title` entry taken from the page's `<title>` element.
#
# The map is returned unchanged when it already has a `:title`, when it is empty,
# or when the page title is empty once surrounding whitespace is removed.
@spec maybe_put_title(map(), String.t()) :: map()
defp maybe_put_title(%{title: _} = meta, _html), do: meta

defp maybe_put_title(meta, html) when meta != %{} do
  # Trim before testing for emptiness so a whitespace-only title is not stored as "".
  case html |> get_page_title() |> String.trim() do
    "" -> meta
    title -> Map.put_new(meta, :title, title)
  end
end

defp maybe_put_title(meta, _html), do: meta
|
|
|
|
|
2021-09-28 19:40:37 +02:00
|
|
|
# Adds a `:description` entry taken from the page's `description` meta tag.
#
# The map is returned unchanged when it already has a `:description`, when it is
# empty, or when the page description is empty once surrounding whitespace is removed.
@spec maybe_put_description(map(), String.t()) :: map()
defp maybe_put_description(%{description: _} = meta, _html), do: meta

defp maybe_put_description(meta, html) when meta != %{} do
  # Trim before testing for emptiness so a whitespace-only description is not stored as "".
  case html |> get_page_description() |> String.trim() do
    "" -> meta
    description -> Map.put_new(meta, :description, description)
  end
end

defp maybe_put_description(meta, _html), do: meta
|
|
|
|
|
2021-05-03 14:52:37 +02:00
|
|
|
# Returns the text of the first `<title>` element found under `html head`,
# or "" when the document cannot be parsed or has no such element.
@spec get_page_title(String.t()) :: String.t()
defp get_page_title(html) do
  with {:ok, parsed} <- Floki.parse_document(html),
       [title_node | _] <- Floki.find(parsed, "html head title"),
       text when is_binary(text) <- Floki.text(title_node) do
    text
  else
    _ -> ""
  end
end
|
|
|
|
|
2021-05-03 14:52:37 +02:00
|
|
|
# Returns the `content` attribute of the first `<meta name="description">` element
# found under `html head`, or "" when the document cannot be parsed, the element
# is missing, or it has no `content` attribute.
@spec get_page_description(String.t()) :: String.t()
defp get_page_description(html) do
  with {:ok, parsed} <- Floki.parse_document(html),
       [meta_node | _] <- Floki.find(parsed, "html head meta[name='description']"),
       [content | _] <- Floki.attribute(meta_node, "content") do
    content
  else
    _ -> ""
  end
end
|
|
|
|
end
|