Fixed deduplicated files from orphan media being deleted as well

Happens when a file is uploaded, then orphaned, and a similar file is
used somewhere. The CleanMedia job service didn't consider that case

Signed-off-by: Thomas Citharel <tcit@tcit.fr>
This commit is contained in:
Thomas Citharel 2021-08-22 16:17:20 +02:00
parent 8eacf29705
commit 241710807c
No known key found for this signature in database
GPG key ID: A061B9DDE0CA0773
9 changed files with 98 additions and 137 deletions

1
.gitignore vendored
View file

@ -31,6 +31,7 @@ priv/cert/
cover/ cover/
site/ site/
test/fixtures/image_tmp.jpg test/fixtures/image_tmp.jpg
test/fixtures/picture_tmp.png
test/fixtures/DSCN0010_tmp.jpg test/fixtures/DSCN0010_tmp.jpg
test/uploads/ test/uploads/
uploads/* uploads/*

View file

@ -1,107 +0,0 @@
defmodule Mix.Tasks.Mobilizon.Maintenance.FixUnattachedMediaInBody do
@moduledoc """
Task to reattach media files that were added in event, post or comment bodies without being attached to their entities.
This task should only be run once.
"""
use Mix.Task
import Mix.Tasks.Mobilizon.Common
alias Mobilizon.{Discussions, Events, Medias, Posts}
alias Mobilizon.Discussions.Comment
alias Mobilizon.Events.Event
alias Mobilizon.Posts.Post
alias Mobilizon.Storage.Repo
require Logger
@preferred_cli_env "prod"
# TODO: Remove me in Mobilizon 1.2
@shortdoc "Reattaches inline media from events and posts"
def run([]) do
start_mobilizon()
shell_info("Going to extract pictures from events")
extract_inline_pictures_from_bodies(Event)
shell_info("Going to extract pictures from posts")
extract_inline_pictures_from_bodies(Post)
shell_info("Going to extract pictures from comments")
extract_inline_pictures_from_bodies(Comment)
end
defp extract_inline_pictures_from_bodies(entity) do
Repo.transaction(
fn ->
entity
|> Repo.stream()
|> Stream.map(&extract_pictures(&1))
|> Stream.map(fn {entity, pics} -> save_entity(entity, pics) end)
|> Stream.run()
end,
timeout: :infinity
)
end
defp extract_pictures(entity) do
extracted_pictures = entity |> get_body() |> parse_body() |> get_media_entities_from_urls()
attached_picture = entity |> get_picture() |> get_media_entity_from_media_id()
attached_pictures = [attached_picture] |> Enum.filter(& &1)
{entity, extracted_pictures ++ attached_pictures}
end
defp get_body(%Event{description: description}), do: description
defp get_body(%Post{body: body}), do: body
defp get_body(%Comment{text: text}), do: text
defp get_picture(%Event{picture_id: picture_id}), do: picture_id
defp get_picture(%Post{picture_id: picture_id}), do: picture_id
defp get_picture(%Comment{}), do: nil
defp parse_body(nil), do: []
defp parse_body(body) do
with res <- Regex.scan(~r/<img\s[^>]*?src\s*=\s*['\"]([^'\"]*?)['\"][^>]*?>/, body),
res <- Enum.map(res, fn [_, res] -> res end) do
res
end
end
defp get_media_entities_from_urls(media_urls) do
media_urls
|> Enum.map(fn media_url ->
# We prefer orphan media, but fallback on already attached media just in case
Medias.get_unattached_media_by_url(media_url) || Medias.get_media_by_url(media_url)
end)
|> Enum.filter(& &1)
end
defp get_media_entity_from_media_id(nil), do: nil
defp get_media_entity_from_media_id(media_id) do
Medias.get_media(media_id)
end
defp save_entity(%Event{} = _event, []), do: :ok
defp save_entity(%Event{} = event, media) do
event = Repo.preload(event, [:contacts, :media])
Events.update_event(event, %{media: media})
end
defp save_entity(%Post{} = _post, []), do: :ok
defp save_entity(%Post{} = post, media) do
post = Repo.preload(post, [:media])
Posts.update_post(post, %{media: media})
end
defp save_entity(%Comment{} = _comment, []), do: :ok
defp save_entity(%Comment{} = comment, media) do
comment = Repo.preload(comment, [:media])
Discussions.update_comment(comment, %{media: media})
end
end

View file

@ -64,7 +64,9 @@ defmodule Mix.Tasks.Mobilizon.Media.CleanOrphan do
end end
Enum.each(medias, fn media -> Enum.each(medias, fn media ->
shell_info("ID: #{media.id}, Actor: #{media.actor_id}, URL: #{media.file.url}") shell_info(
"ID: #{hd(media).id}, Actor: #{hd(media).actor_id}, Deduplicated #{length(media)} times, URL: #{hd(media).file.url}"
)
end) end)
end end

View file

@ -4,6 +4,7 @@ defmodule Mobilizon.Medias do
""" """
import Ecto.Query import Ecto.Query
import Mobilizon.Storage.CustomFunctions
alias Ecto.Multi alias Ecto.Multi
@ -40,23 +41,15 @@ defmodule Mobilizon.Medias do
end end
@doc """ @doc """
Get an unattached media by it's URL Get all media by an URL.
""" """
def get_unattached_media_by_url(url) do @spec get_all_media_by_url(String.t()) :: Media.t() | nil
def get_all_media_by_url(url) do
url url
|> String.split("?", parts: 2)
|> hd
|> media_by_url_query() |> media_by_url_query()
|> join(:left, [m], e in assoc(m, :events)) |> Repo.all()
|> join(:left, [m], ep in assoc(m, :event_picture))
|> join(:left, [m], p in assoc(m, :posts))
|> join(:left, [m], pp in assoc(m, :posts_picture))
|> join(:left, [m], c in assoc(m, :comments))
|> where([_m, e], is_nil(e.id))
|> where([_m, _e, ep], is_nil(ep.id))
|> where([_m, _e, _ep, p], is_nil(p.id))
|> where([_m, _e, _ep, _p, pp], is_nil(pp.id))
|> where([_m, _e, _ep, _p, _pp, c], is_nil(c.id))
|> limit(1)
|> Repo.one()
end end
@doc """ @doc """
@ -163,7 +156,7 @@ defmodule Mobilizon.Medias do
defp media_by_url_query(url) do defp media_by_url_query(url) do
from( from(
p in Media, p in Media,
where: fragment("? @> ?", p.file, ~s|{"url": "#{url}"}|) where: split_part(fragment("file->>'url'"), "?", 1) == ^url
) )
end end

View file

@ -0,0 +1,10 @@
defmodule Mobilizon.Storage.CustomFunctions do
@moduledoc """
Helper module for custom PostgreSQL functions
"""
defmacro split_part(string, delimiter, position) do
quote do
fragment("split_part(?, ?, ?)", unquote(string), unquote(delimiter), unquote(position))
end
end
end

View file

@ -5,6 +5,7 @@ defmodule Mobilizon.Service.CleanOrphanMedia do
alias Mobilizon.Actors.Actor alias Mobilizon.Actors.Actor
alias Mobilizon.Medias alias Mobilizon.Medias
alias Mobilizon.Medias.File
alias Mobilizon.Medias.Media alias Mobilizon.Medias.Media
alias Mobilizon.Storage.Repo alias Mobilizon.Storage.Repo
import Ecto.Query import Ecto.Query
@ -25,9 +26,11 @@ defmodule Mobilizon.Service.CleanOrphanMedia do
if Keyword.get(opts, :dry_run, false) do if Keyword.get(opts, :dry_run, false) do
{:ok, medias} {:ok, medias}
else else
Enum.each(medias, fn media -> Enum.each(medias, fn media_list ->
Enum.each(media_list, fn media ->
Medias.delete_media(media, ignore_file_not_found: true) Medias.delete_media(media, ignore_file_not_found: true)
end) end)
end)
{:ok, medias} {:ok, medias}
end end
@ -49,7 +52,7 @@ defmodule Mobilizon.Service.CleanOrphanMedia do
|> Enum.join(" UNION ") |> Enum.join(" UNION ")
|> (&"NOT EXISTS(#{&1})").() |> (&"NOT EXISTS(#{&1})").()
@spec find_media(Keyword.t()) :: list(Media.t()) @spec find_media(Keyword.t()) :: list(list(Media.t()))
defp find_media(opts) do defp find_media(opts) do
default_grace_period = default_grace_period =
Mobilizon.Config.get([:instance, :orphan_upload_grace_period_hours], 48) Mobilizon.Config.get([:instance, :orphan_upload_grace_period_hours], 48)
@ -68,6 +71,38 @@ defmodule Mobilizon.Service.CleanOrphanMedia do
where: fragment(@union_query) where: fragment(@union_query)
) )
Repo.all(query) query
|> Repo.all()
|> Enum.filter(fn %Media{file: %File{url: url}} ->
is_all_media_orphan?(url, expiration_date)
end)
|> Enum.chunk_by(fn %Media{file: %File{url: url}} ->
url
|> String.split("?", parts: 2)
|> hd
end)
end
def is_all_media_orphan?(url, expiration_date) do
url
|> Medias.get_all_media_by_url()
|> Enum.all?(&is_media_orphan?(&1, expiration_date))
end
@spec is_media_orphan?(Media.t(), DateTime.t()) :: boolean()
def is_media_orphan?(%Media{id: media_id}, expiration_date) do
query =
from(m in Media,
as: :media,
distinct: true,
join: a in Actor,
on: a.id == m.actor_id,
where: m.id == ^media_id,
where: is_nil(a.domain),
where: m.inserted_at < ^expiration_date,
where: fragment(@union_query)
)
Repo.exists?(query)
end end
end end

View file

@ -16,7 +16,7 @@ defmodule Mobilizon.Service.CleanOrphanMediaTest do
refute is_nil(Medias.get_media(media_id)) refute is_nil(Medias.get_media(media_id))
refute is_nil(Medias.get_media(media_2_id)) refute is_nil(Medias.get_media(media_2_id))
assert {:ok, [found_media]} = CleanOrphanMedia.clean() assert {:ok, [[found_media]]} = CleanOrphanMedia.clean()
assert found_media.id == media_id assert found_media.id == media_id
assert is_nil(Medias.get_media(media_id)) assert is_nil(Medias.get_media(media_id))
@ -31,7 +31,7 @@ defmodule Mobilizon.Service.CleanOrphanMediaTest do
refute is_nil(Medias.get_media(media_id)) refute is_nil(Medias.get_media(media_id))
refute is_nil(Medias.get_media(media_2_id)) refute is_nil(Medias.get_media(media_2_id))
assert {:ok, [found_media]} = CleanOrphanMedia.clean(dry_run: true) assert {:ok, [[found_media]]} = CleanOrphanMedia.clean(dry_run: true)
assert found_media.id == media_id assert found_media.id == media_id
refute is_nil(Medias.get_media(media_id)) refute is_nil(Medias.get_media(media_id))
@ -46,7 +46,7 @@ defmodule Mobilizon.Service.CleanOrphanMediaTest do
refute is_nil(Medias.get_media(media_id)) refute is_nil(Medias.get_media(media_id))
refute is_nil(Medias.get_media(media_2_id)) refute is_nil(Medias.get_media(media_2_id))
assert {:ok, [found_media]} = CleanOrphanMedia.clean(grace_period: 12) assert {:ok, [[found_media]]} = CleanOrphanMedia.clean(grace_period: 12)
assert found_media.id == media_id assert found_media.id == media_id
assert is_nil(Medias.get_media(media_id)) assert is_nil(Medias.get_media(media_id))

View file

@ -280,7 +280,7 @@ defmodule Mobilizon.Factory do
%Mobilizon.Medias.File{ %Mobilizon.Medias.File{
name: "My Media", name: "My Media",
url: url, url: url,
content_type: "image/png", content_type: "image/jpg",
size: 13_120 size: 13_120
} }
end end

View file

@ -43,21 +43,22 @@ defmodule Mix.Tasks.Mobilizon.Media.CleanOrphanTest do
test "with media returned" do test "with media returned" do
media1 = insert(:media) media1 = insert(:media)
media2 = insert(:media) media2 = insert(:media)
media3 = insert(:media, file: create_file())
with_mock CleanOrphanMedia, with_mock CleanOrphanMedia,
clean: fn [dry_run: true, grace_period: 48] -> {:ok, [media1, media2]} end do clean: fn [dry_run: true, grace_period: 48] -> {:ok, [[media1, media2], [media3]]} end do
CleanOrphan.run(["--dry-run"]) CleanOrphan.run(["--dry-run"])
assert_received {:mix_shell, :info, [output_received]} assert_received {:mix_shell, :info, [output_received]}
assert output_received == "List of files that would have been deleted" assert output_received == "List of files that would have been deleted"
assert_received {:mix_shell, :info, [output_received]} assert_received {:mix_shell, :info, [output_received]}
assert output_received == assert output_received ==
"ID: #{media1.id}, Actor: #{media1.actor_id}, URL: #{media1.file.url}" "ID: #{media1.id}, Actor: #{media1.actor_id}, Deduplicated 2 times, URL: #{media1.file.url}"
assert_received {:mix_shell, :info, [output_received]} assert_received {:mix_shell, :info, [output_received]}
assert output_received == assert output_received ==
"ID: #{media2.id}, Actor: #{media2.actor_id}, URL: #{media2.file.url}" "ID: #{media3.id}, Actor: #{media3.actor_id}, Deduplicated 1 times, URL: #{media3.file.url}"
assert_received {:mix_shell, :info, [output_received]} assert_received {:mix_shell, :info, [output_received]}
assert output_received == "2 files would have been deleted" assert output_received == "2 files would have been deleted"
@ -78,21 +79,22 @@ defmodule Mix.Tasks.Mobilizon.Media.CleanOrphanTest do
test "with media returned" do test "with media returned" do
media1 = insert(:media) media1 = insert(:media)
media2 = insert(:media) media2 = insert(:media)
media3 = insert(:media, file: create_file())
with_mock CleanOrphanMedia, with_mock CleanOrphanMedia,
clean: fn [dry_run: false, grace_period: 48] -> {:ok, [media1, media2]} end do clean: fn [dry_run: false, grace_period: 48] -> {:ok, [[media1, media2], [media3]]} end do
CleanOrphan.run(["--verbose"]) CleanOrphan.run(["--verbose"])
assert_received {:mix_shell, :info, [output_received]} assert_received {:mix_shell, :info, [output_received]}
assert output_received == "List of files that have been deleted" assert output_received == "List of files that have been deleted"
assert_received {:mix_shell, :info, [output_received]} assert_received {:mix_shell, :info, [output_received]}
assert output_received == assert output_received ==
"ID: #{media1.id}, Actor: #{media1.actor_id}, URL: #{media1.file.url}" "ID: #{media1.id}, Actor: #{media1.actor_id}, Deduplicated 2 times, URL: #{media1.file.url}"
assert_received {:mix_shell, :info, [output_received]} assert_received {:mix_shell, :info, [output_received]}
assert output_received == assert output_received ==
"ID: #{media2.id}, Actor: #{media2.actor_id}, URL: #{media2.file.url}" "ID: #{media3.id}, Actor: #{media3.actor_id}, Deduplicated 1 times, URL: #{media3.file.url}"
assert_received {:mix_shell, :info, [output_received]} assert_received {:mix_shell, :info, [output_received]}
assert output_received == "2 files have been deleted" assert output_received == "2 files have been deleted"
@ -121,4 +123,29 @@ defmodule Mix.Tasks.Mobilizon.Media.CleanOrphanTest do
end end
end end
end end
defp create_file do
File.cp!("test/fixtures/picture.png", "test/fixtures/picture_tmp.png")
file = %Plug.Upload{
content_type: "image/png",
path: Path.absname("test/fixtures/picture_tmp.png"),
filename: "image.png"
}
{:ok, data} = Mobilizon.Web.Upload.store(file)
%{
content_type: "image/png",
name: "image.png",
url: url
} = data
%Mobilizon.Medias.File{
name: "My Media",
url: url,
content_type: "image/png",
size: 13_120
}
end
end end