Fixed deduplicated files from orphan media being deleted as well

Happens when a file is uploaded, then orphaned, and a similar file is
used somewhere. The CleanMedia job service didn't consider that case

Signed-off-by: Thomas Citharel <tcit@tcit.fr>
This commit is contained in:
Thomas Citharel 2021-08-22 16:17:20 +02:00
parent 1ac2990cda
commit ab843dff4c
No known key found for this signature in database
GPG key ID: A061B9DDE0CA0773
9 changed files with 98 additions and 137 deletions

1
.gitignore vendored
View file

@ -31,6 +31,7 @@ priv/cert/
cover/
site/
test/fixtures/image_tmp.jpg
test/fixtures/picture_tmp.png
test/fixtures/DSCN0010_tmp.jpg
test/uploads/
uploads/*

View file

@ -1,107 +0,0 @@
defmodule Mix.Tasks.Mobilizon.Maintenance.FixUnattachedMediaInBody do
@moduledoc """
Task to reattach media files that were added in event, post or comment bodies without being attached to their entities.
This task should only be run once.
"""
use Mix.Task
import Mix.Tasks.Mobilizon.Common
alias Mobilizon.{Discussions, Events, Medias, Posts}
alias Mobilizon.Discussions.Comment
alias Mobilizon.Events.Event
alias Mobilizon.Posts.Post
alias Mobilizon.Storage.Repo
require Logger
@preferred_cli_env "prod"
# TODO: Remove me in Mobilizon 1.2
@shortdoc "Reattaches inline media from events and posts"
def run([]) do
start_mobilizon()
shell_info("Going to extract pictures from events")
extract_inline_pictures_from_bodies(Event)
shell_info("Going to extract pictures from posts")
extract_inline_pictures_from_bodies(Post)
shell_info("Going to extract pictures from comments")
extract_inline_pictures_from_bodies(Comment)
end
defp extract_inline_pictures_from_bodies(entity) do
Repo.transaction(
fn ->
entity
|> Repo.stream()
|> Stream.map(&extract_pictures(&1))
|> Stream.map(fn {entity, pics} -> save_entity(entity, pics) end)
|> Stream.run()
end,
timeout: :infinity
)
end
defp extract_pictures(entity) do
extracted_pictures = entity |> get_body() |> parse_body() |> get_media_entities_from_urls()
attached_picture = entity |> get_picture() |> get_media_entity_from_media_id()
attached_pictures = [attached_picture] |> Enum.filter(& &1)
{entity, extracted_pictures ++ attached_pictures}
end
defp get_body(%Event{description: description}), do: description
defp get_body(%Post{body: body}), do: body
defp get_body(%Comment{text: text}), do: text
defp get_picture(%Event{picture_id: picture_id}), do: picture_id
defp get_picture(%Post{picture_id: picture_id}), do: picture_id
defp get_picture(%Comment{}), do: nil
defp parse_body(nil), do: []
defp parse_body(body) do
with res <- Regex.scan(~r/<img\s[^>]*?src\s*=\s*['\"]([^'\"]*?)['\"][^>]*?>/, body),
res <- Enum.map(res, fn [_, res] -> res end) do
res
end
end
defp get_media_entities_from_urls(media_urls) do
media_urls
|> Enum.map(fn media_url ->
# We prefer orphan media, but fallback on already attached media just in case
Medias.get_unattached_media_by_url(media_url) || Medias.get_media_by_url(media_url)
end)
|> Enum.filter(& &1)
end
defp get_media_entity_from_media_id(nil), do: nil
defp get_media_entity_from_media_id(media_id) do
Medias.get_media(media_id)
end
defp save_entity(%Event{} = _event, []), do: :ok
defp save_entity(%Event{} = event, media) do
event = Repo.preload(event, [:contacts, :media])
Events.update_event(event, %{media: media})
end
defp save_entity(%Post{} = _post, []), do: :ok
defp save_entity(%Post{} = post, media) do
post = Repo.preload(post, [:media])
Posts.update_post(post, %{media: media})
end
defp save_entity(%Comment{} = _comment, []), do: :ok
defp save_entity(%Comment{} = comment, media) do
comment = Repo.preload(comment, [:media])
Discussions.update_comment(comment, %{media: media})
end
end

View file

@ -64,7 +64,9 @@ defmodule Mix.Tasks.Mobilizon.Media.CleanOrphan do
end
Enum.each(medias, fn media ->
shell_info("ID: #{media.id}, Actor: #{media.actor_id}, URL: #{media.file.url}")
shell_info(
"ID: #{hd(media).id}, Actor: #{hd(media).actor_id}, Deduplicated #{length(media)} times, URL: #{hd(media).file.url}"
)
end)
end

View file

@ -4,6 +4,7 @@ defmodule Mobilizon.Medias do
"""
import Ecto.Query
import Mobilizon.Storage.CustomFunctions
alias Ecto.Multi
@ -40,23 +41,15 @@ defmodule Mobilizon.Medias do
end
@doc """
Get an unattached media by it's URL
Get all media by an URL.
"""
def get_unattached_media_by_url(url) do
@spec get_all_media_by_url(String.t()) :: Media.t() | nil
def get_all_media_by_url(url) do
url
|> String.split("?", parts: 2)
|> hd
|> media_by_url_query()
|> join(:left, [m], e in assoc(m, :events))
|> join(:left, [m], ep in assoc(m, :event_picture))
|> join(:left, [m], p in assoc(m, :posts))
|> join(:left, [m], pp in assoc(m, :posts_picture))
|> join(:left, [m], c in assoc(m, :comments))
|> where([_m, e], is_nil(e.id))
|> where([_m, _e, ep], is_nil(ep.id))
|> where([_m, _e, _ep, p], is_nil(p.id))
|> where([_m, _e, _ep, _p, pp], is_nil(pp.id))
|> where([_m, _e, _ep, _p, _pp, c], is_nil(c.id))
|> limit(1)
|> Repo.one()
|> Repo.all()
end
@doc """
@ -163,7 +156,7 @@ defmodule Mobilizon.Medias do
defp media_by_url_query(url) do
from(
p in Media,
where: fragment("? @> ?", p.file, ~s|{"url": "#{url}"}|)
where: split_part(fragment("file->>'url'"), "?", 1) == ^url
)
end

View file

@ -0,0 +1,10 @@
defmodule Mobilizon.Storage.CustomFunctions do
@moduledoc """
Helper module for custom PostgreSQL functions
"""
defmacro split_part(string, delimiter, position) do
quote do
fragment("split_part(?, ?, ?)", unquote(string), unquote(delimiter), unquote(position))
end
end
end

View file

@ -5,6 +5,7 @@ defmodule Mobilizon.Service.CleanOrphanMedia do
alias Mobilizon.Actors.Actor
alias Mobilizon.Medias
alias Mobilizon.Medias.File
alias Mobilizon.Medias.Media
alias Mobilizon.Storage.Repo
import Ecto.Query
@ -25,8 +26,10 @@ defmodule Mobilizon.Service.CleanOrphanMedia do
if Keyword.get(opts, :dry_run, false) do
{:ok, medias}
else
Enum.each(medias, fn media ->
Medias.delete_media(media, ignore_file_not_found: true)
Enum.each(medias, fn media_list ->
Enum.each(media_list, fn media ->
Medias.delete_media(media, ignore_file_not_found: true)
end)
end)
{:ok, medias}
@ -49,7 +52,7 @@ defmodule Mobilizon.Service.CleanOrphanMedia do
|> Enum.join(" UNION ")
|> (&"NOT EXISTS(#{&1})").()
@spec find_media(Keyword.t()) :: list(Media.t())
@spec find_media(Keyword.t()) :: list(list(Media.t()))
defp find_media(opts) do
default_grace_period =
Mobilizon.Config.get([:instance, :orphan_upload_grace_period_hours], 48)
@ -68,6 +71,38 @@ defmodule Mobilizon.Service.CleanOrphanMedia do
where: fragment(@union_query)
)
Repo.all(query)
query
|> Repo.all()
|> Enum.filter(fn %Media{file: %File{url: url}} ->
is_all_media_orphan?(url, expiration_date)
end)
|> Enum.chunk_by(fn %Media{file: %File{url: url}} ->
url
|> String.split("?", parts: 2)
|> hd
end)
end
def is_all_media_orphan?(url, expiration_date) do
url
|> Medias.get_all_media_by_url()
|> Enum.all?(&is_media_orphan?(&1, expiration_date))
end
@spec is_media_orphan?(Media.t(), DateTime.t()) :: boolean()
def is_media_orphan?(%Media{id: media_id}, expiration_date) do
query =
from(m in Media,
as: :media,
distinct: true,
join: a in Actor,
on: a.id == m.actor_id,
where: m.id == ^media_id,
where: is_nil(a.domain),
where: m.inserted_at < ^expiration_date,
where: fragment(@union_query)
)
Repo.exists?(query)
end
end

View file

@ -16,7 +16,7 @@ defmodule Mobilizon.Service.CleanOrphanMediaTest do
refute is_nil(Medias.get_media(media_id))
refute is_nil(Medias.get_media(media_2_id))
assert {:ok, [found_media]} = CleanOrphanMedia.clean()
assert {:ok, [[found_media]]} = CleanOrphanMedia.clean()
assert found_media.id == media_id
assert is_nil(Medias.get_media(media_id))
@ -31,7 +31,7 @@ defmodule Mobilizon.Service.CleanOrphanMediaTest do
refute is_nil(Medias.get_media(media_id))
refute is_nil(Medias.get_media(media_2_id))
assert {:ok, [found_media]} = CleanOrphanMedia.clean(dry_run: true)
assert {:ok, [[found_media]]} = CleanOrphanMedia.clean(dry_run: true)
assert found_media.id == media_id
refute is_nil(Medias.get_media(media_id))
@ -46,7 +46,7 @@ defmodule Mobilizon.Service.CleanOrphanMediaTest do
refute is_nil(Medias.get_media(media_id))
refute is_nil(Medias.get_media(media_2_id))
assert {:ok, [found_media]} = CleanOrphanMedia.clean(grace_period: 12)
assert {:ok, [[found_media]]} = CleanOrphanMedia.clean(grace_period: 12)
assert found_media.id == media_id
assert is_nil(Medias.get_media(media_id))

View file

@ -280,7 +280,7 @@ defmodule Mobilizon.Factory do
%Mobilizon.Medias.File{
name: "My Media",
url: url,
content_type: "image/png",
content_type: "image/jpg",
size: 13_120
}
end

View file

@ -43,21 +43,22 @@ defmodule Mix.Tasks.Mobilizon.Media.CleanOrphanTest do
test "with media returned" do
media1 = insert(:media)
media2 = insert(:media)
media3 = insert(:media, file: create_file())
with_mock CleanOrphanMedia,
clean: fn [dry_run: true, grace_period: 48] -> {:ok, [media1, media2]} end do
clean: fn [dry_run: true, grace_period: 48] -> {:ok, [[media1, media2], [media3]]} end do
CleanOrphan.run(["--dry-run"])
assert_received {:mix_shell, :info, [output_received]}
assert output_received == "List of files that would have been deleted"
assert_received {:mix_shell, :info, [output_received]}
assert output_received ==
"ID: #{media1.id}, Actor: #{media1.actor_id}, URL: #{media1.file.url}"
"ID: #{media1.id}, Actor: #{media1.actor_id}, Deduplicated 2 times, URL: #{media1.file.url}"
assert_received {:mix_shell, :info, [output_received]}
assert output_received ==
"ID: #{media2.id}, Actor: #{media2.actor_id}, URL: #{media2.file.url}"
"ID: #{media3.id}, Actor: #{media3.actor_id}, Deduplicated 1 times, URL: #{media3.file.url}"
assert_received {:mix_shell, :info, [output_received]}
assert output_received == "2 files would have been deleted"
@ -78,21 +79,22 @@ defmodule Mix.Tasks.Mobilizon.Media.CleanOrphanTest do
test "with media returned" do
media1 = insert(:media)
media2 = insert(:media)
media3 = insert(:media, file: create_file())
with_mock CleanOrphanMedia,
clean: fn [dry_run: false, grace_period: 48] -> {:ok, [media1, media2]} end do
clean: fn [dry_run: false, grace_period: 48] -> {:ok, [[media1, media2], [media3]]} end do
CleanOrphan.run(["--verbose"])
assert_received {:mix_shell, :info, [output_received]}
assert output_received == "List of files that have been deleted"
assert_received {:mix_shell, :info, [output_received]}
assert output_received ==
"ID: #{media1.id}, Actor: #{media1.actor_id}, URL: #{media1.file.url}"
"ID: #{media1.id}, Actor: #{media1.actor_id}, Deduplicated 2 times, URL: #{media1.file.url}"
assert_received {:mix_shell, :info, [output_received]}
assert output_received ==
"ID: #{media2.id}, Actor: #{media2.actor_id}, URL: #{media2.file.url}"
"ID: #{media3.id}, Actor: #{media3.actor_id}, Deduplicated 1 times, URL: #{media3.file.url}"
assert_received {:mix_shell, :info, [output_received]}
assert output_received == "2 files have been deleted"
@ -121,4 +123,29 @@ defmodule Mix.Tasks.Mobilizon.Media.CleanOrphanTest do
end
end
end
defp create_file do
File.cp!("test/fixtures/picture.png", "test/fixtures/picture_tmp.png")
file = %Plug.Upload{
content_type: "image/png",
path: Path.absname("test/fixtures/picture_tmp.png"),
filename: "image.png"
}
{:ok, data} = Mobilizon.Web.Upload.store(file)
%{
content_type: "image/png",
name: "image.png",
url: url
} = data
%Mobilizon.Medias.File{
name: "My Media",
url: url,
content_type: "image/png",
size: 13_120
}
end
end