From ab843dff4cf22ed28071b4fea1a1edbe843215e0 Mon Sep 17 00:00:00 2001
From: Thomas Citharel <tcit@tcit.fr>
Date: Sun, 22 Aug 2021 16:17:20 +0200
Subject: [PATCH] Fixed deduplicated files from orphan media being deleted as
 well

Happens when a file is uploaded, then orphaned, and a similar file is
used somewhere. The CleanMedia job service didn't consider that case

Signed-off-by: Thomas Citharel <tcit@tcit.fr>
---
 .gitignore                                    |   1 +
 .../fix_unattached_media_in_body.ex           | 107 ------------------
 lib/mix/tasks/mobilizon/media/clean_orphan.ex |   4 +-
 lib/mobilizon/medias/medias.ex                |  23 ++--
 lib/mobilizon/storage/custom_functions.ex     |  10 ++
 lib/service/clean_orphan_media.ex             |  43 ++++++-
 test/service/clean_orphan_media_test.exs      |   6 +-
 test/support/factory.ex                       |   2 +-
 test/tasks/media/clean_orphan_test.exs        |  39 ++++++-
 9 files changed, 98 insertions(+), 137 deletions(-)
 delete mode 100644 lib/mix/tasks/mobilizon/maintenance/fix_unattached_media_in_body.ex
 create mode 100644 lib/mobilizon/storage/custom_functions.ex

diff --git a/.gitignore b/.gitignore
index b4b15332c..5b7ebafb6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,6 +31,7 @@ priv/cert/
 cover/
 site/
 test/fixtures/image_tmp.jpg
+test/fixtures/picture_tmp.png
 test/fixtures/DSCN0010_tmp.jpg
 test/uploads/
 uploads/*
diff --git a/lib/mix/tasks/mobilizon/maintenance/fix_unattached_media_in_body.ex b/lib/mix/tasks/mobilizon/maintenance/fix_unattached_media_in_body.ex
deleted file mode 100644
index d1934bad7..000000000
--- a/lib/mix/tasks/mobilizon/maintenance/fix_unattached_media_in_body.ex
+++ /dev/null
@@ -1,107 +0,0 @@
-defmodule Mix.Tasks.Mobilizon.Maintenance.FixUnattachedMediaInBody do
-  @moduledoc """
-  Task to reattach media files that were added in event, post or comment bodies without being attached to their entities.
-
-  This task should only be run once.
-  """
-  use Mix.Task
-
-  import Mix.Tasks.Mobilizon.Common
-  alias Mobilizon.{Discussions, Events, Medias, Posts}
-  alias Mobilizon.Discussions.Comment
-  alias Mobilizon.Events.Event
-  alias Mobilizon.Posts.Post
-  alias Mobilizon.Storage.Repo
-  require Logger
-
-  @preferred_cli_env "prod"
-
-  # TODO: Remove me in Mobilizon 1.2
-
-  @shortdoc "Reattaches inline media from events and posts"
-  def run([]) do
-    start_mobilizon()
-
-    shell_info("Going to extract pictures from events")
-    extract_inline_pictures_from_bodies(Event)
-    shell_info("Going to extract pictures from posts")
-    extract_inline_pictures_from_bodies(Post)
-    shell_info("Going to extract pictures from comments")
-    extract_inline_pictures_from_bodies(Comment)
-  end
-
-  defp extract_inline_pictures_from_bodies(entity) do
-    Repo.transaction(
-      fn ->
-        entity
-        |> Repo.stream()
-        |> Stream.map(&extract_pictures(&1))
-        |> Stream.map(fn {entity, pics} -> save_entity(entity, pics) end)
-        |> Stream.run()
-      end,
-      timeout: :infinity
-    )
-  end
-
-  defp extract_pictures(entity) do
-    extracted_pictures = entity |> get_body() |> parse_body() |> get_media_entities_from_urls()
-
-    attached_picture = entity |> get_picture() |> get_media_entity_from_media_id()
-    attached_pictures = [attached_picture] |> Enum.filter(& &1)
-
-    {entity, extracted_pictures ++ attached_pictures}
-  end
-
-  defp get_body(%Event{description: description}), do: description
-  defp get_body(%Post{body: body}), do: body
-  defp get_body(%Comment{text: text}), do: text
-
-  defp get_picture(%Event{picture_id: picture_id}), do: picture_id
-  defp get_picture(%Post{picture_id: picture_id}), do: picture_id
-  defp get_picture(%Comment{}), do: nil
-
-  defp parse_body(nil), do: []
-
-  defp parse_body(body) do
-    with res <- Regex.scan(~r/<img\s[^>]*?src\s*=\s*['\"]([^'\"]*?)['\"][^>]*?>/, body),
-         res <- Enum.map(res, fn [_, res] -> res end) do
-      res
-    end
-  end
-
-  defp get_media_entities_from_urls(media_urls) do
-    media_urls
-    |> Enum.map(fn media_url ->
-      # We prefer orphan media, but fallback on already attached media just in case
-      Medias.get_unattached_media_by_url(media_url) || Medias.get_media_by_url(media_url)
-    end)
-    |> Enum.filter(& &1)
-  end
-
-  defp get_media_entity_from_media_id(nil), do: nil
-
-  defp get_media_entity_from_media_id(media_id) do
-    Medias.get_media(media_id)
-  end
-
-  defp save_entity(%Event{} = _event, []), do: :ok
-
-  defp save_entity(%Event{} = event, media) do
-    event = Repo.preload(event, [:contacts, :media])
-    Events.update_event(event, %{media: media})
-  end
-
-  defp save_entity(%Post{} = _post, []), do: :ok
-
-  defp save_entity(%Post{} = post, media) do
-    post = Repo.preload(post, [:media])
-    Posts.update_post(post, %{media: media})
-  end
-
-  defp save_entity(%Comment{} = _comment, []), do: :ok
-
-  defp save_entity(%Comment{} = comment, media) do
-    comment = Repo.preload(comment, [:media])
-    Discussions.update_comment(comment, %{media: media})
-  end
-end
diff --git a/lib/mix/tasks/mobilizon/media/clean_orphan.ex b/lib/mix/tasks/mobilizon/media/clean_orphan.ex
index 7b3ee34e5..09ba6a37f 100644
--- a/lib/mix/tasks/mobilizon/media/clean_orphan.ex
+++ b/lib/mix/tasks/mobilizon/media/clean_orphan.ex
@@ -64,7 +64,9 @@ defmodule Mix.Tasks.Mobilizon.Media.CleanOrphan do
     end
 
     Enum.each(medias, fn media ->
-      shell_info("ID: #{media.id}, Actor: #{media.actor_id}, URL: #{media.file.url}")
+      shell_info(
+        "ID: #{hd(media).id}, Actor: #{hd(media).actor_id}, Deduplicated #{length(media)} times, URL: #{hd(media).file.url}"
+      )
     end)
   end
 
diff --git a/lib/mobilizon/medias/medias.ex b/lib/mobilizon/medias/medias.ex
index 565b828c7..7b4362621 100644
--- a/lib/mobilizon/medias/medias.ex
+++ b/lib/mobilizon/medias/medias.ex
@@ -4,6 +4,7 @@ defmodule Mobilizon.Medias do
   """
 
   import Ecto.Query
+  import Mobilizon.Storage.CustomFunctions
 
   alias Ecto.Multi
 
@@ -40,23 +41,15 @@ defmodule Mobilizon.Medias do
   end
 
   @doc """
-  Get an unattached media by it's URL
+  Get all media by an URL.
   """
-  def get_unattached_media_by_url(url) do
+  @spec get_all_media_by_url(String.t()) :: Media.t() | nil
+  def get_all_media_by_url(url) do
     url
+    |> String.split("?", parts: 2)
+    |> hd
     |> media_by_url_query()
-    |> join(:left, [m], e in assoc(m, :events))
-    |> join(:left, [m], ep in assoc(m, :event_picture))
-    |> join(:left, [m], p in assoc(m, :posts))
-    |> join(:left, [m], pp in assoc(m, :posts_picture))
-    |> join(:left, [m], c in assoc(m, :comments))
-    |> where([_m, e], is_nil(e.id))
-    |> where([_m, _e, ep], is_nil(ep.id))
-    |> where([_m, _e, _ep, p], is_nil(p.id))
-    |> where([_m, _e, _ep, _p, pp], is_nil(pp.id))
-    |> where([_m, _e, _ep, _p, _pp, c], is_nil(c.id))
-    |> limit(1)
-    |> Repo.one()
+    |> Repo.all()
   end
 
   @doc """
@@ -163,7 +156,7 @@ defmodule Mobilizon.Medias do
   defp media_by_url_query(url) do
     from(
       p in Media,
-      where: fragment("? @> ?", p.file, ~s|{"url": "#{url}"}|)
+      where: split_part(fragment("file->>'url'"), "?", 1) == ^url
     )
   end
 
diff --git a/lib/mobilizon/storage/custom_functions.ex b/lib/mobilizon/storage/custom_functions.ex
new file mode 100644
index 000000000..eab327234
--- /dev/null
+++ b/lib/mobilizon/storage/custom_functions.ex
@@ -0,0 +1,10 @@
+defmodule Mobilizon.Storage.CustomFunctions do
+  @moduledoc """
+  Helper module for custom PostgreSQL functions
+  """
+  defmacro split_part(string, delimiter, position) do
+    quote do
+      fragment("split_part(?, ?, ?)", unquote(string), unquote(delimiter), unquote(position))
+    end
+  end
+end
diff --git a/lib/service/clean_orphan_media.ex b/lib/service/clean_orphan_media.ex
index 66703d30c..27c674aa5 100644
--- a/lib/service/clean_orphan_media.ex
+++ b/lib/service/clean_orphan_media.ex
@@ -5,6 +5,7 @@ defmodule Mobilizon.Service.CleanOrphanMedia do
 
   alias Mobilizon.Actors.Actor
   alias Mobilizon.Medias
+  alias Mobilizon.Medias.File
   alias Mobilizon.Medias.Media
   alias Mobilizon.Storage.Repo
   import Ecto.Query
@@ -25,8 +26,10 @@ defmodule Mobilizon.Service.CleanOrphanMedia do
     if Keyword.get(opts, :dry_run, false) do
       {:ok, medias}
     else
-      Enum.each(medias, fn media ->
-        Medias.delete_media(media, ignore_file_not_found: true)
+      Enum.each(medias, fn media_list ->
+        Enum.each(media_list, fn media ->
+          Medias.delete_media(media, ignore_file_not_found: true)
+        end)
       end)
 
       {:ok, medias}
@@ -49,7 +52,7 @@ defmodule Mobilizon.Service.CleanOrphanMedia do
                |> Enum.join(" UNION ")
                |> (&"NOT EXISTS(#{&1})").()
 
-  @spec find_media(Keyword.t()) :: list(Media.t())
+  @spec find_media(Keyword.t()) :: list(list(Media.t()))
   defp find_media(opts) do
     default_grace_period =
       Mobilizon.Config.get([:instance, :orphan_upload_grace_period_hours], 48)
@@ -68,6 +71,38 @@ defmodule Mobilizon.Service.CleanOrphanMedia do
         where: fragment(@union_query)
       )
 
-    Repo.all(query)
+    query
+    |> Repo.all()
+    |> Enum.filter(fn %Media{file: %File{url: url}} ->
+      is_all_media_orphan?(url, expiration_date)
+    end)
+    |> Enum.chunk_by(fn %Media{file: %File{url: url}} ->
+      url
+      |> String.split("?", parts: 2)
+      |> hd
+    end)
+  end
+
+  def is_all_media_orphan?(url, expiration_date) do
+    url
+    |> Medias.get_all_media_by_url()
+    |> Enum.all?(&is_media_orphan?(&1, expiration_date))
+  end
+
+  @spec is_media_orphan?(Media.t(), DateTime.t()) :: boolean()
+  def is_media_orphan?(%Media{id: media_id}, expiration_date) do
+    query =
+      from(m in Media,
+        as: :media,
+        distinct: true,
+        join: a in Actor,
+        on: a.id == m.actor_id,
+        where: m.id == ^media_id,
+        where: is_nil(a.domain),
+        where: m.inserted_at < ^expiration_date,
+        where: fragment(@union_query)
+      )
+
+    Repo.exists?(query)
   end
 end
diff --git a/test/service/clean_orphan_media_test.exs b/test/service/clean_orphan_media_test.exs
index 21c1c2264..f0ca3197d 100644
--- a/test/service/clean_orphan_media_test.exs
+++ b/test/service/clean_orphan_media_test.exs
@@ -16,7 +16,7 @@ defmodule Mobilizon.Service.CleanOrphanMediaTest do
       refute is_nil(Medias.get_media(media_id))
       refute is_nil(Medias.get_media(media_2_id))
 
-      assert {:ok, [found_media]} = CleanOrphanMedia.clean()
+      assert {:ok, [[found_media]]} = CleanOrphanMedia.clean()
       assert found_media.id == media_id
 
       assert is_nil(Medias.get_media(media_id))
@@ -31,7 +31,7 @@ defmodule Mobilizon.Service.CleanOrphanMediaTest do
       refute is_nil(Medias.get_media(media_id))
       refute is_nil(Medias.get_media(media_2_id))
 
-      assert {:ok, [found_media]} = CleanOrphanMedia.clean(dry_run: true)
+      assert {:ok, [[found_media]]} = CleanOrphanMedia.clean(dry_run: true)
       assert found_media.id == media_id
 
       refute is_nil(Medias.get_media(media_id))
@@ -46,7 +46,7 @@ defmodule Mobilizon.Service.CleanOrphanMediaTest do
       refute is_nil(Medias.get_media(media_id))
       refute is_nil(Medias.get_media(media_2_id))
 
-      assert {:ok, [found_media]} = CleanOrphanMedia.clean(grace_period: 12)
+      assert {:ok, [[found_media]]} = CleanOrphanMedia.clean(grace_period: 12)
       assert found_media.id == media_id
 
       assert is_nil(Medias.get_media(media_id))
diff --git a/test/support/factory.ex b/test/support/factory.ex
index db1a40967..953ac8c0b 100644
--- a/test/support/factory.ex
+++ b/test/support/factory.ex
@@ -280,7 +280,7 @@ defmodule Mobilizon.Factory do
     %Mobilizon.Medias.File{
       name: "My Media",
       url: url,
-      content_type: "image/png",
+      content_type: "image/jpg",
       size: 13_120
     }
   end
diff --git a/test/tasks/media/clean_orphan_test.exs b/test/tasks/media/clean_orphan_test.exs
index 0aeb7e00b..83440dcf2 100644
--- a/test/tasks/media/clean_orphan_test.exs
+++ b/test/tasks/media/clean_orphan_test.exs
@@ -43,21 +43,22 @@ defmodule Mix.Tasks.Mobilizon.Media.CleanOrphanTest do
     test "with media returned" do
       media1 = insert(:media)
       media2 = insert(:media)
+      media3 = insert(:media, file: create_file())
 
       with_mock CleanOrphanMedia,
-        clean: fn [dry_run: true, grace_period: 48] -> {:ok, [media1, media2]} end do
+        clean: fn [dry_run: true, grace_period: 48] -> {:ok, [[media1, media2], [media3]]} end do
         CleanOrphan.run(["--dry-run"])
         assert_received {:mix_shell, :info, [output_received]}
         assert output_received == "List of files that would have been deleted"
         assert_received {:mix_shell, :info, [output_received]}
 
         assert output_received ==
-                 "ID: #{media1.id}, Actor: #{media1.actor_id}, URL: #{media1.file.url}"
+                 "ID: #{media1.id}, Actor: #{media1.actor_id}, Deduplicated 2 times, URL: #{media1.file.url}"
 
         assert_received {:mix_shell, :info, [output_received]}
 
         assert output_received ==
-                 "ID: #{media2.id}, Actor: #{media2.actor_id}, URL: #{media2.file.url}"
+                 "ID: #{media3.id}, Actor: #{media3.actor_id}, Deduplicated 1 times, URL: #{media3.file.url}"
 
         assert_received {:mix_shell, :info, [output_received]}
         assert output_received == "2 files would have been deleted"
@@ -78,21 +79,22 @@ defmodule Mix.Tasks.Mobilizon.Media.CleanOrphanTest do
     test "with media returned" do
       media1 = insert(:media)
       media2 = insert(:media)
+      media3 = insert(:media, file: create_file())
 
       with_mock CleanOrphanMedia,
-        clean: fn [dry_run: false, grace_period: 48] -> {:ok, [media1, media2]} end do
+        clean: fn [dry_run: false, grace_period: 48] -> {:ok, [[media1, media2], [media3]]} end do
         CleanOrphan.run(["--verbose"])
         assert_received {:mix_shell, :info, [output_received]}
         assert output_received == "List of files that have been deleted"
         assert_received {:mix_shell, :info, [output_received]}
 
         assert output_received ==
-                 "ID: #{media1.id}, Actor: #{media1.actor_id}, URL: #{media1.file.url}"
+                 "ID: #{media1.id}, Actor: #{media1.actor_id}, Deduplicated 2 times, URL: #{media1.file.url}"
 
         assert_received {:mix_shell, :info, [output_received]}
 
         assert output_received ==
-                 "ID: #{media2.id}, Actor: #{media2.actor_id}, URL: #{media2.file.url}"
+                 "ID: #{media3.id}, Actor: #{media3.actor_id}, Deduplicated 1 times, URL: #{media3.file.url}"
 
         assert_received {:mix_shell, :info, [output_received]}
         assert output_received == "2 files have been deleted"
@@ -121,4 +123,29 @@ defmodule Mix.Tasks.Mobilizon.Media.CleanOrphanTest do
       end
     end
   end
+
+  defp create_file do
+    File.cp!("test/fixtures/picture.png", "test/fixtures/picture_tmp.png")
+
+    file = %Plug.Upload{
+      content_type: "image/png",
+      path: Path.absname("test/fixtures/picture_tmp.png"),
+      filename: "image.png"
+    }
+
+    {:ok, data} = Mobilizon.Web.Upload.store(file)
+
+    %{
+      content_type: "image/png",
+      name: "image.png",
+      url: url
+    } = data
+
+    %Mobilizon.Medias.File{
+      name: "My Media",
+      url: url,
+      content_type: "image/png",
+      size: 13_120
+    }
+  end
 end