Prevent local quarantined media from being claimed by media retention (#12972)

This commit is contained in:
Andrew Morgan 2022-06-07 11:53:47 +01:00 committed by GitHub
parent f7baffd8ec
commit a47636c570
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 185 additions and 29 deletions

View file

@ -0,0 +1 @@
Add new `media_retention` options to the homeserver config for routinely cleaning up non-recently accessed media.

View file

@ -1583,6 +1583,12 @@ been accessed, the media's creation time is used instead. Both thumbnails
and the original media will be removed. If either of these options is unset,
then media of that type will not be purged.
Local or cached remote media that has been
[quarantined](../../admin_api/media_admin_api.md#quarantining-media-in-a-room)
will not be deleted. Similarly, local media that has been marked as
[protected from quarantine](../../admin_api/media_admin_api.md#protecting-media-from-being-quarantined)
will not be deleted.
Example configuration:
```yaml
media_retention:

View file

@ -83,7 +83,7 @@ class QuarantineMediaByUser(RestServlet):
requester = await self.auth.get_user_by_req(request)
await assert_user_is_admin(self.auth, requester.user)
logging.info("Quarantining local media by user: %s", user_id)
logging.info("Quarantining media by user: %s", user_id)
# Quarantine all media this user has uploaded
num_quarantined = await self.store.quarantine_media_ids_by_user(
@ -112,7 +112,7 @@ class QuarantineMediaByID(RestServlet):
requester = await self.auth.get_user_by_req(request)
await assert_user_is_admin(self.auth, requester.user)
logging.info("Quarantining local media by ID: %s/%s", server_name, media_id)
logging.info("Quarantining media by ID: %s/%s", server_name, media_id)
# Quarantine this media id
await self.store.quarantine_media_by_id(
@ -140,9 +140,7 @@ class UnquarantineMediaByID(RestServlet):
) -> Tuple[int, JsonDict]:
await assert_requester_is_admin(self.auth, request)
logging.info(
"Remove from quarantine local media by ID: %s/%s", server_name, media_id
)
logging.info("Remove from quarantine media by ID: %s/%s", server_name, media_id)
# Remove from quarantine this media id
await self.store.quarantine_media_by_id(server_name, media_id, None)

View file

@ -919,10 +919,14 @@ class MediaRepository:
await self.delete_old_local_media(
before_ts=local_media_threshold_timestamp_ms,
keep_profiles=True,
delete_quarantined_media=False,
delete_protected_media=False,
)
async def delete_old_remote_media(self, before_ts: int) -> Dict[str, int]:
old_media = await self.store.get_remote_media_before(before_ts)
old_media = await self.store.get_remote_media_ids(
before_ts, include_quarantined_media=False
)
deleted = 0
@ -975,6 +979,8 @@ class MediaRepository:
before_ts: int,
size_gt: int = 0,
keep_profiles: bool = True,
delete_quarantined_media: bool = False,
delete_protected_media: bool = False,
) -> Tuple[List[str], int]:
"""
Delete local or remote media from this server by size and timestamp. Removes
@ -982,18 +988,22 @@ class MediaRepository:
Args:
before_ts: Unix timestamp in ms.
Files that were last used before this timestamp will be deleted
size_gt: Size of the media in bytes. Files that are larger will be deleted
Files that were last used before this timestamp will be deleted.
size_gt: Size of the media in bytes. Files that are larger will be deleted.
keep_profiles: Switch to delete also files that are still used in image data
(e.g user profile, room avatar)
If false these files will be deleted
(e.g user profile, room avatar). If false these files will be deleted.
delete_quarantined_media: If True, media marked as quarantined will be deleted.
delete_protected_media: If True, media marked as protected will be deleted.
Returns:
A tuple of (list of deleted media IDs, total deleted media IDs).
"""
old_media = await self.store.get_local_media_before(
old_media = await self.store.get_local_media_ids(
before_ts,
size_gt,
keep_profiles,
include_quarantined_media=delete_quarantined_media,
include_protected_media=delete_protected_media,
)
return await self._remove_local_media_from_disk(old_media)

View file

@ -251,12 +251,36 @@ class MediaRepositoryStore(MediaRepositoryBackgroundUpdateStore):
"get_local_media_by_user_paginate_txn", get_local_media_by_user_paginate_txn
)
async def get_local_media_before(
async def get_local_media_ids(
self,
before_ts: int,
size_gt: int,
keep_profiles: bool,
include_quarantined_media: bool,
include_protected_media: bool,
) -> List[str]:
"""
Retrieve a list of media IDs from the local media store.
Args:
before_ts: Only retrieve IDs from media that was either last accessed
(or if never accessed, created) before the given UNIX timestamp in ms.
size_gt: Only retrieve IDs from media that has a size (in bytes) greater than
the given integer.
keep_profiles: If True, exclude media IDs from the results that are used in the
following situations:
* global profile user avatar
* per-room profile user avatar
* room avatar
* a user's avatar in the user directory
include_quarantined_media: If False, exclude media IDs from the results that have
been marked as quarantined.
include_protected_media: If False, exclude media IDs from the results that have
been marked as protected from quarantine.
Returns:
A list of local media IDs.
"""
# to find files that have never been accessed (last_access_ts IS NULL)
# compare with `created_ts`
@ -294,12 +318,24 @@ class MediaRepositoryStore(MediaRepositoryBackgroundUpdateStore):
)
sql += sql_keep
def _get_local_media_before_txn(txn: LoggingTransaction) -> List[str]:
if include_quarantined_media is False:
# Do not include media that has been quarantined
sql += """
AND quarantined_by IS NULL
"""
if include_protected_media is False:
# Do not include media that has been protected from quarantine
sql += """
AND safe_from_quarantine = false
"""
def _get_local_media_ids_txn(txn: LoggingTransaction) -> List[str]:
txn.execute(sql, (before_ts, before_ts, size_gt))
return [row[0] for row in txn]
return await self.db_pool.runInteraction(
"get_local_media_before", _get_local_media_before_txn
"get_local_media_ids", _get_local_media_ids_txn
)
async def store_local_media(
@ -599,15 +635,37 @@ class MediaRepositoryStore(MediaRepositoryBackgroundUpdateStore):
desc="store_remote_media_thumbnail",
)
async def get_remote_media_before(self, before_ts: int) -> List[Dict[str, str]]:
async def get_remote_media_ids(
self, before_ts: int, include_quarantined_media: bool
) -> List[Dict[str, str]]:
"""
Retrieve the origin server name, media ID and filesystem ID of media in the remote media cache.
Args:
before_ts: Only retrieve IDs from media that was either last accessed
(or if never accessed, created) before the given UNIX timestamp in ms.
include_quarantined_media: If False, exclude media IDs from the results that have
been marked as quarantined.
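Returns:
A list of dicts, one per matching entry in the remote media cache, with the keys:
* `media_origin`: The server name of homeserver where the media originates from,
* `media_id`: The ID of the media,
* `filesystem_id`: The filesystem ID of the media.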
"""
sql = (
"SELECT media_origin, media_id, filesystem_id"
" FROM remote_media_cache"
" WHERE last_access_ts < ?"
)
if include_quarantined_media is False:
# Only include media that has not been quarantined
sql += """
AND quarantined_by IS NULL
"""
return await self.db_pool.execute(
"get_remote_media_before", self.db_pool.cursor_to_dict, sql, before_ts
"get_remote_media_ids", self.db_pool.cursor_to_dict, sql, before_ts
)
async def delete_remote_media(self, media_origin: str, media_id: str) -> None:

View file

@ -53,13 +53,16 @@ class MediaRetentionTestCase(unittest.HomeserverTestCase):
# Create a user to upload media with
test_user_id = self.register_user("alice", "password")
# Inject media (3 images each; recently accessed, old access, never accessed)
# into both the local store and the remote cache
# Inject media (recently accessed, old access, never accessed, old access
# quarantined media) into both the local store and the remote cache, plus
# one additional local media that is marked as protected from quarantine.
media_repository = hs.get_media_repository()
test_media_content = b"example string"
def _create_media_and_set_last_accessed(
def _create_media_and_set_attributes(
last_accessed_ms: Optional[int],
is_quarantined: Optional[bool] = False,
is_protected: Optional[bool] = False,
) -> str:
# "Upload" some media to the local media store
mxc_uri = self.get_success(
@ -84,10 +87,31 @@ class MediaRetentionTestCase(unittest.HomeserverTestCase):
)
)
if is_quarantined:
# Mark this media as quarantined
self.get_success(
self.store.quarantine_media_by_id(
server_name=self.hs.config.server.server_name,
media_id=media_id,
quarantined_by="@theadmin:test",
)
)
if is_protected:
# Mark this media as protected from quarantine
self.get_success(
self.store.mark_local_media_as_safe(
media_id=media_id,
safe=True,
)
)
return media_id
def _cache_remote_media_and_set_last_accessed(
media_id: str, last_accessed_ms: Optional[int]
def _cache_remote_media_and_set_attributes(
media_id: str,
last_accessed_ms: Optional[int],
is_quarantined: Optional[bool] = False,
) -> str:
# Pretend to cache some remote media
self.get_success(
@ -112,23 +136,58 @@ class MediaRetentionTestCase(unittest.HomeserverTestCase):
)
)
if is_quarantined:
# Mark this media as quarantined
self.get_success(
self.store.quarantine_media_by_id(
server_name=self.remote_server_name,
media_id=media_id,
quarantined_by="@theadmin:test",
)
)
return media_id
# Start with the local media store
self.local_recently_accessed_media = _create_media_and_set_last_accessed(
self.THIRTY_DAYS_IN_MS
self.local_recently_accessed_media = _create_media_and_set_attributes(
last_accessed_ms=self.THIRTY_DAYS_IN_MS,
)
self.local_not_recently_accessed_media = _create_media_and_set_last_accessed(
self.ONE_DAY_IN_MS
self.local_not_recently_accessed_media = _create_media_and_set_attributes(
last_accessed_ms=self.ONE_DAY_IN_MS,
)
self.local_not_recently_accessed_quarantined_media = (
_create_media_and_set_attributes(
last_accessed_ms=self.ONE_DAY_IN_MS,
is_quarantined=True,
)
)
self.local_not_recently_accessed_protected_media = (
_create_media_and_set_attributes(
last_accessed_ms=self.ONE_DAY_IN_MS,
is_protected=True,
)
)
self.local_never_accessed_media = _create_media_and_set_attributes(
last_accessed_ms=None,
)
self.local_never_accessed_media = _create_media_and_set_last_accessed(None)
# And now the remote media store
self.remote_recently_accessed_media = _cache_remote_media_and_set_last_accessed(
"a", self.THIRTY_DAYS_IN_MS
self.remote_recently_accessed_media = _cache_remote_media_and_set_attributes(
media_id="a",
last_accessed_ms=self.THIRTY_DAYS_IN_MS,
)
self.remote_not_recently_accessed_media = (
_cache_remote_media_and_set_last_accessed("b", self.ONE_DAY_IN_MS)
_cache_remote_media_and_set_attributes(
media_id="b",
last_accessed_ms=self.ONE_DAY_IN_MS,
)
)
self.remote_not_recently_accessed_quarantined_media = (
_cache_remote_media_and_set_attributes(
media_id="c",
last_accessed_ms=self.ONE_DAY_IN_MS,
is_quarantined=True,
)
)
# Remote media will always have a "last accessed" attribute, as it would not
# be fetched from the remote homeserver unless instigated by a user.
@ -163,8 +222,20 @@ class MediaRetentionTestCase(unittest.HomeserverTestCase):
],
not_purged=[
(self.hs.config.server.server_name, self.local_recently_accessed_media),
(
self.hs.config.server.server_name,
self.local_not_recently_accessed_quarantined_media,
),
(
self.hs.config.server.server_name,
self.local_not_recently_accessed_protected_media,
),
(self.remote_server_name, self.remote_recently_accessed_media),
(self.remote_server_name, self.remote_not_recently_accessed_media),
(
self.remote_server_name,
self.remote_not_recently_accessed_quarantined_media,
),
],
)
@ -199,6 +270,18 @@ class MediaRetentionTestCase(unittest.HomeserverTestCase):
self.hs.config.server.server_name,
self.local_not_recently_accessed_media,
),
(
self.hs.config.server.server_name,
self.local_not_recently_accessed_quarantined_media,
),
(
self.hs.config.server.server_name,
self.local_not_recently_accessed_protected_media,
),
(
self.remote_server_name,
self.remote_not_recently_accessed_quarantined_media,
),
(self.hs.config.server.server_name, self.local_never_accessed_media),
],
)