Add information on uploaded media to user export command. (#15107)

This commit is contained in:
Dirk Klimpel 2023-02-23 19:14:17 +01:00 committed by GitHub
parent 452b009eb0
commit a068ad7dd4
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 136 additions and 16 deletions

View file

@ -0,0 +1 @@
Add media information to the command line [user data export tool](https://matrix-org.github.io/synapse/v1.79/usage/administration/admin_faq.html#how-can-i-export-user-data).

View file

@ -70,13 +70,55 @@ output-directory
│ ├───state │ ├───state
│ ├───invite_state │ ├───invite_state
│ └───knock_state │ └───knock_state
└───user_data ├───user_data
├───account_data │ ├───account_data
│ ├───global │ │ ├───global
│ └───<room_id> │ │ └───<room_id>
├───connections │ ├───connections
├───devices │ ├───devices
└───profile │ └───profile
└───media_ids
└───<media_id>
```
The `media_ids` folder contains only the metadata of the media uploaded by the user.
It does not contain the media itself.
Furthermore, only the `media_ids` that Synapse manages itself are exported.
If another media repository (e.g. [matrix-media-repo](https://github.com/turt2live/matrix-media-repo))
is used, the data must be exported separately.
With the `media_ids` the media files can be downloaded.
Media that have been sent in encrypted rooms are only retrieved in encrypted form.
The following script can help with downloading the media files:
```bash
#!/usr/bin/env bash

# Download every media file referenced by a user data export.
#
# Parameters
#
#   source_directory: Directory which contains the export with the media_ids.
#   target_directory: Directory into which all files are to be downloaded.
#   repository_url: Address of the media repository or media worker.
#   serverName: Name of the server (`server_name` from homeserver.yaml).
#
# Example:
#   ./download_media.sh /tmp/export_data/media_ids/ /tmp/export_data/media_files/ http://localhost:8008 matrix.example.com

# Refuse to run without all four arguments: an empty source_directory would
# otherwise make the loop below iterate over `/*`.
if [ "$#" -lt 4 ]; then
  echo "Usage: $0 <source_directory> <target_directory> <repository_url> <serverName>" >&2
  exit 1
fi

source_directory=$1
target_directory=$2
repository_url=$3
serverName=$4

# Quote every expansion so that paths containing spaces or glob characters
# are passed through unchanged.
mkdir -p "$target_directory"

for file in "$source_directory"/*; do
  # An empty directory leaves the glob unexpanded; skip the literal pattern.
  [ -e "$file" ] || continue

  filename=$(basename "$file")
  url=$repository_url/_matrix/media/v3/download/$serverName/$filename

  echo "Downloading $filename - $url"
  # `-o /dev/null` discards wget's log output; `-P` sets the download directory.
  if ! wget -o /dev/null -P "$target_directory" "$url"; then
    echo "Could not download $filename"
  fi
done
``` ```
Manually resetting passwords Manually resetting passwords

View file

@ -44,6 +44,7 @@ from synapse.storage.databases.main.event_push_actions import (
) )
from synapse.storage.databases.main.events_worker import EventsWorkerStore from synapse.storage.databases.main.events_worker import EventsWorkerStore
from synapse.storage.databases.main.filtering import FilteringWorkerStore from synapse.storage.databases.main.filtering import FilteringWorkerStore
from synapse.storage.databases.main.media_repository import MediaRepositoryStore
from synapse.storage.databases.main.profile import ProfileWorkerStore from synapse.storage.databases.main.profile import ProfileWorkerStore
from synapse.storage.databases.main.push_rule import PushRulesWorkerStore from synapse.storage.databases.main.push_rule import PushRulesWorkerStore
from synapse.storage.databases.main.receipts import ReceiptsWorkerStore from synapse.storage.databases.main.receipts import ReceiptsWorkerStore
@ -86,6 +87,7 @@ class AdminCmdSlavedStore(
RegistrationWorkerStore, RegistrationWorkerStore,
RoomWorkerStore, RoomWorkerStore,
ProfileWorkerStore, ProfileWorkerStore,
MediaRepositoryStore,
): ):
def __init__( def __init__(
self, self,
@ -235,6 +237,14 @@ class FileExfiltrationWriter(ExfiltrationWriter):
with open(account_data_file, "a") as f: with open(account_data_file, "a") as f:
json.dump(account_data, fp=f) json.dump(account_data, fp=f)
def write_media_id(self, media_id: str, media_metadata: JsonDict) -> None:
    """Store the metadata of one media item as JSON under `media_ids/`.

    Args:
        media_id: ID of the media; used as the name of the written file.
        media_metadata: Metadata of the media, serialised as JSON.
    """
    media_dir = os.path.join(self.base_directory, "media_ids")
    # The directory is created on first use and reused on later calls.
    os.makedirs(media_dir, exist_ok=True)

    with open(os.path.join(media_dir, media_id), "w") as metadata_file:
        json.dump(media_metadata, metadata_file)
def finished(self) -> str: def finished(self) -> str:
return self.base_directory return self.base_directory

View file

@ -252,16 +252,19 @@ class AdminHandler:
profile = await self.get_user(UserID.from_string(user_id)) profile = await self.get_user(UserID.from_string(user_id))
if profile is not None: if profile is not None:
writer.write_profile(profile) writer.write_profile(profile)
logger.info("[%s] Written profile", user_id)
# Get all devices the user has # Get all devices the user has
devices = await self._device_handler.get_devices_by_user(user_id) devices = await self._device_handler.get_devices_by_user(user_id)
writer.write_devices(devices) writer.write_devices(devices)
logger.info("[%s] Written %s devices", user_id, len(devices))
# Get all connections the user has # Get all connections the user has
connections = await self.get_whois(UserID.from_string(user_id)) connections = await self.get_whois(UserID.from_string(user_id))
writer.write_connections( writer.write_connections(
connections["devices"][""]["sessions"][0]["connections"] connections["devices"][""]["sessions"][0]["connections"]
) )
logger.info("[%s] Written %s connections", user_id, len(connections))
# Get all account data the user has global and in rooms # Get all account data the user has global and in rooms
global_data = await self._store.get_global_account_data_for_user(user_id) global_data = await self._store.get_global_account_data_for_user(user_id)
@ -269,6 +272,29 @@ class AdminHandler:
writer.write_account_data("global", global_data) writer.write_account_data("global", global_data)
for room_id in by_room_data: for room_id in by_room_data:
writer.write_account_data(room_id, by_room_data[room_id]) writer.write_account_data(room_id, by_room_data[room_id])
logger.info(
"[%s] Written account data for %s rooms", user_id, len(by_room_data)
)
# Get all media ids the user has
limit = 100
start = 0
while True:
media_ids, total = await self._store.get_local_media_by_user_paginate(
start, limit, user_id
)
for media in media_ids:
writer.write_media_id(media["media_id"], media)
logger.info(
"[%s] Written %d media_ids of %s",
user_id,
(start + len(media_ids)),
total,
)
if (start + limit) >= total:
break
start += limit
return writer.finished() return writer.finished()
@ -359,6 +385,18 @@ class ExfiltrationWriter(metaclass=abc.ABCMeta):
""" """
raise NotImplementedError() raise NotImplementedError()
@abc.abstractmethod
def write_media_id(self, media_id: str, media_metadata: JsonDict) -> None:
    """Write the metadata of one media file uploaded by the user.

    Only the metadata is exported, as it can be fetched from the database
    with read-only access. Accessing the files themselves would require a
    connection to the correct media repository.

    Args:
        media_id: ID of the media.
        media_metadata: Metadata of one media file.
    """
    # Match the other abstract methods of this class, which raise rather
    # than silently returning None when reached via `super()`.
    raise NotImplementedError()
@abc.abstractmethod @abc.abstractmethod
def finished(self) -> Any: def finished(self) -> Any:
"""Called when all data has successfully been exported and written. """Called when all data has successfully been exported and written.

View file

@ -23,6 +23,7 @@ from synapse.api.constants import EventTypes, JoinRules
from synapse.api.room_versions import RoomVersions from synapse.api.room_versions import RoomVersions
from synapse.rest.client import knock, login, room from synapse.rest.client import knock, login, room
from synapse.server import HomeServer from synapse.server import HomeServer
from synapse.types import UserID
from synapse.util import Clock from synapse.util import Clock
from tests import unittest from tests import unittest
@ -323,3 +324,31 @@ class ExfiltrateData(unittest.HomeserverTestCase):
args = writer.write_account_data.call_args_list[1][0] args = writer.write_account_data.call_args_list[1][0]
self.assertEqual(args[0], "test_room") self.assertEqual(args[0], "test_room")
self.assertEqual(args[1]["m.per_room"]["b"], 2) self.assertEqual(args[1]["m.per_room"]["b"], 2)
def test_media_ids(self) -> None:
    """Check that the metadata of a user's uploaded media is exported."""
    # Register one local media upload owned by `self.user2`.
    store_media = self._store.store_local_media(
        media_id="media_1",
        media_type="image/png",
        time_now_ms=self.clock.time_msec(),
        upload_name=None,
        media_length=50,
        user_id=UserID.from_string(self.user2),
    )
    self.get_success(store_media)

    exfiltration_writer = Mock()
    export = self.admin_handler.export_user_data(self.user2, exfiltration_writer)
    self.get_success(export)

    # The writer must have received exactly one media record.
    exfiltration_writer.write_media_id.assert_called_once()

    # Positional arguments of that call: (media_id, media_metadata).
    positional = exfiltration_writer.write_media_id.call_args[0]
    exported_id = positional[0]
    metadata = positional[1]

    self.assertEqual(exported_id, "media_1")
    self.assertEqual(metadata["media_id"], "media_1")
    self.assertEqual(metadata["media_length"], 50)
    self.assertGreater(metadata["created_ts"], 0)
    self.assertIsNone(metadata["upload_name"])
    self.assertIsNone(metadata["last_access_ts"])