Add information on uploaded media to user export command. (#15107)

This commit is contained in:
Dirk Klimpel 2023-02-23 19:14:17 +01:00 committed by GitHub
parent 452b009eb0
commit a068ad7dd4
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 136 additions and 16 deletions

View file

@ -0,0 +1 @@
Add media information to the command line [user data export tool](https://matrix-org.github.io/synapse/v1.79/usage/administration/admin_faq.html#how-can-i-export-user-data).

View file

@ -70,13 +70,55 @@ output-directory
│ ├───state
│ ├───invite_state
│ └───knock_state
└───user_data
├───account_data
│ ├───global
│ └───<room_id>
├───connections
├───devices
└───profile
├───user_data
│ ├───account_data
│ │ ├───global
│ │ └───<room_id>
│ ├───connections
│ ├───devices
│ └───profile
└───media_ids
└───<media_id>
```
The `media_ids` folder contains only the metadata of the media uploaded by the user.
It does not contain the media itself.
Furthermore, only the `media_ids` that Synapse manages itself are exported.
If another media repository (e.g. [matrix-media-repo](https://github.com/turt2live/matrix-media-repo))
is used, the data must be exported separately.
With the `media_ids` the media files can be downloaded.
Media that have been sent in encrypted rooms are only retrieved in encrypted form.
The following script can help with downloading the media files:
```bash
#!/usr/bin/env bash

# Downloads every media file referenced by the `media_ids` folder of an export.
#
# Parameters
#
#   source_directory: Directory which contains the export with the media_ids.
#   target_directory: Directory into which all files are to be downloaded.
#   repository_url: Address of the media repository or media worker.
#   serverName: Name of the server (`server_name` from homeserver.yaml).
#
# Example:
#   ./download_media.sh /tmp/export_data/media_ids/ /tmp/export_data/media_files/ http://localhost:8008 matrix.example.com

# All four parameters are required; fail early instead of building broken URLs.
if [ "$#" -lt 4 ]; then
  echo "Usage: $0 source_directory target_directory repository_url serverName" >&2
  exit 1
fi

source_directory=$1
target_directory=$2
repository_url=$3
serverName=$4

# Every expansion below is quoted so that paths containing spaces or glob
# characters are passed through unchanged.
mkdir -p "$target_directory"

for file in "$source_directory"/*; do
  # An empty source directory leaves the pattern unexpanded; skip that entry.
  [ -e "$file" ] || continue

  filename=$(basename -- "$file")
  url="$repository_url/_matrix/media/v3/download/$serverName/$filename"

  echo "Downloading $filename - $url"
  # wget's own log output is discarded (-o /dev/null); only failures are reported.
  if ! wget -o /dev/null -P "$target_directory" "$url"; then
    echo "Could not download $filename"
  fi
done
```
Manually resetting passwords

View file

@ -44,6 +44,7 @@ from synapse.storage.databases.main.event_push_actions import (
)
from synapse.storage.databases.main.events_worker import EventsWorkerStore
from synapse.storage.databases.main.filtering import FilteringWorkerStore
from synapse.storage.databases.main.media_repository import MediaRepositoryStore
from synapse.storage.databases.main.profile import ProfileWorkerStore
from synapse.storage.databases.main.push_rule import PushRulesWorkerStore
from synapse.storage.databases.main.receipts import ReceiptsWorkerStore
@ -86,6 +87,7 @@ class AdminCmdSlavedStore(
RegistrationWorkerStore,
RoomWorkerStore,
ProfileWorkerStore,
MediaRepositoryStore,
):
def __init__(
self,
@ -235,6 +237,14 @@ class FileExfiltrationWriter(ExfiltrationWriter):
with open(account_data_file, "a") as f:
json.dump(account_data, fp=f)
def write_media_id(self, media_id: str, media_metadata: JsonDict) -> None:
    """Store the metadata of a single media item as a JSON file.

    The file is named after the media ID and is placed in the ``media_ids``
    directory below the export's base directory. The directory is created
    on demand, and a file that already exists under that name is overwritten.

    Args:
        media_id: ID of the media; used as the file name.
        media_metadata: Metadata of the media, serialised as JSON.
    """
    target_directory = os.path.join(self.base_directory, "media_ids")
    os.makedirs(target_directory, exist_ok=True)

    with open(os.path.join(target_directory, media_id), "w") as out:
        json.dump(media_metadata, out)
def finished(self) -> str:
    """Return the base directory of this writer."""
    export_directory = self.base_directory
    return export_directory

View file

@ -252,16 +252,19 @@ class AdminHandler:
profile = await self.get_user(UserID.from_string(user_id))
if profile is not None:
writer.write_profile(profile)
logger.info("[%s] Written profile", user_id)
# Get all devices the user has
devices = await self._device_handler.get_devices_by_user(user_id)
writer.write_devices(devices)
logger.info("[%s] Written %s devices", user_id, len(devices))
# Get all connections the user has
connections = await self.get_whois(UserID.from_string(user_id))
writer.write_connections(
connections["devices"][""]["sessions"][0]["connections"]
)
logger.info("[%s] Written %s connections", user_id, len(connections))
# Get all account data the user has global and in rooms
global_data = await self._store.get_global_account_data_for_user(user_id)
@ -269,6 +272,29 @@ class AdminHandler:
writer.write_account_data("global", global_data)
for room_id in by_room_data:
writer.write_account_data(room_id, by_room_data[room_id])
logger.info(
"[%s] Written account data for %s rooms", user_id, len(by_room_data)
)
# Get all media ids the user has
limit = 100
start = 0
while True:
media_ids, total = await self._store.get_local_media_by_user_paginate(
start, limit, user_id
)
for media in media_ids:
writer.write_media_id(media["media_id"], media)
logger.info(
"[%s] Written %d media_ids of %s",
user_id,
(start + len(media_ids)),
total,
)
if (start + limit) >= total:
break
start += limit
return writer.finished()
@ -359,6 +385,18 @@ class ExfiltrationWriter(metaclass=abc.ABCMeta):
"""
raise NotImplementedError()
@abc.abstractmethod
def write_media_id(self, media_id: str, media_metadata: JsonDict) -> None:
    """Write the metadata of one media item of a user.

    Only the metadata is exported, because it can be fetched from the
    database with read-only access. Accessing the media files themselves
    would require a connection to the correct media repository.

    Args:
        media_id: ID of the media.
        media_metadata: Metadata of one media file.
    """
@abc.abstractmethod
def finished(self) -> Any:
"""Called when all data has successfully been exported and written.

View file

@ -23,6 +23,7 @@ from synapse.api.constants import EventTypes, JoinRules
from synapse.api.room_versions import RoomVersions
from synapse.rest.client import knock, login, room
from synapse.server import HomeServer
from synapse.types import UserID
from synapse.util import Clock
from tests import unittest
@ -323,3 +324,31 @@ class ExfiltrateData(unittest.HomeserverTestCase):
args = writer.write_account_data.call_args_list[1][0]
self.assertEqual(args[0], "test_room")
self.assertEqual(args[1]["m.per_room"]["b"], 2)
def test_media_ids(self) -> None:
    """Check that the metadata of a user's media is handed to the writer."""
    # Store a single piece of local media for the second user.
    store_media = self._store.store_local_media(
        media_id="media_1",
        media_type="image/png",
        time_now_ms=self.clock.time_msec(),
        upload_name=None,
        media_length=50,
        user_id=UserID.from_string(self.user2),
    )
    self.get_success(store_media)

    export_writer = Mock()
    export = self.admin_handler.export_user_data(self.user2, export_writer)
    self.get_success(export)

    # Exactly one media item was stored, so exactly one write is expected.
    export_writer.write_media_id.assert_called_once()

    positional = export_writer.write_media_id.call_args[0]
    self.assertEqual(positional[0], "media_1")

    # The second positional argument is the metadata of that media item.
    self.assertEqual(positional[1]["media_id"], "media_1")
    self.assertEqual(positional[1]["media_length"], 50)
    self.assertGreater(positional[1]["created_ts"], 0)
    for unset_key in ("upload_name", "last_access_ts"):
        self.assertIsNone(positional[1][unset_key])