From e62914f739f9136a2ab49339c482136d95203790 Mon Sep 17 00:00:00 2001 From: David Ormsbee Date: Tue, 7 Apr 2026 21:43:18 -0400 Subject: [PATCH] feat: switch LearningPackage restore to pydantic This is a re-implementation of the restore part of backup_restore, with the goal of making it more robust and maintainable in the long term. --- docs/decisions/0025-backup-restore.rst | 46 ++ requirements/base.in | 4 + requirements/base.txt | 26 +- requirements/dev.txt | 25 + requirements/doc.txt | 25 + requirements/quality.txt | 25 + requirements/test.txt | 25 + .../applets/backup_restore/api.py | 125 ++++- .../applets/backup_restore/loading.py | 326 ++++++++++++ .../applets/backup_restore/payload.py | 496 ++++++++++++++++++ .../applets/backup_restore/schema.py | 302 +++++++++++ .../applets/backup_restore/validation.py | 39 ++ .../management/commands/encode.py | 98 ++++ .../management/commands/lp_load2.py | 67 +++ .../payload_test_data/collections/broken.toml | 0 .../payload_test_data/collections/dupe_1.toml | 0 .../payload_test_data/collections/dupe_2.toml | 0 .../collections/fields_not_in_table.toml | 0 .../collections/missing_collection_table.toml | 0 .../payload_test_data/entities/broken.toml | 2 + .../payload_test_data/entities/dupe_1.toml | 4 + .../payload_test_data/entities/dupe_2.toml | 4 + .../entities/missing_entity_key.toml | 10 + .../entities/missing_entity_table.toml | 11 + .../entities/missing_versions.toml | 0 .../entities/normal_component.toml | 0 .../entities/normal_container.toml | 29 + .../root_packages/broken.toml | 3 + .../root_packages/fields_not_in_table.toml | 15 + .../root_packages/minimal.toml | 9 + .../missing_learning_package.toml | 3 + .../root_packages/missing_meta.toml | 4 + .../root_packages/normal_ulmo_v1.toml | 15 + .../root_packages/unknown_table.toml | 10 + .../unsupported_format_version_1_1.toml | 7 + .../unsupported_format_version_2.toml | 6 + .../unsupported_format_version_b.toml | 6 + .../applets/backup_restore/test_payload.py | 229 
++++++++ .../applets/backup_restore/test_restore.py | 12 +- 39 files changed, 1989 insertions(+), 19 deletions(-) create mode 100644 docs/decisions/0025-backup-restore.rst create mode 100644 src/openedx_content/applets/backup_restore/loading.py create mode 100644 src/openedx_content/applets/backup_restore/payload.py create mode 100644 src/openedx_content/applets/backup_restore/schema.py create mode 100644 src/openedx_content/applets/backup_restore/validation.py create mode 100644 src/openedx_content/management/commands/encode.py create mode 100644 src/openedx_content/management/commands/lp_load2.py create mode 100644 tests/openedx_content/applets/backup_restore/payload_test_data/collections/broken.toml create mode 100644 tests/openedx_content/applets/backup_restore/payload_test_data/collections/dupe_1.toml create mode 100644 tests/openedx_content/applets/backup_restore/payload_test_data/collections/dupe_2.toml create mode 100644 tests/openedx_content/applets/backup_restore/payload_test_data/collections/fields_not_in_table.toml create mode 100644 tests/openedx_content/applets/backup_restore/payload_test_data/collections/missing_collection_table.toml create mode 100644 tests/openedx_content/applets/backup_restore/payload_test_data/entities/broken.toml create mode 100644 tests/openedx_content/applets/backup_restore/payload_test_data/entities/dupe_1.toml create mode 100644 tests/openedx_content/applets/backup_restore/payload_test_data/entities/dupe_2.toml create mode 100644 tests/openedx_content/applets/backup_restore/payload_test_data/entities/missing_entity_key.toml create mode 100644 tests/openedx_content/applets/backup_restore/payload_test_data/entities/missing_entity_table.toml create mode 100644 tests/openedx_content/applets/backup_restore/payload_test_data/entities/missing_versions.toml create mode 100644 tests/openedx_content/applets/backup_restore/payload_test_data/entities/normal_component.toml create mode 100644 
tests/openedx_content/applets/backup_restore/payload_test_data/entities/normal_container.toml create mode 100644 tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/broken.toml create mode 100644 tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/fields_not_in_table.toml create mode 100644 tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/minimal.toml create mode 100644 tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/missing_learning_package.toml create mode 100644 tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/missing_meta.toml create mode 100644 tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/normal_ulmo_v1.toml create mode 100644 tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/unknown_table.toml create mode 100644 tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/unsupported_format_version_1_1.toml create mode 100644 tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/unsupported_format_version_2.toml create mode 100644 tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/unsupported_format_version_b.toml create mode 100644 tests/openedx_content/applets/backup_restore/test_payload.py diff --git a/docs/decisions/0025-backup-restore.rst b/docs/decisions/0025-backup-restore.rst new file mode 100644 index 000000000..be3b636f9 --- /dev/null +++ b/docs/decisions/0025-backup-restore.rst @@ -0,0 +1,46 @@ +25. Learning Package Serialization and Validation Approach +========================================================== + +Context +------- + +Content Libraries map 1:1 to LearningPackages and these need to be imported and exported as file archives. Initial support for this was released in Ulmo, but we wanted to revisit it to make it more robust during the Verawood timeline. This is part of that effort. 
+ +* Flexibility of Structure +* Standardization of validation (JSON Schema) +* Justify ZIP +* Justify TOML +* Max 100,000 items. +* Use of fsspec as abstraction + +Phases + +Archive → Filesystem → Learning Package Doc + Resources → Input Models → LearningPackage + + +Decision +-------- + +Some key points: + +1. We intentionally separate input and output formats because the output format + will change over time, but the various input formats must continue to be + supported. We don't inherit one from the other because we don't *want* + those changes to be automatically propagated--that breaks compatibility. +2. We assemble into giant JSON in order to simplify validation and allow for + more flexibility in structural representation. There's the archive layer and + then the logical layer and then serialization into the database. + + +Archive -> Model (validation) + Resources -> Database + + + +Consequences +------------ + + + +Rejected alternatives +--------------------- + diff --git a/requirements/base.in b/requirements/base.in index 508635e10..6fef37198 100644 --- a/requirements/base.in +++ b/requirements/base.in @@ -15,3 +15,7 @@ rules<4.0 # Django extension for rules-based authorization check tomlkit # Parses and writes TOML configuration files edx-organizations # Implemented the "Organization" model that CatalogCourse/CourseRun are keyed to + +fsspec # Used by openedx_content's backup_restore to abstract zip access + +pydantic[email] # Used by openedx_content's backup_restore for input validation diff --git a/requirements/base.txt b/requirements/base.txt index 380bcae62..64391e029 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -6,6 +6,8 @@ # amqp==5.3.1 # via kombu +annotated-types==0.7.0 + # via pydantic asgiref==3.11.1 # via django attrs==26.1.0 @@ -67,7 +69,9 @@ djangorestframework==3.17.1 # edx-drf-extensions # edx-organizations dnspython==2.8.0 - # via pymongo + # via + # email-validator + # pymongo drf-jwt==1.19.2 # via 
edx-drf-extensions edx-django-utils==8.0.1 @@ -82,8 +86,14 @@ edx-opaque-keys==4.0.0 # edx-organizations edx-organizations==8.0.0 # via -r requirements/base.in +email-validator==2.3.0 + # via pydantic +fsspec==2026.3.0 + # via -r requirements/base.in idna==3.11 - # via requests + # via + # email-validator + # requests kombu==5.6.2 # via celery packaging==26.0 @@ -96,6 +106,10 @@ psutil==7.2.2 # via edx-django-utils pycparser==3.0 # via cffi +pydantic[email]==2.13.3 + # via -r requirements/base.in +pydantic-core==2.46.3 + # via pydantic pyjwt[crypto]==2.12.1 # via # drf-jwt @@ -123,7 +137,13 @@ stevedore==5.7.0 tomlkit==0.14.0 # via -r requirements/base.in typing-extensions==4.15.0 - # via edx-opaque-keys + # via + # edx-opaque-keys + # pydantic + # pydantic-core + # typing-inspection +typing-inspection==0.4.2 + # via pydantic tzdata==2026.1 # via kombu tzlocal==5.3.1 diff --git a/requirements/dev.txt b/requirements/dev.txt index 606687503..03fae9cdb 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -8,6 +8,10 @@ amqp==5.3.1 # via # -r requirements/quality.txt # kombu +annotated-types==0.7.0 + # via + # -r requirements/quality.txt + # pydantic asgiref==3.11.1 # via # -r requirements/quality.txt @@ -164,6 +168,7 @@ djangorestframework-stubs==3.16.9 dnspython==2.8.0 # via # -r requirements/quality.txt + # email-validator # pymongo docutils==0.22.4 # via @@ -192,6 +197,10 @@ edx-opaque-keys==4.0.0 # edx-organizations edx-organizations==8.0.0 # via -r requirements/quality.txt +email-validator==2.3.0 + # via + # -r requirements/quality.txt + # pydantic filelock==3.25.2 # via # -r requirements/ci.txt @@ -200,6 +209,8 @@ filelock==3.25.2 # virtualenv freezegun==1.5.5 # via -r requirements/quality.txt +fsspec==2026.3.0 + # via -r requirements/quality.txt grimp==3.14 # via # -r requirements/quality.txt @@ -211,6 +222,7 @@ id==1.6.1 idna==3.11 # via # -r requirements/quality.txt + # email-validator # requests import-linter==2.11 # via -r requirements/quality.txt 
@@ -353,6 +365,12 @@ pycparser==3.0 # via # -r requirements/quality.txt # cffi +pydantic[email]==2.13.3 + # via -r requirements/quality.txt +pydantic-core==2.46.3 + # via + # -r requirements/quality.txt + # pydantic pydocstyle==6.3.0 # via -r requirements/quality.txt pygments==2.20.0 @@ -516,6 +534,13 @@ typing-extensions==4.15.0 # grimp # import-linter # mypy + # pydantic + # pydantic-core + # typing-inspection +typing-inspection==0.4.2 + # via + # -r requirements/quality.txt + # pydantic tzdata==2026.1 # via # -r requirements/quality.txt diff --git a/requirements/doc.txt b/requirements/doc.txt index 37e89a84d..92e49a580 100644 --- a/requirements/doc.txt +++ b/requirements/doc.txt @@ -12,6 +12,10 @@ amqp==5.3.1 # via # -r requirements/test.txt # kombu +annotated-types==0.7.0 + # via + # -r requirements/test.txt + # pydantic asgiref==3.11.1 # via # -r requirements/test.txt @@ -132,6 +136,7 @@ djangorestframework-stubs==3.16.9 dnspython==2.8.0 # via # -r requirements/test.txt + # email-validator # pymongo doc8==2.0.0 # via -r requirements/doc.in @@ -161,8 +166,14 @@ edx-opaque-keys==4.0.0 # edx-organizations edx-organizations==8.0.0 # via -r requirements/test.txt +email-validator==2.3.0 + # via + # -r requirements/test.txt + # pydantic freezegun==1.5.5 # via -r requirements/test.txt +fsspec==2026.3.0 + # via -r requirements/test.txt grimp==3.14 # via # -r requirements/test.txt @@ -170,6 +181,7 @@ grimp==3.14 idna==3.11 # via # -r requirements/test.txt + # email-validator # requests imagesize==2.0.0 # via sphinx @@ -249,6 +261,12 @@ pycparser==3.0 # via # -r requirements/test.txt # cffi +pydantic[email]==2.13.3 + # via -r requirements/test.txt +pydantic-core==2.46.3 + # via + # -r requirements/test.txt + # pydantic pydata-sphinx-theme==0.16.1 # via sphinx-book-theme pygments==2.20.0 @@ -381,7 +399,14 @@ typing-extensions==4.15.0 # grimp # import-linter # mypy + # pydantic + # pydantic-core # pydata-sphinx-theme + # typing-inspection +typing-inspection==0.4.2 + # via 
+ # -r requirements/test.txt + # pydantic tzdata==2026.1 # via # -r requirements/test.txt diff --git a/requirements/quality.txt b/requirements/quality.txt index 6198e2414..4f681fb3d 100644 --- a/requirements/quality.txt +++ b/requirements/quality.txt @@ -8,6 +8,10 @@ amqp==5.3.1 # via # -r requirements/test.txt # kombu +annotated-types==0.7.0 + # via + # -r requirements/test.txt + # pydantic asgiref==3.11.1 # via # -r requirements/test.txt @@ -134,6 +138,7 @@ djangorestframework-stubs==3.16.9 dnspython==2.8.0 # via # -r requirements/test.txt + # email-validator # pymongo docutils==0.22.4 # via readme-renderer @@ -158,8 +163,14 @@ edx-opaque-keys==4.0.0 # edx-organizations edx-organizations==8.0.0 # via -r requirements/test.txt +email-validator==2.3.0 + # via + # -r requirements/test.txt + # pydantic freezegun==1.5.5 # via -r requirements/test.txt +fsspec==2026.3.0 + # via -r requirements/test.txt grimp==3.14 # via # -r requirements/test.txt @@ -169,6 +180,7 @@ id==1.6.1 idna==3.11 # via # -r requirements/test.txt + # email-validator # requests import-linter==2.11 # via -r requirements/test.txt @@ -269,6 +281,12 @@ pycparser==3.0 # via # -r requirements/test.txt # cffi +pydantic[email]==2.13.3 + # via -r requirements/test.txt +pydantic-core==2.46.3 + # via + # -r requirements/test.txt + # pydantic pydocstyle==6.3.0 # via -r requirements/quality.in pygments==2.20.0 @@ -394,6 +412,13 @@ typing-extensions==4.15.0 # grimp # import-linter # mypy + # pydantic + # pydantic-core + # typing-inspection +typing-inspection==0.4.2 + # via + # -r requirements/test.txt + # pydantic tzdata==2026.1 # via # -r requirements/test.txt diff --git a/requirements/test.txt b/requirements/test.txt index 72c728628..4f3cc40bb 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -8,6 +8,10 @@ amqp==5.3.1 # via # -r requirements/base.txt # kombu +annotated-types==0.7.0 + # via + # -r requirements/base.txt + # pydantic asgiref==3.11.1 # via # -r requirements/base.txt @@ -118,6 +122,7 
@@ djangorestframework-stubs==3.16.9 dnspython==2.8.0 # via # -r requirements/base.txt + # email-validator # pymongo drf-jwt==1.19.2 # via @@ -138,13 +143,20 @@ edx-opaque-keys==4.0.0 # edx-organizations edx-organizations==8.0.0 # via -r requirements/base.txt +email-validator==2.3.0 + # via + # -r requirements/base.txt + # pydantic freezegun==1.5.5 # via -r requirements/test.in +fsspec==2026.3.0 + # via -r requirements/base.txt grimp==3.14 # via import-linter idna==3.11 # via # -r requirements/base.txt + # email-validator # requests import-linter==2.11 # via -r requirements/test.in @@ -199,6 +211,12 @@ pycparser==3.0 # via # -r requirements/base.txt # cffi +pydantic[email]==2.13.3 + # via -r requirements/base.txt +pydantic-core==2.46.3 + # via + # -r requirements/base.txt + # pydantic pygments==2.20.0 # via # pytest @@ -279,6 +297,13 @@ typing-extensions==4.15.0 # grimp # import-linter # mypy + # pydantic + # pydantic-core + # typing-inspection +typing-inspection==0.4.2 + # via + # -r requirements/base.txt + # pydantic tzdata==2026.1 # via # -r requirements/base.txt diff --git a/src/openedx_content/applets/backup_restore/api.py b/src/openedx_content/applets/backup_restore/api.py index b4cda8828..dc552fadb 100644 --- a/src/openedx_content/applets/backup_restore/api.py +++ b/src/openedx_content/applets/backup_restore/api.py @@ -1,30 +1,127 @@ """ Backup Restore API + +Archive → Filesystem → Learning Package Doc + Resources → Input Models → LearningPackage + +Extract -> Validate -> Load + + +(FS + root) -> UnvalidatedLearningPackage -> ValidatedLearningPackageInput + """ -import zipfile +from datetime import datetime, timezone +from pathlib import Path +import attrs from django.contrib.auth.models import User as UserType # pylint: disable=imported-auth-user +from django.db.transaction import atomic +from fsspec.implementations.dirfs import DirFileSystem +from fsspec.implementations.zip import ZipFileSystem -from ..publishing.api import get_learning_package_by_key 
-from .zipper import LearningPackageUnzipper, LearningPackageZipper +from ..publishing import api as publishing_api +from .payload import extract_unvalidated_learning_package +from .loading import Loader +from .validation import validate +from .zipper import LearningPackageZipper, generate_staged_lp_key -def create_zip_file(lp_key: str, path: str, user: UserType | None = None, origin_server: str | None = None) -> None: - """ - Creates a dump zip file for the given learning package key at the given path. - The zip file contains a TOML representation of the learning package and its contents. +@attrs.define(frozen=True) +class ImportResult: + entities_created: int # Should this be a list of entity refs instead? - Can throw a NotFoundError at get_learning_package_by_key + +def load_learning_package( + path_str: str, + user: UserType, + package_ref: str | None = None, +) -> dict: """ - learning_package = get_learning_package_by_key(lp_key) - LearningPackageZipper(learning_package, user, origin_server).create_zip(path) + Loads a learning package from a zip file at the given path. + Restores the learning package and its contents to the database. + + TODO: Returns a dictionary with the status of the operation and any errors encountered. -def load_learning_package(path: str, key: str | None = None, user: UserType | None = None) -> dict: - """ Loads a learning package from a zip file at the given path. Restores the learning package and its contents to the database. Returns a dictionary with the status of the operation and any errors encountered. """ - with zipfile.ZipFile(path, "r") as zipf: - return LearningPackageUnzipper(zipf, key, user).load() + fs = _fs_for_path(path_str) + unvalidated_input = extract_unvalidated_learning_package(fs) + + # TODO: need to be able to exit early here if errors make the rest of this + # pointless. 
The Loader class currently knows how to make output that we can + # send up to platform, but maybe that knowledge should be in this module + # instead? + # if unvalidated_input.errors: + + validated_input = validate(unvalidated_input) + + if package_ref is None: + package_ref = generate_staged_lp_key( + validated_input.data.learning_package.key, user, + ) + + loader = Loader(validated_input) + now = datetime.now(tz=timezone.utc) + with atomic(): + learning_package = publishing_api.create_learning_package( + package_ref, "Temp Title", created=now + ) + load_target = Loader.Target(learning_package, user, now) + result = loader.load_into(load_target) + + return result + + +def _fs_for_path(path_str: str): + """ + If the path_str passed in is a directory, we treat that as the root of the + archive to be restored. Otherwise, we assume you're passing a Zip file. + + For future consideration: Using LibArchiveFileSystem would allow us to + support tar.gz, zip, 7z, and a bunch of other archiving formats in read-only + mode. I'm not doing it now because I'm not clear on whether the reliance on + libarchive makes things problematic, I don't understand the performance + implications, and I don't want to open the door on "supported archive + formats" to include everything under the sun. But it's an intriguing option + to consider. 
+ """ + # TODO: Handling of the special types of path here + path = Path(path_str) + if path.is_dir(): + return DirFileSystem(path) + elif path.is_file() and path.suffix.lower() == ".zip": + return ZipFileSystem(path) + + raise ValueError(f"Could not load path {path_str}") + + + + +def pretty_print(obj): + from pydantic import TypeAdapter + from typing import Any + from rich import print_json + + print_json(TypeAdapter(Any).dump_json(obj, indent=2).decode("utf8")) + + +### This was pre-existing: + +def create_zip_file( + lp_key: str, + path: str, + user: UserType | None = None, + origin_server: str | None = None, +) -> None: + """ + Creates a dump zip file for the given learning package key at the given path. + The zip file contains a TOML representation of the learning package and its contents. + + This is used by lp_dump. + + Can throw a NotFoundError at get_learning_package_by_key + """ + learning_package = publishing_api.get_learning_package_by_key(lp_key) + LearningPackageZipper(learning_package, user, origin_server).create_zip(path) diff --git a/src/openedx_content/applets/backup_restore/loading.py b/src/openedx_content/applets/backup_restore/loading.py new file mode 100644 index 000000000..984743844 --- /dev/null +++ b/src/openedx_content/applets/backup_restore/loading.py @@ -0,0 +1,326 @@ +""" +Logic for taking the logical schema model for a Learning Package and loading it +into the database. 
+""" +import mimetypes +import os.path + +from dataclasses import asdict, dataclass +from datetime import datetime, timedelta +from functools import cache, partial + +from django.contrib.auth.models import User as UserType # pylint: disable=imported-auth-user + +from ..components import api as components_api +from ..containers import api as containers_api +from ..collections import api as collections_api +from ..media import api as media_api +from ..publishing import api as publishing_api +from ..publishing.models import LearningPackage +from ..sections.models import Section +from ..subsections.models import Subsection +from ..units.models import Unit + +from .schema import ( + SectionInputData, + SubsectionInputData, + UnitInputData, +) +from .validation import ValidatedLearningPackageInput +from .zipper import RestoreResult, RestoreLearningPackageData, BackupMetadata + + +class Loader: + """ + Loads the validated input into a Learning Package in the database. + + This class does not understand the specifics of the archive file format. It + only needs the ValidatedLearningPackageInput. + """ + + @dataclass(frozen=True) + class Target: + learning_package: LearningPackage + user: UserType + loaded_at: datetime + + def __init__(self, validated_input: ValidatedLearningPackageInput): + self.validated_input = validated_input + self.component_inputs = {} + self.section_inputs = {} + self.subsection_inputs = {} + self.unit_inputs = {} + + entities = validated_input.data.entities + + # Split our entities into separate dicts for convenience. 
+ for entity_ref, entity_input in sorted(entities.items()): + match entity_input.container: + case SectionInputData(): + self.section_inputs[entity_ref] = entity_input + case SubsectionInputData(): + self.subsection_inputs[entity_ref] = entity_input + case UnitInputData(): + self.unit_inputs[entity_ref] = entity_input + case None: + # For the moment, if it's not a Container, it's a Component + self.component_inputs[entity_ref] = entity_input + + def load_into(self, target: Target): + """ + This method intentionally takes a target (LearningPackage, User, + Datetime) instead of putting that information into Loader object state. + My hope is that this pattern will make it easier to adapt into handling + incremental imports where we have to test the same input being imported + into multiple Learning Packages with existing state. + """ + bulk_change_context_for_time = partial( + publishing_api.bulk_draft_changes_for, + target.learning_package.id, + changed_by=target.user.id, + ) + + # DraftChangeLog 1: Add all the PublishableEntities and their versions, + # and set their versions to prepare for publishing. + with bulk_change_context_for_time(changed_at=target.loaded_at): + loaded_components = self.load_components_into(target) + loaded_entities = self.load_containers_into(target, loaded_components) + self.set_draft_versions(target, for_publishing=True) + + publishing_api.publish_all_drafts( + target.learning_package.id, + published_at=target.loaded_at, + published_by=target.user.id, + message="Restore from backup.", + ) + + # DraftChangeLog 2: Set all PublishableEntities to their proper draft. + # At this point, all versions have been loaded, and the correct versions + # have been published, but the current draft version might be wrong. + # + # The history display will want draft changes to be slightly after the + # published log entry. 
+ changed_at = target.loaded_at + timedelta(seconds=1) + with bulk_change_context_for_time(changed_at=changed_at): + self.set_draft_versions(target, for_publishing=False) + + # Collections are added at the end, in case publishing of contents would + # cause more thrashing w.r.t. search indexing. + self.load_collections_into(target, loaded_entities) + + return self.build_restore_result(target) + + def build_restore_result(self, target: Target): + """ + This is for compatibility with what we're already sending to the frontend. + + TODO: We should return something more structured for our API and let the + calling api.py handle the translation into what the REST API expects. + """ + validated_data = self.validated_input.data + + # Fix this with better parsing later. + _lib, org, slug = validated_data.learning_package.key.split(":") + + loaded_entities = publishing_api.get_publishable_entities(target.learning_package.id) + + result = RestoreResult( + status="success", + log_file_error=None, + lp_restored_data=RestoreLearningPackageData( + id=target.learning_package.id, + key=target.learning_package.key, + archive_lp_key=validated_data.learning_package.key, + archive_org_key=org, + archive_slug=slug, + title=target.learning_package.title, + num_containers=loaded_entities.filter(container__isnull=False).count(), + num_sections=loaded_entities.filter(container__section__isnull=False).count(), + num_subsections=loaded_entities.filter(container__subsection__isnull=False).count(), + num_units=loaded_entities.filter(container__unit__isnull=False).count(), + num_components=loaded_entities.filter(component__isnull=False).count(), + num_collections=collections_api.get_collections(target.learning_package.id).count(), + ), + backup_metadata=BackupMetadata( + format_version=validated_data.meta.format_version, + created_by=validated_data.meta.created_by, + created_by_email=validated_data.meta.created_by, + created_at=validated_data.meta.created_at, + 
original_server=validated_data.meta.origin_server, + ), + ) + return asdict(result) + + def load_components_into(self, target: Target): + """ """ + + @cache # inner fn, so won't persist across calls to load_components_into + def _get_component_type(namespace: str, name: str): + return components_api.get_or_create_component_type(namespace, name) + + @cache # inner fn, so won't persist across calls to load_components_into + def _get_media_type(mime_type: str): + return media_api.get_or_create_media_type(mime_type) + + mapping = {} + for entity_ref, entity_input in self.component_inputs.items(): + namespace, name, component_code = entity_ref.split(":") + component_type = _get_component_type(namespace, name) + component = components_api.create_component( + target.learning_package.id, + component_type=component_type, + local_key=component_code, + created=target.loaded_at, + created_by=target.user.id, + ) + # TODO: Validate missing children + sorted_version_inputs = sorted( + entity_input.versions, key=lambda v: v.version_num + ) + for version_input in sorted_version_inputs: + media_to_replace = {} + for path, text_val in version_input.component.media.items(): + filename = os.path.basename(path) + if filename == "block.xml": + media_type = _get_media_type( + f"application/vnd.openedx.xblock.v1.{component_type.name}+xml" + ) + else: + media_type_str, _encoding = mimetypes.guess_type(filename) + media_type_str = media_type_str or "application/octet-stream" + media_type = _get_media_type(media_type_str) + + # TODO: Adopt data-urls for this. 
+ if path.startswith('static/'): + # This is where we could add base64 encoded versions + # right now, we just use fs:/path/to/file + _resource_type, filepath = text_val.split(":") + new_media = media_api.get_or_create_file_media( + target.learning_package.id, + media_type.id, + data=self.validated_input.fs.read_bytes(filepath), + created=target.loaded_at, + ) + else: + new_media = media_api.get_or_create_text_media( + target.learning_package.id, + media_type.id, + text=text_val, + created=target.loaded_at, + ) + + media_to_replace[path] = new_media.id + + # TODO: Modify create_next_component_version to take a Component + # as an option, to save the needless fetches. + components_api.create_next_component_version( + component.pk, + title=version_input.title, + media_to_replace=media_to_replace, + created=target.loaded_at, + created_by=target.user.id, + force_version_num=version_input.version_num, + ) + mapping[entity_ref] = component + + return mapping + + def load_containers_into(self, target: Target, component_mapping: dict): + + # Ordering matters, since we want to build the references bottom-up. 
+ container_types_to_inputs = { + Unit: self.unit_inputs, + Subsection: self.subsection_inputs, + Section: self.section_inputs, + } + mapping = component_mapping.copy() + for container_type, container_inputs in container_types_to_inputs.items(): + for entity_ref, entity_input in container_inputs.items(): + container = containers_api.create_container( + target.learning_package.id, + entity_ref, + created=target.loaded_at, + created_by=target.user.id, + container_cls=container_type, + ) + + # TODO: Validate missing children + sorted_version_inputs = sorted( + entity_input.versions, key=lambda v: v.version_num + ) + for version_input in sorted_version_inputs: + containers_api.create_next_container_version( + container, + title=version_input.title, + entities=[ + mapping[child_ref] + for child_ref in version_input.container.children + ], + created=target.loaded_at, + created_by=target.user.id, + force_version_num=version_input.version_num, + ) + + mapping[entity_ref] = container + + return mapping + + def load_collections_into(self, target: Target, loaded_entities): + for collection_input in self.validated_input.data.collections: + collections_api.create_collection( + target.learning_package.id, + key=collection_input.key, + title=collection_input.title, + created_by=target.user.id, + description=collection_input.description, + ) + loaded_entity_refs = [ + ref for ref in collection_input.entities if ref in loaded_entities + ] + entities = publishing_api.get_publishable_entities( + target.learning_package.id + ).filter(key__in=loaded_entity_refs) + + collections_api.add_to_collection( + target.learning_package.id, + key=collection_input.key, + entities_qset=entities, + ) + + def set_draft_versions(self, target: Target, for_publishing: bool): + entity_inputs = self.validated_input.data.entities + + saved_entities = publishing_api.get_publishable_entities( + target.learning_package.id + ) + for saved_entity in saved_entities: + saved_draft_version = 
publishing_api.get_draft_version(saved_entity) + input_entity = entity_inputs[saved_entity.key] + + if for_publishing: + input_version_num = input_entity.published.version_num + else: + input_version_num = input_entity.draft.version_num + + # The version we want to set is already the current draft, which + # means there's nothing to do. + if ( + saved_draft_version + and saved_draft_version.version_num == input_version_num + ): + continue + + if input_version_num is None: + version_id_to_set = None + else: + version_model_to_publish = saved_entity.versions.get( + version_num=input_version_num + ) + version_id_to_set = version_model_to_publish.id + + publishing_api.set_draft_version( + saved_entity.id, + version_id_to_set, + set_at=target.loaded_at, + set_by=target.user.id, + ) diff --git a/src/openedx_content/applets/backup_restore/payload.py b/src/openedx_content/applets/backup_restore/payload.py new file mode 100644 index 000000000..7de95966e --- /dev/null +++ b/src/openedx_content/applets/backup_restore/payload.py @@ -0,0 +1,496 @@ +""" +This module works with the actual files in our backup archive. It is agnostic to +the archive container format that the files are bundled in, e.g. a local file +system directory, a zip file archive, or something more exotic down the line. + +Some high level considerations for this module: + +1. The error checking is for the file format itself, i.e. extracting values + from the TOML files and static assets and assembling them for validation. + In some cases, this means we do have to look for particular fields to handle +""" + +from __future__ import annotations +from numbers import Number +import os.path # fsspec doesn't work well with Path objects. 
+import tomllib + +import attrs +from fsspec import AbstractFileSystem + +ROOT_PACKAGE_PATH = "package.toml" + + +@attrs.define(frozen=True) +class UnvalidatedLearningPackageInput: + raw_data: dict + errors: list[ExtractionError] + fs: AbstractFileSystem + + # Mapping of entity refs to the paths where we found them. + entity_path_mapping: dict[str, str] + + +class ExtractionError(Exception): + """ + Any error during the extraction process. + + At the moment, any error is fatal. The point of the different errors is to + provide useful debug logging and to let us write tests that look for + specific errors. + """ + + def __init__(self, message, path=None): + super().__init__(message) + self.message = message + self.path = path + + def __str__(self): + return f"{self.path}: {self.message}" + + +class InvalidTOMLError(ExtractionError): + def __init__(self, file_description, details, path): + message = f"Cannot decode TOML for {file_description}: {details}" + super().__init__(message, path=path) + + +class TableNotFoundError(ExtractionError): + def __init__(self, file_description, table, path): + self.table = table + message = f"Table [{table}] not found in {file_description}." 
+ super().__init__(message, path=path) + + +class FieldsNotInTable(ExtractionError): + def __init__(self, file_description, fields, path): + self.fields = sorted(fields) + message = f"{file_description} has fields not in a table: {', '.join(fields)}" + super().__init__(message, path=path) + + +class FieldMissing(ExtractionError): + def __init__(self, file_description, table, missing_field, path): + self.table = table + self.missing_field = missing_field + message = ( + f'{file_description} is missing required field "{missing_field}" ' + f"from table [{table}]" + ) + super().__init__(message, path=path) + + +class FileNotFoundError(ExtractionError): + def __init__(self, file_description, path): + message = f"{file_description} file not found at expected path" + super().__init__(message, path=path) + + +class DuplicateFoundError(ExtractionError): + def __init__(self, description, original_path, path): + self.original_path = original_path + message = f"{description} already defined in {original_path}" + super().__init__(message, path=path) + + +class UnsupportedFormatError(ExtractionError): + pass + + +def extract_unvalidated_learning_package( + fs: AbstractFileSystem, +) -> UnvalidatedLearningPackageInput: + """ + Extract the raw, unvalidated Learning Package metadata. + + We scan through the archive and compile a Python dictionary that can be + validated against CompletePackageInputData. This mostly involves reading a + bunch of TOML-serialized files and copying their contents, with some minor + data transformations where idiomatic TOML doesn't really give the structure + we want in our final model. + + The purpose of this abstraction is to later allow different ways to assemble + the JSON that we want to do our validation on. By default, this is a bunch + of TOML files, but folks who have specialized authoring needs may prefer a + different set of conventions. 
For instance, the MIT Disciplinary Experts in + Learning Technology and Applications team prefers to author in a way that + encodes large parts of the hierarchy (Section -> Subsection -> Unit) in a + single file, with pointers to certain Components in different files. + + Errors can happen at this layer, but they are errors related to the + consistency of the archive payload format itself. So errors that need to be + checked here are things like: + + * Missing critical files, like package.toml + * Duplicated entity files, as this is not possible to represent in the + CompletePackageInputData schema. + + Things like missing fields and incorrect field values will be handled at the + validation step which happens after this. In other words, the only things + that are errors here are the things that prevent us from creating a + UnvalidatedLearningPackageInput at all. + """ + # The general philosophy here is to always march on and get as much as + # possible, even if we know the upload is doomed. + unvalidated = {} + errors = [] + + # Root Package Metadata + try: + # This adds the "meta" and "learning_package" keys + unvalidated |= extract_root_package_data(fs, ROOT_PACKAGE_PATH) + except ExtractionError as err: + errors.append(err) + + # PublishableEntities & versions (components, units, sections, subsections) + entities_data, entity_path_mapping, entities_errors = extract_entities_data( + fs, get_entity_file_paths(fs) + ) + unvalidated["entities"] = entities_data + errors.extend(entities_errors) + + # Collections + # TODO: Duplicate collections are a problem too. 
+ collections = [] + for collection_file_path in sorted(fs.glob("collections/*.toml")): + try: + collections.append(extract_collection_data(fs, collection_file_path)) + except ExtractionError as err: + errors.append(err) + unvalidated["collections"] = collections + + return UnvalidatedLearningPackageInput( + raw_data=unvalidated, + errors=errors, + fs=fs, + entity_path_mapping=entity_path_mapping, + ) + + +def extract_root_package_data(fs: AbstractFileSystem, path: str) -> dict: + """ + Extract the "meta" and "learning_package" from the TOML file at path. + + This is a straightforward extraction because we don't have to transform the + actual fields in the data. We expect to see a TOML file that looks something + like this: + + [meta] + format_version = 1 + created_by = "eddy" + created_by_email = "eddy@axim.org" + created_at = 2026-03-11T19:20:20.394360Z + origin_server = "studio.local.openedx.io" + + [learning_package] + title = "Fun Library" + key = "lib:Axim:FunLib" + description = "My very fun library! 🐢" + created = 2026-02-11T16:32:47.524556Z + updated = 2026-02-20T16:32:47.524556Z + + The output should look like: + + { + 'meta': { + 'format_version': 1, + 'created_by': 'eddy', + 'created_by_email': 'eddy@axim.org', + 'created_at': datetime(2026, 3, 11, 19, 20, 20, 394360, tzinfo=timezone.utc), + 'origin_server': 'studio.local.openedx.io' + }, + 'learning_package': { + 'title': 'Fun Library', + 'key': 'lib:Axim:FunLib', + 'description': 'My very fun library! 🐢', + 'created': datetime(2026, 2, 11, 16, 32, 47, 524556, tzinfo=timezone.utc), + 'updated': datetime(2026, 2, 20, 16, 32, 47, 524556, tzinfo=timezone.utc), + } + } + + We need to return a Python dict that we get from parsing this. Most of this + function is error handling. The error checking at this layer is minimal, and + is mostly focused on making sure that the file exists, is parseable, and has + the two tables we expect it to have. 
+ """ + file_description = "Root Package" + + # Check: Root Package file exists at all. + if not fs.exists(path): + raise FileNotFoundError(file_description, path=path) + + # Check: Is it a valid TOML file? + with fs.open(path, "rb") as package_toml_file: + try: + root_package_dict = tomllib.load(package_toml_file) + except tomllib.TOMLDecodeError as dec_err: + raise InvalidTOMLError( + file_description, details=str(dec_err), path=path + ) from dec_err + + # Check: Don't allow top-level fields outside a [table] + _check_all_fields_in_tables(root_package_dict, file_description, path) + + # Check: The "[meta]" and "[learning_package]" tables are mandatory + if "meta" not in root_package_dict: + raise TableNotFoundError(file_description, table="meta", path=path) + if "learning_package" not in root_package_dict: + raise TableNotFoundError(file_description, table="learning_package", path=path) + + # Check: We only support format_version 1, and don't know what to do with + # anything higher. This leaves us some wiggle-room to declare a 1.x version + # that is backwards compatible, i.e. it will reject 2 and higher, but accept + # 1.1, 1.2, etc. + format_version = root_package_dict["meta"].get("format_version") + if not isinstance(format_version, Number) or format_version >= 2: + raise UnsupportedFormatError( + f"Format version {format_version} is unsupported (only 1 is supported).", + path=path, + ) + + return root_package_dict + + +def get_entity_file_paths(fs: AbstractFileSystem) -> list[str]: + """ + Find all the PublishableEntity TOML file paths in our archive. + + We expect our entity TOML files to be in the entities directory, but we have + two categories right now: + + * Component TOML: entities/xblock.v1/{component_type}/{component_code} + * Container TOML: entities/{entity_ref} + + This function looks for TOML files in entities/ or any of its subdirs. 
We + only exclude matches inside the component_version data, to make sure that we + don't accidentally match media files in the unlikely event where people have + TOML files as static assets. + """ + paths = [ + path + for path in fs.glob("entities/**/*.toml") + # Filter out TOML files that are in component media, e.g. static assets: + if "/component_versions/" not in path + ] + return sorted(paths) # Make the ordering deterministic. + + +def extract_entities_data(fs: AbstractFileSystem, paths: list[str]): + entities_data = {} + entity_path_mapping = {} + errors = [] + for entity_file_path in paths: + try: + entity_ref, entity_data = extract_entity_data( + fs, entity_file_path, entity_path_mapping + ) + entities_data[entity_ref] = entity_data + entity_path_mapping[entity_ref] = entity_file_path + except ExtractionError as err: + errors.append(err) + + return entities_data, entity_path_mapping, errors + + +def extract_entity_data( + fs: AbstractFileSystem, path: str, entity_path_mapping: dict[str, str] | None = None +) -> tuple[str, dict]: + """ + This extracts raw entity data from an Entity TOML file. + + PublishableEntities can be both Components (XBlock problems, videos, etc.), + as well as Containers like Units, Subsections, and Sections. 
Some sample + TOML: + + [entity] + can_stand_alone = true + key = "section-9-ac4b9f" + created = 2026-04-08T15:22:12.780012Z + + [entity.draft] + version_num = 2 + + [entity.published] + version_num = 1 + + [entity.container.section] + + # ### Versions + + [[version]] + title = "Section 9" + version_num = 2 + + [version.container] + children = ["week-7-e73782", "subsection-001-e4bbe5"] + + [[version]] + title = "Section 9" + version_num = 1 + + [version.container] + children = ["week-7-e73782"] + + We return a tuple where the first element is the Entity's key + ("section-9-ac4b9f"), and the second is a dict that would look like: + + { + 'can_stand_alone': True, + 'created': datetime(2026, 4, 8, 15, 22, 12, 780012, tzinfo=timezone.utc), + 'draft': { + 'version_num': 2 + }, + 'published': { + 'version_num': 1 + }, + 'container': { + 'section': {} + }, + 'versions': [ + { + 'title': 'Section 9', + 'version_num': 2, + 'container': { + 'children': [ + 'week-7-e73782', + 'subsection-001-e4bbe5' + ] + } + }, + { + 'title': 'Section 9', + 'version_num': 1, + 'container': { + 'children': [ + 'week-7-e73782' + ] + } + } + ] + } + + Note some key differences: + + 1. The "entity" table elements have been popped out to the top level. + 2. The "version" list has been renamed to "versions" to feel more natural. + 3. The "key" field (a.k.a. entity_ref) has been popped out to pass back as + part of the tuple. This will become a key/value pair in an "entities" + dict that will hold all publishable entity input data. + """ + file_description = "Entity" + if entity_path_mapping is None: + entity_path_mapping = {} + + # Check: Is it a valid TOML file? 
+ with fs.open(path, "rb") as entity_file: + try: + entity_root_dict = tomllib.load(entity_file) + except tomllib.TOMLDecodeError as dec_err: + raise InvalidTOMLError( + file_description, details=str(dec_err), path=path + ) from dec_err + + # Check: Don't allow top-level fields outside a [table] + _check_all_fields_in_tables(entity_root_dict, file_description, path) + + # Check: Does it define a top level "[entity]" table? Note that this can + # pass if they define a sub-table like "[entity.draft]", since the existence + # of "[entity]" is implicit in that case. If we get that far, rely on + # catching it at the validation step (i.e. after payload extraction). + if "entity" not in entity_root_dict: + raise TableNotFoundError(file_description, "entity", path=path) + + # Check: Does it define an Entity key (i.e. entity_ref)? We need to check + # this now because the dict we have to assemble will use these as keys. + entity = entity_root_dict["entity"] + entity_ref = entity.pop("key", None) + if not entity_ref: + raise FieldMissing(file_description, "entity", "key", path) + + # Check: Is it a duplicate of an Entity that has already been defined + # elsewhere in this archive? + if entity_ref in entity_path_mapping: + raise DuplicateFoundError( + f"Entity key {entity_ref}", entity_path_mapping[entity_ref], path + ) + + # Note case difference: we're renaming "version" in the TOML to "versions" + # in the data dict we're assembling. + entity["versions"] = entity_root_dict.pop("version", []) + for version in entity["versions"]: + # Do our best to put together entity version data (and component version + # data), but don't worry about validating the results (that can happen + # during the validation step). 
+ version_num = version.get("version_num") + comp_ver_dir = os.path.join( + os.path.splitext(path)[0], + "component_versions", + f"v{version_num}", + ) + if fs.exists(comp_ver_dir): + version["component"] = {} + media = { + os.path.relpath(path, comp_ver_dir): fs.read_text(path) + for path in fs.glob(f"{comp_ver_dir}/*") + if fs.isfile(path) + } + # Any static files are encoded as pointers. + # TODO: Convert this to data-urls later + for static_file_path in fs.glob(f"{comp_ver_dir}/static/**"): + if fs.isfile(static_file_path): + rel_path = os.path.relpath(static_file_path, comp_ver_dir) + media[rel_path] = f"fs:{static_file_path}" + + version["component"]["media"] = media + + return entity_ref, entity + + +def extract_collection_data(fs: AbstractFileSystem, path: str) -> dict: + file_description = "Collection" + + with fs.open(path, "rb") as collection_toml_file: + try: + collection_root_dict = tomllib.load(collection_toml_file) + except tomllib.TOMLDecodeError as dec_err: + raise InvalidTOMLError(file_description, details=str(dec_err), path=path) + + _check_all_fields_in_tables(collection_root_dict, file_description, path) + if "collection" not in collection_root_dict: + raise TableNotFoundError( + file_description, table="collection", path=path + ) + + collection_data = collection_root_dict["collection"] + collection_data["src_path"] = path + + return collection_data + + +def _check_all_fields_in_tables(data: dict, file_description, path): + """ + Raise an error if fields are declared outside of a table. + + The convention for our TOML files is that keys are always in a table, so if + it's *not* in a table, that's likely an omission/error that might otherwise + be difficult to catch because they'd be "missing" from the place they're + supposed to be in the parsed data structure, but that wouldn't be obvious to + someone editing the files by hand. 
+ """ + fields_outside_of_tables = [ + field + for field, val in data.items() + if not isinstance(val, dict) and not isinstance(val, list) + ] + if fields_outside_of_tables: + raise FieldsNotInTable( + file_description, fields=fields_outside_of_tables, path=path + ) + + +def pretty_print(obj): + from pydantic import TypeAdapter + from typing import Any + from rich import print_json + + print_json(TypeAdapter(Any).dump_json(obj, indent=2).decode("utf8")) diff --git a/src/openedx_content/applets/backup_restore/schema.py b/src/openedx_content/applets/backup_restore/schema.py new file mode 100644 index 000000000..6bde51697 --- /dev/null +++ b/src/openedx_content/applets/backup_restore/schema.py @@ -0,0 +1,302 @@ +""" +This module defines the schema that we use during the backup/restore process. + +The pydantic models defined in this module are divided into InputData and +OutputData. These are intentionally kept separate and do not inherit from each +other. The InputData classes will be much more permissive, with many optional +fields. The OutputData classes are meant for internal use when generating +exports, and will be stricter. +""" +from __future__ import annotations +from pathlib import Path + +from pydantic import ( + AwareDatetime, + BaseModel, + ConfigDict, + Field, + EmailStr, + StrictStr, + StringConstraints, + field_validator +) +from typing import Annotated, Literal + +# Refs are arbitrary identifiers that we do almost no validation of, and are +# mainly there to assure uniqueness within some namespace. +REF_CONSTRAINTS = StringConstraints( + strict=True, + strip_whitespace=True, +), + +# This is for things like the collection_code, library_code, etc. +CODE_CONSTRAINTS = StringConstraints( + strict=True, + strip_whitespace=True, + # Note that we can't use \Z to indicate the end of line in our regex because + # that's not supported syntax in JavaScript, and pydantic will raise an + # error when trying to generate a JSON Schema. 
However, the combination of $ + # and strip_whitespace=True means that we're sure that we won't allow any + # trailing newlines. + pattern=r"^[a-zA-Z0-9_.-]+$", +), + + +class InputData(BaseModel): + """ + Base class for all inputs, here to set config defaults. + + InputData classes are frozen, i.e. they should only be initialized once from + the unvalidated input. Allowing gradual mutations makes things much harder + to debug. + + InputData classes are also set to allow parameters that they don't recognize + (extra="allow") for the sake of forwards compatibility. As any given file + format gets iterated on, it will get new attributes. Older installs of the + platform should ignore these new attributes and just load the things that we + know how to handle. The reason we don't set this to "ignore" is because + unrecognized fields could be simple typos of known fields, so we still want + to capture that information so we can potentially display warnings about it. + """ + model_config = ConfigDict(frozen=True, extra="allow") + + +class CompletePackageInputData(InputData): + """ + The contents of the entire Learning Package. + """ + meta: MetaInputData + learning_package: LearningPackageInputData + + # Mapping of entity refs to EntityInputData + entities: dict[Annotated[str, REF_CONSTRAINTS], EntityInputData] + + collections: list[CollectionInput] + + @field_validator('collections', mode='after') + @classmethod + def check_for_duplicate_keys(cls, collections: list[CollectionInput]): + """ + Raise a ValueError if we encounter a duplicate collection entry. + + In the longer term, we may want to be able to remove the duplicate + entries (and other broken entries), while still otherwise allowing the + restore to proceed. But for now, any error kills the restore process.
+ """ + collection_keys_to_paths = {} + for collection in collections: + if collection.key in collection_keys_to_paths: + originally_defined_collection = collection_keys_to_paths[collection.key] + raise ValueError( + f'Collection "{collection.key}" redefined in ' + f'{collection.src_path} (original in ' + f'{originally_defined_collection.src_path})' + ) + else: + collection_keys_to_paths[collection.key] = collection + + return collections + + +class MetaInputData(InputData): + """ + Input Package Metadata, Version 1 + + This is data about the backup file itself, as opposed to the Learning + Package that it contains: who created this backup, when was it created, etc. + On the input side, the fields here are only here so that we can give useful + preview information when the user is uploading this to a new instance. None + of these values are necessary for creating a new Learning Package—in fact, + none of these can even be trusted, since a malicious actor could manipulate + them to say whatever they wanted. It's just meant as a sanity check to help + assure the user that they're restoring the correct package archive. + + The only truly critical field is ``format_version``, since that will one day + affect input validation rules. + """ + format_version: Literal[1] # Only supported version at the moment + created_by: StrictStr | None = Field(min_length=1) + created_by_email: EmailStr | None + created_at: AwareDatetime | None + origin_server: StrictStr | None + +class LearningPackageInputData(InputData): + """ + High level data for a Learning Package itself (not its contents). + """ + title: StrictStr = Field(min_length=1, default="Untitled Library") + key: Annotated[ + str, + REF_CONSTRAINTS, + Field( + description=( + "This is often a LibraryLocatorV2-formatted string, but can be " + "any arbitrary string at the moment. It must be unique within a" + " given server instance." 
+ ), + examples=[ + "lib:OrgName:LibraryName", + "lib:Axim:IntroPhysics", + "lp-restore:Axim:IntroPhysics:1775752130941", + ], + ), + ] + description: StrictStr | None = Field(default="", max_length=10_000) + created: AwareDatetime | None + updated: AwareDatetime | None + + +class DraftInputData(InputData): + version_num: Annotated[int, Field(gt=0)] | None = None + + +class PublishedInputData(InputData): + version_num: Annotated[int, Field(gt=0)] | None = None + + +class EntityInputData(InputData): + can_stand_alone: bool = True + + # key: str + created: AwareDatetime + + # Weird edge case: If you create something, never publish it, and then do a + # "reset to published state", the resulting export in Ulmo would omit the + # [entity.draft] section entirely, rather than it being an empty dictionary. + draft: DraftInputData = DraftInputData(version_num=None) + published: PublishedInputData = PublishedInputData(version_num=None) + + versions: list[VersionInput] = [] + + # Not all entities are containers, and we may one day have containers that + # this version of the code does not understand. So we have a generic dict + # for unknown containers and None means it's something that is not a + # container. + # + # TODO: Test unknown container type. 
+ container: UnitInputData | SubsectionInputData | SectionInputData | dict | None = None + + +class SectionInputData(InputData): + section: dict = {} + +class SubsectionInputData(InputData): + subsection: dict = {} + +class UnitInputData(InputData): + unit: dict = {} + + +class VersionInput(InputData): + version_num: Annotated[int, Field(gt=0)] + title: str + component: ComponentVersionInput | None = None + container: ContainerVersionInput | None = None + + +class ComponentVersionInput(InputData): + media: dict + + +class ContainerVersionInput(InputData): + children: list[Annotated[str, REF_CONSTRAINTS]] + + +class VersionsInput(InputData): + versions: dict[ + Annotated[int, Field(gt=0)], + VersionInput, + ] + + +class CollectionInput(InputData): + title: StrictStr = Field(min_length=1) + key: Annotated[ + str, + CODE_CONSTRAINTS, + Field( + description=( + "A unique slug-like code field. Must be unique within a given Learning Package." + ), + examples=[ + "difficult-problems", + "practice-exams", + ], + ), + ] + description: StrictStr | None = Field(default="", max_length=10_000) + created: AwareDatetime | None + + # It looks like we weren't actually serializing the modified date. + created: AwareDatetime | None + + # This is the source file where this Collection was defined. This is only + # for being able to create useful error messages. We should never be reading + # from this file directly because the exact format of this file should be + # free to change as needed. That's the responsibility of the payload.py + # module. + src_path: Path | None + + +##### We're not actually using anything below this line yet. + + + + + + +class PackageConfigOutputData(BaseModel): + """ + Writes the package.toml file when we're writing a backup archive. + """ + meta: MetaOutputData + learning_package: LearningPackageOutputData + + +class PackageConfigInputData(BaseModel): + """ + Reads the package.toml file when we're reading from a backup archive.
+ """ + meta: MetaInputData + learning_package: LearningPackageInputData + + +class MetaOutputData(BaseModel): + """ + Output Package Metadata + + This is metadata that is written so that people can more easily figure out + where a backup archive came from. + + The "created_by", "created_by_email", and "created_at" fields all refer to + the user who created the backup archive, not the user who created the + Library (Learning Package). + """ + format_version: Literal[1] + created_by: StrictStr = Field(min_length=1) + created_by_email: EmailStr + created_at: AwareDatetime + origin_server: StrictStr + + + + + +class LearningPackageOutputData(BaseModel): + """ + High level data for a Learning Package. + """ + title: StrictStr = Field(min_length=1) + key: StrictStr = Field( + pattern=r"^lib:[\w\-.]+:[\w\-.]+$", + description="This is a LibraryLocatorV2", + examples=[ + "lib:OrgName:LibraryName", + "lib:Axim:IntroPhysics", + ] + ) + description: StrictStr + created: AwareDatetime + updated: AwareDatetime + + diff --git a/src/openedx_content/applets/backup_restore/validation.py b/src/openedx_content/applets/backup_restore/validation.py new file mode 100644 index 000000000..c86063197 --- /dev/null +++ b/src/openedx_content/applets/backup_restore/validation.py @@ -0,0 +1,39 @@ +""" +This is an archive-agnostic validation of the data models. I might actually just +move this to api.py, since most of this work will be done in schema.py +""" +import attrs + +from pydantic_core import InitErrorDetails +from fsspec import AbstractFileSystem + +from .schema import CompletePackageInputData +from .payload import UnvalidatedLearningPackageInput + +@attrs.define(frozen=True) +class ValidatedLearningPackageInput: + data: CompletePackageInputData | None # None if it's too broken + + fs: AbstractFileSystem + + # All these names are terrible. + + # These are the errors that mean this is actually malformed, i.e. JSON + # Schema level validation. 
+ structural_errors: list[InitErrorDetails] + + deeper_errors: list # This is stuff we have to dig deeper for, e.g. missing parent-child relationship + +def validate( + unvalidated_lp: UnvalidatedLearningPackageInput, +) -> ValidatedLearningPackageInput: + """ """ + validated = CompletePackageInputData.model_validate(unvalidated_lp.raw_data) + # pretty_print(validated) + + return ValidatedLearningPackageInput( + data=validated, + fs=unvalidated_lp.fs, + structural_errors=[], + deeper_errors=[], + ) \ No newline at end of file diff --git a/src/openedx_content/management/commands/encode.py b/src/openedx_content/management/commands/encode.py new file mode 100644 index 000000000..84171c9fb --- /dev/null +++ b/src/openedx_content/management/commands/encode.py @@ -0,0 +1,98 @@ +from datetime import datetime, timezone +import logging + +from django.core.management import CommandError +from django.core.management.base import BaseCommand + +logger = logging.getLogger(__name__) + +import json + + +from pydantic import BaseModel, Field +from pydantic.config import ConfigDict +from pydantic.json_schema import models_json_schema + +from typing import Annotated, Optional + + +class EntityVersion(BaseModel): + version_num: int + title: str + +class VersionRef(BaseModel): + version_num: Optional[int] = None + +class Entity(BaseModel): + can_stand_alone: bool + key: str + created: datetime + draft: VersionRef + published: VersionRef + versions: list[EntityVersion] + +class EntityRoot(BaseModel): + entity: Entity + +from openedx_content.applets.backup_restore.schema import ( + LearningPackageOutputData, PackageConfigOutputData, MetaOutputData +) + +class Command(BaseCommand): + """ + Django management command to export a learning package to a zip file. + """ + help = 'Export a learning package to a zip file.' 
+ + def add_arguments(self, parser): + pass + + def handle(self, *args, **options): + now = datetime.now(tz=timezone.utc) + config = PackageConfigOutputData( + meta=MetaOutputData( + format_version=1, + created_by="dave", + created_by_email="dave@axim.org", + created_at=now, + ), + learning_package=LearningPackageOutputData( + title="Fun Library", + key="lib:Axim:FunLib", + description="", + created=now, + updated=now, + origin_server="studio.local.openedx.io:8001", + ) + ) + toml_output = tomli_w.dumps(config.model_dump(exclude_defaults=False)) + print(toml_output) + + print(json.dumps(PackageConfigOutputData.model_json_schema(), indent=2)) + + + def handle_old(self, *args, **options): + e = Entity( + can_stand_alone=True, + key="xblock.v1:html:hi-there-9d01929cda81", + created=datetime.now(tz=timezone.utc), + draft=VersionRef(version_num=3), + published=VersionRef(version_num=None), + versions = [ + EntityVersion( + version_num=x, + title=f"Title {x}", + ) + for x in range(10) + ], + ) + base = EntityRoot( + entity=e, + ) + + #toml_output = tomli_w.dumps(base.model_dump(exclude_defaults=True)) + #print(toml_output) + + + print(json.dumps(EntityRoot.model_json_schema(), indent=2)) + diff --git a/src/openedx_content/management/commands/lp_load2.py b/src/openedx_content/management/commands/lp_load2.py new file mode 100644 index 000000000..f23592b09 --- /dev/null +++ b/src/openedx_content/management/commands/lp_load2.py @@ -0,0 +1,67 @@ +""" +Django management commands to handle restore learning packages (WIP) +""" +import logging +import time + +from django.contrib.auth import get_user_model +from django.core.management import CommandError +from django.core.management.base import BaseCommand + +from openedx_content.applets.backup_restore.api import load_learning_package + +logger = logging.getLogger(__name__) + +User = get_user_model() + + + +class Command(BaseCommand): + """ + Django management command to load a Learning Package. 
+ """ + help = 'Load a learning package from a zip file.' + + def add_arguments(self, parser): + parser.add_argument('path', type=str, help='The path of the directory or file to load from.') + parser.add_argument('package_ref', type=str, help="Learning Package Ref: often a v2 library key.") + parser.add_argument('username', type=str, help='The username of the user performing the load operation.') + + + def handle(self, *args, **options): + path = options['path'] + package_ref = options['package_ref'] + username = options['username'] + + user = User.objects.get(username=username) + + load_learning_package(path, user=user, package_ref=package_ref) + + return 0 + if not path.lower().endswith(".zip"): + raise CommandError("Input file name must end with .zip") + try: + start_time = time.time() + # Get the user performing the operation + user = User.objects.get(username=username) + + result = load_learning_package(path, user=user) + duration = time.time() - start_time + if result["status"] == "error": + message = "Errors encountered during restore:\n" + log_buffer = result.get("log_file_error") + if log_buffer: + message += log_buffer.getvalue() + raise CommandError(message) + message = f'{path} loaded successfully (duration: {duration:.2f} seconds)' + self.stdout.write(self.style.SUCCESS(message)) + except FileNotFoundError as exc: + message = f"Learning package file {path} not found: {exc}" + raise CommandError(message) from exc + except Exception as e: + message = f"Failed to load '{path}': {e}" + logger.exception( + "Failed to load zip file %s ", + path, + ) + raise CommandError(message) from e diff --git a/tests/openedx_content/applets/backup_restore/payload_test_data/collections/broken.toml b/tests/openedx_content/applets/backup_restore/payload_test_data/collections/broken.toml new file mode 100644 index 000000000..e69de29bb diff --git a/tests/openedx_content/applets/backup_restore/payload_test_data/collections/dupe_1.toml 
b/tests/openedx_content/applets/backup_restore/payload_test_data/collections/dupe_1.toml new file mode 100644 index 000000000..e69de29bb diff --git a/tests/openedx_content/applets/backup_restore/payload_test_data/collections/dupe_2.toml b/tests/openedx_content/applets/backup_restore/payload_test_data/collections/dupe_2.toml new file mode 100644 index 000000000..e69de29bb diff --git a/tests/openedx_content/applets/backup_restore/payload_test_data/collections/fields_not_in_table.toml b/tests/openedx_content/applets/backup_restore/payload_test_data/collections/fields_not_in_table.toml new file mode 100644 index 000000000..e69de29bb diff --git a/tests/openedx_content/applets/backup_restore/payload_test_data/collections/missing_collection_table.toml b/tests/openedx_content/applets/backup_restore/payload_test_data/collections/missing_collection_table.toml new file mode 100644 index 000000000..e69de29bb diff --git a/tests/openedx_content/applets/backup_restore/payload_test_data/entities/broken.toml b/tests/openedx_content/applets/backup_restore/payload_test_data/entities/broken.toml new file mode 100644 index 000000000..6537aa087 --- /dev/null +++ b/tests/openedx_content/applets/backup_restore/payload_test_data/entities/broken.toml @@ -0,0 +1,2 @@ +[entity] +key = " diff --git a/tests/openedx_content/applets/backup_restore/payload_test_data/entities/dupe_1.toml b/tests/openedx_content/applets/backup_restore/payload_test_data/entities/dupe_1.toml new file mode 100644 index 000000000..e332af3b5 --- /dev/null +++ b/tests/openedx_content/applets/backup_restore/payload_test_data/entities/dupe_1.toml @@ -0,0 +1,4 @@ +[entity] +can_stand_alone = true +key = "dupe-key" +created = 2025-10-31T16:41:57.691331Z \ No newline at end of file diff --git a/tests/openedx_content/applets/backup_restore/payload_test_data/entities/dupe_2.toml b/tests/openedx_content/applets/backup_restore/payload_test_data/entities/dupe_2.toml new file mode 100644 index 000000000..e332af3b5 --- /dev/null +++ 
b/tests/openedx_content/applets/backup_restore/payload_test_data/entities/dupe_2.toml @@ -0,0 +1,4 @@ +[entity] +can_stand_alone = true +key = "dupe-key" +created = 2025-10-31T16:41:57.691331Z \ No newline at end of file diff --git a/tests/openedx_content/applets/backup_restore/payload_test_data/entities/missing_entity_key.toml b/tests/openedx_content/applets/backup_restore/payload_test_data/entities/missing_entity_key.toml new file mode 100644 index 000000000..ffee075be --- /dev/null +++ b/tests/openedx_content/applets/backup_restore/payload_test_data/entities/missing_entity_key.toml @@ -0,0 +1,10 @@ +# No [entity] key +[entity] +can_stand_alone = true +created = 2025-10-31T16:42:04.158245Z + +# ### Versions + +[[version]] +title = "Text" +version_num = 3 diff --git a/tests/openedx_content/applets/backup_restore/payload_test_data/entities/missing_entity_table.toml b/tests/openedx_content/applets/backup_restore/payload_test_data/entities/missing_entity_table.toml new file mode 100644 index 000000000..9451c4173 --- /dev/null +++ b/tests/openedx_content/applets/backup_restore/payload_test_data/entities/missing_entity_table.toml @@ -0,0 +1,11 @@ +# Typo: [entitty] instead of [entity] +[entitty] +can_stand_alone = true +key = "xblock.v1:html:9f221fc4-42f1-4d07-ada4-653409bc5fff" +created = 2025-10-31T16:42:04.158245Z + +# ### Versions + +[[version]] +title = "Text" +version_num = 3 diff --git a/tests/openedx_content/applets/backup_restore/payload_test_data/entities/missing_versions.toml b/tests/openedx_content/applets/backup_restore/payload_test_data/entities/missing_versions.toml new file mode 100644 index 000000000..e69de29bb diff --git a/tests/openedx_content/applets/backup_restore/payload_test_data/entities/normal_component.toml b/tests/openedx_content/applets/backup_restore/payload_test_data/entities/normal_component.toml new file mode 100644 index 000000000..e69de29bb diff --git 
a/tests/openedx_content/applets/backup_restore/payload_test_data/entities/normal_container.toml b/tests/openedx_content/applets/backup_restore/payload_test_data/entities/normal_container.toml new file mode 100644 index 000000000..39ff085ff --- /dev/null +++ b/tests/openedx_content/applets/backup_restore/payload_test_data/entities/normal_container.toml @@ -0,0 +1,29 @@ +# Pretty normal Section with a couple of child Subsections +[entity] +can_stand_alone = true +key = "section-9-ac4b9f" +created = 2026-04-08T15:22:12.780012Z + +[entity.draft] +version_num = 2 + +[entity.published] +version_num = 1 + +[entity.container.section] + +# ### Versions + +[[version]] +title = "Section 9" +version_num = 2 + +[version.container] +children = ["week-7-e73782", "subsection-001-e4bbe5"] + +[[version]] +title = "Section 9" +version_num = 1 + +[version.container] +children = ["week-7-e73782"] diff --git a/tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/broken.toml b/tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/broken.toml new file mode 100644 index 000000000..09db43a4a --- /dev/null +++ b/tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/broken.toml @@ -0,0 +1,3 @@ +# This is just malformed TOML. +[meta] +format_version = diff --git a/tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/fields_not_in_table.toml b/tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/fields_not_in_table.toml new file mode 100644 index 000000000..2b05c95ab --- /dev/null +++ b/tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/fields_not_in_table.toml @@ -0,0 +1,15 @@ +# The problem here is that created_by and formt_version are not in a table, and +# we don't allow that. 
+created_by = "eddy" +format_version = 1 + +[meta] +created_at = 2026-03-11T19:20:20.394360Z +origin_server = "studio.local.openedx.io" + +[learning_package] +title = "Fun Library" +key = "lib:Axim:FunLib" +description = "My very fun library! 🐢" +created = 2026-02-11T16:32:47.524556Z +updated = 2026-02-20T16:32:47.524556Z \ No newline at end of file diff --git a/tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/minimal.toml b/tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/minimal.toml new file mode 100644 index 000000000..4874bb574 --- /dev/null +++ b/tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/minimal.toml @@ -0,0 +1,9 @@ +# This is the absolute minimum necessary to pass the payload extraction step. +# It doesn't matter that required fields are missing --> that's handled in +# validation. Payload extraction just requires that the tables we expect exist. +# +# Ordering is also irrelevant for this file. +[learning_package] + +[meta] +format_version = 1 diff --git a/tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/missing_learning_package.toml b/tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/missing_learning_package.toml new file mode 100644 index 000000000..03e087cf7 --- /dev/null +++ b/tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/missing_learning_package.toml @@ -0,0 +1,3 @@ +# This errors because the [learning_package] section is missing. 
+[meta] +format_version = 1 diff --git a/tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/missing_meta.toml b/tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/missing_meta.toml new file mode 100644 index 000000000..95c65a9dd --- /dev/null +++ b/tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/missing_meta.toml @@ -0,0 +1,4 @@ +# This errors because the [meta] section is missing. + +[learning_package] +title = "Fun Library" diff --git a/tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/normal_ulmo_v1.toml b/tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/normal_ulmo_v1.toml new file mode 100644 index 000000000..5ff8122fe --- /dev/null +++ b/tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/normal_ulmo_v1.toml @@ -0,0 +1,15 @@ +# This is a fully specified package TOML file with no errors, like we would +# expect from an Ulmo instance. +[meta] +format_version = 1 +created_by = "eddy" +created_by_email = "eddy@axim.org" +created_at = 2026-03-11T19:20:20.394360Z +origin_server = "studio.local.openedx.io" + +[learning_package] +title = "Fun Library" +key = "lib:Axim:FunLib" +description = "My very fun library! 🐢" +created = 2026-02-11T16:32:47.524556Z +updated = 2026-02-20T16:32:47.524556Z \ No newline at end of file diff --git a/tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/unknown_table.toml b/tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/unknown_table.toml new file mode 100644 index 000000000..36d1161ea --- /dev/null +++ b/tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/unknown_table.toml @@ -0,0 +1,10 @@ +# This is a fully specified package TOML file with no errors, like we would +# expect from an Ulmo instance. 
+[meta] +format_version = 1 + +[learning_package] +title = "Fun Library" + +[unknown] +new_field = "Aloha!" diff --git a/tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/unsupported_format_version_1_1.toml b/tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/unsupported_format_version_1_1.toml new file mode 100644 index 000000000..bbb1d49e2 --- /dev/null +++ b/tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/unsupported_format_version_1_1.toml @@ -0,0 +1,7 @@ +# We're going to accept format_version 1.1 as a hedge against backwards +# compatible additions to this format (we may never use this). +[meta] +format_version = 1.1 + +[learning_package] +title = "Fun Library" diff --git a/tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/unsupported_format_version_2.toml b/tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/unsupported_format_version_2.toml new file mode 100644 index 000000000..dc2227b41 --- /dev/null +++ b/tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/unsupported_format_version_2.toml @@ -0,0 +1,6 @@ +# We don't support format_version > 1 +[meta] +format_version = 2 + +[learning_package] +title = "Fun Library" diff --git a/tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/unsupported_format_version_b.toml b/tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/unsupported_format_version_b.toml new file mode 100644 index 000000000..8a9df5dc2 --- /dev/null +++ b/tests/openedx_content/applets/backup_restore/payload_test_data/root_packages/unsupported_format_version_b.toml @@ -0,0 +1,6 @@ +# format_version needs to be an integer +[meta] +format_version = "b" + +[learning_package] +title = "Fun Library" diff --git a/tests/openedx_content/applets/backup_restore/test_payload.py b/tests/openedx_content/applets/backup_restore/test_payload.py new file 
mode 100644 index 000000000..71c80fefb --- /dev/null +++ b/tests/openedx_content/applets/backup_restore/test_payload.py @@ -0,0 +1,229 @@ +""" +IMPORTANT: If you are adding new fields/behaviors, they should take the form of +*new* tests on new test data files, and not modifications to existing ones. +Please be very cautious about whether you are breaking backwards compatibility. + +This module tests our ability to extract data from the backup archive TOML files +and resources, and assemble them into a combined document that represents the +entire LearningPackage, and is encapsulated in UnvalidatedLearningPackageInput. +Most of these are test functions that examine individual files. The functions in +payload.py were designed to mostly accept an AbstractFileSystem and path as +arguments, so it should be possible to do simple test calls on TOML files and +dirs without having to mock anything. + +These tests are strictly for the payload module, and therefore don't need Django +to run. +""" + +from datetime import datetime, timezone +from pathlib import Path +from unittest import TestCase, skip + +from fsspec.implementations.dirfs import DirFileSystem + +from openedx_content.applets.backup_restore import payload + + +TEST_DATA_ROOT = Path(__file__).parent / "payload_test_data" + + +class ExtractRootPackageFileTest(TestCase): + @classmethod + def setUpClass(cls): + super().setUpClass() + cls.fs = DirFileSystem(TEST_DATA_ROOT / "root_packages") + + @classmethod + def tearDownClass(cls): + del cls.fs + super().tearDownClass() + + def test_file_not_found(self): + with self.assertRaises(payload.FileNotFoundError) as err: + payload.extract_root_package_data(self.fs, "does_not_exist.toml") + assert err.path == "does_not_exist.toml" + + def test_broken_toml(self): + with self.assertRaises(payload.InvalidTOMLError) as err: + payload.extract_root_package_data(self.fs, "broken.toml") + assert err.path == "broken.toml" + + def test_fields_not_in_table(self): + with
self.assertRaises(payload.FieldsNotInTable) as err: + payload.extract_root_package_data(self.fs, "fields_not_in_table.toml") + assert err.path == "fields_not_in_table.toml" + assert err.fields == ["created_by", "format_version"] + + def test_missing_meta_table(self): + with self.assertRaises(payload.TableNotFoundError) as err: + payload.extract_root_package_data(self.fs, "missing_meta.toml") + assert err.path == "missing_meta.toml" + assert err.table == "meta" + assert "[meta]" in str(err) + + def test_missing_learning_package_table(self): + with self.assertRaises(payload.TableNotFoundError) as err: + payload.extract_root_package_data(self.fs, "missing_learning_package.toml") + assert err.path == "missing_learning_package.toml" + assert err.table == "learning_package" + assert "[learning_package]" in str(err) + + def test_unsupported_format_version(self): + # We don't support format_version=2 + with self.assertRaises(payload.UnsupportedFormatError) as err: + payload.extract_root_package_data( + self.fs, "unsupported_format_version_2.toml" + ) + # We don't support format_version as anything other than a number + with self.assertRaises(payload.UnsupportedFormatError) as err: + payload.extract_root_package_data( + self.fs, "unsupported_format_version_b.toml" + ) + + # We will allow format_version 1.x though, in case we want to extend our + # format in a fully backwards compatible way.
+ root_data = payload.extract_root_package_data( + self.fs, "unsupported_format_version_1_1.toml" + ) + assert root_data["meta"]["format_version"] == 1.1 + + def test_ignore_unknown_tables(self): + """Allow for forwards compatibility.""" + assert "unknown" in payload.extract_root_package_data( + self.fs, "unknown_table.toml" + ) + + def test_minimal(self): + data = payload.extract_root_package_data(self.fs, "minimal.toml") + assert data == { + "meta": { + "format_version": 1, + }, + "learning_package": {}, + } + + def test_normal(self): + data = payload.extract_root_package_data(self.fs, "normal_ulmo_v1.toml") + assert data == { + "meta": { + "format_version": 1, + "created_by": "eddy", + "created_by_email": "eddy@axim.org", + "created_at": datetime( + 2026, 3, 11, 19, 20, 20, 394360, tzinfo=timezone.utc + ), + "origin_server": "studio.local.openedx.io", + }, + "learning_package": { + "title": "Fun Library", + "key": "lib:Axim:FunLib", + "description": "My very fun library! 🐢", + "created": datetime( + 2026, 2, 11, 16, 32, 47, 524556, tzinfo=timezone.utc + ), + "updated": datetime( + 2026, 2, 20, 16, 32, 47, 524556, tzinfo=timezone.utc + ), + }, + } + + +class ExtractEntityDataTest(TestCase): + @classmethod + def setUpClass(cls): + super().setUpClass() + cls.fs = DirFileSystem(TEST_DATA_ROOT / "entities") + + @classmethod + def tearDownClass(cls): + del cls.fs + super().tearDownClass() + + def test_broken_toml(self): + with self.assertRaises(payload.InvalidTOMLError) as err: + payload.extract_entity_data(self.fs, "broken.toml") + + def test_missing_entity_table(self): + with self.assertRaises(payload.TableNotFoundError) as err: + payload.extract_entity_data(self.fs, "missing_entity_table.toml") + assert err.path == "missing_entity_table.toml" + assert err.table == "entity" + assert "[entity]" in str(err) + + def test_missing_entity_key(self): + with self.assertRaises(payload.FieldMissing) as err: + payload.extract_entity_data(self.fs, "missing_entity_key.toml") + 
assert err.missing_field == "key" + assert err.table == "entity" + + def test_dupes(self): + """ + Test for duplicate entities. + + If we didn't explicitly check for this, a second file defining the same + entity would just overwrite + the other, which would confuse people who might be assembling an archive + file for restoring. + + This test is different from the others because extract_entities_data + doesn't raise exceptions, it collects them from its calls to + extract_entity_data(). + """ + paths = ["dupe_1.toml", "dupe_2.toml"] + data, _path_mapping, errors = payload.extract_entities_data(self.fs, paths) + assert "dupe-key" in data  # The first one should have succeeded... + assert len(data) == 1  # but the duplicate never made it in. + assert len(errors) == 1  # There should be only one error. + + error = errors[0] + assert error.original_path == "dupe_1.toml"  # path of the original + assert error.path == "dupe_2.toml"  # path where error was marked + + @skip + def test_ignore_unknown_tables(self): + # assert "unknown" in payload.extract_root_package_data(self.fs, "unknown_table.toml") + pass + + @skip + def test_normal_component(self): + pass + + def test_normal_container(self): + ref, data = payload.extract_entity_data(self.fs, "normal_container.toml") + assert ref == "section-9-ac4b9f" + assert data == { + 'can_stand_alone': True, + 'created': datetime(2026, 4, 8, 15, 22, 12, 780012, tzinfo=timezone.utc), + 'draft': { + 'version_num': 2 + }, + 'published': { + 'version_num': 1 + }, + 'container': { + 'section': {} + }, + 'versions': [ + { + 'title': 'Section 9', + 'version_num': 2, + 'container': { + 'children': [ + 'week-7-e73782', + 'subsection-001-e4bbe5' + ] + } + }, + { + 'title': 'Section 9', + 'version_num': 1, + 'container': { + 'children': [ + 'week-7-e73782' + ] + } + } + ] + } + + diff --git a/tests/openedx_content/applets/backup_restore/test_restore.py b/tests/openedx_content/applets/backup_restore/test_restore.py index
0116731c4..2a25c9787 100644 --- a/tests/openedx_content/applets/backup_restore/test_restore.py +++ b/tests/openedx_content/applets/backup_restore/test_restore.py @@ -285,7 +285,7 @@ def test_error_learning_package_missing_key(self): # Mock parse_learning_package_toml to return a dict without 'key' with patch( - "openedx_content.applets.backup_restore.zipper.parse_learning_package_toml", + "openedx_content.applets.backup_restore.zipper.LearningPackageUnzipper.extract_root_package_data", return_value={ "learning_package": { "title": "Library test", @@ -316,7 +316,7 @@ def test_error_no_metadata_section(self): # Mock parse_learning_package_toml to return a dict without 'meta' with patch( - "openedx_content.applets.backup_restore.zipper.parse_learning_package_toml", + "openedx_content.applets.backup_restore.zipper.LearningPackageUnzipper.extract_root_package_data", return_value={ "learning_package": { "title": "Library test", @@ -354,6 +354,14 @@ def test_success_metadata_using_user_context(self): assert metadata == expected_metadata +from textwrap import dedent + +class RestoreV2TestCase(RestoreTestCase): + + def test_package_toml_parsing(self): + pass + + class RestoreUtilitiesTest(TestCase): """Tests for utility functions used in the restore process."""