# Canonical CMIP6-CV ``experiment`` labels resolved via esgvoc, keyed by
# experiment_id. ``None`` records that esgvoc was consulted and yielded no
# usable label. esgvoc lookups touch a database, so each experiment is
# resolved at most once per process. (Class attribute of CMIP6Vocabulary.)
_EXPERIMENT_LABEL_CACHE: Dict[str, Optional[str]] = {}


def _resolve_experiment_label(self) -> str:
    """Return the canonical ``experiment`` global-attribute value.

    The WCRP compliance checker (cc-plugin-wcrp + esgvoc) compares the
    global ``experiment`` attribute against esgvoc's CMIP6 controlled
    vocabulary, whose label (e.g. ``"Historical simulation"``) differs from
    the descriptive phrase carried in the legacy CMIP6_CVs JSON bundled with
    this package (e.g. ``"all-forcing simulation of the recent past"``).

    The label is resolved from esgvoc so the written attribute matches what
    the checker validates. The bundled CV value is used instead when esgvoc
    is not installed, when the lookup fails, or when esgvoc carries no usable
    (non-empty string) label for this experiment -- in the latter case the
    checker skips the comparison, so the legacy value is accepted.

    The outcome of the lookup, including a failed one, is cached per
    ``experiment_id`` on ``CMIP6Vocabulary`` for the lifetime of the process.

    Returns:
        The esgvoc label when one is available, otherwise the bundled CV
        ``experiment`` value (``""`` if the bundled entry has none).
    """
    legacy_label = self.experiment.get("experiment", "")
    eid = self.experiment_id
    cache = CMIP6Vocabulary._EXPERIMENT_LABEL_CACHE

    if eid not in cache:
        label: Optional[str] = None
        try:
            import esgvoc.api as voc

            term = voc.get_term_in_collection(
                project_id="cmip6",
                collection_id="experiment_id",
                term_id=eid,
            )
            # getattr tolerates both an unknown experiment (term is None)
            # and a term without an ``experiment`` field.
            candidate = getattr(term, "experiment", None)
        except ImportError:
            # esgvoc is an optional dependency; its absence is not an error.
            pass
        except Exception:
            # Stay best-effort, but leave a trace: from here on the file is
            # written with the legacy label, which the checker may reject.
            import logging

            logging.getLogger(__name__).warning(
                "esgvoc lookup for experiment_id %r failed; falling back to "
                "the bundled CMIP6 CV 'experiment' value",
                eid,
                exc_info=True,
            )
        else:
            # Accept only a non-blank string. The value is stored untouched
            # so it stays identical to what the checker reads from esgvoc.
            if isinstance(candidate, str) and candidate.strip():
                label = candidate
        cache[eid] = label

    return cache[eid] or legacy_label
# ---------------------------------------------------------------------------
# _resolve_experiment_label: canonical CMIP6 `experiment` global attribute
# (see issue-experiment-attribute-cv-mismatch). The WCRP checker compares the
# file's `experiment` against esgvoc's label, which differs from the legacy
# CMIP6_CVs phrase bundled with this package.
# ---------------------------------------------------------------------------
def _fake_esgvoc_module(term):
    """Build a stand-in ``esgvoc.api`` module returning ``term`` from any lookup."""
    api = SimpleNamespace(get_term_in_collection=lambda **kwargs: term)
    return {"esgvoc": SimpleNamespace(api=api), "esgvoc.api": api}


@pytest.fixture
def isolated_experiment_label_cache():
    """Run a test against an empty label cache and restore it afterwards.

    The cache is a process-wide class attribute; without restoring it, the
    fake labels written by these tests would leak into whichever tests run
    next in the same session.
    """
    with patch.dict(CMIP6Vocabulary._EXPERIMENT_LABEL_CACHE, clear=True):
        yield


@pytest.mark.unit
@pytest.mark.usefixtures("isolated_experiment_label_cache")
def test_resolve_experiment_label_uses_esgvoc(vocabulary_instance):
    """esgvoc's canonical label overrides the legacy CV description."""
    vocabulary_instance.experiment_id = "historical"
    vocabulary_instance.experiment = {
        "experiment": "all-forcing simulation of the recent past"
    }
    term = SimpleNamespace(experiment="Historical simulation")
    with patch.dict(sys.modules, _fake_esgvoc_module(term)):
        assert (
            vocabulary_instance._resolve_experiment_label() == "Historical simulation"
        )


@pytest.mark.unit
@pytest.mark.usefixtures("isolated_experiment_label_cache")
def test_resolve_experiment_label_falls_back_when_esgvoc_label_empty(
    vocabulary_instance,
):
    """When esgvoc has no label for the experiment, keep the legacy CV value."""
    vocabulary_instance.experiment_id = "piControl"
    vocabulary_instance.experiment = {"experiment": "pre-industrial control"}
    term = SimpleNamespace(experiment=None)
    with patch.dict(sys.modules, _fake_esgvoc_module(term)):
        assert (
            vocabulary_instance._resolve_experiment_label() == "pre-industrial control"
        )


@pytest.mark.unit
@pytest.mark.usefixtures("isolated_experiment_label_cache")
def test_resolve_experiment_label_falls_back_when_esgvoc_missing(vocabulary_instance):
    """Without esgvoc installed, fall back to the legacy CV value."""
    vocabulary_instance.experiment_id = "historical"
    vocabulary_instance.experiment = {
        "experiment": "all-forcing simulation of the recent past"
    }
    # A ``None`` entry in sys.modules makes `import esgvoc.api` raise ImportError.
    with patch.dict(sys.modules, {"esgvoc": None, "esgvoc.api": None}):
        assert (
            vocabulary_instance._resolve_experiment_label()
            == "all-forcing simulation of the recent past"
        )


@pytest.mark.unit
@pytest.mark.usefixtures("isolated_experiment_label_cache")
def test_resolve_experiment_label_falls_back_when_lookup_raises(vocabulary_instance):
    """A failing esgvoc lookup must not propagate; the legacy value is used."""
    vocabulary_instance.experiment_id = "historical"
    vocabulary_instance.experiment = {"experiment": "legacy"}

    def _failing_lookup(**kwargs):
        raise RuntimeError("esgvoc database unavailable")

    api = SimpleNamespace(get_term_in_collection=_failing_lookup)
    fake = {"esgvoc": SimpleNamespace(api=api), "esgvoc.api": api}
    with patch.dict(sys.modules, fake):
        assert vocabulary_instance._resolve_experiment_label() == "legacy"


@pytest.mark.unit
@pytest.mark.usefixtures("isolated_experiment_label_cache")
def test_resolve_experiment_label_is_cached(vocabulary_instance):
    """The esgvoc lookup is performed at most once per experiment_id."""
    vocabulary_instance.experiment_id = "historical"
    vocabulary_instance.experiment = {"experiment": "legacy"}
    calls = {"n": 0}

    def _counting_lookup(**kwargs):
        calls["n"] += 1
        return SimpleNamespace(experiment="Historical simulation")

    api = SimpleNamespace(get_term_in_collection=_counting_lookup)
    fake = {"esgvoc": SimpleNamespace(api=api), "esgvoc.api": api}
    with patch.dict(sys.modules, fake):
        first = vocabulary_instance._resolve_experiment_label()
        second = vocabulary_instance._resolve_experiment_label()

    assert first == second == "Historical simulation"
    assert calls["n"] == 1