From cfabab6034430dc46351a1476abc6cb1c4669932 Mon Sep 17 00:00:00 2001 From: rhaegar325 Date: Fri, 26 Jun 2026 12:53:04 +1000 Subject: [PATCH 1/4] fix a couple of issues related to Oyr data cmorisation --- src/access_moppy/base.py | 27 +++++ src/access_moppy/driver.py | 13 ++- src/access_moppy/ocean.py | 12 +++ src/access_moppy/utilities.py | 122 ++++++++++++++++++++-- src/access_moppy/vocabulary_processors.py | 4 + tests/unit/test_utilities.py | 120 +++++++++++++++++++++ tests/unit/test_vocabulary_processors.py | 24 +++++ 7 files changed, 313 insertions(+), 9 deletions(-) diff --git a/src/access_moppy/base.py b/src/access_moppy/base.py index 7e48f806..7d163d50 100644 --- a/src/access_moppy/base.py +++ b/src/access_moppy/base.py @@ -22,6 +22,7 @@ calculate_longitude_bounds, calculate_time_bounds, normalize_cf_time_units, + parse_cmip6_table_frequency, type_mapping, validate_and_resample_if_needed, validate_cmip6_frequency_compatibility, @@ -596,6 +597,29 @@ def rechunk_dataset(self): else: logger.debug("No dataset loaded, cannot rechunk") + def _target_frequency_hint(self): + """Map the CMOR table's target frequency to a coarse label + ("daily"/"monthly"/"yearly") for time-bounds construction. + + Used only as a fallback when the time axis has a single point and the + frequency cannot be inferred from point spacing. Returns None when the + frequency is not determinable or is sub-daily. 
+ """ + if not self.compound_name: + return None + try: + target = parse_cmip6_table_frequency(self.compound_name) + except Exception: + return None + days = target.total_seconds() / 86400 + if 0.9 <= days <= 1.1: + return "daily" + if 28 <= days <= 31: + return "monthly" + if 360 <= days <= 366: + return "yearly" + return None + def calculate_missing_bounds_variables(self, bnds_required): """Calculate missing bounds variables for coordinates.""" for bnds_var in bnds_required: @@ -622,6 +646,9 @@ def calculate_missing_bounds_variables(self, bnds_required): self.ds, time_coord=coord_name, bnds_name="bnds", # Atmosphere uses "bnds" + # Fallback for a single time point (e.g. one resampled + # year) where the frequency cannot be inferred. + freq_hint=self._target_frequency_hint(), ) elif coord_name in ["lat", "latitude", "y"]: diff --git a/src/access_moppy/driver.py b/src/access_moppy/driver.py index 68d3d7cc..65ce29c1 100644 --- a/src/access_moppy/driver.py +++ b/src/access_moppy/driver.py @@ -120,7 +120,7 @@ def __init__( parent_info: dict[str, dict[str, Any]] | None = None, model_id: str | None = None, validate_frequency: bool = True, - enable_resampling: bool = False, + enable_resampling: bool = True, enable_chunking: bool = False, resampling_method: str = "auto", input_folder: str | Path | None = None, @@ -157,7 +157,10 @@ def __init__( validate_frequency: Validate temporal frequency consistency across file inputs. This is disabled automatically for xarray inputs. enable_resampling: Enable automatic temporal resampling when - frequency mismatches are detected. + frequency mismatches are detected. Defaults to ``True``; + resampling is a no-op when the input already matches the target + frequency, and only triggers on a genuine mismatch (e.g. monthly + input for an ``Oyr`` table). Pass ``False`` to disable. enable_chunking: Enable dask chunking in supported component CMORisers. 
resampling_method: Temporal resampling method: ``"auto"``, @@ -515,6 +518,9 @@ def __init__( vocab=self.vocab, variable_mapping=self.variable_mapping.to_dict(), drs_root=drs_root if drs_root else None, + validate_frequency=self.validate_frequency, + enable_resampling=self.enable_resampling, + resampling_method=self.resampling_method, ) else: # ACCESS-OM2 uses MOM5 (B-grid) — handled by a separate CMORiser class @@ -528,6 +534,9 @@ def __init__( vocab=self.vocab, variable_mapping=self.variable_mapping.to_dict(), drs_root=drs_root if drs_root else None, + validate_frequency=self.validate_frequency, + enable_resampling=self.enable_resampling, + resampling_method=self.resampling_method, ) elif table in ("SImon", "SIday") or table.startswith(_mip_seaice_prefixes): self.cmoriser = SeaIce_CMORiser( diff --git a/src/access_moppy/ocean.py b/src/access_moppy/ocean.py index 51d5dc23..a685d3e0 100644 --- a/src/access_moppy/ocean.py +++ b/src/access_moppy/ocean.py @@ -334,6 +334,9 @@ def __init__( vocab: CMIP6Vocabulary, variable_mapping: Dict[str, Any], drs_root: Optional[Path] = None, + validate_frequency: bool = True, + enable_resampling: bool = False, + resampling_method: str = "auto", # Backward compatibility input_paths: Optional[Union[str, List[str]]] = None, ): @@ -345,6 +348,9 @@ def __init__( vocab=vocab, variable_mapping=variable_mapping, drs_root=drs_root, + validate_frequency=validate_frequency, + enable_resampling=enable_resampling, + resampling_method=resampling_method, ) nominal_resolution = vocab._get_nominal_resolution(target_realm="ocean") @@ -406,6 +412,9 @@ def __init__( vocab: CMIP6Vocabulary, variable_mapping: Dict[str, Any], drs_root: Optional[Path] = None, + validate_frequency: bool = True, + enable_resampling: bool = False, + resampling_method: str = "auto", # Backward compatibility input_paths: Optional[Union[str, List[str]]] = None, ): @@ -417,6 +426,9 @@ def __init__( vocab=vocab, variable_mapping=variable_mapping, drs_root=drs_root, + 
validate_frequency=validate_frequency, + enable_resampling=enable_resampling, + resampling_method=resampling_method, ) nominal_resolution = vocab._get_nominal_resolution(target_realm="ocean") diff --git a/src/access_moppy/utilities.py b/src/access_moppy/utilities.py index 3207f5b7..36a7cbf0 100644 --- a/src/access_moppy/utilities.py +++ b/src/access_moppy/utilities.py @@ -1917,6 +1917,55 @@ def get_resampling_frequency_string(target_freq: pd.Timedelta) -> str: return f"{int(years)}YE" +def _normalise_calendar_name(calendar: Optional[str]) -> Optional[str]: + """Map the non-CF ``"GREGORIAN"`` label to ``"proleptic_gregorian"``. + + ACCESS-ESM1-5 files label their (proleptic-Gregorian) time axis with the + non-CF name ``"GREGORIAN"``; ``base.CMORiser._check_calendar`` rewrites this + to ``"proleptic_gregorian"`` on the written file. Date arithmetic for bounds + and resampling must use the same calendar — otherwise cftime treats + ``"GREGORIAN"`` as the mixed Julian/Gregorian ``"standard"`` calendar and + shifts pre-1582 dates by ~1 day. All other names are returned unchanged. + """ + return "proleptic_gregorian" if calendar == "GREGORIAN" else calendar + + +def _shift_resampled_time_to_period_midpoint( + time_da: xr.DataArray, target_freq: pd.Timedelta +) -> xr.DataArray: + """Move a resampled time coordinate from the period boundary to its midpoint. + + pandas/xarray ``resample`` labels each bin on the period boundary (e.g. the + yearly frequency "YE" lands every value on 31 December). The CMOR convention + is to centre the time coordinate on the averaging period — a yearly mean sits + on ~2 July (12:00 in a 365-day year, 00:00 in a 366-day year), the midpoint of + ``[Jan 1, next Jan 1]``. This recomputes the coordinate as that midpoint. + + Sub-daily or unrecognised frequencies are returned unchanged. 
+ """ + days = target_freq.total_seconds() / 86400 + if 360 <= days <= 366: + bounds_fn = _calculate_yearly_bounds + elif 28 <= days <= 31: + bounds_fn = _calculate_monthly_bounds + elif 0.9 <= days <= 1.1: + bounds_fn = _calculate_daily_bounds + else: + return time_da + + values = time_da.values + if values.size == 0: + return time_da + is_cftime = isinstance(values.flat[0], cftime.datetime) + calendar = time_da.attrs.get("calendar", "proleptic_gregorian") + + bounds = bounds_fn(values, calendar, is_cftime) + midpoints = np.array( + [lo + (hi - lo) / 2 for lo, hi in bounds], dtype=values.dtype + ) + return time_da.copy(data=midpoints) + + def resample_dataset_temporal( ds: xr.Dataset, target_freq: pd.Timedelta, @@ -1943,6 +1992,20 @@ def resample_dataset_temporal( f"Available coordinates: {sorted(ds.coords)}" ) + # xarray's resample requires a monotonic time index. Multi-file inputs supplied + # in non-chronological order (e.g. an unsorted glob) concatenate into a + # non-monotonic time axis, so sort here before resampling. + ds = ds.sortby(time_coord) + + # Normalise the non-CF "GREGORIAN" calendar label to "proleptic_gregorian" + # (the calendar the written file ultimately declares) before decoding, so the + # resampled values, the period midpoint and the restored encoding are all + # computed in that calendar rather than cftime's Julian "standard" reading. + if ds[time_coord].attrs.get("calendar") == "GREGORIAN": + ds = ds.assign_coords( + {time_coord: ds[time_coord].assign_attrs(calendar="proleptic_gregorian")} + ) + # Convert target frequency to resampling string freq_str = get_resampling_frequency_string(target_freq) @@ -2006,6 +2069,35 @@ def resample_dataset_temporal( if coord_name != time_coord: ds_resampled[coord_name] = ds[coord_name] + # Centre the resampled time coordinate on each period's midpoint + # (CMOR convention) instead of the period boundary that resample labels it + # with (e.g. yearly means on ~2 July rather than 31 December). 
+ ds_resampled[time_coord] = _shift_resampled_time_to_period_midpoint( + ds_resampled[time_coord], target_freq + ) + + # Restore the original CF time encoding. decode_cf() moved units/calendar + # off the coordinate (into encoding) for the resample, so without this the + # written file would have a time axis with no units — CF-invalid. Re-encode + # the (cftime) midpoints back to numeric using the input units/calendar and + # reattach the original attributes, matching the decode_cf=False pipeline. + orig_time_attrs = dict(ds[time_coord].attrs) + orig_units = orig_time_attrs.get("units", "") + if "since" in orig_units: + orig_calendar = orig_time_attrs.get("calendar", "standard") + time_vals = ds_resampled[time_coord].values + # decode_cf yields cftime for non-standard/pre-1582 calendars but + # numpy datetime64 otherwise; date2num needs cftime/datetime objects, + # so convert datetime64 to python datetimes first. + if time_vals.size and not isinstance(time_vals.flat[0], cftime.datetime): + time_vals = pd.to_datetime(time_vals).to_pydatetime() + numeric_time = date2num( + time_vals, units=orig_units, calendar=orig_calendar + ) + ds_resampled[time_coord] = xr.DataArray( + numeric_time, dims=[time_coord], attrs=orig_time_attrs + ) + # Update attributes ds_resampled.attrs = ds.attrs.copy() @@ -2165,7 +2257,10 @@ def normalize_cf_time_units(units: Optional[str]) -> Optional[str]: def calculate_time_bounds( - ds: xr.Dataset, time_coord: str = "time", bnds_name: str = "nv" + ds: xr.Dataset, + time_coord: str = "time", + bnds_name: str = "nv", + freq_hint: Optional[str] = None, ) -> xr.DataArray: """ Calculate time bounds from time coordinate for CMIP6 compliance. @@ -2186,6 +2281,12 @@ def calculate_time_bounds( bnds_name : str, default "nv" Name of the bounds dimension. 
Use "nv" for ocean data (default), or "bnds" for atmosphere data + freq_hint : str, optional + Frequency label ("daily", "monthly", "yearly") used only as a fallback + when the frequency cannot be inferred from the time axis itself — i.e. + when there is a single time point (e.g. a multi-year input resampled + down to one year). Inference from ≥2 points always takes precedence, so + this never changes existing multi-point behaviour. Returns ------- @@ -2205,14 +2306,13 @@ def calculate_time_bounds( time = ds[time_coord] n_times = len(time) - if n_times < 2: - raise ValueError("Need at least 2 time points to infer time bounds") - # Compute only the 1-D time coordinate. Using .compute().values (rather than # plain .values) ensures that only the time coordinate's dask graph is # triggered, not any larger graph that happens to reference the same chunks. time_values = time.compute().values - calendar = time.attrs.get("calendar", "proleptic_gregorian") + calendar = _normalise_calendar_name( + time.attrs.get("calendar", "proleptic_gregorian") + ) units = time.attrs.get("units") # Determine the type of time coordinate @@ -2236,8 +2336,16 @@ def calculate_time_bounds( ) is_cftime = True - # Try to infer frequency - freq = _infer_frequency(time_values) + # Infer frequency from the spacing of time points. When only a single time + # point is present (e.g. a multi-year input resampled down to one year), + # inference returns None; fall back to the caller-supplied frequency hint + # (derived from the CMOR table) so per-period bounds can still be built. + freq = _infer_frequency(time_values) or freq_hint + if freq is None: + raise ValueError( + "Need at least 2 time points to infer time bounds, or pass freq_hint " + "(e.g. 
'yearly') derived from the target table frequency" + ) # Initialize bounds array time_bnds = np.empty((n_times, 2), dtype=object if is_cftime else time_values.dtype) diff --git a/src/access_moppy/vocabulary_processors.py b/src/access_moppy/vocabulary_processors.py index 97301069..5cacab93 100644 --- a/src/access_moppy/vocabulary_processors.py +++ b/src/access_moppy/vocabulary_processors.py @@ -891,6 +891,7 @@ def generate_filename( table_lower = table_name.lower() is_subdaily_data = any(freq in table_lower for freq in ["3hr", "6hr", "hr"]) is_daily_data = "day" in table_lower + is_yearly_data = "yr" in table_lower # Format time range based on frequency if is_subdaily_data: @@ -902,6 +903,9 @@ def generate_filename( elif is_daily_data: # Daily data: include day (YYYYMMDD) start, end = [f"{t.year:04d}{t.month:02d}{t.day:02d}" for t in times] + elif is_yearly_data: + # Yearly data (e.g. Oyr): year only (YYYY) + start, end = [f"{t.year:04d}" for t in times] else: # Monthly or other data: year and month only (YYYYMM) start, end = [f"{t.year:04d}{t.month:02d}" for t in times] diff --git a/tests/unit/test_utilities.py b/tests/unit/test_utilities.py index 97cc2369..071242c7 100644 --- a/tests/unit/test_utilities.py +++ b/tests/unit/test_utilities.py @@ -26,6 +26,7 @@ detect_time_frequency_lazy, get_requested_variables_from_data_request, normalize_cf_time_units, + resample_dataset_temporal, ) @@ -46,6 +47,125 @@ def test_insufficient_time_points(self): with pytest.raises(ValueError, match="Need at least 2 time points"): calculate_time_bounds(ds) + def test_single_point_with_freq_hint(self): + """A single time point succeeds when a frequency hint is supplied + (e.g. 
a multi-year input resampled down to one year, where the + frequency cannot be inferred from point spacing).""" + ds = xr.Dataset(coords={"time": [np.datetime64("2000-06-15")]}) + + time_bnds = calculate_time_bounds(ds, freq_hint="yearly") + + assert time_bnds.shape == (1, 2) + assert time_bnds.values[0, 0] == np.datetime64("2000-01-01") + assert time_bnds.values[0, 1] == np.datetime64("2001-01-01") + + def test_gregorian_label_treated_as_proleptic(self): + """The non-CF "GREGORIAN" label (ACCESS-ESM1-5) must be computed as + proleptic_gregorian, matching _check_calendar's rewrite, so pre-1582 + bounds are not shifted by ~1 day into the Julian "standard" calendar. + + For year 0101, Jan 1 is day 36524 in proleptic_gregorian (36525 in + Julian/standard).""" + ds = xr.Dataset( + coords={ + "time": ( + "time", + [36706.5, 37071.5], # yearly midpoints, days since 0001-01-01 + {"units": "days since 0001-01-01", "calendar": "GREGORIAN"}, + ) + } + ) + + time_bnds = calculate_time_bounds(ds, freq_hint="yearly") + + # proleptic_gregorian year boundaries, not the Julian 36525/36890 + assert time_bnds.values[0, 0] == 36524.0 + assert time_bnds.values[0, 1] == 36889.0 + + +def test_normalise_calendar_name(): + """Only the non-CF "GREGORIAN" label is rewritten; other names pass through.""" + from access_moppy.utilities import _normalise_calendar_name + + assert _normalise_calendar_name("GREGORIAN") == "proleptic_gregorian" + assert _normalise_calendar_name("gregorian") == "gregorian" + assert _normalise_calendar_name("standard") == "standard" + assert _normalise_calendar_name("noleap") == "noleap" + assert _normalise_calendar_name(None) is None + + +class TestResampleTimeMidpoint: + """Resampling must centre the time coordinate on each period's midpoint + (CMOR convention), not the period boundary that resample() labels it with.""" + + def test_monthly_to_yearly_lands_on_midyear(self): + months = pd.date_range("1950-01-16", "1954-12-16", freq="MS") + pd.Timedelta( + days=15 + 
) + ds = xr.Dataset( + {"v": (["time"], np.arange(len(months), dtype="f4"))}, + coords={"time": ("time", months)}, + ) + + out = resample_dataset_temporal( + ds, pd.Timedelta(days=365), "v", "time", "auto" + ) + + times = pd.to_datetime(out["time"].values) + # Every yearly value sits on ~2 July, never on a year boundary. + assert all((t.month, t.day) == (7, 2) for t in times) + # 365-day years are centred at 12:00; the 366-day leap year (1952) at 00:00. + assert times[0].hour == 12 # 1950 (non-leap) + assert times[2].hour == 0 # 1952 (leap) + + def test_resample_handles_unsorted_time(self): + """Non-chronological multi-file inputs concatenate into a non-monotonic + time axis; resampling must sort first rather than raising + 'Index must be monotonic for resampling'.""" + months = pd.date_range("1950-01-16", periods=24, freq="MS") + pd.Timedelta( + days=15 + ) + shuffled = months[np.array([12, 0, 6, 18, 3] + [i for i in range(24) if i not in (12, 0, 6, 18, 3)])] + ds = xr.Dataset( + {"v": (["time"], np.arange(24, dtype="f4"))}, + coords={"time": ("time", shuffled)}, + ) + assert not pd.Index(ds["time"].values).is_monotonic_increasing + + out = resample_dataset_temporal( + ds, pd.Timedelta(days=365), "v", "time", "auto" + ) + + # Resampling succeeded (no monotonicity error) and produced a sorted axis. 
+ assert out.sizes["time"] >= 2 + assert pd.Index(out["time"].values).is_monotonic_increasing + + def test_resample_preserves_cf_time_units(self): + """Resampling decodes time internally; it must restore the original CF + units/calendar (as numeric values) so the written file keeps a valid + time axis rather than a units-less coordinate.""" + from cftime import date2num + + months = xr.cftime_range( + "1950-01-16", periods=24, freq="MS", calendar="standard" + ) + units = "days since 1900-01-01" + numeric = date2num(months.values, units, "standard") + ds = xr.Dataset( + {"v": (["time"], np.arange(24, dtype="f4"))}, + coords={ + "time": ("time", numeric, {"units": units, "calendar": "standard"}) + }, + ) + + out = resample_dataset_temporal( + ds, pd.Timedelta(days=365), "v", "time", "auto" + ) + + assert out["time"].attrs.get("units") == units + assert out["time"].attrs.get("calendar") == "standard" + assert np.issubdtype(np.asarray(out["time"].values).dtype, np.floating) + class TestCalculateTimeBoundsMonthly: """Test monthly frequency time bounds calculation.""" diff --git a/tests/unit/test_vocabulary_processors.py b/tests/unit/test_vocabulary_processors.py index 3a883522..80bf1817 100644 --- a/tests/unit/test_vocabulary_processors.py +++ b/tests/unit/test_vocabulary_processors.py @@ -454,6 +454,30 @@ def test_generate_filename_datetime64_time_branch(vocabulary_instance): assert "202001-202002" in filename +@pytest.mark.unit +def test_generate_filename_yearly_year_only(vocabulary_instance): + """Yearly tables (Oyr) format the time range as YYYY-YYYY, not YYYYMM.""" + cf_time = xr.cftime_range("2020-01-01", periods=2, freq="YS", calendar="gregorian") + ds = xr.Dataset( + { + "no3": xr.DataArray( + np.array([1.0, 2.0]), + dims=["time"], + coords={"time": cf_time}, + ) + } + ) + attrs = {**_FILENAME_ATTRS, "variable_id": "no3", "table_id": "Oyr"} + + with patch.object( + CMIP6Vocabulary, "_load_drs_templates", return_value=_TIME_RANGE_TEMPLATE + ): + filename = 
vocabulary_instance.generate_filename(attrs, ds, "no3", "Oyr.no3") + + assert "2020-2021" in filename + assert "202001" not in filename # no month component + + @pytest.mark.unit def test_generate_filename_numeric_time_branch(vocabulary_instance): """Numeric float64 time – uses num2date (else) branch.""" From 52d5fe24366afbb7e6170127b8543443090f02c0 Mon Sep 17 00:00:00 2001 From: rhaegar325 Date: Fri, 26 Jun 2026 13:01:39 +1000 Subject: [PATCH 2/4] pre-commit fix --- src/access_moppy/utilities.py | 8 ++------ tests/unit/test_utilities.py | 18 ++++++++---------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/src/access_moppy/utilities.py b/src/access_moppy/utilities.py index 36a7cbf0..bbb00b25 100644 --- a/src/access_moppy/utilities.py +++ b/src/access_moppy/utilities.py @@ -1960,9 +1960,7 @@ def _shift_resampled_time_to_period_midpoint( calendar = time_da.attrs.get("calendar", "proleptic_gregorian") bounds = bounds_fn(values, calendar, is_cftime) - midpoints = np.array( - [lo + (hi - lo) / 2 for lo, hi in bounds], dtype=values.dtype - ) + midpoints = np.array([lo + (hi - lo) / 2 for lo, hi in bounds], dtype=values.dtype) return time_da.copy(data=midpoints) @@ -2091,9 +2089,7 @@ def resample_dataset_temporal( # so convert datetime64 to python datetimes first. 
if time_vals.size and not isinstance(time_vals.flat[0], cftime.datetime): time_vals = pd.to_datetime(time_vals).to_pydatetime() - numeric_time = date2num( - time_vals, units=orig_units, calendar=orig_calendar - ) + numeric_time = date2num(time_vals, units=orig_units, calendar=orig_calendar) ds_resampled[time_coord] = xr.DataArray( numeric_time, dims=[time_coord], attrs=orig_time_attrs ) diff --git a/tests/unit/test_utilities.py b/tests/unit/test_utilities.py index 071242c7..8ee9fa9a 100644 --- a/tests/unit/test_utilities.py +++ b/tests/unit/test_utilities.py @@ -107,9 +107,7 @@ def test_monthly_to_yearly_lands_on_midyear(self): coords={"time": ("time", months)}, ) - out = resample_dataset_temporal( - ds, pd.Timedelta(days=365), "v", "time", "auto" - ) + out = resample_dataset_temporal(ds, pd.Timedelta(days=365), "v", "time", "auto") times = pd.to_datetime(out["time"].values) # Every yearly value sits on ~2 July, never on a year boundary. @@ -125,16 +123,18 @@ def test_resample_handles_unsorted_time(self): months = pd.date_range("1950-01-16", periods=24, freq="MS") + pd.Timedelta( days=15 ) - shuffled = months[np.array([12, 0, 6, 18, 3] + [i for i in range(24) if i not in (12, 0, 6, 18, 3)])] + shuffled = months[ + np.array( + [12, 0, 6, 18, 3] + [i for i in range(24) if i not in (12, 0, 6, 18, 3)] + ) + ] ds = xr.Dataset( {"v": (["time"], np.arange(24, dtype="f4"))}, coords={"time": ("time", shuffled)}, ) assert not pd.Index(ds["time"].values).is_monotonic_increasing - out = resample_dataset_temporal( - ds, pd.Timedelta(days=365), "v", "time", "auto" - ) + out = resample_dataset_temporal(ds, pd.Timedelta(days=365), "v", "time", "auto") # Resampling succeeded (no monotonicity error) and produced a sorted axis. 
+ assert out.sizes["time"] >= 2 @@ -158,9 +158,7 @@ def test_resample_preserves_cf_time_units(self): }, ) - out = resample_dataset_temporal( - ds, pd.Timedelta(days=365), "v", "time", "auto" - ) + out = resample_dataset_temporal(ds, pd.Timedelta(days=365), "v", "time", "auto") assert out["time"].attrs.get("units") == units assert out["time"].attrs.get("calendar") == "standard" From 2008aa00393fe42c8c8b519de0d4b0abe366a5e1 Mon Sep 17 00:00:00 2001 From: rhaegar325 Date: Fri, 26 Jun 2026 13:17:35 +1000 Subject: [PATCH 3/4] test coverage update --- tests/unit/test_base.py | 23 +++++++++++++++ tests/unit/test_utilities.py | 56 ++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) diff --git a/tests/unit/test_base.py b/tests/unit/test_base.py index 7c0ca085..fda99f2c 100644 --- a/tests/unit/test_base.py +++ b/tests/unit/test_base.py @@ -7,6 +7,7 @@ import logging from pathlib import Path +from types import SimpleNamespace from unittest.mock import MagicMock, Mock, patch import dask.array as da @@ -2602,3 +2603,25 @@ def test_above_max_error_includes_actual_maximum(self): msg = str(exc_info.value) assert "Actual maximum found" in msg assert "9" in msg + + +class TestTargetFrequencyHint: + """_target_frequency_hint maps the CMOR table frequency to a coarse label, + used only as a single-point fallback in time-bounds construction.""" + + @pytest.mark.parametrize( + "compound_name, expected", + [ + ("Oyr.no3", "yearly"), + ("Omon.tos", "monthly"), + ("Oday.tos", "daily"), + ("3hr.x", None), # sub-daily has no coarse bucket + ("fx.areacello", None), # time-independent + ("Bogus.zzz", None), # unparseable table -> None (exception swallowed) + (None, None), # no compound_name + ("", None), + ], + ) + def test_frequency_hint(self, compound_name, expected): + stub = SimpleNamespace(compound_name=compound_name) + assert CMORiser._target_frequency_hint(stub) == expected diff --git a/tests/unit/test_utilities.py b/tests/unit/test_utilities.py index 8ee9fa9a..628e8f8f
100644 --- a/tests/unit/test_utilities.py +++ b/tests/unit/test_utilities.py @@ -164,6 +164,62 @@ def test_resample_preserves_cf_time_units(self): assert out["time"].attrs.get("calendar") == "standard" assert np.issubdtype(np.asarray(out["time"].values).dtype, np.floating) + def test_resample_gregorian_label_normalised(self): + """A "GREGORIAN"-labelled axis (values written by the model in proleptic + semantics) must be resampled and declared as proleptic_gregorian, matching + _check_calendar, so it is not read as the Julian "standard" calendar.""" + from cftime import date2num + + months = xr.cftime_range( + "1950-01-16", periods=24, freq="MS", calendar="proleptic_gregorian" + ) + units = "days since 1900-01-01" + numeric = date2num(months.values, units, "proleptic_gregorian") + ds = xr.Dataset( + {"v": (["time"], np.arange(24, dtype="f4"))}, + coords={ + "time": ("time", numeric, {"units": units, "calendar": "GREGORIAN"}) + }, + ) + + out = resample_dataset_temporal(ds, pd.Timedelta(days=365), "v", "time", "auto") + + assert out["time"].attrs.get("calendar") == "proleptic_gregorian" + + def test_midpoint_shift_per_frequency(self): + """_shift_resampled_time_to_period_midpoint centres monthly/daily periods + too, and is a no-op for sub-daily / empty inputs.""" + from access_moppy.utilities import _shift_resampled_time_to_period_midpoint + + # Monthly: a period label in January -> mid-January (day 16, 12:00). + jan = xr.DataArray( + xr.cftime_range("2000-01-01", periods=1, calendar="standard").values, + dims="time", + name="time", + ) + out_mon = _shift_resampled_time_to_period_midpoint(jan, pd.Timedelta(days=30)) + assert out_mon.values[0].day == 16 + assert out_mon.values[0].hour == 12 + + # Daily: midnight -> noon. 
+ day = xr.DataArray( + xr.cftime_range("2000-01-01", periods=1, freq="D", calendar="standard").values, + dims="time", + name="time", + ) + out_day = _shift_resampled_time_to_period_midpoint(day, pd.Timedelta(days=1)) + assert out_day.values[0].hour == 12 + + # Sub-daily target frequency: unchanged (no recognised period). + out_noop = _shift_resampled_time_to_period_midpoint(jan, pd.Timedelta(hours=6)) + assert out_noop.values[0] == jan.values[0] + + # Empty axis: returned unchanged. + empty = xr.DataArray(np.array([], dtype=object), dims="time", name="time") + assert _shift_resampled_time_to_period_midpoint( + empty, pd.Timedelta(days=365) + ).size == 0 + class TestCalculateTimeBoundsMonthly: """Test monthly frequency time bounds calculation.""" From 8eab7f2a30ded00b273c1fc4397621531ad7a78a Mon Sep 17 00:00:00 2001 From: rhaegar325 Date: Fri, 26 Jun 2026 13:19:23 +1000 Subject: [PATCH 4/4] test coverage update --- tests/unit/test_utilities.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_utilities.py b/tests/unit/test_utilities.py index 628e8f8f..105d1a82 100644 --- a/tests/unit/test_utilities.py +++ b/tests/unit/test_utilities.py @@ -203,7 +203,9 @@ def test_midpoint_shift_per_frequency(self): # Daily: midnight -> noon. day = xr.DataArray( - xr.cftime_range("2000-01-01", periods=1, freq="D", calendar="standard").values, + xr.cftime_range( + "2000-01-01", periods=1, freq="D", calendar="standard" + ).values, dims="time", name="time", ) @@ -216,9 +218,10 @@ def test_midpoint_shift_per_frequency(self): # Empty axis: returned unchanged. empty = xr.DataArray(np.array([], dtype=object), dims="time", name="time") - assert _shift_resampled_time_to_period_midpoint( - empty, pd.Timedelta(days=365) - ).size == 0 + assert ( + _shift_resampled_time_to_period_midpoint(empty, pd.Timedelta(days=365)).size + == 0 + ) class TestCalculateTimeBoundsMonthly: