diff --git a/Models/.gitignore b/Models/.gitignore new file mode 100644 index 0000000..b23cdd8 --- /dev/null +++ b/Models/.gitignore @@ -0,0 +1,3 @@ +/* +!*/ +!.gitignore diff --git a/README.md b/README.md index f6bb878..3527b21 100644 --- a/README.md +++ b/README.md @@ -1,55 +1,63 @@ -# Ice Sheet Simulation compliance checker - -The script checks the compliance of a simulation dataset according criteria, which are related to: - -* naming conventions -* admissible numerical values, -* spatial definition of the grid which differs according to the ice sheet (AIS vs GIS), -* time recording dependent of the experiments. - -The compliance criteria of output variables are defined in a separate csv file. The compliance criteria of experiments are directly defined as a dictionnary in the python file. - -=> For ISMIP6 simulations, the criteria are following the conventions defined in the [ISMIP6 wiki](https://www.climate-cryosphere.org/wiki/index.php?title=ISMIP6-Projections-Antarctica#Appendix_1_.E2.80.93_Output_grid_definition_and_interpolation). The associated csv file is [ismip6_criteria.csv](https://github.com/jbbarre/ISM_SimulationChecker/blob/main/ismip6_criteria.csv) - -=> ISMIP6 2300 file name convention: check carrefully the section _A2.1 File name convention_ of the [ISMIP6 2300 wiki](https://www.climate-cryosphere.org/wiki/index.php?title=ISMIP6-Projections2300-Antarctica) - -************************************************* - -### Python and dependencies - -The code has been developed with python 3.9 and the following modules: - -* os -* xarray -* cftime -* numpy -* pandas -* datetime -* tqdm - -=> Conda users can install the **isscheck** environnment with the YML file [isschecker_env.yml](https://github.com/jbbarre/ISM_SimulationChecker/blob/main/isschecker_env.yml). - -************************************************* - -### Test the code - -1. Conda users: activate the isschecker environnement: `> conda activate isschecker`. - For others, be sure that the dependencies specified in the YML file [isschecker_env.yml] (https://github.com/jbbarre/ISM_SimulationChecker/blob/main/isschecker_env.yml) are installed. - -2. In a terminal, run the script: `> python compliance_checker.py`. A progression bar appears in the terminal and shows the progression. - -3. Without any changes, the script checks the `test` directory, which contains a single file. After processing the check, open the *compliance_checker_log.txt* file created in the `test` directory. The compliance checker raises errors because the test data is just a short extraction of a complete dataset. - -************************************************* - -### How to launch a compliance check ? - -1. In a terminal, run the script with the source path and experiment set: -`> ./compliance_checker.py --source-path ./test --experiment-set ismip6` - -2. Use `--experiment-set ismip6_ext` to test the ISMIP6 extension (2300) experiment set. - -3. The script creates a *compliance_checker_log.txt* file in the source path, which reports the errors and warnings. - - - +# Ice Sheet Simulation Compliance Checker + +Checks ISMIP7 NetCDF simulation datasets for compliance with the [ISMIP7 data request conventions](https://www.ismip.org/). The following categories are validated for every file: + +1. **Naming** — variable name, region field, ISM member id (`mNNN`), ESM name (CMIP6/CMIP7 registry), forcing member id (`fNNN`), set counter (`[C|E|P]NNN`), and year range (`YYYY-YYYY` matching the actual time axis). +2. 
**Numerical** — units match the data request; all values lie within the allowed min/max range for the relevant region; array is not entirely fill values. +3. **Spatial** *(xyt variables only)* — grid corners lie within the expected AIS or GrIS extents; resolution is one of the allowed values; x and y cell size are equal. +4. **Time** — time dimension is present, unlimited, and monotonically increasing; annual cadence; experiment end date and duration match `experiments_ismip7.csv`. +5. **Attributes** — required global and coordinate attributes are present and have correct values; `standard_name` matches data request; `_FillValue` equals the NetCDF4 default for the variable's dtype; variable and time are float32; `scale_factor` and `add_offset` are not allowed. + +Compliance criteria are defined in `conventions/ISMIP7_variable_request.xlsx` (variable metadata) and `experiments_ismip7.csv` (valid experiment date ranges). + +--- + +## Setup + +```bash +conda env create -f isschecker_env.yml +conda activate isschecker +``` + +Dependencies: Python 3.14, `numpy` 2.4, `pandas` 3.0, `openpyxl` 3.1, `xarray` 2026.4, `cftime` 1.6, `netCDF4` 1.7, `tqdm` 4.67. + +--- + +## Running the checker + +The script must be run from the repository root. It writes `compliance_checker_log.txt` into the `--source-path` directory. + +```bash +# Check x,y,t (3D spatial) variables +python compliance_checker.py --source-path ./Models/GrIS/ISMIP7/SYNTH1/CORE --variable-list ismip7_xyt + +# Check scalar (time-only) variables +python compliance_checker.py --source-path ./Models/AIS/ISMIP7/SYNTH1/CORE --variable-list ismip7_scalars + +# Check both +python compliance_checker.py --source-path ./Models/GrIS/ISMIP7/SYNTH1/CORE --variable-list ismip7 +``` + +| Option | Default | Description | +|--------|---------|-------------| +| `--source-path` | `./Models/GrIS/ISMIP7/SYNTH1/CORE` | Directory containing `.nc` files to check | +| `--variable-list` | `ismip7_scalars` | `ismip7_xyt`, `ismip7_scalars`, or `ismip7` (both) | + +--- + +## Generating synthetic test files + +`test/generate_test_files.py` creates ISMIP7-style NetCDF test files with synthetic data. See [test/README.md](test/README.md) for full options and examples. + +```bash +conda activate isschecker + +# Generate 286-year GrIS ctrl xyt variables +python test/generate_test_files.py --grid GrIS_16000m --scenario ctrl --xyt --nyears 286 --start-year 2015 + +# Generate 286-year AIS ctrl scalar variables +python test/generate_test_files.py --grid AIS_16000m --scenario ctrl --scalars --nyears 286 --start-year 2015 + +# List available grids +python test/generate_test_files.py --list-grids +``` diff --git a/compliance_checker.py b/compliance_checker.py index a65b85a..64ec357 100755 --- a/compliance_checker.py +++ b/compliance_checker.py @@ -1,84 +1,155 @@ #!/usr/bin/env python3 +# +# ISMIP7 Compliance Checker — check summary +# +# 1. Naming (_check_naming) +# - Variable name matches the expected ISMIP7 name from the data request. +# - Region field in filename matches the region inferred from the grid (AIS/GrIS). +# - ISM member id (field 4) matches format mNNN (e.g. m001). +# - ESM name (field 5) is a recognised CMIP6/CMIP7 model name. +# - Forcing member id (field 6) matches format fNNN (e.g. f001). +# - Set counter (field 8) matches format [C|E|P]NNN (e.g. C001, E041, P132). +# - Year range (field 9) matches format YYYY-YYYY; start <= end; both match the actual time axis. +# +# 2. Numerical (_check_numerical) +# - Variable units match the data request. 
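+#      (the comparison is string equality: a file whose requested unit is, say,
+#      "kg m-2 s-1" is rejected if it carries the variant spelling "kg/m2/s";
+#      the example unit is assumed from ISMIP6 practice, and the ISMIP7 request
+#      sheet is authoritative)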
+# - All values lie within the allowed min/max range for the relevant region. +# - Array is not entirely fill/missing values. +# +# 3. Spatial (_check_spatial) [xyt variables only] +# - Lower-left and upper-right grid corners lie within the expected AIS or GrIS extents. +# - Grid resolution is one of the allowed values (1, 2, 4, 8, 16, 32 km). +# - x and y resolution are equal (square cells). +# +# 4. Time (_check_time) +# - Time dimension is present, is an unlimited (record) dimension, and its values are monotonically increasing. +# - Time step matches the expected annual cadence (within tolerance). +# - Experiment end date matches experiments_ismip7.csv; duration >= 1 year for historical, exact for others. +# +# 5. Attributes (_check_attributes) +# - Global attributes present: group, model, contact_name, contact_email, crs +# (epsg:3413 for GrIS, epsg:3031 for AIS). +# - Coordinate attributes: time has units, calendar, bounds (FL vars); x/y have units (xyt). +# - Variable standard_name matches data request (if specified). +# - _FillValue must be present and equal the default netCDF4 fill value for the variable's dtype. +# If missing_value is also present, it must equal _FillValue. +# - Main variable and time coordinate are single-precision float (float32 / f4). +# - scale_factor and add_offset are not allowed on the main variable. + + import datetime import os +import re import subprocess import argparse -import cftime import numpy as np import pandas as pd import xarray as xr +import netCDF4 from tqdm import tqdm -DEFAULT_SOURCE_PATH = "./test" -DEFAULT_EXPERIMENT_SET = "ismip6" -EXPERIMENT_SET_CHOICES = ("ismip6", "ismip6_ext") +DEFAULT_SOURCE_PATH = "./Models/GrIS/ISMIP7/SYNTH1/CORE" +DEFAULT_VARIABLE_LIST = "ismip7_scalars" +VARIABLE_LIST_CHOICES = ("ismip7_scalars", "ismip7_xyt", "ismip7") -CRITERIA_CSV_FILENAME = "ismip6_criteria.csv" -EXPERIMENTS_ISMIP6_CSV_FILENAME = "experiments_ismip6.csv" -EXPERIMENTS_ISMIP6_EXT_CSV_FILENAME = "experiments_ismip6_ext.csv" +VARIABLE_REQUEST_XLSX = os.path.join("conventions", "ISMIP7_variable_request.xlsx") -AIS_GRID_EXTENT = [-3040000, -3040000, 3040000, 3040000] -GIS_GRID_EXTENT = [-720000, -3450000, 960000, -570000] -AIS_POSSIBLE_RESOLUTION = [1, 2, 4, 8, 16, 32] -GIS_POSSIBLE_RESOLUTION = [1, 2, 4, 5, 10, 20] +EXPERIMENTS_ISMIP7_CSV_FILENAME = "experiments_ismip7.csv" -TIME_STEP_MIN_DAYS = 360 -TIME_STEP_MAX_DAYS = 367 -AVERAGE_YEAR_DAYS = 365 +AIS_GRID_EXTENT = [-3040000, -3040000, 3040000, 3040000] +GrIS_GRID_EXTENT = [-720000, -3450000, 960000, -570000] +AIS_POSSIBLE_RESOLUTION = [2, 4, 8, 16, 32] +GrIS_POSSIBLE_RESOLUTION = [1, 2, 4, 8, 16, 32] + +TIME_STEP_MIN_DAYS = 365 +TIME_STEP_MAX_DAYS = 366 + +# ISMIP7 CORE file naming convention: +# {var}_{region}_{group}_{model}_{modelid}_{ESM}_{forcingid}_{experiment}_{configid}_{startyear}-{endyear}.nc +ISMIP7_FILENAME_PARTS = 10 +ISMIP7_FILENAME_VAR_IDX = 0 +ISMIP7_FILENAME_REGION_IDX = 1 +ISMIP7_FILENAME_ISM_MEMBER_IDX = 4 +ISMIP7_FILENAME_ESM_IDX = 5 +ISMIP7_FILENAME_FORCING_MEMBER_IDX = 6 +ISMIP7_FILENAME_EXPERIMENT_IDX = 7 +ISMIP7_FILENAME_SET_COUNTER_IDX = 8 +ISMIP7_FILENAME_YEAR_RANGE_IDX = 9 + +# Known CMIP6 and CMIP7 model names valid as ESM forcing identifiers (field 5). 
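+# The set below is a best-effort snapshot rather than an authoritative registry;
+# extend it as further models are published. As a hypothetical worked example of
+# the naming convention above, a file named
+#   acabf_GrIS_IGE_ElmerIce_m001_CESM2_f001_hist_C001_1960-2014.nc
+# parses as var=acabf, region=GrIS, group=IGE, model=ElmerIce, ISM member m001,
+# ESM CESM2 (field 5, looked up in this set), forcing member f001, experiment
+# hist, set counter C001 and year range 1960-2014 (all field values illustrative).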
+VALID_ESM_NAMES: set[str] = { + # CMIP6 + "ACCESS-CM2", "ACCESS-ESM1-5", + "AWI-CM-1-1-MR", "AWI-ESM-1-1-LR", + "BCC-CSM2-MR", "BCC-ESM1", + "CAMS-CSM1-0", + "CAS-ESM2-0", + "CESM2", "CESM2-FV2", "CESM2-WACCM", "CESM2-WACCM-FV2", + "CIESM", + "CMCC-CM2-HR4", "CMCC-CM2-SR5", "CMCC-ESM2", + "CNRM-CM6-1", "CNRM-CM6-1-HR", "CNRM-ESM2-1", + "CanESM5", "CanESM5-1", "CanESM5-CanOE", + "E3SM-1-0", "E3SM-1-1", "E3SM-1-1-ECA", "E3SM-2-0", + "EC-Earth3", "EC-Earth3-AerChem", "EC-Earth3-CC", "EC-Earth3-Veg", "EC-Earth3-Veg-LR", + "FGOALS-f3-L", "FGOALS-g3", + "FIO-ESM-2-0", + "GFDL-CM4", "GFDL-ESM4", + "GISS-E2-1-G", "GISS-E2-1-G-CC", "GISS-E2-1-H", "GISS-E2-2-G", "GISS-E2-2-H", + "HadGEM3-GC31-LL", "HadGEM3-GC31-MM", + "IITM-ESM", + "INM-CM4-8", "INM-CM5-0", + "IPSL-CM5A2-INCA", "IPSL-CM6A-LR", "IPSL-CM6A-LR-INCA", + "KACE-1-0-G", + "KIOST-ESM", + "MCM-UA-1-0", + "MIROC-ES2H", "MIROC-ES2L", "MIROC6", + "MPI-ESM-1-2-HAM", "MPI-ESM1-2-HR", "MPI-ESM1-2-LR", + "MRI-ESM2-0", + "NESM3", + "NorCPM1", "NorESM2-LM", "NorESM2-MM", + "SAM0-UNICON", + "TaiESM1", + "UKESM1-0-LL", "UKESM1-1-LL", + # CMIP7 (extend as models are registered) + "ACCESS-ESM2-0", + "CanESM6", + "CESM3", + "CNRM-CM7", "CNRM-ESM2-2", + "EC-Earth4", + "GFDL-ESM5", + "GISS-E3", + "HadGEM4-GC51-LL", + "IPSL-CM7A-LR", + "MIROC7", + "MPI-ESM2-0", + "MRI-ESM3-0", + "NorESM3-LM", "NorESM3-MM", + "UKESM2-0-LL", +} def main() -> None: args = _parse_args() source_path = args.source_path - experiment_set = args.experiment_set + variable_list = args.variable_list workdir = os.getcwd() commit_num = _get_commit_number() - ismip_meta, ismip_var, mandatory_variables = _load_criteria(workdir) - - experiments_ismip6_ext = _load_experiments_csv( - os.path.join(workdir, EXPERIMENTS_ISMIP6_EXT_CSV_FILENAME) - ) - experiments_ismip6 = _load_experiments_csv( - os.path.join(workdir, EXPERIMENTS_ISMIP6_CSV_FILENAME) + experiments_ismip7 = _load_experiments_csv( + os.path.join(workdir, EXPERIMENTS_ISMIP7_CSV_FILENAME) ) - - scalar_variables_ismip6 = [ - "lim", - "limnsw", - "iareagr", - "iareafl", - "tendacabf", - "tendlibmassbf", - "tendlibmassbffl", - "tendlicalvf", - "tendlifmassbf", - "tendligroundf", - ] - scalar_variables = scalar_variables_ismip6 - - # Set up the experiment list: extension (2300) or ISMIP6 (2100). - if experiment_set == "ismip6_ext": - experiments = experiments_ismip6_ext - elif experiment_set == "ismip6": - experiments = experiments_ismip6 - else: - raise ValueError( - "Experiment set not recognized. Please choose between 'ismip6' and 'ismip6_ext'." - ) + ismip_meta, ismip_var, mandatory_variables = _load_criteria(workdir, variable_list) _run_compliance_checker( source_path=source_path, commit_num=commit_num, ismip_meta=ismip_meta, ismip_var=ismip_var, - variables=ismip_var, mandatory_variables=mandatory_variables, - experiments=experiments, - experiments_ismip6_ext=experiments_ismip6_ext, - scalar_variables=scalar_variables, + experiments=experiments_ismip7, + criteria_file=VARIABLE_REQUEST_XLSX, ) @@ -89,48 +160,68 @@ def _get_commit_number() -> str: commit_num, _error = process.communicate() return commit_num.decode("UTF-8") except Exception: - print( - "Commit number associated with this code. Is there a .git in this directory ?" - ) + print("Could not retrieve git commit number. Is there a .git directory here?") return "No commit number identified." def _parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( - description="Check simulation NetCDF datasets for ISMIP6 compliance." 
+ description="Check simulation NetCDF datasets for ISMIP compliance." ) parser.add_argument( "--source-path", default=DEFAULT_SOURCE_PATH, - help="Path to the directory containing experiment subdirectories.", + help="Path to the directory containing the CORE NetCDF files.", ) parser.add_argument( - "--experiment-set", - choices=EXPERIMENT_SET_CHOICES, - default=DEFAULT_EXPERIMENT_SET, - help="Experiment set rules to apply: ismip6 or ismip6_ext.", + "--variable-list", + choices=VARIABLE_LIST_CHOICES, + default=DEFAULT_VARIABLE_LIST, + help="Variable list to apply: ismip7_xyt, ismip7_scalars, or ismip7 (both).", ) return parser.parse_args() -def _load_criteria(workdir: str): +def _load_criteria(workdir: str, variable_list: str): + excel_path = os.path.join(workdir, VARIABLE_REQUEST_XLSX) try: - ismip = pd.read_csv( - os.path.join(workdir, CRITERIA_CSV_FILENAME), - delimiter=";", - decimal=",", - ) + df = pd.read_excel(excel_path, sheet_name="ISM") except IOError: print( - "ERROR: Unable to open the compliance criteria file (.csv required with ; as delimiter and , for decimal.). Is the path to the file correct ? " - + workdir - + "ismip6_criteria_v0.csv" + "ERROR: Unable to open the variable request file. Is the path correct? " + + excel_path ) raise - ismip_meta = ismip.to_dict("records") - ismip_var = [dic["variable"] for dic in ismip_meta] - ismip_mandatory_var = ismip["variable"][ismip.mandatory == 1].tolist() + df = df.dropna(subset=["Variable Name"]) + + if variable_list == "ismip7_xyt": + df = df[df["Dim"] == "x,y,t"] + elif variable_list == "ismip7_scalars": + df = df[df["Dim"] == "t"] + + ismip_meta = [] + for _, row in df.iterrows(): + entry = { + "variable": row["Variable Name"], + "dim": str(row["Dim"]), + "units": str(row["units"]) if pd.notna(row["units"]) else "", + "mandatory": 1 if str(row["Mandatory (yes/no)"]).lower() == "yes" else 0, + "standard_name": str(row["standard_name"]) if pd.notna(row["standard_name"]) else None, + "type": str(row["Type"]) if pd.notna(row["Type"]) else "", + } + for col in df.columns: + lc = str(col).lower() + if lc.startswith("min_") or lc.startswith("max_"): + try: + val = row[col] + entry[lc] = None if pd.isna(val) else float(val) + except Exception: + entry[lc] = None + ismip_meta.append(entry) + + ismip_var = [d["variable"] for d in ismip_meta] + ismip_mandatory_var = [d["variable"] for d in ismip_meta if d["mandatory"] == 1] return ismip_meta, ismip_var, ismip_mandatory_var @@ -156,97 +247,101 @@ def _run_compliance_checker( commit_num: str, ismip_meta, ismip_var, - variables, mandatory_variables, experiments, - experiments_ismip6_ext, - scalar_variables, + criteria_file, ) -> None: - _ = (experiments_ismip6_ext, scalar_variables) + if not os.path.isdir(source_path): + print(f"ERROR: Directory not found: '{source_path}'. Please check your --source-path argument.") + return try: with open(os.path.join(source_path, "compliance_checker_log.txt"), "w") as f: print("-> Checking " + source_path) print() - experiment_directories, files = _files_and_subdirectories(source_path) - _ = files today = datetime.date.today() + _write_log_header(f, commit_num, source_path, today, criteria_file) - _write_log_header(f, commit_num, source_path, today) + experiment_groups = _group_files_by_experiment(source_path) + if not experiment_groups: + msg = f"No .nc files found in directory '{source_path}'. Please check your --source-path argument." 
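+                # The scan in _group_files_by_experiment is a flat, non-recursive
+                # os.listdir, so --source-path must point at the leaf directory that
+                # holds the .nc files (e.g. ./Models/GrIS/ISMIP7/SYNTH1/CORE), not at
+                # a parent such as ./Models.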
+ print(f"ERROR: {msg}") + f.write(f"ERROR: {msg}\n") + return summary = _process_experiments( log_file=f, source_path=source_path, - experiment_directories=experiment_directories, + experiment_groups=experiment_groups, mandatory_variables=mandatory_variables, experiments=experiments, - variables=variables, ismip_var=ismip_var, ismip_meta=ismip_meta, ) - exp_counter = summary["exp_counter"] - file_counter = summary["file_counter"] - total_errors = summary["total_errors"] - total_warnings = summary["total_warnings"] - total_naming_errors = summary["total_naming_errors"] - total_num_errors = summary["total_num_errors"] - total_spatial_errors = summary["total_spatial_errors"] - total_time_errors = summary["total_time_errors"] - total_file_errors = summary["total_file_errors"] - report_naming_issues = summary["report_naming_issues"] - _insert_synthesis( source_path=source_path, - exp_counter=exp_counter, - file_counter=file_counter, - total_errors=total_errors, - total_file_errors=total_file_errors, - total_naming_errors=total_naming_errors, - total_num_errors=total_num_errors, - total_spatial_errors=total_spatial_errors, - total_time_errors=total_time_errors, - total_warnings=total_warnings, - report_naming_issues=report_naming_issues, + exp_counter=summary["exp_counter"], + file_counter=summary["file_counter"], + total_errors=summary["total_errors"], + total_file_errors=summary["total_file_errors"], + total_naming_errors=summary["total_naming_errors"], + total_num_errors=summary["total_num_errors"], + total_spatial_errors=summary["total_spatial_errors"], + total_time_errors=summary["total_time_errors"], + total_attr_errors=summary["total_attr_errors"], + report_naming_issues=summary["report_naming_issues"], ) except TypeError as err: print( - "Something went wrong with your dataset. Please, check your file(s) carrefully. Error:", + "Something went wrong with your dataset. Please, check your file(s) carefully. 
Error:", err, ) +def _group_files_by_experiment(source_path: str) -> dict: + groups = {} + for f in sorted(os.listdir(source_path)): + if not f.endswith(".nc"): + continue + parts = f.split("_") + exp_name = parts[ISMIP7_FILENAME_EXPERIMENT_IDX] if len(parts) == ISMIP7_FILENAME_PARTS else "_unknown" + if exp_name not in groups: + groups[exp_name] = [] + groups[exp_name].append(f) + return groups + + def _process_experiments( log_file, source_path: str, - experiment_directories, + experiment_groups: dict, mandatory_variables, experiments, - variables, ismip_var, ismip_meta, ): - total_warnings = 0 total_naming_errors = 0 total_num_errors = 0 total_spatial_errors = 0 total_time_errors = 0 + total_attr_errors = 0 total_file_errors = 0 report_naming_issues = [] file_counter = 0 exp_counter = 0 - for xp in experiment_directories: + for experiment_name, exp_files in experiment_groups.items(): exp_counter += 1 exp_summary = _process_single_experiment( log_file=log_file, source_path=source_path, - xp=xp, + experiment_name=experiment_name, + exp_files=exp_files, mandatory_variables=mandatory_variables, experiments=experiments, - variables=variables, ismip_var=ismip_var, ismip_meta=ismip_meta, report_naming_issues=report_naming_issues, @@ -257,6 +352,7 @@ def _process_experiments( total_num_errors += exp_summary["exp_num_errors"] total_spatial_errors += exp_summary["exp_spatial_errors"] total_time_errors += exp_summary["exp_time_errors"] + total_attr_errors += exp_summary["exp_attr_errors"] total_file_errors += exp_summary["exp_file_errors"] _print_experiment_summary( @@ -269,6 +365,7 @@ def _process_experiments( + total_num_errors + total_spatial_errors + total_time_errors + + total_attr_errors + total_file_errors ) _print_total_summary(source_path=source_path, total_errors=total_errors) @@ -277,11 +374,11 @@ def _process_experiments( "exp_counter": exp_counter, "file_counter": file_counter, "total_errors": total_errors, - "total_warnings": total_warnings, "total_naming_errors": total_naming_errors, "total_num_errors": total_num_errors, "total_spatial_errors": total_spatial_errors, "total_time_errors": total_time_errors, + "total_attr_errors": total_attr_errors, "total_file_errors": total_file_errors, "report_naming_issues": report_naming_issues, } @@ -290,50 +387,27 @@ def _process_experiments( def _process_single_experiment( log_file, source_path: str, - xp: str, + experiment_name: str, + exp_files: list, mandatory_variables, experiments, - variables, ismip_var, ismip_meta, report_naming_issues, ): - exp_dir, exp_files = _files_and_subdirectories(os.path.join(source_path, xp)) - _ = exp_dir - exp_files = list(filter(lambda file: file.split(".")[-1] == "nc", exp_files)) - - exp_errors = 0 exp_naming_errors = 0 exp_num_errors = 0 exp_spatial_errors = 0 exp_time_errors = 0 + exp_attr_errors = 0 exp_file_errors = 0 - exp_warnings = 0 - exp_naming_warnings = 0 - exp_num_warnings = 0 - exp_spatial_warnings = 0 - exp_time_warnings = 0 + temp_mandatory_var = list(mandatory_variables) for i in exp_files: - file_name_split = i.split("_") - variable = file_name_split[0] - temp_mandatory_var = mandatory_variables - if variable in mandatory_variables: + variable = i.split("_")[ISMIP7_FILENAME_VAR_IDX] + if variable in temp_mandatory_var: temp_mandatory_var.remove(variable) - experiment_chain = xp.split("_") - if len(experiment_chain) == 2: - experiment_name = "_".join(experiment_chain[:-1]) - grid_resolution = int(experiment_chain[-1]) - else: - experiment_name = xp - grid_resolution = 0 - print( - "Error in 
the naming of the experiment ", - xp, - ". Should be similar to expXXX_RES", - ) - file_counter = 0 if experiment_name in [dic["experiment"] for dic in experiments]: log_file.write("\n ") @@ -344,13 +418,13 @@ def _process_single_experiment( if not temp_mandatory_var: log_file.write( "Mandatory variables Test: " - + xp + + experiment_name + " : all mandatory variables exist. \n" ) else: log_file.write( "ERROR: In experiment " - + xp + + experiment_name + ", these mandatory variable(s) is (are) missing: " + str(temp_mandatory_var) + "\n" @@ -362,33 +436,19 @@ def _process_single_experiment( file_summary = _process_single_file( log_file=log_file, source_path=source_path, - xp=xp, file=file, experiment_name=experiment_name, - grid_resolution=grid_resolution, - variables=variables, ismip_var=ismip_var, ismip_meta=ismip_meta, experiments=experiments, report_naming_issues=report_naming_issues, ) - exp_naming_errors = exp_naming_errors + file_summary["var_naming_errors"] - exp_num_errors = exp_num_errors + file_summary["var_num_errors"] - exp_spatial_errors = exp_spatial_errors + file_summary["var_spatial_errors"] - exp_time_errors = exp_time_errors + file_summary["var_time_errors"] - exp_errors = ( - exp_time_errors - + exp_spatial_errors - + exp_num_errors - + exp_naming_errors - + exp_file_errors - ) - exp_num_warnings = exp_num_warnings + file_summary["var_num_warnings"] - exp_spatial_warnings = ( - exp_spatial_warnings + file_summary["var_spatial_warnings"] - ) - exp_time_warnings = exp_time_warnings + file_summary["var_time_warnings"] + exp_naming_errors += file_summary["var_naming_errors"] + exp_num_errors += file_summary["var_num_errors"] + exp_spatial_errors += file_summary["var_spatial_errors"] + exp_time_errors += file_summary["var_time_errors"] + exp_attr_errors += file_summary["var_attr_errors"] else: log_file.write("\n ") @@ -404,20 +464,20 @@ def _process_single_experiment( + ". \n" ) exp_naming_errors += 1 - exp_errors = ( - exp_time_errors - + exp_spatial_errors - + exp_num_errors - + exp_naming_errors - + exp_file_errors - ) report_naming_issues.append( "Compliance check ignored : experiment " + experiment_name + " not in the experiments list." 
) - _ = (exp_warnings, exp_naming_warnings) + exp_errors = ( + exp_time_errors + + exp_spatial_errors + + exp_num_errors + + exp_naming_errors + + exp_attr_errors + + exp_file_errors + ) return { "file_counter": file_counter, "experiment_name": experiment_name, @@ -426,6 +486,7 @@ def _process_single_experiment( "exp_num_errors": exp_num_errors, "exp_spatial_errors": exp_spatial_errors, "exp_time_errors": exp_time_errors, + "exp_attr_errors": exp_attr_errors, "exp_file_errors": exp_file_errors, } @@ -433,70 +494,52 @@ def _process_single_experiment( def _process_single_file( log_file, source_path: str, - xp: str, file: str, experiment_name: str, - grid_resolution: int, - variables, ismip_var, ismip_meta, experiments, report_naming_issues, ): - var_errors = 0 - var_warnings = 0 var_naming_errors = 0 var_num_errors = 0 var_spatial_errors = 0 var_time_errors = 0 - var_warnings = 0 - var_warnings = 0 - var_naming_warnings = 0 - var_num_warnings = 0 - var_spatial_warnings = 0 - var_time_warnings = 0 - - split_path = os.path.normpath(file).split(os.sep) - file_name = split_path[-1] - file_name_split = file_name.split("_") + var_attr_errors = 0 - considered_variable = file_name_split[0] - region = file_name_split[1] - group = file_name_split[2] - model = file_name_split[3] - _ = (group, model) - file_extention = file_name_split[len(file_name_split) - 1][-2:] + file_name = os.path.basename(file) + file_name_split = file_name.split("_") - ds = xr.open_dataset(os.path.join(source_path, xp, file)) - file_variables = list(ds.data_vars) + considered_variable = file_name_split[ISMIP7_FILENAME_VAR_IDX] + region = file_name_split[ISMIP7_FILENAME_REGION_IDX] - if file_extention != "nc": - log_file.write( - " !! " - + file_name - + " is not a NETCDF file. The compliance check is ignored." - + "\n" - ) + try: + ds = xr.open_dataset(os.path.join(source_path, file), + decode_times=xr.coders.CFDatetimeCoder(use_cftime=True)) + except (ValueError, TypeError) as e: + log_file.write(" - ERROR: Cannot open " + file_name + ": " + str(e) + "\n") + var_naming_errors += 1 return { "var_naming_errors": var_naming_errors, "var_num_errors": var_num_errors, "var_spatial_errors": var_spatial_errors, "var_time_errors": var_time_errors, - "var_num_warnings": var_num_warnings, - "var_spatial_warnings": var_spatial_warnings, - "var_time_warnings": var_time_warnings, + "var_attr_errors": var_attr_errors, } + file_variables = list(ds.data_vars) - if int(len(file_name_split)) != 5: + if len(file_name_split) != ISMIP7_FILENAME_PARTS: log_file.write( " - ERROR: the file name " + file_name - + " do not follow the naming convention.\n" + + " does not follow the naming convention (expected " + + str(ISMIP7_FILENAME_PARTS) + + " underscore-separated fields).\n" ) report_naming_issues.append( "Compliance check ignored: file " + file_name - + " do not follow the naming convention." + + " does not follow the naming convention." 
) var_naming_errors += 1 return { @@ -504,19 +547,17 @@ def _process_single_file( "var_num_errors": var_num_errors, "var_spatial_errors": var_spatial_errors, "var_time_errors": var_time_errors, - "var_num_warnings": var_num_warnings, - "var_spatial_warnings": var_spatial_warnings, - "var_time_warnings": var_time_warnings, + "var_attr_errors": var_attr_errors, } - experiment_varname = file_name_split[4][:-3] + experiment_varname = file_name_split[ISMIP7_FILENAME_EXPERIMENT_IDX] if experiment_varname != experiment_name: log_file.write( " - ERROR: in the file name " + file_name + ", the experiment name (" + experiment_varname - + ") do not match the directory name: " + + ") does not match the expected experiment: " + experiment_name + ".\n" ) @@ -525,7 +566,7 @@ def _process_single_file( + file_name + ", the experiment name (" + experiment_varname - + ") do not match the directory name: " + + ") does not match the expected experiment: " + experiment_name + ".\n" ) @@ -535,20 +576,17 @@ def _process_single_file( "var_num_errors": var_num_errors, "var_spatial_errors": var_spatial_errors, "var_time_errors": var_time_errors, - "var_num_warnings": var_num_warnings, - "var_spatial_warnings": var_spatial_warnings, - "var_time_warnings": var_time_warnings, + "var_attr_errors": var_attr_errors, } - if considered_variable in variables: - var_naming_errors, var_num_errors, var_spatial_errors, var_time_errors = ( + if considered_variable in ismip_var: + var_naming_errors, var_num_errors, var_spatial_errors, var_time_errors, var_attr_errors = ( _run_variable_checks( log_file=log_file, ds=ds, file_name=file_name, considered_variable=considered_variable, experiment_name=experiment_name, - grid_resolution=grid_resolution, file_variables=file_variables, region=region, ismip_var=ismip_var, @@ -558,16 +596,7 @@ def _process_single_file( ) ) - var_errors = ( - var_errors - + var_naming_errors - + var_num_errors - + var_spatial_errors - + var_time_errors - ) - var_warnings = ( - var_warnings + var_num_warnings + var_spatial_warnings + var_time_warnings - ) + var_errors = var_naming_errors + var_num_errors + var_spatial_errors + var_time_errors + var_attr_errors log_file.write("\n") log_file.write("----------------------------------------------------------\n") @@ -575,17 +604,10 @@ def _process_single_file( experiment_name + " - " + considered_variable + " - File:" + file_name + "\n" ) if var_errors > 0: - log_file.write( - str(var_errors) + " error(s). Please review before sharing." + "\n" - ) - else: - log_file.write("No errors. Good job !" + "\n") - if var_warnings > 0: - log_file.write( - str(var_warnings) + " warning(s). Please review before sharing." + "\n" - ) + log_file.write(str(var_errors) + " error(s). Please review before sharing.\n") else: - log_file.write("No warnings." + "\n") + log_file.write("No errors. 
Good job !\n") + log_file.write("No warnings.\n") log_file.write("----------------------------------------------------------\n") return { @@ -593,163 +615,186 @@ def _process_single_file( "var_num_errors": var_num_errors, "var_spatial_errors": var_spatial_errors, "var_time_errors": var_time_errors, - "var_num_warnings": var_num_warnings, - "var_spatial_warnings": var_spatial_warnings, - "var_time_warnings": var_time_warnings, + "var_attr_errors": var_attr_errors, } -def _run_variable_checks( +def _check_naming( log_file, ds, file_name: str, - considered_variable: str, - experiment_name: str, - grid_resolution: int, - file_variables, region: str, - ismip_var, - ismip_meta, - experiments, - report_naming_issues, -): - var_naming_errors = 0 - var_num_errors = 0 - var_spatial_errors = 0 - var_time_errors = 0 + dim: set, + isscalar: bool, + report_naming_issues: list, +) -> int: + errors = 0 - log_file.write(" \n") - log_file.write("Experiment: " + experiment_name + " - File: " + file_name + "\n") - log_file.write(" \n") - - header_ds = ds.to_dict(data=False) - dim = set(list(header_ds["coords"].keys())) + log_file.write("NAMING Tests \n") - if not set(["x", "y"]).issubset(dim): + if not isscalar and not {"x", "y"}.issubset(dim): log_file.write( - "- ERROR: Compliance check ignored: x or y in the mandatory dimensions (x,y,t) is missing.\n" + " - ERROR: Compliance check ignored: x or y in the mandatory dimensions (x,y,t) is missing.\n" ) log_file.write( - " Only " - + str(list(header_ds["coords"].keys())) - + " has been detected.\n" + " Only " + str(list(dim)) + " has been detected.\n" ) report_naming_issues.append( "Compliance check ignored: x or y in the mandatory dimensions (x,y,t) is missing in " + file_name ) - var_naming_errors += 1 - return var_naming_errors, var_num_errors, var_spatial_errors, var_time_errors + return errors + 1 - if region.upper() not in ["AIS", "GIS"]: + if region not in ["AIS", "GrIS"]: log_file.write( - "- ERROR: Region " + " - ERROR: Region " + region - + " not recognized. It should be AIS or GIS. The compliance check has been interrupted for this variable.\n" + + " not recognized. It should be AIS or GrIS. The compliance check has been interrupted for this variable.\n" ) report_naming_issues.append( - "Compliance check ignored: region (AIS/GIS) not identified in the file " + "Compliance check ignored: region (AIS/GrIS) not identified in the file " + file_name + " due to wrong naming." ) - var_naming_errors += 1 - return var_naming_errors, var_num_errors, var_spatial_errors, var_time_errors + errors += 1 - if region == "AIS": - grid_extent = AIS_GRID_EXTENT - possible_resolution = AIS_POSSIBLE_RESOLUTION - else: - grid_extent = GIS_GRID_EXTENT - possible_resolution = GIS_POSSIBLE_RESOLUTION + parts = file_name.split("_") + if len(parts) == ISMIP7_FILENAME_PARTS: + ism_member = parts[ISMIP7_FILENAME_ISM_MEMBER_IDX] + if not re.fullmatch(r"m\d{3}", ism_member): + log_file.write( + f" - ERROR: ISM member id '{ism_member}' (field {ISMIP7_FILENAME_ISM_MEMBER_IDX}) does not match expected format mNNN (e.g. 
m001).\n" + ) + errors += 1 - for ivar in file_variables: - if ivar in ismip_var: - log_file.write("** Tested Variable: " + ivar + "\n") - log_file.write(" \n") - var_index = [k for k in range(len(ismip_var)) if ismip_var[k] == ivar] + esm_name = parts[ISMIP7_FILENAME_ESM_IDX] + if esm_name not in VALID_ESM_NAMES: + log_file.write( + f" - ERROR: ESM name '{esm_name}' (field {ISMIP7_FILENAME_ESM_IDX}) is not a recognised CMIP6/CMIP7 model name.\n" + ) + errors += 1 - log_file.write("NUMERICAL Tests \n") - if ds[ivar].attrs["units"] == ismip_meta[var_index[0]]["units"]: - log_file.write( - " - The unit is correct: " + ds[ivar].attrs["units"] + "\n" - ) - else: + forcing_member = parts[ISMIP7_FILENAME_FORCING_MEMBER_IDX] + if not re.fullmatch(r"f\d{3}", forcing_member): + log_file.write( + f" - ERROR: forcing member id '{forcing_member}' (field {ISMIP7_FILENAME_FORCING_MEMBER_IDX}) does not match expected format fNNN (e.g. f001).\n" + ) + errors += 1 + + set_counter = parts[ISMIP7_FILENAME_SET_COUNTER_IDX] + if not re.fullmatch(r"[CEP]\d{3}", set_counter): + log_file.write( + f" - ERROR: set counter '{set_counter}' (field {ISMIP7_FILENAME_SET_COUNTER_IDX}) does not match expected format [C|E|P]NNN (e.g. C001, E041, P132).\n" + ) + errors += 1 + + year_range_field = parts[ISMIP7_FILENAME_YEAR_RANGE_IDX].removesuffix(".nc") + year_range_match = re.fullmatch(r"(\d{4})-(\d{4})", year_range_field) + if not year_range_match: + log_file.write( + f" - ERROR: year range '{year_range_field}' (field {ISMIP7_FILENAME_YEAR_RANGE_IDX}) does not match expected format YYYY-YYYY (e.g. 2015-2300).\n" + ) + errors += 1 + else: + fn_start_year = int(year_range_match.group(1)) + fn_end_year = int(year_range_match.group(2)) + if fn_start_year > fn_end_year: log_file.write( - " - ERROR: The unit of the variable is " - + ds[ivar].attrs["units"] - + " and should be " - + ismip_meta[var_index[0]]["units"] - + " \n" + f" - ERROR: year range '{year_range_field}': start year {fn_start_year} is after end year {fn_end_year}.\n" ) - var_num_errors += 1 - - if False in ds[ivar].isnull(): - if ( - ds[ivar].min(skipna=True).item() - >= ismip_meta[var_index[0]]["min_value_" + region.lower()] - ): - log_file.write(" - The minimum value successfully verified.\n") - else: + errors += 1 + elif "time" in ds.coords: + actual_start = min(ds["time"]).values.astype("datetime64[D]").item().year + actual_end = max(ds["time"]).values.astype("datetime64[D]").item().year + if fn_start_year != actual_start: log_file.write( - " - ERROR: The minimum value (" - + str(ds[ivar].min(skipna=True).values.item(0)) - + ") is out of range. Min value accepted: " - + str(ismip_meta[var_index[0]]["min_value_" + region.lower()]) - + "\n" + f" - ERROR: filename start year {fn_start_year} does not match first time step year {actual_start}.\n" ) - var_num_errors += 1 - - if ( - ds[ivar].max(skipna=True).item() - <= ismip_meta[var_index[0]]["max_value_" + region.lower()] - ): - log_file.write(" - The maximum value successfully verified.\n") - else: + errors += 1 + if fn_end_year != actual_end: log_file.write( - " - ERROR: The maximum value (" - + str(ds[ivar].max(skipna=True).values.item(0)) - + ") is out of range. 
Max value accepted: " - + str(ismip_meta[var_index[0]]["max_value_" + region.lower()]) - + "\n" + f" - ERROR: filename end year {fn_end_year} does not match last time step year {actual_end}.\n" ) - var_num_errors += 1 + errors += 1 + if fn_start_year == actual_start and fn_end_year == actual_end: + log_file.write( + f" - Filename year range {fn_start_year}-{fn_end_year} matches time axis: OK\n" + ) + + return errors + + +def _check_numerical( + log_file, + ds, + ivar: str, + ismip_meta: list, + var_index: int, + region: str, + isscalar: bool, +) -> int: + errors = 0 + + log_file.write("NUMERICAL Tests \n") + + if ds[ivar].attrs["units"] == ismip_meta[var_index]["units"]: + log_file.write(" - The unit is correct: " + ds[ivar].attrs["units"] + "\n") + else: + log_file.write( + " - ERROR: The unit of the variable is " + + ds[ivar].attrs["units"] + + " and should be " + + ismip_meta[var_index]["units"] + + " \n" + ) + errors += 1 + + if not isscalar: + if False in ds[ivar].isnull(): + if ( + ds[ivar].min(skipna=True).item() + >= ismip_meta[var_index]["min_value_" + region.lower()] + ): + log_file.write(" - The minimum value successfully verified.\n") else: - log_file.write(" - ERROR: The array only contains Nan values.\n") - var_num_errors += 1 + log_file.write( + " - ERROR: The minimum value (" + + str(ds[ivar].min(skipna=True).values.item(0)) + + ") is out of range. Min value accepted: " + + str(ismip_meta[var_index]["min_value_" + region.lower()]) + + "\n" + ) + errors += 1 - ( - var_spatial_errors, - var_time_errors, - ) = _run_spatial_and_time_checks( - log_file=log_file, - ds=ds, - dim=dim, - region=region, - experiments=experiments, - experiment_name=experiment_name, - grid_extent=grid_extent, - possible_resolution=possible_resolution, - grid_resolution=grid_resolution, - var_spatial_errors=var_spatial_errors, - var_time_errors=var_time_errors, - ) + if ( + ds[ivar].max(skipna=True).item() + <= ismip_meta[var_index]["max_value_" + region.lower()] + ): + log_file.write(" - The maximum value successfully verified.\n") + else: + log_file.write( + " - ERROR: The maximum value (" + + str(ds[ivar].max(skipna=True).values.item(0)) + + ") is out of range. Max value accepted: " + + str(ismip_meta[var_index]["max_value_" + region.lower()]) + + "\n" + ) + errors += 1 + else: + log_file.write(" - ERROR: The array only contains missing values.\n") + errors += 1 - return var_naming_errors, var_num_errors, var_spatial_errors, var_time_errors + return errors -def _run_spatial_and_time_checks( +def _check_spatial( log_file, ds, - dim, - region: str, - experiments, - experiment_name: str, - grid_extent, - possible_resolution, - grid_resolution: int, - var_spatial_errors: int, - var_time_errors: int, -): + grid_extent: list, + possible_resolution: list, +) -> int: + errors = 0 + log_file.write("SPATIAL Tests \n") coords = ds.coords.to_dataset() Xbottomleft = int(min(coords["x"]).values.item()) @@ -757,154 +802,148 @@ def _run_spatial_and_time_checks( Xtopright = int(max(coords["x"]).values.item()) Ytopright = int(max(coords["y"]).values.item()) - if Xbottomleft == grid_extent[0] & Ybottomleft == grid_extent[1]: + if Xbottomleft == grid_extent[0] and Ybottomleft == grid_extent[1]: log_file.write(" - Grid: Lowest left corner is well defined.\n") else: log_file.write( " - ERROR: Lowest left corner of the grid [" - + str(Xbottomleft) - + "," - + str(Ybottomleft) + + str(Xbottomleft) + "," + str(Ybottomleft) + "] is not correctly defined. 
[" - + str(grid_extent[0]) - + "," - + str(grid_extent[1]) + + str(grid_extent[0]) + "," + str(grid_extent[1]) + "] Expected\n" ) - var_spatial_errors += 1 - if Xtopright == grid_extent[2] & Ytopright == grid_extent[3]: + errors += 1 + + if Xtopright == grid_extent[2] and Ytopright == grid_extent[3]: log_file.write(" - Grid: Upper right corner is well defined.\n") else: log_file.write( - " - ERROR: Upper rigth corner of the grid [" - + str(Xtopright) - + "," - + str(Ytopright) + " - ERROR: Upper right corner of the grid [" + + str(Xtopright) + "," + str(Ytopright) + "] is not correctly defined. [" - + str(grid_extent[0]) - + "," - + str(grid_extent[1]) + + str(grid_extent[2]) + "," + str(grid_extent[3]) + "] Expected\n" ) - var_spatial_errors += 1 + errors += 1 Xresolution = round((coords["x"][1].values - coords["x"][0].values) / 1000, 0) Yresolution = round((coords["y"][1].values - coords["y"][0].values) / 1000, 0) - if Xresolution in set(possible_resolution) and Yresolution in set( - possible_resolution - ): - if Xresolution == grid_resolution and Yresolution == grid_resolution: - log_file.write( - " - The grid resolution (" - + str(Xresolution) - + ") was successfully verified.\n" - ) - else: - log_file.write( - " - ERROR: The grid resolution ( " - + str(Xresolution) - + " or " - + str(Yresolution) - + ") is different of " - + str(grid_resolution) - + "declared in the file name.\n" - ) - var_spatial_errors += 1 + if Xresolution in set(possible_resolution) and Yresolution in set(possible_resolution): + log_file.write( + " - The grid resolution (" + + str(int(Xresolution)) + + " km) was successfully verified.\n" + ) else: log_file.write( - " - Error: x: " + " - ERROR: resolution x=" + str(Xresolution) - + ",y: " + + " km, y=" + str(Yresolution) - + " is not an authorized grid resolution.\n" + + " km is not an authorized grid resolution. Allowed: " + + str(possible_resolution) + + " km\n" ) - var_spatial_errors += 1 - - var_time_errors = _run_time_checks( - log_file=log_file, - ds=ds, - dim=dim, - experiments=experiments, - experiment_name=experiment_name, - var_time_errors=var_time_errors, - ) - return var_spatial_errors, var_time_errors + errors += 1 + + return errors -def _run_time_checks( +def _check_time( log_file, ds, - dim, - experiments, + dim: set, + experiments: list, experiment_name: str, - var_time_errors: int, -): +) -> int: + errors = 0 + log_file.write("TIME Tests \n") - if not (set(["t"]).issubset(dim) or set(["time"]).issubset(dim)): + if not ({"t"}.issubset(dim) or {"time"}.issubset(dim)): + log_file.write( + " - ERROR: The time dimension is missing. Time Tests have been ignored.\n" + ) + return errors + 1 + + time_dim = "time" if "time" in ds.dims else "t" + unlimited_dims = ds.encoding.get("unlimited_dims", set()) + if time_dim in unlimited_dims: + log_file.write(" - Time is a record (unlimited) dimension: OK\n") + else: log_file.write( - " - ERROR: The time dimensions is missing. 
Time Tests have been ignored.\n" + f" - ERROR: dimension '{time_dim}' is not a record (unlimited) dimension.\n" ) - return var_time_errors + 1 + errors += 1 - iteration = len(ds.coords["time"]) start_exp = min(ds["time"]).values.astype("datetime64[D]") end_exp = max(ds["time"]).values.astype("datetime64[D]") - avgyear = AVERAGE_YEAR_DAYS - duration_days = end_exp - start_exp - duration_years = duration_days.astype("timedelta64[Y]") / np.timedelta64(1, "Y") - _ = duration_years + duration_years = end_exp.item().year - start_exp.item().year + 1 index_exp = [dic["experiment"] for dic in experiments].index(experiment_name) if not ( np.issubdtype(start_exp.dtype, np.datetime64) - & np.issubdtype(start_exp.dtype, np.datetime64) + & np.issubdtype(end_exp.dtype, np.datetime64) ): log_file.write( - " - ERROR: the time format of the Netcdf file is not recognized.Time Tests have been ignored.\n" + " - ERROR: the time format of the Netcdf file is not recognized. Time Tests have been ignored.\n" ) - return var_time_errors + 1 + return errors + 1 if not _strictly_increasing(ds.coords["time"]): log_file.write( - " - ERROR: the time serie is not monotonous. Time segments have probably been concatenate in a wrong order.\n" + " - ERROR: the time series is not monotonically increasing. Time segments may have been concatenated in the wrong order.\n" ) - return var_time_errors + 1 + return errors + 1 - if isinstance(ds["time"].values[1] - ds["time"].values[0], datetime.timedelta): - time_step = (ds["time"].values[1] - ds["time"].values[0]).days - else: - if isinstance(ds["time"].values[1] - ds["time"].values[0], np.timedelta64): + if len(ds["time"].values) > 1: + if isinstance(ds["time"].values[1] - ds["time"].values[0], datetime.timedelta): + time_step = (ds["time"].values[1] - ds["time"].values[0]).days + elif isinstance(ds["time"].values[1] - ds["time"].values[0], np.timedelta64): time_step = np.timedelta64( - ds["time"].values[1] - ds["time"].values[0], - "D", + ds["time"].values[1] - ds["time"].values[0], "D" ) / np.timedelta64(1, "D") else: - time_step = ds["time"].values[1] - ds["time"].values[10] + time_step = ds["time"].values[1] - ds["time"].values[0] - if TIME_STEP_MIN_DAYS <= time_step <= TIME_STEP_MAX_DAYS: - log_file.write(" - Time step: " + str(time_step) + " days" + "\n") + if TIME_STEP_MIN_DAYS <= time_step <= TIME_STEP_MAX_DAYS: + log_file.write(" - Time step: " + str(time_step) + " days\n") + else: + log_file.write( + " - ERROR: the time step (" + + str(time_step) + + ") should be comprised between [" + + str(TIME_STEP_MIN_DAYS) + + " and " + + str(TIME_STEP_MAX_DAYS) + + "].\n" + ) + errors += 1 else: - log_file.write( - " - ERROR: the time step(" - + str(time_step) - + ") should be comprised between [360,367].\n" - ) - var_time_errors += 1 + log_file.write(" - Only one time step present; time step check skipped.\n") + + exp = experiments[index_exp] + dateformat_end_exp = datetime.datetime( + end_exp.item().year, end_exp.item().month, end_exp.item().day + ) + + if exp["duration"] == -1: + # Undetermined length: only require >= 1 year, start and end date in range + if duration_years >= 1: + log_file.write( + " - Experiment lasts " + str(duration_years) + " years (>= 1 year required): OK\n" + ) + else: + log_file.write( + " - ERROR: the experiment lasts " + + str(duration_years) + + " years but must be at least 1 year long.\n" + ) + errors += 1 - duration_days = pd.to_timedelta(time_step * iteration, "D") - duration_years = round(pd.to_numeric(duration_days.days / avgyear)) - if 
duration_years == experiments[index_exp]["duration"]: - log_file.write(" - Experiment lasts " + str(duration_years) + " years.\n") dateformat_start_exp = datetime.datetime( - start_exp.item().year, - start_exp.item().month, - start_exp.item().day, + start_exp.item().year, start_exp.item().month, start_exp.item().day ) - if ( - experiments[index_exp]["startinf"] - <= dateformat_start_exp - <= experiments[index_exp]["startsup"] - ): + if exp["startinf"] <= dateformat_start_exp <= exp["startsup"]: log_file.write( " - Experiment starts correctly on " + start_exp.item().strftime("%Y-%m-%d") @@ -915,23 +954,14 @@ def _run_time_checks( " - ERROR: the experiment starts the " + start_exp.item().strftime("%Y-%m-%d") + ". The date should be comprised between " - + experiments[index_exp]["startinf"].strftime("%Y-%m-%d") + + exp["startinf"].strftime("%Y-%m-%d") + " and " - + experiments[index_exp]["startsup"].strftime("%Y-%m-%d") + + exp["startsup"].strftime("%Y-%m-%d") + "\n" ) - var_time_errors += 1 + errors += 1 - dateformat_end_exp = datetime.datetime( - end_exp.item().year, - end_exp.item().month, - end_exp.item().day, - ) - if ( - experiments[index_exp]["endinf"] - <= dateformat_end_exp - <= experiments[index_exp]["endsup"] - ): + if exp["endinf"] <= dateformat_end_exp <= exp["endsup"]: log_file.write( " - Experiment ends correctly on " + end_exp.item().strftime("%Y-%m-%d") @@ -942,31 +972,303 @@ def _run_time_checks( " - ERROR: the experiment ends on " + end_exp.item().strftime("%Y-%m-%d") + ". The date should be comprised between " - + experiments[index_exp]["endinf"].strftime("%Y-%m-%d") + + exp["endinf"].strftime("%Y-%m-%d") + " and " - + experiments[index_exp]["endsup"].strftime("%Y-%m-%d") + + exp["endsup"].strftime("%Y-%m-%d") + "\n" ) - var_time_errors += 1 + errors += 1 else: - end_date = start_exp + np.timedelta64(experiments[2]["duration"] * 365, "D") + if duration_years == exp["duration"]: + log_file.write(" - Experiment lasts " + str(duration_years) + " years.\n") + dateformat_start_exp = datetime.datetime( + start_exp.item().year, + start_exp.item().month, + start_exp.item().day, + ) + if exp["startinf"] <= dateformat_start_exp <= exp["startsup"]: + log_file.write( + " - Experiment starts correctly on " + + start_exp.item().strftime("%Y-%m-%d") + + ".\n" + ) + else: + log_file.write( + " - ERROR: the experiment starts the " + + start_exp.item().strftime("%Y-%m-%d") + + ". The date should be comprised between " + + exp["startinf"].strftime("%Y-%m-%d") + + " and " + + exp["startsup"].strftime("%Y-%m-%d") + + "\n" + ) + errors += 1 + + if exp["endinf"] <= dateformat_end_exp <= exp["endsup"]: + log_file.write( + " - Experiment ends correctly on " + + end_exp.item().strftime("%Y-%m-%d") + + ".\n" + ) + else: + log_file.write( + " - ERROR: the experiment ends on " + + end_exp.item().strftime("%Y-%m-%d") + + ". The date should be comprised between " + + exp["endinf"].strftime("%Y-%m-%d") + + " and " + + exp["endsup"].strftime("%Y-%m-%d") + + "\n" + ) + errors += 1 + else: + end_date = start_exp + np.timedelta64(exp["duration"] * 365, "D") + log_file.write( + " - ERROR: the experiment lasts " + + str(duration_years) + + " years. 
The duration should be " + + str(exp["duration"]) + + " years\n" + ) + log_file.write( + " - As the experiment started on " + + start_exp.item().strftime("%Y-%m-%d") + + " , it should end on " + + end_date.item().strftime("%Y-%m-%d") + + "\n" + ) + errors += 1 + + return errors + + +def _check_attributes( + log_file, + ds, + ivar: str, + ismip_meta: list, + var_index: int, + isscalar: bool, + var_type: str, + region: str, +) -> int: + errors = 0 + + log_file.write("ATTRIBUTE Tests \n") + + # Sub-test 1: global attributes + required_global = ["group", "model", "contact_name", "contact_email"] + global_errors = 0 + for attr in required_global: + if attr not in ds.attrs: + log_file.write(f" - ERROR (attributes): global attribute '{attr}' is missing.\n") + global_errors += 1 + expected_crs = "epsg:3413" if region == "GrIS" else "epsg:3031" + actual_crs = ds.attrs.get("crs") + if actual_crs is None: + log_file.write(" - ERROR (attributes): global attribute 'crs' is missing.\n") + global_errors += 1 + elif actual_crs.lower() != expected_crs: log_file.write( - " - ERROR: the experiment lasts " - + str(duration_years) - + " years. The duration should be " - + str(experiments[index_exp]["duration"]) - + " years\n" + f" - ERROR (attributes): global attribute 'crs' is '{actual_crs}'," + f" expected '{expected_crs}' (case-insensitive) for region {region}.\n" ) + global_errors += 1 + if global_errors == 0: + log_file.write(" - Global attributes: OK\n") + errors += global_errors + + # Sub-test 2: coordinate attributes + coord_errors = 0 + time_coord = None + for name in ("time", "t"): + if name in ds.coords: + time_coord = name + break + if time_coord is None: + log_file.write(" - ERROR (attributes): coordinate 'time' not found.\n") + coord_errors += 1 + else: + # xarray decodes 'units' and 'calendar' into .encoding; 'bounds' stays in .attrs + time_var = ds[time_coord] + combined = {**time_var.encoding, **time_var.attrs} + time_attrs_required = ["units", "calendar"] + if var_type != "ST": + time_attrs_required.append("bounds") + for attr in time_attrs_required: + if attr not in combined: + log_file.write( + f" - ERROR (attributes): coordinate '{time_coord}' missing attribute '{attr}'.\n" + ) + coord_errors += 1 + if "units" in combined and combined["units"] != "days since 1850-01-01": + log_file.write( + f" - ERROR (attributes): time 'units' is '{combined['units']}', expected 'days since 1850-01-01'.\n" + ) + coord_errors += 1 + if "calendar" in combined and combined["calendar"] != "standard": + log_file.write( + f" - ERROR (attributes): time 'calendar' is '{combined['calendar']}', expected 'standard'.\n" + ) + coord_errors += 1 + if not isscalar: + for coord in ("x", "y"): + if coord in ds.coords: + if "units" not in ds[coord].attrs: + log_file.write( + f" - ERROR (attributes): coordinate '{coord}' missing attribute 'units'.\n" + ) + coord_errors += 1 + else: + log_file.write( + f" - ERROR (attributes): coordinate '{coord}' not found.\n" + ) + coord_errors += 1 + if coord_errors == 0: + log_file.write(" - Coordinate attributes: OK\n") + errors += coord_errors + + # Sub-test 3: variable standard_name + var_errors = 0 + expected_standard_name = ismip_meta[var_index].get("standard_name") + if expected_standard_name is not None and ivar in ds: + if "standard_name" not in ds[ivar].attrs: + log_file.write( + f" - ERROR (attributes): variable '{ivar}' missing 'standard_name' attribute.\n" + ) + var_errors += 1 + elif ds[ivar].attrs["standard_name"] != expected_standard_name: + log_file.write( + f" - ERROR 
(attributes): variable '{ivar}' standard_name" + f" '{ds[ivar].attrs['standard_name']}'" + f" does not match expected '{expected_standard_name}'.\n" + ) + var_errors += 1 + if var_errors == 0: + log_file.write(" - Variable attributes: OK\n") + errors += var_errors + + # Sub-test 4: _FillValue must equal the default netCDF4 fill value; + # if missing_value is also present it must equal _FillValue. + fill_errors = 0 + if ivar in ds: + dtype = ds[ivar].dtype + nc4_dtype_key = dtype.kind + str(dtype.itemsize) + default_fill = netCDF4.default_fillvals.get(nc4_dtype_key) + fill_value = ds[ivar].encoding.get("_FillValue") + # xarray moves missing_value from attrs to encoding on read (CF fill-value handling) + missing_value = ds[ivar].attrs.get("missing_value") or ds[ivar].encoding.get("missing_value") + if fill_value is None: + log_file.write(f" - ERROR (attributes): variable '{ivar}' missing '_FillValue'.\n") + fill_errors += 1 + elif default_fill is not None and fill_value != default_fill: + log_file.write( + f" - ERROR (attributes): variable '{ivar}' _FillValue {fill_value}" + f" does not match default netCDF4 fill value {default_fill} for dtype {dtype}.\n" + ) + fill_errors += 1 + if fill_value is not None and missing_value is not None and fill_value != missing_value: + log_file.write( + f" - ERROR (attributes): variable '{ivar}' _FillValue {fill_value}" + f" and missing_value {missing_value} are not equal.\n" + ) + fill_errors += 1 + if fill_errors == 0: + log_file.write(" - Fill value attributes: OK\n") + errors += fill_errors + + # Sub-test 5: main variable and time must be single-precision float (f4) + dtype_errors = 0 + if ivar in ds and ds[ivar].dtype != np.float32: log_file.write( - " - As the experiment started on " - + start_exp.item().strftime("%Y-%m-%d") - + " , it should end on " - + end_date.item().strftime("%Y-%m-%d") - + "\n" + f" - ERROR (attributes): variable '{ivar}' dtype is {ds[ivar].dtype}," + f" expected float32 (f4).\n" ) - var_time_errors += 1 + dtype_errors += 1 + if time_coord is not None: + # xarray decodes CF time to datetime objects in memory; check the on-disk dtype from encoding + time_encoded_dtype = ds[time_coord].encoding.get("dtype", ds[time_coord].dtype) + if time_encoded_dtype != np.float32: + log_file.write( + f" - ERROR (attributes): coordinate '{time_coord}' on-disk dtype is {time_encoded_dtype}," + f" expected float32 (f4).\n" + ) + dtype_errors += 1 + if dtype_errors == 0: + log_file.write(" - Dtype attributes: OK\n") + errors += dtype_errors + + # Sub-test 6: scale_factor and add_offset must not be present + pack_errors = 0 + if ivar in ds: + # xarray moves these to .encoding on decode; check both locations + combined = {**ds[ivar].attrs, **ds[ivar].encoding} + for forbidden in ("scale_factor", "add_offset"): + if forbidden in combined: + log_file.write( + f" - ERROR (attributes): variable '{ivar}' must not have '{forbidden}'.\n" + ) + pack_errors += 1 + if pack_errors == 0: + log_file.write(" - Packing attributes: OK\n") + errors += pack_errors + + return errors + + +def _run_variable_checks( + log_file, + ds, + file_name: str, + considered_variable: str, + experiment_name: str, + file_variables, + region: str, + ismip_var, + ismip_meta, + experiments, + report_naming_issues, +): + var_naming_errors = 0 + var_num_errors = 0 + var_spatial_errors = 0 + var_time_errors = 0 + var_attr_errors = 0 + + log_file.write(" \n") + log_file.write("Experiment: " + experiment_name + " - File: " + file_name + "\n") + log_file.write(" \n") + + header_ds = 
ds.to_dict(data=False) + dim = set(list(header_ds["coords"].keys())) + + index = ismip_var.index(considered_variable) + isscalar = ismip_meta[index]["dim"] == "t" + + n_err = _check_naming(log_file, ds, file_name, region, dim, isscalar, report_naming_issues) + var_naming_errors += n_err + if n_err > 0: + return var_naming_errors, var_num_errors, var_spatial_errors, var_time_errors, var_attr_errors - return var_time_errors + grid_extent = AIS_GRID_EXTENT if region == "AIS" else GrIS_GRID_EXTENT + possible_resolution = AIS_POSSIBLE_RESOLUTION if region == "AIS" else GrIS_POSSIBLE_RESOLUTION + + for ivar in file_variables: + if ivar in ismip_var: + log_file.write("** Tested Variable: " + ivar + "\n") + log_file.write(" \n") + var_index = [k for k in range(len(ismip_var)) if ismip_var[k] == ivar][0] + + var_num_errors += _check_numerical(log_file, ds, ivar, ismip_meta, var_index, region, isscalar) + + if not isscalar: + var_spatial_errors += _check_spatial(log_file, ds, grid_extent, possible_resolution) + + var_time_errors += _check_time(log_file, ds, dim, experiments, experiment_name) + + var_attr_errors += _check_attributes(log_file, ds, ivar, ismip_meta, var_index, isscalar, ismip_meta[var_index].get("type", ""), region) + + return var_naming_errors, var_num_errors, var_spatial_errors, var_time_errors, var_attr_errors def _print_experiment_summary(experiment_name: str, exp_errors: int) -> None: @@ -994,23 +1296,12 @@ def _print_total_summary(source_path: str, total_errors: int) -> None: print("-------------------------------------------------------------------------") -def _files_and_subdirectories(path: str): - files = [] - directories = [] - for f in os.listdir(path): - if os.path.isfile(os.path.join(path, f)): - files.append(f) - elif os.path.isdir(os.path.join(path, f)): - directories.append(f) - return directories, files - - def _strictly_increasing(values) -> bool: return all(x < y for x, y in zip(values, values[1:])) def _write_log_header( - log_file, commit_num: str, source_path: str, today: datetime.date + log_file, commit_num: str, source_path: str, today: datetime.date, criteria_file: str, ) -> None: log_file.write( "************************************************************************************\n" ) @@ -1022,9 +1313,9 @@ def _write_log_header( "************************************************************************************\n" ) log_file.write(f"Commit Number: {commit_num} \n") - log_file.write("verification criteria: ismip6_criteria.csv \n") + log_file.write("verification criteria: " + criteria_file + "\n") log_file.write("date: " + today.strftime("%Y/%m/%d") + "\n") - log_file.write("source: https://github.com/jbbarre/ISM_SimulationChecker \n") + log_file.write("source: https://github.com/ismip/ISM_SimulationChecker \n") log_file.write(" \n") log_file.write( "------------------------------------------------------------------------------------\n" ) @@ -1046,7 +1337,7 @@ def _write_log_header( log_file.write( "====================================================================================\n" ) - log_file.write("Tips: Use Cltr+F to look for specific problems. \n") + log_file.write("Hint: Use Ctrl+F to look for specific problems. 
\n") log_file.write(" \n") @@ -1060,7 +1351,7 @@ def _insert_synthesis( total_num_errors: int, total_spatial_errors: int, total_time_errors: int, - total_warnings: int, + total_attr_errors: int, report_naming_issues, ) -> None: with open(os.path.join(source_path, "compliance_checker_log.txt"), "r") as f: @@ -1069,33 +1360,23 @@ def _insert_synthesis( iline = 11 contents.insert(iline, str(exp_counter) + " experiments checked.\n") iline += 1 - contents.insert( - iline, str(file_counter) + " files checked (Scalar files are ignored).\n" - ) + contents.insert(iline, str(file_counter) + " files checked.\n") iline += 2 contents.insert(iline, str(total_errors) + " error(s) detected.\n") iline += 1 - contents.insert( - iline, " - Mandatory variables: " + str(total_file_errors) + " error(s)\n" - ) + contents.insert(iline, " - Mandatory variables: " + str(total_file_errors) + " error(s)\n") iline += 1 - contents.insert( - iline, " - Naming Tests : " + str(total_naming_errors) + " error(s)\n" - ) + contents.insert(iline, " - Naming Tests : " + str(total_naming_errors) + " error(s)\n") iline += 1 - contents.insert( - iline, " - Numerical Tests : " + str(total_num_errors) + " error(s)\n" - ) + contents.insert(iline, " - Numerical Tests : " + str(total_num_errors) + " error(s)\n") iline += 1 - contents.insert( - iline, " - Spatial Tests : " + str(total_spatial_errors) + " error(s)\n" - ) + contents.insert(iline, " - Spatial Tests : " + str(total_spatial_errors) + " error(s)\n") iline += 1 - contents.insert( - iline, " - Time Tests : " + str(total_time_errors) + " error(s)\n" - ) + contents.insert(iline, " - Time Tests : " + str(total_time_errors) + " error(s)\n") + iline += 1 + contents.insert(iline, " - Attribute Tests : " + str(total_attr_errors) + " error(s)\n") iline += 2 - contents.insert(iline, str(total_warnings) + " warning(s) detected.\n") + contents.insert(iline, "0 warning(s) detected.\n") iline += 2 if total_naming_errors > 0: contents.insert(iline, "Naming tests errors report: \n") diff --git a/conventions/ISMIP7_variable_request.xlsx b/conventions/ISMIP7_variable_request.xlsx new file mode 100644 index 0000000..92a42d4 Binary files /dev/null and b/conventions/ISMIP7_variable_request.xlsx differ diff --git a/experiments_ismip6.csv b/experiments_ismip6.csv deleted file mode 100644 index 18cc78f..0000000 --- a/experiments_ismip6.csv +++ /dev/null @@ -1,17 +0,0 @@ -experiment;startinf;startsup;endinf;endsup;duration -hist;1979-06-30;1980-01-01;2014-06-30;2015-01-01;35 -ctrl;1979-06-30;1980-01-01;2100-06-30;2101-01-01;120 -ctrl_proj;2015-01-01;2016-01-02;2100-07-01;2101-01-01;86 -exp01;2015-01-01;2016-01-02;2100-06-30;2101-01-01;86 -exp02;2015-01-01;2016-01-02;2100-06-30;2101-01-01;86 -exp03;2015-01-01;2016-01-02;2100-06-30;2101-01-01;86 -exp04;2015-01-01;2016-01-02;2100-06-30;2101-01-01;86 -exp05;2015-01-01;2016-01-02;2100-06-30;2101-01-01;86 -exp06;2015-01-01;2016-01-02;2100-06-30;2101-01-01;86 -exp07;2015-01-01;2016-01-02;2100-06-30;2101-01-01;86 -exp08;2015-01-01;2016-01-02;2100-06-30;2101-01-01;86 -exp09;2015-01-01;2016-01-02;2100-06-30;2101-01-01;86 -exp10;2015-01-01;2016-01-02;2100-06-30;2101-01-01;86 -exp11;2015-01-01;2016-01-02;2100-06-30;2101-01-01;86 -exp12;2015-01-01;2016-01-02;2100-06-30;2101-01-01;86 -exp13;2015-01-01;2016-01-02;2100-06-30;2101-01-01;86 diff --git a/experiments_ismip6_ext.csv b/experiments_ismip6_ext.csv deleted file mode 100644 index 51fcde5..0000000 --- a/experiments_ismip6_ext.csv +++ /dev/null @@ -1,16 +0,0 @@ 
-experiment;startinf;startsup;endinf;endsup;duration -ctrlAE;2015-01-01;2016-01-02;2300-07-01;2301-01-01;286 -expAE01;2015-01-01;2016-01-02;2300-06-30;2301-01-01;286 -expAE02;2015-01-01;2016-01-02;2300-06-30;2301-01-01;286 -expAE03;2015-01-01;2016-01-02;2300-06-30;2301-01-01;286 -expAE04;2015-01-01;2016-01-02;2300-06-30;2301-01-01;286 -expAE05;2015-01-01;2016-01-02;2300-06-30;2301-01-01;286 -expAE06;2015-01-01;2016-01-02;2300-06-30;2301-01-01;286 -expAE07;2015-01-01;2016-01-02;2300-06-30;2301-01-01;286 -expAE08;2015-01-01;2016-01-02;2300-06-30;2301-01-01;286 -expAE09;2015-01-01;2016-01-02;2300-06-30;2301-01-01;286 -expAE10;2015-01-01;2016-01-02;2300-06-30;2301-01-01;286 -expAE11;2015-01-01;2016-01-02;2300-06-30;2301-01-01;286 -expAE12;2015-01-01;2016-01-02;2300-06-30;2301-01-01;286 -expAE13;2015-01-01;2016-01-02;2300-06-30;2301-01-01;286 -expAE14;2015-01-01;2016-01-02;2300-06-30;2301-01-01;286 diff --git a/experiments_ismip7.csv b/experiments_ismip7.csv new file mode 100644 index 0000000..8b55547 --- /dev/null +++ b/experiments_ismip7.csv @@ -0,0 +1,6 @@ +experiment;startinf;startsup;endinf;endsup;duration +historical;1850-01-01;2013-12-31;2014-06-30;2015-01-01;-1 +ssp370;2015-01-01;2016-01-02;2100-06-30;2101-01-01;86 +ssp126;2015-01-01;2016-01-02;2300-06-30;2301-01-01;286 +ssp585;2015-01-01;2016-01-02;2300-06-30;2301-01-01;286 +ctrl;2015-01-01;2016-01-02;2300-06-30;2301-01-01;286 diff --git a/gdfs/gdf_ISMIP7_AIS_02000m.txt b/gdfs/gdf_ISMIP7_AIS_02000m.txt new file mode 100644 index 0000000..a9200f1 --- /dev/null +++ b/gdfs/gdf_ISMIP7_AIS_02000m.txt @@ -0,0 +1,16 @@ +# +# ISMIP7 grid description file +# Antarctica EPSG:3031 +# +gridtype = projection +xsize = 3041 +ysize = 3041 +xunits = "meter" +yunits = "meter" +xfirst = -3040000 +yfirst = -3040000 +xinc = 2000 +yinc = 2000 +grid_mapping = crs +grid_mapping_name = polar_stereographic +proj_params = "+proj=stere +lon_0=0 +lat_ts=71 +lat_0=-90 +x_0=0 +y_0=0" diff --git a/gdfs/gdf_ISMIP7_AIS_04000m.txt b/gdfs/gdf_ISMIP7_AIS_04000m.txt new file mode 100644 index 0000000..86f7f0f --- /dev/null +++ b/gdfs/gdf_ISMIP7_AIS_04000m.txt @@ -0,0 +1,16 @@ +# +# ISMIP7 grid description file +# Antarctica EPSG:3031 +# +gridtype = projection +xsize = 1521 +ysize = 1521 +xunits = "meter" +yunits = "meter" +xfirst = -3040000 +yfirst = -3040000 +xinc = 4000 +yinc = 4000 +grid_mapping = crs +grid_mapping_name = polar_stereographic +proj_params = "+proj=stere +lon_0=0 +lat_ts=71 +lat_0=-90 +x_0=0 +y_0=0" diff --git a/gdfs/gdf_ISMIP7_AIS_08000m.txt b/gdfs/gdf_ISMIP7_AIS_08000m.txt new file mode 100644 index 0000000..076a520 --- /dev/null +++ b/gdfs/gdf_ISMIP7_AIS_08000m.txt @@ -0,0 +1,16 @@ +# +# ISMIP7 grid description file +# Antarctica EPSG:3031 +# +gridtype = projection +xsize = 761 +ysize = 761 +xunits = "meter" +yunits = "meter" +xfirst = -3040000 +yfirst = -3040000 +xinc = 8000 +yinc = 8000 +grid_mapping = crs +grid_mapping_name = polar_stereographic +proj_params = "+proj=stere +lon_0=0 +lat_ts=71 +lat_0=-90 +x_0=0 +y_0=0" diff --git a/gdfs/gdf_ISMIP7_AIS_16000m.txt b/gdfs/gdf_ISMIP7_AIS_16000m.txt new file mode 100644 index 0000000..bebd6ae --- /dev/null +++ b/gdfs/gdf_ISMIP7_AIS_16000m.txt @@ -0,0 +1,16 @@ +# +# ISMIP7 grid description file +# Antarctica EPSG:3031 +# +gridtype = projection +xsize = 381 +ysize = 381 +xunits = "meter" +yunits = "meter" +xfirst = -3040000 +yfirst = -3040000 +xinc = 16000 +yinc = 16000 +grid_mapping = crs +grid_mapping_name = polar_stereographic +proj_params = "+proj=stere +lon_0=0 +lat_ts=71 +lat_0=-90 +x_0=0 +y_0=0" diff 
--git a/gdfs/gdf_ISMIP7_AIS_32000m.txt b/gdfs/gdf_ISMIP7_AIS_32000m.txt new file mode 100644 index 0000000..2daeaf2 --- /dev/null +++ b/gdfs/gdf_ISMIP7_AIS_32000m.txt @@ -0,0 +1,16 @@ +# +# ISMIP7 grid description file +# Antarctica EPSG:3031 +# +gridtype = projection +xsize = 191 +ysize = 191 +xunits = "meter" +yunits = "meter" +xfirst = -3040000 +yfirst = -3040000 +xinc = 32000 +yinc = 32000 +grid_mapping = crs +grid_mapping_name = polar_stereographic +proj_params = "+proj=stere +lon_0=0 +lat_ts=71 +lat_0=-90 +x_0=0 +y_0=0" diff --git a/gdfs/gdf_ISMIP7_GrIS_01000m.txt b/gdfs/gdf_ISMIP7_GrIS_01000m.txt new file mode 100644 index 0000000..5ab820d --- /dev/null +++ b/gdfs/gdf_ISMIP7_GrIS_01000m.txt @@ -0,0 +1,16 @@ +# +# ISMIP7 grid description file +# Greenland EPSG:3413 +# +gridtype = projection +xsize = 1681 +ysize = 2881 +xunits = "meter" +yunits = "meter" +xfirst = -720000 +yfirst = -3450000 +xinc = 1000 +yinc = 1000 +grid_mapping = crs +grid_mapping_name = polar_stereographic +proj_params = "+proj=stere +lon_0=-45 +lat_ts=70 +lat_0=90 +x_0=0 +y_0=0" diff --git a/gdfs/gdf_ISMIP7_GrIS_02000m.txt b/gdfs/gdf_ISMIP7_GrIS_02000m.txt new file mode 100644 index 0000000..18eff11 --- /dev/null +++ b/gdfs/gdf_ISMIP7_GrIS_02000m.txt @@ -0,0 +1,16 @@ +# +# ISMIP7 grid description file +# Greenland EPSG:3413 +# +gridtype = projection +xsize = 841 +ysize = 1441 +xunits = "meter" +yunits = "meter" +xfirst = -720000 +yfirst = -3450000 +xinc = 2000 +yinc = 2000 +grid_mapping = crs +grid_mapping_name = polar_stereographic +proj_params = "+proj=stere +lon_0=-45 +lat_ts=70 +lat_0=90 +x_0=0 +y_0=0" diff --git a/gdfs/gdf_ISMIP7_GrIS_04000m.txt b/gdfs/gdf_ISMIP7_GrIS_04000m.txt new file mode 100644 index 0000000..f4e0bac --- /dev/null +++ b/gdfs/gdf_ISMIP7_GrIS_04000m.txt @@ -0,0 +1,16 @@ +# +# ISMIP7 grid description file +# Greenland EPSG:3413 +# +gridtype = projection +xsize = 421 +ysize = 721 +xunits = "meter" +yunits = "meter" +xfirst = -720000 +yfirst = -3450000 +xinc = 4000 +yinc = 4000 +grid_mapping = crs +grid_mapping_name = polar_stereographic +proj_params = "+proj=stere +lon_0=-45 +lat_ts=70 +lat_0=90 +x_0=0 +y_0=0" diff --git a/gdfs/gdf_ISMIP7_GrIS_05000m.txt b/gdfs/gdf_ISMIP7_GrIS_05000m.txt new file mode 100644 index 0000000..e28a980 --- /dev/null +++ b/gdfs/gdf_ISMIP7_GrIS_05000m.txt @@ -0,0 +1,16 @@ +# +# ISMIP7 grid description file +# Greenland EPSG:3413 +# +gridtype = projection +xsize = 337 +ysize = 577 +xunits = "meter" +yunits = "meter" +xfirst = -720000 +yfirst = -3450000 +xinc = 5000 +yinc = 5000 +grid_mapping = crs +grid_mapping_name = polar_stereographic +proj_params = "+proj=stere +lon_0=-45 +lat_ts=70 +lat_0=90 +x_0=0 +y_0=0" diff --git a/gdfs/gdf_ISMIP7_GrIS_08000m.txt b/gdfs/gdf_ISMIP7_GrIS_08000m.txt new file mode 100644 index 0000000..93b3edb --- /dev/null +++ b/gdfs/gdf_ISMIP7_GrIS_08000m.txt @@ -0,0 +1,16 @@ +# +# ISMIP7 grid description file +# Greenland EPSG:3413 +# +gridtype = projection +xsize = 211 +ysize = 361 +xunits = "meter" +yunits = "meter" +xfirst = -720000 +yfirst = -3450000 +xinc = 8000 +yinc = 8000 +grid_mapping = crs +grid_mapping_name = polar_stereographic +proj_params = "+proj=stere +lon_0=-45 +lat_ts=70 +lat_0=90 +x_0=0 +y_0=0" diff --git a/gdfs/gdf_ISMIP7_GrIS_16000m.txt b/gdfs/gdf_ISMIP7_GrIS_16000m.txt new file mode 100644 index 0000000..b20a330 --- /dev/null +++ b/gdfs/gdf_ISMIP7_GrIS_16000m.txt @@ -0,0 +1,16 @@ +# +# ISMIP7 grid description file +# Greenland EPSG:3413 +# +gridtype = projection +xsize = 106 +ysize = 181 +xunits = "meter" +yunits = 
"meter" +xfirst = -720000 +yfirst = -3450000 +xinc = 16000 +yinc = 16000 +grid_mapping = crs +grid_mapping_name = polar_stereographic +proj_params = "+proj=stere +lon_0=-45 +lat_ts=70 +lat_0=90 +x_0=0 +y_0=0" diff --git a/ismip6_criteria.csv b/ismip6_criteria.csv deleted file mode 100644 index 2b30853..0000000 --- a/ismip6_criteria.csv +++ /dev/null @@ -1,28 +0,0 @@ -variable;dim;type;variable_name;standard_name;alias_name;units;mandatory;min_value_ais;max_value_ais;min_value_gis;max_value_gis -acabf;x-y-t;FL;Surface mass balance flux;land_ice_surface_specific_mass_balance_flux;;kg m-2 s-1;1;-0,0001;0,0010;-0,0001;0,0010 -base;x-y-t;ST;Base elevation;base_altitude;;m;1;-4000;4000;-3000;4000 -dlithkdt;x-y-t;FL;Ice thickness imbalance;tendency_of_land_ice_thickness;;m s-1;0;-0,0001000000;0,000100000;-0,0001000000;0,000100000 -hfgeoubed;x-y-t;FL;Geothermal heat flux;upward_geothermal_heat_flux_in_land_ice;upward_geothermal_heat_flux_at_ground_level;W m-2;0;0,0;0,3;0,0;0,3 -libmassbffl;x-y-t;FL;Basal mass balance flux beneath floating ice;land_ice_basal_specific_mass_balance_flux;;kg m-2 s-1;1;-0,0040;0,0010;-0,0040;0,0010 -libmassbfgr;x-y-t;FL;Basal mass balance flux beneath grounded ice;land_ice_basal_specific_mass_balance_flux;;kg m-2 s-1;1;-0,0003;0,0000;-0,0003;0,0000 -licalvf;x-y-t;FL;Calving flux;land_ice_specific_mass_flux_due_to_calving;;kg m-2 s-1;0;0;100000000000;0;100000000000 -lifmassbf;x-y-t;FL;Ice front melt and calving flux;land_ice_specific_mass_flux_due_to_calving_and_ice_front_melting;;kg m-2 s-1;0;0;100000000000;0;100000000000 -ligroundf;x-y-t;FL;Grounding line flux;land_ice_specific_mass_flux_at_grounding_line;;kg m-2 s-1;0;100000;100000000000;100000;100000000000 -litempbotfl;x-y-t;ST;Basal temperature beneath floating ice shelf;temperature_at_base_of_ice_sheet_model;land_ice_basal_temperature;K;0;183;274;183;274 -litempbotgr;x-y-t;ST;Basal temperature beneath grounded ice sheet;temperature_at_base_of_ice_sheet_model;land_ice_basal_temperature;K;0;183;274;183;274 -litemptop;x-y-t;ST;Surface temperature;temperature_at_top_of_ice_sheet_model;temperature_at_ground_level_in_snow_or_firn;K;0;183;274;183;274 -lithk;x-y-t;ST;Ice thickness;land_ice_thickness;;m;1;0;5000;0;5000 -orog;x-y-t;ST;Surface elevation;surface_altitude;;m;1;0;4500;0;4500 -sftflf;x-y-t;ST;Floating ice sheet area fraction;floating_ice_shelf_area_fraction;floating_ice_sheet_area_fraction;1;1;0;1;0;1 -sftgif;x-y-t;ST;Land ice area fraction;land_ice_area_fraction;;1;1;0;1;0;1 -sftgrf;x-y-t;ST;Grounded ice sheet area fraction;grounded_ice_sheet_area_fraction;;1;1;0;1;0;1 -strbasemag;x-y-t;ST;Basal drag;land_ice_basal_drag;magnitude_of_land_ice_basal_drag;Pa;1;0;200000;0;200000 -topg;x-y-t;ST;Bedrock elevation;bedrock_altitude;;m;1;-7000;4000;-3000;4000 -xvelbase;x-y-t;ST;Basal velocity in x;land_ice_basal_x_velocity;;m s-1;0;-0,000400;0,000400;-0,000800;0,000800 -xvelmean;x-y-t;ST;Mean velocity in x;land_ice_vertical_mean_x_velocity;;m s-1;1;-0,000400;0,000400;-0,000800;0,000800 -xvelsurf;x-y-t;ST;Surface velocity in x;land_ice_surface_x_velocity;;m s-1;0;-0,000400;0,000400;-0,000800;0,000800 -yvelbase;x-y-t;ST;Basal velocity in y;land_ice_basal_y_velocity;;m s-1;0;-0,000400;0,000400;-0,000800;0,000800 -yvelmean;x-y-t;ST;Mean velocity in y;land_ice_vertical_mean_y_velocity;;m s-1;1;-0,000400;0,000400;-0,000800;0,000800 -yvelsurf;x-y-t;ST;Surface velocity in y;land_ice_surface_y_velocity;;m s-1;0;-0,000400;0,000400;-0,000800;0,000800 -zvelbase;x-y-t;ST;Basal velocity in z;land_ice_basal_upward_velocity;;m 
s-1;0;-0,00000500;0,00000500;-0,00000800;0,00000800 -zvelsurf;x-y-t;ST;Surface velocity in z;land_ice_surface_upward_velocity;;m s-1;0;-0,00000400;0,00000400;-0,00000800;0,00000800 diff --git a/isschecker_env.yml b/isschecker_env.yml index 3b3457e..47c4900 100644 --- a/isschecker_env.yml +++ b/isschecker_env.yml @@ -3,15 +3,16 @@ channels: - conda-forge - defaults dependencies: - - python=3.9 - - pip=22.1.2 + - python=3.14 + - pip=26.0.1 -# Core scientific python - - numpy=1.23.0 - - pandas=1.4.3 - - tqdm=4.64.0 +# Core scientific python + - numpy=2.4.3 + - pandas=3.0.2 + - openpyxl=3.1.5 + - tqdm=4.67.3 -# Spatial packages - - xarray=2022.3.0 - - cftime=1.5.1.1 - - netcdf4=1.5.7 +# Spatial packages + - xarray=2026.4.0 + - cftime=1.6.5 + - netcdf4=1.7.4 diff --git a/test/README.md b/test/README.md new file mode 100644 index 0000000..8e59f56 --- /dev/null +++ b/test/README.md @@ -0,0 +1,70 @@ +# ISMIP7 NetCDF generator + +`test/generate_test_files.py` creates ISMIP7-style NetCDF test files with synthetic data, one file per variable, following the naming convention and grid definitions used by the compliance checker. + +Files are written to `Models/{GrIS|AIS}/ISMIP7/SYNTH1/CORE/`. + +## Usage + +```bash +conda activate isschecker +python test/generate_test_files.py [OPTIONS] +``` + +## Key options + +| Option | Default | Description | +|--------|---------|-------------| +| `--grid` | `GrIS_16000m` | Grid to use (run `--list-grids` to see all available) | +| `--scenario` | `ctrl` | Experiment name written into filenames and the time axis | +| `--start-year` | `2015` | First year of the time axis | +| `--nyears` | `5` | Number of annual time steps | +| `--xyt` | off | Include x,y,t (3D) variables | +| `--scalars` | off | Include scalar (time-only) variables | +| `--include-non-mandatory` | off | Also generate non-mandatory variables | +| `--ism-member-id` | `m001` | ISM ensemble member id (written into filenames) | +| `--esm-id` | `CESM2-WACCM` | ESM id (written into filenames) | +| `--forcing-member-id` | `f001` | Forcing ensemble member id (written into filenames) | +| `--set-counter` | `C001` | Set counter id (written into filenames) | +| `--list-grids` | — | List all available grids and exit | + +## Examples + +```bash +# List available grids +python test/generate_test_files.py --list-grids + +# Generate 286-year GrIS ctrl files (x,y,t variables) +python test/generate_test_files.py --grid GrIS_16000m --scenario ctrl \ + --xyt --nyears 286 --start-year 2015 + +# Generate 286-year AIS ssp370 files +python test/generate_test_files.py --grid AIS_08000m --scenario ssp370 \ + --xyt --nyears 286 --start-year 2015 + +# Generate scalar-only variables +python test/generate_test_files.py --grid GrIS_16000m --scenario ctrl \ + --scalars --nyears 286 --start-year 2015 + +# Generate both 3D and scalar variables, including non-mandatory ones +python test/generate_test_files.py --grid GrIS_16000m --scenario ctrl \ + --xyt --scalars --include-non-mandatory --nyears 286 --start-year 2015 +``` + +## Implemented conventions + +- CF-1.7 as baseline. +- Time encoding: `days since 1850-01-01`, `calendar='standard'`. + - State (ST) variables: snapshot at year end (Dec 31). + - Flux (FL) variables: mid-year (Jul 1) with bounds from Jan 1 to Jan 1 next year. +- Single precision (`float32`) for all variables and time. +- `_FillValue` and `missing_value` set to NetCDF4 default `f4` fill value. +- `time` is an unlimited (record) dimension. +- `x` and `y` are 1-D coordinate variables in metres. 
+- CRS set per domain: GrIS → `EPSG:3413`, AIS → `EPSG:3031`. + +## Notes + +- Variable metadata is read from `conventions/ISMIP7_variable_request.xlsx`; grid definitions from the top-level `gdfs/` directory. +- `group`, `model`, `contact_name`, and `contact_email` are hardcoded in `create_netcdf_file()` to synthetic defaults — edit there to customise. +- Generated files are synthetic and intended for testing the compliance checker, not for scientific use. diff --git a/test/compliance_checker_log.ref.txt b/test/compliance_checker_log.ref.txt deleted file mode 100644 index dbdd3db..0000000 --- a/test/compliance_checker_log.ref.txt +++ /dev/null @@ -1,59 +0,0 @@ -************************************************************************************ -************* Ice Sheet Model Simulations - Compliance Checker ************* -************************************************************************************ -Commit Number: 'f932fbf' -verification criteria: ismip6_criteria.csv -date: 2022/09/12 -source: https://github.com/jbbarre/ISM_SimulationChecker - ------------------------------------------------------------------------------------- -Verified directory: ./test ------------------------------------------------------------------------------------- -1 experiments checked. -1 files checked (Scalar files are ignored). - -12 error(s) detected. - - Mandatory variables: 12 error(s) - - Naming Tests : 0 error(s) - - Numerical Tests : 0 error(s) - - Spatial Tests : 0 error(s) - - Time Tests : 0 error(s) - -0 warning(s) detected. - - -==================================================================================== -================ DETAILED RESULTS ================ -==================================================================================== -Tips: Use Cltr+F to look for specific problems. - - - ********************************************************** - ** Experiment: exp05 - ********************************************************** - - ERROR: In experiment exp05_32, these mandatory variable(s) is (are) missing: ['acabf', 'base', 'libmassbfgr', 'lithk', 'orog', 'sftflf', 'sftgif', 'sftgrf', 'strbasemag', 'topg', 'xvelmean', 'yvelmean'] - -Experiment: exp05 - File: libmassbffl_AIS_IMAU_IMAUICE1_exp05.nc - -** Tested Variable: libmassbffl - -NUMERICAL Tests - - The unit is correct: kg m-2 s-1 - - The minimum value successfully verified. - - The maximum value successfully verified. -SPATIAL Tests - - Grid: Lowest left corner is well defined. - - Grid: Upper right corner is well defined. - - The grid resolution (32.0) was successfully verified. -TIME Tests - - Time step: 366.0 days - - Experiment lasts 86 years. - - Experiment starts correctly on 2015-07-01. - - Experiment ends correctly on 2100-07-01. - ----------------------------------------------------------- -exp05 - libmassbffl - File:libmassbffl_AIS_IMAU_IMAUICE1_exp05.nc -No errors. Good job ! -No warnings. ----------------------------------------------------------- diff --git a/test/exp05_32/libmassbffl_AIS_IMAU_IMAUICE1_exp05.nc b/test/exp05_32/libmassbffl_AIS_IMAU_IMAUICE1_exp05.nc deleted file mode 100644 index 0cf4a69..0000000 Binary files a/test/exp05_32/libmassbffl_AIS_IMAU_IMAUICE1_exp05.nc and /dev/null differ diff --git a/test/generate_test_files.py b/test/generate_test_files.py new file mode 100644 index 0000000..c81f54f --- /dev/null +++ b/test/generate_test_files.py @@ -0,0 +1,755 @@ +#!/usr/bin/env python3 +""" +NetCDF file generator for ISMIP7 ice sheet simulation data. 
+ +This script generates NetCDF files with variables and metadata following +ISMIP7 conventions as defined in the variable request spreadsheet +(conventions/ISMIP7_variable_request.xlsx) and the grid definition files in gdfs/. +""" +import re +import numpy as np +import xarray as xr +from datetime import datetime +from pathlib import Path +import netCDF4 + + +def get_available_grids(conventions_dir): + """ + Get available grid definitions from gdfs directory. + + Parameters + ---------- + conventions_dir : str + Path to conventions directory + + Returns + ------- + dict + Dictionary with grid info: {'GrIS': [...], 'AIS': [...]} + """ + # Grid definitions moved to project root `gdfs` directory + gdf_dir = Path(conventions_dir).parent / 'gdfs' + grids = {'GrIS': [], 'AIS': []} + + if not gdf_dir.exists(): + return grids + + for file in sorted(gdf_dir.glob('gdf_ISMIP7_*.txt')): + # Extract grid type and resolution from filename + # e.g., gdf_ISMIP7_GrIS_16000m.txt + match = re.search(r'gdf_ISMIP7_(GrIS|AIS)_(\d+[a-z]+)', file.name) + if match: + grid_type = match.group(1) + resolution = match.group(2) + grids[grid_type].append(resolution) + + return grids + + +def parse_grid_file(gdf_file): + """ + Parse ISMIP7 grid definition file. + + Parameters + ---------- + gdf_file : str + Path to grid definition file + + Returns + ------- + dict + Dictionary with grid parameters (xsize, ysize, xfirst, yfirst, xinc, yinc, etc.) + """ + grid_params = {} + + with open(gdf_file, 'r') as f: + for line in f: + line = line.strip() + if line.startswith('#') or not line or '=' not in line: + continue + + key, value = line.split('=', 1) + key = key.strip() + value = value.strip().strip('"') + + try: + # Try to convert to int or float + if '.' in value: + grid_params[key] = float(value) + else: + grid_params[key] = int(value) + except ValueError: + grid_params[key] = value + + return grid_params + + +def read_variable_criteria(excel_file, include_non_mandatory=False): + """ + Read variable criteria from Excel file. 
+ + Parameters + ---------- + excel_file : str + Path to the Excel file + include_non_mandatory : bool + Whether to include non-mandatory variables + + Returns + ------- + dict + Dictionary with variable information + """ + import pandas as pd + + variables = {} + + # Read the Excel file + df = pd.read_excel(excel_file, sheet_name='ISM') + + # Filter out rows that don't have variable names + df = df.dropna(subset=['Variable Name']) + + # Filter mandatory variables if requested + if not include_non_mandatory: + df = df[df['Mandatory (yes/no)'].str.lower() == 'yes'] + + for idx, row in df.iterrows(): + var_name = row['Variable Name'] + # Parse dimensions from Dim column + dim_str = row['Dim'] + if dim_str == 'x,y,t': + dimensions = ['x', 'y', 't'] + elif dim_str == 't': + dimensions = ['t'] + else: + dimensions = ['x', 'y', 't'] # Default + + variables[var_name] = { + 'dimensions': dimensions, + 'type': row['Type'], + 'description': row['long_name'], # Use long_name from Excel + 'standard_name': row['standard_name'] if pd.notna(row['standard_name']) else '', + 'units': str(row['units']) if pd.notna(row['units']) else '', + 'mandatory': row['Mandatory (yes/no)'].lower() == 'yes', + } + + # Collect any min_/max_ columns (case-insensitive) from the Excel sheet + # Normalize keys to lowercase (e.g., 'min_gris', 'max_ais') and store + for col in df.columns: + try: + lc = str(col).lower() + except Exception: + continue + if lc.startswith('min_') or lc.startswith('max_'): + try: + val = row[col] + if pd.isna(val): + parsed = None + else: + parsed = float(val) + except Exception: + parsed = None + variables[var_name][lc] = parsed + + return variables + + +def generate_synthetic_data(shape, min_val, max_val, dtype=np.float32): + """ + Generate synthetic data within specified range. + + Parameters + ---------- + shape : tuple + Shape of the output array + min_val : float + Minimum value + max_val : float + Maximum value + dtype : type + Data type for the array + + Returns + ------- + np.ndarray + Random data within the specified range + """ + data = np.random.uniform(min_val, max_val, shape).astype(dtype) + return data + + +def create_netcdf_file(output_file, grid_name='GrIS_16000m', scenario='ctrl', start_year=2015, nyears=5, + conventions_dir=None, include_non_mandatory=False, include_scalars=False, + include_xyt=False, + ism_member_id='m001', esm_id='CESM2-WACCM', forcing_member_id='f001', set_counter='C001'): + """ + Create NetCDF files with ISMIP7 variables (one file per variable). + + Parameters + ---------- + output_file : str + Output file path (if None, will use default based on grid type) + grid_name : str + Grid name (e.g., 'GrIS_16000m', 'AIS_16000m') + scenario : str + Scenario name (default: 'ctrl') + start_year : int + First calendar year for the output time axis (default: 2015) + nyears : int + Number of years in the output (default: 5) + conventions_dir : str + Path to conventions directory (grid definitions live in the sibling gdfs/ directory) + include_non_mandatory : bool + Whether to include non-mandatory variables + include_scalars : bool + Whether to include scalar time-series variables + include_xyt : bool + Whether to include non-scalar x,y,t variables (3D). Set to False to skip x,y,t variables. 
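+
+    Examples
+    --------
+    Illustrative call (assumes the default repository layout; for a GrIS grid
+    the files land under Models/GrIS/ISMIP7/SYNTH1/CORE):
+
+    >>> create_netcdf_file(None, grid_name='GrIS_16000m', scenario='ctrl',
+    ...                    start_year=2015, nyears=5, include_xyt=True)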
+ """ + + group='ISMIP7' + model='SYNTH1' + contact_names='Your Name' + contact_emails='your@email.org' + + # Determine conventions directory + if conventions_dir is None: + # Try to find conventions directory relative to this script + script_dir = Path(__file__).parent.parent + conventions_dir = script_dir / 'conventions' + + conventions_dir = Path(conventions_dir) + + # Parse grid name to get grid type and resolution + match = re.match(r'(GrIS|AIS)_(.+)', grid_name) + if not match: + raise ValueError(f"Invalid grid name: {grid_name}. Expected format: GrIS_16000m or AIS_16000m") + + grid_type, resolution = match.groups() + + # Choose CRS per domain: GrIS -> EPSG:3413, AIS -> EPSG:3031 + if grid_type == 'GrIS': + domain_crs = 'EPSG:3413' + else: + domain_crs = 'EPSG:3031' + + # Determine output directory + models_dir = Path(__file__).parent.parent / 'Models' + if grid_type == 'AIS': + output_dir = models_dir / 'AIS' / group / model / 'CORE' + else: # GrIS + output_dir = models_dir / 'GrIS' / group / model / 'CORE' + + output_dir.mkdir(parents=True, exist_ok=True) + + # Load grid definition + gdf_file = conventions_dir.parent / 'gdfs' / f'gdf_ISMIP7_{grid_type}_{resolution}.txt' + if not gdf_file.exists(): + raise FileNotFoundError(f"Grid definition file not found: {gdf_file}") + + grid_params = parse_grid_file(str(gdf_file)) + + nx = grid_params['xsize'] + ny = grid_params['ysize'] + xfirst = grid_params['xfirst'] + yfirst = grid_params['yfirst'] + xinc = grid_params['xinc'] + yinc = grid_params['yinc'] + + # Read variable criteria from Excel file + excel_file = conventions_dir / 'ISMIP7_variable_request.xlsx' + if not excel_file.exists(): + print(f"Warning: {excel_file} not found") + variables = {} + else: + variables = read_variable_criteria(str(excel_file), include_non_mandatory) + + # Create coordinate arrays + x = np.arange(nx, dtype=np.float32) * xinc + xfirst + y = np.arange(ny, dtype=np.float32) * yinc + yfirst + + # Create time coordinates - will be set per variable type later + # ST (State) variables: end of year (e.g., 0.999, 1.999, ...) + # FL (Flux) variables: middle of year (e.g., 0.5, 1.5, ...) with bounds + + # Select min/max values based on grid type + val_key_min = f'min_value_{grid_type.lower()}' + val_key_max = f'max_value_{grid_type.lower()}' + + # Fill value for single-precision floats (ISMIP7 recommends netCDF4 default f4) + fillval = netCDF4.default_fillvals['f4'] + + # contact defaults are provided via function signature + + # Create variables with x-y-t dimensions + xyt_vars = {var: info for var, info in variables.items() + if info['dimensions'] == ['x', 'y', 't']} + + # Create scalar time-series variables (t dimension only) + scalar_vars = {var: info for var, info in variables.items() + if info['dimensions'] == ['t']} + + # Compute time range string for filename + time_range = f"{start_year}-{start_year + nyears - 1}" + + # Create filename template following required pattern: + # ________.nc + domain_id = grid_type + source_id = group + ism_id = model + experiment_id = scenario + filename_template = ( + f"{domain_id}_{source_id}_{ism_id}_{ism_member_id}_{esm_id}_{forcing_member_id}_" + f"{experiment_id}_{set_counter}_{time_range}.nc" + ) + + # Create separate file for each variable + created_files = [] + + # Process x-y-t variables (3D). Can be enabled via `include_xyt`. 
+ if include_xyt: + for var_name, var_info in xyt_vars.items(): + var_type = var_info['type'] + + # Select appropriate time coordinate (days since 1850, calendar: standard) + origin = datetime(1850, 1, 1).date() + if var_type == 'ST': + # End of year (Dec 31) + time_days = [] + time_bounds = None + for i in range(nyears): + year = start_year + i + dt = datetime(year, 12, 31).date() + time_days.append(float((dt - origin).days)) + time_coord = np.array(time_days, dtype=np.float32) + elif var_type == 'FL': + # Middle of year (Jul 1) and bounds from Jan 1 to Jan 1 next year + time_days = [] + time_bounds = np.zeros((nyears, 2), dtype=np.float32) + for i in range(nyears): + year = start_year + i + mid = datetime(year, 7, 1).date() + t0 = datetime(year, 1, 1).date() + t1 = datetime(year + 1, 1, 1).date() + time_days.append(float((mid - origin).days)) + time_bounds[i, 0] = float((t0 - origin).days) + time_bounds[i, 1] = float((t1 - origin).days) + time_coord = np.array(time_days, dtype=np.float32) + else: + # Default to state variable timing (end of year) + time_days = [] + time_bounds = None + for i in range(nyears): + year = start_year + i + dt = datetime(year, 12, 31).date() + time_days.append(float((dt - origin).days)) + time_coord = np.array(time_days, dtype=np.float32) + + # Determine min/max for this variable (per-grid if available) + min_val = var_info.get(val_key_min) + max_val = var_info.get(val_key_max) + if min_val is None and max_val is None: + min_val = -1e6 + max_val = 1e6 + else: + if min_val is None: + min_val = max_val - 1.0 + if max_val is None: + max_val = min_val + 1.0 + if min_val >= max_val: + max_val = min_val + 1.0 + + data = generate_synthetic_data((nyears, ny, nx), min_val, max_val) + + # Create data array with metadata + data_vars = { + var_name: ( + ('time', 'y', 'x'), + data.astype(np.float32), + { + 'long_name': var_info['description'], + 'units': var_info['units'], + 'standard_name': var_info['standard_name'], + } + ) + } + + # Add time bounds for flux variables (units: days since 1850) + if time_bounds is not None: + data_vars[f'time_bounds'] = ( + ('time', 'bounds'), + time_bounds.astype(np.float32), + { + 'long_name': 'time bounds', + 'units': 'days since 1850-01-01', + } + ) + # The 'time' coordinate is created in `coords`, so we attach the + # 'bounds' attribute to the Dataset's time coordinate after + # constructing the Dataset (see below). 
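+            # Worked example for start_year=2015: 2015-01-01 is day 60265 since
+            # 1850-01-01 on the 'standard' calendar, so the first ST snapshot
+            # (2015-12-31) falls on day 60629, and the first FL midpoint
+            # (2015-07-01) on day 60446 with bounds (60265.0, 60630.0).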
+ + # Create xarray Dataset + coords = { + 'x': ('x', x, {'long_name': 'x-coordinate', 'units': 'm'}), + 'y': ('y', y, {'long_name': 'y-coordinate', 'units': 'm'}), + 'time': ('time', time_coord, {'long_name': 'time', 'units': 'days since 1850-01-01', 'calendar': 'standard'}), + } + + ds = xr.Dataset(data_vars, coords=coords) + + # If time bounds were created, attach the bounds attribute to the time coordinate + if time_bounds is not None: + ds['time'].attrs['bounds'] = 'time_bounds' + + # Add grid mapping information if available + grid_mapping_attrs = {} + if 'proj_params' in grid_params: + grid_mapping_attrs['proj_params'] = grid_params['proj_params'] + if 'grid_mapping_name' in grid_params: + grid_mapping_attrs['grid_mapping_name'] = grid_params['grid_mapping_name'] + + # Add global attributes + ds.attrs.update({ + 'title': f'ISMIP7 synthetic data - {var_name}', + 'history': f'Generated on {datetime.now().isoformat()}', + 'Conventions': 'CF-1.7', + 'grid_type': grid_type, + 'grid_resolution': resolution, + 'group': group, + 'model': model, + 'contact_name': contact_names, + 'contact_email': contact_emails, + 'crs': domain_crs, + }) + + # Add grid mapping attributes to global attributes + ds.attrs.update(grid_mapping_attrs) + + # Create filename + filename = f"{var_name}_{filename_template}" + output_path = output_dir / filename + + # Prepare encoding to comply with conventions (single precision, fill values, unlimited time) + encoding = {} + encoding[var_name] = {'dtype': 'f4', '_FillValue': fillval} + encoding['time'] = {'dtype': 'f4', '_FillValue': fillval} + # time bounds are stored under the fixed name 'time_bounds' + if 'time_bounds' in data_vars: + encoding['time_bounds'] = {'dtype': 'f4', '_FillValue': fillval} + + ds.to_netcdf(output_path, unlimited_dims=('time',), encoding=encoding) + # xarray drops missing_value from attrs when _FillValue is in encoding; + # write it explicitly via netCDF4 to guarantee the attribute is present. 
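+            # (netCDF4.default_fillvals['f4'] is 9.96921e+36, the standard
+            # NC_FILL_FLOAT, so _FillValue and missing_value both carry that
+            # value, which the checker's attribute tests compare against.)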
+ with netCDF4.Dataset(output_path, 'a') as nc: + nc[var_name].missing_value = fillval + created_files.append(str(output_path)) + else: + print(f"Skipping x,y,t variables (include_xyt=False); {len(xyt_vars)} variables not written") + + # Process scalar variables + if include_scalars: + for var_name, var_info in scalar_vars.items(): + var_type = var_info['type'] + + # Select appropriate time coordinate (days since 1850) + origin = datetime(1850, 1, 1).date() + if var_type == 'ST': + time_days = [] + time_bounds = None + for i in range(nyears): + year = start_year + i + dt = datetime(year, 12, 31).date() + time_days.append(float((dt - origin).days)) + time_coord = np.array(time_days, dtype=np.float32) + elif var_type == 'FL': + time_days = [] + time_bounds = np.zeros((nyears, 2), dtype=np.float32) + for i in range(nyears): + year = start_year + i + mid = datetime(year, 7, 1).date() + t0 = datetime(year, 1, 1).date() + t1 = datetime(year + 1, 1, 1).date() + time_days.append(float((mid - origin).days)) + time_bounds[i, 0] = float((t0 - origin).days) + time_bounds[i, 1] = float((t1 - origin).days) + time_coord = np.array(time_days, dtype=np.float32) + else: + time_days = [] + time_bounds = None + for i in range(nyears): + year = start_year + i + dt = datetime(year, 12, 31).date() + time_days.append(float((dt - origin).days)) + time_coord = np.array(time_days, dtype=np.float32) + + # Determine min/max for this scalar variable (per-grid if available) + min_val = var_info.get(val_key_min) + max_val = var_info.get(val_key_max) + if min_val is None and max_val is None: + min_val = -1e12 + max_val = 1e12 + else: + if min_val is None: + min_val = max_val - 1.0 + if max_val is None: + max_val = min_val + 1.0 + if min_val >= max_val: + max_val = min_val + 1.0 + + # Generate synthetic 1D data + data = generate_synthetic_data((nyears,), min_val, max_val).astype(np.float32) # Wide range for scalar data + # Create data array with metadata + data_vars = { + var_name: ( + ('time',), + data, + { + 'long_name': var_info['description'], + 'units': var_info['units'], + 'standard_name': var_info['standard_name'], + } + ) + } + if time_bounds is not None: + data_vars[f'time_bounds'] = ( + ('time', 'bounds'), + time_bounds.astype(np.float32), + { + 'long_name': 'time bounds', + 'units': 'days since 1850-01-01', + } + ) + # The 'time' coordinate is created in `coords`, so attach the + # bounds attribute to the Dataset's time coordinate after + # constructing the Dataset (see below). 
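+            # Scalar variables hold one value per time step, shape (nyears,);
+            # the time-axis construction above mirrors the x,y,t branch.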
+ + # Create xarray Dataset + coords = { + 'time': ('time', time_coord, {'long_name': 'time', 'units': 'days since 1850-01-01', 'calendar': 'standard'}), + } + + ds = xr.Dataset(data_vars, coords=coords) + # If time bounds were created, attach the bounds attribute to the time coordinate + if time_bounds is not None: + ds['time'].attrs['bounds'] = 'time_bounds' + + # Add global attributes (include mandatory ISMIP7 attributes) + ds.attrs.update({ + 'title': f'ISMIP7 synthetic data - {var_name}', + 'history': f'Generated on {datetime.now().isoformat()}', + 'Conventions': 'CF-1.7', + 'grid_type': grid_type, + 'grid_resolution': resolution, + 'nt': nyears, + 'group': group, + 'model': model, + 'scenario': scenario, + 'contact_name': contact_names, + 'contact_email': contact_emails, + 'crs': domain_crs, + }) + + # Create filename + filename = f"{var_name}_{filename_template}" + output_path = output_dir / filename + + # Prepare encoding for scalars + encoding = {var_name: {'dtype': 'f4', '_FillValue': fillval}, + 'time': {'dtype': 'f4', '_FillValue': fillval}} + if f'time_bounds' in data_vars: + encoding[f'time_bounds'] = {'dtype': 'f4', '_FillValue': fillval} + + ds.to_netcdf(output_path, unlimited_dims=('time',), encoding=encoding) + # xarray drops missing_value from attrs when _FillValue is in encoding; + # write it explicitly via netCDF4 to guarantee the attribute is present. + with netCDF4.Dataset(output_path, 'a') as nc: + nc[var_name].missing_value = fillval + created_files.append(str(output_path)) + + print(f"Created {len(created_files)} files in {output_dir}") + print(f" Grid: {grid_name} ({nx} x {ny})") + print(f" Years: {nyears}") + + + return created_files + + +def create_multiple_files(output_dir=None, n_files=3, conventions_dir=None, + start_year=2015, contact_names='Your Name', contact_emails='your@email.org', + include_xyt=True, include_non_mandatory=False): + """ + Create multiple NetCDF files for testing (one file per variable per grid). 
+ + Parameters + ---------- + output_dir : str + Output directory (if None, uses default Models structure) + n_files : int + Number of grid resolutions to use per region + conventions_dir : str + Path to conventions directory + start_year : int + First calendar year for the output time axis + include_xyt : bool + Whether to write x,y,t (3D) variables + include_non_mandatory : bool + Whether to also generate non-mandatory variables + """ + + if conventions_dir is None: + # Try to find conventions directory relative to this script + script_dir = Path(__file__).parent.parent + conventions_dir = script_dir / 'conventions' + + # Get available grids + grids = get_available_grids(str(conventions_dir)) + + total_files = 0 + + # Create files for both Antarctica (AIS) and Greenland (GrIS) + for grid_type in ['GrIS', 'AIS']: + if grid_type not in grids or not grids[grid_type]: + print(f"No {grid_type} grids found") + continue + + # Use the first n_files available grids + for i, resolution in enumerate(grids[grid_type][:n_files]): + grid_name = f'{grid_type}_{resolution}' + + created_files = create_netcdf_file( + None, # Use default output path + grid_name=grid_name, + nyears=5, # Default 5 years for testing + start_year=start_year, + conventions_dir=conventions_dir, + include_scalars=(i == 0), + include_xyt=include_xyt, + include_non_mandatory=include_non_mandatory, + ) + total_files += len(created_files) + + print(f"Total files created: {total_files}") + + +if __name__ == '__main__': + import argparse + + # Get available grids + script_dir = Path(__file__).parent.parent + conventions_dir = script_dir / 'conventions' + available_grids = get_available_grids(str(conventions_dir)) + + # Create the list of available grid choices + grid_choices = [] + for grid_type in ['GrIS', 'AIS']: + if grid_type in available_grids: + for resolution in available_grids[grid_type]: + name = f'{grid_type}_{resolution}' + grid_choices.append(name) + + parser = argparse.ArgumentParser( + description='Generate ISMIP7 NetCDF files with synthetic data' + ) + parser.add_argument( + '--grid', + default='GrIS_16000m', + choices=grid_choices, + help=f'Grid definition to use (default: GrIS_16000m). Available: {", ".join(grid_choices[:5])}...' 
+ ) + parser.add_argument( + '--start-year', + type=int, + default=2015, + help='First calendar year for the output time axis (default: 2015)' + ) + parser.add_argument( + '--nyears', + type=int, + default=5, + help='Number of years in the output (default: 5)' + ) + parser.add_argument( + '--scenario', + default='ctrl', + help='Scenario name (default: ctrl)' + ) + parser.add_argument( + '--scalars', + action='store_true', + help='Include scalar time-series variables' + ) + parser.add_argument( + '--xyt', + action='store_true', + help='Write x,y,t variables' + ) + parser.add_argument( + '--include-non-mandatory', + action='store_true', + help='Include non-mandatory variables from the ISM Excel variable list' + ) + parser.add_argument( + '--multiple', + action='store_true', + help='Generate multiple test files with all available grids' + ) + parser.add_argument( + '--conventions-dir', + default=str(conventions_dir), + help=f'Path to conventions directory (default: {conventions_dir})' + ) + parser.add_argument( + '--ism-member-id', + default='m001', + help='ISM ensemble member id (default: m001)' + ) + parser.add_argument( + '--esm-id', + default='CESM2-WACCM', + help='ESM id (default: CESM2-WACCM)' + ) + parser.add_argument( + '--forcing-member-id', + default='f001', + help='Forcing ensemble member id (default: f001)' + ) + parser.add_argument( + '--set-counter', + default='C001', + help='Set counter id (default: C001)' + ) + parser.add_argument( + '--list-grids', + action='store_true', + help='List all available grids and exit' + ) + + args = parser.parse_args() + + # Handle list grids option + if args.list_grids: + print("\nAvailable grids:\n") + for grid_type in ['GrIS', 'AIS']: + if grid_type in available_grids: + print(f"{grid_type}:") + for resolution in available_grids[grid_type]: + print(f" - {grid_type}_{resolution}") + print() + exit(0) + + if args.multiple: + create_multiple_files(conventions_dir=args.conventions_dir, + start_year=args.start_year, + include_xyt=args.xyt, + include_non_mandatory=args.include_non_mandatory) + else: + create_netcdf_file( + None, # Use default output path + grid_name=args.grid, + scenario=args.scenario, + start_year=args.start_year, + nyears=args.nyears, + conventions_dir=args.conventions_dir, + include_scalars=args.scalars, + include_xyt=args.xyt, + include_non_mandatory=args.include_non_mandatory, + ism_member_id=args.ism_member_id, + esm_id=args.esm_id, + forcing_member_id=args.forcing_member_id, + set_counter=args.set_counter, + )