Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 18 additions & 4 deletions pyistp/_impl.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,24 @@
from .drivers import current_driver, Driver


def _driver_factory(file_or_buffer):
    """Return a driver instance suited to the format of *file_or_buffer*.

    The first four bytes of the input are inspected. NetCDF content is
    routed to the NetCDF driver; anything else is handed to
    ``current_driver``.

    Args:
        file_or_buffer: either a ``bytes`` buffer holding the file content,
            or something ``open()`` accepts as a path.

    Returns:
        A driver object built from ``file_or_buffer``.
    """
    if isinstance(file_or_buffer, bytes):
        magic = file_or_buffer[:4]
    else:
        with open(file_or_buffer, "rb") as f:
            magic = f.read(4)
    # NetCDF-4 files are HDF5 containers and start with b'\x89HDF'.
    # Classic NetCDF files start with b'CDF' followed by a version byte
    # (1, 2 or 5), so only the first three bytes can be compared: testing
    # the 4-byte magic for equality with b'CDF' would never succeed.
    if magic == b'\x89HDF' or magic[:3] == b'CDF':
        # Imported lazily so netCDF4 is only loaded when actually needed.
        from .drivers.netcdf import Driver as NetCDFDriver
        return NetCDFDriver(file_or_buffer)
    return current_driver(file_or_buffer)
from .data_variable import DataVariable
from .support_data_variable import SupportDataVariable
import re
import numpy as np
from typing import List, Optional
import logging

# Matches names of the form DEPEND_<digit>, ignoring case. Defined once:
# an earlier case-sensitive assignment would be overwritten immediately.
DEPEND_REGEX = re.compile(r"DEPEND_\d", re.IGNORECASE)

ISTP_NOT_COMPLIANT_W = "Non compliant ISTP file"

Expand Down Expand Up @@ -97,9 +109,9 @@ class ISTPLoaderImpl:
def __init__(self, file=None, buffer=None, master_file=None, master_buffer=None):
if file is not None:
log.debug(f"Loading {file}")
self.cdf = current_driver(file or buffer)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As mentioned in another comment, I would like to take into account the fact that the driver used to read the master file may differ from the one used to read the data file.

self.cdf = _driver_factory(file or buffer)
if master_file or master_buffer:
self.master_cdf = current_driver(master_file or master_buffer)
self.master_cdf = _driver_factory(master_file or master_buffer)
else:
self.master_cdf = self.cdf
self.data_variables = []
Expand All @@ -116,7 +128,9 @@ def _update_data_vars_lis(self):
self.data_variables = []
for var in self.master_cdf.variables():
var_attrs = self.master_cdf.variable_attributes(var)
var_type = self.master_cdf.variable_attribute_value(var, 'VAR_TYPE')
# search for the VAR_TYPE attribute, regardless of its case
var_type_attr = next((a for a in var_attrs if a.upper() == 'VAR_TYPE'), None)
var_type = self.master_cdf.variable_attribute_value(var, var_type_attr) if var_type_attr else None
param_type = (self.master_cdf.variable_attribute_value(var,
'PARAMETER_TYPE') or "").lower() # another cluster CSA crap
if (var_type == 'data' or param_type == 'data') and not self.master_cdf.is_char(var):
Expand Down
146 changes: 146 additions & 0 deletions pyistp/drivers/netcdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
import netCDF4
import numpy as np
from typing import Any


class Driver:
"""NetCDF4 driver implementing the PyISTP Driver protocol."""

def __init__(self, file):
    """Open *file* as a NetCDF dataset and keep it in ``self._ds``.

    Args:
        file: a ``bytes`` buffer holding the file content, or a path
            (converted with ``str()``) opened in read mode.
    """
    if not isinstance(file, bytes):
        # Anything that is not a bytes buffer is treated as a path.
        self._ds = netCDF4.Dataset(str(file), "r")
        return
    # In-memory dataset: the name is only a label, the bytes are the data.
    self._ds = netCDF4.Dataset("in_memory.nc", memory=file)

def variables(self):
    """Return the names of all variables in the dataset as a list."""
    names = self._ds.variables.keys()
    return list(names)

def has_variable(self, name):
    """Tell whether the dataset has a variable called *name*."""
    known = self._ds.variables
    return name in known

def variable_attributes(self, var):
    """Return the attribute names of variable *var*.

    An empty list is returned when the dataset has no such variable.
    """
    if var in self._ds.variables:
        return list(self._ds[var].ncattrs())
    return []

def variable_attribute_value(self, var, attr):
    """Return the value of attribute *attr* of variable *var*.

    ``None`` is returned when the variable does not exist or when looking
    the attribute up raises ``AttributeError``.
    """
    if var in self._ds.variables:
        try:
            return self._ds[var].getncattr(attr)
        except AttributeError:
            pass
    return None

def is_char(self, var):
    """Tell whether variable *var* holds character data.

    A variable counts as character data when its dtype is the Python
    ``str`` type or a numpy byte-string / unicode dtype (kind ``'S'`` or
    ``'U'``). Unknown variables are reported as non-char.

    Args:
        var: name of the variable to inspect.

    Returns:
        ``True`` for character variables, ``False`` otherwise.
    """
    if var not in self._ds.variables:
        return False
    dtype = self._ds[var].dtype
    if dtype == str:
        return True
    # Fixed-width character data has a numpy dtype such as 'S1', which
    # does not compare equal to ``str`` and must be detected by its kind.
    return getattr(dtype, 'kind', None) in ('S', 'U')

def is_nrv(self, var):  # NOSONAR
    """Always return ``False``: NetCDF4 has no NRV concept.

    *var* is accepted for interface compatibility and is not used.
    """
    return False

def shape(self, var):
    """Return the shape of variable *var* as a tuple."""
    dims = self._ds[var].shape
    return tuple(dims)

def attributes(self):
    """Return the names of the dataset-level attributes as a list."""
    names = self._ds.ncattrs()
    return list(names)

def attribute(self, key):
    """Return the dataset-level attribute *key*.

    ``None`` is returned when the lookup raises ``AttributeError``.
    """
    try:
        value = self._ds.getncattr(key)
    except AttributeError:
        value = None
    return value

# Lookup table from numpy type codes (kind letter plus item size, or the
# bare kind 'S' for byte strings) to CDF type names.
_DTYPE_TO_CDF = dict(
    f4='CDF_FLOAT',
    f8='CDF_DOUBLE',
    i1='CDF_INT1',
    i2='CDF_INT2',
    i4='CDF_INT4',
    i8='CDF_INT8',
    u1='CDF_UINT1',
    u2='CDF_UINT2',
    u4='CDF_UINT4',
    S='CDF_CHAR',
)

# Milliseconds separating the CDF epoch (year 0000) from the Unix epoch
# (1970-01-01): 62 167 219 200 seconds expressed in milliseconds.
_CDF_EPOCH_OFFSET_MS = 62_167_219_200 * 1_000

def _get_units(self, var):
    """Return the units attribute of *var*, matched case-insensitively.

    The first attribute whose lower-cased name is ``'units'`` wins;
    ``None`` is returned when there is none.
    """
    variable = self._ds[var]
    match = next(
        (name for name in variable.ncattrs() if name.lower() == 'units'),
        None,
    )
    if match is None:
        return None
    return variable.getncattr(match)

def _is_cf_time(self, var):
    """Tell whether *var* follows the CF time convention.

    That is the case when its units attribute is a string containing
    ``'since'``.
    """
    units = self._get_units(var)
    if not isinstance(units, str):
        return False
    return 'since' in units

def _is_cdf_epoch(self, var):
    """Tell whether *var* follows the CDF_EPOCH convention.

    The variable must have a string units attribute equal to ``'ms'``
    (ignoring case and surrounding whitespace) and a float64 dtype.
    """
    units = self._get_units(var)
    if not isinstance(units, str):
        return False
    if units.strip().lower() != 'ms':
        return False
    # The dtype is only looked at once the units have matched.
    return self._ds[var].dtype == np.float64

def _cf_time_to_datetime64(self, var):
    """Convert a CF time variable (units with 'since') to datetime64[ns].

    Args:
        var: name of a variable for which ``_is_cf_time`` is true.

    Returns:
        A numpy array of ``datetime64[ns]`` values, one per element
        yielded when iterating the decoded dates.
    """
    v = self._ds[var]
    # Read the units through _get_units so that the lookup is
    # case-insensitive, exactly like the detection in _is_cf_time; a plain
    # getncattr('units') raises AttributeError for e.g. 'UNITS'.
    units = self._get_units(var)
    # netCDF4.num2date decodes the CF numeric values into date objects
    dates: Any = netCDF4.num2date(
        v[:], units, only_use_cftime_datetimes=False
    )
    # Convert to datetime64[ns] via the string representation of each date
    return np.array([np.datetime64(str(d), 'ns') for d in dates])

def _cdf_epoch_to_datetime64(self, var):
    """Convert CDF_EPOCH values (ms since year 0000) to datetime64[ns]."""
    epoch_ms = np.array(self._ds[var][:], dtype=np.float64)
    # Shift to the Unix epoch, then scale milliseconds to nanoseconds.
    since_unix_ms = epoch_ms - self._CDF_EPOCH_OFFSET_MS
    since_unix_ns = since_unix_ms * 1_000_000
    return since_unix_ns.astype('datetime64[ns]')

def _is_unix_ms_time(self, var):
    """Tell whether the units of *var* are the string ``'milliseconds'``.

    The comparison ignores case and surrounding whitespace.
    """
    units = self._get_units(var)
    if not isinstance(units, str):
        return False
    return units.strip().lower() == 'milliseconds'

def _unix_ms_time_to_datetime64(self, var):
    """Convert ms since the Unix epoch (1970-01-01) to datetime64[ns]."""
    ticks_ms = np.array(self._ds[var][:], dtype=np.int64)
    # 1 ms = 1_000_000 ns
    ticks_ns = ticks_ms * 1_000_000
    return ticks_ns.astype('datetime64[ns]')

def values(self, var, is_metadata_variable=False): # NOSONAR
v = self._ds[var]
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In my opinion, it is not the responsibility of the pyistp driver to interpret the data and convert it into datetime64; this should instead be handled by the consuming tool (in our case, the Speasy codec).
The data should be provided as-is, exactly as they appear in the file.
The pyistp library should only identify which variable contains the time information for the other variables.
This seems even more important given that, in the case of netCDF, the type of a time variable is not always clearly defined (unlike CDF, which uses CDF_EPOCH, CDF_EPOCH16, or TT2000).
As stated in the specification:
https://github.com/IHDE-Alliance/ISTP_metadata/blob/main/ISTP_metadata_guidelines/docs/04_metadata-variables.md#netcdf-times

NetCDF files can include the CDF time variables, with CDF_TIME_TT2000 especially recommended, but will require using the CDF library time routines for conversion. Otherwise, netCDF times are typically something like seconds from some specific time epoch, with UNITS = "seconds from 2000-01-01 UTC" or similar. In either case, the ISTP time variable attributes should be added.

If we move this interpretation logic into Speasy, we will be able to adapt it more easily depending on the provider.
For example, in AMDA / DDSERVER, the netCDF data files in our local database use a time format called DDTIME, which exists for historical reasons and is not used anywhere else.
It would not make sense to implement support for this format in pyistp, whereas in Speasy we could provide a callback mechanism in the netCDF codec to handle such very specific cases.

if self._is_cf_time(var):
return self._cf_time_to_datetime64(var)
if self._is_cdf_epoch(var):
return self._cdf_epoch_to_datetime64(var)
if self._is_unix_ms_time(var):
return self._unix_ms_time_to_datetime64(var)
if v.dtype == str:
# Native NetCDF4 string — return as numpy array of strings
raw = v[()]
if isinstance(raw, str):
raw = [raw]
return np.array(raw)
return np.array(v[:])

def cdf_type(self, var):
    """Return the CDF type name reported for variable *var*.

    Time variables are reported first: ``'CDF_TIME_TT2000'`` for CF time
    and Unix-millisecond variables, ``'CDF_EPOCH'`` for CDF_EPOCH ones.
    String variables map to ``'CDF_CHAR'``. Any other dtype is looked up
    in ``_DTYPE_TO_CDF``.

    Args:
        var: name of the variable to inspect.

    Returns:
        The CDF type name, or ``'CDF_UNKNOWN_<code>'`` when the dtype has
        no entry in ``_DTYPE_TO_CDF``.
    """
    if self._is_cf_time(var) or self._is_unix_ms_time(var):
        return 'CDF_TIME_TT2000'
    if self._is_cdf_epoch(var):
        return 'CDF_EPOCH'
    dtype = self._ds[var].dtype
    if dtype == str:
        return 'CDF_CHAR'
    # Build the lookup key from kind and item size rather than from
    # ``dtype.str``: the latter carries a byte-order prefix that is '|'
    # for single-byte and byte-string dtypes ('|i1', '|u1', '|S4'), which
    # would never match the table. Byte strings use the bare 'S' key
    # whatever their width.
    if dtype.kind == 'S':
        code = 'S'
    else:
        code = f'{dtype.kind}{dtype.itemsize}'
    return self._DTYPE_TO_CDF.get(code, f'CDF_UNKNOWN_{code}')
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[build-system]
build-backend = "flit_core.buildapi"
requires = ["flit_core"]
Expand Down Expand Up @@ -30,7 +30,7 @@
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
]
dependencies = ['pycdfpp>=0.6.0']
dependencies = ['pycdfpp>=0.6.0', 'netCDF4']
[project.urls]
homepage = "https://github.com/SciQLop/PyISTP"
repository = "https://github.com/SciQLop/PyISTP"
Expand Down
2 changes: 2 additions & 0 deletions requirements_dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,5 @@ coverage
Sphinx
twine
ddt
netCDF4
pytest-cov
Binary file added tests/resources/ac_h2s_mfi_cdaweb.nc
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading
Loading