From 1fca9cd743ed7b024aff5178557fb017c782bf60 Mon Sep 17 00:00:00 2001
From: Simon Pinches
Date: Thu, 18 Jun 2026 15:55:44 +0200
Subject: [PATCH] Sort IMAS file globs to make checksums platform-independent

imas.checksum.checksum() feeds the files returned by imas_files() into a
single running SHA1 hash in iteration order. imas_files() built the HDF5
and ASCII file lists from Path.glob(), which returns entries in an
arbitrary, filesystem-dependent order. As a result the same
byte-identical IMAS data could hash to different checksums on Windows vs
Linux.

Sort the glob results explicitly by file name so the iteration order is
deterministic and identical across platforms. Sort by p.name rather than
relying on Path comparison, which is itself platform-dependent (Windows
folds case, Linux does not).

Add regression tests for imas_files ordering.
---
 src/simdb/imas/utils.py  |  4 +--
 tests/test_imas_utils.py | 53 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_imas_utils.py

diff --git a/src/simdb/imas/utils.py b/src/simdb/imas/utils.py
index 0cc80c43..b2b83223 100644
--- a/src/simdb/imas/utils.py
+++ b/src/simdb/imas/utils.py
@@ -287,7 +287,7 @@ def imas_files(uri: URI) -> List[Path]:
     path = _get_path(uri)
 
     if backend == "hdf5":
-        return [p.absolute() for p in path.glob("*.h5")]
+        return [p.absolute() for p in sorted(path.glob("*.h5"), key=lambda p: p.name)]
     elif backend == "mdsplus":
         return [
             path / "ids_001.characteristics",
@@ -295,7 +295,7 @@ def imas_files(uri: URI) -> List[Path]:
             path / "ids_001.tree",
         ]
     elif backend == "ascii":
-        return [p.absolute() for p in path.glob("*.ids")]
+        return [p.absolute() for p in sorted(path.glob("*.ids"), key=lambda p: p.name)]
     else:
         raise ValueError(f"Unknown IMAS backend {backend}")
 
diff --git a/tests/test_imas_utils.py b/tests/test_imas_utils.py
new file mode 100644
index 00000000..bd048ebf
--- /dev/null
+++ b/tests/test_imas_utils.py
@@ -0,0 +1,53 @@
+import unittest
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+from simdb.imas.utils import imas_files
+from simdb.uri import URI
+
+
+class ImasFilesTests(unittest.TestCase):
+    """Tests for simdb.imas.utils.imas_files.
+
+    The checksum is computed by feeding the files into a single running hash
+    in the order imas_files returns them, so that order must be deterministic
+    and identical across platforms. Path.glob() does not sort, so imas_files
+    sorts explicitly by file name. See utils.imas_files / imas.checksum.checksum.
+    """
+
+    def _make_files(self, directory, names):
+        # Create files in an order that does not match the expected sorted order
+        for name in names:
+            (Path(directory) / name).write_bytes(b"")
+
+    def test_hdf5_files_sorted_by_name(self):
+        names = [
+            "equilibrium.h5",
+            "core_profiles.h5",
+            "master.h5",
+            "summary.h5",
+        ]
+        with TemporaryDirectory() as tmp:
+            self._make_files(tmp, names)
+            uri = URI(f"imas:hdf5?path={tmp}")
+            result = [p.name for p in imas_files(uri)]
+            self.assertEqual(result, sorted(names))
+
+    def test_ascii_files_sorted_by_name(self):
+        names = ["equilibrium.ids", "core_profiles.ids", "summary.ids"]
+        with TemporaryDirectory() as tmp:
+            self._make_files(tmp, names)
+            uri = URI(f"imas:ascii?path={tmp}")
+            result = [p.name for p in imas_files(uri)]
+            self.assertEqual(result, sorted(names))
+
+    def test_hdf5_files_returns_absolute_paths(self):
+        with TemporaryDirectory() as tmp:
+            self._make_files(tmp, ["core_profiles.h5"])
+            uri = URI(f"imas:hdf5?path={tmp}")
+            result = imas_files(uri)
+            self.assertTrue(all(p.is_absolute() for p in result))
+
+
+if __name__ == "__main__":
+    unittest.main()