diff --git a/doc/index.rst b/doc/index.rst index f2e4495..f874ee8 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -33,80 +33,12 @@ Finally, a :doc:`cli` allows comparing two files in any of the formats above, or directory trees full of files, as long as they can be loaded with :func:`xarray.open_dataset`. -Examples --------- - -.. code:: - - from recursive_diff import recursive_diff - - lhs = { - 'foo': [1, 2, ('one', 5.2), 4], - 'only_lhs': 1 - } - rhs = { - 'foo': [1, 2, ['two', 5.200001, 3]], - 'only_rhs': 1 - } - - for diff in recursive_diff(lhs, rhs, abs_tol=.1): - print(diff) - -Output:: - - Pair only_lhs:1 is in LHS only - Pair only_rhs:1 is in RHS only - [foo]: LHS has 1 more elements than RHS: [4] - [foo][2]: object type differs: tuple != list - [foo][2]: RHS has 1 more elements than LHS: [3] - [foo][2][0]: one != two - - -Or as a unit test: - -.. code:: - - from recursive_diff import recursive_eq - - def test1(): - recursive_eq(lhs, rhs, abs_tol=.1) - -py.test output:: - - ==================== FAILURES =================== - E AssertionError: 6 differences found - - -------------- Captured stdout call -------------- - - Pair only_lhs:1 is in LHS only - Pair only_rhs:1 is in RHS only - [foo]: LHS has 1 more elements than RHS: [4] - [foo][2]: object type differs: tuple != list - [foo][2]: RHS has 1 more elements than LHS: [3] - [foo][2][0]: one != two - - -Compare two nested directory trees that contain ``.json``, ``.jsonl``, ``.yaml``, -``.msgpack``, ``.nc``, or ``.zarr`` files: - -.. code:: - - from recursive_diff import recursive_open, recursive_eq - - lhs = recursive_open("baseline") - rhs = recursive_open("new_output") - recursive_eq(lhs, rhs) - - -Same as above, but from the command line:: - - $ recursive-diff -r baseline new_output - Index ----- .. 
toctree:: + quickstart installing api extend diff --git a/doc/quickstart.rst b/doc/quickstart.rst new file mode 100644 index 0000000..73313c9 --- /dev/null +++ b/doc/quickstart.rst @@ -0,0 +1,68 @@ +Quick Start +=========== + +.. code:: + + from recursive_diff import recursive_diff + + lhs = { + 'foo': [1, 2, ('one', 5.2), 4], + 'only_lhs': 1 + } + rhs = { + 'foo': [1, 2, ['two', 5.200001, 3]], + 'only_rhs': 1 + } + + for diff in recursive_diff(lhs, rhs, abs_tol=.1): + print(diff) + +Output:: + + Pair only_lhs:1 is in LHS only + Pair only_rhs:1 is in RHS only + [foo]: LHS has 1 more elements than RHS: [4] + [foo][2]: object type differs: tuple != list + [foo][2]: RHS has 1 more elements than LHS: [3] + [foo][2][0]: one != two + + +Or as a unit test: + +.. code:: + + from recursive_diff import recursive_eq + + def test1(): + recursive_eq(lhs, rhs, abs_tol=.1) + +py.test output:: + + ==================== FAILURES =================== + E AssertionError: 6 differences found + + -------------- Captured stdout call -------------- + + Pair only_lhs:1 is in LHS only + Pair only_rhs:1 is in RHS only + [foo]: LHS has 1 more elements than RHS: [4] + [foo][2]: object type differs: tuple != list + [foo][2]: RHS has 1 more elements than LHS: [3] + [foo][2][0]: one != two + + +Compare two nested directory trees that contain ``.json``, ``.jsonl``, ``.yaml``, +``.msgpack``, ``.nc``, or ``.zarr`` files: + +.. 
code:: + + from recursive_diff import recursive_open, recursive_eq + + lhs = recursive_open("baseline") + rhs = recursive_open("new_output") + recursive_eq(lhs, rhs) + + +Same as above, but from the command line:: + + $ recursive-diff -r baseline new_output diff --git a/recursive_diff/__init__.py b/recursive_diff/__init__.py index 1123bb9..aca398b 100644 --- a/recursive_diff/__init__.py +++ b/recursive_diff/__init__.py @@ -2,9 +2,12 @@ from recursive_diff.cast import cast from recursive_diff.files import open, recursive_open -from recursive_diff.public import diff_arrays, display_diffs -from recursive_diff.recursive_diff import recursive_diff -from recursive_diff.recursive_eq import recursive_eq +from recursive_diff.public import ( + diff_arrays, + display_diffs, + recursive_diff, + recursive_eq, +) try: __version__ = importlib.metadata.version("recursive_diff") diff --git a/recursive_diff/cli/recursive_diff.py b/recursive_diff/cli/recursive_diff.py index f51a7e7..9d70916 100644 --- a/recursive_diff/cli/recursive_diff.py +++ b/recursive_diff/cli/recursive_diff.py @@ -21,10 +21,8 @@ logger, recursive_open, ) -from recursive_diff.files import ( - open as open_, -) -from recursive_diff.recursive_diff import recursive_diff +from recursive_diff.files import open as open_ +from recursive_diff.public import recursive_diff LOGFORMAT = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" diff --git a/recursive_diff/public.py b/recursive_diff/public.py index 506cd09..5eb7afa 100644 --- a/recursive_diff/public.py +++ b/recursive_diff/public.py @@ -2,13 +2,147 @@ from __future__ import annotations -from typing import Any, Collection, Hashable, Literal +from collections.abc import Collection, Generator, Hashable +from typing import Any, Literal import numpy as np import pandas as pd -from recursive_diff.dask_compat import compute -from recursive_diff.recursive_diff import _recursive_diff +from recursive_diff.dask_compat import Array, Delayed, compute +from 
recursive_diff.recursive_diff import recursive_diff_impl + + +def recursive_diff( + lhs: Any, + rhs: Any, + *, + rel_tol: float = 1e-09, + abs_tol: float = 0.0, + brief_dims: Collection[Hashable] | Literal["all"] = (), +) -> Generator[str]: + """Compare two objects and yield all differences. + The two objects must be any of: + + - basic types (int, float, complex, bool, str, bytes) + - basic collections (list, tuple, dict, set, frozenset) + - numpy scalar types + - :class:`numpy.ndarray` + - :class:`pandas.Series` + - :class:`pandas.DataFrame` + - :class:`pandas.Index` + - :class:`xarray.DataArray` + - :class:`xarray.Dataset` + - :class:`dask.delayed.Delayed` + - any recursive combination of the above + - any other object (compared with ==) + + Special treatment is reserved to different types: + + - floats and ints are compared with tolerance, using :func:`math.isclose` + - complex numbers are compared with tolerance, using :func:`math.isclose` + separately on the real and imaginary parts + - NaN equals NaN + - floats without decimals compare as equal to ints + - complex numbers without imaginary part DO NOT compare as equal to floats, + as they have substantially different behaviour + - bools are only equal to other bools + - numpy arrays are compared elementwise and with tolerance, + also testing the dtype, using :func:`numpy.isclose(lhs, rhs) <numpy.isclose>` + for numeric arrays and equality for other dtypes. + - pandas and Xarray objects are compared elementwise, with tolerance, and + without order. Duplicate indices are not supported. + - Xarray dimensions and variables are compared without order + - collections (list, tuple, dict, set, frozenset) are recursively + descended into + - generic/unknown objects are compared with == + + Custom classes can be registered to benefit from the above behaviour; + see :func:`cast`.
+ + :param lhs: + left-hand-side data structure + :param rhs: + right-hand-side data structure + :param float rel_tol: + relative tolerance when comparing numbers. + Applies to floats, integers, and all numpy-based data. + :param float abs_tol: + absolute tolerance when comparing numbers. + Applies to floats, integers, and all numpy-based data. + :param brief_dims: + One of: + + - collection of strings representing Xarray dimensions. If one or more + differences are found along one of these dimensions, only one message + will be reported, stating the differences count. + - "all", to produce one line only for every Xarray variable that + differs + + Omit to output a line for every single different cell. + + Yields strings containing difference messages, prepended by the path to + the point that differs. + """ + # For as long as we don't encounter any Delayed or dask-backed xarray objects in lhs + # or rhs, yield diff messages directly from the recursive generator, without + # accumulating them. This allows to start printing differences as soon as they are + # found, without waiting for the whole recursion to finish. Once we encounter a + # Delayed or dask-backed xarray object, we start accumulating all eager messages and + # Delayed[list[str]] in a list and compute all the delayeds at once. 
+ diffs: list[list[str] | Array | Delayed] = [] + for diff in recursive_diff_impl( + lhs, + rhs, + rel_tol=rel_tol, + abs_tol=abs_tol, + brief_dims=brief_dims, + as_dataframes=False, + path=[], + seen_lhs={}, + seen_rhs={}, + ): + if isinstance(diff, str): + if diffs: + diffs.append([diff]) + else: + yield diff + else: + assert isinstance(diff, (Delayed, Array)) + # Comparison of Delayed objects or Dask-backed arrays + diffs.append(diff) + + (computed_diffs,) = compute(diffs) + for diff_batch in computed_diffs: + yield from diff_batch + + +def recursive_eq( + lhs: Any, + rhs: Any, + rel_tol: float = 1e-09, + abs_tol: float = 0.0, + *, # TODO move before rel_tol (breaking change) + brief_dims: Collection[Hashable] | Literal["all"] = (), +) -> None: + """Wrapper around :func:`recursive_diff`. + + Print out all differences to stdout and finally assert that there are none. + This is meant to be used inside pytest, where stdout is captured. + """ + diffs_iter = recursive_diff( + lhs, rhs, rel_tol=rel_tol, abs_tol=abs_tol, brief_dims=brief_dims + ) + i = -1 + for i, diff in enumerate(diffs_iter): # noqa: B007 + print(diff) + i += 1 + if i == 0: + return + if brief_dims: + msg = "Found differences; see stdout" + else: + msg = f"Found {i} differences; see stdout" + raise AssertionError(msg) def diff_arrays( @@ -19,16 +153,17 @@ def diff_arrays( abs_tol: float = 0.0, brief_dims: Collection[Hashable] | Literal["all"] = (), ) -> tuple[dict[str, pd.DataFrame], list[str]]: - """Compare two objects with :func:`~recursive_diff.recursive_diff`. + """Compare two objects with :func:`recursive_diff`. Return tuple of: - - {path: dataframe of differences} for all array objects found. + - {path: dataframe of differences} for all NumPy, Pandas, and Xarray objects found. Arrays with no differences won't be returned. - - List of all other differences found. + - List of all other differences found. 
This includes differences in metadata, + shape, dtype, and indices in NumPy, Pandas, and Xarray objects. """ diffs = list( - _recursive_diff( + recursive_diff_impl( lhs, rhs, rel_tol=rel_tol, @@ -74,10 +209,10 @@ def display_diffs( abs_tol: float = 0.0, brief_dims: Collection[Hashable] | Literal["all"] = (), ) -> None: - """Compare two objects with :func:`~recursive_diff.recursive_diff`. + """Compare two objects with :func:`recursive_diff`. - Display all differences in Jupyter notebook, with diffs in array objects - displayed as tables. + Display all differences in Jupyter notebook, with diffs in NumPy, Pandas, and Xarray + objects displayed as tables. """ from IPython.display import HTML, display diff --git a/recursive_diff/recursive_diff.py b/recursive_diff/recursive_diff.py index f2e98fb..10fbcb2 100644 --- a/recursive_diff/recursive_diff.py +++ b/recursive_diff/recursive_diff.py @@ -18,7 +18,7 @@ import xarray from recursive_diff.cast import MissingKeys, cast -from recursive_diff.dask_compat import Array, Delayed, compute +from recursive_diff.dask_compat import Array, Delayed NUMPY_GE_200 = int(np.__version__.split(".")[0]) >= 2 PANDAS_GE_200 = int(pd.__version__.split(".")[0]) >= 2 @@ -48,111 +48,7 @@ def is_basic_noncontainer(x: object) -> bool: DO_NOT_CAST_TYPES = {bool, int, float, complex, str, bytes, list, dict, set, type(None)} -def recursive_diff( - lhs: Any, - rhs: Any, - *, - rel_tol: float = 1e-09, - abs_tol: float = 0.0, - brief_dims: Collection[Hashable] | Literal["all"] = (), -) -> Generator[str]: - """Compare two objects and yield all differences. 
- The two objects must any of: - - - basic types (int, float, complex, bool, str, bytes) - - basic collections (list, tuple, dict, set, frozenset) - - numpy scalar types - - :class:`numpy.ndarray` - - :class:`pandas.Series` - - :class:`pandas.DataFrame` - - :class:`pandas.Index` - - :class:`xarray.DataArray` - - :class:`xarray.Dataset` - - :class:`dask.delayed.Delayed` - - any recursive combination of the above - - any other object (compared with ==) - - Special treatment is reserved to different types: - - - floats and ints are compared with tolerance, using :func:`math.isclose` - - complex numbers are compared with tolerance, using :func:`math.isclose` - separately on the real and imaginary parts - - NaN equals to NaN - - floats without decimals compare as equal to ints - - complex numbers without imaginary part DO NOT compare as equal to floats, - as they have substantially different behaviour - - bools are only equal to other bools - - numpy arrays are compared elementwise and with tolerance, - also testing the dtype, using :func:`numpy.isclose(lhs, rhs) <numpy.isclose>` - for numeric arrays and equality for other dtypes. - - pandas and Xarray objects are compared elementwise, with tolerance, and - without order. Duplicate indices are not supported. - - Xarray dimensions and variables are compared without order - - collections (list, tuple, dict, set, frozenset) are recursively - descended into - - generic/unknown objects are compared with == - - Custom classes can be registered to benefit from the above behaviour; - see :func:`cast`. - - :param lhs: - left-hand-side data structure - :param rhs: - right-hand-side data structure - :param float rel_tol: - relative tolerance when comparing numbers. - Applies to floats, integers, and all numpy-based data. - :param float abs_tol: - absolute tolerance when comparing numbers. - Applies to floats, integers, and all numpy-based data. - :param brief_dims: - One of: - - - collection of strings representing Xarray dimensions.
If one or more - differences are found along one of these dimensions, only one message - will be reported, stating the differences count. - - "all", to produce one line only for every Xarray variable that - differs - - Omit to output a line for every single different cell. - - Yields strings containing difference messages, prepended by the path to - the point that differs. - """ - # For as long as we don't encounter any Delayed or dask-backed xarray objects in lhs - # or rhs, yield diff messages directly from the recursive generator, without - # accumulating them. This allows to start printing differences as soon as they are - # found, without waiting for the whole recursion to finish. Once we encounter a - # Delayed or dask-backed xarray object, we start accumulating all eager messages and - # Delayed[list[str]] in a list and compute all the delayeds at once. - diffs: list[list[str] | Array | Delayed] = [] - for diff in _recursive_diff( - lhs, - rhs, - rel_tol=rel_tol, - abs_tol=abs_tol, - brief_dims=brief_dims, - as_dataframes=False, - path=[], - seen_lhs={}, - seen_rhs={}, - ): - if isinstance(diff, str): - if diffs: - diffs.append([diff]) - else: - yield diff - else: - assert isinstance(diff, (Delayed, Array)) - # Comparison of Delayed objects or Dask-backed arrays - diffs.append(diff) - - (computed_diffs,) = compute(diffs) - for diff_batch in computed_diffs: - yield from diff_batch - - -def _recursive_diff( +def recursive_diff_impl( lhs: Any, rhs: Any, *, @@ -234,7 +130,7 @@ def diff(msg: str, print_path: list[object] = path) -> str: @delayed def _recursive_diff_d(*args, **kwargs): # type: ignore[no-untyped-def] - return list(_recursive_diff(*args, **kwargs)) + return list(recursive_diff_impl(*args, **kwargs)) yield _recursive_diff_d( lhs, @@ -297,7 +193,7 @@ def _recursive_diff_d(*args, **kwargs): # type: ignore[no-untyped-def] + _str_trunc(rhs[len(lhs) :]) ) for i, (lhs_i, rhs_i) in enumerate(zip(lhs, rhs)): - yield from _recursive_diff( + yield from 
recursive_diff_impl( lhs_i, rhs_i, rel_tol=rel_tol, @@ -372,7 +268,7 @@ def _recursive_diff_d(*args, **kwargs): # type: ignore[no-untyped-def] elif missing_keys is MissingKeys.PAIR: yield diff(f"Pair {key}:{_str_trunc(rhs[key])} is in RHS only") for key in sorted(lhs.keys() & rhs.keys(), key=repr): - yield from _recursive_diff( + yield from recursive_diff_impl( lhs[key], rhs[key], rel_tol=rel_tol, @@ -508,7 +404,7 @@ def _diff_dataarrays( if not lhs.dims: # 0-dimensional arrays - yield from _recursive_diff( + yield from recursive_diff_impl( _array0d_to_scalar(lhs), _array0d_to_scalar(rhs), rel_tol=rel_tol, diff --git a/recursive_diff/recursive_eq.py b/recursive_diff/recursive_eq.py deleted file mode 100644 index 21dde4e..0000000 --- a/recursive_diff/recursive_eq.py +++ /dev/null @@ -1,35 +0,0 @@ -"""Tools for unit testing""" - -from __future__ import annotations - -from typing import Any, Collection, Hashable, Literal - -from recursive_diff.recursive_diff import recursive_diff - - -def recursive_eq( - lhs: Any, - rhs: Any, - rel_tol: float = 1e-09, - abs_tol: float = 0.0, - *, # TODO move before rel_tol (breaking change) - brief_dims: Collection[Hashable] | Literal["all"] = (), -) -> None: - """Wrapper around :func:`~recursive_diff.recursive_diff`. - Print out all differences to stdout and finally assert that there are none. - This is meant to be used inside pytest, where stdout is captured. - """ - diffs_iter = recursive_diff( - lhs, rhs, rel_tol=rel_tol, abs_tol=abs_tol, brief_dims=brief_dims - ) - i = -1 - for i, diff in enumerate(diffs_iter): # noqa: B007 - print(diff) - i += 1 - if i == 0: - return - if brief_dims: - msg = "Found differences; see stdout" - else: - msg = f"Found {i} differences; see stdout" - raise AssertionError(msg)