diff --git a/changelog.d/autoimpute-copy.fixed.md b/changelog.d/autoimpute-copy.fixed.md new file mode 100644 index 0000000..d1db482 --- /dev/null +++ b/changelog.d/autoimpute-copy.fixed.md @@ -0,0 +1 @@ +Fixed `autoimpute` mutating the caller's `receiver_data` (#13). The final `receiver_data[var] = median_imputations[var]` assignment could write through to the caller's original DataFrame depending on whether intermediate pandas operations returned a copy or a view — a subtle side effect that silently added imputed columns to the user's input frame. `autoimpute` now takes a defensive `.copy()` of `receiver_data` at the top of the function so the caller's frame is always preserved, and the imputed columns are returned exclusively through `result.receiver_data`. diff --git a/microimpute/comparisons/autoimpute.py b/microimpute/comparisons/autoimpute.py index 69b8317..2121c6a 100644 --- a/microimpute/comparisons/autoimpute.py +++ b/microimpute/comparisons/autoimpute.py @@ -441,6 +441,16 @@ def autoimpute( main_progress = tqdm(total=5, desc="AutoImputation progress") main_progress.set_description("Input validation") + # Defensive copy so that the caller's receiver_data is never + # mutated. Previously the final assignment + # ``receiver_data[var] = median_imputations[var]`` would write + # back through any local binding, and whether the user's frame + # was affected depended on whether the intermediate ``drop`` call + # returned a copy or a view (#13). Copying up-front makes this + # explicit and eliminates the side effect regardless of later + # pandas internals. + receiver_data = receiver_data.copy() + # Use provided quantiles or defaults quantiles = imputation_quantiles if imputation_quantiles else QUANTILES diff --git a/tests/test_autoimpute.py b/tests/test_autoimpute.py index 1118657..24ad1b0 100644 --- a/tests/test_autoimpute.py +++ b/tests/test_autoimpute.py @@ -403,6 +403,51 @@ def test_autoimpute_invalid_model_specification() -> None: ) +def test_autoimpute_does_not_mutate_caller_receiver() -> None: + """Regression test for #13: autoimpute must not mutate the caller's + receiver_data. Previously the final ``receiver_data[var] = ...`` + assignment could write back to the caller's frame depending on + pandas copy/view semantics during preprocessing, silently adding + new columns to the user's DataFrame. + """ + from microimpute.models import OLS + + np.random.seed(0) + donor = pd.DataFrame( + { + "x1": np.random.randn(80), + "x2": np.random.randn(80), + "y": np.random.randn(80), + } + ) + receiver = pd.DataFrame( + { + "x1": np.random.randn(30), + "x2": np.random.randn(30), + } + ) + snapshot = receiver.copy() + + result = autoimpute( + donor_data=donor, + receiver_data=receiver, + predictors=["x1", "x2"], + imputed_variables=["y"], + models=[OLS], + log_level="WARNING", + ) + + # Caller's frame must be untouched. + assert "y" not in receiver.columns, ( + "autoimpute silently added the imputed column to the caller's " + "receiver_data DataFrame" + ) + pd.testing.assert_frame_equal(receiver, snapshot) + + # The returned frame carries the imputed values. + assert "y" in result.receiver_data.columns + + # === Performance Tests ===