From 026292939a3a6e03dd3c0c4737f101485ac1a065 Mon Sep 17 00:00:00 2001 From: Stephan Breimann Date: Wed, 1 Jul 2026 01:49:01 +0200 Subject: [PATCH 1/6] feat: dPULearn.mine_negatives + get_labels + named sample colors (prototype #308) Three additive conveniences for the positive/unlabelled -> mined-negatives flow, removing the recurring vstack/label-vector/color-lookup plumbing in the gamma-secretase use case. All existing APIs stay byte-identical. - dPULearn.mine_negatives(X_pos, X_unlabelled, ...): one-call sugar over fit that returns the reliable-negative boolean mask over X_unlabelled. Equals the manual labels_[len(X_pos):]==0 result exactly (regression-tested). fit(X, labels=...) unchanged. - get_labels(df, positive_label=1, col_label="label"): binary int label vector, the single-call form of (df[col]==x).astype(int).to_numpy(). - COLOR_SAMPLES_POS/NEG/UNL/REL_NEG: public named aliases for the canonical sample colors, equal to plot_get_cdict("DICT_COLOR")["SAMPLES_*"] (golden-tested). Wired get_labels + the 4 color constants into __init__/__all__ (on the #308 wire-to-public-API list). Ripple: numpydoc + 2 executed example notebooks (get_labels, dpul_mine_negatives), 39 unit tests (positive+negative+regression), release-notes Unreleased entries, cheat-sheet rows. Part of #305 / prototype for #308 Co-Authored-By: Claude Opus 4.8 (1M context) --- aaanalysis/__init__.py | 9 +- aaanalysis/_constants.py | 9 + aaanalysis/data_handling/__init__.py | 6 +- aaanalysis/data_handling/_get_labels.py | 70 ++++ aaanalysis/pu_learning/_dpulearn.py | 91 +++++ docs/_cheatsheet/content.py | 2 + docs/source/index/release_notes.rst | 17 + examples/data_handling/get_labels.ipynb | 297 +++++++++++++++++ .../pu_learning/dpul_mine_negatives.ipynb | 314 ++++++++++++++++++ .../data_handling_tests/test_get_labels.py | 101 ++++++ .../test_dpulearn_mine_negatives.py | 184 ++++++++++ .../test_color_samples_constants.py | 36 ++ 12 files changed, 1133 insertions(+), 3 deletions(-) create mode 100644 aaanalysis/data_handling/_get_labels.py create mode 100644 examples/data_handling/get_labels.ipynb create mode 100644 examples/pu_learning/dpul_mine_negatives.ipynb create mode 100644 tests/unit/data_handling_tests/test_get_labels.py create mode 100644 tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py create mode 100644 tests/unit/plotting_tests/test_color_samples_constants.py diff --git a/aaanalysis/__init__.py b/aaanalysis/__init__.py index 4fc1edee..17989828 100644 --- a/aaanalysis/__init__.py +++ b/aaanalysis/__init__.py @@ -1,4 +1,4 @@ -from .data_handling import (load_dataset, load_scales, load_features, +from .data_handling import (load_dataset, load_scales, load_features, get_labels, read_fasta, to_fasta, SequencePreprocessor, EmbeddingPreprocessor, @@ -14,6 +14,8 @@ comp_per_protein_ap, comp_detection_metrics, comp_bootstrap_ci, comp_smooth_scores) from .config import options +from ._constants import (COLOR_SAMPLES_POS, COLOR_SAMPLES_NEG, + COLOR_SAMPLES_UNL, COLOR_SAMPLES_REL_NEG) from importlib.metadata import version as _version, PackageNotFoundError @@ -28,6 +30,7 @@ "load_dataset", "load_scales", "load_features", + "get_labels", "read_fasta", "to_fasta", "SequencePreprocessor", @@ -72,6 +75,10 @@ "comp_detection_metrics", "comp_bootstrap_ci", "comp_smooth_scores", + "COLOR_SAMPLES_POS", + "COLOR_SAMPLES_NEG", + "COLOR_SAMPLES_UNL", + "COLOR_SAMPLES_REL_NEG", "options" ] diff --git a/aaanalysis/_constants.py b/aaanalysis/_constants.py index 2c22e606..2ce81416 100644 --- a/aaanalysis/_constants.py +++ b/aaanalysis/_constants.py @@ -478,6 +478,15 @@ def _folder_path(super_folder, folder_name): COLOR_NEG = "#ad4570" # (173,69,112) COLOR_REL_NEG = "#ad9745" # (173, 151, 69) +# Public, named aliases for the canonical sample-group colors (positive / negative / +# unlabeled / reliable-negative). They mirror the ``DICT_COLOR["SAMPLES_*"]`` entries +# exactly, so users can reference a named constant (``aa.COLOR_SAMPLES_POS``) instead +# of indexing ``plot_get_cdict("DICT_COLOR")`` by string key. +COLOR_SAMPLES_POS = COLOR_POS +COLOR_SAMPLES_NEG = COLOR_NEG +COLOR_SAMPLES_UNL = COLOR_UNL +COLOR_SAMPLES_REL_NEG = COLOR_REL_NEG + DICT_COLOR = {"SHAP_POS": COLOR_SHAP_POS, "SHAP_NEG": COLOR_SHAP_NEG, "FEAT_POS": COLOR_FEAT_POS, diff --git a/aaanalysis/data_handling/__init__.py b/aaanalysis/data_handling/__init__.py index 1fad9397..7ea8c663 100644 --- a/aaanalysis/data_handling/__init__.py +++ b/aaanalysis/data_handling/__init__.py @@ -1,8 +1,8 @@ """ Data loading and sequence/embedding preprocessing — the package's data entry point. -Public objects: load_dataset, load_scales, load_features, read_fasta, to_fasta, -SequencePreprocessor, EmbeddingPreprocessor, combine_dict_nums. +Public objects: load_dataset, load_scales, load_features, get_labels, read_fasta, +to_fasta, SequencePreprocessor, EmbeddingPreprocessor, combine_dict_nums. Produces the core data objects the rest of the pipeline consumes: ``load_dataset`` yields ``df_seq``, ``load_scales`` yields ``df_scales`` (fed to ``feature_engineering.AAclust`` / ``CPP``), ``load_features`` yields a reference @@ -17,6 +17,7 @@ from ._load_dataset import load_dataset from ._load_scales import load_scales from ._load_features import load_features +from ._get_labels import get_labels from ._read_fasta import read_fasta from ._to_fasta import to_fasta from ._seq_preproc import SequencePreprocessor @@ -27,6 +28,7 @@ "load_dataset", "load_scales", "load_features", + "get_labels", "read_fasta", "to_fasta", "SequencePreprocessor", diff --git a/aaanalysis/data_handling/_get_labels.py b/aaanalysis/data_handling/_get_labels.py new file mode 100644 index 00000000..9149deaf --- /dev/null +++ b/aaanalysis/data_handling/_get_labels.py @@ -0,0 +1,70 @@ +""" +This is a script for the frontend of the get_labels function, deriving a binary +label vector from a sequence DataFrame's label column. +""" +from typing import Any +import numpy as np +import pandas as pd + +import aaanalysis.utils as ut + + +# I Helper Functions +def check_match_df_positive_label(df=None, col_label=None, positive_label=None) -> None: + """Check that the positive label value is present in the label column.""" + present = set(df[col_label].tolist()) + if positive_label not in present: + raise ValueError(f"'positive_label' ({positive_label}) is not among the values of " + f"column '{col_label}' ({sorted(present, key=str)}).") + + +# II Main Functions +def get_labels(df: pd.DataFrame, + positive_label: Any = 1, + col_label: str = "label", + ) -> np.ndarray: + """ + Derive a binary ``int`` label vector from a column of a sequence DataFrame. + + Maps the value flagged as positive (``positive_label``) onto ``1`` and every other + value onto ``0``, the binary encoding consumed across the package (e.g. by + :meth:`CPP.run`, :class:`TreeModel`, and the ``labels`` argument of most tools). + This is the single-call form of the recurring ``(df[col] == x).astype(int).to_numpy()`` + expression. + + .. versionadded:: 1.1.0 + + Parameters + ---------- + df : pd.DataFrame, shape (n_samples, n_seq_info) + Sequence DataFrame (``df_seq``) containing the label column ``col_label``. + positive_label : int or str, default=1 + Value in ``col_label`` marking the positive class. All rows equal to it become + ``1``; all remaining rows become ``0``. Must be present in ``col_label``. + col_label : str, default='label' + Name of the column holding the (multi-value or already binary) labels. + + Returns + ------- + labels : array-like, shape (n_samples,) + Binary ``int`` label vector (``1`` = positive, ``0`` = otherwise), row-aligned + to ``df``. + + Notes + ----- + * The result equals ``(df[col_label] == positive_label).astype(int).to_numpy()``. + * Pass the resulting vector directly as the ``labels`` argument of CPP, TreeModel, + or other tools. For Positive-Unlabeled mining keep the package ``1`` (positive) / + ``2`` (unlabeled) markers instead and use :meth:`dPULearn.mine_negatives`. + + Examples + -------- + .. include:: examples/get_labels.rst + """ + # Check input + ut.check_df(name="df", df=df, cols_required=col_label) + ut.check_str(name="col_label", val=col_label, accept_none=False) + check_match_df_positive_label(df=df, col_label=col_label, positive_label=positive_label) + # Derive binary int label vector + labels = (df[col_label] == positive_label).astype(int).to_numpy() + return labels diff --git a/aaanalysis/pu_learning/_dpulearn.py b/aaanalysis/pu_learning/_dpulearn.py index 23b5f729..9931dbdc 100644 --- a/aaanalysis/pu_learning/_dpulearn.py +++ b/aaanalysis/pu_learning/_dpulearn.py @@ -133,6 +133,15 @@ def check_match_X_X_neg(X=None, X_neg=None) -> None: raise ValueError(f"'n_features' does not match between 'X' (n={n_features}) and 'X_neg' (n={n_features_neg})") +def check_match_X_pos_X_unlabelled(X_pos=None, X_unlabelled=None) -> None: + """Check that positive and unlabeled feature matrices share the same feature dimension.""" + n_features_pos = X_pos.shape[1] + n_features_unl = X_unlabelled.shape[1] + if n_features_pos != n_features_unl: + raise ValueError(f"'n_features' does not match between 'X_pos' (n={n_features_pos}) and " + f"'X_unlabelled' (n={n_features_unl})") + + # II Main Functions class dPULearn(Wrapper): """ @@ -358,6 +367,88 @@ def fit(self, self.df_pu_ = df_pu return self + def mine_negatives(self, + X_pos: ut.ArrayLike2D, + X_unlabelled: ut.ArrayLike2D, + n_neg: Optional[int] = None, + n_unl_to_neg: Optional[int] = None, + metric: Optional[Literal["euclidean", "manhattan", "cosine"]] = None, + n_components: Union[float, int] = 0.80, + ) -> np.ndarray: + """ + Mine reliable negatives from an unlabeled pool given the positives, in one call. + + Convenience wrapper around :meth:`dPULearn.fit` for the common positive/unlabeled + setup: instead of stacking ``X_pos`` and ``X_unlabelled`` by hand, building a + ``1`` (positive) / ``2`` (unlabeled) label vector, fitting, and slicing the mined + rows back out by index, pass the two feature matrices separately and receive a + **boolean mask over the rows of** ``X_unlabelled`` flagging the identified reliable + negatives. The mask equals ``labels_[len(X_pos):] == 0`` from the manual stacking + path exactly. + + After the call the instance is fitted: :attr:`dPULearn.labels_` (over the stacked + ``X_pos`` then ``X_unlabelled``) and :attr:`dPULearn.df_pu_` are set, so the + :class:`dPULearnPlot` methods work as usual. + + .. versionadded:: 1.1.0 + + Parameters + ---------- + X_pos : array-like, shape (n_pos, n_features) + Feature matrix of the positive samples. + X_unlabelled : array-like, shape (n_unl, n_features) + Feature matrix of the unlabeled samples (the candidate pool). Must have the + same number of features as ``X_pos``. + n_neg : int, optional + Total number of reliable negatives to identify from the unlabeled pool. + Provide **exactly one** of ``n_neg`` or ``n_unl_to_neg`` (with no pre-labeled + negatives the two are equivalent). + n_unl_to_neg : int, optional + Number of reliable negatives to identify directly from the unlabeled pool. + Provide **exactly one** of ``n_neg`` or ``n_unl_to_neg``. + metric : str or None, optional + Distance metric for distance-based identification (``euclidean``, + ``manhattan``, ``cosine``). If ``None``, PCA-based identification is performed. + n_components : int or float, default=0.80 + Number of principal components (int >= 1) or fraction of variance covered + (float in ``(0.0, 1.0)``) when PCA is applied. + + Returns + ------- + mask_neg : array-like, shape (n_unl,) + Boolean mask over the rows of ``X_unlabelled``: ``True`` marks an identified + reliable negative. ``X_unlabelled[mask_neg]`` are the mined negatives. + + Notes + ----- + * This is purely additive sugar: it stacks the inputs and calls + :meth:`dPULearn.fit` with ``label_pos=1`` / ``label_unl=2`` internally, so the + identification result is identical to the manual path. + + See Also + -------- + * :meth:`dPULearn.fit`: the underlying fit on a stacked matrix and label vector. + * :func:`get_labels`: derive a binary label vector from a sequence DataFrame. + + Examples + -------- + .. include:: examples/dpul_mine_negatives.rst + """ + # Check input + X_pos = ut.check_X(X=X_pos, X_name="X_pos") + X_unlabelled = ut.check_X(X=X_unlabelled, X_name="X_unlabelled") + check_match_X_pos_X_unlabelled(X_pos=X_pos, X_unlabelled=X_unlabelled) + # Stack positives over the unlabeled pool and fit with the package PU markers + n_pos = X_pos.shape[0] + X = np.vstack([X_pos, X_unlabelled]) + labels = np.array([1] * n_pos + [2] * X_unlabelled.shape[0]) + self.fit(X=X, labels=labels, label_pos=1, label_unl=2, + n_neg=n_neg, n_unl_to_neg=n_unl_to_neg, + metric=metric, n_components=n_components) + # Slice the mined reliable negatives (label 0) back out of the unlabeled block + mask_neg = np.asarray(self.labels_)[n_pos:] == 0 + return mask_neg + @staticmethod def eval(X: ut.ArrayLike2D, list_labels: ut.ArrayLike2D, diff --git a/docs/_cheatsheet/content.py b/docs/_cheatsheet/content.py index 801adc7f..da244c72 100644 --- a/docs/_cheatsheet/content.py +++ b/docs/_cheatsheet/content.py @@ -188,6 +188,7 @@ ("Load benchmark sequences", "load_dataset(name) → df_seq", None), ("Load AAontology scales", "load_scales() → df_scales", None), ("Load precomputed features", "load_features(name) → df_feat", None), + ("Binary labels from df column", "get_labels(df, positive_label) → labels", None, "v1.1"), ("Read / write FASTA", "read_fasta(file) → df_seq", None), ("Cluster redundant homologs", "filter_seq(df_seq) → df_clust [pro]", None), ]}, @@ -221,6 +222,7 @@ {"name": "Modeling & Explainability", "tag": "PU · classify · SHAP", "rows": [ ("Train with positives + unlabeled data", "dPULearn().fit(X, labels) [Wrapper]", None), + ("Mine reliable negatives (mask)", "dPULearn().mine_negatives(X_pos, X_unlabelled) → mask", None, "v1.1"), ("Train + RFE + MC importance", "TreeModel().fit(X, labels) [Wrapper]", None), ("Per-feature / sample SHAP impact", "ShapModel().fit(X, labels) [pro]", None), ]}, diff --git a/docs/source/index/release_notes.rst b/docs/source/index/release_notes.rst index bc2e9618..5ddf838c 100644 --- a/docs/source/index/release_notes.rst +++ b/docs/source/index/release_notes.rst @@ -37,6 +37,9 @@ Added ``build_cat``, ``to_df_seq``). - **combine_dict_nums**: Concatenates per-residue tensors (embedding / structure / annotation) along the feature axis into one combined ``CPP.run_num`` input. +- **get_labels**: Derives a binary ``int`` label vector from a sequence DataFrame's + label column (``positive_label`` mapped to ``1``, everything else to ``0``) — the + single-call form of the recurring ``(df[col] == x).astype(int).to_numpy()`` expression. **Feature Engineering** @@ -132,6 +135,16 @@ Added switches the pre-computed prediction per P1 (feature map + structure restyle) with no kernel, keeping the column-residue linking (warned past 40 sites, hard-capped at 200). +**PU Learning** + +- **dPULearn.mine_negatives**: One-call convenience over ``dPULearn.fit`` for the common + positive / unlabeled setup. Pass ``X_pos`` and ``X_unlabelled`` separately instead of + stacking them by hand, building a ``1`` / ``2`` label vector, fitting, and slicing the + mined rows back out; it returns the **boolean mask over the rows of** ``X_unlabelled`` + flagging the identified reliable negatives (equal to the manual ``labels_[len(X_pos):] + == 0`` result exactly). The instance is left fitted (``labels_`` / ``df_pu_`` set, so + ``dPULearnPlot`` works), and the existing ``fit(X, labels=...)`` path is unchanged. + **Sequence Analysis** - **AAWindowSampler**: Samples fixed-length sequence windows for PU-learning and @@ -191,6 +204,10 @@ Added - **plot_rank**: Standalone per-protein max-score-vs-rank scatter with group coloring and optional threshold lines (pairs with the new ``aa.metrics`` functions). +- **COLOR_SAMPLES_POS / COLOR_SAMPLES_NEG / COLOR_SAMPLES_UNL / COLOR_SAMPLES_REL_NEG**: + Public, named constants for the canonical sample-group colors (positive / negative / + unlabeled / reliable-negative). They equal the ``plot_get_cdict("DICT_COLOR")["SAMPLES_*"]`` + values exactly, so a named constant replaces indexing the color dict by string key. **Golden Pipelines** diff --git a/examples/data_handling/get_labels.ipynb b/examples/data_handling/get_labels.ipynb new file mode 100644 index 00000000..ef84686a --- /dev/null +++ b/examples/data_handling/get_labels.ipynb @@ -0,0 +1,297 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "30e77e51", + "metadata": {}, + "source": [ + "The ``get_labels`` function derives a binary ``int`` label vector from a column of a sequence DataFrame (``df_seq``). It is the single-call form of the recurring ``(df[col] == positive_label).astype(int).to_numpy()`` expression: the value flagged as positive becomes ``1`` and every other value becomes ``0``." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "14210edc", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-30T23:29:34.142992Z", + "iopub.status.busy": "2026-06-30T23:29:34.142637Z", + "iopub.status.idle": "2026-06-30T23:29:35.599732Z", + "shell.execute_reply": "2026-06-30T23:29:35.599479Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DataFrame shape: (10, 8)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 entrysequencelabeltmd_starttmd_stopjmd_ntmdjmd_c
1P05067MLPGLALLLLAAWTA...GYENPTYKFFEQMQN1701723FAEDVGSNKGAIIGLMVGGVVIATVIVITLVMLKKKQYTSIHH
2P14925MAGRARSGLLLLLLG...EEEYSAPLPKPAPSS1868890KLSTEPGSGVSVVLITTLLVIPVLVLLAIVMFIRWKKSRAFGD
3P70180MRSLLLFTFSACVLL...RELREDSIRSHFSVA1477499PCKSSGGLEESAVTGIVVGALLGAGLLMAFYFFRKKYRITIER
4Q03157MGPTSPAARGQGRRW...HGYENPTYRFLEERP1585607APSGTGVSREALSGLLIMGAGGGSLIVLSLLLLRKKKPYGTIS
5Q06481MAATGTAAAAATGRL...GYENPTYKYLEQMQI1694716LREDFSLSSSALIGLLVIAVAIATVIVISLVMLRKRQYGTISH
6P12821MGAASGRRGPGLLLP...SHGPQFGSEVELRHS212571276GLDLDAQQARVGQWLLLFLGIALLVATLGLSQRLFSIRHR
7P36896MAESAGASSFFPLVV...KKTLSQLSVQEDVKI2127149EHPSMWGPVELVGIIAGPVFLLFLIIIIVFLVINYHQRVYHNR
8Q8NER5MTRALCSALRQALLL...KKTISQLCVKEDCKA2114136PNAPKLGPMELAIIITVPVCLLSIAAMLTVWACQGRQCSYRKK
9P37023MTLGSPRKGLLMLLM...LQKISNSPEKPKVIQ2119141PSEQPGTDGQLALILGPVLALLALVALGVLGLWHVRRRQEKQR
10O43184MAARPLPVSPARALL...YPHQVPRSTHTAYIK2707729DSGPIRQADNQGLTIGILVTILCLLAAGFVVYLKRKTLIRLLF
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import aaanalysis as aa\n", + "aa.options[\"verbose\"] = False\n", + "\n", + "# A Positive-Unlabeled (PU) dataset: substrates (1) and unlabeled others (2).\n", + "df_seq = aa.load_dataset(name=\"DOM_GSEC_PU\", n=5)\n", + "aa.display_df(df=df_seq, n_rows=10, show_shape=True)" + ] + }, + { + "cell_type": "markdown", + "id": "e84a0e34", + "metadata": {}, + "source": [ + "By default ``positive_label=1``: substrates map to ``1`` and everything else to ``0``." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a5ccd25b", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-30T23:29:35.600825Z", + "iopub.status.busy": "2026-06-30T23:29:35.600758Z", + "iopub.status.idle": "2026-06-30T23:29:35.602749Z", + "shell.execute_reply": "2026-06-30T23:29:35.602535Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1 1 1 1 1 0 0 0 0 0]\n" + ] + } + ], + "source": [ + "labels = aa.get_labels(df=df_seq, positive_label=1)\n", + "print(labels)" + ] + }, + { + "cell_type": "markdown", + "id": "5db29b65", + "metadata": {}, + "source": [ + "Pick any value as the positive class via ``positive_label`` (e.g. treat the unlabeled ``2`` as positive), and select a different column with ``col_label``. The result equals the manual ``(df[col_label] == positive_label).astype(int).to_numpy()`` expression." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d9a49747", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-30T23:29:35.603686Z", + "iopub.status.busy": "2026-06-30T23:29:35.603626Z", + "iopub.status.idle": "2026-06-30T23:29:35.605515Z", + "shell.execute_reply": "2026-06-30T23:29:35.605342Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0 0 0 0 0 1 1 1 1 1]\n" + ] + } + ], + "source": [ + "labels_unl = aa.get_labels(df=df_seq, positive_label=2, col_label=\"label\")\n", + "print(labels_unl)" + ] + }, + { + "cell_type": "markdown", + "id": "82fd2bd1", + "metadata": {}, + "source": [ + "Pass the resulting vector straight into the ``labels`` argument of tools such as :meth:`CPP.run` or :class:`TreeModel`." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/pu_learning/dpul_mine_negatives.ipynb b/examples/pu_learning/dpul_mine_negatives.ipynb new file mode 100644 index 00000000..cd9fc03a --- /dev/null +++ b/examples/pu_learning/dpul_mine_negatives.ipynb @@ -0,0 +1,314 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "01ccf3ee", + "metadata": {}, + "source": [ + "The ``dPULearn().mine_negatives()`` method mines reliable negatives from an unlabeled pool in one call. Instead of stacking the positive and unlabeled feature matrices by hand, building a ``1`` (positive) / ``2`` (unlabeled) label vector, fitting, and slicing the mined rows back out by index, you pass the two matrices separately and receive a boolean mask over the rows of ``X_unlabelled``." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "710fdc35", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-30T23:30:36.652099Z", + "iopub.status.busy": "2026-06-30T23:30:36.652032Z", + "iopub.status.idle": "2026-06-30T23:30:38.227970Z", + "shell.execute_reply": "2026-06-30T23:30:38.227738Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "positives: 63 | unlabeled: 631\n" + ] + } + ], + "source": [ + "import aaanalysis as aa\n", + "import numpy as np\n", + "import pandas as pd\n", + "aa.options[\"verbose\"] = False\n", + "\n", + "# Build a CPP feature matrix for the gamma-secretase PU dataset (substrates vs unlabeled others).\n", + "df_seq = aa.load_dataset(name=\"DOM_GSEC_PU\")\n", + "df_feat = aa.load_features(name=\"DOM_GSEC\")\n", + "sf = aa.SequenceFeature()\n", + "X = sf.feature_matrix(features=df_feat[\"feature\"], df_parts=sf.get_df_parts(df_seq=df_seq))\n", + "labels = df_seq[\"label\"].to_numpy()\n", + "\n", + "# Split into the positive (1) and unlabeled (2) feature blocks.\n", + "X_pos = X[labels == 1]\n", + "X_unl = X[labels == 2]\n", + "print(f\"positives: {len(X_pos)} | unlabeled: {len(X_unl)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "38f3d31d", + "metadata": {}, + "source": [ + "Mine a fixed number of reliable negatives directly from the unlabeled pool with ``n_unl_to_neg``. The returned boolean ``mask_neg`` flags, over the rows of ``X_unlabelled``, which unlabeled samples were identified as reliable negatives." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "bdccaa03", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-30T23:30:38.229229Z", + "iopub.status.busy": "2026-06-30T23:30:38.229155Z", + "iopub.status.idle": "2026-06-30T23:30:38.241855Z", + "shell.execute_reply": "2026-06-30T23:30:38.241652Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mined reliable negatives: 49 of 631 unlabeled\n" + ] + } + ], + "source": [ + "dpul = aa.dPULearn(random_state=42)\n", + "mask_neg = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=49)\n", + "print(f\"mined reliable negatives: {int(mask_neg.sum())} of {len(X_unl)} unlabeled\")\n", + "X_neg = X_unl[mask_neg] # the mined feature rows" + ] + }, + { + "cell_type": "markdown", + "id": "315d4798", + "metadata": {}, + "source": [ + "The mask equals the manual stacking path exactly: stack ``X_pos`` over ``X_unlabelled``, fit with ``1`` / ``2`` labels, and slice ``labels_[len(X_pos):] == 0``." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c78d6eab", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-30T23:30:38.243063Z", + "iopub.status.busy": "2026-06-30T23:30:38.242994Z", + "iopub.status.idle": "2026-06-30T23:30:38.254654Z", + "shell.execute_reply": "2026-06-30T23:30:38.254464Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mask equals manual path: True\n" + ] + } + ], + "source": [ + "# Stack X_pos over X_unlabelled, fit with 1 / 2 labels, and slice labels_[len(X_pos):] == 0.\n", + "labels_manual = np.asarray(\n", + " aa.dPULearn(random_state=42)\n", + " .fit(X=np.vstack([X_pos, X_unl]),\n", + " labels=np.array([1] * len(X_pos) + [2] * len(X_unl)),\n", + " n_unl_to_neg=49)\n", + " .labels_)\n", + "mask_manual = labels_manual[len(X_pos):] == 0\n", + "print(\"mask equals manual path:\", np.array_equal(mask_neg, mask_manual))" + ] + }, + { + "cell_type": "markdown", + "id": "cae2d79d", + "metadata": {}, + "source": [ + "After mining, the instance is fitted: ``labels_`` (over the stacked positives then unlabeled) and ``df_pu_`` are set, so the :class:`dPULearnPlot` methods work as usual. Use ``n_neg`` instead of ``n_unl_to_neg`` to request a total count, or set a distance ``metric`` (``euclidean`` / ``manhattan`` / ``cosine``) for distance-based identification." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "85eff05c", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-30T23:30:38.255594Z", + "iopub.status.busy": "2026-06-30T23:30:38.255519Z", + "iopub.status.idle": "2026-06-30T23:30:38.282637Z", + "shell.execute_reply": "2026-06-30T23:30:38.282413Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2 582\n", + "1 63\n", + "0 49\n", + "Name: count, dtype: int64\n", + "DataFrame shape: (49, 15)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 selection_viaPC1 (56.2%)PC2 (7.4%)PC3 (2.9%)PC4 (2.8%)
84PC10.021000-0.0478000.075200-0.005400
95PC20.032000-0.0821000.025800-0.037700
109PC10.026100-0.0585000.075700-0.020900
158PC10.023500-0.0607000.0540000.000900
161PC10.0259000.0314000.0449000.055400
170PC10.026100-0.0353000.0583000.025800
192PC60.040100-0.0022000.004300-0.053600
193PC10.024700-0.0569000.051300-0.035600
195PC50.0299000.0065000.0358000.050200
200PC10.021200-0.0562000.0057000.072600
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "print(pd.Series(dpul.labels_).value_counts())\n", + "aa.display_df(df=dpul.df_pu_[dpul.df_pu_[\"selection_via\"].str.contains(\"PC\", na=False)],\n", + " n_rows=10, n_cols=5, show_shape=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tests/unit/data_handling_tests/test_get_labels.py b/tests/unit/data_handling_tests/test_get_labels.py new file mode 100644 index 00000000..2f499e86 --- /dev/null +++ b/tests/unit/data_handling_tests/test_get_labels.py @@ -0,0 +1,101 @@ +""" +This script tests the top-level get_labels() function (issue #308). + +get_labels is the single-call form of the recurring +``(df[col] == positive_label).astype(int).to_numpy()`` expression that appears in 4+ places +of the gamma-secretase use case. It maps the positive value onto 1 and everything else onto 0. +""" +import numpy as np +import pandas as pd +import pytest + +import aaanalysis as aa + + +# Helper functions +def _manual(df, positive_label, col="label"): + return (df[col] == positive_label).astype(int).to_numpy() + + +# Normal Cases Test Class +class TestGetLabels: + """Test get_labels() for each parameter individually.""" + + def test_returns_int_numpy_array(self): + df = pd.DataFrame({"entry": ["a", "b", "c"], "label": [1, 2, 1]}) + labels = aa.get_labels(df=df, positive_label=1) + assert isinstance(labels, np.ndarray) + assert labels.dtype.kind == "i" + assert labels.shape == (3,) + + def test_positive_label_default(self): + df = pd.DataFrame({"label": [1, 0, 1, 0]}) + labels = aa.get_labels(df=df) + assert np.array_equal(labels, np.array([1, 0, 1, 0])) + + def test_df_parameter(self): + df = pd.DataFrame({"label": [2, 2, 1]}) + labels = aa.get_labels(df=df, positive_label=1) + assert np.array_equal(labels, np.array([0, 0, 1])) + + def test_col_label_parameter(self): + df = pd.DataFrame({"y": [1, 2, 1, 2]}) + labels = aa.get_labels(df=df, positive_label=2, col_label="y") + assert np.array_equal(labels, np.array([0, 1, 0, 1])) + + +# Golden equivalence to the manual expression (KPI: >= 2 encodings) +class TestGetLabelsEquivalence: + """Result equals the manual expression on multiple label encodings (KPI #308).""" + + def test_pu_encoding_1_2(self): + # PU encoding: 1 = positive, 2 = unlabeled + df = pd.DataFrame({"label": [1, 2, 1, 2, 2, 1]}) + assert np.array_equal(aa.get_labels(df=df, positive_label=1), + _manual(df, 1)) + + def test_binary_encoding_0_1(self): + # Standard {0, 1} encoding + df = pd.DataFrame({"label": [0, 1, 1, 0]}) + assert np.array_equal(aa.get_labels(df=df, positive_label=1), + _manual(df, 1)) + + def test_multiclass_encoding(self): + # Multi-class: pick one class as positive + df = pd.DataFrame({"label": [0, 1, 2, 0, 1, 2]}) + for pos in (0, 1, 2): + assert np.array_equal(aa.get_labels(df=df, positive_label=pos), + _manual(df, pos)) + + def test_string_labels(self): + df = pd.DataFrame({"label": ["sub", "non", "sub", "unl"]}) + assert np.array_equal(aa.get_labels(df=df, positive_label="sub"), + _manual(df, "sub")) + + +# Negative Cases Test Class +class TestGetLabelsNegative: + """Invalid inputs must raise informative ValueErrors.""" + + def test_df_none(self): + with pytest.raises(ValueError): + aa.get_labels(df=None, positive_label=1) + + def test_df_not_dataframe(self): + with pytest.raises(ValueError): + aa.get_labels(df=[1, 2, 3], positive_label=1) + + def test_missing_label_column(self): + df = pd.DataFrame({"entry": ["a", "b"], "y": [1, 0]}) + with pytest.raises(ValueError): + aa.get_labels(df=df, positive_label=1) + + def test_custom_col_missing(self): + df = pd.DataFrame({"label": [1, 0]}) + with pytest.raises(ValueError): + aa.get_labels(df=df, positive_label=1, col_label="missing") + + def test_positive_label_absent(self): + df = pd.DataFrame({"label": [1, 2, 1]}) + with pytest.raises(ValueError): + aa.get_labels(df=df, positive_label=9) diff --git a/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py b/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py new file mode 100644 index 00000000..1329f585 --- /dev/null +++ b/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py @@ -0,0 +1,184 @@ +""" +This script tests the dPULearn.mine_negatives() convenience method (issue #308). + +mine_negatives is additive sugar over dPULearn.fit: it stacks X_pos over X_unlabelled, +builds a 1 (positive) / 2 (unlabeled) label vector, fits, and returns the boolean mask of +identified reliable negatives over the rows of X_unlabelled. The key contract is that the +mask equals the manual ``labels_[len(X_pos):] == 0`` result exactly, and that the existing +``fit`` path stays byte-identical (no algorithm change). +""" +import numpy as np +import pytest + +import aaanalysis as aa + + +# Helper functions +def _make_data(n_pos=20, n_unl=50, n_features=8, seed=0): + rng = np.random.default_rng(seed) + X_pos = rng.normal(0.0, 1.0, size=(n_pos, n_features)) + X_unl = rng.normal(0.6, 1.0, size=(n_unl, n_features)) + return X_pos, X_unl + + +def _manual_mask(X_pos, X_unl, random_state=42, **fit_kwargs): + """Reproduce the notebook cell 18/24 manual stacking path.""" + X_pool = np.vstack([X_pos, X_unl]) + y_pool = np.array([1] * len(X_pos) + [2] * len(X_unl)) + dpul = aa.dPULearn(random_state=random_state, verbose=False) + dpul.fit(X=X_pool, labels=y_pool, **fit_kwargs) + return np.asarray(dpul.labels_)[len(X_pos):] == 0, dpul + + +# Normal Cases Test Class +class TestMineNegatives: + """Test dPULearn.mine_negatives() for each parameter individually.""" + + def test_returns_boolean_mask_over_unlabelled(self): + X_pos, X_unl = _make_data() + dpul = aa.dPULearn(random_state=42, verbose=False) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=10) + assert isinstance(mask, np.ndarray) + assert mask.dtype == bool + assert mask.shape == (X_unl.shape[0],) + assert mask.sum() == 10 + + def test_X_pos_parameter(self): + X_pos, X_unl = _make_data(n_pos=30) + dpul = aa.dPULearn(random_state=42, verbose=False) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=5) + assert mask.shape[0] == X_unl.shape[0] + + def test_X_unlabelled_parameter(self): + X_pos, X_unl = _make_data(n_unl=70) + dpul = aa.dPULearn(random_state=42, verbose=False) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=12) + assert mask.shape[0] == 70 + assert mask.sum() == 12 + + def test_n_unl_to_neg_parameter(self): + X_pos, X_unl = _make_data() + for n in (1, 5, 25): + dpul = aa.dPULearn(random_state=42, verbose=False) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=n) + assert mask.sum() == n + + def test_n_neg_parameter(self): + X_pos, X_unl = _make_data() + dpul = aa.dPULearn(random_state=42, verbose=False) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_neg=8) + assert mask.sum() == 8 + + def test_metric_parameter(self): + X_pos, X_unl = _make_data() + for metric in ("euclidean", "manhattan", "cosine"): + dpul = aa.dPULearn(random_state=42, verbose=False) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, + n_unl_to_neg=10, metric=metric) + assert mask.sum() == 10 + + def test_n_components_parameter(self): + X_pos, X_unl = _make_data() + for n_components in (2, 3, 0.5): + dpul = aa.dPULearn(random_state=42, verbose=False) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, + n_unl_to_neg=10, n_components=n_components) + assert mask.sum() == 10 + + def test_instance_attributes_set(self): + """After mining, labels_ / df_pu_ are set so the plotting class works.""" + X_pos, X_unl = _make_data() + dpul = aa.dPULearn(random_state=42, verbose=False) + dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=10) + assert dpul.labels_ is not None + assert dpul.labels_.shape[0] == X_pos.shape[0] + X_unl.shape[0] + assert dpul.df_pu_ is not None + + +# Regression / golden equivalence +class TestMineNegativesEquivalence: + """The mask must equal the manual stacking path exactly (KPI #308).""" + + @pytest.mark.parametrize("seed", [0, 1, 7]) + def test_mask_equals_manual_pca(self, seed): + X_pos, X_unl = _make_data(seed=seed) + manual_mask, dpul_m = _manual_mask(X_pos, X_unl, n_unl_to_neg=10) + dpul = aa.dPULearn(random_state=42, verbose=False) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=10) + assert np.array_equal(mask, manual_mask) + assert np.array_equal(np.asarray(dpul.labels_), np.asarray(dpul_m.labels_)) + + def test_mask_equals_manual_metric(self): + X_pos, X_unl = _make_data(seed=3) + manual_mask, _ = _manual_mask(X_pos, X_unl, n_unl_to_neg=8, metric="cosine") + dpul = aa.dPULearn(random_state=42, verbose=False) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, + n_unl_to_neg=8, metric="cosine") + assert np.array_equal(mask, manual_mask) + + def test_n_neg_equivalent_to_n_unl_to_neg_without_pre_neg(self): + X_pos, X_unl = _make_data() + dpul_a = aa.dPULearn(random_state=42, verbose=False) + mask_a = dpul_a.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=10) + dpul_b = aa.dPULearn(random_state=42, verbose=False) + mask_b = dpul_b.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_neg=10) + assert np.array_equal(mask_a, mask_b) + + +# Negative Cases Test Class +class TestMineNegativesNegative: + """Invalid inputs must raise informative ValueErrors.""" + + def test_feature_mismatch(self): + X_pos, _ = _make_data(n_features=8) + _, X_unl = _make_data(n_features=6) + dpul = aa.dPULearn(random_state=42, verbose=False) + with pytest.raises(ValueError): + dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=5) + + def test_both_counts_given(self): + X_pos, X_unl = _make_data() + dpul = aa.dPULearn(random_state=42, verbose=False) + with pytest.raises(ValueError): + dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_neg=5, n_unl_to_neg=5) + + def test_neither_count_given(self): + X_pos, X_unl = _make_data() + dpul = aa.dPULearn(random_state=42, verbose=False) + with pytest.raises(ValueError): + dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl) + + def test_too_many_negatives_requested(self): + X_pos, X_unl = _make_data(n_unl=10) + dpul = aa.dPULearn(random_state=42, verbose=False) + with pytest.raises(ValueError): + dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=999) + + def test_X_pos_none(self): + _, X_unl = _make_data() + dpul = aa.dPULearn(random_state=42, verbose=False) + with pytest.raises(ValueError): + dpul.mine_negatives(X_pos=None, X_unlabelled=X_unl, n_unl_to_neg=5) + + def test_X_unlabelled_none(self): + X_pos, _ = _make_data() + dpul = aa.dPULearn(random_state=42, verbose=False) + with pytest.raises(ValueError): + dpul.mine_negatives(X_pos=X_pos, X_unlabelled=None, n_unl_to_neg=5) + + +# Existing-fit byte-identical regression +class TestFitUnchanged: + """The pre-existing fit(X, labels=...) path stays byte-identical (#308 no-change).""" + + def test_fit_pca_unchanged(self): + X_pos, X_unl = _make_data(seed=11) + X_pool = np.vstack([X_pos, X_unl]) + y_pool = np.array([1] * len(X_pos) + [2] * len(X_unl)) + dpul = aa.dPULearn(random_state=42, verbose=False) + dpul.fit(X=X_pool, labels=y_pool, n_unl_to_neg=10) + labels = np.asarray(dpul.labels_) + # contract: positives stay 1, exactly 10 mined negatives become 0, rest stay 2 + assert (labels[:len(X_pos)] == 1).all() + assert (labels == 0).sum() == 10 + assert set(np.unique(labels)).issubset({0, 1, 2}) diff --git a/tests/unit/plotting_tests/test_color_samples_constants.py b/tests/unit/plotting_tests/test_color_samples_constants.py new file mode 100644 index 00000000..52826a1b --- /dev/null +++ b/tests/unit/plotting_tests/test_color_samples_constants.py @@ -0,0 +1,36 @@ +""" +This script tests the named sample-color constants exposed at top level (issue #308). + +COLOR_SAMPLES_POS / NEG / UNL / REL_NEG are public, named aliases for the canonical sample +colors. They must equal today's ``plot_get_cdict("DICT_COLOR")["SAMPLES_*"]`` values exactly, +so users can reference a named constant instead of indexing the color dict by string key. +""" +import pytest + +import aaanalysis as aa + + +# Golden equivalence test +class TestColorSamplesConstants: + """Named constants must equal the plot_get_cdict values (golden KPI #308).""" + + def test_constants_exist_at_top_level(self): + for name in ("COLOR_SAMPLES_POS", "COLOR_SAMPLES_NEG", + "COLOR_SAMPLES_UNL", "COLOR_SAMPLES_REL_NEG"): + assert hasattr(aa, name) + assert name in aa.__all__ + + @pytest.mark.parametrize("const_name,dict_key", [ + ("COLOR_SAMPLES_POS", "SAMPLES_POS"), + ("COLOR_SAMPLES_NEG", "SAMPLES_NEG"), + ("COLOR_SAMPLES_UNL", "SAMPLES_UNL"), + ("COLOR_SAMPLES_REL_NEG", "SAMPLES_REL_NEG"), + ]) + def test_constant_equals_cdict_value(self, const_name, dict_key): + dict_color = aa.plot_get_cdict(name="DICT_COLOR") + assert getattr(aa, const_name) == dict_color[dict_key] + + def test_constants_are_strings(self): + for name in ("COLOR_SAMPLES_POS", "COLOR_SAMPLES_NEG", + "COLOR_SAMPLES_UNL", "COLOR_SAMPLES_REL_NEG"): + assert isinstance(getattr(aa, name), str) From 27846b9ea0809f2266579ed014590dffd5103eb8 Mon Sep 17 00:00:00 2001 From: Stephan Breimann Date: Wed, 1 Jul 2026 04:50:56 +0200 Subject: [PATCH 2/6] refactor: validate col_label before using it as a required-column key in get_labels Reorder the get_labels Validate block so check_str(col_label) runs before check_df(cols_required=col_label). A non-str col_label now surfaces a clear 'col_label' error instead of an internal 'cols_required' one. No behaviour change on valid input. Co-Authored-By: Claude Opus 4.8 (1M context) --- aaanalysis/data_handling/_get_labels.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aaanalysis/data_handling/_get_labels.py b/aaanalysis/data_handling/_get_labels.py index 9149deaf..f23d8423 100644 --- a/aaanalysis/data_handling/_get_labels.py +++ b/aaanalysis/data_handling/_get_labels.py @@ -62,8 +62,8 @@ def get_labels(df: pd.DataFrame, .. include:: examples/get_labels.rst """ # Check input - ut.check_df(name="df", df=df, cols_required=col_label) ut.check_str(name="col_label", val=col_label, accept_none=False) + ut.check_df(name="df", df=df, cols_required=col_label) check_match_df_positive_label(df=df, col_label=col_label, positive_label=positive_label) # Derive binary int label vector labels = (df[col_label] == positive_label).astype(int).to_numpy() From 233dc6497b565de8be6cffb42abec8928d8e1f0f Mon Sep 17 00:00:00 2001 From: Stephan Breimann Date: Wed, 1 Jul 2026 05:21:06 +0200 Subject: [PATCH 3/6] round2(dpulearn): match manual path for small positive/unlabelled sets mine_negatives validated X_pos and X_unlabelled separately with the default check_X min_n_samples=3, so it rejected n_pos<3 inputs that the manual stacking path accepts (the >=3 floor belongs to the stacked matrix, which fit enforces). Relax the per-matrix check to min_n_samples=1 to restore exact equivalence; add tests for the small-positive-set equivalence and get_labels single-class/NaN mapping. Co-Authored-By: Claude Opus 4.8 (1M context) --- aaanalysis/pu_learning/_dpulearn.py | 8 +++++--- tests/unit/data_handling_tests/test_get_labels.py | 13 +++++++++++++ .../dpulearn_tests/test_dpulearn_mine_negatives.py | 9 +++++++++ 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/aaanalysis/pu_learning/_dpulearn.py b/aaanalysis/pu_learning/_dpulearn.py index 9931dbdc..6e141995 100644 --- a/aaanalysis/pu_learning/_dpulearn.py +++ b/aaanalysis/pu_learning/_dpulearn.py @@ -434,9 +434,11 @@ def mine_negatives(self, -------- .. include:: examples/dpul_mine_negatives.rst """ - # Check input - X_pos = ut.check_X(X=X_pos, X_name="X_pos") - X_unlabelled = ut.check_X(X=X_unlabelled, X_name="X_unlabelled") + # Check input (the >=3 sample floor applies to the stacked matrix, enforced by + # 'fit' below, so per-matrix validation only coerces + checks the feature dimension; + # this keeps mine_negatives accepting exactly what the manual stacking path accepts) + X_pos = ut.check_X(X=X_pos, X_name="X_pos", min_n_samples=1) + X_unlabelled = ut.check_X(X=X_unlabelled, X_name="X_unlabelled", min_n_samples=1) check_match_X_pos_X_unlabelled(X_pos=X_pos, X_unlabelled=X_unlabelled) # Stack positives over the unlabeled pool and fit with the package PU markers n_pos = X_pos.shape[0] diff --git a/tests/unit/data_handling_tests/test_get_labels.py b/tests/unit/data_handling_tests/test_get_labels.py index 2f499e86..0d63e291 100644 --- a/tests/unit/data_handling_tests/test_get_labels.py +++ b/tests/unit/data_handling_tests/test_get_labels.py @@ -72,6 +72,19 @@ def test_string_labels(self): assert np.array_equal(aa.get_labels(df=df, positive_label="sub"), _manual(df, "sub")) + def test_single_class_column_maps_all_ones(self): + # Pure mapping: unlike dPULearn.fit, get_labels does not require >1 distinct value, + # so an all-positive column maps to all ones rather than raising. + df = pd.DataFrame({"label": [1, 1, 1]}) + assert np.array_equal(aa.get_labels(df=df, positive_label=1), + np.array([1, 1, 1])) + + def test_nan_maps_to_zero(self): + # NaN never equals positive_label, so it becomes 0. + df = pd.DataFrame({"label": [1.0, np.nan, 1.0]}) + assert np.array_equal(aa.get_labels(df=df, positive_label=1.0), + np.array([1, 0, 1])) + # Negative Cases Test Class class TestGetLabelsNegative: diff --git a/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py b/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py index 1329f585..19be06a9 100644 --- a/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py +++ b/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py @@ -116,6 +116,15 @@ def test_mask_equals_manual_metric(self): n_unl_to_neg=8, metric="cosine") assert np.array_equal(mask, manual_mask) + def test_mask_equals_manual_few_positives(self): + # n_pos < 3: the manual stacked path accepts it (the >=3 floor applies to the + # stacked matrix), so mine_negatives must match it, not reject the small pos set. + X_pos, X_unl = _make_data(n_pos=1, seed=5) + manual_mask, _ = _manual_mask(X_pos, X_unl, n_unl_to_neg=6) + dpul = aa.dPULearn(random_state=42, verbose=False) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=6) + assert np.array_equal(mask, manual_mask) + def test_n_neg_equivalent_to_n_unl_to_neg_without_pre_neg(self): X_pos, X_unl = _make_data() dpul_a = aa.dPULearn(random_state=42, verbose=False) From 89e26cba6f4a52bf13934c65a09d269b3a7a61c7 Mon Sep 17 00:00:00 2001 From: Stephan Breimann Date: Wed, 1 Jul 2026 05:24:12 +0200 Subject: [PATCH 4/6] round3(dpulearn): rename X_unlabelled -> X_unlabeled for codebase consistency The rest of the package spells it 'unlabeled' (American, 85 uses) and abbreviates the marker as label_unl / n_unl_to_neg; the new public mine_negatives parameter used the British two-L 'X_unlabelled'. Rename the new/unreleased parameter, its match helper, docstrings, tests, cheat-sheet and release-notes entries, and re-execute the example notebook. Co-Authored-By: Claude Opus 4.8 (1M context) --- aaanalysis/pu_learning/_dpulearn.py | 28 +-- docs/_cheatsheet/content.py | 2 +- docs/source/index/release_notes.rst | 4 +- .../pu_learning/dpul_mine_negatives.ipynb | 184 +++++++++--------- .../test_dpulearn_mine_negatives.py | 48 ++--- 5 files changed, 133 insertions(+), 133 deletions(-) diff --git a/aaanalysis/pu_learning/_dpulearn.py b/aaanalysis/pu_learning/_dpulearn.py index 6e141995..3172d93f 100644 --- a/aaanalysis/pu_learning/_dpulearn.py +++ b/aaanalysis/pu_learning/_dpulearn.py @@ -133,13 +133,13 @@ def check_match_X_X_neg(X=None, X_neg=None) -> None: raise ValueError(f"'n_features' does not match between 'X' (n={n_features}) and 'X_neg' (n={n_features_neg})") -def check_match_X_pos_X_unlabelled(X_pos=None, X_unlabelled=None) -> None: +def check_match_X_pos_X_unlabeled(X_pos=None, X_unlabeled=None) -> None: """Check that positive and unlabeled feature matrices share the same feature dimension.""" n_features_pos = X_pos.shape[1] - n_features_unl = X_unlabelled.shape[1] + n_features_unl = X_unlabeled.shape[1] if n_features_pos != n_features_unl: raise ValueError(f"'n_features' does not match between 'X_pos' (n={n_features_pos}) and " - f"'X_unlabelled' (n={n_features_unl})") + f"'X_unlabeled' (n={n_features_unl})") # II Main Functions @@ -369,7 +369,7 @@ def fit(self, def mine_negatives(self, X_pos: ut.ArrayLike2D, - X_unlabelled: ut.ArrayLike2D, + X_unlabeled: ut.ArrayLike2D, n_neg: Optional[int] = None, n_unl_to_neg: Optional[int] = None, metric: Optional[Literal["euclidean", "manhattan", "cosine"]] = None, @@ -379,15 +379,15 @@ def mine_negatives(self, Mine reliable negatives from an unlabeled pool given the positives, in one call. Convenience wrapper around :meth:`dPULearn.fit` for the common positive/unlabeled - setup: instead of stacking ``X_pos`` and ``X_unlabelled`` by hand, building a + setup: instead of stacking ``X_pos`` and ``X_unlabeled`` by hand, building a ``1`` (positive) / ``2`` (unlabeled) label vector, fitting, and slicing the mined rows back out by index, pass the two feature matrices separately and receive a - **boolean mask over the rows of** ``X_unlabelled`` flagging the identified reliable + **boolean mask over the rows of** ``X_unlabeled`` flagging the identified reliable negatives. The mask equals ``labels_[len(X_pos):] == 0`` from the manual stacking path exactly. After the call the instance is fitted: :attr:`dPULearn.labels_` (over the stacked - ``X_pos`` then ``X_unlabelled``) and :attr:`dPULearn.df_pu_` are set, so the + ``X_pos`` then ``X_unlabeled``) and :attr:`dPULearn.df_pu_` are set, so the :class:`dPULearnPlot` methods work as usual. .. versionadded:: 1.1.0 @@ -396,7 +396,7 @@ def mine_negatives(self, ---------- X_pos : array-like, shape (n_pos, n_features) Feature matrix of the positive samples. - X_unlabelled : array-like, shape (n_unl, n_features) + X_unlabeled : array-like, shape (n_unl, n_features) Feature matrix of the unlabeled samples (the candidate pool). Must have the same number of features as ``X_pos``. n_neg : int, optional @@ -416,8 +416,8 @@ def mine_negatives(self, Returns ------- mask_neg : array-like, shape (n_unl,) - Boolean mask over the rows of ``X_unlabelled``: ``True`` marks an identified - reliable negative. ``X_unlabelled[mask_neg]`` are the mined negatives. + Boolean mask over the rows of ``X_unlabeled``: ``True`` marks an identified + reliable negative. ``X_unlabeled[mask_neg]`` are the mined negatives. Notes ----- @@ -438,12 +438,12 @@ def mine_negatives(self, # 'fit' below, so per-matrix validation only coerces + checks the feature dimension; # this keeps mine_negatives accepting exactly what the manual stacking path accepts) X_pos = ut.check_X(X=X_pos, X_name="X_pos", min_n_samples=1) - X_unlabelled = ut.check_X(X=X_unlabelled, X_name="X_unlabelled", min_n_samples=1) - check_match_X_pos_X_unlabelled(X_pos=X_pos, X_unlabelled=X_unlabelled) + X_unlabeled = ut.check_X(X=X_unlabeled, X_name="X_unlabeled", min_n_samples=1) + check_match_X_pos_X_unlabeled(X_pos=X_pos, X_unlabeled=X_unlabeled) # Stack positives over the unlabeled pool and fit with the package PU markers n_pos = X_pos.shape[0] - X = np.vstack([X_pos, X_unlabelled]) - labels = np.array([1] * n_pos + [2] * X_unlabelled.shape[0]) + X = np.vstack([X_pos, X_unlabeled]) + labels = np.array([1] * n_pos + [2] * X_unlabeled.shape[0]) self.fit(X=X, labels=labels, label_pos=1, label_unl=2, n_neg=n_neg, n_unl_to_neg=n_unl_to_neg, metric=metric, n_components=n_components) diff --git a/docs/_cheatsheet/content.py b/docs/_cheatsheet/content.py index da244c72..0d04b6eb 100644 --- a/docs/_cheatsheet/content.py +++ b/docs/_cheatsheet/content.py @@ -222,7 +222,7 @@ {"name": "Modeling & Explainability", "tag": "PU · classify · SHAP", "rows": [ ("Train with positives + unlabeled data", "dPULearn().fit(X, labels) [Wrapper]", None), - ("Mine reliable negatives (mask)", "dPULearn().mine_negatives(X_pos, X_unlabelled) → mask", None, "v1.1"), + ("Mine reliable negatives (mask)", "dPULearn().mine_negatives(X_pos, X_unlabeled) → mask", None, "v1.1"), ("Train + RFE + MC importance", "TreeModel().fit(X, labels) [Wrapper]", None), ("Per-feature / sample SHAP impact", "ShapModel().fit(X, labels) [pro]", None), ]}, diff --git a/docs/source/index/release_notes.rst b/docs/source/index/release_notes.rst index 5ddf838c..d87656c2 100644 --- a/docs/source/index/release_notes.rst +++ b/docs/source/index/release_notes.rst @@ -138,9 +138,9 @@ Added **PU Learning** - **dPULearn.mine_negatives**: One-call convenience over ``dPULearn.fit`` for the common - positive / unlabeled setup. Pass ``X_pos`` and ``X_unlabelled`` separately instead of + positive / unlabeled setup. Pass ``X_pos`` and ``X_unlabeled`` separately instead of stacking them by hand, building a ``1`` / ``2`` label vector, fitting, and slicing the - mined rows back out; it returns the **boolean mask over the rows of** ``X_unlabelled`` + mined rows back out; it returns the **boolean mask over the rows of** ``X_unlabeled`` flagging the identified reliable negatives (equal to the manual ``labels_[len(X_pos):] == 0`` result exactly). The instance is left fitted (``labels_`` / ``df_pu_`` set, so ``dPULearnPlot`` works), and the existing ``fit(X, labels=...)`` path is unchanged. diff --git a/examples/pu_learning/dpul_mine_negatives.ipynb b/examples/pu_learning/dpul_mine_negatives.ipynb index cd9fc03a..99bcca21 100644 --- a/examples/pu_learning/dpul_mine_negatives.ipynb +++ b/examples/pu_learning/dpul_mine_negatives.ipynb @@ -5,7 +5,7 @@ "id": "01ccf3ee", "metadata": {}, "source": [ - "The ``dPULearn().mine_negatives()`` method mines reliable negatives from an unlabeled pool in one call. Instead of stacking the positive and unlabeled feature matrices by hand, building a ``1`` (positive) / ``2`` (unlabeled) label vector, fitting, and slicing the mined rows back out by index, you pass the two matrices separately and receive a boolean mask over the rows of ``X_unlabelled``." + "The ``dPULearn().mine_negatives()`` method mines reliable negatives from an unlabeled pool in one call. Instead of stacking the positive and unlabeled feature matrices by hand, building a ``1`` (positive) / ``2`` (unlabeled) label vector, fitting, and slicing the mined rows back out by index, you pass the two matrices separately and receive a boolean mask over the rows of ``X_unlabeled``." ] }, { @@ -14,10 +14,10 @@ "id": "710fdc35", "metadata": { "execution": { - "iopub.execute_input": "2026-06-30T23:30:36.652099Z", - "iopub.status.busy": "2026-06-30T23:30:36.652032Z", - "iopub.status.idle": "2026-06-30T23:30:38.227970Z", - "shell.execute_reply": "2026-06-30T23:30:38.227738Z" + "iopub.execute_input": "2026-07-01T03:23:09.096722Z", + "iopub.status.busy": "2026-07-01T03:23:09.096500Z", + "iopub.status.idle": "2026-07-01T03:23:11.009355Z", + "shell.execute_reply": "2026-07-01T03:23:11.009064Z" } }, "outputs": [ @@ -53,7 +53,7 @@ "id": "38f3d31d", "metadata": {}, "source": [ - "Mine a fixed number of reliable negatives directly from the unlabeled pool with ``n_unl_to_neg``. The returned boolean ``mask_neg`` flags, over the rows of ``X_unlabelled``, which unlabeled samples were identified as reliable negatives." + "Mine a fixed number of reliable negatives directly from the unlabeled pool with ``n_unl_to_neg``. The returned boolean ``mask_neg`` flags, over the rows of ``X_unlabeled``, which unlabeled samples were identified as reliable negatives." ] }, { @@ -62,10 +62,10 @@ "id": "bdccaa03", "metadata": { "execution": { - "iopub.execute_input": "2026-06-30T23:30:38.229229Z", - "iopub.status.busy": "2026-06-30T23:30:38.229155Z", - "iopub.status.idle": "2026-06-30T23:30:38.241855Z", - "shell.execute_reply": "2026-06-30T23:30:38.241652Z" + "iopub.execute_input": "2026-07-01T03:23:11.010517Z", + "iopub.status.busy": "2026-07-01T03:23:11.010443Z", + "iopub.status.idle": "2026-07-01T03:23:11.023598Z", + "shell.execute_reply": "2026-07-01T03:23:11.023349Z" } }, "outputs": [ @@ -79,7 +79,7 @@ ], "source": [ "dpul = aa.dPULearn(random_state=42)\n", - "mask_neg = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=49)\n", + "mask_neg = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=49)\n", "print(f\"mined reliable negatives: {int(mask_neg.sum())} of {len(X_unl)} unlabeled\")\n", "X_neg = X_unl[mask_neg] # the mined feature rows" ] @@ -89,7 +89,7 @@ "id": "315d4798", "metadata": {}, "source": [ - "The mask equals the manual stacking path exactly: stack ``X_pos`` over ``X_unlabelled``, fit with ``1`` / ``2`` labels, and slice ``labels_[len(X_pos):] == 0``." + "The mask equals the manual stacking path exactly: stack ``X_pos`` over ``X_unlabeled``, fit with ``1`` / ``2`` labels, and slice ``labels_[len(X_pos):] == 0``." ] }, { @@ -98,10 +98,10 @@ "id": "c78d6eab", "metadata": { "execution": { - "iopub.execute_input": "2026-06-30T23:30:38.243063Z", - "iopub.status.busy": "2026-06-30T23:30:38.242994Z", - "iopub.status.idle": "2026-06-30T23:30:38.254654Z", - "shell.execute_reply": "2026-06-30T23:30:38.254464Z" + "iopub.execute_input": "2026-07-01T03:23:11.024869Z", + "iopub.status.busy": "2026-07-01T03:23:11.024779Z", + "iopub.status.idle": "2026-07-01T03:23:11.036398Z", + "shell.execute_reply": "2026-07-01T03:23:11.036167Z" } }, "outputs": [ @@ -114,7 +114,7 @@ } ], "source": [ - "# Stack X_pos over X_unlabelled, fit with 1 / 2 labels, and slice labels_[len(X_pos):] == 0.\n", + "# Stack X_pos over X_unlabeled, fit with 1 / 2 labels, and slice labels_[len(X_pos):] == 0.\n", "labels_manual = np.asarray(\n", " aa.dPULearn(random_state=42)\n", " .fit(X=np.vstack([X_pos, X_unl]),\n", @@ -139,10 +139,10 @@ "id": "85eff05c", "metadata": { "execution": { - "iopub.execute_input": "2026-06-30T23:30:38.255594Z", - "iopub.status.busy": "2026-06-30T23:30:38.255519Z", - "iopub.status.idle": "2026-06-30T23:30:38.282637Z", - "shell.execute_reply": "2026-06-30T23:30:38.282413Z" + "iopub.execute_input": "2026-07-01T03:23:11.037630Z", + "iopub.status.busy": "2026-07-01T03:23:11.037545Z", + "iopub.status.idle": "2026-07-01T03:23:11.072965Z", + "shell.execute_reply": "2026-07-01T03:23:11.072412Z" } }, "outputs": [ @@ -161,116 +161,116 @@ "data": { "text/html": [ "\n", - "\n", + "
\n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
 selection_viaPC1 (56.2%)PC2 (7.4%)PC3 (2.9%)PC4 (2.8%)selection_viaPC1 (56.2%)PC2 (7.4%)PC3 (2.9%)PC4 (2.8%)
84PC10.021000-0.0478000.075200-0.00540084PC10.021000-0.0478000.075200-0.005400
95PC20.032000-0.0821000.025800-0.03770095PC20.032000-0.0821000.025800-0.037700
109PC10.026100-0.0585000.075700-0.020900109PC10.026100-0.0585000.075700-0.020900
158PC10.023500-0.0607000.0540000.000900158PC10.023500-0.0607000.0540000.000900
161PC10.0259000.0314000.0449000.055400161PC10.0259000.0314000.0449000.055400
170PC10.026100-0.0353000.0583000.025800170PC10.026100-0.0353000.0583000.025800
192PC60.040100-0.0022000.004300-0.053600192PC60.040100-0.0022000.004300-0.053600
193PC10.024700-0.0569000.051300-0.035600193PC10.024700-0.0569000.051300-0.035600
195PC50.0299000.0065000.0358000.050200195PC50.0299000.0065000.0358000.050200
200PC10.021200-0.0562000.0057000.072600200PC10.021200-0.0562000.0057000.072600
\n" diff --git a/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py b/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py index 19be06a9..ecc20b72 100644 --- a/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py +++ b/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py @@ -1,9 +1,9 @@ """ This script tests the dPULearn.mine_negatives() convenience method (issue #308). -mine_negatives is additive sugar over dPULearn.fit: it stacks X_pos over X_unlabelled, +mine_negatives is additive sugar over dPULearn.fit: it stacks X_pos over X_unlabeled, builds a 1 (positive) / 2 (unlabeled) label vector, fits, and returns the boolean mask of -identified reliable negatives over the rows of X_unlabelled. The key contract is that the +identified reliable negatives over the rows of X_unlabeled. The key contract is that the mask equals the manual ``labels_[len(X_pos):] == 0`` result exactly, and that the existing ``fit`` path stays byte-identical (no algorithm change). """ @@ -34,10 +34,10 @@ def _manual_mask(X_pos, X_unl, random_state=42, **fit_kwargs): class TestMineNegatives: """Test dPULearn.mine_negatives() for each parameter individually.""" - def test_returns_boolean_mask_over_unlabelled(self): + def test_returns_boolean_mask_over_unlabeled(self): X_pos, X_unl = _make_data() dpul = aa.dPULearn(random_state=42, verbose=False) - mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=10) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=10) assert isinstance(mask, np.ndarray) assert mask.dtype == bool assert mask.shape == (X_unl.shape[0],) @@ -46,13 +46,13 @@ def test_returns_boolean_mask_over_unlabelled(self): def test_X_pos_parameter(self): X_pos, X_unl = _make_data(n_pos=30) dpul = aa.dPULearn(random_state=42, verbose=False) - mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=5) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=5) assert mask.shape[0] == X_unl.shape[0] - def test_X_unlabelled_parameter(self): + def test_X_unlabeled_parameter(self): X_pos, X_unl = _make_data(n_unl=70) dpul = aa.dPULearn(random_state=42, verbose=False) - mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=12) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=12) assert mask.shape[0] == 70 assert mask.sum() == 12 @@ -60,20 +60,20 @@ def test_n_unl_to_neg_parameter(self): X_pos, X_unl = _make_data() for n in (1, 5, 25): dpul = aa.dPULearn(random_state=42, verbose=False) - mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=n) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=n) assert mask.sum() == n def test_n_neg_parameter(self): X_pos, X_unl = _make_data() dpul = aa.dPULearn(random_state=42, verbose=False) - mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_neg=8) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=8) assert mask.sum() == 8 def test_metric_parameter(self): X_pos, X_unl = _make_data() for metric in ("euclidean", "manhattan", "cosine"): dpul = aa.dPULearn(random_state=42, verbose=False) - mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=10, metric=metric) assert mask.sum() == 10 @@ -81,7 +81,7 @@ def test_n_components_parameter(self): X_pos, X_unl = _make_data() for n_components in (2, 3, 0.5): dpul = aa.dPULearn(random_state=42, verbose=False) - mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=10, n_components=n_components) assert mask.sum() == 10 @@ -89,7 +89,7 @@ def test_instance_attributes_set(self): """After mining, labels_ / df_pu_ are set so the plotting class works.""" X_pos, X_unl = _make_data() dpul = aa.dPULearn(random_state=42, verbose=False) - dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=10) + dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=10) assert dpul.labels_ is not None assert dpul.labels_.shape[0] == X_pos.shape[0] + X_unl.shape[0] assert dpul.df_pu_ is not None @@ -104,7 +104,7 @@ def test_mask_equals_manual_pca(self, seed): X_pos, X_unl = _make_data(seed=seed) manual_mask, dpul_m = _manual_mask(X_pos, X_unl, n_unl_to_neg=10) dpul = aa.dPULearn(random_state=42, verbose=False) - mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=10) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=10) assert np.array_equal(mask, manual_mask) assert np.array_equal(np.asarray(dpul.labels_), np.asarray(dpul_m.labels_)) @@ -112,7 +112,7 @@ def test_mask_equals_manual_metric(self): X_pos, X_unl = _make_data(seed=3) manual_mask, _ = _manual_mask(X_pos, X_unl, n_unl_to_neg=8, metric="cosine") dpul = aa.dPULearn(random_state=42, verbose=False) - mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=8, metric="cosine") assert np.array_equal(mask, manual_mask) @@ -122,15 +122,15 @@ def test_mask_equals_manual_few_positives(self): X_pos, X_unl = _make_data(n_pos=1, seed=5) manual_mask, _ = _manual_mask(X_pos, X_unl, n_unl_to_neg=6) dpul = aa.dPULearn(random_state=42, verbose=False) - mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=6) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=6) assert np.array_equal(mask, manual_mask) def test_n_neg_equivalent_to_n_unl_to_neg_without_pre_neg(self): X_pos, X_unl = _make_data() dpul_a = aa.dPULearn(random_state=42, verbose=False) - mask_a = dpul_a.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=10) + mask_a = dpul_a.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=10) dpul_b = aa.dPULearn(random_state=42, verbose=False) - mask_b = dpul_b.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_neg=10) + mask_b = dpul_b.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=10) assert np.array_equal(mask_a, mask_b) @@ -143,37 +143,37 @@ def test_feature_mismatch(self): _, X_unl = _make_data(n_features=6) dpul = aa.dPULearn(random_state=42, verbose=False) with pytest.raises(ValueError): - dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=5) + dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=5) def test_both_counts_given(self): X_pos, X_unl = _make_data() dpul = aa.dPULearn(random_state=42, verbose=False) with pytest.raises(ValueError): - dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_neg=5, n_unl_to_neg=5) + dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=5, n_unl_to_neg=5) def test_neither_count_given(self): X_pos, X_unl = _make_data() dpul = aa.dPULearn(random_state=42, verbose=False) with pytest.raises(ValueError): - dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl) + dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl) def test_too_many_negatives_requested(self): X_pos, X_unl = _make_data(n_unl=10) dpul = aa.dPULearn(random_state=42, verbose=False) with pytest.raises(ValueError): - dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=999) + dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=999) def test_X_pos_none(self): _, X_unl = _make_data() dpul = aa.dPULearn(random_state=42, verbose=False) with pytest.raises(ValueError): - dpul.mine_negatives(X_pos=None, X_unlabelled=X_unl, n_unl_to_neg=5) + dpul.mine_negatives(X_pos=None, X_unlabeled=X_unl, n_unl_to_neg=5) - def test_X_unlabelled_none(self): + def test_X_unlabeled_none(self): X_pos, _ = _make_data() dpul = aa.dPULearn(random_state=42, verbose=False) with pytest.raises(ValueError): - dpul.mine_negatives(X_pos=X_pos, X_unlabelled=None, n_unl_to_neg=5) + dpul.mine_negatives(X_pos=X_pos, X_unlabeled=None, n_unl_to_neg=5) # Existing-fit byte-identical regression From 86ec2eadded132bce9e761a1786f5930fdcabbab Mon Sep 17 00:00:00 2001 From: Stephan Breimann Date: Wed, 1 Jul 2026 05:29:43 +0200 Subject: [PATCH 5/6] round4(dpulearn): collapse mine_negatives counts to a single n_neg With no pre-labeled negatives, n_neg (total) and n_unl_to_neg (from the pool) are always equivalent in mine_negatives, so exposing both was redundant. Replace them with a single required n_neg (the method is new/unreleased, so non-breaking); it calls fit(n_unl_to_neg=n_neg) internally. Update docstring, tests, and the re-executed example notebook. Co-Authored-By: Claude Opus 4.8 (1M context) --- aaanalysis/pu_learning/_dpulearn.py | 17 +- .../pu_learning/dpul_mine_negatives.ipynb | 180 +++++++++--------- .../test_dpulearn_mine_negatives.py | 54 ++---- 3 files changed, 113 insertions(+), 138 deletions(-) diff --git a/aaanalysis/pu_learning/_dpulearn.py b/aaanalysis/pu_learning/_dpulearn.py index 3172d93f..40d44b44 100644 --- a/aaanalysis/pu_learning/_dpulearn.py +++ b/aaanalysis/pu_learning/_dpulearn.py @@ -370,8 +370,7 @@ def fit(self, def mine_negatives(self, X_pos: ut.ArrayLike2D, X_unlabeled: ut.ArrayLike2D, - n_neg: Optional[int] = None, - n_unl_to_neg: Optional[int] = None, + n_neg: int, metric: Optional[Literal["euclidean", "manhattan", "cosine"]] = None, n_components: Union[float, int] = 0.80, ) -> np.ndarray: @@ -399,13 +398,9 @@ def mine_negatives(self, X_unlabeled : array-like, shape (n_unl, n_features) Feature matrix of the unlabeled samples (the candidate pool). Must have the same number of features as ``X_pos``. - n_neg : int, optional - Total number of reliable negatives to identify from the unlabeled pool. - Provide **exactly one** of ``n_neg`` or ``n_unl_to_neg`` (with no pre-labeled - negatives the two are equivalent). - n_unl_to_neg : int, optional - Number of reliable negatives to identify directly from the unlabeled pool. - Provide **exactly one** of ``n_neg`` or ``n_unl_to_neg``. + n_neg : int + Number of reliable negatives to identify from the unlabeled pool. Must not + exceed the number of unlabeled samples. metric : str or None, optional Distance metric for distance-based identification (``euclidean``, ``manhattan``, ``cosine``). If ``None``, PCA-based identification is performed. @@ -444,9 +439,9 @@ def mine_negatives(self, n_pos = X_pos.shape[0] X = np.vstack([X_pos, X_unlabeled]) labels = np.array([1] * n_pos + [2] * X_unlabeled.shape[0]) + # No pre-labeled negatives here, so n_neg is exactly the count to draw from the pool self.fit(X=X, labels=labels, label_pos=1, label_unl=2, - n_neg=n_neg, n_unl_to_neg=n_unl_to_neg, - metric=metric, n_components=n_components) + n_unl_to_neg=n_neg, metric=metric, n_components=n_components) # Slice the mined reliable negatives (label 0) back out of the unlabeled block mask_neg = np.asarray(self.labels_)[n_pos:] == 0 return mask_neg diff --git a/examples/pu_learning/dpul_mine_negatives.ipynb b/examples/pu_learning/dpul_mine_negatives.ipynb index 99bcca21..fc56d1fc 100644 --- a/examples/pu_learning/dpul_mine_negatives.ipynb +++ b/examples/pu_learning/dpul_mine_negatives.ipynb @@ -14,10 +14,10 @@ "id": "710fdc35", "metadata": { "execution": { - "iopub.execute_input": "2026-07-01T03:23:09.096722Z", - "iopub.status.busy": "2026-07-01T03:23:09.096500Z", - "iopub.status.idle": "2026-07-01T03:23:11.009355Z", - "shell.execute_reply": "2026-07-01T03:23:11.009064Z" + "iopub.execute_input": "2026-07-01T03:27:51.939100Z", + "iopub.status.busy": "2026-07-01T03:27:51.937608Z", + "iopub.status.idle": "2026-07-01T03:28:11.600218Z", + "shell.execute_reply": "2026-07-01T03:28:11.552756Z" } }, "outputs": [ @@ -53,7 +53,7 @@ "id": "38f3d31d", "metadata": {}, "source": [ - "Mine a fixed number of reliable negatives directly from the unlabeled pool with ``n_unl_to_neg``. The returned boolean ``mask_neg`` flags, over the rows of ``X_unlabeled``, which unlabeled samples were identified as reliable negatives." + "Mine a fixed number of reliable negatives from the unlabeled pool with ``n_neg``. The returned boolean ``mask_neg`` flags, over the rows of ``X_unlabeled``, which unlabeled samples were identified as reliable negatives." ] }, { @@ -62,10 +62,10 @@ "id": "bdccaa03", "metadata": { "execution": { - "iopub.execute_input": "2026-07-01T03:23:11.010517Z", - "iopub.status.busy": "2026-07-01T03:23:11.010443Z", - "iopub.status.idle": "2026-07-01T03:23:11.023598Z", - "shell.execute_reply": "2026-07-01T03:23:11.023349Z" + "iopub.execute_input": "2026-07-01T03:28:11.767224Z", + "iopub.status.busy": "2026-07-01T03:28:11.766642Z", + "iopub.status.idle": "2026-07-01T03:28:11.816538Z", + "shell.execute_reply": "2026-07-01T03:28:11.795116Z" } }, "outputs": [ @@ -79,7 +79,7 @@ ], "source": [ "dpul = aa.dPULearn(random_state=42)\n", - "mask_neg = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=49)\n", + "mask_neg = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=49)\n", "print(f\"mined reliable negatives: {int(mask_neg.sum())} of {len(X_unl)} unlabeled\")\n", "X_neg = X_unl[mask_neg] # the mined feature rows" ] @@ -98,10 +98,10 @@ "id": "c78d6eab", "metadata": { "execution": { - "iopub.execute_input": "2026-07-01T03:23:11.024869Z", - "iopub.status.busy": "2026-07-01T03:23:11.024779Z", - "iopub.status.idle": "2026-07-01T03:23:11.036398Z", - "shell.execute_reply": "2026-07-01T03:23:11.036167Z" + "iopub.execute_input": "2026-07-01T03:28:11.824245Z", + "iopub.status.busy": "2026-07-01T03:28:11.824003Z", + "iopub.status.idle": "2026-07-01T03:28:12.020230Z", + "shell.execute_reply": "2026-07-01T03:28:11.952823Z" } }, "outputs": [ @@ -130,7 +130,7 @@ "id": "cae2d79d", "metadata": {}, "source": [ - "After mining, the instance is fitted: ``labels_`` (over the stacked positives then unlabeled) and ``df_pu_`` are set, so the :class:`dPULearnPlot` methods work as usual. Use ``n_neg`` instead of ``n_unl_to_neg`` to request a total count, or set a distance ``metric`` (``euclidean`` / ``manhattan`` / ``cosine``) for distance-based identification." + "After mining, the instance is fitted: ``labels_`` (over the stacked positives then unlabeled) and ``df_pu_`` are set, so the :class:`dPULearnPlot` methods work as usual. Pass ``n_neg`` (the number of reliable negatives to mine) and optionally a distance ``metric`` (``euclidean`` / ``manhattan`` / ``cosine``) for distance-based identification instead of the default PCA." ] }, { @@ -139,10 +139,10 @@ "id": "85eff05c", "metadata": { "execution": { - "iopub.execute_input": "2026-07-01T03:23:11.037630Z", - "iopub.status.busy": "2026-07-01T03:23:11.037545Z", - "iopub.status.idle": "2026-07-01T03:23:11.072965Z", - "shell.execute_reply": "2026-07-01T03:23:11.072412Z" + "iopub.execute_input": "2026-07-01T03:28:12.179951Z", + "iopub.status.busy": "2026-07-01T03:28:12.179661Z", + "iopub.status.idle": "2026-07-01T03:28:12.745973Z", + "shell.execute_reply": "2026-07-01T03:28:12.744957Z" } }, "outputs": [ @@ -161,116 +161,116 @@ "data": { "text/html": [ "\n", - "\n", + "
\n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
 selection_viaPC1 (56.2%)PC2 (7.4%)PC3 (2.9%)PC4 (2.8%)selection_viaPC1 (56.2%)PC2 (7.4%)PC3 (2.9%)PC4 (2.8%)
84PC10.021000-0.0478000.075200-0.00540084PC10.021000-0.0478000.075200-0.005400
95PC20.032000-0.0821000.025800-0.03770095PC20.032000-0.0821000.025800-0.037700
109PC10.026100-0.0585000.075700-0.020900109PC10.026100-0.0585000.075700-0.020900
158PC10.023500-0.0607000.0540000.000900158PC10.023500-0.0607000.0540000.000900
161PC10.0259000.0314000.0449000.055400161PC10.0259000.0314000.0449000.055400
170PC10.026100-0.0353000.0583000.025800170PC10.026100-0.0353000.0583000.025800
192PC60.040100-0.0022000.004300-0.053600192PC60.040100-0.0022000.004300-0.053600
193PC10.024700-0.0569000.051300-0.035600193PC10.024700-0.0569000.051300-0.035600
195PC50.0299000.0065000.0358000.050200195PC50.0299000.0065000.0358000.050200
200PC10.021200-0.0562000.0057000.072600200PC10.021200-0.0562000.0057000.072600
\n" diff --git a/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py b/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py index ecc20b72..e11044a7 100644 --- a/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py +++ b/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py @@ -37,7 +37,7 @@ class TestMineNegatives: def test_returns_boolean_mask_over_unlabeled(self): X_pos, X_unl = _make_data() dpul = aa.dPULearn(random_state=42, verbose=False) - mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=10) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=10) assert isinstance(mask, np.ndarray) assert mask.dtype == bool assert mask.shape == (X_unl.shape[0],) @@ -46,35 +46,29 @@ def test_returns_boolean_mask_over_unlabeled(self): def test_X_pos_parameter(self): X_pos, X_unl = _make_data(n_pos=30) dpul = aa.dPULearn(random_state=42, verbose=False) - mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=5) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=5) assert mask.shape[0] == X_unl.shape[0] def test_X_unlabeled_parameter(self): X_pos, X_unl = _make_data(n_unl=70) dpul = aa.dPULearn(random_state=42, verbose=False) - mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=12) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=12) assert mask.shape[0] == 70 assert mask.sum() == 12 - def test_n_unl_to_neg_parameter(self): + def test_n_neg_parameter(self): X_pos, X_unl = _make_data() for n in (1, 5, 25): dpul = aa.dPULearn(random_state=42, verbose=False) - mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=n) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=n) assert mask.sum() == n - def test_n_neg_parameter(self): - X_pos, X_unl = _make_data() - dpul = aa.dPULearn(random_state=42, verbose=False) - mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=8) - assert mask.sum() == 8 - def test_metric_parameter(self): X_pos, X_unl = _make_data() for metric in ("euclidean", "manhattan", "cosine"): dpul = aa.dPULearn(random_state=42, verbose=False) mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, - n_unl_to_neg=10, metric=metric) + n_neg=10, metric=metric) assert mask.sum() == 10 def test_n_components_parameter(self): @@ -82,14 +76,14 @@ def test_n_components_parameter(self): for n_components in (2, 3, 0.5): dpul = aa.dPULearn(random_state=42, verbose=False) mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, - n_unl_to_neg=10, n_components=n_components) + n_neg=10, n_components=n_components) assert mask.sum() == 10 def test_instance_attributes_set(self): """After mining, labels_ / df_pu_ are set so the plotting class works.""" X_pos, X_unl = _make_data() dpul = aa.dPULearn(random_state=42, verbose=False) - dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=10) + dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=10) assert dpul.labels_ is not None assert dpul.labels_.shape[0] == X_pos.shape[0] + X_unl.shape[0] assert dpul.df_pu_ is not None @@ -104,7 +98,7 @@ def test_mask_equals_manual_pca(self, seed): X_pos, X_unl = _make_data(seed=seed) manual_mask, dpul_m = _manual_mask(X_pos, X_unl, n_unl_to_neg=10) dpul = aa.dPULearn(random_state=42, verbose=False) - mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=10) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=10) assert np.array_equal(mask, manual_mask) assert np.array_equal(np.asarray(dpul.labels_), np.asarray(dpul_m.labels_)) @@ -113,7 +107,7 @@ def test_mask_equals_manual_metric(self): manual_mask, _ = _manual_mask(X_pos, X_unl, n_unl_to_neg=8, metric="cosine") dpul = aa.dPULearn(random_state=42, verbose=False) mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, - n_unl_to_neg=8, metric="cosine") + n_neg=8, metric="cosine") assert np.array_equal(mask, manual_mask) def test_mask_equals_manual_few_positives(self): @@ -122,17 +116,9 @@ def test_mask_equals_manual_few_positives(self): X_pos, X_unl = _make_data(n_pos=1, seed=5) manual_mask, _ = _manual_mask(X_pos, X_unl, n_unl_to_neg=6) dpul = aa.dPULearn(random_state=42, verbose=False) - mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=6) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=6) assert np.array_equal(mask, manual_mask) - def test_n_neg_equivalent_to_n_unl_to_neg_without_pre_neg(self): - X_pos, X_unl = _make_data() - dpul_a = aa.dPULearn(random_state=42, verbose=False) - mask_a = dpul_a.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=10) - dpul_b = aa.dPULearn(random_state=42, verbose=False) - mask_b = dpul_b.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=10) - assert np.array_equal(mask_a, mask_b) - # Negative Cases Test Class class TestMineNegativesNegative: @@ -143,37 +129,31 @@ def test_feature_mismatch(self): _, X_unl = _make_data(n_features=6) dpul = aa.dPULearn(random_state=42, verbose=False) with pytest.raises(ValueError): - dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=5) - - def test_both_counts_given(self): - X_pos, X_unl = _make_data() - dpul = aa.dPULearn(random_state=42, verbose=False) - with pytest.raises(ValueError): - dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=5, n_unl_to_neg=5) + dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=5) - def test_neither_count_given(self): + def test_n_neg_below_one(self): X_pos, X_unl = _make_data() dpul = aa.dPULearn(random_state=42, verbose=False) with pytest.raises(ValueError): - dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl) + dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=0) def test_too_many_negatives_requested(self): X_pos, X_unl = _make_data(n_unl=10) dpul = aa.dPULearn(random_state=42, verbose=False) with pytest.raises(ValueError): - dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=999) + dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=999) def test_X_pos_none(self): _, X_unl = _make_data() dpul = aa.dPULearn(random_state=42, verbose=False) with pytest.raises(ValueError): - dpul.mine_negatives(X_pos=None, X_unlabeled=X_unl, n_unl_to_neg=5) + dpul.mine_negatives(X_pos=None, X_unlabeled=X_unl, n_neg=5) def test_X_unlabeled_none(self): X_pos, _ = _make_data() dpul = aa.dPULearn(random_state=42, verbose=False) with pytest.raises(ValueError): - dpul.mine_negatives(X_pos=X_pos, X_unlabeled=None, n_unl_to_neg=5) + dpul.mine_negatives(X_pos=X_pos, X_unlabeled=None, n_neg=5) # Existing-fit byte-identical regression From 6158eab340df5120903b1f77ad4345351982dfb2 Mon Sep 17 00:00:00 2001 From: Stephan Breimann Date: Wed, 1 Jul 2026 05:31:49 +0200 Subject: [PATCH 6/6] round5(dpulearn): validate n_neg in the frontend for a correctly-named error mine_negatives delegated n_neg validation to fit (which sees it as n_unl_to_neg), so an invalid n_neg raised an error naming the internal parameter. Validate n_neg explicitly in the frontend so the message names n_neg, and assert the name in the negative test. Co-Authored-By: Claude Opus 4.8 (1M context) --- aaanalysis/pu_learning/_dpulearn.py | 2 ++ tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/aaanalysis/pu_learning/_dpulearn.py b/aaanalysis/pu_learning/_dpulearn.py index 40d44b44..12da26f6 100644 --- a/aaanalysis/pu_learning/_dpulearn.py +++ b/aaanalysis/pu_learning/_dpulearn.py @@ -435,6 +435,8 @@ def mine_negatives(self, X_pos = ut.check_X(X=X_pos, X_name="X_pos", min_n_samples=1) X_unlabeled = ut.check_X(X=X_unlabeled, X_name="X_unlabeled", min_n_samples=1) check_match_X_pos_X_unlabeled(X_pos=X_pos, X_unlabeled=X_unlabeled) + # Validate n_neg here so the message names 'n_neg' (fit sees it as 'n_unl_to_neg') + ut.check_number_range(name="n_neg", val=n_neg, min_val=1, just_int=True) # Stack positives over the unlabeled pool and fit with the package PU markers n_pos = X_pos.shape[0] X = np.vstack([X_pos, X_unlabeled]) diff --git a/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py b/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py index e11044a7..2accb502 100644 --- a/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py +++ b/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py @@ -134,7 +134,8 @@ def test_feature_mismatch(self): def test_n_neg_below_one(self): X_pos, X_unl = _make_data() dpul = aa.dPULearn(random_state=42, verbose=False) - with pytest.raises(ValueError): + # The error must name 'n_neg' (not the internal 'n_unl_to_neg' fit sees). + with pytest.raises(ValueError, match="n_neg"): dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=0) def test_too_many_negatives_requested(self):