From 026292939a3a6e03dd3c0c4737f101485ac1a065 Mon Sep 17 00:00:00 2001
From: Stephan Breimann <stephanbreimann@gmail.com>
Date: Wed, 1 Jul 2026 01:49:01 +0200
Subject: [PATCH 1/6] feat: dPULearn.mine_negatives + get_labels + named sample
 colors (prototype #308)

Three additive conveniences for the positive/unlabelled -> mined-negatives flow,
removing the recurring vstack/label-vector/color-lookup plumbing in the
gamma-secretase use case. All existing APIs stay byte-identical.

- dPULearn.mine_negatives(X_pos, X_unlabelled, ...): one-call sugar over fit that
  returns the reliable-negative boolean mask over X_unlabelled. Equals the manual
  labels_[len(X_pos):]==0 result exactly (regression-tested). fit(X, labels=...)
  unchanged.
- get_labels(df, positive_label=1, col_label="label"): binary int label vector,
  the single-call form of (df[col]==x).astype(int).to_numpy().
- COLOR_SAMPLES_POS/NEG/UNL/REL_NEG: public named aliases for the canonical sample
  colors, equal to plot_get_cdict("DICT_COLOR")["SAMPLES_*"] (golden-tested).

Wired get_labels + the 4 color constants into __init__/__all__ (on the #308
wire-to-public-API list). Ripple: numpydoc + 2 executed example notebooks
(get_labels, dpul_mine_negatives), 39 unit tests (positive+negative+regression),
release-notes Unreleased entries, cheat-sheet rows.

Part of #305 / prototype for #308

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 aaanalysis/__init__.py                        |   9 +-
 aaanalysis/_constants.py                      |   9 +
 aaanalysis/data_handling/__init__.py          |   6 +-
 aaanalysis/data_handling/_get_labels.py       |  70 ++++
 aaanalysis/pu_learning/_dpulearn.py           |  91 +++++
 docs/_cheatsheet/content.py                   |   2 +
 docs/source/index/release_notes.rst           |  17 +
 examples/data_handling/get_labels.ipynb       | 297 +++++++++++++++++
 .../pu_learning/dpul_mine_negatives.ipynb     | 314 ++++++++++++++++++
 .../data_handling_tests/test_get_labels.py    | 101 ++++++
 .../test_dpulearn_mine_negatives.py           | 184 ++++++++++
 .../test_color_samples_constants.py           |  36 ++
 12 files changed, 1133 insertions(+), 3 deletions(-)
 create mode 100644 aaanalysis/data_handling/_get_labels.py
 create mode 100644 examples/data_handling/get_labels.ipynb
 create mode 100644 examples/pu_learning/dpul_mine_negatives.ipynb
 create mode 100644 tests/unit/data_handling_tests/test_get_labels.py
 create mode 100644 tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py
 create mode 100644 tests/unit/plotting_tests/test_color_samples_constants.py

diff --git a/aaanalysis/__init__.py b/aaanalysis/__init__.py
index 4fc1edee..17989828 100644
--- a/aaanalysis/__init__.py
+++ b/aaanalysis/__init__.py
@@ -1,4 +1,4 @@
-from .data_handling import (load_dataset, load_scales, load_features,
+from .data_handling import (load_dataset, load_scales, load_features, get_labels,
                             read_fasta, to_fasta,
                             SequencePreprocessor,
                             EmbeddingPreprocessor,
@@ -14,6 +14,8 @@
                       comp_per_protein_ap, comp_detection_metrics,
                       comp_bootstrap_ci, comp_smooth_scores)
 from .config import options
+from ._constants import (COLOR_SAMPLES_POS, COLOR_SAMPLES_NEG,
+                        COLOR_SAMPLES_UNL, COLOR_SAMPLES_REL_NEG)
 
 from importlib.metadata import version as _version, PackageNotFoundError
 
@@ -28,6 +30,7 @@
     "load_dataset",
     "load_scales",
     "load_features",
+    "get_labels",
     "read_fasta",
     "to_fasta",
     "SequencePreprocessor",
@@ -72,6 +75,10 @@
     "comp_detection_metrics",
     "comp_bootstrap_ci",
     "comp_smooth_scores",
+    "COLOR_SAMPLES_POS",
+    "COLOR_SAMPLES_NEG",
+    "COLOR_SAMPLES_UNL",
+    "COLOR_SAMPLES_REL_NEG",
     "options"
 ]
 
diff --git a/aaanalysis/_constants.py b/aaanalysis/_constants.py
index 2c22e606..2ce81416 100644
--- a/aaanalysis/_constants.py
+++ b/aaanalysis/_constants.py
@@ -478,6 +478,15 @@ def _folder_path(super_folder, folder_name):
 COLOR_NEG = "#ad4570"   # (173,69,112)
 COLOR_REL_NEG = "#ad9745" # (173, 151, 69)
 
+# Public, named aliases for the canonical sample-group colors (positive / negative /
+# unlabeled / reliable-negative). They mirror the ``DICT_COLOR["SAMPLES_*"]`` entries
+# exactly, so users can reference a named constant (``aa.COLOR_SAMPLES_POS``) instead
+# of indexing ``plot_get_cdict("DICT_COLOR")`` by string key.
+COLOR_SAMPLES_POS = COLOR_POS
+COLOR_SAMPLES_NEG = COLOR_NEG
+COLOR_SAMPLES_UNL = COLOR_UNL
+COLOR_SAMPLES_REL_NEG = COLOR_REL_NEG
+
 DICT_COLOR = {"SHAP_POS": COLOR_SHAP_POS,
               "SHAP_NEG": COLOR_SHAP_NEG,
               "FEAT_POS": COLOR_FEAT_POS,
diff --git a/aaanalysis/data_handling/__init__.py b/aaanalysis/data_handling/__init__.py
index 1fad9397..7ea8c663 100644
--- a/aaanalysis/data_handling/__init__.py
+++ b/aaanalysis/data_handling/__init__.py
@@ -1,8 +1,8 @@
 """
 Data loading and sequence/embedding preprocessing — the package's data entry point.
 
-Public objects: load_dataset, load_scales, load_features, read_fasta, to_fasta,
-SequencePreprocessor, EmbeddingPreprocessor, combine_dict_nums.
+Public objects: load_dataset, load_scales, load_features, get_labels, read_fasta,
+to_fasta, SequencePreprocessor, EmbeddingPreprocessor, combine_dict_nums.
 Produces the core data objects the rest of the pipeline consumes: ``load_dataset``
 yields ``df_seq``, ``load_scales`` yields ``df_scales`` (fed to
 ``feature_engineering.AAclust`` / ``CPP``), ``load_features`` yields a reference
@@ -17,6 +17,7 @@
 from ._load_dataset import load_dataset
 from ._load_scales import load_scales
 from ._load_features import load_features
+from ._get_labels import get_labels
 from ._read_fasta import read_fasta
 from ._to_fasta import to_fasta
 from ._seq_preproc import SequencePreprocessor
@@ -27,6 +28,7 @@
     "load_dataset",
     "load_scales",
     "load_features",
+    "get_labels",
     "read_fasta",
     "to_fasta",
     "SequencePreprocessor",
diff --git a/aaanalysis/data_handling/_get_labels.py b/aaanalysis/data_handling/_get_labels.py
new file mode 100644
index 00000000..9149deaf
--- /dev/null
+++ b/aaanalysis/data_handling/_get_labels.py
@@ -0,0 +1,70 @@
+"""
+This is a script for the frontend of the get_labels function, deriving a binary
+label vector from a sequence DataFrame's label column.
+"""
+from typing import Any
+import numpy as np
+import pandas as pd
+
+import aaanalysis.utils as ut
+
+
+# I Helper Functions
+def check_match_df_positive_label(df=None, col_label=None, positive_label=None) -> None:
+    """Raise ``ValueError`` if ``positive_label`` does not occur in ``df[col_label]``."""
+    values = set(df[col_label].tolist())  # distinct cell values of the label column
+    if positive_label not in values:
+        raise ValueError(f"'positive_label' ({positive_label}) is not among the values of "
+                         f"column '{col_label}' ({sorted(values, key=str)}).")
+
+
+# II Main Functions
+def get_labels(df: pd.DataFrame,
+               positive_label: Any = 1,
+               col_label: str = "label",
+               ) -> np.ndarray:
+    """
+    Derive a binary ``int`` label vector from a column of a sequence DataFrame.
+
+    Maps the value flagged as positive (``positive_label``) onto ``1`` and every other
+    value onto ``0``, the binary encoding consumed across the package (e.g. by
+    :meth:`CPP.run`, :class:`TreeModel`, and the ``labels`` argument of most tools).
+    This is the single-call form of the recurring ``(df[col] == x).astype(int).to_numpy()``
+    expression.
+
+    .. versionadded:: 1.1.0
+
+    Parameters
+    ----------
+    df : pd.DataFrame, shape (n_samples, n_seq_info)
+        Sequence DataFrame (``df_seq``) containing the label column ``col_label``.
+    positive_label : int or str, default=1
+        Value in ``col_label`` marking the positive class. All rows equal to it become
+        ``1``; all remaining rows become ``0``. A ``ValueError`` is raised if absent.
+    col_label : str, default='label'
+        Name of the column holding the (multi-value or already binary) labels.
+
+    Returns
+    -------
+    labels : np.ndarray, shape (n_samples,)
+        Binary ``int`` label vector (``1`` = positive, ``0`` = otherwise), row-aligned
+        to ``df``.
+
+    Notes
+    -----
+    * The result equals ``(df[col_label] == positive_label).astype(int).to_numpy()``.
+    * Pass the resulting vector directly as the ``labels`` argument of CPP, TreeModel,
+      or other tools. For Positive-Unlabeled mining keep the package ``1`` (positive) /
+      ``2`` (unlabeled) markers instead and use :meth:`dPULearn.mine_negatives`.
+
+    Examples
+    --------
+    .. include:: examples/get_labels.rst
+    """
+    # Check input ('col_label' is validated first, since it is then used as a column name)
+    ut.check_str(name="col_label", val=col_label, accept_none=False)
+    ut.check_df(name="df", df=df, cols_required=col_label)
+    check_match_df_positive_label(df=df, col_label=col_label, positive_label=positive_label)
+    # Derive binary int label vector
+    labels = (df[col_label] == positive_label).astype(int).to_numpy()
+    return labels
diff --git a/aaanalysis/pu_learning/_dpulearn.py b/aaanalysis/pu_learning/_dpulearn.py
index 23b5f729..9931dbdc 100644
--- a/aaanalysis/pu_learning/_dpulearn.py
+++ b/aaanalysis/pu_learning/_dpulearn.py
@@ -133,6 +133,15 @@ def check_match_X_X_neg(X=None, X_neg=None) -> None:
         raise ValueError(f"'n_features' does not match between 'X' (n={n_features}) and 'X_neg' (n={n_features_neg})")
 
 
+def check_match_X_pos_X_unlabelled(X_pos=None, X_unlabelled=None) -> None:
+    """Raise ``ValueError`` if ``X_pos`` and ``X_unlabelled`` differ in their column count."""
+    n_pos_cols, n_unl_cols = X_pos.shape[1], X_unlabelled.shape[1]
+    if n_pos_cols == n_unl_cols:
+        return
+    raise ValueError(f"'n_features' does not match between 'X_pos' (n={n_pos_cols}) and "
+                     f"'X_unlabelled' (n={n_unl_cols})")
+
+
 # II Main Functions
 class dPULearn(Wrapper):
     """
@@ -358,6 +367,88 @@ def fit(self,
         self.df_pu_ = df_pu
         return self
 
+    def mine_negatives(self,
+                       X_pos: ut.ArrayLike2D,
+                       X_unlabelled: ut.ArrayLike2D,
+                       n_neg: Optional[int] = None,
+                       n_unl_to_neg: Optional[int] = None,
+                       metric: Optional[Literal["euclidean", "manhattan", "cosine"]] = None,
+                       n_components: Union[float, int] = 0.80,
+                       ) -> np.ndarray:
+        """
+        Identify reliable negatives in an unlabeled pool from separately passed positives.
+
+        Shortcut for the usual positive/unlabeled workflow built on :meth:`dPULearn.fit`.
+        The manual path stacks ``X_pos`` on top of ``X_unlabelled``, builds a ``1``
+        (positive) / ``2`` (unlabeled) label vector, fits, and slices the mined rows back
+        out by index. Here the two feature matrices are passed separately and the result
+        is a **boolean mask over the rows of** ``X_unlabelled`` marking the identified
+        reliable negatives, i.e. ``labels_[len(X_pos):] == 0`` of the manually stacked
+        fit.
+
+        The call fits the instance as a side effect: :attr:`dPULearn.labels_` (rows of
+        ``X_pos`` first, then those of ``X_unlabelled``) and :attr:`dPULearn.df_pu_` are
+        set, so the :class:`dPULearnPlot` methods work as usual.
+
+        .. versionadded:: 1.1.0
+
+        Parameters
+        ----------
+        X_pos : array-like, shape (n_pos, n_features)
+            Feature matrix of the positive samples.
+        X_unlabelled : array-like, shape (n_unl, n_features)
+            Feature matrix of the unlabeled samples (the candidate pool). Must have the
+            same number of features as ``X_pos``.
+        n_neg : int, optional
+            Total number of reliable negatives to identify from the unlabeled pool.
+            Provide **exactly one** of ``n_neg`` or ``n_unl_to_neg`` (with no pre-labeled
+            negatives the two are equivalent).
+        n_unl_to_neg : int, optional
+            Number of reliable negatives to identify directly from the unlabeled pool.
+            Provide **exactly one** of ``n_neg`` or ``n_unl_to_neg``.
+        metric : str or None, optional
+            Distance metric for distance-based identification (``euclidean``,
+            ``manhattan``, ``cosine``). If ``None``, PCA-based identification is performed.
+        n_components : int or float, default=0.80
+            Number of principal components (int >= 1) or fraction of variance covered
+            (float in ``(0.0, 1.0)``) when PCA is applied.
+
+        Returns
+        -------
+        mask_neg : np.ndarray, shape (n_unl,)
+            Boolean mask aligned with the rows of ``X_unlabelled``; ``True`` flags a row
+            identified as reliable negative, so ``X_unlabelled[mask_neg]`` selects them.
+
+        Notes
+        -----
+        * This is purely additive sugar: it stacks the inputs and calls
+          :meth:`dPULearn.fit` with ``label_pos=1`` / ``label_unl=2`` internally, so the
+          identification result is identical to the manual path.
+
+        See Also
+        --------
+        * :meth:`dPULearn.fit`: the underlying fit on a stacked matrix and label vector.
+        * :func:`get_labels`: derive a binary label vector from a sequence DataFrame.
+
+        Examples
+        --------
+        .. include:: examples/dpul_mine_negatives.rst
+        """
+        # Validate each matrix on its own, then their shared feature dimension
+        X_pos = ut.check_X(X=X_pos, X_name="X_pos")
+        X_unlabelled = ut.check_X(X=X_unlabelled, X_name="X_unlabelled")
+        check_match_X_pos_X_unlabelled(X_pos=X_pos, X_unlabelled=X_unlabelled)
+        # Positives occupy the first n_pos rows of the stacked matrix, the pool the rest
+        n_pos, n_unl = X_pos.shape[0], X_unlabelled.shape[0]
+        labels_pu = np.array([1] * n_pos + [2] * n_unl)
+        args_fit = dict(n_neg=n_neg, n_unl_to_neg=n_unl_to_neg,
+                        metric=metric, n_components=n_components)
+        self.fit(X=np.vstack([X_pos, X_unlabelled]), labels=labels_pu,
+                 label_pos=1, label_unl=2, **args_fit)
+        # Rows relabeled to 0 within the unlabeled block are the mined reliable negatives
+        labels_unl = np.asarray(self.labels_)[n_pos:]
+        return labels_unl == 0
+
     @staticmethod
     def eval(X: ut.ArrayLike2D,
              list_labels: ut.ArrayLike2D,
diff --git a/docs/_cheatsheet/content.py b/docs/_cheatsheet/content.py
index 801adc7f..da244c72 100644
--- a/docs/_cheatsheet/content.py
+++ b/docs/_cheatsheet/content.py
@@ -188,6 +188,7 @@
          ("Load benchmark sequences", "load_dataset(name) → df_seq", None),
          ("Load AAontology scales", "load_scales() → df_scales", None),
          ("Load precomputed features", "load_features(name) → df_feat", None),
+         ("Binary labels from df column", "get_labels(df, positive_label) → labels", None, "v1.1"),
          ("Read / write FASTA", "read_fasta(file) → df_seq", None),
          ("Cluster redundant homologs", "filter_seq(df_seq) → df_clust  [pro]", None),
      ]},
@@ -221,6 +222,7 @@
     {"name": "Modeling & Explainability", "tag": "PU · classify · SHAP",
      "rows": [
          ("Train with positives + unlabeled data", "dPULearn().fit(X, labels)  [Wrapper]", None),
+         ("Mine reliable negatives (mask)", "dPULearn().mine_negatives(X_pos, X_unlabelled) → mask", None, "v1.1"),
          ("Train + RFE + MC importance", "TreeModel().fit(X, labels)  [Wrapper]", None),
          ("Per-feature / sample SHAP impact", "ShapModel().fit(X, labels)  [pro]", None),
      ]},
diff --git a/docs/source/index/release_notes.rst b/docs/source/index/release_notes.rst
index bc2e9618..5ddf838c 100644
--- a/docs/source/index/release_notes.rst
+++ b/docs/source/index/release_notes.rst
@@ -37,6 +37,9 @@ Added
   ``build_cat``, ``to_df_seq``).
 - **combine_dict_nums**: Concatenates per-residue tensors (embedding / structure /
   annotation) along the feature axis into one combined ``CPP.run_num`` input.
+- **get_labels**: Derives a binary ``int`` label vector from a sequence DataFrame's
+  label column (``positive_label`` mapped to ``1``, everything else to ``0``) — the
+  single-call form of the recurring ``(df[col] == x).astype(int).to_numpy()`` expression.
 
 **Feature Engineering**
 
@@ -132,6 +135,16 @@ Added
   switches the pre-computed prediction per P1 (feature map + structure restyle) with no kernel,
   keeping the column-residue linking (warned past 40 sites, hard-capped at 200).
 
+**PU Learning**
+
+- **dPULearn.mine_negatives**: One-call convenience over ``dPULearn.fit`` for the common
+  positive / unlabeled setup. Pass ``X_pos`` and ``X_unlabelled`` separately instead of
+  stacking them by hand, building a ``1`` / ``2`` label vector, fitting, and slicing the
+  mined rows back out; it returns the **boolean mask over the rows of** ``X_unlabelled``
+  flagging the identified reliable negatives (equal to the manual ``labels_[len(X_pos):]
+  == 0`` result exactly). The instance is left fitted (``labels_`` / ``df_pu_`` set, so
+  ``dPULearnPlot`` works), and the existing ``fit(X, labels=...)`` path is unchanged.
+
 **Sequence Analysis**
 
 - **AAWindowSampler**: Samples fixed-length sequence windows for PU-learning and
@@ -191,6 +204,10 @@ Added
 
 - **plot_rank**: Standalone per-protein max-score-vs-rank scatter with group coloring and
   optional threshold lines (pairs with the new ``aa.metrics`` functions).
+- **COLOR_SAMPLES_POS / COLOR_SAMPLES_NEG / COLOR_SAMPLES_UNL / COLOR_SAMPLES_REL_NEG**:
+  Public, named constants for the canonical sample-group colors (positive / negative /
+  unlabeled / reliable-negative). They equal the ``plot_get_cdict("DICT_COLOR")["SAMPLES_*"]``
+  values exactly, so a named constant replaces indexing the color dict by string key.
 
 **Golden Pipelines**
 
diff --git a/examples/data_handling/get_labels.ipynb b/examples/data_handling/get_labels.ipynb
new file mode 100644
index 00000000..ef84686a
--- /dev/null
+++ b/examples/data_handling/get_labels.ipynb
@@ -0,0 +1,297 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "30e77e51",
+   "metadata": {},
+   "source": [
+    "The ``get_labels`` function derives a binary ``int`` label vector from a column of a sequence DataFrame (``df_seq``). It is the single-call form of the recurring ``(df[col] == positive_label).astype(int).to_numpy()`` expression: the value flagged as positive becomes ``1`` and every other value becomes ``0``."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "14210edc",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-06-30T23:29:34.142992Z",
+     "iopub.status.busy": "2026-06-30T23:29:34.142637Z",
+     "iopub.status.idle": "2026-06-30T23:29:35.599732Z",
+     "shell.execute_reply": "2026-06-30T23:29:35.599479Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "DataFrame shape: (10, 8)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<style type=\"text/css\">\n",
+       "#T_5a879 thead th {\n",
+       "  background-color: white;\n",
+       "  color: black;\n",
+       "}\n",
+       "#T_5a879 tbody tr:nth-child(odd) {\n",
+       "  background-color: #f2f2f2;\n",
+       "}\n",
+       "#T_5a879 tbody tr:nth-child(even) {\n",
+       "  background-color: white;\n",
+       "}\n",
+       "#T_5a879 th {\n",
+       "  padding: 5px;\n",
+       "  white-space: nowrap;\n",
+       "}\n",
+       "#T_5a879  td {\n",
+       "  padding: 5px;\n",
+       "  white-space: nowrap;\n",
+       "}\n",
+       "</style>\n",
+       "<table id=\"T_5a879\" style='display:block; max-height: 300px; max-width: 100%; overflow-x: auto; overflow-y: auto;'>\n",
+       "  <thead>\n",
+       "    <tr>\n",
+       "      <th class=\"blank level0\" >&nbsp;</th>\n",
+       "      <th id=\"T_5a879_level0_col0\" class=\"col_heading level0 col0\" >entry</th>\n",
+       "      <th id=\"T_5a879_level0_col1\" class=\"col_heading level0 col1\" >sequence</th>\n",
+       "      <th id=\"T_5a879_level0_col2\" class=\"col_heading level0 col2\" >label</th>\n",
+       "      <th id=\"T_5a879_level0_col3\" class=\"col_heading level0 col3\" >tmd_start</th>\n",
+       "      <th id=\"T_5a879_level0_col4\" class=\"col_heading level0 col4\" >tmd_stop</th>\n",
+       "      <th id=\"T_5a879_level0_col5\" class=\"col_heading level0 col5\" >jmd_n</th>\n",
+       "      <th id=\"T_5a879_level0_col6\" class=\"col_heading level0 col6\" >tmd</th>\n",
+       "      <th id=\"T_5a879_level0_col7\" class=\"col_heading level0 col7\" >jmd_c</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th id=\"T_5a879_level0_row0\" class=\"row_heading level0 row0\" >1</th>\n",
+       "      <td id=\"T_5a879_row0_col0\" class=\"data row0 col0\" >P05067</td>\n",
+       "      <td id=\"T_5a879_row0_col1\" class=\"data row0 col1\" >MLPGLALLLLAAWTA...GYENPTYKFFEQMQN</td>\n",
+       "      <td id=\"T_5a879_row0_col2\" class=\"data row0 col2\" >1</td>\n",
+       "      <td id=\"T_5a879_row0_col3\" class=\"data row0 col3\" >701</td>\n",
+       "      <td id=\"T_5a879_row0_col4\" class=\"data row0 col4\" >723</td>\n",
+       "      <td id=\"T_5a879_row0_col5\" class=\"data row0 col5\" >FAEDVGSNKG</td>\n",
+       "      <td id=\"T_5a879_row0_col6\" class=\"data row0 col6\" >AIIGLMVGGVVIATVIVITLVML</td>\n",
+       "      <td id=\"T_5a879_row0_col7\" class=\"data row0 col7\" >KKKQYTSIHH</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_5a879_level0_row1\" class=\"row_heading level0 row1\" >2</th>\n",
+       "      <td id=\"T_5a879_row1_col0\" class=\"data row1 col0\" >P14925</td>\n",
+       "      <td id=\"T_5a879_row1_col1\" class=\"data row1 col1\" >MAGRARSGLLLLLLG...EEEYSAPLPKPAPSS</td>\n",
+       "      <td id=\"T_5a879_row1_col2\" class=\"data row1 col2\" >1</td>\n",
+       "      <td id=\"T_5a879_row1_col3\" class=\"data row1 col3\" >868</td>\n",
+       "      <td id=\"T_5a879_row1_col4\" class=\"data row1 col4\" >890</td>\n",
+       "      <td id=\"T_5a879_row1_col5\" class=\"data row1 col5\" >KLSTEPGSGV</td>\n",
+       "      <td id=\"T_5a879_row1_col6\" class=\"data row1 col6\" >SVVLITTLLVIPVLVLLAIVMFI</td>\n",
+       "      <td id=\"T_5a879_row1_col7\" class=\"data row1 col7\" >RWKKSRAFGD</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_5a879_level0_row2\" class=\"row_heading level0 row2\" >3</th>\n",
+       "      <td id=\"T_5a879_row2_col0\" class=\"data row2 col0\" >P70180</td>\n",
+       "      <td id=\"T_5a879_row2_col1\" class=\"data row2 col1\" >MRSLLLFTFSACVLL...RELREDSIRSHFSVA</td>\n",
+       "      <td id=\"T_5a879_row2_col2\" class=\"data row2 col2\" >1</td>\n",
+       "      <td id=\"T_5a879_row2_col3\" class=\"data row2 col3\" >477</td>\n",
+       "      <td id=\"T_5a879_row2_col4\" class=\"data row2 col4\" >499</td>\n",
+       "      <td id=\"T_5a879_row2_col5\" class=\"data row2 col5\" >PCKSSGGLEE</td>\n",
+       "      <td id=\"T_5a879_row2_col6\" class=\"data row2 col6\" >SAVTGIVVGALLGAGLLMAFYFF</td>\n",
+       "      <td id=\"T_5a879_row2_col7\" class=\"data row2 col7\" >RKKYRITIER</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_5a879_level0_row3\" class=\"row_heading level0 row3\" >4</th>\n",
+       "      <td id=\"T_5a879_row3_col0\" class=\"data row3 col0\" >Q03157</td>\n",
+       "      <td id=\"T_5a879_row3_col1\" class=\"data row3 col1\" >MGPTSPAARGQGRRW...HGYENPTYRFLEERP</td>\n",
+       "      <td id=\"T_5a879_row3_col2\" class=\"data row3 col2\" >1</td>\n",
+       "      <td id=\"T_5a879_row3_col3\" class=\"data row3 col3\" >585</td>\n",
+       "      <td id=\"T_5a879_row3_col4\" class=\"data row3 col4\" >607</td>\n",
+       "      <td id=\"T_5a879_row3_col5\" class=\"data row3 col5\" >APSGTGVSRE</td>\n",
+       "      <td id=\"T_5a879_row3_col6\" class=\"data row3 col6\" >ALSGLLIMGAGGGSLIVLSLLLL</td>\n",
+       "      <td id=\"T_5a879_row3_col7\" class=\"data row3 col7\" >RKKKPYGTIS</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_5a879_level0_row4\" class=\"row_heading level0 row4\" >5</th>\n",
+       "      <td id=\"T_5a879_row4_col0\" class=\"data row4 col0\" >Q06481</td>\n",
+       "      <td id=\"T_5a879_row4_col1\" class=\"data row4 col1\" >MAATGTAAAAATGRL...GYENPTYKYLEQMQI</td>\n",
+       "      <td id=\"T_5a879_row4_col2\" class=\"data row4 col2\" >1</td>\n",
+       "      <td id=\"T_5a879_row4_col3\" class=\"data row4 col3\" >694</td>\n",
+       "      <td id=\"T_5a879_row4_col4\" class=\"data row4 col4\" >716</td>\n",
+       "      <td id=\"T_5a879_row4_col5\" class=\"data row4 col5\" >LREDFSLSSS</td>\n",
+       "      <td id=\"T_5a879_row4_col6\" class=\"data row4 col6\" >ALIGLLVIAVAIATVIVISLVML</td>\n",
+       "      <td id=\"T_5a879_row4_col7\" class=\"data row4 col7\" >RKRQYGTISH</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_5a879_level0_row5\" class=\"row_heading level0 row5\" >6</th>\n",
+       "      <td id=\"T_5a879_row5_col0\" class=\"data row5 col0\" >P12821</td>\n",
+       "      <td id=\"T_5a879_row5_col1\" class=\"data row5 col1\" >MGAASGRRGPGLLLP...SHGPQFGSEVELRHS</td>\n",
+       "      <td id=\"T_5a879_row5_col2\" class=\"data row5 col2\" >2</td>\n",
+       "      <td id=\"T_5a879_row5_col3\" class=\"data row5 col3\" >1257</td>\n",
+       "      <td id=\"T_5a879_row5_col4\" class=\"data row5 col4\" >1276</td>\n",
+       "      <td id=\"T_5a879_row5_col5\" class=\"data row5 col5\" >GLDLDAQQAR</td>\n",
+       "      <td id=\"T_5a879_row5_col6\" class=\"data row5 col6\" >VGQWLLLFLGIALLVATLGL</td>\n",
+       "      <td id=\"T_5a879_row5_col7\" class=\"data row5 col7\" >SQRLFSIRHR</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_5a879_level0_row6\" class=\"row_heading level0 row6\" >7</th>\n",
+       "      <td id=\"T_5a879_row6_col0\" class=\"data row6 col0\" >P36896</td>\n",
+       "      <td id=\"T_5a879_row6_col1\" class=\"data row6 col1\" >MAESAGASSFFPLVV...KKTLSQLSVQEDVKI</td>\n",
+       "      <td id=\"T_5a879_row6_col2\" class=\"data row6 col2\" >2</td>\n",
+       "      <td id=\"T_5a879_row6_col3\" class=\"data row6 col3\" >127</td>\n",
+       "      <td id=\"T_5a879_row6_col4\" class=\"data row6 col4\" >149</td>\n",
+       "      <td id=\"T_5a879_row6_col5\" class=\"data row6 col5\" >EHPSMWGPVE</td>\n",
+       "      <td id=\"T_5a879_row6_col6\" class=\"data row6 col6\" >LVGIIAGPVFLLFLIIIIVFLVI</td>\n",
+       "      <td id=\"T_5a879_row6_col7\" class=\"data row6 col7\" >NYHQRVYHNR</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_5a879_level0_row7\" class=\"row_heading level0 row7\" >8</th>\n",
+       "      <td id=\"T_5a879_row7_col0\" class=\"data row7 col0\" >Q8NER5</td>\n",
+       "      <td id=\"T_5a879_row7_col1\" class=\"data row7 col1\" >MTRALCSALRQALLL...KKTISQLCVKEDCKA</td>\n",
+       "      <td id=\"T_5a879_row7_col2\" class=\"data row7 col2\" >2</td>\n",
+       "      <td id=\"T_5a879_row7_col3\" class=\"data row7 col3\" >114</td>\n",
+       "      <td id=\"T_5a879_row7_col4\" class=\"data row7 col4\" >136</td>\n",
+       "      <td id=\"T_5a879_row7_col5\" class=\"data row7 col5\" >PNAPKLGPME</td>\n",
+       "      <td id=\"T_5a879_row7_col6\" class=\"data row7 col6\" >LAIIITVPVCLLSIAAMLTVWAC</td>\n",
+       "      <td id=\"T_5a879_row7_col7\" class=\"data row7 col7\" >QGRQCSYRKK</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_5a879_level0_row8\" class=\"row_heading level0 row8\" >9</th>\n",
+       "      <td id=\"T_5a879_row8_col0\" class=\"data row8 col0\" >P37023</td>\n",
+       "      <td id=\"T_5a879_row8_col1\" class=\"data row8 col1\" >MTLGSPRKGLLMLLM...LQKISNSPEKPKVIQ</td>\n",
+       "      <td id=\"T_5a879_row8_col2\" class=\"data row8 col2\" >2</td>\n",
+       "      <td id=\"T_5a879_row8_col3\" class=\"data row8 col3\" >119</td>\n",
+       "      <td id=\"T_5a879_row8_col4\" class=\"data row8 col4\" >141</td>\n",
+       "      <td id=\"T_5a879_row8_col5\" class=\"data row8 col5\" >PSEQPGTDGQ</td>\n",
+       "      <td id=\"T_5a879_row8_col6\" class=\"data row8 col6\" >LALILGPVLALLALVALGVLGLW</td>\n",
+       "      <td id=\"T_5a879_row8_col7\" class=\"data row8 col7\" >HVRRRQEKQR</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_5a879_level0_row9\" class=\"row_heading level0 row9\" >10</th>\n",
+       "      <td id=\"T_5a879_row9_col0\" class=\"data row9 col0\" >O43184</td>\n",
+       "      <td id=\"T_5a879_row9_col1\" class=\"data row9 col1\" >MAARPLPVSPARALL...YPHQVPRSTHTAYIK</td>\n",
+       "      <td id=\"T_5a879_row9_col2\" class=\"data row9 col2\" >2</td>\n",
+       "      <td id=\"T_5a879_row9_col3\" class=\"data row9 col3\" >707</td>\n",
+       "      <td id=\"T_5a879_row9_col4\" class=\"data row9 col4\" >729</td>\n",
+       "      <td id=\"T_5a879_row9_col5\" class=\"data row9 col5\" >DSGPIRQADN</td>\n",
+       "      <td id=\"T_5a879_row9_col6\" class=\"data row9 col6\" >QGLTIGILVTILCLLAAGFVVYL</td>\n",
+       "      <td id=\"T_5a879_row9_col7\" class=\"data row9 col7\" >KRKTLIRLLF</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "import aaanalysis as aa\n",
+    "aa.options[\"verbose\"] = False\n",
+    "\n",
+    "# A Positive-Unlabeled (PU) dataset: substrates (1) and unlabeled others (2).\n",
+    "df_seq = aa.load_dataset(name=\"DOM_GSEC_PU\", n=5)\n",
+    "aa.display_df(df=df_seq, n_rows=10, show_shape=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e84a0e34",
+   "metadata": {},
+   "source": [
+    "By default ``positive_label=1``: substrates map to ``1`` and everything else to ``0``."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "a5ccd25b",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-06-30T23:29:35.600825Z",
+     "iopub.status.busy": "2026-06-30T23:29:35.600758Z",
+     "iopub.status.idle": "2026-06-30T23:29:35.602749Z",
+     "shell.execute_reply": "2026-06-30T23:29:35.602535Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[1 1 1 1 1 0 0 0 0 0]\n"
+     ]
+    }
+   ],
+   "source": [
+    "labels = aa.get_labels(df=df_seq, positive_label=1)\n",
+    "print(labels)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5db29b65",
+   "metadata": {},
+   "source": [
+    "Pick any value as the positive class via ``positive_label`` (e.g. treat the unlabeled ``2`` as positive), and select a different column with ``col_label``. The result equals the manual ``(df[col_label] == positive_label).astype(int).to_numpy()`` expression."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "d9a49747",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-06-30T23:29:35.603686Z",
+     "iopub.status.busy": "2026-06-30T23:29:35.603626Z",
+     "iopub.status.idle": "2026-06-30T23:29:35.605515Z",
+     "shell.execute_reply": "2026-06-30T23:29:35.605342Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[0 0 0 0 0 1 1 1 1 1]\n"
+     ]
+    }
+   ],
+   "source": [
+    "labels_unl = aa.get_labels(df=df_seq, positive_label=2, col_label=\"label\")\n",
+    "print(labels_unl)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "82fd2bd1",
+   "metadata": {},
+   "source": [
+    "Pass the resulting vector straight into the ``labels`` argument of tools such as :meth:`CPP.run` or :class:`TreeModel`."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/pu_learning/dpul_mine_negatives.ipynb b/examples/pu_learning/dpul_mine_negatives.ipynb
new file mode 100644
index 00000000..cd9fc03a
--- /dev/null
+++ b/examples/pu_learning/dpul_mine_negatives.ipynb
@@ -0,0 +1,314 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "01ccf3ee",
+   "metadata": {},
+   "source": [
+    "The ``dPULearn().mine_negatives()`` method mines reliable negatives from an unlabeled pool in one call. Instead of stacking the positive and unlabeled feature matrices by hand, building a ``1`` (positive) / ``2`` (unlabeled) label vector, fitting, and slicing the mined rows back out by index, you pass the two matrices separately and receive a boolean mask over the rows of ``X_unlabelled``."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "710fdc35",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-06-30T23:30:36.652099Z",
+     "iopub.status.busy": "2026-06-30T23:30:36.652032Z",
+     "iopub.status.idle": "2026-06-30T23:30:38.227970Z",
+     "shell.execute_reply": "2026-06-30T23:30:38.227738Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "positives: 63 | unlabeled: 631\n"
+     ]
+    }
+   ],
+   "source": [
+    "import aaanalysis as aa\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "aa.options[\"verbose\"] = False\n",
+    "\n",
+    "# Build a CPP feature matrix for the gamma-secretase PU dataset (substrates vs unlabeled others).\n",
+    "df_seq = aa.load_dataset(name=\"DOM_GSEC_PU\")\n",
+    "df_feat = aa.load_features(name=\"DOM_GSEC\")\n",
+    "sf = aa.SequenceFeature()\n",
+    "X = sf.feature_matrix(features=df_feat[\"feature\"], df_parts=sf.get_df_parts(df_seq=df_seq))\n",
+    "labels = df_seq[\"label\"].to_numpy()\n",
+    "\n",
+    "# Split into the positive (1) and unlabeled (2) feature blocks.\n",
+    "X_pos = X[labels == 1]\n",
+    "X_unl = X[labels == 2]\n",
+    "print(f\"positives: {len(X_pos)} | unlabeled: {len(X_unl)}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "38f3d31d",
+   "metadata": {},
+   "source": [
+    "Mine a fixed number of reliable negatives directly from the unlabeled pool with ``n_unl_to_neg``. The returned boolean ``mask_neg`` flags, over the rows of ``X_unlabelled``, which unlabeled samples were identified as reliable negatives."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "bdccaa03",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-06-30T23:30:38.229229Z",
+     "iopub.status.busy": "2026-06-30T23:30:38.229155Z",
+     "iopub.status.idle": "2026-06-30T23:30:38.241855Z",
+     "shell.execute_reply": "2026-06-30T23:30:38.241652Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "mined reliable negatives: 49 of 631 unlabeled\n"
+     ]
+    }
+   ],
+   "source": [
+    "dpul = aa.dPULearn(random_state=42)\n",
+    "mask_neg = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=49)\n",
+    "print(f\"mined reliable negatives: {int(mask_neg.sum())} of {len(X_unl)} unlabeled\")\n",
+    "X_neg = X_unl[mask_neg]  # the mined feature rows"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "315d4798",
+   "metadata": {},
+   "source": [
+    "The mask equals the manual stacking path exactly: stack ``X_pos`` over ``X_unlabelled``, fit with ``1`` / ``2`` labels, and slice ``labels_[len(X_pos):] == 0``."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "c78d6eab",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-06-30T23:30:38.243063Z",
+     "iopub.status.busy": "2026-06-30T23:30:38.242994Z",
+     "iopub.status.idle": "2026-06-30T23:30:38.254654Z",
+     "shell.execute_reply": "2026-06-30T23:30:38.254464Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "mask equals manual path: True\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Stack X_pos over X_unlabelled, fit with 1 / 2 labels, and slice labels_[len(X_pos):] == 0.\n",
+    "labels_manual = np.asarray(\n",
+    "    aa.dPULearn(random_state=42)\n",
+    "    .fit(X=np.vstack([X_pos, X_unl]),\n",
+    "         labels=np.array([1] * len(X_pos) + [2] * len(X_unl)),\n",
+    "         n_unl_to_neg=49)\n",
+    "    .labels_)\n",
+    "mask_manual = labels_manual[len(X_pos):] == 0\n",
+    "print(\"mask equals manual path:\", np.array_equal(mask_neg, mask_manual))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cae2d79d",
+   "metadata": {},
+   "source": [
+    "After mining, the instance is fitted: ``labels_`` (ordered as the stacked positives followed by the unlabeled samples) and ``df_pu_`` are set, so the :class:`dPULearnPlot` methods work as usual. Instead of ``n_unl_to_neg`` you can pass ``n_neg`` (the total number of negatives); because ``mine_negatives`` starts without labeled negatives, both select the same samples. Set a distance ``metric`` (``euclidean`` / ``manhattan`` / ``cosine``) for distance-based identification."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "85eff05c",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-06-30T23:30:38.255594Z",
+     "iopub.status.busy": "2026-06-30T23:30:38.255519Z",
+     "iopub.status.idle": "2026-06-30T23:30:38.282637Z",
+     "shell.execute_reply": "2026-06-30T23:30:38.282413Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2    582\n",
+      "1     63\n",
+      "0     49\n",
+      "Name: count, dtype: int64\n",
+      "DataFrame shape: (49, 15)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<style type=\"text/css\">\n",
+       "#T_f7f56 thead th {\n",
+       "  background-color: white;\n",
+       "  color: black;\n",
+       "}\n",
+       "#T_f7f56 tbody tr:nth-child(odd) {\n",
+       "  background-color: #f2f2f2;\n",
+       "}\n",
+       "#T_f7f56 tbody tr:nth-child(even) {\n",
+       "  background-color: white;\n",
+       "}\n",
+       "#T_f7f56 th {\n",
+       "  padding: 5px;\n",
+       "  white-space: nowrap;\n",
+       "}\n",
+       "#T_f7f56  td {\n",
+       "  padding: 5px;\n",
+       "  white-space: nowrap;\n",
+       "}\n",
+       "</style>\n",
+       "<table id=\"T_f7f56\" style='display:block; max-height: 300px; max-width: 100%; overflow-x: auto; overflow-y: auto;'>\n",
+       "  <thead>\n",
+       "    <tr>\n",
+       "      <th class=\"blank level0\" >&nbsp;</th>\n",
+       "      <th id=\"T_f7f56_level0_col0\" class=\"col_heading level0 col0\" >selection_via</th>\n",
+       "      <th id=\"T_f7f56_level0_col1\" class=\"col_heading level0 col1\" >PC1 (56.2%)</th>\n",
+       "      <th id=\"T_f7f56_level0_col2\" class=\"col_heading level0 col2\" >PC2 (7.4%)</th>\n",
+       "      <th id=\"T_f7f56_level0_col3\" class=\"col_heading level0 col3\" >PC3 (2.9%)</th>\n",
+       "      <th id=\"T_f7f56_level0_col4\" class=\"col_heading level0 col4\" >PC4 (2.8%)</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th id=\"T_f7f56_level0_row0\" class=\"row_heading level0 row0\" >84</th>\n",
+       "      <td id=\"T_f7f56_row0_col0\" class=\"data row0 col0\" >PC1</td>\n",
+       "      <td id=\"T_f7f56_row0_col1\" class=\"data row0 col1\" >0.021000</td>\n",
+       "      <td id=\"T_f7f56_row0_col2\" class=\"data row0 col2\" >-0.047800</td>\n",
+       "      <td id=\"T_f7f56_row0_col3\" class=\"data row0 col3\" >0.075200</td>\n",
+       "      <td id=\"T_f7f56_row0_col4\" class=\"data row0 col4\" >-0.005400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_f7f56_level0_row1\" class=\"row_heading level0 row1\" >95</th>\n",
+       "      <td id=\"T_f7f56_row1_col0\" class=\"data row1 col0\" >PC2</td>\n",
+       "      <td id=\"T_f7f56_row1_col1\" class=\"data row1 col1\" >0.032000</td>\n",
+       "      <td id=\"T_f7f56_row1_col2\" class=\"data row1 col2\" >-0.082100</td>\n",
+       "      <td id=\"T_f7f56_row1_col3\" class=\"data row1 col3\" >0.025800</td>\n",
+       "      <td id=\"T_f7f56_row1_col4\" class=\"data row1 col4\" >-0.037700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_f7f56_level0_row2\" class=\"row_heading level0 row2\" >109</th>\n",
+       "      <td id=\"T_f7f56_row2_col0\" class=\"data row2 col0\" >PC1</td>\n",
+       "      <td id=\"T_f7f56_row2_col1\" class=\"data row2 col1\" >0.026100</td>\n",
+       "      <td id=\"T_f7f56_row2_col2\" class=\"data row2 col2\" >-0.058500</td>\n",
+       "      <td id=\"T_f7f56_row2_col3\" class=\"data row2 col3\" >0.075700</td>\n",
+       "      <td id=\"T_f7f56_row2_col4\" class=\"data row2 col4\" >-0.020900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_f7f56_level0_row3\" class=\"row_heading level0 row3\" >158</th>\n",
+       "      <td id=\"T_f7f56_row3_col0\" class=\"data row3 col0\" >PC1</td>\n",
+       "      <td id=\"T_f7f56_row3_col1\" class=\"data row3 col1\" >0.023500</td>\n",
+       "      <td id=\"T_f7f56_row3_col2\" class=\"data row3 col2\" >-0.060700</td>\n",
+       "      <td id=\"T_f7f56_row3_col3\" class=\"data row3 col3\" >0.054000</td>\n",
+       "      <td id=\"T_f7f56_row3_col4\" class=\"data row3 col4\" >0.000900</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_f7f56_level0_row4\" class=\"row_heading level0 row4\" >161</th>\n",
+       "      <td id=\"T_f7f56_row4_col0\" class=\"data row4 col0\" >PC1</td>\n",
+       "      <td id=\"T_f7f56_row4_col1\" class=\"data row4 col1\" >0.025900</td>\n",
+       "      <td id=\"T_f7f56_row4_col2\" class=\"data row4 col2\" >0.031400</td>\n",
+       "      <td id=\"T_f7f56_row4_col3\" class=\"data row4 col3\" >0.044900</td>\n",
+       "      <td id=\"T_f7f56_row4_col4\" class=\"data row4 col4\" >0.055400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_f7f56_level0_row5\" class=\"row_heading level0 row5\" >170</th>\n",
+       "      <td id=\"T_f7f56_row5_col0\" class=\"data row5 col0\" >PC1</td>\n",
+       "      <td id=\"T_f7f56_row5_col1\" class=\"data row5 col1\" >0.026100</td>\n",
+       "      <td id=\"T_f7f56_row5_col2\" class=\"data row5 col2\" >-0.035300</td>\n",
+       "      <td id=\"T_f7f56_row5_col3\" class=\"data row5 col3\" >0.058300</td>\n",
+       "      <td id=\"T_f7f56_row5_col4\" class=\"data row5 col4\" >0.025800</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_f7f56_level0_row6\" class=\"row_heading level0 row6\" >192</th>\n",
+       "      <td id=\"T_f7f56_row6_col0\" class=\"data row6 col0\" >PC6</td>\n",
+       "      <td id=\"T_f7f56_row6_col1\" class=\"data row6 col1\" >0.040100</td>\n",
+       "      <td id=\"T_f7f56_row6_col2\" class=\"data row6 col2\" >-0.002200</td>\n",
+       "      <td id=\"T_f7f56_row6_col3\" class=\"data row6 col3\" >0.004300</td>\n",
+       "      <td id=\"T_f7f56_row6_col4\" class=\"data row6 col4\" >-0.053600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_f7f56_level0_row7\" class=\"row_heading level0 row7\" >193</th>\n",
+       "      <td id=\"T_f7f56_row7_col0\" class=\"data row7 col0\" >PC1</td>\n",
+       "      <td id=\"T_f7f56_row7_col1\" class=\"data row7 col1\" >0.024700</td>\n",
+       "      <td id=\"T_f7f56_row7_col2\" class=\"data row7 col2\" >-0.056900</td>\n",
+       "      <td id=\"T_f7f56_row7_col3\" class=\"data row7 col3\" >0.051300</td>\n",
+       "      <td id=\"T_f7f56_row7_col4\" class=\"data row7 col4\" >-0.035600</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_f7f56_level0_row8\" class=\"row_heading level0 row8\" >195</th>\n",
+       "      <td id=\"T_f7f56_row8_col0\" class=\"data row8 col0\" >PC5</td>\n",
+       "      <td id=\"T_f7f56_row8_col1\" class=\"data row8 col1\" >0.029900</td>\n",
+       "      <td id=\"T_f7f56_row8_col2\" class=\"data row8 col2\" >0.006500</td>\n",
+       "      <td id=\"T_f7f56_row8_col3\" class=\"data row8 col3\" >0.035800</td>\n",
+       "      <td id=\"T_f7f56_row8_col4\" class=\"data row8 col4\" >0.050200</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th id=\"T_f7f56_level0_row9\" class=\"row_heading level0 row9\" >200</th>\n",
+       "      <td id=\"T_f7f56_row9_col0\" class=\"data row9 col0\" >PC1</td>\n",
+       "      <td id=\"T_f7f56_row9_col1\" class=\"data row9 col1\" >0.021200</td>\n",
+       "      <td id=\"T_f7f56_row9_col2\" class=\"data row9 col2\" >-0.056200</td>\n",
+       "      <td id=\"T_f7f56_row9_col3\" class=\"data row9 col3\" >0.005700</td>\n",
+       "      <td id=\"T_f7f56_row9_col4\" class=\"data row9 col4\" >0.072600</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "print(pd.Series(dpul.labels_).value_counts())\n",
+    "aa.display_df(df=dpul.df_pu_[dpul.df_pu_[\"selection_via\"].str.contains(\"PC\", na=False)],\n",
+    "              n_rows=10, n_cols=5, show_shape=True)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/tests/unit/data_handling_tests/test_get_labels.py b/tests/unit/data_handling_tests/test_get_labels.py
new file mode 100644
index 00000000..2f499e86
--- /dev/null
+++ b/tests/unit/data_handling_tests/test_get_labels.py
@@ -0,0 +1,101 @@
+"""
+This script tests the top-level get_labels() function (issue #308).
+
+get_labels is the single-call form of the recurring
+``(df[col] == positive_label).astype(int).to_numpy()`` expression that appears in 4+ places
+of the gamma-secretase use case. It maps the positive value onto 1 and everything else onto 0.
+"""
+import numpy as np
+import pandas as pd
+import pytest
+
+import aaanalysis as aa
+
+
+# Helper functions
+def _manual(df, positive_label, col="label"):
+    return (df[col] == positive_label).astype(int).to_numpy()
+
+
+# Normal Cases Test Class
+class TestGetLabels:
+    """Test get_labels() for each parameter individually."""
+
+    def test_returns_int_numpy_array(self):
+        df = pd.DataFrame({"entry": ["a", "b", "c"], "label": [1, 2, 1]})
+        labels = aa.get_labels(df=df, positive_label=1)
+        assert isinstance(labels, np.ndarray)
+        assert labels.dtype.kind == "i"
+        assert labels.shape == (3,)
+
+    def test_positive_label_default(self):
+        df = pd.DataFrame({"label": [1, 0, 1, 0]})
+        labels = aa.get_labels(df=df)
+        assert np.array_equal(labels, np.array([1, 0, 1, 0]))
+
+    def test_df_parameter(self):
+        df = pd.DataFrame({"label": [2, 2, 1]})
+        labels = aa.get_labels(df=df, positive_label=1)
+        assert np.array_equal(labels, np.array([0, 0, 1]))
+
+    def test_col_label_parameter(self):
+        df = pd.DataFrame({"y": [1, 2, 1, 2]})
+        labels = aa.get_labels(df=df, positive_label=2, col_label="y")
+        assert np.array_equal(labels, np.array([0, 1, 0, 1]))
+
+
+# Golden equivalence to the manual expression (KPI: >= 2 encodings)
+class TestGetLabelsEquivalence:
+    """Result equals the manual expression on multiple label encodings (KPI #308)."""
+
+    def test_pu_encoding_1_2(self):
+        # PU encoding: 1 = positive, 2 = unlabeled
+        df = pd.DataFrame({"label": [1, 2, 1, 2, 2, 1]})
+        assert np.array_equal(aa.get_labels(df=df, positive_label=1),
+                              _manual(df, 1))
+
+    def test_binary_encoding_0_1(self):
+        # Standard {0, 1} encoding
+        df = pd.DataFrame({"label": [0, 1, 1, 0]})
+        assert np.array_equal(aa.get_labels(df=df, positive_label=1),
+                              _manual(df, 1))
+
+    def test_multiclass_encoding(self):
+        # Multi-class: pick one class as positive
+        df = pd.DataFrame({"label": [0, 1, 2, 0, 1, 2]})
+        for pos in (0, 1, 2):
+            assert np.array_equal(aa.get_labels(df=df, positive_label=pos),
+                                  _manual(df, pos))
+
+    def test_string_labels(self):
+        df = pd.DataFrame({"label": ["sub", "non", "sub", "unl"]})
+        assert np.array_equal(aa.get_labels(df=df, positive_label="sub"),
+                              _manual(df, "sub"))
+
+
+# Negative Cases Test Class
+class TestGetLabelsNegative:
+    """Invalid inputs must raise informative ValueErrors."""
+
+    def test_df_none(self):
+        with pytest.raises(ValueError):
+            aa.get_labels(df=None, positive_label=1)
+
+    def test_df_not_dataframe(self):
+        with pytest.raises(ValueError):
+            aa.get_labels(df=[1, 2, 3], positive_label=1)
+
+    def test_missing_label_column(self):
+        df = pd.DataFrame({"entry": ["a", "b"], "y": [1, 0]})
+        with pytest.raises(ValueError):
+            aa.get_labels(df=df, positive_label=1)
+
+    def test_custom_col_missing(self):
+        df = pd.DataFrame({"label": [1, 0]})
+        with pytest.raises(ValueError):
+            aa.get_labels(df=df, positive_label=1, col_label="missing")
+
+    def test_positive_label_absent(self):
+        df = pd.DataFrame({"label": [1, 2, 1]})
+        with pytest.raises(ValueError):
+            aa.get_labels(df=df, positive_label=9)
diff --git a/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py b/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py
new file mode 100644
index 00000000..1329f585
--- /dev/null
+++ b/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py
@@ -0,0 +1,184 @@
+"""
+This script tests the dPULearn.mine_negatives() convenience method (issue #308).
+
+mine_negatives is additive sugar over dPULearn.fit: it stacks X_pos over X_unlabelled,
+builds a 1 (positive) / 2 (unlabeled) label vector, fits, and returns the boolean mask of
+identified reliable negatives over the rows of X_unlabelled. The key contract is that the
+mask equals the manual ``labels_[len(X_pos):] == 0`` result exactly, and that the existing
+``fit`` path stays byte-identical (no algorithm change).
+"""
+import numpy as np
+import pytest
+
+import aaanalysis as aa
+
+
+# Helper functions
+def _make_data(n_pos=20, n_unl=50, n_features=8, seed=0):
+    rng = np.random.default_rng(seed)
+    X_pos = rng.normal(0.0, 1.0, size=(n_pos, n_features))
+    X_unl = rng.normal(0.6, 1.0, size=(n_unl, n_features))
+    return X_pos, X_unl
+
+
+def _manual_mask(X_pos, X_unl, random_state=42, **fit_kwargs):
+    """Manual stacking path (vstack + 1/2 labels + fit); returns (mask over X_unl, fitted dpul)."""
+    X_pool = np.vstack([X_pos, X_unl])
+    y_pool = np.array([1] * len(X_pos) + [2] * len(X_unl))
+    dpul = aa.dPULearn(random_state=random_state, verbose=False)
+    dpul.fit(X=X_pool, labels=y_pool, **fit_kwargs)
+    return np.asarray(dpul.labels_)[len(X_pos):] == 0, dpul
+
+
+# Normal Cases Test Class
+class TestMineNegatives:
+    """Test dPULearn.mine_negatives() for each parameter individually."""
+
+    def test_returns_boolean_mask_over_unlabelled(self):
+        X_pos, X_unl = _make_data()
+        dpul = aa.dPULearn(random_state=42, verbose=False)
+        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=10)
+        assert isinstance(mask, np.ndarray)
+        assert mask.dtype == bool
+        assert mask.shape == (X_unl.shape[0],)
+        assert mask.sum() == 10
+
+    def test_X_pos_parameter(self):
+        X_pos, X_unl = _make_data(n_pos=30)
+        dpul = aa.dPULearn(random_state=42, verbose=False)
+        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=5)
+        assert mask.shape[0] == X_unl.shape[0]
+
+    def test_X_unlabelled_parameter(self):
+        X_pos, X_unl = _make_data(n_unl=70)
+        dpul = aa.dPULearn(random_state=42, verbose=False)
+        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=12)
+        assert mask.shape[0] == 70
+        assert mask.sum() == 12
+
+    def test_n_unl_to_neg_parameter(self):
+        X_pos, X_unl = _make_data()
+        for n in (1, 5, 25):
+            dpul = aa.dPULearn(random_state=42, verbose=False)
+            mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=n)
+            assert mask.sum() == n
+
+    def test_n_neg_parameter(self):
+        X_pos, X_unl = _make_data()
+        dpul = aa.dPULearn(random_state=42, verbose=False)
+        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_neg=8)
+        assert mask.sum() == 8
+
+    def test_metric_parameter(self):
+        X_pos, X_unl = _make_data()
+        for metric in ("euclidean", "manhattan", "cosine"):
+            dpul = aa.dPULearn(random_state=42, verbose=False)
+            mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl,
+                                       n_unl_to_neg=10, metric=metric)
+            assert mask.sum() == 10
+
+    def test_n_components_parameter(self):
+        X_pos, X_unl = _make_data()
+        for n_components in (2, 3, 0.5):
+            dpul = aa.dPULearn(random_state=42, verbose=False)
+            mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl,
+                                       n_unl_to_neg=10, n_components=n_components)
+            assert mask.sum() == 10
+
+    def test_instance_attributes_set(self):
+        """After mining, labels_ / df_pu_ are set so the plotting class works."""
+        X_pos, X_unl = _make_data()
+        dpul = aa.dPULearn(random_state=42, verbose=False)
+        dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=10)
+        assert dpul.labels_ is not None
+        assert dpul.labels_.shape[0] == X_pos.shape[0] + X_unl.shape[0]
+        assert dpul.df_pu_ is not None
+
+
+# Regression / golden equivalence
+class TestMineNegativesEquivalence:
+    """The mask must equal the manual stacking path exactly (KPI #308)."""
+
+    @pytest.mark.parametrize("seed", [0, 1, 7])
+    def test_mask_equals_manual_pca(self, seed):
+        X_pos, X_unl = _make_data(seed=seed)
+        manual_mask, dpul_m = _manual_mask(X_pos, X_unl, n_unl_to_neg=10)
+        dpul = aa.dPULearn(random_state=42, verbose=False)
+        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=10)
+        assert np.array_equal(mask, manual_mask)
+        assert np.array_equal(np.asarray(dpul.labels_), np.asarray(dpul_m.labels_))
+
+    def test_mask_equals_manual_metric(self):
+        X_pos, X_unl = _make_data(seed=3)
+        manual_mask, _ = _manual_mask(X_pos, X_unl, n_unl_to_neg=8, metric="cosine")
+        dpul = aa.dPULearn(random_state=42, verbose=False)
+        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl,
+                                   n_unl_to_neg=8, metric="cosine")
+        assert np.array_equal(mask, manual_mask)
+
+    def test_n_neg_equivalent_to_n_unl_to_neg_without_pre_neg(self):
+        X_pos, X_unl = _make_data()
+        dpul_a = aa.dPULearn(random_state=42, verbose=False)
+        mask_a = dpul_a.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=10)
+        dpul_b = aa.dPULearn(random_state=42, verbose=False)
+        mask_b = dpul_b.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_neg=10)
+        assert np.array_equal(mask_a, mask_b)
+
+
+# Negative Cases Test Class
+class TestMineNegativesNegative:
+    """Invalid inputs must raise informative ValueErrors."""
+
+    def test_feature_mismatch(self):
+        X_pos, _ = _make_data(n_features=8)
+        _, X_unl = _make_data(n_features=6)
+        dpul = aa.dPULearn(random_state=42, verbose=False)
+        with pytest.raises(ValueError):
+            dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=5)
+
+    def test_both_counts_given(self):
+        X_pos, X_unl = _make_data()
+        dpul = aa.dPULearn(random_state=42, verbose=False)
+        with pytest.raises(ValueError):
+            dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_neg=5, n_unl_to_neg=5)
+
+    def test_neither_count_given(self):
+        X_pos, X_unl = _make_data()
+        dpul = aa.dPULearn(random_state=42, verbose=False)
+        with pytest.raises(ValueError):
+            dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl)
+
+    def test_too_many_negatives_requested(self):
+        X_pos, X_unl = _make_data(n_unl=10)
+        dpul = aa.dPULearn(random_state=42, verbose=False)
+        with pytest.raises(ValueError):
+            dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=999)
+
+    def test_X_pos_none(self):
+        _, X_unl = _make_data()
+        dpul = aa.dPULearn(random_state=42, verbose=False)
+        with pytest.raises(ValueError):
+            dpul.mine_negatives(X_pos=None, X_unlabelled=X_unl, n_unl_to_neg=5)
+
+    def test_X_unlabelled_none(self):
+        X_pos, _ = _make_data()
+        dpul = aa.dPULearn(random_state=42, verbose=False)
+        with pytest.raises(ValueError):
+            dpul.mine_negatives(X_pos=X_pos, X_unlabelled=None, n_unl_to_neg=5)
+
+
+# Existing-fit byte-identical regression
+class TestFitUnchanged:
+    """Pin the label contract of the pre-existing fit(X, labels=...) path (#308 no-change)."""
+
+    def test_fit_pca_unchanged(self):
+        X_pos, X_unl = _make_data(seed=11)
+        X_pool = np.vstack([X_pos, X_unl])
+        y_pool = np.array([1] * len(X_pos) + [2] * len(X_unl))
+        dpul = aa.dPULearn(random_state=42, verbose=False)
+        dpul.fit(X=X_pool, labels=y_pool, n_unl_to_neg=10)
+        labels = np.asarray(dpul.labels_)
+        # contract: positives stay 1, exactly 10 mined negatives become 0, rest stay 2
+        assert (labels[:len(X_pos)] == 1).all()
+        assert (labels == 0).sum() == 10
+        assert set(np.unique(labels)).issubset({0, 1, 2})
diff --git a/tests/unit/plotting_tests/test_color_samples_constants.py b/tests/unit/plotting_tests/test_color_samples_constants.py
new file mode 100644
index 00000000..52826a1b
--- /dev/null
+++ b/tests/unit/plotting_tests/test_color_samples_constants.py
@@ -0,0 +1,36 @@
+"""
+This script tests the named sample-color constants exposed at top level (issue #308).
+
+COLOR_SAMPLES_POS / NEG / UNL / REL_NEG are public, named aliases for the canonical sample
+colors. They must equal today's ``plot_get_cdict("DICT_COLOR")["SAMPLES_*"]`` values exactly,
+so users can reference a named constant instead of indexing the color dict by string key.
+"""
+import pytest
+
+import aaanalysis as aa
+
+
+# Golden equivalence test
+class TestColorSamplesConstants:
+    """Named constants must equal the plot_get_cdict values (golden KPI #308)."""
+
+    def test_constants_exist_at_top_level(self):
+        for name in ("COLOR_SAMPLES_POS", "COLOR_SAMPLES_NEG",
+                     "COLOR_SAMPLES_UNL", "COLOR_SAMPLES_REL_NEG"):
+            assert hasattr(aa, name)
+            assert name in aa.__all__
+
+    @pytest.mark.parametrize("const_name,dict_key", [
+        ("COLOR_SAMPLES_POS", "SAMPLES_POS"),
+        ("COLOR_SAMPLES_NEG", "SAMPLES_NEG"),
+        ("COLOR_SAMPLES_UNL", "SAMPLES_UNL"),
+        ("COLOR_SAMPLES_REL_NEG", "SAMPLES_REL_NEG"),
+    ])
+    def test_constant_equals_cdict_value(self, const_name, dict_key):
+        dict_color = aa.plot_get_cdict(name="DICT_COLOR")
+        assert getattr(aa, const_name) == dict_color[dict_key]
+
+    def test_constants_are_strings(self):
+        for name in ("COLOR_SAMPLES_POS", "COLOR_SAMPLES_NEG",
+                     "COLOR_SAMPLES_UNL", "COLOR_SAMPLES_REL_NEG"):
+            assert isinstance(getattr(aa, name), str)

From 27846b9ea0809f2266579ed014590dffd5103eb8 Mon Sep 17 00:00:00 2001
From: Stephan Breimann <stephanbreimann@gmail.com>
Date: Wed, 1 Jul 2026 04:50:56 +0200
Subject: [PATCH 2/6] refactor: validate col_label before using it as a
 required-column key in get_labels

Reorder the get_labels Validate block so check_str(col_label) runs before
check_df(cols_required=col_label). A non-str col_label now surfaces a clear
'col_label' error instead of an internal 'cols_required' one. No behaviour
change on valid input.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 aaanalysis/data_handling/_get_labels.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aaanalysis/data_handling/_get_labels.py b/aaanalysis/data_handling/_get_labels.py
index 9149deaf..f23d8423 100644
--- a/aaanalysis/data_handling/_get_labels.py
+++ b/aaanalysis/data_handling/_get_labels.py
@@ -62,8 +62,8 @@ def get_labels(df: pd.DataFrame,
     .. include:: examples/get_labels.rst
     """
     # Check input
-    ut.check_df(name="df", df=df, cols_required=col_label)
     ut.check_str(name="col_label", val=col_label, accept_none=False)
+    ut.check_df(name="df", df=df, cols_required=col_label)
     check_match_df_positive_label(df=df, col_label=col_label, positive_label=positive_label)
     # Derive binary int label vector
     labels = (df[col_label] == positive_label).astype(int).to_numpy()

From 233dc6497b565de8be6cffb42abec8928d8e1f0f Mon Sep 17 00:00:00 2001
From: Stephan Breimann <stephanbreimann@gmail.com>
Date: Wed, 1 Jul 2026 05:21:06 +0200
Subject: [PATCH 3/6] fix(dpulearn): match manual path for small
 positive/unlabelled sets

mine_negatives validated X_pos and X_unlabelled separately with the default
check_X min_n_samples=3, so it rejected n_pos<3 inputs that the manual stacking
path accepts (the >=3 floor belongs to the stacked matrix, which fit enforces).
Relax the per-matrix check to min_n_samples=1 to restore exact equivalence; add
tests for the small-positive-set equivalence and get_labels single-class/NaN
mapping.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 aaanalysis/pu_learning/_dpulearn.py                 |  8 +++++---
 tests/unit/data_handling_tests/test_get_labels.py   | 13 +++++++++++++
 .../dpulearn_tests/test_dpulearn_mine_negatives.py  |  9 +++++++++
 3 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/aaanalysis/pu_learning/_dpulearn.py b/aaanalysis/pu_learning/_dpulearn.py
index 9931dbdc..6e141995 100644
--- a/aaanalysis/pu_learning/_dpulearn.py
+++ b/aaanalysis/pu_learning/_dpulearn.py
@@ -434,9 +434,11 @@ def mine_negatives(self,
         --------
         .. include:: examples/dpul_mine_negatives.rst
         """
-        # Check input
-        X_pos = ut.check_X(X=X_pos, X_name="X_pos")
-        X_unlabelled = ut.check_X(X=X_unlabelled, X_name="X_unlabelled")
+        # Check input (the >=3 sample floor applies to the stacked matrix, enforced by
+        # 'fit' below, so per-matrix validation only coerces + checks the feature dimension;
+        # this keeps mine_negatives accepting exactly what the manual stacking path accepts)
+        X_pos = ut.check_X(X=X_pos, X_name="X_pos", min_n_samples=1)
+        X_unlabelled = ut.check_X(X=X_unlabelled, X_name="X_unlabelled", min_n_samples=1)
         check_match_X_pos_X_unlabelled(X_pos=X_pos, X_unlabelled=X_unlabelled)
         # Stack positives over the unlabeled pool and fit with the package PU markers
         n_pos = X_pos.shape[0]
diff --git a/tests/unit/data_handling_tests/test_get_labels.py b/tests/unit/data_handling_tests/test_get_labels.py
index 2f499e86..0d63e291 100644
--- a/tests/unit/data_handling_tests/test_get_labels.py
+++ b/tests/unit/data_handling_tests/test_get_labels.py
@@ -72,6 +72,19 @@ def test_string_labels(self):
         assert np.array_equal(aa.get_labels(df=df, positive_label="sub"),
                               _manual(df, "sub"))
 
+    def test_single_class_column_maps_all_ones(self):
+        # Pure mapping: unlike dPULearn.fit, get_labels does not require >1 distinct value,
+        # so an all-positive column maps to all ones rather than raising.
+        df = pd.DataFrame({"label": [1, 1, 1]})
+        assert np.array_equal(aa.get_labels(df=df, positive_label=1),
+                              np.array([1, 1, 1]))
+
+    def test_nan_maps_to_zero(self):
+        # NaN never equals positive_label, so it becomes 0.
+        df = pd.DataFrame({"label": [1.0, np.nan, 1.0]})
+        assert np.array_equal(aa.get_labels(df=df, positive_label=1.0),
+                              np.array([1, 0, 1]))
+
 
 # Negative Cases Test Class
 class TestGetLabelsNegative:
diff --git a/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py b/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py
index 1329f585..19be06a9 100644
--- a/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py
+++ b/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py
@@ -116,6 +116,15 @@ def test_mask_equals_manual_metric(self):
                                    n_unl_to_neg=8, metric="cosine")
         assert np.array_equal(mask, manual_mask)
 
+    def test_mask_equals_manual_few_positives(self):
+        # n_pos < 3: the manual stacked path accepts it (the >=3 floor applies to the
+        # stacked matrix), so mine_negatives must match it, not reject the small pos set.
+        X_pos, X_unl = _make_data(n_pos=1, seed=5)
+        manual_mask, _ = _manual_mask(X_pos, X_unl, n_unl_to_neg=6)
+        dpul = aa.dPULearn(random_state=42, verbose=False)
+        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=6)
+        assert np.array_equal(mask, manual_mask)
+
     def test_n_neg_equivalent_to_n_unl_to_neg_without_pre_neg(self):
         X_pos, X_unl = _make_data()
         dpul_a = aa.dPULearn(random_state=42, verbose=False)

From 89e26cba6f4a52bf13934c65a09d269b3a7a61c7 Mon Sep 17 00:00:00 2001
From: Stephan Breimann <stephanbreimann@gmail.com>
Date: Wed, 1 Jul 2026 05:24:12 +0200
Subject: [PATCH 4/6] round3(dpulearn): rename X_unlabelled -> X_unlabeled for
 codebase consistency

The rest of the package uses the American spelling 'unlabeled' (85 occurrences) and
abbreviates the marker as label_unl / n_unl_to_neg, whereas the new public
mine_negatives parameter used the British double-L spelling 'X_unlabelled'. Rename
the new, still-unreleased parameter, its match helper, docstrings, tests, cheat-sheet
and release-notes entries, and re-execute the example notebook.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 aaanalysis/pu_learning/_dpulearn.py           |  28 +--
 docs/_cheatsheet/content.py                   |   2 +-
 docs/source/index/release_notes.rst           |   4 +-
 .../pu_learning/dpul_mine_negatives.ipynb     | 184 +++++++++---------
 .../test_dpulearn_mine_negatives.py           |  48 ++---
 5 files changed, 133 insertions(+), 133 deletions(-)

diff --git a/aaanalysis/pu_learning/_dpulearn.py b/aaanalysis/pu_learning/_dpulearn.py
index 6e141995..3172d93f 100644
--- a/aaanalysis/pu_learning/_dpulearn.py
+++ b/aaanalysis/pu_learning/_dpulearn.py
@@ -133,13 +133,13 @@ def check_match_X_X_neg(X=None, X_neg=None) -> None:
         raise ValueError(f"'n_features' does not match between 'X' (n={n_features}) and 'X_neg' (n={n_features_neg})")
 
 
-def check_match_X_pos_X_unlabelled(X_pos=None, X_unlabelled=None) -> None:
+def check_match_X_pos_X_unlabeled(X_pos=None, X_unlabeled=None) -> None:
     """Check that positive and unlabeled feature matrices share the same feature dimension."""
     n_features_pos = X_pos.shape[1]
-    n_features_unl = X_unlabelled.shape[1]
+    n_features_unl = X_unlabeled.shape[1]
     if n_features_pos != n_features_unl:
         raise ValueError(f"'n_features' does not match between 'X_pos' (n={n_features_pos}) and "
-                         f"'X_unlabelled' (n={n_features_unl})")
+                         f"'X_unlabeled' (n={n_features_unl})")
 
 
 # II Main Functions
@@ -369,7 +369,7 @@ def fit(self,
 
     def mine_negatives(self,
                        X_pos: ut.ArrayLike2D,
-                       X_unlabelled: ut.ArrayLike2D,
+                       X_unlabeled: ut.ArrayLike2D,
                        n_neg: Optional[int] = None,
                        n_unl_to_neg: Optional[int] = None,
                        metric: Optional[Literal["euclidean", "manhattan", "cosine"]] = None,
@@ -379,15 +379,15 @@ def mine_negatives(self,
         Mine reliable negatives from an unlabeled pool given the positives, in one call.
 
         Convenience wrapper around :meth:`dPULearn.fit` for the common positive/unlabeled
-        setup: instead of stacking ``X_pos`` and ``X_unlabelled`` by hand, building a
+        setup: instead of stacking ``X_pos`` and ``X_unlabeled`` by hand, building a
         ``1`` (positive) / ``2`` (unlabeled) label vector, fitting, and slicing the mined
         rows back out by index, pass the two feature matrices separately and receive a
-        **boolean mask over the rows of** ``X_unlabelled`` flagging the identified reliable
+        **boolean mask over the rows of** ``X_unlabeled`` flagging the identified reliable
         negatives. The mask equals ``labels_[len(X_pos):] == 0`` from the manual stacking
         path exactly.
 
         After the call the instance is fitted: :attr:`dPULearn.labels_` (over the stacked
-        ``X_pos`` then ``X_unlabelled``) and :attr:`dPULearn.df_pu_` are set, so the
+        ``X_pos`` then ``X_unlabeled``) and :attr:`dPULearn.df_pu_` are set, so the
         :class:`dPULearnPlot` methods work as usual.
 
         .. versionadded:: 1.1.0
@@ -396,7 +396,7 @@ def mine_negatives(self,
         ----------
         X_pos : array-like, shape (n_pos, n_features)
             Feature matrix of the positive samples.
-        X_unlabelled : array-like, shape (n_unl, n_features)
+        X_unlabeled : array-like, shape (n_unl, n_features)
             Feature matrix of the unlabeled samples (the candidate pool). Must have the
             same number of features as ``X_pos``.
         n_neg : int, optional
@@ -416,8 +416,8 @@ def mine_negatives(self,
         Returns
         -------
         mask_neg : array-like, shape (n_unl,)
-            Boolean mask over the rows of ``X_unlabelled``: ``True`` marks an identified
-            reliable negative. ``X_unlabelled[mask_neg]`` are the mined negatives.
+            Boolean mask over the rows of ``X_unlabeled``: ``True`` marks an identified
+            reliable negative. ``X_unlabeled[mask_neg]`` are the mined negatives.
 
         Notes
         -----
@@ -438,12 +438,12 @@ def mine_negatives(self,
         # 'fit' below, so per-matrix validation only coerces + checks the feature dimension;
         # this keeps mine_negatives accepting exactly what the manual stacking path accepts)
         X_pos = ut.check_X(X=X_pos, X_name="X_pos", min_n_samples=1)
-        X_unlabelled = ut.check_X(X=X_unlabelled, X_name="X_unlabelled", min_n_samples=1)
-        check_match_X_pos_X_unlabelled(X_pos=X_pos, X_unlabelled=X_unlabelled)
+        X_unlabeled = ut.check_X(X=X_unlabeled, X_name="X_unlabeled", min_n_samples=1)
+        check_match_X_pos_X_unlabeled(X_pos=X_pos, X_unlabeled=X_unlabeled)
         # Stack positives over the unlabeled pool and fit with the package PU markers
         n_pos = X_pos.shape[0]
-        X = np.vstack([X_pos, X_unlabelled])
-        labels = np.array([1] * n_pos + [2] * X_unlabelled.shape[0])
+        X = np.vstack([X_pos, X_unlabeled])
+        labels = np.array([1] * n_pos + [2] * X_unlabeled.shape[0])
         self.fit(X=X, labels=labels, label_pos=1, label_unl=2,
                  n_neg=n_neg, n_unl_to_neg=n_unl_to_neg,
                  metric=metric, n_components=n_components)
diff --git a/docs/_cheatsheet/content.py b/docs/_cheatsheet/content.py
index da244c72..0d04b6eb 100644
--- a/docs/_cheatsheet/content.py
+++ b/docs/_cheatsheet/content.py
@@ -222,7 +222,7 @@
     {"name": "Modeling & Explainability", "tag": "PU · classify · SHAP",
      "rows": [
          ("Train with positives + unlabeled data", "dPULearn().fit(X, labels)  [Wrapper]", None),
-         ("Mine reliable negatives (mask)", "dPULearn().mine_negatives(X_pos, X_unlabelled) → mask", None, "v1.1"),
+         ("Mine reliable negatives (mask)", "dPULearn().mine_negatives(X_pos, X_unlabeled) → mask", None, "v1.1"),
          ("Train + RFE + MC importance", "TreeModel().fit(X, labels)  [Wrapper]", None),
          ("Per-feature / sample SHAP impact", "ShapModel().fit(X, labels)  [pro]", None),
      ]},
diff --git a/docs/source/index/release_notes.rst b/docs/source/index/release_notes.rst
index 5ddf838c..d87656c2 100644
--- a/docs/source/index/release_notes.rst
+++ b/docs/source/index/release_notes.rst
@@ -138,9 +138,9 @@ Added
 **PU Learning**
 
 - **dPULearn.mine_negatives**: One-call convenience over ``dPULearn.fit`` for the common
-  positive / unlabeled setup. Pass ``X_pos`` and ``X_unlabelled`` separately instead of
+  positive / unlabeled setup. Pass ``X_pos`` and ``X_unlabeled`` separately instead of
   stacking them by hand, building a ``1`` / ``2`` label vector, fitting, and slicing the
-  mined rows back out; it returns the **boolean mask over the rows of** ``X_unlabelled``
+  mined rows back out; it returns the **boolean mask over the rows of** ``X_unlabeled``
   flagging the identified reliable negatives (equal to the manual ``labels_[len(X_pos):]
   == 0`` result exactly). The instance is left fitted (``labels_`` / ``df_pu_`` set, so
   ``dPULearnPlot`` works), and the existing ``fit(X, labels=...)`` path is unchanged.
diff --git a/examples/pu_learning/dpul_mine_negatives.ipynb b/examples/pu_learning/dpul_mine_negatives.ipynb
index cd9fc03a..99bcca21 100644
--- a/examples/pu_learning/dpul_mine_negatives.ipynb
+++ b/examples/pu_learning/dpul_mine_negatives.ipynb
@@ -5,7 +5,7 @@
    "id": "01ccf3ee",
    "metadata": {},
    "source": [
-    "The ``dPULearn().mine_negatives()`` method mines reliable negatives from an unlabeled pool in one call. Instead of stacking the positive and unlabeled feature matrices by hand, building a ``1`` (positive) / ``2`` (unlabeled) label vector, fitting, and slicing the mined rows back out by index, you pass the two matrices separately and receive a boolean mask over the rows of ``X_unlabelled``."
+    "The ``dPULearn().mine_negatives()`` method mines reliable negatives from an unlabeled pool in one call. Instead of stacking the positive and unlabeled feature matrices by hand, building a ``1`` (positive) / ``2`` (unlabeled) label vector, fitting, and slicing the mined rows back out by index, you pass the two matrices separately and receive a boolean mask over the rows of ``X_unlabeled``."
    ]
   },
   {
@@ -14,10 +14,10 @@
    "id": "710fdc35",
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2026-06-30T23:30:36.652099Z",
-     "iopub.status.busy": "2026-06-30T23:30:36.652032Z",
-     "iopub.status.idle": "2026-06-30T23:30:38.227970Z",
-     "shell.execute_reply": "2026-06-30T23:30:38.227738Z"
+     "iopub.execute_input": "2026-07-01T03:23:09.096722Z",
+     "iopub.status.busy": "2026-07-01T03:23:09.096500Z",
+     "iopub.status.idle": "2026-07-01T03:23:11.009355Z",
+     "shell.execute_reply": "2026-07-01T03:23:11.009064Z"
     }
    },
    "outputs": [
@@ -53,7 +53,7 @@
    "id": "38f3d31d",
    "metadata": {},
    "source": [
-    "Mine a fixed number of reliable negatives directly from the unlabeled pool with ``n_unl_to_neg``. The returned boolean ``mask_neg`` flags, over the rows of ``X_unlabelled``, which unlabeled samples were identified as reliable negatives."
+    "Mine a fixed number of reliable negatives directly from the unlabeled pool with ``n_unl_to_neg``. The returned boolean ``mask_neg`` flags, over the rows of ``X_unlabeled``, which unlabeled samples were identified as reliable negatives."
    ]
   },
   {
@@ -62,10 +62,10 @@
    "id": "bdccaa03",
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2026-06-30T23:30:38.229229Z",
-     "iopub.status.busy": "2026-06-30T23:30:38.229155Z",
-     "iopub.status.idle": "2026-06-30T23:30:38.241855Z",
-     "shell.execute_reply": "2026-06-30T23:30:38.241652Z"
+     "iopub.execute_input": "2026-07-01T03:23:11.010517Z",
+     "iopub.status.busy": "2026-07-01T03:23:11.010443Z",
+     "iopub.status.idle": "2026-07-01T03:23:11.023598Z",
+     "shell.execute_reply": "2026-07-01T03:23:11.023349Z"
     }
    },
    "outputs": [
@@ -79,7 +79,7 @@
    ],
    "source": [
     "dpul = aa.dPULearn(random_state=42)\n",
-    "mask_neg = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=49)\n",
+    "mask_neg = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=49)\n",
     "print(f\"mined reliable negatives: {int(mask_neg.sum())} of {len(X_unl)} unlabeled\")\n",
     "X_neg = X_unl[mask_neg]  # the mined feature rows"
    ]
@@ -89,7 +89,7 @@
    "id": "315d4798",
    "metadata": {},
    "source": [
-    "The mask equals the manual stacking path exactly: stack ``X_pos`` over ``X_unlabelled``, fit with ``1`` / ``2`` labels, and slice ``labels_[len(X_pos):] == 0``."
+    "The mask equals the manual stacking path exactly: stack ``X_pos`` over ``X_unlabeled``, fit with ``1`` / ``2`` labels, and slice ``labels_[len(X_pos):] == 0``."
    ]
   },
   {
@@ -98,10 +98,10 @@
    "id": "c78d6eab",
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2026-06-30T23:30:38.243063Z",
-     "iopub.status.busy": "2026-06-30T23:30:38.242994Z",
-     "iopub.status.idle": "2026-06-30T23:30:38.254654Z",
-     "shell.execute_reply": "2026-06-30T23:30:38.254464Z"
+     "iopub.execute_input": "2026-07-01T03:23:11.024869Z",
+     "iopub.status.busy": "2026-07-01T03:23:11.024779Z",
+     "iopub.status.idle": "2026-07-01T03:23:11.036398Z",
+     "shell.execute_reply": "2026-07-01T03:23:11.036167Z"
     }
    },
    "outputs": [
@@ -114,7 +114,7 @@
     }
    ],
    "source": [
-    "# Stack X_pos over X_unlabelled, fit with 1 / 2 labels, and slice labels_[len(X_pos):] == 0.\n",
+    "# Stack X_pos over X_unlabeled, fit with 1 / 2 labels, and slice labels_[len(X_pos):] == 0.\n",
     "labels_manual = np.asarray(\n",
     "    aa.dPULearn(random_state=42)\n",
     "    .fit(X=np.vstack([X_pos, X_unl]),\n",
@@ -139,10 +139,10 @@
    "id": "85eff05c",
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2026-06-30T23:30:38.255594Z",
-     "iopub.status.busy": "2026-06-30T23:30:38.255519Z",
-     "iopub.status.idle": "2026-06-30T23:30:38.282637Z",
-     "shell.execute_reply": "2026-06-30T23:30:38.282413Z"
+     "iopub.execute_input": "2026-07-01T03:23:11.037630Z",
+     "iopub.status.busy": "2026-07-01T03:23:11.037545Z",
+     "iopub.status.idle": "2026-07-01T03:23:11.072965Z",
+     "shell.execute_reply": "2026-07-01T03:23:11.072412Z"
     }
    },
    "outputs": [
@@ -161,116 +161,116 @@
      "data": {
       "text/html": [
        "<style type=\"text/css\">\n",
-       "#T_f7f56 thead th {\n",
+       "#T_b98c1 thead th {\n",
        "  background-color: white;\n",
        "  color: black;\n",
        "}\n",
-       "#T_f7f56 tbody tr:nth-child(odd) {\n",
+       "#T_b98c1 tbody tr:nth-child(odd) {\n",
        "  background-color: #f2f2f2;\n",
        "}\n",
-       "#T_f7f56 tbody tr:nth-child(even) {\n",
+       "#T_b98c1 tbody tr:nth-child(even) {\n",
        "  background-color: white;\n",
        "}\n",
-       "#T_f7f56 th {\n",
+       "#T_b98c1 th {\n",
        "  padding: 5px;\n",
        "  white-space: nowrap;\n",
        "}\n",
-       "#T_f7f56  td {\n",
+       "#T_b98c1  td {\n",
        "  padding: 5px;\n",
        "  white-space: nowrap;\n",
        "}\n",
        "</style>\n",
-       "<table id=\"T_f7f56\" style='display:block; max-height: 300px; max-width: 100%; overflow-x: auto; overflow-y: auto;'>\n",
+       "<table id=\"T_b98c1\" style='display:block; max-height: 300px; max-width: 100%; overflow-x: auto; overflow-y: auto;'>\n",
        "  <thead>\n",
        "    <tr>\n",
        "      <th class=\"blank level0\" >&nbsp;</th>\n",
-       "      <th id=\"T_f7f56_level0_col0\" class=\"col_heading level0 col0\" >selection_via</th>\n",
-       "      <th id=\"T_f7f56_level0_col1\" class=\"col_heading level0 col1\" >PC1 (56.2%)</th>\n",
-       "      <th id=\"T_f7f56_level0_col2\" class=\"col_heading level0 col2\" >PC2 (7.4%)</th>\n",
-       "      <th id=\"T_f7f56_level0_col3\" class=\"col_heading level0 col3\" >PC3 (2.9%)</th>\n",
-       "      <th id=\"T_f7f56_level0_col4\" class=\"col_heading level0 col4\" >PC4 (2.8%)</th>\n",
+       "      <th id=\"T_b98c1_level0_col0\" class=\"col_heading level0 col0\" >selection_via</th>\n",
+       "      <th id=\"T_b98c1_level0_col1\" class=\"col_heading level0 col1\" >PC1 (56.2%)</th>\n",
+       "      <th id=\"T_b98c1_level0_col2\" class=\"col_heading level0 col2\" >PC2 (7.4%)</th>\n",
+       "      <th id=\"T_b98c1_level0_col3\" class=\"col_heading level0 col3\" >PC3 (2.9%)</th>\n",
+       "      <th id=\"T_b98c1_level0_col4\" class=\"col_heading level0 col4\" >PC4 (2.8%)</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th id=\"T_f7f56_level0_row0\" class=\"row_heading level0 row0\" >84</th>\n",
-       "      <td id=\"T_f7f56_row0_col0\" class=\"data row0 col0\" >PC1</td>\n",
-       "      <td id=\"T_f7f56_row0_col1\" class=\"data row0 col1\" >0.021000</td>\n",
-       "      <td id=\"T_f7f56_row0_col2\" class=\"data row0 col2\" >-0.047800</td>\n",
-       "      <td id=\"T_f7f56_row0_col3\" class=\"data row0 col3\" >0.075200</td>\n",
-       "      <td id=\"T_f7f56_row0_col4\" class=\"data row0 col4\" >-0.005400</td>\n",
+       "      <th id=\"T_b98c1_level0_row0\" class=\"row_heading level0 row0\" >84</th>\n",
+       "      <td id=\"T_b98c1_row0_col0\" class=\"data row0 col0\" >PC1</td>\n",
+       "      <td id=\"T_b98c1_row0_col1\" class=\"data row0 col1\" >0.021000</td>\n",
+       "      <td id=\"T_b98c1_row0_col2\" class=\"data row0 col2\" >-0.047800</td>\n",
+       "      <td id=\"T_b98c1_row0_col3\" class=\"data row0 col3\" >0.075200</td>\n",
+       "      <td id=\"T_b98c1_row0_col4\" class=\"data row0 col4\" >-0.005400</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_f7f56_level0_row1\" class=\"row_heading level0 row1\" >95</th>\n",
-       "      <td id=\"T_f7f56_row1_col0\" class=\"data row1 col0\" >PC2</td>\n",
-       "      <td id=\"T_f7f56_row1_col1\" class=\"data row1 col1\" >0.032000</td>\n",
-       "      <td id=\"T_f7f56_row1_col2\" class=\"data row1 col2\" >-0.082100</td>\n",
-       "      <td id=\"T_f7f56_row1_col3\" class=\"data row1 col3\" >0.025800</td>\n",
-       "      <td id=\"T_f7f56_row1_col4\" class=\"data row1 col4\" >-0.037700</td>\n",
+       "      <th id=\"T_b98c1_level0_row1\" class=\"row_heading level0 row1\" >95</th>\n",
+       "      <td id=\"T_b98c1_row1_col0\" class=\"data row1 col0\" >PC2</td>\n",
+       "      <td id=\"T_b98c1_row1_col1\" class=\"data row1 col1\" >0.032000</td>\n",
+       "      <td id=\"T_b98c1_row1_col2\" class=\"data row1 col2\" >-0.082100</td>\n",
+       "      <td id=\"T_b98c1_row1_col3\" class=\"data row1 col3\" >0.025800</td>\n",
+       "      <td id=\"T_b98c1_row1_col4\" class=\"data row1 col4\" >-0.037700</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_f7f56_level0_row2\" class=\"row_heading level0 row2\" >109</th>\n",
-       "      <td id=\"T_f7f56_row2_col0\" class=\"data row2 col0\" >PC1</td>\n",
-       "      <td id=\"T_f7f56_row2_col1\" class=\"data row2 col1\" >0.026100</td>\n",
-       "      <td id=\"T_f7f56_row2_col2\" class=\"data row2 col2\" >-0.058500</td>\n",
-       "      <td id=\"T_f7f56_row2_col3\" class=\"data row2 col3\" >0.075700</td>\n",
-       "      <td id=\"T_f7f56_row2_col4\" class=\"data row2 col4\" >-0.020900</td>\n",
+       "      <th id=\"T_b98c1_level0_row2\" class=\"row_heading level0 row2\" >109</th>\n",
+       "      <td id=\"T_b98c1_row2_col0\" class=\"data row2 col0\" >PC1</td>\n",
+       "      <td id=\"T_b98c1_row2_col1\" class=\"data row2 col1\" >0.026100</td>\n",
+       "      <td id=\"T_b98c1_row2_col2\" class=\"data row2 col2\" >-0.058500</td>\n",
+       "      <td id=\"T_b98c1_row2_col3\" class=\"data row2 col3\" >0.075700</td>\n",
+       "      <td id=\"T_b98c1_row2_col4\" class=\"data row2 col4\" >-0.020900</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_f7f56_level0_row3\" class=\"row_heading level0 row3\" >158</th>\n",
-       "      <td id=\"T_f7f56_row3_col0\" class=\"data row3 col0\" >PC1</td>\n",
-       "      <td id=\"T_f7f56_row3_col1\" class=\"data row3 col1\" >0.023500</td>\n",
-       "      <td id=\"T_f7f56_row3_col2\" class=\"data row3 col2\" >-0.060700</td>\n",
-       "      <td id=\"T_f7f56_row3_col3\" class=\"data row3 col3\" >0.054000</td>\n",
-       "      <td id=\"T_f7f56_row3_col4\" class=\"data row3 col4\" >0.000900</td>\n",
+       "      <th id=\"T_b98c1_level0_row3\" class=\"row_heading level0 row3\" >158</th>\n",
+       "      <td id=\"T_b98c1_row3_col0\" class=\"data row3 col0\" >PC1</td>\n",
+       "      <td id=\"T_b98c1_row3_col1\" class=\"data row3 col1\" >0.023500</td>\n",
+       "      <td id=\"T_b98c1_row3_col2\" class=\"data row3 col2\" >-0.060700</td>\n",
+       "      <td id=\"T_b98c1_row3_col3\" class=\"data row3 col3\" >0.054000</td>\n",
+       "      <td id=\"T_b98c1_row3_col4\" class=\"data row3 col4\" >0.000900</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_f7f56_level0_row4\" class=\"row_heading level0 row4\" >161</th>\n",
-       "      <td id=\"T_f7f56_row4_col0\" class=\"data row4 col0\" >PC1</td>\n",
-       "      <td id=\"T_f7f56_row4_col1\" class=\"data row4 col1\" >0.025900</td>\n",
-       "      <td id=\"T_f7f56_row4_col2\" class=\"data row4 col2\" >0.031400</td>\n",
-       "      <td id=\"T_f7f56_row4_col3\" class=\"data row4 col3\" >0.044900</td>\n",
-       "      <td id=\"T_f7f56_row4_col4\" class=\"data row4 col4\" >0.055400</td>\n",
+       "      <th id=\"T_b98c1_level0_row4\" class=\"row_heading level0 row4\" >161</th>\n",
+       "      <td id=\"T_b98c1_row4_col0\" class=\"data row4 col0\" >PC1</td>\n",
+       "      <td id=\"T_b98c1_row4_col1\" class=\"data row4 col1\" >0.025900</td>\n",
+       "      <td id=\"T_b98c1_row4_col2\" class=\"data row4 col2\" >0.031400</td>\n",
+       "      <td id=\"T_b98c1_row4_col3\" class=\"data row4 col3\" >0.044900</td>\n",
+       "      <td id=\"T_b98c1_row4_col4\" class=\"data row4 col4\" >0.055400</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_f7f56_level0_row5\" class=\"row_heading level0 row5\" >170</th>\n",
-       "      <td id=\"T_f7f56_row5_col0\" class=\"data row5 col0\" >PC1</td>\n",
-       "      <td id=\"T_f7f56_row5_col1\" class=\"data row5 col1\" >0.026100</td>\n",
-       "      <td id=\"T_f7f56_row5_col2\" class=\"data row5 col2\" >-0.035300</td>\n",
-       "      <td id=\"T_f7f56_row5_col3\" class=\"data row5 col3\" >0.058300</td>\n",
-       "      <td id=\"T_f7f56_row5_col4\" class=\"data row5 col4\" >0.025800</td>\n",
+       "      <th id=\"T_b98c1_level0_row5\" class=\"row_heading level0 row5\" >170</th>\n",
+       "      <td id=\"T_b98c1_row5_col0\" class=\"data row5 col0\" >PC1</td>\n",
+       "      <td id=\"T_b98c1_row5_col1\" class=\"data row5 col1\" >0.026100</td>\n",
+       "      <td id=\"T_b98c1_row5_col2\" class=\"data row5 col2\" >-0.035300</td>\n",
+       "      <td id=\"T_b98c1_row5_col3\" class=\"data row5 col3\" >0.058300</td>\n",
+       "      <td id=\"T_b98c1_row5_col4\" class=\"data row5 col4\" >0.025800</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_f7f56_level0_row6\" class=\"row_heading level0 row6\" >192</th>\n",
-       "      <td id=\"T_f7f56_row6_col0\" class=\"data row6 col0\" >PC6</td>\n",
-       "      <td id=\"T_f7f56_row6_col1\" class=\"data row6 col1\" >0.040100</td>\n",
-       "      <td id=\"T_f7f56_row6_col2\" class=\"data row6 col2\" >-0.002200</td>\n",
-       "      <td id=\"T_f7f56_row6_col3\" class=\"data row6 col3\" >0.004300</td>\n",
-       "      <td id=\"T_f7f56_row6_col4\" class=\"data row6 col4\" >-0.053600</td>\n",
+       "      <th id=\"T_b98c1_level0_row6\" class=\"row_heading level0 row6\" >192</th>\n",
+       "      <td id=\"T_b98c1_row6_col0\" class=\"data row6 col0\" >PC6</td>\n",
+       "      <td id=\"T_b98c1_row6_col1\" class=\"data row6 col1\" >0.040100</td>\n",
+       "      <td id=\"T_b98c1_row6_col2\" class=\"data row6 col2\" >-0.002200</td>\n",
+       "      <td id=\"T_b98c1_row6_col3\" class=\"data row6 col3\" >0.004300</td>\n",
+       "      <td id=\"T_b98c1_row6_col4\" class=\"data row6 col4\" >-0.053600</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_f7f56_level0_row7\" class=\"row_heading level0 row7\" >193</th>\n",
-       "      <td id=\"T_f7f56_row7_col0\" class=\"data row7 col0\" >PC1</td>\n",
-       "      <td id=\"T_f7f56_row7_col1\" class=\"data row7 col1\" >0.024700</td>\n",
-       "      <td id=\"T_f7f56_row7_col2\" class=\"data row7 col2\" >-0.056900</td>\n",
-       "      <td id=\"T_f7f56_row7_col3\" class=\"data row7 col3\" >0.051300</td>\n",
-       "      <td id=\"T_f7f56_row7_col4\" class=\"data row7 col4\" >-0.035600</td>\n",
+       "      <th id=\"T_b98c1_level0_row7\" class=\"row_heading level0 row7\" >193</th>\n",
+       "      <td id=\"T_b98c1_row7_col0\" class=\"data row7 col0\" >PC1</td>\n",
+       "      <td id=\"T_b98c1_row7_col1\" class=\"data row7 col1\" >0.024700</td>\n",
+       "      <td id=\"T_b98c1_row7_col2\" class=\"data row7 col2\" >-0.056900</td>\n",
+       "      <td id=\"T_b98c1_row7_col3\" class=\"data row7 col3\" >0.051300</td>\n",
+       "      <td id=\"T_b98c1_row7_col4\" class=\"data row7 col4\" >-0.035600</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_f7f56_level0_row8\" class=\"row_heading level0 row8\" >195</th>\n",
-       "      <td id=\"T_f7f56_row8_col0\" class=\"data row8 col0\" >PC5</td>\n",
-       "      <td id=\"T_f7f56_row8_col1\" class=\"data row8 col1\" >0.029900</td>\n",
-       "      <td id=\"T_f7f56_row8_col2\" class=\"data row8 col2\" >0.006500</td>\n",
-       "      <td id=\"T_f7f56_row8_col3\" class=\"data row8 col3\" >0.035800</td>\n",
-       "      <td id=\"T_f7f56_row8_col4\" class=\"data row8 col4\" >0.050200</td>\n",
+       "      <th id=\"T_b98c1_level0_row8\" class=\"row_heading level0 row8\" >195</th>\n",
+       "      <td id=\"T_b98c1_row8_col0\" class=\"data row8 col0\" >PC5</td>\n",
+       "      <td id=\"T_b98c1_row8_col1\" class=\"data row8 col1\" >0.029900</td>\n",
+       "      <td id=\"T_b98c1_row8_col2\" class=\"data row8 col2\" >0.006500</td>\n",
+       "      <td id=\"T_b98c1_row8_col3\" class=\"data row8 col3\" >0.035800</td>\n",
+       "      <td id=\"T_b98c1_row8_col4\" class=\"data row8 col4\" >0.050200</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_f7f56_level0_row9\" class=\"row_heading level0 row9\" >200</th>\n",
-       "      <td id=\"T_f7f56_row9_col0\" class=\"data row9 col0\" >PC1</td>\n",
-       "      <td id=\"T_f7f56_row9_col1\" class=\"data row9 col1\" >0.021200</td>\n",
-       "      <td id=\"T_f7f56_row9_col2\" class=\"data row9 col2\" >-0.056200</td>\n",
-       "      <td id=\"T_f7f56_row9_col3\" class=\"data row9 col3\" >0.005700</td>\n",
-       "      <td id=\"T_f7f56_row9_col4\" class=\"data row9 col4\" >0.072600</td>\n",
+       "      <th id=\"T_b98c1_level0_row9\" class=\"row_heading level0 row9\" >200</th>\n",
+       "      <td id=\"T_b98c1_row9_col0\" class=\"data row9 col0\" >PC1</td>\n",
+       "      <td id=\"T_b98c1_row9_col1\" class=\"data row9 col1\" >0.021200</td>\n",
+       "      <td id=\"T_b98c1_row9_col2\" class=\"data row9 col2\" >-0.056200</td>\n",
+       "      <td id=\"T_b98c1_row9_col3\" class=\"data row9 col3\" >0.005700</td>\n",
+       "      <td id=\"T_b98c1_row9_col4\" class=\"data row9 col4\" >0.072600</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n"
diff --git a/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py b/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py
index 19be06a9..ecc20b72 100644
--- a/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py
+++ b/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py
@@ -1,9 +1,9 @@
 """
 This script tests the dPULearn.mine_negatives() convenience method (issue #308).
 
-mine_negatives is additive sugar over dPULearn.fit: it stacks X_pos over X_unlabelled,
+mine_negatives is additive sugar over dPULearn.fit: it stacks X_pos over X_unlabeled,
 builds a 1 (positive) / 2 (unlabeled) label vector, fits, and returns the boolean mask of
-identified reliable negatives over the rows of X_unlabelled. The key contract is that the
+identified reliable negatives over the rows of X_unlabeled. The key contract is that the
 mask equals the manual ``labels_[len(X_pos):] == 0`` result exactly, and that the existing
 ``fit`` path stays byte-identical (no algorithm change).
 """
@@ -34,10 +34,10 @@ def _manual_mask(X_pos, X_unl, random_state=42, **fit_kwargs):
 class TestMineNegatives:
     """Test dPULearn.mine_negatives() for each parameter individually."""
 
-    def test_returns_boolean_mask_over_unlabelled(self):
+    def test_returns_boolean_mask_over_unlabeled(self):
         X_pos, X_unl = _make_data()
         dpul = aa.dPULearn(random_state=42, verbose=False)
-        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=10)
+        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=10)
         assert isinstance(mask, np.ndarray)
         assert mask.dtype == bool
         assert mask.shape == (X_unl.shape[0],)
@@ -46,13 +46,13 @@ def test_returns_boolean_mask_over_unlabelled(self):
     def test_X_pos_parameter(self):
         X_pos, X_unl = _make_data(n_pos=30)
         dpul = aa.dPULearn(random_state=42, verbose=False)
-        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=5)
+        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=5)
         assert mask.shape[0] == X_unl.shape[0]
 
-    def test_X_unlabelled_parameter(self):
+    def test_X_unlabeled_parameter(self):
         X_pos, X_unl = _make_data(n_unl=70)
         dpul = aa.dPULearn(random_state=42, verbose=False)
-        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=12)
+        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=12)
         assert mask.shape[0] == 70
         assert mask.sum() == 12
 
@@ -60,20 +60,20 @@ def test_n_unl_to_neg_parameter(self):
         X_pos, X_unl = _make_data()
         for n in (1, 5, 25):
             dpul = aa.dPULearn(random_state=42, verbose=False)
-            mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=n)
+            mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=n)
             assert mask.sum() == n
 
     def test_n_neg_parameter(self):
         X_pos, X_unl = _make_data()
         dpul = aa.dPULearn(random_state=42, verbose=False)
-        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_neg=8)
+        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=8)
         assert mask.sum() == 8
 
     def test_metric_parameter(self):
         X_pos, X_unl = _make_data()
         for metric in ("euclidean", "manhattan", "cosine"):
             dpul = aa.dPULearn(random_state=42, verbose=False)
-            mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl,
+            mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl,
                                        n_unl_to_neg=10, metric=metric)
             assert mask.sum() == 10
 
@@ -81,7 +81,7 @@ def test_n_components_parameter(self):
         X_pos, X_unl = _make_data()
         for n_components in (2, 3, 0.5):
             dpul = aa.dPULearn(random_state=42, verbose=False)
-            mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl,
+            mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl,
                                        n_unl_to_neg=10, n_components=n_components)
             assert mask.sum() == 10
 
@@ -89,7 +89,7 @@ def test_instance_attributes_set(self):
         """After mining, labels_ / df_pu_ are set so the plotting class works."""
         X_pos, X_unl = _make_data()
         dpul = aa.dPULearn(random_state=42, verbose=False)
-        dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=10)
+        dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=10)
         assert dpul.labels_ is not None
         assert dpul.labels_.shape[0] == X_pos.shape[0] + X_unl.shape[0]
         assert dpul.df_pu_ is not None
@@ -104,7 +104,7 @@ def test_mask_equals_manual_pca(self, seed):
         X_pos, X_unl = _make_data(seed=seed)
         manual_mask, dpul_m = _manual_mask(X_pos, X_unl, n_unl_to_neg=10)
         dpul = aa.dPULearn(random_state=42, verbose=False)
-        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=10)
+        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=10)
         assert np.array_equal(mask, manual_mask)
         assert np.array_equal(np.asarray(dpul.labels_), np.asarray(dpul_m.labels_))
 
@@ -112,7 +112,7 @@ def test_mask_equals_manual_metric(self):
         X_pos, X_unl = _make_data(seed=3)
         manual_mask, _ = _manual_mask(X_pos, X_unl, n_unl_to_neg=8, metric="cosine")
         dpul = aa.dPULearn(random_state=42, verbose=False)
-        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl,
+        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl,
                                    n_unl_to_neg=8, metric="cosine")
         assert np.array_equal(mask, manual_mask)
 
@@ -122,15 +122,15 @@ def test_mask_equals_manual_few_positives(self):
         X_pos, X_unl = _make_data(n_pos=1, seed=5)
         manual_mask, _ = _manual_mask(X_pos, X_unl, n_unl_to_neg=6)
         dpul = aa.dPULearn(random_state=42, verbose=False)
-        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=6)
+        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=6)
         assert np.array_equal(mask, manual_mask)
 
     def test_n_neg_equivalent_to_n_unl_to_neg_without_pre_neg(self):
         X_pos, X_unl = _make_data()
         dpul_a = aa.dPULearn(random_state=42, verbose=False)
-        mask_a = dpul_a.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=10)
+        mask_a = dpul_a.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=10)
         dpul_b = aa.dPULearn(random_state=42, verbose=False)
-        mask_b = dpul_b.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_neg=10)
+        mask_b = dpul_b.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=10)
         assert np.array_equal(mask_a, mask_b)
 
 
@@ -143,37 +143,37 @@ def test_feature_mismatch(self):
         _, X_unl = _make_data(n_features=6)
         dpul = aa.dPULearn(random_state=42, verbose=False)
         with pytest.raises(ValueError):
-            dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=5)
+            dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=5)
 
     def test_both_counts_given(self):
         X_pos, X_unl = _make_data()
         dpul = aa.dPULearn(random_state=42, verbose=False)
         with pytest.raises(ValueError):
-            dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_neg=5, n_unl_to_neg=5)
+            dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=5, n_unl_to_neg=5)
 
     def test_neither_count_given(self):
         X_pos, X_unl = _make_data()
         dpul = aa.dPULearn(random_state=42, verbose=False)
         with pytest.raises(ValueError):
-            dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl)
+            dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl)
 
     def test_too_many_negatives_requested(self):
         X_pos, X_unl = _make_data(n_unl=10)
         dpul = aa.dPULearn(random_state=42, verbose=False)
         with pytest.raises(ValueError):
-            dpul.mine_negatives(X_pos=X_pos, X_unlabelled=X_unl, n_unl_to_neg=999)
+            dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=999)
 
     def test_X_pos_none(self):
         _, X_unl = _make_data()
         dpul = aa.dPULearn(random_state=42, verbose=False)
         with pytest.raises(ValueError):
-            dpul.mine_negatives(X_pos=None, X_unlabelled=X_unl, n_unl_to_neg=5)
+            dpul.mine_negatives(X_pos=None, X_unlabeled=X_unl, n_unl_to_neg=5)
 
-    def test_X_unlabelled_none(self):
+    def test_X_unlabeled_none(self):
         X_pos, _ = _make_data()
         dpul = aa.dPULearn(random_state=42, verbose=False)
         with pytest.raises(ValueError):
-            dpul.mine_negatives(X_pos=X_pos, X_unlabelled=None, n_unl_to_neg=5)
+            dpul.mine_negatives(X_pos=X_pos, X_unlabeled=None, n_unl_to_neg=5)
 
 
 # Existing-fit byte-identical regression

From 86ec2eadded132bce9e761a1786f5930fdcabbab Mon Sep 17 00:00:00 2001
From: Stephan Breimann <stephanbreimann@gmail.com>
Date: Wed, 1 Jul 2026 05:29:43 +0200
Subject: [PATCH 5/6] round4(dpulearn): collapse mine_negatives counts to a
 single n_neg

With no pre-labeled negatives, n_neg (total) and n_unl_to_neg (from the pool)
are always equivalent in mine_negatives, so exposing both was redundant. Replace
them with a single required n_neg (the method is new/unreleased, so non-breaking);
mine_negatives passes it on as fit(n_unl_to_neg=n_neg). Update the docstring and
tests, and re-execute the example notebook.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 aaanalysis/pu_learning/_dpulearn.py           |  17 +-
 .../pu_learning/dpul_mine_negatives.ipynb     | 180 +++++++++---------
 .../test_dpulearn_mine_negatives.py           |  54 ++----
 3 files changed, 113 insertions(+), 138 deletions(-)

diff --git a/aaanalysis/pu_learning/_dpulearn.py b/aaanalysis/pu_learning/_dpulearn.py
index 3172d93f..40d44b44 100644
--- a/aaanalysis/pu_learning/_dpulearn.py
+++ b/aaanalysis/pu_learning/_dpulearn.py
@@ -370,8 +370,7 @@ def fit(self,
     def mine_negatives(self,
                        X_pos: ut.ArrayLike2D,
                        X_unlabeled: ut.ArrayLike2D,
-                       n_neg: Optional[int] = None,
-                       n_unl_to_neg: Optional[int] = None,
+                       n_neg: int,
                        metric: Optional[Literal["euclidean", "manhattan", "cosine"]] = None,
                        n_components: Union[float, int] = 0.80,
                        ) -> np.ndarray:
@@ -399,13 +398,9 @@ def mine_negatives(self,
         X_unlabeled : array-like, shape (n_unl, n_features)
             Feature matrix of the unlabeled samples (the candidate pool). Must have the
             same number of features as ``X_pos``.
-        n_neg : int, optional
-            Total number of reliable negatives to identify from the unlabeled pool.
-            Provide **exactly one** of ``n_neg`` or ``n_unl_to_neg`` (with no pre-labeled
-            negatives the two are equivalent).
-        n_unl_to_neg : int, optional
-            Number of reliable negatives to identify directly from the unlabeled pool.
-            Provide **exactly one** of ``n_neg`` or ``n_unl_to_neg``.
+        n_neg : int
+            Number of reliable negatives to identify from the unlabeled pool. Must be at
+            least 1 and must not exceed the number of unlabeled samples.
         metric : str or None, optional
             Distance metric for distance-based identification (``euclidean``,
             ``manhattan``, ``cosine``). If ``None``, PCA-based identification is performed.
@@ -444,9 +439,9 @@ def mine_negatives(self,
         n_pos = X_pos.shape[0]
         X = np.vstack([X_pos, X_unlabeled])
         labels = np.array([1] * n_pos + [2] * X_unlabeled.shape[0])
+        # No pre-labeled negatives here, so n_neg is exactly the count to draw from the pool
         self.fit(X=X, labels=labels, label_pos=1, label_unl=2,
-                 n_neg=n_neg, n_unl_to_neg=n_unl_to_neg,
-                 metric=metric, n_components=n_components)
+                 n_unl_to_neg=n_neg, metric=metric, n_components=n_components)
         # Slice the mined reliable negatives (label 0) back out of the unlabeled block
         mask_neg = np.asarray(self.labels_)[n_pos:] == 0
         return mask_neg
diff --git a/examples/pu_learning/dpul_mine_negatives.ipynb b/examples/pu_learning/dpul_mine_negatives.ipynb
index 99bcca21..fc56d1fc 100644
--- a/examples/pu_learning/dpul_mine_negatives.ipynb
+++ b/examples/pu_learning/dpul_mine_negatives.ipynb
@@ -14,10 +14,10 @@
    "id": "710fdc35",
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2026-07-01T03:23:09.096722Z",
-     "iopub.status.busy": "2026-07-01T03:23:09.096500Z",
-     "iopub.status.idle": "2026-07-01T03:23:11.009355Z",
-     "shell.execute_reply": "2026-07-01T03:23:11.009064Z"
+     "iopub.execute_input": "2026-07-01T03:27:51.939100Z",
+     "iopub.status.busy": "2026-07-01T03:27:51.937608Z",
+     "iopub.status.idle": "2026-07-01T03:28:11.600218Z",
+     "shell.execute_reply": "2026-07-01T03:28:11.552756Z"
     }
    },
    "outputs": [
@@ -53,7 +53,7 @@
    "id": "38f3d31d",
    "metadata": {},
    "source": [
-    "Mine a fixed number of reliable negatives directly from the unlabeled pool with ``n_unl_to_neg``. The returned boolean ``mask_neg`` flags, over the rows of ``X_unlabeled``, which unlabeled samples were identified as reliable negatives."
+    "Mine a fixed number of reliable negatives from the unlabeled pool with ``n_neg``. The returned boolean ``mask_neg`` flags, over the rows of ``X_unlabeled``, which unlabeled samples were identified as reliable negatives."
    ]
   },
   {
@@ -62,10 +62,10 @@
    "id": "bdccaa03",
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2026-07-01T03:23:11.010517Z",
-     "iopub.status.busy": "2026-07-01T03:23:11.010443Z",
-     "iopub.status.idle": "2026-07-01T03:23:11.023598Z",
-     "shell.execute_reply": "2026-07-01T03:23:11.023349Z"
+     "iopub.execute_input": "2026-07-01T03:28:11.767224Z",
+     "iopub.status.busy": "2026-07-01T03:28:11.766642Z",
+     "iopub.status.idle": "2026-07-01T03:28:11.816538Z",
+     "shell.execute_reply": "2026-07-01T03:28:11.795116Z"
     }
    },
    "outputs": [
@@ -79,7 +79,7 @@
    ],
    "source": [
     "dpul = aa.dPULearn(random_state=42)\n",
-    "mask_neg = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=49)\n",
+    "mask_neg = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=49)\n",
     "print(f\"mined reliable negatives: {int(mask_neg.sum())} of {len(X_unl)} unlabeled\")\n",
     "X_neg = X_unl[mask_neg]  # the mined feature rows"
    ]
@@ -98,10 +98,10 @@
    "id": "c78d6eab",
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2026-07-01T03:23:11.024869Z",
-     "iopub.status.busy": "2026-07-01T03:23:11.024779Z",
-     "iopub.status.idle": "2026-07-01T03:23:11.036398Z",
-     "shell.execute_reply": "2026-07-01T03:23:11.036167Z"
+     "iopub.execute_input": "2026-07-01T03:28:11.824245Z",
+     "iopub.status.busy": "2026-07-01T03:28:11.824003Z",
+     "iopub.status.idle": "2026-07-01T03:28:12.020230Z",
+     "shell.execute_reply": "2026-07-01T03:28:11.952823Z"
     }
    },
    "outputs": [
@@ -130,7 +130,7 @@
    "id": "cae2d79d",
    "metadata": {},
    "source": [
-    "After mining, the instance is fitted: ``labels_`` (over the stacked positives then unlabeled) and ``df_pu_`` are set, so the :class:`dPULearnPlot` methods work as usual. Use ``n_neg`` instead of ``n_unl_to_neg`` to request a total count, or set a distance ``metric`` (``euclidean`` / ``manhattan`` / ``cosine``) for distance-based identification."
+    "After mining, the instance is fitted: ``labels_`` (over the stacked positives then unlabeled) and ``df_pu_`` are set, so the :class:`dPULearnPlot` methods work as usual. Pass ``n_neg`` (the number of reliable negatives to mine) and optionally a distance ``metric`` (``euclidean`` / ``manhattan`` / ``cosine``) for distance-based identification instead of the default PCA."
    ]
   },
   {
@@ -139,10 +139,10 @@
    "id": "85eff05c",
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2026-07-01T03:23:11.037630Z",
-     "iopub.status.busy": "2026-07-01T03:23:11.037545Z",
-     "iopub.status.idle": "2026-07-01T03:23:11.072965Z",
-     "shell.execute_reply": "2026-07-01T03:23:11.072412Z"
+     "iopub.execute_input": "2026-07-01T03:28:12.179951Z",
+     "iopub.status.busy": "2026-07-01T03:28:12.179661Z",
+     "iopub.status.idle": "2026-07-01T03:28:12.745973Z",
+     "shell.execute_reply": "2026-07-01T03:28:12.744957Z"
     }
    },
    "outputs": [
@@ -161,116 +161,116 @@
      "data": {
       "text/html": [
        "<style type=\"text/css\">\n",
-       "#T_b98c1 thead th {\n",
+       "#T_4ca60 thead th {\n",
        "  background-color: white;\n",
        "  color: black;\n",
        "}\n",
-       "#T_b98c1 tbody tr:nth-child(odd) {\n",
+       "#T_4ca60 tbody tr:nth-child(odd) {\n",
        "  background-color: #f2f2f2;\n",
        "}\n",
-       "#T_b98c1 tbody tr:nth-child(even) {\n",
+       "#T_4ca60 tbody tr:nth-child(even) {\n",
        "  background-color: white;\n",
        "}\n",
-       "#T_b98c1 th {\n",
+       "#T_4ca60 th {\n",
        "  padding: 5px;\n",
        "  white-space: nowrap;\n",
        "}\n",
-       "#T_b98c1  td {\n",
+       "#T_4ca60  td {\n",
        "  padding: 5px;\n",
        "  white-space: nowrap;\n",
        "}\n",
        "</style>\n",
-       "<table id=\"T_b98c1\" style='display:block; max-height: 300px; max-width: 100%; overflow-x: auto; overflow-y: auto;'>\n",
+       "<table id=\"T_4ca60\" style='display:block; max-height: 300px; max-width: 100%; overflow-x: auto; overflow-y: auto;'>\n",
        "  <thead>\n",
        "    <tr>\n",
        "      <th class=\"blank level0\" >&nbsp;</th>\n",
-       "      <th id=\"T_b98c1_level0_col0\" class=\"col_heading level0 col0\" >selection_via</th>\n",
-       "      <th id=\"T_b98c1_level0_col1\" class=\"col_heading level0 col1\" >PC1 (56.2%)</th>\n",
-       "      <th id=\"T_b98c1_level0_col2\" class=\"col_heading level0 col2\" >PC2 (7.4%)</th>\n",
-       "      <th id=\"T_b98c1_level0_col3\" class=\"col_heading level0 col3\" >PC3 (2.9%)</th>\n",
-       "      <th id=\"T_b98c1_level0_col4\" class=\"col_heading level0 col4\" >PC4 (2.8%)</th>\n",
+       "      <th id=\"T_4ca60_level0_col0\" class=\"col_heading level0 col0\" >selection_via</th>\n",
+       "      <th id=\"T_4ca60_level0_col1\" class=\"col_heading level0 col1\" >PC1 (56.2%)</th>\n",
+       "      <th id=\"T_4ca60_level0_col2\" class=\"col_heading level0 col2\" >PC2 (7.4%)</th>\n",
+       "      <th id=\"T_4ca60_level0_col3\" class=\"col_heading level0 col3\" >PC3 (2.9%)</th>\n",
+       "      <th id=\"T_4ca60_level0_col4\" class=\"col_heading level0 col4\" >PC4 (2.8%)</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th id=\"T_b98c1_level0_row0\" class=\"row_heading level0 row0\" >84</th>\n",
-       "      <td id=\"T_b98c1_row0_col0\" class=\"data row0 col0\" >PC1</td>\n",
-       "      <td id=\"T_b98c1_row0_col1\" class=\"data row0 col1\" >0.021000</td>\n",
-       "      <td id=\"T_b98c1_row0_col2\" class=\"data row0 col2\" >-0.047800</td>\n",
-       "      <td id=\"T_b98c1_row0_col3\" class=\"data row0 col3\" >0.075200</td>\n",
-       "      <td id=\"T_b98c1_row0_col4\" class=\"data row0 col4\" >-0.005400</td>\n",
+       "      <th id=\"T_4ca60_level0_row0\" class=\"row_heading level0 row0\" >84</th>\n",
+       "      <td id=\"T_4ca60_row0_col0\" class=\"data row0 col0\" >PC1</td>\n",
+       "      <td id=\"T_4ca60_row0_col1\" class=\"data row0 col1\" >0.021000</td>\n",
+       "      <td id=\"T_4ca60_row0_col2\" class=\"data row0 col2\" >-0.047800</td>\n",
+       "      <td id=\"T_4ca60_row0_col3\" class=\"data row0 col3\" >0.075200</td>\n",
+       "      <td id=\"T_4ca60_row0_col4\" class=\"data row0 col4\" >-0.005400</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_b98c1_level0_row1\" class=\"row_heading level0 row1\" >95</th>\n",
-       "      <td id=\"T_b98c1_row1_col0\" class=\"data row1 col0\" >PC2</td>\n",
-       "      <td id=\"T_b98c1_row1_col1\" class=\"data row1 col1\" >0.032000</td>\n",
-       "      <td id=\"T_b98c1_row1_col2\" class=\"data row1 col2\" >-0.082100</td>\n",
-       "      <td id=\"T_b98c1_row1_col3\" class=\"data row1 col3\" >0.025800</td>\n",
-       "      <td id=\"T_b98c1_row1_col4\" class=\"data row1 col4\" >-0.037700</td>\n",
+       "      <th id=\"T_4ca60_level0_row1\" class=\"row_heading level0 row1\" >95</th>\n",
+       "      <td id=\"T_4ca60_row1_col0\" class=\"data row1 col0\" >PC2</td>\n",
+       "      <td id=\"T_4ca60_row1_col1\" class=\"data row1 col1\" >0.032000</td>\n",
+       "      <td id=\"T_4ca60_row1_col2\" class=\"data row1 col2\" >-0.082100</td>\n",
+       "      <td id=\"T_4ca60_row1_col3\" class=\"data row1 col3\" >0.025800</td>\n",
+       "      <td id=\"T_4ca60_row1_col4\" class=\"data row1 col4\" >-0.037700</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_b98c1_level0_row2\" class=\"row_heading level0 row2\" >109</th>\n",
-       "      <td id=\"T_b98c1_row2_col0\" class=\"data row2 col0\" >PC1</td>\n",
-       "      <td id=\"T_b98c1_row2_col1\" class=\"data row2 col1\" >0.026100</td>\n",
-       "      <td id=\"T_b98c1_row2_col2\" class=\"data row2 col2\" >-0.058500</td>\n",
-       "      <td id=\"T_b98c1_row2_col3\" class=\"data row2 col3\" >0.075700</td>\n",
-       "      <td id=\"T_b98c1_row2_col4\" class=\"data row2 col4\" >-0.020900</td>\n",
+       "      <th id=\"T_4ca60_level0_row2\" class=\"row_heading level0 row2\" >109</th>\n",
+       "      <td id=\"T_4ca60_row2_col0\" class=\"data row2 col0\" >PC1</td>\n",
+       "      <td id=\"T_4ca60_row2_col1\" class=\"data row2 col1\" >0.026100</td>\n",
+       "      <td id=\"T_4ca60_row2_col2\" class=\"data row2 col2\" >-0.058500</td>\n",
+       "      <td id=\"T_4ca60_row2_col3\" class=\"data row2 col3\" >0.075700</td>\n",
+       "      <td id=\"T_4ca60_row2_col4\" class=\"data row2 col4\" >-0.020900</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_b98c1_level0_row3\" class=\"row_heading level0 row3\" >158</th>\n",
-       "      <td id=\"T_b98c1_row3_col0\" class=\"data row3 col0\" >PC1</td>\n",
-       "      <td id=\"T_b98c1_row3_col1\" class=\"data row3 col1\" >0.023500</td>\n",
-       "      <td id=\"T_b98c1_row3_col2\" class=\"data row3 col2\" >-0.060700</td>\n",
-       "      <td id=\"T_b98c1_row3_col3\" class=\"data row3 col3\" >0.054000</td>\n",
-       "      <td id=\"T_b98c1_row3_col4\" class=\"data row3 col4\" >0.000900</td>\n",
+       "      <th id=\"T_4ca60_level0_row3\" class=\"row_heading level0 row3\" >158</th>\n",
+       "      <td id=\"T_4ca60_row3_col0\" class=\"data row3 col0\" >PC1</td>\n",
+       "      <td id=\"T_4ca60_row3_col1\" class=\"data row3 col1\" >0.023500</td>\n",
+       "      <td id=\"T_4ca60_row3_col2\" class=\"data row3 col2\" >-0.060700</td>\n",
+       "      <td id=\"T_4ca60_row3_col3\" class=\"data row3 col3\" >0.054000</td>\n",
+       "      <td id=\"T_4ca60_row3_col4\" class=\"data row3 col4\" >0.000900</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_b98c1_level0_row4\" class=\"row_heading level0 row4\" >161</th>\n",
-       "      <td id=\"T_b98c1_row4_col0\" class=\"data row4 col0\" >PC1</td>\n",
-       "      <td id=\"T_b98c1_row4_col1\" class=\"data row4 col1\" >0.025900</td>\n",
-       "      <td id=\"T_b98c1_row4_col2\" class=\"data row4 col2\" >0.031400</td>\n",
-       "      <td id=\"T_b98c1_row4_col3\" class=\"data row4 col3\" >0.044900</td>\n",
-       "      <td id=\"T_b98c1_row4_col4\" class=\"data row4 col4\" >0.055400</td>\n",
+       "      <th id=\"T_4ca60_level0_row4\" class=\"row_heading level0 row4\" >161</th>\n",
+       "      <td id=\"T_4ca60_row4_col0\" class=\"data row4 col0\" >PC1</td>\n",
+       "      <td id=\"T_4ca60_row4_col1\" class=\"data row4 col1\" >0.025900</td>\n",
+       "      <td id=\"T_4ca60_row4_col2\" class=\"data row4 col2\" >0.031400</td>\n",
+       "      <td id=\"T_4ca60_row4_col3\" class=\"data row4 col3\" >0.044900</td>\n",
+       "      <td id=\"T_4ca60_row4_col4\" class=\"data row4 col4\" >0.055400</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_b98c1_level0_row5\" class=\"row_heading level0 row5\" >170</th>\n",
-       "      <td id=\"T_b98c1_row5_col0\" class=\"data row5 col0\" >PC1</td>\n",
-       "      <td id=\"T_b98c1_row5_col1\" class=\"data row5 col1\" >0.026100</td>\n",
-       "      <td id=\"T_b98c1_row5_col2\" class=\"data row5 col2\" >-0.035300</td>\n",
-       "      <td id=\"T_b98c1_row5_col3\" class=\"data row5 col3\" >0.058300</td>\n",
-       "      <td id=\"T_b98c1_row5_col4\" class=\"data row5 col4\" >0.025800</td>\n",
+       "      <th id=\"T_4ca60_level0_row5\" class=\"row_heading level0 row5\" >170</th>\n",
+       "      <td id=\"T_4ca60_row5_col0\" class=\"data row5 col0\" >PC1</td>\n",
+       "      <td id=\"T_4ca60_row5_col1\" class=\"data row5 col1\" >0.026100</td>\n",
+       "      <td id=\"T_4ca60_row5_col2\" class=\"data row5 col2\" >-0.035300</td>\n",
+       "      <td id=\"T_4ca60_row5_col3\" class=\"data row5 col3\" >0.058300</td>\n",
+       "      <td id=\"T_4ca60_row5_col4\" class=\"data row5 col4\" >0.025800</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_b98c1_level0_row6\" class=\"row_heading level0 row6\" >192</th>\n",
-       "      <td id=\"T_b98c1_row6_col0\" class=\"data row6 col0\" >PC6</td>\n",
-       "      <td id=\"T_b98c1_row6_col1\" class=\"data row6 col1\" >0.040100</td>\n",
-       "      <td id=\"T_b98c1_row6_col2\" class=\"data row6 col2\" >-0.002200</td>\n",
-       "      <td id=\"T_b98c1_row6_col3\" class=\"data row6 col3\" >0.004300</td>\n",
-       "      <td id=\"T_b98c1_row6_col4\" class=\"data row6 col4\" >-0.053600</td>\n",
+       "      <th id=\"T_4ca60_level0_row6\" class=\"row_heading level0 row6\" >192</th>\n",
+       "      <td id=\"T_4ca60_row6_col0\" class=\"data row6 col0\" >PC6</td>\n",
+       "      <td id=\"T_4ca60_row6_col1\" class=\"data row6 col1\" >0.040100</td>\n",
+       "      <td id=\"T_4ca60_row6_col2\" class=\"data row6 col2\" >-0.002200</td>\n",
+       "      <td id=\"T_4ca60_row6_col3\" class=\"data row6 col3\" >0.004300</td>\n",
+       "      <td id=\"T_4ca60_row6_col4\" class=\"data row6 col4\" >-0.053600</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_b98c1_level0_row7\" class=\"row_heading level0 row7\" >193</th>\n",
-       "      <td id=\"T_b98c1_row7_col0\" class=\"data row7 col0\" >PC1</td>\n",
-       "      <td id=\"T_b98c1_row7_col1\" class=\"data row7 col1\" >0.024700</td>\n",
-       "      <td id=\"T_b98c1_row7_col2\" class=\"data row7 col2\" >-0.056900</td>\n",
-       "      <td id=\"T_b98c1_row7_col3\" class=\"data row7 col3\" >0.051300</td>\n",
-       "      <td id=\"T_b98c1_row7_col4\" class=\"data row7 col4\" >-0.035600</td>\n",
+       "      <th id=\"T_4ca60_level0_row7\" class=\"row_heading level0 row7\" >193</th>\n",
+       "      <td id=\"T_4ca60_row7_col0\" class=\"data row7 col0\" >PC1</td>\n",
+       "      <td id=\"T_4ca60_row7_col1\" class=\"data row7 col1\" >0.024700</td>\n",
+       "      <td id=\"T_4ca60_row7_col2\" class=\"data row7 col2\" >-0.056900</td>\n",
+       "      <td id=\"T_4ca60_row7_col3\" class=\"data row7 col3\" >0.051300</td>\n",
+       "      <td id=\"T_4ca60_row7_col4\" class=\"data row7 col4\" >-0.035600</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_b98c1_level0_row8\" class=\"row_heading level0 row8\" >195</th>\n",
-       "      <td id=\"T_b98c1_row8_col0\" class=\"data row8 col0\" >PC5</td>\n",
-       "      <td id=\"T_b98c1_row8_col1\" class=\"data row8 col1\" >0.029900</td>\n",
-       "      <td id=\"T_b98c1_row8_col2\" class=\"data row8 col2\" >0.006500</td>\n",
-       "      <td id=\"T_b98c1_row8_col3\" class=\"data row8 col3\" >0.035800</td>\n",
-       "      <td id=\"T_b98c1_row8_col4\" class=\"data row8 col4\" >0.050200</td>\n",
+       "      <th id=\"T_4ca60_level0_row8\" class=\"row_heading level0 row8\" >195</th>\n",
+       "      <td id=\"T_4ca60_row8_col0\" class=\"data row8 col0\" >PC5</td>\n",
+       "      <td id=\"T_4ca60_row8_col1\" class=\"data row8 col1\" >0.029900</td>\n",
+       "      <td id=\"T_4ca60_row8_col2\" class=\"data row8 col2\" >0.006500</td>\n",
+       "      <td id=\"T_4ca60_row8_col3\" class=\"data row8 col3\" >0.035800</td>\n",
+       "      <td id=\"T_4ca60_row8_col4\" class=\"data row8 col4\" >0.050200</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th id=\"T_b98c1_level0_row9\" class=\"row_heading level0 row9\" >200</th>\n",
-       "      <td id=\"T_b98c1_row9_col0\" class=\"data row9 col0\" >PC1</td>\n",
-       "      <td id=\"T_b98c1_row9_col1\" class=\"data row9 col1\" >0.021200</td>\n",
-       "      <td id=\"T_b98c1_row9_col2\" class=\"data row9 col2\" >-0.056200</td>\n",
-       "      <td id=\"T_b98c1_row9_col3\" class=\"data row9 col3\" >0.005700</td>\n",
-       "      <td id=\"T_b98c1_row9_col4\" class=\"data row9 col4\" >0.072600</td>\n",
+       "      <th id=\"T_4ca60_level0_row9\" class=\"row_heading level0 row9\" >200</th>\n",
+       "      <td id=\"T_4ca60_row9_col0\" class=\"data row9 col0\" >PC1</td>\n",
+       "      <td id=\"T_4ca60_row9_col1\" class=\"data row9 col1\" >0.021200</td>\n",
+       "      <td id=\"T_4ca60_row9_col2\" class=\"data row9 col2\" >-0.056200</td>\n",
+       "      <td id=\"T_4ca60_row9_col3\" class=\"data row9 col3\" >0.005700</td>\n",
+       "      <td id=\"T_4ca60_row9_col4\" class=\"data row9 col4\" >0.072600</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n"
diff --git a/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py b/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py
index ecc20b72..e11044a7 100644
--- a/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py
+++ b/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py
@@ -37,7 +37,7 @@ class TestMineNegatives:
     def test_returns_boolean_mask_over_unlabeled(self):
         X_pos, X_unl = _make_data()
         dpul = aa.dPULearn(random_state=42, verbose=False)
-        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=10)
+        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=10)
         assert isinstance(mask, np.ndarray)
         assert mask.dtype == bool
         assert mask.shape == (X_unl.shape[0],)
@@ -46,35 +46,29 @@ def test_returns_boolean_mask_over_unlabeled(self):
     def test_X_pos_parameter(self):
         X_pos, X_unl = _make_data(n_pos=30)
         dpul = aa.dPULearn(random_state=42, verbose=False)
-        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=5)
+        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=5)
         assert mask.shape[0] == X_unl.shape[0]
 
     def test_X_unlabeled_parameter(self):
         X_pos, X_unl = _make_data(n_unl=70)
         dpul = aa.dPULearn(random_state=42, verbose=False)
-        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=12)
+        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=12)
         assert mask.shape[0] == 70
         assert mask.sum() == 12
 
-    def test_n_unl_to_neg_parameter(self):
+    def test_n_neg_parameter(self):
         X_pos, X_unl = _make_data()
         for n in (1, 5, 25):
             dpul = aa.dPULearn(random_state=42, verbose=False)
-            mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=n)
+            mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=n)
             assert mask.sum() == n
 
-    def test_n_neg_parameter(self):
-        X_pos, X_unl = _make_data()
-        dpul = aa.dPULearn(random_state=42, verbose=False)
-        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=8)
-        assert mask.sum() == 8
-
     def test_metric_parameter(self):
         X_pos, X_unl = _make_data()
         for metric in ("euclidean", "manhattan", "cosine"):
             dpul = aa.dPULearn(random_state=42, verbose=False)
             mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl,
-                                       n_unl_to_neg=10, metric=metric)
+                                       n_neg=10, metric=metric)
             assert mask.sum() == 10
 
     def test_n_components_parameter(self):
@@ -82,14 +76,14 @@ def test_n_components_parameter(self):
         for n_components in (2, 3, 0.5):
             dpul = aa.dPULearn(random_state=42, verbose=False)
             mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl,
-                                       n_unl_to_neg=10, n_components=n_components)
+                                       n_neg=10, n_components=n_components)
             assert mask.sum() == 10
 
     def test_instance_attributes_set(self):
         """After mining, labels_ / df_pu_ are set so the plotting class works."""
         X_pos, X_unl = _make_data()
         dpul = aa.dPULearn(random_state=42, verbose=False)
-        dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=10)
+        dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=10)
         assert dpul.labels_ is not None
         assert dpul.labels_.shape[0] == X_pos.shape[0] + X_unl.shape[0]
         assert dpul.df_pu_ is not None
@@ -104,7 +98,7 @@ def test_mask_equals_manual_pca(self, seed):
         X_pos, X_unl = _make_data(seed=seed)
         manual_mask, dpul_m = _manual_mask(X_pos, X_unl, n_unl_to_neg=10)
         dpul = aa.dPULearn(random_state=42, verbose=False)
-        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=10)
+        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=10)
         assert np.array_equal(mask, manual_mask)
         assert np.array_equal(np.asarray(dpul.labels_), np.asarray(dpul_m.labels_))
 
@@ -113,7 +107,7 @@ def test_mask_equals_manual_metric(self):
         manual_mask, _ = _manual_mask(X_pos, X_unl, n_unl_to_neg=8, metric="cosine")
         dpul = aa.dPULearn(random_state=42, verbose=False)
         mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl,
-                                   n_unl_to_neg=8, metric="cosine")
+                                   n_neg=8, metric="cosine")
         assert np.array_equal(mask, manual_mask)
 
     def test_mask_equals_manual_few_positives(self):
@@ -122,17 +116,9 @@ def test_mask_equals_manual_few_positives(self):
         X_pos, X_unl = _make_data(n_pos=1, seed=5)
         manual_mask, _ = _manual_mask(X_pos, X_unl, n_unl_to_neg=6)
         dpul = aa.dPULearn(random_state=42, verbose=False)
-        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=6)
+        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=6)
         assert np.array_equal(mask, manual_mask)
 
-    def test_n_neg_equivalent_to_n_unl_to_neg_without_pre_neg(self):
-        X_pos, X_unl = _make_data()
-        dpul_a = aa.dPULearn(random_state=42, verbose=False)
-        mask_a = dpul_a.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=10)
-        dpul_b = aa.dPULearn(random_state=42, verbose=False)
-        mask_b = dpul_b.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=10)
-        assert np.array_equal(mask_a, mask_b)
-
 
 # Negative Cases Test Class
 class TestMineNegativesNegative:
@@ -143,37 +129,31 @@ def test_feature_mismatch(self):
         _, X_unl = _make_data(n_features=6)
         dpul = aa.dPULearn(random_state=42, verbose=False)
         with pytest.raises(ValueError):
-            dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=5)
-
-    def test_both_counts_given(self):
-        X_pos, X_unl = _make_data()
-        dpul = aa.dPULearn(random_state=42, verbose=False)
-        with pytest.raises(ValueError):
-            dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=5, n_unl_to_neg=5)
+            dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=5)
 
-    def test_neither_count_given(self):
+    def test_n_neg_below_one(self):
         X_pos, X_unl = _make_data()
         dpul = aa.dPULearn(random_state=42, verbose=False)
         with pytest.raises(ValueError):
-            dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl)
+            dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=0)
 
     def test_too_many_negatives_requested(self):
         X_pos, X_unl = _make_data(n_unl=10)
         dpul = aa.dPULearn(random_state=42, verbose=False)
         with pytest.raises(ValueError):
-            dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_unl_to_neg=999)
+            dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=999)
 
     def test_X_pos_none(self):
         _, X_unl = _make_data()
         dpul = aa.dPULearn(random_state=42, verbose=False)
         with pytest.raises(ValueError):
-            dpul.mine_negatives(X_pos=None, X_unlabeled=X_unl, n_unl_to_neg=5)
+            dpul.mine_negatives(X_pos=None, X_unlabeled=X_unl, n_neg=5)
 
     def test_X_unlabeled_none(self):
         X_pos, _ = _make_data()
         dpul = aa.dPULearn(random_state=42, verbose=False)
         with pytest.raises(ValueError):
-            dpul.mine_negatives(X_pos=X_pos, X_unlabeled=None, n_unl_to_neg=5)
+            dpul.mine_negatives(X_pos=X_pos, X_unlabeled=None, n_neg=5)
 
 
 # Existing-fit byte-identical regression

From 6158eab340df5120903b1f77ad4345351982dfb2 Mon Sep 17 00:00:00 2001
From: Stephan Breimann <stephanbreimann@gmail.com>
Date: Wed, 1 Jul 2026 05:31:49 +0200
Subject: [PATCH 6/6] round5(dpulearn): validate n_neg in the frontend for a
 correctly-named error

mine_negatives delegated n_neg validation to fit (which receives it as
n_unl_to_neg), so an invalid n_neg raised an error naming the internal parameter.
Validate n_neg explicitly in the frontend (integer >= 1) so the message names
n_neg, and assert that name in the n_neg-below-one negative test.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 aaanalysis/pu_learning/_dpulearn.py                       | 2 ++
 tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/aaanalysis/pu_learning/_dpulearn.py b/aaanalysis/pu_learning/_dpulearn.py
index 40d44b44..12da26f6 100644
--- a/aaanalysis/pu_learning/_dpulearn.py
+++ b/aaanalysis/pu_learning/_dpulearn.py
@@ -435,6 +435,8 @@ def mine_negatives(self,
         X_pos = ut.check_X(X=X_pos, X_name="X_pos", min_n_samples=1)
         X_unlabeled = ut.check_X(X=X_unlabeled, X_name="X_unlabeled", min_n_samples=1)
         check_match_X_pos_X_unlabeled(X_pos=X_pos, X_unlabeled=X_unlabeled)
+        # Validate n_neg here so the error message names 'n_neg' (fit receives it as 'n_unl_to_neg')
+        ut.check_number_range(name="n_neg", val=n_neg, min_val=1, just_int=True)
         # Stack positives over the unlabeled pool and fit with the package PU markers
         n_pos = X_pos.shape[0]
         X = np.vstack([X_pos, X_unlabeled])
diff --git a/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py b/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py
index e11044a7..2accb502 100644
--- a/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py
+++ b/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py
@@ -134,7 +134,8 @@ def test_feature_mismatch(self):
     def test_n_neg_below_one(self):
         X_pos, X_unl = _make_data()
         dpul = aa.dPULearn(random_state=42, verbose=False)
-        with pytest.raises(ValueError):
+        # The error must name 'n_neg', not the internal 'n_unl_to_neg' that fit receives.
+        with pytest.raises(ValueError, match="n_neg"):
             dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=0)
 
     def test_too_many_negatives_requested(self):