diff --git a/aaanalysis/__init__.py b/aaanalysis/__init__.py
index 4fc1edee..17989828 100644
--- a/aaanalysis/__init__.py
+++ b/aaanalysis/__init__.py
@@ -1,4 +1,4 @@
-from .data_handling import (load_dataset, load_scales, load_features,
+from .data_handling import (load_dataset, load_scales, load_features, get_labels,
read_fasta, to_fasta,
SequencePreprocessor,
EmbeddingPreprocessor,
@@ -14,6 +14,8 @@
comp_per_protein_ap, comp_detection_metrics,
comp_bootstrap_ci, comp_smooth_scores)
from .config import options
+from ._constants import (COLOR_SAMPLES_POS, COLOR_SAMPLES_NEG,
+ COLOR_SAMPLES_UNL, COLOR_SAMPLES_REL_NEG)
from importlib.metadata import version as _version, PackageNotFoundError
@@ -28,6 +30,7 @@
"load_dataset",
"load_scales",
"load_features",
+ "get_labels",
"read_fasta",
"to_fasta",
"SequencePreprocessor",
@@ -72,6 +75,10 @@
"comp_detection_metrics",
"comp_bootstrap_ci",
"comp_smooth_scores",
+ "COLOR_SAMPLES_POS",
+ "COLOR_SAMPLES_NEG",
+ "COLOR_SAMPLES_UNL",
+ "COLOR_SAMPLES_REL_NEG",
"options"
]
diff --git a/aaanalysis/_constants.py b/aaanalysis/_constants.py
index 2c22e606..2ce81416 100644
--- a/aaanalysis/_constants.py
+++ b/aaanalysis/_constants.py
@@ -478,6 +478,15 @@ def _folder_path(super_folder, folder_name):
COLOR_NEG = "#ad4570" # (173,69,112)
COLOR_REL_NEG = "#ad9745" # (173, 151, 69)
+# Public, named aliases for the canonical sample-group colors (positive / negative /
+# unlabeled / reliable-negative). They mirror the ``DICT_COLOR["SAMPLES_*"]`` entries
+# exactly, so users can reference a named constant (``aa.COLOR_SAMPLES_POS``) instead
+# of indexing ``plot_get_cdict("DICT_COLOR")`` by string key.
+COLOR_SAMPLES_POS = COLOR_POS
+COLOR_SAMPLES_NEG = COLOR_NEG
+COLOR_SAMPLES_UNL = COLOR_UNL
+COLOR_SAMPLES_REL_NEG = COLOR_REL_NEG
+
DICT_COLOR = {"SHAP_POS": COLOR_SHAP_POS,
"SHAP_NEG": COLOR_SHAP_NEG,
"FEAT_POS": COLOR_FEAT_POS,
diff --git a/aaanalysis/data_handling/__init__.py b/aaanalysis/data_handling/__init__.py
index 1fad9397..7ea8c663 100644
--- a/aaanalysis/data_handling/__init__.py
+++ b/aaanalysis/data_handling/__init__.py
@@ -1,8 +1,8 @@
"""
Data loading and sequence/embedding preprocessing — the package's data entry point.
-Public objects: load_dataset, load_scales, load_features, read_fasta, to_fasta,
-SequencePreprocessor, EmbeddingPreprocessor, combine_dict_nums.
+Public objects: load_dataset, load_scales, load_features, get_labels, read_fasta,
+to_fasta, SequencePreprocessor, EmbeddingPreprocessor, combine_dict_nums.
Produces the core data objects the rest of the pipeline consumes: ``load_dataset``
yields ``df_seq``, ``load_scales`` yields ``df_scales`` (fed to
``feature_engineering.AAclust`` / ``CPP``), ``load_features`` yields a reference
@@ -17,6 +17,7 @@
from ._load_dataset import load_dataset
from ._load_scales import load_scales
from ._load_features import load_features
+from ._get_labels import get_labels
from ._read_fasta import read_fasta
from ._to_fasta import to_fasta
from ._seq_preproc import SequencePreprocessor
@@ -27,6 +28,7 @@
"load_dataset",
"load_scales",
"load_features",
+ "get_labels",
"read_fasta",
"to_fasta",
"SequencePreprocessor",
diff --git a/aaanalysis/data_handling/_get_labels.py b/aaanalysis/data_handling/_get_labels.py
new file mode 100644
index 00000000..f23d8423
--- /dev/null
+++ b/aaanalysis/data_handling/_get_labels.py
@@ -0,0 +1,70 @@
+"""
+This is a script for the frontend of the get_labels function, deriving a binary
+label vector from a sequence DataFrame's label column.
+"""
+from typing import Any
+import numpy as np
+import pandas as pd
+
+import aaanalysis.utils as ut
+
+
+# I Helper Functions
+def check_match_df_positive_label(df=None, col_label=None, positive_label=None) -> None:
+    """
+    Check that the positive label value is present in the label column.
+
+    Raises a ``ValueError`` if ``positive_label`` does not occur among the values of
+    ``df[col_label]``, or if the membership test is impossible because ``positive_label``
+    or a column value is unhashable (e.g. a ``list``).
+    """
+    try:
+        present = set(df[col_label].tolist())
+        is_present = positive_label in present
+    except TypeError as e:
+        # Unhashable input would otherwise surface as a bare TypeError; report it as the
+        # same ValueError that is raised for any other invalid 'positive_label'
+        raise ValueError(f"'positive_label' ({positive_label}) and the values of column "
+                         f"'{col_label}' should be hashable scalars (e.g., int or str).") from e
+    if not is_present:
+        # Sort by string form so mixed-type columns can still be listed in the message
+        raise ValueError(f"'positive_label' ({positive_label}) is not among the values of "
+                         f"column '{col_label}' ({sorted(present, key=str)}).")
+
+
+# II Main Functions
+def get_labels(df: pd.DataFrame,
+               positive_label: Any = 1,
+               col_label: str = "label",
+               ) -> np.ndarray:
+    """
+    Binarize a label column of a sequence DataFrame into a ``1`` / ``0`` ``int`` vector.
+
+    Every row whose ``col_label`` value equals ``positive_label`` is encoded as ``1`` and
+    every other row as ``0``. This is the binary encoding consumed across the package
+    (e.g. by :meth:`CPP.run`, :class:`TreeModel`, and the ``labels`` argument of most
+    tools), and replaces the recurring ``(df[col] == x).astype(int).to_numpy()``
+    expression with a single validated call.
+
+    .. versionadded:: 1.1.0
+
+    Parameters
+    ----------
+    df : pd.DataFrame, shape (n_samples, n_seq_info)
+        Sequence DataFrame (``df_seq``) containing the label column ``col_label``.
+    positive_label : int or str, default=1
+        Value in ``col_label`` that marks the positive class. Rows equal to it become
+        ``1``, all remaining rows become ``0``. Must occur in ``col_label``; otherwise a
+        ``ValueError`` is raised.
+    col_label : str, default='label'
+        Name of the column holding the (multi-value or already binary) labels.
+
+    Returns
+    -------
+    labels : np.ndarray, shape (n_samples,)
+        Binary ``int`` label vector (``1`` = positive, ``0`` = otherwise), row-aligned
+        to ``df``.
+
+    Notes
+    -----
+    * The result equals ``(df[col_label] == positive_label).astype(int).to_numpy()``.
+    * Pass the resulting vector directly as the ``labels`` argument of CPP, TreeModel,
+      or other tools. For Positive-Unlabeled mining keep the package ``1`` (positive) /
+      ``2`` (unlabeled) markers instead and use :meth:`dPULearn.mine_negatives`.
+
+    Examples
+    --------
+    .. include:: examples/get_labels.rst
+    """
+    # Check input (column name first, because the DataFrame check requires that column)
+    ut.check_str(name="col_label", val=col_label, accept_none=False)
+    ut.check_df(name="df", df=df, cols_required=col_label)
+    check_match_df_positive_label(df=df, col_label=col_label, positive_label=positive_label)
+    # Element-wise comparison gives a boolean Series; cast to int and export as array
+    is_positive = df[col_label] == positive_label
+    return is_positive.astype(int).to_numpy()
diff --git a/aaanalysis/pu_learning/_dpulearn.py b/aaanalysis/pu_learning/_dpulearn.py
index 23b5f729..12da26f6 100644
--- a/aaanalysis/pu_learning/_dpulearn.py
+++ b/aaanalysis/pu_learning/_dpulearn.py
@@ -133,6 +133,15 @@ def check_match_X_X_neg(X=None, X_neg=None) -> None:
raise ValueError(f"'n_features' does not match between 'X' (n={n_features}) and 'X_neg' (n={n_features_neg})")
+def check_match_X_pos_X_unlabeled(X_pos=None, X_unlabeled=None) -> None:
+    """Raise a ``ValueError`` if ``X_pos`` and ``X_unlabeled`` differ in their number of features."""
+    # The second entry of each 2D shape is the feature dimension being compared
+    n_features_pos, n_features_unl = X_pos.shape[1], X_unlabeled.shape[1]
+    if n_features_pos == n_features_unl:
+        return
+    raise ValueError(f"'n_features' does not match between 'X_pos' (n={n_features_pos}) and "
+                     f"'X_unlabeled' (n={n_features_unl})")
+
+
# II Main Functions
class dPULearn(Wrapper):
"""
@@ -358,6 +367,87 @@ def fit(self,
self.df_pu_ = df_pu
return self
+    def mine_negatives(self,
+                       X_pos: ut.ArrayLike2D,
+                       X_unlabeled: ut.ArrayLike2D,
+                       n_neg: int,
+                       metric: Optional[Literal["euclidean", "manhattan", "cosine"]] = None,
+                       n_components: Union[float, int] = 0.80,
+                       ) -> np.ndarray:
+        """
+        Mine reliable negatives from an unlabeled pool given the positives, in one call.
+
+        Convenience wrapper around :meth:`dPULearn.fit` for the common positive/unlabeled
+        setup: instead of stacking ``X_pos`` and ``X_unlabeled`` by hand, building a
+        ``1`` (positive) / ``2`` (unlabeled) label vector, fitting, and slicing the mined
+        rows back out by index, pass the two feature matrices separately and receive a
+        **boolean mask over the rows of** ``X_unlabeled`` flagging the identified reliable
+        negatives. The mask equals ``labels_[len(X_pos):] == 0`` from the manual stacking
+        path exactly.
+
+        After the call the instance is fitted: :attr:`dPULearn.labels_` (over the stacked
+        ``X_pos`` then ``X_unlabeled``) and :attr:`dPULearn.df_pu_` are set, so the
+        :class:`dPULearnPlot` methods work as usual.
+
+        .. versionadded:: 1.1.0
+
+        Parameters
+        ----------
+        X_pos : array-like, shape (n_pos, n_features)
+            Feature matrix of the positive samples.
+        X_unlabeled : array-like, shape (n_unl, n_features)
+            Feature matrix of the unlabeled samples (the candidate pool). Must have the
+            same number of features as ``X_pos``.
+        n_neg : int
+            Number of reliable negatives to identify from the unlabeled pool (>= 1). Must
+            not exceed the number of unlabeled samples; otherwise a ``ValueError`` naming
+            ``n_neg`` is raised before fitting.
+        metric : str or None, optional
+            Distance metric for distance-based identification (``euclidean``,
+            ``manhattan``, ``cosine``). If ``None``, PCA-based identification is performed.
+        n_components : int or float, default=0.80
+            Number of principal components (int >= 1) or fraction of variance covered
+            (float in ``(0.0, 1.0)``) when PCA is applied.
+
+        Returns
+        -------
+        mask_neg : np.ndarray of bool, shape (n_unl,)
+            Boolean mask over the rows of ``X_unlabeled``: ``True`` marks an identified
+            reliable negative. ``X_unlabeled[mask_neg]`` are the mined negatives.
+
+        Notes
+        -----
+        * This is purely additive sugar: it stacks the inputs and calls
+          :meth:`dPULearn.fit` with ``label_pos=1`` / ``label_unl=2`` internally, so the
+          identification result is identical to the manual path.
+
+        See Also
+        --------
+        * :meth:`dPULearn.fit`: the underlying fit on a stacked matrix and label vector.
+        * :func:`get_labels`: derive a binary label vector from a sequence DataFrame.
+
+        Examples
+        --------
+        .. include:: examples/dpul_mine_negatives.rst
+        """
+        # Check input (the >=3 sample floor applies to the stacked matrix, enforced by
+        # 'fit' below, so per-matrix validation only coerces + checks the feature dimension;
+        # this keeps mine_negatives accepting exactly what the manual stacking path accepts)
+        X_pos = ut.check_X(X=X_pos, X_name="X_pos", min_n_samples=1)
+        X_unlabeled = ut.check_X(X=X_unlabeled, X_name="X_unlabeled", min_n_samples=1)
+        check_match_X_pos_X_unlabeled(X_pos=X_pos, X_unlabeled=X_unlabeled)
+        n_pos, n_unl = X_pos.shape[0], X_unlabeled.shape[0]
+        # Validate n_neg here so the message names 'n_neg' (fit sees it as 'n_unl_to_neg');
+        # this covers both bounds, including the documented "must not exceed the pool" limit
+        ut.check_number_range(name="n_neg", val=n_neg, min_val=1, just_int=True)
+        if n_neg > n_unl:
+            raise ValueError(f"'n_neg' ({n_neg}) should not exceed the number of unlabeled "
+                             f"samples in 'X_unlabeled' (n={n_unl}).")
+        # Stack positives over the unlabeled pool and fit with the package PU markers
+        X = np.vstack([X_pos, X_unlabeled])
+        labels = np.array([1] * n_pos + [2] * n_unl)
+        # No pre-labeled negatives here, so n_neg is exactly the count to draw from the pool
+        self.fit(X=X, labels=labels, label_pos=1, label_unl=2,
+                 n_unl_to_neg=n_neg, metric=metric, n_components=n_components)
+        # Slice the mined reliable negatives (label 0) back out of the unlabeled block
+        mask_neg = np.asarray(self.labels_)[n_pos:] == 0
+        return mask_neg
+
@staticmethod
def eval(X: ut.ArrayLike2D,
list_labels: ut.ArrayLike2D,
diff --git a/docs/_cheatsheet/content.py b/docs/_cheatsheet/content.py
index 801adc7f..0d04b6eb 100644
--- a/docs/_cheatsheet/content.py
+++ b/docs/_cheatsheet/content.py
@@ -188,6 +188,7 @@
("Load benchmark sequences", "load_dataset(name) → df_seq", None),
("Load AAontology scales", "load_scales() → df_scales", None),
("Load precomputed features", "load_features(name) → df_feat", None),
+ ("Binary labels from df column", "get_labels(df, positive_label) → labels", None, "v1.1"),
("Read / write FASTA", "read_fasta(file) → df_seq", None),
("Cluster redundant homologs", "filter_seq(df_seq) → df_clust [pro]", None),
]},
@@ -221,6 +222,7 @@
{"name": "Modeling & Explainability", "tag": "PU · classify · SHAP",
"rows": [
("Train with positives + unlabeled data", "dPULearn().fit(X, labels) [Wrapper]", None),
+ ("Mine reliable negatives (mask)", "dPULearn().mine_negatives(X_pos, X_unlabeled, n_neg) → mask", None, "v1.1"),
("Train + RFE + MC importance", "TreeModel().fit(X, labels) [Wrapper]", None),
("Per-feature / sample SHAP impact", "ShapModel().fit(X, labels) [pro]", None),
]},
diff --git a/docs/source/index/release_notes.rst b/docs/source/index/release_notes.rst
index 89fa7d74..817f4b56 100644
--- a/docs/source/index/release_notes.rst
+++ b/docs/source/index/release_notes.rst
@@ -35,6 +35,11 @@ Added
per-residue PTM and functional-site annotations and encodes them into tensors
(``fetch_uniprot``, ``ingest``, ``register_feature``, ``encode``, ``build_scales``,
``build_cat``, ``to_df_seq``).
+- :func:`~aaanalysis.get_labels`: Derives a binary ``int`` label vector from a sequence DataFrame's
+  label column (``positive_label`` mapped to ``1``, everything else to ``0``) — the
+  single-call form of the recurring ``(df[col] == x).astype(int).to_numpy()`` expression.
- :func:`~aaanalysis.combine_dict_nums`: Concatenates per-residue tensors (embedding / structure /
annotation) along the feature axis into one combined :meth:`~aaanalysis.CPP.run_num` input.
@@ -132,6 +137,16 @@ Added
switches the pre-computed prediction per P1 (feature map + structure restyle) with no kernel,
keeping the column-residue linking (warned past 40 sites, hard-capped at 200).
+**PU Learning**
+
+- :meth:`~aaanalysis.dPULearn.mine_negatives`: One-call convenience over ``dPULearn.fit`` for the
+  common positive / unlabeled setup. Pass ``X_pos`` and ``X_unlabeled`` separately instead of
+  stacking them by hand, building a ``1`` / ``2`` label vector, fitting, and slicing the
+  mined rows back out; it returns the **boolean mask over the rows of** ``X_unlabeled``
+  flagging the identified reliable negatives (equal to the manual
+  ``labels_[len(X_pos):] == 0`` result exactly). The instance is left fitted (``labels_`` /
+  ``df_pu_`` set, so ``dPULearnPlot`` works), and the existing ``fit(X, labels=...)`` path is
+  unchanged.
+
**Sequence Analysis**
- :class:`~aaanalysis.AAWindowSampler`: Samples fixed-length sequence windows for PU-learning and
@@ -191,6 +206,10 @@ Added
- :func:`~aaanalysis.plot_rank`: Standalone per-protein max-score-vs-rank scatter with group coloring and
optional threshold lines (pairs with the new ``aa.metrics`` functions).
+- **COLOR_SAMPLES_POS / COLOR_SAMPLES_NEG / COLOR_SAMPLES_UNL / COLOR_SAMPLES_REL_NEG**:
+ Public, named constants for the canonical sample-group colors (positive / negative /
+ unlabeled / reliable-negative). They equal the ``plot_get_cdict("DICT_COLOR")["SAMPLES_*"]``
+ values exactly, so a named constant replaces indexing the color dict by string key.
**Golden Pipelines**
diff --git a/examples/data_handling/get_labels.ipynb b/examples/data_handling/get_labels.ipynb
new file mode 100644
index 00000000..ef84686a
--- /dev/null
+++ b/examples/data_handling/get_labels.ipynb
@@ -0,0 +1,297 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "30e77e51",
+ "metadata": {},
+ "source": [
+ "The ``get_labels`` function derives a binary ``int`` label vector from a column of a sequence DataFrame (``df_seq``). It is the single-call form of the recurring ``(df[col] == positive_label).astype(int).to_numpy()`` expression: the value flagged as positive becomes ``1`` and every other value becomes ``0``."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "14210edc",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-06-30T23:29:34.142992Z",
+ "iopub.status.busy": "2026-06-30T23:29:34.142637Z",
+ "iopub.status.idle": "2026-06-30T23:29:35.599732Z",
+ "shell.execute_reply": "2026-06-30T23:29:35.599479Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "DataFrame shape: (10, 8)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | | \n",
+ " entry | \n",
+ " sequence | \n",
+ " label | \n",
+ " tmd_start | \n",
+ " tmd_stop | \n",
+ " jmd_n | \n",
+ " tmd | \n",
+ " jmd_c | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 1 | \n",
+ " P05067 | \n",
+ " MLPGLALLLLAAWTA...GYENPTYKFFEQMQN | \n",
+ " 1 | \n",
+ " 701 | \n",
+ " 723 | \n",
+ " FAEDVGSNKG | \n",
+ " AIIGLMVGGVVIATVIVITLVML | \n",
+ " KKKQYTSIHH | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " P14925 | \n",
+ " MAGRARSGLLLLLLG...EEEYSAPLPKPAPSS | \n",
+ " 1 | \n",
+ " 868 | \n",
+ " 890 | \n",
+ " KLSTEPGSGV | \n",
+ " SVVLITTLLVIPVLVLLAIVMFI | \n",
+ " RWKKSRAFGD | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " P70180 | \n",
+ " MRSLLLFTFSACVLL...RELREDSIRSHFSVA | \n",
+ " 1 | \n",
+ " 477 | \n",
+ " 499 | \n",
+ " PCKSSGGLEE | \n",
+ " SAVTGIVVGALLGAGLLMAFYFF | \n",
+ " RKKYRITIER | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Q03157 | \n",
+ " MGPTSPAARGQGRRW...HGYENPTYRFLEERP | \n",
+ " 1 | \n",
+ " 585 | \n",
+ " 607 | \n",
+ " APSGTGVSRE | \n",
+ " ALSGLLIMGAGGGSLIVLSLLLL | \n",
+ " RKKKPYGTIS | \n",
+ "
\n",
+ " \n",
+ " | 5 | \n",
+ " Q06481 | \n",
+ " MAATGTAAAAATGRL...GYENPTYKYLEQMQI | \n",
+ " 1 | \n",
+ " 694 | \n",
+ " 716 | \n",
+ " LREDFSLSSS | \n",
+ " ALIGLLVIAVAIATVIVISLVML | \n",
+ " RKRQYGTISH | \n",
+ "
\n",
+ " \n",
+ " | 6 | \n",
+ " P12821 | \n",
+ " MGAASGRRGPGLLLP...SHGPQFGSEVELRHS | \n",
+ " 2 | \n",
+ " 1257 | \n",
+ " 1276 | \n",
+ " GLDLDAQQAR | \n",
+ " VGQWLLLFLGIALLVATLGL | \n",
+ " SQRLFSIRHR | \n",
+ "
\n",
+ " \n",
+ " | 7 | \n",
+ " P36896 | \n",
+ " MAESAGASSFFPLVV...KKTLSQLSVQEDVKI | \n",
+ " 2 | \n",
+ " 127 | \n",
+ " 149 | \n",
+ " EHPSMWGPVE | \n",
+ " LVGIIAGPVFLLFLIIIIVFLVI | \n",
+ " NYHQRVYHNR | \n",
+ "
\n",
+ " \n",
+ " | 8 | \n",
+ " Q8NER5 | \n",
+ " MTRALCSALRQALLL...KKTISQLCVKEDCKA | \n",
+ " 2 | \n",
+ " 114 | \n",
+ " 136 | \n",
+ " PNAPKLGPME | \n",
+ " LAIIITVPVCLLSIAAMLTVWAC | \n",
+ " QGRQCSYRKK | \n",
+ "
\n",
+ " \n",
+ " | 9 | \n",
+ " P37023 | \n",
+ " MTLGSPRKGLLMLLM...LQKISNSPEKPKVIQ | \n",
+ " 2 | \n",
+ " 119 | \n",
+ " 141 | \n",
+ " PSEQPGTDGQ | \n",
+ " LALILGPVLALLALVALGVLGLW | \n",
+ " HVRRRQEKQR | \n",
+ "
\n",
+ " \n",
+ " | 10 | \n",
+ " O43184 | \n",
+ " MAARPLPVSPARALL...YPHQVPRSTHTAYIK | \n",
+ " 2 | \n",
+ " 707 | \n",
+ " 729 | \n",
+ " DSGPIRQADN | \n",
+ " QGLTIGILVTILCLLAAGFVVYL | \n",
+ " KRKTLIRLLF | \n",
+ "
\n",
+ " \n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "import aaanalysis as aa\n",
+ "aa.options[\"verbose\"] = False\n",
+ "\n",
+ "# A Positive-Unlabeled (PU) dataset: substrates (1) and unlabeled others (2).\n",
+ "df_seq = aa.load_dataset(name=\"DOM_GSEC_PU\", n=5)\n",
+ "aa.display_df(df=df_seq, n_rows=10, show_shape=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e84a0e34",
+ "metadata": {},
+ "source": [
+ "By default ``positive_label=1``: substrates map to ``1`` and everything else to ``0``."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "a5ccd25b",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-06-30T23:29:35.600825Z",
+ "iopub.status.busy": "2026-06-30T23:29:35.600758Z",
+ "iopub.status.idle": "2026-06-30T23:29:35.602749Z",
+ "shell.execute_reply": "2026-06-30T23:29:35.602535Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[1 1 1 1 1 0 0 0 0 0]\n"
+ ]
+ }
+ ],
+ "source": [
+ "labels = aa.get_labels(df=df_seq, positive_label=1)\n",
+ "print(labels)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5db29b65",
+ "metadata": {},
+ "source": [
+ "Pick any value as the positive class via ``positive_label`` (e.g. treat the unlabeled ``2`` as positive), and select a different column with ``col_label``. The result equals the manual ``(df[col_label] == positive_label).astype(int).to_numpy()`` expression."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "d9a49747",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-06-30T23:29:35.603686Z",
+ "iopub.status.busy": "2026-06-30T23:29:35.603626Z",
+ "iopub.status.idle": "2026-06-30T23:29:35.605515Z",
+ "shell.execute_reply": "2026-06-30T23:29:35.605342Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "[0 0 0 0 0 1 1 1 1 1]\n"
+ ]
+ }
+ ],
+ "source": [
+ "labels_unl = aa.get_labels(df=df_seq, positive_label=2, col_label=\"label\")\n",
+ "print(labels_unl)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "82fd2bd1",
+ "metadata": {},
+ "source": [
+ "Pass the resulting vector straight into the ``labels`` argument of tools such as :meth:`CPP.run` or :class:`TreeModel`."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.13.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/pu_learning/dpul_mine_negatives.ipynb b/examples/pu_learning/dpul_mine_negatives.ipynb
new file mode 100644
index 00000000..fc56d1fc
--- /dev/null
+++ b/examples/pu_learning/dpul_mine_negatives.ipynb
@@ -0,0 +1,314 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "01ccf3ee",
+ "metadata": {},
+ "source": [
+ "The ``dPULearn().mine_negatives()`` method mines reliable negatives from an unlabeled pool in one call. Instead of stacking the positive and unlabeled feature matrices by hand, building a ``1`` (positive) / ``2`` (unlabeled) label vector, fitting, and slicing the mined rows back out by index, you pass the two matrices separately and receive a boolean mask over the rows of ``X_unlabeled``."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "710fdc35",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-07-01T03:27:51.939100Z",
+ "iopub.status.busy": "2026-07-01T03:27:51.937608Z",
+ "iopub.status.idle": "2026-07-01T03:28:11.600218Z",
+ "shell.execute_reply": "2026-07-01T03:28:11.552756Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "positives: 63 | unlabeled: 631\n"
+ ]
+ }
+ ],
+ "source": [
+ "import aaanalysis as aa\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "aa.options[\"verbose\"] = False\n",
+ "\n",
+ "# Build a CPP feature matrix for the gamma-secretase PU dataset (substrates vs unlabeled others).\n",
+ "df_seq = aa.load_dataset(name=\"DOM_GSEC_PU\")\n",
+ "df_feat = aa.load_features(name=\"DOM_GSEC\")\n",
+ "sf = aa.SequenceFeature()\n",
+ "X = sf.feature_matrix(features=df_feat[\"feature\"], df_parts=sf.get_df_parts(df_seq=df_seq))\n",
+ "labels = df_seq[\"label\"].to_numpy()\n",
+ "\n",
+ "# Split into the positive (1) and unlabeled (2) feature blocks.\n",
+ "X_pos = X[labels == 1]\n",
+ "X_unl = X[labels == 2]\n",
+ "print(f\"positives: {len(X_pos)} | unlabeled: {len(X_unl)}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "38f3d31d",
+ "metadata": {},
+ "source": [
+ "Mine a fixed number of reliable negatives from the unlabeled pool with ``n_neg``. The returned boolean ``mask_neg`` flags, over the rows of ``X_unlabeled``, which unlabeled samples were identified as reliable negatives."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "bdccaa03",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-07-01T03:28:11.767224Z",
+ "iopub.status.busy": "2026-07-01T03:28:11.766642Z",
+ "iopub.status.idle": "2026-07-01T03:28:11.816538Z",
+ "shell.execute_reply": "2026-07-01T03:28:11.795116Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "mined reliable negatives: 49 of 631 unlabeled\n"
+ ]
+ }
+ ],
+ "source": [
+ "dpul = aa.dPULearn(random_state=42)\n",
+ "mask_neg = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=49)\n",
+ "print(f\"mined reliable negatives: {int(mask_neg.sum())} of {len(X_unl)} unlabeled\")\n",
+ "X_neg = X_unl[mask_neg] # the mined feature rows"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "315d4798",
+ "metadata": {},
+ "source": [
+ "The mask equals the manual stacking path exactly: stack ``X_pos`` over ``X_unlabeled``, fit with ``1`` / ``2`` labels, and slice ``labels_[len(X_pos):] == 0``."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "c78d6eab",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-07-01T03:28:11.824245Z",
+ "iopub.status.busy": "2026-07-01T03:28:11.824003Z",
+ "iopub.status.idle": "2026-07-01T03:28:12.020230Z",
+ "shell.execute_reply": "2026-07-01T03:28:11.952823Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "mask equals manual path: True\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Stack X_pos over X_unlabeled, fit with 1 / 2 labels, and slice labels_[len(X_pos):] == 0.\n",
+ "labels_manual = np.asarray(\n",
+ " aa.dPULearn(random_state=42)\n",
+ " .fit(X=np.vstack([X_pos, X_unl]),\n",
+ " labels=np.array([1] * len(X_pos) + [2] * len(X_unl)),\n",
+ " n_unl_to_neg=49)\n",
+ " .labels_)\n",
+ "mask_manual = labels_manual[len(X_pos):] == 0\n",
+ "print(\"mask equals manual path:\", np.array_equal(mask_neg, mask_manual))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cae2d79d",
+ "metadata": {},
+ "source": [
+ "After mining, the instance is fitted: ``labels_`` (over the stacked positives then unlabeled) and ``df_pu_`` are set, so the :class:`dPULearnPlot` methods work as usual. Pass ``n_neg`` (the number of reliable negatives to mine) and optionally a distance ``metric`` (``euclidean`` / ``manhattan`` / ``cosine``) for distance-based identification instead of the default PCA."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "85eff05c",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2026-07-01T03:28:12.179951Z",
+ "iopub.status.busy": "2026-07-01T03:28:12.179661Z",
+ "iopub.status.idle": "2026-07-01T03:28:12.745973Z",
+ "shell.execute_reply": "2026-07-01T03:28:12.744957Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2 582\n",
+ "1 63\n",
+ "0 49\n",
+ "Name: count, dtype: int64\n",
+ "DataFrame shape: (49, 15)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " | | \n",
+ " selection_via | \n",
+ " PC1 (56.2%) | \n",
+ " PC2 (7.4%) | \n",
+ " PC3 (2.9%) | \n",
+ " PC4 (2.8%) | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 84 | \n",
+ " PC1 | \n",
+ " 0.021000 | \n",
+ " -0.047800 | \n",
+ " 0.075200 | \n",
+ " -0.005400 | \n",
+ "
\n",
+ " \n",
+ " | 95 | \n",
+ " PC2 | \n",
+ " 0.032000 | \n",
+ " -0.082100 | \n",
+ " 0.025800 | \n",
+ " -0.037700 | \n",
+ "
\n",
+ " \n",
+ " | 109 | \n",
+ " PC1 | \n",
+ " 0.026100 | \n",
+ " -0.058500 | \n",
+ " 0.075700 | \n",
+ " -0.020900 | \n",
+ "
\n",
+ " \n",
+ " | 158 | \n",
+ " PC1 | \n",
+ " 0.023500 | \n",
+ " -0.060700 | \n",
+ " 0.054000 | \n",
+ " 0.000900 | \n",
+ "
\n",
+ " \n",
+ " | 161 | \n",
+ " PC1 | \n",
+ " 0.025900 | \n",
+ " 0.031400 | \n",
+ " 0.044900 | \n",
+ " 0.055400 | \n",
+ "
\n",
+ " \n",
+ " | 170 | \n",
+ " PC1 | \n",
+ " 0.026100 | \n",
+ " -0.035300 | \n",
+ " 0.058300 | \n",
+ " 0.025800 | \n",
+ "
\n",
+ " \n",
+ " | 192 | \n",
+ " PC6 | \n",
+ " 0.040100 | \n",
+ " -0.002200 | \n",
+ " 0.004300 | \n",
+ " -0.053600 | \n",
+ "
\n",
+ " \n",
+ " | 193 | \n",
+ " PC1 | \n",
+ " 0.024700 | \n",
+ " -0.056900 | \n",
+ " 0.051300 | \n",
+ " -0.035600 | \n",
+ "
\n",
+ " \n",
+ " | 195 | \n",
+ " PC5 | \n",
+ " 0.029900 | \n",
+ " 0.006500 | \n",
+ " 0.035800 | \n",
+ " 0.050200 | \n",
+ "
\n",
+ " \n",
+ " | 200 | \n",
+ " PC1 | \n",
+ " 0.021200 | \n",
+ " -0.056200 | \n",
+ " 0.005700 | \n",
+ " 0.072600 | \n",
+ "
\n",
+ " \n",
+ "
\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "print(pd.Series(dpul.labels_).value_counts())\n",
+ "aa.display_df(df=dpul.df_pu_[dpul.df_pu_[\"selection_via\"].str.contains(\"PC\", na=False)],\n",
+ " n_rows=10, n_cols=5, show_shape=True)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.13.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/tests/unit/data_handling_tests/test_get_labels.py b/tests/unit/data_handling_tests/test_get_labels.py
new file mode 100644
index 00000000..0d63e291
--- /dev/null
+++ b/tests/unit/data_handling_tests/test_get_labels.py
@@ -0,0 +1,114 @@
+"""
+This script tests the top-level get_labels() function (issue #308).
+
+get_labels is the single-call form of the recurring
+``(df[col] == positive_label).astype(int).to_numpy()`` expression that appears in 4+ places
+of the gamma-secretase use case. It maps the positive value onto 1 and everything else onto 0.
+"""
+import numpy as np
+import pandas as pd
+import pytest
+
+import aaanalysis as aa
+
+
+# Helper functions
+def _manual(df, positive_label, col="label"):
+    """Reference result: the hand-written expression that get_labels() has to reproduce."""
+    is_positive = df[col] == positive_label
+    return is_positive.astype(int).to_numpy()
+
+
+# Normal Cases Test Class
+class TestGetLabels:
+    """Parameter-by-parameter checks of get_labels() on small hand-made frames."""
+
+    def test_returns_int_numpy_array(self):
+        # Container type, dtype family, and length of the output
+        df_seq = pd.DataFrame({"entry": ["a", "b", "c"], "label": [1, 2, 1]})
+        result = aa.get_labels(df=df_seq, positive_label=1)
+        assert isinstance(result, np.ndarray)
+        assert result.dtype.kind == "i"
+        assert result.shape == (3,)
+
+    def test_positive_label_default(self):
+        # Omitting 'positive_label' falls back to 1
+        df_seq = pd.DataFrame({"label": [1, 0, 1, 0]})
+        expected = np.array([1, 0, 1, 0])
+        assert np.array_equal(aa.get_labels(df=df_seq), expected)
+
+    def test_df_parameter(self):
+        # PU-style column: only the row holding 1 is positive
+        df_seq = pd.DataFrame({"label": [2, 2, 1]})
+        expected = np.array([0, 0, 1])
+        assert np.array_equal(aa.get_labels(df=df_seq, positive_label=1), expected)
+
+    def test_col_label_parameter(self):
+        # Non-default column name combined with a non-default positive value
+        df_seq = pd.DataFrame({"y": [1, 2, 1, 2]})
+        expected = np.array([0, 1, 0, 1])
+        result = aa.get_labels(df=df_seq, positive_label=2, col_label="y")
+        assert np.array_equal(result, expected)
+
+
+# Golden equivalence to the manual expression (KPI: >= 2 encodings)
+class TestGetLabelsEquivalence:
+    """get_labels() must match the manual expression on several label encodings (KPI #308)."""
+
+    def test_pu_encoding_1_2(self):
+        # PU encoding: 1 = positive, 2 = unlabeled
+        df_seq = pd.DataFrame({"label": [1, 2, 1, 2, 2, 1]})
+        result = aa.get_labels(df=df_seq, positive_label=1)
+        assert np.array_equal(result, _manual(df_seq, 1))
+
+    def test_binary_encoding_0_1(self):
+        # Standard {0, 1} encoding
+        df_seq = pd.DataFrame({"label": [0, 1, 1, 0]})
+        result = aa.get_labels(df=df_seq, positive_label=1)
+        assert np.array_equal(result, _manual(df_seq, 1))
+
+    def test_multiclass_encoding(self):
+        # Multi-class: each class in turn plays the positive role
+        df_seq = pd.DataFrame({"label": [0, 1, 2, 0, 1, 2]})
+        for positive_value in (0, 1, 2):
+            result = aa.get_labels(df=df_seq, positive_label=positive_value)
+            assert np.array_equal(result, _manual(df_seq, positive_value))
+
+    def test_string_labels(self):
+        # String-valued label column
+        df_seq = pd.DataFrame({"label": ["sub", "non", "sub", "unl"]})
+        result = aa.get_labels(df=df_seq, positive_label="sub")
+        assert np.array_equal(result, _manual(df_seq, "sub"))
+
+    def test_single_class_column_maps_all_ones(self):
+        # Pure mapping: unlike dPULearn.fit, get_labels does not require >1 distinct value,
+        # so an all-positive column maps to all ones rather than raising.
+        df_seq = pd.DataFrame({"label": [1, 1, 1]})
+        expected = np.array([1, 1, 1])
+        assert np.array_equal(aa.get_labels(df=df_seq, positive_label=1), expected)
+
+    def test_nan_maps_to_zero(self):
+        # NaN never equals positive_label, so it becomes 0.
+        df_seq = pd.DataFrame({"label": [1.0, np.nan, 1.0]})
+        expected = np.array([1, 0, 1])
+        assert np.array_equal(aa.get_labels(df=df_seq, positive_label=1.0), expected)
+
+
+# Negative Cases Test Class
+class TestGetLabelsNegative:
+    """Invalid inputs must raise informative ValueErrors."""
+
+    def test_df_none(self):
+        # 'df' is required
+        with pytest.raises(ValueError):
+            aa.get_labels(df=None, positive_label=1)
+
+    def test_df_not_dataframe(self):
+        # A plain list is not accepted in place of a DataFrame
+        with pytest.raises(ValueError):
+            aa.get_labels(df=[1, 2, 3], positive_label=1)
+
+    def test_missing_label_column(self):
+        # Default 'col_label' ("label") is absent from the frame
+        df = pd.DataFrame({"entry": ["a", "b"], "y": [1, 0]})
+        with pytest.raises(ValueError):
+            aa.get_labels(df=df, positive_label=1)
+
+    def test_custom_col_missing(self):
+        # Explicit 'col_label' that does not exist in the frame
+        df = pd.DataFrame({"label": [1, 0]})
+        with pytest.raises(ValueError):
+            aa.get_labels(df=df, positive_label=1, col_label="missing")
+
+    def test_positive_label_absent(self):
+        # Pin the message to the offending argument so an unrelated ValueError
+        # cannot satisfy this test by accident
+        df = pd.DataFrame({"label": [1, 2, 1]})
+        with pytest.raises(ValueError, match="positive_label"):
+            aa.get_labels(df=df, positive_label=9)
diff --git a/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py b/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py
new file mode 100644
index 00000000..2accb502
--- /dev/null
+++ b/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py
@@ -0,0 +1,174 @@
+"""
+This script tests the dPULearn.mine_negatives() convenience method (issue #308).
+
+mine_negatives is additive sugar over dPULearn.fit: it stacks X_pos over X_unlabeled,
+builds a 1 (positive) / 2 (unlabeled) label vector, fits, and returns the boolean mask of
+identified reliable negatives over the rows of X_unlabeled. The key contract is that the
+mask equals the manual ``labels_[len(X_pos):] == 0`` result exactly, and that the existing
+``fit`` path stays byte-identical (no algorithm change).
+"""
+import numpy as np
+import pytest
+
+import aaanalysis as aa
+
+
+# Helper functions
+def _make_data(n_pos=20, n_unl=50, n_features=8, seed=0):  # synthetic (X_pos, X_unl) pair
+    rng = np.random.default_rng(seed)  # seeded generator, so draws are reproducible
+    X_pos = rng.normal(0.0, 1.0, size=(n_pos, n_features))  # positives: mean 0.0, std 1.0
+    X_unl = rng.normal(0.6, 1.0, size=(n_unl, n_features))  # unlabeled: mean 0.6, std 1.0
+    return X_pos, X_unl
+
+
+def _manual_mask(X_pos, X_unl, random_state=42, **fit_kwargs):
+    """Reproduce the notebook cell 18/24 manual stacking path; return (mask over X_unl, fitted dPULearn)."""
+    X_pool = np.vstack([X_pos, X_unl])  # positives first, unlabeled rows after
+    y_pool = np.array([1] * len(X_pos) + [2] * len(X_unl))  # 1 = positive, 2 = unlabeled
+    dpul = aa.dPULearn(random_state=random_state, verbose=False)
+    dpul.fit(X=X_pool, labels=y_pool, **fit_kwargs)
+    return np.asarray(dpul.labels_)[len(X_pos):] == 0, dpul  # True where an unlabeled row ended up as 0
+
+
+# Normal Cases Test Class
+class TestMineNegatives:
+    """Test dPULearn.mine_negatives() for each parameter individually."""
+
+    def test_returns_boolean_mask_over_unlabeled(self):
+        X_pos, X_unl = _make_data()
+        dpul = aa.dPULearn(random_state=42, verbose=False)
+        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=10)
+        assert isinstance(mask, np.ndarray)
+        assert mask.dtype == bool
+        assert mask.shape == (X_unl.shape[0],)  # one entry per unlabeled row
+        assert mask.sum() == 10  # exactly n_neg rows are flagged
+
+    def test_X_pos_parameter(self):
+        X_pos, X_unl = _make_data(n_pos=30)
+        dpul = aa.dPULearn(random_state=42, verbose=False)
+        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=5)
+        assert mask.shape[0] == X_unl.shape[0]  # mask length follows X_unlabeled, not X_pos
+
+    def test_X_unlabeled_parameter(self):
+        X_pos, X_unl = _make_data(n_unl=70)
+        dpul = aa.dPULearn(random_state=42, verbose=False)
+        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=12)
+        assert mask.shape[0] == 70
+        assert mask.sum() == 12
+
+    def test_n_neg_parameter(self):
+        X_pos, X_unl = _make_data()
+        for n in (1, 5, 25):
+            dpul = aa.dPULearn(random_state=42, verbose=False)  # fresh estimator per value
+            mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=n)
+            assert mask.sum() == n
+
+    def test_metric_parameter(self):
+        X_pos, X_unl = _make_data()
+        for metric in ("euclidean", "manhattan", "cosine"):
+            dpul = aa.dPULearn(random_state=42, verbose=False)
+            mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl,
+                                       n_neg=10, metric=metric)
+            assert mask.sum() == 10  # only the count is checked, not which rows were picked
+
+    def test_n_components_parameter(self):
+        X_pos, X_unl = _make_data()
+        for n_components in (2, 3, 0.5):  # two ints and one float between 0 and 1
+            dpul = aa.dPULearn(random_state=42, verbose=False)
+            mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl,
+                                       n_neg=10, n_components=n_components)
+            assert mask.sum() == 10
+
+    def test_instance_attributes_set(self):
+        """After mining, labels_ / df_pu_ are set so the plotting class works."""
+        X_pos, X_unl = _make_data()
+        dpul = aa.dPULearn(random_state=42, verbose=False)
+        dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=10)
+        assert dpul.labels_ is not None
+        assert dpul.labels_.shape[0] == X_pos.shape[0] + X_unl.shape[0]  # covers the stacked pool
+        assert dpul.df_pu_ is not None
+
+
+# Regression / golden equivalence
+class TestMineNegativesEquivalence:
+    """The mask must equal the manual stacking path exactly (KPI #308)."""
+
+    @pytest.mark.parametrize("seed", [0, 1, 7])
+    def test_mask_equals_manual_pca(self, seed):
+        X_pos, X_unl = _make_data(seed=seed)  # seed varies the data only
+        manual_mask, dpul_m = _manual_mask(X_pos, X_unl, n_unl_to_neg=10)  # random_state defaults to 42
+        dpul = aa.dPULearn(random_state=42, verbose=False)
+        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=10)  # n_neg pairs with n_unl_to_neg
+        assert np.array_equal(mask, manual_mask)
+        assert np.array_equal(np.asarray(dpul.labels_), np.asarray(dpul_m.labels_))  # full label vectors agree too
+
+    def test_mask_equals_manual_metric(self):
+        X_pos, X_unl = _make_data(seed=3)
+        manual_mask, _ = _manual_mask(X_pos, X_unl, n_unl_to_neg=8, metric="cosine")
+        dpul = aa.dPULearn(random_state=42, verbose=False)
+        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl,
+                                   n_neg=8, metric="cosine")
+        assert np.array_equal(mask, manual_mask)
+
+    def test_mask_equals_manual_few_positives(self):
+        # n_pos < 3: the manual stacked path accepts it (the >=3 floor applies to the
+        # stacked matrix), so mine_negatives must match it, not reject the small pos set.
+        X_pos, X_unl = _make_data(n_pos=1, seed=5)
+        manual_mask, _ = _manual_mask(X_pos, X_unl, n_unl_to_neg=6)
+        dpul = aa.dPULearn(random_state=42, verbose=False)
+        mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=6)
+        assert np.array_equal(mask, manual_mask)
+
+
+# Negative Cases Test Class
+class TestMineNegativesNegative:
+    """Invalid inputs must raise ValueError; only the n_neg case also checks the message text."""
+
+    def test_feature_mismatch(self):
+        X_pos, _ = _make_data(n_features=8)
+        _, X_unl = _make_data(n_features=6)  # 6 columns vs. 8 in X_pos
+        dpul = aa.dPULearn(random_state=42, verbose=False)
+        with pytest.raises(ValueError):
+            dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=5)
+
+    def test_n_neg_below_one(self):
+        X_pos, X_unl = _make_data()
+        dpul = aa.dPULearn(random_state=42, verbose=False)
+        # The error must name 'n_neg' (not the internal 'n_unl_to_neg' fit sees).
+        with pytest.raises(ValueError, match="n_neg"):
+            dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=0)
+
+    def test_too_many_negatives_requested(self):
+        X_pos, X_unl = _make_data(n_unl=10)
+        dpul = aa.dPULearn(random_state=42, verbose=False)
+        with pytest.raises(ValueError):
+            dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=999)  # 999 > 10 unlabeled rows
+
+    def test_X_pos_none(self):
+        _, X_unl = _make_data()
+        dpul = aa.dPULearn(random_state=42, verbose=False)
+        with pytest.raises(ValueError):
+            dpul.mine_negatives(X_pos=None, X_unlabeled=X_unl, n_neg=5)
+
+    def test_X_unlabeled_none(self):
+        X_pos, _ = _make_data()
+        dpul = aa.dPULearn(random_state=42, verbose=False)
+        with pytest.raises(ValueError):
+            dpul.mine_negatives(X_pos=X_pos, X_unlabeled=None, n_neg=5)
+
+
+# Existing-fit label-contract regression
+class TestFitUnchanged:
+    """The pre-existing fit(X, labels=...) path keeps its label contract (#308 no-change)."""
+
+    def test_fit_pca_unchanged(self):
+        X_pos, X_unl = _make_data(seed=11)
+        X_pool = np.vstack([X_pos, X_unl])  # positives first, unlabeled rows after
+        y_pool = np.array([1] * len(X_pos) + [2] * len(X_unl))  # 1 = positive, 2 = unlabeled
+        dpul = aa.dPULearn(random_state=42, verbose=False)
+        dpul.fit(X=X_pool, labels=y_pool, n_unl_to_neg=10)
+        labels = np.asarray(dpul.labels_)
+        # contract: positives stay 1, exactly 10 mined negatives become 0, all other unlabeled rows stay 2
+        assert (labels[:len(X_pos)] == 1).all()
+        assert (labels == 0).sum() == 10
+        assert (labels[len(X_pos):] == 2).sum() == len(X_unl) - 10  # subsumes the {0, 1, 2} check
diff --git a/tests/unit/plotting_tests/test_color_samples_constants.py b/tests/unit/plotting_tests/test_color_samples_constants.py
new file mode 100644
index 00000000..52826a1b
--- /dev/null
+++ b/tests/unit/plotting_tests/test_color_samples_constants.py
@@ -0,0 +1,36 @@
+"""
+This script tests the named sample-color constants exposed at top level (issue #308).
+
+COLOR_SAMPLES_POS / NEG / UNL / REL_NEG are public, named aliases for the canonical sample
+colors. They must equal today's ``plot_get_cdict("DICT_COLOR")["SAMPLES_*"]`` values exactly,
+so users can reference a named constant instead of indexing the color dict by string key.
+"""
+import pytest
+
+import aaanalysis as aa
+
+
+# Golden equivalence test
+class TestColorSamplesConstants:
+    """Named constants must equal the plot_get_cdict values (golden KPI #308)."""
+
+    def test_constants_exist_at_top_level(self):
+        for name in ("COLOR_SAMPLES_POS", "COLOR_SAMPLES_NEG",
+                     "COLOR_SAMPLES_UNL", "COLOR_SAMPLES_REL_NEG"):
+            assert hasattr(aa, name)  # reachable as aa.<name>
+            assert name in aa.__all__  # and listed in the package's __all__
+
+    @pytest.mark.parametrize("const_name,dict_key", [  # (constant name, DICT_COLOR key)
+        ("COLOR_SAMPLES_POS", "SAMPLES_POS"),
+        ("COLOR_SAMPLES_NEG", "SAMPLES_NEG"),
+        ("COLOR_SAMPLES_UNL", "SAMPLES_UNL"),
+        ("COLOR_SAMPLES_REL_NEG", "SAMPLES_REL_NEG"),
+    ])
+    def test_constant_equals_cdict_value(self, const_name, dict_key):
+        dict_color = aa.plot_get_cdict(name="DICT_COLOR")
+        assert getattr(aa, const_name) == dict_color[dict_key]  # alias matches the dict entry
+
+    def test_constants_are_strings(self):
+        for name in ("COLOR_SAMPLES_POS", "COLOR_SAMPLES_NEG",
+                     "COLOR_SAMPLES_UNL", "COLOR_SAMPLES_REL_NEG"):
+            assert isinstance(getattr(aa, name), str)  # type check only; the color format is not validated