diff --git a/aaanalysis/__init__.py b/aaanalysis/__init__.py index 4fc1edee..17989828 100644 --- a/aaanalysis/__init__.py +++ b/aaanalysis/__init__.py @@ -1,4 +1,4 @@ -from .data_handling import (load_dataset, load_scales, load_features, +from .data_handling import (load_dataset, load_scales, load_features, get_labels, read_fasta, to_fasta, SequencePreprocessor, EmbeddingPreprocessor, @@ -14,6 +14,8 @@ comp_per_protein_ap, comp_detection_metrics, comp_bootstrap_ci, comp_smooth_scores) from .config import options +from ._constants import (COLOR_SAMPLES_POS, COLOR_SAMPLES_NEG, + COLOR_SAMPLES_UNL, COLOR_SAMPLES_REL_NEG) from importlib.metadata import version as _version, PackageNotFoundError @@ -28,6 +30,7 @@ "load_dataset", "load_scales", "load_features", + "get_labels", "read_fasta", "to_fasta", "SequencePreprocessor", @@ -72,6 +75,10 @@ "comp_detection_metrics", "comp_bootstrap_ci", "comp_smooth_scores", + "COLOR_SAMPLES_POS", + "COLOR_SAMPLES_NEG", + "COLOR_SAMPLES_UNL", + "COLOR_SAMPLES_REL_NEG", "options" ] diff --git a/aaanalysis/_constants.py b/aaanalysis/_constants.py index 2c22e606..2ce81416 100644 --- a/aaanalysis/_constants.py +++ b/aaanalysis/_constants.py @@ -478,6 +478,15 @@ def _folder_path(super_folder, folder_name): COLOR_NEG = "#ad4570" # (173,69,112) COLOR_REL_NEG = "#ad9745" # (173, 151, 69) +# Public, named aliases for the canonical sample-group colors (positive / negative / +# unlabeled / reliable-negative). They mirror the ``DICT_COLOR["SAMPLES_*"]`` entries +# exactly, so users can reference a named constant (``aa.COLOR_SAMPLES_POS``) instead +# of indexing ``plot_get_cdict("DICT_COLOR")`` by string key. 
+COLOR_SAMPLES_POS = COLOR_POS +COLOR_SAMPLES_NEG = COLOR_NEG +COLOR_SAMPLES_UNL = COLOR_UNL +COLOR_SAMPLES_REL_NEG = COLOR_REL_NEG + DICT_COLOR = {"SHAP_POS": COLOR_SHAP_POS, "SHAP_NEG": COLOR_SHAP_NEG, "FEAT_POS": COLOR_FEAT_POS, diff --git a/aaanalysis/data_handling/__init__.py b/aaanalysis/data_handling/__init__.py index 1fad9397..7ea8c663 100644 --- a/aaanalysis/data_handling/__init__.py +++ b/aaanalysis/data_handling/__init__.py @@ -1,8 +1,8 @@ """ Data loading and sequence/embedding preprocessing — the package's data entry point. -Public objects: load_dataset, load_scales, load_features, read_fasta, to_fasta, -SequencePreprocessor, EmbeddingPreprocessor, combine_dict_nums. +Public objects: load_dataset, load_scales, load_features, get_labels, read_fasta, +to_fasta, SequencePreprocessor, EmbeddingPreprocessor, combine_dict_nums. Produces the core data objects the rest of the pipeline consumes: ``load_dataset`` yields ``df_seq``, ``load_scales`` yields ``df_scales`` (fed to ``feature_engineering.AAclust`` / ``CPP``), ``load_features`` yields a reference @@ -17,6 +17,7 @@ from ._load_dataset import load_dataset from ._load_scales import load_scales from ._load_features import load_features +from ._get_labels import get_labels from ._read_fasta import read_fasta from ._to_fasta import to_fasta from ._seq_preproc import SequencePreprocessor @@ -27,6 +28,7 @@ "load_dataset", "load_scales", "load_features", + "get_labels", "read_fasta", "to_fasta", "SequencePreprocessor", diff --git a/aaanalysis/data_handling/_get_labels.py b/aaanalysis/data_handling/_get_labels.py new file mode 100644 index 00000000..f23d8423 --- /dev/null +++ b/aaanalysis/data_handling/_get_labels.py @@ -0,0 +1,70 @@ +""" +This is a script for the frontend of the get_labels function, deriving a binary +label vector from a sequence DataFrame's label column. 
+""" +from typing import Any +import numpy as np +import pandas as pd + +import aaanalysis.utils as ut + + +# I Helper Functions +def check_match_df_positive_label(df=None, col_label=None, positive_label=None) -> None: + """Check that the positive label value is present in the label column.""" + present = set(df[col_label].tolist()) + if positive_label not in present: + raise ValueError(f"'positive_label' ({positive_label}) is not among the values of " + f"column '{col_label}' ({sorted(present, key=str)}).") + + +# II Main Functions +def get_labels(df: pd.DataFrame, + positive_label: Any = 1, + col_label: str = "label", + ) -> np.ndarray: + """ + Derive a binary ``int`` label vector from a column of a sequence DataFrame. + + Maps the value flagged as positive (``positive_label``) onto ``1`` and every other + value onto ``0``, the binary encoding consumed across the package (e.g. by + :meth:`CPP.run`, :class:`TreeModel`, and the ``labels`` argument of most tools). + This is the single-call form of the recurring ``(df[col] == x).astype(int).to_numpy()`` + expression. + + .. versionadded:: 1.1.0 + + Parameters + ---------- + df : pd.DataFrame, shape (n_samples, n_seq_info) + Sequence DataFrame (``df_seq``) containing the label column ``col_label``. + positive_label : int or str, default=1 + Value in ``col_label`` marking the positive class. All rows equal to it become + ``1``; all remaining rows become ``0``. Must be present in ``col_label``. + col_label : str, default='label' + Name of the column holding the (multi-value or already binary) labels. + + Returns + ------- + labels : array-like, shape (n_samples,) + Binary ``int`` label vector (``1`` = positive, ``0`` = otherwise), row-aligned + to ``df``. + + Notes + ----- + * The result equals ``(df[col_label] == positive_label).astype(int).to_numpy()``. + * Pass the resulting vector directly as the ``labels`` argument of CPP, TreeModel, + or other tools. 
For Positive-Unlabeled mining keep the package ``1`` (positive) / + ``2`` (unlabeled) markers instead and use :meth:`dPULearn.mine_negatives`. + + Examples + -------- + .. include:: examples/get_labels.rst + """ + # Check input + ut.check_str(name="col_label", val=col_label, accept_none=False) + ut.check_df(name="df", df=df, cols_required=col_label) + check_match_df_positive_label(df=df, col_label=col_label, positive_label=positive_label) + # Derive binary int label vector + labels = (df[col_label] == positive_label).astype(int).to_numpy() + return labels diff --git a/aaanalysis/pu_learning/_dpulearn.py b/aaanalysis/pu_learning/_dpulearn.py index 23b5f729..12da26f6 100644 --- a/aaanalysis/pu_learning/_dpulearn.py +++ b/aaanalysis/pu_learning/_dpulearn.py @@ -133,6 +133,15 @@ def check_match_X_X_neg(X=None, X_neg=None) -> None: raise ValueError(f"'n_features' does not match between 'X' (n={n_features}) and 'X_neg' (n={n_features_neg})") +def check_match_X_pos_X_unlabeled(X_pos=None, X_unlabeled=None) -> None: + """Check that positive and unlabeled feature matrices share the same feature dimension.""" + n_features_pos = X_pos.shape[1] + n_features_unl = X_unlabeled.shape[1] + if n_features_pos != n_features_unl: + raise ValueError(f"'n_features' does not match between 'X_pos' (n={n_features_pos}) and " + f"'X_unlabeled' (n={n_features_unl})") + + # II Main Functions class dPULearn(Wrapper): """ @@ -358,6 +367,87 @@ def fit(self, self.df_pu_ = df_pu return self + def mine_negatives(self, + X_pos: ut.ArrayLike2D, + X_unlabeled: ut.ArrayLike2D, + n_neg: int, + metric: Optional[Literal["euclidean", "manhattan", "cosine"]] = None, + n_components: Union[float, int] = 0.80, + ) -> np.ndarray: + """ + Mine reliable negatives from an unlabeled pool given the positives, in one call. 
+ + Convenience wrapper around :meth:`dPULearn.fit` for the common positive/unlabeled + setup: instead of stacking ``X_pos`` and ``X_unlabeled`` by hand, building a + ``1`` (positive) / ``2`` (unlabeled) label vector, fitting, and slicing the mined + rows back out by index, pass the two feature matrices separately and receive a + **boolean mask over the rows of** ``X_unlabeled`` flagging the identified reliable + negatives. The mask equals ``labels_[len(X_pos):] == 0`` from the manual stacking + path exactly. + + After the call the instance is fitted: :attr:`dPULearn.labels_` (over the stacked + ``X_pos`` then ``X_unlabeled``) and :attr:`dPULearn.df_pu_` are set, so the + :class:`dPULearnPlot` methods work as usual. + + .. versionadded:: 1.1.0 + + Parameters + ---------- + X_pos : array-like, shape (n_pos, n_features) + Feature matrix of the positive samples. + X_unlabeled : array-like, shape (n_unl, n_features) + Feature matrix of the unlabeled samples (the candidate pool). Must have the + same number of features as ``X_pos``. + n_neg : int + Number of reliable negatives to identify from the unlabeled pool. Must not + exceed the number of unlabeled samples. + metric : str or None, optional + Distance metric for distance-based identification (``euclidean``, + ``manhattan``, ``cosine``). If ``None``, PCA-based identification is performed. + n_components : int or float, default=0.80 + Number of principal components (int >= 1) or fraction of variance covered + (float in ``(0.0, 1.0)``) when PCA is applied. + + Returns + ------- + mask_neg : array-like, shape (n_unl,) + Boolean mask over the rows of ``X_unlabeled``: ``True`` marks an identified + reliable negative. ``X_unlabeled[mask_neg]`` are the mined negatives. + + Notes + ----- + * This is purely additive sugar: it stacks the inputs and calls + :meth:`dPULearn.fit` with ``label_pos=1`` / ``label_unl=2`` internally, so the + identification result is identical to the manual path. 
+ + See Also + -------- + * :meth:`dPULearn.fit`: the underlying fit on a stacked matrix and label vector. + * :func:`get_labels`: derive a binary label vector from a sequence DataFrame. + + Examples + -------- + .. include:: examples/dpul_mine_negatives.rst + """ + # Check input (the >=3 sample floor applies to the stacked matrix, enforced by + # 'fit' below, so per-matrix validation only coerces + checks the feature dimension; + # this keeps mine_negatives accepting exactly what the manual stacking path accepts) + X_pos = ut.check_X(X=X_pos, X_name="X_pos", min_n_samples=1) + X_unlabeled = ut.check_X(X=X_unlabeled, X_name="X_unlabeled", min_n_samples=1) + check_match_X_pos_X_unlabeled(X_pos=X_pos, X_unlabeled=X_unlabeled) + # Validate n_neg here so the message names 'n_neg' (fit sees it as 'n_unl_to_neg') + ut.check_number_range(name="n_neg", val=n_neg, min_val=1, just_int=True) + # Stack positives over the unlabeled pool and fit with the package PU markers + n_pos = X_pos.shape[0] + X = np.vstack([X_pos, X_unlabeled]) + labels = np.array([1] * n_pos + [2] * X_unlabeled.shape[0]) + # No pre-labeled negatives here, so n_neg is exactly the count to draw from the pool + self.fit(X=X, labels=labels, label_pos=1, label_unl=2, + n_unl_to_neg=n_neg, metric=metric, n_components=n_components) + # Slice the mined reliable negatives (label 0) back out of the unlabeled block + mask_neg = np.asarray(self.labels_)[n_pos:] == 0 + return mask_neg + @staticmethod def eval(X: ut.ArrayLike2D, list_labels: ut.ArrayLike2D, diff --git a/docs/_cheatsheet/content.py b/docs/_cheatsheet/content.py index 801adc7f..0d04b6eb 100644 --- a/docs/_cheatsheet/content.py +++ b/docs/_cheatsheet/content.py @@ -188,6 +188,7 @@ ("Load benchmark sequences", "load_dataset(name) → df_seq", None), ("Load AAontology scales", "load_scales() → df_scales", None), ("Load precomputed features", "load_features(name) → df_feat", None), + ("Binary labels from df column", "get_labels(df, positive_label) → labels", 
None, "v1.1"), ("Read / write FASTA", "read_fasta(file) → df_seq", None), ("Cluster redundant homologs", "filter_seq(df_seq) → df_clust [pro]", None), ]}, @@ -221,6 +222,7 @@ {"name": "Modeling & Explainability", "tag": "PU · classify · SHAP", "rows": [ ("Train with positives + unlabeled data", "dPULearn().fit(X, labels) [Wrapper]", None), + ("Mine reliable negatives (mask)", "dPULearn().mine_negatives(X_pos, X_unlabeled) → mask", None, "v1.1"), ("Train + RFE + MC importance", "TreeModel().fit(X, labels) [Wrapper]", None), ("Per-feature / sample SHAP impact", "ShapModel().fit(X, labels) [pro]", None), ]}, diff --git a/docs/source/index/release_notes.rst b/docs/source/index/release_notes.rst index 89fa7d74..817f4b56 100644 --- a/docs/source/index/release_notes.rst +++ b/docs/source/index/release_notes.rst @@ -35,6 +35,9 @@ Added per-residue PTM and functional-site annotations and encodes them into tensors (``fetch_uniprot``, ``ingest``, ``register_feature``, ``encode``, ``build_scales``, ``build_cat``, ``to_df_seq``). +- **get_labels**: Derives a binary ``int`` label vector from a sequence DataFrame's + label column (``positive_label`` mapped to ``1``, everything else to ``0``) — the + single-call form of the recurring ``(df[col] == x).astype(int).to_numpy()`` expression. - :func:`~aaanalysis.combine_dict_nums`: Concatenates per-residue tensors (embedding / structure / annotation) along the feature axis into one combined :meth:`~aaanalysis.CPP.run_num` input. @@ -132,6 +135,16 @@ Added switches the pre-computed prediction per P1 (feature map + structure restyle) with no kernel, keeping the column-residue linking (warned past 40 sites, hard-capped at 200). +**PU Learning** + +- **dPULearn.mine_negatives**: One-call convenience over ``dPULearn.fit`` for the common + positive / unlabeled setup. 
Pass ``X_pos`` and ``X_unlabeled`` separately instead of + stacking them by hand, building a ``1`` / ``2`` label vector, fitting, and slicing the + mined rows back out; it returns the **boolean mask over the rows of** ``X_unlabeled`` + flagging the identified reliable negatives (equal to the manual ``labels_[len(X_pos):] + == 0`` result exactly). The instance is left fitted (``labels_`` / ``df_pu_`` set, so + ``dPULearnPlot`` works), and the existing ``fit(X, labels=...)`` path is unchanged. + **Sequence Analysis** - :class:`~aaanalysis.AAWindowSampler`: Samples fixed-length sequence windows for PU-learning and @@ -191,6 +204,10 @@ Added - :func:`~aaanalysis.plot_rank`: Standalone per-protein max-score-vs-rank scatter with group coloring and optional threshold lines (pairs with the new ``aa.metrics`` functions). +- **COLOR_SAMPLES_POS / COLOR_SAMPLES_NEG / COLOR_SAMPLES_UNL / COLOR_SAMPLES_REL_NEG**: + Public, named constants for the canonical sample-group colors (positive / negative / + unlabeled / reliable-negative). They equal the ``plot_get_cdict("DICT_COLOR")["SAMPLES_*"]`` + values exactly, so a named constant replaces indexing the color dict by string key. **Golden Pipelines** diff --git a/examples/data_handling/get_labels.ipynb b/examples/data_handling/get_labels.ipynb new file mode 100644 index 00000000..ef84686a --- /dev/null +++ b/examples/data_handling/get_labels.ipynb @@ -0,0 +1,297 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "30e77e51", + "metadata": {}, + "source": [ + "The ``get_labels`` function derives a binary ``int`` label vector from a column of a sequence DataFrame (``df_seq``). It is the single-call form of the recurring ``(df[col] == positive_label).astype(int).to_numpy()`` expression: the value flagged as positive becomes ``1`` and every other value becomes ``0``." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "14210edc", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-30T23:29:34.142992Z", + "iopub.status.busy": "2026-06-30T23:29:34.142637Z", + "iopub.status.idle": "2026-06-30T23:29:35.599732Z", + "shell.execute_reply": "2026-06-30T23:29:35.599479Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DataFrame shape: (10, 8)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 entrysequencelabeltmd_starttmd_stopjmd_ntmdjmd_c
1P05067MLPGLALLLLAAWTA...GYENPTYKFFEQMQN1701723FAEDVGSNKGAIIGLMVGGVVIATVIVITLVMLKKKQYTSIHH
2P14925MAGRARSGLLLLLLG...EEEYSAPLPKPAPSS1868890KLSTEPGSGVSVVLITTLLVIPVLVLLAIVMFIRWKKSRAFGD
3P70180MRSLLLFTFSACVLL...RELREDSIRSHFSVA1477499PCKSSGGLEESAVTGIVVGALLGAGLLMAFYFFRKKYRITIER
4Q03157MGPTSPAARGQGRRW...HGYENPTYRFLEERP1585607APSGTGVSREALSGLLIMGAGGGSLIVLSLLLLRKKKPYGTIS
5Q06481MAATGTAAAAATGRL...GYENPTYKYLEQMQI1694716LREDFSLSSSALIGLLVIAVAIATVIVISLVMLRKRQYGTISH
6P12821MGAASGRRGPGLLLP...SHGPQFGSEVELRHS212571276GLDLDAQQARVGQWLLLFLGIALLVATLGLSQRLFSIRHR
7P36896MAESAGASSFFPLVV...KKTLSQLSVQEDVKI2127149EHPSMWGPVELVGIIAGPVFLLFLIIIIVFLVINYHQRVYHNR
8Q8NER5MTRALCSALRQALLL...KKTISQLCVKEDCKA2114136PNAPKLGPMELAIIITVPVCLLSIAAMLTVWACQGRQCSYRKK
9P37023MTLGSPRKGLLMLLM...LQKISNSPEKPKVIQ2119141PSEQPGTDGQLALILGPVLALLALVALGVLGLWHVRRRQEKQR
10O43184MAARPLPVSPARALL...YPHQVPRSTHTAYIK2707729DSGPIRQADNQGLTIGILVTILCLLAAGFVVYLKRKTLIRLLF
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import aaanalysis as aa\n", + "aa.options[\"verbose\"] = False\n", + "\n", + "# A Positive-Unlabeled (PU) dataset: substrates (1) and unlabeled others (2).\n", + "df_seq = aa.load_dataset(name=\"DOM_GSEC_PU\", n=5)\n", + "aa.display_df(df=df_seq, n_rows=10, show_shape=True)" + ] + }, + { + "cell_type": "markdown", + "id": "e84a0e34", + "metadata": {}, + "source": [ + "By default ``positive_label=1``: substrates map to ``1`` and everything else to ``0``." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a5ccd25b", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-30T23:29:35.600825Z", + "iopub.status.busy": "2026-06-30T23:29:35.600758Z", + "iopub.status.idle": "2026-06-30T23:29:35.602749Z", + "shell.execute_reply": "2026-06-30T23:29:35.602535Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1 1 1 1 1 0 0 0 0 0]\n" + ] + } + ], + "source": [ + "labels = aa.get_labels(df=df_seq, positive_label=1)\n", + "print(labels)" + ] + }, + { + "cell_type": "markdown", + "id": "5db29b65", + "metadata": {}, + "source": [ + "Pick any value as the positive class via ``positive_label`` (e.g. treat the unlabeled ``2`` as positive), and select a different column with ``col_label``. The result equals the manual ``(df[col_label] == positive_label).astype(int).to_numpy()`` expression." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d9a49747", + "metadata": { + "execution": { + "iopub.execute_input": "2026-06-30T23:29:35.603686Z", + "iopub.status.busy": "2026-06-30T23:29:35.603626Z", + "iopub.status.idle": "2026-06-30T23:29:35.605515Z", + "shell.execute_reply": "2026-06-30T23:29:35.605342Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0 0 0 0 0 1 1 1 1 1]\n" + ] + } + ], + "source": [ + "labels_unl = aa.get_labels(df=df_seq, positive_label=2, col_label=\"label\")\n", + "print(labels_unl)" + ] + }, + { + "cell_type": "markdown", + "id": "82fd2bd1", + "metadata": {}, + "source": [ + "Pass the resulting vector straight into the ``labels`` argument of tools such as :meth:`CPP.run` or :class:`TreeModel`." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/pu_learning/dpul_mine_negatives.ipynb b/examples/pu_learning/dpul_mine_negatives.ipynb new file mode 100644 index 00000000..fc56d1fc --- /dev/null +++ b/examples/pu_learning/dpul_mine_negatives.ipynb @@ -0,0 +1,314 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "01ccf3ee", + "metadata": {}, + "source": [ + "The ``dPULearn().mine_negatives()`` method mines reliable negatives from an unlabeled pool in one call. Instead of stacking the positive and unlabeled feature matrices by hand, building a ``1`` (positive) / ``2`` (unlabeled) label vector, fitting, and slicing the mined rows back out by index, you pass the two matrices separately and receive a boolean mask over the rows of ``X_unlabeled``." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "710fdc35", + "metadata": { + "execution": { + "iopub.execute_input": "2026-07-01T03:27:51.939100Z", + "iopub.status.busy": "2026-07-01T03:27:51.937608Z", + "iopub.status.idle": "2026-07-01T03:28:11.600218Z", + "shell.execute_reply": "2026-07-01T03:28:11.552756Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "positives: 63 | unlabeled: 631\n" + ] + } + ], + "source": [ + "import aaanalysis as aa\n", + "import numpy as np\n", + "import pandas as pd\n", + "aa.options[\"verbose\"] = False\n", + "\n", + "# Build a CPP feature matrix for the gamma-secretase PU dataset (substrates vs unlabeled others).\n", + "df_seq = aa.load_dataset(name=\"DOM_GSEC_PU\")\n", + "df_feat = aa.load_features(name=\"DOM_GSEC\")\n", + "sf = aa.SequenceFeature()\n", + "X = sf.feature_matrix(features=df_feat[\"feature\"], df_parts=sf.get_df_parts(df_seq=df_seq))\n", + "labels = df_seq[\"label\"].to_numpy()\n", + "\n", + "# Split into the positive (1) and unlabeled (2) feature blocks.\n", + "X_pos = X[labels == 1]\n", + "X_unl = X[labels == 2]\n", + "print(f\"positives: {len(X_pos)} | unlabeled: {len(X_unl)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "38f3d31d", + "metadata": {}, + "source": [ + "Mine a fixed number of reliable negatives from the unlabeled pool with ``n_neg``. The returned boolean ``mask_neg`` flags, over the rows of ``X_unlabeled``, which unlabeled samples were identified as reliable negatives." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "bdccaa03", + "metadata": { + "execution": { + "iopub.execute_input": "2026-07-01T03:28:11.767224Z", + "iopub.status.busy": "2026-07-01T03:28:11.766642Z", + "iopub.status.idle": "2026-07-01T03:28:11.816538Z", + "shell.execute_reply": "2026-07-01T03:28:11.795116Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mined reliable negatives: 49 of 631 unlabeled\n" + ] + } + ], + "source": [ + "dpul = aa.dPULearn(random_state=42)\n", + "mask_neg = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=49)\n", + "print(f\"mined reliable negatives: {int(mask_neg.sum())} of {len(X_unl)} unlabeled\")\n", + "X_neg = X_unl[mask_neg] # the mined feature rows" + ] + }, + { + "cell_type": "markdown", + "id": "315d4798", + "metadata": {}, + "source": [ + "The mask equals the manual stacking path exactly: stack ``X_pos`` over ``X_unlabeled``, fit with ``1`` / ``2`` labels, and slice ``labels_[len(X_pos):] == 0``." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c78d6eab", + "metadata": { + "execution": { + "iopub.execute_input": "2026-07-01T03:28:11.824245Z", + "iopub.status.busy": "2026-07-01T03:28:11.824003Z", + "iopub.status.idle": "2026-07-01T03:28:12.020230Z", + "shell.execute_reply": "2026-07-01T03:28:11.952823Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mask equals manual path: True\n" + ] + } + ], + "source": [ + "# Stack X_pos over X_unlabeled, fit with 1 / 2 labels, and slice labels_[len(X_pos):] == 0.\n", + "labels_manual = np.asarray(\n", + " aa.dPULearn(random_state=42)\n", + " .fit(X=np.vstack([X_pos, X_unl]),\n", + " labels=np.array([1] * len(X_pos) + [2] * len(X_unl)),\n", + " n_unl_to_neg=49)\n", + " .labels_)\n", + "mask_manual = labels_manual[len(X_pos):] == 0\n", + "print(\"mask equals manual path:\", np.array_equal(mask_neg, mask_manual))" + ] + }, + { + "cell_type": "markdown", + "id": "cae2d79d", + "metadata": {}, + "source": [ + "After mining, the instance is fitted: ``labels_`` (over the stacked positives then unlabeled) and ``df_pu_`` are set, so the :class:`dPULearnPlot` methods work as usual. Pass ``n_neg`` (the number of reliable negatives to mine) and optionally a distance ``metric`` (``euclidean`` / ``manhattan`` / ``cosine``) for distance-based identification instead of the default PCA." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "85eff05c", + "metadata": { + "execution": { + "iopub.execute_input": "2026-07-01T03:28:12.179951Z", + "iopub.status.busy": "2026-07-01T03:28:12.179661Z", + "iopub.status.idle": "2026-07-01T03:28:12.745973Z", + "shell.execute_reply": "2026-07-01T03:28:12.744957Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2 582\n", + "1 63\n", + "0 49\n", + "Name: count, dtype: int64\n", + "DataFrame shape: (49, 15)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
 selection_viaPC1 (56.2%)PC2 (7.4%)PC3 (2.9%)PC4 (2.8%)
84PC10.021000-0.0478000.075200-0.005400
95PC20.032000-0.0821000.025800-0.037700
109PC10.026100-0.0585000.075700-0.020900
158PC10.023500-0.0607000.0540000.000900
161PC10.0259000.0314000.0449000.055400
170PC10.026100-0.0353000.0583000.025800
192PC60.040100-0.0022000.004300-0.053600
193PC10.024700-0.0569000.051300-0.035600
195PC50.0299000.0065000.0358000.050200
200PC10.021200-0.0562000.0057000.072600
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "print(pd.Series(dpul.labels_).value_counts())\n", + "aa.display_df(df=dpul.df_pu_[dpul.df_pu_[\"selection_via\"].str.contains(\"PC\", na=False)],\n", + " n_rows=10, n_cols=5, show_shape=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tests/unit/data_handling_tests/test_get_labels.py b/tests/unit/data_handling_tests/test_get_labels.py new file mode 100644 index 00000000..0d63e291 --- /dev/null +++ b/tests/unit/data_handling_tests/test_get_labels.py @@ -0,0 +1,114 @@ +""" +This script tests the top-level get_labels() function (issue #308). + +get_labels is the single-call form of the recurring +``(df[col] == positive_label).astype(int).to_numpy()`` expression that appears in 4+ places +of the gamma-secretase use case. It maps the positive value onto 1 and everything else onto 0. 
+""" +import numpy as np +import pandas as pd +import pytest + +import aaanalysis as aa + + +# Helper functions +def _manual(df, positive_label, col="label"): + return (df[col] == positive_label).astype(int).to_numpy() + + +# Normal Cases Test Class +class TestGetLabels: + """Test get_labels() for each parameter individually.""" + + def test_returns_int_numpy_array(self): + df = pd.DataFrame({"entry": ["a", "b", "c"], "label": [1, 2, 1]}) + labels = aa.get_labels(df=df, positive_label=1) + assert isinstance(labels, np.ndarray) + assert labels.dtype.kind == "i" + assert labels.shape == (3,) + + def test_positive_label_default(self): + df = pd.DataFrame({"label": [1, 0, 1, 0]}) + labels = aa.get_labels(df=df) + assert np.array_equal(labels, np.array([1, 0, 1, 0])) + + def test_df_parameter(self): + df = pd.DataFrame({"label": [2, 2, 1]}) + labels = aa.get_labels(df=df, positive_label=1) + assert np.array_equal(labels, np.array([0, 0, 1])) + + def test_col_label_parameter(self): + df = pd.DataFrame({"y": [1, 2, 1, 2]}) + labels = aa.get_labels(df=df, positive_label=2, col_label="y") + assert np.array_equal(labels, np.array([0, 1, 0, 1])) + + +# Golden equivalence to the manual expression (KPI: >= 2 encodings) +class TestGetLabelsEquivalence: + """Result equals the manual expression on multiple label encodings (KPI #308).""" + + def test_pu_encoding_1_2(self): + # PU encoding: 1 = positive, 2 = unlabeled + df = pd.DataFrame({"label": [1, 2, 1, 2, 2, 1]}) + assert np.array_equal(aa.get_labels(df=df, positive_label=1), + _manual(df, 1)) + + def test_binary_encoding_0_1(self): + # Standard {0, 1} encoding + df = pd.DataFrame({"label": [0, 1, 1, 0]}) + assert np.array_equal(aa.get_labels(df=df, positive_label=1), + _manual(df, 1)) + + def test_multiclass_encoding(self): + # Multi-class: pick one class as positive + df = pd.DataFrame({"label": [0, 1, 2, 0, 1, 2]}) + for pos in (0, 1, 2): + assert np.array_equal(aa.get_labels(df=df, positive_label=pos), + _manual(df, pos)) + 
+ def test_string_labels(self): + df = pd.DataFrame({"label": ["sub", "non", "sub", "unl"]}) + assert np.array_equal(aa.get_labels(df=df, positive_label="sub"), + _manual(df, "sub")) + + def test_single_class_column_maps_all_ones(self): + # Pure mapping: unlike dPULearn.fit, get_labels does not require >1 distinct value, + # so an all-positive column maps to all ones rather than raising. + df = pd.DataFrame({"label": [1, 1, 1]}) + assert np.array_equal(aa.get_labels(df=df, positive_label=1), + np.array([1, 1, 1])) + + def test_nan_maps_to_zero(self): + # NaN never equals positive_label, so it becomes 0. + df = pd.DataFrame({"label": [1.0, np.nan, 1.0]}) + assert np.array_equal(aa.get_labels(df=df, positive_label=1.0), + np.array([1, 0, 1])) + + +# Negative Cases Test Class +class TestGetLabelsNegative: + """Invalid inputs must raise informative ValueErrors.""" + + def test_df_none(self): + with pytest.raises(ValueError): + aa.get_labels(df=None, positive_label=1) + + def test_df_not_dataframe(self): + with pytest.raises(ValueError): + aa.get_labels(df=[1, 2, 3], positive_label=1) + + def test_missing_label_column(self): + df = pd.DataFrame({"entry": ["a", "b"], "y": [1, 0]}) + with pytest.raises(ValueError): + aa.get_labels(df=df, positive_label=1) + + def test_custom_col_missing(self): + df = pd.DataFrame({"label": [1, 0]}) + with pytest.raises(ValueError): + aa.get_labels(df=df, positive_label=1, col_label="missing") + + def test_positive_label_absent(self): + df = pd.DataFrame({"label": [1, 2, 1]}) + with pytest.raises(ValueError): + aa.get_labels(df=df, positive_label=9) diff --git a/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py b/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py new file mode 100644 index 00000000..2accb502 --- /dev/null +++ b/tests/unit/dpulearn_tests/test_dpulearn_mine_negatives.py @@ -0,0 +1,174 @@ +""" +This script tests the dPULearn.mine_negatives() convenience method (issue #308). 
+ +mine_negatives is additive sugar over dPULearn.fit: it stacks X_pos over X_unlabeled, +builds a 1 (positive) / 2 (unlabeled) label vector, fits, and returns the boolean mask of +identified reliable negatives over the rows of X_unlabeled. The key contract is that the +mask equals the manual ``labels_[len(X_pos):] == 0`` result exactly, and that the existing +``fit`` path stays byte-identical (no algorithm change). +""" +import numpy as np +import pytest + +import aaanalysis as aa + + +# Helper functions +def _make_data(n_pos=20, n_unl=50, n_features=8, seed=0): + rng = np.random.default_rng(seed) + X_pos = rng.normal(0.0, 1.0, size=(n_pos, n_features)) + X_unl = rng.normal(0.6, 1.0, size=(n_unl, n_features)) + return X_pos, X_unl + + +def _manual_mask(X_pos, X_unl, random_state=42, **fit_kwargs): + """Reproduce the notebook cell 18/24 manual stacking path.""" + X_pool = np.vstack([X_pos, X_unl]) + y_pool = np.array([1] * len(X_pos) + [2] * len(X_unl)) + dpul = aa.dPULearn(random_state=random_state, verbose=False) + dpul.fit(X=X_pool, labels=y_pool, **fit_kwargs) + return np.asarray(dpul.labels_)[len(X_pos):] == 0, dpul + + +# Normal Cases Test Class +class TestMineNegatives: + """Test dPULearn.mine_negatives() for each parameter individually.""" + + def test_returns_boolean_mask_over_unlabeled(self): + X_pos, X_unl = _make_data() + dpul = aa.dPULearn(random_state=42, verbose=False) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=10) + assert isinstance(mask, np.ndarray) + assert mask.dtype == bool + assert mask.shape == (X_unl.shape[0],) + assert mask.sum() == 10 + + def test_X_pos_parameter(self): + X_pos, X_unl = _make_data(n_pos=30) + dpul = aa.dPULearn(random_state=42, verbose=False) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=5) + assert mask.shape[0] == X_unl.shape[0] + + def test_X_unlabeled_parameter(self): + X_pos, X_unl = _make_data(n_unl=70) + dpul = aa.dPULearn(random_state=42, verbose=False) + mask = 
dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=12) + assert mask.shape[0] == 70 + assert mask.sum() == 12 + + def test_n_neg_parameter(self): + X_pos, X_unl = _make_data() + for n in (1, 5, 25): + dpul = aa.dPULearn(random_state=42, verbose=False) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=n) + assert mask.sum() == n + + def test_metric_parameter(self): + X_pos, X_unl = _make_data() + for metric in ("euclidean", "manhattan", "cosine"): + dpul = aa.dPULearn(random_state=42, verbose=False) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, + n_neg=10, metric=metric) + assert mask.sum() == 10 + + def test_n_components_parameter(self): + X_pos, X_unl = _make_data() + for n_components in (2, 3, 0.5): + dpul = aa.dPULearn(random_state=42, verbose=False) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, + n_neg=10, n_components=n_components) + assert mask.sum() == 10 + + def test_instance_attributes_set(self): + """After mining, labels_ / df_pu_ are set so the plotting class works.""" + X_pos, X_unl = _make_data() + dpul = aa.dPULearn(random_state=42, verbose=False) + dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=10) + assert dpul.labels_ is not None + assert dpul.labels_.shape[0] == X_pos.shape[0] + X_unl.shape[0] + assert dpul.df_pu_ is not None + + +# Regression / golden equivalence +class TestMineNegativesEquivalence: + """The mask must equal the manual stacking path exactly (KPI #308).""" + + @pytest.mark.parametrize("seed", [0, 1, 7]) + def test_mask_equals_manual_pca(self, seed): + X_pos, X_unl = _make_data(seed=seed) + manual_mask, dpul_m = _manual_mask(X_pos, X_unl, n_unl_to_neg=10) + dpul = aa.dPULearn(random_state=42, verbose=False) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=10) + assert np.array_equal(mask, manual_mask) + assert np.array_equal(np.asarray(dpul.labels_), np.asarray(dpul_m.labels_)) + + def test_mask_equals_manual_metric(self): + X_pos, X_unl = 
_make_data(seed=3) + manual_mask, _ = _manual_mask(X_pos, X_unl, n_unl_to_neg=8, metric="cosine") + dpul = aa.dPULearn(random_state=42, verbose=False) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, + n_neg=8, metric="cosine") + assert np.array_equal(mask, manual_mask) + + def test_mask_equals_manual_few_positives(self): + # n_pos < 3: the manual stacked path accepts it (the >=3 floor applies to the + # stacked matrix), so mine_negatives must match it, not reject the small pos set. + X_pos, X_unl = _make_data(n_pos=1, seed=5) + manual_mask, _ = _manual_mask(X_pos, X_unl, n_unl_to_neg=6) + dpul = aa.dPULearn(random_state=42, verbose=False) + mask = dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=6) + assert np.array_equal(mask, manual_mask) + + +# Negative Cases Test Class +class TestMineNegativesNegative: + """Invalid inputs must raise informative ValueErrors.""" + + def test_feature_mismatch(self): + X_pos, _ = _make_data(n_features=8) + _, X_unl = _make_data(n_features=6) + dpul = aa.dPULearn(random_state=42, verbose=False) + with pytest.raises(ValueError): + dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=5) + + def test_n_neg_below_one(self): + X_pos, X_unl = _make_data() + dpul = aa.dPULearn(random_state=42, verbose=False) + # The error must name 'n_neg' (not the internal 'n_unl_to_neg' fit sees). 
+ with pytest.raises(ValueError, match="n_neg"): + dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=0) + + def test_too_many_negatives_requested(self): + X_pos, X_unl = _make_data(n_unl=10) + dpul = aa.dPULearn(random_state=42, verbose=False) + with pytest.raises(ValueError): + dpul.mine_negatives(X_pos=X_pos, X_unlabeled=X_unl, n_neg=999) + + def test_X_pos_none(self): + _, X_unl = _make_data() + dpul = aa.dPULearn(random_state=42, verbose=False) + with pytest.raises(ValueError): + dpul.mine_negatives(X_pos=None, X_unlabeled=X_unl, n_neg=5) + + def test_X_unlabeled_none(self): + X_pos, _ = _make_data() + dpul = aa.dPULearn(random_state=42, verbose=False) + with pytest.raises(ValueError): + dpul.mine_negatives(X_pos=X_pos, X_unlabeled=None, n_neg=5) + + +# Existing-fit byte-identical regression +class TestFitUnchanged: + """The pre-existing fit(X, labels=...) path stays byte-identical (#308 no-change).""" + + def test_fit_pca_unchanged(self): + X_pos, X_unl = _make_data(seed=11) + X_pool = np.vstack([X_pos, X_unl]) + y_pool = np.array([1] * len(X_pos) + [2] * len(X_unl)) + dpul = aa.dPULearn(random_state=42, verbose=False) + dpul.fit(X=X_pool, labels=y_pool, n_unl_to_neg=10) + labels = np.asarray(dpul.labels_) + # contract: positives stay 1, exactly 10 mined negatives become 0, rest stay 2 + assert (labels[:len(X_pos)] == 1).all() + assert (labels == 0).sum() == 10 + assert set(np.unique(labels)).issubset({0, 1, 2}) diff --git a/tests/unit/plotting_tests/test_color_samples_constants.py b/tests/unit/plotting_tests/test_color_samples_constants.py new file mode 100644 index 00000000..52826a1b --- /dev/null +++ b/tests/unit/plotting_tests/test_color_samples_constants.py @@ -0,0 +1,36 @@ +""" +This script tests the named sample-color constants exposed at top level (issue #308). + +COLOR_SAMPLES_POS / NEG / UNL / REL_NEG are public, named aliases for the canonical sample +colors. 
They must equal today's ``plot_get_cdict("DICT_COLOR")["SAMPLES_*"]`` values exactly, +so users can reference a named constant instead of indexing the color dict by string key. +""" +import pytest + +import aaanalysis as aa + + +# Golden equivalence test +class TestColorSamplesConstants: + """Named constants must equal the plot_get_cdict values (golden KPI #308).""" + + def test_constants_exist_at_top_level(self): + for name in ("COLOR_SAMPLES_POS", "COLOR_SAMPLES_NEG", + "COLOR_SAMPLES_UNL", "COLOR_SAMPLES_REL_NEG"): + assert hasattr(aa, name) + assert name in aa.__all__ + + @pytest.mark.parametrize("const_name,dict_key", [ + ("COLOR_SAMPLES_POS", "SAMPLES_POS"), + ("COLOR_SAMPLES_NEG", "SAMPLES_NEG"), + ("COLOR_SAMPLES_UNL", "SAMPLES_UNL"), + ("COLOR_SAMPLES_REL_NEG", "SAMPLES_REL_NEG"), + ]) + def test_constant_equals_cdict_value(self, const_name, dict_key): + dict_color = aa.plot_get_cdict(name="DICT_COLOR") + assert getattr(aa, const_name) == dict_color[dict_key] + + def test_constants_are_strings(self): + for name in ("COLOR_SAMPLES_POS", "COLOR_SAMPLES_NEG", + "COLOR_SAMPLES_UNL", "COLOR_SAMPLES_REL_NEG"): + assert isinstance(getattr(aa, name), str)