Skip to content
9 changes: 8 additions & 1 deletion aaanalysis/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .data_handling import (load_dataset, load_scales, load_features,
from .data_handling import (load_dataset, load_scales, load_features, get_labels,
read_fasta, to_fasta,
SequencePreprocessor,
EmbeddingPreprocessor,
Expand All @@ -14,6 +14,8 @@
comp_per_protein_ap, comp_detection_metrics,
comp_bootstrap_ci, comp_smooth_scores)
from .config import options
from ._constants import (COLOR_SAMPLES_POS, COLOR_SAMPLES_NEG,
COLOR_SAMPLES_UNL, COLOR_SAMPLES_REL_NEG)

from importlib.metadata import version as _version, PackageNotFoundError

Expand All @@ -28,6 +30,7 @@
"load_dataset",
"load_scales",
"load_features",
"get_labels",
"read_fasta",
"to_fasta",
"SequencePreprocessor",
Expand Down Expand Up @@ -72,6 +75,10 @@
"comp_detection_metrics",
"comp_bootstrap_ci",
"comp_smooth_scores",
"COLOR_SAMPLES_POS",
"COLOR_SAMPLES_NEG",
"COLOR_SAMPLES_UNL",
"COLOR_SAMPLES_REL_NEG",
"options"
]

Expand Down
9 changes: 9 additions & 0 deletions aaanalysis/_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -478,6 +478,15 @@ def _folder_path(super_folder, folder_name):
COLOR_NEG = "#ad4570" # (173,69,112)
COLOR_REL_NEG = "#ad9745" # (173, 151, 69)

# Public, named aliases for the canonical sample-group colors (positive / negative /
# unlabeled / reliable-negative). They mirror the ``DICT_COLOR["SAMPLES_*"]`` entries
# exactly, so users can reference a named constant (``aa.COLOR_SAMPLES_POS``) instead
# of indexing ``plot_get_cdict("DICT_COLOR")`` by string key.
COLOR_SAMPLES_POS = COLOR_POS
COLOR_SAMPLES_NEG = COLOR_NEG
COLOR_SAMPLES_UNL = COLOR_UNL
COLOR_SAMPLES_REL_NEG = COLOR_REL_NEG

DICT_COLOR = {"SHAP_POS": COLOR_SHAP_POS,
"SHAP_NEG": COLOR_SHAP_NEG,
"FEAT_POS": COLOR_FEAT_POS,
Expand Down
6 changes: 4 additions & 2 deletions aaanalysis/data_handling/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
"""
Data loading and sequence/embedding preprocessing — the package's data entry point.

Public objects: load_dataset, load_scales, load_features, read_fasta, to_fasta,
SequencePreprocessor, EmbeddingPreprocessor, combine_dict_nums.
Public objects: load_dataset, load_scales, load_features, get_labels, read_fasta,
to_fasta, SequencePreprocessor, EmbeddingPreprocessor, combine_dict_nums.
Produces the core data objects the rest of the pipeline consumes: ``load_dataset``
yields ``df_seq``, ``load_scales`` yields ``df_scales`` (fed to
``feature_engineering.AAclust`` / ``CPP``), ``load_features`` yields a reference
Expand All @@ -17,6 +17,7 @@
from ._load_dataset import load_dataset
from ._load_scales import load_scales
from ._load_features import load_features
from ._get_labels import get_labels
from ._read_fasta import read_fasta
from ._to_fasta import to_fasta
from ._seq_preproc import SequencePreprocessor
Expand All @@ -27,6 +28,7 @@
"load_dataset",
"load_scales",
"load_features",
"get_labels",
"read_fasta",
"to_fasta",
"SequencePreprocessor",
Expand Down
70 changes: 70 additions & 0 deletions aaanalysis/data_handling/_get_labels.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""
This is a script for the frontend of the get_labels function, deriving a binary
label vector from a sequence DataFrame's label column.
"""
from typing import Any
import numpy as np
import pandas as pd

import aaanalysis.utils as ut


# I Helper Functions
def check_match_df_positive_label(df=None, col_label=None, positive_label=None) -> None:
    """Ensure that 'positive_label' occurs at least once in column 'col_label' of 'df'.

    Raises a ``ValueError`` listing the distinct column values (ordered by their string
    form) when the positive label is absent; returns ``None`` otherwise.
    """
    # Distinct values of the label column as plain Python objects
    observed = set(df[col_label].tolist())
    if positive_label in observed:
        return
    # Sort by string representation so mixed-type columns can still be listed
    list_observed = sorted(observed, key=str)
    raise ValueError(f"'positive_label' ({positive_label}) is not among the values of "
                     f"column '{col_label}' ({list_observed}).")


# II Main Functions
def get_labels(df: pd.DataFrame,
               positive_label: Any = 1,
               col_label: str = "label",
               ) -> np.ndarray:
    """
    Binarize a label column of a sequence DataFrame into an ``int`` label vector.

    Every row whose ``col_label`` value equals ``positive_label`` is encoded as ``1``;
    every other row is encoded as ``0``. This is the binary encoding consumed across the
    package (e.g. by :meth:`CPP.run`, :class:`TreeModel`, and the ``labels`` argument of
    most tools), offered as a single call in place of the recurring
    ``(df[col] == x).astype(int).to_numpy()`` expression.

    .. versionadded:: 1.1.0

    Parameters
    ----------
    df : pd.DataFrame, shape (n_samples, n_seq_info)
        Sequence DataFrame (``df_seq``) that contains the label column ``col_label``.
    positive_label : int or str, default=1
        Value of ``col_label`` that marks the positive class. Rows equal to it are set to
        ``1`` and all remaining rows to ``0``. Must occur in ``col_label``.
    col_label : str, default='label'
        Name of the column with the (multi-value or already binary) labels.

    Returns
    -------
    labels : np.ndarray, shape (n_samples,)
        Binary ``int`` label vector (``1`` = positive, ``0`` = otherwise) in the row order
        of ``df``.

    Notes
    -----
    * The result equals ``(df[col_label] == positive_label).astype(int).to_numpy()``.
    * The returned vector can be passed directly as the ``labels`` argument of CPP,
      TreeModel, or other tools. For Positive-Unlabeled mining keep the package ``1``
      (positive) / ``2`` (unlabeled) markers instead and use
      :meth:`dPULearn.mine_negatives`.

    Examples
    --------
    .. include:: examples/get_labels.rst
    """
    # Validate arguments: column name, then DataFrame, then presence of the positive label
    ut.check_str(name="col_label", val=col_label, accept_none=False)
    ut.check_df(name="df", df=df, cols_required=col_label)
    check_match_df_positive_label(df=df, col_label=col_label, positive_label=positive_label)
    # Element-wise match against the positive label, cast to int, exported as NumPy array
    is_positive = df[col_label] == positive_label
    return is_positive.astype(int).to_numpy()
90 changes: 90 additions & 0 deletions aaanalysis/pu_learning/_dpulearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,15 @@ def check_match_X_X_neg(X=None, X_neg=None) -> None:
raise ValueError(f"'n_features' does not match between 'X' (n={n_features}) and 'X_neg' (n={n_features_neg})")


def check_match_X_pos_X_unlabeled(X_pos=None, X_unlabeled=None) -> None:
    """Ensure that 'X_pos' and 'X_unlabeled' have the same number of feature columns.

    Raises a ``ValueError`` reporting both feature counts on mismatch; returns ``None``
    otherwise.
    """
    # Second axis of each matrix holds the features
    n_features_pos, n_features_unl = X_pos.shape[1], X_unlabeled.shape[1]
    if n_features_pos == n_features_unl:
        return
    raise ValueError(f"'n_features' does not match between 'X_pos' (n={n_features_pos}) and "
                     f"'X_unlabeled' (n={n_features_unl})")


# II Main Functions
class dPULearn(Wrapper):
"""
Expand Down Expand Up @@ -358,6 +367,87 @@ def fit(self,
self.df_pu_ = df_pu
return self

def mine_negatives(self,
                   X_pos: ut.ArrayLike2D,
                   X_unlabeled: ut.ArrayLike2D,
                   n_neg: int,
                   metric: Optional[Literal["euclidean", "manhattan", "cosine"]] = None,
                   n_components: Union[float, int] = 0.80,
                   ) -> np.ndarray:
    """
    Mine reliable negatives from an unlabeled pool given the positives, in one call.

    Convenience wrapper around :meth:`dPULearn.fit` for the common positive/unlabeled
    setup: instead of stacking ``X_pos`` and ``X_unlabeled`` by hand, building a
    ``1`` (positive) / ``2`` (unlabeled) label vector, fitting, and slicing the mined
    rows back out by index, pass the two feature matrices separately and receive a
    **boolean mask over the rows of** ``X_unlabeled`` flagging the identified reliable
    negatives. The mask equals ``labels_[len(X_pos):] == 0`` from the manual stacking
    path exactly.

    After the call the instance is fitted: :attr:`dPULearn.labels_` (over the stacked
    ``X_pos`` then ``X_unlabeled``) and :attr:`dPULearn.df_pu_` are set, so the
    :class:`dPULearnPlot` methods work as usual.

    .. versionadded:: 1.1.0

    Parameters
    ----------
    X_pos : array-like, shape (n_pos, n_features)
        Feature matrix of the positive samples.
    X_unlabeled : array-like, shape (n_unl, n_features)
        Feature matrix of the unlabeled samples (the candidate pool). Must have the
        same number of features as ``X_pos``.
    n_neg : int
        Number of reliable negatives to identify from the unlabeled pool (>= 1). Must not
        exceed the number of unlabeled samples.
    metric : str or None, optional
        Distance metric for distance-based identification (``euclidean``,
        ``manhattan``, ``cosine``). If ``None``, PCA-based identification is performed.
    n_components : int or float, default=0.80
        Number of principal components (int >= 1) or fraction of variance covered
        (float in ``(0.0, 1.0)``) when PCA is applied.

    Returns
    -------
    mask_neg : np.ndarray, shape (n_unl,)
        Boolean mask over the rows of ``X_unlabeled``: ``True`` marks an identified
        reliable negative. ``X_unlabeled[mask_neg]`` are the mined negatives.

    Raises
    ------
    ValueError
        If ``X_pos`` and ``X_unlabeled`` differ in their number of features, or if
        ``n_neg`` exceeds the number of rows of ``X_unlabeled``. Further validation
        errors may be raised by the input checks and by :meth:`dPULearn.fit`.

    Notes
    -----
    * This is purely additive sugar: it stacks the inputs and calls
      :meth:`dPULearn.fit` with ``label_pos=1`` / ``label_unl=2`` internally, so the
      identification result is identical to the manual path.

    See Also
    --------
    * :meth:`dPULearn.fit`: the underlying fit on a stacked matrix and label vector.
    * :func:`get_labels`: derive a binary label vector from a sequence DataFrame.

    Examples
    --------
    .. include:: examples/dpul_mine_negatives.rst
    """
    # Check input (the >=3 sample floor applies to the stacked matrix, enforced by
    # 'fit' below, so per-matrix validation only coerces + checks the feature dimension;
    # this keeps mine_negatives accepting exactly what the manual stacking path accepts)
    X_pos = ut.check_X(X=X_pos, X_name="X_pos", min_n_samples=1)
    X_unlabeled = ut.check_X(X=X_unlabeled, X_name="X_unlabeled", min_n_samples=1)
    check_match_X_pos_X_unlabeled(X_pos=X_pos, X_unlabeled=X_unlabeled)
    n_pos, n_unl = X_pos.shape[0], X_unlabeled.shape[0]
    # Validate n_neg here so the message names 'n_neg' (fit sees it as 'n_unl_to_neg').
    # The type/lower-bound check runs first so the comparison below only sees integers.
    ut.check_number_range(name="n_neg", val=n_neg, min_val=1, just_int=True)
    if n_neg > n_unl:
        # Without this, an oversized request would only be reported by 'fit' under the
        # internal name 'n_unl_to_neg', which the caller never passed
        raise ValueError(f"'n_neg' ({n_neg}) should not exceed the number of unlabeled "
                         f"samples in 'X_unlabeled' (n={n_unl})")
    # Stack positives over the unlabeled pool and label them with the package PU markers
    # (1 = positive for the first n_pos rows, 2 = unlabeled for the remaining n_unl rows)
    X = np.vstack([X_pos, X_unlabeled])
    labels = np.repeat([1, 2], [n_pos, n_unl])
    # No pre-labeled negatives here, so n_neg is exactly the count to draw from the pool
    self.fit(X=X, labels=labels, label_pos=1, label_unl=2,
             n_unl_to_neg=n_neg, metric=metric, n_components=n_components)
    # Slice the mined reliable negatives (label 0) back out of the unlabeled block
    mask_neg = np.asarray(self.labels_)[n_pos:] == 0
    return mask_neg

@staticmethod
def eval(X: ut.ArrayLike2D,
list_labels: ut.ArrayLike2D,
Expand Down
2 changes: 2 additions & 0 deletions docs/_cheatsheet/content.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@
("Load benchmark sequences", "load_dataset(name) → df_seq", None),
("Load AAontology scales", "load_scales() → df_scales", None),
("Load precomputed features", "load_features(name) → df_feat", None),
("Binary labels from df column", "get_labels(df, positive_label) → labels", None, "v1.1"),
("Read / write FASTA", "read_fasta(file) → df_seq", None),
("Cluster redundant homologs", "filter_seq(df_seq) → df_clust [pro]", None),
]},
Expand Down Expand Up @@ -221,6 +222,7 @@
{"name": "Modeling & Explainability", "tag": "PU · classify · SHAP",
"rows": [
("Train with positives + unlabeled data", "dPULearn().fit(X, labels) [Wrapper]", None),
("Mine reliable negatives (mask)", "dPULearn().mine_negatives(X_pos, X_unlabeled) → mask", None, "v1.1"),
("Train + RFE + MC importance", "TreeModel().fit(X, labels) [Wrapper]", None),
("Per-feature / sample SHAP impact", "ShapModel().fit(X, labels) [pro]", None),
]},
Expand Down
19 changes: 19 additions & 0 deletions docs/source/index/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ Added
per-residue PTM and functional-site annotations and encodes them into tensors
(``fetch_uniprot``, ``ingest``, ``register_feature``, ``encode``, ``build_scales``,
``build_cat``, ``to_df_seq``).
- :func:`~aaanalysis.get_labels`: Derives a binary ``int`` label vector from a sequence DataFrame's
  label column (``positive_label`` mapped to ``1``, everything else to ``0``) — the
  single-call form of the recurring ``(df[col] == x).astype(int).to_numpy()`` expression.
- :func:`~aaanalysis.combine_dict_nums`: Concatenates per-residue tensors (embedding / structure /
annotation) along the feature axis into one combined :meth:`~aaanalysis.CPP.run_num` input.

Expand Down Expand Up @@ -132,6 +137,16 @@ Added
switches the pre-computed prediction per P1 (feature map + structure restyle) with no kernel,
keeping the column-residue linking (warned past 40 sites, hard-capped at 200).

**PU Learning**

- :meth:`dPULearn.mine_negatives <aaanalysis.dPULearn.mine_negatives>`: One-call convenience over
  :meth:`dPULearn.fit <aaanalysis.dPULearn.fit>` for the common positive / unlabeled setup. Pass
  ``X_pos`` and ``X_unlabeled`` separately, together with the number of negatives to mine
  (``n_neg``), instead of stacking them by hand, building a ``1`` / ``2`` label vector, fitting,
  and slicing the mined rows back out; it returns the **boolean mask over the rows of**
  ``X_unlabeled`` flagging the identified reliable negatives (equal to the manual
  ``labels_[len(X_pos):] == 0`` result exactly). The instance is left fitted (``labels_`` /
  ``df_pu_`` set, so ``dPULearnPlot`` works), and the existing ``fit(X, labels=...)`` path is
  unchanged.

**Sequence Analysis**

- :class:`~aaanalysis.AAWindowSampler`: Samples fixed-length sequence windows for PU-learning and
Expand Down Expand Up @@ -191,6 +206,10 @@ Added

- :func:`~aaanalysis.plot_rank`: Standalone per-protein max-score-vs-rank scatter with group coloring and
optional threshold lines (pairs with the new ``aa.metrics`` functions).
- **COLOR_SAMPLES_POS / COLOR_SAMPLES_NEG / COLOR_SAMPLES_UNL / COLOR_SAMPLES_REL_NEG**:
Public, named constants for the canonical sample-group colors (positive / negative /
unlabeled / reliable-negative). They equal the ``plot_get_cdict("DICT_COLOR")["SAMPLES_*"]``
values exactly, so a named constant replaces indexing the color dict by string key.

**Golden Pipelines**

Expand Down
Loading
Loading