3 changes: 2 additions & 1 deletion .devcontainer/devcontainer.json
@@ -41,7 +41,8 @@
"openssh-client",
"libaio-dev",
"unzip",
"yamllint"
"yamllint",
"time"
],
"install Ruby": [
"/usr/bin/apt",
41 changes: 41 additions & 0 deletions .github/workflows/acceptable_memory_usage_test.yml
@@ -0,0 +1,41 @@
name: performance_test

on:
  schedule:
    # Note: GitHub Actions cron schedules always run in UTC; a "timezone" key is not supported here.
    - cron: "0 0 * * *"
  workflow_dispatch:

concurrency:
  group: test-${{ github.head_ref }}
  cancel-in-progress: true

env:
  PYTHONUNBUFFERED: "1"
  FORCE_COLOR: "1"

jobs:
  run:
    name: Python ${{ matrix.python-version }} Performance Tests
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]
        python-version: ["3.11", "3.12", "3.13"]

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          sudo apt-get update && sudo apt-get install -y yamllint
          pip install uv

      - name: Run slow tests
        run: uv run pytest --memray -m slow --no-cov
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -39,6 +39,7 @@ dependencies = [

[dependency-groups]
dev = [
"gprof2dot>=2025.4.14",
"mypy>=1.15.0",
"mypy-extensions>=1.0.0",
"pandas>=2.2.3",
@@ -47,6 +48,7 @@ dev = [
"pytest>=8.3.5",
"pytest-cov>=6.0.0",
"pytest-html>=4.1.1",
"pytest-memray>=1.8.0",
"pytest-mock>=3.14.0",
"pytest-xdist>=3.6.1",
"ruff>=0.9.9",
12 changes: 1 addition & 11 deletions src/hla_algorithm/hla_algorithm.py
@@ -259,9 +259,6 @@ def combine_standards_stepper(
       - this is below our mismatch threshold.
     If the mismatch threshold is 0, then we will only ever get the former.
     """
-    # Keep track of matches we've already found:
-    combos: dict[tuple[int, ...], int] = {}
-
     current_rejection_threshold: int | float = float("inf")
     for std_ai, std_a in enumerate(matching_stds):
         if std_a.mismatch > current_rejection_threshold:
@@ -278,14 +275,7 @@
             # same sequence, so check if this one's already been found.
             combined_std_bin: tuple[int, ...] = tuple(int(s) for s in std_bin)

-            mismatches: int = -1
-            if combined_std_bin in combos:
-                mismatches = combos[combined_std_bin]
-
-            else:
-                # Note that seq is implicitly cast to a NumPy array:
-                mismatches = np.count_nonzero(std_bin ^ seq != 0)
-                combos[combined_std_bin] = mismatches  # cache this value
+            mismatches: int = np.count_nonzero(std_bin ^ seq != 0)

             if mismatches > current_rejection_threshold:
                 continue
96 changes: 96 additions & 0 deletions src/scripts/measure_resources.py
@@ -0,0 +1,96 @@
#! /usr/bin/env python

import argparse
import csv
import glob
import json
import os.path
import re
import subprocess
from typing import TypedDict


TIME_REGEX = re.compile(
    r"^\s*Elapsed \(wall clock\) time \(h:mm:ss or m:ss\): (.*)$",
    flags=re.MULTILINE,
)
MEMORY_REGEX = re.compile(
    r"^\s*Maximum resident set size \(kbytes\): (.*)$",
    flags=re.MULTILINE,
)

def get_wall_clock_time(time_output: str) -> str:
    return TIME_REGEX.search(time_output).group(1)

def get_max_memory_usage(time_output: str) -> str:
    return MEMORY_REGEX.search(time_output).group(1)


class ResourceSummary(TypedDict):
    sample_name: str
    wall_clock_time: str
    max_memory_usage_kb: str


def main():
    parser = argparse.ArgumentParser(
        description="Process HLA sequences and report the resource usage."
    )
    parser.add_argument("input_dir", help="Directory to scan for HLA sequences")
    parser.add_argument("--output_csv", help="CSV file summary", default="out.csv")
Comment on lines +39 to +40
Collaborator:
I would recommend these be both of type Path

https://docs.python.org/3/library/argparse.html#type

You could be a bit more explicit and type the directory as "a directory", see this example: https://stackoverflow.com/a/51212150
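A minimal sketch of what that could look like (the `dir_path` helper is purely illustrative and not part of this PR):

    from argparse import ArgumentParser, ArgumentTypeError
    from pathlib import Path

    def dir_path(value: str) -> Path:
        # Illustrative helper: reject arguments that are not existing directories.
        path = Path(value)
        if not path.is_dir():
            raise ArgumentTypeError(f"{value} is not a directory")
        return path

    parser = ArgumentParser(description="Process HLA sequences and report the resource usage.")
    parser.add_argument("input_dir", type=dir_path, help="Directory to scan for HLA sequences")
    parser.add_argument("--output_csv", type=Path, help="CSV file summary", default=Path("out.csv"))
    args = parser.parse_args()  # args.input_dir and args.output_csv are now Path objects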

    args = parser.parse_args()

    resource_summaries: list[ResourceSummary] = []
    sample_regex = re.compile(r"^.*/(.*)\.BA\.txt$")
Collaborator:
I'd probably add this regex to the other regexes defined above.
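For instance (the constant name here is just a suggestion):

    SAMPLE_REGEX = re.compile(r"^.*/(.*)\.BA\.txt$")  # alongside TIME_REGEX and MEMORY_REGEX

and the loop below would then call `SAMPLE_REGEX.match(exon1_filename)` instead of compiling the pattern inside `main()`.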

    for exon1_filename in glob.glob(f"{args.input_dir}/*.BA.txt"):
        sample_name: str = sample_regex.match(exon1_filename).group(1)
        exon2_filename: str = os.path.join(args.input_dir, f"{sample_name}.BB.txt")
        with open(exon1_filename) as f:
            exon1: str = f.read().strip()
        with open(exon2_filename) as f:
            exon2: str = f.read().strip()
Comment on lines +45 to +51
Collaborator:
Typing with Path, this could become

    for exon1_filepath in args.input_dir.glob("*.BA.txt"):
        sample_name: str = sample_regex.match(exon1_filepath.as_posix()).group(1)
        exon2_filepath: Path = exon1_filepath.with_name(exon1_filepath.name.replace("BA.txt", "BB.txt"))
        exon1 = exon1_filepath.read_text().strip()
        exon2 = exon2_filepath.read_text().strip()

    ...
    json_filepath = args.input_dir / f"{sample_name}.json"
    json_filepath.write_text(json.dumps(json_input))

    ...
    result = subprocess.run(
        [
            ...,
            json_filepath.as_posix(),
        ]


        json_input = {
            "seq1": exon1,
            "seq2": exon2,
            "locus": "B",
        }
        json_filename: str = os.path.join(args.input_dir, f"{sample_name}.json")
        with open(json_filename, "w") as f:
            json.dump(json_input, f)

print(f"----\nSample {sample_name}:")
result = subprocess.run(
[
"/usr/bin/time",
"-v",
"interpret_from_json",
json_filename,
],
capture_output=True,
text=True,
)
print("stdout:")
print(result.stdout)
print("stderr:")
print(result.stderr)

resource_summaries.append(
{
"sample_name": sample_name,
"wall_clock_time": get_wall_clock_time(result.stderr),
"max_memory_usage_kb": get_max_memory_usage(result.stderr),
}
)

    with open(args.output_csv, "w", newline="") as f:
        resource_summary_writer = csv.DictWriter(
            f,
            fieldnames=("sample_name", "wall_clock_time", "max_memory_usage_kb"),
        )
        resource_summary_writer.writeheader()
        resource_summary_writer.writerows(resource_summaries)
Comment on lines +86 to +92
Collaborator:

I see pandas is in the dev dependency group.

This could be

pandas.DataFrame(data=resource_summaries).to_csv(args.output_csv, index=False)

I'm not too concerned with that one.

Contributor Author:

Stop trying to make pandas a thing



if __name__ == "__main__":
    main()
27 changes: 27 additions & 0 deletions tests/acceptable_memory_usage_test.py
@@ -0,0 +1,27 @@
import numpy as np
import pytest

from hla_algorithm.hla_algorithm import HLAAlgorithm
from hla_algorithm.models import HLASequence, HLAStandard


@pytest.mark.slow
@pytest.mark.limit_memory("500 MB")
def test_acceptable_memory_usage():
    # We process a sequence produced by "mushing together" B*07:02:01G
    # and B*45:01:01G, which as of the v2.63.0-alpha HLA alleles produces
    # an expensive calculation.
    hla_alg = HLAAlgorithm()

    allele_1: HLAStandard = hla_alg.hla_standards["B"]["B*07:02:01G"]
    allele_2: HLAStandard = hla_alg.hla_standards["B"]["B*45:01:01G"]

    expensive_sequence = HLASequence(
        two=(int(s) for s in np.array(allele_1.two) | np.array(allele_2.two)),
        intron=(),
        three=(int(s) for s in np.array(allele_1.three) | np.array(allele_2.three)),
        name="expensive_sequence",
        locus="B",
    )

    hla_alg.interpret(expensive_sequence)