-
Notifications
You must be signed in to change notification settings - Fork 0
Closes #22. Added a test that the memory usage doesn't balloon. #23
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,41 @@ | ||
# Nightly (and manually triggerable) run of the tests marked "slow",
# executed under memray so that memory limits declared on tests are enforced.
name: performance_test

on:
  schedule:
    - cron: "0 0 * * *"
      timezone: "America/Vancouver"
  workflow_dispatch:

concurrency:
  # NOTE(review): github.head_ref is presumably empty for schedule and
  # workflow_dispatch triggers, so every run would share the group "test-";
  # confirm this is the intended grouping.
  group: test-${{ github.head_ref }}
  cancel-in-progress: true

env:
  PYTHONUNBUFFERED: "1"
  FORCE_COLOR: "1"

jobs:
  run:
    name: Python ${{ matrix.python-version }} Performance Tests
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]
        python-version: ["3.11", "3.12", "3.13"]

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        # The hosted runner user is not root, so apt needs sudo; -y keeps the
        # install non-interactive.
        run: |
          sudo apt-get update && sudo apt-get install -y yamllint
          pip install uv

      - name: Run slow tests
        run: uv run pytest --memray -m slow --no-cov
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,96 @@ | ||
| #! /usr/bin/env python | ||
|
|
||
| import argparse | ||
| import csv | ||
| import glob | ||
| import json | ||
| import os.path | ||
| import re | ||
| import subprocess | ||
| from typing import TypedDict | ||
|
|
||
|
|
||
# Patterns for the two statistics lines this script extracts from the verbose
# report captured from "/usr/bin/time -v".  MULTILINE lets ^ and $ anchor on
# each line of the captured report rather than on the whole string.
TIME_REGEX = re.compile(
    r"^\s*Elapsed \(wall clock\) time \(h:mm:ss or m:ss\): (.*)$",
    flags=re.MULTILINE,
)
MEMORY_REGEX = re.compile(
    r"^\s*Maximum resident set size \(kbytes\): (.*)$",
    flags=re.MULTILINE,
)


def get_wall_clock_time(time_output: str) -> str:
    """Return the elapsed wall-clock time reported in *time_output*.

    Args:
        time_output: Text containing an "Elapsed (wall clock) time" line.

    Returns:
        The value following the label, exactly as it appears in the text.

    Raises:
        ValueError: If no "Elapsed (wall clock) time" line is present.
    """
    match = TIME_REGEX.search(time_output)
    if match is None:
        raise ValueError(
            "No 'Elapsed (wall clock) time' line found in the time output."
        )
    return match.group(1)


def get_max_memory_usage(time_output: str) -> str:
    """Return the maximum resident set size reported in *time_output*.

    Args:
        time_output: Text containing a "Maximum resident set size" line.

    Returns:
        The value following the label (in kbytes, per the label), as a string.

    Raises:
        ValueError: If no "Maximum resident set size" line is present.
    """
    match = MEMORY_REGEX.search(time_output)
    if match is None:
        raise ValueError(
            "No 'Maximum resident set size' line found in the time output."
        )
    return match.group(1)
|
|
||
|
|
||
class ResourceSummary(TypedDict):
    """One row of the resource-usage report for a single processed sample."""

    # Sample identifier (the input file name without its ".BA.txt" suffix).
    sample_name: str
    # Elapsed wall-clock time, kept as the raw string taken from the report.
    wall_clock_time: str
    # Maximum resident set size in kilobytes, kept as the raw string.
    max_memory_usage_kb: str
|
|
||
|
|
||
def main():
    """Run ``interpret_from_json`` on every sample and summarize its resource use.

    For each ``<sample>.BA.txt`` file in the input directory, the matching
    ``<sample>.BB.txt`` is read, both sequences are written to
    ``<sample>.json`` in the same directory, and ``interpret_from_json`` is run
    on that file under ``/usr/bin/time -v``.  The wall-clock time and maximum
    memory usage parsed from the captured stderr are collected and written to
    the output CSV, one row per sample.

    Raises:
        FileNotFoundError: If a ``.BA.txt`` file has no ``.BB.txt`` partner.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    from pathlib import Path

    # The text must be passed as ``description``: the first positional
    # parameter of ArgumentParser is ``prog``.
    parser = argparse.ArgumentParser(
        description="Process HLA sequences and report the resource usage."
    )
    parser.add_argument(
        "input_dir",
        type=Path,
        help="Directory to scan for HLA sequences",
    )
    parser.add_argument(
        "--output_csv",
        type=Path,
        help="CSV file summary",
        default="out.csv",
    )
    args = parser.parse_args()

    resource_summaries: list[ResourceSummary] = []

    # Sorted so the CSV row order is stable from run to run.
    for exon1_path in sorted(args.input_dir.glob("*.BA.txt")):
        sample_name: str = exon1_path.name.removesuffix(".BA.txt")
        exon2_path = exon1_path.with_name(f"{sample_name}.BB.txt")
        exon1: str = exon1_path.read_text().strip()
        exon2: str = exon2_path.read_text().strip()

        json_input = {
            "seq1": exon1,
            "seq2": exon2,
            "locus": "B",
        }
        json_path = args.input_dir / f"{sample_name}.json"
        json_path.write_text(json.dumps(json_input))

        print(f"----\nSample {sample_name}:")
        result = subprocess.run(
            [
                "/usr/bin/time",
                "-v",
                "interpret_from_json",
                str(json_path),
            ],
            capture_output=True,
            text=True,
        )
        print("stdout:")
        print(result.stdout)
        print("stderr:")
        print(result.stderr)

        # The resource statistics are parsed out of the captured stderr.
        resource_summaries.append(
            {
                "sample_name": sample_name,
                "wall_clock_time": get_wall_clock_time(result.stderr),
                "max_memory_usage_kb": get_max_memory_usage(result.stderr),
            }
        )

    # newline="" lets the csv module control line endings itself.
    with args.output_csv.open("w", newline="") as f:
        resource_summary_writer = csv.DictWriter(
            f,
            fieldnames=("sample_name", "wall_clock_time", "max_memory_usage_kb"),
        )
        resource_summary_writer.writeheader()
        resource_summary_writer.writerows(resource_summaries)
|
Comment on lines
+86
to
+92
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I see Pandas is in the dev dependency groups. This could be `pandas.DataFrame(data=resource_summaries).to_csv(args.output)`. I'm not too concerned with that one.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Stop trying to make pandas a thing |
||
|
|
||
|
|
||
# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,27 @@ | ||
| import numpy as np | ||
| import pytest | ||
|
|
||
| from hla_algorithm.hla_algorithm import HLAAlgorithm | ||
| from hla_algorithm.models import HLASequence, HLAStandard | ||
|
|
||
|
|
||
@pytest.mark.slow
@pytest.mark.limit_memory("500 MB")
def test_acceptable_memory_usage():
    """Interpret a deliberately expensive sequence under the memory limit.

    The sequence is produced by "mushing together" B*07:02:01G and
    B*45:01:01G (element-wise bitwise OR of their values), which as of the
    v2.63.0-alpha HLA alleles produces an expensive calculation.
    """
    algorithm = HLAAlgorithm()
    b_standards = algorithm.hla_standards["B"]

    first: HLAStandard = b_standards["B*07:02:01G"]
    second: HLAStandard = b_standards["B*45:01:01G"]

    # Combine the two standards position by position.
    merged_two = np.array(first.two) | np.array(second.two)
    merged_three = np.array(first.three) | np.array(second.three)

    expensive_sequence = HLASequence(
        two=(int(value) for value in merged_two),
        intron=(),
        three=(int(value) for value in merged_three),
        name="expensive_sequence",
        locus="B",
    )

    algorithm.interpret(expensive_sequence)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would recommend these be both of type
`Path`: https://docs.python.org/3/library/argparse.html#type
You could be a bit more explicit and type the directory as "a directory", see this example: https://stackoverflow.com/a/51212150