From da2ecc8ea179afb45dbbfe8bdbb78bb6bfe31ea8 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Thu, 7 May 2026 13:24:41 +0200 Subject: [PATCH 1/4] fix wf for negative control --- scripts/run_benchmark/run_test_local.sh | 18 ++++-------- .../process_prediction/script.py | 28 +++++++++++-------- src/workflows/run_benchmark/main.nf | 10 +++++-- 3 files changed, 29 insertions(+), 27 deletions(-) diff --git a/scripts/run_benchmark/run_test_local.sh b/scripts/run_benchmark/run_test_local.sh index 54d8e3d..befc33b 100755 --- a/scripts/run_benchmark/run_test_local.sh +++ b/scripts/run_benchmark/run_test_local.sh @@ -6,14 +6,6 @@ REPO_ROOT=$(git rev-parse --show-toplevel) # ensure that the command below is run from the root of the repository cd "$REPO_ROOT" -# remove this when you have implemented the script -echo "TODO: once the 'run_benchmark' workflow has been implemented, update this script to use it." -echo " Step 1: replace 'task_template' with the name of the task in the following command." -echo " Step 2: replace the rename keys parameters to fit your run_benchmark inputs" -echo " Step 3: replace the settings parameter to fit your run_benchmark outputs" -echo " Step 4: remove this message" -exit 1 - set -e echo "Running benchmark on test data" @@ -23,14 +15,14 @@ echo " Make sure to run 'scripts/project/build_all_docker_containers.sh'!" RUN_ID="testrun_$(date +%Y-%m-%d_%H-%M-%S)" publish_dir="temp/results/${RUN_ID}" -nextflow run . \ +NXF_VER=25.10.4 nextflow run . \ -main-script target/nextflow/workflows/run_benchmark/main.nf \ -profile docker \ -resume \ -c common/nextflow_helpers/labels_ci.config \ - --id cxg_mouse_pancreas_atlas \ - --input_train resources_test/task_spatial_segmentation/mouse_brain_combined/train.h5ad \ - --input_test resources_test/task_spatial_segmentation/mouse_brain_combined/test.h5ad \ - --input_solution resources_test/task_spatial_segmentation/mouse_brain_combined/solution.h5ad \ + --id mouse_brain_combined \ + --input_spatial_unlabelled resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr \ + --input_spatial_solution resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_solution.zarr \ + --input_scrnaseq_reference resources_test/task_spatial_segmentation/mouse_brain_combined/scrnaseq_reference.h5ad \ --output_state state.yaml \ --publish_dir "$publish_dir" diff --git a/src/data_processors/process_prediction/script.py b/src/data_processors/process_prediction/script.py index 17c3728..5d1ae2a 100644 --- a/src/data_processors/process_prediction/script.py +++ b/src/data_processors/process_prediction/script.py @@ -83,22 +83,28 @@ table.layers["normalized_log_scaled"] = table.X.copy() print(">> Computing highly variable genes", flush=True) -# Reset X to counts for HVG computation -table.X = table.layers["counts"].copy() -try: - sc.pp.highly_variable_genes(table, flavor="seurat_v3", layer="counts", n_top_genes=min(3000, table.n_vars)) -except ValueError: - # seurat_v3 loess fitting can fail on small datasets; fall back to seurat flavor - sc.pp.normalize_total(table, target_sum=1e4) - sc.pp.log1p(table) - sc.pp.highly_variable_genes(table, flavor="seurat", n_top_genes=min(3000, table.n_vars)) -table.var.rename(columns={"highly_variable": "hvg"}, inplace=True) +if table.n_vars == 0 or table.n_obs == 0: + # No cells detected (e.g. empty_labels negative control); mark all vars as non-HVG + table.var["hvg"] = False +else: + # Reset X to counts for HVG computation + table.X = table.layers["counts"].copy() + try: + sc.pp.highly_variable_genes(table, flavor="seurat_v3", layer="counts", n_top_genes=min(3000, table.n_vars)) + except ValueError: + # seurat_v3 loess fitting can fail on small datasets; fall back to seurat flavor + sc.pp.normalize_total(table, target_sum=1e4) + sc.pp.log1p(table) + sc.pp.highly_variable_genes(table, flavor="seurat", n_top_genes=min(3000, table.n_vars)) + table.var.rename(columns={"highly_variable": "hvg"}, inplace=True) table.uns["dataset_id"] = dataset_id table.uns["method_id"] = method_id table.uns["spatialdata_attrs"] = { "instance_key": "cell_id", - "region": ["segmentation"], + # Derive regions from actual obs to handle the empty-table case (e.g. empty_labels + # negative control) where no cells were detected and obs has 0 rows. + "region": list(table.obs["region"].unique()), "region_key": "region", } diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index d39386c..092e437 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -32,10 +32,11 @@ workflow run_wf { | map{ id, state -> [id, state + ["_meta": [join_id: id]]] } + | view() // extract the dataset metadata | extract_uns_metadata.run( - fromState: [input: "input_spatial_unlabelled"], + fromState: [input: "input_scrnaseq_reference"], toState: { id, output, state -> state + [ dataset_uns: readYaml(output.output).uns @@ -84,7 +85,10 @@ workflow run_wf { ) | process_prediction.run( - fromState: [input: "method_output"], + fromState: [ + input_prediction: "method_output", + input_spatial_unlabelled: "input_spatial_unlabelled" + ], toState: { id, output, state -> state + [ input_prediction: output.output @@ -100,7 +104,7 @@ workflow run_wf { }, // use 'fromState' to fetch the arguments the component requires from the overall state fromState: [ - input_solution: "input_solution", + input_solution: "input_spatial_solution", input_prediction: "input_prediction" ], // use 'toState' to publish that component's outputs to the overall state From f458f4a1fbcf34e0c068d93875c0fb3a5b149e03 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Thu, 7 May 2026 14:10:14 +0200 Subject: [PATCH 2/4] apply fixes --- scripts/create_resources/test_resources.sh | 2 +- scripts/run_benchmark/run_test_local.sh | 3 ++- src/data_processors/process_dataset/script.py | 25 +++++++++++++++++-- src/workflows/run_benchmark/main.nf | 1 - 4 files changed, 26 insertions(+), 5 deletions(-) diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh index 59fc1c2..30694b5 100755 --- a/scripts/create_resources/test_resources.sh +++ b/scripts/create_resources/test_resources.sh @@ -30,7 +30,7 @@ viash run src/data_processors/process_dataset/config.vsh.yaml -- \ --dataset_id mouse_brain_combined \ --dataset_name "Test data mouse brain combined 2023 tenx Xenium replicate 1 2023 Yao scRNAseq" \ --dataset_url "https://www.10xgenomics.com/datasets/fresh-frozen-mouse-brain-replicates-1-standard;https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE246717" \ - --dataset_reference "https://www.10xgenomics.com/datasets/fresh-frozen-mouse-brain-replicates-1-standard;10.1038/s41586-023-06812-z" \ + --dataset_reference "10.1038/s41586-023-06812-z" \ --dataset_summary "Demonstration of gene expression profiling for fresh frozen mouse brain on the Xenium platform using the pre-designed Mouse Brain Gene Expression Panel (v1);A high-resolution scRNAseq atlas of cell types in the whole mouse brain" \ --dataset_description "Demonstration of gene expression profiling for fresh frozen mouse brain on the Xenium platform using the pre-designed Mouse Brain Gene Expression Panel (v1). Replicate results demonstrate the high reproducibility of data generated by the platform. 10x Genomics obtained tissue from a C57BL/6 mouse from Charles River Laboratories. Three adjacent 10µm sections were placed on the same slide. Tissues were prepared following the demonstrated protocols Xenium In Situ for Fresh Frozen Tissues - Tissue Preparation Guide (CG000579) and Xenium In Situ for Fresh Frozen Tissues - Fixation & Permeabilization (CG000581).;See dataset_reference for more information. Note that we only took the 10xv2 data from the dataset." \ --dataset_organism "mus_musculus" diff --git a/scripts/run_benchmark/run_test_local.sh b/scripts/run_benchmark/run_test_local.sh index befc33b..c9ce726 100755 --- a/scripts/run_benchmark/run_test_local.sh +++ b/scripts/run_benchmark/run_test_local.sh @@ -25,4 +25,5 @@ NXF_VER=25.10.4 nextflow run . \ --input_spatial_solution resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_solution.zarr \ --input_scrnaseq_reference resources_test/task_spatial_segmentation/mouse_brain_combined/scrnaseq_reference.h5ad \ --output_state state.yaml \ - --publish_dir "$publish_dir" + --publish_dir "$publish_dir" \ + -with-trace "$publish_dir/trace.txt" diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index ad85433..1cc7675 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -2,6 +2,8 @@ import pandas as pd import spatialdata as sd import scanpy as sc +import os +import shutil ## VIASH START par = { @@ -63,8 +65,9 @@ def sc_processing(adata): print(">> Override dataset metadata in .uns", flush=True) sc_data.uns["orig_dataset_id"] = sc_data.uns.get("dataset_id", None) -for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_summary", "dataset_description", "dataset_reference", "dataset_organism"]: +for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_summary", "dataset_description", "dataset_organism"]: sc_data.uns[key] = par[key] +sc_data.uns["dataset_reference"] = {"doi": par["dataset_reference"]} print(">> Writing scrnaseq reference", flush=True) sc_data.write_h5ad(par["output_scrnaseq_reference"], compression="gzip") @@ -80,7 +83,8 @@ def sc_processing(adata): "dataset_url": par["dataset_url"], "dataset_summary": par["dataset_summary"], "dataset_description": par["dataset_description"], - "dataset_reference": par["dataset_reference"], + # Join list into semicolon-separated string to match the expected string format + "dataset_reference": {"doi": par["dataset_reference"]}, "dataset_organism": par["dataset_organism"], "orig_dataset_id": sp_data.tables["table"].uns.get("dataset_id", None), } @@ -114,6 +118,12 @@ def sc_processing(adata): ) print(">> Writing spatial unlabelled dataset", flush=True) +# remove if output exists +if os.path.exists(par["output_spatial_unlabelled"]): + if os.path.isdir(par["output_spatial_unlabelled"]): + shutil.rmtree(par["output_spatial_unlabelled"]) + else: + os.remove(par["output_spatial_unlabelled"]) output_spatial.write(par["output_spatial_unlabelled"], overwrite=True) # --------------------------------------------------------------- @@ -132,6 +142,12 @@ def sc_processing(adata): var=var_df, uns={ "dataset_id": par["dataset_id"], + "dataset_name": par["dataset_name"], + "dataset_url": par["dataset_url"], + "dataset_summary": par["dataset_summary"], + "dataset_description": par["dataset_description"], + "dataset_reference": {"doi": par["dataset_reference"]}, + "dataset_organism": par["dataset_organism"], "orig_dataset_id": sp_data.tables["table"].uns.get("dataset_id", None), "spatialdata_attrs": ref_table.uns["spatialdata_attrs"], }, @@ -151,4 +167,9 @@ def sc_processing(adata): ) print(">> Writing spatial solution", flush=True) +if os.path.exists(par["output_spatial_solution"]): + if os.path.isdir(par["output_spatial_solution"]): + shutil.rmtree(par["output_spatial_solution"]) + else: + os.remove(par["output_spatial_solution"]) output_solution.write(par["output_spatial_solution"], overwrite=True) diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index 092e437..a61478f 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -32,7 +32,6 @@ workflow run_wf { | map{ id, state -> [id, state + ["_meta": [join_id: id]]] } - | view() // extract the dataset metadata | extract_uns_metadata.run( From e5ae118fd5e142b484579439b748cdc336b4ce5d Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Thu, 7 May 2026 14:47:36 +0200 Subject: [PATCH 3/4] render report after local test run --- scripts/run_benchmark/run_test_local.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/run_benchmark/run_test_local.sh b/scripts/run_benchmark/run_test_local.sh index c9ce726..adc34bb 100755 --- a/scripts/run_benchmark/run_test_local.sh +++ b/scripts/run_benchmark/run_test_local.sh @@ -27,3 +27,5 @@ NXF_VER=25.10.4 nextflow run . \ --output_state state.yaml \ --publish_dir "$publish_dir" \ -with-trace "$publish_dir/trace.txt" + +common/scripts/render_results_report local "$publish_dir" --output "$publish_dir/report/" \ No newline at end of file From f1b5bffa144a2e62edb10f57b0e6e29ea57b1539 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Thu, 7 May 2026 14:47:44 +0200 Subject: [PATCH 4/4] revert reference changes for now --- src/data_processors/process_dataset/script.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index 1cc7675..e1174a2 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -65,9 +65,8 @@ def sc_processing(adata): print(">> Override dataset metadata in .uns", flush=True) sc_data.uns["orig_dataset_id"] = sc_data.uns.get("dataset_id", None) -for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_summary", "dataset_description", "dataset_organism"]: +for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_summary", "dataset_description", "dataset_organism", "dataset_reference"]: sc_data.uns[key] = par[key] -sc_data.uns["dataset_reference"] = {"doi": par["dataset_reference"]} print(">> Writing scrnaseq reference", flush=True) sc_data.write_h5ad(par["output_scrnaseq_reference"], compression="gzip") @@ -83,8 +82,7 @@ def sc_processing(adata): "dataset_url": par["dataset_url"], "dataset_summary": par["dataset_summary"], "dataset_description": par["dataset_description"], - # Join list into semicolon-separated string to match the expected string format - "dataset_reference": {"doi": par["dataset_reference"]}, + "dataset_reference": par["dataset_reference"], "dataset_organism": par["dataset_organism"], "orig_dataset_id": sp_data.tables["table"].uns.get("dataset_id", None), } @@ -146,7 +144,7 @@ def sc_processing(adata): "dataset_url": par["dataset_url"], "dataset_summary": par["dataset_summary"], "dataset_description": par["dataset_description"], - "dataset_reference": {"doi": par["dataset_reference"]}, + "dataset_reference": par["dataset_reference"], "dataset_organism": par["dataset_organism"], "orig_dataset_id": sp_data.tables["table"].uns.get("dataset_id", None), "spatialdata_attrs": ref_table.uns["spatialdata_attrs"],