diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh
index 59fc1c2..30694b5 100755
--- a/scripts/create_resources/test_resources.sh
+++ b/scripts/create_resources/test_resources.sh
@@ -30,7 +30,7 @@ viash run src/data_processors/process_dataset/config.vsh.yaml -- \
   --dataset_id mouse_brain_combined \
   --dataset_name "Test data mouse brain combined 2023 tenx Xenium replicate 1 2023 Yao scRNAseq" \
   --dataset_url "https://www.10xgenomics.com/datasets/fresh-frozen-mouse-brain-replicates-1-standard;https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE246717" \
-  --dataset_reference "https://www.10xgenomics.com/datasets/fresh-frozen-mouse-brain-replicates-1-standard;10.1038/s41586-023-06812-z" \
+  --dataset_reference "10.1038/s41586-023-06812-z" \
   --dataset_summary "Demonstration of gene expression profiling for fresh frozen mouse brain on the Xenium platform using the pre-designed Mouse Brain Gene Expression Panel (v1);A high-resolution scRNAseq atlas of cell types in the whole mouse brain" \
   --dataset_description "Demonstration of gene expression profiling for fresh frozen mouse brain on the Xenium platform using the pre-designed Mouse Brain Gene Expression Panel (v1). Replicate results demonstrate the high reproducibility of data generated by the platform. 10x Genomics obtained tissue from a C57BL/6 mouse from Charles River Laboratories. Three adjacent 10µm sections were placed on the same slide. Tissues were prepared following the demonstrated protocols Xenium In Situ for Fresh Frozen Tissues - Tissue Preparation Guide (CG000579) and Xenium In Situ for Fresh Frozen Tissues - Fixation & Permeabilization (CG000581).;See dataset_reference for more information. Note that we only took the 10xv2 data from the dataset." \
   --dataset_organism "mus_musculus"
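Aside on the convention above (illustration only, not part of the patch): for this combined dataset, `--dataset_url`, `--dataset_summary`, and `--dataset_description` each hold one entry per source dataset, joined with `;`. A downstream consumer could recover the per-source entries roughly as sketched here; `split_combined_field` is a hypothetical name, not a function in this repo:

```python
# Hypothetical sketch: recover per-source entries from the ";"-joined
# metadata fields of the combined dataset. Assumes no entry contains a
# literal semicolon, which holds for the values above.
def split_combined_field(value: str) -> list[str]:
    return [part.strip() for part in value.split(";")]

urls = split_combined_field(
    "https://www.10xgenomics.com/datasets/fresh-frozen-mouse-brain-replicates-1-standard;"
    "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE246717"
)
assert len(urls) == 2  # one entry per source dataset
```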
diff --git a/scripts/run_benchmark/run_test_local.sh b/scripts/run_benchmark/run_test_local.sh
index 54d8e3d..adc34bb 100755
--- a/scripts/run_benchmark/run_test_local.sh
+++ b/scripts/run_benchmark/run_test_local.sh
@@ -6,14 +6,6 @@ REPO_ROOT=$(git rev-parse --show-toplevel)
 # ensure that the command below is run from the root of the repository
 cd "$REPO_ROOT"
 
-# remove this when you have implemented the script
-echo "TODO: once the 'run_benchmark' workflow has been implemented, update this script to use it."
-echo "  Step 1: replace 'task_template' with the name of the task in the following command."
-echo "  Step 2: replace the rename keys parameters to fit your run_benchmark inputs"
-echo "  Step 3: replace the settings parameter to fit your run_benchmark outputs"
-echo "  Step 4: remove this message"
-exit 1
-
 set -e
 
 echo "Running benchmark on test data"
@@ -23,14 +15,17 @@ echo "  Make sure to run 'scripts/project/build_all_docker_containers.sh'!"
 RUN_ID="testrun_$(date +%Y-%m-%d_%H-%M-%S)"
 publish_dir="temp/results/${RUN_ID}"
 
-nextflow run . \
+NXF_VER=25.10.4 nextflow run . \
   -main-script target/nextflow/workflows/run_benchmark/main.nf \
   -profile docker \
   -resume \
   -c common/nextflow_helpers/labels_ci.config \
-  --id cxg_mouse_pancreas_atlas \
-  --input_train resources_test/task_spatial_segmentation/mouse_brain_combined/train.h5ad \
-  --input_test resources_test/task_spatial_segmentation/mouse_brain_combined/test.h5ad \
-  --input_solution resources_test/task_spatial_segmentation/mouse_brain_combined/solution.h5ad \
+  --id mouse_brain_combined \
+  --input_spatial_unlabelled resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr \
+  --input_spatial_solution resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_solution.zarr \
+  --input_scrnaseq_reference resources_test/task_spatial_segmentation/mouse_brain_combined/scrnaseq_reference.h5ad \
   --output_state state.yaml \
-  --publish_dir "$publish_dir"
+  --publish_dir "$publish_dir" \
+  -with-trace "$publish_dir/trace.txt"
+
+common/scripts/render_results_report local "$publish_dir" --output "$publish_dir/report/"
\ No newline at end of file
diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py
index ad85433..e1174a2 100644
--- a/src/data_processors/process_dataset/script.py
+++ b/src/data_processors/process_dataset/script.py
@@ -2,6 +2,8 @@
 import pandas as pd
 import spatialdata as sd
 import scanpy as sc
+import os
+import shutil
 
 ## VIASH START
 par = {
@@ -63,7 +65,7 @@ def sc_processing(adata):
 
 print(">> Override dataset metadata in .uns", flush=True)
 sc_data.uns["orig_dataset_id"] = sc_data.uns.get("dataset_id", None)
-for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_summary", "dataset_description", "dataset_reference", "dataset_organism"]:
+for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_summary", "dataset_description", "dataset_organism", "dataset_reference"]:
     sc_data.uns[key] = par[key]
 
 print(">> Writing scrnaseq reference", flush=True)
@@ -114,6 +116,12 @@ def sc_processing(adata):
 )
 
 print(">> Writing spatial unlabelled dataset", flush=True)
+# remove if output exists
+if os.path.exists(par["output_spatial_unlabelled"]):
+    if os.path.isdir(par["output_spatial_unlabelled"]):
+        shutil.rmtree(par["output_spatial_unlabelled"])
+    else:
+        os.remove(par["output_spatial_unlabelled"])
 output_spatial.write(par["output_spatial_unlabelled"], overwrite=True)
 
 # ---------------------------------------------------------------
@@ -132,6 +140,12 @@ def sc_processing(adata):
     var=var_df,
     uns={
         "dataset_id": par["dataset_id"],
+        "dataset_name": par["dataset_name"],
+        "dataset_url": par["dataset_url"],
+        "dataset_summary": par["dataset_summary"],
+        "dataset_description": par["dataset_description"],
+        "dataset_reference": par["dataset_reference"],
+        "dataset_organism": par["dataset_organism"],
         "orig_dataset_id": sp_data.tables["table"].uns.get("dataset_id", None),
         "spatialdata_attrs": ref_table.uns["spatialdata_attrs"],
     },
@@ -151,4 +165,9 @@ def sc_processing(adata):
 )
 
 print(">> Writing spatial solution", flush=True)
+if os.path.exists(par["output_spatial_solution"]):
+    if os.path.isdir(par["output_spatial_solution"]):
+        shutil.rmtree(par["output_spatial_solution"])
+    else:
+        os.remove(par["output_spatial_solution"])
 output_solution.write(par["output_spatial_solution"], overwrite=True)
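The remove-if-exists block above now appears twice in `process_dataset/script.py`, once per zarr output. A possible consolidation, sketched under the assumption that both call sites should behave identically; `remove_path` is a hypothetical helper, not part of the patch:

```python
import os
import shutil

# Hypothetical helper consolidating the duplicated cleanup logic:
# delete a stale output (zarr store directory or plain file) before
# the script rewrites it.
def remove_path(path: str) -> None:
    if os.path.isdir(path):
        shutil.rmtree(path)
    elif os.path.exists(path):
        os.remove(path)

# In the script this would replace both if/else blocks:
#   remove_path(par["output_spatial_unlabelled"])
#   remove_path(par["output_spatial_solution"])
```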
diff --git a/src/data_processors/process_prediction/script.py b/src/data_processors/process_prediction/script.py
index 17c3728..5d1ae2a 100644
--- a/src/data_processors/process_prediction/script.py
+++ b/src/data_processors/process_prediction/script.py
@@ -83,22 +83,28 @@
 table.layers["normalized_log_scaled"] = table.X.copy()
 
 print(">> Computing highly variable genes", flush=True)
-# Reset X to counts for HVG computation
-table.X = table.layers["counts"].copy()
-try:
-    sc.pp.highly_variable_genes(table, flavor="seurat_v3", layer="counts", n_top_genes=min(3000, table.n_vars))
-except ValueError:
-    # seurat_v3 loess fitting can fail on small datasets; fall back to seurat flavor
-    sc.pp.normalize_total(table, target_sum=1e4)
-    sc.pp.log1p(table)
-    sc.pp.highly_variable_genes(table, flavor="seurat", n_top_genes=min(3000, table.n_vars))
-table.var.rename(columns={"highly_variable": "hvg"}, inplace=True)
+if table.n_vars == 0 or table.n_obs == 0:
+    # No cells detected (e.g. empty_labels negative control); mark all vars as non-HVG
+    table.var["hvg"] = False
+else:
+    # Reset X to counts for HVG computation
+    table.X = table.layers["counts"].copy()
+    try:
+        sc.pp.highly_variable_genes(table, flavor="seurat_v3", layer="counts", n_top_genes=min(3000, table.n_vars))
+    except ValueError:
+        # seurat_v3 loess fitting can fail on small datasets; fall back to seurat flavor
+        sc.pp.normalize_total(table, target_sum=1e4)
+        sc.pp.log1p(table)
+        sc.pp.highly_variable_genes(table, flavor="seurat", n_top_genes=min(3000, table.n_vars))
+    table.var.rename(columns={"highly_variable": "hvg"}, inplace=True)
 
 table.uns["dataset_id"] = dataset_id
 table.uns["method_id"] = method_id
 table.uns["spatialdata_attrs"] = {
     "instance_key": "cell_id",
-    "region": ["segmentation"],
+    # Derive regions from actual obs to handle the empty-table case (e.g. empty_labels
+    # negative control) where no cells were detected and obs has 0 rows.
+    "region": list(table.obs["region"].unique()),
     "region_key": "region",
 }
 
diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf
index d39386c..a61478f 100644
--- a/src/workflows/run_benchmark/main.nf
+++ b/src/workflows/run_benchmark/main.nf
@@ -35,7 +35,7 @@ workflow run_wf {
 
     // extract the dataset metadata
     | extract_uns_metadata.run(
-      fromState: [input: "input_spatial_unlabelled"],
+      fromState: [input: "input_scrnaseq_reference"],
       toState: { id, output, state ->
         state + [
           dataset_uns: readYaml(output.output).uns
@@ -84,7 +84,10 @@ workflow run_wf {
 
 
     | process_prediction.run(
-      fromState: [input: "method_output"],
+      fromState: [
+        input_prediction: "method_output",
+        input_spatial_unlabelled: "input_spatial_unlabelled"
+      ],
       toState: { id, output, state ->
         state + [
           input_prediction: output.output
@@ -100,7 +103,7 @@ workflow run_wf {
       },
      // use 'fromState' to fetch the arguments the component requires from the overall state
       fromState: [
-        input_solution: "input_solution",
+        input_solution: "input_spatial_solution",
         input_prediction: "input_prediction"
      ],
      // use 'toState' to publish that component's outputs to the overall state
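On the `process_prediction` change above: deriving `region` from the table's own `obs` keeps `spatialdata_attrs` consistent even when a method detects no cells. A minimal sketch of that edge case in plain `anndata`, mirroring the expression used in the patch:

```python
import anndata as ad
import pandas as pd

# With zero observations, obs["region"].unique() is empty, so the patched
# code records no regions instead of the hard-coded ["segmentation"].
table = ad.AnnData(obs=pd.DataFrame({"region": pd.Series(dtype=str)}))
regions = list(table.obs["region"].unique())
assert regions == []
```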