Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions scripts/create_resources/process_hae.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/bin/bash

# Launches the H&E dataset-processing workflow on Seqera Platform (`tw launch`)
# for the Xenium FFPE Human Breast Cancer replicate 1 H&E image.

# fail on the first error, including a failing `git rev-parse` or `cd` below
set -e

# get the root of the directory
REPO_ROOT=$(git rev-parse --show-toplevel)

# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT"

publish_dir="s3://openproblems-data/resources/datasets"

# Unquoted heredoc: `$publish_dir` is expanded by the shell now, while the
# escaped `\$id` is written literally so the workflow can substitute the id.
cat > /tmp/params.yaml << HERE
param_list:

  - id: "10x_xenium/10x_human_breast_cancer_xenium/rep1"
    input: https://cf.10xgenomics.com/samples/xenium/1.0.1/Xenium_FFPE_Human_Breast_Cancer_Rep1/Xenium_FFPE_Human_Breast_Cancer_Rep1_he_image.tif
    dataset_name: "Xenium FFPE Human Breast Cancer Replicate 1"
    dataset_url: "https://www.10xgenomics.com/products/xenium-in-situ/preview-dataset-human-breast"
    dataset_summary: "The Xenium data was registered with post-Xenium IF / H&E images (workflow is non-destructive to the tissue) and integrated with Chromium and Visium data."
    dataset_description: "Two formalin-fixed & paraffin-embedded (FFPE) breast cancer tissue blocks were obtained from Discovery Life Sciences. Sample #1 was annotated by a pathologist to be T2N1M0, Stage II-B, ER+/HER2+/PR−. Sample #2 was characterized as stage pT2 pN1a pMX, ER−/HER2+/PR−. Corresponding dissociated tumor cells for Sample #1, fresh frozen (FF) in liquid nitrogen, were also sampled from the same 2.5 cm biopsy. For the Chromium Flex workflow, two 25 μm curls were pooled as a single replicate. 5 μm sections from Sample #1 were taken from the FFPE tissue using a microtome. Two replicate 5 μm sections were taken each for Visium CytAssist and Xenium. A 5 μm section was also taken from Sample #2 for Xenium."
    dataset_organism: "homo_sapiens"

output_dataset: "\$id/dataset.tiff"
output_state: "\$id/state.yaml"
publish_dir: "$publish_dir"
HERE

# The main script path follows the Viash build layout
# target/<runner>/<namespace>/<name>/main.nf for the process_hae workflow.
# NOTE(review): the `task_template` label looks inherited from the task
# template; confirm whether it should name this task instead.
tw launch https://github.com/openproblems-bio/task_spatial_segmentation.git \
  --revision build/main \
  --pull-latest \
  --main-script target/nextflow/datasets/workflows/process_hae/main.nf \
  --workspace 53907369739130 \
  --compute-env 6TeIFgV5OY4pJCk8I0bfOh \
  --params-file /tmp/params.yaml \
  --entry-name auto \
  --config common/nextflow_helpers/labels_tw.config \
  --labels task_template,process_datasets
64 changes: 64 additions & 0 deletions src/datasets/loaders/hae/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Viash component: loads an H&E image and writes it out as a TIFF file
# carrying the dataset metadata.
name: hae
namespace: datasets/loaders

argument_groups:
  - name: Inputs
    arguments:
      # Kept as a string rather than a file because the loader script also
      # accepts a download URL.
      - type: string
        name: --input
        required: true
        description: An H&E image file (TIFF), given as a local path or a download URL.
  - name: Metadata
    arguments:
      - type: string
        name: --dataset_id
        description: "A unique identifier for the dataset"
        required: true
      - name: --dataset_name
        type: string
        description: Nicely formatted name.
        required: true
      - type: string
        name: --dataset_url
        description: Link to the original source of the dataset.
        required: false
      - name: --dataset_reference
        type: string
        description: Bibtex reference of the paper in which the dataset was published.
        required: false
      - name: --dataset_summary
        type: string
        description: Short description of the dataset.
        required: true
      - name: --dataset_description
        type: string
        description: Long description of the dataset.
        required: true
      - name: --dataset_organism
        type: string
        description: The organism of the sample in the dataset.
        required: false
  - name: Outputs
    arguments:
      # Output arguments must be typed as files so the runners can stage and
      # publish them.
      - name: "--output"
        type: file
        direction: output
        description: The H&E image written as a TIFF file with the dataset metadata embedded.
        required: true

resources:
  - type: python_script
    path: script.py

engines:
  - type: docker
    image: openproblems/base_python:1
    setup:
      - type: python
        pypi:
          - spatialdata-io
          # imported directly by script.py, so declared explicitly
          - tifffile
  - type: native

runners:
  - type: executable
  - type: nextflow
    directives:
      label: [midmem, midcpu, midtime]
68 changes: 68 additions & 0 deletions src/datasets/loaders/hae/script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import json
import os
import shutil
import subprocess
import tempfile
import zipfile

import anndata as ad
import spatialdata as sd
import tifffile as tiff
from spatialdata_io import xenium

## VIASH START
par = {
    "input": "https://cf.10xgenomics.com/samples/xenium/1.0.1/Xenium_FFPE_Human_Breast_Cancer_Rep1/Xenium_FFPE_Human_Breast_Cancer_Rep1_he_image.tif",
    "dataset_id": "value",
    "dataset_name": "value",
    "dataset_url": "value",
    "dataset_reference": "value",
    "dataset_summary": "value",
    "dataset_description": "value",
    "dataset_organism": "value",
    "output": "temp/datasets/hae/breast/breast.tiff",
}
meta = {
    "cpus": 1,
}

## VIASH END

# Metadata arguments copied into the output file.
METADATA_KEYS = (
    "dataset_id",
    "dataset_name",
    "dataset_url",
    "dataset_reference",
    "dataset_summary",
    "dataset_description",
    "dataset_organism",
)

TIFF_SUFFIXES = (".tif", ".tiff")


def _find_tiff(directory):
    """Return the path of the TIFF image found under *directory*.

    Raises:
        FileNotFoundError: if no ``.tif``/``.tiff`` file exists below *directory*.
    """
    candidates = sorted(
        os.path.join(root, name)
        for root, _dirs, files in os.walk(directory)
        for name in files
        if name.lower().endswith(TIFF_SUFFIXES)
    )
    if not candidates:
        raise FileNotFoundError(f"No TIFF image found in archive '{directory}'")
    if len(candidates) > 1:
        print(
            f"Found {len(candidates)} TIFF files, using '{candidates[0]}'",
            flush=True,
        )
    return candidates[0]


# Download the data if it's a download url, extract the data if it's a zip file
par_input = par["input"]
with tempfile.TemporaryDirectory() as tmpdirname:
    if par_input.startswith(("http://", "https://")):
        print(f"Downloading data to {tmpdirname}", flush=True)
        # drop any query string so it does not end up in the file name
        file_name = par_input.split("?")[0].rstrip("/").split("/")[-1] or "input"
        download_path = os.path.join(tmpdirname, file_name)
        # Argument list (no shell) and check=True: a failed download aborts
        # the script instead of being silently ignored.
        subprocess.run(["wget", par_input, "-O", download_path], check=True)
        par_input = download_path

    if zipfile.is_zipfile(par_input):
        # extract next to (not on top of) a possibly downloaded archive
        extract_dir = os.path.join(tmpdirname, "extracted")
        print(f"Extracting input zip to {extract_dir}", flush=True)
        with zipfile.ZipFile(par_input, "r") as zip_ref:
            zip_ref.extractall(extract_dir)
        # imread needs a file, not the extraction directory
        par_input = _find_tiff(extract_dir)

    # read the data while the temporary directory still exists
    img = tiff.imread(par_input)

metadata = {key: par.get(key) for key in METADATA_KEYS}
# `segmentation_id` is not declared by the component config, so it is only
# recorded when a caller actually supplies it.
if par.get("segmentation_id") is not None:
    metadata["segmentation_id"] = par["segmentation_id"]

output_path = par["output"]
print(f"Writing to '{output_path}'", flush=True)
# The output is a single TIFF file: remove a stale file, or a stale directory
# left at the same path, before writing.
if os.path.isdir(output_path) and not os.path.islink(output_path):
    shutil.rmtree(output_path)
elif os.path.lexists(output_path):
    os.remove(output_path)

output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

tiff.imwrite(
    output_path,
    img,
    description=json.dumps(metadata),
    metadata=metadata,
)
59 changes: 59 additions & 0 deletions src/datasets/workflows/process_hae/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Viash workflow component: runs the `hae` loader on an H&E image and
# publishes the resulting dataset file.
name: process_hae
namespace: datasets/workflows

argument_groups:
  - name: Inputs
    arguments:
      # Passed through unchanged to the `hae` loader's `--input`.
      - type: string
        name: --input
        required: true
        description: An H&E image file (TIFF), given as a local path or a download URL.
  - name: Metadata
    arguments:
      - type: string
        name: --id
        description: "A unique identifier for the dataset"
        required: true
      - name: --dataset_name
        type: string
        description: Nicely formatted name.
        required: true
      - type: string
        name: --dataset_url
        description: Link to the original source of the dataset.
        required: false
      - name: --dataset_reference
        type: string
        description: Bibtex reference of the paper in which the dataset was published.
        required: false
      - name: --dataset_summary
        type: string
        description: Short description of the dataset.
        required: true
      - name: --dataset_description
        type: string
        description: Long description of the dataset.
        required: true
      - name: --dataset_organism
        type: string
        description: The organism of the sample in the dataset.
        required: false
  - name: Outputs
    arguments:
      # Output arguments must be typed as files so they can be published.
      - name: "--output_dataset"
        type: file
        direction: output
        description: The processed H&E image as a TIFF file.
        required: true
        default: "$id/dataset.tiff"

resources:
  - type: nextflow_script
    path: main.nf
    entrypoint: run_wf

dependencies:
  - name: datasets/loaders/hae

runners:
  - type: nextflow
    directives:
      label: [highcpu, midmem, hightime]
39 changes: 39 additions & 0 deletions src/datasets/workflows/process_hae/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Entry workflow: passes `params` and the component config to `findStates`
// and pipes the result into this component's own workflow, run with the
// `publish: "state"` auto setting.
// NOTE(review): `findStates` and `meta` are not defined in this file; they
// are presumably injected by the Viash Nextflow runner — confirm.
workflow auto {
  findStates(params, meta.config)
    | meta.workflow.run(
      auto: [publish: "state"]
    )
}

// Main workflow: runs the `hae` loader on every [id, state] event of the
// input channel and exposes the loader's result as `output_dataset`.
workflow run_wf {
  take:
  input_ch

  main:
  output_ch = input_ch

    // make the event id available to the loader as `dataset_id`
    | map { id, state -> [id, state + [dataset_id: id]] }

    // load the image; state keys map one-to-one onto the loader's arguments
    | hae.run(
      fromState: [
        input: "input",
        dataset_id: "dataset_id",
        dataset_name: "dataset_name",
        dataset_url: "dataset_url",
        dataset_reference: "dataset_reference",
        dataset_summary: "dataset_summary",
        dataset_description: "dataset_description",
        dataset_organism: "dataset_organism"
      ],
      toState: [output: "output"]
    )

    // keep only the loader output, renamed to the workflow's output argument
    | setState([output_dataset: "output"])

  emit:
  output_ch
}
Loading