Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#!/bin/bash
#
# Writes a Nextflow params file describing four Allen Brain Cell Atlas MERFISH
# sections (two coronal, two sagittal) and launches the
# process_allen_brain_cell_atlas_merfish workflow through the `tw` CLI.

# Fail fast: enabled *before* any other command so that a failing
# `git rev-parse` or `cd` aborts the script instead of launching the workflow
# from the wrong working directory.
set -euo pipefail

# Get the root of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)

# Ensure that the commands below are run from the root of the repository
# (the --config path passed to `tw launch` is relative to it).
cd "$REPO_ROOT"

publish_dir="s3://openproblems-data/resources/datasets"

# Write the params file into a private temporary directory rather than the
# fixed path /tmp/params.yaml, which is shared with the other launcher scripts
# and could be clobbered by a concurrent run. The directory is removed on exit.
params_dir=$(mktemp -d)
trap 'rm -rf "$params_dir"' EXIT
params_file="$params_dir/params.yaml"

# NOTE(review): dataset_description credits the Zhuang lab while
# dataset_reference cites Yao et al. 2023 -- confirm this is the intended
# citation for these sections.
#
# The heredoc delimiter is unquoted on purpose: $publish_dir is expanded by the
# shell, whereas \$id is escaped so that a literal "$id" reaches the params file.
cat > "$params_file" << HERE
param_list:

  - id: "allen_brain_cell_atlas_merfish/mouse1_coronal/rep1"
    mouse: "mouse1_coronal"
    experiment_id: "220422_wb3_co1_1B_6z18R_merfish5"
    dataset_name: "Allen Brain Cell Atlas MERFISH Mouse 1 Coronal"
    dataset_url: "https://download.brainimagelibrary.org/29/3c/293cc39ceea87f6d/"
    dataset_summary: "Brain-wide MERFISH spatial transcriptomics of mouse 1 (coronal), Allen Brain Cell Atlas."
    dataset_description: "Brain-wide MERFISH spatial transcriptomics data from the Zhuang lab. Mouse 1 coronal section imaged with ~1100 gene panel."
    dataset_organism: "mus_musculus"
    dataset_reference: "@article{Yao2023, author={Yao, Zizhen and others}, title={A high-resolution transcriptomic and spatial atlas of cell types in the whole mouse brain}, journal={Nature}, year={2023}}"
    segmentation_id: ["cell"]

  - id: "allen_brain_cell_atlas_merfish/mouse2_coronal/rep1"
    mouse: "mouse2_coronal"
    experiment_id: "220601_wb3_co2_1_5z18R2bd_merfish5"
    dataset_name: "Allen Brain Cell Atlas MERFISH Mouse 2 Coronal"
    dataset_url: "https://download.brainimagelibrary.org/29/3c/293cc39ceea87f6d/"
    dataset_summary: "Brain-wide MERFISH spatial transcriptomics of mouse 2 (coronal), Allen Brain Cell Atlas."
    dataset_description: "Brain-wide MERFISH spatial transcriptomics data from the Zhuang lab. Mouse 2 coronal section imaged with ~1100 gene panel."
    dataset_organism: "mus_musculus"
    dataset_reference: "@article{Yao2023, author={Yao, Zizhen and others}, title={A high-resolution transcriptomic and spatial atlas of cell types in the whole mouse brain}, journal={Nature}, year={2023}}"
    segmentation_id: ["cell"]

  - id: "allen_brain_cell_atlas_merfish/mouse3_sagittal/rep1"
    mouse: "mouse3_sagittal"
    experiment_id: "220609_wb3_sa1_1_5z18R_merfish5"
    dataset_name: "Allen Brain Cell Atlas MERFISH Mouse 3 Sagittal"
    dataset_url: "https://download.brainimagelibrary.org/29/3c/293cc39ceea87f6d/"
    dataset_summary: "Brain-wide MERFISH spatial transcriptomics of mouse 3 (sagittal), Allen Brain Cell Atlas."
    dataset_description: "Brain-wide MERFISH spatial transcriptomics data from the Zhuang lab. Mouse 3 sagittal section imaged with ~1100 gene panel."
    dataset_organism: "mus_musculus"
    dataset_reference: "@article{Yao2023, author={Yao, Zizhen and others}, title={A high-resolution transcriptomic and spatial atlas of cell types in the whole mouse brain}, journal={Nature}, year={2023}}"
    segmentation_id: ["cell"]

  - id: "allen_brain_cell_atlas_merfish/mouse4_sagittal/rep1"
    mouse: "mouse4_sagittal"
    experiment_id: "220912_wb3_sa2_2_5z18R_merfish5"
    dataset_name: "Allen Brain Cell Atlas MERFISH Mouse 4 Sagittal"
    dataset_url: "https://download.brainimagelibrary.org/29/3c/293cc39ceea87f6d/"
    dataset_summary: "Brain-wide MERFISH spatial transcriptomics of mouse 4 (sagittal), Allen Brain Cell Atlas."
    dataset_description: "Brain-wide MERFISH spatial transcriptomics data from the Zhuang lab. Mouse 4 sagittal section imaged with ~1100 gene panel."
    dataset_organism: "mus_musculus"
    dataset_reference: "@article{Yao2023, author={Yao, Zizhen and others}, title={A high-resolution transcriptomic and spatial atlas of cell types in the whole mouse brain}, journal={Nature}, year={2023}}"
    segmentation_id: ["cell"]

output_dataset: "\$id/dataset.zarr"
output_state: "\$id/state.yaml"
publish_dir: "$publish_dir"
HERE

# Launch the prebuilt workflow from the build/main revision with the params
# file written above.
tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \
  --revision build/main \
  --pull-latest \
  --main-script target/nextflow/datasets/workflows/process_allen_brain_cell_atlas_merfish/main.nf \
  --workspace 167877437119966 \
  --compute-env 5hfmdCBxMRd4nHZaJKYEQZ \
  --params-file "$params_file" \
  --config src/base/labels_nebius.config \
  --labels datasets,allen_brain_cell_atlas_merfish
50 changes: 50 additions & 0 deletions scripts/create_resources/spatial/process_bruker_cosmx_nebius.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/bin/bash
#
# Writes a Nextflow params file describing two Bruker CosMx datasets (mouse
# brain and human liver) and launches the process_bruker_cosmx workflow
# through the `tw` CLI.

# Fail fast: enabled *before* any other command so that a failing
# `git rev-parse` or `cd` aborts the script instead of launching the workflow
# from the wrong working directory.
set -euo pipefail

# Get the root of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)

# Ensure that the commands below are run from the root of the repository
# (the --config path passed to `tw launch` is relative to it).
cd "$REPO_ROOT"

publish_dir="s3://openproblems-data/resources/datasets"

# Write the params file into a private temporary directory rather than the
# fixed path /tmp/params.yaml, which is shared with the other launcher scripts
# and could be clobbered by a concurrent run. The directory is removed on exit.
params_dir=$(mktemp -d)
trap 'rm -rf "$params_dir"' EXIT
params_file="$params_dir/params.yaml"

# NOTE(review): the human liver id has no "/rep1" suffix, unlike the mouse
# brain id -- confirm whether this is intentional before changing it, since
# the id determines the published output path.
#
# The heredoc delimiter is unquoted on purpose: $publish_dir is expanded by the
# shell, whereas \$id is escaped so that a literal "$id" reaches the params file.
cat > "$params_file" << HERE
param_list:

  - id: "bruker_cosmx/bruker_mouse_brain_cosmx/rep1"
    input_raw: "https://smi-public.objects.liquidweb.services/HalfBrain.zip"
    input_flat_files: "https://smi-public.objects.liquidweb.services/Half%20%20Brain%20simple%20%20files%20.zip"
    dataset_name: "Bruker CosMx Mouse Brain"
    dataset_url: "https://nanostring.com/products/cosmx-spatial-molecular-imager/ffpe-dataset/cosmx-smi-mouse-brain-ffpe-dataset/"
    dataset_summary: "Bruker CosMx Mouse Brain dataset on FFPE covering a full hemisphere of a mouse brain."
    dataset_description: "Bruker CosMx Mouse Brain dataset on FFPE covering a full hemisphere of a mouse brain."
    dataset_organism: "mus_musculus"
    segmentation_id: ["cell"]

  - id: "bruker_cosmx/bruker_human_liver_cosmx"
    input_raw: "https://smi-public.objects.liquidweb.services/NormalLiverFiles.zip"
    input_flat_files: "https://syncandshare.desy.de/index.php/s/zYT4fM28y86cZeW/download/NormalLiver.zip"
    dataset_name: "Bruker CosMx Human Liver"
    dataset_url: "https://nanostring.com/products/cosmx-spatial-molecular-imager/ffpe-dataset/human-liver-rna-ffpe-dataset/"
    dataset_summary: "Bruker CosMx Human Liver dataset on FFPE."
    dataset_description: "Bruker CosMx Human Liver dataset on FFPE."
    dataset_organism: "homo_sapiens"
    segmentation_id: ["cell"]


output_dataset: "\$id/dataset.zarr"
output_state: "\$id/state.yaml"
publish_dir: "$publish_dir"
HERE

# Launch the prebuilt workflow from the build/main revision with the params
# file written above.
tw launch https://github.com/openproblems-bio/task_ist_preprocessing.git \
  --revision build/main \
  --pull-latest \
  --main-script target/nextflow/datasets/workflows/process_bruker_cosmx/main.nf \
  --workspace 167877437119966 \
  --compute-env 5hfmdCBxMRd4nHZaJKYEQZ \
  --params-file "$params_file" \
  --config src/base/labels_nebius.config \
  --labels datasets,bruker_cosmx
2 changes: 0 additions & 2 deletions src/data_processors/process_dataset/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,8 +188,6 @@ def subsample_adata_group_balanced(adata, group_key, n_samples, seed=0):

# Load the spatial data
sdata = sd.read_zarr(input_sp)
if _tmp_dir is not None:
shutil.rmtree(_tmp_dir)

# Subset single-cell data if it is too large
N_MAX_SC = 120000
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Component identity: a dataset loader in the datasets/loaders namespace.
name: allen_brain_cell_atlas_merfish
namespace: datasets/loaders

# Command-line interface of the component, grouped by purpose.
argument_groups:
  # What to load.
  - name: Inputs
    arguments:
      - name: --mouse
        type: string
        required: true
        description: 'Mouse identifier. One of: mouse1_coronal, mouse2_coronal, mouse3_sagittal, mouse4_sagittal.'
        choices:
          - mouse1_coronal
          - mouse2_coronal
          - mouse3_sagittal
          - mouse4_sagittal
      - name: --experiment_id
        type: string
        required: true
        description: 'Experiment identifier (brain section). Must match an entry in the experiment_metadata.csv for the selected mouse.'
      - name: --abca_version
        type: string
        required: false
        # Quoted so the all-digit tag stays a string rather than an integer.
        default: '20231215'
        description: 'Version tag for the Allen Brain Cell Atlas cell metadata release.'
      - name: --segmentation_id
        type: string
        multiple: true
        default:
          - cell
        description: 'The segmentation identifier.'

  # Whether downloaded intermediates are retained.
  - name: Caching settings
    arguments:
      - name: --keep_files
        type: boolean
        default: true
        description: 'Keep downloaded intermediate files after processing.'

  # Descriptive metadata attached to the dataset.
  - name: Metadata
    arguments:
      - name: --dataset_id
        type: string
        required: true
        description: 'A unique identifier for the dataset.'
      - name: --dataset_name
        type: string
        required: true
        description: 'Nicely formatted name.'
      - name: --dataset_url
        type: string
        required: false
        description: 'Link to the original source of the dataset.'
      - name: --dataset_reference
        type: string
        required: false
        description: 'Bibtex reference of the paper in which the dataset was published.'
      - name: --dataset_summary
        type: string
        required: true
        description: 'Short description of the dataset.'
      - name: --dataset_description
        type: string
        required: true
        description: 'Long description of the dataset.'
      - name: --dataset_organism
        type: string
        required: false
        description: 'The organism of the sample in the dataset.'

  # Output file; remaining fields are merged in from the shared API definition.
  - name: Outputs
    arguments:
      - name: --output
        __merge__: /src/api/file_common_ist.yaml
        direction: output
        required: true

# Script that implements the component.
resources:
  - type: python_script
    path: script.py

# Execution environments: a Docker image with extra Python packages, or the
# host environment as-is.
engines:
  - type: docker
    image: openproblems/base_python:1
    # Shared setup fragment merged into this engine definition.
    __merge__:
      - /src/base/setup_spatialdata_partial.yaml
    # Additional PyPI packages installed on top of the base image.
    setup:
      - type: python
        pypi: [geopandas, tifffile, shapely]
  - type: native

# How the component can be run: as a standalone executable or as a Nextflow
# module with the listed labels.
runners:
  - type: executable
  - type: nextflow
    directives:
      label:
        - midmem
        - midcpu
        - hightime
Loading
Loading