diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 943834a641..ca2bfe03b7 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -20,3 +20,15 @@ * [ ] This submission follows the guidelines in our [Contributing](../blob/master/CONTRIBUTING.md) document * [ ] I have checked to ensure there aren't other open [Pull Requests](../pulls) for the same update/change + +### PR review checklist + +This PR will be evaluated on the basis of the following checks: + +* [ ] The task addresses a valid open problem in single-cell analysis +* [ ] The latest version of master is merged and tested +* [ ] The methods/metrics are imported to `__init__.py` and were tested in the pipeline +* [ ] Method and metric decorators are annotated with paper title, year, author, code version, and date +* [ ] The README gives an outline of the methods, metrics and datasets in the folder +* [ ] The README provides a satisfactory task explanation (for new tasks) +* [ ] The sample test data is appropriate to test implementation of all methods and metrics (for new tasks) diff --git a/.github/workflows/build_docker.yml b/.github/workflows/build_docker.yml index fcfdf733b2..e34907b398 100644 --- a/.github/workflows/build_docker.yml +++ b/.github/workflows/build_docker.yml @@ -45,7 +45,7 @@ jobs: - name: Build Docker images run: | cd workflow - snakemake -j $(grep -c processor /proc/cpuinfo) docker + snakemake -j $(grep -c processor /proc/cpuinfo) docker_build cd .. - name: Push Docker images diff --git a/.github/workflows/run_benchmark.yml b/.github/workflows/run_benchmark.yml index b50fb757fc..2a39fdcdb2 100644 --- a/.github/workflows/run_benchmark.yml +++ b/.github/workflows/run_benchmark.yml @@ -109,10 +109,11 @@ jobs: sudo mkdir -p /mnt/openproblems-nextflow sudo chown $USER /mnt/openproblems-nextflow s3fs -o umask=0277,uid=$(id -u) openproblems-nextflow /mnt/openproblems-nextflow + # Create bucket/ work/ and cwd/ for dir in bucket work cwd; do - mkdir -p /mnt/openproblems-nextflow/${dir}/${{ github.ref }} + mkdir -p /mnt/openproblems-nextflow/${dir}/${{ env.BRANCH }} done - ls -l /mnt/openproblems-nextflow/*/${{ github.ref }} + ls -l /mnt/openproblems-nextflow/*/${{ env.BRANCH }} - name: Install package & dependencies run: | @@ -150,20 +151,20 @@ jobs: TOWER_ACCESS_TOKEN: ${{ secrets.TOWER_ACCESS_KEY }} AWS_DEFAULT_REGION: us-west-2 run: >- - cd /mnt/openproblems-nextflow/cwd/${{ github.ref }} && + cd /mnt/openproblems-nextflow/cwd/${{ env.BRANCH }} && nextflow run $WITH_TOWER -ansi-log false $RESUME -profile $PROFILE - -work-dir "/mnt/openproblems-nextflow/work/${{ github.ref }}" - -bucket-dir "s3://openproblems-nextflow/bucket/${{ github.ref }}" + -work-dir "/mnt/openproblems-nextflow/work/${{ env.BRANCH }}" + -bucket-dir "s3://openproblems-nextflow/bucket/${{ env.BRANCH }}" singlecellopenproblems/nf-openproblems --branch $BRANCH - name: Copy results run: | - cp -r /mnt/openproblems-nextflow/cwd/${{ github.ref }}/results . + cp -r /mnt/openproblems-nextflow/cwd/${{ env.BRANCH }}/results . - name: Parse results # There's a bug with the results getting pulled from Netflow with caching, but @@ -182,9 +183,9 @@ jobs: for image in $(cd docker && ls -1d */ | tr -d '/'); do aws ecr batch-delete-image --region $AWS_DEFAULT_REGION --repository-name openproblems --image-ids "imageTag=${BRANCH}-${image}" done - aws s3 rm "s3://openproblems-nextflow/work/${{ github.ref }}" - aws s3 rm "s3://openproblems-nextflow/bucket/${{ github.ref }}" - aws s3 rm "s3://openproblems-nextflow/cwd/${{ github.ref }}" + aws s3 rm "s3://openproblems-nextflow/work/${{ env.BRANCH }}" + aws s3 rm "s3://openproblems-nextflow/bucket/${{ env.BRANCH }}" + aws s3 rm "s3://openproblems-nextflow/cwd/${{ env.BRANCH }}" - name: Remove untagged images if: startsWith(github.ref, 'refs/heads/master') diff --git a/.gitignore b/.gitignore index e160c24854..2336e1abb4 100644 --- a/.gitignore +++ b/.gitignore @@ -143,3 +143,6 @@ resources/ # Nextflow nf-openproblems .nextflow + +# Editor +.idea diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 821e76d030..e2c3310704 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -27,9 +27,9 @@ link to it from your website, or simply star it in GitHub to say "I use it". + [Writing functions in R](#writing-functions-in-r) + [Adding package dependencies](#adding-package-dependencies) + [Adding a new dataset](#adding-a-new-dataset) - + [Adding a dataset / method / metric to a task](#adding-a-dataset---method---metric-to-a-task) + + [Adding a dataset / method / metric to a task](#adding-a-dataset--method--metric-to-a-task) + [Adding a new task](#adding-a-new-task) - + [Adding a new Docker container](#adding-a-new-container) + + [Adding a new Docker container](#adding-a-new-docker-container) * [Code Style and Testing](#code-style-and-testing) * [Code of Conduct](#code-of-conduct) * [Attribution](#attribution) @@ -177,6 +177,10 @@ If you are unable to write your method using our base dependencies, you may add Datasets are loaded under `openproblems/data`. Each data loading function should download the appropriate dataset from a stable location (e.g. from Figshare) be decorated with `openproblems.data.utils.loader` in order to cache the result. +To see a gold standard loader, look at [openproblems/data/Wagner_2018_zebrafish_embryo_CRISPR.py](https://github.com/singlecellopenproblems/SingleCellOpenProblems/blob/master/openproblems/data/Wagner_2018_zebrafish_embryo_CRISPR.py) + +This file name should match `[First Author Last Name]_[Year Published]_short_Description_of_data.py`. E.g. the dataset of zebrafish embryos perturbed with CRISPR published in 2018 by Wagner _et al._ becomes `Wagner_2018_zebrafish_embryo_CRISPR.py` + ### Adding a dataset / method / metric to a task To add a dataset, method, or metric to a task, simply create a new `.py` file corresponding to your proposed new functionality and import the main function in the corresponding `__init__.py`. E.g., to add a "F2" metric to the label projection task, we would create `openproblems/tasks/label_projection/metrics/f2.py` and add a line @@ -235,7 +239,9 @@ Datasets, methods and metrics run inside Docker containers. We provide a few to ## Code Style and Testing -`singlecellopenproblems` is maintained at close to 100% code coverage. For datasets, methods, and metrics, tests are generated automatically. For additions outside this core functionality, contributors are encouraged to write tests for their code -- but if you do not know how to do so, please do not feel discouraged from contributing code! Others can always help you test your contribution. +`singlecellopenproblems` is maintained at close to 100% code coverage. For datasets, methods, and metrics, tests are generated programatically from each task's `api.py`. See the [Adding a new task](#adding-a-new-task) section for instructions on creating this file. + +For additions outside this core functionality, contributors are encouraged to write tests for their code -- but if you do not know how to do so, please do not feel discouraged from contributing code! Others can always help you test your contribution. Code is tested by GitHub Actions when you push your changes. However, if you wish to test locally, you can do so with the following command: ``` diff --git a/README.md b/README.md index c0474ac674..1f704576f4 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ [![Code Style: Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![Style Guide: OpenStack](https://img.shields.io/badge/style%20guide-openstack-eb1a32.svg)](https://docs.openstack.org/hacking/latest/user/hacking.html#styleguide) [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) +[![Video](https://img.shields.io/static/v1?label=YouTube&message=Visit%20channel&color=red&logo=youtube)](https://www.youtube.com/channel/UCJpqxlzxRamcA3Pv3KlYZHg) Formalizing and benchmarking open problems in single-cell genomics. @@ -15,8 +16,11 @@ Formalizing and benchmarking open problems in single-cell genomics. ## Guides * For contributing guidelines, see [CONTRIBUTING.md](CONTRIBUTING.md) * For instructions on editing Docker images, see [docker/README.md](docker/README.md) +* For instructions on using the `openproblems-cli`, see [openproblems/api/README.md](https://github.com/singlecellopenproblems/SingleCellOpenProblems/tree/master/openproblems/api) * For a walkthrough of the GitHub Actions workflows and AWS Sagemaker, see [SAGEMAKER.md](SAGEMAKER.md) * For a description of existing an proposed tasks, see [TASKS.md](TASKS.md) +* For a video introduction to this GitHub repository, watch our [Repository introduction](https://www.youtube.com/watch?v=tHempZCdXyA) +* For a video tutorial on adding new tasks, watch our [How to add a new task tutorial](https://www.youtube.com/watch?v=tgVG3Hp6mBc) ## The team @@ -32,7 +36,7 @@ Formalizing and benchmarking open problems in single-cell genomics. * Wes Lewis (@weslewis) - Differential Abundance and Data Denoising * Mohammad Lotfallahi (@M0hammadL) - Label projection task * Qian Qin (@qinqian) - Predicting gene expression from chromatin accessibility -* Daniel Strobel (@danielStrobel) - Batch integration +* Daniel Strobl (@danielStrobl) - Batch integration * Michael Vinyard (@mvinyard) - Stress preservation in Dimensionality Reduction * Florian Wagner (@flo-compbio) - Data denoising diff --git a/SAGEMAKER.md b/SAGEMAKER.md index 8380ad722b..46c6cf6e05 100644 --- a/SAGEMAKER.md +++ b/SAGEMAKER.md @@ -31,7 +31,7 @@ There is a 1:1 correspondence between the steps to set up SageMaker using the CL - [Add user to SageMaker Studio](#add-user-to-sagemaker-studio) - [Open SageMaker Studio and Launch a Notebook using a Custom Image](#open-sagemaker-studio-and-launch-a-notebook-using-a-custom-image) * [Selecting an instance type](#selecting-an-instance-type) - * [Kernel not found error](#kernel-not-found-error) + * [Failed to start kernel (image does not exist)](#failed-to-start-kernel-image-does-not-exist) @@ -186,7 +186,7 @@ We've selected three instances to use during the Jamboree. Note, it is possible To change your instance, follow the [Change Instance Type](https://docs.aws.amazon.com/sagemaker/latest/dg/notebooks-run-and-manage-switch-instance-type.html) tutorial from AWS. -### Kernel not found error +### Failed to start kernel (image does not exist) If you see the following error: diff --git a/TASKS.md b/TASKS.md index 5c76559099..4e1e08ce39 100644 --- a/TASKS.md +++ b/TASKS.md @@ -1,4 +1,4 @@ -## Single-Cell Analysis Benchmarking Tasks +# Benchmarking Task Descriptions Table of Contents * [Predicting gene expression from chromatin accessibility](#predicting-gene-expression-from-chromatin-accessibility) diff --git a/docker/README.md b/docker/README.md index e7f275b5db..61e315e63c 100644 --- a/docker/README.md +++ b/docker/README.md @@ -15,6 +15,7 @@ Note, all images must comply to the [AWS SageMaker Custom Image Specifications]( - [Building Docker images locally](#building-docker-images-locally) - [Building Docker images through GitHub Actions workflows](#building-docker-images-through-github-actions-workflows) - [Pulling images from the ECR to your local machine](#pulling-images-from-the-ecr-to-your-local-machine) +- [Running Docker images locally](#running-docker-images-locally) @@ -126,3 +127,38 @@ docker pull .dkr.ecr.us-west-2.amazonaws.com/openproblems: docker images +REPOSITORY TAG IMAGE ID CREATED SIZE +singlecellopenproblems/openproblems-python-extras latest f86e1c5ce9d0 14 hours ago 3.94GB +singlecellopenproblems/openproblems-r-base latest f8908c9fb387 21 hours ago 6.36GB +singlecellopenproblems/openproblems-r-extras latest 7e15120bb7ce 5 days ago 4.89GB +singlecellopenproblems/openproblems latest 14974cbd2f58 5 days ago 2.1GB +490915662541.dkr.ecr.us-west-2.amazonaws.com/openproblems batch_integration_docker-openproblems 3a1ce37e85f2 6 days ago 2.06GB +``` + +You can then run commands within a docker container using `docker run`. Consult the [Docker documentation](https://docs.docker.com/engine/reference/commandline/run/) to learn more about the `run` command. + +**Using `IMAGE ID`** +``` +docker run -it 90a9110c7d69 /bin/bash +``` + +**Using `RESPOSITORY:TAG`** +``` +docker run -it singlecellopenproblems/openproblems-python-extras:latest /bin/bash + +``` diff --git a/docker/openproblems-python-extras/requirements.txt b/docker/openproblems-python-extras/requirements.txt index cee39838ca..c3d8473423 100644 --- a/docker/openproblems-python-extras/requirements.txt +++ b/docker/openproblems-python-extras/requirements.txt @@ -5,3 +5,4 @@ phate pyensembl pybedtools git+https://github.com/czbiohub/molecular-cross-validation +git+https://github.com/atong01/SCOT diff --git a/docker/openproblems/Dockerfile b/docker/openproblems/Dockerfile index 60487de49f..b1b995c663 100644 --- a/docker/openproblems/Dockerfile +++ b/docker/openproblems/Dockerfile @@ -1,5 +1,7 @@ FROM python:3.8 +# Adding this comment as a temporary bugfix on 3/30 + # Setting up Sagemaker Studio Image from example # https://github.com/aws-samples/sagemaker-studio-custom-image-samples/blob/main/examples/echo-kernel-image/Dockerfile diff --git a/openproblems/api/README.md b/openproblems/api/README.md index bc235e7e9d..168cf94908 100644 --- a/openproblems/api/README.md +++ b/openproblems/api/README.md @@ -26,17 +26,52 @@ optional arguments: ``` ## Example (without docker) +Running the CLI requires commands to be run in a specific order: `load` -> `run` -> `evaluate`. +For example: ``` -openproblems-cli tasks -openproblems-cli list --datasets --task label_projection +# Download a task-specific dataset and save it to `dataset.h5ad` openproblems-cli load --task label_projection --output dataset.h5ad pancreas_batch -openproblems-cli list --methods --task label_projection +# Run a method on a datasets and save output to `method.h5ad` openproblems-cli run --task label_projection --input dataset.h5ad --output method.h5ad logistic_regression_log_cpm -openproblems-cli list --metrics --task label_projection +# Evaluate the performance of a previously run method using the `accuracy` metric openproblems-cli evaluate --task label_projection --input method.h5ad accuracy ``` +You can list available tasks using `openproblems-cli tasks` +``` +> openproblems-cli tasks +denoising +dimensionality_reduction +label_projection +multimodal_data_integration +regulatory_effect_prediction +``` + +You can then list the avaiable datasets, methods, and metrics for a partiular task using `openproblems-cli list --[datasets|methods|metrics] --task [task_name]` +``` +> openproblems-cli list --datasets --task label_projection +pancreas_batch +pancreas_random +zebrafish_labels +zebrafish_random + +> openproblems-cli list --methods --task label_projection +knn_classifier_log_cpm +knn_classifier_scran +logistic_regression_log_cpm +logistic_regression_scran +mlp_log_cpm +mlp_scran + +> openproblems-cli list --metrics --task label_projection +accuracy +f1 +f1_micro +``` + +The output of these commands are allowable arguments to the respective `load`, `run`, and `evaluate` commands. + ### Sample output ``` diff --git a/openproblems/api/main.py b/openproblems/api/main.py index ce4c57b61f..f40a506319 100644 --- a/openproblems/api/main.py +++ b/openproblems/api/main.py @@ -40,7 +40,7 @@ def _main(args=None): def main(args=None, do_print=True): """Run the command-line interface.""" output = _main(args) - if do_print and output: + if do_print: utils.print_output(output) return 0 else: diff --git a/openproblems/data/Wagner_2018_zebrafish_embryo_CRISPR.py b/openproblems/data/Wagner_2018_zebrafish_embryo_CRISPR.py new file mode 100644 index 0000000000..7cf4d4eb92 --- /dev/null +++ b/openproblems/data/Wagner_2018_zebrafish_embryo_CRISPR.py @@ -0,0 +1,120 @@ +from . import utils + +import anndata +import scprep + + +@utils.loader +def load_zebrafish_chd_tyr(test=False): + """Download zebrafish data from GEO accession GSE112294""" + + # Information about the files to download from GEO + sample_info = [ + ("GSM3067201", "chd", "A"), + ("GSM3067202", "chd", "B"), + ("GSM3067203", "chd", "C"), + ("GSM3067204", "tyr", "A"), + ("GSM3067205", "tyr", "B"), + ("GSM3067206", "tyr", "C"), + ] + counts_url = ( + "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/" + "GSM3067nnn/{accession}/suppl/{accession}_{genotype}{replicate}" + ".csv.gz" + ) + clusters_url = ( + "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/" + "GSM3067nnn/{accession}/suppl/{accession}_{genotype}{replicate}_" + "clustID.txt.gz" + ) + cluster_names_url = ( + "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE112nnn/GSE112294/" + "suppl/GSE112294_ClusterNames.csv.gz" + ) + + # Download each sample and append relevant UMI / cluster info to a list + sparse = True + counts_matrices = [] + batch_labels = [] + metadata = [] + for accession, genotype, replicate in sample_info: + curr_label = "{}{}".format(genotype, replicate) + batch_labels.append(curr_label) + + # Load UMI data from GEO + data = scprep.io.load_csv( + counts_url.format( + accession=accession, + genotype=genotype, + replicate=replicate, + ), + sparse=sparse, + cell_axis="column", + ) + counts_matrices.append(data) + + # Load cluster IDs from GEO + clusters = scprep.io.load_csv( + clusters_url.format( + accession=accession, genotype=genotype, replicate=replicate + ), + cell_names=data.index, + gene_names=["clusterID"], + sparse=sparse, + ) + metadata.append(clusters) + + # Merge data files from each sample + data, sample_labels = scprep.utils.combine_batches( + counts_matrices, batch_labels, append_to_cell_names=True + ) + metadata, _ = scprep.utils.combine_batches( + metadata, batch_labels, append_to_cell_names=True + ) + metadata["sample"] = sample_labels + genotype = [] + condition = [] + replicate = [] + for label in metadata["sample"]: + if label.startswith("chd"): + genotype.append("chd") + condition.append("treatment") + else: + genotype.append("tyr") + condition.append("control") + replicate.append({"A": "1", "B": "2", "C": "3"}[label[-1]]) + + metadata["genotype"] = genotype + metadata["condition"] = condition + metadata["replicate"] = replicate + + # Making cluster names more human-readable + ClusterNamesMaps = scprep.io.load_csv( + cluster_names_url, cell_names=False + ).set_index("ClusterID") + ClusterNamesMaps["ClusterName"] = ClusterNamesMaps["ClusterName"].str.slice(6) + cluster_names = ClusterNamesMaps["ClusterName"].loc[metadata["clusterID"]] + cluster_names.index = metadata.index + metadata["cluster"] = cluster_names + + # Filtering rare genes and cells with high library size + data = scprep.filter.filter_rare_genes(data) + data, metadata = scprep.filter.filter_library_size( + data, metadata, cutoff=15000, keep_cells="below" + ) + + # Removing cells with abnormally high expression of unannotated genes + data, metadata = scprep.filter.filter_gene_set_expression( + data, metadata, genes=["LOC101885394"], cutoff=164 + ) + + # Convert dataframes to AnnData + adata = anndata.AnnData(data, obs=metadata) + + # Sample uniformly from the "genotype" field to + if test: + adata = utils.subsample_even(adata, n_obs=500, even_obs="sample") + utils.filter_genes_cells(adata) + adata = adata[:, :100] + utils.filter_genes_cells(adata) + return adata diff --git a/openproblems/data/tenx_5k_pbmc.py b/openproblems/data/tenx_5k_pbmc.py index c7c003e29a..3fe2d9c946 100644 --- a/openproblems/data/tenx_5k_pbmc.py +++ b/openproblems/data/tenx_5k_pbmc.py @@ -33,6 +33,8 @@ def load_tenx_5k_pbmc(test=False): scprep.io.download.download_url(URL, filepath) adata = sc.read(filepath) + adata.var_names_make_unique() + # Ensure there are no cells or genes with 0 counts utils.filter_genes_cells(adata) diff --git a/openproblems/tasks/denoising/methods/alra.R b/openproblems/tasks/denoising/methods/alra.R new file mode 100644 index 0000000000..2f623e5a5b --- /dev/null +++ b/openproblems/tasks/denoising/methods/alra.R @@ -0,0 +1,15 @@ +library(SingleCellExperiment) +library(Matrix) +source("https://raw.githubusercontent.com/KlugerLab/ALRA/master/alra.R") +library(rsvd) + +# save the matrix of the obsm variable train + +train_matrix <- reducedDim(sce, "train") + +# this is the format used in the alraSeurat2.R example on KlugerLab/ALRA/ repo +reducedDim(sce, "train") <- Matrix(t(alra(t(as.matrix(train_matrix)))[[3]])) + +# return + +sce diff --git a/openproblems/tasks/denoising/methods/alra.py b/openproblems/tasks/denoising/methods/alra.py new file mode 100644 index 0000000000..ad86cde24c --- /dev/null +++ b/openproblems/tasks/denoising/methods/alra.py @@ -0,0 +1,37 @@ +from ....tools.conversion import r_function +from ....tools.decorators import method + +# should be imported by docker +import numpy as np +import scprep + +# from ....tools.utils import check_version + + +_alra = r_function("alra.R") + + +@method( + method_name="ALRA", + paper_name="Zero-preserving imputation of scRNA-seq...", + paper_url="https://www.biorxiv.org/content/10.1101/397588v1", + paper_year=2018, + code_url="https://github.com/KlugerLab/ALRA", + # code_version=check_version("scprep"), + image="openproblems-r-extras", +) +def alra(adata): + # libsize and sqrt norm + # overwrite obsm['train'] with normalized version + adata.obsm["train"] = scprep.utils.matrix_transform(adata.obsm["train"], np.sqrt) + adata.obsm["train"], libsize = scprep.normalize.library_size_normalize( + adata.obsm["train"], rescale=1, return_library_size=True + ) + # run alra + # _alra takes adata returns adata, edits "train" + Y = _alra(adata)["train"] + # transform back into original space + Y = scprep.utils.matrix_transform(Y, np.square) + Y = scprep.utils.matrix_vector_elementwise_multiply(Y, libsize, axis=0) + adata.obsm["denoised"] = Y + return adata diff --git a/openproblems/tasks/dimensionality_reduction/README.md b/openproblems/tasks/dimensionality_reduction/README.md index e6653315b3..cd7a336817 100644 --- a/openproblems/tasks/dimensionality_reduction/README.md +++ b/openproblems/tasks/dimensionality_reduction/README.md @@ -19,6 +19,24 @@ Where $y_i$ is the sum of pairwise euclidian distances between each value embedd Kruskel's stress uses the RMSE, more or less in the now commonly-used MDS (multi-dimensional scaling). We can calculate and plot Kruskel's stress to get an idea where the majority of distortion of the ***topography*** of the data in high-dimensional space. +### Trustworthiness +--- + +_Adapted from the [sklearn documentation.](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.trustworthiness.html)_ + +Trustworthiness expresses to what extent the local structure in an embedding is retained. The trustworthiness is within [0, 1]. It is defined as + +$$ + T(k) = 1 - \frac{2}{nk (2n - 3k - 1)} \sum^n_{i=1} + \sum_{j \in \mathcal{N}_{i}^{k}} \max(0, (r(i, j) - k)) +$$ + +where for each sample i, $\mathcal{N}_{i}^{k}$ are its k nearest neighbors in the output space, and every sample j is its $r(i, j)$-th nearest neighbor in the input space. In other words, any unexpected nearest neighbors in the output space are penalised in proportion to their rank in the input space. + +References: + * "Neighborhood Preservation in Nonlinear Projection Methods: An Experimental Study" J. Venna, S. Kaski + * "Learning a Parametric Embedding by Preserving Local Structure" L.J.P. van der Maaten + ## The results ![Stress-plot RMASE CITE-seq](https://i.imgur.com/ulyAF9j.png) @@ -40,4 +58,8 @@ We can make this comparison across multiple dimensionality reduction methods. We **Metrics** should calculate the quality or "goodness of fit" of a dimensional reduction **method**. +## Pre-processing + +There is a `preprocessing.py` function which contains a set of standard pre-processing functions. These perform steps such as transforming the raw data, selecting features and summarising dimensionality reduction. Where possible each **method** should first call one of these functions and use the processed `adata.X` or `adata.obsm['X_input']` slot as the input to the method. Variants of methods can be created by applying different pre-processing prior to the method itself (see `phate.py` for an example). + 1. Raimundo, F., Vallot, C. & Vert, J. Tuning parameters of dimensionality reduction methods for single-cell RNA-seq analysis. Genome Biol 21, 212 (2020). https://doi.org/10.1186/s13059-020-02128-7 diff --git a/openproblems/tasks/dimensionality_reduction/methods/__init__.py b/openproblems/tasks/dimensionality_reduction/methods/__init__.py index 46bd7b97b7..b784f91b4d 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/__init__.py +++ b/openproblems/tasks/dimensionality_reduction/methods/__init__.py @@ -1,4 +1,5 @@ from .pca import pca -from .phate import phate +from .phate import phate_default +from .phate import phate_scanpy from .tsne import tsne from .umap import umap diff --git a/openproblems/tasks/dimensionality_reduction/methods/pca.py b/openproblems/tasks/dimensionality_reduction/methods/pca.py index 6f40a5b718..061e4979a1 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/pca.py +++ b/openproblems/tasks/dimensionality_reduction/methods/pca.py @@ -1,7 +1,6 @@ from ....tools.decorators import method from ....tools.utils import check_version - -import scanpy as sc +from .preprocessing import preprocess_scanpy @method( @@ -14,6 +13,6 @@ code_version=check_version("scikit-learn"), ) def pca(adata): - sc.tl.pca(adata) - adata.obsm["X_emb"] = adata.obsm["X_pca"][:, :2] + preprocess_scanpy(adata) + adata.obsm["X_emb"] = adata.obsm["X_input"][:, :2] return adata diff --git a/openproblems/tasks/dimensionality_reduction/methods/phate.py b/openproblems/tasks/dimensionality_reduction/methods/phate.py index 36217cff5a..016a1c64c7 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/phate.py +++ b/openproblems/tasks/dimensionality_reduction/methods/phate.py @@ -1,10 +1,19 @@ from ....tools.decorators import method from ....tools.normalize import sqrt_cpm from ....tools.utils import check_version +from .preprocessing import preprocess_scanpy + + +def _phate(adata): + from phate import PHATE + + phate_op = PHATE(verbose=False, n_jobs=-1) + adata.obsm["X_emb"] = phate_op.fit_transform(adata.X) + return adata @method( - method_name="PHATE", + method_name="PHATE (default pre-processing)", paper_name="Visualizing Transitions and Structure for Biological Data Exploration", paper_url="https://www.nature.com/articles/s41587-019-0336-3", paper_year=2019, @@ -12,10 +21,20 @@ code_version=check_version("phate"), image="openproblems-python-extras", ) -def phate(adata): - from phate import PHATE - +def phate_default(adata): sqrt_cpm(adata) - phate_op = PHATE(verbose=False, n_jobs=-1) - adata.obsm["X_emb"] = phate_op.fit_transform(adata.X) - return adata + return _phate(adata) + + +@method( + method_name="PHATE (scanpy pre-processing)", + paper_name="Visualizing Transitions and Structure for Biological Data Exploration", + paper_url="https://www.nature.com/articles/s41587-019-0336-3", + paper_year=2019, + code_url="https://github.com/KrishnaswamyLab/PHATE/", + code_version=check_version("phate"), + image="openproblems-python-extras", +) +def phate_scanpy(adata): + preprocess_scanpy(adata) + return _phate(adata) diff --git a/openproblems/tasks/dimensionality_reduction/methods/preprocessing.py b/openproblems/tasks/dimensionality_reduction/methods/preprocessing.py new file mode 100644 index 0000000000..e1d378cbb3 --- /dev/null +++ b/openproblems/tasks/dimensionality_reduction/methods/preprocessing.py @@ -0,0 +1,11 @@ +from ....tools.normalize import log_cpm + +import scanpy as sc + + +def preprocess_scanpy(adata): + + log_cpm(adata) + sc.pp.highly_variable_genes(adata, n_top_genes=1000) + sc.tl.pca(adata, n_comps=50, svd_solver="arpack") + adata.obsm["X_input"] = adata.obsm["X_pca"] diff --git a/openproblems/tasks/dimensionality_reduction/methods/tsne.py b/openproblems/tasks/dimensionality_reduction/methods/tsne.py index 4f82607877..2376de3116 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/tsne.py +++ b/openproblems/tasks/dimensionality_reduction/methods/tsne.py @@ -1,5 +1,6 @@ from ....tools.decorators import method from ....tools.utils import check_version +from .preprocessing import preprocess_scanpy import scanpy as sc @@ -15,7 +16,7 @@ image="openproblems-python-extras", ) def tsne(adata): - sc.pp.pca(adata) - sc.tl.tsne(adata) + preprocess_scanpy(adata) + sc.tl.tsne(adata, use_rep="X_input", n_pcs=50) adata.obsm["X_emb"] = adata.obsm["X_tsne"] return adata diff --git a/openproblems/tasks/dimensionality_reduction/methods/umap.py b/openproblems/tasks/dimensionality_reduction/methods/umap.py index e5366699d2..ce9c49c7dc 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/umap.py +++ b/openproblems/tasks/dimensionality_reduction/methods/umap.py @@ -1,5 +1,6 @@ from ....tools.decorators import method from ....tools.utils import check_version +from .preprocessing import preprocess_scanpy import scanpy as sc @@ -14,7 +15,8 @@ code_version=check_version("umap-learn"), ) def umap(adata): - sc.pp.neighbors(adata) + preprocess_scanpy(adata) + sc.pp.neighbors(adata, use_rep="X_input", n_pcs=50) sc.tl.umap(adata) adata.obsm["X_emb"] = adata.obsm["X_umap"] return adata diff --git a/openproblems/tasks/dimensionality_reduction/metrics/__init__.py b/openproblems/tasks/dimensionality_reduction/metrics/__init__.py index fd866a280d..e83be2be05 100644 --- a/openproblems/tasks/dimensionality_reduction/metrics/__init__.py +++ b/openproblems/tasks/dimensionality_reduction/metrics/__init__.py @@ -1 +1,2 @@ from .root_mean_square_error import rmse +from .trustworthiness import trustworthiness diff --git a/openproblems/tasks/dimensionality_reduction/metrics/trustworthiness.py b/openproblems/tasks/dimensionality_reduction/metrics/trustworthiness.py new file mode 100644 index 0000000000..fc91c42e44 --- /dev/null +++ b/openproblems/tasks/dimensionality_reduction/metrics/trustworthiness.py @@ -0,0 +1,19 @@ +from ....tools.decorators import metric +from anndata import AnnData +from sklearn import manifold + +import numpy as np + + +@metric(metric_name="trustworthiness", maximize=True) +def trustworthiness(adata: AnnData) -> float: + high_dim, low_dim = adata.X, adata.obsm["X_emb"] + + if np.any(~np.isfinite(low_dim)): + return 0.0 + + score = manifold.trustworthiness( + high_dim, low_dim, n_neighbors=15, metric="euclidean" + ) + # for large k close to #samples, it's higher than 1.0, e.g 1.0000073552559712 + return float(np.clip(score, 0, 1)) diff --git a/openproblems/tasks/multimodal_data_integration/methods/scot.py b/openproblems/tasks/multimodal_data_integration/methods/scot.py new file mode 100644 index 0000000000..6df7632fca --- /dev/null +++ b/openproblems/tasks/multimodal_data_integration/methods/scot.py @@ -0,0 +1,97 @@ +from ....tools.decorators import method +from ....tools.normalize import log_cpm +from ....tools.normalize import log_scran_pooling +from ....tools.normalize import sqrt_cpm +from ....tools.utils import check_version + +import sklearn.decomposition + + +def _scot(adata, n_svd=100, balanced=False): + from SCOT import SCOT + + # PCA reduction + n_svd = min([n_svd, min(adata.X.shape) - 1, min(adata.obsm["mode2"].shape) - 1]) + X_pca = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(adata.X) + Y_pca = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(adata.obsm["mode2"]) + + # Initialize SCOT + scot = SCOT(X_pca, Y_pca) + + # call the unbalanced alignment + # From https://github.com/rsinghlab/SCOT/blob/master/examples/unbalanced_GW_SNAREseq.ipynb # noqa: 501 + X_new_unbal, y_new_unbal = scot.align( + k=50, e=1e-3, rho=0.0005, normalize=True, balanced=balanced + ) + adata.obsm["aligned"] = X_new_unbal + adata.obsm["mode2_aligned"] = y_new_unbal + + return adata + + +@method( + method_name="Single Cell Optimal Transport (sqrt CPM unbalanced)", + paper_name="Gromov-Wasserstein optimal transport" + "to align single-cell multi-omics data", + paper_url="https://www.biorxiv.org/content/10.1101/2020.04.28.066787", + paper_year=2020, + code_url="https://github.com/rsinghlab/SCOT", + code_version=check_version("SCOT"), + image="openproblems-python-extras", +) +def scot_sqrt_cpm_unbalanced(adata, n_svd=100, balanced=False): + sqrt_cpm(adata) + log_cpm(adata, obsm="mode2", obs="mode2_obs", var="mode2_var") + _scot(adata, n_svd=n_svd, balanced=balanced) + return adata + + +@method( + method_name="Single Cell Optimal Transport (sqrt CPM balanced)", + paper_name="Gromov-Wasserstein optimal transport to " + "align single-cell multi-omics data", + paper_url="https://www.biorxiv.org/content/10.1101/2020.04.28.066787", + paper_year=2020, + code_url="https://github.com/rsinghlab/SCOT", + code_version=check_version("SCOT"), + image="openproblems-python-extras", +) +def scot_sqrt_cpm_balanced(adata, n_svd=100, balanced=True): + sqrt_cpm(adata) + log_cpm(adata, obsm="mode2", obs="mode2_obs", var="mode2_var") + _scot(adata, n_svd=n_svd, balanced=balanced) + return adata + + +@method( + method_name="Single Cell Optimal Transport (log scran unbalanced)", + paper_name="Gromov-Wasserstein optimal transport to " + "align single-cell multi-omics data", + paper_url="https://www.biorxiv.org/content/10.1101/2020.04.28.066787", + paper_year=2020, + code_url="https://github.com/rsinghlab/SCOT", + code_version=check_version("SCOT"), + image="openproblems-python-extras", +) +def scot_log_scran_pooling_unbalanced(adata, n_svd=100, balanced=False): + log_scran_pooling(adata) + log_cpm(adata, obsm="mode2", obs="mode2_obs", var="mode2_var") + _scot(adata, n_svd=n_svd, balanced=balanced) + return adata + + +@method( + method_name="Single Cell Optimal Transport (log scran balanced)", + paper_name="Gromov-Wasserstein optimal transport to " + "align single-cell multi-omics data", + paper_url="https://www.biorxiv.org/content/10.1101/2020.04.28.066787", + paper_year=2020, + code_url="https://github.com/rsinghlab/SCOT", + code_version=check_version("SCOT"), + image="openproblems-python-extras", +) +def scot_log_scran_pooling_balanced(adata, n_svd=100, balanced=True): + log_scran_pooling(adata) + log_cpm(adata, obsm="mode2", obs="mode2_obs", var="mode2_var") + _scot(adata, n_svd=n_svd, balanced=balanced) + return adata diff --git a/test/test_cli.py b/test/test_cli.py index 73ec1f75f0..0b7014588b 100644 --- a/test/test_cli.py +++ b/test/test_cli.py @@ -128,6 +128,33 @@ def test_hash(task, function_type, function_name): assert h1 == h2 +def test_zero_metric(): + def __zero_metric(*args): + return 0.0 + + metric_name = utils.name.object_name(__zero_metric) + task = openproblems.TASKS[0] + setattr(task.metrics, metric_name, __zero_metric) + adata = task.api.sample_dataset() + with tempfile.TemporaryDirectory() as tempdir: + dataset_file = os.path.join(tempdir, "dataset.h5ad") + adata.write_h5ad(dataset_file) + + result = main( + [ + "evaluate", + "--task", + task.__name__.split(".")[-1], + "--input", + dataset_file, + metric_name, + ], + do_print=True, + ) + assert result == 0 + assert isinstance(result, int) + + def test_pipeline(): """Test evaluation pipeline.""" with tempfile.TemporaryDirectory() as tempdir: diff --git a/test/test_metrics.py b/test/test_metrics.py index 0c3bdbc752..babdcb13a5 100644 --- a/test/test_metrics.py +++ b/test/test_metrics.py @@ -45,3 +45,33 @@ def test_metric(task_name, metric_name, tempdir, image): ) m = metric(adata) assert isinstance(m, numbers.Number) + + +@parameterized.parameterized.expand( + [ + ( + utils.TEMPDIR.name, + openproblems.tasks.dimensionality_reduction.metrics.trustworthiness.metadata[ # noqa: E501 + "image" + ], + ) + ], + name_func=utils.name.name_test, +) +@utils.docker.docker_test +def test_trustworthiness_sparse(tempdir, image): + from scipy.sparse import csr_matrix + + task = openproblems.tasks.dimensionality_reduction + metric = openproblems.tasks.dimensionality_reduction.metrics.trustworthiness + + adata = task.api.sample_dataset() + adata = task.api.sample_method(adata) + openproblems.log.debug( + "Testing {} metric from {} task".format(metric.__name__, task.__name__) + ) + adata.X = csr_matrix(adata.X) + m = metric(adata) + + assert isinstance(m, float) + assert 0 <= m <= 1 diff --git a/workflow/parse_nextflow.py b/workflow/parse_nextflow.py index 25cb6d7dc4..69908ef928 100644 --- a/workflow/parse_nextflow.py +++ b/workflow/parse_nextflow.py @@ -96,13 +96,26 @@ def parse_trace_to_dict(df): def parse_metric_results(results): """Add metric results to the trace output.""" + missing_traces = [] for filename in os.listdir("results/metrics"): with open(os.path.join("results/metrics", filename), "r") as handle: result = float(handle.read().strip()) task_name, dataset_name, method_name, metric_name = filename.replace( ".metric.txt", "" ).split(".") - results[task_name][dataset_name][method_name]["metrics"][metric_name] = result + try: + results[task_name][dataset_name][method_name]["metrics"][ + metric_name + ] = result + except KeyError: + missing_traces.append(filename) + if len(missing_traces) > 0: + print("Missing execution trace for: ") + for filename in missing_traces: + print(" {}".format(filename)) + raise ValueError( + "Missing execution traces for {} results.".format(len(missing_traces)) + ) return results