From a64c516252ea119288cd3a34d2cb42e1d426d467 Mon Sep 17 00:00:00 2001 From: Wesley Lewis <59123674+wes-lewis@users.noreply.github.com> Date: Mon, 29 Mar 2021 09:07:27 -0400 Subject: [PATCH 01/35] Create alra.py Add alra.py file as created previously from the wrong branch. --- .../dimensionality_reduction/methods/alra.py | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 openproblems/tasks/dimensionality_reduction/methods/alra.py diff --git a/openproblems/tasks/dimensionality_reduction/methods/alra.py b/openproblems/tasks/dimensionality_reduction/methods/alra.py new file mode 100644 index 0000000000..47a4228f71 --- /dev/null +++ b/openproblems/tasks/dimensionality_reduction/methods/alra.py @@ -0,0 +1,51 @@ +from ....tools.decorators import method +from ....tools.utils import check_version + +import numpy as np +import scprep + +_alra = scprep.run.RFunction( + setup=""" + library(SingleCellExperiment) + library(Matrix) + library(rsvd) + source('https://raw.githubusercontent.com/KlugerLab/ALRA/master/alra.R') + + """, + args="sce", + body=""" + assay(sce, "X") <- as(assay(sce, "X"), "CsparseMatrix") + reducedDim(sce, "train") <- as(reducedDim(sce, "train"), "CsparseMatrix") + completed <- alra(as.matrix(reducedDim(sce, "train"))) + normCompleted <- completed[[3]] + reducedDim(sce, "train") <- as.matrix(normCompleted) + sce + """, +) + + +@method( + method_name="ALRA", + paper_name="Zero-preserving imputation of scRNA-seq data using low-rank approximation", + paper_url="https://www.biorxiv.org/content/10.1101/397588v1", + paper_year=2018, + code_url="https://github.com/KlugerLab/ALRA", + code_version=check_version("scprep"), + image="openproblems-r-extras", +) +def alra(adata): + X, libsize = scprep.normalize.library_size_normalize( + adata.obsm["train"], rescale=1, return_library_size=True + ) + X = scprep.transform.sqrt( + X + ) # note that log transform is used in the examples of alra on github + adata.obsm[ + "train" + ] = X # we know that the scprep's .run.Rfunction uses anndata.2ri, where reducedDim(d, "PCA")=adata.obsm["X_PCA"], but that these dim-reduction equivalents + # do not encompass any given key hold true across all contexts. Thus, we use X_PCA for now to ensure compatability. + Y = _alra(X) + Y = scprep.utils.matrix_transform(Y, np.square) + Y = scprep.utils.matrix_vector_elementwise_multiply(Y, libsize, axis=0) + adata.obsm["denoised"] = Y + return adata From 9c7ae440f48918c946ae606b3c667cd44763cc11 Mon Sep 17 00:00:00 2001 From: Daniel Burkhardt Date: Mon, 29 Mar 2021 09:23:32 -0400 Subject: [PATCH 02/35] Update TASKS.md --- TASKS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TASKS.md b/TASKS.md index 5c76559099..4e1e08ce39 100644 --- a/TASKS.md +++ b/TASKS.md @@ -1,4 +1,4 @@ -## Single-Cell Analysis Benchmarking Tasks +# Benchmarking Task Descriptions Table of Contents * [Predicting gene expression from chromatin accessibility](#predicting-gene-expression-from-chromatin-accessibility) From ce51fa2af08800405ba183c38a0df3a3fa94e628 Mon Sep 17 00:00:00 2001 From: Daniel Burkhardt Date: Mon, 29 Mar 2021 09:32:02 -0400 Subject: [PATCH 03/35] Update SAGEMAKER.md --- SAGEMAKER.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/SAGEMAKER.md b/SAGEMAKER.md index 8380ad722b..46c6cf6e05 100644 --- a/SAGEMAKER.md +++ b/SAGEMAKER.md @@ -31,7 +31,7 @@ There is a 1:1 correspondence between the steps to set up SageMaker using the CL - [Add user to SageMaker Studio](#add-user-to-sagemaker-studio) - [Open SageMaker Studio and Launch a Notebook using a Custom Image](#open-sagemaker-studio-and-launch-a-notebook-using-a-custom-image) * [Selecting an instance type](#selecting-an-instance-type) - * [Kernel not found error](#kernel-not-found-error) + * [Failed to start kernel (image does not exist)](#failed-to-start-kernel-image-does-not-exist) @@ -186,7 +186,7 @@ We've selected three instances to use during the Jamboree. Note, it is possible To change your instance, follow the [Change Instance Type](https://docs.aws.amazon.com/sagemaker/latest/dg/notebooks-run-and-manage-switch-instance-type.html) tutorial from AWS. -### Kernel not found error +### Failed to start kernel (image does not exist) If you see the following error: From 8739ebf869f3061588f199916360b0d3fe46b867 Mon Sep 17 00:00:00 2001 From: Wesley Lewis <59123674+wes-lewis@users.noreply.github.com> Date: Mon, 29 Mar 2021 09:39:33 -0400 Subject: [PATCH 04/35] Delete alra.py removing from the incorrect task --- .../dimensionality_reduction/methods/alra.py | 51 ------------------- 1 file changed, 51 deletions(-) delete mode 100644 openproblems/tasks/dimensionality_reduction/methods/alra.py diff --git a/openproblems/tasks/dimensionality_reduction/methods/alra.py b/openproblems/tasks/dimensionality_reduction/methods/alra.py deleted file mode 100644 index 47a4228f71..0000000000 --- a/openproblems/tasks/dimensionality_reduction/methods/alra.py +++ /dev/null @@ -1,51 +0,0 @@ -from ....tools.decorators import method -from ....tools.utils import check_version - -import numpy as np -import scprep - -_alra = scprep.run.RFunction( - setup=""" - library(SingleCellExperiment) - library(Matrix) - library(rsvd) - source('https://raw.githubusercontent.com/KlugerLab/ALRA/master/alra.R') - - """, - args="sce", - body=""" - assay(sce, "X") <- as(assay(sce, "X"), "CsparseMatrix") - reducedDim(sce, "train") <- as(reducedDim(sce, "train"), "CsparseMatrix") - completed <- alra(as.matrix(reducedDim(sce, "train"))) - normCompleted <- completed[[3]] - reducedDim(sce, "train") <- as.matrix(normCompleted) - sce - """, -) - - -@method( - method_name="ALRA", - paper_name="Zero-preserving imputation of scRNA-seq data using low-rank approximation", - paper_url="https://www.biorxiv.org/content/10.1101/397588v1", - paper_year=2018, - code_url="https://github.com/KlugerLab/ALRA", - code_version=check_version("scprep"), - image="openproblems-r-extras", -) -def alra(adata): - X, libsize = scprep.normalize.library_size_normalize( - adata.obsm["train"], rescale=1, return_library_size=True - ) - X = scprep.transform.sqrt( - X - ) # note that log transform is used in the examples of alra on github - adata.obsm[ - "train" - ] = X # we know that the scprep's .run.Rfunction uses anndata.2ri, where reducedDim(d, "PCA")=adata.obsm["X_PCA"], but that these dim-reduction equivalents - # do not encompass any given key hold true across all contexts. Thus, we use X_PCA for now to ensure compatability. - Y = _alra(X) - Y = scprep.utils.matrix_transform(Y, np.square) - Y = scprep.utils.matrix_vector_elementwise_multiply(Y, libsize, axis=0) - adata.obsm["denoised"] = Y - return adata From 86962c27f32749e328dd8d7c036aa580bd15c12f Mon Sep 17 00:00:00 2001 From: Wesley Lewis <59123674+wes-lewis@users.noreply.github.com> Date: Mon, 29 Mar 2021 09:41:08 -0400 Subject: [PATCH 05/35] Create alra.py Add alra to denoising task --- openproblems/tasks/denoising/methods/alra.py | 51 ++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 openproblems/tasks/denoising/methods/alra.py diff --git a/openproblems/tasks/denoising/methods/alra.py b/openproblems/tasks/denoising/methods/alra.py new file mode 100644 index 0000000000..47a4228f71 --- /dev/null +++ b/openproblems/tasks/denoising/methods/alra.py @@ -0,0 +1,51 @@ +from ....tools.decorators import method +from ....tools.utils import check_version + +import numpy as np +import scprep + +_alra = scprep.run.RFunction( + setup=""" + library(SingleCellExperiment) + library(Matrix) + library(rsvd) + source('https://raw.githubusercontent.com/KlugerLab/ALRA/master/alra.R') + + """, + args="sce", + body=""" + assay(sce, "X") <- as(assay(sce, "X"), "CsparseMatrix") + reducedDim(sce, "train") <- as(reducedDim(sce, "train"), "CsparseMatrix") + completed <- alra(as.matrix(reducedDim(sce, "train"))) + normCompleted <- completed[[3]] + reducedDim(sce, "train") <- as.matrix(normCompleted) + sce + """, +) + + +@method( + method_name="ALRA", + paper_name="Zero-preserving imputation of scRNA-seq data using low-rank approximation", + paper_url="https://www.biorxiv.org/content/10.1101/397588v1", + paper_year=2018, + code_url="https://github.com/KlugerLab/ALRA", + code_version=check_version("scprep"), + image="openproblems-r-extras", +) +def alra(adata): + X, libsize = scprep.normalize.library_size_normalize( + adata.obsm["train"], rescale=1, return_library_size=True + ) + X = scprep.transform.sqrt( + X + ) # note that log transform is used in the examples of alra on github + adata.obsm[ + "train" + ] = X # we know that the scprep's .run.Rfunction uses anndata.2ri, where reducedDim(d, "PCA")=adata.obsm["X_PCA"], but that these dim-reduction equivalents + # do not encompass any given key hold true across all contexts. Thus, we use X_PCA for now to ensure compatability. + Y = _alra(X) + Y = scprep.utils.matrix_transform(Y, np.square) + Y = scprep.utils.matrix_vector_elementwise_multiply(Y, libsize, axis=0) + adata.obsm["denoised"] = Y + return adata From bbb93131ef1a3897acd7a8f326f6f333ea76ef53 Mon Sep 17 00:00:00 2001 From: Daniel Burkhardt Date: Mon, 29 Mar 2021 09:48:46 -0400 Subject: [PATCH 06/35] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index c0474ac674..7c7245807e 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ [![Code Style: Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![Style Guide: OpenStack](https://img.shields.io/badge/style%20guide-openstack-eb1a32.svg)](https://docs.openstack.org/hacking/latest/user/hacking.html#styleguide) [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) +[![Video](https://img.shields.io/static/v1?label=YouTube&message=Visit%20channel&color=red&logo=youtube)](https://www.youtube.com/channel/UCJpqxlzxRamcA3Pv3KlYZHg) Formalizing and benchmarking open problems in single-cell genomics. @@ -17,6 +18,8 @@ Formalizing and benchmarking open problems in single-cell genomics. * For instructions on editing Docker images, see [docker/README.md](docker/README.md) * For a walkthrough of the GitHub Actions workflows and AWS Sagemaker, see [SAGEMAKER.md](SAGEMAKER.md) * For a description of existing an proposed tasks, see [TASKS.md](TASKS.md) +* For a video introduction to this GitHub repository, watch our [Repository introduction](https://www.youtube.com/watch?v=tHempZCdXyA) +* For a video tutorial on adding new tasks, watch our [How to add a new task tutorial](https://www.youtube.com/watch?v=tgVG3Hp6mBc) ## The team From 884790fe4a724d94cb370d88c5d8baafaa426237 Mon Sep 17 00:00:00 2001 From: Daniel Strobl <50872326+danielStrobl@users.noreply.github.com> Date: Mon, 29 Mar 2021 17:28:04 +0200 Subject: [PATCH 07/35] Update README.md Correcting my name in README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7c7245807e..1b057da20c 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ Formalizing and benchmarking open problems in single-cell genomics. * Wes Lewis (@weslewis) - Differential Abundance and Data Denoising * Mohammad Lotfallahi (@M0hammadL) - Label projection task * Qian Qin (@qinqian) - Predicting gene expression from chromatin accessibility -* Daniel Strobel (@danielStrobel) - Batch integration +* Daniel Strobl (@danielStrobl) - Batch integration * Michael Vinyard (@mvinyard) - Stress preservation in Dimensionality Reduction * Florian Wagner (@flo-compbio) - Data denoising From e5401bcebb6b04befb5c767d0702a87a71f0e9ed Mon Sep 17 00:00:00 2001 From: Nikolay Markov Date: Mon, 29 Mar 2021 22:11:47 -0500 Subject: [PATCH 08/35] Fix cli failing tests when metric returns 0.0 float --- openproblems/api/main.py | 2 +- test/test_cli.py | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/openproblems/api/main.py b/openproblems/api/main.py index ce4c57b61f..f40a506319 100644 --- a/openproblems/api/main.py +++ b/openproblems/api/main.py @@ -40,7 +40,7 @@ def _main(args=None): def main(args=None, do_print=True): """Run the command-line interface.""" output = _main(args) - if do_print and output: + if do_print: utils.print_output(output) return 0 else: diff --git a/test/test_cli.py b/test/test_cli.py index 73ec1f75f0..0b7014588b 100644 --- a/test/test_cli.py +++ b/test/test_cli.py @@ -128,6 +128,33 @@ def test_hash(task, function_type, function_name): assert h1 == h2 +def test_zero_metric(): + def __zero_metric(*args): + return 0.0 + + metric_name = utils.name.object_name(__zero_metric) + task = openproblems.TASKS[0] + setattr(task.metrics, metric_name, __zero_metric) + adata = task.api.sample_dataset() + with tempfile.TemporaryDirectory() as tempdir: + dataset_file = os.path.join(tempdir, "dataset.h5ad") + adata.write_h5ad(dataset_file) + + result = main( + [ + "evaluate", + "--task", + task.__name__.split(".")[-1], + "--input", + dataset_file, + metric_name, + ], + do_print=True, + ) + assert result == 0 + assert isinstance(result, int) + + def test_pipeline(): """Test evaluation pipeline.""" with tempfile.TemporaryDirectory() as tempdir: From b24765f35e09fcdf6ce2fbf092201d9d93fd718b Mon Sep 17 00:00:00 2001 From: Daniel Burkhardt Date: Tue, 30 Mar 2021 08:19:34 -0400 Subject: [PATCH 09/35] Create Wagner_2018_zebrafish_embryo_CRISPR.py --- .../Wagner_2018_zebrafish_embryo_CRISPR.py | 120 ++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 openproblems/data/Wagner_2018_zebrafish_embryo_CRISPR.py diff --git a/openproblems/data/Wagner_2018_zebrafish_embryo_CRISPR.py b/openproblems/data/Wagner_2018_zebrafish_embryo_CRISPR.py new file mode 100644 index 0000000000..7cf4d4eb92 --- /dev/null +++ b/openproblems/data/Wagner_2018_zebrafish_embryo_CRISPR.py @@ -0,0 +1,120 @@ +from . import utils + +import anndata +import scprep + + +@utils.loader +def load_zebrafish_chd_tyr(test=False): + """Download zebrafish data from GEO accession GSE112294""" + + # Information about the files to download from GEO + sample_info = [ + ("GSM3067201", "chd", "A"), + ("GSM3067202", "chd", "B"), + ("GSM3067203", "chd", "C"), + ("GSM3067204", "tyr", "A"), + ("GSM3067205", "tyr", "B"), + ("GSM3067206", "tyr", "C"), + ] + counts_url = ( + "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/" + "GSM3067nnn/{accession}/suppl/{accession}_{genotype}{replicate}" + ".csv.gz" + ) + clusters_url = ( + "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/" + "GSM3067nnn/{accession}/suppl/{accession}_{genotype}{replicate}_" + "clustID.txt.gz" + ) + cluster_names_url = ( + "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE112nnn/GSE112294/" + "suppl/GSE112294_ClusterNames.csv.gz" + ) + + # Download each sample and append relevant UMI / cluster info to a list + sparse = True + counts_matrices = [] + batch_labels = [] + metadata = [] + for accession, genotype, replicate in sample_info: + curr_label = "{}{}".format(genotype, replicate) + batch_labels.append(curr_label) + + # Load UMI data from GEO + data = scprep.io.load_csv( + counts_url.format( + accession=accession, + genotype=genotype, + replicate=replicate, + ), + sparse=sparse, + cell_axis="column", + ) + counts_matrices.append(data) + + # Load cluster IDs from GEO + clusters = scprep.io.load_csv( + clusters_url.format( + accession=accession, genotype=genotype, replicate=replicate + ), + cell_names=data.index, + gene_names=["clusterID"], + sparse=sparse, + ) + metadata.append(clusters) + + # Merge data files from each sample + data, sample_labels = scprep.utils.combine_batches( + counts_matrices, batch_labels, append_to_cell_names=True + ) + metadata, _ = scprep.utils.combine_batches( + metadata, batch_labels, append_to_cell_names=True + ) + metadata["sample"] = sample_labels + genotype = [] + condition = [] + replicate = [] + for label in metadata["sample"]: + if label.startswith("chd"): + genotype.append("chd") + condition.append("treatment") + else: + genotype.append("tyr") + condition.append("control") + replicate.append({"A": "1", "B": "2", "C": "3"}[label[-1]]) + + metadata["genotype"] = genotype + metadata["condition"] = condition + metadata["replicate"] = replicate + + # Making cluster names more human-readable + ClusterNamesMaps = scprep.io.load_csv( + cluster_names_url, cell_names=False + ).set_index("ClusterID") + ClusterNamesMaps["ClusterName"] = ClusterNamesMaps["ClusterName"].str.slice(6) + cluster_names = ClusterNamesMaps["ClusterName"].loc[metadata["clusterID"]] + cluster_names.index = metadata.index + metadata["cluster"] = cluster_names + + # Filtering rare genes and cells with high library size + data = scprep.filter.filter_rare_genes(data) + data, metadata = scprep.filter.filter_library_size( + data, metadata, cutoff=15000, keep_cells="below" + ) + + # Removing cells with abnormally high expression of unannotated genes + data, metadata = scprep.filter.filter_gene_set_expression( + data, metadata, genes=["LOC101885394"], cutoff=164 + ) + + # Convert dataframes to AnnData + adata = anndata.AnnData(data, obs=metadata) + + # Sample uniformly from the "genotype" field to + if test: + adata = utils.subsample_even(adata, n_obs=500, even_obs="sample") + utils.filter_genes_cells(adata) + adata = adata[:, :100] + utils.filter_genes_cells(adata) + return adata From 3a919e8d4e9bdb8e083a878625ad46b3dd152434 Mon Sep 17 00:00:00 2001 From: Daniel Burkhardt Date: Tue, 30 Mar 2021 08:45:29 -0400 Subject: [PATCH 10/35] Update CONTRIBUTING.md --- CONTRIBUTING.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 821e76d030..a0d0898677 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -177,6 +177,10 @@ If you are unable to write your method using our base dependencies, you may add Datasets are loaded under `openproblems/data`. Each data loading function should download the appropriate dataset from a stable location (e.g. from Figshare) be decorated with `openproblems.data.utils.loader` in order to cache the result. +To see a gold standard loader, look at [openproblems/data/Wagner_2018_zebrafish_embryo_CRISPR.py](https://github.com/singlecellopenproblems/SingleCellOpenProblems/blob/master/openproblems/data/Wagner_2018_zebrafish_embryo_CRISPR.py) + +This file name should match `[First Author Last Name]_[Year Published]_short_Description_of_data.py`. E.g. the dataset of zebrafish embryos perturbed with CRISPR published in 2018 by Wagner _et al._ becomes `Wagner_2018_zebrafish_embryo_CRISPR.py` + ### Adding a dataset / method / metric to a task To add a dataset, method, or metric to a task, simply create a new `.py` file corresponding to your proposed new functionality and import the main function in the corresponding `__init__.py`. E.g., to add a "F2" metric to the label projection task, we would create `openproblems/tasks/label_projection/metrics/f2.py` and add a line From 03d4e294289da5f2a8a3a7c2e5c0a8de57330f19 Mon Sep 17 00:00:00 2001 From: Daniel Burkhardt Date: Tue, 30 Mar 2021 13:03:33 -0400 Subject: [PATCH 11/35] Update CONTRIBUTING.md --- CONTRIBUTING.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a0d0898677..e2c3310704 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -27,9 +27,9 @@ link to it from your website, or simply star it in GitHub to say "I use it". + [Writing functions in R](#writing-functions-in-r) + [Adding package dependencies](#adding-package-dependencies) + [Adding a new dataset](#adding-a-new-dataset) - + [Adding a dataset / method / metric to a task](#adding-a-dataset---method---metric-to-a-task) + + [Adding a dataset / method / metric to a task](#adding-a-dataset--method--metric-to-a-task) + [Adding a new task](#adding-a-new-task) - + [Adding a new Docker container](#adding-a-new-container) + + [Adding a new Docker container](#adding-a-new-docker-container) * [Code Style and Testing](#code-style-and-testing) * [Code of Conduct](#code-of-conduct) * [Attribution](#attribution) @@ -239,7 +239,9 @@ Datasets, methods and metrics run inside Docker containers. We provide a few to ## Code Style and Testing -`singlecellopenproblems` is maintained at close to 100% code coverage. For datasets, methods, and metrics, tests are generated automatically. For additions outside this core functionality, contributors are encouraged to write tests for their code -- but if you do not know how to do so, please do not feel discouraged from contributing code! Others can always help you test your contribution. +`singlecellopenproblems` is maintained at close to 100% code coverage. For datasets, methods, and metrics, tests are generated programatically from each task's `api.py`. See the [Adding a new task](#adding-a-new-task) section for instructions on creating this file. + +For additions outside this core functionality, contributors are encouraged to write tests for their code -- but if you do not know how to do so, please do not feel discouraged from contributing code! Others can always help you test your contribution. Code is tested by GitHub Actions when you push your changes. However, if you wish to test locally, you can do so with the following command: ``` From 65656430bee369c35cd0587b6509967f79754142 Mon Sep 17 00:00:00 2001 From: Daniel Burkhardt Date: Tue, 30 Mar 2021 16:39:25 -0400 Subject: [PATCH 12/35] Dockerfile bugfix --- docker/openproblems/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker/openproblems/Dockerfile b/docker/openproblems/Dockerfile index 60487de49f..b1b995c663 100644 --- a/docker/openproblems/Dockerfile +++ b/docker/openproblems/Dockerfile @@ -1,5 +1,7 @@ FROM python:3.8 +# Adding this comment as a temporary bugfix on 3/30 + # Setting up Sagemaker Studio Image from example # https://github.com/aws-samples/sagemaker-studio-custom-image-samples/blob/main/examples/echo-kernel-image/Dockerfile From b4b850fbe82d150b1ef025557acddef41437fc82 Mon Sep 17 00:00:00 2001 From: Daniel Burkhardt Date: Tue, 30 Mar 2021 16:41:53 -0400 Subject: [PATCH 13/35] Force docker_build on `master` --- .github/workflows/build_docker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_docker.yml b/.github/workflows/build_docker.yml index fcfdf733b2..e34907b398 100644 --- a/.github/workflows/build_docker.yml +++ b/.github/workflows/build_docker.yml @@ -45,7 +45,7 @@ jobs: - name: Build Docker images run: | cd workflow - snakemake -j $(grep -c processor /proc/cpuinfo) docker + snakemake -j $(grep -c processor /proc/cpuinfo) docker_build cd .. - name: Push Docker images From efab58094d81674b3a8c38276ca0faf0287e5325 Mon Sep 17 00:00:00 2001 From: LuckyMD Date: Wed, 31 Mar 2021 01:28:25 +0200 Subject: [PATCH 14/35] make var names unique for tenx data loader --- openproblems/data/tenx_5k_pbmc.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/openproblems/data/tenx_5k_pbmc.py b/openproblems/data/tenx_5k_pbmc.py index c7c003e29a..3fe2d9c946 100644 --- a/openproblems/data/tenx_5k_pbmc.py +++ b/openproblems/data/tenx_5k_pbmc.py @@ -33,6 +33,8 @@ def load_tenx_5k_pbmc(test=False): scprep.io.download.download_url(URL, filepath) adata = sc.read(filepath) + adata.var_names_make_unique() + # Ensure there are no cells or genes with 0 counts utils.filter_genes_cells(adata) From cca19ec5a326bbfc9b875e85041c7f7d132009e0 Mon Sep 17 00:00:00 2001 From: Wesley Lewis <59123674+wes-lewis@users.noreply.github.com> Date: Tue, 30 Mar 2021 21:04:26 -0400 Subject: [PATCH 15/35] Update alra.py Change to utilize alra.R support script. --- openproblems/tasks/denoising/methods/alra.py | 37 ++++++-------------- 1 file changed, 10 insertions(+), 27 deletions(-) diff --git a/openproblems/tasks/denoising/methods/alra.py b/openproblems/tasks/denoising/methods/alra.py index 47a4228f71..00e41bda58 100644 --- a/openproblems/tasks/denoising/methods/alra.py +++ b/openproblems/tasks/denoising/methods/alra.py @@ -1,27 +1,14 @@ +from ....tools.conversion import r_function from ....tools.decorators import method +from ....tools.normalize import log_cpm +from ....tools.normalize import log_scran_pooling from ....tools.utils import check_version - +#should be imported by docker import numpy as np import scprep -_alra = scprep.run.RFunction( - setup=""" - library(SingleCellExperiment) - library(Matrix) - library(rsvd) - source('https://raw.githubusercontent.com/KlugerLab/ALRA/master/alra.R') +_alra = r_function("alra.R") - """, - args="sce", - body=""" - assay(sce, "X") <- as(assay(sce, "X"), "CsparseMatrix") - reducedDim(sce, "train") <- as(reducedDim(sce, "train"), "CsparseMatrix") - completed <- alra(as.matrix(reducedDim(sce, "train"))) - normCompleted <- completed[[3]] - reducedDim(sce, "train") <- as.matrix(normCompleted) - sce - """, -) @method( @@ -34,17 +21,13 @@ image="openproblems-r-extras", ) def alra(adata): - X, libsize = scprep.normalize.library_size_normalize( + #libsize and sqrt norm + #overwrite obsm['train'] with normalized version + adata.obsm["train"], libsize = scprep.normalize.library_size_normalize( adata.obsm["train"], rescale=1, return_library_size=True ) - X = scprep.transform.sqrt( - X - ) # note that log transform is used in the examples of alra on github - adata.obsm[ - "train" - ] = X # we know that the scprep's .run.Rfunction uses anndata.2ri, where reducedDim(d, "PCA")=adata.obsm["X_PCA"], but that these dim-reduction equivalents - # do not encompass any given key hold true across all contexts. Thus, we use X_PCA for now to ensure compatability. - Y = _alra(X) + + Y = _alra(adata)["train"] Y = scprep.utils.matrix_transform(Y, np.square) Y = scprep.utils.matrix_vector_elementwise_multiply(Y, libsize, axis=0) adata.obsm["denoised"] = Y From 1a1c25ed83b3c2f0968049e5f8dffcf3409d8511 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 31 Mar 2021 01:05:36 +0000 Subject: [PATCH 16/35] pre-commit --- openproblems/tasks/denoising/methods/alra.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/openproblems/tasks/denoising/methods/alra.py b/openproblems/tasks/denoising/methods/alra.py index 00e41bda58..93c70cee1b 100644 --- a/openproblems/tasks/denoising/methods/alra.py +++ b/openproblems/tasks/denoising/methods/alra.py @@ -3,14 +3,14 @@ from ....tools.normalize import log_cpm from ....tools.normalize import log_scran_pooling from ....tools.utils import check_version -#should be imported by docker + +# should be imported by docker import numpy as np import scprep _alra = r_function("alra.R") - @method( method_name="ALRA", paper_name="Zero-preserving imputation of scRNA-seq data using low-rank approximation", @@ -21,8 +21,8 @@ image="openproblems-r-extras", ) def alra(adata): - #libsize and sqrt norm - #overwrite obsm['train'] with normalized version + # libsize and sqrt norm + # overwrite obsm['train'] with normalized version adata.obsm["train"], libsize = scprep.normalize.library_size_normalize( adata.obsm["train"], rescale=1, return_library_size=True ) From 014dd284d952ee70691eebeb489eb5edf3317b9a Mon Sep 17 00:00:00 2001 From: Wesley Lewis <59123674+wes-lewis@users.noreply.github.com> Date: Tue, 30 Mar 2021 21:09:17 -0400 Subject: [PATCH 17/35] Create alra.R Add support script alra.R --- openproblems/tasks/denoising/methods/alra.R | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 openproblems/tasks/denoising/methods/alra.R diff --git a/openproblems/tasks/denoising/methods/alra.R b/openproblems/tasks/denoising/methods/alra.R new file mode 100644 index 0000000000..cbf3c322b7 --- /dev/null +++ b/openproblems/tasks/denoising/methods/alra.R @@ -0,0 +1,15 @@ +library(SingleCellExperiment) +library(Matrix) +source('https://raw.githubusercontent.com/KlugerLab/ALRA/master/alra.R') +library(rsvd) + +#save the matrix of the obsm variable train + +train_matrix <- reducedDim(sce, 'train') + +#this is the format used in the alraSeurat2.R example script on the https://github.com/KlugerLab/ALRA/ repo +reducedDim(sce, "train") = Matrix(t(alra(t(as.matrix(train_matrix)))[[3]])) + +#return + +sce From fe8a14b1d171655b1a1db81e9753457dd21cafeb Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 31 Mar 2021 01:10:44 +0000 Subject: [PATCH 18/35] pre-commit --- openproblems/tasks/denoising/methods/alra.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/openproblems/tasks/denoising/methods/alra.R b/openproblems/tasks/denoising/methods/alra.R index cbf3c322b7..f2f60e54da 100644 --- a/openproblems/tasks/denoising/methods/alra.R +++ b/openproblems/tasks/denoising/methods/alra.R @@ -1,15 +1,15 @@ library(SingleCellExperiment) library(Matrix) -source('https://raw.githubusercontent.com/KlugerLab/ALRA/master/alra.R') +source("https://raw.githubusercontent.com/KlugerLab/ALRA/master/alra.R") library(rsvd) -#save the matrix of the obsm variable train +# save the matrix of the obsm variable train -train_matrix <- reducedDim(sce, 'train') +train_matrix <- reducedDim(sce, "train") -#this is the format used in the alraSeurat2.R example script on the https://github.com/KlugerLab/ALRA/ repo -reducedDim(sce, "train") = Matrix(t(alra(t(as.matrix(train_matrix)))[[3]])) +# this is the format used in the alraSeurat2.R example script on the https://github.com/KlugerLab/ALRA/ repo +reducedDim(sce, "train") <- Matrix(t(alra(t(as.matrix(train_matrix)))[[3]])) -#return +# return sce From df10a9521af50bb1356f575e85276c6018da95bd Mon Sep 17 00:00:00 2001 From: Wesley Lewis <59123674+wes-lewis@users.noreply.github.com> Date: Tue, 30 Mar 2021 22:12:35 -0400 Subject: [PATCH 19/35] Update alra.R Changed comment to equal <80 characters in length --- openproblems/tasks/denoising/methods/alra.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openproblems/tasks/denoising/methods/alra.R b/openproblems/tasks/denoising/methods/alra.R index f2f60e54da..2f623e5a5b 100644 --- a/openproblems/tasks/denoising/methods/alra.R +++ b/openproblems/tasks/denoising/methods/alra.R @@ -7,7 +7,7 @@ library(rsvd) train_matrix <- reducedDim(sce, "train") -# this is the format used in the alraSeurat2.R example script on the https://github.com/KlugerLab/ALRA/ repo +# this is the format used in the alraSeurat2.R example on KlugerLab/ALRA/ repo reducedDim(sce, "train") <- Matrix(t(alra(t(as.matrix(train_matrix)))[[3]])) # return From c5836e1e37d580cb0c8c7ba38670cc5964f6d46d Mon Sep 17 00:00:00 2001 From: Wesley Lewis <59123674+wes-lewis@users.noreply.github.com> Date: Tue, 30 Mar 2021 22:18:43 -0400 Subject: [PATCH 20/35] Update alra.py Remove scran pooling and log-cpm imports and shorten comment --- openproblems/tasks/denoising/methods/alra.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/openproblems/tasks/denoising/methods/alra.py b/openproblems/tasks/denoising/methods/alra.py index 93c70cee1b..8c97d75805 100644 --- a/openproblems/tasks/denoising/methods/alra.py +++ b/openproblems/tasks/denoising/methods/alra.py @@ -1,7 +1,5 @@ from ....tools.conversion import r_function from ....tools.decorators import method -from ....tools.normalize import log_cpm -from ....tools.normalize import log_scran_pooling from ....tools.utils import check_version # should be imported by docker @@ -13,7 +11,7 @@ @method( method_name="ALRA", - paper_name="Zero-preserving imputation of scRNA-seq data using low-rank approximation", + paper_name="Zero-preserving imputation of scRNA-seq...", paper_url="https://www.biorxiv.org/content/10.1101/397588v1", paper_year=2018, code_url="https://github.com/KlugerLab/ALRA", From 32b3421dfa4c0ff05bce7f79a113604d194877be Mon Sep 17 00:00:00 2001 From: Daniel Burkhardt Date: Wed, 31 Mar 2021 10:27:55 -0400 Subject: [PATCH 21/35] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 1b057da20c..1f704576f4 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ Formalizing and benchmarking open problems in single-cell genomics. ## Guides * For contributing guidelines, see [CONTRIBUTING.md](CONTRIBUTING.md) * For instructions on editing Docker images, see [docker/README.md](docker/README.md) +* For instructions on using the `openproblems-cli`, see [openproblems/api/README.md](https://github.com/singlecellopenproblems/SingleCellOpenProblems/tree/master/openproblems/api) * For a walkthrough of the GitHub Actions workflows and AWS Sagemaker, see [SAGEMAKER.md](SAGEMAKER.md) * For a description of existing an proposed tasks, see [TASKS.md](TASKS.md) * For a video introduction to this GitHub repository, watch our [Repository introduction](https://www.youtube.com/watch?v=tHempZCdXyA) From 1c23436522216efa00b19bf699bb9ff782cd6aa4 Mon Sep 17 00:00:00 2001 From: Daniel Burkhardt Date: Wed, 31 Mar 2021 10:41:35 -0400 Subject: [PATCH 22/35] Update README.md --- docker/README.md | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/docker/README.md b/docker/README.md index e7f275b5db..61e315e63c 100644 --- a/docker/README.md +++ b/docker/README.md @@ -15,6 +15,7 @@ Note, all images must comply to the [AWS SageMaker Custom Image Specifications]( - [Building Docker images locally](#building-docker-images-locally) - [Building Docker images through GitHub Actions workflows](#building-docker-images-through-github-actions-workflows) - [Pulling images from the ECR to your local machine](#pulling-images-from-the-ecr-to-your-local-machine) +- [Running Docker images locally](#running-docker-images-locally) @@ -126,3 +127,38 @@ docker pull .dkr.ecr.us-west-2.amazonaws.com/openproblems: docker images +REPOSITORY TAG IMAGE ID CREATED SIZE +singlecellopenproblems/openproblems-python-extras latest f86e1c5ce9d0 14 hours ago 3.94GB +singlecellopenproblems/openproblems-r-base latest f8908c9fb387 21 hours ago 6.36GB +singlecellopenproblems/openproblems-r-extras latest 7e15120bb7ce 5 days ago 4.89GB +singlecellopenproblems/openproblems latest 14974cbd2f58 5 days ago 2.1GB +490915662541.dkr.ecr.us-west-2.amazonaws.com/openproblems batch_integration_docker-openproblems 3a1ce37e85f2 6 days ago 2.06GB +``` + +You can then run commands within a docker container using `docker run`. Consult the [Docker documentation](https://docs.docker.com/engine/reference/commandline/run/) to learn more about the `run` command. + +**Using `IMAGE ID`** +``` +docker run -it 90a9110c7d69 /bin/bash +``` + +**Using `RESPOSITORY:TAG`** +``` +docker run -it singlecellopenproblems/openproblems-python-extras:latest /bin/bash + +``` From e4c009f8c232bdae4c9e20b8e435d0fe474b3daf Mon Sep 17 00:00:00 2001 From: Daniel Burkhardt Date: Wed, 31 Mar 2021 10:49:56 -0400 Subject: [PATCH 23/35] Update README.md --- openproblems/api/README.md | 43 ++++++++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/openproblems/api/README.md b/openproblems/api/README.md index bc235e7e9d..168cf94908 100644 --- a/openproblems/api/README.md +++ b/openproblems/api/README.md @@ -26,17 +26,52 @@ optional arguments: ``` ## Example (without docker) +Running the CLI requires commands to be run in a specific order: `load` -> `run` -> `evaluate`. +For example: ``` -openproblems-cli tasks -openproblems-cli list --datasets --task label_projection +# Download a task-specific dataset and save it to `dataset.h5ad` openproblems-cli load --task label_projection --output dataset.h5ad pancreas_batch -openproblems-cli list --methods --task label_projection +# Run a method on a datasets and save output to `method.h5ad` openproblems-cli run --task label_projection --input dataset.h5ad --output method.h5ad logistic_regression_log_cpm -openproblems-cli list --metrics --task label_projection +# Evaluate the performance of a previously run method using the `accuracy` metric openproblems-cli evaluate --task label_projection --input method.h5ad accuracy ``` +You can list available tasks using `openproblems-cli tasks` +``` +> openproblems-cli tasks +denoising +dimensionality_reduction +label_projection +multimodal_data_integration +regulatory_effect_prediction +``` + +You can then list the avaiable datasets, methods, and metrics for a partiular task using `openproblems-cli list --[datasets|methods|metrics] --task [task_name]` +``` +> openproblems-cli list --datasets --task label_projection +pancreas_batch +pancreas_random +zebrafish_labels +zebrafish_random + +> openproblems-cli list --methods --task label_projection +knn_classifier_log_cpm +knn_classifier_scran +logistic_regression_log_cpm +logistic_regression_scran +mlp_log_cpm +mlp_scran + +> openproblems-cli list --metrics --task label_projection +accuracy +f1 +f1_micro +``` + +The output of these commands are allowable arguments to the respective `load`, `run`, and `evaluate` commands. + ### Sample output ``` From 0068ba0c13999667d42a943c3726367425122811 Mon Sep 17 00:00:00 2001 From: Alexander Tong Date: Mon, 5 Apr 2021 14:54:35 -0400 Subject: [PATCH 24/35] Add Method SCOT to Multi-modal data Integration Task (#298) * Add SCOT method to multimodal data integration task * pre-commit * Fix flake things * pre-commit Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .../requirements.txt | 1 + .../methods/scot.py | 97 +++++++++++++++++++ 2 files changed, 98 insertions(+) create mode 100644 openproblems/tasks/multimodal_data_integration/methods/scot.py diff --git a/docker/openproblems-python-extras/requirements.txt b/docker/openproblems-python-extras/requirements.txt index cee39838ca..c3d8473423 100644 --- a/docker/openproblems-python-extras/requirements.txt +++ b/docker/openproblems-python-extras/requirements.txt @@ -5,3 +5,4 @@ phate pyensembl pybedtools git+https://github.com/czbiohub/molecular-cross-validation +git+https://github.com/atong01/SCOT diff --git a/openproblems/tasks/multimodal_data_integration/methods/scot.py b/openproblems/tasks/multimodal_data_integration/methods/scot.py new file mode 100644 index 0000000000..6df7632fca --- /dev/null +++ b/openproblems/tasks/multimodal_data_integration/methods/scot.py @@ -0,0 +1,97 @@ +from ....tools.decorators import method +from ....tools.normalize import log_cpm +from ....tools.normalize import log_scran_pooling +from ....tools.normalize import sqrt_cpm +from ....tools.utils import check_version + +import sklearn.decomposition + + +def _scot(adata, n_svd=100, balanced=False): + from SCOT import SCOT + + # PCA reduction + n_svd = min([n_svd, min(adata.X.shape) - 1, min(adata.obsm["mode2"].shape) - 1]) + X_pca = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(adata.X) + Y_pca = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(adata.obsm["mode2"]) + + # Initialize SCOT + scot = SCOT(X_pca, Y_pca) + + # call the unbalanced alignment + # From https://github.com/rsinghlab/SCOT/blob/master/examples/unbalanced_GW_SNAREseq.ipynb # noqa: 501 + X_new_unbal, y_new_unbal = scot.align( + k=50, e=1e-3, rho=0.0005, normalize=True, balanced=balanced + ) + adata.obsm["aligned"] = X_new_unbal + adata.obsm["mode2_aligned"] = y_new_unbal + + return adata + + +@method( + method_name="Single Cell Optimal Transport (sqrt CPM unbalanced)", + paper_name="Gromov-Wasserstein optimal transport" + "to align single-cell multi-omics data", + paper_url="https://www.biorxiv.org/content/10.1101/2020.04.28.066787", + paper_year=2020, + code_url="https://github.com/rsinghlab/SCOT", + code_version=check_version("SCOT"), + image="openproblems-python-extras", +) +def scot_sqrt_cpm_unbalanced(adata, n_svd=100, balanced=False): + sqrt_cpm(adata) + log_cpm(adata, obsm="mode2", obs="mode2_obs", var="mode2_var") + _scot(adata, n_svd=n_svd, balanced=balanced) + return adata + + +@method( + method_name="Single Cell Optimal Transport (sqrt CPM balanced)", + paper_name="Gromov-Wasserstein optimal transport to " + "align single-cell multi-omics data", + paper_url="https://www.biorxiv.org/content/10.1101/2020.04.28.066787", + paper_year=2020, + code_url="https://github.com/rsinghlab/SCOT", + code_version=check_version("SCOT"), + image="openproblems-python-extras", +) +def scot_sqrt_cpm_balanced(adata, n_svd=100, balanced=True): + sqrt_cpm(adata) + log_cpm(adata, obsm="mode2", obs="mode2_obs", var="mode2_var") + _scot(adata, n_svd=n_svd, balanced=balanced) + return adata + + +@method( + method_name="Single Cell Optimal Transport (log scran unbalanced)", + paper_name="Gromov-Wasserstein optimal transport to " + "align single-cell multi-omics data", + paper_url="https://www.biorxiv.org/content/10.1101/2020.04.28.066787", + paper_year=2020, + code_url="https://github.com/rsinghlab/SCOT", + code_version=check_version("SCOT"), + image="openproblems-python-extras", +) +def scot_log_scran_pooling_unbalanced(adata, n_svd=100, balanced=False): + log_scran_pooling(adata) + log_cpm(adata, obsm="mode2", obs="mode2_obs", var="mode2_var") + _scot(adata, n_svd=n_svd, balanced=balanced) + return adata + + +@method( + method_name="Single Cell Optimal Transport (log scran balanced)", + paper_name="Gromov-Wasserstein optimal transport to " + "align single-cell multi-omics data", + paper_url="https://www.biorxiv.org/content/10.1101/2020.04.28.066787", + paper_year=2020, + code_url="https://github.com/rsinghlab/SCOT", + code_version=check_version("SCOT"), + image="openproblems-python-extras", +) +def scot_log_scran_pooling_balanced(adata, n_svd=100, balanced=True): + log_scran_pooling(adata) + log_cpm(adata, obsm="mode2", obs="mode2_obs", var="mode2_var") + _scot(adata, n_svd=n_svd, balanced=balanced) + return adata From bee000399100631bf7d16900eb1f996d1566124d Mon Sep 17 00:00:00 2001 From: michalk8 <46717574+michalk8@users.noreply.github.com> Date: Tue, 6 Apr 2021 16:04:17 +0200 Subject: [PATCH 25/35] Add trustworthiness score for dimred task (#258) * Add trustworthiness score * Address comments, add test * Fix test * Update README.md * Update README.md * Update README.md * Update README.md Co-authored-by: Daniel Burkhardt --- .gitignore | 3 ++ .../tasks/dimensionality_reduction/README.md | 18 +++++++++++ .../metrics/__init__.py | 1 + .../metrics/trustworthiness.py | 19 ++++++++++++ test/test_metrics.py | 30 +++++++++++++++++++ 5 files changed, 71 insertions(+) create mode 100644 openproblems/tasks/dimensionality_reduction/metrics/trustworthiness.py diff --git a/.gitignore b/.gitignore index e160c24854..2336e1abb4 100644 --- a/.gitignore +++ b/.gitignore @@ -143,3 +143,6 @@ resources/ # Nextflow nf-openproblems .nextflow + +# Editor +.idea diff --git a/openproblems/tasks/dimensionality_reduction/README.md b/openproblems/tasks/dimensionality_reduction/README.md index e6653315b3..c2cb8ccf0c 100644 --- a/openproblems/tasks/dimensionality_reduction/README.md +++ b/openproblems/tasks/dimensionality_reduction/README.md @@ -19,6 +19,24 @@ Where $y_i$ is the sum of pairwise euclidian distances between each value embedd Kruskel's stress uses the RMSE, more or less in the now commonly-used MDS (multi-dimensional scaling). We can calculate and plot Kruskel's stress to get an idea where the majority of distortion of the ***topography*** of the data in high-dimensional space. +### Trustworthiness +--- + +_Adapted from the [sklearn documentation.](https://scikit-learn.org/stable/modules/generated/sklearn.manifold.trustworthiness.html)_ + +Trustworthiness expresses to what extent the local structure in an embedding is retained. The trustworthiness is within [0, 1]. It is defined as + +$$ + T(k) = 1 - \frac{2}{nk (2n - 3k - 1)} \sum^n_{i=1} + \sum_{j \in \mathcal{N}_{i}^{k}} \max(0, (r(i, j) - k)) +$$ + +where for each sample i, $\mathcal{N}_{i}^{k}$ are its k nearest neighbors in the output space, and every sample j is its $r(i, j)$-th nearest neighbor in the input space. In other words, any unexpected nearest neighbors in the output space are penalised in proportion to their rank in the input space. + +References: + * "Neighborhood Preservation in Nonlinear Projection Methods: An Experimental Study" J. Venna, S. Kaski + * "Learning a Parametric Embedding by Preserving Local Structure" L.J.P. van der Maaten + ## The results ![Stress-plot RMASE CITE-seq](https://i.imgur.com/ulyAF9j.png) diff --git a/openproblems/tasks/dimensionality_reduction/metrics/__init__.py b/openproblems/tasks/dimensionality_reduction/metrics/__init__.py index fd866a280d..e83be2be05 100644 --- a/openproblems/tasks/dimensionality_reduction/metrics/__init__.py +++ b/openproblems/tasks/dimensionality_reduction/metrics/__init__.py @@ -1 +1,2 @@ from .root_mean_square_error import rmse +from .trustworthiness import trustworthiness diff --git a/openproblems/tasks/dimensionality_reduction/metrics/trustworthiness.py b/openproblems/tasks/dimensionality_reduction/metrics/trustworthiness.py new file mode 100644 index 0000000000..fc91c42e44 --- /dev/null +++ b/openproblems/tasks/dimensionality_reduction/metrics/trustworthiness.py @@ -0,0 +1,19 @@ +from ....tools.decorators import metric +from anndata import AnnData +from sklearn import manifold + +import numpy as np + + +@metric(metric_name="trustworthiness", maximize=True) +def trustworthiness(adata: AnnData) -> float: + high_dim, low_dim = adata.X, adata.obsm["X_emb"] + + if np.any(~np.isfinite(low_dim)): + return 0.0 + + score = manifold.trustworthiness( + high_dim, low_dim, n_neighbors=15, metric="euclidean" + ) + # for large k close to #samples, it's higher than 1.0, e.g 1.0000073552559712 + return float(np.clip(score, 0, 1)) diff --git a/test/test_metrics.py b/test/test_metrics.py index 0c3bdbc752..babdcb13a5 100644 --- a/test/test_metrics.py +++ b/test/test_metrics.py @@ -45,3 +45,33 @@ def test_metric(task_name, metric_name, tempdir, image): ) m = metric(adata) assert isinstance(m, numbers.Number) + + +@parameterized.parameterized.expand( + [ + ( + utils.TEMPDIR.name, + openproblems.tasks.dimensionality_reduction.metrics.trustworthiness.metadata[ # noqa: E501 + "image" + ], + ) + ], + name_func=utils.name.name_test, +) +@utils.docker.docker_test +def test_trustworthiness_sparse(tempdir, image): + from scipy.sparse import csr_matrix + + task = openproblems.tasks.dimensionality_reduction + metric = openproblems.tasks.dimensionality_reduction.metrics.trustworthiness + + adata = task.api.sample_dataset() + adata = task.api.sample_method(adata) + openproblems.log.debug( + "Testing {} metric from {} task".format(metric.__name__, task.__name__) + ) + adata.X = csr_matrix(adata.X) + m = metric(adata) + + assert isinstance(m, float) + assert 0 <= m <= 1 From 2b2cc67cc5eef8d544f9678f3e906157f6e9e385 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Wed, 7 Apr 2021 15:06:42 +0200 Subject: [PATCH 26/35] Dimred methods preprocessing (#301) * Add preprocessing.py for dim red methods This file should contain standard pre-processing functions used to create variations on different methods * Add pre-processing function existing dim red methods Create a new PHATE variant using Scanpy pre-processing * Update dim red README to describe pre-processing * pre-commit * Pre-commit fixes * Move PHATE import inside function Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Daniel Burkhardt --- .../tasks/dimensionality_reduction/README.md | 4 +++ .../methods/__init__.py | 3 +- .../dimensionality_reduction/methods/pca.py | 7 ++-- .../dimensionality_reduction/methods/phate.py | 33 +++++++++++++++---- .../methods/preprocessing.py | 11 +++++++ .../dimensionality_reduction/methods/tsne.py | 5 +-- .../dimensionality_reduction/methods/umap.py | 4 ++- 7 files changed, 52 insertions(+), 15 deletions(-) create mode 100644 openproblems/tasks/dimensionality_reduction/methods/preprocessing.py diff --git a/openproblems/tasks/dimensionality_reduction/README.md b/openproblems/tasks/dimensionality_reduction/README.md index c2cb8ccf0c..cd7a336817 100644 --- a/openproblems/tasks/dimensionality_reduction/README.md +++ b/openproblems/tasks/dimensionality_reduction/README.md @@ -58,4 +58,8 @@ We can make this comparison across multiple dimensionality reduction methods. We **Metrics** should calculate the quality or "goodness of fit" of a dimensional reduction **method**. +## Pre-processing + +There is a `preprocessing.py` function which contains a set of standard pre-processing functions. These perform steps such as transforming the raw data, selecting features and summarising dimensionality reduction. Where possible each **method** should first call one of these functions and use the processed `adata.X` or `adata.obsm['X_input']` slot as the input to the method. Variants of methods can be created by applying different pre-processing prior to the method itself (see `phate.py` for an example). + 1. Raimundo, F., Vallot, C. & Vert, J. Tuning parameters of dimensionality reduction methods for single-cell RNA-seq analysis. Genome Biol 21, 212 (2020). https://doi.org/10.1186/s13059-020-02128-7 diff --git a/openproblems/tasks/dimensionality_reduction/methods/__init__.py b/openproblems/tasks/dimensionality_reduction/methods/__init__.py index 46bd7b97b7..b784f91b4d 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/__init__.py +++ b/openproblems/tasks/dimensionality_reduction/methods/__init__.py @@ -1,4 +1,5 @@ from .pca import pca -from .phate import phate +from .phate import phate_default +from .phate import phate_scanpy from .tsne import tsne from .umap import umap diff --git a/openproblems/tasks/dimensionality_reduction/methods/pca.py b/openproblems/tasks/dimensionality_reduction/methods/pca.py index 6f40a5b718..061e4979a1 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/pca.py +++ b/openproblems/tasks/dimensionality_reduction/methods/pca.py @@ -1,7 +1,6 @@ from ....tools.decorators import method from ....tools.utils import check_version - -import scanpy as sc +from .preprocessing import preprocess_scanpy @method( @@ -14,6 +13,6 @@ code_version=check_version("scikit-learn"), ) def pca(adata): - sc.tl.pca(adata) - adata.obsm["X_emb"] = adata.obsm["X_pca"][:, :2] + preprocess_scanpy(adata) + adata.obsm["X_emb"] = adata.obsm["X_input"][:, :2] return adata diff --git a/openproblems/tasks/dimensionality_reduction/methods/phate.py b/openproblems/tasks/dimensionality_reduction/methods/phate.py index 36217cff5a..016a1c64c7 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/phate.py +++ b/openproblems/tasks/dimensionality_reduction/methods/phate.py @@ -1,10 +1,19 @@ from ....tools.decorators import method from ....tools.normalize import sqrt_cpm from ....tools.utils import check_version +from .preprocessing import preprocess_scanpy + + +def _phate(adata): + from phate import PHATE + + phate_op = PHATE(verbose=False, n_jobs=-1) + adata.obsm["X_emb"] = phate_op.fit_transform(adata.X) + return adata @method( - method_name="PHATE", + method_name="PHATE (default pre-processing)", paper_name="Visualizing Transitions and Structure for Biological Data Exploration", paper_url="https://www.nature.com/articles/s41587-019-0336-3", paper_year=2019, @@ -12,10 +21,20 @@ code_version=check_version("phate"), image="openproblems-python-extras", ) -def phate(adata): - from phate import PHATE - +def phate_default(adata): sqrt_cpm(adata) - phate_op = PHATE(verbose=False, n_jobs=-1) - adata.obsm["X_emb"] = phate_op.fit_transform(adata.X) - return adata + return _phate(adata) + + +@method( + method_name="PHATE (scanpy pre-processing)", + paper_name="Visualizing Transitions and Structure for Biological Data Exploration", + paper_url="https://www.nature.com/articles/s41587-019-0336-3", + paper_year=2019, + code_url="https://github.com/KrishnaswamyLab/PHATE/", + code_version=check_version("phate"), + image="openproblems-python-extras", +) +def phate_scanpy(adata): + preprocess_scanpy(adata) + return _phate(adata) diff --git a/openproblems/tasks/dimensionality_reduction/methods/preprocessing.py b/openproblems/tasks/dimensionality_reduction/methods/preprocessing.py new file mode 100644 index 0000000000..e1d378cbb3 --- /dev/null +++ b/openproblems/tasks/dimensionality_reduction/methods/preprocessing.py @@ -0,0 +1,11 @@ +from ....tools.normalize import log_cpm + +import scanpy as sc + + +def preprocess_scanpy(adata): + + log_cpm(adata) + sc.pp.highly_variable_genes(adata, n_top_genes=1000) + sc.tl.pca(adata, n_comps=50, svd_solver="arpack") + adata.obsm["X_input"] = adata.obsm["X_pca"] diff --git a/openproblems/tasks/dimensionality_reduction/methods/tsne.py b/openproblems/tasks/dimensionality_reduction/methods/tsne.py index 4f82607877..2376de3116 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/tsne.py +++ b/openproblems/tasks/dimensionality_reduction/methods/tsne.py @@ -1,5 +1,6 @@ from ....tools.decorators import method from ....tools.utils import check_version +from .preprocessing import preprocess_scanpy import scanpy as sc @@ -15,7 +16,7 @@ image="openproblems-python-extras", ) def tsne(adata): - sc.pp.pca(adata) - sc.tl.tsne(adata) + preprocess_scanpy(adata) + sc.tl.tsne(adata, use_rep="X_input", n_pcs=50) adata.obsm["X_emb"] = adata.obsm["X_tsne"] return adata diff --git a/openproblems/tasks/dimensionality_reduction/methods/umap.py b/openproblems/tasks/dimensionality_reduction/methods/umap.py index e5366699d2..ce9c49c7dc 100644 --- a/openproblems/tasks/dimensionality_reduction/methods/umap.py +++ b/openproblems/tasks/dimensionality_reduction/methods/umap.py @@ -1,5 +1,6 @@ from ....tools.decorators import method from ....tools.utils import check_version +from .preprocessing import preprocess_scanpy import scanpy as sc @@ -14,7 +15,8 @@ code_version=check_version("umap-learn"), ) def umap(adata): - sc.pp.neighbors(adata) + preprocess_scanpy(adata) + sc.pp.neighbors(adata, use_rep="X_input", n_pcs=50) sc.tl.umap(adata) adata.obsm["X_emb"] = adata.obsm["X_umap"] return adata From 0192a809d2931975c2d249e71a991c4adc2781b8 Mon Sep 17 00:00:00 2001 From: Daniel Burkhardt Date: Thu, 8 Apr 2021 09:19:08 -0400 Subject: [PATCH 27/35] Fix s3 bucket clash --- .github/workflows/run_benchmark.yml | 17 +++++++++-------- workflow/parse_nextflow.py | 15 ++++++++++++++- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/.github/workflows/run_benchmark.yml b/.github/workflows/run_benchmark.yml index b50fb757fc..e118b668a1 100644 --- a/.github/workflows/run_benchmark.yml +++ b/.github/workflows/run_benchmark.yml @@ -109,10 +109,11 @@ jobs: sudo mkdir -p /mnt/openproblems-nextflow sudo chown $USER /mnt/openproblems-nextflow s3fs -o umask=0277,uid=$(id -u) openproblems-nextflow /mnt/openproblems-nextflow + # Create bucket/ work/ and cwd/ for dir in bucket work cwd; do - mkdir -p /mnt/openproblems-nextflow/${dir}/${{ github.ref }} + mkdir -p /mnt/openproblems-nextflow/${dir}/${BRANCH} done - ls -l /mnt/openproblems-nextflow/*/${{ github.ref }} + ls -l /mnt/openproblems-nextflow/*/${BRANCH} - name: Install package & dependencies run: | @@ -156,14 +157,14 @@ jobs: -ansi-log false $RESUME -profile $PROFILE - -work-dir "/mnt/openproblems-nextflow/work/${{ github.ref }}" - -bucket-dir "s3://openproblems-nextflow/bucket/${{ github.ref }}" + -work-dir "/mnt/openproblems-nextflow/work/${BRANCH}" + -bucket-dir "s3://openproblems-nextflow/bucket/${BRANCH}" singlecellopenproblems/nf-openproblems --branch $BRANCH - name: Copy results run: | - cp -r /mnt/openproblems-nextflow/cwd/${{ github.ref }}/results . + cp -r /mnt/openproblems-nextflow/cwd/${BRANCH}/results . - name: Parse results # There's a bug with the results getting pulled from Netflow with caching, but @@ -182,9 +183,9 @@ jobs: for image in $(cd docker && ls -1d */ | tr -d '/'); do aws ecr batch-delete-image --region $AWS_DEFAULT_REGION --repository-name openproblems --image-ids "imageTag=${BRANCH}-${image}" done - aws s3 rm "s3://openproblems-nextflow/work/${{ github.ref }}" - aws s3 rm "s3://openproblems-nextflow/bucket/${{ github.ref }}" - aws s3 rm "s3://openproblems-nextflow/cwd/${{ github.ref }}" + aws s3 rm "s3://openproblems-nextflow/work/${BRANCH}" + aws s3 rm "s3://openproblems-nextflow/bucket/${BRANCH}" + aws s3 rm "s3://openproblems-nextflow/cwd/${BRANCH}" - name: Remove untagged images if: startsWith(github.ref, 'refs/heads/master') diff --git a/workflow/parse_nextflow.py b/workflow/parse_nextflow.py index 25cb6d7dc4..69908ef928 100644 --- a/workflow/parse_nextflow.py +++ b/workflow/parse_nextflow.py @@ -96,13 +96,26 @@ def parse_trace_to_dict(df): def parse_metric_results(results): """Add metric results to the trace output.""" + missing_traces = [] for filename in os.listdir("results/metrics"): with open(os.path.join("results/metrics", filename), "r") as handle: result = float(handle.read().strip()) task_name, dataset_name, method_name, metric_name = filename.replace( ".metric.txt", "" ).split(".") - results[task_name][dataset_name][method_name]["metrics"][metric_name] = result + try: + results[task_name][dataset_name][method_name]["metrics"][ + metric_name + ] = result + except KeyError: + missing_traces.append(filename) + if len(missing_traces) > 0: + print("Missing execution trace for: ") + for filename in missing_traces: + print(" {}".format(filename)) + raise ValueError( + "Missing execution traces for {} results.".format(len(missing_traces)) + ) return results From 61e64d4b331c0e3caed881e05cda36840e19e1c7 Mon Sep 17 00:00:00 2001 From: Daniel Burkhardt Date: Thu, 8 Apr 2021 10:00:22 -0400 Subject: [PATCH 28/35] set branch variable in S3 setup job --- .github/workflows/run_benchmark.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/run_benchmark.yml b/.github/workflows/run_benchmark.yml index e118b668a1..71a69e8c61 100644 --- a/.github/workflows/run_benchmark.yml +++ b/.github/workflows/run_benchmark.yml @@ -103,6 +103,10 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} AWS_DEFAULT_REGION: us-west-2 run: | + if [[ "${{ github.repository }}" == "singlecellopenproblems/SingleCellOpenProblems" ]]; \ + then echo "BRANCH=`echo ${{ github.ref }} | sed 's:refs/[a-z]*/::' | sed 's:[/_]:-:g'| head -c 40`"; \ + else echo "BRANCH=`echo ${{ github.repository }} | awk '{print $1}' FS=/ | head -c 6`-`echo ${{ github.ref }} | sed 's:refs/[a-z]*/::' | sed 's:[/_]:-:g'| head -c 33`"; \ + fi sudo apt-get install -qy s3fs echo $AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY > ~/.passwd-s3fs chmod 600 ~/.passwd-s3fs From d5ddd19104d5c29ad32d7725ba7cf189fe2dcc3a Mon Sep 17 00:00:00 2001 From: Daniel Burkhardt Date: Thu, 8 Apr 2021 10:07:32 -0400 Subject: [PATCH 29/35] BRANCH -> WKDIR for s3 --- .github/workflows/run_benchmark.yml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/run_benchmark.yml b/.github/workflows/run_benchmark.yml index 71a69e8c61..e54b1486db 100644 --- a/.github/workflows/run_benchmark.yml +++ b/.github/workflows/run_benchmark.yml @@ -104,8 +104,8 @@ jobs: AWS_DEFAULT_REGION: us-west-2 run: | if [[ "${{ github.repository }}" == "singlecellopenproblems/SingleCellOpenProblems" ]]; \ - then echo "BRANCH=`echo ${{ github.ref }} | sed 's:refs/[a-z]*/::' | sed 's:[/_]:-:g'| head -c 40`"; \ - else echo "BRANCH=`echo ${{ github.repository }} | awk '{print $1}' FS=/ | head -c 6`-`echo ${{ github.ref }} | sed 's:refs/[a-z]*/::' | sed 's:[/_]:-:g'| head -c 33`"; \ + then echo "WKDIR=`echo ${{ github.ref }} | sed 's:refs/[a-z]*/::' | sed 's:[/_]:-:g'| head -c 40`"; \ + else echo "WKDIR=`echo ${{ github.repository }} | awk '{print $1}' FS=/ | head -c 6`-`echo ${{ github.ref }} | sed 's:refs/[a-z]*/::' | sed 's:[/_]:-:g'| head -c 33`"; \ fi sudo apt-get install -qy s3fs echo $AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY > ~/.passwd-s3fs @@ -115,9 +115,9 @@ jobs: s3fs -o umask=0277,uid=$(id -u) openproblems-nextflow /mnt/openproblems-nextflow # Create bucket/ work/ and cwd/ for dir in bucket work cwd; do - mkdir -p /mnt/openproblems-nextflow/${dir}/${BRANCH} + mkdir -p /mnt/openproblems-nextflow/${dir}/${WKDIR} done - ls -l /mnt/openproblems-nextflow/*/${BRANCH} + ls -l /mnt/openproblems-nextflow/*/${WKDIR} - name: Install package & dependencies run: | @@ -161,14 +161,14 @@ jobs: -ansi-log false $RESUME -profile $PROFILE - -work-dir "/mnt/openproblems-nextflow/work/${BRANCH}" - -bucket-dir "s3://openproblems-nextflow/bucket/${BRANCH}" + -work-dir "/mnt/openproblems-nextflow/work/${WKDIR}" + -bucket-dir "s3://openproblems-nextflow/bucket/${WKDIR}" singlecellopenproblems/nf-openproblems --branch $BRANCH - name: Copy results run: | - cp -r /mnt/openproblems-nextflow/cwd/${BRANCH}/results . + cp -r /mnt/openproblems-nextflow/cwd/${WKDIR}/results . - name: Parse results # There's a bug with the results getting pulled from Netflow with caching, but @@ -187,9 +187,9 @@ jobs: for image in $(cd docker && ls -1d */ | tr -d '/'); do aws ecr batch-delete-image --region $AWS_DEFAULT_REGION --repository-name openproblems --image-ids "imageTag=${BRANCH}-${image}" done - aws s3 rm "s3://openproblems-nextflow/work/${BRANCH}" - aws s3 rm "s3://openproblems-nextflow/bucket/${BRANCH}" - aws s3 rm "s3://openproblems-nextflow/cwd/${BRANCH}" + aws s3 rm "s3://openproblems-nextflow/work/${WKDIR}" + aws s3 rm "s3://openproblems-nextflow/bucket/${WKDIR}" + aws s3 rm "s3://openproblems-nextflow/cwd/${WKDIR}" - name: Remove untagged images if: startsWith(github.ref, 'refs/heads/master') From c8085db7dd083df039290f4be02f86c25b880557 Mon Sep 17 00:00:00 2001 From: Daniel Burkhardt Date: Thu, 8 Apr 2021 10:12:50 -0400 Subject: [PATCH 30/35] rm echo --- .github/workflows/run_benchmark.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/run_benchmark.yml b/.github/workflows/run_benchmark.yml index e54b1486db..dca6db8973 100644 --- a/.github/workflows/run_benchmark.yml +++ b/.github/workflows/run_benchmark.yml @@ -104,8 +104,8 @@ jobs: AWS_DEFAULT_REGION: us-west-2 run: | if [[ "${{ github.repository }}" == "singlecellopenproblems/SingleCellOpenProblems" ]]; \ - then echo "WKDIR=`echo ${{ github.ref }} | sed 's:refs/[a-z]*/::' | sed 's:[/_]:-:g'| head -c 40`"; \ - else echo "WKDIR=`echo ${{ github.repository }} | awk '{print $1}' FS=/ | head -c 6`-`echo ${{ github.ref }} | sed 's:refs/[a-z]*/::' | sed 's:[/_]:-:g'| head -c 33`"; \ + then WKDIR=`echo ${{ github.ref }} | sed 's:refs/[a-z]*/::' | sed 's:[/_]:-:g'| head -c 40`; \ + else WKDIR=`echo ${{ github.repository }} | awk '{print $1}' FS=/ | head -c 6`-`echo ${{ github.ref }} | sed 's:refs/[a-z]*/::' | sed 's:[/_]:-:g'| head -c 33`; \ fi sudo apt-get install -qy s3fs echo $AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY > ~/.passwd-s3fs From 9b8edef548a67861cf0565d7d133df50e1ddacd6 Mon Sep 17 00:00:00 2001 From: Daniel Burkhardt Date: Fri, 9 Apr 2021 09:26:25 -0400 Subject: [PATCH 31/35] use env.BRANCH --- .github/workflows/run_benchmark.yml | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/.github/workflows/run_benchmark.yml b/.github/workflows/run_benchmark.yml index dca6db8973..e1abdf3b34 100644 --- a/.github/workflows/run_benchmark.yml +++ b/.github/workflows/run_benchmark.yml @@ -103,10 +103,6 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} AWS_DEFAULT_REGION: us-west-2 run: | - if [[ "${{ github.repository }}" == "singlecellopenproblems/SingleCellOpenProblems" ]]; \ - then WKDIR=`echo ${{ github.ref }} | sed 's:refs/[a-z]*/::' | sed 's:[/_]:-:g'| head -c 40`; \ - else WKDIR=`echo ${{ github.repository }} | awk '{print $1}' FS=/ | head -c 6`-`echo ${{ github.ref }} | sed 's:refs/[a-z]*/::' | sed 's:[/_]:-:g'| head -c 33`; \ - fi sudo apt-get install -qy s3fs echo $AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY > ~/.passwd-s3fs chmod 600 ~/.passwd-s3fs @@ -115,9 +111,9 @@ jobs: s3fs -o umask=0277,uid=$(id -u) openproblems-nextflow /mnt/openproblems-nextflow # Create bucket/ work/ and cwd/ for dir in bucket work cwd; do - mkdir -p /mnt/openproblems-nextflow/${dir}/${WKDIR} + mkdir -p /mnt/openproblems-nextflow/${dir}/${{ env.BRANCH }} done - ls -l /mnt/openproblems-nextflow/*/${WKDIR} + ls -l /mnt/openproblems-nextflow/*/${{ env.BRANCH }} - name: Install package & dependencies run: | @@ -161,14 +157,14 @@ jobs: -ansi-log false $RESUME -profile $PROFILE - -work-dir "/mnt/openproblems-nextflow/work/${WKDIR}" - -bucket-dir "s3://openproblems-nextflow/bucket/${WKDIR}" + -work-dir "/mnt/openproblems-nextflow/work/${{ env.BRANCH }}" + -bucket-dir "s3://openproblems-nextflow/bucket/${{ env.BRANCH }}" singlecellopenproblems/nf-openproblems --branch $BRANCH - name: Copy results run: | - cp -r /mnt/openproblems-nextflow/cwd/${WKDIR}/results . + cp -r /mnt/openproblems-nextflow/cwd/${{ env.BRANCH }}/results . - name: Parse results # There's a bug with the results getting pulled from Netflow with caching, but @@ -187,9 +183,9 @@ jobs: for image in $(cd docker && ls -1d */ | tr -d '/'); do aws ecr batch-delete-image --region $AWS_DEFAULT_REGION --repository-name openproblems --image-ids "imageTag=${BRANCH}-${image}" done - aws s3 rm "s3://openproblems-nextflow/work/${WKDIR}" - aws s3 rm "s3://openproblems-nextflow/bucket/${WKDIR}" - aws s3 rm "s3://openproblems-nextflow/cwd/${WKDIR}" + aws s3 rm "s3://openproblems-nextflow/work/${{ env.BRANCH }}" + aws s3 rm "s3://openproblems-nextflow/bucket/${{ env.BRANCH }}" + aws s3 rm "s3://openproblems-nextflow/cwd/${{ env.BRANCH }}" - name: Remove untagged images if: startsWith(github.ref, 'refs/heads/master') From 2d00dfefd50ff250b8a84b7268e2197250e10fa5 Mon Sep 17 00:00:00 2001 From: Daniel Burkhardt Date: Fri, 9 Apr 2021 13:51:56 -0400 Subject: [PATCH 32/35] fix nf wkdir --- .github/workflows/run_benchmark.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/run_benchmark.yml b/.github/workflows/run_benchmark.yml index e1abdf3b34..ee2c8582fb 100644 --- a/.github/workflows/run_benchmark.yml +++ b/.github/workflows/run_benchmark.yml @@ -149,9 +149,9 @@ jobs: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} TOWER_ACCESS_TOKEN: ${{ secrets.TOWER_ACCESS_KEY }} - AWS_DEFAULT_REGION: us-west-2 + AWS_DEFAULT_REGION: us-west-2 s run: >- - cd /mnt/openproblems-nextflow/cwd/${{ github.ref }} && + cd /mnt/openproblems-nextflow/cwd/${{ env.BRANCH }} && nextflow run $WITH_TOWER -ansi-log false From 229bbbcece0c37815ae8e112aaaf449c6edfac68 Mon Sep 17 00:00:00 2001 From: Daniel Burkhardt Date: Fri, 9 Apr 2021 14:43:35 -0400 Subject: [PATCH 33/35] fix region --- .github/workflows/run_benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/run_benchmark.yml b/.github/workflows/run_benchmark.yml index ee2c8582fb..2a39fdcdb2 100644 --- a/.github/workflows/run_benchmark.yml +++ b/.github/workflows/run_benchmark.yml @@ -149,7 +149,7 @@ jobs: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} TOWER_ACCESS_TOKEN: ${{ secrets.TOWER_ACCESS_KEY }} - AWS_DEFAULT_REGION: us-west-2 s + AWS_DEFAULT_REGION: us-west-2 run: >- cd /mnt/openproblems-nextflow/cwd/${{ env.BRANCH }} && nextflow run From 3c6956b5fd6cdec461de7368205421bee0417f78 Mon Sep 17 00:00:00 2001 From: MalteDLuecken Date: Wed, 21 Apr 2021 18:06:39 +0200 Subject: [PATCH 34/35] updated template for PR with PR evaluation checks (#314) --- .github/pull_request_template.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 943834a641..ca2bfe03b7 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -20,3 +20,15 @@ * [ ] This submission follows the guidelines in our [Contributing](../blob/master/CONTRIBUTING.md) document * [ ] I have checked to ensure there aren't other open [Pull Requests](../pulls) for the same update/change + +### PR review checklist + +This PR will be evaluated on the basis of the following checks: + +* [ ] The task addresses a valid open problem in single-cell analysis +* [ ] The latest version of master is merged and tested +* [ ] The methods/metrics are imported to `__init__.py` and were tested in the pipeline +* [ ] Method and metric decorators are annotated with paper title, year, author, code version, and date +* [ ] The README gives an outline of the methods, metrics and datasets in the folder +* [ ] The README provides a satisfactory task explanation (for new tasks) +* [ ] The sample test data is appropriate to test implementation of all methods and metrics (for new tasks) From a20bbc6b16ac0530a792cff9469d9b8beecc9cbf Mon Sep 17 00:00:00 2001 From: Wesley Lewis <59123674+wes-lewis@users.noreply.github.com> Date: Thu, 22 Apr 2021 08:51:38 -0400 Subject: [PATCH 35/35] Update alra.py (#304) * Update alra.py Fix pre-processing and transformation back into the original space * pre-commit * Update alra.py * make sure necessary methods are imported Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Daniel Burkhardt --- openproblems/tasks/denoising/methods/alra.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/openproblems/tasks/denoising/methods/alra.py b/openproblems/tasks/denoising/methods/alra.py index 8c97d75805..ad86cde24c 100644 --- a/openproblems/tasks/denoising/methods/alra.py +++ b/openproblems/tasks/denoising/methods/alra.py @@ -1,11 +1,13 @@ from ....tools.conversion import r_function from ....tools.decorators import method -from ....tools.utils import check_version # should be imported by docker import numpy as np import scprep +# from ....tools.utils import check_version + + _alra = r_function("alra.R") @@ -15,17 +17,20 @@ paper_url="https://www.biorxiv.org/content/10.1101/397588v1", paper_year=2018, code_url="https://github.com/KlugerLab/ALRA", - code_version=check_version("scprep"), + # code_version=check_version("scprep"), image="openproblems-r-extras", ) def alra(adata): # libsize and sqrt norm # overwrite obsm['train'] with normalized version + adata.obsm["train"] = scprep.utils.matrix_transform(adata.obsm["train"], np.sqrt) adata.obsm["train"], libsize = scprep.normalize.library_size_normalize( adata.obsm["train"], rescale=1, return_library_size=True ) - + # run alra + # _alra takes adata returns adata, edits "train" Y = _alra(adata)["train"] + # transform back into original space Y = scprep.utils.matrix_transform(Y, np.square) Y = scprep.utils.matrix_vector_elementwise_multiply(Y, libsize, axis=0) adata.obsm["denoised"] = Y