arl94 · arl94 · Mar 29, 2021 · Mar 29, 2021 · Mar 29, 2021 · Mar 29, 2021
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
@@ -20,3 +20,15 @@
 
 * [ ] This submission follows the guidelines in our [Contributing](../blob/master/CONTRIBUTING.md) document
 * [ ] I have checked to ensure there aren't other open [Pull Requests](../pulls) for the same update/change
+
+### PR review checklist
+
+This PR will be evaluated on the basis of the following checks:
+
+* [ ] The task addresses a valid open problem in single-cell analysis
+* [ ] The latest version of master is merged and tested
+* [ ] The methods/metrics are imported to `__init__.py` and were tested in the pipeline
+* [ ] Method and metric decorators are annotated with paper title, year, author, code version, and date
+* [ ] The README gives an outline of the methods, metrics and datasets in the folder
+* [ ] The README provides a satisfactory task explanation (for new tasks)
+* [ ] The sample test data is appropriate to test implementation of all methods and metrics (for new tasks)
diff --git a/.github/workflows/build_docker.yml b/.github/workflows/build_docker.yml
@@ -45,7 +45,7 @@ jobs:
     - name: Build Docker images
       run: |
         cd workflow
-        snakemake -j $(grep -c processor /proc/cpuinfo) docker
+        snakemake -j $(grep -c processor /proc/cpuinfo) docker_build
         cd ..
 
     - name: Push Docker images

diff --git a/.github/workflows/run_benchmark.yml b/.github/workflows/run_benchmark.yml
@@ -109,10 +109,11 @@ jobs:
         sudo mkdir -p /mnt/openproblems-nextflow
         sudo chown $USER /mnt/openproblems-nextflow
         s3fs -o umask=0277,uid=$(id -u) openproblems-nextflow /mnt/openproblems-nextflow
+        # Create bucket/ work/ and cwd/
         for dir in bucket work cwd; do
-          mkdir -p /mnt/openproblems-nextflow/${dir}/${{ github.ref }}
+          mkdir -p /mnt/openproblems-nextflow/${dir}/${{ env.BRANCH }}
         done
-        ls -l /mnt/openproblems-nextflow/*/${{ github.ref }}
+        ls -l /mnt/openproblems-nextflow/*/${{ env.BRANCH }}
 
     - name: Install package & dependencies
       run: |
@@ -150,20 +151,20 @@ jobs:
         TOWER_ACCESS_TOKEN: ${{ secrets.TOWER_ACCESS_KEY }}
         AWS_DEFAULT_REGION: us-west-2
       run: >-
-        cd /mnt/openproblems-nextflow/cwd/${{ github.ref }} &&
+        cd /mnt/openproblems-nextflow/cwd/${{ env.BRANCH }} &&
         nextflow run
         $WITH_TOWER
         -ansi-log false
         $RESUME
         -profile $PROFILE
-        -work-dir "/mnt/openproblems-nextflow/work/${{ github.ref }}"
-        -bucket-dir "s3://openproblems-nextflow/bucket/${{ github.ref }}"
+        -work-dir "/mnt/openproblems-nextflow/work/${{ env.BRANCH }}"
+        -bucket-dir "s3://openproblems-nextflow/bucket/${{ env.BRANCH }}"
         singlecellopenproblems/nf-openproblems
         --branch $BRANCH
 
     - name: Copy results
       run: |
-        cp -r /mnt/openproblems-nextflow/cwd/${{ github.ref }}/results .
+        cp -r /mnt/openproblems-nextflow/cwd/${{ env.BRANCH }}/results .
 
     - name: Parse results
       # There's a bug with the results getting pulled from Netflow with caching, but
@@ -182,9 +183,9 @@ jobs:
         for image in $(cd docker && ls -1d */ | tr -d '/'); do
           aws ecr batch-delete-image --region $AWS_DEFAULT_REGION --repository-name openproblems --image-ids "imageTag=${BRANCH}-${image}"
         done
-        aws s3 rm "s3://openproblems-nextflow/work/${{ github.ref }}"
-        aws s3 rm "s3://openproblems-nextflow/bucket/${{ github.ref }}"
-        aws s3 rm "s3://openproblems-nextflow/cwd/${{ github.ref }}"
+        aws s3 rm "s3://openproblems-nextflow/work/${{ env.BRANCH }}"
+        aws s3 rm "s3://openproblems-nextflow/bucket/${{ env.BRANCH }}"
+        aws s3 rm "s3://openproblems-nextflow/cwd/${{ env.BRANCH }}"
 
     - name: Remove untagged images
       if: startsWith(github.ref, 'refs/heads/master')

diff --git a/.gitignore b/.gitignore
@@ -143,3 +143,6 @@ resources/
 # Nextflow
 nf-openproblems
 .nextflow
+
+# Editor
+.idea
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -27,9 +27,9 @@ link to it from your website, or simply star it in GitHub to say "I use it".
   + [Writing functions in R](#writing-functions-in-r)
   + [Adding package dependencies](#adding-package-dependencies)
   + [Adding a new dataset](#adding-a-new-dataset)
-  + [Adding a dataset / method / metric to a task](#adding-a-dataset---method---metric-to-a-task)
+  + [Adding a dataset / method / metric to a task](#adding-a-dataset--method--metric-to-a-task)
   + [Adding a new task](#adding-a-new-task)
-  + [Adding a new Docker container](#adding-a-new-container)
+  + [Adding a new Docker container](#adding-a-new-docker-container)
 * [Code Style and Testing](#code-style-and-testing)
 * [Code of Conduct](#code-of-conduct)
 * [Attribution](#attribution)
@@ -177,6 +177,10 @@ If you are unable to write your method using our base dependencies, you may add
 
 Datasets are loaded under `openproblems/data`. Each data loading function should download the appropriate dataset from a stable location (e.g. from Figshare) be decorated with `openproblems.data.utils.loader` in order to cache the result.
 
+To see a gold standard loader, look at [openproblems/data/Wagner_2018_zebrafish_embryo_CRISPR.py](https://github.com/singlecellopenproblems/SingleCellOpenProblems/blob/master/openproblems/data/Wagner_2018_zebrafish_embryo_CRISPR.py)
+
+This file name should match `[First Author Last Name]_[Year Published]_short_Description_of_data.py`. E.g. the dataset of zebrafish embryos perturbed with CRISPR published in 2018 by Wagner _et al._ becomes `Wagner_2018_zebrafish_embryo_CRISPR.py`
+
 ### Adding a dataset / method / metric to a task
 
 To add a dataset, method, or metric to a task, simply create a new `.py` file corresponding to your proposed new functionality and import the main function in the corresponding `__init__.py`. E.g., to add a "F2" metric to the label projection task, we would create `openproblems/tasks/label_projection/metrics/f2.py` and add a line
@@ -235,7 +239,9 @@ Datasets, methods and metrics run inside Docker containers. We provide a few to
 
 ## Code Style and Testing
 
-`singlecellopenproblems` is maintained at close to 100% code coverage. For datasets, methods, and metrics, tests are generated automatically. For additions outside this core functionality, contributors are encouraged to write tests for their code -- but if you do not know how to do so, please do not feel discouraged from contributing code! Others can always help you test your contribution.
+`singlecellopenproblems` is maintained at close to 100% code coverage. For datasets, methods, and metrics, tests are generated programmatically from each task's `api.py`. See the [Adding a new task](#adding-a-new-task) section for instructions on creating this file.
+
+For additions outside this core functionality, contributors are encouraged to write tests for their code -- but if you do not know how to do so, please do not feel discouraged from contributing code! Others can always help you test your contribution.
 
 Code is tested by GitHub Actions when you push your changes. However, if you wish to test locally, you can do so with the following command:
 ```

diff --git a/README.md b/README.md
@@ -6,6 +6,7 @@
 [![Code Style: Black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
 [![Style Guide: OpenStack](https://img.shields.io/badge/style%20guide-openstack-eb1a32.svg)](https://docs.openstack.org/hacking/latest/user/hacking.html#styleguide)
 [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit)
+[![Video](https://img.shields.io/static/v1?label=YouTube&message=Visit%20channel&color=red&logo=youtube)](https://www.youtube.com/channel/UCJpqxlzxRamcA3Pv3KlYZHg)
 
 Formalizing and benchmarking open problems in single-cell genomics.
 
@@ -15,8 +16,11 @@ Formalizing and benchmarking open problems in single-cell genomics.
 ## Guides
 * For contributing guidelines, see [CONTRIBUTING.md](CONTRIBUTING.md)  
 * For instructions on editing Docker images, see [docker/README.md](docker/README.md)  
+* For instructions on using the `openproblems-cli`, see [openproblems/api/README.md](https://github.com/singlecellopenproblems/SingleCellOpenProblems/tree/master/openproblems/api)
 * For a walkthrough of the GitHub Actions workflows and AWS Sagemaker, see [SAGEMAKER.md](SAGEMAKER.md)  
 * For a description of existing an proposed tasks, see [TASKS.md](TASKS.md)  
+* For a video introduction to this GitHub repository, watch our [Repository introduction](https://www.youtube.com/watch?v=tHempZCdXyA)
+* For a video tutorial on adding new tasks, watch our [How to add a new task tutorial](https://www.youtube.com/watch?v=tgVG3Hp6mBc)
 
 ## The team
 
@@ -32,7 +36,7 @@ Formalizing and benchmarking open problems in single-cell genomics.
 * Wes Lewis (@weslewis) - Differential Abundance and Data Denoising
 * Mohammad Lotfallahi (@M0hammadL) - Label projection task
 * Qian Qin (@qinqian) - Predicting gene expression from chromatin accessibility
-* Daniel Strobel (@danielStrobel) - Batch integration
+* Daniel Strobl (@danielStrobl) - Batch integration
 * Michael Vinyard (@mvinyard) - Stress preservation in Dimensionality Reduction
 * Florian Wagner (@flo-compbio) - Data denoising
 

diff --git a/SAGEMAKER.md b/SAGEMAKER.md
@@ -31,7 +31,7 @@ There is a 1:1 correspondence between the steps to set up SageMaker using the CL
 - [Add user to SageMaker Studio](#add-user-to-sagemaker-studio)
 - [Open SageMaker Studio and Launch a Notebook using a Custom Image](#open-sagemaker-studio-and-launch-a-notebook-using-a-custom-image)
   * [Selecting an instance type](#selecting-an-instance-type)
-  * [Kernel not found error](#kernel-not-found-error)
+  * [Failed to start kernel (image does not exist)](#failed-to-start-kernel-image-does-not-exist)
 
 <!-- Table of contents generated with [markdown-toc](http://ecotrust-canada.github.io/markdown-toc/) -->
 
@@ -186,7 +186,7 @@ We've selected three instances to use during the Jamboree. Note, it is possible
 To change your instance, follow the [Change Instance Type](https://docs.aws.amazon.com/sagemaker/latest/dg/notebooks-run-and-manage-switch-instance-type.html) tutorial from AWS.
 
 
-### Kernel not found error
+### Failed to start kernel (image does not exist)
 
 If you see the following error:
 

diff --git a/TASKS.md b/TASKS.md
@@ -1,4 +1,4 @@
-## Single-Cell Analysis Benchmarking Tasks
+# Benchmarking Task Descriptions
 
 Table of Contents
   * [Predicting gene expression from chromatin accessibility](#predicting-gene-expression-from-chromatin-accessibility)

diff --git a/docker/README.md b/docker/README.md
@@ -15,6 +15,7 @@ Note, all images must comply to the [AWS SageMaker Custom Image Specifications](
 - [Building Docker images locally](#building-docker-images-locally)
 - [Building Docker images through GitHub Actions workflows](#building-docker-images-through-github-actions-workflows)
 - [Pulling images from the ECR to your local machine](#pulling-images-from-the-ecr-to-your-local-machine)
+- [Running Docker images locally](#running-docker-images-locally)
 
 <!-- Table of contents generated with [markdown-toc](http://ecotrust-canada.github.io/markdown-toc/) -->
 
@@ -126,3 +127,38 @@ docker pull <aws_account_id>.dkr.ecr.us-west-2.amazonaws.com/openproblems:<Image
 ```
 
 If you would like to attach this image to AWS SageMaker, you can follow our [SageMaker and ECR tutorial.](https://github.com/singlecellopenproblems/SingleCellOpenProblems/blob/master/SAGEMAKER.md)
+
+You can also pull base images from [DockerHub](https://hub.docker.com/r/singlecellopenproblems/openproblems):
+```
+docker pull singlecellopenproblems/openproblems-python-extras:latest
+```
+
+
+## Running Docker images locally
+
+To run Docker images on your local machine, you must have `docker` installed. Follow the Docker guide to [Install Docker](https://docs.docker.com/get-docker/).
+
+Once you've either built Docker images locally or pulled them from ECR or the [singlecellopenproblems DockerHub](https://hub.docker.com/r/singlecellopenproblems/openproblems), you can see installed images using `docker images`.
+
+```
+> docker images
+REPOSITORY                                                  TAG                                                 IMAGE ID       CREATED        SIZE
+singlecellopenproblems/openproblems-python-extras           latest                                              f86e1c5ce9d0   14 hours ago   3.94GB
+singlecellopenproblems/openproblems-r-base                  latest                                              f8908c9fb387   21 hours ago   6.36GB
+singlecellopenproblems/openproblems-r-extras                latest                                              7e15120bb7ce   5 days ago     4.89GB
+singlecellopenproblems/openproblems                         latest                                              14974cbd2f58   5 days ago     2.1GB
+490915662541.dkr.ecr.us-west-2.amazonaws.com/openproblems   batch_integration_docker-openproblems               3a1ce37e85f2   6 days ago     2.06GB
+```
+
+You can then run commands within a docker container using `docker run`. Consult the [Docker documentation](https://docs.docker.com/engine/reference/commandline/run/) to learn more about the `run` command.
+
+**Using `IMAGE ID`**
+```
+docker run -it f86e1c5ce9d0 /bin/bash
+```
+
+**Using `REPOSITORY:TAG`**
+```
+docker run -it singlecellopenproblems/openproblems-python-extras:latest /bin/bash
+
+```
diff --git a/docker/openproblems-python-extras/requirements.txt b/docker/openproblems-python-extras/requirements.txt
@@ -5,3 +5,4 @@ phate
 pyensembl
 pybedtools
 git+https://github.com/czbiohub/molecular-cross-validation
+git+https://github.com/atong01/SCOT
diff --git a/docker/openproblems/Dockerfile b/docker/openproblems/Dockerfile
@@ -1,5 +1,7 @@
 FROM python:3.8
 
+# Adding this comment as a temporary bugfix on 3/30
+
 # Setting up Sagemaker Studio Image from example
 # https://github.com/aws-samples/sagemaker-studio-custom-image-samples/blob/main/examples/echo-kernel-image/Dockerfile
 

diff --git a/openproblems/api/README.md b/openproblems/api/README.md
@@ -26,17 +26,52 @@ optional arguments:
 ```
 
 ## Example (without docker)
+Running the CLI requires commands to be run in a specific order: `load` -> `run` -> `evaluate`.
 
+For example:
 ```
-openproblems-cli tasks
-openproblems-cli list --datasets --task label_projection
+# Download a task-specific dataset and save it to `dataset.h5ad`
 openproblems-cli load --task label_projection --output dataset.h5ad pancreas_batch
-openproblems-cli list --methods --task label_projection
+# Run a method on a dataset and save output to `method.h5ad`
 openproblems-cli run --task label_projection --input dataset.h5ad --output method.h5ad logistic_regression_log_cpm
-openproblems-cli list --metrics --task label_projection
+# Evaluate the performance of a previously run method using the `accuracy` metric
 openproblems-cli evaluate --task label_projection --input method.h5ad accuracy
 ```
 
+You can list available tasks using `openproblems-cli tasks`
+```
+> openproblems-cli tasks
+denoising
+dimensionality_reduction
+label_projection
+multimodal_data_integration
+regulatory_effect_prediction
+```
+
+You can then list the available datasets, methods, and metrics for a particular task using `openproblems-cli list --[datasets|methods|metrics] --task [task_name]`
+```
+> openproblems-cli list --datasets --task label_projection
+pancreas_batch
+pancreas_random
+zebrafish_labels
+zebrafish_random
+
+> openproblems-cli list --methods --task label_projection
+knn_classifier_log_cpm
+knn_classifier_scran
+logistic_regression_log_cpm
+logistic_regression_scran
+mlp_log_cpm
+mlp_scran
+
+> openproblems-cli list --metrics --task label_projection
+accuracy
+f1
+f1_micro
+```
+
+The outputs of these commands are allowable arguments to the respective `load`, `run`, and `evaluate` commands.
+
 ### Sample output
 
 ```

diff --git a/openproblems/api/main.py b/openproblems/api/main.py
@@ -40,7 +40,7 @@ def _main(args=None):
 def main(args=None, do_print=True):
     """Run the command-line interface."""
     output = _main(args)
-    if do_print and output:
+    if do_print:
         utils.print_output(output)
         return 0
     else: