AlexsLemonade
diff --git a/‎.github/workflows/deploy_prod_backend.yml‎
Lines changed: 3 additions & 3 deletions b/‎.github/workflows/deploy_prod_backend.yml‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎.github/workflows/deploy_staging_backend.yml‎
Lines changed: 3 additions & 3 deletions b/‎.github/workflows/deploy_staging_backend.yml‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎.gitignore‎
Lines changed: 7 additions & 3 deletions b/‎.gitignore‎
Lines changed: 7 additions & 3 deletions
diff --git a/‎api/scpca_portal/config/common.py‎
Lines changed: 3 additions & 0 deletions b/‎api/scpca_portal/config/common.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎api/scpca_portal/config/production.py‎
Lines changed: 4 additions & 3 deletions b/‎api/scpca_portal/config/production.py‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎api/scpca_portal/loader.py‎
Lines changed: 50 additions & 20 deletions b/‎api/scpca_portal/loader.py‎
Lines changed: 50 additions & 20 deletions
diff --git a/‎api/scpca_portal/management/commands/dispatch_to_batch.py‎
Lines changed: 91 additions & 0 deletions b/‎api/scpca_portal/management/commands/dispatch_to_batch.py‎
Lines changed: 91 additions & 0 deletions
diff --git a/‎api/scpca_portal/management/commands/generate_computed_file.py‎
Lines changed: 75 additions & 0 deletions b/‎api/scpca_portal/management/commands/generate_computed_file.py‎
Lines changed: 75 additions & 0 deletions
@@ -11,7 +11,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
-        uses: actions/checkout@v2
+        uses: actions/checkout@v4
 
       - name: Load 1Password Secrets
         id: op-load-secrets
@@ -31,9 +31,9 @@ jobs:
           SENTRY_DSN: "${{ secrets.OP_SENTRY_DSN }}"
 
       - name: Setup Terraform
-        uses: hashicorp/setup-terraform@v1
+        uses: hashicorp/setup-terraform@v3
         with:
-          terraform_version: 0.12.26
+          terraform_version: 0.13.0
 
       - name: Deploy
         run: cd infrastructure && python3 deploy.py -e prod -u deployer -d ccdl -v $(git rev-parse HEAD)
 
@@ -11,7 +11,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
-        uses: actions/checkout@v2
+        uses: actions/checkout@v4
 
       - name: Load 1Password Secrets
         id: op-load-secrets
@@ -31,9 +31,9 @@ jobs:
           SENTRY_DSN: "${{ secrets.OP_SENTRY_DSN }}"
 
       - name: Setup Terraform
-        uses: hashicorp/setup-terraform@v1
+        uses: hashicorp/setup-terraform@v3
         with:
-          terraform_version: 0.12.26
+          terraform_version: 0.13.0
 
       - name: Deploy
         run: cd infrastructure && python3 deploy.py -e staging -u deployer -d ccdlstaging -v $(git rev-parse HEAD)
 
@@ -128,6 +128,13 @@ infrastructure/.terraform.lock.hcl
 .vscode
 *.code-workspace
 
+# SSH keys
+*.pem
+*.pub
+
+# 1Password integration
+.op/
+
 #
 # Client
 #
@@ -147,9 +154,6 @@ client/out/
 # production
 client/build
 
-# misc
-*.pem
-
 # debug
 client/npm-debug.log*
 client/yarn-debug.log*
 
@@ -184,3 +184,6 @@ class Common(Configuration):
     CORS_ALLOW_HEADERS = default_headers + (API_KEY_HEADER,)
 
     TERMS_AND_CONDITIONS = "PLACEHOLDER"
+
+    # AWS
+    AWS_REGION = os.getenv("AWS_REGION", "us-east-1")
@@ -15,13 +15,14 @@ class Production(Common):
 
     UPDATE_S3_DATA = True
 
-    # AWS
-    AWS_REGION = os.getenv("AWS_REGION")
-
     # AWS S3
     AWS_S3_INPUT_BUCKET_NAME = "scpca-portal-inputs"
     AWS_S3_OUTPUT_BUCKET_NAME = os.getenv("AWS_S3_BUCKET_NAME")
 
+    # AWS Batch
+    AWS_BATCH_JOB_QUEUE_NAME = os.environ.get("AWS_BATCH_JOB_QUEUE_NAME")
+    AWS_BATCH_JOB_DEFINITION_NAME = os.environ.get("AWS_BATCH_JOB_DEFINITION_NAME")
+
     # https://developers.google.com/web/fundamentals/performance/optimizing-content-efficiency/http-caching#cache-control
     # Response can be cached by browser and any intermediary caches
     # (i.e. it is "public") for up to 1 day
 
@@ -1,7 +1,6 @@
 import shutil
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
-from threading import Lock
 from typing import Any, Dict, List, Set
 
 from django.conf import settings
@@ -10,7 +9,14 @@
 
 from scpca_portal import common, metadata_file, s3
 from scpca_portal.config.logging import get_and_configure_logger
-from scpca_portal.models import ComputedFile, Contact, ExternalAccession, Project, Publication
+from scpca_portal.models import (
+    ComputedFile,
+    Contact,
+    ExternalAccession,
+    Project,
+    Publication,
+    Sample,
+)
 
 logger = get_and_configure_logger(__name__)
 
@@ -136,25 +142,55 @@ def create_project(
     return project
 
 
-def _create_computed_file(future, *, update_s3: bool, clean_up_output_data: bool) -> None:
+def _create_computed_file(
+    computed_file: ComputedFile, update_s3: bool, clean_up_output_data: bool
+) -> None:
     """
     Save computed file returned from future to the db.
     Upload file to s3 and clean up output data depending on passed options.
     """
-    if computed_file := future.result():
-
-        # Only upload and clean up projects and the last sample if multiplexed
-        if computed_file.project or computed_file.sample.is_last_multiplexed_sample:
-            if update_s3:
-                s3.upload_output_file(computed_file.s3_key, computed_file.s3_bucket)
-            if clean_up_output_data:
-                computed_file.clean_up_local_computed_file()
+    # NOTE(review): the docstring above still mentions a future, but the computed file
+    # is now passed in directly by the caller; no future is resolved here.
+    # Upload and local clean-up are each opt-in via the flags and run before the
+    # record is written to the db.
+    if update_s3:
+        s3.upload_output_file(computed_file.s3_key, computed_file.s3_bucket)
+    if clean_up_output_data:
+        computed_file.clean_up_local_computed_file()
+
+    # A sample file with multiplexed data is persisted as the batch of records returned
+    # by get_multiplexed_computed_files() (presumably one per sample sharing the file;
+    # TODO confirm). Any other file is saved as a single record.
+    if computed_file.sample and computed_file.has_multiplexed_data:
+        computed_files = computed_file.get_multiplexed_computed_files()
+        ComputedFile.objects.bulk_create(computed_files)
+    else:
         computed_file.save()
 
+
+def _create_computed_file_callback(future, *, update_s3: bool, clean_up_output_data: bool) -> None:
+    """
+    Done-callback that persists the computed file produced by a finished task.
+
+    Resolves the future and, when it yielded a computed file, hands that file to
+    _create_computed_file together with the upload and clean-up options.
+    The db connection is closed in all cases, including when the task itself
+    or the save/upload raised.
+    """
+    try:
+        if computed_file := future.result():
+            _create_computed_file(computed_file, update_s3, clean_up_output_data)
-    # Close DB connection for each thread.
-    connection.close()
+    finally:
+        # Close DB connection for each thread, even when future.result() re-raised the
+        # task's exception or saving failed; otherwise the connection would be left open.
+        connection.close()
 
 
+def generate_computed_file(
+    *,
+    download_config: Dict,
+    project: Project | None = None,
+    sample: Sample | None = None,
+    update_s3: bool = True,
+) -> None:
+    """
+    Regenerate the single computed file that matches the passed download config
+    for a project or for a sample.
+
+    An existing computed file for that resource and config is purged first.
+    The new file is then created, uploaded to s3 if update_s3 is set, and saved to the db;
+    clean-up of the local output data is not requested.
+    After a sample file is created, its project's downloadable sample count is updated.
+
+    Raises a ValueError if neither a project nor a sample is passed.
+    """
+    resource = project or sample
+    if not resource:
+        # Fail with a clear message instead of an AttributeError on None below.
+        raise ValueError("Either a project or a sample must be passed.")
+
+    # Purge old computed file
+    if old_computed_file := resource.get_computed_file(download_config):
+        old_computed_file.purge(update_s3)
+
+    if project and (computed_file := ComputedFile.get_project_file(project, download_config)):
+        _create_computed_file(computed_file, update_s3, clean_up_output_data=False)
+    if sample and (computed_file := ComputedFile.get_sample_file(sample, download_config)):
+        _create_computed_file(computed_file, update_s3, clean_up_output_data=False)
+        sample.project.update_downloadable_sample_count()
+
+
 def generate_computed_files(
     project: Project,
     max_workers: int,
@@ -170,33 +206,27 @@ def generate_computed_files(
 
     # Prep callback function
     on_get_file = partial(
-        _create_computed_file,
+        _create_computed_file_callback,
         update_s3=update_s3,
         clean_up_output_data=clean_up_output_data,
     )
-    # Prepare a threading.Lock for each sample, with the chief purpose being to protect
-    # multiplexed samples that share a zip file.
-    locks = {}
+
     with ThreadPoolExecutor(max_workers=max_workers) as tasks:
         # Generated project computed files
         for config in common.GENERATED_PROJECT_DOWNLOAD_CONFIGS:
             tasks.submit(
                 ComputedFile.get_project_file,
                 project,
                 config,
-                project.get_output_file_name(config),
             ).add_done_callback(on_get_file)
 
         # Generated sample computed files
-        for sample in project.samples.all():
+        for sample in project.samples_to_generate:
             for config in common.GENERATED_SAMPLE_DOWNLOAD_CONFIGS:
-                sample_lock = locks.setdefault(sample.get_config_identifier(config), Lock())
                 tasks.submit(
                     ComputedFile.get_sample_file,
                     sample,
                     config,
-                    sample.get_output_file_name(config),
-                    sample_lock,
                 ).add_done_callback(on_get_file)
 
     project.update_downloadable_sample_count()
@@ -0,0 +1,91 @@
+import logging
+
+from django.conf import settings
+from django.core.management.base import BaseCommand
+
+import boto3
+
+from scpca_portal import common
+from scpca_portal.models import Project
+
+# Batch client used by every job submission in this command; it is created at import
+# time for the region configured in settings.AWS_REGION.
+batch = boto3.client(
+    "batch",
+    region_name=settings.AWS_REGION,
+)
+# NOTE(review): getLogger() without a name returns the root logger, so the level and
+# the stream handler set here apply process-wide. Confirm this is intended.
+logger = logging.getLogger()
+logger.setLevel(logging.INFO)
+logger.addHandler(logging.StreamHandler())
+
+
+class Command(BaseCommand):
+    help = """
+    Submits all computed file combinations to the specified AWS Batch job queue
+    for projects for which computed files have yet to be generated for them.
+    If a project-id is passed, then computed files are only submitted for that specific project.
+    """
+
+    def add_arguments(self, parser):
+        """Register the optional --project-id argument."""
+        parser.add_argument("--project-id", type=str)
+
+    def handle(self, *args, **kwargs):
+        """Entry point called by Django; forwards the parsed options to dispatch_to_batch."""
+        self.dispatch_to_batch(**kwargs)
+
+    def submit_job(
+        self,
+        *,
+        download_config_name: str,
+        project_id: str = "",
+        sample_id: str = "",
+    ) -> None:
+        """
+        Submit one job to AWS Batch for the passed resource id and download config name.
+
+        The job runs the generate_computed_file management command with either
+        --project-id or --sample-id; project_id takes precedence when both are passed.
+        The job queue and job definition names are read from settings.
+
+        Raises a ValueError if neither a project_id nor a sample_id is passed.
+        """
+        resource_id = project_id or sample_id
+        if not resource_id:
+            # Without an id the job would be submitted with an empty resource argument.
+            raise ValueError("Either a project_id or a sample_id must be passed.")
+
+        resource_flag = "--project-id" if project_id else "--sample-id"
+        job_name = f"{resource_id}-{download_config_name}"
+
+        response = batch.submit_job(
+            jobName=job_name,
+            jobQueue=settings.AWS_BATCH_JOB_QUEUE_NAME,
+            jobDefinition=settings.AWS_BATCH_JOB_DEFINITION_NAME,
+            containerOverrides={
+                "command": [
+                    "python",
+                    "manage.py",
+                    "generate_computed_file",
+                    resource_flag,
+                    resource_id,
+                    "--download-config-name",
+                    download_config_name,
+                ],
+            },
+        )
+
+        logger.info(f'{job_name} submitted to Batch with jobId {response["jobId"]}')
+
+    def dispatch_to_batch(self, project_id: str = "", **kwargs):
+        """
+        Iterate over all projects that don't have computed files and submit each
+        resource id and download config combination to the Batch queue.
+        If a project id is passed, then jobs are submitted for all combinations
+        within that project only.
+        """
+        projects = (
+            Project.objects.filter(scpca_id=project_id)
+            if project_id
+            # "isnull" is the Django lookup name; "is_null" is not a valid lookup.
+            else Project.objects.filter(project_computed_files__isnull=True)
+        )
+
+        for project in projects:
+            for download_config_name in common.PROJECT_DOWNLOAD_CONFIGS:
+                self.submit_job(
+                    project_id=project.scpca_id,
+                    download_config_name=download_config_name,
+                )
+
+            for sample in project.samples_to_generate:
+                for download_config_name in common.SAMPLE_DOWNLOAD_CONFIGS:
+                    self.submit_job(
+                        sample_id=sample.scpca_id,
+                        download_config_name=download_config_name,
+                    )
@@ -0,0 +1,75 @@
+import logging
+
+from django.core.management.base import BaseCommand, CommandError
+
+from scpca_portal import common, loader
+from scpca_portal.models import Project, Sample
+
+# NOTE(review): getLogger() without a name returns the root logger, so the level and
+# the stream handler set here apply process-wide. Confirm this is intended.
+logger = logging.getLogger()
+logger.setLevel(logging.INFO)
+logger.addHandler(logging.StreamHandler())
+
+
+class Command(BaseCommand):
+    help = """
+    This command is meant to be called as an entrypoint to AWS Batch Fargate job instance.
+    Individual files are computed according:
+        - To the project or sample id
+        - An appropriate corresponding download config
+
+    When computation is completed, files are uploaded to S3, and the job is marked as completed.
+
+    At which point the instance which generated this computed file will receive a new job
+    from the job queue and begin computing the next file.
+    """
+
+    def add_arguments(self, parser):
+        """Register the --project-id, --sample-id and --download-config-name arguments."""
+        parser.add_argument("--project-id", type=str)
+        parser.add_argument("--sample-id", type=str)
+        parser.add_argument("--download-config-name", type=str)
+
+    def handle(self, *args, **kwargs):
+        """Entry point called by Django; forwards the parsed options."""
+        self.generate_computed_file(**kwargs)
+
+    def generate_computed_file(
+        self,
+        project_id: str,
+        sample_id: str,
+        download_config_name: str,
+        **kwargs,
+    ) -> None:
+        """
+        Generate one computed file for a project or a sample and a download config name.
+
+        Exactly one of project_id and sample_id must be passed, the resource must exist,
+        and download_config_name must be a key of the matching download configs.
+        All input is validated before the data dirs are prepared and the file is generated.
+
+        Raises a CommandError on invalid input, so that the command stops with a clear
+        message instead of logging an error and then failing later on a missing resource
+        or config.
+        """
+        if bool(project_id) == bool(sample_id):
+            raise CommandError(
+                "Invalid id combination. Passed ids must be mutually exclusive. "
+                "Either a project_id or a sample_id must be passed, but not both or neither."
+            )
+
+        if project_id:
+            resource_name, resource_id = "project", project_id
+            model, download_configs = Project, common.PROJECT_DOWNLOAD_CONFIGS
+        else:
+            resource_name, resource_id = "sample", sample_id
+            model, download_configs = Sample, common.SAMPLE_DOWNLOAD_CONFIGS
+
+        resource = model.objects.filter(scpca_id=resource_id).first()
+        if not resource:
+            # Report the requested id; the looked-up object is None at this point.
+            raise CommandError(f"{resource_id} does not exist.")
+
+        if download_config_name not in download_configs:
+            raise CommandError(
+                f"{download_config_name} is not a valid {resource_name} download config name. "
+                f"Here are valid download_config_name values for {resource_name}s: "
+                f"{list(download_configs)}"
+            )
+
+        # Only touch the data dirs once the input is known to be valid.
+        loader.prep_data_dirs()
+
+        download_config = download_configs[download_config_name]
+        if project_id:
+            loader.generate_computed_file(project=resource, download_config=download_config)
+        else:
+            loader.generate_computed_file(sample=resource, download_config=download_config)