Add samplesheet output with bams

pinin4fjords · pinin4fjords · commit 83e312f24029 · 2025-09-08T10:40:48.000+01:00
diff --git a/docs/output.md b/docs/output.md
@@ -868,6 +868,7 @@ A number of genome-specific files are generated by the pipeline because they are
   - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameter's are used when running the pipeline.
   - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`.
   - Parameters used by the pipeline run: `params.json`.
+  - `samplesheet_with_bams.csv`: **Auto-generated complete samplesheet** containing all samples with BAM file paths (only written when alignment is run, i.e. not with `--skip_alignment`). For samples processed from FASTQ, includes paths to newly generated BAMs; for samples that were BAM input, preserves the original input paths. This comprehensive samplesheet can be used directly for future pipeline runs, enabling efficient reprocessing without re-alignment.
 
 </details>
 
diff --git a/docs/usage.md b/docs/usage.md
@@ -129,6 +129,7 @@ SAMPLE2,sample2_R1.fastq.gz,sample2_R2.fastq.gz,forward,,
 - When using BAM input, you can leave the FASTQ columns empty or omit them
 - Mixed samplesheets (some samples with FASTQ, others with BAM) are supported
 - For BAM file locations from pipeline outputs, see the [output documentation](https://nf-co.re/rnaseq/output)
+- **Automated samplesheet generation**: Unless `--skip_alignment` is set, the pipeline automatically generates a `samplesheet_with_bams.csv` file in the `pipeline_info/` directory containing all samples with their BAM file paths. For FASTQ-derived samples, this includes paths to newly generated BAMs; for BAM input samples, it preserves the original input paths. This complete samplesheet can be used directly for future pipeline runs.
 
 ## FASTQ sampling
 
diff --git a/subworkflows/local/utils_nfcore_rnaseq_pipeline/main.nf b/subworkflows/local/utils_nfcore_rnaseq_pipeline/main.nf
@@ -177,15 +177,24 @@ def checkSamplesAfterGrouping(input) {
         def genome_bam = genome_bams?.find { it != null }
         def transcriptome_bam = transcriptome_bams?.find { it != null }
         
-        // Add BAM flags to meta
+        // Add BAM flags and original paths to meta
         def meta_with_bams = metas[0] + [
             has_genome_bam: genome_bam ? true : false,
-            has_transcriptome_bam: transcriptome_bam ? true : false
+            has_transcriptome_bam: transcriptome_bam ? true : false,
+            original_genome_bam: genome_bam ?: null,
+            original_transcriptome_bam: transcriptome_bam ?: null
         ]
         
         return [ meta_with_bams, fastqs, genome_bam, transcriptome_bam ]
     } else {
-        return [ metas[0], fastqs ]
+        // Add null BAM fields to meta for consistency
+        def meta_no_bams = metas[0] + [
+            has_genome_bam: false,
+            has_transcriptome_bam: false,
+            original_genome_bam: null,
+            original_transcriptome_bam: null
+        ]
+        return [ meta_no_bams, fastqs ]
     }
 }
 
@@ -635,6 +644,36 @@ def getInferexperimentStrandedness(inferexperiment_file, stranded_threshold = 0.
     return calculateStrandedness(forwardFragments, reverseFragments, unstrandedFragments, stranded_threshold, unstranded_threshold)
 }
 
+//
+// Map a work-directory BAM path to the location it is expected to have under
+// `<outdir>/<aligner>/` after publishing. Returns '' when `bam_path` is unset.
+// Trailing slashes on `outdir` are stripped so `--outdir results/` (or an object
+// store URI ending in `/`) cannot produce a `results//<aligner>/...` path.
+//
+def mapBamToPublishedPath(bam_path, sample_id, aligner, outdir) {
+    if (!bam_path) return ''
+    def filename = file(bam_path).getName()
+    def base_dir = outdir.toString().replaceAll('/+$', '') + "/${aligner}"
+    // Published name depends on the aligner and on a marker in the work filename
+    if (aligner == 'star_salmon') {
+        if (filename.contains('Aligned.out.bam')) {
+            return "${base_dir}/${sample_id}.Aligned.out.bam"
+        } else if (filename.contains('toTranscriptome')) {
+            return "${base_dir}/${sample_id}.Aligned.toTranscriptome.out.bam"
+        }
+    } else if (aligner == 'star_rsem') {
+        if (filename.contains('genome.bam')) {
+            return "${base_dir}/${sample_id}.STAR.genome.bam"
+        } else if (filename.contains('transcript.bam')) {
+            return "${base_dir}/${sample_id}.transcript.bam"
+        }
+    } else if (aligner == 'hisat2') {
+        return "${base_dir}/${sample_id}.bam"
+    }
+    // No known pattern matched: keep the work-directory filename
+    return "${base_dir}/${filename}"
+}
+
 //
 // Print pipeline summary on completion
 //
diff --git a/tower.yml b/tower.yml
@@ -53,3 +53,5 @@ reports:
     display: "All samples STAR RSEM merged transcript raw counts"
   "**/star_rsem/rsem.merged.transcript_tpm.tsv":
     display: "All samples STAR RSEM merged transcript TPM counts"
+  "**/pipeline_info/samplesheet_with_bams.csv":
+    display: "Samplesheet with BAM paths for reanalysis"
diff --git a/workflows/rnaseq/main.nf b/workflows/rnaseq/main.nf
@@ -26,6 +26,7 @@ include { getStarPercentMapped           } from '../../subworkflows/local/utils_
 include { biotypeInGtf                   } from '../../subworkflows/local/utils_nfcore_rnaseq_pipeline'
 include { getInferexperimentStrandedness } from '../../subworkflows/local/utils_nfcore_rnaseq_pipeline'
 include { methodsDescriptionText         } from '../../subworkflows/local/utils_nfcore_rnaseq_pipeline'
+include { mapBamToPublishedPath          } from '../../subworkflows/local/utils_nfcore_rnaseq_pipeline'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -770,6 +771,39 @@ workflow RNASEQ {
         ch_multiqc_report = MULTIQC.out.report
     }
 
+    //
+    // Generate samplesheet with BAM paths for future runs
+    //
+    if (!params.skip_alignment) {
+        // Create channel with original input info and BAM paths
+        ch_fastq
+            .join(ch_genome_bam, by: 0, remainder: true)
+            .join(ch_transcriptome_bam, by: 0, remainder: true)
+            .map { meta, reads, genome_bam, transcriptome_bam ->
+                // Extract FASTQ paths
+                def fastq_1 = reads && reads.size() > 0 ? reads[0] : ''
+                def fastq_2 = reads && reads.size() > 1 ? reads[1] : ''
+                
+                // Handle BAM paths - use original input paths for BAM input samples, published paths for FASTQ-derived samples
+                def genome_bam_published = meta.has_genome_bam ? 
+                    (meta.original_genome_bam ?: '') : 
+                    mapBamToPublishedPath(genome_bam, meta.id, params.aligner, params.outdir)
+                    
+                def transcriptome_bam_published = meta.has_transcriptome_bam ? 
+                    (meta.original_transcriptome_bam ?: '') : 
+                    mapBamToPublishedPath(transcriptome_bam, meta.id, params.aligner, params.outdir)
+                
+                // Return CSV line
+                return "${meta.id},${fastq_1},${fastq_2},${meta.strandedness},${genome_bam_published},${transcriptome_bam_published}"
+            }
+            .collectFile(
+                name: 'samplesheet_with_bams.csv',
+                storeDir: "${params.outdir}/pipeline_info",
+                newLine: true,
+                seed: 'sample,fastq_1,fastq_2,strandedness,genome_bam,transcriptome_bam'
+            )
+    }
+
     emit:
     trim_status    = ch_trim_status    // channel: [id, boolean]
     map_status     = ch_map_status     // channel: [id, boolean]