Pangenomics/script.sh at main · sohaibzafar90/Pangenomics · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/bin/bash
# ==============================================================
# Mycobacterium abscessus Complex (MABC) Comparative Genomics Pipeline
# Target: 12 Isolates (3-Subspecies: abscessus, massiliense, bolletii)
# Analyses: Pangenome, fastANI Phylogeny, & Metabolic Reconstruction
# ==============================================================

# Activate Conda Environment (created in external ssd)
eval "$(conda shell.bash hook)"
conda activate /mnt/samsung_ssd/conda_envs/anvio-9

# STEP 1: Setup Directories
mkdir -p 01_FASTA 02_CONTIGS 03_PAN 04_METABOLISM 05_ANI

# Genome files were downloaded individually from NCBI FTP server
# STEP 2: Decompress FASTA files downloaded via FTP (NCBI)
echo "Decompressing .fna.gz files..."
gunzip -f 01_FASTA/*.fna.gz

# STEP 3: Reformat fasta for anvio, Build contigsDBs, and Annotate
echo "Processing genomes..."
for f in 01_FASTA/*.fna; do
    # Extract the GCF number and replace the period with an underscore
    MAB=$(basename "$f" _genomic.fna | cut -d'_' -f1,2)
    MAB=${MAB//./_}

    echo "------------------------------------------------"
    echo " Running pipeline for: $MAB "
    echo "------------------------------------------------"

    anvi-script-reformat-fasta "$f" -o "01_FASTA/${MAB}.fasta" -l 1000 --simplify-names
    anvi-gen-contigs-database -f "01_FASTA/${MAB}.fasta" -o "02_CONTIGS/${MAB}.db" -n "$MAB"

    anvi-run-hmms -c "02_CONTIGS/${MAB}.db" --num-threads 10
    anvi-run-ncbi-cogs -c "02_CONTIGS/${MAB}.db" --num-threads 10
    anvi-scan-trnas -c "02_CONTIGS/${MAB}.db" --num-threads 10
    anvi-run-scg-taxonomy -c "02_CONTIGS/${MAB}.db" --num-threads 10
    anvi-run-kegg-kofams -c "02_CONTIGS/${MAB}.db" --num-threads 10
done


# STEP 4: Quality Control
echo "Generating external genomes file and checking completeness..."
anvi-script-gen-genomes-file --input-dir 02_CONTIGS -o external-genomes.txt
anvi-estimate-genome-completeness -e external-genomes.txt


# STEP 5: Compute Pangenome
echo "Consolidating storage and calculating Pangenome..."
anvi-gen-genomes-storage -e external-genomes.txt -o MAb-GENOMES.db

# MCL inflation set to 10 to resolve the highly conserved MAb core vs accessory genome
anvi-pan-genome -g MAb-GENOMES.db \
                --project-name MAb_Pan \
                --output-dir 03_PAN \
                --num-threads 10 \
                --mcl-inflation 10


# STEP 6: Metabolic Reconstruction
echo "Estimating metabolic module completeness..."
anvi-estimate-metabolism -e external-genomes.txt \
                         -O 04_METABOLISM/MAb_Metabolic_Modules \
                         --matrix-format
# STEP 6.1: Metabolic Pathway enrichment
anvi-compute-metabolic-enrichment -M 04_METABOLISM/MAb_Metabolic_Modules_modules.txt \
                                   -G 04_METABOLISM/subspecies.txt \
                                   -o MAb_metabolic_enrichment.txt
# Step 6.2: Interactive visualization of metabolic module completeness
anvi-interactive --manual \
                 -d 04_METABOLISM/MAb_Metabolic_Modules-module_pathwise_completeness-MATRIX.txt \
                 -p 04_METABOLISM/METABOLISM-PROFILE.db \
                 --title "MABC Metabolic Completeness & Enrichment"

# STEP 7: Compute Average Nucleotide Identity (ANI)
echo "Calculating genome similarity (ANI)..."
anvi-compute-genome-similarity -e external-genomes.txt \
                               --program fastANI \
                               --output-dir 05_ANI \
                               --num-threads 10 \
                               --pan-db 03_PAN/MAb_Pan-PAN.db \
                               --force-overwrite
=====================================================
echo "Pipeline complete! finished at $(date +%T)"
=====================================================
anvi-display-pan -p 03_PAN/MAb_Pan-PAN.db -g MAb-GENOMES.db