-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscript.sh
More file actions
90 lines (71 loc) · 3.77 KB
/
script.sh
File metadata and controls
90 lines (71 loc) · 3.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/bin/bash
# ==============================================================
# Mycobacterium abscessus Complex (MABC) Comparative Genomics Pipeline
# Target: 12 Isolates (3-Subspecies: abscessus, massiliense, bolletii)
# Analyses: Pangenome, fastANI Phylogeny, & Metabolic Reconstruction
# ==============================================================
# Activate Conda Environment (created in external ssd)
eval "$(conda shell.bash hook)"
conda activate /mnt/samsung_ssd/conda_envs/anvio-9
# STEP 1: Setup Directories
mkdir -p 01_FASTA 02_CONTIGS 03_PAN 04_METABOLISM 05_ANI
# Genome files were downloaded individually from NCBI FTP server
# STEP 2: Decompress FASTA files downloaded via FTP (NCBI)
echo "Decompressing .fna.gz files..."
gunzip -f 01_FASTA/*.fna.gz
# STEP 3: Reformat fasta for anvio, Build contigsDBs, and Annotate
echo "Processing genomes..."
for f in 01_FASTA/*.fna; do
# Extract the GCF number and replace the period with an underscore
MAB=$(basename "$f" _genomic.fna | cut -d'_' -f1,2)
MAB=${MAB//./_}
echo "------------------------------------------------"
echo " Running pipeline for: $MAB "
echo "------------------------------------------------"
anvi-script-reformat-fasta "$f" -o "01_FASTA/${MAB}.fasta" -l 1000 --simplify-names
anvi-gen-contigs-database -f "01_FASTA/${MAB}.fasta" -o "02_CONTIGS/${MAB}.db" -n "$MAB"
anvi-run-hmms -c "02_CONTIGS/${MAB}.db" --num-threads 10
anvi-run-ncbi-cogs -c "02_CONTIGS/${MAB}.db" --num-threads 10
anvi-scan-trnas -c "02_CONTIGS/${MAB}.db" --num-threads 10
anvi-run-scg-taxonomy -c "02_CONTIGS/${MAB}.db" --num-threads 10
anvi-run-kegg-kofams -c "02_CONTIGS/${MAB}.db" --num-threads 10
done
# STEP 4: Quality Control
echo "Generating external genomes file and checking completeness..."
anvi-script-gen-genomes-file --input-dir 02_CONTIGS -o external-genomes.txt
anvi-estimate-genome-completeness -e external-genomes.txt
# STEP 5: Compute Pangenome
echo "Consolidating storage and calculating Pangenome..."
anvi-gen-genomes-storage -e external-genomes.txt -o MAb-GENOMES.db
# MCL inflation set to 10 to resolve the highly conserved MAb core vs accessory genome
anvi-pan-genome -g MAb-GENOMES.db \
--project-name MAb_Pan \
--output-dir 03_PAN \
--num-threads 10 \
--mcl-inflation 10
# STEP 6: Metabolic Reconstruction
echo "Estimating metabolic module completeness..."
anvi-estimate-metabolism -e external-genomes.txt \
-O 04_METABOLISM/MAb_Metabolic_Modules \
--matrix-format
# STEP 6.1: Metabolic Pathway enrichment
anvi-compute-metabolic-enrichment -M 04_METABOLISM/MAb_Metabolic_Modules_modules.txt \
-G 04_METABOLISM/subspecies.txt \
-o MAb_metabolic_enrichment.txt
# Step 6.2: Interactive visualization of metabolic module completeness
anvi-interactive --manual \
-d 04_METABOLISM/MAb_Metabolic_Modules-module_pathwise_completeness-MATRIX.txt \
-p 04_METABOLISM/METABOLISM-PROFILE.db \
--title "MABC Metabolic Completeness & Enrichment"
# STEP 7: Compute Average Nucleotide Identity (ANI)
echo "Calculating genome similarity (ANI)..."
anvi-compute-genome-similarity -e external-genomes.txt \
--program fastANI \
--output-dir 05_ANI \
--num-threads 10 \
--pan-db 03_PAN/MAb_Pan-PAN.db \
--force-overwrite
=====================================================
echo "Pipeline complete! finished at $(date +%T)"
=====================================================
anvi-display-pan -p 03_PAN/MAb_Pan-PAN.db -g MAb-GENOMES.db