Skip to content

Remove processing of R2 reverse reads, check for fastq files, use Python 3.6, and badge fixes #7

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ sudo: required
dist: xenial
python:
# We don't actually use the Travis Python, but this keeps it organized.
- "3.7"
- "3.6"
install:
- sudo apt-get update
- wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
Expand Down
10 changes: 8 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
install:
python3 setup.py -q install
pip install -e ./
test:
python3 -m pytest
pytest -s
local_test:
barseq -i tests/data/input/sequences -f -e test_barcodes -s tests/data/input/samples.csv -b tests/data/input/barcodes.csv -o tests/dump
local_test_ref_free:
barseq -i tests/data/input/sequences -f -e test -s tests/data/input/samples.csv --reference-free -o tests/dump
clean_dump:
rm -r tests/dump/results
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[![Build Status](https://travis-ci.com/eburgoswisc/barseq.svg?branch=master)](https://travis-ci.com/eburgoswisc/barseq)
![Python 3.7](https://img.shields.io/badge/python-3.7-blue.svg)
![Build Status](https://travis-ci.org/mjmlab/barseq.svg?branch=master)
![Python 3.6](https://img.shields.io/badge/python-3.7-blue.svg)

# barseq

Expand Down
105 changes: 79 additions & 26 deletions barseq/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
from pathlib import Path

# Module import
from .utils import write_output, read_barcodes, format_filename, make_barseq_directories
from .process_reads import count_barcodes
from .utils import *
from .process_reads import *


__author__ = "Emanuel Burgos"
Expand All @@ -39,13 +39,40 @@ def __exit__(self, etype, value, traceback):
class Run:
""" Class that stores settings for barseq processes. """
def __init__(self, args):
# Required parameters
self.experiment = args.experiment
self.sequences = Path(args.input)
self.barcodes = Path(args.barcodes)
self.sequence_dir = Path(args.input)
self.sequence_files = Path(self.sequence_dir)
self.barcodes = args.barcodes
self.sample_ids = read_sample_ID(Path(args.samples))
self.reference_free = args.reference_free
# Settings for sequence search
self.flanking_left = args.flanking_left
self.flanking_right = args.flanking_right
self.barcode_length = args.barcode_length
self.pattern = f"({self.flanking_left}){{e<=1}}([ATGC]{{18}})({self.flanking_right}){{e<=1}}"
# If barcoded provided
if not self.reference_free:
if not Path(self.barcodes).exists():
logger.error('Reference barcode library provided does not exist. '
'If you do not have one, run --reference-free')
self.barcodes = Path(args.barcodes)
# If barcodes provided, grab sample
if self.reference_free:
if not Path(args.samples).exists():
logger.error("You selected to run barseq using --reference-free "
"but did not provide a file for sample names")
self.min_count = args.min_count
# self.barseq_sample_collection = list()
self.sample_dict = dict()
self.path = f"results/{self.experiment}/"
self.log = f"{self.path}log.txt"
self.path = Path(args.output).joinpath(f'results/{self.experiment}')
self.log = self.path.joinpath('log.txt')
self.force = args.force

def get_sample_name(self, seq_tag):
""" Gets sample id given the sequence file tag"""
# TODO: Use grep or regex to find the tag, may be given different ones
return self.sample_ids[seq_tag.name.split('_')[0]]


class SampleRecord:
Expand All @@ -60,46 +87,72 @@ def __init__(self, sample: str, filename: str, barcode_dict):

def main(args) -> None:
"""
Main pipeline for analyzing barseq data. Will be changed to be more modular, for now
I just need the algorithm worked out.
Main pipeline for analyzing barseq data. Will be changed to be more modular

"""
# ---- SET UP SETTINGS ---- #
runner = Run(args)
make_barseq_directories(runner)

# Add file handler
fh = logging.FileHandler(runner.log, mode="w")
fh.setFormatter(logging.Formatter(
"%(asctime)s - %(levelname)s - %(module)s - %(message)s",
datefmt="%Y-%m-%d %H:%M"))
logger.addHandler(fh)

logger.info("***** Starting barseq *****")
# Read in barcode
logger.info(f"Reading in barcodes from {runner.barcodes.name}")
barcodes = read_barcodes(runner.barcodes)
# Process each sequencing file
seq_files_list = sorted(os.listdir(runner.sequences))
for seq_file in seq_files_list:
if not seq_file.endswith(".DS_Store"):
sample = format_filename(seq_file)
logger.info(f"Counting Barcodes in {sample}")
runner.sample_dict[sample] = deepcopy(barcodes)
# Change cwd
with Cd(runner.sequences):
count_barcodes(seq_file, runner.sample_dict[sample])

# TODO: Add basic analysis
# Read in barcode if needed
if runner.reference_free:
logger.info('Will find putative unique barcodes and count without referencing a library')

# Write to output
logger.info(f"Writing results to {runner.path}")
write_output(runner.sample_dict, barcodes, runner)
else:
logger.info(f"Reading in barcodes from {runner.barcodes.name}")
barcodes = read_barcodes(runner.barcodes)

# Process each sequencing file
data_collection = []
for seq_file in runner.sequence_files.glob('*R1*'):
# Split seq_file name into its suffixes
if '.fastq' in seq_file.suffixes or '.gz' in seq_file.suffixes:
sample = runner.get_sample_name(seq_file)
logger.info(f"Counting Barcodes in {sample}")
# IF GIVEN BARCODES
if not runner.reference_free:
runner.sample_dict[sample] = deepcopy(barcodes)
count_barcodes(seq_file, runner.sample_dict[sample], runner)
# Write to output
logger.info(f"Writing results to {runner.path}")
# REFERENCE FREE
if runner.reference_free:
# Skip control and undetermined fastq files
if seq_file.stem.startswith('Undetermined_S0') or seq_file.stem.startswith('S1000'):
pass
else:
# Find sample name
sample_name = runner.get_sample_name(seq_file)
barcode_counts_dict, good_barcode_counts = count_barcodes_reference_free(seq_file=seq_file, runner=runner)
# Calculate stats on barcode counts
stats_barcode_dict = calculate_barcode_perc(barcode_counts_dict,
total_barcodes=good_barcode_counts,
min_count=runner.min_count)

# Get formatted row
row = format_sample_data(stats_barcode_dict, good_barcode_counts, sample_id=sample_name)
data_collection.append(row)

if data_collection:
save_results(data_collection, runner)

else:
write_output(runner.sample_dict, runner)

# Confirm completion of barseq
logger.info("***** barseq is complete! *****")


if __name__ == "__main__":

""" # -------- START HERE -------- # """
args = sys.argv[1:]
main(args)
Expand Down
62 changes: 51 additions & 11 deletions barseq/process_reads.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python3
3 # !/usr/bin/env python3

"""
Count barcode frequency in fastq/fasta files given by user.
Expand All @@ -8,7 +8,9 @@
import screed
import logging
import regex as re
import sys

# Module imports
from .utils import compile_regex_patterns

__author__ = "Emanuel Burgos"
__email__ = "[email protected]"
Expand All @@ -17,27 +19,24 @@
logger = logging.getLogger("barseq")


def count_barcodes(seq_file, barcode_dict) -> None:
def count_barcodes(seq_file, barcode_dict, runner) -> None:
"""
Count barcode frequency in sequence file.
Returns a DataFrame object

:param seq_file: file with reads
:param runner: Run object
:param barcode_dict: barcode dictionary of sample
:return:
"""
_other_reads = list()
# Compile regex patterns
flank_regex = re.compile("(GCTCATGCACTTGATTCC){e<=1}([ATGC]{18})(GACTTGACCTGGATGTCT){e<=1}")
barcode_regex = dict()
for b in barcode_dict:
barcode_regex[b] = re.compile("(%s){e<=1}" % b)
flank_regex, barcode_regex = compile_regex_patterns(barcode_dict, runner)
# Open sequence file
with screed.open(seq_file) as reads:
n_reads = 0
for read in reads:
try:
putative_barcode = re.search(flank_regex, read.sequence)[2]
putative_barcode = re.search(runner.pattern, read.sequence)[2]
for known_barcode in barcode_regex:
if re.search(barcode_regex[known_barcode], putative_barcode):
barcode_dict[known_barcode]["count"] += 1
Expand All @@ -56,10 +55,51 @@ def count_barcodes(seq_file, barcode_dict) -> None:
_other_reads = barcode_dict['_other']['count']

logger.info(f"For {seq_file}, {matched_reads} of "
f"{n_reads} ({round((matched_reads/n_reads) * 100, 2)}%) matched known barcodes.")
logger.info(f"Reads without barcode match: {_other_reads} ({round((_other_reads/n_reads)*100, 2)}%) for {seq_file}")
f"{n_reads} ({round((matched_reads / n_reads) * 100, 2)}%) matched known barcodes.")
logger.info(
f"Reads without barcode match: {_other_reads} ({round((_other_reads / n_reads) * 100, 2)}%) for {seq_file}")
return


def count_barcodes_reference_free(seq_file, runner):
"""
Count barcode frequency in sequence file.
Returns a DataFrame object

:param seq_file: file with reads
:param runner: Run object
:return barcode_counts:
"""
# Get regex pattern
# flank_regex = compile_regex_patterns(runner=runner)
# Search for barcodes in sequence file
# Barcode count
barcode_counts = dict()
barcode_counts["no_barcode_found"] = 0
n_reads = 0
with screed.open(seq_file) as reads:
for read in reads:
match = re.search(runner.pattern, read.sequence)
if match:
# Add to dictionary and count
barcode = match.groups(0)[1]
if barcode not in barcode_counts.keys():
barcode_counts[barcode] = 0
barcode_counts[barcode] += 1
else:
barcode_counts["no_barcode_found"] += 1
n_reads += 1

# Calculate matched reads
good_barcode_count = sum([barcode_counts[s] for s in barcode_counts if s != "no_barcode_found"])
_other_reads = barcode_counts['no_barcode_found']

logger.info(f"For {seq_file}, {good_barcode_count} of "
f"{n_reads} ({round((good_barcode_count / n_reads) * 100, 2)}%) are good barcodes.")
logger.info(
f"Reads without barcode match: {_other_reads} ({round((_other_reads / n_reads) * 100, 2)}%) for {seq_file}")
return barcode_counts, good_barcode_count


if __name__ == '__main__':
pass
13 changes: 13 additions & 0 deletions barseq/tests/data/input/barcodes.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Barcode,Gene
ATGAAGACTGTTGCCGTA,bar1
CACGACGCCCTCCGCGGA,bar2
ACTATTACGCAAAATAAT,bar3
ATGGAAGATATTATTATT,bar4
CCTCTCCAACCGGGTCTG,bar5
CCCGGTCGCCTAGCCCCG,bar6
GGCCCCCCGCCCGTCCCC,bar7
GGATCACTGCTAGCGTAT,bar8
CCTGCAGCAGCGGCCCGC,bar9
ACACATGCAGACATAGAG,bar10
CGCGCCATCCGCCGCCCA,bar11
AATATTCAGATGGGACGT,bar12
18 changes: 5 additions & 13 deletions barseq/tests/data/input/samples.csv
Original file line number Diff line number Diff line change
@@ -1,13 +1,5 @@
Barcode,Gene
ATGAAGACTGTTGCCGTA,bar1
CACGACGCCCTCCGCGGA,bar2
ACTATTACGCAAAATAAT,bar3
ATGGAAGATATTATTATT,bar4
CCTCTCCAACCGGGTCTG,bar5
CCCGGTCGCCTAGCCCCG,bar6
GGCCCCCCGCCCGTCCCC,bar7
GGATCACTGCTAGCGTAT,bar8
CCTGCAGCAGCGGCCCGC,bar9
ACACATGCAGACATAGAG,bar10
CGCGCCATCCGCCGCCCA,bar11
AATATTCAGATGGGACGT,bar12
ID,Sample_descriptor
data1,sample1
data2,sample2
data3,sample3
Undetermined,Undetermined
4 changes: 4 additions & 0 deletions barseq/tests/data/input/samples_ref_free.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
ID,Sample_descriptor
S001,WT-bar_1
S002,WT-bar_2
S003,WT-bar_3
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
14 changes: 0 additions & 14 deletions barseq/tests/data/output/barcode_counts_table.csv

This file was deleted.

14 changes: 14 additions & 0 deletions barseq/tests/data/output/barcode_counts_table.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
Gene Barcodes sample1 sample2 sample3
bar1 ATGAAGACTGTTGCCGTA 15 7 16
bar2 CACGACGCCCTCCGCGGA 13 19 12
bar3 ACTATTACGCAAAATAAT 11 10 13
bar4 ATGGAAGATATTATTATT 2 7 5
bar5 CCTCTCCAACCGGGTCTG 9 13 7
bar6 CCCGGTCGCCTAGCCCCG 8 4 9
bar7 GGCCCCCCGCCCGTCCCC 3 4 2
bar8 GGATCACTGCTAGCGTAT 4 2 2
bar9 CCTGCAGCAGCGGCCCGC 4 4 7
bar10 ACACATGCAGACATAGAG 4 8 5
bar11 CGCGCCATCCGCCGCCCA 4 9 3
bar12 AATATTCAGATGGGACGT 9 10 17
_other _other 40 29 28
28 changes: 15 additions & 13 deletions barseq/tests/data/output/log.txt
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
2019-06-11 10:35 - INFO - main - ***** Starting barseq *****
2019-06-11 10:35 - INFO - main - Reading in barcodes from samples.csv
2019-06-11 10:35 - INFO - main - Counting Barcodes in data1
2019-06-11 10:35 - INFO - process_reads - For data1.fastq, 86 of 126 (68.25%) matched known barcodes.
2019-06-11 10:35 - INFO - process_reads - Reads without barcode match: 40 (31.75%) for data1.fastq
2019-06-11 10:35 - INFO - main - Counting Barcodes in data2
2019-06-11 10:35 - INFO - process_reads - For data2.fastq, 97 of 126 (76.98%) matched known barcodes.
2019-06-11 10:35 - INFO - process_reads - Reads without barcode match: 29 (23.02%) for data2.fastq
2019-06-11 10:35 - INFO - main - Counting Barcodes in data3
2019-06-11 10:35 - INFO - process_reads - For data3.fastq, 98 of 126 (77.78%) matched known barcodes.
2019-06-11 10:35 - INFO - process_reads - Reads without barcode match: 28 (22.22%) for data3.fastq
2019-06-11 10:35 - INFO - main - Writing results to results/barseq_pytest
2019-06-11 10:35 - INFO - main - ***** barseq is complete! *****
2022-10-06 14:07 - INFO - main - ***** Starting barseq *****
2022-10-06 14:07 - INFO - main - Reading in barcodes from barcodes.csv
2022-10-06 14:07 - INFO - main - Counting Barcodes in sample1
2022-10-06 14:07 - INFO - process_reads - For tests/data/input/sequences/data1_R1.fastq, 86 of 126 (68.25%) matched known barcodes.
2022-10-06 14:07 - INFO - process_reads - Reads without barcode match: 40 (31.75%) for tests/data/input/sequences/data1_R1.fastq
2022-10-06 14:07 - INFO - main - Writing results to tests/dump/results/test_barcodes
2022-10-06 14:07 - INFO - main - Counting Barcodes in sample3
2022-10-06 14:07 - INFO - process_reads - For tests/data/input/sequences/data3_R1.fastq, 98 of 126 (77.78%) matched known barcodes.
2022-10-06 14:07 - INFO - process_reads - Reads without barcode match: 28 (22.22%) for tests/data/input/sequences/data3_R1.fastq
2022-10-06 14:07 - INFO - main - Writing results to tests/dump/results/test_barcodes
2022-10-06 14:07 - INFO - main - Counting Barcodes in sample2
2022-10-06 14:07 - INFO - process_reads - For tests/data/input/sequences/data2_R1.fastq, 97 of 126 (76.98%) matched known barcodes.
2022-10-06 14:07 - INFO - process_reads - Reads without barcode match: 29 (23.02%) for tests/data/input/sequences/data2_R1.fastq
2022-10-06 14:07 - INFO - main - Writing results to tests/dump/results/test_barcodes
2022-10-06 14:07 - INFO - main - ***** barseq is complete! *****
Loading