# HG changeset patch # User bgruening # Date 1465656394 14400 # Node ID 38a8018e57a5cf9d544b7209a396c2cfad5e01a6 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/barcode_collapse commit cb3ef4df8ea5a705039b32f6959d7513462cfcfc diff -r 000000000000 -r 38a8018e57a5 BarcodeCollapse_pe.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BarcodeCollapse_pe.xml Sat Jun 11 10:46:34 2016 -0400 @@ -0,0 +1,48 @@ + + + + pysam + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 10.1038/nmeth.3810 + + diff -r 000000000000 -r 38a8018e57a5 barcode_collapse_pe.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/barcode_collapse_pe.py Sat Jun 11 10:46:34 2016 -0400 @@ -0,0 +1,90 @@ +__author__ = 'gpratt' +""" + +barcode_collapse.py read in a .bam file where the +first 9 nt of the read name +are the barcode and merge reads mapped to the same position that have the same barcode + +""" + +from collections import Counter +import itertools +from optparse import OptionParser +import sys +import pysam + + +def stranded_read_start(read): + if read.is_reverse: + return read.positions[-1] + else: + return read.pos + + +def output_metrics(metrics_file, total_count, removed_count): + with open(metrics_file, 'w') as metrics: + metrics.write("\t".join(["randomer", "total_count", "removed_count"]) + "\n") + for barcode in total_count.keys(): + metrics.write("\t".join(map(str, [barcode, total_count[barcode], removed_count[barcode]])) + "\n") + + +def barcode_collapse(in_bam, out_bam): + number_of_unmapped_mate_pairs = 0 + different_chroms = 0 + removed_count = Counter() + total_count = Counter() + result_dict = {} + + with pysam.Samfile(in_bam, 'r') as samfile1: + with pysam.Samfile(in_bam, 'r') as samfile2: + samfile_read1 = itertools.islice(samfile1, 0, None, 2) + samfile_read2 = itertools.islice(samfile2, 1, None, 2) + for read1, read2 in itertools.izip(samfile_read1, samfile_read2): + if not read1.qname == read2.qname: + print read1.qname, read2.qname + raise Exception("Read Names don't match") + if read1.is_unmapped and read1.is_unmapped: + #Both reads don't map, don't even both saving them. + continue + if (not read1.is_unmapped and read2.is_unmapped) or (read1.is_unmapped and read2.is_unmapped): + number_of_unmapped_mate_pairs += 1 + continue + if read1.rname != read2.rname: + different_chroms += 1 + continue + randomer = read1.qname.split(":")[0] + + start = stranded_read_start(read1) + stop = stranded_read_start(read2) + strand = "-" if read1.is_reverse else "+" + unique_location = (read1.rname, start, stop, strand, randomer) + total_count[randomer] += 1 + if unique_location in result_dict: + removed_count[randomer] += 1 + continue + + result_dict[(read1.rname, start, stop, strand, randomer)] = (read1, read2) + + with pysam.Samfile(out_bam, 'wb', template=samfile1) as out_bam: + for key, (read1, read2) in result_dict.items(): + out_bam.write(read1) + out_bam.write(read2) + return total_count, removed_count + +if __name__ == "__main__": + description=""""Paired End randomer aware duplciate removal algorithm.""" + usage="""Assumes paired end reads are adjectent to each other in output file (ie only provide unsorted bams) + Also assumes no multimappers in the bam file, if there are multimappers behavior is undefined""" + parser = OptionParser(usage=usage, description=description) + parser.add_option("-b", "--bam", dest="bam", help="bam file to barcode collapse") + parser.add_option("-o", "--out_file", dest="out_file") + parser.add_option("-m", "--metrics_file", dest="metrics_file") + (options, args) = parser.parse_args() + + if not (options.bam.endswith(".bam")): + raise TypeError("%s, not bam file" % options.bam) + + total_count, removed_count = barcode_collapse(options.bam, options.out_file) + output_metrics(options.metrics_file, total_count, removed_count) + + sys.exit(0) diff -r 000000000000 -r 38a8018e57a5 test-data/Trial.bam Binary file test-data/Trial.bam has changed diff -r 000000000000 -r 38a8018e57a5 test-data/metrics.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/metrics.txt Sat Jun 11 10:46:34 2016 -0400 @@ -0,0 +1,7 @@ +randomer total_count removed_count +CCGACGCACC 1 0 +ACCAAGCAAT 1 0 +GCTCCCCACG 1 0 +CGTCATACAG 1 0 +CGCCAACGGA 1 0 +CCCATGGCCC 1 0 diff -r 000000000000 -r 38a8018e57a5 test-data/output.bam Binary file test-data/output.bam has changed