Mercurial > repos > bgruening > barcode_collapse
changeset 0:38a8018e57a5 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/barcode_collapse commit cb3ef4df8ea5a705039b32f6959d7513462cfcfc
author | bgruening |
---|---|
date | Sat, 11 Jun 2016 10:46:34 -0400 |
parents | |
children | |
files | BarcodeCollapse_pe.xml barcode_collapse_pe.py test-data/Trial.bam test-data/metrics.txt test-data/output.bam |
diffstat | 5 files changed, 145 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BarcodeCollapse_pe.xml Sat Jun 11 10:46:34 2016 -0400 @@ -0,0 +1,48 @@ +<tool id="barcode_collapse" name="Barcode collapse" version="0.1.0.0"> + <description></description> + <requirements> + <requirement type="package" version="0.8.4">pysam</requirement> + </requirements> + <stdio> + <!-- Anything other than zero is an error --> + <exit_code range="1:" /> + <exit_code range=":-1" /> + <!-- In case the return code has not been set propery check stderr too --> + <regex match="Error:" /> + <regex match="Exception:" /> + </stdio> + <command> +<![CDATA[ + ln -s '${input_file}' infile.bam && + ln -s '${input_file.metadata.bam_index}' infile.bai && + + python $__tool_directory__/barcode_collapse_pe.py + --bam infile.bam + --metrics_file '$metrics_file' + -o '$output' +]]> + </command> + <inputs> + <param name="input_file" type="data" format="bam" label="Unsorted bam file to barcode collapse"/> + </inputs> + <outputs> + <data name="output" format="bam"/> + <data name="metrics_file" format="txt"/> + </outputs> + <tests> + <test> + <param name="input_file" value="Trial.bam" ftype="bam"/> + <output name="metrics_file" file="metrics.txt"/> + <output name="output" file="output.bam"/> + </test> + </tests> + <help> +<![CDATA[ +Read in a BAM file where the first 9 nucleotides of the read name are the barcode and merge reads mapped to the same position that have the same barcode. +Assumes paired end reads are adjacent to each other in the output file (i.e only provide unsorted bams). Also assumes that there are no multimappers in the bam file, if there are multimappers behavior is undefined. +]]> + </help> + <citations> + <citation type="doi">10.1038/nmeth.3810</citation> + </citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/barcode_collapse_pe.py Sat Jun 11 10:46:34 2016 -0400 @@ -0,0 +1,90 @@ +__author__ = 'gpratt' +""" + +barcode_collapse.py read in a .bam file where the +first 9 nt of the read name +are the barcode and merge reads mapped to the same position that have the same barcode + +""" + +from collections import Counter +import itertools +from optparse import OptionParser +import sys +import pysam + + +def stranded_read_start(read): + if read.is_reverse: + return read.positions[-1] + else: + return read.pos + + +def output_metrics(metrics_file, total_count, removed_count): + with open(metrics_file, 'w') as metrics: + metrics.write("\t".join(["randomer", "total_count", "removed_count"]) + "\n") + for barcode in total_count.keys(): + metrics.write("\t".join(map(str, [barcode, total_count[barcode], removed_count[barcode]])) + "\n") + + +def barcode_collapse(in_bam, out_bam): + number_of_unmapped_mate_pairs = 0 + different_chroms = 0 + removed_count = Counter() + total_count = Counter() + result_dict = {} + + with pysam.Samfile(in_bam, 'r') as samfile1: + with pysam.Samfile(in_bam, 'r') as samfile2: + samfile_read1 = itertools.islice(samfile1, 0, None, 2) + samfile_read2 = itertools.islice(samfile2, 1, None, 2) + for read1, read2 in itertools.izip(samfile_read1, samfile_read2): + if not read1.qname == read2.qname: + print read1.qname, read2.qname + raise Exception("Read Names don't match") + if read1.is_unmapped and read1.is_unmapped: + #Both reads don't map, don't even both saving them. + continue + if (not read1.is_unmapped and read2.is_unmapped) or (read1.is_unmapped and read2.is_unmapped): + number_of_unmapped_mate_pairs += 1 + continue + if read1.rname != read2.rname: + different_chroms += 1 + continue + randomer = read1.qname.split(":")[0] + + start = stranded_read_start(read1) + stop = stranded_read_start(read2) + strand = "-" if read1.is_reverse else "+" + unique_location = (read1.rname, start, stop, strand, randomer) + total_count[randomer] += 1 + if unique_location in result_dict: + removed_count[randomer] += 1 + continue + + result_dict[(read1.rname, start, stop, strand, randomer)] = (read1, read2) + + with pysam.Samfile(out_bam, 'wb', template=samfile1) as out_bam: + for key, (read1, read2) in result_dict.items(): + out_bam.write(read1) + out_bam.write(read2) + return total_count, removed_count + +if __name__ == "__main__": + description=""""Paired End randomer aware duplciate removal algorithm.""" + usage="""Assumes paired end reads are adjectent to each other in output file (ie only provide unsorted bams) + Also assumes no multimappers in the bam file, if there are multimappers behavior is undefined""" + parser = OptionParser(usage=usage, description=description) + parser.add_option("-b", "--bam", dest="bam", help="bam file to barcode collapse") + parser.add_option("-o", "--out_file", dest="out_file") + parser.add_option("-m", "--metrics_file", dest="metrics_file") + (options, args) = parser.parse_args() + + if not (options.bam.endswith(".bam")): + raise TypeError("%s, not bam file" % options.bam) + + total_count, removed_count = barcode_collapse(options.bam, options.out_file) + output_metrics(options.metrics_file, total_count, removed_count) + + sys.exit(0)