Mercurial > repos > bgruening > barcode_collapse

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BarcodeCollapse_pe.xml	Sat Jun 11 10:46:34 2016 -0400
@@ -0,0 +1,48 @@
+<tool id="barcode_collapse" name="Barcode collapse" version="0.1.0.0">
+    <description></description>
+    <requirements>
+        <requirement type="package" version="0.8.4">pysam</requirement>
+    </requirements>
+    <stdio>
+        <!-- Anything other than zero is an error -->
+        <exit_code range="1:" />
+        <exit_code range=":-1" />
+        <!-- In case the return code has not been set propery check stderr too -->
+        <regex match="Error:" />
+        <regex match="Exception:" />
+    </stdio>
+    <command>
+<![CDATA[
+        ln -s '${input_file}' infile.bam &&
+        ln -s '${input_file.metadata.bam_index}' infile.bai &&
+
+        python $__tool_directory__/barcode_collapse_pe.py
+        --bam infile.bam
+        --metrics_file '$metrics_file'
+        -o '$output'
+]]>
+    </command>
+    <inputs>
+        <param name="input_file" type="data" format="bam" label="Unsorted bam file to barcode collapse"/>
+    </inputs>
+    <outputs>
+        <data name="output" format="bam"/>
+        <data name="metrics_file" format="txt"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input_file" value="Trial.bam" ftype="bam"/>
+            <output name="metrics_file" file="metrics.txt"/>
+            <output name="output" file="output.bam"/>
+        </test>
+    </tests>
+    <help>
+<![CDATA[
+Read in a BAM file where the first 9 nucleotides of the read name are the barcode and merge reads mapped to the same position that have the same barcode.
+Assumes paired end reads are adjacent to each other in the output file (i.e only provide unsorted bams). Also assumes that there are no multimappers in the bam file, if there are multimappers behavior is undefined.
+]]>
+    </help>
+    <citations>
+        <citation type="doi">10.1038/nmeth.3810</citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/barcode_collapse_pe.py	Sat Jun 11 10:46:34 2016 -0400
@@ -0,0 +1,90 @@
+__author__ = 'gpratt'
+"""
+
+barcode_collapse.py  read in a .bam file where the
+first 9 nt of the read name
+are the barcode and merge reads mapped to the same position that have the same barcode
+
+"""
+
+from collections import Counter
+import itertools
+from optparse import OptionParser
+import sys
+import pysam
+
+
+def stranded_read_start(read):
+    if read.is_reverse:
+        return read.positions[-1]
+    else:
+        return read.pos
+
+
+def output_metrics(metrics_file, total_count, removed_count):
+    with open(metrics_file, 'w') as metrics:
+        metrics.write("\t".join(["randomer", "total_count", "removed_count"]) + "\n")
+        for barcode in total_count.keys():
+            metrics.write("\t".join(map(str, [barcode, total_count[barcode], removed_count[barcode]])) + "\n")
+
+
+def barcode_collapse(in_bam, out_bam):
+    number_of_unmapped_mate_pairs = 0
+    different_chroms = 0
+    removed_count = Counter()
+    total_count = Counter()
+    result_dict = {}
+
+    with pysam.Samfile(in_bam, 'r') as samfile1:
+        with pysam.Samfile(in_bam, 'r') as samfile2:
+            samfile_read1 = itertools.islice(samfile1, 0, None, 2)
+            samfile_read2 = itertools.islice(samfile2, 1, None, 2)
+            for read1, read2 in itertools.izip(samfile_read1, samfile_read2):
+                if not read1.qname == read2.qname:
+                    print read1.qname, read2.qname
+                    raise Exception("Read Names don't match")
+                if read1.is_unmapped and read1.is_unmapped:
+                    #Both reads don't map, don't even both saving them.
+                    continue
+                if (not read1.is_unmapped and read2.is_unmapped) or (read1.is_unmapped and read2.is_unmapped):
+                    number_of_unmapped_mate_pairs += 1
+                    continue
+                if read1.rname != read2.rname:
+                    different_chroms += 1
+                    continue
+                randomer = read1.qname.split(":")[0]
+
+                start = stranded_read_start(read1)
+                stop = stranded_read_start(read2)
+                strand = "-" if read1.is_reverse else "+"
+                unique_location = (read1.rname, start, stop, strand, randomer)
+                total_count[randomer] += 1
+                if unique_location in result_dict:
+                    removed_count[randomer] += 1
+                    continue
+
+                result_dict[(read1.rname, start, stop, strand, randomer)] = (read1, read2)
+
+        with pysam.Samfile(out_bam, 'wb', template=samfile1) as out_bam:
+            for key, (read1, read2) in result_dict.items():
+                out_bam.write(read1)
+                out_bam.write(read2)
+    return total_count, removed_count
+
+if __name__ == "__main__":
+    description=""""Paired End randomer aware duplciate removal algorithm."""
+    usage="""Assumes paired end reads are adjectent to each other in output file (ie only provide unsorted bams)
+             Also assumes no multimappers in the bam file, if there are multimappers behavior is undefined"""
+    parser = OptionParser(usage=usage, description=description)
+    parser.add_option("-b", "--bam", dest="bam", help="bam file to barcode collapse")
+    parser.add_option("-o", "--out_file", dest="out_file")
+    parser.add_option("-m", "--metrics_file", dest="metrics_file")
+    (options, args) = parser.parse_args()
+
+    if not (options.bam.endswith(".bam")):
+        raise TypeError("%s, not bam file" % options.bam)
+
+    total_count, removed_count = barcode_collapse(options.bam, options.out_file)
+    output_metrics(options.metrics_file, total_count, removed_count)
+
+    sys.exit(0)
Binary file test-data/Trial.bam has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/metrics.txt	Sat Jun 11 10:46:34 2016 -0400
@@ -0,0 +1,7 @@
+randomer	total_count	removed_count
+CCGACGCACC	1	0
+ACCAAGCAAT	1	0
+GCTCCCCACG	1	0
+CGTCATACAG	1	0
+CGCCAACGGA	1	0
+CCCATGGCCC	1	0
Binary file test-data/output.bam has changed