diff merge_pcr_duplicates.py @ 50:0b9aab6aaebf draft

Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
author rnateam
date Tue, 26 Jan 2016 04:38:27 -0500
parents 570a7de9f151
children 4bedd35bcdff
line wrap: on
line diff
--- a/merge_pcr_duplicates.py	Sat Dec 19 06:16:22 2015 -0500
+++ b/merge_pcr_duplicates.py	Tue Jan 26 04:38:27 2016 -0500
@@ -1,5 +1,18 @@
 #!/usr/bin/env python
 
+import argparse
+import logging
+from sys import stdout
+import pandas as pd
+from subprocess import check_call
+from shutil import rmtree
+from tempfile import mkdtemp
+from os.path import isfile
+# avoid ugly python IOError when stdout output is piped into another program
+# and then truncated (such as piping to head)
+from signal import signal, SIGPIPE, SIG_DFL
+signal(SIGPIPE, SIG_DFL)
+
 tool_description = """
 Merge PCR duplicates according to random barcode library.
 
@@ -28,24 +41,6 @@
 Status: Testing
 """
 
-import argparse
-import logging
-from sys import stdout
-from Bio import SeqIO
-import pandas as pd
-
-# avoid ugly python IOError when stdout output is piped into another program
-# and then truncated (such as piping to head)
-from signal import signal, SIGPIPE, SIG_DFL
-signal(SIGPIPE, SIG_DFL)
-
-
-def fasta_tuple_generator(fasta_iterator):
-    "Yields id, sequence tuples given an iterator over Biopython SeqIO objects."
-    for record in input_seq_iterator:
-        yield (record.id, str(record.seq))
-
-
 # parse command line arguments
 parser = argparse.ArgumentParser(description=tool_description,
                                  epilog=epilog,
@@ -56,16 +51,11 @@
     help="Path to bed6 file containing alignments.")
 parser.add_argument(
     "bclib",
-    help="Path to fasta barcode library.")
+    help="Path to fastq barcode library.")
 # optional arguments
 parser.add_argument(
     "-o", "--outfile",
     help="Write results to this file.")
-parser.add_argument(
-    "--fasta-library",
-    dest="fasta_library",
-    action="store_true",
-    help="Read random barcode library as fasta format.")
 # misc arguments
 parser.add_argument(
     "-v", "--verbose",
@@ -78,7 +68,7 @@
 parser.add_argument(
     '--version',
     action='version',
-    version='0.1.0')
+    version='0.2.0')
 
 args = parser.parse_args()
 
@@ -96,50 +86,65 @@
     logging.info("  outfile: '{}'".format(args.outfile))
 logging.info("")
 
-# load barcode library into dictionary
-input_handle = open(args.bclib, "rU")
-if args.fasta_library:
-    input_seq_iterator = SeqIO.parse(input_handle, "fasta")
-else:
-    input_seq_iterator = SeqIO.parse(input_handle, "fastq")
-bcs = pd.DataFrame.from_records(
-    data=fasta_tuple_generator(input_seq_iterator),
-    columns=["read_id", "bc"])
+# see if alignments are empty and the tool can quit
+n_alns = sum(1 for line in open(args.alignments))
+if n_alns == 0:
+    logging.warning("WARNING: Working on empty set of alignments, writing empty output.")
+    eventalnout = (open(args.outfile, "w") if args.outfile is not None else stdout)
+    eventalnout.close()
+    exit(0)
+
+# check input filenames
+if not isfile(args.bclib):
+    raise Exception("ERROR: barcode library '{}' not found.")
+if not isfile(args.alignments):
+    raise Exception("ERROR: alignments '{}' not found.")
+
+try:
+    tmpdir = mkdtemp()
+    logging.debug("tmpdir: " + tmpdir)
 
-# load alignments
-alns = pd.read_csv(
-    args.alignments,
-    sep="\t",
-    names=["chrom", "start", "stop", "read_id", "score", "strand"])
+    # prepare barcode library
+    syscall1 = "cat " + args.bclib + " | awk 'BEGIN{OFS=\"\\t\"}NR%4==1{gsub(/^@/,\"\"); id=$1}NR%4==2{bc=$1}NR%4==3{print id,bc}' | sort -k1,1 > " + tmpdir + "/bclib.csv"
+    check_call(syscall1, shell=True)
+
+    # prepare alinments
+    syscall2 = "cat " + args.alignments + " | awk -F \"\\t\" 'BEGIN{OFS=\"\\t\"}{split($4, a, \" \"); $4 = a[1]; print}'| sort -k4,4 > " + tmpdir + "/alns.csv"
+    check_call(syscall2, shell=True)
 
-# keep id parts up to first whitespace
-alns["read_id"] = alns["read_id"].str.split(' ').str.get(0)
+    # join barcode library and alignments
+    syscall3 = "join -1 1 -2 4 " + tmpdir + "/bclib.csv " + tmpdir + "/alns.csv " + " | awk 'BEGIN{OFS=\"\\t\"}{print $3,$4,$5,$2,$6,$7}' > " + tmpdir + "/bcalib.csv"
+    check_call(syscall3, shell=True)
 
-# combine barcode library and alignments
-bcalib = pd.merge(
-    bcs, alns,
-    on="read_id",
-    how="inner",
-    sort=False)
+    # get alignments combined with barcodes
+    bcalib = pd.read_csv(
+        tmpdir + "/bcalib.csv",
+        sep="\t",
+        names=["chrom", "start", "stop", "bc", "score", "strand"])
+finally:
+    logging.debug("removed tmpdir: " + tmpdir)
+    rmtree(tmpdir)
+
+# fail if alignments given but combined library is empty
 if bcalib.empty:
     raise Exception("ERROR: no common entries for alignments and barcode library found. Please check your input files.")
-n_alns = len(alns.index)
+
+# warn if not all alignments could be assigned a barcode
 n_bcalib = len(bcalib.index)
 if n_bcalib < n_alns:
     logging.warning(
-        "{} of {} alignments could not be associated with a random barcode.".format(
-            n_alns - n_bcalib, n_alns))
+        "{} of {} alignments could not be associated with a random barcode.".format(n_alns - n_bcalib, n_alns))
 
 # remove entries with barcodes that has uncalled base N
 bcalib_cleaned = bcalib.drop(bcalib[bcalib.bc.str.contains("N")].index)
 n_bcalib_cleaned = len(bcalib_cleaned)
-if n_bcalib_cleaned < n_bcalib:
-    msg = "{} of {} alignments had random barcodes containing uncalled bases and were dropped.".format(
-        n_bcalib - n_bcalib_cleaned, n_bcalib)
-    if n_bcalib_cleaned < (0.8 * n_bcalib):
-        logging.warning(msg)
-    else:
-        logging.info(msg)
+# if n_bcalib_cleaned < n_bcalib:
+#     msg = "{} of {} alignments had random barcodes containing uncalled bases and were dropped.".format(
+#         n_bcalib - n_bcalib_cleaned, n_bcalib)
+#     if n_bcalib_cleaned < (0.8 * n_bcalib):
+#         logging.warning(msg)
+#     else:
+#         logging.info(msg)
 
 # count and merge pcr duplicates
 # grouping sorts by keys, so the ouput will be properly sorted