| 2 | 1 #!/usr/bin/env python | 
|  | 2 | 
| 50 | 3 import argparse | 
|  | 4 import logging | 
|  | 5 from sys import stdout | 
|  | 6 from subprocess import check_call | 
|  | 7 from shutil import rmtree | 
|  | 8 from tempfile import mkdtemp | 
|  | 9 from os.path import isfile | 
|  | 10 # avoid ugly python IOError when stdout output is piped into another program | 
|  | 11 # and then truncated (such as piping to head) | 
|  | 12 from signal import signal, SIGPIPE, SIG_DFL | 
|  | 13 signal(SIGPIPE, SIG_DFL) | 
|  | 14 | 
| 2 | 15 tool_description = """ | 
|  | 16 Merge PCR duplicates according to random barcode library. | 
|  | 17 | 
| 55 | 18 Barcodes containing uncalled base 'N' are removed. | 
| 2 | 19 | 
|  | 20 Input: | 
|  | 21 * bed6 file containing alignments with fastq read-id in name field | 
| 8 | 22 * fastq library of random barcodes | 
| 2 | 23 | 
|  | 24 Output: | 
|  | 25 * bed6 file with random barcode in name field and number of PCR duplicates as | 
|  | 26   score, sorted by fields chrom, start, stop, strand, name | 
|  | 27 | 
|  | 28 Example usage: | 
|  | 29 - read PCR duplicates from file duplicates.bed and write merged results to file | 
|  | 30   merged.bed: | 
| 55 | 31 merge_pcr_duplicates.py duplicates.bed bclibrary.fa --outfile merged.bed | 
| 2 | 32 """ | 
|  | 33 | 
|  | 34 epilog = """ | 
|  | 35 Author: Daniel Maticzka | 
|  | 36 Copyright: 2015 | 
|  | 37 License: Apache | 
|  | 38 Email: maticzkd@informatik.uni-freiburg.de | 
|  | 39 Status: Testing | 
|  | 40 """ | 
|  | 41 | 
|  | 42 # parse command line arguments | 
|  | 43 parser = argparse.ArgumentParser(description=tool_description, | 
|  | 44                                  epilog=epilog, | 
|  | 45                                  formatter_class=argparse.RawDescriptionHelpFormatter) | 
|  | 46 # positional arguments | 
|  | 47 parser.add_argument( | 
|  | 48     "alignments", | 
|  | 49     help="Path to bed6 file containing alignments.") | 
|  | 50 parser.add_argument( | 
|  | 51     "bclib", | 
| 50 | 52     help="Path to fastq barcode library.") | 
| 2 | 53 # optional arguments | 
|  | 54 parser.add_argument( | 
|  | 55     "-o", "--outfile", | 
| 55 | 56     required=True, | 
| 2 | 57     help="Write results to this file.") | 
|  | 58 # misc arguments | 
|  | 59 parser.add_argument( | 
|  | 60     "-v", "--verbose", | 
|  | 61     help="Be verbose.", | 
|  | 62     action="store_true") | 
|  | 63 parser.add_argument( | 
|  | 64     "-d", "--debug", | 
|  | 65     help="Print lots of debugging information", | 
|  | 66     action="store_true") | 
|  | 67 parser.add_argument( | 
|  | 68     '--version', | 
|  | 69     action='version', | 
| 50 | 70     version='0.2.0') | 
| 2 | 71 | 
|  | 72 args = parser.parse_args() | 
|  | 73 | 
|  | 74 if args.debug: | 
|  | 75     logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s") | 
|  | 76 elif args.verbose: | 
|  | 77     logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s") | 
|  | 78 else: | 
|  | 79     logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s") | 
|  | 80 logging.info("Parsed arguments:") | 
|  | 81 logging.info("  alignments: '{}'".format(args.alignments)) | 
|  | 82 logging.info("  bclib: '{}'".format(args.bclib)) | 
|  | 83 if args.outfile: | 
|  | 84     logging.info("  outfile: enabled writing to file") | 
|  | 85     logging.info("  outfile: '{}'".format(args.outfile)) | 
|  | 86 logging.info("") | 
|  | 87 | 
| 50 | 88 # see if alignments are empty and the tool can quit | 
|  | 89 n_alns = sum(1 for line in open(args.alignments)) | 
|  | 90 if n_alns == 0: | 
|  | 91     logging.warning("WARNING: Working on empty set of alignments, writing empty output.") | 
|  | 92     eventalnout = (open(args.outfile, "w") if args.outfile is not None else stdout) | 
|  | 93     eventalnout.close() | 
|  | 94     exit(0) | 
|  | 95 | 
|  | 96 # check input filenames | 
|  | 97 if not isfile(args.bclib): | 
|  | 98     raise Exception("ERROR: barcode library '{}' not found.") | 
|  | 99 if not isfile(args.alignments): | 
|  | 100     raise Exception("ERROR: alignments '{}' not found.") | 
|  | 101 | 
|  | 102 try: | 
|  | 103     tmpdir = mkdtemp() | 
|  | 104     logging.debug("tmpdir: " + tmpdir) | 
| 2 | 105 | 
| 50 | 106     # prepare alinments | 
| 55 | 107     syscall2 = "cat " + args.alignments + " | awk -F \"\\t\" 'BEGIN{OFS=\"\\t\"}{split($4, a, \" \"); $4 = a[1]; print}'| sort --compress-program=gzip -k4,4 > " + tmpdir + "/alns.csv" | 
| 50 | 108     check_call(syscall2, shell=True) | 
| 2 | 109 | 
| 50 | 110     # join barcode library and alignments | 
| 55 | 111     syscall3 = "cat " + args.bclib + " | awk 'BEGIN{OFS=\"\\t\"}NR%4==1{gsub(/^@/,\"\"); id=$1}NR%4==2{bc=$1}NR%4==3{print id,bc}' | sort --compress-program=gzip -k1,1 | join -1 1 -2 4 - " + tmpdir + "/alns.csv " + " | awk 'BEGIN{OFS=\"\\t\"}$4!~/N/{print $3,$4,$5,$2,$6,$7}' | datamash --sort -g 1,2,3,4,6 count 4 | awk 'BEGIN{OFS=\"\\t\"}$4!~/N/{print $1,$2,$3,$4,$6,$5}' > " + args.outfile | 
|  | 112     # 'chrom', 'start', 'stop', 'bc', 'ndupes', 'strand' | 
| 50 | 113     check_call(syscall3, shell=True) | 
|  | 114 finally: | 
|  | 115     logging.debug("removed tmpdir: " + tmpdir) | 
|  | 116     rmtree(tmpdir) |