2
|
1 #!/usr/bin/env python
|
|
2
|
50
|
3 import argparse
|
|
4 import logging
|
|
5 from sys import stdout
|
|
6 from subprocess import check_call
|
|
7 from shutil import rmtree
|
|
8 from tempfile import mkdtemp
|
|
9 from os.path import isfile
|
|
10 # avoid ugly python IOError when stdout output is piped into another program
|
|
11 # and then truncated (such as piping to head)
|
|
12 from signal import signal, SIGPIPE, SIG_DFL
|
|
13 signal(SIGPIPE, SIG_DFL)
|
|
14
|
2
|
15 tool_description = """
|
|
16 Merge PCR duplicates according to random barcode library.
|
|
17
|
55
|
18 Barcodes containing uncalled base 'N' are removed.
|
2
|
19
|
|
20 Input:
|
|
21 * bed6 file containing alignments with fastq read-id in name field
|
8
|
22 * fastq library of random barcodes
|
2
|
23
|
|
24 Output:
|
|
25 * bed6 file with random barcode in name field and number of PCR duplicates as
|
|
26 score, sorted by fields chrom, start, stop, strand, name
|
|
27
|
|
28 Example usage:
|
|
29 - read PCR duplicates from file duplicates.bed and write merged results to file
|
|
30 merged.bed:
|
55
|
31 merge_pcr_duplicates.py duplicates.bed bclibrary.fa --outfile merged.bed
|
2
|
32 """
|
|
33
|
|
34 epilog = """
|
|
35 Author: Daniel Maticzka
|
|
36 Copyright: 2015
|
|
37 License: Apache
|
|
38 Email: maticzkd@informatik.uni-freiburg.de
|
|
39 Status: Testing
|
|
40 """
|
|
41
|
|
42 # parse command line arguments
|
|
43 parser = argparse.ArgumentParser(description=tool_description,
|
|
44 epilog=epilog,
|
|
45 formatter_class=argparse.RawDescriptionHelpFormatter)
|
|
46 # positional arguments
|
|
47 parser.add_argument(
|
|
48 "alignments",
|
|
49 help="Path to bed6 file containing alignments.")
|
|
50 parser.add_argument(
|
|
51 "bclib",
|
50
|
52 help="Path to fastq barcode library.")
|
2
|
53 # optional arguments
|
|
54 parser.add_argument(
|
|
55 "-o", "--outfile",
|
55
|
56 required=True,
|
2
|
57 help="Write results to this file.")
|
|
58 # misc arguments
|
|
59 parser.add_argument(
|
|
60 "-v", "--verbose",
|
|
61 help="Be verbose.",
|
|
62 action="store_true")
|
|
63 parser.add_argument(
|
|
64 "-d", "--debug",
|
|
65 help="Print lots of debugging information",
|
|
66 action="store_true")
|
|
67 parser.add_argument(
|
|
68 '--version',
|
|
69 action='version',
|
50
|
70 version='0.2.0')
|
2
|
71
|
|
72 args = parser.parse_args()
|
|
73
|
|
74 if args.debug:
|
|
75 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
|
|
76 elif args.verbose:
|
|
77 logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
|
|
78 else:
|
|
79 logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
|
|
80 logging.info("Parsed arguments:")
|
|
81 logging.info(" alignments: '{}'".format(args.alignments))
|
|
82 logging.info(" bclib: '{}'".format(args.bclib))
|
|
83 if args.outfile:
|
|
84 logging.info(" outfile: enabled writing to file")
|
|
85 logging.info(" outfile: '{}'".format(args.outfile))
|
|
86 logging.info("")
|
|
87
|
50
|
88 # see if alignments are empty and the tool can quit
|
|
89 n_alns = sum(1 for line in open(args.alignments))
|
|
90 if n_alns == 0:
|
|
91 logging.warning("WARNING: Working on empty set of alignments, writing empty output.")
|
|
92 eventalnout = (open(args.outfile, "w") if args.outfile is not None else stdout)
|
|
93 eventalnout.close()
|
|
94 exit(0)
|
|
95
|
|
96 # check input filenames
|
|
97 if not isfile(args.bclib):
|
|
98 raise Exception("ERROR: barcode library '{}' not found.")
|
|
99 if not isfile(args.alignments):
|
|
100 raise Exception("ERROR: alignments '{}' not found.")
|
|
101
|
|
102 try:
|
|
103 tmpdir = mkdtemp()
|
|
104 logging.debug("tmpdir: " + tmpdir)
|
2
|
105
|
50
|
106 # prepare alinments
|
55
|
107 syscall2 = "cat " + args.alignments + " | awk -F \"\\t\" 'BEGIN{OFS=\"\\t\"}{split($4, a, \" \"); $4 = a[1]; print}'| sort --compress-program=gzip -k4,4 > " + tmpdir + "/alns.csv"
|
50
|
108 check_call(syscall2, shell=True)
|
2
|
109
|
50
|
110 # join barcode library and alignments
|
55
|
111 syscall3 = "cat " + args.bclib + " | awk 'BEGIN{OFS=\"\\t\"}NR%4==1{gsub(/^@/,\"\"); id=$1}NR%4==2{bc=$1}NR%4==3{print id,bc}' | sort --compress-program=gzip -k1,1 | join -1 1 -2 4 - " + tmpdir + "/alns.csv " + " | awk 'BEGIN{OFS=\"\\t\"}$4!~/N/{print $3,$4,$5,$2,$6,$7}' | datamash --sort -g 1,2,3,4,6 count 4 | awk 'BEGIN{OFS=\"\\t\"}$4!~/N/{print $1,$2,$3,$4,$6,$5}' > " + args.outfile
|
|
112 # 'chrom', 'start', 'stop', 'bc', 'ndupes', 'strand'
|
50
|
113 check_call(syscall3, shell=True)
|
|
114 finally:
|
|
115 logging.debug("removed tmpdir: " + tmpdir)
|
|
116 rmtree(tmpdir)
|