annotate merge_pcr_duplicates.py @ 58:bbbae1ee87e0 draft default tip

fix for flexbar with small data issue
author rnateam
date Tue, 16 Feb 2016 10:08:58 -0500
parents 4bedd35bcdff
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
1 #!/usr/bin/env python
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
2
50
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
3 import argparse
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
4 import logging
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
5 from sys import stdout
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
6 from subprocess import check_call
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
7 from shutil import rmtree
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
8 from tempfile import mkdtemp
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
9 from os.path import isfile
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
10 # avoid ugly python IOError when stdout output is piped into another program
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
11 # and then truncated (such as piping to head)
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
12 from signal import signal, SIGPIPE, SIG_DFL
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
13 signal(SIGPIPE, SIG_DFL)
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
14
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
15 tool_description = """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
16 Merge PCR duplicates according to random barcode library.
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
17
55
4bedd35bcdff Move deps from macros to individual tools
rnateam
parents: 50
diff changeset
18 Barcodes containing uncalled base 'N' are removed.
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
19
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
20 Input:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
21 * bed6 file containing alignments with fastq read-id in name field
8
17ef0e0dae68 Uploaded
rnateam
parents: 2
diff changeset
22 * fastq library of random barcodes
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
23
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
24 Output:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
25 * bed6 file with random barcode in name field and number of PCR duplicates as
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
26 score, sorted by fields chrom, start, stop, strand, name
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
27
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
28 Example usage:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
29 - read PCR duplicates from file duplicates.bed and write merged results to file
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
30 merged.bed:
55
4bedd35bcdff Move deps from macros to individual tools
rnateam
parents: 50
diff changeset
31 merge_pcr_duplicates.py duplicates.bed bclibrary.fa --outfile merged.bed
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
32 """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
33
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
34 epilog = """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
35 Author: Daniel Maticzka
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
36 Copyright: 2015
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
37 License: Apache
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
38 Email: maticzkd@informatik.uni-freiburg.de
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
39 Status: Testing
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
40 """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
41
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
42 # parse command line arguments
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
43 parser = argparse.ArgumentParser(description=tool_description,
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
44 epilog=epilog,
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
45 formatter_class=argparse.RawDescriptionHelpFormatter)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
46 # positional arguments
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
47 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
48 "alignments",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
49 help="Path to bed6 file containing alignments.")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
50 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
51 "bclib",
50
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
52 help="Path to fastq barcode library.")
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
53 # optional arguments
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
54 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
55 "-o", "--outfile",
55
4bedd35bcdff Move deps from macros to individual tools
rnateam
parents: 50
diff changeset
56 required=True,
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
57 help="Write results to this file.")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
58 # misc arguments
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
59 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
60 "-v", "--verbose",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
61 help="Be verbose.",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
62 action="store_true")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
63 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
64 "-d", "--debug",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
65 help="Print lots of debugging information",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
66 action="store_true")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
67 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
68 '--version',
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
69 action='version',
50
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
70 version='0.2.0')
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
71
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
72 args = parser.parse_args()
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
73
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
74 if args.debug:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
75 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
76 elif args.verbose:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
77 logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
78 else:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
79 logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
80 logging.info("Parsed arguments:")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
81 logging.info(" alignments: '{}'".format(args.alignments))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
82 logging.info(" bclib: '{}'".format(args.bclib))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
83 if args.outfile:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
84 logging.info(" outfile: enabled writing to file")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
85 logging.info(" outfile: '{}'".format(args.outfile))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
86 logging.info("")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
87
50
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
88 # see if alignments are empty and the tool can quit
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
89 n_alns = sum(1 for line in open(args.alignments))
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
90 if n_alns == 0:
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
91 logging.warning("WARNING: Working on empty set of alignments, writing empty output.")
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
92 eventalnout = (open(args.outfile, "w") if args.outfile is not None else stdout)
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
93 eventalnout.close()
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
94 exit(0)
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
95
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
96 # check input filenames
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
97 if not isfile(args.bclib):
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
98 raise Exception("ERROR: barcode library '{}' not found.")
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
99 if not isfile(args.alignments):
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
100 raise Exception("ERROR: alignments '{}' not found.")
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
101
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
102 try:
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
103 tmpdir = mkdtemp()
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
104 logging.debug("tmpdir: " + tmpdir)
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
105
50
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
106 # prepare alinments
55
4bedd35bcdff Move deps from macros to individual tools
rnateam
parents: 50
diff changeset
107 syscall2 = "cat " + args.alignments + " | awk -F \"\\t\" 'BEGIN{OFS=\"\\t\"}{split($4, a, \" \"); $4 = a[1]; print}'| sort --compress-program=gzip -k4,4 > " + tmpdir + "/alns.csv"
50
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
108 check_call(syscall2, shell=True)
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
109
50
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
110 # join barcode library and alignments
58
bbbae1ee87e0 fix for flexbar with small data issue
rnateam
parents: 55
diff changeset
111 # after join: id, bc, chr, start, stop, mapscore, strand
bbbae1ee87e0 fix for flexbar with small data issue
rnateam
parents: 55
diff changeset
112 # after datamash: bc, chr, start, stop, strand, ndupes, idrepresentative
bbbae1ee87e0 fix for flexbar with small data issue
rnateam
parents: 55
diff changeset
113 syscall3 = "cat " + \
bbbae1ee87e0 fix for flexbar with small data issue
rnateam
parents: 55
diff changeset
114 args.bclib + \
bbbae1ee87e0 fix for flexbar with small data issue
rnateam
parents: 55
diff changeset
115 " | awk 'BEGIN{OFS=\"\\t\"}NR%4==1{gsub(/^@/,\"\"); id=$1}NR%4==2{bc=$1}NR%4==3{print id,bc}' " + \
bbbae1ee87e0 fix for flexbar with small data issue
rnateam
parents: 55
diff changeset
116 " | sort --compress-program=gzip -k1,1 | join -1 1 -2 4 - " + tmpdir + "/alns.csv " + \
bbbae1ee87e0 fix for flexbar with small data issue
rnateam
parents: 55
diff changeset
117 " | awk 'BEGIN{OFS=\"\\t\"}$2!~/N/{print $1,$2,$3,$4,$5,$6,$7}' " + \
bbbae1ee87e0 fix for flexbar with small data issue
rnateam
parents: 55
diff changeset
118 " | datamash --sort -g 2,3,4,5,7 count 2 first 1 " + \
bbbae1ee87e0 fix for flexbar with small data issue
rnateam
parents: 55
diff changeset
119 " | awk 'BEGIN{OFS=\"\\t\"}{print $2,$3,$4,$7,$6,$5}' > " + args.outfile
50
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
120 check_call(syscall3, shell=True)
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
121 finally:
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
122 logging.debug("removed tmpdir: " + tmpdir)
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
123 rmtree(tmpdir)