annotate rm_spurious_events.py @ 58:bbbae1ee87e0 draft default tip

fix for flexbar with small data issue
author rnateam
date Tue, 16 Feb 2016 10:08:58 -0500
parents 0b9aab6aaebf
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
1 #!/usr/bin/env python
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
2
50
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
3 import argparse
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
4 import logging
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
5 from subprocess import check_call
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
6 import os
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
7
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
8 tool_description = """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
9 Remove spurious events originating from errors in random sequence tags.
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
10
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
11 This script compares all events sharing the same coordinates. Among each group
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
12 of events the maximum number of PCR duplicates is determined. All events that
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
13 are supported by less than 10 percent of this maximum count are removed.
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
14
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
15 Input:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
16 * bed6 file containing crosslinking events with score field set to number of PCR
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
17 duplicates
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
18
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
19 Output:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
20 * bed6 file with spurious crosslinking events removed, sorted by fields chrom,
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
21 start, stop, strand
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
22
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
23 Example usage:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
24 - remove spurious events from spurious.bed and write results to file cleaned.bed
50
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
25 rm_spurious_events.py spurious.bed --oufile cleaned.bed
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
26 """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
27
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
28 epilog = """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
29 Author: Daniel Maticzka
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
30 Copyright: 2015
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
31 License: Apache
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
32 Email: maticzkd@informatik.uni-freiburg.de
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
33 Status: Testing
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
34 """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
35
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
36
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
37 class DefaultsRawDescriptionHelpFormatter(argparse.ArgumentDefaultsHelpFormatter,
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
38 argparse.RawDescriptionHelpFormatter):
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
39 # To join the behaviour of RawDescriptionHelpFormatter with that of ArgumentDefaultsHelpFormatter
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
40 pass
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
41
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
42
50
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
43 def main():
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
44 # parse command line arguments
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
45 parser = argparse.ArgumentParser(description=tool_description,
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
46 epilog=epilog,
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
47 formatter_class=DefaultsRawDescriptionHelpFormatter)
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
48 # positional arguments
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
49 parser.add_argument(
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
50 "events",
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
51 help="Path to bed6 file containing alignments.")
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
52 # optional arguments
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
53 parser.add_argument(
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
54 "-o", "--outfile",
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
55 required=True,
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
56 help="Write results to this file.")
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
57 parser.add_argument(
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
58 "-t", "--threshold",
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
59 type=float,
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
60 default=0.1,
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
61 help="Threshold for spurious event removal."
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
62 )
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
63 # misc arguments
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
64 parser.add_argument(
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
65 "-v", "--verbose",
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
66 help="Be verbose.",
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
67 action="store_true")
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
68 parser.add_argument(
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
69 "-d", "--debug",
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
70 help="Print lots of debugging information",
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
71 action="store_true")
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
72 parser.add_argument(
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
73 '--version',
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
74 action='version',
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
75 version='0.1.0')
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
76
50
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
77 args = parser.parse_args()
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
78
50
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
79 if args.debug:
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
80 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
81 elif args.verbose:
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
82 logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
83 else:
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
84 logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
85 logging.info("Parsed arguments:")
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
86 logging.info(" alignments: '{}'".format(args.events))
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
87 logging.info(" threshold: '{}'".format(args.threshold))
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
88 if args.outfile:
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
89 logging.info(" outfile: enabled writing to file")
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
90 logging.info(" outfile: '{}'".format(args.outfile))
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
91 logging.info("")
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
92
50
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
93 # check threshold parameter value
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
94 if args.threshold < 0 or args.threshold > 1:
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
95 raise ValueError("Threshold must be in [0,1].")
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
96
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
97 if not os.path.isfile(args.events):
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
98 raise Exception("ERROR: file '{}' not found.")
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
99
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
100 # prepare barcode library
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
101 syscall = "cat " + args.events + " | sort -k1,1V -k6,6 -k2,2n -k3,3 -k5,5nr | perl " + os.path.dirname(os.path.realpath(__file__)) + "/rm_spurious_events.pl --frac_max " + str(args.threshold) + "| sort -k1,1V -k2,2n -k3,3n -k6,6 -k4,4 -k5,5nr > " + args.outfile
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
102 check_call(syscall, shell=True)
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
103
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
104
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
105 if __name__ == "__main__":
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 2
diff changeset
106 main()