Mercurial > repos > artbio > lumpy_sv
comparison pairend_distro.py @ 0:a1225091082c draft
"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/lumpy-sv commit 6a109f553d1691e243372258ad564244586875c3"
author | artbio |
---|---|
date | Mon, 14 Oct 2019 07:11:39 -0400 |
parents | |
children | 1a4662c69bee |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a1225091082c |
---|---|
1 #!/usr/bin/env python | |
2 # (c) 2012 - Ryan M. Layer | |
3 # Hall Laboratory | |
4 # Quinlan Laboratory | |
5 # Department of Computer Science | |
6 # Department of Biochemistry and Molecular Genetics | |
7 # Department of Public Health Sciences and Center for Public Health Genomics, | |
8 # University of Virginia | |
9 # rl6sf@virginia.edu | |
10 | |
11 import sys | |
12 from optparse import OptionParser | |
13 | |
14 import numpy as np | |
15 | |
16 # some constants for sam/bam field ids | |
17 SAM_FLAG = 1 | |
18 SAM_REFNAME = 2 | |
19 SAM_MATE_REFNAME = 6 | |
20 SAM_ISIZE = 8 | |
21 | |
22 parser = OptionParser() | |
23 parser.add_option("-r", "--read_length", type="int", dest="read_length", | |
24 help="Read length") | |
25 parser.add_option("-X", dest="X", type="int", | |
26 help="Number of stdevs from mean to extend") | |
27 parser.add_option("-N", dest="N", type="int", help="Number to sample") | |
28 parser.add_option("-o", dest="output_file", help="Output file") | |
29 parser.add_option("-m", dest="mads", type="int", default=10, | |
30 help='''Outlier cutoff in # of median absolute deviations | |
31 (unscaled, upper only)''') | |
32 | |
33 | |
34 def unscaled_upper_mad(xs): | |
35 """Return a tuple consisting of the median of xs followed by the | |
36 unscaled median absolute deviation of the values in xs that lie | |
37 above the median. | |
38 """ | |
39 med = np.median(xs) | |
40 return med, np.median(xs[xs > med] - med) | |
41 | |
42 | |
43 (options, args) = parser.parse_args() | |
44 | |
45 if not options.read_length: | |
46 parser.error('Read length not given') | |
47 | |
48 if not options.X: | |
49 parser.error('X not given') | |
50 | |
51 if not options.N: | |
52 parser.error('N not given') | |
53 | |
54 if not options.output_file: | |
55 parser.error('Output file not given') | |
56 | |
57 | |
58 required = 97 | |
59 restricted = 3484 | |
60 flag_mask = required | restricted | |
61 | |
62 L = [] | |
63 c = 0 | |
64 | |
65 for l in sys.stdin: | |
66 if c >= options.N: | |
67 break | |
68 | |
69 A = l.rstrip().split('\t') | |
70 flag = int(A[SAM_FLAG]) | |
71 refname = A[SAM_REFNAME] | |
72 mate_refname = A[SAM_MATE_REFNAME] | |
73 isize = int(A[SAM_ISIZE]) | |
74 | |
75 want = mate_refname == "=" and flag & flag_mask == required and isize >= 0 | |
76 if want: | |
77 c += 1 | |
78 L.append(isize) | |
79 | |
80 # warn if very few elements in distribution | |
81 min_elements = 1000 | |
82 if len(L) < min_elements: | |
83 sys.stderr.write("Warning: only %s elements in distribution (min: %s)\n" % | |
84 (len(L), min_elements)) | |
85 mean = "NA" | |
86 stdev = "NA" | |
87 | |
88 else: | |
89 # Remove outliers | |
90 L = np.array(L) | |
91 L.sort() | |
92 med, umad = unscaled_upper_mad(L) | |
93 upper_cutoff = med + options.mads * umad | |
94 L = L[L < upper_cutoff] | |
95 new_len = len(L) | |
96 removed = c - new_len | |
97 sys.stderr.write("Removed %d outliers with isize >= %d\n" % | |
98 (removed, upper_cutoff)) | |
99 c = new_len | |
100 | |
101 mean = np.mean(L) | |
102 stdev = np.std(L) | |
103 | |
104 start = options.read_length | |
105 end = int(mean + options.X*stdev) | |
106 | |
107 H = [0] * (end - start + 1) | |
108 s = 0 | |
109 | |
110 for x in L: | |
111 if (x >= start) and (x <= end): | |
112 j = int(x - start) | |
113 H[j] = H[int(x - start)] + 1 | |
114 s += 1 | |
115 | |
116 f = open(options.output_file, 'w') | |
117 | |
118 for i in range(end - start): | |
119 o = str(i) + "\t" + str(float(H[i])/float(s)) + "\n" | |
120 f.write(o) | |
121 f.close() | |
122 print('mean:' + str(mean) + '\tstdev:' + str(stdev)) |