comparison pairend_distro.py @ 0:a1225091082c draft

"planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/lumpy-sv commit 6a109f553d1691e243372258ad564244586875c3"
author artbio
date Mon, 14 Oct 2019 07:11:39 -0400
parents
children 1a4662c69bee
comparison
equal deleted inserted replaced
-1:000000000000 0:a1225091082c
1 #!/usr/bin/env python
2 # (c) 2012 - Ryan M. Layer
3 # Hall Laboratory
4 # Quinlan Laboratory
5 # Department of Computer Science
6 # Department of Biochemistry and Molecular Genetics
7 # Department of Public Health Sciences and Center for Public Health Genomics,
8 # University of Virginia
9 # rl6sf@virginia.edu
10
11 import sys
12 from optparse import OptionParser
13
14 import numpy as np
15
16 # some constants for sam/bam field ids
17 SAM_FLAG = 1
18 SAM_REFNAME = 2
19 SAM_MATE_REFNAME = 6
20 SAM_ISIZE = 8
21
22 parser = OptionParser()
23 parser.add_option("-r", "--read_length", type="int", dest="read_length",
24 help="Read length")
25 parser.add_option("-X", dest="X", type="int",
26 help="Number of stdevs from mean to extend")
27 parser.add_option("-N", dest="N", type="int", help="Number to sample")
28 parser.add_option("-o", dest="output_file", help="Output file")
29 parser.add_option("-m", dest="mads", type="int", default=10,
30 help='''Outlier cutoff in # of median absolute deviations
31 (unscaled, upper only)''')
32
33
34 def unscaled_upper_mad(xs):
35 """Return a tuple consisting of the median of xs followed by the
36 unscaled median absolute deviation of the values in xs that lie
37 above the median.
38 """
39 med = np.median(xs)
40 return med, np.median(xs[xs > med] - med)
41
42
43 (options, args) = parser.parse_args()
44
45 if not options.read_length:
46 parser.error('Read length not given')
47
48 if not options.X:
49 parser.error('X not given')
50
51 if not options.N:
52 parser.error('N not given')
53
54 if not options.output_file:
55 parser.error('Output file not given')
56
57
58 required = 97
59 restricted = 3484
60 flag_mask = required | restricted
61
62 L = []
63 c = 0
64
65 for l in sys.stdin:
66 if c >= options.N:
67 break
68
69 A = l.rstrip().split('\t')
70 flag = int(A[SAM_FLAG])
71 refname = A[SAM_REFNAME]
72 mate_refname = A[SAM_MATE_REFNAME]
73 isize = int(A[SAM_ISIZE])
74
75 want = mate_refname == "=" and flag & flag_mask == required and isize >= 0
76 if want:
77 c += 1
78 L.append(isize)
79
80 # warn if very few elements in distribution
81 min_elements = 1000
82 if len(L) < min_elements:
83 sys.stderr.write("Warning: only %s elements in distribution (min: %s)\n" %
84 (len(L), min_elements))
85 mean = "NA"
86 stdev = "NA"
87
88 else:
89 # Remove outliers
90 L = np.array(L)
91 L.sort()
92 med, umad = unscaled_upper_mad(L)
93 upper_cutoff = med + options.mads * umad
94 L = L[L < upper_cutoff]
95 new_len = len(L)
96 removed = c - new_len
97 sys.stderr.write("Removed %d outliers with isize >= %d\n" %
98 (removed, upper_cutoff))
99 c = new_len
100
101 mean = np.mean(L)
102 stdev = np.std(L)
103
104 start = options.read_length
105 end = int(mean + options.X*stdev)
106
107 H = [0] * (end - start + 1)
108 s = 0
109
110 for x in L:
111 if (x >= start) and (x <= end):
112 j = int(x - start)
113 H[j] = H[int(x - start)] + 1
114 s += 1
115
116 f = open(options.output_file, 'w')
117
118 for i in range(end - start):
119 o = str(i) + "\t" + str(float(H[i])/float(s)) + "\n"
120 f.write(o)
121 f.close()
122 print('mean:' + str(mean) + '\tstdev:' + str(stdev))