annotate repmatch_gff3_util.py @ 0:d33030c8e2cc draft

Uploaded
author greg
date Tue, 17 Nov 2015 14:26:08 -0500
parents
children 8159aaa7da4b
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d33030c8e2cc Uploaded
greg
parents:
diff changeset
1 import bisect
d33030c8e2cc Uploaded
greg
parents:
diff changeset
2 import csv
d33030c8e2cc Uploaded
greg
parents:
diff changeset
3 import os
d33030c8e2cc Uploaded
greg
parents:
diff changeset
4 import shutil
d33030c8e2cc Uploaded
greg
parents:
diff changeset
5 import sys
d33030c8e2cc Uploaded
greg
parents:
diff changeset
6 import tempfile
d33030c8e2cc Uploaded
greg
parents:
diff changeset
7
d33030c8e2cc Uploaded
greg
parents:
diff changeset
8 from matplotlib import pyplot
d33030c8e2cc Uploaded
greg
parents:
diff changeset
9
d33030c8e2cc Uploaded
greg
parents:
diff changeset
# Graph settings
X_LABEL = 'Number of matched replicates'
Y_LABEL = 'Counts'
TICK_WIDTH = 3
# Amount to shift the graph to make labels fit, [left, right, top, bottom]
ADJUST = [0.180, 0.9, 0.9, 0.1]
# Matplotlib rc defaults, applied in the order listed.  The two
# "*tick.major" sizes are the length of the tick marks; use TICK_WIDTH for
# their width.
for _rc_group, _rc_options in (
        ('xtick.major', {'size': 10.00}),
        ('ytick.major', {'size': 10.00}),
        ('lines', {'linewidth': 4.00}),
        ('axes', {'linewidth': 3.00}),
        ('font', {'family': 'Arial', 'size': 32.0}),
):
    pyplot.rc(_rc_group, **_rc_options)
# Do not leave the loop variables behind as module attributes.
del _rc_group, _rc_options

# Image formats and the per-series bar colors (one matplotlib color
# character per series) used when plotting.
PLOT_FORMATS = ['png', 'pdf', 'svg']
COLORS = 'krb'
d33030c8e2cc Uploaded
greg
parents:
diff changeset
25
d33030c8e2cc Uploaded
greg
parents:
diff changeset
26
d33030c8e2cc Uploaded
greg
parents:
diff changeset
class Replicate(object):
    """
    One replicate dataset: the peaks of a tab-delimited, 9-column GFF file
    grouped by chromosome name in self.chromosomes.
    """

    def __init__(self, id, dataset_path):
        """
        Store the replicate id and parse the file at dataset_path.

        Any error raised while opening or parsing the file propagates to the
        caller.
        """
        self.id = id
        self.dataset_path = dataset_path
        # Use a context manager so the file handle is closed as soon as
        # parsing finishes (or fails) instead of being leaked.
        with open(dataset_path, 'rt') as handle:
            self.parse(csv.reader(handle, delimiter='\t'))

    def parse(self, reader):
        """
        Fill self.chromosomes from an iterable of rows (lists of strings).

        Blank rows and rows whose first field starts with '#' or '"' are
        skipped.  Every other row must have exactly 9 fields and a
        'cw_distance' attribute in the last one; otherwise ValueError or
        KeyError is raised.
        """
        self.chromosomes = {}
        for line in reader:
            # csv.reader yields [] for an empty line; indexing it would raise
            # IndexError, so skip such rows.
            if not line:
                continue
            if line[0].startswith(("#", '"')):
                continue
            cname, _source, _type, mid, midplus, value, _strand, _phase, attrs = line
            attrs = parse_gff_attrs(attrs)
            distance = attrs['cw_distance']
            mid = int(mid)
            # The end coordinate is not stored, but it is still converted so
            # that a malformed column is rejected.
            int(midplus)
            value = float(value)
            distance = int(distance)
            if cname not in self.chromosomes:
                self.chromosomes[cname] = Chromosome(cname)
            chrom = self.chromosomes[cname]
            chrom.add_peak(Peak(cname, mid, value, distance, self))
        # Peaks were appended in file order; sort each chromosome and build
        # its midpoint index.
        for chrom in self.chromosomes.values():
            chrom.sort_by_index()

    def filter(self, up_limit, low_limit):
        """
        Apply Chromosome.filter(up_limit, low_limit) to every chromosome.
        """
        for chrom in self.chromosomes.values():
            chrom.filter(up_limit, low_limit)

    def size(self):
        """
        Return the total number of peaks over all chromosomes.
        """
        return sum(len(c.peaks) for c in self.chromosomes.values())
d33030c8e2cc Uploaded
greg
parents:
diff changeset
59
d33030c8e2cc Uploaded
greg
parents:
diff changeset
60
d33030c8e2cc Uploaded
greg
parents:
diff changeset
class Chromosome(object):
    """
    The peaks of one chromosome within a replicate.

    self.peaks holds the peak objects and self.keys the matching list of
    their midpoints, used for bisect lookups.  The two lists are only in
    step (and sorted) after sort_by_index() or filter() has been called.
    """

    def __init__(self, name):
        self.name = name
        self.peaks = []
        # Created here so the attribute always exists; it is rebuilt by
        # sort_by_index() and filter().
        self.keys = []

    def add_peak(self, peak):
        """
        Append peak without re-sorting; call sort_by_index() afterwards.
        """
        self.peaks.append(peak)

    def sort_by_index(self):
        """
        Sort the peaks by midpoint and rebuild the midpoint index.
        """
        self.peaks.sort(key=lambda peak: peak.midpoint)
        self.keys = make_keys(self.peaks)

    def remove_peak(self, peak):
        """
        Remove exactly this peak object, if present; otherwise do nothing.
        """
        i = bisect.bisect_left(self.keys, peak.midpoint)
        # Several peaks may share a midpoint.  Walk the run of equal
        # midpoints and delete only the requested object, never a different
        # peak that merely has the same position.
        while i < len(self.peaks) and self.peaks[i].midpoint == peak.midpoint:
            if self.peaks[i] is peak:
                del self.keys[i]
                del self.peaks[i]
                return
            i += 1

    def filter(self, up_limit, low_limit):
        """
        Keep only peaks with low_limit <= distance <= up_limit.
        """
        self.peaks = [p for p in self.peaks if low_limit <= p.distance <= up_limit]
        self.keys = make_keys(self.peaks)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
84
d33030c8e2cc Uploaded
greg
parents:
diff changeset
85
d33030c8e2cc Uploaded
greg
parents:
diff changeset
class Peak(object):
    """
    A single peak belonging to one replicate.
    """

    def __init__(self, chrom, midpoint, value, distance, replicate):
        # Position of the peak.
        self.chrom, self.midpoint = chrom, midpoint
        # Score of the peak and its distance attribute.
        self.value, self.distance = value, distance
        # Owning replicate; normalized_value() reads its "median" attribute.
        self.replicate = replicate

    def normalized_value(self, med):
        """
        Return the peak value multiplied by med and divided by the owning
        replicate's median.
        """
        scaled = self.value * med
        return scaled / self.replicate.median
d33030c8e2cc Uploaded
greg
parents:
diff changeset
97
d33030c8e2cc Uploaded
greg
parents:
diff changeset
98
d33030c8e2cc Uploaded
greg
parents:
diff changeset
class PeakGroup(object):
    """
    A group of matched peaks, stored in self.peaks keyed by replicate id
    (so at most one peak per replicate).
    """

    def __init__(self):
        self.peaks = {}

    def add_peak(self, repid, peak):
        """
        Store peak under repid, replacing any peak already stored for it.
        """
        self.peaks[repid] = peak

    @property
    def chrom(self):
        """
        Chromosome of the first stored peak (IndexError if the group is empty).
        """
        # dict.values() is a non-indexable view on Python 3, so materialize it.
        return list(self.peaks.values())[0].chrom

    @property
    def midpoint(self):
        """
        Median of the member midpoints.
        """
        return median([peak.midpoint for peak in self.peaks.values()])

    @property
    def num_replicates(self):
        """
        Number of replicates that contributed a peak.
        """
        return len(self.peaks)

    @property
    def median_distance(self):
        """
        Median of the member distances.
        """
        return median([peak.distance for peak in self.peaks.values()])

    @property
    def value_sum(self):
        """
        Sum of the member values.
        """
        return sum(peak.value for peak in self.peaks.values())

    def normalized_value(self, med):
        """
        Median of the members' Peak.normalized_value(med) results.
        """
        return median([peak.normalized_value(med) for peak in self.peaks.values()])

    @property
    def peakpeak_distance(self):
        """
        Absolute midpoint difference between the first two stored peaks.

        Raises IndexError when the group holds fewer than two peaks.
        """
        # Materialize the view: indexing dict.keys()/values() fails on Python 3.
        members = list(self.peaks.values())
        return abs(members[0].midpoint - members[1].midpoint)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
137
d33030c8e2cc Uploaded
greg
parents:
diff changeset
138
d33030c8e2cc Uploaded
greg
parents:
diff changeset
class FrequencyDistribution(object):
    """
    Counts how often each value was added; self.dist maps value -> count.
    """

    def __init__(self, d=None):
        # Only substitute a new dict when none was supplied, so that an empty
        # dict passed by the caller is used (and shared) like a non-empty one.
        self.dist = {} if d is None else d

    def add(self, x):
        """
        Increment the count of x by one.
        """
        self.dist[x] = self.dist.get(x, 0) + 1

    def graph_series(self):
        """
        Return (values, counts) as two parallel lists, ordered by value.

        Sorting makes the result independent of dict iteration order; the
        values must therefore be mutually comparable.
        """
        x = sorted(self.dist)
        y = [self.dist[key] for key in x]
        return x, y

    def mode(self):
        """
        Return a value with the highest count (ValueError if empty).
        """
        return max(self.dist.items(), key=lambda data: data[1])[0]

    def size(self):
        """
        Return the total number of values added.
        """
        return sum(self.dist.values())
d33030c8e2cc Uploaded
greg
parents:
diff changeset
160
d33030c8e2cc Uploaded
greg
parents:
diff changeset
161
d33030c8e2cc Uploaded
greg
parents:
diff changeset
def stop_err(msg):
    """
    Write msg to standard error, then terminate with exit status 1.
    """
    sys.stderr.write(msg)
    raise SystemExit(1)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
165
d33030c8e2cc Uploaded
greg
parents:
diff changeset
166
d33030c8e2cc Uploaded
greg
parents:
diff changeset
def median(data):
    """
    Find the median of the data set.

    Returns 0 for empty input.  For an even number of items the two middle
    values are averaged: integer values are floor-divided, so the result
    stays an integer on both Python 2 and Python 3; any other numeric type
    uses true division.  Accepts any iterable.
    """
    import numbers

    if not data:
        return 0
    sdata = sorted(data)
    if not sdata:
        # An exhausted iterator is truthy but still holds no data.
        return 0
    middle = len(sdata) // 2
    if len(sdata) % 2:
        return sdata[middle]
    total = sdata[middle] + sdata[middle - 1]
    if isinstance(total, numbers.Integral):
        # Plain "/" would yield a float on Python 3.
        return total // 2
    return total / 2
d33030c8e2cc Uploaded
greg
parents:
diff changeset
178
d33030c8e2cc Uploaded
greg
parents:
diff changeset
179
d33030c8e2cc Uploaded
greg
parents:
diff changeset
def make_keys(peaks):
    """
    Return the midpoints of peaks as a list, in the same order.
    """
    midpoints = []
    for entry in peaks:
        midpoints.append(entry.midpoint)
    return midpoints
d33030c8e2cc Uploaded
greg
parents:
diff changeset
182
d33030c8e2cc Uploaded
greg
parents:
diff changeset
183
d33030c8e2cc Uploaded
greg
parents:
diff changeset
def get_window(chromosome, target_peaks, distance):
    """
    Returns a window of all peaks from a replicate within a certain distance of
    a peak from another replicate.

    chromosome must provide the sorted midpoint list "keys", the parallel
    list "peaks" and "name".  target_peaks may be any iterable of peaks.
    The result is the tuple (list of peaks in the window, chromosome.name);
    the window is empty when there are no target peaks.
    """
    # Materialize first: the caller passes dict.values(), which is a view
    # that cannot be indexed on Python 3.
    targets = list(target_peaks)
    if not targets:
        return ([], chromosome.name)
    lower = upper = targets[0].midpoint
    for peak in targets:
        lower = min(lower, peak.midpoint - distance)
        upper = max(upper, peak.midpoint + distance)
    # bisect_left/bisect_right make both window bounds inclusive.
    start_index = bisect.bisect_left(chromosome.keys, lower)
    end_index = bisect.bisect_right(chromosome.keys, upper)
    return (chromosome.peaks[start_index: end_index], chromosome.name)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
197
d33030c8e2cc Uploaded
greg
parents:
diff changeset
198
d33030c8e2cc Uploaded
greg
parents:
diff changeset
def match_largest(window, peak, chrum):
    """
    Return the candidate in window with the highest value.

    Returns None when window is empty or when peak is not on chromosome
    chrum.  On ties the earliest candidate wins.
    """
    if not window or peak.chrom != chrum:
        return None
    best = None
    for candidate in window:
        if best is None or candidate.value > best.value:
            best = candidate
    return best
d33030c8e2cc Uploaded
greg
parents:
diff changeset
205
d33030c8e2cc Uploaded
greg
parents:
diff changeset
206
d33030c8e2cc Uploaded
greg
parents:
diff changeset
def match_closest(window, peak, chrum):
    """
    Return the candidate in window whose midpoint is nearest to peak's.

    Returns None when window is empty or when peak is not on chromosome
    chrum.  On ties the earliest candidate wins.
    """
    if not window or peak.chrom != chrum:
        return None
    nearest = None
    nearest_gap = None
    for candidate in window:
        gap = abs(candidate.midpoint - peak.midpoint)
        if nearest is None or gap < nearest_gap:
            nearest, nearest_gap = candidate, gap
    return nearest
d33030c8e2cc Uploaded
greg
parents:
diff changeset
213
d33030c8e2cc Uploaded
greg
parents:
diff changeset
214
d33030c8e2cc Uploaded
greg
parents:
diff changeset
def frequency_histogram(freqs, dataset_path, labels=None, title=''):
    """
    Draw the given frequency distributions as grouped bars and save the
    figure to dataset_path.

    freqs is a sequence of objects whose graph_series() returns parallel
    (keys, counts) lists with integer keys.  The x axis runs from the
    highest key down to the lowest.  labels and title are accepted for
    compatibility but are not used.  The figure is always closed again, so
    repeated calls do not accumulate open figures.
    """
    # Pair every count with its key and order by key, so that bar placement
    # never depends on the order in which graph_series() lists the keys.
    series = []
    for freq in freqs:
        xvals, yvals = freq.graph_series()
        series.append(sorted(zip(xvals, yvals)))
    all_keys = [key for pairs in series for key, _count in pairs]
    figure = pyplot.figure(figsize=(10, 10))
    try:
        if all_keys:
            lowest, highest = min(all_keys), max(all_keys)
            bar_width = 0.8 / len(series)
            for i, pairs in enumerate(series):
                # Go from high to low: mirror each key around the centre of
                # the key range so that it lines up with the reversed tick
                # labels below, then offset the bar within its group.
                positions = [lowest + highest - key - 0.4 + bar_width * i
                             for key, _count in pairs]
                counts = [count for _key, count in pairs]
                # Cycle the colors rather than failing on a fourth series.
                pyplot.bar(positions, counts, width=bar_width, color=COLORS[i % len(COLORS)])
            ticks = list(range(lowest, highest + 1))
            pyplot.xticks(ticks, [str(tick) for tick in reversed(ticks)])
        pyplot.xlabel(X_LABEL)
        pyplot.ylabel(Y_LABEL)
        pyplot.subplots_adjust(left=ADJUST[0], right=ADJUST[1], top=ADJUST[2], bottom=ADJUST[3])
        ax = pyplot.gca()
        for tick_line in list(ax.get_xticklines()) + list(ax.get_yticklines()):
            tick_line.set_markeredgewidth(TICK_WIDTH)
        pyplot.savefig(dataset_path)
    finally:
        pyplot.close(figure)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
231
d33030c8e2cc Uploaded
greg
parents:
diff changeset
232
d33030c8e2cc Uploaded
greg
parents:
diff changeset
# Matching strategies, selectable by name.
METHODS = {}
METHODS['closest'] = match_closest
METHODS['largest'] = match_largest
d33030c8e2cc Uploaded
greg
parents:
diff changeset
234
d33030c8e2cc Uploaded
greg
parents:
diff changeset
235
d33030c8e2cc Uploaded
greg
parents:
diff changeset
def gff_attrs(d):
    """
    Format a dict as a GFF attribute string: "key=value" pairs joined by
    ';'.  An empty or missing dict yields '.'.
    """
    if d:
        pairs = []
        for key, val in d.items():
            pairs.append('%s=%s' % (key, val))
        return ';'.join(pairs)
    return '.'
d33030c8e2cc Uploaded
greg
parents:
diff changeset
240
d33030c8e2cc Uploaded
greg
parents:
diff changeset
241
d33030c8e2cc Uploaded
greg
parents:
diff changeset
def parse_gff_attrs(s):
    """
    Parse a GFF attribute string ("key=value" pairs separated by ';') into
    a dict of strings.  '.' yields an empty dict.

    Empty items, such as the one produced by a trailing ';', are ignored,
    and only the first '=' of an item separates key from value.  An item
    without any '=' raises ValueError.
    """
    d = {}
    if s == '.':
        return d
    for item in s.split(';'):
        if not item.strip():
            continue
        # maxsplit=1 keeps any further '=' as part of the value.
        key, val = item.split('=', 1)
        d[key] = val
    return d
d33030c8e2cc Uploaded
greg
parents:
diff changeset
250
d33030c8e2cc Uploaded
greg
parents:
diff changeset
251
d33030c8e2cc Uploaded
greg
parents:
diff changeset
def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs=None):
    """
    Return one GFF row as a 9-tuple in column order: (cname, source, type,
    start, end, score, strand, phase, attributes).

    attrs is a dict formatted by gff_attrs(); omitting it gives '.' in the
    attributes column.
    """
    # None stands in for "no attributes" to avoid a shared mutable default.
    return (cname, source, type, start, end, score, strand, phase,
            gff_attrs({} if attrs is None else attrs))
d33030c8e2cc Uploaded
greg
parents:
diff changeset
254
d33030c8e2cc Uploaded
greg
parents:
diff changeset
255
d33030c8e2cc Uploaded
greg
parents:
diff changeset
def get_temporary_plot_path(plot_format):
    """
    Create an empty temporary file whose extension is plot_format, inside a
    newly created "tmp-repmatch-" directory, and return the file's path.
    """
    directory = tempfile.mkdtemp(prefix='tmp-repmatch-')
    extension = '.%s' % plot_format
    handle, path = tempfile.mkstemp(suffix=extension, dir=directory)
    # Only the path is needed; release the descriptor mkstemp opened.
    os.close(handle)
    return path
d33030c8e2cc Uploaded
greg
parents:
diff changeset
265
d33030c8e2cc Uploaded
greg
parents:
diff changeset
266
d33030c8e2cc Uploaded
greg
parents:
diff changeset
def process_files(dataset_paths, galaxy_hids, method, distance, step, replicates, up_limit, low_limit, output_files,
                  plot_format, output_summary, output_orphan, output_detail, output_key, output_histogram):
    """
    Run perform_process() for the requested matching method, or for every
    method in METHODS when method is 'all'.

    Does nothing when fewer than two datasets are given.  replicates is
    forwarded as perform_process()'s num_required argument.  When both
    output_files and method are 'all', a histogram of the 'distribution'
    statistic of the last method run is written to output_histogram.
    """
    output_histogram_file = output_files == "all" and method == "all"
    if len(dataset_paths) < 2:
        return
    if method == 'all':
        match_methods = list(METHODS)
    else:
        match_methods = [method]
    for match_method in match_methods:
        statistics = perform_process(dataset_paths,
                                     galaxy_hids,
                                     match_method,
                                     distance,
                                     step,
                                     replicates,
                                     up_limit,
                                     low_limit,
                                     output_files,
                                     plot_format,
                                     output_summary,
                                     output_orphan,
                                     output_detail,
                                     output_key,
                                     output_histogram)
    if output_histogram_file:
        tmp_histogram_path = get_temporary_plot_path(plot_format)
        try:
            frequency_histogram([statistics['distribution']],
                                tmp_histogram_path,
                                list(METHODS))
            shutil.move(tmp_histogram_path, output_histogram)
        finally:
            # Clean up the temporary file (if it was not moved) and the
            # directory that held it.  os.rmdir only removes an empty
            # directory, so nothing else can be deleted by accident.
            try:
                if os.path.exists(tmp_histogram_path):
                    os.remove(tmp_histogram_path)
                os.rmdir(os.path.dirname(tmp_histogram_path))
            except OSError:
                pass
d33030c8e2cc Uploaded
greg
parents:
diff changeset
298
d33030c8e2cc Uploaded
greg
parents:
diff changeset
299
d33030c8e2cc Uploaded
greg
parents:
diff changeset
300 def perform_process(dataset_paths, galaxy_hids, method, distance, step, num_required, up_limit, low_limit, output_files,
d33030c8e2cc Uploaded
greg
parents:
diff changeset
301 plot_format, output_summary, output_orphan, output_detail, output_key, output_histogram):
d33030c8e2cc Uploaded
greg
parents:
diff changeset
302 output_detail_file = output_files in ["all"] and output_detail is not None
d33030c8e2cc Uploaded
greg
parents:
diff changeset
303 output_key_file = output_files in ["all"] and output_key is not None
d33030c8e2cc Uploaded
greg
parents:
diff changeset
304 output_orphan_file = output_files in ["all", "simple_orphan"] and output_orphan is not None
d33030c8e2cc Uploaded
greg
parents:
diff changeset
305 output_histogram_file = output_files in ["all"] and output_histogram is not None
d33030c8e2cc Uploaded
greg
parents:
diff changeset
306 replicates = []
d33030c8e2cc Uploaded
greg
parents:
diff changeset
307 for i, dataset_path in enumerate(dataset_paths):
d33030c8e2cc Uploaded
greg
parents:
diff changeset
308 try:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
309 galaxy_hid = galaxy_hids[i]
d33030c8e2cc Uploaded
greg
parents:
diff changeset
310 r = Replicate(galaxy_hid, dataset_path)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
311 replicates.append(r)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
312 except Exception, e:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
313 stop_err('Unable to parse file "%s", exception: %s' % (dataset_path, str(e)))
d33030c8e2cc Uploaded
greg
parents:
diff changeset
314 attrs = 'd%sr%s' % (distance, num_required)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
315 if up_limit != 1000:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
316 attrs += 'u%d' % up_limit
d33030c8e2cc Uploaded
greg
parents:
diff changeset
317 if low_limit != -1000:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
318 attrs += 'l%d' % low_limit
d33030c8e2cc Uploaded
greg
parents:
diff changeset
319 if step != 0:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
320 attrs += 's%d' % step
d33030c8e2cc Uploaded
greg
parents:
diff changeset
321
d33030c8e2cc Uploaded
greg
parents:
diff changeset
322 def td_writer(file_path):
d33030c8e2cc Uploaded
greg
parents:
diff changeset
323 # Returns a tab-delimited writer for a certain output
d33030c8e2cc Uploaded
greg
parents:
diff changeset
324 return csv.writer(open(file_path, 'wt'), delimiter='\t')
d33030c8e2cc Uploaded
greg
parents:
diff changeset
325
d33030c8e2cc Uploaded
greg
parents:
diff changeset
326 labels = ('chrom',
d33030c8e2cc Uploaded
greg
parents:
diff changeset
327 'median midpoint',
d33030c8e2cc Uploaded
greg
parents:
diff changeset
328 'median midpoint+1',
d33030c8e2cc Uploaded
greg
parents:
diff changeset
329 'median normalized reads',
d33030c8e2cc Uploaded
greg
parents:
diff changeset
330 'replicates',
d33030c8e2cc Uploaded
greg
parents:
diff changeset
331 'median c-w distance',
d33030c8e2cc Uploaded
greg
parents:
diff changeset
332 'reads sum')
d33030c8e2cc Uploaded
greg
parents:
diff changeset
333 for replicate in replicates:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
334 labels += ('chrom',
d33030c8e2cc Uploaded
greg
parents:
diff changeset
335 'median midpoint',
d33030c8e2cc Uploaded
greg
parents:
diff changeset
336 'median midpoint+1',
d33030c8e2cc Uploaded
greg
parents:
diff changeset
337 'c-w sum',
d33030c8e2cc Uploaded
greg
parents:
diff changeset
338 'c-w distance',
d33030c8e2cc Uploaded
greg
parents:
diff changeset
339 'replicate id')
d33030c8e2cc Uploaded
greg
parents:
diff changeset
340 summary_output = td_writer(output_summary)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
341 if output_key_file:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
342 key_output = td_writer(output_key)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
343 key_output.writerow(('data', 'median read count'))
d33030c8e2cc Uploaded
greg
parents:
diff changeset
344 if output_detail_file:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
345 detail_output = td_writer(output_detail)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
346 detail_output.writerow(labels)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
347 if output_orphan_file:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
348 orphan_output = td_writer(output_orphan)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
349 orphan_output.writerow(('chrom', 'midpoint', 'midpoint+1', 'c-w sum', 'c-w distance', 'replicate id'))
d33030c8e2cc Uploaded
greg
parents:
diff changeset
350 # Perform filtering
d33030c8e2cc Uploaded
greg
parents:
diff changeset
351 if up_limit < 1000 or low_limit > -1000:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
352 for replicate in replicates:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
353 replicate.filter(up_limit, low_limit)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
354 # Actually merge the peaks
d33030c8e2cc Uploaded
greg
parents:
diff changeset
355 peak_groups = []
d33030c8e2cc Uploaded
greg
parents:
diff changeset
356 orphans = []
d33030c8e2cc Uploaded
greg
parents:
diff changeset
357 freq = FrequencyDistribution()
d33030c8e2cc Uploaded
greg
parents:
diff changeset
358
d33030c8e2cc Uploaded
greg
parents:
diff changeset
359 def do_match(reps, distance):
d33030c8e2cc Uploaded
greg
parents:
diff changeset
360 # Copy list because we will mutate it, but keep replicate references.
d33030c8e2cc Uploaded
greg
parents:
diff changeset
361 reps = reps[:]
d33030c8e2cc Uploaded
greg
parents:
diff changeset
362 while len(reps) > 1:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
363 # Iterate over each replicate as "main"
d33030c8e2cc Uploaded
greg
parents:
diff changeset
364 main = reps[0]
d33030c8e2cc Uploaded
greg
parents:
diff changeset
365 reps.remove(main)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
366 for chromosome in main.chromosomes.values():
d33030c8e2cc Uploaded
greg
parents:
diff changeset
367 peaks_by_value = chromosome.peaks[:]
d33030c8e2cc Uploaded
greg
parents:
diff changeset
368 # Sort main replicate by value
d33030c8e2cc Uploaded
greg
parents:
diff changeset
369 peaks_by_value.sort(key=lambda peak: -peak.value)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
370
d33030c8e2cc Uploaded
greg
parents:
diff changeset
371 def search_for_matches(group):
d33030c8e2cc Uploaded
greg
parents:
diff changeset
372 # Here we use multiple passes, expanding the window to be
d33030c8e2cc Uploaded
greg
parents:
diff changeset
373 # +- distance from any previously matched peak.
d33030c8e2cc Uploaded
greg
parents:
diff changeset
374 while True:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
375 new_match = False
d33030c8e2cc Uploaded
greg
parents:
diff changeset
376 for replicate in reps:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
377 if replicate.id in group.peaks:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
378 # Stop if match already found for this replicate
d33030c8e2cc Uploaded
greg
parents:
diff changeset
379 continue
d33030c8e2cc Uploaded
greg
parents:
diff changeset
380 try:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
381 # Lines changed to remove a major bug by Rohit Reja.
d33030c8e2cc Uploaded
greg
parents:
diff changeset
382 window, chrum = get_window(replicate.chromosomes[chromosome.name],
d33030c8e2cc Uploaded
greg
parents:
diff changeset
383 group.peaks.values(),
d33030c8e2cc Uploaded
greg
parents:
diff changeset
384 distance)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
385 match = METHODS[method](window, peak, chrum)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
386 except KeyError:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
387 continue
d33030c8e2cc Uploaded
greg
parents:
diff changeset
388 if match:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
389 group.add_peak(replicate.id, match)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
390 new_match = True
d33030c8e2cc Uploaded
greg
parents:
diff changeset
391 if not new_match:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
392 break
d33030c8e2cc Uploaded
greg
parents:
diff changeset
393 # Attempt to enlarge existing peak groups
d33030c8e2cc Uploaded
greg
parents:
diff changeset
394 for group in peak_groups:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
395 old_peaks = group.peaks.values()[:]
d33030c8e2cc Uploaded
greg
parents:
diff changeset
396 search_for_matches(group)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
397 for peak in group.peaks.values():
d33030c8e2cc Uploaded
greg
parents:
diff changeset
398 if peak not in old_peaks:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
399 peak.replicate.chromosomes[chromosome.name].remove_peak(peak)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
400 # Attempt to find new peak groups. For each peak in the
d33030c8e2cc Uploaded
greg
parents:
diff changeset
401 # main replicate, search for matches in the other replicates
d33030c8e2cc Uploaded
greg
parents:
diff changeset
402 for peak in peaks_by_value:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
403 matches = PeakGroup()
d33030c8e2cc Uploaded
greg
parents:
diff changeset
404 matches.add_peak(main.id, peak)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
405 search_for_matches(matches)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
406 # Were enough replicates matched?
d33030c8e2cc Uploaded
greg
parents:
diff changeset
407 if matches.num_replicates >= num_required:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
408 for peak in matches.peaks.values():
d33030c8e2cc Uploaded
greg
parents:
diff changeset
409 peak.replicate.chromosomes[chromosome.name].remove_peak(peak)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
410 peak_groups.append(matches)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
411 # Zero or less = no stepping
d33030c8e2cc Uploaded
greg
parents:
diff changeset
412 if step <= 0:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
413 do_match(replicates, distance)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
414 else:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
415 for d in range(0, distance, step):
d33030c8e2cc Uploaded
greg
parents:
diff changeset
416 do_match(replicates, d)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
417 for group in peak_groups:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
418 freq.add(group.num_replicates)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
419 # Collect together the remaining orphans
d33030c8e2cc Uploaded
greg
parents:
diff changeset
420 for replicate in replicates:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
421 for chromosome in replicate.chromosomes.values():
d33030c8e2cc Uploaded
greg
parents:
diff changeset
422 for peak in chromosome.peaks:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
423 freq.add(1)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
424 orphans.append(peak)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
425 # Average the orphan count in the graph by # replicates
d33030c8e2cc Uploaded
greg
parents:
diff changeset
426 med = median([peak.value for group in peak_groups for peak in group.peaks.values()])
d33030c8e2cc Uploaded
greg
parents:
diff changeset
427 for replicate in replicates:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
428 replicate.median = median([peak.value for group in peak_groups for peak in group.peaks.values() if peak.replicate == replicate])
d33030c8e2cc Uploaded
greg
parents:
diff changeset
429 key_output.writerow((replicate.id, replicate.median))
d33030c8e2cc Uploaded
greg
parents:
diff changeset
430 for group in peak_groups:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
431 # Output summary (matched pairs).
d33030c8e2cc Uploaded
greg
parents:
diff changeset
432 summary_output.writerow(gff_row(cname=group.chrom,
d33030c8e2cc Uploaded
greg
parents:
diff changeset
433 start=group.midpoint,
d33030c8e2cc Uploaded
greg
parents:
diff changeset
434 end=group.midpoint+1,
d33030c8e2cc Uploaded
greg
parents:
diff changeset
435 source='repmatch',
d33030c8e2cc Uploaded
greg
parents:
diff changeset
436 score=group.normalized_value(med),
d33030c8e2cc Uploaded
greg
parents:
diff changeset
437 attrs={'median_distance': group.median_distance,
d33030c8e2cc Uploaded
greg
parents:
diff changeset
438 'replicates': group.num_replicates,
d33030c8e2cc Uploaded
greg
parents:
diff changeset
439 'value_sum': group.value_sum}))
d33030c8e2cc Uploaded
greg
parents:
diff changeset
440 if output_detail_file:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
441 summary = (group.chrom,
d33030c8e2cc Uploaded
greg
parents:
diff changeset
442 group.midpoint,
d33030c8e2cc Uploaded
greg
parents:
diff changeset
443 group.midpoint+1,
d33030c8e2cc Uploaded
greg
parents:
diff changeset
444 group.normalized_value(med),
d33030c8e2cc Uploaded
greg
parents:
diff changeset
445 group.num_replicates,
d33030c8e2cc Uploaded
greg
parents:
diff changeset
446 group.median_distance,
d33030c8e2cc Uploaded
greg
parents:
diff changeset
447 group.value_sum)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
448 for peak in group.peaks.values():
d33030c8e2cc Uploaded
greg
parents:
diff changeset
449 summary += (peak.chrom, peak.midpoint, peak.midpoint+1, peak.value, peak.distance, peak.replicate.id)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
450 detail_output.writerow(summary)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
451 if output_orphan_file:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
452 for orphan in orphans:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
453 orphan_output.writerow((orphan.chrom,
d33030c8e2cc Uploaded
greg
parents:
diff changeset
454 orphan.midpoint,
d33030c8e2cc Uploaded
greg
parents:
diff changeset
455 orphan.midpoint+1,
d33030c8e2cc Uploaded
greg
parents:
diff changeset
456 orphan.value,
d33030c8e2cc Uploaded
greg
parents:
diff changeset
457 orphan.distance,
d33030c8e2cc Uploaded
greg
parents:
diff changeset
458 orphan.replicate.id))
d33030c8e2cc Uploaded
greg
parents:
diff changeset
459 if output_histogram_file:
d33030c8e2cc Uploaded
greg
parents:
diff changeset
460 tmp_histogram_path = get_temporary_plot_path(plot_format)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
461 frequency_histogram([freq], tmp_histogram_path)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
462 shutil.move(tmp_histogram_path, output_histogram)
d33030c8e2cc Uploaded
greg
parents:
diff changeset
463 return {'distribution': freq}