0
|
1 import bisect
|
|
2 import csv
|
|
3 import os
|
|
4 import shutil
|
|
5 import sys
|
|
6 import tempfile
|
1
|
7 import matplotlib
|
2
|
8 matplotlib.use('Agg')
|
0
|
9 from matplotlib import pyplot
|
|
10
|
|
11 # Graph settings
|
|
12 Y_LABEL = 'Counts'
|
|
13 X_LABEL = 'Number of matched replicates'
|
|
14 TICK_WIDTH = 3
|
|
15 # Amount to shift the graph to make labels fit, [left, right, top, bottom]
|
|
16 ADJUST = [0.180, 0.9, 0.9, 0.1]
|
|
17 # Length of tick marks, use TICK_WIDTH for width
|
|
18 pyplot.rc('xtick.major', size=10.00)
|
|
19 pyplot.rc('ytick.major', size=10.00)
|
|
20 pyplot.rc('lines', linewidth=4.00)
|
|
21 pyplot.rc('axes', linewidth=3.00)
|
3
|
22 pyplot.rc('font', family='Bitstream Vera Sans', size=32.0)
|
0
|
23
|
|
24 PLOT_FORMATS = ['png', 'pdf', 'svg']
|
|
25 COLORS = 'krb'
|
|
26
|
|
27
|
|
28 class Replicate(object):
|
|
29
|
|
30 def __init__(self, id, dataset_path):
|
|
31 self.id = id
|
|
32 self.dataset_path = dataset_path
|
|
33 self.parse(csv.reader(open(dataset_path, 'rt'), delimiter='\t'))
|
|
34
|
|
35 def parse(self, reader):
|
|
36 self.chromosomes = {}
|
|
37 for line in reader:
|
|
38 if line[0].startswith("#") or line[0].startswith('"'):
|
|
39 continue
|
|
40 cname, junk, junk, mid, midplus, value, strand, junk, attrs = line
|
|
41 attrs = parse_gff_attrs(attrs)
|
|
42 distance = attrs['cw_distance']
|
|
43 mid = int(mid)
|
|
44 midplus = int(midplus)
|
|
45 value = float(value)
|
|
46 distance = int(distance)
|
|
47 if cname not in self.chromosomes:
|
|
48 self.chromosomes[cname] = Chromosome(cname)
|
|
49 chrom = self.chromosomes[cname]
|
|
50 chrom.add_peak(Peak(cname, mid, value, distance, self))
|
|
51 for chrom in self.chromosomes.values():
|
|
52 chrom.sort_by_index()
|
|
53
|
|
54 def filter(self, up_limit, low_limit):
|
|
55 for chrom in self.chromosomes.values():
|
|
56 chrom.filter(up_limit, low_limit)
|
|
57
|
|
58 def size(self):
|
|
59 return sum([len(c.peaks) for c in self.chromosomes.values()])
|
|
60
|
|
61
|
|
62 class Chromosome(object):
|
|
63
|
|
64 def __init__(self, name):
|
|
65 self.name = name
|
|
66 self.peaks = []
|
|
67
|
|
68 def add_peak(self, peak):
|
|
69 self.peaks.append(peak)
|
|
70
|
|
71 def sort_by_index(self):
|
|
72 self.peaks.sort(key=lambda peak: peak.midpoint)
|
|
73 self.keys = make_keys(self.peaks)
|
|
74
|
|
75 def remove_peak(self, peak):
|
|
76 i = bisect.bisect_left(self.keys, peak.midpoint)
|
|
77 # If the peak was actually found
|
|
78 if i < len(self.peaks) and self.peaks[i].midpoint == peak.midpoint:
|
|
79 del self.keys[i]
|
|
80 del self.peaks[i]
|
|
81
|
|
82 def filter(self, up_limit, low_limit):
|
|
83 self.peaks = [p for p in self.peaks if low_limit <= p.distance <= up_limit]
|
|
84 self.keys = make_keys(self.peaks)
|
|
85
|
|
86
|
|
87 class Peak(object):
|
|
88
|
|
89 def __init__(self, chrom, midpoint, value, distance, replicate):
|
|
90 self.chrom = chrom
|
|
91 self.value = value
|
|
92 self.midpoint = midpoint
|
|
93 self.distance = distance
|
|
94 self.replicate = replicate
|
|
95
|
|
96 def normalized_value(self, med):
|
|
97 return self.value * med / self.replicate.median
|
|
98
|
|
99
|
|
100 class PeakGroup(object):
|
|
101
|
|
102 def __init__(self):
|
|
103 self.peaks = {}
|
|
104
|
|
105 def add_peak(self, repid, peak):
|
|
106 self.peaks[repid] = peak
|
|
107
|
|
108 @property
|
|
109 def chrom(self):
|
|
110 return self.peaks.values()[0].chrom
|
|
111
|
|
112 @property
|
|
113 def midpoint(self):
|
|
114 return median([peak.midpoint for peak in self.peaks.values()])
|
|
115
|
|
116 @property
|
|
117 def num_replicates(self):
|
|
118 return len(self.peaks)
|
|
119
|
|
120 @property
|
|
121 def median_distance(self):
|
|
122 return median([peak.distance for peak in self.peaks.values()])
|
|
123
|
|
124 @property
|
|
125 def value_sum(self):
|
|
126 return sum([peak.value for peak in self.peaks.values()])
|
|
127
|
|
128 def normalized_value(self, med):
|
|
129 values = []
|
|
130 for peak in self.peaks.values():
|
|
131 values.append(peak.normalized_value(med))
|
|
132 return median(values)
|
|
133
|
|
134 @property
|
|
135 def peakpeak_distance(self):
|
|
136 keys = self.peaks.keys()
|
|
137 return abs(self.peaks[keys[0]].midpoint - self.peaks[keys[1]].midpoint)
|
|
138
|
|
139
|
|
140 class FrequencyDistribution(object):
|
|
141
|
|
142 def __init__(self, d=None):
|
|
143 self.dist = d or {}
|
|
144
|
|
145 def add(self, x):
|
|
146 self.dist[x] = self.dist.get(x, 0) + 1
|
|
147
|
|
148 def graph_series(self):
|
|
149 x = []
|
|
150 y = []
|
|
151 for key, val in self.dist.items():
|
|
152 x.append(key)
|
|
153 y.append(val)
|
|
154 return x, y
|
|
155
|
|
156 def mode(self):
|
|
157 return max(self.dist.items(), key=lambda data: data[1])[0]
|
|
158
|
|
159 def size(self):
|
|
160 return sum(self.dist.values())
|
|
161
|
|
162
|
|
163 def stop_err(msg):
|
|
164 sys.stderr.write(msg)
|
|
165 sys.exit(1)
|
|
166
|
|
167
|
|
168 def median(data):
|
|
169 """
|
|
170 Find the integer median of the data set.
|
|
171 """
|
|
172 if not data:
|
|
173 return 0
|
|
174 sdata = sorted(data)
|
|
175 if len(data) % 2 == 0:
|
|
176 return (sdata[len(data)//2] + sdata[len(data)//2-1]) / 2
|
|
177 else:
|
|
178 return sdata[len(data)//2]
|
|
179
|
|
180
|
|
181 def make_keys(peaks):
|
|
182 return [data.midpoint for data in peaks]
|
|
183
|
|
184
|
|
185 def get_window(chromosome, target_peaks, distance):
|
|
186 """
|
|
187 Returns a window of all peaks from a replicate within a certain distance of
|
|
188 a peak from another replicate.
|
|
189 """
|
|
190 lower = target_peaks[0].midpoint
|
|
191 upper = target_peaks[0].midpoint
|
|
192 for peak in target_peaks:
|
|
193 lower = min(lower, peak.midpoint - distance)
|
|
194 upper = max(upper, peak.midpoint + distance)
|
|
195 start_index = bisect.bisect_left(chromosome.keys, lower)
|
|
196 end_index = bisect.bisect_right(chromosome.keys, upper)
|
|
197 return (chromosome.peaks[start_index: end_index], chromosome.name)
|
|
198
|
|
199
|
|
200 def match_largest(window, peak, chrum):
|
|
201 if not window:
|
|
202 return None
|
|
203 if peak.chrom != chrum:
|
|
204 return None
|
|
205 return max(window, key=lambda cpeak: cpeak.value)
|
|
206
|
|
207
|
|
208 def match_closest(window, peak, chrum):
|
|
209 if not window:
|
|
210 return None
|
|
211 if peak.chrom != chrum:
|
|
212 return None
|
|
213 return min(window, key=lambda match: abs(match.midpoint - peak.midpoint))
|
|
214
|
|
215
|
|
216 def frequency_histogram(freqs, dataset_path, labels=[], title=''):
|
|
217 pyplot.clf()
|
|
218 pyplot.figure(figsize=(10, 10))
|
|
219 for i, freq in enumerate(freqs):
|
|
220 xvals, yvals = freq.graph_series()
|
|
221 # Go from high to low
|
|
222 xvals.reverse()
|
|
223 pyplot.bar([x-0.4 + 0.8/len(freqs)*i for x in xvals], yvals, width=0.8/len(freqs), color=COLORS[i])
|
|
224 pyplot.xticks(range(min(xvals), max(xvals)+1), map(str, reversed(range(min(xvals), max(xvals)+1))))
|
|
225 pyplot.xlabel(X_LABEL)
|
|
226 pyplot.ylabel(Y_LABEL)
|
|
227 pyplot.subplots_adjust(left=ADJUST[0], right=ADJUST[1], top=ADJUST[2], bottom=ADJUST[3])
|
|
228 ax = pyplot.gca()
|
|
229 for l in ax.get_xticklines() + ax.get_yticklines():
|
|
230 l.set_markeredgewidth(TICK_WIDTH)
|
|
231 pyplot.savefig(dataset_path)
|
|
232
|
|
233
|
|
234 METHODS = {'closest': match_closest, 'largest': match_largest}
|
|
235
|
|
236
|
|
237 def gff_attrs(d):
|
|
238 if not d:
|
|
239 return '.'
|
|
240 return ';'.join('%s=%s' % item for item in d.items())
|
|
241
|
|
242
|
|
243 def parse_gff_attrs(s):
|
|
244 d = {}
|
|
245 if s == '.':
|
|
246 return d
|
|
247 for item in s.split(';'):
|
|
248 key, val = item.split('=')
|
|
249 d[key] = val
|
|
250 return d
|
|
251
|
|
252
|
|
253 def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}):
|
|
254 return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs))
|
|
255
|
|
256
|
|
257 def get_temporary_plot_path(plot_format):
|
|
258 """
|
|
259 Return the path to a temporary file with a valid image format
|
|
260 file extension that can be used with bioformats.
|
|
261 """
|
|
262 tmp_dir = tempfile.mkdtemp(prefix='tmp-repmatch-')
|
|
263 fd, name = tempfile.mkstemp(suffix='.%s' % plot_format, dir=tmp_dir)
|
|
264 os.close(fd)
|
|
265 return name
|
|
266
|
|
267
|
|
268 def process_files(dataset_paths, galaxy_hids, method, distance, step, replicates, up_limit, low_limit, output_files,
|
|
269 plot_format, output_summary, output_orphan, output_detail, output_key, output_histogram):
|
|
270 output_histogram_file = output_files in ["all"] and method in ["all"]
|
|
271 if len(dataset_paths) < 2:
|
|
272 return
|
|
273 if method == 'all':
|
|
274 match_methods = METHODS.keys()
|
|
275 else:
|
|
276 match_methods = [method]
|
|
277 for match_method in match_methods:
|
|
278 statistics = perform_process(dataset_paths,
|
|
279 galaxy_hids,
|
|
280 match_method,
|
|
281 distance,
|
|
282 step,
|
|
283 replicates,
|
|
284 up_limit,
|
|
285 low_limit,
|
|
286 output_files,
|
|
287 plot_format,
|
|
288 output_summary,
|
|
289 output_orphan,
|
|
290 output_detail,
|
|
291 output_key,
|
|
292 output_histogram)
|
|
293 if output_histogram_file:
|
|
294 tmp_histogram_path = get_temporary_plot_path(plot_format)
|
|
295 frequency_histogram([stat['distribution'] for stat in [statistics]],
|
|
296 tmp_histogram_path,
|
|
297 METHODS.keys())
|
|
298 shutil.move(tmp_histogram_path, output_histogram)
|
|
299
|
|
300
|
|
301 def perform_process(dataset_paths, galaxy_hids, method, distance, step, num_required, up_limit, low_limit, output_files,
|
|
302 plot_format, output_summary, output_orphan, output_detail, output_key, output_histogram):
|
|
303 output_detail_file = output_files in ["all"] and output_detail is not None
|
|
304 output_key_file = output_files in ["all"] and output_key is not None
|
|
305 output_orphan_file = output_files in ["all", "simple_orphan"] and output_orphan is not None
|
|
306 output_histogram_file = output_files in ["all"] and output_histogram is not None
|
|
307 replicates = []
|
|
308 for i, dataset_path in enumerate(dataset_paths):
|
|
309 try:
|
|
310 galaxy_hid = galaxy_hids[i]
|
|
311 r = Replicate(galaxy_hid, dataset_path)
|
|
312 replicates.append(r)
|
|
313 except Exception, e:
|
|
314 stop_err('Unable to parse file "%s", exception: %s' % (dataset_path, str(e)))
|
|
315 attrs = 'd%sr%s' % (distance, num_required)
|
|
316 if up_limit != 1000:
|
|
317 attrs += 'u%d' % up_limit
|
|
318 if low_limit != -1000:
|
|
319 attrs += 'l%d' % low_limit
|
|
320 if step != 0:
|
|
321 attrs += 's%d' % step
|
|
322
|
|
323 def td_writer(file_path):
|
|
324 # Returns a tab-delimited writer for a certain output
|
|
325 return csv.writer(open(file_path, 'wt'), delimiter='\t')
|
|
326
|
|
327 labels = ('chrom',
|
|
328 'median midpoint',
|
|
329 'median midpoint+1',
|
|
330 'median normalized reads',
|
|
331 'replicates',
|
|
332 'median c-w distance',
|
|
333 'reads sum')
|
|
334 for replicate in replicates:
|
|
335 labels += ('chrom',
|
|
336 'median midpoint',
|
|
337 'median midpoint+1',
|
|
338 'c-w sum',
|
|
339 'c-w distance',
|
|
340 'replicate id')
|
|
341 summary_output = td_writer(output_summary)
|
|
342 if output_key_file:
|
|
343 key_output = td_writer(output_key)
|
|
344 key_output.writerow(('data', 'median read count'))
|
|
345 if output_detail_file:
|
|
346 detail_output = td_writer(output_detail)
|
|
347 detail_output.writerow(labels)
|
|
348 if output_orphan_file:
|
|
349 orphan_output = td_writer(output_orphan)
|
|
350 orphan_output.writerow(('chrom', 'midpoint', 'midpoint+1', 'c-w sum', 'c-w distance', 'replicate id'))
|
|
351 # Perform filtering
|
|
352 if up_limit < 1000 or low_limit > -1000:
|
|
353 for replicate in replicates:
|
|
354 replicate.filter(up_limit, low_limit)
|
|
355 # Actually merge the peaks
|
|
356 peak_groups = []
|
|
357 orphans = []
|
|
358 freq = FrequencyDistribution()
|
|
359
|
|
360 def do_match(reps, distance):
|
|
361 # Copy list because we will mutate it, but keep replicate references.
|
|
362 reps = reps[:]
|
|
363 while len(reps) > 1:
|
|
364 # Iterate over each replicate as "main"
|
|
365 main = reps[0]
|
|
366 reps.remove(main)
|
|
367 for chromosome in main.chromosomes.values():
|
|
368 peaks_by_value = chromosome.peaks[:]
|
|
369 # Sort main replicate by value
|
|
370 peaks_by_value.sort(key=lambda peak: -peak.value)
|
|
371
|
|
372 def search_for_matches(group):
|
|
373 # Here we use multiple passes, expanding the window to be
|
|
374 # +- distance from any previously matched peak.
|
|
375 while True:
|
|
376 new_match = False
|
|
377 for replicate in reps:
|
|
378 if replicate.id in group.peaks:
|
|
379 # Stop if match already found for this replicate
|
|
380 continue
|
|
381 try:
|
|
382 # Lines changed to remove a major bug by Rohit Reja.
|
|
383 window, chrum = get_window(replicate.chromosomes[chromosome.name],
|
|
384 group.peaks.values(),
|
|
385 distance)
|
|
386 match = METHODS[method](window, peak, chrum)
|
|
387 except KeyError:
|
|
388 continue
|
|
389 if match:
|
|
390 group.add_peak(replicate.id, match)
|
|
391 new_match = True
|
|
392 if not new_match:
|
|
393 break
|
|
394 # Attempt to enlarge existing peak groups
|
|
395 for group in peak_groups:
|
|
396 old_peaks = group.peaks.values()[:]
|
|
397 search_for_matches(group)
|
|
398 for peak in group.peaks.values():
|
|
399 if peak not in old_peaks:
|
|
400 peak.replicate.chromosomes[chromosome.name].remove_peak(peak)
|
|
401 # Attempt to find new peaks groups. For each peak in the
|
|
402 # main replicate, search for matches in the other replicates
|
|
403 for peak in peaks_by_value:
|
|
404 matches = PeakGroup()
|
|
405 matches.add_peak(main.id, peak)
|
|
406 search_for_matches(matches)
|
|
407 # Were enough replicates matched?
|
|
408 if matches.num_replicates >= num_required:
|
|
409 for peak in matches.peaks.values():
|
|
410 peak.replicate.chromosomes[chromosome.name].remove_peak(peak)
|
|
411 peak_groups.append(matches)
|
|
412 # Zero or less = no stepping
|
|
413 if step <= 0:
|
|
414 do_match(replicates, distance)
|
|
415 else:
|
|
416 for d in range(0, distance, step):
|
|
417 do_match(replicates, d)
|
|
418 for group in peak_groups:
|
|
419 freq.add(group.num_replicates)
|
|
420 # Collect together the remaining orphans
|
|
421 for replicate in replicates:
|
|
422 for chromosome in replicate.chromosomes.values():
|
|
423 for peak in chromosome.peaks:
|
|
424 freq.add(1)
|
|
425 orphans.append(peak)
|
|
426 # Average the orphan count in the graph by # replicates
|
|
427 med = median([peak.value for group in peak_groups for peak in group.peaks.values()])
|
|
428 for replicate in replicates:
|
|
429 replicate.median = median([peak.value for group in peak_groups for peak in group.peaks.values() if peak.replicate == replicate])
|
|
430 key_output.writerow((replicate.id, replicate.median))
|
|
431 for group in peak_groups:
|
|
432 # Output summary (matched pairs).
|
|
433 summary_output.writerow(gff_row(cname=group.chrom,
|
|
434 start=group.midpoint,
|
|
435 end=group.midpoint+1,
|
|
436 source='repmatch',
|
|
437 score=group.normalized_value(med),
|
|
438 attrs={'median_distance': group.median_distance,
|
|
439 'replicates': group.num_replicates,
|
|
440 'value_sum': group.value_sum}))
|
|
441 if output_detail_file:
|
|
442 summary = (group.chrom,
|
|
443 group.midpoint,
|
|
444 group.midpoint+1,
|
|
445 group.normalized_value(med),
|
|
446 group.num_replicates,
|
|
447 group.median_distance,
|
|
448 group.value_sum)
|
|
449 for peak in group.peaks.values():
|
|
450 summary += (peak.chrom, peak.midpoint, peak.midpoint+1, peak.value, peak.distance, peak.replicate.id)
|
|
451 detail_output.writerow(summary)
|
|
452 if output_orphan_file:
|
|
453 for orphan in orphans:
|
|
454 orphan_output.writerow((orphan.chrom,
|
|
455 orphan.midpoint,
|
|
456 orphan.midpoint+1,
|
|
457 orphan.value,
|
|
458 orphan.distance,
|
|
459 orphan.replicate.id))
|
|
460 if output_histogram_file:
|
|
461 tmp_histogram_path = get_temporary_plot_path(plot_format)
|
|
462 frequency_histogram([freq], tmp_histogram_path)
|
|
463 shutil.move(tmp_histogram_path, output_histogram)
|
|
464 return {'distribution': freq}
|