0
|
1 import bisect
|
|
2 import csv
|
|
3 import os
|
|
4 import sys
|
|
5 import traceback
|
1
|
6 import matplotlib
|
|
7 matplotlib.use('Agg')
|
0
|
8 from matplotlib import pyplot
|
|
9
|
|
10 DETAILS = 'D'
|
|
11 FINAL_PLOTS = 'F'
|
|
12 ORPHANS = 'O'
|
|
13 PREVIEW_PLOTS = 'P'
|
|
14 SIMPLES = 'S'
|
|
15 STATS_GRAPH = 'C'
|
|
16 GFF_EXT = 'gff'
|
|
17 TABULAR_EXT = 'tabular'
|
|
18
|
|
19 # Graph settings.
|
|
20 COLORS = 'krg'
|
|
21 Y_LABEL = 'Peak-pair counts'
|
|
22 X_LABEL = 'Peak-pair distance (bp)'
|
|
23 TICK_WIDTH = 3
|
|
24 ADJUST = [0.140, 0.9, 0.9, 0.1]
|
|
25 pyplot.rc('xtick.major', size=10.00)
|
|
26 pyplot.rc('ytick.major', size=10.00)
|
|
27 pyplot.rc('lines', linewidth=4.00)
|
|
28 pyplot.rc('axes', linewidth=3.00)
|
1
|
29 pyplot.rc('font', family='Bitstream Vera Sans', size=32.0)
|
0
|
30
|
|
31
|
|
32 class FrequencyDistribution(object):
|
|
33
|
|
34 def __init__(self, start, end, binsize=10, d=None):
|
|
35 self.start = start
|
|
36 self.end = end
|
|
37 self.dist = d or {}
|
|
38 self.binsize = binsize
|
|
39
|
|
40 def get_bin(self, x):
|
|
41 """
|
|
42 Returns the bin in which a data point falls
|
|
43 """
|
|
44 return self.start + (x-self.start) // self.binsize * self.binsize + self.binsize/2.0
|
|
45
|
|
46 def add(self, x):
|
|
47 x = self.get_bin(x)
|
|
48 self.dist[x] = self.dist.get(x, 0) + 1
|
|
49
|
|
50 def graph_series(self):
|
|
51 x = []
|
|
52 y = []
|
|
53 for i in range(self.start, self.end, self.binsize):
|
|
54 center = self.get_bin(i)
|
|
55 x.append(center)
|
|
56 y.append(self.dist.get(center, 0))
|
|
57 return x, y
|
|
58
|
|
59 def mode(self):
|
|
60 return max(self.dist.items(), key=lambda data: data[1])[0]
|
|
61
|
|
62 def size(self):
|
|
63 return sum(self.dist.values())
|
|
64
|
|
65
|
|
66 def stop_err(msg):
|
|
67 sys.stderr.write(msg)
|
|
68 sys.exit(1)
|
|
69
|
|
70
|
|
71 def distance(peak1, peak2):
|
|
72 return (peak2[1]+peak2[2])/2 - (peak1[1]+peak1[2])/2
|
|
73
|
|
74
|
|
75 def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}):
|
|
76 return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs))
|
|
77
|
|
78
|
|
79 def gff_attrs(d):
|
|
80 if not d:
|
|
81 return '.'
|
|
82 return ';'.join('%s=%s' % item for item in d.items())
|
|
83
|
|
84
|
|
85 def parse_chromosomes(reader):
|
|
86 # This version of cwpair2 accepts only gff format as input.
|
|
87 chromosomes = {}
|
|
88 reader.next()
|
|
89 for line in reader:
|
|
90 cname, junk, junk, start, end, value, strand, junk, junk = line
|
|
91 start = int(start)
|
|
92 end = int(end)
|
|
93 value = float(value)
|
|
94 if cname not in chromosomes:
|
|
95 chromosomes[cname] = []
|
|
96 peaks = chromosomes[cname]
|
|
97 peaks.append((strand, start, end, value))
|
|
98 return chromosomes
|
|
99
|
|
100
|
|
101 def perc95(chromosomes):
|
|
102 """
|
|
103 Returns the 95th percentile value of the given chromosomes.
|
|
104 """
|
|
105 values = []
|
|
106 for peaks in chromosomes.values():
|
|
107 for peak in peaks:
|
|
108 values.append(peak[3])
|
|
109 values.sort()
|
|
110 # Get 95% value
|
|
111 return values[int(len(values)*0.95)]
|
|
112
|
|
113
|
|
114 def filter(chromosomes, threshold=0.05):
|
|
115 """
|
|
116 Filters the peaks to those above a threshold. Threshold < 1.0 is interpreted
|
|
117 as a proportion of the maximum, >=1.0 as an absolute value.
|
|
118 """
|
|
119 if threshold < 1:
|
|
120 p95 = perc95(chromosomes)
|
|
121 threshold = p95 * threshold
|
|
122 # Make the threshold a proportion of the
|
|
123 for cname, peaks in chromosomes.items():
|
|
124 chromosomes[cname] = [peak for peak in peaks if peak[3] > threshold]
|
|
125
|
|
126
|
|
127 def split_strands(chromosome):
|
|
128 watson = [peak for peak in chromosome if peak[0] == '+']
|
|
129 crick = [peak for peak in chromosome if peak[0] == '-']
|
|
130 return watson, crick
|
|
131
|
|
132
|
|
133 def all_pair_distribution(chromosomes, up_distance, down_distance, binsize):
|
|
134 dist = FrequencyDistribution(-up_distance, down_distance, binsize=binsize)
|
|
135 for cname, data in chromosomes.items():
|
|
136 watson, crick = split_strands(data)
|
|
137 crick.sort(key=lambda data: float(data[1]))
|
|
138 keys = make_keys(crick)
|
|
139 for peak in watson:
|
|
140 for cpeak in get_window(crick, peak, up_distance, down_distance, keys):
|
|
141 dist.add(distance(peak, cpeak))
|
|
142 return dist
|
|
143
|
|
144
|
|
145 def make_keys(crick):
|
|
146 return [(data[1] + data[2])//2 for data in crick]
|
|
147
|
|
148
|
|
149 def get_window(crick, peak, up_distance, down_distance, keys=None):
|
|
150 """
|
|
151 Returns a window of all crick peaks within a distance of a watson peak.
|
|
152 crick strand MUST be sorted by distance
|
|
153 """
|
|
154 strand, start, end, value = peak
|
|
155 midpoint = (start + end) // 2
|
|
156 lower = midpoint - up_distance
|
|
157 upper = midpoint + down_distance
|
|
158 keys = keys or make_keys(crick)
|
|
159 start_index = bisect.bisect_left(keys, lower)
|
|
160 end_index = bisect.bisect_right(keys, upper)
|
|
161 return [cpeak for cpeak in crick[start_index:end_index]]
|
|
162
|
|
163
|
|
164 def match_largest(window, peak):
|
|
165 if not window:
|
|
166 return None
|
|
167 return max(window, key=lambda cpeak: cpeak[3])
|
|
168
|
|
169
|
|
170 def match_closest(window, peak):
|
|
171 if not window:
|
|
172 return None
|
|
173
|
|
174 def key(cpeak):
|
|
175 d = distance(peak, cpeak)
|
|
176 # Search negative distances last
|
|
177 if d < 0:
|
|
178 # And then prefer less negative distances
|
|
179 d = 10000 - d
|
|
180 return d
|
|
181 return min(window, key=key)
|
|
182
|
|
183
|
|
184 def match_mode(window, peak, mode):
|
|
185 if not window:
|
|
186 return None
|
|
187 return min(window, key=lambda cpeak: abs(distance(peak, cpeak)-mode))
|
|
188
|
|
189 METHODS = {'mode': match_mode, 'closest': match_closest, 'largest': match_largest}
|
|
190
|
|
191
|
|
192 def frequency_plot(freqs, fname, labels=[], title=''):
|
|
193 pyplot.clf()
|
|
194 pyplot.figure(figsize=(10, 10))
|
|
195 for i, freq in enumerate(freqs):
|
|
196 x, y = freq.graph_series()
|
|
197 pyplot.plot(x, y, '%s-' % COLORS[i])
|
|
198 if len(freqs) > 1:
|
|
199 pyplot.legend(labels)
|
|
200 pyplot.xlim(freq.start, freq.end)
|
|
201 pyplot.ylim(ymin=0)
|
|
202 pyplot.ylabel(Y_LABEL)
|
|
203 pyplot.xlabel(X_LABEL)
|
|
204 pyplot.subplots_adjust(left=ADJUST[0], right=ADJUST[1], top=ADJUST[2], bottom=ADJUST[3])
|
|
205 # Get the current axes
|
|
206 ax = pyplot.gca()
|
|
207 for l in ax.get_xticklines() + ax.get_yticklines():
|
|
208 l.set_markeredgewidth(TICK_WIDTH)
|
|
209 pyplot.savefig(fname)
|
|
210
|
|
211
|
|
212 def create_directories(method):
|
|
213 if method == 'all':
|
|
214 match_methods = METHODS.keys()
|
|
215 else:
|
|
216 match_methods = [method]
|
|
217 for match_method in match_methods:
|
|
218 os.mkdir('%s_%s' % (match_method, DETAILS))
|
|
219 os.mkdir('%s_%s' % (match_method, FINAL_PLOTS))
|
|
220 os.mkdir('%s_%s' % (match_method, ORPHANS))
|
|
221 os.mkdir('%s_%s' % (match_method, PREVIEW_PLOTS))
|
|
222 os.mkdir('%s_%s' % (match_method, SIMPLES))
|
|
223 os.mkdir('%s_%s' % (match_method, STATS_GRAPH))
|
|
224
|
|
225
|
|
226 def process_file(dataset_path, galaxy_hid, method, threshold, up_distance, down_distance,
|
|
227 binsize, output_files, plot_format, sort_chromosome, sort_score):
|
|
228 if method == 'all':
|
|
229 match_methods = METHODS.keys()
|
|
230 else:
|
|
231 match_methods = [method]
|
|
232 statistics = []
|
|
233 for match_method in match_methods:
|
|
234 stats = perform_process(dataset_path,
|
|
235 galaxy_hid,
|
|
236 match_method,
|
|
237 threshold,
|
|
238 up_distance,
|
|
239 down_distance,
|
|
240 binsize,
|
|
241 output_files,
|
|
242 plot_format,
|
|
243 sort_chromosome,
|
|
244 sort_score)
|
|
245 statistics.append(stats)
|
|
246 if output_files == 'all' and method == 'all':
|
|
247 frequency_plot([s['dist'] for s in statistics],
|
|
248 statistics[0]['graph_path'],
|
|
249 labels=METHODS.keys())
|
|
250 return statistics
|
|
251
|
|
252
|
|
253 def perform_process(dataset_path, galaxy_hid, method, threshold, up_distance,
|
|
254 down_distance, binsize, output_files, plot_format, sort_chromosome, sort_score):
|
|
255 output_details = output_files in ["all", "simple_orphan_detail"]
|
|
256 output_plots = output_files in ["all"]
|
|
257 output_orphans = output_files in ["all", "simple_orphan", "simple_orphan_detail"]
|
|
258 # Keep track of statistics for the output file
|
|
259 statistics = {}
|
|
260 input = csv.reader(open(dataset_path, 'rt'), delimiter='\t')
|
|
261 fpath, fname = os.path.split(dataset_path)
|
|
262 statistics['fname'] = '%s: data %s' % (method, str(galaxy_hid))
|
|
263 statistics['dir'] = fpath
|
|
264 if threshold >= 1:
|
|
265 filter_string = 'fa%d' % threshold
|
|
266 else:
|
|
267 filter_string = 'f%d' % (threshold * 100)
|
|
268 fname = 'data_%s_%su%dd%db%d' % (galaxy_hid, filter_string, up_distance, down_distance, binsize)
|
|
269
|
|
270 def make_path(output_type, extension=TABULAR_EXT):
|
|
271 # Returns the full path for a certain output.
|
|
272 return os.path.join(output_type, '%s_%s.%s' % (output_type, fname, extension))
|
|
273
|
|
274 def td_writer(output_type, extension=TABULAR_EXT):
|
|
275 # Returns a tab-delimited writer for a specified output.
|
|
276 output_file_path = make_path(output_type, extension)
|
|
277 return csv.writer(open(output_file_path, 'wt'), delimiter='\t')
|
|
278
|
|
279 try:
|
|
280 chromosomes = parse_chromosomes(input)
|
|
281 except Exception:
|
|
282 stop_err('Unable to parse file "%s".\n%s' % (dataset_path, traceback.format_exc()))
|
|
283 if output_details:
|
|
284 # Details
|
|
285 detailed_output = td_writer('%s_%s' % (method, DETAILS), extension=TABULAR_EXT)
|
|
286 detailed_output.writerow(('chrom', 'start', 'end', 'value', 'strand') * 2 + ('midpoint', 'c-w reads sum', 'c-w distance (bp)'))
|
|
287 if output_plots:
|
|
288 # Final Plot
|
|
289 final_plot_path = make_path('%s_%s' % (method, FINAL_PLOTS), plot_format)
|
|
290 if output_orphans:
|
|
291 # Orphans
|
|
292 orphan_output = td_writer('%s_%s' % (method, ORPHANS), extension=TABULAR_EXT)
|
|
293 orphan_output.writerow(('chrom', 'strand', 'start', 'end', 'value'))
|
|
294 if output_plots:
|
|
295 # Preview Plot
|
|
296 preview_plot_path = make_path('%s_%s' % (method, PREVIEW_PLOTS), plot_format)
|
|
297 # Simple
|
|
298 simple_output = td_writer('%s_%s' % (method, SIMPLES), extension=GFF_EXT)
|
|
299 statistics['stats_path'] = 'statistics.%s' % TABULAR_EXT
|
|
300 if output_plots:
|
|
301 statistics['graph_path'] = make_path('%s_%s' % (method, STATS_GRAPH), plot_format)
|
|
302 statistics['perc95'] = perc95(chromosomes)
|
|
303 if threshold > 0:
|
|
304 # Apply filter
|
|
305 filter(chromosomes, threshold)
|
|
306 if method == 'mode':
|
|
307 freq = all_pair_distribution(chromosomes, up_distance, down_distance, binsize)
|
|
308 mode = freq.mode()
|
|
309 statistics['preview_mode'] = mode
|
|
310 if output_plots:
|
|
311 frequency_plot([freq], preview_plot_path, title='Preview frequency plot')
|
|
312 else:
|
|
313 statistics['preview_mode'] = 'NA'
|
|
314 dist = FrequencyDistribution(-up_distance, down_distance, binsize=binsize)
|
|
315 orphans = 0
|
|
316 # x will be used to archive the summary dataset
|
|
317 x = []
|
|
318 for cname, chromosome in chromosomes.items():
|
|
319 # Each peak is (strand, start, end, value)
|
|
320 watson, crick = split_strands(chromosome)
|
|
321 # Sort by value of each peak
|
|
322 watson.sort(key=lambda data: -float(data[3]))
|
|
323 # Sort by position to facilitate binary search
|
|
324 crick.sort(key=lambda data: float(data[1]))
|
|
325 keys = make_keys(crick)
|
|
326 for peak in watson:
|
|
327 window = get_window(crick, peak, up_distance, down_distance, keys)
|
|
328 if method == 'mode':
|
|
329 match = match_mode(window, peak, mode)
|
|
330 else:
|
|
331 match = METHODS[method](window, peak)
|
|
332 if match:
|
|
333 midpoint = (match[1] + match[2] + peak[1] + peak[2]) // 4
|
|
334 d = distance(peak, match)
|
|
335 dist.add(d)
|
|
336 # Simple output in gff format.
|
|
337 x.append(gff_row(cname,
|
|
338 source='cwpair',
|
|
339 start=midpoint,
|
|
340 end=midpoint + 1,
|
|
341 score=peak[3] + match[3],
|
|
342 attrs={'cw_distance': d}))
|
|
343 if output_details:
|
|
344 detailed_output.writerow((cname,
|
|
345 peak[1],
|
|
346 peak[2],
|
|
347 peak[3],
|
|
348 '+',
|
|
349 cname,
|
|
350 match[1],
|
|
351 match[2],
|
|
352 match[3], '-',
|
|
353 midpoint,
|
|
354 peak[3]+match[3],
|
|
355 d))
|
|
356 i = bisect.bisect_left(keys, (match[1]+match[2])/2)
|
|
357 del crick[i]
|
|
358 del keys[i]
|
|
359 else:
|
|
360 if output_orphans:
|
|
361 orphan_output.writerow((cname, peak[0], peak[1], peak[2], peak[3]))
|
|
362 # Keep track of orphans for statistics.
|
|
363 orphans += 1
|
|
364 # Remaining crick peaks are orphans
|
|
365 if output_orphans:
|
|
366 for cpeak in crick:
|
|
367 orphan_output.writerow((cname, cpeak[0], cpeak[1], cpeak[2], cpeak[3]))
|
|
368 # Keep track of orphans for statistics.
|
|
369 orphans += len(crick)
|
|
370 # Sort output by chromosome if specified.
|
|
371 if sort_chromosome == "asc":
|
|
372 try:
|
|
373 x.sort(key=lambda data: int(data[3]))
|
|
374 x.sort(key=lambda data: int(data[0]))
|
|
375 except:
|
|
376 # Cannot sort because chromosome number is not a numeric.
|
|
377 pass
|
|
378 elif sort_chromosome == "desc":
|
|
379 try:
|
|
380 x.sort(key=lambda data: int(data[0]), reverse=True)
|
|
381 x.sort(key=lambda data: int(data[3]), reverse=True)
|
|
382 except:
|
|
383 # Cannot sort because chromosome number is not a numeric.
|
|
384 pass
|
|
385 # Sort output by score if specified.
|
|
386 if sort_score == "desc":
|
|
387 x.sort(key=lambda data: float(data[5]), reverse=True)
|
|
388 elif sort_score == "asc":
|
|
389 x.sort(key=lambda data: float(data[5]))
|
|
390 # Writing a summary to txt or gff format file
|
|
391 for row in x:
|
|
392 row_tmp = list(row)
|
|
393 # Dataset in tuple cannot be modified in Python, so row will
|
|
394 # be converted to list format to add 'chr'.
|
|
395 if row_tmp[0] == "999":
|
|
396 row_tmp[0] = 'chrM'
|
|
397 elif row_tmp[0] == "998":
|
|
398 row_tmp[0] = 'chrY'
|
|
399 elif row_tmp[0] == "997":
|
|
400 row_tmp[0] = 'chrX'
|
|
401 else:
|
|
402 row_tmp[0] = row_tmp[0]
|
|
403 # Print row_tmp.
|
|
404 simple_output.writerow(row_tmp)
|
|
405 statistics['paired'] = dist.size() * 2
|
|
406 statistics['orphans'] = orphans
|
|
407 statistics['final_mode'] = dist.mode()
|
|
408 if output_plots:
|
|
409 frequency_plot([dist], final_plot_path, title='Frequency distribution')
|
|
410 statistics['dist'] = dist
|
|
411 return statistics
|