comparison cwpair2_util.py @ 0:507521bf124a draft

Uploaded
author greg
date Tue, 17 Nov 2015 14:06:28 -0500
parents
children 4d86371aafa8
comparison
equal deleted inserted replaced
-1:000000000000 0:507521bf124a
1 import bisect
2 import csv
3 import os
4 import sys
5 import traceback
6
7 from matplotlib import pyplot
8
9 DETAILS = 'D'
10 FINAL_PLOTS = 'F'
11 ORPHANS = 'O'
12 PREVIEW_PLOTS = 'P'
13 SIMPLES = 'S'
14 STATS_GRAPH = 'C'
15 GFF_EXT = 'gff'
16 TABULAR_EXT = 'tabular'
17
18 # Graph settings.
19 COLORS = 'krg'
20 Y_LABEL = 'Peak-pair counts'
21 X_LABEL = 'Peak-pair distance (bp)'
22 TICK_WIDTH = 3
23 ADJUST = [0.140, 0.9, 0.9, 0.1]
24 pyplot.rc('xtick.major', size=10.00)
25 pyplot.rc('ytick.major', size=10.00)
26 pyplot.rc('lines', linewidth=4.00)
27 pyplot.rc('axes', linewidth=3.00)
28 pyplot.rc('font', family='Arial', size=32.0)
29
30
31 class FrequencyDistribution(object):
32
33 def __init__(self, start, end, binsize=10, d=None):
34 self.start = start
35 self.end = end
36 self.dist = d or {}
37 self.binsize = binsize
38
39 def get_bin(self, x):
40 """
41 Returns the bin in which a data point falls
42 """
43 return self.start + (x-self.start) // self.binsize * self.binsize + self.binsize/2.0
44
45 def add(self, x):
46 x = self.get_bin(x)
47 self.dist[x] = self.dist.get(x, 0) + 1
48
49 def graph_series(self):
50 x = []
51 y = []
52 for i in range(self.start, self.end, self.binsize):
53 center = self.get_bin(i)
54 x.append(center)
55 y.append(self.dist.get(center, 0))
56 return x, y
57
58 def mode(self):
59 return max(self.dist.items(), key=lambda data: data[1])[0]
60
61 def size(self):
62 return sum(self.dist.values())
63
64
65 def stop_err(msg):
66 sys.stderr.write(msg)
67 sys.exit(1)
68
69
70 def distance(peak1, peak2):
71 return (peak2[1]+peak2[2])/2 - (peak1[1]+peak1[2])/2
72
73
74 def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}):
75 return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs))
76
77
78 def gff_attrs(d):
79 if not d:
80 return '.'
81 return ';'.join('%s=%s' % item for item in d.items())
82
83
84 def parse_chromosomes(reader):
85 # This version of cwpair2 accepts only gff format as input.
86 chromosomes = {}
87 reader.next()
88 for line in reader:
89 cname, junk, junk, start, end, value, strand, junk, junk = line
90 start = int(start)
91 end = int(end)
92 value = float(value)
93 if cname not in chromosomes:
94 chromosomes[cname] = []
95 peaks = chromosomes[cname]
96 peaks.append((strand, start, end, value))
97 return chromosomes
98
99
100 def perc95(chromosomes):
101 """
102 Returns the 95th percentile value of the given chromosomes.
103 """
104 values = []
105 for peaks in chromosomes.values():
106 for peak in peaks:
107 values.append(peak[3])
108 values.sort()
109 # Get 95% value
110 return values[int(len(values)*0.95)]
111
112
113 def filter(chromosomes, threshold=0.05):
114 """
115 Filters the peaks to those above a threshold. Threshold < 1.0 is interpreted
116 as a proportion of the maximum, >=1.0 as an absolute value.
117 """
118 if threshold < 1:
119 p95 = perc95(chromosomes)
120 threshold = p95 * threshold
121 # Make the threshold a proportion of the
122 for cname, peaks in chromosomes.items():
123 chromosomes[cname] = [peak for peak in peaks if peak[3] > threshold]
124
125
126 def split_strands(chromosome):
127 watson = [peak for peak in chromosome if peak[0] == '+']
128 crick = [peak for peak in chromosome if peak[0] == '-']
129 return watson, crick
130
131
132 def all_pair_distribution(chromosomes, up_distance, down_distance, binsize):
133 dist = FrequencyDistribution(-up_distance, down_distance, binsize=binsize)
134 for cname, data in chromosomes.items():
135 watson, crick = split_strands(data)
136 crick.sort(key=lambda data: float(data[1]))
137 keys = make_keys(crick)
138 for peak in watson:
139 for cpeak in get_window(crick, peak, up_distance, down_distance, keys):
140 dist.add(distance(peak, cpeak))
141 return dist
142
143
144 def make_keys(crick):
145 return [(data[1] + data[2])//2 for data in crick]
146
147
148 def get_window(crick, peak, up_distance, down_distance, keys=None):
149 """
150 Returns a window of all crick peaks within a distance of a watson peak.
151 crick strand MUST be sorted by distance
152 """
153 strand, start, end, value = peak
154 midpoint = (start + end) // 2
155 lower = midpoint - up_distance
156 upper = midpoint + down_distance
157 keys = keys or make_keys(crick)
158 start_index = bisect.bisect_left(keys, lower)
159 end_index = bisect.bisect_right(keys, upper)
160 return [cpeak for cpeak in crick[start_index:end_index]]
161
162
163 def match_largest(window, peak):
164 if not window:
165 return None
166 return max(window, key=lambda cpeak: cpeak[3])
167
168
169 def match_closest(window, peak):
170 if not window:
171 return None
172
173 def key(cpeak):
174 d = distance(peak, cpeak)
175 # Search negative distances last
176 if d < 0:
177 # And then prefer less negative distances
178 d = 10000 - d
179 return d
180 return min(window, key=key)
181
182
183 def match_mode(window, peak, mode):
184 if not window:
185 return None
186 return min(window, key=lambda cpeak: abs(distance(peak, cpeak)-mode))
187
188 METHODS = {'mode': match_mode, 'closest': match_closest, 'largest': match_largest}
189
190
191 def frequency_plot(freqs, fname, labels=[], title=''):
192 pyplot.clf()
193 pyplot.figure(figsize=(10, 10))
194 for i, freq in enumerate(freqs):
195 x, y = freq.graph_series()
196 pyplot.plot(x, y, '%s-' % COLORS[i])
197 if len(freqs) > 1:
198 pyplot.legend(labels)
199 pyplot.xlim(freq.start, freq.end)
200 pyplot.ylim(ymin=0)
201 pyplot.ylabel(Y_LABEL)
202 pyplot.xlabel(X_LABEL)
203 pyplot.subplots_adjust(left=ADJUST[0], right=ADJUST[1], top=ADJUST[2], bottom=ADJUST[3])
204 # Get the current axes
205 ax = pyplot.gca()
206 for l in ax.get_xticklines() + ax.get_yticklines():
207 l.set_markeredgewidth(TICK_WIDTH)
208 pyplot.savefig(fname)
209
210
211 def create_directories(method):
212 if method == 'all':
213 match_methods = METHODS.keys()
214 else:
215 match_methods = [method]
216 for match_method in match_methods:
217 os.mkdir('%s_%s' % (match_method, DETAILS))
218 os.mkdir('%s_%s' % (match_method, FINAL_PLOTS))
219 os.mkdir('%s_%s' % (match_method, ORPHANS))
220 os.mkdir('%s_%s' % (match_method, PREVIEW_PLOTS))
221 os.mkdir('%s_%s' % (match_method, SIMPLES))
222 os.mkdir('%s_%s' % (match_method, STATS_GRAPH))
223
224
225 def process_file(dataset_path, galaxy_hid, method, threshold, up_distance, down_distance,
226 binsize, output_files, plot_format, sort_chromosome, sort_score):
227 if method == 'all':
228 match_methods = METHODS.keys()
229 else:
230 match_methods = [method]
231 statistics = []
232 for match_method in match_methods:
233 stats = perform_process(dataset_path,
234 galaxy_hid,
235 match_method,
236 threshold,
237 up_distance,
238 down_distance,
239 binsize,
240 output_files,
241 plot_format,
242 sort_chromosome,
243 sort_score)
244 statistics.append(stats)
245 if output_files == 'all' and method == 'all':
246 frequency_plot([s['dist'] for s in statistics],
247 statistics[0]['graph_path'],
248 labels=METHODS.keys())
249 return statistics
250
251
252 def perform_process(dataset_path, galaxy_hid, method, threshold, up_distance,
253 down_distance, binsize, output_files, plot_format, sort_chromosome, sort_score):
254 output_details = output_files in ["all", "simple_orphan_detail"]
255 output_plots = output_files in ["all"]
256 output_orphans = output_files in ["all", "simple_orphan", "simple_orphan_detail"]
257 # Keep track of statistics for the output file
258 statistics = {}
259 input = csv.reader(open(dataset_path, 'rt'), delimiter='\t')
260 fpath, fname = os.path.split(dataset_path)
261 statistics['fname'] = '%s: data %s' % (method, str(galaxy_hid))
262 statistics['dir'] = fpath
263 if threshold >= 1:
264 filter_string = 'fa%d' % threshold
265 else:
266 filter_string = 'f%d' % (threshold * 100)
267 fname = 'data_%s_%su%dd%db%d' % (galaxy_hid, filter_string, up_distance, down_distance, binsize)
268
269 def make_path(output_type, extension=TABULAR_EXT):
270 # Returns the full path for a certain output.
271 return os.path.join(output_type, '%s_%s.%s' % (output_type, fname, extension))
272
273 def td_writer(output_type, extension=TABULAR_EXT):
274 # Returns a tab-delimited writer for a specified output.
275 output_file_path = make_path(output_type, extension)
276 return csv.writer(open(output_file_path, 'wt'), delimiter='\t')
277
278 try:
279 chromosomes = parse_chromosomes(input)
280 except Exception:
281 stop_err('Unable to parse file "%s".\n%s' % (dataset_path, traceback.format_exc()))
282 if output_details:
283 # Details
284 detailed_output = td_writer('%s_%s' % (method, DETAILS), extension=TABULAR_EXT)
285 detailed_output.writerow(('chrom', 'start', 'end', 'value', 'strand') * 2 + ('midpoint', 'c-w reads sum', 'c-w distance (bp)'))
286 if output_plots:
287 # Final Plot
288 final_plot_path = make_path('%s_%s' % (method, FINAL_PLOTS), plot_format)
289 if output_orphans:
290 # Orphans
291 orphan_output = td_writer('%s_%s' % (method, ORPHANS), extension=TABULAR_EXT)
292 orphan_output.writerow(('chrom', 'strand', 'start', 'end', 'value'))
293 if output_plots:
294 # Preview Plot
295 preview_plot_path = make_path('%s_%s' % (method, PREVIEW_PLOTS), plot_format)
296 # Simple
297 simple_output = td_writer('%s_%s' % (method, SIMPLES), extension=GFF_EXT)
298 statistics['stats_path'] = 'statistics.%s' % TABULAR_EXT
299 if output_plots:
300 statistics['graph_path'] = make_path('%s_%s' % (method, STATS_GRAPH), plot_format)
301 statistics['perc95'] = perc95(chromosomes)
302 if threshold > 0:
303 # Apply filter
304 filter(chromosomes, threshold)
305 if method == 'mode':
306 freq = all_pair_distribution(chromosomes, up_distance, down_distance, binsize)
307 mode = freq.mode()
308 statistics['preview_mode'] = mode
309 if output_plots:
310 frequency_plot([freq], preview_plot_path, title='Preview frequency plot')
311 else:
312 statistics['preview_mode'] = 'NA'
313 dist = FrequencyDistribution(-up_distance, down_distance, binsize=binsize)
314 orphans = 0
315 # x will be used to archive the summary dataset
316 x = []
317 for cname, chromosome in chromosomes.items():
318 # Each peak is (strand, start, end, value)
319 watson, crick = split_strands(chromosome)
320 # Sort by value of each peak
321 watson.sort(key=lambda data: -float(data[3]))
322 # Sort by position to facilitate binary search
323 crick.sort(key=lambda data: float(data[1]))
324 keys = make_keys(crick)
325 for peak in watson:
326 window = get_window(crick, peak, up_distance, down_distance, keys)
327 if method == 'mode':
328 match = match_mode(window, peak, mode)
329 else:
330 match = METHODS[method](window, peak)
331 if match:
332 midpoint = (match[1] + match[2] + peak[1] + peak[2]) // 4
333 d = distance(peak, match)
334 dist.add(d)
335 # Simple output in gff format.
336 x.append(gff_row(cname,
337 source='cwpair',
338 start=midpoint,
339 end=midpoint + 1,
340 score=peak[3] + match[3],
341 attrs={'cw_distance': d}))
342 if output_details:
343 detailed_output.writerow((cname,
344 peak[1],
345 peak[2],
346 peak[3],
347 '+',
348 cname,
349 match[1],
350 match[2],
351 match[3], '-',
352 midpoint,
353 peak[3]+match[3],
354 d))
355 i = bisect.bisect_left(keys, (match[1]+match[2])/2)
356 del crick[i]
357 del keys[i]
358 else:
359 if output_orphans:
360 orphan_output.writerow((cname, peak[0], peak[1], peak[2], peak[3]))
361 # Keep track of orphans for statistics.
362 orphans += 1
363 # Remaining crick peaks are orphans
364 if output_orphans:
365 for cpeak in crick:
366 orphan_output.writerow((cname, cpeak[0], cpeak[1], cpeak[2], cpeak[3]))
367 # Keep track of orphans for statistics.
368 orphans += len(crick)
369 # Sort output by chromosome if specified.
370 if sort_chromosome == "asc":
371 try:
372 x.sort(key=lambda data: int(data[3]))
373 x.sort(key=lambda data: int(data[0]))
374 except:
375 # Cannot sort because chromosome number is not a numeric.
376 pass
377 elif sort_chromosome == "desc":
378 try:
379 x.sort(key=lambda data: int(data[0]), reverse=True)
380 x.sort(key=lambda data: int(data[3]), reverse=True)
381 except:
382 # Cannot sort because chromosome number is not a numeric.
383 pass
384 # Sort output by score if specified.
385 if sort_score == "desc":
386 x.sort(key=lambda data: float(data[5]), reverse=True)
387 elif sort_score == "asc":
388 x.sort(key=lambda data: float(data[5]))
389 # Writing a summary to txt or gff format file
390 for row in x:
391 row_tmp = list(row)
392 # Dataset in tuple cannot be modified in Python, so row will
393 # be converted to list format to add 'chr'.
394 if row_tmp[0] == "999":
395 row_tmp[0] = 'chrM'
396 elif row_tmp[0] == "998":
397 row_tmp[0] = 'chrY'
398 elif row_tmp[0] == "997":
399 row_tmp[0] = 'chrX'
400 else:
401 row_tmp[0] = row_tmp[0]
402 # Print row_tmp.
403 simple_output.writerow(row_tmp)
404 statistics['paired'] = dist.size() * 2
405 statistics['orphans'] = orphans
406 statistics['final_mode'] = dist.mode()
407 if output_plots:
408 frequency_plot([dist], final_plot_path, title='Frequency distribution')
409 statistics['dist'] = dist
410 return statistics