Mercurial > repos > greg > cwpair2
comparison cwpair2_util.py @ 0:507521bf124a draft
Uploaded
author | greg |
---|---|
date | Tue, 17 Nov 2015 14:06:28 -0500 |
parents | |
children | 4d86371aafa8 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:507521bf124a |
---|---|
1 import bisect | |
2 import csv | |
3 import os | |
4 import sys | |
5 import traceback | |
6 | |
7 from matplotlib import pyplot | |
8 | |
9 DETAILS = 'D' | |
10 FINAL_PLOTS = 'F' | |
11 ORPHANS = 'O' | |
12 PREVIEW_PLOTS = 'P' | |
13 SIMPLES = 'S' | |
14 STATS_GRAPH = 'C' | |
15 GFF_EXT = 'gff' | |
16 TABULAR_EXT = 'tabular' | |
17 | |
18 # Graph settings. | |
19 COLORS = 'krg' | |
20 Y_LABEL = 'Peak-pair counts' | |
21 X_LABEL = 'Peak-pair distance (bp)' | |
22 TICK_WIDTH = 3 | |
23 ADJUST = [0.140, 0.9, 0.9, 0.1] | |
24 pyplot.rc('xtick.major', size=10.00) | |
25 pyplot.rc('ytick.major', size=10.00) | |
26 pyplot.rc('lines', linewidth=4.00) | |
27 pyplot.rc('axes', linewidth=3.00) | |
28 pyplot.rc('font', family='Arial', size=32.0) | |
29 | |
30 | |
31 class FrequencyDistribution(object): | |
32 | |
33 def __init__(self, start, end, binsize=10, d=None): | |
34 self.start = start | |
35 self.end = end | |
36 self.dist = d or {} | |
37 self.binsize = binsize | |
38 | |
39 def get_bin(self, x): | |
40 """ | |
41 Returns the bin in which a data point falls | |
42 """ | |
43 return self.start + (x-self.start) // self.binsize * self.binsize + self.binsize/2.0 | |
44 | |
45 def add(self, x): | |
46 x = self.get_bin(x) | |
47 self.dist[x] = self.dist.get(x, 0) + 1 | |
48 | |
49 def graph_series(self): | |
50 x = [] | |
51 y = [] | |
52 for i in range(self.start, self.end, self.binsize): | |
53 center = self.get_bin(i) | |
54 x.append(center) | |
55 y.append(self.dist.get(center, 0)) | |
56 return x, y | |
57 | |
58 def mode(self): | |
59 return max(self.dist.items(), key=lambda data: data[1])[0] | |
60 | |
61 def size(self): | |
62 return sum(self.dist.values()) | |
63 | |
64 | |
65 def stop_err(msg): | |
66 sys.stderr.write(msg) | |
67 sys.exit(1) | |
68 | |
69 | |
70 def distance(peak1, peak2): | |
71 return (peak2[1]+peak2[2])/2 - (peak1[1]+peak1[2])/2 | |
72 | |
73 | |
74 def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}): | |
75 return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs)) | |
76 | |
77 | |
78 def gff_attrs(d): | |
79 if not d: | |
80 return '.' | |
81 return ';'.join('%s=%s' % item for item in d.items()) | |
82 | |
83 | |
84 def parse_chromosomes(reader): | |
85 # This version of cwpair2 accepts only gff format as input. | |
86 chromosomes = {} | |
87 reader.next() | |
88 for line in reader: | |
89 cname, junk, junk, start, end, value, strand, junk, junk = line | |
90 start = int(start) | |
91 end = int(end) | |
92 value = float(value) | |
93 if cname not in chromosomes: | |
94 chromosomes[cname] = [] | |
95 peaks = chromosomes[cname] | |
96 peaks.append((strand, start, end, value)) | |
97 return chromosomes | |
98 | |
99 | |
100 def perc95(chromosomes): | |
101 """ | |
102 Returns the 95th percentile value of the given chromosomes. | |
103 """ | |
104 values = [] | |
105 for peaks in chromosomes.values(): | |
106 for peak in peaks: | |
107 values.append(peak[3]) | |
108 values.sort() | |
109 # Get 95% value | |
110 return values[int(len(values)*0.95)] | |
111 | |
112 | |
113 def filter(chromosomes, threshold=0.05): | |
114 """ | |
115 Filters the peaks to those above a threshold. Threshold < 1.0 is interpreted | |
116 as a proportion of the maximum, >=1.0 as an absolute value. | |
117 """ | |
118 if threshold < 1: | |
119 p95 = perc95(chromosomes) | |
120 threshold = p95 * threshold | |
121 # Make the threshold a proportion of the | |
122 for cname, peaks in chromosomes.items(): | |
123 chromosomes[cname] = [peak for peak in peaks if peak[3] > threshold] | |
124 | |
125 | |
126 def split_strands(chromosome): | |
127 watson = [peak for peak in chromosome if peak[0] == '+'] | |
128 crick = [peak for peak in chromosome if peak[0] == '-'] | |
129 return watson, crick | |
130 | |
131 | |
132 def all_pair_distribution(chromosomes, up_distance, down_distance, binsize): | |
133 dist = FrequencyDistribution(-up_distance, down_distance, binsize=binsize) | |
134 for cname, data in chromosomes.items(): | |
135 watson, crick = split_strands(data) | |
136 crick.sort(key=lambda data: float(data[1])) | |
137 keys = make_keys(crick) | |
138 for peak in watson: | |
139 for cpeak in get_window(crick, peak, up_distance, down_distance, keys): | |
140 dist.add(distance(peak, cpeak)) | |
141 return dist | |
142 | |
143 | |
144 def make_keys(crick): | |
145 return [(data[1] + data[2])//2 for data in crick] | |
146 | |
147 | |
148 def get_window(crick, peak, up_distance, down_distance, keys=None): | |
149 """ | |
150 Returns a window of all crick peaks within a distance of a watson peak. | |
151 crick strand MUST be sorted by distance | |
152 """ | |
153 strand, start, end, value = peak | |
154 midpoint = (start + end) // 2 | |
155 lower = midpoint - up_distance | |
156 upper = midpoint + down_distance | |
157 keys = keys or make_keys(crick) | |
158 start_index = bisect.bisect_left(keys, lower) | |
159 end_index = bisect.bisect_right(keys, upper) | |
160 return [cpeak for cpeak in crick[start_index:end_index]] | |
161 | |
162 | |
163 def match_largest(window, peak): | |
164 if not window: | |
165 return None | |
166 return max(window, key=lambda cpeak: cpeak[3]) | |
167 | |
168 | |
169 def match_closest(window, peak): | |
170 if not window: | |
171 return None | |
172 | |
173 def key(cpeak): | |
174 d = distance(peak, cpeak) | |
175 # Search negative distances last | |
176 if d < 0: | |
177 # And then prefer less negative distances | |
178 d = 10000 - d | |
179 return d | |
180 return min(window, key=key) | |
181 | |
182 | |
183 def match_mode(window, peak, mode): | |
184 if not window: | |
185 return None | |
186 return min(window, key=lambda cpeak: abs(distance(peak, cpeak)-mode)) | |
187 | |
188 METHODS = {'mode': match_mode, 'closest': match_closest, 'largest': match_largest} | |
189 | |
190 | |
191 def frequency_plot(freqs, fname, labels=[], title=''): | |
192 pyplot.clf() | |
193 pyplot.figure(figsize=(10, 10)) | |
194 for i, freq in enumerate(freqs): | |
195 x, y = freq.graph_series() | |
196 pyplot.plot(x, y, '%s-' % COLORS[i]) | |
197 if len(freqs) > 1: | |
198 pyplot.legend(labels) | |
199 pyplot.xlim(freq.start, freq.end) | |
200 pyplot.ylim(ymin=0) | |
201 pyplot.ylabel(Y_LABEL) | |
202 pyplot.xlabel(X_LABEL) | |
203 pyplot.subplots_adjust(left=ADJUST[0], right=ADJUST[1], top=ADJUST[2], bottom=ADJUST[3]) | |
204 # Get the current axes | |
205 ax = pyplot.gca() | |
206 for l in ax.get_xticklines() + ax.get_yticklines(): | |
207 l.set_markeredgewidth(TICK_WIDTH) | |
208 pyplot.savefig(fname) | |
209 | |
210 | |
211 def create_directories(method): | |
212 if method == 'all': | |
213 match_methods = METHODS.keys() | |
214 else: | |
215 match_methods = [method] | |
216 for match_method in match_methods: | |
217 os.mkdir('%s_%s' % (match_method, DETAILS)) | |
218 os.mkdir('%s_%s' % (match_method, FINAL_PLOTS)) | |
219 os.mkdir('%s_%s' % (match_method, ORPHANS)) | |
220 os.mkdir('%s_%s' % (match_method, PREVIEW_PLOTS)) | |
221 os.mkdir('%s_%s' % (match_method, SIMPLES)) | |
222 os.mkdir('%s_%s' % (match_method, STATS_GRAPH)) | |
223 | |
224 | |
225 def process_file(dataset_path, galaxy_hid, method, threshold, up_distance, down_distance, | |
226 binsize, output_files, plot_format, sort_chromosome, sort_score): | |
227 if method == 'all': | |
228 match_methods = METHODS.keys() | |
229 else: | |
230 match_methods = [method] | |
231 statistics = [] | |
232 for match_method in match_methods: | |
233 stats = perform_process(dataset_path, | |
234 galaxy_hid, | |
235 match_method, | |
236 threshold, | |
237 up_distance, | |
238 down_distance, | |
239 binsize, | |
240 output_files, | |
241 plot_format, | |
242 sort_chromosome, | |
243 sort_score) | |
244 statistics.append(stats) | |
245 if output_files == 'all' and method == 'all': | |
246 frequency_plot([s['dist'] for s in statistics], | |
247 statistics[0]['graph_path'], | |
248 labels=METHODS.keys()) | |
249 return statistics | |
250 | |
251 | |
252 def perform_process(dataset_path, galaxy_hid, method, threshold, up_distance, | |
253 down_distance, binsize, output_files, plot_format, sort_chromosome, sort_score): | |
254 output_details = output_files in ["all", "simple_orphan_detail"] | |
255 output_plots = output_files in ["all"] | |
256 output_orphans = output_files in ["all", "simple_orphan", "simple_orphan_detail"] | |
257 # Keep track of statistics for the output file | |
258 statistics = {} | |
259 input = csv.reader(open(dataset_path, 'rt'), delimiter='\t') | |
260 fpath, fname = os.path.split(dataset_path) | |
261 statistics['fname'] = '%s: data %s' % (method, str(galaxy_hid)) | |
262 statistics['dir'] = fpath | |
263 if threshold >= 1: | |
264 filter_string = 'fa%d' % threshold | |
265 else: | |
266 filter_string = 'f%d' % (threshold * 100) | |
267 fname = 'data_%s_%su%dd%db%d' % (galaxy_hid, filter_string, up_distance, down_distance, binsize) | |
268 | |
269 def make_path(output_type, extension=TABULAR_EXT): | |
270 # Returns the full path for a certain output. | |
271 return os.path.join(output_type, '%s_%s.%s' % (output_type, fname, extension)) | |
272 | |
273 def td_writer(output_type, extension=TABULAR_EXT): | |
274 # Returns a tab-delimited writer for a specified output. | |
275 output_file_path = make_path(output_type, extension) | |
276 return csv.writer(open(output_file_path, 'wt'), delimiter='\t') | |
277 | |
278 try: | |
279 chromosomes = parse_chromosomes(input) | |
280 except Exception: | |
281 stop_err('Unable to parse file "%s".\n%s' % (dataset_path, traceback.format_exc())) | |
282 if output_details: | |
283 # Details | |
284 detailed_output = td_writer('%s_%s' % (method, DETAILS), extension=TABULAR_EXT) | |
285 detailed_output.writerow(('chrom', 'start', 'end', 'value', 'strand') * 2 + ('midpoint', 'c-w reads sum', 'c-w distance (bp)')) | |
286 if output_plots: | |
287 # Final Plot | |
288 final_plot_path = make_path('%s_%s' % (method, FINAL_PLOTS), plot_format) | |
289 if output_orphans: | |
290 # Orphans | |
291 orphan_output = td_writer('%s_%s' % (method, ORPHANS), extension=TABULAR_EXT) | |
292 orphan_output.writerow(('chrom', 'strand', 'start', 'end', 'value')) | |
293 if output_plots: | |
294 # Preview Plot | |
295 preview_plot_path = make_path('%s_%s' % (method, PREVIEW_PLOTS), plot_format) | |
296 # Simple | |
297 simple_output = td_writer('%s_%s' % (method, SIMPLES), extension=GFF_EXT) | |
298 statistics['stats_path'] = 'statistics.%s' % TABULAR_EXT | |
299 if output_plots: | |
300 statistics['graph_path'] = make_path('%s_%s' % (method, STATS_GRAPH), plot_format) | |
301 statistics['perc95'] = perc95(chromosomes) | |
302 if threshold > 0: | |
303 # Apply filter | |
304 filter(chromosomes, threshold) | |
305 if method == 'mode': | |
306 freq = all_pair_distribution(chromosomes, up_distance, down_distance, binsize) | |
307 mode = freq.mode() | |
308 statistics['preview_mode'] = mode | |
309 if output_plots: | |
310 frequency_plot([freq], preview_plot_path, title='Preview frequency plot') | |
311 else: | |
312 statistics['preview_mode'] = 'NA' | |
313 dist = FrequencyDistribution(-up_distance, down_distance, binsize=binsize) | |
314 orphans = 0 | |
315 # x will be used to archive the summary dataset | |
316 x = [] | |
317 for cname, chromosome in chromosomes.items(): | |
318 # Each peak is (strand, start, end, value) | |
319 watson, crick = split_strands(chromosome) | |
320 # Sort by value of each peak | |
321 watson.sort(key=lambda data: -float(data[3])) | |
322 # Sort by position to facilitate binary search | |
323 crick.sort(key=lambda data: float(data[1])) | |
324 keys = make_keys(crick) | |
325 for peak in watson: | |
326 window = get_window(crick, peak, up_distance, down_distance, keys) | |
327 if method == 'mode': | |
328 match = match_mode(window, peak, mode) | |
329 else: | |
330 match = METHODS[method](window, peak) | |
331 if match: | |
332 midpoint = (match[1] + match[2] + peak[1] + peak[2]) // 4 | |
333 d = distance(peak, match) | |
334 dist.add(d) | |
335 # Simple output in gff format. | |
336 x.append(gff_row(cname, | |
337 source='cwpair', | |
338 start=midpoint, | |
339 end=midpoint + 1, | |
340 score=peak[3] + match[3], | |
341 attrs={'cw_distance': d})) | |
342 if output_details: | |
343 detailed_output.writerow((cname, | |
344 peak[1], | |
345 peak[2], | |
346 peak[3], | |
347 '+', | |
348 cname, | |
349 match[1], | |
350 match[2], | |
351 match[3], '-', | |
352 midpoint, | |
353 peak[3]+match[3], | |
354 d)) | |
355 i = bisect.bisect_left(keys, (match[1]+match[2])/2) | |
356 del crick[i] | |
357 del keys[i] | |
358 else: | |
359 if output_orphans: | |
360 orphan_output.writerow((cname, peak[0], peak[1], peak[2], peak[3])) | |
361 # Keep track of orphans for statistics. | |
362 orphans += 1 | |
363 # Remaining crick peaks are orphans | |
364 if output_orphans: | |
365 for cpeak in crick: | |
366 orphan_output.writerow((cname, cpeak[0], cpeak[1], cpeak[2], cpeak[3])) | |
367 # Keep track of orphans for statistics. | |
368 orphans += len(crick) | |
369 # Sort output by chromosome if specified. | |
370 if sort_chromosome == "asc": | |
371 try: | |
372 x.sort(key=lambda data: int(data[3])) | |
373 x.sort(key=lambda data: int(data[0])) | |
374 except: | |
375 # Cannot sort because chromosome number is not a numeric. | |
376 pass | |
377 elif sort_chromosome == "desc": | |
378 try: | |
379 x.sort(key=lambda data: int(data[0]), reverse=True) | |
380 x.sort(key=lambda data: int(data[3]), reverse=True) | |
381 except: | |
382 # Cannot sort because chromosome number is not a numeric. | |
383 pass | |
384 # Sort output by score if specified. | |
385 if sort_score == "desc": | |
386 x.sort(key=lambda data: float(data[5]), reverse=True) | |
387 elif sort_score == "asc": | |
388 x.sort(key=lambda data: float(data[5])) | |
389 # Writing a summary to txt or gff format file | |
390 for row in x: | |
391 row_tmp = list(row) | |
392 # Dataset in tuple cannot be modified in Python, so row will | |
393 # be converted to list format to add 'chr'. | |
394 if row_tmp[0] == "999": | |
395 row_tmp[0] = 'chrM' | |
396 elif row_tmp[0] == "998": | |
397 row_tmp[0] = 'chrY' | |
398 elif row_tmp[0] == "997": | |
399 row_tmp[0] = 'chrX' | |
400 else: | |
401 row_tmp[0] = row_tmp[0] | |
402 # Print row_tmp. | |
403 simple_output.writerow(row_tmp) | |
404 statistics['paired'] = dist.size() * 2 | |
405 statistics['orphans'] = orphans | |
406 statistics['final_mode'] = dist.mode() | |
407 if output_plots: | |
408 frequency_plot([dist], final_plot_path, title='Frequency distribution') | |
409 statistics['dist'] = dist | |
410 return statistics |