Mercurial > repos > greg > cwpair2
comparison cwpair2_util.py @ 8:1fc26b8e618d draft
Uploaded
author | greg |
---|---|
date | Wed, 02 Dec 2015 16:13:51 -0500 |
parents | 2e0ddcc726f9 |
children |
comparison
equal
deleted
inserted
replaced
7:d455f14530dc | 8:1fc26b8e618d |
---|---|
5 import traceback | 5 import traceback |
6 import matplotlib | 6 import matplotlib |
7 matplotlib.use('Agg') | 7 matplotlib.use('Agg') |
8 from matplotlib import pyplot | 8 from matplotlib import pyplot |
9 | 9 |
10 # Data outputs | |
10 DETAILS = 'D' | 11 DETAILS = 'D' |
11 FINAL_PLOTS = 'F' | 12 MATCHED_PAIRS = 'MP' |
12 ORPHANS = 'O' | 13 ORPHANS = 'O' |
13 PREVIEW_PLOTS = 'P' | 14 # Data output formats |
14 SIMPLES = 'S' | |
15 STATS_GRAPH = 'C' | |
16 GFF_EXT = 'gff' | 15 GFF_EXT = 'gff' |
17 TABULAR_EXT = 'tabular' | 16 TABULAR_EXT = 'tabular' |
17 # Statistics historgrams output directory. | |
18 HISTOGRAM = 'H' | |
19 # Statistics outputs | |
20 FINAL_PLOTS = 'F' | |
21 PREVIEW_PLOTS = 'P' | |
22 STATS_GRAPH = 'C' | |
18 | 23 |
19 # Graph settings. | 24 # Graph settings. |
20 COLORS = 'krg' | 25 COLORS = 'krg' |
21 Y_LABEL = 'Peak-pair counts' | 26 Y_LABEL = 'Peak-pair counts' |
22 X_LABEL = 'Peak-pair distance (bp)' | 27 X_LABEL = 'Peak-pair distance (bp)' |
208 for l in ax.get_xticklines() + ax.get_yticklines(): | 213 for l in ax.get_xticklines() + ax.get_yticklines(): |
209 l.set_markeredgewidth(TICK_WIDTH) | 214 l.set_markeredgewidth(TICK_WIDTH) |
210 pyplot.savefig(fname) | 215 pyplot.savefig(fname) |
211 | 216 |
212 | 217 |
213 def create_directories(method): | 218 def create_directories(): |
214 if method == 'all': | 219 # Output histograms in pdf. |
215 match_methods = METHODS.keys() | 220 os.mkdir(HISTOGRAM) |
216 else: | 221 os.mkdir('data_%s' % DETAILS) |
217 match_methods = [method] | 222 os.mkdir('data_%s' % ORPHANS) |
218 for match_method in match_methods: | 223 os.mkdir('data_%s' % MATCHED_PAIRS) |
219 os.mkdir('%s_%s' % (match_method, DETAILS)) | |
220 os.mkdir('%s_%s' % (match_method, FINAL_PLOTS)) | |
221 os.mkdir('%s_%s' % (match_method, ORPHANS)) | |
222 os.mkdir('%s_%s' % (match_method, PREVIEW_PLOTS)) | |
223 os.mkdir('%s_%s' % (match_method, SIMPLES)) | |
224 os.mkdir('%s_%s' % (match_method, STATS_GRAPH)) | |
225 | 224 |
226 | 225 |
227 def process_file(dataset_path, galaxy_hid, method, threshold, up_distance, | 226 def process_file(dataset_path, galaxy_hid, method, threshold, up_distance, |
228 down_distance, binsize, output_files, sort_score): | 227 down_distance, binsize, output_files): |
229 if method == 'all': | 228 if method == 'all': |
230 match_methods = METHODS.keys() | 229 match_methods = METHODS.keys() |
231 else: | 230 else: |
232 match_methods = [method] | 231 match_methods = [method] |
233 statistics = [] | 232 statistics = [] |
237 match_method, | 236 match_method, |
238 threshold, | 237 threshold, |
239 up_distance, | 238 up_distance, |
240 down_distance, | 239 down_distance, |
241 binsize, | 240 binsize, |
242 output_files, | 241 output_files) |
243 sort_score) | |
244 statistics.append(stats) | 242 statistics.append(stats) |
245 if output_files == 'all' and method == 'all': | 243 if output_files == 'all' and method == 'all': |
246 frequency_plot([s['dist'] for s in statistics], | 244 frequency_plot([s['dist'] for s in statistics], |
247 statistics[0]['graph_path'], | 245 statistics[0]['graph_path'], |
248 labels=METHODS.keys()) | 246 labels=METHODS.keys()) |
249 return statistics | 247 return statistics |
250 | 248 |
251 | 249 |
252 def perform_process(dataset_path, galaxy_hid, method, threshold, up_distance, | 250 def perform_process(dataset_path, galaxy_hid, method, threshold, up_distance, |
253 down_distance, binsize, output_files, sort_score): | 251 down_distance, binsize, output_files): |
254 output_details = output_files in ["all", "simple_orphan_detail"] | 252 output_details = output_files in ["all", "matched_pair_orphan_detail"] |
255 output_plots = output_files in ["all"] | 253 output_plots = output_files in ["all"] |
256 output_orphans = output_files in ["all", "simple_orphan", "simple_orphan_detail"] | 254 output_orphans = output_files in ["all", "matched_pair_orphan", "matched_pair_orphan_detail"] |
257 # Keep track of statistics for the output file | 255 # Keep track of statistics for the output file |
258 statistics = {} | 256 statistics = {} |
259 input = csv.reader(open(dataset_path, 'rt'), delimiter='\t') | 257 input = csv.reader(open(dataset_path, 'rt'), delimiter='\t') |
260 fpath, fname = os.path.split(dataset_path) | 258 fpath, fname = os.path.split(dataset_path) |
261 statistics['fname'] = '%s: data %s' % (method, str(galaxy_hid)) | 259 statistics['fname'] = '%s: data %s' % (method, str(galaxy_hid)) |
262 statistics['dir'] = fpath | 260 statistics['dir'] = fpath |
263 if threshold >= 1: | 261 if threshold >= 1: |
264 filter_string = 'fa%d' % threshold | 262 filter_string = 'fa%d' % threshold |
265 else: | 263 else: |
266 filter_string = 'f%d' % (threshold * 100) | 264 filter_string = 'f%d' % (threshold * 100) |
267 fname = 'data_%s_%su%dd%db%d' % (galaxy_hid, filter_string, up_distance, down_distance, binsize) | 265 fname = '%s_%su%dd%d_on_data_%s' % (method, filter_string, up_distance, down_distance, galaxy_hid) |
268 | 266 |
269 def make_path(output_type, extension=TABULAR_EXT): | 267 def make_histogram_path(output_type, fname): |
270 # Returns the full path for a certain output. | 268 return os.path.join(HISTOGRAM, 'histogram_%s_%s.%s' % (output_type, fname, PLOT_FORMAT)) |
269 | |
270 def make_path(output_type, extension, fname): | |
271 # Returns the full path for an output. | |
271 return os.path.join(output_type, '%s_%s.%s' % (output_type, fname, extension)) | 272 return os.path.join(output_type, '%s_%s.%s' % (output_type, fname, extension)) |
272 | 273 |
273 def td_writer(output_type, extension=TABULAR_EXT): | 274 def td_writer(output_type, extension, fname): |
274 # Returns a tab-delimited writer for a specified output. | 275 # Returns a tab-delimited writer for a specified output. |
275 output_file_path = make_path(output_type, extension) | 276 output_file_path = make_path(output_type, extension, fname) |
276 return csv.writer(open(output_file_path, 'wt'), delimiter='\t') | 277 return csv.writer(open(output_file_path, 'wt'), delimiter='\t') |
277 | 278 |
278 try: | 279 try: |
279 chromosomes = parse_chromosomes(input) | 280 chromosomes = parse_chromosomes(input) |
280 except Exception: | 281 except Exception: |
281 stop_err('Unable to parse file "%s".\n%s' % (dataset_path, traceback.format_exc())) | 282 stop_err('Unable to parse file "%s".\n%s' % (dataset_path, traceback.format_exc())) |
282 if output_details: | 283 if output_details: |
283 # Details | 284 # Details |
284 detailed_output = td_writer('%s_%s' % (method, DETAILS), extension=TABULAR_EXT) | 285 detailed_output = td_writer('data_%s' % DETAILS, TABULAR_EXT, fname) |
285 detailed_output.writerow(('chrom', 'start', 'end', 'value', 'strand') * 2 + ('midpoint', 'c-w reads sum', 'c-w distance (bp)')) | 286 detailed_output.writerow(('chrom', 'start', 'end', 'value', 'strand') * 2 + ('midpoint', 'c-w reads sum', 'c-w distance (bp)')) |
286 if output_plots: | 287 if output_plots: |
287 # Final Plot | 288 # Final Plot |
288 final_plot_path = make_path('%s_%s' % (method, FINAL_PLOTS), PLOT_FORMAT) | 289 final_plot_path = make_histogram_path(FINAL_PLOTS, fname) |
289 if output_orphans: | 290 if output_orphans: |
290 # Orphans | 291 # Orphans |
291 orphan_output = td_writer('%s_%s' % (method, ORPHANS), extension=TABULAR_EXT) | 292 orphan_output = td_writer('data_%s' % ORPHANS, TABULAR_EXT, fname) |
292 orphan_output.writerow(('chrom', 'strand', 'start', 'end', 'value')) | 293 orphan_output.writerow(('chrom', 'strand', 'start', 'end', 'value')) |
293 if output_plots: | 294 if output_plots: |
294 # Preview Plot | 295 # Preview Plot |
295 preview_plot_path = make_path('%s_%s' % (method, PREVIEW_PLOTS), PLOT_FORMAT) | 296 preview_plot_path = make_histogram_path(PREVIEW_PLOTS, fname) |
296 # Simple | 297 # Matched Pairs. |
297 simple_output = td_writer('%s_%s' % (method, SIMPLES), extension=GFF_EXT) | 298 matched_pairs_output = td_writer('data_%s' % MATCHED_PAIRS, GFF_EXT, fname) |
298 statistics['stats_path'] = 'statistics.%s' % TABULAR_EXT | 299 statistics['stats_path'] = 'statistics.%s' % TABULAR_EXT |
299 if output_plots: | 300 if output_plots: |
300 statistics['graph_path'] = make_path('%s_%s' % (method, STATS_GRAPH), PLOT_FORMAT) | 301 statistics['graph_path'] = make_histogram_path(STATS_GRAPH, fname) |
301 statistics['perc95'] = perc95(chromosomes) | 302 statistics['perc95'] = perc95(chromosomes) |
302 if threshold > 0: | 303 if threshold > 0: |
303 # Apply filter | 304 # Apply filter |
304 filter(chromosomes, threshold) | 305 filter(chromosomes, threshold) |
305 if method == 'mode': | 306 if method == 'mode': |
364 if output_orphans: | 365 if output_orphans: |
365 for cpeak in crick: | 366 for cpeak in crick: |
366 orphan_output.writerow((cname, cpeak[0], cpeak[1], cpeak[2], cpeak[3])) | 367 orphan_output.writerow((cname, cpeak[0], cpeak[1], cpeak[2], cpeak[3])) |
367 # Keep track of orphans for statistics. | 368 # Keep track of orphans for statistics. |
368 orphans += len(crick) | 369 orphans += len(crick) |
369 # Sort output by score if specified. | 370 # Sort output descending by score. |
370 if sort_score == "desc": | 371 x.sort(key=lambda data: float(data[5]), reverse=True) |
371 x.sort(key=lambda data: float(data[5]), reverse=True) | |
372 elif sort_score == "asc": | |
373 x.sort(key=lambda data: float(data[5])) | |
374 # Writing a summary to gff format file | 372 # Writing a summary to gff format file |
375 for row in x: | 373 for row in x: |
376 row_tmp = list(row) | 374 row_tmp = list(row) |
377 # Dataset in tuple cannot be modified in Python, so row will | 375 # Dataset in tuple cannot be modified in Python, so row will |
378 # be converted to list format to add 'chr'. | 376 # be converted to list format to add 'chr'. |
383 elif row_tmp[0] == "997": | 381 elif row_tmp[0] == "997": |
384 row_tmp[0] = 'chrX' | 382 row_tmp[0] = 'chrX' |
385 else: | 383 else: |
386 row_tmp[0] = row_tmp[0] | 384 row_tmp[0] = row_tmp[0] |
387 # Print row_tmp. | 385 # Print row_tmp. |
388 simple_output.writerow(row_tmp) | 386 matched_pairs_output.writerow(row_tmp) |
389 statistics['paired'] = dist.size() * 2 | 387 statistics['paired'] = dist.size() * 2 |
390 statistics['orphans'] = orphans | 388 statistics['orphans'] = orphans |
391 statistics['final_mode'] = dist.mode() | 389 statistics['final_mode'] = dist.mode() |
392 if output_plots: | 390 if output_plots: |
393 frequency_plot([dist], final_plot_path, title='Frequency distribution') | 391 frequency_plot([dist], final_plot_path, title='Frequency distribution') |