comparison cwpair2_util.py @ 8:1fc26b8e618d draft

Uploaded
author greg
date Wed, 02 Dec 2015 16:13:51 -0500
parents 2e0ddcc726f9
children
comparison
equal deleted inserted replaced
7:d455f14530dc 8:1fc26b8e618d
5 import traceback 5 import traceback
6 import matplotlib 6 import matplotlib
7 matplotlib.use('Agg') 7 matplotlib.use('Agg')
8 from matplotlib import pyplot 8 from matplotlib import pyplot
9 9
10 # Data outputs
10 DETAILS = 'D' 11 DETAILS = 'D'
11 FINAL_PLOTS = 'F' 12 MATCHED_PAIRS = 'MP'
12 ORPHANS = 'O' 13 ORPHANS = 'O'
13 PREVIEW_PLOTS = 'P' 14 # Data output formats
14 SIMPLES = 'S'
15 STATS_GRAPH = 'C'
16 GFF_EXT = 'gff' 15 GFF_EXT = 'gff'
17 TABULAR_EXT = 'tabular' 16 TABULAR_EXT = 'tabular'
17 # Statistics historgrams output directory.
18 HISTOGRAM = 'H'
19 # Statistics outputs
20 FINAL_PLOTS = 'F'
21 PREVIEW_PLOTS = 'P'
22 STATS_GRAPH = 'C'
18 23
19 # Graph settings. 24 # Graph settings.
20 COLORS = 'krg' 25 COLORS = 'krg'
21 Y_LABEL = 'Peak-pair counts' 26 Y_LABEL = 'Peak-pair counts'
22 X_LABEL = 'Peak-pair distance (bp)' 27 X_LABEL = 'Peak-pair distance (bp)'
208 for l in ax.get_xticklines() + ax.get_yticklines(): 213 for l in ax.get_xticklines() + ax.get_yticklines():
209 l.set_markeredgewidth(TICK_WIDTH) 214 l.set_markeredgewidth(TICK_WIDTH)
210 pyplot.savefig(fname) 215 pyplot.savefig(fname)
211 216
212 217
213 def create_directories(method): 218 def create_directories():
214 if method == 'all': 219 # Output histograms in pdf.
215 match_methods = METHODS.keys() 220 os.mkdir(HISTOGRAM)
216 else: 221 os.mkdir('data_%s' % DETAILS)
217 match_methods = [method] 222 os.mkdir('data_%s' % ORPHANS)
218 for match_method in match_methods: 223 os.mkdir('data_%s' % MATCHED_PAIRS)
219 os.mkdir('%s_%s' % (match_method, DETAILS))
220 os.mkdir('%s_%s' % (match_method, FINAL_PLOTS))
221 os.mkdir('%s_%s' % (match_method, ORPHANS))
222 os.mkdir('%s_%s' % (match_method, PREVIEW_PLOTS))
223 os.mkdir('%s_%s' % (match_method, SIMPLES))
224 os.mkdir('%s_%s' % (match_method, STATS_GRAPH))
225 224
226 225
227 def process_file(dataset_path, galaxy_hid, method, threshold, up_distance, 226 def process_file(dataset_path, galaxy_hid, method, threshold, up_distance,
228 down_distance, binsize, output_files, sort_score): 227 down_distance, binsize, output_files):
229 if method == 'all': 228 if method == 'all':
230 match_methods = METHODS.keys() 229 match_methods = METHODS.keys()
231 else: 230 else:
232 match_methods = [method] 231 match_methods = [method]
233 statistics = [] 232 statistics = []
237 match_method, 236 match_method,
238 threshold, 237 threshold,
239 up_distance, 238 up_distance,
240 down_distance, 239 down_distance,
241 binsize, 240 binsize,
242 output_files, 241 output_files)
243 sort_score)
244 statistics.append(stats) 242 statistics.append(stats)
245 if output_files == 'all' and method == 'all': 243 if output_files == 'all' and method == 'all':
246 frequency_plot([s['dist'] for s in statistics], 244 frequency_plot([s['dist'] for s in statistics],
247 statistics[0]['graph_path'], 245 statistics[0]['graph_path'],
248 labels=METHODS.keys()) 246 labels=METHODS.keys())
249 return statistics 247 return statistics
250 248
251 249
252 def perform_process(dataset_path, galaxy_hid, method, threshold, up_distance, 250 def perform_process(dataset_path, galaxy_hid, method, threshold, up_distance,
253 down_distance, binsize, output_files, sort_score): 251 down_distance, binsize, output_files):
254 output_details = output_files in ["all", "simple_orphan_detail"] 252 output_details = output_files in ["all", "matched_pair_orphan_detail"]
255 output_plots = output_files in ["all"] 253 output_plots = output_files in ["all"]
256 output_orphans = output_files in ["all", "simple_orphan", "simple_orphan_detail"] 254 output_orphans = output_files in ["all", "matched_pair_orphan", "matched_pair_orphan_detail"]
257 # Keep track of statistics for the output file 255 # Keep track of statistics for the output file
258 statistics = {} 256 statistics = {}
259 input = csv.reader(open(dataset_path, 'rt'), delimiter='\t') 257 input = csv.reader(open(dataset_path, 'rt'), delimiter='\t')
260 fpath, fname = os.path.split(dataset_path) 258 fpath, fname = os.path.split(dataset_path)
261 statistics['fname'] = '%s: data %s' % (method, str(galaxy_hid)) 259 statistics['fname'] = '%s: data %s' % (method, str(galaxy_hid))
262 statistics['dir'] = fpath 260 statistics['dir'] = fpath
263 if threshold >= 1: 261 if threshold >= 1:
264 filter_string = 'fa%d' % threshold 262 filter_string = 'fa%d' % threshold
265 else: 263 else:
266 filter_string = 'f%d' % (threshold * 100) 264 filter_string = 'f%d' % (threshold * 100)
267 fname = 'data_%s_%su%dd%db%d' % (galaxy_hid, filter_string, up_distance, down_distance, binsize) 265 fname = '%s_%su%dd%d_on_data_%s' % (method, filter_string, up_distance, down_distance, galaxy_hid)
268 266
269 def make_path(output_type, extension=TABULAR_EXT): 267 def make_histogram_path(output_type, fname):
270 # Returns the full path for a certain output. 268 return os.path.join(HISTOGRAM, 'histogram_%s_%s.%s' % (output_type, fname, PLOT_FORMAT))
269
270 def make_path(output_type, extension, fname):
271 # Returns the full path for an output.
271 return os.path.join(output_type, '%s_%s.%s' % (output_type, fname, extension)) 272 return os.path.join(output_type, '%s_%s.%s' % (output_type, fname, extension))
272 273
273 def td_writer(output_type, extension=TABULAR_EXT): 274 def td_writer(output_type, extension, fname):
274 # Returns a tab-delimited writer for a specified output. 275 # Returns a tab-delimited writer for a specified output.
275 output_file_path = make_path(output_type, extension) 276 output_file_path = make_path(output_type, extension, fname)
276 return csv.writer(open(output_file_path, 'wt'), delimiter='\t') 277 return csv.writer(open(output_file_path, 'wt'), delimiter='\t')
277 278
278 try: 279 try:
279 chromosomes = parse_chromosomes(input) 280 chromosomes = parse_chromosomes(input)
280 except Exception: 281 except Exception:
281 stop_err('Unable to parse file "%s".\n%s' % (dataset_path, traceback.format_exc())) 282 stop_err('Unable to parse file "%s".\n%s' % (dataset_path, traceback.format_exc()))
282 if output_details: 283 if output_details:
283 # Details 284 # Details
284 detailed_output = td_writer('%s_%s' % (method, DETAILS), extension=TABULAR_EXT) 285 detailed_output = td_writer('data_%s' % DETAILS, TABULAR_EXT, fname)
285 detailed_output.writerow(('chrom', 'start', 'end', 'value', 'strand') * 2 + ('midpoint', 'c-w reads sum', 'c-w distance (bp)')) 286 detailed_output.writerow(('chrom', 'start', 'end', 'value', 'strand') * 2 + ('midpoint', 'c-w reads sum', 'c-w distance (bp)'))
286 if output_plots: 287 if output_plots:
287 # Final Plot 288 # Final Plot
288 final_plot_path = make_path('%s_%s' % (method, FINAL_PLOTS), PLOT_FORMAT) 289 final_plot_path = make_histogram_path(FINAL_PLOTS, fname)
289 if output_orphans: 290 if output_orphans:
290 # Orphans 291 # Orphans
291 orphan_output = td_writer('%s_%s' % (method, ORPHANS), extension=TABULAR_EXT) 292 orphan_output = td_writer('data_%s' % ORPHANS, TABULAR_EXT, fname)
292 orphan_output.writerow(('chrom', 'strand', 'start', 'end', 'value')) 293 orphan_output.writerow(('chrom', 'strand', 'start', 'end', 'value'))
293 if output_plots: 294 if output_plots:
294 # Preview Plot 295 # Preview Plot
295 preview_plot_path = make_path('%s_%s' % (method, PREVIEW_PLOTS), PLOT_FORMAT) 296 preview_plot_path = make_histogram_path(PREVIEW_PLOTS, fname)
296 # Simple 297 # Matched Pairs.
297 simple_output = td_writer('%s_%s' % (method, SIMPLES), extension=GFF_EXT) 298 matched_pairs_output = td_writer('data_%s' % MATCHED_PAIRS, GFF_EXT, fname)
298 statistics['stats_path'] = 'statistics.%s' % TABULAR_EXT 299 statistics['stats_path'] = 'statistics.%s' % TABULAR_EXT
299 if output_plots: 300 if output_plots:
300 statistics['graph_path'] = make_path('%s_%s' % (method, STATS_GRAPH), PLOT_FORMAT) 301 statistics['graph_path'] = make_histogram_path(STATS_GRAPH, fname)
301 statistics['perc95'] = perc95(chromosomes) 302 statistics['perc95'] = perc95(chromosomes)
302 if threshold > 0: 303 if threshold > 0:
303 # Apply filter 304 # Apply filter
304 filter(chromosomes, threshold) 305 filter(chromosomes, threshold)
305 if method == 'mode': 306 if method == 'mode':
364 if output_orphans: 365 if output_orphans:
365 for cpeak in crick: 366 for cpeak in crick:
366 orphan_output.writerow((cname, cpeak[0], cpeak[1], cpeak[2], cpeak[3])) 367 orphan_output.writerow((cname, cpeak[0], cpeak[1], cpeak[2], cpeak[3]))
367 # Keep track of orphans for statistics. 368 # Keep track of orphans for statistics.
368 orphans += len(crick) 369 orphans += len(crick)
369 # Sort output by score if specified. 370 # Sort output descending by score.
370 if sort_score == "desc": 371 x.sort(key=lambda data: float(data[5]), reverse=True)
371 x.sort(key=lambda data: float(data[5]), reverse=True)
372 elif sort_score == "asc":
373 x.sort(key=lambda data: float(data[5]))
374 # Writing a summary to gff format file 372 # Writing a summary to gff format file
375 for row in x: 373 for row in x:
376 row_tmp = list(row) 374 row_tmp = list(row)
377 # Dataset in tuple cannot be modified in Python, so row will 375 # Dataset in tuple cannot be modified in Python, so row will
378 # be converted to list format to add 'chr'. 376 # be converted to list format to add 'chr'.
383 elif row_tmp[0] == "997": 381 elif row_tmp[0] == "997":
384 row_tmp[0] = 'chrX' 382 row_tmp[0] = 'chrX'
385 else: 383 else:
386 row_tmp[0] = row_tmp[0] 384 row_tmp[0] = row_tmp[0]
387 # Print row_tmp. 385 # Print row_tmp.
388 simple_output.writerow(row_tmp) 386 matched_pairs_output.writerow(row_tmp)
389 statistics['paired'] = dist.size() * 2 387 statistics['paired'] = dist.size() * 2
390 statistics['orphans'] = orphans 388 statistics['orphans'] = orphans
391 statistics['final_mode'] = dist.mode() 389 statistics['final_mode'] = dist.mode()
392 if output_plots: 390 if output_plots:
393 frequency_plot([dist], final_plot_path, title='Frequency distribution') 391 frequency_plot([dist], final_plot_path, title='Frequency distribution')