comparison repmatch_gff3_util.py @ 8:d10ae3aeebc8 draft

Uploaded
author greg
date Wed, 02 Dec 2015 16:15:35 -0500
parents 53cbf79396d7
children
comparison
equal deleted inserted replaced
7:1807688a8a5f 8:d10ae3aeebc8
263 os.close(fd) 263 os.close(fd)
264 return name 264 return name
265 265
266 266
267 def process_files(dataset_paths, galaxy_hids, method, distance, step, replicates, up_limit, low_limit, output_files, 267 def process_files(dataset_paths, galaxy_hids, method, distance, step, replicates, up_limit, low_limit, output_files,
268 output_summary, output_orphan, output_detail, output_key, output_histogram): 268 output_matched_peaks, output_unmatched_peaks, output_detail, output_statistics_table, output_statistics_histogram):
269 output_histogram_file = output_files in ["all"] and method in ["all"] 269 output_statistics_histogram_file = output_files in ["all"] and method in ["all"]
270 if len(dataset_paths) < 2: 270 if len(dataset_paths) < 2:
271 return 271 return
272 if method == 'all': 272 if method == 'all':
273 match_methods = METHODS.keys() 273 match_methods = METHODS.keys()
274 else: 274 else:
281 step, 281 step,
282 replicates, 282 replicates,
283 up_limit, 283 up_limit,
284 low_limit, 284 low_limit,
285 output_files, 285 output_files,
286 output_summary, 286 output_matched_peaks,
287 output_orphan, 287 output_unmatched_peaks,
288 output_detail, 288 output_detail,
289 output_key, 289 output_statistics_table,
290 output_histogram) 290 output_statistics_histogram)
291 if output_histogram_file: 291 if output_statistics_histogram_file:
292 tmp_histogram_path = get_temporary_plot_path() 292 tmp_statistics_histogram_path = get_temporary_plot_path()
293 frequency_histogram([stat['distribution'] for stat in [statistics]], 293 frequency_histogram([stat['distribution'] for stat in [statistics]],
294 tmp_histogram_path, 294 tmp_statistics_histogram_path,
295 METHODS.keys()) 295 METHODS.keys())
296 shutil.move(tmp_histogram_path, output_histogram) 296 shutil.move(tmp_statistics_histogram_path, output_statistics_histogram)
297 297
298 298
299 def perform_process(dataset_paths, galaxy_hids, method, distance, step, num_required, up_limit, low_limit, output_files, 299 def perform_process(dataset_paths, galaxy_hids, method, distance, step, num_required, up_limit, low_limit, output_files,
300 output_summary, output_orphan, output_detail, output_key, output_histogram): 300 output_matched_peaks, output_unmatched_peaks, output_detail, output_statistics_table, output_statistics_histogram):
301 output_detail_file = output_files in ["all"] and output_detail is not None 301 output_detail_file = output_files in ["all"] and output_detail is not None
302 output_key_file = output_files in ["all"] and output_key is not None 302 output_statistics_table_file = output_files in ["all"] and output_statistics_table is not None
303 output_orphan_file = output_files in ["all", "simple_orphan"] and output_orphan is not None 303 output_unmatched_peaks_file = output_files in ["all", "matched_peaks_unmatched_peaks"] and output_unmatched_peaks is not None
304 output_histogram_file = output_files in ["all"] and output_histogram is not None 304 output_statistics_histogram_file = output_files in ["all"] and output_statistics_histogram is not None
305 replicates = [] 305 replicates = []
306 for i, dataset_path in enumerate(dataset_paths): 306 for i, dataset_path in enumerate(dataset_paths):
307 try: 307 try:
308 galaxy_hid = galaxy_hids[i] 308 galaxy_hid = galaxy_hids[i]
309 r = Replicate(galaxy_hid, dataset_path) 309 r = Replicate(galaxy_hid, dataset_path)
334 'median midpoint', 334 'median midpoint',
335 'median midpoint+1', 335 'median midpoint+1',
336 'c-w sum', 336 'c-w sum',
337 'c-w distance', 337 'c-w distance',
338 'replicate id') 338 'replicate id')
339 summary_output = td_writer(output_summary) 339 matched_peaks_output = td_writer(output_matched_peaks)
340 if output_key_file: 340 if output_statistics_table_file:
341 key_output = td_writer(output_key) 341 statistics_table_output = td_writer(output_statistics_table)
342 key_output.writerow(('data', 'median read count')) 342 statistics_table_output.writerow(('data', 'median read count'))
343 if output_detail_file: 343 if output_detail_file:
344 detail_output = td_writer(output_detail) 344 detail_output = td_writer(output_detail)
345 detail_output.writerow(labels) 345 detail_output.writerow(labels)
346 if output_orphan_file: 346 if output_unmatched_peaks_file:
347 orphan_output = td_writer(output_orphan) 347 unmatched_peaks_output = td_writer(output_unmatched_peaks)
348 orphan_output.writerow(('chrom', 'midpoint', 'midpoint+1', 'c-w sum', 'c-w distance', 'replicate id')) 348 unmatched_peaks_output.writerow(('chrom', 'midpoint', 'midpoint+1', 'c-w sum', 'c-w distance', 'replicate id'))
349 # Perform filtering 349 # Perform filtering
350 if up_limit < 1000 or low_limit > -1000: 350 if up_limit < 1000 or low_limit > -1000:
351 for replicate in replicates: 351 for replicate in replicates:
352 replicate.filter(up_limit, low_limit) 352 replicate.filter(up_limit, low_limit)
353 # Actually merge the peaks 353 # Actually merge the peaks
354 peak_groups = [] 354 peak_groups = []
355 orphans = [] 355 unmatched_peaks = []
356 freq = FrequencyDistribution() 356 freq = FrequencyDistribution()
357 357
358 def do_match(reps, distance): 358 def do_match(reps, distance):
359 # Copy list because we will mutate it, but keep replicate references. 359 # Copy list because we will mutate it, but keep replicate references.
360 reps = reps[:] 360 reps = reps[:]
413 else: 413 else:
414 for d in range(0, distance, step): 414 for d in range(0, distance, step):
415 do_match(replicates, d) 415 do_match(replicates, d)
416 for group in peak_groups: 416 for group in peak_groups:
417 freq.add(group.num_replicates) 417 freq.add(group.num_replicates)
418 # Collect together the remaining orphans 418 # Collect together the remaining unmatched_peaks
419 for replicate in replicates: 419 for replicate in replicates:
420 for chromosome in replicate.chromosomes.values(): 420 for chromosome in replicate.chromosomes.values():
421 for peak in chromosome.peaks: 421 for peak in chromosome.peaks:
422 freq.add(1) 422 freq.add(1)
423 orphans.append(peak) 423 unmatched_peaks.append(peak)
424 # Average the orphan count in the graph by # replicates 424 # Average the unmatched_peaks count in the graph by # replicates
425 med = median([peak.value for group in peak_groups for peak in group.peaks.values()]) 425 med = median([peak.value for group in peak_groups for peak in group.peaks.values()])
426 for replicate in replicates: 426 for replicate in replicates:
427 replicate.median = median([peak.value for group in peak_groups for peak in group.peaks.values() if peak.replicate == replicate]) 427 replicate.median = median([peak.value for group in peak_groups for peak in group.peaks.values() if peak.replicate == replicate])
428 key_output.writerow((replicate.id, replicate.median)) 428 statistics_table_output.writerow((replicate.id, replicate.median))
429 for group in peak_groups: 429 for group in peak_groups:
430 # Output summary (matched pairs). 430 # Output matched_peaks (matched pairs).
431 summary_output.writerow(gff_row(cname=group.chrom, 431 matched_peaks_output.writerow(gff_row(cname=group.chrom,
432 start=group.midpoint, 432 start=group.midpoint,
433 end=group.midpoint+1, 433 end=group.midpoint+1,
434 source='repmatch', 434 source='repmatch',
435 score=group.normalized_value(med), 435 score=group.normalized_value(med),
436 attrs={'median_distance': group.median_distance, 436 attrs={'median_distance': group.median_distance,
437 'replicates': group.num_replicates, 437 'replicates': group.num_replicates,
438 'value_sum': group.value_sum})) 438 'value_sum': group.value_sum}))
439 if output_detail_file: 439 if output_detail_file:
440 summary = (group.chrom, 440 matched_peaks = (group.chrom,
441 group.midpoint, 441 group.midpoint,
442 group.midpoint+1, 442 group.midpoint+1,
443 group.normalized_value(med), 443 group.normalized_value(med),
444 group.num_replicates, 444 group.num_replicates,
445 group.median_distance, 445 group.median_distance,
446 group.value_sum) 446 group.value_sum)
447 for peak in group.peaks.values(): 447 for peak in group.peaks.values():
448 summary += (peak.chrom, peak.midpoint, peak.midpoint+1, peak.value, peak.distance, peak.replicate.id) 448 matched_peaks += (peak.chrom, peak.midpoint, peak.midpoint+1, peak.value, peak.distance, peak.replicate.id)
449 detail_output.writerow(summary) 449 detail_output.writerow(matched_peaks)
450 if output_orphan_file: 450 if output_unmatched_peaks_file:
451 for orphan in orphans: 451 for unmatched_peak in unmatched_peaks:
452 orphan_output.writerow((orphan.chrom, 452 unmatched_peaks_output.writerow((unmatched_peak.chrom,
453 orphan.midpoint, 453 unmatched_peak.midpoint,
454 orphan.midpoint+1, 454 unmatched_peak.midpoint+1,
455 orphan.value, 455 unmatched_peak.value,
456 orphan.distance, 456 unmatched_peak.distance,
457 orphan.replicate.id)) 457 unmatched_peak.replicate.id))
458 if output_histogram_file: 458 if output_statistics_histogram_file:
459 tmp_histogram_path = get_temporary_plot_path() 459 tmp_statistics_histogram_path = get_temporary_plot_path()
460 frequency_histogram([freq], tmp_histogram_path) 460 frequency_histogram([freq], tmp_statistics_histogram_path)
461 shutil.move(tmp_histogram_path, output_histogram) 461 shutil.move(tmp_statistics_histogram_path, output_statistics_histogram)
462 return {'distribution': freq} 462 return {'distribution': freq}