Mercurial > repos > greg > repmatch_gff3
comparison repmatch_gff3_util.py @ 8:d10ae3aeebc8 draft
Uploaded
author | greg |
---|---|
date | Wed, 02 Dec 2015 16:15:35 -0500 |
parents | 53cbf79396d7 |
children |
comparison
equal
deleted
inserted
replaced
7:1807688a8a5f | 8:d10ae3aeebc8 |
---|---|
263 os.close(fd) | 263 os.close(fd) |
264 return name | 264 return name |
265 | 265 |
266 | 266 |
267 def process_files(dataset_paths, galaxy_hids, method, distance, step, replicates, up_limit, low_limit, output_files, | 267 def process_files(dataset_paths, galaxy_hids, method, distance, step, replicates, up_limit, low_limit, output_files, |
268 output_summary, output_orphan, output_detail, output_key, output_histogram): | 268 output_matched_peaks, output_unmatched_peaks, output_detail, output_statistics_table, output_statistics_histogram): |
269 output_histogram_file = output_files in ["all"] and method in ["all"] | 269 output_statistics_histogram_file = output_files in ["all"] and method in ["all"] |
270 if len(dataset_paths) < 2: | 270 if len(dataset_paths) < 2: |
271 return | 271 return |
272 if method == 'all': | 272 if method == 'all': |
273 match_methods = METHODS.keys() | 273 match_methods = METHODS.keys() |
274 else: | 274 else: |
281 step, | 281 step, |
282 replicates, | 282 replicates, |
283 up_limit, | 283 up_limit, |
284 low_limit, | 284 low_limit, |
285 output_files, | 285 output_files, |
286 output_summary, | 286 output_matched_peaks, |
287 output_orphan, | 287 output_unmatched_peaks, |
288 output_detail, | 288 output_detail, |
289 output_key, | 289 output_statistics_table, |
290 output_histogram) | 290 output_statistics_histogram) |
291 if output_histogram_file: | 291 if output_statistics_histogram_file: |
292 tmp_histogram_path = get_temporary_plot_path() | 292 tmp_statistics_histogram_path = get_temporary_plot_path() |
293 frequency_histogram([stat['distribution'] for stat in [statistics]], | 293 frequency_histogram([stat['distribution'] for stat in [statistics]], |
294 tmp_histogram_path, | 294 tmp_statistics_histogram_path, |
295 METHODS.keys()) | 295 METHODS.keys()) |
296 shutil.move(tmp_histogram_path, output_histogram) | 296 shutil.move(tmp_statistics_histogram_path, output_statistics_histogram) |
297 | 297 |
298 | 298 |
299 def perform_process(dataset_paths, galaxy_hids, method, distance, step, num_required, up_limit, low_limit, output_files, | 299 def perform_process(dataset_paths, galaxy_hids, method, distance, step, num_required, up_limit, low_limit, output_files, |
300 output_summary, output_orphan, output_detail, output_key, output_histogram): | 300 output_matched_peaks, output_unmatched_peaks, output_detail, output_statistics_table, output_statistics_histogram): |
301 output_detail_file = output_files in ["all"] and output_detail is not None | 301 output_detail_file = output_files in ["all"] and output_detail is not None |
302 output_key_file = output_files in ["all"] and output_key is not None | 302 output_statistics_table_file = output_files in ["all"] and output_statistics_table is not None |
303 output_orphan_file = output_files in ["all", "simple_orphan"] and output_orphan is not None | 303 output_unmatched_peaks_file = output_files in ["all", "matched_peaks_unmatched_peaks"] and output_unmatched_peaks is not None |
304 output_histogram_file = output_files in ["all"] and output_histogram is not None | 304 output_statistics_histogram_file = output_files in ["all"] and output_statistics_histogram is not None |
305 replicates = [] | 305 replicates = [] |
306 for i, dataset_path in enumerate(dataset_paths): | 306 for i, dataset_path in enumerate(dataset_paths): |
307 try: | 307 try: |
308 galaxy_hid = galaxy_hids[i] | 308 galaxy_hid = galaxy_hids[i] |
309 r = Replicate(galaxy_hid, dataset_path) | 309 r = Replicate(galaxy_hid, dataset_path) |
334 'median midpoint', | 334 'median midpoint', |
335 'median midpoint+1', | 335 'median midpoint+1', |
336 'c-w sum', | 336 'c-w sum', |
337 'c-w distance', | 337 'c-w distance', |
338 'replicate id') | 338 'replicate id') |
339 summary_output = td_writer(output_summary) | 339 matched_peaks_output = td_writer(output_matched_peaks) |
340 if output_key_file: | 340 if output_statistics_table_file: |
341 key_output = td_writer(output_key) | 341 statistics_table_output = td_writer(output_statistics_table) |
342 key_output.writerow(('data', 'median read count')) | 342 statistics_table_output.writerow(('data', 'median read count')) |
343 if output_detail_file: | 343 if output_detail_file: |
344 detail_output = td_writer(output_detail) | 344 detail_output = td_writer(output_detail) |
345 detail_output.writerow(labels) | 345 detail_output.writerow(labels) |
346 if output_orphan_file: | 346 if output_unmatched_peaks_file: |
347 orphan_output = td_writer(output_orphan) | 347 unmatched_peaks_output = td_writer(output_unmatched_peaks) |
348 orphan_output.writerow(('chrom', 'midpoint', 'midpoint+1', 'c-w sum', 'c-w distance', 'replicate id')) | 348 unmatched_peaks_output.writerow(('chrom', 'midpoint', 'midpoint+1', 'c-w sum', 'c-w distance', 'replicate id')) |
349 # Perform filtering | 349 # Perform filtering |
350 if up_limit < 1000 or low_limit > -1000: | 350 if up_limit < 1000 or low_limit > -1000: |
351 for replicate in replicates: | 351 for replicate in replicates: |
352 replicate.filter(up_limit, low_limit) | 352 replicate.filter(up_limit, low_limit) |
353 # Actually merge the peaks | 353 # Actually merge the peaks |
354 peak_groups = [] | 354 peak_groups = [] |
355 orphans = [] | 355 unmatched_peaks = [] |
356 freq = FrequencyDistribution() | 356 freq = FrequencyDistribution() |
357 | 357 |
358 def do_match(reps, distance): | 358 def do_match(reps, distance): |
359 # Copy list because we will mutate it, but keep replicate references. | 359 # Copy list because we will mutate it, but keep replicate references. |
360 reps = reps[:] | 360 reps = reps[:] |
413 else: | 413 else: |
414 for d in range(0, distance, step): | 414 for d in range(0, distance, step): |
415 do_match(replicates, d) | 415 do_match(replicates, d) |
416 for group in peak_groups: | 416 for group in peak_groups: |
417 freq.add(group.num_replicates) | 417 freq.add(group.num_replicates) |
418 # Collect together the remaining orphans | 418 # Collect together the remaining unmatched_peaks |
419 for replicate in replicates: | 419 for replicate in replicates: |
420 for chromosome in replicate.chromosomes.values(): | 420 for chromosome in replicate.chromosomes.values(): |
421 for peak in chromosome.peaks: | 421 for peak in chromosome.peaks: |
422 freq.add(1) | 422 freq.add(1) |
423 orphans.append(peak) | 423 unmatched_peaks.append(peak) |
424 # Average the orphan count in the graph by # replicates | 424 # Average the unmatched_peaks count in the graph by # replicates |
425 med = median([peak.value for group in peak_groups for peak in group.peaks.values()]) | 425 med = median([peak.value for group in peak_groups for peak in group.peaks.values()]) |
426 for replicate in replicates: | 426 for replicate in replicates: |
427 replicate.median = median([peak.value for group in peak_groups for peak in group.peaks.values() if peak.replicate == replicate]) | 427 replicate.median = median([peak.value for group in peak_groups for peak in group.peaks.values() if peak.replicate == replicate]) |
428 key_output.writerow((replicate.id, replicate.median)) | 428 statistics_table_output.writerow((replicate.id, replicate.median)) |
429 for group in peak_groups: | 429 for group in peak_groups: |
430 # Output summary (matched pairs). | 430 # Output matched_peaks (matched pairs). |
431 summary_output.writerow(gff_row(cname=group.chrom, | 431 matched_peaks_output.writerow(gff_row(cname=group.chrom, |
432 start=group.midpoint, | 432 start=group.midpoint, |
433 end=group.midpoint+1, | 433 end=group.midpoint+1, |
434 source='repmatch', | 434 source='repmatch', |
435 score=group.normalized_value(med), | 435 score=group.normalized_value(med), |
436 attrs={'median_distance': group.median_distance, | 436 attrs={'median_distance': group.median_distance, |
437 'replicates': group.num_replicates, | 437 'replicates': group.num_replicates, |
438 'value_sum': group.value_sum})) | 438 'value_sum': group.value_sum})) |
439 if output_detail_file: | 439 if output_detail_file: |
440 summary = (group.chrom, | 440 matched_peaks = (group.chrom, |
441 group.midpoint, | 441 group.midpoint, |
442 group.midpoint+1, | 442 group.midpoint+1, |
443 group.normalized_value(med), | 443 group.normalized_value(med), |
444 group.num_replicates, | 444 group.num_replicates, |
445 group.median_distance, | 445 group.median_distance, |
446 group.value_sum) | 446 group.value_sum) |
447 for peak in group.peaks.values(): | 447 for peak in group.peaks.values(): |
448 summary += (peak.chrom, peak.midpoint, peak.midpoint+1, peak.value, peak.distance, peak.replicate.id) | 448 matched_peaks += (peak.chrom, peak.midpoint, peak.midpoint+1, peak.value, peak.distance, peak.replicate.id) |
449 detail_output.writerow(summary) | 449 detail_output.writerow(matched_peaks) |
450 if output_orphan_file: | 450 if output_unmatched_peaks_file: |
451 for orphan in orphans: | 451 for unmatched_peak in unmatched_peaks: |
452 orphan_output.writerow((orphan.chrom, | 452 unmatched_peaks_output.writerow((unmatched_peak.chrom, |
453 orphan.midpoint, | 453 unmatched_peak.midpoint, |
454 orphan.midpoint+1, | 454 unmatched_peak.midpoint+1, |
455 orphan.value, | 455 unmatched_peak.value, |
456 orphan.distance, | 456 unmatched_peak.distance, |
457 orphan.replicate.id)) | 457 unmatched_peak.replicate.id)) |
458 if output_histogram_file: | 458 if output_statistics_histogram_file: |
459 tmp_histogram_path = get_temporary_plot_path() | 459 tmp_statistics_histogram_path = get_temporary_plot_path() |
460 frequency_histogram([freq], tmp_histogram_path) | 460 frequency_histogram([freq], tmp_statistics_histogram_path) |
461 shutil.move(tmp_histogram_path, output_histogram) | 461 shutil.move(tmp_statistics_histogram_path, output_statistics_histogram) |
462 return {'distribution': freq} | 462 return {'distribution': freq} |