Mercurial > repos > greg > repmatch_gff3
comparison repmatch_gff3_util.py @ 8:d10ae3aeebc8 draft
Uploaded
| author | greg |
|---|---|
| date | Wed, 02 Dec 2015 16:15:35 -0500 |
| parents | 53cbf79396d7 |
| children |
comparison
equal
deleted
inserted
replaced
| 7:1807688a8a5f | 8:d10ae3aeebc8 |
|---|---|
| 263 os.close(fd) | 263 os.close(fd) |
| 264 return name | 264 return name |
| 265 | 265 |
| 266 | 266 |
| 267 def process_files(dataset_paths, galaxy_hids, method, distance, step, replicates, up_limit, low_limit, output_files, | 267 def process_files(dataset_paths, galaxy_hids, method, distance, step, replicates, up_limit, low_limit, output_files, |
| 268 output_summary, output_orphan, output_detail, output_key, output_histogram): | 268 output_matched_peaks, output_unmatched_peaks, output_detail, output_statistics_table, output_statistics_histogram): |
| 269 output_histogram_file = output_files in ["all"] and method in ["all"] | 269 output_statistics_histogram_file = output_files in ["all"] and method in ["all"] |
| 270 if len(dataset_paths) < 2: | 270 if len(dataset_paths) < 2: |
| 271 return | 271 return |
| 272 if method == 'all': | 272 if method == 'all': |
| 273 match_methods = METHODS.keys() | 273 match_methods = METHODS.keys() |
| 274 else: | 274 else: |
| 281 step, | 281 step, |
| 282 replicates, | 282 replicates, |
| 283 up_limit, | 283 up_limit, |
| 284 low_limit, | 284 low_limit, |
| 285 output_files, | 285 output_files, |
| 286 output_summary, | 286 output_matched_peaks, |
| 287 output_orphan, | 287 output_unmatched_peaks, |
| 288 output_detail, | 288 output_detail, |
| 289 output_key, | 289 output_statistics_table, |
| 290 output_histogram) | 290 output_statistics_histogram) |
| 291 if output_histogram_file: | 291 if output_statistics_histogram_file: |
| 292 tmp_histogram_path = get_temporary_plot_path() | 292 tmp_statistics_histogram_path = get_temporary_plot_path() |
| 293 frequency_histogram([stat['distribution'] for stat in [statistics]], | 293 frequency_histogram([stat['distribution'] for stat in [statistics]], |
| 294 tmp_histogram_path, | 294 tmp_statistics_histogram_path, |
| 295 METHODS.keys()) | 295 METHODS.keys()) |
| 296 shutil.move(tmp_histogram_path, output_histogram) | 296 shutil.move(tmp_statistics_histogram_path, output_statistics_histogram) |
| 297 | 297 |
| 298 | 298 |
| 299 def perform_process(dataset_paths, galaxy_hids, method, distance, step, num_required, up_limit, low_limit, output_files, | 299 def perform_process(dataset_paths, galaxy_hids, method, distance, step, num_required, up_limit, low_limit, output_files, |
| 300 output_summary, output_orphan, output_detail, output_key, output_histogram): | 300 output_matched_peaks, output_unmatched_peaks, output_detail, output_statistics_table, output_statistics_histogram): |
| 301 output_detail_file = output_files in ["all"] and output_detail is not None | 301 output_detail_file = output_files in ["all"] and output_detail is not None |
| 302 output_key_file = output_files in ["all"] and output_key is not None | 302 output_statistics_table_file = output_files in ["all"] and output_statistics_table is not None |
| 303 output_orphan_file = output_files in ["all", "simple_orphan"] and output_orphan is not None | 303 output_unmatched_peaks_file = output_files in ["all", "matched_peaks_unmatched_peaks"] and output_unmatched_peaks is not None |
| 304 output_histogram_file = output_files in ["all"] and output_histogram is not None | 304 output_statistics_histogram_file = output_files in ["all"] and output_statistics_histogram is not None |
| 305 replicates = [] | 305 replicates = [] |
| 306 for i, dataset_path in enumerate(dataset_paths): | 306 for i, dataset_path in enumerate(dataset_paths): |
| 307 try: | 307 try: |
| 308 galaxy_hid = galaxy_hids[i] | 308 galaxy_hid = galaxy_hids[i] |
| 309 r = Replicate(galaxy_hid, dataset_path) | 309 r = Replicate(galaxy_hid, dataset_path) |
| 334 'median midpoint', | 334 'median midpoint', |
| 335 'median midpoint+1', | 335 'median midpoint+1', |
| 336 'c-w sum', | 336 'c-w sum', |
| 337 'c-w distance', | 337 'c-w distance', |
| 338 'replicate id') | 338 'replicate id') |
| 339 summary_output = td_writer(output_summary) | 339 matched_peaks_output = td_writer(output_matched_peaks) |
| 340 if output_key_file: | 340 if output_statistics_table_file: |
| 341 key_output = td_writer(output_key) | 341 statistics_table_output = td_writer(output_statistics_table) |
| 342 key_output.writerow(('data', 'median read count')) | 342 statistics_table_output.writerow(('data', 'median read count')) |
| 343 if output_detail_file: | 343 if output_detail_file: |
| 344 detail_output = td_writer(output_detail) | 344 detail_output = td_writer(output_detail) |
| 345 detail_output.writerow(labels) | 345 detail_output.writerow(labels) |
| 346 if output_orphan_file: | 346 if output_unmatched_peaks_file: |
| 347 orphan_output = td_writer(output_orphan) | 347 unmatched_peaks_output = td_writer(output_unmatched_peaks) |
| 348 orphan_output.writerow(('chrom', 'midpoint', 'midpoint+1', 'c-w sum', 'c-w distance', 'replicate id')) | 348 unmatched_peaks_output.writerow(('chrom', 'midpoint', 'midpoint+1', 'c-w sum', 'c-w distance', 'replicate id')) |
| 349 # Perform filtering | 349 # Perform filtering |
| 350 if up_limit < 1000 or low_limit > -1000: | 350 if up_limit < 1000 or low_limit > -1000: |
| 351 for replicate in replicates: | 351 for replicate in replicates: |
| 352 replicate.filter(up_limit, low_limit) | 352 replicate.filter(up_limit, low_limit) |
| 353 # Actually merge the peaks | 353 # Actually merge the peaks |
| 354 peak_groups = [] | 354 peak_groups = [] |
| 355 orphans = [] | 355 unmatched_peaks = [] |
| 356 freq = FrequencyDistribution() | 356 freq = FrequencyDistribution() |
| 357 | 357 |
| 358 def do_match(reps, distance): | 358 def do_match(reps, distance): |
| 359 # Copy list because we will mutate it, but keep replicate references. | 359 # Copy list because we will mutate it, but keep replicate references. |
| 360 reps = reps[:] | 360 reps = reps[:] |
| 413 else: | 413 else: |
| 414 for d in range(0, distance, step): | 414 for d in range(0, distance, step): |
| 415 do_match(replicates, d) | 415 do_match(replicates, d) |
| 416 for group in peak_groups: | 416 for group in peak_groups: |
| 417 freq.add(group.num_replicates) | 417 freq.add(group.num_replicates) |
| 418 # Collect together the remaining orphans | 418 # Collect together the remaining unmatched_peaks |
| 419 for replicate in replicates: | 419 for replicate in replicates: |
| 420 for chromosome in replicate.chromosomes.values(): | 420 for chromosome in replicate.chromosomes.values(): |
| 421 for peak in chromosome.peaks: | 421 for peak in chromosome.peaks: |
| 422 freq.add(1) | 422 freq.add(1) |
| 423 orphans.append(peak) | 423 unmatched_peaks.append(peak) |
| 424 # Average the orphan count in the graph by # replicates | 424 # Average the unmatched_peaks count in the graph by # replicates |
| 425 med = median([peak.value for group in peak_groups for peak in group.peaks.values()]) | 425 med = median([peak.value for group in peak_groups for peak in group.peaks.values()]) |
| 426 for replicate in replicates: | 426 for replicate in replicates: |
| 427 replicate.median = median([peak.value for group in peak_groups for peak in group.peaks.values() if peak.replicate == replicate]) | 427 replicate.median = median([peak.value for group in peak_groups for peak in group.peaks.values() if peak.replicate == replicate]) |
| 428 key_output.writerow((replicate.id, replicate.median)) | 428 statistics_table_output.writerow((replicate.id, replicate.median)) |
| 429 for group in peak_groups: | 429 for group in peak_groups: |
| 430 # Output summary (matched pairs). | 430 # Output matched_peaks (matched pairs). |
| 431 summary_output.writerow(gff_row(cname=group.chrom, | 431 matched_peaks_output.writerow(gff_row(cname=group.chrom, |
| 432 start=group.midpoint, | 432 start=group.midpoint, |
| 433 end=group.midpoint+1, | 433 end=group.midpoint+1, |
| 434 source='repmatch', | 434 source='repmatch', |
| 435 score=group.normalized_value(med), | 435 score=group.normalized_value(med), |
| 436 attrs={'median_distance': group.median_distance, | 436 attrs={'median_distance': group.median_distance, |
| 437 'replicates': group.num_replicates, | 437 'replicates': group.num_replicates, |
| 438 'value_sum': group.value_sum})) | 438 'value_sum': group.value_sum})) |
| 439 if output_detail_file: | 439 if output_detail_file: |
| 440 summary = (group.chrom, | 440 matched_peaks = (group.chrom, |
| 441 group.midpoint, | 441 group.midpoint, |
| 442 group.midpoint+1, | 442 group.midpoint+1, |
| 443 group.normalized_value(med), | 443 group.normalized_value(med), |
| 444 group.num_replicates, | 444 group.num_replicates, |
| 445 group.median_distance, | 445 group.median_distance, |
| 446 group.value_sum) | 446 group.value_sum) |
| 447 for peak in group.peaks.values(): | 447 for peak in group.peaks.values(): |
| 448 summary += (peak.chrom, peak.midpoint, peak.midpoint+1, peak.value, peak.distance, peak.replicate.id) | 448 matched_peaks += (peak.chrom, peak.midpoint, peak.midpoint+1, peak.value, peak.distance, peak.replicate.id) |
| 449 detail_output.writerow(summary) | 449 detail_output.writerow(matched_peaks) |
| 450 if output_orphan_file: | 450 if output_unmatched_peaks_file: |
| 451 for orphan in orphans: | 451 for unmatched_peak in unmatched_peaks: |
| 452 orphan_output.writerow((orphan.chrom, | 452 unmatched_peaks_output.writerow((unmatched_peak.chrom, |
| 453 orphan.midpoint, | 453 unmatched_peak.midpoint, |
| 454 orphan.midpoint+1, | 454 unmatched_peak.midpoint+1, |
| 455 orphan.value, | 455 unmatched_peak.value, |
| 456 orphan.distance, | 456 unmatched_peak.distance, |
| 457 orphan.replicate.id)) | 457 unmatched_peak.replicate.id)) |
| 458 if output_histogram_file: | 458 if output_statistics_histogram_file: |
| 459 tmp_histogram_path = get_temporary_plot_path() | 459 tmp_statistics_histogram_path = get_temporary_plot_path() |
| 460 frequency_histogram([freq], tmp_histogram_path) | 460 frequency_histogram([freq], tmp_statistics_histogram_path) |
| 461 shutil.move(tmp_histogram_path, output_histogram) | 461 shutil.move(tmp_statistics_histogram_path, output_statistics_histogram) |
| 462 return {'distribution': freq} | 462 return {'distribution': freq} |
