Mercurial > repos > greg > repmatch_gff3

--- a/repmatch_gff3_util.py	Sat Nov 28 14:54:42 2015 -0500
+++ b/repmatch_gff3_util.py	Wed Dec 02 16:15:35 2015 -0500
@@ -265,8 +265,8 @@


 def process_files(dataset_paths, galaxy_hids, method, distance, step, replicates, up_limit, low_limit, output_files,
-                  output_summary, output_orphan, output_detail, output_key, output_histogram):
-    output_histogram_file = output_files in ["all"] and method in ["all"]
+                  output_matched_peaks, output_unmatched_peaks, output_detail, output_statistics_table, output_statistics_histogram):
+    output_statistics_histogram_file = output_files in ["all"] and method in ["all"]
     if len(dataset_paths) < 2:
         return
     if method == 'all':
@@ -283,25 +283,25 @@
                                      up_limit,
                                      low_limit,
                                      output_files,
-                                     output_summary,
-                                     output_orphan,
+                                     output_matched_peaks,
+                                     output_unmatched_peaks,
                                      output_detail,
-                                     output_key,
-                                     output_histogram)
-    if output_histogram_file:
-        tmp_histogram_path = get_temporary_plot_path()
+                                     output_statistics_table,
+                                     output_statistics_histogram)
+    if output_statistics_histogram_file:
+        tmp_statistics_histogram_path = get_temporary_plot_path()
         frequency_histogram([stat['distribution'] for stat in [statistics]],
-                            tmp_histogram_path,
+                            tmp_statistics_histogram_path,
                             METHODS.keys())
-        shutil.move(tmp_histogram_path, output_histogram)
+        shutil.move(tmp_statistics_histogram_path, output_statistics_histogram)


 def perform_process(dataset_paths, galaxy_hids, method, distance, step, num_required, up_limit, low_limit, output_files,
-                    output_summary, output_orphan, output_detail, output_key, output_histogram):
+                    output_matched_peaks, output_unmatched_peaks, output_detail, output_statistics_table, output_statistics_histogram):
     output_detail_file = output_files in ["all"] and output_detail is not None
-    output_key_file = output_files in ["all"] and output_key is not None
-    output_orphan_file = output_files in ["all", "simple_orphan"] and output_orphan is not None
-    output_histogram_file = output_files in ["all"] and output_histogram is not None
+    output_statistics_table_file = output_files in ["all"] and output_statistics_table is not None
+    output_unmatched_peaks_file = output_files in ["all", "matched_peaks_unmatched_peaks"] and output_unmatched_peaks is not None
+    output_statistics_histogram_file = output_files in ["all"] and output_statistics_histogram is not None
     replicates = []
     for i, dataset_path in enumerate(dataset_paths):
         try:
@@ -336,23 +336,23 @@
                    'c-w sum',
                    'c-w distance',
                    'replicate id')
-    summary_output = td_writer(output_summary)
-    if output_key_file:
-        key_output = td_writer(output_key)
-        key_output.writerow(('data', 'median read count'))
+    matched_peaks_output = td_writer(output_matched_peaks)
+    if output_statistics_table_file:
+        statistics_table_output = td_writer(output_statistics_table)
+        statistics_table_output.writerow(('data', 'median read count'))
     if output_detail_file:
         detail_output = td_writer(output_detail)
         detail_output.writerow(labels)
-    if output_orphan_file:
-        orphan_output = td_writer(output_orphan)
-        orphan_output.writerow(('chrom', 'midpoint', 'midpoint+1', 'c-w sum', 'c-w distance', 'replicate id'))
+    if output_unmatched_peaks_file:
+        unmatched_peaks_output = td_writer(output_unmatched_peaks)
+        unmatched_peaks_output.writerow(('chrom', 'midpoint', 'midpoint+1', 'c-w sum', 'c-w distance', 'replicate id'))
     # Perform filtering
     if up_limit < 1000 or low_limit > -1000:
         for replicate in replicates:
             replicate.filter(up_limit, low_limit)
     # Actually merge the peaks
     peak_groups = []
-    orphans = []
+    unmatched_peaks = []
     freq = FrequencyDistribution()

     def do_match(reps, distance):
@@ -415,48 +415,48 @@
             do_match(replicates, d)
     for group in peak_groups:
         freq.add(group.num_replicates)
-    # Collect together the remaining orphans
+    # Collect together the remaining unmatched peaks
     for replicate in replicates:
         for chromosome in replicate.chromosomes.values():
             for peak in chromosome.peaks:
                 freq.add(1)
-                orphans.append(peak)
-    # Average the orphan count in the graph by # replicates
+                unmatched_peaks.append(peak)
+    # Average the unmatched peak count in the graph by the number of replicates
     med = median([peak.value for group in peak_groups for peak in group.peaks.values()])
     for replicate in replicates:
         replicate.median = median([peak.value for group in peak_groups for peak in group.peaks.values() if peak.replicate == replicate])
-        key_output.writerow((replicate.id, replicate.median))
+        statistics_table_output.writerow((replicate.id, replicate.median))
     for group in peak_groups:
-        # Output summary (matched pairs).
-        summary_output.writerow(gff_row(cname=group.chrom,
-                                        start=group.midpoint,
-                                        end=group.midpoint+1,
-                                        source='repmatch',
-                                        score=group.normalized_value(med),
-                                        attrs={'median_distance': group.median_distance,
-                                               'replicates': group.num_replicates,
-                                               'value_sum': group.value_sum}))
+        # Output matched peaks (matched pairs).
+        matched_peaks_output.writerow(gff_row(cname=group.chrom,
+                                              start=group.midpoint,
+                                              end=group.midpoint+1,
+                                              source='repmatch',
+                                              score=group.normalized_value(med),
+                                              attrs={'median_distance': group.median_distance,
+                                                     'replicates': group.num_replicates,
+                                                     'value_sum': group.value_sum}))
         if output_detail_file:
-            summary = (group.chrom,
-                       group.midpoint,
-                       group.midpoint+1,
-                       group.normalized_value(med),
-                       group.num_replicates,
-                       group.median_distance,
-                       group.value_sum)
+            matched_peaks = (group.chrom,
+                             group.midpoint,
+                             group.midpoint+1,
+                             group.normalized_value(med),
+                             group.num_replicates,
+                             group.median_distance,
+                             group.value_sum)
             for peak in group.peaks.values():
-                summary += (peak.chrom, peak.midpoint, peak.midpoint+1, peak.value, peak.distance, peak.replicate.id)
-            detail_output.writerow(summary)
-    if output_orphan_file:
-        for orphan in orphans:
-            orphan_output.writerow((orphan.chrom,
-                                    orphan.midpoint,
-                                    orphan.midpoint+1,
-                                    orphan.value,
-                                    orphan.distance,
-                                    orphan.replicate.id))
-    if output_histogram_file:
-        tmp_histogram_path = get_temporary_plot_path()
-        frequency_histogram([freq], tmp_histogram_path)
-        shutil.move(tmp_histogram_path, output_histogram)
+                matched_peaks += (peak.chrom, peak.midpoint, peak.midpoint+1, peak.value, peak.distance, peak.replicate.id)
+            detail_output.writerow(matched_peaks)
+    if output_unmatched_peaks_file:
+        for unmatched_peak in unmatched_peaks:
+            unmatched_peaks_output.writerow((unmatched_peak.chrom,
+                                             unmatched_peak.midpoint,
+                                             unmatched_peak.midpoint+1,
+                                             unmatched_peak.value,
+                                             unmatched_peak.distance,
+                                             unmatched_peak.replicate.id))
+    if output_statistics_histogram_file:
+        tmp_statistics_histogram_path = get_temporary_plot_path()
+        frequency_histogram([freq], tmp_statistics_histogram_path)
+        shutil.move(tmp_statistics_histogram_path, output_statistics_histogram)
     return {'distribution': freq}