Mercurial > repos > greg > cwpair2

--- a/cwpair2_util.py	Tue Nov 24 08:15:57 2015 -0500
+++ b/cwpair2_util.py	Wed Dec 02 16:13:51 2015 -0500
@@ -7,14 +7,19 @@
 matplotlib.use('Agg')
 from matplotlib import pyplot

+# Data outputs
 DETAILS = 'D'
-FINAL_PLOTS = 'F'
+MATCHED_PAIRS = 'MP'
 ORPHANS = 'O'
-PREVIEW_PLOTS = 'P'
-SIMPLES = 'S'
-STATS_GRAPH = 'C'
+# Data output formats
 GFF_EXT = 'gff'
 TABULAR_EXT = 'tabular'
+# Statistics histograms output directory.
+HISTOGRAM = 'H'
+# Statistics outputs
+FINAL_PLOTS = 'F'
+PREVIEW_PLOTS = 'P'
+STATS_GRAPH = 'C'

 # Graph settings.
 COLORS = 'krg'
@@ -210,22 +215,16 @@
     pyplot.savefig(fname)


-def create_directories(method):
-    if method == 'all':
-        match_methods = METHODS.keys()
-    else:
-        match_methods = [method]
-    for match_method in match_methods:
-        os.mkdir('%s_%s' % (match_method, DETAILS))
-        os.mkdir('%s_%s' % (match_method, FINAL_PLOTS))
-        os.mkdir('%s_%s' % (match_method, ORPHANS))
-        os.mkdir('%s_%s' % (match_method, PREVIEW_PLOTS))
-        os.mkdir('%s_%s' % (match_method, SIMPLES))
-        os.mkdir('%s_%s' % (match_method, STATS_GRAPH))
+def create_directories():
+    # Output histograms in pdf.
+    os.mkdir(HISTOGRAM)
+    os.mkdir('data_%s' % DETAILS)
+    os.mkdir('data_%s' % ORPHANS)
+    os.mkdir('data_%s' % MATCHED_PAIRS)


 def process_file(dataset_path, galaxy_hid, method, threshold, up_distance,
-                 down_distance, binsize, output_files, sort_score):
+                 down_distance, binsize, output_files):
     if method == 'all':
         match_methods = METHODS.keys()
     else:
@@ -239,8 +238,7 @@
                                 up_distance,
                                 down_distance,
                                 binsize,
-                                output_files,
-                                sort_score)
+                                output_files)
         statistics.append(stats)
     if output_files == 'all' and method == 'all':
         frequency_plot([s['dist'] for s in statistics],
@@ -250,10 +248,10 @@


 def perform_process(dataset_path, galaxy_hid, method, threshold, up_distance,
-                    down_distance, binsize, output_files, sort_score):
-    output_details = output_files in ["all", "simple_orphan_detail"]
+                    down_distance, binsize, output_files):
+    output_details = output_files in ["all", "matched_pair_orphan_detail"]
     output_plots = output_files in ["all"]
-    output_orphans = output_files in ["all", "simple_orphan", "simple_orphan_detail"]
+    output_orphans = output_files in ["all", "matched_pair_orphan", "matched_pair_orphan_detail"]
     # Keep track of statistics for the output file
     statistics = {}
     input = csv.reader(open(dataset_path, 'rt'), delimiter='\t')
@@ -264,15 +262,18 @@
         filter_string = 'fa%d' % threshold
     else:
         filter_string = 'f%d' % (threshold * 100)
-    fname = 'data_%s_%su%dd%db%d' % (galaxy_hid, filter_string, up_distance, down_distance, binsize)
+    fname = '%s_%su%dd%d_on_data_%s' % (method, filter_string, up_distance, down_distance, galaxy_hid)

-    def make_path(output_type, extension=TABULAR_EXT):
-        # Returns the full path for a certain output.
+    def make_histogram_path(output_type, fname):
+        return os.path.join(HISTOGRAM, 'histogram_%s_%s.%s' % (output_type, fname, PLOT_FORMAT))
+
+    def make_path(output_type, extension, fname):
+        # Returns the full path for an output.
         return os.path.join(output_type, '%s_%s.%s' % (output_type, fname, extension))

-    def td_writer(output_type, extension=TABULAR_EXT):
+    def td_writer(output_type, extension, fname):
         # Returns a tab-delimited writer for a specified output.
-        output_file_path = make_path(output_type, extension)
+        output_file_path = make_path(output_type, extension, fname)
         return csv.writer(open(output_file_path, 'wt'), delimiter='\t')

     try:
@@ -281,23 +282,23 @@
         stop_err('Unable to parse file "%s".\n%s' % (dataset_path, traceback.format_exc()))
     if output_details:
         # Details
-        detailed_output = td_writer('%s_%s' % (method, DETAILS), extension=TABULAR_EXT)
+        detailed_output = td_writer('data_%s' % DETAILS, TABULAR_EXT, fname)
         detailed_output.writerow(('chrom', 'start', 'end', 'value', 'strand') * 2 + ('midpoint', 'c-w reads sum', 'c-w distance (bp)'))
     if output_plots:
         # Final Plot
-        final_plot_path = make_path('%s_%s' % (method, FINAL_PLOTS), PLOT_FORMAT)
+        final_plot_path = make_histogram_path(FINAL_PLOTS, fname)
     if output_orphans:
         # Orphans
-        orphan_output = td_writer('%s_%s' % (method, ORPHANS), extension=TABULAR_EXT)
+        orphan_output = td_writer('data_%s' % ORPHANS, TABULAR_EXT, fname)
         orphan_output.writerow(('chrom', 'strand', 'start', 'end', 'value'))
     if output_plots:
         # Preview Plot
-        preview_plot_path = make_path('%s_%s' % (method, PREVIEW_PLOTS), PLOT_FORMAT)
-    # Simple
-    simple_output = td_writer('%s_%s' % (method, SIMPLES), extension=GFF_EXT)
+        preview_plot_path = make_histogram_path(PREVIEW_PLOTS, fname)
+    # Matched Pairs.
+    matched_pairs_output = td_writer('data_%s' % MATCHED_PAIRS, GFF_EXT, fname)
     statistics['stats_path'] = 'statistics.%s' % TABULAR_EXT
     if output_plots:
-        statistics['graph_path'] = make_path('%s_%s' % (method, STATS_GRAPH), PLOT_FORMAT)
+        statistics['graph_path'] = make_histogram_path(STATS_GRAPH, fname)
     statistics['perc95'] = perc95(chromosomes)
     if threshold > 0:
         # Apply filter
@@ -366,11 +367,8 @@
                 orphan_output.writerow((cname, cpeak[0], cpeak[1], cpeak[2], cpeak[3]))
         # Keep track of orphans for statistics.
         orphans += len(crick)
-    # Sort output by score if specified.
-    if sort_score == "desc":
-        x.sort(key=lambda data: float(data[5]), reverse=True)
-    elif sort_score == "asc":
-        x.sort(key=lambda data: float(data[5]))
+    # Sort output descending by score.
+    x.sort(key=lambda data: float(data[5]), reverse=True)
     # Writing a summary to gff format file
     for row in x:
         row_tmp = list(row)
@@ -385,7 +383,7 @@
         else:
             row_tmp[0] = row_tmp[0]
         # Print row_tmp.
-        simple_output.writerow(row_tmp)
+        matched_pairs_output.writerow(row_tmp)
     statistics['paired'] = dist.size() * 2
     statistics['orphans'] = orphans
     statistics['final_mode'] = dist.mode()