diff rlGAT/gat-compare.py @ 11:53487f21c0d5 draft

Uploaded
author fubar
date Thu, 29 Aug 2013 01:57:54 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rlGAT/gat-compare.py	Thu Aug 29 01:57:54 2013 -0400
@@ -0,0 +1,308 @@
+################################################################################
+#
+#   MRC FGU Computational Genomics Group
+#
+#   $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $
+#
+#   Copyright (C) 2009 Andreas Heger
+#
+#   This program is free software; you can redistribute it and/or
+#   modify it under the terms of the GNU General Public License
+#   as published by the Free Software Foundation; either version 2
+#   of the License, or (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU General Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License
+#   along with this program; if not, write to the Free Software
+#   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+#################################################################################
+'''
+gat-compare - compare two gat runs
+==================================
+
+:Author: Andreas Heger
+:Release: $Id$
+:Date: |today|
+:Tags: Python
+
+Purpose
+-------
+
+This script compares gat runs and compares statistical significance of fold change differences between two or more
+regions of interest.
+
+This script requires counts files created by gat (option ``--output-counts-file``).
+
+The reported observed value between sets 1 and 2 is the difference of fc2 and fc1:
+
+   * observed = fold2 - fold1
+
+The script accepts one or more output files from a gat run. 
+
+If only a single file is given, the significance of fold change differences are computed 
+for each pairwise comparison of annotations. Thus, the script might answer the question 
+if a transcription factor is more enriched in promotors than in enhancers.
+
+If multiple files are given, the script computes the fold change differences for the same 
+annotation for all pairwise combinations of :term:`segments of interest` across the different
+files. Thus, the script might answer the question if transcription factor A is more enriched
+in promotors than transcription factor B is enriched in promotors.
+
+Usage
+-----
+
+Example::
+
+   python gat-compare.py tfA.counts.tsv.gz tfB.counts.tsv.gz
+      
+Type::
+
+   python gat-compare.py --help
+
+for command line help.
+
+Documentation
+-------------
+
+Code
+----
+
+'''
+
+import os, sys, re, optparse, collections, types, glob, time, math
+import itertools
+import numpy
+
+import gat
+import gat.Experiment as E
+import gat.IOTools as IOTools
+import gat.IO as IO
+
+def main( argv = None ):
+    """script main.
+
+    parses command line options in sys.argv, unless *argv* is given.
+    """
+
+    if not argv: argv = sys.argv
+
+    # setup command line parser
+    parser = optparse.OptionParser( version = "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $", 
+                                    usage = globals()["__doc__"] )
+
+    parser.add_option("-o", "--order", dest="output_order", type="choice",
+                      choices = ( "track", "annotation", "fold", "pvalue", "qvalue", "observed" ),
+                      help="order results in output by fold, track, etc. [default=%default]."  )
+
+    parser.add_option("-p", "--pvalue-method", dest="pvalue_method", type="choice",
+                      choices = ( "empirical", "norm", ),
+                      help="type of pvalue reported [default=%default]."  )
+
+    parser.add_option("-q", "--qvalue-method", dest="qvalue_method", type="choice",
+                      choices = ( "storey", "BH", "bonferroni", "holm", "hommel", "hochberg", "BY", "none" ),
+                      help="method to perform multiple testing correction by controlling the fdr [default=%default]."  )
+
+    parser.add_option( "--qvalue-lambda", dest="qvalue_lambda", type="float",
+                      help="fdr computation: lambda [default=%default]."  )
+
+    parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
+                       choices = ("smoother", "bootstrap" ),
+                       help="fdr computation: method for estimating pi0 [default=%default]."  )
+
+    parser.add_option( "--descriptions", dest="input_filename_descriptions", type="string", 
+                       help="filename mapping annotation terms to descriptions. "
+                            " if given, the output table will contain additional columns "
+                            " [default=%default]" )
+
+    parser.add_option( "--pseudo-count", dest="pseudo_count", type="float",
+                      help="pseudo count. The pseudo count is added to both the observed and expected overlap. "
+                       " Using a pseudo-count avoids gat reporting fold changes of 0 [default=%default]."  )
+
+    parser.add_option( "--output-plots-pattern", dest="output_plots_pattern", type="string", 
+                       help="output pattern for plots [default=%default]" )
+
+    parser.set_defaults(
+        pvalue_method = "empirical",
+        qvalue_method = "BH",
+        qvalue_lambda = None,
+        qvalue_pi0_method = "smoother",
+        # pseudo count for fold change computation to avoid 0 fc
+        pseudo_count = 1.0,
+        output_order = "observed",
+        )
+
+    ## add common options (-h/--help, ...) and parse command line 
+    (options, args) = E.Start( parser, argv = argv, add_output_options = True )
+
+    input_filenames_counts = args
+
+    ##################################################
+    E.info( "received %i filenames with counts" % len(input_filenames_counts))
+
+    ##################################################
+    description_header, descriptions, description_width = IO.readDescriptions( options )
+
+    all_annotator_results = []
+
+    for input_filename_counts in input_filenames_counts:
+ 
+        E.info("processing %s" % input_filename_counts )
+        
+        annotator_results = gat.fromCounts( input_filename_counts )
+
+        ##################################################
+        if options.pvalue_method != "empirical":
+            E.info("updating pvalues to %s" % options.pvalue_method )
+            gat.updatePValues( annotator_results, options.pvalue_method )
+
+        ##################################################
+        ##################################################
+        ##################################################
+        ## compute global fdr
+        ##################################################
+        E.info( "computing FDR statistics" )
+        gat.updateQValues( annotator_results, 
+                           method = options.qvalue_method,
+                           vlambda = options.qvalue_lambda,
+                           pi0_method = options.qvalue_pi0_method )
+
+        all_annotator_results.append( annotator_results )
+        
+    pseudo_count = options.pseudo_count
+    results = []
+
+    if len(all_annotator_results) == 1:
+        E.info("performing pairwise comparison within a file")
+
+        # collect all annotations
+        annotations, segments = list(), set()
+        for x in all_annotator_results[0]:
+            segments.add( x.track)
+            annotations.append( x )
+
+        if len(segments) != 1:
+            raise NotImplementedError( "multiple segments of interest")
+
+        for data1, data2 in itertools.combinations(annotations, 2):
+            
+            # note that fold changes can be very large if there are 0 samples
+            # this is fine for getting the distributional params (mean, stddev)
+            fold_changes1 = data1.observed / (data1.samples + pseudo_count ) 
+            fold_changes2 = data2.observed / (data2.samples + pseudo_count ) 
+
+            # add a separate fc pseudo-count to avoid 0 values
+            fold_changes1 += 0.0001
+            fold_changes2 += 0.0001
+
+            # Test is if relative fold change rfc is different from 1
+            # note: rfc = fc1 / fc2 = obs1 / exp1 * obs2 / exp2
+            #                       = obs1 / obs2 * exp2 / exp1
+            # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1
+            #
+            # Convert to log space for easier plotting
+            # Move the observed fold ratio in order to get an idea of the magnitude
+            # of the underlying fold change
+            delta_fold = data2.fold - data1.fold
+            sampled_delta_fold = numpy.log( fold_changes1 / fold_changes2) + delta_fold
+            observed_delta_fold = 0.0 + delta_fold
+
+            result = gat.AnnotatorResult( data1.annotation, data2.annotation, 
+                                          "na",
+                                          observed_delta_fold,
+                                          sampled_delta_fold, 
+                                          reference = None,
+                                          pseudo_count = 0 )
+
+
+            results.append( result )
+
+
+    else:
+        E.info("performing pairwise comparison between files")
+
+        ##################################################
+        # perform pairwise comparison
+        for index1, index2 in itertools.combinations( range(len(input_filenames_counts)), 2):
+            E.info( "comparing %i and %i" % (index1,index2))
+            a,b = all_annotator_results[index1], all_annotator_results[index2]
+
+            # index results in a and b
+            aa = collections.defaultdict( dict )
+            for x in a: aa[x.track][x.annotation] = x
+
+            bb = collections.defaultdict( dict )
+            for x in b: bb[x.track][x.annotation] = x
+
+            if len(aa.keys()) != 1 or len(bb.keys()) != 1:
+                raise NotImplementedError( "multiple segments of interest")
+
+            track = "merged"
+            # get shared annotations
+            annotations1 = aa[track].keys()
+            annotations2 = bb[track].keys()
+            shared_annotations = list( set(annotations1).intersection( set(annotations2) ) )
+            E.info( "%i shared annotations" % len(shared_annotations))
+
+            for annotation in shared_annotations:
+
+                # if not annotation.startswith("Ram:"): continue
+
+                data1 = aa[track][annotation]
+                data2 = bb[track][annotation]
+
+                # note that fold changes can be very large if there are 0 samples
+                # this is fine for getting the distributional params (mean, stddev)
+                fold_changes1 = data1.observed / (data1.samples + pseudo_count ) 
+                fold_changes2 = data2.observed / (data2.samples + pseudo_count ) 
+
+                # add a separate fc pseudo-count to avoid 0 values
+                fold_changes1 += 0.0001
+                fold_changes2 += 0.0001
+
+                # Test is if relative fold change rfc is different from 1
+                # note: rfc = fc1 / fc2 = obs1 / exp1 * obs2 / exp2
+                #                       = obs1 / obs2 * exp2 / exp1
+                # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1
+                #
+                # Convert to log space for easier plotting
+                # Move the observed fold ratio in order to get an idea of the magnitude
+                # of the underlying fold change
+                delta_fold = data2.fold - data1.fold
+                sampled_delta_fold = numpy.log( fold_changes1 / fold_changes2) + delta_fold
+                observed_delta_fold = 0.0 + delta_fold
+
+                result = gat.AnnotatorResult( track, annotation, 
+                                              "na",
+                                              observed_delta_fold,
+                                              sampled_delta_fold, 
+                                              reference = None,
+                                              pseudo_count = 0 )
+
+
+                results.append( result )
+
+
+    if len(results) == 0:
+        E.critical( "no results found" )
+        E.Stop()
+        return
+            
+    IO.outputResults( results, 
+                      options, 
+                      gat.AnnotatorResult.headers,
+                      description_header, 
+                      description_width,
+                      descriptions,
+                      format_observed = "%6.4f" )
+
+    IO.plotResults( results, options )
+
+    ## write footer and output benchmark information.
+    E.Stop()
+
+if __name__ == "__main__":
+    sys.exit( main( sys.argv) )