Mercurial > repos > fubar > genomic_association_tester
diff rlGAT/gat-compare.py @ 11:53487f21c0d5 draft
Uploaded
| author | fubar |
|---|---|
| date | Thu, 29 Aug 2013 01:57:54 -0400 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rlGAT/gat-compare.py Thu Aug 29 01:57:54 2013 -0400 @@ -0,0 +1,308 @@ +################################################################################ +# +# MRC FGU Computational Genomics Group +# +# $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $ +# +# Copyright (C) 2009 Andreas Heger +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +################################################################################# +''' +gat-compare - compare two gat runs +================================== + +:Author: Andreas Heger +:Release: $Id$ +:Date: |today| +:Tags: Python + +Purpose +------- + +This script compares gat runs and compares statistical significance of fold change differences between two or more +regions of interest. + +This script requires counts files created by gat (option ``--output-counts-file``). + +The reported observed value between sets 1 and 2 is the difference of fc2 and fc1: + + * observed = fold2 - fold1 + +The script accepts one or more output files from a gat run. + +If only a single file is given, the significance of fold change differences are computed +for each pairwise comparison of annotations. Thus, the script might answer the question +if a transcription factor is more enriched in promotors than in enhancers. + +If multiple files are given, the script computes the fold change differences for the same +annotation for all pairwise combinations of :term:`segments of interest` across the different +files. Thus, the script might answer the question if transcription factor A is more enriched +in promotors than transcription factor B is enriched in promotors. + +Usage +----- + +Example:: + + python gat-compare.py tfA.counts.tsv.gz tfB.counts.tsv.gz + +Type:: + + python gat-compare.py --help + +for command line help. + +Documentation +------------- + +Code +---- + +''' + +import os, sys, re, optparse, collections, types, glob, time, math +import itertools +import numpy + +import gat +import gat.Experiment as E +import gat.IOTools as IOTools +import gat.IO as IO + +def main( argv = None ): + """script main. + + parses command line options in sys.argv, unless *argv* is given. + """ + + if not argv: argv = sys.argv + + # setup command line parser + parser = optparse.OptionParser( version = "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $", + usage = globals()["__doc__"] ) + + parser.add_option("-o", "--order", dest="output_order", type="choice", + choices = ( "track", "annotation", "fold", "pvalue", "qvalue", "observed" ), + help="order results in output by fold, track, etc. [default=%default]." ) + + parser.add_option("-p", "--pvalue-method", dest="pvalue_method", type="choice", + choices = ( "empirical", "norm", ), + help="type of pvalue reported [default=%default]." ) + + parser.add_option("-q", "--qvalue-method", dest="qvalue_method", type="choice", + choices = ( "storey", "BH", "bonferroni", "holm", "hommel", "hochberg", "BY", "none" ), + help="method to perform multiple testing correction by controlling the fdr [default=%default]." ) + + parser.add_option( "--qvalue-lambda", dest="qvalue_lambda", type="float", + help="fdr computation: lambda [default=%default]." ) + + parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice", + choices = ("smoother", "bootstrap" ), + help="fdr computation: method for estimating pi0 [default=%default]." ) + + parser.add_option( "--descriptions", dest="input_filename_descriptions", type="string", + help="filename mapping annotation terms to descriptions. " + " if given, the output table will contain additional columns " + " [default=%default]" ) + + parser.add_option( "--pseudo-count", dest="pseudo_count", type="float", + help="pseudo count. The pseudo count is added to both the observed and expected overlap. " + " Using a pseudo-count avoids gat reporting fold changes of 0 [default=%default]." ) + + parser.add_option( "--output-plots-pattern", dest="output_plots_pattern", type="string", + help="output pattern for plots [default=%default]" ) + + parser.set_defaults( + pvalue_method = "empirical", + qvalue_method = "BH", + qvalue_lambda = None, + qvalue_pi0_method = "smoother", + # pseudo count for fold change computation to avoid 0 fc + pseudo_count = 1.0, + output_order = "observed", + ) + + ## add common options (-h/--help, ...) and parse command line + (options, args) = E.Start( parser, argv = argv, add_output_options = True ) + + input_filenames_counts = args + + ################################################## + E.info( "received %i filenames with counts" % len(input_filenames_counts)) + + ################################################## + description_header, descriptions, description_width = IO.readDescriptions( options ) + + all_annotator_results = [] + + for input_filename_counts in input_filenames_counts: + + E.info("processing %s" % input_filename_counts ) + + annotator_results = gat.fromCounts( input_filename_counts ) + + ################################################## + if options.pvalue_method != "empirical": + E.info("updating pvalues to %s" % options.pvalue_method ) + gat.updatePValues( annotator_results, options.pvalue_method ) + + ################################################## + ################################################## + ################################################## + ## compute global fdr + ################################################## + E.info( "computing FDR statistics" ) + gat.updateQValues( annotator_results, + method = options.qvalue_method, + vlambda = options.qvalue_lambda, + pi0_method = options.qvalue_pi0_method ) + + all_annotator_results.append( annotator_results ) + + pseudo_count = options.pseudo_count + results = [] + + if len(all_annotator_results) == 1: + E.info("performing pairwise comparison within a file") + + # collect all annotations + annotations, segments = list(), set() + for x in all_annotator_results[0]: + segments.add( x.track) + annotations.append( x ) + + if len(segments) != 1: + raise NotImplementedError( "multiple segments of interest") + + for data1, data2 in itertools.combinations(annotations, 2): + + # note that fold changes can be very large if there are 0 samples + # this is fine for getting the distributional params (mean, stddev) + fold_changes1 = data1.observed / (data1.samples + pseudo_count ) + fold_changes2 = data2.observed / (data2.samples + pseudo_count ) + + # add a separate fc pseudo-count to avoid 0 values + fold_changes1 += 0.0001 + fold_changes2 += 0.0001 + + # Test is if relative fold change rfc is different from 1 + # note: rfc = fc1 / fc2 = obs1 / exp1 * obs2 / exp2 + # = obs1 / obs2 * exp2 / exp1 + # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1 + # + # Convert to log space for easier plotting + # Move the observed fold ratio in order to get an idea of the magnitude + # of the underlying fold change + delta_fold = data2.fold - data1.fold + sampled_delta_fold = numpy.log( fold_changes1 / fold_changes2) + delta_fold + observed_delta_fold = 0.0 + delta_fold + + result = gat.AnnotatorResult( data1.annotation, data2.annotation, + "na", + observed_delta_fold, + sampled_delta_fold, + reference = None, + pseudo_count = 0 ) + + + results.append( result ) + + + else: + E.info("performing pairwise comparison between files") + + ################################################## + # perform pairwise comparison + for index1, index2 in itertools.combinations( range(len(input_filenames_counts)), 2): + E.info( "comparing %i and %i" % (index1,index2)) + a,b = all_annotator_results[index1], all_annotator_results[index2] + + # index results in a and b + aa = collections.defaultdict( dict ) + for x in a: aa[x.track][x.annotation] = x + + bb = collections.defaultdict( dict ) + for x in b: bb[x.track][x.annotation] = x + + if len(aa.keys()) != 1 or len(bb.keys()) != 1: + raise NotImplementedError( "multiple segments of interest") + + track = "merged" + # get shared annotations + annotations1 = aa[track].keys() + annotations2 = bb[track].keys() + shared_annotations = list( set(annotations1).intersection( set(annotations2) ) ) + E.info( "%i shared annotations" % len(shared_annotations)) + + for annotation in shared_annotations: + + # if not annotation.startswith("Ram:"): continue + + data1 = aa[track][annotation] + data2 = bb[track][annotation] + + # note that fold changes can be very large if there are 0 samples + # this is fine for getting the distributional params (mean, stddev) + fold_changes1 = data1.observed / (data1.samples + pseudo_count ) + fold_changes2 = data2.observed / (data2.samples + pseudo_count ) + + # add a separate fc pseudo-count to avoid 0 values + fold_changes1 += 0.0001 + fold_changes2 += 0.0001 + + # Test is if relative fold change rfc is different from 1 + # note: rfc = fc1 / fc2 = obs1 / exp1 * obs2 / exp2 + # = obs1 / obs2 * exp2 / exp1 + # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1 + # + # Convert to log space for easier plotting + # Move the observed fold ratio in order to get an idea of the magnitude + # of the underlying fold change + delta_fold = data2.fold - data1.fold + sampled_delta_fold = numpy.log( fold_changes1 / fold_changes2) + delta_fold + observed_delta_fold = 0.0 + delta_fold + + result = gat.AnnotatorResult( track, annotation, + "na", + observed_delta_fold, + sampled_delta_fold, + reference = None, + pseudo_count = 0 ) + + + results.append( result ) + + + if len(results) == 0: + E.critical( "no results found" ) + E.Stop() + return + + IO.outputResults( results, + options, + gat.AnnotatorResult.headers, + description_header, + description_width, + descriptions, + format_observed = "%6.4f" ) + + IO.plotResults( results, options ) + + ## write footer and output benchmark information. + E.Stop() + +if __name__ == "__main__": + sys.exit( main( sys.argv) )
