view rlGAT/gat-compare.py @ 11:53487f21c0d5 draft

Uploaded
author fubar
date Thu, 29 Aug 2013 01:57:54 -0400
parents
children
line wrap: on
line source

################################################################################
#
#   MRC FGU Computational Genomics Group
#
#   $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $
#
#   Copyright (C) 2009 Andreas Heger
#
#   This program is free software; you can redistribute it and/or
#   modify it under the terms of the GNU General Public License
#   as published by the Free Software Foundation; either version 2
#   of the License, or (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program; if not, write to the Free Software
#   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#################################################################################
'''
gat-compare - compare two gat runs
==================================

:Author: Andreas Heger
:Release: $Id$
:Date: |today|
:Tags: Python

Purpose
-------

This script compares gat runs and compares statistical significance of fold change differences between two or more
regions of interest.

This script requires counts files created by gat (option ``--output-counts-file``).

The reported observed value between sets 1 and 2 is the difference of fc2 and fc1:

   * observed = fold2 - fold1

The script accepts one or more output files from a gat run. 

If only a single file is given, the significance of fold change differences are computed 
for each pairwise comparison of annotations. Thus, the script might answer the question 
if a transcription factor is more enriched in promotors than in enhancers.

If multiple files are given, the script computes the fold change differences for the same 
annotation for all pairwise combinations of :term:`segments of interest` across the different
files. Thus, the script might answer the question if transcription factor A is more enriched
in promotors than transcription factor B is enriched in promotors.

Usage
-----

Example::

   python gat-compare.py tfA.counts.tsv.gz tfB.counts.tsv.gz
      
Type::

   python gat-compare.py --help

for command line help.

Documentation
-------------

Code
----

'''

import os, sys, re, optparse, collections, types, glob, time, math
import itertools
import numpy

import gat
import gat.Experiment as E
import gat.IOTools as IOTools
import gat.IO as IO

def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser( version = "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $", 
                                    usage = globals()["__doc__"] )

    parser.add_option("-o", "--order", dest="output_order", type="choice",
                      choices = ( "track", "annotation", "fold", "pvalue", "qvalue", "observed" ),
                      help="order results in output by fold, track, etc. [default=%default]."  )

    parser.add_option("-p", "--pvalue-method", dest="pvalue_method", type="choice",
                      choices = ( "empirical", "norm", ),
                      help="type of pvalue reported [default=%default]."  )

    parser.add_option("-q", "--qvalue-method", dest="qvalue_method", type="choice",
                      choices = ( "storey", "BH", "bonferroni", "holm", "hommel", "hochberg", "BY", "none" ),
                      help="method to perform multiple testing correction by controlling the fdr [default=%default]."  )

    parser.add_option( "--qvalue-lambda", dest="qvalue_lambda", type="float",
                      help="fdr computation: lambda [default=%default]."  )

    parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
                       choices = ("smoother", "bootstrap" ),
                       help="fdr computation: method for estimating pi0 [default=%default]."  )

    parser.add_option( "--descriptions", dest="input_filename_descriptions", type="string", 
                       help="filename mapping annotation terms to descriptions. "
                            " if given, the output table will contain additional columns "
                            " [default=%default]" )

    parser.add_option( "--pseudo-count", dest="pseudo_count", type="float",
                      help="pseudo count. The pseudo count is added to both the observed and expected overlap. "
                       " Using a pseudo-count avoids gat reporting fold changes of 0 [default=%default]."  )

    parser.add_option( "--output-plots-pattern", dest="output_plots_pattern", type="string", 
                       help="output pattern for plots [default=%default]" )

    parser.set_defaults(
        pvalue_method = "empirical",
        qvalue_method = "BH",
        qvalue_lambda = None,
        qvalue_pi0_method = "smoother",
        # pseudo count for fold change computation to avoid 0 fc
        pseudo_count = 1.0,
        output_order = "observed",
        )

    ## add common options (-h/--help, ...) and parse command line 
    (options, args) = E.Start( parser, argv = argv, add_output_options = True )

    input_filenames_counts = args

    ##################################################
    E.info( "received %i filenames with counts" % len(input_filenames_counts))

    ##################################################
    description_header, descriptions, description_width = IO.readDescriptions( options )

    all_annotator_results = []

    for input_filename_counts in input_filenames_counts:
 
        E.info("processing %s" % input_filename_counts )
        
        annotator_results = gat.fromCounts( input_filename_counts )

        ##################################################
        if options.pvalue_method != "empirical":
            E.info("updating pvalues to %s" % options.pvalue_method )
            gat.updatePValues( annotator_results, options.pvalue_method )

        ##################################################
        ##################################################
        ##################################################
        ## compute global fdr
        ##################################################
        E.info( "computing FDR statistics" )
        gat.updateQValues( annotator_results, 
                           method = options.qvalue_method,
                           vlambda = options.qvalue_lambda,
                           pi0_method = options.qvalue_pi0_method )

        all_annotator_results.append( annotator_results )
        
    pseudo_count = options.pseudo_count
    results = []

    if len(all_annotator_results) == 1:
        E.info("performing pairwise comparison within a file")

        # collect all annotations
        annotations, segments = list(), set()
        for x in all_annotator_results[0]:
            segments.add( x.track)
            annotations.append( x )

        if len(segments) != 1:
            raise NotImplementedError( "multiple segments of interest")

        for data1, data2 in itertools.combinations(annotations, 2):
            
            # note that fold changes can be very large if there are 0 samples
            # this is fine for getting the distributional params (mean, stddev)
            fold_changes1 = data1.observed / (data1.samples + pseudo_count ) 
            fold_changes2 = data2.observed / (data2.samples + pseudo_count ) 

            # add a separate fc pseudo-count to avoid 0 values
            fold_changes1 += 0.0001
            fold_changes2 += 0.0001

            # Test is if relative fold change rfc is different from 1
            # note: rfc = fc1 / fc2 = obs1 / exp1 * obs2 / exp2
            #                       = obs1 / obs2 * exp2 / exp1
            # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1
            #
            # Convert to log space for easier plotting
            # Move the observed fold ratio in order to get an idea of the magnitude
            # of the underlying fold change
            delta_fold = data2.fold - data1.fold
            sampled_delta_fold = numpy.log( fold_changes1 / fold_changes2) + delta_fold
            observed_delta_fold = 0.0 + delta_fold

            result = gat.AnnotatorResult( data1.annotation, data2.annotation, 
                                          "na",
                                          observed_delta_fold,
                                          sampled_delta_fold, 
                                          reference = None,
                                          pseudo_count = 0 )


            results.append( result )


    else:
        E.info("performing pairwise comparison between files")

        ##################################################
        # perform pairwise comparison
        for index1, index2 in itertools.combinations( range(len(input_filenames_counts)), 2):
            E.info( "comparing %i and %i" % (index1,index2))
            a,b = all_annotator_results[index1], all_annotator_results[index2]

            # index results in a and b
            aa = collections.defaultdict( dict )
            for x in a: aa[x.track][x.annotation] = x

            bb = collections.defaultdict( dict )
            for x in b: bb[x.track][x.annotation] = x

            if len(aa.keys()) != 1 or len(bb.keys()) != 1:
                raise NotImplementedError( "multiple segments of interest")

            track = "merged"
            # get shared annotations
            annotations1 = aa[track].keys()
            annotations2 = bb[track].keys()
            shared_annotations = list( set(annotations1).intersection( set(annotations2) ) )
            E.info( "%i shared annotations" % len(shared_annotations))

            for annotation in shared_annotations:

                # if not annotation.startswith("Ram:"): continue

                data1 = aa[track][annotation]
                data2 = bb[track][annotation]

                # note that fold changes can be very large if there are 0 samples
                # this is fine for getting the distributional params (mean, stddev)
                fold_changes1 = data1.observed / (data1.samples + pseudo_count ) 
                fold_changes2 = data2.observed / (data2.samples + pseudo_count ) 

                # add a separate fc pseudo-count to avoid 0 values
                fold_changes1 += 0.0001
                fold_changes2 += 0.0001

                # Test is if relative fold change rfc is different from 1
                # note: rfc = fc1 / fc2 = obs1 / exp1 * obs2 / exp2
                #                       = obs1 / obs2 * exp2 / exp1
                # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1
                #
                # Convert to log space for easier plotting
                # Move the observed fold ratio in order to get an idea of the magnitude
                # of the underlying fold change
                delta_fold = data2.fold - data1.fold
                sampled_delta_fold = numpy.log( fold_changes1 / fold_changes2) + delta_fold
                observed_delta_fold = 0.0 + delta_fold

                result = gat.AnnotatorResult( track, annotation, 
                                              "na",
                                              observed_delta_fold,
                                              sampled_delta_fold, 
                                              reference = None,
                                              pseudo_count = 0 )


                results.append( result )


    if len(results) == 0:
        E.critical( "no results found" )
        E.Stop()
        return
            
    IO.outputResults( results, 
                      options, 
                      gat.AnnotatorResult.headers,
                      description_header, 
                      description_width,
                      descriptions,
                      format_observed = "%6.4f" )

    IO.plotResults( results, options )

    ## write footer and output benchmark information.
    E.Stop()

if __name__ == "__main__":
    sys.exit( main( sys.argv) )