Mercurial > repos > fubar > genomic_association_tester

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rlGAT/gat-compare.py	Thu Aug 29 01:57:54 2013 -0400
@@ -0,0 +1,308 @@
+################################################################################
+#
+#   MRC FGU Computational Genomics Group
+#
+#   $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $
+#
+#   Copyright (C) 2009 Andreas Heger
+#
+#   This program is free software; you can redistribute it and/or
+#   modify it under the terms of the GNU General Public License
+#   as published by the Free Software Foundation; either version 2
+#   of the License, or (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU General Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License
+#   along with this program; if not, write to the Free Software
+#   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+#################################################################################
+'''
+gat-compare - compare two gat runs
+==================================
+
+:Author: Andreas Heger
+:Release: $Id$
+:Date: |today|
+:Tags: Python
+
+Purpose
+-------
+
+This script compares gat runs and compares statistical significance of fold change differences between two or more
+regions of interest.
+
+This script requires counts files created by gat (option ``--output-counts-file``).
+
+The reported observed value between sets 1 and 2 is the difference of fc2 and fc1:
+
+   * observed = fold2 - fold1
+
+The script accepts one or more output files from a gat run.
+
+If only a single file is given, the significance of fold change differences are computed
+for each pairwise comparison of annotations. Thus, the script might answer the question
+if a transcription factor is more enriched in promotors than in enhancers.
+
+If multiple files are given, the script computes the fold change differences for the same
+annotation for all pairwise combinations of :term:`segments of interest` across the different
+files. Thus, the script might answer the question if transcription factor A is more enriched
+in promotors than transcription factor B is enriched in promotors.
+
+Usage
+-----
+
+Example::
+
+   python gat-compare.py tfA.counts.tsv.gz tfB.counts.tsv.gz
+
+Type::
+
+   python gat-compare.py --help
+
+for command line help.
+
+Documentation
+-------------
+
+Code
+----
+
+'''
+
+import os, sys, re, optparse, collections, types, glob, time, math
+import itertools
+import numpy
+
+import gat
+import gat.Experiment as E
+import gat.IOTools as IOTools
+import gat.IO as IO
+
+def main( argv = None ):
+    """script main.
+
+    parses command line options in sys.argv, unless *argv* is given.
+    """
+
+    if not argv: argv = sys.argv
+
+    # setup command line parser
+    parser = optparse.OptionParser( version = "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
+                                    usage = globals()["__doc__"] )
+
+    parser.add_option("-o", "--order", dest="output_order", type="choice",
+                      choices = ( "track", "annotation", "fold", "pvalue", "qvalue", "observed" ),
+                      help="order results in output by fold, track, etc. [default=%default]."  )
+
+    parser.add_option("-p", "--pvalue-method", dest="pvalue_method", type="choice",
+                      choices = ( "empirical", "norm", ),
+                      help="type of pvalue reported [default=%default]."  )
+
+    parser.add_option("-q", "--qvalue-method", dest="qvalue_method", type="choice",
+                      choices = ( "storey", "BH", "bonferroni", "holm", "hommel", "hochberg", "BY", "none" ),
+                      help="method to perform multiple testing correction by controlling the fdr [default=%default]."  )
+
+    parser.add_option( "--qvalue-lambda", dest="qvalue_lambda", type="float",
+                      help="fdr computation: lambda [default=%default]."  )
+
+    parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
+                       choices = ("smoother", "bootstrap" ),
+                       help="fdr computation: method for estimating pi0 [default=%default]."  )
+
+    parser.add_option( "--descriptions", dest="input_filename_descriptions", type="string",
+                       help="filename mapping annotation terms to descriptions. "
+                            " if given, the output table will contain additional columns "
+                            " [default=%default]" )
+
+    parser.add_option( "--pseudo-count", dest="pseudo_count", type="float",
+                      help="pseudo count. The pseudo count is added to both the observed and expected overlap. "
+                       " Using a pseudo-count avoids gat reporting fold changes of 0 [default=%default]."  )
+
+    parser.add_option( "--output-plots-pattern", dest="output_plots_pattern", type="string",
+                       help="output pattern for plots [default=%default]" )
+
+    parser.set_defaults(
+        pvalue_method = "empirical",
+        qvalue_method = "BH",
+        qvalue_lambda = None,
+        qvalue_pi0_method = "smoother",
+        # pseudo count for fold change computation to avoid 0 fc
+        pseudo_count = 1.0,
+        output_order = "observed",
+        )
+
+    ## add common options (-h/--help, ...) and parse command line
+    (options, args) = E.Start( parser, argv = argv, add_output_options = True )
+
+    input_filenames_counts = args
+
+    ##################################################
+    E.info( "received %i filenames with counts" % len(input_filenames_counts))
+
+    ##################################################
+    description_header, descriptions, description_width = IO.readDescriptions( options )
+
+    all_annotator_results = []
+
+    for input_filename_counts in input_filenames_counts:
+
+        E.info("processing %s" % input_filename_counts )
+
+        annotator_results = gat.fromCounts( input_filename_counts )
+
+        ##################################################
+        if options.pvalue_method != "empirical":
+            E.info("updating pvalues to %s" % options.pvalue_method )
+            gat.updatePValues( annotator_results, options.pvalue_method )
+
+        ##################################################
+        ##################################################
+        ##################################################
+        ## compute global fdr
+        ##################################################
+        E.info( "computing FDR statistics" )
+        gat.updateQValues( annotator_results,
+                           method = options.qvalue_method,
+                           vlambda = options.qvalue_lambda,
+                           pi0_method = options.qvalue_pi0_method )
+
+        all_annotator_results.append( annotator_results )
+
+    pseudo_count = options.pseudo_count
+    results = []
+
+    if len(all_annotator_results) == 1:
+        E.info("performing pairwise comparison within a file")
+
+        # collect all annotations
+        annotations, segments = list(), set()
+        for x in all_annotator_results[0]:
+            segments.add( x.track)
+            annotations.append( x )
+
+        if len(segments) != 1:
+            raise NotImplementedError( "multiple segments of interest")
+
+        for data1, data2 in itertools.combinations(annotations, 2):
+
+            # note that fold changes can be very large if there are 0 samples
+            # this is fine for getting the distributional params (mean, stddev)
+            fold_changes1 = data1.observed / (data1.samples + pseudo_count )
+            fold_changes2 = data2.observed / (data2.samples + pseudo_count )
+
+            # add a separate fc pseudo-count to avoid 0 values
+            fold_changes1 += 0.0001
+            fold_changes2 += 0.0001
+
+            # Test is if relative fold change rfc is different from 1
+            # note: rfc = fc1 / fc2 = obs1 / exp1 * obs2 / exp2
+            #                       = obs1 / obs2 * exp2 / exp1
+            # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1
+            #
+            # Convert to log space for easier plotting
+            # Move the observed fold ratio in order to get an idea of the magnitude
+            # of the underlying fold change
+            delta_fold = data2.fold - data1.fold
+            sampled_delta_fold = numpy.log( fold_changes1 / fold_changes2) + delta_fold
+            observed_delta_fold = 0.0 + delta_fold
+
+            result = gat.AnnotatorResult( data1.annotation, data2.annotation,
+                                          "na",
+                                          observed_delta_fold,
+                                          sampled_delta_fold,
+                                          reference = None,
+                                          pseudo_count = 0 )
+
+
+            results.append( result )
+
+
+    else:
+        E.info("performing pairwise comparison between files")
+
+        ##################################################
+        # perform pairwise comparison
+        for index1, index2 in itertools.combinations( range(len(input_filenames_counts)), 2):
+            E.info( "comparing %i and %i" % (index1,index2))
+            a,b = all_annotator_results[index1], all_annotator_results[index2]
+
+            # index results in a and b
+            aa = collections.defaultdict( dict )
+            for x in a: aa[x.track][x.annotation] = x
+
+            bb = collections.defaultdict( dict )
+            for x in b: bb[x.track][x.annotation] = x
+
+            if len(aa.keys()) != 1 or len(bb.keys()) != 1:
+                raise NotImplementedError( "multiple segments of interest")
+
+            track = "merged"
+            # get shared annotations
+            annotations1 = aa[track].keys()
+            annotations2 = bb[track].keys()
+            shared_annotations = list( set(annotations1).intersection( set(annotations2) ) )
+            E.info( "%i shared annotations" % len(shared_annotations))
+
+            for annotation in shared_annotations:
+
+                # if not annotation.startswith("Ram:"): continue
+
+                data1 = aa[track][annotation]
+                data2 = bb[track][annotation]
+
+                # note that fold changes can be very large if there are 0 samples
+                # this is fine for getting the distributional params (mean, stddev)
+                fold_changes1 = data1.observed / (data1.samples + pseudo_count )
+                fold_changes2 = data2.observed / (data2.samples + pseudo_count )
+
+                # add a separate fc pseudo-count to avoid 0 values
+                fold_changes1 += 0.0001
+                fold_changes2 += 0.0001
+
+                # Test is if relative fold change rfc is different from 1
+                # note: rfc = fc1 / fc2 = obs1 / exp1 * obs2 / exp2
+                #                       = obs1 / obs2 * exp2 / exp1
+                # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1
+                #
+                # Convert to log space for easier plotting
+                # Move the observed fold ratio in order to get an idea of the magnitude
+                # of the underlying fold change
+                delta_fold = data2.fold - data1.fold
+                sampled_delta_fold = numpy.log( fold_changes1 / fold_changes2) + delta_fold
+                observed_delta_fold = 0.0 + delta_fold
+
+                result = gat.AnnotatorResult( track, annotation,
+                                              "na",
+                                              observed_delta_fold,
+                                              sampled_delta_fold,
+                                              reference = None,
+                                              pseudo_count = 0 )
+
+
+                results.append( result )
+
+
+    if len(results) == 0:
+        E.critical( "no results found" )
+        E.Stop()
+        return
+
+    IO.outputResults( results,
+                      options,
+                      gat.AnnotatorResult.headers,
+                      description_header,
+                      description_width,
+                      descriptions,
+                      format_observed = "%6.4f" )
+
+    IO.plotResults( results, options )
+
+    ## write footer and output benchmark information.
+    E.Stop()
+
+if __name__ == "__main__":
+    sys.exit( main( sys.argv) )
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rlGAT/gat-great.py	Thu Aug 29 01:57:54 2013 -0400
@@ -0,0 +1,488 @@
+################################################################################
+#
+#   MRC FGU Computational Genomics Group
+#
+#   $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $
+#
+#   Copyright (C) 2009 Andreas Heger
+#
+#   This program is free software; you can redistribute it and/or
+#   modify it under the terms of the GNU General Public License
+#   as published by the Free Software Foundation; either version 2
+#   of the License, or (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU General Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License
+#   along with this program; if not, write to the Free Software
+#   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+#################################################################################
+'''
+great - run great analysis
+==========================
+
+:Author: Andreas Heger
+:Release: $Id$
+:Date: |today|
+:Tags: Python
+
+Purpose
+-------
+
+This script compares one or more genomic segments of interest
+against one more other genomic annotations.
+
+GREAT says that a segment overlaps an annotation if the midpoint
+of the segment overlaps the annotation.
+
+The GREAT analysis is done on a per-isochore basis and overall.
+For the overall analysis, there is no isochore correction
+as the union of all isochores is treated as a single workspace.
+
+This script implies several counters (``--counter`` option):
+
+binom
+    binomial model (as implemented by GREAT). Returns the probability
+    of a certain number of segments overlapping (mid-point overlap)
+    an annotation
+
+hyperg
+    hypergeometric model. Returns the probabibility of a certain
+    number of nucleotides within segments overlapping an annotation.
+    Conceptually, all genomic positions are treated independently and
+    segments are fragmented into a collection of single bases that are
+    placed indepently. This is valid for point intervals such as SNPs,
+    but of course ignores segment size, though it might be used as a
+    guide towards the expected fold enrichment, that can be computed
+    more accurately using :mod:`gat-run.py`
+
+Usage
+-----
+
+Example::
+
+   python gat-run.py
+      --segment-file=segments.bed.gz
+      --workspace-file=workspace.bed.gz
+      --annotation-file=annotations_architecture.bed.gz
+
+Type::
+
+   python gat-run.py --help
+
+for command line help.
+
+Documentation
+-------------
+
+Code
+----
+
+'''
+
+import os, sys, re, optparse, collections, types, glob, time
+import numpy
+import numpy.random
+
+import gat
+import gat.Experiment as E
+import gat.IOTools as IOTools
+import gat.IO as IO
+import csegmentlist
+
+import scipy.stats
+
+GREAT_RESULT = collections.namedtuple( "GREAT",
+                                       ("track",
+                                        "annotation",
+                                        "isochore",
+                                        "counter",
+                                        "observed", # nsegments_overlapping_annotations
+                                        "expected",
+                                        "nsegments_in_workspace",
+                                        "nannotations_in_workspace",
+                                        "nsegments_overlapping_annotation",
+                                        "nannotations_overlapping_segments",
+                                        "basecoverage_intersection",
+                                        "basecoverage_segments",
+                                        "basecoverage_annotation",
+                                        "basecoverage_workspace",
+                                        "fraction_coverage_annotation",
+                                        "fold",
+                                        "pvalue",
+                                        "qvalue" ) )
+
+def main( argv ):
+
+    if not argv: argv = sys.argv
+
+    # setup command line parser
+    parser = optparse.OptionParser( version = "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
+                                    usage = globals()["__doc__"] )
+
+    parser.add_option("-a", "--annotation-file", "--annotations", dest="annotation_files", type="string", action="append",
+                      help="filename with annotations [default=%default]."  )
+
+    parser.add_option("-s", "--segment-file", "--segments", dest="segment_files", type="string", action="append",
+                      help="filename with segments. Also accepts a glob in parentheses [default=%default]."  )
+
+    parser.add_option("-w", "--workspace-file", "--workspace", dest="workspace_files", type="string", action="append",
+                      help="filename with workspace segments. Also accepts a glob in parentheses [default=%default]."  )
+
+    parser.add_option("-i", "--isochore-file", "--isochores", dest="isochore_files", type="string", action="append",
+                      help="filename with isochore segments. Also accepts a glob in parentheses [default=%default]."  )
+
+    parser.add_option("-o", "--order", dest="output_order", type="choice",
+                      choices = ( "track", "annotation", "fold", "pvalue", "qvalue" ),
+                      help="order results in output by fold, track, etc. [default=%default]."  )
+
+    parser.add_option("-q", "--qvalue-method", dest="qvalue_method", type="choice",
+                      choices = ( "storey", "BH", "bonferroni", "holm", "hommel", "hochberg", "BY", "none" ),
+                      help="method to perform multiple testing correction by controlling the fdr [default=%default]."  )
+
+    parser.add_option( "--qvalue-lambda", dest="qvalue_lambda", type="float",
+                      help="fdr computation: lambda [default=%default]."  )
+
+    parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
+                       choices = ("smoother", "bootstrap" ),
+                       help="fdr computation: method for estimating pi0 [default=%default]."  )
+    parser.add_option( "--descriptions", dest="input_filename_descriptions", type="string",
+                       help="filename mapping annotation terms to descriptions. "
+                            " if given, the output table will contain additional columns "
+                            " [default=%default]" )
+
+    parser.add_option( "--ignore-segment-tracks", dest="ignore_segment_tracks", action="store_true",
+                       help="ignore segment tracks - all segments belong to one track [default=%default]" )
+
+    parser.add_option( "--enable-split-tracks", dest="enable_split_tracks", action="store_true",
+                       help="permit the same track to be in multiple files [default=%default]" )
+
+    parser.add_option( "--output-bed", dest="output_bed", type="choice", action="append",
+                       choices = ( "all",
+                                   "annotations", "segments",
+                                   "workspaces", "isochores",
+                                   "overlap" ),
+                       help="output bed files [default=%default]."  )
+
+    parser.add_option( "--output-stats", dest="output_stats", type="choice", action="append",
+                       choices = ( "all",
+                                   "annotations", "segments",
+                                   "workspaces", "isochores",
+                                   "overlap" ),
+                       help="output overlap summary stats [default=%default]."  )
+
+    parser.add_option( "--restrict-workspace", dest="restrict_workspace", action="store_true",
+                       help="restrict workspace to those segments that contain both track"
+                       " and annotations [default=%default]" )
+
+    parser.add_option( "--counter", dest="counters", type="choice", action="append",
+                       choices = ( "binom", "hyperg" ),
+                       help="counter to use [default=%default]."  )
+
+    parser.add_option( "--output-tables-pattern", dest="output_tables_pattern", type="string",
+                      help="output pattern for result tables. Used if there are multiple counters used [default=%default]."  )
+
+    parser.set_defaults(
+        annotation_files = [],
+        segment_files = [],
+        workspace_files = [],
+        sample_files = [],
+        counters = [],
+        output_stats = [],
+        output_bed = [],
+        output_tables_pattern = "%s.tsv.gz",
+        output_order = "fold",
+        input_filename_counts = None,
+        input_filename_results = None,
+        pvalue_method = "empirical",
+        output_plots_pattern = None,
+        output_samples_pattern = None,
+        qvalue_method = "storey",
+        qvalue_lambda = None,
+        qvalue_pi0_method = "smoother",
+        ignore_segment_tracks = False,
+        input_filename_descriptions = None,
+        conditional = "unconditional",
+        conditional_extension = None,
+        conditional_expansion = None,
+        restrict_workspace = False,
+        enable_split_tracks = False,
+        shift_expansion = 2.0,
+        shift_extension = 0,
+        overlap_mode = "midpoint",
+        truncate_workspace_to_annotations = False,
+        )
+
+    ## add common options (-h/--help, ...) and parse command line
+    (options, args) = E.Start( parser, argv = argv, add_output_options = True )
+
+    tstart = time.time()
+
+    if len(options.counters) == 0:
+        options.counters.append("binom")
+
+    ############################################
+    segments, annotations, workspaces, isochores = IO.buildSegments( options )
+    E.info( "intervals loaded in %i seconds" % (time.time() - tstart) )
+
+    # filter segments by workspace
+    workspace = IO.applyIsochores( segments, annotations, workspaces, options, isochores )
+
+    ############################################
+    description_header, descriptions, description_width = IO.readDescriptions( options )
+
+    ############################################
+    ############################################
+    ## compute per contig
+
+    # compute bases covered by workspace
+    workspace2basecoverage, isochores = {}, []
+    for contig, ww in workspace.iteritems():
+        workspace2basecoverage[contig] = ww.sum()
+        isochores.append( contig )
+
+    # compute percentage of bases covered by annotations in workspace
+    # per isochore
+    annotation2basecoverage = collections.defaultdict( dict )
+    for annotation, aa in annotations.iteritems():
+        for isochore, a in aa.iteritems():
+            # need to truncate to workspace?
+            annotation2basecoverage[annotation][isochore] = a.sum()
+
+    results_per_contig = collections.defaultdict( list )
+
+    E.info( "computing counts per isochore" )
+    # results per isochore
+    def emptyResult( segment, annotation, isochore,
+                     counter,
+                     nsegments_in_workspace,
+                     basecoverage_annotation,
+                     basecoverage_workspace):
+        return GREAT_RESULT._make( (
+                segment, annotation, isochore,
+                counter,
+                0, # observed
+                0, # expected
+                nsegments_in_workspace,
+                0, # nannotations_in_workspace
+                0, # nsegments_overlapping_annotation
+                0, # nannotations_overlapping_segments
+                0, # basecoverage_intersection
+                0, # basecoverage_segments
+                basecoverage_annotation,
+                basecoverage_workspace,
+                0.0,
+                1.0,
+                1.0,
+                1.0 ))
+
+    for isochore in isochores:
+        basecoverage_workspace = workspace2basecoverage[isochore]
+
+        # iterate over all isochores
+        for segment, segmentdict in segments.iteritems():
+            try:
+                ss = segmentdict[isochore]
+                # select segments overlapping workspace
+                segments_in_workspace = csegmentlist.SegmentList( clone = ss )
+                segments_in_workspace.intersect( workspace[isochore] )
+                # number of segments in workspace
+                nsegments_in_workspace = len(segments_in_workspace)
+            except KeyError:
+                ss = None
+
+            basecoverage_segments = segments_in_workspace.sum()
+
+            for annotation, annotationdict in annotations.iteritems():
+
+                # if annotation != "GO:0030957": continue
+
+                try:
+                    aa = annotationdict[isochore]
+                except KeyError:
+                    aa = None
+
+                # p_A: proportion of bases covered by annotation
+                try:
+                    basecoverage_annotation = annotation2basecoverage[annotation][isochore]
+                except KeyError:
+                    basecoverage_annotation = 0
+
+                if ss == None or aa == None:
+                    for counter in options.counters:
+                        results_per_contig[(counter,segment,annotation)].append( emptyResult(segment, annotation,
+                                                                                             isochore,
+                                                                                             counter,
+                                                                                             nsegments_in_workspace,
+                                                                                             basecoverage_annotation,
+                                                                                             basecoverage_workspace ) )
+                    continue
+
+                # select segments overlapping annotation
+                segments_overlapping_annotation = csegmentlist.SegmentList( clone = ss )
+                segments_overlapping_annotation.intersect( annotations[annotation][isochore] )
+                # number of segments in annotation
+                nsegments_overlapping_annotation = ss.intersectionWithSegments( annotations[annotation][isochore],
+                                                                                mode = options.overlap_mode )
+
+                # number of nucleotides at the intersection of segments, annotation and workspace
+                basecoverage_intersection = segments_overlapping_annotation.sum()
+
+                annotations_overlapping_segments = csegmentlist.SegmentList( clone = aa )
+                annotations_overlapping_segments.intersect( ss )
+                nannotations_overlapping_segments = len( annotations_overlapping_segments )
+
+                nannotations_in_workspace = len( aa )
+                if nannotations_in_workspace == 0:
+                    for counter in options.counters:
+                        results_per_contig[(counter,segment,annotation)].append( emptyResult(segment,
+                                                                                             annotation,
+                                                                                             isochore,
+                                                                                             counter,
+                                                                                             nsegments_in_workspace,
+                                                                                             basecoverage_annotation,
+                                                                                             basecoverage_workspace ) )
+                    continue
+
+                fraction_coverage_annotation = basecoverage_annotation / float( basecoverage_workspace )
+                fraction_hit_annotation = float(nannotations_overlapping_segments) / nannotations_in_workspace
+
+                for counter in options.counters:
+                    if counter.startswith( "binom" ):
+                        # GREAT binomial probability over "regions"
+                        # n = number of genomic regions = nannotations_in_workspace
+                        # ppi = fraction of genome annotated by annotation = fraction_coverage_annotation
+                        # kpi = genomic regions with annotation hit by segments = nannotations_in_segments
+                        # sf = survival functions = 1 -cdf
+                        # probability of observing >kpi in a sample of n where the probabily of succes is
+                        # ppi.
+                        pvalue = scipy.stats.binom.sf( nsegments_overlapping_annotation - 1,
+                                                       nsegments_in_workspace,
+                                                       fraction_coverage_annotation )
+
+                        expected = fraction_coverage_annotation * nsegments_in_workspace
+                        observed = nsegments_overlapping_annotation
+
+                    elif counter.startswith( "hyperg" ):
+
+                        # hypergeometric probability over nucleotides
+                        # Sampling without replacement
+                        # x,M,n,M
+                        # x = observed number of nucleotides in overlap of segments,annotations and workspace
+                        # M = number of nucleotides in workspace
+                        # n = number of nucleotides in annotations (and workspace)
+                        # N = number of nucleotides in segments (and workspace)
+                        # P-value of obtaining >x number of nucleotides overlapping.
+                        rv = scipy.stats.hypergeom( basecoverage_workspace,
+                                                    basecoverage_annotation,
+                                                basecoverage_segments )
+
+                        pvalue = rv.sf( basecoverage_intersection )
+                        expected = rv.mean()
+                        observed = basecoverage_intersection
+
+                    if expected != 0:
+                        fold = float(observed) / expected
+                    else:
+                        fold = 1.0
+
+                    r = GREAT_RESULT._make( (
+                            segment, annotation, isochore,
+                            counter,
+                            observed,
+                            expected,
+                            nsegments_in_workspace,
+                            nannotations_in_workspace,
+                            nsegments_overlapping_annotation,
+                            nannotations_overlapping_segments,
+                            basecoverage_intersection,
+                            basecoverage_segments,
+                            basecoverage_annotation,
+                            basecoverage_workspace,
+                            fraction_coverage_annotation,
+                            fold,
+                            pvalue,
+                            1.0 ))
+                    # print "\t".join( map(str, r))
+                    results_per_contig[ (counter,segment,annotation) ].append( r )
+
+    E.info( "merging counts per isochore" )
+
+    # compute sums
+    results = []
+
+    for niteration, pair in enumerate(results_per_contig.iteritems()):
+
+        counter, segment, annotation = pair[0]
+        data = pair[1]
+
+        nsegments_in_workspace = sum( [x.nsegments_in_workspace for x in data ] )
+        nsegments_overlapping_annotation = sum( [x.observed for x in data ] )
+        nannotations_in_workspace = sum( [x.nannotations_in_workspace for x in data ] )
+        nannotations_overlapping_segments = sum( [x.nannotations_overlapping_segments for x in data ] )
+
+        basecoverage_intersection = sum( [x.basecoverage_intersection for x in data ] )
+        basecoverage_segments = sum( [x.basecoverage_segments for x in data ] )
+        basecoverage_annotation = sum( [x.basecoverage_annotation for x in data ] )
+        basecoverage_workspace = sum( [x.basecoverage_workspace for x in data ] )
+
+        fraction_coverage_annotation = basecoverage_annotation / float( basecoverage_workspace )
+
+        if counter.startswith( "binom" ):
+            pvalue = scipy.stats.binom.sf( nsegments_overlapping_annotation-1,
+                                            nsegments_in_workspace,
+                                            fraction_coverage_annotation )
+            expected = fraction_coverage_annotation * nsegments_in_workspace
+            observed = nsegments_overlapping_annotation
+        elif counter.startswith( "hyperg" ):
+            rv = scipy.stats.hypergeom( basecoverage_workspace,
+                                        basecoverage_annotation,
+                                        basecoverage_segments )
+
+            pvalue = rv.sf( basecoverage_intersection )
+            expected = rv.mean()
+            observed = basecoverage_intersection
+
+
+        if expected != 0:
+            fold = float(observed) / expected
+        else:
+            fold = 1.0
+
+        r = GREAT_RESULT._make( (
+                segment, annotation, "all",
+                counter,
+                observed,
+                expected,
+                nsegments_in_workspace,
+                nannotations_in_workspace,
+                nsegments_overlapping_annotation,
+                nannotations_overlapping_segments,
+                basecoverage_intersection,
+                basecoverage_segments,
+                basecoverage_annotation,
+                basecoverage_workspace,
+                fraction_coverage_annotation,
+                fold,
+                pvalue,
+                1.0 ))
+
+        results.append( r )
+
+    IO.outputResults( results,
+                      options,
+                      GREAT_RESULT._fields,
+                      description_header,
+                      description_width,
+                      descriptions )
+
+    E.Stop()
+
+if __name__ == "__main__":
+    sys.exit( main( sys.argv) )
+
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rlGAT/gat-plot.py	Thu Aug 29 01:57:54 2013 -0400
@@ -0,0 +1,237 @@
+################################################################################
+#
+#   MRC FGU Computational Genomics Group
+#
+#   $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $
+#
+#   Copyright (C) 2009 Andreas Heger
+#
+#   This program is free software; you can redistribute it and/or
+#   modify it under the terms of the GNU General Public License
+#   as published by the Free Software Foundation; either version 2
+#   of the License, or (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU General Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License
+#   along with this program; if not, write to the Free Software
+#   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+#################################################################################
+'''
+gat-plot - plot results from a gat analysis
+===========================================
+
+:Author: Andreas Heger
+:Release: $Id$
+:Date: |today|
+:Tags: Python
+
+Purpose
+-------
+
+This script takes the results of a ``gat-run.py` or ``gat-compare.py``
+and plots the results.
+
+This script requires matplotlib.
+
+Usage
+-----
+
+Example::
+
+   python gat-plot.py --input-filename-results=gat.results.tsv.gz
+   python gat-plot.py --input-filename-counts=gat.counts.tsv.gz
+
+Type::
+
+   python gatplot.py --help
+
+for command line help.
+
+Documentation
+-------------
+
+Code
+----
+
+'''
+
+import os, sys, re, optparse, collections, types, glob, time
+import numpy
+
+import gat
+import gat.Experiment as E
+import gat.IOTools as IOTools
+import gat.IO as IO
+
+try:
+    import matplotlib.pyplot as plt
+    HASPLOT = True
+except (ImportError,RuntimeError):
+    HASPLOT = False
+
+class DummyAnnotatorResult:
+
+    format_observed = "%i"
+    format_expected = "%6.4f"
+    format_fold = "%6.4f"
+    format_pvalue = "%6.4e"
+
+    def __init__( self ):
+        pass
+
+    @classmethod
+    def _fromLine( cls, line ):
+        x = cls()
+        data = line[:-1].split("\t")
+        x.track, x.annotation = data[:2]
+        x.observed, x.expected, x.lower95, x.upper95, x.stddev, x.fold, x.pvalue, x.qvalue = \
+            map(float, data[2:] )
+        return x
+
+    def __str__(self):
+        return "\t".join( (self.track,
+                           self.annotation,
+                           self.format_observed % self.observed,
+                           self.format_expected % self.expected,
+                           self.format_expected % self.lower95,
+                           self.format_expected % self.upper95,
+                           self.format_expected % self.stddev,
+                           self.format_fold % self.fold,
+                           self.format_pvalue % self.pvalue,
+                           self.format_pvalue % self.qvalue ) )
+
+def buildPlotFilename( options, key ):
+    filename = re.sub("%s", key, options.output_plots_pattern)
+    filename = re.sub("[^a-zA-Z0-9-_./]", "_", filename )
+    dirname = os.path.dirname( filename )
+    if dirname and not os.path.exists( dirname ): os.makedirs( dirname )
+    return filename
+
+def plotBarplots( annotator_results, options ):
+    '''output a series of bar-plots.
+
+    Output for each track.
+
+    Significant results are opaque, while
+    non-significant results are transparent.'''
+
+    for track in annotator_results:
+        plt.figure()
+        r = annotator_results[track]
+        keys, values = zip( *r.items())
+        pos = range(len(r))
+        bars = plt.barh( pos, [x.fold for x in values] )
+        for b,v in zip(bars, values):
+            if v.qvalue > 0.05: b.set_alpha( 0.10 )
+
+        filename = buildPlotFilename( options, "bars-%s" % track )
+        plt.yticks( pos, keys )
+        plt.axvline( x=1, color="r")
+        plt.savefig( filename )
+
+def plotBarplot( annotator_results, options ):
+    '''output a single bar-plots.
+
+    Output for each track.
+
+    Significant results are opaque, while
+    non-significant results are transparent.'''
+
+    ntracks = len(annotator_results )
+    height = 1.0 / float(ntracks)
+
+    plt.figure()
+
+    for trackid, track in enumerate(annotator_results):
+
+        r = annotator_results[track]
+        rr = r.items()
+        rr.sort()
+        keys, values = zip(*rr)
+        pos = numpy.arange(0,len(r),1) + trackid * height
+        bars = plt.barh( pos,
+                         [x.fold for x in values],
+                         height = height,
+                         label = track,
+                         xerr = [x.stddev / x.expected for x in values],
+                         color = "bryg"[trackid % 4])
+        for b,v in zip(bars, values):
+            if v.pvalue > 0.05: b.set_alpha( 0.10 )
+
+    pos = range(len(r))
+
+    plt.yticks( pos, keys )
+    plt.axvline(x=1, color = "r" )
+    filename = buildPlotFilename( options, "bars-all"  )
+    plt.legend()
+    plt.savefig( filename )
+
+def main( argv = None ):
+    """script main.
+
+    parses command line options in sys.argv, unless *argv* is given.
+    """
+
+    if not argv: argv = sys.argv
+
+    # setup command line parser
+    parser = optparse.OptionParser( version = "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
+                                    usage = globals()["__doc__"] )
+
+    parser.add_option("-l", "--sample-file", dest="sample_files", type="string", action="append",
+                      help="filename with sample files. Start processing from samples [default=%default]."  )
+
+    parser.add_option("-o", "--order", dest="output_order", type="choice",
+                      choices = ( "track", "annotation", "fold", "pvalue", "qvalue" ),
+                      help="order results in output by fold, track, etc. [default=%default]."  )
+
+    parser.add_option("-p", "--pvalue-method", dest="pvalue_method", type="choice",
+                      choices = ( "empirical", "norm", ),
+                      help="type of pvalue reported [default=%default]."  )
+
+    parser.add_option( "--results-file", dest="input_filename_results", type="string",
+                      help="start processing from results - no segments required [default=%default]."  )
+
+    parser.add_option( "--output-plots-pattern", dest="output_plots_pattern", type="string",
+                       help="output pattern for plots [default=%default]" )
+
+    parser.add_option( "--output-samples-pattern", dest="output_samples_pattern", type="string",
+                       help="output pattern for samples. Samples are stored in bed format, one for "
+                            " each segment [default=%default]" )
+
+    parser.add_option( "--plots", dest="plots", type="choice",
+                      choices = ( "all",
+                                  "bars-per-track",
+                                  "bars", ),
+                      help="plots to be created [default=%default]."  )
+
+    parser.set_defaults(
+        sample_files = [],
+        num_samples = 1000,
+        output_stats = [],
+        output_filename_counts = None,
+        output_order = "fold",
+        input_filename_results = None,
+        pvalue_method = "empirical",
+        output_plots_pattern = None,
+        )
+
+    ## add common options (-h/--help, ...) and parse command line
+    (options, args) = E.Start( parser, argv = argv, add_output_options = True )
+
+    annotator_results = IO.readAnnotatorResults( options.input_filename_results )
+
+    if "speparate-bars" in options.plots:
+        plotBarplots( annotator_results, options )
+    if "bars" in options.plots:
+        plotBarplot( annotator_results, options )
+
+    ## write footer and output benchmark information.
+    E.Stop()
+
+if __name__ == "__main__":
+    sys.exit( main( sys.argv) )
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rlGAT/gat-run.py	Thu Aug 29 01:57:54 2013 -0400
@@ -0,0 +1,317 @@
+################################################################################
+#
+#   MRC FGU Computational Genomics Group
+#
+#   $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $
+#
+#   Copyright (C) 2009 Andreas Heger
+#
+#   This program is free software; you can redistribute it and/or
+#   modify it under the terms of the GNU General Public License
+#   as published by the Free Software Foundation; either version 2
+#   of the License, or (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU General Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License
+#   along with this program; if not, write to the Free Software
+#   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+#################################################################################
+'''
+gat-run - run the genomic annotation tool
+=============================================
+
+:Author: Andreas Heger
+:Release: $Id$
+:Date: |today|
+:Tags: Python
+
+Purpose
+-------
+
+This script compares one or more genomic segments of interest
+against one more other genomic annotations.
+
+Usage
+-----
+
+Example::
+
+   python gat-run.py
+      --segment-file=segments.bed.gz
+      --workspace-file=workspace.bed.gz
+      --annotation-file=annotations_architecture.bed.gz
+
+Type::
+
+   python gat-run.py --help
+
+for command line help.
+
+Documentation
+-------------
+
+Code
+----
+
+'''
+
+import os, sys, re, optparse, collections, types, glob, time
+import numpy
+
+import gat
+import gat.Experiment as E
+import gat.IOTools as IOTools
+import gat.IO as IO
+import gat.Stats as Stats
+import GatSegmentList
+import GatEngine
+
+def fromSegments( options, args ):
+    '''run analysis from segment files.
+
+    This is the most common use case.
+    '''
+
+    tstart = time.time()
+
+    ##################################################
+    ##################################################
+    ##################################################
+    ## build segments
+    segments, annotations, workspaces, isochores = IO.buildSegments( options )
+
+    E.info( "intervals loaded in %i seconds" % (time.time() - tstart) )
+
+    ##################################################
+    ##################################################
+    ##################################################
+    ## open various additional output files
+    ##################################################
+    outfiles = {}
+    for section in ("sample",
+                    "segment_metrics",
+                    "sample_metrics",
+                    ):
+        if section in options.output_stats or \
+            "all" in options.output_stats or \
+                len( [ x for x in options.output_stats if re.search( x, "section" ) ] ) > 0:
+            outfiles[section] = E.openOutputFile(section)
+
+    if 'sample_metrics' in outfiles:
+        outfiles['sample_metrics'].write( "track\tsection\tmetric\t%s\n" % "\t".join(Stats.Summary().getHeaders() ))
+
+    # filter segments by workspace
+    workspace = IO.applyIsochores( segments,
+                                   annotations,
+                                   workspaces,
+                                   options,
+                                   isochores,
+                                   truncate_segments_to_workspace = options.truncate_segments_to_workspace,
+                                   truncate_workspace_to_annotations = options.truncate_workspace_to_annotations,
+                                   restrict_workspace = options.restrict_workspace )
+
+    ##################################################
+    ##################################################
+    ##################################################
+    ## check memory requirements
+    counts = segments.countsPerTrack()
+    max_counts = max(counts.values())
+    # previous algorithm: memory requirements if all samples are stored
+    memory = 8 * 2 * options.num_samples * max_counts * len(workspace)
+
+    ##################################################
+    ##################################################
+    ##################################################
+    # initialize sampler
+    if options.sampler == "annotator":
+        sampler = GatEngine.SamplerAnnotator(
+            bucket_size = options.bucket_size,
+            nbuckets = options.nbuckets )
+    elif options.sampler == "shift":
+        sampler = GatEngine.SamplerShift(
+            radius = options.shift_expansion,
+            extension = options.shift_extension )
+    elif options.sampler == "segments":
+        sampler = GatEngine.SamplerSegments()
+    elif options.sampler == "local-permutation":
+        sampler = GatEngine.SamplerLocalPermutation()
+    elif options.sampler == "global-permutation":
+        sampler = GatEngine.SamplerGlobalPermutation()
+    elif options.sampler == "brute-force":
+        sampler = GatEngine.SamplerBruteForce()
+    elif options.sampler == "uniform":
+        sampler = GatEngine.SamplerUniform()
+
+    ##################################################
+    ##################################################
+    ##################################################
+    # initialize counter
+    counters = []
+    for counter in options.counters:
+        if counter == "nucleotide-overlap":
+            counters.append( GatEngine.CounterNucleotideOverlap() )
+        elif counter == "nucleotide-density":
+            counters.append( GatEngine.CounterNucleotideDensity() )
+        elif counter == "segment-overlap":
+            counters.append( GatEngine.CounterSegmentOverlap() )
+        elif counter == "annotations-overlap":
+            counters.append( GatEngine.CounterAnnotationsOverlap() )
+        elif counter == "segment-midoverlap":
+            counters.append( GatEngine.CounterSegmentMidpointOverlap() )
+        elif counter == "annotations-midoverlap":
+            counters.append( GatEngine.CounterAnnotationsMidpointOverlap() )
+        else:
+            raise ValueError("unknown counter '%s'" % counter )
+
+    ##################################################
+    ##################################################
+    ##################################################
+    ## initialize workspace generator
+    if options.conditional == "unconditional":
+        workspace_generator = GatEngine.UnconditionalWorkspace()
+    elif options.conditional == "cooccurance":
+        workspace_generator = GatEngine.ConditionalWorkspaceCooccurance()
+    elif options.conditional == "annotation-centered":
+        if options.conditional_extension == options.conditional_expansion == None:
+            raise ValueError( "please specify either --conditional-expansion or --conditional-extension" )
+        workspace_generator = GatEngine.ConditionalWorkspaceAnnotationCentered( options.conditional_extension,
+                                                                          options.conditional_expansion )
+    elif options.conditional == "segment-centered":
+        if options.conditional_extension == options.conditional_expansion == None:
+            raise ValueError( "please specify either --conditional-expansion or --conditional-extension" )
+
+        workspace_generator = GatEngine.ConditionalWorkspaceSegmentCentered( options.conditional_extension,
+                                                                       options.conditional_expansion )
+    else:
+        raise ValueError("unknown conditional workspace '%s'" % options.conditional )
+
+
+    ##################################################
+    ##################################################
+    ##################################################
+    ## check if reference is compplete
+    ##################################################
+    if options.reference:
+        reference = options.reference
+        for track in segments.tracks:
+            if track not in options.reference:
+                raise ValueError("missing track '%s' in reference" % track )
+            r = options.reference[track]
+            for annotation in annotations.tracks:
+                if annotation not in r:
+                    raise ValueError("missing annotation '%s' in annotations for track='%s'" % (annotation, track ))
+
+
+    ##################################################
+    ##################################################
+    ##################################################
+    ## compute
+    ##################################################
+    annotator_results = gat.run( segments,
+                                 annotations,
+                                 workspace,
+                                 sampler,
+                                 counters,
+                                 workspace_generator = workspace_generator,
+                                 num_samples = options.num_samples,
+                                 cache = options.cache,
+                                 outfiles = outfiles,
+                                 output_counts_pattern = options.output_counts_pattern,
+                                 output_samples_pattern = options.output_samples_pattern,
+                                 sample_files = options.sample_files,
+                                 conditional = options.conditional,
+                                 conditional_extension = options.conditional_extension,
+                                 reference = options.reference,
+                                 pseudo_count = options.pseudo_count,
+                                 num_threads = options.num_threads )
+
+    return annotator_results
+
+
+def main( argv = None ):
+    """script main.
+
+    parses command line options in sys.argv, unless *argv* is given.
+    """
+
+    if not argv: argv = sys.argv
+
+    parser = gat.buildParser( usage = globals()["__doc__"] )
+
+    ## add common options (-h/--help, ...) and parse command line
+    (options, args) = E.Start( parser, argv = argv, add_output_options = True )
+
+    ##################################################
+    description_header, descriptions, description_width = IO.readDescriptions( options )
+
+    ##################################################
+    size_pos, size_segment = GatSegmentList.getSegmentSize()
+    E.debug( "sizes: pos=%i segment=%i, max_coord=%i" % (size_pos, size_segment, 2**(8 * size_pos )))
+
+    ##################################################
+    # set default counter
+    if not options.counters:
+        options.counters.append( "nucleotide-overlap" )
+
+    ##################################################
+    if options.output_tables_pattern != None:
+        if "%s" not in options.output_tables_pattern:
+            raise ValueError( "output_tables_pattern should contain at least one '%s'")
+
+    if options.output_samples_pattern != None:
+        if "%s" not in options.output_samples_pattern:
+            raise ValueError( "output_samples_pattern should contain at least one '%s'")
+
+    if options.output_counts_pattern != None:
+        if "%s" not in options.output_counts_pattern:
+            raise ValueError( "output_counts_pattern should contain at least one '%s'")
+
+
+    ##################################################
+    # read fold changes that results should be compared with
+    if options.null != "default":
+        if not os.path.exists( options.null ):
+            raise OSError( "file %s not found" % options.null )
+        E.info( "reading reference results from %s" % options.null )
+        options.reference = IO.readAnnotatorResults( options.null )
+    else:
+        options.reference = None
+
+    if options.input_filename_counts:
+        # use pre-computed counts
+        annotator_results = GatEngine.fromCounts( options.input_filename_counts )
+
+    elif options.input_filename_results:
+        # use previous results (re-computes fdr)
+        E.info( "reading gat results from %s" % options.input_filename_results )
+        annotator_results = IO.readAnnotatorResults( options.input_filename_results )
+
+    else:
+        # do full gat analysis
+        annotator_results = fromSegments( options, args )
+
+    ##################################################
+    if options.pvalue_method != "empirical":
+        E.info("updating pvalues to %s" % options.pvalue_method )
+        GatEngine.updatePValues( annotator_results, options.pvalue_method )
+
+    ##################################################
+    ## output
+    IO.outputResults( annotator_results,
+                      options,
+                      GatEngine.AnnotatorResultExtended.headers,
+                      description_header,
+                      description_width,
+                      descriptions )
+
+    IO.plotResults( annotator_results, options )
+
+    ## write footer and output benchmark information.
+    E.Stop()
+
+if __name__ == "__main__":
+    sys.exit( main( sys.argv) )
--- a/rlGAT/rlGAT.xml	Thu Aug 29 01:01:45 2013 -0400
+++ b/rlGAT/rlGAT.xml	Thu Aug 29 01:57:54 2013 -0400
@@ -9,7 +9,7 @@
       <requirement type="package" version="1.7.1">numpy</requirement>
       <requirement type="package" version="0.19.1">cython</requirement>
   </requirements>
-  <command>
+  <command interpreter="python">
     gat-run.py ${ignore_segments} --log="$logfile" --num-samples=$numsamp
     #for $s in $segmentfiles:
       #if $s.sf.ext != 'data':