diff rlGAT/gat-great.py @ 11:53487f21c0d5 draft

Uploaded
author fubar
date Thu, 29 Aug 2013 01:57:54 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rlGAT/gat-great.py	Thu Aug 29 01:57:54 2013 -0400
@@ -0,0 +1,488 @@
+################################################################################
+#
+#   MRC FGU Computational Genomics Group
+#
+#   $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $
+#
+#   Copyright (C) 2009 Andreas Heger
+#
+#   This program is free software; you can redistribute it and/or
+#   modify it under the terms of the GNU General Public License
+#   as published by the Free Software Foundation; either version 2
+#   of the License, or (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU General Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License
+#   along with this program; if not, write to the Free Software
+#   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+#################################################################################
+'''
+great - run great analysis
+==========================
+
+:Author: Andreas Heger
+:Release: $Id$
+:Date: |today|
+:Tags: Python
+
+Purpose
+-------
+
+This script compares one or more genomic segments of interest 
+against one more other genomic annotations.
+
+GREAT says that a segment overlaps an annotation if the midpoint
+of the segment overlaps the annotation.
+
+The GREAT analysis is done on a per-isochore basis and overall.
+For the overall analysis, there is no isochore correction
+as the union of all isochores is treated as a single workspace.
+
+This script implies several counters (``--counter`` option):
+
+binom
+    binomial model (as implemented by GREAT). Returns the probability
+    of a certain number of segments overlapping (mid-point overlap) 
+    an annotation
+
+hyperg
+    hypergeometric model. Returns the probabibility of a certain
+    number of nucleotides within segments overlapping an annotation.
+    Conceptually, all genomic positions are treated independently and 
+    segments are fragmented into a collection of single bases that are
+    placed indepently. This is valid for point intervals such as SNPs,
+    but of course ignores segment size, though it might be used as a
+    guide towards the expected fold enrichment, that can be computed
+    more accurately using :mod:`gat-run.py`
+
+Usage
+-----
+
+Example::
+
+   python gat-run.py 
+      --segment-file=segments.bed.gz 
+      --workspace-file=workspace.bed.gz 
+      --annotation-file=annotations_architecture.bed.gz  
+ 
+Type::
+
+   python gat-run.py --help
+
+for command line help.
+
+Documentation
+-------------
+
+Code
+----
+
+'''
+
+import os, sys, re, optparse, collections, types, glob, time
+import numpy
+import numpy.random
+
+import gat
+import gat.Experiment as E
+import gat.IOTools as IOTools
+import gat.IO as IO
+import csegmentlist
+
+import scipy.stats
+
+GREAT_RESULT = collections.namedtuple( "GREAT",
+                                       ("track",
+                                        "annotation",
+                                        "isochore",
+                                        "counter",
+                                        "observed", # nsegments_overlapping_annotations
+                                        "expected",
+                                        "nsegments_in_workspace",
+                                        "nannotations_in_workspace",
+                                        "nsegments_overlapping_annotation",
+                                        "nannotations_overlapping_segments",
+                                        "basecoverage_intersection",
+                                        "basecoverage_segments",
+                                        "basecoverage_annotation",
+                                        "basecoverage_workspace",
+                                        "fraction_coverage_annotation",
+                                        "fold",
+                                        "pvalue",
+                                        "qvalue" ) )
+
+def main( argv ):
+
+    if not argv: argv = sys.argv
+
+    # setup command line parser
+    parser = optparse.OptionParser( version = "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $", 
+                                    usage = globals()["__doc__"] )
+
+    parser.add_option("-a", "--annotation-file", "--annotations", dest="annotation_files", type="string", action="append",
+                      help="filename with annotations [default=%default]."  )
+
+    parser.add_option("-s", "--segment-file", "--segments", dest="segment_files", type="string", action="append",
+                      help="filename with segments. Also accepts a glob in parentheses [default=%default]."  )
+
+    parser.add_option("-w", "--workspace-file", "--workspace", dest="workspace_files", type="string", action="append",
+                      help="filename with workspace segments. Also accepts a glob in parentheses [default=%default]."  )
+
+    parser.add_option("-i", "--isochore-file", "--isochores", dest="isochore_files", type="string", action="append",
+                      help="filename with isochore segments. Also accepts a glob in parentheses [default=%default]."  )
+
+    parser.add_option("-o", "--order", dest="output_order", type="choice",
+                      choices = ( "track", "annotation", "fold", "pvalue", "qvalue" ),
+                      help="order results in output by fold, track, etc. [default=%default]."  )
+
+    parser.add_option("-q", "--qvalue-method", dest="qvalue_method", type="choice",
+                      choices = ( "storey", "BH", "bonferroni", "holm", "hommel", "hochberg", "BY", "none" ),
+                      help="method to perform multiple testing correction by controlling the fdr [default=%default]."  )
+
+    parser.add_option( "--qvalue-lambda", dest="qvalue_lambda", type="float",
+                      help="fdr computation: lambda [default=%default]."  )
+
+    parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
+                       choices = ("smoother", "bootstrap" ),
+                       help="fdr computation: method for estimating pi0 [default=%default]."  )
+    parser.add_option( "--descriptions", dest="input_filename_descriptions", type="string", 
+                       help="filename mapping annotation terms to descriptions. "
+                            " if given, the output table will contain additional columns "
+                            " [default=%default]" )
+
+    parser.add_option( "--ignore-segment-tracks", dest="ignore_segment_tracks", action="store_true", 
+                       help="ignore segment tracks - all segments belong to one track [default=%default]" )
+
+    parser.add_option( "--enable-split-tracks", dest="enable_split_tracks", action="store_true", 
+                       help="permit the same track to be in multiple files [default=%default]" )
+
+    parser.add_option( "--output-bed", dest="output_bed", type="choice", action="append",
+                       choices = ( "all", 
+                                   "annotations", "segments", 
+                                   "workspaces", "isochores",
+                                   "overlap" ),
+                       help="output bed files [default=%default]."  )
+
+    parser.add_option( "--output-stats", dest="output_stats", type="choice", action="append",
+                       choices = ( "all", 
+                                   "annotations", "segments", 
+                                   "workspaces", "isochores",
+                                   "overlap" ),
+                       help="output overlap summary stats [default=%default]."  )
+
+    parser.add_option( "--restrict-workspace", dest="restrict_workspace", action="store_true", 
+                       help="restrict workspace to those segments that contain both track"
+                       " and annotations [default=%default]" )
+
+    parser.add_option( "--counter", dest="counters", type="choice", action="append",
+                       choices = ( "binom", "hyperg" ),
+                       help="counter to use [default=%default]."  )
+
+    parser.add_option( "--output-tables-pattern", dest="output_tables_pattern", type="string", 
+                      help="output pattern for result tables. Used if there are multiple counters used [default=%default]."  )
+
+    parser.set_defaults(
+        annotation_files = [],
+        segment_files = [],
+        workspace_files = [],  
+        sample_files = [],
+        counters = [],
+        output_stats = [],
+        output_bed = [],
+        output_tables_pattern = "%s.tsv.gz",
+        output_order = "fold",
+        input_filename_counts = None,
+        input_filename_results = None,
+        pvalue_method = "empirical",
+        output_plots_pattern = None,
+        output_samples_pattern = None,
+        qvalue_method = "storey",
+        qvalue_lambda = None,
+        qvalue_pi0_method = "smoother",
+        ignore_segment_tracks = False,
+        input_filename_descriptions = None,
+        conditional = "unconditional",
+        conditional_extension = None,
+        conditional_expansion = None,
+        restrict_workspace = False,
+        enable_split_tracks = False,
+        shift_expansion = 2.0,
+        shift_extension = 0,
+        overlap_mode = "midpoint",
+        truncate_workspace_to_annotations = False,
+        )
+
+    ## add common options (-h/--help, ...) and parse command line 
+    (options, args) = E.Start( parser, argv = argv, add_output_options = True )
+
+    tstart = time.time()
+
+    if len(options.counters) == 0:
+        options.counters.append("binom")
+
+    ############################################
+    segments, annotations, workspaces, isochores = IO.buildSegments( options )
+    E.info( "intervals loaded in %i seconds" % (time.time() - tstart) )
+
+    # filter segments by workspace
+    workspace = IO.applyIsochores( segments, annotations, workspaces, options, isochores )
+
+    ############################################
+    description_header, descriptions, description_width = IO.readDescriptions( options )
+
+    ############################################
+    ############################################
+    ## compute per contig
+
+    # compute bases covered by workspace
+    workspace2basecoverage, isochores = {}, []
+    for contig, ww in workspace.iteritems():
+        workspace2basecoverage[contig] = ww.sum()
+        isochores.append( contig )
+    
+    # compute percentage of bases covered by annotations in workspace
+    # per isochore
+    annotation2basecoverage = collections.defaultdict( dict )
+    for annotation, aa in annotations.iteritems():
+        for isochore, a in aa.iteritems():
+            # need to truncate to workspace?
+            annotation2basecoverage[annotation][isochore] = a.sum()
+            
+    results_per_contig = collections.defaultdict( list )
+
+    E.info( "computing counts per isochore" )
+    # results per isochore 
+    def emptyResult( segment, annotation, isochore, 
+                     counter,
+                     nsegments_in_workspace,
+                     basecoverage_annotation,
+                     basecoverage_workspace):
+        return GREAT_RESULT._make( (
+                segment, annotation, isochore, 
+                counter,
+                0, # observed
+                0, # expected
+                nsegments_in_workspace,
+                0, # nannotations_in_workspace
+                0, # nsegments_overlapping_annotation
+                0, # nannotations_overlapping_segments
+                0, # basecoverage_intersection
+                0, # basecoverage_segments
+                basecoverage_annotation,
+                basecoverage_workspace,
+                0.0,
+                1.0,
+                1.0,
+                1.0 ))
+
+    for isochore in isochores:    
+        basecoverage_workspace = workspace2basecoverage[isochore]
+
+        # iterate over all isochores        
+        for segment, segmentdict in segments.iteritems():
+            try:
+                ss = segmentdict[isochore]
+                # select segments overlapping workspace
+                segments_in_workspace = csegmentlist.SegmentList( clone = ss )
+                segments_in_workspace.intersect( workspace[isochore] )
+                # number of segments in workspace
+                nsegments_in_workspace = len(segments_in_workspace)
+            except KeyError:
+                ss = None
+
+            basecoverage_segments = segments_in_workspace.sum()
+
+            for annotation, annotationdict in annotations.iteritems():
+                
+                # if annotation != "GO:0030957": continue
+
+                try:
+                    aa = annotationdict[isochore]
+                except KeyError:
+                    aa = None
+
+                # p_A: proportion of bases covered by annotation
+                try:
+                    basecoverage_annotation = annotation2basecoverage[annotation][isochore]
+                except KeyError:
+                    basecoverage_annotation = 0
+
+                if ss == None or aa == None:
+                    for counter in options.counters:
+                        results_per_contig[(counter,segment,annotation)].append( emptyResult(segment, annotation,
+                                                                                             isochore, 
+                                                                                             counter,
+                                                                                             nsegments_in_workspace,
+                                                                                             basecoverage_annotation,
+                                                                                             basecoverage_workspace ) )
+                    continue
+                
+                # select segments overlapping annotation
+                segments_overlapping_annotation = csegmentlist.SegmentList( clone = ss )
+                segments_overlapping_annotation.intersect( annotations[annotation][isochore] )
+                # number of segments in annotation
+                nsegments_overlapping_annotation = ss.intersectionWithSegments( annotations[annotation][isochore],
+                                                                                mode = options.overlap_mode )
+                                         
+                # number of nucleotides at the intersection of segments, annotation and workspace
+                basecoverage_intersection = segments_overlapping_annotation.sum()
+
+                annotations_overlapping_segments = csegmentlist.SegmentList( clone = aa )
+                annotations_overlapping_segments.intersect( ss )
+                nannotations_overlapping_segments = len( annotations_overlapping_segments )
+
+                nannotations_in_workspace = len( aa )
+                if nannotations_in_workspace == 0: 
+                    for counter in options.counters:
+                        results_per_contig[(counter,segment,annotation)].append( emptyResult(segment, 
+                                                                                             annotation, 
+                                                                                             isochore, 
+                                                                                             counter,
+                                                                                             nsegments_in_workspace,
+                                                                                             basecoverage_annotation,
+                                                                                             basecoverage_workspace ) )
+                    continue
+
+                fraction_coverage_annotation = basecoverage_annotation / float( basecoverage_workspace )
+                fraction_hit_annotation = float(nannotations_overlapping_segments) / nannotations_in_workspace
+                
+                for counter in options.counters:
+                    if counter.startswith( "binom" ):
+                        # GREAT binomial probability over "regions"
+                        # n = number of genomic regions = nannotations_in_workspace
+                        # ppi = fraction of genome annotated by annotation = fraction_coverage_annotation
+                        # kpi = genomic regions with annotation hit by segments = nannotations_in_segments
+                        # sf = survival functions = 1 -cdf
+                        # probability of observing >kpi in a sample of n where the probabily of succes is
+                        # ppi.
+                        pvalue = scipy.stats.binom.sf( nsegments_overlapping_annotation - 1, 
+                                                       nsegments_in_workspace, 
+                                                       fraction_coverage_annotation )
+                        
+                        expected = fraction_coverage_annotation * nsegments_in_workspace
+                        observed = nsegments_overlapping_annotation 
+
+                    elif counter.startswith( "hyperg" ):
+                        
+                        # hypergeometric probability over nucleotides
+                        # Sampling without replacement
+                        # x,M,n,M
+                        # x = observed number of nucleotides in overlap of segments,annotations and workspace
+                        # M = number of nucleotides in workspace
+                        # n = number of nucleotides in annotations (and workspace)
+                        # N = number of nucleotides in segments (and workspace)
+                        # P-value of obtaining >x number of nucleotides overlapping.
+                        rv = scipy.stats.hypergeom( basecoverage_workspace,
+                                                    basecoverage_annotation,
+                                                basecoverage_segments )
+                    
+                        pvalue = rv.sf( basecoverage_intersection )
+                        expected = rv.mean()
+                        observed = basecoverage_intersection
+
+                    if expected != 0:
+                        fold = float(observed) / expected
+                    else:
+                        fold = 1.0
+
+                    r = GREAT_RESULT._make( (
+                            segment, annotation, isochore,
+                            counter,
+                            observed,
+                            expected,
+                            nsegments_in_workspace,
+                            nannotations_in_workspace,
+                            nsegments_overlapping_annotation,
+                            nannotations_overlapping_segments,
+                            basecoverage_intersection,
+                            basecoverage_segments,
+                            basecoverage_annotation,
+                            basecoverage_workspace,
+                            fraction_coverage_annotation,
+                            fold,
+                            pvalue,
+                            1.0 ))
+                    # print "\t".join( map(str, r))
+                    results_per_contig[ (counter,segment,annotation) ].append( r )
+
+    E.info( "merging counts per isochore" )
+
+    # compute sums
+    results = []
+    
+    for niteration, pair in enumerate(results_per_contig.iteritems()):
+
+        counter, segment, annotation = pair[0]
+        data = pair[1]
+            
+        nsegments_in_workspace = sum( [x.nsegments_in_workspace for x in data ] )
+        nsegments_overlapping_annotation = sum( [x.observed for x in data ] )
+        nannotations_in_workspace = sum( [x.nannotations_in_workspace for x in data ] )
+        nannotations_overlapping_segments = sum( [x.nannotations_overlapping_segments for x in data ] )
+
+        basecoverage_intersection = sum( [x.basecoverage_intersection for x in data ] )
+        basecoverage_segments = sum( [x.basecoverage_segments for x in data ] )
+        basecoverage_annotation = sum( [x.basecoverage_annotation for x in data ] )
+        basecoverage_workspace = sum( [x.basecoverage_workspace for x in data ] )
+
+        fraction_coverage_annotation = basecoverage_annotation / float( basecoverage_workspace )
+
+        if counter.startswith( "binom" ):
+            pvalue = scipy.stats.binom.sf( nsegments_overlapping_annotation-1, 
+                                            nsegments_in_workspace, 
+                                            fraction_coverage_annotation )
+            expected = fraction_coverage_annotation * nsegments_in_workspace
+            observed = nsegments_overlapping_annotation 
+        elif counter.startswith( "hyperg" ):
+            rv = scipy.stats.hypergeom( basecoverage_workspace,
+                                        basecoverage_annotation,
+                                        basecoverage_segments )
+            
+            pvalue = rv.sf( basecoverage_intersection )
+            expected = rv.mean()
+            observed = basecoverage_intersection
+
+
+        if expected != 0:
+            fold = float(observed) / expected
+        else:
+            fold = 1.0
+
+        r = GREAT_RESULT._make( (
+                segment, annotation, "all",
+                counter,
+                observed,
+                expected,
+                nsegments_in_workspace,
+                nannotations_in_workspace,
+                nsegments_overlapping_annotation,
+                nannotations_overlapping_segments,
+                basecoverage_intersection,
+                basecoverage_segments,
+                basecoverage_annotation,
+                basecoverage_workspace,
+                fraction_coverage_annotation,
+                fold,
+                pvalue,
+                1.0 ))
+
+        results.append( r )
+
+    IO.outputResults( results, 
+                      options, 
+                      GREAT_RESULT._fields, 
+                      description_header, 
+                      description_width,
+                      descriptions )
+
+    E.Stop()
+
+if __name__ == "__main__":
+    sys.exit( main( sys.argv) )
+
+                            
+