diff rlGAT/gat-run.py @ 11:53487f21c0d5 draft

Uploaded
author fubar
date Thu, 29 Aug 2013 01:57:54 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rlGAT/gat-run.py	Thu Aug 29 01:57:54 2013 -0400
@@ -0,0 +1,317 @@
+################################################################################
+#
+#   MRC FGU Computational Genomics Group
+#
+#   $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $
+#
+#   Copyright (C) 2009 Andreas Heger
+#
+#   This program is free software; you can redistribute it and/or
+#   modify it under the terms of the GNU General Public License
+#   as published by the Free Software Foundation; either version 2
+#   of the License, or (at your option) any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU General Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License
+#   along with this program; if not, write to the Free Software
+#   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+#################################################################################
+'''
+gat-run - run the genomic annotation tool
+=============================================
+
+:Author: Andreas Heger
+:Release: $Id$
+:Date: |today|
+:Tags: Python
+
+Purpose
+-------
+
+This script compares one or more genomic segments of interest 
+against one more other genomic annotations.
+
+Usage
+-----
+
+Example::
+
+   python gat-run.py 
+      --segment-file=segments.bed.gz 
+      --workspace-file=workspace.bed.gz 
+      --annotation-file=annotations_architecture.bed.gz  
+ 
+Type::
+
+   python gat-run.py --help
+
+for command line help.
+
+Documentation
+-------------
+
+Code
+----
+
+'''
+
+import os, sys, re, optparse, collections, types, glob, time
+import numpy
+
+import gat
+import gat.Experiment as E
+import gat.IOTools as IOTools
+import gat.IO as IO
+import gat.Stats as Stats
+import GatSegmentList
+import GatEngine
+
+def fromSegments( options, args ):
+    '''run analysis from segment files. 
+
+    This is the most common use case.
+    '''
+
+    tstart = time.time()
+
+    ##################################################
+    ##################################################
+    ##################################################
+    ## build segments
+    segments, annotations, workspaces, isochores = IO.buildSegments( options )
+
+    E.info( "intervals loaded in %i seconds" % (time.time() - tstart) )
+
+    ##################################################
+    ##################################################
+    ##################################################
+    ## open various additional output files
+    ##################################################
+    outfiles = {}
+    for section in ("sample", 
+                    "segment_metrics",
+                    "sample_metrics",
+                    ):
+        if section in options.output_stats or \
+            "all" in options.output_stats or \
+                len( [ x for x in options.output_stats if re.search( x, "section" ) ] ) > 0:
+            outfiles[section] = E.openOutputFile(section)
+
+    if 'sample_metrics' in outfiles:
+        outfiles['sample_metrics'].write( "track\tsection\tmetric\t%s\n" % "\t".join(Stats.Summary().getHeaders() ))
+
+    # filter segments by workspace
+    workspace = IO.applyIsochores( segments, 
+                                   annotations, 
+                                   workspaces,
+                                   options, 
+                                   isochores,
+                                   truncate_segments_to_workspace = options.truncate_segments_to_workspace,
+                                   truncate_workspace_to_annotations = options.truncate_workspace_to_annotations,
+                                   restrict_workspace = options.restrict_workspace )
+
+    ##################################################
+    ##################################################
+    ##################################################
+    ## check memory requirements
+    counts = segments.countsPerTrack() 
+    max_counts = max(counts.values())
+    # previous algorithm: memory requirements if all samples are stored
+    memory = 8 * 2 * options.num_samples * max_counts * len(workspace)
+
+    ##################################################
+    ##################################################
+    ##################################################
+    # initialize sampler
+    if options.sampler == "annotator":
+        sampler = GatEngine.SamplerAnnotator(
+            bucket_size = options.bucket_size,
+            nbuckets = options.nbuckets )
+    elif options.sampler == "shift":
+        sampler = GatEngine.SamplerShift( 
+            radius = options.shift_expansion,
+            extension = options.shift_extension )
+    elif options.sampler == "segments":
+        sampler = GatEngine.SamplerSegments()
+    elif options.sampler == "local-permutation":
+        sampler = GatEngine.SamplerLocalPermutation()
+    elif options.sampler == "global-permutation":
+        sampler = GatEngine.SamplerGlobalPermutation()
+    elif options.sampler == "brute-force":
+        sampler = GatEngine.SamplerBruteForce()
+    elif options.sampler == "uniform":
+        sampler = GatEngine.SamplerUniform()
+        
+    ##################################################
+    ##################################################
+    ##################################################
+    # initialize counter
+    counters = []
+    for counter in options.counters:
+        if counter == "nucleotide-overlap":
+            counters.append( GatEngine.CounterNucleotideOverlap() )
+        elif counter == "nucleotide-density":
+            counters.append( GatEngine.CounterNucleotideDensity() )
+        elif counter == "segment-overlap":
+            counters.append( GatEngine.CounterSegmentOverlap() )
+        elif counter == "annotations-overlap":
+            counters.append( GatEngine.CounterAnnotationsOverlap() )
+        elif counter == "segment-midoverlap":
+            counters.append( GatEngine.CounterSegmentMidpointOverlap() )
+        elif counter == "annotations-midoverlap":
+            counters.append( GatEngine.CounterAnnotationsMidpointOverlap() )
+        else:
+            raise ValueError("unknown counter '%s'" % counter )
+
+    ##################################################
+    ##################################################
+    ##################################################
+    ## initialize workspace generator
+    if options.conditional == "unconditional":
+        workspace_generator = GatEngine.UnconditionalWorkspace()
+    elif options.conditional == "cooccurance":
+        workspace_generator = GatEngine.ConditionalWorkspaceCooccurance()
+    elif options.conditional == "annotation-centered":
+        if options.conditional_extension == options.conditional_expansion == None:
+            raise ValueError( "please specify either --conditional-expansion or --conditional-extension" )
+        workspace_generator = GatEngine.ConditionalWorkspaceAnnotationCentered( options.conditional_extension,
+                                                                          options.conditional_expansion )
+    elif options.conditional == "segment-centered":
+        if options.conditional_extension == options.conditional_expansion == None:
+            raise ValueError( "please specify either --conditional-expansion or --conditional-extension" )
+
+        workspace_generator = GatEngine.ConditionalWorkspaceSegmentCentered( options.conditional_extension,
+                                                                       options.conditional_expansion )
+    else:
+        raise ValueError("unknown conditional workspace '%s'" % options.conditional )
+
+
+    ##################################################
+    ##################################################
+    ##################################################
+    ## check if reference is compplete
+    ##################################################
+    if options.reference:
+        reference = options.reference
+        for track in segments.tracks:
+            if track not in options.reference:
+                raise ValueError("missing track '%s' in reference" % track )
+            r = options.reference[track]
+            for annotation in annotations.tracks:
+                if annotation not in r:
+                    raise ValueError("missing annotation '%s' in annotations for track='%s'" % (annotation, track ))
+
+
+    ##################################################
+    ##################################################
+    ##################################################
+    ## compute
+    ##################################################
+    annotator_results = gat.run( segments, 
+                                 annotations, 
+                                 workspace,
+                                 sampler, 
+                                 counters,
+                                 workspace_generator = workspace_generator,
+                                 num_samples = options.num_samples,
+                                 cache = options.cache,
+                                 outfiles = outfiles,
+                                 output_counts_pattern = options.output_counts_pattern,
+                                 output_samples_pattern = options.output_samples_pattern,
+                                 sample_files = options.sample_files,
+                                 conditional = options.conditional,
+                                 conditional_extension = options.conditional_extension,
+                                 reference = options.reference,
+                                 pseudo_count = options.pseudo_count,
+                                 num_threads = options.num_threads )
+
+    return annotator_results
+
+
+def main( argv = None ):
+    """script main.
+
+    parses command line options in sys.argv, unless *argv* is given.
+    """
+
+    if not argv: argv = sys.argv
+
+    parser = gat.buildParser( usage = globals()["__doc__"] )
+
+    ## add common options (-h/--help, ...) and parse command line 
+    (options, args) = E.Start( parser, argv = argv, add_output_options = True )
+
+    ##################################################
+    description_header, descriptions, description_width = IO.readDescriptions( options )
+
+    ##################################################
+    size_pos, size_segment = GatSegmentList.getSegmentSize()
+    E.debug( "sizes: pos=%i segment=%i, max_coord=%i" % (size_pos, size_segment, 2**(8 * size_pos )))
+
+    ##################################################
+    # set default counter
+    if not options.counters:
+        options.counters.append( "nucleotide-overlap" )
+
+    ##################################################
+    if options.output_tables_pattern != None: 
+        if "%s" not in options.output_tables_pattern:
+            raise ValueError( "output_tables_pattern should contain at least one '%s'")
+
+    if options.output_samples_pattern != None: 
+        if "%s" not in options.output_samples_pattern:
+            raise ValueError( "output_samples_pattern should contain at least one '%s'")
+
+    if options.output_counts_pattern != None: 
+        if "%s" not in options.output_counts_pattern:
+            raise ValueError( "output_counts_pattern should contain at least one '%s'")
+
+
+    ##################################################
+    # read fold changes that results should be compared with
+    if options.null != "default":
+        if not os.path.exists( options.null ):
+            raise OSError( "file %s not found" % options.null )
+        E.info( "reading reference results from %s" % options.null )
+        options.reference = IO.readAnnotatorResults( options.null )
+    else:
+        options.reference = None
+
+    if options.input_filename_counts:
+        # use pre-computed counts
+        annotator_results = GatEngine.fromCounts( options.input_filename_counts )
+
+    elif options.input_filename_results:
+        # use previous results (re-computes fdr)
+        E.info( "reading gat results from %s" % options.input_filename_results )
+        annotator_results = IO.readAnnotatorResults( options.input_filename_results )
+
+    else:
+        # do full gat analysis
+        annotator_results = fromSegments( options, args )
+
+    ##################################################
+    if options.pvalue_method != "empirical":
+        E.info("updating pvalues to %s" % options.pvalue_method )
+        GatEngine.updatePValues( annotator_results, options.pvalue_method )
+
+    ##################################################
+    ## output
+    IO.outputResults( annotator_results, 
+                      options, 
+                      GatEngine.AnnotatorResultExtended.headers,
+                      description_header, 
+                      description_width,
+                      descriptions )
+
+    IO.plotResults( annotator_results, options )
+
+    ## write footer and output benchmark information.
+    E.Stop()
+
+if __name__ == "__main__":
+    sys.exit( main( sys.argv) )