Mercurial > repos > fubar > genomic_association_tester
diff rlGAT/gat-run.py @ 11:53487f21c0d5 draft
Uploaded
| author | fubar |
|---|---|
| date | Thu, 29 Aug 2013 01:57:54 -0400 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rlGAT/gat-run.py Thu Aug 29 01:57:54 2013 -0400 @@ -0,0 +1,317 @@ +################################################################################ +# +# MRC FGU Computational Genomics Group +# +# $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $ +# +# Copyright (C) 2009 Andreas Heger +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +################################################################################# +''' +gat-run - run the genomic annotation tool +============================================= + +:Author: Andreas Heger +:Release: $Id$ +:Date: |today| +:Tags: Python + +Purpose +------- + +This script compares one or more genomic segments of interest +against one more other genomic annotations. + +Usage +----- + +Example:: + + python gat-run.py + --segment-file=segments.bed.gz + --workspace-file=workspace.bed.gz + --annotation-file=annotations_architecture.bed.gz + +Type:: + + python gat-run.py --help + +for command line help. + +Documentation +------------- + +Code +---- + +''' + +import os, sys, re, optparse, collections, types, glob, time +import numpy + +import gat +import gat.Experiment as E +import gat.IOTools as IOTools +import gat.IO as IO +import gat.Stats as Stats +import GatSegmentList +import GatEngine + +def fromSegments( options, args ): + '''run analysis from segment files. + + This is the most common use case. + ''' + + tstart = time.time() + + ################################################## + ################################################## + ################################################## + ## build segments + segments, annotations, workspaces, isochores = IO.buildSegments( options ) + + E.info( "intervals loaded in %i seconds" % (time.time() - tstart) ) + + ################################################## + ################################################## + ################################################## + ## open various additional output files + ################################################## + outfiles = {} + for section in ("sample", + "segment_metrics", + "sample_metrics", + ): + if section in options.output_stats or \ + "all" in options.output_stats or \ + len( [ x for x in options.output_stats if re.search( x, "section" ) ] ) > 0: + outfiles[section] = E.openOutputFile(section) + + if 'sample_metrics' in outfiles: + outfiles['sample_metrics'].write( "track\tsection\tmetric\t%s\n" % "\t".join(Stats.Summary().getHeaders() )) + + # filter segments by workspace + workspace = IO.applyIsochores( segments, + annotations, + workspaces, + options, + isochores, + truncate_segments_to_workspace = options.truncate_segments_to_workspace, + truncate_workspace_to_annotations = options.truncate_workspace_to_annotations, + restrict_workspace = options.restrict_workspace ) + + ################################################## + ################################################## + ################################################## + ## check memory requirements + counts = segments.countsPerTrack() + max_counts = max(counts.values()) + # previous algorithm: memory requirements if all samples are stored + memory = 8 * 2 * options.num_samples * max_counts * len(workspace) + + ################################################## + ################################################## + ################################################## + # initialize sampler + if options.sampler == "annotator": + sampler = GatEngine.SamplerAnnotator( + bucket_size = options.bucket_size, + nbuckets = options.nbuckets ) + elif options.sampler == "shift": + sampler = GatEngine.SamplerShift( + radius = options.shift_expansion, + extension = options.shift_extension ) + elif options.sampler == "segments": + sampler = GatEngine.SamplerSegments() + elif options.sampler == "local-permutation": + sampler = GatEngine.SamplerLocalPermutation() + elif options.sampler == "global-permutation": + sampler = GatEngine.SamplerGlobalPermutation() + elif options.sampler == "brute-force": + sampler = GatEngine.SamplerBruteForce() + elif options.sampler == "uniform": + sampler = GatEngine.SamplerUniform() + + ################################################## + ################################################## + ################################################## + # initialize counter + counters = [] + for counter in options.counters: + if counter == "nucleotide-overlap": + counters.append( GatEngine.CounterNucleotideOverlap() ) + elif counter == "nucleotide-density": + counters.append( GatEngine.CounterNucleotideDensity() ) + elif counter == "segment-overlap": + counters.append( GatEngine.CounterSegmentOverlap() ) + elif counter == "annotations-overlap": + counters.append( GatEngine.CounterAnnotationsOverlap() ) + elif counter == "segment-midoverlap": + counters.append( GatEngine.CounterSegmentMidpointOverlap() ) + elif counter == "annotations-midoverlap": + counters.append( GatEngine.CounterAnnotationsMidpointOverlap() ) + else: + raise ValueError("unknown counter '%s'" % counter ) + + ################################################## + ################################################## + ################################################## + ## initialize workspace generator + if options.conditional == "unconditional": + workspace_generator = GatEngine.UnconditionalWorkspace() + elif options.conditional == "cooccurance": + workspace_generator = GatEngine.ConditionalWorkspaceCooccurance() + elif options.conditional == "annotation-centered": + if options.conditional_extension == options.conditional_expansion == None: + raise ValueError( "please specify either --conditional-expansion or --conditional-extension" ) + workspace_generator = GatEngine.ConditionalWorkspaceAnnotationCentered( options.conditional_extension, + options.conditional_expansion ) + elif options.conditional == "segment-centered": + if options.conditional_extension == options.conditional_expansion == None: + raise ValueError( "please specify either --conditional-expansion or --conditional-extension" ) + + workspace_generator = GatEngine.ConditionalWorkspaceSegmentCentered( options.conditional_extension, + options.conditional_expansion ) + else: + raise ValueError("unknown conditional workspace '%s'" % options.conditional ) + + + ################################################## + ################################################## + ################################################## + ## check if reference is compplete + ################################################## + if options.reference: + reference = options.reference + for track in segments.tracks: + if track not in options.reference: + raise ValueError("missing track '%s' in reference" % track ) + r = options.reference[track] + for annotation in annotations.tracks: + if annotation not in r: + raise ValueError("missing annotation '%s' in annotations for track='%s'" % (annotation, track )) + + + ################################################## + ################################################## + ################################################## + ## compute + ################################################## + annotator_results = gat.run( segments, + annotations, + workspace, + sampler, + counters, + workspace_generator = workspace_generator, + num_samples = options.num_samples, + cache = options.cache, + outfiles = outfiles, + output_counts_pattern = options.output_counts_pattern, + output_samples_pattern = options.output_samples_pattern, + sample_files = options.sample_files, + conditional = options.conditional, + conditional_extension = options.conditional_extension, + reference = options.reference, + pseudo_count = options.pseudo_count, + num_threads = options.num_threads ) + + return annotator_results + + +def main( argv = None ): + """script main. + + parses command line options in sys.argv, unless *argv* is given. + """ + + if not argv: argv = sys.argv + + parser = gat.buildParser( usage = globals()["__doc__"] ) + + ## add common options (-h/--help, ...) and parse command line + (options, args) = E.Start( parser, argv = argv, add_output_options = True ) + + ################################################## + description_header, descriptions, description_width = IO.readDescriptions( options ) + + ################################################## + size_pos, size_segment = GatSegmentList.getSegmentSize() + E.debug( "sizes: pos=%i segment=%i, max_coord=%i" % (size_pos, size_segment, 2**(8 * size_pos ))) + + ################################################## + # set default counter + if not options.counters: + options.counters.append( "nucleotide-overlap" ) + + ################################################## + if options.output_tables_pattern != None: + if "%s" not in options.output_tables_pattern: + raise ValueError( "output_tables_pattern should contain at least one '%s'") + + if options.output_samples_pattern != None: + if "%s" not in options.output_samples_pattern: + raise ValueError( "output_samples_pattern should contain at least one '%s'") + + if options.output_counts_pattern != None: + if "%s" not in options.output_counts_pattern: + raise ValueError( "output_counts_pattern should contain at least one '%s'") + + + ################################################## + # read fold changes that results should be compared with + if options.null != "default": + if not os.path.exists( options.null ): + raise OSError( "file %s not found" % options.null ) + E.info( "reading reference results from %s" % options.null ) + options.reference = IO.readAnnotatorResults( options.null ) + else: + options.reference = None + + if options.input_filename_counts: + # use pre-computed counts + annotator_results = GatEngine.fromCounts( options.input_filename_counts ) + + elif options.input_filename_results: + # use previous results (re-computes fdr) + E.info( "reading gat results from %s" % options.input_filename_results ) + annotator_results = IO.readAnnotatorResults( options.input_filename_results ) + + else: + # do full gat analysis + annotator_results = fromSegments( options, args ) + + ################################################## + if options.pvalue_method != "empirical": + E.info("updating pvalues to %s" % options.pvalue_method ) + GatEngine.updatePValues( annotator_results, options.pvalue_method ) + + ################################################## + ## output + IO.outputResults( annotator_results, + options, + GatEngine.AnnotatorResultExtended.headers, + description_header, + description_width, + descriptions ) + + IO.plotResults( annotator_results, options ) + + ## write footer and output benchmark information. + E.Stop() + +if __name__ == "__main__": + sys.exit( main( sys.argv) )
