comparison rlGAT/gat-run.py @ 11:53487f21c0d5 draft

Uploaded
author fubar
date Thu, 29 Aug 2013 01:57:54 -0400
parents
children
comparison
equal deleted inserted replaced
10:f04dfb37d1bb 11:53487f21c0d5
1 ################################################################################
2 #
3 # MRC FGU Computational Genomics Group
4 #
5 # $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $
6 #
7 # Copyright (C) 2009 Andreas Heger
8 #
9 # This program is free software; you can redistribute it and/or
10 # modify it under the terms of the GNU General Public License
11 # as published by the Free Software Foundation; either version 2
12 # of the License, or (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program; if not, write to the Free Software
21 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
22 #################################################################################
23 '''
24 gat-run - run the genomic annotation tool
25 =============================================
26
27 :Author: Andreas Heger
28 :Release: $Id$
29 :Date: |today|
30 :Tags: Python
31
32 Purpose
33 -------
34
35 This script compares one or more genomic segments of interest
36 against one more other genomic annotations.
37
38 Usage
39 -----
40
41 Example::
42
43 python gat-run.py
44 --segment-file=segments.bed.gz
45 --workspace-file=workspace.bed.gz
46 --annotation-file=annotations_architecture.bed.gz
47
48 Type::
49
50 python gat-run.py --help
51
52 for command line help.
53
54 Documentation
55 -------------
56
57 Code
58 ----
59
60 '''
61
62 import os, sys, re, optparse, collections, types, glob, time
63 import numpy
64
65 import gat
66 import gat.Experiment as E
67 import gat.IOTools as IOTools
68 import gat.IO as IO
69 import gat.Stats as Stats
70 import GatSegmentList
71 import GatEngine
72
73 def fromSegments( options, args ):
74 '''run analysis from segment files.
75
76 This is the most common use case.
77 '''
78
79 tstart = time.time()
80
81 ##################################################
82 ##################################################
83 ##################################################
84 ## build segments
85 segments, annotations, workspaces, isochores = IO.buildSegments( options )
86
87 E.info( "intervals loaded in %i seconds" % (time.time() - tstart) )
88
89 ##################################################
90 ##################################################
91 ##################################################
92 ## open various additional output files
93 ##################################################
94 outfiles = {}
95 for section in ("sample",
96 "segment_metrics",
97 "sample_metrics",
98 ):
99 if section in options.output_stats or \
100 "all" in options.output_stats or \
101 len( [ x for x in options.output_stats if re.search( x, "section" ) ] ) > 0:
102 outfiles[section] = E.openOutputFile(section)
103
104 if 'sample_metrics' in outfiles:
105 outfiles['sample_metrics'].write( "track\tsection\tmetric\t%s\n" % "\t".join(Stats.Summary().getHeaders() ))
106
107 # filter segments by workspace
108 workspace = IO.applyIsochores( segments,
109 annotations,
110 workspaces,
111 options,
112 isochores,
113 truncate_segments_to_workspace = options.truncate_segments_to_workspace,
114 truncate_workspace_to_annotations = options.truncate_workspace_to_annotations,
115 restrict_workspace = options.restrict_workspace )
116
117 ##################################################
118 ##################################################
119 ##################################################
120 ## check memory requirements
121 counts = segments.countsPerTrack()
122 max_counts = max(counts.values())
123 # previous algorithm: memory requirements if all samples are stored
124 memory = 8 * 2 * options.num_samples * max_counts * len(workspace)
125
126 ##################################################
127 ##################################################
128 ##################################################
129 # initialize sampler
130 if options.sampler == "annotator":
131 sampler = GatEngine.SamplerAnnotator(
132 bucket_size = options.bucket_size,
133 nbuckets = options.nbuckets )
134 elif options.sampler == "shift":
135 sampler = GatEngine.SamplerShift(
136 radius = options.shift_expansion,
137 extension = options.shift_extension )
138 elif options.sampler == "segments":
139 sampler = GatEngine.SamplerSegments()
140 elif options.sampler == "local-permutation":
141 sampler = GatEngine.SamplerLocalPermutation()
142 elif options.sampler == "global-permutation":
143 sampler = GatEngine.SamplerGlobalPermutation()
144 elif options.sampler == "brute-force":
145 sampler = GatEngine.SamplerBruteForce()
146 elif options.sampler == "uniform":
147 sampler = GatEngine.SamplerUniform()
148
149 ##################################################
150 ##################################################
151 ##################################################
152 # initialize counter
153 counters = []
154 for counter in options.counters:
155 if counter == "nucleotide-overlap":
156 counters.append( GatEngine.CounterNucleotideOverlap() )
157 elif counter == "nucleotide-density":
158 counters.append( GatEngine.CounterNucleotideDensity() )
159 elif counter == "segment-overlap":
160 counters.append( GatEngine.CounterSegmentOverlap() )
161 elif counter == "annotations-overlap":
162 counters.append( GatEngine.CounterAnnotationsOverlap() )
163 elif counter == "segment-midoverlap":
164 counters.append( GatEngine.CounterSegmentMidpointOverlap() )
165 elif counter == "annotations-midoverlap":
166 counters.append( GatEngine.CounterAnnotationsMidpointOverlap() )
167 else:
168 raise ValueError("unknown counter '%s'" % counter )
169
170 ##################################################
171 ##################################################
172 ##################################################
173 ## initialize workspace generator
174 if options.conditional == "unconditional":
175 workspace_generator = GatEngine.UnconditionalWorkspace()
176 elif options.conditional == "cooccurance":
177 workspace_generator = GatEngine.ConditionalWorkspaceCooccurance()
178 elif options.conditional == "annotation-centered":
179 if options.conditional_extension == options.conditional_expansion == None:
180 raise ValueError( "please specify either --conditional-expansion or --conditional-extension" )
181 workspace_generator = GatEngine.ConditionalWorkspaceAnnotationCentered( options.conditional_extension,
182 options.conditional_expansion )
183 elif options.conditional == "segment-centered":
184 if options.conditional_extension == options.conditional_expansion == None:
185 raise ValueError( "please specify either --conditional-expansion or --conditional-extension" )
186
187 workspace_generator = GatEngine.ConditionalWorkspaceSegmentCentered( options.conditional_extension,
188 options.conditional_expansion )
189 else:
190 raise ValueError("unknown conditional workspace '%s'" % options.conditional )
191
192
193 ##################################################
194 ##################################################
195 ##################################################
196 ## check if reference is compplete
197 ##################################################
198 if options.reference:
199 reference = options.reference
200 for track in segments.tracks:
201 if track not in options.reference:
202 raise ValueError("missing track '%s' in reference" % track )
203 r = options.reference[track]
204 for annotation in annotations.tracks:
205 if annotation not in r:
206 raise ValueError("missing annotation '%s' in annotations for track='%s'" % (annotation, track ))
207
208
209 ##################################################
210 ##################################################
211 ##################################################
212 ## compute
213 ##################################################
214 annotator_results = gat.run( segments,
215 annotations,
216 workspace,
217 sampler,
218 counters,
219 workspace_generator = workspace_generator,
220 num_samples = options.num_samples,
221 cache = options.cache,
222 outfiles = outfiles,
223 output_counts_pattern = options.output_counts_pattern,
224 output_samples_pattern = options.output_samples_pattern,
225 sample_files = options.sample_files,
226 conditional = options.conditional,
227 conditional_extension = options.conditional_extension,
228 reference = options.reference,
229 pseudo_count = options.pseudo_count,
230 num_threads = options.num_threads )
231
232 return annotator_results
233
234
235 def main( argv = None ):
236 """script main.
237
238 parses command line options in sys.argv, unless *argv* is given.
239 """
240
241 if not argv: argv = sys.argv
242
243 parser = gat.buildParser( usage = globals()["__doc__"] )
244
245 ## add common options (-h/--help, ...) and parse command line
246 (options, args) = E.Start( parser, argv = argv, add_output_options = True )
247
248 ##################################################
249 description_header, descriptions, description_width = IO.readDescriptions( options )
250
251 ##################################################
252 size_pos, size_segment = GatSegmentList.getSegmentSize()
253 E.debug( "sizes: pos=%i segment=%i, max_coord=%i" % (size_pos, size_segment, 2**(8 * size_pos )))
254
255 ##################################################
256 # set default counter
257 if not options.counters:
258 options.counters.append( "nucleotide-overlap" )
259
260 ##################################################
261 if options.output_tables_pattern != None:
262 if "%s" not in options.output_tables_pattern:
263 raise ValueError( "output_tables_pattern should contain at least one '%s'")
264
265 if options.output_samples_pattern != None:
266 if "%s" not in options.output_samples_pattern:
267 raise ValueError( "output_samples_pattern should contain at least one '%s'")
268
269 if options.output_counts_pattern != None:
270 if "%s" not in options.output_counts_pattern:
271 raise ValueError( "output_counts_pattern should contain at least one '%s'")
272
273
274 ##################################################
275 # read fold changes that results should be compared with
276 if options.null != "default":
277 if not os.path.exists( options.null ):
278 raise OSError( "file %s not found" % options.null )
279 E.info( "reading reference results from %s" % options.null )
280 options.reference = IO.readAnnotatorResults( options.null )
281 else:
282 options.reference = None
283
284 if options.input_filename_counts:
285 # use pre-computed counts
286 annotator_results = GatEngine.fromCounts( options.input_filename_counts )
287
288 elif options.input_filename_results:
289 # use previous results (re-computes fdr)
290 E.info( "reading gat results from %s" % options.input_filename_results )
291 annotator_results = IO.readAnnotatorResults( options.input_filename_results )
292
293 else:
294 # do full gat analysis
295 annotator_results = fromSegments( options, args )
296
297 ##################################################
298 if options.pvalue_method != "empirical":
299 E.info("updating pvalues to %s" % options.pvalue_method )
300 GatEngine.updatePValues( annotator_results, options.pvalue_method )
301
302 ##################################################
303 ## output
304 IO.outputResults( annotator_results,
305 options,
306 GatEngine.AnnotatorResultExtended.headers,
307 description_header,
308 description_width,
309 descriptions )
310
311 IO.plotResults( annotator_results, options )
312
313 ## write footer and output benchmark information.
314 E.Stop()
315
316 if __name__ == "__main__":
317 sys.exit( main( sys.argv) )