|
11
|
1 ################################################################################
|
|
|
2 #
|
|
|
3 # MRC FGU Computational Genomics Group
|
|
|
4 #
|
|
|
5 # $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $
|
|
|
6 #
|
|
|
7 # Copyright (C) 2009 Andreas Heger
|
|
|
8 #
|
|
|
9 # This program is free software; you can redistribute it and/or
|
|
|
10 # modify it under the terms of the GNU General Public License
|
|
|
11 # as published by the Free Software Foundation; either version 2
|
|
|
12 # of the License, or (at your option) any later version.
|
|
|
13 #
|
|
|
14 # This program is distributed in the hope that it will be useful,
|
|
|
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
17 # GNU General Public License for more details.
|
|
|
18 #
|
|
|
19 # You should have received a copy of the GNU General Public License
|
|
|
20 # along with this program; if not, write to the Free Software
|
|
|
21 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
|
|
22 #################################################################################
|
|
|
23 '''
|
|
|
24 gat-run - run the genomic annotation tool
|
|
|
25 =============================================
|
|
|
26
|
|
|
27 :Author: Andreas Heger
|
|
|
28 :Release: $Id$
|
|
|
29 :Date: |today|
|
|
|
30 :Tags: Python
|
|
|
31
|
|
|
32 Purpose
|
|
|
33 -------
|
|
|
34
|
|
|
35 This script compares one or more genomic segments of interest
|
|
|
36 against one more other genomic annotations.
|
|
|
37
|
|
|
38 Usage
|
|
|
39 -----
|
|
|
40
|
|
|
41 Example::
|
|
|
42
|
|
|
43 python gat-run.py
|
|
|
44 --segment-file=segments.bed.gz
|
|
|
45 --workspace-file=workspace.bed.gz
|
|
|
46 --annotation-file=annotations_architecture.bed.gz
|
|
|
47
|
|
|
48 Type::
|
|
|
49
|
|
|
50 python gat-run.py --help
|
|
|
51
|
|
|
52 for command line help.
|
|
|
53
|
|
|
54 Documentation
|
|
|
55 -------------
|
|
|
56
|
|
|
57 Code
|
|
|
58 ----
|
|
|
59
|
|
|
60 '''
|
|
|
61
|
|
|
62 import os, sys, re, optparse, collections, types, glob, time
|
|
|
63 import numpy
|
|
|
64
|
|
|
65 import gat
|
|
|
66 import gat.Experiment as E
|
|
|
67 import gat.IOTools as IOTools
|
|
|
68 import gat.IO as IO
|
|
|
69 import gat.Stats as Stats
|
|
|
70 import GatSegmentList
|
|
|
71 import GatEngine
|
|
|
72
|
|
|
73 def fromSegments( options, args ):
|
|
|
74 '''run analysis from segment files.
|
|
|
75
|
|
|
76 This is the most common use case.
|
|
|
77 '''
|
|
|
78
|
|
|
79 tstart = time.time()
|
|
|
80
|
|
|
81 ##################################################
|
|
|
82 ##################################################
|
|
|
83 ##################################################
|
|
|
84 ## build segments
|
|
|
85 segments, annotations, workspaces, isochores = IO.buildSegments( options )
|
|
|
86
|
|
|
87 E.info( "intervals loaded in %i seconds" % (time.time() - tstart) )
|
|
|
88
|
|
|
89 ##################################################
|
|
|
90 ##################################################
|
|
|
91 ##################################################
|
|
|
92 ## open various additional output files
|
|
|
93 ##################################################
|
|
|
94 outfiles = {}
|
|
|
95 for section in ("sample",
|
|
|
96 "segment_metrics",
|
|
|
97 "sample_metrics",
|
|
|
98 ):
|
|
|
99 if section in options.output_stats or \
|
|
|
100 "all" in options.output_stats or \
|
|
|
101 len( [ x for x in options.output_stats if re.search( x, "section" ) ] ) > 0:
|
|
|
102 outfiles[section] = E.openOutputFile(section)
|
|
|
103
|
|
|
104 if 'sample_metrics' in outfiles:
|
|
|
105 outfiles['sample_metrics'].write( "track\tsection\tmetric\t%s\n" % "\t".join(Stats.Summary().getHeaders() ))
|
|
|
106
|
|
|
107 # filter segments by workspace
|
|
|
108 workspace = IO.applyIsochores( segments,
|
|
|
109 annotations,
|
|
|
110 workspaces,
|
|
|
111 options,
|
|
|
112 isochores,
|
|
|
113 truncate_segments_to_workspace = options.truncate_segments_to_workspace,
|
|
|
114 truncate_workspace_to_annotations = options.truncate_workspace_to_annotations,
|
|
|
115 restrict_workspace = options.restrict_workspace )
|
|
|
116
|
|
|
117 ##################################################
|
|
|
118 ##################################################
|
|
|
119 ##################################################
|
|
|
120 ## check memory requirements
|
|
|
121 counts = segments.countsPerTrack()
|
|
|
122 max_counts = max(counts.values())
|
|
|
123 # previous algorithm: memory requirements if all samples are stored
|
|
|
124 memory = 8 * 2 * options.num_samples * max_counts * len(workspace)
|
|
|
125
|
|
|
126 ##################################################
|
|
|
127 ##################################################
|
|
|
128 ##################################################
|
|
|
129 # initialize sampler
|
|
|
130 if options.sampler == "annotator":
|
|
|
131 sampler = GatEngine.SamplerAnnotator(
|
|
|
132 bucket_size = options.bucket_size,
|
|
|
133 nbuckets = options.nbuckets )
|
|
|
134 elif options.sampler == "shift":
|
|
|
135 sampler = GatEngine.SamplerShift(
|
|
|
136 radius = options.shift_expansion,
|
|
|
137 extension = options.shift_extension )
|
|
|
138 elif options.sampler == "segments":
|
|
|
139 sampler = GatEngine.SamplerSegments()
|
|
|
140 elif options.sampler == "local-permutation":
|
|
|
141 sampler = GatEngine.SamplerLocalPermutation()
|
|
|
142 elif options.sampler == "global-permutation":
|
|
|
143 sampler = GatEngine.SamplerGlobalPermutation()
|
|
|
144 elif options.sampler == "brute-force":
|
|
|
145 sampler = GatEngine.SamplerBruteForce()
|
|
|
146 elif options.sampler == "uniform":
|
|
|
147 sampler = GatEngine.SamplerUniform()
|
|
|
148
|
|
|
149 ##################################################
|
|
|
150 ##################################################
|
|
|
151 ##################################################
|
|
|
152 # initialize counter
|
|
|
153 counters = []
|
|
|
154 for counter in options.counters:
|
|
|
155 if counter == "nucleotide-overlap":
|
|
|
156 counters.append( GatEngine.CounterNucleotideOverlap() )
|
|
|
157 elif counter == "nucleotide-density":
|
|
|
158 counters.append( GatEngine.CounterNucleotideDensity() )
|
|
|
159 elif counter == "segment-overlap":
|
|
|
160 counters.append( GatEngine.CounterSegmentOverlap() )
|
|
|
161 elif counter == "annotations-overlap":
|
|
|
162 counters.append( GatEngine.CounterAnnotationsOverlap() )
|
|
|
163 elif counter == "segment-midoverlap":
|
|
|
164 counters.append( GatEngine.CounterSegmentMidpointOverlap() )
|
|
|
165 elif counter == "annotations-midoverlap":
|
|
|
166 counters.append( GatEngine.CounterAnnotationsMidpointOverlap() )
|
|
|
167 else:
|
|
|
168 raise ValueError("unknown counter '%s'" % counter )
|
|
|
169
|
|
|
170 ##################################################
|
|
|
171 ##################################################
|
|
|
172 ##################################################
|
|
|
173 ## initialize workspace generator
|
|
|
174 if options.conditional == "unconditional":
|
|
|
175 workspace_generator = GatEngine.UnconditionalWorkspace()
|
|
|
176 elif options.conditional == "cooccurance":
|
|
|
177 workspace_generator = GatEngine.ConditionalWorkspaceCooccurance()
|
|
|
178 elif options.conditional == "annotation-centered":
|
|
|
179 if options.conditional_extension == options.conditional_expansion == None:
|
|
|
180 raise ValueError( "please specify either --conditional-expansion or --conditional-extension" )
|
|
|
181 workspace_generator = GatEngine.ConditionalWorkspaceAnnotationCentered( options.conditional_extension,
|
|
|
182 options.conditional_expansion )
|
|
|
183 elif options.conditional == "segment-centered":
|
|
|
184 if options.conditional_extension == options.conditional_expansion == None:
|
|
|
185 raise ValueError( "please specify either --conditional-expansion or --conditional-extension" )
|
|
|
186
|
|
|
187 workspace_generator = GatEngine.ConditionalWorkspaceSegmentCentered( options.conditional_extension,
|
|
|
188 options.conditional_expansion )
|
|
|
189 else:
|
|
|
190 raise ValueError("unknown conditional workspace '%s'" % options.conditional )
|
|
|
191
|
|
|
192
|
|
|
193 ##################################################
|
|
|
194 ##################################################
|
|
|
195 ##################################################
|
|
|
196 ## check if reference is compplete
|
|
|
197 ##################################################
|
|
|
198 if options.reference:
|
|
|
199 reference = options.reference
|
|
|
200 for track in segments.tracks:
|
|
|
201 if track not in options.reference:
|
|
|
202 raise ValueError("missing track '%s' in reference" % track )
|
|
|
203 r = options.reference[track]
|
|
|
204 for annotation in annotations.tracks:
|
|
|
205 if annotation not in r:
|
|
|
206 raise ValueError("missing annotation '%s' in annotations for track='%s'" % (annotation, track ))
|
|
|
207
|
|
|
208
|
|
|
209 ##################################################
|
|
|
210 ##################################################
|
|
|
211 ##################################################
|
|
|
212 ## compute
|
|
|
213 ##################################################
|
|
|
214 annotator_results = gat.run( segments,
|
|
|
215 annotations,
|
|
|
216 workspace,
|
|
|
217 sampler,
|
|
|
218 counters,
|
|
|
219 workspace_generator = workspace_generator,
|
|
|
220 num_samples = options.num_samples,
|
|
|
221 cache = options.cache,
|
|
|
222 outfiles = outfiles,
|
|
|
223 output_counts_pattern = options.output_counts_pattern,
|
|
|
224 output_samples_pattern = options.output_samples_pattern,
|
|
|
225 sample_files = options.sample_files,
|
|
|
226 conditional = options.conditional,
|
|
|
227 conditional_extension = options.conditional_extension,
|
|
|
228 reference = options.reference,
|
|
|
229 pseudo_count = options.pseudo_count,
|
|
|
230 num_threads = options.num_threads )
|
|
|
231
|
|
|
232 return annotator_results
|
|
|
233
|
|
|
234
|
|
|
235 def main( argv = None ):
|
|
|
236 """script main.
|
|
|
237
|
|
|
238 parses command line options in sys.argv, unless *argv* is given.
|
|
|
239 """
|
|
|
240
|
|
|
241 if not argv: argv = sys.argv
|
|
|
242
|
|
|
243 parser = gat.buildParser( usage = globals()["__doc__"] )
|
|
|
244
|
|
|
245 ## add common options (-h/--help, ...) and parse command line
|
|
|
246 (options, args) = E.Start( parser, argv = argv, add_output_options = True )
|
|
|
247
|
|
|
248 ##################################################
|
|
|
249 description_header, descriptions, description_width = IO.readDescriptions( options )
|
|
|
250
|
|
|
251 ##################################################
|
|
|
252 size_pos, size_segment = GatSegmentList.getSegmentSize()
|
|
|
253 E.debug( "sizes: pos=%i segment=%i, max_coord=%i" % (size_pos, size_segment, 2**(8 * size_pos )))
|
|
|
254
|
|
|
255 ##################################################
|
|
|
256 # set default counter
|
|
|
257 if not options.counters:
|
|
|
258 options.counters.append( "nucleotide-overlap" )
|
|
|
259
|
|
|
260 ##################################################
|
|
|
261 if options.output_tables_pattern != None:
|
|
|
262 if "%s" not in options.output_tables_pattern:
|
|
|
263 raise ValueError( "output_tables_pattern should contain at least one '%s'")
|
|
|
264
|
|
|
265 if options.output_samples_pattern != None:
|
|
|
266 if "%s" not in options.output_samples_pattern:
|
|
|
267 raise ValueError( "output_samples_pattern should contain at least one '%s'")
|
|
|
268
|
|
|
269 if options.output_counts_pattern != None:
|
|
|
270 if "%s" not in options.output_counts_pattern:
|
|
|
271 raise ValueError( "output_counts_pattern should contain at least one '%s'")
|
|
|
272
|
|
|
273
|
|
|
274 ##################################################
|
|
|
275 # read fold changes that results should be compared with
|
|
|
276 if options.null != "default":
|
|
|
277 if not os.path.exists( options.null ):
|
|
|
278 raise OSError( "file %s not found" % options.null )
|
|
|
279 E.info( "reading reference results from %s" % options.null )
|
|
|
280 options.reference = IO.readAnnotatorResults( options.null )
|
|
|
281 else:
|
|
|
282 options.reference = None
|
|
|
283
|
|
|
284 if options.input_filename_counts:
|
|
|
285 # use pre-computed counts
|
|
|
286 annotator_results = GatEngine.fromCounts( options.input_filename_counts )
|
|
|
287
|
|
|
288 elif options.input_filename_results:
|
|
|
289 # use previous results (re-computes fdr)
|
|
|
290 E.info( "reading gat results from %s" % options.input_filename_results )
|
|
|
291 annotator_results = IO.readAnnotatorResults( options.input_filename_results )
|
|
|
292
|
|
|
293 else:
|
|
|
294 # do full gat analysis
|
|
|
295 annotator_results = fromSegments( options, args )
|
|
|
296
|
|
|
297 ##################################################
|
|
|
298 if options.pvalue_method != "empirical":
|
|
|
299 E.info("updating pvalues to %s" % options.pvalue_method )
|
|
|
300 GatEngine.updatePValues( annotator_results, options.pvalue_method )
|
|
|
301
|
|
|
302 ##################################################
|
|
|
303 ## output
|
|
|
304 IO.outputResults( annotator_results,
|
|
|
305 options,
|
|
|
306 GatEngine.AnnotatorResultExtended.headers,
|
|
|
307 description_header,
|
|
|
308 description_width,
|
|
|
309 descriptions )
|
|
|
310
|
|
|
311 IO.plotResults( annotator_results, options )
|
|
|
312
|
|
|
313 ## write footer and output benchmark information.
|
|
|
314 E.Stop()
|
|
|
315
|
|
|
316 if __name__ == "__main__":
|
|
|
317 sys.exit( main( sys.argv) )
|