|
11
|
1 ################################################################################
|
|
|
2 #
|
|
|
3 # MRC FGU Computational Genomics Group
|
|
|
4 #
|
|
|
5 # $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $
|
|
|
6 #
|
|
|
7 # Copyright (C) 2009 Andreas Heger
|
|
|
8 #
|
|
|
9 # This program is free software; you can redistribute it and/or
|
|
|
10 # modify it under the terms of the GNU General Public License
|
|
|
11 # as published by the Free Software Foundation; either version 2
|
|
|
12 # of the License, or (at your option) any later version.
|
|
|
13 #
|
|
|
14 # This program is distributed in the hope that it will be useful,
|
|
|
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
17 # GNU General Public License for more details.
|
|
|
18 #
|
|
|
19 # You should have received a copy of the GNU General Public License
|
|
|
20 # along with this program; if not, write to the Free Software
|
|
|
21 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
|
|
22 #################################################################################
|
|
|
23 '''
|
|
|
24 great - run great analysis
|
|
|
25 ==========================
|
|
|
26
|
|
|
27 :Author: Andreas Heger
|
|
|
28 :Release: $Id$
|
|
|
29 :Date: |today|
|
|
|
30 :Tags: Python
|
|
|
31
|
|
|
32 Purpose
|
|
|
33 -------
|
|
|
34
|
|
|
35 This script compares one or more genomic segments of interest
|
|
|
36 against one more other genomic annotations.
|
|
|
37
|
|
|
38 GREAT says that a segment overlaps an annotation if the midpoint
|
|
|
39 of the segment overlaps the annotation.
|
|
|
40
|
|
|
41 The GREAT analysis is done on a per-isochore basis and overall.
|
|
|
42 For the overall analysis, there is no isochore correction
|
|
|
43 as the union of all isochores is treated as a single workspace.
|
|
|
44
|
|
|
45 This script implies several counters (``--counter`` option):
|
|
|
46
|
|
|
47 binom
|
|
|
48 binomial model (as implemented by GREAT). Returns the probability
|
|
|
49 of a certain number of segments overlapping (mid-point overlap)
|
|
|
50 an annotation
|
|
|
51
|
|
|
52 hyperg
|
|
|
53 hypergeometric model. Returns the probabibility of a certain
|
|
|
54 number of nucleotides within segments overlapping an annotation.
|
|
|
55 Conceptually, all genomic positions are treated independently and
|
|
|
56 segments are fragmented into a collection of single bases that are
|
|
|
57 placed indepently. This is valid for point intervals such as SNPs,
|
|
|
58 but of course ignores segment size, though it might be used as a
|
|
|
59 guide towards the expected fold enrichment, that can be computed
|
|
|
60 more accurately using :mod:`gat-run.py`
|
|
|
61
|
|
|
62 Usage
|
|
|
63 -----
|
|
|
64
|
|
|
65 Example::
|
|
|
66
|
|
|
67 python gat-run.py
|
|
|
68 --segment-file=segments.bed.gz
|
|
|
69 --workspace-file=workspace.bed.gz
|
|
|
70 --annotation-file=annotations_architecture.bed.gz
|
|
|
71
|
|
|
72 Type::
|
|
|
73
|
|
|
74 python gat-run.py --help
|
|
|
75
|
|
|
76 for command line help.
|
|
|
77
|
|
|
78 Documentation
|
|
|
79 -------------
|
|
|
80
|
|
|
81 Code
|
|
|
82 ----
|
|
|
83
|
|
|
84 '''
|
|
|
85
|
|
|
86 import os, sys, re, optparse, collections, types, glob, time
|
|
|
87 import numpy
|
|
|
88 import numpy.random
|
|
|
89
|
|
|
90 import gat
|
|
|
91 import gat.Experiment as E
|
|
|
92 import gat.IOTools as IOTools
|
|
|
93 import gat.IO as IO
|
|
|
94 import csegmentlist
|
|
|
95
|
|
|
96 import scipy.stats
|
|
|
97
|
|
|
98 GREAT_RESULT = collections.namedtuple( "GREAT",
|
|
|
99 ("track",
|
|
|
100 "annotation",
|
|
|
101 "isochore",
|
|
|
102 "counter",
|
|
|
103 "observed", # nsegments_overlapping_annotations
|
|
|
104 "expected",
|
|
|
105 "nsegments_in_workspace",
|
|
|
106 "nannotations_in_workspace",
|
|
|
107 "nsegments_overlapping_annotation",
|
|
|
108 "nannotations_overlapping_segments",
|
|
|
109 "basecoverage_intersection",
|
|
|
110 "basecoverage_segments",
|
|
|
111 "basecoverage_annotation",
|
|
|
112 "basecoverage_workspace",
|
|
|
113 "fraction_coverage_annotation",
|
|
|
114 "fold",
|
|
|
115 "pvalue",
|
|
|
116 "qvalue" ) )
|
|
|
117
|
|
|
118 def main( argv ):
|
|
|
119
|
|
|
120 if not argv: argv = sys.argv
|
|
|
121
|
|
|
122 # setup command line parser
|
|
|
123 parser = optparse.OptionParser( version = "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
|
|
|
124 usage = globals()["__doc__"] )
|
|
|
125
|
|
|
126 parser.add_option("-a", "--annotation-file", "--annotations", dest="annotation_files", type="string", action="append",
|
|
|
127 help="filename with annotations [default=%default]." )
|
|
|
128
|
|
|
129 parser.add_option("-s", "--segment-file", "--segments", dest="segment_files", type="string", action="append",
|
|
|
130 help="filename with segments. Also accepts a glob in parentheses [default=%default]." )
|
|
|
131
|
|
|
132 parser.add_option("-w", "--workspace-file", "--workspace", dest="workspace_files", type="string", action="append",
|
|
|
133 help="filename with workspace segments. Also accepts a glob in parentheses [default=%default]." )
|
|
|
134
|
|
|
135 parser.add_option("-i", "--isochore-file", "--isochores", dest="isochore_files", type="string", action="append",
|
|
|
136 help="filename with isochore segments. Also accepts a glob in parentheses [default=%default]." )
|
|
|
137
|
|
|
138 parser.add_option("-o", "--order", dest="output_order", type="choice",
|
|
|
139 choices = ( "track", "annotation", "fold", "pvalue", "qvalue" ),
|
|
|
140 help="order results in output by fold, track, etc. [default=%default]." )
|
|
|
141
|
|
|
142 parser.add_option("-q", "--qvalue-method", dest="qvalue_method", type="choice",
|
|
|
143 choices = ( "storey", "BH", "bonferroni", "holm", "hommel", "hochberg", "BY", "none" ),
|
|
|
144 help="method to perform multiple testing correction by controlling the fdr [default=%default]." )
|
|
|
145
|
|
|
146 parser.add_option( "--qvalue-lambda", dest="qvalue_lambda", type="float",
|
|
|
147 help="fdr computation: lambda [default=%default]." )
|
|
|
148
|
|
|
149 parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
|
|
|
150 choices = ("smoother", "bootstrap" ),
|
|
|
151 help="fdr computation: method for estimating pi0 [default=%default]." )
|
|
|
152 parser.add_option( "--descriptions", dest="input_filename_descriptions", type="string",
|
|
|
153 help="filename mapping annotation terms to descriptions. "
|
|
|
154 " if given, the output table will contain additional columns "
|
|
|
155 " [default=%default]" )
|
|
|
156
|
|
|
157 parser.add_option( "--ignore-segment-tracks", dest="ignore_segment_tracks", action="store_true",
|
|
|
158 help="ignore segment tracks - all segments belong to one track [default=%default]" )
|
|
|
159
|
|
|
160 parser.add_option( "--enable-split-tracks", dest="enable_split_tracks", action="store_true",
|
|
|
161 help="permit the same track to be in multiple files [default=%default]" )
|
|
|
162
|
|
|
163 parser.add_option( "--output-bed", dest="output_bed", type="choice", action="append",
|
|
|
164 choices = ( "all",
|
|
|
165 "annotations", "segments",
|
|
|
166 "workspaces", "isochores",
|
|
|
167 "overlap" ),
|
|
|
168 help="output bed files [default=%default]." )
|
|
|
169
|
|
|
170 parser.add_option( "--output-stats", dest="output_stats", type="choice", action="append",
|
|
|
171 choices = ( "all",
|
|
|
172 "annotations", "segments",
|
|
|
173 "workspaces", "isochores",
|
|
|
174 "overlap" ),
|
|
|
175 help="output overlap summary stats [default=%default]." )
|
|
|
176
|
|
|
177 parser.add_option( "--restrict-workspace", dest="restrict_workspace", action="store_true",
|
|
|
178 help="restrict workspace to those segments that contain both track"
|
|
|
179 " and annotations [default=%default]" )
|
|
|
180
|
|
|
181 parser.add_option( "--counter", dest="counters", type="choice", action="append",
|
|
|
182 choices = ( "binom", "hyperg" ),
|
|
|
183 help="counter to use [default=%default]." )
|
|
|
184
|
|
|
185 parser.add_option( "--output-tables-pattern", dest="output_tables_pattern", type="string",
|
|
|
186 help="output pattern for result tables. Used if there are multiple counters used [default=%default]." )
|
|
|
187
|
|
|
188 parser.set_defaults(
|
|
|
189 annotation_files = [],
|
|
|
190 segment_files = [],
|
|
|
191 workspace_files = [],
|
|
|
192 sample_files = [],
|
|
|
193 counters = [],
|
|
|
194 output_stats = [],
|
|
|
195 output_bed = [],
|
|
|
196 output_tables_pattern = "%s.tsv.gz",
|
|
|
197 output_order = "fold",
|
|
|
198 input_filename_counts = None,
|
|
|
199 input_filename_results = None,
|
|
|
200 pvalue_method = "empirical",
|
|
|
201 output_plots_pattern = None,
|
|
|
202 output_samples_pattern = None,
|
|
|
203 qvalue_method = "storey",
|
|
|
204 qvalue_lambda = None,
|
|
|
205 qvalue_pi0_method = "smoother",
|
|
|
206 ignore_segment_tracks = False,
|
|
|
207 input_filename_descriptions = None,
|
|
|
208 conditional = "unconditional",
|
|
|
209 conditional_extension = None,
|
|
|
210 conditional_expansion = None,
|
|
|
211 restrict_workspace = False,
|
|
|
212 enable_split_tracks = False,
|
|
|
213 shift_expansion = 2.0,
|
|
|
214 shift_extension = 0,
|
|
|
215 overlap_mode = "midpoint",
|
|
|
216 truncate_workspace_to_annotations = False,
|
|
|
217 )
|
|
|
218
|
|
|
219 ## add common options (-h/--help, ...) and parse command line
|
|
|
220 (options, args) = E.Start( parser, argv = argv, add_output_options = True )
|
|
|
221
|
|
|
222 tstart = time.time()
|
|
|
223
|
|
|
224 if len(options.counters) == 0:
|
|
|
225 options.counters.append("binom")
|
|
|
226
|
|
|
227 ############################################
|
|
|
228 segments, annotations, workspaces, isochores = IO.buildSegments( options )
|
|
|
229 E.info( "intervals loaded in %i seconds" % (time.time() - tstart) )
|
|
|
230
|
|
|
231 # filter segments by workspace
|
|
|
232 workspace = IO.applyIsochores( segments, annotations, workspaces, options, isochores )
|
|
|
233
|
|
|
234 ############################################
|
|
|
235 description_header, descriptions, description_width = IO.readDescriptions( options )
|
|
|
236
|
|
|
237 ############################################
|
|
|
238 ############################################
|
|
|
239 ## compute per contig
|
|
|
240
|
|
|
241 # compute bases covered by workspace
|
|
|
242 workspace2basecoverage, isochores = {}, []
|
|
|
243 for contig, ww in workspace.iteritems():
|
|
|
244 workspace2basecoverage[contig] = ww.sum()
|
|
|
245 isochores.append( contig )
|
|
|
246
|
|
|
247 # compute percentage of bases covered by annotations in workspace
|
|
|
248 # per isochore
|
|
|
249 annotation2basecoverage = collections.defaultdict( dict )
|
|
|
250 for annotation, aa in annotations.iteritems():
|
|
|
251 for isochore, a in aa.iteritems():
|
|
|
252 # need to truncate to workspace?
|
|
|
253 annotation2basecoverage[annotation][isochore] = a.sum()
|
|
|
254
|
|
|
255 results_per_contig = collections.defaultdict( list )
|
|
|
256
|
|
|
257 E.info( "computing counts per isochore" )
|
|
|
258 # results per isochore
|
|
|
259 def emptyResult( segment, annotation, isochore,
|
|
|
260 counter,
|
|
|
261 nsegments_in_workspace,
|
|
|
262 basecoverage_annotation,
|
|
|
263 basecoverage_workspace):
|
|
|
264 return GREAT_RESULT._make( (
|
|
|
265 segment, annotation, isochore,
|
|
|
266 counter,
|
|
|
267 0, # observed
|
|
|
268 0, # expected
|
|
|
269 nsegments_in_workspace,
|
|
|
270 0, # nannotations_in_workspace
|
|
|
271 0, # nsegments_overlapping_annotation
|
|
|
272 0, # nannotations_overlapping_segments
|
|
|
273 0, # basecoverage_intersection
|
|
|
274 0, # basecoverage_segments
|
|
|
275 basecoverage_annotation,
|
|
|
276 basecoverage_workspace,
|
|
|
277 0.0,
|
|
|
278 1.0,
|
|
|
279 1.0,
|
|
|
280 1.0 ))
|
|
|
281
|
|
|
282 for isochore in isochores:
|
|
|
283 basecoverage_workspace = workspace2basecoverage[isochore]
|
|
|
284
|
|
|
285 # iterate over all isochores
|
|
|
286 for segment, segmentdict in segments.iteritems():
|
|
|
287 try:
|
|
|
288 ss = segmentdict[isochore]
|
|
|
289 # select segments overlapping workspace
|
|
|
290 segments_in_workspace = csegmentlist.SegmentList( clone = ss )
|
|
|
291 segments_in_workspace.intersect( workspace[isochore] )
|
|
|
292 # number of segments in workspace
|
|
|
293 nsegments_in_workspace = len(segments_in_workspace)
|
|
|
294 except KeyError:
|
|
|
295 ss = None
|
|
|
296
|
|
|
297 basecoverage_segments = segments_in_workspace.sum()
|
|
|
298
|
|
|
299 for annotation, annotationdict in annotations.iteritems():
|
|
|
300
|
|
|
301 # if annotation != "GO:0030957": continue
|
|
|
302
|
|
|
303 try:
|
|
|
304 aa = annotationdict[isochore]
|
|
|
305 except KeyError:
|
|
|
306 aa = None
|
|
|
307
|
|
|
308 # p_A: proportion of bases covered by annotation
|
|
|
309 try:
|
|
|
310 basecoverage_annotation = annotation2basecoverage[annotation][isochore]
|
|
|
311 except KeyError:
|
|
|
312 basecoverage_annotation = 0
|
|
|
313
|
|
|
314 if ss == None or aa == None:
|
|
|
315 for counter in options.counters:
|
|
|
316 results_per_contig[(counter,segment,annotation)].append( emptyResult(segment, annotation,
|
|
|
317 isochore,
|
|
|
318 counter,
|
|
|
319 nsegments_in_workspace,
|
|
|
320 basecoverage_annotation,
|
|
|
321 basecoverage_workspace ) )
|
|
|
322 continue
|
|
|
323
|
|
|
324 # select segments overlapping annotation
|
|
|
325 segments_overlapping_annotation = csegmentlist.SegmentList( clone = ss )
|
|
|
326 segments_overlapping_annotation.intersect( annotations[annotation][isochore] )
|
|
|
327 # number of segments in annotation
|
|
|
328 nsegments_overlapping_annotation = ss.intersectionWithSegments( annotations[annotation][isochore],
|
|
|
329 mode = options.overlap_mode )
|
|
|
330
|
|
|
331 # number of nucleotides at the intersection of segments, annotation and workspace
|
|
|
332 basecoverage_intersection = segments_overlapping_annotation.sum()
|
|
|
333
|
|
|
334 annotations_overlapping_segments = csegmentlist.SegmentList( clone = aa )
|
|
|
335 annotations_overlapping_segments.intersect( ss )
|
|
|
336 nannotations_overlapping_segments = len( annotations_overlapping_segments )
|
|
|
337
|
|
|
338 nannotations_in_workspace = len( aa )
|
|
|
339 if nannotations_in_workspace == 0:
|
|
|
340 for counter in options.counters:
|
|
|
341 results_per_contig[(counter,segment,annotation)].append( emptyResult(segment,
|
|
|
342 annotation,
|
|
|
343 isochore,
|
|
|
344 counter,
|
|
|
345 nsegments_in_workspace,
|
|
|
346 basecoverage_annotation,
|
|
|
347 basecoverage_workspace ) )
|
|
|
348 continue
|
|
|
349
|
|
|
350 fraction_coverage_annotation = basecoverage_annotation / float( basecoverage_workspace )
|
|
|
351 fraction_hit_annotation = float(nannotations_overlapping_segments) / nannotations_in_workspace
|
|
|
352
|
|
|
353 for counter in options.counters:
|
|
|
354 if counter.startswith( "binom" ):
|
|
|
355 # GREAT binomial probability over "regions"
|
|
|
356 # n = number of genomic regions = nannotations_in_workspace
|
|
|
357 # ppi = fraction of genome annotated by annotation = fraction_coverage_annotation
|
|
|
358 # kpi = genomic regions with annotation hit by segments = nannotations_in_segments
|
|
|
359 # sf = survival functions = 1 -cdf
|
|
|
360 # probability of observing >kpi in a sample of n where the probabily of succes is
|
|
|
361 # ppi.
|
|
|
362 pvalue = scipy.stats.binom.sf( nsegments_overlapping_annotation - 1,
|
|
|
363 nsegments_in_workspace,
|
|
|
364 fraction_coverage_annotation )
|
|
|
365
|
|
|
366 expected = fraction_coverage_annotation * nsegments_in_workspace
|
|
|
367 observed = nsegments_overlapping_annotation
|
|
|
368
|
|
|
369 elif counter.startswith( "hyperg" ):
|
|
|
370
|
|
|
371 # hypergeometric probability over nucleotides
|
|
|
372 # Sampling without replacement
|
|
|
373 # x,M,n,M
|
|
|
374 # x = observed number of nucleotides in overlap of segments,annotations and workspace
|
|
|
375 # M = number of nucleotides in workspace
|
|
|
376 # n = number of nucleotides in annotations (and workspace)
|
|
|
377 # N = number of nucleotides in segments (and workspace)
|
|
|
378 # P-value of obtaining >x number of nucleotides overlapping.
|
|
|
379 rv = scipy.stats.hypergeom( basecoverage_workspace,
|
|
|
380 basecoverage_annotation,
|
|
|
381 basecoverage_segments )
|
|
|
382
|
|
|
383 pvalue = rv.sf( basecoverage_intersection )
|
|
|
384 expected = rv.mean()
|
|
|
385 observed = basecoverage_intersection
|
|
|
386
|
|
|
387 if expected != 0:
|
|
|
388 fold = float(observed) / expected
|
|
|
389 else:
|
|
|
390 fold = 1.0
|
|
|
391
|
|
|
392 r = GREAT_RESULT._make( (
|
|
|
393 segment, annotation, isochore,
|
|
|
394 counter,
|
|
|
395 observed,
|
|
|
396 expected,
|
|
|
397 nsegments_in_workspace,
|
|
|
398 nannotations_in_workspace,
|
|
|
399 nsegments_overlapping_annotation,
|
|
|
400 nannotations_overlapping_segments,
|
|
|
401 basecoverage_intersection,
|
|
|
402 basecoverage_segments,
|
|
|
403 basecoverage_annotation,
|
|
|
404 basecoverage_workspace,
|
|
|
405 fraction_coverage_annotation,
|
|
|
406 fold,
|
|
|
407 pvalue,
|
|
|
408 1.0 ))
|
|
|
409 # print "\t".join( map(str, r))
|
|
|
410 results_per_contig[ (counter,segment,annotation) ].append( r )
|
|
|
411
|
|
|
412 E.info( "merging counts per isochore" )
|
|
|
413
|
|
|
414 # compute sums
|
|
|
415 results = []
|
|
|
416
|
|
|
417 for niteration, pair in enumerate(results_per_contig.iteritems()):
|
|
|
418
|
|
|
419 counter, segment, annotation = pair[0]
|
|
|
420 data = pair[1]
|
|
|
421
|
|
|
422 nsegments_in_workspace = sum( [x.nsegments_in_workspace for x in data ] )
|
|
|
423 nsegments_overlapping_annotation = sum( [x.observed for x in data ] )
|
|
|
424 nannotations_in_workspace = sum( [x.nannotations_in_workspace for x in data ] )
|
|
|
425 nannotations_overlapping_segments = sum( [x.nannotations_overlapping_segments for x in data ] )
|
|
|
426
|
|
|
427 basecoverage_intersection = sum( [x.basecoverage_intersection for x in data ] )
|
|
|
428 basecoverage_segments = sum( [x.basecoverage_segments for x in data ] )
|
|
|
429 basecoverage_annotation = sum( [x.basecoverage_annotation for x in data ] )
|
|
|
430 basecoverage_workspace = sum( [x.basecoverage_workspace for x in data ] )
|
|
|
431
|
|
|
432 fraction_coverage_annotation = basecoverage_annotation / float( basecoverage_workspace )
|
|
|
433
|
|
|
434 if counter.startswith( "binom" ):
|
|
|
435 pvalue = scipy.stats.binom.sf( nsegments_overlapping_annotation-1,
|
|
|
436 nsegments_in_workspace,
|
|
|
437 fraction_coverage_annotation )
|
|
|
438 expected = fraction_coverage_annotation * nsegments_in_workspace
|
|
|
439 observed = nsegments_overlapping_annotation
|
|
|
440 elif counter.startswith( "hyperg" ):
|
|
|
441 rv = scipy.stats.hypergeom( basecoverage_workspace,
|
|
|
442 basecoverage_annotation,
|
|
|
443 basecoverage_segments )
|
|
|
444
|
|
|
445 pvalue = rv.sf( basecoverage_intersection )
|
|
|
446 expected = rv.mean()
|
|
|
447 observed = basecoverage_intersection
|
|
|
448
|
|
|
449
|
|
|
450 if expected != 0:
|
|
|
451 fold = float(observed) / expected
|
|
|
452 else:
|
|
|
453 fold = 1.0
|
|
|
454
|
|
|
455 r = GREAT_RESULT._make( (
|
|
|
456 segment, annotation, "all",
|
|
|
457 counter,
|
|
|
458 observed,
|
|
|
459 expected,
|
|
|
460 nsegments_in_workspace,
|
|
|
461 nannotations_in_workspace,
|
|
|
462 nsegments_overlapping_annotation,
|
|
|
463 nannotations_overlapping_segments,
|
|
|
464 basecoverage_intersection,
|
|
|
465 basecoverage_segments,
|
|
|
466 basecoverage_annotation,
|
|
|
467 basecoverage_workspace,
|
|
|
468 fraction_coverage_annotation,
|
|
|
469 fold,
|
|
|
470 pvalue,
|
|
|
471 1.0 ))
|
|
|
472
|
|
|
473 results.append( r )
|
|
|
474
|
|
|
475 IO.outputResults( results,
|
|
|
476 options,
|
|
|
477 GREAT_RESULT._fields,
|
|
|
478 description_header,
|
|
|
479 description_width,
|
|
|
480 descriptions )
|
|
|
481
|
|
|
482 E.Stop()
|
|
|
483
|
|
|
484 if __name__ == "__main__":
|
|
|
485 sys.exit( main( sys.argv) )
|
|
|
486
|
|
|
487
|
|
|
488
|