comparison rlGAT/gat-great.py @ 11:53487f21c0d5 draft

Uploaded
author fubar
date Thu, 29 Aug 2013 01:57:54 -0400
parents
children
comparison
equal deleted inserted replaced
10:f04dfb37d1bb 11:53487f21c0d5
1 ################################################################################
2 #
3 # MRC FGU Computational Genomics Group
4 #
5 # $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $
6 #
7 # Copyright (C) 2009 Andreas Heger
8 #
9 # This program is free software; you can redistribute it and/or
10 # modify it under the terms of the GNU General Public License
11 # as published by the Free Software Foundation; either version 2
12 # of the License, or (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program; if not, write to the Free Software
21 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
22 #################################################################################
23 '''
24 great - run great analysis
25 ==========================
26
27 :Author: Andreas Heger
28 :Release: $Id$
29 :Date: |today|
30 :Tags: Python
31
32 Purpose
33 -------
34
35 This script compares one or more genomic segments of interest
36 against one more other genomic annotations.
37
38 GREAT says that a segment overlaps an annotation if the midpoint
39 of the segment overlaps the annotation.
40
41 The GREAT analysis is done on a per-isochore basis and overall.
42 For the overall analysis, there is no isochore correction
43 as the union of all isochores is treated as a single workspace.
44
45 This script implies several counters (``--counter`` option):
46
47 binom
48 binomial model (as implemented by GREAT). Returns the probability
49 of a certain number of segments overlapping (mid-point overlap)
50 an annotation
51
52 hyperg
53 hypergeometric model. Returns the probabibility of a certain
54 number of nucleotides within segments overlapping an annotation.
55 Conceptually, all genomic positions are treated independently and
56 segments are fragmented into a collection of single bases that are
57 placed indepently. This is valid for point intervals such as SNPs,
58 but of course ignores segment size, though it might be used as a
59 guide towards the expected fold enrichment, that can be computed
60 more accurately using :mod:`gat-run.py`
61
62 Usage
63 -----
64
65 Example::
66
67 python gat-run.py
68 --segment-file=segments.bed.gz
69 --workspace-file=workspace.bed.gz
70 --annotation-file=annotations_architecture.bed.gz
71
72 Type::
73
74 python gat-run.py --help
75
76 for command line help.
77
78 Documentation
79 -------------
80
81 Code
82 ----
83
84 '''
85
86 import os, sys, re, optparse, collections, types, glob, time
87 import numpy
88 import numpy.random
89
90 import gat
91 import gat.Experiment as E
92 import gat.IOTools as IOTools
93 import gat.IO as IO
94 import csegmentlist
95
96 import scipy.stats
97
98 GREAT_RESULT = collections.namedtuple( "GREAT",
99 ("track",
100 "annotation",
101 "isochore",
102 "counter",
103 "observed", # nsegments_overlapping_annotations
104 "expected",
105 "nsegments_in_workspace",
106 "nannotations_in_workspace",
107 "nsegments_overlapping_annotation",
108 "nannotations_overlapping_segments",
109 "basecoverage_intersection",
110 "basecoverage_segments",
111 "basecoverage_annotation",
112 "basecoverage_workspace",
113 "fraction_coverage_annotation",
114 "fold",
115 "pvalue",
116 "qvalue" ) )
117
118 def main( argv ):
119
120 if not argv: argv = sys.argv
121
122 # setup command line parser
123 parser = optparse.OptionParser( version = "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
124 usage = globals()["__doc__"] )
125
126 parser.add_option("-a", "--annotation-file", "--annotations", dest="annotation_files", type="string", action="append",
127 help="filename with annotations [default=%default]." )
128
129 parser.add_option("-s", "--segment-file", "--segments", dest="segment_files", type="string", action="append",
130 help="filename with segments. Also accepts a glob in parentheses [default=%default]." )
131
132 parser.add_option("-w", "--workspace-file", "--workspace", dest="workspace_files", type="string", action="append",
133 help="filename with workspace segments. Also accepts a glob in parentheses [default=%default]." )
134
135 parser.add_option("-i", "--isochore-file", "--isochores", dest="isochore_files", type="string", action="append",
136 help="filename with isochore segments. Also accepts a glob in parentheses [default=%default]." )
137
138 parser.add_option("-o", "--order", dest="output_order", type="choice",
139 choices = ( "track", "annotation", "fold", "pvalue", "qvalue" ),
140 help="order results in output by fold, track, etc. [default=%default]." )
141
142 parser.add_option("-q", "--qvalue-method", dest="qvalue_method", type="choice",
143 choices = ( "storey", "BH", "bonferroni", "holm", "hommel", "hochberg", "BY", "none" ),
144 help="method to perform multiple testing correction by controlling the fdr [default=%default]." )
145
146 parser.add_option( "--qvalue-lambda", dest="qvalue_lambda", type="float",
147 help="fdr computation: lambda [default=%default]." )
148
149 parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
150 choices = ("smoother", "bootstrap" ),
151 help="fdr computation: method for estimating pi0 [default=%default]." )
152 parser.add_option( "--descriptions", dest="input_filename_descriptions", type="string",
153 help="filename mapping annotation terms to descriptions. "
154 " if given, the output table will contain additional columns "
155 " [default=%default]" )
156
157 parser.add_option( "--ignore-segment-tracks", dest="ignore_segment_tracks", action="store_true",
158 help="ignore segment tracks - all segments belong to one track [default=%default]" )
159
160 parser.add_option( "--enable-split-tracks", dest="enable_split_tracks", action="store_true",
161 help="permit the same track to be in multiple files [default=%default]" )
162
163 parser.add_option( "--output-bed", dest="output_bed", type="choice", action="append",
164 choices = ( "all",
165 "annotations", "segments",
166 "workspaces", "isochores",
167 "overlap" ),
168 help="output bed files [default=%default]." )
169
170 parser.add_option( "--output-stats", dest="output_stats", type="choice", action="append",
171 choices = ( "all",
172 "annotations", "segments",
173 "workspaces", "isochores",
174 "overlap" ),
175 help="output overlap summary stats [default=%default]." )
176
177 parser.add_option( "--restrict-workspace", dest="restrict_workspace", action="store_true",
178 help="restrict workspace to those segments that contain both track"
179 " and annotations [default=%default]" )
180
181 parser.add_option( "--counter", dest="counters", type="choice", action="append",
182 choices = ( "binom", "hyperg" ),
183 help="counter to use [default=%default]." )
184
185 parser.add_option( "--output-tables-pattern", dest="output_tables_pattern", type="string",
186 help="output pattern for result tables. Used if there are multiple counters used [default=%default]." )
187
188 parser.set_defaults(
189 annotation_files = [],
190 segment_files = [],
191 workspace_files = [],
192 sample_files = [],
193 counters = [],
194 output_stats = [],
195 output_bed = [],
196 output_tables_pattern = "%s.tsv.gz",
197 output_order = "fold",
198 input_filename_counts = None,
199 input_filename_results = None,
200 pvalue_method = "empirical",
201 output_plots_pattern = None,
202 output_samples_pattern = None,
203 qvalue_method = "storey",
204 qvalue_lambda = None,
205 qvalue_pi0_method = "smoother",
206 ignore_segment_tracks = False,
207 input_filename_descriptions = None,
208 conditional = "unconditional",
209 conditional_extension = None,
210 conditional_expansion = None,
211 restrict_workspace = False,
212 enable_split_tracks = False,
213 shift_expansion = 2.0,
214 shift_extension = 0,
215 overlap_mode = "midpoint",
216 truncate_workspace_to_annotations = False,
217 )
218
219 ## add common options (-h/--help, ...) and parse command line
220 (options, args) = E.Start( parser, argv = argv, add_output_options = True )
221
222 tstart = time.time()
223
224 if len(options.counters) == 0:
225 options.counters.append("binom")
226
227 ############################################
228 segments, annotations, workspaces, isochores = IO.buildSegments( options )
229 E.info( "intervals loaded in %i seconds" % (time.time() - tstart) )
230
231 # filter segments by workspace
232 workspace = IO.applyIsochores( segments, annotations, workspaces, options, isochores )
233
234 ############################################
235 description_header, descriptions, description_width = IO.readDescriptions( options )
236
237 ############################################
238 ############################################
239 ## compute per contig
240
241 # compute bases covered by workspace
242 workspace2basecoverage, isochores = {}, []
243 for contig, ww in workspace.iteritems():
244 workspace2basecoverage[contig] = ww.sum()
245 isochores.append( contig )
246
247 # compute percentage of bases covered by annotations in workspace
248 # per isochore
249 annotation2basecoverage = collections.defaultdict( dict )
250 for annotation, aa in annotations.iteritems():
251 for isochore, a in aa.iteritems():
252 # need to truncate to workspace?
253 annotation2basecoverage[annotation][isochore] = a.sum()
254
255 results_per_contig = collections.defaultdict( list )
256
257 E.info( "computing counts per isochore" )
258 # results per isochore
259 def emptyResult( segment, annotation, isochore,
260 counter,
261 nsegments_in_workspace,
262 basecoverage_annotation,
263 basecoverage_workspace):
264 return GREAT_RESULT._make( (
265 segment, annotation, isochore,
266 counter,
267 0, # observed
268 0, # expected
269 nsegments_in_workspace,
270 0, # nannotations_in_workspace
271 0, # nsegments_overlapping_annotation
272 0, # nannotations_overlapping_segments
273 0, # basecoverage_intersection
274 0, # basecoverage_segments
275 basecoverage_annotation,
276 basecoverage_workspace,
277 0.0,
278 1.0,
279 1.0,
280 1.0 ))
281
282 for isochore in isochores:
283 basecoverage_workspace = workspace2basecoverage[isochore]
284
285 # iterate over all isochores
286 for segment, segmentdict in segments.iteritems():
287 try:
288 ss = segmentdict[isochore]
289 # select segments overlapping workspace
290 segments_in_workspace = csegmentlist.SegmentList( clone = ss )
291 segments_in_workspace.intersect( workspace[isochore] )
292 # number of segments in workspace
293 nsegments_in_workspace = len(segments_in_workspace)
294 except KeyError:
295 ss = None
296
297 basecoverage_segments = segments_in_workspace.sum()
298
299 for annotation, annotationdict in annotations.iteritems():
300
301 # if annotation != "GO:0030957": continue
302
303 try:
304 aa = annotationdict[isochore]
305 except KeyError:
306 aa = None
307
308 # p_A: proportion of bases covered by annotation
309 try:
310 basecoverage_annotation = annotation2basecoverage[annotation][isochore]
311 except KeyError:
312 basecoverage_annotation = 0
313
314 if ss == None or aa == None:
315 for counter in options.counters:
316 results_per_contig[(counter,segment,annotation)].append( emptyResult(segment, annotation,
317 isochore,
318 counter,
319 nsegments_in_workspace,
320 basecoverage_annotation,
321 basecoverage_workspace ) )
322 continue
323
324 # select segments overlapping annotation
325 segments_overlapping_annotation = csegmentlist.SegmentList( clone = ss )
326 segments_overlapping_annotation.intersect( annotations[annotation][isochore] )
327 # number of segments in annotation
328 nsegments_overlapping_annotation = ss.intersectionWithSegments( annotations[annotation][isochore],
329 mode = options.overlap_mode )
330
331 # number of nucleotides at the intersection of segments, annotation and workspace
332 basecoverage_intersection = segments_overlapping_annotation.sum()
333
334 annotations_overlapping_segments = csegmentlist.SegmentList( clone = aa )
335 annotations_overlapping_segments.intersect( ss )
336 nannotations_overlapping_segments = len( annotations_overlapping_segments )
337
338 nannotations_in_workspace = len( aa )
339 if nannotations_in_workspace == 0:
340 for counter in options.counters:
341 results_per_contig[(counter,segment,annotation)].append( emptyResult(segment,
342 annotation,
343 isochore,
344 counter,
345 nsegments_in_workspace,
346 basecoverage_annotation,
347 basecoverage_workspace ) )
348 continue
349
350 fraction_coverage_annotation = basecoverage_annotation / float( basecoverage_workspace )
351 fraction_hit_annotation = float(nannotations_overlapping_segments) / nannotations_in_workspace
352
353 for counter in options.counters:
354 if counter.startswith( "binom" ):
355 # GREAT binomial probability over "regions"
356 # n = number of genomic regions = nannotations_in_workspace
357 # ppi = fraction of genome annotated by annotation = fraction_coverage_annotation
358 # kpi = genomic regions with annotation hit by segments = nannotations_in_segments
359 # sf = survival functions = 1 -cdf
360 # probability of observing >kpi in a sample of n where the probabily of succes is
361 # ppi.
362 pvalue = scipy.stats.binom.sf( nsegments_overlapping_annotation - 1,
363 nsegments_in_workspace,
364 fraction_coverage_annotation )
365
366 expected = fraction_coverage_annotation * nsegments_in_workspace
367 observed = nsegments_overlapping_annotation
368
369 elif counter.startswith( "hyperg" ):
370
371 # hypergeometric probability over nucleotides
372 # Sampling without replacement
373 # x,M,n,M
374 # x = observed number of nucleotides in overlap of segments,annotations and workspace
375 # M = number of nucleotides in workspace
376 # n = number of nucleotides in annotations (and workspace)
377 # N = number of nucleotides in segments (and workspace)
378 # P-value of obtaining >x number of nucleotides overlapping.
379 rv = scipy.stats.hypergeom( basecoverage_workspace,
380 basecoverage_annotation,
381 basecoverage_segments )
382
383 pvalue = rv.sf( basecoverage_intersection )
384 expected = rv.mean()
385 observed = basecoverage_intersection
386
387 if expected != 0:
388 fold = float(observed) / expected
389 else:
390 fold = 1.0
391
392 r = GREAT_RESULT._make( (
393 segment, annotation, isochore,
394 counter,
395 observed,
396 expected,
397 nsegments_in_workspace,
398 nannotations_in_workspace,
399 nsegments_overlapping_annotation,
400 nannotations_overlapping_segments,
401 basecoverage_intersection,
402 basecoverage_segments,
403 basecoverage_annotation,
404 basecoverage_workspace,
405 fraction_coverage_annotation,
406 fold,
407 pvalue,
408 1.0 ))
409 # print "\t".join( map(str, r))
410 results_per_contig[ (counter,segment,annotation) ].append( r )
411
412 E.info( "merging counts per isochore" )
413
414 # compute sums
415 results = []
416
417 for niteration, pair in enumerate(results_per_contig.iteritems()):
418
419 counter, segment, annotation = pair[0]
420 data = pair[1]
421
422 nsegments_in_workspace = sum( [x.nsegments_in_workspace for x in data ] )
423 nsegments_overlapping_annotation = sum( [x.observed for x in data ] )
424 nannotations_in_workspace = sum( [x.nannotations_in_workspace for x in data ] )
425 nannotations_overlapping_segments = sum( [x.nannotations_overlapping_segments for x in data ] )
426
427 basecoverage_intersection = sum( [x.basecoverage_intersection for x in data ] )
428 basecoverage_segments = sum( [x.basecoverage_segments for x in data ] )
429 basecoverage_annotation = sum( [x.basecoverage_annotation for x in data ] )
430 basecoverage_workspace = sum( [x.basecoverage_workspace for x in data ] )
431
432 fraction_coverage_annotation = basecoverage_annotation / float( basecoverage_workspace )
433
434 if counter.startswith( "binom" ):
435 pvalue = scipy.stats.binom.sf( nsegments_overlapping_annotation-1,
436 nsegments_in_workspace,
437 fraction_coverage_annotation )
438 expected = fraction_coverage_annotation * nsegments_in_workspace
439 observed = nsegments_overlapping_annotation
440 elif counter.startswith( "hyperg" ):
441 rv = scipy.stats.hypergeom( basecoverage_workspace,
442 basecoverage_annotation,
443 basecoverage_segments )
444
445 pvalue = rv.sf( basecoverage_intersection )
446 expected = rv.mean()
447 observed = basecoverage_intersection
448
449
450 if expected != 0:
451 fold = float(observed) / expected
452 else:
453 fold = 1.0
454
455 r = GREAT_RESULT._make( (
456 segment, annotation, "all",
457 counter,
458 observed,
459 expected,
460 nsegments_in_workspace,
461 nannotations_in_workspace,
462 nsegments_overlapping_annotation,
463 nannotations_overlapping_segments,
464 basecoverage_intersection,
465 basecoverage_segments,
466 basecoverage_annotation,
467 basecoverage_workspace,
468 fraction_coverage_annotation,
469 fold,
470 pvalue,
471 1.0 ))
472
473 results.append( r )
474
475 IO.outputResults( results,
476 options,
477 GREAT_RESULT._fields,
478 description_header,
479 description_width,
480 descriptions )
481
482 E.Stop()
483
484 if __name__ == "__main__":
485 sys.exit( main( sys.argv) )
486
487
488