Mercurial > repos > fubar > genomic_association_tester
comparison rlGAT/gat-great.py @ 11:53487f21c0d5 draft
Uploaded
| author | fubar |
|---|---|
| date | Thu, 29 Aug 2013 01:57:54 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 10:f04dfb37d1bb | 11:53487f21c0d5 |
|---|---|
| 1 ################################################################################ | |
| 2 # | |
| 3 # MRC FGU Computational Genomics Group | |
| 4 # | |
| 5 # $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $ | |
| 6 # | |
| 7 # Copyright (C) 2009 Andreas Heger | |
| 8 # | |
| 9 # This program is free software; you can redistribute it and/or | |
| 10 # modify it under the terms of the GNU General Public License | |
| 11 # as published by the Free Software Foundation; either version 2 | |
| 12 # of the License, or (at your option) any later version. | |
| 13 # | |
| 14 # This program is distributed in the hope that it will be useful, | |
| 15 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 17 # GNU General Public License for more details. | |
| 18 # | |
| 19 # You should have received a copy of the GNU General Public License | |
| 20 # along with this program; if not, write to the Free Software | |
| 21 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | |
| 22 ################################################################################# | |
| 23 ''' | |
| 24 great - run great analysis | |
| 25 ========================== | |
| 26 | |
| 27 :Author: Andreas Heger | |
| 28 :Release: $Id$ | |
| 29 :Date: |today| | |
| 30 :Tags: Python | |
| 31 | |
| 32 Purpose | |
| 33 ------- | |
| 34 | |
| 35 This script compares one or more genomic segments of interest | |
| 36 against one more other genomic annotations. | |
| 37 | |
| 38 GREAT says that a segment overlaps an annotation if the midpoint | |
| 39 of the segment overlaps the annotation. | |
| 40 | |
| 41 The GREAT analysis is done on a per-isochore basis and overall. | |
| 42 For the overall analysis, there is no isochore correction | |
| 43 as the union of all isochores is treated as a single workspace. | |
| 44 | |
| 45 This script implies several counters (``--counter`` option): | |
| 46 | |
| 47 binom | |
| 48 binomial model (as implemented by GREAT). Returns the probability | |
| 49 of a certain number of segments overlapping (mid-point overlap) | |
| 50 an annotation | |
| 51 | |
| 52 hyperg | |
| 53 hypergeometric model. Returns the probabibility of a certain | |
| 54 number of nucleotides within segments overlapping an annotation. | |
| 55 Conceptually, all genomic positions are treated independently and | |
| 56 segments are fragmented into a collection of single bases that are | |
| 57 placed indepently. This is valid for point intervals such as SNPs, | |
| 58 but of course ignores segment size, though it might be used as a | |
| 59 guide towards the expected fold enrichment, that can be computed | |
| 60 more accurately using :mod:`gat-run.py` | |
| 61 | |
| 62 Usage | |
| 63 ----- | |
| 64 | |
| 65 Example:: | |
| 66 | |
| 67 python gat-run.py | |
| 68 --segment-file=segments.bed.gz | |
| 69 --workspace-file=workspace.bed.gz | |
| 70 --annotation-file=annotations_architecture.bed.gz | |
| 71 | |
| 72 Type:: | |
| 73 | |
| 74 python gat-run.py --help | |
| 75 | |
| 76 for command line help. | |
| 77 | |
| 78 Documentation | |
| 79 ------------- | |
| 80 | |
| 81 Code | |
| 82 ---- | |
| 83 | |
| 84 ''' | |
| 85 | |
| 86 import os, sys, re, optparse, collections, types, glob, time | |
| 87 import numpy | |
| 88 import numpy.random | |
| 89 | |
| 90 import gat | |
| 91 import gat.Experiment as E | |
| 92 import gat.IOTools as IOTools | |
| 93 import gat.IO as IO | |
| 94 import csegmentlist | |
| 95 | |
| 96 import scipy.stats | |
| 97 | |
| 98 GREAT_RESULT = collections.namedtuple( "GREAT", | |
| 99 ("track", | |
| 100 "annotation", | |
| 101 "isochore", | |
| 102 "counter", | |
| 103 "observed", # nsegments_overlapping_annotations | |
| 104 "expected", | |
| 105 "nsegments_in_workspace", | |
| 106 "nannotations_in_workspace", | |
| 107 "nsegments_overlapping_annotation", | |
| 108 "nannotations_overlapping_segments", | |
| 109 "basecoverage_intersection", | |
| 110 "basecoverage_segments", | |
| 111 "basecoverage_annotation", | |
| 112 "basecoverage_workspace", | |
| 113 "fraction_coverage_annotation", | |
| 114 "fold", | |
| 115 "pvalue", | |
| 116 "qvalue" ) ) | |
| 117 | |
| 118 def main( argv ): | |
| 119 | |
| 120 if not argv: argv = sys.argv | |
| 121 | |
| 122 # setup command line parser | |
| 123 parser = optparse.OptionParser( version = "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $", | |
| 124 usage = globals()["__doc__"] ) | |
| 125 | |
| 126 parser.add_option("-a", "--annotation-file", "--annotations", dest="annotation_files", type="string", action="append", | |
| 127 help="filename with annotations [default=%default]." ) | |
| 128 | |
| 129 parser.add_option("-s", "--segment-file", "--segments", dest="segment_files", type="string", action="append", | |
| 130 help="filename with segments. Also accepts a glob in parentheses [default=%default]." ) | |
| 131 | |
| 132 parser.add_option("-w", "--workspace-file", "--workspace", dest="workspace_files", type="string", action="append", | |
| 133 help="filename with workspace segments. Also accepts a glob in parentheses [default=%default]." ) | |
| 134 | |
| 135 parser.add_option("-i", "--isochore-file", "--isochores", dest="isochore_files", type="string", action="append", | |
| 136 help="filename with isochore segments. Also accepts a glob in parentheses [default=%default]." ) | |
| 137 | |
| 138 parser.add_option("-o", "--order", dest="output_order", type="choice", | |
| 139 choices = ( "track", "annotation", "fold", "pvalue", "qvalue" ), | |
| 140 help="order results in output by fold, track, etc. [default=%default]." ) | |
| 141 | |
| 142 parser.add_option("-q", "--qvalue-method", dest="qvalue_method", type="choice", | |
| 143 choices = ( "storey", "BH", "bonferroni", "holm", "hommel", "hochberg", "BY", "none" ), | |
| 144 help="method to perform multiple testing correction by controlling the fdr [default=%default]." ) | |
| 145 | |
| 146 parser.add_option( "--qvalue-lambda", dest="qvalue_lambda", type="float", | |
| 147 help="fdr computation: lambda [default=%default]." ) | |
| 148 | |
| 149 parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice", | |
| 150 choices = ("smoother", "bootstrap" ), | |
| 151 help="fdr computation: method for estimating pi0 [default=%default]." ) | |
| 152 parser.add_option( "--descriptions", dest="input_filename_descriptions", type="string", | |
| 153 help="filename mapping annotation terms to descriptions. " | |
| 154 " if given, the output table will contain additional columns " | |
| 155 " [default=%default]" ) | |
| 156 | |
| 157 parser.add_option( "--ignore-segment-tracks", dest="ignore_segment_tracks", action="store_true", | |
| 158 help="ignore segment tracks - all segments belong to one track [default=%default]" ) | |
| 159 | |
| 160 parser.add_option( "--enable-split-tracks", dest="enable_split_tracks", action="store_true", | |
| 161 help="permit the same track to be in multiple files [default=%default]" ) | |
| 162 | |
| 163 parser.add_option( "--output-bed", dest="output_bed", type="choice", action="append", | |
| 164 choices = ( "all", | |
| 165 "annotations", "segments", | |
| 166 "workspaces", "isochores", | |
| 167 "overlap" ), | |
| 168 help="output bed files [default=%default]." ) | |
| 169 | |
| 170 parser.add_option( "--output-stats", dest="output_stats", type="choice", action="append", | |
| 171 choices = ( "all", | |
| 172 "annotations", "segments", | |
| 173 "workspaces", "isochores", | |
| 174 "overlap" ), | |
| 175 help="output overlap summary stats [default=%default]." ) | |
| 176 | |
| 177 parser.add_option( "--restrict-workspace", dest="restrict_workspace", action="store_true", | |
| 178 help="restrict workspace to those segments that contain both track" | |
| 179 " and annotations [default=%default]" ) | |
| 180 | |
| 181 parser.add_option( "--counter", dest="counters", type="choice", action="append", | |
| 182 choices = ( "binom", "hyperg" ), | |
| 183 help="counter to use [default=%default]." ) | |
| 184 | |
| 185 parser.add_option( "--output-tables-pattern", dest="output_tables_pattern", type="string", | |
| 186 help="output pattern for result tables. Used if there are multiple counters used [default=%default]." ) | |
| 187 | |
| 188 parser.set_defaults( | |
| 189 annotation_files = [], | |
| 190 segment_files = [], | |
| 191 workspace_files = [], | |
| 192 sample_files = [], | |
| 193 counters = [], | |
| 194 output_stats = [], | |
| 195 output_bed = [], | |
| 196 output_tables_pattern = "%s.tsv.gz", | |
| 197 output_order = "fold", | |
| 198 input_filename_counts = None, | |
| 199 input_filename_results = None, | |
| 200 pvalue_method = "empirical", | |
| 201 output_plots_pattern = None, | |
| 202 output_samples_pattern = None, | |
| 203 qvalue_method = "storey", | |
| 204 qvalue_lambda = None, | |
| 205 qvalue_pi0_method = "smoother", | |
| 206 ignore_segment_tracks = False, | |
| 207 input_filename_descriptions = None, | |
| 208 conditional = "unconditional", | |
| 209 conditional_extension = None, | |
| 210 conditional_expansion = None, | |
| 211 restrict_workspace = False, | |
| 212 enable_split_tracks = False, | |
| 213 shift_expansion = 2.0, | |
| 214 shift_extension = 0, | |
| 215 overlap_mode = "midpoint", | |
| 216 truncate_workspace_to_annotations = False, | |
| 217 ) | |
| 218 | |
| 219 ## add common options (-h/--help, ...) and parse command line | |
| 220 (options, args) = E.Start( parser, argv = argv, add_output_options = True ) | |
| 221 | |
| 222 tstart = time.time() | |
| 223 | |
| 224 if len(options.counters) == 0: | |
| 225 options.counters.append("binom") | |
| 226 | |
| 227 ############################################ | |
| 228 segments, annotations, workspaces, isochores = IO.buildSegments( options ) | |
| 229 E.info( "intervals loaded in %i seconds" % (time.time() - tstart) ) | |
| 230 | |
| 231 # filter segments by workspace | |
| 232 workspace = IO.applyIsochores( segments, annotations, workspaces, options, isochores ) | |
| 233 | |
| 234 ############################################ | |
| 235 description_header, descriptions, description_width = IO.readDescriptions( options ) | |
| 236 | |
| 237 ############################################ | |
| 238 ############################################ | |
| 239 ## compute per contig | |
| 240 | |
| 241 # compute bases covered by workspace | |
| 242 workspace2basecoverage, isochores = {}, [] | |
| 243 for contig, ww in workspace.iteritems(): | |
| 244 workspace2basecoverage[contig] = ww.sum() | |
| 245 isochores.append( contig ) | |
| 246 | |
| 247 # compute percentage of bases covered by annotations in workspace | |
| 248 # per isochore | |
| 249 annotation2basecoverage = collections.defaultdict( dict ) | |
| 250 for annotation, aa in annotations.iteritems(): | |
| 251 for isochore, a in aa.iteritems(): | |
| 252 # need to truncate to workspace? | |
| 253 annotation2basecoverage[annotation][isochore] = a.sum() | |
| 254 | |
| 255 results_per_contig = collections.defaultdict( list ) | |
| 256 | |
| 257 E.info( "computing counts per isochore" ) | |
| 258 # results per isochore | |
| 259 def emptyResult( segment, annotation, isochore, | |
| 260 counter, | |
| 261 nsegments_in_workspace, | |
| 262 basecoverage_annotation, | |
| 263 basecoverage_workspace): | |
| 264 return GREAT_RESULT._make( ( | |
| 265 segment, annotation, isochore, | |
| 266 counter, | |
| 267 0, # observed | |
| 268 0, # expected | |
| 269 nsegments_in_workspace, | |
| 270 0, # nannotations_in_workspace | |
| 271 0, # nsegments_overlapping_annotation | |
| 272 0, # nannotations_overlapping_segments | |
| 273 0, # basecoverage_intersection | |
| 274 0, # basecoverage_segments | |
| 275 basecoverage_annotation, | |
| 276 basecoverage_workspace, | |
| 277 0.0, | |
| 278 1.0, | |
| 279 1.0, | |
| 280 1.0 )) | |
| 281 | |
| 282 for isochore in isochores: | |
| 283 basecoverage_workspace = workspace2basecoverage[isochore] | |
| 284 | |
| 285 # iterate over all isochores | |
| 286 for segment, segmentdict in segments.iteritems(): | |
| 287 try: | |
| 288 ss = segmentdict[isochore] | |
| 289 # select segments overlapping workspace | |
| 290 segments_in_workspace = csegmentlist.SegmentList( clone = ss ) | |
| 291 segments_in_workspace.intersect( workspace[isochore] ) | |
| 292 # number of segments in workspace | |
| 293 nsegments_in_workspace = len(segments_in_workspace) | |
| 294 except KeyError: | |
| 295 ss = None | |
| 296 | |
| 297 basecoverage_segments = segments_in_workspace.sum() | |
| 298 | |
| 299 for annotation, annotationdict in annotations.iteritems(): | |
| 300 | |
| 301 # if annotation != "GO:0030957": continue | |
| 302 | |
| 303 try: | |
| 304 aa = annotationdict[isochore] | |
| 305 except KeyError: | |
| 306 aa = None | |
| 307 | |
| 308 # p_A: proportion of bases covered by annotation | |
| 309 try: | |
| 310 basecoverage_annotation = annotation2basecoverage[annotation][isochore] | |
| 311 except KeyError: | |
| 312 basecoverage_annotation = 0 | |
| 313 | |
| 314 if ss == None or aa == None: | |
| 315 for counter in options.counters: | |
| 316 results_per_contig[(counter,segment,annotation)].append( emptyResult(segment, annotation, | |
| 317 isochore, | |
| 318 counter, | |
| 319 nsegments_in_workspace, | |
| 320 basecoverage_annotation, | |
| 321 basecoverage_workspace ) ) | |
| 322 continue | |
| 323 | |
| 324 # select segments overlapping annotation | |
| 325 segments_overlapping_annotation = csegmentlist.SegmentList( clone = ss ) | |
| 326 segments_overlapping_annotation.intersect( annotations[annotation][isochore] ) | |
| 327 # number of segments in annotation | |
| 328 nsegments_overlapping_annotation = ss.intersectionWithSegments( annotations[annotation][isochore], | |
| 329 mode = options.overlap_mode ) | |
| 330 | |
| 331 # number of nucleotides at the intersection of segments, annotation and workspace | |
| 332 basecoverage_intersection = segments_overlapping_annotation.sum() | |
| 333 | |
| 334 annotations_overlapping_segments = csegmentlist.SegmentList( clone = aa ) | |
| 335 annotations_overlapping_segments.intersect( ss ) | |
| 336 nannotations_overlapping_segments = len( annotations_overlapping_segments ) | |
| 337 | |
| 338 nannotations_in_workspace = len( aa ) | |
| 339 if nannotations_in_workspace == 0: | |
| 340 for counter in options.counters: | |
| 341 results_per_contig[(counter,segment,annotation)].append( emptyResult(segment, | |
| 342 annotation, | |
| 343 isochore, | |
| 344 counter, | |
| 345 nsegments_in_workspace, | |
| 346 basecoverage_annotation, | |
| 347 basecoverage_workspace ) ) | |
| 348 continue | |
| 349 | |
| 350 fraction_coverage_annotation = basecoverage_annotation / float( basecoverage_workspace ) | |
| 351 fraction_hit_annotation = float(nannotations_overlapping_segments) / nannotations_in_workspace | |
| 352 | |
| 353 for counter in options.counters: | |
| 354 if counter.startswith( "binom" ): | |
| 355 # GREAT binomial probability over "regions" | |
| 356 # n = number of genomic regions = nannotations_in_workspace | |
| 357 # ppi = fraction of genome annotated by annotation = fraction_coverage_annotation | |
| 358 # kpi = genomic regions with annotation hit by segments = nannotations_in_segments | |
| 359 # sf = survival functions = 1 -cdf | |
| 360 # probability of observing >kpi in a sample of n where the probabily of succes is | |
| 361 # ppi. | |
| 362 pvalue = scipy.stats.binom.sf( nsegments_overlapping_annotation - 1, | |
| 363 nsegments_in_workspace, | |
| 364 fraction_coverage_annotation ) | |
| 365 | |
| 366 expected = fraction_coverage_annotation * nsegments_in_workspace | |
| 367 observed = nsegments_overlapping_annotation | |
| 368 | |
| 369 elif counter.startswith( "hyperg" ): | |
| 370 | |
| 371 # hypergeometric probability over nucleotides | |
| 372 # Sampling without replacement | |
| 373 # x,M,n,M | |
| 374 # x = observed number of nucleotides in overlap of segments,annotations and workspace | |
| 375 # M = number of nucleotides in workspace | |
| 376 # n = number of nucleotides in annotations (and workspace) | |
| 377 # N = number of nucleotides in segments (and workspace) | |
| 378 # P-value of obtaining >x number of nucleotides overlapping. | |
| 379 rv = scipy.stats.hypergeom( basecoverage_workspace, | |
| 380 basecoverage_annotation, | |
| 381 basecoverage_segments ) | |
| 382 | |
| 383 pvalue = rv.sf( basecoverage_intersection ) | |
| 384 expected = rv.mean() | |
| 385 observed = basecoverage_intersection | |
| 386 | |
| 387 if expected != 0: | |
| 388 fold = float(observed) / expected | |
| 389 else: | |
| 390 fold = 1.0 | |
| 391 | |
| 392 r = GREAT_RESULT._make( ( | |
| 393 segment, annotation, isochore, | |
| 394 counter, | |
| 395 observed, | |
| 396 expected, | |
| 397 nsegments_in_workspace, | |
| 398 nannotations_in_workspace, | |
| 399 nsegments_overlapping_annotation, | |
| 400 nannotations_overlapping_segments, | |
| 401 basecoverage_intersection, | |
| 402 basecoverage_segments, | |
| 403 basecoverage_annotation, | |
| 404 basecoverage_workspace, | |
| 405 fraction_coverage_annotation, | |
| 406 fold, | |
| 407 pvalue, | |
| 408 1.0 )) | |
| 409 # print "\t".join( map(str, r)) | |
| 410 results_per_contig[ (counter,segment,annotation) ].append( r ) | |
| 411 | |
| 412 E.info( "merging counts per isochore" ) | |
| 413 | |
| 414 # compute sums | |
| 415 results = [] | |
| 416 | |
| 417 for niteration, pair in enumerate(results_per_contig.iteritems()): | |
| 418 | |
| 419 counter, segment, annotation = pair[0] | |
| 420 data = pair[1] | |
| 421 | |
| 422 nsegments_in_workspace = sum( [x.nsegments_in_workspace for x in data ] ) | |
| 423 nsegments_overlapping_annotation = sum( [x.observed for x in data ] ) | |
| 424 nannotations_in_workspace = sum( [x.nannotations_in_workspace for x in data ] ) | |
| 425 nannotations_overlapping_segments = sum( [x.nannotations_overlapping_segments for x in data ] ) | |
| 426 | |
| 427 basecoverage_intersection = sum( [x.basecoverage_intersection for x in data ] ) | |
| 428 basecoverage_segments = sum( [x.basecoverage_segments for x in data ] ) | |
| 429 basecoverage_annotation = sum( [x.basecoverage_annotation for x in data ] ) | |
| 430 basecoverage_workspace = sum( [x.basecoverage_workspace for x in data ] ) | |
| 431 | |
| 432 fraction_coverage_annotation = basecoverage_annotation / float( basecoverage_workspace ) | |
| 433 | |
| 434 if counter.startswith( "binom" ): | |
| 435 pvalue = scipy.stats.binom.sf( nsegments_overlapping_annotation-1, | |
| 436 nsegments_in_workspace, | |
| 437 fraction_coverage_annotation ) | |
| 438 expected = fraction_coverage_annotation * nsegments_in_workspace | |
| 439 observed = nsegments_overlapping_annotation | |
| 440 elif counter.startswith( "hyperg" ): | |
| 441 rv = scipy.stats.hypergeom( basecoverage_workspace, | |
| 442 basecoverage_annotation, | |
| 443 basecoverage_segments ) | |
| 444 | |
| 445 pvalue = rv.sf( basecoverage_intersection ) | |
| 446 expected = rv.mean() | |
| 447 observed = basecoverage_intersection | |
| 448 | |
| 449 | |
| 450 if expected != 0: | |
| 451 fold = float(observed) / expected | |
| 452 else: | |
| 453 fold = 1.0 | |
| 454 | |
| 455 r = GREAT_RESULT._make( ( | |
| 456 segment, annotation, "all", | |
| 457 counter, | |
| 458 observed, | |
| 459 expected, | |
| 460 nsegments_in_workspace, | |
| 461 nannotations_in_workspace, | |
| 462 nsegments_overlapping_annotation, | |
| 463 nannotations_overlapping_segments, | |
| 464 basecoverage_intersection, | |
| 465 basecoverage_segments, | |
| 466 basecoverage_annotation, | |
| 467 basecoverage_workspace, | |
| 468 fraction_coverage_annotation, | |
| 469 fold, | |
| 470 pvalue, | |
| 471 1.0 )) | |
| 472 | |
| 473 results.append( r ) | |
| 474 | |
| 475 IO.outputResults( results, | |
| 476 options, | |
| 477 GREAT_RESULT._fields, | |
| 478 description_header, | |
| 479 description_width, | |
| 480 descriptions ) | |
| 481 | |
| 482 E.Stop() | |
| 483 | |
| 484 if __name__ == "__main__": | |
| 485 sys.exit( main( sys.argv) ) | |
| 486 | |
| 487 | |
| 488 |
