Mercurial > repos > fubar > genomic_association_tester
diff rlGAT/gat-great.py @ 11:53487f21c0d5 draft
Uploaded
| author | fubar |
|---|---|
| date | Thu, 29 Aug 2013 01:57:54 -0400 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rlGAT/gat-great.py Thu Aug 29 01:57:54 2013 -0400 @@ -0,0 +1,488 @@ +################################################################################ +# +# MRC FGU Computational Genomics Group +# +# $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $ +# +# Copyright (C) 2009 Andreas Heger +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +################################################################################# +''' +great - run great analysis +========================== + +:Author: Andreas Heger +:Release: $Id$ +:Date: |today| +:Tags: Python + +Purpose +------- + +This script compares one or more genomic segments of interest +against one more other genomic annotations. + +GREAT says that a segment overlaps an annotation if the midpoint +of the segment overlaps the annotation. + +The GREAT analysis is done on a per-isochore basis and overall. +For the overall analysis, there is no isochore correction +as the union of all isochores is treated as a single workspace. + +This script implies several counters (``--counter`` option): + +binom + binomial model (as implemented by GREAT). Returns the probability + of a certain number of segments overlapping (mid-point overlap) + an annotation + +hyperg + hypergeometric model. Returns the probabibility of a certain + number of nucleotides within segments overlapping an annotation. + Conceptually, all genomic positions are treated independently and + segments are fragmented into a collection of single bases that are + placed indepently. This is valid for point intervals such as SNPs, + but of course ignores segment size, though it might be used as a + guide towards the expected fold enrichment, that can be computed + more accurately using :mod:`gat-run.py` + +Usage +----- + +Example:: + + python gat-run.py + --segment-file=segments.bed.gz + --workspace-file=workspace.bed.gz + --annotation-file=annotations_architecture.bed.gz + +Type:: + + python gat-run.py --help + +for command line help. + +Documentation +------------- + +Code +---- + +''' + +import os, sys, re, optparse, collections, types, glob, time +import numpy +import numpy.random + +import gat +import gat.Experiment as E +import gat.IOTools as IOTools +import gat.IO as IO +import csegmentlist + +import scipy.stats + +GREAT_RESULT = collections.namedtuple( "GREAT", + ("track", + "annotation", + "isochore", + "counter", + "observed", # nsegments_overlapping_annotations + "expected", + "nsegments_in_workspace", + "nannotations_in_workspace", + "nsegments_overlapping_annotation", + "nannotations_overlapping_segments", + "basecoverage_intersection", + "basecoverage_segments", + "basecoverage_annotation", + "basecoverage_workspace", + "fraction_coverage_annotation", + "fold", + "pvalue", + "qvalue" ) ) + +def main( argv ): + + if not argv: argv = sys.argv + + # setup command line parser + parser = optparse.OptionParser( version = "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $", + usage = globals()["__doc__"] ) + + parser.add_option("-a", "--annotation-file", "--annotations", dest="annotation_files", type="string", action="append", + help="filename with annotations [default=%default]." ) + + parser.add_option("-s", "--segment-file", "--segments", dest="segment_files", type="string", action="append", + help="filename with segments. Also accepts a glob in parentheses [default=%default]." ) + + parser.add_option("-w", "--workspace-file", "--workspace", dest="workspace_files", type="string", action="append", + help="filename with workspace segments. Also accepts a glob in parentheses [default=%default]." ) + + parser.add_option("-i", "--isochore-file", "--isochores", dest="isochore_files", type="string", action="append", + help="filename with isochore segments. Also accepts a glob in parentheses [default=%default]." ) + + parser.add_option("-o", "--order", dest="output_order", type="choice", + choices = ( "track", "annotation", "fold", "pvalue", "qvalue" ), + help="order results in output by fold, track, etc. [default=%default]." ) + + parser.add_option("-q", "--qvalue-method", dest="qvalue_method", type="choice", + choices = ( "storey", "BH", "bonferroni", "holm", "hommel", "hochberg", "BY", "none" ), + help="method to perform multiple testing correction by controlling the fdr [default=%default]." ) + + parser.add_option( "--qvalue-lambda", dest="qvalue_lambda", type="float", + help="fdr computation: lambda [default=%default]." ) + + parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice", + choices = ("smoother", "bootstrap" ), + help="fdr computation: method for estimating pi0 [default=%default]." ) + parser.add_option( "--descriptions", dest="input_filename_descriptions", type="string", + help="filename mapping annotation terms to descriptions. " + " if given, the output table will contain additional columns " + " [default=%default]" ) + + parser.add_option( "--ignore-segment-tracks", dest="ignore_segment_tracks", action="store_true", + help="ignore segment tracks - all segments belong to one track [default=%default]" ) + + parser.add_option( "--enable-split-tracks", dest="enable_split_tracks", action="store_true", + help="permit the same track to be in multiple files [default=%default]" ) + + parser.add_option( "--output-bed", dest="output_bed", type="choice", action="append", + choices = ( "all", + "annotations", "segments", + "workspaces", "isochores", + "overlap" ), + help="output bed files [default=%default]." ) + + parser.add_option( "--output-stats", dest="output_stats", type="choice", action="append", + choices = ( "all", + "annotations", "segments", + "workspaces", "isochores", + "overlap" ), + help="output overlap summary stats [default=%default]." ) + + parser.add_option( "--restrict-workspace", dest="restrict_workspace", action="store_true", + help="restrict workspace to those segments that contain both track" + " and annotations [default=%default]" ) + + parser.add_option( "--counter", dest="counters", type="choice", action="append", + choices = ( "binom", "hyperg" ), + help="counter to use [default=%default]." ) + + parser.add_option( "--output-tables-pattern", dest="output_tables_pattern", type="string", + help="output pattern for result tables. Used if there are multiple counters used [default=%default]." ) + + parser.set_defaults( + annotation_files = [], + segment_files = [], + workspace_files = [], + sample_files = [], + counters = [], + output_stats = [], + output_bed = [], + output_tables_pattern = "%s.tsv.gz", + output_order = "fold", + input_filename_counts = None, + input_filename_results = None, + pvalue_method = "empirical", + output_plots_pattern = None, + output_samples_pattern = None, + qvalue_method = "storey", + qvalue_lambda = None, + qvalue_pi0_method = "smoother", + ignore_segment_tracks = False, + input_filename_descriptions = None, + conditional = "unconditional", + conditional_extension = None, + conditional_expansion = None, + restrict_workspace = False, + enable_split_tracks = False, + shift_expansion = 2.0, + shift_extension = 0, + overlap_mode = "midpoint", + truncate_workspace_to_annotations = False, + ) + + ## add common options (-h/--help, ...) and parse command line + (options, args) = E.Start( parser, argv = argv, add_output_options = True ) + + tstart = time.time() + + if len(options.counters) == 0: + options.counters.append("binom") + + ############################################ + segments, annotations, workspaces, isochores = IO.buildSegments( options ) + E.info( "intervals loaded in %i seconds" % (time.time() - tstart) ) + + # filter segments by workspace + workspace = IO.applyIsochores( segments, annotations, workspaces, options, isochores ) + + ############################################ + description_header, descriptions, description_width = IO.readDescriptions( options ) + + ############################################ + ############################################ + ## compute per contig + + # compute bases covered by workspace + workspace2basecoverage, isochores = {}, [] + for contig, ww in workspace.iteritems(): + workspace2basecoverage[contig] = ww.sum() + isochores.append( contig ) + + # compute percentage of bases covered by annotations in workspace + # per isochore + annotation2basecoverage = collections.defaultdict( dict ) + for annotation, aa in annotations.iteritems(): + for isochore, a in aa.iteritems(): + # need to truncate to workspace? + annotation2basecoverage[annotation][isochore] = a.sum() + + results_per_contig = collections.defaultdict( list ) + + E.info( "computing counts per isochore" ) + # results per isochore + def emptyResult( segment, annotation, isochore, + counter, + nsegments_in_workspace, + basecoverage_annotation, + basecoverage_workspace): + return GREAT_RESULT._make( ( + segment, annotation, isochore, + counter, + 0, # observed + 0, # expected + nsegments_in_workspace, + 0, # nannotations_in_workspace + 0, # nsegments_overlapping_annotation + 0, # nannotations_overlapping_segments + 0, # basecoverage_intersection + 0, # basecoverage_segments + basecoverage_annotation, + basecoverage_workspace, + 0.0, + 1.0, + 1.0, + 1.0 )) + + for isochore in isochores: + basecoverage_workspace = workspace2basecoverage[isochore] + + # iterate over all isochores + for segment, segmentdict in segments.iteritems(): + try: + ss = segmentdict[isochore] + # select segments overlapping workspace + segments_in_workspace = csegmentlist.SegmentList( clone = ss ) + segments_in_workspace.intersect( workspace[isochore] ) + # number of segments in workspace + nsegments_in_workspace = len(segments_in_workspace) + except KeyError: + ss = None + + basecoverage_segments = segments_in_workspace.sum() + + for annotation, annotationdict in annotations.iteritems(): + + # if annotation != "GO:0030957": continue + + try: + aa = annotationdict[isochore] + except KeyError: + aa = None + + # p_A: proportion of bases covered by annotation + try: + basecoverage_annotation = annotation2basecoverage[annotation][isochore] + except KeyError: + basecoverage_annotation = 0 + + if ss == None or aa == None: + for counter in options.counters: + results_per_contig[(counter,segment,annotation)].append( emptyResult(segment, annotation, + isochore, + counter, + nsegments_in_workspace, + basecoverage_annotation, + basecoverage_workspace ) ) + continue + + # select segments overlapping annotation + segments_overlapping_annotation = csegmentlist.SegmentList( clone = ss ) + segments_overlapping_annotation.intersect( annotations[annotation][isochore] ) + # number of segments in annotation + nsegments_overlapping_annotation = ss.intersectionWithSegments( annotations[annotation][isochore], + mode = options.overlap_mode ) + + # number of nucleotides at the intersection of segments, annotation and workspace + basecoverage_intersection = segments_overlapping_annotation.sum() + + annotations_overlapping_segments = csegmentlist.SegmentList( clone = aa ) + annotations_overlapping_segments.intersect( ss ) + nannotations_overlapping_segments = len( annotations_overlapping_segments ) + + nannotations_in_workspace = len( aa ) + if nannotations_in_workspace == 0: + for counter in options.counters: + results_per_contig[(counter,segment,annotation)].append( emptyResult(segment, + annotation, + isochore, + counter, + nsegments_in_workspace, + basecoverage_annotation, + basecoverage_workspace ) ) + continue + + fraction_coverage_annotation = basecoverage_annotation / float( basecoverage_workspace ) + fraction_hit_annotation = float(nannotations_overlapping_segments) / nannotations_in_workspace + + for counter in options.counters: + if counter.startswith( "binom" ): + # GREAT binomial probability over "regions" + # n = number of genomic regions = nannotations_in_workspace + # ppi = fraction of genome annotated by annotation = fraction_coverage_annotation + # kpi = genomic regions with annotation hit by segments = nannotations_in_segments + # sf = survival functions = 1 -cdf + # probability of observing >kpi in a sample of n where the probabily of succes is + # ppi. + pvalue = scipy.stats.binom.sf( nsegments_overlapping_annotation - 1, + nsegments_in_workspace, + fraction_coverage_annotation ) + + expected = fraction_coverage_annotation * nsegments_in_workspace + observed = nsegments_overlapping_annotation + + elif counter.startswith( "hyperg" ): + + # hypergeometric probability over nucleotides + # Sampling without replacement + # x,M,n,M + # x = observed number of nucleotides in overlap of segments,annotations and workspace + # M = number of nucleotides in workspace + # n = number of nucleotides in annotations (and workspace) + # N = number of nucleotides in segments (and workspace) + # P-value of obtaining >x number of nucleotides overlapping. + rv = scipy.stats.hypergeom( basecoverage_workspace, + basecoverage_annotation, + basecoverage_segments ) + + pvalue = rv.sf( basecoverage_intersection ) + expected = rv.mean() + observed = basecoverage_intersection + + if expected != 0: + fold = float(observed) / expected + else: + fold = 1.0 + + r = GREAT_RESULT._make( ( + segment, annotation, isochore, + counter, + observed, + expected, + nsegments_in_workspace, + nannotations_in_workspace, + nsegments_overlapping_annotation, + nannotations_overlapping_segments, + basecoverage_intersection, + basecoverage_segments, + basecoverage_annotation, + basecoverage_workspace, + fraction_coverage_annotation, + fold, + pvalue, + 1.0 )) + # print "\t".join( map(str, r)) + results_per_contig[ (counter,segment,annotation) ].append( r ) + + E.info( "merging counts per isochore" ) + + # compute sums + results = [] + + for niteration, pair in enumerate(results_per_contig.iteritems()): + + counter, segment, annotation = pair[0] + data = pair[1] + + nsegments_in_workspace = sum( [x.nsegments_in_workspace for x in data ] ) + nsegments_overlapping_annotation = sum( [x.observed for x in data ] ) + nannotations_in_workspace = sum( [x.nannotations_in_workspace for x in data ] ) + nannotations_overlapping_segments = sum( [x.nannotations_overlapping_segments for x in data ] ) + + basecoverage_intersection = sum( [x.basecoverage_intersection for x in data ] ) + basecoverage_segments = sum( [x.basecoverage_segments for x in data ] ) + basecoverage_annotation = sum( [x.basecoverage_annotation for x in data ] ) + basecoverage_workspace = sum( [x.basecoverage_workspace for x in data ] ) + + fraction_coverage_annotation = basecoverage_annotation / float( basecoverage_workspace ) + + if counter.startswith( "binom" ): + pvalue = scipy.stats.binom.sf( nsegments_overlapping_annotation-1, + nsegments_in_workspace, + fraction_coverage_annotation ) + expected = fraction_coverage_annotation * nsegments_in_workspace + observed = nsegments_overlapping_annotation + elif counter.startswith( "hyperg" ): + rv = scipy.stats.hypergeom( basecoverage_workspace, + basecoverage_annotation, + basecoverage_segments ) + + pvalue = rv.sf( basecoverage_intersection ) + expected = rv.mean() + observed = basecoverage_intersection + + + if expected != 0: + fold = float(observed) / expected + else: + fold = 1.0 + + r = GREAT_RESULT._make( ( + segment, annotation, "all", + counter, + observed, + expected, + nsegments_in_workspace, + nannotations_in_workspace, + nsegments_overlapping_annotation, + nannotations_overlapping_segments, + basecoverage_intersection, + basecoverage_segments, + basecoverage_annotation, + basecoverage_workspace, + fraction_coverage_annotation, + fold, + pvalue, + 1.0 )) + + results.append( r ) + + IO.outputResults( results, + options, + GREAT_RESULT._fields, + description_header, + description_width, + descriptions ) + + E.Stop() + +if __name__ == "__main__": + sys.exit( main( sys.argv) ) + + +
