# HG changeset patch
# User fubar
# Date 1377755874 14400
# Node ID 53487f21c0d5df2d1ffb0286ddb77fab5a798d19
# Parent f04dfb37d1bbe99786fbcc20b7657167daf0f67f
Uploaded
diff -r f04dfb37d1bb -r 53487f21c0d5 rlGAT/gat-compare.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rlGAT/gat-compare.py Thu Aug 29 01:57:54 2013 -0400
@@ -0,0 +1,308 @@
+################################################################################
+#
+# MRC FGU Computational Genomics Group
+#
+# $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $
+#
+# Copyright (C) 2009 Andreas Heger
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#################################################################################
+'''
+gat-compare - compare two gat runs
+==================================
+
+:Author: Andreas Heger
+:Release: $Id$
+:Date: |today|
+:Tags: Python
+
+Purpose
+-------
+
+This script compares gat runs and compares statistical significance of fold change differences between two or more
+regions of interest.
+
+This script requires counts files created by gat (option ``--output-counts-file``).
+
+The reported observed value between sets 1 and 2 is the difference of fc2 and fc1:
+
+ * observed = fold2 - fold1
+
+The script accepts one or more output files from a gat run.
+
+If only a single file is given, the significance of fold change differences are computed
+for each pairwise comparison of annotations. Thus, the script might answer the question
+if a transcription factor is more enriched in promotors than in enhancers.
+
+If multiple files are given, the script computes the fold change differences for the same
+annotation for all pairwise combinations of :term:`segments of interest` across the different
+files. Thus, the script might answer the question if transcription factor A is more enriched
+in promotors than transcription factor B is enriched in promotors.
+
+Usage
+-----
+
+Example::
+
+ python gat-compare.py tfA.counts.tsv.gz tfB.counts.tsv.gz
+
+Type::
+
+ python gat-compare.py --help
+
+for command line help.
+
+Documentation
+-------------
+
+Code
+----
+
+'''
+
+import os, sys, re, optparse, collections, types, glob, time, math
+import itertools
+import numpy
+
+import gat
+import gat.Experiment as E
+import gat.IOTools as IOTools
+import gat.IO as IO
+
+def main( argv = None ):
+ """script main.
+
+ parses command line options in sys.argv, unless *argv* is given.
+ """
+
+ if not argv: argv = sys.argv
+
+ # setup command line parser
+ parser = optparse.OptionParser( version = "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
+ usage = globals()["__doc__"] )
+
+ parser.add_option("-o", "--order", dest="output_order", type="choice",
+ choices = ( "track", "annotation", "fold", "pvalue", "qvalue", "observed" ),
+ help="order results in output by fold, track, etc. [default=%default]." )
+
+ parser.add_option("-p", "--pvalue-method", dest="pvalue_method", type="choice",
+ choices = ( "empirical", "norm", ),
+ help="type of pvalue reported [default=%default]." )
+
+ parser.add_option("-q", "--qvalue-method", dest="qvalue_method", type="choice",
+ choices = ( "storey", "BH", "bonferroni", "holm", "hommel", "hochberg", "BY", "none" ),
+ help="method to perform multiple testing correction by controlling the fdr [default=%default]." )
+
+ parser.add_option( "--qvalue-lambda", dest="qvalue_lambda", type="float",
+ help="fdr computation: lambda [default=%default]." )
+
+ parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
+ choices = ("smoother", "bootstrap" ),
+ help="fdr computation: method for estimating pi0 [default=%default]." )
+
+ parser.add_option( "--descriptions", dest="input_filename_descriptions", type="string",
+ help="filename mapping annotation terms to descriptions. "
+ " if given, the output table will contain additional columns "
+ " [default=%default]" )
+
+ parser.add_option( "--pseudo-count", dest="pseudo_count", type="float",
+ help="pseudo count. The pseudo count is added to both the observed and expected overlap. "
+ " Using a pseudo-count avoids gat reporting fold changes of 0 [default=%default]." )
+
+ parser.add_option( "--output-plots-pattern", dest="output_plots_pattern", type="string",
+ help="output pattern for plots [default=%default]" )
+
+ parser.set_defaults(
+ pvalue_method = "empirical",
+ qvalue_method = "BH",
+ qvalue_lambda = None,
+ qvalue_pi0_method = "smoother",
+ # pseudo count for fold change computation to avoid 0 fc
+ pseudo_count = 1.0,
+ output_order = "observed",
+ )
+
+ ## add common options (-h/--help, ...) and parse command line
+ (options, args) = E.Start( parser, argv = argv, add_output_options = True )
+
+ input_filenames_counts = args
+
+ ##################################################
+ E.info( "received %i filenames with counts" % len(input_filenames_counts))
+
+ ##################################################
+ description_header, descriptions, description_width = IO.readDescriptions( options )
+
+ all_annotator_results = []
+
+ for input_filename_counts in input_filenames_counts:
+
+ E.info("processing %s" % input_filename_counts )
+
+ annotator_results = gat.fromCounts( input_filename_counts )
+
+ ##################################################
+ if options.pvalue_method != "empirical":
+ E.info("updating pvalues to %s" % options.pvalue_method )
+ gat.updatePValues( annotator_results, options.pvalue_method )
+
+ ##################################################
+ ##################################################
+ ##################################################
+ ## compute global fdr
+ ##################################################
+ E.info( "computing FDR statistics" )
+ gat.updateQValues( annotator_results,
+ method = options.qvalue_method,
+ vlambda = options.qvalue_lambda,
+ pi0_method = options.qvalue_pi0_method )
+
+ all_annotator_results.append( annotator_results )
+
+ pseudo_count = options.pseudo_count
+ results = []
+
+ if len(all_annotator_results) == 1:
+ E.info("performing pairwise comparison within a file")
+
+ # collect all annotations
+ annotations, segments = list(), set()
+ for x in all_annotator_results[0]:
+ segments.add( x.track)
+ annotations.append( x )
+
+ if len(segments) != 1:
+ raise NotImplementedError( "multiple segments of interest")
+
+ for data1, data2 in itertools.combinations(annotations, 2):
+
+ # note that fold changes can be very large if there are 0 samples
+ # this is fine for getting the distributional params (mean, stddev)
+ fold_changes1 = data1.observed / (data1.samples + pseudo_count )
+ fold_changes2 = data2.observed / (data2.samples + pseudo_count )
+
+ # add a separate fc pseudo-count to avoid 0 values
+ fold_changes1 += 0.0001
+ fold_changes2 += 0.0001
+
+ # Test is if relative fold change rfc is different from 1
+ # note: rfc = fc1 / fc2 = obs1 / exp1 * obs2 / exp2
+ # = obs1 / obs2 * exp2 / exp1
+ # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1
+ #
+ # Convert to log space for easier plotting
+ # Move the observed fold ratio in order to get an idea of the magnitude
+ # of the underlying fold change
+ delta_fold = data2.fold - data1.fold
+ sampled_delta_fold = numpy.log( fold_changes1 / fold_changes2) + delta_fold
+ observed_delta_fold = 0.0 + delta_fold
+
+ result = gat.AnnotatorResult( data1.annotation, data2.annotation,
+ "na",
+ observed_delta_fold,
+ sampled_delta_fold,
+ reference = None,
+ pseudo_count = 0 )
+
+
+ results.append( result )
+
+
+ else:
+ E.info("performing pairwise comparison between files")
+
+ ##################################################
+ # perform pairwise comparison
+ for index1, index2 in itertools.combinations( range(len(input_filenames_counts)), 2):
+ E.info( "comparing %i and %i" % (index1,index2))
+ a,b = all_annotator_results[index1], all_annotator_results[index2]
+
+ # index results in a and b
+ aa = collections.defaultdict( dict )
+ for x in a: aa[x.track][x.annotation] = x
+
+ bb = collections.defaultdict( dict )
+ for x in b: bb[x.track][x.annotation] = x
+
+ if len(aa.keys()) != 1 or len(bb.keys()) != 1:
+ raise NotImplementedError( "multiple segments of interest")
+
+ track = "merged"
+ # get shared annotations
+ annotations1 = aa[track].keys()
+ annotations2 = bb[track].keys()
+ shared_annotations = list( set(annotations1).intersection( set(annotations2) ) )
+ E.info( "%i shared annotations" % len(shared_annotations))
+
+ for annotation in shared_annotations:
+
+ # if not annotation.startswith("Ram:"): continue
+
+ data1 = aa[track][annotation]
+ data2 = bb[track][annotation]
+
+ # note that fold changes can be very large if there are 0 samples
+ # this is fine for getting the distributional params (mean, stddev)
+ fold_changes1 = data1.observed / (data1.samples + pseudo_count )
+ fold_changes2 = data2.observed / (data2.samples + pseudo_count )
+
+ # add a separate fc pseudo-count to avoid 0 values
+ fold_changes1 += 0.0001
+ fold_changes2 += 0.0001
+
+ # Test is if relative fold change rfc is different from 1
+ # note: rfc = fc1 / fc2 = obs1 / exp1 * obs2 / exp2
+ # = obs1 / obs2 * exp2 / exp1
+ # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1
+ #
+ # Convert to log space for easier plotting
+ # Move the observed fold ratio in order to get an idea of the magnitude
+ # of the underlying fold change
+ delta_fold = data2.fold - data1.fold
+ sampled_delta_fold = numpy.log( fold_changes1 / fold_changes2) + delta_fold
+ observed_delta_fold = 0.0 + delta_fold
+
+ result = gat.AnnotatorResult( track, annotation,
+ "na",
+ observed_delta_fold,
+ sampled_delta_fold,
+ reference = None,
+ pseudo_count = 0 )
+
+
+ results.append( result )
+
+
+ if len(results) == 0:
+ E.critical( "no results found" )
+ E.Stop()
+ return
+
+ IO.outputResults( results,
+ options,
+ gat.AnnotatorResult.headers,
+ description_header,
+ description_width,
+ descriptions,
+ format_observed = "%6.4f" )
+
+ IO.plotResults( results, options )
+
+ ## write footer and output benchmark information.
+ E.Stop()
+
+if __name__ == "__main__":
+ sys.exit( main( sys.argv) )
diff -r f04dfb37d1bb -r 53487f21c0d5 rlGAT/gat-great.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rlGAT/gat-great.py Thu Aug 29 01:57:54 2013 -0400
@@ -0,0 +1,488 @@
+################################################################################
+#
+# MRC FGU Computational Genomics Group
+#
+# $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $
+#
+# Copyright (C) 2009 Andreas Heger
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#################################################################################
+'''
+great - run great analysis
+==========================
+
+:Author: Andreas Heger
+:Release: $Id$
+:Date: |today|
+:Tags: Python
+
+Purpose
+-------
+
+This script compares one or more genomic segments of interest
+against one more other genomic annotations.
+
+GREAT says that a segment overlaps an annotation if the midpoint
+of the segment overlaps the annotation.
+
+The GREAT analysis is done on a per-isochore basis and overall.
+For the overall analysis, there is no isochore correction
+as the union of all isochores is treated as a single workspace.
+
+This script implies several counters (``--counter`` option):
+
+binom
+ binomial model (as implemented by GREAT). Returns the probability
+ of a certain number of segments overlapping (mid-point overlap)
+ an annotation
+
+hyperg
+ hypergeometric model. Returns the probabibility of a certain
+ number of nucleotides within segments overlapping an annotation.
+ Conceptually, all genomic positions are treated independently and
+ segments are fragmented into a collection of single bases that are
+ placed indepently. This is valid for point intervals such as SNPs,
+ but of course ignores segment size, though it might be used as a
+ guide towards the expected fold enrichment, that can be computed
+ more accurately using :mod:`gat-run.py`
+
+Usage
+-----
+
+Example::
+
+ python gat-run.py
+ --segment-file=segments.bed.gz
+ --workspace-file=workspace.bed.gz
+ --annotation-file=annotations_architecture.bed.gz
+
+Type::
+
+ python gat-run.py --help
+
+for command line help.
+
+Documentation
+-------------
+
+Code
+----
+
+'''
+
+import os, sys, re, optparse, collections, types, glob, time
+import numpy
+import numpy.random
+
+import gat
+import gat.Experiment as E
+import gat.IOTools as IOTools
+import gat.IO as IO
+import csegmentlist
+
+import scipy.stats
+
+GREAT_RESULT = collections.namedtuple( "GREAT",
+ ("track",
+ "annotation",
+ "isochore",
+ "counter",
+ "observed", # nsegments_overlapping_annotations
+ "expected",
+ "nsegments_in_workspace",
+ "nannotations_in_workspace",
+ "nsegments_overlapping_annotation",
+ "nannotations_overlapping_segments",
+ "basecoverage_intersection",
+ "basecoverage_segments",
+ "basecoverage_annotation",
+ "basecoverage_workspace",
+ "fraction_coverage_annotation",
+ "fold",
+ "pvalue",
+ "qvalue" ) )
+
+def main( argv ):
+
+ if not argv: argv = sys.argv
+
+ # setup command line parser
+ parser = optparse.OptionParser( version = "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
+ usage = globals()["__doc__"] )
+
+ parser.add_option("-a", "--annotation-file", "--annotations", dest="annotation_files", type="string", action="append",
+ help="filename with annotations [default=%default]." )
+
+ parser.add_option("-s", "--segment-file", "--segments", dest="segment_files", type="string", action="append",
+ help="filename with segments. Also accepts a glob in parentheses [default=%default]." )
+
+ parser.add_option("-w", "--workspace-file", "--workspace", dest="workspace_files", type="string", action="append",
+ help="filename with workspace segments. Also accepts a glob in parentheses [default=%default]." )
+
+ parser.add_option("-i", "--isochore-file", "--isochores", dest="isochore_files", type="string", action="append",
+ help="filename with isochore segments. Also accepts a glob in parentheses [default=%default]." )
+
+ parser.add_option("-o", "--order", dest="output_order", type="choice",
+ choices = ( "track", "annotation", "fold", "pvalue", "qvalue" ),
+ help="order results in output by fold, track, etc. [default=%default]." )
+
+ parser.add_option("-q", "--qvalue-method", dest="qvalue_method", type="choice",
+ choices = ( "storey", "BH", "bonferroni", "holm", "hommel", "hochberg", "BY", "none" ),
+ help="method to perform multiple testing correction by controlling the fdr [default=%default]." )
+
+ parser.add_option( "--qvalue-lambda", dest="qvalue_lambda", type="float",
+ help="fdr computation: lambda [default=%default]." )
+
+ parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
+ choices = ("smoother", "bootstrap" ),
+ help="fdr computation: method for estimating pi0 [default=%default]." )
+ parser.add_option( "--descriptions", dest="input_filename_descriptions", type="string",
+ help="filename mapping annotation terms to descriptions. "
+ " if given, the output table will contain additional columns "
+ " [default=%default]" )
+
+ parser.add_option( "--ignore-segment-tracks", dest="ignore_segment_tracks", action="store_true",
+ help="ignore segment tracks - all segments belong to one track [default=%default]" )
+
+ parser.add_option( "--enable-split-tracks", dest="enable_split_tracks", action="store_true",
+ help="permit the same track to be in multiple files [default=%default]" )
+
+ parser.add_option( "--output-bed", dest="output_bed", type="choice", action="append",
+ choices = ( "all",
+ "annotations", "segments",
+ "workspaces", "isochores",
+ "overlap" ),
+ help="output bed files [default=%default]." )
+
+ parser.add_option( "--output-stats", dest="output_stats", type="choice", action="append",
+ choices = ( "all",
+ "annotations", "segments",
+ "workspaces", "isochores",
+ "overlap" ),
+ help="output overlap summary stats [default=%default]." )
+
+ parser.add_option( "--restrict-workspace", dest="restrict_workspace", action="store_true",
+ help="restrict workspace to those segments that contain both track"
+ " and annotations [default=%default]" )
+
+ parser.add_option( "--counter", dest="counters", type="choice", action="append",
+ choices = ( "binom", "hyperg" ),
+ help="counter to use [default=%default]." )
+
+ parser.add_option( "--output-tables-pattern", dest="output_tables_pattern", type="string",
+ help="output pattern for result tables. Used if there are multiple counters used [default=%default]." )
+
+ parser.set_defaults(
+ annotation_files = [],
+ segment_files = [],
+ workspace_files = [],
+ sample_files = [],
+ counters = [],
+ output_stats = [],
+ output_bed = [],
+ output_tables_pattern = "%s.tsv.gz",
+ output_order = "fold",
+ input_filename_counts = None,
+ input_filename_results = None,
+ pvalue_method = "empirical",
+ output_plots_pattern = None,
+ output_samples_pattern = None,
+ qvalue_method = "storey",
+ qvalue_lambda = None,
+ qvalue_pi0_method = "smoother",
+ ignore_segment_tracks = False,
+ input_filename_descriptions = None,
+ conditional = "unconditional",
+ conditional_extension = None,
+ conditional_expansion = None,
+ restrict_workspace = False,
+ enable_split_tracks = False,
+ shift_expansion = 2.0,
+ shift_extension = 0,
+ overlap_mode = "midpoint",
+ truncate_workspace_to_annotations = False,
+ )
+
+ ## add common options (-h/--help, ...) and parse command line
+ (options, args) = E.Start( parser, argv = argv, add_output_options = True )
+
+ tstart = time.time()
+
+ if len(options.counters) == 0:
+ options.counters.append("binom")
+
+ ############################################
+ segments, annotations, workspaces, isochores = IO.buildSegments( options )
+ E.info( "intervals loaded in %i seconds" % (time.time() - tstart) )
+
+ # filter segments by workspace
+ workspace = IO.applyIsochores( segments, annotations, workspaces, options, isochores )
+
+ ############################################
+ description_header, descriptions, description_width = IO.readDescriptions( options )
+
+ ############################################
+ ############################################
+ ## compute per contig
+
+ # compute bases covered by workspace
+ workspace2basecoverage, isochores = {}, []
+ for contig, ww in workspace.iteritems():
+ workspace2basecoverage[contig] = ww.sum()
+ isochores.append( contig )
+
+ # compute percentage of bases covered by annotations in workspace
+ # per isochore
+ annotation2basecoverage = collections.defaultdict( dict )
+ for annotation, aa in annotations.iteritems():
+ for isochore, a in aa.iteritems():
+ # need to truncate to workspace?
+ annotation2basecoverage[annotation][isochore] = a.sum()
+
+ results_per_contig = collections.defaultdict( list )
+
+ E.info( "computing counts per isochore" )
+ # results per isochore
+ def emptyResult( segment, annotation, isochore,
+ counter,
+ nsegments_in_workspace,
+ basecoverage_annotation,
+ basecoverage_workspace):
+ return GREAT_RESULT._make( (
+ segment, annotation, isochore,
+ counter,
+ 0, # observed
+ 0, # expected
+ nsegments_in_workspace,
+ 0, # nannotations_in_workspace
+ 0, # nsegments_overlapping_annotation
+ 0, # nannotations_overlapping_segments
+ 0, # basecoverage_intersection
+ 0, # basecoverage_segments
+ basecoverage_annotation,
+ basecoverage_workspace,
+ 0.0,
+ 1.0,
+ 1.0,
+ 1.0 ))
+
+ for isochore in isochores:
+ basecoverage_workspace = workspace2basecoverage[isochore]
+
+ # iterate over all isochores
+ for segment, segmentdict in segments.iteritems():
+ try:
+ ss = segmentdict[isochore]
+ # select segments overlapping workspace
+ segments_in_workspace = csegmentlist.SegmentList( clone = ss )
+ segments_in_workspace.intersect( workspace[isochore] )
+ # number of segments in workspace
+ nsegments_in_workspace = len(segments_in_workspace)
+ except KeyError:
+ ss = None
+
+ basecoverage_segments = segments_in_workspace.sum()
+
+ for annotation, annotationdict in annotations.iteritems():
+
+ # if annotation != "GO:0030957": continue
+
+ try:
+ aa = annotationdict[isochore]
+ except KeyError:
+ aa = None
+
+ # p_A: proportion of bases covered by annotation
+ try:
+ basecoverage_annotation = annotation2basecoverage[annotation][isochore]
+ except KeyError:
+ basecoverage_annotation = 0
+
+ if ss == None or aa == None:
+ for counter in options.counters:
+ results_per_contig[(counter,segment,annotation)].append( emptyResult(segment, annotation,
+ isochore,
+ counter,
+ nsegments_in_workspace,
+ basecoverage_annotation,
+ basecoverage_workspace ) )
+ continue
+
+ # select segments overlapping annotation
+ segments_overlapping_annotation = csegmentlist.SegmentList( clone = ss )
+ segments_overlapping_annotation.intersect( annotations[annotation][isochore] )
+ # number of segments in annotation
+ nsegments_overlapping_annotation = ss.intersectionWithSegments( annotations[annotation][isochore],
+ mode = options.overlap_mode )
+
+ # number of nucleotides at the intersection of segments, annotation and workspace
+ basecoverage_intersection = segments_overlapping_annotation.sum()
+
+ annotations_overlapping_segments = csegmentlist.SegmentList( clone = aa )
+ annotations_overlapping_segments.intersect( ss )
+ nannotations_overlapping_segments = len( annotations_overlapping_segments )
+
+ nannotations_in_workspace = len( aa )
+ if nannotations_in_workspace == 0:
+ for counter in options.counters:
+ results_per_contig[(counter,segment,annotation)].append( emptyResult(segment,
+ annotation,
+ isochore,
+ counter,
+ nsegments_in_workspace,
+ basecoverage_annotation,
+ basecoverage_workspace ) )
+ continue
+
+ fraction_coverage_annotation = basecoverage_annotation / float( basecoverage_workspace )
+ fraction_hit_annotation = float(nannotations_overlapping_segments) / nannotations_in_workspace
+
+ for counter in options.counters:
+ if counter.startswith( "binom" ):
+ # GREAT binomial probability over "regions"
+ # n = number of genomic regions = nannotations_in_workspace
+ # ppi = fraction of genome annotated by annotation = fraction_coverage_annotation
+ # kpi = genomic regions with annotation hit by segments = nannotations_in_segments
+ # sf = survival functions = 1 -cdf
+ # probability of observing >kpi in a sample of n where the probabily of succes is
+ # ppi.
+ pvalue = scipy.stats.binom.sf( nsegments_overlapping_annotation - 1,
+ nsegments_in_workspace,
+ fraction_coverage_annotation )
+
+ expected = fraction_coverage_annotation * nsegments_in_workspace
+ observed = nsegments_overlapping_annotation
+
+ elif counter.startswith( "hyperg" ):
+
+ # hypergeometric probability over nucleotides
+ # Sampling without replacement
+ # x,M,n,M
+ # x = observed number of nucleotides in overlap of segments,annotations and workspace
+ # M = number of nucleotides in workspace
+ # n = number of nucleotides in annotations (and workspace)
+ # N = number of nucleotides in segments (and workspace)
+ # P-value of obtaining >x number of nucleotides overlapping.
+ rv = scipy.stats.hypergeom( basecoverage_workspace,
+ basecoverage_annotation,
+ basecoverage_segments )
+
+ pvalue = rv.sf( basecoverage_intersection )
+ expected = rv.mean()
+ observed = basecoverage_intersection
+
+ if expected != 0:
+ fold = float(observed) / expected
+ else:
+ fold = 1.0
+
+ r = GREAT_RESULT._make( (
+ segment, annotation, isochore,
+ counter,
+ observed,
+ expected,
+ nsegments_in_workspace,
+ nannotations_in_workspace,
+ nsegments_overlapping_annotation,
+ nannotations_overlapping_segments,
+ basecoverage_intersection,
+ basecoverage_segments,
+ basecoverage_annotation,
+ basecoverage_workspace,
+ fraction_coverage_annotation,
+ fold,
+ pvalue,
+ 1.0 ))
+ # print "\t".join( map(str, r))
+ results_per_contig[ (counter,segment,annotation) ].append( r )
+
+ E.info( "merging counts per isochore" )
+
+ # compute sums
+ results = []
+
+ for niteration, pair in enumerate(results_per_contig.iteritems()):
+
+ counter, segment, annotation = pair[0]
+ data = pair[1]
+
+ nsegments_in_workspace = sum( [x.nsegments_in_workspace for x in data ] )
+ nsegments_overlapping_annotation = sum( [x.observed for x in data ] )
+ nannotations_in_workspace = sum( [x.nannotations_in_workspace for x in data ] )
+ nannotations_overlapping_segments = sum( [x.nannotations_overlapping_segments for x in data ] )
+
+ basecoverage_intersection = sum( [x.basecoverage_intersection for x in data ] )
+ basecoverage_segments = sum( [x.basecoverage_segments for x in data ] )
+ basecoverage_annotation = sum( [x.basecoverage_annotation for x in data ] )
+ basecoverage_workspace = sum( [x.basecoverage_workspace for x in data ] )
+
+ fraction_coverage_annotation = basecoverage_annotation / float( basecoverage_workspace )
+
+ if counter.startswith( "binom" ):
+ pvalue = scipy.stats.binom.sf( nsegments_overlapping_annotation-1,
+ nsegments_in_workspace,
+ fraction_coverage_annotation )
+ expected = fraction_coverage_annotation * nsegments_in_workspace
+ observed = nsegments_overlapping_annotation
+ elif counter.startswith( "hyperg" ):
+ rv = scipy.stats.hypergeom( basecoverage_workspace,
+ basecoverage_annotation,
+ basecoverage_segments )
+
+ pvalue = rv.sf( basecoverage_intersection )
+ expected = rv.mean()
+ observed = basecoverage_intersection
+
+
+ if expected != 0:
+ fold = float(observed) / expected
+ else:
+ fold = 1.0
+
+ r = GREAT_RESULT._make( (
+ segment, annotation, "all",
+ counter,
+ observed,
+ expected,
+ nsegments_in_workspace,
+ nannotations_in_workspace,
+ nsegments_overlapping_annotation,
+ nannotations_overlapping_segments,
+ basecoverage_intersection,
+ basecoverage_segments,
+ basecoverage_annotation,
+ basecoverage_workspace,
+ fraction_coverage_annotation,
+ fold,
+ pvalue,
+ 1.0 ))
+
+ results.append( r )
+
+ IO.outputResults( results,
+ options,
+ GREAT_RESULT._fields,
+ description_header,
+ description_width,
+ descriptions )
+
+ E.Stop()
+
+if __name__ == "__main__":
+ sys.exit( main( sys.argv) )
+
+
+
diff -r f04dfb37d1bb -r 53487f21c0d5 rlGAT/gat-plot.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rlGAT/gat-plot.py Thu Aug 29 01:57:54 2013 -0400
@@ -0,0 +1,237 @@
+################################################################################
+#
+# MRC FGU Computational Genomics Group
+#
+# $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $
+#
+# Copyright (C) 2009 Andreas Heger
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#################################################################################
+'''
+gat-plot - plot results from a gat analysis
+===========================================
+
+:Author: Andreas Heger
+:Release: $Id$
+:Date: |today|
+:Tags: Python
+
+Purpose
+-------
+
+This script takes the results of a ``gat-run.py` or ``gat-compare.py``
+and plots the results.
+
+This script requires matplotlib.
+
+Usage
+-----
+
+Example::
+
+ python gat-plot.py --input-filename-results=gat.results.tsv.gz
+ python gat-plot.py --input-filename-counts=gat.counts.tsv.gz
+
+Type::
+
+ python gatplot.py --help
+
+for command line help.
+
+Documentation
+-------------
+
+Code
+----
+
+'''
+
+import os, sys, re, optparse, collections, types, glob, time
+import numpy
+
+import gat
+import gat.Experiment as E
+import gat.IOTools as IOTools
+import gat.IO as IO
+
+try:
+ import matplotlib.pyplot as plt
+ HASPLOT = True
+except (ImportError,RuntimeError):
+ HASPLOT = False
+
+class DummyAnnotatorResult:
+
+ format_observed = "%i"
+ format_expected = "%6.4f"
+ format_fold = "%6.4f"
+ format_pvalue = "%6.4e"
+
+ def __init__( self ):
+ pass
+
+ @classmethod
+ def _fromLine( cls, line ):
+ x = cls()
+ data = line[:-1].split("\t")
+ x.track, x.annotation = data[:2]
+ x.observed, x.expected, x.lower95, x.upper95, x.stddev, x.fold, x.pvalue, x.qvalue = \
+ map(float, data[2:] )
+ return x
+
+ def __str__(self):
+ return "\t".join( (self.track,
+ self.annotation,
+ self.format_observed % self.observed,
+ self.format_expected % self.expected,
+ self.format_expected % self.lower95,
+ self.format_expected % self.upper95,
+ self.format_expected % self.stddev,
+ self.format_fold % self.fold,
+ self.format_pvalue % self.pvalue,
+ self.format_pvalue % self.qvalue ) )
+
+def buildPlotFilename( options, key ):
+ filename = re.sub("%s", key, options.output_plots_pattern)
+ filename = re.sub("[^a-zA-Z0-9-_./]", "_", filename )
+ dirname = os.path.dirname( filename )
+ if dirname and not os.path.exists( dirname ): os.makedirs( dirname )
+ return filename
+
+def plotBarplots( annotator_results, options ):
+ '''output a series of bar-plots.
+
+ Output for each track.
+
+ Significant results are opaque, while
+ non-significant results are transparent.'''
+
+ for track in annotator_results:
+ plt.figure()
+ r = annotator_results[track]
+ keys, values = zip( *r.items())
+ pos = range(len(r))
+ bars = plt.barh( pos, [x.fold for x in values] )
+ for b,v in zip(bars, values):
+ if v.qvalue > 0.05: b.set_alpha( 0.10 )
+
+ filename = buildPlotFilename( options, "bars-%s" % track )
+ plt.yticks( pos, keys )
+ plt.axvline( x=1, color="r")
+ plt.savefig( filename )
+
+def plotBarplot( annotator_results, options ):
+ '''output a single bar-plots.
+
+ Output for each track.
+
+ Significant results are opaque, while
+ non-significant results are transparent.'''
+
+ ntracks = len(annotator_results )
+ height = 1.0 / float(ntracks)
+
+ plt.figure()
+
+ for trackid, track in enumerate(annotator_results):
+
+ r = annotator_results[track]
+ rr = r.items()
+ rr.sort()
+ keys, values = zip(*rr)
+ pos = numpy.arange(0,len(r),1) + trackid * height
+ bars = plt.barh( pos,
+ [x.fold for x in values],
+ height = height,
+ label = track,
+ xerr = [x.stddev / x.expected for x in values],
+ color = "bryg"[trackid % 4])
+ for b,v in zip(bars, values):
+ if v.pvalue > 0.05: b.set_alpha( 0.10 )
+
+ pos = range(len(r))
+
+ plt.yticks( pos, keys )
+ plt.axvline(x=1, color = "r" )
+ filename = buildPlotFilename( options, "bars-all" )
+ plt.legend()
+ plt.savefig( filename )
+
+def main( argv = None ):
+ """script main.
+
+ parses command line options in sys.argv, unless *argv* is given.
+ """
+
+ if not argv: argv = sys.argv
+
+ # setup command line parser
+ parser = optparse.OptionParser( version = "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
+ usage = globals()["__doc__"] )
+
+ parser.add_option("-l", "--sample-file", dest="sample_files", type="string", action="append",
+ help="filename with sample files. Start processing from samples [default=%default]." )
+
+ parser.add_option("-o", "--order", dest="output_order", type="choice",
+ choices = ( "track", "annotation", "fold", "pvalue", "qvalue" ),
+ help="order results in output by fold, track, etc. [default=%default]." )
+
+ parser.add_option("-p", "--pvalue-method", dest="pvalue_method", type="choice",
+ choices = ( "empirical", "norm", ),
+ help="type of pvalue reported [default=%default]." )
+
+ parser.add_option( "--results-file", dest="input_filename_results", type="string",
+ help="start processing from results - no segments required [default=%default]." )
+
+ parser.add_option( "--output-plots-pattern", dest="output_plots_pattern", type="string",
+ help="output pattern for plots [default=%default]" )
+
+ parser.add_option( "--output-samples-pattern", dest="output_samples_pattern", type="string",
+ help="output pattern for samples. Samples are stored in bed format, one for "
+ " each segment [default=%default]" )
+
+ parser.add_option( "--plots", dest="plots", type="choice",
+ choices = ( "all",
+ "bars-per-track",
+ "bars", ),
+ help="plots to be created [default=%default]." )
+
+ parser.set_defaults(
+ sample_files = [],
+ num_samples = 1000,
+ output_stats = [],
+ output_filename_counts = None,
+ output_order = "fold",
+ input_filename_results = None,
+ pvalue_method = "empirical",
+ output_plots_pattern = None,
+ )
+
+ ## add common options (-h/--help, ...) and parse command line
+ (options, args) = E.Start( parser, argv = argv, add_output_options = True )
+
+ annotator_results = IO.readAnnotatorResults( options.input_filename_results )
+
+ if "speparate-bars" in options.plots:
+ plotBarplots( annotator_results, options )
+ if "bars" in options.plots:
+ plotBarplot( annotator_results, options )
+
+ ## write footer and output benchmark information.
+ E.Stop()
+
+if __name__ == "__main__":
+ sys.exit( main( sys.argv) )
diff -r f04dfb37d1bb -r 53487f21c0d5 rlGAT/gat-run.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rlGAT/gat-run.py Thu Aug 29 01:57:54 2013 -0400
@@ -0,0 +1,317 @@
+################################################################################
+#
+# MRC FGU Computational Genomics Group
+#
+# $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $
+#
+# Copyright (C) 2009 Andreas Heger
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#################################################################################
+'''
+gat-run - run the genomic annotation tool
+=============================================
+
+:Author: Andreas Heger
+:Release: $Id$
+:Date: |today|
+:Tags: Python
+
+Purpose
+-------
+
+This script compares one or more genomic segments of interest
+against one more other genomic annotations.
+
+Usage
+-----
+
+Example::
+
+ python gat-run.py
+ --segment-file=segments.bed.gz
+ --workspace-file=workspace.bed.gz
+ --annotation-file=annotations_architecture.bed.gz
+
+Type::
+
+ python gat-run.py --help
+
+for command line help.
+
+Documentation
+-------------
+
+Code
+----
+
+'''
+
+import os, sys, re, optparse, collections, types, glob, time
+import numpy
+
+import gat
+import gat.Experiment as E
+import gat.IOTools as IOTools
+import gat.IO as IO
+import gat.Stats as Stats
+import GatSegmentList
+import GatEngine
+
+def fromSegments( options, args ):
+ '''run analysis from segment files.
+
+ This is the most common use case.
+ '''
+
+ tstart = time.time()
+
+ ##################################################
+ ##################################################
+ ##################################################
+ ## build segments
+ segments, annotations, workspaces, isochores = IO.buildSegments( options )
+
+ E.info( "intervals loaded in %i seconds" % (time.time() - tstart) )
+
+ ##################################################
+ ##################################################
+ ##################################################
+ ## open various additional output files
+ ##################################################
+ outfiles = {}
+ for section in ("sample",
+ "segment_metrics",
+ "sample_metrics",
+ ):
+ if section in options.output_stats or \
+ "all" in options.output_stats or \
+ len( [ x for x in options.output_stats if re.search( x, "section" ) ] ) > 0:
+ outfiles[section] = E.openOutputFile(section)
+
+ if 'sample_metrics' in outfiles:
+ outfiles['sample_metrics'].write( "track\tsection\tmetric\t%s\n" % "\t".join(Stats.Summary().getHeaders() ))
+
+ # filter segments by workspace
+ workspace = IO.applyIsochores( segments,
+ annotations,
+ workspaces,
+ options,
+ isochores,
+ truncate_segments_to_workspace = options.truncate_segments_to_workspace,
+ truncate_workspace_to_annotations = options.truncate_workspace_to_annotations,
+ restrict_workspace = options.restrict_workspace )
+
+ ##################################################
+ ##################################################
+ ##################################################
+ ## check memory requirements
+ counts = segments.countsPerTrack()
+ max_counts = max(counts.values())
+ # previous algorithm: memory requirements if all samples are stored
+ memory = 8 * 2 * options.num_samples * max_counts * len(workspace)
+
+ ##################################################
+ ##################################################
+ ##################################################
+ # initialize sampler
+ if options.sampler == "annotator":
+ sampler = GatEngine.SamplerAnnotator(
+ bucket_size = options.bucket_size,
+ nbuckets = options.nbuckets )
+ elif options.sampler == "shift":
+ sampler = GatEngine.SamplerShift(
+ radius = options.shift_expansion,
+ extension = options.shift_extension )
+ elif options.sampler == "segments":
+ sampler = GatEngine.SamplerSegments()
+ elif options.sampler == "local-permutation":
+ sampler = GatEngine.SamplerLocalPermutation()
+ elif options.sampler == "global-permutation":
+ sampler = GatEngine.SamplerGlobalPermutation()
+ elif options.sampler == "brute-force":
+ sampler = GatEngine.SamplerBruteForce()
+ elif options.sampler == "uniform":
+ sampler = GatEngine.SamplerUniform()
+
+ ##################################################
+ ##################################################
+ ##################################################
+ # initialize counter
+ counters = []
+ for counter in options.counters:
+ if counter == "nucleotide-overlap":
+ counters.append( GatEngine.CounterNucleotideOverlap() )
+ elif counter == "nucleotide-density":
+ counters.append( GatEngine.CounterNucleotideDensity() )
+ elif counter == "segment-overlap":
+ counters.append( GatEngine.CounterSegmentOverlap() )
+ elif counter == "annotations-overlap":
+ counters.append( GatEngine.CounterAnnotationsOverlap() )
+ elif counter == "segment-midoverlap":
+ counters.append( GatEngine.CounterSegmentMidpointOverlap() )
+ elif counter == "annotations-midoverlap":
+ counters.append( GatEngine.CounterAnnotationsMidpointOverlap() )
+ else:
+ raise ValueError("unknown counter '%s'" % counter )
+
+ ##################################################
+ ##################################################
+ ##################################################
+ ## initialize workspace generator
+ if options.conditional == "unconditional":
+ workspace_generator = GatEngine.UnconditionalWorkspace()
+ elif options.conditional == "cooccurance":
+ workspace_generator = GatEngine.ConditionalWorkspaceCooccurance()
+ elif options.conditional == "annotation-centered":
+ if options.conditional_extension == options.conditional_expansion == None:
+ raise ValueError( "please specify either --conditional-expansion or --conditional-extension" )
+ workspace_generator = GatEngine.ConditionalWorkspaceAnnotationCentered( options.conditional_extension,
+ options.conditional_expansion )
+ elif options.conditional == "segment-centered":
+ if options.conditional_extension == options.conditional_expansion == None:
+ raise ValueError( "please specify either --conditional-expansion or --conditional-extension" )
+
+ workspace_generator = GatEngine.ConditionalWorkspaceSegmentCentered( options.conditional_extension,
+ options.conditional_expansion )
+ else:
+ raise ValueError("unknown conditional workspace '%s'" % options.conditional )
+
+
+ ##################################################
+ ##################################################
+ ##################################################
+ ## check if reference is compplete
+ ##################################################
+ if options.reference:
+ reference = options.reference
+ for track in segments.tracks:
+ if track not in options.reference:
+ raise ValueError("missing track '%s' in reference" % track )
+ r = options.reference[track]
+ for annotation in annotations.tracks:
+ if annotation not in r:
+ raise ValueError("missing annotation '%s' in annotations for track='%s'" % (annotation, track ))
+
+
+ ##################################################
+ ##################################################
+ ##################################################
+ ## compute
+ ##################################################
+ annotator_results = gat.run( segments,
+ annotations,
+ workspace,
+ sampler,
+ counters,
+ workspace_generator = workspace_generator,
+ num_samples = options.num_samples,
+ cache = options.cache,
+ outfiles = outfiles,
+ output_counts_pattern = options.output_counts_pattern,
+ output_samples_pattern = options.output_samples_pattern,
+ sample_files = options.sample_files,
+ conditional = options.conditional,
+ conditional_extension = options.conditional_extension,
+ reference = options.reference,
+ pseudo_count = options.pseudo_count,
+ num_threads = options.num_threads )
+
+ return annotator_results
+
+
+def main( argv = None ):
+ """script main.
+
+ parses command line options in sys.argv, unless *argv* is given.
+ """
+
+ if not argv: argv = sys.argv
+
+ parser = gat.buildParser( usage = globals()["__doc__"] )
+
+ ## add common options (-h/--help, ...) and parse command line
+ (options, args) = E.Start( parser, argv = argv, add_output_options = True )
+
+ ##################################################
+ description_header, descriptions, description_width = IO.readDescriptions( options )
+
+ ##################################################
+ size_pos, size_segment = GatSegmentList.getSegmentSize()
+ E.debug( "sizes: pos=%i segment=%i, max_coord=%i" % (size_pos, size_segment, 2**(8 * size_pos )))
+
+ ##################################################
+ # set default counter
+ if not options.counters:
+ options.counters.append( "nucleotide-overlap" )
+
+ ##################################################
+ if options.output_tables_pattern != None:
+ if "%s" not in options.output_tables_pattern:
+ raise ValueError( "output_tables_pattern should contain at least one '%s'")
+
+ if options.output_samples_pattern != None:
+ if "%s" not in options.output_samples_pattern:
+ raise ValueError( "output_samples_pattern should contain at least one '%s'")
+
+ if options.output_counts_pattern != None:
+ if "%s" not in options.output_counts_pattern:
+ raise ValueError( "output_counts_pattern should contain at least one '%s'")
+
+
+ ##################################################
+ # read fold changes that results should be compared with
+ if options.null != "default":
+ if not os.path.exists( options.null ):
+ raise OSError( "file %s not found" % options.null )
+ E.info( "reading reference results from %s" % options.null )
+ options.reference = IO.readAnnotatorResults( options.null )
+ else:
+ options.reference = None
+
+ if options.input_filename_counts:
+ # use pre-computed counts
+ annotator_results = GatEngine.fromCounts( options.input_filename_counts )
+
+ elif options.input_filename_results:
+ # use previous results (re-computes fdr)
+ E.info( "reading gat results from %s" % options.input_filename_results )
+ annotator_results = IO.readAnnotatorResults( options.input_filename_results )
+
+ else:
+ # do full gat analysis
+ annotator_results = fromSegments( options, args )
+
+ ##################################################
+ if options.pvalue_method != "empirical":
+ E.info("updating pvalues to %s" % options.pvalue_method )
+ GatEngine.updatePValues( annotator_results, options.pvalue_method )
+
+ ##################################################
+ ## output
+ IO.outputResults( annotator_results,
+ options,
+ GatEngine.AnnotatorResultExtended.headers,
+ description_header,
+ description_width,
+ descriptions )
+
+ IO.plotResults( annotator_results, options )
+
+ ## write footer and output benchmark information.
+ E.Stop()
+
+if __name__ == "__main__":
+ sys.exit( main( sys.argv) )
diff -r f04dfb37d1bb -r 53487f21c0d5 rlGAT/rlGAT.xml
--- a/rlGAT/rlGAT.xml Thu Aug 29 01:01:45 2013 -0400
+++ b/rlGAT/rlGAT.xml Thu Aug 29 01:57:54 2013 -0400
@@ -9,7 +9,7 @@
numpy
cython
-
+
gat-run.py ${ignore_segments} --log="$logfile" --num-samples=$numsamp
#for $s in $segmentfiles:
#if $s.sf.ext != 'data':