Mercurial > repos > fubar > genomic_association_tester
changeset 11:53487f21c0d5 draft
Uploaded
| author | fubar |
|---|---|
| date | Thu, 29 Aug 2013 01:57:54 -0400 |
| parents | f04dfb37d1bb |
| children | 2087a3ce6fb2 |
| files | rlGAT/gat-compare.py rlGAT/gat-great.py rlGAT/gat-plot.py rlGAT/gat-run.py rlGAT/rlGAT.xml |
| diffstat | 5 files changed, 1351 insertions(+), 1 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rlGAT/gat-compare.py Thu Aug 29 01:57:54 2013 -0400 @@ -0,0 +1,308 @@ +################################################################################ +# +# MRC FGU Computational Genomics Group +# +# $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $ +# +# Copyright (C) 2009 Andreas Heger +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +################################################################################# +''' +gat-compare - compare two gat runs +================================== + +:Author: Andreas Heger +:Release: $Id$ +:Date: |today| +:Tags: Python + +Purpose +------- + +This script compares gat runs and compares statistical significance of fold change differences between two or more +regions of interest. + +This script requires counts files created by gat (option ``--output-counts-file``). + +The reported observed value between sets 1 and 2 is the difference of fc2 and fc1: + + * observed = fold2 - fold1 + +The script accepts one or more output files from a gat run. + +If only a single file is given, the significance of fold change differences are computed +for each pairwise comparison of annotations. Thus, the script might answer the question +if a transcription factor is more enriched in promotors than in enhancers. + +If multiple files are given, the script computes the fold change differences for the same +annotation for all pairwise combinations of :term:`segments of interest` across the different +files. Thus, the script might answer the question if transcription factor A is more enriched +in promotors than transcription factor B is enriched in promotors. + +Usage +----- + +Example:: + + python gat-compare.py tfA.counts.tsv.gz tfB.counts.tsv.gz + +Type:: + + python gat-compare.py --help + +for command line help. + +Documentation +------------- + +Code +---- + +''' + +import os, sys, re, optparse, collections, types, glob, time, math +import itertools +import numpy + +import gat +import gat.Experiment as E +import gat.IOTools as IOTools +import gat.IO as IO + +def main( argv = None ): + """script main. + + parses command line options in sys.argv, unless *argv* is given. + """ + + if not argv: argv = sys.argv + + # setup command line parser + parser = optparse.OptionParser( version = "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $", + usage = globals()["__doc__"] ) + + parser.add_option("-o", "--order", dest="output_order", type="choice", + choices = ( "track", "annotation", "fold", "pvalue", "qvalue", "observed" ), + help="order results in output by fold, track, etc. [default=%default]." ) + + parser.add_option("-p", "--pvalue-method", dest="pvalue_method", type="choice", + choices = ( "empirical", "norm", ), + help="type of pvalue reported [default=%default]." ) + + parser.add_option("-q", "--qvalue-method", dest="qvalue_method", type="choice", + choices = ( "storey", "BH", "bonferroni", "holm", "hommel", "hochberg", "BY", "none" ), + help="method to perform multiple testing correction by controlling the fdr [default=%default]." ) + + parser.add_option( "--qvalue-lambda", dest="qvalue_lambda", type="float", + help="fdr computation: lambda [default=%default]." ) + + parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice", + choices = ("smoother", "bootstrap" ), + help="fdr computation: method for estimating pi0 [default=%default]." ) + + parser.add_option( "--descriptions", dest="input_filename_descriptions", type="string", + help="filename mapping annotation terms to descriptions. " + " if given, the output table will contain additional columns " + " [default=%default]" ) + + parser.add_option( "--pseudo-count", dest="pseudo_count", type="float", + help="pseudo count. The pseudo count is added to both the observed and expected overlap. " + " Using a pseudo-count avoids gat reporting fold changes of 0 [default=%default]." ) + + parser.add_option( "--output-plots-pattern", dest="output_plots_pattern", type="string", + help="output pattern for plots [default=%default]" ) + + parser.set_defaults( + pvalue_method = "empirical", + qvalue_method = "BH", + qvalue_lambda = None, + qvalue_pi0_method = "smoother", + # pseudo count for fold change computation to avoid 0 fc + pseudo_count = 1.0, + output_order = "observed", + ) + + ## add common options (-h/--help, ...) and parse command line + (options, args) = E.Start( parser, argv = argv, add_output_options = True ) + + input_filenames_counts = args + + ################################################## + E.info( "received %i filenames with counts" % len(input_filenames_counts)) + + ################################################## + description_header, descriptions, description_width = IO.readDescriptions( options ) + + all_annotator_results = [] + + for input_filename_counts in input_filenames_counts: + + E.info("processing %s" % input_filename_counts ) + + annotator_results = gat.fromCounts( input_filename_counts ) + + ################################################## + if options.pvalue_method != "empirical": + E.info("updating pvalues to %s" % options.pvalue_method ) + gat.updatePValues( annotator_results, options.pvalue_method ) + + ################################################## + ################################################## + ################################################## + ## compute global fdr + ################################################## + E.info( "computing FDR statistics" ) + gat.updateQValues( annotator_results, + method = options.qvalue_method, + vlambda = options.qvalue_lambda, + pi0_method = options.qvalue_pi0_method ) + + all_annotator_results.append( annotator_results ) + + pseudo_count = options.pseudo_count + results = [] + + if len(all_annotator_results) == 1: + E.info("performing pairwise comparison within a file") + + # collect all annotations + annotations, segments = list(), set() + for x in all_annotator_results[0]: + segments.add( x.track) + annotations.append( x ) + + if len(segments) != 1: + raise NotImplementedError( "multiple segments of interest") + + for data1, data2 in itertools.combinations(annotations, 2): + + # note that fold changes can be very large if there are 0 samples + # this is fine for getting the distributional params (mean, stddev) + fold_changes1 = data1.observed / (data1.samples + pseudo_count ) + fold_changes2 = data2.observed / (data2.samples + pseudo_count ) + + # add a separate fc pseudo-count to avoid 0 values + fold_changes1 += 0.0001 + fold_changes2 += 0.0001 + + # Test is if relative fold change rfc is different from 1 + # note: rfc = fc1 / fc2 = obs1 / exp1 * obs2 / exp2 + # = obs1 / obs2 * exp2 / exp1 + # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1 + # + # Convert to log space for easier plotting + # Move the observed fold ratio in order to get an idea of the magnitude + # of the underlying fold change + delta_fold = data2.fold - data1.fold + sampled_delta_fold = numpy.log( fold_changes1 / fold_changes2) + delta_fold + observed_delta_fold = 0.0 + delta_fold + + result = gat.AnnotatorResult( data1.annotation, data2.annotation, + "na", + observed_delta_fold, + sampled_delta_fold, + reference = None, + pseudo_count = 0 ) + + + results.append( result ) + + + else: + E.info("performing pairwise comparison between files") + + ################################################## + # perform pairwise comparison + for index1, index2 in itertools.combinations( range(len(input_filenames_counts)), 2): + E.info( "comparing %i and %i" % (index1,index2)) + a,b = all_annotator_results[index1], all_annotator_results[index2] + + # index results in a and b + aa = collections.defaultdict( dict ) + for x in a: aa[x.track][x.annotation] = x + + bb = collections.defaultdict( dict ) + for x in b: bb[x.track][x.annotation] = x + + if len(aa.keys()) != 1 or len(bb.keys()) != 1: + raise NotImplementedError( "multiple segments of interest") + + track = "merged" + # get shared annotations + annotations1 = aa[track].keys() + annotations2 = bb[track].keys() + shared_annotations = list( set(annotations1).intersection( set(annotations2) ) ) + E.info( "%i shared annotations" % len(shared_annotations)) + + for annotation in shared_annotations: + + # if not annotation.startswith("Ram:"): continue + + data1 = aa[track][annotation] + data2 = bb[track][annotation] + + # note that fold changes can be very large if there are 0 samples + # this is fine for getting the distributional params (mean, stddev) + fold_changes1 = data1.observed / (data1.samples + pseudo_count ) + fold_changes2 = data2.observed / (data2.samples + pseudo_count ) + + # add a separate fc pseudo-count to avoid 0 values + fold_changes1 += 0.0001 + fold_changes2 += 0.0001 + + # Test is if relative fold change rfc is different from 1 + # note: rfc = fc1 / fc2 = obs1 / exp1 * obs2 / exp2 + # = obs1 / obs2 * exp2 / exp1 + # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1 + # + # Convert to log space for easier plotting + # Move the observed fold ratio in order to get an idea of the magnitude + # of the underlying fold change + delta_fold = data2.fold - data1.fold + sampled_delta_fold = numpy.log( fold_changes1 / fold_changes2) + delta_fold + observed_delta_fold = 0.0 + delta_fold + + result = gat.AnnotatorResult( track, annotation, + "na", + observed_delta_fold, + sampled_delta_fold, + reference = None, + pseudo_count = 0 ) + + + results.append( result ) + + + if len(results) == 0: + E.critical( "no results found" ) + E.Stop() + return + + IO.outputResults( results, + options, + gat.AnnotatorResult.headers, + description_header, + description_width, + descriptions, + format_observed = "%6.4f" ) + + IO.plotResults( results, options ) + + ## write footer and output benchmark information. + E.Stop() + +if __name__ == "__main__": + sys.exit( main( sys.argv) )
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rlGAT/gat-great.py Thu Aug 29 01:57:54 2013 -0400 @@ -0,0 +1,488 @@ +################################################################################ +# +# MRC FGU Computational Genomics Group +# +# $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $ +# +# Copyright (C) 2009 Andreas Heger +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +################################################################################# +''' +great - run great analysis +========================== + +:Author: Andreas Heger +:Release: $Id$ +:Date: |today| +:Tags: Python + +Purpose +------- + +This script compares one or more genomic segments of interest +against one more other genomic annotations. + +GREAT says that a segment overlaps an annotation if the midpoint +of the segment overlaps the annotation. + +The GREAT analysis is done on a per-isochore basis and overall. +For the overall analysis, there is no isochore correction +as the union of all isochores is treated as a single workspace. + +This script implies several counters (``--counter`` option): + +binom + binomial model (as implemented by GREAT). Returns the probability + of a certain number of segments overlapping (mid-point overlap) + an annotation + +hyperg + hypergeometric model. Returns the probabibility of a certain + number of nucleotides within segments overlapping an annotation. + Conceptually, all genomic positions are treated independently and + segments are fragmented into a collection of single bases that are + placed indepently. This is valid for point intervals such as SNPs, + but of course ignores segment size, though it might be used as a + guide towards the expected fold enrichment, that can be computed + more accurately using :mod:`gat-run.py` + +Usage +----- + +Example:: + + python gat-run.py + --segment-file=segments.bed.gz + --workspace-file=workspace.bed.gz + --annotation-file=annotations_architecture.bed.gz + +Type:: + + python gat-run.py --help + +for command line help. + +Documentation +------------- + +Code +---- + +''' + +import os, sys, re, optparse, collections, types, glob, time +import numpy +import numpy.random + +import gat +import gat.Experiment as E +import gat.IOTools as IOTools +import gat.IO as IO +import csegmentlist + +import scipy.stats + +GREAT_RESULT = collections.namedtuple( "GREAT", + ("track", + "annotation", + "isochore", + "counter", + "observed", # nsegments_overlapping_annotations + "expected", + "nsegments_in_workspace", + "nannotations_in_workspace", + "nsegments_overlapping_annotation", + "nannotations_overlapping_segments", + "basecoverage_intersection", + "basecoverage_segments", + "basecoverage_annotation", + "basecoverage_workspace", + "fraction_coverage_annotation", + "fold", + "pvalue", + "qvalue" ) ) + +def main( argv ): + + if not argv: argv = sys.argv + + # setup command line parser + parser = optparse.OptionParser( version = "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $", + usage = globals()["__doc__"] ) + + parser.add_option("-a", "--annotation-file", "--annotations", dest="annotation_files", type="string", action="append", + help="filename with annotations [default=%default]." ) + + parser.add_option("-s", "--segment-file", "--segments", dest="segment_files", type="string", action="append", + help="filename with segments. Also accepts a glob in parentheses [default=%default]." ) + + parser.add_option("-w", "--workspace-file", "--workspace", dest="workspace_files", type="string", action="append", + help="filename with workspace segments. Also accepts a glob in parentheses [default=%default]." ) + + parser.add_option("-i", "--isochore-file", "--isochores", dest="isochore_files", type="string", action="append", + help="filename with isochore segments. Also accepts a glob in parentheses [default=%default]." ) + + parser.add_option("-o", "--order", dest="output_order", type="choice", + choices = ( "track", "annotation", "fold", "pvalue", "qvalue" ), + help="order results in output by fold, track, etc. [default=%default]." ) + + parser.add_option("-q", "--qvalue-method", dest="qvalue_method", type="choice", + choices = ( "storey", "BH", "bonferroni", "holm", "hommel", "hochberg", "BY", "none" ), + help="method to perform multiple testing correction by controlling the fdr [default=%default]." ) + + parser.add_option( "--qvalue-lambda", dest="qvalue_lambda", type="float", + help="fdr computation: lambda [default=%default]." ) + + parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice", + choices = ("smoother", "bootstrap" ), + help="fdr computation: method for estimating pi0 [default=%default]." ) + parser.add_option( "--descriptions", dest="input_filename_descriptions", type="string", + help="filename mapping annotation terms to descriptions. " + " if given, the output table will contain additional columns " + " [default=%default]" ) + + parser.add_option( "--ignore-segment-tracks", dest="ignore_segment_tracks", action="store_true", + help="ignore segment tracks - all segments belong to one track [default=%default]" ) + + parser.add_option( "--enable-split-tracks", dest="enable_split_tracks", action="store_true", + help="permit the same track to be in multiple files [default=%default]" ) + + parser.add_option( "--output-bed", dest="output_bed", type="choice", action="append", + choices = ( "all", + "annotations", "segments", + "workspaces", "isochores", + "overlap" ), + help="output bed files [default=%default]." ) + + parser.add_option( "--output-stats", dest="output_stats", type="choice", action="append", + choices = ( "all", + "annotations", "segments", + "workspaces", "isochores", + "overlap" ), + help="output overlap summary stats [default=%default]." ) + + parser.add_option( "--restrict-workspace", dest="restrict_workspace", action="store_true", + help="restrict workspace to those segments that contain both track" + " and annotations [default=%default]" ) + + parser.add_option( "--counter", dest="counters", type="choice", action="append", + choices = ( "binom", "hyperg" ), + help="counter to use [default=%default]." ) + + parser.add_option( "--output-tables-pattern", dest="output_tables_pattern", type="string", + help="output pattern for result tables. Used if there are multiple counters used [default=%default]." ) + + parser.set_defaults( + annotation_files = [], + segment_files = [], + workspace_files = [], + sample_files = [], + counters = [], + output_stats = [], + output_bed = [], + output_tables_pattern = "%s.tsv.gz", + output_order = "fold", + input_filename_counts = None, + input_filename_results = None, + pvalue_method = "empirical", + output_plots_pattern = None, + output_samples_pattern = None, + qvalue_method = "storey", + qvalue_lambda = None, + qvalue_pi0_method = "smoother", + ignore_segment_tracks = False, + input_filename_descriptions = None, + conditional = "unconditional", + conditional_extension = None, + conditional_expansion = None, + restrict_workspace = False, + enable_split_tracks = False, + shift_expansion = 2.0, + shift_extension = 0, + overlap_mode = "midpoint", + truncate_workspace_to_annotations = False, + ) + + ## add common options (-h/--help, ...) and parse command line + (options, args) = E.Start( parser, argv = argv, add_output_options = True ) + + tstart = time.time() + + if len(options.counters) == 0: + options.counters.append("binom") + + ############################################ + segments, annotations, workspaces, isochores = IO.buildSegments( options ) + E.info( "intervals loaded in %i seconds" % (time.time() - tstart) ) + + # filter segments by workspace + workspace = IO.applyIsochores( segments, annotations, workspaces, options, isochores ) + + ############################################ + description_header, descriptions, description_width = IO.readDescriptions( options ) + + ############################################ + ############################################ + ## compute per contig + + # compute bases covered by workspace + workspace2basecoverage, isochores = {}, [] + for contig, ww in workspace.iteritems(): + workspace2basecoverage[contig] = ww.sum() + isochores.append( contig ) + + # compute percentage of bases covered by annotations in workspace + # per isochore + annotation2basecoverage = collections.defaultdict( dict ) + for annotation, aa in annotations.iteritems(): + for isochore, a in aa.iteritems(): + # need to truncate to workspace? + annotation2basecoverage[annotation][isochore] = a.sum() + + results_per_contig = collections.defaultdict( list ) + + E.info( "computing counts per isochore" ) + # results per isochore + def emptyResult( segment, annotation, isochore, + counter, + nsegments_in_workspace, + basecoverage_annotation, + basecoverage_workspace): + return GREAT_RESULT._make( ( + segment, annotation, isochore, + counter, + 0, # observed + 0, # expected + nsegments_in_workspace, + 0, # nannotations_in_workspace + 0, # nsegments_overlapping_annotation + 0, # nannotations_overlapping_segments + 0, # basecoverage_intersection + 0, # basecoverage_segments + basecoverage_annotation, + basecoverage_workspace, + 0.0, + 1.0, + 1.0, + 1.0 )) + + for isochore in isochores: + basecoverage_workspace = workspace2basecoverage[isochore] + + # iterate over all isochores + for segment, segmentdict in segments.iteritems(): + try: + ss = segmentdict[isochore] + # select segments overlapping workspace + segments_in_workspace = csegmentlist.SegmentList( clone = ss ) + segments_in_workspace.intersect( workspace[isochore] ) + # number of segments in workspace + nsegments_in_workspace = len(segments_in_workspace) + except KeyError: + ss = None + + basecoverage_segments = segments_in_workspace.sum() + + for annotation, annotationdict in annotations.iteritems(): + + # if annotation != "GO:0030957": continue + + try: + aa = annotationdict[isochore] + except KeyError: + aa = None + + # p_A: proportion of bases covered by annotation + try: + basecoverage_annotation = annotation2basecoverage[annotation][isochore] + except KeyError: + basecoverage_annotation = 0 + + if ss == None or aa == None: + for counter in options.counters: + results_per_contig[(counter,segment,annotation)].append( emptyResult(segment, annotation, + isochore, + counter, + nsegments_in_workspace, + basecoverage_annotation, + basecoverage_workspace ) ) + continue + + # select segments overlapping annotation + segments_overlapping_annotation = csegmentlist.SegmentList( clone = ss ) + segments_overlapping_annotation.intersect( annotations[annotation][isochore] ) + # number of segments in annotation + nsegments_overlapping_annotation = ss.intersectionWithSegments( annotations[annotation][isochore], + mode = options.overlap_mode ) + + # number of nucleotides at the intersection of segments, annotation and workspace + basecoverage_intersection = segments_overlapping_annotation.sum() + + annotations_overlapping_segments = csegmentlist.SegmentList( clone = aa ) + annotations_overlapping_segments.intersect( ss ) + nannotations_overlapping_segments = len( annotations_overlapping_segments ) + + nannotations_in_workspace = len( aa ) + if nannotations_in_workspace == 0: + for counter in options.counters: + results_per_contig[(counter,segment,annotation)].append( emptyResult(segment, + annotation, + isochore, + counter, + nsegments_in_workspace, + basecoverage_annotation, + basecoverage_workspace ) ) + continue + + fraction_coverage_annotation = basecoverage_annotation / float( basecoverage_workspace ) + fraction_hit_annotation = float(nannotations_overlapping_segments) / nannotations_in_workspace + + for counter in options.counters: + if counter.startswith( "binom" ): + # GREAT binomial probability over "regions" + # n = number of genomic regions = nannotations_in_workspace + # ppi = fraction of genome annotated by annotation = fraction_coverage_annotation + # kpi = genomic regions with annotation hit by segments = nannotations_in_segments + # sf = survival functions = 1 -cdf + # probability of observing >kpi in a sample of n where the probabily of succes is + # ppi. + pvalue = scipy.stats.binom.sf( nsegments_overlapping_annotation - 1, + nsegments_in_workspace, + fraction_coverage_annotation ) + + expected = fraction_coverage_annotation * nsegments_in_workspace + observed = nsegments_overlapping_annotation + + elif counter.startswith( "hyperg" ): + + # hypergeometric probability over nucleotides + # Sampling without replacement + # x,M,n,M + # x = observed number of nucleotides in overlap of segments,annotations and workspace + # M = number of nucleotides in workspace + # n = number of nucleotides in annotations (and workspace) + # N = number of nucleotides in segments (and workspace) + # P-value of obtaining >x number of nucleotides overlapping. + rv = scipy.stats.hypergeom( basecoverage_workspace, + basecoverage_annotation, + basecoverage_segments ) + + pvalue = rv.sf( basecoverage_intersection ) + expected = rv.mean() + observed = basecoverage_intersection + + if expected != 0: + fold = float(observed) / expected + else: + fold = 1.0 + + r = GREAT_RESULT._make( ( + segment, annotation, isochore, + counter, + observed, + expected, + nsegments_in_workspace, + nannotations_in_workspace, + nsegments_overlapping_annotation, + nannotations_overlapping_segments, + basecoverage_intersection, + basecoverage_segments, + basecoverage_annotation, + basecoverage_workspace, + fraction_coverage_annotation, + fold, + pvalue, + 1.0 )) + # print "\t".join( map(str, r)) + results_per_contig[ (counter,segment,annotation) ].append( r ) + + E.info( "merging counts per isochore" ) + + # compute sums + results = [] + + for niteration, pair in enumerate(results_per_contig.iteritems()): + + counter, segment, annotation = pair[0] + data = pair[1] + + nsegments_in_workspace = sum( [x.nsegments_in_workspace for x in data ] ) + nsegments_overlapping_annotation = sum( [x.observed for x in data ] ) + nannotations_in_workspace = sum( [x.nannotations_in_workspace for x in data ] ) + nannotations_overlapping_segments = sum( [x.nannotations_overlapping_segments for x in data ] ) + + basecoverage_intersection = sum( [x.basecoverage_intersection for x in data ] ) + basecoverage_segments = sum( [x.basecoverage_segments for x in data ] ) + basecoverage_annotation = sum( [x.basecoverage_annotation for x in data ] ) + basecoverage_workspace = sum( [x.basecoverage_workspace for x in data ] ) + + fraction_coverage_annotation = basecoverage_annotation / float( basecoverage_workspace ) + + if counter.startswith( "binom" ): + pvalue = scipy.stats.binom.sf( nsegments_overlapping_annotation-1, + nsegments_in_workspace, + fraction_coverage_annotation ) + expected = fraction_coverage_annotation * nsegments_in_workspace + observed = nsegments_overlapping_annotation + elif counter.startswith( "hyperg" ): + rv = scipy.stats.hypergeom( basecoverage_workspace, + basecoverage_annotation, + basecoverage_segments ) + + pvalue = rv.sf( basecoverage_intersection ) + expected = rv.mean() + observed = basecoverage_intersection + + + if expected != 0: + fold = float(observed) / expected + else: + fold = 1.0 + + r = GREAT_RESULT._make( ( + segment, annotation, "all", + counter, + observed, + expected, + nsegments_in_workspace, + nannotations_in_workspace, + nsegments_overlapping_annotation, + nannotations_overlapping_segments, + basecoverage_intersection, + basecoverage_segments, + basecoverage_annotation, + basecoverage_workspace, + fraction_coverage_annotation, + fold, + pvalue, + 1.0 )) + + results.append( r ) + + IO.outputResults( results, + options, + GREAT_RESULT._fields, + description_header, + description_width, + descriptions ) + + E.Stop() + +if __name__ == "__main__": + sys.exit( main( sys.argv) ) + + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rlGAT/gat-plot.py Thu Aug 29 01:57:54 2013 -0400 @@ -0,0 +1,237 @@ +################################################################################ +# +# MRC FGU Computational Genomics Group +# +# $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $ +# +# Copyright (C) 2009 Andreas Heger +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +################################################################################# +''' +gat-plot - plot results from a gat analysis +=========================================== + +:Author: Andreas Heger +:Release: $Id$ +:Date: |today| +:Tags: Python + +Purpose +------- + +This script takes the results of a ``gat-run.py` or ``gat-compare.py`` +and plots the results. + +This script requires matplotlib. + +Usage +----- + +Example:: + + python gat-plot.py --input-filename-results=gat.results.tsv.gz + python gat-plot.py --input-filename-counts=gat.counts.tsv.gz + +Type:: + + python gatplot.py --help + +for command line help. + +Documentation +------------- + +Code +---- + +''' + +import os, sys, re, optparse, collections, types, glob, time +import numpy + +import gat +import gat.Experiment as E +import gat.IOTools as IOTools +import gat.IO as IO + +try: + import matplotlib.pyplot as plt + HASPLOT = True +except (ImportError,RuntimeError): + HASPLOT = False + +class DummyAnnotatorResult: + + format_observed = "%i" + format_expected = "%6.4f" + format_fold = "%6.4f" + format_pvalue = "%6.4e" + + def __init__( self ): + pass + + @classmethod + def _fromLine( cls, line ): + x = cls() + data = line[:-1].split("\t") + x.track, x.annotation = data[:2] + x.observed, x.expected, x.lower95, x.upper95, x.stddev, x.fold, x.pvalue, x.qvalue = \ + map(float, data[2:] ) + return x + + def __str__(self): + return "\t".join( (self.track, + self.annotation, + self.format_observed % self.observed, + self.format_expected % self.expected, + self.format_expected % self.lower95, + self.format_expected % self.upper95, + self.format_expected % self.stddev, + self.format_fold % self.fold, + self.format_pvalue % self.pvalue, + self.format_pvalue % self.qvalue ) ) + +def buildPlotFilename( options, key ): + filename = re.sub("%s", key, options.output_plots_pattern) + filename = re.sub("[^a-zA-Z0-9-_./]", "_", filename ) + dirname = os.path.dirname( filename ) + if dirname and not os.path.exists( dirname ): os.makedirs( dirname ) + return filename + +def plotBarplots( annotator_results, options ): + '''output a series of bar-plots. + + Output for each track. + + Significant results are opaque, while + non-significant results are transparent.''' + + for track in annotator_results: + plt.figure() + r = annotator_results[track] + keys, values = zip( *r.items()) + pos = range(len(r)) + bars = plt.barh( pos, [x.fold for x in values] ) + for b,v in zip(bars, values): + if v.qvalue > 0.05: b.set_alpha( 0.10 ) + + filename = buildPlotFilename( options, "bars-%s" % track ) + plt.yticks( pos, keys ) + plt.axvline( x=1, color="r") + plt.savefig( filename ) + +def plotBarplot( annotator_results, options ): + '''output a single bar-plots. + + Output for each track. + + Significant results are opaque, while + non-significant results are transparent.''' + + ntracks = len(annotator_results ) + height = 1.0 / float(ntracks) + + plt.figure() + + for trackid, track in enumerate(annotator_results): + + r = annotator_results[track] + rr = r.items() + rr.sort() + keys, values = zip(*rr) + pos = numpy.arange(0,len(r),1) + trackid * height + bars = plt.barh( pos, + [x.fold for x in values], + height = height, + label = track, + xerr = [x.stddev / x.expected for x in values], + color = "bryg"[trackid % 4]) + for b,v in zip(bars, values): + if v.pvalue > 0.05: b.set_alpha( 0.10 ) + + pos = range(len(r)) + + plt.yticks( pos, keys ) + plt.axvline(x=1, color = "r" ) + filename = buildPlotFilename( options, "bars-all" ) + plt.legend() + plt.savefig( filename ) + +def main( argv = None ): + """script main. + + parses command line options in sys.argv, unless *argv* is given. + """ + + if not argv: argv = sys.argv + + # setup command line parser + parser = optparse.OptionParser( version = "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $", + usage = globals()["__doc__"] ) + + parser.add_option("-l", "--sample-file", dest="sample_files", type="string", action="append", + help="filename with sample files. Start processing from samples [default=%default]." ) + + parser.add_option("-o", "--order", dest="output_order", type="choice", + choices = ( "track", "annotation", "fold", "pvalue", "qvalue" ), + help="order results in output by fold, track, etc. [default=%default]." ) + + parser.add_option("-p", "--pvalue-method", dest="pvalue_method", type="choice", + choices = ( "empirical", "norm", ), + help="type of pvalue reported [default=%default]." ) + + parser.add_option( "--results-file", dest="input_filename_results", type="string", + help="start processing from results - no segments required [default=%default]." ) + + parser.add_option( "--output-plots-pattern", dest="output_plots_pattern", type="string", + help="output pattern for plots [default=%default]" ) + + parser.add_option( "--output-samples-pattern", dest="output_samples_pattern", type="string", + help="output pattern for samples. Samples are stored in bed format, one for " + " each segment [default=%default]" ) + + parser.add_option( "--plots", dest="plots", type="choice", + choices = ( "all", + "bars-per-track", + "bars", ), + help="plots to be created [default=%default]." ) + + parser.set_defaults( + sample_files = [], + num_samples = 1000, + output_stats = [], + output_filename_counts = None, + output_order = "fold", + input_filename_results = None, + pvalue_method = "empirical", + output_plots_pattern = None, + ) + + ## add common options (-h/--help, ...) and parse command line + (options, args) = E.Start( parser, argv = argv, add_output_options = True ) + + annotator_results = IO.readAnnotatorResults( options.input_filename_results ) + + if "speparate-bars" in options.plots: + plotBarplots( annotator_results, options ) + if "bars" in options.plots: + plotBarplot( annotator_results, options ) + + ## write footer and output benchmark information. + E.Stop() + +if __name__ == "__main__": + sys.exit( main( sys.argv) )
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rlGAT/gat-run.py Thu Aug 29 01:57:54 2013 -0400 @@ -0,0 +1,317 @@ +################################################################################ +# +# MRC FGU Computational Genomics Group +# +# $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $ +# +# Copyright (C) 2009 Andreas Heger +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +################################################################################# +''' +gat-run - run the genomic annotation tool +============================================= + +:Author: Andreas Heger +:Release: $Id$ +:Date: |today| +:Tags: Python + +Purpose +------- + +This script compares one or more genomic segments of interest +against one more other genomic annotations. + +Usage +----- + +Example:: + + python gat-run.py + --segment-file=segments.bed.gz + --workspace-file=workspace.bed.gz + --annotation-file=annotations_architecture.bed.gz + +Type:: + + python gat-run.py --help + +for command line help. + +Documentation +------------- + +Code +---- + +''' + +import os, sys, re, optparse, collections, types, glob, time +import numpy + +import gat +import gat.Experiment as E +import gat.IOTools as IOTools +import gat.IO as IO +import gat.Stats as Stats +import GatSegmentList +import GatEngine + +def fromSegments( options, args ): + '''run analysis from segment files. + + This is the most common use case. + ''' + + tstart = time.time() + + ################################################## + ################################################## + ################################################## + ## build segments + segments, annotations, workspaces, isochores = IO.buildSegments( options ) + + E.info( "intervals loaded in %i seconds" % (time.time() - tstart) ) + + ################################################## + ################################################## + ################################################## + ## open various additional output files + ################################################## + outfiles = {} + for section in ("sample", + "segment_metrics", + "sample_metrics", + ): + if section in options.output_stats or \ + "all" in options.output_stats or \ + len( [ x for x in options.output_stats if re.search( x, "section" ) ] ) > 0: + outfiles[section] = E.openOutputFile(section) + + if 'sample_metrics' in outfiles: + outfiles['sample_metrics'].write( "track\tsection\tmetric\t%s\n" % "\t".join(Stats.Summary().getHeaders() )) + + # filter segments by workspace + workspace = IO.applyIsochores( segments, + annotations, + workspaces, + options, + isochores, + truncate_segments_to_workspace = options.truncate_segments_to_workspace, + truncate_workspace_to_annotations = options.truncate_workspace_to_annotations, + restrict_workspace = options.restrict_workspace ) + + ################################################## + ################################################## + ################################################## + ## check memory requirements + counts = segments.countsPerTrack() + max_counts = max(counts.values()) + # previous algorithm: memory requirements if all samples are stored + memory = 8 * 2 * options.num_samples * max_counts * len(workspace) + + ################################################## + ################################################## + ################################################## + # initialize sampler + if options.sampler == "annotator": + sampler = GatEngine.SamplerAnnotator( + bucket_size = options.bucket_size, + nbuckets = options.nbuckets ) + elif options.sampler == "shift": + sampler = GatEngine.SamplerShift( + radius = options.shift_expansion, + extension = options.shift_extension ) + elif options.sampler == "segments": + sampler = GatEngine.SamplerSegments() + elif options.sampler == "local-permutation": + sampler = GatEngine.SamplerLocalPermutation() + elif options.sampler == "global-permutation": + sampler = GatEngine.SamplerGlobalPermutation() + elif options.sampler == "brute-force": + sampler = GatEngine.SamplerBruteForce() + elif options.sampler == "uniform": + sampler = GatEngine.SamplerUniform() + + ################################################## + ################################################## + ################################################## + # initialize counter + counters = [] + for counter in options.counters: + if counter == "nucleotide-overlap": + counters.append( GatEngine.CounterNucleotideOverlap() ) + elif counter == "nucleotide-density": + counters.append( GatEngine.CounterNucleotideDensity() ) + elif counter == "segment-overlap": + counters.append( GatEngine.CounterSegmentOverlap() ) + elif counter == "annotations-overlap": + counters.append( GatEngine.CounterAnnotationsOverlap() ) + elif counter == "segment-midoverlap": + counters.append( GatEngine.CounterSegmentMidpointOverlap() ) + elif counter == "annotations-midoverlap": + counters.append( GatEngine.CounterAnnotationsMidpointOverlap() ) + else: + raise ValueError("unknown counter '%s'" % counter ) + + ################################################## + ################################################## + ################################################## + ## initialize workspace generator + if options.conditional == "unconditional": + workspace_generator = GatEngine.UnconditionalWorkspace() + elif options.conditional == "cooccurance": + workspace_generator = GatEngine.ConditionalWorkspaceCooccurance() + elif options.conditional == "annotation-centered": + if options.conditional_extension == options.conditional_expansion == None: + raise ValueError( "please specify either --conditional-expansion or --conditional-extension" ) + workspace_generator = GatEngine.ConditionalWorkspaceAnnotationCentered( options.conditional_extension, + options.conditional_expansion ) + elif options.conditional == "segment-centered": + if options.conditional_extension == options.conditional_expansion == None: + raise ValueError( "please specify either --conditional-expansion or --conditional-extension" ) + + workspace_generator = GatEngine.ConditionalWorkspaceSegmentCentered( options.conditional_extension, + options.conditional_expansion ) + else: + raise ValueError("unknown conditional workspace '%s'" % options.conditional ) + + + ################################################## + ################################################## + ################################################## + ## check if reference is compplete + ################################################## + if options.reference: + reference = options.reference + for track in segments.tracks: + if track not in options.reference: + raise ValueError("missing track '%s' in reference" % track ) + r = options.reference[track] + for annotation in annotations.tracks: + if annotation not in r: + raise ValueError("missing annotation '%s' in annotations for track='%s'" % (annotation, track )) + + + ################################################## + ################################################## + ################################################## + ## compute + ################################################## + annotator_results = gat.run( segments, + annotations, + workspace, + sampler, + counters, + workspace_generator = workspace_generator, + num_samples = options.num_samples, + cache = options.cache, + outfiles = outfiles, + output_counts_pattern = options.output_counts_pattern, + output_samples_pattern = options.output_samples_pattern, + sample_files = options.sample_files, + conditional = options.conditional, + conditional_extension = options.conditional_extension, + reference = options.reference, + pseudo_count = options.pseudo_count, + num_threads = options.num_threads ) + + return annotator_results + + +def main( argv = None ): + """script main. + + parses command line options in sys.argv, unless *argv* is given. + """ + + if not argv: argv = sys.argv + + parser = gat.buildParser( usage = globals()["__doc__"] ) + + ## add common options (-h/--help, ...) and parse command line + (options, args) = E.Start( parser, argv = argv, add_output_options = True ) + + ################################################## + description_header, descriptions, description_width = IO.readDescriptions( options ) + + ################################################## + size_pos, size_segment = GatSegmentList.getSegmentSize() + E.debug( "sizes: pos=%i segment=%i, max_coord=%i" % (size_pos, size_segment, 2**(8 * size_pos ))) + + ################################################## + # set default counter + if not options.counters: + options.counters.append( "nucleotide-overlap" ) + + ################################################## + if options.output_tables_pattern != None: + if "%s" not in options.output_tables_pattern: + raise ValueError( "output_tables_pattern should contain at least one '%s'") + + if options.output_samples_pattern != None: + if "%s" not in options.output_samples_pattern: + raise ValueError( "output_samples_pattern should contain at least one '%s'") + + if options.output_counts_pattern != None: + if "%s" not in options.output_counts_pattern: + raise ValueError( "output_counts_pattern should contain at least one '%s'") + + + ################################################## + # read fold changes that results should be compared with + if options.null != "default": + if not os.path.exists( options.null ): + raise OSError( "file %s not found" % options.null ) + E.info( "reading reference results from %s" % options.null ) + options.reference = IO.readAnnotatorResults( options.null ) + else: + options.reference = None + + if options.input_filename_counts: + # use pre-computed counts + annotator_results = GatEngine.fromCounts( options.input_filename_counts ) + + elif options.input_filename_results: + # use previous results (re-computes fdr) + E.info( "reading gat results from %s" % options.input_filename_results ) + annotator_results = IO.readAnnotatorResults( options.input_filename_results ) + + else: + # do full gat analysis + annotator_results = fromSegments( options, args ) + + ################################################## + if options.pvalue_method != "empirical": + E.info("updating pvalues to %s" % options.pvalue_method ) + GatEngine.updatePValues( annotator_results, options.pvalue_method ) + + ################################################## + ## output + IO.outputResults( annotator_results, + options, + GatEngine.AnnotatorResultExtended.headers, + description_header, + description_width, + descriptions ) + + IO.plotResults( annotator_results, options ) + + ## write footer and output benchmark information. + E.Stop() + +if __name__ == "__main__": + sys.exit( main( sys.argv) )
--- a/rlGAT/rlGAT.xml Thu Aug 29 01:01:45 2013 -0400 +++ b/rlGAT/rlGAT.xml Thu Aug 29 01:57:54 2013 -0400 @@ -9,7 +9,7 @@ <requirement type="package" version="1.7.1">numpy</requirement> <requirement type="package" version="0.19.1">cython</requirement> </requirements> - <command> + <command interpreter="python"> gat-run.py ${ignore_segments} --log="$logfile" --num-samples=$numsamp #for $s in $segmentfiles: #if $s.sf.ext != 'data':
