Mercurial > repos > fubar > genomic_association_tester
comparison rlGAT/gat-compare.py @ 11:53487f21c0d5 draft
Uploaded
| author | fubar |
|---|---|
| date | Thu, 29 Aug 2013 01:57:54 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 10:f04dfb37d1bb | 11:53487f21c0d5 |
|---|---|
| 1 ################################################################################ | |
| 2 # | |
| 3 # MRC FGU Computational Genomics Group | |
| 4 # | |
| 5 # $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $ | |
| 6 # | |
| 7 # Copyright (C) 2009 Andreas Heger | |
| 8 # | |
| 9 # This program is free software; you can redistribute it and/or | |
| 10 # modify it under the terms of the GNU General Public License | |
| 11 # as published by the Free Software Foundation; either version 2 | |
| 12 # of the License, or (at your option) any later version. | |
| 13 # | |
| 14 # This program is distributed in the hope that it will be useful, | |
| 15 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 17 # GNU General Public License for more details. | |
| 18 # | |
| 19 # You should have received a copy of the GNU General Public License | |
| 20 # along with this program; if not, write to the Free Software | |
| 21 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | |
| 22 ################################################################################# | |
| 23 ''' | |
| 24 gat-compare - compare two gat runs | |
| 25 ================================== | |
| 26 | |
| 27 :Author: Andreas Heger | |
| 28 :Release: $Id$ | |
| 29 :Date: |today| | |
| 30 :Tags: Python | |
| 31 | |
| 32 Purpose | |
| 33 ------- | |
| 34 | |
| 35 This script compares gat runs and compares statistical significance of fold change differences between two or more | |
| 36 regions of interest. | |
| 37 | |
| 38 This script requires counts files created by gat (option ``--output-counts-file``). | |
| 39 | |
| 40 The reported observed value between sets 1 and 2 is the difference of fc2 and fc1: | |
| 41 | |
| 42 * observed = fold2 - fold1 | |
| 43 | |
| 44 The script accepts one or more output files from a gat run. | |
| 45 | |
| 46 If only a single file is given, the significance of fold change differences are computed | |
| 47 for each pairwise comparison of annotations. Thus, the script might answer the question | |
| 48 if a transcription factor is more enriched in promotors than in enhancers. | |
| 49 | |
| 50 If multiple files are given, the script computes the fold change differences for the same | |
| 51 annotation for all pairwise combinations of :term:`segments of interest` across the different | |
| 52 files. Thus, the script might answer the question if transcription factor A is more enriched | |
| 53 in promotors than transcription factor B is enriched in promotors. | |
| 54 | |
| 55 Usage | |
| 56 ----- | |
| 57 | |
| 58 Example:: | |
| 59 | |
| 60 python gat-compare.py tfA.counts.tsv.gz tfB.counts.tsv.gz | |
| 61 | |
| 62 Type:: | |
| 63 | |
| 64 python gat-compare.py --help | |
| 65 | |
| 66 for command line help. | |
| 67 | |
| 68 Documentation | |
| 69 ------------- | |
| 70 | |
| 71 Code | |
| 72 ---- | |
| 73 | |
| 74 ''' | |
| 75 | |
| 76 import os, sys, re, optparse, collections, types, glob, time, math | |
| 77 import itertools | |
| 78 import numpy | |
| 79 | |
| 80 import gat | |
| 81 import gat.Experiment as E | |
| 82 import gat.IOTools as IOTools | |
| 83 import gat.IO as IO | |
| 84 | |
| 85 def main( argv = None ): | |
| 86 """script main. | |
| 87 | |
| 88 parses command line options in sys.argv, unless *argv* is given. | |
| 89 """ | |
| 90 | |
| 91 if not argv: argv = sys.argv | |
| 92 | |
| 93 # setup command line parser | |
| 94 parser = optparse.OptionParser( version = "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $", | |
| 95 usage = globals()["__doc__"] ) | |
| 96 | |
| 97 parser.add_option("-o", "--order", dest="output_order", type="choice", | |
| 98 choices = ( "track", "annotation", "fold", "pvalue", "qvalue", "observed" ), | |
| 99 help="order results in output by fold, track, etc. [default=%default]." ) | |
| 100 | |
| 101 parser.add_option("-p", "--pvalue-method", dest="pvalue_method", type="choice", | |
| 102 choices = ( "empirical", "norm", ), | |
| 103 help="type of pvalue reported [default=%default]." ) | |
| 104 | |
| 105 parser.add_option("-q", "--qvalue-method", dest="qvalue_method", type="choice", | |
| 106 choices = ( "storey", "BH", "bonferroni", "holm", "hommel", "hochberg", "BY", "none" ), | |
| 107 help="method to perform multiple testing correction by controlling the fdr [default=%default]." ) | |
| 108 | |
| 109 parser.add_option( "--qvalue-lambda", dest="qvalue_lambda", type="float", | |
| 110 help="fdr computation: lambda [default=%default]." ) | |
| 111 | |
| 112 parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice", | |
| 113 choices = ("smoother", "bootstrap" ), | |
| 114 help="fdr computation: method for estimating pi0 [default=%default]." ) | |
| 115 | |
| 116 parser.add_option( "--descriptions", dest="input_filename_descriptions", type="string", | |
| 117 help="filename mapping annotation terms to descriptions. " | |
| 118 " if given, the output table will contain additional columns " | |
| 119 " [default=%default]" ) | |
| 120 | |
| 121 parser.add_option( "--pseudo-count", dest="pseudo_count", type="float", | |
| 122 help="pseudo count. The pseudo count is added to both the observed and expected overlap. " | |
| 123 " Using a pseudo-count avoids gat reporting fold changes of 0 [default=%default]." ) | |
| 124 | |
| 125 parser.add_option( "--output-plots-pattern", dest="output_plots_pattern", type="string", | |
| 126 help="output pattern for plots [default=%default]" ) | |
| 127 | |
| 128 parser.set_defaults( | |
| 129 pvalue_method = "empirical", | |
| 130 qvalue_method = "BH", | |
| 131 qvalue_lambda = None, | |
| 132 qvalue_pi0_method = "smoother", | |
| 133 # pseudo count for fold change computation to avoid 0 fc | |
| 134 pseudo_count = 1.0, | |
| 135 output_order = "observed", | |
| 136 ) | |
| 137 | |
| 138 ## add common options (-h/--help, ...) and parse command line | |
| 139 (options, args) = E.Start( parser, argv = argv, add_output_options = True ) | |
| 140 | |
| 141 input_filenames_counts = args | |
| 142 | |
| 143 ################################################## | |
| 144 E.info( "received %i filenames with counts" % len(input_filenames_counts)) | |
| 145 | |
| 146 ################################################## | |
| 147 description_header, descriptions, description_width = IO.readDescriptions( options ) | |
| 148 | |
| 149 all_annotator_results = [] | |
| 150 | |
| 151 for input_filename_counts in input_filenames_counts: | |
| 152 | |
| 153 E.info("processing %s" % input_filename_counts ) | |
| 154 | |
| 155 annotator_results = gat.fromCounts( input_filename_counts ) | |
| 156 | |
| 157 ################################################## | |
| 158 if options.pvalue_method != "empirical": | |
| 159 E.info("updating pvalues to %s" % options.pvalue_method ) | |
| 160 gat.updatePValues( annotator_results, options.pvalue_method ) | |
| 161 | |
| 162 ################################################## | |
| 163 ################################################## | |
| 164 ################################################## | |
| 165 ## compute global fdr | |
| 166 ################################################## | |
| 167 E.info( "computing FDR statistics" ) | |
| 168 gat.updateQValues( annotator_results, | |
| 169 method = options.qvalue_method, | |
| 170 vlambda = options.qvalue_lambda, | |
| 171 pi0_method = options.qvalue_pi0_method ) | |
| 172 | |
| 173 all_annotator_results.append( annotator_results ) | |
| 174 | |
| 175 pseudo_count = options.pseudo_count | |
| 176 results = [] | |
| 177 | |
| 178 if len(all_annotator_results) == 1: | |
| 179 E.info("performing pairwise comparison within a file") | |
| 180 | |
| 181 # collect all annotations | |
| 182 annotations, segments = list(), set() | |
| 183 for x in all_annotator_results[0]: | |
| 184 segments.add( x.track) | |
| 185 annotations.append( x ) | |
| 186 | |
| 187 if len(segments) != 1: | |
| 188 raise NotImplementedError( "multiple segments of interest") | |
| 189 | |
| 190 for data1, data2 in itertools.combinations(annotations, 2): | |
| 191 | |
| 192 # note that fold changes can be very large if there are 0 samples | |
| 193 # this is fine for getting the distributional params (mean, stddev) | |
| 194 fold_changes1 = data1.observed / (data1.samples + pseudo_count ) | |
| 195 fold_changes2 = data2.observed / (data2.samples + pseudo_count ) | |
| 196 | |
| 197 # add a separate fc pseudo-count to avoid 0 values | |
| 198 fold_changes1 += 0.0001 | |
| 199 fold_changes2 += 0.0001 | |
| 200 | |
| 201 # Test is if relative fold change rfc is different from 1 | |
| 202 # note: rfc = fc1 / fc2 = obs1 / exp1 * obs2 / exp2 | |
| 203 # = obs1 / obs2 * exp2 / exp1 | |
| 204 # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1 | |
| 205 # | |
| 206 # Convert to log space for easier plotting | |
| 207 # Move the observed fold ratio in order to get an idea of the magnitude | |
| 208 # of the underlying fold change | |
| 209 delta_fold = data2.fold - data1.fold | |
| 210 sampled_delta_fold = numpy.log( fold_changes1 / fold_changes2) + delta_fold | |
| 211 observed_delta_fold = 0.0 + delta_fold | |
| 212 | |
| 213 result = gat.AnnotatorResult( data1.annotation, data2.annotation, | |
| 214 "na", | |
| 215 observed_delta_fold, | |
| 216 sampled_delta_fold, | |
| 217 reference = None, | |
| 218 pseudo_count = 0 ) | |
| 219 | |
| 220 | |
| 221 results.append( result ) | |
| 222 | |
| 223 | |
| 224 else: | |
| 225 E.info("performing pairwise comparison between files") | |
| 226 | |
| 227 ################################################## | |
| 228 # perform pairwise comparison | |
| 229 for index1, index2 in itertools.combinations( range(len(input_filenames_counts)), 2): | |
| 230 E.info( "comparing %i and %i" % (index1,index2)) | |
| 231 a,b = all_annotator_results[index1], all_annotator_results[index2] | |
| 232 | |
| 233 # index results in a and b | |
| 234 aa = collections.defaultdict( dict ) | |
| 235 for x in a: aa[x.track][x.annotation] = x | |
| 236 | |
| 237 bb = collections.defaultdict( dict ) | |
| 238 for x in b: bb[x.track][x.annotation] = x | |
| 239 | |
| 240 if len(aa.keys()) != 1 or len(bb.keys()) != 1: | |
| 241 raise NotImplementedError( "multiple segments of interest") | |
| 242 | |
| 243 track = "merged" | |
| 244 # get shared annotations | |
| 245 annotations1 = aa[track].keys() | |
| 246 annotations2 = bb[track].keys() | |
| 247 shared_annotations = list( set(annotations1).intersection( set(annotations2) ) ) | |
| 248 E.info( "%i shared annotations" % len(shared_annotations)) | |
| 249 | |
| 250 for annotation in shared_annotations: | |
| 251 | |
| 252 # if not annotation.startswith("Ram:"): continue | |
| 253 | |
| 254 data1 = aa[track][annotation] | |
| 255 data2 = bb[track][annotation] | |
| 256 | |
| 257 # note that fold changes can be very large if there are 0 samples | |
| 258 # this is fine for getting the distributional params (mean, stddev) | |
| 259 fold_changes1 = data1.observed / (data1.samples + pseudo_count ) | |
| 260 fold_changes2 = data2.observed / (data2.samples + pseudo_count ) | |
| 261 | |
| 262 # add a separate fc pseudo-count to avoid 0 values | |
| 263 fold_changes1 += 0.0001 | |
| 264 fold_changes2 += 0.0001 | |
| 265 | |
| 266 # Test is if relative fold change rfc is different from 1 | |
| 267 # note: rfc = fc1 / fc2 = obs1 / exp1 * obs2 / exp2 | |
| 268 # = obs1 / obs2 * exp2 / exp1 | |
| 269 # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1 | |
| 270 # | |
| 271 # Convert to log space for easier plotting | |
| 272 # Move the observed fold ratio in order to get an idea of the magnitude | |
| 273 # of the underlying fold change | |
| 274 delta_fold = data2.fold - data1.fold | |
| 275 sampled_delta_fold = numpy.log( fold_changes1 / fold_changes2) + delta_fold | |
| 276 observed_delta_fold = 0.0 + delta_fold | |
| 277 | |
| 278 result = gat.AnnotatorResult( track, annotation, | |
| 279 "na", | |
| 280 observed_delta_fold, | |
| 281 sampled_delta_fold, | |
| 282 reference = None, | |
| 283 pseudo_count = 0 ) | |
| 284 | |
| 285 | |
| 286 results.append( result ) | |
| 287 | |
| 288 | |
| 289 if len(results) == 0: | |
| 290 E.critical( "no results found" ) | |
| 291 E.Stop() | |
| 292 return | |
| 293 | |
| 294 IO.outputResults( results, | |
| 295 options, | |
| 296 gat.AnnotatorResult.headers, | |
| 297 description_header, | |
| 298 description_width, | |
| 299 descriptions, | |
| 300 format_observed = "%6.4f" ) | |
| 301 | |
| 302 IO.plotResults( results, options ) | |
| 303 | |
| 304 ## write footer and output benchmark information. | |
| 305 E.Stop() | |
| 306 | |
| 307 if __name__ == "__main__": | |
| 308 sys.exit( main( sys.argv) ) |
