comparison rlGAT/gat-compare.py @ 11:53487f21c0d5 draft

Uploaded
author fubar
date Thu, 29 Aug 2013 01:57:54 -0400
parents
children
comparison
equal deleted inserted replaced
10:f04dfb37d1bb 11:53487f21c0d5
1 ################################################################################
2 #
3 # MRC FGU Computational Genomics Group
4 #
5 # $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $
6 #
7 # Copyright (C) 2009 Andreas Heger
8 #
9 # This program is free software; you can redistribute it and/or
10 # modify it under the terms of the GNU General Public License
11 # as published by the Free Software Foundation; either version 2
12 # of the License, or (at your option) any later version.
13 #
14 # This program is distributed in the hope that it will be useful,
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU General Public License for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with this program; if not, write to the Free Software
21 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
22 #################################################################################
23 '''
24 gat-compare - compare two gat runs
25 ==================================
26
27 :Author: Andreas Heger
28 :Release: $Id$
29 :Date: |today|
30 :Tags: Python
31
32 Purpose
33 -------
34
35 This script compares gat runs and compares statistical significance of fold change differences between two or more
36 regions of interest.
37
38 This script requires counts files created by gat (option ``--output-counts-file``).
39
40 The reported observed value between sets 1 and 2 is the difference of fc2 and fc1:
41
42 * observed = fold2 - fold1
43
44 The script accepts one or more output files from a gat run.
45
46 If only a single file is given, the significance of fold change differences are computed
47 for each pairwise comparison of annotations. Thus, the script might answer the question
48 if a transcription factor is more enriched in promotors than in enhancers.
49
50 If multiple files are given, the script computes the fold change differences for the same
51 annotation for all pairwise combinations of :term:`segments of interest` across the different
52 files. Thus, the script might answer the question if transcription factor A is more enriched
53 in promotors than transcription factor B is enriched in promotors.
54
55 Usage
56 -----
57
58 Example::
59
60 python gat-compare.py tfA.counts.tsv.gz tfB.counts.tsv.gz
61
62 Type::
63
64 python gat-compare.py --help
65
66 for command line help.
67
68 Documentation
69 -------------
70
71 Code
72 ----
73
74 '''
75
76 import os, sys, re, optparse, collections, types, glob, time, math
77 import itertools
78 import numpy
79
80 import gat
81 import gat.Experiment as E
82 import gat.IOTools as IOTools
83 import gat.IO as IO
84
85 def main( argv = None ):
86 """script main.
87
88 parses command line options in sys.argv, unless *argv* is given.
89 """
90
91 if not argv: argv = sys.argv
92
93 # setup command line parser
94 parser = optparse.OptionParser( version = "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
95 usage = globals()["__doc__"] )
96
97 parser.add_option("-o", "--order", dest="output_order", type="choice",
98 choices = ( "track", "annotation", "fold", "pvalue", "qvalue", "observed" ),
99 help="order results in output by fold, track, etc. [default=%default]." )
100
101 parser.add_option("-p", "--pvalue-method", dest="pvalue_method", type="choice",
102 choices = ( "empirical", "norm", ),
103 help="type of pvalue reported [default=%default]." )
104
105 parser.add_option("-q", "--qvalue-method", dest="qvalue_method", type="choice",
106 choices = ( "storey", "BH", "bonferroni", "holm", "hommel", "hochberg", "BY", "none" ),
107 help="method to perform multiple testing correction by controlling the fdr [default=%default]." )
108
109 parser.add_option( "--qvalue-lambda", dest="qvalue_lambda", type="float",
110 help="fdr computation: lambda [default=%default]." )
111
112 parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
113 choices = ("smoother", "bootstrap" ),
114 help="fdr computation: method for estimating pi0 [default=%default]." )
115
116 parser.add_option( "--descriptions", dest="input_filename_descriptions", type="string",
117 help="filename mapping annotation terms to descriptions. "
118 " if given, the output table will contain additional columns "
119 " [default=%default]" )
120
121 parser.add_option( "--pseudo-count", dest="pseudo_count", type="float",
122 help="pseudo count. The pseudo count is added to both the observed and expected overlap. "
123 " Using a pseudo-count avoids gat reporting fold changes of 0 [default=%default]." )
124
125 parser.add_option( "--output-plots-pattern", dest="output_plots_pattern", type="string",
126 help="output pattern for plots [default=%default]" )
127
128 parser.set_defaults(
129 pvalue_method = "empirical",
130 qvalue_method = "BH",
131 qvalue_lambda = None,
132 qvalue_pi0_method = "smoother",
133 # pseudo count for fold change computation to avoid 0 fc
134 pseudo_count = 1.0,
135 output_order = "observed",
136 )
137
138 ## add common options (-h/--help, ...) and parse command line
139 (options, args) = E.Start( parser, argv = argv, add_output_options = True )
140
141 input_filenames_counts = args
142
143 ##################################################
144 E.info( "received %i filenames with counts" % len(input_filenames_counts))
145
146 ##################################################
147 description_header, descriptions, description_width = IO.readDescriptions( options )
148
149 all_annotator_results = []
150
151 for input_filename_counts in input_filenames_counts:
152
153 E.info("processing %s" % input_filename_counts )
154
155 annotator_results = gat.fromCounts( input_filename_counts )
156
157 ##################################################
158 if options.pvalue_method != "empirical":
159 E.info("updating pvalues to %s" % options.pvalue_method )
160 gat.updatePValues( annotator_results, options.pvalue_method )
161
162 ##################################################
163 ##################################################
164 ##################################################
165 ## compute global fdr
166 ##################################################
167 E.info( "computing FDR statistics" )
168 gat.updateQValues( annotator_results,
169 method = options.qvalue_method,
170 vlambda = options.qvalue_lambda,
171 pi0_method = options.qvalue_pi0_method )
172
173 all_annotator_results.append( annotator_results )
174
175 pseudo_count = options.pseudo_count
176 results = []
177
178 if len(all_annotator_results) == 1:
179 E.info("performing pairwise comparison within a file")
180
181 # collect all annotations
182 annotations, segments = list(), set()
183 for x in all_annotator_results[0]:
184 segments.add( x.track)
185 annotations.append( x )
186
187 if len(segments) != 1:
188 raise NotImplementedError( "multiple segments of interest")
189
190 for data1, data2 in itertools.combinations(annotations, 2):
191
192 # note that fold changes can be very large if there are 0 samples
193 # this is fine for getting the distributional params (mean, stddev)
194 fold_changes1 = data1.observed / (data1.samples + pseudo_count )
195 fold_changes2 = data2.observed / (data2.samples + pseudo_count )
196
197 # add a separate fc pseudo-count to avoid 0 values
198 fold_changes1 += 0.0001
199 fold_changes2 += 0.0001
200
201 # Test is if relative fold change rfc is different from 1
202 # note: rfc = fc1 / fc2 = obs1 / exp1 * obs2 / exp2
203 # = obs1 / obs2 * exp2 / exp1
204 # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1
205 #
206 # Convert to log space for easier plotting
207 # Move the observed fold ratio in order to get an idea of the magnitude
208 # of the underlying fold change
209 delta_fold = data2.fold - data1.fold
210 sampled_delta_fold = numpy.log( fold_changes1 / fold_changes2) + delta_fold
211 observed_delta_fold = 0.0 + delta_fold
212
213 result = gat.AnnotatorResult( data1.annotation, data2.annotation,
214 "na",
215 observed_delta_fold,
216 sampled_delta_fold,
217 reference = None,
218 pseudo_count = 0 )
219
220
221 results.append( result )
222
223
224 else:
225 E.info("performing pairwise comparison between files")
226
227 ##################################################
228 # perform pairwise comparison
229 for index1, index2 in itertools.combinations( range(len(input_filenames_counts)), 2):
230 E.info( "comparing %i and %i" % (index1,index2))
231 a,b = all_annotator_results[index1], all_annotator_results[index2]
232
233 # index results in a and b
234 aa = collections.defaultdict( dict )
235 for x in a: aa[x.track][x.annotation] = x
236
237 bb = collections.defaultdict( dict )
238 for x in b: bb[x.track][x.annotation] = x
239
240 if len(aa.keys()) != 1 or len(bb.keys()) != 1:
241 raise NotImplementedError( "multiple segments of interest")
242
243 track = "merged"
244 # get shared annotations
245 annotations1 = aa[track].keys()
246 annotations2 = bb[track].keys()
247 shared_annotations = list( set(annotations1).intersection( set(annotations2) ) )
248 E.info( "%i shared annotations" % len(shared_annotations))
249
250 for annotation in shared_annotations:
251
252 # if not annotation.startswith("Ram:"): continue
253
254 data1 = aa[track][annotation]
255 data2 = bb[track][annotation]
256
257 # note that fold changes can be very large if there are 0 samples
258 # this is fine for getting the distributional params (mean, stddev)
259 fold_changes1 = data1.observed / (data1.samples + pseudo_count )
260 fold_changes2 = data2.observed / (data2.samples + pseudo_count )
261
262 # add a separate fc pseudo-count to avoid 0 values
263 fold_changes1 += 0.0001
264 fold_changes2 += 0.0001
265
266 # Test is if relative fold change rfc is different from 1
267 # note: rfc = fc1 / fc2 = obs1 / exp1 * obs2 / exp2
268 # = obs1 / obs2 * exp2 / exp1
269 # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1
270 #
271 # Convert to log space for easier plotting
272 # Move the observed fold ratio in order to get an idea of the magnitude
273 # of the underlying fold change
274 delta_fold = data2.fold - data1.fold
275 sampled_delta_fold = numpy.log( fold_changes1 / fold_changes2) + delta_fold
276 observed_delta_fold = 0.0 + delta_fold
277
278 result = gat.AnnotatorResult( track, annotation,
279 "na",
280 observed_delta_fold,
281 sampled_delta_fold,
282 reference = None,
283 pseudo_count = 0 )
284
285
286 results.append( result )
287
288
289 if len(results) == 0:
290 E.critical( "no results found" )
291 E.Stop()
292 return
293
294 IO.outputResults( results,
295 options,
296 gat.AnnotatorResult.headers,
297 description_header,
298 description_width,
299 descriptions,
300 format_observed = "%6.4f" )
301
302 IO.plotResults( results, options )
303
304 ## write footer and output benchmark information.
305 E.Stop()
306
307 if __name__ == "__main__":
308 sys.exit( main( sys.argv) )