|
11
|
1 ################################################################################
|
|
|
2 #
|
|
|
3 # MRC FGU Computational Genomics Group
|
|
|
4 #
|
|
|
5 # $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $
|
|
|
6 #
|
|
|
7 # Copyright (C) 2009 Andreas Heger
|
|
|
8 #
|
|
|
9 # This program is free software; you can redistribute it and/or
|
|
|
10 # modify it under the terms of the GNU General Public License
|
|
|
11 # as published by the Free Software Foundation; either version 2
|
|
|
12 # of the License, or (at your option) any later version.
|
|
|
13 #
|
|
|
14 # This program is distributed in the hope that it will be useful,
|
|
|
15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
17 # GNU General Public License for more details.
|
|
|
18 #
|
|
|
19 # You should have received a copy of the GNU General Public License
|
|
|
20 # along with this program; if not, write to the Free Software
|
|
|
21 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
|
|
22 #################################################################################
|
|
|
23 '''
|
|
|
24 gat-compare - compare two gat runs
|
|
|
25 ==================================
|
|
|
26
|
|
|
27 :Author: Andreas Heger
|
|
|
28 :Release: $Id$
|
|
|
29 :Date: |today|
|
|
|
30 :Tags: Python
|
|
|
31
|
|
|
32 Purpose
|
|
|
33 -------
|
|
|
34
|
|
|
35 This script compares gat runs and compares statistical significance of fold change differences between two or more
|
|
|
36 regions of interest.
|
|
|
37
|
|
|
38 This script requires counts files created by gat (option ``--output-counts-file``).
|
|
|
39
|
|
|
40 The reported observed value between sets 1 and 2 is the difference of fc2 and fc1:
|
|
|
41
|
|
|
42 * observed = fold2 - fold1
|
|
|
43
|
|
|
44 The script accepts one or more output files from a gat run.
|
|
|
45
|
|
|
46 If only a single file is given, the significance of fold change differences are computed
|
|
|
47 for each pairwise comparison of annotations. Thus, the script might answer the question
|
|
|
48 if a transcription factor is more enriched in promotors than in enhancers.
|
|
|
49
|
|
|
50 If multiple files are given, the script computes the fold change differences for the same
|
|
|
51 annotation for all pairwise combinations of :term:`segments of interest` across the different
|
|
|
52 files. Thus, the script might answer the question if transcription factor A is more enriched
|
|
|
53 in promotors than transcription factor B is enriched in promotors.
|
|
|
54
|
|
|
55 Usage
|
|
|
56 -----
|
|
|
57
|
|
|
58 Example::
|
|
|
59
|
|
|
60 python gat-compare.py tfA.counts.tsv.gz tfB.counts.tsv.gz
|
|
|
61
|
|
|
62 Type::
|
|
|
63
|
|
|
64 python gat-compare.py --help
|
|
|
65
|
|
|
66 for command line help.
|
|
|
67
|
|
|
68 Documentation
|
|
|
69 -------------
|
|
|
70
|
|
|
71 Code
|
|
|
72 ----
|
|
|
73
|
|
|
74 '''
|
|
|
75
|
|
|
76 import os, sys, re, optparse, collections, types, glob, time, math
|
|
|
77 import itertools
|
|
|
78 import numpy
|
|
|
79
|
|
|
80 import gat
|
|
|
81 import gat.Experiment as E
|
|
|
82 import gat.IOTools as IOTools
|
|
|
83 import gat.IO as IO
|
|
|
84
|
|
|
85 def main( argv = None ):
|
|
|
86 """script main.
|
|
|
87
|
|
|
88 parses command line options in sys.argv, unless *argv* is given.
|
|
|
89 """
|
|
|
90
|
|
|
91 if not argv: argv = sys.argv
|
|
|
92
|
|
|
93 # setup command line parser
|
|
|
94 parser = optparse.OptionParser( version = "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
|
|
|
95 usage = globals()["__doc__"] )
|
|
|
96
|
|
|
97 parser.add_option("-o", "--order", dest="output_order", type="choice",
|
|
|
98 choices = ( "track", "annotation", "fold", "pvalue", "qvalue", "observed" ),
|
|
|
99 help="order results in output by fold, track, etc. [default=%default]." )
|
|
|
100
|
|
|
101 parser.add_option("-p", "--pvalue-method", dest="pvalue_method", type="choice",
|
|
|
102 choices = ( "empirical", "norm", ),
|
|
|
103 help="type of pvalue reported [default=%default]." )
|
|
|
104
|
|
|
105 parser.add_option("-q", "--qvalue-method", dest="qvalue_method", type="choice",
|
|
|
106 choices = ( "storey", "BH", "bonferroni", "holm", "hommel", "hochberg", "BY", "none" ),
|
|
|
107 help="method to perform multiple testing correction by controlling the fdr [default=%default]." )
|
|
|
108
|
|
|
109 parser.add_option( "--qvalue-lambda", dest="qvalue_lambda", type="float",
|
|
|
110 help="fdr computation: lambda [default=%default]." )
|
|
|
111
|
|
|
112 parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
|
|
|
113 choices = ("smoother", "bootstrap" ),
|
|
|
114 help="fdr computation: method for estimating pi0 [default=%default]." )
|
|
|
115
|
|
|
116 parser.add_option( "--descriptions", dest="input_filename_descriptions", type="string",
|
|
|
117 help="filename mapping annotation terms to descriptions. "
|
|
|
118 " if given, the output table will contain additional columns "
|
|
|
119 " [default=%default]" )
|
|
|
120
|
|
|
121 parser.add_option( "--pseudo-count", dest="pseudo_count", type="float",
|
|
|
122 help="pseudo count. The pseudo count is added to both the observed and expected overlap. "
|
|
|
123 " Using a pseudo-count avoids gat reporting fold changes of 0 [default=%default]." )
|
|
|
124
|
|
|
125 parser.add_option( "--output-plots-pattern", dest="output_plots_pattern", type="string",
|
|
|
126 help="output pattern for plots [default=%default]" )
|
|
|
127
|
|
|
128 parser.set_defaults(
|
|
|
129 pvalue_method = "empirical",
|
|
|
130 qvalue_method = "BH",
|
|
|
131 qvalue_lambda = None,
|
|
|
132 qvalue_pi0_method = "smoother",
|
|
|
133 # pseudo count for fold change computation to avoid 0 fc
|
|
|
134 pseudo_count = 1.0,
|
|
|
135 output_order = "observed",
|
|
|
136 )
|
|
|
137
|
|
|
138 ## add common options (-h/--help, ...) and parse command line
|
|
|
139 (options, args) = E.Start( parser, argv = argv, add_output_options = True )
|
|
|
140
|
|
|
141 input_filenames_counts = args
|
|
|
142
|
|
|
143 ##################################################
|
|
|
144 E.info( "received %i filenames with counts" % len(input_filenames_counts))
|
|
|
145
|
|
|
146 ##################################################
|
|
|
147 description_header, descriptions, description_width = IO.readDescriptions( options )
|
|
|
148
|
|
|
149 all_annotator_results = []
|
|
|
150
|
|
|
151 for input_filename_counts in input_filenames_counts:
|
|
|
152
|
|
|
153 E.info("processing %s" % input_filename_counts )
|
|
|
154
|
|
|
155 annotator_results = gat.fromCounts( input_filename_counts )
|
|
|
156
|
|
|
157 ##################################################
|
|
|
158 if options.pvalue_method != "empirical":
|
|
|
159 E.info("updating pvalues to %s" % options.pvalue_method )
|
|
|
160 gat.updatePValues( annotator_results, options.pvalue_method )
|
|
|
161
|
|
|
162 ##################################################
|
|
|
163 ##################################################
|
|
|
164 ##################################################
|
|
|
165 ## compute global fdr
|
|
|
166 ##################################################
|
|
|
167 E.info( "computing FDR statistics" )
|
|
|
168 gat.updateQValues( annotator_results,
|
|
|
169 method = options.qvalue_method,
|
|
|
170 vlambda = options.qvalue_lambda,
|
|
|
171 pi0_method = options.qvalue_pi0_method )
|
|
|
172
|
|
|
173 all_annotator_results.append( annotator_results )
|
|
|
174
|
|
|
175 pseudo_count = options.pseudo_count
|
|
|
176 results = []
|
|
|
177
|
|
|
178 if len(all_annotator_results) == 1:
|
|
|
179 E.info("performing pairwise comparison within a file")
|
|
|
180
|
|
|
181 # collect all annotations
|
|
|
182 annotations, segments = list(), set()
|
|
|
183 for x in all_annotator_results[0]:
|
|
|
184 segments.add( x.track)
|
|
|
185 annotations.append( x )
|
|
|
186
|
|
|
187 if len(segments) != 1:
|
|
|
188 raise NotImplementedError( "multiple segments of interest")
|
|
|
189
|
|
|
190 for data1, data2 in itertools.combinations(annotations, 2):
|
|
|
191
|
|
|
192 # note that fold changes can be very large if there are 0 samples
|
|
|
193 # this is fine for getting the distributional params (mean, stddev)
|
|
|
194 fold_changes1 = data1.observed / (data1.samples + pseudo_count )
|
|
|
195 fold_changes2 = data2.observed / (data2.samples + pseudo_count )
|
|
|
196
|
|
|
197 # add a separate fc pseudo-count to avoid 0 values
|
|
|
198 fold_changes1 += 0.0001
|
|
|
199 fold_changes2 += 0.0001
|
|
|
200
|
|
|
201 # Test is if relative fold change rfc is different from 1
|
|
|
202 # note: rfc = fc1 / fc2 = obs1 / exp1 * obs2 / exp2
|
|
|
203 # = obs1 / obs2 * exp2 / exp1
|
|
|
204 # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1
|
|
|
205 #
|
|
|
206 # Convert to log space for easier plotting
|
|
|
207 # Move the observed fold ratio in order to get an idea of the magnitude
|
|
|
208 # of the underlying fold change
|
|
|
209 delta_fold = data2.fold - data1.fold
|
|
|
210 sampled_delta_fold = numpy.log( fold_changes1 / fold_changes2) + delta_fold
|
|
|
211 observed_delta_fold = 0.0 + delta_fold
|
|
|
212
|
|
|
213 result = gat.AnnotatorResult( data1.annotation, data2.annotation,
|
|
|
214 "na",
|
|
|
215 observed_delta_fold,
|
|
|
216 sampled_delta_fold,
|
|
|
217 reference = None,
|
|
|
218 pseudo_count = 0 )
|
|
|
219
|
|
|
220
|
|
|
221 results.append( result )
|
|
|
222
|
|
|
223
|
|
|
224 else:
|
|
|
225 E.info("performing pairwise comparison between files")
|
|
|
226
|
|
|
227 ##################################################
|
|
|
228 # perform pairwise comparison
|
|
|
229 for index1, index2 in itertools.combinations( range(len(input_filenames_counts)), 2):
|
|
|
230 E.info( "comparing %i and %i" % (index1,index2))
|
|
|
231 a,b = all_annotator_results[index1], all_annotator_results[index2]
|
|
|
232
|
|
|
233 # index results in a and b
|
|
|
234 aa = collections.defaultdict( dict )
|
|
|
235 for x in a: aa[x.track][x.annotation] = x
|
|
|
236
|
|
|
237 bb = collections.defaultdict( dict )
|
|
|
238 for x in b: bb[x.track][x.annotation] = x
|
|
|
239
|
|
|
240 if len(aa.keys()) != 1 or len(bb.keys()) != 1:
|
|
|
241 raise NotImplementedError( "multiple segments of interest")
|
|
|
242
|
|
|
243 track = "merged"
|
|
|
244 # get shared annotations
|
|
|
245 annotations1 = aa[track].keys()
|
|
|
246 annotations2 = bb[track].keys()
|
|
|
247 shared_annotations = list( set(annotations1).intersection( set(annotations2) ) )
|
|
|
248 E.info( "%i shared annotations" % len(shared_annotations))
|
|
|
249
|
|
|
250 for annotation in shared_annotations:
|
|
|
251
|
|
|
252 # if not annotation.startswith("Ram:"): continue
|
|
|
253
|
|
|
254 data1 = aa[track][annotation]
|
|
|
255 data2 = bb[track][annotation]
|
|
|
256
|
|
|
257 # note that fold changes can be very large if there are 0 samples
|
|
|
258 # this is fine for getting the distributional params (mean, stddev)
|
|
|
259 fold_changes1 = data1.observed / (data1.samples + pseudo_count )
|
|
|
260 fold_changes2 = data2.observed / (data2.samples + pseudo_count )
|
|
|
261
|
|
|
262 # add a separate fc pseudo-count to avoid 0 values
|
|
|
263 fold_changes1 += 0.0001
|
|
|
264 fold_changes2 += 0.0001
|
|
|
265
|
|
|
266 # Test is if relative fold change rfc is different from 1
|
|
|
267 # note: rfc = fc1 / fc2 = obs1 / exp1 * obs2 / exp2
|
|
|
268 # = obs1 / obs2 * exp2 / exp1
|
|
|
269 # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1
|
|
|
270 #
|
|
|
271 # Convert to log space for easier plotting
|
|
|
272 # Move the observed fold ratio in order to get an idea of the magnitude
|
|
|
273 # of the underlying fold change
|
|
|
274 delta_fold = data2.fold - data1.fold
|
|
|
275 sampled_delta_fold = numpy.log( fold_changes1 / fold_changes2) + delta_fold
|
|
|
276 observed_delta_fold = 0.0 + delta_fold
|
|
|
277
|
|
|
278 result = gat.AnnotatorResult( track, annotation,
|
|
|
279 "na",
|
|
|
280 observed_delta_fold,
|
|
|
281 sampled_delta_fold,
|
|
|
282 reference = None,
|
|
|
283 pseudo_count = 0 )
|
|
|
284
|
|
|
285
|
|
|
286 results.append( result )
|
|
|
287
|
|
|
288
|
|
|
289 if len(results) == 0:
|
|
|
290 E.critical( "no results found" )
|
|
|
291 E.Stop()
|
|
|
292 return
|
|
|
293
|
|
|
294 IO.outputResults( results,
|
|
|
295 options,
|
|
|
296 gat.AnnotatorResult.headers,
|
|
|
297 description_header,
|
|
|
298 description_width,
|
|
|
299 descriptions,
|
|
|
300 format_observed = "%6.4f" )
|
|
|
301
|
|
|
302 IO.plotResults( results, options )
|
|
|
303
|
|
|
304 ## write footer and output benchmark information.
|
|
|
305 E.Stop()
|
|
|
306
|
|
|
307 if __name__ == "__main__":
|
|
|
308 sys.exit( main( sys.argv) )
|