0
|
1 #!/usr/bin/env Rscript
|
|
2
|
|
3 ##usage, options and doc goes here
|
|
4 argspec <- c("determine.IPL.threshold.R takes an IPL result, and determines a statistically sound threshold to use
|
|
5
|
|
6 Usage:
|
|
7 determine.IPL.threshold.R -d <IPL_data_file>
|
|
8 Optional:
|
|
9 -o output.rdata ## rdata output file (contains variables used for calculation, for those who want to review them
|
|
10 -f filter type # must be either modulated, active, or inactive
|
|
11 -p percent of samples passing (must be value on [0,1]
|
|
12 \n\n")
|
|
13 args <- commandArgs(TRUE)
|
|
14 if ( length( args ) == 1 && args =="--help") {
|
|
15 write( argspec, stderr() )
|
|
16 q();
|
|
17 }
|
|
18
|
|
19 lib.load.quiet <- function( package ) {
|
|
20 package <- as.character(substitute(package))
|
|
21 suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
|
|
22 }
|
|
23 lib.load.quiet(getopt)
|
|
24
|
|
25 spec <- matrix( c( "data.fname", "d", 1, "character",
|
|
26 "output.rdata", "o", 2, "character",
|
|
27 "filter.type", "f", 2, "character",
|
|
28 "perc.pass", "p", 2, "numeric",
|
|
29 "selection.criteria", "s", 2, "character",
|
|
30 "output.report.dir", "r", 2, "character",
|
|
31 "output.report.html", "h", 2, "character"
|
|
32 ),
|
|
33 nc=4,
|
|
34 byrow=T
|
|
35 )
|
|
36 opt <- getopt( spec=spec )
|
|
37
|
|
38 ## default params for non-required params
|
|
39 if ( is.null( opt$filter.type ) ) { opt$filter.type <- 'modulated' }
|
|
40 if ( is.null( opt$perc.pass ) ) { opt$perc.pass <- 1/3 }
|
|
41 if ( is.null( opt$selection.criteria ) ) { opt$selection.criteria <- 'max_diffs' }
|
|
42 if ( is.null( opt$output.report.dir ) ) { opt$output.report.dir <- "report" }
|
|
43 if ( is.null( opt$output.report.html ) ) { opt$output.report.html <- "report/index.html" }
|
|
44 if ( is.null( opt$output.rdata ) ) { opt$output.rdata <- "output.rdata" }
|
|
45 if ( opt$perc.pass < 0 ) {
|
|
46 stop( "please specify a positive number for the percentage of samples that pass the filter (if applicable)" )
|
|
47 }
|
|
48
|
|
49 if (!file.exists(opt$output.report.dir)){
|
|
50 dir.create(opt$output.report.dir)
|
|
51 }
|
|
52
|
|
53
|
|
54 data <- as.matrix( read.delim( opt$data.fname, row.names=1, check.names=FALSE ) )
|
|
55 genes <- rownames( data )
|
|
56 genes <- genes[ !grepl( "abstract|family|complex", genes ) ]
|
|
57 data <- data[ genes, ]
|
|
58
|
|
59 nulls.mat <- grepl( "na_", colnames( data ) )
|
|
60 reals <- ! nulls.mat
|
|
61 nulls.mat <- data[ , nulls.mat ]
|
|
62 reals.mat <- data[, reals ]
|
|
63 if ( ncol( nulls.mat ) == 0 ) stop( "no nulls were in the file provided!\n" )
|
|
64 if ( ncol( reals.mat ) == 0 ) stop( "no reals were in the file provided!\n" )
|
|
65
|
|
66
|
|
67 if ( opt$filter.type == 'modulated' ) {
|
|
68 reals.mat <- abs( reals.mat )
|
|
69 nulls.mat <- abs( nulls.mat )
|
|
70 } else {
|
|
71 if ( opt$filter.type == "inactive" ) {
|
|
72 reals.mat <- -reals.mat
|
|
73 nulls.mat <- -nulls.mat
|
|
74 }
|
|
75 }
|
|
76
|
|
77
|
|
78 ## we only look at the larger 50% of the possible IPL values
|
|
79 ## as possible thresholds to use (since the lower 50% are almost
|
|
80 ## always uninformative)
|
|
81 thresholds <- unique( quantile( reals.mat,
|
|
82 seq( 0.5, 1, by=0.001 ) ) )
|
|
83 thresholds <- c( thresholds,
|
|
84 quantile( nulls.mat,
|
|
85 seq( 0.5, 1, by=0.001 ) ) )
|
|
86 thresholds <- unique( sort( thresholds ) )
|
|
87
|
|
88
|
|
89 get.num.filtered.feats <- function( mat,
|
|
90 threshold,
|
|
91 perc.samples.passing=1/3 ) {
|
|
92 feat.vect <- apply( mat,
|
|
93 1,
|
|
94 function(x) {
|
|
95 tmp <- sum( x > threshold )
|
|
96 if ( perc.samples.passing >=1 ) {
|
|
97 return( tmp >= perc.samples.passing )
|
|
98 } else {
|
|
99 return( tmp > floor( perc.samples.passing * length(x) ) )
|
|
100 }
|
|
101 }
|
|
102 )
|
|
103 return( sum( feat.vect ) )
|
|
104 }
|
|
105
|
|
106
|
|
107 real.feats <- null.feats <- length( genes )
|
|
108 chisq.pvals <- binom.pvals <- numeric()
|
|
109
|
|
110 for ( i in 1:length( thresholds ) ) {
|
|
111
|
|
112 nul.feats.this.thresh <- get.num.filtered.feats( mat=nulls.mat, threshold=thresholds[i], perc.samples.passing=opt$perc.pass )
|
|
113 ## limit the maximum threshold to one where there are at least 75 valid points
|
|
114 ## because if there are fewer nulls than that, it heavily skews the probability
|
|
115 if ( nul.feats.this.thresh < 50 ) break
|
|
116
|
|
117 null.feats[ i ] <- nul.feats.this.thresh
|
|
118 real.feats[ i ] <- get.num.filtered.feats( mat=reals.mat, threshold=thresholds[i], perc.samples.passing=opt$perc.pass )
|
|
119
|
|
120 ## only calculate if there are more real features than nulls, otherwise, give a p-value of 1
|
|
121 if ( null.feats[i] < real.feats[i] ) {
|
|
122 p <- null.feats[i]/nrow( nulls.mat )
|
|
123 sd <- ( nrow( nulls.mat ) * p * (1-p ) )^0.5
|
|
124
|
|
125 ## binomial test
|
|
126 p <- -pnorm( q=real.feats[i],
|
|
127 mean=null.feats[i],
|
|
128 sd=sd,
|
|
129 log.p=TRUE,
|
|
130 lower.tail=FALSE )
|
|
131
|
|
132 ##chisq test
|
|
133 chi <- ( real.feats[i] - null.feats[i] )^2
|
|
134 chi <- chi/(null.feats[i])^2
|
|
135 chi <- -pchisq( chi, 1, log.p=TRUE, lower=FALSE )
|
|
136 } else {
|
|
137 p <- chi <- 0 ## 0 == -log(1)
|
|
138 }
|
|
139
|
|
140 binom.pvals <- c( binom.pvals, p )
|
|
141 chisq.pvals <- c( chisq.pvals, chi )
|
|
142
|
|
143 if ( length( chisq.pvals ) != i ) {
|
|
144 stop( "lengths differ\n" )
|
|
145 }
|
|
146 }
|
|
147
|
|
148
|
|
149
|
|
150 ##names( binom.pvals ) <- names( chisq.pvals ) <- thresholds
|
|
151 diffs <- real.feats - null.feats
|
|
152 if ( opt$selection.criteria == "max_diffs" ) {
|
|
153 max.diff <- max( diffs )
|
|
154 opt.thresh <- which( diffs %in% max.diff )
|
|
155 } else if ( opt$selection.criteria == "binomial" ) {
|
|
156 max.bin <- max( binom.pvals )
|
|
157 opt.thresh <- which( binom.pvals %in% max.bin )
|
|
158 } else if ( opt$selection.criteria == "chisq" ) {
|
|
159 max.chi <- max( chisq.pvals )
|
|
160 opt.thresh <- which( chisq.pvals %in% max.chi )
|
|
161 }
|
|
162
|
|
163 opt.thresh <- mean( c( thresholds[ opt.thresh ], thresholds[ (opt.thresh-1) ] ) )
|
|
164 opt.thresh <- signif( opt.thresh, 4 )
|
|
165
|
|
166
|
|
167 ##corrected.binom.pvals <- binom.pvals + log( length(thresholds) )
|
|
168 ##binom.pvals <- binom.pvals - log( length(thresholds) )
|
|
169 ##corrected.chisq.pvals <- chisq.pvals + log( length(thresholds) )
|
|
170 ##chisq.pvals <- chisq.pvals - log( length(thresholds) )
|
|
171
|
|
172
|
|
173 eval.thresh <- thresholds[ 1:length( real.feats ) ]
|
|
174 ##plot.new(); screens <- split.screen( c( 4,1 ) )
|
|
175 ##postscript( "threshold.comparison.ps", paper='letter', horizontal=F )
|
|
176 ##png.fname <- file.path( opt$output.report.dir, "IPL.threshold.determination.png")
|
|
177 ##plot.dev <- png( png.fname,
|
|
178 ## width=11,
|
|
179 ## height=8.5,
|
|
180 ## units='in',
|
|
181 ## res=72 )
|
|
182 ##par( mar=rep(0,4) )
|
|
183 ##screens <- split.screen( c( 4,1 ) )
|
|
184
|
|
185 png.fname <- file.path( opt$output.report.dir, "01.num.feats.IPL.threshold.determination.png")
|
|
186 plot.dev <- png( png.fname,
|
|
187 width=11,
|
|
188 height=8.5,
|
|
189 units='in',
|
|
190 res=72 )
|
|
191 par( mar=c(2.25,3,1.5,0.5) )
|
|
192 plot( eval.thresh, null.feats, type='l', lwd=2, col='blue', cex.axis=0.75 )
|
|
193 lines( eval.thresh, real.feats, type='l', lwd=2, col='black', cex.axis=0.75 )
|
|
194 abline( v=opt.thresh )
|
|
195 legend( "topright", c( "Real", "Null" ), lwd=2, col=c('black', 'blue' ) )
|
|
196 mtext( "Number of Genes Passing Threshold", font=2 )
|
|
197 mtext( "IPL Threshold", 1, font=2, line=1.5 )
|
|
198 mtext( "Number of Genes", 2, font=2, line=1.8 )
|
|
199 dev.off()
|
|
200
|
|
201
|
|
202 png.fname <- file.path( opt$output.report.dir, "02.diffs.IPL.threshold.determination.png")
|
|
203 plot.dev <- png( png.fname,
|
|
204 width=11,
|
|
205 height=8.5,
|
|
206 units='in',
|
|
207 res=72 )
|
|
208 ##screen( screen()+1 )
|
|
209 par( mar=c(2.25,3,1.5,0.5) )
|
|
210 plot( eval.thresh, diffs, type='l', lwd=2, col='black', cex.axis=0.75 )
|
|
211 abline( v=opt.thresh )
|
|
212 mtext( "Difference between number of Real & Null genes passing Threshold", font=2 )
|
|
213 mtext( "IPL Threshold", 1, font=2, line=1.5 )
|
|
214 mtext( "Number of Genes", 2, font=2, line=1.8 )
|
|
215 dev.off()
|
|
216
|
|
217
|
|
218
|
|
219 png.fname <- file.path( opt$output.report.dir, "03.chisq.IPL.threshold.determination.png")
|
|
220 plot.dev <- png( png.fname,
|
|
221 width=11,
|
|
222 height=8.5,
|
|
223 units='in',
|
|
224 res=72 )
|
|
225 ##screen( screen()+1 )
|
|
226 par( mar=c(2.25,3,1.5,0.5) )
|
|
227 plot( eval.thresh, chisq.pvals, type='l', lwd=2, col='red', cex.axis=0.75 )
|
|
228 abline( v=opt.thresh )
|
|
229 mtext( "Chi-sq p-values", font=2 )
|
|
230 mtext( "IPL Threshold", 1, font=2, line=1.5 )
|
|
231 mtext( "-Log p-value", 2, font=2, line=1.8 )
|
|
232 dev.off()
|
|
233
|
|
234
|
|
235
|
|
236 png.fname <- file.path( opt$output.report.dir, "04.binom.IPL.threshold.determination.png")
|
|
237 plot.dev <- png( png.fname,
|
|
238 width=11,
|
|
239 height=8.5,
|
|
240 units='in',
|
|
241 res=72 )
|
|
242 ##screen( screen()+1 )
|
|
243 par( mar=c(2.25,3,1.5,0.5) )
|
|
244 plot( eval.thresh, binom.pvals, type='l', lwd=2, col='green', cex.axis=0.75 )
|
|
245 abline( v=opt.thresh )
|
|
246 mtext( "Binomial p-values", font=2 )
|
|
247 mtext( "IPL Threshold", 1, font=2, line=1.5 )
|
|
248 mtext( "-Log p-value", 2, font=2, line=1.8 )
|
|
249 dev.off()
|
|
250
|
|
251 ##close.screen( all=T ); dev.off()
|
|
252
|
|
253 report_str = paste( "The threshold to use for consensus clustering filtering is ", opt.thresh, "\n", sep="" )
|
|
254
|
|
255 pngs = list.files(path=opt$output.report.dir, patt="png")
|
|
256 html.out <- paste( "<html>", report_str,
|
|
257 paste( paste( paste( "<div><img src=\'", pngs, sep="" ), "\'/></div>", sep="" ), collapse=""),
|
|
258 "</html>" )
|
|
259 cat( html.out, file=opt$output.report.html )
|
|
260
|
|
261 filter.type <- opt$filter.type
|
|
262 perc.pass <- opt$perc.pass
|
|
263 save( file=opt$output.rdata, thresholds, diffs, binom.pvals, chisq.pvals, real.feats, null.feats, data, filter.type, perc.pass, opt.thresh )
|