Mercurial > repos > peter-waltman > ucsc_cluster_tools2
comparison cluster.tools/determine.IPL.threshold.R @ 0:0decf3fd54bc draft
Uploaded
| author | peter-waltman |
|---|---|
| date | Thu, 28 Feb 2013 01:45:39 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:0decf3fd54bc |
|---|---|
| 1 #!/usr/bin/env Rscript | |
| 2 | |
| 3 ##usage, options and doc goes here | |
| 4 argspec <- c("determine.IPL.threshold.R takes an IPL result, and determines a statistically sound threshold to use | |
| 5 | |
| 6 Usage: | |
| 7 determine.IPL.threshold.R -d <IPL_data_file> | |
| 8 Optional: | |
| 9 -o output.rdata ## rdata output file (contains variables used for calculation, for those who want to review them | |
| 10 -f filter type # must be either modulated, active, or inactive | |
| 11 -p percent of samples passing (must be value on [0,1] | |
| 12 \n\n") | |
| 13 args <- commandArgs(TRUE) | |
| 14 if ( length( args ) == 1 && args =="--help") { | |
| 15 write( argspec, stderr() ) | |
| 16 q(); | |
| 17 } | |
| 18 | |
| 19 lib.load.quiet <- function( package ) { | |
| 20 package <- as.character(substitute(package)) | |
| 21 suppressPackageStartupMessages( do.call( "library", list( package=package ) ) ) | |
| 22 } | |
| 23 lib.load.quiet(getopt) | |
| 24 | |
| 25 spec <- matrix( c( "data.fname", "d", 1, "character", | |
| 26 "output.rdata", "o", 2, "character", | |
| 27 "filter.type", "f", 2, "character", | |
| 28 "perc.pass", "p", 2, "numeric", | |
| 29 "selection.criteria", "s", 2, "character", | |
| 30 "output.report.dir", "r", 2, "character", | |
| 31 "output.report.html", "h", 2, "character" | |
| 32 ), | |
| 33 nc=4, | |
| 34 byrow=T | |
| 35 ) | |
| 36 opt <- getopt( spec=spec ) | |
| 37 | |
| 38 ## default params for non-required params | |
| 39 if ( is.null( opt$filter.type ) ) { opt$filter.type <- 'modulated' } | |
| 40 if ( is.null( opt$perc.pass ) ) { opt$perc.pass <- 1/3 } | |
| 41 if ( is.null( opt$selection.criteria ) ) { opt$selection.criteria <- 'max_diffs' } | |
| 42 if ( is.null( opt$output.report.dir ) ) { opt$output.report.dir <- "report" } | |
| 43 if ( is.null( opt$output.report.html ) ) { opt$output.report.html <- "report/index.html" } | |
| 44 if ( is.null( opt$output.rdata ) ) { opt$output.rdata <- "output.rdata" } | |
| 45 if ( opt$perc.pass < 0 ) { | |
| 46 stop( "please specify a positive number for the percentage of samples that pass the filter (if applicable)" ) | |
| 47 } | |
| 48 | |
| 49 if (!file.exists(opt$output.report.dir)){ | |
| 50 dir.create(opt$output.report.dir) | |
| 51 } | |
| 52 | |
| 53 | |
| 54 data <- as.matrix( read.delim( opt$data.fname, row.names=1, check.names=FALSE ) ) | |
| 55 genes <- rownames( data ) | |
| 56 genes <- genes[ !grepl( "abstract|family|complex", genes ) ] | |
| 57 data <- data[ genes, ] | |
| 58 | |
| 59 nulls.mat <- grepl( "na_", colnames( data ) ) | |
| 60 reals <- ! nulls.mat | |
| 61 nulls.mat <- data[ , nulls.mat ] | |
| 62 reals.mat <- data[, reals ] | |
| 63 if ( ncol( nulls.mat ) == 0 ) stop( "no nulls were in the file provided!\n" ) | |
| 64 if ( ncol( reals.mat ) == 0 ) stop( "no reals were in the file provided!\n" ) | |
| 65 | |
| 66 | |
| 67 if ( opt$filter.type == 'modulated' ) { | |
| 68 reals.mat <- abs( reals.mat ) | |
| 69 nulls.mat <- abs( nulls.mat ) | |
| 70 } else { | |
| 71 if ( opt$filter.type == "inactive" ) { | |
| 72 reals.mat <- -reals.mat | |
| 73 nulls.mat <- -nulls.mat | |
| 74 } | |
| 75 } | |
| 76 | |
| 77 | |
| 78 ## we only look at the larger 50% of the possible IPL values | |
| 79 ## as possible thresholds to use (since the lower 50% are almost | |
| 80 ## always uninformative) | |
| 81 thresholds <- unique( quantile( reals.mat, | |
| 82 seq( 0.5, 1, by=0.001 ) ) ) | |
| 83 thresholds <- c( thresholds, | |
| 84 quantile( nulls.mat, | |
| 85 seq( 0.5, 1, by=0.001 ) ) ) | |
| 86 thresholds <- unique( sort( thresholds ) ) | |
| 87 | |
| 88 | |
| 89 get.num.filtered.feats <- function( mat, | |
| 90 threshold, | |
| 91 perc.samples.passing=1/3 ) { | |
| 92 feat.vect <- apply( mat, | |
| 93 1, | |
| 94 function(x) { | |
| 95 tmp <- sum( x > threshold ) | |
| 96 if ( perc.samples.passing >=1 ) { | |
| 97 return( tmp >= perc.samples.passing ) | |
| 98 } else { | |
| 99 return( tmp > floor( perc.samples.passing * length(x) ) ) | |
| 100 } | |
| 101 } | |
| 102 ) | |
| 103 return( sum( feat.vect ) ) | |
| 104 } | |
| 105 | |
| 106 | |
| 107 real.feats <- null.feats <- length( genes ) | |
| 108 chisq.pvals <- binom.pvals <- numeric() | |
| 109 | |
| 110 for ( i in 1:length( thresholds ) ) { | |
| 111 | |
| 112 nul.feats.this.thresh <- get.num.filtered.feats( mat=nulls.mat, threshold=thresholds[i], perc.samples.passing=opt$perc.pass ) | |
| 113 ## limit the maximum threshold to one where there are at least 75 valid points | |
| 114 ## because if there are fewer nulls than that, it heavily skews the probability | |
| 115 if ( nul.feats.this.thresh < 50 ) break | |
| 116 | |
| 117 null.feats[ i ] <- nul.feats.this.thresh | |
| 118 real.feats[ i ] <- get.num.filtered.feats( mat=reals.mat, threshold=thresholds[i], perc.samples.passing=opt$perc.pass ) | |
| 119 | |
| 120 ## only calculate if there are more real features than nulls, otherwise, give a p-value of 1 | |
| 121 if ( null.feats[i] < real.feats[i] ) { | |
| 122 p <- null.feats[i]/nrow( nulls.mat ) | |
| 123 sd <- ( nrow( nulls.mat ) * p * (1-p ) )^0.5 | |
| 124 | |
| 125 ## binomial test | |
| 126 p <- -pnorm( q=real.feats[i], | |
| 127 mean=null.feats[i], | |
| 128 sd=sd, | |
| 129 log.p=TRUE, | |
| 130 lower.tail=FALSE ) | |
| 131 | |
| 132 ##chisq test | |
| 133 chi <- ( real.feats[i] - null.feats[i] )^2 | |
| 134 chi <- chi/(null.feats[i])^2 | |
| 135 chi <- -pchisq( chi, 1, log.p=TRUE, lower=FALSE ) | |
| 136 } else { | |
| 137 p <- chi <- 0 ## 0 == -log(1) | |
| 138 } | |
| 139 | |
| 140 binom.pvals <- c( binom.pvals, p ) | |
| 141 chisq.pvals <- c( chisq.pvals, chi ) | |
| 142 | |
| 143 if ( length( chisq.pvals ) != i ) { | |
| 144 stop( "lengths differ\n" ) | |
| 145 } | |
| 146 } | |
| 147 | |
| 148 | |
| 149 | |
| 150 ##names( binom.pvals ) <- names( chisq.pvals ) <- thresholds | |
| 151 diffs <- real.feats - null.feats | |
| 152 if ( opt$selection.criteria == "max_diffs" ) { | |
| 153 max.diff <- max( diffs ) | |
| 154 opt.thresh <- which( diffs %in% max.diff ) | |
| 155 } else if ( opt$selection.criteria == "binomial" ) { | |
| 156 max.bin <- max( binom.pvals ) | |
| 157 opt.thresh <- which( binom.pvals %in% max.bin ) | |
| 158 } else if ( opt$selection.criteria == "chisq" ) { | |
| 159 max.chi <- max( chisq.pvals ) | |
| 160 opt.thresh <- which( chisq.pvals %in% max.chi ) | |
| 161 } | |
| 162 | |
| 163 opt.thresh <- mean( c( thresholds[ opt.thresh ], thresholds[ (opt.thresh-1) ] ) ) | |
| 164 opt.thresh <- signif( opt.thresh, 4 ) | |
| 165 | |
| 166 | |
| 167 ##corrected.binom.pvals <- binom.pvals + log( length(thresholds) ) | |
| 168 ##binom.pvals <- binom.pvals - log( length(thresholds) ) | |
| 169 ##corrected.chisq.pvals <- chisq.pvals + log( length(thresholds) ) | |
| 170 ##chisq.pvals <- chisq.pvals - log( length(thresholds) ) | |
| 171 | |
| 172 | |
| 173 eval.thresh <- thresholds[ 1:length( real.feats ) ] | |
| 174 ##plot.new(); screens <- split.screen( c( 4,1 ) ) | |
| 175 ##postscript( "threshold.comparison.ps", paper='letter', horizontal=F ) | |
| 176 ##png.fname <- file.path( opt$output.report.dir, "IPL.threshold.determination.png") | |
| 177 ##plot.dev <- png( png.fname, | |
| 178 ## width=11, | |
| 179 ## height=8.5, | |
| 180 ## units='in', | |
| 181 ## res=72 ) | |
| 182 ##par( mar=rep(0,4) ) | |
| 183 ##screens <- split.screen( c( 4,1 ) ) | |
| 184 | |
| 185 png.fname <- file.path( opt$output.report.dir, "01.num.feats.IPL.threshold.determination.png") | |
| 186 plot.dev <- png( png.fname, | |
| 187 width=11, | |
| 188 height=8.5, | |
| 189 units='in', | |
| 190 res=72 ) | |
| 191 par( mar=c(2.25,3,1.5,0.5) ) | |
| 192 plot( eval.thresh, null.feats, type='l', lwd=2, col='blue', cex.axis=0.75 ) | |
| 193 lines( eval.thresh, real.feats, type='l', lwd=2, col='black', cex.axis=0.75 ) | |
| 194 abline( v=opt.thresh ) | |
| 195 legend( "topright", c( "Real", "Null" ), lwd=2, col=c('black', 'blue' ) ) | |
| 196 mtext( "Number of Genes Passing Threshold", font=2 ) | |
| 197 mtext( "IPL Threshold", 1, font=2, line=1.5 ) | |
| 198 mtext( "Number of Genes", 2, font=2, line=1.8 ) | |
| 199 dev.off() | |
| 200 | |
| 201 | |
| 202 png.fname <- file.path( opt$output.report.dir, "02.diffs.IPL.threshold.determination.png") | |
| 203 plot.dev <- png( png.fname, | |
| 204 width=11, | |
| 205 height=8.5, | |
| 206 units='in', | |
| 207 res=72 ) | |
| 208 ##screen( screen()+1 ) | |
| 209 par( mar=c(2.25,3,1.5,0.5) ) | |
| 210 plot( eval.thresh, diffs, type='l', lwd=2, col='black', cex.axis=0.75 ) | |
| 211 abline( v=opt.thresh ) | |
| 212 mtext( "Difference between number of Real & Null genes passing Threshold", font=2 ) | |
| 213 mtext( "IPL Threshold", 1, font=2, line=1.5 ) | |
| 214 mtext( "Number of Genes", 2, font=2, line=1.8 ) | |
| 215 dev.off() | |
| 216 | |
| 217 | |
| 218 | |
| 219 png.fname <- file.path( opt$output.report.dir, "03.chisq.IPL.threshold.determination.png") | |
| 220 plot.dev <- png( png.fname, | |
| 221 width=11, | |
| 222 height=8.5, | |
| 223 units='in', | |
| 224 res=72 ) | |
| 225 ##screen( screen()+1 ) | |
| 226 par( mar=c(2.25,3,1.5,0.5) ) | |
| 227 plot( eval.thresh, chisq.pvals, type='l', lwd=2, col='red', cex.axis=0.75 ) | |
| 228 abline( v=opt.thresh ) | |
| 229 mtext( "Chi-sq p-values", font=2 ) | |
| 230 mtext( "IPL Threshold", 1, font=2, line=1.5 ) | |
| 231 mtext( "-Log p-value", 2, font=2, line=1.8 ) | |
| 232 dev.off() | |
| 233 | |
| 234 | |
| 235 | |
| 236 png.fname <- file.path( opt$output.report.dir, "04.binom.IPL.threshold.determination.png") | |
| 237 plot.dev <- png( png.fname, | |
| 238 width=11, | |
| 239 height=8.5, | |
| 240 units='in', | |
| 241 res=72 ) | |
| 242 ##screen( screen()+1 ) | |
| 243 par( mar=c(2.25,3,1.5,0.5) ) | |
| 244 plot( eval.thresh, binom.pvals, type='l', lwd=2, col='green', cex.axis=0.75 ) | |
| 245 abline( v=opt.thresh ) | |
| 246 mtext( "Binomial p-values", font=2 ) | |
| 247 mtext( "IPL Threshold", 1, font=2, line=1.5 ) | |
| 248 mtext( "-Log p-value", 2, font=2, line=1.8 ) | |
| 249 dev.off() | |
| 250 | |
| 251 ##close.screen( all=T ); dev.off() | |
| 252 | |
| 253 report_str = paste( "The threshold to use for consensus clustering filtering is ", opt.thresh, "\n", sep="" ) | |
| 254 | |
| 255 pngs = list.files(path=opt$output.report.dir, patt="png") | |
| 256 html.out <- paste( "<html>", report_str, | |
| 257 paste( paste( paste( "<div><img src=\'", pngs, sep="" ), "\'/></div>", sep="" ), collapse=""), | |
| 258 "</html>" ) | |
| 259 cat( html.out, file=opt$output.report.html ) | |
| 260 | |
| 261 filter.type <- opt$filter.type | |
| 262 perc.pass <- opt$perc.pass | |
| 263 save( file=opt$output.rdata, thresholds, diffs, binom.pvals, chisq.pvals, real.feats, null.feats, data, filter.type, perc.pass, opt.thresh ) |
