Mercurial > repos > peter-waltman > ucsc_cluster_tools2
diff cluster.tools/determine.IPL.threshold.R @ 0:0decf3fd54bc draft
Uploaded
author | peter-waltman |
---|---|
date | Thu, 28 Feb 2013 01:45:39 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cluster.tools/determine.IPL.threshold.R Thu Feb 28 01:45:39 2013 -0500 @@ -0,0 +1,263 @@ +#!/usr/bin/env Rscript + +##usage, options and doc goes here +argspec <- c("determine.IPL.threshold.R takes an IPL result, and determines a statistically sound threshold to use + + Usage: + determine.IPL.threshold.R -d <IPL_data_file> + Optional: + -o output.rdata ## rdata output file (contains variables used for calculation, for those who want to review them + -f filter type # must be either modulated, active, or inactive + -p percent of samples passing (must be value on [0,1] +\n\n") +args <- commandArgs(TRUE) +if ( length( args ) == 1 && args =="--help") { + write( argspec, stderr() ) + q(); +} + +lib.load.quiet <- function( package ) { + package <- as.character(substitute(package)) + suppressPackageStartupMessages( do.call( "library", list( package=package ) ) ) +} +lib.load.quiet(getopt) + +spec <- matrix( c( "data.fname", "d", 1, "character", + "output.rdata", "o", 2, "character", + "filter.type", "f", 2, "character", + "perc.pass", "p", 2, "numeric", + "selection.criteria", "s", 2, "character", + "output.report.dir", "r", 2, "character", + "output.report.html", "h", 2, "character" + ), + nc=4, + byrow=T + ) +opt <- getopt( spec=spec ) + +## default params for non-required params +if ( is.null( opt$filter.type ) ) { opt$filter.type <- 'modulated' } +if ( is.null( opt$perc.pass ) ) { opt$perc.pass <- 1/3 } +if ( is.null( opt$selection.criteria ) ) { opt$selection.criteria <- 'max_diffs' } +if ( is.null( opt$output.report.dir ) ) { opt$output.report.dir <- "report" } +if ( is.null( opt$output.report.html ) ) { opt$output.report.html <- "report/index.html" } +if ( is.null( opt$output.rdata ) ) { opt$output.rdata <- "output.rdata" } +if ( opt$perc.pass < 0 ) { + stop( "please specify a positive number for the percentage of samples that pass the filter (if applicable)" ) +} + +if (!file.exists(opt$output.report.dir)){ + dir.create(opt$output.report.dir) +} + + +data <- as.matrix( read.delim( opt$data.fname, row.names=1, check.names=FALSE ) ) +genes <- rownames( data ) +genes <- genes[ !grepl( "abstract|family|complex", genes ) ] +data <- data[ genes, ] + +nulls.mat <- grepl( "na_", colnames( data ) ) +reals <- ! nulls.mat +nulls.mat <- data[ , nulls.mat ] +reals.mat <- data[, reals ] +if ( ncol( nulls.mat ) == 0 ) stop( "no nulls were in the file provided!\n" ) +if ( ncol( reals.mat ) == 0 ) stop( "no reals were in the file provided!\n" ) + + +if ( opt$filter.type == 'modulated' ) { + reals.mat <- abs( reals.mat ) + nulls.mat <- abs( nulls.mat ) +} else { + if ( opt$filter.type == "inactive" ) { + reals.mat <- -reals.mat + nulls.mat <- -nulls.mat + } +} + + +## we only look at the larger 50% of the possible IPL values +## as possible thresholds to use (since the lower 50% are almost +## always uninformative) +thresholds <- unique( quantile( reals.mat, + seq( 0.5, 1, by=0.001 ) ) ) +thresholds <- c( thresholds, + quantile( nulls.mat, + seq( 0.5, 1, by=0.001 ) ) ) +thresholds <- unique( sort( thresholds ) ) + + +get.num.filtered.feats <- function( mat, + threshold, + perc.samples.passing=1/3 ) { + feat.vect <- apply( mat, + 1, + function(x) { + tmp <- sum( x > threshold ) + if ( perc.samples.passing >=1 ) { + return( tmp >= perc.samples.passing ) + } else { + return( tmp > floor( perc.samples.passing * length(x) ) ) + } + } + ) + return( sum( feat.vect ) ) +} + + +real.feats <- null.feats <- length( genes ) +chisq.pvals <- binom.pvals <- numeric() + +for ( i in 1:length( thresholds ) ) { + + nul.feats.this.thresh <- get.num.filtered.feats( mat=nulls.mat, threshold=thresholds[i], perc.samples.passing=opt$perc.pass ) + ## limit the maximum threshold to one where there are at least 75 valid points + ## because if there are fewer nulls than that, it heavily skews the probability + if ( nul.feats.this.thresh < 50 ) break + + null.feats[ i ] <- nul.feats.this.thresh + real.feats[ i ] <- get.num.filtered.feats( mat=reals.mat, threshold=thresholds[i], perc.samples.passing=opt$perc.pass ) + + ## only calculate if there are more real features than nulls, otherwise, give a p-value of 1 + if ( null.feats[i] < real.feats[i] ) { + p <- null.feats[i]/nrow( nulls.mat ) + sd <- ( nrow( nulls.mat ) * p * (1-p ) )^0.5 + + ## binomial test + p <- -pnorm( q=real.feats[i], + mean=null.feats[i], + sd=sd, + log.p=TRUE, + lower.tail=FALSE ) + + ##chisq test + chi <- ( real.feats[i] - null.feats[i] )^2 + chi <- chi/(null.feats[i])^2 + chi <- -pchisq( chi, 1, log.p=TRUE, lower=FALSE ) + } else { + p <- chi <- 0 ## 0 == -log(1) + } + + binom.pvals <- c( binom.pvals, p ) + chisq.pvals <- c( chisq.pvals, chi ) + + if ( length( chisq.pvals ) != i ) { + stop( "lengths differ\n" ) + } +} + + + +##names( binom.pvals ) <- names( chisq.pvals ) <- thresholds +diffs <- real.feats - null.feats +if ( opt$selection.criteria == "max_diffs" ) { + max.diff <- max( diffs ) + opt.thresh <- which( diffs %in% max.diff ) +} else if ( opt$selection.criteria == "binomial" ) { + max.bin <- max( binom.pvals ) + opt.thresh <- which( binom.pvals %in% max.bin ) +} else if ( opt$selection.criteria == "chisq" ) { + max.chi <- max( chisq.pvals ) + opt.thresh <- which( chisq.pvals %in% max.chi ) +} + +opt.thresh <- mean( c( thresholds[ opt.thresh ], thresholds[ (opt.thresh-1) ] ) ) +opt.thresh <- signif( opt.thresh, 4 ) + + +##corrected.binom.pvals <- binom.pvals + log( length(thresholds) ) +##binom.pvals <- binom.pvals - log( length(thresholds) ) +##corrected.chisq.pvals <- chisq.pvals + log( length(thresholds) ) +##chisq.pvals <- chisq.pvals - log( length(thresholds) ) + + +eval.thresh <- thresholds[ 1:length( real.feats ) ] +##plot.new(); screens <- split.screen( c( 4,1 ) ) +##postscript( "threshold.comparison.ps", paper='letter', horizontal=F ) +##png.fname <- file.path( opt$output.report.dir, "IPL.threshold.determination.png") +##plot.dev <- png( png.fname, +## width=11, +## height=8.5, +## units='in', +## res=72 ) +##par( mar=rep(0,4) ) +##screens <- split.screen( c( 4,1 ) ) + +png.fname <- file.path( opt$output.report.dir, "01.num.feats.IPL.threshold.determination.png") +plot.dev <- png( png.fname, + width=11, + height=8.5, + units='in', + res=72 ) +par( mar=c(2.25,3,1.5,0.5) ) +plot( eval.thresh, null.feats, type='l', lwd=2, col='blue', cex.axis=0.75 ) +lines( eval.thresh, real.feats, type='l', lwd=2, col='black', cex.axis=0.75 ) +abline( v=opt.thresh ) +legend( "topright", c( "Real", "Null" ), lwd=2, col=c('black', 'blue' ) ) +mtext( "Number of Genes Passing Threshold", font=2 ) +mtext( "IPL Threshold", 1, font=2, line=1.5 ) +mtext( "Number of Genes", 2, font=2, line=1.8 ) +dev.off() + + +png.fname <- file.path( opt$output.report.dir, "02.diffs.IPL.threshold.determination.png") +plot.dev <- png( png.fname, + width=11, + height=8.5, + units='in', + res=72 ) +##screen( screen()+1 ) +par( mar=c(2.25,3,1.5,0.5) ) +plot( eval.thresh, diffs, type='l', lwd=2, col='black', cex.axis=0.75 ) +abline( v=opt.thresh ) +mtext( "Difference between number of Real & Null genes passing Threshold", font=2 ) +mtext( "IPL Threshold", 1, font=2, line=1.5 ) +mtext( "Number of Genes", 2, font=2, line=1.8 ) +dev.off() + + + +png.fname <- file.path( opt$output.report.dir, "03.chisq.IPL.threshold.determination.png") +plot.dev <- png( png.fname, + width=11, + height=8.5, + units='in', + res=72 ) +##screen( screen()+1 ) +par( mar=c(2.25,3,1.5,0.5) ) +plot( eval.thresh, chisq.pvals, type='l', lwd=2, col='red', cex.axis=0.75 ) +abline( v=opt.thresh ) +mtext( "Chi-sq p-values", font=2 ) +mtext( "IPL Threshold", 1, font=2, line=1.5 ) +mtext( "-Log p-value", 2, font=2, line=1.8 ) +dev.off() + + + +png.fname <- file.path( opt$output.report.dir, "04.binom.IPL.threshold.determination.png") +plot.dev <- png( png.fname, + width=11, + height=8.5, + units='in', + res=72 ) +##screen( screen()+1 ) +par( mar=c(2.25,3,1.5,0.5) ) +plot( eval.thresh, binom.pvals, type='l', lwd=2, col='green', cex.axis=0.75 ) +abline( v=opt.thresh ) +mtext( "Binomial p-values", font=2 ) +mtext( "IPL Threshold", 1, font=2, line=1.5 ) +mtext( "-Log p-value", 2, font=2, line=1.8 ) +dev.off() + +##close.screen( all=T ); dev.off() + +report_str = paste( "The threshold to use for consensus clustering filtering is ", opt.thresh, "\n", sep="" ) + +pngs = list.files(path=opt$output.report.dir, patt="png") +html.out <- paste( "<html>", report_str, + paste( paste( paste( "<div><img src=\'", pngs, sep="" ), "\'/></div>", sep="" ), collapse=""), + "</html>" ) +cat( html.out, file=opt$output.report.html ) + +filter.type <- opt$filter.type +perc.pass <- opt$perc.pass +save( file=opt$output.rdata, thresholds, diffs, binom.pvals, chisq.pvals, real.feats, null.feats, data, filter.type, perc.pass, opt.thresh )