Mercurial > repos > peter-waltman > ucsc_cluster_tools2

diff cluster.tools/determine.IPL.threshold.R @ 0:0decf3fd54bc draft
Uploaded
author: peter-waltman
date: Thu, 28 Feb 2013 01:45:39 -0500
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/determine.IPL.threshold.R	Thu Feb 28 01:45:39 2013 -0500
@@ -0,0 +1,263 @@
+#!/usr/bin/env Rscript
+
+##usage, options and doc goes here
+argspec <- c("determine.IPL.threshold.R takes an IPL result, and determines a statistically sound threshold to use
+
+        Usage: 
+                determine.IPL.threshold.R -d <IPL_data_file>
+        Optional:
+                -o output.rdata  ## rdata output file (contains variables used for calculation, for those who want to review them
+                -f filter type # must be either modulated, active, or inactive
+                -p percent of samples passing (must be value on [0,1]
+\n\n")
+args <- commandArgs(TRUE)
+if ( length( args ) == 1 && args =="--help") { 
+  write( argspec, stderr() )
+  q();
+}
+
+lib.load.quiet <- function( package ) {
+   package <- as.character(substitute(package))
+   suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
+}
+lib.load.quiet(getopt)
+
+spec <- matrix( c( "data.fname",          "d", 1, "character",
+                   "output.rdata",        "o", 2, "character",
+                   "filter.type",         "f", 2, "character",
+                   "perc.pass",           "p", 2, "numeric",
+                   "selection.criteria",  "s", 2, "character",
+                   "output.report.dir",   "r", 2, "character",
+                   "output.report.html",  "h", 2, "character"
+                  ),
+                nc=4,
+                byrow=T
+               )
+opt <- getopt( spec=spec )
+
+## default params for non-required params
+if ( is.null( opt$filter.type ) ) { opt$filter.type <- 'modulated' }
+if ( is.null( opt$perc.pass ) ) { opt$perc.pass <- 1/3 }
+if ( is.null( opt$selection.criteria ) ) { opt$selection.criteria <- 'max_diffs' }
+if ( is.null( opt$output.report.dir ) ) { opt$output.report.dir <- "report" }
+if ( is.null( opt$output.report.html ) ) { opt$output.report.html <- "report/index.html" }
+if ( is.null( opt$output.rdata ) ) { opt$output.rdata <- "output.rdata" }
+if ( opt$perc.pass < 0  ) {
+  stop( "please specify a positive number for the percentage of samples that pass the filter (if applicable)" )
+}
+
+if (!file.exists(opt$output.report.dir)){
+    dir.create(opt$output.report.dir)
+}
+
+                               
+data <- as.matrix( read.delim( opt$data.fname, row.names=1, check.names=FALSE ) )
+genes <- rownames( data )
+genes <- genes[ !grepl( "abstract|family|complex", genes ) ]
+data <- data[ genes, ]
+
+nulls.mat <- grepl( "na_", colnames( data ) )
+reals <- ! nulls.mat
+nulls.mat <- data[ , nulls.mat ]
+reals.mat <- data[, reals ]
+if ( ncol( nulls.mat ) == 0 ) stop( "no nulls were in the file provided!\n" )
+if ( ncol( reals.mat ) == 0 ) stop( "no reals were in the file provided!\n" )
+
+
+if ( opt$filter.type == 'modulated' ) {
+  reals.mat <- abs( reals.mat )
+  nulls.mat <- abs( nulls.mat )
+} else {
+  if ( opt$filter.type == "inactive" ) {
+    reals.mat <- -reals.mat
+    nulls.mat <- -nulls.mat
+  }
+} 
+
+
+##  we only look at the larger 50% of the possible IPL values
+##  as possible  thresholds to use (since the lower 50% are almost
+##  always uninformative)
+thresholds <- unique( quantile( reals.mat,
+                                seq( 0.5, 1, by=0.001 ) ) )
+thresholds <- c( thresholds,
+                 quantile( nulls.mat,
+                           seq( 0.5, 1, by=0.001 ) ) )
+thresholds <- unique( sort( thresholds ) )
+
+
+get.num.filtered.feats <- function( mat,
+                                    threshold,
+                                    perc.samples.passing=1/3 ) {
+  feat.vect <- apply( mat,
+                      1,
+                      function(x) {
+                        tmp <- sum( x > threshold )
+                        if ( perc.samples.passing >=1 ) {
+                          return( tmp >= perc.samples.passing )
+                        } else {
+                          return( tmp > floor( perc.samples.passing * length(x) ) )
+                        }
+                      }
+                     )
+  return( sum( feat.vect ) )
+}
+
+
+real.feats <- null.feats <- length( genes )
+chisq.pvals <- binom.pvals <- numeric()
+
+for ( i in 1:length( thresholds ) ) {
+
+  nul.feats.this.thresh <- get.num.filtered.feats( mat=nulls.mat, threshold=thresholds[i], perc.samples.passing=opt$perc.pass )
+  ## limit the maximum threshold to one where there are at least 75 valid points
+  ##  because if there are fewer nulls than that, it heavily skews the probability
+  if ( nul.feats.this.thresh < 50 ) break
+  
+  null.feats[ i ] <- nul.feats.this.thresh
+  real.feats[ i ] <- get.num.filtered.feats( mat=reals.mat, threshold=thresholds[i], perc.samples.passing=opt$perc.pass )  
+  
+  ## only calculate if there are more real features than nulls, otherwise, give a p-value of 1
+  if ( null.feats[i] < real.feats[i] ) {
+    p <- null.feats[i]/nrow( nulls.mat )
+    sd <- ( nrow( nulls.mat ) * p * (1-p ) )^0.5
+
+    ## binomial test
+    p <- -pnorm( q=real.feats[i],
+                 mean=null.feats[i],
+                 sd=sd,
+                 log.p=TRUE,
+                 lower.tail=FALSE )
+
+    ##chisq test
+    chi <- ( real.feats[i] - null.feats[i] )^2
+    chi <- chi/(null.feats[i])^2
+    chi <- -pchisq( chi, 1, log.p=TRUE, lower=FALSE )
+  } else {
+    p <- chi <- 0 ## 0 == -log(1)
+  }
+
+  binom.pvals <- c( binom.pvals, p )
+  chisq.pvals <- c( chisq.pvals, chi )
+
+  if ( length( chisq.pvals ) != i ) {
+    stop( "lengths differ\n" )
+  }
+}
+
+
+
+##names( binom.pvals ) <- names( chisq.pvals ) <- thresholds
+diffs <- real.feats - null.feats
+if ( opt$selection.criteria == "max_diffs" ) {
+  max.diff <- max( diffs )
+  opt.thresh <- which( diffs %in% max.diff )
+} else if ( opt$selection.criteria == "binomial" ) {
+  max.bin <- max( binom.pvals )
+  opt.thresh <- which( binom.pvals %in% max.bin )
+} else if ( opt$selection.criteria == "chisq" ) {
+  max.chi <- max( chisq.pvals )
+  opt.thresh <- which( chisq.pvals %in% max.chi )
+}
+
+opt.thresh <- mean( c( thresholds[ opt.thresh ], thresholds[ (opt.thresh-1) ] ) )
+opt.thresh <- signif( opt.thresh, 4 )
+
+  
+##corrected.binom.pvals <- binom.pvals + log( length(thresholds) )
+##binom.pvals <- binom.pvals - log( length(thresholds) )
+##corrected.chisq.pvals <- chisq.pvals + log( length(thresholds) )
+##chisq.pvals <- chisq.pvals - log( length(thresholds) )
+
+
+eval.thresh <- thresholds[ 1:length( real.feats ) ]
+##plot.new(); screens <- split.screen( c( 4,1 ) )
+##postscript( "threshold.comparison.ps", paper='letter', horizontal=F )
+##png.fname <- file.path( opt$output.report.dir, "IPL.threshold.determination.png")
+##plot.dev <- png( png.fname,
+##                 width=11,
+##                 height=8.5,
+##                 units='in',
+##                 res=72 )
+##par( mar=rep(0,4) )
+##screens <- split.screen( c( 4,1 ) )
+
+png.fname <- file.path( opt$output.report.dir, "01.num.feats.IPL.threshold.determination.png")
+plot.dev <- png( png.fname,
+                 width=11,
+                 height=8.5,
+                 units='in',
+                 res=72 )
+par( mar=c(2.25,3,1.5,0.5) )
+plot( eval.thresh, null.feats, type='l', lwd=2, col='blue', cex.axis=0.75 )
+lines( eval.thresh, real.feats, type='l', lwd=2, col='black', cex.axis=0.75 )
+abline( v=opt.thresh )
+legend( "topright", c( "Real", "Null" ), lwd=2, col=c('black', 'blue' ) )
+mtext( "Number of Genes Passing Threshold", font=2 )
+mtext( "IPL Threshold", 1, font=2, line=1.5 )
+mtext( "Number of Genes", 2, font=2, line=1.8 )
+dev.off()
+
+
+png.fname <- file.path( opt$output.report.dir, "02.diffs.IPL.threshold.determination.png")
+plot.dev <- png( png.fname,
+                 width=11,
+                 height=8.5,
+                 units='in',
+                 res=72 )
+##screen( screen()+1 )
+par( mar=c(2.25,3,1.5,0.5) )
+plot( eval.thresh, diffs, type='l', lwd=2, col='black', cex.axis=0.75 )
+abline( v=opt.thresh )
+mtext( "Difference between number of Real & Null genes passing Threshold", font=2 )
+mtext( "IPL Threshold", 1, font=2, line=1.5 )
+mtext( "Number of Genes", 2, font=2, line=1.8 )
+dev.off()
+
+
+
+png.fname <- file.path( opt$output.report.dir, "03.chisq.IPL.threshold.determination.png")
+plot.dev <- png( png.fname,
+                 width=11,
+                 height=8.5,
+                 units='in',
+                 res=72 )
+##screen( screen()+1 )
+par( mar=c(2.25,3,1.5,0.5) )
+plot( eval.thresh, chisq.pvals, type='l', lwd=2, col='red', cex.axis=0.75 )
+abline( v=opt.thresh )
+mtext( "Chi-sq p-values", font=2 )
+mtext( "IPL Threshold", 1, font=2, line=1.5 )
+mtext( "-Log p-value", 2, font=2, line=1.8 )
+dev.off()
+
+
+
+png.fname <- file.path( opt$output.report.dir, "04.binom.IPL.threshold.determination.png")
+plot.dev <- png( png.fname,
+                 width=11,
+                 height=8.5,
+                 units='in',
+                 res=72 )
+##screen( screen()+1 )
+par( mar=c(2.25,3,1.5,0.5) )
+plot( eval.thresh, binom.pvals, type='l', lwd=2, col='green', cex.axis=0.75 )
+abline( v=opt.thresh )
+mtext( "Binomial p-values", font=2 )
+mtext( "IPL Threshold", 1, font=2, line=1.5 )
+mtext( "-Log p-value", 2, font=2, line=1.8 )
+dev.off()
+
+##close.screen( all=T ); dev.off()
+
+report_str = paste( "The threshold to use for consensus clustering filtering is ", opt.thresh, "\n", sep="" )
+
+pngs = list.files(path=opt$output.report.dir, patt="png")
+html.out <- paste( "<html>", report_str,
+                   paste( paste( paste( "<div><img src=\'", pngs, sep="" ), "\'/></div>", sep="" ), collapse=""),
+                   "</html>" )
+cat( html.out, file=opt$output.report.html )
+
+filter.type <- opt$filter.type
+perc.pass <- opt$perc.pass
+save( file=opt$output.rdata, thresholds, diffs, binom.pvals, chisq.pvals, real.feats, null.feats, data, filter.type, perc.pass, opt.thresh )
author	peter-waltman
date	Thu, 28 Feb 2013 01:45:39 -0500
parents
children