diff cluster.tools/ipl.feature.selection.R @ 2:b442996b66ae draft

Uploaded
author peter-waltman
date Wed, 27 Feb 2013 20:17:04 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/ipl.feature.selection.R	Wed Feb 27 20:17:04 2013 -0500
@@ -0,0 +1,156 @@
+#!/usr/bin/env Rscript
+## IPL selection script by Peter Waltman
+## August 21, 2011
+## License under Creative Commons Attribution 3.0 Unported (CC BY 3.0)
+##
+#usage, options and doc goes here
+argspec <- c("ipl.feature.selection.R takes a set of results from Paradigm, and filters for features that are
+active, inactive or modulated above a given IPL threshold over a sufficient percentage of samples.
+
+        Usage: 
+                ipl.feature.selection.R -d <data.file> 
+        Optional:
+                -o <output.name>
+                -g <genes-only>   ## to set if only returning genes (default is all features)
+                -f <filter.type>       ## filter.type must be either 'modulated', 'active'or 'inactive' (default is modulated)
+                -t <threshold>    ## the threshold to use for the filter (default is 0.25)
+                -p <perc.pass>    ## the percentage of samples that must pass the filter (default is 0.33)
+                -v <verbose>      ## to set verbose on
+
+                \n\n")
+args <- commandArgs(TRUE)
+if ( length( args ) == 1 && args =="--help") { 
+  write(argspec, stderr())
+  q();
+}
+
+lib.load.quiet <- function( package ) {
+   package <- as.character(substitute(package))
+   suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
+}
+lib.load.quiet(getopt)
+
+spec <- matrix( c( "data.fname",         "d", 1, "character",
+                   "output.name",        "o", 2, "character",
+                   "genes.only",         "g", 0, "logical",
+                   "filter.type",        "f", 2, "character", ## must be either 'active', 'inactive' or 'modulated'
+                   "threshold",          "t", 2, "numeric",
+                   "empirical.fname",    "e", 2, "character",
+                   "perc.pass",          "p", 2, "numeric",
+                   "verbose",            "v", 0, "logical",    ## to set verbose on
+                   "help",               "h", 0, "logical"
+                   ),
+                nc=4,
+                byrow=TRUE
+               )
+
+opt <- getopt( spec=spec )
+##save.image( "~/work.local/tmp/ipl.feature.sel.dbg.rda" )
+#set some reasonable defaults for the options that are needed,
+#but were not specified.
+if ( is.null(opt$verbose ) ) { opt$verbose = FALSE }
+if ( is.null(opt$genes.only ) ) {
+  opt$genes.only <- FALSE
+}
+
+if ( is.null(opt$filter.type ) ) { opt$filter.type = 'modulated' }
+if ( is.null( opt$threshold ) ) { opt$threshold=0.25 }
+if ( is.null( opt$perc.pass ) ) { opt$perc.pass=1/3 }
+if ( opt$perc.pass < 0  ) {
+  stop( "please specify a positive number for the percentage of samples that pass the filter (if applicable)" )
+}
+## now set filter.type, threshold & perc.pass if an empirical result has been passed in
+if ( ! is.null( opt$empirical.fname ) ) {
+
+  if ( ! file.exists( opt$empirical.fname ) ) stop( "can't file empirical result file:", opt$empirical.fname, "\n" )
+  ## assume this is an RData file
+  emp.fname.contents <- load( opt$empirical.fname )
+  if ( ! "opt.thresh" %in% emp.fname.contents ) stop( "no optimal threshold value found in RData file passed in\n" )
+  opt$threshold <- opt.thresh
+
+  if ( ! "filter.type" %in% emp.fname.contents ) stop( "no filter type value found in RData file passed in\n" )
+  opt$filter.type <- filter.type
+
+  if ( ! "perc.pass" %in% emp.fname.contents ) stop( "no percentage passing value found in RData file passed in\n" )
+  opt$perc.pass <- perc.pass
+}
+if ( ! opt$filter.type %in% c( 'active', 'inactive', 'modulated' ) ) stop( 'invalid filter.type specified:', opt$filter.type, "\n" )
+if ( is.null( opt$output.name ) ) {
+  opt$output.name <- file.path( getwd(),
+                                paste( opt$filter.type, basename( opt$data.fname ), sep="." ) )
+}
+
+
+
+data <- as.matrix( read.delim( opt$data.fname, header=T, row.names=1 , check.names=FALSE ) )
+if ( opt$genes.only ) {
+  genes <- rownames( data )
+  genes <- genes[ ! grepl( "abstract|complex|family", genes ) ]
+  data <- data[ genes, ]
+}
+
+
+count.samps.threshold <- function( data,
+                                   threshold,
+                                   comparator ## must be one of lte, lt, gt, gte
+                                    ) {
+  filter.vect <- rep( TRUE, nrow( data ) ) ## set an initial val
+  if ( comparator == "lt" ) {
+    return( apply( data,
+                   1,
+                   function(x) sum( x < threshold, na.rm=T ) ) )
+  }
+  if ( comparator == "lte" ) {
+    return( apply( data,
+                   1,
+                   function(x) sum( x <= threshold, na.rm=T ) ) )
+  }
+  if ( comparator == "gte" ) {
+    return( apply( data,
+                   1,
+                   function(x) sum( x >= threshold, na.rm=T ) ) )
+  }
+  if ( comparator == "gt" ) {
+    return( apply( data,
+                   1,
+                   function(x) sum( x > threshold, na.rm=T ) ) )
+  }
+  if ( comparator == "bothe" ) {
+    return( apply( data,
+                   1,
+                   function(x) sum( abs(x) >= threshold, na.rm=T ) ) )
+  }
+  if ( comparator == "both" ) {
+    return( apply( data,
+                   1,
+                   function(x) sum( abs(x) > threshold, na.rm=T ) ) )
+  }
+}
+
+
+
+
+if ( opt$filter.type=="active" ) {
+  ## this is an implementation of the activity filter that was used in the original PARADIGM paper
+  filter.vect <- count.samps.threshold( data, opt$threshold, "gt" )
+} else {
+  if ( opt$filter.type=="inactive" ) {
+    filter.vect <- count.samps.threshold( data, -opt$threshold, "lt" )
+  } else {
+    if ( opt$filter.type=="modulated" ) {
+      filter.vect <- count.samps.threshold( data, opt$threshold, "both" )
+    } else {
+      stop( "invalid filter.type specified: ", opt$filter.type )
+    }
+  }
+}
+
+if ( opt$perc.pass <1 ) {
+  filter.vect <- filter.vect > floor( ncol( data ) * opt$perc.pass )
+} else {
+  filter.vect <- filter.vect >= opt$perc.pass
+}
+data <- data[ filter.vect, ]
+
+write.table( data, opt$output.name, sep="\t", row.names=TRUE, col.names=NA, quote=FALSE )
+