diff cluster.tools/rnaseq.feature.selection.R @ 1:dddfeedb85af draft

Uploaded
author peter-waltman
date Fri, 01 Mar 2013 10:16:53 -0500
parents
children 563832f48c08
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/rnaseq.feature.selection.R	Fri Mar 01 10:16:53 2013 -0500
@@ -0,0 +1,91 @@
+#!/usr/bin/env Rscript
+## RNASeq selection script by Peter Waltman
+## August 21, 2011
+## License under Creative Commons Attribution 3.0 Unported (CC BY 3.0)
+##
+#usage, options and doc goes here  - NEEDS TO BE UPDATED  ---- TBD
+argspec <- c("ipl.feature.selection.R takes a set of results from Paradigm, and filters for features that are
+active, inactive or modulated above a given IPL threshold over a sufficient percentage of samples.
+
+        Usage: 
+                ipl.feature.selection.R -d <data.file> 
+        Optional:
+                -o <output.name>
+                -g <genes-only>   ## to set if only returning genes (default is all features)
+                -f <filter.type>       ## filter.type must be either 'modulated', 'active'or 'inactive' (default is modulated)
+                -t <threshold>    ## the threshold to use for the filter (default is 0.25)
+                -p <perc.pass>    ## the percentage of samples that must pass the filter (default is 0.33)
+                -v <verbose>      ## to set verbose on
+
+                \n\n")
+args <- commandArgs(TRUE)
+if ( length( args ) == 1 && args =="--help") { 
+  write(argspec, stderr())
+  q();
+}
+
+## some helper fn's
+write.2.tab <- function( mat,
+                         fname ) {
+  mat <- rbind( colnames( mat ), mat )
+  mat <- cbind( c( "ID", rownames( mat )[-1] ),
+                      mat )
+  write.table( mat, fname, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE )
+}
+
+lib.load.quiet <- function( package ) {
+   package <- as.character(substitute(package))
+   suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
+}
+## MAXIMUM absolute deviation function
+maxad <- function( x, na.rm=FALSE, center=median(x, na.rm=na.rm ) ) {
+  return( max( abs(x), na.rm=na.rm ) - center )
+}
+
+lib.load.quiet(getopt)
+
+spec <- matrix( c( "data.fname",         "d", 1, "character",
+                   "output.fname",       "o", 2, "character",
+                   "var.method",         "m", 2, "character", ## must be either 'active', 'inactive' or 'modulated'
+                   "z.transform",        "z", 2, "character",
+                   "perc.pass",          "p", 2, "numeric"
+                   ),
+                nc=4,
+                byrow=TRUE
+               )
+
+opt <- getopt( spec=spec )
+data <- as.matrix( read.delim( opt$data.fname, header=T, row.names=1 , check.names=FALSE ) )
+#set some reasonable defaults for the options that are needed,
+#but were not specified.
+if ( is.null( opt$perc.pass ) ) { opt$perc.pass=1500 }
+if ( opt$perc.pass < 0  ) {
+  stop( "please specify a positive number for the percentage of samples that pass the filter (if applicable)" )
+} else {
+  if ( opt$perc.pass < 1  ) {
+    opt$perc.pass <- floor( opt$perc.pass * nrow( data ) )
+  }
+}
+      
+if ( is.null( opt$var.method ) ) { opt$var.method <- 'mad' }
+if ( is.null( opt$z.transform ) ) {
+  opt$z.transform <- TRUE
+} else {
+  opt$z.transform <- ( tolower( opt$z.transform ) %in% "yes" )
+}
+if ( is.null( opt$output.fname ) ) opt$output.fname <- "high.var.rnaseq.tab"
+
+if ( opt$z.transform ) {
+  ##normalize it to have mean==0, std==1
+  data <- t( data )
+  data <- t( scale( data ) )
+}
+
+
+my.var.fn <- get( opt$var.method )
+my.var.vals <- apply( data, 1, my.var.fn, na.rm=T )
+filter.vect <- sort( my.var.vals, dec=TRUE, index=TRUE )$ix[ 1:opt$perc.pass ]
+
+data <- data[ filter.vect, ]
+write.2.tab( data, opt$output.fname )
+