changeset 1:dddfeedb85af draft

Uploaded
author peter-waltman
date Fri, 01 Mar 2013 10:16:53 -0500
parents 0decf3fd54bc
children b8f262149ee2
files cluster.tools/cutree.xml cluster.tools/extract.TCGA.survival.data.R cluster.tools/extract.TCGA.survival.data.py cluster.tools/extract.TCGA.survival.data.xml cluster.tools/fix.and.merge.TCGA.sample.IDs.R cluster.tools/format.raw.TCGA.RNASeq.data.R cluster.tools/format.raw.TCGA.RNASeq.data.py cluster.tools/format.raw.TCGA.RNASeq.data.xml cluster.tools/ipl.feature.selection.xml cluster.tools/normalize.matrix.R cluster.tools/rnaseq.feature.selection.R cluster.tools/rnaseq.feature.selection.py cluster.tools/rnaseq.feature.selection.xml
diffstat 13 files changed, 579 insertions(+), 7 deletions(-) [+]
line wrap: on
line diff
--- a/cluster.tools/cutree.xml	Thu Feb 28 01:45:39 2013 -0500
+++ b/cluster.tools/cutree.xml	Fri Mar 01 10:16:53 2013 -0500
@@ -13,7 +13,7 @@
         <data format="rdata" name="rdata_output" label="Cutree from Hierarchical Clustering Result (RData)"/>
     </outputs>
 <help>
-- **Cluster Result** - Specify the cluster result to analayze (MUST BE IN rdata format), and must contain the same objects that are produced by the 'Partition Clustering,' 'Hierarchical Clustering (HAC),' or 'Consensus Clustering' tools.  Specifically, it must contain the following objects
+- **Cluster Result** - Specify the cluster result to analayze (MUST BE IN rdata format), and must contain the same objects that are produced by the 'Hierarchical Clustering (HAC),' tool.  Specifically, it must contain the following objects:
 
          * A 'treecl.res' or 'partcl.res' object - corresponding to whether the cluster results is from a partition or tree clustering method
          * A 'data' object that contains the data that was passed into the clustering method.  NOTE, it is better for this to be the actual data passed in, rather than the data prior to the pre-processing that was performed prior to the actual clustering.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/extract.TCGA.survival.data.R	Fri Mar 01 10:16:53 2013 -0500
@@ -0,0 +1,195 @@
+#!/usr/bin/env Rscript
+## 
+## formats raw clinical data from TCGA to contain a single status & time colums
+##
+## Input (required):
+##    - clinical data
+## Input (optional):
+##    - status & time columns: (NOT USED IN THIS SCRIPT - see comment below)
+##         ideally, a better design would allow a user to specify 1 or more columns
+##         to check for the status & time columns - however, due to the necessities
+##         required to pre-process the TCGA clinical data, the script would not be
+##         generalizeable - and for this reason, the TCGA columns are hard-coded.
+##
+## Output: a re-formatted clinical file containing 3 columns: sample-ID, status & time
+##
+## Date: August 21, 2012
+## Author: Peter Waltman
+##
+
+##usage, options and doc goes here
+argspec <- c("format.raw.TCGA.clinical.data.R takes a clustering from ConsensusClusterPlus and clinical survival data
+and generates a KM-plot, along with the log-rank p-values
+
+        Usage: 
+                format.raw.TCGA.clinical.data.R -c <clinical.file> 
+        Options:
+                -o <output file> (tab-delimited (3 col: sample_id <tab> status <tab> time))
+              ")
+args <- commandArgs(TRUE)
+if ( length( args ) == 1 && args =="--help") { 
+  write(argspec, stderr())
+  q();
+}
+
+## some helper fn's
+write.2.tab <- function( mat,
+                         fname ) {
+  mat <- rbind( colnames( mat ), mat )
+  mat <- cbind( c( "ID", rownames( mat )[-1] ),
+                      mat )
+  write.table( mat, fname, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE )
+}
+
+lib.load.quiet <- function( package ) {
+   package <- as.character(substitute(package))
+   suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
+}
+lib.load.quiet(getopt)
+
+spec <- matrix( c( "clinical.fname", "d", 1, "character",  
+                   "output.fname",   "o", 2, "character"
+                  ),
+                ncol=4,
+                byrow=TRUE
+               )
+opt <- getopt( spec=spec )
+
+##set some reasonable defaults for the options that are needed,
+##but were not specified.
+if ( is.null(opt$output.fname ) ) { opt$output.fname <-file.path( getwd(), "formated.TCGA.clinical.data" ) }
+
+##orig.clinical.data <- read.delim( opt$clinical.fname, as.is=TRUE, row.names=1 )
+orig.clinical.data <- read.delim( opt$clinical.fname, as.is=TRUE )
+orig.clinical.data <- unique( orig.clinical.data )
+rownames( orig.clinical.data ) <- orig.clinical.data[,1]
+orig.clinical.data <- orig.clinical.data[, -1 ]
+
+##  ugh, some TCGA data sets have all NAs in the "days_to_..." columns
+if ( "days_to_last_known_alive" %in% colnames( orig.clinical.data ) ) {
+  time.cols <- c( "days_to_death", "days_to_last_followup", "days_to_last_known_alive" )
+} else {
+  time.cols <- c( "days_to_death", "days_to_last_followup"  )
+}
+good.samps <- ! apply( orig.clinical.data[, time.cols ], 1, function(x) all( is.na(x) ) | all( x <= 0, na.rm=T ) )
+
+orig.clinical.data <- orig.clinical.data[ good.samps, ]
+
+if ( is.null(opt$status.column ) ) {
+  status.colname <- "vital_status"
+  if ( status.colname %in% colnames( orig.clinical.data ) ) {
+    opt$status.column <- which( colnames( orig.clinical.data ) %in% status.colname )
+    clinical.data <- orig.clinical.data[ , opt$status.column ]
+  }
+  else {
+    status.colname <- "days_to_death"
+    if ( status.colname %in% colnames( orig.clinical.data ) ) {
+      opt$status.column <- which( colnames( orig.clinical.data ) %in% status.colname )
+      clinical.data <- orig.clinical.data[ , opt$status.column ]
+    }
+    else {
+      stop( "can't find a valid entry with status info - have tried vital_status & days_to_death\n" )
+    }
+  }
+  clinical.data <- as.numeric( ! grepl( "(LIVING|Not)", clinical.data ) )
+}
+if ( is.null(opt$time.column ) ) {
+  time.colname <- "CDE.clinical_time"
+  
+  if ( time.colname %in% colnames( orig.clinical.data ) ) {
+    opt$time.column <- which( colnames( orig.clinical.data ) %in% time.colname )
+    clinical.data <- cbind( clinical.data,
+                           as.numeric( orig.clinical.data[, opt$time.column ] ) )
+  }
+  else {
+    dec.mat <- matrix( NA,
+                       nc=length( time.cols ),
+                       nr=nrow( orig.clinical.data ),
+                       dimnames=list( rownames( orig.clinical.data ),
+                                       time.cols )
+                      )
+    for ( cname in colnames( dec.mat ) ) {
+      if ( cname %in% colnames( orig.clinical.data ) ) {
+        dec.mat[, cname ] <- as.numeric( orig.clinical.data[, cname ] )
+      }
+    }
+                         
+    
+
+    if ( "days_to_last_known_alive" %in% colnames( orig.clinical.data ) ) {
+
+      opt$time.column <- sapply( 1:length( clinical.data ),
+                                 function(i) {
+                                   if ( clinical.data[i] ) {
+                                     ## this is a deceased sample
+                                     return( ifelse( ( !is.na( dec.mat[ i, "days_to_death" ] ) ),
+                                                     dec.mat[ i, "days_to_death" ],
+                                                     ifelse( ( !is.na( dec.mat[ i, "days_to_last_known_alive" ] ) ),
+                                                             dec.mat[ i, "days_to_last_known_alive" ],
+                                                             dec.mat[ i, "days_to_last_followup" ] ) ) )
+                                                   
+                                   }
+                                   else {
+                                     return( max( dec.mat[ i, c( "days_to_last_followup","days_to_last_known_alive") ], na.rm=T ) )
+                                   }
+                                 }
+                                )
+    } else {
+      opt$time.column <- sapply( 1:length( clinical.data ),
+                                 function(i) {
+                                   if ( clinical.data[i] ) {
+                                     ## this is a deceased sample
+                                     return( ifelse( ( !is.na( dec.mat[ i, "days_to_death" ] ) ),
+                                                     dec.mat[ i, "days_to_death" ],
+                                                     dec.mat[ i, "days_to_last_followup" ] ) )
+                                                   
+                                   }
+                                   else {
+                                     return( max( dec.mat[ i, c( "days_to_last_followup") ], na.rm=T ) )
+                                   }
+                                 }
+                                )
+    }
+                                   
+    
+    clinical.data <- cbind( clinical.data,
+                           as.numeric( opt$time.column ) )
+  }
+}
+
+clinical.data <- as.data.frame( clinical.data )
+colnames( clinical.data ) <- c( "status", "time" )
+rownames( clinical.data ) <- rownames( orig.clinical.data )
+
+
+##  check to make sure that the id's are sync'd correctly
+## the default format is to use hyphens to separate the elt's of the name
+## and to only use the 1st 3 elements of the name
+## so we check to see if they're using something else as separators and/or using more than 3 elts
+reformat.ids <- function( ids ) {
+
+  if ( grepl( "TCGA\\.", ids[1] ) ) {
+    ids <- sapply( strsplit( ids, "\\." ), function(x) paste( x[1:3], collapse="-" ) )
+  } else {
+    ## do this just in case there's more than 3 elements to the names
+    if ( grepl( "TCGA-", ids[1] ) ) {
+      ids <- sapply( strsplit( ids, "-" ), function(x) paste( x[1:min( c(3,length(x) ) )], collapse="-" ) )
+    }
+  }
+  return( ids )
+}
+
+
+new.samp.ids <- reformat.ids( rownames( clinical.data ) )
+if ( any( duplicated( new.samp.ids ) ) ) {
+  ## in some cases, we have duplicate sample ids in the raw data after we truncate to
+  ##   the 1st 3 elts in the barcode, so just simplify the data
+  uniqs <- ! duplicated( new.samp.ids )
+  clinical.data <- clinical.data[ uniqs, ]
+  new.samp.ids <- new.samp.ids[ uniqs ]
+}
+  
+rownames( clinical.data ) <- new.samp.ids
+
+write.2.tab( clinical.data, opt$output.fname )
+##write.table( clinical.data, opt$output.fname, sep="\t", quote=FALSE, col.names=NA )
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/extract.TCGA.survival.data.py	Fri Mar 01 10:16:53 2013 -0500
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "extract.TCGA.survival.data.R")
+
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:] 
+
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+if proc.returncode:
+	sys.stderr.write(stderrdata)
+sys.stdout.write(stdoutdata)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/extract.TCGA.survival.data.xml	Fri Mar 01 10:16:53 2013 -0500
@@ -0,0 +1,21 @@
+<tool id="extract_TCGA_survival_data" name="Extract TCGA Survival Data" force_history_refresh="True">
+    <command interpreter="python">extract.TCGA.survival.data.py
+-d $dataset
+-o ${output}
+
+</command>
+    <inputs>
+    	<param name="dataset" type="data" format='tabular' label="Raw Clinical Data"/>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="output" label="Formatted Clinical Data"/>
+    </outputs>
+<help>
+.. class:: infomark
+     
+**Format Raw TCGA Clinical Data** - Tool to convert a raw clinical TCGA data file into a the format expected by the Survival Analysis tools
+
+**OUTPUT:**  A new clinical data file that is a 2 column, tab-delimited file of the format that is expected by the Survival Analysis tools
+
+</help>
+</tool>
--- a/cluster.tools/fix.and.merge.TCGA.sample.IDs.R	Thu Feb 28 01:45:39 2013 -0500
+++ b/cluster.tools/fix.and.merge.TCGA.sample.IDs.R	Fri Mar 01 10:16:53 2013 -0500
@@ -12,6 +12,15 @@
   q();
 }
 
+## some helper fn's
+write.2.tab <- function( mat,
+                         fname ) {
+  mat <- rbind( colnames( mat ), mat )
+  mat <- cbind( c( "ID", rownames( mat )[-1] ),
+                      mat )
+  write.table( mat, fname, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE )
+}
+
 lib.load.quiet <- function( package ) {
    package <- as.character(substitute(package))
    suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
@@ -116,4 +125,4 @@
 
 if ( transpose.back ) data <- t( data )
 
-write.table( data, opt$output.fname, sep="\t", quote=FALSE, col.names=NA )
+write.2.tab( data, opt$output.fname )
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/format.raw.TCGA.RNASeq.data.R	Fri Mar 01 10:16:53 2013 -0500
@@ -0,0 +1,120 @@
+#!/usr/bin/env Rscript
+## 
+## formats raw clinical data from TCGA to contain a single status & time colums
+##
+## Input (required):
+##    - clinical data
+## Input (optional):
+##    - status & time columns: (NOT USED IN THIS SCRIPT - see comment below)
+##         ideally, a better design would allow a user to specify 1 or more columns
+##         to check for the status & time columns - however, due to the necessities
+##         required to pre-process the TCGA clinical data, the script would not be
+##         generalizeable - and for this reason, the TCGA columns are hard-coded.
+##
+## Output: a re-formatted clinical file containing 3 columns: sample-ID, status & time
+##
+## Date: August 21, 2012
+## Author: Peter Waltman
+##
+
+##usage, options and doc goes here
+argspec <- c("format.raw.TCGA.clinical.data.R takes a clustering from ConsensusClusterPlus and clinical survival data
+and generates a KM-plot, along with the log-rank p-values
+
+        Usage: 
+                format.raw.TCGA.clinical.data.R -c <clinical.file> 
+        Options:
+                -o <output file> (tab-delimited (3 col: sample_id <tab> status <tab> time))
+              ")
+args <- commandArgs(TRUE)
+if ( length( args ) == 1 && args =="--help") { 
+  write(argspec, stderr())
+  q();
+}
+
+## some helper fn's
+write.2.tab <- function( mat,
+                         fname ) {
+  mat <- rbind( colnames( mat ), mat )
+  mat <- cbind( c( "ID", rownames( mat )[-1] ),
+                      mat )
+  write.table( mat, fname, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE )
+}
+
+fix.genes <- function( mat ) {
+  ## filter out the unknowns
+  mat <- mat[ ( !  grepl( "^\\?", rownames( mat ) ) ), ]  
+
+  genes <-  rownames( mat )
+  ## select the HUGO name portion of the "gene-names" that are in the TCGA matrices
+  genes <- sapply( strsplit( genes, "\\|" ), function(x) x[1] )
+
+  if ( any( duplicated( genes ) ) ) {
+    dupes <- unique( genes[ duplicated( genes ) ] )
+    no.duped.mat <- which( ! genes %in% dupes )
+    no.duped.mat <- mat[ no.duped.mat, , drop=FALSE ]
+
+    ## now merge in the duplicates
+    for ( dup in dupes ) {
+      duped.mat <- mat[ grepl( paste( "^", dup, sep="" ), rownames( mat ) ), ]
+      duped.mat <- apply( duped.mat, 2, mean, na.rm=TRUE )
+      no.duped.mat <- rbind( no.duped.mat, duped.mat )
+      ## weird bit of follow up to make sure the new row/gene is named correctly
+      rownames( no.duped.mat ) <- c( rownames( no.duped.mat )[ -nrow(no.duped.mat) ],
+                                     paste( dup, "|0000", sep="" ) )
+    }
+    mat <- no.duped.mat; rm( no.duped.mat ); gc()
+  }
+
+  rownames( mat ) <- sapply( strsplit( rownames( mat ), "\\|" ), function(x) x[1] )
+  return( mat )
+}
+
+fix.sample.ids <- function( mat ) {
+  colnames( mat ) <- gsub( "\\.", "-", colnames( mat ) )
+  return( mat )
+}
+
+lib.load.quiet <- function( package ) {
+   package <- as.character(substitute(package))
+   suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
+}
+lib.load.quiet(getopt)
+
+spec <- matrix( c( "dataset",             "d", 1, "character",
+                   "log.tranform",        "l", 2, "character",
+                   "filter.low.variant",  "f", 2, "integer",
+                   "output.fname",        "o", 2, "character"
+                  ),
+                ncol=4,
+                byrow=TRUE
+               )
+opt <- getopt( spec=spec )
+
+##set some reasonable defaults for the options that are needed,
+##but were not specified.
+if ( is.null(opt$output.fname ) ) { opt$output.fname <-file.path( getwd(), "formated.TCGA.RNASeq.data" ) }
+if ( is.null(opt$log.transform ) ) {
+  opt$log.transform <- TRUE
+} else {
+  opt$log.transform <- ( tolower( opt$log.transform ) %in% "yes" )
+}
+
+if ( is.null(opt$filter.low.variant ) ) { opt$log.transform <- 10 } 
+
+
+data <- as.matrix( read.delim( opt$dataset, row.names=1, check.names=F ) )
+data <- fix.genes( data )
+data <- fix.sample.ids( data )
+
+if ( opt$filter.low.variant > 0 ) {
+  rpkm.meds <- apply( data, 1, median, na.rm=TRUE )
+  rpkm.meds <- rpkm.meds >= opt$filter.low.variant
+  data <- data[ rpkm.meds, ]
+}
+
+if ( opt$log.transform ) {
+  data <- log( data + 1 )
+}
+
+write.2.tab( data, opt$output.fname )
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/format.raw.TCGA.RNASeq.data.py	Fri Mar 01 10:16:53 2013 -0500
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "format.raw.TCGA.RNASeq.data.R")
+
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:] 
+
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+if proc.returncode:
+	sys.stderr.write(stderrdata)
+sys.stdout.write(stdoutdata)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/format.raw.TCGA.RNASeq.data.xml	Fri Mar 01 10:16:53 2013 -0500
@@ -0,0 +1,34 @@
+<tool id="format_raw_TCGA_RNASeq_data" name="Format Raw TCGA RNASeq Data" force_history_refresh="True">
+    <command interpreter="python">format.raw.TCGA.RNASeq.data.py
+-d $dataset
+-l {$log_transform}
+-f ${filter_low_variant}
+-o ${output}
+
+</command>
+    <inputs>
+    	<param name="dataset" type="data" format='tabular' label="Raw TCGA RNASeq Data"/>
+        <param name="log_transform" type='select' display="radio" label="Log-transform data?" help="Specify whether or not to log-transform the data matrix (log(x+1))">
+          <option value="no">No</option>
+          <option value="yes" selected='true' >Yes</option>
+        </param>
+        <param name="filter_low_variant" type='integer' label="Filter Threshold for Low-variant genes?" value="10" help="Specify threshold for minimum median value for all genes (-1 to use no filter)">
+          <option value="yes" selected='true' >Yes</option>
+          <option value="no">No</option>
+        </param>
+
+    </inputs>
+    <outputs>
+        <data format="tabular" name="output" label="Formatted RNASeq Data"/>
+    </outputs>
+<help>
+.. class:: infomark
+     
+**Format Raw TCGA RNASeq Data** - Tool to convert a raw RNASeq TCGA data file (a file from either Synapse or Firehose) into a the format expected by the Survival Analysis tools
+
+**Log-transform data?** -Specify whether or not to log-transform the data matrix.  To avoid numeric underflow, this will use log(x+1), where x is the value of the RNASeq data
+
+**OUTPUT:**  A new tab-delimited file containing the log-transformed RNASeq data
+
+</help>
+</tool>
--- a/cluster.tools/ipl.feature.selection.xml	Thu Feb 28 01:45:39 2013 -0500
+++ b/cluster.tools/ipl.feature.selection.xml	Fri Mar 01 10:16:53 2013 -0500
@@ -15,7 +15,7 @@
 
 </command>
     <inputs>
-    	<param name="dataset" type="data" format='tabular' label="Data Set"/>
+    	<param name="dataset" type="data" format='tabular' label="IPL Data Set"/>
     	<param name="genes_only" type="boolean" label="Genes Only (check to set yes)" truevalue="-g" falsevalue="" checked="False" />
     	
 	<conditional name="thresh_format" >
--- a/cluster.tools/normalize.matrix.R	Thu Feb 28 01:45:39 2013 -0500
+++ b/cluster.tools/normalize.matrix.R	Fri Mar 01 10:16:53 2013 -0500
@@ -19,6 +19,15 @@
   q();
 }
 
+## some helper fn's
+write.2.tab <- function( mat,
+                         fname ) {
+  mat <- rbind( colnames( mat ), mat )
+  mat <- cbind( c( "ID", rownames( mat )[-1] ),
+                      mat )
+  write.table( mat, fname, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE )
+}
+
 lib.load.quiet <- function( package ) {
    package <- as.character(substitute(package))
    suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
@@ -56,7 +65,6 @@
 if ( is.null( opt$var.adj.cols ) ) { opt$var.adj.cols <- 'none' }
 
 data <- as.matrix( read.delim( opt$data.fname, header=T, row.names=1 , check.names=FALSE ) )
-
 my.center <- rep( 0, nrow( data ) )
 my.var.adj <- rep( 1, nrow( data ) )
 if ( opt$center.rows != "none" ) {
@@ -70,8 +78,8 @@
 }
 data <- t( scale( t( data ), center=my.center, scale=my.var.adj ) )
 
-my.center <- rep( 0, nrow( data ) )
-my.var.adj <- rep( 1, nrow( data ) )
+my.center <- rep( 0, ncol( data ) )
+my.var.adj <- rep( 1, ncol( data ) )
 if ( opt$center.cols != "none" ) {
   my.center.fn <- get( opt$center.cols )
   ##data <- sweep( data, 2, apply( data, 2, my.center.fn, na.rm=T ) )
@@ -84,5 +92,5 @@
 }
 data <- scale( data, center=my.center, scale=my.var.adj )
 
-write.table( data, opt$output.fname, sep="\t", quote=FALSE, col.names=NA )
+write.2.tab( data, opt$output.fname )
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/rnaseq.feature.selection.R	Fri Mar 01 10:16:53 2013 -0500
@@ -0,0 +1,91 @@
+#!/usr/bin/env Rscript
+## RNASeq selection script by Peter Waltman
+## August 21, 2011
+## License under Creative Commons Attribution 3.0 Unported (CC BY 3.0)
+##
+#usage, options and doc goes here  - NEEDS TO BE UPDATED  ---- TBD
+argspec <- c("ipl.feature.selection.R takes a set of results from Paradigm, and filters for features that are
+active, inactive or modulated above a given IPL threshold over a sufficient percentage of samples.
+
+        Usage: 
+                ipl.feature.selection.R -d <data.file> 
+        Optional:
+                -o <output.name>
+                -g <genes-only>   ## to set if only returning genes (default is all features)
+                -f <filter.type>       ## filter.type must be either 'modulated', 'active'or 'inactive' (default is modulated)
+                -t <threshold>    ## the threshold to use for the filter (default is 0.25)
+                -p <perc.pass>    ## the percentage of samples that must pass the filter (default is 0.33)
+                -v <verbose>      ## to set verbose on
+
+                \n\n")
+args <- commandArgs(TRUE)
+if ( length( args ) == 1 && args =="--help") { 
+  write(argspec, stderr())
+  q();
+}
+
+## some helper fn's
+write.2.tab <- function( mat,
+                         fname ) {
+  mat <- rbind( colnames( mat ), mat )
+  mat <- cbind( c( "ID", rownames( mat )[-1] ),
+                      mat )
+  write.table( mat, fname, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE )
+}
+
+lib.load.quiet <- function( package ) {
+   package <- as.character(substitute(package))
+   suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
+}
+## MAXIMUM absolute deviation function
+maxad <- function( x, na.rm=FALSE, center=median(x, na.rm=na.rm ) ) {
+  return( max( abs(x), na.rm=na.rm ) - center )
+}
+
+lib.load.quiet(getopt)
+
+spec <- matrix( c( "data.fname",         "d", 1, "character",
+                   "output.fname",       "o", 2, "character",
+                   "var.method",         "m", 2, "character", ## must be either 'active', 'inactive' or 'modulated'
+                   "z.transform",        "z", 2, "character",
+                   "perc.pass",          "p", 2, "numeric"
+                   ),
+                nc=4,
+                byrow=TRUE
+               )
+
+opt <- getopt( spec=spec )
+data <- as.matrix( read.delim( opt$data.fname, header=T, row.names=1 , check.names=FALSE ) )
+#set some reasonable defaults for the options that are needed,
+#but were not specified.
+if ( is.null( opt$perc.pass ) ) { opt$perc.pass=1500 }
+if ( opt$perc.pass < 0  ) {
+  stop( "please specify a positive number for the percentage of samples that pass the filter (if applicable)" )
+} else {
+  if ( opt$perc.pass < 1  ) {
+    opt$perc.pass <- floor( opt$perc.pass * nrow( data ) )
+  }
+}
+      
+if ( is.null( opt$var.method ) ) { opt$var.method <- 'mad' }
+if ( is.null( opt$z.transform ) ) {
+  opt$z.transform <- TRUE
+} else {
+  opt$z.transform <- ( tolower( opt$z.transform ) %in% "yes" )
+}
+if ( is.null( opt$output.fname ) ) opt$output.fname <- "high.var.rnaseq.tab"
+
+if ( opt$z.transform ) {
+  ##normalize it to have mean==0, std==1
+  data <- t( data )
+  data <- t( scale( data ) )
+}
+
+
+my.var.fn <- get( opt$var.method )
+my.var.vals <- apply( data, 1, my.var.fn, na.rm=T )
+filter.vect <- sort( my.var.vals, dec=TRUE, index=TRUE )$ix[ 1:opt$perc.pass ]
+
+data <- data[ filter.vect, ]
+write.2.tab( data, opt$output.fname )
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/rnaseq.feature.selection.py	Fri Mar 01 10:16:53 2013 -0500
@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+import os
+import sys
+import subprocess
+
+select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "rnaseq.feature.selection.R")
+
+cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:] 
+
+proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE )
+(stdoutdata, stderrdata) = proc.communicate()
+if proc.returncode:
+	sys.stderr.write(stderrdata)
+sys.stdout.write(stdoutdata)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cluster.tools/rnaseq.feature.selection.xml	Fri Mar 01 10:16:53 2013 -0500
@@ -0,0 +1,52 @@
+<tool id="rnaseq_feature_selection" name="RNASeq Feature Selection" force_history_refresh="True">
+    <command interpreter="python">rnaseq.feature.selection.py
+-d $dataset 
+-z ${z_transform}
+-m ${var_method} 
+-p ${perc_pass} 
+-o ${output}
+
+</command>
+    <inputs>
+    	<param name="dataset" type="data" format='tabular' label="RNASeq Data Set"/>
+        <param name="z_transform" type='select' display="radio" label="Z-transform data?" help="Specify whether or not to Z-transform the rows (mean=0, sd=1)">
+          <option value="yes" selected='true' >Yes</option>
+          <option value="no">No</option>
+        </param>
+    	<param name="var_method" type="select" label="Variance Metric for Genes" help="Specify Metric to use for calculating Gene Variance" >
+	  <option value="mad">Median Absolute Deviation (MAD)</option>
+	  <option value="maxad">Maximum Absolute Deviation (Max AD)</option>
+	  <option value="sd">Standard Deviation (SD)</option>
+    	</param>
+	<param name="perc_pass" type="float" label="Total number of features to keep" help="Use value >= 1 to indicate exact number of genes. Use value in 0-1 range to specify percentage" value="1500"/>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="output" label="Filtered RNASeqs"/>
+    </outputs>
+<help>
+.. class:: infomark
+     
+**RNASeq Feature Selection** - Tool to filter an RNASeq matrix to select the most variant features
+
+**OUTPUT:**  A new matrix containing only the most variant feaures
+
+----
+
+**Parameters**
+
+- **Z-transform data?** - Specify whether or not to Z-transform the rows (mean=0, sd=1)
+- **Variance Metric for Genes** - Specify Metric to use for calculating Gene Variance. Choice of:
+
+	 * Median Absolute Deviation (MAD)
+	 * Maximum Absolute Deviation - similar to MAD, but uses the _Maximum_, instead of the Median Absolute Deviatioin
+	 * Standard Deviation
+
+ - **Percentage of Samples Passing** Percent of samples with an IPL that passes the threshold. Choice of:
+
+         * Integer Value       - indicate the exact number of genes that are to be kept
+         * Real Value in [0,1] - indicate the percentage of genes that are to be kept
+
+
+
+</help>
+</tool>