# HG changeset patch # User peter-waltman # Date 1362151013 18000 # Node ID dddfeedb85af334bcf8ad445aa963b8cfabcbab1 # Parent 0decf3fd54bc4570a98078388015db0c88eb6747 Uploaded diff -r 0decf3fd54bc -r dddfeedb85af cluster.tools/cutree.xml --- a/cluster.tools/cutree.xml Thu Feb 28 01:45:39 2013 -0500 +++ b/cluster.tools/cutree.xml Fri Mar 01 10:16:53 2013 -0500 @@ -13,7 +13,7 @@ -- **Cluster Result** - Specify the cluster result to analayze (MUST BE IN rdata format), and must contain the same objects that are produced by the 'Partition Clustering,' 'Hierarchical Clustering (HAC),' or 'Consensus Clustering' tools. Specifically, it must contain the following objects +- **Cluster Result** - Specify the cluster result to analayze (MUST BE IN rdata format), and must contain the same objects that are produced by the 'Hierarchical Clustering (HAC),' tool. Specifically, it must contain the following objects: * A 'treecl.res' or 'partcl.res' object - corresponding to whether the cluster results is from a partition or tree clustering method * A 'data' object that contains the data that was passed into the clustering method. NOTE, it is better for this to be the actual data passed in, rather than the data prior to the pre-processing that was performed prior to the actual clustering. diff -r 0decf3fd54bc -r dddfeedb85af cluster.tools/extract.TCGA.survival.data.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cluster.tools/extract.TCGA.survival.data.R Fri Mar 01 10:16:53 2013 -0500 @@ -0,0 +1,195 @@ +#!/usr/bin/env Rscript +## +## formats raw clinical data from TCGA to contain a single status & time colums +## +## Input (required): +## - clinical data +## Input (optional): +## - status & time columns: (NOT USED IN THIS SCRIPT - see comment below) +## ideally, a better design would allow a user to specify 1 or more columns +## to check for the status & time columns - however, due to the necessities +## required to pre-process the TCGA clinical data, the script would not be +## generalizeable - and for this reason, the TCGA columns are hard-coded. +## +## Output: a re-formatted clinical file containing 3 columns: sample-ID, status & time +## +## Date: August 21, 2012 +## Author: Peter Waltman +## + +##usage, options and doc goes here +argspec <- c("format.raw.TCGA.clinical.data.R takes a clustering from ConsensusClusterPlus and clinical survival data +and generates a KM-plot, along with the log-rank p-values + + Usage: + format.raw.TCGA.clinical.data.R -c + Options: + -o (tab-delimited (3 col: sample_id status time)) + ") +args <- commandArgs(TRUE) +if ( length( args ) == 1 && args =="--help") { + write(argspec, stderr()) + q(); +} + +## some helper fn's +write.2.tab <- function( mat, + fname ) { + mat <- rbind( colnames( mat ), mat ) + mat <- cbind( c( "ID", rownames( mat )[-1] ), + mat ) + write.table( mat, fname, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE ) +} + +lib.load.quiet <- function( package ) { + package <- as.character(substitute(package)) + suppressPackageStartupMessages( do.call( "library", list( package=package ) ) ) +} +lib.load.quiet(getopt) + +spec <- matrix( c( "clinical.fname", "d", 1, "character", + "output.fname", "o", 2, "character" + ), + ncol=4, + byrow=TRUE + ) +opt <- getopt( spec=spec ) + +##set some reasonable defaults for the options that are needed, +##but were not specified. +if ( is.null(opt$output.fname ) ) { opt$output.fname <-file.path( getwd(), "formated.TCGA.clinical.data" ) } + +##orig.clinical.data <- read.delim( opt$clinical.fname, as.is=TRUE, row.names=1 ) +orig.clinical.data <- read.delim( opt$clinical.fname, as.is=TRUE ) +orig.clinical.data <- unique( orig.clinical.data ) +rownames( orig.clinical.data ) <- orig.clinical.data[,1] +orig.clinical.data <- orig.clinical.data[, -1 ] + +## ugh, some TCGA data sets have all NAs in the "days_to_..." columns +if ( "days_to_last_known_alive" %in% colnames( orig.clinical.data ) ) { + time.cols <- c( "days_to_death", "days_to_last_followup", "days_to_last_known_alive" ) +} else { + time.cols <- c( "days_to_death", "days_to_last_followup" ) +} +good.samps <- ! apply( orig.clinical.data[, time.cols ], 1, function(x) all( is.na(x) ) | all( x <= 0, na.rm=T ) ) + +orig.clinical.data <- orig.clinical.data[ good.samps, ] + +if ( is.null(opt$status.column ) ) { + status.colname <- "vital_status" + if ( status.colname %in% colnames( orig.clinical.data ) ) { + opt$status.column <- which( colnames( orig.clinical.data ) %in% status.colname ) + clinical.data <- orig.clinical.data[ , opt$status.column ] + } + else { + status.colname <- "days_to_death" + if ( status.colname %in% colnames( orig.clinical.data ) ) { + opt$status.column <- which( colnames( orig.clinical.data ) %in% status.colname ) + clinical.data <- orig.clinical.data[ , opt$status.column ] + } + else { + stop( "can't find a valid entry with status info - have tried vital_status & days_to_death\n" ) + } + } + clinical.data <- as.numeric( ! grepl( "(LIVING|Not)", clinical.data ) ) +} +if ( is.null(opt$time.column ) ) { + time.colname <- "CDE.clinical_time" + + if ( time.colname %in% colnames( orig.clinical.data ) ) { + opt$time.column <- which( colnames( orig.clinical.data ) %in% time.colname ) + clinical.data <- cbind( clinical.data, + as.numeric( orig.clinical.data[, opt$time.column ] ) ) + } + else { + dec.mat <- matrix( NA, + nc=length( time.cols ), + nr=nrow( orig.clinical.data ), + dimnames=list( rownames( orig.clinical.data ), + time.cols ) + ) + for ( cname in colnames( dec.mat ) ) { + if ( cname %in% colnames( orig.clinical.data ) ) { + dec.mat[, cname ] <- as.numeric( orig.clinical.data[, cname ] ) + } + } + + + + if ( "days_to_last_known_alive" %in% colnames( orig.clinical.data ) ) { + + opt$time.column <- sapply( 1:length( clinical.data ), + function(i) { + if ( clinical.data[i] ) { + ## this is a deceased sample + return( ifelse( ( !is.na( dec.mat[ i, "days_to_death" ] ) ), + dec.mat[ i, "days_to_death" ], + ifelse( ( !is.na( dec.mat[ i, "days_to_last_known_alive" ] ) ), + dec.mat[ i, "days_to_last_known_alive" ], + dec.mat[ i, "days_to_last_followup" ] ) ) ) + + } + else { + return( max( dec.mat[ i, c( "days_to_last_followup","days_to_last_known_alive") ], na.rm=T ) ) + } + } + ) + } else { + opt$time.column <- sapply( 1:length( clinical.data ), + function(i) { + if ( clinical.data[i] ) { + ## this is a deceased sample + return( ifelse( ( !is.na( dec.mat[ i, "days_to_death" ] ) ), + dec.mat[ i, "days_to_death" ], + dec.mat[ i, "days_to_last_followup" ] ) ) + + } + else { + return( max( dec.mat[ i, c( "days_to_last_followup") ], na.rm=T ) ) + } + } + ) + } + + + clinical.data <- cbind( clinical.data, + as.numeric( opt$time.column ) ) + } +} + +clinical.data <- as.data.frame( clinical.data ) +colnames( clinical.data ) <- c( "status", "time" ) +rownames( clinical.data ) <- rownames( orig.clinical.data ) + + +## check to make sure that the id's are sync'd correctly +## the default format is to use hyphens to separate the elt's of the name +## and to only use the 1st 3 elements of the name +## so we check to see if they're using something else as separators and/or using more than 3 elts +reformat.ids <- function( ids ) { + + if ( grepl( "TCGA\\.", ids[1] ) ) { + ids <- sapply( strsplit( ids, "\\." ), function(x) paste( x[1:3], collapse="-" ) ) + } else { + ## do this just in case there's more than 3 elements to the names + if ( grepl( "TCGA-", ids[1] ) ) { + ids <- sapply( strsplit( ids, "-" ), function(x) paste( x[1:min( c(3,length(x) ) )], collapse="-" ) ) + } + } + return( ids ) +} + + +new.samp.ids <- reformat.ids( rownames( clinical.data ) ) +if ( any( duplicated( new.samp.ids ) ) ) { + ## in some cases, we have duplicate sample ids in the raw data after we truncate to + ## the 1st 3 elts in the barcode, so just simplify the data + uniqs <- ! duplicated( new.samp.ids ) + clinical.data <- clinical.data[ uniqs, ] + new.samp.ids <- new.samp.ids[ uniqs ] +} + +rownames( clinical.data ) <- new.samp.ids + +write.2.tab( clinical.data, opt$output.fname ) +##write.table( clinical.data, opt$output.fname, sep="\t", quote=FALSE, col.names=NA ) diff -r 0decf3fd54bc -r dddfeedb85af cluster.tools/extract.TCGA.survival.data.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cluster.tools/extract.TCGA.survival.data.py Fri Mar 01 10:16:53 2013 -0500 @@ -0,0 +1,14 @@ +#!/usr/bin/env python +import os +import sys +import subprocess + +select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "extract.TCGA.survival.data.R") + +cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:] + +proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE ) +(stdoutdata, stderrdata) = proc.communicate() +if proc.returncode: + sys.stderr.write(stderrdata) +sys.stdout.write(stdoutdata) diff -r 0decf3fd54bc -r dddfeedb85af cluster.tools/extract.TCGA.survival.data.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cluster.tools/extract.TCGA.survival.data.xml Fri Mar 01 10:16:53 2013 -0500 @@ -0,0 +1,21 @@ + + extract.TCGA.survival.data.py +-d $dataset +-o ${output} + + + + + + + + + +.. class:: infomark + +**Format Raw TCGA Clinical Data** - Tool to convert a raw clinical TCGA data file into a the format expected by the Survival Analysis tools + +**OUTPUT:** A new clinical data file that is a 2 column, tab-delimited file of the format that is expected by the Survival Analysis tools + + + diff -r 0decf3fd54bc -r dddfeedb85af cluster.tools/fix.and.merge.TCGA.sample.IDs.R --- a/cluster.tools/fix.and.merge.TCGA.sample.IDs.R Thu Feb 28 01:45:39 2013 -0500 +++ b/cluster.tools/fix.and.merge.TCGA.sample.IDs.R Fri Mar 01 10:16:53 2013 -0500 @@ -12,6 +12,15 @@ q(); } +## some helper fn's +write.2.tab <- function( mat, + fname ) { + mat <- rbind( colnames( mat ), mat ) + mat <- cbind( c( "ID", rownames( mat )[-1] ), + mat ) + write.table( mat, fname, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE ) +} + lib.load.quiet <- function( package ) { package <- as.character(substitute(package)) suppressPackageStartupMessages( do.call( "library", list( package=package ) ) ) @@ -116,4 +125,4 @@ if ( transpose.back ) data <- t( data ) -write.table( data, opt$output.fname, sep="\t", quote=FALSE, col.names=NA ) +write.2.tab( data, opt$output.fname ) diff -r 0decf3fd54bc -r dddfeedb85af cluster.tools/format.raw.TCGA.RNASeq.data.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cluster.tools/format.raw.TCGA.RNASeq.data.R Fri Mar 01 10:16:53 2013 -0500 @@ -0,0 +1,120 @@ +#!/usr/bin/env Rscript +## +## formats raw clinical data from TCGA to contain a single status & time colums +## +## Input (required): +## - clinical data +## Input (optional): +## - status & time columns: (NOT USED IN THIS SCRIPT - see comment below) +## ideally, a better design would allow a user to specify 1 or more columns +## to check for the status & time columns - however, due to the necessities +## required to pre-process the TCGA clinical data, the script would not be +## generalizeable - and for this reason, the TCGA columns are hard-coded. +## +## Output: a re-formatted clinical file containing 3 columns: sample-ID, status & time +## +## Date: August 21, 2012 +## Author: Peter Waltman +## + +##usage, options and doc goes here +argspec <- c("format.raw.TCGA.clinical.data.R takes a clustering from ConsensusClusterPlus and clinical survival data +and generates a KM-plot, along with the log-rank p-values + + Usage: + format.raw.TCGA.clinical.data.R -c + Options: + -o (tab-delimited (3 col: sample_id status time)) + ") +args <- commandArgs(TRUE) +if ( length( args ) == 1 && args =="--help") { + write(argspec, stderr()) + q(); +} + +## some helper fn's +write.2.tab <- function( mat, + fname ) { + mat <- rbind( colnames( mat ), mat ) + mat <- cbind( c( "ID", rownames( mat )[-1] ), + mat ) + write.table( mat, fname, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE ) +} + +fix.genes <- function( mat ) { + ## filter out the unknowns + mat <- mat[ ( ! grepl( "^\\?", rownames( mat ) ) ), ] + + genes <- rownames( mat ) + ## select the HUGO name portion of the "gene-names" that are in the TCGA matrices + genes <- sapply( strsplit( genes, "\\|" ), function(x) x[1] ) + + if ( any( duplicated( genes ) ) ) { + dupes <- unique( genes[ duplicated( genes ) ] ) + no.duped.mat <- which( ! genes %in% dupes ) + no.duped.mat <- mat[ no.duped.mat, , drop=FALSE ] + + ## now merge in the duplicates + for ( dup in dupes ) { + duped.mat <- mat[ grepl( paste( "^", dup, sep="" ), rownames( mat ) ), ] + duped.mat <- apply( duped.mat, 2, mean, na.rm=TRUE ) + no.duped.mat <- rbind( no.duped.mat, duped.mat ) + ## weird bit of follow up to make sure the new row/gene is named correctly + rownames( no.duped.mat ) <- c( rownames( no.duped.mat )[ -nrow(no.duped.mat) ], + paste( dup, "|0000", sep="" ) ) + } + mat <- no.duped.mat; rm( no.duped.mat ); gc() + } + + rownames( mat ) <- sapply( strsplit( rownames( mat ), "\\|" ), function(x) x[1] ) + return( mat ) +} + +fix.sample.ids <- function( mat ) { + colnames( mat ) <- gsub( "\\.", "-", colnames( mat ) ) + return( mat ) +} + +lib.load.quiet <- function( package ) { + package <- as.character(substitute(package)) + suppressPackageStartupMessages( do.call( "library", list( package=package ) ) ) +} +lib.load.quiet(getopt) + +spec <- matrix( c( "dataset", "d", 1, "character", + "log.tranform", "l", 2, "character", + "filter.low.variant", "f", 2, "integer", + "output.fname", "o", 2, "character" + ), + ncol=4, + byrow=TRUE + ) +opt <- getopt( spec=spec ) + +##set some reasonable defaults for the options that are needed, +##but were not specified. +if ( is.null(opt$output.fname ) ) { opt$output.fname <-file.path( getwd(), "formated.TCGA.RNASeq.data" ) } +if ( is.null(opt$log.transform ) ) { + opt$log.transform <- TRUE +} else { + opt$log.transform <- ( tolower( opt$log.transform ) %in% "yes" ) +} + +if ( is.null(opt$filter.low.variant ) ) { opt$log.transform <- 10 } + + +data <- as.matrix( read.delim( opt$dataset, row.names=1, check.names=F ) ) +data <- fix.genes( data ) +data <- fix.sample.ids( data ) + +if ( opt$filter.low.variant > 0 ) { + rpkm.meds <- apply( data, 1, median, na.rm=TRUE ) + rpkm.meds <- rpkm.meds >= opt$filter.low.variant + data <- data[ rpkm.meds, ] +} + +if ( opt$log.transform ) { + data <- log( data + 1 ) +} + +write.2.tab( data, opt$output.fname ) diff -r 0decf3fd54bc -r dddfeedb85af cluster.tools/format.raw.TCGA.RNASeq.data.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cluster.tools/format.raw.TCGA.RNASeq.data.py Fri Mar 01 10:16:53 2013 -0500 @@ -0,0 +1,14 @@ +#!/usr/bin/env python +import os +import sys +import subprocess + +select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "format.raw.TCGA.RNASeq.data.R") + +cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:] + +proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE ) +(stdoutdata, stderrdata) = proc.communicate() +if proc.returncode: + sys.stderr.write(stderrdata) +sys.stdout.write(stdoutdata) diff -r 0decf3fd54bc -r dddfeedb85af cluster.tools/format.raw.TCGA.RNASeq.data.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cluster.tools/format.raw.TCGA.RNASeq.data.xml Fri Mar 01 10:16:53 2013 -0500 @@ -0,0 +1,34 @@ + + format.raw.TCGA.RNASeq.data.py +-d $dataset +-l {$log_transform} +-f ${filter_low_variant} +-o ${output} + + + + + + + + + + + + + + + + + + +.. class:: infomark + +**Format Raw TCGA RNASeq Data** - Tool to convert a raw RNASeq TCGA data file (a file from either Synapse or Firehose) into a the format expected by the Survival Analysis tools + +**Log-transform data?** -Specify whether or not to log-transform the data matrix. To avoid numeric underflow, this will use log(x+1), where x is the value of the RNASeq data + +**OUTPUT:** A new tab-delimited file containing the log-transformed RNASeq data + + + diff -r 0decf3fd54bc -r dddfeedb85af cluster.tools/ipl.feature.selection.xml --- a/cluster.tools/ipl.feature.selection.xml Thu Feb 28 01:45:39 2013 -0500 +++ b/cluster.tools/ipl.feature.selection.xml Fri Mar 01 10:16:53 2013 -0500 @@ -15,7 +15,7 @@ - + diff -r 0decf3fd54bc -r dddfeedb85af cluster.tools/normalize.matrix.R --- a/cluster.tools/normalize.matrix.R Thu Feb 28 01:45:39 2013 -0500 +++ b/cluster.tools/normalize.matrix.R Fri Mar 01 10:16:53 2013 -0500 @@ -19,6 +19,15 @@ q(); } +## some helper fn's +write.2.tab <- function( mat, + fname ) { + mat <- rbind( colnames( mat ), mat ) + mat <- cbind( c( "ID", rownames( mat )[-1] ), + mat ) + write.table( mat, fname, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE ) +} + lib.load.quiet <- function( package ) { package <- as.character(substitute(package)) suppressPackageStartupMessages( do.call( "library", list( package=package ) ) ) @@ -56,7 +65,6 @@ if ( is.null( opt$var.adj.cols ) ) { opt$var.adj.cols <- 'none' } data <- as.matrix( read.delim( opt$data.fname, header=T, row.names=1 , check.names=FALSE ) ) - my.center <- rep( 0, nrow( data ) ) my.var.adj <- rep( 1, nrow( data ) ) if ( opt$center.rows != "none" ) { @@ -70,8 +78,8 @@ } data <- t( scale( t( data ), center=my.center, scale=my.var.adj ) ) -my.center <- rep( 0, nrow( data ) ) -my.var.adj <- rep( 1, nrow( data ) ) +my.center <- rep( 0, ncol( data ) ) +my.var.adj <- rep( 1, ncol( data ) ) if ( opt$center.cols != "none" ) { my.center.fn <- get( opt$center.cols ) ##data <- sweep( data, 2, apply( data, 2, my.center.fn, na.rm=T ) ) @@ -84,5 +92,5 @@ } data <- scale( data, center=my.center, scale=my.var.adj ) -write.table( data, opt$output.fname, sep="\t", quote=FALSE, col.names=NA ) +write.2.tab( data, opt$output.fname ) diff -r 0decf3fd54bc -r dddfeedb85af cluster.tools/rnaseq.feature.selection.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cluster.tools/rnaseq.feature.selection.R Fri Mar 01 10:16:53 2013 -0500 @@ -0,0 +1,91 @@ +#!/usr/bin/env Rscript +## RNASeq selection script by Peter Waltman +## August 21, 2011 +## License under Creative Commons Attribution 3.0 Unported (CC BY 3.0) +## +#usage, options and doc goes here - NEEDS TO BE UPDATED ---- TBD +argspec <- c("ipl.feature.selection.R takes a set of results from Paradigm, and filters for features that are +active, inactive or modulated above a given IPL threshold over a sufficient percentage of samples. + + Usage: + ipl.feature.selection.R -d + Optional: + -o + -g ## to set if only returning genes (default is all features) + -f ## filter.type must be either 'modulated', 'active'or 'inactive' (default is modulated) + -t ## the threshold to use for the filter (default is 0.25) + -p ## the percentage of samples that must pass the filter (default is 0.33) + -v ## to set verbose on + + \n\n") +args <- commandArgs(TRUE) +if ( length( args ) == 1 && args =="--help") { + write(argspec, stderr()) + q(); +} + +## some helper fn's +write.2.tab <- function( mat, + fname ) { + mat <- rbind( colnames( mat ), mat ) + mat <- cbind( c( "ID", rownames( mat )[-1] ), + mat ) + write.table( mat, fname, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE ) +} + +lib.load.quiet <- function( package ) { + package <- as.character(substitute(package)) + suppressPackageStartupMessages( do.call( "library", list( package=package ) ) ) +} +## MAXIMUM absolute deviation function +maxad <- function( x, na.rm=FALSE, center=median(x, na.rm=na.rm ) ) { + return( max( abs(x), na.rm=na.rm ) - center ) +} + +lib.load.quiet(getopt) + +spec <- matrix( c( "data.fname", "d", 1, "character", + "output.fname", "o", 2, "character", + "var.method", "m", 2, "character", ## must be either 'active', 'inactive' or 'modulated' + "z.transform", "z", 2, "character", + "perc.pass", "p", 2, "numeric" + ), + nc=4, + byrow=TRUE + ) + +opt <- getopt( spec=spec ) +data <- as.matrix( read.delim( opt$data.fname, header=T, row.names=1 , check.names=FALSE ) ) +#set some reasonable defaults for the options that are needed, +#but were not specified. +if ( is.null( opt$perc.pass ) ) { opt$perc.pass=1500 } +if ( opt$perc.pass < 0 ) { + stop( "please specify a positive number for the percentage of samples that pass the filter (if applicable)" ) +} else { + if ( opt$perc.pass < 1 ) { + opt$perc.pass <- floor( opt$perc.pass * nrow( data ) ) + } +} + +if ( is.null( opt$var.method ) ) { opt$var.method <- 'mad' } +if ( is.null( opt$z.transform ) ) { + opt$z.transform <- TRUE +} else { + opt$z.transform <- ( tolower( opt$z.transform ) %in% "yes" ) +} +if ( is.null( opt$output.fname ) ) opt$output.fname <- "high.var.rnaseq.tab" + +if ( opt$z.transform ) { + ##normalize it to have mean==0, std==1 + data <- t( data ) + data <- t( scale( data ) ) +} + + +my.var.fn <- get( opt$var.method ) +my.var.vals <- apply( data, 1, my.var.fn, na.rm=T ) +filter.vect <- sort( my.var.vals, dec=TRUE, index=TRUE )$ix[ 1:opt$perc.pass ] + +data <- data[ filter.vect, ] +write.2.tab( data, opt$output.fname ) + diff -r 0decf3fd54bc -r dddfeedb85af cluster.tools/rnaseq.feature.selection.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cluster.tools/rnaseq.feature.selection.py Fri Mar 01 10:16:53 2013 -0500 @@ -0,0 +1,14 @@ +#!/usr/bin/env python +import os +import sys +import subprocess + +select_script_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "rnaseq.feature.selection.R") + +cmd_args = [ "Rscript", select_script_path ] + sys.argv[1:] + +proc = subprocess.Popen( cmd_args, stderr=subprocess.PIPE, stdout=subprocess.PIPE ) +(stdoutdata, stderrdata) = proc.communicate() +if proc.returncode: + sys.stderr.write(stderrdata) +sys.stdout.write(stdoutdata) diff -r 0decf3fd54bc -r dddfeedb85af cluster.tools/rnaseq.feature.selection.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cluster.tools/rnaseq.feature.selection.xml Fri Mar 01 10:16:53 2013 -0500 @@ -0,0 +1,52 @@ + + rnaseq.feature.selection.py +-d $dataset +-z ${z_transform} +-m ${var_method} +-p ${perc_pass} +-o ${output} + + + + + + + + + + + + + + + + + + + +.. class:: infomark + +**RNASeq Feature Selection** - Tool to filter an RNASeq matrix to select the most variant features + +**OUTPUT:** A new matrix containing only the most variant feaures + +---- + +**Parameters** + +- **Z-transform data?** - Specify whether or not to Z-transform the rows (mean=0, sd=1) +- **Variance Metric for Genes** - Specify Metric to use for calculating Gene Variance. Choice of: + + * Median Absolute Deviation (MAD) + * Maximum Absolute Deviation - similar to MAD, but uses the _Maximum_, instead of the Median Absolute Deviatioin + * Standard Deviation + + - **Percentage of Samples Passing** Percent of samples with an IPL that passes the threshold. Choice of: + + * Integer Value - indicate the exact number of genes that are to be kept + * Real Value in [0,1] - indicate the percentage of genes that are to be kept + + + + +