Mercurial > repos > peter-waltman > ucsc_cluster_tools2
diff cluster.tools/rnaseq.feature.selection.R @ 1:dddfeedb85af draft
Uploaded
| author | peter-waltman | 
|---|---|
| date | Fri, 01 Mar 2013 10:16:53 -0500 | 
| parents | |
| children | 563832f48c08 | 
line wrap: on
 line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cluster.tools/rnaseq.feature.selection.R Fri Mar 01 10:16:53 2013 -0500 @@ -0,0 +1,91 @@ +#!/usr/bin/env Rscript +## RNASeq selection script by Peter Waltman +## August 21, 2011 +## License under Creative Commons Attribution 3.0 Unported (CC BY 3.0) +## +#usage, options and doc goes here - NEEDS TO BE UPDATED ---- TBD +argspec <- c("ipl.feature.selection.R takes a set of results from Paradigm, and filters for features that are +active, inactive or modulated above a given IPL threshold over a sufficient percentage of samples. + + Usage: + ipl.feature.selection.R -d <data.file> + Optional: + -o <output.name> + -g <genes-only> ## to set if only returning genes (default is all features) + -f <filter.type> ## filter.type must be either 'modulated', 'active'or 'inactive' (default is modulated) + -t <threshold> ## the threshold to use for the filter (default is 0.25) + -p <perc.pass> ## the percentage of samples that must pass the filter (default is 0.33) + -v <verbose> ## to set verbose on + + \n\n") +args <- commandArgs(TRUE) +if ( length( args ) == 1 && args =="--help") { + write(argspec, stderr()) + q(); +} + +## some helper fn's +write.2.tab <- function( mat, + fname ) { + mat <- rbind( colnames( mat ), mat ) + mat <- cbind( c( "ID", rownames( mat )[-1] ), + mat ) + write.table( mat, fname, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE ) +} + +lib.load.quiet <- function( package ) { + package <- as.character(substitute(package)) + suppressPackageStartupMessages( do.call( "library", list( package=package ) ) ) +} +## MAXIMUM absolute deviation function +maxad <- function( x, na.rm=FALSE, center=median(x, na.rm=na.rm ) ) { + return( max( abs(x), na.rm=na.rm ) - center ) +} + +lib.load.quiet(getopt) + +spec <- matrix( c( "data.fname", "d", 1, "character", + "output.fname", "o", 2, "character", + "var.method", "m", 2, "character", ## must be either 'active', 'inactive' or 'modulated' + "z.transform", "z", 2, "character", + "perc.pass", "p", 2, "numeric" + ), + nc=4, + byrow=TRUE + ) + +opt <- getopt( spec=spec ) +data <- as.matrix( read.delim( opt$data.fname, header=T, row.names=1 , check.names=FALSE ) ) +#set some reasonable defaults for the options that are needed, +#but were not specified. +if ( is.null( opt$perc.pass ) ) { opt$perc.pass=1500 } +if ( opt$perc.pass < 0 ) { + stop( "please specify a positive number for the percentage of samples that pass the filter (if applicable)" ) +} else { + if ( opt$perc.pass < 1 ) { + opt$perc.pass <- floor( opt$perc.pass * nrow( data ) ) + } +} + +if ( is.null( opt$var.method ) ) { opt$var.method <- 'mad' } +if ( is.null( opt$z.transform ) ) { + opt$z.transform <- TRUE +} else { + opt$z.transform <- ( tolower( opt$z.transform ) %in% "yes" ) +} +if ( is.null( opt$output.fname ) ) opt$output.fname <- "high.var.rnaseq.tab" + +if ( opt$z.transform ) { + ##normalize it to have mean==0, std==1 + data <- t( data ) + data <- t( scale( data ) ) +} + + +my.var.fn <- get( opt$var.method ) +my.var.vals <- apply( data, 1, my.var.fn, na.rm=T ) +filter.vect <- sort( my.var.vals, dec=TRUE, index=TRUE )$ix[ 1:opt$perc.pass ] + +data <- data[ filter.vect, ] +write.2.tab( data, opt$output.fname ) +
