Mercurial > repos > peter-waltman > ucsc_cluster_tools2
diff cluster.tools/ipl.feature.selection.R @ 0:0decf3fd54bc draft
Uploaded
author | peter-waltman |
---|---|
date | Thu, 28 Feb 2013 01:45:39 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cluster.tools/ipl.feature.selection.R Thu Feb 28 01:45:39 2013 -0500 @@ -0,0 +1,155 @@ +#!/usr/bin/env Rscript +## IPL selection script by Peter Waltman +## August 21, 2011 +## License under Creative Commons Attribution 3.0 Unported (CC BY 3.0) +## +#usage, options and doc goes here +argspec <- c("ipl.feature.selection.R takes a set of results from Paradigm, and filters for features that are +active, inactive or modulated above a given IPL threshold over a sufficient percentage of samples. + + Usage: + ipl.feature.selection.R -d <data.file> + Optional: + -o <output.name> + -g <genes-only> ## to set if only returning genes (default is all features) + -f <filter.type> ## filter.type must be either 'modulated', 'active'or 'inactive' (default is modulated) + -t <threshold> ## the threshold to use for the filter (default is 0.25) + -p <perc.pass> ## the percentage of samples that must pass the filter (default is 0.33) + -v <verbose> ## to set verbose on + + \n\n") +args <- commandArgs(TRUE) +if ( length( args ) == 1 && args =="--help") { + write(argspec, stderr()) + q(); +} + +lib.load.quiet <- function( package ) { + package <- as.character(substitute(package)) + suppressPackageStartupMessages( do.call( "library", list( package=package ) ) ) +} +lib.load.quiet(getopt) + +spec <- matrix( c( "data.fname", "d", 1, "character", + "output.name", "o", 2, "character", + "genes.only", "g", 0, "logical", + "filter.type", "f", 2, "character", ## must be either 'active', 'inactive' or 'modulated' + "threshold", "t", 2, "numeric", + "empirical.fname", "e", 2, "character", + "perc.pass", "p", 2, "numeric", + "verbose", "v", 0, "logical", ## to set verbose on + "help", "h", 0, "logical" + ), + nc=4, + byrow=TRUE + ) + +opt <- getopt( spec=spec ) +#set some reasonable defaults for the options that are needed, +#but were not specified. +if ( is.null(opt$verbose ) ) { opt$verbose = FALSE } +if ( is.null(opt$genes.only ) ) { + opt$genes.only <- FALSE +} + +if ( is.null(opt$filter.type ) ) { opt$filter.type = 'modulated' } +if ( is.null( opt$threshold ) ) { opt$threshold=0.25 } +if ( is.null( opt$perc.pass ) ) { opt$perc.pass=1/3 } +if ( opt$perc.pass < 0 ) { + stop( "please specify a positive number for the percentage of samples that pass the filter (if applicable)" ) +} +## now set filter.type, threshold & perc.pass if an empirical result has been passed in +if ( ! is.null( opt$empirical.fname ) ) { + + if ( ! file.exists( opt$empirical.fname ) ) stop( "can't file empirical result file:", opt$empirical.fname, "\n" ) + ## assume this is an RData file + emp.fname.contents <- load( opt$empirical.fname ) + if ( ! "opt.thresh" %in% emp.fname.contents ) stop( "no optimal threshold value found in RData file passed in\n" ) + opt$threshold <- opt.thresh + + if ( ! "filter.type" %in% emp.fname.contents ) stop( "no filter type value found in RData file passed in\n" ) + opt$filter.type <- filter.type + + if ( ! "perc.pass" %in% emp.fname.contents ) stop( "no percentage passing value found in RData file passed in\n" ) + opt$perc.pass <- perc.pass +} +if ( ! opt$filter.type %in% c( 'active', 'inactive', 'modulated' ) ) stop( 'invalid filter.type specified:', opt$filter.type, "\n" ) +if ( is.null( opt$output.name ) ) { + opt$output.name <- file.path( getwd(), + paste( opt$filter.type, basename( opt$data.fname ), sep="." ) ) +} + + + +data <- as.matrix( read.delim( opt$data.fname, header=T, row.names=1 , check.names=FALSE ) ) +if ( opt$genes.only ) { + genes <- rownames( data ) + genes <- genes[ ! grepl( "abstract|complex|family", genes ) ] + data <- data[ genes, ] +} + + +count.samps.threshold <- function( data, + threshold, + comparator ## must be one of lte, lt, gt, gte + ) { + filter.vect <- rep( TRUE, nrow( data ) ) ## set an initial val + if ( comparator == "lt" ) { + return( apply( data, + 1, + function(x) sum( x < threshold, na.rm=T ) ) ) + } + if ( comparator == "lte" ) { + return( apply( data, + 1, + function(x) sum( x <= threshold, na.rm=T ) ) ) + } + if ( comparator == "gte" ) { + return( apply( data, + 1, + function(x) sum( x >= threshold, na.rm=T ) ) ) + } + if ( comparator == "gt" ) { + return( apply( data, + 1, + function(x) sum( x > threshold, na.rm=T ) ) ) + } + if ( comparator == "bothe" ) { + return( apply( data, + 1, + function(x) sum( abs(x) >= threshold, na.rm=T ) ) ) + } + if ( comparator == "both" ) { + return( apply( data, + 1, + function(x) sum( abs(x) > threshold, na.rm=T ) ) ) + } +} + + + + +if ( opt$filter.type=="active" ) { + ## this is an implementation of the activity filter that was used in the original PARADIGM paper + filter.vect <- count.samps.threshold( data, opt$threshold, "gt" ) +} else { + if ( opt$filter.type=="inactive" ) { + filter.vect <- count.samps.threshold( data, -opt$threshold, "lt" ) + } else { + if ( opt$filter.type=="modulated" ) { + filter.vect <- count.samps.threshold( data, opt$threshold, "both" ) + } else { + stop( "invalid filter.type specified: ", opt$filter.type ) + } + } +} + +if ( opt$perc.pass <1 ) { + filter.vect <- filter.vect > floor( ncol( data ) * opt$perc.pass ) +} else { + filter.vect <- filter.vect >= opt$perc.pass +} +data <- data[ filter.vect, ] + +write.table( data, opt$output.name, sep="\t", row.names=TRUE, col.names=NA, quote=FALSE ) +