Mercurial > repos > peter-waltman > ucsc_cluster_tools
comparison cluster.tools/ipl.feature.selection.R @ 2:b442996b66ae draft
Uploaded
| author | peter-waltman |
|---|---|
| date | Wed, 27 Feb 2013 20:17:04 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:e25d2bece0a2 | 2:b442996b66ae |
|---|---|
| 1 #!/usr/bin/env Rscript | |
| 2 ## IPL selection script by Peter Waltman | |
| 3 ## August 21, 2011 | |
| 4 ## License under Creative Commons Attribution 3.0 Unported (CC BY 3.0) | |
| 5 ## | |
| 6 #usage, options and doc goes here | |
| 7 argspec <- c("ipl.feature.selection.R takes a set of results from Paradigm, and filters for features that are | |
| 8 active, inactive or modulated above a given IPL threshold over a sufficient percentage of samples. | |
| 9 | |
| 10 Usage: | |
| 11 ipl.feature.selection.R -d <data.file> | |
| 12 Optional: | |
| 13 -o <output.name> | |
| 14 -g <genes-only> ## to set if only returning genes (default is all features) | |
| 15 -f <filter.type> ## filter.type must be either 'modulated', 'active'or 'inactive' (default is modulated) | |
| 16 -t <threshold> ## the threshold to use for the filter (default is 0.25) | |
| 17 -p <perc.pass> ## the percentage of samples that must pass the filter (default is 0.33) | |
| 18 -v <verbose> ## to set verbose on | |
| 19 | |
| 20 \n\n") | |
| 21 args <- commandArgs(TRUE) | |
| 22 if ( length( args ) == 1 && args =="--help") { | |
| 23 write(argspec, stderr()) | |
| 24 q(); | |
| 25 } | |
| 26 | |
| 27 lib.load.quiet <- function( package ) { | |
| 28 package <- as.character(substitute(package)) | |
| 29 suppressPackageStartupMessages( do.call( "library", list( package=package ) ) ) | |
| 30 } | |
| 31 lib.load.quiet(getopt) | |
| 32 | |
| 33 spec <- matrix( c( "data.fname", "d", 1, "character", | |
| 34 "output.name", "o", 2, "character", | |
| 35 "genes.only", "g", 0, "logical", | |
| 36 "filter.type", "f", 2, "character", ## must be either 'active', 'inactive' or 'modulated' | |
| 37 "threshold", "t", 2, "numeric", | |
| 38 "empirical.fname", "e", 2, "character", | |
| 39 "perc.pass", "p", 2, "numeric", | |
| 40 "verbose", "v", 0, "logical", ## to set verbose on | |
| 41 "help", "h", 0, "logical" | |
| 42 ), | |
| 43 nc=4, | |
| 44 byrow=TRUE | |
| 45 ) | |
| 46 | |
| 47 opt <- getopt( spec=spec ) | |
| 48 ##save.image( "~/work.local/tmp/ipl.feature.sel.dbg.rda" ) | |
| 49 #set some reasonable defaults for the options that are needed, | |
| 50 #but were not specified. | |
| 51 if ( is.null(opt$verbose ) ) { opt$verbose = FALSE } | |
| 52 if ( is.null(opt$genes.only ) ) { | |
| 53 opt$genes.only <- FALSE | |
| 54 } | |
| 55 | |
| 56 if ( is.null(opt$filter.type ) ) { opt$filter.type = 'modulated' } | |
| 57 if ( is.null( opt$threshold ) ) { opt$threshold=0.25 } | |
| 58 if ( is.null( opt$perc.pass ) ) { opt$perc.pass=1/3 } | |
| 59 if ( opt$perc.pass < 0 ) { | |
| 60 stop( "please specify a positive number for the percentage of samples that pass the filter (if applicable)" ) | |
| 61 } | |
| 62 ## now set filter.type, threshold & perc.pass if an empirical result has been passed in | |
| 63 if ( ! is.null( opt$empirical.fname ) ) { | |
| 64 | |
| 65 if ( ! file.exists( opt$empirical.fname ) ) stop( "can't file empirical result file:", opt$empirical.fname, "\n" ) | |
| 66 ## assume this is an RData file | |
| 67 emp.fname.contents <- load( opt$empirical.fname ) | |
| 68 if ( ! "opt.thresh" %in% emp.fname.contents ) stop( "no optimal threshold value found in RData file passed in\n" ) | |
| 69 opt$threshold <- opt.thresh | |
| 70 | |
| 71 if ( ! "filter.type" %in% emp.fname.contents ) stop( "no filter type value found in RData file passed in\n" ) | |
| 72 opt$filter.type <- filter.type | |
| 73 | |
| 74 if ( ! "perc.pass" %in% emp.fname.contents ) stop( "no percentage passing value found in RData file passed in\n" ) | |
| 75 opt$perc.pass <- perc.pass | |
| 76 } | |
| 77 if ( ! opt$filter.type %in% c( 'active', 'inactive', 'modulated' ) ) stop( 'invalid filter.type specified:', opt$filter.type, "\n" ) | |
| 78 if ( is.null( opt$output.name ) ) { | |
| 79 opt$output.name <- file.path( getwd(), | |
| 80 paste( opt$filter.type, basename( opt$data.fname ), sep="." ) ) | |
| 81 } | |
| 82 | |
| 83 | |
| 84 | |
| 85 data <- as.matrix( read.delim( opt$data.fname, header=T, row.names=1 , check.names=FALSE ) ) | |
| 86 if ( opt$genes.only ) { | |
| 87 genes <- rownames( data ) | |
| 88 genes <- genes[ ! grepl( "abstract|complex|family", genes ) ] | |
| 89 data <- data[ genes, ] | |
| 90 } | |
| 91 | |
| 92 | |
| 93 count.samps.threshold <- function( data, | |
| 94 threshold, | |
| 95 comparator ## must be one of lte, lt, gt, gte | |
| 96 ) { | |
| 97 filter.vect <- rep( TRUE, nrow( data ) ) ## set an initial val | |
| 98 if ( comparator == "lt" ) { | |
| 99 return( apply( data, | |
| 100 1, | |
| 101 function(x) sum( x < threshold, na.rm=T ) ) ) | |
| 102 } | |
| 103 if ( comparator == "lte" ) { | |
| 104 return( apply( data, | |
| 105 1, | |
| 106 function(x) sum( x <= threshold, na.rm=T ) ) ) | |
| 107 } | |
| 108 if ( comparator == "gte" ) { | |
| 109 return( apply( data, | |
| 110 1, | |
| 111 function(x) sum( x >= threshold, na.rm=T ) ) ) | |
| 112 } | |
| 113 if ( comparator == "gt" ) { | |
| 114 return( apply( data, | |
| 115 1, | |
| 116 function(x) sum( x > threshold, na.rm=T ) ) ) | |
| 117 } | |
| 118 if ( comparator == "bothe" ) { | |
| 119 return( apply( data, | |
| 120 1, | |
| 121 function(x) sum( abs(x) >= threshold, na.rm=T ) ) ) | |
| 122 } | |
| 123 if ( comparator == "both" ) { | |
| 124 return( apply( data, | |
| 125 1, | |
| 126 function(x) sum( abs(x) > threshold, na.rm=T ) ) ) | |
| 127 } | |
| 128 } | |
| 129 | |
| 130 | |
| 131 | |
| 132 | |
| 133 if ( opt$filter.type=="active" ) { | |
| 134 ## this is an implementation of the activity filter that was used in the original PARADIGM paper | |
| 135 filter.vect <- count.samps.threshold( data, opt$threshold, "gt" ) | |
| 136 } else { | |
| 137 if ( opt$filter.type=="inactive" ) { | |
| 138 filter.vect <- count.samps.threshold( data, -opt$threshold, "lt" ) | |
| 139 } else { | |
| 140 if ( opt$filter.type=="modulated" ) { | |
| 141 filter.vect <- count.samps.threshold( data, opt$threshold, "both" ) | |
| 142 } else { | |
| 143 stop( "invalid filter.type specified: ", opt$filter.type ) | |
| 144 } | |
| 145 } | |
| 146 } | |
| 147 | |
| 148 if ( opt$perc.pass <1 ) { | |
| 149 filter.vect <- filter.vect > floor( ncol( data ) * opt$perc.pass ) | |
| 150 } else { | |
| 151 filter.vect <- filter.vect >= opt$perc.pass | |
| 152 } | |
| 153 data <- data[ filter.vect, ] | |
| 154 | |
| 155 write.table( data, opt$output.name, sep="\t", row.names=TRUE, col.names=NA, quote=FALSE ) | |
| 156 |
