annotate cluster.tools/ipl.feature.selection.R @ 2:b442996b66ae draft

Uploaded
author peter-waltman
date Wed, 27 Feb 2013 20:17:04 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
b442996b66ae Uploaded
peter-waltman
parents:
diff changeset
1 #!/usr/bin/env Rscript
b442996b66ae Uploaded
peter-waltman
parents:
diff changeset
2 ## IPL selection script by Peter Waltman
b442996b66ae Uploaded
peter-waltman
parents:
diff changeset
3 ## August 21, 2011
b442996b66ae Uploaded
peter-waltman
parents:
diff changeset
4 ## Licensed under Creative Commons Attribution 3.0 Unported (CC BY 3.0)
b442996b66ae Uploaded
peter-waltman
parents:
diff changeset
5 ##
b442996b66ae Uploaded
peter-waltman
parents:
diff changeset
6 #usage, options and doc goes here
b442996b66ae Uploaded
peter-waltman
parents:
diff changeset
## Usage message.  It lists every option declared in the getopt spec further
## down, including -e (empirical result file) and -h (help).
argspec <- c("ipl.feature.selection.R takes a set of results from Paradigm, and filters for features that are
active, inactive or modulated above a given IPL threshold over a sufficient percentage of samples.

        Usage:
                ipl.feature.selection.R -d <data.file>
        Optional:
                -o <output.name>      ## default is <filter.type>.<data file name> in the working directory
                -g                    ## to set if only returning genes (default is all features)
                -f <filter.type>      ## filter.type must be either 'modulated', 'active' or 'inactive' (default is modulated)
                -t <threshold>        ## the threshold to use for the filter (default is 0.25)
                -p <perc.pass>        ## the percentage of samples that must pass the filter (default is 0.33);
                                      ## a value of 1 or more is treated as a number of samples
                -e <empirical.fname>  ## RData file providing opt.thresh, filter.type and perc.pass
                                      ## (these override -t, -f and -p)
                -v                    ## to set verbose on
                -h                    ## print this message and exit

\n\n")

## Answer a lone help request before any package is loaded, so that the usage
## can be printed even when getopt is not installed.  Both the long and the
## short spelling are accepted.
args <- commandArgs( TRUE )
if ( length( args ) == 1 && args %in% c( "--help", "-h" ) ) {
  write( argspec, stderr() )
  q()
}
b442996b66ae Uploaded
peter-waltman
parents:
diff changeset
26
b442996b66ae Uploaded
peter-waltman
parents:
diff changeset
## Attach a package while silencing its startup messages.
##
## package: the package to attach.  The argument is captured unevaluated, so
##          it may be written as a bare name (lib.load.quiet(getopt)) or as a
##          string literal (lib.load.quiet("getopt")).
## Returns whatever library() returns.
lib.load.quiet <- function( package ) {
  pkg.name <- as.character( substitute( package ) )
  suppressPackageStartupMessages( library( pkg.name, character.only=TRUE ) )
}
b442996b66ae Uploaded
peter-waltman
parents:
diff changeset
lib.load.quiet(getopt)

## getopt specification, one row per option:
##   long name, short flag, argument mask, data type
## (per the getopt documentation the mask is 0 = no argument,
##  1 = required argument, 2 = optional argument)
spec <- matrix( c( "data.fname",      "d", 1, "character",
                   "output.name",     "o", 2, "character",
                   "genes.only",      "g", 0, "logical",
                   "filter.type",     "f", 2, "character",  ## must be either 'active', 'inactive' or 'modulated'
                   "threshold",       "t", 2, "numeric",
                   "empirical.fname", "e", 2, "character",
                   "perc.pass",       "p", 2, "numeric",
                   "verbose",         "v", 0, "logical",    ## to set verbose on
                   "help",            "h", 0, "logical"
                  ),
                ncol=4,
                byrow=TRUE
               )

opt <- getopt( spec=spec )

## The help flag is declared in the spec, so honour it even when it is given
## together with other arguments.
if ( ! is.null( opt$help ) ) {
  write( argspec, stderr() )
  q()
}
b442996b66ae Uploaded
peter-waltman
parents:
diff changeset
##save.image( "~/work.local/tmp/ipl.feature.sel.dbg.rda" )

## The data file is used below to build the default output name and is read
## right after this section, so fail early with a clear message when it is
## missing.
if ( is.null( opt$data.fname ) ) {
  stop( "no data file specified; please supply one with -d <data.file>" )
}

## set some reasonable defaults for the options that are needed,
## but were not specified.
if ( is.null( opt$verbose ) ) { opt$verbose <- FALSE }
if ( is.null( opt$genes.only ) ) { opt$genes.only <- FALSE }
if ( is.null( opt$filter.type ) ) { opt$filter.type <- 'modulated' }
if ( is.null( opt$threshold ) ) { opt$threshold <- 0.25 }
if ( is.null( opt$perc.pass ) ) { opt$perc.pass <- 1/3 }
if ( opt$perc.pass < 0 ) {
  stop( "please specify a positive number for the percentage of samples that pass the filter (if applicable)" )
}

## now set filter.type, threshold & perc.pass if an empirical result has been passed in
if ( ! is.null( opt$empirical.fname ) ) {

  if ( ! file.exists( opt$empirical.fname ) ) {
    stop( "can't find empirical result file: ", opt$empirical.fname )
  }

  ## assume this is an RData file.  It is loaded into its own environment so
  ## that objects stored in it cannot overwrite variables of this script
  ## (e.g. 'opt' or 'spec').
  emp.env <- new.env()
  emp.fname.contents <- load( opt$empirical.fname, envir=emp.env )

  if ( ! "opt.thresh" %in% emp.fname.contents ) stop( "no optimal threshold value found in RData file passed in\n" )
  opt$threshold <- emp.env[[ "opt.thresh" ]]

  if ( ! "filter.type" %in% emp.fname.contents ) stop( "no filter type value found in RData file passed in\n" )
  opt$filter.type <- emp.env[[ "filter.type" ]]

  if ( ! "perc.pass" %in% emp.fname.contents ) stop( "no percentage passing value found in RData file passed in\n" )
  opt$perc.pass <- emp.env[[ "perc.pass" ]]
}

## validate the final filter type, whether it came from the command line,
## the default or the empirical result file
if ( ! opt$filter.type %in% c( 'active', 'inactive', 'modulated' ) ) {
  stop( 'invalid filter.type specified: ', opt$filter.type )
}

## default output: <filter.type>.<data file name> in the working directory
if ( is.null( opt$output.name ) ) {
  opt$output.name <- file.path( getwd(),
                                paste( opt$filter.type, basename( opt$data.fname ), sep="." ) )
}
b442996b66ae Uploaded
peter-waltman
parents:
diff changeset
82
b442996b66ae Uploaded
peter-waltman
parents:
diff changeset
83
b442996b66ae Uploaded
peter-waltman
parents:
diff changeset
84
b442996b66ae Uploaded
peter-waltman
parents:
diff changeset
## Read the tab-delimited data file into a matrix; the first column supplies
## the row names and column names are kept exactly as they appear in the file.
data <- as.matrix( read.delim( opt$data.fname, header=TRUE, row.names=1, check.names=FALSE ) )

if ( opt$genes.only ) {
  ## keep only rows whose name does not contain 'abstract', 'complex' or 'family'
  genes <- rownames( data )
  genes <- genes[ ! grepl( "abstract|complex|family", genes ) ]
  ## drop=FALSE keeps 'data' a matrix even when a single row (or a single
  ## column) is left; the counting step below needs the matrix dimensions
  data <- data[ genes, , drop=FALSE ]
}
b442996b66ae Uploaded
peter-waltman
parents:
diff changeset
91
b442996b66ae Uploaded
peter-waltman
parents:
diff changeset
92
b442996b66ae Uploaded
peter-waltman
parents:
diff changeset
## Count, for every row of 'data', how many entries satisfy a comparison
## against 'threshold'.  NA entries are never counted.
##
## data:       matrix with one row per feature
## threshold:  value that each entry is compared against
## comparator: which comparison to apply; must be one of
##               "lt"    entry <  threshold
##               "lte"   entry <= threshold
##               "gte"   entry >= threshold
##               "gt"    entry >  threshold
##               "bothe" abs(entry) >= threshold
##               "both"  abs(entry) >  threshold
##
## Returns a vector with one count per row (named by the row names of 'data',
## if it has any).  Any other comparator is an error; previously the function
## fell through and silently returned NULL.
count.samps.threshold <- function( data,
                                   threshold,
                                   comparator ## must be one of lt, lte, gte, gt, bothe, both
                                  ) {
  valid.comparators <- c( "lt", "lte", "gte", "gt", "bothe", "both" )
  if ( ! is.character( comparator ) ||
       length( comparator ) != 1 ||
       ! comparator %in% valid.comparators ) {
    stop( "invalid comparator specified; must be one of: ",
          paste( valid.comparators, collapse=", " ) )
  }

  ## logical matrix marking the entries that pass the comparison
  passes <- switch( comparator,
                    lt    = data <  threshold,
                    lte   = data <= threshold,
                    gte   = data >= threshold,
                    gt    = data >  threshold,
                    bothe = abs( data ) >= threshold,
                    both  = abs( data ) >  threshold )

  ## per-row count of passing entries, ignoring NAs
  rowSums( passes, na.rm=TRUE )
}
b442996b66ae Uploaded
peter-waltman
parents:
diff changeset
129
b442996b66ae Uploaded
peter-waltman
parents:
diff changeset
130
b442996b66ae Uploaded
peter-waltman
parents:
diff changeset
131
b442996b66ae Uploaded
peter-waltman
parents:
diff changeset
132
b442996b66ae Uploaded
peter-waltman
parents:
diff changeset
## For every feature (row of 'data'), count the samples that satisfy the
## requested filter:
##   active    - value above the threshold
##   inactive  - value below the negated threshold
##   modulated - absolute value above the threshold
if ( opt$filter.type == "active" ) {
  ## this is an implementation of the activity filter that was used in the original PARADIGM paper
  filter.vect <- count.samps.threshold( data=data, threshold=opt$threshold, comparator="gt" )
} else if ( opt$filter.type == "inactive" ) {
  filter.vect <- count.samps.threshold( data=data, threshold=-opt$threshold, comparator="lt" )
} else if ( opt$filter.type == "modulated" ) {
  filter.vect <- count.samps.threshold( data=data, threshold=opt$threshold, comparator="both" )
} else {
  stop( "invalid filter.type specified: ", opt$filter.type )
}
b442996b66ae Uploaded
peter-waltman
parents:
diff changeset
147
b442996b66ae Uploaded
peter-waltman
parents:
diff changeset
## Turn the per-feature sample counts into a keep/drop flag.
##   perc.pass <  1: treated as a fraction of the samples; a feature is kept
##                   when strictly more than floor( ncol * perc.pass ) samples pass
##   perc.pass >= 1: treated as a number of samples; a feature is kept when at
##                   least that many samples pass
if ( opt$perc.pass < 1 ) {
  filter.vect <- filter.vect > floor( ncol( data ) * opt$perc.pass )
} else {
  filter.vect <- filter.vect >= opt$perc.pass
}

## drop=FALSE keeps 'data' a matrix when exactly one feature passes; without
## it the single row collapses to a plain vector and write.table would write
## it as one column, losing the feature name
data <- data[ filter.vect, , drop=FALSE ]

## tab-delimited output; col.names=NA writes a blank header cell above the
## row names so that the header lines up with the data columns
write.table( data, opt$output.name, sep="\t", row.names=TRUE, col.names=NA, quote=FALSE )
b442996b66ae Uploaded
peter-waltman
parents:
diff changeset
156