0
|
1 #!/usr/bin/env Rscript
|
|
2 ## IPL selection script by Peter Waltman
|
|
3 ## August 21, 2011
|
|
4 ## License under Creative Commons Attribution 3.0 Unported (CC BY 3.0)
|
|
5 ##
|
|
6 #usage, options and doc goes here
|
|
## Usage text, written to stderr when help is requested.  It is assembled line
## by line so that every option accepted by the getopt spec further down
## (including -e and -h) is listed.
argspec <- paste(
  "ipl.feature.selection.R takes a set of results from Paradigm, and filters for features that are",
  "active, inactive or modulated above a given IPL threshold over a sufficient percentage of samples.",
  "",
  "Usage:",
  "    ipl.feature.selection.R -d <data.file>",
  "Optional:",
  "    -o <output.name>",
  "    -g <genes-only>      ## to set if only returning genes (default is all features)",
  "    -f <filter.type>     ## filter.type must be either 'modulated', 'active' or 'inactive' (default is modulated)",
  "    -t <threshold>       ## the threshold to use for the filter (default is 0.25)",
  "    -e <empirical.fname> ## RData file providing opt.thresh, filter.type and perc.pass (overrides -f, -t and -p)",
  "    -p <perc.pass>       ## the percentage of samples that must pass the filter (default is 0.33;",
  "                         ## values of 1 or more are taken as a number of samples)",
  "    -v <verbose>         ## to set verbose on",
  "    -h <help>            ## print this message and exit",
  "",
  "\n",
  sep="\n" )

args <- commandArgs( TRUE )

## Print the usage text and exit whenever help is requested, not only when
## "--help" happens to be the sole argument.
if ( any( args %in% c( "--help", "-h" ) ) ) {
  write( argspec, stderr() )
  q()
}
|
|
26
|
|
## Attach a package without printing its startup messages.
##
## package: the package to attach, given either as a bare name
##          (lib.load.quiet(getopt)) or as a character string.
lib.load.quiet <- function( package ) {
  ## capture the argument unevaluated so that a bare name can be used
  pkg.name <- as.character( substitute( package ) )
  suppressPackageStartupMessages( library( pkg.name, character.only=TRUE ) )
}
|
|
lib.load.quiet( getopt )

## Command-line specification handed to getopt(), one option per row:
##   long name, short flag, argument mask, value type
## (argument mask per the getopt package: 0 = no argument, 1 = required, 2 = optional)
spec <- matrix( c( "data.fname",      "d", 1, "character",
                   "output.name",     "o", 2, "character",
                   "genes.only",      "g", 0, "logical",
                   "filter.type",     "f", 2, "character", ## must be either 'active', 'inactive' or 'modulated'
                   "threshold",       "t", 2, "numeric",
                   "empirical.fname", "e", 2, "character",
                   "perc.pass",       "p", 2, "numeric",
                   "verbose",         "v", 0, "logical",   ## to set verbose on
                   "help",            "h", 0, "logical"
                  ),
                ncol=4,
                byrow=TRUE
               )

opt <- getopt( spec=spec )

## the help flag is part of the spec, so honour it: print the usage text and exit
if ( ! is.null( opt$help ) ) {
  write( argspec, stderr() )
  q()
}

## the data file is the only mandatory option; fail early with a clear message
## rather than with a cryptic error once the file name is first used
if ( is.null( opt$data.fname ) ) {
  stop( "no data file specified; please provide one with -d <data.file>" )
}

## set some reasonable defaults for the options that are needed,
## but were not specified.
if ( is.null( opt$verbose ) ) { opt$verbose <- FALSE }
if ( is.null( opt$genes.only ) ) { opt$genes.only <- FALSE }
if ( is.null( opt$filter.type ) ) { opt$filter.type <- 'modulated' }
if ( is.null( opt$threshold ) ) { opt$threshold <- 0.25 }
if ( is.null( opt$perc.pass ) ) { opt$perc.pass <- 1/3 }

## now set filter.type, threshold & perc.pass if an empirical result has been passed in
if ( ! is.null( opt$empirical.fname ) ) {

  if ( ! file.exists( opt$empirical.fname ) ) {
    stop( "can't find empirical result file: ", opt$empirical.fname )
  }

  ## assume this is an RData file.  It is loaded into a private environment so
  ## that whatever objects it holds cannot overwrite this script's own
  ## variables (opt, spec, args, ...).
  emp.env <- new.env()
  emp.fname.contents <- load( opt$empirical.fname, envir=emp.env )

  if ( ! "opt.thresh" %in% emp.fname.contents ) stop( "no optimal threshold value found in RData file passed in\n" )
  opt$threshold <- get( "opt.thresh", envir=emp.env )

  if ( ! "filter.type" %in% emp.fname.contents ) stop( "no filter type value found in RData file passed in\n" )
  opt$filter.type <- get( "filter.type", envir=emp.env )

  if ( ! "perc.pass" %in% emp.fname.contents ) stop( "no percentage passing value found in RData file passed in\n" )
  opt$perc.pass <- get( "perc.pass", envir=emp.env )
}

## validate the final values, i.e. after any override from the empirical file;
## the NA test keeps an unparseable number from crashing the comparison
if ( is.na( opt$perc.pass ) || opt$perc.pass < 0 ) {
  stop( "please specify a positive number for the percentage of samples that pass the filter (if applicable)" )
}
if ( length( opt$filter.type ) != 1 ||
     ! opt$filter.type %in% c( 'active', 'inactive', 'modulated' ) ) {
  stop( 'invalid filter.type specified: ', opt$filter.type, "\n" )
}

## default output: "<filter.type>.<data file name>" in the current working directory
if ( is.null( opt$output.name ) ) {
  opt$output.name <- file.path( getwd(),
                                paste( opt$filter.type, basename( opt$data.fname ), sep="." ) )
}
|
|
81
|
|
82
|
|
83
|
|
## Read the tab-delimited input: the first line supplies the column names and
## the first column the row names.  check.names=FALSE keeps the column names
## exactly as they appear in the file.
data <- as.matrix( read.delim( opt$data.fname, header=TRUE, row.names=1, check.names=FALSE ) )

if ( opt$genes.only ) {
  ## rows whose name contains "abstract", "complex" or "family" are dropped,
  ## leaving only the gene features
  is.gene <- ! grepl( "abstract|complex|family", rownames( data ) )
  ## drop=FALSE keeps a matrix even when a single row survives; without it the
  ## result collapses to a plain vector and the row-wise counting below fails
  data <- data[ is.gene, , drop=FALSE ]
}
|
|
90
|
|
91
|
|
## Count, for every row of a matrix, how many entries satisfy a comparison
## against a threshold.  NA entries are never counted.
##
## data:       numeric matrix; one count is produced per row
## threshold:  the value each entry is compared against
## comparator: which comparison to apply, one of
##               "lt"    entry <  threshold
##               "lte"   entry <= threshold
##               "gt"    entry >  threshold
##               "gte"   entry >= threshold
##               "both"  abs(entry) >  threshold
##               "bothe" abs(entry) >= threshold
##
## Returns an integer vector with one count per row, carrying the row names of
## data.  Stops with an error for an unrecognised comparator (previously NULL
## was returned silently, which made the caller filter out every row).
count.samps.threshold <- function( data,
                                   threshold,
                                   comparator ## must be one of lt, lte, gt, gte, both, bothe
                                  ) {
  if ( ! is.character( comparator ) || length( comparator ) != 1 ) {
    stop( "comparator must be a single character string" )
  }

  ## logical matrix marking the entries that pass the comparison; the trailing
  ## unnamed argument of switch() is only evaluated when nothing else matches
  hits <- switch( comparator,
                  lt    = data < threshold,
                  lte   = data <= threshold,
                  gt    = data > threshold,
                  gte   = data >= threshold,
                  both  = abs( data ) > threshold,
                  bothe = abs( data ) >= threshold,
                  stop( "invalid comparator specified: ", comparator ) )

  ## one vectorised pass instead of an apply() call per row; this also yields a
  ## proper zero-length result for a matrix without rows
  counts <- rowSums( hits, na.rm=TRUE )
  storage.mode( counts ) <- "integer"  ## counts are whole numbers; names are kept
  counts
}
|
|
128
|
|
129
|
|
130
|
|
131
|
|
## Per row of data, count the columns that pass the selected filter:
##   active    - value above the threshold
##               (this is an implementation of the activity filter that was
##               used in the original PARADIGM paper)
##   inactive  - value below the negated threshold
##   modulated - absolute value above the threshold
filter.vect <- switch( opt$filter.type,
                       active    = count.samps.threshold( data, opt$threshold, "gt" ),
                       inactive  = count.samps.threshold( data, -opt$threshold, "lt" ),
                       modulated = count.samps.threshold( data, opt$threshold, "both" ),
                       stop( "invalid filter.type specified: ", opt$filter.type ) )

## Turn the counts into a keep/drop flag per row.  A perc.pass below 1 is a
## fraction of the columns and the count must be strictly greater than
## floor( ncol * perc.pass ); a perc.pass of 1 or more is an absolute count
## that must be reached or exceeded.
if ( opt$perc.pass < 1 ) {
  filter.vect <- filter.vect > floor( ncol( data ) * opt$perc.pass )
} else {
  filter.vect <- filter.vect >= opt$perc.pass
}

## drop=FALSE keeps the matrix shape when exactly one row passes (or there is
## only one column); otherwise the result collapses to a vector and the table
## written below loses its row/column layout
data <- data[ filter.vect, , drop=FALSE ]

## tab-delimited output; col.names=NA writes an empty header cell above the
## row-name column
write.table( data, opt$output.name, sep="\t", row.names=TRUE, col.names=NA, quote=FALSE )
|
|
155
|