1
|
1 #!/usr/bin/env Rscript
|
|
2 ## RNASeq selection script by Peter Waltman
|
|
3 ## August 21, 2011
|
|
4 ## License under Creative Commons Attribution 3.0 Unported (CC BY 3.0)
|
|
5 ##
|
|
6 #usage, options and doc goes here - NEEDS TO BE UPDATED ---- TBD
|
|
7 argspec <- c("ipl.feature.selection.R takes a set of results from Paradigm, and filters for features that are
|
|
8 active, inactive or modulated above a given IPL threshold over a sufficient percentage of samples.
|
|
9
|
|
10 Usage:
|
|
11 ipl.feature.selection.R -d <data.file>
|
|
12 Optional:
|
|
13 -o <output.name>
|
|
14 -g <genes-only> ## to set if only returning genes (default is all features)
|
|
15 -f <filter.type> ## filter.type must be either 'modulated', 'active'or 'inactive' (default is modulated)
|
|
16 -t <threshold> ## the threshold to use for the filter (default is 0.25)
|
|
17 -p <perc.pass> ## the percentage of samples that must pass the filter (default is 0.33)
|
|
18 -v <verbose> ## to set verbose on
|
|
19
|
|
20 \n\n")
|
|
21 args <- commandArgs(TRUE)
|
|
22 if ( length( args ) == 1 && args =="--help") {
|
|
23 write(argspec, stderr())
|
|
24 q();
|
|
25 }
|
|
26
|
|
27 ## some helper fn's
|
|
28 write.2.tab <- function( mat,
|
|
29 fname ) {
|
|
30 mat <- rbind( colnames( mat ), mat )
|
|
31 mat <- cbind( c( "ID", rownames( mat )[-1] ),
|
|
32 mat )
|
|
33 write.table( mat, fname, sep="\t", row.names=FALSE, col.names=FALSE, quote=FALSE )
|
|
34 }
|
|
35
|
|
36 lib.load.quiet <- function( package ) {
|
|
37 package <- as.character(substitute(package))
|
|
38 suppressPackageStartupMessages( do.call( "library", list( package=package ) ) )
|
|
39 }
|
|
40 ## MAXIMUM absolute deviation function
|
|
41 maxad <- function( x, na.rm=FALSE, center=median(x, na.rm=na.rm ) ) {
|
|
42 return( max( abs(x), na.rm=na.rm ) - center )
|
|
43 }
|
|
44
|
|
45 lib.load.quiet(getopt)
|
|
46
|
|
47 spec <- matrix( c( "data.fname", "d", 1, "character",
|
|
48 "output.fname", "o", 2, "character",
|
3
|
49 "var.method", "m", 2, "character",
|
1
|
50 "z.transform", "z", 2, "character",
|
|
51 "perc.pass", "p", 2, "numeric"
|
|
52 ),
|
|
53 nc=4,
|
|
54 byrow=TRUE
|
|
55 )
|
|
56
|
|
57 opt <- getopt( spec=spec )
|
|
58 data <- as.matrix( read.delim( opt$data.fname, header=T, row.names=1 , check.names=FALSE ) )
|
|
59 #set some reasonable defaults for the options that are needed,
|
|
60 #but were not specified.
|
|
61 if ( is.null( opt$perc.pass ) ) { opt$perc.pass=1500 }
|
|
62 if ( opt$perc.pass < 0 ) {
|
|
63 stop( "please specify a positive number for the percentage of samples that pass the filter (if applicable)" )
|
|
64 } else {
|
|
65 if ( opt$perc.pass < 1 ) {
|
|
66 opt$perc.pass <- floor( opt$perc.pass * nrow( data ) )
|
|
67 }
|
|
68 }
|
|
69
|
|
70 if ( is.null( opt$var.method ) ) { opt$var.method <- 'mad' }
|
|
71 if ( is.null( opt$z.transform ) ) {
|
|
72 opt$z.transform <- TRUE
|
|
73 } else {
|
|
74 opt$z.transform <- ( tolower( opt$z.transform ) %in% "yes" )
|
|
75 }
|
|
76 if ( is.null( opt$output.fname ) ) opt$output.fname <- "high.var.rnaseq.tab"
|
|
77
|
|
78 if ( opt$z.transform ) {
|
|
79 ##normalize it to have mean==0, std==1
|
|
80 data <- t( data )
|
|
81 data <- t( scale( data ) )
|
|
82 }
|
|
83
|
|
84
|
|
85 my.var.fn <- get( opt$var.method )
|
|
86 my.var.vals <- apply( data, 1, my.var.fn, na.rm=T )
|
|
87 filter.vect <- sort( my.var.vals, dec=TRUE, index=TRUE )$ix[ 1:opt$perc.pass ]
|
|
88
|
|
89 data <- data[ filter.vect, ]
|
|
90 write.2.tab( data, opt$output.fname )
|
|
91
|