comparison normalize.r @ 0:31cfcab40d8f draft

Uploaded
author ynewton
date Wed, 26 Sep 2012 17:32:30 -0400
parents
children 710627b47962
comparison
equal deleted inserted replaced
-1:000000000000 0:31cfcab40d8f
1 #!/usr/bin/Rscript
2
3 #usage, options and doc goes here
4 argspec <- c("normalize.r - takes any flat file and normalizes the rows or the columns using various normalizations (median_shift, mean_shift, t_statistic (z-score), exp_fit, normal_fit, weibull_0.5_fit, weibull_1_fit, weibull_1.5_fit, weibull_5_fit). Requires a single header line and a single cloumn of annotation.
5 Usage:
6 normalize.r input.tab norm_type norm_by > output.tab
7 Example:
8 Rscript normalize.r test_matrix.tab median_shift column > output.tab
9 Rscript normalize.r test_matrix.tab mean_shift row normals.tab > output.tab
10 Options:
11 input matrix (annotated by row and column names)
12 normalization type; available options:
13 median_shift - shifts all values by the median or the row/column if no normals are specified, otherwise shifts by the median of normals
14 mean_shift - shifts all values by the mean or the row/column if no normals are specified, otherwise shifts by the mean of normals
15 t_statistic - converts all values to z-scores; if normals are specified then converts to z-scores within normal and non-normal classes separately
16 exp_fit - (only by column) ranks data and transforms exponential CDF
17 normal_fit - (only by column) ranks data and transforms normal CDF
18 weibull_0.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 0.5
19 weibull_1_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1
20 weibull_1.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1.5
21 weibull_5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 5
22 normalization by:
23 row
24 column
25 normals_file is an optional parameter which contains a list of column headers from the input matrix, which should be considered as normals
26 output file is specified through redirect character >")
27
28 read_matrix <- function(in_file){
29 header <- strsplit(readLines(con=in_file, n=1), "\t")[[1]]
30 cl.cols<- 1:length(header) > 1
31 data_matrix.df <- read.delim(in_file, header=TRUE, row.names=NULL, stringsAsFactors=FALSE, na.strings="NA", check.names=FALSE)
32 data_matrix <- as.matrix(data_matrix.df[,cl.cols])
33 rownames(data_matrix) <- data_matrix.df[,1]
34 return(data_matrix)
35
36 #read_mtrx <- as.matrix(read.table(in_file, header=TRUE, sep="", row.names=NULL, stringsAsFactors=FALSE, na.strings="NA")) #separate on white characters
37 #read_mtrx[,1]
38
39 #return(as.matrix(read.table(in_file, header=TRUE, sep="", row.names=1))) #separate on white characters
40 #mtrx <- read.delim(in_file, header=TRUE, sep="", row.names=NULL, stringsAsFactors=FALSE, na.strings="NA")
41 #print(mtrx[1,])
42 }
43
44 write_matrix <- function(data_matrix){
45 header <- append(c("Genes"), colnames(data_matrix))
46 write.table(t(header), stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)
47 write.table(data_matrix, stdout(), quote=FALSE, sep="\t", row.names=TRUE, col.names=FALSE)
48 }
49
50 read_normals <- function(in_file){
51 return(as.matrix(read.table(in_file, header=FALSE, sep="", as.is = TRUE))[, 1])
52 }
53
54 normalize <- function(data_matrix, norm_type, normals_list, tumors_list){
55 if(norm_type == 'MEDIAN_SHIFT'){
56 return(shift(data_matrix, 'MEDIAN', normals_list, tumors_list))
57 }
58 else if(norm_type == 'MEAN_SHIFT'){
59 return(shift(data_matrix, 'MEAN', normals_list, tumors_list))
60 }
61 else if(norm_type == 'T_STATISTIC'){
62 return(compute_z_score(data_matrix, normals_list, tumors_list))
63 }
64 else if(norm_type == 'EXPONENTIAL_FIT'){
65 return(fit_distribution(data_matrix, 'EXPONENTIAL'))
66 }
67 else if(norm_type == 'NORMAL_FIT'){
68 return(fit_distribution(data_matrix, 'NORMAL'))
69 }
70 else if(norm_type == 'WEIBULL_0.5_FIT'){
71 return(fit_distribution(data_matrix, 'WEIBULL_0.5'))
72 }
73 else if(norm_type == 'WEIBULL_1_FIT'){
74 return(fit_distribution(data_matrix, 'WEIBULL_1'))
75 }
76 else if(norm_type == 'WEIBULL_1.5_FIT'){
77 return(fit_distribution(data_matrix, 'WEIBULL_1.5'))
78 }
79 else if(norm_type == 'WEIBULL_5_FIT'){
80 return(fit_distribution(data_matrix, 'WEIBULL_5'))
81 }
82 }
83
84 shift <- function(data_matrix, shift_type, normals_list, tumors_list){
85 return(t(apply(data_matrix, 1, shift_normalize_row, norm_type=shift_type, normals_list=normals_list, tumors_list=tumors_list)))
86 }
87
88 shift_normalize_row <- function(data_row, norm_type, normals_list, tumors_list){
89 if(length(normals_list) == 0){ #no normals are specified
90 if(norm_type == 'MEDIAN'){
91 row_stat <- median(data_row)
92 }
93 else if(norm_type == 'MEAN'){
94 row_stat <- mean(data_row)
95 }
96 return(unlist(lapply(data_row, function(x){return(x - row_stat);})))
97 }
98 else{ #normals are specified
99 normal_values <- data_row[normals_list]
100 tumor_columns <- data_row[tumors_list]
101
102 if(norm_type == 'MEDIAN'){
103 row_stat <- median(normal_values)
104 }
105 else if(norm_type == 'MEAN'){
106 row_stat <- mean(normal_values)
107 }
108 return(unlist(lapply(tumor_columns, function(x){return(x - row_stat);})))
109 }
110 }
111
112 compute_z_score <- function(data_matrix, normals_list, tumors_list){
113 return(t(apply(data_matrix, 1, t_stat_normalize_row, normals_list=normals_list, tumors_list=tumors_list)))
114 }
115
116 t_stat_normalize_row <- function(data_row, normals_list, tumors_list){
117 if(length(normals_list) == 0){ #no normals are specified
118 row_mean <- mean(data_row)
119 row_sd <- sd(data_row)
120 return(unlist(lapply(data_row, function(x){return((x - row_mean)/row_sd);})))
121 }
122 else{ #normals are specified
123 normal_values <- data_row[normals_list]
124 normal_mean <- mean(normal_values)
125 normal_sd <- sd(normal_values)
126 normalized_normals <- unlist(lapply(normal_values, function(x){return((x - normal_mean)/normal_sd);}))
127
128 tumor_values <- data_row[tumors_list]
129 tumor_mean <- mean(tumor_values)
130 tumor_sd <- sd(tumor_values)
131 normalized_tumors <- unlist(lapply(tumor_values, function(x){return((x - tumor_mean)/tumor_sd);}))
132
133 return(append(normalized_normals, normalized_tumors))
134 }
135 }
136
137 rankNA <- function(col){ #originally written by Dan Carlin
138 col[!is.na(col)]<-(rank(col[!is.na(col)])/sum(!is.na(col)))-(1/sum(!is.na(col)))
139 return(col)
140 }
141
142 fit_distribution <- function(data_matrix, dist){
143 if(dist == 'EXPONENTIAL'){
144 ranked_data_matrix <- apply(data_matrix,2,rankNA) #idea by Dan Carlin
145 return(apply(ranked_data_matrix, c(1,2), qexp))
146 }
147 else if(dist == 'NORMAL'){
148 ranked_data_matrix <- apply(data_matrix,2,rankNA)
149 #return(apply(ranked_data_matrix, c(1,2), function(x){return(qnorm(mean=mean(x), sd=sd(x)));}))
150 return(apply(ranked_data_matrix, c(1,2), qnorm, mean=0, sd=2))
151 }
152 else if(dist == 'WEIBULL_0.5'){
153 ranked_data_matrix <- apply(data_matrix,2,rankNA)
154 return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=0.5))
155 }
156 else if(dist == 'WEIBULL_1'){
157 ranked_data_matrix <- apply(data_matrix,2,rankNA)
158 return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=1))
159 }
160 else if(dist == 'WEIBULL_1.5'){
161 ranked_data_matrix <- apply(data_matrix,2,rankNA)
162 return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=1.5))
163 }
164 else if(dist == 'WEIBULL_5'){
165 ranked_data_matrix <- apply(data_matrix,2,rankNA)
166 return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=5))
167 }
168 }
169
170 main <- function(argv) {
171 #determine if correct number of arguments are specified and if normals are specified
172 with_normals = FALSE
173
174 if(length(argv) == 1){
175 if(argv==c('--help')){
176 write(argspec, stderr());
177 q();
178 }
179 }
180
181 if(!(length(argv) == 3 || length(argv) == 4)){
182 write("ERROR: invalid number of arguments is specified", stderr());
183 q();
184 }
185
186 if(length(argv) == 4){
187 with_normals = TRUE
188 normals_file <- argv[4]
189 }
190
191 #store command line arguments in variables:
192 input_file <- argv[1]
193 norm_type <- toupper(argv[2])
194 norm_by <- toupper(argv[3])
195
196 #read the input file(s):
197 data_matrix <- read_matrix(input_file)
198
199 if(with_normals){
200 normals_list <- read_normals(normals_file)
201 normals_indices <- which(colnames(data_matrix) %in% normals_list)
202 tumor_indices <- which(!(colnames(data_matrix) %in% normals_list))
203 norm_by <- 'ROW'
204 }
205 else{
206 normals_indices <- c()
207 tumor_indices <- c()
208 }
209
210 #if normalize by columns then transpose the matrix:
211 if(norm_by == 'COLUMN'){
212 data_matrix <- t(data_matrix)
213 }
214
215 #normalize:
216 data_matrix <- normalize(data_matrix, norm_type, normals_indices, tumor_indices)
217
218 #if normalize by columns then transpose the matrix again since we normalized the transposed matrix by row:
219 if(norm_by == 'COLUMN'){
220 data_matrix <- t(data_matrix)
221 }
222
223 write_matrix(data_matrix)
224 #print(data_matrix)
225 }
226
227 main(commandArgs(TRUE))