comparison normalize.r @ 9:600872152be6 draft

Uploaded
author ynewton
date Sat, 20 Oct 2012 02:28:37 -0400
parents 710627b47962
children
comparison
equal deleted inserted replaced
8:277a79e23357 9:600872152be6
11 input matrix (annotated by row and column names) 11 input matrix (annotated by row and column names)
12 normalization type; available options: 12 normalization type; available options:
13 median_shift - shifts all values by the median or the row/column if no normals are specified, otherwise shifts by the median of normals 13 median_shift - shifts all values by the median or the row/column if no normals are specified, otherwise shifts by the median of normals
14 mean_shift - shifts all values by the mean or the row/column if no normals are specified, otherwise shifts by the mean of normals 14 mean_shift - shifts all values by the mean or the row/column if no normals are specified, otherwise shifts by the mean of normals
15 t_statistic - converts all values to z-scores; if normals are specified then converts to z-scores within normal and non-normal classes separately 15 t_statistic - converts all values to z-scores; if normals are specified then converts to z-scores within normal and non-normal classes separately
16 exp_fit - (only by column) ranks data and transforms exponential CDF 16 exponential_fit - (only by column) ranks data and transforms exponential CDF
17 normal_fit - (only by column) ranks data and transforms normal CDF 17 normal_fit - (only by column) ranks data and transforms normal CDF
18 weibull_0.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 0.5 18 weibull_0.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 0.5
19 weibull_1_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1 19 weibull_1_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1
20 weibull_1.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1.5 20 weibull_1.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1.5
21 weibull_5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 5 21 weibull_5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 5
22 normalization by: 22 normalization by:
23 row 23 row
24 column 24 column
25 normals_file is an optional parameter which contains a list of column headers from the input matrix, which should be considered as normals 25 normals_file is an optional parameter which contains either a list of column headers from the input matrix, which should be considered as normals, or a matrix of normal samples
26 output file is specified through redirect character >") 26 output file is specified through redirect character >")
27 27
28 read_matrix <- function(in_file){ 28 read_matrix <- function(in_file){
29 header <- strsplit(readLines(con=in_file, n=1), "\t")[[1]] 29 header <- strsplit(readLines(con=in_file, n=1), "\t")[[1]]
30 cl.cols<- 1:length(header) > 1 30 cl.cols<- 1:length(header) > 1
46 write.table(t(header), stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE) 46 write.table(t(header), stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)
47 write.table(data_matrix, stdout(), quote=FALSE, sep="\t", row.names=TRUE, col.names=FALSE) 47 write.table(data_matrix, stdout(), quote=FALSE, sep="\t", row.names=TRUE, col.names=FALSE)
48 } 48 }
49 49
50 read_normals <- function(in_file){ 50 read_normals <- function(in_file){
51 return(as.matrix(read.table(in_file, header=FALSE, sep="", as.is = TRUE))[, 1]) 51 #return(as.matrix(read.table(in_file, header=FALSE, sep="", as.is = TRUE))[, 1])
52 return(as.matrix(read.table(in_file, header=FALSE, sep="", as.is = TRUE)))
52 } 53 }
53 54
54 normalize <- function(data_matrix, norm_type, normals_list, tumors_list){ 55 normalize <- function(data_matrix, norm_type, normals_list, tumors_list){
55 if(norm_type == 'MEDIAN_SHIFT'){ 56 if(norm_type == 'MEDIAN_SHIFT'){
56 return(shift(data_matrix, 'MEDIAN', normals_list, tumors_list)) 57 return(shift(data_matrix, 'MEDIAN', normals_list, tumors_list))
76 else if(norm_type == 'WEIBULL_1.5_FIT'){ 77 else if(norm_type == 'WEIBULL_1.5_FIT'){
77 return(fit_distribution(data_matrix, 'WEIBULL_1.5')) 78 return(fit_distribution(data_matrix, 'WEIBULL_1.5'))
78 } 79 }
79 else if(norm_type == 'WEIBULL_5_FIT'){ 80 else if(norm_type == 'WEIBULL_5_FIT'){
80 return(fit_distribution(data_matrix, 'WEIBULL_5')) 81 return(fit_distribution(data_matrix, 'WEIBULL_5'))
81 } 82 }else{
83 write("ERROR: unknown normalization type", stderr());
84 q();
85 }
82 } 86 }
83 87
84 shift <- function(data_matrix, shift_type, normals_list, tumors_list){ 88 shift <- function(data_matrix, shift_type, normals_list, tumors_list){
85 return(t(apply(data_matrix, 1, shift_normalize_row, norm_type=shift_type, normals_list=normals_list, tumors_list=tumors_list))) 89 return(t(apply(data_matrix, 1, shift_normalize_row, norm_type=shift_type, normals_list=normals_list, tumors_list=tumors_list)))
86 } 90 }
96 return(unlist(lapply(data_row, function(x){return(x - row_stat);}))) 100 return(unlist(lapply(data_row, function(x){return(x - row_stat);})))
97 } 101 }
98 else{ #normals are specified 102 else{ #normals are specified
99 normal_values <- data_row[normals_list] 103 normal_values <- data_row[normals_list]
100 tumor_columns <- data_row[tumors_list] 104 tumor_columns <- data_row[tumors_list]
101 105
102 if(norm_type == 'MEDIAN'){ 106 if(norm_type == 'MEDIAN'){
103 row_stat <- median(normal_values) 107 row_stat <- median(normal_values)
104 } 108 }
105 else if(norm_type == 'MEAN'){ 109 else if(norm_type == 'MEAN'){
106 row_stat <- mean(normal_values) 110 row_stat <- mean(normal_values)
137 return(col) 141 return(col)
138 } 142 }
139 143
140 fit_distribution <- function(data_matrix, dist){ 144 fit_distribution <- function(data_matrix, dist){
141 if(dist == 'EXPONENTIAL'){ 145 if(dist == 'EXPONENTIAL'){
142 ranked_data_matrix <- apply(data_matrix,2,rankNA) #idea by Dan Carlin 146 ranked_data_matrix <- apply(data_matrix,1,rankNA) #idea by Dan Carlin
143 return(apply(ranked_data_matrix, c(1,2), qexp)) 147 #write.table(c("ranked data:"), stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)
148 #write.table(ranked_data_matrix, stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)
149 return(apply(ranked_data_matrix, 1, qexp))
144 } 150 }
145 else if(dist == 'NORMAL'){ 151 else if(dist == 'NORMAL'){
146 ranked_data_matrix <- apply(data_matrix,2,rankNA) 152 ranked_data_matrix <- apply(data_matrix,2,rankNA)
147 #return(apply(ranked_data_matrix, c(1,2), function(x){return(qnorm(mean=mean(x), sd=sd(x)));}))
148 return(apply(ranked_data_matrix, c(1,2), qnorm, mean=0, sd=1)) 153 return(apply(ranked_data_matrix, c(1,2), qnorm, mean=0, sd=1))
149 } 154 }
150 else if(dist == 'WEIBULL_0.5'){ 155 else if(dist == 'WEIBULL_0.5'){
151 ranked_data_matrix <- apply(data_matrix,2,rankNA) 156 ranked_data_matrix <- apply(data_matrix,2,rankNA)
152 return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=0.5)) 157 return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=0.5))
189 #store command line arguments in variables: 194 #store command line arguments in variables:
190 input_file <- argv[1] 195 input_file <- argv[1]
191 norm_type <- toupper(argv[2]) 196 norm_type <- toupper(argv[2])
192 norm_by <- toupper(argv[3]) 197 norm_by <- toupper(argv[3])
193 198
199 #input_file <- "/Users/ynewton/school/ucsc/projects/stuart_lab/data_normalization/test_matrix.tab"
200 #norm_type <- "MEAN_SHIFT"
201 #norm_by <- "ROW"
202 #normals_file <- "/Users/ynewton/school/ucsc/projects/stuart_lab/data_normalization/test_matrix2.tab"
203 #normals_file2 <- "/Users/ynewton/school/ucsc/projects/stuart_lab/data_normalization/normals.tab"
204
194 #read the input file(s): 205 #read the input file(s):
195 data_matrix <- read_matrix(input_file) 206 data_matrix <- read_matrix(input_file)
196 207
197 if(with_normals){ 208 if(with_normals){
198 normals_list <- read_normals(normals_file) 209 normals <- read_normals(normals_file)
199 normals_indices <- which(colnames(data_matrix) %in% normals_list) 210 if(length(colnames(normals)) == 1){
200 tumor_indices <- which(!(colnames(data_matrix) %in% normals_list)) 211 normals_indices <- which(colnames(data_matrix) %in% normals)
201 norm_by <- 'ROW' 212 tumor_indices <- which(!(colnames(data_matrix) %in% normals))
202 } 213 }else{
203 else{ 214 normals_numeric <- normals[2:length(normals[,1]),2:length(normals[1,])]
215 normals_numeric <- apply(normals_numeric, 2, as.numeric)
216 rownames(normals_numeric) <- normals[,1][2:length(normals[,1])]
217 colnames(normals_numeric) <- normals[1,][2:length(normals[1,])]
218
219 combined_matrix <- cbind(data_matrix, normals_numeric)
220 tumor_indices <- c(1:length(data_matrix[1,]))
221 normals_indices <- c(length(tumor_indices)+1:length(normals_numeric[1,]))
222 data_matrix <- combined_matrix
223 }
224 }else{
204 normals_indices <- c() 225 normals_indices <- c()
205 tumor_indices <- c() 226 tumor_indices <- c()
206 } 227 }
207 228
208 #if normalize by columns then transpose the matrix: 229 #if normalize by columns then transpose the matrix:
217 if(norm_by == 'COLUMN'){ 238 if(norm_by == 'COLUMN'){
218 data_matrix <- t(data_matrix) 239 data_matrix <- t(data_matrix)
219 } 240 }
220 241
221 write_matrix(data_matrix) 242 write_matrix(data_matrix)
222 #print(data_matrix)
223 } 243 }
224 244
225 main(commandArgs(TRUE)) 245 main(commandArgs(TRUE))