Mercurial > repos > ynewton > matrix_normalization
comparison normalize.r @ 9:600872152be6 draft
Uploaded
author | ynewton |
---|---|
date | Sat, 20 Oct 2012 02:28:37 -0400 |
parents | 710627b47962 |
children |
comparison
equal
deleted
inserted
replaced
8:277a79e23357 | 9:600872152be6 |
---|---|
11 input matrix (annotated by row and column names) | 11 input matrix (annotated by row and column names) |
12 normalization type; available options: | 12 normalization type; available options: |
13 median_shift - shifts all values by the median or the row/column if no normals are specified, otherwise shifts by the median of normals | 13 median_shift - shifts all values by the median or the row/column if no normals are specified, otherwise shifts by the median of normals |
14 mean_shift - shifts all values by the mean or the row/column if no normals are specified, otherwise shifts by the mean of normals | 14 mean_shift - shifts all values by the mean or the row/column if no normals are specified, otherwise shifts by the mean of normals |
15 t_statistic - converts all values to z-scores; if normals are specified then converts to z-scores within normal and non-normal classes separately | 15 t_statistic - converts all values to z-scores; if normals are specified then converts to z-scores within normal and non-normal classes separately |
16 exp_fit - (only by column) ranks data and transforms exponential CDF | 16 exponential_fit - (only by column) ranks data and transforms exponential CDF |
17 normal_fit - (only by column) ranks data and transforms normal CDF | 17 normal_fit - (only by column) ranks data and transforms normal CDF |
18 weibull_0.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 0.5 | 18 weibull_0.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 0.5 |
19 weibull_1_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1 | 19 weibull_1_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1 |
20 weibull_1.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1.5 | 20 weibull_1.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1.5 |
21 weibull_5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 5 | 21 weibull_5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 5 |
22 normalization by: | 22 normalization by: |
23 row | 23 row |
24 column | 24 column |
25 normals_file is an optional parameter which contains a list of column headers from the input matrix, which should be considered as normals | 25 normals_file is an optional parameter which contains either a list of column headers from the input matrix, which should be considered as normals, or a matrix of normal samples |
26 output file is specified through redirect character >") | 26 output file is specified through redirect character >") |
27 | 27 |
28 read_matrix <- function(in_file){ | 28 read_matrix <- function(in_file){ |
29 header <- strsplit(readLines(con=in_file, n=1), "\t")[[1]] | 29 header <- strsplit(readLines(con=in_file, n=1), "\t")[[1]] |
30 cl.cols<- 1:length(header) > 1 | 30 cl.cols<- 1:length(header) > 1 |
46 write.table(t(header), stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE) | 46 write.table(t(header), stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE) |
47 write.table(data_matrix, stdout(), quote=FALSE, sep="\t", row.names=TRUE, col.names=FALSE) | 47 write.table(data_matrix, stdout(), quote=FALSE, sep="\t", row.names=TRUE, col.names=FALSE) |
48 } | 48 } |
49 | 49 |
50 read_normals <- function(in_file){ | 50 read_normals <- function(in_file){ |
51 return(as.matrix(read.table(in_file, header=FALSE, sep="", as.is = TRUE))[, 1]) | 51 #return(as.matrix(read.table(in_file, header=FALSE, sep="", as.is = TRUE))[, 1]) |
52 return(as.matrix(read.table(in_file, header=FALSE, sep="", as.is = TRUE))) | |
52 } | 53 } |
53 | 54 |
54 normalize <- function(data_matrix, norm_type, normals_list, tumors_list){ | 55 normalize <- function(data_matrix, norm_type, normals_list, tumors_list){ |
55 if(norm_type == 'MEDIAN_SHIFT'){ | 56 if(norm_type == 'MEDIAN_SHIFT'){ |
56 return(shift(data_matrix, 'MEDIAN', normals_list, tumors_list)) | 57 return(shift(data_matrix, 'MEDIAN', normals_list, tumors_list)) |
76 else if(norm_type == 'WEIBULL_1.5_FIT'){ | 77 else if(norm_type == 'WEIBULL_1.5_FIT'){ |
77 return(fit_distribution(data_matrix, 'WEIBULL_1.5')) | 78 return(fit_distribution(data_matrix, 'WEIBULL_1.5')) |
78 } | 79 } |
79 else if(norm_type == 'WEIBULL_5_FIT'){ | 80 else if(norm_type == 'WEIBULL_5_FIT'){ |
80 return(fit_distribution(data_matrix, 'WEIBULL_5')) | 81 return(fit_distribution(data_matrix, 'WEIBULL_5')) |
81 } | 82 }else{ |
83 write("ERROR: unknown normalization type", stderr()); | |
84 q(); | |
85 } | |
82 } | 86 } |
83 | 87 |
84 shift <- function(data_matrix, shift_type, normals_list, tumors_list){ | 88 shift <- function(data_matrix, shift_type, normals_list, tumors_list){ |
85 return(t(apply(data_matrix, 1, shift_normalize_row, norm_type=shift_type, normals_list=normals_list, tumors_list=tumors_list))) | 89 return(t(apply(data_matrix, 1, shift_normalize_row, norm_type=shift_type, normals_list=normals_list, tumors_list=tumors_list))) |
86 } | 90 } |
96 return(unlist(lapply(data_row, function(x){return(x - row_stat);}))) | 100 return(unlist(lapply(data_row, function(x){return(x - row_stat);}))) |
97 } | 101 } |
98 else{ #normals are specified | 102 else{ #normals are specified |
99 normal_values <- data_row[normals_list] | 103 normal_values <- data_row[normals_list] |
100 tumor_columns <- data_row[tumors_list] | 104 tumor_columns <- data_row[tumors_list] |
101 | 105 |
102 if(norm_type == 'MEDIAN'){ | 106 if(norm_type == 'MEDIAN'){ |
103 row_stat <- median(normal_values) | 107 row_stat <- median(normal_values) |
104 } | 108 } |
105 else if(norm_type == 'MEAN'){ | 109 else if(norm_type == 'MEAN'){ |
106 row_stat <- mean(normal_values) | 110 row_stat <- mean(normal_values) |
137 return(col) | 141 return(col) |
138 } | 142 } |
139 | 143 |
140 fit_distribution <- function(data_matrix, dist){ | 144 fit_distribution <- function(data_matrix, dist){ |
141 if(dist == 'EXPONENTIAL'){ | 145 if(dist == 'EXPONENTIAL'){ |
142 ranked_data_matrix <- apply(data_matrix,2,rankNA) #idea by Dan Carlin | 146 ranked_data_matrix <- apply(data_matrix,1,rankNA) #idea by Dan Carlin |
143 return(apply(ranked_data_matrix, c(1,2), qexp)) | 147 #write.table(c("ranked data:"), stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE) |
148 #write.table(ranked_data_matrix, stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE) | |
149 return(apply(ranked_data_matrix, 1, qexp)) | |
144 } | 150 } |
145 else if(dist == 'NORMAL'){ | 151 else if(dist == 'NORMAL'){ |
146 ranked_data_matrix <- apply(data_matrix,2,rankNA) | 152 ranked_data_matrix <- apply(data_matrix,2,rankNA) |
147 #return(apply(ranked_data_matrix, c(1,2), function(x){return(qnorm(mean=mean(x), sd=sd(x)));})) | |
148 return(apply(ranked_data_matrix, c(1,2), qnorm, mean=0, sd=1)) | 153 return(apply(ranked_data_matrix, c(1,2), qnorm, mean=0, sd=1)) |
149 } | 154 } |
150 else if(dist == 'WEIBULL_0.5'){ | 155 else if(dist == 'WEIBULL_0.5'){ |
151 ranked_data_matrix <- apply(data_matrix,2,rankNA) | 156 ranked_data_matrix <- apply(data_matrix,2,rankNA) |
152 return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=0.5)) | 157 return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=0.5)) |
189 #store command line arguments in variables: | 194 #store command line arguments in variables: |
190 input_file <- argv[1] | 195 input_file <- argv[1] |
191 norm_type <- toupper(argv[2]) | 196 norm_type <- toupper(argv[2]) |
192 norm_by <- toupper(argv[3]) | 197 norm_by <- toupper(argv[3]) |
193 | 198 |
199 #input_file <- "/Users/ynewton/school/ucsc/projects/stuart_lab/data_normalization/test_matrix.tab" | |
200 #norm_type <- "MEAN_SHIFT" | |
201 #norm_by <- "ROW" | |
202 #normals_file <- "/Users/ynewton/school/ucsc/projects/stuart_lab/data_normalization/test_matrix2.tab" | |
203 #normals_file2 <- "/Users/ynewton/school/ucsc/projects/stuart_lab/data_normalization/normals.tab" | |
204 | |
194 #read the input file(s): | 205 #read the input file(s): |
195 data_matrix <- read_matrix(input_file) | 206 data_matrix <- read_matrix(input_file) |
196 | 207 |
197 if(with_normals){ | 208 if(with_normals){ |
198 normals_list <- read_normals(normals_file) | 209 normals <- read_normals(normals_file) |
199 normals_indices <- which(colnames(data_matrix) %in% normals_list) | 210 if(length(colnames(normals)) == 1){ |
200 tumor_indices <- which(!(colnames(data_matrix) %in% normals_list)) | 211 normals_indices <- which(colnames(data_matrix) %in% normals) |
201 norm_by <- 'ROW' | 212 tumor_indices <- which(!(colnames(data_matrix) %in% normals)) |
202 } | 213 }else{ |
203 else{ | 214 normals_numeric <- normals[2:length(normals[,1]),2:length(normals[1,])] |
215 normals_numeric <- apply(normals_numeric, 2, as.numeric) | |
216 rownames(normals_numeric) <- normals[,1][2:length(normals[,1])] | |
217 colnames(normals_numeric) <- normals[1,][2:length(normals[1,])] | |
218 | |
219 combined_matrix <- cbind(data_matrix, normals_numeric) | |
220 tumor_indices <- c(1:length(data_matrix[1,])) | |
221 normals_indices <- c(length(tumor_indices)+1:length(normals_numeric[1,])) | |
222 data_matrix <- combined_matrix | |
223 } | |
224 }else{ | |
204 normals_indices <- c() | 225 normals_indices <- c() |
205 tumor_indices <- c() | 226 tumor_indices <- c() |
206 } | 227 } |
207 | 228 |
208 #if normalize by columns then transpose the matrix: | 229 #if normalize by columns then transpose the matrix: |
217 if(norm_by == 'COLUMN'){ | 238 if(norm_by == 'COLUMN'){ |
218 data_matrix <- t(data_matrix) | 239 data_matrix <- t(data_matrix) |
219 } | 240 } |
220 | 241 |
221 write_matrix(data_matrix) | 242 write_matrix(data_matrix) |
222 #print(data_matrix) | |
223 } | 243 } |
224 | 244 |
225 main(commandArgs(TRUE)) | 245 main(commandArgs(TRUE)) |