Mercurial > repos > ynewton > matrix_normalization
comparison normalize.r @ 0:31cfcab40d8f draft
Uploaded
| author | ynewton |
|---|---|
| date | Wed, 26 Sep 2012 17:32:30 -0400 |
| parents | |
| children | 710627b47962 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:31cfcab40d8f |
|---|---|
| 1 #!/usr/bin/Rscript | |
| 2 | |
| 3 #usage, options and doc goes here | |
| 4 argspec <- c("normalize.r - takes any flat file and normalizes the rows or the columns using various normalizations (median_shift, mean_shift, t_statistic (z-score), exp_fit, normal_fit, weibull_0.5_fit, weibull_1_fit, weibull_1.5_fit, weibull_5_fit). Requires a single header line and a single cloumn of annotation. | |
| 5 Usage: | |
| 6 normalize.r input.tab norm_type norm_by > output.tab | |
| 7 Example: | |
| 8 Rscript normalize.r test_matrix.tab median_shift column > output.tab | |
| 9 Rscript normalize.r test_matrix.tab mean_shift row normals.tab > output.tab | |
| 10 Options: | |
| 11 input matrix (annotated by row and column names) | |
| 12 normalization type; available options: | |
| 13 median_shift - shifts all values by the median or the row/column if no normals are specified, otherwise shifts by the median of normals | |
| 14 mean_shift - shifts all values by the mean or the row/column if no normals are specified, otherwise shifts by the mean of normals | |
| 15 t_statistic - converts all values to z-scores; if normals are specified then converts to z-scores within normal and non-normal classes separately | |
| 16 exp_fit - (only by column) ranks data and transforms exponential CDF | |
| 17 normal_fit - (only by column) ranks data and transforms normal CDF | |
| 18 weibull_0.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 0.5 | |
| 19 weibull_1_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1 | |
| 20 weibull_1.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1.5 | |
| 21 weibull_5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 5 | |
| 22 normalization by: | |
| 23 row | |
| 24 column | |
| 25 normals_file is an optional parameter which contains a list of column headers from the input matrix, which should be considered as normals | |
| 26 output file is specified through redirect character >") | |
| 27 | |
| 28 read_matrix <- function(in_file){ | |
| 29 header <- strsplit(readLines(con=in_file, n=1), "\t")[[1]] | |
| 30 cl.cols<- 1:length(header) > 1 | |
| 31 data_matrix.df <- read.delim(in_file, header=TRUE, row.names=NULL, stringsAsFactors=FALSE, na.strings="NA", check.names=FALSE) | |
| 32 data_matrix <- as.matrix(data_matrix.df[,cl.cols]) | |
| 33 rownames(data_matrix) <- data_matrix.df[,1] | |
| 34 return(data_matrix) | |
| 35 | |
| 36 #read_mtrx <- as.matrix(read.table(in_file, header=TRUE, sep="", row.names=NULL, stringsAsFactors=FALSE, na.strings="NA")) #separate on white characters | |
| 37 #read_mtrx[,1] | |
| 38 | |
| 39 #return(as.matrix(read.table(in_file, header=TRUE, sep="", row.names=1))) #separate on white characters | |
| 40 #mtrx <- read.delim(in_file, header=TRUE, sep="", row.names=NULL, stringsAsFactors=FALSE, na.strings="NA") | |
| 41 #print(mtrx[1,]) | |
| 42 } | |
| 43 | |
| 44 write_matrix <- function(data_matrix){ | |
| 45 header <- append(c("Genes"), colnames(data_matrix)) | |
| 46 write.table(t(header), stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE) | |
| 47 write.table(data_matrix, stdout(), quote=FALSE, sep="\t", row.names=TRUE, col.names=FALSE) | |
| 48 } | |
| 49 | |
| 50 read_normals <- function(in_file){ | |
| 51 return(as.matrix(read.table(in_file, header=FALSE, sep="", as.is = TRUE))[, 1]) | |
| 52 } | |
| 53 | |
| 54 normalize <- function(data_matrix, norm_type, normals_list, tumors_list){ | |
| 55 if(norm_type == 'MEDIAN_SHIFT'){ | |
| 56 return(shift(data_matrix, 'MEDIAN', normals_list, tumors_list)) | |
| 57 } | |
| 58 else if(norm_type == 'MEAN_SHIFT'){ | |
| 59 return(shift(data_matrix, 'MEAN', normals_list, tumors_list)) | |
| 60 } | |
| 61 else if(norm_type == 'T_STATISTIC'){ | |
| 62 return(compute_z_score(data_matrix, normals_list, tumors_list)) | |
| 63 } | |
| 64 else if(norm_type == 'EXPONENTIAL_FIT'){ | |
| 65 return(fit_distribution(data_matrix, 'EXPONENTIAL')) | |
| 66 } | |
| 67 else if(norm_type == 'NORMAL_FIT'){ | |
| 68 return(fit_distribution(data_matrix, 'NORMAL')) | |
| 69 } | |
| 70 else if(norm_type == 'WEIBULL_0.5_FIT'){ | |
| 71 return(fit_distribution(data_matrix, 'WEIBULL_0.5')) | |
| 72 } | |
| 73 else if(norm_type == 'WEIBULL_1_FIT'){ | |
| 74 return(fit_distribution(data_matrix, 'WEIBULL_1')) | |
| 75 } | |
| 76 else if(norm_type == 'WEIBULL_1.5_FIT'){ | |
| 77 return(fit_distribution(data_matrix, 'WEIBULL_1.5')) | |
| 78 } | |
| 79 else if(norm_type == 'WEIBULL_5_FIT'){ | |
| 80 return(fit_distribution(data_matrix, 'WEIBULL_5')) | |
| 81 } | |
| 82 } | |
| 83 | |
| 84 shift <- function(data_matrix, shift_type, normals_list, tumors_list){ | |
| 85 return(t(apply(data_matrix, 1, shift_normalize_row, norm_type=shift_type, normals_list=normals_list, tumors_list=tumors_list))) | |
| 86 } | |
| 87 | |
| 88 shift_normalize_row <- function(data_row, norm_type, normals_list, tumors_list){ | |
| 89 if(length(normals_list) == 0){ #no normals are specified | |
| 90 if(norm_type == 'MEDIAN'){ | |
| 91 row_stat <- median(data_row) | |
| 92 } | |
| 93 else if(norm_type == 'MEAN'){ | |
| 94 row_stat <- mean(data_row) | |
| 95 } | |
| 96 return(unlist(lapply(data_row, function(x){return(x - row_stat);}))) | |
| 97 } | |
| 98 else{ #normals are specified | |
| 99 normal_values <- data_row[normals_list] | |
| 100 tumor_columns <- data_row[tumors_list] | |
| 101 | |
| 102 if(norm_type == 'MEDIAN'){ | |
| 103 row_stat <- median(normal_values) | |
| 104 } | |
| 105 else if(norm_type == 'MEAN'){ | |
| 106 row_stat <- mean(normal_values) | |
| 107 } | |
| 108 return(unlist(lapply(tumor_columns, function(x){return(x - row_stat);}))) | |
| 109 } | |
| 110 } | |
| 111 | |
| 112 compute_z_score <- function(data_matrix, normals_list, tumors_list){ | |
| 113 return(t(apply(data_matrix, 1, t_stat_normalize_row, normals_list=normals_list, tumors_list=tumors_list))) | |
| 114 } | |
| 115 | |
| 116 t_stat_normalize_row <- function(data_row, normals_list, tumors_list){ | |
| 117 if(length(normals_list) == 0){ #no normals are specified | |
| 118 row_mean <- mean(data_row) | |
| 119 row_sd <- sd(data_row) | |
| 120 return(unlist(lapply(data_row, function(x){return((x - row_mean)/row_sd);}))) | |
| 121 } | |
| 122 else{ #normals are specified | |
| 123 normal_values <- data_row[normals_list] | |
| 124 normal_mean <- mean(normal_values) | |
| 125 normal_sd <- sd(normal_values) | |
| 126 normalized_normals <- unlist(lapply(normal_values, function(x){return((x - normal_mean)/normal_sd);})) | |
| 127 | |
| 128 tumor_values <- data_row[tumors_list] | |
| 129 tumor_mean <- mean(tumor_values) | |
| 130 tumor_sd <- sd(tumor_values) | |
| 131 normalized_tumors <- unlist(lapply(tumor_values, function(x){return((x - tumor_mean)/tumor_sd);})) | |
| 132 | |
| 133 return(append(normalized_normals, normalized_tumors)) | |
| 134 } | |
| 135 } | |
| 136 | |
| 137 rankNA <- function(col){ #originally written by Dan Carlin | |
| 138 col[!is.na(col)]<-(rank(col[!is.na(col)])/sum(!is.na(col)))-(1/sum(!is.na(col))) | |
| 139 return(col) | |
| 140 } | |
| 141 | |
| 142 fit_distribution <- function(data_matrix, dist){ | |
| 143 if(dist == 'EXPONENTIAL'){ | |
| 144 ranked_data_matrix <- apply(data_matrix,2,rankNA) #idea by Dan Carlin | |
| 145 return(apply(ranked_data_matrix, c(1,2), qexp)) | |
| 146 } | |
| 147 else if(dist == 'NORMAL'){ | |
| 148 ranked_data_matrix <- apply(data_matrix,2,rankNA) | |
| 149 #return(apply(ranked_data_matrix, c(1,2), function(x){return(qnorm(mean=mean(x), sd=sd(x)));})) | |
| 150 return(apply(ranked_data_matrix, c(1,2), qnorm, mean=0, sd=2)) | |
| 151 } | |
| 152 else if(dist == 'WEIBULL_0.5'){ | |
| 153 ranked_data_matrix <- apply(data_matrix,2,rankNA) | |
| 154 return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=0.5)) | |
| 155 } | |
| 156 else if(dist == 'WEIBULL_1'){ | |
| 157 ranked_data_matrix <- apply(data_matrix,2,rankNA) | |
| 158 return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=1)) | |
| 159 } | |
| 160 else if(dist == 'WEIBULL_1.5'){ | |
| 161 ranked_data_matrix <- apply(data_matrix,2,rankNA) | |
| 162 return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=1.5)) | |
| 163 } | |
| 164 else if(dist == 'WEIBULL_5'){ | |
| 165 ranked_data_matrix <- apply(data_matrix,2,rankNA) | |
| 166 return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=5)) | |
| 167 } | |
| 168 } | |
| 169 | |
| 170 main <- function(argv) { | |
| 171 #determine if correct number of arguments are specified and if normals are specified | |
| 172 with_normals = FALSE | |
| 173 | |
| 174 if(length(argv) == 1){ | |
| 175 if(argv==c('--help')){ | |
| 176 write(argspec, stderr()); | |
| 177 q(); | |
| 178 } | |
| 179 } | |
| 180 | |
| 181 if(!(length(argv) == 3 || length(argv) == 4)){ | |
| 182 write("ERROR: invalid number of arguments is specified", stderr()); | |
| 183 q(); | |
| 184 } | |
| 185 | |
| 186 if(length(argv) == 4){ | |
| 187 with_normals = TRUE | |
| 188 normals_file <- argv[4] | |
| 189 } | |
| 190 | |
| 191 #store command line arguments in variables: | |
| 192 input_file <- argv[1] | |
| 193 norm_type <- toupper(argv[2]) | |
| 194 norm_by <- toupper(argv[3]) | |
| 195 | |
| 196 #read the input file(s): | |
| 197 data_matrix <- read_matrix(input_file) | |
| 198 | |
| 199 if(with_normals){ | |
| 200 normals_list <- read_normals(normals_file) | |
| 201 normals_indices <- which(colnames(data_matrix) %in% normals_list) | |
| 202 tumor_indices <- which(!(colnames(data_matrix) %in% normals_list)) | |
| 203 norm_by <- 'ROW' | |
| 204 } | |
| 205 else{ | |
| 206 normals_indices <- c() | |
| 207 tumor_indices <- c() | |
| 208 } | |
| 209 | |
| 210 #if normalize by columns then transpose the matrix: | |
| 211 if(norm_by == 'COLUMN'){ | |
| 212 data_matrix <- t(data_matrix) | |
| 213 } | |
| 214 | |
| 215 #normalize: | |
| 216 data_matrix <- normalize(data_matrix, norm_type, normals_indices, tumor_indices) | |
| 217 | |
| 218 #if normalize by columns then transpose the matrix again since we normalized the transposed matrix by row: | |
| 219 if(norm_by == 'COLUMN'){ | |
| 220 data_matrix <- t(data_matrix) | |
| 221 } | |
| 222 | |
| 223 write_matrix(data_matrix) | |
| 224 #print(data_matrix) | |
| 225 } | |
| 226 | |
| 227 main(commandArgs(TRUE)) |
