Mercurial > repos > ynewton > matrix_normalization
diff normalize.r @ 0:31cfcab40d8f draft
Uploaded
| author | ynewton |
|---|---|
| date | Wed, 26 Sep 2012 17:32:30 -0400 |
| parents | |
| children | 710627b47962 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/normalize.r Wed Sep 26 17:32:30 2012 -0400 @@ -0,0 +1,227 @@ +#!/usr/bin/Rscript + +#usage, options and doc goes here +argspec <- c("normalize.r - takes any flat file and normalizes the rows or the columns using various normalizations (median_shift, mean_shift, t_statistic (z-score), exp_fit, normal_fit, weibull_0.5_fit, weibull_1_fit, weibull_1.5_fit, weibull_5_fit). Requires a single header line and a single cloumn of annotation. +Usage: + normalize.r input.tab norm_type norm_by > output.tab +Example: + Rscript normalize.r test_matrix.tab median_shift column > output.tab + Rscript normalize.r test_matrix.tab mean_shift row normals.tab > output.tab +Options: + input matrix (annotated by row and column names) + normalization type; available options: + median_shift - shifts all values by the median or the row/column if no normals are specified, otherwise shifts by the median of normals + mean_shift - shifts all values by the mean or the row/column if no normals are specified, otherwise shifts by the mean of normals + t_statistic - converts all values to z-scores; if normals are specified then converts to z-scores within normal and non-normal classes separately + exp_fit - (only by column) ranks data and transforms exponential CDF + normal_fit - (only by column) ranks data and transforms normal CDF + weibull_0.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 0.5 + weibull_1_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1 + weibull_1.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1.5 + weibull_5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 5 + normalization by: + row + column + normals_file is an optional parameter which contains a list of column headers from the input matrix, which should be considered as normals + output file is specified through redirect character >") + +read_matrix <- function(in_file){ + header <- strsplit(readLines(con=in_file, n=1), "\t")[[1]] + cl.cols<- 1:length(header) > 1 + data_matrix.df <- read.delim(in_file, header=TRUE, row.names=NULL, stringsAsFactors=FALSE, na.strings="NA", check.names=FALSE) + data_matrix <- as.matrix(data_matrix.df[,cl.cols]) + rownames(data_matrix) <- data_matrix.df[,1] + return(data_matrix) + + #read_mtrx <- as.matrix(read.table(in_file, header=TRUE, sep="", row.names=NULL, stringsAsFactors=FALSE, na.strings="NA")) #separate on white characters + #read_mtrx[,1] + + #return(as.matrix(read.table(in_file, header=TRUE, sep="", row.names=1))) #separate on white characters + #mtrx <- read.delim(in_file, header=TRUE, sep="", row.names=NULL, stringsAsFactors=FALSE, na.strings="NA") + #print(mtrx[1,]) +} + +write_matrix <- function(data_matrix){ + header <- append(c("Genes"), colnames(data_matrix)) + write.table(t(header), stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE) + write.table(data_matrix, stdout(), quote=FALSE, sep="\t", row.names=TRUE, col.names=FALSE) +} + +read_normals <- function(in_file){ + return(as.matrix(read.table(in_file, header=FALSE, sep="", as.is = TRUE))[, 1]) +} + +normalize <- function(data_matrix, norm_type, normals_list, tumors_list){ + if(norm_type == 'MEDIAN_SHIFT'){ + return(shift(data_matrix, 'MEDIAN', normals_list, tumors_list)) + } + else if(norm_type == 'MEAN_SHIFT'){ + return(shift(data_matrix, 'MEAN', normals_list, tumors_list)) + } + else if(norm_type == 'T_STATISTIC'){ + return(compute_z_score(data_matrix, normals_list, tumors_list)) + } + else if(norm_type == 'EXPONENTIAL_FIT'){ + return(fit_distribution(data_matrix, 'EXPONENTIAL')) + } + else if(norm_type == 'NORMAL_FIT'){ + return(fit_distribution(data_matrix, 'NORMAL')) + } + else if(norm_type == 'WEIBULL_0.5_FIT'){ + return(fit_distribution(data_matrix, 'WEIBULL_0.5')) + } + else if(norm_type == 'WEIBULL_1_FIT'){ + return(fit_distribution(data_matrix, 'WEIBULL_1')) + } + else if(norm_type == 'WEIBULL_1.5_FIT'){ + return(fit_distribution(data_matrix, 'WEIBULL_1.5')) + } + else if(norm_type == 'WEIBULL_5_FIT'){ + return(fit_distribution(data_matrix, 'WEIBULL_5')) + } +} + +shift <- function(data_matrix, shift_type, normals_list, tumors_list){ + return(t(apply(data_matrix, 1, shift_normalize_row, norm_type=shift_type, normals_list=normals_list, tumors_list=tumors_list))) +} + +shift_normalize_row <- function(data_row, norm_type, normals_list, tumors_list){ + if(length(normals_list) == 0){ #no normals are specified + if(norm_type == 'MEDIAN'){ + row_stat <- median(data_row) + } + else if(norm_type == 'MEAN'){ + row_stat <- mean(data_row) + } + return(unlist(lapply(data_row, function(x){return(x - row_stat);}))) + } + else{ #normals are specified + normal_values <- data_row[normals_list] + tumor_columns <- data_row[tumors_list] + + if(norm_type == 'MEDIAN'){ + row_stat <- median(normal_values) + } + else if(norm_type == 'MEAN'){ + row_stat <- mean(normal_values) + } + return(unlist(lapply(tumor_columns, function(x){return(x - row_stat);}))) + } +} + +compute_z_score <- function(data_matrix, normals_list, tumors_list){ + return(t(apply(data_matrix, 1, t_stat_normalize_row, normals_list=normals_list, tumors_list=tumors_list))) +} + +t_stat_normalize_row <- function(data_row, normals_list, tumors_list){ + if(length(normals_list) == 0){ #no normals are specified + row_mean <- mean(data_row) + row_sd <- sd(data_row) + return(unlist(lapply(data_row, function(x){return((x - row_mean)/row_sd);}))) + } + else{ #normals are specified + normal_values <- data_row[normals_list] + normal_mean <- mean(normal_values) + normal_sd <- sd(normal_values) + normalized_normals <- unlist(lapply(normal_values, function(x){return((x - normal_mean)/normal_sd);})) + + tumor_values <- data_row[tumors_list] + tumor_mean <- mean(tumor_values) + tumor_sd <- sd(tumor_values) + normalized_tumors <- unlist(lapply(tumor_values, function(x){return((x - tumor_mean)/tumor_sd);})) + + return(append(normalized_normals, normalized_tumors)) + } +} + +rankNA <- function(col){ #originally written by Dan Carlin + col[!is.na(col)]<-(rank(col[!is.na(col)])/sum(!is.na(col)))-(1/sum(!is.na(col))) + return(col) +} + +fit_distribution <- function(data_matrix, dist){ + if(dist == 'EXPONENTIAL'){ + ranked_data_matrix <- apply(data_matrix,2,rankNA) #idea by Dan Carlin + return(apply(ranked_data_matrix, c(1,2), qexp)) + } + else if(dist == 'NORMAL'){ + ranked_data_matrix <- apply(data_matrix,2,rankNA) + #return(apply(ranked_data_matrix, c(1,2), function(x){return(qnorm(mean=mean(x), sd=sd(x)));})) + return(apply(ranked_data_matrix, c(1,2), qnorm, mean=0, sd=2)) + } + else if(dist == 'WEIBULL_0.5'){ + ranked_data_matrix <- apply(data_matrix,2,rankNA) + return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=0.5)) + } + else if(dist == 'WEIBULL_1'){ + ranked_data_matrix <- apply(data_matrix,2,rankNA) + return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=1)) + } + else if(dist == 'WEIBULL_1.5'){ + ranked_data_matrix <- apply(data_matrix,2,rankNA) + return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=1.5)) + } + else if(dist == 'WEIBULL_5'){ + ranked_data_matrix <- apply(data_matrix,2,rankNA) + return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=5)) + } +} + +main <- function(argv) { + #determine if correct number of arguments are specified and if normals are specified + with_normals = FALSE + + if(length(argv) == 1){ + if(argv==c('--help')){ + write(argspec, stderr()); + q(); + } + } + + if(!(length(argv) == 3 || length(argv) == 4)){ + write("ERROR: invalid number of arguments is specified", stderr()); + q(); + } + + if(length(argv) == 4){ + with_normals = TRUE + normals_file <- argv[4] + } + + #store command line arguments in variables: + input_file <- argv[1] + norm_type <- toupper(argv[2]) + norm_by <- toupper(argv[3]) + + #read the input file(s): + data_matrix <- read_matrix(input_file) + + if(with_normals){ + normals_list <- read_normals(normals_file) + normals_indices <- which(colnames(data_matrix) %in% normals_list) + tumor_indices <- which(!(colnames(data_matrix) %in% normals_list)) + norm_by <- 'ROW' + } + else{ + normals_indices <- c() + tumor_indices <- c() + } + + #if normalize by columns then transpose the matrix: + if(norm_by == 'COLUMN'){ + data_matrix <- t(data_matrix) + } + + #normalize: + data_matrix <- normalize(data_matrix, norm_type, normals_indices, tumor_indices) + + #if normalize by columns then transpose the matrix again since we normalized the transposed matrix by row: + if(norm_by == 'COLUMN'){ + data_matrix <- t(data_matrix) + } + + write_matrix(data_matrix) + #print(data_matrix) +} + +main(commandArgs(TRUE))
