Mercurial > repos > ynewton > matrix_normalization
changeset 9:600872152be6 draft
Uploaded
author | ynewton |
---|---|
date | Sat, 20 Oct 2012 02:28:37 -0400 |
parents | 277a79e23357 |
children | 0c4dcd3980c1 |
files | normalize.r |
diffstat | 1 files changed, 35 insertions(+), 15 deletions(-) [+] |
line wrap: on
line diff
--- a/normalize.r Wed Oct 03 10:10:47 2012 -0400 +++ b/normalize.r Sat Oct 20 02:28:37 2012 -0400 @@ -13,7 +13,7 @@ median_shift - shifts all values by the median or the row/column if no normals are specified, otherwise shifts by the median of normals mean_shift - shifts all values by the mean or the row/column if no normals are specified, otherwise shifts by the mean of normals t_statistic - converts all values to z-scores; if normals are specified then converts to z-scores within normal and non-normal classes separately - exp_fit - (only by column) ranks data and transforms exponential CDF + exponential_fit - (only by column) ranks data and transforms exponential CDF normal_fit - (only by column) ranks data and transforms normal CDF weibull_0.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 0.5 weibull_1_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1 @@ -22,7 +22,7 @@ normalization by: row column - normals_file is an optional parameter which contains a list of column headers from the input matrix, which should be considered as normals + normals_file is an optional parameter which contains either a list of column headers from the input matrix, which should be considered as normals, or a matrix of normal samples output file is specified through redirect character >") read_matrix <- function(in_file){ @@ -48,7 +48,8 @@ } read_normals <- function(in_file){ - return(as.matrix(read.table(in_file, header=FALSE, sep="", as.is = TRUE))[, 1]) + #return(as.matrix(read.table(in_file, header=FALSE, sep="", as.is = TRUE))[, 1]) + return(as.matrix(read.table(in_file, header=FALSE, sep="", as.is = TRUE))) } normalize <- function(data_matrix, norm_type, normals_list, tumors_list){ @@ -78,7 +79,10 @@ } else if(norm_type == 'WEIBULL_5_FIT'){ return(fit_distribution(data_matrix, 'WEIBULL_5')) - } + }else{ + write("ERROR: unknown normalization type", stderr()); + q(); + } } shift <- function(data_matrix, shift_type, normals_list, tumors_list){ @@ -98,7 +102,7 @@ else{ #normals are specified normal_values <- data_row[normals_list] tumor_columns <- data_row[tumors_list] - + if(norm_type == 'MEDIAN'){ row_stat <- median(normal_values) } @@ -139,12 +143,13 @@ fit_distribution <- function(data_matrix, dist){ if(dist == 'EXPONENTIAL'){ - ranked_data_matrix <- apply(data_matrix,2,rankNA) #idea by Dan Carlin - return(apply(ranked_data_matrix, c(1,2), qexp)) + ranked_data_matrix <- apply(data_matrix,1,rankNA) #idea by Dan Carlin + #write.table(c("ranked data:"), stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE) + #write.table(ranked_data_matrix, stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE) + return(apply(ranked_data_matrix, 1, qexp)) } else if(dist == 'NORMAL'){ ranked_data_matrix <- apply(data_matrix,2,rankNA) - #return(apply(ranked_data_matrix, c(1,2), function(x){return(qnorm(mean=mean(x), sd=sd(x)));})) return(apply(ranked_data_matrix, c(1,2), qnorm, mean=0, sd=1)) } else if(dist == 'WEIBULL_0.5'){ @@ -191,16 +196,32 @@ norm_type <- toupper(argv[2]) norm_by <- toupper(argv[3]) + #input_file <- "/Users/ynewton/school/ucsc/projects/stuart_lab/data_normalization/test_matrix.tab" + #norm_type <- "MEAN_SHIFT" + #norm_by <- "ROW" + #normals_file <- "/Users/ynewton/school/ucsc/projects/stuart_lab/data_normalization/test_matrix2.tab" + #normals_file2 <- "/Users/ynewton/school/ucsc/projects/stuart_lab/data_normalization/normals.tab" + #read the input file(s): data_matrix <- read_matrix(input_file) if(with_normals){ - normals_list <- read_normals(normals_file) - normals_indices <- which(colnames(data_matrix) %in% normals_list) - tumor_indices <- which(!(colnames(data_matrix) %in% normals_list)) - norm_by <- 'ROW' - } - else{ + normals <- read_normals(normals_file) + if(length(colnames(normals)) == 1){ + normals_indices <- which(colnames(data_matrix) %in% normals) + tumor_indices <- which(!(colnames(data_matrix) %in% normals)) + }else{ + normals_numeric <- normals[2:length(normals[,1]),2:length(normals[1,])] + normals_numeric <- apply(normals_numeric, 2, as.numeric) + rownames(normals_numeric) <- normals[,1][2:length(normals[,1])] + colnames(normals_numeric) <- normals[1,][2:length(normals[1,])] + + combined_matrix <- cbind(data_matrix, normals_numeric) + tumor_indices <- c(1:length(data_matrix[1,])) + normals_indices <- c(length(tumor_indices)+1:length(normals_numeric[1,])) + data_matrix <- combined_matrix + } + }else{ normals_indices <- c() tumor_indices <- c() } @@ -219,7 +240,6 @@ } write_matrix(data_matrix) - #print(data_matrix) } main(commandArgs(TRUE))