matrix_normalization: normalize.r comparison

comparison normalize.r @ 9:600872152be6 draft

Uploaded

author	ynewton
date	Sat, 20 Oct 2012 02:28:37 -0400
parents	710627b47962
children

comparison

equal deleted inserted replaced

-:277a79e23357
+:600872152be6
 	input matrix (annotated by row and column names)
 	normalization type; available options:
 		median_shift - shifts all values by the median or the row/column if no normals are specified, otherwise shifts by the median of normals
 		mean_shift - shifts all values by the mean or the row/column if no normals are specified, otherwise shifts by the mean of normals
 		t_statistic - converts all values to z-scores; if normals are specified then converts to z-scores within normal and non-normal classes separately
-		exp_fit - (only by column) ranks data and transforms exponential CDF
+		exponential_fit - (only by column) ranks data and transforms exponential CDF
 		normal_fit - (only by column) ranks data and transforms normal CDF
 		weibull_0.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 0.5
 		weibull_1_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1
 		weibull_1.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1.5
 		weibull_5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 5
 	normalization by:
 		row
 		column
-	normals_file is an optional parameter which contains a list of column headers from the input matrix, which should be considered as normals
+	normals_file is an optional parameter which contains either a list of column headers from the input matrix, which should be considered as normals, or a matrix of normal samples
 	output file is specified through redirect character >")
 read_matrix <- function(in_file){
 	header <- strsplit(readLines(con=in_file, n=1), "\t")[[1]]
 	cl.cols<- 1:length(header) > 1
 	write.table(t(header), stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)
 	write.table(data_matrix, stdout(), quote=FALSE, sep="\t", row.names=TRUE, col.names=FALSE)
 }
 read_normals <- function(in_file){
-	return(as.matrix(read.table(in_file, header=FALSE, sep="", as.is = TRUE))[, 1])
+	#return(as.matrix(read.table(in_file, header=FALSE, sep="", as.is = TRUE))[, 1])
+	return(as.matrix(read.table(in_file, header=FALSE, sep="", as.is = TRUE)))
 }
 normalize <- function(data_matrix, norm_type, normals_list, tumors_list){
 	if(norm_type == 'MEDIAN_SHIFT'){
 		return(shift(data_matrix, 'MEDIAN', normals_list, tumors_list))
 	else if(norm_type == 'WEIBULL_1.5_FIT'){
 		return(fit_distribution(data_matrix, 'WEIBULL_1.5'))
 	}
 	else if(norm_type == 'WEIBULL_5_FIT'){
 		return(fit_distribution(data_matrix, 'WEIBULL_5'))
-	}
+	}else{
+		write("ERROR: unknown normalization type", stderr());
+		q();
+	}
 }
 # Shift-normalizes a matrix one row at a time by delegating to shift_normalize_row.
 #   data_matrix  - matrix to normalize; apply() walks it by row (MARGIN = 1)
 #   shift_type   - forwarded to shift_normalize_row as its norm_type argument
 #                  (the visible caller passes 'MEDIAN'; shift_normalize_row also tests for 'MEAN')
 #   normals_list - forwarded unchanged to shift_normalize_row
 #   tumors_list  - forwarded unchanged to shift_normalize_row
 # Returns the transpose of what apply() produces.
 shift <- function(data_matrix, shift_type, normals_list, tumors_list){
 	# apply() over rows lays each per-row result out as a column, so t() turns rows back into rows
 	# (assumes shift_normalize_row returns one value per element of the row - TODO confirm)
 	return(t(apply(data_matrix, 1, shift_normalize_row, norm_type=shift_type, normals_list=normals_list, tumors_list=tumors_list)))
 }
 		return(unlist(lapply(data_row, function(x){return(x - row_stat);})))
 	}
 	else{	#normals are specified
 		normal_values <- data_row[normals_list]
 		tumor_columns <- data_row[tumors_list]
 		if(norm_type == 'MEDIAN'){
 			row_stat <- median(normal_values)
 		}
 		else if(norm_type == 'MEAN'){
 			row_stat <- mean(normal_values)
 	return(col)
 }
 fit_distribution <- function(data_matrix, dist){
 	if(dist == 'EXPONENTIAL'){
-		ranked_data_matrix <- apply(data_matrix,2,rankNA)	#idea by Dan Carlin
+		ranked_data_matrix <- apply(data_matrix,1,rankNA)	#idea by Dan Carlin
-		return(apply(ranked_data_matrix, c(1,2), qexp))
+		#write.table(c("ranked data:"), stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)
+		#write.table(ranked_data_matrix, stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)
+		return(apply(ranked_data_matrix, 1, qexp))
 	}
 	else if(dist == 'NORMAL'){
 		ranked_data_matrix <- apply(data_matrix,2,rankNA)
-		#return(apply(ranked_data_matrix, c(1,2), function(x){return(qnorm(mean=mean(x), sd=sd(x)));}))
 		return(apply(ranked_data_matrix, c(1,2), qnorm, mean=0, sd=1))
 	}
 	else if(dist == 'WEIBULL_0.5'){
 		ranked_data_matrix <- apply(data_matrix,2,rankNA)
 		return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=0.5))
 	#store command line arguments in variables:
 	input_file <- argv[1]
 	norm_type <- toupper(argv[2])
 	norm_by <- toupper(argv[3])
+	#input_file <- "/Users/ynewton/school/ucsc/projects/stuart_lab/data_normalization/test_matrix.tab"
+	#norm_type <- "MEAN_SHIFT"
+	#norm_by <- "ROW"
+	#normals_file <- "/Users/ynewton/school/ucsc/projects/stuart_lab/data_normalization/test_matrix2.tab"
+	#normals_file2 <- "/Users/ynewton/school/ucsc/projects/stuart_lab/data_normalization/normals.tab"
 	#read the input file(s):
 	data_matrix <- read_matrix(input_file)
 	if(with_normals){
-		normals_list <- read_normals(normals_file)
+		normals <- read_normals(normals_file)
-		normals_indices <- which(colnames(data_matrix) %in% normals_list)
+		if(length(colnames(normals)) == 1){
-		tumor_indices <- which(!(colnames(data_matrix) %in% normals_list))
+			normals_indices <- which(colnames(data_matrix) %in% normals)
-		norm_by <- 'ROW'
+			tumor_indices <- which(!(colnames(data_matrix) %in% normals))
-	}
+		}else{
-	else{
+			normals_numeric <- normals[2:length(normals[,1]),2:length(normals[1,])]
+			normals_numeric <- apply(normals_numeric, 2, as.numeric)
+			rownames(normals_numeric) <- normals[,1][2:length(normals[,1])]
+			colnames(normals_numeric) <- normals[1,][2:length(normals[1,])]
+			combined_matrix <- cbind(data_matrix, normals_numeric)
+			tumor_indices <- c(1:length(data_matrix[1,]))
+			normals_indices <- c(length(tumor_indices)+1:length(normals_numeric[1,]))
+			data_matrix <- combined_matrix
+		}
+	}else{
 		normals_indices <- c()
 		tumor_indices <- c()
 	}
 	#if normalize by columns then transpose the matrix:
 	if(norm_by == 'COLUMN'){
 		data_matrix <- t(data_matrix)
 	}
 	write_matrix(data_matrix)
-	#print(data_matrix)
 }
 main(commandArgs(TRUE))

Mercurial > repos > ynewton > matrix_normalization

comparison normalize.r @ 9:600872152be6 draft