# HG changeset patch # User ynewton # Date 1348695150 14400 # Node ID 31cfcab40d8fe76387efcc74b90a360a7cf746dd Uploaded diff -r 000000000000 -r 31cfcab40d8f ._normalize.r Binary file ._normalize.r has changed diff -r 000000000000 -r 31cfcab40d8f ._normalize.xml Binary file ._normalize.xml has changed diff -r 000000000000 -r 31cfcab40d8f normalize.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/normalize.r Wed Sep 26 17:32:30 2012 -0400 @@ -0,0 +1,227 @@ +#!/usr/bin/Rscript + +#usage, options and doc goes here +argspec <- c("normalize.r - takes any flat file and normalizes the rows or the columns using various normalizations (median_shift, mean_shift, t_statistic (z-score), exp_fit, normal_fit, weibull_0.5_fit, weibull_1_fit, weibull_1.5_fit, weibull_5_fit). Requires a single header line and a single cloumn of annotation. +Usage: + normalize.r input.tab norm_type norm_by > output.tab +Example: + Rscript normalize.r test_matrix.tab median_shift column > output.tab + Rscript normalize.r test_matrix.tab mean_shift row normals.tab > output.tab +Options: + input matrix (annotated by row and column names) + normalization type; available options: + median_shift - shifts all values by the median or the row/column if no normals are specified, otherwise shifts by the median of normals + mean_shift - shifts all values by the mean or the row/column if no normals are specified, otherwise shifts by the mean of normals + t_statistic - converts all values to z-scores; if normals are specified then converts to z-scores within normal and non-normal classes separately + exp_fit - (only by column) ranks data and transforms exponential CDF + normal_fit - (only by column) ranks data and transforms normal CDF + weibull_0.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 0.5 + weibull_1_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1 + weibull_1.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1.5 + weibull_5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 5 + normalization by: + row + column + normals_file is an optional parameter which contains a list of column headers from the input matrix, which should be considered as normals + output file is specified through redirect character >") + +read_matrix <- function(in_file){ + header <- strsplit(readLines(con=in_file, n=1), "\t")[[1]] + cl.cols<- 1:length(header) > 1 + data_matrix.df <- read.delim(in_file, header=TRUE, row.names=NULL, stringsAsFactors=FALSE, na.strings="NA", check.names=FALSE) + data_matrix <- as.matrix(data_matrix.df[,cl.cols]) + rownames(data_matrix) <- data_matrix.df[,1] + return(data_matrix) + + #read_mtrx <- as.matrix(read.table(in_file, header=TRUE, sep="", row.names=NULL, stringsAsFactors=FALSE, na.strings="NA")) #separate on white characters + #read_mtrx[,1] + + #return(as.matrix(read.table(in_file, header=TRUE, sep="", row.names=1))) #separate on white characters + #mtrx <- read.delim(in_file, header=TRUE, sep="", row.names=NULL, stringsAsFactors=FALSE, na.strings="NA") + #print(mtrx[1,]) +} + +write_matrix <- function(data_matrix){ + header <- append(c("Genes"), colnames(data_matrix)) + write.table(t(header), stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE) + write.table(data_matrix, stdout(), quote=FALSE, sep="\t", row.names=TRUE, col.names=FALSE) +} + +read_normals <- function(in_file){ + return(as.matrix(read.table(in_file, header=FALSE, sep="", as.is = TRUE))[, 1]) +} + +normalize <- function(data_matrix, norm_type, normals_list, tumors_list){ + if(norm_type == 'MEDIAN_SHIFT'){ + return(shift(data_matrix, 'MEDIAN', normals_list, tumors_list)) + } + else if(norm_type == 'MEAN_SHIFT'){ + return(shift(data_matrix, 'MEAN', normals_list, tumors_list)) + } + else if(norm_type == 'T_STATISTIC'){ + return(compute_z_score(data_matrix, normals_list, tumors_list)) + } + else if(norm_type == 'EXPONENTIAL_FIT'){ + return(fit_distribution(data_matrix, 'EXPONENTIAL')) + } + else if(norm_type == 'NORMAL_FIT'){ + return(fit_distribution(data_matrix, 'NORMAL')) + } + else if(norm_type == 'WEIBULL_0.5_FIT'){ + return(fit_distribution(data_matrix, 'WEIBULL_0.5')) + } + else if(norm_type == 'WEIBULL_1_FIT'){ + return(fit_distribution(data_matrix, 'WEIBULL_1')) + } + else if(norm_type == 'WEIBULL_1.5_FIT'){ + return(fit_distribution(data_matrix, 'WEIBULL_1.5')) + } + else if(norm_type == 'WEIBULL_5_FIT'){ + return(fit_distribution(data_matrix, 'WEIBULL_5')) + } +} + +shift <- function(data_matrix, shift_type, normals_list, tumors_list){ + return(t(apply(data_matrix, 1, shift_normalize_row, norm_type=shift_type, normals_list=normals_list, tumors_list=tumors_list))) +} + +shift_normalize_row <- function(data_row, norm_type, normals_list, tumors_list){ + if(length(normals_list) == 0){ #no normals are specified + if(norm_type == 'MEDIAN'){ + row_stat <- median(data_row) + } + else if(norm_type == 'MEAN'){ + row_stat <- mean(data_row) + } + return(unlist(lapply(data_row, function(x){return(x - row_stat);}))) + } + else{ #normals are specified + normal_values <- data_row[normals_list] + tumor_columns <- data_row[tumors_list] + + if(norm_type == 'MEDIAN'){ + row_stat <- median(normal_values) + } + else if(norm_type == 'MEAN'){ + row_stat <- mean(normal_values) + } + return(unlist(lapply(tumor_columns, function(x){return(x - row_stat);}))) + } +} + +compute_z_score <- function(data_matrix, normals_list, tumors_list){ + return(t(apply(data_matrix, 1, t_stat_normalize_row, normals_list=normals_list, tumors_list=tumors_list))) +} + +t_stat_normalize_row <- function(data_row, normals_list, tumors_list){ + if(length(normals_list) == 0){ #no normals are specified + row_mean <- mean(data_row) + row_sd <- sd(data_row) + return(unlist(lapply(data_row, function(x){return((x - row_mean)/row_sd);}))) + } + else{ #normals are specified + normal_values <- data_row[normals_list] + normal_mean <- mean(normal_values) + normal_sd <- sd(normal_values) + normalized_normals <- unlist(lapply(normal_values, function(x){return((x - normal_mean)/normal_sd);})) + + tumor_values <- data_row[tumors_list] + tumor_mean <- mean(tumor_values) + tumor_sd <- sd(tumor_values) + normalized_tumors <- unlist(lapply(tumor_values, function(x){return((x - tumor_mean)/tumor_sd);})) + + return(append(normalized_normals, normalized_tumors)) + } +} + +rankNA <- function(col){ #originally written by Dan Carlin + col[!is.na(col)]<-(rank(col[!is.na(col)])/sum(!is.na(col)))-(1/sum(!is.na(col))) + return(col) +} + +fit_distribution <- function(data_matrix, dist){ + if(dist == 'EXPONENTIAL'){ + ranked_data_matrix <- apply(data_matrix,2,rankNA) #idea by Dan Carlin + return(apply(ranked_data_matrix, c(1,2), qexp)) + } + else if(dist == 'NORMAL'){ + ranked_data_matrix <- apply(data_matrix,2,rankNA) + #return(apply(ranked_data_matrix, c(1,2), function(x){return(qnorm(mean=mean(x), sd=sd(x)));})) + return(apply(ranked_data_matrix, c(1,2), qnorm, mean=0, sd=2)) + } + else if(dist == 'WEIBULL_0.5'){ + ranked_data_matrix <- apply(data_matrix,2,rankNA) + return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=0.5)) + } + else if(dist == 'WEIBULL_1'){ + ranked_data_matrix <- apply(data_matrix,2,rankNA) + return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=1)) + } + else if(dist == 'WEIBULL_1.5'){ + ranked_data_matrix <- apply(data_matrix,2,rankNA) + return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=1.5)) + } + else if(dist == 'WEIBULL_5'){ + ranked_data_matrix <- apply(data_matrix,2,rankNA) + return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=5)) + } +} + +main <- function(argv) { + #determine if correct number of arguments are specified and if normals are specified + with_normals = FALSE + + if(length(argv) == 1){ + if(argv==c('--help')){ + write(argspec, stderr()); + q(); + } + } + + if(!(length(argv) == 3 || length(argv) == 4)){ + write("ERROR: invalid number of arguments is specified", stderr()); + q(); + } + + if(length(argv) == 4){ + with_normals = TRUE + normals_file <- argv[4] + } + + #store command line arguments in variables: + input_file <- argv[1] + norm_type <- toupper(argv[2]) + norm_by <- toupper(argv[3]) + + #read the input file(s): + data_matrix <- read_matrix(input_file) + + if(with_normals){ + normals_list <- read_normals(normals_file) + normals_indices <- which(colnames(data_matrix) %in% normals_list) + tumor_indices <- which(!(colnames(data_matrix) %in% normals_list)) + norm_by <- 'ROW' + } + else{ + normals_indices <- c() + tumor_indices <- c() + } + + #if normalize by columns then transpose the matrix: + if(norm_by == 'COLUMN'){ + data_matrix <- t(data_matrix) + } + + #normalize: + data_matrix <- normalize(data_matrix, norm_type, normals_indices, tumor_indices) + + #if normalize by columns then transpose the matrix again since we normalized the transposed matrix by row: + if(norm_by == 'COLUMN'){ + data_matrix <- t(data_matrix) + } + + write_matrix(data_matrix) + #print(data_matrix) +} + +main(commandArgs(TRUE)) diff -r 000000000000 -r 31cfcab40d8f normalize.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/normalize.xml Wed Sep 26 17:32:30 2012 -0400 @@ -0,0 +1,56 @@ + + Matrix Normalize + normalize.r $genomicMatrix $normType $normBy +#if str($normalList) != "None": + $normalList +#end if + > $outfile + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool takes data in a matrix format and normalizes it using the chosen normalization options. The matrix data is assumed to be column and row annotated, meaning that the first line of the matrix file is assumed to be the column headers and the first column of each row is assumed to be the row header. + +Data can be normalized either by row or column. Note that fitting normalizations automatically do so by column regardless of the user selection. + +The following normalizations are provided: + +1. Median shift: if no normals list is provided then computes the median for the whole row and subtracts it from each entry of the row. If normals are provided then computes median for normals and subtracts it from each value of non-normal. Returns only non-normal samples if normals are provided. If "Column" is selected in normalize by, then normals are ignored. + +2. Mean shift: if no normals list is provided then computes the mean for the whole row and subtracts it from each entry of the row. If normals are provided then computes mean for normals and subtracts it from each value of non-normal. Returns only non-normal samples if normals are provided. If "Column" is selected in normalize by, then normals are ignored. + +3. T-statistic (z-score): sometimes called standardization. Z-score is computed for each value of the row/column. If normals are specified then the z-score within each class (normals and non-normals) is computed. + +4. Exponential normalization: performed by columns/samples. All genes/probes in the column/sample are ranked. Then inverse CDF (quantile function) is applied to the ranks (transforms a rank to a real number in exponential distribution). + +5. Normal normalization: same as exponential normalization, but inverse quantile function of Normal distribution is applied. + +6. Weibull normalizations: same as exponential normalization, but inverse quantile function of Weibull distribution is applied with appropriate scale and shape parameters. + + +Normals parameter is an optional parameter which contains a list of column headers from the input matrix which should be considered as normals + + + \ No newline at end of file