Mercurial > repos > yhoogstrate > edger_with_design_matrix
diff edgeR_Differential_Gene_Expression.xml @ 107:049d8bc2214e draft
planemo upload for repository https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools/raw/master/edger_with_design_matrix commit 2700e500a4fb135a20ede7d52221a9d31f1aaa5e-dirty
author | yhoogstrate |
---|---|
date | Tue, 01 Sep 2015 04:32:16 -0400 |
parents | f2ac9f6bc542 |
children | a02794bb9073 |
line wrap: on
line diff
--- a/edgeR_Differential_Gene_Expression.xml Tue Sep 01 04:25:37 2015 -0400 +++ b/edgeR_Differential_Gene_Expression.xml Tue Sep 01 04:32:16 2015 -0400 @@ -29,11 +29,6 @@ <version_command>echo $(R --version | grep version | grep -v GNU) " , EdgeR version" $(R --vanilla --slave -e "library(edgeR) ; cat(sessionInfo()\$otherPkgs\$edgeR\$Version)" 2> /dev/null | grep -v -i "WARNING: ")</version_command> <command> - <!-- - The following script is written in the "Cheetah" language: - http://www.cheetahtemplate.org/docs/users_guide_html_multipage/contents.html - --> - R --vanilla --slave -f $R_script '--args $expression_matrix $design_matrix @@ -111,48 +106,47 @@ <configfiles> <configfile name="R_script"> -library(limma,quietly=TRUE) ## enable quietly to avoid unnecessaity stderr dumping -library(edgeR,quietly=TRUE) ## enable quietly to avoid unnecessaity stderr dumping -library(splines,quietly=TRUE) ## enable quietly to avoid unnecessaity stderr dumping +<![CDATA[ + +library(limma,quietly=TRUE) ## quietly to avoid unnecessaity stderr messages +library(edgeR,quietly=TRUE) ## quietly to avoid unnecessaity stderr messages +library(splines,quietly=TRUE)## quietly to avoid unnecessaity stderr messages ## Fetch commandline arguments -args <- commandArgs(trailingOnly = TRUE) +args <- commandArgs(trailingOnly = TRUE) -expression_matrix_file = args[1] -design_matrix_file = args[2] -contrast = args[3] +expression_matrix_file <- args[1] +design_matrix_file <- args[2] +contrast <- args[3] -fdr = args[4] +fdr <- args[4] -output_count_edgeR = args[5] -output_cpm = args[6] +output_count_edgeR <- args[5] +output_cpm <- args[6] -output_xpkm = args[7] ##FPKM file - yet to be implemented +output_xpkm <- args[7] ##FPKM file - to be implemented -output_raw_counts = args[8] -output_MDSplot_logFC = args[9] -output_MDSplot_bcv = args[10] -output_BCVplot = args[11] -output_MAplot = args[12] -output_PValue_distribution_plot = args[13] -output_hierarchical_clustering_plot = args[14] -output_heatmap_plot = args[15] -output_RData_obj = args[16] -output_format_images = args[17] +output_raw_counts <- args[8] +output_MDSplot_logFC <- args[9] +output_MDSplot_bcv <- args[10] +output_BCVplot <- args[11] +output_MAplot <- args[12] +output_PValue_distribution_plot <- args[13] +output_hierarchical_clustering_plot <- args[14] +output_heatmap_plot <- args[15] +output_RData_obj <- args[16] +output_format_images <- args[17] -library(edgeR) -##raw_data <- read.delim(designmatrix,header=T,stringsAsFactors=T) ## Obtain read-counts +expression_matrix <- read.delim(expression_matrix_file,header=T,stringsAsFactors=F,row.names=1,check.names=FALSE,na.strings=c("")) +design_matrix <- read.delim(design_matrix_file,header=T,stringsAsFactors=F,row.names=1,check.names=FALSE,na.strings=c("")) -expression_matrix <- read.delim(expression_matrix_file,header=T,stringsAsFactors=F,row.names=1,check.names=FALSE,na.strings=c("")) -design_matrix <- read.delim(design_matrix_file,header=T,stringsAsFactors=F,row.names=1,check.names=FALSE,na.strings=c("")) - -colnames(design_matrix) <- make.names(colnames(design_matrix)) +colnames(design_matrix) <- make.names(colnames(design_matrix)) for(i in 1:ncol(design_matrix)) { - old <- design_matrix[,i] - design_matrix[,i] <- make.names(design_matrix[,i]) + old <- design_matrix[,i] + design_matrix[,i] <- make.names(design_matrix[,i]) if(paste(design_matrix[,i],collapse="\t") != paste(old,collapse="\t")) { print("Renaming of factors:") print(old) @@ -160,46 +154,46 @@ print(design_matrix[,i]) } ## The following line seems to malfunction the script: - ##design_matrix[,i] <- as.factor(design_matrix[,i]) + ##design_matrix[,i] <- as.factor(design_matrix[,i]) } ## 1) In the expression matrix, you only want to have the samples described in the design matrix -columns <- match(rownames(design_matrix),colnames(expression_matrix)) -columns <- columns[!is.na(columns)] -read_counts <- expression_matrix[,columns] +columns <- match(rownames(design_matrix),colnames(expression_matrix)) +columns <- columns[!is.na(columns)] +read_counts <- expression_matrix[,columns] ## 2) In the design matrix, you only want to have samples of which you really have the counts -columns <- match(colnames(read_counts),rownames(design_matrix)) -columns <- columns[!is.na(columns)] -design_matrix <- design_matrix[columns,,drop=FALSE] +columns <- match(colnames(read_counts),rownames(design_matrix)) +columns <- columns[!is.na(columns)] +design_matrix <- design_matrix[columns,,drop=FALSE] ## Filter for HTSeq predifined counts: -exclude_HTSeq <- c("no_feature","ambiguous","too_low_aQual","not_aligned","alignment_not_unique") -exclude_DEXSeq <- c("_ambiguous","_empty","_lowaqual","_notaligned") +exclude_HTSeq <- c("no_feature","ambiguous","too_low_aQual","not_aligned","alignment_not_unique") +exclude_DEXSeq <- c("_ambiguous","_empty","_lowaqual","_notaligned") -exclude <- match(c(exclude_HTSeq, exclude_DEXSeq),rownames(read_counts)) -exclude <- exclude[is.na(exclude)==0] +exclude <- match(c(exclude_HTSeq, exclude_DEXSeq),rownames(read_counts)) +exclude <- exclude[is.na(exclude)==0] if(length(exclude) != 0) { - read_counts <- read_counts[-exclude,] + read_counts <- read_counts[-exclude,] } ## sorting expression matrix with the order of the read_counts -##order <- match(colnames(read_counts) , rownames(design_matrix)) -##read_counts_ordered <- read_counts[,order2] +##order <- match(colnames(read_counts) , rownames(design_matrix)) +##read_counts_ordered <- read_counts[,order2] -empty_samples <- apply(read_counts,2,function(x) sum(x) == 0) +empty_samples <- apply(read_counts,2,function(x) sum(x) == 0) if(sum(empty_samples) > 0) { write(paste("There are ",sum(empty_samples)," empty samples found:",sep=""),stderr()) write(colnames(read_counts)[empty_samples],stderr()) } else { - dge <- DGEList(counts=read_counts,genes=rownames(read_counts)) + dge <- DGEList(counts=read_counts,genes=rownames(read_counts)) - formula <- paste(c("~0",make.names(colnames(design_matrix))),collapse = " + ") - design_matrix_tmp <- design_matrix - colnames(design_matrix_tmp) <- make.names(colnames(design_matrix_tmp)) - design <- model.matrix(as.formula(formula),design_matrix_tmp) + formula <- paste(c("~0",make.names(colnames(design_matrix))),collapse = " + ") + design_matrix_tmp <- design_matrix + colnames(design_matrix_tmp) <- make.names(colnames(design_matrix_tmp)) + design <- model.matrix(as.formula(formula),design_matrix_tmp) rm(design_matrix_tmp) # Filter prefixes @@ -211,18 +205,18 @@ # Do normalization write("Calculating normalization factors...",stdout()) - dge <- calcNormFactors(dge) + dge <- calcNormFactors(dge) write("Estimating common dispersion...",stdout()) - dge <- estimateGLMCommonDisp(dge,design) + dge <- estimateGLMCommonDisp(dge,design) write("Estimating trended dispersion...",stdout()) - dge <- estimateGLMTrendedDisp(dge,design) + dge <- estimateGLMTrendedDisp(dge,design) write("Estimating tagwise dispersion...",stdout()) - dge <- estimateGLMTagwiseDisp(dge,design) + dge <- estimateGLMTagwiseDisp(dge,design) if(output_MDSplot_logFC != "/dev/null") { write("Creating MDS plot (logFC method)",stdout()) - points <- plotMDS.DGEList(dge,top=500,labels=rep("",nrow(dge\$samples)))# Get coordinates of unflexible plot + points <- plotMDS.DGEList(dge,top=500,labels=rep("",nrow(dge\$samples)))# Get coordinates of unflexible plot dev.off()# Kill it if(output_format_images == "pdf") { @@ -237,7 +231,7 @@ } - diff_x <- abs(max(points\$x)-min(points\$x)) + diff_x <- abs(max(points\$x)-min(points\$x)) diff_y <-(max(points\$y)-min(points\$y)) plot(c(min(points\$x),max(points\$x) + 0.45 * diff_x), c(min(points\$y) - 0.05 * diff_y,max(points\$y) + 0.05 * diff_y), main="edgeR logFC-MDS Plot on top 500 genes",type="n", xlab="Leading logFC dim 1", ylab="Leading logFC dim 2") points(points\$x,points\$y,pch=20) @@ -252,7 +246,7 @@ ## 1. First create a virtual plot to obtain the desired coordinates pdf("bcvmds.pdf") - points <- plotMDS.DGEList(dge,method="bcv",top=500,labels=rep("",nrow(dge\$samples))) + points <- plotMDS.DGEList(dge,method="bcv",top=500,labels=rep("",nrow(dge\$samples))) dev.off()# Kill it ## 2. Re-plot the coordinates in a new figure with the size and settings. @@ -267,8 +261,8 @@ bitmap(output_MDSplot_bcv,type="png16m",height=14,width=14) } - diff_x <- abs(max(points\$x)-min(points\$x)) - diff_y <-(max(points\$y)-min(points\$y)) + diff_x <- abs(max(points\$x)-min(points\$x)) + diff_y <- (max(points\$y)-min(points\$y)) plot(c(min(points\$x),max(points\$x) + 0.45 * diff_x), c(min(points\$y) - 0.05 * diff_y,max(points\$y) + 0.05 * diff_y), main="edgeR BCV-MDS Plot",type="n", xlab="Leading BCV dim 1", ylab="Leading BCV dim 2") points(points\$x,points\$y,pch=20) text(points\$x, points\$y,rownames(dge\$samples),cex=1.25,col="gray",pos=4) @@ -298,13 +292,13 @@ write("Fitting GLM...",stdout()) - fit <- glmFit(dge,design) + fit <- glmFit(dge,design) write(paste("Performing likelihood ratio test: ",contrast,sep=""),stdout()) - cont <- c(contrast) - cont <- makeContrasts(contrasts=cont, levels=design) + cont <- c(contrast) + cont <- makeContrasts(contrasts=cont, levels=design) - lrt <- glmLRT(fit, contrast=cont[,1]) + lrt <- glmLRT(fit, contrast=cont[,1]) write(paste("Exporting to file: ",output_count_edgeR,sep=""),stdout()) write.table(file=output_count_edgeR,topTags(lrt,n=nrow(read_counts))\$table,sep="\t",row.names=TRUE,col.names=NA) write.table(file=output_cpm,cpm(dge,normalized.lib.sizes=TRUE),sep="\t",row.names=TRUE,col.names=NA) @@ -313,8 +307,8 @@ write.table(file=output_raw_counts,dge\$counts,sep="\t",row.names=TRUE,col.names=NA) if(output_MAplot != "/dev/null" || output_PValue_distribution_plot != "/dev/null") { - etable <- topTags(lrt, n=nrow(dge))\$table - etable <- etable[order(etable\$FDR), ] + etable <- topTags(lrt, n=nrow(dge))\$table + etable <- etable[order(etable\$FDR), ] if(output_MAplot != "/dev/null") { write("Creating MA plot...",stdout()) @@ -350,16 +344,16 @@ bitmap(output_PValue_distribution_plot,type="png16m",width=14,height=14) } - expressed_genes <- subset(etable, PValue < 0.99) - h <- hist(expressed_genes\$PValue,breaks=nrow(expressed_genes)/15,main="Binned P-Values (< 0.99)") - center <- sum(h\$counts) / length(h\$counts) + expressed_genes <- subset(etable, PValue < 0.99) + h <- hist(expressed_genes\$PValue,breaks=nrow(expressed_genes)/15,main="Binned P-Values (< 0.99)") + center <- sum(h\$counts) / length(h\$counts) lines(c(0,1),c(center,center),lty=2,col="red",lwd=2) - k <- ksmooth(h\$mid, h\$counts) + k <- ksmooth(h\$mid, h\$counts) lines(k\$x,k\$y,col="red",lwd=2) - rmsd <- (h\$counts) - center - rmsd <- rmsd^2 - rmsd <- sum(rmsd) - rmsd <- sqrt(rmsd) + rmsd <- (h\$counts) - center + rmsd <- rmsd^2 + rmsd <- sum(rmsd) + rmsd <- sqrt(rmsd) text(0,max(h\$counts),paste("e=",round(rmsd,2),sep=""),pos=4,col="blue") ## change e into epsilon somehow dev.off() @@ -379,9 +373,9 @@ bitmap(output_heatmap_plot,type="png16m",width=10.5) } - etable2 <- topTags(lrt, n=100)\$table - order <- rownames(etable2) - cpm_sub <- cpm(dge,normalized.lib.sizes=TRUE,log=TRUE)[as.numeric(order),] + etable2 <- topTags(lrt, n=100)\$table + order <- rownames(etable2) + cpm_sub <- cpm(dge,normalized.lib.sizes=TRUE,log=TRUE)[as.numeric(order),] heatmap(t(cpm_sub)) dev.off() } @@ -394,6 +388,7 @@ write("Done!",stdout()) } +]]> </configfile> </configfiles>