# HG changeset patch # User yhoogstrate # Date 1400577994 14400 # Node ID 7cb518091b1826c2a5022da0d6bb1d1f519078e3 # Parent 86f91bf4ab4cd75ae5e7ddaa6562e4ffd12199f7 Uploaded diff -r 86f91bf4ab4c -r 7cb518091b18 edgeR_Differential_Gene_Expression.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/edgeR_Differential_Gene_Expression.xml Tue May 20 05:26:34 2014 -0400 @@ -0,0 +1,422 @@ + + + RNA-Seq gene expression analysis using edgeR (R package) + + + r3 + biocbasics + + + + + + R --vanilla --slave -f $R_script '--args + $expression_matrix + $design_matrix + $contrast + + $fdr + + $output_count_edgeR + $output_cpm + + /dev/null + + #if $output_raw_counts: + $output_raw_counts + #else: + /dev/null + #end if + + #if $output_MDSplot: + $output_MDSplot + #else: + /dev/null + #end if + + #if $output_BCVplot: + $output_BCVplot + #else: + /dev/null + #end if + + #if $output_MAplot: + $output_MAplot + #else: + /dev/null + #end if + + #if $output_PValue_distribution_plot: + $output_PValue_distribution_plot + #else: + /dev/null + #end if + + #if $output_hierarchical_clustering_plot: + $output_hierarchical_clustering_plot + #else: + /dev/null + #end if + + #if $output_heatmap_plot: + $output_heatmap_plot + #else: + /dev/null + #end if + + #if $output_RData_obj: + $output_RData_obj + #else: + /dev/null + #end if + ' + #if $output_R: + > $output_R + #else: + > /dev/null + #end if + + 2> stderr.txt + ; + grep -v 'Calculating library sizes from column' stderr.txt 1>&2 + + + + + + + + + + + + + + + + + + + + + + + + + + + +library(limma,quietly=TRUE) ## enable quietly to avoid unnecessaity stderr dumping +library(edgeR,quietly=TRUE) ## enable quietly to avoid unnecessaity stderr dumping +library(splines,quietly=TRUE) ## enable quietly to avoid unnecessaity stderr dumping + +## Fetch commandline arguments +args <- commandArgs(trailingOnly = TRUE) + +expression_matrix_file = args[1] +design_matrix_file = args[2] +contrast = args[3] + +fdr = args[4] + +output_count_edgeR = args[5] +output_cpm = args[6] + +output_xpkm = args[7] ##FPKM file - yet to be implemented + +output_raw_counts = args[8] +output_MDSplot = args[9] +output_BCVplot = args[10] +output_MAplot = args[11] +output_PValue_distribution_plot = args[12] +output_hierarchical_clustering_plot = args[13] +output_heatmap_plot = args[14] +output_RData_obj = args[15] + + +library(edgeR) +##raw_data <- read.delim(designmatrix,header=T,stringsAsFactors=T) +## Obtain read-counts + +expression_matrix <- read.delim(expression_matrix_file,header=T,stringsAsFactors=F,row.names=1,check.names=FALSE,na.strings=c("")) +design_matrix <- read.delim(design_matrix_file,header=T,stringsAsFactors=F,row.names=1,check.names=FALSE,na.strings=c("")) + +colnames(design_matrix) <- make.names(colnames(design_matrix)) + +for(i in 1:ncol(design_matrix)) { + old = design_matrix[,i] + design_matrix[,i] = make.names(design_matrix[,i]) + if(paste(design_matrix[,i],collapse="\t") != paste(old,collapse="\t")) { + print("Renaming of factors:") + print(old) + print("To:") + print(design_matrix[,i]) + } + design_matrix[,i] <- as.factor(design_matrix[,i]) +} + + +columns <- match(rownames(design_matrix),colnames(expression_matrix)) +read_counts <- expression_matrix[,columns] + + +## Filter for HTSeq predifined counts: +exclude_HTSeq <- c("no_feature","ambiguous","too_low_aQual","not_aligned","alignment_not_unique") +exclude_DEXSeq <- c("_ambiguous","_empty","_lowaqual","_notaligned") + +exclude = match(c(exclude_HTSeq, exclude_DEXSeq),rownames(read_counts)) +exclude = exclude[is.na(exclude)==0] +if(length(exclude) != 0) { + read_counts = read_counts[-exclude,] +} + + +empty_samples = apply(read_counts,2,function(x) sum(x) == 0) +if(sum(empty_samples) > 0) { + write(paste("There are ",sum(empty_samples)," empty samples found:",sep=""),stderr()) + write(colnames(read_counts)[empty_samples],stderr()) +} else { + + dge <- DGEList(counts=read_counts,genes=rownames(read_counts)) + + formula <- paste(c("~0",make.names(colnames(design_matrix))),collapse = " + ") + design_matrix_tmp <- design_matrix + colnames(design_matrix_tmp) <- make.names(colnames(design_matrix_tmp)) + design <- model.matrix(as.formula(formula),design_matrix_tmp) + rm(design_matrix_tmp) + + # Filter prefixes + prefixes = colnames(design_matrix)[attr(design,"assign")] + avoid = nchar(prefixes) == nchar(colnames(design)) + replacements = substr(colnames(design),nchar(prefixes)+1,nchar(colnames(design))) + replacements[avoid] = colnames(design)[avoid] + colnames(design) = replacements + + # Do normalization + write("Calculating normalization factors...",stdout()) + dge <- calcNormFactors(dge) + write("Estimating common dispersion...",stdout()) + dge <- estimateGLMCommonDisp(dge,design) + write("Estimating trended dispersion...",stdout()) + dge <- estimateGLMTrendedDisp(dge,design) + write("Estimating tagwise dispersion...",stdout()) + dge <- estimateGLMTagwiseDisp(dge,design) + + + if(output_MDSplot != "/dev/null") { + write("Creating MDS plot",stdout()) + ##points <- plotMDS(dge,method="bcv",labels=rep("",nrow(dge\$samples)))# Get coordinates of unflexible plot + points <- plotMDS.DGEList(dge,labels=rep("",nrow(dge\$samples)))# Get coordinates of unflexible plot + dev.off()# Kill it + + pdf(output_MDSplot) + diff_x <- abs(max(points\$x)-min(points\$x)) + diff_y <-(max(points\$y)-min(points\$y)) + plot(c(min(points\$x),max(points\$x) + 0.45 * diff_x), c(min(points\$y) - 0.05 * diff_y,max(points\$y) + 0.05 * diff_y), main="edgeR MDS Plot",type="n", xlab="BCV distance 1", ylab="BCV distance 2") + points(points\$x,points\$y,pch=20) + text(points\$x, points\$y,rownames(dge\$samples),cex=0.7,col="gray",pos=4) + rm(diff_x,diff_y) + + dev.off() + } + + if(output_BCVplot != "/dev/null") { + write("Creating Biological coefficient of variation plot",stdout()) + pdf(output_BCVplot) + plotBCV(dge, cex=0.4, main="edgeR: Biological coefficient of variation (BCV) vs abundance") + dev.off() + } + + + write("Fitting GLM...",stdout()) + fit <- glmFit(dge,design) + + write(paste("Performing likelihood ratio test: ",contrast,sep=""),stdout()) + cont <- c(contrast) + cont <- makeContrasts(contrasts=cont, levels=design) + + lrt <- glmLRT(fit, contrast=cont[,1]) + write(paste("Exporting to file: ",output_count_edgeR,sep=""),stdout()) + write.table(file=output_count_edgeR,topTags(lrt,n=nrow(read_counts))\$table,sep="\t",row.names=TRUE,col.names=NA) + write.table(file=output_cpm,cpm(dge,normalized.lib.sizes=TRUE),sep="\t",row.names=TRUE,col.names=NA) + + ## todo EXPORT FPKM + write.table(file=output_raw_counts,dge\$counts,sep="\t",row.names=TRUE,col.names=NA) + + + if(output_MAplot != "/dev/null") { + write("Creating MA plot...",stdout()) + + etable <- topTags(lrt, n=nrow(dge))\$table + etable <- etable[order(etable\$FDR), ] + pdf(output_MAplot) + with(etable, plot(logCPM, logFC, pch=20, main="edgeR: Fold change vs abundance")) + with(subset(etable, FDR < fdr), points(logCPM, logFC, pch=20, col="red")) + abline(h=c(-1,1), col="blue") + dev.off() + } + + if(output_PValue_distribution_plot != "/dev/null") { + write("Creating P-value distribution plot...",stdout()) + pdf(output_PValue_distribution_plot) + expressed_genes <- subset(etable, PValue < 0.99) + h <- hist(expressed_genes\$PValue,breaks=nrow(expressed_genes)/15,main="Binned P-Values (< 0.99)") + center <- sum(h\$counts) / length(h\$counts) + lines(c(0,1),c(center,center),lty=2,col="red",lwd=2) + k <- ksmooth(h\$mid, h\$counts) + lines(k\$x,k\$y,col="red",lwd=2) + rmsd <- (h\$counts) - center + rmsd <- rmsd^2 + rmsd <- sum(rmsd) + rmsd <- sqrt(rmsd) + text(0,max(h\$counts),paste("e=",round(rmsd,2),sep=""),pos=4,col="blue") + ## change e into epsilon somehow + dev.off() + } + + ##output_hierarchical_clustering_plot = args[13] + ##output_heatmap_plot = args[14] + + if(output_RData_obj != "/dev/null/") { + save.image(output_RData_obj) + } + + write("Done!",stdout()) +} + + + + + + + + + ("make_output_raw_counts" in outputs) + + + + ("make_output_MDSplot" in outputs) + + + + ("make_output_BCVplot" in outputs) + + + + ("make_output_MAplot" in outputs) + + + + ("make_output_PValue_distribution_plot" in outputs) + + + + ("make_output_hierarchical_clustering_plot" in outputs) + + + + ("make_output_heatmap_plot" in outputs) + + + + ("make_output_R" in outputs) + + + + ("make_output_RData_obj" in outputs) + + + + +edgeR: Differential Gene(Expression) Analysis + +**Overview** + +Differential expression analysis of RNA-seq and digital gene expression profiles with biological replication. Uses empirical Bayes estimation and exact tests based on the negative binomial distribution. Also useful for differential signal analysis with other types of genome-scale count data. +Author: Mark Robinson, Davis McCarthy, Yunshun Chen, Aaron Lun & Gordon Smyth + +http://www.bioconductor.org/packages/2.12/bioc/html/edgeR.html +http://dx.doi.org/10.1093/bioinformatics/btp616 + +For every experiment, the algorithm requires a design matrix. This matrix describes which samples belong to which groups. +More details on this are given in the edgeR manual: +http://www.bioconductor.org/packages/2.12/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf +and the limma manual. + +Because the creation of a design matrix can be complex and time consuming, especially if no GUI is used, this package comes with an alternative tool which can help you with it. +This tool is called *edgeR Design Matrix Creator*. +If the appropriate design matrix (with corresponding links to the files) is given, +the correct contrast ( http://en.wikipedia.org/wiki/Contrast_(statistics) ) has to be given. + +If you have for example two groups, with an equal weight, you would like to compare either +"g1~g2" or "normal~cancer". + +** Input ** + +Expression matrix:: + + Geneid "\t" Sample-1 "\t" Sample-2 "\t" Sample-3 "\t" Sample-4 [...] "\n" + SMURF "\t" 123 "\t" 21 "\t" 34545 "\t" 98 ... "\n" + BRCA1 "\t" 435 "\t" 6655 "\t" 45 "\t" 55 ... "\n" + LINK33 "\t" 4 "\t" 645 "\t" 345 "\t" 1 ... "\n" + SNORD78 "\t" 498 "\t" 65 "\t" 98 "\t" 27 ... "\n" + [...] + +Note: Make sure the number of columns in the header is identical to the number of columns in the body. + +Design matrix:: + + Sample "\t" Condition "\t" Ethnicity "\t" Patient "\t" Batch "\n" + Sample-1 "\t" Tumor "\t" European "\t" 1 "\t" 1 "\n" + Sample-2 "\t" Normal "\t" European "\t" 1 "\t" 1 "\n" + Sample-3 "\t" Tumor "\t" European "\t" 2 "\t" 1 "\n" + Sample-4 "\t" Normal "\t" European "\t" 2 "\t" 1 "\n" + Sample-5 "\t" Tumor "\t" African "\t" 3 "\t" 1 "\n" + Sample-6 "\t" Normal "\t" African "\t" 3 "\t" 1 "\n" + Sample-7 "\t" Tumor "\t" African "\t" 4 "\t" 2 "\n" + Sample-8 "\t" Normal "\t" African "\t" 4 "\t" 2 "\n" + Sample-9 "\t" Tumor "\t" Asian "\t" 5 "\t" 2 "\n" + Sample-10 "\t" Normal "\t" Asian "\t" 5 "\t" 2 "\n" + Sample-11 "\t" Tumor "\t" Asian "\t" 6 "\t" 2 "\n" + Sample-12 "\t" Normal "\t" Asian "\t" 6 "\t" 2 "\n" + +Note: Avoid factor names that are (1) numerical, (2) contain mathematical symbols and preferebly only use letters. + +Contrast:: + + Tumor-Normal + African-European + 0.5*(Control+Placebo) / Treated + + +**Installation** + +This tool requires no specific configurations. The following dependencies are installed automatically: + * R + * Bioconductor + - limma + - edgeR + +**License** + - R - GPL-2 & GPL-3 + - limma - GPL (>=2) + - edgeR - GPL (>=2) + +**Contact** + +The tool wrapper has been written by Youri Hoogstrate from the Erasmus Medical Center (Rotterdam, Netherlands) on behalf of the Translational Research IT (TraIT) project: +http://www.ctmm.nl/en/programmas/infrastructuren/traitprojecttranslationeleresearch + +More tools by the Translational Research IT (TraIT) project can be found in the following repository: +http://toolshed.nbic.nl/ + +**References** + +The test data is coming from: doi: 10.1093/bioinformatics/btt688. +http://www.ncbi.nlm.nih.gov/pubmed/24319002 + + +