Mercurial > repos > yhoogstrate > edger_with_design_matrix

--- a/edgeR_DGE_test.R	Thu Jan 09 02:55:02 2014 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,151 +0,0 @@
-#!/usr/bin/env Rscript
-
-# edgeR citation:
-#  Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor package for differential expression
-#  analysis of digital gene expression data. Bioinformatics 26, 139-140
-#
-#  Robinson MD and Smyth GK (2007). Moderated statistical tests for assessing differences in tag
-#  abundance. Bioinformatics 23, 2881-2887
-
-#  Robinson MD and Smyth GK (2008). Small-sample estimation of negative binomial dispersion, with
-#  applications to SAGE data. Biostatistics, 9, 321-332
-
-#  McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression analysis of multifactor RNA-Seq
-#  experiments with respect to biological variation. Nucleic Acids Research 40, 4288-4297
-
-# R script Author:
-# - MSc. René Böttcher (Erasmus MC)
-
-# Hooked into Galaxy Server:
-# - MSc. Youri Hoogstrate (Erasmus MC)
-
-
-setwd("/home/youri/Desktop/galaxy/tools/TraIT/edgeR/DGE")
-
-library(edgeR)
-
-# Fetch commandline arguments
-args <- commandArgs(trailingOnly = TRUE)
-designmatrix        = args[1]
-contrast            = args[2]
-
-output_1            = args[3]
-output_2            = args[4]
-output_3            = args[5]		#FPKM file - to be implemented
-output_4            = args[6]
-
-QC                  = nchar(args[7]) > 0
-
-output_5            = args[8]
-output_6            = args[9]
-output_7            = args[10]
-
-output_8            = args[11]
-
-
-
-
-library(edgeR)
-raw_data <- read.delim(designmatrix,header=T,stringsAsFactors=T)
-
-# Obtain read-counts
-read_counts 		= read.delim(as.character(raw_data[1,1]),header=F,stringsAsFactors=F,row.names=1)
-for(i in 2:length(raw_data[,1])) {
-  print("parsing counts from:")
-  print(raw_data[i,1])
-  read_counts = cbind(read_counts,read.delim(as.character(raw_data[i,1]),header=F,stringsAsFactors=F,row.names=1))
-  print(i)
-}
-
-
-
-# Filter for HTSeq predifined counts:
-exclude_HTSeq = c("no_feature","ambiguous","too_low_aQual","not_aligned","alignment_not_unique")
-exclude_DEXSeq = c("_ambiguous","_empty","_lowaqual","_notaligned")
-
-exclude = match(c(exclude_HTSeq, exclude_DEXSeq),rownames(read_counts))
-exclude = exclude[is.na(exclude)==0]
-if(length(exclude) != 0)  {
-  read_counts = read_counts[-exclude,]
-}
-
-
-
-
-
-
-
-
-
-colnames(read_counts) = raw_data[,2]
-dge 					= DGEList(counts=read_counts,genes=rownames(read_counts))
-
-design_tmp <- raw_data[3:length(raw_data)]
-rownames(design_tmp)     <- colnames(dge)
-formula = paste(c("~0",colnames(design_tmp)),collapse = " + ")
-design <- model.matrix(as.formula(formula),design_tmp)
-
-prefixes = colnames(design_tmp)[attr(design,"assign")]
-avoid = nchar(prefixes) == nchar(colnames(design))
-replacements = substr(colnames(design),nchar(prefixes)+1,nchar(colnames(design)))
-replacements[avoid] = colnames(design)[avoid]
-colnames(design) = replacements
-
-
-
-print("Calculating normalization factors...")
-dge		= calcNormFactors(dge)
-print("Estimating common dispersion...")
-dge 	= estimateGLMCommonDisp(dge,design)
-print("Estimating trended dispersion...")
-dge 	= estimateGLMTrendedDisp(dge,design)
-print("Estimating tagwise dispersion...")
-dge 	= estimateGLMTagwiseDisp(dge,design)
-
-
-
-
-if (QC == TRUE) {
-  print("Creating QC plots...")
-  ## MDS Plot
-  pdf(output_5)
-  plotMDS(dge, main="edgeR MDS Plot")
-  dev.off()
-  ## Biological coefficient of variation plot
-  pdf(output_6)
-  plotBCV(dge, cex=0.4, main="edgeR: Biological coefficient of variation (BCV) vs abundance")
-  dev.off()
-}
-
-
-
-print("Fitting GLM...")
-fit 	= glmFit(dge,design)
-
-print(paste("Performing likelihood ratio test: ",contrast,sep=""))
-cont <- c(contrast)
-cont <- makeContrasts(contrasts=cont, levels=design)
-
-lrt <- glmLRT(fit, contrast=cont[,1])
-print(paste("Exporting to file: ",output_1,sep=""))
-write.table(file=output_1,topTags(lrt,n=nrow(read_counts))$table,sep="\t",row.names=T)
-write.table(file=output_2,cpm(dge,normalized.lib.sizes=TRUE),sep="\t")
-# todo EXPORT FPKM
-write.table(file=output_4,dge$counts,sep="\t")
-
-
-
-if (QC == TRUE) {
-  print("Creating MA plots...")
-
-
-  etable <- topTags(lrt, n=nrow(dge))$table
-  etable <- etable[order(etable$FDR), ]
-  pdf(output_7)
-  with(etable, plot(logCPM, logFC, pch=20, main="edgeR: Fold change vs abundance"))
-  with(subset(etable, FDR<0.05), points(logCPM, logFC, pch=20, col="red"))
-  abline(h=c(-1,1), col="blue")
-  dev.off()
-}
-print("Done!")
-