changeset 13:6e475c849a38 draft

Uploaded
author yhoogstrate
date Thu, 09 Jan 2014 08:27:09 -0500
parents c672e76503b2
children 15fd1a6798e0
files edgeR_DGE.xml
diffstat 1 files changed, 63 insertions(+), 61 deletions(-) [+]
line wrap: on
line diff
--- a/edgeR_DGE.xml	Thu Jan 09 06:55:28 2014 -0500
+++ b/edgeR_DGE.xml	Thu Jan 09 08:27:09 2014 -0500
@@ -72,7 +72,7 @@
 header = read.delim(as.character(raw_data[1,1]),header=F,stringsAsFactors=F,row.names=1,nrows=1)
 has_header = (class(header[1,1]) == "character")
 
-read_counts 		= read.delim(as.character(raw_data[1,1]),header=has_header,stringsAsFactors=F,row.names=1)
+read_counts = read.delim(as.character(raw_data[1,1]),header=has_header,stringsAsFactors=F,row.names=1)[1]
 
 for(i in 2:length(raw_data[,1])) {
   print("parsing counts from:")
@@ -80,11 +80,14 @@
   
   header = read.delim(as.character(raw_data[i,1]),header=F,stringsAsFactors=F,row.names=1,nrows=1)
   has_header = (class(header[1,1]) == "character")
+  table = read.delim(as.character(raw_data[i,1]),header=has_header,stringsAsFactors=F,row.names=1)[1]
   
-  read_counts = cbind(read_counts,read.delim(as.character(raw_data[i,1]),header=has_header,stringsAsFactors=F,row.names=1))
+  read_counts = cbind(read_counts,table)
   print(i)
 }
 
+colnames(read_counts) = as.character(raw_data[,2])
+
 
 
 ## Filter for HTSeq predifined counts:
@@ -99,83 +102,82 @@
 
 
 
-
+empty_samples = apply(read_counts,2,function(x) sum(x) == 0)
+if(sum(empty_samples) > 0) {
+  write(paste("There are ",sum(empty_samples)," empty samples found:",sep=""),stderr())
+  write(colnames(read_counts)[empty_samples],stderr())
+}
+else {
+  dge = DGEList(counts=read_counts,genes=rownames(read_counts))
 
+  design_tmp <- raw_data[3:length(raw_data)]
+  rownames(design_tmp)     <- colnames(dge)
+  formula = paste(c("~0",colnames(design_tmp)),collapse = " + ")
+  design <- model.matrix(as.formula(formula),design_tmp)
 
+  prefixes = colnames(design_tmp)[attr(design,"assign")]
+  avoid = nchar(prefixes) == nchar(colnames(design))
+  replacements = substr(colnames(design),nchar(prefixes)+1,nchar(colnames(design)))
+  replacements[avoid] = colnames(design)[avoid]
+  colnames(design) = replacements
 
 
 
-colnames(read_counts) = raw_data[,2]
-dge 					= DGEList(counts=read_counts,genes=rownames(read_counts))
-
-design_tmp <- raw_data[3:length(raw_data)]
-rownames(design_tmp)     <- colnames(dge)
-formula = paste(c("~0",colnames(design_tmp)),collapse = " + ")
-design <- model.matrix(as.formula(formula),design_tmp)
-
-prefixes = colnames(design_tmp)[attr(design,"assign")]
-avoid = nchar(prefixes) == nchar(colnames(design))
-replacements = substr(colnames(design),nchar(prefixes)+1,nchar(colnames(design)))
-replacements[avoid] = colnames(design)[avoid]
-colnames(design) = replacements
-
-
-
-print("Calculating normalization factors...")
-dge		= calcNormFactors(dge)
-print("Estimating common dispersion...")
-dge 	= estimateGLMCommonDisp(dge,design)
-print("Estimating trended dispersion...")
-dge 	= estimateGLMTrendedDisp(dge,design)
-print("Estimating tagwise dispersion...")
-dge 	= estimateGLMTagwiseDisp(dge,design)
+  print("Calculating normalization factors...")
+  dge = calcNormFactors(dge)
+  print("Estimating common dispersion...")
+  dge = estimateGLMCommonDisp(dge,design)
+  print("Estimating trended dispersion...")
+  dge = estimateGLMTrendedDisp(dge,design)
+  print("Estimating tagwise dispersion...")
+  dge = estimateGLMTagwiseDisp(dge,design)
 
 
 
 
-if (QC == TRUE) {
-  print("Creating QC plots...")
-  #### MDS Plot
-  pdf(output_5)
-  plotMDS(dge, main="edgeR MDS Plot")
-  dev.off()
-  #### Biological coefficient of variation plot
-  pdf(output_6)
-  plotBCV(dge, cex=0.4, main="edgeR: Biological coefficient of variation (BCV) vs abundance")
-  dev.off()
-}
+  if (QC == TRUE) {
+    print("Creating QC plots...")
+    #### MDS Plot
+    pdf(output_5)
+    plotMDS(dge, main="edgeR MDS Plot")
+    dev.off()
+    #### Biological coefficient of variation plot
+    pdf(output_6)
+    plotBCV(dge, cex=0.4, main="edgeR: Biological coefficient of variation (BCV) vs abundance")
+    dev.off()
+  }
 
 
 
-print("Fitting GLM...")
-fit 	= glmFit(dge,design)
+  print("Fitting GLM...")
+  fit   = glmFit(dge,design)
 
-print(paste("Performing likelihood ratio test: ",contrast,sep=""))
-cont <- c(contrast)
-cont <- makeContrasts(contrasts=cont, levels=design)
+  print(paste("Performing likelihood ratio test: ",contrast,sep=""))
+  cont <- c(contrast)
+  cont <- makeContrasts(contrasts=cont, levels=design)
 
-lrt <- glmLRT(fit, contrast=cont[,1])
-print(paste("Exporting to file: ",output_1,sep=""))
-write.table(file=output_1,topTags(lrt,n=nrow(read_counts))\$table,sep="\t",row.names=T)
-write.table(file=output_2,cpm(dge,normalized.lib.sizes=TRUE),sep="\t")
-## todo EXPORT FPKM
-write.table(file=output_4,dge\$counts,sep="\t")
+  lrt <- glmLRT(fit, contrast=cont[,1])
+  print(paste("Exporting to file: ",output_1,sep=""))
+  write.table(file=output_1,topTags(lrt,n=nrow(read_counts))\$table,sep="\t",row.names=T)
+  write.table(file=output_2,cpm(dge,normalized.lib.sizes=TRUE),sep="\t")
+  ## todo EXPORT FPKM
+  write.table(file=output_4,dge\$counts,sep="\t")
 
 
 
-if (QC == TRUE) {
-  print("Creating MA plots...")
-  
-  
-  etable <- topTags(lrt, n=nrow(dge))\$table
-  etable <- etable[order(etable\$FDR), ]
-  pdf(output_7)
-  with(etable, plot(logCPM, logFC, pch=20, main="edgeR: Fold change vs abundance"))
-  with(subset(etable, FDR<0.05), points(logCPM, logFC, pch=20, col="red"))
-  abline(h=c(-1,1), col="blue")
-  dev.off()
+  if (QC == TRUE) {
+    print("Creating MA plots...")
+    
+    etable <- topTags(lrt, n=nrow(dge))\$table
+    etable <- etable[order(etable\$FDR), ]
+    pdf(output_7)
+    with(etable, plot(logCPM, logFC, pch=20, main="edgeR: Fold change vs abundance"))
+    with(subset(etable, FDR<0.05), points(logCPM, logFC, pch=20, col="red"))
+    abline(h=c(-1,1), col="blue")
+    dev.off()
+  }
+  print("Done!")
 }
-print("Done!")
 		</configfile>
 	</configfiles>