view edgeR_DGE.xml @ 24:86f91bf4ab4c draft

Uploaded
author yhoogstrate
date Tue, 20 May 2014 05:26:25 -0400
parents fa476f8e1f9e
children
line wrap: on
line source

<?xml version="1.0" encoding="UTF-8"?>
<tool id="edger_dge" name="edgeR Differential GeneExpression Analysis">
	<description>RNA-Seq expression analysis using edgeR (R package)</description>
	
	<command>
		<!--
			The following script is written in the "Cheetah" language:
			http://www.cheetahtemplate.org/docs/users_guide_html_multipage/contents.html

			Runs the generated R_script configfile with R. Everything inside the
			single-quoted string is handed to the script as positional arguments,
			so the order below must stay in sync with the args[1] .. args[11]
			assignments at the top of R_script:
			  1 design matrix, 2 contrast, 3 DGE table, 4 CPM table,
			  5 literal placeholder "output_FPXM" (FPKM export is not implemented yet),
			  6 raw counts, 7 qc flag, 8 MDS plot, 9 BCV plot, 10 MA plot,
			  11 literal placeholder "smearPlot" (not used by R_script).

			R's stdout is redirected to the output_R dataset and its stderr to
			stderr.txt. The trailing grep re-emits stderr.txt on stderr without the
			lines containing 'Calculating library sizes from column'.

			NOTE(review): the grep is chained with a logical AND, so stderr.txt is
			only forwarded when R exits with status 0; confirm that this is intended.
		-->
		
		R --vanilla --slave -f $R_script '--args
			$design_matrix
			$contrast
			
			$output_count_edgeR 
			$output_cpm
			output_FPXM
			$output_raw_counts
			
			$qc
			$output_MDSplot
			$output_BCVplot
			$output_MAplot
			smearPlot '
			> $output_R
			2> stderr.txt
			&amp;&amp;
			grep -v 'Calculating library sizes from column' stderr.txt 1>&amp;2
	
	</command>
	
	<inputs>
		<!-- Column layout expected by R_script: 1 = count file per sample, 2 = sample name, 3 and up = factors -->
		<param name="design_matrix" type="data" format="tabular" label="Design matrix" help="Tabular design matrix, e.g. created with the 'create Design matrix' tool: column 1 holds the count file of each sample, column 2 the sample name and the remaining columns the experimental factors." />
		
		<!-- Passed verbatim to limma's makeContrasts() -->
		<param name="contrast" type="text" label="Contrast (biological question)" help="e.g. 'tumor-normal' or '(G1+G2)/2-G3' using the factors chosen in the design matrix. Read the 'makeContrasts' manual from Limma package for more info." />
		
		<!-- Controls whether the MDS, BCV and MA plot outputs are created (see the output filters) -->
		<param name="qc" type="select" label="Quality control reports">
			<option value="true" selected="true">Yes</option>
			<option value="false">No</option>
		</param>
		
		<!-- Controls whether R's stdout is kept as the "R output" dataset (see the output filters) -->
		<param name="debug" type="select" label="R Debug output">
			<option value="true"> Yes</option>
			<option value="false" selected="true">No</option>
		</param>
	</inputs>
	
	<configfiles>
		<configfile name="R_script">
## edgeR differential gene expression script, executed by the tool's command with R.
## Reads the per-sample count files listed in the design matrix, fits a GLM,
## runs a likelihood ratio test for the requested contrast and writes the result
## tables plus (optionally) the QC plots.
library(edgeR,quietly=TRUE) ## enable quietly to avoid unnecessary stderr dumping for loading limma

## Fetch commandline arguments (the positions are fixed by the tool's command template)
args &lt;- commandArgs(trailingOnly = TRUE)
designmatrix        = args[1]
contrast            = args[2]

output_1            = args[3]		## DGE table (topTags)
output_2            = args[4]		## normalized CPM table
output_3            = args[5]		## FPKM file - yet to be implemented
output_4            = args[6]		## raw counts

## The qc select passes the literal string "true" or "false". Both are non-empty,
## so the value itself has to be compared; testing its length is always TRUE.
QC                  = identical(args[7], "true")

output_5            = args[8]		## MDS plot (pdf)
output_6            = args[9]		## BCV plot (pdf)
output_7            = args[10]		## MA plot (pdf)

output_8            = args[11]		## smear plot placeholder - not used below


## Design matrix layout: column 1 = count file per sample, column 2 = sample name,
## columns 3 and up = experimental factors (kept as factors for model.matrix).
raw_data &lt;- read.delim(designmatrix,header=T,stringsAsFactors=T)

## Columns 1 and 2 are read as factors; convert them so that paths and names are
## used (and logged) as text instead of as integer factor codes.
count_files  = as.character(raw_data[,1])
sample_names = as.character(raw_data[,2])

## Read the first data column of one count file, using its first column as row names.
## The file is treated as having a header line when R parses the first value after
## the row name as character.
read_count_column = function(path) {
  write("parsing counts from:",stdout())
  write(path,stdout())

  first_line = read.delim(path,header=F,stringsAsFactors=F,row.names=1,nrows=1)
  has_header = (class(first_line[1,1]) == "character")
  read.delim(path,header=has_header,stringsAsFactors=F,row.names=1)[1]
}

## Obtain read-counts: one column per sample. Iterating over the file list also
## copes with a single-sample design matrix, where a 2:n loop would count backwards.
read_counts = do.call(cbind,lapply(count_files,read_count_column))
colnames(read_counts) = sample_names



## Filter for HTSeq predefined counts:
exclude_HTSeq = c("no_feature","ambiguous","too_low_aQual","not_aligned","alignment_not_unique")
exclude_DEXSeq = c("_ambiguous","_empty","_lowaqual","_notaligned")

exclude = match(c(exclude_HTSeq, exclude_DEXSeq),rownames(read_counts))
exclude = exclude[!is.na(exclude)]
if(length(exclude) != 0)  {
  ## drop=FALSE keeps a one-column table a data frame
  read_counts = read_counts[-exclude,,drop=FALSE]
}



## Samples without any counts cannot be analysed: report them on stderr and stop here.
empty_samples = apply(read_counts,2,function(x) sum(x) == 0)
if(sum(empty_samples) > 0) {
  write(paste("There are ",sum(empty_samples)," empty samples found:",sep=""),stderr())
  write(colnames(read_counts)[empty_samples],stderr())
} else {
  dge = DGEList(counts=read_counts,genes=rownames(read_counts))

  ## Build the model matrix from the factor columns, without intercept
  design_tmp &lt;- raw_data[3:length(raw_data)]
  rownames(design_tmp)     &lt;- colnames(dge)
  formula = paste(c("~0",colnames(design_tmp)),collapse = " + ")
  design &lt;- model.matrix(as.formula(formula),design_tmp)

  ## model.matrix prefixes each level with its factor's column name; strip that prefix
  ## so the contrast can be written with the bare level names. Columns whose name
  ## equals the prefix (nothing would be left after stripping) keep their name.
  prefixes = colnames(design_tmp)[attr(design,"assign")]
  avoid = nchar(prefixes) == nchar(colnames(design))
  replacements = substr(colnames(design),nchar(prefixes)+1,nchar(colnames(design)))
  replacements[avoid] = colnames(design)[avoid]
  colnames(design) = replacements



  write("Calculating normalization factors...",stdout())
  dge = calcNormFactors(dge)
  write("Estimating common dispersion...",stdout())
  dge = estimateGLMCommonDisp(dge,design)
  write("Estimating trended dispersion...",stdout())
  dge = estimateGLMTrendedDisp(dge,design)
  write("Estimating tagwise dispersion...",stdout())
  dge = estimateGLMTagwiseDisp(dge,design)




  if(QC) {
    write("Creating QC plots...",stdout())
    #### MDS Plot
    pdf(output_5)
    plotMDS(dge, main="edgeR MDS Plot")
    dev.off()
    #### Biological coefficient of variation plot
    pdf(output_6)
    plotBCV(dge, cex=0.4, main="edgeR: Biological coefficient of variation (BCV) vs abundance")
    dev.off()
  }



  write("Fitting GLM...",stdout())
  fit   = glmFit(dge,design)

  write(paste("Performing likelihood ratio test: ",contrast,sep=""),stdout())
  cont &lt;- c(contrast)
  cont &lt;- makeContrasts(contrasts=cont, levels=design)

  lrt &lt;- glmLRT(fit, contrast=cont[,1])
  write(paste("Exporting to file: ",output_1,sep=""),stdout())
  write.table(file=output_1,topTags(lrt,n=nrow(read_counts))\$table,sep="\t",row.names=T)
  write.table(file=output_2,cpm(dge,normalized.lib.sizes=TRUE),sep="\t")
  ## todo EXPORT FPKM
  write.table(file=output_4,dge\$counts,sep="\t")
  
  
  
  if(QC) {
    write("Creating MA plots...",stdout())
    
    ## All tags ordered by FDR; tags with FDR below 0.05 are overplotted in red
    etable &lt;- topTags(lrt, n=nrow(dge))\$table
    etable &lt;- etable[order(etable\$FDR), ]
    pdf(output_7)
    with(etable, plot(logCPM, logFC, pch=20, main="edgeR: Fold change vs abundance"))
    with(subset(etable, FDR&lt;0.05), points(logCPM, logFC, pch=20, col="red"))
    abline(h=c(-1,1), col="blue")
    dev.off()
  }
  write("Done!",stdout())
}
		</configfile>
	</configfiles>
	
	<outputs>
		<!-- Result tables (declared without a filter) -->
		<data name="output_count_edgeR" format="tabular" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - table" />
		<data name="output_cpm" format="tabular" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - CPM" />
		<data name="output_raw_counts" format="tabular" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - raw counts" />

		<!-- Captured stdout of R; only kept when the "R Debug output" input is set to Yes -->
		<data name="output_R" format="txt" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - R output">
			<filter>(debug == "true")</filter>
		</data>

		<!-- QC plots; only kept when the "Quality control reports" input is set to Yes -->
		<data name="output_MDSplot" format="pdf" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MDS-plot">
			<filter>(qc == "true")</filter>
		</data>
		<data name="output_BCVplot" format="pdf" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - BCV-plot">
			<filter>(qc == "true")</filter>
		</data>
		<data name="output_MAplot" format="pdf" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MA-plot">
			<filter>(qc == "true")</filter>
		</data>
	</outputs>
	
	<help>
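		Input 1: a design matrix, created with the "create Design matrix" tool.
		Input 2: the contrast to test, written with the factor levels of the design matrix (e.g. 'tumor-normal').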
	</help>
</tool>