comparison rgedgeRpaired_nocamera.xml @ 124:731315bd6e48 draft

Uploaded
author fubar
date Tue, 25 Nov 2014 05:50:07 -0500
parents 51f998262ada
children 999d4b5939bb
comparison
equal deleted inserted replaced
123:51f998262ada 124:731315bd6e48
6 <requirement type="package" version="9.10">ghostscript</requirement> 6 <requirement type="package" version="9.10">ghostscript</requirement>
7 <requirement type="package" version="2.14">biocbasics</requirement> 7 <requirement type="package" version="2.14">biocbasics</requirement>
8 </requirements> 8 </requirements>
9 9
10 <command interpreter="python"> 10 <command interpreter="python">
11 rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "DifferentialCounts" 11 rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "Differential_Counts"
12 --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes" 12 --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes"
13 </command> 13 </command>
14 <inputs> 14 <inputs>
15 <param name="input1" type="data" format="tabular" label="Select an input matrix - rows are contigs, columns are counts for each sample" 15 <param name="input1" type="data" format="tabular" label="Select an input matrix - rows are contigs, columns are counts for each sample"
16 help="Use the HTSeq based count matrix preparation tool to create these matrices from BAM/SAM files and a GTF file of genomic features"/> 16 help="Use the HTSeq based count matrix preparation tool to create these matrices from BAM/SAM files and a GTF file of genomic features"/>
20 <valid initial="string.letters,string.digits"><add value="_" /> </valid> 20 <valid initial="string.letters,string.digits"><add value="_" /> </valid>
21 </sanitizer> 21 </sanitizer>
22 </param> 22 </param>
23 <param name="treatment_name" type="text" value="Treatment" size="50" label="Treatment Name"/> 23 <param name="treatment_name" type="text" value="Treatment" size="50" label="Treatment Name"/>
24 <param name="Treat_cols" label="Select columns containing treatment." type="data_column" data_ref="input1" numerical="True" 24 <param name="Treat_cols" label="Select columns containing treatment." type="data_column" data_ref="input1" numerical="True"
25 multiple="true" use_header_names="true" size="120" display="checkboxes"> 25 multiple="true" use_header_names="true" size="120" display="checkboxes" force_select="True">
26 <validator type="no_options" message="Please select at least one column."/> 26 <validator type="no_options" message="Please select at least one column."/>
27 </param> 27 </param>
28 <param name="control_name" type="text" value="Control" size="50" label="Control Name"/> 28 <param name="control_name" type="text" value="Control" size="50" label="Control Name"/>
29 <param name="Control_cols" label="Select columns containing control." type="data_column" data_ref="input1" numerical="True" 29 <param name="Control_cols" label="Select columns containing control." type="data_column" data_ref="input1" numerical="True"
30 multiple="true" use_header_names="true" size="120" display="checkboxes" optional="true"> 30 multiple="true" use_header_names="true" size="120" display="checkboxes" force_select="True">
31 </param> 31 </param>
32 <param name="subjectids" type="text" optional="true" size="120" value = "" 32 <param name="subjectids" type="text" optional="true" size="120" value = ""
33 label="IF SUBJECTS NOT ALL INDEPENDENT! Enter comma separated strings to indicate sample labels for (eg) pairing - must be one for every column in input" 33 label="IF SUBJECTS NOT ALL INDEPENDENT! Enter comma separated strings to indicate sample labels for (eg) pairing - must be one for every column in input"
34 help="Leave blank if no pairing, but eg if data from sample id A99 is in columns 2,4 and id C21 is in 3,5 then enter 'A99,C21,A99,C21'"> 34 help="Leave blank if no pairing, but eg if data from sample id A99 is in columns 2,4 and id C21 is in 3,5 then enter 'A99,C21,A99,C21'">
35 <sanitizer> 35 <sanitizer>
83 label="Run the same model with Voom/limma and compare findings" 83 label="Run the same model with Voom/limma and compare findings"
84 help="Voom uses counts per million and a precise transformation of variance so count data can be analysed using limma"> 84 help="Voom uses counts per million and a precise transformation of variance so count data can be analysed using limma">
85 <option value="F" selected="true">Do not run VOOM</option> 85 <option value="F" selected="true">Do not run VOOM</option>
86 <option value="T">Run VOOM</option> 86 <option value="T">Run VOOM</option>
87 </param> 87 </param>
88 <!--
89 <conditional name="camera">
90 <param name="doCamera" type="select" label="Run the edgeR implementation of Camera GSEA for up/down gene sets"
91 help="If yes, you can choose a set of genesets to test and/or supply a gmt format geneset collection from your history">
92 <option value="F" selected="true">Do not run GSEA tests with the Camera algorithm</option>
93 <option value="T">Run GSEA tests with the Camera algorithm</option>
94 </param>
95 <when value="T">
96 <conditional name="gmtSource">
97 <param name="refgmtSource" type="select"
98 label="Use a gene set (.gmt) from your history and/or use a built-in (MSigDB etc) gene set">
99 <option value="indexed" selected="true">Use a built-in gene set</option>
100 <option value="history">Use a gene set from my history</option>
101 <option value="both">Add a gene set from my history to a built in gene set</option>
102 </param>
103 <when value="indexed">
104 <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis">
105 <options from_data_table="gseaGMT_3.1">
106 <filter type="sort_by" column="2" />
107 <validator type="no_options" message="No GMT v3.1 files are available - please install them"/>
108 </options>
109 </param>
110 </when>
111 <when value="history">
112 <param name="ownGMT" type="data" format="gmt" label="Select a Gene Set from your history" />
113 </when>
114 <when value="both">
115 <param name="ownGMT" type="data" format="gseagmt" label="Select a Gene Set from your history" />
116 <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis">
117 <options from_data_table="gseaGMT_4">
118 <filter type="sort_by" column="2" />
119 <validator type="no_options" message="No GMT v4 files are available - please fix tool_data_table and loc files"/>
120 </options>
121 </param>
122 </when>
123 </conditional>
124 </when>
125 <when value="F">
126 </when>
127 </conditional>
128 -->
129 <param name="fdrthresh" type="float" value="0.05" size="5" label="P value threshold for FDR filtering for amily wise error rate control" 88 <param name="fdrthresh" type="float" value="0.05" size="5" label="P value threshold for FDR filtering for amily wise error rate control"
130 help="Conventional default value of 0.05 recommended"/> 89 help="Conventional default value of 0.05 recommended"/>
131 <param name="fdrtype" type="select" label="FDR (Type II error) control method" 90 <param name="fdrtype" type="select" label="FDR (Type II error) control method"
132 help="Use fdr or bh typically to control for the number of tests in a reliable way"> 91 help="Use fdr or bh typically to control for the number of tests in a reliable way">
133 <option value="fdr" selected="true">fdr</option> 92 <option value="fdr" selected="true">fdr</option>
270 grid(col="lightgray", lty="dotted") 229 grid(col="lightgray", lty="dotted")
271 dev.off() 230 dev.off()
272 } 231 }
273 232
274 boxPlot = function(rawrs,cleanrs,maint,myTitle,pdfname) 233 boxPlot = function(rawrs,cleanrs,maint,myTitle,pdfname)
275 { # 234 {
276 nc = ncol(rawrs) 235 nc = ncol(rawrs)
277 for (i in c(1:nc)) {rawrs[(rawrs[,i] < 0),i] = NA} 236 ##### for (i in c(1:nc)) {rawrs[(rawrs[,i] < 0),i] = NA}
278 fullnames = colnames(rawrs) 237 fullnames = colnames(rawrs)
279 newcolnames = substr(colnames(rawrs),1,20) 238 newcolnames = substr(colnames(rawrs),1,20)
280 colnames(rawrs) = newcolnames 239 colnames(rawrs) = newcolnames
281 newcolnames = substr(colnames(cleanrs),1,20) 240 newcolnames = substr(colnames(cleanrs),1,20)
282 colnames(cleanrs) = newcolnames 241 colnames(cleanrs) = newcolnames
283 defpar = par(no.readonly=T) 242 defpar = par(no.readonly=T)
284 print.noquote('raw contig counts by sample:') 243 print.noquote('@@@ Raw contig counts by sample:')
285 print.noquote(summary(rawrs)) 244 print.noquote(summary(rawrs))
286 print.noquote('normalised contig counts by sample:') 245 print.noquote('@@@ Library size contig counts by sample:')
287 print.noquote(summary(cleanrs)) 246 print.noquote(summary(cleanrs))
288 pdf(pdfname) 247 pdf(pdfname)
289 par(mfrow=c(1,2)) 248 par(mfrow=c(1,2))
290 boxplot(rawrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('Raw:',maint)) 249 boxplot(rawrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main='log2 raw counts')
291 grid(col="lightgray",lty="dotted") 250 grid(col="lightgray",lty="dotted")
292 boxplot(cleanrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('After ',maint)) 251 boxplot(cleanrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('log2 counts after ',maint))
293 grid(col="lightgray",lty="dotted") 252 grid(col="lightgray",lty="dotted")
294 dev.off() 253 dev.off()
295 pdfname = "sample_counts_histogram.pdf" 254 pdfname = "sample_counts_histogram.pdf"
296 nc = ncol(rawrs) 255 nc = ncol(rawrs)
297 print.noquote(paste('Using ncol rawrs=',nc)) 256 print.noquote(paste('Using ncol rawrs=',nc))
321 280
322 } 281 }
323 282
324 cumPlot = function(rawrs,cleanrs,maint,myTitle) 283 cumPlot = function(rawrs,cleanrs,maint,myTitle)
325 { # updated to use ecdf 284 { # updated to use ecdf
326 pdfname = "Filtering_rowsum_bar_charts.pdf" 285 pdfname = "Differential_rowsum_bar_charts.pdf"
327 defpar = par(no.readonly=T) 286 defpar = par(no.readonly=T)
328 lrs = log(rawrs,10) 287 lrs = log(rawrs,10)
329 lim = max(lrs) 288 lim = max(lrs)
330 pdf(pdfname) 289 pdf(pdfname)
331 par(mfrow=c(2,1)) 290 par(mfrow=c(2,1))
515 } 474 }
516 475
517 476
518 edgeIt = function (Count_Matrix=c(),group=c(),out_edgeR=F,out_Voom=F,out_DESeq2=F,fdrtype='fdr',priordf=5, 477 edgeIt = function (Count_Matrix=c(),group=c(),out_edgeR=F,out_Voom=F,out_DESeq2=F,fdrtype='fdr',priordf=5,
519 fdrthresh=0.05,outputdir='.', myTitle='Differential Counts',libSize=c(),useNDF=F, 478 fdrthresh=0.05,outputdir='.', myTitle='Differential Counts',libSize=c(),useNDF=F,
520 filterquantile=0.2, subjects=c(),mydesign=NULL, 479 filterquantile=0.2, subjects=c(),TreatmentName="Rx",ControlName="Ctrl",mydesign=NULL,
521 doDESeq2=T,doVoom=T,doCamera=T,doedgeR=T,org='hg19', 480 doDESeq2=T,doVoom=T,doCamera=T,doedgeR=T,org='hg19',
522 histgmt="", bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt", 481 histgmt="", bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt",
523 doCook=F,DESeq_fitType="parameteric",robust_meth='ordinary') 482 doCook=F,DESeq_fitType="parameteric",robust_meth='ordinary')
524 { 483 {
525 484
526 485 logf = file('Differential.log', open = "a")
527 run_edgeR = function(workCM,pdata,subjects,group,priordf,robust_meth,mydesign,mt,cmrowsums,out_edgeR) 486 sink(logf,type = c("output", "message"))
487
488
489 run_edgeR = function(workCM,pdata,subjects,group,priordf,robust_meth,mydesign,mt,cmrowsums,out_edgeR,nonzerod)
528 { 490 {
529 sink('edgeR.log') 491 logf = file('edgeR.log', open = "a")
492 sink(logf,type = c("output", "message"))
530 #### Setup myDGEList object 493 #### Setup myDGEList object
531 myDGEList = DGEList(counts=workCM, group = group) 494 myDGEList = DGEList(counts=workCM, group = group)
532 myDGEList = calcNormFactors(myDGEList) 495 myDGEList = calcNormFactors(myDGEList)
533 if (robust_meth == 'ordinary') { 496 if (robust_meth == 'ordinary') {
534 myDGEList = estimateGLMCommonDisp(myDGEList,mydesign) 497 myDGEList = estimateGLMCommonDisp(myDGEList,mydesign)
543 } 506 }
544 507
545 508
546 DGLM = glmFit(myDGEList,design=mydesign) 509 DGLM = glmFit(myDGEList,design=mydesign)
547 DE = glmLRT(DGLM,coef=ncol(DGLM\$design)) # always last one - subject is first if needed 510 DE = glmLRT(DGLM,coef=ncol(DGLM\$design)) # always last one - subject is first if needed
548 efflib = myDGEList\$samples\$lib.size*myDGEList\$samples\$norm.factors
549 normData = cpm(myDGEList) 511 normData = cpm(myDGEList)
550 ### normData = (1e+06*myDGEList\$counts/efflib)
551 uoutput = cbind( 512 uoutput = cbind(
552 Name=as.character(rownames(myDGEList\$counts)), 513 Name=as.character(rownames(myDGEList\$counts)),
553 DE\$table, 514 DE\$table,
554 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype), 515 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
555 Dispersion=myDGEList\$tagwise.dispersion,totreads=cmrowsums,normData, 516 Dispersion=myDGEList\$tagwise.dispersion,totreads=cmrowsums,normData,
568 qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion") 529 qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion")
569 abline(0,1,lwd=3) 530 abline(0,1,lwd=3)
570 points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="maroon") 531 points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="maroon")
571 dev.off() 532 dev.off()
572 uniqueg = unique(group) 533 uniqueg = unique(group)
573 #### Plot MDS
574 sample_colors = match(group,levels(group))
575 sampleTypes = levels(factor(group))
576 print.noquote(sampleTypes)
577 pdf(paste("edgeR",mt,"MDSplot.pdf",sep='_'))
578 plotMDS.DGEList(myDGEList,main=paste("edgeR MDS for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors)
579 legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19)
580 grid(col="blue")
581 dev.off()
582 colnames(normData) = paste( colnames(normData),'N',sep="_")
583 print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=',')))
584 nzd = data.frame(log(nonzerod + 1e-2,10))
585 try( boxPlot(rawrs=nzd,cleanrs=log(normData,10),maint='TMM Normalisation',myTitle=myTitle,pdfname=paste("edgeR",mt,"raw_norm_counts_box.pdf",sep='_') ))
586 write.table(soutput,file=out_edgeR, quote=FALSE, sep="\t",row.names=F) 534 write.table(soutput,file=out_edgeR, quote=FALSE, sep="\t",row.names=F)
587 tt = cbind( 535 tt = cbind(
588 Name=as.character(rownames(myDGEList)), 536 Name=as.character(rownames(myDGEList)),
589 DE\$table, 537 DE\$table,
590 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype), 538 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype),
591 Dispersion=myDGEList\$tagwise.dispersion,totreads=cmrowsums 539 Dispersion=myDGEList\$tagwise.dispersion,totreads=cmrowsums
592 ) 540 )
593 print.noquote("# edgeR Top tags\n") 541 print.noquote("@@ edgeR Top tags\n")
594 tt = cbind(tt,URL=contigurls) # add to end so table isn't laid out strangely 542 tt = cbind(tt,URL=contigurls) # add to end so table isn't laid out strangely
595 tt = tt[order(DE\$table\$PValue),]
596 print.noquote(tt[1:50,]) 543 print.noquote(tt[1:50,])
597 deTags = rownames(uoutput[uoutput\$adj.p.value < fdrthresh,]) 544 deTags = rownames(uoutput[uoutput\$adj.p.value < fdrthresh,])
598 nsig = length(deTags) 545 nsig = length(deTags)
599 print(paste('#',nsig,'tags significant at adj p=',fdrthresh),quote=F) 546 print.noquote(paste('@@',nsig,'tags significant at adj p=',fdrthresh))
600 deColours = ifelse(deTags,'red','black') 547 deColours = ifelse(deTags,'red','black')
601 pdf(paste("edgeR",mt,"BCV_vs_abundance.pdf",sep="_")) 548 pdf(paste("edgeR",mt,"BCV_vs_abundance.pdf",sep="_"))
602 plotBCV(myDGEList, cex=0.3, main="Biological CV vs abundance") 549 plotBCV(myDGEList, cex=0.3, main="Biological CV vs abundance")
603 dev.off() 550 dev.off()
604 dg = myDGEList[order(DE\$table\$PValue),] 551 dg = myDGEList[order(DE\$table\$PValue),]
605 #normData = (1e+06 * dg\$counts/expandAsMatrix(dg\$samples\$lib.size, dim(dg)))
606 outpdfname= paste("edgeR",mt,"top_100_heatmap.pdf",sep="_") 552 outpdfname= paste("edgeR",mt,"top_100_heatmap.pdf",sep="_")
607 hmap2(normData,nsamp=100,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste(myTitle,'Heatmap')) 553 ocpm = normData[order(DE\$table\$PValue),]
554 ocpm = ocpm[c(1:100),]
555 hmap2(ocpm,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste(myTitle,'Heatmap'))
608 outSmear = paste("edgeR",mt,"smearplot.pdf",sep="_") 556 outSmear = paste("edgeR",mt,"smearplot.pdf",sep="_")
609 outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='') 557 outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='')
610 smearPlot(myDGEList=myDGEList,deTags=deTags, outSmear=outSmear, outMain = outMain) 558 smearPlot(myDGEList=myDGEList,deTags=deTags, outSmear=outSmear, outMain = outMain)
611 qqPlot(descr=paste(myTitle,'edgeR adj p QQ plot'),pvector=tt\$adj.p.value,outpdf=paste('edgeR',mt,'qqplot.pdf',sep='_')) 559 qqPlot(descr=paste(myTitle,'edgeR adj p QQ plot'),pvector=tt\$adj.p.value,outpdf=paste('edgeR',mt,'qqplot.pdf',sep='_'))
612 norm.factor = myDGEList\$samples\$norm.factors
613 topresults.edgeR = soutput[which(soutput\$adj.p.value < fdrthresh), ] 560 topresults.edgeR = soutput[which(soutput\$adj.p.value < fdrthresh), ]
614 edgeRcountsindex = which(allgenes %in% rownames(topresults.edgeR)) 561 edgeRcountsindex = which(allgenes %in% rownames(topresults.edgeR))
615 edgeRcounts = rep(0, length(allgenes)) 562 edgeRcounts = rep(0, length(allgenes))
616 edgeRcounts[edgeRcountsindex] = 1 # Create venn diagram of hits 563 edgeRcounts[edgeRcountsindex] = 1 # Create venn diagram of hits
617 sink() 564 sink()
620 567
621 568
622 run_DESeq2 = function(workCM,pdata,subjects,group,out_DESeq2,mt,DESeq_fitType) 569 run_DESeq2 = function(workCM,pdata,subjects,group,out_DESeq2,mt,DESeq_fitType)
623 570
624 { 571 {
625 sink("DESeq2.log") 572 logf = file("DESeq2.log", open = "a")
573 sink(logf,type = c("output", "message"))
626 # DESeq2 574 # DESeq2
627 require('DESeq2') 575 require('DESeq2')
628 library('RColorBrewer') 576 library('RColorBrewer')
629 if (length(subjects) == 0) 577 if (length(subjects) == 0)
630 { 578 {
632 deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ Rx)) 580 deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ Rx))
633 } else { 581 } else {
634 pdata = data.frame(Name=colnames(workCM),Rx=group,subjects=subjects,row.names=colnames(workCM)) 582 pdata = data.frame(Name=colnames(workCM),Rx=group,subjects=subjects,row.names=colnames(workCM))
635 deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ subjects + Rx)) 583 deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ subjects + Rx))
636 } 584 }
637 #DESeq2 = DESeq(deSEQds,fitType='local',pAdjustMethod=fdrtype)
638 #rDESeq = results(DESeq2)
639 #newCountDataSet(workCM, group)
640 deSeqDatsizefac = estimateSizeFactors(deSEQds) 585 deSeqDatsizefac = estimateSizeFactors(deSEQds)
641 deSeqDatdisp = estimateDispersions(deSeqDatsizefac,fitType=DESeq_fitType) 586 deSeqDatdisp = estimateDispersions(deSeqDatsizefac,fitType=DESeq_fitType)
642 resDESeq = nbinomWaldTest(deSeqDatdisp) 587 resDESeq = nbinomWaldTest(deSeqDatdisp)
643 rDESeq = as.data.frame(results(resDESeq)) 588 rDESeq = as.data.frame(results(resDESeq))
644 rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums,URL=contigurls) 589 rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums,URL=contigurls)
665 sdmat = as.matrix(sampledists) 610 sdmat = as.matrix(sampledists)
666 pdf(paste("DESeq2",mt,"sample_distance_plot.pdf",sep="_")) 611 pdf(paste("DESeq2",mt,"sample_distance_plot.pdf",sep="_"))
667 heatmap.2(sdmat,trace="none",main=paste(myTitle,"DESeq2 sample distances"), 612 heatmap.2(sdmat,trace="none",main=paste(myTitle,"DESeq2 sample distances"),
668 col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255)) 613 col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255))
669 dev.off() 614 dev.off()
670 ###outpdfname=paste("DESeq2",mt,"top50_heatmap.pdf",sep="_")
671 ###hmap2(sresDESeq,nsamp=50,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('DESeq2 vst rlog Heatmap',myTitle))
672 sink()
673 result = try( (ppca = plotPCA( varianceStabilizingTransformation(deSeqDatdisp,blind=T), intgroup=c("Rx","Name")) ) ) 615 result = try( (ppca = plotPCA( varianceStabilizingTransformation(deSeqDatdisp,blind=T), intgroup=c("Rx","Name")) ) )
674 if ("try-error" %in% class(result)) { 616 if ("try-error" %in% class(result)) {
675 print.noquote('DESeq2 plotPCA failed.') 617 print.noquote('DESeq2 plotPCA failed.')
676 } else { 618 } else {
677 pdf(paste("DESeq2",mt,"PCA_plot.pdf",sep="_")) 619 pdf(paste("DESeq2",mt,"PCA_plot.pdf",sep="_"))
678 #### wtf - print? Seems needed to get this to work 620 #### wtf - print? Seems needed to get this to work
679 print(ppca) 621 print(ppca)
680 dev.off() 622 dev.off()
681 } 623 }
624 sink()
682 return(DESeqcounts) 625 return(DESeqcounts)
683 } 626 }
684 627
685 628
686 run_Voom = function(workCM,pdata,subjects,group,mydesign,mt,out_Voom) 629 run_Voom = function(workCM,pdata,subjects,group,mydesign,mt,out_Voom)
687 { 630 {
688 sink('VOOM.log') 631 logf = file('VOOM.log', open = "a")
689 if (doedgeR == F) { 632 sink(logf,type = c("output", "message"))
690 #### Setup myDGEList object 633 if (doedgeR == F) {
691 myDGEList = DGEList(counts=workCM, group = group) 634 #### Setup myDGEList object
692 myDGEList = calcNormFactors(myDGEList) 635 myDGEList = DGEList(counts=workCM, group = group)
693 myDGEList = estimateGLMCommonDisp(myDGEList,mydesign) 636 myDGEList = calcNormFactors(myDGEList)
694 myDGEList = estimateGLMTrendedDisp(myDGEList,mydesign) 637 myDGEList = estimateGLMCommonDisp(myDGEList,mydesign)
695 myDGEList = estimateGLMTagwiseDisp(myDGEList,mydesign) 638 myDGEList = estimateGLMTrendedDisp(myDGEList,mydesign)
696 } 639 myDGEList = estimateGLMTagwiseDisp(myDGEList,mydesign)
697 pdf(paste("VOOM",mt,"mean_variance_plot.pdf",sep='_')) 640 }
698 dat.voomed <- voom(myDGEList, mydesign, plot = TRUE, normalize.method="quantil", lib.size = NULL) 641 pdf(paste("VOOM",mt,"mean_variance_plot.pdf",sep='_'))
699 dev.off() 642 dat.voomed <- voom(myDGEList, mydesign, plot = TRUE, normalize.method="quantil", lib.size = NULL)
700 # Use limma to fit data 643 dev.off()
701 fit = lmFit(dat.voomed, mydesign) 644 # Use limma to fit data
702 fit = eBayes(fit) 645 fit = lmFit(dat.voomed, mydesign)
703 rvoom = topTable(fit, coef = length(colnames(mydesign)), adj = fdrtype, n = Inf, sort="none") 646 fit = eBayes(fit)
704 qqPlot(descr=paste(myTitle,'VOOM-limma adj p QQ plot'),pvector=rvoom\$adj.P.Val,outpdf=paste('VOOM',mt,'qqplot.pdf',sep='_')) 647 rvoom = topTable(fit, coef = length(colnames(mydesign)), adj = fdrtype, n = Inf, sort="none")
705 rownames(rvoom) = rownames(workCM) 648 qqPlot(descr=paste(myTitle,'VOOM-limma adj p QQ plot'),pvector=rvoom\$adj.P.Val,outpdf=paste('VOOM',mt,'qqplot.pdf',sep='_'))
706 rvoom = cbind(rvoom,NReads=cmrowsums,URL=contigurls) 649 rownames(rvoom) = rownames(workCM)
707 srvoom = rvoom[order(rvoom\$P.Value),] 650 rvoom = cbind(Contig=rownames(workCM),rvoom,NReads=cmrowsums,URL=contigurls)
708 cat("# VOOM top 50\n") 651 srvoom = rvoom[order(rvoom\$P.Value),]
709 print(srvoom[1:50,]) 652 cat("# VOOM top 50\n")
710 write.table(srvoom,file=out_Voom, quote=FALSE, sep="\t",row.names=F) 653 print(srvoom[1:50,])
711 # Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma 654 write.table(srvoom,file=out_Voom, quote=FALSE, sep="\t",row.names=F)
712 topresults.voom = rvoom[which(rvoom\$adj.P.Val < fdrthresh), ] 655 # Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma
713 voomcountsindex <- which(allgenes %in% rownames(topresults.voom)) 656 topresults.voom = rvoom[which(rvoom\$adj.P.Val < fdrthresh), ]
714 voomcounts = rep(0, length(allgenes)) 657 voomcountsindex <- which(allgenes %in% rownames(topresults.voom))
715 voomcounts[voomcountsindex] = 1 658 voomcounts = rep(0, length(allgenes))
716 sink() 659 voomcounts[voomcountsindex] = 1
717 return(voomcounts) 660 sink()
718 } 661 return(voomcounts)
662 }
719 663
720 664
721 #### data cleaning and analsis control starts here 665 #### data cleaning and analsis control starts here
666
722 667
723 # Error handling 668 # Error handling
724 nugroup = length(unique(group)) 669 nugroup = length(unique(group))
725 if (nugroup!=2){ 670 if (nugroup!=2){
726 print("Number of conditions identified in experiment does not equal 2") 671 print("Number of conditions identified in experiment does not equal 2")
736 rawrs = rowSums(Count_Matrix) 681 rawrs = rowSums(Count_Matrix)
737 nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes 682 nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes
738 nzN = nrow(nonzerod) 683 nzN = nrow(nonzerod)
739 nzrs = rowSums(nonzerod) 684 nzrs = rowSums(nonzerod)
740 zN = allN - nzN 685 zN = allN - nzN
741 print('# Quantiles for non-zero row counts:',quote=F) 686 print('@@@ Quantiles for non-zero row counts:',quote=F)
742 print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F) 687 print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F)
743 if (useNDF == T) 688 if (useNDF == T)
744 { 689 {
745 gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut 690 gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut
746 lo = colSums(Count_Matrix[!gt1rpin3,]) 691 lo = colSums(Count_Matrix[!gt1rpin3,])
772 contigurls = paste0(ucsc,"&amp;position=chr",testreg[,2],":",testreg[,3],"-",testreg[,4],"\'>",allgenes,"</a>") 717 contigurls = paste0(ucsc,"&amp;position=chr",testreg[,2],":",testreg[,3],"-",testreg[,4],"\'>",allgenes,"</a>")
773 } else { 718 } else {
774 print("@@ using genecards substitution for urls") 719 print("@@ using genecards substitution for urls")
775 contigurls = paste0(genecards,allgenes,"\'>",allgenes,"</a>") 720 contigurls = paste0(genecards,allgenes,"\'>",allgenes,"</a>")
776 } 721 }
777 print.noquote("# urls sample") 722 print.noquote(paste("@@ Total low count contigs per sample = ",paste(table(lo),collapse=',')))
778 print.noquote(head(contigurls))
779 print(paste("# Total low count contigs per sample = ",table(lo)),quote=F)
780 cmrowsums = rowSums(workCM) 723 cmrowsums = rowSums(workCM)
781 TName=unique(group)[1] 724 TName=unique(group)[1]
782 CName=unique(group)[2] 725 CName=unique(group)[2]
783 if (is.null(mydesign)) { 726 if (is.null(mydesign)) {
784 if (length(subjects) == 0) 727 if (length(subjects) == 0)
791 } 734 }
792 } 735 }
793 print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=','))) 736 print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=',')))
794 print.noquote('Using design matrix:') 737 print.noquote('Using design matrix:')
795 print.noquote(mydesign) 738 print.noquote(mydesign)
739 normData = cpm(workCM)*1e6
740 colnames(normData) = paste( colnames(workCM),'N',sep="_")
741 print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=',')))
742
796 if (doedgeR == T) { 743 if (doedgeR == T) {
797 eres = run_edgeR(workCM,pdata,subjects,group,priordf,robust_meth,mydesign,mt,cmrowsums,out_edgeR) 744 eres = run_edgeR(workCM,pdata,subjects,group,priordf,robust_meth,mydesign,mt,cmrowsums,out_edgeR,nonzerod)
798 myDGEList = eres\$myDGEList 745 myDGEList = eres\$myDGEList
799 edgeRcounts = eres\$edgeRcounts 746 edgeRcounts = eres\$edgeRcounts
747 #### Plot MDS
748 sample_colors = match(group,levels(group))
749 sampleTypes = levels(factor(group))
750 print.noquote(sampleTypes)
751 pdf(paste("edgeR",mt,"MDSplot.pdf",sep='_'))
752 plotMDS.DGEList(myDGEList,main=paste("MDS for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors)
753 legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19)
754 grid(col="blue")
755 dev.off()
756 scale <- myDGEList\$samples\$lib.size*myDGEList\$samples\$norm.factors
757 normCounts <- round(t(t(myDGEList\$counts)/scale)*mean(scale))
758 try({boxPlot(rawrs=nzd,cleanrs=log2(normCounts+1),maint='Effects of TMM size normalisation',myTitle=myTitle,pdfname=paste("edgeR",mt,"raw_norm_counts_box.pdf",sep='_'))},T)
800 } 759 }
801 if (doDESeq2 == T) { DESeqcounts = run_DESeq2(workCM,pdata,subjects,group,out_DESeq2,mt,DESeq_fitType) } 760 if (doDESeq2 == T) { DESeqcounts = run_DESeq2(workCM,pdata,subjects,group,out_DESeq2,mt,DESeq_fitType) }
802 if (doVoom == T) { voomcounts = run_Voom(workCM,pdata,subjects,group,mydesign,mt,out_Voom) } 761 if (doVoom == T) { voomcounts = run_Voom(workCM,pdata,subjects,group,mydesign,mt,out_Voom) }
803 762
804 763
821 counts.dataframe = data.frame(edgeR = edgeRcounts, VOOM_limma = voomcounts, row.names = allgenes) 780 counts.dataframe = data.frame(edgeR = edgeRcounts, VOOM_limma = voomcounts, row.names = allgenes)
822 } 781 }
823 782
824 if (nrow(counts.dataframe > 1)) { 783 if (nrow(counts.dataframe > 1)) {
825 counts.venn = vennCounts(counts.dataframe) 784 counts.venn = vennCounts(counts.dataframe)
826 vennf = paste("Venn",mt,"significant_genes_overlap.pdf",sep="_") 785 vennf = paste("Differential_venn",mt,"significant_genes_overlap.pdf",sep="_")
827 pdf(vennf) 786 pdf(vennf)
828 vennDiagram(counts.venn,main=vennmain,col="maroon") 787 vennDiagram(counts.venn,main=vennmain,col="maroon")
829 dev.off() 788 dev.off()
830 } 789 }
831 } #### doDESeq2 or doVoom 790 } #### doDESeq2 or doVoom
832 791 sink()
833 } 792 }
834 #### Done 793 #### Done
835 ]]> 794 ]]>
836
837 ###sink(stdout(),append=T,type="message")
838 builtin_gmt = "" 795 builtin_gmt = ""
839 history_gmt = "" 796 history_gmt = ""
840 history_gmt_name = "" 797 history_gmt_name = ""
841 out_edgeR = F 798 out_edgeR = F
842 out_DESeq2 = F 799 out_DESeq2 = F
843 out_Voom = "$out_VOOM" 800 out_Voom = "$out_VOOM"
844 edgeR_robust_meth = "ordinary" # control robust deviance options 801 edgeR_robust_meth = "ordinary"
845 doDESeq2 = $DESeq2.doDESeq2 802 doDESeq2 = $DESeq2.doDESeq2
846 doVoom = $doVoom 803 doVoom = $doVoom
847 doCamera = F 804 doCamera = F
848 doedgeR = $edgeR.doedgeR 805 doedgeR = $edgeR.doedgeR
849 edgeR_priordf = 10 806 edgeR_priordf = 10
881 org = "$input1.dbkey" 838 org = "$input1.dbkey"
882 if (org == "") { org = "hg19"} 839 if (org == "") { org = "hg19"}
883 fdrtype = "$fdrtype" 840 fdrtype = "$fdrtype"
884 fdrthresh = $fdrthresh 841 fdrthresh = $fdrthresh
885 useNDF = $useNDF 842 useNDF = $useNDF
886 fQ = $fQ 843 fQ = $fQ # non-differential centile cutoff
887 myTitle = "$title" 844 myTitle = "$title"
888 sids = strsplit("$subjectids",',') 845 sids = strsplit("$subjectids",',')
889 subjects = unlist(sids) 846 subjects = unlist(sids)
890 nsubj = length(subjects) 847 nsubj = length(subjects)
891 TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1 848 TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1
893 cat('Got TCols=') 850 cat('Got TCols=')
894 cat(TCols) 851 cat(TCols)
895 cat('; CCols=') 852 cat('; CCols=')
896 cat(CCols) 853 cat(CCols)
897 cat('\n') 854 cat('\n')
855 <![CDATA[
898 useCols = c(TCols,CCols) 856 useCols = c(TCols,CCols)
899 if (file.exists(Out_Dir) == F) dir.create(Out_Dir) 857 if (file.exists(Out_Dir) == F) dir.create(Out_Dir)
900 Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t') 858 Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t')
901 snames = colnames(Count_Matrix) 859 snames = colnames(Count_Matrix)
902 nsamples = length(snames) 860 nsamples = length(snames)
903 if (nsubj &gt; 0 &amp; nsubj != nsamples) { 861 if (nsubj > 0 & nsubj != nsamples) {
904 options("show.error.messages"=T) 862 options("show.error.messages"=T)
905 mess = paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','), 863 mess = paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','),
906 'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=',')) 864 'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=','))
907 write(mess, stderr()) 865 write(mess, stderr())
908 quit(save="no",status=4) 866 quit(save="no",status=4)
909 } 867 }
910 if (length(subjects) != 0) {subjects = subjects[useCols]} 868 if (length(subjects) != 0) {subjects = subjects[useCols]}
911 Count_Matrix = Count_Matrix[,useCols] 869 Count_Matrix = Count_Matrix[,useCols] ### reorder columns
912 rn = rownames(Count_Matrix) 870 rn = rownames(Count_Matrix)
913 islib = rn %in% c('librarySize','NotInBedRegions') 871 islib = rn %in% c('librarySize','NotInBedRegions')
914 LibSizes = Count_Matrix[subset(rn,islib),][1] # take first 872 LibSizes = Count_Matrix[subset(rn,islib),][1] # take first
915 Count_Matrix = Count_Matrix[subset(rn,! islib),] 873 Count_Matrix = Count_Matrix[subset(rn,! islib),]
916 group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) ) 874 group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) )
917 group = factor(group, levels=c(ControlName,TreatmentName)) 875 group = factor(group, levels=c(ControlName,TreatmentName))
918 colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_") 876 colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_")
919 results = edgeIt(Count_Matrix=Count_Matrix,group=group, out_edgeR=out_edgeR, out_Voom=out_Voom, out_DESeq2=out_DESeq2, 877 results = edgeIt(Count_Matrix=Count_Matrix,group=group, out_edgeR=out_edgeR, out_Voom=out_Voom, out_DESeq2=out_DESeq2,
920 fdrtype='BH',mydesign=NULL,priordf=edgeR_priordf,fdrthresh=fdrthresh,outputdir='.', 878 fdrtype='BH',mydesign=NULL,priordf=edgeR_priordf,fdrthresh=fdrthresh,outputdir='.',
921 myTitle=myTitle,useNDF=F,libSize=c(),filterquantile=fQ,subjects=subjects, 879 myTitle=myTitle,useNDF=F,libSize=c(),filterquantile=fQ,subjects=subjects,TreatmentName=TreatmentName,ControlName=ControlName,
922 doDESeq2=doDESeq2,doVoom=doVoom,doCamera=doCamera,doedgeR=doedgeR,org=org, 880 doDESeq2=doDESeq2,doVoom=doVoom,doCamera=doCamera,doedgeR=doedgeR,org=org,
923 histgmt=history_gmt,bigmt=builtin_gmt,DESeq_fitType=DESeq_fitType,robust_meth=edgeR_robust_meth) 881 histgmt=history_gmt,bigmt=builtin_gmt,DESeq_fitType=DESeq_fitType,robust_meth=edgeR_robust_meth)
924 sessionInfo() 882 sessionInfo()
925 883
884 sink()
885 ]]>
926 </configfile> 886 </configfile>
927 </configfiles> 887 </configfiles>
928 <help> 888 <help>
929 889
930 **What it does** 890 **What it does**
934 894
935 **Input** 895 **Input**
936 896
937 Requires a count matrix as a tabular file. These are best made using the companion HTSeq_ based counter Galaxy wrapper 897 Requires a count matrix as a tabular file. These are best made using the companion HTSeq_ based counter Galaxy wrapper
938 and your preferred gene model to generate inputs. Each row is a genomic feature (e.g. a gene or exon) and each column the 898 and your preferred gene model to generate inputs. Each row is a genomic feature (e.g. a gene or exon) and each column the
939 non-negative integer count of reads from one sample overlapping the feature. 899 non-negative integer count of reads from one sample overlapping the feature.
900
940 The matrix must have a header row uniquely identifying the source samples, and unique row names in 901 The matrix must have a header row uniquely identifying the source samples, and unique row names in
941 the first column. Typically the row names are gene symbols or probe ids for downstream use in GSEA and other methods. 902 the first column. Typically the row names are gene symbols or probe ids for downstream use in GSEA and other methods.
903 They must be unique and syntactically valid R names, or they will be mangled - please read the R documentation for the rules on identifiers.
942 904
943 **Specifying comparisons** 905 **Specifying comparisons**
944 906
945 This is deliberately simplified to a single factor with two levels - case vs control. 907 This is deliberately simplified to a single factor with two levels - case vs control.
946 908