Mercurial > repos > fubar > differential_count_models
comparison rgedgeRpaired_nocamera.xml @ 124:731315bd6e48 draft
Uploaded
author | fubar |
---|---|
date | Tue, 25 Nov 2014 05:50:07 -0500 |
parents | 51f998262ada |
children | 999d4b5939bb |
comparison
equal
deleted
inserted
replaced
123:51f998262ada | 124:731315bd6e48 |
---|---|
6 <requirement type="package" version="9.10">ghostscript</requirement> | 6 <requirement type="package" version="9.10">ghostscript</requirement> |
7 <requirement type="package" version="2.14">biocbasics</requirement> | 7 <requirement type="package" version="2.14">biocbasics</requirement> |
8 </requirements> | 8 </requirements> |
9 | 9 |
10 <command interpreter="python"> | 10 <command interpreter="python"> |
11 rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "DifferentialCounts" | 11 rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "Differential_Counts" |
12 --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes" | 12 --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes" |
13 </command> | 13 </command> |
14 <inputs> | 14 <inputs> |
15 <param name="input1" type="data" format="tabular" label="Select an input matrix - rows are contigs, columns are counts for each sample" | 15 <param name="input1" type="data" format="tabular" label="Select an input matrix - rows are contigs, columns are counts for each sample" |
16 help="Use the HTSeq based count matrix preparation tool to create these matrices from BAM/SAM files and a GTF file of genomic features"/> | 16 help="Use the HTSeq based count matrix preparation tool to create these matrices from BAM/SAM files and a GTF file of genomic features"/> |
20 <valid initial="string.letters,string.digits"><add value="_" /> </valid> | 20 <valid initial="string.letters,string.digits"><add value="_" /> </valid> |
21 </sanitizer> | 21 </sanitizer> |
22 </param> | 22 </param> |
23 <param name="treatment_name" type="text" value="Treatment" size="50" label="Treatment Name"/> | 23 <param name="treatment_name" type="text" value="Treatment" size="50" label="Treatment Name"/> |
24 <param name="Treat_cols" label="Select columns containing treatment." type="data_column" data_ref="input1" numerical="True" | 24 <param name="Treat_cols" label="Select columns containing treatment." type="data_column" data_ref="input1" numerical="True" |
25 multiple="true" use_header_names="true" size="120" display="checkboxes"> | 25 multiple="true" use_header_names="true" size="120" display="checkboxes" force_select="True"> |
26 <validator type="no_options" message="Please select at least one column."/> | 26 <validator type="no_options" message="Please select at least one column."/> |
27 </param> | 27 </param> |
28 <param name="control_name" type="text" value="Control" size="50" label="Control Name"/> | 28 <param name="control_name" type="text" value="Control" size="50" label="Control Name"/> |
29 <param name="Control_cols" label="Select columns containing control." type="data_column" data_ref="input1" numerical="True" | 29 <param name="Control_cols" label="Select columns containing control." type="data_column" data_ref="input1" numerical="True" |
30 multiple="true" use_header_names="true" size="120" display="checkboxes" optional="true"> | 30 multiple="true" use_header_names="true" size="120" display="checkboxes" force_select="True"> |
31 </param> | 31 </param> |
32 <param name="subjectids" type="text" optional="true" size="120" value = "" | 32 <param name="subjectids" type="text" optional="true" size="120" value = "" |
33 label="IF SUBJECTS NOT ALL INDEPENDENT! Enter comma separated strings to indicate sample labels for (eg) pairing - must be one for every column in input" | 33 label="IF SUBJECTS NOT ALL INDEPENDENT! Enter comma separated strings to indicate sample labels for (eg) pairing - must be one for every column in input" |
34 help="Leave blank if no pairing, but eg if data from sample id A99 is in columns 2,4 and id C21 is in 3,5 then enter 'A99,C21,A99,C21'"> | 34 help="Leave blank if no pairing, but eg if data from sample id A99 is in columns 2,4 and id C21 is in 3,5 then enter 'A99,C21,A99,C21'"> |
35 <sanitizer> | 35 <sanitizer> |
83 label="Run the same model with Voom/limma and compare findings" | 83 label="Run the same model with Voom/limma and compare findings" |
84 help="Voom uses counts per million and a precise transformation of variance so count data can be analysed using limma"> | 84 help="Voom uses counts per million and a precise transformation of variance so count data can be analysed using limma"> |
85 <option value="F" selected="true">Do not run VOOM</option> | 85 <option value="F" selected="true">Do not run VOOM</option> |
86 <option value="T">Run VOOM</option> | 86 <option value="T">Run VOOM</option> |
87 </param> | 87 </param> |
88 <!-- | |
89 <conditional name="camera"> | |
90 <param name="doCamera" type="select" label="Run the edgeR implementation of Camera GSEA for up/down gene sets" | |
91 help="If yes, you can choose a set of genesets to test and/or supply a gmt format geneset collection from your history"> | |
92 <option value="F" selected="true">Do not run GSEA tests with the Camera algorithm</option> | |
93 <option value="T">Run GSEA tests with the Camera algorithm</option> | |
94 </param> | |
95 <when value="T"> | |
96 <conditional name="gmtSource"> | |
97 <param name="refgmtSource" type="select" | |
98 label="Use a gene set (.gmt) from your history and/or use a built-in (MSigDB etc) gene set"> | |
99 <option value="indexed" selected="true">Use a built-in gene set</option> | |
100 <option value="history">Use a gene set from my history</option> | |
101 <option value="both">Add a gene set from my history to a built in gene set</option> | |
102 </param> | |
103 <when value="indexed"> | |
104 <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis"> | |
105 <options from_data_table="gseaGMT_3.1"> | |
106 <filter type="sort_by" column="2" /> | |
107 <validator type="no_options" message="No GMT v3.1 files are available - please install them"/> | |
108 </options> | |
109 </param> | |
110 </when> | |
111 <when value="history"> | |
112 <param name="ownGMT" type="data" format="gmt" label="Select a Gene Set from your history" /> | |
113 </when> | |
114 <when value="both"> | |
115 <param name="ownGMT" type="data" format="gseagmt" label="Select a Gene Set from your history" /> | |
116 <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis"> | |
117 <options from_data_table="gseaGMT_4"> | |
118 <filter type="sort_by" column="2" /> | |
119 <validator type="no_options" message="No GMT v4 files are available - please fix tool_data_table and loc files"/> | |
120 </options> | |
121 </param> | |
122 </when> | |
123 </conditional> | |
124 </when> | |
125 <when value="F"> | |
126 </when> | |
127 </conditional> | |
128 --> | |
129 <param name="fdrthresh" type="float" value="0.05" size="5" label="P value threshold for FDR filtering for family wise error rate control" | 88 <param name="fdrthresh" type="float" value="0.05" size="5" label="P value threshold for FDR filtering for family wise error rate control" |
130 help="Conventional default value of 0.05 recommended"/> | 89 help="Conventional default value of 0.05 recommended"/> |
131 <param name="fdrtype" type="select" label="FDR (Type I error) control method" | 90 <param name="fdrtype" type="select" label="FDR (Type I error) control method" |
132 help="Use fdr or bh typically to control for the number of tests in a reliable way"> | 91 help="Use fdr or bh typically to control for the number of tests in a reliable way"> |
133 <option value="fdr" selected="true">fdr</option> | 92 <option value="fdr" selected="true">fdr</option> |
270 grid(col="lightgray", lty="dotted") | 229 grid(col="lightgray", lty="dotted") |
271 dev.off() | 230 dev.off() |
272 } | 231 } |
273 | 232 |
274 boxPlot = function(rawrs,cleanrs,maint,myTitle,pdfname) | 233 boxPlot = function(rawrs,cleanrs,maint,myTitle,pdfname) |
275 { # | 234 { |
276 nc = ncol(rawrs) | 235 nc = ncol(rawrs) |
277 for (i in c(1:nc)) {rawrs[(rawrs[,i] < 0),i] = NA} | 236 ##### for (i in c(1:nc)) {rawrs[(rawrs[,i] < 0),i] = NA} |
278 fullnames = colnames(rawrs) | 237 fullnames = colnames(rawrs) |
279 newcolnames = substr(colnames(rawrs),1,20) | 238 newcolnames = substr(colnames(rawrs),1,20) |
280 colnames(rawrs) = newcolnames | 239 colnames(rawrs) = newcolnames |
281 newcolnames = substr(colnames(cleanrs),1,20) | 240 newcolnames = substr(colnames(cleanrs),1,20) |
282 colnames(cleanrs) = newcolnames | 241 colnames(cleanrs) = newcolnames |
283 defpar = par(no.readonly=T) | 242 defpar = par(no.readonly=T) |
284 print.noquote('raw contig counts by sample:') | 243 print.noquote('@@@ Raw contig counts by sample:') |
285 print.noquote(summary(rawrs)) | 244 print.noquote(summary(rawrs)) |
286 print.noquote('normalised contig counts by sample:') | 245 print.noquote('@@@ Library size contig counts by sample:') |
287 print.noquote(summary(cleanrs)) | 246 print.noquote(summary(cleanrs)) |
288 pdf(pdfname) | 247 pdf(pdfname) |
289 par(mfrow=c(1,2)) | 248 par(mfrow=c(1,2)) |
290 boxplot(rawrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('Raw:',maint)) | 249 boxplot(rawrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main='log2 raw counts') |
291 grid(col="lightgray",lty="dotted") | 250 grid(col="lightgray",lty="dotted") |
292 boxplot(cleanrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('After ',maint)) | 251 boxplot(cleanrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('log2 counts after ',maint)) |
293 grid(col="lightgray",lty="dotted") | 252 grid(col="lightgray",lty="dotted") |
294 dev.off() | 253 dev.off() |
295 pdfname = "sample_counts_histogram.pdf" | 254 pdfname = "sample_counts_histogram.pdf" |
296 nc = ncol(rawrs) | 255 nc = ncol(rawrs) |
297 print.noquote(paste('Using ncol rawrs=',nc)) | 256 print.noquote(paste('Using ncol rawrs=',nc)) |
321 | 280 |
322 } | 281 } |
323 | 282 |
324 cumPlot = function(rawrs,cleanrs,maint,myTitle) | 283 cumPlot = function(rawrs,cleanrs,maint,myTitle) |
325 { # updated to use ecdf | 284 { # updated to use ecdf |
326 pdfname = "Filtering_rowsum_bar_charts.pdf" | 285 pdfname = "Differential_rowsum_bar_charts.pdf" |
327 defpar = par(no.readonly=T) | 286 defpar = par(no.readonly=T) |
328 lrs = log(rawrs,10) | 287 lrs = log(rawrs,10) |
329 lim = max(lrs) | 288 lim = max(lrs) |
330 pdf(pdfname) | 289 pdf(pdfname) |
331 par(mfrow=c(2,1)) | 290 par(mfrow=c(2,1)) |
515 } | 474 } |
516 | 475 |
517 | 476 |
518 edgeIt = function (Count_Matrix=c(),group=c(),out_edgeR=F,out_Voom=F,out_DESeq2=F,fdrtype='fdr',priordf=5, | 477 edgeIt = function (Count_Matrix=c(),group=c(),out_edgeR=F,out_Voom=F,out_DESeq2=F,fdrtype='fdr',priordf=5, |
519 fdrthresh=0.05,outputdir='.', myTitle='Differential Counts',libSize=c(),useNDF=F, | 478 fdrthresh=0.05,outputdir='.', myTitle='Differential Counts',libSize=c(),useNDF=F, |
520 filterquantile=0.2, subjects=c(),mydesign=NULL, | 479 filterquantile=0.2, subjects=c(),TreatmentName="Rx",ControlName="Ctrl",mydesign=NULL, |
521 doDESeq2=T,doVoom=T,doCamera=T,doedgeR=T,org='hg19', | 480 doDESeq2=T,doVoom=T,doCamera=T,doedgeR=T,org='hg19', |
522 histgmt="", bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt", | 481 histgmt="", bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt", |
523 doCook=F,DESeq_fitType="parameteric",robust_meth='ordinary') | 482 doCook=F,DESeq_fitType="parameteric",robust_meth='ordinary') |
524 { | 483 { |
525 | 484 |
526 | 485 logf = file('Differential.log', open = "a") |
527 run_edgeR = function(workCM,pdata,subjects,group,priordf,robust_meth,mydesign,mt,cmrowsums,out_edgeR) | 486 sink(logf,type = c("output", "message")) |
487 | |
488 | |
489 run_edgeR = function(workCM,pdata,subjects,group,priordf,robust_meth,mydesign,mt,cmrowsums,out_edgeR,nonzerod) | |
528 { | 490 { |
529 sink('edgeR.log') | 491 logf = file('edgeR.log', open = "a") |
492 sink(logf,type = c("output", "message")) | |
530 #### Setup myDGEList object | 493 #### Setup myDGEList object |
531 myDGEList = DGEList(counts=workCM, group = group) | 494 myDGEList = DGEList(counts=workCM, group = group) |
532 myDGEList = calcNormFactors(myDGEList) | 495 myDGEList = calcNormFactors(myDGEList) |
533 if (robust_meth == 'ordinary') { | 496 if (robust_meth == 'ordinary') { |
534 myDGEList = estimateGLMCommonDisp(myDGEList,mydesign) | 497 myDGEList = estimateGLMCommonDisp(myDGEList,mydesign) |
543 } | 506 } |
544 | 507 |
545 | 508 |
546 DGLM = glmFit(myDGEList,design=mydesign) | 509 DGLM = glmFit(myDGEList,design=mydesign) |
547 DE = glmLRT(DGLM,coef=ncol(DGLM\$design)) # always last one - subject is first if needed | 510 DE = glmLRT(DGLM,coef=ncol(DGLM\$design)) # always last one - subject is first if needed |
548 efflib = myDGEList\$samples\$lib.size*myDGEList\$samples\$norm.factors | |
549 normData = cpm(myDGEList) | 511 normData = cpm(myDGEList) |
550 ### normData = (1e+06*myDGEList\$counts/efflib) | |
551 uoutput = cbind( | 512 uoutput = cbind( |
552 Name=as.character(rownames(myDGEList\$counts)), | 513 Name=as.character(rownames(myDGEList\$counts)), |
553 DE\$table, | 514 DE\$table, |
554 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype), | 515 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype), |
555 Dispersion=myDGEList\$tagwise.dispersion,totreads=cmrowsums,normData, | 516 Dispersion=myDGEList\$tagwise.dispersion,totreads=cmrowsums,normData, |
568 qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion") | 529 qq = qqnorm(z, panel.first=grid(), main="tagwise dispersion") |
569 abline(0,1,lwd=3) | 530 abline(0,1,lwd=3) |
570 points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="maroon") | 531 points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="maroon") |
571 dev.off() | 532 dev.off() |
572 uniqueg = unique(group) | 533 uniqueg = unique(group) |
573 #### Plot MDS | |
574 sample_colors = match(group,levels(group)) | |
575 sampleTypes = levels(factor(group)) | |
576 print.noquote(sampleTypes) | |
577 pdf(paste("edgeR",mt,"MDSplot.pdf",sep='_')) | |
578 plotMDS.DGEList(myDGEList,main=paste("edgeR MDS for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors) | |
579 legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19) | |
580 grid(col="blue") | |
581 dev.off() | |
582 colnames(normData) = paste( colnames(normData),'N',sep="_") | |
583 print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=','))) | |
584 nzd = data.frame(log(nonzerod + 1e-2,10)) | |
585 try( boxPlot(rawrs=nzd,cleanrs=log(normData,10),maint='TMM Normalisation',myTitle=myTitle,pdfname=paste("edgeR",mt,"raw_norm_counts_box.pdf",sep='_') )) | |
586 write.table(soutput,file=out_edgeR, quote=FALSE, sep="\t",row.names=F) | 534 write.table(soutput,file=out_edgeR, quote=FALSE, sep="\t",row.names=F) |
587 tt = cbind( | 535 tt = cbind( |
588 Name=as.character(rownames(myDGEList)), | 536 Name=as.character(rownames(myDGEList)), |
589 DE\$table, | 537 DE\$table, |
590 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype), | 538 adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype), |
591 Dispersion=myDGEList\$tagwise.dispersion,totreads=cmrowsums | 539 Dispersion=myDGEList\$tagwise.dispersion,totreads=cmrowsums |
592 ) | 540 ) |
593 print.noquote("# edgeR Top tags\n") | 541 print.noquote("@@ edgeR Top tags\n") |
594 tt = cbind(tt,URL=contigurls) # add to end so table isn't laid out strangely | 542 tt = cbind(tt,URL=contigurls) # add to end so table isn't laid out strangely |
595 tt = tt[order(DE\$table\$PValue),] | |
596 print.noquote(tt[1:50,]) | 543 print.noquote(tt[1:50,]) |
597 deTags = rownames(uoutput[uoutput\$adj.p.value < fdrthresh,]) | 544 deTags = rownames(uoutput[uoutput\$adj.p.value < fdrthresh,]) |
598 nsig = length(deTags) | 545 nsig = length(deTags) |
599 print(paste('#',nsig,'tags significant at adj p=',fdrthresh),quote=F) | 546 print.noquote(paste('@@',nsig,'tags significant at adj p=',fdrthresh)) |
600 deColours = ifelse(deTags,'red','black') | 547 deColours = ifelse(deTags,'red','black') |
601 pdf(paste("edgeR",mt,"BCV_vs_abundance.pdf",sep="_")) | 548 pdf(paste("edgeR",mt,"BCV_vs_abundance.pdf",sep="_")) |
602 plotBCV(myDGEList, cex=0.3, main="Biological CV vs abundance") | 549 plotBCV(myDGEList, cex=0.3, main="Biological CV vs abundance") |
603 dev.off() | 550 dev.off() |
604 dg = myDGEList[order(DE\$table\$PValue),] | 551 dg = myDGEList[order(DE\$table\$PValue),] |
605 #normData = (1e+06 * dg\$counts/expandAsMatrix(dg\$samples\$lib.size, dim(dg))) | |
606 outpdfname= paste("edgeR",mt,"top_100_heatmap.pdf",sep="_") | 552 outpdfname= paste("edgeR",mt,"top_100_heatmap.pdf",sep="_") |
607 hmap2(normData,nsamp=100,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste(myTitle,'Heatmap')) | 553 ocpm = normData[order(DE\$table\$PValue),] |
554 ocpm = ocpm[c(1:100),] | |
555 hmap2(ocpm,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste(myTitle,'Heatmap')) | |
608 outSmear = paste("edgeR",mt,"smearplot.pdf",sep="_") | 556 outSmear = paste("edgeR",mt,"smearplot.pdf",sep="_") |
609 outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='') | 557 outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='') |
610 smearPlot(myDGEList=myDGEList,deTags=deTags, outSmear=outSmear, outMain = outMain) | 558 smearPlot(myDGEList=myDGEList,deTags=deTags, outSmear=outSmear, outMain = outMain) |
611 qqPlot(descr=paste(myTitle,'edgeR adj p QQ plot'),pvector=tt\$adj.p.value,outpdf=paste('edgeR',mt,'qqplot.pdf',sep='_')) | 559 qqPlot(descr=paste(myTitle,'edgeR adj p QQ plot'),pvector=tt\$adj.p.value,outpdf=paste('edgeR',mt,'qqplot.pdf',sep='_')) |
612 norm.factor = myDGEList\$samples\$norm.factors | |
613 topresults.edgeR = soutput[which(soutput\$adj.p.value < fdrthresh), ] | 560 topresults.edgeR = soutput[which(soutput\$adj.p.value < fdrthresh), ] |
614 edgeRcountsindex = which(allgenes %in% rownames(topresults.edgeR)) | 561 edgeRcountsindex = which(allgenes %in% rownames(topresults.edgeR)) |
615 edgeRcounts = rep(0, length(allgenes)) | 562 edgeRcounts = rep(0, length(allgenes)) |
616 edgeRcounts[edgeRcountsindex] = 1 # Create venn diagram of hits | 563 edgeRcounts[edgeRcountsindex] = 1 # Create venn diagram of hits |
617 sink() | 564 sink() |
620 | 567 |
621 | 568 |
622 run_DESeq2 = function(workCM,pdata,subjects,group,out_DESeq2,mt,DESeq_fitType) | 569 run_DESeq2 = function(workCM,pdata,subjects,group,out_DESeq2,mt,DESeq_fitType) |
623 | 570 |
624 { | 571 { |
625 sink("DESeq2.log") | 572 logf = file("DESeq2.log", open = "a") |
573 sink(logf,type = c("output", "message")) | |
626 # DESeq2 | 574 # DESeq2 |
627 require('DESeq2') | 575 require('DESeq2') |
628 library('RColorBrewer') | 576 library('RColorBrewer') |
629 if (length(subjects) == 0) | 577 if (length(subjects) == 0) |
630 { | 578 { |
632 deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ Rx)) | 580 deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ Rx)) |
633 } else { | 581 } else { |
634 pdata = data.frame(Name=colnames(workCM),Rx=group,subjects=subjects,row.names=colnames(workCM)) | 582 pdata = data.frame(Name=colnames(workCM),Rx=group,subjects=subjects,row.names=colnames(workCM)) |
635 deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ subjects + Rx)) | 583 deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ subjects + Rx)) |
636 } | 584 } |
637 #DESeq2 = DESeq(deSEQds,fitType='local',pAdjustMethod=fdrtype) | |
638 #rDESeq = results(DESeq2) | |
639 #newCountDataSet(workCM, group) | |
640 deSeqDatsizefac = estimateSizeFactors(deSEQds) | 585 deSeqDatsizefac = estimateSizeFactors(deSEQds) |
641 deSeqDatdisp = estimateDispersions(deSeqDatsizefac,fitType=DESeq_fitType) | 586 deSeqDatdisp = estimateDispersions(deSeqDatsizefac,fitType=DESeq_fitType) |
642 resDESeq = nbinomWaldTest(deSeqDatdisp) | 587 resDESeq = nbinomWaldTest(deSeqDatdisp) |
643 rDESeq = as.data.frame(results(resDESeq)) | 588 rDESeq = as.data.frame(results(resDESeq)) |
644 rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums,URL=contigurls) | 589 rDESeq = cbind(Contig=rownames(workCM),rDESeq,NReads=cmrowsums,URL=contigurls) |
665 sdmat = as.matrix(sampledists) | 610 sdmat = as.matrix(sampledists) |
666 pdf(paste("DESeq2",mt,"sample_distance_plot.pdf",sep="_")) | 611 pdf(paste("DESeq2",mt,"sample_distance_plot.pdf",sep="_")) |
667 heatmap.2(sdmat,trace="none",main=paste(myTitle,"DESeq2 sample distances"), | 612 heatmap.2(sdmat,trace="none",main=paste(myTitle,"DESeq2 sample distances"), |
668 col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255)) | 613 col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255)) |
669 dev.off() | 614 dev.off() |
670 ###outpdfname=paste("DESeq2",mt,"top50_heatmap.pdf",sep="_") | |
671 ###hmap2(sresDESeq,nsamp=50,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('DESeq2 vst rlog Heatmap',myTitle)) | |
672 sink() | |
673 result = try( (ppca = plotPCA( varianceStabilizingTransformation(deSeqDatdisp,blind=T), intgroup=c("Rx","Name")) ) ) | 615 result = try( (ppca = plotPCA( varianceStabilizingTransformation(deSeqDatdisp,blind=T), intgroup=c("Rx","Name")) ) ) |
674 if ("try-error" %in% class(result)) { | 616 if ("try-error" %in% class(result)) { |
675 print.noquote('DESeq2 plotPCA failed.') | 617 print.noquote('DESeq2 plotPCA failed.') |
676 } else { | 618 } else { |
677 pdf(paste("DESeq2",mt,"PCA_plot.pdf",sep="_")) | 619 pdf(paste("DESeq2",mt,"PCA_plot.pdf",sep="_")) |
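678 #### an explicit print() of the plotPCA result seems to be needed for the plot to be written to the open pdf device | 620 #### an explicit print() of the plotPCA result seems to be needed for the plot to be written to the open pdf device |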
679 print(ppca) | 621 print(ppca) |
680 dev.off() | 622 dev.off() |
681 } | 623 } |
624 sink() | |
682 return(DESeqcounts) | 625 return(DESeqcounts) |
683 } | 626 } |
684 | 627 |
685 | 628 |
686 run_Voom = function(workCM,pdata,subjects,group,mydesign,mt,out_Voom) | 629 run_Voom = function(workCM,pdata,subjects,group,mydesign,mt,out_Voom) |
687 { | 630 { |
688 sink('VOOM.log') | 631 logf = file('VOOM.log', open = "a") |
689 if (doedgeR == F) { | 632 sink(logf,type = c("output", "message")) |
690 #### Setup myDGEList object | 633 if (doedgeR == F) { |
691 myDGEList = DGEList(counts=workCM, group = group) | 634 #### Setup myDGEList object |
692 myDGEList = calcNormFactors(myDGEList) | 635 myDGEList = DGEList(counts=workCM, group = group) |
693 myDGEList = estimateGLMCommonDisp(myDGEList,mydesign) | 636 myDGEList = calcNormFactors(myDGEList) |
694 myDGEList = estimateGLMTrendedDisp(myDGEList,mydesign) | 637 myDGEList = estimateGLMCommonDisp(myDGEList,mydesign) |
695 myDGEList = estimateGLMTagwiseDisp(myDGEList,mydesign) | 638 myDGEList = estimateGLMTrendedDisp(myDGEList,mydesign) |
696 } | 639 myDGEList = estimateGLMTagwiseDisp(myDGEList,mydesign) |
697 pdf(paste("VOOM",mt,"mean_variance_plot.pdf",sep='_')) | 640 } |
698 dat.voomed <- voom(myDGEList, mydesign, plot = TRUE, normalize.method="quantil", lib.size = NULL) | 641 pdf(paste("VOOM",mt,"mean_variance_plot.pdf",sep='_')) |
699 dev.off() | 642 dat.voomed <- voom(myDGEList, mydesign, plot = TRUE, normalize.method="quantil", lib.size = NULL) |
700 # Use limma to fit data | 643 dev.off() |
701 fit = lmFit(dat.voomed, mydesign) | 644 # Use limma to fit data |
702 fit = eBayes(fit) | 645 fit = lmFit(dat.voomed, mydesign) |
703 rvoom = topTable(fit, coef = length(colnames(mydesign)), adj = fdrtype, n = Inf, sort="none") | 646 fit = eBayes(fit) |
704 qqPlot(descr=paste(myTitle,'VOOM-limma adj p QQ plot'),pvector=rvoom\$adj.P.Val,outpdf=paste('VOOM',mt,'qqplot.pdf',sep='_')) | 647 rvoom = topTable(fit, coef = length(colnames(mydesign)), adj = fdrtype, n = Inf, sort="none") |
705 rownames(rvoom) = rownames(workCM) | 648 qqPlot(descr=paste(myTitle,'VOOM-limma adj p QQ plot'),pvector=rvoom\$adj.P.Val,outpdf=paste('VOOM',mt,'qqplot.pdf',sep='_')) |
706 rvoom = cbind(rvoom,NReads=cmrowsums,URL=contigurls) | 649 rownames(rvoom) = rownames(workCM) |
707 srvoom = rvoom[order(rvoom\$P.Value),] | 650 rvoom = cbind(Contig=rownames(workCM),rvoom,NReads=cmrowsums,URL=contigurls) |
708 cat("# VOOM top 50\n") | 651 srvoom = rvoom[order(rvoom\$P.Value),] |
709 print(srvoom[1:50,]) | 652 cat("# VOOM top 50\n") |
710 write.table(srvoom,file=out_Voom, quote=FALSE, sep="\t",row.names=F) | 653 print(srvoom[1:50,]) |
711 # Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma | 654 write.table(srvoom,file=out_Voom, quote=FALSE, sep="\t",row.names=F) |
712 topresults.voom = rvoom[which(rvoom\$adj.P.Val < fdrthresh), ] | 655 # Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma |
713 voomcountsindex <- which(allgenes %in% rownames(topresults.voom)) | 656 topresults.voom = rvoom[which(rvoom\$adj.P.Val < fdrthresh), ] |
714 voomcounts = rep(0, length(allgenes)) | 657 voomcountsindex <- which(allgenes %in% rownames(topresults.voom)) |
715 voomcounts[voomcountsindex] = 1 | 658 voomcounts = rep(0, length(allgenes)) |
716 sink() | 659 voomcounts[voomcountsindex] = 1 |
717 return(voomcounts) | 660 sink() |
718 } | 661 return(voomcounts) |
662 } | |
719 | 663 |
720 | 664 |
721 #### data cleaning and analysis control starts here | 665 #### data cleaning and analysis control starts here |
666 | |
722 | 667 |
723 # Error handling | 668 # Error handling |
724 nugroup = length(unique(group)) | 669 nugroup = length(unique(group)) |
725 if (nugroup!=2){ | 670 if (nugroup!=2){ |
726 print("Number of conditions identified in experiment does not equal 2") | 671 print("Number of conditions identified in experiment does not equal 2") |
736 rawrs = rowSums(Count_Matrix) | 681 rawrs = rowSums(Count_Matrix) |
737 nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes | 682 nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes |
738 nzN = nrow(nonzerod) | 683 nzN = nrow(nonzerod) |
739 nzrs = rowSums(nonzerod) | 684 nzrs = rowSums(nonzerod) |
740 zN = allN - nzN | 685 zN = allN - nzN |
741 print('# Quantiles for non-zero row counts:',quote=F) | 686 print('@@@ Quantiles for non-zero row counts:',quote=F) |
742 print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F) | 687 print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F) |
743 if (useNDF == T) | 688 if (useNDF == T) |
744 { | 689 { |
745 gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut | 690 gt1rpin3 = rowSums(Count_Matrix/expandAsMatrix(colTotmillionreads,dim(Count_Matrix)) >= 1) >= nscut |
746 lo = colSums(Count_Matrix[!gt1rpin3,]) | 691 lo = colSums(Count_Matrix[!gt1rpin3,]) |
772 contigurls = paste0(ucsc,"&position=chr",testreg[,2],":",testreg[,3],"-",testreg[,4],"\'>",allgenes,"</a>") | 717 contigurls = paste0(ucsc,"&position=chr",testreg[,2],":",testreg[,3],"-",testreg[,4],"\'>",allgenes,"</a>") |
773 } else { | 718 } else { |
774 print("@@ using genecards substitution for urls") | 719 print("@@ using genecards substitution for urls") |
775 contigurls = paste0(genecards,allgenes,"\'>",allgenes,"</a>") | 720 contigurls = paste0(genecards,allgenes,"\'>",allgenes,"</a>") |
776 } | 721 } |
777 print.noquote("# urls sample") | 722 print.noquote(paste("@@ Total low count contigs per sample = ",paste(table(lo),collapse=','))) |
778 print.noquote(head(contigurls)) | |
779 print(paste("# Total low count contigs per sample = ",table(lo)),quote=F) | |
780 cmrowsums = rowSums(workCM) | 723 cmrowsums = rowSums(workCM) |
781 TName=unique(group)[1] | 724 TName=unique(group)[1] |
782 CName=unique(group)[2] | 725 CName=unique(group)[2] |
783 if (is.null(mydesign)) { | 726 if (is.null(mydesign)) { |
784 if (length(subjects) == 0) | 727 if (length(subjects) == 0) |
791 } | 734 } |
792 } | 735 } |
793 print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=','))) | 736 print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=','))) |
794 print.noquote('Using design matrix:') | 737 print.noquote('Using design matrix:') |
795 print.noquote(mydesign) | 738 print.noquote(mydesign) |
739 normData = cpm(workCM)*1e6 | |
740 colnames(normData) = paste( colnames(workCM),'N',sep="_") | |
741 print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=','))) | |
742 | |
796 if (doedgeR == T) { | 743 if (doedgeR == T) { |
797 eres = run_edgeR(workCM,pdata,subjects,group,priordf,robust_meth,mydesign,mt,cmrowsums,out_edgeR) | 744 eres = run_edgeR(workCM,pdata,subjects,group,priordf,robust_meth,mydesign,mt,cmrowsums,out_edgeR,nonzerod) |
798 myDGEList = eres\$myDGEList | 745 myDGEList = eres\$myDGEList |
799 edgeRcounts = eres\$edgeRcounts | 746 edgeRcounts = eres\$edgeRcounts |
747 #### Plot MDS | |
748 sample_colors = match(group,levels(group)) | |
749 sampleTypes = levels(factor(group)) | |
750 print.noquote(sampleTypes) | |
751 pdf(paste("edgeR",mt,"MDSplot.pdf",sep='_')) | |
752 plotMDS.DGEList(myDGEList,main=paste("MDS for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors) | |
753 legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19) | |
754 grid(col="blue") | |
755 dev.off() | |
756 scale <- myDGEList\$samples\$lib.size*myDGEList\$samples\$norm.factors | |
757 normCounts <- round(t(t(myDGEList\$counts)/scale)*mean(scale)) | |
758 try({boxPlot(rawrs=nzd,cleanrs=log2(normCounts+1),maint='Effects of TMM size normalisation',myTitle=myTitle,pdfname=paste("edgeR",mt,"raw_norm_counts_box.pdf",sep='_'))},T) | |
800 } | 759 } |
801 if (doDESeq2 == T) { DESeqcounts = run_DESeq2(workCM,pdata,subjects,group,out_DESeq2,mt,DESeq_fitType) } | 760 if (doDESeq2 == T) { DESeqcounts = run_DESeq2(workCM,pdata,subjects,group,out_DESeq2,mt,DESeq_fitType) } |
802 if (doVoom == T) { voomcounts = run_Voom(workCM,pdata,subjects,group,mydesign,mt,out_Voom) } | 761 if (doVoom == T) { voomcounts = run_Voom(workCM,pdata,subjects,group,mydesign,mt,out_Voom) } |
803 | 762 |
804 | 763 |
821 counts.dataframe = data.frame(edgeR = edgeRcounts, VOOM_limma = voomcounts, row.names = allgenes) | 780 counts.dataframe = data.frame(edgeR = edgeRcounts, VOOM_limma = voomcounts, row.names = allgenes) |
822 } | 781 } |
823 | 782 |
824 if (nrow(counts.dataframe > 1)) { | 783 if (nrow(counts.dataframe > 1)) { |
825 counts.venn = vennCounts(counts.dataframe) | 784 counts.venn = vennCounts(counts.dataframe) |
826 vennf = paste("Venn",mt,"significant_genes_overlap.pdf",sep="_") | 785 vennf = paste("Differential_venn",mt,"significant_genes_overlap.pdf",sep="_") |
827 pdf(vennf) | 786 pdf(vennf) |
828 vennDiagram(counts.venn,main=vennmain,col="maroon") | 787 vennDiagram(counts.venn,main=vennmain,col="maroon") |
829 dev.off() | 788 dev.off() |
830 } | 789 } |
831 } #### doDESeq2 or doVoom | 790 } #### doDESeq2 or doVoom |
832 | 791 sink() |
833 } | 792 } |
834 #### Done | 793 #### Done |
835 ]]> | 794 ]]> |
836 | |
837 ###sink(stdout(),append=T,type="message") | |
838 builtin_gmt = "" | 795 builtin_gmt = "" |
839 history_gmt = "" | 796 history_gmt = "" |
840 history_gmt_name = "" | 797 history_gmt_name = "" |
841 out_edgeR = F | 798 out_edgeR = F |
842 out_DESeq2 = F | 799 out_DESeq2 = F |
843 out_Voom = "$out_VOOM" | 800 out_Voom = "$out_VOOM" |
844 edgeR_robust_meth = "ordinary" # control robust deviance options | 801 edgeR_robust_meth = "ordinary" |
845 doDESeq2 = $DESeq2.doDESeq2 | 802 doDESeq2 = $DESeq2.doDESeq2 |
846 doVoom = $doVoom | 803 doVoom = $doVoom |
847 doCamera = F | 804 doCamera = F |
848 doedgeR = $edgeR.doedgeR | 805 doedgeR = $edgeR.doedgeR |
849 edgeR_priordf = 10 | 806 edgeR_priordf = 10 |
881 org = "$input1.dbkey" | 838 org = "$input1.dbkey" |
882 if (org == "") { org = "hg19"} | 839 if (org == "") { org = "hg19"} |
883 fdrtype = "$fdrtype" | 840 fdrtype = "$fdrtype" |
884 fdrthresh = $fdrthresh | 841 fdrthresh = $fdrthresh |
885 useNDF = $useNDF | 842 useNDF = $useNDF |
886 fQ = $fQ | 843 fQ = $fQ # non-differential centile cutoff |
887 myTitle = "$title" | 844 myTitle = "$title" |
888 sids = strsplit("$subjectids",',') | 845 sids = strsplit("$subjectids",',') |
889 subjects = unlist(sids) | 846 subjects = unlist(sids) |
890 nsubj = length(subjects) | 847 nsubj = length(subjects) |
891 TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1 | 848 TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1 |
893 cat('Got TCols=') | 850 cat('Got TCols=') |
894 cat(TCols) | 851 cat(TCols) |
895 cat('; CCols=') | 852 cat('; CCols=') |
896 cat(CCols) | 853 cat(CCols) |
897 cat('\n') | 854 cat('\n') |
855 <![CDATA[ | |
898 useCols = c(TCols,CCols) | 856 useCols = c(TCols,CCols) |
899 if (file.exists(Out_Dir) == F) dir.create(Out_Dir) | 857 if (file.exists(Out_Dir) == F) dir.create(Out_Dir) |
900 Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t') | 858 Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t') |
901 snames = colnames(Count_Matrix) | 859 snames = colnames(Count_Matrix) |
902 nsamples = length(snames) | 860 nsamples = length(snames) |
903 if (nsubj > 0 & nsubj != nsamples) { | 861 if (nsubj > 0 & nsubj != nsamples) { |
904 options("show.error.messages"=T) | 862 options("show.error.messages"=T) |
905 mess = paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','), | 863 mess = paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','), |
906 'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=',')) | 864 'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=',')) |
907 write(mess, stderr()) | 865 write(mess, stderr()) |
908 quit(save="no",status=4) | 866 quit(save="no",status=4) |
909 } | 867 } |
910 if (length(subjects) != 0) {subjects = subjects[useCols]} | 868 if (length(subjects) != 0) {subjects = subjects[useCols]} |
911 Count_Matrix = Count_Matrix[,useCols] | 869 Count_Matrix = Count_Matrix[,useCols] ### reorder columns |
912 rn = rownames(Count_Matrix) | 870 rn = rownames(Count_Matrix) |
913 islib = rn %in% c('librarySize','NotInBedRegions') | 871 islib = rn %in% c('librarySize','NotInBedRegions') |
914 LibSizes = Count_Matrix[subset(rn,islib),][1] # take first | 872 LibSizes = Count_Matrix[subset(rn,islib),][1] # take first |
915 Count_Matrix = Count_Matrix[subset(rn,! islib),] | 873 Count_Matrix = Count_Matrix[subset(rn,! islib),] |
916 group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) ) | 874 group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) ) |
917 group = factor(group, levels=c(ControlName,TreatmentName)) | 875 group = factor(group, levels=c(ControlName,TreatmentName)) |
918 colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_") | 876 colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_") |
919 results = edgeIt(Count_Matrix=Count_Matrix,group=group, out_edgeR=out_edgeR, out_Voom=out_Voom, out_DESeq2=out_DESeq2, | 877 results = edgeIt(Count_Matrix=Count_Matrix,group=group, out_edgeR=out_edgeR, out_Voom=out_Voom, out_DESeq2=out_DESeq2, |
920 fdrtype='BH',mydesign=NULL,priordf=edgeR_priordf,fdrthresh=fdrthresh,outputdir='.', | 878 fdrtype='BH',mydesign=NULL,priordf=edgeR_priordf,fdrthresh=fdrthresh,outputdir='.', |
921 myTitle=myTitle,useNDF=F,libSize=c(),filterquantile=fQ,subjects=subjects, | 879 myTitle=myTitle,useNDF=F,libSize=c(),filterquantile=fQ,subjects=subjects,TreatmentName=TreatmentName,ControlName=ControlName, |
922 doDESeq2=doDESeq2,doVoom=doVoom,doCamera=doCamera,doedgeR=doedgeR,org=org, | 880 doDESeq2=doDESeq2,doVoom=doVoom,doCamera=doCamera,doedgeR=doedgeR,org=org, |
923 histgmt=history_gmt,bigmt=builtin_gmt,DESeq_fitType=DESeq_fitType,robust_meth=edgeR_robust_meth) | 881 histgmt=history_gmt,bigmt=builtin_gmt,DESeq_fitType=DESeq_fitType,robust_meth=edgeR_robust_meth) |
924 sessionInfo() | 882 sessionInfo() |
925 | 883 |
884 sink() | |
885 ]]> | |
926 </configfile> | 886 </configfile> |
927 </configfiles> | 887 </configfiles> |
928 <help> | 888 <help> |
929 | 889 |
930 **What it does** | 890 **What it does** |
934 | 894 |
935 **Input** | 895 **Input** |
936 | 896 |
937 Requires a count matrix as a tabular file. These are best made using the companion HTSeq_ based counter Galaxy wrapper | 897 Requires a count matrix as a tabular file. These are best made using the companion HTSeq_ based counter Galaxy wrapper |
938 and your fave gene model to generate inputs. Each row is a genomic feature (gene or exon eg) and each column the | 898 and your fave gene model to generate inputs. Each row is a genomic feature (gene or exon eg) and each column the |
939 non-negative integer count of reads from one sample overlapping the feature. | 899 non-negative integer count of reads from one sample overlapping the feature. |
900 | |
940 The matrix must have a header row uniquely identifying the source samples, and unique row names in | 901 The matrix must have a header row uniquely identifying the source samples, and unique row names in |
941 the first column. Typically the row names are gene symbols or probe ids for downstream use in GSEA and other methods. | 902 the first column. Typically the row names are gene symbols or probe ids for downstream use in GSEA and other methods. |
903 They must be unique and R names or they will be mangled - please read the fine R docs for the rules on identifiers. | |
942 | 904 |
943 **Specifying comparisons** | 905 **Specifying comparisons** |
944 | 906 |
945 This is basically dumbed down for two factors - case vs control. | 907 This is basically dumbed down for two factors - case vs control. |
946 | 908 |