Mercurial > repos > fubar > differential_count_models
changeset 124:731315bd6e48 draft
Uploaded
author | fubar |
---|---|
date | Tue, 25 Nov 2014 05:50:07 -0500 |
parents | 51f998262ada |
children | 7e0b4a60da89 |
files | rgedgeRpaired_nocamera.xml |
diffstat | 1 files changed, 91 insertions(+), 129 deletions(-) [+] |
line wrap: on
line diff
--- a/rgedgeRpaired_nocamera.xml Tue Nov 25 05:36:47 2014 -0500 +++ b/rgedgeRpaired_nocamera.xml Tue Nov 25 05:50:07 2014 -0500 @@ -8,7 +8,7 @@ </requirements> <command interpreter="python"> - rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "DifferentialCounts" + rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "Differential_Counts" --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes" </command> <inputs> @@ -22,12 +22,12 @@ </param> <param name="treatment_name" type="text" value="Treatment" size="50" label="Treatment Name"/> <param name="Treat_cols" label="Select columns containing treatment." type="data_column" data_ref="input1" numerical="True" - multiple="true" use_header_names="true" size="120" display="checkboxes"> + multiple="true" use_header_names="true" size="120" display="checkboxes" force_select="True"> <validator type="no_options" message="Please select at least one column."/> </param> <param name="control_name" type="text" value="Control" size="50" label="Control Name"/> <param name="Control_cols" label="Select columns containing control." type="data_column" data_ref="input1" numerical="True" - multiple="true" use_header_names="true" size="120" display="checkboxes" optional="true"> + multiple="true" use_header_names="true" size="120" display="checkboxes" force_select="True"> </param> <param name="subjectids" type="text" optional="true" size="120" value = "" label="IF SUBJECTS NOT ALL INDEPENDENT! Enter comma separated strings to indicate sample labels for (eg) pairing - must be one for every column in input" @@ -85,47 +85,6 @@ <option value="F" selected="true">Do not run VOOM</option> <option value="T">Run VOOM</option> </param> - <!-- - <conditional name="camera"> - <param name="doCamera" type="select" label="Run the edgeR implementation of Camera GSEA for up/down gene sets" - help="If yes, you can choose a set of genesets to test and/or supply a gmt format geneset collection from your history"> - <option value="F" selected="true">Do not run GSEA tests with the Camera algorithm</option> - <option value="T">Run GSEA tests with the Camera algorithm</option> - </param> - <when value="T"> - <conditional name="gmtSource"> - <param name="refgmtSource" type="select" - label="Use a gene set (.gmt) from your history and/or use a built-in (MSigDB etc) gene set"> - <option value="indexed" selected="true">Use a built-in gene set</option> - <option value="history">Use a gene set from my history</option> - <option value="both">Add a gene set from my history to a built in gene set</option> - </param> - <when value="indexed"> - <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis"> - <options from_data_table="gseaGMT_3.1"> - <filter type="sort_by" column="2" /> - <validator type="no_options" message="No GMT v3.1 files are available - please install them"/> - </options> - </param> - </when> - <when value="history"> - <param name="ownGMT" type="data" format="gmt" label="Select a Gene Set from your history" /> - </when> - <when value="both"> - <param name="ownGMT" type="data" format="gseagmt" label="Select a Gene Set from your history" /> - <param name="builtinGMT" type="select" label="Select a gene set matrix (.gmt) file to use for the analysis"> - <options from_data_table="gseaGMT_4"> - <filter type="sort_by" column="2" /> - <validator type="no_options" message="No GMT v4 files are available - please fix tool_data_table and loc files"/> - </options> - </param> - </when> - </conditional> - </when> - <when value="F"> - </when> - </conditional> - --> <param name="fdrthresh" type="float" value="0.05" size="5" label="P value threshold for FDR filtering for amily wise error rate control" help="Conventional default value of 0.05 recommended"/> <param name="fdrtype" type="select" label="FDR (Type II error) control method" @@ -272,24 +231,24 @@ } boxPlot = function(rawrs,cleanrs,maint,myTitle,pdfname) -{ # +{ nc = ncol(rawrs) - for (i in c(1:nc)) {rawrs[(rawrs[,i] < 0),i] = NA} + ##### for (i in c(1:nc)) {rawrs[(rawrs[,i] < 0),i] = NA} fullnames = colnames(rawrs) newcolnames = substr(colnames(rawrs),1,20) colnames(rawrs) = newcolnames newcolnames = substr(colnames(cleanrs),1,20) colnames(cleanrs) = newcolnames defpar = par(no.readonly=T) - print.noquote('raw contig counts by sample:') + print.noquote('@@@ Raw contig counts by sample:') print.noquote(summary(rawrs)) - print.noquote('normalised contig counts by sample:') + print.noquote('@@@ Library size contig counts by sample:') print.noquote(summary(cleanrs)) pdf(pdfname) par(mfrow=c(1,2)) - boxplot(rawrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('Raw:',maint)) + boxplot(rawrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main='log2 raw counts') grid(col="lightgray",lty="dotted") - boxplot(cleanrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('After ',maint)) + boxplot(cleanrs,varwidth=T,notch=T,ylab='log contig count',col="maroon",las=3,cex.axis=0.35,main=paste('log2 counts after ',maint)) grid(col="lightgray",lty="dotted") dev.off() pdfname = "sample_counts_histogram.pdf" @@ -323,7 +282,7 @@ cumPlot = function(rawrs,cleanrs,maint,myTitle) { # updated to use ecdf - pdfname = "Filtering_rowsum_bar_charts.pdf" + pdfname = "Differential_rowsum_bar_charts.pdf" defpar = par(no.readonly=T) lrs = log(rawrs,10) lim = max(lrs) @@ -517,16 +476,20 @@ edgeIt = function (Count_Matrix=c(),group=c(),out_edgeR=F,out_Voom=F,out_DESeq2=F,fdrtype='fdr',priordf=5, fdrthresh=0.05,outputdir='.', myTitle='Differential Counts',libSize=c(),useNDF=F, - filterquantile=0.2, subjects=c(),mydesign=NULL, + filterquantile=0.2, subjects=c(),TreatmentName="Rx",ControlName="Ctrl",mydesign=NULL, doDESeq2=T,doVoom=T,doCamera=T,doedgeR=T,org='hg19', histgmt="", bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt", doCook=F,DESeq_fitType="parameteric",robust_meth='ordinary') { +logf = file('Differential.log', open = "a") +sink(logf,type = c("output", "message")) -run_edgeR = function(workCM,pdata,subjects,group,priordf,robust_meth,mydesign,mt,cmrowsums,out_edgeR) + +run_edgeR = function(workCM,pdata,subjects,group,priordf,robust_meth,mydesign,mt,cmrowsums,out_edgeR,nonzerod) { - sink('edgeR.log') + logf = file('edgeR.log', open = "a") + sink(logf,type = c("output", "message")) #### Setup myDGEList object myDGEList = DGEList(counts=workCM, group = group) myDGEList = calcNormFactors(myDGEList) @@ -545,9 +508,7 @@ DGLM = glmFit(myDGEList,design=mydesign) DE = glmLRT(DGLM,coef=ncol(DGLM\$design)) # always last one - subject is first if needed - efflib = myDGEList\$samples\$lib.size*myDGEList\$samples\$norm.factors normData = cpm(myDGEList) - ### normData = (1e+06*myDGEList\$counts/efflib) uoutput = cbind( Name=as.character(rownames(myDGEList\$counts)), DE\$table, @@ -570,19 +531,6 @@ points(qq\$x[goodness\$outlier],qq\$y[goodness\$outlier], pch=16, col="maroon") dev.off() uniqueg = unique(group) - #### Plot MDS - sample_colors = match(group,levels(group)) - sampleTypes = levels(factor(group)) - print.noquote(sampleTypes) - pdf(paste("edgeR",mt,"MDSplot.pdf",sep='_')) - plotMDS.DGEList(myDGEList,main=paste("edgeR MDS for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors) - legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19) - grid(col="blue") - dev.off() - colnames(normData) = paste( colnames(normData),'N',sep="_") - print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=','))) - nzd = data.frame(log(nonzerod + 1e-2,10)) - try( boxPlot(rawrs=nzd,cleanrs=log(normData,10),maint='TMM Normalisation',myTitle=myTitle,pdfname=paste("edgeR",mt,"raw_norm_counts_box.pdf",sep='_') )) write.table(soutput,file=out_edgeR, quote=FALSE, sep="\t",row.names=F) tt = cbind( Name=as.character(rownames(myDGEList)), @@ -590,26 +538,25 @@ adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype), Dispersion=myDGEList\$tagwise.dispersion,totreads=cmrowsums ) - print.noquote("# edgeR Top tags\n") + print.noquote("@@ edgeR Top tags\n") tt = cbind(tt,URL=contigurls) # add to end so table isn't laid out strangely - tt = tt[order(DE\$table\$PValue),] print.noquote(tt[1:50,]) deTags = rownames(uoutput[uoutput\$adj.p.value < fdrthresh,]) nsig = length(deTags) - print(paste('#',nsig,'tags significant at adj p=',fdrthresh),quote=F) + print.noquote(paste('@@',nsig,'tags significant at adj p=',fdrthresh)) deColours = ifelse(deTags,'red','black') pdf(paste("edgeR",mt,"BCV_vs_abundance.pdf",sep="_")) plotBCV(myDGEList, cex=0.3, main="Biological CV vs abundance") dev.off() dg = myDGEList[order(DE\$table\$PValue),] - #normData = (1e+06 * dg\$counts/expandAsMatrix(dg\$samples\$lib.size, dim(dg))) outpdfname= paste("edgeR",mt,"top_100_heatmap.pdf",sep="_") - hmap2(normData,nsamp=100,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste(myTitle,'Heatmap')) + ocpm = normData[order(DE\$table\$PValue),] + ocpm = ocpm[c(1:100),] + hmap2(ocpm,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste(myTitle,'Heatmap')) outSmear = paste("edgeR",mt,"smearplot.pdf",sep="_") outMain = paste("Smear Plot for ",TName,' Vs ',CName,' (FDR@',fdrthresh,' N = ',nsig,')',sep='') smearPlot(myDGEList=myDGEList,deTags=deTags, outSmear=outSmear, outMain = outMain) qqPlot(descr=paste(myTitle,'edgeR adj p QQ plot'),pvector=tt\$adj.p.value,outpdf=paste('edgeR',mt,'qqplot.pdf',sep='_')) - norm.factor = myDGEList\$samples\$norm.factors topresults.edgeR = soutput[which(soutput\$adj.p.value < fdrthresh), ] edgeRcountsindex = which(allgenes %in% rownames(topresults.edgeR)) edgeRcounts = rep(0, length(allgenes)) @@ -622,7 +569,8 @@ run_DESeq2 = function(workCM,pdata,subjects,group,out_DESeq2,mt,DESeq_fitType) { - sink("DESeq2.log") + logf = file("DESeq2.log", open = "a") + sink(logf,type = c("output", "message")) # DESeq2 require('DESeq2') library('RColorBrewer') @@ -634,9 +582,6 @@ pdata = data.frame(Name=colnames(workCM),Rx=group,subjects=subjects,row.names=colnames(workCM)) deSEQds = DESeqDataSetFromMatrix(countData = workCM, colData = pdata, design = formula(~ subjects + Rx)) } - #DESeq2 = DESeq(deSEQds,fitType='local',pAdjustMethod=fdrtype) - #rDESeq = results(DESeq2) - #newCountDataSet(workCM, group) deSeqDatsizefac = estimateSizeFactors(deSEQds) deSeqDatdisp = estimateDispersions(deSeqDatsizefac,fitType=DESeq_fitType) resDESeq = nbinomWaldTest(deSeqDatdisp) @@ -667,9 +612,6 @@ heatmap.2(sdmat,trace="none",main=paste(myTitle,"DESeq2 sample distances"), col = colorRampPalette( rev(brewer.pal(9, "RdBu")) )(255)) dev.off() - ###outpdfname=paste("DESeq2",mt,"top50_heatmap.pdf",sep="_") - ###hmap2(sresDESeq,nsamp=50,TName=TName,group=group,outpdfname=outpdfname,myTitle=paste('DESeq2 vst rlog Heatmap',myTitle)) - sink() result = try( (ppca = plotPCA( varianceStabilizingTransformation(deSeqDatdisp,blind=T), intgroup=c("Rx","Name")) ) ) if ("try-error" %in% class(result)) { print.noquote('DESeq2 plotPCA failed.') @@ -679,47 +621,50 @@ print(ppca) dev.off() } + sink() return(DESeqcounts) } run_Voom = function(workCM,pdata,subjects,group,mydesign,mt,out_Voom) { - sink('VOOM.log') - if (doedgeR == F) { - #### Setup myDGEList object - myDGEList = DGEList(counts=workCM, group = group) - myDGEList = calcNormFactors(myDGEList) - myDGEList = estimateGLMCommonDisp(myDGEList,mydesign) - myDGEList = estimateGLMTrendedDisp(myDGEList,mydesign) - myDGEList = estimateGLMTagwiseDisp(myDGEList,mydesign) - } - pdf(paste("VOOM",mt,"mean_variance_plot.pdf",sep='_')) - dat.voomed <- voom(myDGEList, mydesign, plot = TRUE, normalize.method="quantil", lib.size = NULL) - dev.off() - # Use limma to fit data - fit = lmFit(dat.voomed, mydesign) - fit = eBayes(fit) - rvoom = topTable(fit, coef = length(colnames(mydesign)), adj = fdrtype, n = Inf, sort="none") - qqPlot(descr=paste(myTitle,'VOOM-limma adj p QQ plot'),pvector=rvoom\$adj.P.Val,outpdf=paste('VOOM',mt,'qqplot.pdf',sep='_')) - rownames(rvoom) = rownames(workCM) - rvoom = cbind(rvoom,NReads=cmrowsums,URL=contigurls) - srvoom = rvoom[order(rvoom\$P.Value),] - cat("# VOOM top 50\n") - print(srvoom[1:50,]) - write.table(srvoom,file=out_Voom, quote=FALSE, sep="\t",row.names=F) - # Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma - topresults.voom = rvoom[which(rvoom\$adj.P.Val < fdrthresh), ] - voomcountsindex <- which(allgenes %in% rownames(topresults.voom)) - voomcounts = rep(0, length(allgenes)) - voomcounts[voomcountsindex] = 1 - sink() - return(voomcounts) - } + logf = file('VOOM.log', open = "a") + sink(logf,type = c("output", "message")) + if (doedgeR == F) { + #### Setup myDGEList object + myDGEList = DGEList(counts=workCM, group = group) + myDGEList = calcNormFactors(myDGEList) + myDGEList = estimateGLMCommonDisp(myDGEList,mydesign) + myDGEList = estimateGLMTrendedDisp(myDGEList,mydesign) + myDGEList = estimateGLMTagwiseDisp(myDGEList,mydesign) + } + pdf(paste("VOOM",mt,"mean_variance_plot.pdf",sep='_')) + dat.voomed <- voom(myDGEList, mydesign, plot = TRUE, normalize.method="quantil", lib.size = NULL) + dev.off() + # Use limma to fit data + fit = lmFit(dat.voomed, mydesign) + fit = eBayes(fit) + rvoom = topTable(fit, coef = length(colnames(mydesign)), adj = fdrtype, n = Inf, sort="none") + qqPlot(descr=paste(myTitle,'VOOM-limma adj p QQ plot'),pvector=rvoom\$adj.P.Val,outpdf=paste('VOOM',mt,'qqplot.pdf',sep='_')) + rownames(rvoom) = rownames(workCM) + rvoom = cbind(Contig=rownames(workCM),rvoom,NReads=cmrowsums,URL=contigurls) + srvoom = rvoom[order(rvoom\$P.Value),] + cat("# VOOM top 50\n") + print(srvoom[1:50,]) + write.table(srvoom,file=out_Voom, quote=FALSE, sep="\t",row.names=F) + # Use an FDR cutoff to find interesting samples for edgeR, DESeq and voom/limma + topresults.voom = rvoom[which(rvoom\$adj.P.Val < fdrthresh), ] + voomcountsindex <- which(allgenes %in% rownames(topresults.voom)) + voomcounts = rep(0, length(allgenes)) + voomcounts[voomcountsindex] = 1 + sink() + return(voomcounts) + } #### data cleaning and analsis control starts here + # Error handling nugroup = length(unique(group)) if (nugroup!=2){ @@ -738,7 +683,7 @@ nzN = nrow(nonzerod) nzrs = rowSums(nonzerod) zN = allN - nzN - print('# Quantiles for non-zero row counts:',quote=F) + print('@@@ Quantiles for non-zero row counts:',quote=F) print(quantile(nzrs,probs=seq(0,1,0.1)),quote=F) if (useNDF == T) { @@ -774,9 +719,7 @@ print("@@ using genecards substitution for urls") contigurls = paste0(genecards,allgenes,"\'>",allgenes,"</a>") } - print.noquote("# urls sample") - print.noquote(head(contigurls)) - print(paste("# Total low count contigs per sample = ",table(lo)),quote=F) + print.noquote(paste("@@ Total low count contigs per sample = ",paste(table(lo),collapse=','))) cmrowsums = rowSums(workCM) TName=unique(group)[1] CName=unique(group)[2] @@ -793,10 +736,26 @@ print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=','))) print.noquote('Using design matrix:') print.noquote(mydesign) + normData = cpm(workCM)*1e6 + colnames(normData) = paste( colnames(workCM),'N',sep="_") + print(paste('Raw sample read totals',paste(colSums(nonzerod,na.rm=T),collapse=','))) + if (doedgeR == T) { - eres = run_edgeR(workCM,pdata,subjects,group,priordf,robust_meth,mydesign,mt,cmrowsums,out_edgeR) + eres = run_edgeR(workCM,pdata,subjects,group,priordf,robust_meth,mydesign,mt,cmrowsums,out_edgeR,nonzerod) myDGEList = eres\$myDGEList edgeRcounts = eres\$edgeRcounts + #### Plot MDS + sample_colors = match(group,levels(group)) + sampleTypes = levels(factor(group)) + print.noquote(sampleTypes) + pdf(paste("edgeR",mt,"MDSplot.pdf",sep='_')) + plotMDS.DGEList(myDGEList,main=paste("MDS for",myTitle),cex=0.5,col=sample_colors,pch=sample_colors) + legend(x="topleft", legend = sampleTypes,col=c(1:length(sampleTypes)), pch=19) + grid(col="blue") + dev.off() + scale <- myDGEList\$samples\$lib.size*myDGEList\$samples\$norm.factors + normCounts <- round(t(t(myDGEList\$counts)/scale)*mean(scale)) + try({boxPlot(rawrs=nzd,cleanrs=log2(normCounts+1),maint='Effects of TMM size normalisation',myTitle=myTitle,pdfname=paste("edgeR",mt,"raw_norm_counts_box.pdf",sep='_'))},T) } if (doDESeq2 == T) { DESeqcounts = run_DESeq2(workCM,pdata,subjects,group,out_DESeq2,mt,DESeq_fitType) } if (doVoom == T) { voomcounts = run_Voom(workCM,pdata,subjects,group,mydesign,mt,out_Voom) } @@ -823,25 +782,23 @@ if (nrow(counts.dataframe > 1)) { counts.venn = vennCounts(counts.dataframe) - vennf = paste("Venn",mt,"significant_genes_overlap.pdf",sep="_") + vennf = paste("Differential_venn",mt,"significant_genes_overlap.pdf",sep="_") pdf(vennf) vennDiagram(counts.venn,main=vennmain,col="maroon") dev.off() } } #### doDESeq2 or doVoom - +sink() } #### Done ]]> - -###sink(stdout(),append=T,type="message") builtin_gmt = "" history_gmt = "" history_gmt_name = "" out_edgeR = F out_DESeq2 = F out_Voom = "$out_VOOM" -edgeR_robust_meth = "ordinary" # control robust deviance options +edgeR_robust_meth = "ordinary" doDESeq2 = $DESeq2.doDESeq2 doVoom = $doVoom doCamera = F @@ -883,7 +840,7 @@ fdrtype = "$fdrtype" fdrthresh = $fdrthresh useNDF = $useNDF -fQ = $fQ +fQ = $fQ # non-differential centile cutoff myTitle = "$title" sids = strsplit("$subjectids",',') subjects = unlist(sids) @@ -895,12 +852,13 @@ cat('; CCols=') cat(CCols) cat('\n') +<![CDATA[ useCols = c(TCols,CCols) if (file.exists(Out_Dir) == F) dir.create(Out_Dir) Count_Matrix = read.table(Input,header=T,row.names=1,sep='\t') snames = colnames(Count_Matrix) nsamples = length(snames) -if (nsubj > 0 & nsubj != nsamples) { +if (nsubj > 0 & nsubj != nsamples) { options("show.error.messages"=T) mess = paste('Fatal error: Supplied subject id list',paste(subjects,collapse=','), 'has length',nsubj,'but there are',nsamples,'samples',paste(snames,collapse=',')) @@ -908,21 +866,23 @@ quit(save="no",status=4) } if (length(subjects) != 0) {subjects = subjects[useCols]} -Count_Matrix = Count_Matrix[,useCols] +Count_Matrix = Count_Matrix[,useCols] ### reorder columns rn = rownames(Count_Matrix) islib = rn %in% c('librarySize','NotInBedRegions') LibSizes = Count_Matrix[subset(rn,islib),][1] # take first Count_Matrix = Count_Matrix[subset(rn,! islib),] group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) ) group = factor(group, levels=c(ControlName,TreatmentName)) -colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_") +colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_") results = edgeIt(Count_Matrix=Count_Matrix,group=group, out_edgeR=out_edgeR, out_Voom=out_Voom, out_DESeq2=out_DESeq2, fdrtype='BH',mydesign=NULL,priordf=edgeR_priordf,fdrthresh=fdrthresh,outputdir='.', - myTitle=myTitle,useNDF=F,libSize=c(),filterquantile=fQ,subjects=subjects, + myTitle=myTitle,useNDF=F,libSize=c(),filterquantile=fQ,subjects=subjects,TreatmentName=TreatmentName,ControlName=ControlName, doDESeq2=doDESeq2,doVoom=doVoom,doCamera=doCamera,doedgeR=doedgeR,org=org, histgmt=history_gmt,bigmt=builtin_gmt,DESeq_fitType=DESeq_fitType,robust_meth=edgeR_robust_meth) sessionInfo() +sink() +]]> </configfile> </configfiles> <help> @@ -936,9 +896,11 @@ Requires a count matrix as a tabular file. These are best made using the companion HTSeq_ based counter Galaxy wrapper and your fave gene model to generate inputs. Each row is a genomic feature (gene or exon eg) and each column the -non-negative integer count of reads from one sample overlapping the feature. +non-negative integer count of reads from one sample overlapping the feature. + The matrix must have a header row uniquely identifying the source samples, and unique row names in the first column. Typically the row names are gene symbols or probe ids for downstream use in GSEA and other methods. +They must be unique and R names or they will be mangled - please read the fine R docs for the rules on identifiers. **Specifying comparisons**