# HG changeset patch # User iuc # Date 1430794056 14400 # Node ID 3107df74056ed54791a976523cc68ba685323954 # Parent 1e20061decdd73700289acfd24aecb73abb9cd4a planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/differential_count_models commit 344140b8df53b8b7024618bb04594607a045c03a diff -r 1e20061decdd -r 3107df74056e rgedgeRpaired_nocamera.xml --- a/rgedgeRpaired_nocamera.xml Wed Apr 29 12:07:19 2015 -0400 +++ b/rgedgeRpaired_nocamera.xml Mon May 04 22:47:36 2015 -0400 @@ -7,119 +7,16 @@ ghostscript biocbasics - - rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "Differential_Counts" - --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - edgeR['doedgeR'] == "T" - - - DESeq2['doDESeq2'] == "T" - - - doVoom == "T" - - - - - - - - - - - - - - - - - - - - - - - - + + rgToolFactory.py --script_path "$runme" --interpreter "Rscript" --tool_name "Differential_Counts" + --output_dir "$html_file.files_path" --output_html "$html_file" --make_HTML "yes" + 20) + if (ncols > 20) { scale = 7*ncols/20 pdf(pdfname,width=scale,height=scale) - } else { + } else { pdf(pdfname) } par(mfrow=c(ncroot,ncroot)) for (i in c(1:nc)) { - hist(rawrs[,i], main=paste("Contig logcount",i), xlab='log raw count', col="maroon", + hist(rawrs[,i], main=paste("Contig logcount",i), xlab='log raw count', col="maroon", breaks=100,sub=fullnames[i],cex=0.8,ylim=c(0,ymax)) } dev.off() par(defpar) - + } cumPlot = function(rawrs,cleanrs,maint,myTitle) { # updated to use ecdf pdfname = "Differential_rowsum_bar_charts.pdf" defpar = par(no.readonly=T) - lrs = log(rawrs,10) + lrs = log(rawrs,10) lim = max(lrs) pdf(pdfname) par(mfrow=c(2,1)) hist(lrs,breaks=100,main=paste('Before:',maint),xlab="# Reads (log)", ylab="Count",col="maroon",sub=myTitle, xlim=c(0,lim),las=1) grid(col="lightgray", lty="dotted") - lrs = log(cleanrs,10) + lrs = log(cleanrs,10) hist(lrs,breaks=100,main=paste('After:',maint),xlab="# Reads (log)", ylab="Count",col="maroon",sub=myTitle,xlim=c(0,lim),las=1) grid(col="lightgray", lty="dotted") @@ -344,7 +241,7 @@ if (! is.null(camres)) { rownames(camres) = g # gene set name camres = cbind(GeneSet=g,URL=u,camres) - if (camres\$Direction == "Up") + if (camres\$Direction == "Up") { upcam = rbind(upcam,camres) } else { downcam = rbind(downcam,camres) @@ -433,7 +330,7 @@ { upcam = rbind(upcam,camres) } else { downcam = rbind(downcam,camres) } - + } } uscam = upcam[order(upcam\$PValue),] @@ -454,12 +351,12 @@ } -edgeIt = function (Count_Matrix=c(),group=c(),out_edgeR=F,out_Voom=F,out_DESeq2=F,fdrtype='fdr',priordf=5, +edgeIt = function (Count_Matrix=c(),group=c(),out_edgeR=F,out_Voom=F,out_DESeq2=F,fdrtype='fdr',priordf=5, fdrthresh=0.05,outputdir='.', myTitle='Differential Counts',libSize=c(),useNDF=F, filterquantile=0.2, subjects=c(),TreatmentName="Rx",ControlName="Ctrl",mydesign=NULL, doDESeq2=T,doVoom=T,doCamera=T,doedgeR=T,org='hg19', histgmt="", bigmt="/data/genomes/gsea/3.1/Abetterchoice_nocgp_c2_c3_c5_symbols_all.gmt", - doCook=F,DESeq_fitType="parameteric",robust_meth='ordinary') + doCook=F,DESeq_fitType="parameteric",robust_meth='ordinary') { logf = file('Differential.log', open = "a") @@ -476,20 +373,20 @@ if (robust_meth == 'ordinary') { myDGEList = estimateGLMCommonDisp(myDGEList,mydesign) myDGEList = estimateGLMTrendedDisp(myDGEList,mydesign) - if (priordf > 0) { myDGEList = estimateGLMTagwiseDisp(myDGEList,mydesign,prior.df = priordf) + if (priordf > 0) { myDGEList = estimateGLMTagwiseDisp(myDGEList,mydesign,prior.df = priordf) } else { myDGEList = estimateGLMTagwiseDisp(myDGEList,mydesign) } comdisp = myDGEList\$common.dispersion estpriorn = getPriorN(myDGEList) print(paste("Common Dispersion =",comdisp,"CV = ",sqrt(comdisp),"getPriorN = ",estpriorn),quote=F) - } else { + } else { myDGEList = estimateGLMRobustDisp(myDGEList,design=mydesign, prior.df = priordf, maxit = 6, residual.type = robust_meth) } - - + + DGLM = glmFit(myDGEList,design=mydesign) DE = glmLRT(DGLM,coef=ncol(DGLM\$design)) # always last one - subject is first if needed normData = cpm(myDGEList) - uoutput = cbind( + uoutput = cbind( Name=as.character(rownames(myDGEList\$counts)), DE\$table, adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype), @@ -501,7 +398,7 @@ if (sum(goodness\$outlier) > 0) { print.noquote('GLM outliers:') print(paste(rownames(DGLM)[(goodness\$outlier)],collapse=','),quote=F) - } else { + } else { print('No GLM fit outlier genes found\n') } z = limma::zscoreGamma(goodness\$gof.statistic, shape=goodness\$df/2, scale=2) @@ -512,7 +409,7 @@ dev.off() uniqueg = unique(group) write.table(soutput,file=out_edgeR, quote=FALSE, sep="\t",row.names=F) - tt = cbind( + tt = cbind( Name=as.character(rownames(myDGEList)), DE\$table, adj.p.value=p.adjust(DE\$table\$PValue, method=fdrtype), @@ -610,13 +507,13 @@ run_Voom = function(workCM,pdata,subjects,group,mydesign,mt,out_Voom) { logf = file('VOOM.log', open = "a") - sink(logf,type = c("output", "message")) + sink(logf,type = c("output", "message")) if (doedgeR == F) { #### Setup myDGEList object myDGEList = DGEList(counts=workCM, group = group) myDGEList = calcNormFactors(myDGEList) myDGEList = estimateGLMCommonDisp(myDGEList,mydesign) - myDGEList = estimateGLMTrendedDisp(myDGEList,mydesign) + myDGEList = estimateGLMTrendedDisp(myDGEList,mydesign) myDGEList = estimateGLMTagwiseDisp(myDGEList,mydesign) } pdf(paste("VOOM",mt,"mean_variance_plot.pdf",sep='_')) @@ -653,12 +550,12 @@ q() } require(edgeR) - options(width = 512) + options(width = 512) mt = paste(unlist(strsplit(myTitle,'_')),collapse=" ") allN = nrow(Count_Matrix) nscut = round(ncol(Count_Matrix)/2) # half samples colTotmillionreads = colSums(Count_Matrix)/1e6 - counts.dataframe = as.data.frame(c()) + counts.dataframe = as.data.frame(c()) rawrs = rowSums(Count_Matrix) nonzerod = Count_Matrix[(rawrs > 0),] # remove all zero count genes nzN = nrow(nonzerod) @@ -676,7 +573,7 @@ meth = paste( "After removing",length(lo),"contigs with fewer than ",nscut," sample read counts >= 1 per million, there are",sep="") print(paste("Read",allN,"contigs. Removed",zN,"contigs with no reads.",meth,cleanN,"contigs"),quote=F) maint = paste('Filter >=1/million reads in >=',nscut,'samples') - } else { + } else { useme = (nzrs > quantile(nzrs,filterquantile)) workCM = nonzerod[useme,] lo = colSums(nonzerod[!useme,]) @@ -700,20 +597,20 @@ print("@@ using genecards substitution for urls") contigurls = paste0(genecards,allgenes,"\'>",allgenes,"") } - print.noquote(paste("@@ Total low count contigs per sample = ",paste(table(lo),collapse=','))) + print.noquote(paste("@@ Total low count contigs per sample = ",paste(table(lo),collapse=','))) cmrowsums = rowSums(workCM) TName=unique(group)[1] CName=unique(group)[2] if (is.null(mydesign)) { - if (length(subjects) == 0) + if (length(subjects) == 0) { mydesign = model.matrix(~group) - } - else { + } + else { subjf = factor(subjects) mydesign = model.matrix(~subjf+group) # we block on subject so make group last to simplify finding it } - } + } print.noquote(paste('Using samples:',paste(colnames(workCM),collapse=','))) print.noquote('Using design matrix:') print.noquote(mydesign) @@ -751,7 +648,7 @@ if ((doDESeq2==T) || (doVoom==T) || (doedgeR==T)) { if ((doVoom==T) && (doDESeq2==T) && (doedgeR==T)) { vennmain = paste(mt,'Voom,edgeR and DESeq2 overlap at FDR=',fdrthresh) - counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts, + counts.dataframe = data.frame(edgeR = edgeRcounts, DESeq2 = DESeqcounts, VOOM_limma = voomcounts, row.names = allgenes) } else if ((doDESeq2==T) && (doedgeR==T)) { vennmain = paste(mt,'DESeq2 and edgeR overlap at FDR=',fdrthresh) @@ -760,10 +657,10 @@ vennmain = paste(mt,'Voom and edgeR overlap at FDR=',fdrthresh) counts.dataframe = data.frame(edgeR = edgeRcounts, VOOM_limma = voomcounts, row.names = allgenes) } - + if (nrow(counts.dataframe > 1)) { counts.venn = vennCounts(counts.dataframe) - vennf = paste("Differential_venn",mt,"significant_genes_overlap.pdf",sep="_") + vennf = paste("Differential_venn",mt,"significant_genes_overlap.pdf",sep="_") pdf(vennf) vennDiagram(counts.venn,main=vennmain,col="maroon") dev.off() @@ -779,7 +676,7 @@ out_edgeR = F out_DESeq2 = F out_Voom = "$out_VOOM" -edgeR_robust_meth = "ordinary" +edgeR_robust_meth = "ordinary" doDESeq2 = $DESeq2.doDESeq2 doVoom = $doVoom doCamera = F @@ -799,7 +696,7 @@ #if $edgeR.doedgeR == "T": out_edgeR = "$out_edgeR" - edgeR_priordf = $edgeR.edgeR_priordf + edgeR_priordf = $edgeR.edgeR_priordf edgeR_robust_meth = "$edgeR.edgeR_robust_method" #end if @@ -827,7 +724,7 @@ subjects = unlist(sids) nsubj = length(subjects) TCols = as.numeric(strsplit(TreatmentCols,",")[[1]])-1 -CCols = as.numeric(strsplit(ControlCols,",")[[1]])-1 +CCols = as.numeric(strsplit(ControlCols,",")[[1]])-1 cat('Got TCols=') cat(TCols) cat('; CCols=') @@ -836,7 +733,7 @@ 0 & nsubj != nsamples) { @@ -852,9 +749,9 @@ islib = rn %in% c('librarySize','NotInBedRegions') LibSizes = Count_Matrix[subset(rn,islib),][1] # take first Count_Matrix = Count_Matrix[subset(rn,! islib),] -group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) ) +group = c(rep(TreatmentName,length(TCols)), rep(ControlName,length(CCols)) ) group = factor(group, levels=c(ControlName,TreatmentName)) -colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_") +colnames(Count_Matrix) = paste(group,colnames(Count_Matrix),sep="_") results = edgeIt(Count_Matrix=Count_Matrix,group=group, out_edgeR=out_edgeR, out_Voom=out_Voom, out_DESeq2=out_DESeq2, fdrtype='BH',mydesign=NULL,priordf=edgeR_priordf,fdrthresh=fdrthresh,outputdir='.', myTitle=myTitle,useNDF=F,libSize=c(),filterquantile=fQ,subjects=subjects,TreatmentName=TreatmentName,ControlName=ControlName, @@ -866,6 +763,109 @@ ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + edgeR['doedgeR'] == "T" + + + DESeq2['doDESeq2'] == "T" + + + doVoom == "T" + + + + + + + + + + + + + + + + + + + + + + + + **What it does** @@ -876,30 +876,30 @@ **Input** Requires a count matrix as a tabular file. These are best made using the companion HTSeq_ based counter Galaxy wrapper -and your fave gene model to generate inputs. Each row is a genomic feature (gene or exon eg) and each column the +and your fave gene model to generate inputs. Each row is a genomic feature (gene or exon eg) and each column the non-negative integer count of reads from one sample overlapping the feature. -The matrix must have a header row uniquely identifying the source samples, and unique row names in +The matrix must have a header row uniquely identifying the source samples, and unique row names in the first column. Typically the row names are gene symbols or probe ids for downstream use in GSEA and other methods. -They must be unique and R names or they will be mangled - please read the fine R docs for the rules on identifiers. +They must be unique and R names or they will be mangled - please read the fine R docs for the rules on identifiers. **Specifying comparisons** This is basically dumbed down for two factors - case vs control. -More complex interfaces are possible but painful at present. +More complex interfaces are possible but painful at present. Probably need to specify a phenotype file to do this better. Work in progress. Send code. If you have (eg) paired samples and wish to include a term in the GLM to account for some other factor (subject in the case of paired samples), -put a comma separated list of indicators for every sample (whether modelled or not!) indicating (eg) the subject number or +put a comma separated list of indicators for every sample (whether modelled or not!) indicating (eg) the subject number or A list of integers, one for each subject or an empty string if samples are all independent. If not empty, there must be exactly as many integers in the supplied integer list as there are columns (samples) in the count matrix. Integers for samples that are not in the analysis *must* be present in the string as filler even if not used. So if you have 2 pairs out of 6 samples, you need to put in unique integers for the unpaired ones eg if you had 6 samples with the first two independent but the second and third pairs each being from independent subjects. you might use -8,9,1,1,2,2 +8,9,1,1,2,2 as subject IDs to indicate two paired samples from the same subject in columns 3/4 and 5/6 **Methods available** @@ -916,7 +916,7 @@ **Outputs** -Some helpful plots and analysis results. Note that most of these are produced using R code +Some helpful plots and analysis results. Note that most of these are produced using R code suggested by the excellent documentation and vignettes for the Bioconductor packages invoked. The Tool Factory is used to automatically lay these out for you to enjoy. @@ -961,10 +961,10 @@ ***old rant on changes to Bioconductor package variable names between versions*** -The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue) -breaking this and all other code that assumed the old name for this variable, -between edgeR2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing). -This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing +The edgeR authors made a small cosmetic change in the name of one important variable (from p.value to PValue) +breaking this and all other code that assumed the old name for this variable, +between edgeR2.4.4 and 2.4.6 (the version for R 2.14 as at the time of writing). +This means that all code using edgeR is sensitive to the version. I think this was a very unwise thing to do because it wasted hours of my time to track down and will similarly cost other edgeR users dearly when their old scripts break. This tool currently now works with 2.4.6. @@ -974,19 +974,19 @@ *prior.n* -The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion. -You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood -in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your -tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the +The value for prior.n determines the amount of smoothing of tagwise dispersions towards the common dispersion. +You can think of it as like a "weight" for the common value. (It is actually the weight for the common likelihood +in the weighted likelihood equation). The larger the value for prior.n, the more smoothing, i.e. the closer your +tagwise dispersion estimates will be to the common dispersion. If you use a prior.n of 1, then that gives the common likelihood the weight of one observation. -In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value, -or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that -you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation -(squeezing) of the tagwise dispersions. How many samples do you have in your experiment? -What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10. -If you have more samples, then the tagwise dispersion estimates will be more reliable, -so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5. +In answer to your question, it is a good thing to squeeze the tagwise dispersions towards a common value, +or else you will be using very unreliable estimates of the dispersion. I would not recommend using the value that +you obtained from estimateSmoothing()---this is far too small and would result in virtually no moderation +(squeezing) of the tagwise dispersions. How many samples do you have in your experiment? +What is the experimental design? If you have few samples (less than 6) then I would suggest a prior.n of at least 10. +If you have more samples, then the tagwise dispersion estimates will be more reliable, +so you could consider using a smaller prior.n, although I would hesitate to use a prior.n less than 5. From Bioconductor Digest, Vol 118, Issue 5, Gordon writes: @@ -1023,17 +1023,17 @@ **Attributions** -edgeR - edgeR_ +edgeR - edgeR_ -VOOM/limma - limma_VOOM_ +VOOM/limma - limma_VOOM_ DESeq2 - DESeq2_ for details See above for Bioconductor package documentation for packages exposed in Galaxy by this tool and app store package. -Galaxy_ (that's what you are using right now!) for gluing everything together +Galaxy_ (that's what you are using right now!) for gluing everything together -Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is +Otherwise, all code and documentation comprising this tool was written by Ross Lazarus and is licensed to you under the LGPL_ like other rgenetics artefacts .. _LGPL: http://www.gnu.org/copyleft/lesser.html