Mercurial > repos > vladimir-daric > ebio_deseq
view function_deseq.R @ 0:8c5de60b3c04 draft
Uploaded
author | vladimir-daric |
---|---|
date | Fri, 25 Apr 2014 05:05:39 -0400 |
parents | |
children |
line wrap: on
line source
# Home made violin plot (Alban Ott) vioplot = function(x, DensCol="#00000060", BoxplotCol="#000000FF", InfFactorSpace=0.4,BoxplotFill="#FFFFFF00", YLAB="log10(normalized count)"){ #x must be a matrix or a dataframe # Number of samples : SampleNumber=length(colnames(x)) if(is.vector(x)){ x=as.matrix(x) } YNormalizedDensity=list() XNormalizedDensity=list() #Set a value to -Inf xNoInfo=x[x!=-Inf] OverallMinMax=c(min(xNoInfo),max(xNoInfo)) range=OverallMinMax[2]-OverallMinMax[1] Inf2Value=OverallMinMax[1]-range*InfFactorSpace breakInf=OverallMinMax[1]-range*(InfFactorSpace-.1) xValInf=x IsInf=xValInf==-Inf if(sum(IsInf)>0){ xValInf[xValInf==-Inf]=Inf2Value } #Pre compute densities for (i in 1:dim(xValInf)[2]){ dens=density(xValInf[,i]) YNormalizedDensity[[i]]=dens$x #Normalize density to fit boxplot XNormalizedDensity[[i]]=dens$y/max(dens$y)/5 } #Print boxplot with a range compatible with density plot if(sum(IsInf)>0){ ylim=c(OverallMinMax[1]-range*(InfFactorSpace+0.1),OverallMinMax[2]+range*0.1) if (SampleNumber > 8) { boxplot(x, boxwex=0.2, ylim=ylim, border=BoxplotCol, col=BoxplotFill, notch=T, yaxt="n", las=2) }else { boxplot(x, boxwex=0.2, ylim=ylim, border=BoxplotCol, col=BoxplotFill, notch=T, yaxt="n") } AxisTick=seq(floor(OverallMinMax[1]),round(OverallMinMax[2]),length.out=round(OverallMinMax[2])-floor(OverallMinMax[1])+1) axis(2,at=c(Inf2Value,AxisTick),c("-Inf",AxisTick),las=1) axis.break(2,breakInf,style="slash") }else{ if (SampleNumber > 8){ boxplot(x, boxwex=0.2, border=BoxplotCol, col=BoxplotFill, notch=T, las=2) }else { boxplot(x, boxwex=0.2, border=BoxplotCol, col=BoxplotFill, notch=T) } } mtext(side = 2, YLAB, line = 2) #Plot densities for (i in 1:dim(x)[2]){ points(i+XNormalizedDensity[[i]], YNormalizedDensity[[i]], type="l", lwd=3, col=DensCol) points(i-XNormalizedDensity[[i]], YNormalizedDensity[[i]], type="l", lwd=3, col=DensCol) if(sum(IsInf)>0){ points(i,Inf2Value,pch=8,col=DensCol,lwd=3) } } } ####################################################### # Histogram of p-value. Use adjusted p-value if # there are replicats ####################################################### PlotHistPval = function(res, rep, file, ImageSize=1960, PPI=200){ png( file=file , width=ImageSize, height=ImageSize, res=PPI ) if (rep){ hist(res$padj, breaks=100, col="black", border="black", main="",ylab="Number of elements (e.g. : genes)",xlab="adjusted p-value") }else{ hist(res$pval, breaks=100, col="black", border="black", main="",ylab="Number of elements (e.g. : genes)",xlab="raw p-value") } invisible(dev.off()) } ####################################################### # Plot violinplot of (un)normalized counts ####################################################### PlotViolin = function(cds, norm, file, ImageSize=1960, PPI=200, logBase=10){ png( file=file , width=ImageSize, height=ImageSize, res=PPI ) ## probleme avec les +/-inf NormCount = log(counts( cds, normalized=norm ), base=logBase) ## get correct ylab if ( norm ){ YLAB=paste("log",logBase,"(normalized count)",sep="") }else{ YLAB=paste("log",logBase,"(raw count)",sep="") } color_cond1 = colorRampPalette(brewer.pal(3, "Purples"))(sum(conditions(cds)==levels(conditions(cds))[1])) color_cond2 = colorRampPalette(brewer.pal(3, "RdPu"))(sum(conditions(cds)==levels(conditions(cds))[2])) colors = c(as.vector(color_cond1),as.vector(color_cond2)) labels = colnames( NormCount ) vioplot(NormCount,BoxplotFill=colors, YLAB=YLAB) invisible(dev.off()) } ############################################################## # Plot volcano with density plot of foldchange and pvalues ############################################################## PlotVolcano = function (res, cond1,cond2 ,alpha, fold, rep, file, ImageSize=1960,logBase=10, InfFactorSpace=0.4,PPI=200){ png( file=file , width=ImageSize, height=ImageSize ,res=PPI ) if ( rep ){ pvalCol=8 YLAB=paste("-log",logBase,"(adjusted p-value)",sep="") }else{ pvalCol=7 YLAB=paste("-log",logBase,"(raw p-value)",sep="") } ThreslogAlpha = -log(alpha, logBase) ThreslogFC = log(fold, logBase) logPVal = -log(res[,pvalCol], logBase) logFC=log(res$foldChange, logBase) #Set values to -/+Inf MinMaxlogFC=summary(logFC[abs(logFC)!=Inf])[c("Min.","Max.")] range=MinMaxlogFC[2]-MinMaxlogFC[1] Inf2Value=c(MinMaxlogFC[1]-range*InfFactorSpace, MinMaxlogFC[2]+range*InfFactorSpace) breakInf=c(MinMaxlogFC[1]-range*(InfFactorSpace-0.1), MinMaxlogFC[2]+range*(InfFactorSpace-0.1)) logFC[logFC==-Inf]=Inf2Value[1] logFC[logFC== Inf]=Inf2Value[2] #Precompute densities logFCDens=density(logFC) logPvalDens=density(logPVal) tmp=logPvalDens$x logPvalDens$x=logPvalDens$y logPvalDens$y=tmp #Compute xlim and ylim to align graphics marOutside=0.05 XLIM=c(min(logFCDens$x)-(max(logFCDens$x)-min(logFCDens$x))*marOutside,max(logFCDens$x)+(max(logFCDens$x)-min(logFCDens$x))*marOutside) YLIM=c(min(logPvalDens$y)-(max(logPvalDens$y)-min(logPvalDens$y))*marOutside,max(logPvalDens$y)+(max(logPvalDens$y)-min(logPvalDens$y))*marOutside) plot.new() split.screen(rbind( c(0,0.7,0,0.7), c(0,0.7,0.7,1), c(0.7,1,0,0.7) )) screen(1) # c(bottom, left, top, right) default : par(mar=c(5,6,3,4)+.1) par(mar=c(5,6,0,0)+.1) # volcano plot VolcanoCol=ifelse((log(res$foldChange, logBase) > ThreslogFC | log(res$foldChange, logBase) < -ThreslogFC)&(logPVal>ThreslogAlpha), "red","black") plot(logFC,logPVal,pch=20,cex=0.3,xlim=XLIM,ylim=YLIM,xaxt="n",col=VolcanoCol,xlab=paste("log",logBase,"(FoldChange ",paste(cond2,cond1,sep="/"),")",sep=""),ylab=YLAB,bty="l") AxisTick=seq(floor(MinMaxlogFC[1]),round(MinMaxlogFC[2]),length.out=round(MinMaxlogFC[2])-floor(MinMaxlogFC[1])+1) axis(1,at=c(Inf2Value[1],AxisTick,Inf2Value[2]),c("-Inf",AxisTick,"+Inf"),las=1) axis.break(1,breakInf[1],style="slash") axis.break(1,breakInf[2],style="slash") #pval threshold dashed line abline(h=ThreslogAlpha,col="red", lty=2, lwd=2 ) screen(2) par(mar=c(0,6,3,0)+.1) plot(logFCDens, col="black",main ="",xaxt="n",bty="n",ylab=paste("Density of\nlog",logBase,"(FoldChange)",sep=""),xlab="",xlim=XLIM) screen(3) par(mar=c(5,0,0,4)+.1) plot(logPvalDens,main="", col="black",yaxt="n",bty="n",xlab=paste("Density of\n",YLAB,sep=""),ylab="",ylim=YLIM) invisible(dev.off()) } ####################################################### # Plot mean of normalized counts vs log2 of foldchange ####################################################### MAPlot = function(res, OutPrefix, file, ImageSize=1960, PPI=200){ png( file=file , width=ImageSize, height=ImageSize, res=PPI ) plotMA(res) invisible(dev.off()) } ####################################################### # Plot estimation of dispersion ####################################################### EstDispPlot = function(cds, method, file, ImageSize=1960, PPI=200){ png( file=file , width=ImageSize, height=ImageSize, res=PPI ) if (method == "per-condition"){ #print(ls( cds@fitInfo)) par(mfcol=c(1,2)) for (cond in (ls(cds@fitInfo))){ plotDispEsts(cds, name=cond,main=paste("Estimate dispersion plot for ",cond,sep="")) } }else{ plotDispEsts(cds) } invisible(dev.off()) } ####################################################### # Histogram of null counts proportion per condition ####################################################### NullCondPlot = function(cds, file ,ImageSize=1960, PPI=200){ png( file=file , width=ImageSize/2, height=ImageSize, res=PPI ) CountsNorm_cond=list() CountsNull_cond=list() ## for each condition for (cond in rev(levels(conditions(cds)))){ # we count number of feature with no count CountsNorm_cond[[cond]]=counts( cds, normalized=TRUE )[ , conditions(cds)==cond ] # if there are no replicate dim is null and return R error if (is.null(dim(CountsNorm_cond[[cond]]))) { CountsNull_cond[[cond]]=sum(CountsNorm_cond[[cond]] == 0)/length(CountsNorm_cond[[cond]]) } else { CountsNull_cond[[cond]]=sum(apply(CountsNorm_cond[[cond]],1,sum) == 0)/nrow(CountsNorm_cond[[cond]]) } } # we plot number of feature with no count colors = c("#756bb1","#fa9fb5") barplot(as.numeric(CountsNull_cond), ylab="Proportion of null count elements (e.g. : genes)", names.arg = names(CountsNull_cond), col=colors, ylim=c(0,1)) invisible(dev.off()) } ####################################################### # Histogram of total reads count per sample ####################################################### TotReadSamplePlot = function(cds, file ,ImageSize=1960, PPI=200){ png( file=file , width=ImageSize, height=ImageSize, res=PPI ) RawCount = counts( cds, normalized=FALSE) ## choice of x colors according to number of replicate in Ramp palette color_cond1 = colorRampPalette(brewer.pal(3, "Purples"))(sum(conditions(cds)==levels(conditions(cds))[1])) color_cond2 = colorRampPalette(brewer.pal(3, "RdPu"))(sum(conditions(cds)==levels(conditions(cds))[2])) colors = c(as.vector(color_cond1),as.vector(color_cond2)) #if there are many replicates, x labels are drawn vertically if (length(sampleNames(cds)) > 8){ barplot(colSums(RawCount), ylab="", col=colors, ylim=c(0,max(colSums(RawCount))*1.2 ), las=2, yaxt="n") ## As there are millions of reads, y labels are drawn horizontally axis(2, las=0) mtext("Total Read Count per Sample", side=2, line=2.5) } else { barplot(colSums(RawCount), ylab="", names.arg = colnames(RawCount), col=colors, ylim=c(0,max(colSums(RawCount))*1.2 ) ) ## As there are millions of reads, y labels are drawn horizontally axis(2, las=0) mtext("Total Read Count per Sample", side=2, line=2.5) } invisible(dev.off()) } ################################################################### # heatmap of expression data of the 30 most highly expressed genes ################################################################### HeatCountPlot=function(cds, vsd, file, geneNumber=30, ImageSize=1960, PPI=200){ png( file=file , width=ImageSize, height=ImageSize, res=PPI ) select = order(rowMeans(counts(cds)), decreasing=TRUE)[1:geneNumber] hmcol = colorRampPalette(brewer.pal(9, "GnBu"))(100) heatmap.2(exprs(vsd)[select,], col = hmcol, trace="none", margin=c(10, 6)) invisible(dev.off()) } ####################################################### # Heatmap of Euclidean distances between the samples ####################################################### HeatDistPlot=function(vsd, cdsBlind, file, ImageSize=1960, PPI=200){ png( file=file , width=ImageSize, height=ImageSize, res=PPI ) dists = dist(t(exprs(vsd))) mat = as.matrix(dists) rownames(mat) = colnames(mat) = with(pData(cdsBlind), paste(condition,sampleNames(cdsBlind), sep=" : ")) hmcol = colorRampPalette(brewer.pal(9, "GnBu"))(100) heatmap.2(mat, trace="none", col = rev(hmcol), margin=c(13, 13)) invisible(dev.off()) } ####################################################### # Export up and down genes according to p-value and # Foldchange, and export complete results ####################################################### WriteResults = function (res, rawCounts, normCounts, cond1, cond2, alpha, foldchange, rep, OutPrefix, logBase=10){ ## re-compute logBase(FC) passed in argument res[,6] = log(res[,5],base=logBase) ## get results not failed pvalue threshold if (rep == TRUE){ res_sig = res[res$padj < alpha,] }else{ res_sig = res[res$pval < alpha,] } # select up genes up = res_sig[res_sig$foldChange > foldchange,] up = up[rev(order(up$foldChange)),] # select down genes down = res_sig[res_sig$foldChange < 1/foldchange,] down = down[order(down$foldChange),] # change any column names for more comprehension names = colnames(res) names[3] = paste("baseMean_",cond1,sep="") names[4] = paste("baseMean_",cond2,sep="") names[5] = paste("foldChange_",paste(cond2,cond1,sep="/"),sep="") names[6] = paste("log",logBase,"FoldChange",sep="") colnames(res) = names colnames(up) = names colnames(down) = names # export up genes write.table( format(up,scientific=FALSE,digits=3,dec=','), file=paste(OutPrefix,"_up_genes.csv",sep=""), sep="\t", row.names=F, quote=F) # export down genes write.table( format(down,scientific=FALSE,digits=3,dec=','), file=paste(OutPrefix,"_down_genes.csv",sep=""), sep="\t", row.names=F, quote=F) #rename col names of raw counts names = NULL for(name in colnames(rawCounts)){ names = c(names,paste("raw_",name,sep="")) } colnames(rawCounts) = names #rename col names of normalized counts names = NULL for(name in colnames(normCounts)){ names = c(names,paste("norm_",name,sep="")) } colnames(normCounts) = names ## merge raw and normalized counts counts = merge(rawCounts, normCounts,by="row.names",all=T) all_results = data.frame(res$id, counts[,2:ncol(counts)], res[,3:ncol(res)]) colnames(all_results)[1] <- "ID" ## export complete results write.table( format(all_results,scientific=FALSE,digits=3,dec=','), file=paste(OutPrefix,"_complete.csv",sep=""), sep="\t", row.names=F, quote=F) } ####################################################### # create a raw count matrix from argument's sample ####################################################### CreateCountMatrix = function(sample, header, rep){ #Read the HTseq-count files dans concatenate them in different columns. names = sample$files rawCounts=data.frame(Id=character(),stringsAsFactors=T) for (i in 1:length(names)){ tmp = read.table(names[i], sep="\t", header=header, as.is=T,col.names=c("Id", sample$replicate[i]))#read rawCounts = merge(rawCounts, tmp, by="Id", all=T)#concatenate } colnames(rawCounts) = c("Id", sample$replicate)#rename rawCounts[is.na(rawCounts)] = 0 #Clean up the HTseq-count from overview lines row2remove = c("alignment_not_unique", "ambiguous", "no_feature", "not_aligned", "too_low_aQual") rawCounts = rawCounts[!rawCounts$Id %in% row2remove,] rawCounts[is.na(rawCounts)] = 0 #convert Id columns to row names rownames(rawCounts)=rawCounts[,1] rawCounts=rawCounts[,-1] return(rawCounts) } ####################################################### # Launch DE analysis with(out) replicates ####################################################### # parameters: # method: how samples are pooled to estimate dispersion. If no replicates use "blind" # sharingMode: how variance estimate is computed with respect to the fitted line. # "Maximum" is the most conservative (takes care of outliers, i.e., genes with dispersion much larger than the fitted values), # "fit-only" keeps the estimated value. Use this only with very few replicates, and when you are not too concerned about false positives from dispersion outliers, i.e. genes with an unusually high variability. # "gene-est-only" : No fitting or sharing, use only the empirical value. #sharingMode = c( "maximum", "fit-only", "gene-est-only" ) # fitType: refers to the model. "Local" is the published model, "parametric" is glm-based (may not converge) #in this case, without replicates #fitType = c("parametric", "local") DE_analysis = function(countTable, design, rep, mode="maximum", method="pooled", fitType="parametric"){ cds = newCountDataSet( countTable, design$condition) cds = estimateSizeFactors( cds ) if (rep){ cds = estimateDispersions( cds, fitType=fitType, sharingMode=mode, method=method) # If there are no replicates, we force Mode and method as described in DESeq manual } else { cds = estimateDispersions( cds, fitType=fitType, sharingMode="fit-only", method="blind") } return(cds) }