view function_deseq.R @ 0:8c5de60b3c04 draft

Uploaded
author vladimir-daric
date Fri, 25 Apr 2014 05:05:39 -0400
parents
children
line wrap: on
line source

# Home made violin plot (Alban Ott)

vioplot = function(x, DensCol="#00000060", BoxplotCol="#000000FF", InfFactorSpace=0.4,BoxplotFill="#FFFFFF00", YLAB="log10(normalized count)"){ #x must be a matrix or a dataframe
	
	# Number of samples :
	SampleNumber=length(colnames(x))
	
	if(is.vector(x)){
		x=as.matrix(x)
	}
	YNormalizedDensity=list()
	XNormalizedDensity=list()
	
	#Set a value to -Inf
	xNoInfo=x[x!=-Inf]
	OverallMinMax=c(min(xNoInfo),max(xNoInfo))
	range=OverallMinMax[2]-OverallMinMax[1]
	Inf2Value=OverallMinMax[1]-range*InfFactorSpace
	breakInf=OverallMinMax[1]-range*(InfFactorSpace-.1)
	xValInf=x
	IsInf=xValInf==-Inf
	if(sum(IsInf)>0){
		xValInf[xValInf==-Inf]=Inf2Value
	}
	#Pre compute densities
	for (i in 1:dim(xValInf)[2]){
		dens=density(xValInf[,i])
		YNormalizedDensity[[i]]=dens$x
		#Normalize density to fit boxplot
		XNormalizedDensity[[i]]=dens$y/max(dens$y)/5
	}
	
	#Print boxplot with a range compatible with density plot
	if(sum(IsInf)>0){
		ylim=c(OverallMinMax[1]-range*(InfFactorSpace+0.1),OverallMinMax[2]+range*0.1)
		if (SampleNumber > 8) {
			boxplot(x, boxwex=0.2, ylim=ylim, border=BoxplotCol, col=BoxplotFill, notch=T, yaxt="n", las=2)
		}else {
			boxplot(x, boxwex=0.2, ylim=ylim, border=BoxplotCol, col=BoxplotFill, notch=T, yaxt="n")	
		}
		AxisTick=seq(floor(OverallMinMax[1]),round(OverallMinMax[2]),length.out=round(OverallMinMax[2])-floor(OverallMinMax[1])+1)
		axis(2,at=c(Inf2Value,AxisTick),c("-Inf",AxisTick),las=1)
		axis.break(2,breakInf,style="slash")

	}else{
		if (SampleNumber > 8){
			boxplot(x, boxwex=0.2, border=BoxplotCol, col=BoxplotFill, notch=T, las=2)
		}else {
			boxplot(x, boxwex=0.2, border=BoxplotCol, col=BoxplotFill, notch=T)
		}
		
	}
	mtext(side = 2, YLAB, line = 2)	
	#Plot densities
	for (i in 1:dim(x)[2]){
		points(i+XNormalizedDensity[[i]], YNormalizedDensity[[i]], type="l", lwd=3, col=DensCol)
		points(i-XNormalizedDensity[[i]], YNormalizedDensity[[i]], type="l", lwd=3, col=DensCol)
		if(sum(IsInf)>0){
			points(i,Inf2Value,pch=8,col=DensCol,lwd=3)
		}
	}
}



#######################################################
# Histogram of p-value. Use adjusted p-value if
#             there are replicats
#######################################################


PlotHistPval = function(res, rep, file, ImageSize=1960, PPI=200){
	
	png( file=file , width=ImageSize, height=ImageSize, res=PPI )
	if (rep){
		hist(res$padj, breaks=100, col="black", border="black", main="",ylab="Number of elements (e.g. : genes)",xlab="adjusted p-value")
	}else{
		hist(res$pval, breaks=100, col="black", border="black", main="",ylab="Number of elements (e.g. : genes)",xlab="raw p-value")
	}
	invisible(dev.off())
	
}



#######################################################
# Plot violinplot of (un)normalized counts
#######################################################

PlotViolin = function(cds, norm, file, ImageSize=1960, PPI=200, logBase=10){
	
	png( file=file , width=ImageSize, height=ImageSize, res=PPI )
	## probleme avec les +/-inf
	NormCount = log(counts( cds, normalized=norm ), base=logBase)
	## get correct ylab
	if ( norm ){
		YLAB=paste("log",logBase,"(normalized count)",sep="")
	}else{
		YLAB=paste("log",logBase,"(raw count)",sep="")
	}	
	color_cond1 = colorRampPalette(brewer.pal(3, "Purples"))(sum(conditions(cds)==levels(conditions(cds))[1]))
	color_cond2 = colorRampPalette(brewer.pal(3, "RdPu"))(sum(conditions(cds)==levels(conditions(cds))[2]))
	colors = c(as.vector(color_cond1),as.vector(color_cond2))
	labels = colnames( NormCount )

	vioplot(NormCount,BoxplotFill=colors, YLAB=YLAB)	
	
	invisible(dev.off())

	
}

##############################################################
# Plot volcano with density plot of foldchange and pvalues
##############################################################

PlotVolcano = function (res, cond1,cond2 ,alpha, fold, rep, file, ImageSize=1960,logBase=10, InfFactorSpace=0.4,PPI=200){
	
	png( file=file , width=ImageSize, height=ImageSize ,res=PPI )
	
	
	if ( rep ){
		pvalCol=8
		YLAB=paste("-log",logBase,"(adjusted p-value)",sep="")
	}else{
		pvalCol=7
		YLAB=paste("-log",logBase,"(raw p-value)",sep="")
	}
	
	ThreslogAlpha = -log(alpha, logBase)
	ThreslogFC = log(fold, logBase)
	logPVal = -log(res[,pvalCol], logBase)
	logFC=log(res$foldChange, logBase)
	
	#Set values to -/+Inf
	MinMaxlogFC=summary(logFC[abs(logFC)!=Inf])[c("Min.","Max.")]
	range=MinMaxlogFC[2]-MinMaxlogFC[1]
	Inf2Value=c(MinMaxlogFC[1]-range*InfFactorSpace, MinMaxlogFC[2]+range*InfFactorSpace)
	breakInf=c(MinMaxlogFC[1]-range*(InfFactorSpace-0.1), MinMaxlogFC[2]+range*(InfFactorSpace-0.1))
	logFC[logFC==-Inf]=Inf2Value[1]
	logFC[logFC== Inf]=Inf2Value[2]
	
	
	#Precompute densities
	logFCDens=density(logFC)
	logPvalDens=density(logPVal)	
	tmp=logPvalDens$x
	logPvalDens$x=logPvalDens$y
	logPvalDens$y=tmp
	#Compute xlim and ylim to align graphics
	marOutside=0.05
	XLIM=c(min(logFCDens$x)-(max(logFCDens$x)-min(logFCDens$x))*marOutside,max(logFCDens$x)+(max(logFCDens$x)-min(logFCDens$x))*marOutside)
	YLIM=c(min(logPvalDens$y)-(max(logPvalDens$y)-min(logPvalDens$y))*marOutside,max(logPvalDens$y)+(max(logPvalDens$y)-min(logPvalDens$y))*marOutside)
	
	
	plot.new()
	split.screen(rbind(
					c(0,0.7,0,0.7),
					c(0,0.7,0.7,1),
					c(0.7,1,0,0.7)
			))
	screen(1)
	# c(bottom, left, top, right) default : par(mar=c(5,6,3,4)+.1)
	par(mar=c(5,6,0,0)+.1)
	# volcano plot
	VolcanoCol=ifelse((log(res$foldChange, logBase) > ThreslogFC  | log(res$foldChange, logBase) < -ThreslogFC)&(logPVal>ThreslogAlpha), "red","black")
	plot(logFC,logPVal,pch=20,cex=0.3,xlim=XLIM,ylim=YLIM,xaxt="n",col=VolcanoCol,xlab=paste("log",logBase,"(FoldChange ",paste(cond2,cond1,sep="/"),")",sep=""),ylab=YLAB,bty="l")
	
	AxisTick=seq(floor(MinMaxlogFC[1]),round(MinMaxlogFC[2]),length.out=round(MinMaxlogFC[2])-floor(MinMaxlogFC[1])+1)
	axis(1,at=c(Inf2Value[1],AxisTick,Inf2Value[2]),c("-Inf",AxisTick,"+Inf"),las=1)
	axis.break(1,breakInf[1],style="slash")
	axis.break(1,breakInf[2],style="slash")
	
	#pval threshold dashed line
	abline(h=ThreslogAlpha,col="red", lty=2, lwd=2 )
	
	screen(2)
	par(mar=c(0,6,3,0)+.1)
	plot(logFCDens, col="black",main ="",xaxt="n",bty="n",ylab=paste("Density of\nlog",logBase,"(FoldChange)",sep=""),xlab="",xlim=XLIM)
	
	screen(3)
	par(mar=c(5,0,0,4)+.1)
	plot(logPvalDens,main="", col="black",yaxt="n",bty="n",xlab=paste("Density of\n",YLAB,sep=""),ylab="",ylim=YLIM)
	
	invisible(dev.off())
	
}

#######################################################
# Plot mean of normalized counts vs log2 of foldchange
#######################################################

MAPlot = function(res, OutPrefix, file, ImageSize=1960, PPI=200){
	
	png( file=file , width=ImageSize, height=ImageSize, res=PPI )
	plotMA(res)
	
	invisible(dev.off())
}


#######################################################
# Plot estimation of dispersion
#######################################################


EstDispPlot = function(cds, method, file, ImageSize=1960, PPI=200){
	
	png( file=file , width=ImageSize, height=ImageSize, res=PPI )
	
	if (method == "per-condition"){
		
		#print(ls( cds@fitInfo))
		par(mfcol=c(1,2))
		for (cond in (ls(cds@fitInfo))){
			plotDispEsts(cds, name=cond,main=paste("Estimate dispersion plot for ",cond,sep=""))		
		}
		
	}else{
		plotDispEsts(cds)
	}
	
	
	invisible(dev.off())
}

#######################################################
# Histogram of null counts proportion per condition
#######################################################


NullCondPlot = function(cds, file ,ImageSize=1960, PPI=200){

	png( file=file , width=ImageSize/2, height=ImageSize, res=PPI )
	
	
	CountsNorm_cond=list()
	CountsNull_cond=list()
	## for each condition 
	for (cond in rev(levels(conditions(cds)))){
		# we count number of feature with no count
		CountsNorm_cond[[cond]]=counts( cds, normalized=TRUE )[ , conditions(cds)==cond ]
		# if there are no replicate dim is null and return R error
		if (is.null(dim(CountsNorm_cond[[cond]]))) {
			CountsNull_cond[[cond]]=sum(CountsNorm_cond[[cond]] == 0)/length(CountsNorm_cond[[cond]])
		}
		else {
			CountsNull_cond[[cond]]=sum(apply(CountsNorm_cond[[cond]],1,sum) == 0)/nrow(CountsNorm_cond[[cond]])
		}

	}
	# we plot number of feature with no count
	colors = c("#756bb1","#fa9fb5")
	barplot(as.numeric(CountsNull_cond), ylab="Proportion of null count elements (e.g. : genes)", names.arg = names(CountsNull_cond), col=colors, ylim=c(0,1))
	
	invisible(dev.off())
}


#######################################################
# Histogram of total reads count per sample
#######################################################


TotReadSamplePlot = function(cds, file ,ImageSize=1960, PPI=200){

	png( file=file , width=ImageSize, height=ImageSize, res=PPI )
	
	RawCount = counts( cds, normalized=FALSE)
	## choice of x colors according to number of replicate in Ramp palette
	color_cond1 = colorRampPalette(brewer.pal(3, "Purples"))(sum(conditions(cds)==levels(conditions(cds))[1]))
	color_cond2 = colorRampPalette(brewer.pal(3, "RdPu"))(sum(conditions(cds)==levels(conditions(cds))[2]))
	colors = c(as.vector(color_cond1),as.vector(color_cond2))
	#if there are many replicates, x labels are drawn vertically
	if (length(sampleNames(cds)) > 8){
		barplot(colSums(RawCount), ylab="", col=colors, ylim=c(0,max(colSums(RawCount))*1.2 ), las=2, yaxt="n")
		## As there are millions of reads, y labels are drawn horizontally
		axis(2, las=0)
		mtext("Total Read Count per Sample", side=2, line=2.5)
	}
	else {
		barplot(colSums(RawCount), ylab="", names.arg = colnames(RawCount), col=colors, ylim=c(0,max(colSums(RawCount))*1.2 ) )
		## As there are millions of reads, y labels are drawn horizontally
		axis(2, las=0)
		mtext("Total Read Count per Sample", side=2, line=2.5)
	}
	invisible(dev.off())
}


###################################################################
# heatmap of expression data of the 30 most highly expressed genes
###################################################################

HeatCountPlot=function(cds, vsd, file, geneNumber=30, ImageSize=1960, PPI=200){
	
	png( file=file , width=ImageSize, height=ImageSize, res=PPI )
	
	select = order(rowMeans(counts(cds)), decreasing=TRUE)[1:geneNumber]
	hmcol = colorRampPalette(brewer.pal(9, "GnBu"))(100)
	heatmap.2(exprs(vsd)[select,], col = hmcol, trace="none", margin=c(10, 6))
	
	invisible(dev.off())
}

#######################################################
# Heatmap of Euclidean distances between the samples
#######################################################

HeatDistPlot=function(vsd, cdsBlind, file, ImageSize=1960, PPI=200){
	
	png( file=file , width=ImageSize, height=ImageSize, res=PPI )
	
	dists = dist(t(exprs(vsd)))
	mat = as.matrix(dists)
	rownames(mat) = colnames(mat) = with(pData(cdsBlind), paste(condition,sampleNames(cdsBlind), sep=" : "))
	hmcol = colorRampPalette(brewer.pal(9, "GnBu"))(100)
	heatmap.2(mat, trace="none", col = rev(hmcol), margin=c(13, 13))
	
	invisible(dev.off())
}

#######################################################
# Export up and down genes according to p-value and 
#     Foldchange, and export complete results
#######################################################


WriteResults = function (res, rawCounts, normCounts, cond1, cond2, alpha, foldchange, rep, OutPrefix, logBase=10){
	
	## re-compute logBase(FC) passed in argument
	res[,6] = log(res[,5],base=logBase)
	
	## get results not failed pvalue threshold
	if (rep == TRUE){
		res_sig = res[res$padj < alpha,]
	}else{
		res_sig = res[res$pval < alpha,]
	}
	
	# select up genes
	up = res_sig[res_sig$foldChange > foldchange,]
	up = up[rev(order(up$foldChange)),]
	
	# select down genes
	down = res_sig[res_sig$foldChange < 1/foldchange,]
	down = down[order(down$foldChange),]
	
	# change any column names for more comprehension
	names = colnames(res)
	names[3] = paste("baseMean_",cond1,sep="")
	names[4] = paste("baseMean_",cond2,sep="")
	names[5] = paste("foldChange_",paste(cond2,cond1,sep="/"),sep="")
	names[6] = paste("log",logBase,"FoldChange",sep="")
	colnames(res) = names
	colnames(up) = names
	colnames(down) = names
	
	# export up genes
	write.table( format(up,scientific=FALSE,digits=3,dec=','), file=paste(OutPrefix,"_up_genes.csv",sep=""), sep="\t", row.names=F, quote=F)
	# export down genes
	write.table( format(down,scientific=FALSE,digits=3,dec=','), file=paste(OutPrefix,"_down_genes.csv",sep=""), sep="\t", row.names=F, quote=F)
	
	#rename col names of raw counts
	names = NULL
	for(name in colnames(rawCounts)){
		names = c(names,paste("raw_",name,sep=""))
	} 
	colnames(rawCounts) = names
	
	#rename col names of normalized counts
	names = NULL
	for(name in colnames(normCounts)){
		names = c(names,paste("norm_",name,sep=""))
	} 
	colnames(normCounts) = names	
	
	## merge raw and normalized counts	
	counts = merge(rawCounts, normCounts,by="row.names",all=T)
	all_results = data.frame(res$id, counts[,2:ncol(counts)], res[,3:ncol(res)])
	colnames(all_results)[1] <- "ID"
	## export complete results
	write.table( format(all_results,scientific=FALSE,digits=3,dec=','), file=paste(OutPrefix,"_complete.csv",sep=""), sep="\t", row.names=F, quote=F)
	
	
}

#######################################################
# create a raw count matrix from argument's sample 
#######################################################


CreateCountMatrix = function(sample, header, rep){
	
	#Read the HTseq-count files dans concatenate them in different columns.
	names = sample$files
	rawCounts=data.frame(Id=character(),stringsAsFactors=T)
	for (i in 1:length(names)){
		tmp = read.table(names[i], sep="\t", header=header, as.is=T,col.names=c("Id", sample$replicate[i]))#read
		rawCounts = merge(rawCounts, tmp, by="Id", all=T)#concatenate
	}
	colnames(rawCounts) = c("Id", sample$replicate)#rename
	
	rawCounts[is.na(rawCounts)] = 0  
	
	#Clean up the HTseq-count from overview lines
	row2remove = c("alignment_not_unique", "ambiguous", "no_feature", "not_aligned", "too_low_aQual")
	rawCounts = rawCounts[!rawCounts$Id %in% row2remove,]
	rawCounts[is.na(rawCounts)] = 0 
	
	#convert Id columns to row names
	rownames(rawCounts)=rawCounts[,1]
	rawCounts=rawCounts[,-1]
	
	return(rawCounts)
}



#######################################################
# Launch DE analysis with(out) replicates
#######################################################
# parameters: 
# method: how samples are pooled to estimate dispersion. If no replicates use "blind"
# sharingMode: how variance estimate is computed with respect to the fitted line. 
# 				"Maximum" is the most conservative (takes care of outliers, i.e., genes with dispersion much larger than the fitted values), 
#					"fit-only" keeps the estimated value. Use this only with very few replicates, and when you are not too concerned about false positives from dispersion outliers, i.e. genes with an unusually high variability.
# 				"gene-est-only" : No fitting or sharing, use only the empirical value.
#sharingMode = c( "maximum", "fit-only", "gene-est-only" )
# fitType: refers to the model. "Local" is the published model, "parametric" is glm-based (may not converge)
#in this case, without replicates
#fitType = c("parametric", "local")


DE_analysis = function(countTable, design, rep, mode="maximum", method="pooled", fitType="parametric"){

	cds = newCountDataSet( countTable, design$condition)
	cds = estimateSizeFactors( cds )
	
	if (rep){
		cds = estimateDispersions( cds, fitType=fitType, sharingMode=mode, method=method)
	# If there are no replicates, we force Mode and method as described in DESeq manual
	} else {
		cds = estimateDispersions( cds, fitType=fitType, sharingMode="fit-only", method="blind")
	}
	return(cds)
}