Mercurial > repos > yhoogstrate > edger_with_design_matrix
changeset 13:6e475c849a38 draft
Uploaded
author | yhoogstrate |
---|---|
date | Thu, 09 Jan 2014 08:27:09 -0500 |
parents | c672e76503b2 |
children | 15fd1a6798e0 |
files | edgeR_DGE.xml |
diffstat | 1 files changed, 63 insertions(+), 61 deletions(-) [+] |
line wrap: on
line diff
--- a/edgeR_DGE.xml Thu Jan 09 06:55:28 2014 -0500 +++ b/edgeR_DGE.xml Thu Jan 09 08:27:09 2014 -0500 @@ -72,7 +72,7 @@ header = read.delim(as.character(raw_data[1,1]),header=F,stringsAsFactors=F,row.names=1,nrows=1) has_header = (class(header[1,1]) == "character") -read_counts = read.delim(as.character(raw_data[1,1]),header=has_header,stringsAsFactors=F,row.names=1) +read_counts = read.delim(as.character(raw_data[1,1]),header=has_header,stringsAsFactors=F,row.names=1)[1] for(i in 2:length(raw_data[,1])) { print("parsing counts from:") @@ -80,11 +80,14 @@ header = read.delim(as.character(raw_data[i,1]),header=F,stringsAsFactors=F,row.names=1,nrows=1) has_header = (class(header[1,1]) == "character") + table = read.delim(as.character(raw_data[i,1]),header=has_header,stringsAsFactors=F,row.names=1)[1] - read_counts = cbind(read_counts,read.delim(as.character(raw_data[i,1]),header=has_header,stringsAsFactors=F,row.names=1)) + read_counts = cbind(read_counts,table) print(i) } +colnames(read_counts) = as.character(raw_data[,2]) + ## Filter for HTSeq predifined counts: @@ -99,83 +102,82 @@ - +empty_samples = apply(read_counts,2,function(x) sum(x) == 0) +if(sum(empty_samples) > 0) { + write(paste("There are ",sum(empty_samples)," empty samples found:",sep=""),stderr()) + write(colnames(read_counts)[empty_samples],stderr()) +} +else { + dge = DGEList(counts=read_counts,genes=rownames(read_counts)) + design_tmp <- raw_data[3:length(raw_data)] + rownames(design_tmp) <- colnames(dge) + formula = paste(c("~0",colnames(design_tmp)),collapse = " + ") + design <- model.matrix(as.formula(formula),design_tmp) + prefixes = colnames(design_tmp)[attr(design,"assign")] + avoid = nchar(prefixes) == nchar(colnames(design)) + replacements = substr(colnames(design),nchar(prefixes)+1,nchar(colnames(design))) + replacements[avoid] = colnames(design)[avoid] + colnames(design) = replacements -colnames(read_counts) = raw_data[,2] -dge = DGEList(counts=read_counts,genes=rownames(read_counts)) - -design_tmp <- raw_data[3:length(raw_data)] -rownames(design_tmp) <- colnames(dge) -formula = paste(c("~0",colnames(design_tmp)),collapse = " + ") -design <- model.matrix(as.formula(formula),design_tmp) - -prefixes = colnames(design_tmp)[attr(design,"assign")] -avoid = nchar(prefixes) == nchar(colnames(design)) -replacements = substr(colnames(design),nchar(prefixes)+1,nchar(colnames(design))) -replacements[avoid] = colnames(design)[avoid] -colnames(design) = replacements - - - -print("Calculating normalization factors...") -dge = calcNormFactors(dge) -print("Estimating common dispersion...") -dge = estimateGLMCommonDisp(dge,design) -print("Estimating trended dispersion...") -dge = estimateGLMTrendedDisp(dge,design) -print("Estimating tagwise dispersion...") -dge = estimateGLMTagwiseDisp(dge,design) + print("Calculating normalization factors...") + dge = calcNormFactors(dge) + print("Estimating common dispersion...") + dge = estimateGLMCommonDisp(dge,design) + print("Estimating trended dispersion...") + dge = estimateGLMTrendedDisp(dge,design) + print("Estimating tagwise dispersion...") + dge = estimateGLMTagwiseDisp(dge,design) -if (QC == TRUE) { - print("Creating QC plots...") - #### MDS Plot - pdf(output_5) - plotMDS(dge, main="edgeR MDS Plot") - dev.off() - #### Biological coefficient of variation plot - pdf(output_6) - plotBCV(dge, cex=0.4, main="edgeR: Biological coefficient of variation (BCV) vs abundance") - dev.off() -} + if (QC == TRUE) { + print("Creating QC plots...") + #### MDS Plot + pdf(output_5) + plotMDS(dge, main="edgeR MDS Plot") + dev.off() + #### Biological coefficient of variation plot + pdf(output_6) + plotBCV(dge, cex=0.4, main="edgeR: Biological coefficient of variation (BCV) vs abundance") + dev.off() + } -print("Fitting GLM...") -fit = glmFit(dge,design) + print("Fitting GLM...") + fit = glmFit(dge,design) -print(paste("Performing likelihood ratio test: ",contrast,sep="")) -cont <- c(contrast) -cont <- makeContrasts(contrasts=cont, levels=design) + print(paste("Performing likelihood ratio test: ",contrast,sep="")) + cont <- c(contrast) + cont <- makeContrasts(contrasts=cont, levels=design) -lrt <- glmLRT(fit, contrast=cont[,1]) -print(paste("Exporting to file: ",output_1,sep="")) -write.table(file=output_1,topTags(lrt,n=nrow(read_counts))\$table,sep="\t",row.names=T) -write.table(file=output_2,cpm(dge,normalized.lib.sizes=TRUE),sep="\t") -## todo EXPORT FPKM -write.table(file=output_4,dge\$counts,sep="\t") + lrt <- glmLRT(fit, contrast=cont[,1]) + print(paste("Exporting to file: ",output_1,sep="")) + write.table(file=output_1,topTags(lrt,n=nrow(read_counts))\$table,sep="\t",row.names=T) + write.table(file=output_2,cpm(dge,normalized.lib.sizes=TRUE),sep="\t") + ## todo EXPORT FPKM + write.table(file=output_4,dge\$counts,sep="\t") -if (QC == TRUE) { - print("Creating MA plots...") - - - etable <- topTags(lrt, n=nrow(dge))\$table - etable <- etable[order(etable\$FDR), ] - pdf(output_7) - with(etable, plot(logCPM, logFC, pch=20, main="edgeR: Fold change vs abundance")) - with(subset(etable, FDR<0.05), points(logCPM, logFC, pch=20, col="red")) - abline(h=c(-1,1), col="blue") - dev.off() + if (QC == TRUE) { + print("Creating MA plots...") + + etable <- topTags(lrt, n=nrow(dge))\$table + etable <- etable[order(etable\$FDR), ] + pdf(output_7) + with(etable, plot(logCPM, logFC, pch=20, main="edgeR: Fold change vs abundance")) + with(subset(etable, FDR<0.05), points(logCPM, logFC, pch=20, col="red")) + abline(h=c(-1,1), col="blue") + dev.off() + } + print("Done!") } -print("Done!") </configfile> </configfiles>