Mercurial > repos > yhoogstrate > edger_with_design_matrix
changeset 5:c0cd78c1c10d draft
Deleted selected files
author | yhoogstrate |
---|---|
date | Thu, 09 Jan 2014 02:55:12 -0500 |
parents | b1aee4b59049 |
children | 149a52c74f39 |
files | edgeR_DGE_test.R |
diffstat | 1 files changed, 0 insertions(+), 151 deletions(-) [+] |
line wrap: on
line diff
--- a/edgeR_DGE_test.R Thu Jan 09 02:55:02 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,151 +0,0 @@ -#!/usr/bin/env Rscript - -# edgeR citation: -# Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor package for differential expression -# analysis of digital gene expression data. Bioinformatics 26, 139-140 -# -# Robinson MD and Smyth GK (2007). Moderated statistical tests for assessing differences in tag -# abundance. Bioinformatics 23, 2881-2887 - -# Robinson MD and Smyth GK (2008). Small-sample estimation of negative binomial dispersion, with -# applications to SAGE data. Biostatistics, 9, 321-332 - -# McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression analysis of multifactor RNA-Seq -# experiments with respect to biological variation. Nucleic Acids Research 40, 4288-4297 - -# R script Author: -# - MSc. René Böttcher (Erasmus MC) - -# Hooked into Galaxy Server: -# - MSc. Youri Hoogstrate (Erasmus MC) - - -setwd("/home/youri/Desktop/galaxy/tools/TraIT/edgeR/DGE") - -library(edgeR) - -# Fetch commandline arguments -args <- commandArgs(trailingOnly = TRUE) -designmatrix = args[1] -contrast = args[2] - -output_1 = args[3] -output_2 = args[4] -output_3 = args[5] #FPKM file - to be implemented -output_4 = args[6] - -QC = nchar(args[7]) > 0 - -output_5 = args[8] -output_6 = args[9] -output_7 = args[10] - -output_8 = args[11] - - - - -library(edgeR) -raw_data <- read.delim(designmatrix,header=T,stringsAsFactors=T) - -# Obtain read-counts -read_counts = read.delim(as.character(raw_data[1,1]),header=F,stringsAsFactors=F,row.names=1) -for(i in 2:length(raw_data[,1])) { - print("parsing counts from:") - print(raw_data[i,1]) - read_counts = cbind(read_counts,read.delim(as.character(raw_data[i,1]),header=F,stringsAsFactors=F,row.names=1)) - print(i) -} - - - -# Filter for HTSeq predifined counts: -exclude_HTSeq = c("no_feature","ambiguous","too_low_aQual","not_aligned","alignment_not_unique") -exclude_DEXSeq = c("_ambiguous","_empty","_lowaqual","_notaligned") - -exclude = match(c(exclude_HTSeq, exclude_DEXSeq),rownames(read_counts)) -exclude = exclude[is.na(exclude)==0] -if(length(exclude) != 0) { - read_counts = read_counts[-exclude,] -} - - - - - - - - - -colnames(read_counts) = raw_data[,2] -dge = DGEList(counts=read_counts,genes=rownames(read_counts)) - -design_tmp <- raw_data[3:length(raw_data)] -rownames(design_tmp) <- colnames(dge) -formula = paste(c("~0",colnames(design_tmp)),collapse = " + ") -design <- model.matrix(as.formula(formula),design_tmp) - -prefixes = colnames(design_tmp)[attr(design,"assign")] -avoid = nchar(prefixes) == nchar(colnames(design)) -replacements = substr(colnames(design),nchar(prefixes)+1,nchar(colnames(design))) -replacements[avoid] = colnames(design)[avoid] -colnames(design) = replacements - - - -print("Calculating normalization factors...") -dge = calcNormFactors(dge) -print("Estimating common dispersion...") -dge = estimateGLMCommonDisp(dge,design) -print("Estimating trended dispersion...") -dge = estimateGLMTrendedDisp(dge,design) -print("Estimating tagwise dispersion...") -dge = estimateGLMTagwiseDisp(dge,design) - - - - -if (QC == TRUE) { - print("Creating QC plots...") - ## MDS Plot - pdf(output_5) - plotMDS(dge, main="edgeR MDS Plot") - dev.off() - ## Biological coefficient of variation plot - pdf(output_6) - plotBCV(dge, cex=0.4, main="edgeR: Biological coefficient of variation (BCV) vs abundance") - dev.off() -} - - - -print("Fitting GLM...") -fit = glmFit(dge,design) - -print(paste("Performing likelihood ratio test: ",contrast,sep="")) -cont <- c(contrast) -cont <- makeContrasts(contrasts=cont, levels=design) - -lrt <- glmLRT(fit, contrast=cont[,1]) -print(paste("Exporting to file: ",output_1,sep="")) -write.table(file=output_1,topTags(lrt,n=nrow(read_counts))$table,sep="\t",row.names=T) -write.table(file=output_2,cpm(dge,normalized.lib.sizes=TRUE),sep="\t") -# todo EXPORT FPKM -write.table(file=output_4,dge$counts,sep="\t") - - - -if (QC == TRUE) { - print("Creating MA plots...") - - - etable <- topTags(lrt, n=nrow(dge))$table - etable <- etable[order(etable$FDR), ] - pdf(output_7) - with(etable, plot(logCPM, logFC, pch=20, main="edgeR: Fold change vs abundance")) - with(subset(etable, FDR<0.05), points(logCPM, logFC, pch=20, col="red")) - abline(h=c(-1,1), col="blue") - dev.off() -} -print("Done!") -