Mercurial > repos > fubar > shrnaseq_test
diff hairpinTool.R @ 7:2c6bcbc1e76a draft default tip
Uploaded
author | fubar |
---|---|
date | Mon, 19 Jan 2015 22:14:10 -0500 |
parents | e7922416086d |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hairpinTool.R Mon Jan 19 22:14:10 2015 -0500 @@ -0,0 +1,1069 @@ +#!/usr/bin/env Rscript +# ARGS: 1.inputType -String specifying format of input (fastq or table) +# IF inputType is "fastq" or "pairedFastq: +# 2*.fastqPath -One or more strings specifying path to fastq files +# 2.annoPath -String specifying path to hairpin annotation table +# 3.samplePath -String specifying path to sample annotation table +# 4.barStart -Integer specifying starting position of barcode +# 5.barEnd -Integer specifying ending position of barcode +# ### +# IF inputType is "pairedFastq": +# 6.barStartRev -Integer specifying starting position of barcode +# on reverse end +# 7.barEndRev -Integer specifying ending position of barcode +# on reverse end +# ### +# 8.hpStart -Integer specifying startins position of hairpin +# unique region +# 9.hpEnd -Integer specifying ending position of hairpin +# unique region +# IF inputType is "counts": +# 2.countPath -String specifying path to count table +# 3.annoPath -String specifying path to hairpin annotation table +# 4.samplePath -String specifying path to sample annotation table +# ### +# 10.secFactName -String specifying name of secondary factor +# 11.cpmReq -Float specifying cpm requirement +# 12.sampleReq -Integer specifying cpm requirement +# 13.readReq -Integer specifying read requirement +# 14.fdrThresh -Float specifying the FDR requirement +# 15.lfcThresh -Float specifying the log-fold-change requirement +# 16.workMode -String specifying exact test or GLM usage +# 17.htmlPath -String specifying path to HTML file +# 18.folderPath -String specifying path to folder for output +# IF workMode is "classic" (exact test) +# 19.pairData[2] -String specifying first group for exact test +# 20.pairData[1] -String specifying second group for exact test +# ### +# IF workMode is "glm" +# 19.contrastData -String specifying contrasts to be made +# 20.roastOpt -String specifying usage of gene-wise tests +# 21.hairpinReq -String specifying hairpin requirement for gene- +# wise test +# 22.selectOpt -String specifying type of selection for barcode +# plots +# 23.selectVals -String specifying members selected for barcode +# plots +# ### +# +# OUT: Bar Plot of Counts Per Index +# Bar Plot of Counts Per Hairpin +# MDS Plot +# BCV Plot +# Smear Plot +# Barcode Plots (If Genewise testing was selected) +# Top Expression Table +# Feature Counts Table +# HTML file linking to the ouputs +# +# Author: Shian Su - registertonysu@gmail.com - Jan 2014 + +# Record starting time +timeStart <- as.character(Sys.time()) +options(bitmapType='cairo') +# needed to prevent missing x11 errors for png() +# Loading and checking required packages +library(methods, quietly=TRUE, warn.conflicts=FALSE) +library(statmod, quietly=TRUE, warn.conflicts=FALSE) +library(splines, quietly=TRUE, warn.conflicts=FALSE) +library(edgeR, quietly=TRUE, warn.conflicts=FALSE) +library(limma, quietly=TRUE, warn.conflicts=FALSE) + +if (packageVersion("edgeR") < "3.7.17") { + stop("Please update 'edgeR' to version >= 3.7.17 to run this tool") +} + +if (packageVersion("limma")<"3.21.16") { + message("Update 'limma' to version >= 3.21.16 to see updated barcode graphs") +} + +################################################################################ +### Function declarations +################################################################################ + +# Function to load libaries without messages +silentLibrary <- function(...) { + list <- c(...) + for (package in list){ + suppressPackageStartupMessages(library(package, character.only=TRUE)) + } +} + +# Function to sanitise contrast equations so there are no whitespaces +# surrounding the arithmetic operators, leading or trailing whitespace +sanitiseEquation <- function(equation) { + equation <- gsub(" *[+] *", "+", equation) + equation <- gsub(" *[-] *", "-", equation) + equation <- gsub(" *[/] *", "/", equation) + equation <- gsub(" *[*] *", "*", equation) + equation <- gsub("^\\s+|\\s+$", "", equation) + return(equation) +} + +# Function to sanitise group information +sanitiseGroups <- function(string) { + string <- gsub(" *[,] *", ",", string) + string <- gsub("^\\s+|\\s+$", "", string) + return(string) +} + +# Function to change periods to whitespace in a string +unmake.names <- function(string) { + string <- gsub(".", " ", string, fixed=TRUE) + return(string) +} + +# Function has string input and generates an output path string +makeOut <- function(filename) { + return(paste0(folderPath, "/", filename)) +} + +# Function has string input and generates both a pdf and png output strings +imgOut <- function(filename) { + assign(paste0(filename, "Png"), makeOut(paste0(filename,".png")), + envir=.GlobalEnv) + assign(paste0(filename, "Pdf"), makeOut(paste0(filename,".pdf")), + envir=.GlobalEnv) +} + +# Create cat function default path set, default seperator empty and appending +# true by default (Ripped straight from the cat function with altered argument +# defaults) +cata <- function(..., file=htmlPath, sep="", fill=FALSE, labels=NULL, + append=TRUE) { + if (is.character(file)) + if (file == "") + file <- stdout() + else if (substring(file, 1L, 1L) == "|") { + file <- pipe(substring(file, 2L), "w") + on.exit(close(file)) + } + else { + file <- file(file, ifelse(append, "a", "w")) + on.exit(close(file)) + } + .Internal(cat(list(...), file, sep, fill, labels, append)) +} + +# Function to write code for html head and title +HtmlHead <- function(title) { + cata("<head>\n") + cata("<title>", title, "</title>\n") + cata("</head>\n") +} + +# Function to write code for html links +HtmlLink <- function(address, label=address) { + cata("<a href=\"", address, "\" target=\"_blank\">", label, "</a><br />\n") +} + +# Function to write code for html images +HtmlImage <- function(source, label=source, height=600, width=600) { + cata("<img src=\"", source, "\" alt=\"", label, "\" height=\"", height) + cata("\" width=\"", width, "\"/>\n") +} + +# Function to write code for html list items +ListItem <- function(...) { + cata("<li>", ..., "</li>\n") +} + +TableItem <- function(...) { + cata("<td>", ..., "</td>\n") +} + +TableHeadItem <- function(...) { + cata("<th>", ..., "</th>\n") +} +################################################################################ +### Input Processing +################################################################################ + +# Grabbing arguments from command line +argv <- commandArgs(T) + +inputType <- as.character(argv[1]) +if (inputType == "fastq") { + + fastqPath <- as.character(gsub("fastq::", "", argv[grepl("fastq::", argv)], + fixed=TRUE)) + + # Remove fastq paths + argv <- argv[!grepl("fastq::", argv, fixed=TRUE)] + + fastqPathRev <- NULL + annoPath <- as.character(argv[2]) + samplePath <- as.character(argv[3]) + barStart <- as.numeric(argv[4]) + barEnd <- as.numeric(argv[5]) + barStartRev <- NULL + barStartRev <- NULL + hpStart <- as.numeric(argv[8]) + hpEnd <- as.numeric(argv[9]) +} else if (inputType=="pairedFastq") { + + fastqPath <- as.character(gsub("fastq::", "", argv[grepl("fastq::", argv)], + fixed=TRUE)) + + fastqPathRev <- as.character(gsub("fastqRev::", "", + argv[grepl("fastqRev::", argv)], fixed=TRUE)) + + # Remove fastq paths + argv <- argv[!grepl("fastq::", argv, fixed=TRUE)] + argv <- argv[!grepl("fastqRev::", argv, fixed=TRUE)] + + annoPath <- as.character(argv[2]) + samplePath <- as.character(argv[3]) + barStart <- as.numeric(argv[4]) + barEnd <- as.numeric(argv[5]) + barStartRev <- as.numeric(argv[6]) + barEndRev <- as.numeric(argv[7]) + hpStart <- as.numeric(argv[8]) + hpEnd <- as.numeric(argv[9]) +} else if (inputType == "counts") { + countPath <- as.character(argv[2]) + annoPath <- as.character(argv[3]) + samplePath <- as.character(argv[4]) +} + +secFactName <- as.character(argv[10]) +cpmReq <- as.numeric(argv[11]) +sampleReq <- as.numeric(argv[12]) +readReq <- as.numeric(argv[13]) +fdrThresh <- as.numeric(argv[14]) +lfcThresh <- as.numeric(argv[15]) +selectDirection <- as.character(argv[16]) +workMode <- as.character(argv[17]) +htmlPath <- as.character(argv[18]) +folderPath <- as.character(argv[19]) + +if (workMode == "classic") { + pairData <- character() + pairData[2] <- as.character(argv[20]) + pairData[1] <- as.character(argv[21]) +} else if (workMode == "glm") { + contrastData <- as.character(argv[20]) + roastOpt <- as.character(argv[21]) + hairpinReq <- as.numeric(argv[22]) + selectOpt <- as.character(argv[23]) + selectVals <- as.character(argv[24]) +} + +# Read in inputs + +samples <- read.table(samplePath, header=TRUE, sep="\t") + +anno <- read.table(annoPath, header=TRUE, sep="\t") + +if (inputType == "counts") { + counts <- read.table(countPath, header=TRUE, sep="\t") +} + +###################### Check inputs for correctness ############################ +samples$ID <- make.names(samples$ID) + +if ( !any(grepl("group", names(samples))) ) { + stop("'group' column not specified in sample annotation file") +} # Check if grouping variable has been specified + +if (secFactName != "none") { + if ( !any(grepl(secFactName, names(samples))) ) { + tempStr <- paste0("Second factor specified as \"", secFactName, "\" but ", + "no such column name found in sample annotation file") + stop(tempStr) + } # Check if specified secondary factor is present +} + + +if ( any(table(samples$ID) > 1) ){ + tab <- table(samples$ID) + offenders <- paste(names(tab[tab > 1]), collapse=", ") + offenders <- unmake.names(offenders) + stop("'ID' column of sample annotation must have unique values, values ", + offenders, " are repeated") +} # Check that IDs in sample annotation are unique + +if (inputType == "fastq" || inputType == "pairedFastq") { + + if ( any(table(anno$ID) > 1) ){ + tab <- table(anno$ID) + offenders <- paste(names(tab[tab>1]), collapse=", ") + stop("'ID' column of hairpin annotation must have unique values, values ", + offenders, " are repeated") + } # Check that IDs in hairpin annotation are unique + +} else if (inputType == "counts") { + # The first element of the colnames will be 'ID' and should not match + idFromSample <- samples$ID + idFromTable <- colnames(counts)[-1] + if (any(is.na(match(idFromTable, idFromSample)))) { + stop("not all samples have groups specified") + } # Check that a group has be specifed for each sample + + if ( any(table(counts$ID) > 1) ){ + tab <- table(counts$ID) + offenders <- paste(names(tab[tab>1]), collapse=", ") + stop("'ID' column of count table must have unique values, values ", + offenders, " are repeated") + } # Check that IDs in count table are unique +} +if (workMode == "glm") { + if (roastOpt == "yes") { + if (is.na(match("Gene", colnames(anno)))) { + tempStr <- paste("Gene-wise tests selected but'Gene' column not", + "specified in hairpin annotation file") + stop(tempStr) + } + } +} + +if (secFactName != "none") { + if (workMode != "glm") { + tempStr <- paste("only glm analysis type possible when secondary factor", + "used, please change appropriate option.") + } +} + +################################################################################ + +# Process arguments +if (workMode == "glm") { + if (roastOpt == "yes") { + wantRoast <- TRUE + } else { + wantRoast <- FALSE + } +} + +# Split up contrasts seperated by comma into a vector and replace spaces with +# periods +if (exists("contrastData")) { + contrastData <- unlist(strsplit(contrastData, split=",")) + contrastData <- sanitiseEquation(contrastData) + contrastData <- gsub(" ", ".", contrastData, fixed=TRUE) +} + +# Replace spaces with periods in pair data +if (exists("pairData")) { + pairData <- make.names(pairData) +} + +# Generate output folder and paths +dir.create(folderPath, showWarnings=FALSE) + +# Generate links for outputs +imgOut("barHairpin") +imgOut("barIndex") +imgOut("mds") +imgOut("bcv") +if (workMode == "classic") { + smearPng <- makeOut(paste0("smear(", pairData[2], "-", pairData[1],").png")) + smearPdf <- makeOut(paste0("smear(", pairData[2], "-", pairData[1],").pdf")) + topOut <- makeOut(paste0("toptag(", pairData[2], "-", pairData[1],").tsv")) +} else if (workMode == "glm") { + smearPng <- character() + smearPdf <- character() + topOut <- character() + roastOut <- character() + barcodePng <- character() + barcodePdf <- character() + for (i in 1:length(contrastData)) { + smearPng[i] <- makeOut(paste0("smear(", contrastData[i], ").png")) + smearPdf[i] <- makeOut(paste0("smear(", contrastData[i], ").pdf")) + topOut[i] <- makeOut(paste0("toptag(", contrastData[i], ").tsv")) + roastOut[i] <- makeOut(paste0("gene_level(", contrastData[i], ").tsv")) + barcodePng[i] <- makeOut(paste0("barcode(", contrastData[i], ").png")) + barcodePdf[i] <- makeOut(paste0("barcode(", contrastData[i], ").pdf")) + } +} +countsOut <- makeOut("counts.tsv") +sessionOut <- makeOut("session_info.txt") + +# Initialise data for html links and images, table with the link label and +# link address +linkData <- data.frame(Label=character(), Link=character(), + stringsAsFactors=FALSE) +imageData <- data.frame(Label=character(), Link=character(), + stringsAsFactors=FALSE) + +# Initialise vectors for storage of up/down/neutral regulated counts +upCount <- numeric() +downCount <- numeric() +flatCount <- numeric() + +################################################################################ +### Data Processing +################################################################################ + +# Transform gene selection from string into index values for mroast +if (workMode == "glm") { + if (selectOpt == "rank") { + selectVals <- gsub(" ", "", selectVals, fixed=TRUE) + selectVals <- unlist(strsplit(selectVals, ",")) + + for (i in 1:length(selectVals)) { + if (grepl(":", selectVals[i], fixed=TRUE)) { + temp <- unlist(strsplit(selectVals[i], ":")) + selectVals <- selectVals[-i] + a <- as.numeric(temp[1]) + b <- as.numeric(temp[2]) + selectVals <- c(selectVals, a:b) + } + } + selectVals <- as.numeric(unique(selectVals)) + } else { + selectVals <- gsub(" ", "", selectVals, fixed=TRUE) + selectVals <- unlist(strsplit(selectVals, ",")) + } +} + +if (inputType == "fastq" || inputType == "pairedFastq") { + # Use EdgeR hairpin process and capture outputs + + hpReadout <- capture.output( + data <- processAmplicons(readfile=fastqPath, readfile2=fastqPathRev, + barcodefile=samplePath, + hairpinfile=annoPath, + barcodeStart=barStart, barcodeEnd=barEnd, + barcodeStartRev=barStartRev, + barcodeEndRev=barEndRev, + hairpinStart=hpStart, hairpinEnd=hpEnd, + verbose=TRUE) + ) + + # Remove function output entries that show processing data or is empty + hpReadout <- hpReadout[hpReadout!=""] + hpReadout <- hpReadout[!grepl("Processing", hpReadout)] + hpReadout <- hpReadout[!grepl("in file", hpReadout)] + hpReadout <- gsub(" -- ", "", hpReadout, fixed=TRUE) + + # Make the names of groups syntactically valid (replace spaces with periods) + data$samples$group <- make.names(data$samples$group) + if (secFactName != "none") { + data$samples[[secFactName]] <- make.names(data$samples[[secFactName]]) + } +} else if (inputType == "counts") { + # Process counts information, set ID column to be row names + rownames(counts) <- counts$ID + counts <- counts[ , !(colnames(counts) == "ID")] + countsRows <- nrow(counts) + + # Process group information + sampleNames <- colnames(counts) + matchedIndex <- match(sampleNames, samples$ID) + factors <- samples$group[matchedIndex] + + if (secFactName != "none") { + secFactors <- samples[[secFactName]][matchedIndex] + } + + annoRows <- nrow(anno) + anno <- anno[match(rownames(counts), anno$ID), ] + annoMatched <- sum(!is.na(anno$ID)) + + if (any(is.na(anno$ID))) { + warningStr <- paste("count table contained more hairpins than", + "specified in hairpin annotation file") + warning(warningStr) + } + + # Filter out rows with zero counts + sel <- rowSums(counts)!=0 + counts <- counts[sel, ] + anno <- anno[sel, ] + + # Create DGEList + data <- DGEList(counts=counts, lib.size=colSums(counts), + norm.factors=rep(1,ncol(counts)), genes=anno, group=factors) + + # Make the names of groups syntactically valid (replace spaces with periods) + data$samples$group <- make.names(data$samples$group) +} + +# Filter out any samples with zero counts +if (any(data$samples$lib.size == 0)) { + sampleSel <- data$samples$lib.size != 0 + filteredSamples <- paste(data$samples$ID[!sampleSel], collapse=", ") + data$counts <- data$counts[, sampleSel] + data$samples <- data$samples[sampleSel, ] +} + +# Filter hairpins with low counts +preFilterCount <- nrow(data) +selRow <- rowSums(cpm(data$counts) > cpmReq) >= sampleReq +selCol <- colSums(data$counts) > readReq +data <- data[selRow, selCol] + +# Check if any data survived filtering +if (length(data$counts) == 0) { + stop("no data remains after filtering, consider relaxing filters") +} + +# Count number of filtered tags and samples +postFilterCount <- nrow(data) +filteredCount <- preFilterCount - postFilterCount +sampleFilterCount <- sum(!selCol) + +if (secFactName == "none") { + # Estimate dispersions + data <- estimateDisp(data) + commonBCV <- round(sqrt(data$common.dispersion), 4) +} else { + # Construct design + if (inputType == "counts") { + + sampleNames <- colnames(counts) + matchedIndex <- match(sampleNames, samples$ID) + factors <- factor(make.names(samples$group[matchedIndex])) + + secFactors <- factor(make.names(samples[[secFactName]][matchedIndex])) + + } else if (inputType == "fastq" || inputType == "pairedFastq") { + + factors <- factor(data$sample$group) + secFactors <- factor(data$sample[[secFactName]]) + + } + + design <- model.matrix(~0 + factors + secFactors) + + # Estimate dispersions + data <- estimateDisp(data, design=design) + commonBCV <- round(sqrt(data$common.dispersion), 4) +} + + +################################################################################ +### Output Processing +################################################################################ + +# Plot number of hairpins that could be matched per sample +png(barIndexPng, width=600, height=600) +barplot(height<-colSums(data$counts), las=2, main="Counts per index", + cex.names=1.0, cex.axis=0.8, ylim=c(0, max(height)*1.2)) +imageData[1, ] <- c("Counts per Index", "barIndex.png") +invisible(dev.off()) + +pdf(barIndexPdf) +barplot(height<-colSums(data$counts), las=2, main="Counts per index", + cex.names=1.0, cex.axis=0.8, ylim=c(0, max(height)*1.2)) +linkData[1, ] <- c("Counts per Index Barplot (.pdf)", "barIndex.pdf") +invisible(dev.off()) + +# Plot per hairpin totals across all samples +png(barHairpinPng, width=600, height=600) +if (nrow(data$counts)<50) { + barplot(height<-rowSums(data$counts), las=2, main="Counts per hairpin", + cex.names=0.8, cex.axis=0.8, ylim=c(0, max(height)*1.2)) +} else { + barplot(height<-rowSums(data$counts), las=2, main="Counts per hairpin", + cex.names=0.8, cex.axis=0.8, ylim=c(0, max(height)*1.2), + names.arg=FALSE) +} +imageData <- rbind(imageData, c("Counts per Hairpin", "barHairpin.png")) +invisible(dev.off()) + +pdf(barHairpinPdf) +if (nrow(data$counts)<50) { + barplot(height<-rowSums(data$counts), las=2, main="Counts per hairpin", + cex.names=0.8, cex.axis=0.8, ylim=c(0, max(height)*1.2)) +} else { + barplot(height<-rowSums(data$counts), las=2, main="Counts per hairpin", + cex.names=0.8, cex.axis=0.8, ylim=c(0, max(height)*1.2), + names.arg=FALSE) +} +newEntry <- c("Counts per Hairpin Barplot (.pdf)", "barHairpin.pdf") +linkData <- rbind(linkData, newEntry) +invisible(dev.off()) + +# Make an MDS plot to visualise relationships between replicate samples +png(mdsPng, width=600, height=600) +plotMDS(data, labels=data$samples$group, col=as.numeric(data$samples$group), main="MDS Plot") +imageData <- rbind(imageData, c("MDS Plot", "mds.png")) +invisible(dev.off()) + +pdf(mdsPdf) +plotMDS(data, labels=data$samples$group, col=as.numeric(data$samples$group),main="MDS Plot") +newEntry <- c("MDS Plot (.pdf)", "mds.pdf") +linkData <- rbind(linkData, newEntry) +invisible(dev.off()) + +# BCV Plot +png(bcvPng, width=600, height=600) +plotBCV(data, main="BCV Plot") +imageData <- rbind(imageData, c("BCV Plot", "bcv.png")) +invisible(dev.off()) + +pdf(bcvPdf) +plotBCV(data, main="BCV Plot") +newEntry <- c("BCV Plot (.pdf)", "bcv.pdf") +linkData <- rbind(linkData, newEntry) +invisible(dev.off()) + +if (workMode == "classic") { + # Assess differential representation using classic exact testing methodology + # in edgeR + testData <- exactTest(data, pair=pairData) + + top <- topTags(testData, n=Inf) + + if (selectDirection == "all") { + topIDs <- top$table[(top$table$FDR < fdrThresh) & + (abs(top$table$logFC) > lfcThresh), 1] + } else if (selectDirection == "up") { + topIDs <- top$table[(top$table$FDR < fdrThresh) & + (top$table$logFC > lfcThresh), 1] + } else if (selectDirection == "down") { + topIDs <- top$table[(top$table$FDR < fdrThresh) & + (top$table$logFC < -lfcThresh), 1] +} + + write.table(top, file=topOut, row.names=FALSE, sep="\t") + + linkName <- paste0("Top Tags Table(", pairData[2], "-", pairData[1], + ") (.tsv)") + linkAddr <- paste0("toptag(", pairData[2], "-", pairData[1], ").tsv") + linkData <- rbind(linkData, c(linkName, linkAddr)) + + upCount[1] <- sum(top$table$FDR < fdrThresh & top$table$logFC > lfcThresh) + + downCount[1] <- sum(top$table$FDR < fdrThresh & + top$table$logFC < -lfcThresh) + + flatCount[1] <- sum(top$table$FDR > fdrThresh | + abs(top$table$logFC) < lfcThresh) + + + + # Select hairpins with FDR < 0.05 to highlight on plot + png(smearPng, width=600, height=600) + plotTitle <- gsub(".", " ", + paste0("Smear Plot: ", pairData[2], "-", pairData[1]), + fixed=TRUE) + plotSmear(testData, de.tags=topIDs, + pch=20, cex=1.0, main=plotTitle) + abline(h=c(-1, 0, 1), col=c("dodgerblue", "yellow", "dodgerblue"), lty=2) + imgName <- paste0("Smear Plot(", pairData[2], "-", pairData[1], ")") + imgAddr <- paste0("smear(", pairData[2], "-", pairData[1],").png") + imageData <- rbind(imageData, c(imgName, imgAddr)) + invisible(dev.off()) + + pdf(smearPdf) + plotTitle <- gsub(".", " ", + paste0("Smear Plot: ", pairData[2], "-", pairData[1]), + fixed=TRUE) + plotSmear(testData, de.tags=topIDs, + pch=20, cex=1.0, main=plotTitle) + abline(h=c(-1, 0, 1), col=c("dodgerblue", "yellow", "dodgerblue"), lty=2) + imgName <- paste0("Smear Plot(", pairData[2], "-", pairData[1], ") (.pdf)") + imgAddr <- paste0("smear(", pairData[2], "-", pairData[1], ").pdf") + linkData <- rbind(linkData, c(imgName, imgAddr)) + invisible(dev.off()) + +} else if (workMode == "glm") { + # Generating design information + if (secFactName == "none") { + + factors <- factor(data$sample$group) + design <- model.matrix(~0 + factors) + + colnames(design) <- gsub("factors", "", colnames(design), fixed=TRUE) + + } else { + + factors <- factor(data$sample$group) + + if (inputType == "counts") { + + sampleNames <- colnames(counts) + matchedIndex <- match(sampleNames, samples$ID) + factors <- factor(samples$group[matchedIndex]) + + secFactors <- factor(samples[[secFactName]][matchedIndex]) + + } else if (inputType == "fastq" || inputType == "pairedFastq") { + + secFactors <- factor(data$sample[[secFactName]]) + + } + + design <- model.matrix(~0 + factors + secFactors) + + colnames(design) <- gsub("factors", "", colnames(design), fixed=TRUE) + colnames(design) <- gsub("secFactors", secFactName, colnames(design), + fixed=TRUE) + } + + + # Split up contrasts seperated by comma into a vector + contrastData <- unlist(strsplit(contrastData, split=",")) + + for (i in 1:length(contrastData)) { + # Generate contrasts information + contrasts <- makeContrasts(contrasts=contrastData[i], levels=design) + + # Fit negative bionomial GLM + fit <- glmFit(data, design) + # Carry out Likelihood ratio test + testData <- glmLRT(fit, contrast=contrasts) + + # Select hairpins with FDR < 0.05 to highlight on plot + top <- topTags(testData, n=Inf) + + if (selectDirection == "all") { + topIDs <- top$table[(top$table$FDR < fdrThresh) & + (abs(top$table$logFC) > lfcThresh), 1] + } else if (selectDirection == "up") { + topIDs <- top$table[(top$table$FDR < fdrThresh) & + (top$table$logFC > lfcThresh), 1] + } else if (selectDirection == "down") { + topIDs <- top$table[(top$table$FDR < fdrThresh) & + (top$table$logFC < -lfcThresh), 1] + } + + write.table(top, file=topOut[i], row.names=FALSE, sep="\t") + + linkName <- paste0("Top Tags Table(", contrastData[i], ") (.tsv)") + linkAddr <- paste0("toptag(", contrastData[i], ").tsv") + linkData <- rbind(linkData, c(linkName, linkAddr)) + + # Collect counts for differential representation + upCount[i] <- sum(top$table$FDR < fdrThresh & top$table$logFC > lfcThresh) + downCount[i] <- sum(top$table$FDR < fdrThresh & + top$table$logFC < -lfcThresh) + flatCount[i] <- sum(top$table$FDR > fdrThresh | + abs(top$table$logFC) < lfcThresh) + + # Make a plot of logFC versus logCPM + png(smearPng[i], height=600, width=600) + plotTitle <- paste("Smear Plot:", gsub(".", " ", contrastData[i], + fixed=TRUE)) + plotSmear(testData, de.tags=topIDs, pch=20, cex=0.8, main=plotTitle) + abline(h=c(-1, 0, 1), col=c("dodgerblue", "yellow", "dodgerblue"), lty=2) + + imgName <- paste0("Smear Plot(", contrastData[i], ")") + imgAddr <- paste0("smear(", contrastData[i], ").png") + imageData <- rbind(imageData, c(imgName, imgAddr)) + invisible(dev.off()) + + pdf(smearPdf[i]) + plotTitle <- paste("Smear Plot:", gsub(".", " ", contrastData[i], + fixed=TRUE)) + plotSmear(testData, de.tags=topIDs, pch=20, cex=0.8, main=plotTitle) + abline(h=c(-1, 0, 1), col=c("dodgerblue", "yellow", "dodgerblue"), lty=2) + + linkName <- paste0("Smear Plot(", contrastData[i], ") (.pdf)") + linkAddr <- paste0("smear(", contrastData[i], ").pdf") + linkData <- rbind(linkData, c(linkName, linkAddr)) + invisible(dev.off()) + + genes <- as.character(data$genes$Gene) + unq <- unique(genes) + unq <- unq[!is.na(unq)] + geneList <- list() + for (gene in unq) { + if (length(which(genes == gene)) >= hairpinReq) { + geneList[[gene]] <- which(genes == gene) + } + } + + if (wantRoast) { + # Input preparaton for roast + nrot <- 9999 + set.seed(602214129) + roastData <- mroast(data, index=geneList, design=design, + contrast=contrasts, nrot=nrot) + roastData <- cbind(GeneID=rownames(roastData), roastData) + write.table(roastData, file=roastOut[i], row.names=FALSE, sep="\t") + linkName <- paste0("Gene Level Analysis Table(", contrastData[i], + ") (.tsv)") + linkAddr <- paste0("gene_level(", contrastData[i], ").tsv") + linkData <- rbind(linkData, c(linkName, linkAddr)) + if (selectOpt == "rank") { + selectedGenes <- rownames(roastData)[selectVals] + } else { + selectedGenes <- selectVals + } + + if (packageVersion("limma")<"3.19.19") { + png(barcodePng[i], width=600, height=length(selectedGenes)*150) + } else { + png(barcodePng[i], width=600, height=length(selectedGenes)*300) + } + par(mfrow=c(length(selectedGenes), 1)) + for (gene in selectedGenes) { + barcodeplot(testData$table$logFC, index=geneList[[gene]], + main=paste("Barcode Plot for", gene, "(logFCs)", + gsub(".", " ", contrastData[i])), + labels=c("Positive logFC", "Negative logFC")) + } + imgName <- paste0("Barcode Plot(", contrastData[i], ")") + imgAddr <- paste0("barcode(", contrastData[i], ").png") + imageData <- rbind(imageData, c(imgName, imgAddr)) + dev.off() + if (packageVersion("limma")<"3.19.19") { + pdf(barcodePdf[i], width=8, height=2) + } else { + pdf(barcodePdf[i], width=8, height=4) + } + for (gene in selectedGenes) { + barcodeplot(testData$table$logFC, index=geneList[[gene]], + main=paste("Barcode Plot for", gene, "(logFCs)", + gsub(".", " ", contrastData[i])), + labels=c("Positive logFC", "Negative logFC")) + } + linkName <- paste0("Barcode Plot(", contrastData[i], ") (.pdf)") + linkAddr <- paste0("barcode(", contrastData[i], ").pdf") + linkData <- rbind(linkData, c(linkName, linkAddr)) + dev.off() + } + } +} + +# Generate data frame of the significant differences +sigDiff <- data.frame(Up=upCount, Flat=flatCount, Down=downCount) +if (workMode == "glm") { + + row.names(sigDiff) <- contrastData + +} else if (workMode == "classic") { + + row.names(sigDiff) <- paste0(pairData[2], "-", pairData[1]) + +} + +# Output table of summarised counts +ID <- rownames(data$counts) +outputCounts <- cbind(ID, data$counts) +write.table(outputCounts, file=countsOut, row.names=FALSE, sep="\t", + quote=FALSE) +linkName <- "Counts table (.tsv)" +linkAddr <- "counts.tsv" +linkData <- rbind(linkData, c(linkName, linkAddr)) + +# Record session info +writeLines(capture.output(sessionInfo()), sessionOut) +linkData <- rbind(linkData, c("Session Info", "session_info.txt")) + +# Record ending time and calculate total run time +timeEnd <- as.character(Sys.time()) +timeTaken <- capture.output(round(difftime(timeEnd,timeStart), digits=3)) +timeTaken <- gsub("Time difference of ", "", timeTaken, fixed=TRUE) +################################################################################ +### HTML Generation +################################################################################ +# Clear file +cat("", file=htmlPath) + +cata("<html>\n") +HtmlHead("EdgeR Output") + +cata("<body>\n") +cata("<h3>EdgeR Analysis Output:</h3>\n") +cata("<h4>Input Summary:</h4>\n") +if (inputType == "fastq" || inputType == "pairedFastq") { + + cata("<ul>\n") + ListItem(hpReadout[1]) + ListItem(hpReadout[2]) + cata("</ul>\n") + cata(hpReadout[3], "<br />\n") + cata("<ul>\n") + ListItem(hpReadout[4]) + ListItem(hpReadout[7]) + cata("</ul>\n") + cata(hpReadout[8:11], sep="<br />\n") + cata("<br />\n") + cata("<b>Please check that read percentages are consistent with ") + cata("expectations.</b><br >\n") + +} else if (inputType == "counts") { + + cata("<ul>\n") + ListItem("Number of Samples: ", ncol(data$counts)) + ListItem("Number of Hairpins: ", countsRows) + ListItem("Number of annotations provided: ", annoRows) + ListItem("Number of annotations matched to hairpin: ", annoMatched) + cata("</ul>\n") + +} + +cata("The estimated common biological coefficient of variation (BCV) is: ", + commonBCV, "<br />\n") + +if (secFactName == "none") { + + cata("No secondary factor specified.<br />\n") + +} else { + + cata("Secondary factor specified as: ", secFactName, "<br />\n") + +} + +cata("<h4>Output:</h4>\n") +cata("PDF copies of JPEGS available in 'Plots' section.<br />\n") +for (i in 1:nrow(imageData)) { + if (grepl("barcode", imageData$Link[i])) { + + if (packageVersion("limma")<"3.19.19") { + + HtmlImage(imageData$Link[i], imageData$Label[i], + height=length(selectedGenes)*150) + + } else { + + HtmlImage(imageData$Link[i], imageData$Label[i], + height=length(selectedGenes)*300) + + } + } else { + + HtmlImage(imageData$Link[i], imageData$Label[i]) + + } +} +cata("<br />\n") + +cata("<h4>Differential Representation Counts:</h4>\n") + +cata("<table border=\"1\" cellpadding=\"4\">\n") +cata("<tr>\n") +TableItem() +for (i in colnames(sigDiff)) { + TableHeadItem(i) +} +cata("</tr>\n") +for (i in 1:nrow(sigDiff)) { + cata("<tr>\n") + TableHeadItem(unmake.names(row.names(sigDiff)[i])) + for (j in 1:ncol(sigDiff)) { + TableItem(as.character(sigDiff[i, j])) + } + cata("</tr>\n") +} +cata("</table>") + +cata("<h4>Plots:</h4>\n") +for (i in 1:nrow(linkData)) { + if (grepl(".pdf", linkData$Link[i])) { + HtmlLink(linkData$Link[i], linkData$Label[i]) + } +} + +cata("<h4>Tables:</h4>\n") +for (i in 1:nrow(linkData)) { + if (grepl(".tsv", linkData$Link[i])) { + HtmlLink(linkData$Link[i], linkData$Label[i]) + } +} + +cata("<p>Alt-click links to download file.</p>\n") +cata("<p>Click floppy disc icon on associated history item to download ") +cata("all files.</p>\n") +cata("<p>.tsv files can be viewed in Excel or any spreadsheet program.</p>\n") + +cata("<h4>Additional Information:</h4>\n") + +if (inputType == "fastq") { + + ListItem("Data was gathered from fastq raw read file(s).") + +} else if (inputType == "counts") { + + ListItem("Data was gathered from a table of counts.") + +} + +if (cpmReq != 0 && sampleReq != 0) { + tempStr <- paste("Target sequences without more than", cpmReq, + "CPM in at least", sampleReq, "samples are insignificant", + "and filtered out.") + ListItem(tempStr) + + filterProp <- round(filteredCount/preFilterCount*100, digits=2) + tempStr <- paste0(filteredCount, " of ", preFilterCount," (", filterProp, + "%) target sequences were filtered out for low ", + "count-per-million.") + ListItem(tempStr) +} + +if (readReq != 0) { + tempStr <- paste("Samples that did not produce more than", readReq, + "counts were filtered out.") + ListItem(tempStr) + + tempStr <- paste0(sampleFilterCount, " samples were filtered out for low ", + "counts.") + ListItem(tempStr) +} + +if (exists("filteredSamples")) { + tempStr <- paste("The following samples were filtered out for having zero", + "library size: ", filteredSamples) + ListItem(tempStr) +} + +if (workMode == "classic") { + ListItem("An exact test was performed on each target sequence.") +} else if (workMode == "glm") { + ListItem("A generalised linear model was fitted to each target sequence.") +} + +cit <- character() +link <-character() +link[1] <- paste0("<a href=\"", + "http://www.bioconductor.org/packages/release/bioc/", + "vignettes/limma/inst/doc/usersguide.pdf", + "\">", "limma User's Guide", "</a>.") +link[2] <- paste0("<a href=\"", + "http://www.bioconductor.org/packages/release/bioc/", + "vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf", + "\">", "edgeR User's Guide", "</a>") + +cit[1] <- paste("Robinson MD, McCarthy DJ and Smyth GK (2010).", + "edgeR: a Bioconductor package for differential", + "expression analysis of digital gene expression", + "data. Bioinformatics 26, 139-140") +cit[2] <- paste("Robinson MD and Smyth GK (2007). Moderated statistical tests", + "for assessing differences in tag abundance. Bioinformatics", + "23, 2881-2887") +cit[3] <- paste("Robinson MD and Smyth GK (2008). Small-sample estimation of", + "negative binomial dispersion, with applications to SAGE data.", + "Biostatistics, 9, 321-332") + +cit[4] <- paste("McCarthy DJ, Chen Y and Smyth GK (2012). Differential", + "expression analysis of multifactor RNA-Seq experiments with", + "respect to biological variation. Nucleic Acids Research 40,", + "4288-4297") + +cata("<h4>Citations</h4>") +cata("<ol>\n") +ListItem(cit[1]) +ListItem(cit[2]) +ListItem(cit[3]) +ListItem(cit[4]) +cata("</ol>\n") + +cata("<p>Report problems to: su.s@wehi.edu.au</p>\n") + +for (i in 1:nrow(linkData)) { + if (grepl("session_info", linkData$Link[i])) { + HtmlLink(linkData$Link[i], linkData$Label[i]) + } +} + +cata("<table border=\"0\">\n") +cata("<tr>\n") +TableItem("Task started at:"); TableItem(timeStart) +cata("</tr>\n") +cata("<tr>\n") +TableItem("Task ended at:"); TableItem(timeEnd) +cata("</tr>\n") +cata("<tr>\n") +TableItem("Task run time:"); TableItem(timeTaken) +cata("<tr>\n") +cata("</table>\n") + +cata("</body>\n") +cata("</html>")