Mercurial > repos > fubar > shrnaseq_test

diff hairpinTool.R @ 7:2c6bcbc1e76a draft default tip
Uploaded
author: fubar
date: Mon, 19 Jan 2015 22:14:10 -0500
parents: e7922416086d
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hairpinTool.R	Mon Jan 19 22:14:10 2015 -0500
@@ -0,0 +1,1069 @@
+#!/usr/bin/env Rscript
+# ARGS: 1.inputType         -String specifying format of input (fastq or table)
+#    IF inputType is "fastq" or "pairedFastq:
+#       2*.fastqPath        -One or more strings specifying path to fastq files
+#       2.annoPath          -String specifying path to hairpin annotation table
+#       3.samplePath        -String specifying path to sample annotation table
+#       4.barStart          -Integer specifying starting position of barcode
+#       5.barEnd            -Integer specifying ending position of barcode
+#    ###   
+#    IF inputType is "pairedFastq":
+#       6.barStartRev       -Integer specifying starting position of barcode
+#                            on reverse end
+#       7.barEndRev         -Integer specifying ending position of barcode
+#                            on reverse end
+#    ### 
+#       8.hpStart           -Integer specifying startins position of hairpin
+#                            unique region
+#       9.hpEnd             -Integer specifying ending position of hairpin
+#                            unique region
+#    IF inputType is "counts":
+#       2.countPath         -String specifying path to count table
+#       3.annoPath          -String specifying path to hairpin annotation table
+#       4.samplePath        -String specifying path to sample annotation table
+#    ###
+#       10.secFactName      -String specifying name of secondary factor
+#       11.cpmReq           -Float specifying cpm requirement
+#       12.sampleReq        -Integer specifying cpm requirement
+#       13.readReq          -Integer specifying read requirement
+#       14.fdrThresh        -Float specifying the FDR requirement
+#       15.lfcThresh        -Float specifying the log-fold-change requirement
+#       16.workMode         -String specifying exact test or GLM usage
+#       17.htmlPath         -String specifying path to HTML file
+#       18.folderPath       -String specifying path to folder for output
+#    IF workMode is "classic" (exact test)
+#       19.pairData[2]      -String specifying first group for exact test
+#       20.pairData[1]      -String specifying second group for exact test
+#    ###
+#    IF workMode is "glm"
+#       19.contrastData     -String specifying contrasts to be made
+#       20.roastOpt         -String specifying usage of gene-wise tests
+#       21.hairpinReq       -String specifying hairpin requirement for gene-
+#                            wise test
+#       22.selectOpt        -String specifying type of selection for barcode
+#                            plots
+#       23.selectVals       -String specifying members selected for barcode
+#                            plots
+#    ###
+#
+# OUT:  Bar Plot of Counts Per Index
+#       Bar Plot of Counts Per Hairpin
+#       MDS Plot
+#       BCV Plot
+#       Smear Plot
+#       Barcode Plots (If Genewise testing was selected)
+#       Top Expression Table
+#       Feature Counts Table
+#       HTML file linking to the ouputs
+#
+# Author: Shian Su - registertonysu@gmail.com - Jan 2014
+
+# Record starting time
+timeStart <- as.character(Sys.time())
+options(bitmapType='cairo')
+# needed to prevent missing x11 errors for png()
+# Loading and checking required packages
+library(methods, quietly=TRUE, warn.conflicts=FALSE)
+library(statmod, quietly=TRUE, warn.conflicts=FALSE)
+library(splines, quietly=TRUE, warn.conflicts=FALSE)
+library(edgeR, quietly=TRUE, warn.conflicts=FALSE)
+library(limma, quietly=TRUE, warn.conflicts=FALSE)
+
+if (packageVersion("edgeR") < "3.7.17") {
+  stop("Please update 'edgeR' to version >= 3.7.17 to run this tool")
+}
+
+if (packageVersion("limma")<"3.21.16") {
+  message("Update 'limma' to version >= 3.21.16 to see updated barcode graphs")
+}
+
+################################################################################
+### Function declarations
+################################################################################
+
+# Function to load libaries without messages
+silentLibrary <- function(...) {
+  list <- c(...)
+  for (package in list){
+    suppressPackageStartupMessages(library(package, character.only=TRUE))
+  }
+}
+
+# Function to sanitise contrast equations so there are no whitespaces
+# surrounding the arithmetic operators, leading or trailing whitespace
+sanitiseEquation <- function(equation) {
+  equation <- gsub(" *[+] *", "+", equation)
+  equation <- gsub(" *[-] *", "-", equation)
+  equation <- gsub(" *[/] *", "/", equation)
+  equation <- gsub(" *[*] *", "*", equation)
+  equation <- gsub("^\\s+|\\s+$", "", equation)
+  return(equation)
+}
+
+# Function to sanitise group information
+sanitiseGroups <- function(string) {
+  string <- gsub(" *[,] *", ",", string)
+  string <- gsub("^\\s+|\\s+$", "", string)
+  return(string)
+}
+
+# Function to change periods to whitespace in a string
+unmake.names <- function(string) {
+  string <- gsub(".", " ", string, fixed=TRUE)
+  return(string)
+}
+
+# Function has string input and generates an output path string
+makeOut <- function(filename) {
+  return(paste0(folderPath, "/", filename))
+}
+
+# Function has string input and generates both a pdf and png output strings
+imgOut <- function(filename) {
+  assign(paste0(filename, "Png"), makeOut(paste0(filename,".png")), 
+         envir=.GlobalEnv)
+  assign(paste0(filename, "Pdf"), makeOut(paste0(filename,".pdf")),
+         envir=.GlobalEnv)
+}
+
+# Create cat function default path set, default seperator empty and appending
+# true by default (Ripped straight from the cat function with altered argument
+# defaults)
+cata <- function(..., file=htmlPath, sep="", fill=FALSE, labels=NULL, 
+                 append=TRUE) {
+  if (is.character(file)) 
+    if (file == "") 
+      file <- stdout()
+  else if (substring(file, 1L, 1L) == "|") {
+    file <- pipe(substring(file, 2L), "w")
+    on.exit(close(file))
+  }
+  else {
+    file <- file(file, ifelse(append, "a", "w"))
+    on.exit(close(file))
+  }
+  .Internal(cat(list(...), file, sep, fill, labels, append))
+}
+
+# Function to write code for html head and title
+HtmlHead <- function(title) {
+  cata("<head>\n")
+  cata("<title>", title, "</title>\n")
+  cata("</head>\n")
+}
+
+# Function to write code for html links
+HtmlLink <- function(address, label=address) {
+  cata("<a href=\"", address, "\" target=\"_blank\">", label, "</a><br />\n")
+}
+
+# Function to write code for html images
+HtmlImage <- function(source, label=source, height=600, width=600) {
+  cata("<img src=\"", source, "\" alt=\"", label, "\" height=\"", height)
+  cata("\" width=\"", width, "\"/>\n")
+}
+
+# Function to write code for html list items
+ListItem <- function(...) {
+  cata("<li>", ..., "</li>\n")
+}
+
+TableItem <- function(...) {
+  cata("<td>", ..., "</td>\n")
+}
+
+TableHeadItem <- function(...) {
+  cata("<th>", ..., "</th>\n")
+}
+################################################################################
+### Input Processing
+################################################################################
+
+# Grabbing arguments from command line
+argv <- commandArgs(T)
+
+inputType <- as.character(argv[1])
+if (inputType == "fastq") {
+
+  fastqPath <- as.character(gsub("fastq::", "", argv[grepl("fastq::", argv)], 
+                                 fixed=TRUE))
+
+  # Remove fastq paths
+  argv <- argv[!grepl("fastq::", argv, fixed=TRUE)] 
+
+  fastqPathRev <- NULL
+  annoPath <- as.character(argv[2])
+  samplePath <- as.character(argv[3])
+  barStart <- as.numeric(argv[4])
+  barEnd <- as.numeric(argv[5])
+  barStartRev <- NULL
+  barStartRev <- NULL
+  hpStart <- as.numeric(argv[8])
+  hpEnd <- as.numeric(argv[9])
+} else if (inputType=="pairedFastq") {
+
+  fastqPath <- as.character(gsub("fastq::", "", argv[grepl("fastq::", argv)], 
+                                 fixed=TRUE))
+  
+  fastqPathRev <- as.character(gsub("fastqRev::", "", 
+                               argv[grepl("fastqRev::", argv)], fixed=TRUE))
+
+  # Remove fastq paths
+  argv <- argv[!grepl("fastq::", argv, fixed=TRUE)]
+  argv <- argv[!grepl("fastqRev::", argv, fixed=TRUE)] 
+
+  annoPath <- as.character(argv[2])
+  samplePath <- as.character(argv[3])
+  barStart <- as.numeric(argv[4])
+  barEnd <- as.numeric(argv[5])
+  barStartRev <- as.numeric(argv[6])
+  barEndRev <- as.numeric(argv[7])
+  hpStart <- as.numeric(argv[8])
+  hpEnd <- as.numeric(argv[9])
+} else if (inputType == "counts") {
+  countPath <- as.character(argv[2])
+  annoPath <- as.character(argv[3])
+  samplePath <- as.character(argv[4])
+}
+
+secFactName <- as.character(argv[10])
+cpmReq <- as.numeric(argv[11])
+sampleReq <- as.numeric(argv[12])
+readReq <- as.numeric(argv[13])
+fdrThresh <- as.numeric(argv[14])
+lfcThresh <- as.numeric(argv[15])
+selectDirection <- as.character(argv[16])
+workMode <- as.character(argv[17])
+htmlPath <- as.character(argv[18])
+folderPath <- as.character(argv[19])
+
+if (workMode == "classic") {
+  pairData <- character()
+  pairData[2] <- as.character(argv[20])
+  pairData[1] <- as.character(argv[21])
+} else if (workMode == "glm") {
+  contrastData <- as.character(argv[20])
+  roastOpt <- as.character(argv[21])
+  hairpinReq <- as.numeric(argv[22])
+  selectOpt <- as.character(argv[23])
+  selectVals <- as.character(argv[24])
+}
+
+# Read in inputs
+
+samples <- read.table(samplePath, header=TRUE, sep="\t")
+
+anno <- read.table(annoPath, header=TRUE, sep="\t")
+
+if (inputType == "counts") {
+  counts <- read.table(countPath, header=TRUE, sep="\t")
+}
+
+###################### Check inputs for correctness ############################
+samples$ID <- make.names(samples$ID)
+
+if ( !any(grepl("group", names(samples))) ) {
+  stop("'group' column not specified in sample annotation file")
+} # Check if grouping variable has been specified
+
+if (secFactName != "none") {
+  if ( !any(grepl(secFactName, names(samples))) ) {
+  tempStr <- paste0("Second factor specified as \"", secFactName, "\" but ",
+                    "no such column name found in sample annotation file")
+  stop(tempStr)
+  } # Check if specified secondary factor is present 
+}
+
+
+if ( any(table(samples$ID) > 1) ){
+  tab <- table(samples$ID)
+  offenders <- paste(names(tab[tab > 1]), collapse=", ")
+  offenders <- unmake.names(offenders)
+  stop("'ID' column of sample annotation must have unique values, values ",
+       offenders, " are repeated")
+} # Check that IDs in sample annotation are unique
+
+if (inputType == "fastq" || inputType == "pairedFastq") {
+  
+  if ( any(table(anno$ID) > 1) ){
+    tab <- table(anno$ID)
+    offenders <- paste(names(tab[tab>1]), collapse=", ")
+    stop("'ID' column of hairpin annotation must have unique values, values ",
+    offenders, " are repeated")
+  } # Check that IDs in hairpin annotation are unique
+  
+} else if (inputType == "counts") {
+  # The first element of the colnames will be 'ID' and should not match
+  idFromSample <- samples$ID
+  idFromTable <- colnames(counts)[-1]
+  if (any(is.na(match(idFromTable, idFromSample)))) {
+    stop("not all samples have groups specified")
+  } # Check that a group has be specifed for each sample
+  
+  if ( any(table(counts$ID) > 1) ){
+    tab <- table(counts$ID)
+    offenders <- paste(names(tab[tab>1]), collapse=", ")
+    stop("'ID' column of count table must have unique values, values ",
+    offenders, " are repeated")
+  } # Check that IDs in count table are unique
+}
+if (workMode == "glm") {
+  if (roastOpt == "yes") {
+    if (is.na(match("Gene", colnames(anno)))) {
+      tempStr <- paste("Gene-wise tests selected but'Gene' column not",
+                       "specified in hairpin annotation file")
+      stop(tempStr)
+    }
+  }
+}
+
+if (secFactName != "none") {
+  if (workMode != "glm") {
+    tempStr <- paste("only glm analysis type possible when secondary factor",
+                     "used, please change appropriate option.")
+  }
+}
+
+################################################################################
+
+# Process arguments
+if (workMode == "glm") {
+  if (roastOpt == "yes") {
+    wantRoast <- TRUE
+  } else {
+    wantRoast <- FALSE
+  }
+}
+
+# Split up contrasts seperated by comma into a vector and replace spaces with
+# periods
+if (exists("contrastData")) {
+  contrastData <- unlist(strsplit(contrastData, split=","))
+  contrastData <- sanitiseEquation(contrastData)
+  contrastData <- gsub(" ", ".", contrastData, fixed=TRUE)
+}
+
+# Replace spaces with periods in pair data
+if (exists("pairData")) {
+  pairData <- make.names(pairData)
+}
+
+# Generate output folder and paths
+dir.create(folderPath, showWarnings=FALSE)
+
+# Generate links for outputs
+imgOut("barHairpin")
+imgOut("barIndex")
+imgOut("mds")
+imgOut("bcv")
+if (workMode == "classic") {
+  smearPng <- makeOut(paste0("smear(", pairData[2], "-", pairData[1],").png"))
+  smearPdf <- makeOut(paste0("smear(", pairData[2], "-", pairData[1],").pdf"))
+  topOut <- makeOut(paste0("toptag(", pairData[2], "-", pairData[1],").tsv"))
+} else if (workMode == "glm") {
+  smearPng <- character()
+  smearPdf <- character()
+  topOut <- character()
+  roastOut <- character()
+  barcodePng <- character()
+  barcodePdf <- character()
+  for (i in 1:length(contrastData)) {
+    smearPng[i] <- makeOut(paste0("smear(", contrastData[i], ").png"))
+    smearPdf[i] <- makeOut(paste0("smear(", contrastData[i], ").pdf"))
+    topOut[i] <- makeOut(paste0("toptag(", contrastData[i], ").tsv"))
+    roastOut[i] <- makeOut(paste0("gene_level(", contrastData[i], ").tsv"))
+    barcodePng[i] <- makeOut(paste0("barcode(", contrastData[i], ").png"))
+    barcodePdf[i] <- makeOut(paste0("barcode(", contrastData[i], ").pdf"))
+  }
+}
+countsOut <- makeOut("counts.tsv")
+sessionOut <- makeOut("session_info.txt")
+
+# Initialise data for html links and images, table with the link label and
+# link address
+linkData <- data.frame(Label=character(), Link=character(),
+                       stringsAsFactors=FALSE)
+imageData <- data.frame(Label=character(), Link=character(),
+                        stringsAsFactors=FALSE)
+                        
+# Initialise vectors for storage of up/down/neutral regulated counts
+upCount <- numeric()
+downCount <- numeric()
+flatCount <- numeric()
+
+################################################################################
+### Data Processing
+################################################################################
+
+# Transform gene selection from string into index values for mroast
+if (workMode == "glm") {
+  if (selectOpt == "rank") {
+    selectVals <- gsub(" ", "", selectVals, fixed=TRUE)
+    selectVals <- unlist(strsplit(selectVals, ","))
+    
+    for (i in 1:length(selectVals)) {
+      if (grepl(":", selectVals[i], fixed=TRUE)) {
+        temp <- unlist(strsplit(selectVals[i], ":"))
+        selectVals <- selectVals[-i]
+        a <- as.numeric(temp[1])
+        b <- as.numeric(temp[2])
+        selectVals <- c(selectVals, a:b)         
+      }
+    }
+    selectVals <- as.numeric(unique(selectVals))
+  } else {
+    selectVals <- gsub(" ", "", selectVals, fixed=TRUE)
+    selectVals <- unlist(strsplit(selectVals, ","))
+  }                                                           
+}
+                                                  
+if (inputType == "fastq" || inputType == "pairedFastq") {
+  # Use EdgeR hairpin process and capture outputs
+
+  hpReadout <- capture.output(
+  data <- processAmplicons(readfile=fastqPath, readfile2=fastqPathRev,
+                            barcodefile=samplePath, 
+                            hairpinfile=annoPath,
+                            barcodeStart=barStart, barcodeEnd=barEnd,
+                            barcodeStartRev=barStartRev, 
+                            barcodeEndRev=barEndRev,
+                            hairpinStart=hpStart, hairpinEnd=hpEnd, 
+                            verbose=TRUE)
+  )
+
+  # Remove function output entries that show processing data or is empty
+  hpReadout <- hpReadout[hpReadout!=""]
+  hpReadout <- hpReadout[!grepl("Processing", hpReadout)]
+  hpReadout <- hpReadout[!grepl("in file", hpReadout)]
+  hpReadout <- gsub(" -- ", "", hpReadout, fixed=TRUE)
+
+  # Make the names of groups syntactically valid (replace spaces with periods)
+  data$samples$group <- make.names(data$samples$group)
+  if (secFactName != "none") {
+    data$samples[[secFactName]] <- make.names(data$samples[[secFactName]])
+  }
+} else if (inputType == "counts") {
+  # Process counts information, set ID column to be row names
+  rownames(counts) <- counts$ID
+  counts <- counts[ , !(colnames(counts) == "ID")]
+  countsRows <- nrow(counts)
+  
+  # Process group information
+  sampleNames <- colnames(counts)
+  matchedIndex <- match(sampleNames, samples$ID)
+  factors <- samples$group[matchedIndex]
+
+  if (secFactName != "none") {
+    secFactors <- samples[[secFactName]][matchedIndex]
+  }
+  
+  annoRows <- nrow(anno)
+  anno <- anno[match(rownames(counts), anno$ID), ]
+  annoMatched <- sum(!is.na(anno$ID))
+  
+  if (any(is.na(anno$ID))) {
+    warningStr <- paste("count table contained more hairpins than",
+                        "specified in hairpin annotation file")
+    warning(warningStr)
+  }
+  
+  # Filter out rows with zero counts
+  sel <- rowSums(counts)!=0
+  counts <- counts[sel, ]
+  anno <- anno[sel, ]
+  
+  # Create DGEList
+  data <- DGEList(counts=counts, lib.size=colSums(counts), 
+                  norm.factors=rep(1,ncol(counts)), genes=anno, group=factors)
+  
+  # Make the names of groups syntactically valid (replace spaces with periods)
+  data$samples$group <- make.names(data$samples$group)
+}
+
+# Filter out any samples with zero counts
+if (any(data$samples$lib.size == 0)) {
+  sampleSel <- data$samples$lib.size != 0
+  filteredSamples <- paste(data$samples$ID[!sampleSel], collapse=", ")
+  data$counts <- data$counts[, sampleSel]
+  data$samples <- data$samples[sampleSel, ]
+}
+
+# Filter hairpins with low counts
+preFilterCount <- nrow(data)
+selRow <- rowSums(cpm(data$counts) > cpmReq) >= sampleReq
+selCol <- colSums(data$counts) > readReq
+data <- data[selRow, selCol]
+
+# Check if any data survived filtering
+if (length(data$counts) == 0) {
+  stop("no data remains after filtering, consider relaxing filters")
+}
+
+# Count number of filtered tags and samples
+postFilterCount <- nrow(data)
+filteredCount <- preFilterCount - postFilterCount
+sampleFilterCount <- sum(!selCol)
+
+if (secFactName == "none") {
+  # Estimate dispersions
+  data <- estimateDisp(data)
+  commonBCV <- round(sqrt(data$common.dispersion), 4)
+} else {
+  # Construct design
+  if (inputType == "counts") {
+      
+    sampleNames <- colnames(counts)
+    matchedIndex <- match(sampleNames, samples$ID)
+    factors <- factor(make.names(samples$group[matchedIndex]))
+
+    secFactors <- factor(make.names(samples[[secFactName]][matchedIndex]))
+
+  } else if (inputType == "fastq" || inputType == "pairedFastq") {
+
+    factors <- factor(data$sample$group)
+    secFactors <- factor(data$sample[[secFactName]])
+  
+  }
+
+  design <- model.matrix(~0 + factors + secFactors)
+  
+  # Estimate dispersions
+  data <- estimateDisp(data, design=design)
+  commonBCV <- round(sqrt(data$common.dispersion), 4)
+}
+
+
+################################################################################
+### Output Processing
+################################################################################
+
+# Plot number of hairpins that could be matched per sample
+png(barIndexPng, width=600, height=600)
+barplot(height<-colSums(data$counts), las=2, main="Counts per index", 
+        cex.names=1.0, cex.axis=0.8, ylim=c(0, max(height)*1.2))
+imageData[1, ] <- c("Counts per Index", "barIndex.png")
+invisible(dev.off())
+
+pdf(barIndexPdf)
+barplot(height<-colSums(data$counts), las=2, main="Counts per index", 
+        cex.names=1.0, cex.axis=0.8, ylim=c(0, max(height)*1.2))
+linkData[1, ] <- c("Counts per Index Barplot (.pdf)", "barIndex.pdf")
+invisible(dev.off())
+
+# Plot per hairpin totals across all samples
+png(barHairpinPng, width=600, height=600)
+if (nrow(data$counts)<50) {
+  barplot(height<-rowSums(data$counts), las=2, main="Counts per hairpin",
+          cex.names=0.8, cex.axis=0.8, ylim=c(0, max(height)*1.2))
+} else {
+  barplot(height<-rowSums(data$counts), las=2, main="Counts per hairpin",
+          cex.names=0.8, cex.axis=0.8, ylim=c(0, max(height)*1.2),
+          names.arg=FALSE)
+}
+imageData <- rbind(imageData, c("Counts per Hairpin", "barHairpin.png"))
+invisible(dev.off())
+
+pdf(barHairpinPdf)
+if (nrow(data$counts)<50) {
+  barplot(height<-rowSums(data$counts), las=2, main="Counts per hairpin",
+          cex.names=0.8, cex.axis=0.8, ylim=c(0, max(height)*1.2))
+} else {
+  barplot(height<-rowSums(data$counts), las=2, main="Counts per hairpin",
+          cex.names=0.8, cex.axis=0.8, ylim=c(0, max(height)*1.2),
+          names.arg=FALSE)
+}
+newEntry <- c("Counts per Hairpin Barplot (.pdf)", "barHairpin.pdf")
+linkData <- rbind(linkData, newEntry)
+invisible(dev.off())
+
+# Make an MDS plot to visualise relationships between replicate samples
+png(mdsPng, width=600, height=600)
+plotMDS(data, labels=data$samples$group, col=as.numeric(data$samples$group), main="MDS Plot")
+imageData <- rbind(imageData, c("MDS Plot", "mds.png"))
+invisible(dev.off())
+
+pdf(mdsPdf)
+plotMDS(data, labels=data$samples$group, col=as.numeric(data$samples$group),main="MDS Plot")
+newEntry <- c("MDS Plot (.pdf)", "mds.pdf")
+linkData <- rbind(linkData, newEntry)
+invisible(dev.off())
+
+# BCV Plot
+png(bcvPng, width=600, height=600)
+plotBCV(data, main="BCV Plot")
+imageData <- rbind(imageData, c("BCV Plot", "bcv.png"))
+invisible(dev.off())
+
+pdf(bcvPdf)
+plotBCV(data, main="BCV Plot")
+newEntry <- c("BCV Plot (.pdf)", "bcv.pdf")
+linkData <- rbind(linkData, newEntry)
+invisible(dev.off())
+
+if (workMode == "classic") {
+  # Assess differential representation using classic exact testing methodology 
+  # in edgeR
+  testData <- exactTest(data, pair=pairData)
+  
+  top <- topTags(testData, n=Inf)
+
+  if (selectDirection == "all") {
+    topIDs <- top$table[(top$table$FDR < fdrThresh) &
+                      (abs(top$table$logFC) > lfcThresh), 1]
+  } else if (selectDirection == "up") {
+    topIDs <- top$table[(top$table$FDR < fdrThresh) &
+                      (top$table$logFC > lfcThresh), 1]
+  } else if (selectDirection == "down") {
+  topIDs <- top$table[(top$table$FDR < fdrThresh) &
+                      (top$table$logFC < -lfcThresh), 1]
+}
+                      
+  write.table(top, file=topOut, row.names=FALSE, sep="\t")
+  
+  linkName <- paste0("Top Tags Table(", pairData[2], "-", pairData[1], 
+                     ") (.tsv)")
+  linkAddr <- paste0("toptag(", pairData[2], "-", pairData[1], ").tsv")
+  linkData <- rbind(linkData, c(linkName, linkAddr))
+  
+  upCount[1] <- sum(top$table$FDR < fdrThresh & top$table$logFC > lfcThresh)
+
+  downCount[1] <- sum(top$table$FDR < fdrThresh & 
+                      top$table$logFC < -lfcThresh)
+
+  flatCount[1] <- sum(top$table$FDR > fdrThresh |
+                      abs(top$table$logFC) < lfcThresh)
+  
+  
+  
+  # Select hairpins with FDR < 0.05 to highlight on plot
+  png(smearPng, width=600, height=600)
+  plotTitle <- gsub(".", " ", 
+                    paste0("Smear Plot: ", pairData[2], "-", pairData[1]),
+                    fixed=TRUE)
+  plotSmear(testData, de.tags=topIDs, 
+            pch=20, cex=1.0, main=plotTitle)
+  abline(h=c(-1, 0, 1), col=c("dodgerblue", "yellow", "dodgerblue"), lty=2)
+  imgName <- paste0("Smear Plot(", pairData[2], "-", pairData[1], ")")
+  imgAddr <- paste0("smear(", pairData[2], "-", pairData[1],").png")
+  imageData <- rbind(imageData, c(imgName, imgAddr))
+  invisible(dev.off())
+  
+  pdf(smearPdf)
+  plotTitle <- gsub(".", " ", 
+                    paste0("Smear Plot: ", pairData[2], "-", pairData[1]),
+                    fixed=TRUE)
+  plotSmear(testData, de.tags=topIDs, 
+            pch=20, cex=1.0, main=plotTitle)
+  abline(h=c(-1, 0, 1), col=c("dodgerblue", "yellow", "dodgerblue"), lty=2)
+  imgName <- paste0("Smear Plot(", pairData[2], "-", pairData[1], ") (.pdf)")
+  imgAddr <- paste0("smear(", pairData[2], "-", pairData[1], ").pdf")
+  linkData <- rbind(linkData, c(imgName, imgAddr))
+  invisible(dev.off())
+  
+} else if (workMode == "glm") {
+  # Generating design information
+  if (secFactName == "none") {
+
+    factors <- factor(data$sample$group)
+    design <- model.matrix(~0 + factors)
+    
+    colnames(design) <- gsub("factors", "", colnames(design), fixed=TRUE)
+
+  } else {
+
+    factors <- factor(data$sample$group)
+
+    if (inputType == "counts") {
+      
+      sampleNames <- colnames(counts)
+      matchedIndex <- match(sampleNames, samples$ID)
+      factors <- factor(samples$group[matchedIndex])
+
+      secFactors <- factor(samples[[secFactName]][matchedIndex])
+
+    } else if (inputType == "fastq" || inputType == "pairedFastq") {
+
+      secFactors <- factor(data$sample[[secFactName]])
+    
+    }
+
+    design <- model.matrix(~0 + factors + secFactors)
+    
+    colnames(design) <- gsub("factors", "", colnames(design), fixed=TRUE)
+    colnames(design) <- gsub("secFactors", secFactName, colnames(design), 
+                              fixed=TRUE)
+  }
+  
+  
+  # Split up contrasts seperated by comma into a vector
+  contrastData <- unlist(strsplit(contrastData, split=","))
+  
+  for (i in 1:length(contrastData)) {
+    # Generate contrasts information
+    contrasts <- makeContrasts(contrasts=contrastData[i], levels=design)
+    
+    # Fit negative bionomial GLM
+    fit <- glmFit(data, design)
+    # Carry out Likelihood ratio test
+    testData <- glmLRT(fit, contrast=contrasts)
+    
+    # Select hairpins with FDR < 0.05 to highlight on plot
+    top <- topTags(testData, n=Inf)
+
+    if (selectDirection == "all") {
+      topIDs <- top$table[(top$table$FDR < fdrThresh) &
+                        (abs(top$table$logFC) > lfcThresh), 1]
+    } else if (selectDirection == "up") {
+      topIDs <- top$table[(top$table$FDR < fdrThresh) &
+                        (top$table$logFC > lfcThresh), 1]
+    } else if (selectDirection == "down") {
+      topIDs <- top$table[(top$table$FDR < fdrThresh) &
+                        (top$table$logFC < -lfcThresh), 1]
+    }
+
+    write.table(top, file=topOut[i], row.names=FALSE, sep="\t")
+    
+    linkName <- paste0("Top Tags Table(", contrastData[i], ") (.tsv)")
+    linkAddr <- paste0("toptag(", contrastData[i], ").tsv")
+    linkData <- rbind(linkData, c(linkName, linkAddr))
+    
+    # Collect counts for differential representation
+    upCount[i] <- sum(top$table$FDR < fdrThresh & top$table$logFC > lfcThresh)
+    downCount[i] <- sum(top$table$FDR < fdrThresh & 
+                        top$table$logFC < -lfcThresh)
+    flatCount[i] <- sum(top$table$FDR > fdrThresh |
+                        abs(top$table$logFC) < lfcThresh)
+    
+    # Make a plot of logFC versus logCPM
+    png(smearPng[i], height=600, width=600)
+    plotTitle <- paste("Smear Plot:", gsub(".", " ", contrastData[i], 
+                       fixed=TRUE))
+    plotSmear(testData, de.tags=topIDs, pch=20, cex=0.8, main=plotTitle)
+    abline(h=c(-1, 0, 1), col=c("dodgerblue", "yellow", "dodgerblue"), lty=2)
+    
+    imgName <- paste0("Smear Plot(", contrastData[i], ")")
+    imgAddr <- paste0("smear(", contrastData[i], ").png")
+    imageData <- rbind(imageData, c(imgName, imgAddr))
+    invisible(dev.off())
+    
+    pdf(smearPdf[i])
+    plotTitle <- paste("Smear Plot:", gsub(".", " ", contrastData[i], 
+                       fixed=TRUE))
+    plotSmear(testData, de.tags=topIDs, pch=20, cex=0.8, main=plotTitle)
+    abline(h=c(-1, 0, 1), col=c("dodgerblue", "yellow", "dodgerblue"), lty=2)
+    
+    linkName <- paste0("Smear Plot(", contrastData[i], ") (.pdf)")
+    linkAddr <- paste0("smear(", contrastData[i], ").pdf")
+    linkData <- rbind(linkData, c(linkName, linkAddr))
+    invisible(dev.off())
+    
+    genes <- as.character(data$genes$Gene)
+    unq <- unique(genes)
+    unq <- unq[!is.na(unq)]
+    geneList <- list()
+    for (gene in unq) {
+      if (length(which(genes == gene)) >= hairpinReq) {
+        geneList[[gene]] <- which(genes == gene)
+      }
+    }
+    
+    if (wantRoast) {
+      # Input preparaton for roast
+      nrot <- 9999
+      set.seed(602214129)
+      roastData <- mroast(data, index=geneList, design=design,
+                         contrast=contrasts, nrot=nrot)
+      roastData <- cbind(GeneID=rownames(roastData), roastData)
+      write.table(roastData, file=roastOut[i], row.names=FALSE, sep="\t")
+      linkName <- paste0("Gene Level Analysis Table(", contrastData[i], 
+                         ") (.tsv)")
+      linkAddr <- paste0("gene_level(", contrastData[i], ").tsv")
+      linkData <- rbind(linkData, c(linkName, linkAddr))
+      if (selectOpt == "rank") {
+        selectedGenes <- rownames(roastData)[selectVals]
+      } else {
+        selectedGenes <- selectVals
+      }
+      
+      if (packageVersion("limma")<"3.19.19") {
+        png(barcodePng[i], width=600, height=length(selectedGenes)*150)
+      } else {
+        png(barcodePng[i], width=600, height=length(selectedGenes)*300)
+      }
+      par(mfrow=c(length(selectedGenes), 1))
+      for (gene in selectedGenes) {
+        barcodeplot(testData$table$logFC, index=geneList[[gene]],
+                    main=paste("Barcode Plot for", gene, "(logFCs)", 
+                               gsub(".", " ", contrastData[i])),
+                    labels=c("Positive logFC", "Negative logFC"))
+      }
+      imgName <- paste0("Barcode Plot(", contrastData[i], ")")
+      imgAddr <- paste0("barcode(", contrastData[i], ").png")
+      imageData <- rbind(imageData, c(imgName, imgAddr))
+      dev.off()
+      if (packageVersion("limma")<"3.19.19") {
+        pdf(barcodePdf[i], width=8, height=2)
+      } else {
+        pdf(barcodePdf[i], width=8, height=4)
+      }
+      for (gene in selectedGenes) {
+        barcodeplot(testData$table$logFC, index=geneList[[gene]],
+                    main=paste("Barcode Plot for", gene, "(logFCs)", 
+                               gsub(".", " ", contrastData[i])),
+                    labels=c("Positive logFC", "Negative logFC"))
+      }
+      linkName <- paste0("Barcode Plot(", contrastData[i], ") (.pdf)")
+      linkAddr <- paste0("barcode(", contrastData[i], ").pdf")
+      linkData <- rbind(linkData, c(linkName, linkAddr))
+      dev.off()
+    }
+  }
+}
+
+# Generate data frame of the significant differences
+sigDiff <- data.frame(Up=upCount, Flat=flatCount, Down=downCount)
+if (workMode == "glm") {
+
+  row.names(sigDiff) <- contrastData
+
+} else if (workMode == "classic") {
+
+  row.names(sigDiff) <- paste0(pairData[2], "-", pairData[1])
+
+}
+
+# Output table of summarised counts
+ID <- rownames(data$counts)
+outputCounts <- cbind(ID, data$counts)
+write.table(outputCounts, file=countsOut, row.names=FALSE, sep="\t",
+            quote=FALSE)
+linkName <- "Counts table (.tsv)"
+linkAddr <- "counts.tsv"
+linkData <- rbind(linkData, c(linkName, linkAddr))
+
+# Record session info
+writeLines(capture.output(sessionInfo()), sessionOut)
+linkData <- rbind(linkData, c("Session Info", "session_info.txt"))
+
+# Record ending time and calculate total run time
+timeEnd <- as.character(Sys.time())
+timeTaken <- capture.output(round(difftime(timeEnd,timeStart), digits=3))
+timeTaken <- gsub("Time difference of ", "", timeTaken, fixed=TRUE)
+################################################################################
+### HTML Generation
+################################################################################
+# Clear file
+cat("", file=htmlPath)
+
+cata("<html>\n")
+HtmlHead("EdgeR Output")
+
+cata("<body>\n")
+cata("<h3>EdgeR Analysis Output:</h3>\n")
+cata("<h4>Input Summary:</h4>\n")
+if (inputType == "fastq" || inputType == "pairedFastq") {
+
+  cata("<ul>\n")
+  ListItem(hpReadout[1])
+  ListItem(hpReadout[2])
+  cata("</ul>\n")
+  cata(hpReadout[3], "<br />\n")
+  cata("<ul>\n")
+  ListItem(hpReadout[4])
+  ListItem(hpReadout[7])
+  cata("</ul>\n")
+  cata(hpReadout[8:11], sep="<br />\n")
+  cata("<br />\n")
+  cata("<b>Please check that read percentages are consistent with ")
+  cata("expectations.</b><br >\n")
+
+} else if (inputType == "counts") {
+
+  cata("<ul>\n")
+  ListItem("Number of Samples: ", ncol(data$counts))
+  ListItem("Number of Hairpins: ", countsRows)
+  ListItem("Number of annotations provided: ", annoRows)
+  ListItem("Number of annotations matched to hairpin: ", annoMatched)
+  cata("</ul>\n")
+
+}
+
+cata("The estimated common biological coefficient of variation (BCV) is: ", 
+     commonBCV, "<br />\n")
+
+if (secFactName == "none") {
+
+  cata("No secondary factor specified.<br />\n")
+
+} else {
+
+  cata("Secondary factor specified as: ", secFactName, "<br />\n")
+
+}
+
+cata("<h4>Output:</h4>\n")
+cata("PDF copies of JPEGS available in 'Plots' section.<br />\n")
+for (i in 1:nrow(imageData)) {
+  if (grepl("barcode", imageData$Link[i])) {
+
+    if (packageVersion("limma")<"3.19.19") {
+
+      HtmlImage(imageData$Link[i], imageData$Label[i], 
+                height=length(selectedGenes)*150)
+
+    } else {
+
+      HtmlImage(imageData$Link[i], imageData$Label[i], 
+                height=length(selectedGenes)*300)
+
+    }
+  } else {
+
+    HtmlImage(imageData$Link[i], imageData$Label[i])
+
+  }
+}
+cata("<br />\n")
+
+cata("<h4>Differential Representation Counts:</h4>\n")
+
+cata("<table border=\"1\" cellpadding=\"4\">\n")
+cata("<tr>\n")
+TableItem()
+for (i in colnames(sigDiff)) {
+  TableHeadItem(i)
+}
+cata("</tr>\n")
+for (i in 1:nrow(sigDiff)) {
+  cata("<tr>\n")
+  TableHeadItem(unmake.names(row.names(sigDiff)[i]))
+  for (j in 1:ncol(sigDiff)) {
+    TableItem(as.character(sigDiff[i, j]))
+  }
+  cata("</tr>\n")
+}
+cata("</table>")
+
+cata("<h4>Plots:</h4>\n")
+for (i in 1:nrow(linkData)) {
+  if (grepl(".pdf", linkData$Link[i])) {
+    HtmlLink(linkData$Link[i], linkData$Label[i])
+  }
+}
+
+cata("<h4>Tables:</h4>\n")
+for (i in 1:nrow(linkData)) {
+  if (grepl(".tsv", linkData$Link[i])) {
+    HtmlLink(linkData$Link[i], linkData$Label[i])
+  }
+}
+
+cata("<p>Alt-click links to download file.</p>\n")
+cata("<p>Click floppy disc icon on associated history item to download ")
+cata("all files.</p>\n")
+cata("<p>.tsv files can be viewed in Excel or any spreadsheet program.</p>\n")
+
+cata("<h4>Additional Information:</h4>\n")
+
+if (inputType == "fastq") {
+
+  ListItem("Data was gathered from fastq raw read file(s).")
+
+} else if (inputType == "counts") {
+
+  ListItem("Data was gathered from a table of counts.")
+
+}
+
+if (cpmReq != 0 && sampleReq != 0) {
+  tempStr <- paste("Target sequences without more than", cpmReq,
+                   "CPM in at least", sampleReq, "samples are insignificant",
+                   "and filtered out.")
+  ListItem(tempStr)
+
+  filterProp <- round(filteredCount/preFilterCount*100, digits=2)
+  tempStr <- paste0(filteredCount, " of ", preFilterCount," (", filterProp,
+                   "%) target sequences were filtered out for low ",
+                   "count-per-million.")
+  ListItem(tempStr)
+}
+
+if (readReq != 0) {
+  tempStr <- paste("Samples that did not produce more than", readReq,
+                   "counts were filtered out.")
+  ListItem(tempStr)
+
+  tempStr <- paste0(sampleFilterCount, " samples were filtered out for low ",
+                    "counts.")
+  ListItem(tempStr)
+}
+
+if (exists("filteredSamples")) {
+  tempStr <- paste("The following samples were filtered out for having zero",
+                   "library size: ", filteredSamples)
+  ListItem(tempStr)
+}
+
+if (workMode == "classic") {
+  ListItem("An exact test was performed on each target sequence.")
+} else if (workMode == "glm") {
+  ListItem("A generalised linear model was fitted to each target sequence.")
+}
+
+cit <- character()
+link <-character()
+link[1] <- paste0("<a href=\"",
+                  "http://www.bioconductor.org/packages/release/bioc/",
+                  "vignettes/limma/inst/doc/usersguide.pdf",
+                  "\">", "limma User's Guide", "</a>.")
+link[2] <- paste0("<a href=\"",
+                  "http://www.bioconductor.org/packages/release/bioc/",
+                  "vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf",
+                  "\">", "edgeR User's Guide", "</a>")
+                  
+cit[1] <- paste("Robinson MD, McCarthy DJ and Smyth GK (2010).",
+                "edgeR: a Bioconductor package for differential",
+                "expression analysis of digital gene expression",
+                "data. Bioinformatics 26, 139-140")
+cit[2] <- paste("Robinson MD and Smyth GK (2007). Moderated statistical tests",
+                "for assessing differences in tag abundance. Bioinformatics",
+                "23, 2881-2887")
+cit[3] <- paste("Robinson MD and Smyth GK (2008). Small-sample estimation of",
+                "negative binomial dispersion, with applications to SAGE data.",
+                "Biostatistics, 9, 321-332")
+
+cit[4] <- paste("McCarthy DJ, Chen Y and Smyth GK (2012). Differential",
+                "expression analysis of multifactor RNA-Seq experiments with",
+                "respect to biological variation. Nucleic Acids Research 40,",
+                "4288-4297")
+
+cata("<h4>Citations</h4>")
+cata("<ol>\n")
+ListItem(cit[1])
+ListItem(cit[2])
+ListItem(cit[3])
+ListItem(cit[4])
+cata("</ol>\n")
+
+cata("<p>Report problems to: su.s@wehi.edu.au</p>\n")
+
+for (i in 1:nrow(linkData)) {
+  if (grepl("session_info", linkData$Link[i])) {
+    HtmlLink(linkData$Link[i], linkData$Label[i])
+  }
+}
+
+cata("<table border=\"0\">\n")
+cata("<tr>\n")
+TableItem("Task started at:"); TableItem(timeStart)
+cata("</tr>\n")
+cata("<tr>\n")
+TableItem("Task ended at:"); TableItem(timeEnd)
+cata("</tr>\n")
+cata("<tr>\n")
+TableItem("Task run time:"); TableItem(timeTaken)
+cata("<tr>\n")
+cata("</table>\n")
+
+cata("</body>\n")
+cata("</html>")
author	fubar
date	Mon, 19 Jan 2015 22:14:10 -0500
parents	e7922416086d
children