Mercurial > repos > fubar > shrnaseq_test
comparison hairpinTool.R @ 7:2c6bcbc1e76a draft default tip
Uploaded
author | fubar |
---|---|
date | Mon, 19 Jan 2015 22:14:10 -0500 |
parents | e7922416086d |
children |
comparison
equal
deleted
inserted
replaced
6:c1b3da0fde4a | 7:2c6bcbc1e76a |
---|---|
1 #!/usr/bin/env Rscript | |
2 # ARGS: 1.inputType -String specifying format of input (fastq or table) | |
3 # IF inputType is "fastq" or "pairedFastq: | |
4 # 2*.fastqPath -One or more strings specifying path to fastq files | |
5 # 2.annoPath -String specifying path to hairpin annotation table | |
6 # 3.samplePath -String specifying path to sample annotation table | |
7 # 4.barStart -Integer specifying starting position of barcode | |
8 # 5.barEnd -Integer specifying ending position of barcode | |
9 # ### | |
10 # IF inputType is "pairedFastq": | |
11 # 6.barStartRev -Integer specifying starting position of barcode | |
12 # on reverse end | |
13 # 7.barEndRev -Integer specifying ending position of barcode | |
14 # on reverse end | |
15 # ### | |
16 # 8.hpStart -Integer specifying startins position of hairpin | |
17 # unique region | |
18 # 9.hpEnd -Integer specifying ending position of hairpin | |
19 # unique region | |
20 # IF inputType is "counts": | |
21 # 2.countPath -String specifying path to count table | |
22 # 3.annoPath -String specifying path to hairpin annotation table | |
23 # 4.samplePath -String specifying path to sample annotation table | |
24 # ### | |
25 # 10.secFactName -String specifying name of secondary factor | |
26 # 11.cpmReq -Float specifying cpm requirement | |
27 # 12.sampleReq -Integer specifying cpm requirement | |
28 # 13.readReq -Integer specifying read requirement | |
29 # 14.fdrThresh -Float specifying the FDR requirement | |
30 # 15.lfcThresh -Float specifying the log-fold-change requirement | |
31 # 16.workMode -String specifying exact test or GLM usage | |
32 # 17.htmlPath -String specifying path to HTML file | |
33 # 18.folderPath -String specifying path to folder for output | |
34 # IF workMode is "classic" (exact test) | |
35 # 19.pairData[2] -String specifying first group for exact test | |
36 # 20.pairData[1] -String specifying second group for exact test | |
37 # ### | |
38 # IF workMode is "glm" | |
39 # 19.contrastData -String specifying contrasts to be made | |
40 # 20.roastOpt -String specifying usage of gene-wise tests | |
41 # 21.hairpinReq -String specifying hairpin requirement for gene- | |
42 # wise test | |
43 # 22.selectOpt -String specifying type of selection for barcode | |
44 # plots | |
45 # 23.selectVals -String specifying members selected for barcode | |
46 # plots | |
47 # ### | |
48 # | |
49 # OUT: Bar Plot of Counts Per Index | |
50 # Bar Plot of Counts Per Hairpin | |
51 # MDS Plot | |
52 # BCV Plot | |
53 # Smear Plot | |
54 # Barcode Plots (If Genewise testing was selected) | |
55 # Top Expression Table | |
56 # Feature Counts Table | |
57 # HTML file linking to the ouputs | |
58 # | |
59 # Author: Shian Su - registertonysu@gmail.com - Jan 2014 | |
60 | |
61 # Record starting time | |
62 timeStart <- as.character(Sys.time()) | |
63 options(bitmapType='cairo') | |
64 # needed to prevent missing x11 errors for png() | |
65 # Loading and checking required packages | |
66 library(methods, quietly=TRUE, warn.conflicts=FALSE) | |
67 library(statmod, quietly=TRUE, warn.conflicts=FALSE) | |
68 library(splines, quietly=TRUE, warn.conflicts=FALSE) | |
69 library(edgeR, quietly=TRUE, warn.conflicts=FALSE) | |
70 library(limma, quietly=TRUE, warn.conflicts=FALSE) | |
71 | |
72 if (packageVersion("edgeR") < "3.7.17") { | |
73 stop("Please update 'edgeR' to version >= 3.7.17 to run this tool") | |
74 } | |
75 | |
76 if (packageVersion("limma")<"3.21.16") { | |
77 message("Update 'limma' to version >= 3.21.16 to see updated barcode graphs") | |
78 } | |
79 | |
80 ################################################################################ | |
81 ### Function declarations | |
82 ################################################################################ | |
83 | |
84 # Function to load libaries without messages | |
85 silentLibrary <- function(...) { | |
86 list <- c(...) | |
87 for (package in list){ | |
88 suppressPackageStartupMessages(library(package, character.only=TRUE)) | |
89 } | |
90 } | |
91 | |
92 # Function to sanitise contrast equations so there are no whitespaces | |
93 # surrounding the arithmetic operators, leading or trailing whitespace | |
94 sanitiseEquation <- function(equation) { | |
95 equation <- gsub(" *[+] *", "+", equation) | |
96 equation <- gsub(" *[-] *", "-", equation) | |
97 equation <- gsub(" *[/] *", "/", equation) | |
98 equation <- gsub(" *[*] *", "*", equation) | |
99 equation <- gsub("^\\s+|\\s+$", "", equation) | |
100 return(equation) | |
101 } | |
102 | |
103 # Function to sanitise group information | |
104 sanitiseGroups <- function(string) { | |
105 string <- gsub(" *[,] *", ",", string) | |
106 string <- gsub("^\\s+|\\s+$", "", string) | |
107 return(string) | |
108 } | |
109 | |
110 # Function to change periods to whitespace in a string | |
111 unmake.names <- function(string) { | |
112 string <- gsub(".", " ", string, fixed=TRUE) | |
113 return(string) | |
114 } | |
115 | |
116 # Function has string input and generates an output path string | |
117 makeOut <- function(filename) { | |
118 return(paste0(folderPath, "/", filename)) | |
119 } | |
120 | |
121 # Function has string input and generates both a pdf and png output strings | |
122 imgOut <- function(filename) { | |
123 assign(paste0(filename, "Png"), makeOut(paste0(filename,".png")), | |
124 envir=.GlobalEnv) | |
125 assign(paste0(filename, "Pdf"), makeOut(paste0(filename,".pdf")), | |
126 envir=.GlobalEnv) | |
127 } | |
128 | |
129 # Create cat function default path set, default seperator empty and appending | |
130 # true by default (Ripped straight from the cat function with altered argument | |
131 # defaults) | |
132 cata <- function(..., file=htmlPath, sep="", fill=FALSE, labels=NULL, | |
133 append=TRUE) { | |
134 if (is.character(file)) | |
135 if (file == "") | |
136 file <- stdout() | |
137 else if (substring(file, 1L, 1L) == "|") { | |
138 file <- pipe(substring(file, 2L), "w") | |
139 on.exit(close(file)) | |
140 } | |
141 else { | |
142 file <- file(file, ifelse(append, "a", "w")) | |
143 on.exit(close(file)) | |
144 } | |
145 .Internal(cat(list(...), file, sep, fill, labels, append)) | |
146 } | |
147 | |
148 # Function to write code for html head and title | |
149 HtmlHead <- function(title) { | |
150 cata("<head>\n") | |
151 cata("<title>", title, "</title>\n") | |
152 cata("</head>\n") | |
153 } | |
154 | |
155 # Function to write code for html links | |
156 HtmlLink <- function(address, label=address) { | |
157 cata("<a href=\"", address, "\" target=\"_blank\">", label, "</a><br />\n") | |
158 } | |
159 | |
160 # Function to write code for html images | |
161 HtmlImage <- function(source, label=source, height=600, width=600) { | |
162 cata("<img src=\"", source, "\" alt=\"", label, "\" height=\"", height) | |
163 cata("\" width=\"", width, "\"/>\n") | |
164 } | |
165 | |
166 # Function to write code for html list items | |
167 ListItem <- function(...) { | |
168 cata("<li>", ..., "</li>\n") | |
169 } | |
170 | |
171 TableItem <- function(...) { | |
172 cata("<td>", ..., "</td>\n") | |
173 } | |
174 | |
175 TableHeadItem <- function(...) { | |
176 cata("<th>", ..., "</th>\n") | |
177 } | |
178 ################################################################################ | |
179 ### Input Processing | |
180 ################################################################################ | |
181 | |
182 # Grabbing arguments from command line | |
183 argv <- commandArgs(T) | |
184 | |
185 inputType <- as.character(argv[1]) | |
186 if (inputType == "fastq") { | |
187 | |
188 fastqPath <- as.character(gsub("fastq::", "", argv[grepl("fastq::", argv)], | |
189 fixed=TRUE)) | |
190 | |
191 # Remove fastq paths | |
192 argv <- argv[!grepl("fastq::", argv, fixed=TRUE)] | |
193 | |
194 fastqPathRev <- NULL | |
195 annoPath <- as.character(argv[2]) | |
196 samplePath <- as.character(argv[3]) | |
197 barStart <- as.numeric(argv[4]) | |
198 barEnd <- as.numeric(argv[5]) | |
199 barStartRev <- NULL | |
200 barStartRev <- NULL | |
201 hpStart <- as.numeric(argv[8]) | |
202 hpEnd <- as.numeric(argv[9]) | |
203 } else if (inputType=="pairedFastq") { | |
204 | |
205 fastqPath <- as.character(gsub("fastq::", "", argv[grepl("fastq::", argv)], | |
206 fixed=TRUE)) | |
207 | |
208 fastqPathRev <- as.character(gsub("fastqRev::", "", | |
209 argv[grepl("fastqRev::", argv)], fixed=TRUE)) | |
210 | |
211 # Remove fastq paths | |
212 argv <- argv[!grepl("fastq::", argv, fixed=TRUE)] | |
213 argv <- argv[!grepl("fastqRev::", argv, fixed=TRUE)] | |
214 | |
215 annoPath <- as.character(argv[2]) | |
216 samplePath <- as.character(argv[3]) | |
217 barStart <- as.numeric(argv[4]) | |
218 barEnd <- as.numeric(argv[5]) | |
219 barStartRev <- as.numeric(argv[6]) | |
220 barEndRev <- as.numeric(argv[7]) | |
221 hpStart <- as.numeric(argv[8]) | |
222 hpEnd <- as.numeric(argv[9]) | |
223 } else if (inputType == "counts") { | |
224 countPath <- as.character(argv[2]) | |
225 annoPath <- as.character(argv[3]) | |
226 samplePath <- as.character(argv[4]) | |
227 } | |
228 | |
229 secFactName <- as.character(argv[10]) | |
230 cpmReq <- as.numeric(argv[11]) | |
231 sampleReq <- as.numeric(argv[12]) | |
232 readReq <- as.numeric(argv[13]) | |
233 fdrThresh <- as.numeric(argv[14]) | |
234 lfcThresh <- as.numeric(argv[15]) | |
235 selectDirection <- as.character(argv[16]) | |
236 workMode <- as.character(argv[17]) | |
237 htmlPath <- as.character(argv[18]) | |
238 folderPath <- as.character(argv[19]) | |
239 | |
240 if (workMode == "classic") { | |
241 pairData <- character() | |
242 pairData[2] <- as.character(argv[20]) | |
243 pairData[1] <- as.character(argv[21]) | |
244 } else if (workMode == "glm") { | |
245 contrastData <- as.character(argv[20]) | |
246 roastOpt <- as.character(argv[21]) | |
247 hairpinReq <- as.numeric(argv[22]) | |
248 selectOpt <- as.character(argv[23]) | |
249 selectVals <- as.character(argv[24]) | |
250 } | |
251 | |
252 # Read in inputs | |
253 | |
254 samples <- read.table(samplePath, header=TRUE, sep="\t") | |
255 | |
256 anno <- read.table(annoPath, header=TRUE, sep="\t") | |
257 | |
258 if (inputType == "counts") { | |
259 counts <- read.table(countPath, header=TRUE, sep="\t") | |
260 } | |
261 | |
262 ###################### Check inputs for correctness ############################ | |
263 samples$ID <- make.names(samples$ID) | |
264 | |
265 if ( !any(grepl("group", names(samples))) ) { | |
266 stop("'group' column not specified in sample annotation file") | |
267 } # Check if grouping variable has been specified | |
268 | |
269 if (secFactName != "none") { | |
270 if ( !any(grepl(secFactName, names(samples))) ) { | |
271 tempStr <- paste0("Second factor specified as \"", secFactName, "\" but ", | |
272 "no such column name found in sample annotation file") | |
273 stop(tempStr) | |
274 } # Check if specified secondary factor is present | |
275 } | |
276 | |
277 | |
278 if ( any(table(samples$ID) > 1) ){ | |
279 tab <- table(samples$ID) | |
280 offenders <- paste(names(tab[tab > 1]), collapse=", ") | |
281 offenders <- unmake.names(offenders) | |
282 stop("'ID' column of sample annotation must have unique values, values ", | |
283 offenders, " are repeated") | |
284 } # Check that IDs in sample annotation are unique | |
285 | |
286 if (inputType == "fastq" || inputType == "pairedFastq") { | |
287 | |
288 if ( any(table(anno$ID) > 1) ){ | |
289 tab <- table(anno$ID) | |
290 offenders <- paste(names(tab[tab>1]), collapse=", ") | |
291 stop("'ID' column of hairpin annotation must have unique values, values ", | |
292 offenders, " are repeated") | |
293 } # Check that IDs in hairpin annotation are unique | |
294 | |
295 } else if (inputType == "counts") { | |
296 # The first element of the colnames will be 'ID' and should not match | |
297 idFromSample <- samples$ID | |
298 idFromTable <- colnames(counts)[-1] | |
299 if (any(is.na(match(idFromTable, idFromSample)))) { | |
300 stop("not all samples have groups specified") | |
301 } # Check that a group has be specifed for each sample | |
302 | |
303 if ( any(table(counts$ID) > 1) ){ | |
304 tab <- table(counts$ID) | |
305 offenders <- paste(names(tab[tab>1]), collapse=", ") | |
306 stop("'ID' column of count table must have unique values, values ", | |
307 offenders, " are repeated") | |
308 } # Check that IDs in count table are unique | |
309 } | |
310 if (workMode == "glm") { | |
311 if (roastOpt == "yes") { | |
312 if (is.na(match("Gene", colnames(anno)))) { | |
313 tempStr <- paste("Gene-wise tests selected but'Gene' column not", | |
314 "specified in hairpin annotation file") | |
315 stop(tempStr) | |
316 } | |
317 } | |
318 } | |
319 | |
320 if (secFactName != "none") { | |
321 if (workMode != "glm") { | |
322 tempStr <- paste("only glm analysis type possible when secondary factor", | |
323 "used, please change appropriate option.") | |
324 } | |
325 } | |
326 | |
327 ################################################################################ | |
328 | |
329 # Process arguments | |
330 if (workMode == "glm") { | |
331 if (roastOpt == "yes") { | |
332 wantRoast <- TRUE | |
333 } else { | |
334 wantRoast <- FALSE | |
335 } | |
336 } | |
337 | |
338 # Split up contrasts seperated by comma into a vector and replace spaces with | |
339 # periods | |
340 if (exists("contrastData")) { | |
341 contrastData <- unlist(strsplit(contrastData, split=",")) | |
342 contrastData <- sanitiseEquation(contrastData) | |
343 contrastData <- gsub(" ", ".", contrastData, fixed=TRUE) | |
344 } | |
345 | |
346 # Replace spaces with periods in pair data | |
347 if (exists("pairData")) { | |
348 pairData <- make.names(pairData) | |
349 } | |
350 | |
351 # Generate output folder and paths | |
352 dir.create(folderPath, showWarnings=FALSE) | |
353 | |
354 # Generate links for outputs | |
355 imgOut("barHairpin") | |
356 imgOut("barIndex") | |
357 imgOut("mds") | |
358 imgOut("bcv") | |
359 if (workMode == "classic") { | |
360 smearPng <- makeOut(paste0("smear(", pairData[2], "-", pairData[1],").png")) | |
361 smearPdf <- makeOut(paste0("smear(", pairData[2], "-", pairData[1],").pdf")) | |
362 topOut <- makeOut(paste0("toptag(", pairData[2], "-", pairData[1],").tsv")) | |
363 } else if (workMode == "glm") { | |
364 smearPng <- character() | |
365 smearPdf <- character() | |
366 topOut <- character() | |
367 roastOut <- character() | |
368 barcodePng <- character() | |
369 barcodePdf <- character() | |
370 for (i in 1:length(contrastData)) { | |
371 smearPng[i] <- makeOut(paste0("smear(", contrastData[i], ").png")) | |
372 smearPdf[i] <- makeOut(paste0("smear(", contrastData[i], ").pdf")) | |
373 topOut[i] <- makeOut(paste0("toptag(", contrastData[i], ").tsv")) | |
374 roastOut[i] <- makeOut(paste0("gene_level(", contrastData[i], ").tsv")) | |
375 barcodePng[i] <- makeOut(paste0("barcode(", contrastData[i], ").png")) | |
376 barcodePdf[i] <- makeOut(paste0("barcode(", contrastData[i], ").pdf")) | |
377 } | |
378 } | |
379 countsOut <- makeOut("counts.tsv") | |
380 sessionOut <- makeOut("session_info.txt") | |
381 | |
382 # Initialise data for html links and images, table with the link label and | |
383 # link address | |
384 linkData <- data.frame(Label=character(), Link=character(), | |
385 stringsAsFactors=FALSE) | |
386 imageData <- data.frame(Label=character(), Link=character(), | |
387 stringsAsFactors=FALSE) | |
388 | |
389 # Initialise vectors for storage of up/down/neutral regulated counts | |
390 upCount <- numeric() | |
391 downCount <- numeric() | |
392 flatCount <- numeric() | |
393 | |
394 ################################################################################ | |
395 ### Data Processing | |
396 ################################################################################ | |
397 | |
398 # Transform gene selection from string into index values for mroast | |
399 if (workMode == "glm") { | |
400 if (selectOpt == "rank") { | |
401 selectVals <- gsub(" ", "", selectVals, fixed=TRUE) | |
402 selectVals <- unlist(strsplit(selectVals, ",")) | |
403 | |
404 for (i in 1:length(selectVals)) { | |
405 if (grepl(":", selectVals[i], fixed=TRUE)) { | |
406 temp <- unlist(strsplit(selectVals[i], ":")) | |
407 selectVals <- selectVals[-i] | |
408 a <- as.numeric(temp[1]) | |
409 b <- as.numeric(temp[2]) | |
410 selectVals <- c(selectVals, a:b) | |
411 } | |
412 } | |
413 selectVals <- as.numeric(unique(selectVals)) | |
414 } else { | |
415 selectVals <- gsub(" ", "", selectVals, fixed=TRUE) | |
416 selectVals <- unlist(strsplit(selectVals, ",")) | |
417 } | |
418 } | |
419 | |
420 if (inputType == "fastq" || inputType == "pairedFastq") { | |
421 # Use EdgeR hairpin process and capture outputs | |
422 | |
423 hpReadout <- capture.output( | |
424 data <- processAmplicons(readfile=fastqPath, readfile2=fastqPathRev, | |
425 barcodefile=samplePath, | |
426 hairpinfile=annoPath, | |
427 barcodeStart=barStart, barcodeEnd=barEnd, | |
428 barcodeStartRev=barStartRev, | |
429 barcodeEndRev=barEndRev, | |
430 hairpinStart=hpStart, hairpinEnd=hpEnd, | |
431 verbose=TRUE) | |
432 ) | |
433 | |
434 # Remove function output entries that show processing data or is empty | |
435 hpReadout <- hpReadout[hpReadout!=""] | |
436 hpReadout <- hpReadout[!grepl("Processing", hpReadout)] | |
437 hpReadout <- hpReadout[!grepl("in file", hpReadout)] | |
438 hpReadout <- gsub(" -- ", "", hpReadout, fixed=TRUE) | |
439 | |
440 # Make the names of groups syntactically valid (replace spaces with periods) | |
441 data$samples$group <- make.names(data$samples$group) | |
442 if (secFactName != "none") { | |
443 data$samples[[secFactName]] <- make.names(data$samples[[secFactName]]) | |
444 } | |
445 } else if (inputType == "counts") { | |
446 # Process counts information, set ID column to be row names | |
447 rownames(counts) <- counts$ID | |
448 counts <- counts[ , !(colnames(counts) == "ID")] | |
449 countsRows <- nrow(counts) | |
450 | |
451 # Process group information | |
452 sampleNames <- colnames(counts) | |
453 matchedIndex <- match(sampleNames, samples$ID) | |
454 factors <- samples$group[matchedIndex] | |
455 | |
456 if (secFactName != "none") { | |
457 secFactors <- samples[[secFactName]][matchedIndex] | |
458 } | |
459 | |
460 annoRows <- nrow(anno) | |
461 anno <- anno[match(rownames(counts), anno$ID), ] | |
462 annoMatched <- sum(!is.na(anno$ID)) | |
463 | |
464 if (any(is.na(anno$ID))) { | |
465 warningStr <- paste("count table contained more hairpins than", | |
466 "specified in hairpin annotation file") | |
467 warning(warningStr) | |
468 } | |
469 | |
470 # Filter out rows with zero counts | |
471 sel <- rowSums(counts)!=0 | |
472 counts <- counts[sel, ] | |
473 anno <- anno[sel, ] | |
474 | |
475 # Create DGEList | |
476 data <- DGEList(counts=counts, lib.size=colSums(counts), | |
477 norm.factors=rep(1,ncol(counts)), genes=anno, group=factors) | |
478 | |
479 # Make the names of groups syntactically valid (replace spaces with periods) | |
480 data$samples$group <- make.names(data$samples$group) | |
481 } | |
482 | |
483 # Filter out any samples with zero counts | |
484 if (any(data$samples$lib.size == 0)) { | |
485 sampleSel <- data$samples$lib.size != 0 | |
486 filteredSamples <- paste(data$samples$ID[!sampleSel], collapse=", ") | |
487 data$counts <- data$counts[, sampleSel] | |
488 data$samples <- data$samples[sampleSel, ] | |
489 } | |
490 | |
491 # Filter hairpins with low counts | |
492 preFilterCount <- nrow(data) | |
493 selRow <- rowSums(cpm(data$counts) > cpmReq) >= sampleReq | |
494 selCol <- colSums(data$counts) > readReq | |
495 data <- data[selRow, selCol] | |
496 | |
497 # Check if any data survived filtering | |
498 if (length(data$counts) == 0) { | |
499 stop("no data remains after filtering, consider relaxing filters") | |
500 } | |
501 | |
502 # Count number of filtered tags and samples | |
503 postFilterCount <- nrow(data) | |
504 filteredCount <- preFilterCount - postFilterCount | |
505 sampleFilterCount <- sum(!selCol) | |
506 | |
507 if (secFactName == "none") { | |
508 # Estimate dispersions | |
509 data <- estimateDisp(data) | |
510 commonBCV <- round(sqrt(data$common.dispersion), 4) | |
511 } else { | |
512 # Construct design | |
513 if (inputType == "counts") { | |
514 | |
515 sampleNames <- colnames(counts) | |
516 matchedIndex <- match(sampleNames, samples$ID) | |
517 factors <- factor(make.names(samples$group[matchedIndex])) | |
518 | |
519 secFactors <- factor(make.names(samples[[secFactName]][matchedIndex])) | |
520 | |
521 } else if (inputType == "fastq" || inputType == "pairedFastq") { | |
522 | |
523 factors <- factor(data$sample$group) | |
524 secFactors <- factor(data$sample[[secFactName]]) | |
525 | |
526 } | |
527 | |
528 design <- model.matrix(~0 + factors + secFactors) | |
529 | |
530 # Estimate dispersions | |
531 data <- estimateDisp(data, design=design) | |
532 commonBCV <- round(sqrt(data$common.dispersion), 4) | |
533 } | |
534 | |
535 | |
536 ################################################################################ | |
537 ### Output Processing | |
538 ################################################################################ | |
539 | |
540 # Plot number of hairpins that could be matched per sample | |
541 png(barIndexPng, width=600, height=600) | |
542 barplot(height<-colSums(data$counts), las=2, main="Counts per index", | |
543 cex.names=1.0, cex.axis=0.8, ylim=c(0, max(height)*1.2)) | |
544 imageData[1, ] <- c("Counts per Index", "barIndex.png") | |
545 invisible(dev.off()) | |
546 | |
547 pdf(barIndexPdf) | |
548 barplot(height<-colSums(data$counts), las=2, main="Counts per index", | |
549 cex.names=1.0, cex.axis=0.8, ylim=c(0, max(height)*1.2)) | |
550 linkData[1, ] <- c("Counts per Index Barplot (.pdf)", "barIndex.pdf") | |
551 invisible(dev.off()) | |
552 | |
553 # Plot per hairpin totals across all samples | |
554 png(barHairpinPng, width=600, height=600) | |
555 if (nrow(data$counts)<50) { | |
556 barplot(height<-rowSums(data$counts), las=2, main="Counts per hairpin", | |
557 cex.names=0.8, cex.axis=0.8, ylim=c(0, max(height)*1.2)) | |
558 } else { | |
559 barplot(height<-rowSums(data$counts), las=2, main="Counts per hairpin", | |
560 cex.names=0.8, cex.axis=0.8, ylim=c(0, max(height)*1.2), | |
561 names.arg=FALSE) | |
562 } | |
563 imageData <- rbind(imageData, c("Counts per Hairpin", "barHairpin.png")) | |
564 invisible(dev.off()) | |
565 | |
566 pdf(barHairpinPdf) | |
567 if (nrow(data$counts)<50) { | |
568 barplot(height<-rowSums(data$counts), las=2, main="Counts per hairpin", | |
569 cex.names=0.8, cex.axis=0.8, ylim=c(0, max(height)*1.2)) | |
570 } else { | |
571 barplot(height<-rowSums(data$counts), las=2, main="Counts per hairpin", | |
572 cex.names=0.8, cex.axis=0.8, ylim=c(0, max(height)*1.2), | |
573 names.arg=FALSE) | |
574 } | |
575 newEntry <- c("Counts per Hairpin Barplot (.pdf)", "barHairpin.pdf") | |
576 linkData <- rbind(linkData, newEntry) | |
577 invisible(dev.off()) | |
578 | |
579 # Make an MDS plot to visualise relationships between replicate samples | |
580 png(mdsPng, width=600, height=600) | |
581 plotMDS(data, labels=data$samples$group, col=as.numeric(data$samples$group), main="MDS Plot") | |
582 imageData <- rbind(imageData, c("MDS Plot", "mds.png")) | |
583 invisible(dev.off()) | |
584 | |
585 pdf(mdsPdf) | |
586 plotMDS(data, labels=data$samples$group, col=as.numeric(data$samples$group),main="MDS Plot") | |
587 newEntry <- c("MDS Plot (.pdf)", "mds.pdf") | |
588 linkData <- rbind(linkData, newEntry) | |
589 invisible(dev.off()) | |
590 | |
591 # BCV Plot | |
592 png(bcvPng, width=600, height=600) | |
593 plotBCV(data, main="BCV Plot") | |
594 imageData <- rbind(imageData, c("BCV Plot", "bcv.png")) | |
595 invisible(dev.off()) | |
596 | |
597 pdf(bcvPdf) | |
598 plotBCV(data, main="BCV Plot") | |
599 newEntry <- c("BCV Plot (.pdf)", "bcv.pdf") | |
600 linkData <- rbind(linkData, newEntry) | |
601 invisible(dev.off()) | |
602 | |
603 if (workMode == "classic") { | |
604 # Assess differential representation using classic exact testing methodology | |
605 # in edgeR | |
606 testData <- exactTest(data, pair=pairData) | |
607 | |
608 top <- topTags(testData, n=Inf) | |
609 | |
610 if (selectDirection == "all") { | |
611 topIDs <- top$table[(top$table$FDR < fdrThresh) & | |
612 (abs(top$table$logFC) > lfcThresh), 1] | |
613 } else if (selectDirection == "up") { | |
614 topIDs <- top$table[(top$table$FDR < fdrThresh) & | |
615 (top$table$logFC > lfcThresh), 1] | |
616 } else if (selectDirection == "down") { | |
617 topIDs <- top$table[(top$table$FDR < fdrThresh) & | |
618 (top$table$logFC < -lfcThresh), 1] | |
619 } | |
620 | |
621 write.table(top, file=topOut, row.names=FALSE, sep="\t") | |
622 | |
623 linkName <- paste0("Top Tags Table(", pairData[2], "-", pairData[1], | |
624 ") (.tsv)") | |
625 linkAddr <- paste0("toptag(", pairData[2], "-", pairData[1], ").tsv") | |
626 linkData <- rbind(linkData, c(linkName, linkAddr)) | |
627 | |
628 upCount[1] <- sum(top$table$FDR < fdrThresh & top$table$logFC > lfcThresh) | |
629 | |
630 downCount[1] <- sum(top$table$FDR < fdrThresh & | |
631 top$table$logFC < -lfcThresh) | |
632 | |
633 flatCount[1] <- sum(top$table$FDR > fdrThresh | | |
634 abs(top$table$logFC) < lfcThresh) | |
635 | |
636 | |
637 | |
638 # Select hairpins with FDR < 0.05 to highlight on plot | |
639 png(smearPng, width=600, height=600) | |
640 plotTitle <- gsub(".", " ", | |
641 paste0("Smear Plot: ", pairData[2], "-", pairData[1]), | |
642 fixed=TRUE) | |
643 plotSmear(testData, de.tags=topIDs, | |
644 pch=20, cex=1.0, main=plotTitle) | |
645 abline(h=c(-1, 0, 1), col=c("dodgerblue", "yellow", "dodgerblue"), lty=2) | |
646 imgName <- paste0("Smear Plot(", pairData[2], "-", pairData[1], ")") | |
647 imgAddr <- paste0("smear(", pairData[2], "-", pairData[1],").png") | |
648 imageData <- rbind(imageData, c(imgName, imgAddr)) | |
649 invisible(dev.off()) | |
650 | |
651 pdf(smearPdf) | |
652 plotTitle <- gsub(".", " ", | |
653 paste0("Smear Plot: ", pairData[2], "-", pairData[1]), | |
654 fixed=TRUE) | |
655 plotSmear(testData, de.tags=topIDs, | |
656 pch=20, cex=1.0, main=plotTitle) | |
657 abline(h=c(-1, 0, 1), col=c("dodgerblue", "yellow", "dodgerblue"), lty=2) | |
658 imgName <- paste0("Smear Plot(", pairData[2], "-", pairData[1], ") (.pdf)") | |
659 imgAddr <- paste0("smear(", pairData[2], "-", pairData[1], ").pdf") | |
660 linkData <- rbind(linkData, c(imgName, imgAddr)) | |
661 invisible(dev.off()) | |
662 | |
663 } else if (workMode == "glm") { | |
664 # Generating design information | |
665 if (secFactName == "none") { | |
666 | |
667 factors <- factor(data$sample$group) | |
668 design <- model.matrix(~0 + factors) | |
669 | |
670 colnames(design) <- gsub("factors", "", colnames(design), fixed=TRUE) | |
671 | |
672 } else { | |
673 | |
674 factors <- factor(data$sample$group) | |
675 | |
676 if (inputType == "counts") { | |
677 | |
678 sampleNames <- colnames(counts) | |
679 matchedIndex <- match(sampleNames, samples$ID) | |
680 factors <- factor(samples$group[matchedIndex]) | |
681 | |
682 secFactors <- factor(samples[[secFactName]][matchedIndex]) | |
683 | |
684 } else if (inputType == "fastq" || inputType == "pairedFastq") { | |
685 | |
686 secFactors <- factor(data$sample[[secFactName]]) | |
687 | |
688 } | |
689 | |
690 design <- model.matrix(~0 + factors + secFactors) | |
691 | |
692 colnames(design) <- gsub("factors", "", colnames(design), fixed=TRUE) | |
693 colnames(design) <- gsub("secFactors", secFactName, colnames(design), | |
694 fixed=TRUE) | |
695 } | |
696 | |
697 | |
698 # Split up contrasts seperated by comma into a vector | |
699 contrastData <- unlist(strsplit(contrastData, split=",")) | |
700 | |
701 for (i in 1:length(contrastData)) { | |
702 # Generate contrasts information | |
703 contrasts <- makeContrasts(contrasts=contrastData[i], levels=design) | |
704 | |
705 # Fit negative bionomial GLM | |
706 fit <- glmFit(data, design) | |
707 # Carry out Likelihood ratio test | |
708 testData <- glmLRT(fit, contrast=contrasts) | |
709 | |
710 # Select hairpins with FDR < 0.05 to highlight on plot | |
711 top <- topTags(testData, n=Inf) | |
712 | |
713 if (selectDirection == "all") { | |
714 topIDs <- top$table[(top$table$FDR < fdrThresh) & | |
715 (abs(top$table$logFC) > lfcThresh), 1] | |
716 } else if (selectDirection == "up") { | |
717 topIDs <- top$table[(top$table$FDR < fdrThresh) & | |
718 (top$table$logFC > lfcThresh), 1] | |
719 } else if (selectDirection == "down") { | |
720 topIDs <- top$table[(top$table$FDR < fdrThresh) & | |
721 (top$table$logFC < -lfcThresh), 1] | |
722 } | |
723 | |
724 write.table(top, file=topOut[i], row.names=FALSE, sep="\t") | |
725 | |
726 linkName <- paste0("Top Tags Table(", contrastData[i], ") (.tsv)") | |
727 linkAddr <- paste0("toptag(", contrastData[i], ").tsv") | |
728 linkData <- rbind(linkData, c(linkName, linkAddr)) | |
729 | |
730 # Collect counts for differential representation | |
731 upCount[i] <- sum(top$table$FDR < fdrThresh & top$table$logFC > lfcThresh) | |
732 downCount[i] <- sum(top$table$FDR < fdrThresh & | |
733 top$table$logFC < -lfcThresh) | |
734 flatCount[i] <- sum(top$table$FDR > fdrThresh | | |
735 abs(top$table$logFC) < lfcThresh) | |
736 | |
737 # Make a plot of logFC versus logCPM | |
738 png(smearPng[i], height=600, width=600) | |
739 plotTitle <- paste("Smear Plot:", gsub(".", " ", contrastData[i], | |
740 fixed=TRUE)) | |
741 plotSmear(testData, de.tags=topIDs, pch=20, cex=0.8, main=plotTitle) | |
742 abline(h=c(-1, 0, 1), col=c("dodgerblue", "yellow", "dodgerblue"), lty=2) | |
743 | |
744 imgName <- paste0("Smear Plot(", contrastData[i], ")") | |
745 imgAddr <- paste0("smear(", contrastData[i], ").png") | |
746 imageData <- rbind(imageData, c(imgName, imgAddr)) | |
747 invisible(dev.off()) | |
748 | |
749 pdf(smearPdf[i]) | |
750 plotTitle <- paste("Smear Plot:", gsub(".", " ", contrastData[i], | |
751 fixed=TRUE)) | |
752 plotSmear(testData, de.tags=topIDs, pch=20, cex=0.8, main=plotTitle) | |
753 abline(h=c(-1, 0, 1), col=c("dodgerblue", "yellow", "dodgerblue"), lty=2) | |
754 | |
755 linkName <- paste0("Smear Plot(", contrastData[i], ") (.pdf)") | |
756 linkAddr <- paste0("smear(", contrastData[i], ").pdf") | |
757 linkData <- rbind(linkData, c(linkName, linkAddr)) | |
758 invisible(dev.off()) | |
759 | |
760 genes <- as.character(data$genes$Gene) | |
761 unq <- unique(genes) | |
762 unq <- unq[!is.na(unq)] | |
763 geneList <- list() | |
764 for (gene in unq) { | |
765 if (length(which(genes == gene)) >= hairpinReq) { | |
766 geneList[[gene]] <- which(genes == gene) | |
767 } | |
768 } | |
769 | |
770 if (wantRoast) { | |
771 # Input preparaton for roast | |
772 nrot <- 9999 | |
773 set.seed(602214129) | |
774 roastData <- mroast(data, index=geneList, design=design, | |
775 contrast=contrasts, nrot=nrot) | |
776 roastData <- cbind(GeneID=rownames(roastData), roastData) | |
777 write.table(roastData, file=roastOut[i], row.names=FALSE, sep="\t") | |
778 linkName <- paste0("Gene Level Analysis Table(", contrastData[i], | |
779 ") (.tsv)") | |
780 linkAddr <- paste0("gene_level(", contrastData[i], ").tsv") | |
781 linkData <- rbind(linkData, c(linkName, linkAddr)) | |
782 if (selectOpt == "rank") { | |
783 selectedGenes <- rownames(roastData)[selectVals] | |
784 } else { | |
785 selectedGenes <- selectVals | |
786 } | |
787 | |
788 if (packageVersion("limma")<"3.19.19") { | |
789 png(barcodePng[i], width=600, height=length(selectedGenes)*150) | |
790 } else { | |
791 png(barcodePng[i], width=600, height=length(selectedGenes)*300) | |
792 } | |
793 par(mfrow=c(length(selectedGenes), 1)) | |
794 for (gene in selectedGenes) { | |
795 barcodeplot(testData$table$logFC, index=geneList[[gene]], | |
796 main=paste("Barcode Plot for", gene, "(logFCs)", | |
797 gsub(".", " ", contrastData[i])), | |
798 labels=c("Positive logFC", "Negative logFC")) | |
799 } | |
800 imgName <- paste0("Barcode Plot(", contrastData[i], ")") | |
801 imgAddr <- paste0("barcode(", contrastData[i], ").png") | |
802 imageData <- rbind(imageData, c(imgName, imgAddr)) | |
803 dev.off() | |
804 if (packageVersion("limma")<"3.19.19") { | |
805 pdf(barcodePdf[i], width=8, height=2) | |
806 } else { | |
807 pdf(barcodePdf[i], width=8, height=4) | |
808 } | |
809 for (gene in selectedGenes) { | |
810 barcodeplot(testData$table$logFC, index=geneList[[gene]], | |
811 main=paste("Barcode Plot for", gene, "(logFCs)", | |
812 gsub(".", " ", contrastData[i])), | |
813 labels=c("Positive logFC", "Negative logFC")) | |
814 } | |
815 linkName <- paste0("Barcode Plot(", contrastData[i], ") (.pdf)") | |
816 linkAddr <- paste0("barcode(", contrastData[i], ").pdf") | |
817 linkData <- rbind(linkData, c(linkName, linkAddr)) | |
818 dev.off() | |
819 } | |
820 } | |
821 } | |
822 | |
823 # Generate data frame of the significant differences | |
824 sigDiff <- data.frame(Up=upCount, Flat=flatCount, Down=downCount) | |
825 if (workMode == "glm") { | |
826 | |
827 row.names(sigDiff) <- contrastData | |
828 | |
829 } else if (workMode == "classic") { | |
830 | |
831 row.names(sigDiff) <- paste0(pairData[2], "-", pairData[1]) | |
832 | |
833 } | |
834 | |
835 # Output table of summarised counts | |
836 ID <- rownames(data$counts) | |
837 outputCounts <- cbind(ID, data$counts) | |
838 write.table(outputCounts, file=countsOut, row.names=FALSE, sep="\t", | |
839 quote=FALSE) | |
840 linkName <- "Counts table (.tsv)" | |
841 linkAddr <- "counts.tsv" | |
842 linkData <- rbind(linkData, c(linkName, linkAddr)) | |
843 | |
844 # Record session info | |
845 writeLines(capture.output(sessionInfo()), sessionOut) | |
846 linkData <- rbind(linkData, c("Session Info", "session_info.txt")) | |
847 | |
848 # Record ending time and calculate total run time | |
849 timeEnd <- as.character(Sys.time()) | |
850 timeTaken <- capture.output(round(difftime(timeEnd,timeStart), digits=3)) | |
851 timeTaken <- gsub("Time difference of ", "", timeTaken, fixed=TRUE) | |
852 ################################################################################ | |
853 ### HTML Generation | |
854 ################################################################################ | |
855 # Clear file | |
856 cat("", file=htmlPath) | |
857 | |
858 cata("<html>\n") | |
859 HtmlHead("EdgeR Output") | |
860 | |
861 cata("<body>\n") | |
862 cata("<h3>EdgeR Analysis Output:</h3>\n") | |
863 cata("<h4>Input Summary:</h4>\n") | |
864 if (inputType == "fastq" || inputType == "pairedFastq") { | |
865 | |
866 cata("<ul>\n") | |
867 ListItem(hpReadout[1]) | |
868 ListItem(hpReadout[2]) | |
869 cata("</ul>\n") | |
870 cata(hpReadout[3], "<br />\n") | |
871 cata("<ul>\n") | |
872 ListItem(hpReadout[4]) | |
873 ListItem(hpReadout[7]) | |
874 cata("</ul>\n") | |
875 cata(hpReadout[8:11], sep="<br />\n") | |
876 cata("<br />\n") | |
877 cata("<b>Please check that read percentages are consistent with ") | |
878 cata("expectations.</b><br >\n") | |
879 | |
880 } else if (inputType == "counts") { | |
881 | |
882 cata("<ul>\n") | |
883 ListItem("Number of Samples: ", ncol(data$counts)) | |
884 ListItem("Number of Hairpins: ", countsRows) | |
885 ListItem("Number of annotations provided: ", annoRows) | |
886 ListItem("Number of annotations matched to hairpin: ", annoMatched) | |
887 cata("</ul>\n") | |
888 | |
889 } | |
890 | |
891 cata("The estimated common biological coefficient of variation (BCV) is: ", | |
892 commonBCV, "<br />\n") | |
893 | |
894 if (secFactName == "none") { | |
895 | |
896 cata("No secondary factor specified.<br />\n") | |
897 | |
898 } else { | |
899 | |
900 cata("Secondary factor specified as: ", secFactName, "<br />\n") | |
901 | |
902 } | |
903 | |
904 cata("<h4>Output:</h4>\n") | |
905 cata("PDF copies of JPEGS available in 'Plots' section.<br />\n") | |
906 for (i in 1:nrow(imageData)) { | |
907 if (grepl("barcode", imageData$Link[i])) { | |
908 | |
909 if (packageVersion("limma")<"3.19.19") { | |
910 | |
911 HtmlImage(imageData$Link[i], imageData$Label[i], | |
912 height=length(selectedGenes)*150) | |
913 | |
914 } else { | |
915 | |
916 HtmlImage(imageData$Link[i], imageData$Label[i], | |
917 height=length(selectedGenes)*300) | |
918 | |
919 } | |
920 } else { | |
921 | |
922 HtmlImage(imageData$Link[i], imageData$Label[i]) | |
923 | |
924 } | |
925 } | |
926 cata("<br />\n") | |
927 | |
928 cata("<h4>Differential Representation Counts:</h4>\n") | |
929 | |
930 cata("<table border=\"1\" cellpadding=\"4\">\n") | |
931 cata("<tr>\n") | |
932 TableItem() | |
933 for (i in colnames(sigDiff)) { | |
934 TableHeadItem(i) | |
935 } | |
936 cata("</tr>\n") | |
937 for (i in 1:nrow(sigDiff)) { | |
938 cata("<tr>\n") | |
939 TableHeadItem(unmake.names(row.names(sigDiff)[i])) | |
940 for (j in 1:ncol(sigDiff)) { | |
941 TableItem(as.character(sigDiff[i, j])) | |
942 } | |
943 cata("</tr>\n") | |
944 } | |
945 cata("</table>") | |
946 | |
947 cata("<h4>Plots:</h4>\n") | |
948 for (i in 1:nrow(linkData)) { | |
949 if (grepl(".pdf", linkData$Link[i])) { | |
950 HtmlLink(linkData$Link[i], linkData$Label[i]) | |
951 } | |
952 } | |
953 | |
954 cata("<h4>Tables:</h4>\n") | |
955 for (i in 1:nrow(linkData)) { | |
956 if (grepl(".tsv", linkData$Link[i])) { | |
957 HtmlLink(linkData$Link[i], linkData$Label[i]) | |
958 } | |
959 } | |
960 | |
961 cata("<p>Alt-click links to download file.</p>\n") | |
962 cata("<p>Click floppy disc icon on associated history item to download ") | |
963 cata("all files.</p>\n") | |
964 cata("<p>.tsv files can be viewed in Excel or any spreadsheet program.</p>\n") | |
965 | |
966 cata("<h4>Additional Information:</h4>\n") | |
967 | |
968 if (inputType == "fastq") { | |
969 | |
970 ListItem("Data was gathered from fastq raw read file(s).") | |
971 | |
972 } else if (inputType == "counts") { | |
973 | |
974 ListItem("Data was gathered from a table of counts.") | |
975 | |
976 } | |
977 | |
978 if (cpmReq != 0 && sampleReq != 0) { | |
979 tempStr <- paste("Target sequences without more than", cpmReq, | |
980 "CPM in at least", sampleReq, "samples are insignificant", | |
981 "and filtered out.") | |
982 ListItem(tempStr) | |
983 | |
984 filterProp <- round(filteredCount/preFilterCount*100, digits=2) | |
985 tempStr <- paste0(filteredCount, " of ", preFilterCount," (", filterProp, | |
986 "%) target sequences were filtered out for low ", | |
987 "count-per-million.") | |
988 ListItem(tempStr) | |
989 } | |
990 | |
991 if (readReq != 0) { | |
992 tempStr <- paste("Samples that did not produce more than", readReq, | |
993 "counts were filtered out.") | |
994 ListItem(tempStr) | |
995 | |
996 tempStr <- paste0(sampleFilterCount, " samples were filtered out for low ", | |
997 "counts.") | |
998 ListItem(tempStr) | |
999 } | |
1000 | |
1001 if (exists("filteredSamples")) { | |
1002 tempStr <- paste("The following samples were filtered out for having zero", | |
1003 "library size: ", filteredSamples) | |
1004 ListItem(tempStr) | |
1005 } | |
1006 | |
1007 if (workMode == "classic") { | |
1008 ListItem("An exact test was performed on each target sequence.") | |
1009 } else if (workMode == "glm") { | |
1010 ListItem("A generalised linear model was fitted to each target sequence.") | |
1011 } | |
1012 | |
1013 cit <- character() | |
1014 link <-character() | |
1015 link[1] <- paste0("<a href=\"", | |
1016 "http://www.bioconductor.org/packages/release/bioc/", | |
1017 "vignettes/limma/inst/doc/usersguide.pdf", | |
1018 "\">", "limma User's Guide", "</a>.") | |
1019 link[2] <- paste0("<a href=\"", | |
1020 "http://www.bioconductor.org/packages/release/bioc/", | |
1021 "vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf", | |
1022 "\">", "edgeR User's Guide", "</a>") | |
1023 | |
1024 cit[1] <- paste("Robinson MD, McCarthy DJ and Smyth GK (2010).", | |
1025 "edgeR: a Bioconductor package for differential", | |
1026 "expression analysis of digital gene expression", | |
1027 "data. Bioinformatics 26, 139-140") | |
1028 cit[2] <- paste("Robinson MD and Smyth GK (2007). Moderated statistical tests", | |
1029 "for assessing differences in tag abundance. Bioinformatics", | |
1030 "23, 2881-2887") | |
1031 cit[3] <- paste("Robinson MD and Smyth GK (2008). Small-sample estimation of", | |
1032 "negative binomial dispersion, with applications to SAGE data.", | |
1033 "Biostatistics, 9, 321-332") | |
1034 | |
1035 cit[4] <- paste("McCarthy DJ, Chen Y and Smyth GK (2012). Differential", | |
1036 "expression analysis of multifactor RNA-Seq experiments with", | |
1037 "respect to biological variation. Nucleic Acids Research 40,", | |
1038 "4288-4297") | |
1039 | |
1040 cata("<h4>Citations</h4>") | |
1041 cata("<ol>\n") | |
1042 ListItem(cit[1]) | |
1043 ListItem(cit[2]) | |
1044 ListItem(cit[3]) | |
1045 ListItem(cit[4]) | |
1046 cata("</ol>\n") | |
1047 | |
1048 cata("<p>Report problems to: su.s@wehi.edu.au</p>\n") | |
1049 | |
1050 for (i in 1:nrow(linkData)) { | |
1051 if (grepl("session_info", linkData$Link[i])) { | |
1052 HtmlLink(linkData$Link[i], linkData$Label[i]) | |
1053 } | |
1054 } | |
1055 | |
1056 cata("<table border=\"0\">\n") | |
1057 cata("<tr>\n") | |
1058 TableItem("Task started at:"); TableItem(timeStart) | |
1059 cata("</tr>\n") | |
1060 cata("<tr>\n") | |
1061 TableItem("Task ended at:"); TableItem(timeEnd) | |
1062 cata("</tr>\n") | |
1063 cata("<tr>\n") | |
1064 TableItem("Task run time:"); TableItem(timeTaken) | |
1065 cata("<tr>\n") | |
1066 cata("</table>\n") | |
1067 | |
1068 cata("</body>\n") | |
1069 cata("</html>") |