changeset 1:cad1cc829e2e

- test commit
author shian_su <registertonysu@gmail.com>
date Tue, 18 Mar 2014 12:45:36 +1100
parents 8848246b5612
children 02d484876935
files hairpinTool.R hairpinTool.xml
diffstat 2 files changed, 188 insertions(+), 50 deletions(-) [+]
line wrap: on
line diff
--- a/hairpinTool.R	Fri Feb 21 12:49:45 2014 +1100
+++ b/hairpinTool.R	Tue Mar 18 12:45:36 2014 +1100
@@ -1,3 +1,51 @@
+# ARGS: 1.inputType         -String specifying format of input (fastq or table)
+#    IF inputType is "fastQ":
+#       2*.fastqPath        -One or more strings specifying path to fastq files
+#       2.annoPath        -String specifying path to hairpin annotation table
+#       3.samplePath        -String specifying path to sample annotation table
+#       4.barStart          -Integer specifying starting position of barcode
+#       5.barEnd            -Integer specifying ending position of barcode
+#       6.hpStart           -Integer specifying startins position of hairpin
+#                            unique region
+#       7.hpEnd             -Integer specifying ending position of hairpin
+#                            unique region
+#    ###   
+#    IF inputType is "counts":
+#       2.countPath         -String specifying path to count table
+#       3.annoPath          -String specifying path to hairpin annotation table
+#       4.samplePath        -String specifying path to sample annotation table
+#    ###
+#       8.cpmReq            -Float specifying cpm requirement
+#       9.sampleReq         -Integer specifying cpm requirement
+#       10.fdrThresh        -Float specifying the FDR requirement
+#       11.lfcThresh        -Float specifying the log-fold-change requirement
+#       12.workMode         -String specifying exact test or GLM usage
+#       13.htmlPath         -String specifying path to HTML file
+#       14.folderPath       -STring specifying path to folder for output
+#    IF workMode is "classic" (exact test)
+#       15.pairData[2]      -String specifying first group for exact test
+#       16.pairData[1]      -String specifying second group for exact test
+#    ###
+#    IF workMode is "glm"
+#       15.contrastData     -String specifying contrasts to be made
+#       16.roastOpt         -String specifying usage of gene-wise tests
+#       17.hairpinReq       -String specifying hairpin requirement for gene-
+#                            wise test
+#       18.selectOpt        -String specifying type of selection for barcode
+#                            plots
+#       19.selectVals       -String specifying members selected for barcode
+#                            plots
+#
+# OUT:  Bar Plot of Counts Per Index
+#       Bar Plot of Counts Per Hairpin
+#       MDS Plot
+#       Smear Plot
+#       Barcode Plots (If Genewise testing was selected)
+#       Top Expression Table
+#       HTML file linking to the ouputs
+#
+# Author: Shian Su - registertonysu@gmail.com - Jan 2014
+
 # Record starting time
 timeStart <- as.character(Sys.time())
 
@@ -115,7 +163,7 @@
   fastqPath <- as.character(gsub("fastq::", "", argv[grepl("fastq::", argv)], 
                                  fixed=TRUE))
   argv <- argv[!grepl("fastq::", argv, fixed=TRUE)]
-  hairpinPath <- as.character(argv[2])
+  annoPath <- as.character(argv[2])
   samplePath <- as.character(argv[3])
   barStart <- as.numeric(argv[4])
   barEnd <- as.numeric(argv[5])
@@ -147,14 +195,13 @@
 }
 
 # Read in inputs
-if (inputType=="fastq") {
-  samples <- read.table(samplePath, header=TRUE, sep="\t")
-  hairpins <- read.table(hairpinPath, header=TRUE, sep="\t")
-} else if (inputType=="counts") {
-  samples <- read.table(samplePath, header=TRUE, sep="\t")
+
+samples <- read.table(samplePath, header=TRUE, sep="\t")
+anno <- read.table(annoPath, header=TRUE, sep="\t")
+if (inputType=="counts") {
   counts <- read.table(countPath, header=TRUE, sep="\t")
-  anno <- read.table(annoPath, header=TRUE, sep="\t")
 }
+
 ###################### Check inputs for correctness ############################
 samples$ID <- make.names(samples$ID)
 
@@ -166,32 +213,41 @@
   tab <- table(samples$ID)
   offenders <- paste(names(tab[tab>1]), collapse=", ")
   offenders <- unmake.names(offenders)
-  stop("ID column of sample annotation must have unique values, values ",
+  stop("'ID' column of sample annotation must have unique values, values ",
        offenders, " are repeated")
 } # Check that IDs in sample annotation are unique
 
-if (any(is.na(match(samples$ID, colnames(counts))))) {
-  stop("not all samples have groups specified")
-} # Check that a group has be specifed for each sample
-
 if (inputType=="fastq") {
-
-  if (any(table(hairpins$ID)>1)){
-    tab <- table(hairpins$ID)
+  
+  if (any(table(anno$ID)>1)){
+    tab <- table(anno$ID)
     offenders <- paste(names(tab[tab>1]), collapse=", ")
-    stop("ID column of hairpin annotation must have unique values, values ",
+    stop("'ID' column of hairpin annotation must have unique values, values ",
     offenders, " are repeated")
   } # Check that IDs in hairpin annotation are unique
-
+  
 } else if (inputType=="counts") {
+  if (any(is.na(match(samples$ID, colnames(counts))))) {
+    stop("not all samples have groups specified")
+  } # Check that a group has be specifed for each sample
   
   if (any(table(counts$ID)>1)){
     tab <- table(counts$ID)
     offenders <- paste(names(tab[tab>1]), collapse=", ")
-    stop("ID column of count table must have unique values, values ",
+    stop("'ID' column of count table must have unique values, values ",
     offenders, " are repeated")
   } # Check that IDs in count table are unique
 }
+if (workMode=="glm") {
+  if (roastOpt == "yes") {
+    if (is.na(match("Gene", colnames(anno)))) {
+      tempStr <- paste("Gene-wise tests selected but'Gene' column not",
+      "specified in hairpin annotation file")
+      stop(tempStr)
+    }
+  }
+}
+
 ################################################################################
 
 # Process arguments
@@ -277,23 +333,22 @@
 }
                                                   
 if (inputType=="fastq") {
-# Use EdgeR hairpin process and capture outputs
-hpReadout <- capture.output(
-  data <- processHairpinReads(fastqPath, samplePath, hairpinPath,
+  # Use EdgeR hairpin process and capture outputs
+  hpReadout <- capture.output(
+  data <- processHairpinReads(fastqPath, samplePath, annoPath,
                               hairpinStart=hpStart, hairpinEnd=hpEnd, 
                               verbose=TRUE)
-)
-
-# Remove function output entries that show processing data or is empty
-hpReadout <- hpReadout[hpReadout!=""]
-hpReadout <- hpReadout[!grepl("Processing", hpReadout)]
-hpReadout <- hpReadout[!grepl("in file", hpReadout)]
-hpReadout <- gsub(" -- ", "", hpReadout, fixed=TRUE)
-
-
-# Make the names of groups syntactically valid (replace spaces with periods)
-data$samples$group <- make.names(data$samples$group)
-} else {
+  )
+  
+  # Remove function output entries that show processing data or is empty
+  hpReadout <- hpReadout[hpReadout!=""]
+  hpReadout <- hpReadout[!grepl("Processing", hpReadout)]
+  hpReadout <- hpReadout[!grepl("in file", hpReadout)]
+  hpReadout <- gsub(" -- ", "", hpReadout, fixed=TRUE)
+  
+  # Make the names of groups syntactically valid (replace spaces with periods)
+  data$samples$group <- make.names(data$samples$group)
+} else if (inputType=="counts") {
   # Process counts information, set ID column to be row names
   rownames(counts) <- counts$ID
   counts <- counts[ , !(colnames(counts)=="ID")]
@@ -319,14 +374,17 @@
   # Create DGEList
   data <- DGEList(counts=counts, lib.size=colSums(counts), 
                   norm.factors=rep(1,ncol(counts)), genes=anno, group=factors)
-                  
+  
   # Make the names of groups syntactically valid (replace spaces with periods)
   data$samples$group <- make.names(data$samples$group)
 }
 
 # Filter hairpins with low counts
+preFilterCount <- nrow(data)
 sel <- rowSums(cpm(data$counts) > cpmReq) >= sampleReq
 data <- data[sel, ]
+postFilterCount <- nrow(data)
+filteredCount <- preFilterCount-postFilterCount
 
 # Estimate dispersions
 data <- estimateDisp(data)
@@ -541,8 +599,10 @@
   }
 }
 
-# Record ending time
+# Record ending time and calculate total run time
 timeEnd <- as.character(Sys.time())
+timeTaken <- capture.output(round(difftime(timeEnd,timeStart), digits=3))
+timeTaken <- gsub("Time difference of ", "", timeTaken, fixed=TRUE)
 ################################################################################
 ### HTML Generation
 ################################################################################
@@ -620,12 +680,79 @@
 cata("or other spreadsheet programs</p>\n")
 cata("<table border=\"0\">\n")
 
+cata("<h4>Additional Information:</h4>\n")
+
+if (inputType == "fastq") {
+  ListItem("Data was gathered from fastq raw read file(s).")
+} else if (inputType == "counts") {
+  ListItem("Data was gathered from a table of counts.")
+}
+
+if (cpmReq!=0 && sampleReq!=0) {
+  tempStr <- paste("Hairpins that do not have more than", cpmReq,
+                   "CPM in at least", sampleReq, "samples are considered",
+                   "insignificant and filtered out.")
+  ListItem(tempStr)
+  filterProp <- round(filteredCount/preFilterCount*100, digits=2)
+  tempStr <- paste0(filteredCount, " of ", preFilterCount," (", filterProp,
+                   "%) hairpins were filtered out for low count-per-million.")
+  ListItem(tempStr)
+}
+
+if (workMode == "classic") {
+  ListItem("An exact test was performed on each hairpin.")
+} else if (workMode == "glm") {
+  ListItem("A generalised linear model was fitted to each hairpin.")
+}
+
+
+
+cit <- character()
+link <-character()
+link[1] <- paste0("<a href=\"",
+                  "http://www.bioconductor.org/packages/release/bioc/",
+                  "vignettes/limma/inst/doc/usersguide.pdf",
+                  "\">", "limma User's Guide", "</a>.")
+link[2] <- paste0("<a href=\"",
+                  "http://www.bioconductor.org/packages/release/bioc/",
+                  "vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf",
+                  "\">", "edgeR User's Guide", "</a>")
+                  
+cit[1] <- paste("Robinson MD, McCarthy DJ and Smyth GK (2010).",
+                "edgeR: a Bioconductor package for differential",
+                "expression analysis of digital gene expression",
+                "data. Bioinformatics 26, 139-140")
+cit[2] <- paste("Robinson MD and Smyth GK (2007). Moderated statistical tests",
+                "for assessing differences in tag abundance. Bioinformatics",
+                "23, 2881-2887")
+cit[3] <- paste("Robinson MD and Smyth GK (2008). Small-sample estimation of",
+                "negative binomial dispersion, with applications to SAGE data.",
+                "Biostatistics, 9, 321-332")
+
+cit[4] <- paste("McCarthy DJ, Chen Y and Smyth GK (2012). Differential",
+                "expression analysis of multifactor RNA-Seq experiments with",
+                "respect to biological variation. Nucleic Acids Research 40,",
+                "4288-4297")
+
+cata("<h4>Citations</h4>")
+cata("<ol>\n")
+ListItem(cit[1])
+ListItem(cit[2])
+ListItem(cit[3])
+ListItem(cit[4])
+cata("</ol>\n")
+
+cata("<table border=\"0\">\n")
 cata("<tr>\n")
 TableItem("Task started at:"); TableItem(timeStart)
 cata("</tr>\n")
 cata("<tr>\n")
 TableItem("Task ended at:"); TableItem(timeEnd)
 cata("</tr>\n")
+cata("<tr>\n")
+TableItem("Task run time:"); TableItem(timeTaken)
+cata("<tr>\n")
+cata("</table>\n")
 
 cata("</body>\n")
 cata("</html>")
--- a/hairpinTool.xml	Fri Feb 21 12:49:45 2014 +1100
+++ b/hairpinTool.xml	Tue Mar 18 12:45:36 2014 +1100
@@ -1,4 +1,4 @@
-<tool id="shRNAseq" name="shRNAseq Tool" version="1.0.5">
+<tool id="shRNAseq" name="shRNAseq Tool" version="1.0.6">
   <description>
     Analyse hairpin differential representation using edgeR
   </description>
@@ -13,8 +13,8 @@
   </stdio>
   
   <command interpreter="Rscript">
-  hairpinTool.R $inputOpt.type
-                #if $inputOpt.type=="fastq":
+  hairpinTool.R $inputOpt.inputType
+                #if $inputOpt.inputType=="fastq":
                   #for $i, $fas in enumerate($inputOpt.fastq):
                     fastq::$fas.file
                   #end for
@@ -22,7 +22,7 @@
                   $inputOpt.hairpin
                   $inputOpt.samples
                     
-                  #if $inputOpt.positions.option=="yes":
+                  #if $inputOpt.positions.posOption=="yes":
                     $inputOpt.positions.barstart
                     $inputOpt.positions.barend
                     $inputOpt.positions.hpstart
@@ -36,11 +36,11 @@
                 #else:
                   $inputOpt.counts
                   $inputOpt.anno
-                  "$inputOpt.factors"
+                  $inputOpt.factors
                   0 0 0
                 #end if
           
-                #if $filterCPM.option=="yes":
+                #if $filterCPM.filtOption=="yes":
                   $filterCPM.cpmReq
                   $filterCPM.sampleReq
                 #else:
@@ -59,10 +59,10 @@
                   "$workMode.pair2"
                 #else:
                   "$workMode.contrast"
-                  $workMode.roast.option
-                  #if $workMode.roast.option=="yes":
+                  $workMode.roast.roastOption
+                  #if $workMode.roast.roastOption=="yes":
                     $workMode.roast.hairpinReq
-                    $workMode.roast.select.option
+                    $workMode.roast.select.selOption
                     "$workMode.roast.select.selection"
                   #else:
                     0
@@ -74,7 +74,7 @@
   
   <inputs>
     <conditional name="inputOpt">
-      <param name="type" type="select" label="Input File Type">
+      <param name="inputType" type="select" label="Input File Type">
         <option value="fastq">FastQ File</option>
         <option value="counts">Table of Counts</option>
       </param>
@@ -92,7 +92,7 @@
         </repeat>
           
         <conditional name="positions">
-          <param name="option" type="select" 
+          <param name="posOption" type="select" 
                  label="Specify Barcode and Hairpin Locations?"
                  help="Default Positions: Barcode: 1 to 5, Hairpin: 37 to 57.">
             <option value="no" selected="True">No</option>
@@ -126,7 +126,7 @@
     </conditional>
     
     <conditional name="filterCPM">
-      <param name="option" type="select" label="Filter Low CPM?"
+      <param name="filtOption" type="select" label="Filter Low CPM?"
        help="Ignore hairpins with very low representation when performing 
              analysis.">
         <option value="yes">Yes</option>
@@ -172,7 +172,7 @@
                      expression."/>
                
         <conditional name="roast">
-          <param name="option" type="select" 
+          <param name="roastOption" type="select" 
                  label="Perform Gene Level Analysis?"
                  help="Analyse LogFC tendencies for hairpins belonging
                        to the same gene.">
@@ -187,7 +187,7 @@
                          be analysed."/>
                          
             <conditional name="select">
-              <param name="option" type="select"
+              <param name="selOption" type="select"
                      label="Gene Selection Method">
                 <option value="rank">By p-value Rank</option>
                 <option value="geneID">By Gene Identifier</option>
@@ -232,7 +232,18 @@
   <outputs>
     <data format="html" name="outFile" label="shRNAseq Analysis"/>
   </outputs>
-  
+  <!-- Tests under construction
+  <tests>
+    <test>
+      <param name="counts" value="zuber_count.txt"/>
+      <param name="anno" value="zuber_anno.txt"/>
+      <param name="factors" value="zuber_samples.txt"/>
+      <param name="fdr" value="0.05"/>
+      <output name="output" file="shRNAseq_Analysis_html" lines_diff="50"/>
+    </test>
+    
+  </tests>
+  -->
   <help>
 .. class:: infomark