Mercurial > repos > hspitia > rpkm_values

diff GetRPKMvalues/GetRPKMvalues.xml @ 0:888cb13321fa draft default tip
Uploaded
author: hspitia
date: Sun, 07 Sep 2014 02:02:23 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/GetRPKMvalues/GetRPKMvalues.xml	Sun Sep 07 02:02:23 2014 -0400
@@ -0,0 +1,189 @@
+<tool id="GetRPKMvalues" name="GetRPKMvalues" version="0.01">
+<description> from BAM file</description>
+
+<command interpreter="python">
+
+        GetRPKMvalues.py --script_path "$runMe" --interpreter "Rscript" 
+            --tool_name "GetRPKMvalues" --input_tab "$input1"   --output_dir "./" --output_tab "$tab_file" 
+</command>
+<inputs>
+<param name="input1"  type="data" format="bam" label="Select a suitable input file from your history"/> 
+<param name="job_name" type="text" label="Supply a name for the outputs to remind you what they contain" value="GetRPKMvalues"/> 
+
+</inputs>
+<outputs>
+ <data format="tabular" name="tab_file" label="${job_name}"/>
+
+</outputs>
+<configfiles>
+<configfile name="runMe">
+#!/usr/bin/Rscript
+
+# source("http://bioconductor.org/biocLite.R")
+# biocLite("Rsamtools")
+
+ourargs = commandArgs(trailingOnly = TRUE) # Keep only the arguments supplied after the script itself
+inf  = ourargs[1] # First argument: input path (assigned to bam.files further down)
+outf = ourargs[2] # Second argument: output path handed to write.table
+
+
+library(Rsamtools)
+library(foreach)
+
+# Counts mapped reads per reference sequence; returns a data frame with columns ref.sequence and mapped.reads.
+GetMappedReadsFreqs = function(bam.file){
+  index.path = indexBam(bam.file) # Builds the index; no require() needed, library(Rsamtools) is attached at script start
+  only.mapped = scanBamFlag(isUnmappedQuery = FALSE) # Keep mapped reads only
+  rname.param = ScanBamParam(flag = only.mapped, what = c("rname")) # Fetch just the reference name of each read
+  scanned = scanBam(file = bam.file, index = index.path, param = rname.param)
+  freqs = as.data.frame(table(scanned[[1]]\$rname))
+  colnames(freqs) = c("ref.sequence", "mapped.reads")
+  freqs
+}
+
+# Reads the reference sequence lengths from the BAM header.
+# Returns a one-column data frame (length) whose row names are the reference sequence names.
+GetRefSeqsLengths = function(bam.file){
+  targets = scanBamHeader(files = bam.file)[[1]]\$targets # Header entry of the first (only) file passed in
+  data.frame(length = targets, row.names = names(targets))
+}
+
+# Joins the per-reference mapped read counts with the reference lengths, matching ref.sequence against the row names of the lengths table.
+GetIdxStats =  function(bam.file) {
+  read.counts = GetMappedReadsFreqs(bam.file = bam.file)
+  seq.lengths = GetRefSeqsLengths(bam.file = bam.file)
+  merge(x = read.counts, y = seq.lengths, by.x = "ref.sequence", by.y = "row.names")
+}
+
+# Computes RPKM per reference sequence as mapped reads / (reference length * total mapped reads / 10^9).
+GetRPKMValues = function(bam.file) {
+  idx.stats = GetIdxStats(bam.file = bam.file)
+  # Total mapped reads across all reference sequences, expressed in units of 10^9
+  scale.factor = sum(idx.stats\$mapped.reads) / 10^9
+  rpkm = (idx.stats\$mapped.reads / (idx.stats\$length * scale.factor))
+  data.frame(RPKM = rpkm, row.names = idx.stats\$ref.sequence)
+}
+
+# Driver: check that the input exists, compute RPKM for each BAM file, and write a tab-separated table to outf.
+bam.files = inf
+missing.mask = !file.exists(bam.files)
+
+if(any(missing.mask)){
+  # Abort, listing every path that could not be found
+  out.names = paste(bam.files[missing.mask], collapse = "\n ")
+  stop(paste("One or more files does not exist:\n", out.names))
+}
+# One RPKM column per input file, bound side by side
+rpkm.data = foreach(c.bam.file = bam.files, .combine = "cbind") %do% { GetRPKMValues(bam.file = c.bam.file) }
+colnames(rpkm.data) = sapply(bam.files, basename)
+# Row names become an explicit first column because the table is written with row.names = FALSE
+out.data = cbind(ref.sequence = rownames(rpkm.data), rpkm.data)
+write.table(out.data, file = outf, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
+</configfile>
+</configfiles>
+
+
+        <tests>
+        <test>
+        <param name="input1" value="GetRPKMvalues_test1_input.xls" ftype="bam"/>
+        <param name="job_name" value="test1"/>
+        <param name="runMe" value="$runMe"/>
+        <output name="tab_file" file="GetRPKMvalues_test1_output.xls" ftype="tabular"/>
+        </test>
+        </tests>
+        
+
+<help>
+
+
+This tool computes the RPKM values from a given BAM file.
+
+This tool is an R script and requires the Rsamtools and foreach packages.
+
+The output is a tabular file with two columns:
+
+1. ref.sequence: which contains the names of the reference sequences (genes/transcripts), and
+2. dataset_#.dat: which refers to the name of the input dataset (the specified BAM file) and contains the RPKM values for each reference sequence. The symbol # represents a consecutive number automatically assigned by Galaxy.
+
+**Script**
+Pressing execute will run the following code over your input file and generate some outputs in your history::
+
+
+ #!/usr/bin/Rscript
+ 
+ # source("http://bioconductor.org/biocLite.R")
+ # biocLite("Rsamtools")
+ 
+ ourargs = commandArgs(trailingOnly = TRUE)
+ inf  = ourargs[1]
+ outf = ourargs[2]
+ 
+ 
+ library(Rsamtools)
+ library(foreach)
+ 
+ GetMappedReadsFreqs = function(bam.file){
+   require(Rsamtools)
+   mapped.flag  = scanBamFlag(isUnmappedQuery = FALSE) # Get all mapped reads
+   mapped.param = ScanBamParam(flag = mapped.flag, what = c("rname")) # Get only the reference name to which reads were mapped
+   bam.index.file  = indexBam(bam.file)
+   mapped.scan.res = scanBam(param = mapped.param, file = bam.file, index = bam.index.file)
+   mapped = as.data.frame(table(mapped.scan.res[[1]]\$rname))
+   colnames(mapped) = c("ref.sequence", "mapped.reads")
+   mapped
+ }
+ 
+ GetRefSeqsLengths = function(bam.file){
+   require(Rsamtools)
+   header = scanBamHeader(files = bam.file)
+   ref.seqs.data = as.data.frame(header[[1]]\$targets)
+   data.frame(length = header[[1]]\$targets, row.names = names(header[[1]]\$targets))
+ }
+ 
+ GetIdxStats =  function(bam.file) {
+   mapped = GetMappedReadsFreqs(bam.file = bam.file)
+   ref.seqs.lengths = GetRefSeqsLengths(bam.file = bam.file)
+   merge(x = mapped, y = ref.seqs.lengths,
+         by.x = "ref.sequence", by.y = "row.names")
+ }
+ 
+ GetRPKMValues = function(bam.file) {
+   bam.stats = GetIdxStats(bam.file = bam.file)
+   total.mapped  = sum(bam.stats\$mapped.reads)
+   mapped.factor = total.mapped / 10^9
+ 
+   bam.stats\$RPKM = (bam.stats\$mapped.reads / (bam.stats\$length * mapped.factor))
+   data.frame(RPKM = bam.stats\$RPKM, row.names = bam.stats\$ref.sequence)
+ }
+ 
+ bam.files = inf
+ 
+ if(any(!file.exists(bam.files))){
+   not.found = bam.files[!file.exists(bam.files)]
+   out.names = paste(not.found, collapse = "\n ")
+   stop(paste("One or more files does not exist:\n", out.names))
+ } else {
+   rpkm.data = foreach(c.bam.file = bam.files, .combine = "cbind") %do% {
+     GetRPKMValues(bam.file = c.bam.file)
+   }
+   colnames(rpkm.data) = sapply(bam.files, basename)
+   out.data = cbind(ref.sequence = rownames(rpkm.data), rpkm.data)
+   write.table(out.data, quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE,
+               file = outf)
+ }
+
+**Attribution**
+This Galaxy tool was created by hfespitia@cenicana.org at 06/09/2014 23:41:54
+using the Galaxy Tool Factory.
+
+See https://bitbucket.org/fubar/galaxytoolfactory for details of that project
+Please cite: Creating re-usable tools from scripts: The Galaxy Tool Factory. Ross Lazarus; Antony Kaspi; Mark Ziemann; The Galaxy Team. 
+Bioinformatics 2012; doi: 10.1093/bioinformatics/bts573
+
+
+</help>
+<citations>
+    
+    <citation type="doi">10.1093/bioinformatics/bts573</citation>
+</citations>
+</tool>
author	hspitia
date	Sun, 07 Sep 2014 02:02:23 -0400
parents
children