view GetRPKMvalues/GetRPKMvalues.xml @ 0:888cb13321fa draft default tip

Uploaded
author hspitia
date Sun, 07 Sep 2014 02:02:23 -0400
parents
children
line wrap: on
line source

<tool id="GetRPKMvalues" name="GetRPKMvalues" version="0.01">
<description> from BAM file</description>

<command interpreter="python">

        GetRPKMvalues.py --script_path "$runMe" --interpreter "Rscript" 
            --tool_name "GetRPKMvalues" --input_tab "$input1"   --output_dir "./" --output_tab "$tab_file" 
</command>
<inputs>
<param name="input1"  type="data" format="bam" label="Select a suitable input file from your history"/> 
<param name="job_name" type="text" label="Supply a name for the outputs to remind you what they contain" value="GetRPKMvalues"/> 

</inputs>
<outputs>
 <data format="tabular" name="tab_file" label="${job_name}"/>

</outputs>
<configfiles>
<configfile name="runMe">
#!/usr/bin/Rscript

# source("http://bioconductor.org/biocLite.R")
# biocLite("Rsamtools")

# Trailing command-line arguments: the first is used as the input BAM path,
# the second as the path of the tabular output file.
ourargs = commandArgs(trailingOnly = TRUE)
# Indexing past the end of a vector yields NA, so this catches a missing
# argument before any BAM processing starts instead of failing at write time.
if (any(is.na(ourargs[1:2]))) {
  stop("Two arguments are required: the input BAM file and the output tabular file",
       call. = FALSE)
}
inf  = ourargs[1]
outf = ourargs[2]


library(Rsamtools)
library(foreach)

# Count the mapped reads per reference sequence in a BAM file.
#
# bam.file: path to the BAM file. indexBam() is called on it before scanning
#           (NOTE(review): per Rsamtools this writes an index file next to the
#           BAM, so the location presumably has to be writable - confirm).
# Returns a data frame with one row per reference name and the columns
# "ref.sequence" and "mapped.reads".
#
# Relies on library(Rsamtools) having been attached at the top of the script.
GetMappedReadsFreqs = function(bam.file){
  # Keep only mapped reads, and read nothing but the reference name of each.
  mapped.param = ScanBamParam(flag = scanBamFlag(isUnmappedQuery = FALSE),
                              what = "rname")
  bam.index.file = indexBam(bam.file)
  mapped.scan.res = scanBam(file = bam.file, index = bam.index.file,
                            param = mapped.param)
  # Tabulating the reference names gives the number of mapped reads per name.
  mapped = as.data.frame(table(mapped.scan.res[[1]][["rname"]]))
  colnames(mapped) = c("ref.sequence", "mapped.reads")
  mapped
}

# Read the reference sequence lengths from the header of a BAM file.
#
# bam.file: path to the BAM file.
# Returns a data frame with a single column "length" whose row names are the
# names of the header's "targets" entry.
#
# Relies on library(Rsamtools) having been attached at the top of the script.
GetRefSeqsLengths = function(bam.file){
  # Only the header of the first (and only) file is needed; fetch it once.
  targets = scanBamHeader(files = bam.file)[[1]][["targets"]]
  data.frame(length = targets, row.names = names(targets))
}

# Join the per-reference mapped-read counts with the reference lengths.
#
# bam.file: path to the BAM file, passed on to both helper functions.
# Returns the merged data frame: one row per reference name present in both
# tables, carrying the read count and the length.
GetIdxStats = function(bam.file) {
  read.counts = GetMappedReadsFreqs(bam.file = bam.file)
  seq.lengths = GetRefSeqsLengths(bam.file = bam.file)
  # The counts table is keyed by a column, the lengths table by its row names.
  merge(read.counts, seq.lengths, by.x = "ref.sequence", by.y = "row.names")
}

# Compute the RPKM value of every reference sequence in a BAM file.
#
# bam.file: path to the BAM file.
# Returns a data frame with a single column "RPKM", with the reference
# sequence names as row names.
GetRPKMValues = function(bam.file) {
  stats = GetIdxStats(bam.file = bam.file)
  reads = stats[["mapped.reads"]]
  # Total mapped reads expressed in units of 10^9, so that dividing by
  # (length * factor) equals reads * 10^9 / (length * total).
  total.in.billions = sum(reads) / 10^9
  data.frame(RPKM = reads / (stats[["length"]] * total.in.billions),
             row.names = stats[["ref.sequence"]])
}

bam.files = inf

# Stop right away, listing every input path that cannot be found.
is.missing = !file.exists(bam.files)
if (any(is.missing)) {
  out.names = paste(bam.files[is.missing], collapse = "\n ")
  stop(paste("One or more files does not exist:\n", out.names))
}

# One RPKM column per BAM file, bound side by side.
rpkm.data = foreach(c.bam.file = bam.files, .combine = "cbind") %do% {
  GetRPKMValues(bam.file = c.bam.file)
}
# Label each column with the file name of the BAM it came from.
colnames(rpkm.data) = basename(bam.files)

# Turn the row names into a regular first column, then write a tab-separated
# table with a header line and no quoting.
out.data = cbind(ref.sequence = rownames(rpkm.data), rpkm.data)
write.table(out.data, file = outf, sep = "\t", quote = FALSE,
            row.names = FALSE, col.names = TRUE)
</configfile>
</configfiles>


        <tests>
        <test>
        <param name="input1" value="GetRPKMvalues_test1_input.xls" ftype="bam"/>
        <param name="job_name" value="test1"/>
        <param name="runMe" value="$runMe"/>
        <output name="tab_file" file="GetRPKMvalues_test1_output.xls" ftype="tabular"/>
        </test>
        </tests>
        

<help>


This tool computes the RPKM values from a given BAM file.

This tool is an R script and requires the Rsamtools and foreach packages.

The output is a tabular file with two columns:

1. ref.sequence: the names of the reference sequences (genes/transcripts), and

2. dataset_#.dat: named after the input dataset (the BAM file specified); it contains the RPKM value for each reference sequence. The symbol # stands for a consecutive number automatically assigned by Galaxy.

**Script**
Pressing execute will run the following code over your input file and generate some outputs in your history::


 #!/usr/bin/Rscript
 
 # source("http://bioconductor.org/biocLite.R")
 # biocLite("Rsamtools")
 
 ourargs = commandArgs(trailingOnly = TRUE)
 inf  = ourargs[1]
 outf = ourargs[2]
 
 
 library(Rsamtools)
 library(foreach)
 
 GetMappedReadsFreqs = function(bam.file){
   require(Rsamtools)
   mapped.flag  = scanBamFlag(isUnmappedQuery = FALSE) # Get all mapped reads
   mapped.param = ScanBamParam(flag = mapped.flag, what = c("rname")) # Get only the reference name to which reads were mapped
   bam.index.file  = indexBam(bam.file)
   mapped.scan.res = scanBam(param = mapped.param, file = bam.file, index = bam.index.file)
   mapped = as.data.frame(table(mapped.scan.res[[1]]\$rname))
   colnames(mapped) = c("ref.sequence", "mapped.reads")
   mapped
 }
 
 GetRefSeqsLengths = function(bam.file){
   require(Rsamtools)
   header = scanBamHeader(files = bam.file)
   ref.seqs.data = as.data.frame(header[[1]]\$targets)
   data.frame(length = header[[1]]\$targets, row.names = names(header[[1]]\$targets))
 }
 
 GetIdxStats =  function(bam.file) {
   mapped = GetMappedReadsFreqs(bam.file = bam.file)
   ref.seqs.lengths = GetRefSeqsLengths(bam.file = bam.file)
   merge(x = mapped, y = ref.seqs.lengths,
         by.x = "ref.sequence", by.y = "row.names")
 }
 
 GetRPKMValues = function(bam.file) {
   bam.stats = GetIdxStats(bam.file = bam.file)
   total.mapped  = sum(bam.stats\$mapped.reads)
   mapped.factor = total.mapped / 10^9
 
   bam.stats\$RPKM = (bam.stats\$mapped.reads / (bam.stats\$length * mapped.factor))
   data.frame(RPKM = bam.stats\$RPKM, row.names = bam.stats\$ref.sequence)
 }
 
 bam.files = inf
 
 if(any(!file.exists(bam.files))){
   not.found = bam.files[!file.exists(bam.files)]
   out.names = paste(not.found, collapse = "\n ")
   stop(paste("One or more files does not exist:\n", out.names))
 } else {
   rpkm.data = foreach(c.bam.file = bam.files, .combine = "cbind") %do% {
     GetRPKMValues(bam.file = c.bam.file)
   }
   colnames(rpkm.data) = sapply(bam.files, basename)
   out.data = cbind(ref.sequence = rownames(rpkm.data), rpkm.data)
   write.table(out.data, quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE,
               file = outf)
 }

**Attribution**
This Galaxy tool was created by hfespitia@cenicana.org at 06/09/2014 23:41:54
using the Galaxy Tool Factory.

See https://bitbucket.org/fubar/galaxytoolfactory for details of that project
Please cite: Creating re-usable tools from scripts: The Galaxy Tool Factory. Ross Lazarus; Antony Kaspi; Mark Ziemann; The Galaxy Team. 
Bioinformatics 2012; doi: 10.1093/bioinformatics/bts573


</help>
<citations>
    
    <citation type="doi">10.1093/bioinformatics/bts573</citation>
</citations>
</tool>