Mercurial > repos > hspitia > rpkm_values
diff GetRPKMvalues/GetRPKMvalues.xml @ 0:888cb13321fa draft default tip
Uploaded
| author | hspitia |
|---|---|
| date | Sun, 07 Sep 2014 02:02:23 -0400 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/GetRPKMvalues/GetRPKMvalues.xml Sun Sep 07 02:02:23 2014 -0400
@@ -0,0 +1,214 @@
+<tool id="GetRPKMvalues" name="GetRPKMvalues" version="0.01">
+<description> from BAM file</description>
+
+<command interpreter="python">
+
+    GetRPKMvalues.py --script_path "$runMe" --interpreter "Rscript"
+            --tool_name "GetRPKMvalues" --input_tab "$input1" --output_dir "./" --output_tab "$tab_file"
+</command>
+<inputs>
+<param name="input1" type="data" format="bam" label="Select a suitable input file from your history"/>
+<param name="job_name" type="text" label="Supply a name for the outputs to remind you what they contain" value="GetRPKMvalues"/>
+
+</inputs>
+<outputs>
+ <data format="tabular" name="tab_file" label="${job_name}"/>
+
+</outputs>
+<configfiles>
+<configfile name="runMe">
+#!/usr/bin/Rscript
+
+# Computes RPKM values for every reference sequence of a BAM file.
+# Arguments: (1) path of the input BAM file, (2) path of the output table.
+# Requires the Rsamtools and foreach packages. Rsamtools can be installed with:
+# source("http://bioconductor.org/biocLite.R")
+# biocLite("Rsamtools")
+# This script is embedded in the tool XML, so it uses "=" for assignment and
+# [[ ]] for element access to stay free of characters that need escaping.
+
+library(Rsamtools)
+library(foreach)
+
+ourargs = commandArgs(trailingOnly = TRUE)
+if (!(length(ourargs) >= 2)) {
+    stop("Two arguments are required: the input BAM file and the output file")
+}
+inf = ourargs[1]
+outf = ourargs[2]
+
+# Counts the mapped reads of each reference sequence in bam.file.
+# Returns a data frame with the columns ref.sequence and mapped.reads.
+GetMappedReadsFreqs = function(bam.file){
+    mapped.flag = scanBamFlag(isUnmappedQuery = FALSE) # Keep mapped reads only
+    mapped.param = ScanBamParam(flag = mapped.flag, what = "rname") # Read only the reference names
+    bam.index.file = indexBam(bam.file) # Builds the index file for bam.file
+    mapped.scan.res = scanBam(file = bam.file, index = bam.index.file, param = mapped.param)
+    mapped = as.data.frame(table(mapped.scan.res[[1]][["rname"]]))
+    colnames(mapped) = c("ref.sequence", "mapped.reads")
+    mapped
+}
+
+# Reads the reference sequence lengths from the header of bam.file.
+# Returns a one-column data frame (length) with the sequence names as row names.
+GetRefSeqsLengths = function(bam.file){
+    targets = scanBamHeader(files = bam.file)[[1]][["targets"]]
+    data.frame(length = targets, row.names = names(targets))
+}
+
+# Joins the mapped read counts and the lengths of the reference sequences.
+GetIdxStats = function(bam.file) {
+    mapped = GetMappedReadsFreqs(bam.file = bam.file)
+    ref.seqs.lengths = GetRefSeqsLengths(bam.file = bam.file)
+    merge(x = mapped, y = ref.seqs.lengths,
+          by.x = "ref.sequence", by.y = "row.names")
+}
+
+# Computes RPKM = mapped reads * 10^9 / (sequence length * total mapped reads).
+# Returns a one-column data frame (RPKM) with the sequence names as row names.
+# The values are NaN when the BAM file has no mapped reads.
+GetRPKMValues = function(bam.file) {
+    bam.stats = GetIdxStats(bam.file = bam.file)
+    total.mapped = sum(bam.stats[["mapped.reads"]])
+    mapped.factor = total.mapped / 10^9
+    rpkm = bam.stats[["mapped.reads"]] / (bam.stats[["length"]] * mapped.factor)
+    data.frame(RPKM = rpkm, row.names = bam.stats[["ref.sequence"]])
+}
+
+bam.files = inf
+
+not.found = bam.files[!file.exists(bam.files)]
+if (length(not.found) > 0) {
+    out.names = paste(not.found, collapse = "\n ")
+    stop(paste("One or more files does not exist:\n", out.names))
+}
+
+rpkm.data = foreach(c.bam.file = bam.files, .combine = "cbind") %do% {
+    GetRPKMValues(bam.file = c.bam.file)
+}
+colnames(rpkm.data) = vapply(bam.files, basename, character(1))
+out.data = cbind(ref.sequence = rownames(rpkm.data), rpkm.data)
+write.table(out.data, file = outf, quote = FALSE, sep = "\t",
+            row.names = FALSE, col.names = TRUE)
+</configfile>
+</configfiles>
+
+
+ <tests>
+ <test>
+ <param name="input1" value="GetRPKMvalues_test1_input.xls" ftype="bam"/>
+ <param name="job_name" value="test1"/>
+ <param name="runMe" value="$runMe"/>
+ <output name="tab_file" file="GetRPKMvalues_test1_output.xls" ftype="tabular"/>
+ </test>
+ </tests>
+
+
+<help>
+
+
+This tool computes the RPKM values from a given BAM file.
+
+This tool is an R script and requires the Rsamtools and foreach packages.
+
+The output is a tabular file with two columns:
+
+1. ref.sequence: which contains the names of the reference sequences (genes/transcripts), and
+
+2. dataset_#.dat: which refers to the name of the input dataset (the BAM file specified) and contains the RPKM value of each reference sequence. The symbol # represents a consecutive number automatically assigned by Galaxy.
+
+**Script**
+
+Pressing execute will run the following code over your input file and generate some outputs in your history::
+
+
+    #!/usr/bin/Rscript
+
+    # Computes RPKM values for every reference sequence of a BAM file.
+    # Arguments: (1) path of the input BAM file, (2) path of the output table.
+    # Requires the Rsamtools and foreach packages. Rsamtools can be installed with:
+    # source("http://bioconductor.org/biocLite.R")
+    # biocLite("Rsamtools")
+    # This script is embedded in the tool XML, so it uses "=" for assignment and
+    # [[ ]] for element access to stay free of characters that need escaping.
+
+    library(Rsamtools)
+    library(foreach)
+
+    ourargs = commandArgs(trailingOnly = TRUE)
+    if (!(length(ourargs) >= 2)) {
+        stop("Two arguments are required: the input BAM file and the output file")
+    }
+    inf = ourargs[1]
+    outf = ourargs[2]
+
+    # Counts the mapped reads of each reference sequence in bam.file.
+    # Returns a data frame with the columns ref.sequence and mapped.reads.
+    GetMappedReadsFreqs = function(bam.file){
+        mapped.flag = scanBamFlag(isUnmappedQuery = FALSE) # Keep mapped reads only
+        mapped.param = ScanBamParam(flag = mapped.flag, what = "rname") # Read only the reference names
+        bam.index.file = indexBam(bam.file) # Builds the index file for bam.file
+        mapped.scan.res = scanBam(file = bam.file, index = bam.index.file, param = mapped.param)
+        mapped = as.data.frame(table(mapped.scan.res[[1]][["rname"]]))
+        colnames(mapped) = c("ref.sequence", "mapped.reads")
+        mapped
+    }
+
+    # Reads the reference sequence lengths from the header of bam.file.
+    # Returns a one-column data frame (length) with the sequence names as row names.
+    GetRefSeqsLengths = function(bam.file){
+        targets = scanBamHeader(files = bam.file)[[1]][["targets"]]
+        data.frame(length = targets, row.names = names(targets))
+    }
+
+    # Joins the mapped read counts and the lengths of the reference sequences.
+    GetIdxStats = function(bam.file) {
+        mapped = GetMappedReadsFreqs(bam.file = bam.file)
+        ref.seqs.lengths = GetRefSeqsLengths(bam.file = bam.file)
+        merge(x = mapped, y = ref.seqs.lengths,
+              by.x = "ref.sequence", by.y = "row.names")
+    }
+
+    # Computes RPKM = mapped reads * 10^9 / (sequence length * total mapped reads).
+    # Returns a one-column data frame (RPKM) with the sequence names as row names.
+    # The values are NaN when the BAM file has no mapped reads.
+    GetRPKMValues = function(bam.file) {
+        bam.stats = GetIdxStats(bam.file = bam.file)
+        total.mapped = sum(bam.stats[["mapped.reads"]])
+        mapped.factor = total.mapped / 10^9
+        rpkm = bam.stats[["mapped.reads"]] / (bam.stats[["length"]] * mapped.factor)
+        data.frame(RPKM = rpkm, row.names = bam.stats[["ref.sequence"]])
+    }
+
+    bam.files = inf
+
+    not.found = bam.files[!file.exists(bam.files)]
+    if (length(not.found) > 0) {
+        out.names = paste(not.found, collapse = "\n ")
+        stop(paste("One or more files does not exist:\n", out.names))
+    }
+
+    rpkm.data = foreach(c.bam.file = bam.files, .combine = "cbind") %do% {
+        GetRPKMValues(bam.file = c.bam.file)
+    }
+    colnames(rpkm.data) = vapply(bam.files, basename, character(1))
+    out.data = cbind(ref.sequence = rownames(rpkm.data), rpkm.data)
+    write.table(out.data, file = outf, quote = FALSE, sep = "\t",
+                row.names = FALSE, col.names = TRUE)
+
+**Attribution**
+
+This Galaxy tool was created by hfespitia@cenicana.org on 06/09/2014 23:41:54
+using the Galaxy Tool Factory.
+
+See https://bitbucket.org/fubar/galaxytoolfactory for details of that project.
+Please cite: Creating re-usable tools from scripts: The Galaxy Tool Factory. Ross Lazarus; Antony Kaspi; Mark Ziemann; The Galaxy Team.
+Bioinformatics 2012; doi: 10.1093/bioinformatics/bts573
+
+
+</help>
+<citations>
+
+ <citation type="doi">10.1093/bioinformatics/bts573</citation>
+</citations>
+</tool>
