Mercurial > repos > hspitia > rpkm_values
view GetRPKMvalues/GetRPKMvalues.xml @ 0:888cb13321fa draft default tip
Uploaded
| author | hspitia |
|---|---|
| date | Sun, 07 Sep 2014 02:02:23 -0400 |
| parents | |
| children | |
line wrap: on
line source
<tool id="GetRPKMvalues" name="GetRPKMvalues" version="0.01">
    <!--
        Galaxy tool wrapper that computes RPKM values from a BAM file.
        The command runs GetRPKMvalues.py, handing it the generated R script
        (configfile "runMe"), the selected BAM dataset and the tabular output.
        The R script text lives inside a template: every dollar sign used by R
        is backslash-escaped, and the assignment operator is "=" so that no
        angle bracket has to be XML-escaped.
    -->
    <description> from BAM file</description>
    <command interpreter="python"> GetRPKMvalues.py --script_path "$runMe" --interpreter "Rscript" --tool_name "GetRPKMvalues" --input_tab "$input1" --output_dir "./" --output_tab "$tab_file" </command>
    <inputs>
        <param name="input1" type="data" format="bam" label="Select a suitable input file from your history"/>
        <param name="job_name" type="text" label="Supply a name for the outputs to remind you what they contain" value="GetRPKMvalues"/>
    </inputs>
    <outputs>
        <data format="tabular" name="tab_file" label="${job_name}"/>
    </outputs>
    <configfiles>
        <configfile name="runMe">
#!/usr/bin/Rscript
# source("http://bioconductor.org/biocLite.R")
# biocLite("Rsamtools")

# Positional arguments: the input BAM path, then the output table path.
ourargs = commandArgs(trailingOnly = TRUE)
inf = ourargs[1]
outf = ourargs[2]
# Fail early with a clear message instead of after all the BAM processing.
if (is.na(inf) || is.na(outf)) {
    stop("Expected two arguments: the input BAM file and the output table file", call. = FALSE)
}

library(Rsamtools)
library(foreach)

# Count the mapped reads per reference sequence of one BAM file.
# Returns a data frame with the columns ref.sequence and mapped.reads.
GetMappedReadsFreqs = function(bam.file){
    mapped.flag = scanBamFlag(isUnmappedQuery = FALSE)  # Get all mapped reads
    mapped.param = ScanBamParam(flag = mapped.flag, what = c("rname"))  # Get only the reference name to which reads were mapped
    # The value returned by indexBam is passed on to scanBam as its index.
    bam.index.file = indexBam(bam.file)
    mapped.scan.res = scanBam(param = mapped.param, file = bam.file, index = bam.index.file)
    mapped = as.data.frame(table(mapped.scan.res[[1]]\$rname))
    colnames(mapped) = c("ref.sequence", "mapped.reads")
    mapped
}

# Read the reference sequence lengths from the header of one BAM file.
# Returns a one-column data frame (length) with the sequence names as row names.
GetRefSeqsLengths = function(bam.file){
    header = scanBamHeader(files = bam.file)
    targets = header[[1]]\$targets
    data.frame(length = targets, row.names = names(targets))
}

# Join the mapped read counts with the reference lengths by sequence name.
GetIdxStats = function(bam.file) {
    mapped = GetMappedReadsFreqs(bam.file = bam.file)
    ref.seqs.lengths = GetRefSeqsLengths(bam.file = bam.file)
    merge(x = mapped, y = ref.seqs.lengths, by.x = "ref.sequence", by.y = "row.names")
}

# Compute RPKM = mapped.reads * 10^9 / (length * total mapped reads).
# Returns a one-column data frame (RPKM) with the sequence names as row names.
GetRPKMValues = function(bam.file) {
    bam.stats = GetIdxStats(bam.file = bam.file)
    total.mapped = sum(bam.stats\$mapped.reads)
    mapped.factor = total.mapped / 10^9
    bam.stats\$RPKM = (bam.stats\$mapped.reads / (bam.stats\$length * mapped.factor))
    data.frame(RPKM = bam.stats\$RPKM, row.names = bam.stats\$ref.sequence)
}

bam.files = inf

if(any(!file.exists(bam.files))){
    not.found = bam.files[!file.exists(bam.files)]
    out.names = paste(not.found, collapse = "\n ")
    stop(paste("One or more files does not exist:\n", out.names))
} else {
    # One RPKM column per BAM file, bound side by side.
    rpkm.data = foreach(c.bam.file = bam.files, .combine = "cbind") %do% {
        GetRPKMValues(bam.file = c.bam.file)
    }
    # Columns are labelled with the BAM file names; basename is vectorized.
    colnames(rpkm.data) = basename(bam.files)
    out.data = cbind(ref.sequence = rownames(rpkm.data), rpkm.data)
    write.table(out.data, quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE, file = outf)
}
        </configfile>
    </configfiles>
    <tests>
        <test>
            <param name="input1" value="GetRPKMvalues_test1_input.xls" ftype="bam"/>
            <param name="job_name" value="test1"/>
            <param name="runMe" value="$runMe"/>
            <output name="tab_file" file="GetRPKMvalues_test1_output.xls" ftype="tabular"/>
        </test>
    </tests>
    <help>

This tool computes the RPKM values from a given BAM file.

This tool is an R script and requires the Rsamtools and foreach packages.

The output is a tabular file with two columns:

1. ref.sequence: which contains the names of reference sequences (genes/transcripts), and

2. dataset_#.dat: which refers to the name of the input dataset (BAM file specified) and contains the RPKM values for each reference sequence. The symbol # represents a consecutive number automatically assigned by Galaxy.

**Script**

Pressing execute will run the following code over your input file and generate some outputs in your history::

    #!/usr/bin/Rscript
    # source("http://bioconductor.org/biocLite.R")
    # biocLite("Rsamtools")

    # Positional arguments: the input BAM path, then the output table path.
    ourargs = commandArgs(trailingOnly = TRUE)
    inf = ourargs[1]
    outf = ourargs[2]
    # Fail early with a clear message instead of after all the BAM processing.
    if (is.na(inf) || is.na(outf)) {
        stop("Expected two arguments: the input BAM file and the output table file", call. = FALSE)
    }

    library(Rsamtools)
    library(foreach)

    # Count the mapped reads per reference sequence of one BAM file.
    # Returns a data frame with the columns ref.sequence and mapped.reads.
    GetMappedReadsFreqs = function(bam.file){
        mapped.flag = scanBamFlag(isUnmappedQuery = FALSE)  # Get all mapped reads
        mapped.param = ScanBamParam(flag = mapped.flag, what = c("rname"))  # Get only the reference name to which reads were mapped
        # The value returned by indexBam is passed on to scanBam as its index.
        bam.index.file = indexBam(bam.file)
        mapped.scan.res = scanBam(param = mapped.param, file = bam.file, index = bam.index.file)
        mapped = as.data.frame(table(mapped.scan.res[[1]]\$rname))
        colnames(mapped) = c("ref.sequence", "mapped.reads")
        mapped
    }

    # Read the reference sequence lengths from the header of one BAM file.
    # Returns a one-column data frame (length) with the sequence names as row names.
    GetRefSeqsLengths = function(bam.file){
        header = scanBamHeader(files = bam.file)
        targets = header[[1]]\$targets
        data.frame(length = targets, row.names = names(targets))
    }

    # Join the mapped read counts with the reference lengths by sequence name.
    GetIdxStats = function(bam.file) {
        mapped = GetMappedReadsFreqs(bam.file = bam.file)
        ref.seqs.lengths = GetRefSeqsLengths(bam.file = bam.file)
        merge(x = mapped, y = ref.seqs.lengths, by.x = "ref.sequence", by.y = "row.names")
    }

    # Compute RPKM = mapped.reads * 10^9 / (length * total mapped reads).
    # Returns a one-column data frame (RPKM) with the sequence names as row names.
    GetRPKMValues = function(bam.file) {
        bam.stats = GetIdxStats(bam.file = bam.file)
        total.mapped = sum(bam.stats\$mapped.reads)
        mapped.factor = total.mapped / 10^9
        bam.stats\$RPKM = (bam.stats\$mapped.reads / (bam.stats\$length * mapped.factor))
        data.frame(RPKM = bam.stats\$RPKM, row.names = bam.stats\$ref.sequence)
    }

    bam.files = inf

    if(any(!file.exists(bam.files))){
        not.found = bam.files[!file.exists(bam.files)]
        out.names = paste(not.found, collapse = "\n ")
        stop(paste("One or more files does not exist:\n", out.names))
    } else {
        # One RPKM column per BAM file, bound side by side.
        rpkm.data = foreach(c.bam.file = bam.files, .combine = "cbind") %do% {
            GetRPKMValues(bam.file = c.bam.file)
        }
        # Columns are labelled with the BAM file names; basename is vectorized.
        colnames(rpkm.data) = basename(bam.files)
        out.data = cbind(ref.sequence = rownames(rpkm.data), rpkm.data)
        write.table(out.data, quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE, file = outf)
    }

**Attribution**

This Galaxy tool was created by hfespitia@cenicana.org at 06/09/2014 23:41:54
using the Galaxy Tool Factory.

See https://bitbucket.org/fubar/galaxytoolfactory for details of that project

Please cite: Creating re-usable tools from scripts: The Galaxy Tool Factory. Ross Lazarus; Antony Kaspi; Mark Ziemann; The Galaxy Team.
Bioinformatics 2012; doi: 10.1093/bioinformatics/bts573

    </help>
    <citations>
        <citation type="doi">10.1093/bioinformatics/bts573</citation>
    </citations>
</tool>
