comparison GetRPKMvalues/GetRPKMvalues.xml @ 0:888cb13321fa draft default tip

Uploaded
author hspitia
date Sun, 07 Sep 2014 02:02:23 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:888cb13321fa
1 <tool id="GetRPKMvalues" name="GetRPKMvalues" version="0.01">
2 <description> from BAM file</description>
3
4 <command interpreter="python">
5
6 GetRPKMvalues.py --script_path "$runMe" --interpreter "Rscript"
7 --tool_name "GetRPKMvalues" --input_tab "$input1" --output_dir "./" --output_tab "$tab_file"
8 </command>
9 <inputs>
10 <param name="input1" type="data" format="bam" label="Select a suitable input file from your history"/>
11 <param name="job_name" type="text" label="Supply a name for the outputs to remind you what they contain" value="GetRPKMvalues"/>
12
13 </inputs>
14 <outputs>
15 <data format="tabular" name="tab_file" label="${job_name}"/>
16
17 </outputs>
18 <configfiles>
19 <configfile name="runMe">
20 #!/usr/bin/Rscript
21
22 # source("http://bioconductor.org/biocLite.R")
23 # biocLite("Rsamtools")
24
25 ourargs = commandArgs(trailingOnly = TRUE) # keep only the arguments supplied after the script itself
26 inf = ourargs[1] # first argument: path of the input BAM file (becomes bam.files further down)
27 outf = ourargs[2] # second argument: path of the tab-separated table written at the end of the script
28
29
30 library(Rsamtools)
31 library(foreach)
32
33 GetMappedReadsFreqs = function(bam.file){ # Count, per reference sequence, the reads of bam.file that are not flagged as unmapped
34 require(Rsamtools)
35 bai.path = indexBam(bam.file) # NOTE(review): no which ranges are requested below, so the index may be unnecessary - confirm before removing
36 only.mapped = ScanBamParam(what = c("rname"), # read just the reference name of each record
37 flag = scanBamFlag(isUnmappedQuery = FALSE)) # and skip records flagged as unmapped
38 scanned = scanBam(file = bam.file, index = bai.path, param = only.mapped)
39 read.counts = as.data.frame(table(scanned[[1]]\$rname)) # frequency table of reference names, as a two-column data frame
40 colnames(read.counts) = c("ref.sequence", "mapped.reads")
41 read.counts
42 }
43
44 GetRefSeqsLengths = function(bam.file){
45 # Lengths of the reference sequences declared in the header of bam.file, returned as a one-column data frame (length) whose row names are the sequence names.
46 require(Rsamtools)
47 targets = scanBamHeader(files = bam.file)[[1]]\$targets # header entry of the first (only) file; a vector named by reference sequence
48 data.frame(length = targets, row.names = names(targets))
49 }
50
51 GetIdxStats = function(bam.file) {
52 # Join the mapped-read counts of bam.file with the reference lengths from its header, one row per reference sequence.
53 merge(x = GetMappedReadsFreqs(bam.file = bam.file), # columns ref.sequence and mapped.reads
54 y = GetRefSeqsLengths(bam.file = bam.file), # column length, sequence names as row names
55 by.x = "ref.sequence", by.y = "row.names")
56 }
57
58 GetRPKMValues = function(bam.file) {
59 # RPKM of every reference sequence in bam.file, returned as a one-column data frame (RPKM) with the sequence names as row names.
60 idx = GetIdxStats(bam.file = bam.file)
61 per.billion = sum(idx\$mapped.reads) / 10^9 # total mapped reads scaled by 10^9
62 # Algebraically equal to reads / (length in kilobases * total mapped reads in millions).
63 rpkm = idx\$mapped.reads / (idx\$length * per.billion)
64 data.frame(RPKM = rpkm, row.names = idx\$ref.sequence)
65 }
66
67 bam.files = inf
68 # Abort with the list of missing paths before any BAM file is opened.
69 is.missing = !file.exists(bam.files)
70 if(any(is.missing)){
71 out.names = paste(bam.files[is.missing], collapse = "\n ")
72 stop(paste("One or more files does not exist:\n", out.names))
73 }
74 # Compute one RPKM column per BAM file; cbind joins the columns side by side.
75 rpkm.data = foreach(c.bam.file = bam.files, .combine = "cbind") %do% {
76 GetRPKMValues(bam.file = c.bam.file)
77 }
78 colnames(rpkm.data) = basename(bam.files) # basename is vectorized, so no sapply is needed; each column is named after its BAM file
79 out.data = cbind(ref.sequence = rownames(rpkm.data), rpkm.data) # row names become a regular first column so they get a header
80 write.table(out.data, file = outf, sep = "\t", quote = FALSE,
81 row.names = FALSE, col.names = TRUE)
82 </configfile>
83 </configfiles>
84
85
86 <tests>
87 <test>
88 <param name="input1" value="GetRPKMvalues_test1_input.xls" ftype="bam"/>
89 <param name="job_name" value="test1"/>
90 <param name="runMe" value="$runMe"/>
91 <output name="tab_file" file="GetRPKMvalues_test1_output.xls" ftype="tabular"/>
92 </test>
93 </tests>
94
95
96 <help>
97
98
99 This tool computes the RPKM values from a given BAM file.
100
101 This tool is an R script and requires the Rsamtools and foreach packages.
102
103 The output is a tabular file with two columns:
104 1. ref.sequence: which contains the names of reference sequences (genes/transcripts), and
105
106 2. dataset_#.dat: which refers to the name of the input dataset (the BAM file specified) and contains the RPKM values for each reference sequence. The symbol # represents a consecutive number automatically assigned by Galaxy.
107
108 **Script**
109 Pressing execute will run the following code over your input file and generate some outputs in your history::
110
111
112 #!/usr/bin/Rscript
113
114 # source("http://bioconductor.org/biocLite.R")
115 # biocLite("Rsamtools")
116
117 ourargs = commandArgs(trailingOnly = TRUE)
118 inf = ourargs[1]
119 outf = ourargs[2]
120
121
122 library(Rsamtools)
123 library(foreach)
124
125 GetMappedReadsFreqs = function(bam.file){
126 require(Rsamtools)
127 mapped.flag = scanBamFlag(isUnmappedQuery = FALSE) # Get all mapped reads
128 mapped.param = ScanBamParam(flag = mapped.flag, what = c("rname")) # Get only the reference name to which reads were mapped
129 bam.index.file = indexBam(bam.file)
130 mapped.scan.res = scanBam(param = mapped.param, file = bam.file, index = bam.index.file)
131 mapped = as.data.frame(table(mapped.scan.res[[1]]\$rname))
132 colnames(mapped) = c("ref.sequence", "mapped.reads")
133 mapped
134 }
135
136 GetRefSeqsLengths = function(bam.file){
137 require(Rsamtools)
138 header = scanBamHeader(files = bam.file)
139 ref.seqs.data = as.data.frame(header[[1]]\$targets)
140 data.frame(length = header[[1]]\$targets, row.names = names(header[[1]]\$targets))
141 }
142
143 GetIdxStats = function(bam.file) {
144 mapped = GetMappedReadsFreqs(bam.file = bam.file)
145 ref.seqs.lengths = GetRefSeqsLengths(bam.file = bam.file)
146 merge(x = mapped, y = ref.seqs.lengths,
147 by.x = "ref.sequence", by.y = "row.names")
148 }
149
150 GetRPKMValues = function(bam.file) {
151 bam.stats = GetIdxStats(bam.file = bam.file)
152 total.mapped = sum(bam.stats\$mapped.reads)
153 mapped.factor = total.mapped / 10^9
154
155 bam.stats\$RPKM = (bam.stats\$mapped.reads / (bam.stats\$length * mapped.factor))
156 data.frame(RPKM = bam.stats\$RPKM, row.names = bam.stats\$ref.sequence)
157 }
158
159 bam.files = inf
160
161 if(any(!file.exists(bam.files))){
162 not.found = bam.files[!file.exists(bam.files)]
163 out.names = paste(not.found, collapse = "\n ")
164 stop(paste("One or more files does not exist:\n", out.names))
165 } else {
166 rpkm.data = foreach(c.bam.file = bam.files, .combine = "cbind") %do% {
167 GetRPKMValues(bam.file = c.bam.file)
168 }
169 colnames(rpkm.data) = sapply(bam.files, basename)
170 out.data = cbind(ref.sequence = rownames(rpkm.data), rpkm.data)
171 write.table(out.data, quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE,
172 file = outf)
173 }
174
175 **Attribution**
176 This Galaxy tool was created by hfespitia@cenicana.org at 06/09/2014 23:41:54
177 using the Galaxy Tool Factory.
178
179 See https://bitbucket.org/fubar/galaxytoolfactory for details of that project
180 Please cite: Creating re-usable tools from scripts: The Galaxy Tool Factory. Ross Lazarus; Antony Kaspi; Mark Ziemann; The Galaxy Team.
181 Bioinformatics 2012; doi: 10.1093/bioinformatics/bts573
182
183
184 </help>
185 <citations>
186
187 <citation type="doi">10.1093/bioinformatics/bts573</citation>
188 </citations>
189 </tool>