|
0
|
1 <tool id="GetRPKMvalues" name="GetRPKMvalues" version="0.01">
|
|
|
2 <description> from BAM file</description>
|
|
|
3
|
|
|
4 <command interpreter="python">
|
|
|
5
|
|
|
6 GetRPKMvalues.py --script_path "$runMe" --interpreter "Rscript"
|
|
|
7 --tool_name "GetRPKMvalues" --input_tab "$input1" --output_dir "./" --output_tab "$tab_file"
|
|
|
8 </command>
|
|
|
9 <inputs>
|
|
|
10 <param name="input1" type="data" format="bam" label="Select a suitable input file from your history"/>
|
|
|
11 <param name="job_name" type="text" label="Supply a name for the outputs to remind you what they contain" value="GetRPKMvalues"/>
|
|
|
12
|
|
|
13 </inputs>
|
|
|
14 <outputs>
|
|
|
15 <data format="tabular" name="tab_file" label="${job_name}"/>
|
|
|
16
|
|
|
17 </outputs>
|
|
|
18 <configfiles>
|
|
|
19 <configfile name="runMe">
|
|
|
20 #!/usr/bin/Rscript
|
|
|
21
|
|
|
22 # source("http://bioconductor.org/biocLite.R")
|
|
|
23 # biocLite("Rsamtools")
|
|
|
24
|
|
|
# Positional command-line arguments read by this script:
#   1. path of the input BAM file (stored in inf)
#   2. path of the tabular output file (stored in outf)
ourargs = commandArgs(trailingOnly = TRUE)
# Fail early with a clear message when an argument is missing; otherwise
# inf or outf would silently become NA and the script would only fail later,
# possibly after all the BAM processing has been done.
# The test is written with >= so that no less-than sign appears in this
# XML-embedded script.
if (!(length(ourargs) >= 2)) {
  stop("Expected two arguments: input BAM path and output table path",
       call. = FALSE)
}
inf = ourargs[1]
outf = ourargs[2]
|
|
|
28
|
|
|
29
|
|
|
30 library(Rsamtools)
|
|
|
31 library(foreach)
|
|
|
32
|
|
|
# Count the mapped reads per reference sequence in a BAM file.
# Args:
#   bam.file: path of the BAM file to scan.
# Returns:
#   A data frame with two columns, "ref.sequence" (the reference name) and
#   "mapped.reads" (the number of scanned records carrying that name).
# Notes:
#   Rsamtools functions are called through their namespace instead of using
#   require(), so a missing package raises an error at the first call.
#   Double-bracket extraction is used so that no dollar sign (which has to be
#   escaped inside the Galaxy configfile template) is needed.
GetMappedReadsFreqs = function(bam.file){
  # Select only the records whose "unmapped query" flag is not set.
  mapped.flag = Rsamtools::scanBamFlag(isUnmappedQuery = FALSE)
  # Read only the reference name each record was mapped to.
  mapped.param = Rsamtools::ScanBamParam(flag = mapped.flag, what = c("rname"))
  # NOTE(review): indexBam() is expected to create an index for bam.file;
  # confirm that its destination is writable where this tool runs.
  bam.index.file = Rsamtools::indexBam(bam.file)
  mapped.scan.res = Rsamtools::scanBam(param = mapped.param, file = bam.file,
                                       index = bam.index.file)
  # Tabulate the reference names of the first (only) scan result.
  mapped = as.data.frame(table(mapped.scan.res[[1]][["rname"]]))
  colnames(mapped) = c("ref.sequence", "mapped.reads")
  mapped
}
|
|
|
43
|
|
|
# Read the reference sequence lengths from the header of a BAM file.
# Args:
#   bam.file: path of the BAM file whose header is read.
# Returns:
#   A data frame with a single column "length"; its row names are the names
#   of the "targets" entry of the first header returned by scanBamHeader().
# Notes:
#   The previously computed but never used intermediate data frame was
#   removed, and Rsamtools is reached through its namespace instead of
#   require().
GetRefSeqsLengths = function(bam.file){
  header = Rsamtools::scanBamHeader(files = bam.file)
  targets = header[[1]][["targets"]]
  data.frame(length = targets, row.names = names(targets))
}
|
|
|
50
|
|
|
# Combine per-reference mapped read counts with reference lengths.
# Args:
#   bam.file: path of the BAM file to summarise.
# Returns:
#   The merge of the GetMappedReadsFreqs() and GetRefSeqsLengths() results,
#   joined on the reference name ("ref.sequence" on the counts side, the row
#   names on the lengths side).
GetIdxStats = function(bam.file) {
  read.counts = GetMappedReadsFreqs(bam.file = bam.file)
  seq.lengths = GetRefSeqsLengths(bam.file = bam.file)
  merge(x = read.counts, y = seq.lengths, by.x = "ref.sequence", by.y = "row.names")
}
|
|
|
57
|
|
|
# Compute one RPKM value per reference sequence of a BAM file.
# Args:
#   bam.file: path of the BAM file to process.
# Returns:
#   A data frame with a single column "RPKM" whose row names are the
#   reference sequence names returned by GetIdxStats().
GetRPKMValues = function(bam.file) {
  idx.stats = GetIdxStats(bam.file = bam.file)
  read.counts = idx.stats[["mapped.reads"]]
  # Total mapped reads divided by 10^9: this single factor folds together the
  # "per million reads" and "per kilobase" scalings (assumes the lengths are
  # expressed in bases; confirm against the BAM header).
  scale.factor = sum(read.counts) / 10^9
  rpkm = read.counts / (idx.stats[["length"]] * scale.factor)
  data.frame(RPKM = rpkm, row.names = idx.stats[["ref.sequence"]])
}
|
|
|
66
|
|
|
# Driver: compute the RPKM values of every input BAM file and write them,
# next to the reference sequence names, as a tab-separated table to outf.
bam.files = inf

if (all(file.exists(bam.files))) {
  # One RPKM column per BAM file, bound side by side.
  rpkm.data = foreach(c.bam.file = bam.files, .combine = "cbind") %do% {
    GetRPKMValues(bam.file = c.bam.file)
  }
  # Label each column with the file name of the BAM it came from.
  colnames(rpkm.data) = basename(bam.files)
  # Row names are not written, so expose them as a regular first column.
  out.data = cbind(ref.sequence = rownames(rpkm.data), rpkm.data)
  write.table(out.data, file = outf, sep = "\t", quote = FALSE,
              row.names = FALSE, col.names = TRUE)
} else {
  # Report every missing input path in a single error.
  not.found = bam.files[!file.exists(bam.files)]
  out.names = paste(not.found, collapse = "\n ")
  stop(paste("One or more files does not exist:\n", out.names))
}
|
|
|
82 </configfile>
|
|
|
83 </configfiles>
|
|
|
84
|
|
|
85
|
|
|
86 <tests>
|
|
|
87 <test>
|
|
|
88 <param name="input1" value="GetRPKMvalues_test1_input.xls" ftype="bam"/>
|
|
|
89 <param name="job_name" value="test1"/>
|
|
|
90 <param name="runMe" value="$runMe"/>
|
|
|
91 <output name="tab_file" file="GetRPKMvalues_test1_output.xls" ftype="tabular"/>
|
|
|
92 </test>
|
|
|
93 </tests>
|
|
|
94
|
|
|
95
|
|
|
96 <help>
|
|
|
97
|
|
|
98
|
|
|
99 This tool computes the RPKM values from a given BAM file.
|
|
|
100
|
|
|
101 This tool is an R script and requires the Rsamtools and foreach packages.
|
|
|
102
|
|
|
103 The output is a tabular file with two columns:
|
|
|
104 1. ref.sequence: which contains the names of reference sequences (genes/transcripts), and
|
|
|
105
|
|
|
106 2. dataset_#.dat: which refers to the name of the input dataset (the BAM file specified) and contains the RPKM values for each reference sequence. The symbol # represents a consecutive number automatically assigned by Galaxy.
|
|
|
107
|
|
|
108 **Script**
|
|
|
109 Pressing execute will run the following code over your input file and generate some outputs in your history::
|
|
|
110
|
|
|
111
|
|
|
112 #!/usr/bin/Rscript
|
|
|
113
|
|
|
114 # source("http://bioconductor.org/biocLite.R")
|
|
|
115 # biocLite("Rsamtools")
|
|
|
116
|
|
|
117 ourargs = commandArgs(trailingOnly = TRUE)
|
|
|
118 inf = ourargs[1]
|
|
|
119 outf = ourargs[2]
|
|
|
120
|
|
|
121
|
|
|
122 library(Rsamtools)
|
|
|
123 library(foreach)
|
|
|
124
|
|
|
125 GetMappedReadsFreqs = function(bam.file){
|
|
|
126 require(Rsamtools)
|
|
|
127 mapped.flag = scanBamFlag(isUnmappedQuery = FALSE) # Get all mapped reads
|
|
|
128 mapped.param = ScanBamParam(flag = mapped.flag, what = c("rname")) # Get only the reference name to which reads were mapped
|
|
|
129 bam.index.file = indexBam(bam.file)
|
|
|
130 mapped.scan.res = scanBam(param = mapped.param, file = bam.file, index = bam.index.file)
|
|
|
131 mapped = as.data.frame(table(mapped.scan.res[[1]]\$rname))
|
|
|
132 colnames(mapped) = c("ref.sequence", "mapped.reads")
|
|
|
133 mapped
|
|
|
134 }
|
|
|
135
|
|
|
136 GetRefSeqsLengths = function(bam.file){
|
|
|
137 require(Rsamtools)
|
|
|
138 header = scanBamHeader(files = bam.file)
|
|
|
139 ref.seqs.data = as.data.frame(header[[1]]\$targets)
|
|
|
140 data.frame(length = header[[1]]\$targets, row.names = names(header[[1]]\$targets))
|
|
|
141 }
|
|
|
142
|
|
|
143 GetIdxStats = function(bam.file) {
|
|
|
144 mapped = GetMappedReadsFreqs(bam.file = bam.file)
|
|
|
145 ref.seqs.lengths = GetRefSeqsLengths(bam.file = bam.file)
|
|
|
146 merge(x = mapped, y = ref.seqs.lengths,
|
|
|
147 by.x = "ref.sequence", by.y = "row.names")
|
|
|
148 }
|
|
|
149
|
|
|
150 GetRPKMValues = function(bam.file) {
|
|
|
151 bam.stats = GetIdxStats(bam.file = bam.file)
|
|
|
152 total.mapped = sum(bam.stats\$mapped.reads)
|
|
|
153 mapped.factor = total.mapped / 10^9
|
|
|
154
|
|
|
155 bam.stats\$RPKM = (bam.stats\$mapped.reads / (bam.stats\$length * mapped.factor))
|
|
|
156 data.frame(RPKM = bam.stats\$RPKM, row.names = bam.stats\$ref.sequence)
|
|
|
157 }
|
|
|
158
|
|
|
159 bam.files = inf
|
|
|
160
|
|
|
161 if(any(!file.exists(bam.files))){
|
|
|
162 not.found = bam.files[!file.exists(bam.files)]
|
|
|
163 out.names = paste(not.found, collapse = "\n ")
|
|
|
164 stop(paste("One or more files does not exist:\n", out.names))
|
|
|
165 } else {
|
|
|
166 rpkm.data = foreach(c.bam.file = bam.files, .combine = "cbind") %do% {
|
|
|
167 GetRPKMValues(bam.file = c.bam.file)
|
|
|
168 }
|
|
|
169 colnames(rpkm.data) = sapply(bam.files, basename)
|
|
|
170 out.data = cbind(ref.sequence = rownames(rpkm.data), rpkm.data)
|
|
|
171 write.table(out.data, quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE,
|
|
|
172 file = outf)
|
|
|
173 }
|
|
|
174
|
|
|
175 **Attribution**
|
|
|
176 This Galaxy tool was created by hfespitia@cenicana.org at 06/09/2014 23:41:54
|
|
|
177 using the Galaxy Tool Factory.
|
|
|
178
|
|
|
179 See https://bitbucket.org/fubar/galaxytoolfactory for details of that project
|
|
|
180 Please cite: Creating re-usable tools from scripts: The Galaxy Tool Factory. Ross Lazarus; Antony Kaspi; Mark Ziemann; The Galaxy Team.
|
|
|
181 Bioinformatics 2012; doi: 10.1093/bioinformatics/bts573
|
|
|
182
|
|
|
183
|
|
|
184 </help>
|
|
|
185 <citations>
|
|
|
186
|
|
|
187 <citation type="doi">10.1093/bioinformatics/bts573</citation>
|
|
|
188 </citations>
|
|
|
189 </tool>
|