annotate bincalculator.r @ 1:243f75d0ed6e draft default tip

Uploaded. Includes new release 1.0.7 with fixed optional controls.
author messersc
date Thu, 19 Feb 2015 05:39:45 -0500
parents d42f4d78c85e
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
243f75d0ed6e Uploaded.
messersc
parents: 0
diff changeset
1 ########################################################################
243f75d0ed6e Uploaded.
messersc
parents: 0
diff changeset
2 # JAMMv1.0.7rev1 is a peak finder for joint analysis of NGS replicates.
243f75d0ed6e Uploaded.
messersc
parents: 0
diff changeset
3 # Copyright (C) 2014-2015 Mahmoud Ibrahim
243f75d0ed6e Uploaded.
messersc
parents: 0
diff changeset
4 #
243f75d0ed6e Uploaded.
messersc
parents: 0
diff changeset
5 # This program is free software: you can redistribute it and/or modify
243f75d0ed6e Uploaded.
messersc
parents: 0
diff changeset
6 # it under the terms of the GNU General Public License as published by
243f75d0ed6e Uploaded.
messersc
parents: 0
diff changeset
7 # the Free Software Foundation, either version 3 of the License, or
243f75d0ed6e Uploaded.
messersc
parents: 0
diff changeset
8 # (at your option) any later version.
243f75d0ed6e Uploaded.
messersc
parents: 0
diff changeset
9 #
243f75d0ed6e Uploaded.
messersc
parents: 0
diff changeset
10 # This program is distributed in the hope that it will be useful,
243f75d0ed6e Uploaded.
messersc
parents: 0
diff changeset
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
243f75d0ed6e Uploaded.
messersc
parents: 0
diff changeset
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
243f75d0ed6e Uploaded.
messersc
parents: 0
diff changeset
13 # GNU General Public License for more details.
243f75d0ed6e Uploaded.
messersc
parents: 0
diff changeset
14 #
243f75d0ed6e Uploaded.
messersc
parents: 0
diff changeset
15 # You should have received a copy of the GNU General Public License
243f75d0ed6e Uploaded.
messersc
parents: 0
diff changeset
16 # along with this program. If not, see <http://www.gnu.org/licenses/>.
243f75d0ed6e Uploaded.
messersc
parents: 0
diff changeset
17 #
243f75d0ed6e Uploaded.
messersc
parents: 0
diff changeset
18 # Contact: mahmoud.ibrahim@mdc-berlin.de
243f75d0ed6e Uploaded.
messersc
parents: 0
diff changeset
19 ########################################################################
0
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
20
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
21
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
22 # =======================
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
23 # User-defined variables
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
24 # =======================
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
25 defaultBins = seq(50, 50*15, by = 50) # default binsize search space, used when fragment length is equal or less than read length
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
26 #=======================> DONE!
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
27
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
28
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
29
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
30
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
31 # ================================
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
32 # Required Libraries check & load
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
33 # ================================
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
34 if ((is.element('parallel', installed.packages()[,1]) == FALSE)) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
35 stop("R package'parallel' is required. Please install it!")
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
36 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
37 suppressPackageStartupMessages(library("parallel"))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
38 #=======================> DONE!
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
39
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
40
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
41
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
42
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
43 # =================
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
44 # Custom Functions
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
45 # =================
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
46 #Implements the Shimazaki procedure
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
47 shimazaki = function(bedfile, rl, bins, maxIter, filelist, chromSize, type) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
48
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
49 #read in the file data
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
50 if (type == "single") {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
51 reads = read.table(bedfile, header = FALSE)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
52 reads = cbind(as.character(reads[[1]]), as.character(reads[[2]]))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
53 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
54 if (type == "paired") {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
55 #reads = sqldf("SELECT V1 As 'start', V2 As 'end' FROM readsFile", file.format=list(header = FALSE, sep = "\t"))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
56 reads = read.table(bedfile, header = FALSE)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
57 reads = cbind(as.numeric(reads[[1]]), as.numeric(reads[[2]]))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
58 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
59 readnum = length(reads[[1]])
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
60 o = which(filelist == bedfile)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
61 readlen = rl[o]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
62 jack = o - 1
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
63 bins = bins[(1+(jack*15)):(15+(jack*15))]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
64
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
65 costs = vector(mode = "numeric", length = length(bins))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
66 #Shimazaki procedure
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
67 for (i in 1:length(bins)) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
68 #construct the counting breaks vector
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
69 genomevec = seq(0, chromSize, by = bins[i]);
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
70 if (max(genomevec) < chromSize) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
71 genomevec = append(genomevec, chromSize);
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
72 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
73
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
74 #create a vector of read counts
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
75 if (type == "single") {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
76 ameirah = sort(c((as.numeric(reads[reads[,2] == "+",,drop = FALSE][,1])), ((as.numeric(reads[reads[,2] == "-",,drop = FALSE][,1])) + readlen - 1)))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
77 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
78 if (type == "paired") {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
79 ameirah = sort(c((reads[[1]]), (reads[[2]])))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
80 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
81 ameirah = hist(ameirah, breaks = genomevec, plot = FALSE)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
82 ameirah = ameirah$counts
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
83
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
84 #get cost function
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
85 m = mean(ameirah)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
86 v = (sum((ameirah - m)^2)) / (length(ameirah))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
87 num = ((2*m) - v)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
88 den = ((bins[i]) * readnum)^2
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
89 cost = -(log(abs(num)) - log(den))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
90 costs[i] = cost
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
91 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
92
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
93 index = which.min(costs)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
94 finbin = bins[index]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
95
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
96 return(finbin)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
97 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
98 #=======================> DONE!
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
99
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
100
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
101
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
102
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
103 # ==========================
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
104 # Parse-in System Variables
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
105 # ==========================
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
106 args = commandArgs(trailingOnly = TRUE) # Read Arguments from command line
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
107
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
108
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
109 #Set arguments to default values
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
110 ibed = NA # input bed file
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
111 sFile = NA # chromosome size
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
112 storeFile = NA # file to store result
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
113 cornum = 1 # number of processors to use
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
114 rl = NA # read length
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
115 frags = NA # fragment lengths
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
116 bins = NA
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
117
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
118 #Parsing arguments and storing values
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
119 for (each.arg in args) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
120 if (grepl('^-ibed=',each.arg)) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
121 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
122 if (! is.na(arg.split[2]) ) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
123 ibed <- arg.split[2]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
124 } else {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
125 stop('No input bed file')
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
126 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
127 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
128 if (grepl('^-s=',each.arg)) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
129 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
130 if (! is.na(arg.split[2]) ) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
131 sFile <- arg.split[2]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
132 } else {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
133 stop('No chromosome size file')
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
134 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
135 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
136 if (grepl('^-rl=',each.arg)) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
137 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
138 if (! is.na(arg.split[2]) ) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
139 rl <- arg.split[2]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
140 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
141 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
142 if (grepl('^-d=',each.arg)) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
143 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
144 if (! is.na(arg.split[2]) ) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
145 storeFile <- arg.split[2]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
146 } else {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
147 stop('No file to store result')
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
148 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
149 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
150 if (grepl('^-p=',each.arg)) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
151 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
152 if (! is.na(arg.split[2]) ) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
153 cornum <- as.numeric(arg.split[2])
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
154 } else {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
155 stop('No number of cores given')
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
156 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
157 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
158 if (grepl('^-f=',each.arg)) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
159 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
160 if (! is.na(arg.split[2]) ) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
161 frags <- arg.split[2]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
162 } else {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
163 stop('No Fragment lengths given')
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
164 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
165 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
166 if (grepl('^-type=',each.arg)) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
167 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
168 if (! is.na(arg.split[2]) ) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
169 type <- arg.split[2]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
170 } else {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
171 stop('No type given')
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
172 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
173 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
174 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
175
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
176 #Read in variables
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
177 chromosomes = read.table(sFile, header=FALSE)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
178 chromSize = as.numeric(chromosomes$V2) #chromosome size
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
179 chromSize = max(chromSize) #get maximum chrom size
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
180 rm(chromosomes)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
181
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
182 ibed = strsplit(ibed, ",", fixed = TRUE)[[1]]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
183 rl = as.numeric(strsplit(rl, ",", fixed = TRUE)[[1]])
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
184 frags = as.numeric(strsplit(frags, ",", fixed = TRUE)[[1]])
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
185 #=======================> DONE!
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
186
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
187
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
188
1
243f75d0ed6e Uploaded.
messersc
parents: 0
diff changeset
189 # ===================================================
243f75d0ed6e Uploaded.
messersc
parents: 0
diff changeset
190 # Shimazaki Procedure (Shimazaki and Shinomoto 2007)
243f75d0ed6e Uploaded.
messersc
parents: 0
diff changeset
191 # ===================================================
0
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
192 for (i in 1:length(ibed)) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
193 if (frags[i] > rl[i]) {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
194 minbin = floor(frags[i] / 2)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
195 bins = c(bins, seq(minbin, minbin*15, by = minbin))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
196 } else {
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
197 bins = c(bins, defaultBins)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
198 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
199 }
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
200 bins = bins[!is.na(bins)]
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
201 bins = mclapply(ibed, shimazaki, rl, bins, maxIter, ibed, chromSize, type = type, mc.cores = cornum)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
202 bins = min(unlist(bins))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
203 #=======================> DONE!
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
204
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
205
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
206 # ==================
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
207 # Write Information
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
208 # ==================
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
209 write(paste0(bins), file = paste0(storeFile, "/binsize.txt"))
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
210 message(bins)
d42f4d78c85e Uploaded
messersc
parents:
diff changeset
211 #=======================> DONE!