1
|
1 ########################################################################
|
|
2 # JAMMv1.0.7rev1 is a peak finder for joint analysis of NGS replicates.
|
|
3 # Copyright (C) 2014-2015 Mahmoud Ibrahim
|
|
4 #
|
|
5 # This program is free software: you can redistribute it and/or modify
|
|
6 # it under the terms of the GNU General Public License as published by
|
|
7 # the Free Software Foundation, either version 3 of the License, or
|
|
8 # (at your option) any later version.
|
|
9 #
|
|
10 # This program is distributed in the hope that it will be useful,
|
|
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
13 # GNU General Public License for more details.
|
|
14 #
|
|
15 # You should have received a copy of the GNU General Public License
|
|
16 # along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
17 #
|
|
18 # Contact: mahmoud.ibrahim@mdc-berlin.de
|
|
19 ########################################################################
|
0
|
20
|
|
21
|
|
22 # =======================
|
|
23 # User-defined variables
|
|
24 # =======================
|
|
25 defaultBins = seq(50, 50*15, by = 50) # default binsize search space, used when fragment length is equal or less than read length
|
|
26 #=======================> DONE!
|
|
27
|
|
28
|
|
29
|
|
30
|
|
31 # ================================
|
|
32 # Required Libraries check & load
|
|
33 # ================================
|
|
34 if ((is.element('parallel', installed.packages()[,1]) == FALSE)) {
|
|
35 stop("R package'parallel' is required. Please install it!")
|
|
36 }
|
|
37 suppressPackageStartupMessages(library("parallel"))
|
|
38 #=======================> DONE!
|
|
39
|
|
40
|
|
41
|
|
42
|
|
43 # =================
|
|
44 # Custom Functions
|
|
45 # =================
|
|
46 #Implements the Shimazaki procedure
|
|
47 shimazaki = function(bedfile, rl, bins, maxIter, filelist, chromSize, type) {
|
|
48
|
|
49 #read in the file data
|
|
50 if (type == "single") {
|
|
51 reads = read.table(bedfile, header = FALSE)
|
|
52 reads = cbind(as.character(reads[[1]]), as.character(reads[[2]]))
|
|
53 }
|
|
54 if (type == "paired") {
|
|
55 #reads = sqldf("SELECT V1 As 'start', V2 As 'end' FROM readsFile", file.format=list(header = FALSE, sep = "\t"))
|
|
56 reads = read.table(bedfile, header = FALSE)
|
|
57 reads = cbind(as.numeric(reads[[1]]), as.numeric(reads[[2]]))
|
|
58 }
|
|
59 readnum = length(reads[[1]])
|
|
60 o = which(filelist == bedfile)
|
|
61 readlen = rl[o]
|
|
62 jack = o - 1
|
|
63 bins = bins[(1+(jack*15)):(15+(jack*15))]
|
|
64
|
|
65 costs = vector(mode = "numeric", length = length(bins))
|
|
66 #Shimazaki procedure
|
|
67 for (i in 1:length(bins)) {
|
|
68 #construct the counting breaks vector
|
|
69 genomevec = seq(0, chromSize, by = bins[i]);
|
|
70 if (max(genomevec) < chromSize) {
|
|
71 genomevec = append(genomevec, chromSize);
|
|
72 }
|
|
73
|
|
74 #create a vector of read counts
|
|
75 if (type == "single") {
|
|
76 ameirah = sort(c((as.numeric(reads[reads[,2] == "+",,drop = FALSE][,1])), ((as.numeric(reads[reads[,2] == "-",,drop = FALSE][,1])) + readlen - 1)))
|
|
77 }
|
|
78 if (type == "paired") {
|
|
79 ameirah = sort(c((reads[[1]]), (reads[[2]])))
|
|
80 }
|
|
81 ameirah = hist(ameirah, breaks = genomevec, plot = FALSE)
|
|
82 ameirah = ameirah$counts
|
|
83
|
|
84 #get cost function
|
|
85 m = mean(ameirah)
|
|
86 v = (sum((ameirah - m)^2)) / (length(ameirah))
|
|
87 num = ((2*m) - v)
|
|
88 den = ((bins[i]) * readnum)^2
|
|
89 cost = -(log(abs(num)) - log(den))
|
|
90 costs[i] = cost
|
|
91 }
|
|
92
|
|
93 index = which.min(costs)
|
|
94 finbin = bins[index]
|
|
95
|
|
96 return(finbin)
|
|
97 }
|
|
98 #=======================> DONE!
|
|
99
|
|
100
|
|
101
|
|
102
|
|
103 # ==========================
|
|
104 # Parse-in System Variables
|
|
105 # ==========================
|
|
106 args = commandArgs(trailingOnly = TRUE) # Read Arguments from command line
|
|
107
|
|
108
|
|
109 #Set arguments to default values
|
|
110 ibed = NA # input bed file
|
|
111 sFile = NA # chromosome size
|
|
112 storeFile = NA # file to store result
|
|
113 cornum = 1 # number of processors to use
|
|
114 rl = NA # read length
|
|
115 frags = NA # fragment lengths
|
|
116 bins = NA
|
|
117
|
|
118 #Parsing arguments and storing values
|
|
119 for (each.arg in args) {
|
|
120 if (grepl('^-ibed=',each.arg)) {
|
|
121 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
122 if (! is.na(arg.split[2]) ) {
|
|
123 ibed <- arg.split[2]
|
|
124 } else {
|
|
125 stop('No input bed file')
|
|
126 }
|
|
127 }
|
|
128 if (grepl('^-s=',each.arg)) {
|
|
129 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
130 if (! is.na(arg.split[2]) ) {
|
|
131 sFile <- arg.split[2]
|
|
132 } else {
|
|
133 stop('No chromosome size file')
|
|
134 }
|
|
135 }
|
|
136 if (grepl('^-rl=',each.arg)) {
|
|
137 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
138 if (! is.na(arg.split[2]) ) {
|
|
139 rl <- arg.split[2]
|
|
140 }
|
|
141 }
|
|
142 if (grepl('^-d=',each.arg)) {
|
|
143 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
144 if (! is.na(arg.split[2]) ) {
|
|
145 storeFile <- arg.split[2]
|
|
146 } else {
|
|
147 stop('No file to store result')
|
|
148 }
|
|
149 }
|
|
150 if (grepl('^-p=',each.arg)) {
|
|
151 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
152 if (! is.na(arg.split[2]) ) {
|
|
153 cornum <- as.numeric(arg.split[2])
|
|
154 } else {
|
|
155 stop('No number of cores given')
|
|
156 }
|
|
157 }
|
|
158 if (grepl('^-f=',each.arg)) {
|
|
159 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
160 if (! is.na(arg.split[2]) ) {
|
|
161 frags <- arg.split[2]
|
|
162 } else {
|
|
163 stop('No Fragment lengths given')
|
|
164 }
|
|
165 }
|
|
166 if (grepl('^-type=',each.arg)) {
|
|
167 arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
|
|
168 if (! is.na(arg.split[2]) ) {
|
|
169 type <- arg.split[2]
|
|
170 } else {
|
|
171 stop('No type given')
|
|
172 }
|
|
173 }
|
|
174 }
|
|
175
|
|
176 #Read in variables
|
|
177 chromosomes = read.table(sFile, header=FALSE)
|
|
178 chromSize = as.numeric(chromosomes$V2) #chromosome size
|
|
179 chromSize = max(chromSize) #get maximum chrom size
|
|
180 rm(chromosomes)
|
|
181
|
|
182 ibed = strsplit(ibed, ",", fixed = TRUE)[[1]]
|
|
183 rl = as.numeric(strsplit(rl, ",", fixed = TRUE)[[1]])
|
|
184 frags = as.numeric(strsplit(frags, ",", fixed = TRUE)[[1]])
|
|
185 #=======================> DONE!
|
|
186
|
|
187
|
|
188
|
1
|
189 # ===================================================
|
|
190 # Shimazaki Procedure (Shimazaki and Shinomoto 2007)
|
|
191 # ===================================================
|
0
|
192 for (i in 1:length(ibed)) {
|
|
193 if (frags[i] > rl[i]) {
|
|
194 minbin = floor(frags[i] / 2)
|
|
195 bins = c(bins, seq(minbin, minbin*15, by = minbin))
|
|
196 } else {
|
|
197 bins = c(bins, defaultBins)
|
|
198 }
|
|
199 }
|
|
200 bins = bins[!is.na(bins)]
|
|
201 bins = mclapply(ibed, shimazaki, rl, bins, maxIter, ibed, chromSize, type = type, mc.cores = cornum)
|
|
202 bins = min(unlist(bins))
|
|
203 #=======================> DONE!
|
|
204
|
|
205
|
|
206 # ==================
|
|
207 # Write Information
|
|
208 # ==================
|
|
209 write(paste0(bins), file = paste0(storeFile, "/binsize.txt"))
|
|
210 message(bins)
|
|
211 #=======================> DONE!
|