jamm: peakfinder.r comparison

comparison peakfinder.r @ 1:243f75d0ed6e draft default tip

Uploaded. Includes new release 1.0.7 with fixed optional controls.

author	messersc
date	Thu, 19 Feb 2015 05:39:45 -0500
parents	d42f4d78c85e
children

comparison

equal deleted inserted replaced

-:d42f4d78c85e
+:243f75d0ed6e
-###############################
+########################################################################
-#### Peak finder for NGS Data
+# JAMMv1.0.7rev1 is a peak finder for joint analysis of NGS replicates.
-#### R script
+# Copyright (C) 2014-2015  Mahmoud Ibrahim
-###############################
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# Contact: mahmoud.ibrahim@mdc-berlin.de
+########################################################################
 # =======================
 # User-defined variables
 # =======================
-cutoff = NA #To enforce an SNR cutoff ratio, delete NA and enter the number you want
+samplingSeed = 1011414 #the seed makes the random sampling that JAMM does "deterministic". You can change this to your integer seed of choice to reproduce exact same results or randomize it this way: ceiling(rnorm(1,50,300)).
-strict = 1 #to make peak calling more / less strict, increase / decrease this number
+reportNoClust = "n" #report windows for which clustering failed? Reported windows will be marked by "NoClust" in the peak name (4th column).
-meanAdjust = "n" #adjust the initialization mean vector for each window before clustering? If you want to do so, change this to "y".
+cutoff = NA #To enforce an SNR cutoff ratio for bin enrichment calling, delete NA and enter the number you want.
-options(warn = -1, scipen = 1000) #R will not report any warnings (warn = -1), R will not use scientific notation (scipen = 1000)
+strict = 1 #To make bin enrichment calling more / less strict, increase / decrease this number.
+meanAdjust = "n" #Adjust the initialization mean vector for each window before clustering? If you want to do so, change this to "y".
+options(warn = -1, scipen = 1000) #R will not report any warnings (warn = -1), R will not use scientific notation (scipen = 1000).
 #=======================> DONE!
 		} else {
 			write(paste(mCount), file = paste0(out, "/norma.", o, ".info"), append = TRUE)
 		}
 		counts = counts/mCount
 	}
 	return(counts)
 }
 		} else {
 			write(paste(mCount), file = paste0(out, "/norma.", o, ".info"), append = TRUE)
 		}
 		counts = counts/mCount
 	}
 	return(counts)
 }
 	#extract subset of the background
 	if (bkgd != "None") {
 		Cs = counts[[numdup+1]][winStart:tempend]
 		mCs = mean(Cs)
 		dCs = sd(Cs)
-	} else {
+	}
-		Cs = sample(C, binSizeTemp, replace = TRUE)
-	}
 	go = rep(0, numdup)
 	for (g in 1:numdup) {
 		mS = (mean(counts[[g]][winStart:tempend]))
 		ratio = mS/dCs
 		if ((mS > (mCs * strict)) && (ratio > cutoff)) {
 		Cs = counts[[numdup+1]]
 		Cmin = min(Cs[Cs > 0])
 		Cs = Cs[winStart:winEnd]
 		Cs = filtfilt(rep(1,80)/80,1,Cs) + Cmin #gets rid of Inf in the fold change
 	} else {
+		set.seed(samplingSeed)
 		Cs = sample(C, rWinSizeTemp, replace = TRUE)
 		Cs = filtfilt(rep(1,80)/80,1,Cs)
 	}
 	#start scoring
 		Rs[,j] = filtfilt(rep(1,80)/80,1,Rs[,j])
 	}
 	return(Rs)
 }
 cluster = function(model, sig, init, clustnummer, noise) { #helper function2
+	set.seed(samplingSeed)
 	noisy = sample(noise, length(sig[,1]), replace = TRUE)
 	clust = me(model, sig+noisy, init)
 	bicc =  bic(model, clust$loglik, length(sig[,1]), length(sig[1,]), clustnummer)
 	out = list(bicc = bicc, param = clust$parameters)
 	return(out)
 		sig = lapply(coffeeshopNord, smoothcounts, coffeeshopSud, numdup, counts, startlist = coffeeshopNord)
 	}
 	sig = do.call(rbind, sig)
 	#kmeans initialization
-	init = kmeans(sig, clustnummer)
+	set.seed(samplingSeed)
+	init = kmeans(sig, clustnummer, nstart = 20)
 	init = unmap(init$cluster)
 	if (cornum > 1) {
 		param = mclapply(modelnames, cluster, sig, init, clustnummer, noise, mc.cores = cornum, mc.preschedule = TRUE)
 #find peaks
-findpeak = function(winStart, coffeeshopSud, numdup, C, param, bkgd, resol, counts, noise, startlist, meanAdjust) {
+findpeak = function(winStart, coffeeshopSud, numdup, C, param, bkgd, resol, counts, noise, startlist, meanAdjust, clustnummer) {
 	plz = which(startlist == winStart)
 	winEnd = coffeeshopSud[plz]
 	#will store peak information
 	writethis = list()
+	ccx = 1 #default is clustering didNOT work
 	rWinSizeTemp = winEnd - winStart + 1
 	#extract subset of the IP
 			Rs[,j] = filtfilt(rep(1,80)/80,1,Rsr[,j])
 		}
 	}
 	if (resol != "window") {
-		####CLUSTERING START HERE#####
+		#clustering (take 1)
+		take = 1
+		set.seed(samplingSeed)
 		noisy = sample(noise, rWinSizeTemp, replace = TRUE)
 		clust = em(param$modelname, Rs+noisy, param$initparam)
 		clust$classification = map(clust$z)
-		####CLUSTERING DONE####
+		if (!((any(diff(clust$classification)) != 0) && (!(any(is.na(clust$classification)))))) { #clustering didn't work, take1
-		#check whether clutering succeeded
+			#repeat clustering from scratch, take 2!
-		if((any(diff(clust$classification)) != 0) && (!(any(is.na(clust$classification))))) {
+			set.seed(samplingSeed)
+			init = kmeans(Rs, clustnummer, nstart = 20)
+			init = unmap(init$cluster)
-			#check whether all components replicates agreed on clustering assignments
+			set.seed(samplingSeed)
-			ccx = 1
+			noisy = sample(noise, rWinSizeTemp, replace = TRUE)
-			if (numdup > 1) {
+			clust = me(param$modelname, Rs+noisy, init)
+			clust$classification = map(clust$z)
+			if ((any(diff(clust$classification)) != 0) && (!(any(is.na(clust$classification))))) {
+				ccx = 0 #clustering worked, take2
+				take = 2
+			}
+		} else {ccx = 0}  #clustering worked, take1
+		if (ccx != 1) { #clustering worked either in take1 or take2
+			if (numdup > 1) { #check whether all components replicates agreed on clustering assignments
 				cc = vector(mode = "numeric", length = numdup)
 				for (g in 1:numdup) {
 					cc[g] = which.max(clust$parameters$mean[g,]) #which cluster has the largest mean (this is the peak cluster, hopefully!)
 				}
 				ccx = sum(diff(cc))
 				cluster = cc[1]
 				rm(cc)
+				if ((ccx != 0) && (take == 1)) { #not all replicates agreed? Repeat the clustering with from scratch if not already done!
+					set.seed(samplingSeed)
+					init = kmeans(Rs, clustnummer, nstart = 20)
+					init = unmap(init$cluster)
+					set.seed(samplingSeed)
+					noisy = sample(noise, rWinSizeTemp, replace = TRUE)
+					clust = me(param$modelname, Rs+noisy, init)
+					clust$classification = map(clust$z)
+					if ((any(diff(clust$classification)) != 0) && (!(any(is.na(clust$classification))))) { #clustering worked? check whether replicates agreed take 3
+						cc = vector(mode = "numeric", length = numdup)
+						for (g in 1:numdup) {
+							cc[g] = which.max(clust$parameters$mean[g,]) #which cluster has the largest mean (this is the peak cluster, hopefully!)
+						}
+						ccx = sum(diff(cc))
+						cluster = cc[1]
+						rm(cc)
+						take = 3
+					}
+				}
+			} else { #no replicates!
+				cluster = which.max(clust$parameters$mean) #which cluster has the largest mean (this is the peak cluster, hopefully!)
+			}
+		}
+		if ((ccx != 0) && (reportNoClust=="y")) { resol = "window" } #clustering did not work and windows should be reported
+		if (ccx == 0) { #clustering worked and all replicates agree on the cluster assignments
+			#extract subset of the background
+			if (bkgd != "None") {
+				Cs = counts[[numdup+1]][winStart:winEnd]
+				Cs = filtfilt(rep(1,80)/80,1,Cs)
 			} else {
-				cluster = which.max(clust$parameters$mean) #which cluster has the largest mean (this is the peak cluster, hopefully!)
+				set.seed(samplingSeed)
-				ccx = 0
+				Cs = sample(C, rWinSizeTemp, replace = TRUE)
+				Cs = filtfilt(rep(1,80)/80,1,Cs)
 			}
-			if (ccx == 0) {
-				#extract subset of the background
+			#find region boundaries
-				if (bkgd != "None") {
+			loc = 1:length(clust$classification)
-					Cs = counts[[numdup+1]][winStart:winEnd]
+			gmclass = cbind(loc, clust$classification)
-					Cs = filtfilt(rep(1,80)/80,1,Cs)
+			locPeak = gmclass[gmclass[,2] == cluster,,drop=FALSE]
-				} else {
+			rStart = locPeak[1] #start position of the region
-					Cs = sample(C, rWinSizeTemp, replace = TRUE)
+			rEnd = locPeak[length(locPeak[,1]),1] #end position of the region
-					Cs = filtfilt(rep(1,80)/80,1,Cs)
+			#peak resolution check
+			if (resol == "region") {
+				pSize = rEnd - rStart
+				signal = (geomean(Rs[rStart:rEnd,]))
+				signal2 = (signal) - (Cs[rStart:rEnd])
+				gm = mean(signal2)
+				summit = which.max(geomean(Rsr[rStart:rEnd,])) - 1
+				will2k = wilcox.test(signal, Cs[rStart:rEnd])
+				#Is there signal in the region above background
+				if (gm > 0) {
+					writethis[[1]] = rStart + winStart - 1
+					writethis[[2]] = rEnd + winStart
+					writethis[[3]] = paste0(chromName, ".", rStart+winStart -1)
+					writethis[[4]] = "1000"
+					writethis[[5]] = "."
+					writethis[[6]] = gm
+					writethis[[7]] = will2k$p.value
+					writethis[[8]] = "-1"
+					writethis[[9]] = summit
 				}
+			} else if (resol == "peak") {
-				#find region boundaries
+				#find out where separate peaks are
-				loc = 1:length(clust$classification)
+				d = diff(locPeak[,1])
-				gmclass = cbind(loc, clust$classification)
+				d[length(d)+1] = 0
-				locPeak = gmclass[gmclass[,2] == cluster,,drop=FALSE]
+				locPeak = cbind(locPeak, d)
-				rStart = locPeak[1] #start position of the region
+				bound1 = which(locPeak[,3] > 1, arr.in=TRUE)
-				rEnd = locPeak[length(locPeak[,1]),1] #end position of the region
+				bound2 = bound1 + 1
+				bound = locPeak[sort(c(bound1,bound2))]
-				#peak resolution check
+				bound = c(rStart, bound, rEnd)
-				if (resol == "region") {
+				w = 1
-					pSize = rEnd - rStart
+				warum = 0
-					signal = (geomean(Rs[rStart:rEnd,]))
+				while (w < length(bound)) {
-					signal2 = (signal) - (Cs[rStart:rEnd])
+					pStart = bound[w] + winStart - 1
+					pEnd = bound[w+1] + winStart
+					pSize = pEnd - pStart
+					signal = (geomean(Rs[(bound[w]):(bound[w+1]),]))
+					signal2 = (signal) - (Cs[bound[w]:bound[w+1]])
 					gm = mean(signal2)
-					summit = which.max(geomean(Rsr[rStart:rEnd,])) - 1
+					summit = which.max(geomean(Rsr[(bound[w]):(bound[w+1]),])) - 1
-					will2k = wilcox.test(signal, Cs[rStart:rEnd])
+					will2k = wilcox.test(signal, Cs[(bound[w]):(bound[w+1])])
+					weil = warum * 9
 					#Is there signal in the region above background
 					if (gm > 0) {
-						writethis[[1]] = rStart + winStart - 1
+						writethis[[1+weil]] = pStart
-						writethis[[2]] = rEnd + winStart
+						writethis[[2+weil]] = pEnd
-						writethis[[3]] = paste0(chromName, ".", rStart+winStart -1)
+						writethis[[3+weil]] = paste0(chromName, ".", pStart)
-						writethis[[4]] = "1000"
+						writethis[[4+weil]] = "1000"
-						writethis[[5]] = "."
+						writethis[[5+weil]] = "."
-						writethis[[6]] = gm
+						writethis[[6+weil]] = gm
-						writethis[[7]] = will2k$p.value
+						writethis[[7+weil]] = will2k$p.value
-						writethis[[8]] = "-1"
+						writethis[[8+weil]] = "-1"
-						writethis[[9]] = summit
+						writethis[[9+weil]] = summit
 					}
-				} else if (resol == "peak") {
+					w = w + 2
-					#find out where separate peaks are
+					warum = warum + 1
-					d = diff(locPeak[,1])
+				}
-					d[length(d)+1] = 0
+			} #peak resolution check
-					locPeak = cbind(locPeak, d)
+		} #clustering worked and all replicates agree on clustering assignments?
-					bound1 = which(locPeak[,3] > 1, arr.in=TRUE)
+	} #window resolution check
-					bound2 = bound1 + 1
-					bound = locPeak[sort(c(bound1,bound2))]
+	if (resol == "window") {
-					bound = c(rStart, bound, rEnd)
-					w = 1
-					warum = 0
-					while (w < length(bound)) {
-						pStart = bound[w] + winStart - 1
-						pEnd = bound[w+1] + winStart
-						pSize = pEnd - pStart
-						signal = (geomean(Rs[(bound[w]):(bound[w+1]),]))
-						signal2 = (signal) - (Cs[bound[w]:bound[w+1]])
-						gm = mean(signal2)
-						summit = which.max(geomean(Rsr[(bound[w]):(bound[w+1]),])) - 1
-						will2k = wilcox.test(signal, Cs[(bound[w]):(bound[w+1])])
-						weil = warum * 9
-						#Is there signal in the region above background
-						if (gm > 0) {
-							writethis[[1+weil]] = pStart
-							writethis[[2+weil]] = pEnd
-							writethis[[3+weil]] = paste0(chromName, ".", pStart)
-							writethis[[4+weil]] = "1000"
-							writethis[[5+weil]] = "."
-							writethis[[6+weil]] = gm
-							writethis[[7+weil]] = will2k$p.value
-							writethis[[8+weil]] = "-1"
-							writethis[[9+weil]] = summit
-						}
-						w = w + 2
-						warum = warum + 1
-					}
-				} #peak resolution check
-			} #all replicates agree on clustering assignments?
-		} #clustering worked?
-	} else if (resol == "window") { #window resolution check
 		#extract subset of the background
 		if (bkgd != "None") {
 			Cs = counts[[numdup+1]][winStart:winEnd]
 			Cs = filtfilt(rep(1,80)/80,1,Cs)
 		} else {
+			set.seed(samplingSeed)
 			Cs = sample(C, rWinSizeTemp, replace = TRUE)
 			Cs = filtfilt(rep(1,80)/80,1,Cs)
 		}
 		#calculate scores
 		#Is there signal in the region above background
 		if (gm > 0) {
 			writethis[[1]] = winStart - 1
 			writethis[[2]] = winEnd
-			writethis[[3]] = paste0(chromName, ".", winStart -1)
+			writethis[[3]] = paste0(chromName, ".", winStart -1, ".NoClust")
 			writethis[[4]] = "1000"
 			writethis[[5]] = "."
 			writethis[[6]] = gm
 			writethis[[7]] = will2k$p.value
 			writethis[[8]] = "-1"
 			writethis[[9]] = summit
 		}
-	} #window clustering
+	} #window reporting
 return(writethis)
 }
 args = commandArgs(trailingOnly = TRUE) # Read Arguments from command line
 #Parsing arguments and storing values
 for (each.arg in args) {
 	#chromosome size file
-	if (grepl('^-sfile=',each.arg)) {
+	if (grepl('-sfile=',each.arg)) {
 		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
 		if (! is.na(arg.split[2]) ) {
 				size.file <- arg.split[2]
 		} else {
 			stop('No genome size file')
 		}
 	}
 	#bed file names
-	if (grepl('^-bednames=',each.arg)) {
+	if (grepl('-bednames=',each.arg)) {
 		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
 		if (! is.na(arg.split[2]) ) {
 				bednames <- arg.split[2]
 		} else {
 			stop('No bed file names')
 		}
 	}
 	#fragment length
-	if (grepl('^-frag=',each.arg)) {
+	if (grepl('-frag=',each.arg)) {
 		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
 		if (! is.na(arg.split[2]) ) {
 				frag <- arg.split[2]
 		} else {
 			stop('No fragment length given')
 		}
 	}
 	#background file
-	if (grepl('^-bkgd=',each.arg)) {
+	if (grepl('-bkgd=',each.arg)) {
 		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
 		if (! is.na(arg.split[2]) ) {
 				bkgd <- arg.split[2]
 		} else {
 			stop('No background file')
 		}
 	}
 	#output directory
-	if (grepl('^-out=',each.arg)) {
+	if (grepl('-out=',each.arg)) {
 		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
 		if (! is.na(arg.split[2]) ) {
 				out <- arg.split[2]
 		} else {
 			stop('No output directory given')
 		}
 	}
 	#Cluster number
-	if (grepl('^-clustnummer=',each.arg)) {
+	if (grepl('-clustnummer=',each.arg)) {
 		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
 		if (! is.na(arg.split[2]) ) {
 				clustnummer <- as.numeric(arg.split[2])
 		}
 	}
 	#resolution
-	if (grepl('^-resolution=',each.arg)) {
+	if (grepl('-resolution=',each.arg)) {
 		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
 		if (! is.na(arg.split[2]) ) {
 				resol <- arg.split[2]
 		}
 	}
 	#processor cores
-	if (grepl('^-p=',each.arg)) {
+	if (grepl('-p=',each.arg)) {
 		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
 		if (! is.na(arg.split[2]) ) {
 				cornum <- as.numeric(arg.split[2])
 		}
 	}
 	#minimum window size
-	if (grepl('^-window=',each.arg)) {
+	if (grepl('-window=',each.arg)) {
 		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
 		if (! is.na(arg.split[2]) ) {
 			winSize <- arg.split[2]
 		}
 	}
 	#bin size
-	if (grepl('^-bin=',each.arg)) {
+	if (grepl('-bin=',each.arg)) {
 		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
 		if (! is.na(arg.split[2]) ) {
 			binsize <- arg.split[2]
 		}
 	}
 	#type (paired / single)
-	if (grepl('^-type=',each.arg)) {
+	if (grepl('-type=',each.arg)) {
 		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
 		if (! is.na(arg.split[2]) ) {
 			type <- arg.split[2]
 		}
 	}
 	#chromosome number
-	if (grepl('^-chrcount=',each.arg)) {
+	if (grepl('-chrcount=',each.arg)) {
 		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
 		if (! is.na(arg.split[2]) ) {
 			chrcount <- as.numeric(arg.split[2])
 		}
 	}
 	#window enrichment cutoff
-	if (grepl('^-windowe=',each.arg)) {
+	if (grepl('-windowe=',each.arg)) {
 		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
 		if (! is.na(arg.split[2]) ) {
 			windowe <- arg.split[2]
 		}
 	}
+	#initialize
+	if (grepl('-initModel=',each.arg)) {
+		arg.split <- strsplit(each.arg,'=',fixed=TRUE)[[1]]
+		if (! is.na(arg.split[2]) ) {
+			initialize <- arg.split[2]
+		}
+	}
 }
 ##Parse in variables
 chromosomes = read.table(size.file, header=FALSE)
 chromName = chromosomes$V1; #which chromosome
 chromSize = chromosomes$V2; #chromosomes size
 rm(chromosomes)
+if (chrcount == 1) {
+	write(paste(samplingSeed), file = paste0(out, "/seed.info"))
+} else {
+	samplingSeed = as.numeric(read.table(paste0(out, "/seed.info"), header = FALSE)[[1]])
+}
 readsFiles = as.list(strsplit(bednames, ",", fixed = TRUE)[[1]])
 numdup = length(readsFiles) #number of replicates
 if (bkgd != "None") {
 	readsFiles[[numdup+1]] = bkgd
 }
 # ============================
 # Estimating Background Model
 # ============================
 if (chrcount == 1){ #first chromosome, estimate bkgd (includes SNR cutoff)
 	if (is.na(cutoff)) {
 		if (bkgd != "None") {
 			cutoff = vector(length = numdup)
 			sdC = sd(counts[[numdup+1]])
 			cutoff = vector(length = numdup)
 			mmV = var(geomeanL(counts))
 			mmM = mean(geomeanL(counts))
 			sigma = log(1+((mmV) / ((mmM)^2)))
 			mu = (log(mmM)) - (0.5 * (sigma))
+			set.seed(samplingSeed)
 			C = rlnorm(100000, mu, sqrt(sigma))
 			for (x in 1:numdup) {
 				cutoff[x] = (mean(counts[[x]]))/(sd(C))
 			}
 			cutoff = max(cutoff)
-			mCs = (mean(sample(C, binSize*5, replace = TRUE)))
+			set.seed(samplingSeed)
-			dCs = (sd(sample(C, binSize*5, replace = TRUE)))
+			snow = sample(C, binSize*5, replace = TRUE)
+			mCs = mean(snow)
+			dCs = sd(snow)
 			write(paste(c(cutoff,sigma,mu)), file = paste0(out, "/bkgd.info"), append = TRUE)
 		}
 	}
 } else { #bkgd estiamted from before
 	bkgdInfo = read.table(paste0(out, "/bkgd.info"), header = FALSE)
 		C = NULL
 		mCs = NULL
 	} else {
 		sigma = as.numeric(bkgdInfo[[1]][2])
 		mu = as.numeric(bkgdInfo[[1]][3])
+		set.seed(samplingSeed)
 		C = rlnorm(100000, mu, sqrt(sigma))
-		mCs = (mean(sample(C, binSize*5, replace = TRUE)))
+		set.seed(samplingSeed)
-		dCs = (sd(sample(C, binSize*5, replace = TRUE)))
+		snow = sample(C, binSize*5, replace = TRUE)
+		mCs = mean(snow)
+		dCs = sd(snow)
 	}
 }
 #=======================> DONE!
 } else {
 	coffeeshop = lapply(bins, pickbins, counts, binSize, chromSize, numdup, C, cutoff, strict, mCs, dCs, bkgd)
 }
 coffeeshop = as.numeric(unlist(coffeeshop))
 coffeeshop[coffeeshop != numdup] = 0
+if (sum(coffeeshop) != 0) { #Any enriched bins?
 coffeeshop = c(0, diff(coffeeshop))
 coffeeshop = cbind(coffeeshop, bins)
 coffeeshopNord = coffeeshop[coffeeshop[,1] == numdup,,drop=FALSE]
 coffeeshopSud = coffeeshop[coffeeshop[,1] == -numdup,,drop=FALSE]
 coffeeshopNord = coffeeshopNord[,2]
 # ===================================
 # Initializing Clustering Parameters
 # ===================================
 if (length(coffeeshop[,1]) > 0) { #any enriched windows detected?
-yummy = ceiling(length(coffeeshop[,1]) / 4)
+if (initialize == "deterministic") {
-if (yummy > 20) {
+	yummy = ceiling(length(coffeeshop[,1]) / 1000)
-	yummy = sample(1:yummy, 20)
+	if (yummy == 0) {
-} else if (yummy > 0) {
+		yummy = 1
-	yummy = 1:yummy
+	}
-} else {
+}
-	yummy = 1
+if (initialize == "stochastic") {
+	yummy = ceiling(length(coffeeshop[,1]) / 4)
+	if (yummy > 20) {
+		set.seed(samplingSeed)
+		yummy = sample(1:yummy, 20)
+	} else if (yummy > 0) {
+		yummy = 1:yummy
+	} else {
+		yummy = 1
+	}
 }
 coffeeshopNord = coffeeshop[yummy,1]
 coffeeshopSud = coffeeshop[yummy,2]
+set.seed(samplingSeed)
 noise = rnorm(100000, mean=0, sd=0.1)
 param = initparam(coffeeshopNord, coffeeshopSud, numdup, counts, cornum, clustnummer, modelnames, noise)
 #=======================> DONE!
 		scores = mclapply(coffeeshop[,1], scorewindow, coffeeshop[,2], numdup, C, bkgd, counts, startlist = coffeeshop[,1], mc.cores = cornum, mc.preschedule = TRUE)
 	} else {
 		scores = lapply(coffeeshop[,1], scorewindow, coffeeshop[,2], numdup, C, bkgd, counts, startlist = coffeeshop[,1])
 	}
 	scores = unlist(scores)
-	#scores = scores[is.finite(scores)] #just a sanity check
 	if (windowe == "auto") {
 		lscores = log(scores)
 		if (length(scores) > 0) {
 			if (chrcount == 1) {
 # ==============
 # Finding Peaks
 # ==============
-#if (length(coffeeshop[,1]) > 0) { #any enriched windows left after filtering?, just a sanity check!
+if (length(coffeeshop[,1]) > 0) { #any enriched windows left after filtering?
 coffeeshop = coffeeshop[,-3]
 if (cornum > 1) {
-	peaks = mclapply(coffeeshop[,1], findpeak, coffeeshop[,2], numdup, C, param, bkgd, resol, counts, noise, startlist = coffeeshop[,1], meanAdjust, mc.cores = cornum, mc.preschedule = TRUE)
+	peaks = mclapply(coffeeshop[,1], findpeak, coffeeshop[,2], numdup, C, param, bkgd, resol, counts, noise, startlist = coffeeshop[,1], meanAdjust, clustnummer, mc.cores = cornum, mc.preschedule = TRUE)
 } else {
-	peaks = lapply(coffeeshop[,1], findpeak, coffeeshop[,2], numdup, C, param, bkgd, resol, counts, noise, startlist = coffeeshop[,1], meanAdjust)
+	peaks = lapply(coffeeshop[,1], findpeak, coffeeshop[,2], numdup, C, param, bkgd, resol, counts, noise, startlist = coffeeshop[,1], meanAdjust, clustnummer)
 }
 if (!(is.null(peaks))) { #any peaks discovered?
 writethis = processPeaks(peaks)
 #=======================> DONE!
 # =========================
 # Writing Peak Information
 # =========================
 } else { nothing = TRUE } #no peaks
-#} else { nothing = TRUE } #no enriched windows left after filtering, just a sanity check (at least one window should still be there)
+} else { nothing = TRUE } #no enriched windows left after filtering
 } else { nothing = TRUE } #no enriched widnows discovered
+} else { nothing = TRUE; cutthisW = windowe } #no enriched bins discovered
 if (isTRUE(nothing)) {
 	file.create(paste0(out, "/", chromName, ".peaks.bed"))
 	write(paste(chromName, minpeak, sep = "	"), file = paste0(out, "/min.peaksize"), append=TRUE)
 	if (chrcount == 1) {
-		message(paste0("No peaks found! - Window Fold Enrichment: ", round(cutthisW,2)))
+		message(paste0("No peaks found! - Window Fold Enrichment: ", cutthisW, " - Seed: ", samplingSeed))
 	} else {
 		message("No peaks found!")
 	}
 } else {
 	write(paste(chromName, writethis$X1, writethis$X2, writethis$X3, writethis$X4, writethis$X5, writethis$X6, writethis$X7, writethis$X8, writethis$X9, minpeak, sep = "	"), file = paste0(out, "/", chromName, ".peaks.bed"), ncolumns = 1)
 	write(paste(chromName, minpeak, sep = "	"), file = paste0(out, "/min.peaksize"), append=TRUE)
 	if (chrcount == 1) {
-		message(paste0("Done! - Window Fold Enrichment: ", round(cutthisW,2)))
+		message(paste0("Done! - Window Fold Enrichment: ", cutthisW, " - Seed: ", samplingSeed))
 	} else {
 		message("Done!")
 	}
 }
 #=======================> DONE!

Mercurial > repos > messersc > jamm

comparison peakfinder.r @ 1:243f75d0ed6e draft default tip