Mercurial > repos > davidvanzessen > baseline_bayesian_estimation
comparison Baseline_Functions.r @ 0:d685e7ba0ed4 draft
Uploaded
author | davidvanzessen |
---|---|
date | Tue, 15 Jul 2014 08:43:49 -0400 |
parents | |
children | 94fada165724 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d685e7ba0ed4 |
---|---|
1 ######################################################################################### | |
2 # License Agreement | |
3 # | |
4 # THIS WORK IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE | |
5 # ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER | |
6 # APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE | |
7 # OR COPYRIGHT LAW IS PROHIBITED. | |
8 # | |
9 # BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE | |
10 # BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY BE CONSIDERED | |
11 # TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN | |
12 # CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS. | |
13 # | |
14 # BASELIne: Bayesian Estimation of Antigen-Driven Selection in Immunoglobulin Sequences | |
15 # Coded by: Mohamed Uduman & Gur Yaari | |
16 # Copyright 2012 Kleinstein Lab | |
17 # Version: 1.3 (01/23/2014) | |
18 ######################################################################################### | |
19 | |
20 # Global variables | |
21 | |
22 FILTER_BY_MUTATIONS = 1000 | |
23 | |
24 # Nucleotides | |
25 NUCLEOTIDES = c("A","C","G","T") | |
26 | |
27 # Amino Acids | |
28 AMINO_ACIDS <- c("F", "F", "L", "L", "S", "S", "S", "S", "Y", "Y", "*", "*", "C", "C", "*", "W", "L", "L", "L", "L", "P", "P", "P", "P", "H", "H", "Q", "Q", "R", "R", "R", "R", "I", "I", "I", "M", "T", "T", "T", "T", "N", "N", "K", "K", "S", "S", "R", "R", "V", "V", "V", "V", "A", "A", "A", "A", "D", "D", "E", "E", "G", "G", "G", "G") | |
29 names(AMINO_ACIDS) <- c("TTT", "TTC", "TTA", "TTG", "TCT", "TCC", "TCA", "TCG", "TAT", "TAC", "TAA", "TAG", "TGT", "TGC", "TGA", "TGG", "CTT", "CTC", "CTA", "CTG", "CCT", "CCC", "CCA", "CCG", "CAT", "CAC", "CAA", "CAG", "CGT", "CGC", "CGA", "CGG", "ATT", "ATC", "ATA", "ATG", "ACT", "ACC", "ACA", "ACG", "AAT", "AAC", "AAA", "AAG", "AGT", "AGC", "AGA", "AGG", "GTT", "GTC", "GTA", "GTG", "GCT", "GCC", "GCA", "GCG", "GAT", "GAC", "GAA", "GAG", "GGT", "GGC", "GGA", "GGG") | |
30 names(AMINO_ACIDS) <- names(AMINO_ACIDS) | |
31 | |
32 #Amino Acid Traits | |
33 #"*" "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T" "V" "W" "Y" | |
34 #B = "Hydrophobic/Burried" N = "Intermediate/Neutral" S="Hydrophilic/Surface") | |
35 TRAITS_AMINO_ACIDS_CHOTHIA98 <- c("*","N","B","S","S","B","N","N","B","S","B","B","S","N","S","S","N","N","B","B","N") | |
36 names(TRAITS_AMINO_ACIDS_CHOTHIA98) <- sort(unique(AMINO_ACIDS)) | |
37 TRAITS_AMINO_ACIDS <- array(NA,21) | |
38 | |
39 # Codon Table | |
40 CODON_TABLE <- as.data.frame(matrix(NA,ncol=64,nrow=12)) | |
41 | |
42 # Substitution Model: Smith DS et al. 1996 | |
43 substitution_Literature_Mouse <- matrix(c(0, 0.156222928, 0.601501588, 0.242275484, 0.172506739, 0, 0.241239892, 0.586253369, 0.54636291, 0.255795364, 0, 0.197841727, 0.290240811, 0.467680608, 0.24207858, 0),nrow=4,byrow=T,dimnames=list(NUCLEOTIDES,NUCLEOTIDES)) | |
44 substitution_Flu_Human <- matrix(c(0,0.2795596,0.5026927,0.2177477,0.1693210,0,0.3264723,0.5042067,0.4983549,0.3328321,0,0.1688130,0.2021079,0.4696077,0.3282844,0),4,4,byrow=T,dimnames=list(NUCLEOTIDES,NUCLEOTIDES)) | |
45 substitution_Flu25_Human <- matrix(c(0,0.2580641,0.5163685,0.2255674,0.1541125,0,0.3210224,0.5248651,0.5239281,0.3101292,0,0.1659427,0.1997207,0.4579444,0.3423350,0),4,4,byrow=T,dimnames=list(NUCLEOTIDES,NUCLEOTIDES)) | |
46 load("FiveS_Substitution.RData") | |
47 | |
48 # Mutability Models: Shapiro GS et al. 2002 | |
49 triMutability_Literature_Human <- matrix(c(0.24, 1.2, 0.96, 0.43, 2.14, 2, 1.11, 1.9, 0.85, 1.83, 2.36, 1.31, 0.82, 0.52, 0.89, 1.33, 1.4, 0.82, 1.83, 0.73, 1.83, 1.62, 1.53, 0.57, 0.92, 0.42, 0.42, 1.47, 3.44, 2.58, 1.18, 0.47, 0.39, 1.12, 1.8, 0.68, 0.47, 2.19, 2.35, 2.19, 1.05, 1.84, 1.26, 0.28, 0.98, 2.37, 0.66, 1.58, 0.67, 0.92, 1.76, 0.83, 0.97, 0.56, 0.75, 0.62, 2.26, 0.62, 0.74, 1.11, 1.16, 0.61, 0.88, 0.67, 0.37, 0.07, 1.08, 0.46, 0.31, 0.94, 0.62, 0.57, 0.29, NA, 1.44, 0.46, 0.69, 0.57, 0.24, 0.37, 1.1, 0.99, 1.39, 0.6, 2.26, 1.24, 1.36, 0.52, 0.33, 0.26, 1.25, 0.37, 0.58, 1.03, 1.2, 0.34, 0.49, 0.33, 2.62, 0.16, 0.4, 0.16, 0.35, 0.75, 1.85, 0.94, 1.61, 0.85, 2.09, 1.39, 0.3, 0.52, 1.33, 0.29, 0.51, 0.26, 0.51, 3.83, 2.01, 0.71, 0.58, 0.62, 1.07, 0.28, 1.2, 0.74, 0.25, 0.59, 1.09, 0.91, 1.36, 0.45, 2.89, 1.27, 3.7, 0.69, 0.28, 0.41, 1.17, 0.56, 0.93, 3.41, 1, 1, NA, 5.9, 0.74, 2.51, 2.24, 2.24, 1.95, 3.32, 2.34, 1.3, 2.3, 1, 0.66, 0.73, 0.93, 0.41, 0.65, 0.89, 0.65, 0.32, NA, 0.43, 0.85, 0.43, 0.31, 0.31, 0.23, 0.29, 0.57, 0.71, 0.48, 0.44, 0.76, 0.51, 1.7, 0.85, 0.74, 2.23, 2.08, 1.16, 0.51, 0.51, 1, 0.5, NA, NA, 0.71, 2.14), nrow=64,byrow=T) | |
50 triMutability_Literature_Mouse <- matrix(c(1.31, 1.35, 1.42, 1.18, 2.02, 2.02, 1.02, 1.61, 1.99, 1.42, 2.01, 1.03, 2.02, 0.97, 0.53, 0.71, 1.19, 0.83, 0.96, 0.96, 0, 1.7, 2.22, 0.59, 1.24, 1.07, 0.51, 1.68, 3.36, 3.36, 1.14, 0.29, 0.33, 0.9, 1.11, 0.63, 1.08, 2.07, 2.27, 1.74, 0.22, 1.19, 2.37, 1.15, 1.15, 1.56, 0.81, 0.34, 0.87, 0.79, 2.13, 0.49, 0.85, 0.97, 0.36, 0.82, 0.66, 0.63, 1.15, 0.94, 0.85, 0.25, 0.93, 1.19, 0.4, 0.2, 0.44, 0.44, 0.88, 1.06, 0.77, 0.39, 0, 0, 0, 0, 0, 0, 0.43, 0.43, 0.86, 0.59, 0.59, 0, 1.18, 0.86, 2.9, 1.66, 0.4, 0.2, 1.54, 0.43, 0.69, 1.71, 0.68, 0.55, 0.91, 0.7, 1.71, 0.09, 0.27, 0.63, 0.2, 0.45, 1.01, 1.63, 0.96, 1.48, 2.18, 1.2, 1.31, 0.66, 2.13, 0.49, 0, 0, 0, 2.97, 2.8, 0.79, 0.4, 0.5, 0.4, 0.11, 1.68, 0.42, 0.13, 0.44, 0.93, 0.71, 1.11, 1.19, 2.71, 1.08, 3.43, 0.4, 0.67, 0.47, 1.02, 0.14, 1.56, 1.98, 0.53, 0.33, 0.63, 2.06, 1.77, 1.46, 3.74, 2.93, 2.1, 2.18, 0.78, 0.73, 2.93, 0.63, 0.57, 0.17, 0.85, 0.52, 0.31, 0.31, 0, 0, 0.51, 0.29, 0.83, 0.54, 0.28, 0.47, 0.9, 0.99, 1.24, 2.47, 0.73, 0.23, 1.13, 0.24, 2.12, 0.24, 0.33, 0.83, 1.41, 0.62, 0.28, 0.35, 0.77, 0.17, 0.72, 0.58, 0.45, 0.41), nrow=64,byrow=T) | |
51 triMutability_Names <- c("AAA", "AAC", "AAG", "AAT", "ACA", "ACC", "ACG", "ACT", "AGA", "AGC", "AGG", "AGT", "ATA", "ATC", "ATG", "ATT", "CAA", "CAC", "CAG", "CAT", "CCA", "CCC", "CCG", "CCT", "CGA", "CGC", "CGG", "CGT", "CTA", "CTC", "CTG", "CTT", "GAA", "GAC", "GAG", "GAT", "GCA", "GCC", "GCG", "GCT", "GGA", "GGC", "GGG", "GGT", "GTA", "GTC", "GTG", "GTT", "TAA", "TAC", "TAG", "TAT", "TCA", "TCC", "TCG", "TCT", "TGA", "TGC", "TGG", "TGT", "TTA", "TTC", "TTG", "TTT") | |
52 load("FiveS_Mutability.RData") | |
53 | |
54 # Functions | |
55 | |
56 # Translate codon to amino acid | |
57 translateCodonToAminoAcid<-function(Codon){ | |
58 return(AMINO_ACIDS[Codon]) | |
59 } | |
60 | |
61 # Translate amino acid to trait change | |
62 translateAminoAcidToTraitChange<-function(AminoAcid){ | |
63 return(TRAITS_AMINO_ACIDS[AminoAcid]) | |
64 } | |
65 | |
66 # Initialize Amino Acid Trait Changes | |
67 initializeTraitChange <- function(traitChangeModel=1,species=1,traitChangeFileName=NULL){ | |
68 if(!is.null(traitChangeFileName)){ | |
69 tryCatch( | |
70 traitChange <- read.delim(traitChangeFileName,sep="\t",header=T) | |
71 , error = function(ex){ | |
72 cat("Error|Error reading trait changes. Please check file name/path and format.\n") | |
73 q() | |
74 } | |
75 ) | |
76 }else{ | |
77 traitChange <- TRAITS_AMINO_ACIDS_CHOTHIA98 | |
78 } | |
79 TRAITS_AMINO_ACIDS <<- traitChange | |
80 } | |
81 | |
82 # Read in formatted nucleotide substitution matrix | |
83 initializeSubstitutionMatrix <- function(substitutionModel,species,subsMatFileName=NULL){ | |
84 if(!is.null(subsMatFileName)){ | |
85 tryCatch( | |
86 subsMat <- read.delim(subsMatFileName,sep="\t",header=T) | |
87 , error = function(ex){ | |
88 cat("Error|Error reading substitution matrix. Please check file name/path and format.\n") | |
89 q() | |
90 } | |
91 ) | |
92 if(sum(apply(subsMat,1,sum)==1)!=4) subsMat = t(apply(subsMat,1,function(x)x/sum(x))) | |
93 }else{ | |
94 if(substitutionModel==1)subsMat <- substitution_Literature_Mouse | |
95 if(substitutionModel==2)subsMat <- substitution_Flu_Human | |
96 if(substitutionModel==3)subsMat <- substitution_Flu25_Human | |
97 | |
98 } | |
99 | |
100 if(substitutionModel==0){ | |
101 subsMat <- matrix(1,4,4) | |
102 subsMat[,] = 1/3 | |
103 subsMat[1,1] = 0 | |
104 subsMat[2,2] = 0 | |
105 subsMat[3,3] = 0 | |
106 subsMat[4,4] = 0 | |
107 } | |
108 | |
109 | |
110 NUCLEOTIDESN = c(NUCLEOTIDES,"N", "-") | |
111 if(substitutionModel==5){ | |
112 subsMat <- FiveS_Substitution | |
113 return(subsMat) | |
114 }else{ | |
115 subsMat <- rbind(subsMat,rep(NA,4),rep(NA,4)) | |
116 return( matrix(data.matrix(subsMat),6,4,dimnames=list(NUCLEOTIDESN,NUCLEOTIDES) ) ) | |
117 } | |
118 } | |
119 | |
120 | |
121 # Read in formatted Mutability file | |
122 initializeMutabilityMatrix <- function(mutabilityModel=1, species=1,mutabilityMatFileName=NULL){ | |
123 if(!is.null(mutabilityMatFileName)){ | |
124 tryCatch( | |
125 mutabilityMat <- read.delim(mutabilityMatFileName,sep="\t",header=T) | |
126 , error = function(ex){ | |
127 cat("Error|Error reading mutability matrix. Please check file name/path and format.\n") | |
128 q() | |
129 } | |
130 ) | |
131 }else{ | |
132 mutabilityMat <- triMutability_Literature_Human | |
133 if(species==2) mutabilityMat <- triMutability_Literature_Mouse | |
134 } | |
135 | |
136 if(mutabilityModel==0){ mutabilityMat <- matrix(1,64,3)} | |
137 | |
138 if(mutabilityModel==5){ | |
139 mutabilityMat <- FiveS_Mutability | |
140 return(mutabilityMat) | |
141 }else{ | |
142 return( matrix( data.matrix(mutabilityMat), 64, 3, dimnames=list(triMutability_Names,1:3)) ) | |
143 } | |
144 } | |
145 | |
146 # Read FASTA file formats | |
147 # Modified from read.fasta from the seqinR package | |
148 baseline.read.fasta <- | |
149 function (file = system.file("sequences/sample.fasta", package = "seqinr"), | |
150 seqtype = c("DNA", "AA"), as.string = FALSE, forceDNAtolower = TRUE, | |
151 set.attributes = TRUE, legacy.mode = TRUE, seqonly = FALSE, | |
152 strip.desc = FALSE, sizeof.longlong = .Machine$sizeof.longlong, | |
153 endian = .Platform$endian, apply.mask = TRUE) | |
154 { | |
155 seqtype <- match.arg(seqtype) | |
156 | |
157 lines <- readLines(file) | |
158 | |
159 if (legacy.mode) { | |
160 comments <- grep("^;", lines) | |
161 if (length(comments) > 0) | |
162 lines <- lines[-comments] | |
163 } | |
164 | |
165 | |
166 ind_groups<-which(substr(lines, 1L, 3L) == ">>>") | |
167 lines_mod<-lines | |
168 | |
169 if(!length(ind_groups)){ | |
170 lines_mod<-c(">>>All sequences combined",lines) | |
171 } | |
172 | |
173 ind_groups<-which(substr(lines_mod, 1L, 3L) == ">>>") | |
174 | |
175 lines <- array("BLA",dim=(length(ind_groups)+length(lines_mod))) | |
176 id<-sapply(1:length(ind_groups),function(i)ind_groups[i]+i-1)+1 | |
177 lines[id] <- "THIS IS A FAKE SEQUENCE" | |
178 lines[-id] <- lines_mod | |
179 rm(lines_mod) | |
180 | |
181 ind <- which(substr(lines, 1L, 1L) == ">") | |
182 nseq <- length(ind) | |
183 if (nseq == 0) { | |
184 stop("no line starting with a > character found") | |
185 } | |
186 start <- ind + 1 | |
187 end <- ind - 1 | |
188 | |
189 while( any(which(ind%in%end)) ){ | |
190 ind=ind[-which(ind%in%end)] | |
191 nseq <- length(ind) | |
192 if (nseq == 0) { | |
193 stop("no line starting with a > character found") | |
194 } | |
195 start <- ind + 1 | |
196 end <- ind - 1 | |
197 } | |
198 | |
199 end <- c(end[-1], length(lines)) | |
200 sequences <- lapply(seq_len(nseq), function(i) paste(lines[start[i]:end[i]], collapse = "")) | |
201 if (seqonly) | |
202 return(sequences) | |
203 nomseq <- lapply(seq_len(nseq), function(i) { | |
204 | |
205 #firstword <- strsplit(lines[ind[i]], " ")[[1]][1] | |
206 substr(lines[ind[i]], 2, nchar(lines[ind[i]])) | |
207 | |
208 }) | |
209 if (seqtype == "DNA") { | |
210 if (forceDNAtolower) { | |
211 sequences <- as.list(tolower(chartr(".","-",sequences))) | |
212 }else{ | |
213 sequences <- as.list(toupper(chartr(".","-",sequences))) | |
214 } | |
215 } | |
216 if (as.string == FALSE) | |
217 sequences <- lapply(sequences, s2c) | |
218 if (set.attributes) { | |
219 for (i in seq_len(nseq)) { | |
220 Annot <- lines[ind[i]] | |
221 if (strip.desc) | |
222 Annot <- substr(Annot, 2L, nchar(Annot)) | |
223 attributes(sequences[[i]]) <- list(name = nomseq[[i]], | |
224 Annot = Annot, class = switch(seqtype, AA = "SeqFastaAA", | |
225 DNA = "SeqFastadna")) | |
226 } | |
227 } | |
228 names(sequences) <- nomseq | |
229 return(sequences) | |
230 } | |
231 | |
232 | |
233 # Replaces non FASTA characters in input files with N | |
234 replaceNonFASTAChars <-function(inSeq="ACGTN-AApA"){ | |
235 gsub('[^ACGTNacgt[:punct:]-[:punct:].]','N',inSeq,perl=TRUE) | |
236 } | |
237 | |
238 # Find the germlines in the FASTA list | |
239 germlinesInFile <- function(seqIDs){ | |
240 firstChar = sapply(seqIDs,function(x){substr(x,1,1)}) | |
241 secondChar = sapply(seqIDs,function(x){substr(x,2,2)}) | |
242 return(firstChar==">" & secondChar!=">") | |
243 } | |
244 | |
245 # Find the groups in the FASTA list | |
246 groupsInFile <- function(seqIDs){ | |
247 sapply(seqIDs,function(x){substr(x,1,2)})==">>" | |
248 } | |
249 | |
250 # In the process of finding germlines/groups, expand from the start to end of the group | |
251 expandTillNext <- function(vecPosToID){ | |
252 IDs = names(vecPosToID) | |
253 posOfInterests = which(vecPosToID) | |
254 | |
255 expandedID = rep(NA,length(IDs)) | |
256 expandedIDNames = gsub(">","",IDs[posOfInterests]) | |
257 startIndexes = c(1,posOfInterests[-1]) | |
258 stopIndexes = c(posOfInterests[-1]-1,length(IDs)) | |
259 expandedID = unlist(sapply(1:length(startIndexes),function(i){ | |
260 rep(i,stopIndexes[i]-startIndexes[i]+1) | |
261 })) | |
262 names(expandedID) = unlist(sapply(1:length(startIndexes),function(i){ | |
263 rep(expandedIDNames[i],stopIndexes[i]-startIndexes[i]+1) | |
264 })) | |
265 return(expandedID) | |
266 } | |
267 | |
268 # Process FASTA (list) to return a matrix[input, germline) | |
269 processInputAdvanced <- function(inputFASTA){ | |
270 | |
271 seqIDs = names(inputFASTA) | |
272 numbSeqs = length(seqIDs) | |
273 posGermlines1 = germlinesInFile(seqIDs) | |
274 numbGermlines = sum(posGermlines1) | |
275 posGroups1 = groupsInFile(seqIDs) | |
276 numbGroups = sum(posGroups1) | |
277 consDef = NA | |
278 | |
279 if(numbGermlines==0){ | |
280 posGermlines = 2 | |
281 numbGermlines = 1 | |
282 } | |
283 | |
284 glPositionsSum = cumsum(posGermlines1) | |
285 glPositions = table(glPositionsSum) | |
286 #Find the position of the conservation row | |
287 consDefPos = as.numeric(names(glPositions[names(glPositions)!=0 & glPositions==1]))+1 | |
288 if( length(consDefPos)> 0 ){ | |
289 consDefID = match(consDefPos, glPositionsSum) | |
290 #The coservation rows need to be pulled out and stores seperately | |
291 consDef = inputFASTA[consDefID] | |
292 inputFASTA = inputFASTA[-consDefID] | |
293 | |
294 seqIDs = names(inputFASTA) | |
295 numbSeqs = length(seqIDs) | |
296 posGermlines1 = germlinesInFile(seqIDs) | |
297 numbGermlines = sum(posGermlines1) | |
298 posGroups1 = groupsInFile(seqIDs) | |
299 numbGroups = sum(posGroups1) | |
300 if(numbGermlines==0){ | |
301 posGermlines = 2 | |
302 numbGermlines = 1 | |
303 } | |
304 } | |
305 | |
306 posGroups <- expandTillNext(posGroups1) | |
307 posGermlines <- expandTillNext(posGermlines1) | |
308 posGermlines[posGroups1] = 0 | |
309 names(posGermlines)[posGroups1] = names(posGroups)[posGroups1] | |
310 posInput = rep(TRUE,numbSeqs) | |
311 posInput[posGroups1 | posGermlines1] = FALSE | |
312 | |
313 matInput = matrix(NA, nrow=sum(posInput), ncol=2) | |
314 rownames(matInput) = seqIDs[posInput] | |
315 colnames(matInput) = c("Input","Germline") | |
316 | |
317 vecInputFASTA = unlist(inputFASTA) | |
318 matInput[,1] = vecInputFASTA[posInput] | |
319 matInput[,2] = vecInputFASTA[ which( names(inputFASTA)%in%paste(">",names(posGermlines)[posInput],sep="") )[ posGermlines[posInput]] ] | |
320 | |
321 germlines = posGermlines[posInput] | |
322 groups = posGroups[posInput] | |
323 | |
324 return( list("matInput"=matInput, "germlines"=germlines, "groups"=groups, "conservationDefinition"=consDef )) | |
325 } | |
326 | |
327 | |
328 # Replace leading and trailing dashes in the sequence | |
329 replaceLeadingTrailingDashes <- function(x,readEnd){ | |
330 iiGap = unlist(gregexpr("-",x[1])) | |
331 ggGap = unlist(gregexpr("-",x[2])) | |
332 #posToChange = intersect(iiGap,ggGap) | |
333 | |
334 | |
335 seqIn = replaceLeadingTrailingDashesHelper(x[1]) | |
336 seqGL = replaceLeadingTrailingDashesHelper(x[2]) | |
337 seqTemplate = rep('N',readEnd) | |
338 seqIn <- c(seqIn,seqTemplate[(length(seqIn)+1):readEnd]) | |
339 seqGL <- c(seqGL,seqTemplate[(length(seqGL)+1):readEnd]) | |
340 # if(posToChange!=-1){ | |
341 # seqIn[posToChange] = "-" | |
342 # seqGL[posToChange] = "-" | |
343 # } | |
344 | |
345 seqIn = c2s(seqIn[1:readEnd]) | |
346 seqGL = c2s(seqGL[1:readEnd]) | |
347 | |
348 lenGL = nchar(seqGL) | |
349 if(lenGL<readEnd){ | |
350 seqGL = paste(seqGL,c2s(rep("N",readEnd-lenGL)),sep="") | |
351 } | |
352 | |
353 lenInput = nchar(seqIn) | |
354 if(lenInput<readEnd){ | |
355 seqIn = paste(seqIn,c2s(rep("N",readEnd-lenInput)),sep="") | |
356 } | |
357 return( c(seqIn,seqGL) ) | |
358 } | |
359 | |
360 replaceLeadingTrailingDashesHelper <- function(x){ | |
361 grepResults = gregexpr("-*",x) | |
362 grepResultsPos = unlist(grepResults) | |
363 grepResultsLen = attr(grepResults[[1]],"match.length") | |
364 x = s2c(x) | |
365 if(x[1]=="-"){ | |
366 x[1:grepResultsLen[1]] = "N" | |
367 } | |
368 if(x[length(x)]=="-"){ | |
369 x[(length(x)-grepResultsLen[length(grepResultsLen)]+1):length(x)] = "N" | |
370 } | |
371 return(x) | |
372 } | |
373 | |
374 | |
375 | |
376 | |
377 # Check sequences for indels | |
378 checkForInDels <- function(matInputP){ | |
379 insPos <- checkInsertion(matInputP) | |
380 delPos <- checkDeletions(matInputP) | |
381 return(list("Insertions"=insPos, "Deletions"=delPos)) | |
382 } | |
383 | |
384 # Check sequences for insertions | |
385 checkInsertion <- function(matInputP){ | |
386 insertionCheck = apply( matInputP,1, function(x){ | |
387 inputGaps <- as.vector( gregexpr("-",x[1])[[1]] ) | |
388 glGaps <- as.vector( gregexpr("-",x[2])[[1]] ) | |
389 return( is.finite( match(FALSE, glGaps%in%inputGaps ) ) ) | |
390 }) | |
391 return(as.vector(insertionCheck)) | |
392 } | |
393 # Fix inserstions | |
394 fixInsertions <- function(matInputP){ | |
395 insPos <- checkInsertion(matInputP) | |
396 sapply((1:nrow(matInputP))[insPos],function(rowIndex){ | |
397 x <- matInputP[rowIndex,] | |
398 inputGaps <- gregexpr("-",x[1])[[1]] | |
399 glGaps <- gregexpr("-",x[2])[[1]] | |
400 posInsertions <- glGaps[!(glGaps%in%inputGaps)] | |
401 inputInsertionToN <- s2c(x[2]) | |
402 inputInsertionToN[posInsertions]!="-" | |
403 inputInsertionToN[posInsertions] <- "N" | |
404 inputInsertionToN <- c2s(inputInsertionToN) | |
405 matInput[rowIndex,2] <<- inputInsertionToN | |
406 }) | |
407 return(insPos) | |
408 } | |
409 | |
410 # Check sequences for deletions | |
411 checkDeletions <-function(matInputP){ | |
412 deletionCheck = apply( matInputP,1, function(x){ | |
413 inputGaps <- as.vector( gregexpr("-",x[1])[[1]] ) | |
414 glGaps <- as.vector( gregexpr("-",x[2])[[1]] ) | |
415 return( is.finite( match(FALSE, inputGaps%in%glGaps ) ) ) | |
416 }) | |
417 return(as.vector(deletionCheck)) | |
418 } | |
419 # Fix sequences with deletions | |
420 fixDeletions <- function(matInputP){ | |
421 delPos <- checkDeletions(matInputP) | |
422 sapply((1:nrow(matInputP))[delPos],function(rowIndex){ | |
423 x <- matInputP[rowIndex,] | |
424 inputGaps <- gregexpr("-",x[1])[[1]] | |
425 glGaps <- gregexpr("-",x[2])[[1]] | |
426 posDeletions <- inputGaps[!(inputGaps%in%glGaps)] | |
427 inputDeletionToN <- s2c(x[1]) | |
428 inputDeletionToN[posDeletions] <- "N" | |
429 inputDeletionToN <- c2s(inputDeletionToN) | |
430 matInput[rowIndex,1] <<- inputDeletionToN | |
431 }) | |
432 return(delPos) | |
433 } | |
434 | |
435 | |
436 # Trim DNA sequence to the last codon | |
437 trimToLastCodon <- function(seqToTrim){ | |
438 seqLen = nchar(seqToTrim) | |
439 trimmedSeq = s2c(seqToTrim) | |
440 poi = seqLen | |
441 tailLen = 0 | |
442 | |
443 while(trimmedSeq[poi]=="-" || trimmedSeq[poi]=="."){ | |
444 tailLen = tailLen + 1 | |
445 poi = poi - 1 | |
446 } | |
447 | |
448 trimmedSeq = c2s(trimmedSeq[1:(seqLen-tailLen)]) | |
449 seqLen = nchar(trimmedSeq) | |
450 # Trim sequence to last codon | |
451 if( getCodonPos(seqLen)[3] > seqLen ) | |
452 trimmedSeq = substr(seqToTrim,1, ( (getCodonPos(seqLen)[1])-1 ) ) | |
453 | |
454 return(trimmedSeq) | |
455 } | |
456 | |
457 # Given a nuclotide position, returns the pos of the 3 nucs that made the codon | |
458 # e.g. nuc 86 is part of nucs 85,86,87 | |
459 getCodonPos <- function(nucPos){ | |
460 codonNum = (ceiling(nucPos/3))*3 | |
461 return( (codonNum-2):codonNum) | |
462 } | |
463 | |
464 # Given a nuclotide position, returns the codon number | |
465 # e.g. nuc 86 = codon 29 | |
466 getCodonNumb <- function(nucPos){ | |
467 return( ceiling(nucPos/3) ) | |
468 } | |
469 | |
470 # Given a codon, returns all the nuc positions that make the codon | |
471 getCodonNucs <- function(codonNumb){ | |
472 getCodonPos(codonNumb*3) | |
473 } | |
474 | |
475 computeCodonTable <- function(testID=1){ | |
476 | |
477 if(testID<=4){ | |
478 # Pre-compute every codons | |
479 intCounter = 1 | |
480 for(pOne in NUCLEOTIDES){ | |
481 for(pTwo in NUCLEOTIDES){ | |
482 for(pThree in NUCLEOTIDES){ | |
483 codon = paste(pOne,pTwo,pThree,sep="") | |
484 colnames(CODON_TABLE)[intCounter] = codon | |
485 intCounter = intCounter + 1 | |
486 CODON_TABLE[,codon] = mutationTypeOptimized(cbind(permutateAllCodon(codon),rep(codon,12))) | |
487 } | |
488 } | |
489 } | |
490 chars = c("N","A","C","G","T", "-") | |
491 for(a in chars){ | |
492 for(b in chars){ | |
493 for(c in chars){ | |
494 if(a=="N" | b=="N" | c=="N"){ | |
495 #cat(paste(a,b,c),sep="","\n") | |
496 CODON_TABLE[,paste(a,b,c,sep="")] = rep(NA,12) | |
497 } | |
498 } | |
499 } | |
500 } | |
501 | |
502 chars = c("-","A","C","G","T") | |
503 for(a in chars){ | |
504 for(b in chars){ | |
505 for(c in chars){ | |
506 if(a=="-" | b=="-" | c=="-"){ | |
507 #cat(paste(a,b,c),sep="","\n") | |
508 CODON_TABLE[,paste(a,b,c,sep="")] = rep(NA,12) | |
509 } | |
510 } | |
511 } | |
512 } | |
513 CODON_TABLE <<- as.matrix(CODON_TABLE) | |
514 } | |
515 } | |
516 | |
517 collapseClone <- function(vecInputSeqs,glSeq,readEnd,nonTerminalOnly=0){ | |
518 #print(length(vecInputSeqs)) | |
519 vecInputSeqs = unique(vecInputSeqs) | |
520 if(length(vecInputSeqs)==1){ | |
521 return( list( c(vecInputSeqs,glSeq), F) ) | |
522 }else{ | |
523 charInputSeqs <- sapply(vecInputSeqs, function(x){ | |
524 s2c(x)[1:readEnd] | |
525 }) | |
526 charGLSeq <- s2c(glSeq) | |
527 matClone <- sapply(1:readEnd, function(i){ | |
528 posNucs = unique(charInputSeqs[i,]) | |
529 posGL = charGLSeq[i] | |
530 error = FALSE | |
531 if(posGL=="-" & sum(!(posNucs%in%c("-","N")))==0 ){ | |
532 return(c("-",error)) | |
533 } | |
534 if(length(posNucs)==1) | |
535 return(c(posNucs[1],error)) | |
536 else{ | |
537 if("N"%in%posNucs){ | |
538 error=TRUE | |
539 } | |
540 if(sum(!posNucs[posNucs!="N"]%in%posGL)==0){ | |
541 return( c(posGL,error) ) | |
542 }else{ | |
543 #return( c(sample(posNucs[posNucs!="N"],1),error) ) | |
544 if(nonTerminalOnly==0){ | |
545 return( c(sample(charInputSeqs[i,charInputSeqs[i,]!="N" & charInputSeqs[i,]!=posGL],1),error) ) | |
546 }else{ | |
547 posNucs = charInputSeqs[i,charInputSeqs[i,]!="N" & charInputSeqs[i,]!=posGL] | |
548 posNucsTable = table(posNucs) | |
549 if(sum(posNucsTable>1)==0){ | |
550 return( c(posGL,error) ) | |
551 }else{ | |
552 return( c(sample( posNucs[posNucs%in%names(posNucsTable)[posNucsTable>1]],1),error) ) | |
553 } | |
554 } | |
555 | |
556 } | |
557 } | |
558 }) | |
559 | |
560 | |
561 #print(length(vecInputSeqs)) | |
562 return(list(c(c2s(matClone[1,]),glSeq),"TRUE"%in%matClone[2,])) | |
563 } | |
564 } | |
565 | |
566 # Compute the expected for each sequence-germline pair | |
567 getExpectedIndividual <- function(matInput){ | |
568 if( any(grep("multicore",search())) ){ | |
569 facGL <- factor(matInput[,2]) | |
570 facLevels = levels(facGL) | |
571 LisGLs_MutabilityU = mclapply(1:length(facLevels), function(x){ | |
572 computeMutabilities(facLevels[x]) | |
573 }) | |
574 facIndex = match(facGL,facLevels) | |
575 | |
576 LisGLs_Mutability = mclapply(1:nrow(matInput), function(x){ | |
577 cInput = rep(NA,nchar(matInput[x,1])) | |
578 cInput[s2c(matInput[x,1])!="N"] = 1 | |
579 LisGLs_MutabilityU[[facIndex[x]]] * cInput | |
580 }) | |
581 | |
582 LisGLs_Targeting = mclapply(1:dim(matInput)[1], function(x){ | |
583 computeTargeting(matInput[x,2],LisGLs_Mutability[[x]]) | |
584 }) | |
585 | |
586 LisGLs_MutationTypes = mclapply(1:length(matInput[,2]),function(x){ | |
587 #print(x) | |
588 computeMutationTypes(matInput[x,2]) | |
589 }) | |
590 | |
591 LisGLs_Exp = mclapply(1:dim(matInput)[1], function(x){ | |
592 computeExpected(LisGLs_Targeting[[x]],LisGLs_MutationTypes[[x]]) | |
593 }) | |
594 | |
595 ul_LisGLs_Exp = unlist(LisGLs_Exp) | |
596 return(matrix(ul_LisGLs_Exp,ncol=4,nrow=(length(ul_LisGLs_Exp)/4),byrow=T)) | |
597 }else{ | |
598 facGL <- factor(matInput[,2]) | |
599 facLevels = levels(facGL) | |
600 LisGLs_MutabilityU = lapply(1:length(facLevels), function(x){ | |
601 computeMutabilities(facLevels[x]) | |
602 }) | |
603 facIndex = match(facGL,facLevels) | |
604 | |
605 LisGLs_Mutability = lapply(1:nrow(matInput), function(x){ | |
606 cInput = rep(NA,nchar(matInput[x,1])) | |
607 cInput[s2c(matInput[x,1])!="N"] = 1 | |
608 LisGLs_MutabilityU[[facIndex[x]]] * cInput | |
609 }) | |
610 | |
611 LisGLs_Targeting = lapply(1:dim(matInput)[1], function(x){ | |
612 computeTargeting(matInput[x,2],LisGLs_Mutability[[x]]) | |
613 }) | |
614 | |
615 LisGLs_MutationTypes = lapply(1:length(matInput[,2]),function(x){ | |
616 #print(x) | |
617 computeMutationTypes(matInput[x,2]) | |
618 }) | |
619 | |
620 LisGLs_Exp = lapply(1:dim(matInput)[1], function(x){ | |
621 computeExpected(LisGLs_Targeting[[x]],LisGLs_MutationTypes[[x]]) | |
622 }) | |
623 | |
624 ul_LisGLs_Exp = unlist(LisGLs_Exp) | |
625 return(matrix(ul_LisGLs_Exp,ncol=4,nrow=(length(ul_LisGLs_Exp)/4),byrow=T)) | |
626 | |
627 } | |
628 } | |
629 | |
630 # Compute mutabilities of sequence based on the tri-nucleotide model | |
631 computeMutabilities <- function(paramSeq){ | |
632 seqLen = nchar(paramSeq) | |
633 seqMutabilites = rep(NA,seqLen) | |
634 | |
635 gaplessSeq = gsub("-", "", paramSeq) | |
636 gaplessSeqLen = nchar(gaplessSeq) | |
637 gaplessSeqMutabilites = rep(NA,gaplessSeqLen) | |
638 | |
639 if(mutabilityModel!=5){ | |
640 pos<- 3:(gaplessSeqLen) | |
641 subSeq = substr(rep(gaplessSeq,gaplessSeqLen-2),(pos-2),(pos+2)) | |
642 gaplessSeqMutabilites[pos] = | |
643 tapply( c( | |
644 getMutability( substr(subSeq,1,3), 3) , | |
645 getMutability( substr(subSeq,2,4), 2), | |
646 getMutability( substr(subSeq,3,5), 1) | |
647 ),rep(1:(gaplessSeqLen-2),3),mean,na.rm=TRUE | |
648 ) | |
649 #Pos 1 | |
650 subSeq = substr(gaplessSeq,1,3) | |
651 gaplessSeqMutabilites[1] = getMutability(subSeq , 1) | |
652 #Pos 2 | |
653 subSeq = substr(gaplessSeq,1,4) | |
654 gaplessSeqMutabilites[2] = mean( c( | |
655 getMutability( substr(subSeq,1,3), 2) , | |
656 getMutability( substr(subSeq,2,4), 1) | |
657 ),na.rm=T | |
658 ) | |
659 seqMutabilites[which(s2c(paramSeq)!="-")]<- gaplessSeqMutabilites | |
660 return(seqMutabilites) | |
661 }else{ | |
662 | |
663 pos<- 3:(gaplessSeqLen) | |
664 subSeq = substr(rep(gaplessSeq,gaplessSeqLen-2),(pos-2),(pos+2)) | |
665 gaplessSeqMutabilites[pos] = sapply(subSeq,function(x){ getMutability5(x) }, simplify=T) | |
666 seqMutabilites[which(s2c(paramSeq)!="-")]<- gaplessSeqMutabilites | |
667 return(seqMutabilites) | |
668 } | |
669 | |
670 } | |
671 | |
672 # Returns the mutability of a triplet at a given position | |
673 getMutability <- function(codon, pos=1:3){ | |
674 triplets <- rownames(mutability) | |
675 mutability[ match(codon,triplets) ,pos] | |
676 } | |
677 | |
678 getMutability5 <- function(fivemer){ | |
679 return(mutability[fivemer]) | |
680 } | |
681 | |
682 # Returns the substitution probabilty | |
683 getTransistionProb <- function(nuc){ | |
684 substitution[nuc,] | |
685 } | |
686 | |
687 getTransistionProb5 <- function(fivemer){ | |
688 if(any(which(fivemer==colnames(substitution)))){ | |
689 return(substitution[,fivemer]) | |
690 }else{ | |
691 return(array(NA,4)) | |
692 } | |
693 } | |
694 | |
695 # Given a nuc, returns the other 3 nucs it can mutate to | |
696 canMutateTo <- function(nuc){ | |
697 NUCLEOTIDES[- which(NUCLEOTIDES==nuc)] | |
698 } | |
699 | |
700 # Given a nucleotide, returns the probabilty of other nucleotide it can mutate to | |
701 canMutateToProb <- function(nuc){ | |
702 substitution[nuc,canMutateTo(nuc)] | |
703 } | |
704 | |
705 # Compute targeting, based on precomputed mutatbility & substitution | |
706 computeTargeting <- function(param_strSeq,param_vecMutabilities){ | |
707 | |
708 if(substitutionModel!=5){ | |
709 vecSeq = s2c(param_strSeq) | |
710 matTargeting = sapply( 1:length(vecSeq), function(x) { param_vecMutabilities[x] * getTransistionProb(vecSeq[x]) } ) | |
711 #matTargeting = apply( rbind(vecSeq,param_vecMutabilities),2, function(x) { as.vector(as.numeric(x[2]) * getTransistionProb(x[1])) } ) | |
712 dimnames( matTargeting ) = list(NUCLEOTIDES,1:(length(vecSeq))) | |
713 return (matTargeting) | |
714 }else{ | |
715 | |
716 seqLen = nchar(param_strSeq) | |
717 seqsubstitution = matrix(NA,ncol=seqLen,nrow=4) | |
718 paramSeq <- param_strSeq | |
719 gaplessSeq = gsub("-", "", paramSeq) | |
720 gaplessSeqLen = nchar(gaplessSeq) | |
721 gaplessSeqSubstitution = matrix(NA,ncol=gaplessSeqLen,nrow=4) | |
722 | |
723 pos<- 3:(gaplessSeqLen) | |
724 subSeq = substr(rep(gaplessSeq,gaplessSeqLen-2),(pos-2),(pos+2)) | |
725 gaplessSeqSubstitution[,pos] = sapply(subSeq,function(x){ getTransistionProb5(x) }, simplify=T) | |
726 seqsubstitution[,which(s2c(paramSeq)!="-")]<- gaplessSeqSubstitution | |
727 #matTargeting <- param_vecMutabilities %*% seqsubstitution | |
728 matTargeting <- sweep(seqsubstitution,2,param_vecMutabilities,`*`) | |
729 dimnames( matTargeting ) = list(NUCLEOTIDES,1:(seqLen)) | |
730 return (matTargeting) | |
731 } | |
732 } | |
733 | |
734 # Compute the mutations types | |
735 computeMutationTypes <- function(param_strSeq){ | |
736 #cat(param_strSeq,"\n") | |
737 #vecSeq = trimToLastCodon(param_strSeq) | |
738 lenSeq = nchar(param_strSeq) | |
739 vecCodons = sapply({1:(lenSeq/3)}*3-2,function(x){substr(param_strSeq,x,x+2)}) | |
740 matMutationTypes = matrix( unlist(CODON_TABLE[,vecCodons]) ,ncol=lenSeq,nrow=4, byrow=F) | |
741 dimnames( matMutationTypes ) = list(NUCLEOTIDES,1:(ncol(matMutationTypes))) | |
742 return(matMutationTypes) | |
743 } | |
744 computeMutationTypesFast <- function(param_strSeq){ | |
745 matMutationTypes = matrix( CODON_TABLE[,param_strSeq] ,ncol=3,nrow=4, byrow=F) | |
746 #dimnames( matMutationTypes ) = list(NUCLEOTIDES,1:(length(vecSeq))) | |
747 return(matMutationTypes) | |
748 } | |
749 mutationTypeOptimized <- function( matOfCodons ){ | |
750 apply( matOfCodons,1,function(x){ mutationType(x[2],x[1]) } ) | |
751 } | |
752 | |
753 # Returns a vector of codons 1 mutation away from the given codon | |
754 permutateAllCodon <- function(codon){ | |
755 cCodon = s2c(codon) | |
756 matCodons = t(array(cCodon,dim=c(3,12))) | |
757 matCodons[1:4,1] = NUCLEOTIDES | |
758 matCodons[5:8,2] = NUCLEOTIDES | |
759 matCodons[9:12,3] = NUCLEOTIDES | |
760 apply(matCodons,1,c2s) | |
761 } | |
762 | |
763 # Given two codons, tells you if the mutation is R or S (based on your definition) | |
764 mutationType <- function(codonFrom,codonTo){ | |
765 if(testID==4){ | |
766 if( is.na(codonFrom) | is.na(codonTo) | is.na(translateCodonToAminoAcid(codonFrom)) | is.na(translateCodonToAminoAcid(codonTo)) ){ | |
767 return(NA) | |
768 }else{ | |
769 mutationType = "S" | |
770 if( translateAminoAcidToTraitChange(translateCodonToAminoAcid(codonFrom)) != translateAminoAcidToTraitChange(translateCodonToAminoAcid(codonTo)) ){ | |
771 mutationType = "R" | |
772 } | |
773 if(translateCodonToAminoAcid(codonTo)=="*" | translateCodonToAminoAcid(codonFrom)=="*"){ | |
774 mutationType = "Stop" | |
775 } | |
776 return(mutationType) | |
777 } | |
778 }else if(testID==5){ | |
779 if( is.na(codonFrom) | is.na(codonTo) | is.na(translateCodonToAminoAcid(codonFrom)) | is.na(translateCodonToAminoAcid(codonTo)) ){ | |
780 return(NA) | |
781 }else{ | |
782 if(codonFrom==codonTo){ | |
783 mutationType = "S" | |
784 }else{ | |
785 codonFrom = s2c(codonFrom) | |
786 codonTo = s2c(codonTo) | |
787 mutationType = "Stop" | |
788 nucOfI = codonFrom[which(codonTo!=codonFrom)] | |
789 if(nucOfI=="C"){ | |
790 mutationType = "R" | |
791 }else if(nucOfI=="G"){ | |
792 mutationType = "S" | |
793 } | |
794 } | |
795 return(mutationType) | |
796 } | |
797 }else{ | |
798 if( is.na(codonFrom) | is.na(codonTo) | is.na(translateCodonToAminoAcid(codonFrom)) | is.na(translateCodonToAminoAcid(codonTo)) ){ | |
799 return(NA) | |
800 }else{ | |
801 mutationType = "S" | |
802 if( translateCodonToAminoAcid(codonFrom) != translateCodonToAminoAcid(codonTo) ){ | |
803 mutationType = "R" | |
804 } | |
805 if(translateCodonToAminoAcid(codonTo)=="*" | translateCodonToAminoAcid(codonFrom)=="*"){ | |
806 mutationType = "Stop" | |
807 } | |
808 return(mutationType) | |
809 } | |
810 } | |
811 } | |
812 | |
813 | |
814 #given a mat of targeting & it's corresponding mutationtypes returns | |
815 #a vector of Exp_RCDR,Exp_SCDR,Exp_RFWR,Exp_RFWR | |
816 computeExpected <- function(paramTargeting,paramMutationTypes){ | |
817 # Replacements | |
818 RPos = which(paramMutationTypes=="R") | |
819 #FWR | |
820 Exp_R_FWR = sum(paramTargeting[ RPos[which(FWR_Nuc_Mat[RPos]==T)] ],na.rm=T) | |
821 #CDR | |
822 Exp_R_CDR = sum(paramTargeting[ RPos[which(CDR_Nuc_Mat[RPos]==T)] ],na.rm=T) | |
823 # Silents | |
824 SPos = which(paramMutationTypes=="S") | |
825 #FWR | |
826 Exp_S_FWR = sum(paramTargeting[ SPos[which(FWR_Nuc_Mat[SPos]==T)] ],na.rm=T) | |
827 #CDR | |
828 Exp_S_CDR = sum(paramTargeting[ SPos[which(CDR_Nuc_Mat[SPos]==T)] ],na.rm=T) | |
829 | |
830 return(c(Exp_R_CDR,Exp_S_CDR,Exp_R_FWR,Exp_S_FWR)) | |
831 } | |
832 | |
833 # Count the mutations in a sequence | |
834 # each mutation is treated independently | |
835 analyzeMutations2NucUri_website <- function( rev_in_matrix ){ | |
836 paramGL = rev_in_matrix[2,] | |
837 paramSeq = rev_in_matrix[1,] | |
838 | |
839 #Fill seq with GL seq if gapped | |
840 #if( any(paramSeq=="-") ){ | |
841 # gapPos_Seq = which(paramSeq=="-") | |
842 # gapPos_Seq_ToReplace = gapPos_Seq[paramGL[gapPos_Seq] != "-"] | |
843 # paramSeq[gapPos_Seq_ToReplace] = paramGL[gapPos_Seq_ToReplace] | |
844 #} | |
845 | |
846 | |
847 #if( any(paramSeq=="N") ){ | |
848 # gapPos_Seq = which(paramSeq=="N") | |
849 # gapPos_Seq_ToReplace = gapPos_Seq[paramGL[gapPos_Seq] != "N"] | |
850 # paramSeq[gapPos_Seq_ToReplace] = paramGL[gapPos_Seq_ToReplace] | |
851 #} | |
852 | |
853 analyzeMutations2NucUri( matrix(c( paramGL, paramSeq ),2,length(paramGL),byrow=T) ) | |
854 | |
855 } | |
856 | |
857 #1 = GL | |
858 #2 = Seq | |
859 analyzeMutations2NucUri <- function( in_matrix=matrix(c(c("A","A","A","C","C","C"),c("A","G","G","C","C","A")),2,6,byrow=T) ){ | |
860 paramGL = in_matrix[2,] | |
861 paramSeq = in_matrix[1,] | |
862 paramSeqUri = paramGL | |
863 #mutations = apply(rbind(paramGL,paramSeq), 2, function(x){!x[1]==x[2]}) | |
864 mutations_val = paramGL != paramSeq | |
865 if(any(mutations_val)){ | |
866 mutationPos = {1:length(mutations_val)}[mutations_val] | |
867 mutationPos = mutationPos[sapply(mutationPos, function(x){!any(paramSeq[getCodonPos(x)]=="N")})] | |
868 length_mutations =length(mutationPos) | |
869 mutationInfo = rep(NA,length_mutations) | |
870 if(any(mutationPos)){ | |
871 | |
872 pos<- mutationPos | |
873 pos_array<-array(sapply(pos,getCodonPos)) | |
874 codonGL = paramGL[pos_array] | |
875 | |
876 codonSeq = sapply(pos,function(x){ | |
877 seqP = paramGL[getCodonPos(x)] | |
878 muCodonPos = {x-1}%%3+1 | |
879 seqP[muCodonPos] = paramSeq[x] | |
880 return(seqP) | |
881 }) | |
882 GLcodons = apply(matrix(codonGL,length_mutations,3,byrow=TRUE),1,c2s) | |
883 Seqcodons = apply(codonSeq,2,c2s) | |
884 mutationInfo = apply(rbind(GLcodons , Seqcodons),2,function(x){mutationType(c2s(x[1]),c2s(x[2]))}) | |
885 names(mutationInfo) = mutationPos | |
886 } | |
887 if(any(!is.na(mutationInfo))){ | |
888 return(mutationInfo[!is.na(mutationInfo)]) | |
889 }else{ | |
890 return(NA) | |
891 } | |
892 | |
893 | |
894 }else{ | |
895 return (NA) | |
896 } | |
897 } | |
898 | |
899 processNucMutations2 <- function(mu){ | |
900 if(!is.na(mu)){ | |
901 #R | |
902 if(any(mu=="R")){ | |
903 Rs = mu[mu=="R"] | |
904 nucNumbs = as.numeric(names(Rs)) | |
905 R_CDR = sum(as.integer(CDR_Nuc[nucNumbs]),na.rm=T) | |
906 R_FWR = sum(as.integer(FWR_Nuc[nucNumbs]),na.rm=T) | |
907 }else{ | |
908 R_CDR = 0 | |
909 R_FWR = 0 | |
910 } | |
911 | |
912 #S | |
913 if(any(mu=="S")){ | |
914 Ss = mu[mu=="S"] | |
915 nucNumbs = as.numeric(names(Ss)) | |
916 S_CDR = sum(as.integer(CDR_Nuc[nucNumbs]),na.rm=T) | |
917 S_FWR = sum(as.integer(FWR_Nuc[nucNumbs]),na.rm=T) | |
918 }else{ | |
919 S_CDR = 0 | |
920 S_FWR = 0 | |
921 } | |
922 | |
923 | |
924 retVec = c(R_CDR,S_CDR,R_FWR,S_FWR) | |
925 retVec[is.na(retVec)]=0 | |
926 return(retVec) | |
927 }else{ | |
928 return(rep(0,4)) | |
929 } | |
930 } | |
931 | |
932 | |
933 ## Z-score Test | |
934 computeZScore <- function(mat, test="Focused"){ | |
935 matRes <- matrix(NA,ncol=2,nrow=(nrow(mat))) | |
936 if(test=="Focused"){ | |
937 #Z_Focused_CDR | |
938 #P_Denom = sum( mat[1,c(5,6,8)], na.rm=T ) | |
939 P = apply(mat[,c(5,6,8)],1,function(x){(x[1]/sum(x))}) | |
940 R_mean = apply(cbind(mat[,c(1,2,4)],P),1,function(x){x[4]*(sum(x[1:3]))}) | |
941 R_sd=sqrt(R_mean*(1-P)) | |
942 matRes[,1] = (mat[,1]-R_mean)/R_sd | |
943 | |
944 #Z_Focused_FWR | |
945 #P_Denom = sum( mat[1,c(7,6,8)], na.rm=T ) | |
946 P = apply(mat[,c(7,6,8)],1,function(x){(x[1]/sum(x))}) | |
947 R_mean = apply(cbind(mat[,c(3,2,4)],P),1,function(x){x[4]*(sum(x[1:3]))}) | |
948 R_sd=sqrt(R_mean*(1-P)) | |
949 matRes[,2] = (mat[,3]-R_mean)/R_sd | |
950 } | |
951 | |
952 if(test=="Local"){ | |
953 #Z_Focused_CDR | |
954 #P_Denom = sum( mat[1,c(5,6,8)], na.rm=T ) | |
955 P = apply(mat[,c(5,6)],1,function(x){(x[1]/sum(x))}) | |
956 R_mean = apply(cbind(mat[,c(1,2)],P),1,function(x){x[3]*(sum(x[1:2]))}) | |
957 R_sd=sqrt(R_mean*(1-P)) | |
958 matRes[,1] = (mat[,1]-R_mean)/R_sd | |
959 | |
960 #Z_Focused_FWR | |
961 #P_Denom = sum( mat[1,c(7,6,8)], na.rm=T ) | |
962 P = apply(mat[,c(7,8)],1,function(x){(x[1]/sum(x))}) | |
963 R_mean = apply(cbind(mat[,c(3,4)],P),1,function(x){x[3]*(sum(x[1:2]))}) | |
964 R_sd=sqrt(R_mean*(1-P)) | |
965 matRes[,2] = (mat[,3]-R_mean)/R_sd | |
966 } | |
967 | |
968 if(test=="Imbalanced"){ | |
969 #Z_Focused_CDR | |
970 #P_Denom = sum( mat[1,c(5,6,8)], na.rm=T ) | |
971 P = apply(mat[,5:8],1,function(x){((x[1]+x[2])/sum(x))}) | |
972 R_mean = apply(cbind(mat[,1:4],P),1,function(x){x[5]*(sum(x[1:4]))}) | |
973 R_sd=sqrt(R_mean*(1-P)) | |
974 matRes[,1] = (mat[,1]-R_mean)/R_sd | |
975 | |
976 #Z_Focused_FWR | |
977 #P_Denom = sum( mat[1,c(7,6,8)], na.rm=T ) | |
978 P = apply(mat[,5:8],1,function(x){((x[3]+x[4])/sum(x))}) | |
979 R_mean = apply(cbind(mat[,1:4],P),1,function(x){x[5]*(sum(x[1:4]))}) | |
980 R_sd=sqrt(R_mean*(1-P)) | |
981 matRes[,2] = (mat[,3]-R_mean)/R_sd | |
982 } | |
983 | |
984 matRes[is.nan(matRes)] = NA | |
985 return(matRes) | |
986 } | |
987 | |
988 # Return a p-value for a z-score | |
989 z2p <- function(z){ | |
990 p=NA | |
991 if( !is.nan(z) && !is.na(z)){ | |
992 if(z>0){ | |
993 p = (1 - pnorm(z,0,1)) | |
994 } else if(z<0){ | |
995 p = (-1 * pnorm(z,0,1)) | |
996 } else{ | |
997 p = 0.5 | |
998 } | |
999 }else{ | |
1000 p = NA | |
1001 } | |
1002 return(p) | |
1003 } | |
1004 | |
1005 | |
1006 ## Bayesian Test | |
1007 | |
1008 # Fitted parameter for the bayesian framework | |
1009 BAYESIAN_FITTED<-c(0.407277142798302, 0.554007336744485, 0.63777155771234, 0.693989162719009, 0.735450014674917, 0.767972534429806, 0.794557287143399, 0.816906816601605, 0.83606796225341, 0.852729446430296, 0.867370424541641, 0.880339760590323, 0.891900995024999, 0.902259181289864, 0.911577919359,0.919990301665853, 0.927606458124537, 0.934518806350661, 0.940805863754375, 0.946534836475715, 0.951763691199255, 0.95654428191308, 0.960920179487397, 0.964930893680829, 0.968611312149038, 0.971992459313836, 0.975102110004818, 0.977964943023096, 0.980603428208439, 0.983037660179428, 0.985285800977406, 0.987364285326685, 0.989288037855441, 0.991070478823525, 0.992723699729969, 0.994259575477392, 0.995687688867975, 0.997017365051493, 0.998257085153047, 0.999414558305388, 1.00049681357804, 1.00151036237481, 1.00246080204981, 1.00335370751909, 1.0041939329768, 1.0049859393417, 1.00573382091263, 1.00644127217376, 1.00711179729107, 1.00774845526417, 1.00835412715854, 1.00893143010366, 1.00948275846309, 1.01001030293661, 1.01051606798079, 1.01100188771288, 1.01146944044216, 1.01192026195449, 1.01235575766094, 1.01277721370986) | |
1010 CONST_i <- sort(c(((2^(seq(-39,0,length.out=201)))/2)[1:200],(c(0:11,13:99)+0.5)/100,1-(2^(seq(-39,0,length.out=201)))/2)) | |
1011 | |
1012 # Given x, M & p, returns a pdf | |
1013 calculate_bayes <- function ( x=3, N=10, p=0.33, | |
1014 i=CONST_i, | |
1015 max_sigma=20,length_sigma=4001 | |
1016 ){ | |
1017 if(!0%in%N){ | |
1018 G <- max(length(x),length(N),length(p)) | |
1019 x=array(x,dim=G) | |
1020 N=array(N,dim=G) | |
1021 p=array(p,dim=G) | |
1022 sigma_s<-seq(-max_sigma,max_sigma,length.out=length_sigma) | |
1023 sigma_1<-log({i/{1-i}}/{p/{1-p}}) | |
1024 index<-min(N,60) | |
1025 y<-dbeta(i,x+BAYESIAN_FITTED[index],N+BAYESIAN_FITTED[index]-x)*(1-p)*p*exp(sigma_1)/({1-p}^2+2*p*{1-p}*exp(sigma_1)+{p^2}*exp(2*sigma_1)) | |
1026 if(!sum(is.na(y))){ | |
1027 tmp<-approx(sigma_1,y,sigma_s)$y | |
1028 tmp/sum(tmp)/{2*max_sigma/{length_sigma-1}} | |
1029 }else{ | |
1030 return(NA) | |
1031 } | |
1032 }else{ | |
1033 return(NA) | |
1034 } | |
1035 } | |
1036 # Given a mat of observed & expected, return a list of CDR & FWR pdf for selection | |
1037 computeBayesianScore <- function(mat, test="Focused", max_sigma=20,length_sigma=4001){ | |
1038 flagOneSeq = F | |
1039 if(nrow(mat)==1){ | |
1040 mat=rbind(mat,mat) | |
1041 flagOneSeq = T | |
1042 } | |
1043 if(test=="Focused"){ | |
1044 #CDR | |
1045 P = c(apply(mat[,c(5,6,8)],1,function(x){(x[1]/sum(x))}),0.5) | |
1046 N = c(apply(mat[,c(1,2,4)],1,function(x){(sum(x))}),0) | |
1047 X = c(mat[,1],0) | |
1048 bayesCDR = apply(cbind(X,N,P),1,function(x){calculate_bayes(x=x[1],N=x[2],p=x[3],max_sigma=max_sigma,length_sigma=length_sigma)}) | |
1049 bayesCDR = bayesCDR[-length(bayesCDR)] | |
1050 | |
1051 #FWR | |
1052 P = c(apply(mat[,c(7,6,8)],1,function(x){(x[1]/sum(x))}),0.5) | |
1053 N = c(apply(mat[,c(3,2,4)],1,function(x){(sum(x))}),0) | |
1054 X = c(mat[,3],0) | |
1055 bayesFWR = apply(cbind(X,N,P),1,function(x){calculate_bayes(x=x[1],N=x[2],p=x[3],max_sigma=max_sigma,length_sigma=length_sigma)}) | |
1056 bayesFWR = bayesFWR[-length(bayesFWR)] | |
1057 } | |
1058 | |
1059 if(test=="Local"){ | |
1060 #CDR | |
1061 P = c(apply(mat[,c(5,6)],1,function(x){(x[1]/sum(x))}),0.5) | |
1062 N = c(apply(mat[,c(1,2)],1,function(x){(sum(x))}),0) | |
1063 X = c(mat[,1],0) | |
1064 bayesCDR = apply(cbind(X,N,P),1,function(x){calculate_bayes(x=x[1],N=x[2],p=x[3],max_sigma=max_sigma,length_sigma=length_sigma)}) | |
1065 bayesCDR = bayesCDR[-length(bayesCDR)] | |
1066 | |
1067 #FWR | |
1068 P = c(apply(mat[,c(7,8)],1,function(x){(x[1]/sum(x))}),0.5) | |
1069 N = c(apply(mat[,c(3,4)],1,function(x){(sum(x))}),0) | |
1070 X = c(mat[,3],0) | |
1071 bayesFWR = apply(cbind(X,N,P),1,function(x){calculate_bayes(x=x[1],N=x[2],p=x[3],max_sigma=max_sigma,length_sigma=length_sigma)}) | |
1072 bayesFWR = bayesFWR[-length(bayesFWR)] | |
1073 } | |
1074 | |
1075 if(test=="Imbalanced"){ | |
1076 #CDR | |
1077 P = c(apply(mat[,c(5:8)],1,function(x){((x[1]+x[2])/sum(x))}),0.5) | |
1078 N = c(apply(mat[,c(1:4)],1,function(x){(sum(x))}),0) | |
1079 X = c(apply(mat[,c(1:2)],1,function(x){(sum(x))}),0) | |
1080 bayesCDR = apply(cbind(X,N,P),1,function(x){calculate_bayes(x=x[1],N=x[2],p=x[3],max_sigma=max_sigma,length_sigma=length_sigma)}) | |
1081 bayesCDR = bayesCDR[-length(bayesCDR)] | |
1082 | |
1083 #FWR | |
1084 P = c(apply(mat[,c(5:8)],1,function(x){((x[3]+x[4])/sum(x))}),0.5) | |
1085 N = c(apply(mat[,c(1:4)],1,function(x){(sum(x))}),0) | |
1086 X = c(apply(mat[,c(3:4)],1,function(x){(sum(x))}),0) | |
1087 bayesFWR = apply(cbind(X,N,P),1,function(x){calculate_bayes(x=x[1],N=x[2],p=x[3],max_sigma=max_sigma,length_sigma=length_sigma)}) | |
1088 bayesFWR = bayesFWR[-length(bayesFWR)] | |
1089 } | |
1090 | |
1091 if(test=="ImbalancedSilent"){ | |
1092 #CDR | |
1093 P = c(apply(mat[,c(6,8)],1,function(x){((x[1])/sum(x))}),0.5) | |
1094 N = c(apply(mat[,c(2,4)],1,function(x){(sum(x))}),0) | |
1095 X = c(apply(mat[,c(2,4)],1,function(x){(x[1])}),0) | |
1096 bayesCDR = apply(cbind(X,N,P),1,function(x){calculate_bayes(x=x[1],N=x[2],p=x[3],max_sigma=max_sigma,length_sigma=length_sigma)}) | |
1097 bayesCDR = bayesCDR[-length(bayesCDR)] | |
1098 | |
1099 #FWR | |
1100 P = c(apply(mat[,c(6,8)],1,function(x){((x[2])/sum(x))}),0.5) | |
1101 N = c(apply(mat[,c(2,4)],1,function(x){(sum(x))}),0) | |
1102 X = c(apply(mat[,c(2,4)],1,function(x){(x[2])}),0) | |
1103 bayesFWR = apply(cbind(X,N,P),1,function(x){calculate_bayes(x=x[1],N=x[2],p=x[3],max_sigma=max_sigma,length_sigma=length_sigma)}) | |
1104 bayesFWR = bayesFWR[-length(bayesFWR)] | |
1105 } | |
1106 | |
1107 if(flagOneSeq==T){ | |
1108 bayesCDR = bayesCDR[1] | |
1109 bayesFWR = bayesFWR[1] | |
1110 } | |
1111 return( list("CDR"=bayesCDR, "FWR"=bayesFWR) ) | |
1112 } | |
1113 | |
1114 ##Covolution | |
1115 break2chunks<-function(G=1000){ | |
1116 base<-2^round(log(sqrt(G),2),0) | |
1117 return(c(rep(base,floor(G/base)-1),base+G-(floor(G/base)*base))) | |
1118 } | |
1119 | |
1120 PowersOfTwo <- function(G=100){ | |
1121 exponents <- array() | |
1122 i = 0 | |
1123 while(G > 0){ | |
1124 i=i+1 | |
1125 exponents[i] <- floor( log2(G) ) | |
1126 G <- G-2^exponents[i] | |
1127 } | |
1128 return(exponents) | |
1129 } | |
1130 | |
1131 convolutionPowersOfTwo <- function( cons, length_sigma=4001 ){ | |
1132 G = ncol(cons) | |
1133 if(G>1){ | |
1134 for(gen in log(G,2):1){ | |
1135 ll<-seq(from=2,to=2^gen,by=2) | |
1136 sapply(ll,function(l){cons[,l/2]<<-weighted_conv(cons[,l],cons[,l-1],length_sigma=length_sigma)}) | |
1137 } | |
1138 } | |
1139 return( cons[,1] ) | |
1140 } | |
1141 | |
1142 convolutionPowersOfTwoByTwos <- function( cons, length_sigma=4001,G=1 ){ | |
1143 if(length(ncol(cons))) G<-ncol(cons) | |
1144 groups <- PowersOfTwo(G) | |
1145 matG <- matrix(NA, ncol=length(groups), nrow=length(cons)/G ) | |
1146 startIndex = 1 | |
1147 for( i in 1:length(groups) ){ | |
1148 stopIndex <- 2^groups[i] + startIndex - 1 | |
1149 if(stopIndex!=startIndex){ | |
1150 matG[,i] <- convolutionPowersOfTwo( cons[,startIndex:stopIndex], length_sigma=length_sigma ) | |
1151 startIndex = stopIndex + 1 | |
1152 } | |
1153 else { | |
1154 if(G>1) matG[,i] <- cons[,startIndex:stopIndex] | |
1155 else matG[,i] <- cons | |
1156 #startIndex = stopIndex + 1 | |
1157 } | |
1158 } | |
1159 return( list( matG, groups ) ) | |
1160 } | |
1161 | |
1162 weighted_conv<-function(x,y,w=1,m=100,length_sigma=4001){ | |
1163 lx<-length(x) | |
1164 ly<-length(y) | |
1165 if({lx<m}| {{lx*w}<m}| {{ly}<m}| {{ly*w}<m}){ | |
1166 if(w<1){ | |
1167 y1<-approx(1:ly,y,seq(1,ly,length.out=m))$y | |
1168 x1<-approx(1:lx,x,seq(1,lx,length.out=m/w))$y | |
1169 lx<-length(x1) | |
1170 ly<-length(y1) | |
1171 } | |
1172 else { | |
1173 y1<-approx(1:ly,y,seq(1,ly,length.out=m*w))$y | |
1174 x1<-approx(1:lx,x,seq(1,lx,length.out=m))$y | |
1175 lx<-length(x1) | |
1176 ly<-length(y1) | |
1177 } | |
1178 } | |
1179 else{ | |
1180 x1<-x | |
1181 y1<-approx(1:ly,y,seq(1,ly,length.out=floor(lx*w)))$y | |
1182 ly<-length(y1) | |
1183 } | |
1184 tmp<-approx(x=1:(lx+ly-1),y=convolve(x1,rev(y1),type="open"),xout=seq(1,lx+ly-1,length.out=length_sigma))$y | |
1185 tmp[tmp<=0] = 0 | |
1186 return(tmp/sum(tmp)) | |
1187 } | |
1188 | |
1189 calculate_bayesGHelper <- function( listMatG,length_sigma=4001 ){ | |
1190 matG <- listMatG[[1]] | |
1191 groups <- listMatG[[2]] | |
1192 i = 1 | |
1193 resConv <- matG[,i] | |
1194 denom <- 2^groups[i] | |
1195 if(length(groups)>1){ | |
1196 while( i<length(groups) ){ | |
1197 i = i + 1 | |
1198 resConv <- weighted_conv(resConv, matG[,i], w= {{2^groups[i]}/denom} ,length_sigma=length_sigma) | |
1199 #cat({{2^groups[i]}/denom},"\n") | |
1200 denom <- denom + 2^groups[i] | |
1201 } | |
1202 } | |
1203 return(resConv) | |
1204 } | |
1205 | |
1206 # Given a list of PDFs, returns a convoluted PDF | |
1207 groupPosteriors <- function( listPosteriors, max_sigma=20, length_sigma=4001 ,Threshold=2 ){ | |
1208 listPosteriors = listPosteriors[ !is.na(listPosteriors) ] | |
1209 Length_Postrior<-length(listPosteriors) | |
1210 if(Length_Postrior>1 & Length_Postrior<=Threshold){ | |
1211 cons = matrix(unlist(listPosteriors),length(listPosteriors[[1]]),length(listPosteriors)) | |
1212 listMatG <- convolutionPowersOfTwoByTwos(cons,length_sigma=length_sigma) | |
1213 y<-calculate_bayesGHelper(listMatG,length_sigma=length_sigma) | |
1214 return( y/sum(y)/(2*max_sigma/(length_sigma-1)) ) | |
1215 }else if(Length_Postrior==1) return(listPosteriors[[1]]) | |
1216 else if(Length_Postrior==0) return(NA) | |
1217 else { | |
1218 cons = matrix(unlist(listPosteriors),length(listPosteriors[[1]]),length(listPosteriors)) | |
1219 y = fastConv(cons,max_sigma=max_sigma, length_sigma=length_sigma ) | |
1220 return( y/sum(y)/(2*max_sigma/(length_sigma-1)) ) | |
1221 } | |
1222 } | |
1223 | |
1224 fastConv<-function(cons, max_sigma=20, length_sigma=4001){ | |
1225 chunks<-break2chunks(G=ncol(cons)) | |
1226 if(ncol(cons)==3) chunks<-2:1 | |
1227 index_chunks_end <- cumsum(chunks) | |
1228 index_chunks_start <- c(1,index_chunks_end[-length(index_chunks_end)]+1) | |
1229 index_chunks <- cbind(index_chunks_start,index_chunks_end) | |
1230 | |
1231 case <- sum(chunks!=chunks[1]) | |
1232 if(case==1) End <- max(1,((length(index_chunks)/2)-1)) | |
1233 else End <- max(1,((length(index_chunks)/2))) | |
1234 | |
1235 firsts <- sapply(1:End,function(i){ | |
1236 indexes<-index_chunks[i,1]:index_chunks[i,2] | |
1237 convolutionPowersOfTwoByTwos(cons[ ,indexes])[[1]] | |
1238 }) | |
1239 if(case==0){ | |
1240 result<-calculate_bayesGHelper( convolutionPowersOfTwoByTwos(firsts) ) | |
1241 }else if(case==1){ | |
1242 last<-list(calculate_bayesGHelper( | |
1243 convolutionPowersOfTwoByTwos( cons[ ,index_chunks[length(index_chunks)/2,1]:index_chunks[length(index_chunks)/2,2]] ) | |
1244 ),0) | |
1245 result_first<-calculate_bayesGHelper(convolutionPowersOfTwoByTwos(firsts)) | |
1246 result<-calculate_bayesGHelper( | |
1247 list( | |
1248 cbind( | |
1249 result_first,last[[1]]), | |
1250 c(log(index_chunks_end[length(index_chunks)/2-1],2),log(index_chunks[length(index_chunks)/2,2]-index_chunks[length(index_chunks)/2,1]+1,2)) | |
1251 ) | |
1252 ) | |
1253 } | |
1254 return(as.vector(result)) | |
1255 } | |
1256 | |
1257 # Computes the 95% CI for a pdf | |
1258 calcBayesCI <- function(Pdf,low=0.025,up=0.975,max_sigma=20, length_sigma=4001){ | |
1259 if(length(Pdf)!=length_sigma) return(NA) | |
1260 sigma_s=seq(-max_sigma,max_sigma,length.out=length_sigma) | |
1261 cdf = cumsum(Pdf) | |
1262 cdf = cdf/cdf[length(cdf)] | |
1263 return( c(sigma_s[findInterval(low,cdf)-1] , sigma_s[findInterval(up,cdf)]) ) | |
1264 } | |
1265 | |
1266 # Computes a mean for a pdf | |
1267 calcBayesMean <- function(Pdf,max_sigma=20,length_sigma=4001){ | |
1268 if(length(Pdf)!=length_sigma) return(NA) | |
1269 sigma_s=seq(-max_sigma,max_sigma,length.out=length_sigma) | |
1270 norm = {length_sigma-1}/2/max_sigma | |
1271 return( (Pdf%*%sigma_s/norm) ) | |
1272 } | |
1273 | |
1274 # Returns the mean, and the 95% CI for a pdf | |
1275 calcBayesOutputInfo <- function(Pdf,low=0.025,up=0.975,max_sigma=20, length_sigma=4001){ | |
1276 if(is.na(Pdf)) | |
1277 return(rep(NA,3)) | |
1278 bCI = calcBayesCI(Pdf=Pdf,low=low,up=up,max_sigma=max_sigma,length_sigma=length_sigma) | |
1279 bMean = calcBayesMean(Pdf=Pdf,max_sigma=max_sigma,length_sigma=length_sigma) | |
1280 return(c(bMean, bCI)) | |
1281 } | |
1282 | |
1283 # Computes the p-value of a pdf | |
1284 computeSigmaP <- function(Pdf, length_sigma=4001, max_sigma=20){ | |
1285 if(length(Pdf)>1){ | |
1286 norm = {length_sigma-1}/2/max_sigma | |
1287 pVal = {sum(Pdf[1:{{length_sigma-1}/2}]) + Pdf[{{length_sigma+1}/2}]/2}/norm | |
1288 if(pVal>0.5){ | |
1289 pVal = pVal-1 | |
1290 } | |
1291 return(pVal) | |
1292 }else{ | |
1293 return(NA) | |
1294 } | |
1295 } | |
1296 | |
1297 # Compute p-value of two distributions | |
1298 compareTwoDistsFaster <-function(sigma_S=seq(-20,20,length.out=4001), N=10000, dens1=runif(4001,0,1), dens2=runif(4001,0,1)){ | |
1299 #print(c(length(dens1),length(dens2))) | |
1300 if(length(dens1)>1 & length(dens2)>1 ){ | |
1301 dens1<-dens1/sum(dens1) | |
1302 dens2<-dens2/sum(dens2) | |
1303 cum2 <- cumsum(dens2)-dens2/2 | |
1304 tmp<- sum(sapply(1:length(dens1),function(i)return(dens1[i]*cum2[i]))) | |
1305 #print(tmp) | |
1306 if(tmp>0.5)tmp<-tmp-1 | |
1307 return( tmp ) | |
1308 } | |
1309 else { | |
1310 return(NA) | |
1311 } | |
1312 #return (sum(sapply(1:N,function(i)(sample(sigma_S,1,prob=dens1)>sample(sigma_S,1,prob=dens2))))/N) | |
1313 } | |
1314 | |
1315 # get number of seqeunces contributing to the sigma (i.e. seqeunces with mutations) | |
1316 numberOfSeqsWithMutations <- function(matMutations,test=1){ | |
1317 if(test==4)test=2 | |
1318 cdrSeqs <- 0 | |
1319 fwrSeqs <- 0 | |
1320 if(test==1){#focused | |
1321 cdrMutations <- apply(matMutations, 1, function(x){ sum(x[c(1,2,4)]) }) | |
1322 fwrMutations <- apply(matMutations, 1, function(x){ sum(x[c(3,4,2)]) }) | |
1323 if( any(which(cdrMutations>0)) ) cdrSeqs <- sum(cdrMutations>0) | |
1324 if( any(which(fwrMutations>0)) ) fwrSeqs <- sum(fwrMutations>0) | |
1325 } | |
1326 if(test==2){#local | |
1327 cdrMutations <- apply(matMutations, 1, function(x){ sum(x[c(1,2)]) }) | |
1328 fwrMutations <- apply(matMutations, 1, function(x){ sum(x[c(3,4)]) }) | |
1329 if( any(which(cdrMutations>0)) ) cdrSeqs <- sum(cdrMutations>0) | |
1330 if( any(which(fwrMutations>0)) ) fwrSeqs <- sum(fwrMutations>0) | |
1331 } | |
1332 return(c("CDR"=cdrSeqs, "FWR"=fwrSeqs)) | |
1333 } | |
1334 | |
1335 | |
1336 | |
1337 shadeColor <- function(sigmaVal=NA,pVal=NA){ | |
1338 if(is.na(sigmaVal) & is.na(pVal)) return(NA) | |
1339 if(is.na(sigmaVal) & !is.na(pVal)) sigmaVal=sign(pVal) | |
1340 if(is.na(pVal) || pVal==1 || pVal==0){ | |
1341 returnColor = "#FFFFFF"; | |
1342 }else{ | |
1343 colVal=abs(pVal); | |
1344 | |
1345 if(sigmaVal<0){ | |
1346 if(colVal>0.1) | |
1347 returnColor = "#CCFFCC"; | |
1348 if(colVal<=0.1) | |
1349 returnColor = "#99FF99"; | |
1350 if(colVal<=0.050) | |
1351 returnColor = "#66FF66"; | |
1352 if(colVal<=0.010) | |
1353 returnColor = "#33FF33"; | |
1354 if(colVal<=0.005) | |
1355 returnColor = "#00FF00"; | |
1356 | |
1357 }else{ | |
1358 if(colVal>0.1) | |
1359 returnColor = "#FFCCCC"; | |
1360 if(colVal<=0.1) | |
1361 returnColor = "#FF9999"; | |
1362 if(colVal<=0.05) | |
1363 returnColor = "#FF6666"; | |
1364 if(colVal<=0.01) | |
1365 returnColor = "#FF3333"; | |
1366 if(colVal<0.005) | |
1367 returnColor = "#FF0000"; | |
1368 } | |
1369 } | |
1370 | |
1371 return(returnColor) | |
1372 } | |
1373 | |
1374 | |
1375 | |
1376 plotHelp <- function(xfrac=0.05,yfrac=0.05,log=FALSE){ | |
1377 if(!log){ | |
1378 x = par()$usr[1]-(par()$usr[2]-par()$usr[1])*xfrac | |
1379 y = par()$usr[4]+(par()$usr[4]-par()$usr[3])*yfrac | |
1380 }else { | |
1381 if(log==2){ | |
1382 x = par()$usr[1]-(par()$usr[2]-par()$usr[1])*xfrac | |
1383 y = 10^((par()$usr[4])+((par()$usr[4])-(par()$usr[3]))*yfrac) | |
1384 } | |
1385 if(log==1){ | |
1386 x = 10^((par()$usr[1])-((par()$usr[2])-(par()$usr[1]))*xfrac) | |
1387 y = par()$usr[4]+(par()$usr[4]-par()$usr[3])*yfrac | |
1388 } | |
1389 if(log==3){ | |
1390 x = 10^((par()$usr[1])-((par()$usr[2])-(par()$usr[1]))*xfrac) | |
1391 y = 10^((par()$usr[4])+((par()$usr[4])-(par()$usr[3]))*yfrac) | |
1392 } | |
1393 } | |
1394 return(c("x"=x,"y"=y)) | |
1395 } | |
1396 | |
1397 # SHMulation | |
1398 | |
1399 # Based on targeting, introduce a single mutation & then update the targeting | |
1400 oneMutation <- function(){ | |
1401 # Pick a postion + mutation | |
1402 posMutation = sample(1:(seqGermlineLen*4),1,replace=F,prob=as.vector(seqTargeting)) | |
1403 posNucNumb = ceiling(posMutation/4) # Nucleotide number | |
1404 posNucKind = 4 - ( (posNucNumb*4) - posMutation ) # Nuc the position mutates to | |
1405 | |
1406 #mutate the simulation sequence | |
1407 seqSimVec <- s2c(seqSim) | |
1408 seqSimVec[posNucNumb] <- NUCLEOTIDES[posNucKind] | |
1409 seqSim <<- c2s(seqSimVec) | |
1410 | |
1411 #update Mutability, Targeting & MutationsTypes | |
1412 updateMutabilityNTargeting(posNucNumb) | |
1413 | |
1414 #return(c(posNucNumb,NUCLEOTIDES[posNucKind])) | |
1415 return(posNucNumb) | |
1416 } | |
1417 | |
1418 updateMutabilityNTargeting <- function(position){ | |
1419 min_i<-max((position-2),1) | |
1420 max_i<-min((position+2),nchar(seqSim)) | |
1421 min_ii<-min(min_i,3) | |
1422 | |
1423 #mutability - update locally | |
1424 seqMutability[(min_i):(max_i)] <<- computeMutabilities(substr(seqSim,position-4,position+4))[(min_ii):(max_i-min_i+min_ii)] | |
1425 | |
1426 | |
1427 #targeting - compute locally | |
1428 seqTargeting[,min_i:max_i] <<- computeTargeting(substr(seqSim,min_i,max_i),seqMutability[min_i:max_i]) | |
1429 seqTargeting[is.na(seqTargeting)] <<- 0 | |
1430 #mutCodonPos = getCodonPos(position) | |
1431 mutCodonPos = seq(getCodonPos(min_i)[1],getCodonPos(max_i)[3]) | |
1432 #cat(mutCodonPos,"\n") | |
1433 mutTypeCodon = getCodonPos(position) | |
1434 seqMutationTypes[,mutTypeCodon] <<- computeMutationTypesFast( substr(seqSim,mutTypeCodon[1],mutTypeCodon[3]) ) | |
1435 # Stop = 0 | |
1436 if(any(seqMutationTypes[,mutCodonPos]=="Stop",na.rm=T )){ | |
1437 seqTargeting[,mutCodonPos][seqMutationTypes[,mutCodonPos]=="Stop"] <<- 0 | |
1438 } | |
1439 | |
1440 | |
1441 #Selection | |
1442 selectedPos = (min_i*4-4)+(which(seqMutationTypes[,min_i:max_i]=="R")) | |
1443 # CDR | |
1444 selectedCDR = selectedPos[which(matCDR[selectedPos]==T)] | |
1445 seqTargeting[selectedCDR] <<- seqTargeting[selectedCDR] * exp(selCDR) | |
1446 seqTargeting[selectedCDR] <<- seqTargeting[selectedCDR]/baseLineCDR_K | |
1447 | |
1448 # FWR | |
1449 selectedFWR = selectedPos[which(matFWR[selectedPos]==T)] | |
1450 seqTargeting[selectedFWR] <<- seqTargeting[selectedFWR] * exp(selFWR) | |
1451 seqTargeting[selectedFWR] <<- seqTargeting[selectedFWR]/baseLineFWR_K | |
1452 | |
1453 } | |
1454 | |
1455 | |
1456 | |
1457 # Validate the mutation: if the mutation has not been sampled before validate it, else discard it. | |
1458 validateMutation <- function(){ | |
1459 if( !(mutatedPos%in%mutatedPositions) ){ # if it's a new mutation | |
1460 uniqueMutationsIntroduced <<- uniqueMutationsIntroduced + 1 | |
1461 mutatedPositions[uniqueMutationsIntroduced] <<- mutatedPos | |
1462 }else{ | |
1463 if(substr(seqSim,mutatedPos,mutatedPos)==substr(seqGermline,mutatedPos,mutatedPos)){ # back to germline mutation | |
1464 mutatedPositions <<- mutatedPositions[-which(mutatedPositions==mutatedPos)] | |
1465 uniqueMutationsIntroduced <<- uniqueMutationsIntroduced - 1 | |
1466 } | |
1467 } | |
1468 } | |
1469 | |
1470 | |
1471 | |
1472 # Places text (labels) at normalized coordinates | |
1473 myaxis <- function(xfrac=0.05,yfrac=0.05,log=FALSE,w="text",cex=1,adj=1,thecol="black"){ | |
1474 par(xpd=TRUE) | |
1475 if(!log) | |
1476 text(par()$usr[1]-(par()$usr[2]-par()$usr[1])*xfrac,par()$usr[4]+(par()$usr[4]-par()$usr[3])*yfrac,w,cex=cex,adj=adj,col=thecol) | |
1477 else { | |
1478 if(log==2) | |
1479 text( | |
1480 par()$usr[1]-(par()$usr[2]-par()$usr[1])*xfrac, | |
1481 10^((par()$usr[4])+((par()$usr[4])-(par()$usr[3]))*yfrac), | |
1482 w,cex=cex,adj=adj,col=thecol) | |
1483 if(log==1) | |
1484 text( | |
1485 10^((par()$usr[1])-((par()$usr[2])-(par()$usr[1]))*xfrac), | |
1486 par()$usr[4]+(par()$usr[4]-par()$usr[3])*yfrac, | |
1487 w,cex=cex,adj=adj,col=thecol) | |
1488 if(log==3) | |
1489 text( | |
1490 10^((par()$usr[1])-((par()$usr[2])-(par()$usr[1]))*xfrac), | |
1491 10^((par()$usr[4])+((par()$usr[4])-(par()$usr[3]))*yfrac), | |
1492 w,cex=cex,adj=adj,col=thecol) | |
1493 } | |
1494 par(xpd=FALSE) | |
1495 } | |
1496 | |
1497 | |
1498 | |
1499 # Count the mutations in a sequence | |
1500 analyzeMutations <- function( inputMatrixIndex, model = 0 , multipleMutation=0, seqWithStops=0){ | |
1501 | |
1502 paramGL = s2c(matInput[inputMatrixIndex,2]) | |
1503 paramSeq = s2c(matInput[inputMatrixIndex,1]) | |
1504 | |
1505 #if( any(paramSeq=="N") ){ | |
1506 # gapPos_Seq = which(paramSeq=="N") | |
1507 # gapPos_Seq_ToReplace = gapPos_Seq[paramGL[gapPos_Seq] != "N"] | |
1508 # paramSeq[gapPos_Seq_ToReplace] = paramGL[gapPos_Seq_ToReplace] | |
1509 #} | |
1510 mutations_val = paramGL != paramSeq | |
1511 | |
1512 if(any(mutations_val)){ | |
1513 mutationPos = which(mutations_val)#{1:length(mutations_val)}[mutations_val] | |
1514 length_mutations =length(mutationPos) | |
1515 mutationInfo = rep(NA,length_mutations) | |
1516 | |
1517 pos<- mutationPos | |
1518 pos_array<-array(sapply(pos,getCodonPos)) | |
1519 codonGL = paramGL[pos_array] | |
1520 codonSeqWhole = paramSeq[pos_array] | |
1521 codonSeq = sapply(pos,function(x){ | |
1522 seqP = paramGL[getCodonPos(x)] | |
1523 muCodonPos = {x-1}%%3+1 | |
1524 seqP[muCodonPos] = paramSeq[x] | |
1525 return(seqP) | |
1526 }) | |
1527 GLcodons = apply(matrix(codonGL,length_mutations,3,byrow=TRUE),1,c2s) | |
1528 SeqcodonsWhole = apply(matrix(codonSeqWhole,length_mutations,3,byrow=TRUE),1,c2s) | |
1529 Seqcodons = apply(codonSeq,2,c2s) | |
1530 | |
1531 mutationInfo = apply(rbind(GLcodons , Seqcodons),2,function(x){mutationType(c2s(x[1]),c2s(x[2]))}) | |
1532 names(mutationInfo) = mutationPos | |
1533 | |
1534 mutationInfoWhole = apply(rbind(GLcodons , SeqcodonsWhole),2,function(x){mutationType(c2s(x[1]),c2s(x[2]))}) | |
1535 names(mutationInfoWhole) = mutationPos | |
1536 | |
1537 mutationInfo <- mutationInfo[!is.na(mutationInfo)] | |
1538 mutationInfoWhole <- mutationInfoWhole[!is.na(mutationInfoWhole)] | |
1539 | |
1540 if(any(!is.na(mutationInfo))){ | |
1541 | |
1542 #Filter based on Stop (at the codon level) | |
1543 if(seqWithStops==1){ | |
1544 nucleotidesAtStopCodons = names(mutationInfoWhole[mutationInfoWhole!="Stop"]) | |
1545 mutationInfo = mutationInfo[nucleotidesAtStopCodons] | |
1546 mutationInfoWhole = mutationInfo[nucleotidesAtStopCodons] | |
1547 }else{ | |
1548 countStops = sum(mutationInfoWhole=="Stop") | |
1549 if(seqWithStops==2 & countStops==0) mutationInfo = NA | |
1550 if(seqWithStops==3 & countStops>0) mutationInfo = NA | |
1551 } | |
1552 | |
1553 if(any(!is.na(mutationInfo))){ | |
1554 #Filter mutations based on multipleMutation | |
1555 if(multipleMutation==1 & !is.na(mutationInfo)){ | |
1556 mutationCodons = getCodonNumb(as.numeric(names(mutationInfoWhole))) | |
1557 tableMutationCodons <- table(mutationCodons) | |
1558 codonsWithMultipleMutations <- as.numeric(names(tableMutationCodons[tableMutationCodons>1])) | |
1559 if(any(codonsWithMultipleMutations)){ | |
1560 #remove the nucleotide mutations in the codons with multiple mutations | |
1561 mutationInfo <- mutationInfo[!(mutationCodons %in% codonsWithMultipleMutations)] | |
1562 #replace those codons with Ns in the input sequence | |
1563 paramSeq[unlist(lapply(codonsWithMultipleMutations, getCodonNucs))] = "N" | |
1564 matInput[inputMatrixIndex,1] <<- c2s(paramSeq) | |
1565 } | |
1566 } | |
1567 | |
1568 #Filter mutations based on the model | |
1569 if(any(mutationInfo)==T | is.na(any(mutationInfo))){ | |
1570 | |
1571 if(model==1 & !is.na(mutationInfo)){ | |
1572 mutationInfo <- mutationInfo[mutationInfo=="S"] | |
1573 } | |
1574 if(any(mutationInfo)==T | is.na(any(mutationInfo))) return(mutationInfo) | |
1575 else return(NA) | |
1576 }else{ | |
1577 return(NA) | |
1578 } | |
1579 }else{ | |
1580 return(NA) | |
1581 } | |
1582 | |
1583 | |
1584 }else{ | |
1585 return(NA) | |
1586 } | |
1587 | |
1588 | |
1589 }else{ | |
1590 return (NA) | |
1591 } | |
1592 } | |
1593 | |
1594 analyzeMutationsFixed <- function( inputArray, model = 0 , multipleMutation=0, seqWithStops=0){ | |
1595 | |
1596 paramGL = s2c(inputArray[2]) | |
1597 paramSeq = s2c(inputArray[1]) | |
1598 inputSeq <- inputArray[1] | |
1599 #if( any(paramSeq=="N") ){ | |
1600 # gapPos_Seq = which(paramSeq=="N") | |
1601 # gapPos_Seq_ToReplace = gapPos_Seq[paramGL[gapPos_Seq] != "N"] | |
1602 # paramSeq[gapPos_Seq_ToReplace] = paramGL[gapPos_Seq_ToReplace] | |
1603 #} | |
1604 mutations_val = paramGL != paramSeq | |
1605 | |
1606 if(any(mutations_val)){ | |
1607 mutationPos = which(mutations_val)#{1:length(mutations_val)}[mutations_val] | |
1608 length_mutations =length(mutationPos) | |
1609 mutationInfo = rep(NA,length_mutations) | |
1610 | |
1611 pos<- mutationPos | |
1612 pos_array<-array(sapply(pos,getCodonPos)) | |
1613 codonGL = paramGL[pos_array] | |
1614 codonSeqWhole = paramSeq[pos_array] | |
1615 codonSeq = sapply(pos,function(x){ | |
1616 seqP = paramGL[getCodonPos(x)] | |
1617 muCodonPos = {x-1}%%3+1 | |
1618 seqP[muCodonPos] = paramSeq[x] | |
1619 return(seqP) | |
1620 }) | |
1621 GLcodons = apply(matrix(codonGL,length_mutations,3,byrow=TRUE),1,c2s) | |
1622 SeqcodonsWhole = apply(matrix(codonSeqWhole,length_mutations,3,byrow=TRUE),1,c2s) | |
1623 Seqcodons = apply(codonSeq,2,c2s) | |
1624 | |
1625 mutationInfo = apply(rbind(GLcodons , Seqcodons),2,function(x){mutationType(c2s(x[1]),c2s(x[2]))}) | |
1626 names(mutationInfo) = mutationPos | |
1627 | |
1628 mutationInfoWhole = apply(rbind(GLcodons , SeqcodonsWhole),2,function(x){mutationType(c2s(x[1]),c2s(x[2]))}) | |
1629 names(mutationInfoWhole) = mutationPos | |
1630 | |
1631 mutationInfo <- mutationInfo[!is.na(mutationInfo)] | |
1632 mutationInfoWhole <- mutationInfoWhole[!is.na(mutationInfoWhole)] | |
1633 | |
1634 if(any(!is.na(mutationInfo))){ | |
1635 | |
1636 #Filter based on Stop (at the codon level) | |
1637 if(seqWithStops==1){ | |
1638 nucleotidesAtStopCodons = names(mutationInfoWhole[mutationInfoWhole!="Stop"]) | |
1639 mutationInfo = mutationInfo[nucleotidesAtStopCodons] | |
1640 mutationInfoWhole = mutationInfo[nucleotidesAtStopCodons] | |
1641 }else{ | |
1642 countStops = sum(mutationInfoWhole=="Stop") | |
1643 if(seqWithStops==2 & countStops==0) mutationInfo = NA | |
1644 if(seqWithStops==3 & countStops>0) mutationInfo = NA | |
1645 } | |
1646 | |
1647 if(any(!is.na(mutationInfo))){ | |
1648 #Filter mutations based on multipleMutation | |
1649 if(multipleMutation==1 & !is.na(mutationInfo)){ | |
1650 mutationCodons = getCodonNumb(as.numeric(names(mutationInfoWhole))) | |
1651 tableMutationCodons <- table(mutationCodons) | |
1652 codonsWithMultipleMutations <- as.numeric(names(tableMutationCodons[tableMutationCodons>1])) | |
1653 if(any(codonsWithMultipleMutations)){ | |
1654 #remove the nucleotide mutations in the codons with multiple mutations | |
1655 mutationInfo <- mutationInfo[!(mutationCodons %in% codonsWithMultipleMutations)] | |
1656 #replace those codons with Ns in the input sequence | |
1657 paramSeq[unlist(lapply(codonsWithMultipleMutations, getCodonNucs))] = "N" | |
1658 #matInput[inputMatrixIndex,1] <<- c2s(paramSeq) | |
1659 inputSeq <- c2s(paramSeq) | |
1660 } | |
1661 } | |
1662 | |
1663 #Filter mutations based on the model | |
1664 if(any(mutationInfo)==T | is.na(any(mutationInfo))){ | |
1665 | |
1666 if(model==1 & !is.na(mutationInfo)){ | |
1667 mutationInfo <- mutationInfo[mutationInfo=="S"] | |
1668 } | |
1669 if(any(mutationInfo)==T | is.na(any(mutationInfo))) return(list(mutationInfo,inputSeq)) | |
1670 else return(list(NA,inputSeq)) | |
1671 }else{ | |
1672 return(list(NA,inputSeq)) | |
1673 } | |
1674 }else{ | |
1675 return(list(NA,inputSeq)) | |
1676 } | |
1677 | |
1678 | |
1679 }else{ | |
1680 return(list(NA,inputSeq)) | |
1681 } | |
1682 | |
1683 | |
1684 }else{ | |
1685 return (list(NA,inputSeq)) | |
1686 } | |
1687 } | |
1688 | |
1689 # triMutability Background Count | |
1690 buildMutabilityModel <- function( inputMatrixIndex, model=0 , multipleMutation=0, seqWithStops=0, stopMutations=0){ | |
1691 | |
1692 #rowOrigMatInput = matInput[inputMatrixIndex,] | |
1693 seqGL = gsub("-", "", matInput[inputMatrixIndex,2]) | |
1694 seqInput = gsub("-", "", matInput[inputMatrixIndex,1]) | |
1695 #matInput[inputMatrixIndex,] <<- cbind(seqInput,seqGL) | |
1696 tempInput <- cbind(seqInput,seqGL) | |
1697 seqLength = nchar(seqGL) | |
1698 list_analyzeMutationsFixed<- analyzeMutationsFixed(tempInput, model, multipleMutation, seqWithStops) | |
1699 mutationCount <- list_analyzeMutationsFixed[[1]] | |
1700 seqInput <- list_analyzeMutationsFixed[[2]] | |
1701 BackgroundMatrix = mutabilityMatrix | |
1702 MutationMatrix = mutabilityMatrix | |
1703 MutationCountMatrix = mutabilityMatrix | |
1704 if(!is.na(mutationCount)){ | |
1705 if((stopMutations==0 & model==0) | (stopMutations==1 & (sum(mutationCount=="Stop")<length(mutationCount))) | (model==1 & (sum(mutationCount=="S")>0)) ){ | |
1706 | |
1707 fivermerStartPos = 1:(seqLength-4) | |
1708 fivemerLength <- length(fivermerStartPos) | |
1709 fivemerGL <- substr(rep(seqGL,length(fivermerStartPos)),(fivermerStartPos),(fivermerStartPos+4)) | |
1710 fivemerSeq <- substr(rep(seqInput,length(fivermerStartPos)),(fivermerStartPos),(fivermerStartPos+4)) | |
1711 | |
1712 #Background | |
1713 for(fivemerIndex in 1:fivemerLength){ | |
1714 fivemer = fivemerGL[fivemerIndex] | |
1715 if(!any(grep("N",fivemer))){ | |
1716 fivemerCodonPos = fivemerCodon(fivemerIndex) | |
1717 fivemerReadingFrameCodon = substr(fivemer,fivemerCodonPos[1],fivemerCodonPos[3]) | |
1718 fivemerReadingFrameCodonInputSeq = substr(fivemerSeq[fivemerIndex],fivemerCodonPos[1],fivemerCodonPos[3]) | |
1719 | |
1720 # All mutations model | |
1721 #if(!any(grep("N",fivemerReadingFrameCodon))){ | |
1722 if(model==0){ | |
1723 if(stopMutations==0){ | |
1724 if(!any(grep("N",fivemerReadingFrameCodonInputSeq))) | |
1725 BackgroundMatrix[fivemer] <- (BackgroundMatrix[fivemer] + 1) | |
1726 }else{ | |
1727 if( !any(grep("N",fivemerReadingFrameCodonInputSeq)) & translateCodonToAminoAcid(fivemerReadingFrameCodon)!="*" ){ | |
1728 positionWithinCodon = which(fivemerCodonPos==3)#positionsWithinCodon[(fivemerCodonPos[1]%%3)+1] | |
1729 BackgroundMatrix[fivemer] <- (BackgroundMatrix[fivemer] + probNonStopMutations[fivemerReadingFrameCodon,positionWithinCodon]) | |
1730 } | |
1731 } | |
1732 }else{ # Only silent mutations | |
1733 if( !any(grep("N",fivemerReadingFrameCodonInputSeq)) & translateCodonToAminoAcid(fivemerReadingFrameCodon)!="*" & translateCodonToAminoAcid(fivemerReadingFrameCodonInputSeq)==translateCodonToAminoAcid(fivemerReadingFrameCodon)){ | |
1734 positionWithinCodon = which(fivemerCodonPos==3) | |
1735 BackgroundMatrix[fivemer] <- (BackgroundMatrix[fivemer] + probSMutations[fivemerReadingFrameCodon,positionWithinCodon]) | |
1736 } | |
1737 } | |
1738 #} | |
1739 } | |
1740 } | |
1741 | |
1742 #Mutations | |
1743 if(stopMutations==1) mutationCount = mutationCount[mutationCount!="Stop"] | |
1744 if(model==1) mutationCount = mutationCount[mutationCount=="S"] | |
1745 mutationPositions = as.numeric(names(mutationCount)) | |
1746 mutationCount = mutationCount[mutationPositions>2 & mutationPositions<(seqLength-1)] | |
1747 mutationPositions = mutationPositions[mutationPositions>2 & mutationPositions<(seqLength-1)] | |
1748 countMutations = 0 | |
1749 for(mutationPosition in mutationPositions){ | |
1750 fivemerIndex = mutationPosition-2 | |
1751 fivemer = fivemerSeq[fivemerIndex] | |
1752 GLfivemer = fivemerGL[fivemerIndex] | |
1753 fivemerCodonPos = fivemerCodon(fivemerIndex) | |
1754 fivemerReadingFrameCodon = substr(fivemer,fivemerCodonPos[1],fivemerCodonPos[3]) | |
1755 fivemerReadingFrameCodonGL = substr(GLfivemer,fivemerCodonPos[1],fivemerCodonPos[3]) | |
1756 if(!any(grep("N",fivemer)) & !any(grep("N",GLfivemer))){ | |
1757 if(model==0){ | |
1758 countMutations = countMutations + 1 | |
1759 MutationMatrix[GLfivemer] <- (MutationMatrix[GLfivemer] + 1) | |
1760 MutationCountMatrix[GLfivemer] <- (MutationCountMatrix[GLfivemer] + 1) | |
1761 }else{ | |
1762 if( translateCodonToAminoAcid(fivemerReadingFrameCodonGL)!="*" ){ | |
1763 countMutations = countMutations + 1 | |
1764 positionWithinCodon = which(fivemerCodonPos==3) | |
1765 glNuc = substr(fivemerReadingFrameCodonGL,positionWithinCodon,positionWithinCodon) | |
1766 inputNuc = substr(fivemerReadingFrameCodon,positionWithinCodon,positionWithinCodon) | |
1767 MutationMatrix[GLfivemer] <- (MutationMatrix[GLfivemer] + substitution[glNuc,inputNuc]) | |
1768 MutationCountMatrix[GLfivemer] <- (MutationCountMatrix[GLfivemer] + 1) | |
1769 } | |
1770 } | |
1771 } | |
1772 } | |
1773 | |
1774 seqMutability = MutationMatrix/BackgroundMatrix | |
1775 seqMutability = seqMutability/sum(seqMutability,na.rm=TRUE) | |
1776 #cat(inputMatrixIndex,"\t",countMutations,"\n") | |
1777 return(list("seqMutability" = seqMutability,"numbMutations" = countMutations,"seqMutabilityCount" = MutationCountMatrix, "BackgroundMatrix"=BackgroundMatrix)) | |
1778 | |
1779 } | |
1780 } | |
1781 | |
1782 } | |
1783 | |
1784 #Returns the codon position containing the middle nucleotide | |
1785 fivemerCodon <- function(fivemerIndex){ | |
1786 codonPos = list(2:4,1:3,3:5) | |
1787 fivemerType = fivemerIndex%%3 | |
1788 return(codonPos[[fivemerType+1]]) | |
1789 } | |
1790 | |
1791 #returns probability values for one mutation in codons resulting in R, S or Stop | |
1792 probMutations <- function(typeOfMutation){ | |
1793 matMutationProb <- matrix(0,ncol=3,nrow=125,dimnames=list(words(alphabet = c(NUCLEOTIDES,"N"), length=3),c(1:3))) | |
1794 for(codon in rownames(matMutationProb)){ | |
1795 if( !any(grep("N",codon)) ){ | |
1796 for(muPos in 1:3){ | |
1797 matCodon = matrix(rep(s2c(codon),3),nrow=3,ncol=3,byrow=T) | |
1798 glNuc = matCodon[1,muPos] | |
1799 matCodon[,muPos] = canMutateTo(glNuc) | |
1800 substitutionRate = substitution[glNuc,matCodon[,muPos]] | |
1801 typeOfMutations = apply(rbind(rep(codon,3),apply(matCodon,1,c2s)),2,function(x){mutationType(c2s(x[1]),c2s(x[2]))}) | |
1802 matMutationProb[codon,muPos] <- sum(substitutionRate[typeOfMutations==typeOfMutation]) | |
1803 } | |
1804 } | |
1805 } | |
1806 | |
1807 return(matMutationProb) | |
1808 } | |
1809 | |
1810 | |
1811 | |
1812 | |
1813 #Mapping Trinucleotides to fivemers | |
1814 mapTriToFivemer <- function(triMutability=triMutability_Literature_Human){ | |
1815 rownames(triMutability) <- triMutability_Names | |
1816 Fivemer<-rep(NA,1024) | |
1817 names(Fivemer)<-words(alphabet=NUCLEOTIDES,length=5) | |
1818 Fivemer<-sapply(names(Fivemer),function(Word)return(sum( c(triMutability[substring(Word,3,5),1],triMutability[substring(Word,2,4),2],triMutability[substring(Word,1,3),3]),na.rm=TRUE))) | |
1819 Fivemer<-Fivemer/sum(Fivemer) | |
1820 return(Fivemer) | |
1821 } | |
1822 | |
1823 collapseFivemerToTri<-function(Fivemer,Weights=MutabilityWeights,position=1,NUC="A"){ | |
1824 Indices<-substring(names(Fivemer),3,3)==NUC | |
1825 Factors<-substring(names(Fivemer[Indices]),(4-position),(6-position)) | |
1826 tapply(which(Indices),Factors,function(i)weighted.mean(Fivemer[i],Weights[i],na.rm=TRUE)) | |
1827 } | |
1828 | |
1829 | |
1830 | |
1831 CountFivemerToTri<-function(Fivemer,Weights=MutabilityWeights,position=1,NUC="A"){ | |
1832 Indices<-substring(names(Fivemer),3,3)==NUC | |
1833 Factors<-substring(names(Fivemer[Indices]),(4-position),(6-position)) | |
1834 tapply(which(Indices),Factors,function(i)sum(Weights[i],na.rm=TRUE)) | |
1835 } | |
1836 | |
1837 #Uses the real counts of the mutated fivemers | |
1838 CountFivemerToTri2<-function(Fivemer,Counts=MutabilityCounts,position=1,NUC="A"){ | |
1839 Indices<-substring(names(Fivemer),3,3)==NUC | |
1840 Factors<-substring(names(Fivemer[Indices]),(4-position),(6-position)) | |
1841 tapply(which(Indices),Factors,function(i)sum(Counts[i],na.rm=TRUE)) | |
1842 } | |
1843 | |
1844 bootstrap<-function(x=c(33,12,21),M=10000,alpha=0.05){ | |
1845 N<-sum(x) | |
1846 if(N){ | |
1847 p<-x/N | |
1848 k<-length(x)-1 | |
1849 tmp<-rmultinom(M, size = N, prob=p) | |
1850 tmp_p<-apply(tmp,2,function(y)y/N) | |
1851 (apply(tmp_p,1,function(y)quantile(y,c(alpha/2/k,1-alpha/2/k)))) | |
1852 } | |
1853 else return(matrix(0,2,length(x))) | |
1854 } | |
1855 | |
1856 | |
1857 | |
1858 | |
1859 bootstrap2<-function(x=c(33,12,21),n=10,M=10000,alpha=0.05){ | |
1860 | |
1861 N<-sum(x) | |
1862 k<-length(x) | |
1863 y<-rep(1:k,x) | |
1864 tmp<-sapply(1:M,function(i)sample(y,n)) | |
1865 if(n>1)tmp_p<-sapply(1:M,function(j)sapply(1:k,function(i)sum(tmp[,j]==i)))/n | |
1866 if(n==1)tmp_p<-sapply(1:M,function(j)sapply(1:k,function(i)sum(tmp[j]==i)))/n | |
1867 (apply(tmp_p,1,function(z)quantile(z,c(alpha/2/(k-1),1-alpha/2/(k-1))))) | |
1868 } | |
1869 | |
1870 | |
1871 | |
1872 p_value<-function(x=c(33,12,21),M=100000,x_obs=c(2,5,3)){ | |
1873 n=sum(x_obs) | |
1874 N<-sum(x) | |
1875 k<-length(x) | |
1876 y<-rep(1:k,x) | |
1877 tmp<-sapply(1:M,function(i)sample(y,n)) | |
1878 if(n>1)tmp_p<-sapply(1:M,function(j)sapply(1:k,function(i)sum(tmp[,j]==i))) | |
1879 if(n==1)tmp_p<-sapply(1:M,function(j)sapply(1:k,function(i)sum(tmp[j]==i))) | |
1880 tmp<-rbind(sapply(1:3,function(i)sum(tmp_p[i,]>=x_obs[i])/M), | |
1881 sapply(1:3,function(i)sum(tmp_p[i,]<=x_obs[i])/M)) | |
1882 sapply(1:3,function(i){if(tmp[1,i]>=tmp[2,i])return(-tmp[2,i])else return(tmp[1,i])}) | |
1883 } | |
1884 | |
1885 #"D:\\Sequences\\IMGT Germlines\\Human_SNPless_IGHJ.FASTA" | |
1886 # Remove SNPs from IMGT germline segment alleles | |
1887 generateUnambiguousRepertoire <- function(repertoireInFile,repertoireOutFile){ | |
1888 repertoireIn <- read.fasta(repertoireInFile, seqtype="DNA",as.string=T,set.attributes=F,forceDNAtolower=F) | |
1889 alleleNames <- sapply(names(repertoireIn),function(x)strsplit(x,"|",fixed=TRUE)[[1]][2]) | |
1890 SNPs <- tapply(repertoireIn,sapply(alleleNames,function(x)strsplit(x,"*",fixed=TRUE)[[1]][1]),function(x){ | |
1891 Indices<-NULL | |
1892 for(i in 1:length(x)){ | |
1893 firstSeq = s2c(x[[1]]) | |
1894 iSeq = s2c(x[[i]]) | |
1895 Indices<-c(Indices,which(firstSeq[1:320]!=iSeq[1:320] & firstSeq[1:320]!="." & iSeq[1:320]!="." )) | |
1896 } | |
1897 return(sort(unique(Indices))) | |
1898 }) | |
1899 repertoireOut <- repertoireIn | |
1900 repertoireOut <- lapply(names(repertoireOut), function(repertoireName){ | |
1901 alleleName <- strsplit(repertoireName,"|",fixed=TRUE)[[1]][2] | |
1902 geneSegmentName <- strsplit(alleleName,"*",fixed=TRUE)[[1]][1] | |
1903 alleleSeq <- s2c(repertoireOut[[repertoireName]]) | |
1904 alleleSeq[as.numeric(unlist(SNPs[geneSegmentName]))] <- "N" | |
1905 alleleSeq <- c2s(alleleSeq) | |
1906 repertoireOut[[repertoireName]] <- alleleSeq | |
1907 }) | |
1908 names(repertoireOut) <- names(repertoireIn) | |
1909 write.fasta(repertoireOut,names(repertoireOut),file.out=repertoireOutFile) | |
1910 | |
1911 } | |
1912 | |
1913 | |
1914 | |
1915 | |
1916 | |
1917 | |
1918 ############ | |
1919 groupBayes2 = function(indexes, param_resultMat){ | |
1920 | |
1921 BayesGDist_Focused_CDR = calculate_bayesG( x=param_resultMat[indexes,1], N=apply(param_resultMat[indexes,c(1,2,4)],1,sum,na.rm=T), p=apply(param_resultMat[indexes,5:8],1,function(x){x[1]/(x[1]+x[2]+x[4])})) | |
1922 BayesGDist_Focused_FWR = calculate_bayesG( x=param_resultMat[indexes,3], N=apply(param_resultMat[indexes,c(3,2,4)],1,sum,na.rm=T), p=apply(param_resultMat[indexes,5:8],1,function(x){x[3]/(x[3]+x[2]+x[4])})) | |
1923 #BayesGDist_Local_CDR = calculate_bayesG( x=param_resultMat[indexes,1], N=apply(param_resultMat[indexes,c(1,2)],1,sum,na.rm=T), p=apply(param_resultMat[indexes,5:8],1,function(x){x[1]/(x[1]+x[2])})) | |
1924 #BayesGDist_Local_FWR = calculate_bayesG( x=param_resultMat[indexes,3], N=apply(param_resultMat[indexes,c(3,4)],1,sum,na.rm=T), p=apply(param_resultMat[indexes,5:8],1,function(x){x[3]/(x[3]+x[4])})) | |
1925 #BayesGDist_Global_CDR = calculate_bayesG( x=param_resultMat[indexes,1], N=apply(param_resultMat[indexes,c(1,2,3,4)],1,sum,na.rm=T), p=apply(param_resultMat[indexes,5:8],1,function(x){x[1]/(x[1]+x[2]+x[3]+x[4])})) | |
1926 #BayesGDist_Global_FWR = calculate_bayesG( x=param_resultMat[indexes,3], N=apply(param_resultMat[indexes,c(1,2,3,4)],1,sum,na.rm=T), p=apply(param_resultMat[indexes,5:8],1,function(x){x[3]/(x[1]+x[2]+x[3]+x[4])})) | |
1927 return ( list("BayesGDist_Focused_CDR"=BayesGDist_Focused_CDR, | |
1928 "BayesGDist_Focused_FWR"=BayesGDist_Focused_FWR) ) | |
1929 #"BayesGDist_Local_CDR"=BayesGDist_Local_CDR, | |
1930 #"BayesGDist_Local_FWR" = BayesGDist_Local_FWR)) | |
1931 # "BayesGDist_Global_CDR" = BayesGDist_Global_CDR, | |
1932 # "BayesGDist_Global_FWR" = BayesGDist_Global_FWR) ) | |
1933 | |
1934 | |
1935 } | |
1936 | |
1937 | |
1938 calculate_bayesG <- function( x=array(), N=array(), p=array(), max_sigma=20, length_sigma=4001){ | |
1939 G <- max(length(x),length(N),length(p)) | |
1940 x=array(x,dim=G) | |
1941 N=array(N,dim=G) | |
1942 p=array(p,dim=G) | |
1943 | |
1944 indexOfZero = N>0 & p>0 | |
1945 N = N[indexOfZero] | |
1946 x = x[indexOfZero] | |
1947 p = p[indexOfZero] | |
1948 G <- length(x) | |
1949 | |
1950 if(G){ | |
1951 | |
1952 cons<-array( dim=c(length_sigma,G) ) | |
1953 if(G==1) { | |
1954 return(calculate_bayes(x=x[G],N=N[G],p=p[G],max_sigma=max_sigma,length_sigma=length_sigma)) | |
1955 } | |
1956 else { | |
1957 for(g in 1:G) cons[,g] <- calculate_bayes(x=x[g],N=N[g],p=p[g],max_sigma=max_sigma,length_sigma=length_sigma) | |
1958 listMatG <- convolutionPowersOfTwoByTwos(cons,length_sigma=length_sigma) | |
1959 y<-calculate_bayesGHelper(listMatG,length_sigma=length_sigma) | |
1960 return( y/sum(y)/(2*max_sigma/(length_sigma-1)) ) | |
1961 } | |
1962 }else{ | |
1963 return(NA) | |
1964 } | |
1965 } | |
1966 | |
1967 | |
1968 calculate_bayesGHelper <- function( listMatG,length_sigma=4001 ){ | |
1969 matG <- listMatG[[1]] | |
1970 groups <- listMatG[[2]] | |
1971 i = 1 | |
1972 resConv <- matG[,i] | |
1973 denom <- 2^groups[i] | |
1974 if(length(groups)>1){ | |
1975 while( i<length(groups) ){ | |
1976 i = i + 1 | |
1977 resConv <- weighted_conv(resConv, matG[,i], w= {{2^groups[i]}/denom} ,length_sigma=length_sigma) | |
1978 #cat({{2^groups[i]}/denom},"\n") | |
1979 denom <- denom + 2^groups[i] | |
1980 } | |
1981 } | |
1982 return(resConv) | |
1983 } | |
1984 | |
1985 weighted_conv<-function(x,y,w=1,m=100,length_sigma=4001){ | |
1986 lx<-length(x) | |
1987 ly<-length(y) | |
1988 if({lx<m}| {{lx*w}<m}| {{ly}<m}| {{ly*w}<m}){ | |
1989 if(w<1){ | |
1990 y1<-approx(1:ly,y,seq(1,ly,length.out=m))$y | |
1991 x1<-approx(1:lx,x,seq(1,lx,length.out=m/w))$y | |
1992 lx<-length(x1) | |
1993 ly<-length(y1) | |
1994 } | |
1995 else { | |
1996 y1<-approx(1:ly,y,seq(1,ly,length.out=m*w))$y | |
1997 x1<-approx(1:lx,x,seq(1,lx,length.out=m))$y | |
1998 lx<-length(x1) | |
1999 ly<-length(y1) | |
2000 } | |
2001 } | |
2002 else{ | |
2003 x1<-x | |
2004 y1<-approx(1:ly,y,seq(1,ly,length.out=floor(lx*w)))$y | |
2005 ly<-length(y1) | |
2006 } | |
2007 tmp<-approx(x=1:(lx+ly-1),y=convolve(x1,rev(y1),type="open"),xout=seq(1,lx+ly-1,length.out=length_sigma))$y | |
2008 tmp[tmp<=0] = 0 | |
2009 return(tmp/sum(tmp)) | |
2010 } | |
2011 | |
2012 ######################## | |
2013 | |
2014 | |
2015 | |
2016 | |
2017 mutabilityMatrixONE<-rep(0,4) | |
2018 names(mutabilityMatrixONE)<-NUCLEOTIDES | |
2019 | |
2020 # triMutability Background Count | |
2021 buildMutabilityModelONE <- function( inputMatrixIndex, model=0 , multipleMutation=0, seqWithStops=0, stopMutations=0){ | |
2022 | |
2023 #rowOrigMatInput = matInput[inputMatrixIndex,] | |
2024 seqGL = gsub("-", "", matInput[inputMatrixIndex,2]) | |
2025 seqInput = gsub("-", "", matInput[inputMatrixIndex,1]) | |
2026 matInput[inputMatrixIndex,] <<- c(seqInput,seqGL) | |
2027 seqLength = nchar(seqGL) | |
2028 mutationCount <- analyzeMutations(inputMatrixIndex, model, multipleMutation, seqWithStops) | |
2029 BackgroundMatrix = mutabilityMatrixONE | |
2030 MutationMatrix = mutabilityMatrixONE | |
2031 MutationCountMatrix = mutabilityMatrixONE | |
2032 if(!is.na(mutationCount)){ | |
2033 if((stopMutations==0 & model==0) | (stopMutations==1 & (sum(mutationCount=="Stop")<length(mutationCount))) | (model==1 & (sum(mutationCount=="S")>0)) ){ | |
2034 | |
2035 # ONEmerStartPos = 1:(seqLength) | |
2036 # ONEmerLength <- length(ONEmerStartPos) | |
2037 ONEmerGL <- s2c(seqGL) | |
2038 ONEmerSeq <- s2c(seqInput) | |
2039 | |
2040 #Background | |
2041 for(ONEmerIndex in 1:seqLength){ | |
2042 ONEmer = ONEmerGL[ONEmerIndex] | |
2043 if(ONEmer!="N"){ | |
2044 ONEmerCodonPos = getCodonPos(ONEmerIndex) | |
2045 ONEmerReadingFrameCodon = c2s(ONEmerGL[ONEmerCodonPos]) | |
2046 ONEmerReadingFrameCodonInputSeq = c2s(ONEmerSeq[ONEmerCodonPos] ) | |
2047 | |
2048 # All mutations model | |
2049 #if(!any(grep("N",ONEmerReadingFrameCodon))){ | |
2050 if(model==0){ | |
2051 if(stopMutations==0){ | |
2052 if(!any(grep("N",ONEmerReadingFrameCodonInputSeq))) | |
2053 BackgroundMatrix[ONEmer] <- (BackgroundMatrix[ONEmer] + 1) | |
2054 }else{ | |
2055 if( !any(grep("N",ONEmerReadingFrameCodonInputSeq)) & translateCodonToAminoAcid(ONEmerReadingFrameCodonInputSeq)!="*"){ | |
2056 positionWithinCodon = which(ONEmerCodonPos==ONEmerIndex)#positionsWithinCodon[(ONEmerCodonPos[1]%%3)+1] | |
2057 BackgroundMatrix[ONEmer] <- (BackgroundMatrix[ONEmer] + probNonStopMutations[ONEmerReadingFrameCodon,positionWithinCodon]) | |
2058 } | |
2059 } | |
2060 }else{ # Only silent mutations | |
2061 if( !any(grep("N",ONEmerReadingFrameCodonInputSeq)) & translateCodonToAminoAcid(ONEmerReadingFrameCodonInputSeq)!="*" & translateCodonToAminoAcid(ONEmerReadingFrameCodonInputSeq)==translateCodonToAminoAcid(ONEmerReadingFrameCodon) ){ | |
2062 positionWithinCodon = which(ONEmerCodonPos==ONEmerIndex) | |
2063 BackgroundMatrix[ONEmer] <- (BackgroundMatrix[ONEmer] + probSMutations[ONEmerReadingFrameCodon,positionWithinCodon]) | |
2064 } | |
2065 } | |
2066 } | |
2067 } | |
2068 } | |
2069 | |
2070 #Mutations | |
2071 if(stopMutations==1) mutationCount = mutationCount[mutationCount!="Stop"] | |
2072 if(model==1) mutationCount = mutationCount[mutationCount=="S"] | |
2073 mutationPositions = as.numeric(names(mutationCount)) | |
2074 mutationCount = mutationCount[mutationPositions>2 & mutationPositions<(seqLength-1)] | |
2075 mutationPositions = mutationPositions[mutationPositions>2 & mutationPositions<(seqLength-1)] | |
2076 countMutations = 0 | |
2077 for(mutationPosition in mutationPositions){ | |
2078 ONEmerIndex = mutationPosition | |
2079 ONEmer = ONEmerSeq[ONEmerIndex] | |
2080 GLONEmer = ONEmerGL[ONEmerIndex] | |
2081 ONEmerCodonPos = getCodonPos(ONEmerIndex) | |
2082 ONEmerReadingFrameCodon = c2s(ONEmerSeq[ONEmerCodonPos]) | |
2083 ONEmerReadingFrameCodonGL =c2s(ONEmerGL[ONEmerCodonPos]) | |
2084 if(!any(grep("N",ONEmer)) & !any(grep("N",GLONEmer))){ | |
2085 if(model==0){ | |
2086 countMutations = countMutations + 1 | |
2087 MutationMatrix[GLONEmer] <- (MutationMatrix[GLONEmer] + 1) | |
2088 MutationCountMatrix[GLONEmer] <- (MutationCountMatrix[GLONEmer] + 1) | |
2089 }else{ | |
2090 if( translateCodonToAminoAcid(ONEmerReadingFrameCodonGL)!="*" ){ | |
2091 countMutations = countMutations + 1 | |
2092 positionWithinCodon = which(ONEmerCodonPos==ONEmerIndex) | |
2093 glNuc = substr(ONEmerReadingFrameCodonGL,positionWithinCodon,positionWithinCodon) | |
2094 inputNuc = substr(ONEmerReadingFrameCodon,positionWithinCodon,positionWithinCodon) | |
2095 MutationMatrix[GLONEmer] <- (MutationMatrix[GLONEmer] + substitution[glNuc,inputNuc]) | |
2096 MutationCountMatrix[GLONEmer] <- (MutationCountMatrix[GLONEmer] + 1) | |
2097 } | |
2098 } | |
2099 } | |
2100 } | |
2101 | |
2102 seqMutability = MutationMatrix/BackgroundMatrix | |
2103 seqMutability = seqMutability/sum(seqMutability,na.rm=TRUE) | |
2104 #cat(inputMatrixIndex,"\t",countMutations,"\n") | |
2105 return(list("seqMutability" = seqMutability,"numbMutations" = countMutations,"seqMutabilityCount" = MutationCountMatrix, "BackgroundMatrix"=BackgroundMatrix)) | |
2106 # tmp<-list("seqMutability" = seqMutability,"numbMutations" = countMutations,"seqMutabilityCount" = MutationCountMatrix) | |
2107 } | |
2108 } | |
2109 | |
2110 ################ | |
2111 # $Id: trim.R 989 2006-10-29 15:28:26Z ggorjan $ | |
2112 | |
2113 trim <- function(s, recode.factor=TRUE, ...) | |
2114 UseMethod("trim", s) | |
2115 | |
2116 trim.default <- function(s, recode.factor=TRUE, ...) | |
2117 s | |
2118 | |
2119 trim.character <- function(s, recode.factor=TRUE, ...) | |
2120 { | |
2121 s <- sub(pattern="^ +", replacement="", x=s) | |
2122 s <- sub(pattern=" +$", replacement="", x=s) | |
2123 s | |
2124 } | |
2125 | |
2126 trim.factor <- function(s, recode.factor=TRUE, ...) | |
2127 { | |
2128 levels(s) <- trim(levels(s)) | |
2129 if(recode.factor) { | |
2130 dots <- list(x=s, ...) | |
2131 if(is.null(dots$sort)) dots$sort <- sort | |
2132 s <- do.call(what=reorder.factor, args=dots) | |
2133 } | |
2134 s | |
2135 } | |
2136 | |
2137 trim.list <- function(s, recode.factor=TRUE, ...) | |
2138 lapply(s, trim, recode.factor=recode.factor, ...) | |
2139 | |
2140 trim.data.frame <- function(s, recode.factor=TRUE, ...) | |
2141 { | |
2142 s[] <- trim.list(s, recode.factor=recode.factor, ...) | |
2143 s | |
2144 } | |
2145 ####################################### | |
2146 # Compute the expected for each sequence-germline pair by codon | |
2147 getExpectedIndividualByCodon <- function(matInput){ | |
2148 if( any(grep("multicore",search())) ){ | |
2149 facGL <- factor(matInput[,2]) | |
2150 facLevels = levels(facGL) | |
2151 LisGLs_MutabilityU = mclapply(1:length(facLevels), function(x){ | |
2152 computeMutabilities(facLevels[x]) | |
2153 }) | |
2154 facIndex = match(facGL,facLevels) | |
2155 | |
2156 LisGLs_Mutability = mclapply(1:nrow(matInput), function(x){ | |
2157 cInput = rep(NA,nchar(matInput[x,1])) | |
2158 cInput[s2c(matInput[x,1])!="N"] = 1 | |
2159 LisGLs_MutabilityU[[facIndex[x]]] * cInput | |
2160 }) | |
2161 | |
2162 LisGLs_Targeting = mclapply(1:dim(matInput)[1], function(x){ | |
2163 computeTargeting(matInput[x,2],LisGLs_Mutability[[x]]) | |
2164 }) | |
2165 | |
2166 LisGLs_MutationTypes = mclapply(1:length(matInput[,2]),function(x){ | |
2167 #print(x) | |
2168 computeMutationTypes(matInput[x,2]) | |
2169 }) | |
2170 | |
2171 LisGLs_R_Exp = mclapply(1:nrow(matInput), function(x){ | |
2172 Exp_R <- rollapply(as.zoo(1:readEnd),width=3,by=3, | |
2173 function(codonNucs){ | |
2174 RPos = which(LisGLs_MutationTypes[[x]][,codonNucs]=="R") | |
2175 sum( LisGLs_Targeting[[x]][,codonNucs][RPos], na.rm=T ) | |
2176 } | |
2177 ) | |
2178 }) | |
2179 | |
2180 LisGLs_S_Exp = mclapply(1:nrow(matInput), function(x){ | |
2181 Exp_S <- rollapply(as.zoo(1:readEnd),width=3,by=3, | |
2182 function(codonNucs){ | |
2183 SPos = which(LisGLs_MutationTypes[[x]][,codonNucs]=="S") | |
2184 sum( LisGLs_Targeting[[x]][,codonNucs][SPos], na.rm=T ) | |
2185 } | |
2186 ) | |
2187 }) | |
2188 | |
2189 Exp_R = matrix(unlist(LisGLs_R_Exp),nrow=nrow(matInput),ncol=readEnd/3,T) | |
2190 Exp_S = matrix(unlist(LisGLs_S_Exp),nrow=nrow(matInput),ncol=readEnd/3,T) | |
2191 return( list( "Expected_R"=Exp_R, "Expected_S"=Exp_S) ) | |
2192 }else{ | |
2193 facGL <- factor(matInput[,2]) | |
2194 facLevels = levels(facGL) | |
2195 LisGLs_MutabilityU = lapply(1:length(facLevels), function(x){ | |
2196 computeMutabilities(facLevels[x]) | |
2197 }) | |
2198 facIndex = match(facGL,facLevels) | |
2199 | |
2200 LisGLs_Mutability = lapply(1:nrow(matInput), function(x){ | |
2201 cInput = rep(NA,nchar(matInput[x,1])) | |
2202 cInput[s2c(matInput[x,1])!="N"] = 1 | |
2203 LisGLs_MutabilityU[[facIndex[x]]] * cInput | |
2204 }) | |
2205 | |
2206 LisGLs_Targeting = lapply(1:dim(matInput)[1], function(x){ | |
2207 computeTargeting(matInput[x,2],LisGLs_Mutability[[x]]) | |
2208 }) | |
2209 | |
2210 LisGLs_MutationTypes = lapply(1:length(matInput[,2]),function(x){ | |
2211 #print(x) | |
2212 computeMutationTypes(matInput[x,2]) | |
2213 }) | |
2214 | |
2215 LisGLs_R_Exp = lapply(1:nrow(matInput), function(x){ | |
2216 Exp_R <- rollapply(as.zoo(1:readEnd),width=3,by=3, | |
2217 function(codonNucs){ | |
2218 RPos = which(LisGLs_MutationTypes[[x]][,codonNucs]=="R") | |
2219 sum( LisGLs_Targeting[[x]][,codonNucs][RPos], na.rm=T ) | |
2220 } | |
2221 ) | |
2222 }) | |
2223 | |
2224 LisGLs_S_Exp = lapply(1:nrow(matInput), function(x){ | |
2225 Exp_S <- rollapply(as.zoo(1:readEnd),width=3,by=3, | |
2226 function(codonNucs){ | |
2227 SPos = which(LisGLs_MutationTypes[[x]][,codonNucs]=="S") | |
2228 sum( LisGLs_Targeting[[x]][,codonNucs][SPos], na.rm=T ) | |
2229 } | |
2230 ) | |
2231 }) | |
2232 | |
2233 Exp_R = matrix(unlist(LisGLs_R_Exp),nrow=nrow(matInput),ncol=readEnd/3,T) | |
2234 Exp_S = matrix(unlist(LisGLs_S_Exp),nrow=nrow(matInput),ncol=readEnd/3,T) | |
2235 return( list( "Expected_R"=Exp_R, "Expected_S"=Exp_S) ) | |
2236 } | |
2237 } | |
2238 | |
2239 # getObservedMutationsByCodon <- function(listMutations){ | |
2240 # numbSeqs <- length(listMutations) | |
2241 # obsMu_R <- matrix(0,nrow=numbSeqs,ncol=readEnd/3,dimnames=list(c(1:numbSeqs),c(1:(readEnd/3)))) | |
2242 # obsMu_S <- obsMu_R | |
2243 # temp <- mclapply(1:length(listMutations), function(i){ | |
2244 # arrMutations = listMutations[[i]] | |
2245 # RPos = as.numeric(names(arrMutations)[arrMutations=="R"]) | |
2246 # RPos <- sapply(RPos,getCodonNumb) | |
2247 # if(any(RPos)){ | |
2248 # tabR <- table(RPos) | |
2249 # obsMu_R[i,as.numeric(names(tabR))] <<- tabR | |
2250 # } | |
2251 # | |
2252 # SPos = as.numeric(names(arrMutations)[arrMutations=="S"]) | |
2253 # SPos <- sapply(SPos,getCodonNumb) | |
2254 # if(any(SPos)){ | |
2255 # tabS <- table(SPos) | |
2256 # obsMu_S[i,names(tabS)] <<- tabS | |
2257 # } | |
2258 # } | |
2259 # ) | |
2260 # return( list( "Observed_R"=obsMu_R, "Observed_S"=obsMu_S) ) | |
2261 # } | |
2262 | |
2263 getObservedMutationsByCodon <- function(listMutations){ | |
2264 numbSeqs <- length(listMutations) | |
2265 obsMu_R <- matrix(0,nrow=numbSeqs,ncol=readEnd/3,dimnames=list(c(1:numbSeqs),c(1:(readEnd/3)))) | |
2266 obsMu_S <- obsMu_R | |
2267 temp <- lapply(1:length(listMutations), function(i){ | |
2268 arrMutations = listMutations[[i]] | |
2269 RPos = as.numeric(names(arrMutations)[arrMutations=="R"]) | |
2270 RPos <- sapply(RPos,getCodonNumb) | |
2271 if(any(RPos)){ | |
2272 tabR <- table(RPos) | |
2273 obsMu_R[i,as.numeric(names(tabR))] <<- tabR | |
2274 } | |
2275 | |
2276 SPos = as.numeric(names(arrMutations)[arrMutations=="S"]) | |
2277 SPos <- sapply(SPos,getCodonNumb) | |
2278 if(any(SPos)){ | |
2279 tabS <- table(SPos) | |
2280 obsMu_S[i,names(tabS)] <<- tabS | |
2281 } | |
2282 } | |
2283 ) | |
2284 return( list( "Observed_R"=obsMu_R, "Observed_S"=obsMu_S) ) | |
2285 } | |
2286 |