Mercurial > repos > davidvanzessen > baseline_bayesian_estimation
comparison Baseline_Main.r @ 0:d685e7ba0ed4 draft
Uploaded
author | davidvanzessen |
---|---|
date | Tue, 15 Jul 2014 08:43:49 -0400 |
parents | |
children | d8de51314d3f |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d685e7ba0ed4 |
---|---|
1 ######################################################################################### | |
2 # License Agreement | |
3 # | |
4 # THIS WORK IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE | |
5 # ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER | |
6 # APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE | |
7 # OR COPYRIGHT LAW IS PROHIBITED. | |
8 # | |
9 # BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE | |
10 # BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY BE CONSIDERED | |
11 # TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN | |
12 # CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS. | |
13 # | |
14 # BASELIne: Bayesian Estimation of Antigen-Driven Selection in Immunoglobulin Sequences | |
15 # Coded by: Mohamed Uduman & Gur Yaari | |
16 # Copyright 2012 Kleinstein Lab | |
17 # Version: 1.3 (01/23/2014) | |
18 ######################################################################################### | |
19 | |
20 op <- options(); | |
21 options(showWarnCalls=FALSE, showErrorCalls=FALSE, warn=-1) | |
22 library('seqinr') | |
23 if( Sys.info()[1]=="Linux"){ | |
24 library("multicore") | |
25 } | |
26 | |
27 | |
28 # Load functions and initialize global variables | |
29 source("Baseline_Functions.r") | |
30 | |
31 # Initialize parameters with user provided arguments | |
32 arg <- commandArgs(TRUE) | |
33 #arg = c(2,1,5,5,0,1,"1:26:38:55:65:104:116", "test.fasta","","sample") | |
34 #arg = c(1,1,5,5,0,1,"1:38:55:65:104:116:200", "test.fasta","","sample") | |
35 #arg = c(1,1,5,5,1,1,"1:26:38:55:65:104:116", "/home/mu37/Wu/Wu_Cloned_gapped_sequences_D-masked.fasta","/home/mu37/Wu/","Wu") | |
36 testID <- as.numeric(arg[1]) # 1 = Focused, 2 = Local | |
37 species <- as.numeric(arg[2]) # 1 = Human. 2 = Mouse | |
38 substitutionModel <- as.numeric(arg[3]) # 0 = Uniform substitution, 1 = Smith DS et al. 1996, 5 = FiveS | |
39 mutabilityModel <- as.numeric(arg[4]) # 0 = Uniform mutablity, 1 = Tri-nucleotide (Shapiro GS et al. 2002) , 5 = FiveS | |
40 clonal <- as.numeric(arg[5]) # 0 = Independent sequences, 1 = Clonally related, 2 = Clonally related & only non-terminal mutations | |
41 fixIndels <- as.numeric(arg[6]) # 0 = Do nothing, 1 = Try and fix Indels | |
42 region <- as.numeric(strsplit(arg[7],":")[[1]]) # StartPos:LastNucleotideF1:C1:F2:C2:F3:C3 | |
43 inputFilePath <- arg[8] # Full path to input file | |
44 outputPath <- arg[9] # Full path to location of output files | |
45 outputID <- arg[10] # ID for session output | |
46 | |
47 | |
48 if(testID==5){ | |
49 traitChangeModel <- 1 | |
50 if( !is.na(any(arg[11])) ) traitChangeModel <- as.numeric(arg[11]) # 1 <- Chothia 1998 | |
51 initializeTraitChange(traitChangeModel) | |
52 } | |
53 | |
54 # Initialize other parameters/variables | |
55 | |
56 # Initialzie the codon table ( definitions of R/S ) | |
57 computeCodonTable(testID) | |
58 | |
59 # Initialize | |
60 # Test Name | |
61 testName<-"Focused" | |
62 if(testID==2) testName<-"Local" | |
63 if(testID==3) testName<-"Imbalanced" | |
64 if(testID==4) testName<-"ImbalancedSilent" | |
65 | |
66 # Indel placeholders initialization | |
67 indelPos <- NULL | |
68 delPos <- NULL | |
69 insPos <- NULL | |
70 | |
71 # Initialize in Tranistion & Mutability matrixes | |
72 substitution <- initializeSubstitutionMatrix(substitutionModel,species) | |
73 mutability <- initializeMutabilityMatrix(mutabilityModel,species) | |
74 | |
75 # FWR/CDR boundaries | |
76 flagTrim <- F | |
77 if( is.na(region[7])){ | |
78 flagTrim <- T | |
79 region[7]<-region[6] | |
80 } | |
81 readStart = min(region,na.rm=T) | |
82 readEnd = max(region,na.rm=T) | |
83 if(readStart>1){ | |
84 region = region - (readStart - 1) | |
85 } | |
86 region_Nuc = c( (region[1]*3-2) , (region[2:7]*3) ) | |
87 region_Cod = region | |
88 | |
89 readStart = (readStart*3)-2 | |
90 readEnd = (readEnd*3) | |
91 | |
92 FWR_Nuc <- c( rep(TRUE,(region_Nuc[2])), | |
93 rep(FALSE,(region_Nuc[3]-region_Nuc[2])), | |
94 rep(TRUE,(region_Nuc[4]-region_Nuc[3])), | |
95 rep(FALSE,(region_Nuc[5]-region_Nuc[4])), | |
96 rep(TRUE,(region_Nuc[6]-region_Nuc[5])), | |
97 rep(FALSE,(region_Nuc[7]-region_Nuc[6])) | |
98 ) | |
99 CDR_Nuc <- (1-FWR_Nuc) | |
100 CDR_Nuc <- as.logical(CDR_Nuc) | |
101 FWR_Nuc_Mat <- matrix( rep(FWR_Nuc,4), ncol=length(FWR_Nuc), nrow=4, byrow=T) | |
102 CDR_Nuc_Mat <- matrix( rep(CDR_Nuc,4), ncol=length(CDR_Nuc), nrow=4, byrow=T) | |
103 | |
104 FWR_Codon <- c( rep(TRUE,(region[2])), | |
105 rep(FALSE,(region[3]-region[2])), | |
106 rep(TRUE,(region[4]-region[3])), | |
107 rep(FALSE,(region[5]-region[4])), | |
108 rep(TRUE,(region[6]-region[5])), | |
109 rep(FALSE,(region[7]-region[6])) | |
110 ) | |
111 CDR_Codon <- (1-FWR_Codon) | |
112 CDR_Codon <- as.logical(CDR_Codon) | |
113 | |
114 | |
115 # Read input FASTA file | |
116 tryCatch( | |
117 inputFASTA <- baseline.read.fasta(inputFilePath, seqtype="DNA",as.string=T,set.attributes=F,forceDNAtolower=F) | |
118 , error = function(ex){ | |
119 cat("Error|Error reading input. Please enter or upload a valid FASTA file.\n") | |
120 q() | |
121 } | |
122 ) | |
123 | |
124 if (length(inputFASTA)==1) { | |
125 cat("Error|Error reading input. Please enter or upload a valid FASTA file.\n") | |
126 q() | |
127 } | |
128 | |
129 # Process sequence IDs/names | |
130 names(inputFASTA) <- sapply(names(inputFASTA),function(x){trim(x)}) | |
131 | |
132 # Convert non nucleotide characters to N | |
133 inputFASTA[length(inputFASTA)] = gsub("\t","",inputFASTA[length(inputFASTA)]) | |
134 inputFASTA <- lapply(inputFASTA,replaceNonFASTAChars) | |
135 | |
136 # Process the FASTA file and conver to Matrix[inputSequence, germlineSequence] | |
137 processedInput <- processInputAdvanced(inputFASTA) | |
138 matInput <- processedInput[[1]] | |
139 germlines <- processedInput[[2]] | |
140 lenGermlines = length(unique(germlines)) | |
141 groups <- processedInput[[3]] | |
142 lenGroups = length(unique(groups)) | |
143 rm(processedInput) | |
144 rm(inputFASTA) | |
145 | |
146 # # remove clones with less than 2 seqeunces | |
147 # tableGL <- table(germlines) | |
148 # singletons <- which(tableGL<8) | |
149 # rowsToRemove <- match(singletons,germlines) | |
150 # if(any(rowsToRemove)){ | |
151 # matInput <- matInput[-rowsToRemove,] | |
152 # germlines <- germlines[-rowsToRemove] | |
153 # groups <- groups[-rowsToRemove] | |
154 # } | |
155 # | |
156 # # remove unproductive seqs | |
157 # nonFuctionalSeqs <- sapply(rownames(matInput),function(x){any(grep("unproductive",x))}) | |
158 # if(any(nonFuctionalSeqs)){ | |
159 # if(sum(nonFuctionalSeqs)==length(germlines)){ | |
160 # write.table("Unproductive",file=paste(outputPath,outputID,".txt",sep=""),quote=F,sep="\t",row.names=F,col.names=T) | |
161 # q() | |
162 # } | |
163 # matInput <- matInput[-which(nonFuctionalSeqs),] | |
164 # germlines <- germlines[-which(nonFuctionalSeqs)] | |
165 # germlines[1:length(germlines)] <- 1:length(germlines) | |
166 # groups <- groups[-which(nonFuctionalSeqs)] | |
167 # } | |
168 # | |
169 # if(class(matInput)=="character"){ | |
170 # write.table("All unproductive seqs",file=paste(outputPath,outputID,".txt",sep=""),quote=F,sep="\t",row.names=F,col.names=T) | |
171 # q() | |
172 # } | |
173 # | |
174 # if(nrow(matInput)<10 | is.null(nrow(matInput))){ | |
175 # write.table(paste(nrow(matInput), "seqs only",sep=""),file=paste(outputPath,outputID,".txt",sep=""),quote=F,sep="\t",row.names=F,col.names=T) | |
176 # q() | |
177 # } | |
178 | |
179 # replace leading & trailing "-" with "N: | |
180 matInput <- t(apply(matInput,1,replaceLeadingTrailingDashes,readEnd)) | |
181 | |
182 # Trim (nucleotide) input sequences to the last codon | |
183 #matInput[,1] <- apply(matrix(matInput[,1]),1,trimToLastCodon) | |
184 | |
185 # # Check for Indels | |
186 # if(fixIndels){ | |
187 # delPos <- fixDeletions(matInput) | |
188 # insPos <- fixInsertions(matInput) | |
189 # }else{ | |
190 # # Check for indels | |
191 # indelPos <- checkForInDels(matInput) | |
192 # indelPos <- apply(cbind(indelPos[[1]],indelPos[[2]]),1,function(x){(x[1]==T & x[2]==T)}) | |
193 # } | |
194 | |
195 # If indels are present, remove mutations in the seqeunce & throw warning at end | |
196 #matInput[indelPos,] <- apply(matrix(matInput[indelPos,],nrow=sum(indelPos),ncol=2),1,function(x){x[1]=x[2]; return(x) }) | |
197 | |
198 colnames(matInput)=c("Input","Germline") | |
199 | |
200 # If seqeunces are clonal, create effective sequence for each clone & modify germline/group definitions | |
201 germlinesOriginal = NULL | |
202 if(clonal){ | |
203 germlinesOriginal <- germlines | |
204 collapseCloneResults <- tapply(1:nrow(matInput),germlines,function(i){ | |
205 collapseClone(matInput[i,1],matInput[i[1],2],readEnd,nonTerminalOnly=(clonal-1)) | |
206 }) | |
207 matInput = t(sapply(collapseCloneResults,function(x){return(x[[1]])})) | |
208 names_groups = tapply(groups,germlines,function(x){names(x[1])}) | |
209 groups = tapply(groups,germlines,function(x){array(x[1],dimnames=names(x[1]))}) | |
210 names(groups) = names_groups | |
211 | |
212 names_germlines = tapply(germlines,germlines,function(x){names(x[1])}) | |
213 germlines = tapply( germlines,germlines,function(x){array(x[1],dimnames=names(x[1]))} ) | |
214 names(germlines) = names_germlines | |
215 matInputErrors = sapply(collapseCloneResults,function(x){return(x[[2]])}) | |
216 } | |
217 | |
218 | |
219 # Selection Analysis | |
220 | |
221 | |
222 # if (length(germlines)>sequenceLimit) { | |
223 # # Code to parallelize processing goes here | |
224 # stop( paste("Error: Cannot process more than ", Upper_limit," sequences",sep="") ) | |
225 # } | |
226 | |
227 # if (length(germlines)<sequenceLimit) {} | |
228 | |
229 # Compute expected mutation frequencies | |
230 matExpected <- getExpectedIndividual(matInput) | |
231 | |
232 # Count observed number of mutations in the different regions | |
233 mutations <- lapply( 1:nrow(matInput), function(i){ | |
234 #cat(i,"\n") | |
235 seqI = s2c(matInput[i,1]) | |
236 seqG = s2c(matInput[i,2]) | |
237 matIGL = matrix(c(seqI,seqG),ncol=length(seqI),nrow=2,byrow=T) | |
238 retVal <- NA | |
239 tryCatch( | |
240 retVal <- analyzeMutations2NucUri(matIGL) | |
241 , error = function(ex){ | |
242 retVal <- NA | |
243 } | |
244 ) | |
245 | |
246 | |
247 return( retVal ) | |
248 }) | |
249 | |
250 matObserved <- t(sapply( mutations, processNucMutations2 )) | |
251 numberOfSeqsWithMutations <- numberOfSeqsWithMutations(matObserved, testID) | |
252 | |
253 #if(sum(numberOfSeqsWithMutations)==0){ | |
254 # write.table("No mutated sequences",file=paste(outputPath,outputID,".txt",sep=""),quote=F,sep="\t",row.names=F,col.names=T) | |
255 # q() | |
256 #} | |
257 | |
258 matMutationInfo <- cbind(matObserved,matExpected) | |
259 rm(matObserved,matExpected) | |
260 | |
261 | |
262 #Bayesian PDFs | |
263 bayes_pdf = computeBayesianScore(matMutationInfo, test=testName, max_sigma=20,length_sigma=4001) | |
264 bayesPDF_cdr = bayes_pdf[[1]] | |
265 bayesPDF_fwr = bayes_pdf[[2]] | |
266 rm(bayes_pdf) | |
267 | |
268 bayesPDF_germlines_cdr = tapply(bayesPDF_cdr,germlines,function(x) groupPosteriors(x,length_sigma=4001)) | |
269 bayesPDF_germlines_fwr = tapply(bayesPDF_fwr,germlines,function(x) groupPosteriors(x,length_sigma=4001)) | |
270 | |
271 bayesPDF_groups_cdr = tapply(bayesPDF_cdr,groups,function(x) groupPosteriors(x,length_sigma=4001)) | |
272 bayesPDF_groups_fwr = tapply(bayesPDF_fwr,groups,function(x) groupPosteriors(x,length_sigma=4001)) | |
273 | |
274 if(lenGroups>1){ | |
275 groups <- c(groups,lenGroups+1) | |
276 names(groups)[length(groups)] = "All sequences combined" | |
277 bayesPDF_groups_cdr[[lenGroups+1]] = groupPosteriors(bayesPDF_groups_cdr,length_sigma=4001) | |
278 bayesPDF_groups_fwr[[lenGroups+1]] = groupPosteriors(bayesPDF_groups_fwr,length_sigma=4001) | |
279 } | |
280 | |
281 #Bayesian Outputs | |
282 bayes_cdr = t(sapply(bayesPDF_cdr,calcBayesOutputInfo)) | |
283 bayes_fwr = t(sapply(bayesPDF_fwr,calcBayesOutputInfo)) | |
284 bayes_germlines_cdr = t(sapply(bayesPDF_germlines_cdr,calcBayesOutputInfo)) | |
285 bayes_germlines_fwr = t(sapply(bayesPDF_germlines_fwr,calcBayesOutputInfo)) | |
286 bayes_groups_cdr = t(sapply(bayesPDF_groups_cdr,calcBayesOutputInfo)) | |
287 bayes_groups_fwr = t(sapply(bayesPDF_groups_fwr,calcBayesOutputInfo)) | |
288 | |
289 #P-values | |
290 simgaP_cdr = sapply(bayesPDF_cdr,computeSigmaP) | |
291 simgaP_fwr = sapply(bayesPDF_fwr,computeSigmaP) | |
292 | |
293 simgaP_germlines_cdr = sapply(bayesPDF_germlines_cdr,computeSigmaP) | |
294 simgaP_germlines_fwr = sapply(bayesPDF_germlines_fwr,computeSigmaP) | |
295 | |
296 simgaP_groups_cdr = sapply(bayesPDF_groups_cdr,computeSigmaP) | |
297 simgaP_groups_fwr = sapply(bayesPDF_groups_fwr,computeSigmaP) | |
298 | |
299 | |
300 #Format output | |
301 | |
302 # Round expected mutation frequencies to 3 decimal places | |
303 matMutationInfo[germlinesOriginal[indelPos],] = NA | |
304 if(nrow(matMutationInfo)==1){ | |
305 matMutationInfo[5:8] = round(matMutationInfo[,5:8]/sum(matMutationInfo[,5:8],na.rm=T),3) | |
306 }else{ | |
307 matMutationInfo[,5:8] = t(round(apply(matMutationInfo[,5:8],1,function(x){ return(x/sum(x,na.rm=T)) }),3)) | |
308 } | |
309 | |
310 listPDFs = list() | |
311 nRows = length(unique(groups)) + length(unique(germlines)) + length(groups) | |
312 | |
313 matOutput = matrix(NA,ncol=18,nrow=nRows) | |
314 rowNumb = 1 | |
315 for(G in unique(groups)){ | |
316 #print(G) | |
317 matOutput[rowNumb,c(1,2,11:18)] = c("Group",names(groups)[groups==G][1],bayes_groups_cdr[G,],bayes_groups_fwr[G,],simgaP_groups_cdr[G],simgaP_groups_fwr[G]) | |
318 listPDFs[[rowNumb]] = list("CDR"=bayesPDF_groups_cdr[[G]],"FWR"=bayesPDF_groups_fwr[[G]]) | |
319 names(listPDFs)[rowNumb] = names(groups[groups==paste(G)])[1] | |
320 #if(names(groups)[which(groups==G)[1]]!="All sequences combined"){ | |
321 gs = unique(germlines[groups==G]) | |
322 rowNumb = rowNumb+1 | |
323 if( !is.na(gs) ){ | |
324 for( g in gs ){ | |
325 matOutput[rowNumb,c(1,2,11:18)] = c("Germline",names(germlines)[germlines==g][1],bayes_germlines_cdr[g,],bayes_germlines_fwr[g,],simgaP_germlines_cdr[g],simgaP_germlines_fwr[g]) | |
326 listPDFs[[rowNumb]] = list("CDR"=bayesPDF_germlines_cdr[[g]],"FWR"=bayesPDF_germlines_fwr[[g]]) | |
327 names(listPDFs)[rowNumb] = names(germlines[germlines==paste(g)])[1] | |
328 rowNumb = rowNumb+1 | |
329 indexesOfInterest = which(germlines==g) | |
330 numbSeqsOfInterest = length(indexesOfInterest) | |
331 rowNumb = seq(rowNumb,rowNumb+(numbSeqsOfInterest-1)) | |
332 matOutput[rowNumb,] = matrix( c( rep("Sequence",numbSeqsOfInterest), | |
333 rownames(matInput)[indexesOfInterest], | |
334 c(matMutationInfo[indexesOfInterest,1:4]), | |
335 c(matMutationInfo[indexesOfInterest,5:8]), | |
336 c(bayes_cdr[indexesOfInterest,]), | |
337 c(bayes_fwr[indexesOfInterest,]), | |
338 c(simgaP_cdr[indexesOfInterest]), | |
339 c(simgaP_fwr[indexesOfInterest]) | |
340 ), ncol=18, nrow=numbSeqsOfInterest,byrow=F) | |
341 increment=0 | |
342 for( ioi in indexesOfInterest){ | |
343 listPDFs[[min(rowNumb)+increment]] = list("CDR"=bayesPDF_cdr[[ioi]] , "FWR"=bayesPDF_fwr[[ioi]]) | |
344 names(listPDFs)[min(rowNumb)+increment] = rownames(matInput)[ioi] | |
345 increment = increment + 1 | |
346 } | |
347 rowNumb=max(rowNumb)+1 | |
348 | |
349 } | |
350 } | |
351 } | |
352 colsToFormat = 11:18 | |
353 matOutput[,colsToFormat] = formatC( matrix(as.numeric(matOutput[,colsToFormat]), nrow=nrow(matOutput), ncol=length(colsToFormat)) , digits=3) | |
354 matOutput[matOutput== " NaN"] = NA | |
355 | |
356 | |
357 | |
358 colnames(matOutput) = c("Type", "ID", "Observed_CDR_R", "Observed_CDR_S", "Observed_FWR_R", "Observed_FWR_S", | |
359 "Expected_CDR_R", "Expected_CDR_S", "Expected_FWR_R", "Expected_FWR_S", | |
360 paste( rep(testName,6), rep(c("Sigma","CIlower","CIupper"),2),rep(c("CDR","FWR"),each=3), sep="_"), | |
361 paste( rep(testName,2), rep("P",2),c("CDR","FWR"), sep="_") | |
362 ) | |
363 fileName = paste(outputPath,outputID,".txt",sep="") | |
364 write.table(matOutput,file=fileName,quote=F,sep="\t",row.names=T,col.names=NA) | |
365 fileName = paste(outputPath,outputID,".RData",sep="") | |
366 save(listPDFs,file=fileName) | |
367 | |
368 indelWarning = FALSE | |
369 if(sum(indelPos)>0){ | |
370 indelWarning = "<P>Warning: The following sequences have either gaps and/or deletions, and have been ommited from the analysis."; | |
371 indelWarning = paste( indelWarning , "<UL>", sep="" ) | |
372 for(indels in names(indelPos)[indelPos]){ | |
373 indelWarning = paste( indelWarning , "<LI>", indels, "</LI>", sep="" ) | |
374 } | |
375 indelWarning = paste( indelWarning , "</UL></P>", sep="" ) | |
376 } | |
377 | |
378 cloneWarning = FALSE | |
379 if(clonal==1){ | |
380 if(sum(matInputErrors)>0){ | |
381 cloneWarning = "<P>Warning: The following clones have sequences of unequal length."; | |
382 cloneWarning = paste( cloneWarning , "<UL>", sep="" ) | |
383 for(clone in names(matInputErrors)[matInputErrors]){ | |
384 cloneWarning = paste( cloneWarning , "<LI>", names(germlines)[as.numeric(clone)], "</LI>", sep="" ) | |
385 } | |
386 cloneWarning = paste( cloneWarning , "</UL></P>", sep="" ) | |
387 } | |
388 } | |
389 cat(paste("Success",outputID,indelWarning,cloneWarning,sep="|")) |