|
2
|
1 require(maftools);
|
|
|
2 library(argparse);
|
|
|
3 require(data.table);
|
|
|
4
|
|
|
5 ###
|
|
|
6
|
|
|
# ---- Command-line interface ----
# Declares the options this script accepts and parses them into `args`.
# The description previously advertised a "Gene Lollipop"; every output option
# below is for OncodriveCLUST results, so the description now says so.
parser <- ArgumentParser(
  description = "Detect positionally clustered genes (OncodriveCLUST) using Maftools"
)

# Input variants (mandatory).
parser$add_argument(
  "--input_maf", "-maf",
  required = TRUE,
  help = "Input Variants in MAF format"
)

# Optional list of genes, one per line.
parser$add_argument(
  "--gene_blacklist", "-gl",
  help = "Input gene list with separated by newline"
)

# Minimum number of mutations per gene (converted to integer below).
parser$add_argument(
  "--min_mut", "-mm",
  default = 5,
  help = "Minimum number of mutations seen in the gene for it to be included in the calculation"
)

# FDR cut-off.
parser$add_argument(
  "--fdr", "-f",
  default = 0.1,
  help = "FDR threshold to use in plots and returned gene list"
)

# Optional name of the amino-acid change column.
parser$add_argument(
  "--aacol", "-ac",
  help = "Optionally provide the name of the column that contains the amino acid annotation in your MAF file"
)

# Output locations (both mandatory).
parser$add_argument(
  "--output_detail", "-o",
  required = TRUE,
  help = "Output text file for oncodriveclust detail"
)

parser$add_argument(
  "--output_plot", "-p",
  required = TRUE,
  help = "Output pdf file for oncodriveclust detail"
)

args <- parser$parse_args()

###

# Amino-acid change column: use --aacol when given, else 'HGVSp_Short'.
aacol <- if (is.null(args$aacol)) "HGVSp_Short" else args$aacol

# --min_mut as an integer.
min_mut <- as.integer(args$min_mut)
|
|
|
57
|
|
|
58
|
|
|
# ---- Binomial background threshold ----
# Returns the smallest mutation count k (k >= 2) whose binomial probability
# dbinom(k, size = gene_muts, prob = 1 / gene_length) drops below 0.01.
#
# Args:
#   gene_muts:   number of mutations observed in the gene.
#   gene_length: length used as the denominator of the per-position
#                probability (callers in this file pass the protein length).
#
# Returns:
#   The threshold count, or NA when no candidate count has probability < 0.01.
get_threshold <- function(gene_muts, gene_length) {
  # Candidate counts start at 2; dbinom() is vectorised over `x`.
  candidate_probs <- dbinom(
    x = 2:gene_muts,
    size = gene_muts,
    prob = 1 / gene_length
  )
  first_rare <- which(candidate_probs < 0.01)[1]
  # Index 1 corresponds to a count of 2, hence the +1 shift.
  first_rare + 1
}
# ---- end of get_threshold ----
|
|
|
65
|
|
|
# Parse amino-acid positions from a variant table and score positional
# clusters for every gene that passes the mutation-count filter.
#
# Args:
#   dat:    data.table of variants. Must contain Hugo_Symbol,
#           Variant_Classification and the protein-change column; it is also
#           handed to summarizeMaf_fix() to obtain per-gene totals.
#   AACol:  name of the column holding protein changes, or NULL to use an
#           existing 'AAChange' column.
#   gl:     table merged on Hugo_Symbol that supplies an `aa.length` column.
#   m:      minimum `total` mutations a gene needs to be analysed.
#   calBg:  TRUE when called to build the background distribution; in that
#           mode NULL is returned if fewer than `nBg` genes remain.
#   nBg:    minimum number of genes required when calBg is TRUE.
#
# Returns:
#   The row-bound results of cluster_prot_fix() (one row per gene with at
#   least one meaningful position), or NULL when nothing could be scored.
parse_prot_fix <- function(dat, AACol, gl, m, calBg = FALSE, nBg){

  if(is.null(AACol)){
    if(! 'AAChange' %in% colnames(dat)){
      message('Available fields:')
      print(colnames(dat))
      stop('AAChange field not found in MAF. Use argument AACol to manually specifiy field name containing protein changes.')
    }
  }else if(AACol %in% colnames(dat)){
    colnames(dat)[which(colnames(dat) == AACol)] <- 'AAChange'
  }else if(! 'AAChange' %in% colnames(dat)){
    # Previously a wrong AACol was ignored and the failure surfaced later as
    # an obscure "object 'AAChange' not found" error.
    message('Available fields:')
    print(colnames(dat))
    stop(paste0("Field '", AACol, "' not found in MAF and no AAChange field is available."))
  }

  all.prot.dat <- dat[,.(Hugo_Symbol, Variant_Classification, AAChange)]
  all.prot.dat <- all.prot.dat[Variant_Classification != 'Splice_Site']

  #parse AAchanges to get position: keep the text after the last '.'
  prot.spl <- strsplit(x = as.character(all.prot.dat$AAChange), split = '.', fixed = TRUE)
  # Empty annotations split into character(0); map them to NA so the result is
  # always a character vector (sapply could silently return a list here).
  prot.conv <- vapply(prot.spl, function(x){
    if(length(x) > 0) x[length(x)] else NA_character_
  }, character(1))

  all.prot.dat[,conv := prot.conv]
  all.prot.dat <- all.prot.dat[!conv == 'NULL']

  #If conversions are in HGVSp_long (default HGVSp) format, we will remove strings Ter followed by anything (e.g; p.Asn1986GlnfsTer13)
  pos <- gsub(pattern = 'Ter.*', replacement = '',x = all.prot.dat$conv)

  #Following parsing takes care of most of HGVSp_short and HGVSp_long format
  pos <- gsub(pattern = '[[:alpha:]]', replacement = '', x = pos)
  pos <- gsub(pattern = '\\*$', replacement = '', x = pos) #Remove * if nonsense mutation ends with *
  pos <- gsub(pattern = '^\\*', replacement = '', x = pos) #Remove * if nonsense mutation starts with *
  pos <- gsub(pattern = '\\*.*', replacement = '', x = pos) #Remove * followed by position e.g, p.C229Lfs*18

  # For ranges such as 100_105 only the first coordinate is used.
  pos <- suppressWarnings( as.numeric(sapply(strsplit(x = pos, split = '_', fixed = TRUE), '[', 1)) )
  all.prot.dat[,pos := pos]

  # Drop variants for which no numeric position could be parsed.
  all.prot.dat <- all.prot.dat[!is.na(all.prot.dat$pos),]

  gene.sum <- summarizeMaf_fix(maf = dat)$gene.summary
  gene.sum <- merge(x = gene.sum, y = gl, by = 'Hugo_Symbol', all.x = TRUE)
  gene.sum <- gene.sum[!is.na(gene.sum$aa.length)]

  #Get background threshold per gene. Columns are read directly instead of
  #apply()-ing over the table, which coerced every row to character first.
  gene.sum$th <- vapply(seq_len(nrow(gene.sum)), function(i){
    get_threshold(gene_muts = as.numeric(gene.sum$total[i]),
                  gene_length = as.numeric(gene.sum$aa.length[i]))
  }, numeric(1))

  #use only genes with atleast 2 (or m ) mutations.
  gene.sum <- gene.sum[total >= m]

  if(calBg && nrow(gene.sum) < nBg){
    #Not enough genes to build a background; the caller falls back to
    #predefined values when it receives NULL.
    return(NULL)
  }

  if(nrow(gene.sum) == 0){
    # Nothing to score. The old 1:nrow() loop and a progress bar with max = 0
    # raised an error in this situation.
    return(NULL)
  }

  # Both modes now share one loop and pull scalars out of gene.sum. The old
  # background branch used gene.sum[i, "col"], which on a data.table yields a
  # one-cell table rather than a value.
  gene.res <- vector('list', nrow(gene.sum))
  pb <- txtProgressBar(min = 0, max = nrow(gene.sum), style = 3) #progress bar

  for(i in seq_len(nrow(gene.sum))){
    hs <- gene.sum$Hugo_Symbol[i]
    prot.dat <- all.prot.dat[Hugo_Symbol %in% hs]
    res <- cluster_prot_fix(prot.dat = prot.dat, gene = hs,
                            th = gene.sum$th[i], protLen = gene.sum$aa.length[i])
    if(!is.null(res)){
      gene.res[[i]] <- res
    }
    setTxtProgressBar(pb, i)
  }
  close(pb)

  # Genes without meaningful positions returned NULL; drop them before binding.
  gene.res <- Filter(Negate(is.null), gene.res)
  if(length(gene.res) == 0){
    return(NULL)
  }
  do.call(rbind, gene.res)
}
|
|
|
148
|
|
|
# Group recurrently mutated positions of one gene into clusters and compute a
# cluster score for the gene.
#
# Args:
#   prot.dat:  data.table of the gene's variants with a numeric `pos` column.
#   gene:      gene symbol, copied into the result.
#   th:        minimum count at a position for it to be 'meaningful'.
#   protLen:   protein length, copied into the result.
#   mergeDist: inter-event distance. Meaningful positions closer than this are
#              joined, and non-meaningful positions within this distance of a
#              cluster edge are absorbed. Defaults to 5, the value that was
#              previously hard-coded.
#
# Returns:
#   A one-row data.frame (Hugo_Symbol, clusters, muts_in_clusters,
#   clusterScores, protLen), or NULL when no position reaches `th`.
cluster_prot_fix <- function(prot.dat, gene, th, protLen, mergeDist = 5){

  #Summarise counts per position, ordered along the protein
  pos.counts <- prot.dat[,.N,pos]
  pos.counts <- pos.counts[order(pos)]

  #classify position as meaningful if its count reaches the background threshold.
  #A missing threshold leaves every position unclassified (neither group).
  meaningful <- pos.counts$N >= th
  clust.idx <- which(meaningful)
  nonclust.idx <- which(!meaningful)

  if(length(clust.idx) == 0){
    #No meaningful positions found for this gene
    return(NULL)
  }

  clust.pos <- pos.counts$pos[clust.idx]
  clust.n <- pos.counts$N[clust.idx]
  nonclust.pos <- pos.counts$pos[nonclust.idx]
  nonclust.n <- pos.counts$N[nonclust.idx]

  #Start a new cluster whenever the gap to the previous meaningful position is
  #at least mergeDist; otherwise extend the current one.
  cluster.id <- cumsum(c(1, diff(clust.pos) >= mergeDist))
  starts <- as.numeric(tapply(clust.pos, cluster.id, min))
  ends <- as.numeric(tapply(clust.pos, cluster.id, max))
  counts <- as.numeric(tapply(clust.n, cluster.id, sum))

  #merge adjacent non-meaningful variants into clusters. Distances are taken
  #from the cluster's edges before any extension, and a position may be
  #absorbed by more than one cluster (unchanged from the original logic).
  for(i in seq_along(starts)){
    startDist <- nonclust.pos - starts[i]
    endDist <- nonclust.pos - ends[i]
    near.start <- startDist >= -mergeDist & startDist <= 0
    near.end <- endDist <= mergeDist & endDist >= 0

    if(any(near.start)){
      counts[i] <- counts[i] + sum(nonclust.n[near.start])
      starts[i] <- min(nonclust.pos[near.start])
    }

    if(any(near.end)){
      counts[i] <- counts[i] + sum(nonclust.n[near.end])
      ends[i] <- max(nonclust.pos[near.end])
    }
  }

  #Calculate cluster scores.
  total.muts <- nrow(prot.dat) #total variants for this gene.

  clusterScores <- vapply(seq_along(starts), function(i){
    lo <- starts[i]
    hi <- ends[i]
    in.cluster <- prot.dat[pos >= lo & pos <= hi, .N, pos]
    fraction <- in.cluster$N / total.muts

    #Position(s) carrying the most mutations within the cluster.
    peak <- in.cluster$pos[in.cluster$N == max(in.cluster$N)]

    #NOTE(review): when several positions tie for the peak, `peak` has more
    #than one element and is recycled against the position vector (warnings
    #suppressed), exactly as before. Confirm whether that is intended.
    distance <- suppressWarnings(abs(as.numeric(in.cluster$pos) - peak))

    #Each position contributes its mutation fraction, down-weighted by
    #sqrt(2)^distance from the peak.
    sum(fraction / (sqrt(2) ^ distance))
  }, numeric(1))

  data.frame(Hugo_Symbol = gene, clusters = length(starts), muts_in_clusters = sum(counts),
             clusterScores = sum(clusterScores), protLen = protLen)
}
|
|
|
249
|
|
|
250
|
|
|
251
|
|
|
252
|
|
|
253
|
|
|
# Build gene x sample matrices from a MAF table.
#
# Args:
#   maf: data.table with Hugo_Symbol, Variant_Classification and
#        Tumor_Sample_Barcode columns.
#
# Returns:
#   A list with
#     oncomat: character matrix (genes in rows, samples in columns) holding
#              the variant classification per cell ('' when not mutated),
#     nummat:  the same matrix coded as numbers,
#     vc:      named character vector mapping codes (names) to classifications.
createOncoMatrix <- function(maf){

  message('Creating oncomatrix (this might take a while)..')

  # One cell per gene/sample pair. A single classification is kept as is;
  # several are collapsed by vcr().
  # NOTE(review): vcr() is not defined in the visible part of this file;
  # presumably it is maftools' internal helper. Verify it is in scope.
  oncomat <- data.table::dcast(data = maf[,.(Hugo_Symbol, Variant_Classification, Tumor_Sample_Barcode)], formula = Hugo_Symbol ~ Tumor_Sample_Barcode,
                               fun.aggregate = function(x) {ifelse(test = length(as.character(x))>1 ,
                                                                  no = as.character(x), yes = vcr(x, gis = FALSE))
                               }, value.var = 'Variant_Classification', fill = '')

  # Sample IDs are tracked in one variable for both special cases. The code
  # used `sampleId` in one branch and `sampleID` in the other, so a MAF with
  # one gene AND one sample hit undefined variables further down.
  sampleId <- NULL

  #If maf contains only one sample converting to matrix is not trivial.
  if(ncol(oncomat) == 2){
    genes <- oncomat[,Hugo_Symbol]
    sampleId <- colnames(oncomat)[2]
    oncomat <- as.matrix(data.frame(row.names = genes, sample = oncomat[,2, with =FALSE]))
  }else if(nrow(oncomat) == 1){
    #If MAF has only one gene
    gene <- oncomat[,Hugo_Symbol]
    oncomat[,Hugo_Symbol:= NULL]
    oncomat <- as.matrix(oncomat)
    rownames(oncomat) <- gene
    sampleId <- colnames(oncomat)
  }else{
    oncomat <- as.matrix(oncomat)
    rownames(oncomat) <- oncomat[,1]
    oncomat <- oncomat[,-1]
  }

  # Code table: '' -> 0, each observed classification -> 1..k, Multi_Hit -> k+1
  variant.classes <- as.character(unique(maf[,Variant_Classification]))
  variant.classes <- c('',variant.classes, 'Multi_Hit')
  names(variant.classes) <- 0:(length(variant.classes)-1)

  #Complex variant classes (cell values not in the code table) will be
  #assigned a single integer, one past the last regular code. Cell values are
  #compared with the class values here; comparing with the code names, as
  #before, never matched and produced redundant entries.
  vc.onc <- unique(unlist(apply(oncomat, 2, unique)))
  vc.onc <- vc.onc[!vc.onc %in% variant.classes]
  names(vc.onc) <- rep(as.character(as.numeric(names(variant.classes)[length(variant.classes)])+1), length(vc.onc))
  variant.classes2 <- c(variant.classes, vc.onc)

  oncomat.copy <- oncomat
  #Make a numeric coded matrix
  for(i in seq_along(variant.classes2)){
    oncomat[oncomat == variant.classes2[i]] <- names(variant.classes2)[i]
  }

  #If maf has only one gene
  if(nrow(oncomat) == 1){
    mdf <- t(matrix(as.numeric(oncomat)))
    rownames(mdf) <- rownames(oncomat.copy)
    colnames(mdf) <- sampleId
    colnames(oncomat.copy) <- sampleId
    return(list(oncomat = oncomat.copy, nummat = mdf, vc = variant.classes))
  }

  #convert from character to numeric
  mdf <- as.matrix(apply(oncomat, 2, function(x) as.numeric(as.character(x))))
  rownames(mdf) <- rownames(oncomat.copy)

  message('Sorting..')

  #If MAF file contains a single sample, simple sorting is enough.
  if(ncol(mdf) == 1){
    mdf <- as.matrix(mdf[order(mdf, decreasing = TRUE),])
    colnames(mdf) <- sampleId

    oncomat.copy <- as.matrix(oncomat.copy[rownames(mdf),])
    colnames(oncomat.copy) <- sampleId

    return(list(oncomat = oncomat.copy, nummat = mdf, vc = variant.classes))
  } else{
    #Sort by rows as well columns if >1 samples present in MAF
    #Add total variants per gene
    mdf <- cbind(mdf, variants = apply(mdf, 1, function(x) {
      length(x[x != "0"])
    }))
    #Sort by total variants
    mdf <- mdf[order(mdf[, ncol(mdf)], decreasing = TRUE), ]
    # Column names are left untouched. Stripping a leading "X" broke the
    # lookup into oncomat.copy below for samples whose IDs start with "X".

    mdf <- mdf[, -ncol(mdf)]

    mdf.temp.copy <- mdf #temp copy of original unsorted numeric coded matrix

    mdf[mdf != 0] <- 1 #replacing all non-zero integers with 1 improves sorting (& grouping)
    tmdf <- t(mdf) #transposematrix
    mdf <- t(tmdf[do.call(order, c(as.list(as.data.frame(tmdf)), decreasing = TRUE)), ]) #sort

    mdf.temp.copy <- mdf.temp.copy[rownames(mdf),] #organise original matrix into sorted matrix
    mdf.temp.copy <- mdf.temp.copy[,colnames(mdf)]
    mdf <- mdf.temp.copy

    #organise original character matrix into sorted matrix
    oncomat.copy <- oncomat.copy[,colnames(mdf)]
    oncomat.copy <- oncomat.copy[rownames(mdf),]

    return(list(oncomat = oncomat.copy, nummat = mdf, vc = variant.classes))
  }
}
|
|
|
350
|
|
|
# Check and normalise a MAF table.
#
# Args:
#   maf:    data.table read from a MAF file.
#   rdup:   when TRUE, drop variants that repeat the same
#           Chromosome:Start_Position:Tumor_Sample_Barcode combination.
#   isTCGA: when TRUE, truncate Tumor_Sample_Barcode to its first 12
#           characters. Defaults to FALSE (the previous default referred to
#           itself and failed whenever the argument was omitted).
#
# Returns:
#   The validated data.table. Stops when a required field is missing.
validateMaf <- function(maf, rdup = TRUE, isTCGA = FALSE){

  #necessary fields.
  required.fields <- c('Hugo_Symbol', 'Chromosome', 'Start_Position', 'End_Position', 'Reference_Allele', 'Tumor_Seq_Allele2',
                       'Variant_Classification', 'Variant_Type', 'Tumor_Sample_Barcode')

  #Change column names to standard names. Matching ignores case but must cover
  #the whole name; the previous unanchored grep also renamed any column that
  #merely contained a required field name.
  lower.names <- tolower(colnames(maf))
  for(field in required.fields){
    colId <- which(lower.names == tolower(field))
    if(length(colId) > 0){
      colnames(maf)[colId] <- field
    }
  }

  missing.fileds <- required.fields[!required.fields %in% colnames(maf)] #check if any of them are missing

  if(length(missing.fileds) > 0){
    # Report every missing field, not just the first one.
    missing.fileds <- paste(missing.fileds, collapse = ', ')
    stop(paste('missing required fields from MAF:', missing.fileds)) #stop if any of required.fields are missing
  }

  #convert "-" to "." in "Tumor_Sample_Barcode" to avoid complexity in naming
  maf$Tumor_Sample_Barcode <- gsub(pattern = '-', replacement = '.', x = as.character(maf$Tumor_Sample_Barcode))

  if(rdup){
    maf <- maf[, variantId := paste(Chromosome, Start_Position, Tumor_Sample_Barcode, sep = ':')]
    is.dup <- duplicated(maf$variantId)
    if(any(is.dup)){
      message("NOTE: Removed ", sum(is.dup) ," duplicated variants")
      maf <- maf[which(!is.dup)]
    }
    maf[,variantId := NULL]
  }

  # Variants with an empty gene symbol are relabelled rather than dropped.
  if(nrow(maf[Hugo_Symbol %in% ""]) > 0){
    message('NOTE: Found ', nrow(maf[Hugo_Symbol %in% ""]), ' variants with no Gene Symbols.')
    print(maf[Hugo_Symbol %in% "", required.fields, with = FALSE])
    message("Annotating them as 'UnknownGene' for convenience")
    maf$Hugo_Symbol <- ifelse(test = maf$Hugo_Symbol == "", yes = 'UnknownGene', no = maf$Hugo_Symbol)
  }

  # Same treatment for missing (NA) gene symbols.
  if(nrow(maf[is.na(Hugo_Symbol)]) > 0){
    message('NOTE: Found ', nrow(maf[is.na(Hugo_Symbol)]), ' variants with no Gene Symbols.')
    print(maf[is.na(Hugo_Symbol), required.fields, with =FALSE])
    message("Annotating them as 'UnknownGene' for convenience")
    maf$Hugo_Symbol <- ifelse(test = is.na(maf$Hugo_Symbol), yes = 'UnknownGene', no = maf$Hugo_Symbol)
  }

  if(isTCGA){
    maf$Tumor_Sample_Barcode <- substr(x = maf$Tumor_Sample_Barcode, start = 1, stop = 12)
  }

  return(maf)
}
|
|
|
404
|
|
|
# Read a MAF file, validate it, optionally add copy-number data, and wrap the
# result in a MAF object.
#
# Args:
#   maf:                      path to a tab-separated MAF file; names ending
#                             in 'gz' are read as gzip.
#   removeSilent:             drop variants whose classification is in the
#                             `silent` list below (they are still stored in
#                             the object's maf.silent slot).
#   useAll:                   when FALSE and a Mutation_Status column exists,
#                             keep only rows marked "Somatic".
#   gisticAllLesionsFile,
#   gisticAmpGenesFile,
#   gisticDelGenesFile:       passed to readGistic(); used only when
#                             gisticAllLesionsFile is given.
#   cnTable:                  path to a three-column tab-separated table
#                             (gene, sample, classification) appended as CNV
#                             rows; ignored when GISTIC input is given.
#   removeDuplicatedVariants: forwarded to validateMaf() as `rdup`.
#   isTCGA:                   forwarded to validateMaf() and readGistic().
#
# Returns:
#   An object of class MAF.
read.maf_fix <- function(maf, removeSilent = TRUE, useAll = TRUE, gisticAllLesionsFile = NULL, gisticAmpGenesFile = NULL,
                         gisticDelGenesFile = NULL, cnTable = NULL, removeDuplicatedVariants = TRUE, isTCGA = FALSE){

  message('reading maf..')

  if(as.logical(length(grep(pattern = 'gz$', x = maf, fixed = FALSE)))){
    #On Windows read through a gz connection, elsewhere pipe through zcat.
    if(Sys.info()[['sysname']] == 'Windows'){
      maf.gz <- gzfile(description = maf, open = 'r')
      # Close the connection even when parsing fails.
      maf <- tryCatch(
        suppressWarnings(data.table(read.csv(file = maf.gz, header = TRUE, sep = '\t', stringsAsFactors = FALSE))),
        finally = close(maf.gz)
      )
    } else{
      # Quote the path so spaces or shell metacharacters in it cannot break
      # (or alter) the command; expand '~' first because quoting disables it.
      maf <- suppressWarnings(data.table::fread(input = paste('zcat <', shQuote(path.expand(maf))), sep = '\t', stringsAsFactors = FALSE, verbose = FALSE, data.table = TRUE, showProgress = TRUE, header = TRUE))
    }
  } else{
    suppressWarnings(maf <- data.table::fread(input = maf, sep = "\t", stringsAsFactors = FALSE, verbose = FALSE, data.table = TRUE, showProgress = TRUE, header = TRUE))
  }

  #validate MAF file
  maf <- validateMaf(maf = maf, isTCGA = isTCGA, rdup = removeDuplicatedVariants)

  #validation check for variants classified as Somatic in Mutation_Status field.
  if('Mutation_Status' %in% colnames(maf)){
    if(!useAll){
      message('Using only Somatic variants from Mutation_Status. Switch on useAll to include everything.')
      maf <- maf[Mutation_Status %in% "Somatic"]

      if(nrow(maf) == 0){
        stop('No more Somatic mutations left after filtering for Mutation_Status! Maybe set useAll to TRUE ?')
      }
    }else {
      message('Using all variants.')
    }
  }else{
    message('Mutation_Status not found. Assuming all variants are Somatic and validated.')
  }

  #Variant Classification with Low/Modifier variant consequences. http://asia.ensembl.org/Help/Glossary?id=535
  silent <- c("3'UTR", "5'UTR", "3'Flank", "Targeted_Region", "Silent", "Intron",
              "RNA", "IGR", "Splice_Region", "5'Flank", "lincRNA")
  #Variant Classification with High/Moderate variant consequences. http://asia.ensembl.org/Help/Glossary?id=535
  #(kept for reference; not used below)
  vc.nonSilent <- c("Frame_Shift_Del", "Frame_Shift_Ins", "Splice_Site", "Translation_Start_Site",
                    "Nonsense_Mutation", "Nonstop_Mutation", "In_Frame_Del",
                    "In_Frame_Ins", "Missense_Mutation")

  # Silent variants are always captured; they end up in the maf.silent slot.
  maf.silent <- maf[Variant_Classification %in% silent]

  if(removeSilent){

    if(nrow(maf.silent) > 0){
      # Per-sample counts of each silent class, printed as a short summary.
      maf.silent.vc <- maf.silent[,.N, .(Tumor_Sample_Barcode, Variant_Classification)]
      maf.silent.vc.cast <- data.table::dcast(data = maf.silent.vc, formula = Tumor_Sample_Barcode ~ Variant_Classification, fill = 0, value.var = 'N')
      summary.silent <- data.table(ID = c('Samples',colnames(maf.silent.vc.cast)[2:ncol(maf.silent.vc.cast)]),
                                   N = c(nrow(maf.silent.vc.cast), colSums(maf.silent.vc.cast[,2:ncol(maf.silent.vc.cast), with = FALSE])))

      maf <- maf[!Variant_Classification %in% silent] #Remove silent variants from main table
      message(paste('Excluding',nrow(maf.silent), 'silent variants.'))
      print(summary.silent)
    } else{
      # Was message(message(...)), which printed the text and then an empty line.
      message(paste('Excluding',nrow(maf.silent), 'silent variants.'))
    }
  }else{
    message('Silent variants are being kept!')
  }

  if(!is.null(gisticAllLesionsFile)){
    gisticIp <- readGistic(gisticAllLesionsFile = gisticAllLesionsFile, gisticAmpGenesFile = gisticAmpGenesFile,
                           gisticDelGenesFile = gisticDelGenesFile, isTCGA = isTCGA)
    gisticIp <- gisticIp@data

    # Keep one copy-number event per gene/sample pair.
    gisticIp[, id := paste(Hugo_Symbol, Tumor_Sample_Barcode, sep=':')]
    gisticIp <- gisticIp[!duplicated(id)]
    gisticIp[,id := NULL]

    maf <- rbind(maf, gisticIp, fill =TRUE)
    oncomat <- createOncoMatrix(maf)
  }else if(!is.null(cnTable)){
    message('Processing copy number data..')
    cnDat <- data.table::fread(input = cnTable, sep = '\t', stringsAsFactors = FALSE, header = TRUE, colClasses = 'character')
    colnames(cnDat) <- c('Hugo_Symbol', 'Tumor_Sample_Barcode', 'Variant_Classification')
    cnDat$Variant_Type <- 'CNV'
    # Keep one copy-number event per gene/sample pair.
    suppressWarnings(cnDat[, id := paste(Hugo_Symbol, Tumor_Sample_Barcode, sep=':')])
    cnDat <- cnDat[!duplicated(id)]
    cnDat[,id := NULL]
    maf <- rbind(maf, cnDat, fill =TRUE)
    oncomat <- createOncoMatrix(maf)
  }else{
    oncomat <- createOncoMatrix(maf)
  }

  #convert to factors
  maf$Variant_Type <- as.factor(as.character(maf$Variant_Type))
  maf$Variant_Classification <- as.factor(as.character(maf$Variant_Classification))
  maf$Tumor_Sample_Barcode <- as.factor(as.character(maf$Tumor_Sample_Barcode))

  message('Summarizing..')
  mafSummary <- summarizeMaf_fix(maf = maf)

  #Create MAF object
  m <- MAF(data = maf, variants.per.sample = mafSummary$variants.per.sample, variant.type.summary = mafSummary$variant.type.summary,
           variant.classification.summary = mafSummary$variant.classification.summary,gene.summary = mafSummary$gene.summary,
           oncoMatrix = oncomat$oncomat, numericMatrix = oncomat$nummat, summary = mafSummary$summary,
           classCode = oncomat$vc, maf.silent = maf.silent)

  message('Done !')
  return(m)
}
|
|
|
514
|
|
|
515
|
|
|
#' Class MAF
#' @description S4 class for storing a summarized MAF.
#' @slot data data.table holding the MAF variants.
#' @slot variants.per.sample table of variant counts per sample
#' @slot variant.type.summary table of variant types per sample
#' @slot variant.classification.summary table of variant classifications per sample
#' @slot gene.summary table of variant classifications per gene
#' @slot oncoMatrix character matrix with one row per gene and one column per sample
#' @slot numericMatrix numeric-coded counterpart of oncoMatrix
#' @slot summary table with basic MAF summary stats
#' @slot classCode mapping between numeric values in numericMatrix and Variant Classification
#' @slot maf.silent subset of the MAF containing only silent variants
#' @exportClass MAF
#' @import methods
#' @seealso \code{\link{getGeneSummary}} \code{\link{getSampleSummary}} \code{\link{getFields}}

## MAF object: class definition; `MAF` is the generator used to build objects.
MAF <- setClass(
  Class = "MAF",
  slots = c(
    data = "data.table",
    variants.per.sample = "data.table",
    variant.type.summary = "data.table",
    variant.classification.summary = "data.table",
    gene.summary = "data.table",
    oncoMatrix = "matrix",
    numericMatrix = "matrix",
    summary = "data.table",
    classCode = "character",
    maf.silent = "data.table"
  )
)
|
|
|
537
|
|
|
# Printing a MAF object: a one-line class banner followed by the summary slot.
setMethod(
  f = "show",
  signature = "MAF",
  definition = function(object) {
    banner <- paste("An object of class ", class(object), "\n")
    cat(banner)
    print(object@summary)
  }
)
|
|
|
542
|
|
|
543
|
|
|
# Summarise a MAF table per sample and per gene.
#
# Args:
#   maf: data.table with at least Hugo_Symbol, Tumor_Sample_Barcode,
#        Variant_Classification and Variant_Type; NCBI_Build and Center are
#        used when present.
#
# Returns:
#   A list with
#     variants.per.sample:            variant count per sample,
#     variant.type.summary:           per-sample counts by Variant_Type,
#     variant.classification.summary: per-sample counts by classification,
#     gene.summary:                   per-gene counts by classification plus
#                                     the number of mutated samples,
#     summary:                        overall table (build, center, number of
#                                     samples/genes, totals, mean, median).
#   The summary and gene tables are also printed.
summarizeMaf_fix <- function(maf){

  if('NCBI_Build' %in% colnames(maf)){
    NCBI_Build <- unique(maf[!Variant_Type %in% 'CNV', NCBI_Build])
    NCBI_Build <- NCBI_Build[!is.na(NCBI_Build)]

    if(length(NCBI_Build) > 1){
      message('NOTE: Mutiple reference builds found!')
      NCBI_Build <- paste(NCBI_Build, collapse = ";")
      message(NCBI_Build)
    }else if(length(NCBI_Build) == 0){
      # All builds were NA. Keep one placeholder so the summary table below
      # still gets one value per ID.
      NCBI_Build <- NA
    }
  }else{
    NCBI_Build <- NA
  }

  if('Center' %in% colnames(maf)){
    Center <- unique(maf[!Variant_Type %in% 'CNV', Center])
    if(length(Center) > 1){
      message('Mutiple centers found.')
      Center <- paste(Center, collapse = ";")
      print(Center)
    }else if(length(Center) == 0){
      # No non-CNV rows: same placeholder as above.
      Center <- NA
    }
  }else{
    Center <- NA
  }

  #nGenes
  nGenes <- length(unique(maf[,Hugo_Symbol]))

  #Top 20 FLAGS - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4267152/
  flags <- c("TTN", "MUC16", "OBSCN", "AHNAK2", "SYNE1", "FLG", "MUC5B",
             "DNAH17", "PLEC", "DST", "SYNE2", "NEB", "HSPG2", "LAMA5", "AHNAK",
             "HMCN1", "USH2A", "DNAH11", "MACF1", "MUC17")

  #Variants per TSB
  tsb <- maf[,.N, Tumor_Sample_Barcode]
  colnames(tsb)[2] <- 'Variants'
  tsb <- tsb[order(tsb$Variants, decreasing = TRUE),]

  #summarise and casting by 'Variant_Classification'
  vc <- maf[,.N, .(Tumor_Sample_Barcode, Variant_Classification )]
  vc.cast <- data.table::dcast(data = vc, formula = Tumor_Sample_Barcode ~ Variant_Classification, fill = 0, value.var = 'N')

  if(any(colnames(vc.cast) %in% c('Amp', 'Del'))){
    # Copy-number classes are totalled separately from the mutation classes.
    vc.cast.cnv <- vc.cast[,colnames(vc.cast)[colnames(vc.cast) %in% c('Amp', 'Del')], with =FALSE]
    vc.cast.cnv$CNV_total <- rowSums(x = vc.cast.cnv)

    vc.cast <- vc.cast[,!colnames(vc.cast)[colnames(vc.cast) %in% c('Amp', 'Del')], with =FALSE]
    vc.cast[,total:=rowSums(vc.cast[,2:ncol(vc.cast), with = FALSE])]

    vc.cast <- cbind(vc.cast, vc.cast.cnv)
    vc.cast <- vc.cast[order(total, CNV_total, decreasing = TRUE)]
  }else{
    vc.cast[,total:=rowSums(vc.cast[,2:ncol(vc.cast), with = FALSE])]
    vc.cast <- vc.cast[order(total, decreasing = TRUE)]
  }

  # Column-wise mean/median over samples; the four leading NAs line up with
  # the NCBI_Build, Center, Samples and nGenes rows of the summary table.
  vc.values <- vc.cast[,2:ncol(vc.cast), with = FALSE]
  vc.mean <- as.numeric(as.character(c(NA, NA, NA, NA, apply(vc.values, 2, mean))))
  vc.median <- as.numeric(as.character(c(NA, NA, NA, NA, apply(vc.values, 2, median))))

  #summarise and casting by 'Variant_Type'
  vt <- maf[,.N, .(Tumor_Sample_Barcode, Variant_Type )]
  vt.cast <- data.table::dcast(data = vt, formula = Tumor_Sample_Barcode ~ Variant_Type, value.var = 'N', fill = 0)
  if(any(colnames(vt.cast) %in% c('CNV'))){
    vt.cast.cnv <- vt.cast[,colnames(vt.cast)[colnames(vt.cast) %in% c('CNV')], with =FALSE]

    vt.cast <- vt.cast[,!colnames(vt.cast)[colnames(vt.cast) %in% c('CNV')], with =FALSE]
    vt.cast[,total:=rowSums(vt.cast[,2:ncol(vt.cast), with = FALSE])]

    # Re-attach the CNV column BEFORE sorting. It used to be bound after the
    # rows had been reordered, pairing CNV counts with the wrong samples, and
    # the final sort's result was discarded.
    vt.cast <- cbind(vt.cast, vt.cast.cnv)
    vt.cast <- vt.cast[order(total, CNV, decreasing = TRUE)]
  }else{
    vt.cast[,total:=rowSums(vt.cast[,2:ncol(vt.cast), with = FALSE])]
    vt.cast <- vt.cast[order(total, decreasing = TRUE)]
  }

  #summarise and casting by 'Hugo_Symbol'
  hs <- maf[,.N, .(Hugo_Symbol, Variant_Classification)]
  hs.cast <- data.table::dcast(data = hs, formula = Hugo_Symbol ~Variant_Classification, fill = 0, value.var = 'N')

  if(any(colnames(hs.cast) %in% c('Amp', 'Del'))){
    hs.cast.cnv <- hs.cast[,colnames(hs.cast)[colnames(hs.cast) %in% c('Amp', 'Del')], with =FALSE]
    hs.cast.cnv$CNV_total <- rowSums(x = hs.cast.cnv)

    hs.cast <- hs.cast[,!colnames(hs.cast)[colnames(hs.cast) %in% c('Amp', 'Del')], with =FALSE]
    hs.cast[,total:=rowSums(hs.cast[,2:ncol(hs.cast), with = FALSE])]

    hs.cast <- cbind(hs.cast, hs.cast.cnv)
    hs.cast <- hs.cast[order(total, CNV_total, decreasing = TRUE)]
  }else{
    hs.cast[,total:=rowSums(hs.cast[,2:ncol(hs.cast), with = FALSE])]
    hs.cast <- hs.cast[order(total, decreasing = TRUE)]
  }

  #Get in how many samples a gene is mutated (CNV rows excluded)
  numMutatedSamples <- maf[!Variant_Type %in% 'CNV', .(MutatedSamples = length(unique(Tumor_Sample_Barcode))), by = Hugo_Symbol]
  #Merge and sort
  hs.cast <- merge(hs.cast, numMutatedSamples, by = 'Hugo_Symbol', all = TRUE)
  hs.cast <- hs.cast[order(MutatedSamples, total, decreasing = TRUE)]

  #Make a summarized table
  summary <- data.table::data.table(ID = c('NCBI_Build', 'Center','Samples', 'nGenes',colnames(vc.cast)[2:ncol(vc.cast)]),
                                    summary = c(NCBI_Build, Center, nrow(vc.cast), nGenes, colSums(vc.cast[,2:ncol(vc.cast), with =FALSE])))
  summary[,Mean := vc.mean]
  summary[,Median := vc.median]

  print(summary)

  message("Frequently mutated genes..")
  print(hs.cast)

  #Check for flags among the ten most frequently mutated genes.
  if(nrow(hs.cast) > 10){
    topten <- hs.cast[1:10, Hugo_Symbol]
    topten <- topten[topten %in% flags]
    if(length(topten) > 0){
      message('NOTE: Possible FLAGS among top ten genes:')
      print(topten)
    }
  }

  return(list(variants.per.sample = tsb, variant.type.summary = vt.cast, variant.classification.summary = vc.cast,
              gene.summary = hs.cast, summary = summary))
}
|
|
|
675
|
|
|
# Score genes for positional clustering of non-silent mutations and attach
# p-values and adjusted p-values to the per-gene cluster scores.
#
# Arguments:
#   maf          object exposing the tables `@maf.silent` and `@data`; it is
#                also handed to getGeneSummary().
#   AACol        forwarded to parse_prot_fix() as `AACol`.
#   minMut       forwarded to parse_prot_fix() as `m`.
#   pvalMethod   a single string: 'zscore', 'poisson' or 'combined'.
#   nBgGenes     forwarded to parse_prot_fix() as `nBg`; also the minimum
#                number of background score rows required before the
#                estimated background mean/SD are used.
#   bgEstimate   if TRUE, try to estimate the background mean/SD from
#                `maf@maf.silent`; otherwise use the predefined values.
#   ignoreGenes  optional vector of Hugo_Symbol values dropped before scoring.
#
# Returns the merge of getGeneSummary(maf) with the cluster scores plus:
#   'zscore'   -> zscore, pval, fdr, fract_muts_in_clusters (ordered by fdr)
#   'poisson'  -> fract_muts_in_clusters, Expected, pval, fdr (ordered by fdr)
#   'combined' -> zscore, tPval, tFdr, fract_muts_in_clusters, Expected,
#                 poissonPval, poissonFdr, fdr (ordered by poissonFdr; fdr is
#                 the row-wise minimum of tFdr and poissonFdr)
#
# Stops with an error when pvalMethod is not exactly one of the three options.
oncodrive_fix = function(maf, AACol = NULL, minMut = 5, pvalMethod = 'zscore', nBgGenes = 100, bgEstimate = TRUE, ignoreGenes = NULL){

  # Validate before doing any I/O. The length test must come first: a vector
  # (or NULL) inside if() raises an unrelated R error instead of this message.
  pval.options = c('zscore', 'poisson', 'combined')
  if(length(pvalMethod) != 1 || !pvalMethod %in% pval.options){
    stop('pvalMethod can only be either zscore, poisson or combined')
  }

  # Predefined background used whenever it cannot (or should not) be estimated.
  bg.mean = 0.279
  bg.sd = 0.13

  #Protein Length source
  gl = system.file('extdata', 'prot_len.txt.gz', package = 'maftools')

  if(Sys.info()[['sysname']] == 'Windows'){
    gl.gz = gzfile(description = gl, open = 'r')
    # Close the connection even if parsing fails.
    gl = tryCatch(
      suppressWarnings(data.table::data.table(read.csv(file = gl.gz, header = TRUE, sep = '\t', stringsAsFactors = FALSE))),
      finally = close(gl.gz)
    )
  } else{
    # Quote the path so install locations containing spaces survive the shell.
    gl = data.table::fread(input = paste('zcat <', shQuote(gl)), sep = '\t', stringsAsFactors = FALSE)
  }

  #Perform clustering and calculate background scores.
  if(!bgEstimate){
    message("Using predefined values for background. (Mean = 0.279; SD = 0.13)")
  }else{
    #syn variants for background
    syn.maf = maf@maf.silent
    if(nrow(syn.maf) == 0){
      message('No syn mutations found! Skipping background estimation. Using predefined values. (Mean = 0.279; SD = 0.13)')
    }else{
      message('Estimating background scores from synonymous variants..')
      syn.bg.scores = parse_prot_fix(dat = syn.maf, AACol = AACol, gl, m = minMut, calBg = TRUE, nBg = nBgGenes)

      #If number of genes to calculate background scores is not enough, use predefined scores.
      if(is.null(syn.bg.scores) || nrow(syn.bg.scores) < nBgGenes){
        message("Not enough genes to build background. Using predefined values. (Mean = 0.279; SD = 0.13)")
      }else{
        bg.mean = mean(syn.bg.scores$clusterScores)
        bg.sd = sd(syn.bg.scores$clusterScores)
        message(paste('Estimated background mean: ', bg.mean))
        message(paste('Estimated background SD: ', bg.sd))
      }
    }
  }

  #non-syn variants
  non.syn.maf = maf@data
  #Variant Classification with Low/Modifier variant consequences. http://asia.ensembl.org/Help/Glossary?id=535
  silent = c("3'UTR", "5'UTR", "3'Flank", "Targeted_Region", "Silent", "Intron",
             "RNA", "IGR", "Splice_Region", "5'Flank", "lincRNA", "Amp", "Del")
  non.syn.maf = non.syn.maf[!Variant_Classification %in% silent] #Remove silent variants from main table

  #Remove genes to ignore
  if(!is.null(ignoreGenes)){
    ignoreGenes.count = nrow(non.syn.maf[Hugo_Symbol %in% ignoreGenes])
    message(paste('Removed', ignoreGenes.count, 'variants belonging to', paste(ignoreGenes, collapse = ', ')))
    non.syn.maf = non.syn.maf[!Hugo_Symbol %in% ignoreGenes]
  }

  #Perform clustering and calculate cluster scores for nonsyn variants.
  message('Estimating cluster scores from non-syn variants..')
  nonsyn.scores = parse_prot_fix(dat = non.syn.maf, AACol = AACol, gl = gl, m = minMut, calBg = FALSE, nBg = nBgGenes)

  is.combined = pvalMethod == 'combined'

  # Stage 1 (zscore / combined): compare cluster scores with the background,
  # assuming a normal distribution. Runs before the merge below, as before.
  if(pvalMethod %in% c('zscore', 'combined')){
    message('Comapring with background model and estimating p-values..')
    nonsyn.scores$zscore = (nonsyn.scores$clusterScores - bg.mean) / bg.sd
    z.pval = 1- pnorm(nonsyn.scores$zscore)
    z.fdr = p.adjust(z.pval, method = 'fdr')
    if(is.combined){
      # 'combined' keeps the z-score results under t* names so that
      # `fdr` can hold the merged value later.
      nonsyn.scores$tPval = z.pval
      nonsyn.scores$tFdr = z.fdr
    }else{
      nonsyn.scores$pval = z.pval
      nonsyn.scores$fdr = z.fdr
    }
  }

  # Stage 2 (all methods): attach the per-gene summary and the fraction of
  # mutations that fall inside clusters.
  nonsyn.scores = merge(getGeneSummary(maf), nonsyn.scores, by = 'Hugo_Symbol')
  nonsyn.scores[,fract_muts_in_clusters := muts_in_clusters/total]

  # Stage 3 (poisson / combined): model the observed mutation count as a
  # function of protein length and number of clusters, then test observed
  # against the fitted expectation per gene.
  if(pvalMethod %in% c('poisson', 'combined')){
    counts.glm = glm(formula = total ~ protLen+clusters, family = poisson(link = identity), data = nonsyn.scores) #Poisson model
    nonsyn.scores$Expected = counts.glm$fitted.values #Get expected number of events (mutations) from the model

    # Work on the two numeric columns directly. Row-wise apply() over the
    # whole table would first turn every value into a formatted string,
    # rounding Expected before it reaches poisson.test().
    observed = as.numeric(nonsyn.scores$total)
    expected = as.numeric(nonsyn.scores$Expected)
    pois.pval = vapply(seq_along(observed), function(i){
      poisson.test(observed[i], expected[i])$p.value
    }, numeric(1))

    # NOTE(review): p.adjust() without `method` applies the Holm correction,
    # so these columns are Holm-adjusted although they are named *Fdr/fdr.
    # Kept as is to preserve results; confirm whether method = 'fdr' was meant.
    pois.adj = p.adjust(pois.pval)

    if(is.combined){
      nonsyn.scores$poissonPval = pois.pval
      nonsyn.scores$poissonFdr = pois.adj
      nonsyn.scores = nonsyn.scores[order(poissonFdr)]
      nonsyn.scores$fdr = pmin(nonsyn.scores$tFdr, nonsyn.scores$poissonFdr)
    }else{
      nonsyn.scores$pval = pois.pval
      nonsyn.scores$fdr = pois.adj
      nonsyn.scores = nonsyn.scores[order(fdr)]
    }
  }else{
    nonsyn.scores = nonsyn.scores[order(fdr)]
  }

  message('Done !')
  return(nonsyn.scores)
}
|
|
|
818
|
|
|
819
|
|
|
# Driver: read the input MAF, score genes for mutation clustering, then write
# the result table (tab-separated) and the PDF plot to the paths given on the
# command line.
laml = read.maf(maf = args$input_maf, removeSilent = FALSE, useAll = TRUE)

# NOTE(review): this calls `oncodrive`, not the `oncodrive_fix` defined earlier
# in this file. Confirm which implementation is intended.
if(is.null(args$gene_blacklist)){
  laml.sig = oncodrive(maf = laml, AACol = aacol, pvalMethod = 'zscore', minMut = min_mut)
}else{
  # The blacklist is taken from the first column of the supplied file.
  all_genes <- read.table(args$gene_blacklist, stringsAsFactors = FALSE)[,1]
  laml.sig = oncodrive(maf = laml, AACol = aacol, pvalMethod = 'zscore', minMut = min_mut, ignoreGenes = all_genes)
}

# Output steps are the same whether or not a blacklist was given.
write.table(laml.sig, file = args$output_detail, quote = FALSE, row.names = FALSE, sep = "\t")

pdf(args$output_plot)
# Close the PDF device even when plotting fails; the error is still raised.
tryCatch(
  plotOncodrive(res = laml.sig, fdrCutOff = as.numeric(args$fdr), useFraction = TRUE),
  finally = dev.off()
)