Mercurial > repos > morinlab > maftools
changeset 4:8263ddfcd202 draft
Deleted selected files
| author | morinlab |
|---|---|
| date | Sun, 04 Dec 2016 20:48:24 -0500 |
| parents | 7ed78705ba77 |
| children | b78c81a7ebaf |
| files | oncodriveclust_merge.R |
| diffstat | 1 files changed, 0 insertions(+), 835 deletions(-) [+] |
line wrap: on
line diff
--- a/oncodriveclust_merge.R Sun Dec 04 20:48:19 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,835 +0,0 @@ -require(maftools); -library(argparse); -require(data.table); - -### - -parser <- ArgumentParser(description="Create a Gene Lollipop using Maftools"); - -parser$add_argument( - "--input_maf", "-maf", - required="True", - help="Input Variants in MAF format" - ); - -parser$add_argument( - "--gene_blacklist", "-gl", - help="Input gene list with separated by newline" - ); - -parser$add_argument( - "--min_mut", "-mm", - default=5, - help="Minimum number of mutations seen in the gene for it to be included in the calculation"); - -parser$add_argument( - "--fdr", "-f", - default=0.1, - help="FDR threshold to use in plots and returned gene list"); - -parser$add_argument( - "--aacol", "-ac", - help="Optionally provide the name of the column that contains the amino acid annotation in your MAF file"); - -parser$add_argument( - "--output_detail", "-o", - required="True", - help="Output text file for oncodriveclust detail" - ) - -parser$add_argument( - "--output_plot", "-p", - required="True", - help="Output pdf file for oncodriveclust detail" - ) - -args <- parser$parse_args(); - -### - - -aacol = 'HGVSp_Short' -if(!is.null(args$aacol)){ -aacol = args$aacol -} - -min_mut = as.integer(args$min_mut) - - -#--------------------- based on binaomial distribution, estimate threshhold. -get_threshold = function(gene_muts, gene_length){ - th = which(unlist(lapply(X = 2:gene_muts, FUN = function(x) dbinom(x = x, size = gene_muts, prob = 1/gene_length) )) < 0.01)[1] - return(th+1) - } - #-------------------- end of function. - -parse_prot_fix = function(dat, AACol, gl, m, calBg = FALSE, nBg){ - - if(is.null(AACol)){ - if(! 'AAChange' %in% colnames(dat)){ - message('Available fields:') - print(colnames(dat)) - stop('AAChange field not found in MAF. Use argument AACol to manually specifiy field name containing protein changes.') - } - }else{ - colnames(dat)[which(colnames(dat) == AACol)] = 'AAChange' - } - - all.prot.dat = dat[,.(Hugo_Symbol, Variant_Classification, AAChange)] - all.prot.dat = all.prot.dat[Variant_Classification != 'Splice_Site'] - #parse AAchanges to get postion - prot.spl = strsplit(x = as.character(all.prot.dat$AAChange), split = '.', fixed = TRUE) - prot.conv = sapply(prot.spl, function(x) x[length(x)]) - - all.prot.dat[,conv := prot.conv] - all.prot.dat = all.prot.dat[!conv == 'NULL'] - - #If conversions are in HGVSp_long (default HGVSp) format, we will remove strings Ter followed by anything (e.g; p.Asn1986GlnfsTer13) - pos = gsub(pattern = 'Ter.*', replacement = '',x = all.prot.dat$conv) - - #Following parsing takes care of most of HGVSp_short and HGVSp_long format - pos = gsub(pattern = '[[:alpha:]]', replacement = '', x = pos) - pos = gsub(pattern = '\\*$', replacement = '', x = pos) #Remove * if nonsense mutation ends with * - pos = gsub(pattern = '^\\*', replacement = '', x = pos) #Remove * if nonsense mutation starts with * - pos = gsub(pattern = '\\*.*', replacement = '', x = pos) #Remove * followed by position e.g, p.C229Lfs*18 - - pos = suppressWarnings( as.numeric(sapply(strsplit(x = pos, split = '_', fixed = TRUE), '[', 1)) ) - all.prot.dat[,pos := pos] - - if(nrow( all.prot.dat[is.na(all.prot.dat$pos),]) > 0){ - #message(paste('Removed', nrow( all.prot.dat[is.na(all.prot.dat$pos),]), 'mutations for which AA position was not available', sep = ' ')) - #print(prot.dat[is.na(prot.dat$pos),]) - all.prot.dat = all.prot.dat[!is.na(all.prot.dat$pos),] - } - - gene.sum = summarizeMaf_fix(maf = dat)$gene.summary - #gene.sum = merge.data.frame(x = gene.sum, y = gl, by = 'Hugo_Symbol', all.x = TRUE) - gene.sum = merge(x = gene.sum, y = gl, by = 'Hugo_Symbol', all.x = TRUE) - #gene.sum = gene.sum[!is.na(gene.sum$aa.length),] - gene.sum = gene.sum[!is.na(gene.sum$aa.length)] - - num_mut_colIndex = which(colnames(gene.sum) == 'total') - aalen_colIndex = which(colnames(gene.sum) == 'aa.length') - - #Get background threshold - gene.sum$th = apply(gene.sum, 1, function(x) get_threshold(gene_muts = as.numeric(x[num_mut_colIndex]), gene_length = as.numeric(x[aalen_colIndex]))) - #use only genes with atleast 2 (or m ) mutations. - gene.sum = gene.sum[total >= m] - - if(calBg){ - if(nrow(gene.sum) < nBg){ - #message("Not enough genes to build background. Using predefined values. (Mean = 0.279; SD = 0.13)") - return(NULL) - } else{ - syn.res = c() - pb <- txtProgressBar(min = 0, max = nrow(gene.sum), style = 3) #progress bar - - for(i in 1:nrow(gene.sum)){ - prot.dat = all.prot.dat[Hugo_Symbol == gene.sum[i, "Hugo_Symbol"]] - syn.res = rbind(syn.res, cluster_prot_fix(prot.dat = prot.dat, gene = gene.sum[i, "Hugo_Symbol"], th = gene.sum[i,"th"], protLen = gene.sum[i,"aa.length"])) - setTxtProgressBar(pb, i) - } - return(syn.res) - } - } else{ - nonsyn.res = c() - pb <- txtProgressBar(min = 0, max = nrow(gene.sum), style = 3) #progress bar - - for(i in 1:nrow(gene.sum)){ - hs = gene.sum[i, Hugo_Symbol] - #print(hs) - prot.dat = all.prot.dat[Hugo_Symbol %in% hs] - nonsyn.res = rbind(nonsyn.res, cluster_prot_fix(prot.dat = prot.dat, gene = hs, th = gene.sum[Hugo_Symbol %in% hs, th], protLen = gene.sum[Hugo_Symbol %in% hs, aa.length])) - setTxtProgressBar(pb, i) - } - return(nonsyn.res) - } - } - -cluster_prot_fix = function(prot.dat, gene, th, protLen){ - - mergeDist = 5 #hard coded inter event distance. - #prot.dat = all.prot.dat[Hugo_Symbol == gene] - - #Summarise counts per position - pos.counts = prot.dat[,.N,pos] - pos.counts = pos.counts[order(pos)] - - #classify position as meaningful if its greater than background threshhold. - pos.counts$cluster = ifelse(test = pos.counts$N >= th, yes = 'meaningful', no = 'nonMeaningful') - - #Just choose meaningful positions - clust.tbl = pos.counts[cluster %in% 'meaningful'] - nonclust.tbl = pos.counts[cluster %in% 'nonMeaningful'] - - if(nrow(clust.tbl) == 0){ - #message(paste('No meaningful positions found for', gene, sep=' ')) - return(NULL) - } - - clust.tbl$distance = c(0,diff(clust.tbl$pos)) #calculate inter event distance. - - #If more than one meaningful positions are found within a 5 aa distance, join them to form a cluster. - if(nrow(clust.tbl) > 1){ - - #initialize variables. - cstart = end = clust.tbl[1,pos] - n = clust.tbl[1,N] - cdf = c() - cluster = 1 - - #Go through entire table and update variables. - for(i in 2:nrow(clust.tbl)){ - pos = clust.tbl[i,pos] - - d = clust.tbl[i,distance] - - if(d < mergeDist){ - end = pos - n = n + clust.tbl[i,N] - }else{ - tempdf = data.frame(cluster = paste('cluster', cluster, sep='_'), start = cstart, end = end ,N = n) - cdf = rbind(cdf, tempdf) - cstart = end = pos - n = clust.tbl[i,N] - cluster = cluster + 1 - } - } - cdf = rbind(cdf, data.frame(cluster = paste('cluster', cluster, sep='_'), start = cstart, end = end ,N = n)) - } else { - cdf = data.frame(cluster = 'cluster_1', start = clust.tbl$pos, end = clust.tbl$pos ,N = clust.tbl$N) - } - - #merge adjacent variants to clusters. - for(i in 1:nrow(cdf)){ - tempcdf = cdf[i,] - nonclust.tbl$startDist = nonclust.tbl$pos - tempcdf$start - nonclust.tbl$endDist = nonclust.tbl$pos - tempcdf$end - - merge.adj.to.start = nonclust.tbl[startDist >= -5 & startDist <= 0] - if(nrow(merge.adj.to.start) > 0){ - tempcdf$start = merge.adj.to.start[which(merge.adj.to.start$startDist == min(merge.adj.to.start$startDist)),pos] - tempcdf$N = tempcdf$N + sum(merge.adj.to.start$N) - } - - merge.adj.to.end = nonclust.tbl[endDist <= 5 & endDist >= 0] - if(nrow(merge.adj.to.end) > 0){ - tempcdf$end = merge.adj.to.end[which(merge.adj.to.end$endDist == max(merge.adj.to.end$endDist)),pos] - tempcdf$N = tempcdf$N + sum(merge.adj.to.end$N) - } - cdf[i,] = tempcdf - } - cdf$Hugo_Symbol = gene - - #Calcluate cluster score. - - total.muts = nrow(prot.dat) #total variants for this gene. - clusterScores = c() - - for(i in 1:nrow(cdf)){ - temp.prot.dat = prot.dat[pos >= as.numeric(cdf$start[i]) & pos <= as.numeric(cdf$end[i])] - temp.prot.dat.summary = temp.prot.dat[,.N, pos] - temp.prot.dat.summary[,fraction:= N/total.muts] - - peak = temp.prot.dat.summary[N == max(N), pos] - - posVector = as.numeric(temp.prot.dat.summary[,pos]) - fractionMutVector = unlist(lapply(posVector, FUN = function(x) temp.prot.dat.summary[pos == x, fraction])) - distanceVector = suppressWarnings(abs(posVector - peak)) - - clusterScores = c(clusterScores, sum( fractionMutVector / (sqrt(2)^ distanceVector))) - - } - - cdf$clusterScore = clusterScores - - gene.clust.res = data.frame(Hugo_Symbol = gene, clusters = nrow(cdf), muts_in_clusters = sum(cdf$N), clusterScores = sum(cdf$clusterScore), protLen = protLen) - return(gene.clust.res) - } - - - - - -createOncoMatrix<- function(maf){ - - message('Creating oncomatrix (this might take a while)..') - - oncomat = data.table::dcast(data = maf[,.(Hugo_Symbol, Variant_Classification, Tumor_Sample_Barcode)], formula = Hugo_Symbol ~ Tumor_Sample_Barcode, - fun.aggregate = function(x) {ifelse(test = length(as.character(x))>1 , - no = as.character(x), yes = vcr(x, gis = FALSE)) - }, value.var = 'Variant_Classification', fill = '') - - #If maf contains only one sample converting to matrix is not trivial. - if(ncol(oncomat) == 2){ - genes = oncomat[,Hugo_Symbol] - sampleId = colnames(oncomat)[2] - oncomat = as.matrix(data.frame(row.names = genes, sample = oncomat[,2, with =FALSE])) - }else if(nrow(oncomat) == 1){ - #If MAF has only one gene - gene = oncomat[,Hugo_Symbol] - oncomat[,Hugo_Symbol:= NULL] - oncomat = as.matrix(oncomat) - rownames(oncomat) = gene - sampleID = colnames(oncomat) - }else{ - oncomat = as.matrix(oncomat) - rownames(oncomat) = oncomat[,1] - oncomat = oncomat[,-1] - } - - variant.classes = as.character(unique(maf[,Variant_Classification])) - variant.classes = c('',variant.classes, 'Multi_Hit') - names(variant.classes) = 0:(length(variant.classes)-1) - - #Complex variant classes will be assigned a single integer. - vc.onc = unique(unlist(apply(oncomat, 2, unique))) - vc.onc = vc.onc[!vc.onc %in% names(variant.classes)] - names(vc.onc) = rep(as.character(as.numeric(names(variant.classes)[length(variant.classes)])+1), length(vc.onc)) - variant.classes2 = c(variant.classes, vc.onc) - - oncomat.copy <- oncomat - #Make a numeric coded matrix - for(i in 1:length(variant.classes2)){ - oncomat[oncomat == variant.classes2[i]] = names(variant.classes2)[i] - } - - #If maf has only one gene - if(nrow(oncomat) == 1){ - mdf = t(matrix(as.numeric(oncomat))) - rownames(mdf) = gene - colnames(mdf) = sampleID - return(list(oncomat = oncomat.copy, nummat = mdf, vc = variant.classes)) - } - - #convert from character to numeric - mdf = as.matrix(apply(oncomat, 2, function(x) as.numeric(as.character(x)))) - rownames(mdf) = rownames(oncomat.copy) - - message('Sorting..') - - #If MAF file contains a single sample, simple sorting is enuf. - if(ncol(mdf) == 1){ - mdf = as.matrix(mdf[order(mdf, decreasing = TRUE),]) - colnames(mdf) = sampleId - - oncomat.copy = as.matrix(oncomat.copy[rownames(mdf),]) - colnames(oncomat.copy) = sampleId - - return(list(oncomat = oncomat.copy, nummat = mdf, vc = variant.classes)) - } else{ - #Sort by rows as well columns if >1 samples present in MAF - #Add total variants per gene - mdf = cbind(mdf, variants = apply(mdf, 1, function(x) { - length(x[x != "0"]) - })) - #Sort by total variants - mdf = mdf[order(mdf[, ncol(mdf)], decreasing = TRUE), ] - colnames(mdf) = gsub(pattern = "^X", replacement = "", colnames(mdf)) - nMut = mdf[, ncol(mdf)] - - mdf = mdf[, -ncol(mdf)] - - mdf.temp.copy = mdf #temp copy of original unsorted numeric coded matrix - - mdf[mdf != 0] = 1 #replacing all non-zero integers with 1 improves sorting (& grouping) - tmdf = t(mdf) #transposematrix - mdf = t(tmdf[do.call(order, c(as.list(as.data.frame(tmdf)), decreasing = TRUE)), ]) #sort - - mdf.temp.copy = mdf.temp.copy[rownames(mdf),] #organise original matrix into sorted matrix - mdf.temp.copy = mdf.temp.copy[,colnames(mdf)] - mdf = mdf.temp.copy - - #organise original character matrix into sorted matrix - oncomat.copy <- oncomat.copy[,colnames(mdf)] - oncomat.copy <- oncomat.copy[rownames(mdf),] - - return(list(oncomat = oncomat.copy, nummat = mdf, vc = variant.classes)) - } - } - -validateMaf<-function(maf, rdup = TRUE, isTCGA = isTCGA){ - - #necessary fields. - required.fields = c('Hugo_Symbol', 'Chromosome', 'Start_Position', 'End_Position', 'Reference_Allele', 'Tumor_Seq_Allele2', - 'Variant_Classification', 'Variant_Type', 'Tumor_Sample_Barcode') - - #Change column names to standard names; i.e, camel case - for(i in 1:length(required.fields)){ - colId = suppressWarnings(grep(pattern = required.fields[i], x = colnames(maf), ignore.case = TRUE)) - if(length(colId) > 0){ - colnames(maf)[colId] = required.fields[i] - } - } - - missing.fileds = required.fields[!required.fields %in% colnames(maf)] #check if any of them are missing - - if(length(missing.fileds) > 0){ - missing.fileds = paste(missing.fileds[1], sep = ',', collapse = ', ') - stop(paste('missing required fields from MAF:', missing.fileds)) #stop if any of required.fields are missing - } - - #convert "-" to "." in "Tumor_Sample_Barcode" to avoid complexity in naming - maf$Tumor_Sample_Barcode = gsub(pattern = '-', replacement = '.', x = as.character(maf$Tumor_Sample_Barcode)) - - if(rdup){ - maf = maf[, variantId := paste(Chromosome, Start_Position, Tumor_Sample_Barcode, sep = ':')] - if(nrow(maf[duplicated(variantId)]) > 0){ - message("NOTE: Removed ", nrow(maf[duplicated(variantId)]) ," duplicated variants") - maf = maf[!duplicated(variantId)] - } - maf[,variantId := NULL] - } - - if(nrow(maf[Hugo_Symbol %in% ""]) > 0){ - message('NOTE: Found ', nrow(maf[Hugo_Symbol %in% ""]), ' variants with no Gene Symbols.') - print(maf[Hugo_Symbol %in% "", required.fields, with = FALSE]) - message("Annotating them as 'UnknownGene' for convenience") - maf$Hugo_Symbol = ifelse(test = maf$Hugo_Symbol == "", yes = 'UnknownGene', no = maf$Hugo_Symbol) - } - - if(nrow(maf[is.na(Hugo_Symbol)]) > 0){ - message('NOTE: Found ', nrow(maf[is.na(Hugo_Symbol) > 0]), ' variants with no Gene Symbols.') - print(maf[is.na(Hugo_Symbol), required.fields, with =FALSE]) - message("Annotating them as 'UnknownGene' for convenience") - maf$Hugo_Symbol = ifelse(test = is.na(maf$Hugo_Symbol), yes = 'UnknownGene', no = maf$Hugo_Symbol) - } - - if(isTCGA){ - maf$Tumor_Sample_Barcode = substr(x = maf$Tumor_Sample_Barcode, start = 1, stop = 12) - } - - return(maf) - } - -read.maf_fix = function(maf, removeSilent = TRUE, useAll = TRUE, gisticAllLesionsFile = NULL, gisticAmpGenesFile = NULL, - gisticDelGenesFile = NULL, cnTable = NULL, removeDuplicatedVariants = TRUE, isTCGA = FALSE){ - - message('reading maf..') - - if(as.logical(length(grep(pattern = 'gz$', x = maf, fixed = FALSE)))){ - #If system is Linux use fread, else use gz connection to read gz file. - if(Sys.info()[['sysname']] == 'Windows'){ - maf.gz = gzfile(description = maf, open = 'r') - suppressWarnings(maf <- data.table(read.csv(file = maf.gz, header = TRUE, sep = '\t', stringsAsFactors = FALSE))) - close(maf.gz) - } else{ - maf = suppressWarnings(data.table::fread(input = paste('zcat <', maf), sep = '\t', stringsAsFactors = FALSE, verbose = FALSE, data.table = TRUE, showProgress = TRUE, header = TRUE)) - } - } else{ - suppressWarnings(maf <- data.table::fread(input = maf, sep = "\t", stringsAsFactors = FALSE, verbose = FALSE, data.table = TRUE, showProgress = TRUE, header = TRUE)) - } - - #validate MAF file - maf = validateMaf(maf = maf, isTCGA = isTCGA, rdup = removeDuplicatedVariants) - - #validation check for variants classified as Somatic in Mutation_Status field. - if(length(colnames(maf)[colnames(x = maf) %in% 'Mutation_Status']) > 0){ - if(!useAll){ - message('Using only Somatic variants from Mutation_Status. Switch on useAll to include everything.') - maf = maf[Mutation_Status %in% "Somatic"] - - if(nrow(maf) == 0){ - stop('No more Somatic mutations left after filtering for Mutation_Status! Maybe set useAll to TRUE ?') - } - - #maf = subset(maf, Mutation_Status == 'Somatic') - }else { - message('Using all variants.') - } - }else{ - message('Mutation_Status not found. Assuming all variants are Somatic and validated.') - } - #Variant Classification with Low/Modifier variant consequences. http://asia.ensembl.org/Help/Glossary?id=535 - silent = c("3'UTR", "5'UTR", "3'Flank", "Targeted_Region", "Silent", "Intron", - "RNA", "IGR", "Splice_Region", "5'Flank", "lincRNA") - #Variant Classification with High/Moderate variant consequences. http://asia.ensembl.org/Help/Glossary?id=535 - vc.nonSilent = c("Frame_Shift_Del", "Frame_Shift_Ins", "Splice_Site", "Translation_Start_Site", - "Nonsense_Mutation", "Nonstop_Mutation", "In_Frame_Del", - "In_Frame_Ins", "Missense_Mutation") - - maf.silent = maf[Variant_Classification %in% silent] - - if(removeSilent){ - - if(nrow(maf.silent) > 0){ - maf.silent.vc = maf.silent[,.N, .(Tumor_Sample_Barcode, Variant_Classification)] - maf.silent.vc.cast = data.table::dcast(data = maf.silent.vc, formula = Tumor_Sample_Barcode ~ Variant_Classification, fill = 0, value.var = 'N') #why dcast is not returning it as data.table ? - summary.silent = data.table(ID = c('Samples',colnames(maf.silent.vc.cast)[2:ncol(maf.silent.vc.cast)]), - N = c(nrow(maf.silent.vc.cast), colSums(maf.silent.vc.cast[,2:ncol(maf.silent.vc.cast), with = FALSE]))) - - maf = maf[!Variant_Classification %in% silent] #Remove silent variants from main table - message(paste('Excluding',nrow(maf.silent), 'silent variants.')) - print(summary.silent) - } else{ - message(message(paste('Excluding',nrow(maf.silent), 'silent variants.'))) - } - }else{ - message('Silent variants are being kept!') - } - - if(!is.null(gisticAllLesionsFile)){ - gisticIp = readGistic(gisticAllLesionsFile = gisticAllLesionsFile, gisticAmpGenesFile = gisticAmpGenesFile, - gisticDelGenesFile = gisticDelGenesFile, isTCGA = isTCGA) - gisticIp = gisticIp@data - - gisticIp[, id := paste(Hugo_Symbol, Tumor_Sample_Barcode, sep=':')] - gisticIp = gisticIp[!duplicated(id)] - gisticIp[,id := NULL] - - maf = rbind(maf, gisticIp, fill =TRUE) - oncomat = createOncoMatrix(maf) - }else if(!is.null(cnTable)){ - message('Processing copy number data..') - cnDat = data.table::fread(input = cnTable, sep = '\t', stringsAsFactors = FALSE, header = TRUE, colClasses = 'character') - colnames(cnDat) = c('Hugo_Symbol', 'Tumor_Sample_Barcode', 'Variant_Classification') - cnDat$Variant_Type = 'CNV' - suppressWarnings(cnDat[, id := paste(Hugo_Symbol, Tumor_Sample_Barcode, sep=':')]) - cnDat = cnDat[!duplicated(id)] - cnDat[,id := NULL] - maf = rbind(maf, cnDat, fill =TRUE) - oncomat = createOncoMatrix(maf) - }else{ - oncomat = createOncoMatrix(maf) - } - - #convert to factors - maf$Variant_Type = as.factor(as.character(maf$Variant_Type)) - maf$Variant_Classification = as.factor(as.character(maf$Variant_Classification)) - maf$Tumor_Sample_Barcode = as.factor(as.character(maf$Tumor_Sample_Barcode)) - - message('Summarizing..') - mafSummary = summarizeMaf_fix(maf = maf) - - #Create MAF object - m = MAF(data = maf, variants.per.sample = mafSummary$variants.per.sample, variant.type.summary = mafSummary$variant.type.summary, - variant.classification.summary = mafSummary$variant.classification.summary,gene.summary = mafSummary$gene.summary, - oncoMatrix = oncomat$oncomat, numericMatrix = oncomat$nummat, summary = mafSummary$summary, - classCode = oncomat$vc, maf.silent = maf.silent) - - - message('Done !') - return(m) -} - - -#' Class MAF -#' @description S4 class for storing summarized MAF. -#' @slot data data.table of original MAF file. -#' @slot variants.per.sample table containing variants per sample -#' @slot variant.type.summary table containing variant types per sample -#' @slot variant.classification.summary table containing variant classification per sample -#' @slot gene.summary table containing variant classification per gene -#' @slot oncoMatrix character matrix of dimension n*m where n is number of genes and m is number of variants -#' @slot numericMatrix numeric matrix of dimension n*m where n is number of genes and m is number of variants -#' @slot summary table with basic MAF summary stats -#' @slot classCode mapping between numeric values in numericMatrix and Variant Classification -#' @slot maf.silent subset of main MAF containing only silent variants -#' @exportClass MAF -#' @import methods -#' @seealso \code{\link{getGeneSummary}} \code{\link{getSampleSummary}} \code{\link{getFields}} - -## MAF object -MAF <- setClass(Class = 'MAF', slots = c(data = 'data.table', variants.per.sample = 'data.table', variant.type.summary = 'data.table', - variant.classification.summary = 'data.table', gene.summary = 'data.table', oncoMatrix = 'matrix', - numericMatrix = 'matrix', summary = 'data.table', classCode = 'character', - maf.silent = 'data.table')) - -setMethod(f = 'show', signature = 'MAF', definition = function(object){ - cat(paste('An object of class ', class(object), "\n")) - print(object@summary) - }) - - -summarizeMaf_fix = function(maf){ - - if('NCBI_Build' %in% colnames(maf)){ - NCBI_Build = unique(maf[!Variant_Type %in% 'CNV', NCBI_Build]) - NCBI_Build = NCBI_Build[!is.na(NCBI_Build)] - - if(length(NCBI_Build) > 1){ - message('NOTE: Mutiple reference builds found!') - NCBI_Build = do.call(paste, c(as.list(NCBI_Build), sep=";")) - message(NCBI_Build) - } - }else{ - NCBI_Build = NA - } - - if('Center' %in% colnames(maf)){ - Center = unique(maf[!Variant_Type %in% 'CNV', Center]) - #Center = Center[is.na(Center)] - if(length(Center) > 1){ - message('Mutiple centers found.') - Center = do.call(paste, c(as.list(Center), sep=";")) - print(Center) - } - }else{ - Center = NA - } - - #nGenes - nGenes = length(unique(maf[,Hugo_Symbol])) - - - - #Top 20 FLAGS - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4267152/ - flags = c("TTN", "MUC16", "OBSCN", "AHNAK2", "SYNE1", "FLG", "MUC5B", - "DNAH17", "PLEC", "DST", "SYNE2", "NEB", "HSPG2", "LAMA5", "AHNAK", - "HMCN1", "USH2A", "DNAH11", "MACF1", "MUC17") - - #Variants per TSB - tsb = maf[,.N, Tumor_Sample_Barcode] - colnames(tsb)[2] = 'Variants' - tsb = tsb[order(tsb$Variants, decreasing = TRUE),] - - #summarise and casting by 'Variant_Classification' - vc = maf[,.N, .(Tumor_Sample_Barcode, Variant_Classification )] - vc.cast = data.table::dcast(data = vc, formula = Tumor_Sample_Barcode ~ Variant_Classification, fill = 0, value.var = 'N') - - if(any(colnames(vc.cast) %in% c('Amp', 'Del'))){ - vc.cast.cnv = vc.cast[,colnames(vc.cast)[colnames(vc.cast) %in% c('Amp', 'Del')], with =FALSE] - vc.cast.cnv$CNV_total = rowSums(x = vc.cast.cnv) - - vc.cast = vc.cast[,!colnames(vc.cast)[colnames(vc.cast) %in% c('Amp', 'Del')], with =FALSE] - vc.cast[,total:=rowSums(vc.cast[,2:ncol(vc.cast), with = FALSE])] - - vc.cast = cbind(vc.cast, vc.cast.cnv) - vc.cast = vc.cast[order(total, CNV_total, decreasing = TRUE)] - - vc.mean = as.numeric(as.character(c(NA, NA, NA, NA, apply(vc.cast[,2:ncol(vc.cast), with = FALSE], 2, mean)))) - vc.median = as.numeric(as.character(c(NA, NA, NA, NA, apply(vc.cast[,2:ncol(vc.cast), with = FALSE], 2, median)))) - }else{ - vc.cast[,total:=rowSums(vc.cast[,2:ncol(vc.cast), with = FALSE])] - vc.cast = vc.cast[order(total, decreasing = TRUE)] - - vc.mean = as.numeric(as.character(c(NA, NA, NA, NA, apply(vc.cast[,2:ncol(vc.cast), with = FALSE], 2, mean)))) - vc.median = as.numeric(as.character(c(NA, NA, NA, NA, apply(vc.cast[,2:ncol(vc.cast), with = FALSE], 2, median)))) - } - - #summarise and casting by 'Variant_Type' - vt = maf[,.N, .(Tumor_Sample_Barcode, Variant_Type )] - vt.cast = data.table::dcast(data = vt, formula = Tumor_Sample_Barcode ~ Variant_Type, value.var = 'N', fill = 0) - if(any(colnames(vt.cast) %in% c('CNV'))){ - vt.cast.cnv = vt.cast[,colnames(vt.cast)[colnames(vt.cast) %in% c('CNV')], with =FALSE] - - vt.cast = vt.cast[,!colnames(vt.cast)[colnames(vt.cast) %in% c('CNV')], with =FALSE] - vt.cast[,total:=rowSums(vt.cast[,2:ncol(vt.cast), with = FALSE])] - vt.cast = vt.cast[order(total, decreasing = TRUE)] - - vt.cast = cbind(vt.cast, vt.cast.cnv) - vt.cast[order(total, CNV, decreasing = TRUE)] - }else{ - vt.cast[,total:=rowSums(vt.cast[,2:ncol(vt.cast), with = FALSE])] - vt.cast = vt.cast[order(total, decreasing = TRUE)] - } - - #summarise and casting by 'Hugo_Symbol' - hs = maf[,.N, .(Hugo_Symbol, Variant_Classification)] - hs.cast = data.table::dcast(data = hs, formula = Hugo_Symbol ~Variant_Classification, fill = 0, value.var = 'N') - #---- - if(any(colnames(hs.cast) %in% c('Amp', 'Del'))){ - hs.cast.cnv = hs.cast[,colnames(hs.cast)[colnames(hs.cast) %in% c('Amp', 'Del')], with =FALSE] - hs.cast.cnv$CNV_total = rowSums(x = hs.cast.cnv) - - hs.cast = hs.cast[,!colnames(hs.cast)[colnames(hs.cast) %in% c('Amp', 'Del')], with =FALSE] - hs.cast[,total:=rowSums(hs.cast[,2:ncol(hs.cast), with = FALSE])] - - hs.cast = cbind(hs.cast, hs.cast.cnv) - hs.cast = hs.cast[order(total, CNV_total, decreasing = TRUE)] - - }else{ - hs.cast[,total:=rowSums(hs.cast[,2:ncol(hs.cast), with = FALSE])] - hs.cast = hs.cast[order(total, decreasing = TRUE)] - - } - #Get in how many samples a gene ismutated - numMutatedSamples = maf[!Variant_Type %in% 'CNV', .(MutatedSamples = length(unique(Tumor_Sample_Barcode))), by = Hugo_Symbol] - #Merge and sort - hs.cast = merge(hs.cast, numMutatedSamples, by = 'Hugo_Symbol', all = TRUE) - hs.cast = hs.cast[order(MutatedSamples, total, decreasing = TRUE)] - #Make a summarized table - summary = data.table::data.table(ID = c('NCBI_Build', 'Center','Samples', 'nGenes',colnames(vc.cast)[2:ncol(vc.cast)]), - summary = c(NCBI_Build, Center, nrow(vc.cast), nGenes, colSums(vc.cast[,2:ncol(vc.cast), with =FALSE]))) - summary[,Mean := vc.mean] - summary[,Median := vc.median] - - print(summary) - - message("Frequently mutated genes..") - print(hs.cast) - - #Check for flags. - if(nrow(hs.cast) > 10){ - topten = hs.cast[1:10, Hugo_Symbol] - topten = topten[topten %in% flags] - if(length(topten) > 0){ - message('NOTE: Possible FLAGS among top ten genes:') - print(topten) - } - } - - return(list(variants.per.sample = tsb, variant.type.summary = vt.cast, variant.classification.summary = vc.cast, - gene.summary = hs.cast, summary = summary)) -} - -oncodrive_fix = function(maf, AACol = NULL, minMut = 5, pvalMethod = 'zscore', nBgGenes = 100, bgEstimate = TRUE, ignoreGenes = NULL){ - - #Proetin Length source - gl = system.file('extdata', 'prot_len.txt.gz', package = 'maftools') - - if(Sys.info()[['sysname']] == 'Windows'){ - gl.gz = gzfile(description = gl, open = 'r') - gl <- suppressWarnings( data.table(read.csv( file = gl.gz, header = TRUE, sep = '\t', stringsAsFactors = FALSE)) ) - close(gl.gz) - } else{ - gl = data.table::fread(input = paste('zcat <', gl), sep = '\t', stringsAsFactors = FALSE) - } - - pval.options = c('zscore', 'poisson', 'combined') - - if(!pvalMethod %in% pval.options){ - stop('pvalMethod can only be either zscore, poisson or combined') - } - - if(length(pvalMethod) > 1){ - stop('pvalMethod can only be either zscore, poisson or combined') - } - - - #syn variants for background - syn.maf = maf@maf.silent - #number of samples in maf - numSamples = as.numeric(maf@summary[3,summary]) - #Perform clustering and calculate background scores. - if(bgEstimate){ - if(nrow(syn.maf) == 0){ - message('No syn mutations found! Skipping background estimation. Using predefined values. (Mean = 0.279; SD = 0.13)') - bg.mean = 0.279 - bg.sd = 0.13 - }else{ - message('Estimating background scores from synonymous variants..') - syn.bg.scores = parse_prot_fix(dat = syn.maf, AACol = AACol, gl, m = minMut, calBg = TRUE, nBg = nBgGenes) - - #If number of genes to calculate background scores is not enough, use predefined scores. - if(is.null(syn.bg.scores)){ - message("Not enough genes to build background. Using predefined values. (Mean = 0.279; SD = 0.13)") - bg.mean = 0.279 - bg.sd = 0.13 - }else { - if(nrow(syn.bg.scores) < nBgGenes){ - message("Not enough genes to build background. Using predefined values. (Mean = 0.279; SD = 0.13)") - bg.mean = 0.279 - bg.sd = 0.13 - }else{ - bg.mean = mean(syn.bg.scores$clusterScores) - bg.sd = sd(syn.bg.scores$clusterScores) - message(paste('Estimated background mean: ', bg.mean)) - message(paste('Estimated background SD: ', bg.sd)) - } - } - } - }else{ - message("Using predefined values for background. (Mean = 0.279; SD = 0.13)") - bg.mean = 0.279 - bg.sd = 0.13 - } - - - - #non-syn variants - non.syn.maf = maf@data - #Variant Classification with Low/Modifier variant consequences. http://asia.ensembl.org/Help/Glossary?id=535 - silent = c("3'UTR", "5'UTR", "3'Flank", "Targeted_Region", "Silent", "Intron", - "RNA", "IGR", "Splice_Region", "5'Flank", "lincRNA", "Amp", "Del") - non.syn.maf = non.syn.maf[!Variant_Classification %in% silent] #Remove silent variants from main table - - #Remove genes to ignore - if(!is.null(ignoreGenes)){ - ignoreGenes.count = nrow(non.syn.maf[Hugo_Symbol %in% ignoreGenes]) - message(paste('Removed', ignoreGenes.count, 'variants belonging to', paste(ignoreGenes, collapse = ', ', sep=','))) - non.syn.maf = non.syn.maf[!Hugo_Symbol %in% ignoreGenes] - } - - #Perform clustering and calculate cluster scores for nonsyn variants. - message('Estimating cluster scores from non-syn variants..') - nonsyn.scores = parse_prot_fix(dat = non.syn.maf, AACol = AACol, gl = gl, m = minMut, calBg = FALSE, nBg = nBgGenes) - - if(pvalMethod == 'combined'){ - message('Comapring with background model and estimating p-values..') - nonsyn.scores$zscore = (nonsyn.scores$clusterScores - bg.mean) / bg.sd - nonsyn.scores$tPval = 1- pnorm(nonsyn.scores$zscore) - nonsyn.scores$tFdr = p.adjust(nonsyn.scores$tPval, method = 'fdr') - - nonsyn.scores = merge(getGeneSummary(maf), nonsyn.scores, by = 'Hugo_Symbol') - nonsyn.scores[,fract_muts_in_clusters := muts_in_clusters/total] - - counts.glm = glm(formula = total ~ protLen+clusters, family = poisson(link = identity), data = nonsyn.scores) #Poisson model - nonsyn.scores$Expected = counts.glm$fitted.values #Get expected number of events (mutations) from the model - - observed_mut_colIndex = which(colnames(nonsyn.scores) == 'total') - expected_mut_colIndex = which(colnames(nonsyn.scores) == 'Expected') - - #Poisson test to caluclate difference (p-value) - nonsyn.scores$poissonPval = apply(nonsyn.scores, 1, function(x) { - poisson.test(as.numeric(x[observed_mut_colIndex]), as.numeric(x[expected_mut_colIndex]))$p.value - }) - - nonsyn.scores$poissonFdr = p.adjust(nonsyn.scores$poissonPval) - nonsyn.scores = nonsyn.scores[order(poissonFdr)] - - nonsyn.scores$fdr = apply(nonsyn.scores[,.(tFdr, poissonFdr)], MARGIN = 1, FUN = min) - - } else if(pvalMethod == 'zscore'){ - #Oncodrive clust way of caluclating pvalues - #Calculate z scores; compare it to bg scores and estimate z-score, pvalues, corrected pvalues (fdr) (assumes normal distribution) - message('Comapring with background model and estimating p-values..') - nonsyn.scores$zscore = (nonsyn.scores$clusterScores - bg.mean) / bg.sd - nonsyn.scores$pval = 1- pnorm(nonsyn.scores$zscore) - nonsyn.scores$fdr = p.adjust(nonsyn.scores$pval, method = 'fdr') - - nonsyn.scores = merge(getGeneSummary(maf), nonsyn.scores, by = 'Hugo_Symbol') - nonsyn.scores[,fract_muts_in_clusters := muts_in_clusters/total] - #nonsyn.scores[,fract_MutatedSamples := MutatedSamples/numSamples] - nonsyn.scores = nonsyn.scores[order(fdr)] - }else{ - #Assuming poisson distribution of mutation counts - #Now model observed number of mutations as a function of number of clusters and protein length. Calculate expected number of events based on poisson distribution. - nonsyn.scores = merge(getGeneSummary(maf), nonsyn.scores, by = 'Hugo_Symbol') - nonsyn.scores[,fract_muts_in_clusters := muts_in_clusters/total] - - counts.glm = glm(formula = total ~ protLen+clusters, family = poisson(link = identity), data = nonsyn.scores) #Poisson model - nonsyn.scores$Expected = counts.glm$fitted.values #Get expected number of events (mutations) from the model - - observed_mut_colIndex = which(colnames(nonsyn.scores) == 'total') - expected_mut_colIndex = which(colnames(nonsyn.scores) == 'Expected') - - #Poisson test to caluclate difference (p-value) - nonsyn.scores$pval = apply(nonsyn.scores, 1, function(x) { - poisson.test(as.numeric(x[observed_mut_colIndex]), as.numeric(x[expected_mut_colIndex]))$p.value - }) - - nonsyn.scores$fdr = p.adjust(nonsyn.scores$pval) - nonsyn.scores = nonsyn.scores[order(fdr)] - } - message('Done !') - return(nonsyn.scores) - } - - -laml = read.maf(maf = args$input_maf, removeSilent = F, useAll = T) - -if(is.null(args$gene_blacklist)){ - laml.sig = oncodrive(maf =laml, AACol = aacol, pvalMethod = 'zscore',minMut = min_mut) - write.table(laml.sig,file=args$output_detail, quote=FALSE,row.names=FALSE,sep="\t") - pdf(args$output_plot) - plotOncodrive(res=laml.sig,fdrCutOff=as.numeric(args$fdr),useFraction=TRUE) - dev.off() - }else{ - all_genes <- read.table(args$gene_blacklist, stringsAsFactors=FALSE)[,1] - laml.sig = oncodrive(maf =laml, AACol = aacol, pvalMethod = 'zscore',minMut = min_mut,ignoreGenes=all_genes) - write.table(laml.sig,file=args$output_detail, quote=FALSE,row.names=FALSE,sep="\t") - pdf(args$output_plot) - plotOncodrive(res=laml.sig,fdrCutOff=as.numeric(args$fdr),useFraction=TRUE) - dev.off() -} \ No newline at end of file
