# HG changeset patch # User davidvanzessen # Date 1459434390 14400 # Node ID 11ec9edfefeea5366ae9242f0d63ab80025f674d # Parent a073fa12ef98b109d0c4b5afb2e2da867bd6232e Uploaded diff -r a073fa12ef98 -r 11ec9edfefee report_clonality/RScript.r --- a/report_clonality/RScript.r Fri Mar 18 08:02:22 2016 -0400 +++ b/report_clonality/RScript.r Thu Mar 31 10:26:30 2016 -0400 @@ -64,18 +64,32 @@ inputdata$clonaltype = 1:nrow(inputdata) +#keep track of the count of sequences in samples or samples/replicates for the front page overview +input.sample.count = data.frame(data.table(inputdata)[, list(All=.N), by=c("Sample")]) +input.rep.count = data.frame(data.table(inputdata)[, list(All=.N), by=c("Sample", "Replicate")]) + PRODF = inputdata UNPROD = inputdata if(filterproductive){ if("Functionality" %in% colnames(inputdata)) { # "Functionality" is an IMGT column - PRODF = inputdata[inputdata$Functionality == "productive" | inputdata$Functionality == "productive (see comment)", ] - UNPROD = inputdata[!(inputdata$Functionality == "productive" | inputdata$Functionality == "productive (see comment)"), ] + #PRODF = inputdata[inputdata$Functionality == "productive" | inputdata$Functionality == "productive (see comment)", ] + PRODF = inputdata[inputdata$Functionality %in% c("productive (see comment)","productive"),] + + PRODF.count = data.frame(data.table(PRODF)[, list(count=.N), by=c("Sample")]) + + UNPROD = inputdata[inputdata$Functionality %in% c("unproductive (see comment)","unproductive"), ] } else { PRODF = inputdata[inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND" , ] UNPROD = inputdata[!(inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND" ), ] } } +prod.sample.count = data.frame(data.table(PRODF)[, list(Productive=.N), by=c("Sample")]) +prod.rep.count = data.frame(data.table(PRODF)[, list(Productive=.N), by=c("Sample", "Replicate")]) + +unprod.sample.count = data.frame(data.table(UNPROD)[, list(Unproductive=.N), by=c("Sample")]) +unprod.rep.count = data.frame(data.table(UNPROD)[, list(Unproductive=.N), by=c("Sample", "Replicate")]) + clonalityFrame = PRODF #remove duplicates based on the clonaltype @@ -83,7 +97,7 @@ clonaltype = paste(clonaltype, ",Sample", sep="") #add sample column to clonaltype, unique within samples PRODF$clonaltype = do.call(paste, c(PRODF[unlist(strsplit(clonaltype, ","))], sep = ":")) PRODF = PRODF[!duplicated(PRODF$clonaltype), ] - + UNPROD$clonaltype = do.call(paste, c(UNPROD[unlist(strsplit(clonaltype, ","))], sep = ":")) UNPROD = UNPROD[!duplicated(UNPROD$clonaltype), ] @@ -93,6 +107,12 @@ clonalityFrame = clonalityFrame[!duplicated(clonalityFrame$clonality_clonaltype), ] } +prod.unique.sample.count = data.frame(data.table(PRODF)[, list(Productive_unique=.N), by=c("Sample")]) +prod.unique.rep.count = data.frame(data.table(PRODF)[, list(Productive_unique=.N), by=c("Sample", "Replicate")]) + +unprod.unique.sample.count = data.frame(data.table(UNPROD)[, list(Unproductive_unique=.N), by=c("Sample")]) +unprod.unique.rep.count = data.frame(data.table(UNPROD)[, list(Unproductive_unique=.N), by=c("Sample", "Replicate")]) + PRODF$freq = 1 if(any(grepl(pattern="_", x=PRODF$ID))){ #the frequency can be stored in the ID with the pattern ".*_freq_.*" @@ -122,65 +142,35 @@ print("Report Clonality - counting productive/unproductive/unique") -if(!("Functionality" %in% inputdata)){ #add a functionality column to the igblast data - inputdata$Functionality = "unproductive" - search = (inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND") - if(sum(search) > 0){ - inputdata[search,]$Functionality = "productive" - } -} - -inputdata.dt = data.table(inputdata) #for speed - -if(clonaltype == "none"){ - ct = c("clonaltype") -} +#create the table on the overview page with the productive/unique counts per sample/replicate +#first for sample +sample.count = merge(input.sample.count, prod.sample.count, by="Sample") +sample.count$perc_prod = round(sample.count$Productive / sample.count$All * 100) +sample.count = merge(sample.count, prod.unique.sample.count, by="Sample") +sample.count$perc_prod_un = round(sample.count$Productive_unique / sample.count$All * 100) -inputdata.dt$samples_replicates = paste(inputdata.dt$Sample, inputdata.dt$Replicate, sep="_") -samples_replicates = c(unique(inputdata.dt$samples_replicates), unique(as.character(inputdata.dt$Sample))) -frequency_table = data.frame(ID = samples_replicates[order(samples_replicates)]) - - -sample_productive_count = inputdata.dt[, list(All=.N, - Productive = nrow(.SD[.SD$Functionality == "productive" | .SD$Functionality == "productive (see comment)",]), - perc_prod = 1, - Productive_unique = nrow(.SD[.SD$Functionality == "productive" | .SD$Functionality == "productive (see comment)",list(count=.N),by=ct]), - perc_prod_un = 1, - Unproductive= nrow(.SD[.SD$Functionality != "productive" & .SD$Functionality != "productive (see comment)",]), - perc_unprod = 1, - Unproductive_unique =nrow(.SD[.SD$Functionality != "productive" & .SD$Functionality != "productive (see comment)",list(count=.N),by=ct]), - perc_unprod_un = 1), - by=c("Sample")] +sample.count = merge(sample.count , unprod.sample.count, by="Sample") +sample.count$perc_unprod = round(sample.count$Unproductive / sample.count$All * 100) +sample.count = merge(sample.count, unprod.unique.sample.count, by="Sample") +sample.count$perc_unprod_un = round(sample.count$Unproductive_unique / sample.count$All * 100) -sample_productive_count$perc_prod = round(sample_productive_count$Productive / sample_productive_count$All * 100) -sample_productive_count$perc_prod_un = round(sample_productive_count$Productive_unique / sample_productive_count$All * 100) - -sample_productive_count$perc_unprod = round(sample_productive_count$Unproductive / sample_productive_count$All * 100) -sample_productive_count$perc_unprod_un = round(sample_productive_count$Unproductive_unique / sample_productive_count$All * 100) +#then sample/replicate +rep.count = merge(input.rep.count, prod.rep.count, by=c("Sample", "Replicate")) +rep.count$perc_prod = round(rep.count$Productive / rep.count$All * 100) +rep.count = merge(rep.count, prod.unique.rep.count, by=c("Sample", "Replicate")) +rep.count$perc_prod_un = round(rep.count$Productive_unique / rep.count$All * 100) -sample_replicate_productive_count = inputdata.dt[, list(All=.N, - Productive = nrow(.SD[.SD$Functionality == "productive" | .SD$Functionality == "productive (see comment)",]), - perc_prod = 1, - Productive_unique = nrow(.SD[.SD$Functionality == "productive" | .SD$Functionality == "productive (see comment)",list(count=.N),by=ct]), - perc_prod_un = 1, - Unproductive= nrow(.SD[.SD$Functionality != "productive" & .SD$Functionality != "productive (see comment)",]), - perc_unprod = 1, - Unproductive_unique =nrow(.SD[.SD$Functionality != "productive" & .SD$Functionality != "productive (see comment)",list(count=.N),by=ct]), - perc_unprod_un = 1), - by=c("samples_replicates")] +rep.count = merge(rep.count, unprod.rep.count, by=c("Sample", "Replicate")) +rep.count$perc_unprod = round(rep.count$Unproductive / rep.count$All * 100) +rep.count = merge(rep.count, unprod.unique.rep.count, by=c("Sample", "Replicate")) +rep.count$perc_unprod_un = round(rep.count$Unproductive_unique / rep.count$All * 100) -sample_replicate_productive_count$perc_prod = round(sample_replicate_productive_count$Productive / sample_replicate_productive_count$All * 100) -sample_replicate_productive_count$perc_prod_un = round(sample_replicate_productive_count$Productive_unique / sample_replicate_productive_count$All * 100) - -sample_replicate_productive_count$perc_unprod = round(sample_replicate_productive_count$Unproductive / sample_replicate_productive_count$All * 100) -sample_replicate_productive_count$perc_unprod_un = round(sample_replicate_productive_count$Unproductive_unique / sample_replicate_productive_count$All * 100) +rep.count$Sample = paste(rep.count$Sample, rep.count$Replicate, sep="_") +rep.count = rep.count[,names(rep.count) != "Replicate"] -setnames(sample_replicate_productive_count, colnames(sample_productive_count)) +count = rbind(sample.count, rep.count) -counts = rbind(sample_replicate_productive_count, sample_productive_count) -counts = counts[order(counts$Sample),] - -write.table(x=counts, file="productive_counting.txt", sep=",",quote=F,row.names=F,col.names=F) +write.table(x=count, file="productive_counting.txt", sep=",",quote=F,row.names=F,col.names=F) # ---------------------- Frequency calculation for V, D and J ---------------------- @@ -381,6 +371,7 @@ if(length(dat[,1]) == 0){ return() } + img = ggplot() + geom_tile(data=dat, aes(x=factor(reorder(Top.D.Gene, chr.orderD)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=relLength)) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + @@ -402,15 +393,17 @@ VandDCount = merge(VandDCount, maxVD, by.x="Sample", by.y="Sample", all.x=T) VandDCount$relLength = VandDCount$l / VandDCount$max - cartegianProductVD = expand.grid(Top.V.Gene = Vchain$v.name, Top.D.Gene = Dchain$v.name, Sample = unique(inputdata$Sample)) + cartegianProductVD = expand.grid(Top.V.Gene = Vchain$v.name, Top.D.Gene = Dchain$v.name) - completeVD = merge(VandDCount, cartegianProductVD, all.y=TRUE) + completeVD = merge(VandDCount, cartegianProductVD, by.x=c("Top.V.Gene", "Top.D.Gene"), by.y=c("Top.V.Gene", "Top.D.Gene"), all=TRUE) + completeVD = merge(completeVD, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE) + completeVD = merge(completeVD, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE) fltr = is.nan(completeVD$relLength) - if(any(fltr)){ - completeVD[fltr,"relLength"] = 1 + if(all(fltr)){ + completeVD[fltr,"relLength"] = 0 } VDList = split(completeVD, f=completeVD[,"Sample"]) @@ -445,7 +438,7 @@ VandJCount = merge(VandJCount, maxVJ, by.x="Sample", by.y="Sample", all.x=T) VandJCount$relLength = VandJCount$l / VandJCount$max -cartegianProductVJ = expand.grid(Top.V.Gene = Vchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(inputdata$Sample)) +cartegianProductVJ = expand.grid(Top.V.Gene = Vchain$v.name, Top.J.Gene = Jchain$v.name) completeVJ = merge(VandJCount, cartegianProductVJ, all.y=TRUE) completeVJ = merge(completeVJ, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE) @@ -489,7 +482,7 @@ DandJCount = merge(DandJCount, maxDJ, by.x="Sample", by.y="Sample", all.x=T) DandJCount$relLength = DandJCount$l / DandJCount$max - cartegianProductDJ = expand.grid(Top.D.Gene = Dchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(inputdata$Sample)) + cartegianProductDJ = expand.grid(Top.D.Gene = Dchain$v.name, Top.J.Gene = Jchain$v.name) completeDJ = merge(DandJCount, cartegianProductDJ, all.y=TRUE) completeDJ = merge(completeDJ, revDchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE)