# HG changeset patch
# User davidvanzessen
# Date 1459434390 14400
# Node ID 11ec9edfefeea5366ae9242f0d63ab80025f674d
# Parent  a073fa12ef98b109d0c4b5afb2e2da867bd6232e
Uploaded

diff -r a073fa12ef98 -r 11ec9edfefee report_clonality/RScript.r
--- a/report_clonality/RScript.r	Fri Mar 18 08:02:22 2016 -0400
+++ b/report_clonality/RScript.r	Thu Mar 31 10:26:30 2016 -0400
@@ -64,18 +64,32 @@
 
 inputdata$clonaltype = 1:nrow(inputdata)
 
+#keep track of the count of sequences in samples or samples/replicates for the front page overview
+input.sample.count = data.frame(data.table(inputdata)[, list(All=.N), by=c("Sample")])
+input.rep.count = data.frame(data.table(inputdata)[, list(All=.N), by=c("Sample", "Replicate")])
+
 PRODF = inputdata
 UNPROD = inputdata
 if(filterproductive){
   if("Functionality" %in% colnames(inputdata)) { # "Functionality" is an IMGT column
-    PRODF = inputdata[inputdata$Functionality == "productive" | inputdata$Functionality == "productive (see comment)", ]
-    UNPROD = inputdata[!(inputdata$Functionality == "productive" | inputdata$Functionality == "productive (see comment)"), ]
+    #PRODF = inputdata[inputdata$Functionality == "productive" | inputdata$Functionality == "productive (see comment)", ]
+    PRODF = inputdata[inputdata$Functionality %in% c("productive (see comment)","productive"),]
+    
+    PRODF.count = data.frame(data.table(PRODF)[, list(count=.N), by=c("Sample")])
+    
+    UNPROD = inputdata[inputdata$Functionality %in% c("unproductive (see comment)","unproductive"), ]
   } else {
     PRODF = inputdata[inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND" , ]
     UNPROD = inputdata[!(inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND" ), ]
   }
 }
 
+prod.sample.count = data.frame(data.table(PRODF)[, list(Productive=.N), by=c("Sample")])
+prod.rep.count = data.frame(data.table(PRODF)[, list(Productive=.N), by=c("Sample", "Replicate")])
+
+unprod.sample.count = data.frame(data.table(UNPROD)[, list(Unproductive=.N), by=c("Sample")])
+unprod.rep.count = data.frame(data.table(UNPROD)[, list(Unproductive=.N), by=c("Sample", "Replicate")])
+
 clonalityFrame = PRODF
 
 #remove duplicates based on the clonaltype
@@ -83,7 +97,7 @@
   clonaltype = paste(clonaltype, ",Sample", sep="") #add sample column to clonaltype, unique within samples
   PRODF$clonaltype = do.call(paste, c(PRODF[unlist(strsplit(clonaltype, ","))], sep = ":"))
   PRODF = PRODF[!duplicated(PRODF$clonaltype), ]
-  
+    
   UNPROD$clonaltype = do.call(paste, c(UNPROD[unlist(strsplit(clonaltype, ","))], sep = ":"))
   UNPROD = UNPROD[!duplicated(UNPROD$clonaltype), ]
   
@@ -93,6 +107,12 @@
   clonalityFrame = clonalityFrame[!duplicated(clonalityFrame$clonality_clonaltype), ]
 }
 
+prod.unique.sample.count = data.frame(data.table(PRODF)[, list(Productive_unique=.N), by=c("Sample")])
+prod.unique.rep.count = data.frame(data.table(PRODF)[, list(Productive_unique=.N), by=c("Sample", "Replicate")])
+
+unprod.unique.sample.count = data.frame(data.table(UNPROD)[, list(Unproductive_unique=.N), by=c("Sample")])
+unprod.unique.rep.count = data.frame(data.table(UNPROD)[, list(Unproductive_unique=.N), by=c("Sample", "Replicate")])
+
 PRODF$freq = 1
 
 if(any(grepl(pattern="_", x=PRODF$ID))){ #the frequency can be stored in the ID with the pattern ".*_freq_.*"
@@ -122,65 +142,35 @@
 
 print("Report Clonality - counting productive/unproductive/unique")
 
-if(!("Functionality" %in% inputdata)){ #add a functionality column to the igblast data
-  inputdata$Functionality = "unproductive"
-  search = (inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND")
-  if(sum(search) > 0){
-    inputdata[search,]$Functionality = "productive"
-  }
-}
-
-inputdata.dt = data.table(inputdata) #for speed
-
-if(clonaltype == "none"){
-  ct = c("clonaltype")
-}
+#create the table on the overview page with the productive/unique counts per sample/replicate
+#first for sample
+sample.count = merge(input.sample.count, prod.sample.count, by="Sample")
+sample.count$perc_prod = round(sample.count$Productive / sample.count$All * 100)
+sample.count = merge(sample.count, prod.unique.sample.count, by="Sample")
+sample.count$perc_prod_un = round(sample.count$Productive_unique / sample.count$All * 100)
 
-inputdata.dt$samples_replicates = paste(inputdata.dt$Sample, inputdata.dt$Replicate, sep="_")
-samples_replicates = c(unique(inputdata.dt$samples_replicates), unique(as.character(inputdata.dt$Sample)))
-frequency_table = data.frame(ID = samples_replicates[order(samples_replicates)])
-
-
-sample_productive_count = inputdata.dt[, list(All=.N, 
-                                              Productive = nrow(.SD[.SD$Functionality == "productive" | .SD$Functionality == "productive (see comment)",]), 
-                                              perc_prod = 1,
-                                              Productive_unique = nrow(.SD[.SD$Functionality == "productive" | .SD$Functionality == "productive (see comment)",list(count=.N),by=ct]), 
-                                              perc_prod_un = 1,
-                                              Unproductive= nrow(.SD[.SD$Functionality != "productive" & .SD$Functionality != "productive (see comment)",]),
-                                              perc_unprod = 1,
-                                              Unproductive_unique =nrow(.SD[.SD$Functionality != "productive" & .SD$Functionality != "productive (see comment)",list(count=.N),by=ct]),
-                                              perc_unprod_un = 1),
-                                       by=c("Sample")]
+sample.count = merge(sample.count , unprod.sample.count, by="Sample")
+sample.count$perc_unprod = round(sample.count$Unproductive / sample.count$All * 100)
+sample.count = merge(sample.count, unprod.unique.sample.count, by="Sample")
+sample.count$perc_unprod_un = round(sample.count$Unproductive_unique / sample.count$All * 100)
 
-sample_productive_count$perc_prod = round(sample_productive_count$Productive / sample_productive_count$All * 100)
-sample_productive_count$perc_prod_un = round(sample_productive_count$Productive_unique / sample_productive_count$All * 100)
-
-sample_productive_count$perc_unprod = round(sample_productive_count$Unproductive / sample_productive_count$All * 100)
-sample_productive_count$perc_unprod_un = round(sample_productive_count$Unproductive_unique / sample_productive_count$All * 100)
+#then sample/replicate
+rep.count = merge(input.rep.count, prod.rep.count, by=c("Sample", "Replicate"))
+rep.count$perc_prod = round(rep.count$Productive / rep.count$All * 100)
+rep.count = merge(rep.count, prod.unique.rep.count, by=c("Sample", "Replicate"))
+rep.count$perc_prod_un = round(rep.count$Productive_unique / rep.count$All * 100)
 
-sample_replicate_productive_count = inputdata.dt[, list(All=.N, 
-                                                        Productive = nrow(.SD[.SD$Functionality == "productive" | .SD$Functionality == "productive (see comment)",]), 
-                                                        perc_prod = 1,
-                                                        Productive_unique = nrow(.SD[.SD$Functionality == "productive" | .SD$Functionality == "productive (see comment)",list(count=.N),by=ct]), 
-                                                        perc_prod_un = 1,
-                                                        Unproductive= nrow(.SD[.SD$Functionality != "productive" & .SD$Functionality != "productive (see comment)",]),
-                                                        perc_unprod = 1,
-                                                        Unproductive_unique =nrow(.SD[.SD$Functionality != "productive" & .SD$Functionality != "productive (see comment)",list(count=.N),by=ct]),
-                                                        perc_unprod_un = 1),
-                                                 by=c("samples_replicates")]
+rep.count = merge(rep.count, unprod.rep.count, by=c("Sample", "Replicate"))
+rep.count$perc_unprod = round(rep.count$Unproductive / rep.count$All * 100)
+rep.count = merge(rep.count, unprod.unique.rep.count, by=c("Sample", "Replicate"))
+rep.count$perc_unprod_un = round(rep.count$Unproductive_unique / rep.count$All * 100)
 
-sample_replicate_productive_count$perc_prod = round(sample_replicate_productive_count$Productive / sample_replicate_productive_count$All * 100)
-sample_replicate_productive_count$perc_prod_un = round(sample_replicate_productive_count$Productive_unique / sample_replicate_productive_count$All * 100)
-
-sample_replicate_productive_count$perc_unprod = round(sample_replicate_productive_count$Unproductive / sample_replicate_productive_count$All * 100)
-sample_replicate_productive_count$perc_unprod_un = round(sample_replicate_productive_count$Unproductive_unique / sample_replicate_productive_count$All * 100)
+rep.count$Sample = paste(rep.count$Sample, rep.count$Replicate, sep="_")
+rep.count = rep.count[,names(rep.count) != "Replicate"]
 
-setnames(sample_replicate_productive_count, colnames(sample_productive_count))
+count = rbind(sample.count, rep.count)
 
-counts = rbind(sample_replicate_productive_count, sample_productive_count)
-counts = counts[order(counts$Sample),]
-
-write.table(x=counts, file="productive_counting.txt", sep=",",quote=F,row.names=F,col.names=F)
+write.table(x=count, file="productive_counting.txt", sep=",",quote=F,row.names=F,col.names=F)
 
 # ---------------------- Frequency calculation for V, D and J ----------------------
 
@@ -381,6 +371,7 @@
     if(length(dat[,1]) == 0){
       return()
     }
+    
     img = ggplot() + 
       geom_tile(data=dat, aes(x=factor(reorder(Top.D.Gene, chr.orderD)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=relLength)) + 
       theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
@@ -402,15 +393,17 @@
   VandDCount = merge(VandDCount, maxVD, by.x="Sample", by.y="Sample", all.x=T)
   VandDCount$relLength = VandDCount$l / VandDCount$max
   
-  cartegianProductVD = expand.grid(Top.V.Gene = Vchain$v.name, Top.D.Gene = Dchain$v.name, Sample = unique(inputdata$Sample))
+  cartegianProductVD = expand.grid(Top.V.Gene = Vchain$v.name, Top.D.Gene = Dchain$v.name)
   
-  completeVD = merge(VandDCount, cartegianProductVD, all.y=TRUE)
+  completeVD = merge(VandDCount, cartegianProductVD, by.x=c("Top.V.Gene", "Top.D.Gene"), by.y=c("Top.V.Gene", "Top.D.Gene"), all=TRUE)
+ 
   completeVD = merge(completeVD, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE)
+ 
   completeVD = merge(completeVD, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE)
   
   fltr = is.nan(completeVD$relLength)
-  if(any(fltr)){
-	  completeVD[fltr,"relLength"] = 1
+  if(all(fltr)){
+	  completeVD[fltr,"relLength"] = 0
   }
   
   VDList = split(completeVD, f=completeVD[,"Sample"])
@@ -445,7 +438,7 @@
 VandJCount = merge(VandJCount, maxVJ, by.x="Sample", by.y="Sample", all.x=T)
 VandJCount$relLength = VandJCount$l / VandJCount$max
 
-cartegianProductVJ = expand.grid(Top.V.Gene = Vchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(inputdata$Sample))
+cartegianProductVJ = expand.grid(Top.V.Gene = Vchain$v.name, Top.J.Gene = Jchain$v.name)
 
 completeVJ = merge(VandJCount, cartegianProductVJ, all.y=TRUE)
 completeVJ = merge(completeVJ, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE)
@@ -489,7 +482,7 @@
   DandJCount = merge(DandJCount, maxDJ, by.x="Sample", by.y="Sample", all.x=T)
   DandJCount$relLength = DandJCount$l / DandJCount$max
   
-  cartegianProductDJ = expand.grid(Top.D.Gene = Dchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(inputdata$Sample))
+  cartegianProductDJ = expand.grid(Top.D.Gene = Dchain$v.name, Top.J.Gene = Jchain$v.name)
   
   completeDJ = merge(DandJCount, cartegianProductDJ, all.y=TRUE)
   completeDJ = merge(completeDJ, revDchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE)