Mercurial > repos > davidvanzessen > argalaxy_tools

--- a/report_clonality/RScript.r	Fri Mar 18 08:02:22 2016 -0400
+++ b/report_clonality/RScript.r	Thu Mar 31 10:26:30 2016 -0400
@@ -64,18 +64,32 @@

 inputdata$clonaltype = 1:nrow(inputdata)

+#keep track of the count of sequences in samples or samples/replicates for the front page overview
+input.sample.count = data.frame(data.table(inputdata)[, list(All=.N), by=c("Sample")])
+input.rep.count = data.frame(data.table(inputdata)[, list(All=.N), by=c("Sample", "Replicate")])
+
 PRODF = inputdata
 UNPROD = inputdata
 if(filterproductive){
   if("Functionality" %in% colnames(inputdata)) { # "Functionality" is an IMGT column
-    PRODF = inputdata[inputdata$Functionality == "productive" | inputdata$Functionality == "productive (see comment)", ]
-    UNPROD = inputdata[!(inputdata$Functionality == "productive" | inputdata$Functionality == "productive (see comment)"), ]
+    #PRODF = inputdata[inputdata$Functionality == "productive" | inputdata$Functionality == "productive (see comment)", ]
+    PRODF = inputdata[inputdata$Functionality %in% c("productive (see comment)","productive"),]
+
+    PRODF.count = data.frame(data.table(PRODF)[, list(count=.N), by=c("Sample")])
+
+    UNPROD = inputdata[inputdata$Functionality %in% c("unproductive (see comment)","unproductive"), ]
   } else {
     PRODF = inputdata[inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND" , ]
     UNPROD = inputdata[!(inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND" ), ]
   }
 }

+prod.sample.count = data.frame(data.table(PRODF)[, list(Productive=.N), by=c("Sample")])
+prod.rep.count = data.frame(data.table(PRODF)[, list(Productive=.N), by=c("Sample", "Replicate")])
+
+unprod.sample.count = data.frame(data.table(UNPROD)[, list(Unproductive=.N), by=c("Sample")])
+unprod.rep.count = data.frame(data.table(UNPROD)[, list(Unproductive=.N), by=c("Sample", "Replicate")])
+
 clonalityFrame = PRODF

 #remove duplicates based on the clonaltype
@@ -83,7 +97,7 @@
   clonaltype = paste(clonaltype, ",Sample", sep="") #add sample column to clonaltype, unique within samples
   PRODF$clonaltype = do.call(paste, c(PRODF[unlist(strsplit(clonaltype, ","))], sep = ":"))
   PRODF = PRODF[!duplicated(PRODF$clonaltype), ]
-
+
   UNPROD$clonaltype = do.call(paste, c(UNPROD[unlist(strsplit(clonaltype, ","))], sep = ":"))
   UNPROD = UNPROD[!duplicated(UNPROD$clonaltype), ]

@@ -93,6 +107,12 @@
   clonalityFrame = clonalityFrame[!duplicated(clonalityFrame$clonality_clonaltype), ]
 }

+prod.unique.sample.count = data.frame(data.table(PRODF)[, list(Productive_unique=.N), by=c("Sample")])
+prod.unique.rep.count = data.frame(data.table(PRODF)[, list(Productive_unique=.N), by=c("Sample", "Replicate")])
+
+unprod.unique.sample.count = data.frame(data.table(UNPROD)[, list(Unproductive_unique=.N), by=c("Sample")])
+unprod.unique.rep.count = data.frame(data.table(UNPROD)[, list(Unproductive_unique=.N), by=c("Sample", "Replicate")])
+
 PRODF$freq = 1

 if(any(grepl(pattern="_", x=PRODF$ID))){ #the frequency can be stored in the ID with the pattern ".*_freq_.*"
@@ -122,65 +142,35 @@

 print("Report Clonality - counting productive/unproductive/unique")

-if(!("Functionality" %in% inputdata)){ #add a functionality column to the igblast data
-  inputdata$Functionality = "unproductive"
-  search = (inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND")
-  if(sum(search) > 0){
-    inputdata[search,]$Functionality = "productive"
-  }
-}
-
-inputdata.dt = data.table(inputdata) #for speed
-
-if(clonaltype == "none"){
-  ct = c("clonaltype")
-}
+#create the table on the overview page with the productive/unique counts per sample/replicate
+#first for sample
+sample.count = merge(input.sample.count, prod.sample.count, by="Sample")
+sample.count$perc_prod = round(sample.count$Productive / sample.count$All * 100)
+sample.count = merge(sample.count, prod.unique.sample.count, by="Sample")
+sample.count$perc_prod_un = round(sample.count$Productive_unique / sample.count$All * 100)

-inputdata.dt$samples_replicates = paste(inputdata.dt$Sample, inputdata.dt$Replicate, sep="_")
-samples_replicates = c(unique(inputdata.dt$samples_replicates), unique(as.character(inputdata.dt$Sample)))
-frequency_table = data.frame(ID = samples_replicates[order(samples_replicates)])
-
-
-sample_productive_count = inputdata.dt[, list(All=.N,
-                                              Productive = nrow(.SD[.SD$Functionality == "productive" | .SD$Functionality == "productive (see comment)",]),
-                                              perc_prod = 1,
-                                              Productive_unique = nrow(.SD[.SD$Functionality == "productive" | .SD$Functionality == "productive (see comment)",list(count=.N),by=ct]),
-                                              perc_prod_un = 1,
-                                              Unproductive= nrow(.SD[.SD$Functionality != "productive" & .SD$Functionality != "productive (see comment)",]),
-                                              perc_unprod = 1,
-                                              Unproductive_unique =nrow(.SD[.SD$Functionality != "productive" & .SD$Functionality != "productive (see comment)",list(count=.N),by=ct]),
-                                              perc_unprod_un = 1),
-                                       by=c("Sample")]
+sample.count = merge(sample.count , unprod.sample.count, by="Sample")
+sample.count$perc_unprod = round(sample.count$Unproductive / sample.count$All * 100)
+sample.count = merge(sample.count, unprod.unique.sample.count, by="Sample")
+sample.count$perc_unprod_un = round(sample.count$Unproductive_unique / sample.count$All * 100)

-sample_productive_count$perc_prod = round(sample_productive_count$Productive / sample_productive_count$All * 100)
-sample_productive_count$perc_prod_un = round(sample_productive_count$Productive_unique / sample_productive_count$All * 100)
-
-sample_productive_count$perc_unprod = round(sample_productive_count$Unproductive / sample_productive_count$All * 100)
-sample_productive_count$perc_unprod_un = round(sample_productive_count$Unproductive_unique / sample_productive_count$All * 100)
+#then sample/replicate
+rep.count = merge(input.rep.count, prod.rep.count, by=c("Sample", "Replicate"))
+rep.count$perc_prod = round(rep.count$Productive / rep.count$All * 100)
+rep.count = merge(rep.count, prod.unique.rep.count, by=c("Sample", "Replicate"))
+rep.count$perc_prod_un = round(rep.count$Productive_unique / rep.count$All * 100)

-sample_replicate_productive_count = inputdata.dt[, list(All=.N,
-                                                        Productive = nrow(.SD[.SD$Functionality == "productive" | .SD$Functionality == "productive (see comment)",]),
-                                                        perc_prod = 1,
-                                                        Productive_unique = nrow(.SD[.SD$Functionality == "productive" | .SD$Functionality == "productive (see comment)",list(count=.N),by=ct]),
-                                                        perc_prod_un = 1,
-                                                        Unproductive= nrow(.SD[.SD$Functionality != "productive" & .SD$Functionality != "productive (see comment)",]),
-                                                        perc_unprod = 1,
-                                                        Unproductive_unique =nrow(.SD[.SD$Functionality != "productive" & .SD$Functionality != "productive (see comment)",list(count=.N),by=ct]),
-                                                        perc_unprod_un = 1),
-                                                 by=c("samples_replicates")]
+rep.count = merge(rep.count, unprod.rep.count, by=c("Sample", "Replicate"))
+rep.count$perc_unprod = round(rep.count$Unproductive / rep.count$All * 100)
+rep.count = merge(rep.count, unprod.unique.rep.count, by=c("Sample", "Replicate"))
+rep.count$perc_unprod_un = round(rep.count$Unproductive_unique / rep.count$All * 100)

-sample_replicate_productive_count$perc_prod = round(sample_replicate_productive_count$Productive / sample_replicate_productive_count$All * 100)
-sample_replicate_productive_count$perc_prod_un = round(sample_replicate_productive_count$Productive_unique / sample_replicate_productive_count$All * 100)
-
-sample_replicate_productive_count$perc_unprod = round(sample_replicate_productive_count$Unproductive / sample_replicate_productive_count$All * 100)
-sample_replicate_productive_count$perc_unprod_un = round(sample_replicate_productive_count$Unproductive_unique / sample_replicate_productive_count$All * 100)
+rep.count$Sample = paste(rep.count$Sample, rep.count$Replicate, sep="_")
+rep.count = rep.count[,names(rep.count) != "Replicate"]

-setnames(sample_replicate_productive_count, colnames(sample_productive_count))
+count = rbind(sample.count, rep.count)

-counts = rbind(sample_replicate_productive_count, sample_productive_count)
-counts = counts[order(counts$Sample),]
-
-write.table(x=counts, file="productive_counting.txt", sep=",",quote=F,row.names=F,col.names=F)
+write.table(x=count, file="productive_counting.txt", sep=",",quote=F,row.names=F,col.names=F)

 # ---------------------- Frequency calculation for V, D and J ----------------------

@@ -381,6 +371,7 @@
     if(length(dat[,1]) == 0){
       return()
     }
+
     img = ggplot() +
       geom_tile(data=dat, aes(x=factor(reorder(Top.D.Gene, chr.orderD)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=relLength)) +
       theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
@@ -402,15 +393,17 @@
   VandDCount = merge(VandDCount, maxVD, by.x="Sample", by.y="Sample", all.x=T)
   VandDCount$relLength = VandDCount$l / VandDCount$max

-  cartegianProductVD = expand.grid(Top.V.Gene = Vchain$v.name, Top.D.Gene = Dchain$v.name, Sample = unique(inputdata$Sample))
+  cartegianProductVD = expand.grid(Top.V.Gene = Vchain$v.name, Top.D.Gene = Dchain$v.name)

-  completeVD = merge(VandDCount, cartegianProductVD, all.y=TRUE)
+  completeVD = merge(VandDCount, cartegianProductVD, by.x=c("Top.V.Gene", "Top.D.Gene"), by.y=c("Top.V.Gene", "Top.D.Gene"), all=TRUE)
+
   completeVD = merge(completeVD, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE)
+
   completeVD = merge(completeVD, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE)

   fltr = is.nan(completeVD$relLength)
-  if(any(fltr)){
-	  completeVD[fltr,"relLength"] = 1
+  if(all(fltr)){
+	  completeVD[fltr,"relLength"] = 0
   }

   VDList = split(completeVD, f=completeVD[,"Sample"])
@@ -445,7 +438,7 @@
 VandJCount = merge(VandJCount, maxVJ, by.x="Sample", by.y="Sample", all.x=T)
 VandJCount$relLength = VandJCount$l / VandJCount$max

-cartegianProductVJ = expand.grid(Top.V.Gene = Vchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(inputdata$Sample))
+cartegianProductVJ = expand.grid(Top.V.Gene = Vchain$v.name, Top.J.Gene = Jchain$v.name)

 completeVJ = merge(VandJCount, cartegianProductVJ, all.y=TRUE)
 completeVJ = merge(completeVJ, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE)
@@ -489,7 +482,7 @@
   DandJCount = merge(DandJCount, maxDJ, by.x="Sample", by.y="Sample", all.x=T)
   DandJCount$relLength = DandJCount$l / DandJCount$max

-  cartegianProductDJ = expand.grid(Top.D.Gene = Dchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(inputdata$Sample))
+  cartegianProductDJ = expand.grid(Top.D.Gene = Dchain$v.name, Top.J.Gene = Jchain$v.name)

   completeDJ = merge(DandJCount, cartegianProductDJ, all.y=TRUE)
   completeDJ = merge(completeDJ, revDchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE)