argalaxy_tools: report_clonality/RScript.r comparison

comparison report_clonality/RScript.r @ 59:11ec9edfefee draft

Uploaded

author	davidvanzessen
date	Thu, 31 Mar 2016 10:26:30 -0400
parents	a073fa12ef98
children	14ea4c464435

comparison

Legend: equal, deleted, inserted, replaced

-:a073fa12ef98
+:11ec9edfefee
 #filter uniques
 inputdata.removed = inputdata[NULL,]
 inputdata$clonaltype = 1:nrow(inputdata)
+#keep track of the count of sequences in samples or samples/replicates for the front page overview
+input.sample.count = data.frame(data.table(inputdata)[, list(All=.N), by=c("Sample")])
+input.rep.count = data.frame(data.table(inputdata)[, list(All=.N), by=c("Sample", "Replicate")])
 PRODF = inputdata
 UNPROD = inputdata
 if(filterproductive){
 if("Functionality" %in% colnames(inputdata)) { # "Functionality" is an IMGT column
-PRODF = inputdata[inputdata$Functionality == "productive" | inputdata$Functionality == "productive (see comment)", ]
+#PRODF = inputdata[inputdata$Functionality == "productive" | inputdata$Functionality == "productive (see comment)", ]
-UNPROD = inputdata[!(inputdata$Functionality == "productive" | inputdata$Functionality == "productive (see comment)"), ]
+PRODF = inputdata[inputdata$Functionality %in% c("productive (see comment)","productive"),]
+PRODF.count = data.frame(data.table(PRODF)[, list(count=.N), by=c("Sample")])
+UNPROD = inputdata[inputdata$Functionality %in% c("unproductive (see comment)","unproductive"), ]
 } else {
 PRODF = inputdata[inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND" , ]
 UNPROD = inputdata[!(inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND" ), ]
 }
 }
+prod.sample.count = data.frame(data.table(PRODF)[, list(Productive=.N), by=c("Sample")])
+prod.rep.count = data.frame(data.table(PRODF)[, list(Productive=.N), by=c("Sample", "Replicate")])
+unprod.sample.count = data.frame(data.table(UNPROD)[, list(Unproductive=.N), by=c("Sample")])
+unprod.rep.count = data.frame(data.table(UNPROD)[, list(Unproductive=.N), by=c("Sample", "Replicate")])
 clonalityFrame = PRODF
 #remove duplicates based on the clonaltype
 if(clonaltype != "none"){
 clonaltype = paste(clonaltype, ",Sample", sep="") #add sample column to clonaltype, unique within samples
 PRODF$clonaltype = do.call(paste, c(PRODF[unlist(strsplit(clonaltype, ","))], sep = ":"))
 PRODF = PRODF[!duplicated(PRODF$clonaltype), ]
 UNPROD$clonaltype = do.call(paste, c(UNPROD[unlist(strsplit(clonaltype, ","))], sep = ":"))
 UNPROD = UNPROD[!duplicated(UNPROD$clonaltype), ]
 #again for clonalityFrame but with sample+replicate
 clonalityFrame$clonaltype = do.call(paste, c(clonalityFrame[unlist(strsplit(clonaltype, ","))], sep = ":"))
 clonalityFrame$clonality_clonaltype = do.call(paste, c(clonalityFrame[unlist(strsplit(paste(clonaltype, ",Replicate", sep=""), ","))], sep = ":"))
 clonalityFrame = clonalityFrame[!duplicated(clonalityFrame$clonality_clonaltype), ]
 }
+prod.unique.sample.count = data.frame(data.table(PRODF)[, list(Productive_unique=.N), by=c("Sample")])
+prod.unique.rep.count = data.frame(data.table(PRODF)[, list(Productive_unique=.N), by=c("Sample", "Replicate")])
+unprod.unique.sample.count = data.frame(data.table(UNPROD)[, list(Unproductive_unique=.N), by=c("Sample")])
+unprod.unique.rep.count = data.frame(data.table(UNPROD)[, list(Unproductive_unique=.N), by=c("Sample", "Replicate")])
 PRODF$freq = 1
 if(any(grepl(pattern="_", x=PRODF$ID))){ #the frequency can be stored in the ID with the pattern ".*_freq_.*"
 PRODF$freq = gsub("^[0-9]+_", "", PRODF$ID)
 # ---------------------- Counting the productive/unproductive and unique sequences ----------------------
 print("Report Clonality - counting productive/unproductive/unique")
-if(!("Functionality" %in% inputdata)){ #add a functionality column to the igblast data
+#create the table on the overview page with the productive/unique counts per sample/replicate
-inputdata$Functionality = "unproductive"
+#first for sample
-search = (inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND")
+sample.count = merge(input.sample.count, prod.sample.count, by="Sample")
-if(sum(search) > 0){
+sample.count$perc_prod = round(sample.count$Productive / sample.count$All * 100)
-inputdata[search,]$Functionality = "productive"
+sample.count = merge(sample.count, prod.unique.sample.count, by="Sample")
-}
+sample.count$perc_prod_un = round(sample.count$Productive_unique / sample.count$All * 100)
-}
+sample.count = merge(sample.count , unprod.sample.count, by="Sample")
-inputdata.dt = data.table(inputdata) #for speed
+sample.count$perc_unprod = round(sample.count$Unproductive / sample.count$All * 100)
+sample.count = merge(sample.count, unprod.unique.sample.count, by="Sample")
-if(clonaltype == "none"){
+sample.count$perc_unprod_un = round(sample.count$Unproductive_unique / sample.count$All * 100)
-ct = c("clonaltype")
-}
+#then sample/replicate
+rep.count = merge(input.rep.count, prod.rep.count, by=c("Sample", "Replicate"))
-inputdata.dt$samples_replicates = paste(inputdata.dt$Sample, inputdata.dt$Replicate, sep="_")
+rep.count$perc_prod = round(rep.count$Productive / rep.count$All * 100)
-samples_replicates = c(unique(inputdata.dt$samples_replicates), unique(as.character(inputdata.dt$Sample)))
+rep.count = merge(rep.count, prod.unique.rep.count, by=c("Sample", "Replicate"))
-frequency_table = data.frame(ID = samples_replicates[order(samples_replicates)])
+rep.count$perc_prod_un = round(rep.count$Productive_unique / rep.count$All * 100)
+rep.count = merge(rep.count, unprod.rep.count, by=c("Sample", "Replicate"))
-sample_productive_count = inputdata.dt[, list(All=.N,
+rep.count$perc_unprod = round(rep.count$Unproductive / rep.count$All * 100)
-Productive = nrow(.SD[.SD$Functionality == "productive" | .SD$Functionality == "productive (see comment)",]),
+rep.count = merge(rep.count, unprod.unique.rep.count, by=c("Sample", "Replicate"))
-perc_prod = 1,
+rep.count$perc_unprod_un = round(rep.count$Unproductive_unique / rep.count$All * 100)
-Productive_unique = nrow(.SD[.SD$Functionality == "productive" | .SD$Functionality == "productive (see comment)",list(count=.N),by=ct]),
-perc_prod_un = 1,
+rep.count$Sample = paste(rep.count$Sample, rep.count$Replicate, sep="_")
-Unproductive= nrow(.SD[.SD$Functionality != "productive" & .SD$Functionality != "productive (see comment)",]),
+rep.count = rep.count[,names(rep.count) != "Replicate"]
-perc_unprod = 1,
-Unproductive_unique =nrow(.SD[.SD$Functionality != "productive" & .SD$Functionality != "productive (see comment)",list(count=.N),by=ct]),
+count = rbind(sample.count, rep.count)
-perc_unprod_un = 1),
-by=c("Sample")]
+write.table(x=count, file="productive_counting.txt", sep=",",quote=F,row.names=F,col.names=F)
-sample_productive_count$perc_prod = round(sample_productive_count$Productive / sample_productive_count$All * 100)
-sample_productive_count$perc_prod_un = round(sample_productive_count$Productive_unique / sample_productive_count$All * 100)
-sample_productive_count$perc_unprod = round(sample_productive_count$Unproductive / sample_productive_count$All * 100)
-sample_productive_count$perc_unprod_un = round(sample_productive_count$Unproductive_unique / sample_productive_count$All * 100)
-sample_replicate_productive_count = inputdata.dt[, list(All=.N,
-Productive = nrow(.SD[.SD$Functionality == "productive" | .SD$Functionality == "productive (see comment)",]),
-perc_prod = 1,
-Productive_unique = nrow(.SD[.SD$Functionality == "productive" | .SD$Functionality == "productive (see comment)",list(count=.N),by=ct]),
-perc_prod_un = 1,
-Unproductive= nrow(.SD[.SD$Functionality != "productive" & .SD$Functionality != "productive (see comment)",]),
-perc_unprod = 1,
-Unproductive_unique =nrow(.SD[.SD$Functionality != "productive" & .SD$Functionality != "productive (see comment)",list(count=.N),by=ct]),
-perc_unprod_un = 1),
-by=c("samples_replicates")]
-sample_replicate_productive_count$perc_prod = round(sample_replicate_productive_count$Productive / sample_replicate_productive_count$All * 100)
-sample_replicate_productive_count$perc_prod_un = round(sample_replicate_productive_count$Productive_unique / sample_replicate_productive_count$All * 100)
-sample_replicate_productive_count$perc_unprod = round(sample_replicate_productive_count$Unproductive / sample_replicate_productive_count$All * 100)
-sample_replicate_productive_count$perc_unprod_un = round(sample_replicate_productive_count$Unproductive_unique / sample_replicate_productive_count$All * 100)
-setnames(sample_replicate_productive_count, colnames(sample_productive_count))
-counts = rbind(sample_replicate_productive_count, sample_productive_count)
-counts = counts[order(counts$Sample),]
-write.table(x=counts, file="productive_counting.txt", sep=",",quote=F,row.names=F,col.names=F)
 # ---------------------- Frequency calculation for V, D and J ----------------------
 print("Report Clonality - frequency calculation V, D and J")
 print("Report Clonality - Heatmaps VD")
 plotVD <- function(dat){
 if(length(dat[,1]) == 0){
 return()
 }
 img = ggplot() +
 geom_tile(data=dat, aes(x=factor(reorder(Top.D.Gene, chr.orderD)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=relLength)) +
 theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
 scale_fill_gradient(low="gold", high="blue", na.value="white") +
 ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) +
 VandDCount$l = log(VandDCount$Length)
 maxVD = data.frame(data.table(VandDCount)[, list(max=max(l)), by=c("Sample")])
 VandDCount = merge(VandDCount, maxVD, by.x="Sample", by.y="Sample", all.x=T)
 VandDCount$relLength = VandDCount$l / VandDCount$max
-cartegianProductVD = expand.grid(Top.V.Gene = Vchain$v.name, Top.D.Gene = Dchain$v.name, Sample = unique(inputdata$Sample))
+cartegianProductVD = expand.grid(Top.V.Gene = Vchain$v.name, Top.D.Gene = Dchain$v.name)
-completeVD = merge(VandDCount, cartegianProductVD, all.y=TRUE)
+completeVD = merge(VandDCount, cartegianProductVD, by.x=c("Top.V.Gene", "Top.D.Gene"), by.y=c("Top.V.Gene", "Top.D.Gene"), all=TRUE)
 completeVD = merge(completeVD, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE)
 completeVD = merge(completeVD, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE)
 fltr = is.nan(completeVD$relLength)
-if(any(fltr)){
+if(all(fltr)){
-	  completeVD[fltr,"relLength"] = 1
+	  completeVD[fltr,"relLength"] = 0
 }
 VDList = split(completeVD, f=completeVD[,"Sample"])
 lapply(VDList, FUN=plotVD)
 }
 VandJCount$l = log(VandJCount$Length)
 maxVJ = data.frame(data.table(VandJCount)[, list(max=max(l)), by=c("Sample")])
 VandJCount = merge(VandJCount, maxVJ, by.x="Sample", by.y="Sample", all.x=T)
 VandJCount$relLength = VandJCount$l / VandJCount$max
-cartegianProductVJ = expand.grid(Top.V.Gene = Vchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(inputdata$Sample))
+cartegianProductVJ = expand.grid(Top.V.Gene = Vchain$v.name, Top.J.Gene = Jchain$v.name)
 completeVJ = merge(VandJCount, cartegianProductVJ, all.y=TRUE)
 completeVJ = merge(completeVJ, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE)
 completeVJ = merge(completeVJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE)
 DandJCount$l = log(DandJCount$Length)
 maxDJ = data.frame(data.table(DandJCount)[, list(max=max(l)), by=c("Sample")])
 DandJCount = merge(DandJCount, maxDJ, by.x="Sample", by.y="Sample", all.x=T)
 DandJCount$relLength = DandJCount$l / DandJCount$max
-cartegianProductDJ = expand.grid(Top.D.Gene = Dchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(inputdata$Sample))
+cartegianProductDJ = expand.grid(Top.D.Gene = Dchain$v.name, Top.J.Gene = Jchain$v.name)
 completeDJ = merge(DandJCount, cartegianProductDJ, all.y=TRUE)
 completeDJ = merge(completeDJ, revDchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE)
 completeDJ = merge(completeDJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE)

Mercurial > repos > davidvanzessen > argalaxy_tools

comparison report_clonality/RScript.r @ 59:11ec9edfefee draft