Mercurial > repos > davidvanzessen > argalaxy_tools
comparison report_clonality/RScript.r @ 59:11ec9edfefee draft
Uploaded
author | davidvanzessen |
---|---|
date | Thu, 31 Mar 2016 10:26:30 -0400 |
parents | a073fa12ef98 |
children | 14ea4c464435 |
comparison
equal
deleted
inserted
replaced
58:a073fa12ef98 | 59:11ec9edfefee |
---|---|
62 #filter uniques | 62 #filter uniques |
63 inputdata.removed = inputdata[NULL,] | 63 inputdata.removed = inputdata[NULL,] |
64 | 64 |
65 inputdata$clonaltype = 1:nrow(inputdata) | 65 inputdata$clonaltype = 1:nrow(inputdata) |
66 | 66 |
67 #keep track of the count of sequences in samples or samples/replicates for the front page overview | |
68 input.sample.count = data.frame(data.table(inputdata)[, list(All=.N), by=c("Sample")]) | |
69 input.rep.count = data.frame(data.table(inputdata)[, list(All=.N), by=c("Sample", "Replicate")]) | |
70 | |
67 PRODF = inputdata | 71 PRODF = inputdata |
68 UNPROD = inputdata | 72 UNPROD = inputdata |
69 if(filterproductive){ | 73 if(filterproductive){ |
70 if("Functionality" %in% colnames(inputdata)) { # "Functionality" is an IMGT column | 74 if("Functionality" %in% colnames(inputdata)) { # "Functionality" is an IMGT column |
71 PRODF = inputdata[inputdata$Functionality == "productive" | inputdata$Functionality == "productive (see comment)", ] | 75 #PRODF = inputdata[inputdata$Functionality == "productive" | inputdata$Functionality == "productive (see comment)", ] |
72 UNPROD = inputdata[!(inputdata$Functionality == "productive" | inputdata$Functionality == "productive (see comment)"), ] | 76 PRODF = inputdata[inputdata$Functionality %in% c("productive (see comment)","productive"),] |
77 | |
78 PRODF.count = data.frame(data.table(PRODF)[, list(count=.N), by=c("Sample")]) | |
79 | |
80 UNPROD = inputdata[inputdata$Functionality %in% c("unproductive (see comment)","unproductive"), ] | |
73 } else { | 81 } else { |
74 PRODF = inputdata[inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND" , ] | 82 PRODF = inputdata[inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND" , ] |
75 UNPROD = inputdata[!(inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND" ), ] | 83 UNPROD = inputdata[!(inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND" ), ] |
76 } | 84 } |
77 } | 85 } |
86 | |
87 prod.sample.count = data.frame(data.table(PRODF)[, list(Productive=.N), by=c("Sample")]) | |
88 prod.rep.count = data.frame(data.table(PRODF)[, list(Productive=.N), by=c("Sample", "Replicate")]) | |
89 | |
90 unprod.sample.count = data.frame(data.table(UNPROD)[, list(Unproductive=.N), by=c("Sample")]) | |
91 unprod.rep.count = data.frame(data.table(UNPROD)[, list(Unproductive=.N), by=c("Sample", "Replicate")]) | |
78 | 92 |
79 clonalityFrame = PRODF | 93 clonalityFrame = PRODF |
80 | 94 |
81 #remove duplicates based on the clonaltype | 95 #remove duplicates based on the clonaltype |
82 if(clonaltype != "none"){ | 96 if(clonaltype != "none"){ |
83 clonaltype = paste(clonaltype, ",Sample", sep="") #add sample column to clonaltype, unique within samples | 97 clonaltype = paste(clonaltype, ",Sample", sep="") #add sample column to clonaltype, unique within samples |
84 PRODF$clonaltype = do.call(paste, c(PRODF[unlist(strsplit(clonaltype, ","))], sep = ":")) | 98 PRODF$clonaltype = do.call(paste, c(PRODF[unlist(strsplit(clonaltype, ","))], sep = ":")) |
85 PRODF = PRODF[!duplicated(PRODF$clonaltype), ] | 99 PRODF = PRODF[!duplicated(PRODF$clonaltype), ] |
86 | 100 |
87 UNPROD$clonaltype = do.call(paste, c(UNPROD[unlist(strsplit(clonaltype, ","))], sep = ":")) | 101 UNPROD$clonaltype = do.call(paste, c(UNPROD[unlist(strsplit(clonaltype, ","))], sep = ":")) |
88 UNPROD = UNPROD[!duplicated(UNPROD$clonaltype), ] | 102 UNPROD = UNPROD[!duplicated(UNPROD$clonaltype), ] |
89 | 103 |
90 #again for clonalityFrame but with sample+replicate | 104 #again for clonalityFrame but with sample+replicate |
91 clonalityFrame$clonaltype = do.call(paste, c(clonalityFrame[unlist(strsplit(clonaltype, ","))], sep = ":")) | 105 clonalityFrame$clonaltype = do.call(paste, c(clonalityFrame[unlist(strsplit(clonaltype, ","))], sep = ":")) |
92 clonalityFrame$clonality_clonaltype = do.call(paste, c(clonalityFrame[unlist(strsplit(paste(clonaltype, ",Replicate", sep=""), ","))], sep = ":")) | 106 clonalityFrame$clonality_clonaltype = do.call(paste, c(clonalityFrame[unlist(strsplit(paste(clonaltype, ",Replicate", sep=""), ","))], sep = ":")) |
93 clonalityFrame = clonalityFrame[!duplicated(clonalityFrame$clonality_clonaltype), ] | 107 clonalityFrame = clonalityFrame[!duplicated(clonalityFrame$clonality_clonaltype), ] |
94 } | 108 } |
109 | |
110 prod.unique.sample.count = data.frame(data.table(PRODF)[, list(Productive_unique=.N), by=c("Sample")]) | |
111 prod.unique.rep.count = data.frame(data.table(PRODF)[, list(Productive_unique=.N), by=c("Sample", "Replicate")]) | |
112 | |
113 unprod.unique.sample.count = data.frame(data.table(UNPROD)[, list(Unproductive_unique=.N), by=c("Sample")]) | |
114 unprod.unique.rep.count = data.frame(data.table(UNPROD)[, list(Unproductive_unique=.N), by=c("Sample", "Replicate")]) | |
95 | 115 |
96 PRODF$freq = 1 | 116 PRODF$freq = 1 |
97 | 117 |
98 if(any(grepl(pattern="_", x=PRODF$ID))){ #the frequency can be stored in the ID with the pattern ".*_freq_.*" | 118 if(any(grepl(pattern="_", x=PRODF$ID))){ #the frequency can be stored in the ID with the pattern ".*_freq_.*" |
99 PRODF$freq = gsub("^[0-9]+_", "", PRODF$ID) | 119 PRODF$freq = gsub("^[0-9]+_", "", PRODF$ID) |
120 | 140 |
121 # ---------------------- Counting the productive/unproductive and unique sequences ---------------------- | 141 # ---------------------- Counting the productive/unproductive and unique sequences ---------------------- |
122 | 142 |
123 print("Report Clonality - counting productive/unproductive/unique") | 143 print("Report Clonality - counting productive/unproductive/unique") |
124 | 144 |
125 if(!("Functionality" %in% inputdata)){ #add a functionality column to the igblast data | 145 #create the table on the overview page with the productive/unique counts per sample/replicate |
126 inputdata$Functionality = "unproductive" | 146 #first for sample |
127 search = (inputdata$VDJ.Frame != "In-frame with stop codon" & inputdata$VDJ.Frame != "Out-of-frame" & inputdata$CDR3.Found.How != "NOT_FOUND") | 147 sample.count = merge(input.sample.count, prod.sample.count, by="Sample") |
128 if(sum(search) > 0){ | 148 sample.count$perc_prod = round(sample.count$Productive / sample.count$All * 100) |
129 inputdata[search,]$Functionality = "productive" | 149 sample.count = merge(sample.count, prod.unique.sample.count, by="Sample") |
130 } | 150 sample.count$perc_prod_un = round(sample.count$Productive_unique / sample.count$All * 100) |
131 } | 151 |
132 | 152 sample.count = merge(sample.count , unprod.sample.count, by="Sample") |
133 inputdata.dt = data.table(inputdata) #for speed | 153 sample.count$perc_unprod = round(sample.count$Unproductive / sample.count$All * 100) |
134 | 154 sample.count = merge(sample.count, unprod.unique.sample.count, by="Sample") |
135 if(clonaltype == "none"){ | 155 sample.count$perc_unprod_un = round(sample.count$Unproductive_unique / sample.count$All * 100) |
136 ct = c("clonaltype") | 156 |
137 } | 157 #then sample/replicate |
138 | 158 rep.count = merge(input.rep.count, prod.rep.count, by=c("Sample", "Replicate")) |
139 inputdata.dt$samples_replicates = paste(inputdata.dt$Sample, inputdata.dt$Replicate, sep="_") | 159 rep.count$perc_prod = round(rep.count$Productive / rep.count$All * 100) |
140 samples_replicates = c(unique(inputdata.dt$samples_replicates), unique(as.character(inputdata.dt$Sample))) | 160 rep.count = merge(rep.count, prod.unique.rep.count, by=c("Sample", "Replicate")) |
141 frequency_table = data.frame(ID = samples_replicates[order(samples_replicates)]) | 161 rep.count$perc_prod_un = round(rep.count$Productive_unique / rep.count$All * 100) |
142 | 162 |
143 | 163 rep.count = merge(rep.count, unprod.rep.count, by=c("Sample", "Replicate")) |
144 sample_productive_count = inputdata.dt[, list(All=.N, | 164 rep.count$perc_unprod = round(rep.count$Unproductive / rep.count$All * 100) |
145 Productive = nrow(.SD[.SD$Functionality == "productive" | .SD$Functionality == "productive (see comment)",]), | 165 rep.count = merge(rep.count, unprod.unique.rep.count, by=c("Sample", "Replicate")) |
146 perc_prod = 1, | 166 rep.count$perc_unprod_un = round(rep.count$Unproductive_unique / rep.count$All * 100) |
147 Productive_unique = nrow(.SD[.SD$Functionality == "productive" | .SD$Functionality == "productive (see comment)",list(count=.N),by=ct]), | 167 |
148 perc_prod_un = 1, | 168 rep.count$Sample = paste(rep.count$Sample, rep.count$Replicate, sep="_") |
149 Unproductive= nrow(.SD[.SD$Functionality != "productive" & .SD$Functionality != "productive (see comment)",]), | 169 rep.count = rep.count[,names(rep.count) != "Replicate"] |
150 perc_unprod = 1, | 170 |
151 Unproductive_unique =nrow(.SD[.SD$Functionality != "productive" & .SD$Functionality != "productive (see comment)",list(count=.N),by=ct]), | 171 count = rbind(sample.count, rep.count) |
152 perc_unprod_un = 1), | 172 |
153 by=c("Sample")] | 173 write.table(x=count, file="productive_counting.txt", sep=",",quote=F,row.names=F,col.names=F) |
154 | |
155 sample_productive_count$perc_prod = round(sample_productive_count$Productive / sample_productive_count$All * 100) | |
156 sample_productive_count$perc_prod_un = round(sample_productive_count$Productive_unique / sample_productive_count$All * 100) | |
157 | |
158 sample_productive_count$perc_unprod = round(sample_productive_count$Unproductive / sample_productive_count$All * 100) | |
159 sample_productive_count$perc_unprod_un = round(sample_productive_count$Unproductive_unique / sample_productive_count$All * 100) | |
160 | |
161 sample_replicate_productive_count = inputdata.dt[, list(All=.N, | |
162 Productive = nrow(.SD[.SD$Functionality == "productive" | .SD$Functionality == "productive (see comment)",]), | |
163 perc_prod = 1, | |
164 Productive_unique = nrow(.SD[.SD$Functionality == "productive" | .SD$Functionality == "productive (see comment)",list(count=.N),by=ct]), | |
165 perc_prod_un = 1, | |
166 Unproductive= nrow(.SD[.SD$Functionality != "productive" & .SD$Functionality != "productive (see comment)",]), | |
167 perc_unprod = 1, | |
168 Unproductive_unique =nrow(.SD[.SD$Functionality != "productive" & .SD$Functionality != "productive (see comment)",list(count=.N),by=ct]), | |
169 perc_unprod_un = 1), | |
170 by=c("samples_replicates")] | |
171 | |
172 sample_replicate_productive_count$perc_prod = round(sample_replicate_productive_count$Productive / sample_replicate_productive_count$All * 100) | |
173 sample_replicate_productive_count$perc_prod_un = round(sample_replicate_productive_count$Productive_unique / sample_replicate_productive_count$All * 100) | |
174 | |
175 sample_replicate_productive_count$perc_unprod = round(sample_replicate_productive_count$Unproductive / sample_replicate_productive_count$All * 100) | |
176 sample_replicate_productive_count$perc_unprod_un = round(sample_replicate_productive_count$Unproductive_unique / sample_replicate_productive_count$All * 100) | |
177 | |
178 setnames(sample_replicate_productive_count, colnames(sample_productive_count)) | |
179 | |
180 counts = rbind(sample_replicate_productive_count, sample_productive_count) | |
181 counts = counts[order(counts$Sample),] | |
182 | |
183 write.table(x=counts, file="productive_counting.txt", sep=",",quote=F,row.names=F,col.names=F) | |
184 | 174 |
185 # ---------------------- Frequency calculation for V, D and J ---------------------- | 175 # ---------------------- Frequency calculation for V, D and J ---------------------- |
186 | 176 |
187 print("Report Clonality - frequency calculation V, D and J") | 177 print("Report Clonality - frequency calculation V, D and J") |
188 | 178 |
379 print("Report Clonality - Heatmaps VD") | 369 print("Report Clonality - Heatmaps VD") |
380 plotVD <- function(dat){ | 370 plotVD <- function(dat){ |
381 if(length(dat[,1]) == 0){ | 371 if(length(dat[,1]) == 0){ |
382 return() | 372 return() |
383 } | 373 } |
374 | |
384 img = ggplot() + | 375 img = ggplot() + |
385 geom_tile(data=dat, aes(x=factor(reorder(Top.D.Gene, chr.orderD)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=relLength)) + | 376 geom_tile(data=dat, aes(x=factor(reorder(Top.D.Gene, chr.orderD)), y=factor(reorder(Top.V.Gene, chr.orderV)), fill=relLength)) + |
386 theme(axis.text.x = element_text(angle = 90, hjust = 1)) + | 377 theme(axis.text.x = element_text(angle = 90, hjust = 1)) + |
387 scale_fill_gradient(low="gold", high="blue", na.value="white") + | 378 scale_fill_gradient(low="gold", high="blue", na.value="white") + |
388 ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) + | 379 ggtitle(paste(unique(dat$Sample), " (N=" , sum(dat$Length, na.rm=T) ,")", sep="")) + |
400 VandDCount$l = log(VandDCount$Length) | 391 VandDCount$l = log(VandDCount$Length) |
401 maxVD = data.frame(data.table(VandDCount)[, list(max=max(l)), by=c("Sample")]) | 392 maxVD = data.frame(data.table(VandDCount)[, list(max=max(l)), by=c("Sample")]) |
402 VandDCount = merge(VandDCount, maxVD, by.x="Sample", by.y="Sample", all.x=T) | 393 VandDCount = merge(VandDCount, maxVD, by.x="Sample", by.y="Sample", all.x=T) |
403 VandDCount$relLength = VandDCount$l / VandDCount$max | 394 VandDCount$relLength = VandDCount$l / VandDCount$max |
404 | 395 |
405 cartegianProductVD = expand.grid(Top.V.Gene = Vchain$v.name, Top.D.Gene = Dchain$v.name, Sample = unique(inputdata$Sample)) | 396 cartegianProductVD = expand.grid(Top.V.Gene = Vchain$v.name, Top.D.Gene = Dchain$v.name) |
406 | 397 |
407 completeVD = merge(VandDCount, cartegianProductVD, all.y=TRUE) | 398 completeVD = merge(VandDCount, cartegianProductVD, by.x=c("Top.V.Gene", "Top.D.Gene"), by.y=c("Top.V.Gene", "Top.D.Gene"), all=TRUE) |
399 | |
408 completeVD = merge(completeVD, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE) | 400 completeVD = merge(completeVD, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE) |
401 | |
409 completeVD = merge(completeVD, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE) | 402 completeVD = merge(completeVD, Dchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE) |
410 | 403 |
411 fltr = is.nan(completeVD$relLength) | 404 fltr = is.nan(completeVD$relLength) |
412 if(any(fltr)){ | 405 if(all(fltr)){ |
413 completeVD[fltr,"relLength"] = 1 | 406 completeVD[fltr,"relLength"] = 0 |
414 } | 407 } |
415 | 408 |
416 VDList = split(completeVD, f=completeVD[,"Sample"]) | 409 VDList = split(completeVD, f=completeVD[,"Sample"]) |
417 lapply(VDList, FUN=plotVD) | 410 lapply(VDList, FUN=plotVD) |
418 } | 411 } |
443 VandJCount$l = log(VandJCount$Length) | 436 VandJCount$l = log(VandJCount$Length) |
444 maxVJ = data.frame(data.table(VandJCount)[, list(max=max(l)), by=c("Sample")]) | 437 maxVJ = data.frame(data.table(VandJCount)[, list(max=max(l)), by=c("Sample")]) |
445 VandJCount = merge(VandJCount, maxVJ, by.x="Sample", by.y="Sample", all.x=T) | 438 VandJCount = merge(VandJCount, maxVJ, by.x="Sample", by.y="Sample", all.x=T) |
446 VandJCount$relLength = VandJCount$l / VandJCount$max | 439 VandJCount$relLength = VandJCount$l / VandJCount$max |
447 | 440 |
448 cartegianProductVJ = expand.grid(Top.V.Gene = Vchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(inputdata$Sample)) | 441 cartegianProductVJ = expand.grid(Top.V.Gene = Vchain$v.name, Top.J.Gene = Jchain$v.name) |
449 | 442 |
450 completeVJ = merge(VandJCount, cartegianProductVJ, all.y=TRUE) | 443 completeVJ = merge(VandJCount, cartegianProductVJ, all.y=TRUE) |
451 completeVJ = merge(completeVJ, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE) | 444 completeVJ = merge(completeVJ, revVchain, by.x="Top.V.Gene", by.y="v.name", all.x=TRUE) |
452 completeVJ = merge(completeVJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE) | 445 completeVJ = merge(completeVJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE) |
453 | 446 |
487 DandJCount$l = log(DandJCount$Length) | 480 DandJCount$l = log(DandJCount$Length) |
488 maxDJ = data.frame(data.table(DandJCount)[, list(max=max(l)), by=c("Sample")]) | 481 maxDJ = data.frame(data.table(DandJCount)[, list(max=max(l)), by=c("Sample")]) |
489 DandJCount = merge(DandJCount, maxDJ, by.x="Sample", by.y="Sample", all.x=T) | 482 DandJCount = merge(DandJCount, maxDJ, by.x="Sample", by.y="Sample", all.x=T) |
490 DandJCount$relLength = DandJCount$l / DandJCount$max | 483 DandJCount$relLength = DandJCount$l / DandJCount$max |
491 | 484 |
492 cartegianProductDJ = expand.grid(Top.D.Gene = Dchain$v.name, Top.J.Gene = Jchain$v.name, Sample = unique(inputdata$Sample)) | 485 cartegianProductDJ = expand.grid(Top.D.Gene = Dchain$v.name, Top.J.Gene = Jchain$v.name) |
493 | 486 |
494 completeDJ = merge(DandJCount, cartegianProductDJ, all.y=TRUE) | 487 completeDJ = merge(DandJCount, cartegianProductDJ, all.y=TRUE) |
495 completeDJ = merge(completeDJ, revDchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE) | 488 completeDJ = merge(completeDJ, revDchain, by.x="Top.D.Gene", by.y="v.name", all.x=TRUE) |
496 completeDJ = merge(completeDJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE) | 489 completeDJ = merge(completeDJ, Jchain, by.x="Top.J.Gene", by.y="v.name", all.x=TRUE) |
497 | 490 |