Mercurial > repos > melpetera > testtest
comparison CorrTable/Corr_Script_samples_row.R @ 4:24df6dfccbd9 draft default tip
Uploaded
| author | melpetera |
|---|---|
| date | Fri, 05 Oct 2018 10:33:45 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 3:8fb560e858ab | 4:24df6dfccbd9 |
|---|---|
| 1 ################################################################################################# | |
| 2 # CORRELATION TABLE # | |
| 3 # # | |
| 4 # # | |
| 5 # Input : 2 tables with common samples # | |
| 6 # Output : Correlation table ; Heatmap (pdf) # | |
| 7 # # | |
| 8 # Dependencies : Libraries "ggplot2" and "reshape2" # | |
| 9 # # | |
| 10 ################################################################################################# | |
| 11 | |
| 12 | |
| 13 # Parameters (for dev) | |
| 14 if(FALSE){ | |
| 15 | |
| 16 rm(list = ls()) | |
| 17 setwd(dir = "Y:/Developpement") | |
| 18 | |
| 19 tab1.name <- "Test/Ressources/Inputs/CT2_DM.tabular" | |
| 20 tab2.name <- "Test/Ressources/Inputs/CT2_base_Diapason_14ClinCES_PRIN.txt" | |
| 21 param1.samples <- "column" | |
| 22 param2.samples <- "row" | |
| 23 corr.method <- "pearson" | |
| 24 test.corr <- "yes" | |
| 25 alpha <- 0.05 | |
| 26 multi.name <- "none" | |
| 27 filter <- "yes" | |
| 28 filters.choice <- "filters_0_thr" | |
| 29 threshold <- 0.2 | |
| 30 reorder.var <- "yes" | |
| 31 color.heatmap <- "yes" | |
| 32 type.classes <-"irregular" | |
| 33 reg.value <- 1/3 | |
| 34 irreg.vect <- c(-0.3, -0.2, -0.1, 0, 0.3, 0.4) | |
| 35 output1 <- "Correlation_table.txt" | |
| 36 output2 <- "Heatmap.pdf" | |
| 37 | |
| 38 } | |
| 39 | |
| 40 | |
| 41 | |
| 42 correlation.tab <- function(tab1.name, tab2.name, param1.samples, param2.samples, corr.method, test.corr, alpha, | |
| 43 multi.name, filter, filters.choice, threshold, reorder.var, color.heatmap, type.classes, | |
| 44 reg.value, irreg.vect, output1, output2){ | |
| 45 | |
| 46 # This function allows to visualize the correlation between two tables | |
| 47 # | |
| 48 # Parameters: | |
| 49 # - tab1.name: table 1 file's access | |
| 50 # - tab2.name: table 2 file's access | |
| 51 # - param1.samples ("row" or "column"): where the samples are in tab1 | |
| 52 # - param2.samples ("row" or "column"): where the samples are in tab2 | |
| 53 # - corr.method ("pearson", "spearman", "kendall"): | |
| 54 # - test.corr ("yes" or "no"): test the significance of a correlation coefficient | |
| 55 # - alpha (value between 0 and 1): risk for the correlation significance test | |
| 56 # - multi.name ("holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none"): correction of multiple tests | |
| 57 # - filter ("yes", "no"): use filter.0 or/and filter.threshold | |
| 58 # - filters.choice ("filter_0" or "filters_0_thr"): zero filter removes variables with all their correlation coefficients = 0 | |
| 59 # and threshold filter remove variables with all their correlation coefficients in abs < threshold | |
| 60 # - threshold (value between 0 and 1): threshold for filter threshold | |
| 61 # - reorder.var ("yes" or "no"): reorder variables in the correlation table thanks to the HCA | |
| 62 # - color.heatmap ("yes" or "no"): color the heatmap with classes defined by the user | |
| 63 # - type.classes ("regular" or "irregular"): choose to color the heatmap with regular or irregular classes | |
| 64 # - reg.value (value between 0 and 1): value for regular classes | |
| 65 # - irreg.vect (vector with values between -1 and 1): vector which indicates values for intervals (irregular classes) | |
| 66 # - output1: correlation table file's access | |
| 67 # - output2: heatmap (colored correlation table) file's access | |
| 68 | |
| 69 | |
| 70 # Input ---------------------------------------------------------------------------------------------- | |
| 71 | |
| 72 tab1 <- read.table(tab1.name, sep = "\t", header = TRUE, check.names = FALSE, row.names = 1) | |
| 73 tab2 <- read.table(tab2.name, sep = "\t", header = TRUE, check.names = FALSE, row.names = 1) | |
| 74 | |
| 75 # Transpose tables according to the samples | |
| 76 if(param1.samples == "column"){ | |
| 77 tab1 <- t(tab1) | |
| 78 } | |
| 79 | |
| 80 if(param2.samples == "column"){ | |
| 81 tab2 <- t(tab2) | |
| 82 } | |
| 83 | |
| 84 # Sorting tables in alphabetical order of the samples | |
| 85 tab1 <- tab1[order(rownames(tab1)),] | |
| 86 tab2 <- tab2[order(rownames(tab2)),] | |
| 87 | |
| 88 | |
| 89 # Check if the 2 datasets match regarding samples identifiers | |
| 90 # Adapt from functions "check.err" and "match2", RcheckLibrary.R | |
| 91 | |
| 92 err.stock <- NULL | |
| 93 | |
| 94 id1 <- rownames(tab1) | |
| 95 id2 <- rownames(tab2) | |
| 96 | |
| 97 if(sum(id1 != id2) > 0){ | |
| 98 err.stock <- c("\nThe two tables do not match regarding sample identifiers.\n") | |
| 99 | |
| 100 if(length(which(id1%in%id2)) != length(id1)){ | |
| 101 identif <- id1[which(!(id1%in%id2))] | |
| 102 if (length(identif) < 4){ | |
| 103 err.stock <- c(err.stock, "\nThe following identifier(s) found in the first table do not appear in the second table:\n") | |
| 104 } | |
| 105 else { | |
| 106 err.stock <- c(err.stock, "\nFor example, the following identifiers found in the first table do not appear in the second table:\n") | |
| 107 } | |
| 108 identif <- identif[1:min(3,length(which(!(id1%in%id2))))] | |
| 109 err.stock <- c(err.stock," ",paste(identif,collapse="\n "),"\n") | |
| 110 } | |
| 111 | |
| 112 if(length(which(id2%in%id1)) != length(id2)){ | |
| 113 identif <- id2[which(!(id2%in%id1))] | |
| 114 if (length(identif) < 4){ | |
| 115 err.stock <- c(err.stock, "\nThe following identifier(s) found in the second table do not appear in the first table:\n") | |
| 116 } | |
| 117 else{ | |
| 118 err.stock <- c(err.stock, "\nFor example, the following identifiers found in the second table do not appear in the first table:\n") | |
| 119 } | |
| 120 identif <- identif[1:min(3,length(which(!(id2%in%id1))))] | |
| 121 err.stock <- c(err.stock," ",paste(identif,collapse="\n "),"\n") | |
| 122 } | |
| 123 err.stock <- c(err.stock,"\nPlease check your data.\n") | |
| 124 } | |
| 125 | |
| 126 if(length(err.stock)!=0){ | |
| 127 stop("\n- - - - - - - - -\n",err.stock,"\n- - - - - - - - -\n\n") | |
| 128 } | |
| 129 | |
| 130 | |
| 131 # Check qualitative variables in each input tables | |
| 132 err.msg <- NULL | |
| 133 | |
| 134 var1.quali <- vector() | |
| 135 var2.quali <- vector() | |
| 136 | |
| 137 for (i in 1:dim(tab1)[2]){ | |
| 138 if(class(tab1[,i]) != "numeric" & class(tab1[,i]) != "integer"){ | |
| 139 var1.quali <- c(var1.quali,i) | |
| 140 } | |
| 141 } | |
| 142 | |
| 143 for (j in 1:dim(tab2)[2]){ | |
| 144 if(class(tab2[,j]) != "numeric" & class(tab2[,j]) != "integer"){ | |
| 145 var2.quali <- c(var2.quali, j) | |
| 146 } | |
| 147 } | |
| 148 | |
| 149 if (length(var1.quali) != 0 | length(var2.quali) != 0){ | |
| 150 err.msg <- c(err.msg, "\nThere are qualitative variables in your input tables which have been removed to compute the correlation table.\n\n") | |
| 151 | |
| 152 if(length(var1.quali) != 0 && length(var1.quali) < 4){ | |
| 153 err.msg <- c(err.msg, "In table 1, the following qualitative variables have been removed:\n", | |
| 154 " ",paste(colnames(tab1)[var1.quali],collapse="\n "),"\n") | |
| 155 } else if(length(var1.quali) != 0 && length(var1.quali) > 3){ | |
| 156 err.msg <- c(err.msg, "For example, in table 1, the following qualitative variables have been removed:\n", | |
| 157 " ",paste(colnames(tab1)[var1.quali[1:3]],collapse="\n "),"\n") | |
| 158 } | |
| 159 | |
| 160 if(length(var2.quali) != 0 && length(var2.quali) < 4){ | |
| 161 err.msg <- c(err.msg, "In table 2, the following qualitative variables have been removed:\n", | |
| 162 " ",paste(colnames(tab2)[var2.quali],collapse="\n "),"\n") | |
| 163 } else if(length(var2.quali) != 0 && length(var2.quali) > 3){ | |
| 164 err.msg <- c(err.msg, "For example, in table 2, the following qualitative variables have been removed:\n", | |
| 165 " ",paste(colnames(tab2)[var2.quali[1:3]],collapse="\n "),"\n") | |
| 166 } | |
| 167 } | |
| 168 | |
| 169 if(length(var1.quali) != 0){ | |
| 170 tab1 <- tab1[,-var1.quali] | |
| 171 } | |
| 172 if(length(var2.quali) != 0){ | |
| 173 tab2 <- tab2[,-var2.quali] | |
| 174 } | |
| 175 | |
| 176 if(length(err.msg) != 0){ | |
| 177 cat("\n- - - - - - - - -\n",err.msg,"\n- - - - - - - - -\n\n") | |
| 178 } | |
| 179 | |
| 180 # Correlation table --------------------------------------------------------------------------------- | |
| 181 | |
| 182 tab.corr <- matrix(nrow = dim(tab2)[2], ncol = dim(tab1)[2]) | |
| 183 for (i in 1:dim(tab2)[2]){ | |
| 184 for (j in 1:dim(tab1)[2]){ | |
| 185 tab.corr[i,j] <- cor(tab2[,i], tab1[,j], method = corr.method, use = "pairwise.complete.obs") | |
| 186 } | |
| 187 } | |
| 188 | |
| 189 colnames(tab.corr) <- colnames(tab1) | |
| 190 rownames(tab.corr) <- colnames(tab2) | |
| 191 | |
| 192 | |
| 193 | |
| 194 # Significance of correlation test ------------------------------------------------------------------ | |
| 195 | |
| 196 if (test.corr == "yes"){ | |
| 197 | |
| 198 pvalue <- vector() | |
| 199 for (i in 1:dim(tab.corr)[1]){ | |
| 200 for (j in 1:dim(tab.corr)[2]){ | |
| 201 suppressWarnings(corrtest <- cor.test(tab2[,i], tab1[,j], method = corr.method)) | |
| 202 pvalue <- c(pvalue, corrtest$p.value) | |
| 203 if (multi.name == "none"){ | |
| 204 if (corrtest$p.value > alpha){ | |
| 205 tab.corr[i,j] <- 0 | |
| 206 } | |
| 207 } | |
| 208 } | |
| 209 } | |
| 210 | |
| 211 if(multi.name != "none"){ | |
| 212 adjust <- matrix(p.adjust(pvalue, method = multi.name), nrow = dim(tab.corr)[1], ncol = dim(tab.corr)[2], byrow = T) | |
| 213 tab.corr[adjust > alpha] <- 0 | |
| 214 } | |
| 215 } | |
| 216 | |
| 217 | |
| 218 # Filter settings ------------------------------------------------------------------------------------ | |
| 219 | |
| 220 if (filter == "yes"){ | |
| 221 | |
| 222 # Remove variables with all their correlation coefficients = 0 : | |
| 223 if (filters.choice == "filter_0"){ | |
| 224 threshold <- 0 | |
| 225 } | |
| 226 | |
| 227 var2.thres <- vector() | |
| 228 for (i in 1:dim(tab.corr)[1]){ | |
| 229 if (length(which(abs(tab.corr[i,]) <= threshold)) == dim(tab.corr)[2]){ | |
| 230 var2.thres <- c(var2.thres, i) | |
| 231 } | |
| 232 } | |
| 233 | |
| 234 if (length(var2.thres) != 0){ | |
| 235 tab.corr <- tab.corr[-var2.thres,] | |
| 236 tab2 <- tab2[, -var2.thres] | |
| 237 } | |
| 238 | |
| 239 var1.thres <- vector() | |
| 240 for (i in 1:dim(tab.corr)[2]){ | |
| 241 if (length(which(abs(tab.corr[,i]) <= threshold)) == dim(tab.corr)[1]){ | |
| 242 var1.thres <- c(var1.thres, i) | |
| 243 } | |
| 244 } | |
| 245 | |
| 246 if (length(var1.thres) != 0){ | |
| 247 tab.corr <- tab.corr[,-var1.thres] | |
| 248 tab1 <- tab1[,-var1.thres] | |
| 249 } | |
| 250 | |
| 251 } | |
| 252 | |
| 253 | |
| 254 # Reorder variables in the correlation table (with the HCA) ------------------------------------------ | |
| 255 if (reorder.var == "yes"){ | |
| 256 | |
| 257 cormat.tab2 <- cor(tab2, method = corr.method, use = "pairwise.complete.obs") | |
| 258 dist.tab2 <- as.dist(1 - cormat.tab2) | |
| 259 hc.tab2 <- hclust(dist.tab2, method = "ward.D2") | |
| 260 tab.corr <- tab.corr[hc.tab2$order,] | |
| 261 | |
| 262 cormat.tab1 <- cor(tab1, method = corr.method, use = "pairwise.complete.obs") | |
| 263 dist.tab1 <- as.dist(1 - cormat.tab1) | |
| 264 hc.tab1 <- hclust(dist.tab1, method = "ward.D2") | |
| 265 tab.corr <- tab.corr[,hc.tab1$order] | |
| 266 | |
| 267 } | |
| 268 | |
| 269 | |
| 270 | |
| 271 # Output 1 : Correlation table ----------------------------------------------------------------------- | |
| 272 | |
| 273 # Export correlation table | |
| 274 write.table(x = data.frame(name = rownames(tab.corr), tab.corr), file = output1, sep = "\t", quote = FALSE, row.names = FALSE) | |
| 275 | |
| 276 | |
| 277 | |
| 278 # Create the heatmap --------------------------------------------------------------------------------- | |
| 279 | |
| 280 # A message if no variable kept | |
| 281 if(length(tab.corr)==0){ | |
| 282 pdf(output2) | |
| 283 plot.new() | |
| 284 legend("center","Filtering leads to no remaining correlation coefficient.") | |
| 285 dev.off() | |
| 286 } else { | |
| 287 | |
| 288 | |
| 289 library(ggplot2) | |
| 290 library(reshape2) | |
| 291 | |
| 292 # Melt the correlation table : | |
| 293 melted.tab.corr <- melt(tab.corr) | |
| 294 | |
| 295 if (color.heatmap == "yes") { | |
| 296 | |
| 297 # Add a column for the classes of each correlation coefficient | |
| 298 classe <- rep(0, dim(melted.tab.corr)[1]) | |
| 299 melted <- cbind(melted.tab.corr, classe) | |
| 300 | |
| 301 if (type.classes == "regular"){ | |
| 302 | |
| 303 vect <- vector() | |
| 304 if (seq(-1,0,reg.value)[length(seq(-1,0,reg.value))] == 0){ | |
| 305 vect <- c(seq(-1,0,reg.value)[-length(seq(-1,0,reg.value))], | |
| 306 rev(seq(1,0,-reg.value))) | |
| 307 } else { | |
| 308 vect <- c(seq(-1,0,reg.value), 0, rev(seq(1,0,-reg.value))) | |
| 309 } | |
| 310 | |
| 311 } else if (type.classes == "irregular") { | |
| 312 | |
| 313 irreg.vect <- c(-1, irreg.vect, 1) | |
| 314 vect <- irreg.vect | |
| 315 | |
| 316 } | |
| 317 | |
| 318 # Color palette : | |
| 319 myPal <- colorRampPalette(c("#00CC00", "white", "red"), space = "Lab", interpolate = "spline") | |
| 320 | |
| 321 # Create vector intervals | |
| 322 cl <- vector() | |
| 323 cl <- paste("[", vect[1], ";", round(vect[2],3), "]", sep = "") | |
| 324 | |
| 325 for (x in 2:(length(vect)-1)) { | |
| 326 if (vect[x+1] == 0) { | |
| 327 cl <- c(cl, paste("]", round(vect[x],3), ";", round(vect[x+1],3), "[", sep = "")) | |
| 328 } else { | |
| 329 cl <- c(cl, paste("]", round(vect[x],3), ";", | |
| 330 round(vect[x+1],3), "]", sep = "")) | |
| 331 } | |
| 332 } | |
| 333 | |
| 334 # Assign an interval to each correlation coefficient | |
| 335 for (i in 1:dim(melted.tab.corr)[1]){ | |
| 336 for (j in 1:(length(cl))){ | |
| 337 if (vect[j] == -1){ | |
| 338 melted$classe[i][melted$value[i] >= vect[j] | |
| 339 && melted$value[i] <= vect[j+1]] <- cl[j] | |
| 340 } else { | |
| 341 melted$classe[i][melted$value[i] > vect[j] | |
| 342 && melted$value[i] <= vect[j+1]] <- cl[j] | |
| 343 } | |
| 344 } | |
| 345 } | |
| 346 | |
| 347 # Find the 0 and assign it the white as name | |
| 348 if (length(which(vect == 0)) == 1) { | |
| 349 melted$classe[melted$value == 0] <- "0" | |
| 350 indic <- which(vect == 0) | |
| 351 cl <- c(cl[1:(indic-1)], 0, cl[indic:length(cl)]) | |
| 352 names(cl)[indic] <- "#FFFFFF" | |
| 353 } else if (length(which(vect == 0)) == 0) { | |
| 354 indic <- 0 | |
| 355 for (x in 1:(length(vect)-1)) { | |
| 356 if (0 > vect[x] && 0 <= vect[x+1]) { | |
| 357 names(cl)[x] <- "#FFFFFF" | |
| 358 indic <- x | |
| 359 } | |
| 360 } | |
| 361 } | |
| 362 | |
| 363 indic <- length(cl) - indic + 1 | |
| 364 cl <- rev(cl) | |
| 365 | |
| 366 # Assign the colors of each intervals as their name | |
| 367 names(cl)[1:(indic-1)] <- myPal(length(cl[1:indic])*2-1)[1:indic-1] | |
| 368 names(cl)[(indic+1):length(cl)] <- myPal(length(cl[indic:length(cl)])*2-1)[(ceiling(length(myPal(length(cl[indic:length(cl)])*2-1))/2)+1):length(myPal(length(cl[indic:length(cl)])*2-1))] | |
| 369 | |
| 370 | |
| 371 melted$classe <- factor(melted$classe) | |
| 372 melted$classe <- factor(melted$classe, levels = cl[cl%in%levels(melted$classe)]) | |
| 373 | |
| 374 # Heatmap if color.heatmap = yes : | |
| 375 ggplot(melted, aes(Var2, Var1, fill = classe)) + | |
| 376 ggtitle("Colored correlation table" ) + xlab("Table 1") + ylab("Table 2") + | |
| 377 geom_tile(color ="ghostwhite") + | |
| 378 scale_fill_manual( breaks = levels(melted$classe), | |
| 379 values = names(cl)[cl%in%levels(melted$classe)], | |
| 380 name = paste(corr.method, "correlation", sep = "\n")) + | |
| 381 theme_classic() + | |
| 382 theme(axis.text.x = element_text(angle = 90, vjust = 0.5), | |
| 383 plot.title = element_text(hjust = 0.5)) | |
| 384 | |
| 385 } else { | |
| 386 | |
| 387 # Heatmap if color.heatmap = no : | |
| 388 ggplot(melted.tab.corr, aes(Var2, Var1, fill = value)) + | |
| 389 ggtitle("Colored correlation table" ) + xlab("Table 1") + ylab("Table 2") + | |
| 390 geom_tile(color ="ghostwhite") + | |
| 391 scale_fill_gradient2(low = "red", high = "#00CC00", mid = "white", midpoint = 0, limit = c(-1,1), | |
| 392 name = paste(corr.method, "correlation", sep = "\n")) + | |
| 393 theme_classic() + | |
| 394 theme(axis.text.x = element_text(angle = 90, vjust = 0.5), | |
| 395 plot.title = element_text(hjust = 0.5)) | |
| 396 } | |
| 397 | |
| 398 | |
| 399 ggsave(output2, device = "pdf", width = 10+0.075*dim(tab.corr)[2], height = 5+0.075*dim(tab.corr)[1], limitsize = FALSE) | |
| 400 | |
| 401 | |
| 402 } # End if(length(tab.corr)==0)else | |
| 403 | |
| 404 } # End of correlation.tab | |
| 405 | |
| 406 | |
| 407 # Function call | |
| 408 # correlation.tab(tab1.name, tab2.name, param1.samples, param2.samples, corr.method, test.corr, alpha, multi.name, filter, | |
| 409 # filters.choice, threshold, reorder.var, color.heatmap, type.classes, | |
| 410 # reg.value, irreg.vect, output1, output2) |
