comparison Intchecks/Script_intensity_check.R @ 0:51c39ea1fd54 draft

Uploaded
author melpetera
date Mon, 14 Jan 2019 08:37:27 -0500
parents
children d1133a7c26f9
comparison
equal deleted inserted replaced
-1:000000000000 0:51c39ea1fd54
1 #########################################################################
2 # SCRIPT INTENSITY CHECK #
3 # #
4 # Input: Data Matrix, VariableMetadata, SampleMetadata #
5 # Output: VariableMetadata, Graphics (barplots and boxplots) #
6 # #
7 # Dependencies: RcheckLibrary.R #
8 # #
9 #########################################################################
10
11
12 # Parameters (for dev)
13 if(FALSE){
14
15 rm(list = ls())
16 setwd("Y:\\Developpement\\Intensity check\\Pour tests")
17
18 DM.name <- "DM_NA.tabular"
19 SM.name <- "SM_NA.tabular"
20 VM.name <- "vM_NA.tabular"
21 class.col <- "2"
22 type <- "One_class"
23 class1 <- "Blanks"
24 fold.frac <- "Top"
25 logarithm <- "log2"
26 VM.output <- "new_VM.txt"
27 graphs.output <- "Barplots_and_Boxplots.pdf"
28 }
29
30
31
32
33 intens_check <- function(DM.name, SM.name, VM.name, class.col, type, class1, fold.frac, logarithm,
34 VM.output, graphs.output){
35
36
37 # This function allows to check the intensities considering classes with a mean fold change calculation,
38 # the number and the proportion of missing values (NA) in dataMatrix
39 #
40 # Two options:
41 # - one class (selected by the user) against all the remaining samples ("One_class")
42 # - tests on each class ("Each_class")
43 #
44 # Parameters:
45 # DM.name, SM.name, VM.name: dataMatrix, sampleMetadata, variableMetadata files access
46 # class.col: number of the sampleMetadata's column with classes
47 # type: "One_class" or "Each_class"
48 # class1: name of the class, only if type="One_class"
49 # fold.frac: if type="One class": class1/other ("Top") or other/class1 ("Bottom")
50 # logarithm: "log2", "log10" or "none" for log mean fold change
51 # VM.output: output file's access (VM with new columns)
52 # graphs.output: pdf file's access with barplots for the proportion of NA and boxplots with the folds values
53
54
55
56
57
58 # Input ---------------------------------------------------------------------------------------------------
59
60 DM <- read.table(DM.name, header=TRUE, sep="\t", check.names=FALSE)
61 SM <- read.table(SM.name, header=TRUE, sep="\t", check.names=FALSE)
62 VM <- read.table(VM.name, header=TRUE, sep="\t", check.names=FALSE)
63
64
65
66 # Table match check with Rchecklibrary
67 table.check <- match3(DM, SM, VM)
68 check.err(table.check)
69
70
71
72 rownames(DM) <- DM[,1]
73 var_names <- DM[,1]
74 DM <- DM[,-1]
75 DM <- data.frame(t(DM))
76
77 class.col <- colnames(SM)[as.numeric(class.col)]
78
79
80 # check class.col, class1 and the number of classes ---------------------------------------------------------
81
82 if(!(class.col %in% colnames(SM))){
83 stop("\n- - - - - - - - -\n", "The column ",class.col, " is not a part of the specify sample Metadata","\n- - - - - - - - -\n")
84 }
85
86 c_class <- SM[,class.col]
87 c_class <- as.factor(c_class)
88 nb_class <- nlevels(c_class)
89 classnames <- levels(c_class)
90
91 if(nb_class < 2){
92 err.1class <- c("\n The column",class.col, "contains only one class, fold calculation could not be executed \n")
93 cat(err.1class)
94 }
95
96 if((nb_class > (nrow(SM))/3)){
97 class.err <- c("\n There are too many classes, think about reducing the number of classes and excluding those
98 with few samples \n")
99 cat(class.err)
100 }
101
102
103 if(type == "One_class"){
104 if(!(class1 %in% classnames)){
105 list.class1 <- c("\n Classes:",classnames,"\n")
106 cat(list.class1)
107 err.class1 <- c("The class ",class1, " does not appear in the column ", class.col)
108 stop("\n- - - - - - - - -\n", err.class1,"\n- - - - - - - - -\n")
109 }
110 }
111
112
113 #If type is "one_class", change others classes in "other"
114 if(type == "One_class"){
115 for(i in 1:length(c_class)){
116 if(c_class[i]!=class1){
117 c_class <- as.character(c_class)
118 c_class[i] <- "Other"
119 c_class <- as.factor(c_class)
120 nb_class <- nlevels(c_class)
121 classnames <- c(class1,"Other")
122
123 }
124 }
125 }
126
127 DM <- cbind(DM,c_class)
128
129
130
131 # fold calculation -------------------------------------------------------------------------------------------
132
133 if(nb_class >= 2){
134
135
136 fold <- data.frame()
137 n <- 1
138 ratio1 <- NULL
139 ratio2 <- NULL
140
141 if(type=="Each_class"){
142 fold.frac <- "Top"
143 }
144
145 for(j in 1:(nb_class-1)){
146 for(k in (j+1):nb_class) {
147
148 if(fold.frac=="Bottom"){
149 ratio1 <- classnames[k]
150 ratio2 <- classnames[j]
151 }else{
152 ratio1 <- classnames[j]
153 ratio2 <- classnames[k]
154 }
155
156 for (i in 1:(length(DM)-1)){
157 fold[i,n] <- mean(DM[which(DM$c_class==ratio1),i], na.rm=TRUE)/
158 mean(DM[which(DM$c_class==ratio2),i], na.rm=TRUE)
159 if(logarithm=="log2"){
160 fold[i,n] <- log2(fold[i,n])
161 }else if(logarithm=="log10"){
162 fold[i,n] <- log10(fold[i,n])
163 }
164 }
165 names(fold)[n] <- paste("fold",ratio1,"VS", ratio2, sep="_")
166 if(logarithm != "none"){
167 names(fold)[n] <- paste(logarithm,names(fold)[n], sep="_")
168 }
169 n <- n + 1}
170 }
171
172 }
173
174 # number and proportion of NA ---------------------------------------------------------------------------------
175
176 calcul_NA <- data.frame()
177 pct_NA <- data.frame()
178 for (i in 1:(length(DM)-1)){
179 for (j in 1:nb_class){
180 n <- 0
181 new_DM <- DM[which(DM$c_class==classnames[j]),i]
182 for(k in 1:length(new_DM)){
183 if (is.na(new_DM[k])){
184 n <- n + 1}
185 calcul_NA[i,j] <- n
186 pct_NA[i,j] <- (calcul_NA[i,j]/length(new_DM))*100}
187 }
188 }
189 names(calcul_NA) <- paste("NA",classnames, sep="_")
190 names(pct_NA) <- paste("Pct_NA", classnames, sep="_")
191
192 # Alert message if there is no NA in data matrix
193
194 sumNA <- colSums(calcul_NA)
195 sum_total <- sum(sumNA)
196 alerte <- NULL
197 if(sum_total==0){
198 alerte <- c(alerte, "Data Matrix contains no NA.\n")
199 }
200
201 if(length(alerte) != 0){
202 cat(alerte,"\n")
203 }
204 table_NA <- cbind(calcul_NA, pct_NA)
205
206
207
208 # check columns names ---------------------------------------------------------------------------------------
209
210
211 VM.names <- colnames(VM)
212
213 # Fold
214
215 if(nb_class >=2){
216 fold.names <- colnames(fold)
217
218 for (i in 1:length(VM.names)){
219 for (j in 1:length(fold.names)){
220 if (VM.names[i]==fold.names[j]){
221 fold.names[j] <- paste(fold.names[j],"2", sep="_")
222 }
223 }
224 }
225 colnames(fold) <- fold.names
226
227 VM <- cbind(VM,fold)
228 }
229
230 # NA
231 NA.names <- colnames(table_NA)
232
233 for (i in 1:length(VM.names)){
234 for (j in 1:length(NA.names)){
235 if (VM.names[i]==NA.names[j]){
236 NA.names[j] <- paste(NA.names[j],"2", sep="_")
237 }
238 }
239 }
240 colnames(table_NA) <- NA.names
241 VM <- cbind(VM,table_NA)
242
243
244 #for NA barplots -------------------------------------------------------------------------------------------
245
246 data_bp <- data.frame()
247
248 for (j in 1:ncol(pct_NA)){
249 Nb_NA_0_20 <- 0
250 Nb_NA_20_40 <- 0
251 Nb_NA_40_60 <- 0
252 Nb_NA_60_80 <- 0
253 Nb_NA_80_100 <- 0
254 for (i in 1:nrow(pct_NA)){
255
256 if ((0<=pct_NA[i,j])&(pct_NA[i,j]<20)){
257 Nb_NA_0_20=Nb_NA_0_20+1
258 }
259
260 if ((20<=pct_NA[i,j])&(pct_NA[i,j]<40)){
261 Nb_NA_20_40=Nb_NA_20_40+1}
262
263 if ((40<=pct_NA[i,j])&(pct_NA[i,j]<60)){
264 Nb_NA_40_60=Nb_NA_40_60+1}
265
266 if ((60<=pct_NA[i,j])&(pct_NA[i,j]<80)){
267 Nb_NA_60_80=Nb_NA_60_80+1}
268
269 if ((80<=pct_NA[i,j])&(pct_NA[i,j]<=100)){
270 Nb_NA_80_100=Nb_NA_80_100+1}
271 }
272 data_bp[1,j] <- Nb_NA_0_20
273 data_bp[2,j] <- Nb_NA_20_40
274 data_bp[3,j] <- Nb_NA_40_60
275 data_bp[4,j] <- Nb_NA_60_80
276 data_bp[5,j] <- Nb_NA_80_100
277 }
278 rownames(data_bp) <- c("0%-20%", "20%-40%", "40%-60%", "60%-80%", "80%-100%")
279 colnames(data_bp) <- classnames
280 data_bp <- as.matrix(data_bp)
281
282
283 # Output ---------------------------------------------------------------------------------------------------
284
285
286 write.table(VM, VM.output,sep="\t", quote=FALSE, row.names=FALSE)
287
288 #graphics pdf
289
290 pdf(graphs.output)
291
292 #Barplots for NA
293 par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE)
294
295 bp=barplot(data_bp, col=rainbow(nrow(data_bp)), main="Proportion of NA", xlab="Classes", ylab="Variables")
296 legend("topright", fill=rainbow(nrow(data_bp)),rownames(data_bp), inset=c(-0.3,0))
297
298 stock=0
299 for (i in 1:nrow(data_bp)){
300 text(bp, stock+data_bp[i,]/2, data_bp[i,], col="white", cex=0.7)
301 stock <- stock+data_bp[i,]
302 }
303
304
305 #Boxplots for fold test
306
307 if(nb_class >= 2){
308
309 clean_fold <- fold
310 for(i in 1:nrow(clean_fold)){
311 for(j in 1:ncol(clean_fold)){
312 if(is.infinite(clean_fold[i,j])){
313 clean_fold[i,j] <- NA
314 }
315 }
316 }
317 for (j in 1:ncol(clean_fold)){
318 title <- paste(fold.names[j])
319 boxplot(clean_fold[j], main=title)
320 }
321 }
322
323 dev.off()
324
325 }
326
327