annotate ST NMF/test-data/NMF.R @ 0:7f8631f74bff draft default tip

Uploaded
author jfb
date Wed, 27 Jun 2018 11:42:38 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
7f8631f74bff Uploaded
jfb
parents:
diff changeset
# ---- Inputs and lookup vectors ----

# Path of the CSV that the final result table is written to.
NAMEOFOUTPUTFILE <- "output1.csv"

# Tab-separated protein table without a header row. Later steps read column 1
# as the protein name and column 3 as the amino-acid sequence.
# Once the FASTA file has been converted to a table by the companion script,
# put the path of that table between the quote marks.
SuperAwesometrial <- read.delim2("input1.tabular", header = FALSE)

# Header-less CSV, transposed so that its first row becomes the first column.
SBF <- read.csv("input3.csv", stringsAsFactors = FALSE, header = FALSE)
SBF <- t(SBF)

# Positive motifs. Positions without a residue must be blank cells, not spaces.
# Every column is read as character on purpose: when read.csv guesses types, a
# column holding only "T"/"F" becomes logical and an all-blank column becomes
# logical NA, which would later be pasted into motifs as "TRUE"/"FALSE"/"NA".
PositiveMotifs <- read.csv("input2.csv", stringsAsFactors = FALSE,
                           colClasses = "character")

# Only referenced by the disabled assignment below; kept for compatibility.
YsToim <- rep("xY", times = nrow(PositiveMotifs))
# PositiveMotifs[, 11] <- YsToim

# NOTE: this step needs the data to have come out of C&D.

################################################################################
# Columns 4 to 18 hold the motif, one position per cell. They are pasted,
# split into letters, searched for the "x" marker and pasted again later on.
Positive9Letters <- PositiveMotifs[, 4:18]
# head(Positive9Letters)
PositiveTrueMotifs <- c()

# Accession numbers: first column of the transposed table without its first
# entry. Negative indexing stays correct when the table has a single row,
# where 2:nrow(SBF) would count down to c(2, 1).
AccessionNumbers <- as.character(SBF[-1, 1])
# Drop missing and blank entries: an empty pattern matches every protein name
# in a fixed-string search, which would pull in the whole protein table.
AccessionNumbers <- AccessionNumbers[
  !is.na(AccessionNumbers) & nzchar(trimws(AccessionNumbers))
]

# Protein names that the accession numbers are searched in.
ALLPOSSIBLE <- as.character(SuperAwesometrial[, 1])
7f8631f74bff Uploaded
jfb
parents:
diff changeset
31 ################################################################################################################################
7f8631f74bff Uploaded
jfb
parents:
diff changeset
32
7f8631f74bff Uploaded
jfb
parents:
diff changeset
# Turn every row of Positive9Letters into a fixed-width, 15-character motif
# string and append it to PositiveTrueMotifs.
#
# Each row is pasted into one string and split into single characters. The
# first "x" marks the position just before the central residue. The marker is
# removed and the motif is padded with spaces so that the central residue has
# exactly 7 positions on either side.
#
# Stops with an explicit message when a row has no "x" marker or has more than
# 7 residues on one side; both cases previously failed inside rep() with
# "invalid 'times' argument".
PositiveTrueMotifs <- c(
  PositiveTrueMotifs,
  vapply(
    # seq_len() yields no iterations for an empty table, unlike 1:nrow().
    seq_len(nrow(Positive9Letters)),
    function(q) {
      cells <- vapply(Positive9Letters[q, , drop = FALSE], as.character,
                      character(1), USE.NAMES = FALSE)
      # A missing cell means "no residue here"; it must not become "NA".
      cells[is.na(cells)] <- ""
      residues <- unlist(strsplit(paste(cells, collapse = ""), split = ""))

      marker <- match("x", residues)
      if (is.na(marker)) {
        stop(sprintf("Positive motif row %d has no 'x' marker.", q),
             call. = FALSE)
      }

      # Residues left of the marker, and right of the residue that follows it.
      left_count <- marker - 1
      right_count <- length(residues) - marker - 1
      if (left_count > 7 || right_count > 7) {
        stop(sprintf("Positive motif row %d has more than 7 flanking residues.",
                     q), call. = FALSE)
      }

      paste(
        c(rep(" ", times = 7 - left_count),
          residues[!residues %in% "x"],
          rep(" ", times = 7 - right_count)),
        collapse = ""
      )
    },
    character(1)
  )
)
7f8631f74bff Uploaded
jfb
parents:
diff changeset
55
7f8631f74bff Uploaded
jfb
parents:
diff changeset
56
7f8631f74bff Uploaded
jfb
parents:
diff changeset
57 ################################################################################################################################
7f8631f74bff Uploaded
jfb
parents:
diff changeset
# Collect every S/T-centred window from the proteins whose name contains one of
# the accession numbers.
#
# Result: two single-column character matrices with matching rows.
#   allmotifs - first row "Motifs", then one 15-character window per S/T found
#   thenames  - first row "AccessionNumbers", then the protein name (commas
#               removed) that each window came from
# A protein matched by several accession numbers is scanned once per match, so
# duplicate rows are possible here.

# Rows of the protein table to scan, in accession order. grepl() is already
# vectorised over the names; seq-free iteration also copes with an empty
# accession list.
protein_rows <- unlist(
  lapply(AccessionNumbers, function(accession) {
    which(grepl(accession, ALLPOSSIBLE, fixed = TRUE))
  }),
  use.names = FALSE
)

# For each protein, cut a window of up to 7 residues either side of every S/T.
windows_by_protein <- lapply(protein_rows, function(i) {
  # The amino acids are stored in the third column; split them into letters.
  residues <- unlist(strsplit(as.character(SuperAwesometrial[i, 3]), ""))
  sites <- which(residues %in% c("S", "T"))
  vapply(sites, function(j) {
    # Clamp the window to the sequence so indices never fall off either end.
    a <- max(j - 7, 1)
    b <- min(j + 7, length(residues))
    # Pad with spaces where the sequence ends before 7 residues are reached,
    # so every window is 15 characters with the S/T in the middle.
    paste(
      c(rep(" ", times = 7 - (j - a)),
        residues[a:b],
        rep(" ", times = 7 - (b - j))),
      collapse = ""
    )
  }, character(1))
})

# The protein name is the first column; names may contain commas, remove them.
protein_labels <- gsub(",", "", as.character(SuperAwesometrial[protein_rows, 1]),
                       fixed = TRUE)

# Assemble both matrices in one step instead of growing them row by row.
allmotifs <- matrix(
  c("Motifs", unlist(windows_by_protein, use.names = FALSE)),
  ncol = 1
)
thenames <- matrix(
  c("AccessionNumbers",
    rep(protein_labels, times = lengths(windows_by_protein))),
  ncol = 1
)
7f8631f74bff Uploaded
jfb
parents:
diff changeset
136
7f8631f74bff Uploaded
jfb
parents:
diff changeset
137
7f8631f74bff Uploaded
jfb
parents:
diff changeset
138
7f8631f74bff Uploaded
jfb
parents:
diff changeset
139
7f8631f74bff Uploaded
jfb
parents:
diff changeset
140 ################################################################################################################################
7f8631f74bff Uploaded
jfb
parents:
diff changeset
141 ################################################################################################################################
7f8631f74bff Uploaded
jfb
parents:
diff changeset
142 ################################################################################################################################
7f8631f74bff Uploaded
jfb
parents:
diff changeset
143
7f8631f74bff Uploaded
jfb
parents:
diff changeset
144
7f8631f74bff Uploaded
jfb
parents:
diff changeset
145 # for (i in 1:nrow(SuperAwesometrial)){
7f8631f74bff Uploaded
jfb
parents:
diff changeset
146 #
7f8631f74bff Uploaded
jfb
parents:
diff changeset
147 # }
7f8631f74bff Uploaded
jfb
parents:
diff changeset
148
7f8631f74bff Uploaded
jfb
parents:
diff changeset
# Label each motif with the protein name it came from so the label follows the
# motif through the filtering below.
names(allmotifs) <- thenames

# Remove duplicated motifs; the first occurrence and its name are kept.
truemotifs <- allmotifs[!duplicated(allmotifs)]

# Drop every motif that exactly equals one of the positive motifs. (An earlier,
# now disabled, version did this with a case-insensitive grepl() per pair.)
truemotifs <- truemotifs[!truemotifs %in% PositiveTrueMotifs]

# Two columns: protein name, motif. Commas are stripped from both because the
# table is written comma-separated without quoting.
outputfile <- cbind(names(truemotifs), truemotifs)
outputfile <- gsub(",", "", outputfile)

# The table is complete in memory and written once, so the file is replaced
# rather than appended to; appending would stack a second header row and
# duplicate rows onto the output of a previous run.
write.table(outputfile, file = NAMEOFOUTPUTFILE, quote = FALSE, sep = ",",
            row.names = FALSE, col.names = FALSE, na = "", append = FALSE)