comparison ST NMF/NMF.R @ 0:7f8631f74bff draft default tip

Uploaded
author jfb
date Wed, 27 Jun 2018 11:42:38 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:7f8631f74bff
# ---- Inputs and derived lookup vectors ----

# File the final "protein name,motif" rows are appended to.
NAMEOFOUTPUTFILE <- "output1.csv"

# Tab-delimited protein table with no header row. Column 1 is used below as
# the protein name and column 3 is read later as the amino-acid sequence.
# Once the other script has converted the FASTA file into this table, put its
# file path and name between the quote marks.
SuperAwesometrial <- read.delim2("input1.tabular", header = FALSE)

# Background table, read without a header and transposed so that the first
# line of the file becomes the first column; the accession numbers are taken
# from that column further down.
SBF <- read.csv("input3.csv", stringsAsFactors = FALSE, header = FALSE)
SBF <- t(SBF)

# Known positive motifs, one letter per cell. Cells with no letter must be
# left blank rather than filled with a space.
# Every column is forced to character: left to guess, read.csv() turns a
# column holding only "T"/"F" residues into logical values and an all-blank
# column into NA, which would later be pasted as "TRUE"/"FALSE"/"NA".
PositiveMotifs <- read.csv(
  "input2.csv",
  stringsAsFactors = FALSE,
  colClasses = "character"
)

# Only referenced by the disabled assignment below; kept for compatibility.
YsToim <- rep("xY", times = nrow(PositiveMotifs))
# PositiveMotifs[, 11] <- YsToim

# Note from the original author: this file NEEDS to have come out of C&D.

# Columns 4 to 18 hold the motif letters (including the "x" marker); they are
# pasted together, split and re-centred by the loop that follows.
Positive9Letters <- PositiveMotifs[, 4:18]
PositiveTrueMotifs <- c()

# Accession numbers: everything in the first column of the transposed table
# except its first entry. Negative indexing also copes with a one-row table,
# where 2:nrow(SBF) would count backwards and index out of bounds.
AccessionNumbers <- as.character(SBF[-1, 1])
AccessionNumbers <- AccessionNumbers[!is.na(AccessionNumbers)]

# Protein names that the accession numbers are searched for in.
ALLPOSSIBLE <- as.character(SuperAwesometrial[, 1])

# ---- Re-centre the known positive motifs ----

# Turn one row of motif cells into a fixed-width string.
#
# The cells are pasted together and split into single characters. The first
# "x" is a marker: the characters before it form the left flank, the character
# straight after it is treated as the centre, and the rest form the right
# flank. Every "x" is removed and both flanks are padded with spaces up to
# `flank` characters, giving a string of 2 * flank + 1 characters.
#
# motif_cells: one row of the positive-motif table (or a character vector).
# flank:       number of positions kept on each side of the centre.
# row_label:   row identifier used only in error messages.
pad_positive_motif <- function(motif_cells, flank = 7, row_label = NA) {
  chars <- unlist(strsplit(paste(motif_cells, collapse = ""), split = ""))

  marker <- match("x", chars)
  if (is.na(marker)) {
    stop(
      "Positive motif in row ", row_label, " has no \"x\" marker",
      call. = FALSE
    )
  }

  left_count <- marker - 1
  right_count <- length(chars) - marker - 1
  if (left_count > flank || right_count > flank) {
    stop(
      "Positive motif in row ", row_label, " has more than ", flank,
      " letters on one side of the \"x\" marker",
      call. = FALSE
    )
  }

  paste(
    c(
      rep(" ", times = flank - left_count),
      chars[chars != "x"],
      rep(" ", times = flank - right_count)
    ),
    collapse = ""
  )
}

# One padded string per row; seq_len() makes an empty table yield an empty
# vector instead of looping over rows 1 and 0.
PositiveTrueMotifs <- vapply(
  seq_len(nrow(Positive9Letters)),
  function(q) pad_positive_motif(Positive9Letters[q, ], row_label = q),
  character(1)
)

# ---- Collect every S/T-centred window from the listed proteins ----

# Return all windows of `sequence` centred on an "S" or a "T".
#
# Each window covers `flank` residues on either side of the centre. Where the
# window would run past the start or end of the sequence it is padded with
# spaces, so every result is 2 * flank + 1 characters long. Returns
# character(0) when the sequence contains no "S" or "T".
extract_st_motifs <- function(sequence, flank = 7) {
  residues <- unlist(strsplit(sequence, ""))
  centres <- which(residues %in% c("S", "T"))
  if (length(centres) == 0) {
    return(character(0))
  }

  # Padding the whole sequence once turns every window, including the ones
  # truncated at either end, into a plain fixed-width substring: residue j
  # sits at position j + flank of the padded string.
  pad <- paste(rep(" ", times = flank), collapse = "")
  padded <- paste0(pad, sequence, pad)
  substring(padded, centres, centres + 2 * flank)
}

# An empty pattern matches every string under fixed matching, so a blank
# accession would pull in the whole protein table; skip those.
accessions <- AccessionNumbers[nzchar(AccessionNumbers)]

# Motifs and their protein names are gathered per protein and combined once at
# the end rather than growing two matrices with rbind() for every motif.
motif_chunks <- list()
name_chunks <- list()

for (accession in accessions) {
  # Proteins whose name contains this accession number as a literal substring.
  hits <- which(grepl(accession, ALLPOSSIBLE, fixed = TRUE))

  for (i in hits) {
    # The protein name is the first column; names may contain commas, which
    # would break the comma-separated output, so remove all of them.
    protein <- gsub(",", "", as.character(SuperAwesometrial[i, 1]), fixed = TRUE)

    # The amino-acid sequence is stored in the third column.
    motifs <- extract_st_motifs(as.character(SuperAwesometrial[i, 3]))

    slot <- length(motif_chunks) + 1
    motif_chunks[[slot]] <- motifs
    name_chunks[[slot]] <- rep(protein, times = length(motifs))
  }
}

# Single-column character matrices whose first row is a header label, followed
# by one row per motif in the order the motifs were found.
allmotifs <- matrix(c("Motifs", unlist(motif_chunks)), ncol = 1)
thenames <- matrix(c("AccessionNumbers", unlist(name_chunks)), ncol = 1)

# ---- De-duplicate, drop the known positives and write the result ----

# Label every motif with the protein name recorded alongside it.
names(allmotifs) <- thenames

# Keep only the first occurrence of each motif; the names travel with the
# motifs, so each survivor keeps the name of the protein it was first seen in.
is_repeat <- duplicated(allmotifs)
truemotifs <- allmotifs[!is_repeat]

# Discard every motif that is also present in the positive set.
is_known_positive <- truemotifs %in% PositiveTrueMotifs
truemotifs <- truemotifs[!is_known_positive]

# Two columns, protein name then motif, with any remaining commas stripped so
# they cannot be mistaken for the field separator.
outputfile <- gsub(",", "", cbind(names(truemotifs), truemotifs))

# Append the rows to the output file as unquoted, comma-separated text with no
# row or column names.
write.table(
  outputfile,
  file = NAMEOFOUTPUTFILE,
  append = TRUE,
  quote = FALSE,
  sep = ",",
  na = "",
  row.names = FALSE,
  col.names = FALSE
)