Mercurial > repos > jfb > st_nmf
comparison ST NMF/NMF.R @ 0:7f8631f74bff draft default tip
Uploaded
| author | jfb |
|---|---|
| date | Wed, 27 Jun 2018 11:42:38 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:7f8631f74bff |
|---|---|
# ---- Inputs and shared lookups ----------------------------------------------
# Name of the CSV that the final motif table is written to.
NAMEOFOUTPUTFILE <- "output1.csv"

# Protein table, read without a header. Further down the script column 1 is
# used as the protein name and column 3 as the amino-acid sequence.
SuperAwesometrial <- read.delim2("input1.tabular", header = FALSE)

# input3.csv is read without a header and transposed, so what was its first
# row becomes the first column of SBF.
SBF <- read.csv("input3.csv", stringsAsFactors = FALSE, header = FALSE)
SBF <- t(SBF)

# Known positive motifs, one per row; positions without a letter must be blank
# cells rather than spaces.
# Every column is forced to character: when read.csv guesses types, a column
# containing only "T"/"F" becomes logical TRUE/FALSE and an all-blank column
# becomes logical NA, and those would later be pasted into the motif text as
# "TRUE"/"FALSE"/"NA".
PositiveMotifs <- read.csv(
  "input2.csv",
  stringsAsFactors = FALSE,
  colClasses = "character"
)

# Only referenced by a disabled line (PositiveMotifs[, 11] <- YsToim); kept so
# the set of objects this script defines is unchanged.
YsToim <- rep("xY", times = nrow(PositiveMotifs))

# Columns 4 to 18 hold the motif letters, one cell per position.
Positive9Letters <- PositiveMotifs[, 4:18]
PositiveTrueMotifs <- c()

# Accession numbers are the first column of the transposed table minus its
# first entry. Negative indexing (instead of 2:nrow) yields an empty vector
# rather than an out-of-bounds error when there is only one row.
AccessionNumbers <- as.character(SBF[-1, 1])
# Drop missing AND empty entries: an empty string used as a fixed pattern
# matches every protein name in the search below.
AccessionNumbers <- AccessionNumbers[
  !is.na(AccessionNumbers) & nzchar(AccessionNumbers)
]

# Protein names to search the accession numbers in.
ALLPOSSIBLE <- as.character(SuperAwesometrial[, 1])
| 31 ################################################################################################################################ | |
| 32 | |
# Turn every row of Positive9Letters into one 15-character string.
#
# The cells of a row are pasted together and split into single characters.
# The character "x" marks the centre: the letter right after it is the central
# residue. Everything before the "x" is the left flank, everything after the
# central residue is the right flank. Each flank is padded with spaces up to 7
# characters and the "x" itself is removed, so known motifs can be compared
# with the space-padded windows cut from the protein sequences.
#
# Result: PositiveTrueMotifs, a character vector with one entry per row
# (empty when there are no rows).
PositiveTrueMotifs <- vapply(
  seq_len(nrow(Positive9Letters)),
  function(q) {
    cells <- unlist(
      lapply(Positive9Letters[q, , drop = FALSE], as.character),
      use.names = FALSE
    )
    # A blank cell may have been read as NA; it stands for "no letter" and
    # must not be pasted in as the text "NA".
    cells[is.na(cells)] <- ""

    motifmotif <- unlist(strsplit(paste(cells, collapse = ""), split = ""))

    position <- match("x", motifmotif)
    if (is.na(position)) {
      stop(
        sprintf("Positive motif row %d contains no 'x' marker.", q),
        call. = FALSE
      )
    }

    # Number of letters on each side of the central residue.
    LeftJust <- position - 1
    RightJust <- length(motifmotif) - position - 1
    if (LeftJust > 7 || RightJust > 7) {
      stop(
        sprintf(
          "Positive motif row %d has more than 7 letters on one side of 'x'.",
          q
        ),
        call. = FALSE
      )
    }

    motifmotif <- motifmotif[motifmotif != "x"]

    paste(
      c(rep(" ", times = 7 - LeftJust), motifmotif,
        rep(" ", times = 7 - RightJust)),
      collapse = ""
    )
  },
  character(1)
)
| 55 | |
| 56 | |
| 57 ################################################################################################################################ | |
# Cut a fixed-width window around every S or T of one protein sequence.
#
# sequence: a single string of one-letter amino-acid codes.
# flank:    number of residues kept on each side of the S/T (default 7).
# Returns a character vector with one window of 2 * flank + 1 characters per
# S/T, in sequence order. Windows that would run off either end of the protein
# are padded with spaces so the S/T always sits in the middle. Returns
# character(0) when the sequence holds no S or T (including NA / empty input).
extract_st_motifs <- function(sequence, flank = 7L) {
  residues <- unlist(strsplit(as.character(sequence), split = ""))
  sites <- which(residues %in% c("S", "T"))
  if (length(sites) == 0L) {
    # substring() rejects zero-length positions, so stop here.
    return(character(0))
  }
  padding <- paste(rep(" ", times = flank), collapse = "")
  padded <- paste0(padding, sequence, padding)
  # Residue j sits at position j + flank of the padded string, so its window
  # spans positions j .. j + 2 * flank.
  substring(padded, sites, sites + 2L * flank)
}

# For every accession number, find the proteins whose name (column 1 of
# SuperAwesometrial) contains it as a literal substring, and collect the S/T
# windows of their sequences (column 3) together with the protein name.
# A protein matched by several accession numbers is collected several times;
# duplicates are removed later in the script.
motif_chunks <- list()
name_chunks <- list()

for (q in seq_along(AccessionNumbers)) {
  patterno <- as.character(AccessionNumbers[q])
  whereisit <- which(grepl(patterno, ALLPOSSIBLE, fixed = TRUE))

  for (i in whereisit) {
    # Protein names may contain commas, which would break the CSV output.
    name <- gsub(",", "", as.character(SuperAwesometrial[i, 1]), fixed = TRUE)
    motifs <- extract_st_motifs(as.character(SuperAwesometrial[i, 3]))

    slot <- length(motif_chunks) + 1L
    motif_chunks[[slot]] <- motifs
    name_chunks[[slot]] <- rep(name, times = length(motifs))
  }
}

# One-column matrices; the first row of each is the column title that ends up
# as the first line of the output.
allmotifs <- matrix(data = c("Motifs", unlist(motif_chunks)), ncol = 1)
thenames <- matrix(
  data = c("AccessionNumbers", unlist(name_chunks)),
  ncol = 1
)
| 136 | |
| 137 | |
| 138 | |
| 139 | |
| 140 ################################################################################################################################ | |
| 141 ################################################################################################################################ | |
| 142 ################################################################################################################################ | |
| 143 | |
| 144 | |
| 145 # for (i in 1:nrow(SuperAwesometrial)){ | |
| 146 # | |
| 147 # } | |
| 148 | |
# ---- De-duplicate, filter and write the result ------------------------------
# Attach the protein name of every motif as its element name so that the name
# travels with the motif through the subsetting steps below.
names(allmotifs) <- thenames

# Keep only the first occurrence of every motif (names are not de-duplicated
# separately; each surviving motif keeps the name it was first seen with).
is_repeat <- duplicated(allmotifs)
truemotifs <- allmotifs[!is_repeat]

# Remove every motif that exactly equals one of the known positive motifs.
is_known_positive <- truemotifs %in% PositiveTrueMotifs
truemotifs <- truemotifs[!is_known_positive]

# Two columns: protein name, motif. Commas are stripped from both so they
# cannot be mistaken for the field separator.
outputfile <- gsub(",", "", cbind(names(truemotifs), truemotifs))

# Rows are appended to NAMEOFOUTPUTFILE; an existing file is extended rather
# than overwritten.
write.table(
  outputfile,
  file = NAMEOFOUTPUTFILE,
  quote = FALSE,
  sep = ",",
  row.names = FALSE,
  col.names = FALSE,
  na = "",
  append = TRUE
)
