Mercurial > repos > rnateam > rnacommender
comparison rbpfeatures.py @ 0:d04fa5201f51 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/rna_commander/tools/rna_tools/rna_commender commit 7ad344d108076116e702e1c1e91cea73d8fcadc4
author | rnateam |
---|---|
date | Thu, 28 Jul 2016 05:56:54 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d04fa5201f51 |
---|---|
1 """Compute the RBP features.""" | |
2 | |
3 import re | |
4 import sys | |
5 import subprocess as sp | |
6 import uuid | |
7 from os import mkdir | |
8 from os import listdir | |
9 from os.path import isfile, join | |
10 from os import devnull | |
11 from shutil import rmtree | |
12 | |
13 import numpy as np | |
14 | |
15 import pandas as pd | |
16 | |
17 import fasta_utils | |
18 import pfam_utils | |
19 | |
20 __author__ = "Gianluca Corrado" | |
21 __copyright__ = "Copyright 2016, Gianluca Corrado" | |
22 __license__ = "MIT" | |
23 __maintainer__ = "Gianluca Corrado" | |
24 __email__ = "gianluca.corrado@unitn.it" | |
25 __status__ = "Production" | |
26 | |
27 | |
28 class RBPVectorizer(): | |
29 """Compute the RBP features.""" | |
30 | |
31 def __init__(self, fasta): | |
32 """ | |
33 Constructor. | |
34 | |
35 Parameters | |
36 ---------- | |
37 fasta : str | |
38 Fasta file containing the RBP sequences to predict. | |
39 """ | |
40 self.fasta = fasta | |
41 | |
42 self._mod_fold = "AURA_Human_data/RBP_features/mod" | |
43 self._reference_fisher_scores = \ | |
44 "AURA_Human_data/RBP_features/fisher_scores_ref" | |
45 self._train_rbps_file = \ | |
46 "AURA_Human_data/RBP_features/rbps_in_train.txt" | |
47 | |
48 self._temp_fold = "temp_" + str(uuid.uuid4()) | |
49 self.pfam_scan = "%s/pfam_scan.txt" % self._temp_fold | |
50 self._dom_fold = "%s/domains" % self._temp_fold | |
51 self._seeds_fold = "%s/seeds" % self._temp_fold | |
52 self._fisher_fold = "%s/fisher_scores" % self._temp_fold | |
53 | |
54 def _pfam_scan(self): | |
55 """Scan the sequences against the Pfam database.""" | |
56 nf = open(self.pfam_scan, "w") | |
57 nf.write(pfam_utils.search_header()) | |
58 | |
59 fasta = fasta_utils.import_fasta(self.fasta) | |
60 | |
61 if len(fasta) != 1: | |
62 sys.exit("""Fasta file must contain exactly one sequence.""") | |
63 | |
64 for rbp in sorted(fasta.keys()): | |
65 seq = fasta[rbp] | |
66 text = pfam_utils.sequence_search(rbp, seq) | |
67 nf.write(text) | |
68 | |
69 nf.close() | |
70 | |
71 def _overlapping_domains(self): | |
72 """Compute the set of domains contributing to the similarity.""" | |
73 reference_domains = set([dom.replace(".mod", "") for dom in | |
74 listdir(self._mod_fold) if | |
75 isfile(join(self._mod_fold, dom))]) | |
76 | |
77 data = pfam_utils.read_pfam_output(self.pfam_scan) | |
78 if data is None: | |
79 return [] | |
80 | |
81 prot_domains = set([a.split('.')[0] for a in data["hmm_acc"]]) | |
82 dom_list = sorted(list(reference_domains & prot_domains)) | |
83 | |
84 return dom_list | |
85 | |
86 def _prepare_domains(self, dom_list): | |
87 """Select domain subsequences from the entire protein sequences.""" | |
88 def prepare_domains(fasta_dic, dom_list, pfam_scan, out_folder): | |
89 out_file_dic = {} | |
90 for acc in dom_list: | |
91 out_file_dic[acc] = open("%s/%s.fa" % (out_folder, acc), "w") | |
92 | |
93 f = open(pfam_scan) | |
94 f.readline() | |
95 for line in f: | |
96 split = line.split() | |
97 rbp = split[0] | |
98 start = int(split[3]) | |
99 stop = int(split[4]) | |
100 acc = split[5].split('.')[0] | |
101 if acc in out_file_dic.keys(): | |
102 out_file_dic[acc].write( | |
103 ">%s:%i-%i\n%s\n" % (rbp, start, stop, | |
104 fasta_dic[rbp][start:stop])) | |
105 f.close() | |
106 | |
107 for acc in dom_list: | |
108 out_file_dic[acc].close() | |
109 | |
110 mkdir(self._dom_fold) | |
111 fasta = fasta_utils.import_fasta(self.fasta) | |
112 prepare_domains(fasta, dom_list, self.pfam_scan, | |
113 self._dom_fold) | |
114 | |
115 def _compute_fisher_scores(self, dom_list): | |
116 """Wrapper for SAM 3.5 get_fisher_scores.""" | |
117 def get_fisher_scores(dom_list, mod_fold, dom_fold, fisher_fold): | |
118 for acc in dom_list: | |
119 _FNULL = open(devnull, 'w') | |
120 cmd = "get_fisher_scores run -i %s/%s.mod -db %s/%s.fa" % ( | |
121 mod_fold, acc, dom_fold, acc) | |
122 fisher = sp.check_output( | |
123 cmd, shell=True, stderr=_FNULL) | |
124 nf = open("%s/%s.txt" % (fisher_fold, acc), "w") | |
125 nf.write(fisher) | |
126 nf.close() | |
127 | |
128 mkdir(self._fisher_fold) | |
129 get_fisher_scores(dom_list, self._mod_fold, self._dom_fold, | |
130 self._fisher_fold) | |
131 | |
132 def _ekm(self, dom_list): | |
133 """Compute the empirical kernel map from the Fisher scores.""" | |
134 def process_seg(e): | |
135 """Process segment of a SAM 3.5 get_fisher_scores output file.""" | |
136 seg = e.split() | |
137 c = seg[0].split(':')[0] | |
138 m = map(float, seg[3:]) | |
139 return c, m | |
140 | |
141 def read_sam_file(samfile): | |
142 """Read a SAM 3.5 get_fisher_scores output file.""" | |
143 f = open(samfile) | |
144 data = f.read() | |
145 f.close() | |
146 | |
147 columns = [] | |
148 m = [] | |
149 split = re.split(">A ", data)[1:] | |
150 for e in split: | |
151 c, m_ = process_seg(e) | |
152 columns.append(c) | |
153 m.append(m_) | |
154 | |
155 m = np.matrix(m) | |
156 df = pd.DataFrame(data=m.T, columns=columns) | |
157 return df | |
158 | |
159 def dom_features(fisher_fold, dom_list, names=None): | |
160 """Compute the features with respect to a domain type.""" | |
161 dfs = [] | |
162 for acc in dom_list: | |
163 df = read_sam_file("%s/%s.txt" % (fisher_fold, acc)) | |
164 df = df.groupby(df.columns, axis=1).mean() | |
165 dfs.append(df) | |
166 | |
167 con = pd.concat(dfs, ignore_index=True) | |
168 | |
169 if names is not None: | |
170 add = sorted(list(set(names) - set(con.columns))) | |
171 fil = sorted(list(set(names) - set(add))) | |
172 con = con[fil] | |
173 for c in add: | |
174 con[c] = np.zeros(len(con.index), dtype='float64') | |
175 con = con[names] | |
176 | |
177 con = con.fillna(0.0) | |
178 return con | |
179 | |
180 f = open(self._train_rbps_file) | |
181 train_rbps = f.read().strip().split('\n') | |
182 f.close() | |
183 ref = dom_features(self._reference_fisher_scores, dom_list, | |
184 names=train_rbps) | |
185 ekm_ref = ref.T.dot(ref) | |
186 ekm_ref.index = ekm_ref.columns | |
187 | |
188 sel = dom_features(self._fisher_fold, dom_list) | |
189 | |
190 ekm_sel = sel.T.dot(sel) | |
191 ekm_sel.index = ekm_sel.columns | |
192 | |
193 ekm = ref.T.dot(sel) | |
194 | |
195 for rs in ekm.columns: | |
196 for rr in ekm.index: | |
197 if ekm_ref[rr][rr] > 0 and ekm_sel[rs][rs] > 0: | |
198 ekm[rs][rr] /= np.sqrt(ekm_ref[rr][rr] * ekm_sel[rs][rs]) | |
199 return ekm | |
200 | |
201 def vectorize(self): | |
202 """Produce the RBP features.""" | |
203 # create a temporary folder | |
204 mkdir(self._temp_fold) | |
205 # scan the RBP sequences against Pfam | |
206 self._pfam_scan() | |
207 # determine the accession numbers of the pfam domains needed for | |
208 # computing the features | |
209 dom_list = self._overlapping_domains() | |
210 if len(dom_list) == 0: | |
211 rmtree(self._temp_fold) | |
212 return None | |
213 # prepare fasta file with the sequence of the domains | |
214 self._prepare_domains(dom_list) | |
215 # compute fisher scores using SAM 3.5 | |
216 self._compute_fisher_scores(dom_list) | |
217 # compute the empirical kernel map | |
218 ekm = self._ekm(dom_list) | |
219 # remove the temporary folder | |
220 rmtree(self._temp_fold) | |
221 return ekm |