comparison rbpfeatures.py @ 0:d04fa5201f51 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/rna_commander/tools/rna_tools/rna_commender commit 7ad344d108076116e702e1c1e91cea73d8fcadc4
author rnateam
date Thu, 28 Jul 2016 05:56:54 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d04fa5201f51
1 """Compute the RBP features."""
2
3 import re
4 import sys
5 import subprocess as sp
6 import uuid
7 from os import mkdir
8 from os import listdir
9 from os.path import isfile, join
10 from os import devnull
11 from shutil import rmtree
12
13 import numpy as np
14
15 import pandas as pd
16
17 import fasta_utils
18 import pfam_utils
19
20 __author__ = "Gianluca Corrado"
21 __copyright__ = "Copyright 2016, Gianluca Corrado"
22 __license__ = "MIT"
23 __maintainer__ = "Gianluca Corrado"
24 __email__ = "gianluca.corrado@unitn.it"
25 __status__ = "Production"
26
27
28 class RBPVectorizer():
29 """Compute the RBP features."""
30
31 def __init__(self, fasta):
32 """
33 Constructor.
34
35 Parameters
36 ----------
37 fasta : str
38 Fasta file containing the RBP sequences to predict.
39 """
40 self.fasta = fasta
41
42 self._mod_fold = "AURA_Human_data/RBP_features/mod"
43 self._reference_fisher_scores = \
44 "AURA_Human_data/RBP_features/fisher_scores_ref"
45 self._train_rbps_file = \
46 "AURA_Human_data/RBP_features/rbps_in_train.txt"
47
48 self._temp_fold = "temp_" + str(uuid.uuid4())
49 self.pfam_scan = "%s/pfam_scan.txt" % self._temp_fold
50 self._dom_fold = "%s/domains" % self._temp_fold
51 self._seeds_fold = "%s/seeds" % self._temp_fold
52 self._fisher_fold = "%s/fisher_scores" % self._temp_fold
53
54 def _pfam_scan(self):
55 """Scan the sequences against the Pfam database."""
56 nf = open(self.pfam_scan, "w")
57 nf.write(pfam_utils.search_header())
58
59 fasta = fasta_utils.import_fasta(self.fasta)
60
61 if len(fasta) != 1:
62 sys.exit("""Fasta file must contain exactly one sequence.""")
63
64 for rbp in sorted(fasta.keys()):
65 seq = fasta[rbp]
66 text = pfam_utils.sequence_search(rbp, seq)
67 nf.write(text)
68
69 nf.close()
70
71 def _overlapping_domains(self):
72 """Compute the set of domains contributing to the similarity."""
73 reference_domains = set([dom.replace(".mod", "") for dom in
74 listdir(self._mod_fold) if
75 isfile(join(self._mod_fold, dom))])
76
77 data = pfam_utils.read_pfam_output(self.pfam_scan)
78 if data is None:
79 return []
80
81 prot_domains = set([a.split('.')[0] for a in data["hmm_acc"]])
82 dom_list = sorted(list(reference_domains & prot_domains))
83
84 return dom_list
85
86 def _prepare_domains(self, dom_list):
87 """Select domain subsequences from the entire protein sequences."""
88 def prepare_domains(fasta_dic, dom_list, pfam_scan, out_folder):
89 out_file_dic = {}
90 for acc in dom_list:
91 out_file_dic[acc] = open("%s/%s.fa" % (out_folder, acc), "w")
92
93 f = open(pfam_scan)
94 f.readline()
95 for line in f:
96 split = line.split()
97 rbp = split[0]
98 start = int(split[3])
99 stop = int(split[4])
100 acc = split[5].split('.')[0]
101 if acc in out_file_dic.keys():
102 out_file_dic[acc].write(
103 ">%s:%i-%i\n%s\n" % (rbp, start, stop,
104 fasta_dic[rbp][start:stop]))
105 f.close()
106
107 for acc in dom_list:
108 out_file_dic[acc].close()
109
110 mkdir(self._dom_fold)
111 fasta = fasta_utils.import_fasta(self.fasta)
112 prepare_domains(fasta, dom_list, self.pfam_scan,
113 self._dom_fold)
114
115 def _compute_fisher_scores(self, dom_list):
116 """Wrapper for SAM 3.5 get_fisher_scores."""
117 def get_fisher_scores(dom_list, mod_fold, dom_fold, fisher_fold):
118 for acc in dom_list:
119 _FNULL = open(devnull, 'w')
120 cmd = "get_fisher_scores run -i %s/%s.mod -db %s/%s.fa" % (
121 mod_fold, acc, dom_fold, acc)
122 fisher = sp.check_output(
123 cmd, shell=True, stderr=_FNULL)
124 nf = open("%s/%s.txt" % (fisher_fold, acc), "w")
125 nf.write(fisher)
126 nf.close()
127
128 mkdir(self._fisher_fold)
129 get_fisher_scores(dom_list, self._mod_fold, self._dom_fold,
130 self._fisher_fold)
131
132 def _ekm(self, dom_list):
133 """Compute the empirical kernel map from the Fisher scores."""
134 def process_seg(e):
135 """Process segment of a SAM 3.5 get_fisher_scores output file."""
136 seg = e.split()
137 c = seg[0].split(':')[0]
138 m = map(float, seg[3:])
139 return c, m
140
141 def read_sam_file(samfile):
142 """Read a SAM 3.5 get_fisher_scores output file."""
143 f = open(samfile)
144 data = f.read()
145 f.close()
146
147 columns = []
148 m = []
149 split = re.split(">A ", data)[1:]
150 for e in split:
151 c, m_ = process_seg(e)
152 columns.append(c)
153 m.append(m_)
154
155 m = np.matrix(m)
156 df = pd.DataFrame(data=m.T, columns=columns)
157 return df
158
159 def dom_features(fisher_fold, dom_list, names=None):
160 """Compute the features with respect to a domain type."""
161 dfs = []
162 for acc in dom_list:
163 df = read_sam_file("%s/%s.txt" % (fisher_fold, acc))
164 df = df.groupby(df.columns, axis=1).mean()
165 dfs.append(df)
166
167 con = pd.concat(dfs, ignore_index=True)
168
169 if names is not None:
170 add = sorted(list(set(names) - set(con.columns)))
171 fil = sorted(list(set(names) - set(add)))
172 con = con[fil]
173 for c in add:
174 con[c] = np.zeros(len(con.index), dtype='float64')
175 con = con[names]
176
177 con = con.fillna(0.0)
178 return con
179
180 f = open(self._train_rbps_file)
181 train_rbps = f.read().strip().split('\n')
182 f.close()
183 ref = dom_features(self._reference_fisher_scores, dom_list,
184 names=train_rbps)
185 ekm_ref = ref.T.dot(ref)
186 ekm_ref.index = ekm_ref.columns
187
188 sel = dom_features(self._fisher_fold, dom_list)
189
190 ekm_sel = sel.T.dot(sel)
191 ekm_sel.index = ekm_sel.columns
192
193 ekm = ref.T.dot(sel)
194
195 for rs in ekm.columns:
196 for rr in ekm.index:
197 if ekm_ref[rr][rr] > 0 and ekm_sel[rs][rs] > 0:
198 ekm[rs][rr] /= np.sqrt(ekm_ref[rr][rr] * ekm_sel[rs][rs])
199 return ekm
200
201 def vectorize(self):
202 """Produce the RBP features."""
203 # create a temporary folder
204 mkdir(self._temp_fold)
205 # scan the RBP sequences against Pfam
206 self._pfam_scan()
207 # determine the accession numbers of the pfam domains needed for
208 # computing the features
209 dom_list = self._overlapping_domains()
210 if len(dom_list) == 0:
211 rmtree(self._temp_fold)
212 return None
213 # prepare fasta file with the sequence of the domains
214 self._prepare_domains(dom_list)
215 # compute fisher scores using SAM 3.5
216 self._compute_fisher_scores(dom_list)
217 # compute the empirical kernel map
218 ekm = self._ekm(dom_list)
219 # remove the temporary folder
220 rmtree(self._temp_fold)
221 return ekm