Mercurial > repos > peterjc > tmhmm_and_signalp
comparison tools/protein_analysis/rxlr_motifs.py @ 26:20139cb4c844 draft
planemo upload for repository https://github.com/peterjc/pico_galaxy/tools/protein_analysis commit 221d4187992cbb993e02dc3ea0ef0150c7916a4a-dirty
author | peterjc |
---|---|
date | Wed, 13 May 2015 06:14:42 -0400 |
parents | 2b35b5c4b7f4 |
children | 3cb02adf4326 |
comparison
equal
deleted
inserted
replaced
25:41a42022f815 | 26:20139cb4c844 |
---|---|
38 """ | 38 """ |
39 import os | 39 import os |
40 import sys | 40 import sys |
41 import re | 41 import re |
42 import subprocess | 42 import subprocess |
43 from seq_analysis_utils import stop_err, fasta_iterator | 43 from seq_analysis_utils import sys_exit, fasta_iterator |
44 | |
45 if "-v" in sys.argv: | |
46 print("RXLR Motifs v0.0.10") | |
47 sys.exit(0) | |
44 | 48 |
45 if len(sys.argv) != 5: | 49 if len(sys.argv) != 5: |
46 stop_err("Requires four arguments: protein FASTA filename, threads, model, and output filename") | 50 sys_exit("Requires four arguments: protein FASTA filename, threads, model, and output filename") |
47 | 51 |
48 fasta_file, threads, model, tabular_file = sys.argv[1:] | 52 fasta_file, threads, model, tabular_file = sys.argv[1:] |
49 hmm_output_file = tabular_file + ".hmm.tmp" | 53 hmm_output_file = tabular_file + ".hmm.tmp" |
50 signalp_input_file = tabular_file + ".fasta.tmp" | 54 signalp_input_file = tabular_file + ".fasta.tmp" |
51 signalp_output_file = tabular_file + ".tabular.tmp" | 55 signalp_output_file = tabular_file + ".tabular.tmp" |
52 min_signalp_hmm = 0.9 | 56 min_signalp_hmm = 0.9 |
53 hmmer_search = "hmmsearch2" | 57 hmmer_search = "hmmsearch2" |
54 | 58 |
55 if model == "Bhattacharjee2006": | 59 if model == "Bhattacharjee2006": |
56 signalp_trunc = 70 | 60 signalp_trunc = 70 |
57 re_rxlr = re.compile("R.LR") | 61 re_rxlr = re.compile("R.LR") |
58 min_sp = 10 | 62 min_sp = 10 |
59 max_sp = 40 | 63 max_sp = 40 |
60 max_sp_rxlr = 100 | 64 max_sp_rxlr = 100 |
61 min_rxlr_start = 1 | 65 min_rxlr_start = 1 |
62 #Allow signal peptide to be at most 40aa, and want RXLR to be | 66 # Allow signal peptide to be at most 40aa, and want RXLR to be |
63 #within 100aa, therefore for the prescreen the max start is 140: | 67 # within 100aa, therefore for the prescreen the max start is 140: |
64 max_rxlr_start = max_sp + max_sp_rxlr | 68 max_rxlr_start = max_sp + max_sp_rxlr |
65 elif model == "Win2007": | 69 elif model == "Win2007": |
66 signalp_trunc = 70 | 70 signalp_trunc = 70 |
67 re_rxlr = re.compile("R.LR") | 71 re_rxlr = re.compile("R.LR") |
68 min_sp = 10 | 72 min_sp = 10 |
69 max_sp = 40 | 73 max_sp = 40 |
70 min_rxlr_start = 30 | 74 min_rxlr_start = 30 |
71 max_rxlr_start = 60 | 75 max_rxlr_start = 60 |
72 #No explicit limit on separation of signal peptide clevage | 76 # No explicit limit on separation of signal peptide clevage |
73 #and RXLR, but shortest signal peptide is 10, and furthest | 77 # and RXLR, but shortest signal peptide is 10, and furthest |
74 #away RXLR is 60, so effectively limit is 50. | 78 # away RXLR is 60, so effectively limit is 50. |
75 max_sp_rxlr = max_rxlr_start - min_sp + 1 | 79 max_sp_rxlr = max_rxlr_start - min_sp + 1 |
76 elif model == "Whisson2007": | 80 elif model == "Whisson2007": |
77 signalp_trunc = 0 #zero for no truncation | 81 signalp_trunc = 0 # zero for no truncation |
78 re_rxlr = re.compile("R.LR.{,40}[ED][ED][KR]") | 82 re_rxlr = re.compile("R.LR.{,40}[ED][ED][KR]") |
79 min_sp = 10 | 83 min_sp = 10 |
80 max_sp = 40 | 84 max_sp = 40 |
81 max_sp_rxlr = 100 | 85 max_sp_rxlr = 100 |
82 min_rxlr_start = 1 | 86 min_rxlr_start = 1 |
83 max_rxlr_start = max_sp + max_sp_rxlr | 87 max_rxlr_start = max_sp + max_sp_rxlr |
84 else: | 88 else: |
85 stop_err("Did not recognise the model name %r\n" | 89 sys_exit("Did not recognise the model name %r\n" |
86 "Use Bhattacharjee2006, Win2007, or Whisson2007" % model) | 90 "Use Bhattacharjee2006, Win2007, or Whisson2007" % model) |
87 | 91 |
88 | 92 |
89 def get_hmmer_version(exe, required=None): | 93 def get_hmmer_version(exe, required=None): |
90 cmd = "%s -h" % exe | 94 cmd = "%s -h" % exe |
106 #Run hmmsearch for Whisson et al. (2007) | 110 #Run hmmsearch for Whisson et al. (2007) |
107 if model == "Whisson2007": | 111 if model == "Whisson2007": |
108 hmm_file = os.path.join(os.path.split(sys.argv[0])[0], | 112 hmm_file = os.path.join(os.path.split(sys.argv[0])[0], |
109 "whisson_et_al_rxlr_eer_cropped.hmm") | 113 "whisson_et_al_rxlr_eer_cropped.hmm") |
110 if not os.path.isfile(hmm_file): | 114 if not os.path.isfile(hmm_file): |
111 stop_err("Missing HMM file for Whisson et al. (2007)") | 115 sys_exit("Missing HMM file for Whisson et al. (2007)") |
112 if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"): | 116 if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"): |
113 stop_err("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_searcher) | 117 sys_exit("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_search) |
114 | 118 |
115 hmm_hits = set() | 119 hmm_hits = set() |
116 valid_ids = set() | 120 valid_ids = set() |
117 for title, seq in fasta_iterator(fasta_file): | 121 for title, seq in fasta_iterator(fasta_file): |
118 name = title.split(None,1)[0] | 122 name = title.split(None,1)[0] |
119 if name in valid_ids: | 123 if name in valid_ids: |
120 stop_err("Duplicated identifier %r" % name) | 124 sys_exit("Duplicated identifier %r" % name) |
121 else: | 125 else: |
122 valid_ids.add(name) | 126 valid_ids.add(name) |
123 if not valid_ids: | 127 if not valid_ids: |
124 #Special case, don't need to run HMMER if there are no sequences | 128 # Special case, don't need to run HMMER if there are no sequences |
125 pass | 129 pass |
126 else: | 130 else: |
127 #I've left the code to handle HMMER 3 in situ, in case | 131 # I've left the code to handle HMMER 3 in situ, in case |
128 #we revisit the choice to insist on HMMER 2. | 132 # we revisit the choice to insist on HMMER 2. |
129 hmmer3 = (3 == get_hmmer_version(hmmer_search)) | 133 hmmer3 = (3 == get_hmmer_version(hmmer_search)) |
130 #Using zero (or 5.6?) for bitscore threshold | 134 # Using zero (or 5.6?) for bitscore threshold |
131 if hmmer3: | 135 if hmmer3: |
132 #The HMMER3 table output is easy to parse | 136 # The HMMER3 table output is easy to parse |
133 #In HMMER3 can't use both -T and -E | 137 # In HMMER3 can't use both -T and -E |
134 cmd = "%s -T 0 --tblout %s --noali %s %s > /dev/null" \ | 138 cmd = "%s -T 0 --tblout %s --noali %s %s > /dev/null" \ |
135 % (hmmer_search, hmm_output_file, hmm_file, fasta_file) | 139 % (hmmer_search, hmm_output_file, hmm_file, fasta_file) |
136 else: | 140 else: |
137 #For HMMER2 we are stuck with parsing stdout | 141 # For HMMER2 we are stuck with parsing stdout |
138 #Put 1e6 to effectively have no expectation threshold (otherwise | 142 # Put 1e6 to effectively have no expectation threshold (otherwise |
139 #HMMER defaults to 10 and the calculated e-value depends on the | 143 # HMMER defaults to 10 and the calculated e-value depends on the |
140 #input FASTA file, and we can loose hits of interest). | 144 # input FASTA file, and we can loose hits of interest). |
141 cmd = "%s -T 0 -E 1e6 %s %s > %s" \ | 145 cmd = "%s -T 0 -E 1e6 %s %s > %s" \ |
142 % (hmmer_search, hmm_file, fasta_file, hmm_output_file) | 146 % (hmmer_search, hmm_file, fasta_file, hmm_output_file) |
143 return_code = os.system(cmd) | 147 return_code = os.system(cmd) |
144 if return_code: | 148 if return_code: |
145 stop_err("Error %i from hmmsearch:\n%s" % (return_code, cmd), return_code) | 149 sys_exit("Error %i from hmmsearch:\n%s" % (return_code, cmd), return_code) |
146 | 150 |
147 handle = open(hmm_output_file) | 151 handle = open(hmm_output_file) |
148 for line in handle: | 152 for line in handle: |
149 if not line.strip(): | 153 if not line.strip(): |
150 #We expect blank lines in the HMMER2 stdout | 154 # We expect blank lines in the HMMER2 stdout |
151 continue | 155 continue |
152 elif line.startswith("#"): | 156 elif line.startswith("#"): |
153 #Header | 157 # Header |
154 continue | 158 continue |
155 else: | 159 else: |
156 name = line.split(None,1)[0] | 160 name = line.split(None,1)[0] |
157 #Should be a sequence name in the HMMER3 table output. | 161 #Should be a sequence name in the HMMER3 table output. |
158 #Could be anything in the HMMER2 stdout. | 162 #Could be anything in the HMMER2 stdout. |
159 if name in valid_ids: | 163 if name in valid_ids: |
160 hmm_hits.add(name) | 164 hmm_hits.add(name) |
161 elif hmmer3: | 165 elif hmmer3: |
162 stop_err("Unexpected identifer %r in hmmsearch output" % name) | 166 sys_exit("Unexpected identifer %r in hmmsearch output" % name) |
163 handle.close() | 167 handle.close() |
164 #if hmmer3: | 168 # if hmmer3: |
165 # print "HMMER3 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) | 169 # print "HMMER3 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) |
166 #else: | 170 # else: |
167 # print "HMMER2 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) | 171 # print "HMMER2 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) |
168 #print "%i/%i matched HMM" % (len(hmm_hits), len(valid_ids)) | 172 # print "%i/%i matched HMM" % (len(hmm_hits), len(valid_ids)) |
169 os.remove(hmm_output_file) | 173 os.remove(hmm_output_file) |
170 del valid_ids | 174 del valid_ids |
171 | 175 |
172 | 176 |
173 #Prepare short list of candidates containing RXLR to pass to SignalP | 177 # Prepare short list of candidates containing RXLR to pass to SignalP |
174 assert min_rxlr_start > 0, "Min value one, since zero based counting" | 178 assert min_rxlr_start > 0, "Min value one, since zero based counting" |
175 count = 0 | 179 count = 0 |
176 total = 0 | 180 total = 0 |
177 handle = open(signalp_input_file, "w") | 181 handle = open(signalp_input_file, "w") |
178 for title, seq in fasta_iterator(fasta_file): | 182 for title, seq in fasta_iterator(fasta_file): |
179 total += 1 | 183 total += 1 |
180 name = title.split(None,1)[0] | 184 name = title.split(None,1)[0] |
181 match = re_rxlr.search(seq[min_rxlr_start-1:].upper()) | 185 match = re_rxlr.search(seq[min_rxlr_start-1:].upper()) |
182 if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start: | 186 if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start: |
183 #This is a potential RXLR, depending on the SignalP results. | 187 # This is a potential RXLR, depending on the SignalP results. |
184 #Might as well truncate the sequence now, makes the temp file smaller | 188 # Might as well truncate the sequence now, makes the temp file smaller |
185 if signalp_trunc: | 189 if signalp_trunc: |
186 handle.write(">%s (truncated)\n%s\n" % (name, seq[:signalp_trunc])) | 190 handle.write(">%s (truncated)\n%s\n" % (name, seq[:signalp_trunc])) |
187 else: | 191 else: |
188 #Does it matter we don't line wrap? | 192 # Does it matter we don't line wrap? |
189 handle.write(">%s\n%s\n" % (name, seq)) | 193 handle.write(">%s\n%s\n" % (name, seq)) |
190 count += 1 | 194 count += 1 |
191 handle.close() | 195 handle.close() |
192 #print "Running SignalP on %i/%i potentials." % (count, total) | 196 # print "Running SignalP on %i/%i potentials." % (count, total) |
193 | 197 |
194 | 198 |
195 #Run SignalP (using our wrapper script to get multi-core support etc) | 199 # Run SignalP (using our wrapper script to get multi-core support etc) |
196 signalp_script = os.path.join(os.path.split(sys.argv[0])[0], "signalp3.py") | 200 signalp_script = os.path.join(os.path.split(sys.argv[0])[0], "signalp3.py") |
197 if not os.path.isfile(signalp_script): | 201 if not os.path.isfile(signalp_script): |
198 stop_err("Error - missing signalp3.py script") | 202 sys_exit("Error - missing signalp3.py script") |
199 cmd = "python %s euk %i %s %s %s" % (signalp_script, signalp_trunc, threads, signalp_input_file, signalp_output_file) | 203 cmd = "python %s euk %i %s %s %s" % (signalp_script, signalp_trunc, threads, signalp_input_file, signalp_output_file) |
200 return_code = os.system(cmd) | 204 return_code = os.system(cmd) |
201 if return_code: | 205 if return_code: |
202 stop_err("Error %i from SignalP:\n%s" % (return_code, cmd)) | 206 sys_exit("Error %i from SignalP:\n%s" % (return_code, cmd)) |
203 #print "SignalP done" | 207 # print "SignalP done" |
208 | |
204 | 209 |
205 def parse_signalp(filename): | 210 def parse_signalp(filename): |
206 """Parse SignalP output, yield tuples of ID, HMM_Sprob_score and NN predicted signal peptide length. | 211 """Parse SignalP output, yield tuples of ID, HMM_Sprob_score and NN predicted signal peptide length. |
207 | 212 |
208 For signal peptide length we use NN_Ymax_pos (minus one). | 213 For signal peptide length we use NN_Ymax_pos (minus one). |
215 assert len(parts)==20, repr(line) | 220 assert len(parts)==20, repr(line) |
216 yield parts[0], float(parts[18]), int(parts[5])-1 | 221 yield parts[0], float(parts[18]), int(parts[5])-1 |
217 handle.close() | 222 handle.close() |
218 | 223 |
219 | 224 |
220 #Parse SignalP results and apply the strict RXLR criteria | 225 # Parse SignalP results and apply the strict RXLR criteria |
221 total = 0 | 226 total = 0 |
222 tally = dict() | 227 tally = dict() |
223 handle = open(tabular_file, "w") | 228 handle = open(tabular_file, "w") |
224 handle.write("#ID\t%s\n" % model) | 229 handle.write("#ID\t%s\n" % model) |
225 signalp_results = parse_signalp(signalp_output_file) | 230 signalp_results = parse_signalp(signalp_output_file) |
228 rxlr = "N" | 233 rxlr = "N" |
229 name = title.split(None,1)[0] | 234 name = title.split(None,1)[0] |
230 match = re_rxlr.search(seq[min_rxlr_start-1:].upper()) | 235 match = re_rxlr.search(seq[min_rxlr_start-1:].upper()) |
231 if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start: | 236 if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start: |
232 del match | 237 del match |
233 #This was the criteria for calling SignalP, | 238 # This was the criteria for calling SignalP, |
234 #so it will be in the SignalP results. | 239 #so it will be in the SignalP results. |
235 sp_id, sp_hmm_score, sp_nn_len = signalp_results.next() | 240 sp_id, sp_hmm_score, sp_nn_len = signalp_results.next() |
236 assert name == sp_id, "%s vs %s" % (name, sp_id) | 241 assert name == sp_id, "%s vs %s" % (name, sp_id) |
237 if sp_hmm_score >= min_signalp_hmm and min_sp <= sp_nn_len <= max_sp: | 242 if sp_hmm_score >= min_signalp_hmm and min_sp <= sp_nn_len <= max_sp: |
238 match = re_rxlr.search(seq[sp_nn_len:].upper()) | 243 match = re_rxlr.search(seq[sp_nn_len:].upper()) |
239 if match and match.start() + 1 <= max_sp_rxlr: #1-based counting | 244 if match and match.start() + 1 <= max_sp_rxlr: # 1-based counting |
240 rxlr_start = sp_nn_len + match.start() + 1 | 245 rxlr_start = sp_nn_len + match.start() + 1 |
241 if min_rxlr_start <= rxlr_start <= max_rxlr_start: | 246 if min_rxlr_start <= rxlr_start <= max_rxlr_start: |
242 rxlr = "Y" | 247 rxlr = "Y" |
243 if model == "Whisson2007": | 248 if model == "Whisson2007": |
244 #Combine the signalp with regular expression heuristic and the HMM | 249 # Combine the signalp with regular expression heuristic and the HMM |
245 if name in hmm_hits and rxlr == "N": | 250 if name in hmm_hits and rxlr == "N": |
246 rxlr = "hmm" #HMM only | 251 rxlr = "hmm" # HMM only |
247 elif rxlr == "N": | 252 elif rxlr == "N": |
248 rxlr = "neither" #Don't use N (no) | 253 rxlr = "neither" # Don't use N (no) |
249 elif name not in hmm_hits and rxlr == "Y": | 254 elif name not in hmm_hits and rxlr == "Y": |
250 rxlr = "re" #Heuristic only | 255 rxlr = "re" # Heuristic only |
251 #Now have a four way classifier: Y, hmm, re, neither | 256 # Now have a four way classifier: Y, hmm, re, neither |
252 #and count is the number of Y results (both HMM and heuristic) | 257 # and count is the number of Y results (both HMM and heuristic) |
253 handle.write("%s\t%s\n" % (name, rxlr)) | 258 handle.write("%s\t%s\n" % (name, rxlr)) |
254 try: | 259 try: |
255 tally[rxlr] += 1 | 260 tally[rxlr] += 1 |
256 except KeyError: | 261 except KeyError: |
257 tally[rxlr] = 1 | 262 tally[rxlr] = 1 |
258 handle.close() | 263 handle.close() |
259 assert sum(tally.values()) == total | 264 assert sum(tally.values()) == total |
260 | 265 |
261 #Check the iterator is finished | 266 # Check the iterator is finished |
262 try: | 267 try: |
263 signalp_results.next() | 268 signalp_results.next() |
264 assert False, "Unexpected data in SignalP output" | 269 assert False, "Unexpected data in SignalP output" |
265 except StopIteration: | 270 except StopIteration: |
266 pass | 271 pass |
267 | 272 |
268 #Cleanup | 273 # Cleanup |
269 os.remove(signalp_input_file) | 274 os.remove(signalp_input_file) |
270 os.remove(signalp_output_file) | 275 os.remove(signalp_output_file) |
271 | 276 |
272 #Short summary to stdout for Galaxy's info display | 277 # Short summary to stdout for Galaxy's info display |
273 print "%s for %i sequences:" % (model, total) | 278 print "%s for %i sequences:" % (model, total) |
274 print ", ".join("%s = %i" % kv for kv in sorted(tally.iteritems())) | 279 print ", ".join("%s = %i" % kv for kv in sorted(tally.iteritems())) |