Mercurial > repos > peterjc > tmhmm_and_signalp
annotate tools/protein_analysis/psortb.py @ 34:7a2e20baacee draft default tip
"v0.2.13 - Python 3 fix for raising StopIteration"
author | peterjc |
---|---|
date | Thu, 17 Jun 2021 17:58:23 +0000 |
parents | 20da7f48b56f |
children |
rev | line source |
---|---|
8 | 1 #!/usr/bin/env python |
2 """Wrapper for psortb for use in Galaxy. | |
3 | |
4 This script takes exactly six command line arguments - which includes the | |
5 number of threads, and the input protein FASTA filename and output | |
6 tabular filename. It then splits up the FASTA input and calls multiple | |
7 copies of the standalone psortb v3 program, then collates the output. | |
8 e.g. Rather than this, | |
9 | |
10 psort $type -c $cutoff -d $divergent -o long $sequence > $outfile | |
11 | |
12 Call this: | |
13 | |
14 psort $threads $type $cutoff $divergent $sequence $outfile | |
15 | |
16 If ommitting -c or -d options, set $cutoff and $divergent to zero or blank. | |
17 | |
18 Note that this is somewhat redundant with job-splitting available in Galaxy | |
19 itself (see the SignalP XML file for settings), but both can be applied. | |
20 | |
21 Additionally it ensures the header line (with the column names) starts | |
22 with a # character as used elsewhere in Galaxy. | |
23 """ | |
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
24 |
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
25 from __future__ import print_function |
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
26 |
8 | 27 import os |
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
28 import sys |
8 | 29 import tempfile |
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
30 |
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
31 from seq_analysis_utils import run_jobs, split_fasta, thread_count |
8 | 32 |
33 FASTA_CHUNK = 500 | |
34 | |
35 if "-v" in sys.argv or "--version" in sys.argv: | |
36 """Return underlying PSORTb's version""" | |
37 sys.exit(os.system("psort --version")) | |
38 | |
39 if len(sys.argv) != 8: | |
32 | 40 sys.exit( |
41 "Require 7 arguments, number of threads (int), type (e.g. archaea), " | |
42 "output (e.g. terse/normal/long), cutoff, divergent, input protein " | |
43 "FASTA file & output tabular file" | |
44 ) | |
8 | 45 |
46 num_threads = thread_count(sys.argv[1], default=4) | |
47 org_type = sys.argv[2] | |
48 out_type = sys.argv[3] | |
49 cutoff = sys.argv[4] | |
50 if cutoff.strip() and float(cutoff.strip()) != 0.0: | |
51 cutoff = "-c %s" % cutoff | |
52 else: | |
53 cutoff = "" | |
54 divergent = sys.argv[5] | |
55 if divergent.strip() and float(divergent.strip()) != 0.0: | |
56 divergent = "-d %s" % divergent | |
57 else: | |
58 divergent = "" | |
59 fasta_file = sys.argv[6] | |
60 tabular_file = sys.argv[7] | |
61 | |
62 if out_type == "terse": | |
32 | 63 header = ["SeqID", "Localization", "Score"] |
8 | 64 elif out_type == "normal": |
29 | 65 sys.exit("Normal output not implemented yet, sorry.") |
8 | 66 elif out_type == "long": |
67 if org_type == "-n": | |
29 | 68 # Gram negative bacteria |
32 | 69 header = [ |
70 "SeqID", | |
71 "CMSVM-_Localization", | |
72 "CMSVM-_Details", | |
73 "CytoSVM-_Localization", | |
74 "CytoSVM-_Details", | |
75 "ECSVM-_Localization", | |
76 "ECSVM-_Details", | |
77 "ModHMM-_Localization", | |
78 "ModHMM-_Details", | |
79 "Motif-_Localization", | |
80 "Motif-_Details", | |
81 "OMPMotif-_Localization", | |
82 "OMPMotif-_Details", | |
83 "OMSVM-_Localization", | |
84 "OMSVM-_Details", | |
85 "PPSVM-_Localization", | |
86 "PPSVM-_Details", | |
87 "Profile-_Localization", | |
88 "Profile-_Details", | |
89 "SCL-BLAST-_Localization", | |
90 "SCL-BLAST-_Details", | |
91 "SCL-BLASTe-_Localization", | |
92 "SCL-BLASTe-_Details", | |
93 "Signal-_Localization", | |
94 "Signal-_Details", | |
95 "Cytoplasmic_Score", | |
96 "CytoplasmicMembrane_Score", | |
97 "Periplasmic_Score", | |
98 "OuterMembrane_Score", | |
99 "Extracellular_Score", | |
100 "Final_Localization", | |
101 "Final_Localization_Details", | |
102 "Final_Score", | |
103 "Secondary_Localization", | |
104 "PSortb_Version", | |
105 ] | |
8 | 106 elif org_type == "-p": |
29 | 107 # Gram positive bacteria |
32 | 108 header = [ |
109 "SeqID", | |
110 "CMSVM+_Localization", | |
111 "CMSVM+_Details", | |
112 "CWSVM+_Localization", | |
113 "CWSVM+_Details", | |
114 "CytoSVM+_Localization", | |
115 "CytoSVM+_Details", | |
116 "ECSVM+_Localization", | |
117 "ECSVM+_Details", | |
118 "ModHMM+_Localization", | |
119 "ModHMM+_Details", | |
120 "Motif+_Localization", | |
121 "Motif+_Details", | |
122 "Profile+_Localization", | |
123 "Profile+_Details", | |
124 "SCL-BLAST+_Localization", | |
125 "SCL-BLAST+_Details", | |
126 "SCL-BLASTe+_Localization", | |
127 "SCL-BLASTe+_Details", | |
128 "Signal+_Localization", | |
129 "Signal+_Details", | |
130 "Cytoplasmic_Score", | |
131 "CytoplasmicMembrane_Score", | |
132 "Cellwall_Score", | |
133 "Extracellular_Score", | |
134 "Final_Localization", | |
135 "Final_Localization_Details", | |
136 "Final_Score", | |
137 "Secondary_Localization", | |
138 "PSortb_Version", | |
139 ] | |
8 | 140 elif org_type == "-a": |
29 | 141 # Archaea |
32 | 142 header = [ |
143 "SeqID", | |
144 "CMSVM_a_Localization", | |
145 "CMSVM_a_Details", | |
146 "CWSVM_a_Localization", | |
147 "CWSVM_a_Details", | |
148 "CytoSVM_a_Localization", | |
149 "CytoSVM_a_Details", | |
150 "ECSVM_a_Localization", | |
151 "ECSVM_a_Details", | |
152 "ModHMM_a_Localization", | |
153 "ModHMM_a_Details", | |
154 "Motif_a_Localization", | |
155 "Motif_a_Details", | |
156 "Profile_a_Localization", | |
157 "Profile_a_Details", | |
158 "SCL-BLAST_a_Localization", | |
159 "SCL-BLAST_a_Details", | |
160 "SCL-BLASTe_a_Localization", | |
161 "SCL-BLASTe_a_Details", | |
162 "Signal_a_Localization", | |
163 "Signal_a_Details", | |
164 "Cytoplasmic_Score", | |
165 "CytoplasmicMembrane_Score", | |
166 "Cellwall_Score", | |
167 "Extracellular_Score", | |
168 "Final_Localization", | |
169 "Final_Localization_Details", | |
170 "Final_Score", | |
171 "Secondary_Localization", | |
172 "PSortb_Version", | |
173 ] | |
8 | 174 else: |
29 | 175 sys.exit("Expected -n, -p or -a for the organism type, not %r" % org_type) |
8 | 176 else: |
29 | 177 sys.exit("Expected terse, normal or long for the output type, not %r" % out_type) |
8 | 178 |
179 tmp_dir = tempfile.mkdtemp() | |
180 | |
29 | 181 |
8 | 182 def clean_tabular(raw_handle, out_handle): |
183 """Clean up tabular TMHMM output, returns output line count.""" | |
184 global header | |
185 count = 0 | |
186 for line in raw_handle: | |
187 if not line.strip() or line.startswith("#"): | |
29 | 188 # Ignore any blank lines or comment lines |
8 | 189 continue |
190 parts = [x.strip() for x in line.rstrip("\r\n").split("\t")] | |
191 if parts == header: | |
29 | 192 # Ignore the header line |
8 | 193 continue |
194 if not parts[-1] and len(parts) == len(header) + 1: | |
29 | 195 # Ignore dummy blank extra column, e.g. |
196 # "...2.0\t\tPSORTb version 3.0\t\n" | |
8 | 197 parts = parts[:-1] |
32 | 198 assert len(parts) == len(header), "%i fields, not %i, in line:\n%r" % ( |
199 len(line), | |
200 len(header), | |
201 line, | |
202 ) | |
8 | 203 out_handle.write(line) |
204 count += 1 | |
205 return count | |
206 | |
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
207 |
29 | 208 # Note that if the input FASTA file contains no sequences, |
209 # split_fasta returns an empty list (i.e. zero temp files). | |
8 | 210 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK) |
29 | 211 temp_files = [f + ".out" for f in fasta_files] |
32 | 212 jobs = [ |
213 "psort %s %s %s -o %s %s > %s" | |
214 % (org_type, cutoff, divergent, out_type, fasta, temp) | |
215 for fasta, temp in zip(fasta_files, temp_files) | |
216 ] | |
8 | 217 |
29 | 218 |
8 | 219 def clean_up(file_list): |
220 for f in file_list: | |
221 if os.path.isfile(f): | |
222 os.remove(f) | |
223 try: | |
224 os.rmdir(tmp_dir) | |
29 | 225 except Exception: |
8 | 226 pass |
227 | |
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
228 |
8 | 229 if len(jobs) > 1 and num_threads > 1: |
29 | 230 # A small "info" message for Galaxy to show the user. |
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
231 print("Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))) |
8 | 232 results = run_jobs(jobs, num_threads) |
233 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): | |
234 error_level = results[cmd] | |
235 if error_level: | |
236 try: | |
237 output = open(temp).readline() | |
238 except IOError: | |
239 output = "" | |
240 clean_up(fasta_files + temp_files) | |
32 | 241 sys.exit( |
242 "One or more tasks failed, e.g. %i from %r gave:\n%s" | |
243 % (error_level, cmd, output), | |
244 error_level, | |
245 ) | |
8 | 246 del results |
247 del jobs | |
248 | |
249 out_handle = open(tabular_file, "w") | |
250 out_handle.write("#%s\n" % "\t".join(header)) | |
11 | 251 count = 0 |
8 | 252 for temp in temp_files: |
253 data_handle = open(temp) | |
11 | 254 count += clean_tabular(data_handle, out_handle) |
8 | 255 data_handle.close() |
256 if not count: | |
257 clean_up(fasta_files + temp_files) | |
29 | 258 sys.exit("No output from psortb") |
8 | 259 out_handle.close() |
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
260 print("%i records" % count) |
8 | 261 |
262 clean_up(fasta_files + temp_files) |