Mercurial > repos > peterjc > tmhmm_and_signalp
annotate tools/protein_analysis/promoter2.py @ 34:7a2e20baacee draft default tip
"v0.2.13 - Python 3 fix for raising StopIteration"
author | peterjc |
---|---|
date | Thu, 17 Jun 2021 17:58:23 +0000 |
parents | 20da7f48b56f |
children |
rev | line source |
---|---|
7 | 1 #!/usr/bin/env python |
2 """Wrapper for Promoter 2.0 for use in Galaxy. | |
3 | |
4 This script takes exactly three command line arguments: | |
5 * number of threads | |
6 * an input DNA FASTA filename | |
7 * output tabular filename. | |
8 | |
9 It calls the Promoter 2.0 binary (e.g. .../promoter-2.0/bin/promoter_Linux, | |
10 bypassing the Perl wrapper script 'promoter' which imposes a significant | |
11 performace overhead for no benefit here (we don't need HTML output for | |
12 example). | |
13 | |
14 The main feature is this Python wrapper script parsers the bespoke | |
15 tabular output from Promoter 2.0 and reformats it into a Galaxy friendly | |
16 tab separated table. | |
17 | |
18 Additionally, in order to take advantage of multiple cores the input FASTA | |
19 file is broken into chunks and multiple copies of promoter run at once. | |
20 This can be used in combination with the job-splitting available in Galaxy. | |
21 Note that rewriting the FASTA input file allows us to avoid a bug in | |
22 promoter 2 with long descriptions in the FASTA header line (over 200 | |
23 characters) which produces stray fragements of the description in the | |
24 output file, making parsing non-trivial. | |
25 | |
26 TODO - Automatically extract the sequence containing a promoter prediction? | |
27 """ | |
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
28 |
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
29 from __future__ import print_function |
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
30 |
7 | 31 import commands |
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
32 import os |
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
33 import sys |
7 | 34 import tempfile |
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
35 |
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
36 from seq_analysis_utils import run_jobs, split_fasta, thread_count |
7 | 37 |
38 FASTA_CHUNK = 500 | |
39 | |
28 | 40 if "-v" in sys.argv or "--version" in sys.argv: |
41 sys.exit(os.system("promoter -V")) | |
42 | |
7 | 43 if len(sys.argv) != 4: |
32 | 44 sys.exit( |
45 "Require three arguments, number of threads (int), input DNA FASTA " | |
46 "file & output tabular file. Got %i arguments." % (len(sys.argv) - 1) | |
47 ) | |
7 | 48 |
29 | 49 num_threads = thread_count(sys.argv[3], default=4) |
7 | 50 fasta_file = os.path.abspath(sys.argv[2]) |
51 tabular_file = os.path.abspath(sys.argv[3]) | |
52 | |
53 tmp_dir = tempfile.mkdtemp() | |
54 | |
29 | 55 |
7 | 56 def get_path_and_binary(): |
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
57 """Determine path and binary names for promoter tool.""" |
29 | 58 platform = commands.getoutput("uname") # e.g. Linux |
7 | 59 shell_script = commands.getoutput("which promoter") |
60 if not os.path.isfile(shell_script): | |
29 | 61 sys.exit("ERROR: Missing promoter executable shell script") |
7 | 62 path = None |
63 for line in open(shell_script): | |
29 | 64 if line.startswith("setenv"): # could then be tab or space! |
7 | 65 parts = line.rstrip().split(None, 2) |
66 if parts[0] == "setenv" and parts[1] == "PROM": | |
67 path = parts[2] | |
68 if not path: | |
29 | 69 sys.exit("ERROR: Could not find promoter path (PROM) in %r" % shell_script) |
7 | 70 if not os.path.isdir(path): |
29 | 71 sys.exit("ERROR: %r is not a directory" % path) |
7 | 72 bin = "%s/bin/promoter_%s" % (path, platform) |
73 if not os.path.isfile(bin): | |
29 | 74 sys.exit("ERROR: Missing promoter binary %r" % bin) |
7 | 75 return path, bin |
76 | |
29 | 77 |
7 | 78 def make_tabular(raw_handle, out_handle): |
79 """Parse text output into tabular, return query count.""" | |
80 identifier = None | |
81 queries = 0 | |
82 for line in raw_handle: | |
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
83 # print(repr(line)) |
7 | 84 if not line.strip() or line == "Promoter prediction:\n": |
85 pass | |
86 elif line[0] != " ": | |
29 | 87 identifier = line.strip().replace("\t", " ").split(None, 1)[0] |
7 | 88 queries += 1 |
89 elif line == " No promoter predicted\n": | |
29 | 90 # End of a record |
7 | 91 identifier = None |
92 elif line == " Position Score Likelihood\n": | |
93 assert identifier | |
94 else: | |
95 try: | |
29 | 96 position, score, likelihood = line.strip().split(None, 2) |
7 | 97 except ValueError: |
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
98 print("WARNING: Problem with line: %r" % line) |
7 | 99 continue |
29 | 100 # sys.exit("ERROR: Problem with line: %r" % line) |
32 | 101 if likelihood not in [ |
102 "ignored", | |
103 "Marginal prediction", | |
104 "Medium likely prediction", | |
105 "Highly likely prediction", | |
106 ]: | |
29 | 107 sys.exit("ERROR: Problem with line: %r" % line) |
32 | 108 out_handle.write( |
109 "%s\t%s\t%s\t%s\n" % (identifier, position, score, likelihood) | |
110 ) | |
7 | 111 return queries |
29 | 112 |
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
113 |
7 | 114 working_dir, bin = get_path_and_binary() |
115 | |
116 if not os.path.isfile(fasta_file): | |
29 | 117 sys.exit("ERROR: Missing input FASTA file %r" % fasta_file) |
7 | 118 |
29 | 119 # Note that if the input FASTA file contains no sequences, |
120 # split_fasta returns an empty list (i.e. zero temp files). | |
121 # We deliberately omit the FASTA descriptions to avoid a | |
122 # bug in promoter2 with descriptions over 200 characters. | |
32 | 123 fasta_files = split_fasta( |
124 fasta_file, os.path.join(tmp_dir, "promoter"), FASTA_CHUNK, keep_descr=False | |
125 ) | |
29 | 126 temp_files = [f + ".out" for f in fasta_files] |
32 | 127 jobs = [ |
128 "%s %s > %s" % (bin, fasta, temp) for fasta, temp in zip(fasta_files, temp_files) | |
129 ] | |
7 | 130 |
29 | 131 |
7 | 132 def clean_up(file_list): |
133 for f in file_list: | |
134 if os.path.isfile(f): | |
135 os.remove(f) | |
136 try: | |
137 os.rmdir(tmp_dir) | |
29 | 138 except Exception: |
7 | 139 pass |
140 | |
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
141 |
7 | 142 if len(jobs) > 1 and num_threads > 1: |
29 | 143 # A small "info" message for Galaxy to show the user. |
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
144 print("Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))) |
7 | 145 cur_dir = os.path.abspath(os.curdir) |
146 os.chdir(working_dir) | |
147 results = run_jobs(jobs, num_threads) | |
148 os.chdir(cur_dir) | |
149 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): | |
150 error_level = results[cmd] | |
151 if error_level: | |
152 try: | |
153 output = open(temp).readline() | |
154 except IOError: | |
155 output = "" | |
156 clean_up(fasta_files + temp_files) | |
32 | 157 sys.exit( |
158 "One or more tasks failed, e.g. %i from %r gave:\n%s" | |
159 % (error_level, cmd, output), | |
160 error_level, | |
161 ) | |
7 | 162 |
163 del results | |
164 del jobs | |
165 | |
166 out_handle = open(tabular_file, "w") | |
167 out_handle.write("#Identifier\tPosition\tScore\tLikelihood\n") | |
168 queries = 0 | |
169 for temp in temp_files: | |
170 data_handle = open(temp) | |
171 count = make_tabular(data_handle, out_handle) | |
172 data_handle.close() | |
173 if not count: | |
174 clean_up(fasta_files + temp_files) | |
29 | 175 sys.exit("No output from promoter2") |
7 | 176 queries += count |
177 out_handle.close() | |
178 | |
179 clean_up(fasta_files + temp_files) | |
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
180 print("Results for %i queries" % queries) |