Mercurial > repos > peterjc > tmhmm_and_signalp
annotate tools/protein_analysis/promoter2.py @ 30:6d9d7cdf00fc draft
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
author | peterjc |
---|---|
date | Thu, 21 Sep 2017 11:15:55 -0400 |
parents | 3cb02adf4326 |
children | 20da7f48b56f |
rev | line source |
---|---|
7 | 1 #!/usr/bin/env python |
2 """Wrapper for Promoter 2.0 for use in Galaxy. | |
3 | |
4 This script takes exactly three command line arguments: | |
5 * number of threads | |
6 * an input DNA FASTA filename | |
7 * output tabular filename. | |
8 | |
9 It calls the Promoter 2.0 binary (e.g. .../promoter-2.0/bin/promoter_Linux, | |
10 bypassing the Perl wrapper script 'promoter' which imposes a significant | |
11 performace overhead for no benefit here (we don't need HTML output for | |
12 example). | |
13 | |
14 The main feature is this Python wrapper script parsers the bespoke | |
15 tabular output from Promoter 2.0 and reformats it into a Galaxy friendly | |
16 tab separated table. | |
17 | |
18 Additionally, in order to take advantage of multiple cores the input FASTA | |
19 file is broken into chunks and multiple copies of promoter run at once. | |
20 This can be used in combination with the job-splitting available in Galaxy. | |
21 Note that rewriting the FASTA input file allows us to avoid a bug in | |
22 promoter 2 with long descriptions in the FASTA header line (over 200 | |
23 characters) which produces stray fragements of the description in the | |
24 output file, making parsing non-trivial. | |
25 | |
26 TODO - Automatically extract the sequence containing a promoter prediction? | |
27 """ | |
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
28 |
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
29 from __future__ import print_function |
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
30 |
7 | 31 import commands |
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
32 import os |
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
33 import sys |
7 | 34 import tempfile |
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
35 |
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
36 from seq_analysis_utils import run_jobs, split_fasta, thread_count |
7 | 37 |
38 FASTA_CHUNK = 500 | |
39 | |
28 | 40 if "-v" in sys.argv or "--version" in sys.argv: |
41 sys.exit(os.system("promoter -V")) | |
42 | |
7 | 43 if len(sys.argv) != 4: |
29 | 44 sys.exit("Require three arguments, number of threads (int), input DNA FASTA file & output tabular file. " |
45 "Got %i arguments." % (len(sys.argv) - 1)) | |
7 | 46 |
29 | 47 num_threads = thread_count(sys.argv[3], default=4) |
7 | 48 fasta_file = os.path.abspath(sys.argv[2]) |
49 tabular_file = os.path.abspath(sys.argv[3]) | |
50 | |
51 tmp_dir = tempfile.mkdtemp() | |
52 | |
29 | 53 |
7 | 54 def get_path_and_binary(): |
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
55 """Determine path and binary names for promoter tool.""" |
29 | 56 platform = commands.getoutput("uname") # e.g. Linux |
7 | 57 shell_script = commands.getoutput("which promoter") |
58 if not os.path.isfile(shell_script): | |
29 | 59 sys.exit("ERROR: Missing promoter executable shell script") |
7 | 60 path = None |
61 for line in open(shell_script): | |
29 | 62 if line.startswith("setenv"): # could then be tab or space! |
7 | 63 parts = line.rstrip().split(None, 2) |
64 if parts[0] == "setenv" and parts[1] == "PROM": | |
65 path = parts[2] | |
66 if not path: | |
29 | 67 sys.exit("ERROR: Could not find promoter path (PROM) in %r" % shell_script) |
7 | 68 if not os.path.isdir(path): |
29 | 69 sys.exit("ERROR: %r is not a directory" % path) |
7 | 70 bin = "%s/bin/promoter_%s" % (path, platform) |
71 if not os.path.isfile(bin): | |
29 | 72 sys.exit("ERROR: Missing promoter binary %r" % bin) |
7 | 73 return path, bin |
74 | |
29 | 75 |
7 | 76 def make_tabular(raw_handle, out_handle): |
77 """Parse text output into tabular, return query count.""" | |
78 identifier = None | |
79 queries = 0 | |
80 for line in raw_handle: | |
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
81 # print(repr(line)) |
7 | 82 if not line.strip() or line == "Promoter prediction:\n": |
83 pass | |
84 elif line[0] != " ": | |
29 | 85 identifier = line.strip().replace("\t", " ").split(None, 1)[0] |
7 | 86 queries += 1 |
87 elif line == " No promoter predicted\n": | |
29 | 88 # End of a record |
7 | 89 identifier = None |
90 elif line == " Position Score Likelihood\n": | |
91 assert identifier | |
92 else: | |
93 try: | |
29 | 94 position, score, likelihood = line.strip().split(None, 2) |
7 | 95 except ValueError: |
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
96 print("WARNING: Problem with line: %r" % line) |
7 | 97 continue |
29 | 98 # sys.exit("ERROR: Problem with line: %r" % line) |
7 | 99 if likelihood not in ["ignored", |
100 "Marginal prediction", | |
101 "Medium likely prediction", | |
102 "Highly likely prediction"]: | |
29 | 103 sys.exit("ERROR: Problem with line: %r" % line) |
7 | 104 out_handle.write("%s\t%s\t%s\t%s\n" % (identifier, position, score, likelihood)) |
105 return queries | |
29 | 106 |
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
107 |
7 | 108 working_dir, bin = get_path_and_binary() |
109 | |
110 if not os.path.isfile(fasta_file): | |
29 | 111 sys.exit("ERROR: Missing input FASTA file %r" % fasta_file) |
7 | 112 |
29 | 113 # Note that if the input FASTA file contains no sequences, |
114 # split_fasta returns an empty list (i.e. zero temp files). | |
115 # We deliberately omit the FASTA descriptions to avoid a | |
116 # bug in promoter2 with descriptions over 200 characters. | |
7 | 117 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "promoter"), FASTA_CHUNK, keep_descr=False) |
29 | 118 temp_files = [f + ".out" for f in fasta_files] |
7 | 119 jobs = ["%s %s > %s" % (bin, fasta, temp) |
120 for fasta, temp in zip(fasta_files, temp_files)] | |
121 | |
29 | 122 |
7 | 123 def clean_up(file_list): |
124 for f in file_list: | |
125 if os.path.isfile(f): | |
126 os.remove(f) | |
127 try: | |
128 os.rmdir(tmp_dir) | |
29 | 129 except Exception: |
7 | 130 pass |
131 | |
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
132 |
7 | 133 if len(jobs) > 1 and num_threads > 1: |
29 | 134 # A small "info" message for Galaxy to show the user. |
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
135 print("Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))) |
7 | 136 cur_dir = os.path.abspath(os.curdir) |
137 os.chdir(working_dir) | |
138 results = run_jobs(jobs, num_threads) | |
139 os.chdir(cur_dir) | |
140 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): | |
141 error_level = results[cmd] | |
142 if error_level: | |
143 try: | |
144 output = open(temp).readline() | |
145 except IOError: | |
146 output = "" | |
147 clean_up(fasta_files + temp_files) | |
29 | 148 sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), |
7 | 149 error_level) |
150 | |
151 del results | |
152 del jobs | |
153 | |
154 out_handle = open(tabular_file, "w") | |
155 out_handle.write("#Identifier\tPosition\tScore\tLikelihood\n") | |
156 queries = 0 | |
157 for temp in temp_files: | |
158 data_handle = open(temp) | |
159 count = make_tabular(data_handle, out_handle) | |
160 data_handle.close() | |
161 if not count: | |
162 clean_up(fasta_files + temp_files) | |
29 | 163 sys.exit("No output from promoter2") |
7 | 164 queries += count |
165 out_handle.close() | |
166 | |
167 clean_up(fasta_files + temp_files) | |
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
168 print("Results for %i queries" % queries) |