Mercurial > repos > peterjc > tmhmm_and_signalp
annotate tools/protein_analysis/promoter2.py @ 30:6d9d7cdf00fc draft
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
| author | peterjc |
|---|---|
| date | Thu, 21 Sep 2017 11:15:55 -0400 |
| parents | 3cb02adf4326 |
| children | 20da7f48b56f |
| rev | line source |
|---|---|
| 7 | 1 #!/usr/bin/env python |
| 2 """Wrapper for Promoter 2.0 for use in Galaxy. | |
| 3 | |
| 4 This script takes exactly three command line arguments: | |
| 5 * number of threads | |
| 6 * an input DNA FASTA filename | |
| 7 * output tabular filename. | |
| 8 | |
| 9 It calls the Promoter 2.0 binary (e.g. .../promoter-2.0/bin/promoter_Linux, | |
| 10 bypassing the Perl wrapper script 'promoter' which imposes a significant | |
| 11 performace overhead for no benefit here (we don't need HTML output for | |
| 12 example). | |
| 13 | |
| 14 The main feature is this Python wrapper script parsers the bespoke | |
| 15 tabular output from Promoter 2.0 and reformats it into a Galaxy friendly | |
| 16 tab separated table. | |
| 17 | |
| 18 Additionally, in order to take advantage of multiple cores the input FASTA | |
| 19 file is broken into chunks and multiple copies of promoter run at once. | |
| 20 This can be used in combination with the job-splitting available in Galaxy. | |
| 21 Note that rewriting the FASTA input file allows us to avoid a bug in | |
| 22 promoter 2 with long descriptions in the FASTA header line (over 200 | |
| 23 characters) which produces stray fragements of the description in the | |
| 24 output file, making parsing non-trivial. | |
| 25 | |
| 26 TODO - Automatically extract the sequence containing a promoter prediction? | |
| 27 """ | |
|
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
28 |
|
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
29 from __future__ import print_function |
|
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
30 |
| 7 | 31 import commands |
|
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
32 import os |
|
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
33 import sys |
| 7 | 34 import tempfile |
|
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
35 |
|
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
36 from seq_analysis_utils import run_jobs, split_fasta, thread_count |
| 7 | 37 |
| 38 FASTA_CHUNK = 500 | |
| 39 | |
| 28 | 40 if "-v" in sys.argv or "--version" in sys.argv: |
| 41 sys.exit(os.system("promoter -V")) | |
| 42 | |
| 7 | 43 if len(sys.argv) != 4: |
| 29 | 44 sys.exit("Require three arguments, number of threads (int), input DNA FASTA file & output tabular file. " |
| 45 "Got %i arguments." % (len(sys.argv) - 1)) | |
| 7 | 46 |
| 29 | 47 num_threads = thread_count(sys.argv[3], default=4) |
| 7 | 48 fasta_file = os.path.abspath(sys.argv[2]) |
| 49 tabular_file = os.path.abspath(sys.argv[3]) | |
| 50 | |
| 51 tmp_dir = tempfile.mkdtemp() | |
| 52 | |
| 29 | 53 |
| 7 | 54 def get_path_and_binary(): |
|
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
55 """Determine path and binary names for promoter tool.""" |
| 29 | 56 platform = commands.getoutput("uname") # e.g. Linux |
| 7 | 57 shell_script = commands.getoutput("which promoter") |
| 58 if not os.path.isfile(shell_script): | |
| 29 | 59 sys.exit("ERROR: Missing promoter executable shell script") |
| 7 | 60 path = None |
| 61 for line in open(shell_script): | |
| 29 | 62 if line.startswith("setenv"): # could then be tab or space! |
| 7 | 63 parts = line.rstrip().split(None, 2) |
| 64 if parts[0] == "setenv" and parts[1] == "PROM": | |
| 65 path = parts[2] | |
| 66 if not path: | |
| 29 | 67 sys.exit("ERROR: Could not find promoter path (PROM) in %r" % shell_script) |
| 7 | 68 if not os.path.isdir(path): |
| 29 | 69 sys.exit("ERROR: %r is not a directory" % path) |
| 7 | 70 bin = "%s/bin/promoter_%s" % (path, platform) |
| 71 if not os.path.isfile(bin): | |
| 29 | 72 sys.exit("ERROR: Missing promoter binary %r" % bin) |
| 7 | 73 return path, bin |
| 74 | |
| 29 | 75 |
| 7 | 76 def make_tabular(raw_handle, out_handle): |
| 77 """Parse text output into tabular, return query count.""" | |
| 78 identifier = None | |
| 79 queries = 0 | |
| 80 for line in raw_handle: | |
|
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
81 # print(repr(line)) |
| 7 | 82 if not line.strip() or line == "Promoter prediction:\n": |
| 83 pass | |
| 84 elif line[0] != " ": | |
| 29 | 85 identifier = line.strip().replace("\t", " ").split(None, 1)[0] |
| 7 | 86 queries += 1 |
| 87 elif line == " No promoter predicted\n": | |
| 29 | 88 # End of a record |
| 7 | 89 identifier = None |
| 90 elif line == " Position Score Likelihood\n": | |
| 91 assert identifier | |
| 92 else: | |
| 93 try: | |
| 29 | 94 position, score, likelihood = line.strip().split(None, 2) |
| 7 | 95 except ValueError: |
|
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
96 print("WARNING: Problem with line: %r" % line) |
| 7 | 97 continue |
| 29 | 98 # sys.exit("ERROR: Problem with line: %r" % line) |
| 7 | 99 if likelihood not in ["ignored", |
| 100 "Marginal prediction", | |
| 101 "Medium likely prediction", | |
| 102 "Highly likely prediction"]: | |
| 29 | 103 sys.exit("ERROR: Problem with line: %r" % line) |
| 7 | 104 out_handle.write("%s\t%s\t%s\t%s\n" % (identifier, position, score, likelihood)) |
| 105 return queries | |
| 29 | 106 |
|
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
107 |
| 7 | 108 working_dir, bin = get_path_and_binary() |
| 109 | |
| 110 if not os.path.isfile(fasta_file): | |
| 29 | 111 sys.exit("ERROR: Missing input FASTA file %r" % fasta_file) |
| 7 | 112 |
| 29 | 113 # Note that if the input FASTA file contains no sequences, |
| 114 # split_fasta returns an empty list (i.e. zero temp files). | |
| 115 # We deliberately omit the FASTA descriptions to avoid a | |
| 116 # bug in promoter2 with descriptions over 200 characters. | |
| 7 | 117 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "promoter"), FASTA_CHUNK, keep_descr=False) |
| 29 | 118 temp_files = [f + ".out" for f in fasta_files] |
| 7 | 119 jobs = ["%s %s > %s" % (bin, fasta, temp) |
| 120 for fasta, temp in zip(fasta_files, temp_files)] | |
| 121 | |
| 29 | 122 |
| 7 | 123 def clean_up(file_list): |
| 124 for f in file_list: | |
| 125 if os.path.isfile(f): | |
| 126 os.remove(f) | |
| 127 try: | |
| 128 os.rmdir(tmp_dir) | |
| 29 | 129 except Exception: |
| 7 | 130 pass |
| 131 | |
|
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
132 |
| 7 | 133 if len(jobs) > 1 and num_threads > 1: |
| 29 | 134 # A small "info" message for Galaxy to show the user. |
|
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
135 print("Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))) |
| 7 | 136 cur_dir = os.path.abspath(os.curdir) |
| 137 os.chdir(working_dir) | |
| 138 results = run_jobs(jobs, num_threads) | |
| 139 os.chdir(cur_dir) | |
| 140 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): | |
| 141 error_level = results[cmd] | |
| 142 if error_level: | |
| 143 try: | |
| 144 output = open(temp).readline() | |
| 145 except IOError: | |
| 146 output = "" | |
| 147 clean_up(fasta_files + temp_files) | |
| 29 | 148 sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), |
| 7 | 149 error_level) |
| 150 | |
| 151 del results | |
| 152 del jobs | |
| 153 | |
| 154 out_handle = open(tabular_file, "w") | |
| 155 out_handle.write("#Identifier\tPosition\tScore\tLikelihood\n") | |
| 156 queries = 0 | |
| 157 for temp in temp_files: | |
| 158 data_handle = open(temp) | |
| 159 count = make_tabular(data_handle, out_handle) | |
| 160 data_handle.close() | |
| 161 if not count: | |
| 162 clean_up(fasta_files + temp_files) | |
| 29 | 163 sys.exit("No output from promoter2") |
| 7 | 164 queries += count |
| 165 out_handle.close() | |
| 166 | |
| 167 clean_up(fasta_files + temp_files) | |
|
30
6d9d7cdf00fc
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
peterjc
parents:
29
diff
changeset
|
168 print("Results for %i queries" % queries) |
