comparison tools/protein_analysis/promoter2.py @ 7:5e62aefb2918 draft

Uploaded v0.1.2 to Test Tool Shed
author peterjc
date Tue, 26 Mar 2013 14:24:56 -0400
parents
children 20139cb4c844
comparison
equal deleted inserted replaced
6:39a6e46cdda3 7:5e62aefb2918
1 #!/usr/bin/env python
2 """Wrapper for Promoter 2.0 for use in Galaxy.
3
4 This script takes exactly three command line arguments:
5 * number of threads
6 * an input DNA FASTA filename
7 * output tabular filename.
8
9 It calls the Promoter 2.0 binary (e.g. .../promoter-2.0/bin/promoter_Linux,
10 bypassing the Perl wrapper script 'promoter' which imposes a significant
11 performace overhead for no benefit here (we don't need HTML output for
12 example).
13
14 The main feature is this Python wrapper script parsers the bespoke
15 tabular output from Promoter 2.0 and reformats it into a Galaxy friendly
16 tab separated table.
17
18 Additionally, in order to take advantage of multiple cores the input FASTA
19 file is broken into chunks and multiple copies of promoter run at once.
20 This can be used in combination with the job-splitting available in Galaxy.
21
22 Note that rewriting the FASTA input file allows us to avoid a bug in
23 promoter 2 with long descriptions in the FASTA header line (over 200
24 characters) which produces stray fragements of the description in the
25 output file, making parsing non-trivial.
26
27 TODO - Automatically extract the sequence containing a promoter prediction?
28 """
29 import sys
30 import os
31 import commands
32 import tempfile
33 from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count
34
35 FASTA_CHUNK = 500
36
37 if len(sys.argv) != 4:
38 stop_err("Require three arguments, number of threads (int), input DNA FASTA file & output tabular file. "
39 "Got %i arguments." % (len(sys.argv)-1))
40
41 num_threads = thread_count(sys.argv[3],default=4)
42 fasta_file = os.path.abspath(sys.argv[2])
43 tabular_file = os.path.abspath(sys.argv[3])
44
45 tmp_dir = tempfile.mkdtemp()
46
47 def get_path_and_binary():
48 platform = commands.getoutput("uname") #e.g. Linux
49 shell_script = commands.getoutput("which promoter")
50 if not os.path.isfile(shell_script):
51 stop_err("ERROR: Missing promoter executable shell script")
52 path = None
53 for line in open(shell_script):
54 if line.startswith("setenv"): #could then be tab or space!
55 parts = line.rstrip().split(None, 2)
56 if parts[0] == "setenv" and parts[1] == "PROM":
57 path = parts[2]
58 if not path:
59 stop_err("ERROR: Could not find promoter path (PROM) in %r" % shell_script)
60 if not os.path.isdir(path):
61 stop_error("ERROR: %r is not a directory" % path)
62 bin = "%s/bin/promoter_%s" % (path, platform)
63 if not os.path.isfile(bin):
64 stop_err("ERROR: Missing promoter binary %r" % bin)
65 return path, bin
66
67 def make_tabular(raw_handle, out_handle):
68 """Parse text output into tabular, return query count."""
69 identifier = None
70 queries = 0
71 for line in raw_handle:
72 #print repr(line)
73 if not line.strip() or line == "Promoter prediction:\n":
74 pass
75 elif line[0] != " ":
76 identifier = line.strip().replace("\t", " ").split(None,1)[0]
77 queries += 1
78 elif line == " No promoter predicted\n":
79 #End of a record
80 identifier = None
81 elif line == " Position Score Likelihood\n":
82 assert identifier
83 else:
84 try:
85 position, score, likelihood = line.strip().split(None,2)
86 except ValueError:
87 print "WARNING: Problem with line: %r" % line
88 continue
89 #stop_err("ERROR: Problem with line: %r" % line)
90 if likelihood not in ["ignored",
91 "Marginal prediction",
92 "Medium likely prediction",
93 "Highly likely prediction"]:
94 stop_err("ERROR: Problem with line: %r" % line)
95 out_handle.write("%s\t%s\t%s\t%s\n" % (identifier, position, score, likelihood))
96 return queries
97
98 working_dir, bin = get_path_and_binary()
99
100 if not os.path.isfile(fasta_file):
101 stop_err("ERROR: Missing input FASTA file %r" % fasta_file)
102
103 #Note that if the input FASTA file contains no sequences,
104 #split_fasta returns an empty list (i.e. zero temp files).
105 #We deliberately omit the FASTA descriptions to avoid a
106 #bug in promoter2 with descriptions over 200 characters.
107 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "promoter"), FASTA_CHUNK, keep_descr=False)
108 temp_files = [f+".out" for f in fasta_files]
109 jobs = ["%s %s > %s" % (bin, fasta, temp)
110 for fasta, temp in zip(fasta_files, temp_files)]
111
112 def clean_up(file_list):
113 for f in file_list:
114 if os.path.isfile(f):
115 os.remove(f)
116 try:
117 os.rmdir(tmp_dir)
118 except:
119 pass
120
121 if len(jobs) > 1 and num_threads > 1:
122 #A small "info" message for Galaxy to show the user.
123 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))
124 cur_dir = os.path.abspath(os.curdir)
125 os.chdir(working_dir)
126 results = run_jobs(jobs, num_threads)
127 os.chdir(cur_dir)
128 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs):
129 error_level = results[cmd]
130 if error_level:
131 try:
132 output = open(temp).readline()
133 except IOError:
134 output = ""
135 clean_up(fasta_files + temp_files)
136 stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output),
137 error_level)
138
139 del results
140 del jobs
141
142 out_handle = open(tabular_file, "w")
143 out_handle.write("#Identifier\tPosition\tScore\tLikelihood\n")
144 queries = 0
145 for temp in temp_files:
146 data_handle = open(temp)
147 count = make_tabular(data_handle, out_handle)
148 data_handle.close()
149 if not count:
150 clean_up(fasta_files + temp_files)
151 stop_err("No output from promoter2")
152 queries += count
153 out_handle.close()
154
155 clean_up(fasta_files + temp_files)
156 print "Results for %i queries" % queries