comparison tools/protein_analysis/signalp3.py @ 7:5e62aefb2918 draft

Uploaded v0.1.2 to Test Tool Shed
author peterjc
date Tue, 26 Mar 2013 14:24:56 -0400
parents ef7ceca37e3f
children 391a142c1e60
comparison
equal deleted inserted replaced
6:39a6e46cdda3 7:5e62aefb2918
2 """Wrapper for SignalP v3.0 for use in Galaxy. 2 """Wrapper for SignalP v3.0 for use in Galaxy.
3 3
4 This script takes exactly five command line arguments: 4 This script takes exactly five command line arguments:
5 * the organism type (euk, gram+ or gram-) 5 * the organism type (euk, gram+ or gram-)
6 * length to truncate sequences to (integer) 6 * length to truncate sequences to (integer)
7 * number of threads to use (integer) 7 * number of threads to use (integer, defaults to one)
8 * an input protein FASTA filename 8 * an input protein FASTA filename
9 * output tabular filename. 9 * output tabular filename.
10
11 There are two further optional arguments
12 * cut type (NN_Cmax, NN_Ymax, NN_Smax or HMM_Cmax)
13 * output GFF3 filename
10 14
11 It then calls the standalone SignalP v3.0 program (not the webservice) 15 It then calls the standalone SignalP v3.0 program (not the webservice)
12 requesting the short output (one line per protein) using both NN and HMM 16 requesting the short output (one line per protein) using both NN and HMM
13 for predictions. 17 for predictions.
14 18
39 The third major feature is taking advantage of multiple cores (since SignalP 43 The third major feature is taking advantage of multiple cores (since SignalP
40 v3.0 itself is single threaded) by using the individual FASTA input files to 44 v3.0 itself is single threaded) by using the individual FASTA input files to
41 run multiple copies of TMHMM in parallel. I would normally use Python's 45 run multiple copies of TMHMM in parallel. I would normally use Python's
42 multiprocessing library in this situation but it requires at least Python 2.6 46 multiprocessing library in this situation but it requires at least Python 2.6
43 and at the time of writing Galaxy still supports Python 2.4. 47 and at the time of writing Galaxy still supports Python 2.4.
48
49 Note that this is somewhat redundant with job-splitting available in Galaxy
50 itself (see the SignalP XML file for settings).
51
52 Finally, you can opt to have a GFF3 file produced which will describe the
53 predicted signal peptide and mature peptide for each protein (using one of
54 the predictors which gives a cleavage site). *WORK IN PROGRESS*
44 """ 55 """
45 import sys 56 import sys
46 import os 57 import os
47 from seq_analysis_utils import stop_err, split_fasta, run_jobs 58 import tempfile
59 from seq_analysis_utils import stop_err, split_fasta, fasta_iterator
60 from seq_analysis_utils import run_jobs, thread_count
48 61
49 FASTA_CHUNK = 500 62 FASTA_CHUNK = 500
50 MAX_LEN = 6000 #Found by trial and error 63 MAX_LEN = 6000 #Found by trial and error
51 64
52 if len(sys.argv) != 6: 65 if len(sys.argv) not in [6,8]:
53 stop_err("Require five arguments, organism, truncate, threads, input protein FASTA file & output tabular file") 66 stop_err("Require five (or 7) arguments, organism, truncate, threads, "
67 "input protein FASTA file & output tabular file (plus "
68 "optionally cut method and GFF3 output file). "
69 "Got %i arguments." % (len(sys.argv)-1))
54 70
55 organism = sys.argv[1] 71 organism = sys.argv[1]
56 if organism not in ["euk", "gram+", "gram-"]: 72 if organism not in ["euk", "gram+", "gram-"]:
57 stop_err("Organism argument %s is not one of euk, gram+ or gram-" % organism) 73 stop_err("Organism argument %s is not one of euk, gram+ or gram-" % organism)
58 74
61 except: 77 except:
62 truncate = 0 78 truncate = 0
63 if truncate < 0: 79 if truncate < 0:
64 stop_err("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2]) 80 stop_err("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2])
65 81
66 try: 82 num_threads = thread_count(sys.argv[3], default=4)
67 num_threads = int(sys.argv[3])
68 except:
69 num_threads = 0
70 if num_threads < 1:
71 stop_err("Threads argument %s is not a positive integer" % sys.argv[3])
72
73 fasta_file = sys.argv[4] 83 fasta_file = sys.argv[4]
74
75 tabular_file = sys.argv[5] 84 tabular_file = sys.argv[5]
76 85
77 def clean_tabular(raw_handle, out_handle): 86 if len(sys.argv) == 8:
87 cut_method = sys.argv[6]
88 if cut_method not in ["NN_Cmax", "NN_Ymax", "NN_Smax", "HMM_Cmax"]:
89 stop_err("Invalid cut method %r" % cut_method)
90 gff3_file = sys.argv[7]
91 else:
92 cut_method = None
93 gff3_file = None
94
95
96 tmp_dir = tempfile.mkdtemp()
97
98 def clean_tabular(raw_handle, out_handle, gff_handle=None, cut_method=None):
78 """Clean up SignalP output to make it tabular.""" 99 """Clean up SignalP output to make it tabular."""
100 if cut_method:
101 cut_col = {"NN_Cmax" : 2,
102 "NN_Ymax" : 5,
103 "NN_Smax" : 8,
104 "HMM_Cmax" : 16}[cut_method]
105 else:
106 cut_col = None
79 for line in raw_handle: 107 for line in raw_handle:
80 if not line or line.startswith("#"): 108 if not line or line.startswith("#"):
81 continue 109 continue
82 parts = line.rstrip("\r\n").split() 110 parts = line.rstrip("\r\n").split()
83 assert len(parts)==21, repr(line) 111 assert len(parts)==21, repr(line)
85 #Remove redundant truncated name column (col 0) 113 #Remove redundant truncated name column (col 0)
86 #and put full name at start (col 14) 114 #and put full name at start (col 14)
87 parts = parts[14:15] + parts[1:14] + parts[15:] 115 parts = parts[14:15] + parts[1:14] + parts[15:]
88 out_handle.write("\t".join(parts) + "\n") 116 out_handle.write("\t".join(parts) + "\n")
89 117
90 fasta_files = split_fasta(fasta_file, tabular_file, n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN) 118 def make_gff(fasta_file, tabular_file, gff_file, cut_method):
119 cut_col, score_col = {"NN_Cmax" : (2,1),
120 "NN_Ymax" : (5,4),
121 "NN_Smax" : (8,7),
122 "HMM_Cmax" : (16,15),
123 }[cut_method]
124
125 source = "SignalP"
126 strand = "." #not stranded
127 phase = "." #not phased
128 tags = "Note=%s" % cut_method
129
130 tab_handle = open(tabular_file)
131 line = tab_handle.readline()
132 assert line.startswith("#ID\t"), line
133
134 gff_handle = open(gff_file, "w")
135 gff_handle.write("##gff-version 3\n")
136
137 for (title, seq), line in zip(fasta_iterator(fasta_file), tab_handle):
138 parts = line.rstrip("\n").split("\t")
139 seqid = parts[0]
140 assert title.startswith(seqid), "%s vs %s" % (seqid, title)
141 if len(seq)==0:
142 #Is it possible to have a zero length reference in GFF3?
143 continue
144 cut = int(parts[cut_col])
145 if cut == 0:
146 assert cut_method == "HMM_Cmax", cut_method
147 #TODO - Why does it do this?
148 cut = 1
149 assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq))
150 score = parts[score_col]
151 gff_handle.write("##sequence-region %s %i %i\n" \
152 % (seqid, 1, len(seq)))
153 #If the cut is at the very begining, there is no signal peptide!
154 if cut > 1:
155 #signal_peptide = SO:0000418
156 gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" \
157 % (seqid, source,
158 "signal_peptide", 1, cut-1,
159 score, strand, phase, tags))
160 #mature_protein_region = SO:0000419
161 gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" \
162 % (seqid, source,
163 "mature_protein_region", cut, len(seq),
164 score, strand, phase, tags))
165 tab_handle.close()
166 gff_handle.close()
167
168
169 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "signalp"),
170 n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN)
91 temp_files = [f+".out" for f in fasta_files] 171 temp_files = [f+".out" for f in fasta_files]
92 assert len(fasta_files) == len(temp_files) 172 assert len(fasta_files) == len(temp_files)
93 jobs = ["signalp -short -t %s %s > %s" % (organism, fasta, temp) 173 jobs = ["signalp -short -t %s %s > %s" % (organism, fasta, temp)
94 for (fasta, temp) in zip(fasta_files, temp_files)] 174 for (fasta, temp) in zip(fasta_files, temp_files)]
95 assert len(fasta_files) == len(temp_files) == len(jobs) 175 assert len(fasta_files) == len(temp_files) == len(jobs)
96 176
97 def clean_up(file_list): 177 def clean_up(file_list):
98 for f in file_list: 178 for f in file_list:
99 if os.path.isfile(f): 179 if os.path.isfile(f):
100 os.remove(f) 180 os.remove(f)
181 try:
182 os.rmdir(tmp_dir)
183 except:
184 pass
101 185
102 if len(jobs) > 1 and num_threads > 1: 186 if len(jobs) > 1 and num_threads > 1:
103 #A small "info" message for Galaxy to show the user. 187 #A small "info" message for Galaxy to show the user.
104 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) 188 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))
105 results = run_jobs(jobs, num_threads) 189 results = run_jobs(jobs, num_threads)
107 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): 191 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs):
108 error_level = results[cmd] 192 error_level = results[cmd]
109 try: 193 try:
110 output = open(temp).readline() 194 output = open(temp).readline()
111 except IOError: 195 except IOError:
112 output = "" 196 output = "(no output)"
113 if error_level or output.lower().startswith("error running"): 197 if error_level or output.lower().startswith("error running"):
114 clean_up(fasta_files) 198 clean_up(fasta_files + temp_files)
115 clean_up(temp_files)
116 stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), 199 stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output),
117 error_level) 200 error_level)
118 del results 201 del results
119 202
120 out_handle = open(tabular_file, "w") 203 out_handle = open(tabular_file, "w")
131 data_handle = open(temp) 214 data_handle = open(temp)
132 clean_tabular(data_handle, out_handle) 215 clean_tabular(data_handle, out_handle)
133 data_handle.close() 216 data_handle.close()
134 out_handle.close() 217 out_handle.close()
135 218
136 clean_up(fasta_files) 219 #GFF3:
137 clean_up(temp_files) 220 if cut_method:
221 make_gff(fasta_file, tabular_file, gff3_file, cut_method)
222
223 clean_up(fasta_files + temp_files)