Mercurial > repos > peterjc > tmhmm_and_signalp
comparison tools/protein_analysis/signalp3.py @ 7:5e62aefb2918 draft
Uploaded v0.1.2 to Test Tool Shed
| author | peterjc | 
|---|---|
| date | Tue, 26 Mar 2013 14:24:56 -0400 | 
| parents | ef7ceca37e3f | 
| children | 391a142c1e60 | 
   comparison
  equal
  deleted
  inserted
  replaced
| 6:39a6e46cdda3 | 7:5e62aefb2918 | 
|---|---|
| 2 """Wrapper for SignalP v3.0 for use in Galaxy. | 2 """Wrapper for SignalP v3.0 for use in Galaxy. | 
| 3 | 3 | 
| 4 This script takes exactly five command line arguments: | 4 This script takes exactly five command line arguments: | 
| 5 * the organism type (euk, gram+ or gram-) | 5 * the organism type (euk, gram+ or gram-) | 
| 6 * length to truncate sequences to (integer) | 6 * length to truncate sequences to (integer) | 
| 7 * number of threads to use (integer) | 7 * number of threads to use (integer, defaults to one) | 
| 8 * an input protein FASTA filename | 8 * an input protein FASTA filename | 
| 9 * output tabular filename. | 9 * output tabular filename. | 
| 10 | |
| 11 There are two further optional arguments | |
| 12 * cut type (NN_Cmax, NN_Ymax, NN_Smax or HMM_Cmax) | |
| 13 * output GFF3 filename | |
| 10 | 14 | 
| 11 It then calls the standalone SignalP v3.0 program (not the webservice) | 15 It then calls the standalone SignalP v3.0 program (not the webservice) | 
| 12 requesting the short output (one line per protein) using both NN and HMM | 16 requesting the short output (one line per protein) using both NN and HMM | 
| 13 for predictions. | 17 for predictions. | 
| 14 | 18 | 
| 39 The third major feature is taking advantage of multiple cores (since SignalP | 43 The third major feature is taking advantage of multiple cores (since SignalP | 
| 40 v3.0 itself is single threaded) by using the individual FASTA input files to | 44 v3.0 itself is single threaded) by using the individual FASTA input files to | 
| 41 run multiple copies of TMHMM in parallel. I would normally use Python's | 45 run multiple copies of TMHMM in parallel. I would normally use Python's | 
| 42 multiprocessing library in this situation but it requires at least Python 2.6 | 46 multiprocessing library in this situation but it requires at least Python 2.6 | 
| 43 and at the time of writing Galaxy still supports Python 2.4. | 47 and at the time of writing Galaxy still supports Python 2.4. | 
| 48 | |
| 49 Note that this is somewhat redundant with job-splitting available in Galaxy | |
| 50 itself (see the SignalP XML file for settings). | |
| 51 | |
| 52 Finally, you can opt to have a GFF3 file produced which will describe the | |
| 53 predicted signal peptide and mature peptide for each protein (using one of | |
| 54 the predictors which gives a cleavage site). *WORK IN PROGRESS* | |
| 44 """ | 55 """ | 
| 45 import sys | 56 import sys | 
| 46 import os | 57 import os | 
| 47 from seq_analysis_utils import stop_err, split_fasta, run_jobs | 58 import tempfile | 
| 59 from seq_analysis_utils import stop_err, split_fasta, fasta_iterator | |
| 60 from seq_analysis_utils import run_jobs, thread_count | |
| 48 | 61 | 
| 49 FASTA_CHUNK = 500 | 62 FASTA_CHUNK = 500 | 
| 50 MAX_LEN = 6000 #Found by trial and error | 63 MAX_LEN = 6000 #Found by trial and error | 
| 51 | 64 | 
| 52 if len(sys.argv) != 6: | 65 if len(sys.argv) not in [6,8]: | 
| 53 stop_err("Require five arguments, organism, truncate, threads, input protein FASTA file & output tabular file") | 66 stop_err("Require five (or 7) arguments, organism, truncate, threads, " | 
| 67 "input protein FASTA file & output tabular file (plus " | |
| 68 "optionally cut method and GFF3 output file). " | |
| 69 "Got %i arguments." % (len(sys.argv)-1)) | |
| 54 | 70 | 
| 55 organism = sys.argv[1] | 71 organism = sys.argv[1] | 
| 56 if organism not in ["euk", "gram+", "gram-"]: | 72 if organism not in ["euk", "gram+", "gram-"]: | 
| 57 stop_err("Organism argument %s is not one of euk, gram+ or gram-" % organism) | 73 stop_err("Organism argument %s is not one of euk, gram+ or gram-" % organism) | 
| 58 | 74 | 
| 61 except: | 77 except: | 
| 62 truncate = 0 | 78 truncate = 0 | 
| 63 if truncate < 0: | 79 if truncate < 0: | 
| 64 stop_err("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2]) | 80 stop_err("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2]) | 
| 65 | 81 | 
| 66 try: | 82 num_threads = thread_count(sys.argv[3], default=4) | 
| 67 num_threads = int(sys.argv[3]) | |
| 68 except: | |
| 69 num_threads = 0 | |
| 70 if num_threads < 1: | |
| 71 stop_err("Threads argument %s is not a positive integer" % sys.argv[3]) | |
| 72 | |
| 73 fasta_file = sys.argv[4] | 83 fasta_file = sys.argv[4] | 
| 74 | |
| 75 tabular_file = sys.argv[5] | 84 tabular_file = sys.argv[5] | 
| 76 | 85 | 
| 77 def clean_tabular(raw_handle, out_handle): | 86 if len(sys.argv) == 8: | 
| 87 cut_method = sys.argv[6] | |
| 88 if cut_method not in ["NN_Cmax", "NN_Ymax", "NN_Smax", "HMM_Cmax"]: | |
| 89 stop_err("Invalid cut method %r" % cut_method) | |
| 90 gff3_file = sys.argv[7] | |
| 91 else: | |
| 92 cut_method = None | |
| 93 gff3_file = None | |
| 94 | |
| 95 | |
| 96 tmp_dir = tempfile.mkdtemp() | |
| 97 | |
| 98 def clean_tabular(raw_handle, out_handle, gff_handle=None, cut_method=None): | |
| 78 """Clean up SignalP output to make it tabular.""" | 99 """Clean up SignalP output to make it tabular.""" | 
| 100 if cut_method: | |
| 101 cut_col = {"NN_Cmax" : 2, | |
| 102 "NN_Ymax" : 5, | |
| 103 "NN_Smax" : 8, | |
| 104 "HMM_Cmax" : 16}[cut_method] | |
| 105 else: | |
| 106 cut_col = None | |
| 79 for line in raw_handle: | 107 for line in raw_handle: | 
| 80 if not line or line.startswith("#"): | 108 if not line or line.startswith("#"): | 
| 81 continue | 109 continue | 
| 82 parts = line.rstrip("\r\n").split() | 110 parts = line.rstrip("\r\n").split() | 
| 83 assert len(parts)==21, repr(line) | 111 assert len(parts)==21, repr(line) | 
| 85 #Remove redundant truncated name column (col 0) | 113 #Remove redundant truncated name column (col 0) | 
| 86 #and put full name at start (col 14) | 114 #and put full name at start (col 14) | 
| 87 parts = parts[14:15] + parts[1:14] + parts[15:] | 115 parts = parts[14:15] + parts[1:14] + parts[15:] | 
| 88 out_handle.write("\t".join(parts) + "\n") | 116 out_handle.write("\t".join(parts) + "\n") | 
| 89 | 117 | 
| 90 fasta_files = split_fasta(fasta_file, tabular_file, n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN) | 118 def make_gff(fasta_file, tabular_file, gff_file, cut_method): | 
| 119 cut_col, score_col = {"NN_Cmax" : (2,1), | |
| 120 "NN_Ymax" : (5,4), | |
| 121 "NN_Smax" : (8,7), | |
| 122 "HMM_Cmax" : (16,15), | |
| 123 }[cut_method] | |
| 124 | |
| 125 source = "SignalP" | |
| 126 strand = "." #not stranded | |
| 127 phase = "." #not phased | |
| 128 tags = "Note=%s" % cut_method | |
| 129 | |
| 130 tab_handle = open(tabular_file) | |
| 131 line = tab_handle.readline() | |
| 132 assert line.startswith("#ID\t"), line | |
| 133 | |
| 134 gff_handle = open(gff_file, "w") | |
| 135 gff_handle.write("##gff-version 3\n") | |
| 136 | |
| 137 for (title, seq), line in zip(fasta_iterator(fasta_file), tab_handle): | |
| 138 parts = line.rstrip("\n").split("\t") | |
| 139 seqid = parts[0] | |
| 140 assert title.startswith(seqid), "%s vs %s" % (seqid, title) | |
| 141 if len(seq)==0: | |
| 142 #Is it possible to have a zero length reference in GFF3? | |
| 143 continue | |
| 144 cut = int(parts[cut_col]) | |
| 145 if cut == 0: | |
| 146 assert cut_method == "HMM_Cmax", cut_method | |
| 147 #TODO - Why does it do this? | |
| 148 cut = 1 | |
| 149 assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq)) | |
| 150 score = parts[score_col] | |
| 151 gff_handle.write("##sequence-region %s %i %i\n" \ | |
| 152 % (seqid, 1, len(seq))) | |
| 153 #If the cut is at the very begining, there is no signal peptide! | |
| 154 if cut > 1: | |
| 155 #signal_peptide = SO:0000418 | |
| 156 gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" \ | |
| 157 % (seqid, source, | |
| 158 "signal_peptide", 1, cut-1, | |
| 159 score, strand, phase, tags)) | |
| 160 #mature_protein_region = SO:0000419 | |
| 161 gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" \ | |
| 162 % (seqid, source, | |
| 163 "mature_protein_region", cut, len(seq), | |
| 164 score, strand, phase, tags)) | |
| 165 tab_handle.close() | |
| 166 gff_handle.close() | |
| 167 | |
| 168 | |
| 169 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "signalp"), | |
| 170 n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN) | |
| 91 temp_files = [f+".out" for f in fasta_files] | 171 temp_files = [f+".out" for f in fasta_files] | 
| 92 assert len(fasta_files) == len(temp_files) | 172 assert len(fasta_files) == len(temp_files) | 
| 93 jobs = ["signalp -short -t %s %s > %s" % (organism, fasta, temp) | 173 jobs = ["signalp -short -t %s %s > %s" % (organism, fasta, temp) | 
| 94 for (fasta, temp) in zip(fasta_files, temp_files)] | 174 for (fasta, temp) in zip(fasta_files, temp_files)] | 
| 95 assert len(fasta_files) == len(temp_files) == len(jobs) | 175 assert len(fasta_files) == len(temp_files) == len(jobs) | 
| 96 | 176 | 
| 97 def clean_up(file_list): | 177 def clean_up(file_list): | 
| 98 for f in file_list: | 178 for f in file_list: | 
| 99 if os.path.isfile(f): | 179 if os.path.isfile(f): | 
| 100 os.remove(f) | 180 os.remove(f) | 
| 181 try: | |
| 182 os.rmdir(tmp_dir) | |
| 183 except: | |
| 184 pass | |
| 101 | 185 | 
| 102 if len(jobs) > 1 and num_threads > 1: | 186 if len(jobs) > 1 and num_threads > 1: | 
| 103 #A small "info" message for Galaxy to show the user. | 187 #A small "info" message for Galaxy to show the user. | 
| 104 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) | 188 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) | 
| 105 results = run_jobs(jobs, num_threads) | 189 results = run_jobs(jobs, num_threads) | 
| 107 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): | 191 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): | 
| 108 error_level = results[cmd] | 192 error_level = results[cmd] | 
| 109 try: | 193 try: | 
| 110 output = open(temp).readline() | 194 output = open(temp).readline() | 
| 111 except IOError: | 195 except IOError: | 
| 112 output = "" | 196 output = "(no output)" | 
| 113 if error_level or output.lower().startswith("error running"): | 197 if error_level or output.lower().startswith("error running"): | 
| 114 clean_up(fasta_files) | 198 clean_up(fasta_files + temp_files) | 
| 115 clean_up(temp_files) | |
| 116 stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), | 199 stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), | 
| 117 error_level) | 200 error_level) | 
| 118 del results | 201 del results | 
| 119 | 202 | 
| 120 out_handle = open(tabular_file, "w") | 203 out_handle = open(tabular_file, "w") | 
| 131 data_handle = open(temp) | 214 data_handle = open(temp) | 
| 132 clean_tabular(data_handle, out_handle) | 215 clean_tabular(data_handle, out_handle) | 
| 133 data_handle.close() | 216 data_handle.close() | 
| 134 out_handle.close() | 217 out_handle.close() | 
| 135 | 218 | 
| 136 clean_up(fasta_files) | 219 #GFF3: | 
| 137 clean_up(temp_files) | 220 if cut_method: | 
| 221 make_gff(fasta_file, tabular_file, gff3_file, cut_method) | |
| 222 | |
| 223 clean_up(fasta_files + temp_files) | 
