8
|
1 #!/usr/bin/env python
|
|
2 """Wrapper for psortb for use in Galaxy.
|
|
3
|
|
4 This script takes exactly six command line arguments - which includes the
|
|
5 number of threads, and the input protein FASTA filename and output
|
|
6 tabular filename. It then splits up the FASTA input and calls multiple
|
|
7 copies of the standalone psortb v3 program, then collates the output.
|
|
8 e.g. Rather than this,
|
|
9
|
|
10 psort $type -c $cutoff -d $divergent -o long $sequence > $outfile
|
|
11
|
|
12 Call this:
|
|
13
|
|
14 psort $threads $type $cutoff $divergent $sequence $outfile
|
|
15
|
|
16 If ommitting -c or -d options, set $cutoff and $divergent to zero or blank.
|
|
17
|
|
18 Note that this is somewhat redundant with job-splitting available in Galaxy
|
|
19 itself (see the SignalP XML file for settings), but both can be applied.
|
|
20
|
|
21 Additionally it ensures the header line (with the column names) starts
|
|
22 with a # character as used elsewhere in Galaxy.
|
|
23 """
|
|
24 import sys
|
|
25 import os
|
|
26 import tempfile
|
|
27 from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count
|
|
28
|
|
29 FASTA_CHUNK = 500
|
|
30
|
|
31 if "-v" in sys.argv or "--version" in sys.argv:
|
|
32 """Return underlying PSORTb's version"""
|
|
33 sys.exit(os.system("psort --version"))
|
|
34
|
|
35 if len(sys.argv) != 8:
|
|
36 stop_err("Require 7 arguments, number of threads (int), type (e.g. archaea), "
|
|
37 "output (e.g. terse/normal/long), cutoff, divergent, input protein "
|
|
38 "FASTA file & output tabular file")
|
|
39
|
|
40 num_threads = thread_count(sys.argv[1], default=4)
|
|
41 org_type = sys.argv[2]
|
|
42 out_type = sys.argv[3]
|
|
43 cutoff = sys.argv[4]
|
|
44 if cutoff.strip() and float(cutoff.strip()) != 0.0:
|
|
45 cutoff = "-c %s" % cutoff
|
|
46 else:
|
|
47 cutoff = ""
|
|
48 divergent = sys.argv[5]
|
|
49 if divergent.strip() and float(divergent.strip()) != 0.0:
|
|
50 divergent = "-d %s" % divergent
|
|
51 else:
|
|
52 divergent = ""
|
|
53 fasta_file = sys.argv[6]
|
|
54 tabular_file = sys.argv[7]
|
|
55
|
|
56 if out_type == "terse":
|
|
57 header = ['SeqID', 'Localization', 'Score']
|
|
58 elif out_type == "normal":
|
|
59 stop_err("Normal output not implemented yet, sorry.")
|
|
60 elif out_type == "long":
|
|
61 if org_type == "-n":
|
|
62 #Gram negative bacteria
|
|
63 header = ['SeqID', 'CMSVM-_Localization', 'CMSVM-_Details', 'CytoSVM-_Localization', 'CytoSVM-_Details',
|
|
64 'ECSVM-_Localization', 'ECSVM-_Details', 'ModHMM-_Localization', 'ModHMM-_Details',
|
|
65 'Motif-_Localization', 'Motif-_Details', 'OMPMotif-_Localization', 'OMPMotif-_Details',
|
|
66 'OMSVM-_Localization', 'OMSVM-_Details', 'PPSVM-_Localization', 'PPSVM-_Details',
|
|
67 'Profile-_Localization', 'Profile-_Details',
|
|
68 'SCL-BLAST-_Localization', 'SCL-BLAST-_Details', 'SCL-BLASTe-_Localization', 'SCL-BLASTe-_Details',
|
|
69 'Signal-_Localization', 'Signal-_Details',
|
|
70 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Periplasmic_Score', 'OuterMembrane_Score',
|
|
71 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score',
|
|
72 'Secondary_Localization', 'PSortb_Version']
|
|
73 elif org_type == "-p":
|
|
74 #Gram positive bacteria
|
|
75 header = ['SeqID', 'CMSVM+_Localization', 'CMSVM+_Details', 'CWSVM+_Localization', 'CWSVM+_Details',
|
|
76 'CytoSVM+_Localization', 'CytoSVM+_Details', 'ECSVM+_Localization', 'ECSVM+_Details',
|
|
77 'ModHMM+_Localization', 'ModHMM+_Details', 'Motif+_Localization', 'Motif+_Details',
|
|
78 'Profile+_Localization', 'Profile+_Details',
|
|
79 'SCL-BLAST+_Localization', 'SCL-BLAST+_Details', 'SCL-BLASTe+_Localization', 'SCL-BLASTe+_Details',
|
|
80 'Signal+_Localization', 'Signal+_Details',
|
|
81 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Cellwall_Score',
|
|
82 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score',
|
|
83 'Secondary_Localization', 'PSortb_Version']
|
|
84 elif org_type == "-a":
|
|
85 #Archaea
|
|
86 header = ['SeqID', 'CMSVM_a_Localization', 'CMSVM_a_Details', 'CWSVM_a_Localization', 'CWSVM_a_Details',
|
|
87 'CytoSVM_a_Localization', 'CytoSVM_a_Details', 'ECSVM_a_Localization', 'ECSVM_a_Details',
|
|
88 'ModHMM_a_Localization', 'ModHMM_a_Details', 'Motif_a_Localization', 'Motif_a_Details',
|
|
89 'Profile_a_Localization', 'Profile_a_Details',
|
|
90 'SCL-BLAST_a_Localization', 'SCL-BLAST_a_Details', 'SCL-BLASTe_a_Localization', 'SCL-BLASTe_a_Details',
|
|
91 'Signal_a_Localization', 'Signal_a_Details',
|
|
92 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Cellwall_Score',
|
|
93 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score',
|
|
94 'Secondary_Localization', 'PSortb_Version']
|
|
95 else:
|
|
96 stop_err("Expected -n, -p or -a for the organism type, not %r" % org_type)
|
|
97 else:
|
|
98 stop_err("Expected terse, normal or long for the output type, not %r" % out_type)
|
|
99
|
|
100 tmp_dir = tempfile.mkdtemp()
|
|
101
|
|
102 def clean_tabular(raw_handle, out_handle):
|
|
103 """Clean up tabular TMHMM output, returns output line count."""
|
|
104 global header
|
|
105 count = 0
|
|
106 for line in raw_handle:
|
|
107 if not line.strip() or line.startswith("#"):
|
|
108 #Ignore any blank lines or comment lines
|
|
109 continue
|
|
110 parts = [x.strip() for x in line.rstrip("\r\n").split("\t")]
|
|
111 if parts == header:
|
|
112 #Ignore the header line
|
|
113 continue
|
|
114 if not parts[-1] and len(parts) == len(header) + 1:
|
|
115 #Ignore dummy blank extra column, e.g.
|
|
116 #"...2.0\t\tPSORTb version 3.0\t\n"
|
|
117 parts = parts[:-1]
|
|
118 assert len(parts) == len(header), \
|
|
119 "%i fields, not %i, in line:\n%r" % (len(line), len(header), line)
|
|
120 out_handle.write(line)
|
|
121 count += 1
|
|
122 return count
|
|
123
|
|
124 #Note that if the input FASTA file contains no sequences,
|
|
125 #split_fasta returns an empty list (i.e. zero temp files).
|
|
126 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK)
|
|
127 temp_files = [f+".out" for f in fasta_files]
|
|
128 jobs = ["psort %s %s %s -o %s %s > %s" % (org_type, cutoff, divergent, out_type, fasta, temp)
|
|
129 for fasta, temp in zip(fasta_files, temp_files)]
|
|
130
|
|
131 def clean_up(file_list):
|
|
132 for f in file_list:
|
|
133 if os.path.isfile(f):
|
|
134 os.remove(f)
|
|
135 try:
|
|
136 os.rmdir(tmp_dir)
|
|
137 except:
|
|
138 pass
|
|
139
|
|
140 if len(jobs) > 1 and num_threads > 1:
|
|
141 #A small "info" message for Galaxy to show the user.
|
|
142 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))
|
|
143 results = run_jobs(jobs, num_threads)
|
|
144 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs):
|
|
145 error_level = results[cmd]
|
|
146 if error_level:
|
|
147 try:
|
|
148 output = open(temp).readline()
|
|
149 except IOError:
|
|
150 output = ""
|
|
151 clean_up(fasta_files + temp_files)
|
|
152 stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output),
|
|
153 error_level)
|
|
154 del results
|
|
155 del jobs
|
|
156
|
|
157 out_handle = open(tabular_file, "w")
|
|
158 out_handle.write("#%s\n" % "\t".join(header))
|
11
|
159 count = 0
|
8
|
160 for temp in temp_files:
|
|
161 data_handle = open(temp)
|
11
|
162 count += clean_tabular(data_handle, out_handle)
|
8
|
163 data_handle.close()
|
|
164 if not count:
|
|
165 clean_up(fasta_files + temp_files)
|
|
166 stop_err("No output from psortb")
|
|
167 out_handle.close()
|
|
168 print "%i records" % count
|
|
169
|
|
170 clean_up(fasta_files + temp_files)
|