diff tools/protein_analysis/psortb.py @ 8:391a142c1e60 draft

Uploaded
author peterjc
date Tue, 26 Mar 2013 14:27:44 -0400
parents
children 3d74c1176d67
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/protein_analysis/psortb.py	Tue Mar 26 14:27:44 2013 -0400
@@ -0,0 +1,169 @@
+#!/usr/bin/env python
+"""Wrapper for psortb for use in Galaxy.
+
+This script takes exactly six command line arguments - which includes the
+number of threads, and the input protein FASTA filename and output
+tabular filename. It then splits up the FASTA input and calls multiple
+copies of the standalone psortb v3 program, then collates the output.
+e.g. Rather than this,
+
+psort $type -c $cutoff -d $divergent -o long $sequence > $outfile
+
+Call this:
+
+psort $threads $type $cutoff $divergent $sequence $outfile
+
+If ommitting -c or -d options, set $cutoff and $divergent to zero or blank.
+
+Note that this is somewhat redundant with job-splitting available in Galaxy
+itself (see the SignalP XML file for settings), but both can be applied.
+
+Additionally it ensures the header line (with the column names) starts
+with a # character as used elsewhere in Galaxy.
+"""
+import sys
+import os
+import tempfile
+from seq_analysis_utils import stop_err, split_fasta, run_jobs, thread_count
+
+FASTA_CHUNK = 500
+
+if "-v" in sys.argv or "--version" in sys.argv:
+    """Return underlying PSORTb's version"""
+    sys.exit(os.system("psort --version"))
+
+if len(sys.argv) != 8:
+    stop_err("Require 7 arguments, number of threads (int), type (e.g. archaea), "
+             "output (e.g. terse/normal/long), cutoff, divergent, input protein "
+             "FASTA file & output tabular file")
+
+num_threads = thread_count(sys.argv[1], default=4)
+org_type = sys.argv[2]
+out_type = sys.argv[3]
+cutoff = sys.argv[4]
+if cutoff.strip() and float(cutoff.strip()) != 0.0:
+    cutoff = "-c %s" % cutoff
+else:
+    cutoff = ""
+divergent = sys.argv[5]
+if divergent.strip() and float(divergent.strip()) != 0.0:
+    divergent = "-d %s" % divergent
+else:
+    divergent = ""
+fasta_file = sys.argv[6]
+tabular_file = sys.argv[7]
+
+if out_type == "terse":
+    header = ['SeqID', 'Localization', 'Score']
+elif out_type == "normal":
+    stop_err("Normal output not implemented yet, sorry.")
+elif out_type == "long":
+    if org_type == "-n":
+        #Gram negative bacteria
+        header = ['SeqID', 'CMSVM-_Localization', 'CMSVM-_Details', 'CytoSVM-_Localization', 'CytoSVM-_Details',
+                  'ECSVM-_Localization', 'ECSVM-_Details', 'ModHMM-_Localization', 'ModHMM-_Details',
+                  'Motif-_Localization', 'Motif-_Details', 'OMPMotif-_Localization', 'OMPMotif-_Details',
+                  'OMSVM-_Localization', 'OMSVM-_Details', 'PPSVM-_Localization', 'PPSVM-_Details',
+                  'Profile-_Localization', 'Profile-_Details',
+                  'SCL-BLAST-_Localization', 'SCL-BLAST-_Details', 'SCL-BLASTe-_Localization', 'SCL-BLASTe-_Details',
+                  'Signal-_Localization', 'Signal-_Details',
+                  'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Periplasmic_Score', 'OuterMembrane_Score',
+                  'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score',
+                  'Secondary_Localization', 'PSortb_Version']
+    elif org_type == "-p":
+        #Gram positive bacteria
+        header = ['SeqID', 'CMSVM+_Localization', 'CMSVM+_Details', 'CWSVM+_Localization', 'CWSVM+_Details',
+                  'CytoSVM+_Localization', 'CytoSVM+_Details', 'ECSVM+_Localization', 'ECSVM+_Details',
+                  'ModHMM+_Localization', 'ModHMM+_Details', 'Motif+_Localization', 'Motif+_Details',
+                  'Profile+_Localization', 'Profile+_Details',
+                  'SCL-BLAST+_Localization', 'SCL-BLAST+_Details', 'SCL-BLASTe+_Localization', 'SCL-BLASTe+_Details',
+                  'Signal+_Localization', 'Signal+_Details',
+                  'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Cellwall_Score',
+                  'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score',
+                  'Secondary_Localization', 'PSortb_Version']
+    elif org_type == "-a":
+        #Archaea
+        header = ['SeqID', 'CMSVM_a_Localization', 'CMSVM_a_Details', 'CWSVM_a_Localization', 'CWSVM_a_Details',
+                  'CytoSVM_a_Localization', 'CytoSVM_a_Details', 'ECSVM_a_Localization', 'ECSVM_a_Details',
+                  'ModHMM_a_Localization', 'ModHMM_a_Details', 'Motif_a_Localization', 'Motif_a_Details',
+                  'Profile_a_Localization', 'Profile_a_Details',
+                  'SCL-BLAST_a_Localization', 'SCL-BLAST_a_Details', 'SCL-BLASTe_a_Localization', 'SCL-BLASTe_a_Details',
+                  'Signal_a_Localization', 'Signal_a_Details',
+                  'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Cellwall_Score',
+                  'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score',
+                  'Secondary_Localization', 'PSortb_Version']
+    else:
+        stop_err("Expected -n, -p or -a for the organism type, not %r" % org_type)
+else:
+    stop_err("Expected terse, normal or long for the output type, not %r" % out_type)
+
+tmp_dir = tempfile.mkdtemp()
+
+def clean_tabular(raw_handle, out_handle):
+    """Clean up tabular TMHMM output, returns output line count."""
+    global header
+    count = 0
+    for line in raw_handle:
+        if not line.strip() or line.startswith("#"):
+            #Ignore any blank lines or comment lines
+            continue
+        parts = [x.strip() for x in line.rstrip("\r\n").split("\t")]
+        if parts == header:
+            #Ignore the header line
+            continue
+        if not parts[-1] and len(parts) == len(header) + 1:
+            #Ignore dummy blank extra column, e.g.
+            #"...2.0\t\tPSORTb version 3.0\t\n"
+            parts = parts[:-1]
+        assert len(parts) == len(header), \
+            "%i fields, not %i, in line:\n%r" % (len(line), len(header), line)
+        out_handle.write(line)
+        count += 1
+    return count
+
+#Note that if the input FASTA file contains no sequences,
+#split_fasta returns an empty list (i.e. zero temp files).
+fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK)
+temp_files = [f+".out" for f in fasta_files]
+jobs = ["psort %s %s %s -o %s %s > %s" % (org_type, cutoff, divergent, out_type, fasta, temp)
+        for fasta, temp in zip(fasta_files, temp_files)]
+
+def clean_up(file_list):
+    for f in file_list:
+        if os.path.isfile(f):
+            os.remove(f)
+    try:
+        os.rmdir(tmp_dir)
+    except:
+        pass
+
+if len(jobs) > 1 and num_threads > 1:
+    #A small "info" message for Galaxy to show the user.
+    print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))
+results = run_jobs(jobs, num_threads)
+for fasta, temp, cmd in zip(fasta_files, temp_files, jobs):
+    error_level = results[cmd]
+    if error_level:
+        try:
+            output = open(temp).readline()
+        except IOError:
+            output = ""
+        clean_up(fasta_files + temp_files)
+        stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output),
+                 error_level)
+del results
+del jobs
+
+out_handle = open(tabular_file, "w")
+out_handle.write("#%s\n" % "\t".join(header))
+for temp in temp_files:
+    data_handle = open(temp)
+    count = clean_tabular(data_handle, out_handle)
+    data_handle.close()
+    if not count:
+        clean_up(fasta_files + temp_files)
+        stop_err("No output from psortb")
+out_handle.close()
+print "%i records" % count
+
+clean_up(fasta_files + temp_files)