Mercurial > repos > peterjc > tmhmm_and_signalp
diff tools/protein_analysis/rxlr_motifs.py @ 26:20139cb4c844 draft
planemo upload for repository https://github.com/peterjc/pico_galaxy/tools/protein_analysis commit 221d4187992cbb993e02dc3ea0ef0150c7916a4a-dirty
author | peterjc |
---|---|
date | Wed, 13 May 2015 06:14:42 -0400 |
parents | 2b35b5c4b7f4 |
children | 3cb02adf4326 |
line wrap: on
line diff
--- a/tools/protein_analysis/rxlr_motifs.py Fri Nov 21 08:17:36 2014 -0500 +++ b/tools/protein_analysis/rxlr_motifs.py Wed May 13 06:14:42 2015 -0400 @@ -40,10 +40,14 @@ import sys import re import subprocess -from seq_analysis_utils import stop_err, fasta_iterator +from seq_analysis_utils import sys_exit, fasta_iterator + +if "-v" in sys.argv: + print("RXLR Motifs v0.0.10") + sys.exit(0) if len(sys.argv) != 5: - stop_err("Requires four arguments: protein FASTA filename, threads, model, and output filename") + sys_exit("Requires four arguments: protein FASTA filename, threads, model, and output filename") fasta_file, threads, model, tabular_file = sys.argv[1:] hmm_output_file = tabular_file + ".hmm.tmp" @@ -53,36 +57,36 @@ hmmer_search = "hmmsearch2" if model == "Bhattacharjee2006": - signalp_trunc = 70 - re_rxlr = re.compile("R.LR") - min_sp = 10 - max_sp = 40 - max_sp_rxlr = 100 - min_rxlr_start = 1 - #Allow signal peptide to be at most 40aa, and want RXLR to be - #within 100aa, therefore for the prescreen the max start is 140: - max_rxlr_start = max_sp + max_sp_rxlr + signalp_trunc = 70 + re_rxlr = re.compile("R.LR") + min_sp = 10 + max_sp = 40 + max_sp_rxlr = 100 + min_rxlr_start = 1 + # Allow signal peptide to be at most 40aa, and want RXLR to be + # within 100aa, therefore for the prescreen the max start is 140: + max_rxlr_start = max_sp + max_sp_rxlr elif model == "Win2007": - signalp_trunc = 70 - re_rxlr = re.compile("R.LR") - min_sp = 10 - max_sp = 40 - min_rxlr_start = 30 - max_rxlr_start = 60 - #No explicit limit on separation of signal peptide clevage - #and RXLR, but shortest signal peptide is 10, and furthest - #away RXLR is 60, so effectively limit is 50. - max_sp_rxlr = max_rxlr_start - min_sp + 1 + signalp_trunc = 70 + re_rxlr = re.compile("R.LR") + min_sp = 10 + max_sp = 40 + min_rxlr_start = 30 + max_rxlr_start = 60 + # No explicit limit on separation of signal peptide clevage + # and RXLR, but shortest signal peptide is 10, and furthest + # away RXLR is 60, so effectively limit is 50. + max_sp_rxlr = max_rxlr_start - min_sp + 1 elif model == "Whisson2007": - signalp_trunc = 0 #zero for no truncation - re_rxlr = re.compile("R.LR.{,40}[ED][ED][KR]") - min_sp = 10 - max_sp = 40 - max_sp_rxlr = 100 - min_rxlr_start = 1 - max_rxlr_start = max_sp + max_sp_rxlr + signalp_trunc = 0 # zero for no truncation + re_rxlr = re.compile("R.LR.{,40}[ED][ED][KR]") + min_sp = 10 + max_sp = 40 + max_sp_rxlr = 100 + min_rxlr_start = 1 + max_rxlr_start = max_sp + max_sp_rxlr else: - stop_err("Did not recognise the model name %r\n" + sys_exit("Did not recognise the model name %r\n" "Use Bhattacharjee2006, Win2007, or Whisson2007" % model) @@ -108,49 +112,49 @@ hmm_file = os.path.join(os.path.split(sys.argv[0])[0], "whisson_et_al_rxlr_eer_cropped.hmm") if not os.path.isfile(hmm_file): - stop_err("Missing HMM file for Whisson et al. (2007)") + sys_exit("Missing HMM file for Whisson et al. (2007)") if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"): - stop_err("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_searcher) + sys_exit("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_search) hmm_hits = set() valid_ids = set() for title, seq in fasta_iterator(fasta_file): name = title.split(None,1)[0] if name in valid_ids: - stop_err("Duplicated identifier %r" % name) + sys_exit("Duplicated identifier %r" % name) else: valid_ids.add(name) if not valid_ids: - #Special case, don't need to run HMMER if there are no sequences + # Special case, don't need to run HMMER if there are no sequences pass else: - #I've left the code to handle HMMER 3 in situ, in case - #we revisit the choice to insist on HMMER 2. + # I've left the code to handle HMMER 3 in situ, in case + # we revisit the choice to insist on HMMER 2. hmmer3 = (3 == get_hmmer_version(hmmer_search)) - #Using zero (or 5.6?) for bitscore threshold + # Using zero (or 5.6?) for bitscore threshold if hmmer3: - #The HMMER3 table output is easy to parse - #In HMMER3 can't use both -T and -E + # The HMMER3 table output is easy to parse + # In HMMER3 can't use both -T and -E cmd = "%s -T 0 --tblout %s --noali %s %s > /dev/null" \ % (hmmer_search, hmm_output_file, hmm_file, fasta_file) else: - #For HMMER2 we are stuck with parsing stdout - #Put 1e6 to effectively have no expectation threshold (otherwise - #HMMER defaults to 10 and the calculated e-value depends on the - #input FASTA file, and we can loose hits of interest). + # For HMMER2 we are stuck with parsing stdout + # Put 1e6 to effectively have no expectation threshold (otherwise + # HMMER defaults to 10 and the calculated e-value depends on the + # input FASTA file, and we can loose hits of interest). cmd = "%s -T 0 -E 1e6 %s %s > %s" \ % (hmmer_search, hmm_file, fasta_file, hmm_output_file) return_code = os.system(cmd) if return_code: - stop_err("Error %i from hmmsearch:\n%s" % (return_code, cmd), return_code) + sys_exit("Error %i from hmmsearch:\n%s" % (return_code, cmd), return_code) handle = open(hmm_output_file) for line in handle: if not line.strip(): - #We expect blank lines in the HMMER2 stdout + # We expect blank lines in the HMMER2 stdout continue elif line.startswith("#"): - #Header + # Header continue else: name = line.split(None,1)[0] @@ -159,18 +163,18 @@ if name in valid_ids: hmm_hits.add(name) elif hmmer3: - stop_err("Unexpected identifer %r in hmmsearch output" % name) + sys_exit("Unexpected identifer %r in hmmsearch output" % name) handle.close() - #if hmmer3: - # print "HMMER3 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) - #else: - # print "HMMER2 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) - #print "%i/%i matched HMM" % (len(hmm_hits), len(valid_ids)) + # if hmmer3: + # print "HMMER3 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) + # else: + # print "HMMER2 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) + # print "%i/%i matched HMM" % (len(hmm_hits), len(valid_ids)) os.remove(hmm_output_file) del valid_ids -#Prepare short list of candidates containing RXLR to pass to SignalP +# Prepare short list of candidates containing RXLR to pass to SignalP assert min_rxlr_start > 0, "Min value one, since zero based counting" count = 0 total = 0 @@ -180,27 +184,28 @@ name = title.split(None,1)[0] match = re_rxlr.search(seq[min_rxlr_start-1:].upper()) if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start: - #This is a potential RXLR, depending on the SignalP results. - #Might as well truncate the sequence now, makes the temp file smaller + # This is a potential RXLR, depending on the SignalP results. + # Might as well truncate the sequence now, makes the temp file smaller if signalp_trunc: handle.write(">%s (truncated)\n%s\n" % (name, seq[:signalp_trunc])) else: - #Does it matter we don't line wrap? + # Does it matter we don't line wrap? handle.write(">%s\n%s\n" % (name, seq)) count += 1 handle.close() -#print "Running SignalP on %i/%i potentials." % (count, total) +# print "Running SignalP on %i/%i potentials." % (count, total) -#Run SignalP (using our wrapper script to get multi-core support etc) +# Run SignalP (using our wrapper script to get multi-core support etc) signalp_script = os.path.join(os.path.split(sys.argv[0])[0], "signalp3.py") if not os.path.isfile(signalp_script): - stop_err("Error - missing signalp3.py script") + sys_exit("Error - missing signalp3.py script") cmd = "python %s euk %i %s %s %s" % (signalp_script, signalp_trunc, threads, signalp_input_file, signalp_output_file) return_code = os.system(cmd) if return_code: - stop_err("Error %i from SignalP:\n%s" % (return_code, cmd)) -#print "SignalP done" + sys_exit("Error %i from SignalP:\n%s" % (return_code, cmd)) +# print "SignalP done" + def parse_signalp(filename): """Parse SignalP output, yield tuples of ID, HMM_Sprob_score and NN predicted signal peptide length. @@ -217,7 +222,7 @@ handle.close() -#Parse SignalP results and apply the strict RXLR criteria +# Parse SignalP results and apply the strict RXLR criteria total = 0 tally = dict() handle = open(tabular_file, "w") @@ -230,26 +235,26 @@ match = re_rxlr.search(seq[min_rxlr_start-1:].upper()) if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start: del match - #This was the criteria for calling SignalP, + # This was the criteria for calling SignalP, #so it will be in the SignalP results. sp_id, sp_hmm_score, sp_nn_len = signalp_results.next() assert name == sp_id, "%s vs %s" % (name, sp_id) if sp_hmm_score >= min_signalp_hmm and min_sp <= sp_nn_len <= max_sp: match = re_rxlr.search(seq[sp_nn_len:].upper()) - if match and match.start() + 1 <= max_sp_rxlr: #1-based counting + if match and match.start() + 1 <= max_sp_rxlr: # 1-based counting rxlr_start = sp_nn_len + match.start() + 1 if min_rxlr_start <= rxlr_start <= max_rxlr_start: rxlr = "Y" if model == "Whisson2007": - #Combine the signalp with regular expression heuristic and the HMM + # Combine the signalp with regular expression heuristic and the HMM if name in hmm_hits and rxlr == "N": - rxlr = "hmm" #HMM only + rxlr = "hmm" # HMM only elif rxlr == "N": - rxlr = "neither" #Don't use N (no) + rxlr = "neither" # Don't use N (no) elif name not in hmm_hits and rxlr == "Y": - rxlr = "re" #Heuristic only - #Now have a four way classifier: Y, hmm, re, neither - #and count is the number of Y results (both HMM and heuristic) + rxlr = "re" # Heuristic only + # Now have a four way classifier: Y, hmm, re, neither + # and count is the number of Y results (both HMM and heuristic) handle.write("%s\t%s\n" % (name, rxlr)) try: tally[rxlr] += 1 @@ -258,17 +263,17 @@ handle.close() assert sum(tally.values()) == total -#Check the iterator is finished +# Check the iterator is finished try: signalp_results.next() assert False, "Unexpected data in SignalP output" except StopIteration: pass -#Cleanup +# Cleanup os.remove(signalp_input_file) os.remove(signalp_output_file) -#Short summary to stdout for Galaxy's info display +# Short summary to stdout for Galaxy's info display print "%s for %i sequences:" % (model, total) print ", ".join("%s = %i" % kv for kv in sorted(tally.iteritems()))