annotate tools/protein_analysis/seq_analysis_utils.py @ 0:a2eeeaa6f75e

Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
author peterjc
date Tue, 07 Jun 2011 17:37:26 -0400
parents
children fe10f448d641
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
1 """A few useful functions for working with FASTA files and running jobs.
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
2
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
3 This module was originally written to hold common code used in both the TMHMM
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
4 and SignalP wrappers in Galaxy.
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
5
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
6 Given Galaxy currently supports Python 2.4+ this cannot use the Python module
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
7 multiprocessing so the function run_jobs instead is a simple pool approach
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
8 using just the subprocess library.
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
9 """
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
10 import sys
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
11 import os
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
12 import subprocess
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
13 from time import sleep
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
14
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
15 __version__ = "0.0.1"
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
16
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
17 def stop_err(msg, error_level=1):
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
18 """Print error message to stdout and quit with given error level."""
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
19 sys.stderr.write("%s\n" % msg)
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
20 sys.exit(error_level)
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
21
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
22 def fasta_iterator(filename, max_len=None, truncate=None):
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
23 """Simple FASTA parser yielding tuples of (title, sequence) strings."""
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
24 handle = open(filename)
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
25 title, seq = "", ""
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
26 for line in handle:
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
27 if line.startswith(">"):
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
28 if title:
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
29 if truncate:
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
30 seq = seq[:truncate]
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
31 if max_len and len(seq) > max_len:
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
32 raise ValueError("Sequence %s is length %i, max length %i" \
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
33 % (title.split()[0], len(seq), max_len))
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
34 yield title, seq
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
35 title = line[1:].rstrip()
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
36 seq = ""
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
37 elif title:
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
38 seq += line.strip()
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
39 elif not line.strip() or line.startswith("#"):
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
40 #Ignore blank lines, and any comment lines
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
41 #between records (starting with hash).
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
42 pass
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
43 else:
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
44 raise ValueError("Bad FASTA line %r" % line)
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
45 handle.close()
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
46 if title:
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
47 if truncate:
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
48 seq = seq[:truncate]
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
49 if max_len and len(seq) > max_len:
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
50 raise ValueError("Sequence %s is length %i, max length %i" \
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
51 % (title.split()[0], len(seq), max_len))
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
52 yield title, seq
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
53 raise StopIteration
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
54
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
55 def split_fasta(input_filename, output_filename_base, n=500, truncate=None, keep_descr=False, max_len=None):
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
56 """Split FASTA file into sub-files each of at most n sequences.
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
57
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
58 Returns a list of the filenames used (based on the input filename).
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
59 Each sequence can also be truncated (since we only need the start for
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
60 SignalP), and have its description discarded (since we don't usually
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
61 care about it and some tools don't like very long title lines).
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
62
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
63 If a max_len is given and any sequence exceeds it no temp files are
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
64 created and an exception is raised.
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
65 """
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
66 iterator = fasta_iterator(input_filename, max_len, truncate)
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
67 files = []
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
68 try:
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
69 while True:
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
70 records = []
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
71 for i in range(n):
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
72 try:
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
73 records.append(iterator.next())
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
74 except StopIteration:
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
75 break
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
76 if not records:
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
77 break
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
78 new_filename = "%s.%i.tmp" % (output_filename_base, len(files))
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
79 handle = open(new_filename, "w")
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
80 if keep_descr:
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
81 for title, seq in records:
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
82 handle.write(">%s\n" % title)
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
83 for i in range(0, len(seq), 60):
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
84 handle.write(seq[i:i+60] + "\n")
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
85 else:
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
86 for title, seq in records:
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
87 handle.write(">%s\n" % title.split()[0])
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
88 for i in range(0, len(seq), 60):
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
89 handle.write(seq[i:i+60] + "\n")
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
90 handle.close()
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
91 files.append(new_filename)
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
92 #print "%i records in %s" % (len(records), new_filename)
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
93 except ValueError, err:
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
94 #Max length failure from parser - clean up
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
95 try:
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
96 handle.close()
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
97 except:
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
98 pass
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
99 for f in files:
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
100 if os.path.isfile(f):
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
101 os.remove(f)
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
102 raise err
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
103 return files
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
104
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
105 def run_jobs(jobs, threads):
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
106 """Takes list of cmd strings, returns dict with error levels."""
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
107 pending = jobs[:]
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
108 running = []
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
109 results = {}
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
110 while pending or running:
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
111 #print "%i jobs pending, %i running, %i completed" \
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
112 # % (len(jobs), len(running), len(results))
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
113 #See if any have finished
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
114 for (cmd, process) in running:
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
115 return_code = process.wait()
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
116 if return_code is not None:
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
117 results[cmd] = return_code
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
118 running = [(cmd, process) for (cmd, process) in running \
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
119 if cmd not in results]
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
120 #See if we can start any new threads
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
121 while pending and len(running) < threads:
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
122 cmd = pending.pop(0)
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
123 process = subprocess.Popen(cmd, shell=True)
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
124 running.append((cmd, process))
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
125 #Loop...
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
126 sleep(1)
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
127 #print "%i jobs completed" % len(results)
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
128 assert set(jobs) == set(results)
a2eeeaa6f75e Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff changeset
129 return results