Mercurial > repos > peterjc > tmhmm_and_signalp
annotate tools/protein_analysis/seq_analysis_utils.py @ 1:9a8a7f680dd6
Migrated tool version 0.0.3 from old tool shed archive to new tool shed repository
author | peterjc |
---|---|
date | Tue, 07 Jun 2011 17:38:05 -0400 |
parents | a2eeeaa6f75e |
children | fe10f448d641 |
rev | line source |
---|---|
0
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
1 """A few useful functions for working with FASTA files and running jobs. |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
2 |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
3 This module was originally written to hold common code used in both the TMHMM |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
4 and SignalP wrappers in Galaxy. |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
5 |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
6 Given Galaxy currently supports Python 2.4+ this cannot use the Python module |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
7 multiprocessing so the function run_jobs instead is a simple pool approach |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
8 using just the subprocess library. |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
9 """ |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
10 import sys |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
11 import os |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
12 import subprocess |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
13 from time import sleep |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
14 |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
15 __version__ = "0.0.1" |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
16 |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
17 def stop_err(msg, error_level=1): |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
18 """Print error message to stdout and quit with given error level.""" |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
19 sys.stderr.write("%s\n" % msg) |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
20 sys.exit(error_level) |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
21 |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
22 def fasta_iterator(filename, max_len=None, truncate=None): |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
23 """Simple FASTA parser yielding tuples of (title, sequence) strings.""" |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
24 handle = open(filename) |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
25 title, seq = "", "" |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
26 for line in handle: |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
27 if line.startswith(">"): |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
28 if title: |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
29 if truncate: |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
30 seq = seq[:truncate] |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
31 if max_len and len(seq) > max_len: |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
32 raise ValueError("Sequence %s is length %i, max length %i" \ |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
33 % (title.split()[0], len(seq), max_len)) |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
34 yield title, seq |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
35 title = line[1:].rstrip() |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
36 seq = "" |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
37 elif title: |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
38 seq += line.strip() |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
39 elif not line.strip() or line.startswith("#"): |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
40 #Ignore blank lines, and any comment lines |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
41 #between records (starting with hash). |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
42 pass |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
43 else: |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
44 raise ValueError("Bad FASTA line %r" % line) |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
45 handle.close() |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
46 if title: |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
47 if truncate: |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
48 seq = seq[:truncate] |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
49 if max_len and len(seq) > max_len: |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
50 raise ValueError("Sequence %s is length %i, max length %i" \ |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
51 % (title.split()[0], len(seq), max_len)) |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
52 yield title, seq |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
53 raise StopIteration |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
54 |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
55 def split_fasta(input_filename, output_filename_base, n=500, truncate=None, keep_descr=False, max_len=None): |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
56 """Split FASTA file into sub-files each of at most n sequences. |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
57 |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
58 Returns a list of the filenames used (based on the input filename). |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
59 Each sequence can also be truncated (since we only need the start for |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
60 SignalP), and have its description discarded (since we don't usually |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
61 care about it and some tools don't like very long title lines). |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
62 |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
63 If a max_len is given and any sequence exceeds it no temp files are |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
64 created and an exception is raised. |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
65 """ |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
66 iterator = fasta_iterator(input_filename, max_len, truncate) |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
67 files = [] |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
68 try: |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
69 while True: |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
70 records = [] |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
71 for i in range(n): |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
72 try: |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
73 records.append(iterator.next()) |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
74 except StopIteration: |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
75 break |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
76 if not records: |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
77 break |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
78 new_filename = "%s.%i.tmp" % (output_filename_base, len(files)) |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
79 handle = open(new_filename, "w") |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
80 if keep_descr: |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
81 for title, seq in records: |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
82 handle.write(">%s\n" % title) |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
83 for i in range(0, len(seq), 60): |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
84 handle.write(seq[i:i+60] + "\n") |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
85 else: |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
86 for title, seq in records: |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
87 handle.write(">%s\n" % title.split()[0]) |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
88 for i in range(0, len(seq), 60): |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
89 handle.write(seq[i:i+60] + "\n") |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
90 handle.close() |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
91 files.append(new_filename) |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
92 #print "%i records in %s" % (len(records), new_filename) |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
93 except ValueError, err: |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
94 #Max length failure from parser - clean up |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
95 try: |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
96 handle.close() |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
97 except: |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
98 pass |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
99 for f in files: |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
100 if os.path.isfile(f): |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
101 os.remove(f) |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
102 raise err |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
103 return files |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
104 |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
105 def run_jobs(jobs, threads): |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
106 """Takes list of cmd strings, returns dict with error levels.""" |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
107 pending = jobs[:] |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
108 running = [] |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
109 results = {} |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
110 while pending or running: |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
111 #print "%i jobs pending, %i running, %i completed" \ |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
112 # % (len(jobs), len(running), len(results)) |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
113 #See if any have finished |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
114 for (cmd, process) in running: |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
115 return_code = process.wait() |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
116 if return_code is not None: |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
117 results[cmd] = return_code |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
118 running = [(cmd, process) for (cmd, process) in running \ |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
119 if cmd not in results] |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
120 #See if we can start any new threads |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
121 while pending and len(running) < threads: |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
122 cmd = pending.pop(0) |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
123 process = subprocess.Popen(cmd, shell=True) |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
124 running.append((cmd, process)) |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
125 #Loop... |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
126 sleep(1) |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
127 #print "%i jobs completed" % len(results) |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
128 assert set(jobs) == set(results) |
a2eeeaa6f75e
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
129 return results |