annotate kmersvm/scripts/libkmersvm.py @ 2:e8dcc2ed0f9f draft

Included bugfix, README
author test-svm
date Sun, 05 Aug 2012 16:32:03 -0400
parents 66088269713e
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
1 """
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
2 libkmersvm.py; common library for kmersvm_train.py and kmersvm_classify.py
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
3 Copyright (C) 2011 Dongwon Lee
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
4
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
5 This program is free software: you can redistribute it and/or modify
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
6 it under the terms of the GNU General Public License as published by
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
7 the Free Software Foundation, either version 3 of the License, or
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
8 (at your option) any later version.
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
9
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
10 This program is distributed in the hope that it will be useful,
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
13 GNU General Public License for more details.
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
14
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
15 You should have received a copy of the GNU General Public License
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
17 """
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
18
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
19 import sys
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
20 import os
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
21 import os.path
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
22 import optparse
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
23
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
24 from bitarray import bitarray
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
25
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
def bitarray_fromfile(filename):
    """load the contents of a binary file into a bitarray

    Arguments:
    filename -- string, the name of the file to read

    Return:
    the bitarray filled from the file, the file object it was read from
    (the file object is returned still open; this function never closes it)
    """
    handle = open(filename, 'rb')

    loaded = bitarray()
    loaded.fromfile(handle)

    return loaded, handle
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
34
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
def generate_kmers(kmerlen):
    """build every k-mer over the alphabet A, C, G, T

    Arguments:
    kmerlen -- integer, length of k-mer

    Return:
    a list holding all k-mers of the requested length; the last
    nucleotide varies fastest, in the order A, C, G, T
    """
    alphabet = ['A', 'C', 'G', 'T']

    # Begin with the single empty prefix and grow every prefix by one
    # nucleotide per round until the requested length is reached.
    prefixes = ['']
    remaining = kmerlen
    while remaining > 0:
        prefixes = [prefix + base for prefix in prefixes for base in alphabet]
        remaining -= 1

    return prefixes
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
58
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
59
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
def revcomp(seq):
    """get reverse complement DNA sequence

    Arguments:
    seq -- string, DNA sequence (upper-case A, C, G, T only)

    Return:
    the reverse complement sequence of the given sequence

    Raises:
    KeyError if seq contains any character other than A, C, G or T
    """
    rc = {'A': 'T', 'G': 'C', 'C': 'G', 'T': 'A'}
    # reversed() walks the sequence back to front without index arithmetic
    # and, unlike xrange, is available on both Python 2 and Python 3.
    return ''.join([rc[nt] for nt in reversed(seq)])
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
71
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
72
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
def generate_rcmap_table(kmerlen, kmers):
    """make a lookup table for reverse complement k-mer ids for speed

    Arguments:
    kmerlen -- integer, length of k-mer (not used by this function)
    kmers -- list, a full set of k-mers generated by generate_kmers

    Return:
    a list indexed by k-mer id; each entry is the smaller of the k-mer's
    own id and the id of its reverse complement

    Raises:
    KeyError if the reverse complement of a k-mer is not present in kmers
    """
    # local alias avoids a global lookup on every iteration of the loop below
    revcomp_func = revcomp

    # k-mer string -> position in kmers
    kmer_id_dict = {}
    for kmerid, kmer in enumerate(kmers):
        kmer_id_dict[kmer] = kmerid

    revcomp_mapping_table = []
    for kmerid, kmer in enumerate(kmers):
        rc_id = kmer_id_dict[revcomp_func(kmer)]
        # a k-mer and its reverse complement both map to the lower id
        revcomp_mapping_table.append(min(rc_id, kmerid))

    return revcomp_mapping_table
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
98
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
99
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
def read_fastafile(filename, subs=True):
    """Read sequences from a file in FASTA format

    Sequences are converted to upper case. Every header line ('>')
    produces exactly one entry in both returned lists, so a record with
    no sequence lines yields an empty string rather than shifting later
    sequences onto the wrong id.

    Arguments:
    filename -- string, the name of the sequence file in FASTA format
    subs -- bool, substitute 'N' (in either case) with 'A' if set true

    Return:
    list of sequences, list of sequence ids
    (the id is the first whitespace-delimited token after '>', or an
    empty string when the header line has no text)

    On an I/O error the message is printed and the process exits.
    """
    try:
        # the with-block closes the file even if reading fails part-way
        with open(filename, 'r') as f:
            lines = f.readlines()
    except IOError as err:
        print("I/O error(%d): %s" % (err.errno, err.strerror))
        # NOTE(review): exit status 0 on failure is kept from the original
        # so existing callers see no change; a non-zero status would be
        # more conventional -- confirm with callers before changing.
        sys.exit(0)

    sids = []
    seqs = []

    seq = []
    in_record = False
    for line in lines:
        if line.startswith('>'):
            # Flush the previous record even when it had no sequence lines,
            # keeping seqs and sids aligned. Sequence text found before the
            # first header is still emitted (without an id), as before.
            if in_record or seq:
                seqs.append("".join(seq))
            fields = line[1:].split()
            sids.append(fields[0] if fields else '')
            seq = []
            in_record = True
        else:
            # strip both LF and CR so Windows line endings do not leak in
            chunk = line.rstrip('\r\n').upper()
            if not chunk:
                continue
            if subs:
                # substitute after upper-casing so lower-case 'n' is caught
                chunk = chunk.replace('N', 'A')
            seq.append(chunk)

    if in_record or seq:
        seqs.append("".join(seq))

    return seqs, sids