|
0
|
1 """
|
|
|
2 libkmersvm.py; common library for kmersvm_train.py and kmersvm_classify.py
|
|
|
3 Copyright (C) 2011 Dongwon Lee
|
|
|
4
|
|
|
5 This program is free software: you can redistribute it and/or modify
|
|
|
6 it under the terms of the GNU General Public License as published by
|
|
|
7 the Free Software Foundation, either version 3 of the License, or
|
|
|
8 (at your option) any later version.
|
|
|
9
|
|
|
10 This program is distributed in the hope that it will be useful,
|
|
|
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
13 GNU General Public License for more details.
|
|
|
14
|
|
|
15 You should have received a copy of the GNU General Public License
|
|
|
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
17 """
|
|
|
18
|
|
|
19 import sys
|
|
|
20 import os
|
|
|
21 import os.path
|
|
|
22 import optparse
|
|
|
23
|
|
|
24 from bitarray import bitarray
|
|
|
25
|
|
|
26 def bitarray_fromfile(filename):
|
|
|
27 """
|
|
|
28 """
|
|
|
29 fh = open(filename, 'rb')
|
|
|
30 bits = bitarray()
|
|
|
31 bits.fromfile(fh)
|
|
|
32
|
|
|
33 return bits, fh
|
|
|
34
|
|
|
35 def generate_kmers(kmerlen):
|
|
|
36 """make a full list of k-mers
|
|
|
37
|
|
|
38 Arguments:
|
|
|
39 kmerlen -- integer, length of k-mer
|
|
|
40
|
|
|
41 Return:
|
|
|
42 a list of the full set of k-mers
|
|
|
43 """
|
|
|
44
|
|
|
45 nts = ['A', 'C', 'G', 'T']
|
|
|
46 kmers = []
|
|
|
47 kmers.append('')
|
|
|
48 l = 0
|
|
|
49 while l < kmerlen:
|
|
|
50 imers = []
|
|
|
51 for imer in kmers:
|
|
|
52 for nt in nts:
|
|
|
53 imers.append(imer+nt)
|
|
|
54 kmers = imers
|
|
|
55 l += 1
|
|
|
56
|
|
|
57 return kmers
|
|
|
58
|
|
|
59
|
|
|
60 def revcomp(seq):
|
|
|
61 """get reverse complement DNA sequence
|
|
|
62
|
|
|
63 Arguments:
|
|
|
64 seq -- string, DNA sequence
|
|
|
65
|
|
|
66 Return:
|
|
|
67 the reverse complement sequence of the given sequence
|
|
|
68 """
|
|
|
69 rc = {'A':'T', 'G':'C', 'C':'G', 'T':'A'}
|
|
|
70 return ''.join([rc[seq[i]] for i in xrange(len(seq)-1, -1, -1)])
|
|
|
71
|
|
|
72
|
|
|
73 def generate_rcmap_table(kmerlen, kmers):
|
|
|
74 """make a lookup table for reverse complement k-mer ids for speed
|
|
|
75
|
|
|
76 Arguments:
|
|
|
77 kmerlen -- integer, length of k-mer
|
|
|
78 kmers -- list, a full set of k-mers generated by generate_kmers
|
|
|
79
|
|
|
80 Return:
|
|
|
81 a dictionary containing the mapping table
|
|
|
82 """
|
|
|
83 revcomp_func = revcomp
|
|
|
84
|
|
|
85 kmer_id_dict = {}
|
|
|
86 for i in xrange(len(kmers)):
|
|
|
87 kmer_id_dict[kmers[i]] = i
|
|
|
88
|
|
|
89 revcomp_mapping_table = []
|
|
|
90 for kmerid in xrange(len(kmers)):
|
|
|
91 rc_id = kmer_id_dict[revcomp_func(kmers[kmerid])]
|
|
|
92 if rc_id < kmerid:
|
|
|
93 revcomp_mapping_table.append(rc_id)
|
|
|
94 else:
|
|
|
95 revcomp_mapping_table.append(kmerid)
|
|
|
96
|
|
|
97 return revcomp_mapping_table
|
|
|
98
|
|
|
99
|
|
|
100 def read_fastafile(filename, subs=True):
|
|
|
101 """Read sequences from a file in FASTA format
|
|
|
102
|
|
|
103 Arguments:
|
|
|
104 filename -- string, the name of the sequence file in FASTA format
|
|
|
105 subs -- bool, substitute 'N' with 'A' if set true
|
|
|
106
|
|
|
107 Return:
|
|
|
108 list of sequences, list of sequence ids
|
|
|
109 """
|
|
|
110
|
|
|
111 sids = []
|
|
|
112 seqs = []
|
|
|
113
|
|
|
114 try:
|
|
|
115 f = open(filename, 'r')
|
|
|
116 lines = f.readlines()
|
|
|
117 f.close()
|
|
|
118
|
|
|
119 except IOError, (errno, strerror):
|
|
|
120 print "I/O error(%d): %s" % (errno, strerror)
|
|
|
121 sys.exit(0)
|
|
|
122
|
|
|
123 seq = []
|
|
|
124 for line in lines:
|
|
|
125 if line[0] == '>':
|
|
|
126 sids.append(line[1:].rstrip('\n').split()[0])
|
|
|
127 if seq != []: seqs.append("".join(seq))
|
|
|
128 seq = []
|
|
|
129 else:
|
|
|
130 if subs:
|
|
|
131 seq.append(line.rstrip('\n').replace('N', 'A').upper())
|
|
|
132 else:
|
|
|
133 seq.append(line.rstrip('\n').upper())
|
|
|
134
|
|
|
135 if seq != []:
|
|
|
136 seqs.append("".join(seq))
|
|
|
137
|
|
|
138 return seqs, sids
|