Mercurial > repos > test-svm > kmersvm_test
comparison kmersvm/scripts/libkmersvm.py @ 5:f99b5099ea55 draft
Uploaded
| author | test-svm |
|---|---|
| date | Sun, 05 Aug 2012 16:50:57 -0400 |
| parents | 66088269713e |
| children |
comparison
equal
deleted
inserted
replaced
| 4:f2130156fd5d | 5:f99b5099ea55 |
|---|---|
| 1 """ | |
| 2 libkmersvm.py; common library for kmersvm_train.py and kmersvm_classify.py | |
| 3 Copyright (C) 2011 Dongwon Lee | |
| 4 | |
| 5 This program is free software: you can redistribute it and/or modify | |
| 6 it under the terms of the GNU General Public License as published by | |
| 7 the Free Software Foundation, either version 3 of the License, or | |
| 8 (at your option) any later version. | |
| 9 | |
| 10 This program is distributed in the hope that it will be useful, | |
| 11 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 13 GNU General Public License for more details. | |
| 14 | |
| 15 You should have received a copy of the GNU General Public License | |
| 16 along with this program. If not, see <http://www.gnu.org/licenses/>. | |
| 17 """ | |
| 18 | |
| 19 import sys | |
| 20 import os | |
| 21 import os.path | |
| 22 import optparse | |
| 23 | |
| 24 from bitarray import bitarray | |
| 25 | |
| 26 def bitarray_fromfile(filename): | |
| 27 """ | |
| 28 """ | |
| 29 fh = open(filename, 'rb') | |
| 30 bits = bitarray() | |
| 31 bits.fromfile(fh) | |
| 32 | |
| 33 return bits, fh | |
| 34 | |
| 35 def generate_kmers(kmerlen): | |
| 36 """make a full list of k-mers | |
| 37 | |
| 38 Arguments: | |
| 39 kmerlen -- integer, length of k-mer | |
| 40 | |
| 41 Return: | |
| 42 a list of the full set of k-mers | |
| 43 """ | |
| 44 | |
| 45 nts = ['A', 'C', 'G', 'T'] | |
| 46 kmers = [] | |
| 47 kmers.append('') | |
| 48 l = 0 | |
| 49 while l < kmerlen: | |
| 50 imers = [] | |
| 51 for imer in kmers: | |
| 52 for nt in nts: | |
| 53 imers.append(imer+nt) | |
| 54 kmers = imers | |
| 55 l += 1 | |
| 56 | |
| 57 return kmers | |
| 58 | |
| 59 | |
| 60 def revcomp(seq): | |
| 61 """get reverse complement DNA sequence | |
| 62 | |
| 63 Arguments: | |
| 64 seq -- string, DNA sequence | |
| 65 | |
| 66 Return: | |
| 67 the reverse complement sequence of the given sequence | |
| 68 """ | |
| 69 rc = {'A':'T', 'G':'C', 'C':'G', 'T':'A'} | |
| 70 return ''.join([rc[seq[i]] for i in xrange(len(seq)-1, -1, -1)]) | |
| 71 | |
| 72 | |
| 73 def generate_rcmap_table(kmerlen, kmers): | |
| 74 """make a lookup table for reverse complement k-mer ids for speed | |
| 75 | |
| 76 Arguments: | |
| 77 kmerlen -- integer, length of k-mer | |
| 78 kmers -- list, a full set of k-mers generated by generate_kmers | |
| 79 | |
| 80 Return: | |
| 81 a dictionary containing the mapping table | |
| 82 """ | |
| 83 revcomp_func = revcomp | |
| 84 | |
| 85 kmer_id_dict = {} | |
| 86 for i in xrange(len(kmers)): | |
| 87 kmer_id_dict[kmers[i]] = i | |
| 88 | |
| 89 revcomp_mapping_table = [] | |
| 90 for kmerid in xrange(len(kmers)): | |
| 91 rc_id = kmer_id_dict[revcomp_func(kmers[kmerid])] | |
| 92 if rc_id < kmerid: | |
| 93 revcomp_mapping_table.append(rc_id) | |
| 94 else: | |
| 95 revcomp_mapping_table.append(kmerid) | |
| 96 | |
| 97 return revcomp_mapping_table | |
| 98 | |
| 99 | |
| 100 def read_fastafile(filename, subs=True): | |
| 101 """Read sequences from a file in FASTA format | |
| 102 | |
| 103 Arguments: | |
| 104 filename -- string, the name of the sequence file in FASTA format | |
| 105 subs -- bool, substitute 'N' with 'A' if set true | |
| 106 | |
| 107 Return: | |
| 108 list of sequences, list of sequence ids | |
| 109 """ | |
| 110 | |
| 111 sids = [] | |
| 112 seqs = [] | |
| 113 | |
| 114 try: | |
| 115 f = open(filename, 'r') | |
| 116 lines = f.readlines() | |
| 117 f.close() | |
| 118 | |
| 119 except IOError, (errno, strerror): | |
| 120 print "I/O error(%d): %s" % (errno, strerror) | |
| 121 sys.exit(0) | |
| 122 | |
| 123 seq = [] | |
| 124 for line in lines: | |
| 125 if line[0] == '>': | |
| 126 sids.append(line[1:].rstrip('\n').split()[0]) | |
| 127 if seq != []: seqs.append("".join(seq)) | |
| 128 seq = [] | |
| 129 else: | |
| 130 if subs: | |
| 131 seq.append(line.rstrip('\n').replace('N', 'A').upper()) | |
| 132 else: | |
| 133 seq.append(line.rstrip('\n').upper()) | |
| 134 | |
| 135 if seq != []: | |
| 136 seqs.append("".join(seq)) | |
| 137 | |
| 138 return seqs, sids |
