diff kmersvm/scripts/libkmersvm.py @ 5:f99b5099ea55 draft

Uploaded
author test-svm
date Sun, 05 Aug 2012 16:50:57 -0400
parents 66088269713e
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kmersvm/scripts/libkmersvm.py	Sun Aug 05 16:50:57 2012 -0400
@@ -0,0 +1,138 @@
+"""
+	libkmersvm.py; common library for kmersvm_train.py and kmersvm_classify.py
+	Copyright (C) 2011 Dongwon Lee
+
+	This program is free software: you can redistribute it and/or modify
+	it under the terms of the GNU General Public License as published by
+	the Free Software Foundation, either version 3 of the License, or
+	(at your option) any later version.
+
+	This program is distributed in the hope that it will be useful,
+	but WITHOUT ANY WARRANTY; without even the implied warranty of
+	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+	GNU General Public License for more details.
+
+	You should have received a copy of the GNU General Public License
+	along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import sys
+import os
+import os.path
+import optparse
+
+from bitarray import bitarray
+
+def bitarray_fromfile(filename):
+	"""
+	"""
+	fh = open(filename, 'rb')
+	bits = bitarray()
+	bits.fromfile(fh)
+
+	return bits, fh
+
+def generate_kmers(kmerlen):
+	"""make a full list of k-mers 
+
+	Arguments:
+	kmerlen -- integer, length of k-mer
+
+	Return:
+	a list of the full set of k-mers
+	"""
+
+	nts = ['A', 'C', 'G', 'T']
+	kmers = [] 
+	kmers.append('')
+	l = 0
+	while l < kmerlen: 
+		imers = [] 
+		for imer in kmers:
+			for nt in nts: 
+				imers.append(imer+nt)
+		kmers = imers
+		l += 1 
+	
+	return kmers
+
+
+def revcomp(seq):
+	"""get reverse complement DNA sequence
+
+	Arguments:
+	seq -- string, DNA sequence
+
+	Return:
+	the reverse complement sequence of the given sequence
+	"""
+	rc = {'A':'T', 'G':'C', 'C':'G', 'T':'A'}
+	return ''.join([rc[seq[i]] for i in xrange(len(seq)-1, -1, -1)])
+
+
+def generate_rcmap_table(kmerlen, kmers):
+	"""make a lookup table for reverse complement k-mer ids for speed 
+
+	Arguments:
+	kmerlen -- integer, length of k-mer
+	kmers -- list, a full set of k-mers generated by generate_kmers
+
+	Return:
+	a dictionary containing the mapping table
+	"""
+	revcomp_func = revcomp
+
+	kmer_id_dict = {}
+	for i in xrange(len(kmers)): 
+		kmer_id_dict[kmers[i]] = i
+
+	revcomp_mapping_table = []
+	for kmerid in xrange(len(kmers)): 
+		rc_id = kmer_id_dict[revcomp_func(kmers[kmerid])]
+		if rc_id < kmerid:
+			revcomp_mapping_table.append(rc_id)
+		else:
+			revcomp_mapping_table.append(kmerid)
+	
+	return revcomp_mapping_table
+
+
+def read_fastafile(filename, subs=True):
+	"""Read sequences from a file in FASTA format
+
+	Arguments:
+	filename -- string, the name of the sequence file in FASTA format
+	subs -- bool, substitute 'N' with 'A' if set true
+
+	Return: 
+	list of sequences, list of sequence ids
+	"""
+
+	sids = []
+	seqs = []
+
+	try:
+		f = open(filename, 'r')
+		lines = f.readlines()
+		f.close()
+
+	except IOError, (errno, strerror):
+		print "I/O error(%d): %s" % (errno, strerror)
+		sys.exit(0)
+
+	seq = [] 
+	for line in lines:
+		if line[0] == '>':
+			sids.append(line[1:].rstrip('\n').split()[0])
+			if seq != []: seqs.append("".join(seq))
+			seq = []
+		else:
+			if subs:
+				seq.append(line.rstrip('\n').replace('N', 'A').upper())
+			else:
+				seq.append(line.rstrip('\n').upper())
+
+	if seq != []:
+		seqs.append("".join(seq))
+
+	return seqs, sids