diff kmersvm/scripts/nullseq_build_indices.py @ 0:66088269713e draft

Uploaded all files tracked by git
author test-svm
date Sun, 05 Aug 2012 15:32:16 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kmersvm/scripts/nullseq_build_indices.py	Sun Aug 05 15:32:16 2012 -0400
@@ -0,0 +1,129 @@
+import os
+import sys
+import tarfile
+import zipfile
+import optparse
+
+from bitarray import bitarray
+
+def clear_indexes(sid, buildname):
+	na = '.'.join([buildname, sid, "na", "out"])
+	gc = '.'.join([buildname, sid, "gc", "out"])
+	rpt = '.'.join([buildname, sid, "rpt", "out"])
+
+	#truncate files
+	for fn in (na, gc, rpt):
+		f = open(fn, 'wb')
+		f.close()
+
+
+def append_indexes(seq, sid, buildname):
+	na = '.'.join([buildname, sid, "na", "out"])
+	gc = '.'.join([buildname, sid, "gc", "out"])
+	rpt = '.'.join([buildname, sid, "rpt", "out"])
+
+	f = open(na, 'ab')
+	bitarray(map(lambda c: c in 'N', seq)).tofile(f)
+	f.close()
+
+	f = open(gc, 'ab')
+	bitarray(map(lambda c: c in 'cgCG', seq)).tofile(f)
+	f.close()
+
+	f = open(rpt, 'ab')
+	bitarray(map(lambda c: c in 'acgt', seq)).tofile(f)
+	f.close()
+
+
+def build_indexes(fn, buildname):
+	save_interval = 8*32*1024
+
+	try:
+		f = open(fn, 'r')
+		
+		seq = [] 
+		sid = ''
+		nlines = 0
+		for line in f:
+			if line[0] == '>':
+				if sid: 
+					append_indexes("".join(seq), sid, buildname)
+					seq = []
+
+				sid = line[1:].rstrip('\n').split()[0]
+				clear_indexes(sid, buildname)
+			else:
+				nlines += 1
+				seq.append(line.rstrip('\n'))
+
+				if nlines % save_interval == 0:
+					append_indexes("".join(seq), sid, buildname)
+					seq = []
+
+		#the last remaining sequence 
+		append_indexes("".join(seq), sid, buildname)
+
+	except IOError, (errno, strerror):
+		print "I/O error(%d): %s" % (errno, strerror)
+		sys.exit(0)
+
+
+def main(argv=sys.argv):
+	usage = "usage: %prog [options] <Chromosome File(TARBALL gzip (tar.gz) or zip)> <Genome Build Name>"
+	desc  = "generate bit index files for generating null sequences"
+
+	parser = optparse.OptionParser(usage=usage, description=desc)
+
+	parser.add_option("-q", dest="quiet", default=False, action="store_true", \
+  			help="supress messages (default=false)")
+
+	(options, args) = parser.parse_args()
+
+	if len(args) == 0:
+		parser.print_help()
+		sys.exit(0)
+
+	if len(args) != 2:
+		parser.error("incorrect number of arguments")
+		parser.print_help()
+		sys.exit(0)
+
+	chrom_file = args[0]
+	genome = args[1]
+
+	if zipfile.is_zipfile(chrom_file):
+		if options.quiet == False:
+			sys.stderr.write("detected file type is zip.\n")
+
+		zipfileobj = zipfile.ZipFile(chrom_file)
+
+		for fn in zipfileobj.namelist():
+			if options.quiet == False:
+				sys.stderr.write(' '.join(["processing", fn, "\n"]))
+
+			zipfileobj.extract(fn)
+			build_indexes(fn, genome)
+			os.remove(fn)
+
+		zipfileobj.close()
+
+	elif tarfile.is_tarfile(chrom_file):
+		if options.quiet == False:
+			sys.stderr.write("detected file type is tar.\n")
+
+		tarfileobj = tarfile.open(chrom_file)
+
+		for fn in tarfileobj.getnames():
+			if options.quiet == False:
+				sys.stderr.write(' '.join(["processing", fn, "\n"]))
+
+			tarfileobj.extract(fn)
+			build_indexes(fn, genome)
+			os.remove(fn)
+
+		tarfileobj.close()
+
+	else:
+		sys.stderr.write(' '.join(["unknown input file:", fn, "\n"]))
+
+if __name__ == "__main__": main()