comparison kmersvm/scripts/nullseq_build_indices.py @ 0:66088269713e draft

Uploaded all files tracked by git
author test-svm
date Sun, 05 Aug 2012 15:32:16 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:66088269713e
1 import os
2 import sys
3 import tarfile
4 import zipfile
5 import optparse
6
7 from bitarray import bitarray
8
def clear_indexes(sid, buildname):
    """Create (or empty) the three index files of one sequence.

    The files are named ``<buildname>.<sid>.<kind>.out`` where ``<kind>`` is
    ``na``, ``gc`` and ``rpt``, in that order.

    sid       -- sequence identifier used as the second filename component
    buildname -- genome build name used as the filename prefix
    """
    for kind in ("na", "gc", "rpt"):
        path = '.'.join([buildname, sid, kind, "out"])
        # Opening in 'wb' mode truncates an existing file, or creates an
        # empty one; nothing has to be written.
        with open(path, 'wb'):
            pass
18
19
def append_indexes(seq, sid, buildname):
    """Append the bit indexes of *seq* to the index files of sequence *sid*.

    One bit per character of *seq* is appended to each of the files
    ``<buildname>.<sid>.<kind>.out``:

    na  -- bit set where the character is 'N'
    gc  -- bit set where the character is one of 'c', 'g', 'C', 'G'
    rpt -- bit set where the character is a lower-case 'a', 'c', 'g' or 't'
           (presumably soft-masked repeats -- confirm against the consumer)

    NOTE(review): per the bitarray documentation, tofile() writes whole
    bytes, so a chunk whose length is not a multiple of 8 is padded; callers
    should only pass such a chunk as the last one of a sequence.
    """
    # (file kind, characters whose positions are flagged), written in the
    # same order as the files are named above.
    index_alphabets = (
        ("na", 'N'),
        ("gc", 'cgCG'),
        ("rpt", 'acgt'),
    )

    for kind, alphabet in index_alphabets:
        path = '.'.join([buildname, sid, kind, "out"])
        flags = bitarray([base in alphabet for base in seq])
        with open(path, 'ab') as out:
            flags.tofile(out)
36
37
def build_indexes(fn, buildname):
    """Build the bit index files for every sequence of a FASTA file.

    For each ``>`` header the first whitespace-delimited word is taken as the
    sequence id; its index files are reset with clear_indexes() and the
    sequence lines that follow are passed on to append_indexes() in chunks.

    fn        -- path of the FASTA file to read
    buildname -- genome build name, used as prefix of the index file names

    Lines that precede the first header, or that follow a header without a
    name, are ignored because there is no sequence id to file them under.

    On an I/O error a message is written to stderr and the process exits
    with status 1 (SystemExit).
    """
    # Number of sequence lines buffered before they are flushed to disk, so
    # that a whole chromosome never has to be held in memory as a list.
    save_interval = 8 * 32 * 1024

    try:
        # 'with' guarantees the input file is closed, also on errors.
        with open(fn, 'r') as f:
            seq = []
            sid = ''
            nlines = 0
            for line in f:
                if line.startswith('>'):
                    # Finish the previous sequence before starting a new one.
                    if sid:
                        append_indexes("".join(seq), sid, buildname)
                    seq = []
                    # Count lines per sequence, not per file.
                    nlines = 0

                    fields = line[1:].split()
                    sid = fields[0] if fields else ''
                    if sid:
                        clear_indexes(sid, buildname)
                elif sid:
                    nlines += 1
                    # Strip '\r' as well so CRLF files do not add a bogus
                    # position per line.
                    seq.append(line.rstrip('\r\n'))

                    if nlines % save_interval == 0:
                        # Index files are written in whole bytes, so a
                        # mid-sequence flush may only write a multiple of
                        # 8 positions; the remainder is carried over to the
                        # next flush to keep bit offsets exact.
                        buffered = "".join(seq)
                        aligned = len(buffered) - len(buffered) % 8
                        append_indexes(buffered[:aligned], sid, buildname)
                        seq = [buffered[aligned:]]

            # The last remaining sequence (or its tail).
            if sid:
                append_indexes("".join(seq), sid, buildname)

    except IOError as err:
        sys.stderr.write("I/O error(%s): %s\n" % (err.errno, err.strerror))
        # Non-zero status so callers can detect the failure.
        sys.exit(1)
69
70
def _is_safe_member(name):
    """Return True if archive member *name* stays below the working directory."""
    norm = os.path.normpath(name)
    if os.path.isabs(norm):
        return False
    return norm != os.pardir and not norm.startswith(os.pardir + os.sep)


def main(argv=sys.argv):
    """Command-line entry point: build index files from a chromosome archive.

    argv -- full argument vector, program name first (defaults to sys.argv).
            Expected positional arguments are the chromosome archive
            (zip or tar, optionally compressed) and the genome build name;
            ``-q`` suppresses progress messages.

    Every regular file of the archive is extracted into the current working
    directory, handed to build_indexes() and removed again.  Directory
    entries, special files and members whose path would leave the working
    directory are skipped.

    Exits (SystemExit) with status 0 after printing help when no argument is
    given, with status 2 on a usage error, and with status 1 when the input
    is neither a zip nor a tar file.
    """
    usage = "usage: %prog [options] <Chromosome File(TARBALL gzip (tar.gz) or zip)> <Genome Build Name>"
    desc = "generate bit index files for generating null sequences"

    parser = optparse.OptionParser(usage=usage, description=desc)

    parser.add_option("-q", dest="quiet", default=False, action="store_true",
                      help="supress messages (default=false)")

    # Honour the argv parameter instead of silently re-reading sys.argv.
    (options, args) = parser.parse_args(argv[1:])

    if len(args) == 0:
        parser.print_help()
        sys.exit(0)

    if len(args) != 2:
        # parser.error() prints the usage and exits with status 2.
        parser.error("incorrect number of arguments")

    chrom_file, genome = args

    # tarfile.is_tarfile() raises on a missing path; report it as a usage
    # error instead of a traceback.
    if not os.path.isfile(chrom_file):
        parser.error("cannot read input file: %s" % chrom_file)

    def log(message):
        if not options.quiet:
            sys.stderr.write(message)

    def process(fn, extract):
        """Extract one member, index it and always remove the extracted file."""
        if not _is_safe_member(fn):
            sys.stderr.write(' '.join(["skipping unsafe member path:", fn, "\n"]))
            return
        log(' '.join(["processing", fn, "\n"]))
        extract()
        try:
            build_indexes(fn, genome)
        finally:
            os.remove(fn)

    if zipfile.is_zipfile(chrom_file):
        log("detected file type is zip.\n")

        zipfileobj = zipfile.ZipFile(chrom_file)
        try:
            for fn in zipfileobj.namelist():
                # Names ending in '/' are directory entries, not sequences.
                if fn.endswith('/'):
                    continue
                process(fn, lambda fn=fn: zipfileobj.extract(fn))
        finally:
            zipfileobj.close()

    elif tarfile.is_tarfile(chrom_file):
        log("detected file type is tar.\n")

        tarfileobj = tarfile.open(chrom_file)
        try:
            for member in tarfileobj.getmembers():
                # Only regular files can hold sequences; this also skips
                # directories, links and device nodes.
                if not member.isfile():
                    continue
                process(member.name,
                        lambda member=member: tarfileobj.extract(member))
        finally:
            tarfileobj.close()

    else:
        sys.stderr.write(' '.join(["unknown input file:", chrom_file, "\n"]))
        sys.exit(1)
128
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()