|
0
|
1 import os
|
|
|
2 import sys
|
|
|
3 import tarfile
|
|
|
4 import zipfile
|
|
|
5 import optparse
|
|
|
6
|
|
|
7 from bitarray import bitarray
|
|
|
8
|
|
|
def clear_indexes(sid, buildname):
    """Create, or truncate to zero length, the three index files of a sequence.

    The files are named ``<buildname>.<sid>.<kind>.out`` with ``kind`` being
    ``na``, ``gc`` and ``rpt`` (in that order).

    Args:
        sid: sequence identifier; second dot-separated component of the name.
        buildname: first component of the name (may contain a directory part).

    Raises:
        IOError: if one of the files cannot be opened for writing.
    """
    for kind in ("na", "gc", "rpt"):
        fn = '.'.join([buildname, sid, kind, "out"])
        # Opening with 'wb' truncates an existing file; nothing is written.
        # The context manager closes the handle right away.
        with open(fn, 'wb'):
            pass
|
|
|
18
|
|
|
19
|
|
|
def append_indexes(seq, sid, buildname):
    """Append one bit per character of ``seq`` to each index file of ``sid``.

    For every ``kind`` below, the file ``<buildname>.<sid>.<kind>.out`` is
    opened in binary append mode and receives a bit array in which a bit is
    set when the corresponding character of ``seq`` is in the listed set:

    * ``na``  -- ``N``
    * ``gc``  -- ``c``, ``g``, ``C``, ``G``
    * ``rpt`` -- lower-case ``a``, ``c``, ``g``, ``t``
      (presumably soft-masked repeat bases -- confirm with the data source)

    Args:
        seq: string of sequence characters.
        sid: sequence identifier used in the file names.
        buildname: prefix of the file names.

    Raises:
        IOError: if a file cannot be opened or written.

    NOTE(review): ``bitarray.tofile`` appears to pad the last byte, so callers
    should pass chunks whose length is a multiple of 8 for every call except
    the final one of a sequence -- confirm against the bitarray documentation.
    """
    # (file kind, characters whose positions get a 1 bit)
    flagged = (("na", 'N'), ("gc", 'cgCG'), ("rpt", 'acgt'))

    for kind, chars in flagged:
        fn = '.'.join([buildname, sid, kind, "out"])
        # 'with' closes the handle even if building or writing the bits fails.
        with open(fn, 'ab') as f:
            bitarray([c in chars for c in seq]).tofile(f)
|
|
|
36
|
|
|
37
|
|
|
def build_indexes(fn, buildname):
    """Read a FASTA-style file and write bit index files for each sequence.

    A line starting with ``>`` begins a new sequence; its identifier is the
    first whitespace-delimited token after ``>``.  The index files of that
    identifier are truncated with ``clear_indexes`` and the following lines
    (newline stripped) are passed to ``append_indexes`` in chunks.

    Args:
        fn: path of the text file to read.
        buildname: prefix handed to ``clear_indexes`` / ``append_indexes``.

    Exits:
        On ``IOError`` the error is written to stderr and the process exits
        with status 1 (the previous code printed to stdout and exited with 0,
        which made a failure look like success).
    """
    # Number of sequence lines buffered before an intermediate flush.  It is a
    # multiple of 8, so a full chunk of equal-width lines has a length that is
    # a multiple of 8 characters.
    # NOTE(review): this presumably keeps appended bit arrays byte-aligned --
    # confirm against bitarray.tofile padding behaviour.
    save_interval = 8*32*1024

    try:
        # 'with' closes the input file, which was previously left open.
        with open(fn, 'r') as f:
            seq = []
            sid = ''
            nlines = 0
            for line in f:
                if line[0] == '>':
                    if sid:
                        # flush what is left of the previous sequence
                        append_indexes("".join(seq), sid, buildname)
                        seq = []

                    sid = line[1:].rstrip('\n').split()[0]
                    clear_indexes(sid, buildname)
                    # Count lines per sequence: without this reset the flush
                    # points of later sequences depended on how many lines the
                    # earlier sequences had.
                    nlines = 0
                else:
                    nlines += 1
                    seq.append(line.rstrip('\n'))

                    if nlines % save_interval == 0:
                        append_indexes("".join(seq), sid, buildname)
                        seq = []

            #the last remaining sequence
            append_indexes("".join(seq), sid, buildname)

    except IOError as err:
        sys.stderr.write("I/O error(%s): %s\n" % (err.errno, err.strerror))
        sys.exit(1)
|
|
|
69
|
|
|
70
|
|
|
def _index_archive_member(extract, name, genome, quiet):
    """Extract one archive member, index it, then delete the extracted copy."""
    if not quiet:
        sys.stderr.write(' '.join(["processing", name, "\n"]))

    extract(name)
    try:
        build_indexes(name, genome)
    finally:
        # build_indexes may abort via sys.exit(); still remove the copy.
        os.remove(name)


def main(argv=sys.argv):
    """Command-line entry point: build bit index files from a genome archive.

    Args:
        argv: full argument vector; ``argv[0]`` is the program name and the
            rest is parsed as ``[options] <archive> <genome build name>``.
            (Previously this parameter was accepted but ignored.)

    Exits:
        status 0 after printing help when no positional argument is given;
        status 2 (via ``parser.error``) when the argument count is wrong;
        status 1 when the input is neither a zip nor a tar archive.
    """
    usage = "usage: %prog [options] <Chromosome File(TARBALL gzip (tar.gz) or zip)> <Genome Build Name>"
    desc = "generate bit index files for generating null sequences"

    parser = optparse.OptionParser(usage=usage, description=desc)

    parser.add_option("-q", dest="quiet", default=False, action="store_true",
            help="supress messages (default=false)")

    (options, args) = parser.parse_args(argv[1:])

    if len(args) == 0:
        parser.print_help()
        sys.exit(0)

    if len(args) != 2:
        # parser.error() prints the usage plus this message and exits itself,
        # so nothing after it would ever run.
        parser.error("incorrect number of arguments")

    chrom_file = args[0]
    genome = args[1]

    if zipfile.is_zipfile(chrom_file):
        if not options.quiet:
            sys.stderr.write("detected file type is zip.\n")

        zipfileobj = zipfile.ZipFile(chrom_file)
        try:
            for fn in zipfileobj.namelist():
                # names ending in '/' are directory entries, not sequence files
                if fn.endswith('/'):
                    continue
                _index_archive_member(zipfileobj.extract, fn, genome,
                                      options.quiet)
        finally:
            zipfileobj.close()

    elif tarfile.is_tarfile(chrom_file):
        if not options.quiet:
            sys.stderr.write("detected file type is tar.\n")

        tarfileobj = tarfile.open(chrom_file)
        try:
            for member in tarfileobj.getmembers():
                # skip directories, links and other non-regular members
                if not member.isfile():
                    continue
                _index_archive_member(tarfileobj.extract, member.name, genome,
                                      options.quiet)
        finally:
            tarfileobj.close()

    else:
        # The message used to reference the undefined name 'fn' here and
        # crashed with a NameError instead of reporting the problem.
        sys.stderr.write(' '.join(["unknown input file:", chrom_file, "\n"]))
        sys.exit(1)
|
|
|
128
|
|
|
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
|