annotate kmersvm/scripts/nullseq_build_indices.py @ 0:66088269713e draft

Uploaded all files tracked by git
author test-svm
date Sun, 05 Aug 2012 15:32:16 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
1 import os
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
2 import sys
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
3 import tarfile
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
4 import zipfile
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
5 import optparse
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
6
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
7 from bitarray import bitarray
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
8
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
def clear_indexes(sid, buildname):
    """Create or truncate the three bit-index files of one sequence.

    The files are named ``<buildname>.<sid>.<kind>.out`` where ``<kind>`` is
    ``na``, ``gc`` or ``rpt``.  Existing content is discarded; missing files
    are created empty.

    Args:
        sid: Sequence identifier, used as the second filename component.
        buildname: Genome build name, used as the filename prefix.

    Raises:
        IOError: If one of the files cannot be opened for writing.
    """
    for kind in ("na", "gc", "rpt"):
        path = '.'.join([buildname, sid, kind, "out"])
        # Opening in 'wb' mode truncates the file; the with-block closes the
        # handle immediately, even if something goes wrong.
        with open(path, 'wb'):
            pass
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
18
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
19
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
def append_indexes(seq, sid, buildname):
    """Append one bit per character of *seq* to each index file of *sid*.

    Three files named ``<buildname>.<sid>.<kind>.out`` are appended to:

    * ``na``  -- bit set where the character is ``N`` (upper case only)
    * ``gc``  -- bit set where the character is one of ``c g C G``
    * ``rpt`` -- bit set where the character is a lower-case ``a c g t``
      (presumably soft-masked repeats -- confirm against the consumer)

    ``bitarray.tofile`` writes whole bytes and zero-pads the last one, so
    every call except the final one for a sequence should pass a *seq* whose
    length is a multiple of 8; otherwise padding bits end up inside the index.

    Args:
        seq: Sequence characters to index, as a string.
        sid: Sequence identifier, used as the second filename component.
        buildname: Genome build name, used as the filename prefix.

    Raises:
        IOError: If one of the files cannot be opened or written.
    """
    # (file kind, characters whose presence sets the bit)
    index_specs = (
        ("na", 'N'),
        ("gc", 'cgCG'),
        ("rpt", 'acgt'),
    )

    for kind, charset in index_specs:
        path = '.'.join([buildname, sid, kind, "out"])
        bits = bitarray([c in charset for c in seq])
        # The with-block guarantees the handle is closed even if the write
        # fails part-way through.
        with open(path, 'ab') as f:
            bits.tofile(f)
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
36
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
37
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
def build_indexes(fn, buildname):
    """Build the bit-index files for every sequence of a FASTA file.

    Each ``>`` header line starts a new sequence whose id is the first
    whitespace-delimited token after the ``>``.  The index files of that id
    are truncated with ``clear_indexes`` and then filled through
    ``append_indexes``.  Sequence lines are buffered and flushed every
    ``save_interval`` lines so a whole chromosome is never joined at once.

    Args:
        fn: Path of the FASTA file to read.
        buildname: Genome build name, passed through as the filename prefix.

    Raises:
        SystemExit: With status 1 after reporting an I/O error on stderr.
    """
    # Number of buffered sequence lines that triggers an intermediate flush.
    save_interval = 8 * 32 * 1024

    try:
        with open(fn, 'r') as f:
            pending = []
            sid = ''
            nlines = 0

            for line in f:
                if line.startswith('>'):
                    # Finish the previous record before starting a new one.
                    if sid:
                        append_indexes("".join(pending), sid, buildname)

                    pending = []
                    nlines = 0
                    sid = line[1:].rstrip('\r\n').split()[0]
                    clear_indexes(sid, buildname)
                    continue

                if not sid:
                    # Data before the first header belongs to no sequence;
                    # indexing it would create files with an empty id.
                    continue

                nlines += 1
                # Strip '\r' as well so CRLF files do not gain one bogus
                # position per line.
                pending.append(line.rstrip('\r\n'))

                if nlines % save_interval == 0:
                    # The index files are written in whole bytes, so only a
                    # multiple of 8 characters may be flushed mid-sequence;
                    # the remainder is carried over to the next flush.
                    chunk = "".join(pending)
                    aligned = len(chunk) - len(chunk) % 8
                    if aligned:
                        append_indexes(chunk[:aligned], sid, buildname)
                    pending = [chunk[aligned:]]

            # The last remaining sequence (skipped when no header was seen).
            if sid:
                append_indexes("".join(pending), sid, buildname)

    except IOError as err:
        sys.stderr.write("I/O error(%s): %s\n" % (err.errno, err.strerror))
        # A failed build must not look like success to the caller.
        sys.exit(1)
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
69
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
70
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
def _is_safe_member(name):
    """Return True if archive member *name* stays inside the working directory."""
    normalized = os.path.normpath(name)
    if os.path.isabs(normalized):
        return False
    return normalized.split(os.sep)[0] != os.pardir


def _index_members(archive, names, genome, quiet):
    """Extract each of *names* from *archive*, index it, and delete the copy."""
    for name in names:
        if not _is_safe_member(name):
            # Archives may come from untrusted uploads; never extract (or
            # later delete) a path outside the current directory.
            sys.stderr.write(' '.join(["skipping unsafe archive member:", name, "\n"]))
            continue

        if not quiet:
            sys.stderr.write(' '.join(["processing", name, "\n"]))

        archive.extract(name)
        try:
            build_indexes(name, genome)
        finally:
            # Remove the extracted file even when indexing aborts.
            os.remove(name)


def main(argv=sys.argv):
    """Command-line entry point: build bit indexes from a chromosome archive.

    Expects two positional arguments, the chromosome archive (zip or tar,
    optionally compressed) and the genome build name.  Every regular file in
    the archive is extracted into the current directory, indexed with
    ``build_indexes`` and removed again.

    Args:
        argv: Full argument vector including the program name at index 0.

    Raises:
        SystemExit: Status 0 after printing help when no arguments are given;
            status 2 (from ``optparse``) on a wrong argument count.
    """
    usage = "usage: %prog [options] <Chromosome File(TARBALL gzip (tar.gz) or zip)> <Genome Build Name>"
    desc = "generate bit index files for generating null sequences"

    parser = optparse.OptionParser(usage=usage, description=desc)

    parser.add_option("-q", dest="quiet", default=False, action="store_true",
                      help="supress messages (default=false)")

    # Honour the argv parameter instead of silently re-reading sys.argv.
    (options, args) = parser.parse_args(argv[1:])

    if not args:
        parser.print_help()
        sys.exit(0)

    if len(args) != 2:
        # parser.error() prints the usage line and exits with status 2.
        parser.error("incorrect number of arguments")

    chrom_file, genome = args

    if zipfile.is_zipfile(chrom_file):
        if not options.quiet:
            sys.stderr.write("detected file type is zip.\n")

        zipfileobj = zipfile.ZipFile(chrom_file)
        try:
            # Names ending in '/' are directory entries, not sequence files.
            names = [n for n in zipfileobj.namelist() if not n.endswith('/')]
            _index_members(zipfileobj, names, genome, options.quiet)
        finally:
            zipfileobj.close()

    elif tarfile.is_tarfile(chrom_file):
        if not options.quiet:
            sys.stderr.write("detected file type is tar.\n")

        tarfileobj = tarfile.open(chrom_file)
        try:
            # Only regular files can be indexed; skip directories, links, etc.
            names = [m.name for m in tarfileobj.getmembers() if m.isfile()]
            _index_members(tarfileobj, names, genome, options.quiet)
        finally:
            tarfileobj.close()

    else:
        sys.stderr.write(' '.join(["unknown input file:", chrom_file, "\n"]))


if __name__ == "__main__":
    main()