Mercurial > repos > test-svm > kmersvm_test
comparison kmersvm/scripts/nullseq_build_indices.py @ 0:66088269713e draft
Uploaded all files tracked by git
| author | test-svm |
|---|---|
| date | Sun, 05 Aug 2012 15:32:16 -0400 |
| parents | |
| children | |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:66088269713e |
|---|---|
| 1 import os | |
| 2 import sys | |
| 3 import tarfile | |
| 4 import zipfile | |
| 5 import optparse | |
| 6 | |
| 7 from bitarray import bitarray | |
| 8 | |
def clear_indexes(sid, buildname):
    """Create or empty the three index files belonging to one sequence.

    The files are named ``<buildname>.<sid>.<kind>.out`` where ``kind`` is
    ``na``, ``gc`` and ``rpt`` (handled in that order). After the call each
    file exists and has zero length.
    """
    for kind in ("na", "gc", "rpt"):
        index_path = '.'.join([buildname, sid, kind, "out"])
        # Opening in 'wb' mode is what truncates the file; nothing is written.
        open(index_path, 'wb').close()
| 18 | |
| 19 | |
def append_indexes(seq, sid, buildname):
    """Append one bit per character of *seq* to each index file of *sid*.

    Three files named ``<buildname>.<sid>.<kind>.out`` are opened in binary
    append mode, in this order:

    * ``na``  -- bit is set where the character is ``N``
    * ``gc``  -- bit is set where the character is one of ``c g C G``
    * ``rpt`` -- bit is set where the character is a lowercase ``a c g t``
      (presumably soft-masked repeat bases -- confirm with the consumer of
      these files)

    NOTE(review): per the bitarray documentation, ``tofile`` writes whole
    bytes, so a *seq* whose length is not a multiple of 8 is zero-padded at
    the end of this write -- confirm callers only do that for a final chunk.
    """
    names = ['.'.join([buildname, sid, kind, "out"])
             for kind in ("na", "gc", "rpt")]
    alphabets = ('N', 'cgCG', 'acgt')

    for index_path, alphabet in zip(names, alphabets):
        out = open(index_path, 'ab')
        bitarray([base in alphabet for base in seq]).tofile(out)
        out.close()
| 36 | |
| 37 | |
def build_indexes(fn, buildname):
    """Build the bit index files for every sequence in the FASTA file *fn*.

    Each header line (starting with ``>``) begins a new sequence whose id is
    the first whitespace-separated token after the ``>``. Its index files
    are emptied with ``clear_indexes`` and then filled through
    ``append_indexes``, in chunks so that a whole chromosome never has to be
    held in memory as one string.

    On ``IOError`` a message is printed to standard output and the process
    exits.
    NOTE(review): the exit status on I/O failure is 0, kept as-is for
    compatibility with existing callers -- consider a non-zero status.
    """
    # Number of sequence lines accumulated between intermediate flushes.
    save_interval = 8*32*1024

    try:
        # 'with' guarantees the input handle is closed, also on errors.
        with open(fn, 'r') as f:
            seq = []
            sid = ''
            nlines = 0
            for line in f:
                if line[0] == '>':
                    if sid:
                        # finish the previous sequence before starting a new one
                        append_indexes("".join(seq), sid, buildname)
                        seq = []

                    sid = line[1:].rstrip('\n').split()[0]
                    clear_indexes(sid, buildname)
                else:
                    nlines += 1
                    # also drop '\r' so CRLF files do not add a bogus base
                    seq.append(line.rstrip('\r\n'))

                    if nlines % save_interval == 0:
                        # Only flush a multiple of 8 bases and carry the rest
                        # over: the index writer emits whole bytes, so an
                        # unaligned intermediate chunk would leave pad bits
                        # in the middle of the file and shift every later
                        # position.
                        pending = "".join(seq)
                        aligned = len(pending) - len(pending) % 8
                        append_indexes(pending[:aligned], sid, buildname)
                        seq = [pending[aligned:]]

            # the last remaining sequence (may be unaligned; it ends the file)
            append_indexes("".join(seq), sid, buildname)

    except IOError as err:
        print("I/O error(%d): %s" % (err.errno, err.strerror))
        sys.exit(0)
| 69 | |
| 70 | |
def main(argv=sys.argv):
    """Command-line entry point: index every file of a chromosome archive.

    *argv* is a full argument vector (program name first). It expects two
    positional arguments, the chromosome archive (zip or tar) and the genome
    build name, plus an optional ``-q`` flag that silences progress messages
    on standard error.

    With no positional arguments the help text is printed and the process
    exits with status 0; with any other wrong count ``parser.error`` reports
    the problem and exits. Each regular file in the archive is extracted
    into the current directory, passed to ``build_indexes`` and removed
    again. An input that is neither zip nor tar only produces a message on
    standard error.
    """
    usage = "usage: %prog [options] <Chromosome File(TARBALL gzip (tar.gz) or zip)> <Genome Build Name>"
    desc = "generate bit index files for generating null sequences"

    parser = optparse.OptionParser(usage=usage, description=desc)

    parser.add_option("-q", dest="quiet", default=False, action="store_true",
                      help="supress messages (default=false)")

    # Parse the vector we were given instead of silently re-reading sys.argv.
    (options, args) = parser.parse_args(argv[1:])

    if not args:
        parser.print_help()
        sys.exit(0)

    if len(args) != 2:
        # parser.error() prints the usage and message, then exits itself.
        parser.error("incorrect number of arguments")

    chrom_file, genome = args

    def report(message):
        # progress output goes to stderr unless -q was given
        if not options.quiet:
            sys.stderr.write(message)

    def process(archive, names):
        # extract -> index -> delete, one member at a time
        for fn in names:
            report(' '.join(["processing", fn, "\n"]))
            archive.extract(fn)
            build_indexes(fn, genome)
            os.remove(fn)

    # NOTE(review): members are extracted under the names stored in the
    # archive, relative to the current directory. If archives can come from
    # untrusted users, member paths should be validated before extraction.
    if zipfile.is_zipfile(chrom_file):
        report("detected file type is zip.\n")

        zipfileobj = zipfile.ZipFile(chrom_file)
        try:
            # directory entries (names ending in '/') hold no sequence data
            process(zipfileobj,
                    [name for name in zipfileobj.namelist()
                     if not name.endswith('/')])
        finally:
            zipfileobj.close()

    elif tarfile.is_tarfile(chrom_file):
        report("detected file type is tar.\n")

        tarfileobj = tarfile.open(chrom_file)
        try:
            # only regular files; directories and links cannot be indexed
            process(tarfileobj,
                    [member.name for member in tarfileobj.getmembers()
                     if member.isfile()])
        finally:
            tarfileobj.close()

    else:
        # report the offending input path (this branch used an undefined name)
        sys.stderr.write(' '.join(["unknown input file:", chrom_file, "\n"]))
| 128 | |
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
