Mercurial > repos > test-svm > kmersvm_test
comparison kmersvm/scripts/make_profile.py @ 5:f99b5099ea55 draft
Uploaded
| author | test-svm |
|---|---|
| date | Sun, 05 Aug 2012 16:50:57 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 4:f2130156fd5d | 5:f99b5099ea55 |
|---|---|
| 1 #!/usr/bin/env python2.7 | |
| 2 | |
| 3 import os | |
| 4 import os.path | |
| 5 import sys | |
| 6 import random | |
| 7 import optparse | |
| 8 | |
| 9 from bitarray import bitarray | |
| 10 | |
| 11 def read_bed_file(filename): | |
| 12 """ | |
| 13 """ | |
| 14 f = open(filename, 'r') | |
| 15 lines = f.readlines() | |
| 16 f.close() | |
| 17 | |
| 18 positions = [] | |
| 19 | |
| 20 for line in lines: | |
| 21 if line[0] == '#': | |
| 22 continue | |
| 23 | |
| 24 l = line.split() | |
| 25 | |
| 26 positions.append((l[0], int(l[1]), int(l[2]))) | |
| 27 | |
| 28 return positions | |
| 29 | |
| 30 | |
| 31 def bitarray_fromfile(filename): | |
| 32 """ | |
| 33 """ | |
| 34 fh = open(filename, 'rb') | |
| 35 bits = bitarray() | |
| 36 bits.fromfile(fh) | |
| 37 | |
| 38 return bits, fh | |
| 39 | |
| 40 def get_seqid(buildname, pos): | |
| 41 return '_'.join( [buildname, pos[0], str(pos[1]), str(pos[2]), '+'] ) | |
| 42 | |
| 43 def make_profile(positions, buildname, basedir): | |
| 44 """ | |
| 45 """ | |
| 46 chrnames = sorted(set(map(lambda p: p[0], positions))) | |
| 47 | |
| 48 profiles = {} | |
| 49 for chrom in chrnames: | |
| 50 idxf_gc = os.path.join(basedir, '.'.join([buildname, chrom, 'gc', 'out'])) | |
| 51 idxf_rpt = os.path.join(basedir, '.'.join([buildname, chrom, 'rpt', 'out'])) | |
| 52 | |
| 53 #if os.path.exists(idxf_gc) == False or os.path.exists(idxf_rpt) == False: | |
| 54 # continue | |
| 55 bits_gc, gcf = bitarray_fromfile(idxf_gc) | |
| 56 bits_rpt, rptf = bitarray_fromfile(idxf_rpt) | |
| 57 | |
| 58 for pos in positions: | |
| 59 if pos[0] != chrom: | |
| 60 continue | |
| 61 | |
| 62 seqid = get_seqid(buildname, pos) | |
| 63 slen = pos[2]-pos[1] | |
| 64 gc = bits_gc[pos[1]:pos[2]].count(True) | |
| 65 rpt = bits_rpt[pos[1]:pos[2]].count(True) | |
| 66 | |
| 67 profiles[seqid] = (slen, gc, rpt) | |
| 68 | |
| 69 gcf.close() | |
| 70 rptf.close() | |
| 71 | |
| 72 return profiles | |
| 73 | |
| 74 def main(): | |
| 75 parser = optparse.OptionParser() | |
| 76 if len(sys.argv) != 5: | |
| 77 print "Usage:", sys.argv[0], "BEDFILE BUILDNAME BASE_DIR OUT_FILE" | |
| 78 sys.exit() | |
| 79 parser.add_option("-o", dest="output", default="seq_profile.txt") | |
| 80 (options,args) = parser.parse_args() | |
| 81 | |
| 82 bedfile = sys.argv[1] | |
| 83 buildname = sys.argv[2] | |
| 84 basedir = sys.argv[3] | |
| 85 output = options.output | |
| 86 | |
| 87 positions = read_bed_file(bedfile) | |
| 88 seqids = [] | |
| 89 for pos in positions: | |
| 90 seqids.append(get_seqid(buildname, pos)) | |
| 91 | |
| 92 profiles = make_profile(positions, buildname, basedir) | |
| 93 | |
| 94 f = open(output, 'w') | |
| 95 f.write("\t".join(["#seq_id", "length", "GC content", "repeat fraction"]) + '\n') | |
| 96 for seqid in seqids: | |
| 97 prof = profiles[seqid] | |
| 98 f.write('\t'.join( map(str, [seqid, prof[0], prof[1]/float(prof[0]), prof[2]/float(prof[0])]) ) + '\n') | |
| 99 f.close() | |
| 100 | |
| 101 if __name__ == "__main__": main() |
