comparison kmersvm/scripts/make_profile.py @ 0:66088269713e draft

Uploaded all files tracked by git
author test-svm
date Sun, 05 Aug 2012 15:32:16 -0400
parents
children e8dcc2ed0f9f
comparison
equal deleted inserted replaced
-1:000000000000 0:66088269713e
1 #!/usr/bin/env python2.7
2
3 import os
4 import os.path
5 import sys
6 import random
7 import optparse
8
9 from bitarray import bitarray
10
11 def read_bed_file(filename):
12 """
13 """
14 f = open(filename, 'r')
15 lines = f.readlines()
16 f.close()
17
18 positions = []
19
20 for line in lines:
21 if line[0] == '#':
22 continue
23
24 l = line.split()
25
26 positions.append((l[0], int(l[1]), int(l[2])))
27
28 return positions
29
30
31 def bitarray_fromfile(filename):
32 """
33 """
34 fh = open(filename, 'rb')
35 bits = bitarray()
36 bits.fromfile(fh)
37
38 return bits, fh
39
40 def get_seqid(buildname, pos):
41 return '_'.join( [buildname, pos[0], str(pos[1]), str(pos[2]), '+'] )
42
43 def make_profile(positions, buildname, basedir):
44 """
45 """
46 chrnames = sorted(set(map(lambda p: p[0], positions)))
47
48 profiles = {}
49 for chrom in chrnames:
50 idxf_gc = os.path.join(basedir, '.'.join([buildname, chrom, 'gc', 'out']))
51 idxf_rpt = os.path.join(basedir, '.'.join([buildname, chrom, 'rpt', 'out']))
52
53 #if os.path.exists(idxf_gc) == False or os.path.exists(idxf_rpt) == False:
54 # continue
55 bits_gc, gcf = bitarray_fromfile(idxf_gc)
56 bits_rpt, rptf = bitarray_fromfile(idxf_rpt)
57
58 for pos in positions:
59 if pos[0] != chrom:
60 continue
61
62 seqid = get_seqid(buildname, pos)
63 slen = pos[2]-pos[1]
64 gc = bits_gc[pos[1]:pos[2]].count(True)
65 rpt = bits_rpt[pos[1]:pos[2]].count(True)
66
67 profiles[seqid] = (slen, gc, rpt)
68
69 gcf.close()
70 rptf.close()
71
72 return profiles
73
74 def main():
75 parser = optparse.OptionParser()
76
77 if len(sys.argv) != 4:
78 print "Usage:", sys.argv[0], "BEDFILE BUILDNAME BASE_DIR OUT_FILE"
79 sys.exit()
80 parser.add_option("-o", dest="output", default="seq_profile.txt")
81 (options,args) = parser.parse_args()
82
83 bedfile = sys.argv[1]
84 buildname = sys.argv[2]
85 basedir = sys.argv[3]
86 output = options.output
87
88 positions = read_bed_file(bedfile)
89 seqids = []
90 for pos in positions:
91 seqids.append(get_seqid(buildname, pos))
92
93 profiles = make_profile(positions, buildname, basedir)
94
95 f = open(output, 'w')
96 f.write("\t".join(["#seq_id", "length", "GC content", "repeat fraction"]) + '\n')
97 for seqid in seqids:
98 prof = profiles[seqid]
99 f.write('\t'.join( map(str, [seqid, prof[0], prof[1]/float(prof[0]), prof[2]/float(prof[0])]) ) + '\n')
100 f.close()
101
102 if __name__ == "__main__": main()