|
0
|
1 #!/usr/bin/env python2.7
|
|
|
2
|
|
|
3 import os
|
|
|
4 import os.path
|
|
|
5 import sys
|
|
|
6 import random
|
|
|
7 import optparse
|
|
|
8
|
|
|
9 from bitarray import bitarray
|
|
|
10
|
|
|
11 def read_bed_file(filename):
|
|
|
12 """
|
|
|
13 """
|
|
|
14 f = open(filename, 'r')
|
|
|
15 lines = f.readlines()
|
|
|
16 f.close()
|
|
|
17
|
|
|
18 positions = []
|
|
|
19
|
|
|
20 for line in lines:
|
|
|
21 if line[0] == '#':
|
|
|
22 continue
|
|
|
23
|
|
|
24 l = line.split()
|
|
|
25
|
|
|
26 positions.append((l[0], int(l[1]), int(l[2])))
|
|
|
27
|
|
|
28 return positions
|
|
|
29
|
|
|
30
|
|
|
31 def bitarray_fromfile(filename):
|
|
|
32 """
|
|
|
33 """
|
|
|
34 fh = open(filename, 'rb')
|
|
|
35 bits = bitarray()
|
|
|
36 bits.fromfile(fh)
|
|
|
37
|
|
|
38 return bits, fh
|
|
|
39
|
|
|
40 def get_seqid(buildname, pos):
|
|
|
41 return '_'.join( [buildname, pos[0], str(pos[1]), str(pos[2]), '+'] )
|
|
|
42
|
|
|
43 def make_profile(positions, buildname, basedir):
|
|
|
44 """
|
|
|
45 """
|
|
|
46 chrnames = sorted(set(map(lambda p: p[0], positions)))
|
|
|
47
|
|
|
48 profiles = {}
|
|
|
49 for chrom in chrnames:
|
|
|
50 idxf_gc = os.path.join(basedir, '.'.join([buildname, chrom, 'gc', 'out']))
|
|
|
51 idxf_rpt = os.path.join(basedir, '.'.join([buildname, chrom, 'rpt', 'out']))
|
|
|
52
|
|
|
53 #if os.path.exists(idxf_gc) == False or os.path.exists(idxf_rpt) == False:
|
|
|
54 # continue
|
|
|
55 bits_gc, gcf = bitarray_fromfile(idxf_gc)
|
|
|
56 bits_rpt, rptf = bitarray_fromfile(idxf_rpt)
|
|
|
57
|
|
|
58 for pos in positions:
|
|
|
59 if pos[0] != chrom:
|
|
|
60 continue
|
|
|
61
|
|
|
62 seqid = get_seqid(buildname, pos)
|
|
|
63 slen = pos[2]-pos[1]
|
|
|
64 gc = bits_gc[pos[1]:pos[2]].count(True)
|
|
|
65 rpt = bits_rpt[pos[1]:pos[2]].count(True)
|
|
|
66
|
|
|
67 profiles[seqid] = (slen, gc, rpt)
|
|
|
68
|
|
|
69 gcf.close()
|
|
|
70 rptf.close()
|
|
|
71
|
|
|
72 return profiles
|
|
|
73
|
|
|
74 def main():
|
|
|
75 parser = optparse.OptionParser()
|
|
2
|
76 if len(sys.argv) != 5:
|
|
0
|
77 print "Usage:", sys.argv[0], "BEDFILE BUILDNAME BASE_DIR OUT_FILE"
|
|
|
78 sys.exit()
|
|
|
79 parser.add_option("-o", dest="output", default="seq_profile.txt")
|
|
|
80 (options,args) = parser.parse_args()
|
|
|
81
|
|
|
82 bedfile = sys.argv[1]
|
|
|
83 buildname = sys.argv[2]
|
|
|
84 basedir = sys.argv[3]
|
|
|
85 output = options.output
|
|
|
86
|
|
|
87 positions = read_bed_file(bedfile)
|
|
|
88 seqids = []
|
|
|
89 for pos in positions:
|
|
|
90 seqids.append(get_seqid(buildname, pos))
|
|
|
91
|
|
|
92 profiles = make_profile(positions, buildname, basedir)
|
|
|
93
|
|
|
94 f = open(output, 'w')
|
|
|
95 f.write("\t".join(["#seq_id", "length", "GC content", "repeat fraction"]) + '\n')
|
|
|
96 for seqid in seqids:
|
|
|
97 prof = profiles[seqid]
|
|
|
98 f.write('\t'.join( map(str, [seqid, prof[0], prof[1]/float(prof[0]), prof[2]/float(prof[0])]) ) + '\n')
|
|
|
99 f.close()
|
|
|
100
|
|
|
101 if __name__ == "__main__": main()
|