annotate kmersvm/scripts/make_profile.py @ 2:e8dcc2ed0f9f draft

Included bugfix, README
author test-svm
date Sun, 05 Aug 2012 16:32:03 -0400
parents 66088269713e
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
1 #!/usr/bin/env python2.7
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
2
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
3 import os
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
4 import os.path
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
5 import sys
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
6 import random
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
7 import optparse
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
8
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
9 from bitarray import bitarray
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
10
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
11 def read_bed_file(filename):
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
12 """
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
13 """
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
14 f = open(filename, 'r')
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
15 lines = f.readlines()
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
16 f.close()
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
17
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
18 positions = []
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
19
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
20 for line in lines:
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
21 if line[0] == '#':
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
22 continue
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
23
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
24 l = line.split()
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
25
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
26 positions.append((l[0], int(l[1]), int(l[2])))
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
27
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
28 return positions
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
29
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
30
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
31 def bitarray_fromfile(filename):
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
32 """
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
33 """
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
34 fh = open(filename, 'rb')
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
35 bits = bitarray()
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
36 bits.fromfile(fh)
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
37
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
38 return bits, fh
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
39
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
40 def get_seqid(buildname, pos):
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
41 return '_'.join( [buildname, pos[0], str(pos[1]), str(pos[2]), '+'] )
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
42
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
43 def make_profile(positions, buildname, basedir):
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
44 """
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
45 """
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
46 chrnames = sorted(set(map(lambda p: p[0], positions)))
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
47
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
48 profiles = {}
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
49 for chrom in chrnames:
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
50 idxf_gc = os.path.join(basedir, '.'.join([buildname, chrom, 'gc', 'out']))
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
51 idxf_rpt = os.path.join(basedir, '.'.join([buildname, chrom, 'rpt', 'out']))
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
52
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
53 #if os.path.exists(idxf_gc) == False or os.path.exists(idxf_rpt) == False:
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
54 # continue
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
55 bits_gc, gcf = bitarray_fromfile(idxf_gc)
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
56 bits_rpt, rptf = bitarray_fromfile(idxf_rpt)
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
57
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
58 for pos in positions:
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
59 if pos[0] != chrom:
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
60 continue
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
61
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
62 seqid = get_seqid(buildname, pos)
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
63 slen = pos[2]-pos[1]
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
64 gc = bits_gc[pos[1]:pos[2]].count(True)
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
65 rpt = bits_rpt[pos[1]:pos[2]].count(True)
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
66
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
67 profiles[seqid] = (slen, gc, rpt)
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
68
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
69 gcf.close()
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
70 rptf.close()
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
71
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
72 return profiles
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
73
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
74 def main():
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
75 parser = optparse.OptionParser()
2
e8dcc2ed0f9f Included bugfix, README
test-svm
parents: 0
diff changeset
76 if len(sys.argv) != 5:
0
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
77 print "Usage:", sys.argv[0], "BEDFILE BUILDNAME BASE_DIR OUT_FILE"
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
78 sys.exit()
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
79 parser.add_option("-o", dest="output", default="seq_profile.txt")
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
80 (options,args) = parser.parse_args()
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
81
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
82 bedfile = sys.argv[1]
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
83 buildname = sys.argv[2]
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
84 basedir = sys.argv[3]
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
85 output = options.output
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
86
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
87 positions = read_bed_file(bedfile)
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
88 seqids = []
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
89 for pos in positions:
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
90 seqids.append(get_seqid(buildname, pos))
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
91
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
92 profiles = make_profile(positions, buildname, basedir)
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
93
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
94 f = open(output, 'w')
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
95 f.write("\t".join(["#seq_id", "length", "GC content", "repeat fraction"]) + '\n')
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
96 for seqid in seqids:
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
97 prof = profiles[seqid]
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
98 f.write('\t'.join( map(str, [seqid, prof[0], prof[1]/float(prof[0]), prof[2]/float(prof[0])]) ) + '\n')
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
99 f.close()
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
100
66088269713e Uploaded all files tracked by git
test-svm
parents:
diff changeset
101 if __name__ == "__main__": main()