Mercurial > repos > test-svm > kmersvm_test
comparison kmersvm/scripts/nullseq_generate.py @ 0:66088269713e draft
Uploaded all files tracked by git
| author | test-svm |
|---|---|
| date | Sun, 05 Aug 2012 15:32:16 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:66088269713e |
|---|---|
| 1 import os | |
| 2 import os.path | |
| 3 import sys | |
| 4 import random | |
| 5 import optparse | |
| 6 | |
| 7 from bitarray import bitarray | |
| 8 | |
| 9 def read_bed_file(filename): | |
| 10 """ | |
| 11 """ | |
| 12 f = open(filename, 'r') | |
| 13 lines = f.readlines() | |
| 14 f.close() | |
| 15 | |
| 16 positions = [] | |
| 17 | |
| 18 for line in lines: | |
| 19 if line[0] == '#': | |
| 20 continue | |
| 21 | |
| 22 l = line.split() | |
| 23 | |
| 24 positions.append((l[0], int(l[1]), int(l[2]))) | |
| 25 | |
| 26 return positions | |
| 27 | |
| 28 | |
| 29 def bitarray_fromfile(filename): | |
| 30 """ | |
| 31 """ | |
| 32 fh = open(filename, 'rb') | |
| 33 bits = bitarray() | |
| 34 bits.fromfile(fh) | |
| 35 | |
| 36 return bits, fh | |
| 37 | |
| 38 | |
| 39 def make_profile(positions, buildname, basedir): | |
| 40 """ | |
| 41 """ | |
| 42 chrnames = sorted(set(map(lambda p: p[0], positions))) | |
| 43 | |
| 44 profiles = [] | |
| 45 for chrom in chrnames: | |
| 46 idxf_gc = os.path.join(basedir, '.'.join([buildname, chrom, 'gc', 'out'])) | |
| 47 idxf_rpt = os.path.join(basedir, '.'.join([buildname, chrom, 'rpt', 'out'])) | |
| 48 | |
| 49 #if os.path.exists(idxf_gc) == False or os.path.exists(idxf_rpt) == False: | |
| 50 # continue | |
| 51 bits_gc, gcf = bitarray_fromfile(idxf_gc) | |
| 52 bits_rpt, rptf = bitarray_fromfile(idxf_rpt) | |
| 53 | |
| 54 for pos in positions: | |
| 55 if pos[0] != chrom: | |
| 56 continue | |
| 57 | |
| 58 seqid = pos[0] + ':' + str(pos[1]+1) + '-' + str(pos[2]) | |
| 59 slen = pos[2]-pos[1] | |
| 60 gc = bits_gc[pos[1]:pos[2]].count(True) | |
| 61 rpt = bits_rpt[pos[1]:pos[2]].count(True) | |
| 62 | |
| 63 profiles.append((seqid, slen, gc, rpt)) | |
| 64 | |
| 65 gcf.close() | |
| 66 rptf.close() | |
| 67 | |
| 68 return profiles | |
| 69 | |
| 70 | |
| 71 def sample_sequences(positions, buildname, basedir, options): | |
| 72 """ | |
| 73 """ | |
| 74 rpt_err = options.rpt_err | |
| 75 gc_err = options.gc_err | |
| 76 max_trys = options.max_trys | |
| 77 norpt = options.norpt | |
| 78 nogc = options.nogc | |
| 79 | |
| 80 chrnames = sorted(set(map(lambda p: p[0], positions))) | |
| 81 profiles = make_profile(positions, buildname, basedir) | |
| 82 | |
| 83 excluded = [] | |
| 84 if options.excludefile: | |
| 85 excluded = read_bed_file(options.excludefile) | |
| 86 | |
| 87 #truncate (clear) file | |
| 88 f = open(options.output, 'w') | |
| 89 f.close() | |
| 90 | |
| 91 for chrom in chrnames: | |
| 92 if options.quiet == False: | |
| 93 sys.stderr.write("sampling from " + chrom + "\n") | |
| 94 | |
| 95 idxf_na = os.path.join(basedir, '.'.join([buildname, chrom, 'na', 'out'])) | |
| 96 idxf_gc = os.path.join(basedir, '.'.join([buildname, chrom, 'gc', 'out'])) | |
| 97 idxf_rpt = os.path.join(basedir, '.'.join([buildname, chrom, 'rpt', 'out'])) | |
| 98 | |
| 99 #this bit array is used to mark positions that are excluded from sampling | |
| 100 #this will be updated as we sample more sequences in order to prevent sampled sequences from overlapping | |
| 101 bits_na, naf = bitarray_fromfile(idxf_na) | |
| 102 bits_gc, gcf = bitarray_fromfile(idxf_gc) | |
| 103 bits_rpt, rptf = bitarray_fromfile(idxf_rpt) | |
| 104 | |
| 105 #mark excluded regions | |
| 106 for pos in excluded: | |
| 107 if pos[0] != chrom: | |
| 108 continue | |
| 109 bits_na[pos[1]:pos[2]] = True | |
| 110 | |
| 111 npos = 0 | |
| 112 #mark positive regions | |
| 113 for pos in positions: | |
| 114 if pos[0] != chrom: | |
| 115 continue | |
| 116 bits_na[pos[1]:pos[2]] = True | |
| 117 npos+=1 | |
| 118 | |
| 119 if options.count == 0: | |
| 120 count = options.fold*npos | |
| 121 else: | |
| 122 count = options.count | |
| 123 | |
| 124 sampled_positions = [] | |
| 125 while len(sampled_positions) < count: | |
| 126 sampled_prof = random.choice(profiles) | |
| 127 sampled_len = sampled_prof[1] | |
| 128 sampled_gc = sampled_prof[2] | |
| 129 sampled_rpt = sampled_prof[3] | |
| 130 | |
| 131 rpt_err_allowed = int(rpt_err*sampled_len) | |
| 132 gc_err_allowed = int(gc_err*sampled_len) | |
| 133 trys = 0 | |
| 134 while trys < max_trys: | |
| 135 trys += 1 | |
| 136 | |
| 137 pos = random.randint(1, bits_na.length() - sampled_len) | |
| 138 pos_e = pos+sampled_len | |
| 139 | |
| 140 if bits_na[pos:pos_e].any(): | |
| 141 continue | |
| 142 | |
| 143 if not norpt: | |
| 144 pos_rpt = bits_rpt[pos:pos_e].count(True) | |
| 145 if abs(sampled_rpt - pos_rpt) > rpt_err_allowed: | |
| 146 continue | |
| 147 | |
| 148 if not nogc: | |
| 149 pos_gc = bits_gc[pos:pos_e].count(True) | |
| 150 if abs(sampled_gc - pos_gc) > gc_err_allowed: | |
| 151 continue | |
| 152 | |
| 153 #accept the sampled position | |
| 154 #mark the sampled regions | |
| 155 bits_na[pos:pos_e] = True | |
| 156 | |
| 157 sampled_positions.append((chrom, pos, pos_e)) | |
| 158 | |
| 159 #print trys, chrom, pos, pos_e, sampled_len, pos_rpt, sampled_rpt, pos_gc, sampled_gc | |
| 160 break | |
| 161 else: | |
| 162 if options.quiet == False: | |
| 163 sys.stderr.write(' '.join(["fail to sample from", \ | |
| 164 "len=", str(sampled_len), \ | |
| 165 "rpt=", str(sampled_rpt), \ | |
| 166 "gc=", str(sampled_gc)]) + '\n') | |
| 167 | |
| 168 naf.close() | |
| 169 gcf.close() | |
| 170 rptf.close() | |
| 171 | |
| 172 f = open(options.output, 'a') | |
| 173 for spos in sorted(sampled_positions, key=lambda s: s[1]): | |
| 174 f.write('\t'.join([spos[0], str(spos[1]), str(spos[2])]) + '\n') | |
| 175 f.close() | |
| 176 | |
| 177 | |
| 178 def main(argv=sys.argv): | |
| 179 usage = "usage: %prog [options] <Input Bed File> <Genome Build Name> <Base Directory>" | |
| 180 | |
| 181 desc = "generate null sequences" | |
| 182 parser = optparse.OptionParser(usage=usage, description=desc) | |
| 183 | |
| 184 parser.add_option("-x", dest="fold", type="int", \ | |
| 185 default = 1, help="number of sequence to sample, FOLD times of given dataset (default=1)") | |
| 186 | |
| 187 parser.add_option("-c", dest="count", type="int", \ | |
| 188 default=0, help="number of sequences to sample, override -x option (default=NA, obsolete)") | |
| 189 | |
| 190 parser.add_option("-r", dest="rseed", type="int", \ | |
| 191 default=1, help="random number seed (default=1)") | |
| 192 | |
| 193 parser.add_option("-g", dest="gc_err", type="float", \ | |
| 194 default=0.02, help="GC errors allowed (default=0.02)") | |
| 195 | |
| 196 parser.add_option("-t", dest="rpt_err", type="float", \ | |
| 197 default=0.02, help="RPT errors allowed (default=0.02)") | |
| 198 | |
| 199 parser.add_option("-e", dest="excludefile", \ | |
| 200 default="", help="filename that contains regions to be excluded (default=NA)") | |
| 201 | |
| 202 parser.add_option("-G", dest="nogc", action="store_true", \ | |
| 203 default=False, help="do not match gc-contents") | |
| 204 | |
| 205 parser.add_option("-R", dest="norpt", action="store_true", \ | |
| 206 default=False, help="do not match repeats") | |
| 207 | |
| 208 parser.add_option("-m", dest="max_trys", type="int", \ | |
| 209 default=10000, help="number of maximum trys to sample of one sequence (default=10000)") | |
| 210 | |
| 211 parser.add_option("-o", dest="output", default="nullseq_output.bed", \ | |
| 212 help="set the name of output file (default=nullseq_output.bed)") | |
| 213 | |
| 214 parser.add_option("-q", dest="quiet", default=False, action="store_true", \ | |
| 215 help="supress messages (default=false)") | |
| 216 | |
| 217 (options, args) = parser.parse_args() | |
| 218 | |
| 219 if len(args) == 0: | |
| 220 parser.print_help() | |
| 221 sys.exit(0) | |
| 222 | |
| 223 if len(args) != 3: | |
| 224 parser.error("incorrect number of arguments") | |
| 225 parser.print_help() | |
| 226 sys.exit(0) | |
| 227 | |
| 228 | |
| 229 posfile = args[0] | |
| 230 buildname = args[1] | |
| 231 basedir = args[2] | |
| 232 | |
| 233 random.seed(options.rseed) | |
| 234 | |
| 235 if options.quiet == False: | |
| 236 sys.stderr.write('Options:\n') | |
| 237 sys.stderr.write(' N fold: ' + str(options.fold) + '\n') | |
| 238 sys.stderr.write(' GC match: ' + str(not options.nogc) + '\n') | |
| 239 sys.stderr.write(' repeat match: ' + str(not options.norpt) + '\n') | |
| 240 sys.stderr.write(' GC error allowed: ' + str(options.gc_err) + '\n') | |
| 241 sys.stderr.write(' repeat error allowed: ' + str(options.rpt_err) + '\n') | |
| 242 sys.stderr.write(' random seed: ' + str(options.rseed) + '\n') | |
| 243 sys.stderr.write(' max trys: ' + str(options.max_trys) + '\n') | |
| 244 sys.stderr.write(' excluded regions: ' + options.excludefile+ '\n') | |
| 245 sys.stderr.write(' output: ' + options.output + '\n') | |
| 246 sys.stderr.write('\n') | |
| 247 | |
| 248 sys.stderr.write('Input args:\n') | |
| 249 sys.stderr.write(' input bed file: ' + posfile+ '\n') | |
| 250 sys.stderr.write(' genome build name: ' + buildname+ '\n') | |
| 251 sys.stderr.write(' basedir: ' + basedir + '\n') | |
| 252 sys.stderr.write('\n') | |
| 253 | |
| 254 positions = read_bed_file(posfile) | |
| 255 | |
| 256 sample_sequences(positions, buildname, basedir, options) | |
| 257 | |
| 258 if __name__ == "__main__": main() |
