Mercurial > repos > morinlab > fetch_interval
comparison fetch_interval.py @ 1:588cdb7e20b2 draft
Uploaded
| author | morinlab |
|---|---|
| date | Sun, 04 Dec 2016 18:24:23 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 0:391b1466cab0 | 1:588cdb7e20b2 |
|---|---|
| 1 import argparse | |
| 2 import math | |
| 3 import subprocess | |
| 4 | |
| 5 if __name__ == "__main__": | |
| 6 desc = "Create BAM interval file for easily parallelism in Galaxy" | |
| 7 parser = argparse.ArgumentParser(description=desc) | |
| 8 parser.add_argument( | |
| 9 "--mode", "-m", | |
| 10 required=True, | |
| 11 choices=['by_rname', 'by_chunk'] | |
| 12 ) | |
| 13 parser.add_argument( | |
| 14 "--input", "-i", | |
| 15 type=str, | |
| 16 required=True, | |
| 17 help="Input BAM file" | |
| 18 ) | |
| 19 parser.add_argument( | |
| 20 "--output", "-o", | |
| 21 type=str, | |
| 22 required=True, | |
| 23 help="Output BED file" | |
| 24 ) | |
| 25 parser.add_argument( | |
| 26 "--order_file", "-of", | |
| 27 type=str, | |
| 28 required=False, | |
| 29 help="Generate a file with the original contig order" | |
| 30 ) | |
| 31 parser.add_argument( | |
| 32 "--chunk_size", "-s", | |
| 33 type=int, | |
| 34 required=False, | |
| 35 default=50000000, | |
| 36 help="The number of bases to include in a chunk interval, required when 'by_chunks' specified" | |
| 37 ) | |
| 38 parser.add_argument( | |
| 39 "--chromosome", | |
| 40 action="store_true", | |
| 41 required=False, | |
| 42 help="Add the Chromosome to the Interval File" | |
| 43 ) | |
| 44 parser.add_argument( | |
| 45 "--start_position", | |
| 46 action="store_true", | |
| 47 required=False, | |
| 48 help="Add the Start Position to the Interval File" | |
| 49 ) | |
| 50 parser.add_argument( | |
| 51 "--end_position", | |
| 52 action="store_true", | |
| 53 required=False, | |
| 54 help="Add the End Position to the Interval File" | |
| 55 ) | |
| 56 parser.add_argument( | |
| 57 "--interval", | |
| 58 action="store_true", | |
| 59 required=False, | |
| 60 help="Add the entire interval to the interval file" | |
| 61 ) | |
| 62 parser.add_argument( | |
| 63 "--prefixes_to_ignore", | |
| 64 nargs="+", | |
| 65 required=False, | |
| 66 help="Ignore Intervals that contains the following prefix" | |
| 67 ) | |
| 68 parser.add_argument( | |
| 69 "--group_according_to_largest_chromosome", | |
| 70 action="store_true", | |
| 71 required=False, | |
| 72 help="Output a set of files whose length do not sum larger then the largest chromosome" | |
| 73 ) | |
| 74 parser.add_argument( | |
| 75 "--output_dir", | |
| 76 type=str, | |
| 77 default="outputs", | |
| 78 help="Output directory for collection output" | |
| 79 ) | |
| 80 args = parser.parse_args() | |
| 81 | |
| 82 | |
| 83 order_file = None | |
| 84 if not args.order_file == None: | |
| 85 order_file = open(args.order_file, 'w') | |
| 86 | |
| 87 rnames = [] | |
| 88 data = {} | |
| 89 commands = [ | |
| 90 "samtools", "view", | |
| 91 "-H", args.input | |
| 92 ] | |
| 93 p = subprocess.Popen(commands, stdout=subprocess.PIPE) | |
| 94 out, err = p.communicate() | |
| 95 seqs = [] | |
| 96 lengths = [] | |
| 97 for line in out.split("\n"): | |
| 98 if not line.find("@SQ") == -1: | |
| 99 tmp = line.split("\t") | |
| 100 seqs.append(tmp[1][3:]) | |
| 101 lengths.append(tmp[2][3:]) | |
| 102 | |
| 103 | |
| 104 chunk_size = args.chunk_size | |
| 105 | |
| 106 for i in range(len(seqs)): | |
| 107 rname = seqs[i] | |
| 108 length = lengths[i] | |
| 109 | |
| 110 if not rname in data: | |
| 111 rnames.append(rname) | |
| 112 data[rname] = [] | |
| 113 | |
| 114 if args.mode == 'by_rname': | |
| 115 chunk_size = int(length) | |
| 116 for i in range(int(math.ceil(float(length) / float(chunk_size)))): | |
| 117 if (i+1) * chunk_size > length: | |
| 118 data[rname].append([rname,(i * chunk_size) + 1, length]) | |
| 119 else: | |
| 120 data[rname].append([rname,(i * chunk_size) + 1, (i+1) * chunk_size]) | |
| 121 | |
| 122 if args.group_according_to_largest_chromosome : | |
| 123 ids = [] | |
| 124 id_to_length = [] | |
| 125 bins_to_length = [] | |
| 126 bins_to_id = [] | |
| 127 for rname in rnames: | |
| 128 ids.append(rname) | |
| 129 | |
| 130 for i in range(len(ids)): | |
| 131 id_to_length.append(data[ids[i]][-1][2]) | |
| 132 | |
| 133 max_length = 0 | |
| 134 for i in range(len(id_to_length)): | |
| 135 if max_length < id_to_length[i]: | |
| 136 max_length = id_to_length[i] | |
| 137 | |
| 138 for i in range(len(ids)): | |
| 139 rname = ids[i] | |
| 140 if any([not rname.find(x) == -1 for x in args.prefixes_to_ignore]): | |
| 141 continue | |
| 142 if not order_file == None: | |
| 143 order_file.write(''.join([rname, '\n'])) | |
| 144 | |
| 145 if bins_to_id == []: | |
| 146 bins_to_id.append([rname]) | |
| 147 bins_to_length.append(id_to_length[i]) | |
| 148 else: | |
| 149 added = False | |
| 150 for bin_id in range(len(bins_to_length)): | |
| 151 if bins_to_length[bin_id] + id_to_length[i] < max_length: | |
| 152 bins_to_id[bin_id].append(rname) | |
| 153 bins_to_length[bin_id] += id_to_length[i] | |
| 154 added = True | |
| 155 break | |
| 156 if added == False: | |
| 157 bins_to_id.append([rname]) | |
| 158 bins_to_length.append(id_to_length[i]) | |
| 159 | |
| 160 if not order_file == None: | |
| 161 order_file.close() | |
| 162 | |
| 163 for i in range(len(bins_to_id)): | |
| 164 output = open(''.join(["./outputs/samp", str(i+1),".bed"]), 'w') | |
| 165 for rname in bins_to_id[i]: | |
| 166 line = ''.join(['%s\n' % rname]) | |
| 167 output.write(line) | |
| 168 output.close() | |
| 169 | |
| 170 | |
| 171 else: | |
| 172 output = open(args.output,'w') | |
| 173 for i in range(len(seqs)): | |
| 174 rname = seqs[i] | |
| 175 for interval in data[rname]: | |
| 176 if any([not interval[0].find(x) == -1 for x in args.prefixes_to_ignore]): | |
| 177 continue | |
| 178 line = '' | |
| 179 if args.interval: | |
| 180 line = ''.join([line, '%s:%s-%s\t' % (interval[0], interval[1], interval[2])]) | |
| 181 if args.chromosome: | |
| 182 line = ''.join([line,'%s\t' % interval[0]]) | |
| 183 if args.start_position: | |
| 184 line = ''.join([line,'%s\t' % interval[1]]) | |
| 185 if args.end_position: | |
| 186 line = ''.join([line,'%s\t' % interval[2]]) | |
| 187 output.write(''.join([line[:-1], '\n'])) |
