1
|
1 import argparse
|
|
2 import math
|
|
3 import subprocess
|
|
4
|
|
5 if __name__ == "__main__":
|
|
6 desc = "Create BAM interval file for easily parallelism in Galaxy"
|
|
7 parser = argparse.ArgumentParser(description=desc)
|
|
8 parser.add_argument(
|
|
9 "--mode", "-m",
|
|
10 required=True,
|
|
11 choices=['by_rname', 'by_chunk']
|
|
12 )
|
|
13 parser.add_argument(
|
|
14 "--input", "-i",
|
|
15 type=str,
|
|
16 required=True,
|
|
17 help="Input BAM file"
|
|
18 )
|
|
19 parser.add_argument(
|
|
20 "--output", "-o",
|
|
21 type=str,
|
|
22 required=True,
|
|
23 help="Output BED file"
|
|
24 )
|
|
25 parser.add_argument(
|
|
26 "--order_file", "-of",
|
|
27 type=str,
|
|
28 required=False,
|
|
29 help="Generate a file with the original contig order"
|
|
30 )
|
|
31 parser.add_argument(
|
|
32 "--chunk_size", "-s",
|
|
33 type=int,
|
|
34 required=False,
|
|
35 default=50000000,
|
|
36 help="The number of bases to include in a chunk interval, required when 'by_chunks' specified"
|
|
37 )
|
|
38 parser.add_argument(
|
|
39 "--chromosome",
|
|
40 action="store_true",
|
|
41 required=False,
|
|
42 help="Add the Chromosome to the Interval File"
|
|
43 )
|
|
44 parser.add_argument(
|
|
45 "--start_position",
|
|
46 action="store_true",
|
|
47 required=False,
|
|
48 help="Add the Start Position to the Interval File"
|
|
49 )
|
|
50 parser.add_argument(
|
|
51 "--end_position",
|
|
52 action="store_true",
|
|
53 required=False,
|
|
54 help="Add the End Position to the Interval File"
|
|
55 )
|
|
56 parser.add_argument(
|
|
57 "--interval",
|
|
58 action="store_true",
|
|
59 required=False,
|
|
60 help="Add the entire interval to the interval file"
|
|
61 )
|
|
62 parser.add_argument(
|
|
63 "--prefixes_to_ignore",
|
|
64 nargs="+",
|
|
65 required=False,
|
|
66 help="Ignore Intervals that contains the following prefix"
|
|
67 )
|
|
68 parser.add_argument(
|
|
69 "--group_according_to_largest_chromosome",
|
|
70 action="store_true",
|
|
71 required=False,
|
|
72 help="Output a set of files whose length do not sum larger then the largest chromosome"
|
|
73 )
|
|
74 parser.add_argument(
|
|
75 "--output_dir",
|
|
76 type=str,
|
|
77 default="outputs",
|
|
78 help="Output directory for collection output"
|
|
79 )
|
|
80 args = parser.parse_args()
|
|
81
|
|
82
|
|
83 order_file = None
|
|
84 if not args.order_file == None:
|
|
85 order_file = open(args.order_file, 'w')
|
|
86
|
|
87 rnames = []
|
|
88 data = {}
|
|
89 commands = [
|
|
90 "samtools", "view",
|
|
91 "-H", args.input
|
|
92 ]
|
|
93 p = subprocess.Popen(commands, stdout=subprocess.PIPE)
|
|
94 out, err = p.communicate()
|
|
95 seqs = []
|
|
96 lengths = []
|
|
97 for line in out.split("\n"):
|
|
98 if not line.find("@SQ") == -1:
|
|
99 tmp = line.split("\t")
|
|
100 seqs.append(tmp[1][3:])
|
|
101 lengths.append(tmp[2][3:])
|
|
102
|
|
103
|
|
104 chunk_size = args.chunk_size
|
|
105
|
|
106 for i in range(len(seqs)):
|
|
107 rname = seqs[i]
|
|
108 length = lengths[i]
|
|
109
|
|
110 if not rname in data:
|
|
111 rnames.append(rname)
|
|
112 data[rname] = []
|
|
113
|
|
114 if args.mode == 'by_rname':
|
|
115 chunk_size = int(length)
|
|
116 for i in range(int(math.ceil(float(length) / float(chunk_size)))):
|
|
117 if (i+1) * chunk_size > length:
|
|
118 data[rname].append([rname,(i * chunk_size) + 1, length])
|
|
119 else:
|
|
120 data[rname].append([rname,(i * chunk_size) + 1, (i+1) * chunk_size])
|
|
121
|
|
122 if args.group_according_to_largest_chromosome :
|
|
123 ids = []
|
|
124 id_to_length = []
|
|
125 bins_to_length = []
|
|
126 bins_to_id = []
|
|
127 for rname in rnames:
|
|
128 ids.append(rname)
|
|
129
|
|
130 for i in range(len(ids)):
|
|
131 id_to_length.append(data[ids[i]][-1][2])
|
|
132
|
|
133 max_length = 0
|
|
134 for i in range(len(id_to_length)):
|
|
135 if max_length < id_to_length[i]:
|
|
136 max_length = id_to_length[i]
|
|
137
|
|
138 for i in range(len(ids)):
|
|
139 rname = ids[i]
|
|
140 if any([not rname.find(x) == -1 for x in args.prefixes_to_ignore]):
|
|
141 continue
|
|
142 if not order_file == None:
|
|
143 order_file.write(''.join([rname, '\n']))
|
|
144
|
|
145 if bins_to_id == []:
|
|
146 bins_to_id.append([rname])
|
|
147 bins_to_length.append(id_to_length[i])
|
|
148 else:
|
|
149 added = False
|
|
150 for bin_id in range(len(bins_to_length)):
|
|
151 if bins_to_length[bin_id] + id_to_length[i] < max_length:
|
|
152 bins_to_id[bin_id].append(rname)
|
|
153 bins_to_length[bin_id] += id_to_length[i]
|
|
154 added = True
|
|
155 break
|
|
156 if added == False:
|
|
157 bins_to_id.append([rname])
|
|
158 bins_to_length.append(id_to_length[i])
|
|
159
|
|
160 if not order_file == None:
|
|
161 order_file.close()
|
|
162
|
|
163 for i in range(len(bins_to_id)):
|
|
164 output = open(''.join(["./outputs/samp", str(i+1),".bed"]), 'w')
|
|
165 for rname in bins_to_id[i]:
|
|
166 line = ''.join(['%s\n' % rname])
|
|
167 output.write(line)
|
|
168 output.close()
|
|
169
|
|
170
|
|
171 else:
|
|
172 output = open(args.output,'w')
|
|
173 for i in range(len(seqs)):
|
|
174 rname = seqs[i]
|
|
175 for interval in data[rname]:
|
|
176 if any([not interval[0].find(x) == -1 for x in args.prefixes_to_ignore]):
|
|
177 continue
|
|
178 line = ''
|
|
179 if args.interval:
|
|
180 line = ''.join([line, '%s:%s-%s\t' % (interval[0], interval[1], interval[2])])
|
|
181 if args.chromosome:
|
|
182 line = ''.join([line,'%s\t' % interval[0]])
|
|
183 if args.start_position:
|
|
184 line = ''.join([line,'%s\t' % interval[1]])
|
|
185 if args.end_position:
|
|
186 line = ''.join([line,'%s\t' % interval[2]])
|
|
187 output.write(''.join([line[:-1], '\n']))
|