comparison dotPrep.py @ 0:732267cab191 draft default tip

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/dotprep commit 6db87bb19217d256b13cd66810043b667d1c7638
author bgruening
date Wed, 03 Dec 2025 16:17:07 +0000
parents
children
#! /usr/bin/env python

# Author: Maria Nattestad
# Email: maria.nattestad@gmail.com

import argparse
import gzip
import operator
import re
import time

import numpy as np


def run(args):
    filename = args.delta
    unique_length = args.unique_length
    output_filename = args.out
    keep_small_uniques = True
    max_overview_alignments = args.overview

    header_lines_by_query, lines_by_query = get_query_ref_combinations(filename)

    unique_alignments = calculate_uniqueness(
        header_lines_by_query, lines_by_query, unique_length, keep_small_uniques
    )

    reference_lengths, fields_by_query = write_filtered_delta_file(
        filename,
        output_filename,
        unique_alignments,
        unique_length,
        header_lines_by_query,
    )

    index_for_dot(reference_lengths, fields_by_query, output_filename, max_overview_alignments)


def scrub(string):
    # Replace characters that serve as delimiters in the coords and index
    # output (comma separator, "!" section marker, "~" list joiner, "#" header).
    return (
        string.replace(",", "_")
        .replace("!", "_")
        .replace("~", "_")
        .replace("#", "_")
    )


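# First pass over the delta file. A nucmer-style delta file is assumed here:
# two header lines, then ">ref query ref_length query_length" records, each
# followed by alignment rows whose first four fields are
# ref_start ref_end query_start query_end.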
def get_query_ref_combinations(filename):
    print("header from delta file:")
    try:
        f = gzip.open(filename, "rt")
        print(f.readline().strip())
    except OSError:
        f = open(filename, "r")
        print(f.readline().strip())

    print(f.readline().strip())
    linecounter = 0
    current_query_name = ""
    current_header = ""
    lines_by_query = {}
    header_lines_by_query = {}
    before = time.time()

    for line in f:
        if line[0] == ">":
            linecounter += 1
            current_header = line.strip()
            current_query_name = scrub(current_header.split()[1])

            if header_lines_by_query.get(current_query_name) is None:
                lines_by_query[current_query_name] = []
                header_lines_by_query[current_query_name] = []
        else:
            fields = line.strip().split()
            if len(fields) > 4:
                query_min = min(int(fields[2]), int(fields[3]))
                query_max = max(int(fields[2]), int(fields[3]))
                lines_by_query[current_query_name].append((query_min, query_max))
                header_lines_by_query[current_query_name].append(current_header)

    f.close()
    print(
        "First read through the file: %d seconds for %d query-reference combinations"
        % (time.time() - before, linecounter)
    )
    return header_lines_by_query, lines_by_query

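# Illustrative shape of the returned values (hypothetical names):
#   lines_by_query        = {"query1": [(0, 15000), (10000, 30000)], ...}
#   header_lines_by_query = {"query1": [">ref1 query1 5000000 42000", ...], ...}
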
def calculate_uniqueness(header_lines_by_query, lines_by_query, unique_length, keep_small_uniques):
    before = time.time()
    unique_alignments = {}
    num_queries = len(lines_by_query)
    print("Filtering alignments of %d queries" % num_queries)

    # Integer step so the progress modulus check below works under Python 3,
    # where "/" is float division.
    num_query_step_to_report = max(num_queries // 100, 1)
    query_counter = 0

    for query in lines_by_query:
        unique_alignments[query] = summarize_planesweep(
            lines_by_query[query],
            unique_length_required=unique_length,
            keep_small_uniques=keep_small_uniques,
        )
        query_counter += 1
        if query_counter % num_query_step_to_report == 0:
            print("Progress: %d%%" % (query_counter * 100 / num_queries))

    print("Progress: 100%")
    print(
        "Deciding which alignments to keep: %d seconds for %d queries"
        % (time.time() - before, num_queries)
    )
    return unique_alignments


def summarize_planesweep(lines, unique_length_required, keep_small_uniques=False):
    unique_alignments = []
    if len(lines) == 0:
        return []
    if len(lines) == 1:
        if keep_small_uniques or abs(lines[0][1] - lines[0][0]) >= unique_length_required:
            return [0]
        return []

    # Plane sweep over interval endpoints: spans covered by exactly one
    # alignment are collected as unique intervals.
    starts_and_stops = []
    for query_min, query_max in lines:
        starts_and_stops.append((query_min, "start"))
        starts_and_stops.append((query_max, "stop"))

    sorted_starts_and_stops = sorted(starts_and_stops, key=operator.itemgetter(0))
    current_coverage = 0
    last_position = -1
    sorted_unique_intervals_left = []
    sorted_unique_intervals_right = []

    for pos, change in sorted_starts_and_stops:
        if current_coverage == 1:
            sorted_unique_intervals_left.append(last_position)
            sorted_unique_intervals_right.append(pos)
        if change == "start":
            current_coverage += 1
        else:
            current_coverage -= 1
        last_position = pos

    # Keep an alignment when the unique intervals inside it add up to at least
    # unique_length_required, or when it exactly matches a unique interval and
    # keep_small_uniques is set.
    linecounter = 0
    for query_min, query_max in lines:
        i = binary_search(query_min, sorted_unique_intervals_left, 0, len(sorted_unique_intervals_left))
        exact_match = False
        if i < len(sorted_unique_intervals_left):
            if (
                sorted_unique_intervals_left[i] == query_min
                and sorted_unique_intervals_right[i] == query_max
            ):
                exact_match = True
        sum_uniq = 0
        while (
            i < len(sorted_unique_intervals_left)
            and sorted_unique_intervals_left[i] >= query_min
            and sorted_unique_intervals_right[i] <= query_max
        ):
            sum_uniq += sorted_unique_intervals_right[i] - sorted_unique_intervals_left[i]
            i += 1
        if sum_uniq >= unique_length_required:
            unique_alignments.append(linecounter)
        elif keep_small_uniques and exact_match:
            unique_alignments.append(linecounter)
        linecounter += 1

    return unique_alignments

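# Worked example (values are illustrative, not from the original source):
#   summarize_planesweep([(0, 15000), (10000, 30000)], 10000)
# The sweep yields unique intervals [0, 10000] and [15000, 30000]; each
# alignment therefore contains >= 10000 bp of unique sequence, so both indices
# are kept and the call returns [0, 1].
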
def binary_search(query, numbers, left, right):
    # Returns an index of query in the sorted list numbers, or the insertion
    # point if absent (equivalent to bisect.bisect_left when numbers has no
    # duplicates).
    if left >= right:
        return right
    mid = (left + right) // 2
    if query == numbers[mid]:
        return mid
    if query < numbers[mid]:
        return binary_search(query, numbers, left, mid)
    return binary_search(query, numbers, mid + 1, right)


def natural_key(string_):
    return [int(s) if s.isdigit() else s for s in re.split(r"(\d+)", string_)]
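# For example, natural_key("chr10") -> ["chr", 10, ""], so sorting by this key
# places chr2 before chr10 rather than in lexicographic order.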


def write_filtered_delta_file(filename, output_filename, unique_alignments, unique_length, header_lines_by_query):
    before = time.time()
    f_out_delta = gzip.open(
        f"{output_filename}.uniqueAnchorFiltered_l{unique_length}.delta.gz",
        "wt",
    )
    try:
        f = gzip.open(filename, "rt")
        header1 = f.readline()
    except OSError:
        f = open(filename, "r")
        header1 = f.readline()

    f_out_delta.write(header1)
    f_out_delta.write(f.readline())
    linecounter = 0

    list_of_unique_alignments = []
    alignment_counter = {}
    keep_printing = False
    ref_sequences = set()
    query_sequences = set()
    reference_lengths = []
    query_lengths = {}
    fields_by_query = {}

    for line in f:
        linecounter += 1
        if line[0] == ">":
            fields = line.strip().split()
            query = scrub(fields[1])
            list_of_unique_alignments = unique_alignments[query]

            # Keep the header only if at least one alignment under it survived
            # the uniqueness filter.
            header_needed = any(
                line.strip() == header_lines_by_query[query][index]
                for index in list_of_unique_alignments
            )
            if header_needed:
                f_out_delta.write(line)

            alignment_counter[query] = alignment_counter.get(query, 0)
            current_reference_name = scrub(fields[0][1:])
            current_query_name = scrub(fields[1])
            current_reference_size = int(fields[2])
            current_query_size = int(fields[3])

            if current_reference_name not in ref_sequences:
                reference_lengths.append((current_reference_name, current_reference_size))
                ref_sequences.add(current_reference_name)
            if current_query_name not in query_sequences:
                query_lengths[current_query_name] = current_query_size
                query_sequences.add(current_query_name)

        else:
            fields = line.strip().split()
            if len(fields) > 4:
                ref_start = int(fields[0])
                ref_end = int(fields[1])
                query_start = int(fields[2])
                query_end = int(fields[3])
                csv_tag = "repetitive"
                if alignment_counter[query] in list_of_unique_alignments:
                    f_out_delta.write(line)
                    csv_tag = "unique"
                    keep_printing = True
                else:
                    keep_printing = False
                record = [
                    ref_start,
                    ref_end,
                    query_start,
                    query_end,
                    current_reference_size,
                    current_query_size,
                    current_reference_name,
                    current_query_name,
                    csv_tag,
                ]
                if fields_by_query.get(current_query_name) is None:
                    fields_by_query[current_query_name] = []
                fields_by_query[current_query_name].append(record)
                alignment_counter[query] += 1
            elif keep_printing:
                f_out_delta.write(line)

    f.close()
    f_out_delta.close()
    print(
        "Writing filtered delta file and capturing information for coords file: "
        "%d seconds for %d total lines in file"
        % (time.time() - before, linecounter)
    )
    return reference_lengths, fields_by_query


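# Writes "<prefix>.coords" (CSV records grouped under "!query!tag" section
# markers) and "<prefix>.coords.idx" with #ref, #query, and #overview sections;
# the byte positions recorded per query let Dot seek straight to its block.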
def index_for_dot(reference_lengths, fields_by_query, output_prefix, max_overview_alignments):
    reference_lengths.sort(key=lambda x: natural_key(x[0]))
    cumulative_sum = 0
    ref_chrom_offsets = {}
    queries_by_reference = {}

    for ref_name, ref_length in reference_lengths:
        ref_chrom_offsets[ref_name] = cumulative_sum
        cumulative_sum += ref_length
        queries_by_reference[ref_name] = set()

    flip_by_query = {}
    unique_references_by_query = {}
    all_references_by_query = {}
    relative_ref_position_by_query = []
    ordered_tags = ["unique", "repetitive"]

    f_out_coords = open(f"{output_prefix}.coords", "w", encoding="utf-8")
    f_out_coords.write("ref_start,ref_end,query_start,query_end,ref\n")

    query_byte_positions = {}
    query_lengths = {}
    all_alignments = []
    last_query = ""

    for query_name, lines in fields_by_query.items():
        sum_forward = 0
        sum_reverse = 0
        ref_position_scores = []
        unique_references_by_query[query_name] = set()
        all_references_by_query[query_name] = set()

        for fields in lines:
            tag = fields[8]
            query_lengths[query_name] = int(fields[5])
            ref_name = fields[6]
            all_references_by_query[query_name].add(ref_name)
            if tag == "unique":
                query_start = int(fields[2])
                query_stop = int(fields[3])
                ref_start = int(fields[0])
                ref_stop = int(fields[1])
                alignment_length = abs(query_stop - query_start)
                unique_references_by_query[query_name].add(ref_name)
                queries_by_reference[ref_name].add(query_name)
                ref_position_scores.append(
                    ref_chrom_offsets[ref_name] + (ref_start + ref_stop) / 2
                )
                if query_stop < query_start:
                    sum_reverse += alignment_length
                else:
                    sum_forward += alignment_length

        # Orient the query by whichever strand carries more aligned sequence.
        flip = sum_reverse > sum_forward
        flip_by_query[query_name] = "-" if flip else "+"

        # Record where the previous query's sections ended, once, before any of
        # this query's sections are written.
        query_byte_positions[(last_query, "end")] = f_out_coords.tell()
        for tag in ordered_tags:
            query_byte_positions[(query_name, tag)] = f_out_coords.tell()
            f_out_coords.write(f"!{query_name}!{tag}\n")
            for fields in lines:
                if fields[8] == tag:
                    if flip:
                        fields[2] = int(fields[5]) - int(fields[2])
                        fields[3] = int(fields[5]) - int(fields[3])
                    output_fields = [fields[0], fields[1], fields[2], fields[3], fields[6]]
                    f_out_coords.write(",".join(str(i) for i in output_fields) + "\n")
                    alignment_length = abs(int(fields[3]) - int(fields[2]))
                    all_alignments.append((fields, alignment_length))

        rel_pos = np.median(ref_position_scores) if ref_position_scores else 0
        relative_ref_position_by_query.append((query_name, rel_pos))
        last_query = query_name

    query_byte_positions[(last_query, "end")] = f_out_coords.tell()
    relative_ref_position_by_query.sort(key=lambda x: x[1])

    f_out_index = open(f"{output_prefix}.coords.idx", "w", encoding="utf-8")
    f_out_index.write("#ref\n")
    f_out_index.write("ref,ref_length,matching_queries\n")
    for ref_name, ref_length in reference_lengths:
        f_out_index.write(
            f"{ref_name},{ref_length},{'~'.join(queries_by_reference[ref_name])}\n"
        )

    f_out_index.write("#query\n")
    f_out_index.write(
        "query,query_length,orientation,bytePosition_unique,"
        "bytePosition_repetitive,bytePosition_end,unique_matching_refs,matching_refs\n"
    )

    for query_name, _ in relative_ref_position_by_query:
        f_out_index.write(
            f"{query_name},{query_lengths[query_name]},{flip_by_query[query_name]},"
            f"{query_byte_positions[(query_name, 'unique')]},"
            f"{query_byte_positions[(query_name, 'repetitive')] - query_byte_positions[(query_name, 'unique')]},"
            f"{query_byte_positions[(query_name, 'end')] - query_byte_positions[(query_name, 'repetitive')]},"
            f"{'~'.join(unique_references_by_query[query_name])},"
            f"{'~'.join(all_references_by_query[query_name])}\n"
        )

    f_out_index.write("#overview\n")
    f_out_index.write("ref_start,ref_end,query_start,query_end,ref,query,tag\n")

    num_overview_alignments = min(max_overview_alignments, len(all_alignments))
    if num_overview_alignments < len(all_alignments):
        print(
            f"Included the longest {max_overview_alignments} alignments in the index "
            f"under #overview (change this with the --overview parameter), "
            f"out of a total of {len(all_alignments)} alignments."
        )

    # Longest alignments first; only the top num_overview_alignments are kept.
    all_alignments.sort(key=lambda x: -x[1])
    for fields, _ in all_alignments[:num_overview_alignments]:
        f_out_index.write(",".join(str(i) for i in fields[:9]) + "\n")

    f_out_coords.close()
    f_out_index.close()


def main():
    parser = argparse.ArgumentParser(
        description="Take a delta file, apply Assemblytics unique anchor filtering, "
        "and prepare coordinates input files for Dot"
    )
    parser.add_argument("--delta", help="delta file", dest="delta", type=str, required=True)
    parser.add_argument(
        "--out",
        help="prefix for output files (default: output)",
        dest="out",
        type=str,
        default="output",
    )
    parser.add_argument(
        "--unique-length",
        help="Minimum unique query sequence length required (default: 10000)",
        dest="unique_length",
        type=int,
        default=10000,
    )
    parser.add_argument(
        "--overview",
        help="Number of alignments to include in the coords.idx overview (default: 1000)",
        dest="overview",
        type=int,
        default=1000,
    )
    parser.set_defaults(func=run)
    args = parser.parse_args()
    args.func(args)


if __name__ == "__main__":
    main()
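
# Example invocation (hypothetical file names): given nucmer output,
#   python dotPrep.py --delta alignments.delta.gz --out sample
# this writes sample.coords, sample.coords.idx, and
# sample.uniqueAnchorFiltered_l10000.delta.gz (10000 being the default
# --unique-length).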