dotPrep.py @ 0:732267cab191 (draft, default, tip)
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/dotprep commit 6db87bb19217d256b13cd66810043b667d1c7638
| field | value |
|---|---|
| author | bgruening |
| date | Wed, 03 Dec 2025 16:17:07 +0000 |
| parents | |
| children | |
| compared from | compared to |
|---|---|
| -1:000000000000 (null revision; file added in this commit) | 0:732267cab191 |
```python
#! /usr/bin/env python

# Author: Maria Nattestad
# Email: maria.nattestad@gmail.com

import argparse
import gzip
import operator
import re
import time

import numpy as np
```
```python
def run(args):
    filename = args.delta
    unique_length = args.unique_length
    output_filename = args.out
    keep_small_uniques = True
    max_overview_alignments = args.overview

    # Pass 1: collect alignment headers and query intervals from the delta file.
    header_lines_by_query, lines_by_query = get_query_ref_combinations(filename)

    # Decide, per query, which alignments are anchored by enough unique sequence.
    unique_alignments = calculate_uniqueness(
        header_lines_by_query, lines_by_query, unique_length, keep_small_uniques
    )

    # Pass 2: write the filtered delta file and capture coordinates.
    reference_lengths, fields_by_query = write_filtered_delta_file(
        filename,
        output_filename,
        unique_alignments,
        unique_length,
        header_lines_by_query,
    )

    # Write the .coords and .coords.idx files that Dot reads.
    index_for_dot(reference_lengths, fields_by_query, output_filename, max_overview_alignments)
```
```python
def scrub(string):
    # Replace characters that the .coords/.coords.idx writers below use as
    # delimiters (",", "!", "~", "#") so sequence names cannot break parsing.
    return (
        string.replace(",", "_")
        .replace("!", "_")
        .replace("~", "_")
        .replace("#", "_")
    )
```
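A quick illustration of what `scrub` protects against: the output writers below use `,` as the column separator, `~` to join lists of sequence names, `!` to mark per-query sections in the `.coords` file, and `#` to mark sections of the index, so none of these characters may survive in a sequence name. The name here is hypothetical:

```python
# Hypothetical sequence name containing all four reserved characters.
print(scrub("contig_1,round#2~final!"))  # -> contig_1_round_2_final_
```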
```python
def get_query_ref_combinations(filename):
    print("header from delta file:")
    # Delta files may be gzipped; fall back to plain text if gzip decoding fails.
    try:
        f = gzip.open(filename, "rt")
        print(f.readline().strip())
    except OSError:
        f = open(filename, "r")
        print(f.readline().strip())

    print(f.readline().strip())
    linecounter = 0
    current_query_name = ""
    current_header = ""
    lines_by_query = {}
    header_lines_by_query = {}
    before = time.time()

    for line in f:
        if line[0] == ">":
            # Header line: ">ref query ref_length query_length".
            linecounter += 1
            current_header = line.strip()
            current_query_name = scrub(current_header.split()[1])

            if header_lines_by_query.get(current_query_name) is None:
                lines_by_query[current_query_name] = []
                header_lines_by_query[current_query_name] = []
        else:
            fields = line.strip().split()
            # Alignment lines carry seven numbers; single-number indel lines
            # are skipped here.
            if len(fields) > 4:
                query_min = min(int(fields[2]), int(fields[3]))
                query_max = max(int(fields[2]), int(fields[3]))
                lines_by_query[current_query_name].append((query_min, query_max))
                header_lines_by_query[current_query_name].append(current_header)

    f.close()
    print(
        "First read through the file: %d seconds for %d query-reference combinations"
        % (time.time() - before, linecounter)
    )
    return header_lines_by_query, lines_by_query
```
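For orientation, here is a toy sketch of the nucmer delta format this parser expects: a first line naming the reference and query files, a `NUCMER` line, then per query-reference pair a `>ref query ref_len query_len` header followed by alignment lines (seven numbers, the first four being ref start/end and query start/end) and indel offsets. The values below are invented, and the snippet assumes the functions above are in scope:

```python
toy_delta = """\
/path/to/ref.fasta /path/to/query.fasta
NUCMER
>chr1 contig_A 1000000 50000
1000 21000 1 20000 15 15 0
0
>chr1 contig_B 1000000 8000
500000 508000 8000 1 3 3 0
0
"""
with open("toy.delta", "w") as fh:
    fh.write(toy_delta)

headers, intervals = get_query_ref_combinations("toy.delta")
print(intervals["contig_A"])  # [(1, 20000)]
print(intervals["contig_B"])  # [(1, 8000)] -- reverse alignment, min/max applied
```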
```python
def calculate_uniqueness(header_lines_by_query, lines_by_query, unique_length, keep_small_uniques):
    before = time.time()
    unique_alignments = {}
    num_queries = len(lines_by_query)
    print("Filtering alignments of %d queries" % num_queries)

    # Integer division: a float step would almost never divide query_counter
    # evenly, so progress would rarely be reported.
    num_query_step_to_report = max(num_queries // 100, 1)
    query_counter = 0

    for query in lines_by_query:
        unique_alignments[query] = summarize_planesweep(
            lines_by_query[query],
            unique_length_required=unique_length,
            keep_small_uniques=keep_small_uniques,
        )
        query_counter += 1
        if query_counter % num_query_step_to_report == 0:
            print("Progress: %d%%" % (query_counter * 100 // num_queries))

    print("Progress: 100%")
    print(
        "Deciding which alignments to keep: %d seconds for %d queries"
        % (time.time() - before, num_queries)
    )
    return unique_alignments
```
```python
def summarize_planesweep(lines, unique_length_required, keep_small_uniques=False):
    unique_alignments = []
    if len(lines) == 0:
        return []
    if len(lines) == 1:
        # A lone alignment is kept if it is long enough, or unconditionally
        # when keep_small_uniques is set.
        if keep_small_uniques or abs(lines[0][1] - lines[0][0]) >= unique_length_required:
            return [0]
        return []

    # Plane sweep: convert each alignment's query interval into events.
    starts_and_stops = []
    for query_min, query_max in lines:
        starts_and_stops.append((query_min, "start"))
        starts_and_stops.append((query_max, "stop"))

    sorted_starts_and_stops = sorted(starts_and_stops, key=operator.itemgetter(0))
    current_coverage = 0
    last_position = -1
    # Stretches of the query covered by exactly one alignment.
    sorted_unique_intervals_left = []
    sorted_unique_intervals_right = []

    for pos, change in sorted_starts_and_stops:
        if current_coverage == 1:
            sorted_unique_intervals_left.append(last_position)
            sorted_unique_intervals_right.append(pos)
        if change == "start":
            current_coverage += 1
        else:
            current_coverage -= 1
        last_position = pos

    # Keep each alignment whose span contains enough single-coverage sequence.
    linecounter = 0
    for query_min, query_max in lines:
        i = binary_search(query_min, sorted_unique_intervals_left, 0, len(sorted_unique_intervals_left))
        exact_match = False
        if i < len(sorted_unique_intervals_left):
            if (
                sorted_unique_intervals_left[i] == query_min
                and sorted_unique_intervals_right[i] == query_max
            ):
                exact_match = True
        sum_uniq = 0
        while (
            i < len(sorted_unique_intervals_left)
            and sorted_unique_intervals_left[i] >= query_min
            and sorted_unique_intervals_right[i] <= query_max
        ):
            sum_uniq += sorted_unique_intervals_right[i] - sorted_unique_intervals_left[i]
            i += 1
        if sum_uniq >= unique_length_required:
            unique_alignments.append(linecounter)
        elif keep_small_uniques and exact_match:
            unique_alignments.append(linecounter)
        linecounter += 1

    return unique_alignments
```
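A small worked example of the plane sweep with toy intervals: the first two alignments overlap each other over most of their length, so neither retains 10 kb of single-coverage sequence, while the third is entirely unique:

```python
intervals = [(0, 12000), (2000, 14000), (20000, 45000)]
# Single-coverage stretches found by the sweep: (0, 2000), (12000, 14000),
# and (20000, 45000); only the third alignment accumulates >= 10000 bp.
print(summarize_planesweep(intervals, unique_length_required=10000))  # [2]
```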
```python
def binary_search(query, numbers, left, right):
    # Returns the index of query in numbers, or the insertion point if absent.
    if left >= right:
        return right
    mid = (left + right) // 2
    if query == numbers[mid]:
        return mid
    if query < numbers[mid]:
        return binary_search(query, numbers, left, mid)
    return binary_search(query, numbers, mid + 1, right)
```
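On sorted, duplicate-free input this recursion returns the same index as the standard library's `bisect.bisect_left`, which could replace it; a quick sanity check:

```python
import bisect

numbers = [0, 12000, 20000]
for q in (0, 5000, 20000, 99999):
    assert binary_search(q, numbers, 0, len(numbers)) == bisect.bisect_left(numbers, q)
```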
```python
def natural_key(string_):
    # Split digit runs out of the string so "chr2" sorts before "chr10".
    return [int(s) if s.isdigit() else s for s in re.split(r"(\d+)", string_)]
```
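The effect on ordering, as used to sort `reference_lengths` in `index_for_dot` below:

```python
print(sorted(["chr10", "chr2", "chr1"], key=natural_key))
# ['chr1', 'chr2', 'chr10']  (a plain string sort would put 'chr10' before 'chr2')
```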
```python
def write_filtered_delta_file(filename, output_filename, unique_alignments, unique_length, header_lines_by_query):
    before = time.time()
    f_out_delta = gzip.open(
        f"{output_filename}.uniqueAnchorFiltered_l{unique_length}.delta.gz",
        "wt",
    )
    # As in get_query_ref_combinations: try gzip first, fall back to plain text.
    try:
        f = gzip.open(filename, "rt")
        header1 = f.readline()
    except OSError:
        f = open(filename, "r")
        header1 = f.readline()

    f_out_delta.write(header1)
    f_out_delta.write(f.readline())
    linecounter = 0

    list_of_unique_alignments = []
    alignment_counter = {}
    keep_printing = False
    ref_sequences = set()
    query_sequences = set()
    reference_lengths = []
    query_lengths = {}
    fields_by_query = {}

    for line in f:
        linecounter += 1
        if line[0] == ">":
            fields = line.strip().split()
            query = scrub(fields[1])
            list_of_unique_alignments = unique_alignments[query]

            # Only write the header if at least one of its alignments is kept.
            header_needed = any(
                line.strip() == header_lines_by_query[query][index]
                for index in list_of_unique_alignments
            )
            if header_needed:
                f_out_delta.write(line)

            alignment_counter[query] = alignment_counter.get(query, 0)
            current_reference_name = scrub(fields[0][1:])
            current_query_name = scrub(fields[1])
            current_reference_size = int(fields[2])
            current_query_size = int(fields[3])

            # Record each reference/query length only once.
            if current_reference_name not in ref_sequences:
                reference_lengths.append((current_reference_name, current_reference_size))
                ref_sequences.add(current_reference_name)
            if current_query_name not in query_sequences:
                query_lengths[current_query_name] = current_query_size
                query_sequences.add(current_query_name)

        else:
            fields = line.strip().split()
            if len(fields) > 4:
                ref_start = int(fields[0])
                ref_end = int(fields[1])
                query_start = int(fields[2])
                query_end = int(fields[3])
                csv_tag = "repetitive"
                if alignment_counter[query] in list_of_unique_alignments:
                    f_out_delta.write(line)
                    csv_tag = "unique"
                    keep_printing = True
                else:
                    keep_printing = False
                record = [
                    ref_start,
                    ref_end,
                    query_start,
                    query_end,
                    current_reference_size,
                    current_query_size,
                    current_reference_name,
                    current_query_name,
                    csv_tag,
                ]
                if fields_by_query.get(current_query_name) is None:
                    fields_by_query[current_query_name] = []
                fields_by_query[current_query_name].append(record)
                alignment_counter[query] += 1
            elif keep_printing:
                # Indel lines belonging to a kept alignment.
                f_out_delta.write(line)

    f.close()
    f_out_delta.close()
    print(
        "Writing filtered delta file and capturing information for coords file: "
        "%d seconds for %d total lines in file"
        % (time.time() - before, linecounter)
    )
    return reference_lengths, fields_by_query
```
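Each record appended to `fields_by_query` has a fixed nine-field layout that `index_for_dot` below accesses by position; spelled out with toy values:

```python
# [0] ref_start  [1] ref_end  [2] query_start  [3] query_end
# [4] ref_length [5] query_length
# [6] ref_name   [7] query_name  [8] tag ("unique" or "repetitive")
example_record = [1000, 21000, 1, 20000, 1000000, 50000, "chr1", "contig_A", "unique"]
```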
```python
def index_for_dot(reference_lengths, fields_by_query, output_prefix, max_overview_alignments):
    # Lay the references end to end so each query can be placed by its median
    # position in the concatenated reference coordinate space.
    reference_lengths.sort(key=lambda x: natural_key(x[0]))
    cumulative_sum = 0
    ref_chrom_offsets = {}
    queries_by_reference = {}

    for ref_name, ref_length in reference_lengths:
        ref_chrom_offsets[ref_name] = cumulative_sum
        cumulative_sum += ref_length
        queries_by_reference[ref_name] = set()

    flip_by_query = {}
    unique_references_by_query = {}
    all_references_by_query = {}
    relative_ref_position_by_query = []
    ordered_tags = ["unique", "repetitive"]

    f_out_coords = open(f"{output_prefix}.coords", "w", encoding="utf-8")
    f_out_coords.write("ref_start,ref_end,query_start,query_end,ref\n")

    query_byte_positions = {}
    query_lengths = {}
    all_alignments = []
    last_query = ""

    for query_name, lines in fields_by_query.items():
        sum_forward = 0
        sum_reverse = 0
        ref_position_scores = []
        unique_references_by_query[query_name] = set()
        all_references_by_query[query_name] = set()

        for fields in lines:
            tag = fields[8]
            query_lengths[query_name] = int(fields[5])
            ref_name = fields[6]
            all_references_by_query[query_name].add(ref_name)
            if tag == "unique":
                query_start = int(fields[2])
                query_stop = int(fields[3])
                ref_start = int(fields[0])
                ref_stop = int(fields[1])
                alignment_length = abs(query_stop - query_start)
                unique_references_by_query[query_name].add(ref_name)
                queries_by_reference[ref_name].add(query_name)
                ref_position_scores.append(
                    ref_chrom_offsets[ref_name] + (ref_start + ref_stop) / 2
                )
                if query_stop < query_start:
                    sum_reverse += alignment_length
                else:
                    sum_forward += alignment_length

        # Orient each query by whichever strand carries more aligned sequence.
        flip = sum_reverse > sum_forward
        flip_by_query[query_name] = "-" if flip else "+"

        for tag in ordered_tags:
            # Record byte offsets so Dot can seek straight to a query's
            # unique or repetitive alignments without scanning the file.
            query_byte_positions[(last_query, "end")] = f_out_coords.tell()
            query_byte_positions[(query_name, tag)] = f_out_coords.tell()
            f_out_coords.write(f"!{query_name}!{tag}\n")
            for fields in lines:
                if fields[8] == tag:
                    if flip:
                        fields[2] = int(fields[5]) - int(fields[2])
                        fields[3] = int(fields[5]) - int(fields[3])
                    output_fields = [fields[0], fields[1], fields[2], fields[3], fields[6]]
                    f_out_coords.write(",".join(str(i) for i in output_fields) + "\n")
                    alignment_length = abs(int(fields[3]) - int(fields[2]))
                    all_alignments.append((fields, alignment_length))

        rel_pos = np.median(ref_position_scores) if ref_position_scores else 0
        relative_ref_position_by_query.append((query_name, rel_pos))
        last_query = query_name

    query_byte_positions[(last_query, "end")] = f_out_coords.tell()
    relative_ref_position_by_query.sort(key=lambda x: x[1])

    f_out_index = open(f"{output_prefix}.coords.idx", "w", encoding="utf-8")
    f_out_index.write("#ref\n")
    f_out_index.write("ref,ref_length,matching_queries\n")
    for ref_name, ref_length in reference_lengths:
        f_out_index.write(
            f"{ref_name},{ref_length},{'~'.join(queries_by_reference[ref_name])}\n"
        )

    f_out_index.write("#query\n")
    f_out_index.write(
        "query,query_length,orientation,bytePosition_unique,"
        "bytePosition_repetitive,bytePosition_end,unique_matching_refs,matching_refs\n"
    )

    for query_name, _ in relative_ref_position_by_query:
        f_out_index.write(
            f"{query_name},{query_lengths[query_name]},{flip_by_query[query_name]},"
            f"{query_byte_positions[(query_name, 'unique')]},"
            f"{query_byte_positions[(query_name, 'repetitive')] - query_byte_positions[(query_name, 'unique')]},"
            f"{query_byte_positions[(query_name, 'end')] - query_byte_positions[(query_name, 'repetitive')]},"
            f"{'~'.join(unique_references_by_query[query_name])},"
            f"{'~'.join(all_references_by_query[query_name])}\n"
        )

    f_out_index.write("#overview\n")
    f_out_index.write("ref_start,ref_end,query_start,query_end,ref,query,tag\n")

    num_overview_alignments = min(max_overview_alignments, len(all_alignments))
    if num_overview_alignments < len(all_alignments):
        print(
            f"Included the longest {max_overview_alignments} alignments in the index "
            f"under #overview (change this with the --overview parameter), "
            f"out of a total of {len(all_alignments)} alignments."
        )

    # Longest alignments first.
    all_alignments.sort(key=lambda x: -x[1])
    for fields, _ in all_alignments[:num_overview_alignments]:
        f_out_index.write(",".join(str(i) for i in fields[:9]) + "\n")

    f_out_coords.close()
    f_out_index.close()
```
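Despite the column names, the `#query` section stores `bytePosition_unique` as an absolute offset and the next two columns as the byte lengths of the unique and repetitive sections. Since the offsets come from `tell()` on a write-only UTF-8 stream, they correspond to byte positions, so a reader can seek directly to one query's section. A rough sketch, with hypothetical offset values in place of a parsed `.coords.idx` row:

```python
# Hypothetical values parsed from one row of the #query section of .coords.idx.
offset_unique, len_unique, len_repetitive = 5321, 480, 1210

with open("output.coords", "rb") as f:
    f.seek(offset_unique)                # jump to the "!<query>!unique" marker
    unique_section = f.read(len_unique)  # exactly this query's unique rows
    print(unique_section.decode("utf-8").splitlines()[0])  # e.g. !contig_A!unique
```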
```python
def main():
    parser = argparse.ArgumentParser(
        description="Take a delta file, apply Assemblytics unique anchor filtering, "
        "and prepare coordinates input files for Dot"
    )
    parser.add_argument("--delta", help="delta file", dest="delta", type=str, required=True)
    parser.add_argument("--out", help="output file", dest="out", type=str, default="output")
    parser.add_argument(
        "--unique-length",
        help="Minimum unique query sequence length required (default: 10000)",
        dest="unique_length",
        type=int,
        default=10000,
    )
    parser.add_argument(
        "--overview",
        help="Number of alignments to include in coords.idx overview (default: 1000)",
        dest="overview",
        type=int,
        default=1000,
    )
    parser.set_defaults(func=run)
    args = parser.parse_args()
    args.func(args)


if __name__ == "__main__":
    main()
```
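Typical use, shown here programmatically (equivalent to `python dotPrep.py --delta alignments.delta --out sample`; `alignments.delta` stands in for real nucmer output):

```python
import argparse

args = argparse.Namespace(
    delta="alignments.delta", out="sample", unique_length=10000, overview=1000
)
run(args)
# Writes: sample.uniqueAnchorFiltered_l10000.delta.gz, sample.coords, sample.coords.idx
```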
