comparison make_EAR.py @ 0:6af76d4371f8 draft

planemo upload for repository https://github.com/ERGA-consortium/EARs/tree/main commit a08f73e00550020ac83f4d45045075962a8a2251
author bgruening
date Thu, 04 Jul 2024 12:43:34 +0000
parents
children 82450f7907ef
comparison
equal deleted inserted replaced
-1:000000000000 0:6af76d4371f8
1
2 import argparse
3 import glob
4 import logging
5 import math
6 import os
7 import re
8 import sys
9 from datetime import datetime
10
11 import pytz
12 import requests
13 import yaml
14 from reportlab.lib import colors
15 from reportlab.lib.pagesizes import A4
16 from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
17 from reportlab.lib.units import cm
18 from reportlab.platypus import Image, PageBreak, Paragraph, SimpleDocTemplate, Spacer, Table, TableStyle
19
20
# make_EAR_glxy.py
# CAUTION: This is for the Galaxy version!
# by Diego De Panis
# ERGA Sequencing and Assembly Committee
# Date-based version tag, embedded verbatim in the generated PDF header.
EAR_version = "v24.05.20_glxy_beta"
26
27
28 def make_report(yaml_file):
29 logging.basicConfig(filename='EAR.log', level=logging.INFO)
30 # Read the content from EAR.yaml file
31 with open(yaml_file, "r") as file:
32 yaml_data = yaml.safe_load(file)
33
34 # FUNCTIONS ###################################################################################
35
def format_number(value):
    """Format *value* with thousands separators.

    Integer-valued numbers lose their decimal part ("1234.0" -> "1,234");
    anything that cannot be parsed as a float is returned unchanged.
    """
    try:
        numeric = float(value)
    except ValueError:
        # Non-numeric input (e.g. 'NA' or ''): hand it back untouched.
        return value
    if numeric.is_integer():
        return f'{int(numeric):,}'
    return f'{numeric:,}'
48
49 # extract gfastats values
def extract_gfastats_values(content, keys):
    """Return the value following "<key>: " in *content* for every key.

    content: full text of a gfastats --nstar-report output.
    keys: metric names exactly as printed by gfastats.
    Raises IndexError if a key is absent (unchanged from the original
    behavior so callers fail loudly on malformed reports).
    """
    # re.escape: keys such as "GC content %" or "# contigs" must match
    # literally, never as regex syntax, even if new keys gain metacharacters.
    return [re.findall(f"{re.escape(key)}: (.+)", content)[0] for key in keys]
52
53 keys = [
54 "Total scaffold length",
55 "GC content %",
56 "# gaps in scaffolds",
57 "Total gap length in scaffolds",
58 "# scaffolds",
59 "Scaffold N50",
60 "Scaffold L50",
61 "Scaffold L90",
62 "# contigs",
63 "Contig N50",
64 "Contig L50",
65 "Contig L90",
66 ]
67
68 display_names = keys.copy()
69 display_names[display_names.index("Total scaffold length")] = "Total bp"
70 total_length_index = keys.index("Total scaffold length")
71 display_names[display_names.index("GC content %")] = "GC %"
72 display_names[display_names.index("Total gap length in scaffolds")] = "Total gap bp"
73 display_names[display_names.index("# scaffolds")] = "Scaffolds"
74 display_names[display_names.index("# contigs")] = "Contigs"
75
76 gaps_index = keys.index("# gaps in scaffolds")
77 exclusion_list = ["# gaps in scaffolds"]
78
79 # extract Total bp from gfastats report
def extract_total_bp_from_gfastats(gfastats_path):
    """Read a gfastats report file and return the total scaffold length
    as a comma-formatted string (e.g. '1,234,567')."""
    with open(gfastats_path, "r") as handle:
        report = handle.read()
    raw_value = re.search(r"Total scaffold length: (.+)", report).group(1)
    return f"{int(raw_value.replace(',', '')):,}"
86
87 # compute EBP quality metric
def compute_ebp_metric(haplotype, gfastats_path, qv_value):
    """Build the EBP x.y.Qz quality string for one haplotype.

    x = floor(log10(Contig N50)), y = floor(log10(Scaffold N50)),
    z = floor(QV); N50s are taken from the gfastats report at
    *gfastats_path* via extract_gfastats_values.
    """
    with open(gfastats_path, "r") as handle:
        report = handle.read()

    contig_n50, scaffold_n50 = extract_gfastats_values(
        report, ["Contig N50", "Scaffold N50"])
    contig_exp = math.floor(math.log10(int(contig_n50.replace(',', ''))))
    scaffold_exp = math.floor(math.log10(int(scaffold_n50.replace(',', ''))))
    qv_floor = math.floor(float(qv_value))

    return f"Obtained EBP quality metric for {haplotype}: {contig_exp}.{scaffold_exp}.Q{qv_floor}"
99
100 # extract qv values
def get_qv_value(file_path, order, tool, haplotype):
    """Return the QV (4th tab-separated column) of row *order* from a
    merqury .qv file, or '' on any failure.

    NOTE(review): a multi-line file is only trusted when line 3 starts
    with "Both" — presumably the merqury combined row; with exactly two
    lines, lines[2] raises IndexError, which is swallowed below and
    yields '' — confirm this matches the expected merqury layout.
    The returned field is not stripped, so it may keep a trailing
    newline; callers appear to pass it to float(), which tolerates that.
    """
    try:
        with open(file_path, 'r') as file:
            lines = file.readlines()
            if len(lines) > order and (len(lines) == 1 or lines[2].split('\t')[0].strip() == "Both"):
                target_line = lines[order]
                fourth_column_value = target_line.split('\t')[3]
                return fourth_column_value
    except Exception as e:
        # Any parse/IO problem is logged and reported as an empty value.
        logging.error(f"Error reading {file_path} for tool {tool} and haplotype {haplotype}: {str(e)}")
    return ''
112
113 # extract Kmer completeness values
def get_completeness_value(file_path, order, tool, haplotype):
    """Return the k-mer completeness (5th tab-separated column) of row
    *order* from a merqury completeness.stats file, or '' on any failure
    or when the file has no such row."""
    try:
        with open(file_path, 'r') as stats_file:
            rows = stats_file.readlines()
        if len(rows) > order:
            return rows[order].split('\t')[4].strip()
    except Exception as err:
        # Missing file or malformed row: log and fall through to ''.
        logging.warning(f"Error reading {file_path}: {str(err)}")
    return ''
125
126 # Getting kmer plots for curated asm
def get_png_files(dir_path):
    """Collect up to four *.ln.png k-mer plot images from *dir_path*,
    padding the result with None (and logging a warning) when fewer
    than four are present."""
    images = glob.glob(f"{dir_path}/*.ln.png")
    if len(images) < 4:
        logging.warning(f"Warning: Less than 4 png files found in {dir_path}. If this is diploid, some images may be missing.")
        # Pad with None so callers can always unpack four slots.
        images += [None] * (4 - len(images))
    return images[:4]
135
136 # get unique part in file names
def find_unique_parts(file1, file2):
    """Compare two dot-separated filenames and return a pair of
    space-joined strings: the parts only in *file1*, and the parts
    only in *file2*."""
    segments1 = file1.split('.')
    segments2 = file2.split('.')
    only_in_first = ' '.join(seg for seg in segments1 if seg not in segments2)
    only_in_second = ' '.join(seg for seg in segments2 if seg not in segments1)
    return only_in_first, only_in_second
146
147 # extract BUSCO values
def extract_busco_values(file_path):
    """Parse a BUSCO short-summary file and return the
    (S, D, F, M) percentage strings, each ending in '%'.
    Returns four empty strings when the file is missing or malformed."""
    try:
        with open(file_path, 'r') as summary:
            text = summary.read()
        # The one-line results string looks like:
        # C:99.5%[S:99.0%,D:0.5%],F:0.2%,M:0.3%,n:255
        results_line = re.findall(r"C:.*n:\d+", text)[0]
        return tuple(
            re.findall(rf"{label}:(\d+\.\d+%)", results_line)[0]
            for label in ("S", "D", "F", "M")
        )
    except Exception as err:
        logging.warning(f"Error reading {file_path}: {str(err)}")
        return '', '', '', ''
161
162 # extract BUSCO info
def extract_busco_info(file_path):
    """Extract (version, lineage_info) from a BUSCO short-summary file.

    lineage_info is the tuple (lineage name, count, BUSCO count) or None;
    version is the BUSCO version string or None. Errors are logged and
    leave both values as None.
    """
    busco_version = None
    lineage_info = None

    try:
        with open(file_path, 'r') as summary:
            text = summary.read()
        version_hit = re.search(r"# BUSCO version is: ([\d.]+)", text)
        if version_hit:
            busco_version = version_hit.group(1)
        # Newer summaries say "number of genomes"; older ones "number of
        # species" — try both patterns in that order.
        for pattern in (
            r"The lineage dataset is: (.*?) \(Creation date:.*?, number of genomes: (\d+), number of BUSCOs: (\d+)\)",
            r"The lineage dataset is: (.*?) \(Creation date:.*?, number of species: (\d+), number of BUSCOs: (\d+)\)",
        ):
            lineage_hit = re.search(pattern, text)
            if lineage_hit:
                lineage_info = lineage_hit.groups()
                break
    except Exception as err:
        logging.warning(f"Error reading {file_path}: {str(err)}")

    return busco_version, lineage_info
185
186 # Function to check and generate warning messages
def generate_warning_paragraphs(expected, observed, trait):
    """Compare an expected vs. observed genome trait and return a list of
    reportlab Paragraphs flagging discrepancies (empty when consistent).

    Uses the enclosing scope's `styles` sheet. Any parse error is logged
    and produces no warning.
    """
    flagged = []
    try:
        if trait == "Haploid size (bp)":
            # Comma-formatted strings -> ints; flag >20% size deviation.
            expected_bp = int(expected.replace(',', ''))
            observed_bp = int(observed.replace(',', ''))
            if abs(expected_bp - observed_bp) / expected_bp > 0.20:
                flagged.append(Paragraph(
                    f". Observed {trait} has >20% difference with Expected",
                    styles["midiStyle"]))
        elif trait in ["Haploid Number", "Ploidy"]:
            # Integer counts must match exactly.
            if int(expected) != int(observed):
                flagged.append(Paragraph(
                    f". Observed {trait} is different from Expected",
                    styles["midiStyle"]))
        elif trait == "Sample Sex":
            # Case- and whitespace-insensitive string comparison.
            if expected.strip().lower() != observed.strip().lower():
                flagged.append(Paragraph(
                    ". Observed sex is different from Sample sex",
                    styles["midiStyle"]))
    except Exception as err:
        logging.warning(f"Error in generating warning for {trait}: {str(err)}")
    return flagged
211
212 # Generate warnings for curated haplotypes (qv, kcomp, busco)
def generate_curated_warnings(haplotype, qv_value, completeness_value, busco_scores):
    """Check one curated haplotype against EBP thresholds (QV >= 40,
    k-mer completeness >= 90, BUSCO S >= 90%, BUSCO D <= 5%) and return
    reportlab Paragraphs for every threshold it misses.

    Uses the enclosing scope's `styles` sheet; parse errors are logged
    and yield no warnings.
    """
    flagged = []
    try:
        qv = float(qv_value)
        kmer_completeness = float(completeness_value)
        busco_single = float(busco_scores[0].rstrip('%'))
        busco_duplicated = float(busco_scores[1].rstrip('%'))

        threshold_checks = [
            (qv < 40, f". QV value is less than 40 for {haplotype}"),
            (kmer_completeness < 90, f". Kmer completeness value is less than 90 for {haplotype}"),
            (busco_single < 90, f". BUSCO single copy value is less than 90% for {haplotype}"),
            (busco_duplicated > 5, f". BUSCO duplicated value is more than 5% for {haplotype}"),
        ]
        for failed, message in threshold_checks:
            if failed:
                flagged.append(Paragraph(message, styles["midiStyle"]))
    except Exception as err:
        logging.warning(f"Error in generating warnings for {haplotype}: {str(err)}")

    return flagged
246
247 # Generate warnings for curated haplotypes (loss, gaps, 90inChrom)
def generate_assembly_warnings(asm_data, gaps_per_gbp_data, obs_haploid_num):
    """Build warning Paragraphs per curated haplotype: >3% length loss
    from pre-curation to curated, >1000 gaps/Gbp, and Scaffold L90
    above the observed haploid number (i.e. <90% of the assembly in
    chromosome-scale scaffolds).

    NOTE(review): reads `asm_stages`, `gfastats_data`, `display_names`
    and `styles` from the enclosing make_report scope; despite its name,
    `asm_stages` holds haplotype labels, not stage names.
    """
    warnings = []

    # Iterate over haplotypes and generate warnings based on the criteria
    for haplotype in asm_stages:
        pre_curation_bp = extract_total_bp_from_gfastats(asm_data['Pre-curation'][haplotype]['gfastats--nstar-report_txt'])
        curated_bp = extract_total_bp_from_gfastats(asm_data['Curated'][haplotype]['gfastats--nstar-report_txt'])
        scaffold_l90 = float(gfastats_data[('Curated', haplotype)][display_names.index('Scaffold L90')].replace(',', ''))

        # Check for assembly length loss > 3%
        # (values are comma-formatted strings from extract_total_bp_from_gfastats)
        if pre_curation_bp and curated_bp:
            loss_percentage = (float(pre_curation_bp.replace(',', '')) - float(curated_bp.replace(',', ''))) / float(pre_curation_bp.replace(',', '')) * 100
            if loss_percentage > 3:
                warnings.append(Paragraph(f". Assembly length loss > 3% for {haplotype}", styles["midiStyle"]))

        # Check for more than 1000 gaps/Gbp
        # NOTE(review): gaps_per_gbp_data stores '' on parse failure, which
        # would make this `>` comparison raise TypeError — confirm upstream
        # always yields a number for curated assemblies.
        gaps_gbp = gaps_per_gbp_data.get(('Curated', haplotype), 0)
        if gaps_gbp > 1000:
            warnings.append(Paragraph(f". More than 1000 gaps/Gbp for {haplotype}", styles["midiStyle"]))

        # Check if Scaffold L90 value is more than Observed Haploid number
        # NOTE(review): obs_haploid_num defaults to "NA" upstream, which
        # float() would reject — verify the YAML always provides a number.
        if scaffold_l90 > float(obs_haploid_num):
            warnings.append(Paragraph(f". Not 90% of assembly in chromosomes for {haplotype}", styles["midiStyle"]))

    return warnings
273
274 # Parse pipeline and generate "tree"
def generate_pipeline_tree(pipeline_data):
    """Render a pipeline description as an HTML-ish tree for reportlab.

    Each entry looks like "tool_vVERSION|param1|param2"; the version
    defaults to NA when no '_v' marker is present, and missing params
    are shown as a single NA line. Lines are joined with <br/> and
    indented with &nbsp; entities.
    """
    pad = "&nbsp;" * 2  # Adjust indent spacing as needed

    rendered = []
    for entry in pipeline_data:
        fields = entry.split('|')
        tool_version = fields[0]
        if '_v' in tool_version:
            tool, version = tool_version.split('_v')
        else:
            tool, version = tool_version, "NA"

        # Everything after the tool token is a comma-joined parameter list.
        params = ', '.join(fields[1:]) if len(fields) > 1 else "NA"

        rendered.append(f"- <b>{tool}</b>")
        rendered.append(f"{pad*2}|_ <i>ver:</i> {version}")

        if params != "NA":
            for param in params.split(','):
                param = param.strip()
                rendered.append(f"{pad*2}|_ <i>key param:</i> {param if param else 'NA'}")
        else:
            rendered.append(f"{pad*2}|_ <i>key param:</i> NA")

    # Paragraph flowables understand <br/> as a line break.
    return "<br/>".join(rendered)
308
309 # Reading SAMPLE INFORMATION section from yaml ################################################
310
311 # Check for required fields
312 required_fields = ["ToLID", "Species", "Sex", "Submitter", "Affiliation", "Tags"]
313 missing_fields = [field for field in required_fields if field not in yaml_data or not yaml_data[field]]
314
315 if missing_fields:
316 logging.error(f"# GENERAL INFORMATION section in the yaml file is missing or empty for the following information: {', '.join(missing_fields)}")
317 sys.exit(1)
318
319 # Check that "Species" field is a string
320 if not isinstance(yaml_data["Species"], str):
321 logging.error(f"# GENERAL INFORMATION section in the yaml file contains incorrect data type for 'Species'. Expected 'str' but got '{type(yaml_data['Species']).__name__}'.")
322 sys.exit(1)
323
324 # Get data for Header, ToLID table and submitter
325 tol_id = yaml_data["ToLID"]
326 species = yaml_data["Species"]
327 sex = yaml_data["Sex"]
328 submitter = yaml_data["Submitter"]
329 affiliation = yaml_data["Affiliation"]
330 tags = yaml_data["Tags"]
331
332 # Check if tag is valid
333 valid_tags = ["ERGA-BGE", "ERGA-Pilot", "ERGA-Satellite"]
334 if tags not in valid_tags:
335 tags += "[INVALID TAG]"
336 logging.warning("# SAMPLE INFORMATION section in the yaml file contains an invalid tag. Valid tags are ERGA-BGE, ERGA-Pilot and ERGA-Satellite")
337
338 # Get data from GoaT based on species name
339 # urllib.parse.quote to handle special characters and spaces in the species name
340 species_name = requests.utils.quote(species)
341
342 # Get stuff from GoaT
343 goat_response = requests.get(f'https://goat.genomehubs.org/api/v2/search?query=tax_name%28{species_name}%29&result=taxon')
344 goat_data = goat_response.json() # convert json to dict
345
346 taxon_number = goat_data['results'][0]['result']['taxon_id']
347
348 goat_results = goat_data['results']
349
350 class_name = 'NA'
351 order_name = 'NA'
352 haploid_number = 'NA'
353 haploid_source = 'NA'
354 ploidy = 'NA'
355 ploidy_source = 'NA'
356
357 for result in goat_results:
358 lineage = result['result']['lineage']
359 for node in lineage:
360 if node['taxon_rank'] == 'class':
361 class_name = node['scientific_name']
362 if node['taxon_rank'] == 'order':
363 order_name = node['scientific_name']
364
365 goat_second_response = requests.get(f'https://goat.genomehubs.org/api/v2/record?recordId={taxon_number}&result=taxon&taxonomy=ncbi')
366 goat_second_data = goat_second_response.json()
367
368 ploidy_info = goat_second_data['records'][0]['record']['attributes']['ploidy']
369
370 ploidy = ploidy_info['value']
371 ploidy_source = ploidy_info['aggregation_source']
372
373 haploid_info = goat_second_data['records'][0]['record']['attributes']['haploid_number']
374
375 haploid_number = haploid_info['value']
376 haploid_source = haploid_info['aggregation_source']
377
378 sp_data = [
379 ["TxID", "ToLID", "Species", "Class", "Order"],
380 [taxon_number, tol_id, species, class_name, order_name]
381 ]
382
383 # Transpose the data
384 transposed_sp_data = list(map(list, zip(*sp_data)))
385
386 # Reading SEQUENCING DATA section from yaml ###################################################
387
388 # get DATA section from yaml
389 data_list = yaml_data.get('DATA', [])
390
391 # Prepare headers
392 headers = ['Data']
393 data_values = ['Coverage']
394
395 # Extract data from YAML and format it for the table
396 for item in data_list:
397 for technology, coverage in item.items():
398 headers.append(technology)
399 data_values.append('NA' if not coverage else coverage)
400
401 # Create a list of lists for the table
402 table_data = [headers, data_values]
403
404 # Extract pipeline data from 'Pre-curation' category
405 asm_pipeline_data = yaml_data.get('ASSEMBLIES', {}).get('Pre-curation', {}).get('pipeline', [])
406 asm_pipeline_tree = generate_pipeline_tree(asm_pipeline_data)
407
408 # Extract pipeline data from 'Curated' category
409 curation_pipeline_data = yaml_data.get('ASSEMBLIES', {}).get('Curated', {}).get('pipeline', [])
410 curation_pipeline_tree = generate_pipeline_tree(curation_pipeline_data)
411
412 # Reading GENOME PROFILING DATA section from yaml #############################################
413
414 profiling_data = yaml_data.get('PROFILING')
415
416 # Check if profiling_data is available
417 if not profiling_data:
418 logging.error('Error: No profiling data found in the YAML file.')
419 sys.exit(1)
420
421 # Handle GenomeScope specific processing
422 genomescope_data = profiling_data.get('GenomeScope')
423 if genomescope_data:
424 summary_file = genomescope_data.get('genomescope_summary_txt')
425 if summary_file and os.path.exists(summary_file):
426 with open(summary_file, "r") as f:
427 summary_txt = f.read()
428 genome_haploid_length = re.search(r"Genome Haploid Length\s+([\d,]+) bp", summary_txt).group(1)
429 proposed_ploidy_match = re.search(r"p = (\d+)", summary_txt)
430 proposed_ploidy = proposed_ploidy_match.group(1) if proposed_ploidy_match else 'NA'
431 else:
432 logging.error(f"File {summary_file} not found for GenomeScope.")
433 sys.exit(1)
434 else:
435 logging.error("GenomeScope data is missing in the PROFILING section.")
436 sys.exit(1)
437
438 # Handle Smudgeplot specific processing
439 smudgeplot_data = profiling_data.get('Smudgeplot')
440 if smudgeplot_data:
441 verbose_summary_file = smudgeplot_data.get('smudgeplot_verbose_summary_txt')
442 if verbose_summary_file and os.path.exists(verbose_summary_file):
443 with open(verbose_summary_file, "r") as f:
444 smud_summary_txt = f.readlines()
445 for line in smud_summary_txt:
446 if line.startswith("* Proposed ploidy"):
447 proposed_ploidy = line.split(":")[1].strip()
448 break
449 else:
450 logging.warning(f"Verbose summary file {verbose_summary_file} not found for Smudgeplot; skipping detailed Smudgeplot analysis.")
451 else:
452 logging.warning("Smudgeplot data is missing in the PROFILING section; skipping Smudgeplot analysis.")
453
454 # Reading ASSEMBLY DATA section from yaml #####################################################
455
456 asm_data = yaml_data.get('ASSEMBLIES', {})
457
458 # make a list from the assemblies available in asm_data
459 asm_stages = []
460 for asm_stage, stage_properties in asm_data.items():
461 for haplotypes in stage_properties.keys():
462 if haplotypes != 'pipeline' and haplotypes not in asm_stages:
463 asm_stages.append(haplotypes)
464
465 # get gfastats-based data
466 gfastats_data = {}
467 for asm_stage, stage_properties in asm_data.items():
468 for haplotypes, haplotype_properties in stage_properties.items():
469 if isinstance(haplotype_properties, dict):
470 if 'gfastats--nstar-report_txt' in haplotype_properties:
471 file_path = haplotype_properties['gfastats--nstar-report_txt']
472 with open(file_path, 'r') as file:
473 content = file.read()
474 gfastats_data[(asm_stage, haplotypes)] = extract_gfastats_values(content, keys)
475
476 gaps_per_gbp_data = {}
477 for (asm_stage, haplotypes), values in gfastats_data.items():
478 try:
479 gaps = float(values[gaps_index])
480 total_length = float(values[total_length_index])
481 gaps_per_gbp = round((gaps / total_length * 1_000_000_000), 2)
482 gaps_per_gbp_data[(asm_stage, haplotypes)] = gaps_per_gbp
483 except (ValueError, ZeroDivisionError):
484 gaps_per_gbp_data[(asm_stage, haplotypes)] = ''
485
486 # Define the contigging table (column names) DON'T MOVE THIS AGAIN!!!!!!!
487 asm_table_data = [["Metrics"] + [f'{asm_stage} \n {haplotypes}' for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]]]
488
489 # Fill the table with the gfastats data
490 for i in range(len(display_names)):
491 metric = display_names[i]
492 if metric not in exclusion_list:
493 asm_table_data.append([metric] + [format_number(gfastats_data.get((asm_stage, haplotypes), [''])[i]) if (asm_stage, haplotypes) in gfastats_data else '' for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]])
494
495 # Add the gaps/gbp in between
496 gc_index = display_names.index("GC %")
497 gc_index
498 asm_table_data.insert(gaps_index + 1, ['Gaps/Gbp'] + [format_number(gaps_per_gbp_data.get((asm_stage, haplotypes), '')) for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]])
499
500 # get QV, Kmer completeness and BUSCO data
501 qv_data = {}
502 completeness_data = {}
503 busco_data = {metric: {} for metric in ['BUSCO sing.', 'BUSCO dupl.', 'BUSCO frag.', 'BUSCO miss.']}
504 for asm_stage, stage_properties in asm_data.items():
505 asm_stage_elements = [element for element in stage_properties.keys() if element != 'pipeline']
506 for i, haplotypes in enumerate(asm_stage_elements):
507 haplotype_properties = stage_properties[haplotypes]
508 if isinstance(haplotype_properties, dict):
509 if 'merqury_qv' in haplotype_properties:
510 qv_data[(asm_stage, haplotypes)] = get_qv_value(haplotype_properties['merqury_qv'], i, asm_stage, haplotypes)
511 if 'merqury_completeness_stats' in haplotype_properties:
512 completeness_data[(asm_stage, haplotypes)] = get_completeness_value(haplotype_properties['merqury_completeness_stats'], i, asm_stage, haplotypes)
513 if 'busco_short_summary_txt' in haplotype_properties:
514 s_value, d_value, f_value, m_value = extract_busco_values(haplotype_properties['busco_short_summary_txt'])
515 busco_data['BUSCO sing.'].update({(asm_stage, haplotypes): s_value})
516 busco_data['BUSCO dupl.'].update({(asm_stage, haplotypes): d_value})
517 busco_data['BUSCO frag.'].update({(asm_stage, haplotypes): f_value})
518 busco_data['BUSCO miss.'].update({(asm_stage, haplotypes): m_value})
519
520 # Fill the table with the QV data
521 asm_table_data.append(['QV'] + [qv_data.get((asm_stage, haplotypes), '') for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]])
522
523 # Fill the table with the Kmer completeness data
524 asm_table_data.append(['Kmer compl.'] + [completeness_data.get((asm_stage, haplotypes), '') for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]])
525
526 # Fill the table with the BUSCO data
527 for metric in ['BUSCO sing.', 'BUSCO dupl.', 'BUSCO frag.', 'BUSCO miss.']:
528 asm_table_data.append([metric] + [busco_data[metric].get((asm_stage, haplotypes), '') for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]])
529
530 # Reading CURATION NOTES section from yaml ####################################################
531
532 obs_haploid_num = yaml_data.get("NOTES", {}).get("Obs_Haploid_num", "NA")
533 obs_sex = yaml_data.get("NOTES", {}).get("Obs_Sex", "NA")
534 interventions_per_gb = yaml_data.get("NOTES", {}).get("Interventions_per_Gb", "NA")
535 contamination_notes = yaml_data.get("NOTES", {}).get("Contamination_notes", "NA")
536 other_notes = yaml_data.get("NOTES", {}).get("Other_notes", "NA")
537
538 # Extract Total bp for each haplotype and find the maximum
539 curated_assemblies = yaml_data.get('ASSEMBLIES', {}).get('Curated', {})
540 total_bp_values = []
541 for haplotype, properties in curated_assemblies.items():
542 if 'gfastats--nstar-report_txt' in properties:
543 total_bp = extract_total_bp_from_gfastats(properties['gfastats--nstar-report_txt'])
544 total_bp_values.append(total_bp)
545
546 max_total_bp = max(total_bp_values, default='NA')
547
548 # Create table data
549 genome_traits_table_data = [
550 ["Genome Traits", "Expected", "Observed"],
551 ["Haploid size (bp)", genome_haploid_length, f"{max_total_bp}"],
552 ["Haploid Number", f"{haploid_number} (source: {haploid_source})", obs_haploid_num],
553 ["Ploidy", f"{ploidy} (source: {ploidy_source})", proposed_ploidy],
554 ["Sample Sex", sex, obs_sex]
555 ]
556
557 # Get curator notes
558 curator_notes_text = (
559 f". Interventions/Gb: {interventions_per_gb}<br/>"
560 f". Contamination notes: &quot;{contamination_notes}&quot;<br/>"
561 f". Other observations: &quot;{other_notes}&quot;"
562 )
563
564 # PDF CONSTRUCTION ############################################################################
565
566 # Set up the PDF file
567 pdf_filename = "EAR.pdf"
568 margin = 0.5 * 72 # 0.5 inch in points (normal margin is 1 inch)
569 pdf = SimpleDocTemplate(pdf_filename,
570 pagesize=A4,
571 leftMargin=margin,
572 rightMargin=margin,
573 topMargin=margin,
574 bottomMargin=margin)
575 elements = []
576
577 # Set all the styles
578 styles = getSampleStyleSheet()
579 styles.add(ParagraphStyle(name='TitleStyle', fontName='Courier', fontSize=20))
580 styles.add(ParagraphStyle(name='subTitleStyle', fontName='Courier', fontSize=16))
581 styles.add(ParagraphStyle(name='normalStyle', fontName='Courier', fontSize=12))
582 styles.add(ParagraphStyle(name='midiStyle', fontName='Courier', fontSize=10))
583 styles.add(ParagraphStyle(name='LinkStyle', fontName='Courier', fontSize=10, textColor='blue', underline=True))
584 styles.add(ParagraphStyle(name='treeStyle', fontName='Courier', fontSize=10, leftIndent=12))
585 styles.add(ParagraphStyle(name='miniStyle', fontName='Courier', fontSize=8))
586 styles.add(ParagraphStyle(name='FileNameStyle', fontName='Courier', fontSize=6))
587
588 # PDF SECTION 1 -------------------------------------------------------------------------------
589
590 # Add the title
591 title = Paragraph("ERGA Assembly Report", styles['TitleStyle'])
592 elements.append(title)
593
594 # Spacer
595 elements.append(Spacer(1, 12))
596
597 # Add version
598 ver_paragraph = Paragraph(EAR_version, styles['normalStyle'])
599 elements.append(ver_paragraph)
600
601 # Spacer
602 elements.append(Spacer(1, 12))
603
604 # Add tags
605 tags_paragraph = Paragraph(f"Tags: {tags}", styles['normalStyle'])
606 elements.append(tags_paragraph)
607
608 # Spacer
609 elements.append(Spacer(1, 24))
610
611 # Create the SPECIES DATA table with the transposed data
612 sp_data_table = Table(transposed_sp_data)
613
614 # Style the table
615 sp_data_table.setStyle(TableStyle([
616 ("BACKGROUND", (0, 0), (0, -1), '#e7e7e7'), # Grey background for column 1
617 ("BACKGROUND", (1, 0), (1, -1), colors.white), # White background for column 2
618 ("ALIGN", (0, 0), (-1, -1), "CENTER"),
619 ('FONTNAME', (0, 0), (0, 0), 'Courier'), # Regular font for row1, col1
620 ('FONTNAME', (1, 0), (1, 0), 'Courier'),
621 ('FONTNAME', (0, 1), (-1, -1), 'Courier'), # Regular font for the rest of the table
622 ('FONTNAME', (1, 1), (1, 1), 'Courier-Bold'), # Bold font for row1, col2
623 ("FONTSIZE", (0, 0), (-1, -1), 14),
624 ('BOTTOMPADDING', (0, 0), (-1, -1), 8),
625 ("GRID", (0, 0), (-1, -1), 0.5, colors.black)
626 ]))
627
628 # Add SPECIES DATA table
629 elements.append(sp_data_table)
630
631 # Spacer
632 elements.append(Spacer(1, 32))
633
634 # Create the GENOME TRAITS table
635 genome_traits_table = Table(genome_traits_table_data)
636
637 # Style the table
638 genome_traits_table.setStyle(TableStyle([
639 ('BACKGROUND', (0, 0), (0, -1), '#e7e7e7'),
640 ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
641 ('FONTNAME', (0, 0), (-1, -1), 'Courier'),
642 ('FONTSIZE', (0, 0), (-1, -1), 12),
643 ('BOTTOMPADDING', (0, 0), (-1, -1), 8),
644 ("GRID", (0, 0), (-1, -1), 0.5, colors.black)
645 ]))
646
647 # Add GENOME TRAITS table
648 elements.append(genome_traits_table)
649
650 # Spacer
651 elements.append(Spacer(1, 28))
652
653 # Add EBP METRICS SECTION subtitle
654 subtitle = Paragraph("EBP metrics summary and curation notes", styles['subTitleStyle'])
655 elements.append(subtitle)
656
657 # Spacer
658 elements.append(Spacer(1, 24))
659
660 # Iterate over haplotypes in the Curated category to get data for EBP metrics
661 curated_assemblies = yaml_data.get('ASSEMBLIES', {}).get('Curated', {})
662 haplotype_names = [key for key in curated_assemblies.keys() if key != 'pipeline']
663
664 for haplotype in haplotype_names:
665 properties = curated_assemblies[haplotype]
666 if 'gfastats--nstar-report_txt' in properties and 'merqury_qv' in properties:
667 gfastats_path = properties['gfastats--nstar-report_txt']
668 order = haplotype_names.index(haplotype) # Determine the order based on the position of the haplotype in the list
669 qv_value = get_qv_value(properties['merqury_qv'], order, 'Curated', haplotype)
670
671 ebp_quality_metric = compute_ebp_metric(haplotype, gfastats_path, qv_value)
672 EBP_metric_paragraph = Paragraph(ebp_quality_metric, styles["midiStyle"])
673
674 # Add the EBP quality metric paragraph to elements
675 elements.append(EBP_metric_paragraph)
676
677 # Spacer
678 elements.append(Spacer(1, 8))
679
680 # Add sentence
681 Textline = Paragraph("The following metrics were automatically flagged as below EBP recommended standards or different from expected:", styles['midiStyle'])
682 elements.append(Textline)
683
684 # Spacer
685 elements.append(Spacer(1, 4))
686
687 # Apply checks and add warning paragraphs to elements
688 elements += generate_warning_paragraphs(genome_haploid_length, max_total_bp, "Haploid size (bp)")
689 elements += generate_warning_paragraphs(haploid_number, obs_haploid_num, "Haploid Number")
690 elements += generate_warning_paragraphs(proposed_ploidy, ploidy, "Ploidy")
691 elements += generate_warning_paragraphs(sex, obs_sex, "Sample Sex")
692
693 # Spacer
694 elements.append(Spacer(1, 4))
695
696 # Iterate over haplotypes in the Curated category and apply checks
697 for haplotype in haplotype_names:
698 properties = curated_assemblies[haplotype]
699 if isinstance(properties, dict) and 'merqury_qv' in properties and 'merqury_completeness_stats' in properties and 'busco_short_summary_txt' in properties:
700 order = haplotype_names.index(haplotype)
701 qv_value = get_qv_value(properties['merqury_qv'], order, "Curated", haplotype)
702 completeness_value = get_completeness_value(properties['merqury_completeness_stats'], order, "Curated", haplotype)
703 busco_scores = extract_busco_values(properties['busco_short_summary_txt'])
704
705 warnings = generate_curated_warnings(haplotype, qv_value, completeness_value, busco_scores)
706 elements += warnings
707
708 assembly_warnings = generate_assembly_warnings(asm_data, gaps_per_gbp_data, obs_haploid_num)
709 elements.extend(assembly_warnings)
710
711 # Spacer
712 elements.append(Spacer(1, 24))
713
714 # Add small subtitle for Curator notes
715 subtitle = Paragraph("Curator notes", styles['normalStyle'])
716 elements.append(subtitle)
717
718 # Spacer
719 elements.append(Spacer(1, 8))
720
721 # Curator notes
722 curator_notes_paragraph = Paragraph(curator_notes_text, styles["midiStyle"])
723 elements.append(curator_notes_paragraph)
724
725 # Page break
726 elements.append(PageBreak())
727
728 # PDF SECTION 2 -------------------------------------------------------------------------------
729
730 # Add quality metrics section subtitle
731 subtitle = Paragraph("Quality metrics table", styles['TitleStyle'])
732 elements.append(subtitle)
733
734 # Spacer
735 elements.append(Spacer(1, 48))
736
737 # create QUALITY METRICS table
738 asm_table = Table(asm_table_data)
739
740 # Style the table
741 asm_table.setStyle(TableStyle([
742 ('BACKGROUND', (0, 0), (-1, 0), '#e7e7e7'), # grey background for the header
743 ('ALIGN', (0, 0), (-1, -1), 'CENTER'), # center alignment
744 ('FONTNAME', (0, 0), (-1, -1), 'Courier'), # bold font for the header
745 ('FONTSIZE', (0, 0), (-1, -1), 11), # font size
746 ('BOTTOMPADDING', (0, 0), (-1, -1), 8),
747 ("GRID", (0, 0), (-1, -1), 0.5, colors.black)
748 ]))
749
750 # Add QUALITY METRICS table
751 elements.append(asm_table)
752
753 # Spacer
754 elements.append(Spacer(1, 5))
755
756 # Store BUSCO version and lineage information from each file in list
757 busco_info_list = []
758 for asm_stages, stage_properties in asm_data.items():
759 for haplotype_keys, haplotype_properties in stage_properties.items():
760 if isinstance(haplotype_properties, dict):
761 if 'busco_short_summary_txt' in haplotype_properties:
762 busco_version, lineage_info = extract_busco_info(haplotype_properties['busco_short_summary_txt'])
763 if busco_version and lineage_info:
764 busco_info_list.append((busco_version, lineage_info))
765
766 # Checking if all elements in the list are identical
767 if all(info == busco_info_list[0] for info in busco_info_list):
768 busco_version, (lineage_name, num_genomes, num_buscos) = busco_info_list[0]
769 elements.append(Paragraph(f"BUSCO {busco_version} Lineage: {lineage_name} (genomes:{num_genomes}, BUSCOs:{num_buscos})", styles['miniStyle']))
770 else:
771 elements.append(Paragraph("Warning: BUSCO versions or lineage datasets are not the same across results", styles['miniStyle']))
772 logging.warning("WARNING!!! BUSCO versions or lineage datasets are not the same across results")
773
    # Page break
    elements.append(PageBreak())

    # PDF SECTION 3 -------------------------------------------------------------------------------

    # Add hic maps section subtitle
    subtitle = Paragraph("HiC contact map of curated assembly", styles['TitleStyle'])
    elements.append(subtitle)

    # Spacer
    elements.append(Spacer(1, 36))

    # Initialize counter of processed curated stages
    tool_count = 0

    # Add title and images for each step (only the 'Curated' stage is rendered here)
    for idx, (asm_stages, stage_properties) in enumerate(asm_data.items(), 1):
        if asm_stages == 'Curated':
            # Every key except 'pipeline' is treated as a haplotype entry
            tool_elements = [element for element in stage_properties.keys() if element != 'pipeline']

            # Rows for the layout table: each row is a one-element list
            # holding an Image, a Paragraph, or a Spacer flowable
            images_with_names = []

            for haplotype in tool_elements:
                haplotype_properties = stage_properties[haplotype]

                # Check if there is an image and/or a link
                png_file = haplotype_properties.get('hic_FullMap_png', '')
                link = haplotype_properties.get('hic_FullMap_link', '')

                # Prepare paragraphs for the image and link
                if png_file:
                    # Create image object
                    img = Image(png_file, width=11 * cm, height=11 * cm)
                    images_with_names.append([img])
                else:
                    # Add paragraph for missing image
                    missing_png_paragraph = Paragraph(f"<b>{haplotype}</b> HiC PNG is missing!", styles["midiStyle"])
                    images_with_names.append([missing_png_paragraph])

                # Add paragraph for the link (always added, even when missing)
                if link:
                    link_html = f'<b>{haplotype}</b> <link href="{link}" color="blue">[LINK]</link>'
                else:
                    link_html = f'<b>{haplotype}</b> File link is missing!'

                link_paragraph = Paragraph(link_html, styles["midiStyle"])
                images_with_names.append([link_paragraph])

                # Append a spacer only if the next element is an image
                # (i.e. this is not the last haplotype)
                if len(tool_elements) > 1 and tool_elements.index(haplotype) < len(tool_elements) - 1:
                    images_with_names.append([Spacer(1, 12)])

            # Add images and names to the elements in pairs:
            # each haplotype contributes up to 3 rows (image, link, spacer),
            # so a slice of 4 rows holds roughly two image+caption pairs
            for i in range(0, len(images_with_names), 4):  # Process two images (and their names) at a time
                elements_to_add = images_with_names[i:i + 4]

                # Create table for the images and names
                table = Table(elements_to_add)
                table.hAlign = 'CENTER'
                elements.append(table)

                # Add a page break conditionally: only when the next slice
                # would start with another Image flowable
                next_elements_start = i + 4
                if next_elements_start < len(images_with_names):
                    if len(images_with_names[next_elements_start]) > 0 and isinstance(images_with_names[next_elements_start][0], Image):
                        elements.append(PageBreak())

            tool_count += 1

    elements.append(PageBreak())
844
    # PDF SECTION 4 -------------------------------------------------------------------------------

    # Add kmer spectra section subtitle
    subtitle = Paragraph("K-mer spectra of curated assembly", styles['TitleStyle'])
    elements.append(subtitle)

    # Spacer
    elements.append(Spacer(1, 48))

    # Initialize counter of spectra PNGs placed on the page
    counter = 0

    # Iterate over haplotypes in the Curated category to get K-mer spectra images
    curated_assemblies = yaml_data.get('ASSEMBLIES', {}).get('Curated', {})
    haplotype_names = [key for key in curated_assemblies.keys() if key != 'pipeline']

    # Get paths for spectra files.
    # NOTE(review): the 'common' (combined) plots are read from the hap1
    # entry — presumably Merqury writes combined output alongside hap1;
    # confirm against the YAML layout.
    spectra_files = {
        'hap1': {
            'spectra_cn_png': curated_assemblies.get('hap1', {}).get('merqury_hap_spectra_cn_png', None),
        },
        'hap2': {
            'spectra_cn_png': curated_assemblies.get('hap2', {}).get('merqury_hap_spectra_cn_png', None),
        },
        'common': {
            'spectra_cn_png': curated_assemblies.get('hap1', {}).get('merqury_spectra_cn_png', None),
            'spectra_asm_png': curated_assemblies.get('hap1', {}).get('merqury_spectra_asm_png', None),
        }
    }

    # Filter out None values and empty strings
    spectra_files = {k: {sk: v for sk, v in sv.items() if v} for k, sv in spectra_files.items()}

    # Determine the number of spectra-cn files and assign unique names if needed
    spectra_cn_files = [
        spectra_files['common'].get('spectra_cn_png', None),
        spectra_files['hap1'].get('spectra_cn_png', None),
        spectra_files['hap2'].get('spectra_cn_png', None)
    ]
    spectra_cn_files = [f for f in spectra_cn_files if f]  # Filter out None values

    if len(spectra_cn_files) == 3:
        # For 3 spectra-cn files: assumes the shortest basename is the
        # combined (diploid) plot; the other two get distinguishing labels
        shortest_spectra_cn_file = min(spectra_cn_files, key=lambda f: len(os.path.basename(f)), default=None)
        similar_files = [f for f in spectra_cn_files if f != shortest_spectra_cn_file]
        if similar_files:
            unique_name1, unique_name2 = find_unique_parts(os.path.basename(similar_files[0]), os.path.basename(similar_files[1]))
    else:
        shortest_spectra_cn_file = spectra_cn_files[0] if spectra_cn_files else None
        unique_name1 = unique_name2 = None

    # Create image objects and add filename below each image
    images = []

    for label, file_dict in spectra_files.items():
        for key, png_file in file_dict.items():
            if png_file:
                image = Image(png_file, width=8.4 * cm, height=7 * cm)
                filename = os.path.basename(png_file)

                # Pick a caption based on the Merqury output filename suffix
                if filename.endswith("spectra-asm.ln.png"):
                    text = "Distribution of k-mer counts coloured by their presence in reads/assemblies"
                elif filename.endswith("spectra-cn.ln.png"):
                    if len(spectra_cn_files) == 3:
                        # For 3 spectra-cn files use particular text
                        if png_file == shortest_spectra_cn_file:
                            text = "Distribution of k-mer counts per copy numbers found in asm (dipl.)"
                        else:
                            if png_file == spectra_files['hap1'].get('spectra_cn_png', None):
                                text = f"Distribution of k-mer counts per copy numbers found in <b>{unique_name1}</b> (hapl.)"
                            elif png_file == spectra_files['hap2'].get('spectra_cn_png', None):
                                text = f"Distribution of k-mer counts per copy numbers found in <b>{unique_name2}</b> (hapl.)"
                            else:
                                text = "Distribution of k-mer counts per copy numbers found in asm"
                    else:
                        # For 2 spectra-cn files use same text
                        text = "Distribution of k-mer counts per copy numbers found in asm"
                else:
                    # Fallback caption: the bare filename
                    text = filename

                images.append([image, Paragraph(text, styles["midiStyle"])])

    # Filter None values
    images = [img for img in images if img[0] is not None]

    # Get number of rows and columns for the table
    num_rows = (len(images) + 1) // 2  # +1 to handle odd numbers of images
    num_columns = 2

    # Create the table with dynamic size (empty cells pad the last row)
    image_table_data = [[images[i * num_columns + j] if i * num_columns + j < len(images) else [] for j in range(num_columns)] for i in range(num_rows)]
    image_table = Table(image_table_data)

    # Style the "table"
    table_style = TableStyle([
        ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
        ('BOTTOMPADDING', (0, 0), (-1, -1), 20),  # 20 here is a spacer between rows
    ])

    # Set the style
    image_table.setStyle(table_style)

    # Add image table to elements
    elements.append(image_table)

    # Increase counter by the number of PNGs added
    counter += len(images)

    # If counter is a multiple of 4, insert a page break and reset counter
    if counter % 4 == 0:
        elements.append(PageBreak())

    # Add spacer
    elements.append(Spacer(1, 12))

    # If we have processed all haps and the last page does not contain exactly 4 images, insert a page break
    if counter % 4 != 0:
        elements.append(PageBreak())
963
    # PDF SECTION 5 -------------------------------------------------------------------------------

    # Add contamination section subtitle
    subtitle = Paragraph("Post-curation contamination screening", styles['TitleStyle'])
    elements.append(subtitle)

    # Spacer
    elements.append(Spacer(1, 36))

    # Initialize counter of processed curated stages
    tool_count = 0

    # Add title and images for each step
    for idx, (asm_stages, stage_properties) in enumerate(asm_data.items(), 1):
        if asm_stages == 'Curated':  # Check if the current stage is 'Curated'
            tool_elements = [element for element in stage_properties.keys() if element != 'pipeline']

            for haplotype in tool_elements:
                haplotype_properties = stage_properties[haplotype]
                if isinstance(haplotype_properties, dict) and 'blobplot_cont_png' in haplotype_properties:
                    # Get image path
                    png_file = haplotype_properties['blobplot_cont_png']

                    # If png_file is not empty, display it
                    if png_file:
                        # Create image object (full-page blobplot)
                        img = Image(png_file, width=20 * cm, height=20 * cm)
                        elements.append(img)

                        # Create paragraph for filename with haplotype name
                        blob_text = f"<b>{haplotype}.</b> Bubble plot circles are scaled by sequence length, positioned by coverage and GC proportion, and coloured by taxonomy. Histograms show total assembly length distribution on each axis."
                        blob_paragraph = Paragraph(blob_text, styles["midiStyle"])
                        elements.append(blob_paragraph)
                    else:
                        # Add paragraph for missing image
                        missing_png_paragraph = Paragraph(f"<b>{haplotype}</b> PNG is missing!", styles["midiStyle"])
                        elements.append(missing_png_paragraph)

                    # Add a page break after each image and its description
                    elements.append(PageBreak())

            tool_count += 1
1006
    # SECTION 6 -----------------------------------------------------------------------------------

    # Add data profile section subtitle
    subtitle = Paragraph("Data profile", styles['TitleStyle'])
    elements.append(subtitle)

    # Spacer
    elements.append(Spacer(1, 24))

    # Create the DATA PROFILE table (table_data was assembled earlier in make_report)
    data_table = Table(table_data)

    # Style the table
    data_table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (0, -1), '#e7e7e7'),  # grey background for the first column
        ('ALIGN', (0, 0), (-1, -1), 'CENTER'),  # center alignment
        ('FONTNAME', (0, 0), (-1, -1), 'Courier'),  # monospaced font for the whole table
        ('FONTSIZE', (0, 0), (-1, -1), 12),  # font size
        ('BOTTOMPADDING', (0, 0), (-1, -1), 8),
        ("GRID", (0, 0), (-1, -1), 0.5, colors.black)
    ]))

    # Add DATA PROFILE table
    elements.append(data_table)

    # Spacer
    elements.append(Spacer(1, 32))

    # Add assembly pipeline section subtitle
    subtitle = Paragraph("Assembly pipeline", styles['TitleStyle'])
    elements.append(subtitle)

    # Spacer
    elements.append(Spacer(1, 24))

    # Add ASM PIPELINE tree (preformatted text rendered with the tree style)
    elements.append(Paragraph(asm_pipeline_tree, styles['treeStyle']))

    # Spacer
    elements.append(Spacer(1, 32))

    # Add curation pipeline section subtitle
    subtitle = Paragraph("Curation pipeline", styles['TitleStyle'])
    elements.append(subtitle)

    # Spacer
    elements.append(Spacer(1, 24))

    # Add CURATION PIPELINE tree
    elements.append(Paragraph(curation_pipeline_tree, styles['treeStyle']))

    # Spacer
    elements.append(Spacer(1, 48))

    # Add submitter, affiliation
    submitter_paragraph_style = ParagraphStyle(name='SubmitterStyle', fontName='Courier', fontSize=10)
    elements.append(Paragraph(f"Submitter: {submitter}", submitter_paragraph_style))
    elements.append(Paragraph(f"Affiliation: {affiliation}", submitter_paragraph_style))

    # Spacer
    elements.append(Spacer(1, 8))

    # Add the date and time (CET) of the document creation
    cet = pytz.timezone("CET")
    current_datetime = datetime.now(cet)
    formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S %Z")
    elements.append(Paragraph(f"Date and time: {formatted_datetime}", submitter_paragraph_style))

    # Build the PDF ###############################################################################
    pdf.build(elements)
1077
1078
if __name__ == "__main__":
    # Command-line entry point: the single required argument is the EAR YAML file.
    cli_parser = argparse.ArgumentParser(description='Create an ERGA Assembly Report (EAR) from a YAML file. Visit https://github.com/ERGA-consortium/EARs for more information')
    cli_parser.add_argument('yaml_file', type=str, help='Path to the YAML file')
    cli_args = cli_parser.parse_args()
    make_report(cli_args.yaml_file)