Mercurial > repos > bgruening > erga_ear
comparison make_EAR.py @ 0:6af76d4371f8 draft
planemo upload for repository https://github.com/ERGA-consortium/EARs/tree/main commit a08f73e00550020ac83f4d45045075962a8a2251
author | bgruening |
---|---|
date | Thu, 04 Jul 2024 12:43:34 +0000 |
parents | |
children | 82450f7907ef |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:6af76d4371f8 |
---|---|
1 | |
2 import argparse | |
3 import glob | |
4 import logging | |
5 import math | |
6 import os | |
7 import re | |
8 import sys | |
9 from datetime import datetime | |
10 | |
11 import pytz | |
12 import requests | |
13 import yaml | |
14 from reportlab.lib import colors | |
15 from reportlab.lib.pagesizes import A4 | |
16 from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle | |
17 from reportlab.lib.units import cm | |
18 from reportlab.platypus import Image, PageBreak, Paragraph, SimpleDocTemplate, Spacer, Table, TableStyle | |
19 | |
20 | |
21 # make_EAR_glxy.py | |
22 # CAUTION: This is for the Galaxy version! | |
23 # by Diego De Panis | |
24 # ERGA Sequencing and Assembly Committee | |
25 EAR_version = "v24.05.20_glxy_beta" | |
26 | |
27 | |
28 def make_report(yaml_file): | |
29 logging.basicConfig(filename='EAR.log', level=logging.INFO) | |
30 # Read the content from EAR.yaml file | |
31 with open(yaml_file, "r") as file: | |
32 yaml_data = yaml.safe_load(file) | |
33 | |
34 # FUNCTIONS ################################################################################### | |
35 | |
def format_number(value):
    """Render *value* with thousands separators when it is numeric.

    Whole numbers are shown without a decimal part; non-numeric input
    (e.g. 'NA' or a percentage string) is returned unchanged.
    """
    try:
        as_float = float(value)
    except ValueError:
        # Not parseable as a number -- pass the original through untouched.
        return value
    if as_float.is_integer():
        # No decimal part: format as a comma-grouped integer.
        return f'{int(as_float):,}'
    return f'{as_float:,}'
48 | |
49 # extract gfastats values | |
def extract_gfastats_values(content, keys):
    """Pull the value following each ``<key>: `` label out of a gfastats report.

    Parameters:
        content: full text of a gfastats --nstar-report output.
        keys: metric labels to look up, e.g. "Scaffold N50".

    Returns the matched values (as strings) in the same order as *keys*.
    Raises IndexError if any key is absent from the report.
    """
    # re.escape guards keys containing regex metacharacters; for the plain
    # labels used in this file the match is identical to the unescaped form.
    return [re.findall(f"{re.escape(key)}: (.+)", content)[0] for key in keys]
52 | |
53 keys = [ | |
54 "Total scaffold length", | |
55 "GC content %", | |
56 "# gaps in scaffolds", | |
57 "Total gap length in scaffolds", | |
58 "# scaffolds", | |
59 "Scaffold N50", | |
60 "Scaffold L50", | |
61 "Scaffold L90", | |
62 "# contigs", | |
63 "Contig N50", | |
64 "Contig L50", | |
65 "Contig L90", | |
66 ] | |
67 | |
68 display_names = keys.copy() | |
69 display_names[display_names.index("Total scaffold length")] = "Total bp" | |
70 total_length_index = keys.index("Total scaffold length") | |
71 display_names[display_names.index("GC content %")] = "GC %" | |
72 display_names[display_names.index("Total gap length in scaffolds")] = "Total gap bp" | |
73 display_names[display_names.index("# scaffolds")] = "Scaffolds" | |
74 display_names[display_names.index("# contigs")] = "Contigs" | |
75 | |
76 gaps_index = keys.index("# gaps in scaffolds") | |
77 exclusion_list = ["# gaps in scaffolds"] | |
78 | |
79 # extract Total bp from gfastats report | |
def extract_total_bp_from_gfastats(gfastats_path):
    """Read a gfastats report file and return its total scaffold length,
    formatted with thousands separators (e.g. "123,456,789")."""
    with open(gfastats_path, "r") as handle:
        report = handle.read()
    raw = re.search(r"Total scaffold length: (.+)", report).group(1)
    # The report value may itself carry comma separators; normalise to int
    # before re-formatting.
    return "{:,}".format(int(raw.replace(',', '')))
86 | |
87 # compute EBP quality metric | |
def compute_ebp_metric(haplotype, gfastats_path, qv_value):
    """Build the EBP x.y.Qz quality string for one haplotype.

    x = floor(log10(Contig N50)), y = floor(log10(Scaffold N50)),
    z = floor(QV); all read from the gfastats report at *gfastats_path*.
    """
    with open(gfastats_path, "r") as handle:
        report = handle.read()

    # Order matters: contig first, scaffold second (the x.y of the metric).
    n50_logs = []
    for label in ("Contig N50", "Scaffold N50"):
        raw = re.findall(f"{label}: (.+)", report)[0]
        n50_logs.append(math.floor(math.log10(int(raw.replace(',', '')))))

    qv_floor = math.floor(float(qv_value))
    return (f"Obtained EBP quality metric for {haplotype}: "
            f"{n50_logs[0]}.{n50_logs[1]}.Q{qv_floor}")
99 | |
100 # extract qv values | |
def get_qv_value(file_path, order, tool, haplotype):
    """Fetch the QV score (4th tab-separated column) for one assembly row
    of a merqury .qv file.

    *order* selects the row. Multi-row files are only trusted when the
    third row is the combined "Both" record; a single-row file is taken
    as-is. Returns '' on any read/parse failure (logged as an error).
    """
    try:
        with open(file_path, 'r') as handle:
            rows = handle.readlines()
        single_row = len(rows) == 1
        if len(rows) > order and (single_row or rows[2].split('\t')[0].strip() == "Both"):
            return rows[order].split('\t')[3]
    except Exception as exc:
        logging.error(f"Error reading {file_path} for tool {tool} and haplotype {haplotype}: {str(exc)}")
    return ''
112 | |
113 # extract Kmer completeness values | |
def get_completeness_value(file_path, order, tool, haplotype):
    """Fetch the k-mer completeness percentage (5th tab-separated column,
    stripped) from row *order* of a merqury completeness.stats file.

    Returns '' on any read/parse failure (logged as a warning).
    """
    try:
        with open(file_path, 'r') as handle:
            rows = handle.readlines()
        if len(rows) > order:
            return rows[order].split('\t')[4].strip()
    except Exception as exc:
        logging.warning(f"Error reading {file_path}: {str(exc)}")
    return ''
125 | |
126 # Getting kmer plots for curated asm | |
def get_png_files(dir_path):
    """Collect up to four merqury spectra images (*.ln.png) from *dir_path*.

    Always returns a list of exactly four entries; missing slots are padded
    with None (a warning is logged when fewer than four images are found,
    since a diploid assembly is expected to produce four).
    """
    found = glob.glob(f"{dir_path}/*.ln.png")
    if len(found) < 4:
        logging.warning(f"Warning: Less than 4 png files found in {dir_path}. If this is diploid, some images may be missing.")
        found.extend([None] * (4 - len(found)))
    return found[:4]
135 | |
136 # get unique part in file names | |
def find_unique_parts(file1, file2):
    """Return the dot-separated tokens unique to each filename.

    Each element of the returned pair is the space-joined tokens of one
    filename that do not appear anywhere in the other (used to label the
    two haplotypes' k-mer plots).
    """
    tokens1 = file1.split('.')
    tokens2 = file2.split('.')
    only_in_first = ' '.join(tok for tok in tokens1 if tok not in tokens2)
    only_in_second = ' '.join(tok for tok in tokens2 if tok not in tokens1)
    return only_in_first, only_in_second
146 | |
147 # extract BUSCO values | |
def extract_busco_values(file_path):
    """Parse the S/D/F/M percentages out of a BUSCO short-summary file.

    Returns a 4-tuple of percentage strings (single, duplicated, fragmented,
    missing), or ('', '', '', '') on any read/parse failure (logged).
    """
    try:
        with open(file_path, 'r') as handle:
            summary = handle.read()
        # The one-line results string looks like:
        # C:95.0%[S:93.1%,D:1.9%],F:2.0%,M:3.0%,n:255
        results_line = re.findall(r"C:.*n:\d+", summary)[0]
        return tuple(
            re.findall(rf"{tag}:(\d+\.\d+%)", results_line)[0]
            for tag in ("S", "D", "F", "M")
        )
    except Exception as exc:
        logging.warning(f"Error reading {file_path}: {str(exc)}")
        return '', '', '', ''
161 | |
162 # extract BUSCO info | |
def extract_busco_info(file_path):
    """Return (busco_version, lineage_info) parsed from a BUSCO short summary.

    lineage_info is a (lineage_name, count, n_buscos) tuple of strings, or
    None when the lineage line is absent. Read errors are logged and yield
    (None, None)-style partial results.
    """
    version = None
    lineage = None

    try:
        with open(file_path, 'r') as handle:
            text = handle.read()

        version_hit = re.search(r"# BUSCO version is: ([\d.]+)", text)
        if version_hit:
            version = version_hit.group(1)

        # Newer summaries say "number of genomes", older ones "number of
        # species" -- try both, keeping the first that matches.
        for noun in ("genomes", "species"):
            lineage_hit = re.search(
                rf"The lineage dataset is: (.*?) \(Creation date:.*?, number of {noun}: (\d+), number of BUSCOs: (\d+)\)",
                text)
            if lineage_hit:
                lineage = lineage_hit.groups()
                break

    except Exception as exc:
        logging.warning(f"Error reading {file_path}: {str(exc)}")

    return version, lineage
185 | |
186 # Function to check and generate warning messages | |
def generate_warning_paragraphs(expected, observed, trait):
    """Compare an expected vs observed genome trait and return a list of
    reportlab warning Paragraphs (empty when the values agree).

    Comparison rules per trait:
      - "Haploid size (bp)": flag a relative difference above 20%.
      - "Haploid Number" / "Ploidy": flag any integer inequality.
      - "Sample Sex": flag a case-insensitive, trimmed string mismatch.
    Comparison errors are logged and produce no warning.
    Relies on ``Paragraph`` and ``styles`` from the enclosing scope.
    """
    warnings = []
    try:
        if trait == "Haploid size (bp)":
            expected_bp = int(expected.replace(',', ''))
            observed_bp = int(observed.replace(',', ''))
            if abs(expected_bp - observed_bp) / expected_bp > 0.20:
                warnings.append(Paragraph(
                    f". Observed {trait} has >20% difference with Expected",
                    styles["midiStyle"]))
        elif trait in ("Haploid Number", "Ploidy"):
            # Both sides must be integer-comparable.
            if int(expected) != int(observed):
                warnings.append(Paragraph(
                    f". Observed {trait} is different from Expected",
                    styles["midiStyle"]))
        elif trait == "Sample Sex":
            if expected.strip().lower() != observed.strip().lower():
                warnings.append(Paragraph(
                    ". Observed sex is different from Sample sex",
                    styles["midiStyle"]))
    except Exception as exc:
        logging.warning(f"Error in generating warning for {trait}: {str(exc)}")
    return warnings
211 | |
212 # Generate warnings for curated haplotypes (qv, kcomp, busco) | |
def generate_curated_warnings(haplotype, qv_value, completeness_value, busco_scores):
    """Emit reportlab warning Paragraphs when curated QC values miss the
    EBP-recommended thresholds for *haplotype*.

    Thresholds: QV >= 40, k-mer completeness >= 90, BUSCO single-copy >= 90%,
    BUSCO duplicated <= 5%. Parse failures are logged and yield no warnings.
    Relies on ``Paragraph`` and ``styles`` from the enclosing scope.
    """
    warnings = []
    try:
        qv = float(qv_value)
        kmer_completeness = float(completeness_value)
        busco_single = float(busco_scores[0].rstrip('%'))
        busco_duplicated = float(busco_scores[1].rstrip('%'))

        # (condition-failed, message) pairs, in report order.
        checks = [
            (qv < 40,
             f". QV value is less than 40 for {haplotype}"),
            (kmer_completeness < 90,
             f". Kmer completeness value is less than 90 for {haplotype}"),
            (busco_single < 90,
             f". BUSCO single copy value is less than 90% for {haplotype}"),
            (busco_duplicated > 5,
             f". BUSCO duplicated value is more than 5% for {haplotype}"),
        ]
        for failed, message in checks:
            if failed:
                warnings.append(Paragraph(message, styles["midiStyle"]))

    except Exception as exc:
        logging.warning(f"Error in generating warnings for {haplotype}: {str(exc)}")

    return warnings
246 | |
247 # Generate warnings for curated haplotypes (loss, gaps, 90inChrom) | |
def generate_assembly_warnings(asm_data, gaps_per_gbp_data, obs_haploid_num):
    """Generate reportlab warning Paragraphs for curation-level assembly issues.

    For each haplotype, three checks against the Curated vs Pre-curation data:
    length loss during curation above 3%, more than 1000 gaps per Gbp, and a
    Scaffold L90 larger than the observed haploid number (i.e. less than 90%
    of the assembly in chromosome-scale scaffolds).

    NOTE(review): relies on closure variables from make_report: ``asm_stages``,
    ``gfastats_data``, ``display_names``, ``styles``, ``Paragraph`` and
    ``extract_total_bp_from_gfastats``; presumably every haplotype in
    ``asm_stages`` exists under both 'Pre-curation' and 'Curated' -- a missing
    key would raise KeyError here. Confirm against the YAML contract.
    """
    warnings = []

    # Iterate over haplotypes and generate warnings based on the criteria
    for haplotype in asm_stages:
        # Totals come back comma-formatted (e.g. "1,234,567") from
        # extract_total_bp_from_gfastats, hence the replace(',', '') below.
        pre_curation_bp = extract_total_bp_from_gfastats(asm_data['Pre-curation'][haplotype]['gfastats--nstar-report_txt'])
        curated_bp = extract_total_bp_from_gfastats(asm_data['Curated'][haplotype]['gfastats--nstar-report_txt'])
        scaffold_l90 = float(gfastats_data[('Curated', haplotype)][display_names.index('Scaffold L90')].replace(',', ''))

        # Check for assembly length loss > 3%
        if pre_curation_bp and curated_bp:
            loss_percentage = (float(pre_curation_bp.replace(',', '')) - float(curated_bp.replace(',', ''))) / float(pre_curation_bp.replace(',', '')) * 100
            if loss_percentage > 3:
                warnings.append(Paragraph(f". Assembly length loss > 3% for {haplotype}", styles["midiStyle"]))

        # Check for more than 1000 gaps/Gbp
        # NOTE(review): gaps_per_gbp_data values can be '' when the upstream
        # parse failed; '' > 1000 would raise TypeError on Python 3 -- confirm
        # inputs are always numeric or absent.
        gaps_gbp = gaps_per_gbp_data.get(('Curated', haplotype), 0)
        if gaps_gbp > 1000:
            warnings.append(Paragraph(f". More than 1000 gaps/Gbp for {haplotype}", styles["midiStyle"]))

        # Check if Scaffold L90 value is more than Observed Haploid number
        if scaffold_l90 > float(obs_haploid_num):
            warnings.append(Paragraph(f". Not 90% of assembly in chromosomes for {haplotype}", styles["midiStyle"]))

    return warnings
273 | |
274 # Parse pipeline and generate "tree" | |
def generate_pipeline_tree(pipeline_data):
    """Render the assembly/curation pipeline as an indented HTML 'tree'.

    Parameters:
        pipeline_data: list of "tool_vVERSION|param1|param2" strings; the
            version and the params are optional.

    Returns a single string using reportlab inline markup (<b>, <i>) with
    <br/> as the line separator, suitable for a Paragraph.
    """
    tree_lines = []
    indent = " " * 2  # doubled below for the branch ("|_") lines

    for tool_version_param in pipeline_data:
        parts = tool_version_param.split('|')
        tool_version = parts[0]
        # Split on the LAST '_v' so tool names that themselves contain '_v'
        # (e.g. "my_variant_caller_v2.0") don't break the 2-way unpacking,
        # which a plain split('_v') would (ValueError on 3+ pieces).
        tool, version = tool_version.rsplit('_v', 1) if '_v' in tool_version else (tool_version, "NA")

        # Everything after the first '|' is treated as a key parameter.
        param_text = ', '.join(parts[1:]) if len(parts) > 1 else "NA"

        # Tool line
        tree_lines.append(f"- <b>{tool}</b>")

        # Version line
        tree_lines.append(f"{indent * 2}|_ <i>ver:</i> {version}")

        # Param line(s): one branch per comma-separated parameter
        if param_text != "NA":
            for param in param_text.split(','):
                param = param.strip()
                tree_lines.append(f"{indent * 2}|_ <i>key param:</i> {param if param else 'NA'}")
        else:
            tree_lines.append(f"{indent * 2}|_ <i>key param:</i> NA")

    # Join lines with an HTML break so the whole tree fits one Paragraph.
    return "<br/>".join(tree_lines)
308 | |
309 # Reading SAMPLE INFORMATION section from yaml ################################################ | |
310 | |
311 # Check for required fields | |
312 required_fields = ["ToLID", "Species", "Sex", "Submitter", "Affiliation", "Tags"] | |
313 missing_fields = [field for field in required_fields if field not in yaml_data or not yaml_data[field]] | |
314 | |
315 if missing_fields: | |
316 logging.error(f"# GENERAL INFORMATION section in the yaml file is missing or empty for the following information: {', '.join(missing_fields)}") | |
317 sys.exit(1) | |
318 | |
319 # Check that "Species" field is a string | |
320 if not isinstance(yaml_data["Species"], str): | |
321 logging.error(f"# GENERAL INFORMATION section in the yaml file contains incorrect data type for 'Species'. Expected 'str' but got '{type(yaml_data['Species']).__name__}'.") | |
322 sys.exit(1) | |
323 | |
324 # Get data for Header, ToLID table and submitter | |
325 tol_id = yaml_data["ToLID"] | |
326 species = yaml_data["Species"] | |
327 sex = yaml_data["Sex"] | |
328 submitter = yaml_data["Submitter"] | |
329 affiliation = yaml_data["Affiliation"] | |
330 tags = yaml_data["Tags"] | |
331 | |
332 # Check if tag is valid | |
333 valid_tags = ["ERGA-BGE", "ERGA-Pilot", "ERGA-Satellite"] | |
334 if tags not in valid_tags: | |
335 tags += "[INVALID TAG]" | |
336 logging.warning("# SAMPLE INFORMATION section in the yaml file contains an invalid tag. Valid tags are ERGA-BGE, ERGA-Pilot and ERGA-Satellite") | |
337 | |
338 # Get data from GoaT based on species name | |
339 # urllib.parse.quote to handle special characters and spaces in the species name | |
340 species_name = requests.utils.quote(species) | |
341 | |
342 # Get stuff from GoaT | |
343 goat_response = requests.get(f'https://goat.genomehubs.org/api/v2/search?query=tax_name%28{species_name}%29&result=taxon') | |
344 goat_data = goat_response.json() # convert json to dict | |
345 | |
346 taxon_number = goat_data['results'][0]['result']['taxon_id'] | |
347 | |
348 goat_results = goat_data['results'] | |
349 | |
350 class_name = 'NA' | |
351 order_name = 'NA' | |
352 haploid_number = 'NA' | |
353 haploid_source = 'NA' | |
354 ploidy = 'NA' | |
355 ploidy_source = 'NA' | |
356 | |
357 for result in goat_results: | |
358 lineage = result['result']['lineage'] | |
359 for node in lineage: | |
360 if node['taxon_rank'] == 'class': | |
361 class_name = node['scientific_name'] | |
362 if node['taxon_rank'] == 'order': | |
363 order_name = node['scientific_name'] | |
364 | |
365 goat_second_response = requests.get(f'https://goat.genomehubs.org/api/v2/record?recordId={taxon_number}&result=taxon&taxonomy=ncbi') | |
366 goat_second_data = goat_second_response.json() | |
367 | |
368 ploidy_info = goat_second_data['records'][0]['record']['attributes']['ploidy'] | |
369 | |
370 ploidy = ploidy_info['value'] | |
371 ploidy_source = ploidy_info['aggregation_source'] | |
372 | |
373 haploid_info = goat_second_data['records'][0]['record']['attributes']['haploid_number'] | |
374 | |
375 haploid_number = haploid_info['value'] | |
376 haploid_source = haploid_info['aggregation_source'] | |
377 | |
378 sp_data = [ | |
379 ["TxID", "ToLID", "Species", "Class", "Order"], | |
380 [taxon_number, tol_id, species, class_name, order_name] | |
381 ] | |
382 | |
383 # Transpose the data | |
384 transposed_sp_data = list(map(list, zip(*sp_data))) | |
385 | |
386 # Reading SEQUENCING DATA section from yaml ################################################### | |
387 | |
388 # get DATA section from yaml | |
389 data_list = yaml_data.get('DATA', []) | |
390 | |
391 # Prepare headers | |
392 headers = ['Data'] | |
393 data_values = ['Coverage'] | |
394 | |
395 # Extract data from YAML and format it for the table | |
396 for item in data_list: | |
397 for technology, coverage in item.items(): | |
398 headers.append(technology) | |
399 data_values.append('NA' if not coverage else coverage) | |
400 | |
401 # Create a list of lists for the table | |
402 table_data = [headers, data_values] | |
403 | |
404 # Extract pipeline data from 'Pre-curation' category | |
405 asm_pipeline_data = yaml_data.get('ASSEMBLIES', {}).get('Pre-curation', {}).get('pipeline', []) | |
406 asm_pipeline_tree = generate_pipeline_tree(asm_pipeline_data) | |
407 | |
408 # Extract pipeline data from 'Curated' category | |
409 curation_pipeline_data = yaml_data.get('ASSEMBLIES', {}).get('Curated', {}).get('pipeline', []) | |
410 curation_pipeline_tree = generate_pipeline_tree(curation_pipeline_data) | |
411 | |
412 # Reading GENOME PROFILING DATA section from yaml ############################################# | |
413 | |
414 profiling_data = yaml_data.get('PROFILING') | |
415 | |
416 # Check if profiling_data is available | |
417 if not profiling_data: | |
418 logging.error('Error: No profiling data found in the YAML file.') | |
419 sys.exit(1) | |
420 | |
421 # Handle GenomeScope specific processing | |
422 genomescope_data = profiling_data.get('GenomeScope') | |
423 if genomescope_data: | |
424 summary_file = genomescope_data.get('genomescope_summary_txt') | |
425 if summary_file and os.path.exists(summary_file): | |
426 with open(summary_file, "r") as f: | |
427 summary_txt = f.read() | |
428 genome_haploid_length = re.search(r"Genome Haploid Length\s+([\d,]+) bp", summary_txt).group(1) | |
429 proposed_ploidy_match = re.search(r"p = (\d+)", summary_txt) | |
430 proposed_ploidy = proposed_ploidy_match.group(1) if proposed_ploidy_match else 'NA' | |
431 else: | |
432 logging.error(f"File {summary_file} not found for GenomeScope.") | |
433 sys.exit(1) | |
434 else: | |
435 logging.error("GenomeScope data is missing in the PROFILING section.") | |
436 sys.exit(1) | |
437 | |
438 # Handle Smudgeplot specific processing | |
439 smudgeplot_data = profiling_data.get('Smudgeplot') | |
440 if smudgeplot_data: | |
441 verbose_summary_file = smudgeplot_data.get('smudgeplot_verbose_summary_txt') | |
442 if verbose_summary_file and os.path.exists(verbose_summary_file): | |
443 with open(verbose_summary_file, "r") as f: | |
444 smud_summary_txt = f.readlines() | |
445 for line in smud_summary_txt: | |
446 if line.startswith("* Proposed ploidy"): | |
447 proposed_ploidy = line.split(":")[1].strip() | |
448 break | |
449 else: | |
450 logging.warning(f"Verbose summary file {verbose_summary_file} not found for Smudgeplot; skipping detailed Smudgeplot analysis.") | |
451 else: | |
452 logging.warning("Smudgeplot data is missing in the PROFILING section; skipping Smudgeplot analysis.") | |
453 | |
454 # Reading ASSEMBLY DATA section from yaml ##################################################### | |
455 | |
456 asm_data = yaml_data.get('ASSEMBLIES', {}) | |
457 | |
458 # make a list from the assemblies available in asm_data | |
459 asm_stages = [] | |
460 for asm_stage, stage_properties in asm_data.items(): | |
461 for haplotypes in stage_properties.keys(): | |
462 if haplotypes != 'pipeline' and haplotypes not in asm_stages: | |
463 asm_stages.append(haplotypes) | |
464 | |
465 # get gfastats-based data | |
466 gfastats_data = {} | |
467 for asm_stage, stage_properties in asm_data.items(): | |
468 for haplotypes, haplotype_properties in stage_properties.items(): | |
469 if isinstance(haplotype_properties, dict): | |
470 if 'gfastats--nstar-report_txt' in haplotype_properties: | |
471 file_path = haplotype_properties['gfastats--nstar-report_txt'] | |
472 with open(file_path, 'r') as file: | |
473 content = file.read() | |
474 gfastats_data[(asm_stage, haplotypes)] = extract_gfastats_values(content, keys) | |
475 | |
476 gaps_per_gbp_data = {} | |
477 for (asm_stage, haplotypes), values in gfastats_data.items(): | |
478 try: | |
479 gaps = float(values[gaps_index]) | |
480 total_length = float(values[total_length_index]) | |
481 gaps_per_gbp = round((gaps / total_length * 1_000_000_000), 2) | |
482 gaps_per_gbp_data[(asm_stage, haplotypes)] = gaps_per_gbp | |
483 except (ValueError, ZeroDivisionError): | |
484 gaps_per_gbp_data[(asm_stage, haplotypes)] = '' | |
485 | |
486 # Define the contigging table (column names) DON'T MOVE THIS AGAIN!!!!!!! | |
487 asm_table_data = [["Metrics"] + [f'{asm_stage} \n {haplotypes}' for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]]] | |
488 | |
489 # Fill the table with the gfastats data | |
490 for i in range(len(display_names)): | |
491 metric = display_names[i] | |
492 if metric not in exclusion_list: | |
493 asm_table_data.append([metric] + [format_number(gfastats_data.get((asm_stage, haplotypes), [''])[i]) if (asm_stage, haplotypes) in gfastats_data else '' for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]]) | |
494 | |
495 # Add the gaps/gbp in between | |
496 gc_index = display_names.index("GC %") | |
497 gc_index | |
498 asm_table_data.insert(gaps_index + 1, ['Gaps/Gbp'] + [format_number(gaps_per_gbp_data.get((asm_stage, haplotypes), '')) for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]]) | |
499 | |
500 # get QV, Kmer completeness and BUSCO data | |
501 qv_data = {} | |
502 completeness_data = {} | |
503 busco_data = {metric: {} for metric in ['BUSCO sing.', 'BUSCO dupl.', 'BUSCO frag.', 'BUSCO miss.']} | |
504 for asm_stage, stage_properties in asm_data.items(): | |
505 asm_stage_elements = [element for element in stage_properties.keys() if element != 'pipeline'] | |
506 for i, haplotypes in enumerate(asm_stage_elements): | |
507 haplotype_properties = stage_properties[haplotypes] | |
508 if isinstance(haplotype_properties, dict): | |
509 if 'merqury_qv' in haplotype_properties: | |
510 qv_data[(asm_stage, haplotypes)] = get_qv_value(haplotype_properties['merqury_qv'], i, asm_stage, haplotypes) | |
511 if 'merqury_completeness_stats' in haplotype_properties: | |
512 completeness_data[(asm_stage, haplotypes)] = get_completeness_value(haplotype_properties['merqury_completeness_stats'], i, asm_stage, haplotypes) | |
513 if 'busco_short_summary_txt' in haplotype_properties: | |
514 s_value, d_value, f_value, m_value = extract_busco_values(haplotype_properties['busco_short_summary_txt']) | |
515 busco_data['BUSCO sing.'].update({(asm_stage, haplotypes): s_value}) | |
516 busco_data['BUSCO dupl.'].update({(asm_stage, haplotypes): d_value}) | |
517 busco_data['BUSCO frag.'].update({(asm_stage, haplotypes): f_value}) | |
518 busco_data['BUSCO miss.'].update({(asm_stage, haplotypes): m_value}) | |
519 | |
520 # Fill the table with the QV data | |
521 asm_table_data.append(['QV'] + [qv_data.get((asm_stage, haplotypes), '') for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]]) | |
522 | |
523 # Fill the table with the Kmer completeness data | |
524 asm_table_data.append(['Kmer compl.'] + [completeness_data.get((asm_stage, haplotypes), '') for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]]) | |
525 | |
526 # Fill the table with the BUSCO data | |
527 for metric in ['BUSCO sing.', 'BUSCO dupl.', 'BUSCO frag.', 'BUSCO miss.']: | |
528 asm_table_data.append([metric] + [busco_data[metric].get((asm_stage, haplotypes), '') for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]]) | |
529 | |
530 # Reading CURATION NOTES section from yaml #################################################### | |
531 | |
532 obs_haploid_num = yaml_data.get("NOTES", {}).get("Obs_Haploid_num", "NA") | |
533 obs_sex = yaml_data.get("NOTES", {}).get("Obs_Sex", "NA") | |
534 interventions_per_gb = yaml_data.get("NOTES", {}).get("Interventions_per_Gb", "NA") | |
535 contamination_notes = yaml_data.get("NOTES", {}).get("Contamination_notes", "NA") | |
536 other_notes = yaml_data.get("NOTES", {}).get("Other_notes", "NA") | |
537 | |
538 # Extract Total bp for each haplotype and find the maximum | |
539 curated_assemblies = yaml_data.get('ASSEMBLIES', {}).get('Curated', {}) | |
540 total_bp_values = [] | |
541 for haplotype, properties in curated_assemblies.items(): | |
542 if 'gfastats--nstar-report_txt' in properties: | |
543 total_bp = extract_total_bp_from_gfastats(properties['gfastats--nstar-report_txt']) | |
544 total_bp_values.append(total_bp) | |
545 | |
546 max_total_bp = max(total_bp_values, default='NA') | |
547 | |
548 # Create table data | |
549 genome_traits_table_data = [ | |
550 ["Genome Traits", "Expected", "Observed"], | |
551 ["Haploid size (bp)", genome_haploid_length, f"{max_total_bp}"], | |
552 ["Haploid Number", f"{haploid_number} (source: {haploid_source})", obs_haploid_num], | |
553 ["Ploidy", f"{ploidy} (source: {ploidy_source})", proposed_ploidy], | |
554 ["Sample Sex", sex, obs_sex] | |
555 ] | |
556 | |
557 # Get curator notes | |
558 curator_notes_text = ( | |
559 f". Interventions/Gb: {interventions_per_gb}<br/>" | |
560 f". Contamination notes: "{contamination_notes}"<br/>" | |
561 f". Other observations: "{other_notes}"" | |
562 ) | |
563 | |
564 # PDF CONSTRUCTION ############################################################################ | |
565 | |
566 # Set up the PDF file | |
567 pdf_filename = "EAR.pdf" | |
568 margin = 0.5 * 72 # 0.5 inch in points (normal margin is 1 inch) | |
569 pdf = SimpleDocTemplate(pdf_filename, | |
570 pagesize=A4, | |
571 leftMargin=margin, | |
572 rightMargin=margin, | |
573 topMargin=margin, | |
574 bottomMargin=margin) | |
575 elements = [] | |
576 | |
577 # Set all the styles | |
578 styles = getSampleStyleSheet() | |
579 styles.add(ParagraphStyle(name='TitleStyle', fontName='Courier', fontSize=20)) | |
580 styles.add(ParagraphStyle(name='subTitleStyle', fontName='Courier', fontSize=16)) | |
581 styles.add(ParagraphStyle(name='normalStyle', fontName='Courier', fontSize=12)) | |
582 styles.add(ParagraphStyle(name='midiStyle', fontName='Courier', fontSize=10)) | |
583 styles.add(ParagraphStyle(name='LinkStyle', fontName='Courier', fontSize=10, textColor='blue', underline=True)) | |
584 styles.add(ParagraphStyle(name='treeStyle', fontName='Courier', fontSize=10, leftIndent=12)) | |
585 styles.add(ParagraphStyle(name='miniStyle', fontName='Courier', fontSize=8)) | |
586 styles.add(ParagraphStyle(name='FileNameStyle', fontName='Courier', fontSize=6)) | |
587 | |
588 # PDF SECTION 1 ------------------------------------------------------------------------------- | |
589 | |
590 # Add the title | |
591 title = Paragraph("ERGA Assembly Report", styles['TitleStyle']) | |
592 elements.append(title) | |
593 | |
594 # Spacer | |
595 elements.append(Spacer(1, 12)) | |
596 | |
597 # Add version | |
598 ver_paragraph = Paragraph(EAR_version, styles['normalStyle']) | |
599 elements.append(ver_paragraph) | |
600 | |
601 # Spacer | |
602 elements.append(Spacer(1, 12)) | |
603 | |
604 # Add tags | |
605 tags_paragraph = Paragraph(f"Tags: {tags}", styles['normalStyle']) | |
606 elements.append(tags_paragraph) | |
607 | |
608 # Spacer | |
609 elements.append(Spacer(1, 24)) | |
610 | |
611 # Create the SPECIES DATA table with the transposed data | |
612 sp_data_table = Table(transposed_sp_data) | |
613 | |
614 # Style the table | |
615 sp_data_table.setStyle(TableStyle([ | |
616 ("BACKGROUND", (0, 0), (0, -1), '#e7e7e7'), # Grey background for column 1 | |
617 ("BACKGROUND", (1, 0), (1, -1), colors.white), # White background for column 2 | |
618 ("ALIGN", (0, 0), (-1, -1), "CENTER"), | |
619 ('FONTNAME', (0, 0), (0, 0), 'Courier'), # Regular font for row1, col1 | |
620 ('FONTNAME', (1, 0), (1, 0), 'Courier'), | |
621 ('FONTNAME', (0, 1), (-1, -1), 'Courier'), # Regular font for the rest of the table | |
622 ('FONTNAME', (1, 1), (1, 1), 'Courier-Bold'), # Bold font for row1, col2 | |
623 ("FONTSIZE", (0, 0), (-1, -1), 14), | |
624 ('BOTTOMPADDING', (0, 0), (-1, -1), 8), | |
625 ("GRID", (0, 0), (-1, -1), 0.5, colors.black) | |
626 ])) | |
627 | |
628 # Add SPECIES DATA table | |
629 elements.append(sp_data_table) | |
630 | |
631 # Spacer | |
632 elements.append(Spacer(1, 32)) | |
633 | |
634 # Create the GENOME TRAITS table | |
635 genome_traits_table = Table(genome_traits_table_data) | |
636 | |
637 # Style the table | |
638 genome_traits_table.setStyle(TableStyle([ | |
639 ('BACKGROUND', (0, 0), (0, -1), '#e7e7e7'), | |
640 ('ALIGN', (0, 0), (-1, -1), 'CENTER'), | |
641 ('FONTNAME', (0, 0), (-1, -1), 'Courier'), | |
642 ('FONTSIZE', (0, 0), (-1, -1), 12), | |
643 ('BOTTOMPADDING', (0, 0), (-1, -1), 8), | |
644 ("GRID", (0, 0), (-1, -1), 0.5, colors.black) | |
645 ])) | |
646 | |
647 # Add GENOME TRAITS table | |
648 elements.append(genome_traits_table) | |
649 | |
650 # Spacer | |
651 elements.append(Spacer(1, 28)) | |
652 | |
653 # Add EBP METRICS SECTION subtitle | |
    # --- PDF SECTION 1 (cont.): EBP metrics summary and curation notes ---------
    subtitle = Paragraph("EBP metrics summary and curation notes", styles['subTitleStyle'])
    elements.append(subtitle)

    # Spacer
    elements.append(Spacer(1, 24))

    # Iterate over haplotypes in the Curated category to get data for EBP metrics.
    # 'pipeline' is a sibling key in the YAML, not a haplotype, so it is excluded.
    curated_assemblies = yaml_data.get('ASSEMBLIES', {}).get('Curated', {})
    haplotype_names = [key for key in curated_assemblies.keys() if key != 'pipeline']

    for haplotype in haplotype_names:
        properties = curated_assemblies[haplotype]
        # The EBP metric needs both a gfastats n*-report and a Merqury QV file;
        # haplotypes missing either are silently skipped.
        if 'gfastats--nstar-report_txt' in properties and 'merqury_qv' in properties:
            gfastats_path = properties['gfastats--nstar-report_txt']
            order = haplotype_names.index(haplotype)  # Determine the order based on the position of the haplotype in the list
            qv_value = get_qv_value(properties['merqury_qv'], order, 'Curated', haplotype)

            ebp_quality_metric = compute_ebp_metric(haplotype, gfastats_path, qv_value)
            EBP_metric_paragraph = Paragraph(ebp_quality_metric, styles["midiStyle"])

            # Add the EBP quality metric paragraph to elements
            elements.append(EBP_metric_paragraph)

    # Spacer
    elements.append(Spacer(1, 8))

    # Lead-in sentence for the automatically generated warning list below
    Textline = Paragraph("The following metrics were automatically flagged as below EBP recommended standards or different from expected:", styles['midiStyle'])
    elements.append(Textline)

    # Spacer
    elements.append(Spacer(1, 4))

    # Apply checks and add warning paragraphs to elements.
    # Each call presumably compares an expected value (from the YAML metadata)
    # against the observed value derived from the assembly — TODO confirm against
    # generate_warning_paragraphs defined earlier in this file.
    elements += generate_warning_paragraphs(genome_haploid_length, max_total_bp, "Haploid size (bp)")
    elements += generate_warning_paragraphs(haploid_number, obs_haploid_num, "Haploid Number")
    elements += generate_warning_paragraphs(proposed_ploidy, ploidy, "Ploidy")
    elements += generate_warning_paragraphs(sex, obs_sex, "Sample Sex")

    # Spacer
    elements.append(Spacer(1, 4))

    # Iterate over haplotypes in the Curated category and apply per-haplotype
    # checks (QV, k-mer completeness, BUSCO scores).
    for haplotype in haplotype_names:
        properties = curated_assemblies[haplotype]
        if isinstance(properties, dict) and 'merqury_qv' in properties and 'merqury_completeness_stats' in properties and 'busco_short_summary_txt' in properties:
            order = haplotype_names.index(haplotype)
            qv_value = get_qv_value(properties['merqury_qv'], order, "Curated", haplotype)
            completeness_value = get_completeness_value(properties['merqury_completeness_stats'], order, "Curated", haplotype)
            busco_scores = extract_busco_values(properties['busco_short_summary_txt'])

            warnings = generate_curated_warnings(haplotype, qv_value, completeness_value, busco_scores)
            elements += warnings

    # Cross-stage warnings (gaps per Gbp, haploid number) over the whole asm_data
    assembly_warnings = generate_assembly_warnings(asm_data, gaps_per_gbp_data, obs_haploid_num)
    elements.extend(assembly_warnings)

    # Spacer
    elements.append(Spacer(1, 24))

    # Add small subtitle for Curator notes
    subtitle = Paragraph("Curator notes", styles['normalStyle'])
    elements.append(subtitle)

    # Spacer
    elements.append(Spacer(1, 8))

    # Free-text curator notes taken verbatim from the YAML (curator_notes_text
    # is prepared earlier in make_report)
    curator_notes_paragraph = Paragraph(curator_notes_text, styles["midiStyle"])
    elements.append(curator_notes_paragraph)

    # Page break
    elements.append(PageBreak())
727 | |
    # PDF SECTION 2 -------------------------------------------------------------------------------

    # Add quality metrics section subtitle
    subtitle = Paragraph("Quality metrics table", styles['TitleStyle'])
    elements.append(subtitle)

    # Spacer
    elements.append(Spacer(1, 48))

    # create QUALITY METRICS table (asm_table_data is assembled earlier in make_report)
    asm_table = Table(asm_table_data)

    # Style the table
    asm_table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), '#e7e7e7'),  # grey background for the header row
        ('ALIGN', (0, 0), (-1, -1), 'CENTER'),  # center alignment
        ('FONTNAME', (0, 0), (-1, -1), 'Courier'),  # monospaced font for every cell (not bold)
        ('FONTSIZE', (0, 0), (-1, -1), 11),  # font size
        ('BOTTOMPADDING', (0, 0), (-1, -1), 8),
        ("GRID", (0, 0), (-1, -1), 0.5, colors.black)
    ]))

    # Add QUALITY METRICS table
    elements.append(asm_table)

    # Spacer
    elements.append(Spacer(1, 5))

    # Store BUSCO version and lineage information from each file in list,
    # scanning every stage/haplotype in asm_data that has a BUSCO summary
    busco_info_list = []
    for asm_stages, stage_properties in asm_data.items():
        for haplotype_keys, haplotype_properties in stage_properties.items():
            if isinstance(haplotype_properties, dict):
                if 'busco_short_summary_txt' in haplotype_properties:
                    busco_version, lineage_info = extract_busco_info(haplotype_properties['busco_short_summary_txt'])
                    if busco_version and lineage_info:
                        busco_info_list.append((busco_version, lineage_info))

    # Checking if all elements in the list are identical.
    # NOTE(review): busco_info_list[0] raises IndexError if no BUSCO summary was
    # found at all — confirm upstream validation guarantees at least one.
    if all(info == busco_info_list[0] for info in busco_info_list):
        busco_version, (lineage_name, num_genomes, num_buscos) = busco_info_list[0]
        elements.append(Paragraph(f"BUSCO {busco_version} Lineage: {lineage_name} (genomes:{num_genomes}, BUSCOs:{num_buscos})", styles['miniStyle']))
    else:
        elements.append(Paragraph("Warning: BUSCO versions or lineage datasets are not the same across results", styles['miniStyle']))
        logging.warning("WARNING!!! BUSCO versions or lineage datasets are not the same across results")

    # Page break
    elements.append(PageBreak())
776 | |
    # PDF SECTION 3 -------------------------------------------------------------------------------

    # Add hic maps section subtitle
    subtitle = Paragraph("HiC contact map of curated assembly", styles['TitleStyle'])
    elements.append(subtitle)

    # Spacer
    elements.append(Spacer(1, 36))

    # Initialize counter
    tool_count = 0

    # Add title and images for each step; only the 'Curated' stage is rendered here
    for idx, (asm_stages, stage_properties) in enumerate(asm_data.items(), 1):
        if asm_stages == 'Curated':
            tool_elements = [element for element in stage_properties.keys() if element != 'pipeline']

            # Flat list of single-item rows: [image], [link paragraph], [spacer], ...
            images_with_names = []

            for haplotype in tool_elements:
                haplotype_properties = stage_properties[haplotype]

                # Check if there is an image and/or a link
                png_file = haplotype_properties.get('hic_FullMap_png', '')
                link = haplotype_properties.get('hic_FullMap_link', '')

                # Prepare paragraphs for the image and link
                if png_file:
                    # Create image object
                    img = Image(png_file, width=11 * cm, height=11 * cm)
                    images_with_names.append([img])
                else:
                    # Add paragraph for missing image
                    missing_png_paragraph = Paragraph(f"<b>{haplotype}</b> HiC PNG is missing!", styles["midiStyle"])
                    images_with_names.append([missing_png_paragraph])

                # Add paragraph for the link
                if link:
                    link_html = f'<b>{haplotype}</b> <link href="{link}" color="blue">[LINK]</link>'
                else:
                    link_html = f'<b>{haplotype}</b> File link is missing!'

                link_paragraph = Paragraph(link_html, styles["midiStyle"])
                images_with_names.append([link_paragraph])

                # Append a spacer only if the next element is an image
                if len(tool_elements) > 1 and tool_elements.index(haplotype) < len(tool_elements) - 1:
                    images_with_names.append([Spacer(1, 12)])

            # Add images and names to the elements in groups of 4 rows
            # (image + link + spacer triplets -> roughly two haplotypes per table)
            for i in range(0, len(images_with_names), 4):
                elements_to_add = images_with_names[i:i + 4]

                # Create table for the images and names
                table = Table(elements_to_add)
                table.hAlign = 'CENTER'
                elements.append(table)

                # Add a page break only when the next group starts with an Image,
                # so trailing text rows do not force an empty page
                next_elements_start = i + 4
                if next_elements_start < len(images_with_names):
                    if len(images_with_names[next_elements_start]) > 0 and isinstance(images_with_names[next_elements_start][0], Image):
                        elements.append(PageBreak())

            tool_count += 1

    elements.append(PageBreak())
844 | |
    # PDF SECTION 4 -------------------------------------------------------------------------------

    # Add kmer spectra section subtitle
    subtitle = Paragraph("K-mer spectra of curated assembly", styles['TitleStyle'])
    elements.append(subtitle)

    # Spacer
    elements.append(Spacer(1, 48))

    # Initialize counter of PNGs placed on the current page
    counter = 0

    # Iterate over haplotypes in the Curated category to get K-mer spectra images.
    # NOTE(review): curated_assemblies / haplotype_names are recomputed here even
    # though section 1 already built identical values.
    curated_assemblies = yaml_data.get('ASSEMBLIES', {}).get('Curated', {})
    haplotype_names = [key for key in curated_assemblies.keys() if key != 'pipeline']

    # Get paths for spectra files; the joint (diploid) spectra are stored under hap1
    spectra_files = {
        'hap1': {
            'spectra_cn_png': curated_assemblies.get('hap1', {}).get('merqury_hap_spectra_cn_png', None),
        },
        'hap2': {
            'spectra_cn_png': curated_assemblies.get('hap2', {}).get('merqury_hap_spectra_cn_png', None),
        },
        'common': {
            'spectra_cn_png': curated_assemblies.get('hap1', {}).get('merqury_spectra_cn_png', None),
            'spectra_asm_png': curated_assemblies.get('hap1', {}).get('merqury_spectra_asm_png', None),
        }
    }

    # Filter out None values and empty strings
    spectra_files = {k: {sk: v for sk, v in sv.items() if v} for k, sv in spectra_files.items()}

    # Determine the number of spectra-cn files and assign unique names if needed
    spectra_cn_files = [
        spectra_files['common'].get('spectra_cn_png', None),
        spectra_files['hap1'].get('spectra_cn_png', None),
        spectra_files['hap2'].get('spectra_cn_png', None)
    ]
    spectra_cn_files = [f for f in spectra_cn_files if f]  # Filter out None values

    if len(spectra_cn_files) == 3:
        # For 3 spectra-cn files: the shortest basename is taken to be the joint
        # (diploid) plot; the other two get distinguishing name fragments
        shortest_spectra_cn_file = min(spectra_cn_files, key=lambda f: len(os.path.basename(f)), default=None)
        similar_files = [f for f in spectra_cn_files if f != shortest_spectra_cn_file]
        if similar_files:
            unique_name1, unique_name2 = find_unique_parts(os.path.basename(similar_files[0]), os.path.basename(similar_files[1]))
    else:
        shortest_spectra_cn_file = spectra_cn_files[0] if spectra_cn_files else None
        unique_name1 = unique_name2 = None

    # Create image objects and add a caption below each image
    images = []

    for label, file_dict in spectra_files.items():
        for key, png_file in file_dict.items():
            if png_file:
                image = Image(png_file, width=8.4 * cm, height=7 * cm)
                filename = os.path.basename(png_file)

                # Caption choice depends on the Merqury plot type (by filename suffix)
                if filename.endswith("spectra-asm.ln.png"):
                    text = "Distribution of k-mer counts coloured by their presence in reads/assemblies"
                elif filename.endswith("spectra-cn.ln.png"):
                    if len(spectra_cn_files) == 3:
                        # For 3 spectra-cn files use particular text
                        if png_file == shortest_spectra_cn_file:
                            text = "Distribution of k-mer counts per copy numbers found in asm (dipl.)"
                        else:
                            if png_file == spectra_files['hap1'].get('spectra_cn_png', None):
                                text = f"Distribution of k-mer counts per copy numbers found in <b>{unique_name1}</b> (hapl.)"
                            elif png_file == spectra_files['hap2'].get('spectra_cn_png', None):
                                text = f"Distribution of k-mer counts per copy numbers found in <b>{unique_name2}</b> (hapl.)"
                            else:
                                text = "Distribution of k-mer counts per copy numbers found in asm"
                    else:
                        # For 2 spectra-cn files use same text
                        text = "Distribution of k-mer counts per copy numbers found in asm"
                else:
                    # Fallback caption: just show the filename
                    text = filename

                images.append([image, Paragraph(text, styles["midiStyle"])])

    # Filter None values
    images = [img for img in images if img[0] is not None]

    # Get number of rows and columns for the table
    num_rows = (len(images) + 1) // 2  # +1 to handle odd numbers of images
    num_columns = 2

    # Create the table with dynamic size; empty cells pad the last row
    image_table_data = [[images[i * num_columns + j] if i * num_columns + j < len(images) else [] for j in range(num_columns)] for i in range(num_rows)]
    image_table = Table(image_table_data)

    # Style the "table"
    table_style = TableStyle([
        ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
        ('BOTTOMPADDING', (0, 0), (-1, -1), 20),  # 20 here is a spacer between rows
    ])

    # Set the style
    image_table.setStyle(table_style)

    # Add image table to elements
    elements.append(image_table)

    # Increase counter by the number of PNGs added
    counter += len(images)

    # If counter is a multiple of 4, insert a page break and reset counter
    if counter % 4 == 0:
        elements.append(PageBreak())

    # Add spacer
    elements.append(Spacer(1, 12))

    # If we have processed all haps and the last page does not contain exactly 4 images, insert a page break
    if counter % 4 != 0:
        elements.append(PageBreak())
963 | |
    # PDF SECTION 5 -------------------------------------------------------------------------------

    # Add contamination section subtitle
    subtitle = Paragraph("Post-curation contamination screening", styles['TitleStyle'])
    elements.append(subtitle)

    # Spacer
    elements.append(Spacer(1, 36))

    # Initialize counter
    tool_count = 0

    # Add title and images for each step; one full-page blobplot per haplotype
    for idx, (asm_stages, stage_properties) in enumerate(asm_data.items(), 1):
        if asm_stages == 'Curated':  # Check if the current stage is 'Curated'
            tool_elements = [element for element in stage_properties.keys() if element != 'pipeline']

            for haplotype in tool_elements:
                haplotype_properties = stage_properties[haplotype]
                if isinstance(haplotype_properties, dict) and 'blobplot_cont_png' in haplotype_properties:
                    # Get image path
                    png_file = haplotype_properties['blobplot_cont_png']

                    # If png_file is not empty, display it
                    if png_file:
                        # Create image object
                        img = Image(png_file, width=20 * cm, height=20 * cm)
                        elements.append(img)

                        # Create caption paragraph with the haplotype name
                        blob_text = f"<b>{haplotype}.</b> Bubble plot circles are scaled by sequence length, positioned by coverage and GC proportion, and coloured by taxonomy. Histograms show total assembly length distribution on each axis."
                        blob_paragraph = Paragraph(blob_text, styles["midiStyle"])
                        elements.append(blob_paragraph)
                    else:
                        # Add paragraph for missing image
                        missing_png_paragraph = Paragraph(f"<b>{haplotype}</b> PNG is missing!", styles["midiStyle"])
                        elements.append(missing_png_paragraph)

                    # Add a page break after each image and its description
                    elements.append(PageBreak())

            tool_count += 1
1006 | |
    # SECTION 6 -----------------------------------------------------------------------------------

    # Add data profile section subtitle
    subtitle = Paragraph("Data profile", styles['TitleStyle'])
    elements.append(subtitle)

    # Spacer
    elements.append(Spacer(1, 24))

    # Create the DATA PROFILE table (table_data is assembled earlier in make_report)
    data_table = Table(table_data)

    # Style the table
    data_table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (0, -1), '#e7e7e7'),  # grey background for the first column
        ('ALIGN', (0, 0), (-1, -1), 'CENTER'),  # center alignment
        ('FONTNAME', (0, 0), (-1, -1), 'Courier'),  # monospaced, non-bold font
        ('FONTSIZE', (0, 0), (-1, -1), 12),  # font size
        ('BOTTOMPADDING', (0, 0), (-1, -1), 8),
        ("GRID", (0, 0), (-1, -1), 0.5, colors.black)
    ]))

    # Add DATA PROFILE table
    elements.append(data_table)

    # Spacer
    elements.append(Spacer(1, 32))

    # Add assembly pipeline section subtitle
    subtitle = Paragraph("Assembly pipeline", styles['TitleStyle'])
    elements.append(subtitle)

    # Spacer
    elements.append(Spacer(1, 24))

    # Add ASM PIPELINE tree (preformatted text prepared earlier)
    elements.append(Paragraph(asm_pipeline_tree, styles['treeStyle']))

    # Spacer
    elements.append(Spacer(1, 32))

    # Add curation pipeline section subtitle
    subtitle = Paragraph("Curation pipeline", styles['TitleStyle'])
    elements.append(subtitle)

    # Spacer
    elements.append(Spacer(1, 24))

    # Add CURATION PIPELINE tree
    elements.append(Paragraph(curation_pipeline_tree, styles['treeStyle']))

    # Spacer
    elements.append(Spacer(1, 48))

    # Add submitter, affiliation
    submitter_paragraph_style = ParagraphStyle(name='SubmitterStyle', fontName='Courier', fontSize=10)
    elements.append(Paragraph(f"Submitter: {submitter}", submitter_paragraph_style))
    elements.append(Paragraph(f"Affiliation: {affiliation}", submitter_paragraph_style))

    # Spacer
    elements.append(Spacer(1, 8))

    # Add the date and time (CET) of the document creation
    cet = pytz.timezone("CET")
    current_datetime = datetime.now(cet)
    formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S %Z")
    elements.append(Paragraph(f"Date and time: {formatted_datetime}", submitter_paragraph_style))

    # Build the PDF ###############################################################################
    pdf.build(elements)
1077 | |
1078 | |
if __name__ == "__main__":
    # CLI entry point: take the single positional YAML path and build the EAR PDF.
    arg_parser = argparse.ArgumentParser(description='Create an ERGA Assembly Report (EAR) from a YAML file. Visit https://github.com/ERGA-consortium/EARs for more information')
    arg_parser.add_argument('yaml_file', type=str, help='Path to the YAML file')
    cli_args = arg_parser.parse_args()
    make_report(cli_args.yaml_file)