diff make_EAR.py @ 0:6af76d4371f8 draft
planemo upload for repository https://github.com/ERGA-consortium/EARs/tree/main commit a08f73e00550020ac83f4d45045075962a8a2251
| author | bgruening |
| --- | --- |
| date | Thu, 04 Jul 2024 12:43:34 +0000 |
| parents | |
| children | 82450f7907ef |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/make_EAR.py	Thu Jul 04 12:43:34 2024 +0000
@@ -0,0 +1,1084 @@

import argparse
import glob
import logging
import math
import os
import re
import sys
from datetime import datetime

import pytz
import requests
import yaml
from reportlab.lib import colors
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import cm
from reportlab.platypus import Image, PageBreak, Paragraph, SimpleDocTemplate, Spacer, Table, TableStyle


# make_EAR_glxy.py
# CAUTION: This is for the Galaxy version!
# by Diego De Panis
# ERGA Sequencing and Assembly Committee
EAR_version = "v24.05.20_glxy_beta"


def make_report(yaml_file):
    logging.basicConfig(filename='EAR.log', level=logging.INFO)
    # Read the content from EAR.yaml file
    with open(yaml_file, "r") as file:
        yaml_data = yaml.safe_load(file)

    # FUNCTIONS ###################################################################################

    def format_number(value):
        try:
            value_float = float(value)
            if value_float.is_integer():
                # format as an integer if no decimal part
                return f'{int(value_float):,}'
            else:
                # format as a float
                return f'{value_float:,}'
        except ValueError:
            # return the original value if it can't be converted to a float
            return value

    # extract gfastats values
    def extract_gfastats_values(content, keys):
        return [re.findall(f"{key}: (.+)", content)[0] for key in keys]

    keys = [
        "Total scaffold length",
        "GC content %",
        "# gaps in scaffolds",
        "Total gap length in scaffolds",
        "# scaffolds",
        "Scaffold N50",
        "Scaffold L50",
        "Scaffold L90",
        "# contigs",
        "Contig N50",
        "Contig L50",
        "Contig L90",
    ]

    display_names = keys.copy()
    display_names[display_names.index("Total scaffold length")] = "Total bp"
    total_length_index = keys.index("Total scaffold length")
    display_names[display_names.index("GC content %")] = "GC %"
    display_names[display_names.index("Total gap length in scaffolds")] = "Total gap bp"
    display_names[display_names.index("# scaffolds")] = "Scaffolds"
    display_names[display_names.index("# contigs")] = "Contigs"

    gaps_index = keys.index("# gaps in scaffolds")
    exclusion_list = ["# gaps in scaffolds"]

    # extract Total bp from gfastats report
    def extract_total_bp_from_gfastats(gfastats_path):
        with open(gfastats_path, "r") as f:
            content = f.read()
        total_bp = re.search(r"Total scaffold length: (.+)", content).group(1)
        total_bp = int(total_bp.replace(',', ''))
        return "{:,}".format(total_bp)

    # compute EBP quality metric
    def compute_ebp_metric(haplotype, gfastats_path, qv_value):
        keys_needed = ["Contig N50", "Scaffold N50"]
        content = ''
        with open(gfastats_path, "r") as f:
            content = f.read()

        values = extract_gfastats_values(content, keys_needed)
        contig_n50_log = math.floor(math.log10(int(values[0].replace(',', ''))))
        scaffold_n50_log = math.floor(math.log10(int(values[1].replace(',', ''))))

        return f"Obtained EBP quality metric for {haplotype}: {contig_n50_log}.{scaffold_n50_log}.Q{math.floor(float(qv_value))}"

    # extract qv values
    def get_qv_value(file_path, order, tool, haplotype):
        try:
            with open(file_path, 'r') as file:
                lines = file.readlines()
                if len(lines) > order and (len(lines) == 1 or lines[2].split('\t')[0].strip() == "Both"):
                    target_line = lines[order]
                    fourth_column_value = target_line.split('\t')[3]
                    return fourth_column_value
        except Exception as e:
            logging.error(f"Error reading {file_path} for tool {tool} and haplotype {haplotype}: {str(e)}")
        return ''
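    # Illustrative aside (not from the commit, values invented): a merqury .qv
    # file is tab-separated with the QV score in the fourth column, roughly:
    #
    #   hap1.fa    1234    567890123     55.1    7.7e-06
    #   hap2.fa    2345    567890124     54.8    8.2e-06
    #   Both       3579    1135780247    55.0    8.0e-06
    #
    # get_qv_value() above returns column 4 of the line at index `order`.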
logging.error(f"Error reading {file_path} for tool {tool} and haplotype {haplotype}: {str(e)}") + return '' + + # extract Kmer completeness values + def get_completeness_value(file_path, order, tool, haplotype): + try: + with open(file_path, 'r') as file: + lines = file.readlines() + if len(lines) > order: + target_line = lines[order] + fifth_column_value = target_line.split('\t')[4].strip() + return fifth_column_value + except Exception as e: + logging.warning(f"Error reading {file_path}: {str(e)}") + return '' + + # Getting kmer plots for curated asm + def get_png_files(dir_path): + png_files = glob.glob(f"{dir_path}/*.ln.png") + if len(png_files) < 4: + logging.warning(f"Warning: Less than 4 png files found in {dir_path}. If this is diploid, some images may be missing.") + # fill missing with None + while len(png_files) < 4: + png_files.append(None) + return png_files[:4] + + # get unique part in file names + def find_unique_parts(file1, file2): + # Split filenames into parts + parts1 = file1.split('.') + parts2 = file2.split('.') + # Find unique parts + unique_parts1 = [part for part in parts1 if part not in parts2] + unique_parts2 = [part for part in parts2 if part not in parts1] + + return ' '.join(unique_parts1), ' '.join(unique_parts2) + + # extract BUSCO values + def extract_busco_values(file_path): + try: + with open(file_path, 'r') as file: + content = file.read() + results_line = re.findall(r"C:.*n:\d+", content)[0] + s_value = re.findall(r"S:(\d+\.\d+%)", results_line)[0] + d_value = re.findall(r"D:(\d+\.\d+%)", results_line)[0] + f_value = re.findall(r"F:(\d+\.\d+%)", results_line)[0] + m_value = re.findall(r"M:(\d+\.\d+%)", results_line)[0] + return s_value, d_value, f_value, m_value + except Exception as e: + logging.warning(f"Error reading {file_path}: {str(e)}") + return '', '', '', '' + + # extract BUSCO info + def extract_busco_info(file_path): + busco_version = None + lineage_info = None + + try: + with open(file_path, 'r') as file: + content = file.read() + version_match = re.search(r"# BUSCO version is: ([\d.]+)", content) + if version_match: + busco_version = version_match.group(1) + lineage_match = re.search(r"The lineage dataset is: (.*?) \(Creation date:.*?, number of genomes: (\d+), number of BUSCOs: (\d+)\)", content) + if lineage_match: + lineage_info = lineage_match.groups() + if not lineage_info: + lineage_match = re.search(r"The lineage dataset is: (.*?) \(Creation date:.*?, number of species: (\d+), number of BUSCOs: (\d+)\)", content) + if lineage_match: + lineage_info = lineage_match.groups() + + except Exception as e: + logging.warning(f"Error reading {file_path}: {str(e)}") + + return busco_version, lineage_info + + # Function to check and generate warning messages + def generate_warning_paragraphs(expected, observed, trait): + paragraphs = [] + try: + if trait == "Haploid size (bp)": + expected_val = int(expected.replace(',', '')) + observed_val = int(observed.replace(',', '')) + if abs(expected_val - observed_val) / expected_val > 0.20: + message = f". Observed {trait} has >20% difference with Expected" + paragraphs.append(Paragraph(message, styles["midiStyle"])) + elif trait in ["Haploid Number", "Ploidy"]: + # Ensure both values are integers for comparison + expected_val = int(expected) + observed_val = int(observed) + if expected_val != observed_val: + message = f". 
    # Generate warnings for curated haplotypes (qv, kcomp, busco)
    def generate_curated_warnings(haplotype, qv_value, completeness_value, busco_scores):
        paragraphs = []
        try:
            # Ensure values are correctly interpreted as floats
            qv_val = float(qv_value)
            completeness_val = float(completeness_value)
            s_value = float(busco_scores[0].rstrip('%'))
            d_value = float(busco_scores[1].rstrip('%'))

            # Check QV value
            if qv_val < 40:
                message = f". QV value is less than 40 for {haplotype}"
                paragraphs.append(Paragraph(message, styles["midiStyle"]))

            # Check Kmer completeness value
            if completeness_val < 90:
                message = f". Kmer completeness value is less than 90 for {haplotype}"
                paragraphs.append(Paragraph(message, styles["midiStyle"]))

            # Check BUSCO s_value
            if s_value < 90:
                message = f". BUSCO single copy value is less than 90% for {haplotype}"
                paragraphs.append(Paragraph(message, styles["midiStyle"]))

            # Check BUSCO d_value
            if d_value > 5:
                message = f". BUSCO duplicated value is more than 5% for {haplotype}"
                paragraphs.append(Paragraph(message, styles["midiStyle"]))

        except Exception as e:
            logging.warning(f"Error in generating warnings for {haplotype}: {str(e)}")

        return paragraphs

    # Generate warnings for curated haplotypes (loss, gaps, 90inChrom)
    def generate_assembly_warnings(asm_data, gaps_per_gbp_data, obs_haploid_num):
        warnings = []

        # Iterate over haplotypes and generate warnings based on the criteria
        for haplotype in asm_stages:
            pre_curation_bp = extract_total_bp_from_gfastats(asm_data['Pre-curation'][haplotype]['gfastats--nstar-report_txt'])
            curated_bp = extract_total_bp_from_gfastats(asm_data['Curated'][haplotype]['gfastats--nstar-report_txt'])
            scaffold_l90 = float(gfastats_data[('Curated', haplotype)][display_names.index('Scaffold L90')].replace(',', ''))

            # Check for assembly length loss > 3%
            if pre_curation_bp and curated_bp:
                loss_percentage = (float(pre_curation_bp.replace(',', '')) - float(curated_bp.replace(',', ''))) / float(pre_curation_bp.replace(',', '')) * 100
                if loss_percentage > 3:
                    warnings.append(Paragraph(f". Assembly length loss > 3% for {haplotype}", styles["midiStyle"]))

            # Check for more than 1000 gaps/Gbp
            gaps_gbp = gaps_per_gbp_data.get(('Curated', haplotype), 0)
            if gaps_gbp > 1000:
                warnings.append(Paragraph(f". More than 1000 gaps/Gbp for {haplotype}", styles["midiStyle"]))

            # Check if Scaffold L90 value is more than Observed Haploid number
            if scaffold_l90 > float(obs_haploid_num):
                warnings.append(Paragraph(f". Not 90% of assembly in chromosomes for {haplotype}", styles["midiStyle"]))

        return warnings
    # Parse pipeline and generate "tree"
    def generate_pipeline_tree(pipeline_data):
        tree_lines = []
        indent = " " * 2  # Adjust indent spacing as needed

        for tool_version_param in pipeline_data:
            parts = tool_version_param.split('|')
            tool_version = parts[0]
            tool, version = tool_version.split('_v') if '_v' in tool_version else (tool_version, "NA")

            # Handle parameters: join all but the first (which is tool_version) with ', '
            param_text = ', '.join(parts[1:]) if len(parts) > 1 else "NA"

            # Tool line
            tool_line = f"- <b>{tool}</b>"
            tree_lines.append(tool_line)

            # Version line
            version_line = f"{indent*2}|_ <i>ver:</i> {version}"
            tree_lines.append(version_line)

            # Param line(s)
            if param_text != "NA":
                for param in param_text.split(','):
                    param = param.strip()
                    param_line = f"{indent*2}|_ <i>key param:</i> {param if param else 'NA'}"
                    tree_lines.append(param_line)
            else:
                param_line = f"{indent*2}|_ <i>key param:</i> NA"
                tree_lines.append(param_line)

        # Join lines with HTML break for paragraph
        tree_diagram = "<br/>".join(tree_lines)
        return tree_diagram
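    # Illustrative aside (invented entry): a pipeline string such as
    # "hifiasm_v0.19.5|--ul|-l0" is rendered by generate_pipeline_tree()
    # as the following tree (HTML tags omitted here for readability):
    #
    #   - hifiasm
    #       |_ ver: 0.19.5
    #       |_ key param: --ul
    #       |_ key param: -l0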
    # Reading SAMPLE INFORMATION section from yaml ################################################

    # Check for required fields
    required_fields = ["ToLID", "Species", "Sex", "Submitter", "Affiliation", "Tags"]
    missing_fields = [field for field in required_fields if field not in yaml_data or not yaml_data[field]]

    if missing_fields:
        logging.error(f"# GENERAL INFORMATION section in the yaml file is missing or empty for the following information: {', '.join(missing_fields)}")
        sys.exit(1)

    # Check that "Species" field is a string
    if not isinstance(yaml_data["Species"], str):
        logging.error(f"# GENERAL INFORMATION section in the yaml file contains incorrect data type for 'Species'. Expected 'str' but got '{type(yaml_data['Species']).__name__}'.")
        sys.exit(1)

    # Get data for Header, ToLID table and submitter
    tol_id = yaml_data["ToLID"]
    species = yaml_data["Species"]
    sex = yaml_data["Sex"]
    submitter = yaml_data["Submitter"]
    affiliation = yaml_data["Affiliation"]
    tags = yaml_data["Tags"]

    # Check if tag is valid
    valid_tags = ["ERGA-BGE", "ERGA-Pilot", "ERGA-Satellite"]
    if tags not in valid_tags:
        tags += "[INVALID TAG]"
        logging.warning("# SAMPLE INFORMATION section in the yaml file contains an invalid tag. Valid tags are ERGA-BGE, ERGA-Pilot and ERGA-Satellite")

    # Get data from GoaT based on species name
    # urllib.parse.quote to handle special characters and spaces in the species name
    species_name = requests.utils.quote(species)

    # Get stuff from GoaT
    goat_response = requests.get(f'https://goat.genomehubs.org/api/v2/search?query=tax_name%28{species_name}%29&result=taxon')
    goat_data = goat_response.json()  # convert json to dict

    taxon_number = goat_data['results'][0]['result']['taxon_id']

    goat_results = goat_data['results']

    class_name = 'NA'
    order_name = 'NA'
    haploid_number = 'NA'
    haploid_source = 'NA'
    ploidy = 'NA'
    ploidy_source = 'NA'

    for result in goat_results:
        lineage = result['result']['lineage']
        for node in lineage:
            if node['taxon_rank'] == 'class':
                class_name = node['scientific_name']
            if node['taxon_rank'] == 'order':
                order_name = node['scientific_name']

    goat_second_response = requests.get(f'https://goat.genomehubs.org/api/v2/record?recordId={taxon_number}&result=taxon&taxonomy=ncbi')
    goat_second_data = goat_second_response.json()

    ploidy_info = goat_second_data['records'][0]['record']['attributes']['ploidy']

    ploidy = ploidy_info['value']
    ploidy_source = ploidy_info['aggregation_source']

    haploid_info = goat_second_data['records'][0]['record']['attributes']['haploid_number']

    haploid_number = haploid_info['value']
    haploid_source = haploid_info['aggregation_source']

    sp_data = [
        ["TxID", "ToLID", "Species", "Class", "Order"],
        [taxon_number, tol_id, species, class_name, order_name]
    ]

    # Transpose the data
    transposed_sp_data = list(map(list, zip(*sp_data)))

    # Reading SEQUENCING DATA section from yaml ###################################################

    # get DATA section from yaml
    data_list = yaml_data.get('DATA', [])

    # Prepare headers
    headers = ['Data']
    data_values = ['Coverage']

    # Extract data from YAML and format it for the table
    for item in data_list:
        for technology, coverage in item.items():
            headers.append(technology)
            data_values.append('NA' if not coverage else coverage)

    # Create a list of lists for the table
    table_data = [headers, data_values]

    # Extract pipeline data from 'Pre-curation' category
    asm_pipeline_data = yaml_data.get('ASSEMBLIES', {}).get('Pre-curation', {}).get('pipeline', [])
    asm_pipeline_tree = generate_pipeline_tree(asm_pipeline_data)

    # Extract pipeline data from 'Curated' category
    curation_pipeline_data = yaml_data.get('ASSEMBLIES', {}).get('Curated', {}).get('pipeline', [])
    curation_pipeline_tree = generate_pipeline_tree(curation_pipeline_data)

    # Reading GENOME PROFILING DATA section from yaml #############################################

    profiling_data = yaml_data.get('PROFILING')

    # Check if profiling_data is available
    if not profiling_data:
        logging.error('Error: No profiling data found in the YAML file.')
        sys.exit(1)

    # Handle GenomeScope specific processing
    genomescope_data = profiling_data.get('GenomeScope')
    if genomescope_data:
        summary_file = genomescope_data.get('genomescope_summary_txt')
        if summary_file and os.path.exists(summary_file):
            with open(summary_file, "r") as f:
                summary_txt = f.read()
            genome_haploid_length = re.search(r"Genome Haploid Length\s+([\d,]+) bp", summary_txt).group(1)
            proposed_ploidy_match = re.search(r"p = (\d+)", summary_txt)
            proposed_ploidy = proposed_ploidy_match.group(1) if proposed_ploidy_match else 'NA'
        else:
            logging.error(f"File {summary_file} not found for GenomeScope.")
            sys.exit(1)
    else:
        logging.error("GenomeScope data is missing in the PROFILING section.")
        sys.exit(1)
found for GenomeScope.") + sys.exit(1) + else: + logging.error("GenomeScope data is missing in the PROFILING section.") + sys.exit(1) + + # Handle Smudgeplot specific processing + smudgeplot_data = profiling_data.get('Smudgeplot') + if smudgeplot_data: + verbose_summary_file = smudgeplot_data.get('smudgeplot_verbose_summary_txt') + if verbose_summary_file and os.path.exists(verbose_summary_file): + with open(verbose_summary_file, "r") as f: + smud_summary_txt = f.readlines() + for line in smud_summary_txt: + if line.startswith("* Proposed ploidy"): + proposed_ploidy = line.split(":")[1].strip() + break + else: + logging.warning(f"Verbose summary file {verbose_summary_file} not found for Smudgeplot; skipping detailed Smudgeplot analysis.") + else: + logging.warning("Smudgeplot data is missing in the PROFILING section; skipping Smudgeplot analysis.") + + # Reading ASSEMBLY DATA section from yaml ##################################################### + + asm_data = yaml_data.get('ASSEMBLIES', {}) + + # make a list from the assemblies available in asm_data + asm_stages = [] + for asm_stage, stage_properties in asm_data.items(): + for haplotypes in stage_properties.keys(): + if haplotypes != 'pipeline' and haplotypes not in asm_stages: + asm_stages.append(haplotypes) + + # get gfastats-based data + gfastats_data = {} + for asm_stage, stage_properties in asm_data.items(): + for haplotypes, haplotype_properties in stage_properties.items(): + if isinstance(haplotype_properties, dict): + if 'gfastats--nstar-report_txt' in haplotype_properties: + file_path = haplotype_properties['gfastats--nstar-report_txt'] + with open(file_path, 'r') as file: + content = file.read() + gfastats_data[(asm_stage, haplotypes)] = extract_gfastats_values(content, keys) + + gaps_per_gbp_data = {} + for (asm_stage, haplotypes), values in gfastats_data.items(): + try: + gaps = float(values[gaps_index]) + total_length = float(values[total_length_index]) + gaps_per_gbp = round((gaps / total_length * 1_000_000_000), 2) + gaps_per_gbp_data[(asm_stage, haplotypes)] = gaps_per_gbp + except (ValueError, ZeroDivisionError): + gaps_per_gbp_data[(asm_stage, haplotypes)] = '' + + # Define the contigging table (column names) DON'T MOVE THIS AGAIN!!!!!!! 
+ asm_table_data = [["Metrics"] + [f'{asm_stage} \n {haplotypes}' for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]]] + + # Fill the table with the gfastats data + for i in range(len(display_names)): + metric = display_names[i] + if metric not in exclusion_list: + asm_table_data.append([metric] + [format_number(gfastats_data.get((asm_stage, haplotypes), [''])[i]) if (asm_stage, haplotypes) in gfastats_data else '' for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]]) + + # Add the gaps/gbp in between + gc_index = display_names.index("GC %") + gc_index + asm_table_data.insert(gaps_index + 1, ['Gaps/Gbp'] + [format_number(gaps_per_gbp_data.get((asm_stage, haplotypes), '')) for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]]) + + # get QV, Kmer completeness and BUSCO data + qv_data = {} + completeness_data = {} + busco_data = {metric: {} for metric in ['BUSCO sing.', 'BUSCO dupl.', 'BUSCO frag.', 'BUSCO miss.']} + for asm_stage, stage_properties in asm_data.items(): + asm_stage_elements = [element for element in stage_properties.keys() if element != 'pipeline'] + for i, haplotypes in enumerate(asm_stage_elements): + haplotype_properties = stage_properties[haplotypes] + if isinstance(haplotype_properties, dict): + if 'merqury_qv' in haplotype_properties: + qv_data[(asm_stage, haplotypes)] = get_qv_value(haplotype_properties['merqury_qv'], i, asm_stage, haplotypes) + if 'merqury_completeness_stats' in haplotype_properties: + completeness_data[(asm_stage, haplotypes)] = get_completeness_value(haplotype_properties['merqury_completeness_stats'], i, asm_stage, haplotypes) + if 'busco_short_summary_txt' in haplotype_properties: + s_value, d_value, f_value, m_value = extract_busco_values(haplotype_properties['busco_short_summary_txt']) + busco_data['BUSCO sing.'].update({(asm_stage, haplotypes): s_value}) + busco_data['BUSCO dupl.'].update({(asm_stage, haplotypes): d_value}) + busco_data['BUSCO frag.'].update({(asm_stage, haplotypes): f_value}) + busco_data['BUSCO miss.'].update({(asm_stage, haplotypes): m_value}) + + # Fill the table with the QV data + asm_table_data.append(['QV'] + [qv_data.get((asm_stage, haplotypes), '') for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]]) + + # Fill the table with the Kmer completeness data + asm_table_data.append(['Kmer compl.'] + [completeness_data.get((asm_stage, haplotypes), '') for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]]) + + # Fill the table with the BUSCO data + for metric in ['BUSCO sing.', 'BUSCO dupl.', 'BUSCO frag.', 'BUSCO miss.']: + asm_table_data.append([metric] + [busco_data[metric].get((asm_stage, haplotypes), '') for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]]) + + # Reading CURATION NOTES section from yaml #################################################### + + obs_haploid_num = yaml_data.get("NOTES", {}).get("Obs_Haploid_num", "NA") + obs_sex = yaml_data.get("NOTES", {}).get("Obs_Sex", "NA") + interventions_per_gb = yaml_data.get("NOTES", {}).get("Interventions_per_Gb", "NA") + contamination_notes = yaml_data.get("NOTES", {}).get("Contamination_notes", "NA") + other_notes = yaml_data.get("NOTES", {}).get("Other_notes", "NA") + + # Extract Total bp for each haplotype and find the maximum + curated_assemblies = yaml_data.get('ASSEMBLIES', {}).get('Curated', {}) + total_bp_values = [] + for 
    for haplotype, properties in curated_assemblies.items():
        if 'gfastats--nstar-report_txt' in properties:
            total_bp = extract_total_bp_from_gfastats(properties['gfastats--nstar-report_txt'])
            total_bp_values.append(total_bp)

    max_total_bp = max(total_bp_values, default='NA')

    # Create table data
    genome_traits_table_data = [
        ["Genome Traits", "Expected", "Observed"],
        ["Haploid size (bp)", genome_haploid_length, f"{max_total_bp}"],
        ["Haploid Number", f"{haploid_number} (source: {haploid_source})", obs_haploid_num],
        ["Ploidy", f"{ploidy} (source: {ploidy_source})", proposed_ploidy],
        ["Sample Sex", sex, obs_sex]
    ]

    # Get curator notes
    curator_notes_text = (
        f". Interventions/Gb: {interventions_per_gb}<br/>"
        f'. Contamination notes: "{contamination_notes}"<br/>'
        f'. Other observations: "{other_notes}"'
    )

    # PDF CONSTRUCTION ############################################################################

    # Set up the PDF file
    pdf_filename = "EAR.pdf"
    margin = 0.5 * 72  # 0.5 inch in points (normal margin is 1 inch)
    pdf = SimpleDocTemplate(pdf_filename,
                            pagesize=A4,
                            leftMargin=margin,
                            rightMargin=margin,
                            topMargin=margin,
                            bottomMargin=margin)
    elements = []

    # Set all the styles
    styles = getSampleStyleSheet()
    styles.add(ParagraphStyle(name='TitleStyle', fontName='Courier', fontSize=20))
    styles.add(ParagraphStyle(name='subTitleStyle', fontName='Courier', fontSize=16))
    styles.add(ParagraphStyle(name='normalStyle', fontName='Courier', fontSize=12))
    styles.add(ParagraphStyle(name='midiStyle', fontName='Courier', fontSize=10))
    styles.add(ParagraphStyle(name='LinkStyle', fontName='Courier', fontSize=10, textColor='blue', underline=True))
    styles.add(ParagraphStyle(name='treeStyle', fontName='Courier', fontSize=10, leftIndent=12))
    styles.add(ParagraphStyle(name='miniStyle', fontName='Courier', fontSize=8))
    styles.add(ParagraphStyle(name='FileNameStyle', fontName='Courier', fontSize=6))

    # PDF SECTION 1 -------------------------------------------------------------------------------

    # Add the title
    title = Paragraph("ERGA Assembly Report", styles['TitleStyle'])
    elements.append(title)

    # Spacer
    elements.append(Spacer(1, 12))

    # Add version
    ver_paragraph = Paragraph(EAR_version, styles['normalStyle'])
    elements.append(ver_paragraph)

    # Spacer
    elements.append(Spacer(1, 12))

    # Add tags
    tags_paragraph = Paragraph(f"Tags: {tags}", styles['normalStyle'])
    elements.append(tags_paragraph)

    # Spacer
    elements.append(Spacer(1, 24))

    # Create the SPECIES DATA table with the transposed data
    sp_data_table = Table(transposed_sp_data)

    # Style the table
    sp_data_table.setStyle(TableStyle([
        ("BACKGROUND", (0, 0), (0, -1), '#e7e7e7'),  # Grey background for column 1
        ("BACKGROUND", (1, 0), (1, -1), colors.white),  # White background for column 2
        ("ALIGN", (0, 0), (-1, -1), "CENTER"),
        ('FONTNAME', (0, 0), (0, 0), 'Courier'),  # Regular font for row1, col1
        ('FONTNAME', (1, 0), (1, 0), 'Courier'),
        ('FONTNAME', (0, 1), (-1, -1), 'Courier'),  # Regular font for the rest of the table
        ('FONTNAME', (1, 1), (1, 1), 'Courier-Bold'),  # Bold font for row1, col2
        ("FONTSIZE", (0, 0), (-1, -1), 14),
        ('BOTTOMPADDING', (0, 0), (-1, -1), 8),
        ("GRID", (0, 0), (-1, -1), 0.5, colors.black)
    ]))

    # Add SPECIES DATA table
    elements.append(sp_data_table)

    # Spacer
    elements.append(Spacer(1, 32))

    # Create the GENOME TRAITS table
    genome_traits_table = Table(genome_traits_table_data)

    # Style the table
    genome_traits_table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (0, -1), '#e7e7e7'),
        ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
        ('FONTNAME', (0, 0), (-1, -1), 'Courier'),
        ('FONTSIZE', (0, 0), (-1, -1), 12),
        ('BOTTOMPADDING', (0, 0), (-1, -1), 8),
        ("GRID", (0, 0), (-1, -1), 0.5, colors.black)
    ]))

    # Add GENOME TRAITS table
    elements.append(genome_traits_table)

    # Spacer
    elements.append(Spacer(1, 28))

    # Add EBP METRICS SECTION subtitle
    subtitle = Paragraph("EBP metrics summary and curation notes", styles['subTitleStyle'])
    elements.append(subtitle)

    # Spacer
    elements.append(Spacer(1, 24))

    # Iterate over haplotypes in the Curated category to get data for EBP metrics
    curated_assemblies = yaml_data.get('ASSEMBLIES', {}).get('Curated', {})
    haplotype_names = [key for key in curated_assemblies.keys() if key != 'pipeline']

    for haplotype in haplotype_names:
        properties = curated_assemblies[haplotype]
        if 'gfastats--nstar-report_txt' in properties and 'merqury_qv' in properties:
            gfastats_path = properties['gfastats--nstar-report_txt']
            order = haplotype_names.index(haplotype)  # Determine the order based on the position of the haplotype in the list
            qv_value = get_qv_value(properties['merqury_qv'], order, 'Curated', haplotype)

            ebp_quality_metric = compute_ebp_metric(haplotype, gfastats_path, qv_value)
            EBP_metric_paragraph = Paragraph(ebp_quality_metric, styles["midiStyle"])

            # Add the EBP quality metric paragraph to elements
            elements.append(EBP_metric_paragraph)
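    # Illustrative aside (invented numbers): compute_ebp_metric() reports the
    # EBP x.y.Qz notation, e.g. a contig N50 of 5.6 Mbp, a scaffold N50 of
    # 61 Mbp and a QV of 55.4 yield "6.7.Q55" (the floor of log10 of each
    # N50, then the floored QV).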
    # Spacer
    elements.append(Spacer(1, 8))

    # Add sentence
    Textline = Paragraph("The following metrics were automatically flagged as below EBP recommended standards or different from expected:", styles['midiStyle'])
    elements.append(Textline)

    # Spacer
    elements.append(Spacer(1, 4))

    # Apply checks and add warning paragraphs to elements
    elements += generate_warning_paragraphs(genome_haploid_length, max_total_bp, "Haploid size (bp)")
    elements += generate_warning_paragraphs(haploid_number, obs_haploid_num, "Haploid Number")
    elements += generate_warning_paragraphs(proposed_ploidy, ploidy, "Ploidy")
    elements += generate_warning_paragraphs(sex, obs_sex, "Sample Sex")

    # Spacer
    elements.append(Spacer(1, 4))

    # Iterate over haplotypes in the Curated category and apply checks
    for haplotype in haplotype_names:
        properties = curated_assemblies[haplotype]
        if isinstance(properties, dict) and 'merqury_qv' in properties and 'merqury_completeness_stats' in properties and 'busco_short_summary_txt' in properties:
            order = haplotype_names.index(haplotype)
            qv_value = get_qv_value(properties['merqury_qv'], order, "Curated", haplotype)
            completeness_value = get_completeness_value(properties['merqury_completeness_stats'], order, "Curated", haplotype)
            busco_scores = extract_busco_values(properties['busco_short_summary_txt'])

            warnings = generate_curated_warnings(haplotype, qv_value, completeness_value, busco_scores)
            elements += warnings

    assembly_warnings = generate_assembly_warnings(asm_data, gaps_per_gbp_data, obs_haploid_num)
    elements.extend(assembly_warnings)

    # Spacer
    elements.append(Spacer(1, 24))

    # Add small subtitle for Curator notes
    subtitle = Paragraph("Curator notes", styles['normalStyle'])
    elements.append(subtitle)

    # Spacer
    elements.append(Spacer(1, 8))

    # Curator notes
    curator_notes_paragraph = Paragraph(curator_notes_text, styles["midiStyle"])
    elements.append(curator_notes_paragraph)

    # Page break
    elements.append(PageBreak())

    # PDF SECTION 2 -------------------------------------------------------------------------------

    # Add quality metrics section subtitle
    subtitle = Paragraph("Quality metrics table", styles['TitleStyle'])
    elements.append(subtitle)

    # Spacer
    elements.append(Spacer(1, 48))

    # create QUALITY METRICS table
    asm_table = Table(asm_table_data)

    # Style the table
    asm_table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), '#e7e7e7'),  # grey background for the header
        ('ALIGN', (0, 0), (-1, -1), 'CENTER'),  # center alignment
        ('FONTNAME', (0, 0), (-1, -1), 'Courier'),  # Courier font for the whole table
        ('FONTSIZE', (0, 0), (-1, -1), 11),  # font size
        ('BOTTOMPADDING', (0, 0), (-1, -1), 8),
        ("GRID", (0, 0), (-1, -1), 0.5, colors.black)
    ]))

    # Add QUALITY METRICS table
    elements.append(asm_table)

    # Spacer
    elements.append(Spacer(1, 5))

    # Store BUSCO version and lineage information from each file in list
    busco_info_list = []
    for asm_stage, stage_properties in asm_data.items():
        for haplotype_keys, haplotype_properties in stage_properties.items():
            if isinstance(haplotype_properties, dict):
                if 'busco_short_summary_txt' in haplotype_properties:
                    busco_version, lineage_info = extract_busco_info(haplotype_properties['busco_short_summary_txt'])
                    if busco_version and lineage_info:
                        busco_info_list.append((busco_version, lineage_info))

    # Checking if all elements in the list are identical (guard against an empty list before indexing)
    if busco_info_list and all(info == busco_info_list[0] for info in busco_info_list):
        busco_version, (lineage_name, num_genomes, num_buscos) = busco_info_list[0]
        elements.append(Paragraph(f"BUSCO {busco_version} Lineage: {lineage_name} (genomes:{num_genomes}, BUSCOs:{num_buscos})", styles['miniStyle']))
    else:
        elements.append(Paragraph("Warning: BUSCO versions or lineage datasets are not the same across results", styles['miniStyle']))
        logging.warning("WARNING!!! BUSCO versions or lineage datasets are not the same across results")

    # Page break
    elements.append(PageBreak())

    # PDF SECTION 3 -------------------------------------------------------------------------------

    # Add hic maps section subtitle
    subtitle = Paragraph("HiC contact map of curated assembly", styles['TitleStyle'])
    elements.append(subtitle)

    # Spacer
    elements.append(Spacer(1, 36))

    # Initialize counter
    tool_count = 0

    # Add title and images for each step
    for idx, (asm_stage, stage_properties) in enumerate(asm_data.items(), 1):
        if asm_stage == 'Curated':
            tool_elements = [element for element in stage_properties.keys() if element != 'pipeline']

            images_with_names = []

            for haplotype in tool_elements:
                haplotype_properties = stage_properties[haplotype]

                # Check if there is an image and/or a link
                png_file = haplotype_properties.get('hic_FullMap_png', '')
                link = haplotype_properties.get('hic_FullMap_link', '')

                # Prepare paragraphs for the image and link
                if png_file:
                    # Create image object
                    img = Image(png_file, width=11 * cm, height=11 * cm)
                    images_with_names.append([img])
                else:
                    # Add paragraph for missing image
                    missing_png_paragraph = Paragraph(f"<b>{haplotype}</b> HiC PNG is missing!", styles["midiStyle"])
                    images_with_names.append([missing_png_paragraph])

                # Add paragraph for the link
                if link:
                    link_html = f'<b>{haplotype}</b> <link href="{link}" color="blue">[LINK]</link>'
                else:
                    link_html = f'<b>{haplotype}</b> File link is missing!'
                link_paragraph = Paragraph(link_html, styles["midiStyle"])
                images_with_names.append([link_paragraph])

                # Append a spacer only if the next element is an image
                if len(tool_elements) > 1 and tool_elements.index(haplotype) < len(tool_elements) - 1:
                    images_with_names.append([Spacer(1, 12)])

            # Add images and names to the elements in pairs
            for i in range(0, len(images_with_names), 4):  # Process two images (and their names) at a time
                elements_to_add = images_with_names[i:i + 4]

                # Create table for the images and names
                table = Table(elements_to_add)
                table.hAlign = 'CENTER'
                elements.append(table)

                # Add a page break conditionally
                next_elements_start = i + 4
                if next_elements_start < len(images_with_names):
                    if len(images_with_names[next_elements_start]) > 0 and isinstance(images_with_names[next_elements_start][0], Image):
                        elements.append(PageBreak())

            tool_count += 1

    elements.append(PageBreak())

    # PDF SECTION 4 -------------------------------------------------------------------------------

    # Add kmer spectra section subtitle
    subtitle = Paragraph("K-mer spectra of curated assembly", styles['TitleStyle'])
    elements.append(subtitle)

    # Spacer
    elements.append(Spacer(1, 48))

    # Initialize counter
    counter = 0

    # Iterate over haplotypes in the Curated category to get K-mer spectra images
    curated_assemblies = yaml_data.get('ASSEMBLIES', {}).get('Curated', {})
    haplotype_names = [key for key in curated_assemblies.keys() if key != 'pipeline']

    # Get paths for spectra files
    spectra_files = {
        'hap1': {
            'spectra_cn_png': curated_assemblies.get('hap1', {}).get('merqury_hap_spectra_cn_png', None),
        },
        'hap2': {
            'spectra_cn_png': curated_assemblies.get('hap2', {}).get('merqury_hap_spectra_cn_png', None),
        },
        'common': {
            'spectra_cn_png': curated_assemblies.get('hap1', {}).get('merqury_spectra_cn_png', None),
            'spectra_asm_png': curated_assemblies.get('hap1', {}).get('merqury_spectra_asm_png', None),
        }
    }

    # Filter out None values and empty strings
    spectra_files = {k: {sk: v for sk, v in sv.items() if v} for k, sv in spectra_files.items()}

    # Determine the number of spectra-cn files and assign unique names if needed
    spectra_cn_files = [
        spectra_files['common'].get('spectra_cn_png', None),
        spectra_files['hap1'].get('spectra_cn_png', None),
        spectra_files['hap2'].get('spectra_cn_png', None)
    ]
    spectra_cn_files = [f for f in spectra_cn_files if f]  # Filter out None values

    if len(spectra_cn_files) == 3:
        # For 3 spectra-cn files
        shortest_spectra_cn_file = min(spectra_cn_files, key=lambda f: len(os.path.basename(f)), default=None)
        similar_files = [f for f in spectra_cn_files if f != shortest_spectra_cn_file]
        if similar_files:
            unique_name1, unique_name2 = find_unique_parts(os.path.basename(similar_files[0]), os.path.basename(similar_files[1]))
    else:
        shortest_spectra_cn_file = spectra_cn_files[0] if spectra_cn_files else None
        unique_name1 = unique_name2 = None

    # Create image objects and add filename below each image
    images = []

    for label, file_dict in spectra_files.items():
        for key, png_file in file_dict.items():
            if png_file:
                image = Image(png_file, width=8.4 * cm, height=7 * cm)
                filename = os.path.basename(png_file)

                if filename.endswith("spectra-asm.ln.png"):
                    text = "Distribution of k-mer counts coloured by their presence in reads/assemblies"
                elif filename.endswith("spectra-cn.ln.png"):
                    if len(spectra_cn_files) == 3:
                        # For 3 spectra-cn files use particular text
                        if png_file == shortest_spectra_cn_file:
                            text = "Distribution of k-mer counts per copy numbers found in asm (dipl.)"
                        else:
                            if png_file == spectra_files['hap1'].get('spectra_cn_png', None):
                                text = f"Distribution of k-mer counts per copy numbers found in <b>{unique_name1}</b> (hapl.)"
                            elif png_file == spectra_files['hap2'].get('spectra_cn_png', None):
                                text = f"Distribution of k-mer counts per copy numbers found in <b>{unique_name2}</b> (hapl.)"
                            else:
                                text = "Distribution of k-mer counts per copy numbers found in asm"
                    else:
                        # For 2 spectra-cn files use same text
                        text = "Distribution of k-mer counts per copy numbers found in asm"
                else:
                    text = filename

                images.append([image, Paragraph(text, styles["midiStyle"])])

    # Filter None values
    images = [img for img in images if img[0] is not None]

    # Get number of rows and columns for the table
    num_rows = (len(images) + 1) // 2  # +1 to handle odd numbers of images
    num_columns = 2

    # Create the table with dynamic size
    image_table_data = [[images[i * num_columns + j] if i * num_columns + j < len(images) else [] for j in range(num_columns)] for i in range(num_rows)]
    image_table = Table(image_table_data)

    # Style the "table"
    table_style = TableStyle([
        ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
        ('BOTTOMPADDING', (0, 0), (-1, -1), 20),  # 20 here is a spacer between rows
    ])

    # Set the style
    image_table.setStyle(table_style)

    # Add image table to elements
    elements.append(image_table)

    # Increase counter by the number of PNGs added
    counter += len(images)

    # If counter is a multiple of 4, insert a page break and reset counter
    if counter % 4 == 0:
        elements.append(PageBreak())

    # Add spacer
    elements.append(Spacer(1, 12))

    # If we have processed all haps and the last page does not contain exactly 4 images, insert a page break
    if counter % 4 != 0:
        elements.append(PageBreak())

    # PDF SECTION 5 -------------------------------------------------------------------------------

    # Add contamination section subtitle
    subtitle = Paragraph("Post-curation contamination screening", styles['TitleStyle'])
    elements.append(subtitle)

    # Spacer
    elements.append(Spacer(1, 36))

    # Initialize counter
    tool_count = 0

    # Add title and images for each step
    for idx, (asm_stage, stage_properties) in enumerate(asm_data.items(), 1):
        if asm_stage == 'Curated':  # Check if the current stage is 'Curated'
            tool_elements = [element for element in stage_properties.keys() if element != 'pipeline']

            for haplotype in tool_elements:
                haplotype_properties = stage_properties[haplotype]
                if isinstance(haplotype_properties, dict) and 'blobplot_cont_png' in haplotype_properties:
                    # Get image path
                    png_file = haplotype_properties['blobplot_cont_png']

                    # If png_file is not empty, display it
                    if png_file:
                        # Create image object
                        img = Image(png_file, width=20 * cm, height=20 * cm)
                        elements.append(img)

                        # Create paragraph for filename with haplotype name
                        blob_text = f"<b>{haplotype}.</b> Bubble plot circles are scaled by sequence length, positioned by coverage and GC proportion, and coloured by taxonomy. Histograms show total assembly length distribution on each axis."
                        blob_paragraph = Paragraph(blob_text, styles["midiStyle"])
                        elements.append(blob_paragraph)
                    else:
                        # Add paragraph for missing image
                        missing_png_paragraph = Paragraph(f"<b>{haplotype}</b> PNG is missing!", styles["midiStyle"])
                        elements.append(missing_png_paragraph)

                    # Add a page break after each image and its description
                    elements.append(PageBreak())

            tool_count += 1

    # SECTION 6 -----------------------------------------------------------------------------------

    # Add data profile section subtitle
    subtitle = Paragraph("Data profile", styles['TitleStyle'])
    elements.append(subtitle)

    # Spacer
    elements.append(Spacer(1, 24))

    # Create the DATA PROFILE table
    data_table = Table(table_data)

    # Style the table
    data_table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (0, -1), '#e7e7e7'),  # grey background for the first column
        ('ALIGN', (0, 0), (-1, -1), 'CENTER'),  # center alignment
        ('FONTNAME', (0, 0), (-1, -1), 'Courier'),  # remove bold font
        ('FONTSIZE', (0, 0), (-1, -1), 12),  # font size for the header
        ('BOTTOMPADDING', (0, 0), (-1, -1), 8),
        ("GRID", (0, 0), (-1, -1), 0.5, colors.black)
    ]))

    # Add DATA PROFILE table
    elements.append(data_table)

    # Spacer
    elements.append(Spacer(1, 32))

    # Add assembly pipeline section subtitle
    subtitle = Paragraph("Assembly pipeline", styles['TitleStyle'])
    elements.append(subtitle)

    # Spacer
    elements.append(Spacer(1, 24))

    # Add ASM PIPELINE tree
    elements.append(Paragraph(asm_pipeline_tree, styles['treeStyle']))

    # Spacer
    elements.append(Spacer(1, 32))

    # Add curation pipeline section subtitle
    subtitle = Paragraph("Curation pipeline", styles['TitleStyle'])
    elements.append(subtitle)

    # Spacer
    elements.append(Spacer(1, 24))

    # Add CURATION PIPELINE tree
    elements.append(Paragraph(curation_pipeline_tree, styles['treeStyle']))

    # Spacer
    elements.append(Spacer(1, 48))

    # Add submitter, affiliation
    submitter_paragraph_style = ParagraphStyle(name='SubmitterStyle', fontName='Courier', fontSize=10)
    elements.append(Paragraph(f"Submitter: {submitter}", submitter_paragraph_style))
    elements.append(Paragraph(f"Affiliation: {affiliation}", submitter_paragraph_style))

    # Spacer
    elements.append(Spacer(1, 8))

    # Add the date and time (CET) of the document creation
    cet = pytz.timezone("CET")
    current_datetime = datetime.now(cet)
    formatted_datetime = current_datetime.strftime("%Y-%m-%d %H:%M:%S %Z")
    elements.append(Paragraph(f"Date and time: {formatted_datetime}", submitter_paragraph_style))

    # Build the PDF ###############################################################################
    pdf.build(elements)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Create an ERGA Assembly Report (EAR) from a YAML file. Visit https://github.com/ERGA-consortium/EARs for more information')
    parser.add_argument('yaml_file', type=str, help='Path to the YAML file')
    args = parser.parse_args()

    make_report(args.yaml_file)
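For reference, the script is run as `python make_EAR.py <yaml_file>` and writes EAR.pdf (plus EAR.log) to the working directory. The following is a minimal sketch of the input YAML, inferred from the keys make_report() reads; it is not from the commit. Every ToLID, species name, path, coverage and note below is an invented placeholder (the GoaT lookup needs a real species name, and the referenced report files must exist on disk), and the example YAMLs in the EARs repository remain the authoritative template. The sketch is written in the script's own language, dumping the structure with yaml.safe_dump:

import yaml

# Hypothetical placeholder values throughout; only the key names are taken
# from what make_report() actually reads.
example = {
    "ToLID": "xxAbcDefg1",
    "Species": "Abcus defgus",
    "Sex": "XX",
    "Submitter": "A. Curator",
    "Affiliation": "Some Institute",
    "Tags": "ERGA-BGE",  # must be ERGA-BGE, ERGA-Pilot or ERGA-Satellite
    "DATA": [
        {"PacBio HiFi": "30x"},
        {"Hi-C": "60x"},
    ],
    "PROFILING": {
        "GenomeScope": {"genomescope_summary_txt": "genomescope_summary.txt"},
        "Smudgeplot": {"smudgeplot_verbose_summary_txt": "smudgeplot_summary.txt"},
    },
    "ASSEMBLIES": {
        "Pre-curation": {
            "hap1": {"gfastats--nstar-report_txt": "precuration.hap1.gfastats.txt"},
            "pipeline": ["hifiasm_v0.19.5|--ul"],
        },
        "Curated": {
            "hap1": {
                "gfastats--nstar-report_txt": "curated.hap1.gfastats.txt",
                "merqury_qv": "curated.qv",
                "merqury_completeness_stats": "curated.completeness.stats",
                "busco_short_summary_txt": "short_summary.busco.txt",
                "merqury_hap_spectra_cn_png": "curated.hap1.spectra-cn.ln.png",
                "merqury_spectra_cn_png": "curated.spectra-cn.ln.png",
                "merqury_spectra_asm_png": "curated.spectra-asm.ln.png",
                "hic_FullMap_png": "curated.hap1.pretext.png",
                "hic_FullMap_link": "https://example.org/map",
                "blobplot_cont_png": "curated.hap1.blob.png",
            },
            "pipeline": ["rapid-curation_v2.0"],
        },
    },
    "NOTES": {
        "Obs_Haploid_num": "21",
        "Obs_Sex": "XX",
        "Interventions_per_Gb": "12",
        "Contamination_notes": "none detected",
        "Other_notes": "",
    },
}

# Write the sketch to disk; make_report("EAR_example.yaml") would then parse it.
with open("EAR_example.yaml", "w") as fh:
    yaml.safe_dump(example, fh, sort_keys=False)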