erga_ear: make_EAR.py comparison

comparison make_EAR.py @ 2:0efed25f6d38 draft default tip

planemo upload for repository https://github.com/ERGA-consortium/EARs/tree/main commit 720787c4fb8885f5127ab6ada2813f8dd580921c

author	bgruening
date	Tue, 15 Oct 2024 12:52:44 +0000
parents	82450f7907ef
children

comparison

equal deleted inserted replaced

-:82450f7907ef
+:0efed25f6d38
 # make_EAR_glxy.py
 # CAUTION: This is for the Galaxy version!
 # by Diego De Panis
 # ERGA Sequencing and Assembly Committee
-EAR_version = "v24.08.26"
+EAR_version = "v24.10.15"
 def make_report(yaml_file):
 logging.basicConfig(filename='EAR.log', level=logging.INFO)
 # Read the content from EAR.yaml file
 def format_number(value):
 try:
 value_float = float(value)
 if value_float.is_integer():
-# format as an integer if no decimal part
+# format as integer if no decimal
 return f'{int(value_float):,}'
 else:
 # format as a float
 return f'{value_float:,}'
 except ValueError:
-# return the original value if it can't be converted to a float
+# return original value if can't be converted to float
 return value
 # extract gfastats values
 def extract_gfastats_values(content, keys):
-return [re.findall(f"{key}: (.+)", content)[0] for key in keys]
+values = []
+for key in keys:
+# colon-separated as default format first
+match = re.search(rf"{re.escape(key)}:\s*(.+)", content)
+if not match:
+# If not try galaxy's tab-separated
+match = re.search(rf"{re.escape(key)}\t(.+)", content)
+if match:
+values.append(match.group(1).strip())
+else:
+values.append("N/A")
+return values
 keys = [
 "Total scaffold length",
 "GC content %",
 "# gaps in scaffolds",
 # extract Total bp from gfastats report
 def extract_total_bp_from_gfastats(gfastats_path):
 with open(gfastats_path, "r") as f:
 content = f.read()
-total_bp = re.search(r"Total scaffold length: (.+)", content).group(1)
+# Try colon-separated format first
-total_bp = int(total_bp.replace(',', ''))
+match = re.search(r"Total scaffold length:\s*(.+)", content)
-return "{:,}".format(total_bp)
+if not match:
+# If not found, try tab-separated format
+match = re.search(r"Total scaffold length\t(.+)", content)
+if match:
+total_bp = match.group(1).replace(',', '')
+return "{:,}".format(int(total_bp))
+else:
+logging.error(f"Could not find Total scaffold length in {gfastats_path}")
+return "N/A"
 # compute EBP quality metric
 def compute_ebp_metric(haplotype, gfastats_path, qv_value):
 keys_needed = ["Contig N50", "Scaffold N50"]
 content = ''
 content = f.read()
 values = extract_gfastats_values(content, keys_needed)
 contig_n50_log = math.floor(math.log10(int(values[0].replace(',', ''))))
 scaffold_n50_log = math.floor(math.log10(int(values[1].replace(',', ''))))
 return f"Obtained EBP quality metric for {haplotype}: {contig_n50_log}.{scaffold_n50_log}.Q{math.floor(float(qv_value))}"
 # extract qv values
 def get_qv_value(file_path, order, tool, haplotype):
 try:
 # extract BUSCO info
 def extract_busco_info(file_path):
 busco_version = None
 lineage_info = None
+busco_mode = None
+busco_pred = None
 try:
 with open(file_path, 'r') as file:
 content = file.read()
 version_match = re.search(r"# BUSCO version is: ([\d.]+)", content)
 if version_match:
 busco_version = version_match.group(1)
-lineage_match = re.search(r"The lineage dataset is: (.*?) \(Creation date:.*?, number of genomes: (\d+), number of BUSCOs: (\d+)\)", content)
+lineage_match = re.search(r"The lineage dataset is: (.*?) \(Creation date:.*?, number of (genomes|species): (\d+), number of BUSCOs: (\d+)\)", content)
 if lineage_match:
-lineage_info = lineage_match.groups()
+lineage_info = (lineage_match.group(1), lineage_match.group(3), lineage_match.group(4))
-if not lineage_info:
+mode_match = re.search(r"# BUSCO was run in mode: (\w+)", content)
-lineage_match = re.search(r"The lineage dataset is: (.*?) \(Creation date:.*?, number of species: (\d+), number of BUSCOs: (\d+)\)", content)
+if mode_match:
-if lineage_match:
+busco_mode = mode_match.group(1)
-lineage_info = lineage_match.groups()
+pred_match = re.search(r"# Gene predictor used: (\w+)", content)
+if pred_match:
+busco_pred = pred_match.group(1)
 except Exception as e:
 logging.warning(f"Error reading {file_path}: {str(e)}")
-return busco_version, lineage_info
+return busco_version, lineage_info, busco_mode, busco_pred
 # Function to check and generate warning messages
 def generate_warning_paragraphs(expected, observed, trait):
 paragraphs = []
 try:
 if expected.strip().lower() != observed.strip().lower():
 message = ". Observed sex is different from Sample sex"
 paragraphs.append(Paragraph(message, styles["midiStyle"]))
 except Exception as e:
 logging.warning(f"Error in generating warning for {trait}: {str(e)}")
 return paragraphs
 # Generate warnings for curated haplotypes (qv, kcomp, busco)
 def generate_curated_warnings(haplotype, qv_value, completeness_value, busco_scores):
 paragraphs = []
 for asm_stage, stage_properties in asm_data.items():
 for haplotypes, haplotype_properties in stage_properties.items():
 if isinstance(haplotype_properties, dict):
 if 'gfastats--nstar-report_txt' in haplotype_properties:
 file_path = haplotype_properties['gfastats--nstar-report_txt']
-with open(file_path, 'r') as file:
+try:
-content = file.read()
+with open(file_path, 'r') as file:
-gfastats_data[(asm_stage, haplotypes)] = extract_gfastats_values(content, keys)
+content = file.read()
+gfastats_data[(asm_stage, haplotypes)] = extract_gfastats_values(content, keys)
+except FileNotFoundError:
+logging.error(f"Gfastats file not found: {file_path}")
+except Exception as e:
+logging.error(f"Error processing gfastats file {file_path}: {str(e)}")
 gaps_per_gbp_data = {}
 for (asm_stage, haplotypes), values in gfastats_data.items():
 try:
-gaps = float(values[gaps_index])
+gaps = float(values[gaps_index].replace(',', ''))
-total_length = float(values[total_length_index])
+total_length = float(values[total_length_index].replace(',', ''))
-gaps_per_gbp = round((gaps / total_length * 1_000_000_000), 2)
+if total_length > 0:
-gaps_per_gbp_data[(asm_stage, haplotypes)] = gaps_per_gbp
+gaps_per_gbp = round((gaps / total_length * 1_000_000_000), 2)
-except (ValueError, ZeroDivisionError):
+gaps_per_gbp_data[(asm_stage, haplotypes)] = gaps_per_gbp
-gaps_per_gbp_data[(asm_stage, haplotypes)] = ''
+else:
+logging.warning(f"Total length is zero for {asm_stage} {haplotypes}")
+gaps_per_gbp_data[(asm_stage, haplotypes)] = 'N/A'
+except (ValueError, IndexError):
+logging.warning(f"Could not calculate gaps per Gbp for {asm_stage} {haplotypes}")
+gaps_per_gbp_data[(asm_stage, haplotypes)] = 'N/A'
 # Define the contigging table (column names)
 asm_table_data = [["Metrics"] + [f'{asm_stage} \n {haplotypes}' for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]]]
 # Fill the table with the gfastats data
 for i in range(len(display_names)):
 metric = display_names[i]
 if metric not in exclusion_list:
-asm_table_data.append([metric] + [format_number(gfastats_data.get((asm_stage, haplotypes), [''])[i]) if (asm_stage, haplotypes) in gfastats_data else '' for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]])
+asm_table_data.append([metric] + [format_number(gfastats_data.get((asm_stage, haplotypes), ['N/A'] * len(keys))[i]) if (asm_stage, haplotypes) in gfastats_data else 'N/A' for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]])
 # Add the gaps/gbp in between
-asm_table_data.insert(gaps_index + 1, ['Gaps/Gbp'] + [format_number(gaps_per_gbp_data.get((asm_stage, haplotypes), '')) for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]])
+asm_table_data.insert(gaps_index + 1, ['Gaps/Gbp'] + [format_number(gaps_per_gbp_data.get((asm_stage, haplotypes), 'N/A')) for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]])
 # get QV, Kmer completeness and BUSCO data
 qv_data = {}
 completeness_data = {}
 busco_data = {metric: {} for metric in ['BUSCO sing.', 'BUSCO dupl.', 'BUSCO frag.', 'BUSCO miss.']}
 for asm_stage, stage_properties in asm_data.items():
 elements.append(asm_table)
 # Spacer
 elements.append(Spacer(1, 5))
-# Store BUSCO version and lineage information from each file in list
+# Store BUSCO information from each file in a list
 busco_info_list = []
 for asm_stages, stage_properties in asm_data.items():
 for i, haplotype_properties in stage_properties.items():
 if isinstance(haplotype_properties, dict):
 if 'busco_short_summary_txt' in haplotype_properties:
-busco_version, lineage_info = extract_busco_info(haplotype_properties['busco_short_summary_txt'])
+busco_info = extract_busco_info(haplotype_properties['busco_short_summary_txt'])
-if busco_version and lineage_info:
+if all(busco_info):
-busco_info_list.append((busco_version, lineage_info))
+busco_info_list.append(busco_info)
+# Function to format BUSCO information
+def format_busco_info(info):
+version, (lineage, genomes, buscos), mode, predictor = info
+return f"BUSCO: {version} ({mode}, {predictor}) / Lineage: {lineage} (genomes:{genomes}, BUSCOs:{buscos})"
 # Checking if all elements in the list are identical
-if all(info == busco_info_list[0] for info in busco_info_list):
+if busco_info_list and all(info == busco_info_list[0] for info in busco_info_list):
-busco_version, (lineage_name, num_genomes, num_buscos) = busco_info_list[0]
+busco_text = format_busco_info(busco_info_list[0])
-elements.append(Paragraph(f"BUSCO {busco_version} Lineage: {lineage_name} (genomes:{num_genomes}, BUSCOs:{num_buscos})", styles['miniStyle']))
+elements.append(Paragraph(busco_text, styles['miniStyle']))
 else:
-elements.append(Paragraph("Warning: BUSCO versions or lineage datasets are not the same across results", styles['miniStyle']))
+elements.append(Paragraph("Warning! BUSCO versions or lineage datasets are not the same across results:", styles['miniStyle']))
-logging.warning("WARNING!!! BUSCO versions or lineage datasets are not the same across results")
+logging.warning("WARNING: BUSCO versions or lineage datasets are not the same across results")
+for info in busco_info_list:
+busco_text = format_busco_info(info)
+elements.append(Paragraph(busco_text, styles['miniStyle']))
 # Page break
 elements.append(PageBreak())
 # PDF SECTION 3 -------------------------------------------------------------------------------
 missing_png_paragraph = Paragraph(f"<b>{haplotype}</b> HiC PNG is missing!", styles["midiStyle"])
 images_with_names.append([missing_png_paragraph])
 # Add paragraph for the link
 if link:
-link_html = f'<b>{haplotype}</b> <link href="{link}" color="blue">[LINK]</link>'
+link_html = f'<b>{haplotype}</b> <link href="{link}" color="blue">[non-permanent LINK]</link>'
 else:
 link_html = f'<b>{haplotype}</b> File link is missing!'
 link_paragraph = Paragraph(link_html, styles["midiStyle"])
 images_with_names.append([link_paragraph])
 # Iterate over haplotypes in the Curated category to get K-mer spectra images
 curated_assemblies = yaml_data.get('ASSEMBLIES', {}).get('Curated', {})
 # Get paths for spectra files
-spectra_files = {
+spectra_files = {'common': {}}
-'hap1': {
+for assembly_type, assembly_data in curated_assemblies.items():
-'spectra_cn_png': curated_assemblies.get('hap1', {}).get('merqury_hap_spectra_cn_png', None),
+if 'merqury_hap_spectra_cn_png' in assembly_data:
-},
+spectra_files[assembly_type] = {'spectra_cn_png': assembly_data['merqury_hap_spectra_cn_png']}
-'hap2': {
+if 'merqury_spectra_cn_png' in assembly_data:
-'spectra_cn_png': curated_assemblies.get('hap2', {}).get('merqury_hap_spectra_cn_png', None),
+spectra_files['common']['spectra_cn_png'] = assembly_data['merqury_spectra_cn_png']
-},
+if 'merqury_spectra_asm_png' in assembly_data:
-'common': {
+spectra_files['common']['spectra_asm_png'] = assembly_data['merqury_spectra_asm_png']
-'spectra_cn_png': curated_assemblies.get('hap1', {}).get('merqury_spectra_cn_png', None),
-'spectra_asm_png': curated_assemblies.get('hap1', {}).get('merqury_spectra_asm_png', None),
-}
-}
-# Filter out None values and empty strings
-spectra_files = {k: {sk: v for sk, v in sv.items() if v} for k, sv in spectra_files.items()}
 # Determine the number of spectra-cn files and assign unique names if needed
 spectra_cn_files = [
-spectra_files['common'].get('spectra_cn_png', None),
+file_dict.get('spectra_cn_png', None)
-spectra_files['hap1'].get('spectra_cn_png', None),
+for file_dict in spectra_files.values()
-spectra_files['hap2'].get('spectra_cn_png', None)
+if file_dict.get('spectra_cn_png')
 ]
-spectra_cn_files = [f for f in spectra_cn_files if f]  # Filter out None values
+spectra_cn_files = list(set(spectra_cn_files))  # Remove duplicates
 if len(spectra_cn_files) == 3:
-# For 3 spectra-cn files
 shortest_spectra_cn_file = min(spectra_cn_files, key=lambda f: len(os.path.basename(f)), default=None)
 similar_files = [f for f in spectra_cn_files if f != shortest_spectra_cn_file]
 if similar_files:
 unique_name1, unique_name2 = find_unique_parts(os.path.basename(similar_files[0]), os.path.basename(similar_files[1]))
 else:
 shortest_spectra_cn_file = spectra_cn_files[0] if spectra_cn_files else None
-unique_name1 = unique_name2 = None
+# unique_name1 = unique_name2 = None
 # Create image objects and add filename below each image
 images = []
 for label, file_dict in spectra_files.items():
 for key, png_file in file_dict.items():
-if png_file:
+if png_file and os.path.exists(png_file):
-image = Image(png_file, width=8.4 * cm, height=7 * cm)
+try:
-filename = os.path.basename(png_file)
+image = Image(png_file, width=8.4 * cm, height=7 * cm)
+filename = os.path.basename(png_file)
-if filename.endswith("spectra-asm.ln.png"):
-text = "Distribution of k-mer counts coloured by their presence in reads/assemblies"
+if filename.endswith("spectra-asm.ln.png"):
-elif filename.endswith("spectra-cn.ln.png"):
+text = "Distribution of k-mer counts coloured by their presence in reads/assemblies"
-if len(spectra_cn_files) == 3:
+elif filename.endswith("spectra-cn.ln.png"):
-# For 3 spectra-cn files use particular text
+if len(spectra_cn_files) == 3:
 if png_file == shortest_spectra_cn_file:
 text = "Distribution of k-mer counts per copy numbers found in asm (dipl.)"
+else:
+text = f"Distribution of k-mer counts per copy numbers found in {label} (hapl.)"
 else:
-if png_file == spectra_files['hap1'].get('spectra_cn_png', None):
+text = "Distribution of k-mer counts per copy numbers found in asm"
-text = f"Distribution of k-mer counts per copy numbers found in <b>{unique_name1}</b> (hapl.)"
-elif png_file == spectra_files['hap2'].get('spectra_cn_png', None):
-text = f"Distribution of k-mer counts per copy numbers found in <b>{unique_name2}</b> (hapl.)"
-else:
-text = "Distribution of k-mer counts per copy numbers found in asm"
 else:
-# For 2 spectra-cn files use same text
+text = filename
-text = "Distribution of k-mer counts per copy numbers found in asm"
-else:
+images.append([image, Paragraph(text, styles["midiStyle"])])
-text = filename
+except Exception as e:
+logging.error(f"Error processing image {png_file}: {str(e)}")
-images.append([image, Paragraph(text, styles["midiStyle"])])
-# Filter None values
-images = [img for img in images if img[0] is not None]
-# Get number of rows and columns for the table
-num_rows = (len(images) + 1) // 2  # +1 to handle odd numbers of images
-num_columns = 2
 # Create the table with dynamic size
-image_table_data = [[images[i * num_columns + j] if i * num_columns + j < len(images) else [] for j in range(num_columns)] for i in range(num_rows)]
+if images:
-image_table = Table(image_table_data)
+num_rows = (len(images) + 1) // 2
+num_columns = 2
-# Style the "table"
+image_table_data = [[images[i * num_columns + j] if i * num_columns + j < len(images) else [] for j in range(num_columns)] for i in range(num_rows)]
-table_style = TableStyle([
+image_table = Table(image_table_data)
-('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
-('BOTTOMPADDING', (0, 0), (-1, -1), 20),  # 20 here is a spacer between rows
+# Style the table
-])
+table_style = TableStyle([
+('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
-# Set the style
+('BOTTOMPADDING', (0, 0), (-1, -1), 20),  # 20 here is a spacer between rows
-image_table.setStyle(table_style)
+])
-# Add image table to elements
+image_table.setStyle(table_style)
 elements.append(image_table)
+else:
+elements.append(Paragraph("No K-mer spectra images available.", styles["midiStyle"]))
 # Increase counter by the number of PNGs added
 counter += len(images)
 # If counter is a multiple of 4, insert a page break and reset counter

Mercurial > repos > bgruening > erga_ear

comparison make_EAR.py @ 2:0efed25f6d38 draft default tip