diff make_EAR.py @ 1:82450f7907ef draft
planemo upload for repository https://github.com/ERGA-consortium/EARs/tree/main commit e293d14e82a903a4cab64dd72dfa3f3798466176
author:   bgruening
date:     Fri, 30 Aug 2024 09:27:17 +0000
parents:  6af76d4371f8
children: 0efed25f6d38
--- a/make_EAR.py	Thu Jul 04 12:43:34 2024 +0000
+++ b/make_EAR.py	Fri Aug 30 09:27:17 2024 +0000
@@ -1,6 +1,5 @@
 import argparse
-import glob
 import logging
 import math
 import os
@@ -22,7 +21,7 @@
 # CAUTION: This is for the Galaxy version!
 # by Diego De Panis
 # ERGA Sequencing and Assembly Committee
-EAR_version = "v24.05.20_glxy_beta"
+EAR_version = "v24.08.26"
 
 
 def make_report(yaml_file):
@@ -120,19 +119,9 @@
             fifth_column_value = target_line.split('\t')[4].strip()
             return fifth_column_value
         except Exception as e:
-            logging.warning(f"Error reading {file_path}: {str(e)}")
+            logging.error(f"Error reading {file_path} for tool {tool} and haplotype {haplotype}: {str(e)}")
             return ''
 
-    # Getting kmer plots for curated asm
-    def get_png_files(dir_path):
-        png_files = glob.glob(f"{dir_path}/*.ln.png")
-        if len(png_files) < 4:
-            logging.warning(f"Warning: Less than 4 png files found in {dir_path}. If this is diploid, some images may be missing.")
-            # fill missing with None
-            while len(png_files) < 4:
-                png_files.append(None)
-        return png_files[:4]
-
     # get unique part in file names
     def find_unique_parts(file1, file2):
         # Split filenames into parts
@@ -141,7 +130,6 @@
         # Find unique parts
         unique_parts1 = [part for part in parts1 if part not in parts2]
         unique_parts2 = [part for part in parts2 if part not in parts1]
-
         return ' '.join(unique_parts1), ' '.join(unique_parts2)
 
     # extract BUSCO values
@@ -274,33 +262,34 @@
     # Parse pipeline and generate "tree"
     def generate_pipeline_tree(pipeline_data):
         tree_lines = []
-        indent = "&nbsp;" * 2  # Adjust indent spacing as needed
-
-        for tool_version_param in pipeline_data:
-            parts = tool_version_param.split('|')
-            tool_version = parts[0]
-            tool, version = tool_version.split('_v') if '_v' in tool_version else (tool_version, "NA")
+        indent = "&nbsp;" * 2  # Adjust indent spacing
 
-            # Handle parameters: join all but the first (which is tool_version) with ', '
-            param_text = ', '.join(parts[1:]) if len(parts) > 1 else "NA"
+        if isinstance(pipeline_data, dict):
+            for tool, version_param in pipeline_data.items():
+                # Tool line
+                tool_line = f"- <b>{tool}</b>"
+                tree_lines.append(tool_line)
 
-            # Tool line
-            tool_line = f"- <b>{tool}</b>"
-            tree_lines.append(tool_line)
+                # Convert version_param to string and split
+                version_param_str = str(version_param)
+                parts = version_param_str.split('/')
+                version = parts[0]
+                params = [p for p in parts[1:] if p]  # This will remove empty strings
 
-            # Version line
-            version_line = f"{indent*2}|_ <i>ver:</i> {version}"
-            tree_lines.append(version_line)
+                # Version line
+                version_line = f"{indent * 2}|_ <i>ver:</i> {version}"
+                tree_lines.append(version_line)
 
-            # Param line(s)
-            if param_text != "NA":
-                for param in param_text.split(','):
-                    param = param.strip()
-                    param_line = f"{indent*2}|_ <i>key param:</i> {param if param else 'NA'}"
-                    tree_lines.append(param_line)
-            else:
-                param_line = f"{indent*2}|_ <i>key param:</i> NA"
-                tree_lines.append(param_line)
+                # Param line(s)
+                if params:
+                    for param in params:
+                        param_line = f"{indent * 2}|_ <i>key param:</i> {param}"
+                        tree_lines.append(param_line)
+                else:
+                    param_line = f"{indent * 2}|_ <i>key param:</i> NA"
+                    tree_lines.append(param_line)
+        else:
+            tree_lines.append("Invalid pipeline data format")
 
         # Join lines with HTML break for paragraph
         tree_diagram = "<br/>".join(tree_lines)
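Note: the rewritten generate_pipeline_tree() expects each pipeline entry to map a tool name to a "version/param1/param2" string. A minimal standalone sketch of that convention; the tool names and parameters below are invented for illustration, not taken from a real EAR yaml:

    pipeline_data = {
        "hifiasm": "0.19.8/--ul",
        "purge_dups": "1.2.6",
    }

    for tool, version_param in pipeline_data.items():
        parts = str(version_param).split('/')
        version = parts[0]
        params = [p for p in parts[1:] if p]  # drop empty strings left by a trailing '/'
        print(f"- {tool}")
        print(f"    |_ ver: {version}")
        for param in params or ["NA"]:
            print(f"    |_ key param: {param}")

Anything that is not a dict falls through to the "Invalid pipeline data format" branch, so an old-style list from a pre-PIPELINES yaml degrades gracefully instead of raising.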
- logging.warning("# SAMPLE INFORMATION section in the yaml file contains an invalid tag. Valid tags are ERGA-BGE, ERGA-Pilot and ERGA-Satellite") + logging.warning("# SAMPLE INFORMATION section in the yaml file contains an invalid tag. Valid tags are ERGA-BGE, ERGA-Pilot and ERGA-Community.") # Get data from GoaT based on species name # urllib.parse.quote to handle special characters and spaces in the species name @@ -401,16 +390,15 @@ # Create a list of lists for the table table_data = [headers, data_values] - # Extract pipeline data from 'Pre-curation' category - asm_pipeline_data = yaml_data.get('ASSEMBLIES', {}).get('Pre-curation', {}).get('pipeline', []) - asm_pipeline_tree = generate_pipeline_tree(asm_pipeline_data) + # Extract pipeline data + asm_pipeline_data = yaml_data.get('PIPELINES', {}).get('Assembly', {}) + curation_pipeline_data = yaml_data.get('PIPELINES', {}).get('Curation', {}) # Extract pipeline data from 'Curated' category - curation_pipeline_data = yaml_data.get('ASSEMBLIES', {}).get('Curated', {}).get('pipeline', []) + asm_pipeline_tree = generate_pipeline_tree(asm_pipeline_data) curation_pipeline_tree = generate_pipeline_tree(curation_pipeline_data) # Reading GENOME PROFILING DATA section from yaml ############################################# - profiling_data = yaml_data.get('PROFILING') # Check if profiling_data is available @@ -418,38 +406,46 @@ logging.error('Error: No profiling data found in the YAML file.') sys.exit(1) - # Handle GenomeScope specific processing + # Check for GenomeScope data (mandatory) genomescope_data = profiling_data.get('GenomeScope') - if genomescope_data: - summary_file = genomescope_data.get('genomescope_summary_txt') - if summary_file and os.path.exists(summary_file): - with open(summary_file, "r") as f: - summary_txt = f.read() - genome_haploid_length = re.search(r"Genome Haploid Length\s+([\d,]+) bp", summary_txt).group(1) - proposed_ploidy_match = re.search(r"p = (\d+)", summary_txt) - proposed_ploidy = proposed_ploidy_match.group(1) if proposed_ploidy_match else 'NA' - else: - logging.error(f"File {summary_file} not found for GenomeScope.") - sys.exit(1) - else: - logging.error("GenomeScope data is missing in the PROFILING section.") + if not genomescope_data: + logging.error("Error: GenomeScope data is missing in the YAML file. 
@@ -418,38 +406,46 @@
         logging.error('Error: No profiling data found in the YAML file.')
         sys.exit(1)
 
-    # Handle GenomeScope specific processing
+    # Check for GenomeScope data (mandatory)
     genomescope_data = profiling_data.get('GenomeScope')
-    if genomescope_data:
-        summary_file = genomescope_data.get('genomescope_summary_txt')
-        if summary_file and os.path.exists(summary_file):
-            with open(summary_file, "r") as f:
-                summary_txt = f.read()
-            genome_haploid_length = re.search(r"Genome Haploid Length\s+([\d,]+) bp", summary_txt).group(1)
-            proposed_ploidy_match = re.search(r"p = (\d+)", summary_txt)
-            proposed_ploidy = proposed_ploidy_match.group(1) if proposed_ploidy_match else 'NA'
-        else:
-            logging.error(f"File {summary_file} not found for GenomeScope.")
-            sys.exit(1)
-    else:
-        logging.error("GenomeScope data is missing in the PROFILING section.")
+    if not genomescope_data:
+        logging.error("Error: GenomeScope data is missing in the YAML file. This is mandatory.")
+        sys.exit(1)
+
+    genomescope_summary = genomescope_data.get('genomescope_summary_txt')
+    if not genomescope_summary:
+        logging.error("Error: GenomeScope summary file path is missing in the YAML file.")
         sys.exit(1)
 
-    # Handle Smudgeplot specific processing
+    # Read the content of the GenomeScope summary file
+    try:
+        with open(genomescope_summary, "r") as f:
+            summary_txt = f.read()
+        # Extract values from summary.txt
+        genome_haploid_length = re.search(r"Genome Haploid Length\s+([\d,]+) bp", summary_txt).group(1)
+        proposed_ploidy = re.search(r"p = (\d+)", summary_txt).group(1)
+    except Exception as e:
+        logging.error(f"Error reading GenomeScope summary file: {str(e)}")
+        sys.exit(1)
+
+    # Check for Smudgeplot data (optional)
    smudgeplot_data = profiling_data.get('Smudgeplot')
     if smudgeplot_data:
-        verbose_summary_file = smudgeplot_data.get('smudgeplot_verbose_summary_txt')
-        if verbose_summary_file and os.path.exists(verbose_summary_file):
-            with open(verbose_summary_file, "r") as f:
-                smud_summary_txt = f.readlines()
-            for line in smud_summary_txt:
-                if line.startswith("* Proposed ploidy"):
-                    proposed_ploidy = line.split(":")[1].strip()
-                    break
+        smudgeplot_summary = smudgeplot_data.get('smudgeplot_verbose_summary_txt')
+        if smudgeplot_summary:
+            try:
+                with open(smudgeplot_summary, "r") as f:
+                    smud_summary_txt = f.readlines()
+                for line in smud_summary_txt:
+                    if line.startswith("* Proposed ploidy"):
+                        proposed_ploidy = line.split(":")[1].strip()
+                        break
+            except Exception as e:
+                logging.warning(f"Error reading Smudgeplot summary file: {str(e)}. Using GenomeScope ploidy.")
         else:
-            logging.warning(f"Verbose summary file {verbose_summary_file} not found for Smudgeplot; skipping detailed Smudgeplot analysis.")
+            logging.warning("Smudgeplot summary file path is missing. Using GenomeScope ploidy.")
     else:
-        logging.warning("Smudgeplot data is missing in the PROFILING section; skipping Smudgeplot analysis.")
+        logging.info("Smudgeplot data not provided. Using GenomeScope ploidy.")
 
 
     # Reading ASSEMBLY DATA section from yaml #####################################################
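Note: the two re.search() calls assume the GenomeScope summary keeps its usual layout. An abridged, illustrative check; the numbers are made up:

    import re

    summary_txt = """p = 2
    Genome Haploid Length   1,234,567 bp   1,240,001 bp
    """

    print(re.search(r"Genome Haploid Length\s+([\d,]+) bp", summary_txt).group(1))  # 1,234,567
    print(re.search(r"p = (\d+)", summary_txt).group(1))  # 2

If either pattern is absent, .group(1) is called on None, the resulting AttributeError lands in the except branch, and the report aborts. This is stricter than the old code, which fell back to 'NA' when the ploidy pattern was missing.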
@@ -459,7 +455,7 @@
     asm_stages = []
     for asm_stage, stage_properties in asm_data.items():
         for haplotypes in stage_properties.keys():
-            if haplotypes != 'pipeline' and haplotypes not in asm_stages:
+            if haplotypes not in asm_stages:
                 asm_stages.append(haplotypes)
 
     # get gfastats-based data
@@ -483,7 +479,7 @@
             except (ValueError, ZeroDivisionError):
                 gaps_per_gbp_data[(asm_stage, haplotypes)] = ''
 
-    # Define the contigging table (column names) DON'T MOVE THIS AGAIN!!!!!!!
+    # Define the contigging table (column names)
     asm_table_data = [["Metrics"] + [f'{asm_stage} \n {haplotypes}' for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]]]
 
     # Fill the table with the gfastats data
@@ -493,8 +489,6 @@
         asm_table_data.append([metric] + [format_number(gfastats_data.get((asm_stage, haplotypes), [''])[i]) if (asm_stage, haplotypes) in gfastats_data else '' for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]])
 
     # Add the gaps/gbp in between
-    gc_index = display_names.index("GC %")
-    gc_index
     asm_table_data.insert(gaps_index + 1, ['Gaps/Gbp'] + [format_number(gaps_per_gbp_data.get((asm_stage, haplotypes), '')) for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]])
 
     # get QV, Kmer completeness and BUSCO data
@@ -502,7 +496,7 @@
     completeness_data = {}
     busco_data = {metric: {} for metric in ['BUSCO sing.', 'BUSCO dupl.', 'BUSCO frag.', 'BUSCO miss.']}
     for asm_stage, stage_properties in asm_data.items():
-        asm_stage_elements = [element for element in stage_properties.keys() if element != 'pipeline']
+        asm_stage_elements = list(stage_properties.keys())
         for i, haplotypes in enumerate(asm_stage_elements):
             haplotype_properties = stage_properties[haplotypes]
             if isinstance(haplotype_properties, dict):
@@ -580,7 +574,7 @@
     styles.add(ParagraphStyle(name='subTitleStyle', fontName='Courier', fontSize=16))
     styles.add(ParagraphStyle(name='normalStyle', fontName='Courier', fontSize=12))
     styles.add(ParagraphStyle(name='midiStyle', fontName='Courier', fontSize=10))
-    styles.add(ParagraphStyle(name='LinkStyle', fontName='Courier', fontSize=10, textColor='blue', underline=True))
+    # styles.add(ParagraphStyle(name='LinkStyle', fontName='Courier', fontSize=10, textColor='blue', underline=True))
     styles.add(ParagraphStyle(name='treeStyle', fontName='Courier', fontSize=10, leftIndent=12))
     styles.add(ParagraphStyle(name='miniStyle', fontName='Courier', fontSize=8))
     styles.add(ParagraphStyle(name='FileNameStyle', fontName='Courier', fontSize=6))
@@ -659,7 +653,7 @@
 
     # Iterate over haplotypes in the Curated category to get data for EBP metrics
     curated_assemblies = yaml_data.get('ASSEMBLIES', {}).get('Curated', {})
-    haplotype_names = [key for key in curated_assemblies.keys() if key != 'pipeline']
+    haplotype_names = list(curated_assemblies.keys())
 
     for haplotype in haplotype_names:
         properties = curated_assemblies[haplotype]
@@ -756,7 +750,7 @@
     # Store BUSCO version and lineage information from each file in list
     busco_info_list = []
     for asm_stages, stage_properties in asm_data.items():
-        for haplotype_keys, haplotype_properties in stage_properties.items():
+        for i, haplotype_properties in stage_properties.items():
             if isinstance(haplotype_properties, dict):
                 if 'busco_short_summary_txt' in haplotype_properties:
                     busco_version, lineage_info = extract_busco_info(haplotype_properties['busco_short_summary_txt'])
@@ -787,9 +781,9 @@
     tool_count = 0
 
     # Add title and images for each step
-    for idx, (asm_stages, stage_properties) in enumerate(asm_data.items(), 1):
+    for asm_stages, stage_properties in asm_data.items():
         if asm_stages == 'Curated':
-            tool_elements = [element for element in stage_properties.keys() if element != 'pipeline']
+            tool_elements = list(stage_properties.keys())
 
             images_with_names = []
 
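Note: several hunks above make the same simplification. With pipeline metadata moved to the top-level PIPELINES section, every key under an ASSEMBLIES stage is a haplotype, so the old "element != 'pipeline'" filters reduce to plain key listings. A toy illustration; the stage layout follows the yaml, the haplotype names are invented:

    asm_data = {
        'Pre-curation': {'hap1': {}, 'hap2': {}},
        'Curated': {'hap1': {}, 'hap2': {}},
    }

    asm_stages = []
    for asm_stage, stage_properties in asm_data.items():
        for haplotypes in stage_properties.keys():
            if haplotypes not in asm_stages:
                asm_stages.append(haplotypes)

    print(asm_stages)  # ['hap1', 'hap2']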
@@ -825,7 +819,7 @@
             # Add images and names to the elements in pairs
             for i in range(0, len(images_with_names), 4):
                 # Process two images (and their names) at a time
-                elements_to_add = images_with_names[i:i + 4]
+                elements_to_add = images_with_names[i: i + 4]
 
                 # Create table for the images and names
                 table = Table(elements_to_add)
@@ -856,7 +850,6 @@
 
     # Iterate over haplotypes in the Curated category to get K-mer spectra images
     curated_assemblies = yaml_data.get('ASSEMBLIES', {}).get('Curated', {})
-    haplotype_names = [key for key in curated_assemblies.keys() if key != 'pipeline']
 
     # Get paths for spectra files
     spectra_files = {
@@ -974,9 +967,9 @@
     tool_count = 0
 
     # Add title and images for each step
-    for idx, (asm_stages, stage_properties) in enumerate(asm_data.items(), 1):
+    for asm_stages, stage_properties in asm_data.items():
         if asm_stages == 'Curated':  # Check if the current stage is 'Curated'
-            tool_elements = [element for element in stage_properties.keys() if element != 'pipeline']
+            tool_elements = list(stage_properties.keys())
 
             for haplotype in tool_elements:
                 haplotype_properties = stage_properties[haplotype]