diff make_EAR.py @ 1:82450f7907ef draft

planemo upload for repository https://github.com/ERGA-consortium/EARs/tree/main commit e293d14e82a903a4cab64dd72dfa3f3798466176
author bgruening
date Fri, 30 Aug 2024 09:27:17 +0000
parents 6af76d4371f8
children 0efed25f6d38
--- a/make_EAR.py	Thu Jul 04 12:43:34 2024 +0000
+++ b/make_EAR.py	Fri Aug 30 09:27:17 2024 +0000
@@ -1,6 +1,5 @@
 
 import argparse
-import glob
 import logging
 import math
 import os
@@ -22,7 +21,7 @@
 # CAUTION: This is for the Galaxy version!
 # by Diego De Panis
 # ERGA Sequencing and Assembly Committee
-EAR_version = "v24.05.20_glxy_beta"
+EAR_version = "v24.08.26"
 
 
 def make_report(yaml_file):
@@ -120,19 +119,9 @@
                     fifth_column_value = target_line.split('\t')[4].strip()
                     return fifth_column_value
         except Exception as e:
-            logging.warning(f"Error reading {file_path}: {str(e)}")
+            logging.error(f"Error reading {file_path} for tool {tool} and haplotype {haplotype}: {str(e)}")
             return ''
 
-    # Getting kmer plots for curated asm
-    def get_png_files(dir_path):
-        png_files = glob.glob(f"{dir_path}/*.ln.png")
-        if len(png_files) < 4:
-            logging.warning(f"Warning: Less than 4 png files found in {dir_path}. If this is diploid, some images may be missing.")
-            # fill missing with None
-            while len(png_files) < 4:
-                png_files.append(None)
-        return png_files[:4]
-
     # get unique part in file names
     def find_unique_parts(file1, file2):
         # Split filenames into parts
@@ -141,7 +130,6 @@
         # Find unique parts
         unique_parts1 = [part for part in parts1 if part not in parts2]
         unique_parts2 = [part for part in parts2 if part not in parts1]
-
         return ' '.join(unique_parts1), ' '.join(unique_parts2)
 
     # extract BUSCO values
@@ -274,33 +262,34 @@
     # Parse pipeline and generate "tree"
     def generate_pipeline_tree(pipeline_data):
         tree_lines = []
-        indent = "&nbsp;" * 2  # Adjust indent spacing as needed
-
-        for tool_version_param in pipeline_data:
-            parts = tool_version_param.split('|')
-            tool_version = parts[0]
-            tool, version = tool_version.split('_v') if '_v' in tool_version else (tool_version, "NA")
+        indent = "&nbsp;" * 2  # Adjust indent spacing
 
-            # Handle parameters: join all but the first (which is tool_version) with ', '
-            param_text = ', '.join(parts[1:]) if len(parts) > 1 else "NA"
+        if isinstance(pipeline_data, dict):
+            for tool, version_param in pipeline_data.items():
+                # Tool line
+                tool_line = f"- <b>{tool}</b>"
+                tree_lines.append(tool_line)
 
-            # Tool line
-            tool_line = f"- <b>{tool}</b>"
-            tree_lines.append(tool_line)
+                # Convert version_param to string and split
+                version_param_str = str(version_param)
+                parts = version_param_str.split('/')
+                version = parts[0]
+                params = [p for p in parts[1:] if p]  # This will remove empty strings
 
-            # Version line
-            version_line = f"{indent*2}|_ <i>ver:</i> {version}"
-            tree_lines.append(version_line)
+                # Version line
+                version_line = f"{indent * 2}|_ <i>ver:</i> {version}"
+                tree_lines.append(version_line)
 
-            # Param line(s)
-            if param_text != "NA":
-                for param in param_text.split(','):
-                    param = param.strip()
-                    param_line = f"{indent*2}|_ <i>key param:</i> {param if param else 'NA'}"
+                # Param line(s)
+                if params:
+                    for param in params:
+                        param_line = f"{indent * 2}|_ <i>key param:</i> {param}"
+                        tree_lines.append(param_line)
+                else:
+                    param_line = f"{indent * 2}|_ <i>key param:</i> NA"
                     tree_lines.append(param_line)
-            else:
-                param_line = f"{indent*2}|_ <i>key param:</i> NA"
-                tree_lines.append(param_line)
+        else:
+            tree_lines.append("Invalid pipeline data format")
 
         # Join lines with HTML break for paragraph
         tree_diagram = "<br/>".join(tree_lines)
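The rewritten generate_pipeline_tree expects a dict mapping each tool name to a "version/param/param" string, replacing the old "tool_vVERSION|param" list format. A minimal runnable sketch of the new parsing, using a hypothetical tool entry (not taken from a real EAR YAML):

    # Hypothetical PIPELINES entry in the shape the new parser expects
    pipeline_data = {"hifiasm": "0.19.5/--ul ont.fq.gz/-l0"}

    indent = "&nbsp;" * 2
    tree_lines = []
    for tool, version_param in pipeline_data.items():
        parts = str(version_param).split('/')  # "version/param/param"
        version, params = parts[0], [p for p in parts[1:] if p]
        tree_lines.append(f"- <b>{tool}</b>")
        tree_lines.append(f"{indent * 2}|_ <i>ver:</i> {version}")
        for param in params or ["NA"]:  # fall back to NA when no params given
            tree_lines.append(f"{indent * 2}|_ <i>key param:</i> {param}")

    print("<br/>".join(tree_lines))
    # - <b>hifiasm</b><br/>&nbsp;&nbsp;&nbsp;&nbsp;|_ <i>ver:</i> 0.19.5<br/>...
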
@@ -330,10 +319,10 @@
     tags = yaml_data["Tags"]
 
     # Check if tag is valid
-    valid_tags = ["ERGA-BGE", "ERGA-Pilot", "ERGA-Satellite"]
+    valid_tags = ["ERGA-BGE", "ERGA-Pilot", "ERGA-Community", "ERGA-testing"]
     if tags not in valid_tags:
         tags += "[INVALID TAG]"
-        logging.warning("# SAMPLE INFORMATION section in the yaml file contains an invalid tag. Valid tags are ERGA-BGE, ERGA-Pilot and ERGA-Satellite")
+        logging.warning("# SAMPLE INFORMATION section in the yaml file contains an invalid tag. Valid tags are ERGA-BGE, ERGA-Pilot and ERGA-Community.")
 
     # Get data from GoaT based on species name
     # urllib.parse.quote to handle special characters and spaces in the species name
@@ -401,16 +390,15 @@
     # Create a list of lists for the table
     table_data = [headers, data_values]
 
-    # Extract pipeline data from 'Pre-curation' category
-    asm_pipeline_data = yaml_data.get('ASSEMBLIES', {}).get('Pre-curation', {}).get('pipeline', [])
-    asm_pipeline_tree = generate_pipeline_tree(asm_pipeline_data)
+    # Extract pipeline data
+    asm_pipeline_data = yaml_data.get('PIPELINES', {}).get('Assembly', {})
+    curation_pipeline_data = yaml_data.get('PIPELINES', {}).get('Curation', {})
 
     # Extract pipeline data from 'Curated' category
-    curation_pipeline_data = yaml_data.get('ASSEMBLIES', {}).get('Curated', {}).get('pipeline', [])
+    asm_pipeline_tree = generate_pipeline_tree(asm_pipeline_data)
     curation_pipeline_tree = generate_pipeline_tree(curation_pipeline_data)
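
These lookups assume the pipeline metadata moved from a per-stage pipeline key under ASSEMBLIES to a top-level PIPELINES block. A sketch of the assumed YAML shape (tool names and versions are illustrative, not prescribed by the script):

    import yaml

    doc = yaml.safe_load("""
    PIPELINES:
      Assembly:
        hifiasm: 0.19.5/-l0
        purge_dups: 1.2.6
      Curation:
        GRIT_Rapid: 2.0
    """)
    asm_pipeline_data = doc.get('PIPELINES', {}).get('Assembly', {})
    print(asm_pipeline_data)  # {'hifiasm': '0.19.5/-l0', 'purge_dups': '1.2.6'}

Note that YAML parses a bare version like 2.0 as a float, which is why generate_pipeline_tree casts each value with str() before splitting.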
 
     # Reading GENOME PROFILING DATA section from yaml #############################################
-
     profiling_data = yaml_data.get('PROFILING')
 
     # Check if profiling_data is available
@@ -418,38 +406,46 @@
         logging.error('Error: No profiling data found in the YAML file.')
         sys.exit(1)
 
-    # Handle GenomeScope specific processing
+    # Check for GenomeScope data (mandatory)
     genomescope_data = profiling_data.get('GenomeScope')
-    if genomescope_data:
-        summary_file = genomescope_data.get('genomescope_summary_txt')
-        if summary_file and os.path.exists(summary_file):
-            with open(summary_file, "r") as f:
-                summary_txt = f.read()
-            genome_haploid_length = re.search(r"Genome Haploid Length\s+([\d,]+) bp", summary_txt).group(1)
-            proposed_ploidy_match = re.search(r"p = (\d+)", summary_txt)
-            proposed_ploidy = proposed_ploidy_match.group(1) if proposed_ploidy_match else 'NA'
-        else:
-            logging.error(f"File {summary_file} not found for GenomeScope.")
-            sys.exit(1)
-    else:
-        logging.error("GenomeScope data is missing in the PROFILING section.")
+    if not genomescope_data:
+        logging.error("Error: GenomeScope data is missing in the YAML file. This is mandatory.")
+        sys.exit(1)
+
+    genomescope_summary = genomescope_data.get('genomescope_summary_txt')
+    if not genomescope_summary:
+        logging.error("Error: GenomeScope summary file path is missing in the YAML file.")
         sys.exit(1)
 
-    # Handle Smudgeplot specific processing
+    # Read the content of the GenomeScope summary file
+    try:
+        with open(genomescope_summary, "r") as f:
+            summary_txt = f.read()
+        # Extract values from summary.txt
+        genome_haploid_length = re.search(r"Genome Haploid Length\s+([\d,]+) bp", summary_txt).group(1)
+        proposed_ploidy = re.search(r"p = (\d+)", summary_txt).group(1)
+    except Exception as e:
+        logging.error(f"Error reading GenomeScope summary file: {str(e)}")
+        sys.exit(1)
+
+    # Check for Smudgeplot data (optional)
     smudgeplot_data = profiling_data.get('Smudgeplot')
     if smudgeplot_data:
-        verbose_summary_file = smudgeplot_data.get('smudgeplot_verbose_summary_txt')
-        if verbose_summary_file and os.path.exists(verbose_summary_file):
-            with open(verbose_summary_file, "r") as f:
-                smud_summary_txt = f.readlines()
-            for line in smud_summary_txt:
-                if line.startswith("* Proposed ploidy"):
-                    proposed_ploidy = line.split(":")[1].strip()
-                    break
+        smudgeplot_summary = smudgeplot_data.get('smudgeplot_verbose_summary_txt')
+        if smudgeplot_summary:
+            try:
+                with open(smudgeplot_summary, "r") as f:
+                    smud_summary_txt = f.readlines()
+                for line in smud_summary_txt:
+                    if line.startswith("* Proposed ploidy"):
+                        proposed_ploidy = line.split(":")[1].strip()
+                        break
+            except Exception as e:
+                logging.warning(f"Error reading Smudgeplot summary file: {str(e)}. Using GenomeScope ploidy.")
         else:
-            logging.warning(f"Verbose summary file {verbose_summary_file} not found for Smudgeplot; skipping detailed Smudgeplot analysis.")
+            logging.warning("Smudgeplot summary file path is missing. Using GenomeScope ploidy.")
     else:
-        logging.warning("Smudgeplot data is missing in the PROFILING section; skipping Smudgeplot analysis.")
+        logging.info("Smudgeplot data not provided. Using GenomeScope ploidy.")
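
The tightened GenomeScope block assumes summary.txt contains a "Genome Haploid Length" row and the fitted "p = N" model line; since .group(1) now raises when a pattern is absent, a malformed summary lands in the except branch and exits. A sketch against a fabricated two-line excerpt:

    import re

    # Fabricated excerpt in the format the regexes above expect
    summary_txt = (
        "p = 2\n"
        "Genome Haploid Length   1,234,567,890 bp\n"
    )
    genome_haploid_length = re.search(r"Genome Haploid Length\s+([\d,]+) bp", summary_txt).group(1)
    proposed_ploidy = re.search(r"p = (\d+)", summary_txt).group(1)
    print(genome_haploid_length, proposed_ploidy)  # 1,234,567,890 2
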
 
     # Reading ASSEMBLY DATA section from yaml #####################################################
 
@@ -459,7 +455,7 @@
     asm_stages = []
     for asm_stage, stage_properties in asm_data.items():
         for haplotypes in stage_properties.keys():
-            if haplotypes != 'pipeline' and haplotypes not in asm_stages:
+            if haplotypes not in asm_stages:
                 asm_stages.append(haplotypes)
 
     # get gfastats-based data
@@ -483,7 +479,7 @@
         except (ValueError, ZeroDivisionError):
             gaps_per_gbp_data[(asm_stage, haplotypes)] = ''
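
The ZeroDivisionError guard implies gaps/Gbp is a ratio of gap count to assembly span; one plausible reading, stated as an assumption since the actual arithmetic sits outside this hunk:

    # Assumed formula, not shown in the diff: gap count scaled to gigabases
    gaps = 150                  # hypothetical gap count from gfastats
    total_bp = 1_234_567_890    # hypothetical total assembly length
    gaps_per_gbp = round(gaps / (total_bp / 1e9), 2)
    print(gaps_per_gbp)  # 121.5
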
 
-    # Define the contigging table (column names) DON'T MOVE THIS AGAIN!!!!!!!
+    # Define the contigging table (column names)
     asm_table_data = [["Metrics"] + [f'{asm_stage} \n {haplotypes}' for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]]]
 
     # Fill the table with the gfastats data
@@ -493,8 +489,6 @@
             asm_table_data.append([metric] + [format_number(gfastats_data.get((asm_stage, haplotypes), [''])[i]) if (asm_stage, haplotypes) in gfastats_data else '' for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]])
 
     # Add the gaps/gbp in between
-    gc_index = display_names.index("GC %")
-    gc_index
     asm_table_data.insert(gaps_index + 1, ['Gaps/Gbp'] + [format_number(gaps_per_gbp_data.get((asm_stage, haplotypes), '')) for asm_stage in asm_data for haplotypes in asm_stages if haplotypes in asm_data[asm_stage]])
 
     # get QV, Kmer completeness and BUSCO data
@@ -502,7 +496,7 @@
     completeness_data = {}
     busco_data = {metric: {} for metric in ['BUSCO sing.', 'BUSCO dupl.', 'BUSCO frag.', 'BUSCO miss.']}
     for asm_stage, stage_properties in asm_data.items():
-        asm_stage_elements = [element for element in stage_properties.keys() if element != 'pipeline']
+        asm_stage_elements = list(stage_properties.keys())
         for i, haplotypes in enumerate(asm_stage_elements):
             haplotype_properties = stage_properties[haplotypes]
             if isinstance(haplotype_properties, dict):
@@ -580,7 +574,7 @@
     styles.add(ParagraphStyle(name='subTitleStyle', fontName='Courier', fontSize=16))
     styles.add(ParagraphStyle(name='normalStyle', fontName='Courier', fontSize=12))
     styles.add(ParagraphStyle(name='midiStyle', fontName='Courier', fontSize=10))
-    styles.add(ParagraphStyle(name='LinkStyle', fontName='Courier', fontSize=10, textColor='blue', underline=True))
+    # styles.add(ParagraphStyle(name='LinkStyle', fontName='Courier', fontSize=10, textColor='blue', underline=True))
     styles.add(ParagraphStyle(name='treeStyle', fontName='Courier', fontSize=10, leftIndent=12))
     styles.add(ParagraphStyle(name='miniStyle', fontName='Courier', fontSize=8))
     styles.add(ParagraphStyle(name='FileNameStyle', fontName='Courier', fontSize=6))
@@ -659,7 +653,7 @@
 
     # Iterate over haplotypes in the Curated category to get data for EBP metrics
     curated_assemblies = yaml_data.get('ASSEMBLIES', {}).get('Curated', {})
-    haplotype_names = [key for key in curated_assemblies.keys() if key != 'pipeline']
+    haplotype_names = list(curated_assemblies.keys())
 
     for haplotype in haplotype_names:
         properties = curated_assemblies[haplotype]
@@ -756,7 +750,7 @@
     # Store BUSCO version and lineage information from each file in list
     busco_info_list = []
     for asm_stages, stage_properties in asm_data.items():
-        for haplotype_keys, haplotype_properties in stage_properties.items():
+        for i, haplotype_properties in stage_properties.items():
             if isinstance(haplotype_properties, dict):
                 if 'busco_short_summary_txt' in haplotype_properties:
                     busco_version, lineage_info = extract_busco_info(haplotype_properties['busco_short_summary_txt'])
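
extract_busco_info presumably reads the header of each BUSCO short_summary.txt; a sketch with fabricated header lines in the standard BUSCO format (the regexes here are assumptions, not the script's own):

    import re

    # Fabricated short_summary.txt header lines
    header = (
        "# BUSCO version is: 5.4.7\n"
        "# The lineage dataset is: vertebrata_odb10 (Creation date: 2021-02-19)\n"
    )
    busco_version = re.search(r"BUSCO version is: (\S+)", header).group(1)
    lineage_info = re.search(r"lineage dataset is: (\S+)", header).group(1)
    print(busco_version, lineage_info)  # 5.4.7 vertebrata_odb10
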
@@ -787,9 +781,9 @@
     tool_count = 0
 
     # Add title and images for each step
-    for idx, (asm_stages, stage_properties) in enumerate(asm_data.items(), 1):
+    for asm_stages, stage_properties in asm_data.items():
         if asm_stages == 'Curated':
-            tool_elements = [element for element in stage_properties.keys() if element != 'pipeline']
+            tool_elements = list(stage_properties.keys())
 
             images_with_names = []
 
@@ -825,7 +819,7 @@
 
             # Add images and names to the elements in pairs
             for i in range(0, len(images_with_names), 4):  # Process two images (and their names) at a time
-                elements_to_add = images_with_names[i:i + 4]
+                elements_to_add = images_with_names[i: i + 4]
 
                 # Create table for the images and names
                 table = Table(elements_to_add)
@@ -856,7 +850,6 @@
 
     # Iterate over haplotypes in the Curated category to get K-mer spectra images
     curated_assemblies = yaml_data.get('ASSEMBLIES', {}).get('Curated', {})
-    haplotype_names = [key for key in curated_assemblies.keys() if key != 'pipeline']
 
     # Get paths for spectra files
     spectra_files = {
@@ -974,9 +967,9 @@
     tool_count = 0
 
     # Add title and images for each step
-    for idx, (asm_stages, stage_properties) in enumerate(asm_data.items(), 1):
+    for asm_stages, stage_properties in asm_data.items():
         if asm_stages == 'Curated':  # Check if the current stage is 'Curated'
-            tool_elements = [element for element in stage_properties.keys() if element != 'pipeline']
+            tool_elements = list(stage_properties.keys())
 
             for haplotype in tool_elements:
                 haplotype_properties = stage_properties[haplotype]