Mercurial > repos > ieguinoa > ena_upload

--- a/ena_upload.xml	Mon Nov 16 14:24:59 2020 +0000
+++ b/ena_upload.xml	Thu Nov 19 10:21:04 2020 +0000
@@ -1,6 +1,6 @@
 <tool id="ena_upload" name="ENA Upload tool" version="0.2" python_template_version="3.5">
   <macros>
-    <token name="@VERSION@">0.2.2</token>
+    <token name="@VERSION@">0.2.3</token>
     <import>samples_macros.xml</import>
 </macros>
   <requirements>
@@ -30,7 +30,7 @@
 #set $dry_run_option = "False"
 #set viral_submission = "False"
 #if $action_options.input_format_conditional.input_format == "build_tables":
-  python $__tool_directory__/extract_tables.py --out_dir \$cwd --studies $studies_json;
+  python $__tool_directory__/extract_tables.py --action $action_options.action --out_dir \$cwd --studies $studies_json;
   #set $studies_table_path = "$cwd/studies.tsv"
   #set $samples_table_path =   "$cwd/samples.tsv"
   #set $experiments_table_path = "$cwd/experiments.tsv"
@@ -42,7 +42,7 @@
     #if $action_options.input_format_conditional.viral_submission == "true":
         --vir
     #end if
-    --form $action_options.input_format_conditional.xlsx_file --out_dir . ;
+    --action $action_options.action --form $action_options.input_format_conditional.xlsx_file --out_dir . ;
     #set $studies_table_path = "$cwd/studies.tsv"
     #set $samples_table_path =   "$cwd/samples.tsv"
     #set $experiments_table_path = "$cwd/experiments.tsv"
@@ -89,7 +89,7 @@
 #if $dry_run_option == "False":
 ena-upload-cli
     --tool 'ena-upload-cli v@VERSION@ @ Galaxy'
-    --action 'add'
+    --action '$action_options.action'
     --center '$action_options.center'
     --webin_id '$webin_id'
     #if $use_secret == "False":
@@ -172,43 +172,10 @@
             </param>
             <when value="add">
                 <param name="submit_dev" type="boolean" label="Submit to test ENA server?" help="By selecting yes the reads will be submitted " />
-                <conditional name="input_format_conditional">
-                    <param name="input_format" type="select" label="Would you like to submit pregenerated table files or interactively define the input structures?">
-                        <option value="excel_tables" selected="True">User generated metadata tables based on Excel templates</option>
-                        <option value="build_tables" selected="False">Interactive generation of the study structure (recommended for small studies)</option>
-                        <option value="user_generated_tables" selected="False">User generated tabular files (studies - samples - experiments - runs) </option>
-                    </param>
-                    <when value="excel_tables">
-                        <param name="viral_submission" type="boolean" label="Does your submission data belong to a viral sample?" help="If you select yes then your data will be submitted using the ENA virus pathogen reporting standard checklist (see: https://ena-browser-docs.readthedocs.io/en/latest/help_and_guides/sars-cov-2-submissions.html)" />
-                        <param name="dry_run" type="boolean" label="Print the tables but do not submit the datasets" help="If yes is selected then NO submission will be performed."/>
-                        <param name="xlsx_file" type="data" format="xlsx" label="File based on templates here:"/>
-                        <param name="data" type="data" format="fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="Select all datasets to upload" help="Compressed reads files listed in the runs table"/>
-                    </when>
-                    <when value="user_generated_tables">
-                        <param name="viral_submission" type="boolean" label="Does your submission data belong to a viral sample?" help="If you select yes then your data will be submitted using the ENA virus pathogen reporting standard checklist (see: https://ena-browser-docs.readthedocs.io/en/latest/help_and_guides/sars-cov-2-submissions.html)" />
-                        <param name="data" type="data" format="fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="Select all datasets to upload" help="Compressed reads files listed in the runs table"/>
-                        <param name="studies_users_table" type="data" format="tabular" multiple="false" label="Studies table" help="Studies metadata file"/>
-                        <param name="samples_users_table" type="data" format="tabular" multiple="false" label="Samples table" help="Samples metadata file"/>
-                        <param name="experiments_users_table" type="data" format="tabular" multiple="false" label="Experiments table" help="Experiments metadata file"/>
-                        <param name="runs_users_table" type="data" format="tabular" multiple="false" label="Runs table" help="Runs metadata file"/>
-                    </when>
-                    <when value="build_tables">
-                        <param name="dry_run" type="boolean" label="Print the tables but do not submit the datasets" help="If yes is selected then NO submission will be performed."/>
-                        <conditional name="conditional_viral_metadata">
-                            <param name="viral_sample" type="boolean" label="Does your submission contains viral samples?" />
-                            <when value="true">
-                                <expand macro="viral_samples" />
-                            </when>
-                            <when value="false">
-                                <expand macro="nonviral_samples" />
-                            </when>
-                        </conditional>
-                    </when>
-                </conditional>
-                <param name="center" type="text" optional="False" label="Affiliation center"/>
+                <expand macro="table_inputs_macro" />
             </when>
             <when value="modify">
-                <param name="modify_first" type="text" label="Mofification of submitted data is not yet possible"/>
+                <expand macro="table_inputs_macro" />
             </when>
         </conditional>
     </inputs>
@@ -228,6 +195,9 @@
         </data>
     </outputs>
     <help><![CDATA[
-        TODO: Fill in help.
+        This is a wrapper for the ENA upload tool available at https://github.com/usegalaxy-eu/ena-upload-cli
+        The input metadata can be submitted in the tabular format of the templates at https://github.com/usegalaxy-eu/ena-upload-cli/tree/master/example_tables
+        It is also possible to submit an Excel file that follows the template at https://drive.google.com/file/d/1ncC22--tW2v-EI-te_r86sAZujIPAjlX/view?usp=sharing
+        Viral submissions require a larger set of metadata; the corresponding template is at https://drive.google.com/file/d/1U4VdcczsIecIXxseV8svE1zO_CBUadog/view?usp=sharing
     ]]></help>
 </tool>
--- a/extract_tables.py	Mon Nov 16 14:24:59 2020 +0000
+++ b/extract_tables.py	Thu Nov 19 10:21:04 2020 +0000
@@ -1,62 +1,91 @@
 import argparse
 import json
-import os
 import pathlib
-
 from datetime import datetime

+FILE_FORMAT = 'fastq'
+
 parser = argparse.ArgumentParser()
-parser.add_argument('--studies',dest='studies_json_path', required=True)
-parser.add_argument('--out_dir',dest='out_path', required=True)
+parser.add_argument('--studies', dest='studies_json_path', required=True)
+parser.add_argument('--out_dir', dest='out_path', required=True)
+parser.add_argument('--action', dest='action', required=True)
 args = parser.parse_args()

-
-with open(args.studies_json_path,'r') as studies_json_file:
+with open(args.studies_json_path, 'r') as studies_json_file:
     studies_dict = json.load(studies_json_file)
-
 studies_table = open(pathlib.Path(args.out_path) / 'studies.tsv', 'w')
-studies_table.write('\t'.join(['alias','status','accession','title','study_type','study_abstract','pubmed_id','submission_date']) + '\n')
-
+studies_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'study_type',
+                               'study_abstract', 'pubmed_id', 'submission_date']) + '\n')
 samples_table = open(pathlib.Path(args.out_path) / 'samples.tsv', 'w')
+experiments_table = open(pathlib.Path(args.out_path) / 'experiments.tsv', 'w')
+experiments_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'study_alias',
+                                   'sample_alias', 'design_description', 'library_name',
+                                   'library_strategy', 'library_source', 'library_selection',
+                                   'library_layout', 'insert_size',
+                                   'library_construction_protocol', 'platform', 'instrument_model',
+                                   'submission_date']) + '\n')
+runs_table = open(pathlib.Path(args.out_path) / 'runs.tsv', 'w')
+runs_table.write('\t'.join(['alias', 'status', 'accession', 'experiment_alias', 'file_name',
+                            'file_format', 'file_checksum', 'submission_date']) + '\n')

-experiments_table = open(pathlib.Path(args.out_path) / 'experiments.tsv', 'w')
-experiments_table.write('\t'.join(['alias','status','accession','title','study_alias','sample_alias','design_description','library_name','library_strategy','library_source','library_selection','library_layout','insert_size','library_construction_protocol','platform','instrument_model','submission_date'])+ '\n')
-
-runs_table = open(pathlib.Path(args.out_path) / 'runs.tsv', 'w')
-runs_table.write('\t'.join(['alias','status','accession','experiment_alias','file_name','file_format','file_checksum','submission_date'])+ '\n')
-
-action = 'add'
-viral_submission = False
+action = args.action

 dt_oobj = datetime.now(tz=None)
 timestamp = dt_oobj.strftime("%Y%m%d_%H:%M:%S")
 for study_index, study in enumerate(studies_dict):
-    study_alias = 'study_'+str(study_index)+'_'+timestamp
-    studies_table.write('\t'.join([study_alias,action,'ENA_accession',study['title'], study['type'],study['abstract'],study['pubmed_id'],'ENA_submission_data']))
-    if "geo_location" in study['samples'][0].keys(): # sample belongs to a viral sample
-        samples_table.write('\t'.join(['alias','status','accession','title','scientific_name','taxon_id','sample_description','collection_date','geographic_location','host_common_name','host_subject_id','host_health_state','host_sex','host_scientific_name','collector_name','collecting_institution','isolate','submission_date']) + '\n')
+    study_alias = 'study_' + str(study_index) + '_' + timestamp
+    studies_table.write('\t'.join([study_alias, action, 'ENA_accession', study['title'],
+                                   study['type'], study['abstract'], study['pubmed_id'],
+                                   'ENA_submission_data']) + '\n')  # one study per line
+    if "geo_location" in study['samples'][0].keys():           # sample belongs to a viral sample
+        samples_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'scientific_name',
+                                       'taxon_id', 'sample_description', 'collection_date',
+                                       'geographic_location', 'host_common_name', 'host_subject_id',
+                                       'host_health_state', 'host_sex', 'host_scientific_name',
+                                       'collector_name', 'collecting_institution', 'isolate',
+                                       'submission_date']) + '\n')
     else:
-        samples_table.write('\t'.join(['alias','status','accession','title','scientific_name','taxon_id','sample_description','submission_date']) + '\n')
-    for sample_index,sample in enumerate(study['samples']):
-        sample_alias = 'sample_'+str(sample_index)+'_'+timestamp
-        if "geo_location" in sample.keys(): # sample belongs to a viral sample
+        samples_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'scientific_name',
+                                       'taxon_id', 'sample_description', 'submission_date']) + '\n')
+    for sample_index, sample in enumerate(study['samples']):
+        sample_alias = 'sample_' + str(sample_index) + '_' + timestamp
+        if "geo_location" in sample.keys():  # sample belongs to a viral sample
             if sample['collector_name'] == '':
                 sample['collector_name'] = 'unknown'
-            samples_table.write('\t'.join([sample_alias,action,'ena_accession',sample['title'],sample['tax_name'], sample['tax_id'],sample['description'],sample['collection_date'],sample['geo_location'],sample['host_common_name'],sample['host_subject_id'],sample['host_health_state'],sample['host_sex'],sample['host_scientific_name'],sample['collector_name'],sample['collecting_institution'],sample['isolate'],'ENA_submission_date'])+ '\n')
+            samples_table.write('\t'.join([sample_alias, action, 'ena_accession', sample['title'],
+                                           sample['tax_name'], sample['tax_id'],
+                                           sample['description'], sample['collection_date'],
+                                           sample['geo_location'], sample['host_common_name'],
+                                           sample['host_subject_id'], sample['host_health_state'],
+                                           sample['host_sex'], sample['host_scientific_name'],
+                                           sample['collector_name'],
+                                           sample['collecting_institution'], sample['isolate'],
+                                           'ENA_submission_date']) + '\n')
         else:
-            samples_table.write('\t'.join([sample_alias,action,'ena_accession',sample['title'],sample['tax_name'], sample['tax_id'],sample['description'],'ENA_submission_date'])+ '\n')
-        for exp_index,exp in enumerate(sample['experiments']):
-            exp_alias = 'experiment_'+str(exp_index)+'.'+str(sample_index)+'_'+timestamp
-            lib_alias = 'library_'+str(exp_index)+'_'+str(sample_index)
-            experiments_table.write('\t'.join([exp_alias,action,'accession_ena',exp['title'],study_alias,sample_alias,exp['experiment_design'],lib_alias,exp['library_strategy'],exp['library_source'],exp['library_selection'],exp['library_layout'].lower(),exp['insert_size'],exp['library_construction_protocol'],exp['platform'],exp['instrument_model'],'submission_date_ENA']) + '\n')
+            samples_table.write('\t'.join([sample_alias, action, 'ena_accession', sample['title'],
+                                           sample['tax_name'], sample['tax_id'],
+                                           sample['description'], 'ENA_submission_date']) + '\n')
+        for exp_index, exp in enumerate(sample['experiments']):
+            exp_alias = 'experiment_' + str(exp_index) + '.' + str(sample_index) + '_' + timestamp
+            lib_alias = 'library_' + str(exp_index) + '_' + str(sample_index)
+            experiments_table.write('\t'.join([exp_alias, action, 'accession_ena', exp['title'],
+                                               study_alias, sample_alias, exp['experiment_design'],
+                                               lib_alias, exp['library_strategy'],
+                                               exp['library_source'], exp['library_selection'],
+                                               exp['library_layout'].lower(), exp['insert_size'],
+                                               exp['library_construction_protocol'],
+                                               exp['platform'], exp['instrument_model'],
+                                               'submission_date_ENA']) + '\n')
             run_index = 0
             # exp['runs'] is a list of lists
             for run in exp['runs']:
                 run_index += 1
-                run_alias = '.'.join(['run_'+str(run_index),str(exp_index),str(sample_index)]) + '_' +timestamp
+                run_alias = '.'.join(['run_' + str(run_index), str(exp_index), str(sample_index)]) \
+                            + '_' + timestamp
                 for file_entry in run:
-                    file_format = 'fastq'
-                    runs_table.write('\t'.join([run_alias,action,'ena_run_accession',exp_alias,file_entry,file_format,'file_checksum','submission_date_ENA']) + '\n')
+                    runs_table.write('\t'.join([run_alias, action, 'ena_run_accession', exp_alias,
+                                                file_entry, FILE_FORMAT, 'file_checksum',
+                                                'submission_date_ENA']) + '\n')

 studies_table.close()
 samples_table.close()
--- a/process_xlsx.py	Mon Nov 16 14:24:59 2020 +0000
+++ b/process_xlsx.py	Thu Nov 19 10:21:04 2020 +0000
@@ -1,16 +1,19 @@
 import argparse
-import json
-import os
 import pathlib
 import sys
+
 import xlrd

-from datetime import datetime
+
+FILE_FORMAT = 'fastq'


 def extract_data(xl_sheet, expected_columns):
-    # Check that the columns I expect are present in the sheet (any order and mixed with others)
-    # Just a verification that the user filled the correct template
+    """
+    1. Verify that every expected column is present in the sheet
+    (in any order, possibly mixed with other columns); this only checks
+    that the user filled in the correct template.
+    2. Return a dict of row data keyed by the value in the first expected column."""
     sheet_columns = {}
     for sh_col in range(xl_sheet.ncols):
         if xl_sheet.cell(0, sh_col).value in expected_columns:
@@ -19,139 +22,151 @@
             else:
                 sheet_columns[xl_sheet.cell(0, sh_col).value] = sh_col
     for col in range(len(expected_columns)):
-        assert expected_columns[col] in sheet_columns.keys(), "Expected column %s not found" %expected_columns[col]
+        assert expected_columns[col] in sheet_columns.keys(), \
+            "Expected column %s not found" % expected_columns[col]

     # fetch rows in a dict
     data_dict = {}
     # the first of the expected columns will be the index
     index_col = sheet_columns[expected_columns[0]]
-    ## skip first 2 rows: column names + comments rows
-    for row_id in range(2,xl_sheet.nrows):
+    # skip first 2 rows: column names + comments rows
+    for row_id in range(2, xl_sheet.nrows):
         row_dict = {}
-        for col in range(1,len(expected_columns)):
-            # row_dict[expected_columns[col]] = xl_sheet.cell(row_id,col).value
+        for col in range(1, len(expected_columns)):
             sheet_col_index = sheet_columns[expected_columns[col]]
-            row_dict[expected_columns[col]] = xl_sheet.cell(row_id,sheet_col_index).value
-        # should I check for duplicate alias/ids?
+            row_dict[expected_columns[col]] = xl_sheet.cell(row_id, sheet_col_index).value
+        # should check for duplicate alias/ids?
         data_dict[xl_sheet.cell(row_id, index_col).value] = row_dict
     return data_dict

+
 parser = argparse.ArgumentParser()
-parser.add_argument('--form',dest='xlsx_path', required=True)
-parser.add_argument('--out_dir',dest='out_path', required=True)
-parser.add_argument('--vir',dest='viral_submission',required=False,action='store_true')
+parser.add_argument('--form', dest='xlsx_path', required=True)
+parser.add_argument('--out_dir', dest='out_path', required=True)
+parser.add_argument('--action', dest='action', required=True)
+parser.add_argument('--vir', dest='viral_submission', required=False, action='store_true')
 args = parser.parse_args()

 xl_workbook = xlrd.open_workbook(args.xlsx_path)

-
-## PARSE STUDIES
+# PARSE STUDIES
 #################
 xl_sheet = xl_workbook.sheet_by_name('ENA_study')
-if(xl_sheet.nrows < 3):
+if xl_sheet.nrows < 3:
     raise ValueError('No entries found in studies sheet')
-
 studies_dict = {}
-# Assert column names
-studies_col = ['alias','title','study_type','study_abstract']
+studies_col = ['alias', 'title', 'study_type', 'study_abstract']
 studies_dict = extract_data(xl_sheet, studies_col)

-
-## PARSE SAMPLES
+# PARSE SAMPLES
+#################
 xl_sheet = xl_workbook.sheet_by_name('ENA_sample')
-if(xl_sheet.nrows < 3):
+if xl_sheet.nrows < 3:
     raise ValueError('No entries found in samples')
 if args.viral_submission:
-    samples_cols = ['alias','title','scientific_name','sample_description','geographic location (country and/or sea)', 'host common name', 'host health state', 'host sex', 'host scientific name', 'collector name', 'collection date','collecting institution', 'isolate']
+    samples_cols = ['alias', 'title', 'scientific_name', 'sample_description',
+                    'geographic location (country and/or sea)', 'host common name',
+                    'host health state', 'host sex', 'host scientific name', 'collector name',
+                    'collection date', 'collecting institution', 'isolate']
 else:
-    samples_cols = ['alias','title','scientific_name','sample_description']
+    samples_cols = ['alias', 'title', 'scientific_name', 'sample_description']
 samples_dict = extract_data(xl_sheet, samples_cols)

-
-
-## PARSE EXPERIMENTS
+# PARSE EXPERIMENTS
 #################
 xl_sheet = xl_workbook.sheet_by_name('ENA_experiment')
-if(xl_sheet.nrows < 3):
+if xl_sheet.nrows < 3:
     raise ValueError('No experiments found in experiments sheet')
-
-exp_columns = ['alias','title','study_alias','sample_alias','design_description','library_name','library_strategy','library_source','library_selection','library_layout','insert_size','library_construction_protocol','platform','instrument_model']
-
+exp_columns = ['alias', 'title', 'study_alias', 'sample_alias', 'design_description', 'library_name',
+               'library_strategy', 'library_source', 'library_selection', 'library_layout',
+               'insert_size', 'library_construction_protocol', 'platform', 'instrument_model']
 experiments_dict = extract_data(xl_sheet, exp_columns)

-
-## PARSE RUNS SHEET
+# PARSE RUNS SHEET
 #################
 xl_sheet = xl_workbook.sheet_by_name('ENA_run')
-if(xl_sheet.nrows < 3):
+if xl_sheet.nrows < 3:
     raise ValueError('No entries found in runs sheet')
-
-#Assert column names
-row_idx = 0
-run_cols = ['alias','experiment_alias','file_name','file_format']
-
+run_cols = ['alias', 'experiment_alias', 'file_name', 'file_format']
 runs_dict = extract_data(xl_sheet, run_cols)
-
-
-## WRITE  DICTIONARIES TO TABLE FILES
+# WRITE HEADERS TO TABLES
 studies_table = open(pathlib.Path(args.out_path) / 'studies.tsv', 'w')
-studies_table.write('\t'.join(['alias','status','accession','title','study_type','study_abstract','pubmed_id','submission_date']) + '\n')
-
+studies_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'study_type',
+                               'study_abstract', 'pubmed_id', 'submission_date']) + '\n')
 samples_table = open(pathlib.Path(args.out_path) / 'samples.tsv', 'w')
 if args.viral_submission:
-    samples_table.write('\t'.join(['alias','status','accession','title','scientific_name','taxon_id','sample_description','collection date','geographic_location','host_common_name','host_subject_id','host_health_state','host_sex','host_scientific_name','collector_name','collecting_institution','isolate','submission_date']) + '\n')
+    samples_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'scientific_name',
+                                   'taxon_id', 'sample_description', 'collection_date',
+                                   'geographic_location', 'host_common_name', 'host_subject_id',
+                                   'host_health_state', 'host_sex', 'host_scientific_name',
+                                   'collector_name', 'collecting_institution', 'isolate',
+                                   'submission_date']) + '\n')  # same names as extract_tables.py
 else:
-    samples_table.write('\t'.join(['alias','status','accession','title','scientific_name','taxon_id','sample_description','submission_date'])+ '\n')
+    samples_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'scientific_name',
+                                   'taxon_id', 'sample_description', 'submission_date']) + '\n')

 experiments_table = open(pathlib.Path(args.out_path) / 'experiments.tsv', 'w')
-experiments_table.write('\t'.join(['alias','status','accession','title','study_alias','sample_alias','design_description','library_name','library_strategy','library_source','library_selection','library_layout','insert_size','library_construction_protocol','platform','instrument_model','submission_date'])+ '\n')
+experiments_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'study_alias',
+                                   'sample_alias', 'design_description', 'library_name',
+                                   'library_strategy', 'library_source', 'library_selection',
+                                   'library_layout', 'insert_size', 'library_construction_protocol',
+                                   'platform', 'instrument_model', 'submission_date']) + '\n')

 runs_table = open(pathlib.Path(args.out_path) / 'runs.tsv', 'w')
-runs_table.write('\t'.join(['alias','status','accession','experiment_alias','file_name','file_format','file_checksum','submission_date'])+ '\n')
+runs_table.write('\t'.join(['alias', 'status', 'accession', 'experiment_alias', 'file_name',
+                            'file_format', 'file_checksum', 'submission_date']) + '\n')
+action = args.action

-action = 'add'
+# WRITE  DICTIONARIES TO TABLE FILES

-dt_oobj = datetime.now(tz=None)
-timestamp = dt_oobj.strftime("%Y%m%d_%H:%M:%S")
+# ADD A TIMESTAMP TO THE ALIAS? SEEMS LIKE ENA REQUIRES ALL ENTRIES FOR A WEBIN TO HAVE UNIQUE IDS?
+# dt_oobj = datetime.now(tz=None)
+# timestamp = dt_oobj.strftime("%Y%m%d_%H:%M:%S")
 for study_alias, study in studies_dict.items():
-    # study_alias = 'study_'+str(study_index)+'_'+timestamp
-    # study_alias = study_index #'study_'+str(study_index)+'_'+timestamp
-    # studies_col = ['alias','title','study_type','study_abstract']
-    studies_table.write('\t'.join([study_alias,action,'ENA_accession',study['title'], study['study_type'],study['study_abstract'],'','ENA_submission_data'])+ '\n')  ## assuming no pubmed_id
+    # study_alias = study_alias + '_' + timestamp
+    studies_table.write('\t'.join([study_alias, action, 'ENA_accession', study['title'],
+                                   study['study_type'], study['study_abstract'], '',
+                                   'ENA_submission_data']) + '\n')  # assuming no pubmed_id
 for sample_alias, sample in samples_dict.items():
-    # if "geo_location" in study['samples'][0].keys(): # sample belongs to a viral sample
-    # sample_alias = 'sample_'+str(sample_index)+'_'+timestamp
-    if sample['collector name'] == '':
-        sample['collector name'] = 'unknown'
+    # sample_alias = sample_alias + '_' + timestamp
     if args.viral_submission:
-        samples_table.write('\t'.join([sample_alias,action,'ena_accession',sample['title'],sample['scientific_name'], 'tax_id_updated_by_ENA',sample['sample_description'],sample['collection date'],sample['geographic location (country and/or sea)'],sample['host common name'],'host subject id',sample['host health state'],sample['host sex'],sample['host scientific name'],sample['collector name'],sample['collecting institution'],sample['isolate'],'ENA_submission_date'])+ '\n')
+        if sample['collector name'] == '':
+            sample['collector name'] = 'unknown'
+        samples_table.write('\t'.join([sample_alias, action, 'ena_accession', sample['title'],
+                                       sample['scientific_name'], 'tax_id_updated_by_ENA',
+                                       sample['sample_description'], sample['collection date'],
+                                       sample['geographic location (country and/or sea)'],
+                                       sample['host common name'], 'host subject id',
+                                       sample['host health state'], sample['host sex'],
+                                       sample['host scientific name'], sample['collector name'],
+                                       sample['collecting institution'], sample['isolate'],
+                                       'ENA_submission_date']) + '\n')
     else:
-        samples_table.write('\t'.join([sample_alias,action,'ena_accession',sample['title'],sample['scientific_name'],'tax_id_updated_by_ENA',sample['sample_description']])+ '\n')
-    # process the experiments from this sample
+        samples_table.write('\t'.join([  # 8 fields, matching the non-viral header
+            sample_alias, action, 'ena_accession', sample['title'], sample['scientific_name'],
+            'tax_id_updated_by_ENA', sample['sample_description'], 'ENA_submission_date']) + '\n')
     for exp_alias, exp in experiments_dict.items():
-        # maybe i should check here if any experiment has a study or sample alias that is incorrect? (not listed in the samples or study dict)
+        # should I check here if any experiment has a study or sample alias that is incorrect?
+        # (not listed in the samples or study dict)
         # process the experiments for this sample
         if exp['sample_alias'] == sample_alias:
-            # exp_alias = ' +'_'+timestamp
-            # is this ok as a lib alias?
-            lib_alias = 'library_'+exp_alias +'_'+ exp['sample_alias']    #+str(exp_index)+'_'+str(sample_index)
-            experiments_table.write('\t'.join([exp_alias,action,'accession_ena',exp['title'],study_alias,sample_alias,exp['design_description'],lib_alias,exp['library_strategy'],exp['library_source'],exp['library_selection'],exp['library_layout'].lower(),str(exp['insert_size']),exp['library_construction_protocol'],exp['platform'],exp['instrument_model'],'submission_date_ENA']) + '\n')
+            lib_alias = 'library_' + exp_alias + '_' + exp['sample_alias']
+            experiments_table.write('\t'.join([exp_alias, action, 'accession_ena', exp['title'],
+                                               exp['study_alias'], sample_alias,
+                                               exp['design_description'], lib_alias,
+                                               exp['library_strategy'], exp['library_source'],
+                                               exp['library_selection'],
+                                               exp['library_layout'].lower(),
+                                               str(exp['insert_size']),
+                                               exp['library_construction_protocol'],
+                                               exp['platform'], exp['instrument_model'],
+                                               'submission_date_ENA']) + '\n')
             for run_alias, run in runs_dict.items():
                 if run['experiment_alias'] == exp_alias:
-                    file_format = 'fastq'
-                    runs_table.write('\t'.join([run_alias,action,'ena_run_accession',exp_alias,run['file_name'],file_format,'file_checksum','submission_date_ENA']) + '\n')
-                    # run_index = 0
-                    # exp['runs'] is a list of lists
-                    # for run in exp['runs']:
-                        # run_index += 1
-                        # run_alias = '.'.join(['run_'+str(run_index),str(exp_index),str(sample_index)]) + '_' +timestamp
-                        # for file_entry in run:
-                            # file_format = 'fastq'
-                            # runs_table.write('\t'.join([run_alias,action,'ena_run_accession',exp_alias,file_entry,file_format,'file_checksum','submission_date_ENA']) + '\n')
-
-
+                    runs_table.write('\t'.join([run_alias, action, 'ena_run_accession', exp_alias,
+                                                run['file_name'], FILE_FORMAT, 'file_checksum',
+                                                'submission_date_ENA']) + '\n')
 studies_table.close()
 samples_table.close()
 experiments_table.close()
 runs_table.close()
-
--- a/samples_macros.xml	Mon Nov 16 14:24:59 2020 +0000
+++ b/samples_macros.xml	Thu Nov 19 10:21:04 2020 +0000
@@ -1,4 +1,41 @@
 <macros>
+    <!-- Metadata input options shared by the "add" and "modify" actions of ena_upload.xml -->
+    <xml name="table_inputs_macro">
+    <conditional name="input_format_conditional">
+        <param name="input_format" type="select" label="Would you like to submit pregenerated table files or interactively define the input structures?">
+            <option value="excel_tables" selected="True">User generated metadata tables based on Excel templates</option>
+            <option value="build_tables" selected="False">Interactive generation of the study structure (recommended for small studies)</option>
+            <option value="user_generated_tables" selected="False">User generated tabular files (studies - samples - experiments - runs) </option>
+        </param>
+        <when value="excel_tables">
+            <param name="viral_submission" type="boolean" label="Does your submission data belong to a viral sample?" help="If you select yes then your data will be submitted using the ENA virus pathogen reporting standard checklist (see: https://ena-browser-docs.readthedocs.io/en/latest/help_and_guides/sars-cov-2-submissions.html)" />
+            <param name="dry_run" type="boolean" label="Print the tables but do not submit the datasets" help="If yes is selected then NO submission will be performed."/>
+            <param name="xlsx_file" type="data" format="xlsx" label="Excel file with the metadata tables" help="Must follow the Excel templates linked in the tool help"/>
+            <param name="data" type="data" format="fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="Select all datasets to upload" help="Compressed reads files listed in the runs table"/>
+        </when>
+        <when value="user_generated_tables">
+            <param name="viral_submission" type="boolean" label="Does your submission data belong to a viral sample?" help="If you select yes then your data will be submitted using the ENA virus pathogen reporting standard checklist (see: https://ena-browser-docs.readthedocs.io/en/latest/help_and_guides/sars-cov-2-submissions.html)" />
+            <param name="data" type="data" format="fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="Select all datasets to upload" help="Compressed reads files listed in the runs table"/>
+            <param name="studies_users_table" type="data" format="tabular" multiple="false" label="Studies table" help="Studies metadata file"/>
+            <param name="samples_users_table" type="data" format="tabular" multiple="false" label="Samples table" help="Samples metadata file"/>
+            <param name="experiments_users_table" type="data" format="tabular" multiple="false" label="Experiments table" help="Experiments metadata file"/>
+            <param name="runs_users_table" type="data" format="tabular" multiple="false" label="Runs table" help="Runs metadata file"/>
+        </when>
+        <when value="build_tables">
+            <param name="dry_run" type="boolean" label="Print the tables but do not submit the datasets" help="If yes is selected then NO submission will be performed."/>
+            <conditional name="conditional_viral_metadata">
+                <param name="viral_sample" type="boolean" label="Does your submission contain viral samples?" />
+                <when value="true">
+                    <expand macro="viral_samples" />
+                </when>
+                <when value="false">
+                    <expand macro="nonviral_samples" />
+                </when>
+            </conditional>
+        </when>
+    </conditional>
+    <param name="center" type="text" optional="False" label="Affiliation center"/>
+    </xml>
     <xml name="viral_samples">
     <repeat name="rep_study" title="Study" min="1">
         <param name="study_title" type="text" optional="False" label="Please provide a short descriptive title for the study"/>