changeset 21:d7a88a8a8f5f draft

Uploaded
author ieguinoa
date Fri, 06 Nov 2020 20:14:09 +0000
parents 4bda52670a54
children 391da0820827
files README.md ena_upload.xml extract_tables.py process_xlsx.py
diffstat 4 files changed, 194 insertions(+), 15 deletions(-) [+]
line wrap: on
line diff
--- a/README.md	Tue Oct 06 12:26:38 2020 +0000
+++ b/README.md	Fri Nov 06 20:14:09 2020 +0000
@@ -1,2 +1,4 @@
 # ena-upload-wrapper
 Galaxy wrapper for ena-cli-upload
+
+This tool is shipped in a ready-to-use Galaxy container found [here](https://github.com/ELIXIR-Belgium/ena-upload-container).
--- a/ena_upload.xml	Tue Oct 06 12:26:38 2020 +0000
+++ b/ena_upload.xml	Fri Nov 06 20:14:09 2020 +0000
@@ -5,6 +5,7 @@
 </macros>
   <requirements>
     <requirement type="package" version="@VERSION@">ena-upload-cli</requirement>
+    <requirement type="package" version="1.2.0">xlrd</requirement>
   </requirements>
   <command detect_errors="exit_code"><![CDATA[
 cwd=\$(pwd);
@@ -12,6 +13,7 @@
 #set webin_secret = os.environ.get('WEBIN_SECRET', None)
 #set working_dir = os.getcwd()
 #set $dry_run_option = "False"
+#set viral_submission = "False"
 #if $action_options.input_format_conditional.input_format == "build_tables":
   python $__tool_directory__/extract_tables.py --out_dir \$cwd --studies $studies_json;
   #set $studies_table_path = "$cwd/studies.tsv"
@@ -20,6 +22,21 @@
   #set $runs_table_path =  "$cwd/runs.tsv"
 #end if
 
+#if $action_options.input_format_conditional.input_format == "excel_tables":
+    python $__tool_directory__/process_xlsx.py 
+    #if $action_options.input_format_conditional.viral_submission == "true":
+        --vir 
+    #end if
+    --form $action_options.input_format_conditional.xlsx_file --out_dir . ;
+    #set $studies_table_path = "$cwd/studies.tsv"
+    #set $samples_table_path =   "$cwd/samples.tsv"
+    #set $experiments_table_path = "$cwd/experiments.tsv"
+    #set $runs_table_path =  "$cwd/runs.tsv"
+    #if $action_options.input_format_conditional.dry_run == "true":
+      #set $dry_run_option = "True"
+    #end if
+#end if
+
 #if $action_options.input_format_conditional.input_format != "user_generated_tables":
     cp $studies_table_path $studies_table_out;
     cp $samples_table_path $samples_table_out;
@@ -31,7 +48,6 @@
 #end if
 
 
-
 ## create the list of files to upload and make the symlinks 
 #set $files_to_upload = list()
 #if $action_options.input_format_conditional.input_format == "build_tables":
@@ -79,8 +95,14 @@
     --study $studies_table_path
     --run $runs_table_path
     --sample $samples_table_path
-    #if $action_options.input_format_conditional.conditional_viral_metadata.viral_sample == "true":
-      --vir
+    #if $action_options.input_format_conditional.input_format == "build_tables":
+        #if $action_options.input_format_conditional.conditional_viral_metadata.viral_sample == "true":
+          --vir
+        #end if
+    #else:
+        #if $action_options.input_format_conditional.viral_submission == "true":
+          --vir
+        #end if
     #end if
 #end if
 #if $action_options.submit_dev == "true":
@@ -130,12 +152,19 @@
                 <option value="modify">Modify metadata</option>
             </param>
             <when value="add">
-                <param name="submit_dev" type="boolean" label="Submit to test ENA server?"/>
+                <param name="submit_dev" type="boolean" label="Submit to test ENA server?" help="If yes is selected, the data will be submitted to the ENA test server instead of the production server." />
                 <conditional name="input_format_conditional">
                     <param name="input_format" type="select" label="Would you like to submit pregenerated table files or interactively define the input structures?">
-                        <option value="user_generated_tables" selected="True">User generated tables of studies/experiments/runs/samples</option>
-                        <option value="build_tables" selected="False">Interactive generation of studies structure from dataset</option>
+                        <option value="user_generated_tables" selected="False">User generated tables of studies/experiments/runs/samples</option>
+                        <option value="excel_tables" selected="False">User generated metadata tables based on Excel templates</option>
+                        <option value="build_tables" selected="True">Interactive generation of studies structure from dataset</option>
                     </param>
+                    <when value="excel_tables">
+                        <param name="viral_submission" type="boolean" label="Does your submission data belong to a viral sample?" help="If you select yes then your data will be submitted using the ENA virus pathogen reporting standard checklist (see: https://ena-browser-docs.readthedocs.io/en/latest/help_and_guides/sars-cov-2-submissions.html)" />
+                        <param name="dry_run" type="boolean" label="Print the tables but do not submit the datasets" help="If yes is selected then NO submission will be performed."/>
+                        <param name="xlsx_file" type="data" format="xlsx" label="Excel (xlsx) file filled in according to the provided metadata templates"/>
+                        <param name="data" type="data" format="fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="Select all datasets to upload" help="Compressed reads files listed in the runs table"/>
+                    </when>
                     <when value="user_generated_tables">
                         <param name="viral_submission" type="boolean" label="Does your submission data belong to a viral sample?" help="If you select yes then your data will be submitted using the ENA virus pathogen reporting standard checklist (see: https://ena-browser-docs.readthedocs.io/en/latest/help_and_guides/sars-cov-2-submissions.html)" />
                         <param name="data" type="data" format="fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="Select all datasets to upload" help="Compressed reads files listed in the runs table"/>
@@ -145,7 +174,7 @@
                         <param name="runs_users_table" type="data" format="tabular" multiple="false" label="Runs table" help="Runs metadata file"/>
                     </when>
                     <when value="build_tables">
-                        <param name="dry_run" type="boolean" label="Print the tables but do not submit the datasets"/>
+                        <param name="dry_run" type="boolean" label="Print the tables but do not submit the datasets" help="If yes is selected then NO submission will be performed."/>
                         <conditional name="conditional_viral_metadata">
                             <param name="viral_sample" type="boolean" label="Does your submission contains viral samples?" />
                             <when value="true">
@@ -167,16 +196,16 @@
     <outputs>
         <data name="output" format="data" label="${tool.name} on ${on_string}: Upload summary"/>
         <data name="studies_table_out" format="tabular" label="Studies table">
-            <filter> action_options['input_format_conditional']['input_format'] == "build_tables" </filter>
+            <filter> action_options['input_format_conditional']['input_format'] == "build_tables" or action_options['input_format_conditional']['input_format'] == "excel_tables"</filter>
         </data>
         <data name="samples_table_out" format="tabular" label="Samples table">
-            <filter> action_options['input_format_conditional']['input_format'] == "build_tables" </filter>
+            <filter> action_options['input_format_conditional']['input_format'] == "build_tables" or action_options['input_format_conditional']['input_format'] == "excel_tables"</filter>
         </data>
         <data name="experiments_table_out" format="tabular" label="Experiments table">
-            <filter> action_options['input_format_conditional']['input_format'] == "build_tables" </filter>
+            <filter> action_options['input_format_conditional']['input_format'] == "build_tables" or action_options['input_format_conditional']['input_format'] == "excel_tables"</filter>
         </data>
         <data name="runs_table_out" format="tabular" label="Runs table">
-            <filter> action_options['input_format_conditional']['input_format'] == "build_tables" </filter>
+            <filter> action_options['input_format_conditional']['input_format'] == "build_tables" or action_options['input_format_conditional']['input_format'] == "excel_tables"</filter>
         </data>
     </outputs>
     <help><![CDATA[
--- a/extract_tables.py	Tue Oct 06 12:26:38 2020 +0000
+++ b/extract_tables.py	Fri Nov 06 20:14:09 2020 +0000
@@ -3,6 +3,8 @@
 import os
 import pathlib
 
+from datetime import datetime
+
 parser = argparse.ArgumentParser()
 parser.add_argument('--studies',dest='studies_json_path', required=True)
 parser.add_argument('--out_dir',dest='out_path', required=True)
@@ -25,15 +27,18 @@
 
 action = 'add'
 viral_submission = False
+
+dt_oobj = datetime.now(tz=None)
+timestamp = dt_oobj.strftime("%Y%m%d_%H:%M:%S")
 for study_index, study in enumerate(studies_dict):
-    study_alias = 'study_'+str(study_index)
+    study_alias = 'study_'+str(study_index)+'_'+timestamp
     studies_table.write('\t'.join([study_alias,action,'ENA_accession',study['title'], study['type'],study['abstract'],study['pubmed_id'],'ENA_submission_data']))
     if "geo_location" in study['samples'][0].keys(): # sample belongs to a viral sample
         samples_table.write('\t'.join(['alias','status','accession','title','scientific_name','taxon_id','sample_description','collection_date','geographic_location','host_common_name','host_subject_id','host_health_state','host_sex','host_scientific_name','collector_name','collecting_institution','isolate','submission_date']) + '\n')
     else:
         samples_table.write('\t'.join(['alias','status','accession','title','scientific_name','taxon_id','sample_description','submission_date']) + '\n')
     for sample_index,sample in enumerate(study['samples']):
-        sample_alias = 'sample_'+str(sample_index)
+        sample_alias = 'sample_'+str(sample_index)+'_'+timestamp
         if "geo_location" in sample.keys(): # sample belongs to a viral sample
             if sample['collector_name'] == '':
                 sample['collector_name'] = 'unknown'
@@ -41,14 +46,14 @@
         else:
             samples_table.write('\t'.join([sample_alias,action,'ena_accession',sample['title'],sample['tax_name'], sample['tax_id'],sample['description'],'ENA_submission_date'])+ '\n')
         for exp_index,exp in enumerate(sample['experiments']):
-            exp_alias = 'experiment_'+str(exp_index)+'_'+str(sample_index)
+            exp_alias = 'experiment_'+str(exp_index)+'.'+str(sample_index)+'_'+timestamp
             lib_alias = 'library_'+str(exp_index)+'_'+str(sample_index)
             experiments_table.write('\t'.join([exp_alias,action,'accession_ena',exp['title'],study_alias,sample_alias,exp['experiment_design'],lib_alias,exp['library_strategy'],exp['library_source'],exp['library_selection'],exp['library_layout'].lower(),exp['insert_size'],exp['library_construction_protocol'],exp['platform'],exp['instrument_model'],'submission_date_ENA']) + '\n')
             run_index = 0
             # exp['runs'] is a list of lists
             for run in exp['runs']:
                 run_index += 1
-                run_alias = '_'.join(['run',str(exp_index),str(sample_index),str(run_index)])
+                run_alias = '.'.join(['run_'+str(run_index),str(exp_index),str(sample_index)]) + '_' +timestamp
                 for file_entry in run:
                     file_format = 'fastq'
                     runs_table.write('\t'.join([run_alias,action,'ena_run_accession',exp_alias,file_entry,file_format,'file_checksum','submission_date_ENA']) + '\n')
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/process_xlsx.py	Fri Nov 06 20:14:09 2020 +0000
@@ -0,0 +1,143 @@
+import argparse
+import json
+import os
+import pathlib
+import xlrd
+from datetime import datetime
+
+
+def extract_data(xl_sheet, expected_columns):
+    # Check that the columns in the sheet correspond to what I expect
+    # Just a verification that the user filled the correct template
+    row_id = 0
+    for col in range(len(expected_columns)):
+        assert expected_columns[col] == xl_sheet.cell(row_id, col).value
+    data_dict = {}
+    for row_id in range(2,xl_sheet.nrows):
+        row_dict = {}
+        for col in range(1,len(expected_columns)):
+            row_dict[expected_columns[col]] = xl_sheet.cell(row_id,col).value
+        # should I check for duplicate alias/ids?
+        data_dict[xl_sheet.cell(row_id, 0).value] = row_dict
+    return data_dict
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--form',dest='xlsx_path', required=True)
+parser.add_argument('--out_dir',dest='out_path', required=True)
+parser.add_argument('--vir',dest='viral_submission',required=False,action='store_true')
+args = parser.parse_args()
+
+xl_workbook = xlrd.open_workbook(args.xlsx_path)
+
+
+## PARSE STUDIES
+#################
+xl_sheet = xl_workbook.sheet_by_name('ENA_study')
+if(xl_sheet.nrows < 3):
+    raise ValueError('No entries found in studies sheet')
+
+studies_dict = {}
+# Assert column names
+studies_col = ['alias','title','study_type','study_abstract']
+studies_dict = extract_data(xl_sheet, studies_col)
+
+
+## PARSE SAMPLES
+xl_sheet = xl_workbook.sheet_by_name('ENA_sample')
+if(xl_sheet.nrows < 3):
+    raise ValueError('No entries found in samples')
+if args.viral_submission:
+    samples_cols = ['alias','title','scientific_name','sample_description','collection date','geographic location (country and/or sea)', 'geographic location (region and locality)', 'sample capture status', 'host disease outcome', 'host common name', 'host age', 'host health state', 'host sex', 'host scientific name', 'virus identifier', 'collector name', 'collecting institution', 'isolate','isolation source host-associated']
+else:
+    samples_cols = ['alias','title','scientific_name','sample_description']
+samples_dict = extract_data(xl_sheet, samples_cols)
+
+
+
+## PARSE EXPERIMENTS
+#################
+xl_sheet = xl_workbook.sheet_by_name('ENA_experiment')
+if(xl_sheet.nrows < 3):
+    raise ValueError('No experiments found in experiments sheet')
+
+exp_columns = ['alias','title','study_alias','sample_alias','design_description','library_name','library_strategy','library_source','library_selection','library_layout','insert_size','library_construction_protocol','platform','instrument_model']
+
+experiments_dict = extract_data(xl_sheet, exp_columns)
+
+
+## PARSE RUNS SHEET
+#################
+xl_sheet = xl_workbook.sheet_by_name('ENA_run')
+if(xl_sheet.nrows < 3):
+    raise ValueError('No entries found in runs sheet')
+
+#Assert column names
+row_idx = 0
+run_cols = ['alias','experiment_alias','file_name','file_format']
+
+runs_dict = extract_data(xl_sheet, run_cols)
+
+
+## WRITE  DICTIONARIES TO TABLE FILES
+studies_table = open(pathlib.Path(args.out_path) / 'studies.tsv', 'w')
+studies_table.write('\t'.join(['alias','status','accession','title','study_type','study_abstract','pubmed_id','submission_date']) + '\n')
+
+samples_table = open(pathlib.Path(args.out_path) / 'samples.tsv', 'w')
+if args.viral_submission:
+    samples_table.write('\t'.join(['alias','status','accession','title','scientific_name','taxon_id','sample_description','collection_date','geographic_location','host_common_name','host_subject_id','host_health_state','host_sex','host_scientific_name','collector_name','collecting_institution','isolate','submission_date']) + '\n')
+else:
+    samples_table.write('\t'.join(['alias','status','accession','title','scientific_name','taxon_id','sample_description','submission_date'])+ '\n')
+
+experiments_table = open(pathlib.Path(args.out_path) / 'experiments.tsv', 'w')
+experiments_table.write('\t'.join(['alias','status','accession','title','study_alias','sample_alias','design_description','library_name','library_strategy','library_source','library_selection','library_layout','insert_size','library_construction_protocol','platform','instrument_model','submission_date'])+ '\n')
+
+runs_table = open(pathlib.Path(args.out_path) / 'runs.tsv', 'w')
+runs_table.write('\t'.join(['alias','status','accession','experiment_alias','file_name','file_format','file_checksum','submission_date'])+ '\n')
+
+action = 'add'
+
+dt_oobj = datetime.now(tz=None)
+timestamp = dt_oobj.strftime("%Y%m%d_%H:%M:%S")
+for study_alias, study in studies_dict.items():
+    # study_alias = 'study_'+str(study_index)+'_'+timestamp
+    # study_alias = study_index #'study_'+str(study_index)+'_'+timestamp
+    # studies_col = ['alias','title','study_type','study_abstract']
+    studies_table.write('\t'.join([study_alias,action,'ENA_accession',study['title'], study['study_type'],study['study_abstract'],'','ENA_submission_data'])+ '\n')  ## assuming no pubmed_id
+for sample_alias, sample in samples_dict.items():
+    # if "geo_location" in study['samples'][0].keys(): # sample belongs to a viral sample
+    # sample_alias = 'sample_'+str(sample_index)+'_'+timestamp
+    if sample['collector name'] == '':
+        sample['collector name'] = 'unknown'
+    if args.viral_submission:
+        samples_table.write('\t'.join([sample_alias,action,'ena_accession',sample['title'],sample['scientific_name'], 'tax_id_updated_by_ENA',sample['sample_description'],sample['collection date'],sample['geographic location (country and/or sea)'],sample['host common name'],'host subject id',sample['host health state'],sample['host sex'],sample['host scientific name'],sample['collector name'],sample['collecting institution'],sample['isolate'],'ENA_submission_date'])+ '\n')
+    else:
+        samples_table.write('\t'.join([sample_alias,action,'ena_accession',sample['title'],sample['scientific_name'],'tax_id_updated_by_ENA',sample['sample_description']])+ '\n')
+    # process the experiments from this sample
+    for exp_alias, exp in experiments_dict.items():
+        # maybe i should check here if any experiment has a study or sample alias that is incorrect? (not listed in the samples or study dict)
+        # process the experiments for this sample
+        if exp['sample_alias'] == sample_alias:
+            # exp_alias = ' +'_'+timestamp
+            # is this ok as a lib alias?
+            lib_alias = 'library_'+exp_alias +'_'+ exp['sample_alias']    #+str(exp_index)+'_'+str(sample_index)
+            experiments_table.write('\t'.join([exp_alias,action,'accession_ena',exp['title'],study_alias,sample_alias,exp['design_description'],lib_alias,exp['library_strategy'],exp['library_source'],exp['library_selection'],exp['library_layout'].lower(),str(exp['insert_size']),exp['library_construction_protocol'],exp['platform'],exp['instrument_model'],'submission_date_ENA']) + '\n')
+            for run_alias, run in runs_dict.items():
+                if run['experiment_alias'] == exp_alias:
+                    file_format = 'fastq'
+                    runs_table.write('\t'.join([run_alias,action,'ena_run_accession',exp_alias,run['file_name'],file_format,'file_checksum','submission_date_ENA']) + '\n')
+                    # run_index = 0
+                    # exp['runs'] is a list of lists
+                    # for run in exp['runs']:
+                        # run_index += 1
+                        # run_alias = '.'.join(['run_'+str(run_index),str(exp_index),str(sample_index)]) + '_' +timestamp
+                        # for file_entry in run:
+                            # file_format = 'fastq'
+                            # runs_table.write('\t'.join([run_alias,action,'ena_run_accession',exp_alias,file_entry,file_format,'file_checksum','submission_date_ENA']) + '\n')
+
+
+studies_table.close()
+samples_table.close()
+experiments_table.close()
+runs_table.close()
+