changeset 4:b85d239b1d58 draft

Uploaded
author ieguinoa
date Wed, 05 Aug 2020 14:55:11 +0000
parents e0f039988802
children 19f34b5e27db
files .shed.yml README.md ena_upload.xml extract_tables.py tool-data/instrument_model.txt tool-data/library_layout.txt tool-data/library_selection.txt tool-data/library_source.txt tool-data/library_strategy.txt tool-data/study_type.txt
diffstat 10 files changed, 421 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.shed.yml	Wed Aug 05 14:55:11 2020 +0000
@@ -0,0 +1,12 @@
+categories:
+    - TODO
+description: |
+    Submits experimental data and respective metadata to the European Nucleotide Archive (ENA). 
+long_description: |
+    The program submits experimental data and respective metadata to the European Nucleotide Archive (ENA). 
+    The metadata should be provided in separate tables corresponding to ENA objects STUDY, SAMPLE, EXPERIMENT and RUN
+name: ena_upload
+owner: iuc
+remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/ena_upload
+homepage_url: https://github.com/usegalaxy-eu/ena-upload-cli
+type: unrestricted
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.md	Wed Aug 05 14:55:11 2020 +0000
@@ -0,0 +1,2 @@
+# ena-upload-wrapper
+Galaxy wrapper for ena-upload-cli
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ena_upload.xml	Wed Aug 05 14:55:11 2020 +0000
@@ -0,0 +1,211 @@
+<tool id="ena_upload" name="ENA Upload tool" version="0.1.0" python_template_version="3.5">
+  <macros>
+    <token name="@VERSION@">0.1.3</token>   
+  </macros>    
+  <requirements>
+    <requirement type="package" version="@VERSION@">ena-upload-cli</requirement>
+  </requirements>
+  <command detect_errors="exit_code"><![CDATA[
+cwd=\$(pwd);
+#set webin_id = os.environ.get('WEBIN_ID', None)
+#set webin_pass = os.environ.get('WEBIN_PASS', None)
+## dry_run and rep_study exist only in the build_tables branch, so they are read only there
+#set $build_tables = str($action_options.input_format_conditional.input_format) == "build_tables"
+#set $dry_run = False
+#set $files_to_upload = list()
+#if $build_tables:
+  #set $dry_run = str($action_options.input_format_conditional.dry_run) == "true"
+  python '$__tool_directory__/extract_tables.py' --out_dir \$cwd --studies '$studies_json';
+  ## "$cwd" refers to the shell variable set above, so the paths are double-quoted for the shell
+  #set $studies_table_path = "$cwd/studies.tsv"
+  #set $samples_table_path = "$cwd/samples.tsv"
+  #set $experiments_table_path = "$cwd/experiments.tsv"
+  #set $runs_table_path = "$cwd/runs.tsv"
+  #for $study in $action_options.input_format_conditional.rep_study:
+    #for $sample in $study.rep_sample:
+      #for $experiment in $sample.rep_experiment:
+        #for $run in $experiment.rep_runs:
+          #for $file in $run.upload_files:
+            ln -s '$file' '$file.element_identifier';
+            #silent $files_to_upload.append(str($file.element_identifier))
+          #end for
+        #end for
+      #end for
+    #end for
+  #end for
+#else:
+  #set $studies_table_path = $action_options.input_format_conditional.studies_users_table
+  #set $samples_table_path = $action_options.input_format_conditional.samples_users_table
+  #set $experiments_table_path = $action_options.input_format_conditional.experiments_users_table
+  #set $runs_table_path = $action_options.input_format_conditional.runs_users_table
+  #for $file in $action_options.input_format_conditional.data:
+    ln -s '$file' '$file.element_identifier';
+    #silent $files_to_upload.append(str($file.element_identifier))
+  #end for
+#end if
+#if $dry_run:
+    cp "$studies_table_path" '$studies_table_out';
+    cp "$samples_table_path" '$samples_table_out';
+    cp "$experiments_table_path" '$experiments_table_out';
+    cp "$runs_table_path" '$runs_table_out';
+#else:
+ena-upload-cli
+    --action 'add'
+    --center '$action_options.center'
+    --webin_id '$webin_id'
+    --password '$webin_pass'
+    --data
+    #for $dataset in $files_to_upload:
+      '$dataset'
+    #end for
+    --experiment "$experiments_table_path"
+    --study "$studies_table_path"
+    --run "$runs_table_path"
+    --sample "$samples_table_path"
+    #if $action_options.submit_dev == "true":
+    -d
+    #end if
+  > '$output'
+#end if
+    ]]></command>
+    <configfiles>
+    <configfile name="studies_json">
+#import json
+#if $action_options.input_format_conditional.input_format == "build_tables":
+  ## Dump the nested study/sample/experiment/run repeats as JSON (passed to extract_tables.py via --studies)
+  #set $studies = list()
+  #for $study in $action_options.input_format_conditional.rep_study:
+    #set samples = list()
+    #for $sample in $study.rep_sample:
+      #set experiments = list()
+      #for $experiment in $sample.rep_experiment:
+        #set runs = list()
+        #for $run in $experiment.rep_runs:
+            #set run_files = list()
+            #for $file in $run.upload_files:
+              $run_files.append(str($file.element_identifier))
+            #end for
+            $runs.append($run_files) 
+        #end for
+        $experiments.append({'title':str($experiment.experiment_title),'experiment_design':str($experiment.experiment_design),'library_strategy':str($experiment.library_strategy),'library_source':str($experiment.library_source),'library_selection':str($experiment.library_selection),'library_layout':str($experiment.library_layout),'insert_size':str($experiment.insert_size),'library_construction_protocol':str($experiment.library_construction_protocol),'platform':str($experiment.platform),'instrument_model':str($experiment.instrument_model),'runs':$runs})
+      #end for
+      $samples.append({'title':str($sample.sample_title),'description':str($sample.sample_description),'tax_name':str($sample.scientific_name),'tax_id':str($sample.tax_id),'experiments':$experiments})
+    #end for
+    $studies.append({'title':str($study.study_title),'type':str($study.study_type),'abstract':str($study.study_abstract),'pubmed_id':str($study.study_pubmed_id),'samples':$samples})
+  #end for
+  #echo $json.dumps($studies)
+#end if
+        </configfile>
+    </configfiles>
+<inputs>
+       <conditional name="action_options">
+            <param argument="--action" type="select" label="Action to execute">
+                <option value="add" selected="True">Add new data</option>
+                <option value="modify">Modify metadata</option>
+            </param>
+            <when value="add">
+                <param name="submit_dev" type="boolean" label="Submit to test ENA server?"/>
+                <conditional name="input_format_conditional">
+                    <param name="input_format" type="select" label="Would you like to submit pregenerated table files or interactively define the input structures?">
+                        <option value="user_generated_tables" selected="True">User generated tables of studies/experiments/runs/samples</option>
+                        <option value="build_tables" selected="False">Interactive generation of studies structure from dataset</option>
+                    </param>    
+                    <when value="user_generated_tables">
+                        <param name="data" type="data" format="fastq.gz" multiple="true" label="Select all datasets to upload" help="Compressed reads files listed in the runs table"/>
+                        <param name="experiments_users_table" type="data" format="tabular" multiple="false" label="Experiments table" help="Experiment metadata file"/>
+                        <param name="studies_users_table" type="data" format="tabular" multiple="false" label="Studies table" help="Study metadata file"/>
+                        <param name="runs_users_table" type="data" format="tabular" multiple="false" label="Runs table" help="Run metadata file"/>
+                        <param name="samples_users_table" type="data" format="tabular" multiple="false" label="Samples table" help="Sample metadata file"/>
+                    </when>
+                    <when value="build_tables">
+                        <param name="dry_run" type="boolean" label="Print the tables but do not submit the datasets"/>
+                        <repeat name="rep_study" title="Study" min="1">
+                            <param name="study_title" type="text" optional="False" label="Please provide a short descriptive title for the study"/>
+                            <param name="study_abstract" type="text" optional="True" label="Please provide an abstract to describe the study in detail"/>
+                            <param name="study_type" type="select" label="Please select the type of study">
+                                    <options from_file="study_type.txt">
+                                        <column name="value" index="0"/>
+                                    </options>
+                            </param>
+                            <param name="study_pubmed_id" type="text" optional="True" value="" label="Please provide the PubMed id if exists (or leave it blank)"/>
+                            <repeat name="rep_sample" title="Samples associated with this study" min="1" >
+                                <param name="sample_title" type="text" label="Sample title" help="e.g. Blood sample"/>
+                                <param name="sample_description" type="text" label="Describe the type of sample" help="e.g. liver cells"/>
+                                <param name="scientific_name" type="text" label="Enter the species of the sample" help="e.g. Homo sapiens"/>
+                                <param name="tax_id" type="text" label="Enter the taxonomic ID corresponding to the sample species" help=""/>
+                                <repeat name="rep_experiment" title="Sequencing experiments performed with this sample" min="1" >
+                                    <param name="experiment_title" type="text" label="Specify an experiment title" />
+                                    <param name="experiment_design" type="text" label="Describe the experiment design" />
+                                    <param name="library_strategy" type="select" label="Library strategy" help="The library strategy specifies the sequencing technique intended for this library">
+                                        <options from_file="library_strategy.txt">
+                                            <column name="value" index="0"/>
+                                        </options>
+                                    </param>
+                                    <param name="library_source" type="select" label="Select library source" help="The library source specifies the type of source material that is being sequenced">
+                                        <options from_file="library_source.txt">
+                                            <column name="value" index="0"/>
+                                        </options>
+                                    </param>
+                                    <param name="library_selection" type="select" label="Library selection" help="The library selection specifies whether any method was used to select for or against, enrich, or screen the material being sequenced">
+                                        <options from_file="library_selection.txt">
+                                            <column name="value" index="0"/>
+                                        </options>
+                                    </param>
+                                    <param name="library_layout" type="select" label="Library layout">
+                                        <options from_file="library_layout.txt">
+                                            <column name="value" index="0"/>
+                                        </options>
+                                    </param>
+                                    <param name="insert_size" type="integer" value="0" label="Specify the insert size"/>
+                                    <param name="library_construction_protocol" type="text" label="Please describe the library construction protocol"/>
+                                    <param name="platform" type="select" label="Select the sequencing platform used">
+                                        <option value="LS454">LS454</option>
+                                        <option value="ILLUMINA">Illumina</option>
+                                        <option value="HELICOS">Helicos</option>
+                                        <option value="ABI_SOLID">ABI Solid</option>
+                                        <option value="COMPLETE_GENOMICS">Complete Genomics</option>
+                                        <option value="BGISEQ">BGI Seq</option>
+                                        <option value="OXFORD_NANOPORE">Oxford Nanopore</option>
+                                        <option value="PACBIO_SMRT">PacBio</option>
+                                        <option value="ION_TORRENT">Ion Torrent</option>
+                                        <option value="CAPILLARY">Capillary sequencing</option>
+                                    </param>
+                                    <param name="instrument_model" type="select" label="Instrument model">
+                                        <options from_file="instrument_model.txt">
+                                            <column name="value" index="0"/>
+                                        </options>
+                                    </param>
+                                    <repeat name="rep_runs" title="Runs executed within this experiment" min="1" >
+                                        <param name="upload_files" type="data" format="fastqsanger.gz,fastqsanger.bz2" multiple="true" label="File(s) associated with this run"/>
+                                    </repeat>
+                                </repeat>
+                            </repeat>
+                        </repeat>
+                    </when>
+                </conditional>
+            <param name="center" type="text" optional="False" label="Affiliation center"/>
+            </when>
+            <when value="modify">
+                <param name="modify_first" type="text" label="Modification of submitted data is not yet possible"/>
+            </when>
+        </conditional>
+    </inputs>
+    <outputs> <!-- table outputs: dry_run is only defined when input_format is build_tables, so test that first -->
+        <data name="output" format="data" label="${tool.name} on ${on_string}: Upload summary"/>
+        <data name="studies_table_out" format="tabular" label="Studies table">
+            <filter> action_options['input_format_conditional']['input_format'] == 'build_tables' and action_options['input_format_conditional']['dry_run'] == True </filter>
+        </data>
+        <data name="samples_table_out" format="tabular" label="Samples table">
+            <filter> action_options['input_format_conditional']['input_format'] == 'build_tables' and action_options['input_format_conditional']['dry_run'] == True </filter>
+        </data>
+        <data name="experiments_table_out" format="tabular" label="Experiments table">
+            <filter> action_options['input_format_conditional']['input_format'] == 'build_tables' and action_options['input_format_conditional']['dry_run'] == True </filter>
+        </data>
+        <data name="runs_table_out" format="tabular" label="Runs table">
+            <filter> action_options['input_format_conditional']['input_format'] == 'build_tables' and action_options['input_format_conditional']['dry_run'] == True </filter>
+        </data>
+    </outputs>
+    <help><![CDATA[
+        TODO: Fill in help.
+    ]]></help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_tables.py	Wed Aug 05 14:55:11 2020 +0000
@@ -0,0 +1,50 @@
+import argparse
+import json
+import os
+import pathlib
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--studies',dest='studies_json_path', required=True)
+parser.add_argument('--out_dir',dest='out_path', required=True)
+args = parser.parse_args()
+
+
+with open(args.studies_json_path,'r') as studies_json_file:
+    studies_dict = json.load(studies_json_file)
+
+studies_table = open(pathlib.Path(args.out_path) / 'studies.tsv', 'w')
+studies_table.write('\t'.join(['alias','status','accession','title','study_type','study_abstract','pubmed_id','submission_date']) + '\n')
+
+samples_table = open(pathlib.Path(args.out_path) / 'samples.tsv', 'w')
+samples_table.write('\t'.join(['alias','status','accession','title','scientific_name','taxon_id','sample_description','submission_date']) + '\n')
+
+experiments_table = open(pathlib.Path(args.out_path) / 'experiments.tsv', 'w')
+experiments_table.write('\t'.join(['alias','status','accession','title','study_alias','sample_alias','design_description','library_name','library_strategy','library_source','library_selection','library_layout','insert_size','library_construction_protocol','platform','instrument_model','submission_date'])+ '\n')
+
+runs_table = open(pathlib.Path(args.out_path) / 'runs.tsv', 'w')
+runs_table.write('\t'.join(['alias','status','accession','experiment_alias','file_name','file_format','file_checksum','submission_date'])+ '\n')
+
+action = 'add'
+for study_index, study in enumerate(studies_dict):
+    study_alias = 'study_'+str(study_index)
+    studies_table.write('\t'.join([study_alias,action,'ENA_accession',study['title'], study['type'],study['abstract'],study['pubmed_id'],'ENA_submission_date']) + '\n')
+    for sample_index,sample in enumerate(study['samples']):
+        # Aliases carry the study index so rows from different studies never share an alias.
+        sample_alias = '_'.join(['sample',str(study_index),str(sample_index)])
+        samples_table.write('\t'.join([sample_alias,action,'ena_accession',sample['title'],sample['tax_name'], sample['tax_id'],sample['description'],'ENA_submission_date'])+ '\n')
+        for exp_index,exp in enumerate(sample['experiments']):
+            exp_suffix = '_'.join([str(study_index),str(exp_index),str(sample_index)])
+            exp_alias = 'experiment_'+exp_suffix
+            lib_alias = 'library_'+exp_suffix
+            experiments_table.write('\t'.join([exp_alias,action,'accession_ena',exp['title'],study_alias,sample_alias,exp['experiment_design'],lib_alias,exp['library_strategy'],exp['library_source'],exp['library_selection'],exp['library_layout'],exp['insert_size'],exp['library_construction_protocol'],exp['platform'],exp['instrument_model'],'submission_date_ENA']) + '\n')
+            # exp['runs'] is a list of lists: one list of file names per run (runs are numbered from 1)
+            for run_index,run in enumerate(exp['runs'], start=1):
+                run_alias = '_'.join(['run',exp_suffix,str(run_index)])
+                for file_entry in run:
+                    file_format = 'fastq.gz' if os.path.splitext(file_entry)[-1] == '.gz' else 'fastq.bz2'
+                    runs_table.write('\t'.join([run_alias,action,'ena_run_accession',exp_alias,file_entry,file_format,'file_checksum','submission_date_ENA']) + '\n')
+
+studies_table.close()
+samples_table.close()
+experiments_table.close()
+runs_table.close()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/instrument_model.txt	Wed Aug 05 14:55:11 2020 +0000
@@ -0,0 +1,58 @@
+minION
+GridION
+PromethION
+454 GS
+454 GS 20
+454 GS FLX
+454 GS FLX+
+454 GS FLX Titanium
+454 GS Junior
+Illumina Genome Analyzer
+Illumina Genome Analyzer II
+Illumina Genome Analyzer IIx
+Illumina HiSeq 1000
+Illumina HiSeq 1500
+Illumina HiSeq 2000
+Illumina HiSeq 2500
+Illumina HiSeq 3000
+Illumina HiSeq 4000
+Illumina iSeq 100
+Illumina HiScanSQ
+Illumina NextSeq 500
+Illumina NextSeq 550
+Illumina NovaSeq 6000
+Illumina HiSeq X Five
+Illumina HiSeq X Ten
+Illumina MiSeq
+Illumina MiniSeq
+AB SOLiD System
+AB SOLiD System 2.0
+AB SOLiD System 3.0
+AB SOLiD 3 Plus System
+AB SOLiD 4 System
+AB SOLiD 4hq System
+AB SOLiD PI System
+AB 5500 Genetic Analyzer
+AB 5500xl Genetic Analyzer
+AB 5500xl-W Genetic Analysis System
+Ion Torrent PGM
+Ion Torrent Proton
+Ion Torrent S5
+Ion Torrent S5 XL
+Complete Genomics
+PacBio RS
+PacBio RS II
+Sequel
+Sequel II
+AB 3730xL Genetic Analyzer
+AB 3730 Genetic Analyzer
+AB 3500xL Genetic Analyzer
+AB 3500 Genetic Analyzer
+AB 3130xL Genetic Analyzer
+AB 3130 Genetic Analyzer
+AB 310 Genetic Analyzer
+BGISEQ-500
+DNBSEQ-T7
+DNBSEQ-G400
+DNBSEQ-G50
+DNBSEQ-G400 FAST
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/library_layout.txt	Wed Aug 05 14:55:11 2020 +0000
@@ -0,0 +1,2 @@
+SINGLE
+PAIRED
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/library_selection.txt	Wed Aug 05 14:55:11 2020 +0000
@@ -0,0 +1,27 @@
+RANDOM
+PCR
+RANDOM PCR
+RT-PCR
+HMPR
+MF
+repeat fractionation
+size fractionation
+MSLL
+cDNA
+ChIP
+MNase
+DNase
+Hybrid Selection
+Reduced Representation
+Restriction Digest
+5-methylcytidine antibody
+MBD2 protein methyl-CpG binding domain
+CAGE
+RACE
+MDA
+padlock probes capture method
+Oligo-dT
+Inverse rRNA selection
+ChIP-Seq
+other
+unspecified
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/library_source.txt	Wed Aug 05 14:55:11 2020 +0000
@@ -0,0 +1,7 @@
+GENOMIC
+TRANSCRIPTOMIC
+METAGENOMIC
+METATRANSCRIPTOMIC
+SYNTHETIC
+VIRAL RNA
+OTHER
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/library_strategy.txt	Wed Aug 05 14:55:11 2020 +0000
@@ -0,0 +1,38 @@
+WGS
+WGA
+WXS
+RNA-Seq
+ssRNA-seq
+miRNA-Seq
+ncRNA-Seq
+FL-cDNA
+EST
+Hi-C
+ATAC-seq
+WCS
+RAD-Seq
+CLONE
+POOLCLONE
+AMPLICON
+CLONEEND
+FINISHING
+ChIP-Seq
+MNase-Seq
+DNase-Hypersensitivity
+Bisulfite-Seq
+CTS
+MRE-Seq
+MeDIP-Seq
+MBD-Seq
+Tn-Seq
+VALIDATION
+FAIRE-seq
+SELEX
+RIP-Seq
+ChIA-PET
+Synthetic-Long-Read
+Targeted-Capture
+Tethered Chromatin Conformation Capture
+ChM-Seq
+GBS
+OTHER
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/study_type.txt	Wed Aug 05 14:55:11 2020 +0000
@@ -0,0 +1,14 @@
+Whole Genome Sequencing
+Metagenomics
+Transcriptome Analysis
+Resequencing
+Epigenetics
+Synthetic Genomics
+Forensic or Paleo-genomics
+Gene Regulation Study
+Cancer Genomics
+Population Genomics
+RNASeq
+Exome Sequencing
+Pooled Clone Sequencing
+Transcriptome Sequencing