Mercurial > repos > ieguinoa > ena_upload
changeset 4:b85d239b1d58 draft
Uploaded
author | ieguinoa |
---|---|
date | Wed, 05 Aug 2020 14:55:11 +0000 |
parents | e0f039988802 |
children | 19f34b5e27db |
files | .shed.yml README.md ena_upload.xml extract_tables.py tool-data/instrument_model.txt tool-data/library_layout.txt tool-data/library_selection.txt tool-data/library_source.txt tool-data/library_strategy.txt tool-data/study_type.txt |
diffstat | 10 files changed, 421 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
# .shed.yml -- Tool Shed repository metadata for the ena_upload Galaxy wrapper.
# `categories` must name existing Tool Shed categories; the previous "TODO"
# placeholder is not one of them.
categories:
    - Data Export
description: |
    Submits experimental data and respective metadata to the European Nucleotide Archive (ENA).
long_description: |
    The program submits experimental data and respective metadata to the European Nucleotide Archive (ENA).
    The metadata should be provided in separate tables corresponding to ENA objects STUDY, SAMPLE, EXPERIMENT and RUN.
name: ena_upload
owner: iuc
remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/tools/ena_upload
homepage_url: https://github.com/usegalaxy-eu/ena-upload-cli
type: unrestricted
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.md Wed Aug 05 14:55:11 2020 +0000 @@ -0,0 +1,2 @@ +# ena-upload-wrapper +Galaxy wrapper for ena-upload-cli
<!-- ena_upload.xml: Galaxy wrapper around ena-upload-cli. -->
<tool id="ena_upload" name="ENA Upload tool" version="0.1.0" python_template_version="3.5">
    <macros>
        <token name="@VERSION@">0.1.3</token>
    </macros>
    <requirements>
        <requirement type="package" version="@VERSION@">ena-upload-cli</requirement>
    </requirements>
    <command detect_errors="exit_code"><![CDATA[
## Every command after the first is prefixed with '&&' so that a failure in any
## step (e.g. extract_tables.py) aborts the job instead of being masked by the
## exit code of a later command.
cwd=\$(pwd)
## Webin credentials are read from the environment of the Galaxy server.
## NOTE(review): the password ends up on the command line of ena-upload-cli.
#set $webin_id = os.environ.get('WEBIN_ID', None)
#set $webin_pass = os.environ.get('WEBIN_PASS', None)
#set $format_cond = $action_options.input_format_conditional
#set $build_tables = str($format_cond.input_format) == "build_tables"
## dry_run and rep_study only exist in the build_tables branch of the conditional
#set $dry_run = False
#if $build_tables:
    #if str($format_cond.dry_run) == "true":
        #set $dry_run = True
    #end if
#end if

#if $build_tables:
    && python '$__tool_directory__/extract_tables.py' --out_dir "\$cwd" --studies '$studies_json'
    #if $dry_run:
        && cp "\$cwd/studies.tsv" '$studies_table_out'
        && cp "\$cwd/samples.tsv" '$samples_table_out'
        && cp "\$cwd/experiments.tsv" '$experiments_table_out'
        && cp "\$cwd/runs.tsv" '$runs_table_out'
    #end if
#end if

#if not $dry_run:
    ## collect the datasets to upload from whichever branch is active
    #set $upload_datasets = list()
    #if $build_tables:
        #for $study in $format_cond.rep_study:
            #for $sample in $study.rep_sample:
                #for $experiment in $sample.rep_experiment:
                    #for $run in $experiment.rep_runs:
                        #for $file in $run.upload_files:
                            #silent $upload_datasets.append($file)
                        #end for
                    #end for
                #end for
            #end for
        #end for
    #else:
        #for $file in $format_cond.data:
            #silent $upload_datasets.append($file)
        #end for
    #end if

    ## symlink every dataset under the name referenced in the runs table
    #for $file in $upload_datasets:
        && ln -s '$file' '$file.element_identifier'
    #end for

    && ena-upload-cli
        --action 'add'
        --center '$action_options.center'
        --webin_id '$webin_id'
        --password '$webin_pass'
        --data
        #for $file in $upload_datasets:
            '$file.element_identifier'
        #end for
    #if $build_tables:
        --experiment "\$cwd/experiments.tsv"
        --study "\$cwd/studies.tsv"
        --run "\$cwd/runs.tsv"
        --sample "\$cwd/samples.tsv"
    #else:
        --experiment '$format_cond.experiments_users_table'
        --study '$format_cond.studies_users_table'
        --run '$format_cond.runs_users_table'
        --sample '$format_cond.samples_users_table'
    #end if
    #if str($action_options.submit_dev) == "true":
        -d
    #end if
        > '$output'
#end if
    ]]></command>
    <configfiles>
        <!-- JSON description of the interactively defined studies, consumed by extract_tables.py -->
        <configfile name="studies_json">
#import json
#if str($action_options.input_format_conditional.input_format) == "build_tables":
    #set $studies = list()
    #for $study in $action_options.input_format_conditional.rep_study:
        #set $samples = list()
        #for $sample in $study.rep_sample:
            #set $experiments = list()
            #for $experiment in $sample.rep_experiment:
                #set $runs = list()
                #for $run in $experiment.rep_runs:
                    #set $run_files = list()
                    #for $file in $run.upload_files:
                        #silent $run_files.append(str($file.element_identifier))
                    #end for
                    #silent $runs.append($run_files)
                #end for
                #silent $experiments.append({'title':str($experiment.experiment_title),'experiment_design':str($experiment.experiment_design),'library_strategy':str($experiment.library_strategy),'library_source':str($experiment.library_source),'library_selection':str($experiment.library_selection),'library_layout':str($experiment.library_layout),'insert_size':str($experiment.insert_size),'library_construction_protocol':str($experiment.library_construction_protocol),'platform':str($experiment.platform),'instrument_model':str($experiment.instrument_model),'runs':$runs})
            #end for
            #silent $samples.append({'title':str($sample.sample_title),'description':str($sample.sample_description),'tax_name':str($sample.scientific_name),'tax_id':str($sample.tax_id),'experiments':$experiments})
        #end for
        #silent $studies.append({'title':str($study.study_title),'type':str($study.study_type),'abstract':str($study.study_abstract),'pubmed_id':str($study.study_pubmed_id),'samples':$samples})
    #end for
    #echo $json.dumps($studies)
#end if
        </configfile>
    </configfiles>
    <inputs>
        <conditional name="action_options">
            <param argument="--action" type="select" label="Action to execute">
                <option value="add" selected="True">Add new data</option>
                <option value="modify">Modify metadata</option>
            </param>
            <when value="add">
                <param name="submit_dev" type="boolean" label="Submit to test ENA server?"/>
                <conditional name="input_format_conditional">
                    <param name="input_format" type="select" label="Would you like to submit pregenerated table files or interactively define the input structures?">
                        <option value="user_generated_tables" selected="True">User generated tables of studies/experiments/runs/samples</option>
                        <option value="build_tables" selected="False">Interactive generation of studies structure from dataset</option>
                    </param>
                    <when value="user_generated_tables">
                        <param name="data" type="data" format="fastq.gz" multiple="true" label="Select all datasets to upload" help="Compressed reads files listed in the runs table"/>
                        <param name="experiments_users_table" type="data" format="tabular" multiple="false" label="Experiments table" help="Experiment metadata file"/>
                        <param name="studies_users_table" type="data" format="tabular" multiple="false" label="Studies table" help="Study metadata file"/>
                        <param name="runs_users_table" type="data" format="tabular" multiple="false" label="Runs table" help="Run metadata file"/>
                        <param name="samples_users_table" type="data" format="tabular" multiple="false" label="Samples table" help="Sample metadata file"/>
                    </when>
                    <when value="build_tables">
                        <param name="dry_run" type="boolean" label="Print the tables but do not submit the datasets"/>
                        <repeat name="rep_study" title="Study" min="1">
                            <param name="study_title" type="text" optional="False" label="Please provide a short descriptive title for the study"/>
                            <param name="study_abstract" type="text" optional="True" label="Please provide an abstract to describe the study in detail"/>
                            <param name="study_type" type="select" label="Please select the type of study">
                                <options from_file="study_type.txt">
                                    <column name="value" index="0"/>
                                </options>
                            </param>
                            <param name="study_pubmed_id" type="text" optional="True" value="" label="Please provide the PubMed id if exists (or leave it blank)"/>
                            <repeat name="rep_sample" title="Samples associated with this study" min="1" >
                                <!-- Galaxy text params take their initial content from "value", not "default" -->
                                <param name="sample_title" type="text" value="Blood sample" label="Sample title"/>
                                <param name="sample_description" type="text" value="liver cells" label="Describe the type of sample"/>
                                <param name="scientific_name" type="text" value="Homo Sapiens" label="Enter the species of the sample" help=""/>
                                <param name="tax_id" type="text" label="Enter the taxonomic ID corresponding to the sample species" help=""/>
                                <repeat name="rep_experiment" title="Sequencing experiments performed with this sample" min="1" >
                                    <param name="experiment_title" type="text" label="Specify an experiment title" />
                                    <param name="experiment_design" type="text" label="Describe the experiment design" />
                                    <param name="library_strategy" type="select" label="Library strategy" help="The library strategy specifies the sequencing technique intended for this library">
                                        <options from_file="library_strategy.txt">
                                            <column name="value" index="0"/>
                                        </options>
                                    </param>
                                    <param name="library_source" type="select" label="Select library source" help="The library source specifies the type of source material that is being sequenced">
                                        <options from_file="library_source.txt">
                                            <column name="value" index="0"/>
                                        </options>
                                    </param>
                                    <param name="library_selection" type="select" label="Library selection" help="The library selection specifies whether any method was used to select for or against, enrich, or screen the material being sequenced">
                                        <options from_file="library_selection.txt">
                                            <column name="value" index="0"/>
                                        </options>
                                    </param>
                                    <param name="library_layout" type="select" label="Library layout">
                                        <options from_file="library_layout.txt">
                                            <column name="value" index="0"/>
                                        </options>
                                    </param>
                                    <param name="insert_size" type="integer" value="0" label="Specify the insert size"/>
                                    <param name="library_construction_protocol" type="text" label="Please describe the library construction protocol"/>
                                    <param name="platform" type="select" label="Select the sequencing platform used">
                                        <option value="LS454">LS454</option>
                                        <option value="ILLUMINA">Illumina</option>
                                        <option value="HELICOS">Helicos</option>
                                        <option value="ABI_SOLID">ABI Solid</option>
                                        <option value="COMPLETE_GENOMICS">Complete Genomics</option>
                                        <option value="BGISEQ">BGI Seq</option>
                                        <option value="OXFORD_NANOPORE">Oxford Nanopore</option>
                                        <option value="PACBIO_SMRT">PacBio</option>
                                        <option value="ION_TORRENT">Ion Torrent</option>
                                        <option value="CAPILLARY">Capillary sequencing</option>
                                    </param>
                                    <param name="instrument_model" type="select" label="Instrument model">
                                        <options from_file="instrument_model.txt">
                                            <column name="value" index="0"/>
                                        </options>
                                    </param>
                                    <repeat name="rep_runs" title="Runs executed within this experiment" min="1" >
                                        <param name="upload_files" type="data" format="fastqsanger.gz,fastqsanger.bz2" multiple="true" label="File(s) associated with this run"/>
                                    </repeat>
                                </repeat>
                            </repeat>
                        </repeat>
                    </when>
                </conditional>
                <param name="center" type="text" optional="False" label="Affiliation center"/>
            </when>
            <when value="modify">
                <param name="modify_first" type="text" label="Modification of submitted data is not yet possible"/>
            </when>
        </conditional>
    </inputs>
    <outputs>
        <data name="output" format="data" label="${tool.name} on ${on_string}: Upload summary"/>
        <!-- The table outputs only exist for a dry run of the interactive (build_tables) mode.
             The filters check the action and input format first because 'dry_run' is only
             defined in that branch. -->
        <data name="studies_table_out" format="tabular" label="Studies table">
            <filter> action_options['action'] == 'add' and action_options['input_format_conditional']['input_format'] == 'build_tables' and action_options['input_format_conditional']['dry_run'] </filter>
        </data>
        <data name="samples_table_out" format="tabular" label="Samples table">
            <filter> action_options['action'] == 'add' and action_options['input_format_conditional']['input_format'] == 'build_tables' and action_options['input_format_conditional']['dry_run'] </filter>
        </data>
        <data name="experiments_table_out" format="tabular" label="Experiments table">
            <filter> action_options['action'] == 'add' and action_options['input_format_conditional']['input_format'] == 'build_tables' and action_options['input_format_conditional']['dry_run'] </filter>
        </data>
        <data name="runs_table_out" format="tabular" label="Runs table">
            <filter> action_options['action'] == 'add' and action_options['input_format_conditional']['input_format'] == 'build_tables' and action_options['input_format_conditional']['dry_run'] </filter>
        </data>
    </outputs>
    <help><![CDATA[
**What it does**

Submits sequencing reads and their metadata to the European Nucleotide Archive (ENA)
using ena-upload-cli.

**Inputs**

The metadata can be provided in two ways:

- *User generated tables*: four tabular datasets describing the studies, samples,
  experiments and runs, plus the compressed read files listed in the runs table.
- *Interactive generation*: studies, samples, experiments and runs are defined in the
  tool form and the four tables are generated from it. With the "Print the tables but
  do not submit the datasets" option the generated tables are returned as outputs and
  nothing is submitted.

**Credentials**

The Webin account used for the submission is read from the ``WEBIN_ID`` and
``WEBIN_PASS`` environment variables of the Galaxy server.

**Outputs**

The upload summary printed by ena-upload-cli and, for a dry run, the generated
studies, samples, experiments and runs tables.
    ]]></help>
</tool>
# extract_tables.py
"""Build the ENA submission tables from a JSON description of the studies.

The JSON file (``--studies``) holds a list of studies; each study has a list
of ``samples``, each sample a list of ``experiments`` and each experiment a
list of ``runs``, where every run is itself a list of file names.  Four
tab-separated tables are written to ``--out_dir``: ``studies.tsv``,
``samples.tsv``, ``experiments.tsv`` and ``runs.tsv``.
"""
import argparse
import json
import os
import pathlib

STUDY_COLUMNS = ['alias', 'status', 'accession', 'title', 'study_type',
                 'study_abstract', 'pubmed_id', 'submission_date']
SAMPLE_COLUMNS = ['alias', 'status', 'accession', 'title', 'scientific_name',
                  'taxon_id', 'sample_description', 'submission_date']
EXPERIMENT_COLUMNS = ['alias', 'status', 'accession', 'title', 'study_alias',
                      'sample_alias', 'design_description', 'library_name',
                      'library_strategy', 'library_source', 'library_selection',
                      'library_layout', 'insert_size',
                      'library_construction_protocol', 'platform',
                      'instrument_model', 'submission_date']
RUN_COLUMNS = ['alias', 'status', 'accession', 'experiment_alias', 'file_name',
               'file_format', 'file_checksum', 'submission_date']


def _write_row(table, fields):
    """Write one tab-separated, newline-terminated row to an open table file."""
    table.write('\t'.join(str(field) for field in fields) + '\n')


def write_tables(studies, out_dir, action='add'):
    """Write the studies/samples/experiments/runs tables into ``out_dir``.

    :param studies: list of study dicts as loaded from the studies JSON file
    :param out_dir: directory in which the four ``*.tsv`` files are created
    :param action: value written to the ``status`` column of every row
    :raises KeyError: if a study, sample or experiment lacks an expected key
    """
    out_dir = pathlib.Path(out_dir)
    # 'with' guarantees that all four files are flushed and closed even when
    # a malformed entry raises half-way through.
    with open(out_dir / 'studies.tsv', 'w') as studies_table, \
            open(out_dir / 'samples.tsv', 'w') as samples_table, \
            open(out_dir / 'experiments.tsv', 'w') as experiments_table, \
            open(out_dir / 'runs.tsv', 'w') as runs_table:
        _write_row(studies_table, STUDY_COLUMNS)
        _write_row(samples_table, SAMPLE_COLUMNS)
        _write_row(experiments_table, EXPERIMENT_COLUMNS)
        _write_row(runs_table, RUN_COLUMNS)

        # Samples are numbered across all studies so that sample, experiment
        # and run aliases stay unique when more than one study is defined.
        # For a single study the aliases are the same as with per-study
        # numbering.
        sample_index = -1
        for study_index, study in enumerate(studies):
            study_alias = 'study_' + str(study_index)
            _write_row(studies_table, [
                study_alias, action, 'ENA_accession', study['title'],
                study['type'], study['abstract'], study['pubmed_id'],
                'ENA_submission_data'])
            for sample in study['samples']:
                sample_index += 1
                sample_alias = 'sample_' + str(sample_index)
                _write_row(samples_table, [
                    sample_alias, action, 'ena_accession', sample['title'],
                    sample['tax_name'], sample['tax_id'],
                    sample['description'], 'ENA_submission_date'])
                for exp_index, exp in enumerate(sample['experiments']):
                    suffix = str(exp_index) + '_' + str(sample_index)
                    exp_alias = 'experiment_' + suffix
                    lib_alias = 'library_' + suffix
                    _write_row(experiments_table, [
                        exp_alias, action, 'accession_ena', exp['title'],
                        study_alias, sample_alias, exp['experiment_design'],
                        lib_alias, exp['library_strategy'],
                        exp['library_source'], exp['library_selection'],
                        exp['library_layout'], exp['insert_size'],
                        exp['library_construction_protocol'], exp['platform'],
                        exp['instrument_model'], 'submission_date_ENA'])
                    # exp['runs'] is a list of lists: one list of files per run;
                    # run numbering starts at 1
                    for run_index, run in enumerate(exp['runs'], start=1):
                        run_alias = '_'.join(['run', str(exp_index),
                                              str(sample_index), str(run_index)])
                        for file_entry in run:
                            # anything that is not '.gz' is treated as bz2
                            if os.path.splitext(file_entry)[-1] == '.gz':
                                file_format = 'fastq.gz'
                            else:
                                file_format = 'fastq.bz2'
                            _write_row(runs_table, [
                                run_alias, action, 'ena_run_accession',
                                exp_alias, file_entry, file_format,
                                'file_checksum', 'submission_date_ENA'])


def main():
    """Parse the command line and generate the tables."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--studies', dest='studies_json_path', required=True)
    parser.add_argument('--out_dir', dest='out_path', required=True)
    args = parser.parse_args()

    with open(args.studies_json_path, 'r') as studies_json_file:
        studies = json.load(studies_json_file)
    write_tables(studies, args.out_path)


if __name__ == '__main__':
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/instrument_model.txt Wed Aug 05 14:55:11 2020 +0000 @@ -0,0 +1,58 @@ +minION +GridION +PromethION +454 GS +454 GS 20 +454 GS FLX +454 GS FLX+ +454 GS FLX Titanium +454 GS Junior +Illumina Genome Analyzer +Illumina Genome Analyzer II +Illumina Genome Analyzer IIx +Illumina HiSeq 1000 +Illumina HiSeq 1500 +Illumina HiSeq 2000 +Illumina HiSeq 2500 +Illumina HiSeq 3000 +Illumina HiSeq 4000 +Illumina iSeq 100 +Illumina HiScanSQ +Illumina NextSeq 500 +Illumina NextSeq 550 +Illumina NovaSeq 6000 +Illumina HiSeq X Five +Illumina HiSeq X Ten +Illumina MiSeq +Illumina MiniSeq +AB SOLiD System +AB SOLiD System 2.0 +AB SOLiD System 3.0 +AB SOLiD 3 Plus System +AB SOLiD 4 System +AB SOLiD 4hq System +AB SOLiD PI System +AB 5500 Genetic Analyzer +AB 5500xl Genetic Analyzer +AB 5500xl-W Genetic Analysis System +Ion Torrent PGM +Ion Torrent Proton +Ion Torrent S5 +Ion Torrent S5 XL +Complete Genomics +PacBio RS +PacBio RS II +Sequel +Sequel II +AB 3730xL Genetic Analyzer +AB 3730 Genetic Analyzer +AB 3500xL Genetic Analyzer +AB 3500 Genetic Analyzer +AB 3130xL Genetic Analyzer +AB 3130 Genetic Analyzer +AB 310 Genetic Analyzer +BGISEQ-500 +DNBSEQ-T7 +DNBSEQ-G400 +DNBSEQ-G50 +DNBSEQ-G400 FAST
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/library_layout.txt Wed Aug 05 14:55:11 2020 +0000 @@ -0,0 +1,2 @@ +SINGLE +PAIRED
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/library_selection.txt Wed Aug 05 14:55:11 2020 +0000 @@ -0,0 +1,27 @@ +RANDOM +PCR +RANDOM PCR +RT-PCR +HMPR +MF +repeat fractionation +size fractionation +MSLL +cDNA +ChIP +MNase +DNase +Hybrid Selection +Reduced Representation +Restriction Digest +5-methylcytidine antibody +MBD2 protein methyl-CpG binding domain +CAGE +RACE +MDA +padlock probes capture method +Oligo-dT +Inverse rRNA selection +ChIP-Seq +other +unspecified
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/library_source.txt Wed Aug 05 14:55:11 2020 +0000 @@ -0,0 +1,7 @@ +GENOMIC +TRANSCRIPTOMIC +METAGENOMIC +METATRANSCRIPTOMIC +SYNTHETIC +VIRAL RNA +OTHER
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/library_strategy.txt Wed Aug 05 14:55:11 2020 +0000 @@ -0,0 +1,38 @@ +WGS +WGA +WXS +RNA-Seq +ssRNA-seq +miRNA-Seq +ncRNA-Seq +FL-cDNA +EST +Hi-C +ATAC-seq +WCS +RAD-Seq +CLONE +POOLCLONE +AMPLICON +CLONEEND +FINISHING +ChIP-Seq +MNase-Seq +DNase-Hypersensitivity +Bisulfite-Seq +CTS +MRE-Seq +MeDIP-Seq +MBD-Seq +Tn-Seq +VALIDATION +FAIRE-seq +SELEX +RIP-Seq +ChIA-PET +Synthetic-Long-Read +Targeted-Capture +Tethered Chromatin Conformation Capture +ChM-Seq +GBS +OTHER
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/study_type.txt Wed Aug 05 14:55:11 2020 +0000 @@ -0,0 +1,14 @@ +Whole Genome Sequencing +Metagenomics +Transcriptome Analysis +Resequencing +Epigenetics +Synthetic Genomics +Forensic or Paleo-genomics +Gene Regulation Study +Cancer Genomics +Population Genomics +RNASeq +Exome Sequencing +Pooled Clone Sequencing +Transcriptome Sequencing