Mercurial > repos > ieguinoa > ena_upload
changeset 15:d85e35a36715 draft
Uploaded
author | ieguinoa |
---|---|
date | Thu, 24 Sep 2020 10:14:41 +0000 |
parents | 436da36c6114 |
children | 32494e1a490b |
files | ena_upload.xml extract_tables.py tool_data_table_conf.xml.sample |
diffstat | 3 files changed, 43 insertions(+), 76 deletions(-) [+] |
line wrap: on
line diff
--- a/ena_upload.xml Tue Sep 22 20:36:41 2020 +0000 +++ b/ena_upload.xml Thu Sep 24 10:14:41 2020 +0000 @@ -1,7 +1,8 @@ <tool id="ena_upload" name="ENA Upload tool" version="0.1.0" python_template_version="3.5"> <macros> <token name="@VERSION@">0.1.8</token> - </macros> + <import>samples_macros.xml</import> +</macros> <requirements> <requirement type="package" version="@VERSION@">ena-upload-cli</requirement> </requirements> @@ -9,7 +10,7 @@ cwd=\$(pwd); #set webin_id = os.environ.get('WEBIN_ID', None) #set webin_secret = os.environ.get('WEBIN_SECRET', None) -#set working_dir = os.getcwd() +#set working_dir = os.getcwd() #set $dry_run_option = "False" #if $action_options.input_format_conditional.input_format == "build_tables": python $__tool_directory__/extract_tables.py --out_dir \$cwd --studies $studies_json; @@ -34,7 +35,7 @@ ## create the list of files to upload and make the symlinks #set $files_to_upload = list() #if $action_options.input_format_conditional.input_format == "build_tables": - #for $study in $action_options.input_format_conditional.rep_study: + #for $study in $action_options.input_format_conditional.conditional_viral_metadata.rep_study: #for $sample in $study.rep_sample: #for $experiment in $sample.rep_experiment: #for $run in $experiment.rep_runs: @@ -77,11 +78,14 @@ --study $studies_table_path --run $runs_table_path --sample $samples_table_path + #if $action_options.input_format_conditional.conditional_viral_metadata.viral_sample == "true": + --vir + #end if #end if #if $action_options.submit_dev == "true": -d #end if - > $output + > $output #end if ]]></command> @@ -91,7 +95,7 @@ #if $action_options.input_format_conditional.input_format == "build_tables": #set $files_to_upload = list() #set $studies = list() - #for $study in $action_options.input_format_conditional.rep_study: + #for $study in $action_options.input_format_conditional.conditional_viral_metadata.rep_study: #set samples = list() #for $sample in $study.rep_sample: #set experiments = list() @@ -102,11 +106,15 @@ #for $file in $run.upload_files: $run_files.append(str($file.element_identifier)) #end for - $runs.append($run_files) + $runs.append($run_files) #end for - $experiments.append({'title':str($experiment.experiment_title),'experiment_design':str($experiment.experiment_design),'library_strategy':str($experiment.library_strategy),'library_source':str($experiment.library_source),'library_selection':str($experiment.library_strategy),'library_layout':str($experiment.library_layout),'insert_size':str($experiment.insert_size),'library_construction_protocol':str($experiment.library_construction_protocol),'platform':str($experiment.platform),'instrument_model':str($experiment.instrument_model),'runs':$runs}) + $experiments.append({'title':str($experiment.experiment_title),'experiment_design':str($experiment.experiment_design),'library_strategy':str($experiment.library_strategy),'library_source':str($experiment.library_source),'library_selection':str($experiment.library_selection),'library_layout':str($experiment.library_layout),'insert_size':str($experiment.insert_size),'library_construction_protocol':str($experiment.library_construction_protocol),'platform':str($experiment.platform),'instrument_model':str($experiment.instrument_model),'runs':$runs}) #end for - $samples.append({'title':str($sample.sample_title),'description':str($sample.sample_description),'tax_name':str($sample.scientific_name),'tax_id':str($sample.tax_id),'experiments':$experiments}) + #if $action_options.input_format_conditional.conditional_viral_metadata.viral_sample == "true": + $samples.append({'title':str($sample.sample_title),'description':str($sample.sample_description),'tax_name':str($sample.scientific_name),'tax_id':str($sample.tax_id),'collection_date':str($sample.collection_date),'geo_location':str($sample.geo_location_country),'host_common_name':str($sample.host_common_name),'host_subject_id':str($sample.host_subject_id),'host_health_state':str($sample.host_health_state),'host_sex':str($sample.host_sex),'host_scientific_name':str($sample.host_scientific_name),'collector_name':str($sample.collector_name),'collecting_institution':str($sample.collecting_institution),'isolate':str($sample.isolate),'experiments':$experiments}) + #else: + $samples.append({'title':str($sample.sample_title),'description':str($sample.sample_description),'tax_name':str($sample.scientific_name),'tax_id':str($sample.tax_id),'experiments':$experiments}) + #end if #end for $studies.append({'title':str($study.study_title),'type':str($study.study_type),'abstract':str($study.study_abstract),'pubmed_id':str($study.study_pubmed_id),'samples':$samples}) #end for @@ -126,7 +134,7 @@ <param name="input_format" type="select" label="Would you like to submit pregenerated table files or interactively define the input structures?"> <option value="user_generated_tables" selected="True">User generated tables of studies/experiments/runs/samples</option> <option value="build_tables" selected="False">Interactive generation of studies structure from dataset</option> - </param> + </param> <when value="user_generated_tables"> <param name="viral_submission" type="boolean" label="Does your submission data belong to a viral sample?" help="If you select yes then your data will be submitted using the ENA virus pathogen reporting standard checklist (see: https://ena-browser-docs.readthedocs.io/en/latest/help_and_guides/sars-cov-2-submissions.html)" /> <param name="data" type="data" format="fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="Select all datasets to upload" help="Compressed reads files listed in the runs table"/> @@ -137,71 +145,18 @@ </when> <when value="build_tables"> <param name="dry_run" type="boolean" label="Print the tables but do not submit the datasets"/> - <repeat name="rep_study" title="Study" min="1"> - <param name="study_title" type="text" optional="False" label="Please provide a short descriptive title for the study"/> - <param name="study_abstract" type="text" optional="True" label="Please provide an abstract to describe the study in detail"/> - <param name="study_type" type="select" label="Please select the type of study"> - <options from_file="study_type.txt"> - <column name="value" index="0"/> - </options> - </param> - <param name="study_pubmed_id" type="text" optional="True" value="" label="Please provide the PubMed id if exists (or leave it blank)"/> - <repeat name="rep_sample" title="Samples associated with this study" min="1" > - <param name="sample_title" type="text" default="Blood sample" label="Sample title"/> - <param name="sample_description" type="text" default="liver cells" label="Describe the type of sample"/> - <param name="scientific_name" type="text" default="Homo Sapiens" label="Enter the species of the sample" help=""/> - <param name="tax_id" type="text" label="Enter the taxonomic ID corresponding to the sample species" help=""/> - <repeat name="rep_experiment" title="Sequencing experiments performed with this sample" min="1" > - <param name="experiment_title" type="text" label="Specify an experiment title" /> - <param name="experiment_design" type="text" label="Describe the experiment design" /> - <param name="library_strategy" type="select" label="Library strategy" help="The library strategy specifies the sequencing technique intended for this library"> - <options from_file="library_strategy.txt"> - <column name="value" index="0"/> - </options> - </param> - <param name="library_source" type="select" label="Select library source" help="The library source specifies the type of source material that is being sequenced"> - <options from_file="library_source.txt"> - <column name="value" index="0"/> - </options> - </param> - <param name="library_selection" type="select" label="Library selection" help="The library selection specifies whether any method was used to select for or against, enrich, or screen the material being sequenced"> - <options from_file="library_selection.txt"> - <column name="value" index="0"/> - </options> - </param> - <param name="library_layout" type="select" label="Library layout"> - <options from_file="library_layout.txt"> - <column name="value" index="0"/> - </options> - </param> - <param name="insert_size" type="integer" value="0" label="Specify the insert size"/> - <param name="library_construction_protocol" type="text" label="Please describe the library construction protocol"/> - <param name="platform" type="select" label="Select the sequencing platform used"> - <option value="LS454">LS454</option> - <option value="ILLUMINA">Illumina</option> - <option value="HELICOS">Helicos</option> - <option value="ABI_SOLID">ABI Solid</option> - <option value="COMPLETE_GENOMICS">Complete Genomics</option> - <option value="BGISEQ">BGI Seq</option> - <option value="OXFORD_NANOPORE">Oxford Nanopore</option> - <option value="PACBIO_SMRT">PacBio</option> - <option value="ION_TORRENT">Ion Torrent</option> - <option value="CAPILLARY">Capillary sequencing</option> - </param> - <param name="instrument_model" type="select" label="Instrument model"> - <options from_file="instrument_model.txt"> - <column name="value" index="0"/> - </options> - </param> - <repeat name="rep_runs" title="Runs executed within this experiment" min="1" > - <param name="upload_files" type="data" format="fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz" multiple="true" label="File(s) associated with this run"/> - </repeat> - </repeat> - </repeat> - </repeat> + <conditional name="conditional_viral_metadata"> + <param name="viral_sample" type="boolean" label="Does your submission contains viral samples?" /> + <when value="true"> + <expand macro="viral_samples" /> + </when> + <when value="false"> + <expand macro="nonviral_samples" /> + </when> + </conditional> </when> </conditional> - <param name="center" type="text" optional="False" label="Affiliation center"/> + <param name="center" type="text" optional="False" label="Affiliation center"/> </when> <when value="modify"> <param name="modify_first" type="text" label="Mofification of submitted data is not yet possible"/>
--- a/extract_tables.py Tue Sep 22 20:36:41 2020 +0000 +++ b/extract_tables.py Thu Sep 24 10:14:41 2020 +0000 @@ -16,7 +16,6 @@ studies_table.write('\t'.join(['alias','status','accession','title','study_type','study_abstract','pubmed_id','submission_date']) + '\n') samples_table = open(pathlib.Path(args.out_path) / 'samples.tsv', 'w') -samples_table.write('\t'.join(['alias','status','accession','title','scientific_name','taxon_id','sample_description','submission_date']) + '\n') experiments_table = open(pathlib.Path(args.out_path) / 'experiments.tsv', 'w') experiments_table.write('\t'.join(['alias','status','accession','title','study_alias','sample_alias','design_description','library_name','library_strategy','library_source','library_selection','library_layout','insert_size','library_construction_protocol','platform','instrument_model','submission_date'])+ '\n') @@ -25,23 +24,33 @@ runs_table.write('\t'.join(['alias','status','accession','experiment_alias','file_name','file_format','file_checksum','submission_date'])+ '\n') action = 'add' +viral_submission = False for study_index, study in enumerate(studies_dict): study_alias = 'study_'+str(study_index) studies_table.write('\t'.join([study_alias,action,'ENA_accession',study['title'], study['type'],study['abstract'],study['pubmed_id'],'ENA_submission_data'])) + if "geo_location" in study['samples'][0].keys(): # sample belongs to a viral sample + samples_table.write('\t'.join(['alias','status','accession','title','scientific_name','taxon_id','sample_description','collection_date','geographic_location','host_common_name','host_subject_id','host_health_state','host_sex','host_scientific_name','collector_name','collecting_institution','isolate','submission_date']) + '\n') + else: + samples_table.write('\t'.join(['alias','status','accession','title','scientific_name','taxon_id','sample_description','submission_date']) + '\n') for sample_index,sample in enumerate(study['samples']): sample_alias = 'sample_'+str(sample_index) - samples_table.write('\t'.join([sample_alias,action,'ena_accession',sample['title'],sample['tax_name'], sample['tax_id'],sample['description'],'ENA_submission_date'])+ '\n') + if "geo_location" in sample.keys(): # sample belongs to a viral sample + if sample['collector_name'] == '': + sample['collector_name'] = 'unknown' + samples_table.write('\t'.join([sample_alias,action,'ena_accession',sample['title'],sample['tax_name'], sample['tax_id'],sample['description'],sample['collection_date'],sample['geo_location'],sample['host_common_name'],sample['host_subject_id'],sample['host_health_state'],sample['host_sex'],sample['host_scientific_name'],sample['collector_name'],sample['collecting_institution'],sample['isolate'],'ENA_submission_date'])+ '\n') + else: + samples_table.write('\t'.join([sample_alias,action,'ena_accession',sample['title'],sample['tax_name'], sample['tax_id'],sample['description'],'ENA_submission_date'])+ '\n') for exp_index,exp in enumerate(sample['experiments']): exp_alias = 'experiment_'+str(exp_index)+'_'+str(sample_index) lib_alias = 'library_'+str(exp_index)+'_'+str(sample_index) - experiments_table.write('\t'.join([exp_alias,action,'accession_ena',exp['title'],study_alias,sample_alias,exp['experiment_design'],lib_alias,exp['library_strategy'],exp['library_source'],exp['library_selection'],exp['library_layout'],exp['insert_size'],exp['library_construction_protocol'],exp['platform'],exp['instrument_model'],'submission_date_ENA']) + '\n') + experiments_table.write('\t'.join([exp_alias,action,'accession_ena',exp['title'],study_alias,sample_alias,exp['experiment_design'],lib_alias,exp['library_strategy'],exp['library_source'],exp['library_selection'],exp['library_layout'].lower(),exp['insert_size'],exp['library_construction_protocol'],exp['platform'],exp['instrument_model'],'submission_date_ENA']) + '\n') run_index = 0 # exp['runs'] is a list of lists for run in exp['runs']: run_index += 1 run_alias = '_'.join(['run',str(exp_index),str(sample_index),str(run_index)]) for file_entry in run: - file_format = 'fastq.gz' if os.path.splitext(file_entry)[-1] == '.gz' else 'fastq.bz2' + file_format = 'fastq' runs_table.write('\t'.join([run_alias,action,'ena_run_accession',exp_alias,file_entry,file_format,'file_checksum','submission_date_ENA']) + '\n') studies_table.close()
--- a/tool_data_table_conf.xml.sample Tue Sep 22 20:36:41 2020 +0000 +++ b/tool_data_table_conf.xml.sample Thu Sep 24 10:14:41 2020 +0000 @@ -6,5 +6,8 @@ <table name="instrument_model" comment_char="#" allow_duplicate_entries="False"><columns>value</columns><file path="tool-data/instrument_model.loc" /></table> <table name="library_source" comment_char="#" allow_duplicate_entries="False"><columns>value</columns><file path="tool-data/library_source.loc" /></table> <table name="library_strategy" comment_char="#" allow_duplicate_entries="False"><columns>value</columns><file path="tool-data/library_strategy.loc" /></table> + <table name="geographic_location_1" comment_char="#" allow_duplicate_entries="False"><columns>value</columns><file path="tool-data/geographic_location_1.loc" /></table> + <table name="host_sex_1" comment_char="#" allow_duplicate_entries="False"><columns>value</columns><file path="tool-data/host_sex_1.loc" /></table> + <table name="host_health_state_1" comment_char="#" allow_duplicate_entries="False"><columns>value</columns><file path="tool-data/host_health_state_1.loc" /></table> </tables>