<!-- Mercurial > repos > iuc > ega_download_client -->

<tool id="pyega3" name="EGA Download Client" version="@TOOL_VERSION@+galaxy1" profile="21.01" >
    <macros>
        <!-- Single source of truth for the wrapped pyEGA3 release: expanded in the
             tool's version attribute, in the package requirement below and in the
             version banner asserted by the tests. -->
        <token name="@TOOL_VERSION@">4.0.0</token>
    </macros>
    <requirements>
        <!-- Provides the 'pyega3' executable invoked by the command template. -->
        <requirement type="package" version="@TOOL_VERSION@">pyega3</requirement>
    </requirements>
        <command detect_errors="exit_code"><![CDATA[
## Command template (Cheetah -> shell). One of four pyega3 invocations is emitted,
## selected by the 'action_type' conditional:
##   list_datasets      : pyega3 datasets          (the pyega3 log file is the output)
##   list_dataset_files : pyega3 files <EGAD>      (log file post-processed into a table)
##   download_file      : pyega3 fetch <EGAF>      (single file, optional genomic range)
##   download_files     : pyega3 fetch per ID      (IDs read from a column of a tabular file)
##
## Values that do not pass through Galaxy's parameter sanitisation (user preferences,
## dataset contents) are shell-quoted with shlex.quote before reaching the command line.
#import shlex

## Mirror the fallback of the 'credentials' configfile: the public EGA test account is
## used unless BOTH a username and a password are configured, so the message below
## always names the account that is really used.
#set $username = $__user__.extra_preferences.get('ega_account|username', "")
#set $password = $__user__.extra_preferences.get('ega_account|password', "")
#if $username == "" or $password == "":
    #set $username = "ega-test-data@ebi.ac.uk (default user)"
#end if
#set $user_message = "Running as user: %s. Set your credentials via: User -> Preferences -> Manage Information" % $username
    echo $shlex.quote($user_message) &&

#if $action.action_type == "list_datasets"
    pyega3 -cf '$credentials'
      datasets
#elif $action.action_type == "list_dataset_files"
    pyega3 -cf '$credentials'
      files '$action.dataset_id'
    &&

    ## create file header
    echo -e 'File ID\tStatus\tBytes\tCheck sum\tFile name' > '$dataset_file_list' &&

    ## remove timestamps and convert spaces to tabs
    grep EGAF pyega3_output.log | sed -e 's/^\[.*\]\s\+//g' | sed 's/\s\+/\t/g' >> '$dataset_file_list'

#elif $action.action_type == "download_file"
    pyega3 -c \${PYEGA_CONNECTIONS:-30} -cf '$credentials'
      fetch '$action.file_id'
      --max-retries 10
      #if $action.range.reference_name
        --reference-name '$action.range.reference_name'
        ## str() keeps an explicit 0 from being treated as "not set"
        #if str($action.range.start)
          --start $action.range.start
        #end if
        #if str($action.range.end)
          --end $action.range.end
        #end if
      #end if
    ## pyega3 writes into a directory named after the file ID; rename it to 'downloads'
    && mv '$action.file_id' downloads
    && rm -f downloads/*.md5  ## checksum validation already performed by pyEGA, cleanup downloads folder

#elif $action.action_type == "download_files"
    ## Collect the EGAF identifiers from the selected (1-based) column of the ID table.
    ## Line endings are stripped so an ID in the last column does not carry a newline,
    ## and rows that are too short to contain the column are skipped.
    #set $id_index = int(str($action.id_column)) - 1
    #set $file_ids = []
    #set $id_handle = open(str($action.id_table))
    #for $row in $id_handle
        #set $cells = $row.rstrip('\r\n').split('\t')
        #if len($cells) > $id_index
            #set $candidate = $cells[$id_index].strip()
            #if $candidate.startswith('EGAF')
                #silent $file_ids.append($candidate)
            #end if
        #end if
    #end for
    #silent $id_handle.close()
    mkdir downloads
    #for $f in $file_ids
      &&
      pyega3 -c \${PYEGA_CONNECTIONS:-30} -cf '$credentials'
        fetch $shlex.quote($f)
        --max-retries 10
        #if $action.range.reference_name
          --reference-name '$action.range.reference_name'
          ## str() keeps an explicit 0 from being treated as "not set"
          #if str($action.range.start)
            --start $action.range.start
          #end if
          #if str($action.range.end)
            --end $action.range.end
          #end if
        #end if
        --output-dir downloads
    #end for
    && rm -f downloads/**/*.md5  ## checksum validation already performed by pyEGA, clean up downloads folder

#end if
    ]]></command>
    <configfiles>
        <!-- JSON credentials file passed to pyega3 via '-cf'. The account comes from the
             user's Galaxy preferences; the public EGA test account is substituted when
             either the username or the password is missing. -->
        <configfile name="credentials"><![CDATA[
#import json
#set $password = $__user__.extra_preferences.get('ega_account|password', "")
#set $username = $__user__.extra_preferences.get('ega_account|username', "")
#if $username == "" or $password == "":
   #set $username = "ega-test-data@ebi.ac.uk"
   #set $password = "egarocks"
#end if
## json.dumps emits the surrounding quotes and escapes '"', '\' and control characters,
## so credentials containing such characters still produce valid JSON.
{
    "username": $json.dumps(str($username)),
    "password": $json.dumps(str($password))
}
        ]]></configfile>
    </configfiles>
    <inputs>
        <!-- The selected action decides which pyega3 sub-command is run and which
             additional parameters are shown. -->
        <conditional name="action">
            <param name="action_type" type="select" label="What would you like to do?">
                <option value="list_datasets"> List my authorized datasets </option>
                <option value="list_dataset_files"> List files in a dataset </option>
                <option value="download_file"> Download a file </option>
                <option value="download_files"> Download multiple files (based on a file with IDs) </option>
            </param>
            <when value="list_dataset_files">
                <param name="dataset_id" type="text" optional="false" label="EGA Dataset Accession ID" help="Identifier starting with 'EGAD'. For example: EGAD00001003338">
                    <!-- Anchored so that the whole value, not just a prefix, has to match. -->
                    <validator type="regex" message="EGA dataset ID must be a string of numbers prefixed by 'EGAD'">^EGAD[0-9]+$</validator>
                </param>
            </when>
            <when value="list_datasets"/>
            <when value="download_file">
                <param name="file_id" type="text" optional="false" label="EGA File Accession Identifier" help="Identifier starting with 'EGAF'. For example: EGAF00001753735">
                    <!-- Only file accessions are accepted here; anchored so the whole value has to match. -->
                    <validator type="regex" message="EGA file ID must be a string of numbers prefixed by 'EGAF'">^EGAF[0-9]+$</validator>
                </param>
                <section name="range" title="Request a specific Genomic range?" expanded="false">
                    <param argument="--reference-name" type="text" optional="true" label="Reference Sequence Name" help="For example 'chr1', '1', or 'chrX'. If unspecified, all data is returned." />
                    <param argument="--start" type="integer" optional="true" min="0" label="Start Position" help="0-based, inclusive. Only used if a reference sequence name was specified"/>
                    <param argument="--end" type="integer" optional="true" min="0" label="End Position" help="0-based, exclusive. Only used if a reference sequence name was specified"/>
                </section>
            </when>
            <when value="download_files">
                <param name="id_table" type="data" format="tabular" label="Table with IDs to download" help="A tabular file where one column contains the set of file IDs. This will output a collection. Please select files that are all the same format (e.g. all BAM or all VCF)."/>
                <param name="id_column" type="data_column" data_ref="id_table" label="Column containing the file IDs" help="File Identifiers starting with 'EGAF'. For example: EGAF00001753735" />
                <section name="range" title="Request a specific Genomic range? (will be applied to ALL requested files)" expanded="false">
                    <param argument="--reference-name" type="text" optional="true" label="Reference Sequence Name" help="For example 'chr1', '1', or 'chrX'. If unspecified, all data is returned." />
                    <param argument="--start" type="integer" optional="true" min="0" label="Start Position" help="0-based, inclusive. Only used if a reference sequence name was specified"/>
                    <param argument="--end" type="integer" optional="true" min="0" label="End Position" help="0-based, exclusive. Only used if a reference sequence name was specified"/>
                </section>
            </when>
        </conditional>
        <!-- When enabled, the pyega3 log file is returned as an additional output. -->
        <param name="output_log" type="boolean" checked="false" label="Output the log file?"/>
    </inputs>
    <outputs>
        <!-- list_datasets: the pyega3 log file itself is the result. -->
        <data name="authorized_datasets"
              format="txt"
              from_work_dir="pyega3_output.log"
              label="${tool.name}: authorized datasets">
            <filter> action['action_type'] == 'list_datasets' </filter>
        </data>

        <!-- list_dataset_files: table written by the command template. -->
        <data name="dataset_file_list"
              format="tabular"
              label="${tool.name}: ${action.dataset_id} - file list">
            <filter> action['action_type'] == 'list_dataset_files' </filter>
        </data>

        <!-- download_file: the fetched file picked up from the 'downloads' directory;
             its datatype is detected automatically. -->
        <data name="downloaded_file"
              auto_format="true"
              from_work_dir="downloads/*"
              label="${tool.name}: ${action.file_id} ${action.range.reference_name} ${action.range.start} ${action.range.end}">
            <filter> action['action_type'] == 'download_file' </filter>
        </data>

        <!-- Optional copy of the pyega3 log, available for every action. -->
        <data name="logfile"
              format="txt"
              from_work_dir="pyega3_output.log"
              label="${tool.name}: log">
            <filter> output_log </filter>
        </data>

        <!-- download_files: one list element per file discovered (recursively) under 'downloads'. -->
        <collection name="downloaded_file_collection"
                    type="list"
                    label="${tool.name} on ${on_string}: Downloaded datasets">
            <filter> action['action_type'] == 'download_files' </filter>
            <discover_datasets pattern="__designation_and_ext__" recurse="true" directory="downloads" />
        </collection>
    </outputs>
    <tests>
        <!-- list datasets with default credentials -->
        <test expect_num_outputs="1">
            <param name="action_type" value="list_datasets"/>
            <output name="authorized_datasets" ftype="txt">
                <assert_contents>
                    <has_text text="pyEGA3 - EGA python client version @TOOL_VERSION@"/>
                    <has_text text="EGAD00001003338"/>
                </assert_contents>
            </output>
        </test>

        <!-- list dataset files with default credentials, and request a log output file -->
        <test expect_num_outputs="2">
            <param name="action_type" value="list_dataset_files"/>
            <param name="dataset_id" value="EGAD00001003338"/>
            <param name="output_log" value="true"/>
            <output name="dataset_file_list" file="filelist_EGAD00001003338.tabular"/>
            <output name="logfile" ftype="txt">
                <assert_contents>
                    <has_text text="pyEGA3 - EGA python client version @TOOL_VERSION@"/>
                    <has_line_matching expression="^\[.*\]\s+File ID\s+Status\s+Bytes\s+Check sum\s+File name$"/>
                    <has_text text="EGAF00001753734"/>
                </assert_contents>
            </output>
        </test>

        <!-- download a single file -->
        <test expect_num_outputs="1">
            <param name="action_type" value="download_file"/>
            <param name="file_id" value="EGAF00001775036"/>
            <output name="downloaded_file" md5="3b89b96387db5199fef6ba613f70e27c"/>
        </test>

        <!-- download a single file, with genomic range specified -->
        <test expect_num_outputs="1">
            <param name="action_type" value="download_file"/>
            <param name="file_id" value="EGAF00001753756"/>
            <param name="reference_name" value="1"/>
            <param name="start" value="0"/>
            <param name="end" value="10000"/>
            <output name="downloaded_file" ftype="bam" md5="e576a38748feec45aa45191f6e902ce2"/>
        </test>

        <!-- download multiple files -->
        <test expect_num_outputs="1">
            <param name="action_type" value="download_files"/>
            <param name="id_table" value="filelist.tabular"/>
            <param name="id_column" value="1"/>
            <output_collection name="downloaded_file_collection" type="list" count="2">
                <element name="ENCFF000VWO.bam" md5="b8ae14d5d1f717ab17d45e8fc36946a0" />
                <element name="ENCFF284YOU.bam" md5="3b89b96387db5199fef6ba613f70e27c" />
            </output_collection>
        </test>

        <!-- download multiple files, in combination with a genomic range -->
        <test expect_num_outputs="1">
            <param name="action_type" value="download_files"/>
            <param name="id_table" value="filelist2.tabular"/>
            <param name="id_column" value="1"/>
            <param name="reference_name" value="1"/>
            <param name="start" value="0"/>
            <param name="end" value="10000"/>
            <output_collection name="downloaded_file_collection" count="2">
                <element name="NA19239_genomic_range_1_0_10000" md5="bcdcf18846233cbe5cc8afd95168552c" />
                <element name="NA19240_genomic_range_1_0_10000" md5="e576a38748feec45aa45191f6e902ce2" />
            </output_collection>
        </test>
    </tests>
    <help><![CDATA[
The pyEGA3 download client is a python-based tool for viewing and downloading files from authorized EGA datasets.

If you have an EGA account, you can set your EGA credentials in the user preferences menu of Galaxy. Otherwise, default EGA credentials with access to an example dataset will be used.

pyEGA3 uses the EGA Data API and has several key features:

- Files are transferred over secure HTTPS connections and are received unencrypted, so there is no need to decrypt them after download.
- Downloads resume from where they left off in the event that the connection is interrupted.
- pyEGA3 supports file segmenting and parallelized download of segments, improving overall performance.
- After download completes, file integrity is verified using checksums.
- pyEGA3 implements the GA4GH-compliant htsget protocol for download of genomic ranges for data files with accompanying index files.

    ]]></help>
    <citations>
        <!-- DOI of the publication to cite when results obtained with this tool are used. -->
        <citation type="doi">10.1038/ng.3312</citation>
    </citations>
</tool>
<!--
author	iuc
date	Tue, 14 Jun 2022 17:05:39 +0000
parents	95e6ae085d2e
children	6070512177f8
-->