Mercurial > repos > sanbi-uwc > data_manager_fetch_refseq

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/fetch_artic_primers.py	Thu Apr 16 10:19:57 2020 +0000
@@ -0,0 +1,87 @@
+#!/usr/bin/env python
+"""Galaxy data manager script: download ARTIC SARS-CoV-2 primer schemes (BED files)."""
+
+from __future__ import print_function, division
+
+import argparse
+import json
+import os
+import os.path
+import sys
+
+import requests
+
+DATA_TABLE_NAME = 'artic_primers'
+
+# Upstream BED file for each supported primer scheme, keyed by the data table value.
+PRIMER_SETS = {
+    'ARTICv1': 'https://raw.githubusercontent.com/artic-network/artic-ncov2019/master/primer_schemes/nCoV-2019/V1/nCoV-2019.bed',
+    'ARTICv2': 'https://raw.githubusercontent.com/artic-network/artic-ncov2019/master/primer_schemes/nCoV-2019/V2/nCoV-2019.bed',
+    'ARTICv3': 'https://raw.githubusercontent.com/artic-network/artic-ncov2019/master/primer_schemes/nCoV-2019/V3/nCoV-2019.bed',
+}
+
+
+def fetch_artic_primers(output_filename, output_directory, primers):
+    """Download the requested primer sets and record them in the Galaxy JSON file.
+
+    output_filename: Galaxy data manager JSON file; it is read, extended with
+        entries for the DATA_TABLE_NAME data table and written back in place.
+    output_directory: directory that receives one <name>.bed file per primer
+        set (created if it does not exist).
+    primers: primer set names to fetch, as a list or a comma separated string;
+        None or an empty value fetches every known primer set.
+
+    Exits with status 1 if a name is unknown or a download fails.
+    """
+    if not primers:
+        primers = sorted(PRIMER_SETS)
+    elif hasattr(primers, 'split'):
+        # tolerate a raw comma separated string as well as a list of names
+        primers = primers.split(',')
+    unknown = [name for name in primers if name not in PRIMER_SETS]
+    if unknown:
+        print('Error: unknown primer set(s):', ', '.join(unknown), file=sys.stderr)
+        sys.exit(1)
+
+    if not os.path.isdir(output_directory):
+        os.makedirs(output_directory)
+    with open(output_filename) as json_input:
+        data_manager_dict = json.load(json_input)
+    data_tables = data_manager_dict.setdefault('data_tables', {})
+    table_entries = data_tables.setdefault(DATA_TABLE_NAME, [])
+
+    for name in primers:
+        url = PRIMER_SETS[name]
+        response = requests.get(url)
+        if response.status_code != 200:
+            print('Error: download of', url, 'failed with code', response.status_code, file=sys.stderr)
+            # an HTTP status code does not fit in a process exit status, so report a plain failure
+            sys.exit(1)
+        bed_output_filename = os.path.join(output_directory, name + '.bed')
+        with open(bed_output_filename, 'w') as bed_output:
+            bed_output.write(response.text)
+        # e.g. 'ARTICv1' -> 'ARTIC v1 primer set'
+        description = name[:-2] + ' ' + name[-2:] + ' primer set'
+        table_entries.append(dict(value=name, path=bed_output_filename, description=description))
+    print(data_manager_dict)
+    with open(output_filename, 'w') as json_output:
+        json.dump(data_manager_dict, json_output)
+
+
+class SplitArgs(argparse.Action):
+    """argparse action that stores a comma separated option value as a list."""
+
+    def __call__(self, parser, namespace, values, option_string=None):
+        setattr(namespace, self.dest, values.split(','))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Fetch ARTIC primer files for Galaxy use')
+    parser.add_argument('--output_directory', default='tmp', help='Directory to write output to')
+    parser.add_argument('--galaxy_datamanager_filename', required=True,
+                        help='Galaxy JSON format file describing data manager inputs')
+    # argparse does not run the action on a default, so the default must already be a list
+    parser.add_argument('--primers', default=sorted(PRIMER_SETS), action=SplitArgs,
+                        help='Comma separated list of primers to fetch')
+    args = parser.parse_args()
+    fetch_artic_primers(args.galaxy_datamanager_filename, args.output_directory, args.primers)
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/fetch_artic_primers.xml	Thu Apr 16 10:19:57 2020 +0000
@@ -0,0 +1,33 @@
+<tool id="fetch_artic_primers" name="ARTIC primer data manager" version="0.0.1" tool_type="manage_data" profile="19.05">
+    <requirements>
+        <requirement type="package">python</requirement>
+        <requirement type="package" version="2.22.0">requests</requirement>
+    </requirements>
+    <!-- fetch the selected primer sets, writing the BED files to the output's extra files directory -->
+    <command detect_errors="exit_code">
+    python '$__tool_directory__/fetch_artic_primers.py'
+        --galaxy_datamanager_filename '${output_file}'
+        --output_directory '${output_file.extra_files_path}'
+        --primers '${primers}'
+    </command>
+    <inputs>
+        <param name="primers" type="select" multiple="true" optional="false" label="SARS-CoV-2 Primers to fetch">
+            <option value="ARTICv1" selected="true">ARTIC v1</option>
+            <option value="ARTICv2" selected="true">ARTIC v2</option>
+            <option value="ARTICv3" selected="true">ARTIC v3</option>
+        </param>
+    </inputs>
+    <outputs>
+        <data name="output_file" format="data_manager_json"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="primers" value="ARTICv1,ARTICv2,ARTICv3"/>
+            <output name="output_file">
+                <assert_contents>
+                    <has_text text="ARTIC"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+</tool>
\ No newline at end of file
--- a/data_manager/fetch_refseq.py	Fri Sep 28 23:46:24 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,191 +0,0 @@
-#!/usr/bin/env python
-
-from __future__ import division, print_function
-
-import argparse
-import functools
-import gzip
-import json
-import os
-import os.path
-import sys
-from datetime import date
-from multiprocessing import Process, Queue
-
-import requests
-
-try:
-    from io import StringIO
-except ImportError:
-    from StringIO import StringIO
-# Refseq structure
-# - Release number
-# - Divisions
-#   1. archea
-#   2. bacteria
-#   3. fungi
-#   4. invertebrate
-#   5. mitochondrion
-#   6. other
-#   7. plant
-#   8. plasmid
-#   9. plastid
-#  10. protozoa
-#  11. vertebrate mammalian
-#  12. vertebrate other
-#  13. viral
-# within each division
-# DIVNAME.\d+(.\d+)?.(genomic|protein|rna).(fna|gbff|faa|gpff).gz
-#  where fna and faa are FASTA, gbff and gpff are Genbank
-
-
-def _add_data_table_entry(data_manager_dict, data_table_entry, data_table_name):
-    data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
-    data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get('all_fasta', [])
-    data_manager_dict['data_tables'][data_table_name].append(data_table_entry)
-    return data_manager_dict
-
-
-def unzip_to(conn, out_dir, output_filename, chunk_size=4096, debug=False, compress=False, make_len_file=False):
-    input_filename = conn.get()
-    if compress:
-        open_output = gzip.open
-    else:
-        open_output = open
-    if make_len_file:
-        fa_pos = output_filename.find('.fa')
-        if fa_pos == -1:
-            # this should not happen - filename does not contain '.fa'
-            len_filename = output_filename + '.len'
-        else:
-            len_filename = output_filename[:fa_pos] + '.len'
-        len_output = open(len_filename, 'wb')
-        record_len = 0
-        record_id = ''
-    with open_output(os.path.join(out_dir, output_filename), 'wb') as output_file:
-        while input_filename != 'STOP':
-            if debug:
-                print('Reading', input_filename, file=sys.stderr)
-            with gzip.open(input_filename, 'rb') as input_file:
-                read_chunk = functools.partial(input_file.read, (chunk_size))
-                for data in iter(read_chunk, b''):  # use b'' as a sentinel to stop the loop. note '' != b'' in Python 3
-                    if make_len_file:
-                        # break data into lines and parse as FASTA, perhaps continuing from partial previous record
-                        for line in data.split('\n'):
-                            if line.startswith('>'):
-                                if record_id != '':
-                                    len_output.write('{}\t{}\n'.format(record_id, record_len))
-                                # update record ID of record we are processing, set length to 0
-                                record_len = 0
-                                record_id = line[1:].split()[0]
-                            else:
-                                assert record_id != '', "FASTA data found before FASTA record ID known in {}, data: {}".format(input_filename, line)
-                                record_len += len(line.strip())
-                    output_file.write(data)
-                if make_len_file:
-                    # write last entry to .len file
-                    len_output.write('{}\t{}\n'.format(record_id, record_len))
-            os.unlink(input_filename)
-            input_filename = conn.get()
-    len_output.close()
-
-
-def get_refseq_division(division_name, mol_types, output_directory, debug=False, compress=False):
-    base_url = 'https://ftp.ncbi.nlm.nih.gov/refseq/release/'
-    valid_divisions = set(['archea', 'bacteria', 'complete', 'fungi', 'invertebrate', 'mitochondrion', 'other',
-                          'plant', 'plasmid', 'plastid', 'protozoa', 'vertebrate_mammalian', 'vertebrate_other', 'viral'])
-    ending_mappings = {
-        'genomic': '.genomic.fna.gz',
-        'protein': '.protein.faa.gz',
-        'rna': 'rna.fna.gz'
-    }
-    assert division_name in valid_divisions, "Unknown division name ({})".format(division_name)
-    for mol_type in mol_types:
-        assert mol_type in ending_mappings, "Unknown molecule type ({})".format(mol_type)
-    if not os.path.exists(output_directory):
-        os.mkdir(output_directory)
-    release_num_file = base_url + 'RELEASE_NUMBER'
-    r = requests.get(release_num_file)
-    release_num = str(int(r.text.strip()))
-    division_base_url = base_url + division_name
-    if debug:
-        print('Retrieving {}'.format(division_base_url), file=sys.stderr)
-    r = requests.get(division_base_url)
-    listing_text = r.text
-
-    unzip_queues = {}
-    unzip_processes = []
-    final_output_filenames = []
-    for mol_type in mol_types:
-        q = unzip_queues[mol_type] = Queue()
-        output_filename = division_name + '.' + release_num + '.' + mol_type + '.fasta'
-        if compress:
-            output_filename += '.gz'
-        final_output_filenames.append(output_filename)
-        unzip_processes.append(Process(target=unzip_to, args=(q, output_directory, output_filename),
-                                       kwargs=dict(debug=debug, compress=compress)))
-        unzip_processes[-1].start()
-
-    # sample line: <a href="vertebrate_other.86.genomic.gbff.gz">vertebrate_other.86.genomic.gbff.gz</a>   2018-07-13 00:59   10M
-    for line in StringIO(listing_text):
-        if '.gz' not in line:
-            continue
-        parts = line.split('"')
-        assert len(parts) == 3, "Unexpected line format: {}".format(line.rstrip())
-        filename = parts[1]
-        for mol_type in mol_types:
-            ending = ending_mappings[mol_type]
-            if filename.endswith(ending):
-                if debug:
-                    print('Downloading:', filename, ending, mol_type, file=sys.stderr)
-                output_filename = os.path.join(output_directory, filename)
-                with open(output_filename, 'wb') as output_file:
-                    r = requests.get(division_base_url + '/' + filename)
-                    for chunk in r.iter_content(chunk_size=4096):
-                        output_file.write(chunk)
-                conn = unzip_queues[mol_type]
-                conn.put(output_filename)
-
-    for mol_type in mol_types:
-        conn = unzip_queues[mol_type]
-        conn.put('STOP')
-
-    return [release_num, final_output_filenames]
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Download RefSeq databases')
-    parser.add_argument('--debug', default=False, action='store_true', help='Print debugging output to stderr (verbose)')
-    parser.add_argument('--compress', default=False, action='store_true', help='Compress output files')
-    parser.add_argument('--output_directory', default='tmp', help='Directory to write output to')
-    parser.add_argument('--galaxy_datamanager_filename', help='Galaxy JSON format file describing data manager inputs')
-    parser.add_argument('--division_names', help='RefSeq divisions to download')
-    parser.add_argument('--mol_types', help='Molecule types (genomic, rna, protein) to fetch')
-    parser.add_argument('--pin_date', help='Force download date to this version string')
-    args = parser.parse_args()
-
-    division_names = args.division_names.split(',')
-    mol_types = args.mol_types.split(',')
-    if args.galaxy_datamanager_filename is not None:
-        dm_opts = json.loads(open(args.galaxy_datamanager_filename).read())
-        output_directory = dm_opts['output_data'][0]['extra_files_path']  # take the extra_files_path of the first output parameter
-        data_manager_dict = {}
-    else:
-        output_directory = args.output_directory
-    for division_name in division_names:
-        if args.pin_date is not None:
-            today_str = args.pin_date
-        else:
-            today_str = date.today().strftime('%Y-%m-%d')  # ISO 8601 date format
-        [release_num, fasta_files] = get_refseq_division(division_name, mol_types, output_directory, args.debug, args.compress)
-        if args.galaxy_datamanager_filename is not None:
-            for i, mol_type in enumerate(mol_types):
-                assert mol_type in fasta_files[i], "Filename does not contain expected mol_type ({}, {})".format(mol_type, fasta_files[i])
-                unique_key = 'refseq_' + division_name + '.' + release_num + '.' + mol_type  # note: this is now same as dbkey
-                dbkey = unique_key
-                desc = 'RefSeq ' + division_name + ' Release ' + release_num + ' ' + mol_type + ' (' + today_str + ')'
-                path = os.path.join(output_directory, fasta_files[i])
-                _add_data_table_entry(data_manager_dict=data_manager_dict,
-                                      data_table_entry=dict(value=unique_key, dbkey=dbkey, name=desc, path=path),
-                                      data_table_name='all_fasta')
-            open(args.galaxy_datamanager_filename, 'wb').write(json.dumps(data_manager_dict).encode())
--- a/data_manager/fetch_refseq.xml	Fri Sep 28 23:46:24 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,87 +0,0 @@
-<tool id="data_manager_fetch_refseq" name="RefSeq data manager" version="0.0.19" tool_type="manage_data">
-    <description>Fetch FASTA data from NCBI RefSeq and update all_fasta data table</description>
-    <requirements>
-        <requirement type="package" version="3">python</requirement>
-    </requirements>
-    <command detect_errors="aggressive"><![CDATA[
-    python3 $__tool_directory__/fetch_refseq.py
-      #if str( $advanced.advanced_selector ) == 'advanced':
-        '${advanced.compress}'
-      #end if
-      --galaxy_datamanager_filename '${output_file}'
-      --division_names ${division_names}
-      --mol_types ${mol_types}
-      #if str( $pin_date ) != 'NO':
-        --pin_date '${pin_date}'
-      #end if
-    ]]></command>
-    <inputs>
-        <param argument="division_names" type="select" label="RefSeq division" multiple="true">
-            <option value="archea">Archea</option>
-            <option value="bacteria">Bacteria</option>
-            <option value="complete">Complete</option>
-            <option value="fungi">Fungi</option>
-            <option value="invertebrate">Invertebrate</option>
-            <option value="mitochondrion">Mitochondrion</option>
-            <option value="other">Other</option>
-            <option value="plant">Plant</option>
-            <option value="plasmid">Plasmid</option>
-            <option value="plastid">Plastid</option>
-            <option value="protozoa">Protozoa</option>
-            <option value="vertebrate_mammalian">Mammalian Vertebrate</option>
-            <option value="vertebrate_other">Other Vertebrate</option>
-            <option value="viral">Viral</option>
-        </param>
-        <param argument="mol_types" type="select" multiple="true" label="Molecule type" help="Select at least one of genomic, protein or rna sequence">
-            <option value="protein">Protein</option>
-            <option value="genomic">Genomic (DNA)</option>
-            <option value="rna">RNA</option>
-        </param>
-        <conditional name="advanced">
-            <param name="advanced_selector" type="select" label="Advanced Options">
-                <option value="basic" selected="True">Basic</option>
-                <option value="advanced">Advanced</option>
-            </param>
-            <when value="basic">
-            </when>
-            <when value="advanced">
-                <param type="boolean" argument="--compress" truevalue="--compress" falsevalue="" label="Compress FASTA files"
-                    help="Compress downloaded FASTA files (with gzip). Limits compatibility with tools expecting uncompressed FASTA."/>
-            </when>
-        </conditional>
-        <param argument="--pin_date" type="hidden" value="NO" help="Used for testing"/>
-    </inputs>
-    <outputs>
-        <data name="output_file" format="data_manager_json"/>
-    </outputs>
-    <tests>
-        <test>
-            <param name="division_names" value="plastid"/>
-            <param name="mol_types" value="protein"/>
-            <param name="pin_date" value="2018-03-14"/>
-            <param name="advanced_selector" value="basic"/>
-            <output name="output_file">
-                <assert_contents>
-                    <has_text text="2018-03-14"/>
-                    <has_text text="refseq_plastid"/>
-                    <has_text text="/refseq_plastid."/>
-                </assert_contents>
-            </output>
-        </test>
-    </tests>
-    <help><![CDATA[
-This data manager fetches FASTA format collections of proteins, nucleotides (genomic DNA) and RNA
-from NCBI's RefSeq_ data collection.
-
-RefSeq is released every two months and consists of a number of divisions. Some sequences are shared
-between multiple divisions. This data manager allows the Galaxy administrator to select which
-divisions and which molecule types within each division to download. Once downloaded the
-files are made accessible by adding an entry into the *all_fasta* data table.
-
-.. _RefSeq: https://www.ncbi.nlm.nih.gov/refseq/
-    ]]>
-    </help>
-    <citations>
-        <citation type="doi">10.1093/nar/gkv1189</citation>
-    </citations>
-</tool>
\ No newline at end of file
--- a/data_manager_conf.xml	Fri Sep 28 23:46:24 2018 -0400
+++ b/data_manager_conf.xml	Thu Apr 16 10:19:57 2020 +0000
@@ -1,17 +1,16 @@
 <?xml version="1.0"?>
 <data_managers>
-    <data_manager tool_file="data_manager/fetch_refseq.xml" id="fetch_genome_fetch_refseq">
-        <data_table name="all_fasta">
+    <data_manager tool_file="data_manager/fetch_artic_primers.xml" id="fetch_artic_primers">
+        <data_table name="artic_primers">
             <output>
                 <column name="value" />
-                <column name="dbkey" />
-                <column name="name" />
+                <column name="description" />
                 <column name="path" output_ref="output_file" >
                     <move type="file">
                         <source>${path}</source>
-                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">refseq/#echo str($dbkey).split('.')[1]#/${value}.fasta</target>
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">artic_primers/${value}.bed</target>
                     </move>
-                    <value_translation>refseq/#echo str($dbkey).split('.')[1]#/${value}.fasta</value_translation>
+                    <value_translation>artic_primers/${value}.bed</value_translation>
                     <value_translation type="function">abspath</value_translation>
                 </column>
             </output>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/artic.json	Thu Apr 16 10:19:57 2020 +0000
@@ -0,0 +1,21 @@
+{
+    "data_tables": {
+        "artic_primers": [
+            {
+                "value": "ARTICv1",
+                "description": "ARTIC v1 primer set",
+                "path": "tmp/ARTICv1.bed"
+            },
+            {
+                "value": "ARTICv2",
+                "description": "ARTIC v2 primer set",
+                "path": "tmp/ARTICv2.bed"
+            },
+            {
+                "value": "ARTICv3",
+                "description": "ARTIC v3 primer set",
+                "path": "tmp/ARTICv3.bed"
+            }
+        ]
+    }
+}
--- a/test-data/plastid.json	Fri Sep 28 23:46:24 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1 +0,0 @@
-{"data_tables": {"all_fasta": [{"path": "tmp/plastid.89.protein.fasta.gz", "dbkey": "plastid.89.protein", "name": "RefSeq plastid Release 89 protein (2018-09-07)", "value": "plastid.89.protein.2018-03-14"}]}}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/artic_primers.loc.sample	Thu Apr 16 10:19:57 2020 +0000
@@ -0,0 +1,7 @@
+# this is a tab separated file describing the location of ARTIC primers for use in SARS-CoV-2 sequencing
+#
+# the columns are:
+# value  description path
+#
+# for example
+# ARTICv1   ARTIC v1 primer set    /data/galaxy/tool_data/artic_primers/ARTICv1.bed
\ No newline at end of file
--- a/tool_data_table_conf.xml.sample	Fri Sep 28 23:46:24 2018 -0400
+++ b/tool_data_table_conf.xml.sample	Thu Apr 16 10:19:57 2020 +0000
@@ -1,7 +1,7 @@
 <tables>
     <!-- Locations of all fasta files under genome directory -->
-    <table name="all_fasta" comment_char="#">
-        <columns>value, dbkey, name, path</columns>
-        <file path="tool-data/all_fasta.loc" />
+    <table name="artic_primers" comment_char="#">
+        <columns>value, description, path</columns>
+        <file path="tool-data/artic_primers.loc" />
     </table>
 </tables>
\ No newline at end of file