Mercurial > repos > sanbi-uwc > data_manager_fetch_refseq

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/fetch_refseq.py	Fri Sep 07 17:13:20 2018 -0400
@@ -0,0 +1,156 @@
+#!/usr/bin/env python
+
+from __future__ import print_function, division
+import argparse
+from datetime import date
+import gzip
+import json
+from multiprocessing import Process, Queue
+import os
+import os.path
+import re
+import requests
+import sys
+try:
+    from io import StringIO
+except ImportError:
+    from StringIO import StringIO
+# Refseq structure
+# - Release number
+# - Divisions
+#   1. archaea
+#   2. bacteria
+#   3. fungi
+#   4. invertebrate
+#   5. mitochondrion
+#   6. other
+#   7. plant
+#   8. plasmid
+#   9. plastid
+#  10. protozoa
+#  11. vertebrate mammalian
+#  12. vertebrate other
+#  13. viral
+# within each division
+# DIVNAME.\d+(.\d+)?.(genomic|protein|rna).(fna|gbff|faa|gpff).gz
+#  where fna and faa are FASTA, gbff and gpff are Genbank
+
+def _add_data_table_entry(data_manager_dict, data_table_entry, data_table_name):
+    """Append data_table_entry to the data table data_table_name (created on demand); return the dict."""
+    # Look the table up under its own name; seeding it from 'all_fasta' would alias the two lists.
+    data_manager_dict.setdefault('data_tables', {}).setdefault(data_table_name, []).append(data_table_entry)
+    return data_manager_dict
+
+def unzip_to(conn, out_dir, output_filename, chunk_size=4096, debug=False, compress=False):
+    """Concatenate the decompressed contents of queued gzip files into one output file.
+
+    conn is a queue-like object whose get() yields input paths until the sentinel 'STOP'.
+    The result is written to out_dir/output_filename, gzip-compressed when compress is true.
+    Data is copied chunk_size bytes at a time; debug logs each input path to stderr.
+    """
+    open_output = gzip.open if compress else open
+    with open_output(os.path.join(out_dir, output_filename), 'wb') as output_file:
+        for input_filename in iter(conn.get, 'STOP'):
+            if debug:
+                print('Reading', input_filename, file=sys.stderr)
+            with gzip.open(input_filename) as input_file:
+                # read() returns bytes, so the end-of-file sentinel must be b'' ('' never matches on Python 3).
+                for data in iter(lambda: input_file.read(chunk_size), b''):
+                    output_file.write(data)
+            # The consumed input file is not removed here.
+
+def get_refseq_division(division_name, mol_types, output_directory, debug=False, compress=False):
+    """Download one RefSeq division and merge its FASTA pieces into one file per molecule type.
+
+    mol_types lists 'genomic', 'protein' and/or 'rna'. Pieces are saved in output_directory (created
+    if missing) and merged by one unzip_to process per molecule type, gzipped when compress is set.
+    Returns [release_num, filenames]; filenames follow mol_types order, relative to output_directory.
+    Raises AssertionError for unknown names and requests.HTTPError when a request fails.
+    """
+    base_url = 'https://ftp.ncbi.nlm.nih.gov/refseq/release/'
+    valid_divisions = set(['archaea', 'archea', 'bacteria', 'complete', 'fungi', 'invertebrate', 'mitochondrion',
+                           'other', 'plant', 'plasmid', 'plastid', 'protozoa', 'vertebrate_mammalian', 'vertebrate_other', 'viral'])
+    ending_mappings = {'genomic': '.genomic.fna.gz', 'protein': '.protein.faa.gz', 'rna': '.rna.fna.gz'}
+    assert division_name in valid_divisions, "Unknown division name ({})".format(division_name)
+    for mol_type in mol_types:
+        assert mol_type in ending_mappings, "Unknown molecule type ({})".format(mol_type)
+    if not os.path.exists(output_directory):
+        os.makedirs(output_directory)
+    r = requests.get(base_url + 'RELEASE_NUMBER')
+    r.raise_for_status()
+    release_num = str(int(r.text.strip()))
+    # The server directory is spelled 'archaea'; the legacy spelling 'archea' is mapped onto it.
+    division_base_url = base_url + ('archaea' if division_name == 'archea' else division_name)
+    if debug:
+        print('Retrieving {}'.format(division_base_url), file=sys.stderr)
+    r = requests.get(division_base_url)
+    r.raise_for_status()
+    unzip_queues = {}
+    unzip_processes = []
+    final_output_filenames = []
+    for mol_type in mol_types:
+        q = unzip_queues[mol_type] = Queue()
+        output_filename = '.'.join([division_name, release_num, mol_type, 'fasta.gz' if compress else 'fasta'])
+        final_output_filenames.append(output_filename)
+        unzip_processes.append(Process(target=unzip_to, args=(q, output_directory, output_filename),
+                                       kwargs=dict(debug=debug, compress=compress)))
+        unzip_processes[-1].start()
+    try:
+        # sample line: <a href="vertebrate_other.86.genomic.gbff.gz">vertebrate_other.86.genomic.gbff.gz</a>   2018-07-13 00:59   10M
+        for line in StringIO(r.text):
+            if '.gz' not in line:
+                continue
+            parts = line.split('"')
+            assert len(parts) == 3, "Unexpected line format: {}".format(line.rstrip())
+            filename = parts[1]
+            for mol_type in mol_types:
+                if filename.endswith(ending_mappings[mol_type]):
+                    if debug:
+                        print('Downloading:', filename, ending_mappings[mol_type], mol_type, file=sys.stderr)
+                    output_filename = os.path.join(output_directory, filename)
+                    download = requests.get(division_base_url + '/' + filename, stream=True)
+                    download.raise_for_status()
+                    with open(output_filename, 'wb') as output_file:
+                        for chunk in download.iter_content(chunk_size=4096):
+                            output_file.write(chunk)
+                    unzip_queues[mol_type].put(output_filename)
+    finally:
+        # Always send the sentinel, then wait, so the merged files are complete when we return.
+        for mol_type, process in zip(mol_types, unzip_processes):
+            unzip_queues[mol_type].put('STOP')
+            process.join()
+    return [release_num, final_output_filenames]
+
+if __name__ == '__main__':
+    # Entry point: fetch each requested division and, for Galaxy, record the files in the JSON file.
+    parser = argparse.ArgumentParser(description='Download RefSeq databases')
+    parser.add_argument('--debug', default=False, action='store_true', help='Print debugging output to stderr (verbose)')
+    parser.add_argument('--compress', default=False, action='store_true', help='Compress output files')
+    parser.add_argument('--output_directory', default='tmp', help='Directory to write output to')
+    parser.add_argument('--galaxy_datamanager_filename', help='Galaxy JSON format file describing data manager inputs')
+    parser.add_argument('--division_names', nargs='+', required=True, help='RefSeq divisions to download')
+    parser.add_argument('--mol_types', nargs='+', required=True, help='Molecule types (genomic, rna, protein) to fetch')
+    parser.add_argument('--pin_date', help='Force download date to this version string')
+    args = parser.parse_args()
+    # Accept comma-separated values too (the form a multi-valued Galaxy select is rendered in).
+    division_names = [name for arg in args.division_names for name in arg.split(',')]
+    mol_types = [name for arg in args.mol_types for name in arg.split(',')]
+    # The date only labels the entries; fix it once so every entry of a run carries the same value.
+    today_str = args.pin_date if args.pin_date is not None else date.today().strftime('%Y-%m-%d')  # ISO 8601 date format
+    output_directory = args.output_directory
+    data_manager_dict = {}
+    if args.galaxy_datamanager_filename is not None:
+        with open(args.galaxy_datamanager_filename) as dm_file:
+            # take the extra_files_path of the first output parameter
+            output_directory = json.load(dm_file)['output_data'][0]['extra_files_path']
+    for division_name in division_names:
+        [release_num, fasta_files] = get_refseq_division(division_name, mol_types, output_directory, args.debug, args.compress)
+        for mol_type, fasta_file in zip(mol_types, fasta_files):
+            assert mol_type in fasta_file, "Filename does not contain expected mol_type ({}, {})".format(mol_type, fasta_file)
+            dbkey = division_name + '.' + release_num + '.' + mol_type
+            desc = 'RefSeq ' + division_name + ' Release ' + release_num + ' ' + mol_type + ' (' + today_str + ')'
+            entry = dict(value=dbkey + '.' + today_str, dbkey=dbkey, name=desc, path=os.path.join(output_directory, fasta_file))
+            _add_data_table_entry(data_manager_dict=data_manager_dict, data_table_entry=entry, data_table_name='all_fasta')
+    if args.galaxy_datamanager_filename is not None:
+        with open(args.galaxy_datamanager_filename, 'wb') as dm_file:
+            dm_file.write(json.dumps(data_manager_dict).encode())
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/fetch_refseq.xml	Fri Sep 07 17:13:20 2018 -0400
@@ -0,0 +1,85 @@
+<tool id="data_manager_fetch_refseq" name="RefSeq data manager" version="0.0.1" tool_type="manage_data">
+    <description>Fetch FASTA data from NCBI RefSeq and update all_fasta data table</description>
+    <requirements>
+        <requirement type="package" version="3">python</requirement>
+    </requirements>
+    <command detect_errors="aggressive"><![CDATA[
+    python3 $__tool_directory__/fetch_refseq.py
+      #if str( $advanced.advanced_selector ) == 'advanced':
+        ${advanced.compress}
+      #end if
+      --galaxy_datamanager_filename '${output_file}'
+      --division_names ${division_names}
+      --mol_types ${mol_types}
+      #if str( $pin_date ) != 'NO':
+        --pin_date '${pin_date}'
+      #end if
+    ]]></command>
+    <inputs>
+        <param argument="division_names" type="select" label="RefSeq division" multiple="true">
+            <option value="archea">Archaea</option>
+            <option value="bacteria">Bacteria</option>
+            <option value="complete">Complete</option>
+            <option value="fungi">Fungi</option>
+            <option value="invertebrate">Invertebrate</option>
+            <option value="mitochondrion">Mitochondrion</option>
+            <option value="other">Other</option>
+            <option value="plant">Plant</option>
+            <option value="plasmid">Plasmid</option>
+            <option value="plastid">Plastid</option>
+            <option value="protozoa">Protozoa</option>
+            <option value="vertebrate_mammalian">Mammalian Vertebrate</option>
+            <option value="vertebrate_other">Other Vertebrate</option>
+            <option value="viral">Viral</option>
+        </param>
+        <param argument="mol_types" type="select" label="Molecule type" help="Select at least one of genomic, protein or rna sequence">
+            <option value="protein">Protein</option>
+            <option value="genomic">Genomic (DNA)</option>
+            <option value="rna">RNA</option>
+        </param>
+        <conditional name="advanced">
+            <param name="advanced_selector" type="select" label="Advanced Options">
+                <option value="basic" selected="True">Basic</option>
+                <option value="advanced">Advanced</option>
+            </param>
+            <when value="basic">
+            </when>
+            <when value="advanced">
+                <param type="boolean" argument="--compress" truevalue="--compress" falsevalue="" label="Compress FASTA files"
+                    help="Compress downloaded FASTA files (with gzip). Limits compatibility with tools expecting uncompressed FASTA."/>
+            </when>
+        </conditional>
+        <param argument="--pin_date" type="hidden" value="NO" help="Used for testing"/>
+    </inputs>
+    <outputs>
+        <data name="output_file" format="data_manager_json"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="division_names" value="plastid"/>
+            <param name="mol_types" value="protein"/>
+            <param name="pin_date" value="2018-03-14"/>
+            <param name="advanced_selector" value="basic"/>
+            <output name="output_file">
+                <assert_contents>
+                    <has_text text="2018-03-14"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+This data manager fetches FASTA format collections of proteins, nucleotides (genomic DNA) and RNA
+from NCBI's RefSeq_ data collection.
+
+RefSeq is released every two months and consists of a number of divisions. Some sequences are shared
+between multiple divisions. This data manager allows the Galaxy administrator to select which
+divisions and which molecule types within each division to download. Once downloaded the
+files are made accessible by adding an entry into the *all_fasta* data table.
+
+.. _RefSeq: https://www.ncbi.nlm.nih.gov/refseq/
+    ]]>
+    </help>
+    <citations>
+        <citation type="doi">10.1093/nar/gkv1189</citation>
+    </citations>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_conf.xml	Fri Sep 07 17:13:20 2018 -0400
@@ -0,0 +1,20 @@
+<?xml version="1.0"?>
+<data_managers>
+    <data_manager tool_file="data_manager/fetch_refseq.xml" id="fetch_genome_fetch_refseq">
+        <data_table name="all_fasta">
+            <output>
+                <column name="value" />
+                <column name="dbkey" />
+                <column name="name" />
+                <column name="path" output_ref="output_file" >
+                    <move type="file">
+                        <source>${path}</source>
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">${dbkey}/seq/${path}</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/${dbkey}/seq/${path}</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+            </output>
+        </data_table>
+    </data_manager>
+</data_managers>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/plastid.json	Fri Sep 07 17:13:20 2018 -0400
@@ -0,0 +1,1 @@
+{"data_tables": {"all_fasta": [{"path": "tmp/plastid.89.protein.fasta.gz", "dbkey": "plastid.89.protein", "name": "RefSeq plastid Release 89 protein (2018-09-07)", "value": "plastid.89.protein.2018-03-14"}]}}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/all_fasta.loc.sample	Fri Sep 07 17:13:20 2018 -0400
@@ -0,0 +1,18 @@
+#This file lists the locations and dbkeys of all the fasta files
+#under the "genome" directory (a directory that contains a directory
+#for each build). The script extract_fasta.py will generate the file
+#all_fasta.loc. This file has the format (white space characters are
+#TAB characters):
+#
+#<unique_build_id>	<dbkey>		<display_name>	<file_path>
+#
+#So, all_fasta.loc could look something like this:
+#
+#apiMel3	apiMel3	Honeybee (Apis mellifera): apiMel3		/path/to/genome/apiMel3/apiMel3.fa
+#hg19canon	hg19		Human (Homo sapiens): hg19 Canonical		/path/to/genome/hg19/hg19canon.fa
+#hg19full	hg19		Human (Homo sapiens): hg19 Full			/path/to/genome/hg19/hg19full.fa
+#
+#Your all_fasta.loc file should contain an entry for each individual
+#fasta file. So there will be multiple fasta files for each build,
+#such as with hg19 above.
+#
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Fri Sep 07 17:13:20 2018 -0400
@@ -0,0 +1,7 @@
+<tables>
+    <!-- Locations of all fasta files under genome directory -->
+    <table name="all_fasta" comment_char="#">
+        <columns>value, dbkey, name, path</columns>
+        <file path="tool-data/all_fasta.loc" />
+    </table>
+</tables>
\ No newline at end of file