changeset 5:eaca3e270bf6 draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_ncbi_taxonomy/ commit 3d78021971b83d585c432830cee0488ada7bd257"
author iuc
date Fri, 19 Mar 2021 21:43:14 +0000
parents cef5c909ccb8
children
files data_manager/data_manager.py data_manager/ncbi_taxonomy_fetcher.xml data_manager_conf.xml test-data/ncbi_accession2taxid.loc test-data/ncbi_taxonomy.loc test-data/taxonomy.json test-data/taxonomy_with_accession2taxid.json tool-data/ncbi_accession2taxid.loc.sample tool-data/ncbi_taxonomy.loc.sample tool_data_table_conf.xml.sample tool_data_table_conf.xml.test
diffstat 11 files changed, 156 insertions(+), 18 deletions(-) [+]
line wrap: on
line diff
--- a/data_manager/data_manager.py	Tue Jul 07 00:05:59 2020 +0000
+++ b/data_manager/data_manager.py	Fri Mar 19 21:43:14 2021 +0000
@@ -5,12 +5,7 @@
 import shutil
 import tarfile
 import zipfile
-try:
-    # For Python 3.0 and later
-    from urllib.request import Request, urlopen
-except ImportError:
-    # Fall back to Python 2 imports
-    from urllib2 import Request, urlopen
+from urllib.request import Request, urlopen
 
 
 def url_download(url, workdir):
@@ -42,22 +37,89 @@
     os.remove(file_path)
 
 
+def download_name_maps(url, workdir, partial):
+    """Download NCBI accession2taxid map files into ``workdir``.
+
+    ``url`` is a prefix: each file name is appended to it verbatim, so it
+    should end with ``/``. With ``partial`` true only the PDB map is fetched
+    (for testing). ``workdir`` is created if missing; any download or write
+    error propagates to the caller unchanged.
+    """
+    if partial:
+        map_files = [
+            'pdb.accession2taxid.gz',
+        ]
+    else:
+        map_files = [
+            'dead_nucl.accession2taxid.gz',
+            'dead_prot.accession2taxid.gz',
+            'dead_wgs.accession2taxid.gz',
+            'nucl_gb.accession2taxid.gz',
+            'nucl_wgs.accession2taxid.gz',
+            'pdb.accession2taxid.gz',
+            'prot.accession2taxid.gz',
+            'prot.accession2taxid.FULL.gz'
+        ]
+
+    if not os.path.exists(workdir):
+        os.makedirs(workdir)
+
+    for map_file in map_files:
+        src_url = "{}{}".format(url, map_file)
+        dest = os.path.join(workdir, map_file)
+
+        print("Downloading taxonomy accession2taxid file from {} to {}".format(src_url, dest))
+
+        # Keep the URL string and the response in separate names, and let the
+        # context managers close both handles: a failed urlopen() then raises
+        # its own error instead of tripping over cleanup of a str.
+        req = Request(src_url)
+        with urlopen(req) as response:
+            with open(dest, 'wb') as dst:
+                shutil.copyfileobj(response, dst)
+
+
+def move_files_to_final_dir(workdir, target_directory, copy=False):
+    """Move every entry of ``workdir`` into ``target_directory`` (copy instead if ``copy``)."""
+    # The flag never changes mid-loop, so pick the shutil primitive once.
+    transfer = shutil.copy if copy else shutil.move
+    for entry in os.listdir(workdir):
+        transfer(os.path.join(workdir, entry), target_directory)
+
+
 def main(args):
-    workdir = os.path.join(os.getcwd(), 'taxonomy')
+    workdir = os.path.abspath(os.path.join(os.getcwd(), 'taxonomy'))
     url_download(args.url, workdir)
+
     data_manager_entry = {}
     data_manager_entry['value'] = args.name.lower()
     data_manager_entry['name'] = args.name
     data_manager_entry['path'] = '.'
     data_manager_json = dict(data_tables=dict(ncbi_taxonomy=data_manager_entry))
-    params = json.loads(open(args.output).read())
-    target_directory = params['output_data'][0]['extra_files_path']
-    os.mkdir(target_directory)
-    output_path = os.path.abspath(os.path.join(os.getcwd(), 'taxonomy'))
-    for filename in os.listdir(workdir):
-        shutil.move(os.path.join(output_path, filename), target_directory)
-    with open(args.output, 'w') as out:
-        out.write(json.dumps(data_manager_json, sort_keys=True))
+
+    with open(args.output) as fh:  # args.output is read for parameters here and overwritten with the result below
+        params = json.load(fh)
+
+    if args.name_maps:
+        workdir_a2t = os.path.join(os.getcwd(), 'accession2taxid')  # staging dir kept apart from the taxonomy one
+        download_name_maps("ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/", workdir_a2t, args.partial)
+
+        target_directory_a2t = os.path.join(params['output_data'][0]['extra_files_path'], "accession2taxid")
+        os.makedirs(target_directory_a2t)  # no exist_ok: raises if the directory already exists
+        move_files_to_final_dir(workdir_a2t, target_directory_a2t)
+
+        # Also copy taxonomy data to accession2taxid dir
+        move_files_to_final_dir(workdir, target_directory_a2t, copy=True)
+
+        data_manager_json['data_tables']['ncbi_accession2taxid'] = data_manager_entry  # same dict object as the ncbi_taxonomy entry
+
+    target_directory_tax = os.path.join(params['output_data'][0]['extra_files_path'], "taxonomy")
+    os.makedirs(target_directory_tax)
+
+    move_files_to_final_dir(workdir, target_directory_tax)  # must follow the copy above: moving empties workdir
+
+    with open(args.output, 'w') as fh:
+        json.dump(data_manager_json, fh, sort_keys=True)
 
 
 if __name__ == '__main__':
@@ -65,6 +127,8 @@
     parser.add_argument('--out', dest='output', action='store', help='JSON filename')
     parser.add_argument('--name', dest='name', action='store', default=str(datetime.date.today()), help='Data table entry unique ID')
     parser.add_argument('--url', dest='url', action='store', default='ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz', help='Download URL')
+    parser.add_argument('--name-maps', dest='name_maps', action='store_true', help='')
+    parser.add_argument('--partial', dest='partial', action='store_true', help='Only download a small subset of data (for testing)')
     args = parser.parse_args()
 
     main(args)
--- a/data_manager/ncbi_taxonomy_fetcher.xml	Tue Jul 07 00:05:59 2020 +0000
+++ b/data_manager/ncbi_taxonomy_fetcher.xml	Fri Mar 19 21:43:14 2021 +0000
@@ -1,6 +1,9 @@
 <?xml version="1.0"?>
-<tool id="ncbi_taxonomy_fetcher" name="NCBI" tool_type="manage_data" version="1.0.1">
+<tool id="ncbi_taxonomy_fetcher" name="NCBI" tool_type="manage_data" version="1.0.3">
     <description>taxonomy downloader</description>
+    <requirements>
+        <requirement type="package" version="3.7">python</requirement>
+    </requirements>
     <command detect_errors="exit_code">
     <![CDATA[
         python '$__tool_directory__/data_manager.py' --out '${out_file}'
@@ -10,13 +13,36 @@
         #if $database_name:
             --name '${database_name}'
         #end if
+        $name_maps
+        $partial_data
     ]]>
     </command>
     <inputs>
         <param name="database_name" type="text" optional="true" label="Name for this database" help="Enter a unique identifier, or leave blank for today's date" />
         <param name="taxonomy_url" type="text" value='ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz' label="Enter URL for taxonomy files" />
+        <param name="name_maps" type="boolean" truevalue="--name-maps" falsevalue="" label="Also download accession2taxid data" checked="false" />
+        <param name="partial_data" type="hidden" value="" help="Used for testing"/>
     </inputs>
     <outputs>
         <data name="out_file" format="data_manager_json" />
     </outputs>
+    <tests>
+        <test>
+            <param name="database_name" value="tax_name"/>
+            <param name="database_id" value="tax_id"/>
+            <output name="out_file" value="taxonomy.json"/>
+        </test>
+        <test>
+            <param name="database_name" value="tax_name"/>
+            <param name="database_id" value="tax_id"/>
+            <param name="name_maps" value="true"/>
+            <param name="partial_data" value="--partial"/>
+            <output name="out_file" value="taxonomy_with_accession2taxid.json"/>
+        </test>
+    </tests>
+    <help>
+        Download a taxonomy dump from a provided URL.
+        The default URL is the latest dump from NCBI taxonomy.
+        The optional accession2taxid data is downloaded from ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/ and can be used by the Diamond data_manager (it uses about 20 GB as of 2021).
+    </help>
 </tool>
--- a/data_manager_conf.xml	Tue Jul 07 00:05:59 2020 +0000
+++ b/data_manager_conf.xml	Fri Mar 19 21:43:14 2021 +0000
@@ -1,12 +1,13 @@
 <?xml version="1.0"?>
 <data_managers>
-    <data_manager tool_file="data_manager/ncbi_taxonomy_fetcher.xml" id="ncbi_taxonomy_fetcher" version="1.0.0">
+    <data_manager tool_file="data_manager/ncbi_taxonomy_fetcher.xml" id="ncbi_taxonomy_fetcher" version="1.0.3">
         <data_table name="ncbi_taxonomy">
             <output>
                 <column name="value" />
                 <column name="name" />
                 <column name="path" output_ref="out_file">
                     <move type="directory" relativize_symlinks="True">
+                        <source>${path}/taxonomy</source>
                         <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">ncbi_taxonomy/${value}</target>
                     </move>
                     <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/ncbi_taxonomy/${value}</value_translation>
@@ -14,5 +15,19 @@
                 </column>
             </output>
         </data_table>
+        <data_table name="ncbi_accession2taxid">
+            <output>
+                <column name="value" />
+                <column name="name" />
+                <column name="path" output_ref="out_file">
+                    <move type="directory" relativize_symlinks="True">
+                        <source>${path}/accession2taxid</source>
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">ncbi_accession2taxid/${value}</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/ncbi_accession2taxid/${value}</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+            </output>
+        </data_table>
     </data_manager>
 </data_managers>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ncbi_accession2taxid.loc	Fri Mar 19 21:43:14 2021 +0000
@@ -0,0 +1,5 @@
+# Tab separated fields where
+# value is unique key
+# name is descriptive name
+# path is path to directory containing accession2taxid files
+#value	name	path
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ncbi_taxonomy.loc	Fri Mar 19 21:43:14 2021 +0000
@@ -0,0 +1,5 @@
+# Tab separated fields where
+# value is unique key
+# name is descriptive name
+# path is path to directory containing names.dmp and nodes.dmp files
+#value	name	path
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/taxonomy.json	Fri Mar 19 21:43:14 2021 +0000
@@ -0,0 +1,1 @@
+{"data_tables": {"ncbi_taxonomy": {"name": "tax_name", "path": ".", "value": "tax_name"}}}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/taxonomy_with_accession2taxid.json	Fri Mar 19 21:43:14 2021 +0000
@@ -0,0 +1,1 @@
+{"data_tables": {"ncbi_accession2taxid": {"name": "tax_name", "path": ".", "value": "tax_name"}, "ncbi_taxonomy": {"name": "tax_name", "path": ".", "value": "tax_name"}}}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/ncbi_accession2taxid.loc.sample	Fri Mar 19 21:43:14 2021 +0000
@@ -0,0 +1,5 @@
+# Tab separated fields where
+# value is unique key
+# name is descriptive name
+# path is path to directory containing accession2taxid files
+#value	name	path
--- a/tool-data/ncbi_taxonomy.loc.sample	Tue Jul 07 00:05:59 2020 +0000
+++ b/tool-data/ncbi_taxonomy.loc.sample	Fri Mar 19 21:43:14 2021 +0000
@@ -2,4 +2,4 @@
 # value is unique key
 # name is descriptive name
 # path is path to directory containing names.dmp and nodes.dmp files
-#value	name		path
+#value	name	path
--- a/tool_data_table_conf.xml.sample	Tue Jul 07 00:05:59 2020 +0000
+++ b/tool_data_table_conf.xml.sample	Fri Mar 19 21:43:14 2021 +0000
@@ -5,4 +5,8 @@
         <columns>value, name, path</columns>
         <file path="tool-data/ncbi_taxonomy.loc" />
     </table>
+    <table name="ncbi_accession2taxid" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="tool-data/ncbi_accession2taxid.loc" />
+    </table>
 </tables>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.test	Fri Mar 19 21:43:14 2021 +0000
@@ -0,0 +1,12 @@
+<?xml version="1.0"?>
+<tables>
+    <!-- Locations of taxonomy data downloaded from NCBI -->
+    <table name="ncbi_taxonomy" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="${__HERE__}/test-data/ncbi_taxonomy.loc" />
+    </table>
+    <table name="ncbi_accession2taxid" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="${__HERE__}/test-data/ncbi_accession2taxid.loc" />
+    </table>
+</tables>