changeset 0:a50614a513f3 draft

Uploaded
author estrain
date Thu, 04 Jul 2019 21:42:31 -0400
parents
children 9de84cd78a82
files data_manager/fetch_ganon.py data_manager/ganon_data_manager.xml data_manager_conf.xml tool-data/ganon_databases.loc.sample tool_data_table_conf.xml.sample
diffstat 5 files changed, 132 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/fetch_ganon.py	Thu Jul 04 21:42:31 2019 -0400
@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+
+import argparse
+import json
+import os
+import os.path
+import sys
+import ftplib
+import socket
+
+def get_refseq_rrna(rrna):
+
+    host = 'ftp.ncbi.nlm.nih.gov'
+    folder_path = 'refseq/TargetedLoci/Bacteria/'
+    file_name="bacteria."+rrna+"rRNA.fna.gz"
+
+    try:
+      f = ftplib.FTP(host)
+    except (socket.error, socket.gaierror), e:
+      print 'ERROR: cannot reach "%s"' % host 
+      return
+    print '*** Connected to host "%s"' % host 
+
+    try:
+      f.login()
+    except ftplib.error_perm:
+      print 'ERROR: cannot login anonymously'
+      f.quit()
+      return
+    print '*** Logged in as "anonymous"'
+
+    try:
+      f.cwd(folder_path)
+    except ftplib.error_perm:
+      print 'ERROR: cannot CD to "%s"' % folder_path 
+      f.quit()
+      return
+    print '*** Changed to "%s" folder' % folder_path 
+
+    try:
+      f.retrbinary('RETR %s' % file_name,
+          open(file_name, 'wb').write)
+    except ftplib.error_perm:
+      print 'ERROR: cannot read file "%s"' % file_name 
+      os.unlink(file_name)
+    else:
+      print '*** Downloaded "%s" to CWD' % file_name 
+    f.quit()
+
+    return[file_name]
+
+def _add_data_table_entry(data_manager_dict, data_table_entry, data_table_name):
+    data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
+    data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get('ganon_databases', [])
+    data_manager_dict['data_tables'][data_table_name].append(data_table_entry)
+    return data_manager_dict
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Download RefSeq rRNA bacterial databases')
+    parser.add_argument('--output_directory', default='/tool-data/ganon', help='Directory to write output to')
+    parser.add_argument('--rrna', help='rRNA sequences to download (5S, 16S, or 23S)')
+    args = parser.parse_args()
+
+    output_directory = args.output_directory
+    if not os.path.exists(output_directory):
+        os.mkdir(output_directory)
+
+    outfile=get_refseq_rrna(args.rrna)
+
+
+    data_manager_dict = {}
+    _add_data_table_entry(data_manager_dict=data_manager_dict,
+      data_table_entry=dict(value=args.rrna, dbkey=args.rrna, name=args.rrna, path=args.output_directory),
+      data_table_name='ganon_databases')
+    open("output_file", 'w').write(json.dumps(data_manager_dict, sort_keys=True))
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/ganon_data_manager.xml	Thu Jul 04 21:42:31 2019 -0400
@@ -0,0 +1,29 @@
+<tool id="ganon_data_manager" name="ganon data manager" version="0.0.1" tool_type="manage_data">
+    <description>Fetch rRNA data from NCBI RefSeq and create ganon databases</description>
+    <requirements>
+        <requirement type="package">python</requirement>
+        <requirement type="package">ganon</requirement>
+    </requirements>
+    <!--
+        The three steps are chained with "&&" so that a failed download or a
+        failed build stops the job and its exit code reaches detect_errors.
+        Parameter values are single-quoted before they reach the shell.
+    -->
+    <command detect_errors="exit_code"><![CDATA[
+       python '$__tool_directory__/fetch_ganon.py' --rrna '$rrna' --output_directory '$out_dir' &&
+       ganon build -d '$rrna' -i *.gz &&
+       mv '$rrna'.* '$out_dir'
+    ]]></command>
+    <inputs>
+        <!--
+            Single selection only: fetch_ganon.py builds exactly one remote
+            file name from the value, so a comma-joined multi-selection
+            cannot be downloaded.
+        -->
+        <param argument="rrna" type="select" label="Bacterial rRNA loci" multiple="false">
+          <option value="5S">5S</option>
+          <option value="16S">16S</option>
+          <option value="23S">23S</option>
+        </param>
+        <param argument="out_dir" type="text" label="Output folder" value="/tool-data/ganon"/>
+    </inputs>
+    <outputs>
+        <data name="output_file" format="data_manager_json"/>
+    </outputs>
+    <help><![CDATA[
+    Retrieve RefSeq Bacterial rRNA Targeted Loci
+    ]]>
+    </help>
+    <citations>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_conf.xml	Thu Jul 04 21:42:31 2019 -0400
@@ -0,0 +1,13 @@
+<?xml version="1.0"?>
+<!--
+    Registers the data manager tool defined in data_manager/ganon_data_manager.xml
+    and lists the columns it outputs for the ganon_databases data table.
+-->
+<data_managers>
+    <data_manager tool_file="data_manager/ganon_data_manager.xml" id="ganon_data_manager" version="0.0.1">
+        <data_table name="ganon_databases">
+            <output>
+                <column name="value" />
+                <column name="name" />
+                <column name="dbkey" />
+                <column name="path" />
+            </output>
+        </data_table>
+    </data_manager>
+</data_managers>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/ganon_databases.loc.sample	Thu Jul 04 21:42:31 2019 -0400
@@ -0,0 +1,8 @@
+# Tab separated with four columns:
+# - value (Galaxy records this in the Galaxy DB)
+# - name (Galaxy shows this in the UI)
+# - dbkey (database key)
+# - path (Name of the ganon DB)
+#
+#e.g.
+#ganon5S<tab>5S rRNA<tab>dbkey<tab>/path/to/ganonDB/5S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Thu Jul 04 21:42:31 2019 -0400
@@ -0,0 +1,6 @@
+<tables>
+    <!--
+        ganon_databases table: columns value, name, dbkey and path, loaded from
+        tool-data/ganon_databases.loc; "#" is the comment character for that file.
+    -->
+    <table name="ganon_databases" comment_char="#">
+        <columns>value, name, dbkey, path</columns>
+        <file path="tool-data/ganon_databases.loc" />
+    </table>
+ </tables>