view data_manager/fetch_ganon.py @ 1:9de84cd78a82 draft

Uploaded
author estrain
date Fri, 05 Jul 2019 07:56:23 -0400
parents a50614a513f3
children c220590bde7a
line wrap: on
line source

#!/usr/bin/env python

import argparse
import json
import os
import os.path
import sys
import ftplib
import socket

def get_refseq_rrna(rrna):
    """Download a RefSeq bacterial rRNA FASTA archive from the NCBI FTP site.

    Connects anonymously to ``ftp.ncbi.nlm.nih.gov``, changes into
    ``refseq/TargetedLoci/Bacteria/`` and retrieves
    ``bacteria.<rrna>rRNA.fna.gz`` into the current working directory.
    Progress and error messages are printed to standard output.

    Args:
        rrna: Label spliced into the remote file name (the command-line
            help of this script lists 5S, 16S and 23S).

    Returns:
        A one-element list holding the downloaded file name, or ``None``
        when connecting, logging in, changing directory or retrieving the
        file fails.
    """
    host = 'ftp.ncbi.nlm.nih.gov'
    folder_path = 'refseq/TargetedLoci/Bacteria/'
    file_name = "bacteria." + rrna + "rRNA.fna.gz"

    try:
        ftp = ftplib.FTP(host)
    except (socket.error, socket.gaierror):
        print('ERROR: cannot reach "%s"' % host)
        return None
    print('*** Connected to host "%s"' % host)

    try:
        ftp.login()
    except ftplib.error_perm:
        print('ERROR: cannot login anonymously')
        ftp.quit()
        return None
    print('*** Logged in as "anonymous"')

    try:
        ftp.cwd(folder_path)
    except ftplib.error_perm:
        print('ERROR: cannot CD to "%s"' % folder_path)
        ftp.quit()
        return None
    print('*** Changed to "%s" folder' % folder_path)

    try:
        # The context manager closes the local file whether or not the
        # transfer succeeds, so the handle is never leaked.
        with open(file_name, 'wb') as local_file:
            ftp.retrbinary('RETR %s' % file_name, local_file.write)
    except ftplib.error_perm:
        print('ERROR: cannot read file "%s"' % file_name)
        # Remove the empty or partial file left behind by the failed transfer.
        os.unlink(file_name)
        downloaded = None
    else:
        print('*** Downloaded "%s" to CWD' % file_name)
        downloaded = [file_name]
    ftp.quit()

    # Do not report a file name that was just deleted: failure yields None,
    # matching the earlier error paths.
    return downloaded

def _add_data_table_entry(data_manager_dict, data_table_entry, data_table_name):
    data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
    data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get('ganon_databases', [])
    data_manager_dict['data_tables'][data_table_name].append(data_table_entry)
    return data_manager_dict

if __name__ == '__main__':
    # Command-line entry point: download the requested rRNA archive and
    # write a JSON description of the new 'ganon_databases' entry to the
    # file "output_file" in the current working directory.
    parser = argparse.ArgumentParser(description='Download RefSeq rRNA bacterial databases')
    parser.add_argument('--output_directory', default='/tool-data/ganon', help='Directory to write output to')
    # Required: without a value the remote file name cannot be built.
    parser.add_argument('--rrna', required=True, help='rRNA sequences to download (5S, 16S, or 23S)')
    args = parser.parse_args()

    output_directory = args.output_directory
    if not os.path.exists(output_directory):
        # makedirs also creates missing parents (e.g. /tool-data for the default).
        os.makedirs(output_directory)

    # NOTE(review): the archive is downloaded into the current working
    # directory, while the table entry below points at output_directory;
    # confirm whether the file is expected to be moved there.
    outfile = get_refseq_rrna(args.rrna)
    if outfile is None:
        # Do not register a database whose download failed.
        sys.exit('ERROR: download of "%s" rRNA sequences failed' % args.rrna)

    data_manager_dict = {}
    _add_data_table_entry(
        data_manager_dict=data_manager_dict,
        data_table_entry=dict(value=args.rrna, dbkey=args.rrna, name=args.rrna, path=args.output_directory),
        data_table_name='ganon_databases')
    # Close the handle explicitly so the JSON is flushed to disk.
    with open("output_file", 'w') as json_out:
        json_out.write(json.dumps(data_manager_dict, sort_keys=True))