comparison data_manager/fetch_mlst_data.py @ 0:c01ac945b067 draft

Uploaded
author ulfschaefer
date Mon, 11 Jul 2016 05:23:33 -0400
parents
children 577ff220eaea
comparison
equal deleted inserted replaced
-1:000000000000 0:c01ac945b067
1 #!/usr/bin/env python
2
3 '''
4 Download MLST datasets from this site: http://pubmlst.org/data/ by
5 parsing an xml file (http://pubmlst.org/data/dbases.xml).
6
7 Data is downloaded for a species determined by the user:
8 - profiles (maps STs to allele numbers)
9 - numbered sequences for each locus in the scheme
10
11 In addition, the alleles are concatenated together for use with SRST2.
12
13 A log file is also generated in the working directory, detailing the
14 time, date and location of all files downloaded, as well as the <retrieved>
15 tag which tells us when the XML entry was last updated.
16
17 If the species name input by the user matches multiple <species> in the
18 xml file, the script simply reports the possible matches so the user can
19 try again.
20 '''
21
22 """
23 - Remove empty line at the end of profiles.txt file.
24 - Ensure the allele names at the profiles.txt file don't contain "_".
25
26 """
27 from argparse import ArgumentParser
28 import xml.dom.minidom as xml
29 import urllib2 as url
30 import re
31 import os
32 import sys
33 import glob
34 import csv
35 import shutil
36 from urlparse import urlparse
37 import time
38 import subprocess
39
# Galaxy's JSON helpers are only importable when this script runs inside a
# Galaxy data-manager job; when absent, fall back to plain downloads and
# skip the data-table bookkeeping (see the has_galaxy checks in main()).
try:
    from galaxy.util.json import from_json_string, to_json_string
    has_galaxy = True
except ImportError:
    sys.stderr.write("Will download some stuff to the current dir but can't do Galaxy stuff!")
    has_galaxy = False
46
47 # --------------------------------------------------------------------------------------------------
48
def parse_args():
    """Parse and return the command line arguments.

    Returns:
        argparse.Namespace with repository_url, species, outfile and
        reference attributes.
    """
    # NOTE: the two description fragments are joined by implicit string
    # concatenation; the trailing space keeps the words separated
    # (previously rendered as "by speciesfrom pubmlst.org.").
    parser = ArgumentParser(description='Download MLST datasets by species '
                                        'from pubmlst.org.')

    parser.add_argument('--repository_url',
                        metavar='URL',
                        default='http://pubmlst.org/data/dbases.xml',
                        help='URL for MLST repository XML index')

    parser.add_argument('--species',
                        metavar='NAME',
                        required=True,
                        help='The name of the species that you want to download (e.g. "Escherichia coli")')

    parser.add_argument('--outfile',
                        metavar='FILE',
                        required=True,
                        help='The name of the Json file to write that galaxy stuff to.')

    parser.add_argument('--reference',
                        metavar='ACCESSION',
                        required=True,
                        help='NCBI accession number of the reference genome to use for flanking regions.')

    return parser.parse_args()
74
75 # --------------------------------------------------------------------------------------------------
76
def main():
    """
    Download the MLST profile table, the per-locus allele FASTA files and a
    reference genome for a single species, then (when running inside Galaxy)
    record the new data-table entry in the output JSON file.

    Example of one <species> entry in the XML index being parsed:

    <species>
    Achromobacter spp.
    <mlst>
    <database>
    <url>http://pubmlst.org/achromobacter</url>
    <retrieved>2015-08-11</retrieved>
    <profiles>
    <count>272</count>
    <url>http://pubmlst.org/data/profiles/achromobacter.txt</url>
    </profiles>
    <loci>
    <locus>
    nusA
    <url>
    http://pubmlst.org/data/alleles/achromobacter/nusA.tfa
    </url>
    </locus>
    <locus>
    rpoB
    <url>
    http://pubmlst.org/data/alleles/achromobacter/rpoB.tfa
    </url>
    </locus>
    """

    args = parse_args()
    # args.repository_url defaults to http://pubmlst.org/data/dbases.xml
    docFile = url.urlopen(args.repository_url)

    doc = xml.parse(docFile)
    root = doc.childNodes[0]
    found_species = []

    # Some species have more than one MLST scheme on pubmlst; map the plain
    # name to scheme "#1" so that exactly one <species> entry matches below.
    if args.species == "Escherichia coli":
        args.species = "Escherichia coli#1"
    elif args.species == "Acinetobacter baumannii":
        args.species = "Acinetobacter baumannii#1"
    elif args.species == "Pasteurella multocida":
        args.species = "Pasteurella multocida#1"
    else:
        pass

    # Collect every <species> entry whose name starts with the query string.
    for species_node in root.getElementsByTagName('species'):
        info = getSpeciesInfo(species_node, args.species)
        if info != None:
            found_species.append(info)

    if len(found_species) == 0:
        sys.stderr.write("No species matched your query.\n")
        exit(1)

    if len(found_species) > 1:
        sys.stderr.write("The following %i species match your query, please be more specific:\n" % (len(found_species)))
        for info in found_species:
            sys.stderr.write(info.name + '\n')
        exit(2)

    # output information for the single matching species
    assert len(found_species) == 1
    species_info = found_species[0]
    species_name_underscores = species_info.name.replace(' ', '_')
    timestamp = time.strftime("%Y%m%d%H%M%S")

    params = None
    if has_galaxy == True:
        # Inside Galaxy: args.outfile initially holds the job parameters,
        # including the extra_files_path where datasets should be written.
        params = from_json_string(open(args.outfile).read())
        folder = os.path.join(params['output_data'][0]['extra_files_path'], species_name_underscores, timestamp)
    else:
        # Stand-alone: download into a folder next to this script instead.
        folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), species_name_underscores, timestamp)

    if not os.path.isdir(folder):
        os.makedirs(folder)

    # Write the ST -> allele-numbers profile table, keeping only the first
    # 8 columns of each row (presumably ST + 7 locus columns -- TODO confirm
    # this holds for every scheme).
    profile_doc = url.urlopen(species_info.profiles_url)
    with open(os.path.join(folder, 'profiles.txt'), 'w') as f:
        sys.stdout.write("Writing to %s\n" % (os.path.join(folder, 'profiles.txt')))
        for line in profile_doc.readlines():
            cols = line.split("\t")
            f.write("%s\n" % ('\t'.join(cols[0:8])))
    profile_doc.close()

    # Download each locus allele FASTA, renaming *.tfa to *.fas and
    # replacing underscores in the content with dashes (see the notes at
    # the top of this module about "_" in allele names).
    for locus in species_info.loci:
        locus_path = urlparse(locus.url).path
        locus_filename = locus_path.split('/')[-1]
        locus_filename = locus_filename.replace("_.tfa", ".fas")
        locus_filename = locus_filename.replace("tfa", "fas")
        locus_doc = url.urlopen(locus.url)
        with open(os.path.join(folder, locus_filename), 'w') as locus_file:
            locus_fasta_content = locus_doc.read()
            locus_fasta_content = locus_fasta_content.replace("_","-").replace("--","-")
            sys.stdout.write("Writing to %s\n" % (os.path.join(folder, locus_filename)))
            locus_file.write(locus_fasta_content)
        locus_doc.close()

    # Fetch the reference genome and build a BLAST database from it.
    get_reference(folder, args.reference)


    # do Galaxy stuff
    if has_galaxy == True:
        data_manager_dict = {}
        data_manager_dict['data_tables'] = {}
        name = "%s-%s" % (species_info.name, timestamp)
        data_manager_dict['data_tables']['mlst_data'] = [dict(value=species_name_underscores,
                                                              dbkey=species_name_underscores,
                                                              name=name,
                                                              time_stamp=timestamp,
                                                              file_path=folder)]
        #save info to json file
        with open(args.outfile, 'wb') as fjson:
            fjson.write(to_json_string(data_manager_dict))
189
190 # end of main --------------------------------------------------------------------------------------
191
def get_reference(folder, acc):
    """Download the reference genome *acc* as FASTA into *folder* and build
    a BLAST nucleotide database ('reference.*') from it with makeblastdb.

    Args:
        folder: destination directory (must already exist).
        acc:    NCBI nucleotide accession number.
    """
    # We're getting this file from Japan!
    # It seems to work pretty well until they take down or change their website
    # See: http://www.ncbi.nlm.nih.gov/pubmed/20472643
    refurl = 'http://togows.dbcls.jp/entry/ncbi-nucleotide/%s.fasta' % (acc)
    remote_ref = url.urlopen(refurl)
    ref_filename = os.path.join(folder, 'reference.seq')
    try:
        with open(ref_filename, 'wb') as fRef:
            fRef.write(remote_ref.read())
    finally:
        # Always release the HTTP connection, even if the write fails.
        remote_ref.close()

    # Build the BLAST database. Pass the command as an argument list with
    # shell=False so folder/accession values can never be interpreted by a
    # shell (avoids command injection and quoting problems).
    cmd = ['makeblastdb',
           '-in', ref_filename,
           '-dbtype', 'nucl',
           '-out', ref_filename.replace("reference.seq", "reference")]
    p = subprocess.Popen(cmd,
                         shell=False,
                         stdin=None,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         close_fds=True)
    # communicate() drains both pipes before waiting; plain p.wait() with
    # PIPE handles can deadlock once the child fills a pipe buffer.
    p.communicate()

    return
214
215 # --------------------------------------------------------------------------------------------------
216
217 # test if a node is an Element and that it has a specific tag name
218 def testElementTag(node, name):
219 return node.nodeType == node.ELEMENT_NODE and node.localName == name
220
221 # --------------------------------------------------------------------------------------------------
222
223 # Get the text from an element node with a text node child
def getText(element):
    """Concatenate the text-node children of *element* and return the
    whitespace-normalised result."""
    parts = [child.data
             for child in element.childNodes
             if child.nodeType == child.TEXT_NODE]
    return normaliseText(''.join(parts))
230
231 # --------------------------------------------------------------------------------------------------
232
233 # remove unwanted whitespace including linebreaks etc.
# remove unwanted whitespace including linebreaks etc.
def normaliseText(text):
    """Collapse every run of whitespace (spaces, tabs, linebreaks) in *text*
    to a single space and strip leading/trailing whitespace."""
    # split() with no argument splits on arbitrary whitespace and drops
    # empty fields, which also handles leading/trailing whitespace.
    # (Parameter renamed from 'str', which shadowed the builtin.)
    return ' '.join(text.split())
236
237 # --------------------------------------------------------------------------------------------------
238
239 # A collection of interesting information about a taxa
# A collection of interesting information about a taxa
class SpeciesInfo(object):
    """Details of one pubmlst MLST scheme: species name, database and
    profile URLs, retrieval date, profile count and the list of loci."""

    def __init__(self):
        self.name = None            # String name of species
        self.database_url = None    # URL as string
        self.retrieved = None       # date as string
        self.profiles_url = None    # URL as string
        self.profiles_count = None  # positive integer
        self.loci = []              # list of LocusInfo

    def __str__(self):
        lines = [
            "Name: %s" % self.name,
            "Database URL: %s" % self.database_url,
            "Retrieved: %s" % self.retrieved,
            "Profiles URL: %s" % self.profiles_url,
            "Profiles count: %s" % self.profiles_count,
            "Loci: %s" % ','.join(str(locus) for locus in self.loci),
        ]
        return '\n'.join(lines) + '\n'
257
258 # --------------------------------------------------------------------------------------------------
259
class LocusInfo(object):
    """Name and allele-FASTA URL of a single MLST locus."""

    def __init__(self):
        self.url = None   # allele FASTA URL as string
        self.name = None  # locus name as string

    def __str__(self):
        return "Locus: name:{0},url:{1}".format(self.name, self.url)
266
267 # --------------------------------------------------------------------------------------------------
268
269 # retrieve the interesting information for a given sample element
def getSpeciesInfo(species_node, species):
    """
    Extract the MLST details from one <species> element of dbases.xml.

    Returns a populated SpeciesInfo when the element's (whitespace
    normalised) name starts with *species*, otherwise None.
    """
    this_name = getText(species_node)
    # NOTE(review): prints every species name while scanning the index --
    # looks like a debugging leftover; noisy on stdout.
    print this_name
    if this_name.startswith(species):
        info = SpeciesInfo()
        info.name = this_name
        # Walk <mlst>/<database> and pick out the child elements we need.
        for mlst_node in species_node.getElementsByTagName('mlst'):
            for database_node in mlst_node.getElementsByTagName('database'):
                for database_child_node in database_node.childNodes:
                    if testElementTag(database_child_node, 'url'):
                        info.database_url = getText(database_child_node)
                    elif testElementTag(database_child_node, 'retrieved'):
                        info.retrieved = getText(database_child_node)
                    elif testElementTag(database_child_node, 'profiles'):
                        # <profiles> holds the ST-table row count and URL.
                        for profile_count in database_child_node.getElementsByTagName('count'):
                            info.profiles_count = getText(profile_count)
                        for profile_url in database_child_node.getElementsByTagName('url'):
                            info.profiles_url = getText(profile_url)
                    elif testElementTag(database_child_node, 'loci'):
                        # One LocusInfo (name + allele FASTA URL) per <locus>.
                        for locus_node in database_child_node.getElementsByTagName('locus'):
                            locus_info = LocusInfo()
                            locus_info.name = getText(locus_node)
                            for locus_url in locus_node.getElementsByTagName('url'):
                                locus_info.url = getText(locus_url)
                            info.loci.append(locus_info)

        return info
    else:
        return None
299
300 # --------------------------------------------------------------------------------------------------
301
# Script entry point.
if __name__ == '__main__':
    main()