data_manager_phemost: comparison of data_manager/fetch_mlst_data.py @ 0:c01ac945b067 (draft)

Commit message: Uploaded
| field | value |
|---|---|
| author | ulfschaefer |
| date | Mon, 11 Jul 2016 05:23:33 -0400 |
| parents | (none) |
| children | 577ff220eaea |
The comparison is against the null revision -1:000000000000, so the entire file is new in 0:c01ac945b067:
```python
#!/usr/bin/env python

'''
Download MLST datasets from http://pubmlst.org/data/ by parsing the
XML index file (http://pubmlst.org/data/dbases.xml).

Data is downloaded for a species chosen by the user:
- profiles (maps STs to allele numbers)
- numbered sequences for each locus in the scheme

In addition, the alleles are concatenated together for use with SRST2.

A log file is also generated in the working directory, detailing the
time, date and location of all files downloaded, as well as the <retrieved>
tag, which records when the XML entry was last updated.

If the species name input by the user matches multiple <species> entries
in the XML file, the script simply reports the possible matches so the
user can try again.
'''

# TODO:
# - Remove the empty line at the end of the profiles.txt file.
# - Ensure the allele names in the profiles.txt file don't contain "_".
from argparse import ArgumentParser
import xml.dom.minidom as xml
import urllib2 as url
import os
import sys
from urlparse import urlparse
import time
import subprocess

try:
    from galaxy.util.json import from_json_string, to_json_string
    has_galaxy = True
except ImportError:
    sys.stderr.write("Galaxy libraries not found; data will be downloaded, "
                     "but no Galaxy data table will be written.\n")
    has_galaxy = False

# --------------------------------------------------------------------------------------------------

def parse_args():
    parser = ArgumentParser(description='Download MLST datasets by species '
                                        'from pubmlst.org.')

    parser.add_argument('--repository_url',
                        metavar='URL',
                        default='http://pubmlst.org/data/dbases.xml',
                        help='URL for the MLST repository XML index')

    parser.add_argument('--species',
                        metavar='NAME',
                        required=True,
                        help='Name of the species to download (e.g. "Escherichia coli")')

    parser.add_argument('--outfile',
                        metavar='FILE',
                        required=True,
                        help='Name of the JSON file to write the Galaxy data table entry to.')

    parser.add_argument('--reference',
                        metavar='ACCESSION',
                        required=True,
                        help='NCBI accession number of the reference genome to use for flanking regions.')

    return parser.parse_args()

# --------------------------------------------------------------------------------------------------
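
# A typical invocation outside Galaxy (the accession is illustrative; any
# NCBI nucleotide accession for a suitable reference genome will do):
#
#   python fetch_mlst_data.py --species "Escherichia coli" \
#       --outfile mlst.json --reference NC_000913.3
#
# --outfile is still required in this mode, but nothing is written to it.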

def main():

    """
    Abridged excerpt of a dbases.xml <species> entry:

    <species>
    Achromobacter spp.
      <mlst>
        <database>
          <url>http://pubmlst.org/achromobacter</url>
          <retrieved>2015-08-11</retrieved>
          <profiles>
            <count>272</count>
            <url>http://pubmlst.org/data/profiles/achromobacter.txt</url>
          </profiles>
          <loci>
            <locus>
              nusA
              <url>http://pubmlst.org/data/alleles/achromobacter/nusA.tfa</url>
            </locus>
            <locus>
              rpoB
              <url>http://pubmlst.org/data/alleles/achromobacter/rpoB.tfa</url>
            </locus>
    """

    args = parse_args()
    # args.repository_url defaults to http://pubmlst.org/data/dbases.xml
    docFile = url.urlopen(args.repository_url)

    doc = xml.parse(docFile)
    root = doc.childNodes[0]
    found_species = []

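    # pubmlst hosts more than one MLST scheme for these species; the schemes
    # are listed in the XML index as e.g. "Escherichia coli#1", so default to
    # scheme #1 when the user gives only the bare species name.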
    if args.species == "Escherichia coli":
        args.species = "Escherichia coli#1"
    elif args.species == "Acinetobacter baumannii":
        args.species = "Acinetobacter baumannii#1"
    elif args.species == "Pasteurella multocida":
        args.species = "Pasteurella multocida#1"

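    # Matching in getSpeciesInfo is by prefix, so a bare "Escherichia coli"
    # would otherwise also match any second scheme such as
    # "Escherichia coli#2", and the script would bail out below.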
    for species_node in root.getElementsByTagName('species'):
        info = getSpeciesInfo(species_node, args.species)
        if info is not None:
            found_species.append(info)

    if len(found_species) == 0:
        sys.stderr.write("No species matched your query.\n")
        exit(1)

    if len(found_species) > 1:
        sys.stderr.write("The following %i species match your query, please be more specific:\n" % (len(found_species)))
        for info in found_species:
            sys.stderr.write(info.name + '\n')
        exit(2)

    # output information for the single matching species
    assert len(found_species) == 1
    species_info = found_species[0]
    species_name_underscores = species_info.name.replace(' ', '_')
    timestamp = time.strftime("%Y%m%d%H%M%S")

    params = None
    if has_galaxy:
        params = from_json_string(open(args.outfile).read())
        folder = os.path.join(params['output_data'][0]['extra_files_path'], species_name_underscores, timestamp)
    else:
        folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), species_name_underscores, timestamp)

    if not os.path.isdir(folder):
        os.makedirs(folder)

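    # Keep only the first eight profile columns (ST plus seven alleles for a
    # standard scheme); any extra columns such as clonal_complex are dropped.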
    profile_doc = url.urlopen(species_info.profiles_url)
    with open(os.path.join(folder, 'profiles.txt'), 'w') as f:
        sys.stdout.write("Writing to %s\n" % (os.path.join(folder, 'profiles.txt')))
        for line in profile_doc.readlines():
            cols = line.rstrip('\n').split("\t")
            f.write("%s\n" % ('\t'.join(cols[0:8])))
    profile_doc.close()

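    # pubmlst serves the alleles as <locus>.tfa files; rename them to .fas
    # and replace underscores in the FASTA content with hyphens (allele
    # names must not contain "_", see the TODO above).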
    for locus in species_info.loci:
        locus_path = urlparse(locus.url).path
        locus_filename = locus_path.split('/')[-1]
        locus_filename = locus_filename.replace("_.tfa", ".fas")
        locus_filename = locus_filename.replace("tfa", "fas")
        locus_doc = url.urlopen(locus.url)
        with open(os.path.join(folder, locus_filename), 'w') as locus_file:
            locus_fasta_content = locus_doc.read()
            locus_fasta_content = locus_fasta_content.replace("_", "-").replace("--", "-")
            sys.stdout.write("Writing to %s\n" % (os.path.join(folder, locus_filename)))
            locus_file.write(locus_fasta_content)
        locus_doc.close()

    get_reference(folder, args.reference)

    # do Galaxy stuff
    if has_galaxy:
        data_manager_dict = {}
        data_manager_dict['data_tables'] = {}
        name = "%s-%s" % (species_info.name, timestamp)
        data_manager_dict['data_tables']['mlst_data'] = [dict(value=species_name_underscores,
                                                              dbkey=species_name_underscores,
                                                              name=name,
                                                              time_stamp=timestamp,
                                                              file_path=folder)]
        # save the info to the JSON file
        with open(args.outfile, 'wb') as fjson:
            fjson.write(to_json_string(data_manager_dict))

# end of main --------------------------------------------------------------------------------------

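# The reference genome fetched below is indexed with makeblastdb, presumably
# so that a downstream tool can BLAST for the regions flanking the MLST loci
# (cf. the --reference help text); this script only builds the database.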
def get_reference(folder, acc):

    # We're getting this file from the TogoWS service in Japan. It seems to
    # work pretty well until they take down or change their website.
    # See: http://www.ncbi.nlm.nih.gov/pubmed/20472643
    refurl = 'http://togows.dbcls.jp/entry/ncbi-nucleotide/%s.fasta' % (acc)
    remote_ref = url.urlopen(refurl)
    ref_filename = os.path.join(folder, 'reference.seq')
    with open(ref_filename, 'wb') as fRef:
        fRef.write(remote_ref.read())
    remote_ref.close()

    cmd = "makeblastdb -in %s -dbtype nucl -out %s" \
          % (ref_filename, ref_filename.replace("reference.seq", "reference"))
    p = subprocess.Popen(cmd,
                         shell=True,
                         stdin=None,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         close_fds=True)
    # communicate() also drains stdout/stderr; wait() alone could deadlock
    # if makeblastdb filled the pipes.
    p.communicate()

    return

# --------------------------------------------------------------------------------------------------

# Test if a node is an Element and that it has a specific tag name.
def testElementTag(node, name):
    return node.nodeType == node.ELEMENT_NODE and node.localName == name

# --------------------------------------------------------------------------------------------------

# Get the text from an element node with a text node child.
def getText(element):
    result = ''
    for node in element.childNodes:
        if node.nodeType == node.TEXT_NODE:
            result += node.data
    return normaliseText(result)

# --------------------------------------------------------------------------------------------------

# Remove unwanted whitespace, including linebreaks etc.
def normaliseText(text):
    return ' '.join(text.split())

# --------------------------------------------------------------------------------------------------

# A collection of interesting information about a taxon.
class SpeciesInfo(object):
    def __init__(self):
        self.name = None            # species name as a string
        self.database_url = None    # URL as a string
        self.retrieved = None       # date as a string
        self.profiles_url = None    # URL as a string
        self.profiles_count = None  # positive integer
        self.loci = []              # list of LocusInfo objects

    def __str__(self):
        s = "Name: %s\n" % self.name
        s += "Database URL: %s\n" % self.database_url
        s += "Retrieved: %s\n" % self.retrieved
        s += "Profiles URL: %s\n" % self.profiles_url
        s += "Profiles count: %s\n" % self.profiles_count
        s += "Loci: %s\n" % (','.join([str(x) for x in self.loci]))
        return s

# --------------------------------------------------------------------------------------------------

class LocusInfo(object):
    def __init__(self):
        self.url = None
        self.name = None

    def __str__(self):
        return "Locus: name:%s,url:%s" % (self.name, self.url)

# --------------------------------------------------------------------------------------------------

# Retrieve the interesting information for a given species element.
def getSpeciesInfo(species_node, species):
    this_name = getText(species_node)
    if this_name.startswith(species):
        info = SpeciesInfo()
        info.name = this_name
        for mlst_node in species_node.getElementsByTagName('mlst'):
            for database_node in mlst_node.getElementsByTagName('database'):
                for database_child_node in database_node.childNodes:
                    if testElementTag(database_child_node, 'url'):
                        info.database_url = getText(database_child_node)
                    elif testElementTag(database_child_node, 'retrieved'):
                        info.retrieved = getText(database_child_node)
                    elif testElementTag(database_child_node, 'profiles'):
                        for profile_count in database_child_node.getElementsByTagName('count'):
                            info.profiles_count = getText(profile_count)
                        for profile_url in database_child_node.getElementsByTagName('url'):
                            info.profiles_url = getText(profile_url)
                    elif testElementTag(database_child_node, 'loci'):
                        for locus_node in database_child_node.getElementsByTagName('locus'):
                            locus_info = LocusInfo()
                            locus_info.name = getText(locus_node)
                            for locus_url in locus_node.getElementsByTagName('url'):
                                locus_info.url = getText(locus_url)
                            info.loci.append(locus_info)

        return info
    else:
        return None

# --------------------------------------------------------------------------------------------------

if __name__ == '__main__':
    main()
```
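
When run inside Galaxy, the script rewrites `--outfile` with the data table entry built in `main`. A sketch of what that JSON would contain for an "Escherichia coli" run follows; the timestamp and `file_path` are illustrative, and the key order depends on the serialiser:

```json
{
  "data_tables": {
    "mlst_data": [
      {
        "value": "Escherichia_coli#1",
        "dbkey": "Escherichia_coli#1",
        "name": "Escherichia coli#1-20160711102333",
        "time_stamp": "20160711102333",
        "file_path": "/path/to/extra_files_path/Escherichia_coli#1/20160711102333"
      }
    ]
  }
}
```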
