#!/usr/bin/env python

'''
Download MLST datasets from this site: http://pubmlst.org/data/ by
parsing an xml file (http://pubmlst.org/data/dbases.xml).

Data is downloaded for a species determined by the user:
- profiles (maps STs to allele numbers)
- numbered sequences for each locus in the scheme

In addition, the alleles are concatenated together for use with SRST2.

A log file is also generated in the working directory, detailing the
time, date and location of all files downloaded, as well as the <retrieved>
tag which tells us when the XML entry was last updated.

If the species name input by the user matches multiple <species> in the
xml file, the script simply reports the possible matches so the user can
try again.
'''

"""
Fix-ups applied to the downloaded data:
- Remove the empty line at the end of the profiles.txt file.
- Ensure the allele names in the profiles.txt file don't contain "_".
"""
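
# Example invocation (a sketch only: the script filename and reference
# accession below are illustrative, not fixed by this script):
#   python download_pubmlst_data.py --species "Escherichia coli" \
#       --outfile mlst.json --reference U00096.3
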
from argparse import ArgumentParser
import xml.dom.minidom as xml
import urllib2 as url
import os
import sys
from urlparse import urlparse
import time
import subprocess

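# NOTE: urllib2 and urlparse are Python 2 standard-library modules, so this
# script targets Python 2 (as does the "print" statement used further down).
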
try:
    from galaxy.util.json import from_json_string, to_json_string
    has_galaxy = True
except ImportError:
    sys.stderr.write("Galaxy libraries not found; data will be downloaded "
                     "to the current dir, but no Galaxy data manager JSON "
                     "will be written.\n")
    has_galaxy = False

# --------------------------------------------------------------------------------------------------

def parse_args():
    parser = ArgumentParser(description='Download MLST datasets by species '
                                        'from pubmlst.org.')

    parser.add_argument('--repository_url',
                        metavar='URL',
                        default='http://pubmlst.org/data/dbases.xml',
                        help='URL for MLST repository XML index')

    parser.add_argument('--species',
                        metavar='NAME',
                        required=True,
                        help='The name of the species that you want to download (e.g. "Escherichia coli")')

    parser.add_argument('--outfile',
                        metavar='FILE',
                        required=True,
                        help='The name of the JSON file to write the Galaxy data manager output to.')

    parser.add_argument('--reference',
                        metavar='ACCESSION',
                        required=True,
                        help='NCBI accession number of the reference genome to use for flanking regions.')

    return parser.parse_args()

# --------------------------------------------------------------------------------------------------

def main():
    """
    Relevant excerpt of the XML index this script parses:

    <species>
    Achromobacter spp.
        <mlst>
            <database>
                <url>http://pubmlst.org/achromobacter</url>
                <retrieved>2015-08-11</retrieved>
                <profiles>
                    <count>272</count>
                    <url>http://pubmlst.org/data/profiles/achromobacter.txt</url>
                </profiles>
                <loci>
                    <locus>
                        nusA
                        <url>
                            http://pubmlst.org/data/alleles/achromobacter/nusA.tfa
                        </url>
                    </locus>
                    <locus>
                        rpoB
                        <url>
                            http://pubmlst.org/data/alleles/achromobacter/rpoB.tfa
                        </url>
                    </locus>
    """

    args = parse_args()
    # args.repository_url defaults to http://pubmlst.org/data/dbases.xml
    docFile = url.urlopen(args.repository_url)

    doc = xml.parse(docFile)
    root = doc.childNodes[0]
    found_species = []

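    # pubmlst hosts more than one MLST scheme for a few species; those scheme
    # names carry a "#N" suffix (e.g. "Escherichia coli#1"). Default to
    # scheme #1 when the user gives the bare species name.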
    if args.species == "Escherichia coli":
        args.species = "Escherichia coli#1"
    elif args.species == "Acinetobacter baumannii":
        args.species = "Acinetobacter baumannii#1"
    elif args.species == "Pasteurella multocida":
        args.species = "Pasteurella multocida#1"

    for species_node in root.getElementsByTagName('species'):
        info = getSpeciesInfo(species_node, args.species)
        if info is not None:
            found_species.append(info)

    if len(found_species) == 0:
        sys.stderr.write("No species matched your query.\n")
        sys.exit(1)

    if len(found_species) > 1:
        sys.stderr.write("The following %i species match your query, please be more specific:\n" % (len(found_species)))
        for info in found_species:
            sys.stderr.write(info.name + '\n')
        sys.exit(2)

    # output information for the single matching species
    assert len(found_species) == 1
    species_info = found_species[0]
    species_name_underscores = species_info.name.replace(' ', '_')
    timestamp = time.strftime("%Y%m%d%H%M%S")

    params = None
    if has_galaxy:
        params = from_json_string(open(args.outfile).read())
        folder = os.path.join(params['output_data'][0]['extra_files_path'], species_name_underscores, timestamp)
    else:
        folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), species_name_underscores, timestamp)

    if not os.path.isdir(folder):
        os.makedirs(folder)

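    # profiles.txt maps each ST to its allele numbers. Only the first eight
    # columns (ST plus the seven allele numbers) are kept; any extra columns
    # such as clonal_complex are dropped. Stripping the trailing newline
    # before splitting also avoids writing a blank line at the end of the
    # file (see the note at the top of this script).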
    profile_doc = url.urlopen(species_info.profiles_url)
    with open(os.path.join(folder, 'profiles.txt'), 'w') as f:
        sys.stdout.write("Writing to %s\n" % (os.path.join(folder, 'profiles.txt')))
        for line in profile_doc.readlines():
            cols = line.rstrip('\n').split("\t")
            f.write("%s\n" % ('\t'.join(cols[0:8])))
    profile_doc.close()

    for locus in species_info.loci:
        locus_path = urlparse(locus.url).path
        locus_filename = locus_path.split('/')[-1]
        # switch the .tfa extension to .fas, touching only the extension so
        # a locus name that happens to contain "tfa" is left intact
        locus_filename = locus_filename.replace("_.tfa", ".fas")
        if locus_filename.endswith(".tfa"):
            locus_filename = locus_filename[:-4] + ".fas"
        locus_doc = url.urlopen(locus.url)
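        # SRST2 parses allele numbers out of FASTA headers using separator
        # characters, so underscores in the downloaded headers are collapsed
        # to single dashes before the sequences are written out.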
        with open(os.path.join(folder, locus_filename), 'w') as locus_file:
            locus_fasta_content = locus_doc.read()
            locus_fasta_content = locus_fasta_content.replace("_", "-").replace("--", "-")
            sys.stdout.write("Writing to %s\n" % (os.path.join(folder, locus_filename)))
            locus_file.write(locus_fasta_content)
        locus_doc.close()

    get_reference(folder, args.reference)

    # do Galaxy stuff
    if has_galaxy:
        data_manager_dict = {}
        data_manager_dict['data_tables'] = {}
        name = "%s-%s" % (species_info.name, timestamp)
        data_manager_dict['data_tables']['mlst_data'] = [dict(value=species_name_underscores,
                                                              dbkey=species_name_underscores,
                                                              name=name,
                                                              time_stamp=timestamp,
                                                              file_path=folder)]
        # save info to json file
        with open(args.outfile, 'wb') as fjson:
            fjson.write(to_json_string(data_manager_dict))
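
        # For reference, the JSON written above has roughly this shape
        # (the values shown are illustrative):
        # {"data_tables": {"mlst_data": [{"value": "Escherichia_coli#1",
        #                                 "dbkey": "Escherichia_coli#1",
        #                                 "name": "Escherichia coli#1-20150811120000",
        #                                 "time_stamp": "20150811120000",
        #                                 "file_path": "/path/to/folder"}]}}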

# end of main --------------------------------------------------------------------------------------

def get_reference(folder, acc):
    # The reference sequence is fetched from the TogoWS REST service
    # (DBCLS, Japan), which works well unless they take down or change
    # their website. See: http://www.ncbi.nlm.nih.gov/pubmed/20472643
    refurl = 'http://togows.dbcls.jp/entry/ncbi-nucleotide/%s.fasta' % (acc)
    remote_ref = url.urlopen(refurl)
    ref_filename = os.path.join(folder, 'reference.seq')
    with open(ref_filename, 'wb') as fRef:
        fRef.write(remote_ref.read())
    remote_ref.close()

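    # Build a nucleotide BLAST database from the reference; makeblastdb
    # (BLAST+) must be on PATH and writes reference.nhr/.nin/.nsq files
    # alongside reference.seq.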
    cmd = "makeblastdb -in %s -dbtype nucl -out %s" \
          % (ref_filename, ref_filename.replace("reference.seq", "reference"))
    p = subprocess.Popen(cmd,
                         shell=True,
                         stdin=None,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         close_fds=True)
    # communicate() drains both pipes; wait() alone can deadlock if
    # makeblastdb writes enough output to fill a pipe buffer
    p.communicate()

    return

# --------------------------------------------------------------------------------------------------

# test if a node is an Element and that it has a specific tag name
def testElementTag(node, name):
    return node.nodeType == node.ELEMENT_NODE and node.localName == name

# --------------------------------------------------------------------------------------------------

# Get the text from an element node with a text node child
def getText(element):
    result = ''
    for node in element.childNodes:
        if node.nodeType == node.TEXT_NODE:
            result += node.data
    return normaliseText(result)

# --------------------------------------------------------------------------------------------------

# remove unwanted whitespace including linebreaks etc.
def normaliseText(txt):
    return ' '.join(txt.split())

# --------------------------------------------------------------------------------------------------

# A collection of interesting information about a taxon
class SpeciesInfo(object):
    def __init__(self):
        self.name = None  # String name of species
        self.database_url = None  # URL as string
        self.retrieved = None  # date as string
        self.profiles_url = None  # URL as string
        self.profiles_count = None  # positive integer
        self.loci = []  # list of LocusInfo objects

    def __str__(self):
        s = "Name: %s\n" % self.name
        s += "Database URL: %s\n" % self.database_url
        s += "Retrieved: %s\n" % self.retrieved
        s += "Profiles URL: %s\n" % self.profiles_url
        s += "Profiles count: %s\n" % self.profiles_count
        s += "Loci: %s\n" % (','.join([str(x) for x in self.loci]))
        return s

# --------------------------------------------------------------------------------------------------

class LocusInfo(object):
    def __init__(self):
        self.url = None
        self.name = None

    def __str__(self):
        return "Locus: name:%s,url:%s" % (self.name, self.url)

# --------------------------------------------------------------------------------------------------

# retrieve the interesting information for a given species element
def getSpeciesInfo(species_node, species):
    this_name = getText(species_node)
    print this_name
    if this_name.startswith(species):
        info = SpeciesInfo()
        info.name = this_name
        for mlst_node in species_node.getElementsByTagName('mlst'):
            for database_node in mlst_node.getElementsByTagName('database'):
                for database_child_node in database_node.childNodes:
                    if testElementTag(database_child_node, 'url'):
                        info.database_url = getText(database_child_node)
                    elif testElementTag(database_child_node, 'retrieved'):
                        info.retrieved = getText(database_child_node)
                    elif testElementTag(database_child_node, 'profiles'):
                        for profile_count in database_child_node.getElementsByTagName('count'):
                            info.profiles_count = getText(profile_count)
                        for profile_url in database_child_node.getElementsByTagName('url'):
                            info.profiles_url = getText(profile_url)
                    elif testElementTag(database_child_node, 'loci'):
                        for locus_node in database_child_node.getElementsByTagName('locus'):
                            locus_info = LocusInfo()
                            locus_info.name = getText(locus_node)
                            for locus_url in locus_node.getElementsByTagName('url'):
                                locus_info.url = getText(locus_url)
                            info.loci.append(locus_info)

        return info
    else:
        return None

# --------------------------------------------------------------------------------------------------

if __name__ == '__main__':
    main()