comparison data_manager/fetch_mlst_data.py @ 0:c01ac945b067 draft

Uploaded
author ulfschaefer
date Mon, 11 Jul 2016 05:23:33 -0400
parents
children 577ff220eaea
comparison
equal deleted inserted replaced
-1:000000000000 0:c01ac945b067
1 #!/usr/bin/env python
2
3 '''
4 Download MLST datasets from this site: http://pubmlst.org/data/ by
5 parsing an xml file (http://pubmlst.org/data/dbases.xml).
6
7 Data is downloaded for a species determined by the user:
8 - profiles (maps STs to allele numbers)
9 - numbered sequences for each locus in the scheme
10
11 In addition, the alleles are concatenated together for use with SRST2.
12
13 A log file is also generated in the working directory, detailing the
14 time, date and location of all files downloaded, as well as the <retrieved>
15 tag which tells us when the XML entry was last updated.
16
17 If the species name input by the user matches multiple <species> in the
18 xml file, the script simply reports the possible matches so the user can
19 try again.
20 '''
21
22 """
23 - Remove empty line at the end of profiles.txt file.
24 - Ensure the allele names at the profiles.txt file don't contain "_".
25
26 """
27 from argparse import ArgumentParser
28 import xml.dom.minidom as xml
29 import urllib2 as url
30 import re
31 import os
32 import sys
33 import glob
34 import csv
35 import shutil
36 from urlparse import urlparse
37 import time
38 import subprocess
39
# Galaxy's JSON helpers are only importable when this script runs inside a
# Galaxy data-manager job; when absent, fall back to plain downloads and
# skip the data-table bookkeeping (see the has_galaxy checks in main()).
try:
    from galaxy.util.json import from_json_string, to_json_string
    has_galaxy = True
except ImportError:
    sys.stderr.write("Will download some stuff to the current dir but can't do Galaxy stuff!")
    has_galaxy = False
46
47 # --------------------------------------------------------------------------------------------------
48
def parse_args():
    """Parse and return the command line arguments.

    Returns:
        argparse.Namespace with repository_url, species, outfile and
        reference attributes.
    """
    # NOTE: the two description fragments are joined by implicit string
    # concatenation; the trailing space keeps the words separated
    # (previously rendered as "by speciesfrom pubmlst.org.").
    parser = ArgumentParser(description='Download MLST datasets by species '
                                        'from pubmlst.org.')

    parser.add_argument('--repository_url',
                        metavar='URL',
                        default='http://pubmlst.org/data/dbases.xml',
                        help='URL for MLST repository XML index')

    parser.add_argument('--species',
                        metavar='NAME',
                        required=True,
                        help='The name of the species that you want to download (e.g. "Escherichia coli")')

    parser.add_argument('--outfile',
                        metavar='FILE',
                        required=True,
                        help='The name of the Json file to write that galaxy stuff to.')

    parser.add_argument('--reference',
                        metavar='ACCESSION',
                        required=True,
                        help='NCBI accession number of the reference genome to use for flanking regions.')

    return parser.parse_args()
74
75 # --------------------------------------------------------------------------------------------------
76
def main():
    """
    Download the MLST profile table, the per-locus allele FASTA files and a
    reference genome for a single species, then (when running inside Galaxy)
    record the new data-table entry in the output JSON file.

    Example of one <species> entry in the XML index being parsed:

    <species>
    Achromobacter spp.
    <mlst>
    <database>
    <url>http://pubmlst.org/achromobacter</url>
    <retrieved>2015-08-11</retrieved>
    <profiles>
    <count>272</count>
    <url>http://pubmlst.org/data/profiles/achromobacter.txt</url>
    </profiles>
    <loci>
    <locus>
    nusA
    <url>
    http://pubmlst.org/data/alleles/achromobacter/nusA.tfa
    </url>
    </locus>
    <locus>
    rpoB
    <url>
    http://pubmlst.org/data/alleles/achromobacter/rpoB.tfa
    </url>
    </locus>
    """

    args = parse_args()
    # args.repository_url defaults to http://pubmlst.org/data/dbases.xml
    docFile = url.urlopen(args.repository_url)

    doc = xml.parse(docFile)
    root = doc.childNodes[0]
    found_species = []

    # Some species have more than one MLST scheme on pubmlst; map the plain
    # name to scheme "#1" so that exactly one <species> entry matches below.
    if args.species == "Escherichia coli":
        args.species = "Escherichia coli#1"
    elif args.species == "Acinetobacter baumannii":
        args.species = "Acinetobacter baumannii#1"
    elif args.species == "Pasteurella multocida":
        args.species = "Pasteurella multocida#1"
    else:
        pass

    # Collect every <species> entry whose name starts with the query string.
    for species_node in root.getElementsByTagName('species'):
        info = getSpeciesInfo(species_node, args.species)
        if info != None:
            found_species.append(info)

    if len(found_species) == 0:
        sys.stderr.write("No species matched your query.\n")
        exit(1)

    if len(found_species) > 1:
        sys.stderr.write("The following %i species match your query, please be more specific:\n" % (len(found_species)))
        for info in found_species:
            sys.stderr.write(info.name + '\n')
        exit(2)

    # output information for the single matching species
    assert len(found_species) == 1
    species_info = found_species[0]
    species_name_underscores = species_info.name.replace(' ', '_')
    timestamp = time.strftime("%Y%m%d%H%M%S")

    params = None
    if has_galaxy == True:
        # Inside Galaxy: args.outfile initially holds the job parameters,
        # including the extra_files_path where datasets should be written.
        params = from_json_string(open(args.outfile).read())
        folder = os.path.join(params['output_data'][0]['extra_files_path'], species_name_underscores, timestamp)
    else:
        # Stand-alone: download into a folder next to this script instead.
        folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), species_name_underscores, timestamp)

    if not os.path.isdir(folder):
        os.makedirs(folder)

    # Write the ST -> allele-numbers profile table, keeping only the first
    # 8 columns of each row (presumably ST + 7 locus columns -- TODO confirm
    # this holds for every scheme).
    profile_doc = url.urlopen(species_info.profiles_url)
    with open(os.path.join(folder, 'profiles.txt'), 'w') as f:
        sys.stdout.write("Writing to %s\n" % (os.path.join(folder, 'profiles.txt')))
        for line in profile_doc.readlines():
            cols = line.split("\t")
            f.write("%s\n" % ('\t'.join(cols[0:8])))
    profile_doc.close()

    # Download each locus allele FASTA, renaming *.tfa to *.fas and
    # replacing underscores in the content with dashes (see the notes at
    # the top of this module about "_" in allele names).
    for locus in species_info.loci:
        locus_path = urlparse(locus.url).path
        locus_filename = locus_path.split('/')[-1]
        locus_filename = locus_filename.replace("_.tfa", ".fas")
        locus_filename = locus_filename.replace("tfa", "fas")
        locus_doc = url.urlopen(locus.url)
        with open(os.path.join(folder, locus_filename), 'w') as locus_file:
            locus_fasta_content = locus_doc.read()
            locus_fasta_content = locus_fasta_content.replace("_","-").replace("--","-")
            sys.stdout.write("Writing to %s\n" % (os.path.join(folder, locus_filename)))
            locus_file.write(locus_fasta_content)
        locus_doc.close()

    # Fetch the reference genome and build a BLAST database from it.
    get_reference(folder, args.reference)


    # do Galaxy stuff
    if has_galaxy == True:
        data_manager_dict = {}
        data_manager_dict['data_tables'] = {}
        name = "%s-%s" % (species_info.name, timestamp)
        data_manager_dict['data_tables']['mlst_data'] = [dict(value=species_name_underscores,
                                                              dbkey=species_name_underscores,
                                                              name=name,
                                                              time_stamp=timestamp,
                                                              file_path=folder)]
        #save info to json file
        with open(args.outfile, 'wb') as fjson:
            fjson.write(to_json_string(data_manager_dict))
189
190 # end of main --------------------------------------------------------------------------------------
191
def get_reference(folder, acc):
    """Download the reference genome *acc* as FASTA into *folder* and build
    a BLAST nucleotide database ('reference.*') from it with makeblastdb.

    Args:
        folder: destination directory (must already exist).
        acc:    NCBI nucleotide accession number.
    """
    # We're getting this file from Japan!
    # It seems to work pretty well until they take down or change their website
    # See: http://www.ncbi.nlm.nih.gov/pubmed/20472643
    refurl = 'http://togows.dbcls.jp/entry/ncbi-nucleotide/%s.fasta' % (acc)
    remote_ref = url.urlopen(refurl)
    ref_filename = os.path.join(folder, 'reference.seq')
    try:
        with open(ref_filename, 'wb') as fRef:
            fRef.write(remote_ref.read())
    finally:
        # Always release the HTTP connection, even if the write fails.
        remote_ref.close()

    # Build the BLAST database. Pass the command as an argument list with
    # shell=False so folder/accession values can never be interpreted by a
    # shell (avoids command injection and quoting problems).
    cmd = ['makeblastdb',
           '-in', ref_filename,
           '-dbtype', 'nucl',
           '-out', ref_filename.replace("reference.seq", "reference")]
    p = subprocess.Popen(cmd,
                         shell=False,
                         stdin=None,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         close_fds=True)
    # communicate() drains both pipes before waiting; plain p.wait() with
    # PIPE handles can deadlock once the child fills a pipe buffer.
    p.communicate()

    return
214
215 # --------------------------------------------------------------------------------------------------
216
217 # test if a node is an Element and that it has a specific tag name
218 def testElementTag(node, name):
219 return node.nodeType == node.ELEMENT_NODE and node.localName == name
220
221 # --------------------------------------------------------------------------------------------------
222
223 # Get the text from an element node with a text node child
def getText(element):
    """Concatenate the text-node children of *element* and return the
    whitespace-normalised result."""
    parts = [child.data
             for child in element.childNodes
             if child.nodeType == child.TEXT_NODE]
    return normaliseText(''.join(parts))
230
231 # --------------------------------------------------------------------------------------------------
232
233 # remove unwanted whitespace including linebreaks etc.
# remove unwanted whitespace including linebreaks etc.
def normaliseText(text):
    """Collapse every run of whitespace (spaces, tabs, linebreaks) in *text*
    to a single space and strip leading/trailing whitespace."""
    # split() with no argument splits on arbitrary whitespace and drops
    # empty fields, which also handles leading/trailing whitespace.
    # (Parameter renamed from 'str', which shadowed the builtin.)
    return ' '.join(text.split())
236
237 # --------------------------------------------------------------------------------------------------
238
239 # A collection of interesting information about a taxa
# A collection of interesting information about a taxa
class SpeciesInfo(object):
    """Details of one pubmlst MLST scheme: species name, database and
    profile URLs, retrieval date, profile count and the list of loci."""

    def __init__(self):
        self.name = None            # String name of species
        self.database_url = None    # URL as string
        self.retrieved = None       # date as string
        self.profiles_url = None    # URL as string
        self.profiles_count = None  # positive integer
        self.loci = []              # list of LocusInfo

    def __str__(self):
        lines = [
            "Name: %s" % self.name,
            "Database URL: %s" % self.database_url,
            "Retrieved: %s" % self.retrieved,
            "Profiles URL: %s" % self.profiles_url,
            "Profiles count: %s" % self.profiles_count,
            "Loci: %s" % ','.join(str(locus) for locus in self.loci),
        ]
        return '\n'.join(lines) + '\n'
257
258 # --------------------------------------------------------------------------------------------------
259
class LocusInfo(object):
    """Name and allele-FASTA URL of a single MLST locus."""

    def __init__(self):
        self.url = None   # allele FASTA URL as string
        self.name = None  # locus name as string

    def __str__(self):
        return "Locus: name:{0},url:{1}".format(self.name, self.url)
266
267 # --------------------------------------------------------------------------------------------------
268
269 # retrieve the interesting information for a given sample element
def getSpeciesInfo(species_node, species):
    """
    Extract the MLST details from one <species> element of dbases.xml.

    Returns a populated SpeciesInfo when the element's (whitespace
    normalised) name starts with *species*, otherwise None.
    """
    this_name = getText(species_node)
    # NOTE(review): prints every species name while scanning the index --
    # looks like a debugging leftover; noisy on stdout.
    print this_name
    if this_name.startswith(species):
        info = SpeciesInfo()
        info.name = this_name
        # Walk <mlst>/<database> and pick out the child elements we need.
        for mlst_node in species_node.getElementsByTagName('mlst'):
            for database_node in mlst_node.getElementsByTagName('database'):
                for database_child_node in database_node.childNodes:
                    if testElementTag(database_child_node, 'url'):
                        info.database_url = getText(database_child_node)
                    elif testElementTag(database_child_node, 'retrieved'):
                        info.retrieved = getText(database_child_node)
                    elif testElementTag(database_child_node, 'profiles'):
                        # <profiles> holds the ST-table row count and URL.
                        for profile_count in database_child_node.getElementsByTagName('count'):
                            info.profiles_count = getText(profile_count)
                        for profile_url in database_child_node.getElementsByTagName('url'):
                            info.profiles_url = getText(profile_url)
                    elif testElementTag(database_child_node, 'loci'):
                        # One LocusInfo (name + allele FASTA URL) per <locus>.
                        for locus_node in database_child_node.getElementsByTagName('locus'):
                            locus_info = LocusInfo()
                            locus_info.name = getText(locus_node)
                            for locus_url in locus_node.getElementsByTagName('url'):
                                locus_info.url = getText(locus_url)
                            info.loci.append(locus_info)

        return info
    else:
        return None
299
300 # --------------------------------------------------------------------------------------------------
301
# Script entry point.
if __name__ == '__main__':
    main()