data_manager/fetch_mlst_data.py @ 0:c01ac945b067 (draft)

author:   ulfschaefer
date:     Mon, 11 Jul 2016 05:23:33 -0400
message:  Uploaded
children: 577ff220eaea
#!/usr/bin/env python

'''
Download MLST datasets from this site: http://pubmlst.org/data/ by
parsing an XML file (http://pubmlst.org/data/dbases.xml).

Data is downloaded for a species determined by the user:
- profiles (maps STs to allele numbers)
- numbered sequences for each locus in the scheme

In addition, a reference genome is downloaded for the flanking regions
and a BLAST database is built from it.

If the species name input by the user matches multiple <species> in the
XML file, the script simply reports the possible matches so the user can
try again.
'''
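
# Example invocation (the accession below is just an illustration, not a
# required value):
#
#   python fetch_mlst_data.py --species "Escherichia coli" \
#                             --outfile mlst.json --reference NC_000913.3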

"""
- Remove the empty line at the end of the profiles.txt file.
- Ensure the allele names in the profiles.txt file don't contain "_".
"""
from argparse import ArgumentParser
from urlparse import urlparse
import xml.dom.minidom as xml
import urllib2 as url
import os
import sys
import time
import subprocess

try:
    from galaxy.util.json import from_json_string, to_json_string
    has_galaxy = True
except ImportError:
    sys.stderr.write("Will download the data next to this script but can't do the Galaxy stuff!\n")
    has_galaxy = False

# --------------------------------------------------------------------------------------------------

def parse_args():
    parser = ArgumentParser(description='Download MLST datasets by species '
                                        'from pubmlst.org.')

    parser.add_argument('--repository_url',
                        metavar = 'URL',
                        default = 'http://pubmlst.org/data/dbases.xml',
                        help = 'URL for MLST repository XML index')

    parser.add_argument('--species',
                        metavar = 'NAME',
                        required = True,
                        help = 'The name of the species that you want to download (e.g. "Escherichia coli")')

    parser.add_argument('--outfile',
                        metavar = 'FILE',
                        required = True,
                        help = 'The name of the JSON file to write the Galaxy data table entry to.')

    parser.add_argument('--reference',
                        metavar = 'ACCESSION',
                        required = True,
                        help = 'NCBI accession number of the reference genome to use for flanking regions.')

    return parser.parse_args()

# --------------------------------------------------------------------------------------------------

def main():

    """
    Excerpt from the XML index that is parsed here:

    <species>
    Achromobacter spp.
        <mlst>
            <database>
                <url>http://pubmlst.org/achromobacter</url>
                <retrieved>2015-08-11</retrieved>
                <profiles>
                    <count>272</count>
                    <url>http://pubmlst.org/data/profiles/achromobacter.txt</url>
                </profiles>
                <loci>
                    <locus>
                    nusA
                        <url>
                        http://pubmlst.org/data/alleles/achromobacter/nusA.tfa
                        </url>
                    </locus>
                    <locus>
                    rpoB
                        <url>
                        http://pubmlst.org/data/alleles/achromobacter/rpoB.tfa
                        </url>
                    </locus>
                    ...
    """

    args = parse_args()
    # args.repository_url defaults to http://pubmlst.org/data/dbases.xml
    docFile = url.urlopen(args.repository_url)

    doc = xml.parse(docFile)
    root = doc.childNodes[0]
    found_species = []

    # Some species have more than one scheme on pubmlst.org; default to scheme #1.
    if args.species == "Escherichia coli":
        args.species = "Escherichia coli#1"
    elif args.species == "Acinetobacter baumannii":
        args.species = "Acinetobacter baumannii#1"
    elif args.species == "Pasteurella multocida":
        args.species = "Pasteurella multocida#1"

    for species_node in root.getElementsByTagName('species'):
        info = getSpeciesInfo(species_node, args.species)
        if info is not None:
            found_species.append(info)

    if len(found_species) == 0:
        sys.stderr.write("No species matched your query.\n")
        exit(1)

    if len(found_species) > 1:
        sys.stderr.write("The following %i species match your query, please be more specific:\n" % (len(found_species)))
        for info in found_species:
            sys.stderr.write(info.name + '\n')
        exit(2)

    # output information for the single matching species
    assert len(found_species) == 1
    species_info = found_species[0]
    species_name_underscores = species_info.name.replace(' ', '_')
    timestamp = time.strftime("%Y%m%d%H%M%S")

    params = None
    if has_galaxy:
        params = from_json_string(open(args.outfile).read())
        folder = os.path.join(params['output_data'][0]['extra_files_path'],
                              species_name_underscores,
                              timestamp)
    else:
        folder = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                              species_name_underscores,
                              timestamp)

    if not os.path.isdir(folder):
        os.makedirs(folder)

    profile_doc = url.urlopen(species_info.profiles_url)
    with open(os.path.join(folder, 'profiles.txt'), 'w') as f:
        sys.stdout.write("Writing to %s\n" % (os.path.join(folder, 'profiles.txt')))
        for line in profile_doc.readlines():
            line = line.rstrip('\r\n')
            if not line:  # remove the empty line at the end of the file
                continue
            # keep only the ST column and the allele columns of a 7-locus scheme
            cols = line.split("\t")
            f.write("%s\n" % ('\t'.join(cols[0:8])))
    profile_doc.close()

    for locus in species_info.loci:
        locus_path = urlparse(locus.url).path
        locus_filename = locus_path.split('/')[-1]
        # rename e.g. nusA_.tfa or nusA.tfa to nusA.fas
        locus_filename = locus_filename.replace("_.tfa", ".fas")
        locus_filename = locus_filename.replace("tfa", "fas")
        locus_doc = url.urlopen(locus.url)
        with open(os.path.join(folder, locus_filename), 'w') as locus_file:
            locus_fasta_content = locus_doc.read()
            # make sure the allele names don't contain underscores
            locus_fasta_content = locus_fasta_content.replace("_", "-").replace("--", "-")
            sys.stdout.write("Writing to %s\n" % (os.path.join(folder, locus_filename)))
            locus_file.write(locus_fasta_content)
        locus_doc.close()

    get_reference(folder, args.reference)
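
    # At this point the folder for this species and timestamp contains, for
    # a 7-locus scheme, something like the following (names are illustrative):
    #
    #   profiles.txt
    #   adk.fas fumC.fas gyrB.fas icd.fas mdh.fas purA.fas recA.fas
    #   reference.seq plus the BLAST database files built from it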

    # do Galaxy stuff
    if has_galaxy:
        data_manager_dict = {}
        data_manager_dict['data_tables'] = {}
        name = "%s-%s" % (species_info.name, timestamp)
        data_manager_dict['data_tables']['mlst_data'] = [dict(value=species_name_underscores,
                                                              dbkey=species_name_underscores,
                                                              name=name,
                                                              time_stamp=timestamp,
                                                              file_path=folder)]
        # save info to json file
        with open(args.outfile, 'wb') as fjson:
            fjson.write(to_json_string(data_manager_dict))
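
    # For illustration only: the JSON written above looks roughly like this
    # (values depend on the species and the time of the run):
    #
    # {"data_tables": {"mlst_data": [{"value": "Escherichia_coli#1",
    #                                 "dbkey": "Escherichia_coli#1",
    #                                 "name": "Escherichia coli#1-20160711102333",
    #                                 "time_stamp": "20160711102333",
    #                                 "file_path": "/.../Escherichia_coli#1/20160711102333"}]}}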

# end of main --------------------------------------------------------------------------------------

def get_reference(folder, acc):

    # We're getting this file from Japan!
    # It seems to work pretty well until they take down or change their website
    # See: http://www.ncbi.nlm.nih.gov/pubmed/20472643
    refurl = 'http://togows.dbcls.jp/entry/ncbi-nucleotide/%s.fasta' % (acc)
    remote_ref = url.urlopen(refurl)
    ref_filename = os.path.join(folder, 'reference.seq')
    with open(ref_filename, 'wb') as fRef:
        fRef.write(remote_ref.read())
    remote_ref.close()

    # index the reference for BLAST; the database files are written next to
    # reference.seq using the basename "reference"
    cmd = "makeblastdb -in %s -dbtype nucl -out %s" \
          % (ref_filename, ref_filename.replace("reference.seq", "reference"))
    p = subprocess.Popen(cmd,
                         shell=True,
                         stdin=None,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         close_fds=True)
    # communicate() drains the stdout/stderr pipes, so a chatty makeblastdb
    # cannot block on a full pipe buffer the way it could with wait()
    p.communicate()

    return

# --------------------------------------------------------------------------------------------------

# test whether a node is an Element with a specific tag name
def testElementTag(node, name):
    return node.nodeType == node.ELEMENT_NODE and node.localName == name

# --------------------------------------------------------------------------------------------------

# Get the text from an element node with a text node child
def getText(element):
    result = ''
    for node in element.childNodes:
        if node.nodeType == node.TEXT_NODE:
            result += node.data
    return normaliseText(result)

# --------------------------------------------------------------------------------------------------

# remove unwanted whitespace including linebreaks etc.
def normaliseText(text):
    return ' '.join(text.split())
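
# e.g. normaliseText("\n    nusA\n    ") returns "nusA" -- the text nodes in
# dbases.xml come with the surrounding indentation and linebreaks attached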

# --------------------------------------------------------------------------------------------------

# A collection of interesting information about a single species
class SpeciesInfo(object):
    def __init__(self):
        self.name = None  # string name of the species
        self.database_url = None  # URL as string
        self.retrieved = None  # date as string
        self.profiles_url = None  # URL as string
        self.profiles_count = None  # positive integer
        self.loci = []  # list of LocusInfo

    def __str__(self):
        s = "Name: %s\n" % self.name
        s += "Database URL: %s\n" % self.database_url
        s += "Retrieved: %s\n" % self.retrieved
        s += "Profiles URL: %s\n" % self.profiles_url
        s += "Profiles count: %s\n" % self.profiles_count
        s += "Loci: %s\n" % (','.join([str(x) for x in self.loci]))
        return s

# --------------------------------------------------------------------------------------------------

class LocusInfo(object):
    def __init__(self):
        self.url = None
        self.name = None

    def __str__(self):
        return "Locus: name:%s,url:%s" % (self.name, self.url)

# --------------------------------------------------------------------------------------------------

# retrieve the interesting information for a given species element
def getSpeciesInfo(species_node, species):
    this_name = getText(species_node)
    if this_name.startswith(species):
        info = SpeciesInfo()
        info.name = this_name
        for mlst_node in species_node.getElementsByTagName('mlst'):
            for database_node in mlst_node.getElementsByTagName('database'):
                for database_child_node in database_node.childNodes:
                    if testElementTag(database_child_node, 'url'):
                        info.database_url = getText(database_child_node)
                    elif testElementTag(database_child_node, 'retrieved'):
                        info.retrieved = getText(database_child_node)
                    elif testElementTag(database_child_node, 'profiles'):
                        for profile_count in database_child_node.getElementsByTagName('count'):
                            info.profiles_count = getText(profile_count)
                        for profile_url in database_child_node.getElementsByTagName('url'):
                            info.profiles_url = getText(profile_url)
                    elif testElementTag(database_child_node, 'loci'):
                        for locus_node in database_child_node.getElementsByTagName('locus'):
                            locus_info = LocusInfo()
                            locus_info.name = getText(locus_node)
                            for locus_url in locus_node.getElementsByTagName('url'):
                                locus_info.url = getText(locus_url)
                            info.loci.append(locus_info)

        return info
    else:
        return None

# --------------------------------------------------------------------------------------------------

if __name__ == '__main__':
    main()