comparison vref.py @ 0:3ba5983012cf draft

Uploaded
author mzeidler
date Tue, 24 Sep 2013 10:19:40 -0400
parents
children 3eddea30be1a
comparison
equal deleted inserted replaced
-1:000000000000 0:3ba5983012cf
1 #!/usr/bin/env python
2
3 import sys
4 import logging
5 import gzip
6 import string
7 import os
8 import tarfile
9
10 from io import BytesIO
11 from collections import defaultdict
12
13 try:
14 from Bio.SeqRecord import SeqRecord
15 from Bio import SeqIO
16 except ImportError:
17 message = 'This script requires the BioPython python package\n'
18 sys.stderr.write(message)
19 sys.exit(1)
20
21 try:
22 from plumbum import cli
23 except ImportError:
24 message = 'This script requires the plumbum python package\n'
25 sys.stderr.write(message)
26 sys.exit(1)
27
28 try:
29 import ftputil
30 except ImportError:
31 message = 'This script requires the ftputil python package\n'
32 sys.stderr.write(message)
33 sys.exit(1)
34
35
36 NON_ID = ''.join(c for c in map(chr, range(256)) if not c.isalnum())
37 NON_ID = NON_ID.replace('_', '').replace('-', '')
38 NUC_TRANS = string.maketrans('uryswkmbdhvURYSWKMBDHV', 'tnnnnnnnnnnTNNNNNNNNNN')
39
40 logging.basicConfig(level=logging.INFO, format='%(message)s')
41
42
43 class DataDownloader(object):
44 """ Generic class for downloading fasta records via FTP. """
45
46 def __init__(self, host_name, base_path, user='anonymous', password='anonymous'):
47
48 self.group = 'NoGroup'
49 self.host_name = host_name
50 self.base_path = base_path
51
52 self.host = ftputil.FTPHost(host_name, user, password)
53 self.host.chdir(base_path)
54
55 @classmethod
56 def _get_fasta_record(self, record, group, family, organism, identifier, sub_identifier=None, description=''):
57
58 record.seq._data = record.seq._data.translate(NUC_TRANS)
59
60 group = group.translate(None, NON_ID)
61 family = family.translate(None, NON_ID)
62 organism = organism.translate(None, NON_ID)
63 identifier = identifier.translate(None, NON_ID)
64
65 if sub_identifier:
66 sub_identifier = sub_identifier.translate(None, NON_ID)
67 record.id = ';'.join([group, family, organism, identifier, sub_identifier])
68 else:
69 record.id = ';'.join([group, family, organism, identifier])
70
71 record.description = description
72 record.name = ''
73
74 return record
75
76 def get_fasta_records(self):
77
78 raise NotImplementedError
79
80 class SilvaDownlaoder(DataDownloader):
81 """ Downlaods rRNA transcripts from the Silva database. """
82
83 def __init__(self, silva_release='111'):
84
85 self.silva_release = silva_release
86 self.silva_host_name = 'ftp.arb-silva.de'
87 self.base_path = '/release_%s/Exports/' % self.silva_release
88
89 super(SilvaDownlaoder, self).__init__(self.silva_host_name, self.base_path)
90
91 self.group = 'rRNA'
92
93 def get_fasta_records(self):
94
95 remote_path = self.base_path + 'SSURef_%s_NR_tax_silva_trunc.fasta.tgz' % self.silva_release
96 if self.host.path.isfile(remote_path):
97
98 with self.host.file(remote_path, 'rb') as remote_handle:
99 remote_content = BytesIO(remote_handle.read())
100
101 with tarfile.open(fileobj=remote_content) as tar:
102
103 for subentry in tar.getnames():
104 if subentry.endswith('.fasta'):
105 logging.debug('Processing rRNA file %s' % subentry)
106 subhandle = tar.extractfile(subentry)
107
108 for record in SeqIO.parse(subhandle, "fasta"):
109 identifier = record.id.split('.')[0]
110 description = '_'.join(record.description.split(' ')[1:])
111 taxonomy = description.split(';')
112 organism = taxonomy[-1]
113 family = 'Unassigned'
114
115 for level in taxonomy:
116 if level.endswith('eae'):
117 family = level
118 break
119
120 yield self._get_fasta_record(record, self.group, family, organism, identifier)
121
122 class HumanTranscriptDownloader(DataDownloader):
123 """ Downloads human transcripts fasta records that contain the locations
124 of their exons on the human genome within their description line.
125 """
126
127 def __init__(self):
128
129 self.ensemble_host_name = 'ftp.ensembl.org'
130 self.gtf_path = '/pub/current_gtf/homo_sapiens'
131 self.cdna_path = '/pub/current_fasta/homo_sapiens/cdna'
132 self.ncrna_path = '/pub/current_fasta/homo_sapiens/ncrna'
133
134 super(HumanTranscriptDownloader, self).__init__(self.ensemble_host_name, self.gtf_path)
135
136 self.chromosomes = set()
137 self.genes = defaultdict(set)
138 self.regions = defaultdict(set)
139 self.group = 'Homo_sapiens_cDNA'
140
141 def _cache_annotations(self):
142 """ Obtains gtf annotations for human transcripts from ensemble and
143 caches them in memory.
144 """
145
146 self.host.chdir(self.gtf_path)
147
148 entries = self.host.listdir(self.host.curdir)
149
150 for entry in entries:
151
152 if self.host.path.isfile(entry):
153 if entry.startswith('Homo_sapiens.') and entry.endswith('.gtf.gz'):
154 logging.debug('Processing cDNA annotations %s' % entry)
155
156 with self.host.file(self.gtf_path + '/' + entry, 'rb') as zipped_handle:
157 remote_content = BytesIO(zipped_handle.read())
158
159 with gzip.GzipFile(fileobj=remote_content) as gzipfile:
160 for i, gtf_line in enumerate(gzipfile):
161 if i % 100000 == 0:
162 logging.debug('Cached %i human transcript annotations' % i)
163
164 self._add_annotation(gtf_line)
165
166 def _get_raw_transcripts(self):
167 """ Obtains coding and noncoding human transcripts from ensemble and
168 yields them as fasta records if they have exons that have a known location
169 on the human genome. Fasta records contain the exon locations in their header
170 lines.
171 """
172
173 for subpath in [self.cdna_path, self.ncrna_path]:
174
175 self.host.chdir(subpath)
176
177 entries = self.host.listdir(self.host.curdir)
178
179 for entry in entries:
180 if self.host.path.isfile(entry):
181 if entry.endswith('cdna.all.fa.gz') or entry.endswith('ncrna.fa.gz'):
182
183 logging.debug('Processing human transcript file %s' % entry)
184
185 with self.host.file(subpath + '/' + entry, 'rb') as zipped_handle:
186 remote_content = BytesIO(zipped_handle.read())
187
188 with gzip.GzipFile(fileobj=remote_content) as gzipfile:
189 for i, record in enumerate(SeqIO.parse(gzipfile, "fasta")):
190
191 if record.id.startswith('ENST'):
192
193 if i % 100000 == 0:
194 logging.debug('Retrieved %i human transcripts' % i)
195
196 record.description = ''
197
198 yield record
199
200 def _add_annotation(self, gtf_line):
201 """ Parses gtf annotations and stores them in memory. """
202
203 if gtf_line[0] == '#':
204 return
205
206 fields = gtf_line.strip().split('\t')
207 chromosome, source, feature, start, end, score, strands, frame, attributes\
208 = fields
209
210 start, end = sorted([int(start), int(end)])
211
212 if feature != 'exon':
213 return
214
215 attributes = attributes.replace(' ', '').split(';')
216 transcript_ids = [identifier.split('"')[1] for identifier in attributes\
217 if identifier.startswith('transcript_id')]
218
219 gene_names = [identifier.split('"')[1] for identifier in attributes\
220 if identifier.startswith('gene_name')]
221
222 for transcript_id in transcript_ids:
223 self.genes[transcript_id] = self.genes[transcript_id].union(gene_names)
224 self.regions[transcript_id].add((chromosome, start, end))
225
226 def get_fasta_records(self):
227 """ Yields annotated fasta records of human transcripts. """
228
229 logging.debug('Caching annotations')
230 self._cache_annotations()
231
232 logging.debug('Downloading and annotating transcripts')
233 for record in self._get_raw_transcripts():
234 transcript_id = record.id
235
236 if transcript_id in self.regions:
237
238 mapping_locations = self.regions[transcript_id]
239
240 description = '|'.join([';'.join(map(str, location))\
241 for location in mapping_locations])
242
243 # Obtain gene name for transcript
244 gene_names = self.genes[transcript_id]
245 if gene_names:
246 gene_name = list(gene_names)[0]
247 else:
248 gene_name = transcript_id
249
250 yield self._get_fasta_record(record, self.group , 'Hominidae', 'Homo_sapiens', gene_name, sub_identifier=transcript_id, description=description)
251
252 class RefSeqDownloader(DataDownloader):
253 """ Generic downloader that known how to interpret and parse genbank records """
254
255 def __init__(self, group):
256
257 self.refseq_host_name = 'ftp.ncbi.nih.gov'
258 self.base_path = '/genomes/' + group
259
260 super(RefSeqDownloader, self).__init__(self.refseq_host_name, self.base_path)
261
262 self.group = group
263
264
265 @classmethod
266 def _get_project_from_genbank(self, gb_record):
267 """ Parses genbank project identifier from genbank record. """
268
269 project = ''
270 for dbxref in gb_record.dbxrefs:
271 for entry in dbxref.split(' '):
272 if entry.startswith('BioProject'):
273 project = entry.split(':')[1]
274 return project
275
276 if not project:
277 for dbxref in gb_record.dbxrefs:
278 for entry in dbxref.split(' '):
279 if entry.startswith('Project'):
280 project = entry.split(':')[1]
281 return project
282
283 return project
284
285 @classmethod
286 def _get_annotations_from_genbank(self, gb_record):
287 """ Parses taxonomic annotation from genbank record. """
288
289 accession = gb_record.annotations['accessions'][0]
290 organism = gb_record.annotations['organism'].replace(' ', '_')
291 organism = '_'.join(organism.split('_')[:2])
292 taxonomy = ("; ").join(gb_record.annotations['taxonomy'])
293 gi = gb_record.annotations['gi']
294
295 family = 'Unassigned'
296 if len(gb_record.annotations['taxonomy']) > 0:
297 for level in gb_record.annotations['taxonomy']:
298 if level.endswith('dae') or level.endswith('aceae'):
299 family = level
300 break
301
302 if family == 'Unassigned':
303 logging.debug('Unassigned family found based on %s' % taxonomy)
304
305 return organism, accession, gi, taxonomy, family
306
307 @classmethod
308 def _get_record_from_genbank(self, gb_record):
309 """ Parses sequence from genbank record. """
310
311 return SeqRecord(gb_record.seq)
312
313 @classmethod
314 def _parse_genbank_records(self, file_handle):
315
316 for gb_record in SeqIO.parse(file_handle, 'genbank'):
317
318 project = self._get_project_from_genbank(gb_record)
319 organism, accession, gi, taxonomy, family =\
320 self._get_annotations_from_genbank(gb_record)
321 record = self._get_record_from_genbank(gb_record)
322
323 if len(record) > 0:
324 yield record, project, taxonomy, family, organism, gi
325
326 def _get_nc(self, entry_path):
327
328 if self.host.path.isfile(entry_path):
329 logging.debug('Processing refseq entry %s' % entry_path)
330 remote_handle = self.host.file(entry_path)
331
332
333 for record, project, taxonomy, family, organism, gi\
334 in self._parse_genbank_records(remote_handle):
335
336 cleaned = self._get_fasta_record(record, self.group , family, organism, gi)
337
338 yield cleaned
339
340 def _get_nz(self, entry_path):
341
342 scaffold_path = entry_path.replace('.gbk', '.scaffold.gbk.tgz')
343 if self.host.path.isfile(scaffold_path):
344 logging.debug('Processing refseq entry %s' % scaffold_path)
345 with self.host.file(scaffold_path, 'rb') as remote_handle:
346 remote_content = BytesIO(remote_handle.read())
347 with tarfile.open(fileobj=remote_content) as tar:
348 for subentry in tar.getnames():
349 if subentry.endswith('.gbk'):
350 logging.debug('Processing subaccession %s' % subentry)
351 subhandle = tar.extractfile(subentry)
352 for record, project, taxonomy, family, organism, gi in self._parse_genbank_records(subhandle):
353 yield self._get_fasta_record(record, self.group , family, organism, gi)
354
355
356 def get_fasta_records(self):
357
358 species_dirs = self.host.listdir(self.host.curdir)
359
360 for species_dir in species_dirs:
361
362 species_path = self.base_path + '/' + species_dir
363
364 if self.host.path.isdir(species_path):
365
366 self.host.chdir(species_path)
367
368 logging.debug('Processing species directory %s' % species_path)
369
370 if self.host.path.isdir(species_path):
371
372 self.host.chdir(species_path)
373
374 entries = self.host.listdir(self.host.curdir)
375
376 for entry in entries:
377
378 if self.host.path.isfile(self.host.curdir + '/' + entry):
379
380 if entry.endswith('.gbk') or entry.endswith('.gbk.gz'):
381
382 logging.debug('Processing entry %s' % entry)
383
384 entry_path = species_path + '/' + entry
385
386 if entry.startswith('NC_'):
387 for record in self._get_nc(entry_path):
388 yield record
389
390 elif entry.startswith('NZ_'):
391 for record in self._get_nz(entry_path):
392 yield record
393
394 class HumanGenomeDownloader(DataDownloader):
395
396 def __init__(self):
397
398 self.ensemble_host_name = 'ftp.ensembl.org'
399 self.base_path = '/pub/current_fasta/homo_sapiens/dna'
400
401 super(HumanGenomeDownloader, self).__init__(self.ensemble_host_name, self.base_path)
402
403 self.group = 'Homo_sapiens'
404
405 def get_fasta_records(self):
406
407 entries = self.host.listdir(self.host.curdir)
408 for entry in entries:
409
410 if self.host.path.isfile(entry):
411
412 # You may want to change your filtering criteria here if you would like to include patches and haplotypes
413 if entry.endswith('fa.gz') and 'dna_sm.primary_assembly' in entry:
414
415 logging.debug('Processing human entry %s' % entry)
416
417 with self.host.file(self.host.getcwd() + '/' + entry, 'rb') as zipped_handle:
418 remote_content = BytesIO(zipped_handle.read())
419
420 with gzip.GzipFile(fileobj=remote_content) as gzipfile:
421
422 for record in SeqIO.parse(gzipfile, "fasta"):
423
424 identifier = record.id
425 organism = 'Homo_sapiens'
426 family = 'Hominidae'
427
428 yield self._get_fasta_record(record, self.group, family, organism, identifier)
429
430 class UniVecDownloader(DataDownloader):
431
432 def __init__(self):
433
434 self.univec_host_name = 'ftp.ncbi.nih.gov'
435 self.base_path = '/pub/UniVec'
436
437 super(UniVecDownloader, self).__init__(self.univec_host_name, self.base_path)
438
439 self.group = 'UniVec'
440
441 def get_fasta_records(self):
442
443 logging.debug('Processing Univec entries')
444
445 with self.host.file(self.host.getcwd() + '/' + 'UniVec') as remote_handle:
446 for record in SeqIO.parse(remote_handle, "fasta"):
447 organism = '_'.join(record.description.split(' ')[1:])
448 family = 'Unassigned'
449 identifier = '000000000'
450
451 yield self._get_fasta_record(record, self.group, family, organism, identifier)
452
453 class CLI(cli.Application):
454 """"""
455 PROGNAME = "metamap"
456 VERSION = "1.0.0"
457 DESCRIPTION = """"""
458
459 def main(self, *args):
460
461 if args:
462 print("Unknown command %r" % (args[0]))
463 print self.USAGE
464 return 1
465
466 if not self.nested_command:
467 print("No command given")
468 print self.USAGE
469 return 1
470
471 @CLI.subcommand("fasta")
472 class References(cli.Application):
473 """ Obtains NCBI RefSeq and Ensembl reference genomes and transcripts"""
474
475 valid_references = ['Fungi', 'Fungi_DRAFT', 'Bacteria',
476 'Bacteria_DRAFT', 'Homo_sapiens',
477 'Viruses', 'UniVec', 'Plasmids',
478 'Protozoa', 'rRNA', 'Homo_sapiens_cDNA']
479
480 references = cli.SwitchAttr(['-r', '--reference'],
481 cli.Set(*valid_references, case_sensitive=True),
482 list=True, default=valid_references,
483 mandatory=False,
484 help="Sets the kind of references to obtain")
485
486 fasta_path = cli.SwitchAttr(['-o', '--output_file'], str, mandatory=True,
487 help="Sets the fasta output file")
488
489 zipped = cli.Flag(["-z", "--zipped"], help="Write gzipped output")
490 debug = cli.Flag(["-d", "--debug"], help="Enable debug messages")
491
492
493 def main(self):
494 """ Downloads genome fasta files from reference databases"""
495
496 if self.debug:
497 logging.getLogger().setLevel(logging.DEBUG)
498
499 if os.path.dirname(self.fasta_path) and not os.path.exists(os.path.dirname(self.fasta_path)):
500 logging.debug('Making directories for output file %s' % self.fasta_path)
501 os.makedirs(os.path.dirname(self.fasta_path))
502
503 if self.zipped:
504 logging.debug('Making zipped output file %s' % self.fasta_path)
505 output_file = gzip.open(self.fasta_path, 'wb')
506
507 else:
508 logging.debug('Making regular output file %s' % self.fasta_path)
509 output_file = open(self.fasta_path, 'w', buffering=100 * 1024 * 1024)
510
511
512 for reference in self.references:
513
514 if reference == 'UniVec':
515 downloader = UniVecDownloader()
516 elif reference == 'Homo_sapiens':
517 downloader = HumanGenomeDownloader()
518 elif reference == 'rRNA':
519 downloader = SilvaDownlaoder()
520 elif reference == 'Homo_sapiens_cDNA':
521 downloader = HumanTranscriptDownloader()
522 else:
523 downloader = RefSeqDownloader(group=reference)
524
525 for record in downloader.get_fasta_records():
526 SeqIO.write([record], output_file, "fasta")
527
528 output_file.close()
529
530 @CLI.subcommand("blast")
531 class Blast(cli.Application):
532 """ Obtains blast databases """
533
534 valid_references = ['nt', 'nr']
535
536 references = cli.SwitchAttr(['-r', '--reference_database'],
537 cli.Set(*valid_references, case_sensitive=True),
538 list=True, default=valid_references,
539 mandatory=False,
540 help="Sets the kind of database to obtain")
541
542 database_path = cli.SwitchAttr(['-o', '--output_path'], str, mandatory=True,
543 help="Sets the fasta output file")
544
545 debug = cli.Flag(["-d", "--debug"], help="Enable debug messages")
546
547 def main(self):
548 """ Downloads blast database files """
549
550 if self.debug:
551 logging.getLogger().setLevel(logging.DEBUG)
552
553 if not os.path.exists(self.database_path):
554 logging.debug('Making output directory %s' % self.database_path)
555 os.makedirs(self.database_path)
556
557 host_name = 'ftp.ncbi.nih.gov'
558 host = ftputil.FTPHost(host_name, 'anonymous', 'anonymous')
559
560 for reference in self.references:
561
562 path = '/blast/db'
563 host.chdir(path)
564
565 entries = host.listdir(host.curdir)
566
567 for entry in entries:
568
569 if host.path.isfile(entry):
570
571 if entry.startswith(reference) and entry.endswith('.tar.gz'):
572 logging.debug('Downloading %s' % entry)
573 local_path = os.path.join(self.database_path, entry)
574 host.download(entry, local_path, 'b')
575 logging.debug('Unpacking %s' % entry)
576 tfile = tarfile.open(local_path, 'r:gz')
577 tfile.extractall(self.database_path)
578 os.remove(local_path)
579
580 if __name__ == "__main__":
581 CLI.run()
582