| 0 | 1 #!/usr/bin/env python | 
|  | 2 | 
|  | 3 import sys | 
|  | 4 import logging | 
|  | 5 import gzip | 
|  | 6 import string | 
|  | 7 import os | 
|  | 8 import tarfile | 
|  | 9 | 
|  | 10 from io import BytesIO | 
|  | 11 from collections import defaultdict | 
|  | 12 | 
|  | 13 try: | 
|  | 14     from Bio.SeqRecord import SeqRecord | 
|  | 15     from Bio import SeqIO | 
|  | 16 except ImportError: | 
|  | 17     message = 'This script requires the BioPython python package\n' | 
|  | 18     sys.stderr.write(message) | 
|  | 19     sys.exit(1) | 
|  | 20 | 
|  | 21 try: | 
|  | 22     from plumbum import cli | 
|  | 23 except ImportError: | 
|  | 24     message = 'This script requires the plumbum python package\n' | 
|  | 25     sys.stderr.write(message) | 
|  | 26     sys.exit(1) | 
|  | 27 | 
|  | 28 try: | 
|  | 29     import ftputil | 
|  | 30 except ImportError: | 
|  | 31     message = 'This script requires the ftputil python package\n' | 
|  | 32     sys.stderr.write(message) | 
|  | 33     sys.exit(1) | 
|  | 34 | 
|  | 35 | 
|  | 36 NON_ID = ''.join(c for c in map(chr, range(256)) if not c.isalnum()) | 
|  | 37 NON_ID = NON_ID.replace('_', '').replace('-', '') | 
|  | 38 NUC_TRANS = string.maketrans('uryswkmbdhvURYSWKMBDHV', 'tnnnnnnnnnnTNNNNNNNNNN') | 
|  | 39 | 
|  | 40 logging.basicConfig(level=logging.INFO, format='%(message)s') | 
|  | 41 | 
|  | 42 | 
|  | 43 class DataDownloader(object): | 
|  | 44     """ Generic class for downloading fasta records via FTP. """ | 
|  | 45 | 
|  | 46     def __init__(self, host_name, base_path, user='anonymous', password='anonymous'): | 
|  | 47 | 
|  | 48         self.group     = 'NoGroup' | 
|  | 49         self.host_name = host_name | 
|  | 50         self.base_path = base_path | 
|  | 51 | 
|  | 52         self.host = ftputil.FTPHost(host_name, user, password) | 
|  | 53         self.host.chdir(base_path) | 
|  | 54 | 
|  | 55     @classmethod | 
|  | 56     def _get_fasta_record(self, record, group, family, organism, identifier, sub_identifier=None, description=''): | 
|  | 57 | 
|  | 58         record.seq._data = record.seq._data.translate(NUC_TRANS) | 
|  | 59 | 
|  | 60         group       = group.translate(None, NON_ID) | 
|  | 61         family      = family.translate(None, NON_ID) | 
|  | 62         organism    = organism.translate(None, NON_ID) | 
|  | 63         identifier  = identifier.translate(None, NON_ID) | 
|  | 64 | 
|  | 65         if sub_identifier: | 
|  | 66             sub_identifier = sub_identifier.translate(None, NON_ID) | 
|  | 67             record.id  = ';'.join([group, family, organism, identifier, sub_identifier]) | 
|  | 68         else: | 
|  | 69             record.id           = ';'.join([group, family, organism, identifier]) | 
|  | 70 | 
|  | 71         record.description  = description | 
|  | 72         record.name         = '' | 
|  | 73 | 
|  | 74         return record | 
|  | 75 | 
|  | 76     def get_fasta_records(self): | 
|  | 77 | 
|  | 78         raise NotImplementedError | 
|  | 79 | 
|  | 80 class SilvaDownlaoder(DataDownloader): | 
|  | 81     """ Downlaods rRNA transcripts from the Silva database. """ | 
|  | 82 | 
|  | 83     def __init__(self, silva_release='111'): | 
|  | 84 | 
|  | 85         self.silva_release      = silva_release | 
|  | 86         self.silva_host_name    = 'ftp.arb-silva.de' | 
|  | 87         self.base_path          = '/release_%s/Exports/' % self.silva_release | 
|  | 88 | 
|  | 89         super(SilvaDownlaoder, self).__init__(self.silva_host_name, self.base_path) | 
|  | 90 | 
|  | 91         self.group = 'rRNA' | 
|  | 92 | 
|  | 93     def get_fasta_records(self): | 
|  | 94 | 
|  | 95         remote_path = self.base_path + 'SSURef_%s_NR_tax_silva_trunc.fasta.tgz' % self.silva_release | 
|  | 96         if self.host.path.isfile(remote_path): | 
|  | 97 | 
|  | 98             with self.host.file(remote_path, 'rb') as remote_handle: | 
|  | 99                 remote_content = BytesIO(remote_handle.read()) | 
|  | 100 | 
|  | 101                 with tarfile.open(fileobj=remote_content) as tar: | 
|  | 102 | 
|  | 103                     for subentry in tar.getnames(): | 
|  | 104                         if subentry.endswith('.fasta'): | 
|  | 105                             logging.debug('Processing rRNA file %s' % subentry) | 
|  | 106                             subhandle = tar.extractfile(subentry) | 
|  | 107 | 
|  | 108                             for record in SeqIO.parse(subhandle, "fasta"): | 
|  | 109                                 identifier  = record.id.split('.')[0] | 
|  | 110                                 description = '_'.join(record.description.split(' ')[1:]) | 
|  | 111                                 taxonomy    = description.split(';') | 
|  | 112                                 organism    = taxonomy[-1] | 
|  | 113                                 family      = 'Unassigned' | 
|  | 114 | 
|  | 115                                 for level in taxonomy: | 
|  | 116                                     if level.endswith('eae'): | 
|  | 117                                         family = level | 
|  | 118                                         break | 
|  | 119 | 
|  | 120                                 yield self._get_fasta_record(record, self.group, family, organism, identifier) | 
|  | 121 | 
|  | 122 class HumanTranscriptDownloader(DataDownloader): | 
|  | 123     """ Downloads human transcripts fasta records that contain the locations | 
|  | 124         of their exons on the human genome within their description line. | 
|  | 125     """ | 
|  | 126 | 
|  | 127     def __init__(self): | 
|  | 128 | 
|  | 129         self.ensemble_host_name  = 'ftp.ensembl.org' | 
|  | 130         self.gtf_path            = '/pub/current_gtf/homo_sapiens' | 
|  | 131         self.cdna_path           = '/pub/current_fasta/homo_sapiens/cdna' | 
|  | 132         self.ncrna_path          = '/pub/current_fasta/homo_sapiens/ncrna' | 
|  | 133 | 
|  | 134         super(HumanTranscriptDownloader, self).__init__(self.ensemble_host_name, self.gtf_path) | 
|  | 135 | 
|  | 136         self.chromosomes    = set() | 
|  | 137         self.genes          = defaultdict(set) | 
|  | 138         self.regions        = defaultdict(set) | 
|  | 139         self.group          = 'Homo_sapiens_cDNA' | 
|  | 140 | 
|  | 141     def _cache_annotations(self): | 
|  | 142         """ Obtains gtf annotations for human transcripts from ensemble and | 
|  | 143             caches them in memory. | 
|  | 144         """ | 
|  | 145 | 
|  | 146         self.host.chdir(self.gtf_path) | 
|  | 147 | 
|  | 148         entries = self.host.listdir(self.host.curdir) | 
|  | 149 | 
|  | 150         for entry in entries: | 
|  | 151 | 
|  | 152             if self.host.path.isfile(entry): | 
|  | 153                 if entry.startswith('Homo_sapiens.') and entry.endswith('.gtf.gz'): | 
|  | 154                     logging.debug('Processing cDNA annotations %s' % entry) | 
|  | 155 | 
|  | 156                     with self.host.file(self.gtf_path + '/' + entry, 'rb') as zipped_handle: | 
|  | 157                         remote_content = BytesIO(zipped_handle.read()) | 
|  | 158 | 
|  | 159                         with gzip.GzipFile(fileobj=remote_content) as gzipfile: | 
|  | 160                             for i, gtf_line in enumerate(gzipfile): | 
|  | 161                                 if i % 100000 == 0: | 
|  | 162                                     logging.debug('Cached %i human transcript annotations' % i) | 
|  | 163 | 
|  | 164                                 self._add_annotation(gtf_line) | 
|  | 165 | 
|  | 166     def _get_raw_transcripts(self): | 
|  | 167         """ Obtains coding and noncoding human transcripts from ensemble and | 
|  | 168             yields them as fasta records if they have exons that have a known location | 
|  | 169             on the human genome. Fasta records contain the exon locations in their header | 
|  | 170             lines. | 
|  | 171         """ | 
|  | 172 | 
|  | 173         for subpath in [self.cdna_path, self.ncrna_path]: | 
|  | 174 | 
|  | 175             self.host.chdir(subpath) | 
|  | 176 | 
|  | 177             entries = self.host.listdir(self.host.curdir) | 
|  | 178 | 
|  | 179             for entry in entries: | 
|  | 180                 if self.host.path.isfile(entry): | 
|  | 181                     if entry.endswith('cdna.all.fa.gz') or entry.endswith('ncrna.fa.gz'): | 
|  | 182 | 
|  | 183                         logging.debug('Processing human transcript file %s' % entry) | 
|  | 184 | 
|  | 185                         with self.host.file(subpath + '/' + entry, 'rb') as zipped_handle: | 
|  | 186                             remote_content = BytesIO(zipped_handle.read()) | 
|  | 187 | 
|  | 188                             with gzip.GzipFile(fileobj=remote_content) as gzipfile: | 
|  | 189                                 for i, record in enumerate(SeqIO.parse(gzipfile, "fasta")): | 
|  | 190 | 
|  | 191                                     if record.id.startswith('ENST'): | 
|  | 192 | 
|  | 193                                         if i % 100000 == 0: | 
|  | 194                                             logging.debug('Retrieved %i human transcripts' % i) | 
|  | 195 | 
|  | 196                                         record.description = '' | 
|  | 197 | 
|  | 198                                         yield record | 
|  | 199 | 
|  | 200     def _add_annotation(self, gtf_line): | 
|  | 201         """ Parses gtf annotations and stores them in memory. """ | 
|  | 202 | 
|  | 203         if gtf_line[0] == '#': | 
|  | 204             return | 
|  | 205 | 
|  | 206         fields = gtf_line.strip().split('\t') | 
|  | 207         chromosome, source, feature, start, end, score, strands, frame, attributes\ | 
|  | 208                                                                     = fields | 
|  | 209 | 
|  | 210         start, end = sorted([int(start), int(end)]) | 
|  | 211 | 
|  | 212         if feature != 'exon': | 
|  | 213             return | 
|  | 214 | 
|  | 215         attributes = attributes.replace(' ', '').split(';') | 
|  | 216         transcript_ids = [identifier.split('"')[1] for identifier in attributes\ | 
|  | 217                                 if identifier.startswith('transcript_id')] | 
|  | 218 | 
|  | 219         gene_names = [identifier.split('"')[1] for identifier in attributes\ | 
|  | 220                                 if identifier.startswith('gene_name')] | 
|  | 221 | 
|  | 222         for transcript_id in transcript_ids: | 
|  | 223             self.genes[transcript_id] = self.genes[transcript_id].union(gene_names) | 
|  | 224             self.regions[transcript_id].add((chromosome, start, end)) | 
|  | 225 | 
|  | 226     def get_fasta_records(self): | 
|  | 227         """ Yields annotated fasta records of human transcripts. """ | 
|  | 228 | 
|  | 229         logging.debug('Caching annotations') | 
|  | 230         self._cache_annotations() | 
|  | 231 | 
|  | 232         logging.debug('Downloading and annotating transcripts') | 
|  | 233         for record in self._get_raw_transcripts(): | 
|  | 234             transcript_id   = record.id | 
|  | 235 | 
|  | 236             if transcript_id in self.regions: | 
|  | 237 | 
|  | 238                 mapping_locations = self.regions[transcript_id] | 
|  | 239 | 
|  | 240                 description = '|'.join([';'.join(map(str, location))\ | 
|  | 241                         for location in mapping_locations]) | 
|  | 242 | 
|  | 243                 # Obtain gene name for transcript | 
|  | 244                 gene_names = self.genes[transcript_id] | 
|  | 245                 if gene_names: | 
|  | 246                     gene_name = list(gene_names)[0] | 
|  | 247                 else: | 
|  | 248                     gene_name = transcript_id | 
|  | 249 | 
|  | 250                 yield self._get_fasta_record(record, self.group , 'Hominidae', 'Homo_sapiens', gene_name, sub_identifier=transcript_id, description=description) | 
|  | 251 | 
|  | 252 class RefSeqDownloader(DataDownloader): | 
|  | 253     """ Generic downloader that known how to interpret and parse genbank records """ | 
|  | 254 | 
|  | 255     def __init__(self, group): | 
|  | 256 | 
|  | 257         self.refseq_host_name  = 'ftp.ncbi.nih.gov' | 
|  | 258         self.base_path         = '/genomes/' + group | 
|  | 259 | 
|  | 260         super(RefSeqDownloader, self).__init__(self.refseq_host_name, self.base_path) | 
|  | 261 | 
|  | 262         self.group              = group | 
|  | 263 | 
|  | 264 | 
|  | 265     @classmethod | 
|  | 266     def _get_project_from_genbank(self, gb_record): | 
|  | 267         """ Parses genbank project identifier from genbank record. """ | 
|  | 268 | 
|  | 269         project = '' | 
|  | 270         for dbxref in gb_record.dbxrefs: | 
|  | 271             for entry in dbxref.split(' '): | 
|  | 272                 if entry.startswith('BioProject'): | 
|  | 273                     project = entry.split(':')[1] | 
|  | 274                     return project | 
|  | 275 | 
|  | 276         if not project: | 
|  | 277             for dbxref in gb_record.dbxrefs: | 
|  | 278                 for entry in dbxref.split(' '): | 
|  | 279                     if entry.startswith('Project'): | 
|  | 280                         project = entry.split(':')[1] | 
|  | 281                         return project | 
|  | 282 | 
|  | 283         return project | 
|  | 284 | 
|  | 285     @classmethod | 
|  | 286     def _get_annotations_from_genbank(self, gb_record): | 
|  | 287         """ Parses taxonomic annotation from genbank record. """ | 
|  | 288 | 
|  | 289         accession   = gb_record.annotations['accessions'][0] | 
|  | 290         organism    = gb_record.annotations['organism'].replace(' ', '_') | 
|  | 291         organism    = '_'.join(organism.split('_')[:2]) | 
|  | 292         taxonomy    = ("; ").join(gb_record.annotations['taxonomy']) | 
|  | 293         gi          = gb_record.annotations['gi'] | 
|  | 294 | 
|  | 295         family      = 'Unassigned' | 
|  | 296         if len(gb_record.annotations['taxonomy']) > 0: | 
|  | 297             for level in gb_record.annotations['taxonomy']: | 
|  | 298                 if level.endswith('dae') or level.endswith('aceae'): | 
|  | 299                     family = level | 
|  | 300                     break | 
|  | 301 | 
|  | 302         if family == 'Unassigned': | 
|  | 303             logging.debug('Unassigned family found based on %s' % taxonomy) | 
|  | 304 | 
|  | 305         return organism, accession, gi, taxonomy, family | 
|  | 306 | 
|  | 307     @classmethod | 
|  | 308     def _get_record_from_genbank(self, gb_record): | 
|  | 309         """ Parses sequence from genbank record. """ | 
|  | 310 | 
|  | 311         return SeqRecord(gb_record.seq) | 
|  | 312 | 
|  | 313     @classmethod | 
|  | 314     def _parse_genbank_records(self, file_handle): | 
|  | 315 | 
|  | 316         for gb_record in SeqIO.parse(file_handle, 'genbank'): | 
|  | 317 | 
|  | 318             project = self._get_project_from_genbank(gb_record) | 
|  | 319             organism, accession, gi, taxonomy, family =\ | 
|  | 320                 self._get_annotations_from_genbank(gb_record) | 
|  | 321             record = self._get_record_from_genbank(gb_record) | 
|  | 322 | 
|  | 323             if len(record) > 0: | 
|  | 324                 yield record, project, taxonomy, family, organism, gi | 
|  | 325 | 
|  | 326     def _get_nc(self, entry_path): | 
|  | 327 | 
|  | 328         if self.host.path.isfile(entry_path): | 
|  | 329             logging.debug('Processing refseq entry %s' % entry_path) | 
| 1 | 330             with self.host.file(entry_path) as remote_handle: | 
| 0 | 331 | 
|  | 332 | 
| 1 | 333                 for record, project, taxonomy, family, organism, gi\ | 
|  | 334                     in self._parse_genbank_records(remote_handle): | 
|  | 335 | 
|  | 336                     cleaned = self._get_fasta_record(record, self.group , family, organism, gi) | 
|  | 337 | 
|  | 338                     yield cleaned | 
| 0 | 339 | 
|  | 340     def _get_nz(self, entry_path): | 
|  | 341 | 
|  | 342         scaffold_path = entry_path.replace('.gbk', '.scaffold.gbk.tgz') | 
|  | 343         if self.host.path.isfile(scaffold_path): | 
|  | 344             logging.debug('Processing refseq entry %s' % scaffold_path) | 
|  | 345             with self.host.file(scaffold_path, 'rb') as remote_handle: | 
|  | 346                 remote_content = BytesIO(remote_handle.read()) | 
|  | 347                 with tarfile.open(fileobj=remote_content) as tar: | 
|  | 348                     for subentry in tar.getnames(): | 
|  | 349                         if subentry.endswith('.gbk'): | 
|  | 350                             logging.debug('Processing subaccession %s' % subentry) | 
|  | 351                             subhandle = tar.extractfile(subentry) | 
|  | 352                             for record, project, taxonomy, family, organism, gi in self._parse_genbank_records(subhandle): | 
|  | 353                                 yield self._get_fasta_record(record, self.group , family, organism, gi) | 
|  | 354 | 
|  | 355 | 
|  | 356     def get_fasta_records(self): | 
|  | 357 | 
|  | 358         species_dirs = self.host.listdir(self.host.curdir) | 
|  | 359 | 
|  | 360         for species_dir in species_dirs: | 
|  | 361 | 
|  | 362             species_path = self.base_path + '/' + species_dir | 
|  | 363 | 
|  | 364             if self.host.path.isdir(species_path): | 
|  | 365 | 
|  | 366                 self.host.chdir(species_path) | 
|  | 367 | 
|  | 368                 logging.debug('Processing species directory %s' % species_path) | 
|  | 369 | 
|  | 370                 if self.host.path.isdir(species_path): | 
|  | 371 | 
|  | 372                     self.host.chdir(species_path) | 
|  | 373 | 
|  | 374                     entries = self.host.listdir(self.host.curdir) | 
|  | 375 | 
|  | 376                     for entry in entries: | 
|  | 377 | 
|  | 378                         if self.host.path.isfile(self.host.curdir + '/' + entry): | 
|  | 379 | 
|  | 380                             if entry.endswith('.gbk') or entry.endswith('.gbk.gz'): | 
|  | 381 | 
|  | 382                                 logging.debug('Processing entry %s' % entry) | 
|  | 383 | 
|  | 384                                 entry_path = species_path + '/' + entry | 
|  | 385 | 
|  | 386                                 if entry.startswith('NC_'): | 
|  | 387                                     for record in self._get_nc(entry_path): | 
|  | 388                                         yield record | 
|  | 389 | 
|  | 390                                 elif entry.startswith('NZ_'): | 
|  | 391                                     for record in self._get_nz(entry_path): | 
|  | 392                                         yield record | 
|  | 393 | 
|  | 394 class HumanGenomeDownloader(DataDownloader): | 
|  | 395 | 
|  | 396     def __init__(self): | 
|  | 397 | 
|  | 398         self.ensemble_host_name  = 'ftp.ensembl.org' | 
|  | 399         self.base_path           = '/pub/current_fasta/homo_sapiens/dna' | 
|  | 400 | 
|  | 401         super(HumanGenomeDownloader, self).__init__(self.ensemble_host_name, self.base_path) | 
|  | 402 | 
|  | 403         self.group          = 'Homo_sapiens' | 
|  | 404 | 
|  | 405     def get_fasta_records(self): | 
|  | 406 | 
|  | 407         entries = self.host.listdir(self.host.curdir) | 
|  | 408         for entry in entries: | 
|  | 409 | 
|  | 410             if self.host.path.isfile(entry): | 
|  | 411 | 
|  | 412                 # You may want to change your filtering criteria here if you would like to include patches and haplotypes | 
|  | 413                 if entry.endswith('fa.gz') and 'dna_sm.primary_assembly' in entry: | 
|  | 414 | 
|  | 415                     logging.debug('Processing human entry %s' % entry) | 
|  | 416 | 
|  | 417                     with self.host.file(self.host.getcwd() + '/' + entry, 'rb') as zipped_handle: | 
|  | 418                         remote_content = BytesIO(zipped_handle.read()) | 
|  | 419 | 
|  | 420                         with gzip.GzipFile(fileobj=remote_content) as gzipfile: | 
|  | 421 | 
|  | 422                             for record in SeqIO.parse(gzipfile, "fasta"): | 
|  | 423 | 
|  | 424                                 identifier      = record.id | 
|  | 425                                 organism        = 'Homo_sapiens' | 
|  | 426                                 family          = 'Hominidae' | 
|  | 427 | 
|  | 428                                 yield self._get_fasta_record(record, self.group, family, organism, identifier) | 
|  | 429 | 
|  | 430 class UniVecDownloader(DataDownloader): | 
|  | 431 | 
|  | 432     def __init__(self): | 
|  | 433 | 
|  | 434         self.univec_host_name  = 'ftp.ncbi.nih.gov' | 
|  | 435         self.base_path         = '/pub/UniVec' | 
|  | 436 | 
|  | 437         super(UniVecDownloader, self).__init__(self.univec_host_name, self.base_path) | 
|  | 438 | 
|  | 439         self.group          = 'UniVec' | 
|  | 440 | 
|  | 441     def get_fasta_records(self): | 
|  | 442 | 
|  | 443         logging.debug('Processing Univec entries') | 
|  | 444 | 
|  | 445         with self.host.file(self.host.getcwd() + '/' + 'UniVec') as remote_handle: | 
|  | 446             for record in SeqIO.parse(remote_handle, "fasta"): | 
|  | 447                 organism    = '_'.join(record.description.split(' ')[1:]) | 
|  | 448                 family      = 'Unassigned' | 
|  | 449                 identifier  = '000000000' | 
|  | 450 | 
|  | 451                 yield self._get_fasta_record(record, self.group, family, organism, identifier) | 
|  | 452 | 
|  | 453 class CLI(cli.Application): | 
|  | 454     """""" | 
|  | 455     PROGNAME = "metamap" | 
|  | 456     VERSION = "1.0.0" | 
|  | 457     DESCRIPTION = """""" | 
|  | 458 | 
|  | 459     def main(self, *args): | 
|  | 460 | 
|  | 461         if args: | 
|  | 462             print("Unknown command %r" % (args[0])) | 
|  | 463             print self.USAGE | 
|  | 464             return 1 | 
|  | 465 | 
|  | 466         if not self.nested_command: | 
|  | 467             print("No command given") | 
|  | 468             print self.USAGE | 
|  | 469             return 1 | 
|  | 470 | 
|  | 471 @CLI.subcommand("fasta") | 
|  | 472 class References(cli.Application): | 
|  | 473     """ Obtains NCBI RefSeq and Ensembl reference genomes and transcripts""" | 
|  | 474 | 
|  | 475     valid_references = ['Fungi', 'Fungi_DRAFT', 'Bacteria', | 
|  | 476               'Bacteria_DRAFT', 'Homo_sapiens', | 
|  | 477               'Viruses', 'UniVec', 'Plasmids', | 
|  | 478               'Protozoa', 'rRNA', 'Homo_sapiens_cDNA'] | 
|  | 479 | 
|  | 480     references = cli.SwitchAttr(['-r', '--reference'], | 
|  | 481                                    cli.Set(*valid_references, case_sensitive=True), | 
|  | 482                                    list=True, default=valid_references, | 
|  | 483                                    mandatory=False, | 
|  | 484                                    help="Sets the kind of references to obtain") | 
|  | 485 | 
|  | 486     fasta_path  = cli.SwitchAttr(['-o', '--output_file'], str, mandatory=True, | 
|  | 487                                  help="Sets the fasta output file") | 
|  | 488 | 
|  | 489     zipped      = cli.Flag(["-z", "--zipped"], help="Write gzipped output") | 
|  | 490     debug       = cli.Flag(["-d", "--debug"], help="Enable debug messages") | 
|  | 491 | 
|  | 492 | 
|  | 493     def main(self): | 
|  | 494         """ Downloads genome fasta files from reference databases""" | 
|  | 495 | 
|  | 496         if self.debug: | 
|  | 497             logging.getLogger().setLevel(logging.DEBUG) | 
|  | 498 | 
|  | 499         if os.path.dirname(self.fasta_path) and not os.path.exists(os.path.dirname(self.fasta_path)): | 
|  | 500             logging.debug('Making directories for output file %s' % self.fasta_path) | 
|  | 501             os.makedirs(os.path.dirname(self.fasta_path)) | 
|  | 502 | 
|  | 503         if self.zipped: | 
|  | 504             logging.debug('Making zipped output file %s' % self.fasta_path) | 
|  | 505             output_file = gzip.open(self.fasta_path, 'wb') | 
|  | 506 | 
|  | 507         else: | 
|  | 508             logging.debug('Making regular output file %s' % self.fasta_path) | 
|  | 509             output_file = open(self.fasta_path, 'w', buffering=100 * 1024 * 1024) | 
|  | 510 | 
|  | 511 | 
|  | 512         for reference in self.references: | 
|  | 513 | 
|  | 514             if reference == 'UniVec': | 
|  | 515                 downloader = UniVecDownloader() | 
|  | 516             elif reference == 'Homo_sapiens': | 
|  | 517                 downloader = HumanGenomeDownloader() | 
|  | 518             elif reference == 'rRNA': | 
|  | 519                 downloader = SilvaDownlaoder() | 
|  | 520             elif reference == 'Homo_sapiens_cDNA': | 
|  | 521                 downloader = HumanTranscriptDownloader() | 
|  | 522             else: | 
|  | 523                 downloader = RefSeqDownloader(group=reference) | 
|  | 524 | 
|  | 525             for record in downloader.get_fasta_records(): | 
|  | 526                 SeqIO.write([record], output_file, "fasta") | 
|  | 527 | 
|  | 528         output_file.close() | 
|  | 529 | 
|  | 530 @CLI.subcommand("blast") | 
|  | 531 class Blast(cli.Application): | 
|  | 532     """ Obtains blast databases """ | 
|  | 533 | 
|  | 534     valid_references = ['nt', 'nr'] | 
|  | 535 | 
|  | 536     references = cli.SwitchAttr(['-r', '--reference_database'], | 
|  | 537                                    cli.Set(*valid_references, case_sensitive=True), | 
|  | 538                                    list=True, default=valid_references, | 
|  | 539                                    mandatory=False, | 
|  | 540                                    help="Sets the kind of database to obtain") | 
|  | 541 | 
|  | 542     database_path  = cli.SwitchAttr(['-o', '--output_path'], str, mandatory=True, | 
|  | 543                                  help="Sets the fasta output file") | 
|  | 544 | 
|  | 545     debug       = cli.Flag(["-d", "--debug"], help="Enable debug messages") | 
|  | 546 | 
|  | 547     def main(self): | 
|  | 548         """ Downloads blast database files """ | 
|  | 549 | 
|  | 550         if self.debug: | 
|  | 551             logging.getLogger().setLevel(logging.DEBUG) | 
|  | 552 | 
|  | 553         if not os.path.exists(self.database_path): | 
|  | 554             logging.debug('Making output directory %s' % self.database_path) | 
|  | 555             os.makedirs(self.database_path) | 
|  | 556 | 
|  | 557         host_name = 'ftp.ncbi.nih.gov' | 
|  | 558         host = ftputil.FTPHost(host_name, 'anonymous', 'anonymous') | 
|  | 559 | 
|  | 560         for reference in self.references: | 
|  | 561 | 
|  | 562             path = '/blast/db' | 
|  | 563             host.chdir(path) | 
|  | 564 | 
|  | 565             entries = host.listdir(host.curdir) | 
|  | 566 | 
|  | 567             for entry in entries: | 
|  | 568 | 
|  | 569                 if host.path.isfile(entry): | 
|  | 570 | 
|  | 571                     if entry.startswith(reference) and entry.endswith('.tar.gz'): | 
|  | 572                         logging.debug('Downloading %s' % entry) | 
|  | 573                         local_path = os.path.join(self.database_path, entry) | 
|  | 574                         host.download(entry, local_path, 'b') | 
|  | 575                         logging.debug('Unpacking %s' % entry) | 
|  | 576                         tfile = tarfile.open(local_path, 'r:gz') | 
|  | 577                         tfile.extractall(self.database_path) | 
|  | 578                         os.remove(local_path) | 
|  | 579 | 
|  | 580 if __name__ == "__main__": | 
|  | 581     CLI.run() | 
|  | 582 |