diff vref.py @ 0:f63d639b223c draft
Initial Upload
author      mzeidler
date        Wed, 25 Sep 2013 11:41:16 -0400
parents     (none)
children    (none)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/vref.py	Wed Sep 25 11:41:16 2013 -0400
@@ -0,0 +1,582 @@
+#!/usr/bin/env python
+
+import sys
+import logging
+import gzip
+import string
+import os
+import tarfile
+
+from io import BytesIO
+from collections import defaultdict
+
+try:
+    from Bio.SeqRecord import SeqRecord
+    from Bio import SeqIO
+except ImportError:
+    message = 'This script requires the BioPython python package\n'
+    sys.stderr.write(message)
+    sys.exit(1)
+
+try:
+    from plumbum import cli
+except ImportError:
+    message = 'This script requires the plumbum python package\n'
+    sys.stderr.write(message)
+    sys.exit(1)
+
+try:
+    import ftputil
+except ImportError:
+    message = 'This script requires the ftputil python package\n'
+    sys.stderr.write(message)
+    sys.exit(1)
+
+
+NON_ID = ''.join(c for c in map(chr, range(256)) if not c.isalnum())
+NON_ID = NON_ID.replace('_', '').replace('-', '')
+NUC_TRANS = string.maketrans('uryswkmbdhvURYSWKMBDHV', 'tnnnnnnnnnnTNNNNNNNNNN')
+
+logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+
+class DataDownloader(object):
+    """ Generic class for downloading fasta records via FTP. """
+
+    def __init__(self, host_name, base_path, user='anonymous', password='anonymous'):
+
+        self.group = 'NoGroup'
+        self.host_name = host_name
+        self.base_path = base_path
+
+        self.host = ftputil.FTPHost(host_name, user, password)
+        self.host.chdir(base_path)
+
+    @classmethod
+    def _get_fasta_record(self, record, group, family, organism, identifier, sub_identifier=None, description=''):
+
+        record.seq._data = record.seq._data.translate(NUC_TRANS)
+
+        group = group.translate(None, NON_ID)
+        family = family.translate(None, NON_ID)
+        organism = organism.translate(None, NON_ID)
+        identifier = identifier.translate(None, NON_ID)
+
+        if sub_identifier:
+            sub_identifier = sub_identifier.translate(None, NON_ID)
+            record.id = ';'.join([group, family, organism, identifier, sub_identifier])
+        else:
+            record.id = ';'.join([group, family, organism, identifier])
+
+        record.description = description
+        record.name = ''
+
+        return record
+
+    def get_fasta_records(self):
+
+        raise NotImplementedError
+
+class SilvaDownlaoder(DataDownloader):
+    """ Downloads rRNA transcripts from the Silva database.
""" + + def __init__(self, silva_release='111'): + + self.silva_release = silva_release + self.silva_host_name = 'ftp.arb-silva.de' + self.base_path = '/release_%s/Exports/' % self.silva_release + + super(SilvaDownlaoder, self).__init__(self.silva_host_name, self.base_path) + + self.group = 'rRNA' + + def get_fasta_records(self): + + remote_path = self.base_path + 'SSURef_%s_NR_tax_silva_trunc.fasta.tgz' % self.silva_release + if self.host.path.isfile(remote_path): + + with self.host.file(remote_path, 'rb') as remote_handle: + remote_content = BytesIO(remote_handle.read()) + + with tarfile.open(fileobj=remote_content) as tar: + + for subentry in tar.getnames(): + if subentry.endswith('.fasta'): + logging.debug('Processing rRNA file %s' % subentry) + subhandle = tar.extractfile(subentry) + + for record in SeqIO.parse(subhandle, "fasta"): + identifier = record.id.split('.')[0] + description = '_'.join(record.description.split(' ')[1:]) + taxonomy = description.split(';') + organism = taxonomy[-1] + family = 'Unassigned' + + for level in taxonomy: + if level.endswith('eae'): + family = level + break + + yield self._get_fasta_record(record, self.group, family, organism, identifier) + +class HumanTranscriptDownloader(DataDownloader): + """ Downloads human transcripts fasta records that contain the locations + of their exons on the human genome within their description line. + """ + + def __init__(self): + + self.ensemble_host_name = 'ftp.ensembl.org' + self.gtf_path = '/pub/current_gtf/homo_sapiens' + self.cdna_path = '/pub/current_fasta/homo_sapiens/cdna' + self.ncrna_path = '/pub/current_fasta/homo_sapiens/ncrna' + + super(HumanTranscriptDownloader, self).__init__(self.ensemble_host_name, self.gtf_path) + + self.chromosomes = set() + self.genes = defaultdict(set) + self.regions = defaultdict(set) + self.group = 'Homo_sapiens_cDNA' + + def _cache_annotations(self): + """ Obtains gtf annotations for human transcripts from ensemble and + caches them in memory. + """ + + self.host.chdir(self.gtf_path) + + entries = self.host.listdir(self.host.curdir) + + for entry in entries: + + if self.host.path.isfile(entry): + if entry.startswith('Homo_sapiens.') and entry.endswith('.gtf.gz'): + logging.debug('Processing cDNA annotations %s' % entry) + + with self.host.file(self.gtf_path + '/' + entry, 'rb') as zipped_handle: + remote_content = BytesIO(zipped_handle.read()) + + with gzip.GzipFile(fileobj=remote_content) as gzipfile: + for i, gtf_line in enumerate(gzipfile): + if i % 100000 == 0: + logging.debug('Cached %i human transcript annotations' % i) + + self._add_annotation(gtf_line) + + def _get_raw_transcripts(self): + """ Obtains coding and noncoding human transcripts from ensemble and + yields them as fasta records if they have exons that have a known location + on the human genome. Fasta records contain the exon locations in their header + lines. 
+        """
+
+        for subpath in [self.cdna_path, self.ncrna_path]:
+
+            self.host.chdir(subpath)
+
+            entries = self.host.listdir(self.host.curdir)
+
+            for entry in entries:
+                if self.host.path.isfile(entry):
+                    if entry.endswith('cdna.all.fa.gz') or entry.endswith('ncrna.fa.gz'):
+
+                        logging.debug('Processing human transcript file %s' % entry)
+
+                        with self.host.file(subpath + '/' + entry, 'rb') as zipped_handle:
+                            remote_content = BytesIO(zipped_handle.read())
+
+                            with gzip.GzipFile(fileobj=remote_content) as gzipfile:
+                                for i, record in enumerate(SeqIO.parse(gzipfile, "fasta")):
+
+                                    if record.id.startswith('ENST'):
+
+                                        if i % 100000 == 0:
+                                            logging.debug('Retrieved %i human transcripts' % i)
+
+                                        record.description = ''
+
+                                        yield record
+
+    def _add_annotation(self, gtf_line):
+        """ Parses gtf annotations and stores them in memory. """
+
+        if gtf_line[0] == '#':
+            return
+
+        fields = gtf_line.strip().split('\t')
+        chromosome, source, feature, start, end, score, strands, frame, attributes\
+            = fields
+
+        start, end = sorted([int(start), int(end)])
+
+        if feature != 'exon':
+            return
+
+        attributes = attributes.replace(' ', '').split(';')
+        transcript_ids = [identifier.split('"')[1] for identifier in attributes\
+                          if identifier.startswith('transcript_id')]
+
+        gene_names = [identifier.split('"')[1] for identifier in attributes\
+                      if identifier.startswith('gene_name')]
+
+        for transcript_id in transcript_ids:
+            self.genes[transcript_id] = self.genes[transcript_id].union(gene_names)
+            self.regions[transcript_id].add((chromosome, start, end))
+
+    def get_fasta_records(self):
+        """ Yields annotated fasta records of human transcripts. """
+
+        logging.debug('Caching annotations')
+        self._cache_annotations()
+
+        logging.debug('Downloading and annotating transcripts')
+        for record in self._get_raw_transcripts():
+            transcript_id = record.id
+
+            if transcript_id in self.regions:
+
+                mapping_locations = self.regions[transcript_id]
+
+                description = '|'.join([';'.join(map(str, location))\
+                                        for location in mapping_locations])
+
+                # Obtain gene name for transcript
+                gene_names = self.genes[transcript_id]
+                if gene_names:
+                    gene_name = list(gene_names)[0]
+                else:
+                    gene_name = transcript_id
+
+                yield self._get_fasta_record(record, self.group, 'Hominidae', 'Homo_sapiens', gene_name, sub_identifier=transcript_id, description=description)
+
+class RefSeqDownloader(DataDownloader):
+    """ Generic downloader that knows how to interpret and parse genbank records. """
+
+    def __init__(self, group):
+
+        self.refseq_host_name = 'ftp.ncbi.nih.gov'
+        self.base_path = '/genomes/' + group
+
+        super(RefSeqDownloader, self).__init__(self.refseq_host_name, self.base_path)
+
+        self.group = group
+
+
+    @classmethod
+    def _get_project_from_genbank(self, gb_record):
+        """ Parses genbank project identifier from genbank record. """
+
+        project = ''
+        for dbxref in gb_record.dbxrefs:
+            for entry in dbxref.split(' '):
+                if entry.startswith('BioProject'):
+                    project = entry.split(':')[1]
+                    return project
+
+        if not project:
+            for dbxref in gb_record.dbxrefs:
+                for entry in dbxref.split(' '):
+                    if entry.startswith('Project'):
+                        project = entry.split(':')[1]
+                        return project
+
+        return project
+
+    @classmethod
+    def _get_annotations_from_genbank(self, gb_record):
+        """ Parses taxonomic annotation from genbank record.
""" + + accession = gb_record.annotations['accessions'][0] + organism = gb_record.annotations['organism'].replace(' ', '_') + organism = '_'.join(organism.split('_')[:2]) + taxonomy = ("; ").join(gb_record.annotations['taxonomy']) + gi = gb_record.annotations['gi'] + + family = 'Unassigned' + if len(gb_record.annotations['taxonomy']) > 0: + for level in gb_record.annotations['taxonomy']: + if level.endswith('dae') or level.endswith('aceae'): + family = level + break + + if family == 'Unassigned': + logging.debug('Unassigned family found based on %s' % taxonomy) + + return organism, accession, gi, taxonomy, family + + @classmethod + def _get_record_from_genbank(self, gb_record): + """ Parses sequence from genbank record. """ + + return SeqRecord(gb_record.seq) + + @classmethod + def _parse_genbank_records(self, file_handle): + + for gb_record in SeqIO.parse(file_handle, 'genbank'): + + project = self._get_project_from_genbank(gb_record) + organism, accession, gi, taxonomy, family =\ + self._get_annotations_from_genbank(gb_record) + record = self._get_record_from_genbank(gb_record) + + if len(record) > 0: + yield record, project, taxonomy, family, organism, gi + + def _get_nc(self, entry_path): + + if self.host.path.isfile(entry_path): + logging.debug('Processing refseq entry %s' % entry_path) + with self.host.file(entry_path) as remote_handle: + + + for record, project, taxonomy, family, organism, gi\ + in self._parse_genbank_records(remote_handle): + + cleaned = self._get_fasta_record(record, self.group , family, organism, gi) + + yield cleaned + + def _get_nz(self, entry_path): + + scaffold_path = entry_path.replace('.gbk', '.scaffold.gbk.tgz') + if self.host.path.isfile(scaffold_path): + logging.debug('Processing refseq entry %s' % scaffold_path) + with self.host.file(scaffold_path, 'rb') as remote_handle: + remote_content = BytesIO(remote_handle.read()) + with tarfile.open(fileobj=remote_content) as tar: + for subentry in tar.getnames(): + if subentry.endswith('.gbk'): + logging.debug('Processing subaccession %s' % subentry) + subhandle = tar.extractfile(subentry) + for record, project, taxonomy, family, organism, gi in self._parse_genbank_records(subhandle): + yield self._get_fasta_record(record, self.group , family, organism, gi) + + + def get_fasta_records(self): + + species_dirs = self.host.listdir(self.host.curdir) + + for species_dir in species_dirs: + + species_path = self.base_path + '/' + species_dir + + if self.host.path.isdir(species_path): + + self.host.chdir(species_path) + + logging.debug('Processing species directory %s' % species_path) + + if self.host.path.isdir(species_path): + + self.host.chdir(species_path) + + entries = self.host.listdir(self.host.curdir) + + for entry in entries: + + if self.host.path.isfile(self.host.curdir + '/' + entry): + + if entry.endswith('.gbk') or entry.endswith('.gbk.gz'): + + logging.debug('Processing entry %s' % entry) + + entry_path = species_path + '/' + entry + + if entry.startswith('NC_'): + for record in self._get_nc(entry_path): + yield record + + elif entry.startswith('NZ_'): + for record in self._get_nz(entry_path): + yield record + +class HumanGenomeDownloader(DataDownloader): + + def __init__(self): + + self.ensemble_host_name = 'ftp.ensembl.org' + self.base_path = '/pub/current_fasta/homo_sapiens/dna' + + super(HumanGenomeDownloader, self).__init__(self.ensemble_host_name, self.base_path) + + self.group = 'Homo_sapiens' + + def get_fasta_records(self): + + entries = self.host.listdir(self.host.curdir) + for entry in entries: 
+
+            if self.host.path.isfile(entry):
+
+                # Adjust the filtering criteria here if you want to include patches and haplotypes
+                if entry.endswith('fa.gz') and 'dna_sm.primary_assembly' in entry:
+
+                    logging.debug('Processing human entry %s' % entry)
+
+                    with self.host.file(self.host.getcwd() + '/' + entry, 'rb') as zipped_handle:
+                        remote_content = BytesIO(zipped_handle.read())
+
+                        with gzip.GzipFile(fileobj=remote_content) as gzipfile:
+
+                            for record in SeqIO.parse(gzipfile, "fasta"):
+
+                                identifier = record.id
+                                organism = 'Homo_sapiens'
+                                family = 'Hominidae'
+
+                                yield self._get_fasta_record(record, self.group, family, organism, identifier)
+
+class UniVecDownloader(DataDownloader):
+
+    def __init__(self):
+
+        self.univec_host_name = 'ftp.ncbi.nih.gov'
+        self.base_path = '/pub/UniVec'
+
+        super(UniVecDownloader, self).__init__(self.univec_host_name, self.base_path)
+
+        self.group = 'UniVec'
+
+    def get_fasta_records(self):
+
+        logging.debug('Processing Univec entries')
+
+        with self.host.file(self.host.getcwd() + '/' + 'UniVec') as remote_handle:
+            for record in SeqIO.parse(remote_handle, "fasta"):
+                organism = '_'.join(record.description.split(' ')[1:])
+                family = 'Unassigned'
+                identifier = '000000000'
+
+                yield self._get_fasta_record(record, self.group, family, organism, identifier)
+
+class CLI(cli.Application):
+    """"""
+    PROGNAME = "metamap"
+    VERSION = "1.0.0"
+    DESCRIPTION = """"""
+
+    def main(self, *args):
+
+        if args:
+            print("Unknown command %r" % (args[0]))
+            print self.USAGE
+            return 1
+
+        if not self.nested_command:
+            print("No command given")
+            print self.USAGE
+            return 1
+
+@CLI.subcommand("fasta")
+class References(cli.Application):
+    """ Obtains NCBI RefSeq and Ensembl reference genomes and transcripts """
+
+    valid_references = ['Fungi', 'Fungi_DRAFT', 'Bacteria',
+                        'Bacteria_DRAFT', 'Homo_sapiens',
+                        'Viruses', 'UniVec', 'Plasmids',
+                        'Protozoa', 'rRNA', 'Homo_sapiens_cDNA']
+
+    references = cli.SwitchAttr(['-r', '--reference'],
+                                cli.Set(*valid_references, case_sensitive=True),
+                                list=True, default=valid_references,
+                                mandatory=False,
+                                help="Sets the kind of references to obtain")
+
+    fasta_path = cli.SwitchAttr(['-o', '--output_file'], str, mandatory=True,
+                                help="Sets the fasta output file")
+
+    zipped = cli.Flag(["-z", "--zipped"], help="Write gzipped output")
+    debug = cli.Flag(["-d", "--debug"], help="Enable debug messages")
+
+    def main(self):
+        """ Downloads genome fasta files from reference databases """
+
+        if self.debug:
+            logging.getLogger().setLevel(logging.DEBUG)
+
+        if os.path.dirname(self.fasta_path) and not os.path.exists(os.path.dirname(self.fasta_path)):
+            logging.debug('Making directories for output file %s' % self.fasta_path)
+            os.makedirs(os.path.dirname(self.fasta_path))
+
+        if self.zipped:
+            logging.debug('Making zipped output file %s' % self.fasta_path)
+            output_file = gzip.open(self.fasta_path, 'wb')
+
+        else:
+            logging.debug('Making regular output file %s' % self.fasta_path)
+            output_file = open(self.fasta_path, 'w', buffering=100 * 1024 * 1024)
+
+        for reference in self.references:
+
+            if reference == 'UniVec':
+                downloader = UniVecDownloader()
+            elif reference == 'Homo_sapiens':
+                downloader = HumanGenomeDownloader()
+            elif reference == 'rRNA':
+                downloader = SilvaDownlaoder()
+            elif reference == 'Homo_sapiens_cDNA':
+                downloader = HumanTranscriptDownloader()
+            else:
+                downloader = RefSeqDownloader(group=reference)
+
+            for record in downloader.get_fasta_records():
+                SeqIO.write([record], output_file, "fasta")
+
+        output_file.close()
+
+@CLI.subcommand("blast")
+class Blast(cli.Application):
+    """ Obtains blast databases """
+
+    valid_references = ['nt', 'nr']
+
+    references = cli.SwitchAttr(['-r', '--reference_database'],
+                                cli.Set(*valid_references, case_sensitive=True),
+                                list=True, default=valid_references,
+                                mandatory=False,
+                                help="Sets the kind of database to obtain")
+
+    database_path = cli.SwitchAttr(['-o', '--output_path'], str, mandatory=True,
+                                   help="Sets the output directory for the downloaded database files")
+
+    debug = cli.Flag(["-d", "--debug"], help="Enable debug messages")
+
+    def main(self):
+        """ Downloads blast database files """
+
+        if self.debug:
+            logging.getLogger().setLevel(logging.DEBUG)
+
+        if not os.path.exists(self.database_path):
+            logging.debug('Making output directory %s' % self.database_path)
+            os.makedirs(self.database_path)
+
+        host_name = 'ftp.ncbi.nih.gov'
+        host = ftputil.FTPHost(host_name, 'anonymous', 'anonymous')
+
+        for reference in self.references:
+
+            path = '/blast/db'
+            host.chdir(path)
+
+            entries = host.listdir(host.curdir)
+
+            for entry in entries:
+
+                if host.path.isfile(entry):
+
+                    if entry.startswith(reference) and entry.endswith('.tar.gz'):
+                        logging.debug('Downloading %s' % entry)
+                        local_path = os.path.join(self.database_path, entry)
+                        host.download(entry, local_path, 'b')
+                        logging.debug('Unpacking %s' % entry)
+                        tfile = tarfile.open(local_path, 'r:gz')
+                        tfile.extractall(self.database_path)
+                        os.remove(local_path)
+
+if __name__ == "__main__":
+    CLI.run()
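
Note on the output format: every downloader above packs the taxonomic context of a sequence into the FASTA ID as one semicolon-separated token, Group;Family;Organism;Identifier, with an optional fifth SubIdentifier field (the ENST transcript id for human cDNA) and the exon coordinates in the description line (see DataDownloader._get_fasta_record). When run as a script, the tool would be invoked roughly as: python vref.py fasta -r Viruses -o viral.fa.gz -z. As a rough sketch only, with a hypothetical helper name and a made-up example record that are not part of vref.py, a downstream consumer could split such an ID back into its fields like this:

    # Hypothetical downstream helper, not part of vref.py: split a vref-style
    # FASTA ID (Group;Family;Organism;Identifier[;SubIdentifier], as composed by
    # DataDownloader._get_fasta_record) back into its fields. The example record
    # below is made up for illustration.
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord


    def parse_vref_id(record):
        """Return a dict of the fields packed into record.id by vref.py."""
        fields = record.id.split(';')
        parsed = {'group': fields[0], 'family': fields[1],
                  'organism': fields[2], 'identifier': fields[3]}
        if len(fields) > 4:
            parsed['sub_identifier'] = fields[4]  # e.g. an ENST transcript id
        return parsed


    example = SeqRecord(Seq('ACGT'),
                        id='Viruses;Retroviridae;Human_immunodeficiency_virus_1;9629357')
    print(parse_vref_id(example)['family'])  # -> 'Retroviridae'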