annotate vref.py @ 0:3ba5983012cf draft

Uploaded
author mzeidler
date Tue, 24 Sep 2013 10:19:40 -0400
parents
children 3eddea30be1a
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
1 #!/usr/bin/env python
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
2
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
3 import sys
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
4 import logging
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
5 import gzip
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
6 import string
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
7 import os
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
8 import tarfile
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
9
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
10 from io import BytesIO
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
11 from collections import defaultdict
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
12
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
13 try:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
14 from Bio.SeqRecord import SeqRecord
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
15 from Bio import SeqIO
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
16 except ImportError:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
17 message = 'This script requires the BioPython python package\n'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
18 sys.stderr.write(message)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
19 sys.exit(1)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
20
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
21 try:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
22 from plumbum import cli
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
23 except ImportError:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
24 message = 'This script requires the plumbum python package\n'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
25 sys.stderr.write(message)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
26 sys.exit(1)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
27
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
28 try:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
29 import ftputil
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
30 except ImportError:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
31 message = 'This script requires the ftputil python package\n'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
32 sys.stderr.write(message)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
33 sys.exit(1)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
34
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
35
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
36 NON_ID = ''.join(c for c in map(chr, range(256)) if not c.isalnum())
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
37 NON_ID = NON_ID.replace('_', '').replace('-', '')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
38 NUC_TRANS = string.maketrans('uryswkmbdhvURYSWKMBDHV', 'tnnnnnnnnnnTNNNNNNNNNN')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
39
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
40 logging.basicConfig(level=logging.INFO, format='%(message)s')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
41
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
42
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
43 class DataDownloader(object):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
44 """ Generic class for downloading fasta records via FTP. """
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
45
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
46 def __init__(self, host_name, base_path, user='anonymous', password='anonymous'):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
47
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
48 self.group = 'NoGroup'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
49 self.host_name = host_name
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
50 self.base_path = base_path
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
51
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
52 self.host = ftputil.FTPHost(host_name, user, password)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
53 self.host.chdir(base_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
54
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
55 @classmethod
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
56 def _get_fasta_record(self, record, group, family, organism, identifier, sub_identifier=None, description=''):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
57
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
58 record.seq._data = record.seq._data.translate(NUC_TRANS)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
59
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
60 group = group.translate(None, NON_ID)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
61 family = family.translate(None, NON_ID)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
62 organism = organism.translate(None, NON_ID)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
63 identifier = identifier.translate(None, NON_ID)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
64
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
65 if sub_identifier:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
66 sub_identifier = sub_identifier.translate(None, NON_ID)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
67 record.id = ';'.join([group, family, organism, identifier, sub_identifier])
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
68 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
69 record.id = ';'.join([group, family, organism, identifier])
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
70
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
71 record.description = description
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
72 record.name = ''
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
73
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
74 return record
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
75
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
76 def get_fasta_records(self):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
77
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
78 raise NotImplementedError
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
79
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
80 class SilvaDownlaoder(DataDownloader):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
81 """ Downlaods rRNA transcripts from the Silva database. """
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
82
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
83 def __init__(self, silva_release='111'):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
84
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
85 self.silva_release = silva_release
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
86 self.silva_host_name = 'ftp.arb-silva.de'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
87 self.base_path = '/release_%s/Exports/' % self.silva_release
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
88
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
89 super(SilvaDownlaoder, self).__init__(self.silva_host_name, self.base_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
90
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
91 self.group = 'rRNA'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
92
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
93 def get_fasta_records(self):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
94
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
95 remote_path = self.base_path + 'SSURef_%s_NR_tax_silva_trunc.fasta.tgz' % self.silva_release
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
96 if self.host.path.isfile(remote_path):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
97
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
98 with self.host.file(remote_path, 'rb') as remote_handle:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
99 remote_content = BytesIO(remote_handle.read())
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
100
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
101 with tarfile.open(fileobj=remote_content) as tar:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
102
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
103 for subentry in tar.getnames():
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
104 if subentry.endswith('.fasta'):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
105 logging.debug('Processing rRNA file %s' % subentry)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
106 subhandle = tar.extractfile(subentry)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
107
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
108 for record in SeqIO.parse(subhandle, "fasta"):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
109 identifier = record.id.split('.')[0]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
110 description = '_'.join(record.description.split(' ')[1:])
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
111 taxonomy = description.split(';')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
112 organism = taxonomy[-1]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
113 family = 'Unassigned'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
114
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
115 for level in taxonomy:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
116 if level.endswith('eae'):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
117 family = level
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
118 break
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
119
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
120 yield self._get_fasta_record(record, self.group, family, organism, identifier)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
121
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
122 class HumanTranscriptDownloader(DataDownloader):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
123 """ Downloads human transcripts fasta records that contain the locations
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
124 of their exons on the human genome within their description line.
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
125 """
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
126
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
127 def __init__(self):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
128
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
129 self.ensemble_host_name = 'ftp.ensembl.org'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
130 self.gtf_path = '/pub/current_gtf/homo_sapiens'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
131 self.cdna_path = '/pub/current_fasta/homo_sapiens/cdna'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
132 self.ncrna_path = '/pub/current_fasta/homo_sapiens/ncrna'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
133
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
134 super(HumanTranscriptDownloader, self).__init__(self.ensemble_host_name, self.gtf_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
135
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
136 self.chromosomes = set()
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
137 self.genes = defaultdict(set)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
138 self.regions = defaultdict(set)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
139 self.group = 'Homo_sapiens_cDNA'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
140
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
141 def _cache_annotations(self):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
142 """ Obtains gtf annotations for human transcripts from ensemble and
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
143 caches them in memory.
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
144 """
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
145
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
146 self.host.chdir(self.gtf_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
147
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
148 entries = self.host.listdir(self.host.curdir)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
149
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
150 for entry in entries:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
151
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
152 if self.host.path.isfile(entry):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
153 if entry.startswith('Homo_sapiens.') and entry.endswith('.gtf.gz'):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
154 logging.debug('Processing cDNA annotations %s' % entry)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
155
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
156 with self.host.file(self.gtf_path + '/' + entry, 'rb') as zipped_handle:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
157 remote_content = BytesIO(zipped_handle.read())
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
158
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
159 with gzip.GzipFile(fileobj=remote_content) as gzipfile:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
160 for i, gtf_line in enumerate(gzipfile):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
161 if i % 100000 == 0:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
162 logging.debug('Cached %i human transcript annotations' % i)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
163
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
164 self._add_annotation(gtf_line)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
165
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
166 def _get_raw_transcripts(self):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
167 """ Obtains coding and noncoding human transcripts from ensemble and
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
168 yields them as fasta records if they have exons that have a known location
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
169 on the human genome. Fasta records contain the exon locations in their header
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
170 lines.
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
171 """
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
172
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
173 for subpath in [self.cdna_path, self.ncrna_path]:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
174
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
175 self.host.chdir(subpath)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
176
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
177 entries = self.host.listdir(self.host.curdir)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
178
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
179 for entry in entries:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
180 if self.host.path.isfile(entry):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
181 if entry.endswith('cdna.all.fa.gz') or entry.endswith('ncrna.fa.gz'):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
182
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
183 logging.debug('Processing human transcript file %s' % entry)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
184
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
185 with self.host.file(subpath + '/' + entry, 'rb') as zipped_handle:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
186 remote_content = BytesIO(zipped_handle.read())
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
187
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
188 with gzip.GzipFile(fileobj=remote_content) as gzipfile:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
189 for i, record in enumerate(SeqIO.parse(gzipfile, "fasta")):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
190
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
191 if record.id.startswith('ENST'):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
192
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
193 if i % 100000 == 0:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
194 logging.debug('Retrieved %i human transcripts' % i)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
195
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
196 record.description = ''
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
197
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
198 yield record
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
199
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
200 def _add_annotation(self, gtf_line):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
201 """ Parses gtf annotations and stores them in memory. """
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
202
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
203 if gtf_line[0] == '#':
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
204 return
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
205
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
206 fields = gtf_line.strip().split('\t')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
207 chromosome, source, feature, start, end, score, strands, frame, attributes\
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
208 = fields
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
209
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
210 start, end = sorted([int(start), int(end)])
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
211
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
212 if feature != 'exon':
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
213 return
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
214
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
215 attributes = attributes.replace(' ', '').split(';')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
216 transcript_ids = [identifier.split('"')[1] for identifier in attributes\
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
217 if identifier.startswith('transcript_id')]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
218
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
219 gene_names = [identifier.split('"')[1] for identifier in attributes\
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
220 if identifier.startswith('gene_name')]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
221
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
222 for transcript_id in transcript_ids:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
223 self.genes[transcript_id] = self.genes[transcript_id].union(gene_names)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
224 self.regions[transcript_id].add((chromosome, start, end))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
225
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
226 def get_fasta_records(self):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
227 """ Yields annotated fasta records of human transcripts. """
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
228
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
229 logging.debug('Caching annotations')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
230 self._cache_annotations()
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
231
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
232 logging.debug('Downloading and annotating transcripts')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
233 for record in self._get_raw_transcripts():
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
234 transcript_id = record.id
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
235
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
236 if transcript_id in self.regions:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
237
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
238 mapping_locations = self.regions[transcript_id]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
239
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
240 description = '|'.join([';'.join(map(str, location))\
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
241 for location in mapping_locations])
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
242
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
243 # Obtain gene name for transcript
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
244 gene_names = self.genes[transcript_id]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
245 if gene_names:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
246 gene_name = list(gene_names)[0]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
247 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
248 gene_name = transcript_id
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
249
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
250 yield self._get_fasta_record(record, self.group , 'Hominidae', 'Homo_sapiens', gene_name, sub_identifier=transcript_id, description=description)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
251
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
252 class RefSeqDownloader(DataDownloader):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
253 """ Generic downloader that known how to interpret and parse genbank records """
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
254
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
255 def __init__(self, group):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
256
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
257 self.refseq_host_name = 'ftp.ncbi.nih.gov'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
258 self.base_path = '/genomes/' + group
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
259
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
260 super(RefSeqDownloader, self).__init__(self.refseq_host_name, self.base_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
261
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
262 self.group = group
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
263
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
264
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
265 @classmethod
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
266 def _get_project_from_genbank(self, gb_record):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
267 """ Parses genbank project identifier from genbank record. """
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
268
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
269 project = ''
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
270 for dbxref in gb_record.dbxrefs:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
271 for entry in dbxref.split(' '):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
272 if entry.startswith('BioProject'):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
273 project = entry.split(':')[1]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
274 return project
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
275
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
276 if not project:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
277 for dbxref in gb_record.dbxrefs:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
278 for entry in dbxref.split(' '):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
279 if entry.startswith('Project'):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
280 project = entry.split(':')[1]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
281 return project
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
282
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
283 return project
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
284
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
285 @classmethod
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
286 def _get_annotations_from_genbank(self, gb_record):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
287 """ Parses taxonomic annotation from genbank record. """
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
288
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
289 accession = gb_record.annotations['accessions'][0]
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
290 organism = gb_record.annotations['organism'].replace(' ', '_')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
291 organism = '_'.join(organism.split('_')[:2])
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
292 taxonomy = ("; ").join(gb_record.annotations['taxonomy'])
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
293 gi = gb_record.annotations['gi']
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
294
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
295 family = 'Unassigned'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
296 if len(gb_record.annotations['taxonomy']) > 0:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
297 for level in gb_record.annotations['taxonomy']:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
298 if level.endswith('dae') or level.endswith('aceae'):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
299 family = level
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
300 break
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
301
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
302 if family == 'Unassigned':
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
303 logging.debug('Unassigned family found based on %s' % taxonomy)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
304
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
305 return organism, accession, gi, taxonomy, family
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
306
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
307 @classmethod
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
308 def _get_record_from_genbank(self, gb_record):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
309 """ Parses sequence from genbank record. """
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
310
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
311 return SeqRecord(gb_record.seq)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
312
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
313 @classmethod
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
314 def _parse_genbank_records(self, file_handle):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
315
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
316 for gb_record in SeqIO.parse(file_handle, 'genbank'):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
317
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
318 project = self._get_project_from_genbank(gb_record)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
319 organism, accession, gi, taxonomy, family =\
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
320 self._get_annotations_from_genbank(gb_record)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
321 record = self._get_record_from_genbank(gb_record)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
322
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
323 if len(record) > 0:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
324 yield record, project, taxonomy, family, organism, gi
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
325
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
326 def _get_nc(self, entry_path):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
327
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
328 if self.host.path.isfile(entry_path):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
329 logging.debug('Processing refseq entry %s' % entry_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
330 remote_handle = self.host.file(entry_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
331
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
332
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
333 for record, project, taxonomy, family, organism, gi\
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
334 in self._parse_genbank_records(remote_handle):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
335
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
336 cleaned = self._get_fasta_record(record, self.group , family, organism, gi)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
337
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
338 yield cleaned
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
339
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
340 def _get_nz(self, entry_path):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
341
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
342 scaffold_path = entry_path.replace('.gbk', '.scaffold.gbk.tgz')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
343 if self.host.path.isfile(scaffold_path):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
344 logging.debug('Processing refseq entry %s' % scaffold_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
345 with self.host.file(scaffold_path, 'rb') as remote_handle:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
346 remote_content = BytesIO(remote_handle.read())
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
347 with tarfile.open(fileobj=remote_content) as tar:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
348 for subentry in tar.getnames():
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
349 if subentry.endswith('.gbk'):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
350 logging.debug('Processing subaccession %s' % subentry)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
351 subhandle = tar.extractfile(subentry)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
352 for record, project, taxonomy, family, organism, gi in self._parse_genbank_records(subhandle):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
353 yield self._get_fasta_record(record, self.group , family, organism, gi)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
354
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
355
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
356 def get_fasta_records(self):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
357
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
358 species_dirs = self.host.listdir(self.host.curdir)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
359
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
360 for species_dir in species_dirs:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
361
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
362 species_path = self.base_path + '/' + species_dir
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
363
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
364 if self.host.path.isdir(species_path):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
365
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
366 self.host.chdir(species_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
367
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
368 logging.debug('Processing species directory %s' % species_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
369
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
370 if self.host.path.isdir(species_path):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
371
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
372 self.host.chdir(species_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
373
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
374 entries = self.host.listdir(self.host.curdir)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
375
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
376 for entry in entries:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
377
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
378 if self.host.path.isfile(self.host.curdir + '/' + entry):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
379
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
380 if entry.endswith('.gbk') or entry.endswith('.gbk.gz'):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
381
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
382 logging.debug('Processing entry %s' % entry)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
383
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
384 entry_path = species_path + '/' + entry
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
385
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
386 if entry.startswith('NC_'):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
387 for record in self._get_nc(entry_path):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
388 yield record
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
389
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
390 elif entry.startswith('NZ_'):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
391 for record in self._get_nz(entry_path):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
392 yield record
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
393
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
394 class HumanGenomeDownloader(DataDownloader):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
395
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
396 def __init__(self):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
397
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
398 self.ensemble_host_name = 'ftp.ensembl.org'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
399 self.base_path = '/pub/current_fasta/homo_sapiens/dna'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
400
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
401 super(HumanGenomeDownloader, self).__init__(self.ensemble_host_name, self.base_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
402
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
403 self.group = 'Homo_sapiens'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
404
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
405 def get_fasta_records(self):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
406
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
407 entries = self.host.listdir(self.host.curdir)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
408 for entry in entries:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
409
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
410 if self.host.path.isfile(entry):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
411
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
412 # You may want to change your filtering criteria here if you would like to include patches and haplotypes
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
413 if entry.endswith('fa.gz') and 'dna_sm.primary_assembly' in entry:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
414
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
415 logging.debug('Processing human entry %s' % entry)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
416
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
417 with self.host.file(self.host.getcwd() + '/' + entry, 'rb') as zipped_handle:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
418 remote_content = BytesIO(zipped_handle.read())
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
419
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
420 with gzip.GzipFile(fileobj=remote_content) as gzipfile:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
421
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
422 for record in SeqIO.parse(gzipfile, "fasta"):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
423
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
424 identifier = record.id
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
425 organism = 'Homo_sapiens'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
426 family = 'Hominidae'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
427
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
428 yield self._get_fasta_record(record, self.group, family, organism, identifier)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
429
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
430 class UniVecDownloader(DataDownloader):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
431
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
432 def __init__(self):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
433
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
434 self.univec_host_name = 'ftp.ncbi.nih.gov'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
435 self.base_path = '/pub/UniVec'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
436
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
437 super(UniVecDownloader, self).__init__(self.univec_host_name, self.base_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
438
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
439 self.group = 'UniVec'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
440
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
441 def get_fasta_records(self):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
442
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
443 logging.debug('Processing Univec entries')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
444
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
445 with self.host.file(self.host.getcwd() + '/' + 'UniVec') as remote_handle:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
446 for record in SeqIO.parse(remote_handle, "fasta"):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
447 organism = '_'.join(record.description.split(' ')[1:])
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
448 family = 'Unassigned'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
449 identifier = '000000000'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
450
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
451 yield self._get_fasta_record(record, self.group, family, organism, identifier)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
452
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
453 class CLI(cli.Application):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
454 """"""
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
455 PROGNAME = "metamap"
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
456 VERSION = "1.0.0"
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
457 DESCRIPTION = """"""
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
458
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
459 def main(self, *args):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
460
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
461 if args:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
462 print("Unknown command %r" % (args[0]))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
463 print self.USAGE
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
464 return 1
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
465
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
466 if not self.nested_command:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
467 print("No command given")
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
468 print self.USAGE
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
469 return 1
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
470
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
471 @CLI.subcommand("fasta")
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
472 class References(cli.Application):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
473 """ Obtains NCBI RefSeq and Ensembl reference genomes and transcripts"""
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
474
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
475 valid_references = ['Fungi', 'Fungi_DRAFT', 'Bacteria',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
476 'Bacteria_DRAFT', 'Homo_sapiens',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
477 'Viruses', 'UniVec', 'Plasmids',
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
478 'Protozoa', 'rRNA', 'Homo_sapiens_cDNA']
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
479
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
480 references = cli.SwitchAttr(['-r', '--reference'],
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
481 cli.Set(*valid_references, case_sensitive=True),
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
482 list=True, default=valid_references,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
483 mandatory=False,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
484 help="Sets the kind of references to obtain")
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
485
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
486 fasta_path = cli.SwitchAttr(['-o', '--output_file'], str, mandatory=True,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
487 help="Sets the fasta output file")
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
488
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
489 zipped = cli.Flag(["-z", "--zipped"], help="Write gzipped output")
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
490 debug = cli.Flag(["-d", "--debug"], help="Enable debug messages")
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
491
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
492
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
493 def main(self):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
494 """ Downloads genome fasta files from reference databases"""
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
495
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
496 if self.debug:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
497 logging.getLogger().setLevel(logging.DEBUG)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
498
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
499 if os.path.dirname(self.fasta_path) and not os.path.exists(os.path.dirname(self.fasta_path)):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
500 logging.debug('Making directories for output file %s' % self.fasta_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
501 os.makedirs(os.path.dirname(self.fasta_path))
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
502
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
503 if self.zipped:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
504 logging.debug('Making zipped output file %s' % self.fasta_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
505 output_file = gzip.open(self.fasta_path, 'wb')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
506
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
507 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
508 logging.debug('Making regular output file %s' % self.fasta_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
509 output_file = open(self.fasta_path, 'w', buffering=100 * 1024 * 1024)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
510
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
511
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
512 for reference in self.references:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
513
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
514 if reference == 'UniVec':
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
515 downloader = UniVecDownloader()
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
516 elif reference == 'Homo_sapiens':
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
517 downloader = HumanGenomeDownloader()
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
518 elif reference == 'rRNA':
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
519 downloader = SilvaDownlaoder()
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
520 elif reference == 'Homo_sapiens_cDNA':
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
521 downloader = HumanTranscriptDownloader()
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
522 else:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
523 downloader = RefSeqDownloader(group=reference)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
524
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
525 for record in downloader.get_fasta_records():
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
526 SeqIO.write([record], output_file, "fasta")
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
527
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
528 output_file.close()
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
529
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
530 @CLI.subcommand("blast")
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
531 class Blast(cli.Application):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
532 """ Obtains blast databases """
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
533
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
534 valid_references = ['nt', 'nr']
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
535
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
536 references = cli.SwitchAttr(['-r', '--reference_database'],
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
537 cli.Set(*valid_references, case_sensitive=True),
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
538 list=True, default=valid_references,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
539 mandatory=False,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
540 help="Sets the kind of database to obtain")
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
541
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
542 database_path = cli.SwitchAttr(['-o', '--output_path'], str, mandatory=True,
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
543 help="Sets the fasta output file")
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
544
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
545 debug = cli.Flag(["-d", "--debug"], help="Enable debug messages")
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
546
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
547 def main(self):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
548 """ Downloads blast database files """
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
549
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
550 if self.debug:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
551 logging.getLogger().setLevel(logging.DEBUG)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
552
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
553 if not os.path.exists(self.database_path):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
554 logging.debug('Making output directory %s' % self.database_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
555 os.makedirs(self.database_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
556
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
557 host_name = 'ftp.ncbi.nih.gov'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
558 host = ftputil.FTPHost(host_name, 'anonymous', 'anonymous')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
559
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
560 for reference in self.references:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
561
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
562 path = '/blast/db'
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
563 host.chdir(path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
564
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
565 entries = host.listdir(host.curdir)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
566
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
567 for entry in entries:
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
568
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
569 if host.path.isfile(entry):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
570
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
571 if entry.startswith(reference) and entry.endswith('.tar.gz'):
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
572 logging.debug('Downloading %s' % entry)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
573 local_path = os.path.join(self.database_path, entry)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
574 host.download(entry, local_path, 'b')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
575 logging.debug('Unpacking %s' % entry)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
576 tfile = tarfile.open(local_path, 'r:gz')
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
577 tfile.extractall(self.database_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
578 os.remove(local_path)
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
579
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
580 if __name__ == "__main__":
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
581 CLI.run()
3ba5983012cf Uploaded
mzeidler
parents:
diff changeset
582