annotate vref.py @ 43:c74e70b459dd draft

Uploaded
author mzeidler
date Mon, 04 Nov 2013 11:54:58 -0500
parents f63d639b223c
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
1 #!/usr/bin/env python
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
2
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
3 import sys
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
4 import logging
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
5 import gzip
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
6 import string
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
7 import os
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
8 import tarfile
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
9
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
10 from io import BytesIO
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
11 from collections import defaultdict
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
12
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
13 try:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
14 from Bio.SeqRecord import SeqRecord
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
15 from Bio import SeqIO
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
16 except ImportError:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
17 message = 'This script requires the BioPython python package\n'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
18 sys.stderr.write(message)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
19 sys.exit(1)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
20
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
21 try:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
22 from plumbum import cli
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
23 except ImportError:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
24 message = 'This script requires the plumbum python package\n'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
25 sys.stderr.write(message)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
26 sys.exit(1)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
27
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
28 try:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
29 import ftputil
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
30 except ImportError:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
31 message = 'This script requires the ftputil python package\n'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
32 sys.stderr.write(message)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
33 sys.exit(1)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
34
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
35
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
36 NON_ID = ''.join(c for c in map(chr, range(256)) if not c.isalnum())
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
37 NON_ID = NON_ID.replace('_', '').replace('-', '')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
38 NUC_TRANS = string.maketrans('uryswkmbdhvURYSWKMBDHV', 'tnnnnnnnnnnTNNNNNNNNNN')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
39
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
40 logging.basicConfig(level=logging.INFO, format='%(message)s')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
41
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
42
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
43 class DataDownloader(object):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
44 """ Generic class for downloading fasta records via FTP. """
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
45
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
46 def __init__(self, host_name, base_path, user='anonymous', password='anonymous'):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
47
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
48 self.group = 'NoGroup'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
49 self.host_name = host_name
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
50 self.base_path = base_path
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
51
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
52 self.host = ftputil.FTPHost(host_name, user, password)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
53 self.host.chdir(base_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
54
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
55 @classmethod
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
56 def _get_fasta_record(self, record, group, family, organism, identifier, sub_identifier=None, description=''):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
57
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
58 record.seq._data = record.seq._data.translate(NUC_TRANS)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
59
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
60 group = group.translate(None, NON_ID)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
61 family = family.translate(None, NON_ID)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
62 organism = organism.translate(None, NON_ID)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
63 identifier = identifier.translate(None, NON_ID)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
64
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
65 if sub_identifier:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
66 sub_identifier = sub_identifier.translate(None, NON_ID)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
67 record.id = ';'.join([group, family, organism, identifier, sub_identifier])
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
68 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
69 record.id = ';'.join([group, family, organism, identifier])
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
70
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
71 record.description = description
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
72 record.name = ''
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
73
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
74 return record
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
75
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
76 def get_fasta_records(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
77
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
78 raise NotImplementedError
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
79
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
80 class SilvaDownlaoder(DataDownloader):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
81 """ Downlaods rRNA transcripts from the Silva database. """
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
82
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
83 def __init__(self, silva_release='111'):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
84
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
85 self.silva_release = silva_release
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
86 self.silva_host_name = 'ftp.arb-silva.de'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
87 self.base_path = '/release_%s/Exports/' % self.silva_release
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
88
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
89 super(SilvaDownlaoder, self).__init__(self.silva_host_name, self.base_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
90
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
91 self.group = 'rRNA'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
92
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
93 def get_fasta_records(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
94
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
95 remote_path = self.base_path + 'SSURef_%s_NR_tax_silva_trunc.fasta.tgz' % self.silva_release
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
96 if self.host.path.isfile(remote_path):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
97
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
98 with self.host.file(remote_path, 'rb') as remote_handle:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
99 remote_content = BytesIO(remote_handle.read())
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
100
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
101 with tarfile.open(fileobj=remote_content) as tar:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
102
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
103 for subentry in tar.getnames():
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
104 if subentry.endswith('.fasta'):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
105 logging.debug('Processing rRNA file %s' % subentry)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
106 subhandle = tar.extractfile(subentry)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
107
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
108 for record in SeqIO.parse(subhandle, "fasta"):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
109 identifier = record.id.split('.')[0]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
110 description = '_'.join(record.description.split(' ')[1:])
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
111 taxonomy = description.split(';')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
112 organism = taxonomy[-1]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
113 family = 'Unassigned'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
114
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
115 for level in taxonomy:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
116 if level.endswith('eae'):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
117 family = level
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
118 break
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
119
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
120 yield self._get_fasta_record(record, self.group, family, organism, identifier)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
121
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
122 class HumanTranscriptDownloader(DataDownloader):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
123 """ Downloads human transcripts fasta records that contain the locations
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
124 of their exons on the human genome within their description line.
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
125 """
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
126
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
127 def __init__(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
128
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
129 self.ensemble_host_name = 'ftp.ensembl.org'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
130 self.gtf_path = '/pub/current_gtf/homo_sapiens'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
131 self.cdna_path = '/pub/current_fasta/homo_sapiens/cdna'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
132 self.ncrna_path = '/pub/current_fasta/homo_sapiens/ncrna'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
133
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
134 super(HumanTranscriptDownloader, self).__init__(self.ensemble_host_name, self.gtf_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
135
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
136 self.chromosomes = set()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
137 self.genes = defaultdict(set)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
138 self.regions = defaultdict(set)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
139 self.group = 'Homo_sapiens_cDNA'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
140
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
141 def _cache_annotations(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
142 """ Obtains gtf annotations for human transcripts from ensemble and
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
143 caches them in memory.
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
144 """
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
145
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
146 self.host.chdir(self.gtf_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
147
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
148 entries = self.host.listdir(self.host.curdir)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
149
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
150 for entry in entries:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
151
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
152 if self.host.path.isfile(entry):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
153 if entry.startswith('Homo_sapiens.') and entry.endswith('.gtf.gz'):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
154 logging.debug('Processing cDNA annotations %s' % entry)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
155
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
156 with self.host.file(self.gtf_path + '/' + entry, 'rb') as zipped_handle:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
157 remote_content = BytesIO(zipped_handle.read())
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
158
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
159 with gzip.GzipFile(fileobj=remote_content) as gzipfile:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
160 for i, gtf_line in enumerate(gzipfile):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
161 if i % 100000 == 0:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
162 logging.debug('Cached %i human transcript annotations' % i)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
163
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
164 self._add_annotation(gtf_line)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
165
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
166 def _get_raw_transcripts(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
167 """ Obtains coding and noncoding human transcripts from ensemble and
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
168 yields them as fasta records if they have exons that have a known location
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
169 on the human genome. Fasta records contain the exon locations in their header
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
170 lines.
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
171 """
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
172
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
173 for subpath in [self.cdna_path, self.ncrna_path]:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
174
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
175 self.host.chdir(subpath)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
176
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
177 entries = self.host.listdir(self.host.curdir)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
178
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
179 for entry in entries:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
180 if self.host.path.isfile(entry):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
181 if entry.endswith('cdna.all.fa.gz') or entry.endswith('ncrna.fa.gz'):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
182
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
183 logging.debug('Processing human transcript file %s' % entry)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
184
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
185 with self.host.file(subpath + '/' + entry, 'rb') as zipped_handle:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
186 remote_content = BytesIO(zipped_handle.read())
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
187
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
188 with gzip.GzipFile(fileobj=remote_content) as gzipfile:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
189 for i, record in enumerate(SeqIO.parse(gzipfile, "fasta")):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
190
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
191 if record.id.startswith('ENST'):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
192
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
193 if i % 100000 == 0:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
194 logging.debug('Retrieved %i human transcripts' % i)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
195
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
196 record.description = ''
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
197
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
198 yield record
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
199
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
200 def _add_annotation(self, gtf_line):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
201 """ Parses gtf annotations and stores them in memory. """
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
202
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
203 if gtf_line[0] == '#':
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
204 return
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
205
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
206 fields = gtf_line.strip().split('\t')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
207 chromosome, source, feature, start, end, score, strands, frame, attributes\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
208 = fields
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
209
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
210 start, end = sorted([int(start), int(end)])
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
211
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
212 if feature != 'exon':
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
213 return
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
214
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
215 attributes = attributes.replace(' ', '').split(';')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
216 transcript_ids = [identifier.split('"')[1] for identifier in attributes\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
217 if identifier.startswith('transcript_id')]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
218
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
219 gene_names = [identifier.split('"')[1] for identifier in attributes\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
220 if identifier.startswith('gene_name')]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
221
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
222 for transcript_id in transcript_ids:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
223 self.genes[transcript_id] = self.genes[transcript_id].union(gene_names)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
224 self.regions[transcript_id].add((chromosome, start, end))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
225
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
226 def get_fasta_records(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
227 """ Yields annotated fasta records of human transcripts. """
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
228
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
229 logging.debug('Caching annotations')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
230 self._cache_annotations()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
231
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
232 logging.debug('Downloading and annotating transcripts')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
233 for record in self._get_raw_transcripts():
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
234 transcript_id = record.id
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
235
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
236 if transcript_id in self.regions:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
237
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
238 mapping_locations = self.regions[transcript_id]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
239
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
240 description = '|'.join([';'.join(map(str, location))\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
241 for location in mapping_locations])
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
242
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
243 # Obtain gene name for transcript
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
244 gene_names = self.genes[transcript_id]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
245 if gene_names:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
246 gene_name = list(gene_names)[0]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
247 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
248 gene_name = transcript_id
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
249
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
250 yield self._get_fasta_record(record, self.group , 'Hominidae', 'Homo_sapiens', gene_name, sub_identifier=transcript_id, description=description)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
251
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
252 class RefSeqDownloader(DataDownloader):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
253 """ Generic downloader that known how to interpret and parse genbank records """
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
254
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
255 def __init__(self, group):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
256
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
257 self.refseq_host_name = 'ftp.ncbi.nih.gov'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
258 self.base_path = '/genomes/' + group
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
259
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
260 super(RefSeqDownloader, self).__init__(self.refseq_host_name, self.base_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
261
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
262 self.group = group
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
263
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
264
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
265 @classmethod
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
266 def _get_project_from_genbank(self, gb_record):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
267 """ Parses genbank project identifier from genbank record. """
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
268
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
269 project = ''
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
270 for dbxref in gb_record.dbxrefs:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
271 for entry in dbxref.split(' '):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
272 if entry.startswith('BioProject'):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
273 project = entry.split(':')[1]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
274 return project
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
275
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
276 if not project:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
277 for dbxref in gb_record.dbxrefs:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
278 for entry in dbxref.split(' '):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
279 if entry.startswith('Project'):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
280 project = entry.split(':')[1]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
281 return project
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
282
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
283 return project
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
284
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
285 @classmethod
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
286 def _get_annotations_from_genbank(self, gb_record):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
287 """ Parses taxonomic annotation from genbank record. """
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
288
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
289 accession = gb_record.annotations['accessions'][0]
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
290 organism = gb_record.annotations['organism'].replace(' ', '_')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
291 organism = '_'.join(organism.split('_')[:2])
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
292 taxonomy = ("; ").join(gb_record.annotations['taxonomy'])
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
293 gi = gb_record.annotations['gi']
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
294
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
295 family = 'Unassigned'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
296 if len(gb_record.annotations['taxonomy']) > 0:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
297 for level in gb_record.annotations['taxonomy']:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
298 if level.endswith('dae') or level.endswith('aceae'):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
299 family = level
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
300 break
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
301
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
302 if family == 'Unassigned':
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
303 logging.debug('Unassigned family found based on %s' % taxonomy)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
304
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
305 return organism, accession, gi, taxonomy, family
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
306
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
307 @classmethod
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
308 def _get_record_from_genbank(self, gb_record):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
309 """ Parses sequence from genbank record. """
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
310
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
311 return SeqRecord(gb_record.seq)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
312
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
313 @classmethod
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
314 def _parse_genbank_records(self, file_handle):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
315
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
316 for gb_record in SeqIO.parse(file_handle, 'genbank'):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
317
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
318 project = self._get_project_from_genbank(gb_record)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
319 organism, accession, gi, taxonomy, family =\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
320 self._get_annotations_from_genbank(gb_record)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
321 record = self._get_record_from_genbank(gb_record)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
322
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
323 if len(record) > 0:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
324 yield record, project, taxonomy, family, organism, gi
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
325
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
326 def _get_nc(self, entry_path):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
327
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
328 if self.host.path.isfile(entry_path):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
329 logging.debug('Processing refseq entry %s' % entry_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
330 with self.host.file(entry_path) as remote_handle:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
331
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
332
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
333 for record, project, taxonomy, family, organism, gi\
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
334 in self._parse_genbank_records(remote_handle):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
335
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
336 cleaned = self._get_fasta_record(record, self.group , family, organism, gi)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
337
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
338 yield cleaned
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
339
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
340 def _get_nz(self, entry_path):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
341
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
342 scaffold_path = entry_path.replace('.gbk', '.scaffold.gbk.tgz')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
343 if self.host.path.isfile(scaffold_path):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
344 logging.debug('Processing refseq entry %s' % scaffold_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
345 with self.host.file(scaffold_path, 'rb') as remote_handle:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
346 remote_content = BytesIO(remote_handle.read())
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
347 with tarfile.open(fileobj=remote_content) as tar:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
348 for subentry in tar.getnames():
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
349 if subentry.endswith('.gbk'):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
350 logging.debug('Processing subaccession %s' % subentry)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
351 subhandle = tar.extractfile(subentry)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
352 for record, project, taxonomy, family, organism, gi in self._parse_genbank_records(subhandle):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
353 yield self._get_fasta_record(record, self.group , family, organism, gi)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
354
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
355
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
356 def get_fasta_records(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
357
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
358 species_dirs = self.host.listdir(self.host.curdir)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
359
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
360 for species_dir in species_dirs:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
361
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
362 species_path = self.base_path + '/' + species_dir
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
363
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
364 if self.host.path.isdir(species_path):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
365
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
366 self.host.chdir(species_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
367
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
368 logging.debug('Processing species directory %s' % species_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
369
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
370 if self.host.path.isdir(species_path):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
371
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
372 self.host.chdir(species_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
373
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
374 entries = self.host.listdir(self.host.curdir)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
375
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
376 for entry in entries:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
377
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
378 if self.host.path.isfile(self.host.curdir + '/' + entry):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
379
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
380 if entry.endswith('.gbk') or entry.endswith('.gbk.gz'):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
381
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
382 logging.debug('Processing entry %s' % entry)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
383
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
384 entry_path = species_path + '/' + entry
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
385
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
386 if entry.startswith('NC_'):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
387 for record in self._get_nc(entry_path):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
388 yield record
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
389
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
390 elif entry.startswith('NZ_'):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
391 for record in self._get_nz(entry_path):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
392 yield record
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
393
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
394 class HumanGenomeDownloader(DataDownloader):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
395
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
396 def __init__(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
397
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
398 self.ensemble_host_name = 'ftp.ensembl.org'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
399 self.base_path = '/pub/current_fasta/homo_sapiens/dna'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
400
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
401 super(HumanGenomeDownloader, self).__init__(self.ensemble_host_name, self.base_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
402
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
403 self.group = 'Homo_sapiens'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
404
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
405 def get_fasta_records(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
406
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
407 entries = self.host.listdir(self.host.curdir)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
408 for entry in entries:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
409
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
410 if self.host.path.isfile(entry):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
411
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
412 # You may want to change your filtering criteria here if you would like to include patches and haplotypes
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
413 if entry.endswith('fa.gz') and 'dna_sm.primary_assembly' in entry:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
414
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
415 logging.debug('Processing human entry %s' % entry)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
416
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
417 with self.host.file(self.host.getcwd() + '/' + entry, 'rb') as zipped_handle:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
418 remote_content = BytesIO(zipped_handle.read())
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
419
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
420 with gzip.GzipFile(fileobj=remote_content) as gzipfile:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
421
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
422 for record in SeqIO.parse(gzipfile, "fasta"):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
423
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
424 identifier = record.id
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
425 organism = 'Homo_sapiens'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
426 family = 'Hominidae'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
427
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
428 yield self._get_fasta_record(record, self.group, family, organism, identifier)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
429
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
430 class UniVecDownloader(DataDownloader):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
431
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
432 def __init__(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
433
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
434 self.univec_host_name = 'ftp.ncbi.nih.gov'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
435 self.base_path = '/pub/UniVec'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
436
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
437 super(UniVecDownloader, self).__init__(self.univec_host_name, self.base_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
438
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
439 self.group = 'UniVec'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
440
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
441 def get_fasta_records(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
442
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
443 logging.debug('Processing Univec entries')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
444
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
445 with self.host.file(self.host.getcwd() + '/' + 'UniVec') as remote_handle:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
446 for record in SeqIO.parse(remote_handle, "fasta"):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
447 organism = '_'.join(record.description.split(' ')[1:])
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
448 family = 'Unassigned'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
449 identifier = '000000000'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
450
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
451 yield self._get_fasta_record(record, self.group, family, organism, identifier)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
452
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
453 class CLI(cli.Application):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
454 """"""
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
455 PROGNAME = "metamap"
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
456 VERSION = "1.0.0"
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
457 DESCRIPTION = """"""
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
458
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
459 def main(self, *args):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
460
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
461 if args:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
462 print("Unknown command %r" % (args[0]))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
463 print self.USAGE
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
464 return 1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
465
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
466 if not self.nested_command:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
467 print("No command given")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
468 print self.USAGE
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
469 return 1
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
470
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
471 @CLI.subcommand("fasta")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
472 class References(cli.Application):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
473 """ Obtains NCBI RefSeq and Ensembl reference genomes and transcripts"""
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
474
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
475 valid_references = ['Fungi', 'Fungi_DRAFT', 'Bacteria',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
476 'Bacteria_DRAFT', 'Homo_sapiens',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
477 'Viruses', 'UniVec', 'Plasmids',
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
478 'Protozoa', 'rRNA', 'Homo_sapiens_cDNA']
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
479
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
480 references = cli.SwitchAttr(['-r', '--reference'],
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
481 cli.Set(*valid_references, case_sensitive=True),
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
482 list=True, default=valid_references,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
483 mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
484 help="Sets the kind of references to obtain")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
485
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
486 fasta_path = cli.SwitchAttr(['-o', '--output_file'], str, mandatory=True,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
487 help="Sets the fasta output file")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
488
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
489 zipped = cli.Flag(["-z", "--zipped"], help="Write gzipped output")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
490 debug = cli.Flag(["-d", "--debug"], help="Enable debug messages")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
491
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
492
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
493 def main(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
494 """ Downloads genome fasta files from reference databases"""
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
495
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
496 if self.debug:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
497 logging.getLogger().setLevel(logging.DEBUG)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
498
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
499 if os.path.dirname(self.fasta_path) and not os.path.exists(os.path.dirname(self.fasta_path)):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
500 logging.debug('Making directories for output file %s' % self.fasta_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
501 os.makedirs(os.path.dirname(self.fasta_path))
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
502
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
503 if self.zipped:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
504 logging.debug('Making zipped output file %s' % self.fasta_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
505 output_file = gzip.open(self.fasta_path, 'wb')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
506
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
507 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
508 logging.debug('Making regular output file %s' % self.fasta_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
509 output_file = open(self.fasta_path, 'w', buffering=100 * 1024 * 1024)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
510
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
511
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
512 for reference in self.references:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
513
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
514 if reference == 'UniVec':
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
515 downloader = UniVecDownloader()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
516 elif reference == 'Homo_sapiens':
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
517 downloader = HumanGenomeDownloader()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
518 elif reference == 'rRNA':
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
519 downloader = SilvaDownlaoder()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
520 elif reference == 'Homo_sapiens_cDNA':
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
521 downloader = HumanTranscriptDownloader()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
522 else:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
523 downloader = RefSeqDownloader(group=reference)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
524
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
525 for record in downloader.get_fasta_records():
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
526 SeqIO.write([record], output_file, "fasta")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
527
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
528 output_file.close()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
529
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
530 @CLI.subcommand("blast")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
531 class Blast(cli.Application):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
532 """ Obtains blast databases """
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
533
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
534 valid_references = ['nt', 'nr']
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
535
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
536 references = cli.SwitchAttr(['-r', '--reference_database'],
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
537 cli.Set(*valid_references, case_sensitive=True),
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
538 list=True, default=valid_references,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
539 mandatory=False,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
540 help="Sets the kind of database to obtain")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
541
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
542 database_path = cli.SwitchAttr(['-o', '--output_path'], str, mandatory=True,
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
543 help="Sets the fasta output file")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
544
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
545 debug = cli.Flag(["-d", "--debug"], help="Enable debug messages")
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
546
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
547 def main(self):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
548 """ Downloads blast database files """
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
549
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
550 if self.debug:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
551 logging.getLogger().setLevel(logging.DEBUG)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
552
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
553 if not os.path.exists(self.database_path):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
554 logging.debug('Making output directory %s' % self.database_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
555 os.makedirs(self.database_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
556
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
557 host_name = 'ftp.ncbi.nih.gov'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
558 host = ftputil.FTPHost(host_name, 'anonymous', 'anonymous')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
559
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
560 for reference in self.references:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
561
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
562 path = '/blast/db'
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
563 host.chdir(path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
564
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
565 entries = host.listdir(host.curdir)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
566
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
567 for entry in entries:
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
568
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
569 if host.path.isfile(entry):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
570
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
571 if entry.startswith(reference) and entry.endswith('.tar.gz'):
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
572 logging.debug('Downloading %s' % entry)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
573 local_path = os.path.join(self.database_path, entry)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
574 host.download(entry, local_path, 'b')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
575 logging.debug('Unpacking %s' % entry)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
576 tfile = tarfile.open(local_path, 'r:gz')
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
577 tfile.extractall(self.database_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
578 os.remove(local_path)
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
579
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
580 if __name__ == "__main__":
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
581 CLI.run()
f63d639b223c Initial Upload
mzeidler
parents:
diff changeset
582