0
|
1 =head1 NAME
|
|
2
|
|
3 NCBITaxon - DESCRIPTION of Object
|
|
4
|
|
5 =head1 DESCRIPTION
|
|
6
|
|
7 An object that hold a node within a taxonomic tree. Inherits from NestedSet.
|
|
8
|
|
9 From Bio::Species
|
|
10 classification
|
|
11 common_name
|
|
12 binomial
|
|
13
|
|
14 Here are also the additional methods in Bio::Species that "might" be useful, but let us
|
|
15 forget about these for now.
|
|
16 genus
|
|
17 species
|
|
18 sub_species
|
|
19 variant
|
|
20 organelle
|
|
21 division
|
|
22
|
|
23 =head1 CONTACT
|
|
24
|
|
25 Contact Jessica Severin on implemetation/design detail: jessica@ebi.ac.uk
|
|
26 Contact Ewan Birney on EnsEMBL in general: birney@sanger.ac.uk
|
|
27
|
|
28 =head1 APPENDIX
|
|
29
|
|
30 The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
|
|
31
|
|
32 =cut
|
|
33
|
|
34 package Bio::EnsEMBL::Compara::NCBITaxon;
|
|
35
|
|
36 use strict;
|
|
37 use Bio::Species;
|
|
38 use Bio::EnsEMBL::Compara::NestedSet;
|
|
39 use Bio::EnsEMBL::Utils::Exception;
|
|
40 use Bio::EnsEMBL::Utils::Argument;
|
|
41
|
|
42 our @ISA = qw(Bio::EnsEMBL::Compara::NestedSet);
|
|
43
|
|
44 =head2 copy
|
|
45
|
|
46 Arg [1] : int $member_id (optional)
|
|
47 Example :
|
|
48 Description: returns copy of object, calling superclass copy method
|
|
49 Returntype :
|
|
50 Exceptions :
|
|
51 Caller :
|
|
52
|
|
53 =cut
|
|
54
|
|
55 sub copy {
|
|
56 my $self = shift;
|
|
57
|
|
58 my $mycopy = $self->SUPER::copy;
|
|
59 bless $mycopy, "Bio::EnsEMBL::Compara::NCBITaxon";
|
|
60
|
|
61 $mycopy->ncbi_taxid($self->ncbi_taxid);
|
|
62 $mycopy->rank($self->rank);
|
|
63 $mycopy->genbank_hidden_flag($self->genbank_hidden_flag);
|
|
64
|
|
65 return $mycopy;
|
|
66 }
|
|
67
|
|
68
|
|
69 sub ncbi_taxid {
|
|
70 my $self = shift;
|
|
71 my $value = shift;
|
|
72 $self->node_id($value) if($value);
|
|
73 return $self->node_id;
|
|
74 }
|
|
75
|
|
76 sub taxon_id {
|
|
77 my $self = shift;
|
|
78 my $value = shift;
|
|
79 $self->node_id($value) if($value);
|
|
80 return $self->node_id;
|
|
81 }
|
|
82
|
|
83 sub dbID {
|
|
84 my $self = shift;
|
|
85 my $value = shift;
|
|
86 $self->node_id($value) if($value);
|
|
87 return $self->node_id;
|
|
88 }
|
|
89
|
|
90 sub rank {
|
|
91 my $self = shift;
|
|
92 $self->{'_rank'} = shift if(@_);
|
|
93 return $self->{'_rank'};
|
|
94 }
|
|
95
|
|
96 sub genbank_hidden_flag {
|
|
97 my $self = shift;
|
|
98 $self->{'_genbank_hidden_flag'} = shift if(@_);
|
|
99 return $self->{'_genbank_hidden_flag'};
|
|
100 }
|
|
101
|
|
102 =head2 classification
|
|
103
|
|
104 Arg[SEPARATOR] : String (optional); used to separate the classification by
|
|
105 when returning as a string. If not specified then a single
|
|
106 space will be used.
|
|
107 Arg[FULL] : Boolean (optional); indicates we want all nodes including
|
|
108 those which Genbank sets as hidden
|
|
109 Arg[AS_ARRAY] : Boolean (optional); says the return type will be an
|
|
110 ArrayRef of all nodes in the classification as instances
|
|
111 of NCBITaxon.
|
|
112 Example : my $classification_string = $node->classification();
|
|
113 Description : Returns the String representation of a taxon node's
|
|
114 classification or the objects which constitute it (
|
|
115 including the current node). The String return when
|
|
116 split is compatible with BioPerl's Species classification
|
|
117 code and will return a data structure compatible with
|
|
118 that found in core species MetaContainers.
|
|
119
|
|
120 This code is a redevelopment of existing code which
|
|
121 descended down the taxonomy which had disadvanatages
|
|
122 when a classification was requested on nodes causing
|
|
123 the taxonomy to bi/multi-furcate.
|
|
124
|
|
125 Note the String representation does have some disadvantages
|
|
126 when working with the poorer end of the taxonomy where
|
|
127 species nodes are not well defined. For these situations
|
|
128 you are better using the array representation and
|
|
129 capturing the required information from the nodes.
|
|
130
|
|
131 Also to maintain the original functionality of the method
|
|
132 we filter any species, subspecies or subgenus nodes above
|
|
133 the current node. For the true classification always
|
|
134 call using the array structure.
|
|
135
|
|
136 Recalling this subroutine with the same parameters for
|
|
137 separators will return a cached representation. Calling
|
|
138 for AS_ARRAY will cause the classificaiton to be
|
|
139 recalculated each time.
|
|
140 Returntype : String if not asking for an array otherwise the array
|
|
141 Exceptions : -
|
|
142 Caller : Public
|
|
143
|
|
144 =cut
|
|
145
|
|
146 sub classification {
|
|
147 my ($self, @args) = @_;
|
|
148 my ($separator, $full, $as_array) = rearrange([qw( SEPARATOR FULL AS_ARRAY )], @args);
|
|
149
|
|
150 #setup defaults
|
|
151 $separator = ' ' unless(defined $separator);
|
|
152 $full = 0 unless (defined $full);
|
|
153
|
|
154 if(!$as_array) {
|
|
155 #Reset the separators & classifications if we already had one & it
|
|
156 #differed from the input
|
|
157 if(defined $self->{_separator} && $self->{_separator} ne $separator) {
|
|
158 $self->{_separator} = undef;
|
|
159 $self->{_classification} = undef;
|
|
160 }
|
|
161 if(defined $self->{_separator_full} && $self->{_separator_full} ne $separator) {
|
|
162 $self->{_separator_full} = undef;
|
|
163 $self->{_classification_full} = undef;
|
|
164 }
|
|
165
|
|
166 $self->{_separator} = $separator unless (defined $self->{_separator});
|
|
167 $self->{_separator_full} = $separator unless (defined $self->{_separator_full});
|
|
168
|
|
169 return $self->{_classification_full} if ($full && defined $self->{_classification_full});
|
|
170 return $self->{_classification} if (!$full && defined $self->{_classification});
|
|
171 }
|
|
172
|
|
173 my $node = $self;
|
|
174 my @classification;
|
|
175 while( $node->name() ne 'root' ) {
|
|
176 my $subgenus = $node->rank() eq 'subgenus';
|
|
177 if($full) {
|
|
178 push(@classification, $node);
|
|
179 }
|
|
180 else {
|
|
181 unless($node->genbank_hidden_flag() || $subgenus) {
|
|
182 push(@classification, $node);
|
|
183 }
|
|
184 }
|
|
185
|
|
186 $node = $node->parent();
|
|
187 }
|
|
188
|
|
189 if($as_array) {
|
|
190 return \@classification;
|
|
191 }
|
|
192
|
|
193 #Once we have a normal array we can do top-down as before to replicate
|
|
194 #the original functionality
|
|
195 my $text_classification = $self->_to_text_classification(\@classification);
|
|
196
|
|
197 if ($full) {
|
|
198 $self->{_classification_full} = join($separator, @{$text_classification});
|
|
199 $self->{_separator_full} = $separator;
|
|
200 return $self->{_classification_full};
|
|
201 } else {
|
|
202 $self->{_classification} = join($separator, @{$text_classification});
|
|
203 $self->{_separator} = $separator;
|
|
204 return $self->{_classification};
|
|
205 }
|
|
206 }
|
|
207
|
|
208 =head2 _to_text_classification
|
|
209
|
|
210 Arg[1] : ArrayRef of the classification array to be converted to
|
|
211 the text classification
|
|
212 Example : my $array = $node->_to_text_classification(\@classification);
|
|
213 Description : Returns the Array representation of a taxon node's
|
|
214 classification or the objects which constitute it (
|
|
215 including the current node) as the species names or split
|
|
216 according to the rules for working with BioPerl.
|
|
217 Returntype : ArrayRef of Strings
|
|
218 Exceptions : -
|
|
219 Caller : Private
|
|
220
|
|
221 =cut
|
|
222
|
|
223 sub _to_text_classification {
|
|
224 my ($self, $classification) = @_;
|
|
225 my @text_classification;
|
|
226 my $first = 1;
|
|
227 for my $node ( @{$classification}) {
|
|
228 my $subgenus = $node->rank() eq 'subgenus';
|
|
229 my $species = $node->rank() eq 'species';
|
|
230 my $subspecies = $node->rank() eq 'subspecies';
|
|
231
|
|
232 if($first) {
|
|
233 if($species || $subspecies) {
|
|
234 my ($genus, $species, $subspecies) = split(q{ }, $node->binomial());
|
|
235 unshift @text_classification, $species;
|
|
236 unshift @text_classification, $subspecies if (defined $subspecies);
|
|
237 }
|
|
238 $first = 0;
|
|
239 next;
|
|
240 }
|
|
241
|
|
242 next if $subgenus || $species || $subspecies;
|
|
243 push(@text_classification, $node->name());
|
|
244 }
|
|
245 return \@text_classification;
|
|
246 }
|
|
247
|
|
248 =head2 subspecies
|
|
249
|
|
250 Example : $ncbi->subspecies;
|
|
251 Description: Returns the subspeceis name for this species
|
|
252 Example : "verus" for Pan troglodytes verus
|
|
253 Returntype : string
|
|
254 Exceptions :
|
|
255 Caller : general
|
|
256
|
|
257 =cut
|
|
258
|
|
259 sub subspecies {
|
|
260 my $self = shift;
|
|
261
|
|
262 unless (defined $self->{'_species'}) {
|
|
263 my ($genus, $species, $subspecies) = split(" ", $self->binomial);
|
|
264 $self->{'_species'} = $species;
|
|
265 $self->{'_genus'} = $genus;
|
|
266 $self->{'_subspecies'} = $subspecies;
|
|
267 }
|
|
268
|
|
269 return $self->{'_species'};
|
|
270 }
|
|
271
|
|
272
|
|
273 =head2 species
|
|
274
|
|
275 Example : $ncbi->species;
|
|
276 Description: Returns the speceis name for this species
|
|
277 Example : "sapiens" for Homo sapiens
|
|
278 Returntype : string
|
|
279 Exceptions :
|
|
280 Caller : general
|
|
281
|
|
282 =cut
|
|
283
|
|
284 sub species {
|
|
285 my $self = shift;
|
|
286
|
|
287 unless (defined $self->{'_species'}) {
|
|
288 my ($genus, $species, $subspecies) = split(" ", $self->binomial);
|
|
289 $self->{'_species'} = $species;
|
|
290 $self->{'_genus'} = $genus;
|
|
291 $self->{'_subspecies'} = $subspecies;
|
|
292 }
|
|
293
|
|
294 return $self->{'_species'};
|
|
295 }
|
|
296
|
|
297
|
|
298 =head2 genus
|
|
299
|
|
300 Example : $ncbi->genus;
|
|
301 Description: Returns the genus name for this species
|
|
302 Returntype : string
|
|
303 Example : "Homo" for Homo sapiens
|
|
304 Exceptions :
|
|
305 Caller : general
|
|
306
|
|
307 =cut
|
|
308
|
|
309 sub genus {
|
|
310 my $self = shift;
|
|
311
|
|
312 unless (defined $self->{'_genus'}) {
|
|
313 my ($genus, $species, $subspecies) = split(" ", $self->binomial);
|
|
314 $self->{'_species'} = $species;
|
|
315 $self->{'_genus'} = $genus;
|
|
316 $self->{'_subspecies'} = $subspecies;
|
|
317 }
|
|
318
|
|
319 return $self->{'_genus'};
|
|
320 }
|
|
321
|
|
322 =head2 common_name
|
|
323
|
|
324 Example : $ncbi->common_name;
|
|
325 Description: The comon name as defined by Genbank
|
|
326 Returntype : string
|
|
327 Exceptions : returns undef if no genbank common name exists.
|
|
328 Caller : general
|
|
329
|
|
330 =cut
|
|
331
|
|
332 sub common_name {
|
|
333 my $self = shift;
|
|
334 if ($self->has_tag('genbank common name') && $self->rank eq 'species') {
|
|
335 return $self->get_tagvalue('genbank common name');
|
|
336 } else {
|
|
337 return undef;
|
|
338 }
|
|
339 }
|
|
340
|
|
341 =head2 ensembl_alias_name
|
|
342
|
|
343 Example : $ncbi->ensembl_alias_name;
|
|
344 Description: The comon name as defined by ensembl alias
|
|
345 Returntype : string
|
|
346 Exceptions : returns undef if no ensembl alias name exists.
|
|
347 Caller : general
|
|
348
|
|
349 =cut
|
|
350
|
|
351 sub ensembl_alias_name {
|
|
352 my $self = shift;
|
|
353
|
|
354 #Not checking for rank as we do above, because we do not get dog since the
|
|
355 #rank for dog is subspecies (ensembl-51).
|
|
356 if ($self->has_tag('ensembl alias name')) {
|
|
357 return $self->get_tagvalue('ensembl alias name');
|
|
358 } else {
|
|
359 return undef;
|
|
360 }
|
|
361 }
|
|
362
|
|
363
|
|
364 =head scientific_name
|
|
365
|
|
366 Example : $ncbi->scientific_name;
|
|
367 Description: The scientific name of this taxon
|
|
368 Returntype : string
|
|
369 Exceptions :
|
|
370 Caller : general
|
|
371
|
|
372 =cut
|
|
373
|
|
374 sub scientific_name {
|
|
375 my ($self) = @_;
|
|
376 return $self->get_tagvalue('scientific name');
|
|
377 }
|
|
378
|
|
379 =head2 binomial
|
|
380
|
|
381 Example : $ncbi->binomial;
|
|
382 Description: The binomial name (AKA the scientific name) of this genome
|
|
383 Returntype : string
|
|
384 Exceptions : warns when node is not a species or has no scientific name
|
|
385 Caller : general
|
|
386
|
|
387 =cut
|
|
388
|
|
389 sub binomial {
|
|
390 my $self = shift;
|
|
391 if ($self->has_tag('scientific name') && ($self->rank eq 'species' || $self->rank eq 'subspecies')) {
|
|
392 return $self->scientific_name;
|
|
393 } else {
|
|
394 warning("taxon_id=",$self->node_id," is not a species or subspecies. So binomial is undef (try the scientific_name method)\n");
|
|
395 return undef;
|
|
396 }
|
|
397 }
|
|
398
|
|
399 =head2 ensembl_alias
|
|
400
|
|
401 Example : $ncbi->ensembl_alias;
|
|
402 Description: The ensembl_alias name (AKA the name in the ensembl website) of this genome
|
|
403 Returntype : string
|
|
404 Exceptions : warns when node is not a species or has no ensembl_alias
|
|
405 Caller : general
|
|
406
|
|
407 =cut
|
|
408
|
|
409 sub ensembl_alias {
|
|
410 my $self = shift;
|
|
411 if ($self->has_tag('ensembl alias name')) {
|
|
412 return $self->get_tagvalue('ensembl alias name');
|
|
413 } else {
|
|
414 warning("taxon_id=",$self->node_id," is not a species or subspecies. So ensembl_alias is undef\n");
|
|
415 return undef;
|
|
416 }
|
|
417 }
|
|
418
|
|
419
|
|
420 =head2 short_name
|
|
421
|
|
422 Example : $ncbi->short_name;
|
|
423 Description: The name of this genome in the Gspe ('G'enera
|
|
424 'spe'cies) format.
|
|
425 Returntype : string
|
|
426 Exceptions : none
|
|
427 Caller : general
|
|
428
|
|
429 =cut
|
|
430
|
|
431 sub short_name {
|
|
432 my $self = shift;
|
|
433 my $name = $self->name;
|
|
434 $name =~ s/(\S)\S+\s(\S{3})\S+/$1$2/;
|
|
435 $name =~ s/\ //g;
|
|
436 return $name;
|
|
437 }
|
|
438
|
|
439 sub get_short_name {
|
|
440 my $self = shift;
|
|
441 return $self->short_name;
|
|
442 }
|
|
443
|
|
444
|
|
445 sub RAP_species_format {
|
|
446 my $self = shift;
|
|
447 my $newick = "";
|
|
448
|
|
449 if($self->get_child_count() > 0) {
|
|
450 $newick .= "(";
|
|
451 my $first_child=1;
|
|
452 foreach my $child (@{$self->sorted_children}) {
|
|
453 $newick .= "," unless($first_child);
|
|
454 $newick .= $child->newick_format;
|
|
455 $first_child = 0;
|
|
456 }
|
|
457 $newick .= ")";
|
|
458 }
|
|
459
|
|
460 $newick .= sprintf("\"%s\"", $self->name,);
|
|
461 $newick .= sprintf(":%1.4f", $self->distance_to_parent) if($self->distance_to_parent > 0);
|
|
462
|
|
463 if(!($self->has_parent)) {
|
|
464 $newick .= ";";
|
|
465 }
|
|
466 return $newick;
|
|
467 }
|
|
468
|
|
469
|
|
470 sub print_node {
|
|
471 my $self = shift;
|
|
472 printf("(%s", $self->node_id);
|
|
473 printf(" %s", $self->rank) if($self->rank);
|
|
474 print(")");
|
|
475 printf("%s", $self->name) if($self->name);
|
|
476 print("\n");
|
|
477 }
|
|
478
|
|
479 1;
|