0
|
1 # $Id: locuslink.pm,v 1.2.2.2 2003/03/13 02:09:20 lapp Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::SeqIO::locuslink
|
|
4 #
|
|
5 # Cared for by Keith Ching <kching at gnf.org>
|
|
6 #
|
|
7 # Copyright Keith Ching
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10
|
|
11 #
|
|
12 # (c) Keith Ching, kching at gnf.org, 2002.
|
|
13 # (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002.
|
|
14 #
|
|
15 # You may distribute this module under the same terms as perl itself.
|
|
16 # Refer to the Perl Artistic License (see the license accompanying this
|
|
17 # software package, or see http://www.perl.com/language/misc/Artistic.html)
|
|
18 # for the terms under which you may use, modify, and redistribute this module.
|
|
19 #
|
|
20 # THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
|
|
21 # WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
|
|
22 # MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
|
|
23 #
|
|
24
|
|
25 # POD documentation - main docs before the code
|
|
26
|
|
27 =head1 NAME
|
|
28
|
|
29 Bio::SeqIO::locuslink - DESCRIPTION of Object
|
|
30
|
|
31 =head1 SYNOPSIS
|
|
32
|
|
33 # don't instantiate directly - instead do
|
|
34 my $seqio = Bio::SeqIO->new(-format => "locuslink", -file => \STDIN);
|
|
35
|
|
36 =head1 DESCRIPTION
|
|
37
|
|
38 This module parses LocusLink into Bio::SeqI objects with rich
|
|
39 annotation, but no sequence.
|
|
40
|
|
41 The input file has to be in the LL_tmpl format - the tabular format
|
|
42 will not work.
|
|
43
|
|
44 The way the current implementation populates the object is rather a
|
|
45 draft work than a finished work of art. Note that at this stage the
|
|
46 locuslink entries cannot be round-tripped, because the parser loses
|
|
47 certain information. For instance, most of the alternative transcript
|
|
48 descriptions are not retained. The parser also misses any element
|
|
49 that deals with visual representation (e.g., 'button') except for the
|
|
50 URLs. Almost all of the pieces of the annotation are kept in the
|
|
51 L<Bio::Annotation::Collection> object.
|
|
52
|
|
53 =head1 FEEDBACK
|
|
54
|
|
55 =head2 Mailing Lists
|
|
56
|
|
57 User feedback is an integral part of the evolution of this and other
|
|
58 Bioperl modules. Send your comments and suggestions preferably to
|
|
59 the Bioperl mailing list. Your participation is much appreciated.
|
|
60
|
|
61 bioperl-l@bioperl.org - General discussion
|
|
62 http://bioperl.org/MailList.shtml - About the mailing lists
|
|
63
|
|
64 =head2 Reporting Bugs
|
|
65
|
|
66 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
67 of the bugs and their resolution. Bug reports can be submitted via
|
|
68 email or the web:
|
|
69
|
|
70 bioperl-bugs@bioperl.org
|
|
71 http://bugzilla.bioperl.org/
|
|
72
|
|
73 =head1 AUTHOR - Keith Ching
|
|
74
|
|
75 Email kching at gnf.org
|
|
76
|
|
77 Describe contact details here
|
|
78
|
|
79 =head1 CONTRIBUTORS
|
|
80
|
|
81 Hilmar Lapp, hlapp at gmx.net
|
|
82
|
|
83 =head1 APPENDIX
|
|
84
|
|
85 The rest of the documentation details each of the object methods.
|
|
86 Internal methods are usually preceded with a _
|
|
87
|
|
88 =cut
|
|
89
|
|
90 package Bio::SeqIO::locuslink;
|
|
91
|
|
92 use strict;
|
|
93 use vars qw(@ISA);
|
|
94
|
|
95 use Bio::SeqIO;
|
|
96 use Bio::Seq::SeqFactory;
|
|
97 use Bio::Species;
|
|
98 use Bio::Annotation::DBLink;
|
|
99 #use Bio::Annotation::Reference;
|
|
100 use Bio::Annotation::Comment;
|
|
101 use Bio::Annotation::SimpleValue;
|
|
102 use Bio::Annotation::OntologyTerm;
|
|
103 use Bio::Annotation::Collection;
|
|
104
|
|
105 @ISA = qw(Bio::SeqIO);
|
|
106
|
|
107 # list of all the field names in locuslink
|
|
108 my @locuslink_keys = qw(
|
|
109 ACCNUM
|
|
110 ALIAS_PROT
|
|
111 ALIAS_SYMBOL
|
|
112 ASSEMBLY
|
|
113 BUTTON
|
|
114 CDD
|
|
115 CHR
|
|
116 COMP
|
|
117 CONTIG
|
|
118 CURRENT_LOCUSID
|
|
119 DB_DESCR
|
|
120 DB_LINK
|
|
121 ECNUM
|
|
122 EVID
|
|
123 EXTANNOT
|
|
124 GO
|
|
125 GRIF
|
|
126 LINK
|
|
127 LOCUSID
|
|
128 LOCUS_CONFIRMED
|
|
129 LOCUS_TYPE
|
|
130 MAP
|
|
131 MAPLINK
|
|
132 NC
|
|
133 NG
|
|
134 NM
|
|
135 NP
|
|
136 NR
|
|
137 OFFICIAL_GENE_NAME
|
|
138 OFFICIAL_SYMBOL
|
|
139 OMIM
|
|
140 ORGANISM
|
|
141 PHENOTYPE
|
|
142 PHENOTYPE_ID
|
|
143 PMID
|
|
144 PREFERRED_GENE_NAME
|
|
145 PREFERRED_PRODUCT
|
|
146 PREFERRED_SYMBOL
|
|
147 PRODUCT
|
|
148 PROT
|
|
149 RELL
|
|
150 STATUS
|
|
151 STS
|
|
152 SUMFUNC
|
|
153 SUMMARY
|
|
154 TRANSVAR
|
|
155 TYPE
|
|
156 UNIGENE
|
|
157 XG
|
|
158 XM
|
|
159 XP
|
|
160 XR
|
|
161 );
|
|
162
|
|
163 # list of fields to make simple annotations from
|
|
164 # fields not listed here or as a key in feature hash are ignored (lost).
|
|
165 my %anntype_map = (
|
|
166 SimpleValue => [qw(
|
|
167 ALIAS_PROT
|
|
168 ALIAS_SYMBOL
|
|
169 CDD
|
|
170 CHR
|
|
171 CURRENT_LOCUSID
|
|
172 ECNUM
|
|
173 EXTANNOT
|
|
174 MAP
|
|
175 NC
|
|
176 NR
|
|
177 OFFICIAL_GENE_NAME
|
|
178 OFFICIAL_SYMBOL
|
|
179 PHENOTYPE
|
|
180 PREFERRED_GENE_NAME
|
|
181 PREFERRED_PRODUCT
|
|
182 PREFERRED_SYMBOL
|
|
183 PRODUCT
|
|
184 RELL
|
|
185 SUMFUNC
|
|
186 )
|
|
187 ],
|
|
188 Comment => [qw(
|
|
189 SUMMARY
|
|
190 )
|
|
191 ],
|
|
192 );
|
|
193
|
|
194
|
|
195 # certain fields are not named the same as the symgene database list
|
|
196 my %dbname_map = (
|
|
197 pfam => 'Pfam',
|
|
198 smart => 'SMART',
|
|
199 NM => 'RefSeq',
|
|
200 NP => 'RefSeq',
|
|
201 XP => 'RefSeq',
|
|
202 XM => 'RefSeq',
|
|
203 NG => 'RefSeq',
|
|
204 XG => 'RefSeq',
|
|
205 XR => 'RefSeq',
|
|
206 PROT => 'GenBank',
|
|
207 ACCNUM => 'GenBank',
|
|
208 CONTIG => 'GenBank',
|
|
209 # certain fields are not named the same as the symgene
|
|
210 # database list: rename the fields the symgene database name
|
|
211 # key = field name in locuslink
|
|
212 # value = database name in sym
|
|
213 #GO => 'GO',
|
|
214 OMIM => 'MIM',
|
|
215 GRIF => 'GRIF',
|
|
216 STS => 'STS',
|
|
217 UNIGENE => 'UniGene',
|
|
218 );
|
|
219
|
|
220 # certain CDD entries use the wrong prefix for the accession number
|
|
221 # cddprefix will replace the key w/ the value for these entries
|
|
222 my %cddprefix = (
|
|
223 pfam => 'PF',
|
|
224 smart => 'SM',
|
|
225 );
|
|
226
|
|
227 # alternate mappings if one field does not exist
|
|
228 my %alternate_map = (
|
|
229 OFFICIAL_GENE_NAME => 'PREFERRED_GENE_NAME',
|
|
230 OFFICIAL_SYMBOL => 'PREFERRED_SYMBOL',
|
|
231 );
|
|
232
|
|
233 # for these field names, we only care about the first value X in value X|Y|Z
|
|
234 my @ll_firstelements = qw(
|
|
235 NM
|
|
236 NP
|
|
237 NG
|
|
238 XG
|
|
239 XM
|
|
240 XP
|
|
241 XR
|
|
242 PROT
|
|
243 STS
|
|
244 ACCNUM
|
|
245 CONTIG
|
|
246 GRIF
|
|
247 );
|
|
248
|
|
249 # these fields need to be flattened into a single string, using the given
|
|
250 # join string
|
|
251 my %flatten_tags = (
|
|
252 ASSEMBLY => '',
|
|
253 ORGANISM => '',
|
|
254 OFFICIAL_SYMBOL => '',
|
|
255 OFFICIAL_GENE_NAME => '',
|
|
256 LOCUSID => '',
|
|
257 PMID => '',
|
|
258 PREFERRED_SYMBOL => ', ',
|
|
259 PREFERRED_GENE_NAME => ', '
|
|
260 );
|
|
261
|
|
262 # set the default search pattern for all the field names
|
|
263 my %feature_pat_map = map { ($_ , "^$_: (.+)\n"); } @locuslink_keys;
|
|
264
|
|
265 sub _initialize {
|
|
266 my($self,@args) = @_;
|
|
267
|
|
268 $self->SUPER::_initialize(@args);
|
|
269
|
|
270 # overwrite the search pattern w/ the first value pattern
|
|
271 foreach my $key(@ll_firstelements){
|
|
272 $feature_pat_map{$key}="^$key: ([^|]+)";
|
|
273 }
|
|
274
|
|
275 # special search pattern for cdd entries
|
|
276 foreach my $key(keys %cddprefix) {
|
|
277 $feature_pat_map{$key}='^CDD: .+\|'.$key.'(\d+)';
|
|
278 }
|
|
279
|
|
280 # special patterns for specific fields
|
|
281 $feature_pat_map{MAP} = '^MAP: (.+?)\|';
|
|
282 $feature_pat_map{MAPHTML} = '^MAP: .+\|(<.+>)\|';
|
|
283 $feature_pat_map{GO} = '^GO: .+\|.+\|\w+\|(GO:\d+)\|';
|
|
284 $feature_pat_map{GO_DESC} = '^GO: .+\|(.+)\|\w+\|GO:\d+\|';
|
|
285 $feature_pat_map{GO_CAT} = '^GO: (.+)\|.+\|\w+\|GO:\d+\|';
|
|
286 $feature_pat_map{EXTANNOT} = '^EXTANNOT: (.+)\|(.+)\|\w+\|.+\|\d+';
|
|
287
|
|
288 # set the sequence factory of none has been set already
|
|
289 if(! $self->sequence_factory()) {
|
|
290 $self->sequence_factory(Bio::Seq::SeqFactory->new(
|
|
291 -type => 'Bio::Seq::RichSeq'));
|
|
292 }
|
|
293 }
|
|
294
|
|
295
|
|
296 #########################
|
|
297 #
|
|
298 sub search_pattern{
|
|
299 #
|
|
300 #########################
|
|
301 my ($self,
|
|
302 $entry, #text to search
|
|
303 $searchconfirm, #to make sure you got the right thing
|
|
304 $searchpattern,
|
|
305 $searchtype) = @_;
|
|
306 my @query = $entry=~/$searchpattern/gm;
|
|
307 if ($searchconfirm ne "FALSE"){
|
|
308 $self->warn("No $searchtype found\n$entry\n") unless @query;
|
|
309 foreach (@query){
|
|
310 if (!($_=~/$searchconfirm/)){
|
|
311 $self->throw("error\n$entry\n$searchtype parse $_ does not match $searchconfirm\n");
|
|
312 }
|
|
313 }#endforeach
|
|
314 }#endsearchconfirm
|
|
315 return(@query);
|
|
316 }#endsub
|
|
317 ############
|
|
318 #
|
|
319 sub read_species{
|
|
320 #
|
|
321 ############
|
|
322 my ($spline)=@_;
|
|
323 my $species;
|
|
324 my $genus;
|
|
325 ($genus,$species)=$spline=~/([^ ]+) ([^ ]+)/;
|
|
326 my $make = Bio::Species->new();
|
|
327 $make->classification( ($species,$genus) );
|
|
328 return $make;
|
|
329 }
|
|
330 ################
|
|
331 #
|
|
332 sub read_dblink{
|
|
333 #
|
|
334 ################
|
|
335 my ($ann,$db,$ref)=@_;
|
|
336 my @results=$ref ? @$ref : ();
|
|
337 foreach my $id(@results){
|
|
338 if($id){
|
|
339 $ann->add_Annotation('dblink',
|
|
340 Bio::Annotation::DBLink->new(
|
|
341 -database =>$db ,
|
|
342 -primary_id =>$id));
|
|
343 }
|
|
344 }
|
|
345 return($ann);
|
|
346 }
|
|
347
|
|
348 ################
|
|
349 #
|
|
350 sub read_reference{
|
|
351 #
|
|
352 ################
|
|
353 my ($ann,$db,$results)=@_;
|
|
354
|
|
355 if($results){
|
|
356 chomp($results);
|
|
357 my @ids=split(/,/,$results);
|
|
358 $ann = read_dblink($ann,$db,\@ids) if @ids;
|
|
359 }
|
|
360 return $ann;
|
|
361 }#endsub
|
|
362
|
|
363 ################
|
|
364 #
|
|
365 sub add_annotation{
|
|
366 #
|
|
367 ################
|
|
368 my ($ac,$type,$text,$anntype)=@_;
|
|
369 my @args;
|
|
370
|
|
371 $anntype = 'SimpleValue' unless $anntype;
|
|
372 SWITCH : {
|
|
373 $anntype eq 'SimpleValue' && do {
|
|
374 push(@args, -value => $text, -tagname => $type);
|
|
375 last SWITCH;
|
|
376 };
|
|
377 $anntype eq 'Comment' && do {
|
|
378 push(@args, -text => $text, -tagname => 'comment');
|
|
379 last SWITCH;
|
|
380 };
|
|
381 }
|
|
382 $ac->add_Annotation("Bio::Annotation::$anntype"->new(@args));
|
|
383 return($ac);
|
|
384 }#endsub
|
|
385
|
|
386 ################
|
|
387 #
|
|
388 sub add_annotation_ref{
|
|
389 #
|
|
390 ################
|
|
391 my ($ann,$type,$textref)=@_;
|
|
392 my @text=$textref ? @$textref : ();
|
|
393
|
|
394 foreach my $text(@text){
|
|
395 $ann->add_Annotation($type,Bio::Annotation::SimpleValue->new(-value => $text));
|
|
396 }
|
|
397 return($ann);
|
|
398 }#endsub
|
|
399
|
|
400 ################
|
|
401 #
|
|
402 sub make_unique{
|
|
403 #
|
|
404 ##############
|
|
405 my ($ann,$key) = @_;
|
|
406
|
|
407 my %seen = ();
|
|
408 foreach my $dbl ($ann->remove_Annotations($key)) {
|
|
409 if(! $seen{$dbl->as_text()}) {
|
|
410 $seen{$dbl->as_text()} = 1;
|
|
411 $ann->add_Annotation($dbl);
|
|
412 }
|
|
413 }
|
|
414 return $ann;
|
|
415 }
|
|
416
|
|
417 ################
|
|
418 #
|
|
419 sub next_seq{
|
|
420 #
|
|
421 ##############
|
|
422 my ($self, @args)=@_;
|
|
423 my (@results,$search,$ref,$cddref);
|
|
424
|
|
425 # LOCUSLINK entries begin w/ >>
|
|
426 local $/="\n>>";
|
|
427
|
|
428 # slurp in a whole entry and return if no more entries
|
|
429 return unless my $entry = $self->_readline;
|
|
430
|
|
431 # strip the leading '>>' is it's the first entry
|
|
432 if (index($entry,'>>') == 0) { #first entry
|
|
433 $entry = substr($entry,2);
|
|
434 }
|
|
435
|
|
436 # we aren't interested in obsoleted entries, so we need to loop
|
|
437 # and skip those until we've found the next not obsoleted
|
|
438 my %record = ();
|
|
439 while($entry && ($entry =~ /\w/)) {
|
|
440 if (!($entry=~/LOCUSID/)){
|
|
441 $self->throw("No LOCUSID in first line of record. ".
|
|
442 "Not LocusLink in my book.");
|
|
443 }
|
|
444 # see whether it's an obsoleted entry, and if so jump to the next
|
|
445 # one entry right away
|
|
446 if($entry =~ /^CURRENT_LOCUSID:/m) {
|
|
447 # read next entry and continue
|
|
448 $entry = $self->_readline;
|
|
449 %record = ();
|
|
450 next;
|
|
451 }
|
|
452 # loop through list of features and get field values
|
|
453 # place into record hash as array refs
|
|
454 foreach my $key (keys %feature_pat_map){
|
|
455 $search=$feature_pat_map{$key};
|
|
456 @results=$self->search_pattern($entry,'FALSE',$search,$search);
|
|
457 $record{$key} = @results ? [@results] : undef;
|
|
458 }#endfor
|
|
459 # terminate loop as this one hasn't been obsoleted
|
|
460 last;
|
|
461 }
|
|
462
|
|
463 # we have reached the end-of-file ...
|
|
464 return unless %record;
|
|
465
|
|
466 # special processing for CDD entries like pfam and smart
|
|
467 my ($PRESENT,@keep);
|
|
468 foreach my $key(keys %cddprefix){
|
|
469 #print "check CDD $key\n";
|
|
470 if($record{$key}) {
|
|
471 @keep=();
|
|
472 foreach my $list (@{$record{$key}}) {
|
|
473 # replace AC with correct AC number
|
|
474 push(@keep,$cddprefix{$key}.$list);
|
|
475 }
|
|
476 # replace CDD ref with correctly prefixed AC number
|
|
477 $record{$key} = [@keep];
|
|
478 }
|
|
479 }
|
|
480 # modify CDD references @=();
|
|
481 if($record{CDD}) {
|
|
482 @keep=();
|
|
483 foreach my $cdd (@{$record{CDD}}) {
|
|
484 $PRESENT = undef;
|
|
485 foreach my $key (keys %cddprefix) {
|
|
486 if ($cdd=~/$key/){
|
|
487 $PRESENT = 1;
|
|
488 last;
|
|
489 }
|
|
490 }
|
|
491 push(@keep,$cdd) if(! $PRESENT);
|
|
492 }
|
|
493 $record{CDD} = [@keep];
|
|
494 }
|
|
495
|
|
496 # create annotation collection - we'll need it now
|
|
497 my $ann = Bio::Annotation::Collection->new();
|
|
498
|
|
499 foreach my $field(keys %dbname_map){
|
|
500 $ann=read_dblink($ann,$dbname_map{$field},$record{$field});
|
|
501 }
|
|
502
|
|
503 # add GO link as an OntologyTerm annotation
|
|
504 if($record{GO}) {
|
|
505 for(my $j = 0; $j < @{$record{GO}}; $j++) {
|
|
506 my $goann = Bio::Annotation::OntologyTerm->new(
|
|
507 -identifier => $record{GO}->[$j],
|
|
508 -name => $record{GO_DESC}->[$j],
|
|
509 -ontology => $record{GO_CAT}->[$j]);
|
|
510 $ann->add_Annotation($goann);
|
|
511 }
|
|
512 }
|
|
513
|
|
514 $ann=add_annotation_ref($ann,'URL',$record{LINK});
|
|
515 $ann=add_annotation_ref($ann,'URL',$record{DB_LINK});
|
|
516
|
|
517 # presently we can't store types of dblinks - hence make unique
|
|
518 make_unique($ann,'dblink');
|
|
519
|
|
520 # everything else gets a simple tag or comment value annotation
|
|
521 foreach my $anntype (keys %anntype_map) {
|
|
522 foreach my $key (@{$anntype_map{$anntype}}){
|
|
523 if($record{$key}){
|
|
524 foreach (@{$record{$key}}){
|
|
525 #print "$key\t\t$_\n";
|
|
526 $ann=add_annotation($ann,$key,$_,$anntype);
|
|
527 }
|
|
528 }
|
|
529 }
|
|
530 }
|
|
531
|
|
532 # flatten designated attributes into a scalar value
|
|
533 foreach my $field (keys %flatten_tags) {
|
|
534 if($record{$field}) {
|
|
535 $record{$field} = join($flatten_tags{$field},
|
|
536 @{$record{$field}});
|
|
537 }
|
|
538 }
|
|
539
|
|
540 # annotation that expects the array flattened out
|
|
541 $ann=read_reference($ann,'PUBMED',$record{PMID});
|
|
542 if($record{ASSEMBLY}) {
|
|
543 my @assembly=split(/,/,$record{ASSEMBLY});
|
|
544 $ann=read_dblink($ann,'GenBank',\@assembly);
|
|
545 }
|
|
546
|
|
547 # replace fields w/ alternate if original does not exist
|
|
548 foreach my $fieldval (keys %alternate_map){
|
|
549 if((! $record{$fieldval}) && ($record{$alternate_map{$fieldval}})){
|
|
550 $record{$fieldval}=$record{$alternate_map{$fieldval}};
|
|
551 }
|
|
552 }
|
|
553
|
|
554 # create sequence object (i.e., let seq.factory create one)
|
|
555 my $seq = $self->sequence_factory->create(
|
|
556 -verbose => $self->verbose(),
|
|
557 -accession_number => $record{LOCUSID},
|
|
558 -desc => $record{OFFICIAL_GENE_NAME},
|
|
559 -display_id => $record{OFFICIAL_SYMBOL},
|
|
560 -species => read_species($record{ORGANISM}),
|
|
561 -annotation => $ann);
|
|
562
|
|
563 # dump out object contents
|
|
564 # show_obj([$seq]);
|
|
565
|
|
566 return($seq);
|
|
567 }
|
|
568
|
|
569 ################
|
|
570 #
|
|
571 sub show_obj{
|
|
572 #
|
|
573 ################
|
|
574 my ($seqlistref)=@_;
|
|
575 my @list=@$seqlistref;
|
|
576 my $out = Bio::SeqIO->new('-fh' => \*STDOUT, -format => 'genbank' );
|
|
577 my ($ann,@values,$val);
|
|
578
|
|
579 foreach my $seq(@list){
|
|
580 $out->write_seq($seq);
|
|
581 $ann=$seq->annotation;
|
|
582 foreach my $key ( $ann->get_all_annotation_keys() ) {
|
|
583 @values = $ann->get_Annotations($key);
|
|
584 foreach my $value ( @values ) {
|
|
585 # value is an Bio::AnnotationI, and defines a "as_text" method
|
|
586 $val=$value->as_text;
|
|
587 print "Annotation ",$key,"\t\t",$val,"\n";
|
|
588 }
|
|
589 }
|
|
590 }
|
|
591 }#endsub
|
|
592
|
|
593 1;
|