comparison variant_effect_predictor/Bio/SeqIO/locuslink.pm @ 0:2bc9b66ada89 draft default tip

Uploaded
author mahtabm
date Thu, 11 Apr 2013 06:29:17 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:2bc9b66ada89
1 # $Id: locuslink.pm,v 1.2.2.2 2003/03/13 02:09:20 lapp Exp $
2 #
3 # BioPerl module for Bio::SeqIO::locuslink
4 #
5 # Cared for by Keith Ching <kching at gnf.org>
6 #
7 # Copyright Keith Ching
8 #
9 # You may distribute this module under the same terms as perl itself
10
11 #
12 # (c) Keith Ching, kching at gnf.org, 2002.
13 # (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002.
14 #
15 # You may distribute this module under the same terms as perl itself.
16 # Refer to the Perl Artistic License (see the license accompanying this
17 # software package, or see http://www.perl.com/language/misc/Artistic.html)
18 # for the terms under which you may use, modify, and redistribute this module.
19 #
20 # THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
21 # WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
22 # MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
23 #
24
25 # POD documentation - main docs before the code
26
27 =head1 NAME
28
29 Bio::SeqIO::locuslink - DESCRIPTION of Object
30
31 =head1 SYNOPSIS
32
33 # don't instantiate directly - instead do
34 my $seqio = Bio::SeqIO->new(-format => "locuslink", -file => \STDIN);
35
36 =head1 DESCRIPTION
37
38 This module parses LocusLink into Bio::SeqI objects with rich
39 annotation, but no sequence.
40
41 The input file has to be in the LL_tmpl format - the tabular format
42 will not work.
43
44 The way the current implementation populates the object is rather a
45 draft work than a finished work of art. Note that at this stage the
46 locuslink entries cannot be round-tripped, because the parser loses
47 certain information. For instance, most of the alternative transcript
48 descriptions are not retained. The parser also misses any element
49 that deals with visual representation (e.g., 'button') except for the
50 URLs. Almost all of the pieces of the annotation are kept in the
51 L<Bio::Annotation::Collection> object.
52
53 =head1 FEEDBACK
54
55 =head2 Mailing Lists
56
57 User feedback is an integral part of the evolution of this and other
58 Bioperl modules. Send your comments and suggestions preferably to
59 the Bioperl mailing list. Your participation is much appreciated.
60
61 bioperl-l@bioperl.org - General discussion
62 http://bioperl.org/MailList.shtml - About the mailing lists
63
64 =head2 Reporting Bugs
65
66 Report bugs to the Bioperl bug tracking system to help us keep track
67 of the bugs and their resolution. Bug reports can be submitted via
68 email or the web:
69
70 bioperl-bugs@bioperl.org
71 http://bugzilla.bioperl.org/
72
73 =head1 AUTHOR - Keith Ching
74
75 Email kching at gnf.org
76
77 Describe contact details here
78
79 =head1 CONTRIBUTORS
80
81 Hilmar Lapp, hlapp at gmx.net
82
83 =head1 APPENDIX
84
85 The rest of the documentation details each of the object methods.
86 Internal methods are usually preceded with a _
87
88 =cut
89
90 package Bio::SeqIO::locuslink;
91
92 use strict;
93 use vars qw(@ISA);
94
95 use Bio::SeqIO;
96 use Bio::Seq::SeqFactory;
97 use Bio::Species;
98 use Bio::Annotation::DBLink;
99 #use Bio::Annotation::Reference;
100 use Bio::Annotation::Comment;
101 use Bio::Annotation::SimpleValue;
102 use Bio::Annotation::OntologyTerm;
103 use Bio::Annotation::Collection;
104
105 @ISA = qw(Bio::SeqIO);
106
107 # list of all the field names in locuslink
108 my @locuslink_keys = qw(
109 ACCNUM
110 ALIAS_PROT
111 ALIAS_SYMBOL
112 ASSEMBLY
113 BUTTON
114 CDD
115 CHR
116 COMP
117 CONTIG
118 CURRENT_LOCUSID
119 DB_DESCR
120 DB_LINK
121 ECNUM
122 EVID
123 EXTANNOT
124 GO
125 GRIF
126 LINK
127 LOCUSID
128 LOCUS_CONFIRMED
129 LOCUS_TYPE
130 MAP
131 MAPLINK
132 NC
133 NG
134 NM
135 NP
136 NR
137 OFFICIAL_GENE_NAME
138 OFFICIAL_SYMBOL
139 OMIM
140 ORGANISM
141 PHENOTYPE
142 PHENOTYPE_ID
143 PMID
144 PREFERRED_GENE_NAME
145 PREFERRED_PRODUCT
146 PREFERRED_SYMBOL
147 PRODUCT
148 PROT
149 RELL
150 STATUS
151 STS
152 SUMFUNC
153 SUMMARY
154 TRANSVAR
155 TYPE
156 UNIGENE
157 XG
158 XM
159 XP
160 XR
161 );
162
163 # list of fields to make simple annotations from
164 # fields not listed here or as a key in feature hash are ignored (lost).
165 my %anntype_map = (
166 SimpleValue => [qw(
167 ALIAS_PROT
168 ALIAS_SYMBOL
169 CDD
170 CHR
171 CURRENT_LOCUSID
172 ECNUM
173 EXTANNOT
174 MAP
175 NC
176 NR
177 OFFICIAL_GENE_NAME
178 OFFICIAL_SYMBOL
179 PHENOTYPE
180 PREFERRED_GENE_NAME
181 PREFERRED_PRODUCT
182 PREFERRED_SYMBOL
183 PRODUCT
184 RELL
185 SUMFUNC
186 )
187 ],
188 Comment => [qw(
189 SUMMARY
190 )
191 ],
192 );
193
194
195 # certain fields are not named the same as the symgene database list
196 my %dbname_map = (
197 pfam => 'Pfam',
198 smart => 'SMART',
199 NM => 'RefSeq',
200 NP => 'RefSeq',
201 XP => 'RefSeq',
202 XM => 'RefSeq',
203 NG => 'RefSeq',
204 XG => 'RefSeq',
205 XR => 'RefSeq',
206 PROT => 'GenBank',
207 ACCNUM => 'GenBank',
208 CONTIG => 'GenBank',
209 # certain fields are not named the same as the symgene
210 # database list: rename the fields the symgene database name
211 # key = field name in locuslink
212 # value = database name in sym
213 #GO => 'GO',
214 OMIM => 'MIM',
215 GRIF => 'GRIF',
216 STS => 'STS',
217 UNIGENE => 'UniGene',
218 );
219
220 # certain CDD entries use the wrong prefix for the accession number
221 # cddprefix will replace the key w/ the value for these entries
222 my %cddprefix = (
223 pfam => 'PF',
224 smart => 'SM',
225 );
226
227 # alternate mappings if one field does not exist
228 my %alternate_map = (
229 OFFICIAL_GENE_NAME => 'PREFERRED_GENE_NAME',
230 OFFICIAL_SYMBOL => 'PREFERRED_SYMBOL',
231 );
232
233 # for these field names, we only care about the first value X in value X|Y|Z
234 my @ll_firstelements = qw(
235 NM
236 NP
237 NG
238 XG
239 XM
240 XP
241 XR
242 PROT
243 STS
244 ACCNUM
245 CONTIG
246 GRIF
247 );
248
249 # these fields need to be flattened into a single string, using the given
250 # join string
251 my %flatten_tags = (
252 ASSEMBLY => '',
253 ORGANISM => '',
254 OFFICIAL_SYMBOL => '',
255 OFFICIAL_GENE_NAME => '',
256 LOCUSID => '',
257 PMID => '',
258 PREFERRED_SYMBOL => ', ',
259 PREFERRED_GENE_NAME => ', '
260 );
261
262 # set the default search pattern for all the field names
263 my %feature_pat_map = map { ($_ , "^$_: (.+)\n"); } @locuslink_keys;
264
265 sub _initialize {
266 my($self,@args) = @_;
267
268 $self->SUPER::_initialize(@args);
269
270 # overwrite the search pattern w/ the first value pattern
271 foreach my $key(@ll_firstelements){
272 $feature_pat_map{$key}="^$key: ([^|]+)";
273 }
274
275 # special search pattern for cdd entries
276 foreach my $key(keys %cddprefix) {
277 $feature_pat_map{$key}='^CDD: .+\|'.$key.'(\d+)';
278 }
279
280 # special patterns for specific fields
281 $feature_pat_map{MAP} = '^MAP: (.+?)\|';
282 $feature_pat_map{MAPHTML} = '^MAP: .+\|(<.+>)\|';
283 $feature_pat_map{GO} = '^GO: .+\|.+\|\w+\|(GO:\d+)\|';
284 $feature_pat_map{GO_DESC} = '^GO: .+\|(.+)\|\w+\|GO:\d+\|';
285 $feature_pat_map{GO_CAT} = '^GO: (.+)\|.+\|\w+\|GO:\d+\|';
286 $feature_pat_map{EXTANNOT} = '^EXTANNOT: (.+)\|(.+)\|\w+\|.+\|\d+';
287
288 # set the sequence factory of none has been set already
289 if(! $self->sequence_factory()) {
290 $self->sequence_factory(Bio::Seq::SeqFactory->new(
291 -type => 'Bio::Seq::RichSeq'));
292 }
293 }
294
295
296 #########################
297 #
298 sub search_pattern{
299 #
300 #########################
301 my ($self,
302 $entry, #text to search
303 $searchconfirm, #to make sure you got the right thing
304 $searchpattern,
305 $searchtype) = @_;
306 my @query = $entry=~/$searchpattern/gm;
307 if ($searchconfirm ne "FALSE"){
308 $self->warn("No $searchtype found\n$entry\n") unless @query;
309 foreach (@query){
310 if (!($_=~/$searchconfirm/)){
311 $self->throw("error\n$entry\n$searchtype parse $_ does not match $searchconfirm\n");
312 }
313 }#endforeach
314 }#endsearchconfirm
315 return(@query);
316 }#endsub
317 ############
318 #
319 sub read_species{
320 #
321 ############
322 my ($spline)=@_;
323 my $species;
324 my $genus;
325 ($genus,$species)=$spline=~/([^ ]+) ([^ ]+)/;
326 my $make = Bio::Species->new();
327 $make->classification( ($species,$genus) );
328 return $make;
329 }
330 ################
331 #
332 sub read_dblink{
333 #
334 ################
335 my ($ann,$db,$ref)=@_;
336 my @results=$ref ? @$ref : ();
337 foreach my $id(@results){
338 if($id){
339 $ann->add_Annotation('dblink',
340 Bio::Annotation::DBLink->new(
341 -database =>$db ,
342 -primary_id =>$id));
343 }
344 }
345 return($ann);
346 }
347
348 ################
349 #
350 sub read_reference{
351 #
352 ################
353 my ($ann,$db,$results)=@_;
354
355 if($results){
356 chomp($results);
357 my @ids=split(/,/,$results);
358 $ann = read_dblink($ann,$db,\@ids) if @ids;
359 }
360 return $ann;
361 }#endsub
362
363 ################
364 #
365 sub add_annotation{
366 #
367 ################
368 my ($ac,$type,$text,$anntype)=@_;
369 my @args;
370
371 $anntype = 'SimpleValue' unless $anntype;
372 SWITCH : {
373 $anntype eq 'SimpleValue' && do {
374 push(@args, -value => $text, -tagname => $type);
375 last SWITCH;
376 };
377 $anntype eq 'Comment' && do {
378 push(@args, -text => $text, -tagname => 'comment');
379 last SWITCH;
380 };
381 }
382 $ac->add_Annotation("Bio::Annotation::$anntype"->new(@args));
383 return($ac);
384 }#endsub
385
386 ################
387 #
388 sub add_annotation_ref{
389 #
390 ################
391 my ($ann,$type,$textref)=@_;
392 my @text=$textref ? @$textref : ();
393
394 foreach my $text(@text){
395 $ann->add_Annotation($type,Bio::Annotation::SimpleValue->new(-value => $text));
396 }
397 return($ann);
398 }#endsub
399
400 ################
401 #
402 sub make_unique{
403 #
404 ##############
405 my ($ann,$key) = @_;
406
407 my %seen = ();
408 foreach my $dbl ($ann->remove_Annotations($key)) {
409 if(! $seen{$dbl->as_text()}) {
410 $seen{$dbl->as_text()} = 1;
411 $ann->add_Annotation($dbl);
412 }
413 }
414 return $ann;
415 }
416
417 ################
418 #
419 sub next_seq{
420 #
421 ##############
422 my ($self, @args)=@_;
423 my (@results,$search,$ref,$cddref);
424
425 # LOCUSLINK entries begin w/ >>
426 local $/="\n>>";
427
428 # slurp in a whole entry and return if no more entries
429 return unless my $entry = $self->_readline;
430
431 # strip the leading '>>' is it's the first entry
432 if (index($entry,'>>') == 0) { #first entry
433 $entry = substr($entry,2);
434 }
435
436 # we aren't interested in obsoleted entries, so we need to loop
437 # and skip those until we've found the next not obsoleted
438 my %record = ();
439 while($entry && ($entry =~ /\w/)) {
440 if (!($entry=~/LOCUSID/)){
441 $self->throw("No LOCUSID in first line of record. ".
442 "Not LocusLink in my book.");
443 }
444 # see whether it's an obsoleted entry, and if so jump to the next
445 # one entry right away
446 if($entry =~ /^CURRENT_LOCUSID:/m) {
447 # read next entry and continue
448 $entry = $self->_readline;
449 %record = ();
450 next;
451 }
452 # loop through list of features and get field values
453 # place into record hash as array refs
454 foreach my $key (keys %feature_pat_map){
455 $search=$feature_pat_map{$key};
456 @results=$self->search_pattern($entry,'FALSE',$search,$search);
457 $record{$key} = @results ? [@results] : undef;
458 }#endfor
459 # terminate loop as this one hasn't been obsoleted
460 last;
461 }
462
463 # we have reached the end-of-file ...
464 return unless %record;
465
466 # special processing for CDD entries like pfam and smart
467 my ($PRESENT,@keep);
468 foreach my $key(keys %cddprefix){
469 #print "check CDD $key\n";
470 if($record{$key}) {
471 @keep=();
472 foreach my $list (@{$record{$key}}) {
473 # replace AC with correct AC number
474 push(@keep,$cddprefix{$key}.$list);
475 }
476 # replace CDD ref with correctly prefixed AC number
477 $record{$key} = [@keep];
478 }
479 }
480 # modify CDD references @=();
481 if($record{CDD}) {
482 @keep=();
483 foreach my $cdd (@{$record{CDD}}) {
484 $PRESENT = undef;
485 foreach my $key (keys %cddprefix) {
486 if ($cdd=~/$key/){
487 $PRESENT = 1;
488 last;
489 }
490 }
491 push(@keep,$cdd) if(! $PRESENT);
492 }
493 $record{CDD} = [@keep];
494 }
495
496 # create annotation collection - we'll need it now
497 my $ann = Bio::Annotation::Collection->new();
498
499 foreach my $field(keys %dbname_map){
500 $ann=read_dblink($ann,$dbname_map{$field},$record{$field});
501 }
502
503 # add GO link as an OntologyTerm annotation
504 if($record{GO}) {
505 for(my $j = 0; $j < @{$record{GO}}; $j++) {
506 my $goann = Bio::Annotation::OntologyTerm->new(
507 -identifier => $record{GO}->[$j],
508 -name => $record{GO_DESC}->[$j],
509 -ontology => $record{GO_CAT}->[$j]);
510 $ann->add_Annotation($goann);
511 }
512 }
513
514 $ann=add_annotation_ref($ann,'URL',$record{LINK});
515 $ann=add_annotation_ref($ann,'URL',$record{DB_LINK});
516
517 # presently we can't store types of dblinks - hence make unique
518 make_unique($ann,'dblink');
519
520 # everything else gets a simple tag or comment value annotation
521 foreach my $anntype (keys %anntype_map) {
522 foreach my $key (@{$anntype_map{$anntype}}){
523 if($record{$key}){
524 foreach (@{$record{$key}}){
525 #print "$key\t\t$_\n";
526 $ann=add_annotation($ann,$key,$_,$anntype);
527 }
528 }
529 }
530 }
531
532 # flatten designated attributes into a scalar value
533 foreach my $field (keys %flatten_tags) {
534 if($record{$field}) {
535 $record{$field} = join($flatten_tags{$field},
536 @{$record{$field}});
537 }
538 }
539
540 # annotation that expects the array flattened out
541 $ann=read_reference($ann,'PUBMED',$record{PMID});
542 if($record{ASSEMBLY}) {
543 my @assembly=split(/,/,$record{ASSEMBLY});
544 $ann=read_dblink($ann,'GenBank',\@assembly);
545 }
546
547 # replace fields w/ alternate if original does not exist
548 foreach my $fieldval (keys %alternate_map){
549 if((! $record{$fieldval}) && ($record{$alternate_map{$fieldval}})){
550 $record{$fieldval}=$record{$alternate_map{$fieldval}};
551 }
552 }
553
554 # create sequence object (i.e., let seq.factory create one)
555 my $seq = $self->sequence_factory->create(
556 -verbose => $self->verbose(),
557 -accession_number => $record{LOCUSID},
558 -desc => $record{OFFICIAL_GENE_NAME},
559 -display_id => $record{OFFICIAL_SYMBOL},
560 -species => read_species($record{ORGANISM}),
561 -annotation => $ann);
562
563 # dump out object contents
564 # show_obj([$seq]);
565
566 return($seq);
567 }
568
569 ################
570 #
571 sub show_obj{
572 #
573 ################
574 my ($seqlistref)=@_;
575 my @list=@$seqlistref;
576 my $out = Bio::SeqIO->new('-fh' => \*STDOUT, -format => 'genbank' );
577 my ($ann,@values,$val);
578
579 foreach my $seq(@list){
580 $out->write_seq($seq);
581 $ann=$seq->annotation;
582 foreach my $key ( $ann->get_all_annotation_keys() ) {
583 @values = $ann->get_Annotations($key);
584 foreach my $value ( @values ) {
585 # value is an Bio::AnnotationI, and defines a "as_text" method
586 $val=$value->as_text;
587 print "Annotation ",$key,"\t\t",$val,"\n";
588 }
589 }
590 }
591 }#endsub
592
593 1;