Mercurial > repos > mahtabm > ensemb_rep_gvl
comparison variant_effect_predictor/Bio/SeqIO/locuslink.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:2bc9b66ada89 |
---|---|
1 # $Id: locuslink.pm,v 1.2.2.2 2003/03/13 02:09:20 lapp Exp $ | |
2 # | |
3 # BioPerl module for Bio::SeqIO::locuslink | |
4 # | |
5 # Cared for by Keith Ching <kching at gnf.org> | |
6 # | |
7 # Copyright Keith Ching | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 | |
11 # | |
12 # (c) Keith Ching, kching at gnf.org, 2002. | |
13 # (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. | |
14 # | |
15 # You may distribute this module under the same terms as perl itself. | |
16 # Refer to the Perl Artistic License (see the license accompanying this | |
17 # software package, or see http://www.perl.com/language/misc/Artistic.html) | |
18 # for the terms under which you may use, modify, and redistribute this module. | |
19 # | |
20 # THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED | |
21 # WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF | |
22 # MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. | |
23 # | |
24 | |
25 # POD documentation - main docs before the code | |
26 | |
27 =head1 NAME | |
28 | |
29 Bio::SeqIO::locuslink - DESCRIPTION of Object | |
30 | |
31 =head1 SYNOPSIS | |
32 | |
33 # don't instantiate directly - instead do | |
34 my $seqio = Bio::SeqIO->new(-format => "locuslink", -file => \STDIN); | |
35 | |
36 =head1 DESCRIPTION | |
37 | |
38 This module parses LocusLink into Bio::SeqI objects with rich | |
39 annotation, but no sequence. | |
40 | |
41 The input file has to be in the LL_tmpl format - the tabular format | |
42 will not work. | |
43 | |
44 The way the current implementation populates the object is rather a | |
45 draft work than a finished work of art. Note that at this stage the | |
46 locuslink entries cannot be round-tripped, because the parser loses | |
47 certain information. For instance, most of the alternative transcript | |
48 descriptions are not retained. The parser also misses any element | |
49 that deals with visual representation (e.g., 'button') except for the | |
50 URLs. Almost all of the pieces of the annotation are kept in the | |
51 L<Bio::Annotation::Collection> object. | |
52 | |
53 =head1 FEEDBACK | |
54 | |
55 =head2 Mailing Lists | |
56 | |
57 User feedback is an integral part of the evolution of this and other | |
58 Bioperl modules. Send your comments and suggestions preferably to | |
59 the Bioperl mailing list. Your participation is much appreciated. | |
60 | |
61 bioperl-l@bioperl.org - General discussion | |
62 http://bioperl.org/MailList.shtml - About the mailing lists | |
63 | |
64 =head2 Reporting Bugs | |
65 | |
66 Report bugs to the Bioperl bug tracking system to help us keep track | |
67 of the bugs and their resolution. Bug reports can be submitted via | |
68 email or the web: | |
69 | |
70 bioperl-bugs@bioperl.org | |
71 http://bugzilla.bioperl.org/ | |
72 | |
73 =head1 AUTHOR - Keith Ching | |
74 | |
75 Email kching at gnf.org | |
76 | |
77 Describe contact details here | |
78 | |
79 =head1 CONTRIBUTORS | |
80 | |
81 Hilmar Lapp, hlapp at gmx.net | |
82 | |
83 =head1 APPENDIX | |
84 | |
85 The rest of the documentation details each of the object methods. | |
86 Internal methods are usually preceded with a _ | |
87 | |
88 =cut | |
89 | |
90 package Bio::SeqIO::locuslink; | |
91 | |
92 use strict; | |
93 use vars qw(@ISA); | |
94 | |
95 use Bio::SeqIO; | |
96 use Bio::Seq::SeqFactory; | |
97 use Bio::Species; | |
98 use Bio::Annotation::DBLink; | |
99 #use Bio::Annotation::Reference; | |
100 use Bio::Annotation::Comment; | |
101 use Bio::Annotation::SimpleValue; | |
102 use Bio::Annotation::OntologyTerm; | |
103 use Bio::Annotation::Collection; | |
104 | |
105 @ISA = qw(Bio::SeqIO); | |
106 | |
107 # list of all the field names in locuslink | |
108 my @locuslink_keys = qw( | |
109 ACCNUM | |
110 ALIAS_PROT | |
111 ALIAS_SYMBOL | |
112 ASSEMBLY | |
113 BUTTON | |
114 CDD | |
115 CHR | |
116 COMP | |
117 CONTIG | |
118 CURRENT_LOCUSID | |
119 DB_DESCR | |
120 DB_LINK | |
121 ECNUM | |
122 EVID | |
123 EXTANNOT | |
124 GO | |
125 GRIF | |
126 LINK | |
127 LOCUSID | |
128 LOCUS_CONFIRMED | |
129 LOCUS_TYPE | |
130 MAP | |
131 MAPLINK | |
132 NC | |
133 NG | |
134 NM | |
135 NP | |
136 NR | |
137 OFFICIAL_GENE_NAME | |
138 OFFICIAL_SYMBOL | |
139 OMIM | |
140 ORGANISM | |
141 PHENOTYPE | |
142 PHENOTYPE_ID | |
143 PMID | |
144 PREFERRED_GENE_NAME | |
145 PREFERRED_PRODUCT | |
146 PREFERRED_SYMBOL | |
147 PRODUCT | |
148 PROT | |
149 RELL | |
150 STATUS | |
151 STS | |
152 SUMFUNC | |
153 SUMMARY | |
154 TRANSVAR | |
155 TYPE | |
156 UNIGENE | |
157 XG | |
158 XM | |
159 XP | |
160 XR | |
161 ); | |
162 | |
163 # list of fields to make simple annotations from | |
164 # fields not listed here or as a key in feature hash are ignored (lost). | |
165 my %anntype_map = ( | |
166 SimpleValue => [qw( | |
167 ALIAS_PROT | |
168 ALIAS_SYMBOL | |
169 CDD | |
170 CHR | |
171 CURRENT_LOCUSID | |
172 ECNUM | |
173 EXTANNOT | |
174 MAP | |
175 NC | |
176 NR | |
177 OFFICIAL_GENE_NAME | |
178 OFFICIAL_SYMBOL | |
179 PHENOTYPE | |
180 PREFERRED_GENE_NAME | |
181 PREFERRED_PRODUCT | |
182 PREFERRED_SYMBOL | |
183 PRODUCT | |
184 RELL | |
185 SUMFUNC | |
186 ) | |
187 ], | |
188 Comment => [qw( | |
189 SUMMARY | |
190 ) | |
191 ], | |
192 ); | |
193 | |
194 | |
195 # certain fields are not named the same as the symgene database list | |
196 my %dbname_map = ( | |
197 pfam => 'Pfam', | |
198 smart => 'SMART', | |
199 NM => 'RefSeq', | |
200 NP => 'RefSeq', | |
201 XP => 'RefSeq', | |
202 XM => 'RefSeq', | |
203 NG => 'RefSeq', | |
204 XG => 'RefSeq', | |
205 XR => 'RefSeq', | |
206 PROT => 'GenBank', | |
207 ACCNUM => 'GenBank', | |
208 CONTIG => 'GenBank', | |
209 # certain fields are not named the same as the symgene | |
210 # database list: rename the fields the symgene database name | |
211 # key = field name in locuslink | |
212 # value = database name in sym | |
213 #GO => 'GO', | |
214 OMIM => 'MIM', | |
215 GRIF => 'GRIF', | |
216 STS => 'STS', | |
217 UNIGENE => 'UniGene', | |
218 ); | |
219 | |
220 # certain CDD entries use the wrong prefix for the accession number | |
221 # cddprefix will replace the key w/ the value for these entries | |
222 my %cddprefix = ( | |
223 pfam => 'PF', | |
224 smart => 'SM', | |
225 ); | |
226 | |
227 # alternate mappings if one field does not exist | |
228 my %alternate_map = ( | |
229 OFFICIAL_GENE_NAME => 'PREFERRED_GENE_NAME', | |
230 OFFICIAL_SYMBOL => 'PREFERRED_SYMBOL', | |
231 ); | |
232 | |
233 # for these field names, we only care about the first value X in value X|Y|Z | |
234 my @ll_firstelements = qw( | |
235 NM | |
236 NP | |
237 NG | |
238 XG | |
239 XM | |
240 XP | |
241 XR | |
242 PROT | |
243 STS | |
244 ACCNUM | |
245 CONTIG | |
246 GRIF | |
247 ); | |
248 | |
249 # these fields need to be flattened into a single string, using the given | |
250 # join string | |
251 my %flatten_tags = ( | |
252 ASSEMBLY => '', | |
253 ORGANISM => '', | |
254 OFFICIAL_SYMBOL => '', | |
255 OFFICIAL_GENE_NAME => '', | |
256 LOCUSID => '', | |
257 PMID => '', | |
258 PREFERRED_SYMBOL => ', ', | |
259 PREFERRED_GENE_NAME => ', ' | |
260 ); | |
261 | |
262 # set the default search pattern for all the field names | |
263 my %feature_pat_map = map { ($_ , "^$_: (.+)\n"); } @locuslink_keys; | |
264 | |
265 sub _initialize { | |
266 my($self,@args) = @_; | |
267 | |
268 $self->SUPER::_initialize(@args); | |
269 | |
270 # overwrite the search pattern w/ the first value pattern | |
271 foreach my $key(@ll_firstelements){ | |
272 $feature_pat_map{$key}="^$key: ([^|]+)"; | |
273 } | |
274 | |
275 # special search pattern for cdd entries | |
276 foreach my $key(keys %cddprefix) { | |
277 $feature_pat_map{$key}='^CDD: .+\|'.$key.'(\d+)'; | |
278 } | |
279 | |
280 # special patterns for specific fields | |
281 $feature_pat_map{MAP} = '^MAP: (.+?)\|'; | |
282 $feature_pat_map{MAPHTML} = '^MAP: .+\|(<.+>)\|'; | |
283 $feature_pat_map{GO} = '^GO: .+\|.+\|\w+\|(GO:\d+)\|'; | |
284 $feature_pat_map{GO_DESC} = '^GO: .+\|(.+)\|\w+\|GO:\d+\|'; | |
285 $feature_pat_map{GO_CAT} = '^GO: (.+)\|.+\|\w+\|GO:\d+\|'; | |
286 $feature_pat_map{EXTANNOT} = '^EXTANNOT: (.+)\|(.+)\|\w+\|.+\|\d+'; | |
287 | |
288 # set the sequence factory of none has been set already | |
289 if(! $self->sequence_factory()) { | |
290 $self->sequence_factory(Bio::Seq::SeqFactory->new( | |
291 -type => 'Bio::Seq::RichSeq')); | |
292 } | |
293 } | |
294 | |
295 | |
296 ######################### | |
297 # | |
298 sub search_pattern{ | |
299 # | |
300 ######################### | |
301 my ($self, | |
302 $entry, #text to search | |
303 $searchconfirm, #to make sure you got the right thing | |
304 $searchpattern, | |
305 $searchtype) = @_; | |
306 my @query = $entry=~/$searchpattern/gm; | |
307 if ($searchconfirm ne "FALSE"){ | |
308 $self->warn("No $searchtype found\n$entry\n") unless @query; | |
309 foreach (@query){ | |
310 if (!($_=~/$searchconfirm/)){ | |
311 $self->throw("error\n$entry\n$searchtype parse $_ does not match $searchconfirm\n"); | |
312 } | |
313 }#endforeach | |
314 }#endsearchconfirm | |
315 return(@query); | |
316 }#endsub | |
317 ############ | |
318 # | |
319 sub read_species{ | |
320 # | |
321 ############ | |
322 my ($spline)=@_; | |
323 my $species; | |
324 my $genus; | |
325 ($genus,$species)=$spline=~/([^ ]+) ([^ ]+)/; | |
326 my $make = Bio::Species->new(); | |
327 $make->classification( ($species,$genus) ); | |
328 return $make; | |
329 } | |
330 ################ | |
331 # | |
332 sub read_dblink{ | |
333 # | |
334 ################ | |
335 my ($ann,$db,$ref)=@_; | |
336 my @results=$ref ? @$ref : (); | |
337 foreach my $id(@results){ | |
338 if($id){ | |
339 $ann->add_Annotation('dblink', | |
340 Bio::Annotation::DBLink->new( | |
341 -database =>$db , | |
342 -primary_id =>$id)); | |
343 } | |
344 } | |
345 return($ann); | |
346 } | |
347 | |
348 ################ | |
349 # | |
350 sub read_reference{ | |
351 # | |
352 ################ | |
353 my ($ann,$db,$results)=@_; | |
354 | |
355 if($results){ | |
356 chomp($results); | |
357 my @ids=split(/,/,$results); | |
358 $ann = read_dblink($ann,$db,\@ids) if @ids; | |
359 } | |
360 return $ann; | |
361 }#endsub | |
362 | |
363 ################ | |
364 # | |
365 sub add_annotation{ | |
366 # | |
367 ################ | |
368 my ($ac,$type,$text,$anntype)=@_; | |
369 my @args; | |
370 | |
371 $anntype = 'SimpleValue' unless $anntype; | |
372 SWITCH : { | |
373 $anntype eq 'SimpleValue' && do { | |
374 push(@args, -value => $text, -tagname => $type); | |
375 last SWITCH; | |
376 }; | |
377 $anntype eq 'Comment' && do { | |
378 push(@args, -text => $text, -tagname => 'comment'); | |
379 last SWITCH; | |
380 }; | |
381 } | |
382 $ac->add_Annotation("Bio::Annotation::$anntype"->new(@args)); | |
383 return($ac); | |
384 }#endsub | |
385 | |
386 ################ | |
387 # | |
388 sub add_annotation_ref{ | |
389 # | |
390 ################ | |
391 my ($ann,$type,$textref)=@_; | |
392 my @text=$textref ? @$textref : (); | |
393 | |
394 foreach my $text(@text){ | |
395 $ann->add_Annotation($type,Bio::Annotation::SimpleValue->new(-value => $text)); | |
396 } | |
397 return($ann); | |
398 }#endsub | |
399 | |
400 ################ | |
401 # | |
402 sub make_unique{ | |
403 # | |
404 ############## | |
405 my ($ann,$key) = @_; | |
406 | |
407 my %seen = (); | |
408 foreach my $dbl ($ann->remove_Annotations($key)) { | |
409 if(! $seen{$dbl->as_text()}) { | |
410 $seen{$dbl->as_text()} = 1; | |
411 $ann->add_Annotation($dbl); | |
412 } | |
413 } | |
414 return $ann; | |
415 } | |
416 | |
417 ################ | |
418 # | |
419 sub next_seq{ | |
420 # | |
421 ############## | |
422 my ($self, @args)=@_; | |
423 my (@results,$search,$ref,$cddref); | |
424 | |
425 # LOCUSLINK entries begin w/ >> | |
426 local $/="\n>>"; | |
427 | |
428 # slurp in a whole entry and return if no more entries | |
429 return unless my $entry = $self->_readline; | |
430 | |
431 # strip the leading '>>' is it's the first entry | |
432 if (index($entry,'>>') == 0) { #first entry | |
433 $entry = substr($entry,2); | |
434 } | |
435 | |
436 # we aren't interested in obsoleted entries, so we need to loop | |
437 # and skip those until we've found the next not obsoleted | |
438 my %record = (); | |
439 while($entry && ($entry =~ /\w/)) { | |
440 if (!($entry=~/LOCUSID/)){ | |
441 $self->throw("No LOCUSID in first line of record. ". | |
442 "Not LocusLink in my book."); | |
443 } | |
444 # see whether it's an obsoleted entry, and if so jump to the next | |
445 # one entry right away | |
446 if($entry =~ /^CURRENT_LOCUSID:/m) { | |
447 # read next entry and continue | |
448 $entry = $self->_readline; | |
449 %record = (); | |
450 next; | |
451 } | |
452 # loop through list of features and get field values | |
453 # place into record hash as array refs | |
454 foreach my $key (keys %feature_pat_map){ | |
455 $search=$feature_pat_map{$key}; | |
456 @results=$self->search_pattern($entry,'FALSE',$search,$search); | |
457 $record{$key} = @results ? [@results] : undef; | |
458 }#endfor | |
459 # terminate loop as this one hasn't been obsoleted | |
460 last; | |
461 } | |
462 | |
463 # we have reached the end-of-file ... | |
464 return unless %record; | |
465 | |
466 # special processing for CDD entries like pfam and smart | |
467 my ($PRESENT,@keep); | |
468 foreach my $key(keys %cddprefix){ | |
469 #print "check CDD $key\n"; | |
470 if($record{$key}) { | |
471 @keep=(); | |
472 foreach my $list (@{$record{$key}}) { | |
473 # replace AC with correct AC number | |
474 push(@keep,$cddprefix{$key}.$list); | |
475 } | |
476 # replace CDD ref with correctly prefixed AC number | |
477 $record{$key} = [@keep]; | |
478 } | |
479 } | |
480 # modify CDD references @=(); | |
481 if($record{CDD}) { | |
482 @keep=(); | |
483 foreach my $cdd (@{$record{CDD}}) { | |
484 $PRESENT = undef; | |
485 foreach my $key (keys %cddprefix) { | |
486 if ($cdd=~/$key/){ | |
487 $PRESENT = 1; | |
488 last; | |
489 } | |
490 } | |
491 push(@keep,$cdd) if(! $PRESENT); | |
492 } | |
493 $record{CDD} = [@keep]; | |
494 } | |
495 | |
496 # create annotation collection - we'll need it now | |
497 my $ann = Bio::Annotation::Collection->new(); | |
498 | |
499 foreach my $field(keys %dbname_map){ | |
500 $ann=read_dblink($ann,$dbname_map{$field},$record{$field}); | |
501 } | |
502 | |
503 # add GO link as an OntologyTerm annotation | |
504 if($record{GO}) { | |
505 for(my $j = 0; $j < @{$record{GO}}; $j++) { | |
506 my $goann = Bio::Annotation::OntologyTerm->new( | |
507 -identifier => $record{GO}->[$j], | |
508 -name => $record{GO_DESC}->[$j], | |
509 -ontology => $record{GO_CAT}->[$j]); | |
510 $ann->add_Annotation($goann); | |
511 } | |
512 } | |
513 | |
514 $ann=add_annotation_ref($ann,'URL',$record{LINK}); | |
515 $ann=add_annotation_ref($ann,'URL',$record{DB_LINK}); | |
516 | |
517 # presently we can't store types of dblinks - hence make unique | |
518 make_unique($ann,'dblink'); | |
519 | |
520 # everything else gets a simple tag or comment value annotation | |
521 foreach my $anntype (keys %anntype_map) { | |
522 foreach my $key (@{$anntype_map{$anntype}}){ | |
523 if($record{$key}){ | |
524 foreach (@{$record{$key}}){ | |
525 #print "$key\t\t$_\n"; | |
526 $ann=add_annotation($ann,$key,$_,$anntype); | |
527 } | |
528 } | |
529 } | |
530 } | |
531 | |
532 # flatten designated attributes into a scalar value | |
533 foreach my $field (keys %flatten_tags) { | |
534 if($record{$field}) { | |
535 $record{$field} = join($flatten_tags{$field}, | |
536 @{$record{$field}}); | |
537 } | |
538 } | |
539 | |
540 # annotation that expects the array flattened out | |
541 $ann=read_reference($ann,'PUBMED',$record{PMID}); | |
542 if($record{ASSEMBLY}) { | |
543 my @assembly=split(/,/,$record{ASSEMBLY}); | |
544 $ann=read_dblink($ann,'GenBank',\@assembly); | |
545 } | |
546 | |
547 # replace fields w/ alternate if original does not exist | |
548 foreach my $fieldval (keys %alternate_map){ | |
549 if((! $record{$fieldval}) && ($record{$alternate_map{$fieldval}})){ | |
550 $record{$fieldval}=$record{$alternate_map{$fieldval}}; | |
551 } | |
552 } | |
553 | |
554 # create sequence object (i.e., let seq.factory create one) | |
555 my $seq = $self->sequence_factory->create( | |
556 -verbose => $self->verbose(), | |
557 -accession_number => $record{LOCUSID}, | |
558 -desc => $record{OFFICIAL_GENE_NAME}, | |
559 -display_id => $record{OFFICIAL_SYMBOL}, | |
560 -species => read_species($record{ORGANISM}), | |
561 -annotation => $ann); | |
562 | |
563 # dump out object contents | |
564 # show_obj([$seq]); | |
565 | |
566 return($seq); | |
567 } | |
568 | |
569 ################ | |
570 # | |
571 sub show_obj{ | |
572 # | |
573 ################ | |
574 my ($seqlistref)=@_; | |
575 my @list=@$seqlistref; | |
576 my $out = Bio::SeqIO->new('-fh' => \*STDOUT, -format => 'genbank' ); | |
577 my ($ann,@values,$val); | |
578 | |
579 foreach my $seq(@list){ | |
580 $out->write_seq($seq); | |
581 $ann=$seq->annotation; | |
582 foreach my $key ( $ann->get_all_annotation_keys() ) { | |
583 @values = $ann->get_Annotations($key); | |
584 foreach my $value ( @values ) { | |
585 # value is an Bio::AnnotationI, and defines a "as_text" method | |
586 $val=$value->as_text; | |
587 print "Annotation ",$key,"\t\t",$val,"\n"; | |
588 } | |
589 } | |
590 } | |
591 }#endsub | |
592 | |
593 1; |