Mercurial > repos > mahtabm > ensemb_rep_gvl
diff variant_effect_predictor/Bio/SeqIO/locuslink.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/SeqIO/locuslink.pm Thu Apr 11 06:29:17 2013 -0400 @@ -0,0 +1,593 @@ +# $Id: locuslink.pm,v 1.2.2.2 2003/03/13 02:09:20 lapp Exp $ +# +# BioPerl module for Bio::SeqIO::locuslink +# +# Cared for by Keith Ching <kching at gnf.org> +# +# Copyright Keith Ching +# +# You may distribute this module under the same terms as perl itself + +# +# (c) Keith Ching, kching at gnf.org, 2002. +# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2002. +# +# You may distribute this module under the same terms as perl itself. +# Refer to the Perl Artistic License (see the license accompanying this +# software package, or see http://www.perl.com/language/misc/Artistic.html) +# for the terms under which you may use, modify, and redistribute this module. +# +# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED +# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF +# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. +# + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::SeqIO::locuslink - DESCRIPTION of Object + +=head1 SYNOPSIS + + # don't instantiate directly - instead do + my $seqio = Bio::SeqIO->new(-format => "locuslink", -file => \STDIN); + +=head1 DESCRIPTION + +This module parses LocusLink into Bio::SeqI objects with rich +annotation, but no sequence. + +The input file has to be in the LL_tmpl format - the tabular format +will not work. + +The way the current implementation populates the object is rather a +draft work than a finished work of art. Note that at this stage the +locuslink entries cannot be round-tripped, because the parser loses +certain information. For instance, most of the alternative transcript +descriptions are not retained. The parser also misses any element +that deals with visual representation (e.g., 'button') except for the +URLs. Almost all of the pieces of the annotation are kept in the +L<Bio::Annotation::Collection> object. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Keith Ching + +Email kching at gnf.org + +Describe contact details here + +=head1 CONTRIBUTORS + +Hilmar Lapp, hlapp at gmx.net + +=head1 APPENDIX + +The rest of the documentation details each of the object methods. +Internal methods are usually preceded with a _ + +=cut + +package Bio::SeqIO::locuslink; + +use strict; +use vars qw(@ISA); + +use Bio::SeqIO; +use Bio::Seq::SeqFactory; +use Bio::Species; +use Bio::Annotation::DBLink; +#use Bio::Annotation::Reference; +use Bio::Annotation::Comment; +use Bio::Annotation::SimpleValue; +use Bio::Annotation::OntologyTerm; +use Bio::Annotation::Collection; + +@ISA = qw(Bio::SeqIO); + +# list of all the field names in locuslink +my @locuslink_keys = qw( + ACCNUM + ALIAS_PROT + ALIAS_SYMBOL + ASSEMBLY + BUTTON + CDD + CHR + COMP + CONTIG + CURRENT_LOCUSID + DB_DESCR + DB_LINK + ECNUM + EVID + EXTANNOT + GO + GRIF + LINK + LOCUSID + LOCUS_CONFIRMED + LOCUS_TYPE + MAP + MAPLINK + NC + NG + NM + NP + NR + OFFICIAL_GENE_NAME + OFFICIAL_SYMBOL + OMIM + ORGANISM + PHENOTYPE + PHENOTYPE_ID + PMID + PREFERRED_GENE_NAME + PREFERRED_PRODUCT + PREFERRED_SYMBOL + PRODUCT + PROT + RELL + STATUS + STS + SUMFUNC + SUMMARY + TRANSVAR + TYPE + UNIGENE + XG + XM + XP + XR + ); + +# list of fields to make simple annotations from +# fields not listed here or as a key in feature hash are ignored (lost). +my %anntype_map = ( + SimpleValue => [qw( + ALIAS_PROT + ALIAS_SYMBOL + CDD + CHR + CURRENT_LOCUSID + ECNUM + EXTANNOT + MAP + NC + NR + OFFICIAL_GENE_NAME + OFFICIAL_SYMBOL + PHENOTYPE + PREFERRED_GENE_NAME + PREFERRED_PRODUCT + PREFERRED_SYMBOL + PRODUCT + RELL + SUMFUNC + ) + ], + Comment => [qw( + SUMMARY + ) + ], + ); + + +# certain fields are not named the same as the symgene database list +my %dbname_map = ( + pfam => 'Pfam', + smart => 'SMART', + NM => 'RefSeq', + NP => 'RefSeq', + XP => 'RefSeq', + XM => 'RefSeq', + NG => 'RefSeq', + XG => 'RefSeq', + XR => 'RefSeq', + PROT => 'GenBank', + ACCNUM => 'GenBank', + CONTIG => 'GenBank', + # certain fields are not named the same as the symgene + # database list: rename the fields the symgene database name + # key = field name in locuslink + # value = database name in sym + #GO => 'GO', + OMIM => 'MIM', + GRIF => 'GRIF', + STS => 'STS', + UNIGENE => 'UniGene', + ); + +# certain CDD entries use the wrong prefix for the accession number +# cddprefix will replace the key w/ the value for these entries +my %cddprefix = ( + pfam => 'PF', + smart => 'SM', + ); + +# alternate mappings if one field does not exist +my %alternate_map = ( + OFFICIAL_GENE_NAME => 'PREFERRED_GENE_NAME', + OFFICIAL_SYMBOL => 'PREFERRED_SYMBOL', + ); + +# for these field names, we only care about the first value X in value X|Y|Z +my @ll_firstelements = qw( + NM + NP + NG + XG + XM + XP + XR + PROT + STS + ACCNUM + CONTIG + GRIF + ); + +# these fields need to be flattened into a single string, using the given +# join string +my %flatten_tags = ( + ASSEMBLY => '', + ORGANISM => '', + OFFICIAL_SYMBOL => '', + OFFICIAL_GENE_NAME => '', + LOCUSID => '', + PMID => '', + PREFERRED_SYMBOL => ', ', + PREFERRED_GENE_NAME => ', ' +); + +# set the default search pattern for all the field names +my %feature_pat_map = map { ($_ , "^$_: (.+)\n"); } @locuslink_keys; + +sub _initialize { + my($self,@args) = @_; + + $self->SUPER::_initialize(@args); + + # overwrite the search pattern w/ the first value pattern + foreach my $key(@ll_firstelements){ + $feature_pat_map{$key}="^$key: ([^|]+)"; + } + + # special search pattern for cdd entries + foreach my $key(keys %cddprefix) { + $feature_pat_map{$key}='^CDD: .+\|'.$key.'(\d+)'; + } + + # special patterns for specific fields + $feature_pat_map{MAP} = '^MAP: (.+?)\|'; + $feature_pat_map{MAPHTML} = '^MAP: .+\|(<.+>)\|'; + $feature_pat_map{GO} = '^GO: .+\|.+\|\w+\|(GO:\d+)\|'; + $feature_pat_map{GO_DESC} = '^GO: .+\|(.+)\|\w+\|GO:\d+\|'; + $feature_pat_map{GO_CAT} = '^GO: (.+)\|.+\|\w+\|GO:\d+\|'; + $feature_pat_map{EXTANNOT} = '^EXTANNOT: (.+)\|(.+)\|\w+\|.+\|\d+'; + + # set the sequence factory of none has been set already + if(! $self->sequence_factory()) { + $self->sequence_factory(Bio::Seq::SeqFactory->new( + -type => 'Bio::Seq::RichSeq')); + } +} + + +######################### +# +sub search_pattern{ +# +######################### + my ($self, + $entry, #text to search + $searchconfirm, #to make sure you got the right thing + $searchpattern, + $searchtype) = @_; + my @query = $entry=~/$searchpattern/gm; + if ($searchconfirm ne "FALSE"){ + $self->warn("No $searchtype found\n$entry\n") unless @query; + foreach (@query){ + if (!($_=~/$searchconfirm/)){ + $self->throw("error\n$entry\n$searchtype parse $_ does not match $searchconfirm\n"); + } + }#endforeach + }#endsearchconfirm + return(@query); +}#endsub +############ +# +sub read_species{ +# +############ + my ($spline)=@_; + my $species; + my $genus; + ($genus,$species)=$spline=~/([^ ]+) ([^ ]+)/; + my $make = Bio::Species->new(); + $make->classification( ($species,$genus) ); + return $make; +} +################ +# +sub read_dblink{ +# +################ + my ($ann,$db,$ref)=@_; + my @results=$ref ? @$ref : (); + foreach my $id(@results){ + if($id){ + $ann->add_Annotation('dblink', + Bio::Annotation::DBLink->new( + -database =>$db , + -primary_id =>$id)); + } + } + return($ann); +} + +################ +# +sub read_reference{ +# +################ + my ($ann,$db,$results)=@_; + + if($results){ + chomp($results); + my @ids=split(/,/,$results); + $ann = read_dblink($ann,$db,\@ids) if @ids; + } + return $ann; +}#endsub + +################ +# +sub add_annotation{ +# +################ + my ($ac,$type,$text,$anntype)=@_; + my @args; + + $anntype = 'SimpleValue' unless $anntype; + SWITCH : { + $anntype eq 'SimpleValue' && do { + push(@args, -value => $text, -tagname => $type); + last SWITCH; + }; + $anntype eq 'Comment' && do { + push(@args, -text => $text, -tagname => 'comment'); + last SWITCH; + }; + } + $ac->add_Annotation("Bio::Annotation::$anntype"->new(@args)); + return($ac); +}#endsub + +################ +# +sub add_annotation_ref{ +# +################ + my ($ann,$type,$textref)=@_; + my @text=$textref ? @$textref : (); + + foreach my $text(@text){ + $ann->add_Annotation($type,Bio::Annotation::SimpleValue->new(-value => $text)); + } + return($ann); +}#endsub + +################ +# +sub make_unique{ +# +############## + my ($ann,$key) = @_; + + my %seen = (); + foreach my $dbl ($ann->remove_Annotations($key)) { + if(! $seen{$dbl->as_text()}) { + $seen{$dbl->as_text()} = 1; + $ann->add_Annotation($dbl); + } + } + return $ann; +} + +################ +# +sub next_seq{ +# +############## + my ($self, @args)=@_; + my (@results,$search,$ref,$cddref); + + # LOCUSLINK entries begin w/ >> + local $/="\n>>"; + + # slurp in a whole entry and return if no more entries + return unless my $entry = $self->_readline; + + # strip the leading '>>' is it's the first entry + if (index($entry,'>>') == 0) { #first entry + $entry = substr($entry,2); + } + + # we aren't interested in obsoleted entries, so we need to loop + # and skip those until we've found the next not obsoleted + my %record = (); + while($entry && ($entry =~ /\w/)) { + if (!($entry=~/LOCUSID/)){ + $self->throw("No LOCUSID in first line of record. ". + "Not LocusLink in my book."); + } + # see whether it's an obsoleted entry, and if so jump to the next + # one entry right away + if($entry =~ /^CURRENT_LOCUSID:/m) { + # read next entry and continue + $entry = $self->_readline; + %record = (); + next; + } + # loop through list of features and get field values + # place into record hash as array refs + foreach my $key (keys %feature_pat_map){ + $search=$feature_pat_map{$key}; + @results=$self->search_pattern($entry,'FALSE',$search,$search); + $record{$key} = @results ? [@results] : undef; + }#endfor + # terminate loop as this one hasn't been obsoleted + last; + } + + # we have reached the end-of-file ... + return unless %record; + + # special processing for CDD entries like pfam and smart + my ($PRESENT,@keep); + foreach my $key(keys %cddprefix){ + #print "check CDD $key\n"; + if($record{$key}) { + @keep=(); + foreach my $list (@{$record{$key}}) { + # replace AC with correct AC number + push(@keep,$cddprefix{$key}.$list); + } + # replace CDD ref with correctly prefixed AC number + $record{$key} = [@keep]; + } + } + # modify CDD references @=(); + if($record{CDD}) { + @keep=(); + foreach my $cdd (@{$record{CDD}}) { + $PRESENT = undef; + foreach my $key (keys %cddprefix) { + if ($cdd=~/$key/){ + $PRESENT = 1; + last; + } + } + push(@keep,$cdd) if(! $PRESENT); + } + $record{CDD} = [@keep]; + } + + # create annotation collection - we'll need it now + my $ann = Bio::Annotation::Collection->new(); + + foreach my $field(keys %dbname_map){ + $ann=read_dblink($ann,$dbname_map{$field},$record{$field}); + } + + # add GO link as an OntologyTerm annotation + if($record{GO}) { + for(my $j = 0; $j < @{$record{GO}}; $j++) { + my $goann = Bio::Annotation::OntologyTerm->new( + -identifier => $record{GO}->[$j], + -name => $record{GO_DESC}->[$j], + -ontology => $record{GO_CAT}->[$j]); + $ann->add_Annotation($goann); + } + } + + $ann=add_annotation_ref($ann,'URL',$record{LINK}); + $ann=add_annotation_ref($ann,'URL',$record{DB_LINK}); + + # presently we can't store types of dblinks - hence make unique + make_unique($ann,'dblink'); + + # everything else gets a simple tag or comment value annotation + foreach my $anntype (keys %anntype_map) { + foreach my $key (@{$anntype_map{$anntype}}){ + if($record{$key}){ + foreach (@{$record{$key}}){ + #print "$key\t\t$_\n"; + $ann=add_annotation($ann,$key,$_,$anntype); + } + } + } + } + + # flatten designated attributes into a scalar value + foreach my $field (keys %flatten_tags) { + if($record{$field}) { + $record{$field} = join($flatten_tags{$field}, + @{$record{$field}}); + } + } + + # annotation that expects the array flattened out + $ann=read_reference($ann,'PUBMED',$record{PMID}); + if($record{ASSEMBLY}) { + my @assembly=split(/,/,$record{ASSEMBLY}); + $ann=read_dblink($ann,'GenBank',\@assembly); + } + + # replace fields w/ alternate if original does not exist + foreach my $fieldval (keys %alternate_map){ + if((! $record{$fieldval}) && ($record{$alternate_map{$fieldval}})){ + $record{$fieldval}=$record{$alternate_map{$fieldval}}; + } + } + + # create sequence object (i.e., let seq.factory create one) + my $seq = $self->sequence_factory->create( + -verbose => $self->verbose(), + -accession_number => $record{LOCUSID}, + -desc => $record{OFFICIAL_GENE_NAME}, + -display_id => $record{OFFICIAL_SYMBOL}, + -species => read_species($record{ORGANISM}), + -annotation => $ann); + + # dump out object contents + # show_obj([$seq]); + + return($seq); +} + +################ +# +sub show_obj{ +# +################ + my ($seqlistref)=@_; + my @list=@$seqlistref; + my $out = Bio::SeqIO->new('-fh' => \*STDOUT, -format => 'genbank' ); + my ($ann,@values,$val); + + foreach my $seq(@list){ + $out->write_seq($seq); + $ann=$seq->annotation; + foreach my $key ( $ann->get_all_annotation_keys() ) { + @values = $ann->get_Annotations($key); + foreach my $value ( @values ) { + # value is an Bio::AnnotationI, and defines a "as_text" method + $val=$value->as_text; + print "Annotation ",$key,"\t\t",$val,"\n"; + } + } + } +}#endsub + +1;