Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/Biblio/IO/medline2ref.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Biblio/IO/medline2ref.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,542 @@ +# $Id: medline2ref.pm,v 1.10 2002/10/22 07:45:13 lapp Exp $ +# +# BioPerl module Bio::Biblio::IO::medline2ref.pm +# +# Cared for by Martin Senger <senger@ebi.ac.uk> +# For copyright and disclaimer see below. + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Biblio::IO::medline2ref - A converter of a raw hash to MEDLINE citations + +=head1 SYNOPSIS + + # to be written + +=head1 DESCRIPTION + + # to be written + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to +the Bioperl mailing list. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +of the bugs and their resolution. Bug reports can be submitted via +email or the web: + + bioperl-bugs@bioperl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR + +Martin Senger (senger@ebi.ac.uk) + +=head1 COPYRIGHT + +Copyright (c) 2002 European Bioinformatics Institute. All Rights Reserved. + +This module is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +=head1 DISCLAIMER + +This software is provided "as is" without warranty of any kind. + +=head1 APPENDIX + +Here is the rest of the object methods. Internal methods are preceded +with an underscore _. + +=cut + + +# Let the code begin... + + +package Bio::Biblio::IO::medline2ref; + +use strict; +use vars qw(@ISA $VERSION $Revision); + +use Bio::Root::Root; +use Bio::Biblio::MedlineJournal; +use Bio::Biblio::MedlineBook; +use Bio::Biblio::Provider; +use Bio::Biblio::Person; +use Bio::Biblio::Organisation; + +@ISA = qw(Bio::Root::Root); + +BEGIN { + # set the version for version checking + $VERSION = do { my @r = (q$Revision: 1.10 $ =~ /\d+/g); sprintf "%d.%-02d", @r }; + $Revision = q$Id: medline2ref.pm,v 1.10 2002/10/22 07:45:13 lapp Exp $; +} + +# ----------------------------------------------------------------------------- +sub new { + my ($caller, @args) = @_; + my $class = ref ($caller) || $caller; + + # object creation and blessing + my ($self) = $class->SUPER::new (@args); + + # make a hashtable from @args + my %param = @args; + @param { map { lc $_ } keys %param } = values %param; # lowercase keys + + # copy all @args into this object (overwriting what may already be + # there) - changing '-key' into '_key', and making keys lowercase + my $new_key; + foreach my $key (keys %param) { + ($new_key = $key) =~ s/^-/_/; + $self->{ lc $new_key } = $param { $key }; + } + + # done + return $self; +} + +# --------------------------------------------------------------------- +# +# Here is the core... +# +# --------------------------------------------------------------------- + +sub _load_instance { + my ($self, $source) = @_; + + # + # MEDLINE has only JournalArticles and BookArticles + # but we may create a general Ref if there is no attribute 'article' + # + my $result; + my $article = $$source{'article'}; + if (defined $article) { + if (defined $$article{'journal'}) { + $result = $self->_new_instance ('Bio::Biblio::MedlineJournalArticle'); + $result->type ('JournalArticle'); + } elsif (defined $$article{'book'}) { + $result = $self->_new_instance ('Bio::Biblio::MedlineBookArticle'); + $result->type ('BookArticle'); + } else { + $result->type ('MedlineArticle'); + } + } + $result = $self->_new_instance ('Bio::Biblio::Ref') unless defined $result; + return $result; +} + +sub convert { + my ($self, $source) = @_; + my $result = $self->_load_instance ($source); + + if (defined $result->type) { + if ($result->type eq 'JournalArticle') { + &_convert_journal_article ($result, $source); + } elsif ($result->type eq 'BookArticle') { + &_convert_book_article ($result, $source); + } elsif ($result->type eq 'Article') { + &_convert_article ($result, $source); + } + } + + # + # now do the attributes which are the same for all resource types + # + + # ...identification is now by MedlineID but the trend is to replace + # it by PMID (I have heard) theefore we keep both also separately + # from the 'identifier' + if (defined $$source{'medlineID'}) { + $result->identifier ($$source{'medlineID'}); + } else { + $result->identifier ($$source{'PMID'}); + } + $result->pmid ($$source{'PMID'}) if defined $$source{'PMID'}; + $result->medline_id ($$source{'medlineID'}) if defined $$source{'medlineID'}; + + # ...few others + $result->citation_owner ($$source{'owner'}) if defined $$source{'owner'}; + $result->status ($$source{'status'}) if defined $$source{'status'}; + $result->number_of_references ($$source{'numberOfReferences'}) if defined $$source{'numberOfReferences'}; + + # ...entry status of the citation in the repository + my $date; + if (defined $$source{'dateRevised'}) { + $result->last_modified_date (&_convert_date ($$source{'dateRevised'})); + $date = &_convert_date ($$source{'dateCreated'}); + $result->date_created ($date) if defined $date; + $date = &_convert_date ($$source{'dateCompleted'}); + $result->date_completed ($date) if defined $date; + } elsif (defined $$source{'dateCompleted'}) { + $result->last_modified_date (&_convert_date ($$source{'dateCompleted'})); + $date = &_convert_date ($$source{'dateCreated'}); + $result->date_created ($date) if defined $date; + } elsif (defined $$source{'dateCreated'}) { + $result->last_modified_date (&_convert_date ($$source{'dateCreated'})); + } + + # ...put citation subsets in a comma-separated string + if (defined $$source{'citationSubsets'}) { + $result->repository_subset (join (',', @{ $$source{'citationSubsets'} })); + } + + # ...MEDLINE's Comments & Corrections will be arrays of hashes + if (defined $$source{'commentsCorrections'}) { + my $corr = $$source{'commentsCorrections'}; + $result->comment_ons ($$corr{'commentOns'}) if defined $$corr{'commentOns'}; + $result->comment_ins ($$corr{'commentIns'}) if defined $$corr{'commentIns'}; + $result->erratum_ins ($$corr{'erratumIns'}) if defined $$corr{'erratumIns'}; + $result->erratum_fors ($$corr{'erratumFors'}) if defined $$corr{'erratumFors'}; + $result->original_report_ins ($$corr{'originalReportIns'}) if defined $$corr{'originalReportIns'}; + $result->republished_froms ($$corr{'republishedFroms'}) if defined $$corr{'republishedFroms'}; + $result->republished_ins ($$corr{'republishedIns'}) if defined $$corr{'republishedIns'}; + $result->retraction_ofs ($$corr{'retractionOfs'}) if defined $$corr{'retractionOfs'}; + $result->retraction_ins ($$corr{'retractionIns'}) if defined $$corr{'retractionIns'}; + $result->summary_for_patients_ins ($$corr{'summaryForPatientsIns'}) if defined $$corr{'summaryForPatientsIns'}; + $result->update_ins ($$corr{'updateIns'}) if defined $$corr{'updateIns'}; + $result->update_ofs ($$corr{'updateOfs'}) if defined $$corr{'updateOfs'}; + } + + # ...MEDLINE's GeneSymbols are put in a comma-separated string + if (defined $$source{'geneSymbols'}) { + $result->gene_symbols (join (',', @{ $$source{'geneSymbols'} })); + } + + # ...MEDLINE's GeneralNotes into an array of hashtables, each one + # having keys for the 'owner' and the 'note' + $result->general_notes ($$source{'generalNotes'}) if defined $$source{'generalNotes'}; + + # ...MEDLINE's PersonalNameSubjects into contributors (TBD: is that correct?) + if (defined $$source{'personalNameSubjects'}) { + my @contributors; + foreach my $person ( @{ $$source{'personalNameSubjects'} } ) { + push (@contributors, &_convert_personal_name ($person)); + } + $result->contributors (\@contributors); + } + + # ...MEDLINE's OtherAbstract into an array of hashtables, each one + # having keys for the 'type', 'AbstractText' and the 'copyright' + $result->other_abstracts ($$source{'otherAbstracts'}) if defined $$source{'otherAbstracts'}; +# if (defined $$source{'otherAbstracts'}) { +# my @other_abstracts = (); +# foreach my $oa ( @{ $$source{'otherAbstracts'} } ) { +# if (defined $$oa{'abstractText'}) { +# my $abstract = $$oa{'abstractText'}; +# delete $$oa{'abstractText'}; +# $$oa{'abstract'} = $$abstract{'abstractText'}; +# $$oa{'rights'} = $$abstract{'copyrightInformation'} if defined $$abstract{'copyrightInformation'}; +# push (@other_abstracts, $oa); +# } +# } +# $result->other_abstracts (\@other_abstracts); +# } + + # ...MEDLINE's OtherIDs into an array of hashtables, each one + # having keys for the 'id', and 'source' + $result->other_ids ($$source{'otherIDs'}) if defined $$source{'otherIDs'}; + + # ...MEDLINE's Chemicals - store them as an array of hashtables + # (each one for each Chemical) + $result->chemicals ($$source{'chemicals'}) if defined $$source{'chemicals'}; + + # MeshHeadings are put on two places: + # - a complete information in a property called "MeshHeadings", and + # - only descriptors in the hashtable "subject_headings", together + # with the word "MeSH" in "subject_headings_source" + if (defined $$source{'meshHeadings'}) { + $result->mesh_headings ($$source{'meshHeadings'}); + my %subject_headings; + foreach my $mesh ( @{ $$source{'meshHeadings'} } ) { + $subject_headings{ $$mesh{'descriptorName'} } = 1 if defined $$mesh{'descriptorName'}; + } + if (%subject_headings) { + $result->subject_headings (\%subject_headings); + $result->subject_headings_source ('Mesh'); + } + } + + # ...MEDLINE's keyword lists are merger all together (this may not + # be good idea - but again the keywords are better accessible + # -TBD?) + if (defined $$source{'keywordLists'}) { + my %keywords; + foreach my $keywords ( @{ $$source{'keywordLists'} } ) { + if ($$keywords{'keywords'}) { + foreach my $keyword ( @{ $$keywords{'keywords'} } ) { + $keywords{$keyword} = 1; + } + } + } + $result->keywords (\%keywords) if %keywords; + } + + # Done! + return $result; +} + +# load a module (given as a real module name, e.g. 'Bio::Biblio::MedlineJournalArticle'), +# call new() method on it, and return the instance returned by the new() method +sub _new_instance { + my ($self, $module) = @_; + my ($filename); + ($filename = $module . '.pm') =~ s|\:\:|/|g; + eval { require $filename; }; + $self->throw ("Loading error when trying '$filename'. $@\n") if $@; + return $module->new; +} + +# +# see OpenBQS specification (http://industry.ebi.ac.uk/openBQS) how +# a date should be coded; +# TBD: this can be improved - checking is missing, timezones, +# converting to UTC... +# Also note that this routine does not convert 'medline_date' - it +# is stored in a separate attribute without ant conversion. +# +sub _convert_date { + my ($date) = @_; + return undef unless + exists $$date{'year'} or + exists $$date{'month'} or + exists $$date{'day'} or + exists $$date{'hour'} or + exists $$date{'minute'} or + exists $$date{'second'}; + + + my $converted = (exists $$date{'year'} ? $$date{'year'} : '0000'); + + if (exists $$date{'month'}) { + $converted .= '-' . $$date{'month'}; + } elsif (exists $$date{'day'}) { + $converted .= '-00'; + } + + if (exists $$date{'day'}) { + $converted .= '-' . $$date{'day'}; + } elsif (exists $$date{'hour'}) { + $converted .= '-00'; + } + + if (exists $$date{'hour'}) { + $converted .= 'T' . $$date{'hour'} . + ':' . (exists $$date{'minute'} ? $$date{'minute'} : '00') . + ':' . (exists $$date{'second'} ? $$date{'second'} : '00') . 'Z'; + } + return $converted; +} + +# $person is a hash with persons attributes - we need to create and +# return a Bio::Biblio::Person object +sub _convert_personal_name { + my ($person) = @_; + foreach my $key (keys %$person) { + $$person{"_$key"} = $$person{$key}; + delete $$person{$key}; + } + new Bio::Biblio::Person (%$person); +} + +# +# takes journal article related attributes from $article and convert +# them into $result and at the end call _convert_article (which is +# shared with book article) +# +sub _convert_journal_article { + my ($result, $source) = @_; + my $article = $$source{'article'}; + + # create and populate both a Journal and a resulting Article objects + my $from_journal = $$article{'journal'}; + my $journal = new Bio::Biblio::MedlineJournal; + $journal->name ($$from_journal{'title'}) if defined $$from_journal{'title'}; + $journal->issn ($$from_journal{'iSSN'}) if defined $$from_journal{'iSSN'}; + $journal->abbreviation ($$from_journal{'iSOAbbreviation'}) if defined $$from_journal{'iSOAbbreviation'}; + $journal->coden ($$from_journal{'coden'}) if defined $$from_journal{'coden'}; + if (defined $$from_journal{'journalIssue'}) { + my $issue = $$from_journal{'journalIssue'}; + $result->volume ($$issue{'volume'}) if defined $$issue{'volume'}; + $result->issue ($$issue{'issue'}) if defined $$issue{'issue'}; + + if (defined $$issue{'pubDate'}) { + my $pub_date = $$issue{'pubDate'}; + my $converted = &_convert_date ($pub_date); + $result->date ($converted) if defined $converted; + + # Some parts of a MEDLINE date are stored just as properties + # because they have almost non-parseable format :-). + $result->medline_date ($$pub_date{'medlineDate'}) if defined $$pub_date{'medlineDate'}; + $result->season ($$pub_date{'season'}) if defined $$pub_date{'season'}; + } + } + + # ...some attributes are in journalInfo (which is outside of the article) + my $journal_info = $$source{'journalInfo'}; + if (defined $journal_info) { + $journal->country ($$journal_info{'country'}) if defined $$journal_info{'country'}; + $journal->medline_ta ($$journal_info{'medlineTA'}) if defined $$journal_info{'medlineTA'}; + $journal->medline_code ($$journal_info{'medlineCode'}) if defined $$journal_info{'medlineCode'}; + $journal->nlm_unique_id ($$journal_info{'nlmUniqueID'}) if defined $$journal_info{'nlmUniqueID'}; + } + + $result->journal ($journal); + &_convert_article ($result, $source); +} + +# +# takes book article related attributes from $article and convert +# them into $result and at the end call _convert_article (which is +# shared with journal article) +# +sub _convert_book_article { + my ($result, $source) = @_; + my $article = $$source{'article'}; + + # create and populate both book and resulting article objects + my $from_book = $$article{'book'}; + my $book = new Bio::Biblio::MedlineBook; + $book->title ($$from_book{'title'}) if defined $$from_book{'title'}; + $book->volume ($$from_book{'volume'}) if defined $$from_book{'volume'}; + $book->series ($$from_book{'collectionTitle'}) if defined $$from_book{'collectionTitle'}; + + if (defined $$from_book{'pubDate'}) { + my $pub_date = $$from_book{'pubDate'}; + my $converted = &_convert_date ($pub_date); + $result->pub_date ($converted) if defined $converted; + + # Some parts of a MEDLINE date are stored just as properties + # because they have almost non-parseable format :-). + $result->medline_date ($$pub_date{'medlineDate'}) if defined $$pub_date{'medlineDate'}; + $result->season ($$pub_date{'season'}) if defined $$pub_date{'season'}; + } + + if (defined $$from_book{'publisher'}) { + my $publisher = new Bio::Biblio::Organisation; + $publisher->name ($$from_book{'publisher'}); + $book->publisher ($publisher); + } + + my @authors = &_convert_providers ($$from_book{'authors'}); + $book->authors (\@authors) if @authors; + + $result->book ($book); + &_convert_article ($result, $source); +} + +# +# takes from $source article related attributes and convert them into +# $article (these attributes are the same both for journal and book +# articles +# +sub _convert_article { + my ($article, $source) = @_; + my $from_article = $$source{'article'}; + + $article->title ($$from_article{'articleTitle'}) if defined $$from_article{'articleTitle'}; + $article->affiliation ($$from_article{'affiliation'}) if defined $$from_article{'affiliation'}; + $article->vernacular_title ($$from_article{'vernacularTitle'}) if defined $$from_article{'vernacularTitle'}; + $article->date_of_electronic_publication + ($$from_article{'dateOfElectronicPublication'}) if defined $$from_article{'dateOfElectronicPublication'}; + + if (defined $$from_article{'pagination'}) { + my $pagination = $$from_article{'pagination'}; + $article->first_page ($$pagination{'startPage'}) if defined $$pagination{'startPage'}; + $article->last_page ($$pagination{'endPage'}) if defined $$pagination{'endPage'}; + $article->medline_page ($$pagination{'medlinePgn'}) if defined $$pagination{'medlinePgn'}; + } + + if (defined $$from_article{'abstract'}) { + my $abstract = $$from_article{'abstract'}; + $article->abstract ($$abstract{'abstractText'}) if defined $$abstract{'abstractText'}; + $article->abstract_type ('text/plain'); + $article->rights ($$abstract{'copyrightInformation'}) if defined $$abstract{'copyrightInformation'}; + } + + if (defined $$from_article{'languages'}) { + my $languages = $$from_article{'languages'}; # ref-array + if ( @{ $languages } > 0) { + $article->language ( $$languages[0] ); + } + if ( @{ $languages } > 1) { + $article->other_languages (join (',', @{ $languages })); + } + } + + my @authors = &_convert_providers ($$from_article{'authors'}); + if (@authors) { + $article->authors (\@authors); + $article->author_list_complete + ($$from_article{'authorListComplete'}) if defined $$from_article{'authorListComplete'}; + } + + # references to database entries are prefixed with database name + # (separated by a slash) + use Bio::Annotation::DBLink; + if (defined $$from_article{'dataBanks'}) { + my $databanks = $$from_article{'dataBanks'}; # a ref-array + my @references; + foreach my $bank ( @{ $databanks } ) { + my $db_name = $$bank{'dataBankName'}; + if (defined $$bank{'accessionNumbers'}) { + foreach my $accn ( @{ $$bank{'accessionNumbers'} } ) { + my $dblink = new Bio::Annotation::DBLink (-primary_id => $accn); + $dblink->database ($db_name); # it does not matter if it is undef + push (@references, $dblink); + } + } + } + if (@references) { + $article->cross_references (\@references); + $article->cross_references_list_complete + ($$from_article{'dataBankListComplete'}) if defined $$from_article{'dataBankListComplete'}; + } + } + + # grants are stored in an array of hashtables (each of the + # hashtables has keys agency, grantID and acronym) + $article->grants ($$from_article{'grants'}) if defined $$from_article{'grants'}; + $article->grant_list_complete + ($$from_article{'grantListComplete'}) if defined $$from_article{'grandListComplete'}; + +} + +# +# takes a ref-array of providers - they can be persons or +# organisations, and returns an array of converted providers +# +sub _convert_providers { + my ($providers) = @_; + return () unless defined $providers; + + my @results; + foreach my $provider ( @{ $providers } ) { + if (defined $$provider{'personalName'}) { + my $converted = &_convert_personal_name ($$provider{'personalName'}); + push (@results, $converted) if defined $converted; + } elsif (defined $$provider{'collectiveName'}) { + push (@results, new Bio::Biblio::Organisation (-name => $$provider{'collectiveName'})); + } else { + new Bio::Biblio::Provider; + } + } + return () unless @results; + return @results; +} + +1; +__END__