Mercurial > repos > mahtabm > ensemb_rep_gvl
diff variant_effect_predictor/Bio/ClusterIO/unigene.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/ClusterIO/unigene.pm Thu Apr 11 06:29:17 2013 -0400 @@ -0,0 +1,252 @@ +# $Id: unigene.pm,v 1.16.2.2 2003/09/15 01:50:47 andrew Exp $ +# BioPerl module for Bio::ClusterIO::unigene +# +# Cared for by Andrew Macgregor <andrew@anatomy.otago.ac.nz> +# +# Copyright Andrew Macgregor, Jo-Ann Stanton, David Green +# Molecular Embryology Group, Anatomy & Structural Biology, University of Otago +# http://meg.otago.ac.nz +# +# You may distribute this module under the same terms as perl itself +# +# _history +# April 17, 2002 - Initial implementation by Andrew Macgregor + +# POD documentation - main docs before the code + +=head1 NAME + +Bio::ClusterIO::unigene - UniGene input stream + +=head1 SYNOPSIS + +Do not use this module directly. Use it via the Bio::ClusterIO class. + +=head1 DESCRIPTION + +This object reads from Unigene *.data files downloaded from ftp://ftp.ncbi.nih.gov/repository/UniGene/. +It doesn't download and decompress the file, you have to do that yourself. + + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to one +of the Bioperl mailing lists. Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bioperl.org/MailList.shtml - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track + the bugs and their resolution. + Bug reports can be submitted via email or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHORS - Andrew Macgregor + +Email: andrew@anatomy.otago.ac.nz + + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +#' +# Let the code begin... + +package Bio::ClusterIO::unigene; +use vars qw(@ISA); +use strict; + +use Bio::ClusterIO; +use Bio::Cluster::UniGene; +use Bio::Cluster::ClusterFactory; + +@ISA = qw(Bio::ClusterIO); + +my %line_is = ( + ID => q/ID\s+(\w{2,3}\.\d+)/, + TITLE => q/TITLE\s+(\S.*)/, + GENE => q/GENE\s+(\S.*)/, + CYTOBAND => q/CYTOBAND\s+(\S.*)/, + MGI => q/MGI\s+(\S.*)/, + LOCUSLINK => q/LOCUSLINK\s+(\S.*)/, + EXPRESS => q/EXPRESS\s+(\S.*)/, + GNM_TERMINUS => q/GNM_TERMINUS\s+(\S.*)/, + CHROMOSOME => q/CHROMOSOME\s+(\S.*)/, + STS => q/STS\s+(\S.*)/, + TXMAP => q/TXMAP\s+(\S.*)/, + PROTSIM => q/PROTSIM\s+(\S.*)/, + SCOUNT => q/SCOUNT\s+(\S.*)/, + SEQUENCE => q/SEQUENCE\s+(\S.*)/, + ACC => q/ACC=(\w+)\.?(\d*)/, + NID => q/NID=\s*(\S.*)/, + PID => q/PID=\s*(\S.*)/, + CLONE => q/CLONE=\s*(\S.*)/, + END => q/END=\s*(\S.*)/, + LID => q/LID=\s*(\S.*)/, + MGC => q/MGC=\s*(\S.*)/, + SEQTYPE => q/SEQTYPE=\s*(\S.*)/, + TRACE => q/TRACE=\s*(\S.*)/, + DELIMITER => q/^\/\// +); + +# we set the right factory here +sub _initialize { + my($self, @args) = @_; + + $self->SUPER::_initialize(@args); + if(! $self->cluster_factory()) { + $self->cluster_factory(Bio::Cluster::ClusterFactory->new( + -type => 'Bio::Cluster::UniGene')); + } +} + +=head2 next_cluster + + Title : next_cluster + Usage : $unigene = $stream->next_cluster() + Function: returns the next unigene in the stream + Returns : Bio::Cluster::UniGene object + Args : NONE + +=cut + +sub next_cluster { + my( $self) = @_; + local $/ = "//"; + return unless my $entry = $self->_readline; + +# set up the variables we'll need + my (%unigene,@express,@locuslink,@chromosome, + @sts,@txmap,@protsim,@sequence); + my $UGobj; + +# set up the regexes + +# add whitespace parsing and precompile regexes +#foreach (values %line_is) { +# $_ =~ s/\s+/\\s+/g; +# print STDERR "Regex is $_\n"; +# #$_ = qr/$_/x; +#} + +#$line_is{'TITLE'} = qq/TITLE\\s+(\\S.+)/; + +# run each line in an entry against the regexes + foreach my $line (split /\n/, $entry) { + #print STDERR "Wanting to match $line\n"; + if ($line =~ /$line_is{ID}/gcx) { + $unigene{ID} = $1; + } + elsif ($line =~ /$line_is{TITLE}/gcx ) { + #print STDERR "MATCHED with [$1]\n"; + $unigene{TITLE} = $1; + } + elsif ($line =~ /$line_is{GENE}/gcx) { + $unigene{GENE} = $1; + } + elsif ($line =~ /$line_is{CYTOBAND}/gcx) { + $unigene{CYTOBAND} = $1; + } + elsif ($line =~ /$line_is{MGI}/gcx) { + $unigene{MGI} = $1; + } + elsif ($line =~ /$line_is{LOCUSLINK}/gcx) { + @locuslink = split /;/, $1; + } + elsif ($line =~ /$line_is{EXPRESS}/gcx) { + my $express = $1; + # remove initial semicolon if present + $express =~ s/^;//; + @express = split /\s*;/, $express; + } + elsif ($line =~ /$line_is{GNM_TERMINUS}/gcx) { + $unigene{GNM_TERMINUS} = $1; + } + elsif ($line =~ /$line_is{CHROMOSOME}/gcx) { + push @chromosome, $1; + } + elsif ($line =~ /$line_is{TXMAP}/gcx) { + push @txmap, $1; + } + elsif ($line =~ /$line_is{STS}/gcx) { + push @sts, $1; + } + elsif ($line =~ /$line_is{PROTSIM}/gcx) { + push @protsim, $1; + } + elsif ($line =~ /$line_is{SCOUNT}/gcx) { + $unigene{SCOUNT} = $1; + } + elsif ($line =~ /$line_is{SEQUENCE}/gcx) { + # parse into each sequence line + my $seq = {}; + # add unigene id to each seq + #$seq->{unigene_id} = $unigene{ID}; + my @items = split /;/,$1; + foreach (@items) { + if (/$line_is{ACC}/gcx) { + $seq->{acc} = $1; + $seq->{version} = $2 if defined $2; + } + elsif (/$line_is{NID}/gcx) { + $seq->{nid} = $1; + } + elsif (/$line_is{PID}/gcx) { + $seq->{pid} = $1; + } + elsif (/$line_is{CLONE}/gcx) { + $seq->{clone} = $1; + } + elsif (/$line_is{END}/gcx) { + $seq->{end} = $1; + } + elsif (/$line_is{LID}/gcx) { + $seq->{lid} = $1; + } + elsif (/$line_is{MGC}/gcx) { + $seq->{mgc} = $1; + } + elsif (/$line_is{SEQTYPE}/gcx) { + $seq->{seqtype} = $1; + } + elsif (/$line_is{TRACE}/gcx) { + $seq->{trace} = $1; + } + } + push @sequence, $seq; + } + elsif ($line =~ /$line_is{DELIMITER}/gcx) { + # at the end of the record, add data to the object + $UGobj = $self->cluster_factory->create_object( + -display_id => $unigene{ID}, + -description => $unigene{TITLE}, + -size => $unigene{SCOUNT}, + -members => \@sequence); + $UGobj->gene($unigene{GENE}) if defined ($unigene{GENE}); + $UGobj->cytoband($unigene{CYTOBAND}) if defined($unigene{CYTOBAND}); + $UGobj->mgi($unigene{MGI}) if defined ($unigene{MGI}); + $UGobj->locuslink(\@locuslink); + $UGobj->express(\@express); + $UGobj->gnm_terminus($unigene{GNM_TERMINUS}) if defined ($unigene{GNM_TERMINUS}); + $UGobj->chromosome(\@chromosome); + $UGobj->sts(\@sts); + $UGobj->txmap(\@txmap); + $UGobj->protsim(\@protsim); + } + } + return $UGobj; +} + +1; +