Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/ClusterIO/unigene.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1f6dce3d34e0 |
|---|---|
| 1 # $Id: unigene.pm,v 1.16.2.2 2003/09/15 01:50:47 andrew Exp $ | |
| 2 # BioPerl module for Bio::ClusterIO::unigene | |
| 3 # | |
| 4 # Cared for by Andrew Macgregor <andrew@anatomy.otago.ac.nz> | |
| 5 # | |
| 6 # Copyright Andrew Macgregor, Jo-Ann Stanton, David Green | |
| 7 # Molecular Embryology Group, Anatomy & Structural Biology, University of Otago | |
| 8 # http://meg.otago.ac.nz | |
| 9 # | |
| 10 # You may distribute this module under the same terms as perl itself | |
| 11 # | |
| 12 # _history | |
| 13 # April 17, 2002 - Initial implementation by Andrew Macgregor | |
| 14 | |
| 15 # POD documentation - main docs before the code | |
| 16 | |
| 17 =head1 NAME | |
| 18 | |
| 19 Bio::ClusterIO::unigene - UniGene input stream | |
| 20 | |
| 21 =head1 SYNOPSIS | |
| 22 | |
| 23 Do not use this module directly. Use it via the Bio::ClusterIO class. | |
| 24 | |
| 25 =head1 DESCRIPTION | |
| 26 | |
| 27 This object reads from Unigene *.data files downloaded from ftp://ftp.ncbi.nih.gov/repository/UniGene/. | |
| 28 It doesn't download or decompress the file; you have to do that yourself. | |
| 29 | |
| 30 | |
| 31 =head1 FEEDBACK | |
| 32 | |
| 33 =head2 Mailing Lists | |
| 34 | |
| 35 User feedback is an integral part of the evolution of this and other | |
| 36 Bioperl modules. Send your comments and suggestions preferably to one | |
| 37 of the Bioperl mailing lists. Your participation is much appreciated. | |
| 38 | |
| 39 bioperl-l@bioperl.org - General discussion | |
| 40 http://bioperl.org/MailList.shtml - About the mailing lists | |
| 41 | |
| 42 =head2 Reporting Bugs | |
| 43 | |
| 44 Report bugs to the Bioperl bug tracking system to help us keep track | |
| 45 of the bugs and their resolution. | |
| 46 Bug reports can be submitted via email or the web: | |
| 47 | |
| 48 bioperl-bugs@bio.perl.org | |
| 49 http://bugzilla.bioperl.org/ | |
| 50 | |
| 51 =head1 AUTHORS - Andrew Macgregor | |
| 52 | |
| 53 Email: andrew@anatomy.otago.ac.nz | |
| 54 | |
| 55 | |
| 56 =head1 APPENDIX | |
| 57 | |
| 58 The rest of the documentation details each of the object | |
| 59 methods. Internal methods are usually preceded with a _ | |
| 60 | |
| 61 =cut | |
| 62 | |
| 63 #' | |
| 64 # Let the code begin... | |
| 65 | |
| 66 package Bio::ClusterIO::unigene; | |
| 67 use vars qw(@ISA); | |
| 68 use strict; | |
| 69 | |
| 70 use Bio::ClusterIO; | |
| 71 use Bio::Cluster::UniGene; | |
| 72 use Bio::Cluster::ClusterFactory; | |
| 73 | |
| 74 @ISA = qw(Bio::ClusterIO); | |
| 75 | |
# Regex source strings keyed by UniGene field tag. They are stored as plain
# strings and interpolated into pattern matches by next_cluster().
my %line_is = (
    # Whole-line fields of a record: tag, whitespace, then the value.
    ID           => q{ID\s+(\w{2,3}\.\d+)},
    TITLE        => q{TITLE\s+(\S.*)},
    GENE         => q{GENE\s+(\S.*)},
    CYTOBAND     => q{CYTOBAND\s+(\S.*)},
    MGI          => q{MGI\s+(\S.*)},
    LOCUSLINK    => q{LOCUSLINK\s+(\S.*)},
    EXPRESS      => q{EXPRESS\s+(\S.*)},
    GNM_TERMINUS => q{GNM_TERMINUS\s+(\S.*)},
    CHROMOSOME   => q{CHROMOSOME\s+(\S.*)},
    STS          => q{STS\s+(\S.*)},
    TXMAP        => q{TXMAP\s+(\S.*)},
    PROTSIM      => q{PROTSIM\s+(\S.*)},
    SCOUNT       => q{SCOUNT\s+(\S.*)},
    SEQUENCE     => q{SEQUENCE\s+(\S.*)},

    # TAG=value items found inside a SEQUENCE line.
    ACC          => q{ACC=(\w+)\.?(\d*)},
    NID          => q{NID=\s*(\S.*)},
    PID          => q{PID=\s*(\S.*)},
    CLONE        => q{CLONE=\s*(\S.*)},
    END          => q{END=\s*(\S.*)},
    LID          => q{LID=\s*(\S.*)},
    MGC          => q{MGC=\s*(\S.*)},
    SEQTYPE      => q{SEQTYPE=\s*(\S.*)},
    TRACE        => q{TRACE=\s*(\S.*)},

    # Line that terminates a record.
    DELIMITER    => q{^//},
);
| 102 | |
# Run the parent class initialisation, then make sure a cluster factory is
# set: when none is present after the parent has run, install a
# Bio::Cluster::ClusterFactory that builds Bio::Cluster::UniGene objects.
sub _initialize {
    my ($self, @args) = @_;
    $self->SUPER::_initialize(@args);

    my @factory_args = (-type => 'Bio::Cluster::UniGene');
    if (! $self->cluster_factory()) {
        $self->cluster_factory(
            Bio::Cluster::ClusterFactory->new(@factory_args)
        );
    }
}
| 113 | |
=head2 next_cluster

 Title   : next_cluster
 Usage   : $unigene = $stream->next_cluster()
 Function: returns the next unigene in the stream
 Returns : the object built by the cluster factory (Bio::Cluster::UniGene
           unless another factory was supplied); nothing when the stream
           has no more data, undef when the text read contains no "//"
           delimiter line
 Args    : NONE

=cut

sub next_cluster {
    my ($self) = @_;

    # A record ends with a line that starts with "//". Using "\n//" rather
    # than a bare "//" as the input record separator keeps a "//" that
    # appears inside a field value (for example a URL in a TITLE line) from
    # cutting the record short, which would lose the delimiter line and
    # make this method return undef in the middle of the stream.
    local $/ = "\n//";
    return unless my $entry = $self->_readline;

    # Per-record accumulators: single-valued fields go into %unigene,
    # repeatable or list-valued fields into their own arrays.
    my (%unigene, @express, @locuslink, @chromosome,
        @sts, @txmap, @protsim, @sequence);
    my $UGobj;

    # Run each line of the entry against the field regexes. The patterns
    # are not anchored, so the order of the tests below is significant and
    # must not be changed.
    foreach my $line (split /\n/, $entry) {
        if ($line =~ /$line_is{ID}/x) {
            $unigene{ID} = $1;
        }
        elsif ($line =~ /$line_is{TITLE}/x) {
            $unigene{TITLE} = $1;
        }
        elsif ($line =~ /$line_is{GENE}/x) {
            $unigene{GENE} = $1;
        }
        elsif ($line =~ /$line_is{CYTOBAND}/x) {
            $unigene{CYTOBAND} = $1;
        }
        elsif ($line =~ /$line_is{MGI}/x) {
            $unigene{MGI} = $1;
        }
        elsif ($line =~ /$line_is{LOCUSLINK}/x) {
            @locuslink = split /;/, $1;
        }
        elsif ($line =~ /$line_is{EXPRESS}/x) {
            my $express = $1;
            # remove initial semicolon if present
            $express =~ s/^;//;
            @express = split /\s*;/, $express;
        }
        elsif ($line =~ /$line_is{GNM_TERMINUS}/x) {
            $unigene{GNM_TERMINUS} = $1;
        }
        elsif ($line =~ /$line_is{CHROMOSOME}/x) {
            push @chromosome, $1;
        }
        elsif ($line =~ /$line_is{TXMAP}/x) {
            push @txmap, $1;
        }
        elsif ($line =~ /$line_is{STS}/x) {
            push @sts, $1;
        }
        elsif ($line =~ /$line_is{PROTSIM}/x) {
            push @protsim, $1;
        }
        elsif ($line =~ /$line_is{SCOUNT}/x) {
            $unigene{SCOUNT} = $1;
        }
        elsif ($line =~ /$line_is{SEQUENCE}/x) {
            # one hash reference per SEQUENCE line
            push @sequence, _parse_sequence_line($1);
        }
        elsif ($line =~ /$line_is{DELIMITER}/x) {
            # at the end of the record, add data to the object
            $UGobj = $self->cluster_factory->create_object(
                -display_id  => $unigene{ID},
                -description => $unigene{TITLE},
                -size        => $unigene{SCOUNT},
                -members     => \@sequence,
            );
            # Optional scalar fields are only set when the record had them;
            # list fields are always passed, possibly empty.
            $UGobj->gene($unigene{GENE})         if defined $unigene{GENE};
            $UGobj->cytoband($unigene{CYTOBAND}) if defined $unigene{CYTOBAND};
            $UGobj->mgi($unigene{MGI})           if defined $unigene{MGI};
            $UGobj->locuslink(\@locuslink);
            $UGobj->express(\@express);
            $UGobj->gnm_terminus($unigene{GNM_TERMINUS})
                if defined $unigene{GNM_TERMINUS};
            $UGobj->chromosome(\@chromosome);
            $UGobj->sts(\@sts);
            $UGobj->txmap(\@txmap);
            $UGobj->protsim(\@protsim);
        }
    }
    return $UGobj;
}

# Parse the value part of one SEQUENCE line ("ACC=...; NID=...; ...") into
# a new hash reference. Each ';'-separated item is stored under the first
# tag it matches, tried in the order ACC, NID, PID, CLONE, END, LID, MGC,
# SEQTYPE, TRACE, using the lower-cased tag as key (ACC additionally fills
# "version"). Items that match no tag are ignored.
sub _parse_sequence_line {
    my ($fields) = @_;
    my %seq;

    ITEM:
    foreach my $item (split /;/, $fields) {
        if ($item =~ /$line_is{ACC}/x) {
            $seq{acc}     = $1;
            $seq{version} = $2 if defined $2;
            next ITEM;
        }
        foreach my $tag (qw(NID PID CLONE END LID MGC SEQTYPE TRACE)) {
            if ($item =~ /$line_is{$tag}/x) {
                $seq{ lc $tag } = $1;
                next ITEM;
            }
        }
    }
    return \%seq;
}
| 250 | |
| 251 1; | |
| 252 |
