0
|
1 # $Id: unigene.pm,v 1.16.2.2 2003/09/15 01:50:47 andrew Exp $
|
|
2 # BioPerl module for Bio::ClusterIO::unigene
|
|
3 #
|
|
4 # Cared for by Andrew Macgregor <andrew@anatomy.otago.ac.nz>
|
|
5 #
|
|
6 # Copyright Andrew Macgregor, Jo-Ann Stanton, David Green
|
|
7 # Molecular Embryology Group, Anatomy & Structural Biology, University of Otago
|
|
8 # http://meg.otago.ac.nz
|
|
9 #
|
|
10 # You may distribute this module under the same terms as perl itself
|
|
11 #
|
|
12 # _history
|
|
13 # April 17, 2002 - Initial implementation by Andrew Macgregor
|
|
14
|
|
15 # POD documentation - main docs before the code
|
|
16
|
|
17 =head1 NAME
|
|
18
|
|
19 Bio::ClusterIO::unigene - UniGene input stream
|
|
20
|
|
21 =head1 SYNOPSIS
|
|
22
|
|
23 Do not use this module directly. Use it via the Bio::ClusterIO class.
|
|
24
|
|
25 =head1 DESCRIPTION
|
|
26
|
|
27 This object reads from Unigene *.data files downloaded from ftp://ftp.ncbi.nih.gov/repository/UniGene/.
|
|
28 It doesn't download and decompress the file, you have to do that yourself.
|
|
29
|
|
30
|
|
31 =head1 FEEDBACK
|
|
32
|
|
33 =head2 Mailing Lists
|
|
34
|
|
35 User feedback is an integral part of the evolution of this and other
|
|
36 Bioperl modules. Send your comments and suggestions preferably to one
|
|
37 of the Bioperl mailing lists. Your participation is much appreciated.
|
|
38
|
|
39 bioperl-l@bioperl.org - General discussion
|
|
40 http://bioperl.org/MailList.shtml - About the mailing lists
|
|
41
|
|
42 =head2 Reporting Bugs
|
|
43
|
|
44 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
45 the bugs and their resolution.
|
|
46 Bug reports can be submitted via email or the web:
|
|
47
|
|
48 bioperl-bugs@bio.perl.org
|
|
49 http://bugzilla.bioperl.org/
|
|
50
|
|
51 =head1 AUTHORS - Andrew Macgregor
|
|
52
|
|
53 Email: andrew@anatomy.otago.ac.nz
|
|
54
|
|
55
|
|
56 =head1 APPENDIX
|
|
57
|
|
58 The rest of the documentation details each of the object
|
|
59 methods. Internal methods are usually preceded with a _
|
|
60
|
|
61 =cut
|
|
62
|
|
63 #'
|
|
64 # Let the code begin...
|
|
65
|
|
66 package Bio::ClusterIO::unigene;
|
|
67 use vars qw(@ISA);
|
|
68 use strict;
|
|
69
|
|
70 use Bio::ClusterIO;
|
|
71 use Bio::Cluster::UniGene;
|
|
72 use Bio::Cluster::ClusterFactory;
|
|
73
|
|
74 @ISA = qw(Bio::ClusterIO);
|
|
75
|
|
76 my %line_is = (
|
|
77 ID => q/ID\s+(\w{2,3}\.\d+)/,
|
|
78 TITLE => q/TITLE\s+(\S.*)/,
|
|
79 GENE => q/GENE\s+(\S.*)/,
|
|
80 CYTOBAND => q/CYTOBAND\s+(\S.*)/,
|
|
81 MGI => q/MGI\s+(\S.*)/,
|
|
82 LOCUSLINK => q/LOCUSLINK\s+(\S.*)/,
|
|
83 EXPRESS => q/EXPRESS\s+(\S.*)/,
|
|
84 GNM_TERMINUS => q/GNM_TERMINUS\s+(\S.*)/,
|
|
85 CHROMOSOME => q/CHROMOSOME\s+(\S.*)/,
|
|
86 STS => q/STS\s+(\S.*)/,
|
|
87 TXMAP => q/TXMAP\s+(\S.*)/,
|
|
88 PROTSIM => q/PROTSIM\s+(\S.*)/,
|
|
89 SCOUNT => q/SCOUNT\s+(\S.*)/,
|
|
90 SEQUENCE => q/SEQUENCE\s+(\S.*)/,
|
|
91 ACC => q/ACC=(\w+)\.?(\d*)/,
|
|
92 NID => q/NID=\s*(\S.*)/,
|
|
93 PID => q/PID=\s*(\S.*)/,
|
|
94 CLONE => q/CLONE=\s*(\S.*)/,
|
|
95 END => q/END=\s*(\S.*)/,
|
|
96 LID => q/LID=\s*(\S.*)/,
|
|
97 MGC => q/MGC=\s*(\S.*)/,
|
|
98 SEQTYPE => q/SEQTYPE=\s*(\S.*)/,
|
|
99 TRACE => q/TRACE=\s*(\S.*)/,
|
|
100 DELIMITER => q/^\/\//
|
|
101 );
|
|
102
|
|
103 # we set the right factory here
|
|
104 sub _initialize {
|
|
105 my($self, @args) = @_;
|
|
106
|
|
107 $self->SUPER::_initialize(@args);
|
|
108 if(! $self->cluster_factory()) {
|
|
109 $self->cluster_factory(Bio::Cluster::ClusterFactory->new(
|
|
110 -type => 'Bio::Cluster::UniGene'));
|
|
111 }
|
|
112 }
|
|
113
|
|
114 =head2 next_cluster
|
|
115
|
|
116 Title : next_cluster
|
|
117 Usage : $unigene = $stream->next_cluster()
|
|
118 Function: returns the next unigene in the stream
|
|
119 Returns : Bio::Cluster::UniGene object
|
|
120 Args : NONE
|
|
121
|
|
122 =cut
|
|
123
|
|
124 sub next_cluster {
|
|
125 my( $self) = @_;
|
|
126 local $/ = "//";
|
|
127 return unless my $entry = $self->_readline;
|
|
128
|
|
129 # set up the variables we'll need
|
|
130 my (%unigene,@express,@locuslink,@chromosome,
|
|
131 @sts,@txmap,@protsim,@sequence);
|
|
132 my $UGobj;
|
|
133
|
|
134 # set up the regexes
|
|
135
|
|
136 # add whitespace parsing and precompile regexes
|
|
137 #foreach (values %line_is) {
|
|
138 # $_ =~ s/\s+/\\s+/g;
|
|
139 # print STDERR "Regex is $_\n";
|
|
140 # #$_ = qr/$_/x;
|
|
141 #}
|
|
142
|
|
143 #$line_is{'TITLE'} = qq/TITLE\\s+(\\S.+)/;
|
|
144
|
|
145 # run each line in an entry against the regexes
|
|
146 foreach my $line (split /\n/, $entry) {
|
|
147 #print STDERR "Wanting to match $line\n";
|
|
148 if ($line =~ /$line_is{ID}/gcx) {
|
|
149 $unigene{ID} = $1;
|
|
150 }
|
|
151 elsif ($line =~ /$line_is{TITLE}/gcx ) {
|
|
152 #print STDERR "MATCHED with [$1]\n";
|
|
153 $unigene{TITLE} = $1;
|
|
154 }
|
|
155 elsif ($line =~ /$line_is{GENE}/gcx) {
|
|
156 $unigene{GENE} = $1;
|
|
157 }
|
|
158 elsif ($line =~ /$line_is{CYTOBAND}/gcx) {
|
|
159 $unigene{CYTOBAND} = $1;
|
|
160 }
|
|
161 elsif ($line =~ /$line_is{MGI}/gcx) {
|
|
162 $unigene{MGI} = $1;
|
|
163 }
|
|
164 elsif ($line =~ /$line_is{LOCUSLINK}/gcx) {
|
|
165 @locuslink = split /;/, $1;
|
|
166 }
|
|
167 elsif ($line =~ /$line_is{EXPRESS}/gcx) {
|
|
168 my $express = $1;
|
|
169 # remove initial semicolon if present
|
|
170 $express =~ s/^;//;
|
|
171 @express = split /\s*;/, $express;
|
|
172 }
|
|
173 elsif ($line =~ /$line_is{GNM_TERMINUS}/gcx) {
|
|
174 $unigene{GNM_TERMINUS} = $1;
|
|
175 }
|
|
176 elsif ($line =~ /$line_is{CHROMOSOME}/gcx) {
|
|
177 push @chromosome, $1;
|
|
178 }
|
|
179 elsif ($line =~ /$line_is{TXMAP}/gcx) {
|
|
180 push @txmap, $1;
|
|
181 }
|
|
182 elsif ($line =~ /$line_is{STS}/gcx) {
|
|
183 push @sts, $1;
|
|
184 }
|
|
185 elsif ($line =~ /$line_is{PROTSIM}/gcx) {
|
|
186 push @protsim, $1;
|
|
187 }
|
|
188 elsif ($line =~ /$line_is{SCOUNT}/gcx) {
|
|
189 $unigene{SCOUNT} = $1;
|
|
190 }
|
|
191 elsif ($line =~ /$line_is{SEQUENCE}/gcx) {
|
|
192 # parse into each sequence line
|
|
193 my $seq = {};
|
|
194 # add unigene id to each seq
|
|
195 #$seq->{unigene_id} = $unigene{ID};
|
|
196 my @items = split /;/,$1;
|
|
197 foreach (@items) {
|
|
198 if (/$line_is{ACC}/gcx) {
|
|
199 $seq->{acc} = $1;
|
|
200 $seq->{version} = $2 if defined $2;
|
|
201 }
|
|
202 elsif (/$line_is{NID}/gcx) {
|
|
203 $seq->{nid} = $1;
|
|
204 }
|
|
205 elsif (/$line_is{PID}/gcx) {
|
|
206 $seq->{pid} = $1;
|
|
207 }
|
|
208 elsif (/$line_is{CLONE}/gcx) {
|
|
209 $seq->{clone} = $1;
|
|
210 }
|
|
211 elsif (/$line_is{END}/gcx) {
|
|
212 $seq->{end} = $1;
|
|
213 }
|
|
214 elsif (/$line_is{LID}/gcx) {
|
|
215 $seq->{lid} = $1;
|
|
216 }
|
|
217 elsif (/$line_is{MGC}/gcx) {
|
|
218 $seq->{mgc} = $1;
|
|
219 }
|
|
220 elsif (/$line_is{SEQTYPE}/gcx) {
|
|
221 $seq->{seqtype} = $1;
|
|
222 }
|
|
223 elsif (/$line_is{TRACE}/gcx) {
|
|
224 $seq->{trace} = $1;
|
|
225 }
|
|
226 }
|
|
227 push @sequence, $seq;
|
|
228 }
|
|
229 elsif ($line =~ /$line_is{DELIMITER}/gcx) {
|
|
230 # at the end of the record, add data to the object
|
|
231 $UGobj = $self->cluster_factory->create_object(
|
|
232 -display_id => $unigene{ID},
|
|
233 -description => $unigene{TITLE},
|
|
234 -size => $unigene{SCOUNT},
|
|
235 -members => \@sequence);
|
|
236 $UGobj->gene($unigene{GENE}) if defined ($unigene{GENE});
|
|
237 $UGobj->cytoband($unigene{CYTOBAND}) if defined($unigene{CYTOBAND});
|
|
238 $UGobj->mgi($unigene{MGI}) if defined ($unigene{MGI});
|
|
239 $UGobj->locuslink(\@locuslink);
|
|
240 $UGobj->express(\@express);
|
|
241 $UGobj->gnm_terminus($unigene{GNM_TERMINUS}) if defined ($unigene{GNM_TERMINUS});
|
|
242 $UGobj->chromosome(\@chromosome);
|
|
243 $UGobj->sts(\@sts);
|
|
244 $UGobj->txmap(\@txmap);
|
|
245 $UGobj->protsim(\@protsim);
|
|
246 }
|
|
247 }
|
|
248 return $UGobj;
|
|
249 }
|
|
250
|
|
251 1;
|
|
252
|