comparison variant_effect_predictor/Bio/ClusterIO/unigene.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 # $Id: unigene.pm,v 1.16.2.2 2003/09/15 01:50:47 andrew Exp $
2 # BioPerl module for Bio::ClusterIO::unigene
3 #
4 # Cared for by Andrew Macgregor <andrew@anatomy.otago.ac.nz>
5 #
6 # Copyright Andrew Macgregor, Jo-Ann Stanton, David Green
7 # Molecular Embryology Group, Anatomy & Structural Biology, University of Otago
8 # http://meg.otago.ac.nz
9 #
10 # You may distribute this module under the same terms as perl itself
11 #
12 # _history
13 # April 17, 2002 - Initial implementation by Andrew Macgregor
14
15 # POD documentation - main docs before the code
16
17 =head1 NAME
18
19 Bio::ClusterIO::unigene - UniGene input stream
20
21 =head1 SYNOPSIS
22
23 Do not use this module directly. Use it via the Bio::ClusterIO class.
24
25 =head1 DESCRIPTION
26
27 This object reads UniGene *.data files downloaded from ftp://ftp.ncbi.nih.gov/repository/UniGene/.
28 It does not download or decompress the files; you have to do that yourself.
29
30
31 =head1 FEEDBACK
32
33 =head2 Mailing Lists
34
35 User feedback is an integral part of the evolution of this and other
36 Bioperl modules. Send your comments and suggestions preferably to one
37 of the Bioperl mailing lists. Your participation is much appreciated.
38
39 bioperl-l@bioperl.org - General discussion
40 http://bioperl.org/MailList.shtml - About the mailing lists
41
42 =head2 Reporting Bugs
43
44 Report bugs to the Bioperl bug tracking system to help us keep track
45 of the bugs and their resolution.
46 Bug reports can be submitted via email or the web:
47
48 bioperl-bugs@bio.perl.org
49 http://bugzilla.bioperl.org/
50
51 =head1 AUTHORS - Andrew Macgregor
52
53 Email: andrew@anatomy.otago.ac.nz
54
55
56 =head1 APPENDIX
57
58 The rest of the documentation details each of the object
59 methods. Internal methods are usually preceded with a _
60
61 =cut
62
63 #'
64 # Let the code begin...
65
66 package Bio::ClusterIO::unigene;
67 use vars qw(@ISA);
68 use strict;
69
70 use Bio::ClusterIO;
71 use Bio::Cluster::UniGene;
72 use Bio::Cluster::ClusterFactory;
73
74 @ISA = qw(Bio::ClusterIO);
75
# Patterns used by next_cluster() to recognise the parts of a UniGene record.
#
# The first group (ID .. SEQUENCE) classifies a whole line of the record and
# captures the value that follows the keyword. These are anchored to the start
# of the line so that a keyword occurring inside another field's value (for
# example "ID Hs.99" inside a TITLE) cannot be mistaken for a line type.
#
# The second group (ACC .. TRACE) matches the "KEY=value" items found on a
# SEQUENCE line after it has been split on ';'. Those items may carry leading
# whitespace, so they are deliberately left unanchored.
#
# DELIMITER matches the '//' line that terminates a record.
my %line_is = (
    # whole-line fields
    ID           => qr/^ID\s+(\w{2,3}\.\d+)/,
    TITLE        => qr/^TITLE\s+(\S.*)/,
    GENE         => qr/^GENE\s+(\S.*)/,
    CYTOBAND     => qr/^CYTOBAND\s+(\S.*)/,
    MGI          => qr/^MGI\s+(\S.*)/,
    LOCUSLINK    => qr/^LOCUSLINK\s+(\S.*)/,
    EXPRESS      => qr/^EXPRESS\s+(\S.*)/,
    GNM_TERMINUS => qr/^GNM_TERMINUS\s+(\S.*)/,
    CHROMOSOME   => qr/^CHROMOSOME\s+(\S.*)/,
    STS          => qr/^STS\s+(\S.*)/,
    TXMAP        => qr/^TXMAP\s+(\S.*)/,
    PROTSIM      => qr/^PROTSIM\s+(\S.*)/,
    SCOUNT       => qr/^SCOUNT\s+(\S.*)/,
    SEQUENCE     => qr/^SEQUENCE\s+(\S.*)/,

    # items within a SEQUENCE line; ACC captures accession and version
    ACC          => qr/ACC=(\w+)\.?(\d*)/,
    NID          => qr/NID=\s*(\S.*)/,
    PID          => qr/PID=\s*(\S.*)/,
    CLONE        => qr/CLONE=\s*(\S.*)/,
    END          => qr/END=\s*(\S.*)/,
    LID          => qr/LID=\s*(\S.*)/,
    MGC          => qr/MGC=\s*(\S.*)/,
    SEQTYPE      => qr/SEQTYPE=\s*(\S.*)/,
    TRACE        => qr/TRACE=\s*(\S.*)/,

    # end-of-record marker
    DELIMITER    => qr{^//},
);
102
# Runs the parent class's _initialize with the same arguments, then makes
# sure a cluster factory is set: if none is present, a
# Bio::Cluster::ClusterFactory configured with -type 'Bio::Cluster::UniGene'
# is installed. A factory that is already set is left untouched.
sub _initialize {
    my $self = shift;
    my @args = @_;

    $self->SUPER::_initialize(@args);

    if (!$self->cluster_factory) {
        my $default_factory = Bio::Cluster::ClusterFactory->new(
            -type => 'Bio::Cluster::UniGene',
        );
        $self->cluster_factory($default_factory);
    }
}
113
=head2 next_cluster

 Title   : next_cluster
 Usage   : $unigene = $stream->next_cluster()
 Function: returns the next unigene in the stream
 Returns : the object built by the cluster factory (a Bio::Cluster::UniGene
           object unless another factory was supplied); nothing when no
           more input can be read, and undef when the text read contains
           no '//' terminator line
 Args    : NONE

=cut

sub next_cluster {
    my ($self) = @_;

    # A record ends with a line that starts with '//'. Using "\n//" rather
    # than a bare "//" as the input record separator means a '//' that occurs
    # inside a field value does not cut the record short. The override is
    # confined to the read itself so that the object construction below runs
    # with the caller's $/.
    my $entry = do {
        local $/ = "\n//";
        $self->_readline;
    };
    return unless $entry;

    # single-valued fields go into %unigene, repeatable ones into the arrays
    my (%unigene, @express, @locuslink, @chromosome,
        @sts, @txmap, @protsim, @sequence);
    my $UGobj;

    # Classify each line of the record; the first pattern that matches wins,
    # so the order of the tests below is significant.
    foreach my $line (split /\n/, $entry) {
        if ($line =~ /$line_is{ID}/) {
            $unigene{ID} = $1;
        }
        elsif ($line =~ /$line_is{TITLE}/) {
            $unigene{TITLE} = $1;
        }
        elsif ($line =~ /$line_is{GENE}/) {
            $unigene{GENE} = $1;
        }
        elsif ($line =~ /$line_is{CYTOBAND}/) {
            $unigene{CYTOBAND} = $1;
        }
        elsif ($line =~ /$line_is{MGI}/) {
            $unigene{MGI} = $1;
        }
        elsif ($line =~ /$line_is{LOCUSLINK}/) {
            @locuslink = split /;/, $1;
        }
        elsif ($line =~ /$line_is{EXPRESS}/) {
            my $express = $1;
            # remove initial semicolon if present
            $express =~ s/^;//;
            @express = split /\s*;/, $express;
        }
        elsif ($line =~ /$line_is{GNM_TERMINUS}/) {
            $unigene{GNM_TERMINUS} = $1;
        }
        elsif ($line =~ /$line_is{CHROMOSOME}/) {
            push @chromosome, $1;
        }
        elsif ($line =~ /$line_is{TXMAP}/) {
            push @txmap, $1;
        }
        elsif ($line =~ /$line_is{STS}/) {
            push @sts, $1;
        }
        elsif ($line =~ /$line_is{PROTSIM}/) {
            push @protsim, $1;
        }
        elsif ($line =~ /$line_is{SCOUNT}/) {
            $unigene{SCOUNT} = $1;
        }
        elsif ($line =~ /$line_is{SEQUENCE}/) {
            # copy the capture before any further matching can clobber it
            my $sequence_value = $1;
            push @sequence, _parse_sequence_line($sequence_value);
        }
        elsif ($line =~ /$line_is{DELIMITER}/) {
            # at the end of the record, add data to the object
            $UGobj = $self->cluster_factory->create_object(
                -display_id  => $unigene{ID},
                -description => $unigene{TITLE},
                -size        => $unigene{SCOUNT},
                -members     => \@sequence,
            );
            # optional single-valued fields are only set when they were seen
            $UGobj->gene($unigene{GENE})         if defined $unigene{GENE};
            $UGobj->cytoband($unigene{CYTOBAND}) if defined $unigene{CYTOBAND};
            $UGobj->mgi($unigene{MGI})           if defined $unigene{MGI};
            $UGobj->locuslink(\@locuslink);
            $UGobj->express(\@express);
            $UGobj->gnm_terminus($unigene{GNM_TERMINUS})
                if defined $unigene{GNM_TERMINUS};
            $UGobj->chromosome(\@chromosome);
            $UGobj->sts(\@sts);
            $UGobj->txmap(\@txmap);
            $UGobj->protsim(\@protsim);
        }
    }

    # stays undef when the entry contained no delimiter line
    return $UGobj;
}

# Turns the value of one SEQUENCE line (a ';'-separated list of "KEY=value"
# items) into a hash reference keyed by the lower-cased item names
# (acc, version, nid, pid, clone, end, lid, mgc, seqtype, trace).
# Items that match none of the known keys are ignored. An empty hash
# reference is returned when nothing matches.
sub _parse_sequence_line {
    my ($value) = @_;

    my %seq;
    foreach my $item (split /;/, $value) {
        if ($item =~ /$line_is{ACC}/) {
            $seq{acc} = $1;
            # the version group is (\d*), so it is the empty string when
            # the accession carries no version
            $seq{version} = $2 if defined $2;
        }
        elsif ($item =~ /$line_is{NID}/) {
            $seq{nid} = $1;
        }
        elsif ($item =~ /$line_is{PID}/) {
            $seq{pid} = $1;
        }
        elsif ($item =~ /$line_is{CLONE}/) {
            $seq{clone} = $1;
        }
        elsif ($item =~ /$line_is{END}/) {
            $seq{end} = $1;
        }
        elsif ($item =~ /$line_is{LID}/) {
            $seq{lid} = $1;
        }
        elsif ($item =~ /$line_is{MGC}/) {
            $seq{mgc} = $1;
        }
        elsif ($item =~ /$line_is{SEQTYPE}/) {
            $seq{seqtype} = $1;
        }
        elsif ($item =~ /$line_is{TRACE}/) {
            $seq{trace} = $1;
        }
    }
    return \%seq;
}
250
251 1;
252