Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/ClusterIO/unigene.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 # $Id: unigene.pm,v 1.16.2.2 2003/09/15 01:50:47 andrew Exp $ | |
2 # BioPerl module for Bio::ClusterIO::unigene | |
3 # | |
4 # Cared for by Andrew Macgregor <andrew@anatomy.otago.ac.nz> | |
5 # | |
6 # Copyright Andrew Macgregor, Jo-Ann Stanton, David Green | |
7 # Molecular Embryology Group, Anatomy & Structural Biology, University of Otago | |
8 # http://meg.otago.ac.nz | |
9 # | |
10 # You may distribute this module under the same terms as perl itself | |
11 # | |
12 # _history | |
13 # April 17, 2002 - Initial implementation by Andrew Macgregor | |
14 | |
15 # POD documentation - main docs before the code | |
16 | |
17 =head1 NAME | |
18 | |
19 Bio::ClusterIO::unigene - UniGene input stream | |
20 | |
21 =head1 SYNOPSIS | |
22 | |
23 Do not use this module directly. Use it via the Bio::ClusterIO class. | |
24 | |
25 =head1 DESCRIPTION | |
26 | |
27 This object reads from UniGene *.data files downloaded from ftp://ftp.ncbi.nih.gov/repository/UniGene/. | |
28 It does not download or decompress the files; you have to do that yourself. | |
29 | |
30 | |
31 =head1 FEEDBACK | |
32 | |
33 =head2 Mailing Lists | |
34 | |
35 User feedback is an integral part of the evolution of this and other | |
36 Bioperl modules. Send your comments and suggestions preferably to one | |
37 of the Bioperl mailing lists. Your participation is much appreciated. | |
38 | |
39 bioperl-l@bioperl.org - General discussion | |
40 http://bioperl.org/MailList.shtml - About the mailing lists | |
41 | |
42 =head2 Reporting Bugs | |
43 | |
44 Report bugs to the Bioperl bug tracking system to help us keep track | |
45 of the bugs and their resolution. | |
46 Bug reports can be submitted via email or the web: | |
47 | |
48 bioperl-bugs@bio.perl.org | |
49 http://bugzilla.bioperl.org/ | |
50 | |
51 =head1 AUTHORS - Andrew Macgregor | |
52 | |
53 Email: andrew@anatomy.otago.ac.nz | |
54 | |
55 | |
56 =head1 APPENDIX | |
57 | |
58 The rest of the documentation details each of the object | |
59 methods. Internal methods are usually preceded with a _ | |
60 | |
61 =cut | |
62 | |
63 #' | |
64 # Let the code begin... | |
65 | |
66 package Bio::ClusterIO::unigene; | |
67 use vars qw(@ISA); | |
68 use strict; | |
69 | |
70 use Bio::ClusterIO; | |
71 use Bio::Cluster::UniGene; | |
72 use Bio::Cluster::ClusterFactory; | |
73 | |
74 @ISA = qw(Bio::ClusterIO); | |
75 | |
# Regex source strings used by next_cluster(), keyed by the UniGene keyword
# they recognise.  Two families share a common shape and are generated:
#   - record-level keywords:  "KEYWORD<whitespace>value"
#   - SEQUENCE attributes:    "KEYWORD=value"
# ID, ACC and DELIMITER have their own patterns.
my @record_keywords = qw(
    TITLE GENE CYTOBAND MGI LOCUSLINK EXPRESS GNM_TERMINUS
    CHROMOSOME STS TXMAP PROTSIM SCOUNT SEQUENCE
);
my @sequence_keywords = qw(NID PID CLONE END LID MGC SEQTYPE TRACE);

my %line_is = (
    # cluster identifier, e.g. a 2-3 character prefix, a dot and a number
    ID        => q{ID\s+(\w{2,3}\.\d+)},
    # accession with an optional ".version" suffix (two capture groups)
    ACC       => q{ACC=(\w+)\.?(\d*)},
    # a line starting with "//" closes a record
    DELIMITER => q{^//},
    ( map { ( $_ => $_ . q{\s+(\S.*)} ) } @record_keywords ),
    ( map { ( $_ => $_ . q{=\s*(\S.*)} ) } @sequence_keywords ),
);
102 | |
# Initialise the stream via the parent class, then make sure a cluster
# factory is in place.  If none has been set after the parent's
# initialisation, install one that builds Bio::Cluster::UniGene objects.
sub _initialize {
    my $self = shift;

    # whatever is left in @_ are the constructor arguments
    $self->SUPER::_initialize(@_);

    if (!$self->cluster_factory()) {
        my $factory = Bio::Cluster::ClusterFactory->new(
            -type => 'Bio::Cluster::UniGene');
        $self->cluster_factory($factory);
    }
}
113 | |
=head2 next_cluster

 Title   : next_cluster
 Usage   : $unigene = $stream->next_cluster()
 Function: Reads the next UniGene record from the stream and builds a
           cluster object from it.
 Returns : the object created by the stream's cluster factory, or undef
           when nothing more can be read or the chunk that was read
           contains no record terminator line ("//")
 Args    : NONE

=cut

sub next_cluster {
    my ($self) = @_;

    # A record is closed by a line that starts with "//".  Including the
    # preceding newline in the separator keeps a "//" that appears in the
    # middle of a line (e.g. inside a title) from cutting the record short.
    local $/ = "\n//";
    return unless my $entry = $self->_readline;

    # per-record accumulators
    my (%unigene, @express, @locuslink, @chromosome,
        @sts, @txmap, @protsim, @sequence);
    my $UGobj;

    # SEQUENCE attributes other than ACC, in the order they are tried;
    # each is stored under its lower-cased name.
    my @seq_attributes = qw(NID PID CLONE END LID MGC SEQTYPE TRACE);

    # Run each line of the entry against the patterns in %line_is.  The
    # patterns are not anchored, so the order of the tests matters: the
    # first pattern that matches anywhere in the line wins.
    foreach my $line (split /\n/, $entry) {
        if ($line =~ /$line_is{ID}/) {
            $unigene{ID} = $1;
        }
        elsif ($line =~ /$line_is{TITLE}/) {
            $unigene{TITLE} = $1;
        }
        elsif ($line =~ /$line_is{GENE}/) {
            $unigene{GENE} = $1;
        }
        elsif ($line =~ /$line_is{CYTOBAND}/) {
            $unigene{CYTOBAND} = $1;
        }
        elsif ($line =~ /$line_is{MGI}/) {
            $unigene{MGI} = $1;
        }
        elsif ($line =~ /$line_is{LOCUSLINK}/) {
            @locuslink = split /;/, $1;
        }
        elsif ($line =~ /$line_is{EXPRESS}/) {
            my $express = $1;
            # remove initial semicolon if present
            $express =~ s/^;//;
            @express = split /\s*;/, $express;
        }
        elsif ($line =~ /$line_is{GNM_TERMINUS}/) {
            $unigene{GNM_TERMINUS} = $1;
        }
        elsif ($line =~ /$line_is{CHROMOSOME}/) {
            push @chromosome, $1;
        }
        elsif ($line =~ /$line_is{TXMAP}/) {
            push @txmap, $1;
        }
        elsif ($line =~ /$line_is{STS}/) {
            push @sts, $1;
        }
        elsif ($line =~ /$line_is{PROTSIM}/) {
            push @protsim, $1;
        }
        elsif ($line =~ /$line_is{SCOUNT}/) {
            $unigene{SCOUNT} = $1;
        }
        elsif ($line =~ /$line_is{SEQUENCE}/) {
            # One SEQUENCE line describes one member: a list of
            # semicolon-separated KEY=value items.
            my $seq = {};
            ITEM:
            foreach my $item (split /;/, $1) {
                if ($item =~ /$line_is{ACC}/) {
                    $seq->{acc} = $1;
                    # the version group is whatever the ACC pattern
                    # captured after the accession (may be empty)
                    $seq->{version} = $2 if defined $2;
                    next ITEM;
                }
                foreach my $key (@seq_attributes) {
                    my $pattern = $line_is{$key};
                    if ($item =~ /$pattern/) {
                        $seq->{ lc $key } = $1;
                        next ITEM;
                    }
                }
            }
            push @sequence, $seq;
        }
        elsif ($line =~ /$line_is{DELIMITER}/) {
            # at the end of the record, add data to the object
            $UGobj = $self->cluster_factory->create_object(
                -display_id  => $unigene{ID},
                -description => $unigene{TITLE},
                -size        => $unigene{SCOUNT},
                -members     => \@sequence);
            $UGobj->gene($unigene{GENE})         if defined $unigene{GENE};
            $UGobj->cytoband($unigene{CYTOBAND}) if defined $unigene{CYTOBAND};
            $UGobj->mgi($unigene{MGI})           if defined $unigene{MGI};
            $UGobj->locuslink(\@locuslink);
            $UGobj->express(\@express);
            $UGobj->gnm_terminus($unigene{GNM_TERMINUS})
                if defined $unigene{GNM_TERMINUS};
            $UGobj->chromosome(\@chromosome);
            $UGobj->sts(\@sts);
            $UGobj->txmap(\@txmap);
            $UGobj->protsim(\@protsim);
        }
    }
    return $UGobj;
}
250 | |
251 1; | |
252 |