Mercurial > repos > mahtabm > ensemb_rep_gvl
comparison variant_effect_predictor/Bio/AlignIO/emboss.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:2bc9b66ada89 |
---|---|
1 # $Id: emboss.pm,v 1.11 2002/10/22 07:45:10 lapp Exp $ | |
2 # | |
3 # BioPerl module for Bio::AlignIO::emboss | |
4 # | |
5 # Cared for by Jason Stajich <jason@bioperl.org> | |
6 # | |
7 # Copyright Jason Stajich | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 | |
11 # POD documentation - main docs before the code | |
12 | |
13 =head1 NAME | |
14 | |
15 Bio::AlignIO::emboss - Parse EMBOSS alignment output (from applications water and needle) | |
16 | |
17 =head1 SYNOPSIS | |
18 | |
19 # do not use the object directly | |
20 use Bio::AlignIO; | |
21 # read in an alignment from the EMBOSS program water | |
22 my $in = new Bio::AlignIO(-format => 'emboss', | |
23 -file => 'seq.water'); | |
24 while( my $aln = $in->next_aln ) { | |
25 # do something with the alignment | |
26 } | |
27 | |
28 =head1 DESCRIPTION | |
29 | |
30 This object handles parsing and writing pairwise sequence alignments | |
31 from the EMBOSS suite. | |
32 | |
33 =head1 FEEDBACK | |
34 | |
35 =head2 Mailing Lists | |
36 | |
37 User feedback is an integral part of the evolution of this and other | |
38 Bioperl modules. Send your comments and suggestions preferably to | |
39 the Bioperl mailing list. Your participation is much appreciated. | |
40 | |
41 bioperl-l@bioperl.org - General discussion | |
42 http://bioperl.org/MailList.shtml - About the mailing lists | |
43 | |
44 =head2 Reporting Bugs | |
45 | |
46 Report bugs to the Bioperl bug tracking system to help us keep track | |
47 of the bugs and their resolution. Bug reports can be submitted via | |
48 email or the web: | |
49 | |
50 bioperl-bugs@bioperl.org | |
51 http://bugzilla.bioperl.org/ | |
52 | |
53 =head1 AUTHOR - Jason Stajich | |
54 | |
55 Email jason@bioperl.org | |
56 | |
57 Describe contact details here | |
58 | |
59 =head1 CONTRIBUTORS | |
60 | |
61 Additional contributors names and emails here | |
62 | |
63 =head1 APPENDIX | |
64 | |
65 The rest of the documentation details each of the object methods. | |
66 Internal methods are usually preceded with a _ | |
67 | |
68 =cut | |
69 | |
70 | |
71 # Let the code begin... | |
72 | |
73 | |
74 package Bio::AlignIO::emboss; | |
75 use vars qw(@ISA $EMBOSSTitleLen $EMBOSSLineLen); | |
76 use strict; | |
77 | |
78 use Bio::AlignIO; | |
79 use Bio::LocatableSeq; | |
80 | |
81 @ISA = qw(Bio::AlignIO ); | |
82 | |
83 BEGIN { | |
84 $EMBOSSTitleLen = 13; | |
85 $EMBOSSLineLen = 50; | |
86 } | |
87 | |
88 sub _initialize { | |
89 my($self,@args) = @_; | |
90 $self->SUPER::_initialize(@args); | |
91 $self->{'_type'} = undef; | |
92 } | |
93 | |
94 =head2 next_aln | |
95 | |
96 Title : next_aln | |
97 Usage : $aln = $stream->next_aln() | |
98 Function: returns the next alignment in the stream. | |
99 Returns : L<Bio::Align::AlignI> object - returns 0 on end of file | |
100 or on error | |
101 Args : NONE | |
102 | |
103 =cut | |
104 | |
105 sub next_aln { | |
106 my ($self) = @_; | |
107 my $seenbegin = 0; | |
108 my %data = ( 'seq1' => { | |
109 'start'=> undef, | |
110 'end'=> undef, | |
111 'name' => '', | |
112 'data' => '' }, | |
113 'seq2' => { | |
114 'start'=> undef, | |
115 'end'=> undef, | |
116 'name' => '', | |
117 'data' => '' }, | |
118 'align' => '', | |
119 'type' => $self->{'_type'}, # to restore type from | |
120 # previous aln if possible | |
121 ); | |
122 my %names; | |
123 while( defined($_ = $self->_readline) ) { | |
124 next if( /^\#?\s+$/ || /^\#+\s*$/ ); | |
125 if( /^\#(\=|\-)+\s*$/) { | |
126 last if( $seenbegin); | |
127 } elsif( /(Local|Global):\s*(\S+)\s+vs\s+(\S+)/ || | |
128 /^\#\s+Program:\s+(\S+)/ ) | |
129 { | |
130 my ($name1,$name2) = ($2,$3); | |
131 if( ! defined $name1 ) { # Handle EMBOSS 2.2.X | |
132 $data{'type'} = $1; | |
133 $name1 = $name2 = ''; | |
134 } else { | |
135 $data{'type'} = $1 eq 'Local' ? 'water' : 'needle'; | |
136 } | |
137 $data{'seq1'}->{'name'} = $name1; | |
138 $data{'seq2'}->{'name'} = $name2; | |
139 | |
140 $self->{'_type'} = $data{'type'}; | |
141 | |
142 } elsif( /Score:\s+(\S+)/ ) { | |
143 $data{'score'} = $1; | |
144 } elsif( /^\#\s+(1|2):\s+(\S+)/ && ! $data{"seq$1"}->{'name'} ) { | |
145 my $nm = $2; | |
146 $nm = substr($nm,0,$EMBOSSTitleLen); # emboss has a max seq length | |
147 if( $names{$nm} ) { | |
148 $nm .= "-". $names{$nm}; | |
149 } | |
150 $names{$nm}++; | |
151 $data{"seq$1"}->{'name'} = $nm; | |
152 } elsif( $data{'seq1'}->{'name'} && | |
153 /^$data{'seq1'}->{'name'}/ ) { | |
154 my $count = 0; | |
155 $seenbegin = 1; | |
156 my @current; | |
157 while( defined ($_) ) { | |
158 my $align_other = ''; | |
159 my $delayed; | |
160 if($count == 0 || $count == 2 ) { | |
161 my @l = split; | |
162 my ($seq,$align,$start,$end); | |
163 if( $count == 2 && $data{'seq2'}->{'name'} eq '' ) { | |
164 # weird boundary condition | |
165 ($start,$align,$end) = @l; | |
166 } elsif( @l == 3 ) { | |
167 $align = ''; | |
168 ($seq,$start,$end) = @l | |
169 } else { | |
170 ($seq,$start,$align,$end) = @l; | |
171 } | |
172 | |
173 my $seqname = sprintf("seq%d", ($count == 0) ? '1' : '2'); | |
174 $data{$seqname}->{'data'} .= $align; | |
175 $data{$seqname}->{'start'} ||= $start; | |
176 $data{$seqname}->{'end'} = $end; | |
177 $current[$count] = [ $start,$align || '']; | |
178 } else { | |
179 s/^\s+//; | |
180 s/\s+$//; | |
181 $data{'align'} .= $_; | |
182 } | |
183 | |
184 BOTTOM: | |
185 last if( $count++ == 2); | |
186 $_ = $self->_readline(); | |
187 } | |
188 | |
189 if( $data{'type'} eq 'needle' ) { | |
190 # which ever one is shorter we want to bring it up to | |
191 # length. Man this stinks. | |
192 my ($s1,$s2) = ($data{'seq1'}, $data{'seq2'}); | |
193 | |
194 my $d = length($current[0]->[1]) - length($current[2]->[1]); | |
195 if( $d < 0 ) { # s1 is smaller, need to add some | |
196 # compare the starting points for this alignment line | |
197 if( $current[0]->[0] <= 1 && $current[2]->[0] > 1) { | |
198 $s1->{'data'} = ('-' x abs($d)) . $s1->{'data'}; | |
199 $data{'align'} = (' 'x abs($d)).$data{'align'}; | |
200 } else { | |
201 $s1->{'data'} .= '-' x abs($d); | |
202 $data{'align'} .= ' 'x abs($d); | |
203 } | |
204 } elsif( $d > 0) { # s2 is smaller, need to add some | |
205 if( $current[2]->[0] <= 1 && $current[0]->[0] > 1) { | |
206 $s2->{'data'} = ('-' x abs($d)) . $s2->{'data'}; | |
207 $data{'align'} = (' 'x abs($d)).$data{'align'}; | |
208 } else { | |
209 $s2->{'data'} .= '-' x abs($d); | |
210 $data{'align'} .= ' 'x abs($d); | |
211 } | |
212 } | |
213 } | |
214 | |
215 } | |
216 } | |
217 return undef unless $seenbegin; | |
218 my $aln = Bio::SimpleAlign->new(-verbose => $self->verbose(), | |
219 -source => "EMBOSS-".$data{'type'}); | |
220 | |
221 foreach my $seqname ( qw(seq1 seq2) ) { | |
222 return undef unless ( defined $data{$seqname} ); | |
223 $data{$seqname}->{'name'} ||= $seqname; | |
224 my $seq = new Bio::LocatableSeq('-seq' => $data{$seqname}->{'data'}, | |
225 '-id' => $data{$seqname}->{'name'}, | |
226 '-start'=> $data{$seqname}->{'start'}, | |
227 '-end' => $data{$seqname}->{'end'}, | |
228 ); | |
229 $aln->add_seq($seq); | |
230 } | |
231 return $aln; | |
232 } | |
233 | |
234 =head2 write_aln | |
235 | |
236 Title : write_aln | |
237 Usage : $stream->write_aln(@aln) | |
238 Function: writes the $aln object into the stream in emboss format | |
239 Returns : 1 for success and 0 for error | |
240 Args : L<Bio::Align::AlignI> object | |
241 | |
242 | |
243 =cut | |
244 | |
245 sub write_aln { | |
246 my ($self,@aln) = @_; | |
247 | |
248 $self->throw("Sorry: writing emboss output is not currently available! \n"); | |
249 } | |
250 | |
251 1; |