0
|
1 # $Id: emboss.pm,v 1.11 2002/10/22 07:45:10 lapp Exp $
|
|
2 #
|
|
3 # BioPerl module for Bio::AlignIO::emboss
|
|
4 #
|
|
5 # Cared for by Jason Stajich <jason@bioperl.org>
|
|
6 #
|
|
7 # Copyright Jason Stajich
|
|
8 #
|
|
9 # You may distribute this module under the same terms as perl itself
|
|
10
|
|
11 # POD documentation - main docs before the code
|
|
12
|
|
13 =head1 NAME
|
|
14
|
|
15 Bio::AlignIO::emboss - Parse EMBOSS alignment output (from applications water and needle)
|
|
16
|
|
17 =head1 SYNOPSIS
|
|
18
|
|
19 # do not use the object directly
|
|
20 use Bio::AlignIO;
|
|
21 # read in an alignment from the EMBOSS program water
|
|
22 my $in = new Bio::AlignIO(-format => 'emboss',
|
|
23 -file => 'seq.water');
|
|
24 while( my $aln = $in->next_aln ) {
|
|
25 # do something with the alignment
|
|
26 }
|
|
27
|
|
28 =head1 DESCRIPTION
|
|
29
|
|
30 This object handles parsing and writing pairwise sequence alignments
|
|
31 from the EMBOSS suite.
|
|
32
|
|
33 =head1 FEEDBACK
|
|
34
|
|
35 =head2 Mailing Lists
|
|
36
|
|
37 User feedback is an integral part of the evolution of this and other
|
|
38 Bioperl modules. Send your comments and suggestions preferably to
|
|
39 the Bioperl mailing list. Your participation is much appreciated.
|
|
40
|
|
41 bioperl-l@bioperl.org - General discussion
|
|
42 http://bioperl.org/MailList.shtml - About the mailing lists
|
|
43
|
|
44 =head2 Reporting Bugs
|
|
45
|
|
46 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
47 of the bugs and their resolution. Bug reports can be submitted via
|
|
48 email or the web:
|
|
49
|
|
50 bioperl-bugs@bioperl.org
|
|
51 http://bugzilla.bioperl.org/
|
|
52
|
|
53 =head1 AUTHOR - Jason Stajich
|
|
54
|
|
55 Email jason@bioperl.org
|
|
56
|
|
57 Describe contact details here
|
|
58
|
|
59 =head1 CONTRIBUTORS
|
|
60
|
|
61 Additional contributors names and emails here
|
|
62
|
|
63 =head1 APPENDIX
|
|
64
|
|
65 The rest of the documentation details each of the object methods.
|
|
66 Internal methods are usually preceded with a _
|
|
67
|
|
68 =cut
|
|
69
|
|
70
|
|
71 # Let the code begin...
|
|
72
|
|
73
|
|
74 package Bio::AlignIO::emboss;
|
|
75 use vars qw(@ISA $EMBOSSTitleLen $EMBOSSLineLen);
|
|
76 use strict;
|
|
77
|
|
78 use Bio::AlignIO;
|
|
79 use Bio::LocatableSeq;
|
|
80
|
|
81 @ISA = qw(Bio::AlignIO );
|
|
82
|
|
83 BEGIN {
|
|
84 $EMBOSSTitleLen = 13;
|
|
85 $EMBOSSLineLen = 50;
|
|
86 }
|
|
87
|
|
88 sub _initialize {
|
|
89 my($self,@args) = @_;
|
|
90 $self->SUPER::_initialize(@args);
|
|
91 $self->{'_type'} = undef;
|
|
92 }
|
|
93
|
|
94 =head2 next_aln
|
|
95
|
|
96 Title : next_aln
|
|
97 Usage : $aln = $stream->next_aln()
|
|
98 Function: returns the next alignment in the stream.
|
|
99 Returns : L<Bio::Align::AlignI> object - returns 0 on end of file
|
|
100 or on error
|
|
101 Args : NONE
|
|
102
|
|
103 =cut
|
|
104
|
|
105 sub next_aln {
|
|
106 my ($self) = @_;
|
|
107 my $seenbegin = 0;
|
|
108 my %data = ( 'seq1' => {
|
|
109 'start'=> undef,
|
|
110 'end'=> undef,
|
|
111 'name' => '',
|
|
112 'data' => '' },
|
|
113 'seq2' => {
|
|
114 'start'=> undef,
|
|
115 'end'=> undef,
|
|
116 'name' => '',
|
|
117 'data' => '' },
|
|
118 'align' => '',
|
|
119 'type' => $self->{'_type'}, # to restore type from
|
|
120 # previous aln if possible
|
|
121 );
|
|
122 my %names;
|
|
123 while( defined($_ = $self->_readline) ) {
|
|
124 next if( /^\#?\s+$/ || /^\#+\s*$/ );
|
|
125 if( /^\#(\=|\-)+\s*$/) {
|
|
126 last if( $seenbegin);
|
|
127 } elsif( /(Local|Global):\s*(\S+)\s+vs\s+(\S+)/ ||
|
|
128 /^\#\s+Program:\s+(\S+)/ )
|
|
129 {
|
|
130 my ($name1,$name2) = ($2,$3);
|
|
131 if( ! defined $name1 ) { # Handle EMBOSS 2.2.X
|
|
132 $data{'type'} = $1;
|
|
133 $name1 = $name2 = '';
|
|
134 } else {
|
|
135 $data{'type'} = $1 eq 'Local' ? 'water' : 'needle';
|
|
136 }
|
|
137 $data{'seq1'}->{'name'} = $name1;
|
|
138 $data{'seq2'}->{'name'} = $name2;
|
|
139
|
|
140 $self->{'_type'} = $data{'type'};
|
|
141
|
|
142 } elsif( /Score:\s+(\S+)/ ) {
|
|
143 $data{'score'} = $1;
|
|
144 } elsif( /^\#\s+(1|2):\s+(\S+)/ && ! $data{"seq$1"}->{'name'} ) {
|
|
145 my $nm = $2;
|
|
146 $nm = substr($nm,0,$EMBOSSTitleLen); # emboss has a max seq length
|
|
147 if( $names{$nm} ) {
|
|
148 $nm .= "-". $names{$nm};
|
|
149 }
|
|
150 $names{$nm}++;
|
|
151 $data{"seq$1"}->{'name'} = $nm;
|
|
152 } elsif( $data{'seq1'}->{'name'} &&
|
|
153 /^$data{'seq1'}->{'name'}/ ) {
|
|
154 my $count = 0;
|
|
155 $seenbegin = 1;
|
|
156 my @current;
|
|
157 while( defined ($_) ) {
|
|
158 my $align_other = '';
|
|
159 my $delayed;
|
|
160 if($count == 0 || $count == 2 ) {
|
|
161 my @l = split;
|
|
162 my ($seq,$align,$start,$end);
|
|
163 if( $count == 2 && $data{'seq2'}->{'name'} eq '' ) {
|
|
164 # weird boundary condition
|
|
165 ($start,$align,$end) = @l;
|
|
166 } elsif( @l == 3 ) {
|
|
167 $align = '';
|
|
168 ($seq,$start,$end) = @l
|
|
169 } else {
|
|
170 ($seq,$start,$align,$end) = @l;
|
|
171 }
|
|
172
|
|
173 my $seqname = sprintf("seq%d", ($count == 0) ? '1' : '2');
|
|
174 $data{$seqname}->{'data'} .= $align;
|
|
175 $data{$seqname}->{'start'} ||= $start;
|
|
176 $data{$seqname}->{'end'} = $end;
|
|
177 $current[$count] = [ $start,$align || ''];
|
|
178 } else {
|
|
179 s/^\s+//;
|
|
180 s/\s+$//;
|
|
181 $data{'align'} .= $_;
|
|
182 }
|
|
183
|
|
184 BOTTOM:
|
|
185 last if( $count++ == 2);
|
|
186 $_ = $self->_readline();
|
|
187 }
|
|
188
|
|
189 if( $data{'type'} eq 'needle' ) {
|
|
190 # which ever one is shorter we want to bring it up to
|
|
191 # length. Man this stinks.
|
|
192 my ($s1,$s2) = ($data{'seq1'}, $data{'seq2'});
|
|
193
|
|
194 my $d = length($current[0]->[1]) - length($current[2]->[1]);
|
|
195 if( $d < 0 ) { # s1 is smaller, need to add some
|
|
196 # compare the starting points for this alignment line
|
|
197 if( $current[0]->[0] <= 1 && $current[2]->[0] > 1) {
|
|
198 $s1->{'data'} = ('-' x abs($d)) . $s1->{'data'};
|
|
199 $data{'align'} = (' 'x abs($d)).$data{'align'};
|
|
200 } else {
|
|
201 $s1->{'data'} .= '-' x abs($d);
|
|
202 $data{'align'} .= ' 'x abs($d);
|
|
203 }
|
|
204 } elsif( $d > 0) { # s2 is smaller, need to add some
|
|
205 if( $current[2]->[0] <= 1 && $current[0]->[0] > 1) {
|
|
206 $s2->{'data'} = ('-' x abs($d)) . $s2->{'data'};
|
|
207 $data{'align'} = (' 'x abs($d)).$data{'align'};
|
|
208 } else {
|
|
209 $s2->{'data'} .= '-' x abs($d);
|
|
210 $data{'align'} .= ' 'x abs($d);
|
|
211 }
|
|
212 }
|
|
213 }
|
|
214
|
|
215 }
|
|
216 }
|
|
217 return undef unless $seenbegin;
|
|
218 my $aln = Bio::SimpleAlign->new(-verbose => $self->verbose(),
|
|
219 -source => "EMBOSS-".$data{'type'});
|
|
220
|
|
221 foreach my $seqname ( qw(seq1 seq2) ) {
|
|
222 return undef unless ( defined $data{$seqname} );
|
|
223 $data{$seqname}->{'name'} ||= $seqname;
|
|
224 my $seq = new Bio::LocatableSeq('-seq' => $data{$seqname}->{'data'},
|
|
225 '-id' => $data{$seqname}->{'name'},
|
|
226 '-start'=> $data{$seqname}->{'start'},
|
|
227 '-end' => $data{$seqname}->{'end'},
|
|
228 );
|
|
229 $aln->add_seq($seq);
|
|
230 }
|
|
231 return $aln;
|
|
232 }
|
|
233
|
|
234 =head2 write_aln
|
|
235
|
|
236 Title : write_aln
|
|
237 Usage : $stream->write_aln(@aln)
|
|
238 Function: writes the $aln object into the stream in emboss format
|
|
239 Returns : 1 for success and 0 for error
|
|
240 Args : L<Bio::Align::AlignI> object
|
|
241
|
|
242
|
|
243 =cut
|
|
244
|
|
245 sub write_aln {
|
|
246 my ($self,@aln) = @_;
|
|
247
|
|
248 $self->throw("Sorry: writing emboss output is not currently available! \n");
|
|
249 }
|
|
250
|
|
251 1;
|