0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =head1 AUTHOR
|
|
20
|
|
21 Juguang Xiao <juguang@tll.org.sg>
|
|
22
|
|
23 =cut
|
|
24
|
|
25 =head1 NAME
|
|
26
|
|
27 Bio::EnsEMBL::Utils::CigarString, a utilites module to generate cigar
|
|
28 strings
|
|
29
|
|
30 =head1 DESCRIPTION
|
|
31
|
|
32 Sequence alignment hits were previously stored within the core database
|
|
33 as ungapped alignments. This imposed 2 major constraints on alignments:
|
|
34
|
|
35 a) alignments for a single hit record would require multiple rows in the
|
|
36 database, and
|
|
37 b) it was not possible to accurately retrieve the exact original
|
|
38 alignment.
|
|
39
|
|
40 Therefore, in the new branch sequence alignments are now stored as
|
|
41 ungapped alignments in the cigar line format (where CIGAR stands for
|
|
42 Concise Idiosyncratic Gapped Alignment Report).
|
|
43
|
|
44 In the cigar line format alignments are sotred as follows:
|
|
45
|
|
46 M: Match
|
|
47 D: Deletino
|
|
48 I: Insertion
|
|
49
|
|
50 An example of an alignment for a hypthetical protein match is shown
|
|
51 below:
|
|
52
|
|
53
|
|
54 Query: 42 PGPAGLP----GSVGLQGPRGLRGPLP-GPLGPPL...
|
|
55 PG P G GP R PLGP
|
|
56 Sbjct: 1672 PGTP*TPLVPLGPWVPLGPSSPR--LPSGPLGPTD...
|
|
57
|
|
58 protein_align_feature table as the following cigar line:
|
|
59
|
|
60 7M4D12M2I2MD7M
|
|
61
|
|
62 =cut
|
|
63
|
|
64 package Bio::EnsEMBL::Utils::CigarString;
|
|
65
|
|
66 use strict;
|
|
67 use vars qw(@ISA);
|
|
68 use Bio::EnsEMBL::Root;
|
|
69
|
|
70 @ISA = qw(Bio::EnsEMBL::Root);
|
|
71
|
|
72 =head2 split_hsp
|
|
73
|
|
74 Name : split_hsp (this name is derived from the original sub in BlastWorn)
|
|
75 Usage : my $hsp; # a ready Bio::Search::HSP::GenericHSP object.
|
|
76 my $factory = new Bio::EnsEMBL::Utils::CigarString;
|
|
77 my $cigar_string = $factory->split_hsp($hsp);
|
|
78
|
|
79 Function: generate cigar string.
|
|
80 Argument: a HSP object.
|
|
81 Returns : a text string.
|
|
82
|
|
83 =cut
|
|
84
|
|
85 sub split_hsp {
|
|
86 my ($self, $hsp) = @_;
|
|
87
|
|
88 $self->throw("a defined object needed") unless($hsp && defined($hsp));
|
|
89 unless(ref($hsp) && $hsp->isa('Bio::Search::HSP::GenericHSP')){
|
|
90 $self->throw("a HSP object needed");
|
|
91 }
|
|
92
|
|
93 my ($qtype, $htype) = $self->_findTypes($hsp);
|
|
94 my ($qstrand, $hstrand) = $self->_findStrands($hsp);
|
|
95 my ($qinc, $hinc) = $self->_findIncrements($qstrand,$hstrand,$qtype,$htype);
|
|
96
|
|
97 my @gaps = ();
|
|
98 my @qchars = split(//, $hsp->query_string);
|
|
99 my @hchars = split(//, $hsp->hit_string);
|
|
100 my $qstart;
|
|
101 if($qstrand == 1){
|
|
102 $qstart = $hsp->query->start;
|
|
103 }elsif($qstart == -1){
|
|
104 $qstart = $hsp->query->end;
|
|
105 }else{
|
|
106 $self->warn("[$qstart], invalid strand value on query");
|
|
107 $qstart = $hsp->query->start;
|
|
108 # Is this a SearchIO's bug???
|
|
109 }
|
|
110
|
|
111 my $hstart;
|
|
112 if($hstrand == 1){
|
|
113 $hstart = $hsp->subject->start;
|
|
114 }elsif($hstrand != -1){
|
|
115 $hstart = $hsp->subject->end;
|
|
116 }else{
|
|
117 $self->throw("[$hstart], invalid strand value on subject");
|
|
118 }
|
|
119
|
|
120 my $qend = $qstart;
|
|
121 my $hend = $hstart;
|
|
122 my $count = 0;
|
|
123 my $found = 0;
|
|
124
|
|
125 my @align_coordinates = ();
|
|
126 while($count <= $#qchars){
|
|
127 if($qchars[$count] ne '-' && $hchars[$count] ne '-') {
|
|
128 $qend += $qinc;
|
|
129 $hend += $hinc;
|
|
130 $found = 1;
|
|
131 }else{ # gapped region
|
|
132 push(@align_coordinates, [$qstart, $hstart]) if($found == 1);
|
|
133
|
|
134 $qstart = $qend;
|
|
135 $qstart += $qinc if($qchars[$count] ne '-');
|
|
136
|
|
137 $hstart = $hend;
|
|
138 $hstart += $hinc if($hchars[$count] ne '-');
|
|
139
|
|
140 $qend = $qstart;
|
|
141 $hend = $hstart;
|
|
142 $found = 0;
|
|
143 }
|
|
144 $count++;
|
|
145 }
|
|
146
|
|
147 if($found){
|
|
148 push(@align_coordinates, [$qstart, $hstart]);
|
|
149 }
|
|
150
|
|
151 my $cigar_string = "";
|
|
152 my $last = $#align_coordinates;
|
|
153 if($last >= 0){
|
|
154 for(my $i=0; $i<$last; $i++){
|
|
155 my $q_this_start = $align_coordinates[$i]->[0];
|
|
156 my $q_next_start = $align_coordinates[$i+1]->[0];
|
|
157 my $q_length = ($q_next_start-$q_this_start-1)*$qinc;
|
|
158 $q_length = abs($q_length);
|
|
159 my $h_this_start = $align_coordinates[$i]->[1];
|
|
160 my $h_next_start = $align_coordinates[$i+1]->[1];
|
|
161 my $h_length = ($h_next_start-$h_this_start-1)*$hinc;
|
|
162 $h_length = abs($h_length);
|
|
163
|
|
164 my $diff = $q_length - $h_length;
|
|
165 if($diff > 0){ # Insertion
|
|
166 $cigar_string .= $diff unless($diff == 1);
|
|
167 $cigar_string .= 'I';
|
|
168 }elsif($diff < 0){ # Deletion
|
|
169 $cigar_string .= -$diff unless($diff == -1);
|
|
170 $cigar_string .= 'D';
|
|
171 }else{ # e.g $diff == 0, Match
|
|
172 $cigar_string .= $q_length unless($q_length == 1);
|
|
173 $cigar_string .= 'M';
|
|
174 }
|
|
175
|
|
176 } # for
|
|
177 } # if
|
|
178
|
|
179 return $cigar_string;
|
|
180 }
|
|
181
|
|
182
|
|
183 sub _findStrands {
|
|
184 my ($self,$hsp) = @_;
|
|
185
|
|
186 my $qstrand = $hsp->query->strand;
|
|
187 unless($qstrand == 1 || $qstrand == -1){
|
|
188 $self->warn("query's strand value is neither 1 or -1");
|
|
189 $qstrand = 1;
|
|
190 }
|
|
191
|
|
192 my $hstrand = $hsp->subject->strand;
|
|
193 unless($hstrand == 1 || $hstrand == -1){
|
|
194 $self->warn("subject's strand value is neither 1 or -1");
|
|
195 $hstrand = 1;
|
|
196 }
|
|
197
|
|
198 return ( $qstrand, $hstrand);
|
|
199 }
|
|
200
|
|
201 sub _findTypes {
|
|
202 my ($self,$hsp) = @_;
|
|
203
|
|
204 my $type1;
|
|
205 my $type2;
|
|
206 my $len1 = $hsp->query->length();
|
|
207 my $len2 = $hsp->subject->length();
|
|
208
|
|
209 if ($len1/$len2 > 2) {
|
|
210 $type1 = 'dna';
|
|
211 $type2 = 'pep';
|
|
212 } elsif ($len2/$len1 > 2) {
|
|
213 $type1 = 'pep';
|
|
214 $type2 = 'dna';
|
|
215 } else {
|
|
216 $type1 = 'dna';
|
|
217 $type2 = 'dna';
|
|
218 }
|
|
219
|
|
220 return ($type1,$type2);
|
|
221 }
|
|
222
|
|
223 sub _findIncrements {
|
|
224 my ($self,$qstrand,$hstrand,$qtype,$htype) = @_;
|
|
225
|
|
226 my $qinc = 1 * $qstrand;
|
|
227 my $hinc = 1 * $hstrand;
|
|
228
|
|
229 if ($qtype eq 'dna' && $htype eq 'pep') {
|
|
230 $qinc = 3 * $qinc;
|
|
231 }
|
|
232 if ($qtype eq 'pep' && $htype eq 'dna') {
|
|
233 $hinc = 3 * $hinc;
|
|
234 }
|
|
235
|
|
236 return ($qinc,$hinc);
|
|
237 }
|
|
238
|
|
239 # This is a core logic of cigar string. The finite state machine theory is
|
|
240 # apply. See the below table, x-axis represents the input, with 3 options:
|
|
241 # (+/+) -- Both current query and subject bases are non-gap. Match
|
|
242 # (-/+) -- The current query base is gap, but subject not. Deletion
|
|
243 # (+/-) -- The current subject base is gap, but query not. Insertion
|
|
244 # While the y-axis means the current state with letter 'M', 'D', 'I'
|
|
245 #
|
|
246 # The content of this table is the action taken in response of the input and
|
|
247 # the current state.
|
|
248 # R remain the state, counter increment.
|
|
249 # G;X generate the cigar line based on the current state and counter;
|
|
250 # clear the counter to zero and change to the state X
|
|
251 #
|
|
252 # || +/+ | -/+ | +/- |
|
|
253 # -------+----------------------+
|
|
254 # M || R | G;D | G;I |
|
|
255 # ------------------------------+
|
|
256 # D || G;M | R | G;I |
|
|
257 # ------------------------------+
|
|
258 # I || G;M | G;D | R |
|
|
259 #
|
|
260
|
|
261 =head2 generate_cigar_string
|
|
262
|
|
263 Name : generate_cigar_string
|
|
264 Usage: $cigar_string = $self->generate_cigar_string(\@qchars, \@hchars);
|
|
265 Function: generate the cigar string for a piece of alignment.
|
|
266 Args: 2 array references. The lengths of 2 arrays are the same
|
|
267 Return: a cigar string
|
|
268
|
|
269 =cut
|
|
270
|
|
271 # Developer's Note: The method is originally abstracted from the concept of
|
|
272 # cigar string. It only asks the essential information of 2 sequence characters
|
|
273 # of the alignment, while the BlastWorn::split_HSP asks more unused information
|
|
274 # for cigar string, which is useful to form align_coordinates. - Juguang
|
|
275
|
|
276 my ($count, $state); # strictly only used in the following 2 subs
|
|
277
|
|
278 sub generate_cigar_string {
|
|
279
|
|
280 # my ($self, $qstart, $hstart, $qinc, $hinc, $qchars_ref, $hchars_ref) = @_;
|
|
281
|
|
282 my ($self, $qchars_ref, $hchars_ref) = @_;
|
|
283
|
|
284 my @qchars = @{$qchars_ref};
|
|
285 my @hchars = @{$hchars_ref};
|
|
286
|
|
287 unless(scalar(@qchars) == scalar(@hchars)){
|
|
288 $self->throw("two sequences are not equal in lengths");
|
|
289 }
|
|
290
|
|
291 $count = 0;
|
|
292 $state = 'M'; # the current state of gaps, (M, D, I)
|
|
293
|
|
294 my $cigar_string = '';
|
|
295 for(my $i=0; $i <= $#qchars; $i++){
|
|
296 my $qchar = $qchars[$i];
|
|
297 my $hchar = $hchars[$i];
|
|
298 if($qchar ne '-' && $hchar ne '-'){ # Match
|
|
299 $cigar_string .= $self->_sub_cigar_string('M');
|
|
300 }elsif($qchar eq '-'){ # Deletion
|
|
301 $cigar_string .= $self->_sub_cigar_string('D');
|
|
302 }elsif($hchar eq '-'){ # Insertion
|
|
303 $cigar_string .= $self->_sub_cigar_string('I');
|
|
304 }else{
|
|
305 $self->throw("Impossible state that 2 gaps on each seq aligned");
|
|
306 }
|
|
307 }
|
|
308 $cigar_string .= $self->_sub_cigar_string('X'); # not forget the tail.
|
|
309 return $cigar_string;
|
|
310 }
|
|
311
|
|
312 sub _sub_cigar_string {
|
|
313 my ($self, $new_state) = @_;
|
|
314 my $sub_cigar_string = '';
|
|
315 if($state eq $new_state){
|
|
316 $count++; # Remain the state and increase the counter
|
|
317 }else{
|
|
318 $sub_cigar_string .= $count unless $count == 1;
|
|
319 $sub_cigar_string .= $state;
|
|
320 $count = 1;
|
|
321 $state = $new_state;
|
|
322 }
|
|
323 return $sub_cigar_string;
|
|
324 }
|
|
325
|
|
326 =head2 generate_cigar_string_by_hsp
|
|
327
|
|
328 Name : generate_cigar_string_by_hsp
|
|
329 Usage : my $hsp; # a ready GenericHSP object
|
|
330 my $cigar_string = $self->generate_cigar_string_by_hsp($hsp);
|
|
331 Function: generate a cigar string by given HSP object.
|
|
332 Args : a GenericHSP object
|
|
333 Returns: a text string of cigar string
|
|
334
|
|
335 =cut
|
|
336
|
|
337 sub generate_cigar_string_by_hsp {
|
|
338 my ($self, $hsp) = @_;
|
|
339
|
|
340 unless(ref($hsp) && $hsp->isa('Bio::Search::HSP::GenericHSP')){
|
|
341 $self->throw("A GenericHSP object needed");
|
|
342 }
|
|
343
|
|
344 my @qchars = split(//, $hsp->query_string);
|
|
345 my @hchars = split(//, $hsp->hit_string);
|
|
346
|
|
347 return $self->generate_cigar_string(\@qchars, \@hchars);
|
|
348 }
|
|
349
|
|
350 1;
|