comparison variant_effect_predictor/Bio/EnsEMBL/Utils/CigarString.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =head1 AUTHOR
20
21 Juguang Xiao <juguang@tll.org.sg>
22
23 =cut
24
25 =head1 NAME
26
27 Bio::EnsEMBL::Utils::CigarString, a utilites module to generate cigar
28 strings
29
30 =head1 DESCRIPTION
31
32 Sequence alignment hits were previously stored within the core database
33 as ungapped alignments. This imposed 2 major constraints on alignments:
34
35 a) alignments for a single hit record would require multiple rows in the
36 database, and
37 b) it was not possible to accurately retrieve the exact original
38 alignment.
39
40 Therefore, in the new branch sequence alignments are now stored as
41 ungapped alignments in the cigar line format (where CIGAR stands for
42 Concise Idiosyncratic Gapped Alignment Report).
43
44 In the cigar line format alignments are sotred as follows:
45
46 M: Match
47 D: Deletino
48 I: Insertion
49
50 An example of an alignment for a hypthetical protein match is shown
51 below:
52
53
54 Query: 42 PGPAGLP----GSVGLQGPRGLRGPLP-GPLGPPL...
55 PG P G GP R PLGP
56 Sbjct: 1672 PGTP*TPLVPLGPWVPLGPSSPR--LPSGPLGPTD...
57
58 protein_align_feature table as the following cigar line:
59
60 7M4D12M2I2MD7M
61
62 =cut
63
64 package Bio::EnsEMBL::Utils::CigarString;
65
66 use strict;
67 use vars qw(@ISA);
68 use Bio::EnsEMBL::Root;
69
70 @ISA = qw(Bio::EnsEMBL::Root);
71
72 =head2 split_hsp
73
74 Name : split_hsp (this name is derived from the original sub in BlastWorn)
75 Usage : my $hsp; # a ready Bio::Search::HSP::GenericHSP object.
76 my $factory = new Bio::EnsEMBL::Utils::CigarString;
77 my $cigar_string = $factory->split_hsp($hsp);
78
79 Function: generate cigar string.
80 Argument: a HSP object.
81 Returns : a text string.
82
83 =cut
84
85 sub split_hsp {
86 my ($self, $hsp) = @_;
87
88 $self->throw("a defined object needed") unless($hsp && defined($hsp));
89 unless(ref($hsp) && $hsp->isa('Bio::Search::HSP::GenericHSP')){
90 $self->throw("a HSP object needed");
91 }
92
93 my ($qtype, $htype) = $self->_findTypes($hsp);
94 my ($qstrand, $hstrand) = $self->_findStrands($hsp);
95 my ($qinc, $hinc) = $self->_findIncrements($qstrand,$hstrand,$qtype,$htype);
96
97 my @gaps = ();
98 my @qchars = split(//, $hsp->query_string);
99 my @hchars = split(//, $hsp->hit_string);
100 my $qstart;
101 if($qstrand == 1){
102 $qstart = $hsp->query->start;
103 }elsif($qstart == -1){
104 $qstart = $hsp->query->end;
105 }else{
106 $self->warn("[$qstart], invalid strand value on query");
107 $qstart = $hsp->query->start;
108 # Is this a SearchIO's bug???
109 }
110
111 my $hstart;
112 if($hstrand == 1){
113 $hstart = $hsp->subject->start;
114 }elsif($hstrand != -1){
115 $hstart = $hsp->subject->end;
116 }else{
117 $self->throw("[$hstart], invalid strand value on subject");
118 }
119
120 my $qend = $qstart;
121 my $hend = $hstart;
122 my $count = 0;
123 my $found = 0;
124
125 my @align_coordinates = ();
126 while($count <= $#qchars){
127 if($qchars[$count] ne '-' && $hchars[$count] ne '-') {
128 $qend += $qinc;
129 $hend += $hinc;
130 $found = 1;
131 }else{ # gapped region
132 push(@align_coordinates, [$qstart, $hstart]) if($found == 1);
133
134 $qstart = $qend;
135 $qstart += $qinc if($qchars[$count] ne '-');
136
137 $hstart = $hend;
138 $hstart += $hinc if($hchars[$count] ne '-');
139
140 $qend = $qstart;
141 $hend = $hstart;
142 $found = 0;
143 }
144 $count++;
145 }
146
147 if($found){
148 push(@align_coordinates, [$qstart, $hstart]);
149 }
150
151 my $cigar_string = "";
152 my $last = $#align_coordinates;
153 if($last >= 0){
154 for(my $i=0; $i<$last; $i++){
155 my $q_this_start = $align_coordinates[$i]->[0];
156 my $q_next_start = $align_coordinates[$i+1]->[0];
157 my $q_length = ($q_next_start-$q_this_start-1)*$qinc;
158 $q_length = abs($q_length);
159 my $h_this_start = $align_coordinates[$i]->[1];
160 my $h_next_start = $align_coordinates[$i+1]->[1];
161 my $h_length = ($h_next_start-$h_this_start-1)*$hinc;
162 $h_length = abs($h_length);
163
164 my $diff = $q_length - $h_length;
165 if($diff > 0){ # Insertion
166 $cigar_string .= $diff unless($diff == 1);
167 $cigar_string .= 'I';
168 }elsif($diff < 0){ # Deletion
169 $cigar_string .= -$diff unless($diff == -1);
170 $cigar_string .= 'D';
171 }else{ # e.g $diff == 0, Match
172 $cigar_string .= $q_length unless($q_length == 1);
173 $cigar_string .= 'M';
174 }
175
176 } # for
177 } # if
178
179 return $cigar_string;
180 }
181
182
183 sub _findStrands {
184 my ($self,$hsp) = @_;
185
186 my $qstrand = $hsp->query->strand;
187 unless($qstrand == 1 || $qstrand == -1){
188 $self->warn("query's strand value is neither 1 or -1");
189 $qstrand = 1;
190 }
191
192 my $hstrand = $hsp->subject->strand;
193 unless($hstrand == 1 || $hstrand == -1){
194 $self->warn("subject's strand value is neither 1 or -1");
195 $hstrand = 1;
196 }
197
198 return ( $qstrand, $hstrand);
199 }
200
201 sub _findTypes {
202 my ($self,$hsp) = @_;
203
204 my $type1;
205 my $type2;
206 my $len1 = $hsp->query->length();
207 my $len2 = $hsp->subject->length();
208
209 if ($len1/$len2 > 2) {
210 $type1 = 'dna';
211 $type2 = 'pep';
212 } elsif ($len2/$len1 > 2) {
213 $type1 = 'pep';
214 $type2 = 'dna';
215 } else {
216 $type1 = 'dna';
217 $type2 = 'dna';
218 }
219
220 return ($type1,$type2);
221 }
222
223 sub _findIncrements {
224 my ($self,$qstrand,$hstrand,$qtype,$htype) = @_;
225
226 my $qinc = 1 * $qstrand;
227 my $hinc = 1 * $hstrand;
228
229 if ($qtype eq 'dna' && $htype eq 'pep') {
230 $qinc = 3 * $qinc;
231 }
232 if ($qtype eq 'pep' && $htype eq 'dna') {
233 $hinc = 3 * $hinc;
234 }
235
236 return ($qinc,$hinc);
237 }
238
239 # This is a core logic of cigar string. The finite state machine theory is
240 # apply. See the below table, x-axis represents the input, with 3 options:
241 # (+/+) -- Both current query and subject bases are non-gap. Match
242 # (-/+) -- The current query base is gap, but subject not. Deletion
243 # (+/-) -- The current subject base is gap, but query not. Insertion
244 # While the y-axis means the current state with letter 'M', 'D', 'I'
245 #
246 # The content of this table is the action taken in response of the input and
247 # the current state.
248 # R remain the state, counter increment.
249 # G;X generate the cigar line based on the current state and counter;
250 # clear the counter to zero and change to the state X
251 #
252 # || +/+ | -/+ | +/- |
253 # -------+----------------------+
254 # M || R | G;D | G;I |
255 # ------------------------------+
256 # D || G;M | R | G;I |
257 # ------------------------------+
258 # I || G;M | G;D | R |
259 #
260
261 =head2 generate_cigar_string
262
263 Name : generate_cigar_string
264 Usage: $cigar_string = $self->generate_cigar_string(\@qchars, \@hchars);
265 Function: generate the cigar string for a piece of alignment.
266 Args: 2 array references. The lengths of 2 arrays are the same
267 Return: a cigar string
268
269 =cut
270
271 # Developer's Note: The method is originally abstracted from the concept of
272 # cigar string. It only asks the essential information of 2 sequence characters
273 # of the alignment, while the BlastWorn::split_HSP asks more unused information
274 # for cigar string, which is useful to form align_coordinates. - Juguang
275
276 my ($count, $state); # strictly only used in the following 2 subs
277
278 sub generate_cigar_string {
279
280 # my ($self, $qstart, $hstart, $qinc, $hinc, $qchars_ref, $hchars_ref) = @_;
281
282 my ($self, $qchars_ref, $hchars_ref) = @_;
283
284 my @qchars = @{$qchars_ref};
285 my @hchars = @{$hchars_ref};
286
287 unless(scalar(@qchars) == scalar(@hchars)){
288 $self->throw("two sequences are not equal in lengths");
289 }
290
291 $count = 0;
292 $state = 'M'; # the current state of gaps, (M, D, I)
293
294 my $cigar_string = '';
295 for(my $i=0; $i <= $#qchars; $i++){
296 my $qchar = $qchars[$i];
297 my $hchar = $hchars[$i];
298 if($qchar ne '-' && $hchar ne '-'){ # Match
299 $cigar_string .= $self->_sub_cigar_string('M');
300 }elsif($qchar eq '-'){ # Deletion
301 $cigar_string .= $self->_sub_cigar_string('D');
302 }elsif($hchar eq '-'){ # Insertion
303 $cigar_string .= $self->_sub_cigar_string('I');
304 }else{
305 $self->throw("Impossible state that 2 gaps on each seq aligned");
306 }
307 }
308 $cigar_string .= $self->_sub_cigar_string('X'); # not forget the tail.
309 return $cigar_string;
310 }
311
312 sub _sub_cigar_string {
313 my ($self, $new_state) = @_;
314 my $sub_cigar_string = '';
315 if($state eq $new_state){
316 $count++; # Remain the state and increase the counter
317 }else{
318 $sub_cigar_string .= $count unless $count == 1;
319 $sub_cigar_string .= $state;
320 $count = 1;
321 $state = $new_state;
322 }
323 return $sub_cigar_string;
324 }
325
326 =head2 generate_cigar_string_by_hsp
327
328 Name : generate_cigar_string_by_hsp
329 Usage : my $hsp; # a ready GenericHSP object
330 my $cigar_string = $self->generate_cigar_string_by_hsp($hsp);
331 Function: generate a cigar string by given HSP object.
332 Args : a GenericHSP object
333 Returns: a text string of cigar string
334
335 =cut
336
337 sub generate_cigar_string_by_hsp {
338 my ($self, $hsp) = @_;
339
340 unless(ref($hsp) && $hsp->isa('Bio::Search::HSP::GenericHSP')){
341 $self->throw("A GenericHSP object needed");
342 }
343
344 my @qchars = split(//, $hsp->query_string);
345 my @hchars = split(//, $hsp->hit_string);
346
347 return $self->generate_cigar_string(\@qchars, \@hchars);
348 }
349
350 1;