Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Utils/CigarString.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <dev@ensembl.org>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =head1 AUTHOR | |
20 | |
21 Juguang Xiao <juguang@tll.org.sg> | |
22 | |
23 =cut | |
24 | |
25 =head1 NAME | |
26 | |
27 Bio::EnsEMBL::Utils::CigarString, a utilites module to generate cigar | |
28 strings | |
29 | |
30 =head1 DESCRIPTION | |
31 | |
32 Sequence alignment hits were previously stored within the core database | |
33 as ungapped alignments. This imposed 2 major constraints on alignments: | |
34 | |
35 a) alignments for a single hit record would require multiple rows in the | |
36 database, and | |
37 b) it was not possible to accurately retrieve the exact original | |
38 alignment. | |
39 | |
40 Therefore, in the new branch sequence alignments are now stored as | |
41 ungapped alignments in the cigar line format (where CIGAR stands for | |
42 Concise Idiosyncratic Gapped Alignment Report). | |
43 | |
44 In the cigar line format alignments are sotred as follows: | |
45 | |
46 M: Match | |
47 D: Deletino | |
48 I: Insertion | |
49 | |
50 An example of an alignment for a hypthetical protein match is shown | |
51 below: | |
52 | |
53 | |
54 Query: 42 PGPAGLP----GSVGLQGPRGLRGPLP-GPLGPPL... | |
55 PG P G GP R PLGP | |
56 Sbjct: 1672 PGTP*TPLVPLGPWVPLGPSSPR--LPSGPLGPTD... | |
57 | |
58 protein_align_feature table as the following cigar line: | |
59 | |
60 7M4D12M2I2MD7M | |
61 | |
62 =cut | |
63 | |
64 package Bio::EnsEMBL::Utils::CigarString; | |
65 | |
66 use strict; | |
67 use vars qw(@ISA); | |
68 use Bio::EnsEMBL::Root; | |
69 | |
70 @ISA = qw(Bio::EnsEMBL::Root); | |
71 | |
72 =head2 split_hsp | |
73 | |
74 Name : split_hsp (this name is derived from the original sub in BlastWorn) | |
75 Usage : my $hsp; # a ready Bio::Search::HSP::GenericHSP object. | |
76 my $factory = new Bio::EnsEMBL::Utils::CigarString; | |
77 my $cigar_string = $factory->split_hsp($hsp); | |
78 | |
79 Function: generate cigar string. | |
80 Argument: a HSP object. | |
81 Returns : a text string. | |
82 | |
83 =cut | |
84 | |
85 sub split_hsp { | |
86 my ($self, $hsp) = @_; | |
87 | |
88 $self->throw("a defined object needed") unless($hsp && defined($hsp)); | |
89 unless(ref($hsp) && $hsp->isa('Bio::Search::HSP::GenericHSP')){ | |
90 $self->throw("a HSP object needed"); | |
91 } | |
92 | |
93 my ($qtype, $htype) = $self->_findTypes($hsp); | |
94 my ($qstrand, $hstrand) = $self->_findStrands($hsp); | |
95 my ($qinc, $hinc) = $self->_findIncrements($qstrand,$hstrand,$qtype,$htype); | |
96 | |
97 my @gaps = (); | |
98 my @qchars = split(//, $hsp->query_string); | |
99 my @hchars = split(//, $hsp->hit_string); | |
100 my $qstart; | |
101 if($qstrand == 1){ | |
102 $qstart = $hsp->query->start; | |
103 }elsif($qstart == -1){ | |
104 $qstart = $hsp->query->end; | |
105 }else{ | |
106 $self->warn("[$qstart], invalid strand value on query"); | |
107 $qstart = $hsp->query->start; | |
108 # Is this a SearchIO's bug??? | |
109 } | |
110 | |
111 my $hstart; | |
112 if($hstrand == 1){ | |
113 $hstart = $hsp->subject->start; | |
114 }elsif($hstrand != -1){ | |
115 $hstart = $hsp->subject->end; | |
116 }else{ | |
117 $self->throw("[$hstart], invalid strand value on subject"); | |
118 } | |
119 | |
120 my $qend = $qstart; | |
121 my $hend = $hstart; | |
122 my $count = 0; | |
123 my $found = 0; | |
124 | |
125 my @align_coordinates = (); | |
126 while($count <= $#qchars){ | |
127 if($qchars[$count] ne '-' && $hchars[$count] ne '-') { | |
128 $qend += $qinc; | |
129 $hend += $hinc; | |
130 $found = 1; | |
131 }else{ # gapped region | |
132 push(@align_coordinates, [$qstart, $hstart]) if($found == 1); | |
133 | |
134 $qstart = $qend; | |
135 $qstart += $qinc if($qchars[$count] ne '-'); | |
136 | |
137 $hstart = $hend; | |
138 $hstart += $hinc if($hchars[$count] ne '-'); | |
139 | |
140 $qend = $qstart; | |
141 $hend = $hstart; | |
142 $found = 0; | |
143 } | |
144 $count++; | |
145 } | |
146 | |
147 if($found){ | |
148 push(@align_coordinates, [$qstart, $hstart]); | |
149 } | |
150 | |
151 my $cigar_string = ""; | |
152 my $last = $#align_coordinates; | |
153 if($last >= 0){ | |
154 for(my $i=0; $i<$last; $i++){ | |
155 my $q_this_start = $align_coordinates[$i]->[0]; | |
156 my $q_next_start = $align_coordinates[$i+1]->[0]; | |
157 my $q_length = ($q_next_start-$q_this_start-1)*$qinc; | |
158 $q_length = abs($q_length); | |
159 my $h_this_start = $align_coordinates[$i]->[1]; | |
160 my $h_next_start = $align_coordinates[$i+1]->[1]; | |
161 my $h_length = ($h_next_start-$h_this_start-1)*$hinc; | |
162 $h_length = abs($h_length); | |
163 | |
164 my $diff = $q_length - $h_length; | |
165 if($diff > 0){ # Insertion | |
166 $cigar_string .= $diff unless($diff == 1); | |
167 $cigar_string .= 'I'; | |
168 }elsif($diff < 0){ # Deletion | |
169 $cigar_string .= -$diff unless($diff == -1); | |
170 $cigar_string .= 'D'; | |
171 }else{ # e.g $diff == 0, Match | |
172 $cigar_string .= $q_length unless($q_length == 1); | |
173 $cigar_string .= 'M'; | |
174 } | |
175 | |
176 } # for | |
177 } # if | |
178 | |
179 return $cigar_string; | |
180 } | |
181 | |
182 | |
183 sub _findStrands { | |
184 my ($self,$hsp) = @_; | |
185 | |
186 my $qstrand = $hsp->query->strand; | |
187 unless($qstrand == 1 || $qstrand == -1){ | |
188 $self->warn("query's strand value is neither 1 or -1"); | |
189 $qstrand = 1; | |
190 } | |
191 | |
192 my $hstrand = $hsp->subject->strand; | |
193 unless($hstrand == 1 || $hstrand == -1){ | |
194 $self->warn("subject's strand value is neither 1 or -1"); | |
195 $hstrand = 1; | |
196 } | |
197 | |
198 return ( $qstrand, $hstrand); | |
199 } | |
200 | |
201 sub _findTypes { | |
202 my ($self,$hsp) = @_; | |
203 | |
204 my $type1; | |
205 my $type2; | |
206 my $len1 = $hsp->query->length(); | |
207 my $len2 = $hsp->subject->length(); | |
208 | |
209 if ($len1/$len2 > 2) { | |
210 $type1 = 'dna'; | |
211 $type2 = 'pep'; | |
212 } elsif ($len2/$len1 > 2) { | |
213 $type1 = 'pep'; | |
214 $type2 = 'dna'; | |
215 } else { | |
216 $type1 = 'dna'; | |
217 $type2 = 'dna'; | |
218 } | |
219 | |
220 return ($type1,$type2); | |
221 } | |
222 | |
223 sub _findIncrements { | |
224 my ($self,$qstrand,$hstrand,$qtype,$htype) = @_; | |
225 | |
226 my $qinc = 1 * $qstrand; | |
227 my $hinc = 1 * $hstrand; | |
228 | |
229 if ($qtype eq 'dna' && $htype eq 'pep') { | |
230 $qinc = 3 * $qinc; | |
231 } | |
232 if ($qtype eq 'pep' && $htype eq 'dna') { | |
233 $hinc = 3 * $hinc; | |
234 } | |
235 | |
236 return ($qinc,$hinc); | |
237 } | |
238 | |
239 # This is a core logic of cigar string. The finite state machine theory is | |
240 # apply. See the below table, x-axis represents the input, with 3 options: | |
241 # (+/+) -- Both current query and subject bases are non-gap. Match | |
242 # (-/+) -- The current query base is gap, but subject not. Deletion | |
243 # (+/-) -- The current subject base is gap, but query not. Insertion | |
244 # While the y-axis means the current state with letter 'M', 'D', 'I' | |
245 # | |
246 # The content of this table is the action taken in response of the input and | |
247 # the current state. | |
248 # R remain the state, counter increment. | |
249 # G;X generate the cigar line based on the current state and counter; | |
250 # clear the counter to zero and change to the state X | |
251 # | |
252 # || +/+ | -/+ | +/- | | |
253 # -------+----------------------+ | |
254 # M || R | G;D | G;I | | |
255 # ------------------------------+ | |
256 # D || G;M | R | G;I | | |
257 # ------------------------------+ | |
258 # I || G;M | G;D | R | | |
259 # | |
260 | |
261 =head2 generate_cigar_string | |
262 | |
263 Name : generate_cigar_string | |
264 Usage: $cigar_string = $self->generate_cigar_string(\@qchars, \@hchars); | |
265 Function: generate the cigar string for a piece of alignment. | |
266 Args: 2 array references. The lengths of 2 arrays are the same | |
267 Return: a cigar string | |
268 | |
269 =cut | |
270 | |
271 # Developer's Note: The method is originally abstracted from the concept of | |
272 # cigar string. It only asks the essential information of 2 sequence characters | |
273 # of the alignment, while the BlastWorn::split_HSP asks more unused information | |
274 # for cigar string, which is useful to form align_coordinates. - Juguang | |
275 | |
276 my ($count, $state); # strictly only used in the following 2 subs | |
277 | |
278 sub generate_cigar_string { | |
279 | |
280 # my ($self, $qstart, $hstart, $qinc, $hinc, $qchars_ref, $hchars_ref) = @_; | |
281 | |
282 my ($self, $qchars_ref, $hchars_ref) = @_; | |
283 | |
284 my @qchars = @{$qchars_ref}; | |
285 my @hchars = @{$hchars_ref}; | |
286 | |
287 unless(scalar(@qchars) == scalar(@hchars)){ | |
288 $self->throw("two sequences are not equal in lengths"); | |
289 } | |
290 | |
291 $count = 0; | |
292 $state = 'M'; # the current state of gaps, (M, D, I) | |
293 | |
294 my $cigar_string = ''; | |
295 for(my $i=0; $i <= $#qchars; $i++){ | |
296 my $qchar = $qchars[$i]; | |
297 my $hchar = $hchars[$i]; | |
298 if($qchar ne '-' && $hchar ne '-'){ # Match | |
299 $cigar_string .= $self->_sub_cigar_string('M'); | |
300 }elsif($qchar eq '-'){ # Deletion | |
301 $cigar_string .= $self->_sub_cigar_string('D'); | |
302 }elsif($hchar eq '-'){ # Insertion | |
303 $cigar_string .= $self->_sub_cigar_string('I'); | |
304 }else{ | |
305 $self->throw("Impossible state that 2 gaps on each seq aligned"); | |
306 } | |
307 } | |
308 $cigar_string .= $self->_sub_cigar_string('X'); # not forget the tail. | |
309 return $cigar_string; | |
310 } | |
311 | |
312 sub _sub_cigar_string { | |
313 my ($self, $new_state) = @_; | |
314 my $sub_cigar_string = ''; | |
315 if($state eq $new_state){ | |
316 $count++; # Remain the state and increase the counter | |
317 }else{ | |
318 $sub_cigar_string .= $count unless $count == 1; | |
319 $sub_cigar_string .= $state; | |
320 $count = 1; | |
321 $state = $new_state; | |
322 } | |
323 return $sub_cigar_string; | |
324 } | |
325 | |
326 =head2 generate_cigar_string_by_hsp | |
327 | |
328 Name : generate_cigar_string_by_hsp | |
329 Usage : my $hsp; # a ready GenericHSP object | |
330 my $cigar_string = $self->generate_cigar_string_by_hsp($hsp); | |
331 Function: generate a cigar string by given HSP object. | |
332 Args : a GenericHSP object | |
333 Returns: a text string of cigar string | |
334 | |
335 =cut | |
336 | |
337 sub generate_cigar_string_by_hsp { | |
338 my ($self, $hsp) = @_; | |
339 | |
340 unless(ref($hsp) && $hsp->isa('Bio::Search::HSP::GenericHSP')){ | |
341 $self->throw("A GenericHSP object needed"); | |
342 } | |
343 | |
344 my @qchars = split(//, $hsp->query_string); | |
345 my @hchars = split(//, $hsp->hit_string); | |
346 | |
347 return $self->generate_cigar_string(\@qchars, \@hchars); | |
348 } | |
349 | |
350 1; |