annotate variant_effect_predictor/Bio/EnsEMBL/Compara/AlignedMemberSet.pm @ 1:d6778b5d8382 draft default tip

Deleted selected files
author willmclaren
date Fri, 03 Aug 2012 10:05:43 -0400
parents 21066c0abaf5
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1 =head1 LICENSE
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
2
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
4 Genome Research Limited. All rights reserved.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
5
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
6 This software is distributed under a modified Apache license.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
7 For license details, please see
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
8
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
9 http://www.ensembl.org/info/about/code_licence.html
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
10
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
11 =head1 CONTACT
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
12
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
13 Please email comments or questions to the public Ensembl
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
14 developers list at <dev@ensembl.org>.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
15
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
16 Questions may also be sent to the Ensembl help desk at
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
17 <helpdesk@ensembl.org>.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
18
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
19 =head1 AUTHORSHIP
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
20
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
21 Ensembl Team. Individual contributions can be found in the CVS log.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
22
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
23 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
24
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
25 =head1 NAME
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
26
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
27 AlignedMemberSet - A superclass for pairwise or multiple relationships, base of
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
28 Bio::EnsEMBL::Compara::Family, Bio::EnsEMBL::Compara::Homology and
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
29 Bio::EnsEMBL::Compara::Domain.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
30
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
31 =head1 DESCRIPTION
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
32
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
33 A superclass for pairwise and multiple relationships
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
34
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
35 Currently the AlignedMember objects are used in the GeneTree structure
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
36 to represent the leaves of the trees. Each leaf contains an aligned
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
37 sequence, which is represented as an AlignedMember object.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
38
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
39 =head1 INHERITANCE TREE
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
40
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
41 Bio::EnsEMBL::Compara::AlignedMemberSet
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
42
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
43 =head1 METHODS
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
44
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
45 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
46
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
47 package Bio::EnsEMBL::Compara::AlignedMemberSet;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
48
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
49 use strict;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
50 use Scalar::Util qw(weaken);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
51
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
52 use Bio::AlignIO;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
53
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
54 use Bio::EnsEMBL::Utils::Argument;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
55 use Bio::EnsEMBL::Utils::Scalar;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
56 use Bio::EnsEMBL::Utils::Exception;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
57
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
58 use Bio::EnsEMBL::Compara::AlignedMember;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
59
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
60 use base ('Bio::EnsEMBL::Compara::MemberSet');
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
61
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
62
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
63 =head2 member_class
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
64
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
65 Description: Returns the type of member used in the set
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
66 Returntype : String: Bio::EnsEMBL::Compara::AlignedMember
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
67 Caller : general
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
68 Status : Stable
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
69
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
70 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
71
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
72 sub member_class {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
73 return 'Bio::EnsEMBL::Compara::AlignedMember';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
74 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
75
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
76 ######################
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
77 # Alignment sections #
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
78 ######################
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
79
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
80
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
81 sub print_sequences_to_fasta {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
82 my ($self, $pep_file) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
83 my $pep_counter = 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
84 open PEP, ">$pep_file";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
85 foreach my $member (@{$self->get_all_Members}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
86 next if $member->source_name eq 'ENSEMBLGENE';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
87 my $member_stable_id = $member->stable_id;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
88 my $seq = $member->sequence;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
89
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
90 print PEP ">$member_stable_id\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
91 $seq =~ s/(.{72})/$1\n/g;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
92 chomp $seq;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
93 unless (defined($seq)) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
94 my $family_id = $self->dbID;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
95 die "member $member_stable_id in family $family_id doesn't have a sequence";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
96 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
97 print PEP $seq,"\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
98 $pep_counter++;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
99 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
100 close PEP;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
101 return $pep_counter;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
102 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
103
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
104
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
105 =head2 read_clustalw
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
106
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
107 Arg [1] : string $file
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
108 The name of the file containing the clustalw output
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
109 Example : $family->read_clustalw('/tmp/clustalw.aln');
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
110 Description: Parses the output from clustalw and sets the alignment strings
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
111 of each of the memebers of this family
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
112 Returntype : none
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
113 Exceptions : thrown if file cannot be parsed
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
114 warning if alignment file contains identifiers for sequences
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
115 which are not members of this family
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
116 Caller : general
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
117
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
118 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
119
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
120 sub read_clustalw {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
121 my $self = shift;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
122 my $file = shift;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
123
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
124 my %align_hash;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
125 my $FH = IO::File->new();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
126 $FH->open($file) || throw("Could not open alignment file [$file]");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
127
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
128 <$FH>; #skip header
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
129 while(<$FH>) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
130 next if($_ =~ /^\s+/); #skip lines that start with space
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
131
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
132 my ($id, $align) = split;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
133 $align_hash{$id} ||= '';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
134 $align_hash{$id} .= $align;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
135 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
136
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
137 $FH->close;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
138
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
139 #place all members in a hash on their member name
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
140 my %member_hash;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
141 foreach my $member (@{$self->get_all_Members}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
142 $member_hash{$member->stable_id} = $member;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
143 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
144
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
145 #assign cigar_line to each of the member attribute
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
146 foreach my $id (keys %align_hash) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
147 throw("No member for alignment portion: [$id]") unless exists $member_hash{$id};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
148
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
149 my $alignment_string = $align_hash{$id};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
150 $alignment_string =~ s/\-([A-Z])/\- $1/g;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
151 $alignment_string =~ s/([A-Z])\-/$1 \-/g;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
152
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
153 my @cigar_segments = split " ",$alignment_string;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
154
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
155 my $cigar_line = "";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
156 foreach my $segment (@cigar_segments) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
157 my $seglength = length($segment);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
158 $seglength = "" if ($seglength == 1);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
159 if ($segment =~ /^\-+$/) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
160 $cigar_line .= $seglength . "D";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
161 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
162 $cigar_line .= $seglength . "M";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
163 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
164 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
165
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
166 $member_hash{$id}->cigar_line($cigar_line);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
167 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
168 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
169
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
170 sub load_cigars_from_fasta {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
171 my ($self, $file) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
172
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
173 my $alignio = Bio::AlignIO->new(-file => "$file", -format => "fasta");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
174
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
175 my $aln = $alignio->next_aln or die "Bio::AlignIO could not get next_aln() from file '$file'";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
176
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
177 #place all members in a hash on their member name
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
178 my %member_hash;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
179 foreach my $member (@{$self->get_all_Members}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
180 $member_hash{$member->stable_id} = $member;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
181 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
182
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
183 #assign cigar_line to each of the member attribute
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
184 foreach my $seq ($aln->each_seq) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
185 my $id = $seq->display_id;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
186 throw("No member for alignment portion: [$id]") unless exists $member_hash{$id};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
187
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
188 my $cigar_line = '';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
189 my $seq_string = $seq->seq();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
190 while($seq_string=~/(?:\b|^)(.)(.*?)(?:\b|$)/g) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
191 $cigar_line .= ($2 ? length($2)+1 : '').(($1 eq '-') ? 'D' : 'M');
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
192 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
193
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
194 $member_hash{$id}->cigar_line($cigar_line);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
195 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
196 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
197
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
198
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
199
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
200 sub get_SimpleAlign {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
201
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
202 my ($self, @args) = @_;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
203
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
204 my $id_type = 'STABLE';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
205 my $unique_seqs = 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
206 my $cdna = 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
207 my $stop2x = 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
208 my $append_taxon_id = 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
209 my $append_sp_short_name = 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
210 my $append_genomedb_id = 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
211 my $exon_cased = 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
212 my $alignment = 'protein';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
213 my $changeSelenos = 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
214 if (scalar @args) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
215 ($unique_seqs, $cdna, $id_type, $stop2x, $append_taxon_id, $append_sp_short_name, $append_genomedb_id, $exon_cased, $alignment, $changeSelenos) =
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
216 rearrange([qw(UNIQ_SEQ CDNA ID_TYPE STOP2X APPEND_TAXON_ID APPEND_SP_SHORT_NAME APPEND_GENOMEDB_ID EXON_CASED ALIGNMENT CHANGE_SELENO)], @args);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
217 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
218
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
219 my $sa = Bio::SimpleAlign->new();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
220
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
221 #Hack to try to work with both bioperl 0.7 and 1.2:
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
222 #Check to see if the method is called 'addSeq' or 'add_seq'
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
223 my $bio07 = ($sa->can('add_seq') ? 0 : 1);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
224
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
225 my $seq_id_hash = {};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
226 foreach my $member (@{$self->get_all_Members}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
227
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
228 next if $member->source_name eq 'ENSEMBLGENE';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
229
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
230 # Print unique sequences only ?
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
231 next if($unique_seqs and $seq_id_hash->{$member->sequence_id});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
232 $seq_id_hash->{$member->sequence_id} = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
233
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
234 # The correct codon table
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
235 if ($member->chr_name =~ /mt/i) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
236 # codeml icodes
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
237 # 0:universal code (default)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
238 my $class;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
239 eval {$class = member->taxon->classification;};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
240 unless ($@) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
241 if ($class =~ /vertebrata/i) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
242 # 1:mamalian mt
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
243 $sa->{_special_codeml_icode} = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
244 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
245 # 4:invertebrate mt
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
246 $sa->{_special_codeml_icode} = 4;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
247 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
248 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
249 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
250
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
251 my $seqstr;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
252 my $alphabet;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
253 if ($cdna or (lc($alignment) eq 'cdna')) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
254 $seqstr = $member->cdna_alignment_string($changeSelenos);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
255 $seqstr =~ s/\s+//g;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
256 $alphabet = 'dna';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
257 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
258 $seqstr = $member->alignment_string($exon_cased);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
259 $alphabet = 'protein';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
260 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
261 next if(!$seqstr);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
262
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
263 # Sequence name
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
264 my $seqID = $member->stable_id;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
265 $seqID = $member->sequence_id if($id_type eq "SEQ");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
266 $seqID = $member->member_id if($id_type eq "MEMBER");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
267 $seqID .= "_" . $member->taxon_id if($append_taxon_id);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
268 $seqID .= "_" . $member->genome_db_id if ($append_genomedb_id);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
269
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
270 ## Append $seqID with Speciae short name, if required
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
271 if ($append_sp_short_name) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
272 my $species = $member->genome_db->short_name;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
273 $species =~ s/\s/_/g;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
274 $seqID .= "_" . $species . "_";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
275 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
276
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
277 # Sequence length
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
278 my $aln_end = $member->seq_length;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
279 $aln_end = $aln_end*3 if $alphabet eq 'dna';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
280
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
281 $seqstr =~ s/\*/X/g if ($stop2x);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
282 my $seq = Bio::LocatableSeq->new(
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
283 -SEQ => $seqstr,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
284 -ALPHABET => $alphabet,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
285 -START => 1,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
286 -END => $aln_end,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
287 -ID => $seqID,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
288 -STRAND => 0
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
289 );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
290
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
291 if($bio07) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
292 $sa->addSeq($seq);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
293 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
294 $sa->add_seq($seq);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
295 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
296 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
297 $sa = $sa->remove_gaps(undef, 1) if UNIVERSAL::can($sa, 'remove_gaps');
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
298 return $sa;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
299 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
300
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
301
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
302
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
303 # Takes a protein tree and creates a consensus cigar line from the
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
304 # constituent leaf nodes.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
305 sub consensus_cigar_line {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
306
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
307 my $self = shift;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
308 my @cigars;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
309
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
310 # First get an 'expanded' cigar string for each leaf of the subtree
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
311 my @all_members = @{$self->get_all_Members};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
312 my $num_members = scalar(@all_members);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
313 foreach my $leaf (@all_members) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
314 next unless( UNIVERSAL::can( $leaf, 'cigar_line' ) );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
315 my $cigar = $leaf->cigar_line;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
316 my $newcigar = "";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
317 # $cigar =~ s/(\d*)([A-Z])/$2 x ($1||1)/ge; #Expand
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
318 while ($cigar =~ /(\d*)([A-Z])/g) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
319 $newcigar .= $2 x ($1 || 1);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
320 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
321 $cigar = $newcigar;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
322 push @cigars, $cigar;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
323 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
324
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
325 # Itterate through each character of the expanded cigars.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
326 # If there is a 'D' at a given location in any cigar,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
327 # set the consensus to 'D', otherwise assume an 'M'.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
328 # TODO: Fix assumption that cigar strings are always the same length,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
329 # and start at the same point.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
330 my $cigar_len = length( $cigars[0] );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
331 my $cons_cigar;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
332 for( my $i=0; $i<$cigar_len; $i++ ){
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
333 my $char = 'M';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
334 my $num_deletions = 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
335 foreach my $cigar( @cigars ){
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
336 if ( substr($cigar,$i,1) eq 'D'){
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
337 $num_deletions++;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
338 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
339 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
340 if ($num_deletions * 3 > 2 * $num_members) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
341 $char = "D";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
342 } elsif ($num_deletions * 3 > $num_members) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
343 $char = "m";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
344 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
345 $cons_cigar .= $char;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
346 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
347
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
348 # Collapse the consensus cigar, e.g. 'DDDD' = 4D
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
349 # $cons_cigar =~ s/(\w)(\1*)/($2?length($2)+1:"").$1/ge;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
350 my $collapsed_cigar = "";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
351 while ($cons_cigar =~ /(D+|M+|I+|m+)/g) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
352 $collapsed_cigar .= (length($1) == 1 ? "" : length($1)) . substr($1,0,1)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
353 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
354 $cons_cigar = $collapsed_cigar;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
355 # Return the consensus
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
356 return $cons_cigar;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
357 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
358
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
359
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
360 my %TWOD_CODONS = ("TTT" => "Phe",#Phe
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
361 "TTC" => "Phe",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
362
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
363 "TTA" => "Leu",#Leu
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
364 "TTG" => "Leu",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
365
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
366 "TAT" => "Tyr",#Tyr
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
367 "TAC" => "Tyr",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
368
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
369 "CAT" => "His",#His
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
370 "CAC" => "His",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
371
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
372 "CAA" => "Gln",#Gln
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
373 "CAG" => "Gln",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
374
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
375 "AAT" => "Asn",#Asn
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
376 "AAC" => "Asn",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
377
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
378 "AAA" => "Lys",#Lys
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
379 "AAG" => "Lys",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
380
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
381 "GAT" => "Asp",#Asp
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
382 "GAC" => "Asp",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
383
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
384 "GAA" => "Glu",#Glu
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
385 "GAG" => "Glu",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
386
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
387 "TGT" => "Cys",#Cys
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
388 "TGC" => "Cys",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
389
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
390 "AGT" => "Ser",#Ser
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
391 "AGC" => "Ser",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
392
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
393 "AGA" => "Arg",#Arg
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
394 "AGG" => "Arg",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
395
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
396 "ATT" => "Ile",#Ile
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
397 "ATC" => "Ile",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
398 "ATA" => "Ile");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
399
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
400 my %FOURD_CODONS = ("CTT" => "Leu",#Leu
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
401 "CTC" => "Leu",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
402 "CTA" => "Leu",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
403 "CTG" => "Leu",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
404
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
405 "GTT" => "Val",#Val
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
406 "GTC" => "Val",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
407 "GTA" => "Val",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
408 "GTG" => "Val",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
409
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
410 "TCT" => "Ser",#Ser
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
411 "TCC" => "Ser",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
412 "TCA" => "Ser",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
413 "TCG" => "Ser",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
414
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
415 "CCT" => "Pro",#Pro
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
416 "CCC" => "Pro",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
417 "CCA" => "Pro",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
418 "CCG" => "Pro",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
419
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
420 "ACT" => "Thr",#Thr
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
421 "ACC" => "Thr",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
422 "ACA" => "Thr",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
423 "ACG" => "Thr",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
424
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
425 "GCT" => "Ala",#Ala
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
426 "GCC" => "Ala",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
427 "GCA" => "Ala",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
428 "GCG" => "Ala",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
429
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
430 "CGT" => "Arg",#Arg
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
431 "CGC" => "Arg",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
432 "CGA" => "Arg",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
433 "CGG" => "Arg",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
434
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
435 "GGT" => "Gly",#Gly
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
436 "GGC" => "Gly",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
437 "GGA" => "Gly",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
438 "GGG" => "Gly");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
439
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
440 my %CODONS = ("ATG" => "Met",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
441 "TGG" => "Trp",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
442 "TAA" => "TER",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
443 "TAG" => "TER",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
444 "TGA" => "TER",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
445 "---" => "---",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
446 %TWOD_CODONS,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
447 %FOURD_CODONS,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
448 );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
449
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
450
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
451 =head2 get_4D_SimpleAlign
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
452
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
453 Example : $4d_align = $homology->get_4D_SimpleAlign();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
454 Description: get 4 times degenerate positions pairwise simple alignment
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
455 Returntype : Bio::SimpleAlign
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
456
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
457 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
458
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
459 sub get_4D_SimpleAlign {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
460 my $self = shift;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
461
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
462 my $sa = Bio::SimpleAlign->new();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
463
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
464 #Hack to try to work with both bioperl 0.7 and 1.2:
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
465 #Check to see if the method is called 'addSeq' or 'add_seq'
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
466 my $bio07 = 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
467 if(!$sa->can('add_seq')) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
468 $bio07 = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
469 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
470
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
471 my $ma = $self->adaptor->db->get_MemberAdaptor;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
472
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
473 my %member_seqstr;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
474 foreach my $member (@{$self->get_all_Members}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
475 next if $member->source_name ne 'ENSEMBLPEP';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
476 my $seqstr = $member->cdna_alignment_string();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
477 next if(!$seqstr);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
478 #print STDERR $seqstr,"\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
479 my @tmp_tab = split /\s+/, $seqstr;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
480 #print STDERR "tnp_tab 0: ", $tmp_tab[0],"\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
481 $member_seqstr{$member->stable_id} = \@tmp_tab;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
482 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
483
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
484 my $seqstr_length;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
485 foreach my $seqid (keys %member_seqstr) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
486 unless (defined $seqstr_length) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
487 #print STDERR $member_seqstr{$seqid}->[0],"\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
488 $seqstr_length = scalar @{$member_seqstr{$seqid}};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
489 next;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
490 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
491 unless ($seqstr_length == scalar @{$member_seqstr{$seqid}}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
492 die "Length of dna alignment are not the same, $seqstr_length and " . scalar @{$member_seqstr{$seqid}} ." respectively for homology_id " . $self->dbID . "\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
493 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
494 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
495
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
496 my %FourD_member_seqstr;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
497 for (my $i=0; $i < $seqstr_length; $i++) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
498 my $FourD_codon = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
499 my $FourD_aminoacid;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
500 foreach my $seqid (keys %member_seqstr) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
501 if (defined $FOURD_CODONS{$member_seqstr{$seqid}->[$i]}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
502 if (defined $FourD_aminoacid && $FourD_aminoacid eq $FOURD_CODONS{$member_seqstr{$seqid}->[$i]}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
503 #print STDERR "YES ",$FOURD_CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i],"\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
504 next;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
505 } elsif (defined $FourD_aminoacid) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
506 #print STDERR "NO ",$FOURD_CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i],"\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
507 $FourD_codon = 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
508 last;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
509 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
510 $FourD_aminoacid = $FOURD_CODONS{$member_seqstr{$seqid}->[$i]};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
511 #print STDERR $FOURD_CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i]," ";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
512 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
513 next;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
514 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
515 #print STDERR "NO ",$CODONS{$member_seqstr{$seqid}->[$i]}," ",$member_seqstr{$seqid}->[$i],"\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
516 $FourD_codon = 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
517 last;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
518 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
519 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
520 next unless ($FourD_codon);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
521 foreach my $seqid (keys %member_seqstr) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
522 $FourD_member_seqstr{$seqid} .= substr($member_seqstr{$seqid}->[$i],2,1);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
523 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
524 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
525
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
526 foreach my $seqid (keys %FourD_member_seqstr) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
527
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
528 my $seq = Bio::LocatableSeq->new(
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
529 -SEQ => $FourD_member_seqstr{$seqid},
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
530 -START => 1,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
531 -END => length($FourD_member_seqstr{$seqid}),
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
532 -ID => $seqid,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
533 -STRAND => 0
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
534 );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
535
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
536 if($bio07) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
537 $sa->addSeq($seq);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
538 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
539 $sa->add_seq($seq);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
540 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
541 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
542
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
543 return $sa;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
544 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
545
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
546 my %matrix_hash;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
547
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
548 {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
549 my $BLOSUM62 = "# Matrix made by matblas from blosum62.iij
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
550 # * column uses minimum score
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
551 # BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
552 # Blocks Database = /data/blocks_5.0/blocks.dat
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
553 # Cluster Percentage: >= 62
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
554 # Entropy = 0.6979, Expected = -0.5209
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
555 A R N D C Q E G H I L K M F P S T W Y V B Z X *
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
556 A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
557 R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
558 N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
559 D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
560 C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
561 Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
562 E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
563 G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
564 H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
565 I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
566 L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
567 K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
568 M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
569 F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
570 P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
571 S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
572 T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
573 W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
574 Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
575 V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
576 B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
577 Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
578 X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
579 * -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
580 ";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
581 my $matrix_string;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
582 my @lines = split(/\n/,$BLOSUM62);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
583 foreach my $line (@lines) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
584 next if ($line =~ /^\#/);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
585 if ($line =~ /^[A-Z\*\s]+$/) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
586 $matrix_string .= sprintf "$line\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
587 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
588 my @t = split(/\s+/,$line);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
589 shift @t;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
590 # print scalar @t,"\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
591 $matrix_string .= sprintf(join(" ",@t)."\n");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
592 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
593 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
594
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
595 # my %matrix_hash;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
596 @lines = ();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
597 @lines = split /\n/, $matrix_string;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
598 my $lts = shift @lines;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
599 $lts =~ s/^\s+//;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
600 $lts =~ s/\s+$//;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
601 my @letters = split /\s+/, $lts;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
602
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
603 foreach my $letter (@letters) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
604 my $line = shift @lines;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
605 $line =~ s/^\s+//;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
606 $line =~ s/\s+$//;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
607 my @penalties = split /\s+/, $line;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
608 die "Size of letters array and penalties array are different\n"
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
609 unless (scalar @letters == scalar @penalties);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
610 for (my $i=0; $i < scalar @letters; $i++) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
611 $matrix_hash{uc $letter}{uc $letters[$i]} = $penalties[$i];
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
612 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
613 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
614 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
615
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
616
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
617 sub update_alignment_stats {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
618 my $self = shift;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
619
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
620 my $genes = $self->get_all_Members;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
621 my $ngenes = scalar(@$genes);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
622
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
623 if ($ngenes == 2) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
624 # This code is >4 times faster with pairs of genes
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
625
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
626 my $gene1 = $genes->[0];
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
627 my $gene2 = $genes->[1];
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
628 my $new_aln1_cigarline = "";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
629 my $new_aln2_cigarline = "";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
630
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
631 my $identical_matches = 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
632 my $positive_matches = 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
633
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
634 my ($aln1state, $aln2state);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
635 my ($aln1count, $aln2count);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
636 my ($aln1cov, $aln2cov) = (0,0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
637
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
638 my @aln1 = split(//, $gene1->alignment_string);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
639 my @aln2 = split(//, $gene2->alignment_string);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
640
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
641 for (my $i=0; $i <= $#aln1; $i++) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
642 next if ($aln1[$i] eq '-' && $aln2[$i] eq '-');
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
643 my $cur_aln1state = ($aln1[$i] eq '-' ? 'D' : 'M');
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
644 my $cur_aln2state = ($aln2[$i] eq '-' ? 'D' : 'M');
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
645 $aln1cov++ if $cur_aln1state ne 'D';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
646 $aln2cov++ if $cur_aln2state ne 'D';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
647 if ($cur_aln1state eq 'M' && $cur_aln2state eq 'M') {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
648 if ($aln1[$i] eq $aln2[$i]) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
649 $identical_matches++;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
650 $positive_matches++;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
651 } elsif ($matrix_hash{uc $aln1[$i]}{uc $aln2[$i]} > 0) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
652 $positive_matches++;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
653 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
654 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
655 unless (defined $aln1state) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
656 $aln1count = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
657 $aln2count = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
658 $aln1state = $cur_aln1state;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
659 $aln2state = $cur_aln2state;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
660 next;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
661 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
662 if ($cur_aln1state eq $aln1state) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
663 $aln1count++;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
664 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
665 if ($aln1count == 1) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
666 $new_aln1_cigarline .= $aln1state;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
667 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
668 $new_aln1_cigarline .= $aln1count.$aln1state;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
669 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
670 $aln1count = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
671 $aln1state = $cur_aln1state;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
672 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
673 if ($cur_aln2state eq $aln2state) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
674 $aln2count++;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
675 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
676 if ($aln2count == 1) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
677 $new_aln2_cigarline .= $aln2state;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
678 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
679 $new_aln2_cigarline .= $aln2count.$aln2state;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
680 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
681 $aln2count = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
682 $aln2state = $cur_aln2state;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
683 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
684 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
685 if ($aln1count == 1) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
686 $new_aln1_cigarline .= $aln1state;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
687 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
688 $new_aln1_cigarline .= $aln1count.$aln1state;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
689 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
690 if ($aln2count == 1) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
691 $new_aln2_cigarline .= $aln2state;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
692 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
693 $new_aln2_cigarline .= $aln2count.$aln2state;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
694 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
695 my $seq_length1 = $gene1->seq_length;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
696 unless (0 == $seq_length1) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
697 $gene1->cigar_line($new_aln1_cigarline);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
698 $gene1->perc_id( int((100.0 * $identical_matches / $seq_length1 + 0.5)) );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
699 $gene1->perc_pos( int((100.0 * $positive_matches / $seq_length1 + 0.5)) );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
700 $gene1->perc_cov( int((100.0 * $aln1cov / $seq_length1 + 0.5)) );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
701 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
702 my $seq_length2 = $gene2->seq_length;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
703 unless (0 == $seq_length2) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
704 $gene2->cigar_line($new_aln2_cigarline);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
705 $gene2->perc_id( int((100.0 * $identical_matches / $seq_length2 + 0.5)) );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
706 $gene2->perc_pos( int((100.0 * $positive_matches / $seq_length2 + 0.5)) );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
707 $gene2->perc_cov( int((100.0 * $aln2cov / $seq_length2 + 0.5)) );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
708 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
709 return undef;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
710 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
711
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
712 my $min_seq = shift;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
713 $min_seq = int($min_seq * $ngenes) if $min_seq <= 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
714
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
715 my @new_cigars = ('') x $ngenes;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
716 my @nmatch_id = (0) x $ngenes;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
717 my @nmatch_pos = (0) x $ngenes;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
718 my @nmatch_cov = (0) x $ngenes;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
719 my @alncount = (1) x $ngenes;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
720 my @alnstate = (undef) x $ngenes;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
721 my @cur_alnstate = (undef) x $ngenes;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
722
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
723 my @aln = map {$_->alignment_string} @$genes;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
724 my $aln_length = length($aln[0]);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
725
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
726 for (my $i=0; $i < $aln_length; $i++) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
727
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
728 my @char_i = map {substr($_, $i, 1)} @aln;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
729 #print "pos $i: ", join('/', @char_i), "\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
730
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
731 my %seen;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
732 map {$seen{$_}++} @char_i;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
733 next if $seen{'-'} == $ngenes;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
734 delete $seen{'-'};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
735
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
736 my %pos_chars = ();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
737 my @chars = keys %seen;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
738 while (my $c1 = shift @chars) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
739 foreach my $c2 (@chars) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
740 if (($matrix_hash{uc $c1}{uc $c2} > 0) and ($seen{$c1}+$seen{$c2} >= $min_seq)) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
741 $pos_chars{$c1} = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
742 $pos_chars{$c2} = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
743 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
744 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
745 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
746
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
747 for (my $j=0; $j<$ngenes; $j++) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
748 if ($char_i[$j] eq '-') {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
749 $cur_alnstate[$j] = 'D';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
750 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
751 $nmatch_cov[$j]++;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
752 $cur_alnstate[$j] = 'M';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
753 if ($seen{$char_i[$j]} >= $min_seq) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
754 $nmatch_id[$j]++;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
755 $nmatch_pos[$j]++;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
756 } elsif (exists $pos_chars{$char_i[$j]}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
757 $nmatch_pos[$j]++;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
758 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
759 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
760 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
761
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
762 if ($i == 0) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
763 @alnstate = @cur_alnstate;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
764 next;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
765 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
766
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
767 for (my $j=0; $j<$ngenes; $j++) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
768 if ($cur_alnstate[$j] eq $alnstate[$j]) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
769 $alncount[$j]++;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
770 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
771 if ($alncount[$j] == 1) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
772 $new_cigars[$j] .= $alnstate[$j];
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
773 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
774 $new_cigars[$j] .= $alncount[$j].$alnstate[$j];
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
775 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
776 $alncount[$j] = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
777 $alnstate[$j] = $cur_alnstate[$j];
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
778 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
779 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
780 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
781
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
782 for (my $j=0; $j<$ngenes; $j++) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
783 if ($alncount[$j] == 1) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
784 $new_cigars[$j] .= $alnstate[$j];
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
785 } else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
786 $new_cigars[$j] .= $alncount[$j].$alnstate[$j];
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
787 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
788 $genes->[$j]->cigar_line($new_cigars[$j]);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
789 my $seq_length = $genes->[$j]->seq_length;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
790 unless (0 == $seq_length) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
791 $genes->[$j]->perc_id( int((100.0 * $nmatch_id[$j] / $seq_length + 0.5)) );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
792 $genes->[$j]->perc_pos( int((100.0 * $nmatch_pos[$j] / $seq_length + 0.5)) );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
793 $genes->[$j]->perc_cov( int((100.0 * $nmatch_cov[$j] / $seq_length + 0.5)) );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
794 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
795 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
796 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
797
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
798
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
799
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
800 1;