Mercurial > repos > dvanzessen > vep_emc
comparison dir_plugins/MaxEntScan.pm @ 0:e545d0a25ffe draft
Uploaded
| author | dvanzessen | 
|---|---|
| date | Mon, 15 Jul 2019 05:17:17 -0400 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| -1:000000000000 | 0:e545d0a25ffe | 
|---|---|
| 1 =head1 LICENSE | |
| 2 | |
| 3 Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute | |
| 4 Copyright [2016-2018] EMBL-European Bioinformatics Institute | |
| 5 | |
| 6 Licensed under the Apache License, Version 2.0 (the "License"); | |
| 7 you may not use this file except in compliance with the License. | |
| 8 You may obtain a copy of the License at | |
| 9 | |
| 10 http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 | |
| 12 Unless required by applicable law or agreed to in writing, software | |
| 13 distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 See the License for the specific language governing permissions and | |
| 16 limitations under the License. | |
| 17 | |
| 18 =head1 CONTACT | |
| 19 | |
| 20 Ensembl <http://www.ensembl.org/info/about/contact/index.html> | |
| 21 | |
| 22 =cut | |
| 23 | |
| 24 =head1 NAME | |
| 25 | |
| 26 MaxEntScan | |
| 27 | |
| 28 =head1 SYNOPSIS | |
| 29 | |
| 30 mv MaxEntScan.pm ~/.vep/Plugins | |
| 31 ./vep -i variants.vcf --plugin MaxEntScan,/path/to/maxentscan/fordownload | |
| 32 ./vep -i variants.vcf --plugin MaxEntScan,/path/to/maxentscan/fordownload,SWA,NCSS | |
| 33 | |
| 34 =head1 DESCRIPTION | |
| 35 | |
| 36 This is a plugin for the Ensembl Variant Effect Predictor (VEP) that | |
| 37 runs MaxEntScan (http://genes.mit.edu/burgelab/maxent/Xmaxentscan_scoreseq.html) | |
| 38 to get splice site predictions. | |
| 39 | |
| 40 The plugin copies most of the code verbatim from the score5.pl and score3.pl | |
| 41 scripts provided in the MaxEntScan download. To run the plugin you must get and | |
| 42 unpack the archive from http://genes.mit.edu/burgelab/maxent/download/; the path | |
| 43 to this unpacked directory is then the param you pass to the --plugin flag. | |
| 44 | |
| 45 The plugin executes the logic from one of the scripts depending on which | |
| 46 splice region the variant overlaps: | |
| 47 | |
| 48 score5.pl : last 3 bases of exon --> first 6 bases of intron | |
| 49 score3.pl : last 20 bases of intron --> first 3 bases of exon | |
| 50 | |
| 51 The plugin reports the reference, alternate and difference (REF - ALT) maximum | |
| 52 entropy scores. | |
| 53 | |
| 54 If 'SWA' is specified as a command-line argument, a sliding window algorithm | |
| 55 is applied to subsequences containing the reference and alternate alleles to | |
| 56 identify k-mers with the highest donor and acceptor splice site scores. To assess | |
| 57 the impact of variants, reference comparison scores are also provided. For SNVs, | |
| 58 the comparison scores are derived from sequence in the same frame as the highest | |
| 59 scoring k-mers containing the alternate allele. For all other variants, the | |
| 60 comparison scores are derived from the highest scoring k-mers containing the | |
| 61 reference allele. The difference between the reference comparison and alternate | |
| 62 scores (SWA_REF_COMP - SWA_ALT) are also provided. | |
| 63 | |
| 64 If 'NCSS' is specified as a command-line argument, scores for the nearest | |
| 65 upstream and downstream canonical splice sites are also included. | |
| 66 | |
| 67 By default, only scores are reported. Add 'verbose' to the list of command- | |
| 68 line arguments to include the sequence output associated with those scores. | |
| 69 | |
| 70 =cut | |
| 71 | |
| 72 package MaxEntScan; | |
| 73 | |
| 74 use strict; | |
| 75 use warnings; | |
| 76 | |
| 77 use Digest::MD5 qw(md5_hex); | |
| 78 | |
| 79 use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp); | |
| 80 use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap); | |
| 81 | |
| 82 use Bio::EnsEMBL::Variation::Utils::BaseVepPlugin; | |
| 83 use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepPlugin); | |
| 84 | |
| 85 # how many seq/score pairs to cache in memory | |
| 86 our $CACHE_SIZE = 50; | |
| 87 | |
| 88 sub new { | |
| 89 my $class = shift; | |
| 90 | |
| 91 my $self = $class->SUPER::new(@_); | |
| 92 | |
| 93 # we need sequence, so no offline mode unless we have FASTA | |
| 94 die("ERROR: cannot function in offline mode without a FASTA file\n") if $self->{config}->{offline} && !$self->{config}->{fasta}; | |
| 95 | |
| 96 my $params = $self->params; | |
| 97 | |
| 98 my $dir = shift @$params; | |
| 99 die("ERROR: MaxEntScan directory not specified\n") unless $dir; | |
| 100 die("ERROR: MaxEntScan directory not found\n") unless -d $dir; | |
| 101 $self->{_dir} = $dir; | |
| 102 | |
| 103 ## setup from score5.pl | |
| 104 $self->{'score5_me2x5'} = $self->score5_makescorematrix($dir.'/me2x5'); | |
| 105 $self->{'score5_seq'} = $self->score5_makesequencematrix($dir.'/splicemodels/splice5sequences'); | |
| 106 | |
| 107 ## setup from score3.pl | |
| 108 $self->{'score3_metables'} = $self->score3_makemaxentscores; | |
| 109 | |
| 110 my %opts = map { $_ => undef } @$params; | |
| 111 | |
| 112 $self->{'run_SWA'} = 1 if exists $opts{'SWA'}; | |
| 113 $self->{'run_NCSS'} = 1 if exists $opts{'NCSS'}; | |
| 114 | |
| 115 $self->{'verbose'} = 1 if exists $opts{'verbose'}; | |
| 116 | |
| 117 return $self; | |
| 118 } | |
| 119 | |
| 120 sub feature_types { | |
| 121 return ['Transcript']; | |
| 122 } | |
| 123 | |
| 124 sub get_header_info { | |
| 125 my $self = shift; | |
| 126 | |
| 127 my $v = $self->{'verbose'}; | |
| 128 my $headers = $self->get_MES_header_info($v); | |
| 129 | |
| 130 if ($self->{'run_SWA'}) { | |
| 131 my $swa_headers = $self->get_SWA_header_info($v); | |
| 132 $headers = {%$headers, %$swa_headers}; | |
| 133 } | |
| 134 | |
| 135 if ($self->{'run_NCSS'}) { | |
| 136 my $ncss_headers = $self->get_NCSS_header_info($v); | |
| 137 $headers = {%$headers, %$ncss_headers}; | |
| 138 } | |
| 139 | |
| 140 return $headers; | |
| 141 } | |
| 142 | |
| 143 sub get_MES_header_info { | |
| 144 my ($self, $verbose) = @_; | |
| 145 | |
| 146 my $headers = { | |
| 147 MaxEntScan_ref => "MaxEntScan reference sequence score", | |
| 148 MaxEntScan_alt => "MaxEntScan alternate sequence score", | |
| 149 MaxEntScan_diff => "MaxEntScan score difference", | |
| 150 }; | |
| 151 | |
| 152 if ($verbose) { | |
| 153 | |
| 154 $headers->{'MaxEntScan_ref_seq'} = "MaxEntScan reference sequence"; | |
| 155 $headers->{'MaxEntScan_alt_seq'} = "MaxEntScan alternate sequence"; | |
| 156 } | |
| 157 | |
| 158 return $headers; | |
| 159 } | |
| 160 | |
| 161 sub get_SWA_header_info { | |
| 162 my ($self, $verbose) = @_; | |
| 163 | |
| 164 my $headers = { | |
| 165 "MES-SWA_donor_ref" => "Highest splice donor reference sequence score", | |
| 166 "MES-SWA_donor_alt" => "Highest splice donor alternate sequence score", | |
| 167 "MES-SWA_donor_ref_comp" => "Donor reference comparison sequence score", | |
| 168 "MES-SWA_donor_diff" => "Difference between the donor reference comparison and alternate sequence scores", | |
| 169 | |
| 170 "MES-SWA_acceptor_ref" => "Highest splice acceptor reference sequence score", | |
| 171 "MES-SWA_acceptor_alt" => "Highest splice acceptor alternate sequence score", | |
| 172 "MES-SWA_acceptor_ref_comp" => "Acceptor reference comparison sequence score", | |
| 173 "MES-SWA_acceptor_diff" => "Difference between the acceptor reference comparison and alternate sequence scores", | |
| 174 }; | |
| 175 | |
| 176 if ($verbose) { | |
| 177 | |
| 178 $headers->{'MES-SWA_donor_ref_seq'} = "Highest splice donor reference sequence"; | |
| 179 $headers->{'MES-SWA_donor_ref_frame'} = "Position of the highest splice donor reference sequence"; | |
| 180 $headers->{'MES-SWA_donor_ref_context'} = "Selected donor sequence context containing the reference allele"; | |
| 181 $headers->{'MES-SWA_donor_alt_seq'} = "Highest splice donor alternate sequence"; | |
| 182 $headers->{'MES-SWA_donor_alt_frame'} = "Position of the highest splice donor alternate sequence"; | |
| 183 $headers->{'MES-SWA_donor_alt_context'} = "Selected donor sequence context containing the alternate allele"; | |
| 184 $headers->{'MES-SWA_donor_ref_comp_seq'} = "Donor reference comparison sequence"; | |
| 185 | |
| 186 $headers->{'MES-SWA_acceptor_ref_seq'} = "Highest splice acceptor reference sequence"; | |
| 187 $headers->{'MES-SWA_acceptor_ref_frame'} = "Position of the highest splice acceptor reference sequence"; | |
| 188 $headers->{'MES-SWA_acceptor_ref_context'} = "Selected acceptor sequence context containing the reference allele"; | |
| 189 $headers->{'MES-SWA_acceptor_alt_seq'} = "Highest splice acceptor alternate sequence"; | |
| 190 $headers->{'MES-SWA_acceptor_alt_frame'} = "Position of the highest splice acceptor alternate sequence"; | |
| 191 $headers->{'MES-SWA_acceptor_alt_context'} = "Selected acceptor sequence context containing the alternate allele"; | |
| 192 $headers->{'MES-SWA_acceptor_ref_comp_seq'} = "Acceptor reference comparison sequence"; | |
| 193 } | |
| 194 | |
| 195 return $headers; | |
| 196 } | |
| 197 | |
| 198 sub get_NCSS_header_info { | |
| 199 my ($self, $verbose) = @_; | |
| 200 | |
| 201 my $headers = { | |
| 202 "MES-NCSS_upstream_acceptor" => "Nearest upstream canonical splice acceptor sequence score", | |
| 203 "MES-NCSS_upstream_donor" => "Nearest upstream canonical splice donor sequence score", | |
| 204 | |
| 205 "MES-NCSS_downstream_acceptor" => "Nearest downstream canonical splice acceptor sequence score", | |
| 206 "MES-NCSS_downstream_donor" => "Nearest downstream canonical splice donor sequence score", | |
| 207 }; | |
| 208 | |
| 209 if ($verbose) { | |
| 210 | |
| 211 $headers->{'MES-NCSS_upstream_acceptor_seq'} = "Nearest upstream canonical splice acceptor sequence"; | |
| 212 $headers->{'MES-NCSS_upstream_donor_seq'} = "Nearest upstream canonical splice donor sequence"; | |
| 213 | |
| 214 $headers->{'MES-NCSS_downstream_acceptor_seq'} = "Nearest downstream canonical splice acceptor sequence"; | |
| 215 $headers->{'MES-NCSS_downstream_donor_seq'} = "Nearest downstream canonical splice donor sequence"; | |
| 216 } | |
| 217 | |
| 218 return $headers; | |
| 219 } | |
| 220 | |
| 221 sub run { | |
| 222 my ($self, $tva) = @_; | |
| 223 | |
| 224 my $seq_headers = $self->get_MES_header_info(); | |
| 225 my $results = $self->run_MES($tva); | |
| 226 | |
| 227 if ($self->{'run_SWA'}) { | |
| 228 my $swa_seq_headers = $self->get_SWA_header_info(); | |
| 229 $seq_headers = {%$seq_headers, %$swa_seq_headers}; | |
| 230 my $swa_results = $self->run_SWA($tva); | |
| 231 $results = {%$results, %$swa_results}; | |
| 232 } | |
| 233 | |
| 234 if ($self->{'run_NCSS'}) { | |
| 235 my $ncss_seq_headers = $self->get_NCSS_header_info(); | |
| 236 $seq_headers = {%$seq_headers, %$ncss_seq_headers}; | |
| 237 my $ncss_results = $self->run_NCSS($tva); | |
| 238 $results = {%$results, %$ncss_results}; | |
| 239 } | |
| 240 | |
| 241 my %data; | |
| 242 | |
| 243 # add the scores | |
| 244 my @scores = grep { exists $results->{$_} } keys %$seq_headers; | |
| 245 @data{@scores} = map { sprintf('%.3f', $_) } @{$results}{@scores}; | |
| 246 | |
| 247 if ($self->{'verbose'}) { | |
| 248 # add any remaining results | |
| 249 my @non_scores = grep { ! exists $data{$_} } keys %$results; | |
| 250 @data{@non_scores} = @{$results}{@non_scores}; | |
| 251 } | |
| 252 | |
| 253 return \%data; | |
| 254 } | |
| 255 | |
| 256 sub run_MES { | |
| 257 my ($self, $tva) = @_; | |
| 258 | |
| 259 my $vf = $tva->variation_feature; | |
| 260 return {} unless $vf->{start} == $vf->{end} && $tva->feature_seq =~ /^[ACGT]$/; | |
| 261 | |
| 262 my $tv = $tva->transcript_variation; | |
| 263 my $tr = $tva->transcript; | |
| 264 my $tr_strand = $tr->strand; | |
| 265 my ($vf_start, $vf_end) = ($vf->start, $vf->end); | |
| 266 | |
| 267 # use _overlapped_introns() method from BaseTranscriptVariation | |
| 268 # this will use an interval tree if available for superfast lookup of overlapping introns | |
| 269 # we have to expand the search space around $vf because we're looking for the splice region not the intron per se | |
| 270 foreach my $intron(@{$tv->_overlapped_introns($vf_start - 21, $vf_end + 21)}) { | |
| 271 | |
| 272 # get coords depending on strand | |
| 273 # MaxEntScan does different predictions for 5 and 3 prime | |
| 274 # and we need to feed it different bits of sequence for each | |
| 275 # | |
| 276 # 5prime, 3 bases of exon, 6 bases of intron: | |
| 277 # ===------ | |
| 278 # | |
| 279 # 3prime, 20 bases of intron, 3 bases of exon | |
| 280 # --------------------=== | |
| 281 | |
| 282 my ($five_start, $five_end, $three_start, $three_end); | |
| 283 | |
| 284 if($tr_strand > 0) { | |
| 285 ($five_start, $five_end) = ($intron->start - 3, $intron->start + 5); | |
| 286 ($three_start, $three_end) = ($intron->end - 19, $intron->end + 3); | |
| 287 } | |
| 288 | |
| 289 else { | |
| 290 ($five_start, $five_end) = ($intron->end - 5, $intron->end + 3); | |
| 291 ($three_start, $three_end) = ($intron->start - 3, $intron->start + 19); | |
| 292 } | |
| 293 | |
| 294 if(overlap($vf->start, $vf->end, $five_start, $five_end)) { | |
| 295 my ($ref_seq, $alt_seq) = @{$self->get_seqs($tva, $five_start, $five_end)}; | |
| 296 | |
| 297 return {} unless defined($ref_seq) && $ref_seq =~ /^[ACGT]+$/; | |
| 298 return {} unless defined($alt_seq) && $alt_seq =~ /^[ACGT]+$/; | |
| 299 | |
| 300 my $ref_score = $self->score5($ref_seq); | |
| 301 my $alt_score = $self->score5($alt_seq); | |
| 302 | |
| 303 return { | |
| 304 MaxEntScan_ref => $ref_score, | |
| 305 MaxEntScan_ref_seq => $ref_seq, | |
| 306 MaxEntScan_alt => $alt_score, | |
| 307 MaxEntScan_alt_seq => $alt_seq, | |
| 308 MaxEntScan_diff => $ref_score - $alt_score, | |
| 309 } | |
| 310 } | |
| 311 | |
| 312 if(overlap($vf->start, $vf->end, $three_start, $three_end)) { | |
| 313 my ($ref_seq, $alt_seq) = @{$self->get_seqs($tva, $three_start, $three_end)}; | |
| 314 | |
| 315 return {} unless defined($ref_seq) && $ref_seq =~ /^[ACGT]+$/; | |
| 316 return {} unless defined($alt_seq) && $alt_seq =~ /^[ACGT]+$/; | |
| 317 | |
| 318 my $ref_score = $self->score3($ref_seq); | |
| 319 my $alt_score = $self->score3($alt_seq); | |
| 320 | |
| 321 return { | |
| 322 MaxEntScan_ref => $ref_score, | |
| 323 MaxEntScan_ref_seq => $ref_seq, | |
| 324 MaxEntScan_alt => $alt_score, | |
| 325 MaxEntScan_alt_seq => $alt_seq, | |
| 326 MaxEntScan_diff => $ref_score - $alt_score, | |
| 327 } | |
| 328 } | |
| 329 } | |
| 330 | |
| 331 return {}; | |
| 332 } | |
| 333 | |
| 334 sub run_SWA { | |
| 335 my ($self, $tva) = @_; | |
| 336 | |
| 337 my $vf = $tva->variation_feature; | |
| 338 | |
| 339 my %results; | |
| 340 | |
| 341 # get the donor reference and alternate sequence contexts | |
| 342 my ($donor_ref_context, $donor_alt_context) = @{$self->get_seqs($tva, $vf->start - 8, $vf->end + 8)}; | |
| 343 | |
| 344 if (defined($donor_ref_context)) { | |
| 345 $results{'MES-SWA_donor_ref_context'} = $donor_ref_context; | |
| 346 | |
| 347 if ($donor_ref_context =~ /^[ACGT]+$/) { | |
| 348 my ($seq, $frame, $score) = @{$self->get_max_donor($donor_ref_context)}; | |
| 349 $results{'MES-SWA_donor_ref_seq'} = $seq; | |
| 350 $results{'MES-SWA_donor_ref_frame'} = $frame; | |
| 351 $results{'MES-SWA_donor_ref'} = $score; | |
| 352 } | |
| 353 } | |
| 354 | |
| 355 if (defined($donor_alt_context)) { | |
| 356 $results{'MES-SWA_donor_alt_context'} = $donor_alt_context; | |
| 357 | |
| 358 if ($donor_alt_context =~ /^[ACGT]+$/) { | |
| 359 my ($seq, $frame, $score) = @{$self->get_max_donor($donor_alt_context)}; | |
| 360 $results{'MES-SWA_donor_alt_seq'} = $seq; | |
| 361 $results{'MES-SWA_donor_alt_frame'} = $frame; | |
| 362 $results{'MES-SWA_donor_alt'} = $score; | |
| 363 | |
| 364 if (defined(my $ref_comp_seq = $results{'MES-SWA_donor_ref_seq'})) { | |
| 365 | |
| 366 if ($vf->{start} == $vf->{end} && $tva->feature_seq =~ /^[ACGT]$/) { | |
| 367 # for SNVs, compare to the same frame as the highest scoring ALT k-mer | |
| 368 $ref_comp_seq = substr($donor_ref_context, $frame - 1, 9); | |
| 369 } | |
| 370 | |
| 371 $results{'MES-SWA_donor_ref_comp_seq'} = $ref_comp_seq; | |
| 372 $results{'MES-SWA_donor_ref_comp'} = $self->score5($ref_comp_seq); | |
| 373 | |
| 374 $results{'MES-SWA_donor_diff'} = $results{'MES-SWA_donor_ref_comp'} - $score; | |
| 375 } | |
| 376 } | |
| 377 } | |
| 378 | |
| 379 # get the acceptor reference and alternate sequence contexts | |
| 380 my ($acceptor_ref_context, $acceptor_alt_context) = @{$self->get_seqs($tva, $vf->start - 22, $vf->end + 22)}; | |
| 381 | |
| 382 if (defined($acceptor_ref_context)) { | |
| 383 $results{'MES-SWA_acceptor_ref_context'} = $acceptor_ref_context; | |
| 384 | |
| 385 if ($acceptor_ref_context =~ /^[ACGT]+$/) { | |
| 386 my ($seq, $frame, $score) = @{$self->get_max_acceptor($acceptor_ref_context)}; | |
| 387 $results{'MES-SWA_acceptor_ref_seq'} = $seq; | |
| 388 $results{'MES-SWA_acceptor_ref_frame'} = $frame; | |
| 389 $results{'MES-SWA_acceptor_ref'} = $score; | |
| 390 } | |
| 391 } | |
| 392 | |
| 393 if (defined($acceptor_alt_context)) { | |
| 394 $results{'MES-SWA_acceptor_alt_context'} = $acceptor_alt_context; | |
| 395 | |
| 396 if ($acceptor_alt_context =~ /^[ACGT]+$/) { | |
| 397 my ($seq, $frame, $score) = @{$self->get_max_acceptor($acceptor_alt_context)}; | |
| 398 $results{'MES-SWA_acceptor_alt_seq'} = $seq; | |
| 399 $results{'MES-SWA_acceptor_alt_frame'} = $frame; | |
| 400 $results{'MES-SWA_acceptor_alt'} = $score; | |
| 401 | |
| 402 if (defined(my $ref_comp_seq = $results{'MES-SWA_acceptor_ref_seq'})) { | |
| 403 | |
| 404 if ($vf->{start} == $vf->{end} && $tva->feature_seq =~ /^[ACGT]$/) { | |
| 405 # for SNVs, compare to the same frame as the highest scoring ALT k-mer | |
| 406 $ref_comp_seq = substr($acceptor_ref_context, $frame - 1, 23); | |
| 407 } | |
| 408 | |
| 409 $results{'MES-SWA_acceptor_ref_comp_seq'} = $ref_comp_seq; | |
| 410 $results{'MES-SWA_acceptor_ref_comp'} = $self->score3($ref_comp_seq); | |
| 411 | |
| 412 $results{'MES-SWA_acceptor_diff'} = $results{'MES-SWA_acceptor_ref_comp'} - $score; | |
| 413 } | |
| 414 } | |
| 415 } | |
| 416 | |
| 417 return \%results; | |
| 418 } | |
| 419 | |
| 420 sub run_NCSS { | |
| 421 my ($self, $tva) = @_; | |
| 422 | |
| 423 my $tv = $tva->transcript_variation; | |
| 424 my $tr = $tva->transcript; | |
| 425 | |
| 426 my %results; | |
| 427 | |
| 428 if ($tv->intron_number) { | |
| 429 | |
| 430 my ($intron_numbers, $total_introns) = split(/\//, $tv->intron_number); | |
| 431 my $intron_number = (split(/-/, $intron_numbers))[0]; | |
| 432 | |
| 433 my $introns = $tr->get_all_Introns; | |
| 434 | |
| 435 my $intron_idx = $intron_number - 1; | |
| 436 my $intron = $introns->[$intron_idx]; | |
| 437 | |
| 438 if (defined(my $seq = $self->get_donor_seq_from_intron($intron))) { | |
| 439 $results{'MES-NCSS_upstream_donor_seq'} = $seq; | |
| 440 $results{'MES-NCSS_upstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/; | |
| 441 } | |
| 442 | |
| 443 if (defined(my $seq = $self->get_acceptor_seq_from_intron($intron))) { | |
| 444 $results{'MES-NCSS_downstream_acceptor_seq'} = $seq; | |
| 445 $results{'MES-NCSS_downstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/; | |
| 446 } | |
| 447 | |
| 448 # don't calculate an upstream acceptor score if the intron is the first in the transcript | |
| 449 unless ($intron_number == 1) { | |
| 450 my $upstream_intron = $introns->[$intron_idx - 1]; | |
| 451 | |
| 452 if (defined(my $seq = $self->get_acceptor_seq_from_intron($upstream_intron))) { | |
| 453 $results{'MES-NCSS_upstream_acceptor_seq'} = $seq; | |
| 454 $results{'MES-NCSS_upstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/; | |
| 455 } | |
| 456 } | |
| 457 | |
| 458 # don't calculate a downstream donor score if the intron is the last in the transcript | |
| 459 unless ($intron_number == $total_introns) { | |
| 460 my $downstream_intron = $introns->[$intron_idx + 1]; | |
| 461 | |
| 462 if (defined(my $seq = $self->get_donor_seq_from_intron($downstream_intron))) { | |
| 463 $results{'MES-NCSS_downstream_donor_seq'} = $seq; | |
| 464 $results{'MES-NCSS_downstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/; | |
| 465 } | |
| 466 } | |
| 467 } | |
| 468 | |
| 469 elsif ($tv->exon_number) { | |
| 470 | |
| 471 my ($exon_numbers, $total_exons) = split(/\//, $tv->exon_number); | |
| 472 my $exon_number = (split(/-/, $exon_numbers))[0]; | |
| 473 | |
| 474 my $exons = $tr->get_all_Exons; | |
| 475 | |
| 476 my $exon_idx = $exon_number - 1; | |
| 477 my $exon = $exons->[$exon_idx]; | |
| 478 | |
| 479 # don't calculate upstream scores if the exon is the first in the transcript | |
| 480 unless ($exon_number == 1) { | |
| 481 my $upstream_exon = $exons->[$exon_idx - 1]; | |
| 482 | |
| 483 if (defined(my $seq = $self->get_donor_seq_from_exon($upstream_exon))) { | |
| 484 $results{'MES-NCSS_upstream_donor_seq'} = $seq; | |
| 485 $results{'MES-NCSS_upstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/; | |
| 486 } | |
| 487 | |
| 488 if (defined(my $seq = $self->get_acceptor_seq_from_exon($exon))) { | |
| 489 $results{'MES-NCSS_upstream_acceptor_seq'} = $seq; | |
| 490 $results{'MES-NCSS_upstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/; | |
| 491 } | |
| 492 } | |
| 493 | |
| 494 # don't calculate downstream scores if the exon is the last exon in the transcript | |
| 495 unless ($exon_number == $total_exons) { | |
| 496 my $downstream_exon = $exons->[$exon_idx + 1]; | |
| 497 | |
| 498 if (defined(my $seq = $self->get_donor_seq_from_exon($exon))) { | |
| 499 $results{'MES-NCSS_downstream_donor_seq'} = $seq; | |
| 500 $results{'MES-NCSS_downstream_donor'} = $self->score5($seq) if $seq =~ /^[ACGT]+$/; | |
| 501 } | |
| 502 | |
| 503 if (defined(my $seq = $self->get_acceptor_seq_from_exon($downstream_exon))) { | |
| 504 $results{'MES-NCSS_downstream_acceptor_seq'} = $seq; | |
| 505 $results{'MES-NCSS_downstream_acceptor'} = $self->score3($seq) if $seq =~ /^[ACGT]+$/; | |
| 506 } | |
| 507 } | |
| 508 } | |
| 509 | |
| 510 return \%results; | |
| 511 } | |
| 512 | |
| 513 | |
| 514 ## Sliding window approach methods | |
| 515 ################################## | |
| 516 | |
| 517 sub get_max_donor { | |
| 518 my ($self, $sequence) = @_; | |
| 519 | |
| 520 my ($seq, $frame, $max); | |
| 521 my @kmers = @{$self->sliding_window($sequence, 9)}; | |
| 522 | |
| 523 for my $i (0 .. $#kmers) { | |
| 524 my $kmer = $kmers[$i]; | |
| 525 my $score = $self->score5($kmer); | |
| 526 if(!$max || $score > $max) { | |
| 527 $seq = $kmer; | |
| 528 $frame = $i + 1; | |
| 529 $max = $score; | |
| 530 } | |
| 531 } | |
| 532 return [$seq, $frame, $max]; | |
| 533 } | |
| 534 | |
| 535 sub get_max_acceptor { | |
| 536 my ($self, $sequence) = @_; | |
| 537 | |
| 538 my ($seq, $frame, $max); | |
| 539 my @kmers = @{$self->sliding_window($sequence, 23)}; | |
| 540 | |
| 541 for my $i (0 .. $#kmers) { | |
| 542 my $kmer = $kmers[$i]; | |
| 543 my $score = $self->score3($kmer); | |
| 544 if(!$max || $score > $max) { | |
| 545 $seq = $kmer; | |
| 546 $frame = $i + 1; | |
| 547 $max = $score; | |
| 548 } | |
| 549 } | |
| 550 return [$seq, $frame, $max]; | |
| 551 } | |
| 552 | |
| 553 sub sliding_window { | |
| 554 my ($self, $sequence, $winsize) = @_; | |
| 555 my @seqs; | |
| 556 for (my $i = 1; $i <= length($sequence) - $winsize + 1; $i++) { | |
| 557 push @seqs, substr($sequence, $i - 1, $winsize); | |
| 558 } | |
| 559 return \@seqs; | |
| 560 } | |
| 561 | |
| 562 | |
| 563 ## Nearest canonical splice site methods | |
| 564 ######################################## | |
| 565 | |
| 566 sub get_donor_seq_from_exon { | |
| 567 my ($self, $exon) = @_; | |
| 568 | |
| 569 my ($start, $end); | |
| 570 | |
| 571 if ($exon->strand > 0) { | |
| 572 ($start, $end) = ($exon->end - 2, $exon->end + 6); | |
| 573 } | |
| 574 else { | |
| 575 ($start, $end) = ($exon->start - 6, $exon->start + 2); | |
| 576 } | |
| 577 | |
| 578 my $slice = $exon->slice()->sub_Slice($start, $end, $exon->strand); | |
| 579 my $seq = $slice->seq() if defined($slice); | |
| 580 | |
| 581 return $seq; | |
| 582 } | |
| 583 | |
| 584 sub get_acceptor_seq_from_exon { | |
| 585 my ($self, $exon) = @_; | |
| 586 | |
| 587 my ($start, $end); | |
| 588 | |
| 589 if ($exon->strand > 0) { | |
| 590 ($start, $end) = ($exon->start - 20, $exon->start + 2); | |
| 591 } | |
| 592 else { | |
| 593 ($start, $end) = ($exon->end - 2, $exon->end + 20); | |
| 594 } | |
| 595 | |
| 596 my $slice = $exon->slice()->sub_Slice($start, $end, $exon->strand); | |
| 597 my $seq = $slice->seq() if defined($slice); | |
| 598 | |
| 599 return $seq; | |
| 600 } | |
| 601 | |
| 602 sub get_donor_seq_from_intron { | |
| 603 my ($self, $intron) = @_; | |
| 604 | |
| 605 my ($start, $end); | |
| 606 | |
| 607 if ($intron->strand > 0) { | |
| 608 ($start, $end) = ($intron->start - 3, $intron->start + 5); | |
| 609 } | |
| 610 else { | |
| 611 ($start, $end) = ($intron->end - 5, $intron->end + 3); | |
| 612 } | |
| 613 | |
| 614 my $slice = $intron->slice()->sub_Slice($start, $end, $intron->strand); | |
| 615 my $seq = $slice->seq() if defined($slice); | |
| 616 | |
| 617 return $seq; | |
| 618 } | |
| 619 | |
| 620 sub get_acceptor_seq_from_intron { | |
| 621 my ($self, $intron) = @_; | |
| 622 | |
| 623 my ($start, $end); | |
| 624 | |
| 625 if ($intron->strand > 0) { | |
| 626 ($start, $end) = ($intron->end - 19, $intron->end + 3); | |
| 627 } | |
| 628 else { | |
| 629 ($start, $end) = ($intron->start - 3, $intron->start + 19); | |
| 630 } | |
| 631 | |
| 632 my $slice = $intron->slice()->sub_Slice($start, $end, $intron->strand); | |
| 633 my $seq = $slice->seq() if defined($slice); | |
| 634 | |
| 635 return $seq; | |
| 636 } | |
| 637 | |
| 638 | |
| 639 ## Common methods | |
| 640 ################# | |
| 641 | |
| 642 sub get_seqs { | |
| 643 my ($self, $tva, $start, $end) = @_; | |
| 644 my $vf = $tva->variation_feature; | |
| 645 | |
| 646 my $tr_strand = $tva->transcript->strand; | |
| 647 | |
| 648 my $ref_slice = $vf->{slice}->sub_Slice($start, $end, $tr_strand); | |
| 649 | |
| 650 my ($ref_seq, $alt_seq); | |
| 651 | |
| 652 if (defined $ref_slice) { | |
| 653 | |
| 654 $ref_seq = $alt_seq = $ref_slice->seq(); | |
| 655 | |
| 656 my $substr_start = $tr_strand > 0 ? $vf->{start} - $start : $end - $vf->{end}; | |
| 657 my $feature_seq = $tva->seq_length > 0 ? $tva->feature_seq : ''; | |
| 658 | |
| 659 substr($alt_seq, $substr_start, ($vf->{end} - $vf->{start}) + 1) = $feature_seq; | |
| 660 } | |
| 661 | |
| 662 return [$ref_seq, $alt_seq]; | |
| 663 } | |
| 664 | |
| 665 sub score5 { | |
| 666 my $self = shift; | |
| 667 my $seq = shift; | |
| 668 my $hex = md5_hex($seq); | |
| 669 | |
| 670 # check cache | |
| 671 if($self->{cache}) { | |
| 672 my ($res) = grep {$_->{hex} eq $hex} @{$self->{cache}->{score5}}; | |
| 673 | |
| 674 return $res->{score} if $res; | |
| 675 } | |
| 676 | |
| 677 my $a = $self->score5_scoreconsensus($seq); | |
| 678 die("ERROR: No score5_scoreconsensus\n") unless defined($a); | |
| 679 | |
| 680 my $b = $self->score5_getrest($seq); | |
| 681 die("ERROR: No score5_getrest\n") unless defined($b); | |
| 682 | |
| 683 my $c = $self->{'score5_seq'}->{$b}; | |
| 684 die("ERROR: No score5_seq for $b\n") unless defined($c); | |
| 685 | |
| 686 my $d = $self->{'score5_me2x5'}->{$c}; | |
| 687 die("ERROR: No score5_me2x5 for $c\n") unless defined($d); | |
| 688 | |
| 689 my $score = $self->log2($a * $d); | |
| 690 | |
| 691 # cache it | |
| 692 push @{$self->{cache}->{score5}}, { hex => $hex, score => $score }; | |
| 693 shift @{$self->{cache}->{score5}} while scalar @{$self->{cache}->{score5}} > $CACHE_SIZE; | |
| 694 | |
| 695 return $score; | |
| 696 } | |
| 697 | |
| 698 sub score3 { | |
| 699 my $self = shift; | |
| 700 my $seq = shift; | |
| 701 my $hex = md5_hex($seq); | |
| 702 | |
| 703 # check cache | |
| 704 if($self->{cache}) { | |
| 705 my ($res) = grep {$_->{hex} eq $hex} @{$self->{cache}->{score3}}; | |
| 706 | |
| 707 return $res->{score} if $res; | |
| 708 } | |
| 709 | |
| 710 my $a = $self->score3_scoreconsensus($seq); | |
| 711 die("ERROR: No score3_scoreconsensus\n") unless defined($a); | |
| 712 | |
| 713 my $b = $self->score3_getrest($seq); | |
| 714 die("ERROR: No score3_getrest\n") unless defined($b); | |
| 715 | |
| 716 my $c = $self->score3_maxentscore($b, $self->{'score3_metables'}); | |
| 717 die("ERROR: No score3_maxentscore for $b\n") unless defined($c); | |
| 718 | |
| 719 my $score = $self->log2($a * $c); | |
| 720 | |
| 721 # cache it | |
| 722 push @{$self->{cache}->{score3}}, { hex => $hex, score => $score }; | |
| 723 shift @{$self->{cache}->{score3}} while scalar @{$self->{cache}->{score3}} > $CACHE_SIZE; | |
| 724 | |
| 725 return $score; | |
| 726 } | |
| 727 | |
| 728 | |
| 729 ## methods copied from score5.pl | |
| 730 ################################ | |
| 731 | |
| 732 sub score5_makesequencematrix { | |
| 733 my $self = shift; | |
| 734 my $file = shift; | |
| 735 my %matrix; | |
| 736 my $n=0; | |
| 737 open(SCOREF, $file) || die "Can't open $file!\n"; | |
| 738 while(<SCOREF>) { | |
| 739 chomp; | |
| 740 $_=~ s/\s//; | |
| 741 $matrix{$_} = $n; | |
| 742 $n++; | |
| 743 } | |
| 744 close(SCOREF); | |
| 745 return \%matrix; | |
| 746 } | |
| 747 | |
| 748 sub score5_makescorematrix { | |
| 749 my $self = shift; | |
| 750 my $file = shift; | |
| 751 my %matrix; | |
| 752 my $n=0; | |
| 753 open(SCOREF, $file) || die "Can't open $file!\n"; | |
| 754 while(<SCOREF>) { | |
| 755 chomp; | |
| 756 $_=~ s/\s//; | |
| 757 $matrix{$n} = $_; | |
| 758 $n++; | |
| 759 } | |
| 760 close(SCOREF); | |
| 761 return \%matrix; | |
| 762 } | |
| 763 | |
| 764 sub score5_getrest { | |
| 765 my $self = shift; | |
| 766 my $seq = shift; | |
| 767 my @seqa = split(//,uc($seq)); | |
| 768 return $seqa[0].$seqa[1].$seqa[2].$seqa[5].$seqa[6].$seqa[7].$seqa[8]; | |
| 769 } | |
| 770 | |
| 771 sub score5_scoreconsensus { | |
| 772 my $self = shift; | |
| 773 my $seq = shift; | |
| 774 my @seqa = split(//,uc($seq)); | |
| 775 my %bgd; | |
| 776 $bgd{'A'} = 0.27; | |
| 777 $bgd{'C'} = 0.23; | |
| 778 $bgd{'G'} = 0.23; | |
| 779 $bgd{'T'} = 0.27; | |
| 780 my %cons1; | |
| 781 $cons1{'A'} = 0.004; | |
| 782 $cons1{'C'} = 0.0032; | |
| 783 $cons1{'G'} = 0.9896; | |
| 784 $cons1{'T'} = 0.0032; | |
| 785 my %cons2; | |
| 786 $cons2{'A'} = 0.0034; | |
| 787 $cons2{'C'} = 0.0039; | |
| 788 $cons2{'G'} = 0.0042; | |
| 789 $cons2{'T'} = 0.9884; | |
| 790 my $addscore = $cons1{$seqa[3]}*$cons2{$seqa[4]}/($bgd{$seqa[3]}*$bgd{$seqa[4]}); | |
| 791 return $addscore; | |
| 792 } | |
| 793 | |
| 794 sub log2 { | |
| 795 my ($self, $val) = @_; | |
| 796 return log($val)/log(2); | |
| 797 } | |
| 798 | |
| 799 | |
| 800 ## methods copied from score3.pl | |
| 801 ################################ | |
| 802 | |
| 803 sub score3_hashseq { | |
| 804 #returns hash of sequence in base 4 | |
| 805 # $self->score3_hashseq('CAGAAGT') returns 4619 | |
| 806 my $self = shift; | |
| 807 my $seq = shift; | |
| 808 $seq = uc($seq); | |
| 809 $seq =~ tr/ACGT/0123/; | |
| 810 my @seqa = split(//,$seq); | |
| 811 my $sum = 0; | |
| 812 my $len = length($seq); | |
| 813 my @four = (1,4,16,64,256,1024,4096,16384); | |
| 814 my $i=0; | |
| 815 while ($i<$len) { | |
| 816 $sum+= $seqa[$i] * $four[$len - $i -1] ; | |
| 817 $i++; | |
| 818 } | |
| 819 return $sum; | |
| 820 } | |
| 821 | |
| 822 sub score3_makemaxentscores { | |
| 823 my $self = shift; | |
| 824 my $dir = $self->{'_dir'}."/splicemodels/"; | |
| 825 my @list = ('me2x3acc1','me2x3acc2','me2x3acc3','me2x3acc4', | |
| 826 'me2x3acc5','me2x3acc6','me2x3acc7','me2x3acc8','me2x3acc9'); | |
| 827 my @metables; | |
| 828 my $num = 0 ; | |
| 829 foreach my $file (@list) { | |
| 830 my $n = 0; | |
| 831 open (SCOREF,"<".$dir.$file) || die "Can't open $file!\n"; | |
| 832 while(<SCOREF>) { | |
| 833 chomp; | |
| 834 $_=~ s/\s//; | |
| 835 $metables[$num]{$n} = $_; | |
| 836 $n++; | |
| 837 } | |
| 838 close(SCOREF); | |
| 839 #print STDERR $file."\t".$num."\t".$n."\n"; | |
| 840 $num++; | |
| 841 } | |
| 842 return \@metables; | |
| 843 } | |
| 844 | |
| 845 sub score3_maxentscore { | |
| 846 my $self = shift; | |
| 847 my $seq = shift; | |
| 848 my $table_ref = shift; | |
| 849 my @metables = @$table_ref; | |
| 850 my @sc; | |
| 851 $sc[0] = $metables[0]{$self->score3_hashseq(substr($seq,0,7))}; | |
| 852 $sc[1] = $metables[1]{$self->score3_hashseq(substr($seq,7,7))}; | |
| 853 $sc[2] = $metables[2]{$self->score3_hashseq(substr($seq,14,7))}; | |
| 854 $sc[3] = $metables[3]{$self->score3_hashseq(substr($seq,4,7))}; | |
| 855 $sc[4] = $metables[4]{$self->score3_hashseq(substr($seq,11,7))}; | |
| 856 $sc[5] = $metables[5]{$self->score3_hashseq(substr($seq,4,3))}; | |
| 857 $sc[6] = $metables[6]{$self->score3_hashseq(substr($seq,7,4))}; | |
| 858 $sc[7] = $metables[7]{$self->score3_hashseq(substr($seq,11,3))}; | |
| 859 $sc[8] = $metables[8]{$self->score3_hashseq(substr($seq,14,4))}; | |
| 860 my $finalscore = $sc[0] * $sc[1] * $sc[2] * $sc[3] * $sc[4] / ($sc[5] * $sc[6] * $sc[7] * $sc[8]); | |
| 861 return $finalscore; | |
| 862 } | |
| 863 | |
| 864 sub score3_getrest { | |
| 865 my $self = shift; | |
| 866 my $seq = shift; | |
| 867 my $seq_noconsensus = substr($seq,0,18).substr($seq,20,3); | |
| 868 return $seq_noconsensus; | |
| 869 } | |
| 870 | |
| 871 sub score3_scoreconsensus { | |
| 872 my $self = shift; | |
| 873 my $seq = shift; | |
| 874 my @seqa = split(//,uc($seq)); | |
| 875 my %bgd; | |
| 876 $bgd{'A'} = 0.27; | |
| 877 $bgd{'C'} = 0.23; | |
| 878 $bgd{'G'} = 0.23; | |
| 879 $bgd{'T'} = 0.27; | |
| 880 my %cons1; | |
| 881 $cons1{'A'} = 0.9903; | |
| 882 $cons1{'C'} = 0.0032; | |
| 883 $cons1{'G'} = 0.0034; | |
| 884 $cons1{'T'} = 0.0030; | |
| 885 my %cons2; | |
| 886 $cons2{'A'} = 0.0027; | |
| 887 $cons2{'C'} = 0.0037; | |
| 888 $cons2{'G'} = 0.9905; | |
| 889 $cons2{'T'} = 0.0030; | |
| 890 my $addscore = $cons1{$seqa[18]} * $cons2{$seqa[19]}/ ($bgd{$seqa[18]} * $bgd{$seqa[19]}); | |
| 891 return $addscore; | |
| 892 } | |
| 893 | |
| 894 1; | |
| 895 | 
