Mercurial > repos > willmclaren > ensembl_vep
diff variant_effect_predictor/Bio/EnsEMBL/Utils/Slice.pm @ 0:21066c0abaf5 draft
Uploaded
author | willmclaren |
---|---|
date | Fri, 03 Aug 2012 10:04:48 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Utils/Slice.pm Fri Aug 03 10:04:48 2012 -0400 @@ -0,0 +1,137 @@ +=head1 LICENSE + + Copyright (c) 1999-2012 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <dev@ensembl.org>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +=head1 NAME + +Bio::EnsEMBL::Utils::Slice - Utility functions for slices + +=head1 SYNOPSIS + + use Bio::EnsEMBL::Utils::Slice qw(split_Slices); + + # ... + + # get all chromosomes in the database + my $slices = $slice_adaptor->fetch_all('chromosome'); + + # split the chromosomes into equal chunks of size less than 1MB + # with an overlap of 1kb + $slices = split_Slices( $slices, 1e6, 1e3 ); + +=head1 METHODS + +=cut + + +package Bio::EnsEMBL::Utils::Slice; + +use strict; +use warnings; + +use Exporter; + +use vars qw(@ISA @EXPORT_OK); + +@ISA = qw(Exporter); + +@EXPORT_OK = qw(&split_Slices); + +use Bio::EnsEMBL::Utils::Exception qw(throw); +use POSIX; + +=head2 split_Slices + + Arg [1] : ref to list of slices + Arg [2] : int maxlength of sub slices + Arg [3] : int overlap length (optional) + Example : my $sub_slices = split_Slices($slices,$maxlen,$overlap) + Description: splits a slice into smaller slices + Returntype : ref to list of slices + Exceptions : maxlen <1 or overlap < 0 + +=cut + +sub split_Slices{ + my ($slice_big,$max_length,$overlap)=@_; + + if(!defined($max_length) or $max_length < 1){ + throw("maxlength needs to be set and > 0"); + } + + if(!defined($overlap)){ + $overlap = 0; + } + elsif($overlap < 0){ + throw("negative overlaps not allowed"); + } + + my @out=(); + + foreach my $slice (@$slice_big){ + + my $start = $slice->start; + my $end; + my $multiple; + my $number; + my $length = $slice->length; + + if($max_length && ($length > $overlap)) { + #No seq region may be longer than max_length but we want to make + #them all similar size so that the last one isn't much shorter. + #Divide the seq_region into the largest equal pieces that are shorter + #than max_length + + #calculate number of slices to create + $number = ($length-$overlap) / ($max_length-$overlap); + $number = ceil($number); #round up to int + + #calculate length of created slices + $multiple = $length / $number; + $multiple = floor($multiple); #round down to int + } else { + #just one slice of the whole seq_region + $number = 1; + $multiple = $length; + } + + my $i; + for(my $i=0; $i < $number; $i++) { + $end = $start + $multiple + $overlap; + + #any remainder gets added to the last slice of the seq_region + $end = $slice->end if($i == $number-1); + push @out, Bio::EnsEMBL::Slice->new + (-START => $start, + -END => $end, + -STRAND => 1, + -SEQ_REGION_NAME => $slice->seq_region_name, + -SEQ_REGION_LENGTH => $slice->seq_region_length, + -COORD_SYSTEM => $slice->coord_system, + -ADAPTOR => $slice->adaptor); + $start += $multiple + 1; + } + } + + return \@out; +} + + + +1;