Mercurial > repos > mahtabm > ensemb_rep_gvl
diff variant_effect_predictor/Bio/Assembly/Scaffold.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/Assembly/Scaffold.pm Thu Apr 11 06:29:17 2013 -0400 @@ -0,0 +1,615 @@ +# $Id: Scaffold.pm,v 1.2 2002/11/11 18:16:30 lapp Exp $ +# +# BioPerl module for Bio::Assembly::Scaffold +# +# Copyright by Robson F. de Souza +# +# You may distribute this module under the same terms as perl itself +# +# POD documentation - main docs before the code + +=head1 NAME + +Bio::Assembly::Scaffold - Perl module to hold and manipulate sequence assembly data. + +=head1 SYNOPSYS + + # Module loading + use Bio::Assembly::IO; + + # Assembly loading methods + my $aio = new Bio::Assembly::IO(-file=>"test.ace.1", -format=>'phrap'); + my $assembly = $aio->next_assembly; + + foreach my $contig ($assembly->all_contigs) { + # do something... (see Bio::Assembly::Contig) + } + +=head1 DESCRIPTION + +Bio::Assembly::Scaffold was developed to store and manipulate data +from sequence assembly programs like Phrap. It implements the +ScaffoldI interface and intends to be generic enough to be used by +Bio::Assembly::IO drivers written to programs other than Phrap. + +=head1 FEEDBACK + +=head2 Mailing Lists + +User feedback is an integral part of the evolution of this and other +Bioperl modules. Send your comments and suggestions preferably to the +Bioperl mailing lists Your participation is much appreciated. + + bioperl-l@bioperl.org - General discussion + http://bio.perl.org/MailList.html - About the mailing lists + +=head2 Reporting Bugs + +Report bugs to the Bioperl bug tracking system to help us keep track +the bugs and their resolution. Bug reports can be submitted via email +or the web: + + bioperl-bugs@bio.perl.org + http://bugzilla.bioperl.org/ + +=head1 AUTHOR - Robson Francisco de Souza + +rfsouza@citri.iq.usp.br + +=head1 APPENDIX + +The rest of the documentation details each of the object +methods. Internal methods are usually preceded with a _ + +=cut + +package Bio::Assembly::Scaffold; + +use strict; +use vars qw(@ISA $VERSION); + +use Bio::Root::Root; +use Bio::Assembly::ScaffoldI; +use Bio::Annotation::Collection; + +$VERSION = '0.0.1'; +@ISA = qw(Bio::Root::Root Bio::Assembly::ScaffoldI); + +=head2 new () + + Title : new + Usage : $assembly = new (-source=>'program_name', + -contigs=>\@contigs, + -id=>"assembly 1"); + Function: creates a new assembly object + Returns : + Args : + -source : [string] sequence assembly program + -contigs : reference to array of + Bio::Assembly::Contig objects + -id : [string] assembly name + +=cut + +sub new { + my($class,@args) = @_; + + my $self = $class->SUPER::new(@args); + + my ($src,$contigs,$id) = $self->_rearrange([qw(SOURCE CONTIGS ID)], @args); + + $self->{'_contigs'} = {}; + $self->{'_singlets'} = {}; + $self->{'_seqs'} = {}; + $self->{'_annotation'} = Bio::Annotation::Collection->new(); + $self->{'_id'} = 'NoName'; + + if (defined $contigs && ref($contigs = 'ARRAY')) { + foreach my $contig (@{$contigs}) { + $self->add_contig($contig); + } + } + + $self->{'_id'} = $id if (defined $id); + + return $self; +} + +=head1 Accessing general assembly data + +=cut + +=head2 id + + Title : id + Usage : $assembly->id() + Function: Get/Set assembly ID + Returns : string or undef + Args : string + +=cut + +sub id { + my $self = shift; + my $id = shift; + + $self->{'_id'} = $id if (defined $id); + + return $self->{'_id'}; +} + +=head2 annotation + + Title : annotation + Usage : $assembly->annotation() + Function: Get/Set assembly annotation object + Returns : Bio::Annotation::Collection + Args : none + +=cut + +sub annotation { + my ($self,$ref) = shift; + + $self->{'_annotation'} = $ref if (defined $ref); + return $self->{'_annotation'}; +} + +=head2 get_nof_contigs + + Title : get_nof_contigs + Usage : $assembly->get_nof_contigs() + Function: Get the number of contigs included in the assembly + Returns : integer + Args : none + +=cut + +sub get_nof_contigs { + my $self = shift; + + return scalar( $self->get_contig_ids() ); +} + +=head2 get_nof_sequences_in_contigs + + Title : get_nof_sequences_in_contigs + Usage : $assembly->get_nof_sequences_in_contigs() + Function: + + Get the number of sequences included in the + assembly. This number refers only to the sequences used + to build contigs in this assembly. It does not includes + contig consensus sequences or singlets. + + Returns : integer + Args : none + +=cut + +sub get_nof_sequences_in_contigs { + my $self = shift; + + my $nof_seqs = 0; + foreach my $contig ($self->all_contigs) { + $nof_seqs += scalar( $contig->get_seq_ids() ); + } + + return $nof_seqs; +} + +=head2 get_nof_singlets + + Title : nof_singlets + Usage : $assembly->nof_singlets() + Function: Get the number of singlets included in the assembly + Returns : integer + Args : none + +=cut + +sub get_nof_singlets { + my $self = shift; + + return scalar( $self->get_singlet_ids() ); +} + +=head2 get_seq_ids + + Title : get_seq_ids + Usage : $assembly->get_seq_ids() + Function: + + Get the ID of sequences from all contigs. This list + refers only to the aligned sequences in contigs. It does + not includes contig consensus sequences or singlets. + + Returns : array of strings + Args : none + +=cut + +sub get_seq_ids { + my $self = shift; + + return keys %{ $self->{'_seqs'} }; +} + +=head2 get_contig_ids + + Title : get_contig_ids + Usage : $assembly->get_contig_ids() + Function: Access list of contig IDs from assembly + Returns : an array, if there are any contigs in the + assembly. An empty array otherwise + Args : none + +=cut + +sub get_contig_ids { + my $self = shift; + + return sort keys %{$self->{'_contigs'}}; +} + +=head2 get_singlet_ids + + Title : get_singlet_ids + Usage : $assembly->get_singlet_ids() + Function: Access list of singlet IDs from assembly + Returns : array of strings if there are any singlets + otherwise an empty array + Args : none + +=cut + +sub get_singlet_ids { + my $self = shift; + + return sort keys %{$self->{'_singlets'}}; +} + +=head2 get_seq_by_id + + Title : get_seq_by_id + Usage : $assembly->get_seq_by_id($id) + Function: + + Get a reference for an aligned sequence + This sequence must be part of a contig + in the assembly. + + Returns : a Bio::LocatableSeq object + undef if sequence $id is not found + in any contig + Args : [string] sequence identifier (id) + +=cut + +sub get_seq_by_id { + my $self = shift; + my $seqID = shift; + + return undef unless (exists $self->{'_seqs'}{$seqID}); + + return $self->{'_seqs'}{$seqID}->get_seq_by_name($seqID); +} + +=head2 get_contig_by_id + + Title : get_contig_by_id + Usage : $assembly->get_contig_by_id($id) + Function: Get a reference for a contig + Returns : a Bio::Assembly::Contig object or undef + Args : [string] contig unique identifier (ID) + +=cut + +sub get_contig_by_id { + my $self = shift; + my $contigID = shift; + + return undef unless (exists $self->{'_contigs'}{$contigID}); + + return $self->{'_contigs'}{$contigID}; +} + +=head2 get_singlet_by_id + + Title : get_singlet_by_id + Usage : $assembly->get_singlet_by_id() + Function: Get a reference for a singlet + Returns : Bio::PrimarySeqI object or undef + Args : [string] a singlet ID + +=cut + +sub get_singlet_by_id { + my $self = shift; + + my $singletID = shift; + + return undef unless (exists $self->{'_singlets'}{$singletID}); + + return $self->{'_singlets'}{$singletID}; +} + +=head1 Modifier methods + +=cut + +=head2 add_contig + + Title : add_contig + Usage : $assembly->add_contig($contig) + Function: Add a contig to the assembly + Returns : 1 on success + Args : a Bio::Assembly::Contig object + order (optional) + +=cut + +sub add_contig { + my $self = shift; + my $contig = shift; + + if( !ref $contig || ! $contig->isa('Bio::Assembly::Contig') ) { + $self->throw("Unable to process non Bio::Assembly::Contig object [", ref($contig), "]"); + } + my $contigID = $contig->id(); + if( !defined $contigID ) { + $contigID = 'Unknown_' . ($self->get_nof_contigs() + 1); + $contig->id($contigID); + $self->warn("Attributing ID $contigID to unidentified Bio::Assembly::Contig object."); + } + + $self->warn("Replacing contig $contigID with a new contig object") + if (exists $self->{'_contigs'}{$contigID}); + $self->{'_contigs'}{$contigID} = $contig; + + foreach my $seqID ($contig->get_seq_ids()) { + if (exists $self->{'_seqs'}{$seqID}) { + $self->warn( "Sequence $seqID already assigned to contig ". + $self->{'_seqs'}{$seqID}->id().". Moving to contig $contigID") + unless ($self->{'_seqs'}{$seqID} eq $contig); + } + $self->{'_seqs'}{$seqID} = $contig; + } + + return 1; +} + +=head2 add_singlet + + Title : add_singlet + Usage : $assembly->add_singlet($seq) + Function: Add a singlet to the assembly + Returns : 1 on success, 0 otherwise + Args : a Bio::PrimarySeqI object + order (optional) + +=cut + +sub add_singlet { + my $self = shift; + my $singlet = shift; + + if( !ref $singlet || ! $singlet->isa('Bio::PrimarySeqI') ) { + $self->warn("Unable to process non Bio::SeqI object [", ref($singlet), "]"); + return 0; + } + + my $singletID = $singlet->id(); + $self->warn("Replacing singlet $singletID wih a new sequence object") + if (exists $self->{'_contigs'}{$singletID}); + $self->{'_singlets'}{$singletID} = $singlet; + + return 1; +} + +=head2 update_seq_list + + Title : update_seq_list + Usage : $assembly->update_seq_list() + Function: + + Synchronizes the assembly registry for sequences in + contigs and contig actual aligned sequences content. You + probably want to run this after you remove/add a + sequence from/to a contig in the assembly. + + Returns : nothing + Args : none + +=cut + +sub update_seq_list { + my $self = shift; + + $self->{'_seqs'} = {}; + foreach my $contig ($self->all_contigs) { + foreach my $seqID ($contig->get_seq_ids) { + $self->{'_seqs'}{$seqID} = $contig; + } + } + + return 1; +} + +=head2 remove_contigs + + Title : remove_contigs + Usage : $assembly->remove_contigs(1..4) + Function: Remove contig from assembly object + Returns : an array of removed Bio::Assembly::Contig + objects + Args : an array of contig IDs + + See function get_contig_ids() above + +=cut + +#--------------------- +sub remove_contigs { +#--------------------- + my ($self,@args) = @_; + + my @ret = (); + foreach my $contigID (@args) { + foreach my $seqID ($self->get_contig_by_id($contigID)->get_seq_ids()) { + delete $self->{'_seqs'}{$seqID}; + } + push(@ret,$self->{'_contigs'}{$contigID}); + delete $self->{'_contigs'}{$contigID}; + } + + return @ret; +} + +=head2 remove_singlets + + Title : remove_singlets + Usage : $assembly->remove_singlets(@singlet_ids) + Function: Remove singlet from assembly object + Returns : the Bio::SeqI objects removed + Args : a list of singlet IDs + + See function get_singlet_ids() above + +=cut + +#--------------------- +sub remove_singlets { +#--------------------- + my ($self,@args) = @_; + + my @ret = (); + foreach my $singletID (@args) { + push(@ret,$self->{'_singlets'}{$singletID}); + delete $self->{'_singlets'}{$singletID}; + } + + return @ret; +} + +=head1 Contig and singlet selection methos + +=cut + +=head2 select_contigs + + Title : select_contigs + Usage : $assembly->select_contigs(@list) + Function: Select an array of contigs from the assembly + Returns : an array of Bio::Assembly::Contig objects + Args : an array of contig ids + + See function get_contig_ids() above + +=cut + +#--------------------- +sub select_contigs { +#--------------------- + my ($self,@args) = @_; + + my @contigs = (); + foreach my $contig (@args) { + unless (exists $self->{'_contigs'}{$contig}) { + $self->warn("$contig contig not found. Ignoring..."); + next; + } + push(@contigs, $self->{'_contigs'}{$contig}); + } + + return @contigs; +} + +=head2 select_singlets + + Title : select_singlets + Usage : $assembly->select_singlets(@list) + Function: Selects an array of singlets from the assembly + Returns : an array of Bio::SeqI objects + Args : an array of singlet ids + + See function get_singlet_ids() above + +=cut + +#--------------------- +sub select_singlets { +#--------------------- + my ($self,@args) = @_; + + my @singlets = (); + foreach my $singlet (@args) { + unless (exists $self->{'_singlets'}{$singlet}) { + $self->warn("$singlet singlet not found. Ignoring..."); + next; + } + push(@singlets, $self->{'_singlets'}{$singlet}); + } + + return @singlets; +} + +=head2 all_contigs + + Title : all_contigs + Usage : my @contigs = $assembly->all_contigs + Function: + + Returns a list of all contigs in this assembly. Contigs + are both clusters and alignments of one or more reads, + with an associated consensus sequence. + + Returns : array of Bio::Assembly::Contig (in lexical id order) + Args : none + +=cut + +#--------------------- +sub all_contigs { +#--------------------- + my ($self) = @_; + + my @contigs = (); + foreach my $contig (sort { $a cmp $b } keys %{ $self->{'_contigs'} }) { + push(@contigs, $self->{'_contigs'}{$contig}); + } + + return @contigs; +} + +=head2 all_singlets + + Title : all_singlets + Usage : my @singlets = $assembly->all_singlets + Function: + + Returns a list of all singlets in this assembly. + Singlets are isolated reads, without non-vector + matches to any other read in the assembly. + + Returns : array of Bio::SeqI (in lexical order by id) + Args : none + +=cut + +#--------------------- +sub all_singlets { +#--------------------- + my ($self) = @_; + + my @singlets = (); + foreach my $singlet (sort { $a cmp $b } keys %{ $self->{'_singlets'} }) { + push(@singlets, $self->{'_singlets'}{$singlet}); + } + + return @singlets; +} + +# =head1 Internal Methods + +1;