comparison variant_effect_predictor/Bio/Assembly/Scaffold.pm @ 0:2bc9b66ada89 draft default tip

Uploaded
author mahtabm
date Thu, 11 Apr 2013 06:29:17 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:2bc9b66ada89
1 # $Id: Scaffold.pm,v 1.2 2002/11/11 18:16:30 lapp Exp $
2 #
3 # BioPerl module for Bio::Assembly::Scaffold
4 #
5 # Copyright by Robson F. de Souza
6 #
7 # You may distribute this module under the same terms as perl itself
8 #
9 # POD documentation - main docs before the code
10
11 =head1 NAME
12
13 Bio::Assembly::Scaffold - Perl module to hold and manipulate sequence assembly data.
14
15 =head1 SYNOPSYS
16
17 # Module loading
18 use Bio::Assembly::IO;
19
20 # Assembly loading methods
21 my $aio = new Bio::Assembly::IO(-file=>"test.ace.1", -format=>'phrap');
22 my $assembly = $aio->next_assembly;
23
24 foreach my $contig ($assembly->all_contigs) {
25 # do something... (see Bio::Assembly::Contig)
26 }
27
28 =head1 DESCRIPTION
29
30 Bio::Assembly::Scaffold was developed to store and manipulate data
31 from sequence assembly programs like Phrap. It implements the
32 ScaffoldI interface and intends to be generic enough to be used by
33 Bio::Assembly::IO drivers written to programs other than Phrap.
34
35 =head1 FEEDBACK
36
37 =head2 Mailing Lists
38
39 User feedback is an integral part of the evolution of this and other
40 Bioperl modules. Send your comments and suggestions preferably to the
41 Bioperl mailing lists Your participation is much appreciated.
42
43 bioperl-l@bioperl.org - General discussion
44 http://bio.perl.org/MailList.html - About the mailing lists
45
46 =head2 Reporting Bugs
47
48 Report bugs to the Bioperl bug tracking system to help us keep track
49 the bugs and their resolution. Bug reports can be submitted via email
50 or the web:
51
52 bioperl-bugs@bio.perl.org
53 http://bugzilla.bioperl.org/
54
55 =head1 AUTHOR - Robson Francisco de Souza
56
57 rfsouza@citri.iq.usp.br
58
59 =head1 APPENDIX
60
61 The rest of the documentation details each of the object
62 methods. Internal methods are usually preceded with a _
63
64 =cut
65
66 package Bio::Assembly::Scaffold;
67
68 use strict;
69 use vars qw(@ISA $VERSION);
70
71 use Bio::Root::Root;
72 use Bio::Assembly::ScaffoldI;
73 use Bio::Annotation::Collection;
74
75 $VERSION = '0.0.1';
76 @ISA = qw(Bio::Root::Root Bio::Assembly::ScaffoldI);
77
78 =head2 new ()
79
80 Title : new
81 Usage : $assembly = new (-source=>'program_name',
82 -contigs=>\@contigs,
83 -id=>"assembly 1");
84 Function: creates a new assembly object
85 Returns :
86 Args :
87 -source : [string] sequence assembly program
88 -contigs : reference to array of
89 Bio::Assembly::Contig objects
90 -id : [string] assembly name
91
92 =cut
93
94 sub new {
95 my($class,@args) = @_;
96
97 my $self = $class->SUPER::new(@args);
98
99 my ($src,$contigs,$id) = $self->_rearrange([qw(SOURCE CONTIGS ID)], @args);
100
101 $self->{'_contigs'} = {};
102 $self->{'_singlets'} = {};
103 $self->{'_seqs'} = {};
104 $self->{'_annotation'} = Bio::Annotation::Collection->new();
105 $self->{'_id'} = 'NoName';
106
107 if (defined $contigs && ref($contigs = 'ARRAY')) {
108 foreach my $contig (@{$contigs}) {
109 $self->add_contig($contig);
110 }
111 }
112
113 $self->{'_id'} = $id if (defined $id);
114
115 return $self;
116 }
117
118 =head1 Accessing general assembly data
119
120 =cut
121
122 =head2 id
123
124 Title : id
125 Usage : $assembly->id()
126 Function: Get/Set assembly ID
127 Returns : string or undef
128 Args : string
129
130 =cut
131
132 sub id {
133 my $self = shift;
134 my $id = shift;
135
136 $self->{'_id'} = $id if (defined $id);
137
138 return $self->{'_id'};
139 }
140
141 =head2 annotation
142
143 Title : annotation
144 Usage : $assembly->annotation()
145 Function: Get/Set assembly annotation object
146 Returns : Bio::Annotation::Collection
147 Args : none
148
149 =cut
150
151 sub annotation {
152 my ($self,$ref) = shift;
153
154 $self->{'_annotation'} = $ref if (defined $ref);
155 return $self->{'_annotation'};
156 }
157
158 =head2 get_nof_contigs
159
160 Title : get_nof_contigs
161 Usage : $assembly->get_nof_contigs()
162 Function: Get the number of contigs included in the assembly
163 Returns : integer
164 Args : none
165
166 =cut
167
168 sub get_nof_contigs {
169 my $self = shift;
170
171 return scalar( $self->get_contig_ids() );
172 }
173
174 =head2 get_nof_sequences_in_contigs
175
176 Title : get_nof_sequences_in_contigs
177 Usage : $assembly->get_nof_sequences_in_contigs()
178 Function:
179
180 Get the number of sequences included in the
181 assembly. This number refers only to the sequences used
182 to build contigs in this assembly. It does not includes
183 contig consensus sequences or singlets.
184
185 Returns : integer
186 Args : none
187
188 =cut
189
190 sub get_nof_sequences_in_contigs {
191 my $self = shift;
192
193 my $nof_seqs = 0;
194 foreach my $contig ($self->all_contigs) {
195 $nof_seqs += scalar( $contig->get_seq_ids() );
196 }
197
198 return $nof_seqs;
199 }
200
201 =head2 get_nof_singlets
202
203 Title : nof_singlets
204 Usage : $assembly->nof_singlets()
205 Function: Get the number of singlets included in the assembly
206 Returns : integer
207 Args : none
208
209 =cut
210
211 sub get_nof_singlets {
212 my $self = shift;
213
214 return scalar( $self->get_singlet_ids() );
215 }
216
217 =head2 get_seq_ids
218
219 Title : get_seq_ids
220 Usage : $assembly->get_seq_ids()
221 Function:
222
223 Get the ID of sequences from all contigs. This list
224 refers only to the aligned sequences in contigs. It does
225 not includes contig consensus sequences or singlets.
226
227 Returns : array of strings
228 Args : none
229
230 =cut
231
232 sub get_seq_ids {
233 my $self = shift;
234
235 return keys %{ $self->{'_seqs'} };
236 }
237
238 =head2 get_contig_ids
239
240 Title : get_contig_ids
241 Usage : $assembly->get_contig_ids()
242 Function: Access list of contig IDs from assembly
243 Returns : an array, if there are any contigs in the
244 assembly. An empty array otherwise
245 Args : none
246
247 =cut
248
249 sub get_contig_ids {
250 my $self = shift;
251
252 return sort keys %{$self->{'_contigs'}};
253 }
254
255 =head2 get_singlet_ids
256
257 Title : get_singlet_ids
258 Usage : $assembly->get_singlet_ids()
259 Function: Access list of singlet IDs from assembly
260 Returns : array of strings if there are any singlets
261 otherwise an empty array
262 Args : none
263
264 =cut
265
266 sub get_singlet_ids {
267 my $self = shift;
268
269 return sort keys %{$self->{'_singlets'}};
270 }
271
272 =head2 get_seq_by_id
273
274 Title : get_seq_by_id
275 Usage : $assembly->get_seq_by_id($id)
276 Function:
277
278 Get a reference for an aligned sequence
279 This sequence must be part of a contig
280 in the assembly.
281
282 Returns : a Bio::LocatableSeq object
283 undef if sequence $id is not found
284 in any contig
285 Args : [string] sequence identifier (id)
286
287 =cut
288
289 sub get_seq_by_id {
290 my $self = shift;
291 my $seqID = shift;
292
293 return undef unless (exists $self->{'_seqs'}{$seqID});
294
295 return $self->{'_seqs'}{$seqID}->get_seq_by_name($seqID);
296 }
297
298 =head2 get_contig_by_id
299
300 Title : get_contig_by_id
301 Usage : $assembly->get_contig_by_id($id)
302 Function: Get a reference for a contig
303 Returns : a Bio::Assembly::Contig object or undef
304 Args : [string] contig unique identifier (ID)
305
306 =cut
307
308 sub get_contig_by_id {
309 my $self = shift;
310 my $contigID = shift;
311
312 return undef unless (exists $self->{'_contigs'}{$contigID});
313
314 return $self->{'_contigs'}{$contigID};
315 }
316
317 =head2 get_singlet_by_id
318
319 Title : get_singlet_by_id
320 Usage : $assembly->get_singlet_by_id()
321 Function: Get a reference for a singlet
322 Returns : Bio::PrimarySeqI object or undef
323 Args : [string] a singlet ID
324
325 =cut
326
327 sub get_singlet_by_id {
328 my $self = shift;
329
330 my $singletID = shift;
331
332 return undef unless (exists $self->{'_singlets'}{$singletID});
333
334 return $self->{'_singlets'}{$singletID};
335 }
336
337 =head1 Modifier methods
338
339 =cut
340
341 =head2 add_contig
342
343 Title : add_contig
344 Usage : $assembly->add_contig($contig)
345 Function: Add a contig to the assembly
346 Returns : 1 on success
347 Args : a Bio::Assembly::Contig object
348 order (optional)
349
350 =cut
351
352 sub add_contig {
353 my $self = shift;
354 my $contig = shift;
355
356 if( !ref $contig || ! $contig->isa('Bio::Assembly::Contig') ) {
357 $self->throw("Unable to process non Bio::Assembly::Contig object [", ref($contig), "]");
358 }
359 my $contigID = $contig->id();
360 if( !defined $contigID ) {
361 $contigID = 'Unknown_' . ($self->get_nof_contigs() + 1);
362 $contig->id($contigID);
363 $self->warn("Attributing ID $contigID to unidentified Bio::Assembly::Contig object.");
364 }
365
366 $self->warn("Replacing contig $contigID with a new contig object")
367 if (exists $self->{'_contigs'}{$contigID});
368 $self->{'_contigs'}{$contigID} = $contig;
369
370 foreach my $seqID ($contig->get_seq_ids()) {
371 if (exists $self->{'_seqs'}{$seqID}) {
372 $self->warn( "Sequence $seqID already assigned to contig ".
373 $self->{'_seqs'}{$seqID}->id().". Moving to contig $contigID")
374 unless ($self->{'_seqs'}{$seqID} eq $contig);
375 }
376 $self->{'_seqs'}{$seqID} = $contig;
377 }
378
379 return 1;
380 }
381
382 =head2 add_singlet
383
384 Title : add_singlet
385 Usage : $assembly->add_singlet($seq)
386 Function: Add a singlet to the assembly
387 Returns : 1 on success, 0 otherwise
388 Args : a Bio::PrimarySeqI object
389 order (optional)
390
391 =cut
392
393 sub add_singlet {
394 my $self = shift;
395 my $singlet = shift;
396
397 if( !ref $singlet || ! $singlet->isa('Bio::PrimarySeqI') ) {
398 $self->warn("Unable to process non Bio::SeqI object [", ref($singlet), "]");
399 return 0;
400 }
401
402 my $singletID = $singlet->id();
403 $self->warn("Replacing singlet $singletID wih a new sequence object")
404 if (exists $self->{'_contigs'}{$singletID});
405 $self->{'_singlets'}{$singletID} = $singlet;
406
407 return 1;
408 }
409
410 =head2 update_seq_list
411
412 Title : update_seq_list
413 Usage : $assembly->update_seq_list()
414 Function:
415
416 Synchronizes the assembly registry for sequences in
417 contigs and contig actual aligned sequences content. You
418 probably want to run this after you remove/add a
419 sequence from/to a contig in the assembly.
420
421 Returns : nothing
422 Args : none
423
424 =cut
425
426 sub update_seq_list {
427 my $self = shift;
428
429 $self->{'_seqs'} = {};
430 foreach my $contig ($self->all_contigs) {
431 foreach my $seqID ($contig->get_seq_ids) {
432 $self->{'_seqs'}{$seqID} = $contig;
433 }
434 }
435
436 return 1;
437 }
438
439 =head2 remove_contigs
440
441 Title : remove_contigs
442 Usage : $assembly->remove_contigs(1..4)
443 Function: Remove contig from assembly object
444 Returns : an array of removed Bio::Assembly::Contig
445 objects
446 Args : an array of contig IDs
447
448 See function get_contig_ids() above
449
450 =cut
451
452 #---------------------
453 sub remove_contigs {
454 #---------------------
455 my ($self,@args) = @_;
456
457 my @ret = ();
458 foreach my $contigID (@args) {
459 foreach my $seqID ($self->get_contig_by_id($contigID)->get_seq_ids()) {
460 delete $self->{'_seqs'}{$seqID};
461 }
462 push(@ret,$self->{'_contigs'}{$contigID});
463 delete $self->{'_contigs'}{$contigID};
464 }
465
466 return @ret;
467 }
468
469 =head2 remove_singlets
470
471 Title : remove_singlets
472 Usage : $assembly->remove_singlets(@singlet_ids)
473 Function: Remove singlet from assembly object
474 Returns : the Bio::SeqI objects removed
475 Args : a list of singlet IDs
476
477 See function get_singlet_ids() above
478
479 =cut
480
481 #---------------------
482 sub remove_singlets {
483 #---------------------
484 my ($self,@args) = @_;
485
486 my @ret = ();
487 foreach my $singletID (@args) {
488 push(@ret,$self->{'_singlets'}{$singletID});
489 delete $self->{'_singlets'}{$singletID};
490 }
491
492 return @ret;
493 }
494
495 =head1 Contig and singlet selection methos
496
497 =cut
498
499 =head2 select_contigs
500
501 Title : select_contigs
502 Usage : $assembly->select_contigs(@list)
503 Function: Select an array of contigs from the assembly
504 Returns : an array of Bio::Assembly::Contig objects
505 Args : an array of contig ids
506
507 See function get_contig_ids() above
508
509 =cut
510
511 #---------------------
512 sub select_contigs {
513 #---------------------
514 my ($self,@args) = @_;
515
516 my @contigs = ();
517 foreach my $contig (@args) {
518 unless (exists $self->{'_contigs'}{$contig}) {
519 $self->warn("$contig contig not found. Ignoring...");
520 next;
521 }
522 push(@contigs, $self->{'_contigs'}{$contig});
523 }
524
525 return @contigs;
526 }
527
528 =head2 select_singlets
529
530 Title : select_singlets
531 Usage : $assembly->select_singlets(@list)
532 Function: Selects an array of singlets from the assembly
533 Returns : an array of Bio::SeqI objects
534 Args : an array of singlet ids
535
536 See function get_singlet_ids() above
537
538 =cut
539
540 #---------------------
541 sub select_singlets {
542 #---------------------
543 my ($self,@args) = @_;
544
545 my @singlets = ();
546 foreach my $singlet (@args) {
547 unless (exists $self->{'_singlets'}{$singlet}) {
548 $self->warn("$singlet singlet not found. Ignoring...");
549 next;
550 }
551 push(@singlets, $self->{'_singlets'}{$singlet});
552 }
553
554 return @singlets;
555 }
556
557 =head2 all_contigs
558
559 Title : all_contigs
560 Usage : my @contigs = $assembly->all_contigs
561 Function:
562
563 Returns a list of all contigs in this assembly. Contigs
564 are both clusters and alignments of one or more reads,
565 with an associated consensus sequence.
566
567 Returns : array of Bio::Assembly::Contig (in lexical id order)
568 Args : none
569
570 =cut
571
572 #---------------------
573 sub all_contigs {
574 #---------------------
575 my ($self) = @_;
576
577 my @contigs = ();
578 foreach my $contig (sort { $a cmp $b } keys %{ $self->{'_contigs'} }) {
579 push(@contigs, $self->{'_contigs'}{$contig});
580 }
581
582 return @contigs;
583 }
584
585 =head2 all_singlets
586
587 Title : all_singlets
588 Usage : my @singlets = $assembly->all_singlets
589 Function:
590
591 Returns a list of all singlets in this assembly.
592 Singlets are isolated reads, without non-vector
593 matches to any other read in the assembly.
594
595 Returns : array of Bio::SeqI (in lexical order by id)
596 Args : none
597
598 =cut
599
600 #---------------------
601 sub all_singlets {
602 #---------------------
603 my ($self) = @_;
604
605 my @singlets = ();
606 foreach my $singlet (sort { $a cmp $b } keys %{ $self->{'_singlets'} }) {
607 push(@singlets, $self->{'_singlets'}{$singlet});
608 }
609
610 return @singlets;
611 }
612
613 # =head1 Internal Methods
614
615 1;