comparison variant_effect_predictor/Bio/Perl.pm @ 0:2bc9b66ada89 draft default tip

Uploaded
author mahtabm
date Thu, 11 Apr 2013 06:29:17 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:2bc9b66ada89
1 # $Id: Perl.pm,v 1.16.2.1 2003/03/25 12:32:15 heikki Exp $
2 #
3 # BioPerl module for Bio::Perl
4 #
5 # Cared for by Ewan Birney <bioperl-l@bio.perl.org>
6 #
7 # Copyright Ewan Birney
8 #
9 # You may distribute this module under the same terms as perl itself
10
11 # POD documentation - main docs before the code
12
13 =head1 NAME
14
15 Bio::Perl - Functional access to BioPerl for people who don't know objects
16
17 =head1 SYNOPSIS
18
19 use Bio::Perl;
20
21 # will guess file format from extension
22 $seq_object = read_sequence($filename);
23
24 # forces genbank format
25 $seq_object = read_sequence($filename,'genbank');
26
27 # reads an array of sequences
28 @seq_object_array = read_all_sequences($filename,'fasta');
29
30 # sequences are Bio::Seq objects, so the following methods work
31 # for more info see L<Bio::Seq>, or do 'perldoc Bio/Seq.pm'
32
33 print "Sequence name is ",$seq_object->display_id,"\n";
34 print "Sequence acc is ",$seq_object->accession_number,"\n";
35 print "First 5 bases is ",$seq_object->subseq(1,5),"\n";
36
37 # get the whole sequence as a single string
38
39 $sequence_as_a_string = $seq_object->seq();
40
41 # writing sequences
42
43 write_sequence(">$filename",'genbank',$seq_object);
44
45 write_sequence(">$filename",'genbank',@seq_object_array);
46
47 # making a new sequence from just strings you have
48 # from something else
49
50 $seq_object = new_sequence("ATTGGTTTGGGGACCCAATTTGTGTGTTATATGTA",
51 "myname","AL12232");
52
53
54 # getting a sequence from a database (assumes internet connection)
55
56 $seq_object = get_sequence('swissprot',"ROA1_HUMAN");
57
58 $seq_object = get_sequence('embl',"AI129902");
59
60 $seq_object = get_sequence('genbank',"AI129902");
61
62 # BLAST a sequence (assummes an internet connection)
63
64 $blast_report = blast_sequence($seq_object);
65
66 write_blast(">blast.out",$blast_report);
67
68
69 =head1 DESCRIPTION
70
71 Easy first time access to BioPerl via functions
72
73 =head1 FEEDBACK
74
75 =head2 Mailing Lists
76
77 User feedback is an integral part of the evolution of this and other
78 Bioperl modules. Send your comments and suggestions preferably to one
79 of the Bioperl mailing lists. Your participation is much appreciated.
80
81 bioperl-l@bio.perl.org
82
83 =head2 Reporting Bugs
84
85 Report bugs to the Bioperl bug tracking system to help us keep track
86 the bugs and their resolution. Bug reports can be submitted via email
87 or the web:
88
89 bioperl-bugs@bio.perl.org
90 http://bugzilla.bioperl.org/
91
92 =head1 AUTHOR - Ewan Birney
93
94 Email bioperl-l@bio.perl.org
95
96 Describe contact details here
97
98 =head1 APPENDIX
99
100 The rest of the documentation details each of the object methods.
101 Internal methods are usually preceded with a _
102
103 =cut
104
105 #'
106 # Let the code begin...
107
108
109 package Bio::Perl;
110 use vars qw(@ISA @EXPORT @EXPORT_OK $DBOKAY);
111 use strict;
112 use Carp;
113 use Exporter;
114
115 use Bio::SeqIO;
116 use Bio::Seq;
117 BEGIN {
118 eval {
119 require Bio::DB::EMBL;
120 require Bio::DB::GenBank;
121 require Bio::DB::SwissProt;
122 require Bio::DB::RefSeq;
123 require Bio::DB::GenPept;
124 };
125 if( $@ ) {
126 $DBOKAY = 0;
127 } else {
128 $DBOKAY = 1;
129 }
130 }
131
132 @ISA = qw(Exporter);
133
134 @EXPORT = qw(read_sequence read_all_sequences write_sequence
135 new_sequence get_sequence translate translate_as_string
136 reverse_complement revcom revcom_as_string
137 reverse_complement_as_string blast_sequence write_blast);
138
139 @EXPORT_OK = @EXPORT;
140
141
142 =head2 read_sequence
143
144 Title : read_sequence
145 Usage : $seq = read_sequence('sequences.fa')
146 $seq = read_sequence($filename,'genbank');
147
148 # pipes are fine
149 $seq = read_sequence("my_fetching_program $id |",'fasta');
150
151 Function: Reads the top sequence from the file. If no format is given, it will
152 try to guess the format from the filename. If a format is given, it
153 forces that format. The filename can be any valid perl open() string
154 - in particular, you can put in pipes
155
156 Returns : A Bio::Seq object. A quick synopsis:
157 $seq_object->display_id - name of the sequence
158 $seq_object->seq - sequence as a string
159
160 Args : Two strings, first the filename - any Perl open() string is ok
161 Second string is the format, which is optional
162
163 For more information on Seq objects see L<Bio::Seq>.
164
165 =cut
166
167 sub read_sequence{
168 my ($filename,$format) = @_;
169
170 if( !defined $filename ) {
171 confess "read_sequence($filename) - usage incorrect";
172 }
173
174 my $seqio;
175
176 if( defined $format ) {
177 $seqio = Bio::SeqIO->new( '-file' => $filename, '-format' => $format);
178 } else {
179 $seqio = Bio::SeqIO->new( '-file' => $filename);
180 }
181
182 my $seq = $seqio->next_seq();
183
184 return $seq;
185 }
186
187
188 =head2 read_all_sequences
189
190 Title : read_all_sequences
191 Usage : @seq_object_array = read_all_sequences($filename);
192 @seq_object_array = read_all_sequences($filename,'genbank');
193
194 Function: Just as the function above, but reads all the sequences in the
195 file and loads them into an array.
196
197 For very large files, you will run out of memory. When this
198 happens, you've got to use the SeqIO system directly (this is
199 not so hard! Don't worry about it!).
200
201 Returns : array of Bio::Seq objects
202
203 Args : two strings, first the filename (any open() string is ok)
204 second the format (which is optional)
205
206 See L<Bio::SeqIO> and L<Bio::Seq> for more information
207
208 =cut
209
210 sub read_all_sequences{
211 my ($filename,$format) = @_;
212
213 if( !defined $filename ) {
214 confess "read_all_sequences($filename) - usage incorrect";
215 }
216
217 my $seqio;
218
219 if( defined $format ) {
220 $seqio = Bio::SeqIO->new( '-file' => $filename, '-format' => $format);
221 } else {
222 $seqio = Bio::SeqIO->new( '-file' => $filename);
223 }
224
225 my @seq_array;
226
227 while( my $seq = $seqio->next_seq() ) {
228 push(@seq_array,$seq);
229 }
230
231 return @seq_array;
232 }
233
234
235 =head2 write_sequence
236
237 Title : write_sequence
238 Usage : write_sequence(">new_file.gb",'genbank',$seq)
239 write_sequence(">new_file.gb",'genbank',@array_of_sequence_objects)
240
241 Function: writes sequences in the specified format
242
243 Returns : true
244
245 Args : filename as a string, must provide an open() output file
246 format as a string
247 one or more sequence objects
248
249
250 =cut
251
252 sub write_sequence{
253 my ($filename,$format,@sequence_objects) = @_;
254
255 if( scalar(@sequence_objects) == 0 ) {
256 confess("write_sequence(filename,format,sequence_object)");
257 }
258
259 my $error = 0;
260 my $seqname = "sequence1";
261
262 # catch users who haven't passed us a filename we can open
263 if( $filename !~ /^\>/ && $filename !~ /^|/ ) {
264 $filename = ">".$filename;
265 }
266
267 my $seqio = Bio::SeqIO->new('-file' => $filename, '-format' => $format);
268
269 foreach my $seq ( @sequence_objects ) {
270 my $seq_obj;
271
272 if( !ref $seq ) {
273 if( length $seq > 50 ) {
274 # odds are this is a sequence as a string, and someone has not figured out
275 # how to make objects. Warn him/her and then make a sequence object
276 # from this
277 if( $error == 0 ) {
278 carp("WARNING: You have put in a long string into write_sequence.\nI suspect this means that this is the actual sequence\nIn the future try the\n new_sequence method of this module to make a new sequence object.\nDoing this for you here\n");
279 $error = 1;
280 }
281
282 $seq_obj = new_sequence($seq,$seqname);
283 $seqname++;
284 } else {
285 confess("You have a non object [$seq] passed to write_sequence. It maybe that you want to use new_sequence to make this string into a sequence object?");
286 }
287 } else {
288 if( !$seq->isa("Bio::SeqI") ) {
289 confess("object [$seq] is not a Bio::Seq object; can't write it out");
290 }
291 $seq_obj = $seq;
292 }
293
294 # finally... we get to write out the sequence!
295 $seqio->write_seq($seq_obj);
296 }
297 1;
298 }
299
300 =head2 new_sequence
301
302 Title : new_sequence
303 Usage :
304 Function:
305 Example :
306 Returns :
307 Args :
308
309
310 =cut
311
312 sub new_sequence{
313 my ($seq,$name,$accession) = @_;
314
315 if( !defined $seq ) {
316 confess("new_sequence(sequence_as_string) usage");
317 }
318
319 $name ||= "no-name-for-sequence";
320
321 my $seq_object = Bio::Seq->new( -seq => $seq, -id => $name);
322
323 $accession && $seq_object->accession_number($accession);
324
325 return $seq_object;
326 }
327
328 =head2 blast_sequence
329
330 Title : blast_sequence
331 Usage : $blast_result = blast_sequence($seq)
332 $blast_result = blast_sequence('MFVEGGTFASEDDDSASAEDE');
333
334 Function: If the computer has Internet accessibility, blasts
335 the sequence using the NCBI BLAST server against nrdb.
336
337 It choose the flavour of BLAST on the basis of the sequence.
338
339 This function uses Bio::Tools::Run::RemoteBlast, which itself
340 use Bio::SearchIO - as soon as you want to more, check out
341 these modules
342 Returns : Bio::Search::Result::GenericResult.pm
343
344 Args : Either a string of protein letters or nucleotides, or a
345 Bio::Seq object
346
347 =cut
348
349 sub blast_sequence {
350 my ($seq,$verbose) = shift;
351
352 if( !defined $verbose ) {
353 $verbose = 1;
354 }
355
356 if( !ref $seq ) {
357 $seq = Bio::Seq->new( -seq => $seq, -id => 'blast-sequence-temp-id');
358 } elsif ( !$seq->isa('Bio::PrimarySeqI') ) {
359 croak("[$seq] is an object, but not a Bio::Seq object, cannot be blasted");
360 }
361
362 require Bio::Tools::Run::RemoteBlast;
363
364 my $prog = 'blastp';
365 my $e_val= '1e-10';
366
367 my @params = ( '-prog' => $prog,
368 '-expect' => $e_val,
369 '-readmethod' => 'SearchIO' );
370
371 my $factory = Bio::Tools::Run::RemoteBlast->new(@params);
372
373 my $r = $factory->submit_blast($seq);
374 if( $verbose ) {
375 print STDERR "Submitted Blast for [".$seq->id."] ";
376 }
377 sleep 5;
378
379 my $result;
380
381 LOOP :
382 while( my @rids = $factory->each_rid) {
383 foreach my $rid ( @rids ) {
384 my $rc = $factory->retrieve_blast($rid);
385 if( !ref($rc) ) {
386 if( $rc < 0 ) {
387 $factory->remove_rid($rid);
388 }
389 if( $verbose ) {
390 print STDERR ".";
391 }
392 sleep 10;
393 } else {
394 $result = $rc->next_result();
395 $factory->remove_rid($rid);
396 last LOOP;
397 }
398 }
399 }
400
401 if( $verbose ) {
402 print STDERR "\n";
403 }
404 return $result;
405 }
406
407 =head2 write_blast
408
409 Title : write_blast
410 Usage : write_blast($filename,$blast_report);
411
412 Function: Writes a BLAST result object (or more formally
413 a SearchIO result object) out to a filename
414 in BLAST-like format
415
416 Returns : none
417
418 Args : filename as a string
419 Bio::SearchIO::Results object
420
421 =cut
422
423 sub write_blast {
424 my ($filename,$blast) = @_;
425
426 if( $filename !~ /^\>/ && $filename !~ /^|/ ) {
427 $filename = ">".$filename;
428 }
429
430 my $output = Bio::SearchIO->new( -output_format => 'blast', -file => $filename);
431
432 $output->write_result($blast);
433
434 }
435
436 =head2 get_sequence
437
438 Title : get_sequence
439 Usage : $seq_object = get_sequence('swiss',"ROA1_HUMAN");
440
441 Function: If the computer has Internet accessibility, gets
442 the sequence from Internet accessible databases. Currently
443 this supports Swissprot, EMBL, GenBank and RefSeq.
444
445 Swissprot and EMBL are more robust than GenBank fetching.
446
447 If the user is trying to retrieve a RefSeq entry from
448 GenBank/EMBL, the query is silently redirected.
449
450 Returns : A Bio::Seq object
451
452 Args : database type - one of swiss, embl, genbank or refseq
453 identifier or accession number
454
455 =cut
456
457 my $genbank_db = undef;
458 my $genpept_db = undef;
459 my $embl_db = undef;
460 my $swiss_db = undef;
461 my $refseq_db = undef;
462
463 sub get_sequence{
464 my ($db_type,$identifier) = @_;
465 if( ! $DBOKAY ) {
466 confess("Your system does not have IO::String installed so the DB retrieval method is not available");
467 return;
468 }
469 $db_type = lc($db_type);
470
471 my $db;
472
473 if( $db_type =~ /genbank/ ) {
474 if( !defined $genbank_db ) {
475 $genbank_db = Bio::DB::GenBank->new();
476 }
477 $db = $genbank_db;
478 }
479 if( $db_type =~ /genpept/ ) {
480 if( !defined $genpept_db ) {
481 $genpept_db = Bio::DB::GenPept->new();
482 }
483 $db = $genpept_db;
484 }
485
486 if( $db_type =~ /swiss/ ) {
487 if( !defined $swiss_db ) {
488 $swiss_db = Bio::DB::SwissProt->new();
489 }
490 $db = $swiss_db;
491 }
492
493 if( $db_type =~ /embl/ ) {
494 if( !defined $embl_db ) {
495 $embl_db = Bio::DB::EMBL->new();
496 }
497 $db = $embl_db;
498 }
499
500 if( $db_type =~ /refseq/ or ($db_type !~ /swiss/ and
501 $identifier =~ /^\s*N\S+_/)) {
502 if( !defined $refseq_db ) {
503 $refseq_db = Bio::DB::RefSeq->new();
504 }
505 $db = $refseq_db;
506 }
507
508 my $seq;
509
510 if( $identifier =~ /^\w+\d+$/ ) {
511 $seq = $db->get_Seq_by_acc($identifier);
512 } else {
513 $seq = $db->get_Seq_by_id($identifier);
514 }
515
516 return $seq;
517 }
518
519
520 =head2 translate
521
522 Title : translate
523 Usage : $seqobj = translate($seq_or_string_scalar)
524
525 Function: translates a DNA sequence object OR just a plain
526 string of DNA to amino acids
527 Returns : A Bio::Seq object
528
529 Args : Either a sequence object or a string of
530 just DNA sequence characters
531
532 =cut
533
534 sub translate {
535 my ($scalar) = shift;
536
537 my $obj;
538
539 if( ref $scalar ) {
540 if( !$scalar->isa("Bio::PrimarySeqI") ) {
541 confess("Expecting a sequence object not a $scalar");
542 } else {
543 $obj= $scalar;
544
545 }
546
547 } else {
548
549 # check this looks vaguely like DNA
550 my $n = ( $scalar =~ tr/ATGCNatgc/ATGCNatgcn/ );
551
552 if( $n < length($scalar) * 0.85 ) {
553 confess("Sequence [$scalar] is less than 85% ATGCN, which doesn't look very DNA to me");
554 }
555
556 $obj = Bio::PrimarySeq->new(-id => 'internalbioperlseq',-seq => $scalar);
557 }
558
559 return $obj->translate();
560 }
561
562
563 =head2 translate_as_string
564
565 Title : translate_as_string
566 Usage : $seqstring = translate_as_string($seq_or_string_scalar)
567
568 Function: translates a DNA sequence object OR just a plain
569 string of DNA to amino acids
570 Returns : A stirng of just amino acids
571
572 Args : Either a sequence object or a string of
573 just DNA sequence characters
574
575 =cut
576
577 sub translate_as_string {
578 my ($scalar) = shift;
579
580 my $obj = Bio::Perl::translate($scalar);
581
582 return $obj->seq;
583 }
584
585
586 =head2 reverse_complement
587
588 Title : reverse_complement
589 Usage : $seqobj = reverse_complement($seq_or_string_scalar)
590
591 Function: reverse complements a string or sequnce argument
592 producing a Bio::Seq - if you want a string, you
593 can use reverse_complement_as_string
594 Returns : A Bio::Seq object
595
596 Args : Either a sequence object or a string of
597 just DNA sequence characters
598
599 =cut
600
601 sub reverse_complement {
602 my ($scalar) = shift;
603
604 my $obj;
605
606 if( ref $scalar ) {
607 if( !$scalar->isa("Bio::PrimarySeqI") ) {
608 confess("Expecting a sequence object not a $scalar");
609 } else {
610 $obj= $scalar;
611
612 }
613
614 } else {
615
616 # check this looks vaguely like DNA
617 my $n = ( $scalar =~ tr/ATGCNatgc/ATGCNatgcn/ );
618
619 if( $n < length($scalar) * 0.85 ) {
620 confess("Sequence [$scalar] is less than 85% ATGCN, which doesn't look very DNA to me");
621 }
622
623 $obj = Bio::PrimarySeq->new(-id => 'internalbioperlseq',-seq => $scalar);
624 }
625
626 return $obj->revcom();
627 }
628
629 =head2 revcom
630
631 Title : revcom
632 Usage : $seqobj = revcom($seq_or_string_scalar)
633
634 Function: reverse complements a string or sequnce argument
635 producing a Bio::Seq - if you want a string, you
636 can use reverse_complement_as_string
637
638 This is an alias for reverse_complement
639 Returns : A Bio::Seq object
640
641 Args : Either a sequence object or a string of
642 just DNA sequence characters
643
644 =cut
645
646 sub revcom {
647 return &Bio::Perl::reverse_complement(@_);
648 }
649
650
651 =head2 reverse_complement_as_string
652
653 Title : reverse_complement_as_string
654 Usage : $string = reverse_complement_as_string($seq_or_string_scalar)
655
656 Function: reverse complements a string or sequnce argument
657 producing a string
658 Returns : A string of DNA letters
659
660 Args : Either a sequence object or a string of
661 just DNA sequence characters
662
663 =cut
664
665 sub reverse_complement_as_string {
666 my ($scalar) = shift;
667
668 my $obj = &Bio::Perl::reverse_complement($scalar);
669
670 return $obj->seq;
671 }
672
673
674 =head2 revcom_as_string
675
676 Title : revcom_as_string
677 Usage : $string = revcom_as_string($seq_or_string_scalar)
678
679 Function: reverse complements a string or sequnce argument
680 producing a string
681 Returns : A string of DNA letters
682
683 Args : Either a sequence object or a string of
684 just DNA sequence characters
685
686 =cut
687
688 sub revcom_as_string {
689 my ($scalar) = shift;
690
691 my $obj = &Bio::Perl::reverse_complement($scalar);
692
693 return $obj->seq;
694 }
695
696
697 1;