Mercurial > repos > mahtabm > ensemb_rep_gvl
comparison variant_effect_predictor/Bio/Perl.pm @ 0:2bc9b66ada89 draft default tip
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 06:29:17 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:2bc9b66ada89 |
---|---|
1 # $Id: Perl.pm,v 1.16.2.1 2003/03/25 12:32:15 heikki Exp $ | |
2 # | |
3 # BioPerl module for Bio::Perl | |
4 # | |
5 # Cared for by Ewan Birney <bioperl-l@bio.perl.org> | |
6 # | |
7 # Copyright Ewan Birney | |
8 # | |
9 # You may distribute this module under the same terms as perl itself | |
10 | |
11 # POD documentation - main docs before the code | |
12 | |
13 =head1 NAME | |
14 | |
15 Bio::Perl - Functional access to BioPerl for people who don't know objects | |
16 | |
17 =head1 SYNOPSIS | |
18 | |
19 use Bio::Perl; | |
20 | |
21 # will guess file format from extension | |
22 $seq_object = read_sequence($filename); | |
23 | |
24 # forces genbank format | |
25 $seq_object = read_sequence($filename,'genbank'); | |
26 | |
27 # reads an array of sequences | |
28 @seq_object_array = read_all_sequences($filename,'fasta'); | |
29 | |
30 # sequences are Bio::Seq objects, so the following methods work | |
31 # for more info see L<Bio::Seq>, or do 'perldoc Bio/Seq.pm' | |
32 | |
33 print "Sequence name is ",$seq_object->display_id,"\n"; | |
34 print "Sequence acc is ",$seq_object->accession_number,"\n"; | |
35 print "First 5 bases is ",$seq_object->subseq(1,5),"\n"; | |
36 | |
37 # get the whole sequence as a single string | |
38 | |
39 $sequence_as_a_string = $seq_object->seq(); | |
40 | |
41 # writing sequences | |
42 | |
43 write_sequence(">$filename",'genbank',$seq_object); | |
44 | |
45 write_sequence(">$filename",'genbank',@seq_object_array); | |
46 | |
47 # making a new sequence from just strings you have | |
48 # from something else | |
49 | |
50 $seq_object = new_sequence("ATTGGTTTGGGGACCCAATTTGTGTGTTATATGTA", | |
51 "myname","AL12232"); | |
52 | |
53 | |
54 # getting a sequence from a database (assumes internet connection) | |
55 | |
56 $seq_object = get_sequence('swissprot',"ROA1_HUMAN"); | |
57 | |
58 $seq_object = get_sequence('embl',"AI129902"); | |
59 | |
60 $seq_object = get_sequence('genbank',"AI129902"); | |
61 | |
62 # BLAST a sequence (assummes an internet connection) | |
63 | |
64 $blast_report = blast_sequence($seq_object); | |
65 | |
66 write_blast(">blast.out",$blast_report); | |
67 | |
68 | |
69 =head1 DESCRIPTION | |
70 | |
71 Easy first time access to BioPerl via functions | |
72 | |
73 =head1 FEEDBACK | |
74 | |
75 =head2 Mailing Lists | |
76 | |
77 User feedback is an integral part of the evolution of this and other | |
78 Bioperl modules. Send your comments and suggestions preferably to one | |
79 of the Bioperl mailing lists. Your participation is much appreciated. | |
80 | |
81 bioperl-l@bio.perl.org | |
82 | |
83 =head2 Reporting Bugs | |
84 | |
85 Report bugs to the Bioperl bug tracking system to help us keep track | |
86 the bugs and their resolution. Bug reports can be submitted via email | |
87 or the web: | |
88 | |
89 bioperl-bugs@bio.perl.org | |
90 http://bugzilla.bioperl.org/ | |
91 | |
92 =head1 AUTHOR - Ewan Birney | |
93 | |
94 Email bioperl-l@bio.perl.org | |
95 | |
96 Describe contact details here | |
97 | |
98 =head1 APPENDIX | |
99 | |
100 The rest of the documentation details each of the object methods. | |
101 Internal methods are usually preceded with a _ | |
102 | |
103 =cut | |
104 | |
105 #' | |
106 # Let the code begin... | |
107 | |
108 | |
109 package Bio::Perl; | |
110 use vars qw(@ISA @EXPORT @EXPORT_OK $DBOKAY); | |
111 use strict; | |
112 use Carp; | |
113 use Exporter; | |
114 | |
115 use Bio::SeqIO; | |
116 use Bio::Seq; | |
117 BEGIN { | |
118 eval { | |
119 require Bio::DB::EMBL; | |
120 require Bio::DB::GenBank; | |
121 require Bio::DB::SwissProt; | |
122 require Bio::DB::RefSeq; | |
123 require Bio::DB::GenPept; | |
124 }; | |
125 if( $@ ) { | |
126 $DBOKAY = 0; | |
127 } else { | |
128 $DBOKAY = 1; | |
129 } | |
130 } | |
131 | |
132 @ISA = qw(Exporter); | |
133 | |
134 @EXPORT = qw(read_sequence read_all_sequences write_sequence | |
135 new_sequence get_sequence translate translate_as_string | |
136 reverse_complement revcom revcom_as_string | |
137 reverse_complement_as_string blast_sequence write_blast); | |
138 | |
139 @EXPORT_OK = @EXPORT; | |
140 | |
141 | |
142 =head2 read_sequence | |
143 | |
144 Title : read_sequence | |
145 Usage : $seq = read_sequence('sequences.fa') | |
146 $seq = read_sequence($filename,'genbank'); | |
147 | |
148 # pipes are fine | |
149 $seq = read_sequence("my_fetching_program $id |",'fasta'); | |
150 | |
151 Function: Reads the top sequence from the file. If no format is given, it will | |
152 try to guess the format from the filename. If a format is given, it | |
153 forces that format. The filename can be any valid perl open() string | |
154 - in particular, you can put in pipes | |
155 | |
156 Returns : A Bio::Seq object. A quick synopsis: | |
157 $seq_object->display_id - name of the sequence | |
158 $seq_object->seq - sequence as a string | |
159 | |
160 Args : Two strings, first the filename - any Perl open() string is ok | |
161 Second string is the format, which is optional | |
162 | |
163 For more information on Seq objects see L<Bio::Seq>. | |
164 | |
165 =cut | |
166 | |
167 sub read_sequence{ | |
168 my ($filename,$format) = @_; | |
169 | |
170 if( !defined $filename ) { | |
171 confess "read_sequence($filename) - usage incorrect"; | |
172 } | |
173 | |
174 my $seqio; | |
175 | |
176 if( defined $format ) { | |
177 $seqio = Bio::SeqIO->new( '-file' => $filename, '-format' => $format); | |
178 } else { | |
179 $seqio = Bio::SeqIO->new( '-file' => $filename); | |
180 } | |
181 | |
182 my $seq = $seqio->next_seq(); | |
183 | |
184 return $seq; | |
185 } | |
186 | |
187 | |
188 =head2 read_all_sequences | |
189 | |
190 Title : read_all_sequences | |
191 Usage : @seq_object_array = read_all_sequences($filename); | |
192 @seq_object_array = read_all_sequences($filename,'genbank'); | |
193 | |
194 Function: Just as the function above, but reads all the sequences in the | |
195 file and loads them into an array. | |
196 | |
197 For very large files, you will run out of memory. When this | |
198 happens, you've got to use the SeqIO system directly (this is | |
199 not so hard! Don't worry about it!). | |
200 | |
201 Returns : array of Bio::Seq objects | |
202 | |
203 Args : two strings, first the filename (any open() string is ok) | |
204 second the format (which is optional) | |
205 | |
206 See L<Bio::SeqIO> and L<Bio::Seq> for more information | |
207 | |
208 =cut | |
209 | |
210 sub read_all_sequences{ | |
211 my ($filename,$format) = @_; | |
212 | |
213 if( !defined $filename ) { | |
214 confess "read_all_sequences($filename) - usage incorrect"; | |
215 } | |
216 | |
217 my $seqio; | |
218 | |
219 if( defined $format ) { | |
220 $seqio = Bio::SeqIO->new( '-file' => $filename, '-format' => $format); | |
221 } else { | |
222 $seqio = Bio::SeqIO->new( '-file' => $filename); | |
223 } | |
224 | |
225 my @seq_array; | |
226 | |
227 while( my $seq = $seqio->next_seq() ) { | |
228 push(@seq_array,$seq); | |
229 } | |
230 | |
231 return @seq_array; | |
232 } | |
233 | |
234 | |
235 =head2 write_sequence | |
236 | |
237 Title : write_sequence | |
238 Usage : write_sequence(">new_file.gb",'genbank',$seq) | |
239 write_sequence(">new_file.gb",'genbank',@array_of_sequence_objects) | |
240 | |
241 Function: writes sequences in the specified format | |
242 | |
243 Returns : true | |
244 | |
245 Args : filename as a string, must provide an open() output file | |
246 format as a string | |
247 one or more sequence objects | |
248 | |
249 | |
250 =cut | |
251 | |
252 sub write_sequence{ | |
253 my ($filename,$format,@sequence_objects) = @_; | |
254 | |
255 if( scalar(@sequence_objects) == 0 ) { | |
256 confess("write_sequence(filename,format,sequence_object)"); | |
257 } | |
258 | |
259 my $error = 0; | |
260 my $seqname = "sequence1"; | |
261 | |
262 # catch users who haven't passed us a filename we can open | |
263 if( $filename !~ /^\>/ && $filename !~ /^|/ ) { | |
264 $filename = ">".$filename; | |
265 } | |
266 | |
267 my $seqio = Bio::SeqIO->new('-file' => $filename, '-format' => $format); | |
268 | |
269 foreach my $seq ( @sequence_objects ) { | |
270 my $seq_obj; | |
271 | |
272 if( !ref $seq ) { | |
273 if( length $seq > 50 ) { | |
274 # odds are this is a sequence as a string, and someone has not figured out | |
275 # how to make objects. Warn him/her and then make a sequence object | |
276 # from this | |
277 if( $error == 0 ) { | |
278 carp("WARNING: You have put in a long string into write_sequence.\nI suspect this means that this is the actual sequence\nIn the future try the\n new_sequence method of this module to make a new sequence object.\nDoing this for you here\n"); | |
279 $error = 1; | |
280 } | |
281 | |
282 $seq_obj = new_sequence($seq,$seqname); | |
283 $seqname++; | |
284 } else { | |
285 confess("You have a non object [$seq] passed to write_sequence. It maybe that you want to use new_sequence to make this string into a sequence object?"); | |
286 } | |
287 } else { | |
288 if( !$seq->isa("Bio::SeqI") ) { | |
289 confess("object [$seq] is not a Bio::Seq object; can't write it out"); | |
290 } | |
291 $seq_obj = $seq; | |
292 } | |
293 | |
294 # finally... we get to write out the sequence! | |
295 $seqio->write_seq($seq_obj); | |
296 } | |
297 1; | |
298 } | |
299 | |
300 =head2 new_sequence | |
301 | |
302 Title : new_sequence | |
303 Usage : | |
304 Function: | |
305 Example : | |
306 Returns : | |
307 Args : | |
308 | |
309 | |
310 =cut | |
311 | |
312 sub new_sequence{ | |
313 my ($seq,$name,$accession) = @_; | |
314 | |
315 if( !defined $seq ) { | |
316 confess("new_sequence(sequence_as_string) usage"); | |
317 } | |
318 | |
319 $name ||= "no-name-for-sequence"; | |
320 | |
321 my $seq_object = Bio::Seq->new( -seq => $seq, -id => $name); | |
322 | |
323 $accession && $seq_object->accession_number($accession); | |
324 | |
325 return $seq_object; | |
326 } | |
327 | |
328 =head2 blast_sequence | |
329 | |
330 Title : blast_sequence | |
331 Usage : $blast_result = blast_sequence($seq) | |
332 $blast_result = blast_sequence('MFVEGGTFASEDDDSASAEDE'); | |
333 | |
334 Function: If the computer has Internet accessibility, blasts | |
335 the sequence using the NCBI BLAST server against nrdb. | |
336 | |
337 It choose the flavour of BLAST on the basis of the sequence. | |
338 | |
339 This function uses Bio::Tools::Run::RemoteBlast, which itself | |
340 use Bio::SearchIO - as soon as you want to more, check out | |
341 these modules | |
342 Returns : Bio::Search::Result::GenericResult.pm | |
343 | |
344 Args : Either a string of protein letters or nucleotides, or a | |
345 Bio::Seq object | |
346 | |
347 =cut | |
348 | |
349 sub blast_sequence { | |
350 my ($seq,$verbose) = shift; | |
351 | |
352 if( !defined $verbose ) { | |
353 $verbose = 1; | |
354 } | |
355 | |
356 if( !ref $seq ) { | |
357 $seq = Bio::Seq->new( -seq => $seq, -id => 'blast-sequence-temp-id'); | |
358 } elsif ( !$seq->isa('Bio::PrimarySeqI') ) { | |
359 croak("[$seq] is an object, but not a Bio::Seq object, cannot be blasted"); | |
360 } | |
361 | |
362 require Bio::Tools::Run::RemoteBlast; | |
363 | |
364 my $prog = 'blastp'; | |
365 my $e_val= '1e-10'; | |
366 | |
367 my @params = ( '-prog' => $prog, | |
368 '-expect' => $e_val, | |
369 '-readmethod' => 'SearchIO' ); | |
370 | |
371 my $factory = Bio::Tools::Run::RemoteBlast->new(@params); | |
372 | |
373 my $r = $factory->submit_blast($seq); | |
374 if( $verbose ) { | |
375 print STDERR "Submitted Blast for [".$seq->id."] "; | |
376 } | |
377 sleep 5; | |
378 | |
379 my $result; | |
380 | |
381 LOOP : | |
382 while( my @rids = $factory->each_rid) { | |
383 foreach my $rid ( @rids ) { | |
384 my $rc = $factory->retrieve_blast($rid); | |
385 if( !ref($rc) ) { | |
386 if( $rc < 0 ) { | |
387 $factory->remove_rid($rid); | |
388 } | |
389 if( $verbose ) { | |
390 print STDERR "."; | |
391 } | |
392 sleep 10; | |
393 } else { | |
394 $result = $rc->next_result(); | |
395 $factory->remove_rid($rid); | |
396 last LOOP; | |
397 } | |
398 } | |
399 } | |
400 | |
401 if( $verbose ) { | |
402 print STDERR "\n"; | |
403 } | |
404 return $result; | |
405 } | |
406 | |
407 =head2 write_blast | |
408 | |
409 Title : write_blast | |
410 Usage : write_blast($filename,$blast_report); | |
411 | |
412 Function: Writes a BLAST result object (or more formally | |
413 a SearchIO result object) out to a filename | |
414 in BLAST-like format | |
415 | |
416 Returns : none | |
417 | |
418 Args : filename as a string | |
419 Bio::SearchIO::Results object | |
420 | |
421 =cut | |
422 | |
423 sub write_blast { | |
424 my ($filename,$blast) = @_; | |
425 | |
426 if( $filename !~ /^\>/ && $filename !~ /^|/ ) { | |
427 $filename = ">".$filename; | |
428 } | |
429 | |
430 my $output = Bio::SearchIO->new( -output_format => 'blast', -file => $filename); | |
431 | |
432 $output->write_result($blast); | |
433 | |
434 } | |
435 | |
436 =head2 get_sequence | |
437 | |
438 Title : get_sequence | |
439 Usage : $seq_object = get_sequence('swiss',"ROA1_HUMAN"); | |
440 | |
441 Function: If the computer has Internet accessibility, gets | |
442 the sequence from Internet accessible databases. Currently | |
443 this supports Swissprot, EMBL, GenBank and RefSeq. | |
444 | |
445 Swissprot and EMBL are more robust than GenBank fetching. | |
446 | |
447 If the user is trying to retrieve a RefSeq entry from | |
448 GenBank/EMBL, the query is silently redirected. | |
449 | |
450 Returns : A Bio::Seq object | |
451 | |
452 Args : database type - one of swiss, embl, genbank or refseq | |
453 identifier or accession number | |
454 | |
455 =cut | |
456 | |
457 my $genbank_db = undef; | |
458 my $genpept_db = undef; | |
459 my $embl_db = undef; | |
460 my $swiss_db = undef; | |
461 my $refseq_db = undef; | |
462 | |
463 sub get_sequence{ | |
464 my ($db_type,$identifier) = @_; | |
465 if( ! $DBOKAY ) { | |
466 confess("Your system does not have IO::String installed so the DB retrieval method is not available"); | |
467 return; | |
468 } | |
469 $db_type = lc($db_type); | |
470 | |
471 my $db; | |
472 | |
473 if( $db_type =~ /genbank/ ) { | |
474 if( !defined $genbank_db ) { | |
475 $genbank_db = Bio::DB::GenBank->new(); | |
476 } | |
477 $db = $genbank_db; | |
478 } | |
479 if( $db_type =~ /genpept/ ) { | |
480 if( !defined $genpept_db ) { | |
481 $genpept_db = Bio::DB::GenPept->new(); | |
482 } | |
483 $db = $genpept_db; | |
484 } | |
485 | |
486 if( $db_type =~ /swiss/ ) { | |
487 if( !defined $swiss_db ) { | |
488 $swiss_db = Bio::DB::SwissProt->new(); | |
489 } | |
490 $db = $swiss_db; | |
491 } | |
492 | |
493 if( $db_type =~ /embl/ ) { | |
494 if( !defined $embl_db ) { | |
495 $embl_db = Bio::DB::EMBL->new(); | |
496 } | |
497 $db = $embl_db; | |
498 } | |
499 | |
500 if( $db_type =~ /refseq/ or ($db_type !~ /swiss/ and | |
501 $identifier =~ /^\s*N\S+_/)) { | |
502 if( !defined $refseq_db ) { | |
503 $refseq_db = Bio::DB::RefSeq->new(); | |
504 } | |
505 $db = $refseq_db; | |
506 } | |
507 | |
508 my $seq; | |
509 | |
510 if( $identifier =~ /^\w+\d+$/ ) { | |
511 $seq = $db->get_Seq_by_acc($identifier); | |
512 } else { | |
513 $seq = $db->get_Seq_by_id($identifier); | |
514 } | |
515 | |
516 return $seq; | |
517 } | |
518 | |
519 | |
520 =head2 translate | |
521 | |
522 Title : translate | |
523 Usage : $seqobj = translate($seq_or_string_scalar) | |
524 | |
525 Function: translates a DNA sequence object OR just a plain | |
526 string of DNA to amino acids | |
527 Returns : A Bio::Seq object | |
528 | |
529 Args : Either a sequence object or a string of | |
530 just DNA sequence characters | |
531 | |
532 =cut | |
533 | |
534 sub translate { | |
535 my ($scalar) = shift; | |
536 | |
537 my $obj; | |
538 | |
539 if( ref $scalar ) { | |
540 if( !$scalar->isa("Bio::PrimarySeqI") ) { | |
541 confess("Expecting a sequence object not a $scalar"); | |
542 } else { | |
543 $obj= $scalar; | |
544 | |
545 } | |
546 | |
547 } else { | |
548 | |
549 # check this looks vaguely like DNA | |
550 my $n = ( $scalar =~ tr/ATGCNatgc/ATGCNatgcn/ ); | |
551 | |
552 if( $n < length($scalar) * 0.85 ) { | |
553 confess("Sequence [$scalar] is less than 85% ATGCN, which doesn't look very DNA to me"); | |
554 } | |
555 | |
556 $obj = Bio::PrimarySeq->new(-id => 'internalbioperlseq',-seq => $scalar); | |
557 } | |
558 | |
559 return $obj->translate(); | |
560 } | |
561 | |
562 | |
563 =head2 translate_as_string | |
564 | |
565 Title : translate_as_string | |
566 Usage : $seqstring = translate_as_string($seq_or_string_scalar) | |
567 | |
568 Function: translates a DNA sequence object OR just a plain | |
569 string of DNA to amino acids | |
570 Returns : A stirng of just amino acids | |
571 | |
572 Args : Either a sequence object or a string of | |
573 just DNA sequence characters | |
574 | |
575 =cut | |
576 | |
577 sub translate_as_string { | |
578 my ($scalar) = shift; | |
579 | |
580 my $obj = Bio::Perl::translate($scalar); | |
581 | |
582 return $obj->seq; | |
583 } | |
584 | |
585 | |
586 =head2 reverse_complement | |
587 | |
588 Title : reverse_complement | |
589 Usage : $seqobj = reverse_complement($seq_or_string_scalar) | |
590 | |
591 Function: reverse complements a string or sequnce argument | |
592 producing a Bio::Seq - if you want a string, you | |
593 can use reverse_complement_as_string | |
594 Returns : A Bio::Seq object | |
595 | |
596 Args : Either a sequence object or a string of | |
597 just DNA sequence characters | |
598 | |
599 =cut | |
600 | |
601 sub reverse_complement { | |
602 my ($scalar) = shift; | |
603 | |
604 my $obj; | |
605 | |
606 if( ref $scalar ) { | |
607 if( !$scalar->isa("Bio::PrimarySeqI") ) { | |
608 confess("Expecting a sequence object not a $scalar"); | |
609 } else { | |
610 $obj= $scalar; | |
611 | |
612 } | |
613 | |
614 } else { | |
615 | |
616 # check this looks vaguely like DNA | |
617 my $n = ( $scalar =~ tr/ATGCNatgc/ATGCNatgcn/ ); | |
618 | |
619 if( $n < length($scalar) * 0.85 ) { | |
620 confess("Sequence [$scalar] is less than 85% ATGCN, which doesn't look very DNA to me"); | |
621 } | |
622 | |
623 $obj = Bio::PrimarySeq->new(-id => 'internalbioperlseq',-seq => $scalar); | |
624 } | |
625 | |
626 return $obj->revcom(); | |
627 } | |
628 | |
629 =head2 revcom | |
630 | |
631 Title : revcom | |
632 Usage : $seqobj = revcom($seq_or_string_scalar) | |
633 | |
634 Function: reverse complements a string or sequnce argument | |
635 producing a Bio::Seq - if you want a string, you | |
636 can use reverse_complement_as_string | |
637 | |
638 This is an alias for reverse_complement | |
639 Returns : A Bio::Seq object | |
640 | |
641 Args : Either a sequence object or a string of | |
642 just DNA sequence characters | |
643 | |
644 =cut | |
645 | |
646 sub revcom { | |
647 return &Bio::Perl::reverse_complement(@_); | |
648 } | |
649 | |
650 | |
651 =head2 reverse_complement_as_string | |
652 | |
653 Title : reverse_complement_as_string | |
654 Usage : $string = reverse_complement_as_string($seq_or_string_scalar) | |
655 | |
656 Function: reverse complements a string or sequnce argument | |
657 producing a string | |
658 Returns : A string of DNA letters | |
659 | |
660 Args : Either a sequence object or a string of | |
661 just DNA sequence characters | |
662 | |
663 =cut | |
664 | |
665 sub reverse_complement_as_string { | |
666 my ($scalar) = shift; | |
667 | |
668 my $obj = &Bio::Perl::reverse_complement($scalar); | |
669 | |
670 return $obj->seq; | |
671 } | |
672 | |
673 | |
674 =head2 revcom_as_string | |
675 | |
676 Title : revcom_as_string | |
677 Usage : $string = revcom_as_string($seq_or_string_scalar) | |
678 | |
679 Function: reverse complements a string or sequnce argument | |
680 producing a string | |
681 Returns : A string of DNA letters | |
682 | |
683 Args : Either a sequence object or a string of | |
684 just DNA sequence characters | |
685 | |
686 =cut | |
687 | |
688 sub revcom_as_string { | |
689 my ($scalar) = shift; | |
690 | |
691 my $obj = &Bio::Perl::reverse_complement($scalar); | |
692 | |
693 return $obj->seq; | |
694 } | |
695 | |
696 | |
697 1; |