comparison variant_effect_predictor/Bio/EnsEMBL/Utils/IO/FASTASerializer.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <dev@ensembl.org>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 =head1 NAME
22
23 Bio::EnsEMBL::Utils::IO::FASTASerializer
24
25 =head1 SYNOPSIS
26
27 my $serializer = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($filehandle);
28 $serializer->chunk_factor(1000);
29 $serializer->line_width(60);
30 $serializer->print_Seq($slice);
31
32 $serializer = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($filehandle,
33 sub {
34 my $slice = shift;
35 return "Custom header";
36 }
37 );
38
39 =head1 DESCRIPTION
40
41 Replacement for SeqDumper, making better use of shared code. Outputs FASTA
42 format with optional custom header and formatting parameters. Set line_width
43 and chunk_factor to dictate buffer size depending on application. A 60kb
44 buffer is used by default with a line width of 60 characters.
45
46 Custom headers are set by supplying an anonymous subroutine to new(). Custom
47 header code must accept a Slice or Bio::PrimarySeqI compliant object as
48 argument and return a string.
49
50 The custom header method can be overridden later through set_custom_header()
51 but this is not normally necessary.
52
53 =cut
54
55 package Bio::EnsEMBL::Utils::IO::FASTASerializer;
56
57 use strict;
58 use warnings;
59 use Bio::EnsEMBL::Utils::Exception;
60 use Bio::EnsEMBL::Utils::Scalar qw/assert_ref check_ref/;
61
62 use base qw(Bio::EnsEMBL::Utils::IO::Serializer);
63
=head2 new

  Arg [1]    : Filehandle (optional); handed straight to the parent constructor
  Arg [2]    : [optional] CODEREF used to render the header line of a record.
               It receives the object being serialised and returns a string.
  Arg [3]    : [optional] Chunking size (integer), 1000 when omitted or false
  Arg [4]    : [optional] Line width (integer), 60 when omitted or false
  Example    : $dumper = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($filehandle,$header_function,1000,60);
  Description: Constructor
               Allows the specification of a custom function for rendering
               header lines. Without one, a default is installed which
               prints "<seq_region_name> dna:<coord_system name> <slice name>"
               for a Bio::EnsEMBL::Slice and the display_id of anything else.
  Returntype : Bio::EnsEMBL::Utils::IO::FASTASerializer
  Exceptions : thrown if Arg [2] is defined but is not a CODE reference
  Caller     : general

=cut

sub new {
    my ($caller, $filehandle, $header_function, $chunk_factor, $line_width) = @_;
    my $class = ref($caller) || $caller;

    my $self = $class->SUPER::new($filehandle);

    $self->{'header_function'} = $header_function;
    # 60 columns x 1000 lines gives a 60kb buffer by default; increase the
    # chunk factor for higher database and disk efficiency.
    $self->{'line_width'}   = $line_width   || 60;
    $self->{'chunk_factor'} = $chunk_factor || 1000;

    # A caller-supplied header function only has to pass the type check.
    if ( defined $self->{'header_function'} ) {
        throw("Custom header function must be an anonymous subroutine when instantiating FASTASerializer")
            unless ref( $self->{'header_function'} ) eq 'CODE';
        return $self;
    }

    # Nothing supplied: fall back to the standard Ensembl header.
    $self->{'header_function'} = sub {
        my ($seq_object) = @_;

        # Anything that is not a Slice must be a Bio::Seq-like object that
        # can at least report a display_id.
        return $seq_object->display_id
            unless check_ref($seq_object, 'Bio::EnsEMBL::Slice');

        return sprintf(
            '%s dna:%s %s',
            $seq_object->seq_region_name,
            $seq_object->coord_system->name,
            $seq_object->name,
        );
    };

    return $self;
}
123
=head2 print_metadata

  Arg [1]    : Bio::EnsEMBL::Slice (or any object the header function accepts)
  Description: Printing header lines into FASTA files. Writes '>' followed by
               the string returned by the header function and a newline.
               Usually handled internally to the serializer.
  Returntype : None
  Exceptions : dies if the header line cannot be written to the filehandle
  Caller     : print_Seq

=cut

sub print_metadata {
    my ($self, $slice) = @_;

    my $fh       = $self->{'filehandle'};
    my $function = $self->header_function();
    my $metadata = $function->($slice);

    # Check the write, as print_Seq does for sequence lines, so that a full
    # disk or closed handle is reported rather than yielding a headerless file.
    print {$fh} '>' . $metadata . "\n"
        or die "Error writing header to file handle: $!";
}
141
=head2 print_Seq

  Arg [1]    : Bio::EnsEMBL::Slice or other Bio::PrimarySeqI compliant object

  Description: Serializes the slice into FASTA format. The sequence is
               fetched and written in chunks of chunk_factor x line_width
               bases to conserve memory.
               While other Bioperl PrimarySeqI implementations can be used,
               a custom header function will be required to accommodate it.

  Returntype : None
  Exceptions : thrown if line_width is not a positive integer or if
               chunk_factor x line_width is smaller than 1;
               dies if writing to the filehandle fails

=cut

sub print_Seq {
    my ($self, $slice) = @_;
    my $fh = $self->{'filehandle'};

    my $width        = $self->{'line_width'};
    my $chunk_factor = $self->{'chunk_factor'};

    # Validate before anything is written. A non-positive or non-integer
    # width would silently disable line wrapping, and a non-positive buffer
    # size would make the loop below walk backwards forever.
    throw("line_width must be a positive integer")
        unless defined $width && $width =~ /\A[1-9][0-9]*\z/;
    throw("chunk_factor must be a positive number")
        unless defined $chunk_factor && $chunk_factor =~ /\A[0-9.eE+]+\z/ && $chunk_factor > 0;

    # set buffer size
    my $chunk_size = $chunk_factor * $width;
    throw("chunk_factor x line_width must be at least 1")
        unless $chunk_size >= 1;

    $self->print_metadata($slice);

    my $end  = $slice->length();
    my $here = 1;

    # chunk the sequence to conserve memory, and print
    while ($here <= $end) {
        my $there = $here + $chunk_size - 1;
        $there = $end if $there > $end;

        my $seq = $slice->subseq($here, $there);

        # Wrap with unpack rather than s/(.{1,$width})/$1\n/g: a regex
        # quantifier has an upper limit, so a very large line width (used to
        # get one-line records) would make the substitution fail to compile.
        my $wrapped = ( defined $seq && length $seq )
            ? join( "\n", unpack( "(a$width)*", $seq ) ) . "\n"
            : '';

        print {$fh} $wrapped or die "Error writing to file handle: $!";
        $here = $there + 1;
    }

    # Record that at least one base was written.
    $self->{'achieved_something'} = 1 if $end > 0;
}
184
=head2 line_width

  Arg [1]    : [optional] Integer e.g. 60 or 80
  Description: Set and get FASTA format line width. Default is 60.
               A false argument (undef, 0, '') leaves the current value
               untouched.
  Returntype : Integer

=cut

sub line_width {
    my ($self, $line_width) = @_;

    # Only a true value replaces the stored width.
    $self->{'line_width'} = $line_width if $line_width;

    return $self->{'line_width'};
}
199
=head2 chunk_factor

  Arg [1]    : [optional] Integer e.g. 1000
  Description: Set and get the multiplier used to dictate buffer size.
               Chunk factor x line width = buffer size in bases.
               A false argument (undef, 0, '') leaves the current value
               untouched.
  Returntype : Integer

=cut

sub chunk_factor {
    my ($self, $chunk_factor) = @_;

    # Only a true value replaces the stored multiplier.
    $self->{'chunk_factor'} = $chunk_factor if $chunk_factor;

    return $self->{'chunk_factor'};
}
213
=head2 set_custom_header

  Arg [1]    : CODE reference
  Description: Set the custom header function. Normally this is done at
               construction time, but can be overridden here. Delegates to
               header_function().
  Example    : $serializer->set_custom_header( sub { return 'New header'});
  Returntype : None

=cut

sub set_custom_header {
    my $self                = shift;
    my $new_header_function = shift;

    # header_function() performs the storing; its return value is discarded.
    $self->header_function($new_header_function);

    return;
}
229
=head2 header_function

  Arg [1]    : CODE reference (optional)
  Description: Getter/setter for the custom header code. A true argument is
               passed through assert_ref() as a CODE reference before it is
               stored; a false or missing argument makes this a plain getter.
  Example    : $serializer->header_function( sub { return 'New header'});
  Returntype : CODE

=cut

sub header_function {
    my $self            = shift;
    my $header_function = shift;

    # Getter: nothing (or a false value) was supplied.
    return $self->{header_function} unless $header_function;

    # Setter: check the type first, then store.
    assert_ref($header_function, 'CODE', 'header_function');
    $self->{header_function} = $header_function;

    return $self->{header_function};
}
247
248 1;