0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <dev@ensembl.org>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 Bio::EnsEMBL::Utils::IO::FASTASerializer
|
|
24
|
|
25 =head1 SYNOPSIS
|
|
26
|
|
27 my $serializer = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($filehandle);
|
|
28 $serializer->chunk_factor(1000);
|
|
29 $serializer->line_width(60);
|
|
30 $serializer->print_Seq($slice);
|
|
31
|
|
32 $serializer = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($filehandle,
|
|
33 sub {
|
|
34 my $slice = shift;
|
|
35 return "Custom header";
|
|
36 }
|
|
37 );
|
|
38
|
|
39 =head1 DESCRIPTION
|
|
40
|
|
41 Replacement for SeqDumper, making better use of shared code. Outputs FASTA
|
|
42 format with optional custom header and formatting parameters. Set line_width
|
|
43 and chunk_factor to dictate buffer size depending on application. A 60kb
|
|
44 buffer is used by default with a line width of 60 characters.
|
|
45
|
|
46 Custom headers are set by supplying an anonymous subroutine to new(). Custom
|
|
47 header code must accept a Slice or Bio::PrimarySeqI compliant object as
|
|
48 argument and return a string.
|
|
49
|
|
50 The custom header method can be overridden later through set_custom_header()
|
|
51 but this is not normally necessary.
|
|
52
|
|
53 =cut
|
|
54
|
|
55 package Bio::EnsEMBL::Utils::IO::FASTASerializer;
|
|
56
|
|
57 use strict;
|
|
58 use warnings;
|
|
59 use Bio::EnsEMBL::Utils::Exception;
|
|
60 use Bio::EnsEMBL::Utils::Scalar qw/assert_ref check_ref/;
|
|
61
|
|
62 use base qw(Bio::EnsEMBL::Utils::IO::Serializer);
|
|
63
|
|
=head2 new

  Arg [1]    : Filehandle (optional); passed straight to the parent
               Serializer constructor
  Arg [2]    : [optional] CODEREF subroutine for writing custom headers. It is
               called with the sequence object and must return the header text
               (the leading '>' is added by print_metadata)
  Arg [3]    : [optional] Chunking size (integer). Any false value selects the
               default of 1000
  Arg [4]    : [optional] Line width (integer). Any false value selects the
               default of 60
  Example    : $dumper = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($filehandle,$header_function,1000,60);
  Description: Constructor
               Allows the specification of a custom function for rendering
               header lines. Without one, a default is installed that renders
               "<seq_region_name> dna:<coord_system_name> <slice_name>" for a
               Bio::EnsEMBL::Slice and the display_id of any other object.
  Returntype : Bio::EnsEMBL::Utils::IO::FASTASerializer
  Exceptions : Thrown if Arg [2] is defined but is not a CODE reference
  Caller     : general

=cut

sub new {
    my ($caller, $filehandle, $header_function, $chunk_factor, $line_width) = @_;
    my $class = ref($caller) || $caller;

    my $self = $class->SUPER::new($filehandle);

    # 1000 lines of 60 bases gives a 60kb buffer by default; increase for
    # higher database and disk efficiency.
    $self->{'line_width'}   = $line_width   || 60;
    $self->{'chunk_factor'} = $chunk_factor || 1000;

    if (defined $header_function) {
        # Reject anything that cannot be called before it is stored
        if (ref($header_function) ne 'CODE') {
            throw("Custom header function must be an anonymous subroutine when instantiating FASTASerializer");
        }
        $self->{'header_function'} = $header_function;
    }
    else {
        $self->{'header_function'} = sub {
            my $seq_object = shift;

            # Anything that is not a Slice must be a Bio::Seq, or we're doomed
            return $seq_object->display_id
                unless check_ref($seq_object, 'Bio::EnsEMBL::Slice');

            my $region_name  = $seq_object->seq_region_name;
            my $coord_system = $seq_object->coord_system->name;
            my $slice_name   = $seq_object->name;

            return sprintf('%s %s:%s %s', $region_name, 'dna', $coord_system, $slice_name);
        };
    }

    return $self;
}
|
|
123
|
|
=head2 print_metadata

  Arg [1]    : Bio::EnsEMBL::Slice (or whatever object the installed header
               function accepts)
  Description: Printing header lines into FASTA files. Usually handled
               internally to the serializer. The header text is obtained by
               calling the function returned by header_function() with Arg [1]
               and is written as ">" . text . "\n".
  Returntype : None
  Exceptions : Dies if the header line cannot be written to the filehandle
  Caller     : print_Seq

=cut

sub print_metadata {
    my ($self, $slice) = @_;

    my $fh       = $self->{'filehandle'};
    my $function = $self->header_function();
    my $metadata = $function->($slice);

    # A failed header write must not pass silently: print_Seq treats write
    # failures on the sequence body as fatal, so do the same here.
    print {$fh} '>' . $metadata . "\n"
        or die "Error writing to file handle: $!";

    return;
}
|
|
141
|
|
=head2 print_Seq

  Arg [1]    : Bio::EnsEMBL::Slice or other Bio::PrimarySeqI compliant object.
               Only length() and subseq(start, end) are called on it here.

  Description: Serializes the slice into FASTA format. The header line is
               written through print_metadata(); the sequence is then fetched
               and written in chunks of chunk_factor x line_width bases to
               conserve memory. Other Bioperl PrimarySeqI implementations can
               be used, provided the installed header function can render a
               header for them.
               Sets the 'achieved_something' flag on the serializer when the
               sequence is not empty.

  Returntype : None
  Exceptions : Dies if line_width or chunk_factor is not a positive integer
               (checked before anything is written), or if writing to the
               filehandle fails.

=cut

sub print_Seq {
    my ($self, $slice) = @_;
    my $fh = $self->{'filehandle'};

    my $width        = $self->{'line_width'};
    my $chunk_factor = $self->{'chunk_factor'};

    # The setters accept any true value. A negative or non-numeric setting
    # would make the chunk cursor move backwards or turn the {1,$width}
    # quantifier below into literal text, so refuse it before writing anything.
    for my $setting ([ line_width => $width ], [ chunk_factor => $chunk_factor ]) {
        my ($name, $value) = @{$setting};
        die "FASTASerializer $name must be a positive integer"
            unless defined $value && $value =~ /\A[0-9]+\z/ && $value > 0;
    }

    $self->print_metadata($slice);

    # Buffer size in bases. It is always a whole number of output lines, so
    # the line breaks inserted per chunk stay aligned across chunk boundaries.
    my $chunk_size = $chunk_factor * $width;

    my $end  = $slice->length();
    my $here = 1;

    # chunk the sequence to conserve memory, and print
    while ($here <= $end) {
        my $there = $here + $chunk_size - 1;
        $there = $end if $there > $end;

        my $seq = $slice->subseq($here, $there);
        $seq =~ s/(.{1,$width})/$1\n/g;
        print {$fh} $seq or die "Error writing to file handle: $!";

        $here = $there + 1;
    }

    $self->{'achieved_something'} = 1 if $end > 0;

    return;
}
|
|
184
|
|
=head2 line_width

  Arg [1]    : [optional] Integer e.g. 60 or 80. A false value (undef, 0, '')
               leaves the current setting unchanged.
  Description: Set and get FASTA format line width. Default is 60
  Returntype : Integer

=cut

sub line_width {
    my ($self, $line_width) = @_;

    # Only a true value replaces the stored width
    $self->{'line_width'} = $line_width if $line_width;

    return $self->{'line_width'};
}
|
|
199
|
|
=head2 chunk_factor

  Arg [1]    : [optional] Integer e.g. 1000. A false value (undef, 0, '')
               leaves the current setting unchanged.
  Description: Set and get the multiplier used to dictate buffer size
               Chunk factor x line width = buffer size in bases.
  Returntype : Integer

=cut

sub chunk_factor {
    my ($self, $chunk_factor) = @_;

    # Only a true value replaces the stored multiplier
    $self->{'chunk_factor'} = $chunk_factor if $chunk_factor;

    return $self->{'chunk_factor'};
}
|
|
213
|
|
=head2 set_custom_header

  Arg [1]    : CODE reference
  Description: Set the custom header function. Normally this is done at
               construction time, but can be overridden here. Delegates to
               header_function() and discards its return value.
  Example    : $serializer->set_custom_header( sub { return 'New header'});
  Returntype : None

=cut

sub set_custom_header {
    my $self                = shift;
    my $new_header_function = shift;

    $self->header_function($new_header_function);

    return;
}
|
|
229
|
|
=head2 header_function

  Arg [1]    : CODE reference (optional). A false value leaves the stored
               function unchanged.
  Description: Getter/setter for the custom header code. A supplied value is
               checked with assert_ref() against 'CODE' before being stored;
               assert_ref() is expected to raise for anything else.
  Example    : $serializer->header_function( sub { return 'New header'});
  Returntype : CODE

=cut

sub header_function {
    my $self            = shift;
    my $header_function = shift;

    # Plain getter call: nothing to validate or store
    return $self->{header_function} if !$header_function;

    assert_ref($header_function, 'CODE', 'header_function');
    $self->{header_function} = $header_function;

    return $self->{header_function};
}
|
|
247
|
|
248 1;
|