Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Utils/IO/FASTASerializer.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <dev@ensembl.org>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =cut | |
20 | |
21 =head1 NAME | |
22 | |
23 Bio::EnsEMBL::Utils::IO::FASTASerializer | |
24 | |
25 =head1 SYNOPSIS | |
26 | |
27 my $serializer = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($filehandle); | |
28 $serializer->chunk_factor(1000); | |
29 $serializer->line_width(60); | |
30 $serializer->print_Seq($slice); | |
31 | |
32 $serializer = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($filehandle, | |
33 sub { | |
34 my $slice = shift; | |
35 return "Custom header"; | |
36 } | |
37 ); | |
38 | |
39 =head1 DESCRIPTION | |
40 | |
41 Replacement for SeqDumper, making better use of shared code. Outputs FASTA | |
42 format with optional custom header and formatting parameters. Set line_width | |
43 and chunk_factor to dictate buffer size depending on application. A 60kb | |
44 buffer is used by default with a line width of 60 characters. | |
45 | |
46 Custom headers are set by supplying an anonymous subroutine to new(). Custom | |
47 header code must accept a Slice or Bio::PrimarySeqI compliant object as | |
48 argument and return a string. | |
49 | |
50 The custom header method can be overridden later through set_custom_header() | |
51 but this is not normally necessary. | |
52 | |
53 =cut | |
54 | |
55 package Bio::EnsEMBL::Utils::IO::FASTASerializer; | |
56 | |
57 use strict; | |
58 use warnings; | |
59 use Bio::EnsEMBL::Utils::Exception; | |
60 use Bio::EnsEMBL::Utils::Scalar qw/assert_ref check_ref/; | |
61 | |
62 use base qw(Bio::EnsEMBL::Utils::IO::Serializer); | |
63 | |
=head2 new

  Arg [1]    : [optional] Filehandle, handed unchanged to the parent
               constructor
  Arg [2]    : [optional] CODEREF used to build the header line. It is called
               with the object being serialised and should return a string.
  Arg [3]    : [optional] Chunk factor (integer). 1000 is used when the
               argument is omitted or false.
  Arg [4]    : [optional] Line width (integer). 60 is used when the argument
               is omitted or false.
  Example    : $dumper = Bio::EnsEMBL::Utils::IO::FASTASerializer->new($filehandle,$header_function,1000,60);
  Description: Constructor.
               When no header function is given a default one is installed.
               For a Bio::EnsEMBL::Slice the default returns
               "<seq_region_name> dna:<coord system name> <slice name>";
               for any other object it returns the result of display_id().
  Returntype : Bio::EnsEMBL::Utils::IO::FASTASerializer
  Exceptions : Thrown if Arg [2] is defined but is not a CODE reference
  Caller     : general

=cut

sub new {
  my ($caller, $filehandle, $header_function, $chunk_factor, $line_width) = @_;
  my $class = ref($caller) || $caller;

  my $self = $class->SUPER::new($filehandle);

  # chunk_factor x line_width gives a 60kb buffer by default, increase for
  # higher database and disk efficiency.
  $self->{'line_width'}   = $line_width   || 60;
  $self->{'chunk_factor'} = $chunk_factor || 1000;

  # TODO: Check this error trap works as intended
  if ( defined $header_function ) {
    if ( ref($header_function) ne "CODE" ) {
      throw("Custom header function must be an anonymous subroutine when instantiating FASTASerializer");
    }
    $self->{'header_function'} = $header_function;
  }
  else {
    # Default header: Ensembl style for Slices, display_id for the rest.
    $self->{'header_function'} = sub {
      my $sequence = shift;

      # Anything that is not a Slice is expected to behave like a Bio::Seq.
      return $sequence->display_id
        unless check_ref($sequence, 'Bio::EnsEMBL::Slice');

      my $region_name   = $sequence->seq_region_name;
      my $coord_system  = $sequence->coord_system->name;
      my $location_name = $sequence->name;

      return "$region_name dna:$coord_system $location_name";
    };
  }

  return $self;
}
123 | |
=head2 print_metadata

  Arg [1]    : Bio::EnsEMBL::Slice (or any object the header function accepts)
  Description: Prints the FASTA header line to the filehandle: a '>' followed
               by the string returned by the header function and a newline.
               Usually handled internally to the serializer.
  Returntype : None
  Exceptions : Dies if the header line cannot be written to the filehandle
  Caller     : print_Seq

=cut

sub print_metadata {
  my ($self, $slice) = @_;
  my $fh = $self->{'filehandle'};

  # The header text comes from whichever function is currently installed.
  my $function = $self->header_function();
  my $metadata = $function->($slice);

  # A failed write must not pass silently: print_Seq treats write failures
  # on the same filehandle as fatal, so the header is held to the same rule.
  print {$fh} '>'.$metadata."\n" or die "Error writing to file handle";
}
141 | |
=head2 print_Seq

  Arg [1]    : Bio::EnsEMBL::Slice or other Bio::PrimarySeqI compliant object
  Description: Serializes the sequence into FASTA format. The header line is
               written through print_metadata(); the sequence is then fetched
               with subseq() in chunks of chunk_factor x line_width bases and
               each chunk is wrapped at line_width characters before being
               printed. The 'achieved_something' flag is set when the
               sequence has a length greater than zero.
               While other Bioperl PrimarySeqI implementations can be used,
               a custom header function may be needed to accommodate them.
  Returntype : None
  Exceptions : Dies if a chunk cannot be written to the filehandle

=cut

sub print_Seq {
  my ($self, $slice) = @_;
  my $fh = $self->{'filehandle'};

  $self->print_metadata($slice);

  my $width = $self->{line_width};

  # Buffer size in bases. It is a whole number of output lines, so line
  # breaks inserted per chunk stay aligned across chunk boundaries.
  my $chunk_size = $self->{'chunk_factor'} * $width;
  my $last_base  = $slice->length();

  # Walk the sequence one chunk at a time to conserve memory.
  my $chunk_end;
  for (my $chunk_start = 1; $chunk_start <= $last_base; $chunk_start = $chunk_end + 1) {
    my $wanted_end = $chunk_start + $chunk_size - 1;
    $chunk_end = ($wanted_end > $last_base) ? $last_base : $wanted_end;

    my $bases = $slice->subseq($chunk_start, $chunk_end);
    $bases =~ s/(.{1,$width})/$1\n/g;
    print {$fh} $bases or die "Error writing to file handle";
  }

  $self->{'achieved_something'} = 1 if $slice->length > 0;
}
184 | |
=head2 line_width

  Arg [1]    : [optional] Integer e.g. 60 or 80. A false value (undef, 0 or
               an empty string) leaves the current setting untouched.
  Description: Set and get FASTA format line width. Default is 60
  Returntype : Integer

=cut

sub line_width {
  my ($self, $line_width) = @_;

  # Only a true value replaces the stored width; otherwise this is a getter.
  $self->{'line_width'} = $line_width if $line_width;

  return $self->{'line_width'};
}
199 | |
=head2 chunk_factor

  Arg [1]    : [optional] Integer e.g. 1000. A false value (undef, 0 or an
               empty string) leaves the current setting untouched.
  Description: Set and get the multiplier used to dictate buffer size
               Chunk factor x line width = buffer size in bases.
  Returntype : Integer

=cut

sub chunk_factor {
  my ($self, $chunk_factor) = @_;

  # Only a true value replaces the stored factor; otherwise this is a getter.
  $self->{'chunk_factor'} = $chunk_factor if $chunk_factor;

  return $self->{'chunk_factor'};
}
213 | |
=head2 set_custom_header

  Arg [1]    : CODE reference
  Description: Set the custom header function. Normally this is done at
               construction time, but can be overridden here. The work is
               delegated to header_function().
  Example    : $serializer->set_custom_header( sub { return 'New header'});
  Returntype : None

=cut

sub set_custom_header {
  my $self = shift;
  my $new_header_function = shift;

  # header_function() is the single place that stores the code reference.
  $self->header_function($new_header_function);

  return;
}
229 | |
=head2 header_function

  Arg [1]    : CODE reference (optional). A false value is ignored, making
               the call a plain getter.
  Description: Getter/setter for the custom header code. A supplied value is
               checked with assert_ref() against 'CODE' before it is stored.
  Example    : $serializer->header_function( sub { return 'New header'});
  Returntype : CODE

=cut

sub header_function {
  my $self = shift;
  my $header_function = shift;

  # Getter path: nothing (or a false value) was passed in.
  return $self->{header_function} unless $header_function;

  assert_ref($header_function, 'CODE', 'header_function');
  $self->{header_function} = $header_function;

  return $self->{header_function};
}
247 | |
248 1; |