0
|
1 #
|
|
2 # BioPerl module for Bio::SeqIO::bsml
|
|
3 #
|
|
4 # Cared for by Charles Tilford (tilfordc@bms.com)
|
|
5 # Copyright (C) Charles Tilford 2001
|
|
6 #
|
|
7 # This library is free software; you can redistribute it and/or
|
|
8 # modify it under the terms of the GNU Lesser General Public
|
|
9 # License as published by the Free Software Foundation; either
|
|
10 # version 2.1 of the License, or (at your option) any later version.
|
|
11 #
|
|
12 # This library is distributed in the hope that it will be useful,
|
|
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
15 # Lesser General Public License for more details.
|
|
16 #
|
|
17 # You should have received a copy of the GNU Lesser General Public
|
|
18 # License along with this library; if not, write to the Free Software
|
|
19 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
20 # Also at: http://www.gnu.org/copyleft/lesser.html
|
|
21
|
|
22
|
|
23 # Much of the basic documentation in this module has been
|
|
24 # cut-and-pasted from the embl.pm (Ewan Birney) SeqIO module.
|
|
25
|
|
26
|
|
27 =head1 NAME
|
|
28
|
|
29 Bio::SeqIO::bsml - BSML sequence input/output stream
|
|
30
|
|
31 =head1 SYNOPSIS
|
|
32
|
|
33 It is probably best not to use this object directly, but rather go
|
|
34 through the SeqIO handler system. To read a BSML file:
|
|
35
|
|
36 $stream = Bio::SeqIO->new( -file => $filename, -format => 'bsml');
|
|
37
|
|
38 while ( my $bioSeqObj = $stream->next_seq() ) {
|
|
39 # do something with $bioSeqObj
|
|
40 }
|
|
41
|
|
42 To write a Seq object to the current file handle in BSML XML format:
|
|
43
|
|
44 $stream->write_seq( -seq => $seqObj);
|
|
45
|
|
46 If instead you would like a XML::DOM object containing the BSML, use:
|
|
47
|
|
48 my $newXmlObject = $stream->to_bsml( -seq => $seqObj);
|
|
49
|
|
50 =head1 DEPENDENCIES
|
|
51
|
|
52 In addition to parts of the Bio:: hierarchy, this module uses:
|
|
53
|
|
54 XML::DOM
|
|
55
|
|
56 =head1 DESCRIPTION
|
|
57
|
|
58 This object can transform Bio::Seq objects to and from BSML (XML)
|
|
59 flatfiles.
|
|
60
|
|
61 =head2 NOTE:
|
|
62
|
|
63 2/1/02 - I have changed the API to more closely match argument
|
|
64 passing used by other BioPerl methods ( -tag => value ). Internal
|
|
65 methods are using the same API, but you should not be calling those
|
|
66 anyway...
|
|
67
|
|
68 =head1 FEEDBACK
|
|
69
|
|
70 =head2 Mailing Lists
|
|
71
|
|
72 User feedback is an integral part of the evolution of this and other
|
|
73 Bioperl modules. Send your comments and suggestions preferably to one
|
|
74 of the Bioperl mailing lists. Your participation is much
|
|
75 appreciated.
|
|
76
|
|
77 bioperl-l@bioperl.org - General discussion
|
|
78 http://www.bioperl.org/MailList.shtml - About the mailing lists
|
|
79
|
|
80 =head2 Reporting Bugs
|
|
81
|
|
82 Report bugs to the Bioperl bug tracking system to help us keep track
|
|
83 the bugs and their resolution.
|
|
84 Bug reports can be submitted via email or the web:
|
|
85
|
|
86 bioperl-bugs@bio.perl.org
|
|
87 http://bugzilla.bioperl.org/
|
|
88
|
|
89 =head2 Things Still to Do
|
|
90
|
|
91 * The module now uses the new Collection.pm system. However,
|
|
92 Annotations associated with a Feature object still seem to use the
|
|
93 old system, so parsing with the old methods are included..
|
|
94
|
|
95 * Generate Seq objects with no sequence data but an assigned
|
|
96 length. This appears to be an issue with Bio::Seq. It is possible
|
|
97 (and reasonable) to make a BSML document with features but no
|
|
98 sequence data.
|
|
99
|
|
100 * Support <Seq-data-import>. Do not know how commonly this is used.
|
|
101
|
|
102 * Some features are awaiting implementation in later versions of
|
|
103 BSML. These include:
|
|
104
|
|
105 * Nested feature support
|
|
106
|
|
107 * Complex feature (ie joins)
|
|
108
|
|
109 * Unambiguity in strand (ie -1,0,1, not just 'complement' )
|
|
110
|
|
111 * More friendly dblink structures
|
|
112
|
|
113 * Location.pm (or RangeI::union?) appears to have a bug when 'expand'
|
|
114 is used.
|
|
115
|
|
116 * More intelligent hunting for sequence and feature titles? It is not
|
|
117 terribly clear where the most appropriate field is located, better
|
|
118 grepping (eg looking for a reasonable count for spaces and numbers)
|
|
119 may allow for titles better than "AE008041".
|
|
120
|
|
121 =head1 AUTHOR - Charles Tilford
|
|
122
|
|
123 Bristol-Myers Squibb Bioinformatics
|
|
124
|
|
125 Email tilfordc@bms.com
|
|
126
|
|
127 I have developed the BSML specific code for this package, but have used
|
|
128 code from other SeqIO packages for much of the nuts-and-bolts. In particular
|
|
129 I have used code from the embl.pm module either directly or as a framework
|
|
130 for many of the subroutines that are common to SeqIO modules.
|
|
131
|
|
132 =cut
|
|
133
|
|
134 package Bio::SeqIO::bsml;
|
|
135 use vars qw(@ISA);
|
|
136 use strict;
|
|
137
|
|
138 use Bio::SeqIO;
|
|
139 use Bio::SeqFeature::Generic;
|
|
140 use Bio::Species;
|
|
141 use XML::DOM;
|
|
142 use Bio::Seq::SeqFactory;
|
|
143 use Bio::Annotation::Collection;
|
|
144 use Bio::Annotation::Comment;
|
|
145 use Bio::Annotation::Reference;
|
|
146 use Bio::Annotation::DBLink;
|
|
147
|
|
148 @ISA = qw(Bio::SeqIO);
|
|
149
|
|
150 my $idcounter = {}; # Used to generate unique id values
|
|
151 my $nvtoken = ": "; # The token used if a name/value pair has to be stuffed
|
|
152 # into a single line
|
|
153
|
|
154 =head1 METHODS
|
|
155
|
|
156 =cut
|
|
157
|
|
158 # LS: this seems to get overwritten on line 1317, generating a redefinition error. Dead code?
|
|
159 # CAT: This was inappropriately added in revision 1.10 - I added the check for existance of a sequence factory to the actual _initialize
|
|
160 # sub _initialize {
|
|
161 # my($self,@args) = @_;
|
|
162 # $self->SUPER::_initialize(@args);
|
|
163 # if( ! defined $self->sequence_factory ) {
|
|
164 # $self->sequence_factory(new Bio::Seq::SeqFactory(-verbose => $self->verbose(), -type => 'Bio::Seq::RichSeq'));
|
|
165 # }
|
|
166 # }
|
|
167
|
|
168 =head2 next_seq
|
|
169
|
|
170 Title : next_seq
|
|
171 Usage : my $bioSeqObj = $stream->next_seq
|
|
172 Function: Retrieves the next sequence from a SeqIO::bsml stream.
|
|
173 Returns : A reference to a Bio::Seq::RichSeq object
|
|
174 Args :
|
|
175
|
|
176 =cut
|
|
177
|
|
178 sub next_seq {
|
|
179 my $self = shift;
|
|
180 my ($desc);
|
|
181 my $bioSeq = $self->sequence_factory->create(-verbose =>$self->verbose());
|
|
182
|
|
183 unless (exists $self->{'domtree'}) {
|
|
184 $self->throw("A BSML document has not yet been parsed.");
|
|
185 return undef;
|
|
186 }
|
|
187 my $dom = $self->{'domtree'};
|
|
188 my $seqElements = $dom->getElementsByTagName ("Sequence");
|
|
189 if ($self->{'current_node'} == $seqElements->getLength ) {
|
|
190 # There are no more <Sequence>s to process
|
|
191 return undef;
|
|
192 }
|
|
193 my $xmlSeq = $seqElements->item($self->{'current_node'});
|
|
194
|
|
195 # Assume that title attribute contains the best display id
|
|
196 if (my $val = $xmlSeq->getAttribute( "title")) {
|
|
197 $bioSeq->display_id($val);
|
|
198 }
|
|
199
|
|
200 # Set the molecule type
|
|
201 if (my $val = $xmlSeq->getAttribute( "molecule" )) {
|
|
202 my %mol = ('dna' => 'DNA', 'rna' => 'RNA', 'aa' => 'protein');
|
|
203 $bioSeq->molecule($mol{ lc($val) });
|
|
204 }
|
|
205
|
|
206 # Set the accession number
|
|
207 if (my $val = $xmlSeq->getAttribute( "ic-acckey" )) {
|
|
208 $bioSeq->accession_number($val);
|
|
209 }
|
|
210
|
|
211 # Get the sequence data for the element
|
|
212 if (my $seqData = &FIRSTDATA($xmlSeq->getElementsByTagName("Seq-data")
|
|
213 ->item(0) ) ) {
|
|
214 # Sequence data exists, transfer to the Seq object
|
|
215 # Remove white space and CRs (not neccesary?)
|
|
216 $seqData =~ s/[\s\n\r]//g;
|
|
217 $bioSeq->seq($seqData);
|
|
218 } elsif (my $import = $xmlSeq->getElementsByTagName("Seq-dataimport")
|
|
219 ->item(0) ) {
|
|
220 #>>>> # What about <Seq-data-import> ??
|
|
221
|
|
222 } elsif (my $val = $xmlSeq->getAttribute("length")) {
|
|
223 # No sequence defined, set the length directly
|
|
224
|
|
225 #>>>> # This does not appear to work - length is apparently calculated
|
|
226 # from the sequence. How to make a "virtual" sequence??? Such
|
|
227 # creatures are common in BSML...
|
|
228 $bioSeq->length($val);
|
|
229 }
|
|
230
|
|
231 my $species = Bio::Species->new();
|
|
232 my @classification = ();
|
|
233
|
|
234 # Peruse the generic <Attributes> - those that are direct children of
|
|
235 # the <Sequence> or the <Feature-tables> element
|
|
236 # Sticky wicket here - data not controlled by schema, could be anything
|
|
237 my @seqDesc = ();
|
|
238 my %specs = ('common_name' => 'y',
|
|
239 'genus' => 'y',
|
|
240 'species' => 'y',
|
|
241 'sub_species' => 'y', );
|
|
242 my %seqMap = (
|
|
243 'add_date' => [ 'date' ],
|
|
244 'keywords' => [ 'keyword', ],
|
|
245 'seq_version' => [ 'version' ],
|
|
246 'division' => [ 'division' ],
|
|
247 'add_secondary_accession' => ['accession'],
|
|
248 'pid' => ['pid'],
|
|
249 'primary_id' => [ 'primary.id', 'primary_id' ],
|
|
250 );
|
|
251 my $floppies = &GETFLOPPIES($xmlSeq);
|
|
252 foreach my $attr (@{$floppies}) {
|
|
253 # Don't want to get attributes from <Feature> or <Table> elements yet
|
|
254 my $parent = $attr->getParentNode->getNodeName;
|
|
255 next unless($parent eq "Sequence" || $parent eq "Feature-tables");
|
|
256
|
|
257 my ($name, $content) = &FLOPPYVALS($attr);
|
|
258 $name = lc($name);
|
|
259 if (exists $specs{$name}) { # It looks like part of species...
|
|
260 $species->$name($content);
|
|
261 next;
|
|
262 }
|
|
263 my $value = "";
|
|
264 # Cycle through the Seq methods:
|
|
265 foreach my $method (keys %seqMap) {
|
|
266 # Cycle through potential matching attributes:
|
|
267 foreach my $match (@{$seqMap{$method}}) {
|
|
268 # If the <Attribute> name matches one of the keys,
|
|
269 # set $value, unless it has already been set
|
|
270 $value ||= $content if ($name =~ /$match/i);
|
|
271 }
|
|
272 if ($value ne "") {
|
|
273 $bioSeq->$method($value);
|
|
274 last;
|
|
275 }
|
|
276 }
|
|
277 next if ($value ne "");
|
|
278
|
|
279 if ($name =~ /^species$/i) { # Uh, it's the species designation?
|
|
280 if ($content =~ / /) {
|
|
281 # Assume that a full species name has been provided
|
|
282 # This will screw up if the last word is the subspecies...
|
|
283 my @break = split " ", $content;
|
|
284 @classification = reverse @break;
|
|
285 } else {
|
|
286 $classification[0] = $content;
|
|
287 }
|
|
288 next;
|
|
289 }
|
|
290 if ($name =~ /sub[_ ]?species/i) { # Should be the subspecies...
|
|
291 $species->sub_species( $content );
|
|
292 next;
|
|
293 }
|
|
294 if ($name =~ /classification/i) { # Should be species classification
|
|
295 # We will assume that there are spaces separating the terms:
|
|
296 my @bits = split " ", $content;
|
|
297 # Now make sure there is not other cruft as well (eg semi-colons)
|
|
298 for my $i (0..$#bits) {
|
|
299 $bits[$i] =~ /(\w+)/;
|
|
300 $bits[$i] = $1;
|
|
301 }
|
|
302 $species->classification( @bits );
|
|
303 next;
|
|
304 }
|
|
305 if ($name =~ /comment/) {
|
|
306 my $com = Bio::Annotation::Comment->new('-text' => $content);
|
|
307 # $bioSeq->annotation->add_Comment($com);
|
|
308 $bioSeq->annotation->add_Annotation('comment', $com);
|
|
309 next;
|
|
310 }
|
|
311 # Description line - collect all descriptions for later assembly
|
|
312 if ($name =~ /descr/) {
|
|
313 push @seqDesc, $content;
|
|
314 next;
|
|
315 }
|
|
316 # Ok, we have no idea what this attribute is. Dump to SimpleValue
|
|
317 my $simp = Bio::Annotation::SimpleValue->new( -value => $content);
|
|
318 $bioSeq->annotation->add_Annotation($name, $simp);
|
|
319 }
|
|
320 unless ($#seqDesc < 0) {
|
|
321 $bioSeq->desc( join "; ", @seqDesc);
|
|
322 }
|
|
323
|
|
324 #>>>> This should be modified so that any IDREF associated with the
|
|
325 # <Reference> is then used to associate the reference with the
|
|
326 # appropriate Feature
|
|
327
|
|
328 # Extract out <Reference>s associated with the sequence
|
|
329 my @refs;
|
|
330 my %tags = (
|
|
331 -title => "RefTitle",
|
|
332 -authors => "RefAuthors",
|
|
333 -location => "RefJournal",
|
|
334 );
|
|
335 foreach my $ref ( $xmlSeq->getElementsByTagName ("Reference") ) {
|
|
336 my %refVals;
|
|
337 foreach my $tag (keys %tags) {
|
|
338 my $rt = &FIRSTDATA($ref->getElementsByTagName($tags{$tag})
|
|
339 ->item(0));
|
|
340 $rt =~ s/^[\s\r\n]+//; # Kill leading space
|
|
341 $rt =~ s/[\s\r\n]+$//; # Kill trailing space
|
|
342 $rt =~ s/[\s\r\n]+/ /; # Collapse internal space runs
|
|
343 $refVals{$tag} = $rt;
|
|
344 }
|
|
345 my $reference = Bio::Annotation::Reference->new( %refVals );
|
|
346
|
|
347 # Pull out any <Reference> information hidden in <Attributes>
|
|
348 my %refMap = (
|
|
349 comment => [ 'comment', 'remark' ],
|
|
350 medline => [ 'medline', ],
|
|
351 pubmed => [ 'pubmed' ],
|
|
352 start => [ 'start', 'begin' ],
|
|
353 end => [ 'stop', 'end' ],
|
|
354 );
|
|
355 my @refCom = ();
|
|
356 my $floppies = &GETFLOPPIES($ref);
|
|
357 foreach my $attr (@{$floppies}) {
|
|
358 my ($name, $content) = &FLOPPYVALS($attr);
|
|
359 my $value = "";
|
|
360 # Cycle through the Seq methods:
|
|
361 foreach my $method (keys %refMap) {
|
|
362 # Cycle through potential matching attributes:
|
|
363 foreach my $match (@{$refMap{$method}}) {
|
|
364 # If the <Attribute> name matches one of the keys,
|
|
365 # set $value, unless it has already been set
|
|
366 $value ||= $content if ($name =~ /$match/i);
|
|
367 }
|
|
368 if ($value ne "") {
|
|
369 my $str = '$reference->' . $method . "($value)";
|
|
370 eval($str);
|
|
371 next;
|
|
372 }
|
|
373 }
|
|
374 next if ($value ne "");
|
|
375 # Don't know what the <Attribute> is, dump it to comments:
|
|
376 push @refCom, $name . $nvtoken . $content;
|
|
377 }
|
|
378 unless ($#refCom < 0) {
|
|
379 # Random stuff was found, tack it to the comment field
|
|
380 my $exist = $reference->comment;
|
|
381 $exist .= join ", ", @refCom;
|
|
382 $reference->comment($exist);
|
|
383 }
|
|
384 push @refs, $reference;
|
|
385 }
|
|
386 $bioSeq->annotation->add_Annotation('reference'=>$_) foreach @refs;
|
|
387
|
|
388 # Extract the <Feature>s for this <Sequence>
|
|
389 foreach my $feat ( $xmlSeq->getElementsByTagName("Feature") ) {
|
|
390 $bioSeq->add_SeqFeature( $self->_parse_bsml_feature($feat) );
|
|
391 }
|
|
392
|
|
393 $species->classification( @classification );
|
|
394 $bioSeq->species( $species );
|
|
395
|
|
396 # $seq->annotation->add_DBLink(@links); ->
|
|
397
|
|
398 $self->{'current_node'}++;
|
|
399 return $bioSeq;
|
|
400 }
|
|
401 #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
|
402 # Get all the <Attribute> and <Qualifier> children for an object, and
|
|
403 # return them as an array reference
|
|
404 # ('floppy' since these elements have poor/no schema control)
|
|
405 sub GETFLOPPIES {
|
|
406 my $obj = shift;
|
|
407
|
|
408 my @floppies;
|
|
409 my $attributes = $obj->getElementsByTagName ("Attribute");
|
|
410 for (my $i = 0; $i < $attributes->getLength; $i++) {
|
|
411 push @floppies, $attributes->item($i);
|
|
412 }
|
|
413 my $qualifiers = $obj->getElementsByTagName ("Qualifier");
|
|
414 for (my $i = 0; $i < $qualifiers->getLength; $i++) {
|
|
415 push @floppies, $qualifiers->item($i);
|
|
416 }
|
|
417 return \@floppies;
|
|
418 }
|
|
419 #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
|
420 # Given a DOM <Attribute> or <Qualifier> object, return the [name, value] pair
|
|
421 sub FLOPPYVALS {
|
|
422 my $obj = shift;
|
|
423
|
|
424 my ($name, $value);
|
|
425 if ($obj->getNodeName eq "Attribute") {
|
|
426 $name = $obj->getAttribute('name');
|
|
427 $value = $obj->getAttribute('content');
|
|
428 } elsif ($obj->getNodeName eq "Qualifier") {
|
|
429 # Wheras <Attribute>s require both 'name' and 'content' attributes,
|
|
430 # <Qualifier>s can technically have either blank (and sometimes do)
|
|
431 my $n = $obj->getAttribute('value-type');
|
|
432 $name = $n if ($n ne "");
|
|
433 my $v = $obj->getAttribute('value');
|
|
434 $value = $v if ($v ne "");
|
|
435 }
|
|
436 return ($name, $value);
|
|
437 }
|
|
438 #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
|
439 # Returns the value of the first TEXT_NODE encountered below an element
|
|
440 # Rational - avoid grabbing a comment rather than the PCDATA. Not foolproof...
|
|
441 sub FIRSTDATA {
|
|
442 my $element = shift;
|
|
443 return undef unless ($element);
|
|
444
|
|
445 my $hopefuls = $element->getChildNodes;
|
|
446 my $data;
|
|
447 for (my $i = 0; $i < $hopefuls->getLength; $i++) {
|
|
448 if ($hopefuls->item($i)->getNodeType ==
|
|
449 XML::DOM::Node::TEXT_NODE() ) {
|
|
450 $data = $hopefuls->item($i)->getNodeValue;
|
|
451 last;
|
|
452 }
|
|
453 }
|
|
454 return $data;
|
|
455 }
|
|
456 #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
|
457 # Just collapses whitespace runs in a string
|
|
458 sub STRIP {
|
|
459 my $string = shift;
|
|
460 $string =~ s/[\s\r\n]+/ /g;
|
|
461 return $string;
|
|
462 }
|
|
463
|
|
464 =head2 to_bsml
|
|
465
|
|
466 Title : to_bsml
|
|
467 Usage : my $domDoc = $obj->to_bsml(@args)
|
|
468 Function: Generates an XML structure for one or more Bio::Seq objects.
|
|
469 If $seqref is an array ref, the XML tree generated will include
|
|
470 all the sequences in the array.
|
|
471 Returns : A reference to the XML DOM::Document object generated / modified
|
|
472 Args : Argument array in form of -key => val. Recognized keys:
|
|
473
|
|
474 -seq A Bio::Seq reference, or an array reference of many of them
|
|
475
|
|
476 -xmldoc Specifies an existing XML DOM document to add the sequences
|
|
477 to. If included, then only data (no page formatting) will
|
|
478 be added. If not, a new XML::DOM::Document will be made,
|
|
479 and will be populated with both <Sequence> data, as well as
|
|
480 <Page> display elements.
|
|
481
|
|
482 -nodisp Do not generate <Display> elements, or any children
|
|
483 thereof, even if -xmldoc is not set.
|
|
484
|
|
485 -skipfeat If set to 'all', all <Feature>s will be skipped. If it is
|
|
486 a hash reference, any <Feature> with a class matching a key
|
|
487 in the hash will be skipped - for example, to skip 'source'
|
|
488 and 'score' features, use:
|
|
489
|
|
490 -skipfeat => { source => 'Y', score => 'Y' }
|
|
491
|
|
492 -skiptags As above: if set to 'all', no tags are included, and if a
|
|
493 hash reference, those specific tags will be ignored.
|
|
494
|
|
495 Skipping some or all tags and features can result in
|
|
496 noticable speed improvements.
|
|
497
|
|
498 -nodata If true, then <Seq-data> will not be included. This may be
|
|
499 useful if you just want annotations and do not care about
|
|
500 the raw ACTG information.
|
|
501
|
|
502 -return Default is 'xml', which will return a reference to the BSML
|
|
503 XML object. If set to 'seq' will return an array ref of the
|
|
504 <Sequence> objects added (rather than the whole XML object)
|
|
505
|
|
506 -close Early BSML browsers will crash if an element *could* have
|
|
507 children but does not, and is closed as an empty element
|
|
508 e.g. <Styles/>. If -close is true, then such tags are given
|
|
509 a comment child to explicitly close them e.g. <Styles><!--
|
|
510 --></Styles>. This is default true, set to "0" if you do
|
|
511 not want this behavior.
|
|
512
|
|
513 Examples : my $domObj = $stream->to_bsml( -seq => \@fourCoolSequenceObjects,
|
|
514 -skipfeat => { source => 1 },
|
|
515 );
|
|
516
|
|
517 # Or add sequences to an existing BSML document:
|
|
518 $stream->to_bsml( -seq => \@fourCoolSequenceObjects,
|
|
519 -skipfeat => { source => 1 },
|
|
520 -xmldoc => $myBsmlDocumentInProgress, );
|
|
521
|
|
522 =cut
|
|
523
|
|
524 sub to_bsml {
|
|
525 my $self = shift;
|
|
526 my $args = $self->_parseparams( -close => 1,
|
|
527 -return => 'xml',
|
|
528 @_);
|
|
529 $args->{NODISP} ||= $args->{NODISPLAY};
|
|
530 my $seqref = $args->{SEQ};
|
|
531 $seqref = (ref($seqref) eq 'ARRAY') ? $seqref : [ $seqref ];
|
|
532
|
|
533 #############################
|
|
534 # Basic BSML XML Components #
|
|
535 #############################
|
|
536
|
|
537 my $xml;
|
|
538 my ($bsmlElem, $defsElem, $seqsElem, $dispElem);
|
|
539 if ($args->{XMLDOC}) {
|
|
540 # The user has provided an existing XML DOM object
|
|
541 $xml = $args->{XMLDOC};
|
|
542 unless ($xml->isa("XML::DOM::Document")) {
|
|
543 die ('SeqIO::bsml.pm error:\n'.
|
|
544 'When calling ->to_bsml( { xmldoc => $myDoc }), $myDoc \n' .
|
|
545 'should be an XML::DOM::Document object, or an object that\n'.
|
|
546 'inherits from that class (like BsmlHelper.pm)');
|
|
547 }
|
|
548 } else {
|
|
549 # The user has not provided a new document, make one from scratch
|
|
550 $xml = XML::DOM::Document->new();
|
|
551 $xml->setXMLDecl( $xml->createXMLDecl("1.0") );
|
|
552 my $url = "http://www.labbook.com/dtd/bsml2_2.dtd";
|
|
553 my $doc = $xml->createDocumentType("Bsml",$url);
|
|
554 $xml->setDoctype($doc);
|
|
555 $bsmlElem = $self->_addel( $xml, 'Bsml');
|
|
556 $defsElem = $self->_addel( $bsmlElem, 'Definitions');
|
|
557 $seqsElem = $self->_addel( $defsElem, 'Sequences');
|
|
558 unless ($args->{NODISP}) {
|
|
559 $dispElem = $self->_addel( $bsmlElem, 'Display');
|
|
560 my $stylElem = $self->_addel( $dispElem, 'Styles');
|
|
561 my $style = $self->_addel( $stylElem, 'Style', {
|
|
562 type => "text/css" });
|
|
563 my $styleText =
|
|
564 qq(Interval-widget { display : "1"; }\n) .
|
|
565 qq(Feature { display-auto : "1"; });
|
|
566 $style->appendChild( $xml->createTextNode($styleText) );
|
|
567 }
|
|
568 }
|
|
569
|
|
570 # Establish fundamental BSML elements, if they do not already exist
|
|
571 $bsmlElem ||= $xml->getElementsByTagName("Bsml")->item(0);
|
|
572 $defsElem ||= $xml->getElementsByTagName("Definitions")->item(0);
|
|
573 $seqsElem ||= $xml->getElementsByTagName("Sequences")->item(0);
|
|
574
|
|
575 ###############
|
|
576 # <Sequences> #
|
|
577 ###############
|
|
578
|
|
579 # Map over Bio::Seq to BSML
|
|
580 my %mol = ('dna' => 'DNA', 'rna' => 'RNA', 'protein' => 'AA');
|
|
581 my @xmlSequences;
|
|
582
|
|
583 foreach my $bioSeq (@{$seqref}) {
|
|
584 my $xmlSeq = $xml->createElement("Sequence");
|
|
585 my $FTs = $xml->createElement("Feature-tables");
|
|
586
|
|
587 # Array references to hold <Reference> objects:
|
|
588 my $seqRefs = []; my $featRefs = [];
|
|
589 # Array references to hold <Attribute> values (not objects):
|
|
590 my $seqDesc = [];
|
|
591 push @{$seqDesc}, ["comment" , "This file generated to BSML 2.2 standards - joins will be collapsed to a single feature enclosing all members of the join"];
|
|
592 push @{$seqDesc}, ["description" , eval{$bioSeq->desc}];
|
|
593 foreach my $kwd ( eval{@{$bioSeq->keywords || []}} ) {
|
|
594 push @{$seqDesc}, ["keyword" , $kwd];
|
|
595 }
|
|
596 push @{$seqDesc}, ["version" , eval{$bioSeq->seq_version}];
|
|
597 push @{$seqDesc}, ["division" , eval{$bioSeq->division}];
|
|
598 push @{$seqDesc}, ["pid" , eval{$bioSeq->pid}];
|
|
599 # push @{$seqDesc}, ["bio_object" , ref($bioSeq)];
|
|
600 my $pid = eval{$bioSeq->primary_id} || '';
|
|
601 if( $pid ne $bioSeq ) {
|
|
602 push @{$seqDesc}, ["primary_id" , eval{$bioSeq->primary_id}];
|
|
603 }
|
|
604 foreach my $dt (eval{$bioSeq->get_dates()} ) {
|
|
605 push @{$seqDesc}, ["date" , $dt];
|
|
606 }
|
|
607 foreach my $ac (eval{$bioSeq->get_secondary_accessions()} ) {
|
|
608 push @{$seqDesc}, ["secondary_accession" , $ac];
|
|
609 }
|
|
610
|
|
611 # Determine the accession number and a unique identifier
|
|
612 my $acc = $bioSeq->accession_number eq "unknown" ?
|
|
613 "" : $bioSeq->accession_number;
|
|
614 my $id;
|
|
615 my $pi = $bioSeq->primary_id;
|
|
616 if ($pi && $pi !~ /Bio::/) {
|
|
617 # Not sure I understand what primary_id is... It sometimes
|
|
618 # is a string describing a reference to a BioSeq object...
|
|
619 $id = "SEQ" . $bioSeq->primary_id;
|
|
620 } else {
|
|
621 # Nothing useful found, make a new unique ID
|
|
622 $id = $acc || ("SEQ-io" . $idcounter->{Sequence}++);
|
|
623 }
|
|
624 # print "$id->",ref($bioSeq->primary_id),"\n";
|
|
625 # An id field with spaces is interpreted as an idref - kill the spaces
|
|
626 $id =~ s/ /-/g;
|
|
627 # Map over <Sequence> attributes
|
|
628 my %attr = ( 'title' => $bioSeq->display_id,
|
|
629 'length' => $bioSeq->length,
|
|
630 'ic-acckey' => $acc,
|
|
631 'id' => $id,
|
|
632 'representation' => 'raw',
|
|
633 );
|
|
634 $attr{molecule} = $mol{ lc($bioSeq->molecule) } if $bioSeq->can('molecule');
|
|
635
|
|
636
|
|
637 foreach my $a (keys %attr) {
|
|
638 $xmlSeq->setAttribute($a, $attr{$a}) if (defined $attr{$a} &&
|
|
639 $attr{$a} ne "");
|
|
640 }
|
|
641 # Orphaned Attributes:
|
|
642 $xmlSeq->setAttribute('topology', 'circular')
|
|
643 if ($bioSeq->is_circular);
|
|
644 # <Sequence> strand, locus
|
|
645
|
|
646 $self->_add_page($xml, $xmlSeq) if ($dispElem);
|
|
647 ################
|
|
648 # <Attributes> #
|
|
649 ################
|
|
650
|
|
651 # Check for Bio::Annotations on the * <Sequence> *.
|
|
652 $self->_parse_annotation( -xml => $xml, -obj => $bioSeq,
|
|
653 -desc => $seqDesc, -refs => $seqRefs);
|
|
654
|
|
655 # Incorporate species data
|
|
656 if (ref($bioSeq->species) eq 'Bio::Species') {
|
|
657 # Need to peer into Bio::Species ...
|
|
658 my @specs = ('common_name', 'genus', 'species', 'sub_species');
|
|
659 foreach my $sp (@specs) {
|
|
660 next unless (my $val = $bioSeq->species()->$sp());
|
|
661 push @{$seqDesc}, [$sp , $val];
|
|
662 }
|
|
663 push @{$seqDesc}, ['classification',
|
|
664 (join " ", $bioSeq->species->classification) ];
|
|
665 # Species::binomial will return "genus species sub_species" ...
|
|
666 } elsif (my $val = $bioSeq->species) {
|
|
667 # Ok, no idea what it is, just dump it in there...
|
|
668 push @{$seqDesc}, ["species", $val];
|
|
669 }
|
|
670
|
|
671 # Add the description <Attribute>s for the <Sequence>
|
|
672 foreach my $seqD (@{$seqDesc}) {
|
|
673 $self->_addel($xmlSeq, "Attribute", {
|
|
674 name => $seqD->[0], content => $seqD->[1]}) if ($seqD->[1]);
|
|
675 }
|
|
676
|
|
677 # If sequence references were added, make a Feature-table for them
|
|
678 unless ($#{$seqRefs} < 0) {
|
|
679 my $seqFT = $self->_addel($FTs, "Feature-table", {
|
|
680 title => "Sequence References", });
|
|
681 foreach my $feat (@{$seqRefs}) {
|
|
682 $seqFT->appendChild($feat);
|
|
683 }
|
|
684 }
|
|
685
|
|
686 # This is the appropriate place to add <Feature-tables>
|
|
687 $xmlSeq->appendChild($FTs);
|
|
688
|
|
689 #############
|
|
690 # <Feature> #
|
|
691 #############
|
|
692
|
|
693 #>>>> # Perhaps it is better to loop through top_Seqfeatures?...
|
|
694 #>>>> # ...however, BSML does not have a hierarchy for Features
|
|
695
|
|
696 if (defined $args->{SKIPFEAT} &&
|
|
697 $args->{SKIPFEAT} eq 'all') {
|
|
698 $args->{SKIPFEAT} = { all => 1};
|
|
699 }
|
|
700 foreach my $class (keys %{$args->{SKIPFEAT}}) {
|
|
701 $args->{SKIPFEAT}{lc($class)} = $args->{SKIPFEAT}{$class};
|
|
702 }
|
|
703 # Loop through all the features
|
|
704 my @features = $bioSeq->all_SeqFeatures();
|
|
705 if (@features && !$args->{SKIPFEAT}{all}) {
|
|
706 my $ft = $self->_addel($FTs, "Feature-table", {
|
|
707 title => "Features", });
|
|
708 foreach my $bioFeat (@features ) {
|
|
709 my $featDesc = [];
|
|
710 my $class = lc($bioFeat->primary_tag);
|
|
711 # The user may have specified to ignore this type of feature
|
|
712 next if ($args->{SKIPFEAT}{$class});
|
|
713 my $id = "FEAT-io" . $idcounter->{Feature}++;
|
|
714 my $xmlFeat = $self->_addel( $ft, 'Feature', {
|
|
715 'id' => $id,
|
|
716 'class' => $class ,
|
|
717 'value-type' => $bioFeat->source_tag });
|
|
718 # Check for Bio::Annotations on the * <Feature> *.
|
|
719 $self->_parse_annotation( -xml => $xml, -obj => $bioFeat,
|
|
720 -desc => $featDesc, -id => $id,
|
|
721 -refs =>$featRefs, );
|
|
722 # Add the description stuff for the <Feature>
|
|
723 foreach my $de (@{$featDesc}) {
|
|
724 $self->_addel($xmlFeat, "Attribute", {
|
|
725 name => $de->[0], content => $de->[1]}) if ($de->[1]);
|
|
726 }
|
|
727 $self->_parse_location($xml, $xmlFeat, $bioFeat);
|
|
728
|
|
729 # loop through the tags, add them as <Qualifiers>
|
|
730 next if (defined $args->{SKIPTAGS} &&
|
|
731 $args->{SKIPTAGS} =~ /all/i);
|
|
732 # Tags can consume a lot of CPU cycles, and can often be
|
|
733 # rather non-informative, so -skiptags can allow total or
|
|
734 # selective omission of tags.
|
|
735 foreach my $tag ($bioFeat->all_tags()) {
|
|
736 next if (exists $args->{SKIPTAGS}{$tag});
|
|
737 foreach my $val ($bioFeat->each_tag_value($tag)) {
|
|
738 $self->_addel( $xmlFeat, 'Qualifier', {
|
|
739 'value-type' => $tag ,
|
|
740 'value' => $val });
|
|
741 }
|
|
742 }
|
|
743 }
|
|
744 }
|
|
745
|
|
746 ##############
|
|
747 # <Seq-data> #
|
|
748 ##############
|
|
749
|
|
750 # Add sequence data
|
|
751 if ( (my $data = $bioSeq->seq) && !$args->{NODATA} ) {
|
|
752 my $d = $self->_addel($xmlSeq, 'Seq-data');
|
|
753 $d->appendChild( $xml->createTextNode($data) );
|
|
754 }
|
|
755
|
|
756 # If references were added, make a Feature-table for them
|
|
757 unless ($#{$featRefs} < 0) {
|
|
758 my $seqFT = $self->_addel($FTs, "Feature-table", {
|
|
759 title => "Feature References", });
|
|
760 foreach my $feat (@{$featRefs}) {
|
|
761 $seqFT->appendChild($feat);
|
|
762 }
|
|
763 }
|
|
764
|
|
765 # Place the completed <Sequence> tree as a child of <Sequences>
|
|
766 $seqsElem->appendChild($xmlSeq);
|
|
767 push @xmlSequences, $xmlSeq;
|
|
768 }
|
|
769
|
|
770 # Prevent browser crashes by explicitly closing empty elements:
|
|
771 if ($args->{CLOSE}) {
|
|
772 my @problemChild = ('Sequences', 'Sequence', 'Feature-tables',
|
|
773 'Feature-table', 'Screen', 'View',);
|
|
774 foreach my $kid (@problemChild) {
|
|
775 foreach my $prob ($xml->getElementsByTagName($kid)) {
|
|
776 unless ($prob->hasChildNodes) {
|
|
777 $prob->appendChild(
|
|
778 $xml->createComment(" Must close <$kid> explicitly "));
|
|
779 }
|
|
780 }
|
|
781 }
|
|
782 }
|
|
783
|
|
784 if (defined $args->{RETURN} &&
|
|
785 $args->{RETURN} =~ /seq/i) {
|
|
786 return \@xmlSequences;
|
|
787 } else {
|
|
788 return $xml;
|
|
789 }
|
|
790 }
|
|
791
|
|
792 =head2 write_seq
|
|
793
|
|
794 Title : write_seq
|
|
795 Usage : $obj->write_seq(@args)
|
|
796 Function: Prints out an XML structure for one or more Bio::Seq objects.
|
|
797 If $seqref is an array ref, the XML tree generated will include
|
|
798 all the sequences in the array. This method is fairly simple,
|
|
799 most of the processing is performed within to_bsml.
|
|
800 Returns : A reference to the XML object generated / modified
|
|
801 Args : Argument array. Recognized keys:
|
|
802
|
|
803 -seq A Bio::Seq reference, or an array reference of many of them
|
|
804
|
|
805 Alternatively, the method may be called simply as...
|
|
806
|
|
807 $obj->write_seq( $bioseq )
|
|
808
|
|
809 ... if only a single argument is passed, it is assumed that
|
|
810 it is the sequence object (can also be an array ref of
|
|
811 many Seq objects )
|
|
812
|
|
813 -printmime If true prints "Content-type: $mimetype\n\n" at top of
|
|
814 document, where $mimetype is the value designated by this
|
|
815 key. For generic XML use text/xml, for BSML use text/x-bsml
|
|
816
|
|
817 -return This option will be supressed, since the nature of this
|
|
818 method is to print out the XML document. If you wish to
|
|
819 retrieve the <Sequence> objects generated, use the to_bsml
|
|
820 method directly.
|
|
821
|
|
822 =cut
|
|
823
|
|
824 sub write_seq {
|
|
825 my $self = shift;
|
|
826 my $args = $self->_parseparams( @_);
|
|
827 if ($#_ == 0 ) {
|
|
828 # If only a single value is passed, assume it is the seq object
|
|
829 unshift @_, "-seq";
|
|
830 }
|
|
831 # Build a BSML XML DOM object based on the sequence(s)
|
|
832 my $xml = $self->to_bsml( @_,
|
|
833 -return => undef );
|
|
834 # Convert to a string
|
|
835 my $out = $xml->toString;
|
|
836 # Print after putting a return after each element - more readable
|
|
837 $out =~ s/>/>\n/g;
|
|
838 $self->_print("Content-type: " . $args->{PRINTMIME} . "\n\n")
|
|
839 if ($args->{PRINTMIME});
|
|
840 $self->_print( $out );
|
|
841 # Return the DOM tree in case the user wants to do something with it
|
|
842
|
|
843 $self->flush if $self->_flush_on_write && defined $self->_fh;
|
|
844 return $xml;
|
|
845 }
|
|
846
|
|
847 =head1 INTERNAL METHODS
|
|
848 #-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-
|
|
849
|
|
850 The following methods are used for internal processing, and should probably
|
|
851 not be accessed by the user.
|
|
852
|
|
853 =head2 _parse_location
|
|
854
|
|
855 Title : _parse_location
|
|
856 Usage : $obj->_parse_location($xmlDocument, $parentElem, $SeqFeatureObj)
|
|
857 Function: Adds <Interval-loc> and <Site-loc> children to <$parentElem> based
|
|
858 on locations / sublocations found in $SeqFeatureObj. If
|
|
859 sublocations exist, the original location will be ignored.
|
|
860 Returns : An array ref containing the elements added to the parent.
|
|
861 These will have already been added to <$parentElem>
|
|
862 Args : 0 The DOM::Document being modified
|
|
863 1 The DOM::Element parent that you want to add to
|
|
864 2 Reference to the Bio::SeqFeature being analyzed
|
|
865
|
|
866 =cut
|
|
867
|
|
868 ###############################
|
|
869 # <Interval-loc> & <Site-loc> #
|
|
870 ###############################
|
|
871
|
|
872 sub _parse_location {
|
|
873 my $self = shift;
|
|
874 my ($xml, $xmlFeat, $bioFeat) = @_;
|
|
875 my $bioLoc = $bioFeat->location;
|
|
876 my @locations;
|
|
877 if (ref($bioLoc) =~ /Split/) {
|
|
878 @locations = $bioLoc->sub_Location;
|
|
879 # BSML 2.2 does not recognize / support joins. For this reason,
|
|
880 # we will just use the upper-level location. The line below can
|
|
881 # be deleted or commented out if/when BSML 3 supports complex
|
|
882 # interval deffinitions:
|
|
883 @locations = ($bioLoc);
|
|
884 } else {
|
|
885 @locations = ($bioLoc);
|
|
886 }
|
|
887 my @added = ();
|
|
888
|
|
889 # Add the site or interval positional information:
|
|
890 foreach my $loc (@locations) {
|
|
891 my ($start, $end) = ($loc->start, $loc->end);
|
|
892 my %locAttr;
|
|
893 # Strand information is not well described in BSML
|
|
894 $locAttr{complement} = 1 if ($loc->strand == -1);
|
|
895 if ($start ne "" && ($start == $end || $end eq "")) {
|
|
896 $locAttr{sitepos} = $start;
|
|
897 push @added, $self->_addel($xmlFeat,'Site-loc',\%locAttr);
|
|
898 } elsif ($start ne "" && $end ne "") {
|
|
899 if ($start > $end) {
|
|
900 # The feature is on the complementary strand
|
|
901 ($start, $end) = ($end, $start);
|
|
902 $locAttr{complement} = 1;
|
|
903 }
|
|
904 $locAttr{startpos} = $start;
|
|
905 $locAttr{endpos} = $end;
|
|
906 push @added, $self->_addel($xmlFeat,'Interval-loc',\%locAttr);
|
|
907 } else {
|
|
908 warn "Failure to parse SeqFeature location. Start = '$start' & End = '$end'";
|
|
909 }
|
|
910 }
|
|
911 return \@added;
|
|
912 }
|
|
913
|
|
914 =head2 _parse_bsml_feature
|
|
915
|
|
916 Title : _parse_bsml_feature
|
|
917 Usage : $obj->_parse_bsml_feature($xmlFeature )
|
|
918 Function: Will examine the <Feature> element provided by $xmlFeature and
|
|
919 return a generic seq feature.
|
|
920 Returns : Bio::SeqFeature::Generic
|
|
921 Args : 0 XML::DOM::Element <Feature> being analyzed.
|
|
922
|
|
923 =cut
|
|
924
|
|
925 sub _parse_bsml_feature {
|
|
926 my $self = shift;
|
|
927 my ($feat) = @_;
|
|
928
|
|
929 my $basegsf = new Bio::SeqFeature::Generic;
|
|
930 # score
|
|
931 # frame
|
|
932 # source_tag
|
|
933
|
|
934 # Use the class as the primary tag value, if it is present
|
|
935 if ( my $val = $feat->getAttribute("class") ) {
|
|
936 $basegsf->primary_tag($val);
|
|
937 }
|
|
938
|
|
939 # Positional information is in <Interval-loc>s or <Site-loc>s
|
|
940 # We need to grab these in order, to try to recreate joins...
|
|
941 my @locations = ();
|
|
942 foreach my $kid ($feat->getChildNodes) {
|
|
943 my $nodeName = $kid->getNodeName;
|
|
944 next unless ($nodeName eq "Interval-loc" ||
|
|
945 $nodeName eq "Site-loc");
|
|
946 push @locations, $kid;
|
|
947 }
|
|
948 if ($#locations == 0) {
|
|
949 # There is only one location specified
|
|
950 $self->_parse_bsml_location($locations[0], $basegsf);
|
|
951 } elsif ($#locations > 0) {
|
|
952 #>>>> # This is not working, I think the error is somewhere downstream
|
|
953 # of add_sub_SeqFeature, probably in RangeI::union ?
|
|
954 # The sub features are added fine, but the EXPANDed parent feature
|
|
955 # location has a messed up start - Bio::SeqFeature::Generic ref
|
|
956 # instead of an integer - and an incorrect end - the end of the first
|
|
957 # sub feature added, not of the union of all of them.
|
|
958
|
|
959 # Also, the SeqIO::genbank.pm output is odd - the sub features appear
|
|
960 # to be listed with the *previous* feature, not this one.
|
|
961
|
|
962 foreach my $location (@locations) {
|
|
963 my $subgsf = $self->_parse_bsml_location($location);
|
|
964 # print "start ", $subgsf->start,"\n";
|
|
965 # print "end ", $subgsf->end,"\n";
|
|
966 $basegsf->add_sub_SeqFeature($subgsf, 'EXPAND');
|
|
967 }
|
|
968 # print $feat->getAttribute('id'),"\n";
|
|
969 # print $basegsf->primary_tag,"\n";
|
|
970
|
|
971 } else {
|
|
972 # What to do if there are no locations? Nothing needed?
|
|
973 }
|
|
974
|
|
975 # Look at any <Attribute>s or <Qualifier>s that are present:
|
|
976 my $floppies = &GETFLOPPIES($feat);
|
|
977 foreach my $attr (@{$floppies}) {
|
|
978 my ($name, $content) = &FLOPPYVALS($attr);
|
|
979
|
|
980 if ($name =~ /xref/i) {
|
|
981 # Do we want to put these in DBLinks??
|
|
982 }
|
|
983
|
|
984 # Don't know what the object is, dump it to a tag:
|
|
985 $basegsf->add_tag_value(lc($name), $content);
|
|
986 }
|
|
987
|
|
988 # Mostly this helps with debugging, but may be of utility...
|
|
989 # Add a tag holding the BSML id value
|
|
990 if ( (my $val = $feat->getAttribute('id')) &&
|
|
991 !$basegsf->has_tag('bsml-id')) {
|
|
992 # Decided that this got a little sloppy...
|
|
993 # $basegsf->add_tag_value("bsml-id", $val);
|
|
994 }
|
|
995 return $basegsf;
|
|
996 }
|
|
997
|
|
998 =head2 _parse_bsml_location
|
|
999
|
|
1000 Title : _parse_bsml_location
|
|
1001 Usage : $obj->_parse_bsml_feature( $intOrSiteLoc, $gsfObject )
|
|
1002 Function: Will examine the <Interval-loc> or <Site-loc> element provided
|
|
1003 Returns : Bio::SeqFeature::Generic
|
|
1004 Args : 0 XML::DOM::Element <Interval/Site-loc> being analyzed.
|
|
1005 1 Optional SeqFeature::Generic to use
|
|
1006
|
|
1007 =cut
|
|
1008
|
|
1009 sub _parse_bsml_location {
|
|
1010 my $self = shift;
|
|
1011 my ($loc, $gsf) = @_;
|
|
1012
|
|
1013 $gsf ||= new Bio::SeqFeature::Generic;
|
|
1014 my $type = $loc->getNodeName;
|
|
1015 my ($start, $end);
|
|
1016 if ($type eq 'Interval-loc') {
|
|
1017 $start = $loc->getAttribute('startpos');
|
|
1018 $end = $loc->getAttribute('endpos');
|
|
1019 } elsif ($type eq 'Site-loc') {
|
|
1020 $start = $end = $loc->getAttribute('sitepos');
|
|
1021 } else {
|
|
1022 warn "Unknown location type '$type', could not make GSF\n";
|
|
1023 return undef;
|
|
1024 }
|
|
1025 $gsf->start($start);
|
|
1026 $gsf->end($end);
|
|
1027
|
|
1028 # BSML does not have an explicit method to set undefined strand
|
|
1029 if (my $s = $loc->getAttribute("complement")) {
|
|
1030 if ($s) {
|
|
1031 $gsf->strand(-1);
|
|
1032 } else {
|
|
1033 $gsf->strand(1);
|
|
1034 }
|
|
1035 } else {
|
|
1036 # We're setting "strand nonspecific" here - bad idea?
|
|
1037 # In most cases the user likely meant it to be on the + strand
|
|
1038 $gsf->strand(0);
|
|
1039 }
|
|
1040
|
|
1041 return $gsf;
|
|
1042 }
|
|
1043
|
|
1044 =head2 _parse_reference
|
|
1045
|
|
1046 Title : _parse_reference
|
|
1047 Usage : $obj->_parse_reference(@args )
|
|
1048 Function: Makes a new <Reference> object from a ::Reference, which is
|
|
1049 then stored in an array provide by -refs. It will be
|
|
1050 appended to the XML tree later.
|
|
1051 Returns :
|
|
1052 Args : Argument array. Recognized keys:
|
|
1053
|
|
1054 -xml The DOM::Document being modified
|
|
1055
|
|
1056 -refobj The Annotation::Reference Object
|
|
1057
|
|
1058 -refs An array reference to hold the new <Reference> DOM object
|
|
1059
|
|
1060 -id Optional. If the XML id for the 'calling' element is
|
|
1061 provided, it will be placed in any <Reference> refs
|
|
1062 attribute.
|
|
1063
|
|
1064 =cut
|
|
1065
|
|
1066 sub _parse_reference {
|
|
1067 my $self = shift;
|
|
1068 my $args = $self->_parseparams( @_);
|
|
1069 my ($xml, $ref, $refRef) = ($args->{XML}, $args->{REFOBJ}, $args->{REFS});
|
|
1070
|
|
1071 ###############
|
|
1072 # <Reference> #
|
|
1073 ###############
|
|
1074
|
|
1075 my $xmlRef = $xml->createElement("Reference");
|
|
1076 #>> This may not be the right way to make a BSML dbxref...
|
|
1077 if (my $link = $ref->medline) {
|
|
1078 $xmlRef->setAttribute('dbxref', $link);
|
|
1079 }
|
|
1080
|
|
1081 # Make attributes for some of the characteristics
|
|
1082 my %stuff = ( start => $ref->start,
|
|
1083 end => $ref->end,
|
|
1084 rp => $ref->rp,
|
|
1085 comment => $ref->comment,
|
|
1086 pubmed => $ref->pubmed,
|
|
1087 );
|
|
1088 foreach my $s (keys %stuff) {
|
|
1089 $self->_addel($xmlRef, "Attribute", {
|
|
1090 name => $s, content => $stuff{$s} }) if ($stuff{$s});
|
|
1091 }
|
|
1092 $xmlRef->setAttribute('refs', $args->{ID}) if ($args->{ID});
|
|
1093 # Add the basic information
|
|
1094 # Should probably check for content before creation...
|
|
1095 $self->_addel($xmlRef, "RefAuthors")->
|
|
1096 appendChild( $xml->createTextNode(&STRIP($ref->authors)) );
|
|
1097 $self->_addel($xmlRef, "RefTitle")->
|
|
1098 appendChild( $xml->createTextNode(&STRIP($ref->title)) );
|
|
1099 $self->_addel($xmlRef, "RefJournal")->
|
|
1100 appendChild( $xml->createTextNode(&STRIP($ref->location)) );
|
|
1101 # References will be added later in a <Feature-Table>
|
|
1102 push @{$refRef}, $xmlRef;
|
|
1103 }
|
|
1104
|
|
1105 =head2 _parse_annotation
|
|
1106
|
|
1107 Title : _parse_annotation
|
|
1108 Usage : $obj->_parse_annotation(@args )
|
|
1109 Function: Will examine any Annotations found in -obj. Data found in
|
|
1110 ::Comment and ::DBLink structures, as well as Annotation
|
|
1111 description fields are stored in -desc for later
|
|
1112 generation of <Attribute>s. <Reference> objects are generated
|
|
1113 from ::References, and are stored in -refs - these will
|
|
1114 be appended to the XML tree later.
|
|
1115 Returns :
|
|
1116 Args : Argument array. Recognized keys:
|
|
1117
|
|
1118 -xml The DOM::Document being modified
|
|
1119
|
|
1120 -obj Reference to the Bio object being analyzed
|
|
1121
|
|
1122 -descr An array reference for holding description text items
|
|
1123
|
|
1124 -refs An array reference to hold <Reference> DOM objects
|
|
1125
|
|
1126 -id Optional. If the XML id for the 'calling' element is
|
|
1127 provided, it will be placed in any <Reference> refs
|
|
1128 attribute.
|
|
1129
|
|
1130 =cut
|
|
1131
|
|
1132 sub _parse_annotation {
|
|
1133 my $self = shift;
|
|
1134 my $args = $self->_parseparams( @_);
|
|
1135 my ($xml, $obj, $descRef, $refRef) =
|
|
1136 ( $args->{XML}, $args->{OBJ}, $args->{DESC}, $args->{REFS} );
|
|
1137 # No good place to put any of this (except for references). Most stuff
|
|
1138 # just gets dumped to <Attribute>s
|
|
1139 my $ann = $obj->annotation;
|
|
1140 return undef unless ($ann);
|
|
1141 # use BMS::Branch; my $debug = BMS::Branch->new( ); warn "$obj :"; $debug->branch($ann);
|
|
1142 unless (ref($ann) =~ /Collection/) {
|
|
1143 # Old style annotation. It seems that Features still use this
|
|
1144 # form of object
|
|
1145 $self->_parse_annotation_old(@_);
|
|
1146 return;
|
|
1147 }
|
|
1148
|
|
1149 foreach my $key ($ann->get_all_annotation_keys()) {
|
|
1150 foreach my $thing ($ann->get_Annotations($key)) {
|
|
1151 if ($key eq 'description') {
|
|
1152 push @{$descRef}, ["description" , $thing->value];
|
|
1153 } elsif ($key eq 'comment') {
|
|
1154 push @{$descRef}, ["comment" , $thing->text];
|
|
1155 } elsif ($key eq 'dblink') {
|
|
1156 # DBLinks get dumped to attributes, too
|
|
1157 push @{$descRef}, ["db_xref" , $thing->database . ":"
|
|
1158 . $thing->primary_id ];
|
|
1159 if (my $com = $thing->comment) {
|
|
1160 push @{$descRef}, ["link" , $com->text ];
|
|
1161 }
|
|
1162
|
|
1163 } elsif ($key eq 'reference') {
|
|
1164 $self->_parse_reference( @_, -refobj => $thing );
|
|
1165 } elsif (ref($thing) =~ /SimpleValue/) {
|
|
1166 push @{$descRef}, [$key , $thing->value];
|
|
1167 } else {
|
|
1168 # What is this??
|
|
1169 push @{$descRef}, ["error", "bsml.pm did not understand ".
|
|
1170 "'$key' = '$thing'" ];
|
|
1171 }
|
|
1172 }
|
|
1173 }
|
|
1174 }
|
|
1175
|
|
1176 =head2 _parse_annotation_old
|
|
1177
|
|
1178 Title : _parse_annotation_old
|
|
1179 Usage : $obj->_parse_annotation_old(@args)
|
|
1180 Function: As above, but for the old Annotation system.
|
|
1181 Apparently needed because Features are still using the old-style
|
|
1182 annotations?
|
|
1183 Returns :
|
|
1184 Args : Argument array. Recognized keys:
|
|
1185
|
|
1186 -xml The DOM::Document being modified
|
|
1187
|
|
1188 -obj Reference to the Bio object being analyzed
|
|
1189
|
|
1190 -descr An array reference for holding description text items
|
|
1191
|
|
1192 -refs An array reference to hold <Reference> DOM objects
|
|
1193
|
|
1194 -id Optional. If the XML id for the 'calling' element is
|
|
1195 provided, it will be placed in any <Reference> refs
|
|
1196 attribute.
|
|
1197
|
|
1198 =cut
|
|
1199
|
|
1200 ###############
|
|
1201 # <Reference> #
|
|
1202 ###############
|
|
1203
|
|
1204 sub _parse_annotation_old {
|
|
1205 my $self = shift;
|
|
1206 my $args = $self->_parseparams( @_);
|
|
1207 my ($xml, $obj, $descRef, $refRef) =
|
|
1208 ( $args->{XML}, $args->{OBJ}, $args->{DESC}, $args->{REFS} );
|
|
1209 # No good place to put any of this (except for references). Most stuff
|
|
1210 # just gets dumped to <Attribute>s
|
|
1211 if (my $ann = $obj->annotation) {
|
|
1212 push @{$descRef}, ["annotation", $ann->description];
|
|
1213 foreach my $com ($ann->each_Comment) {
|
|
1214 push @{$descRef}, ["comment" , $com->text];
|
|
1215 }
|
|
1216
|
|
1217 # Gene names just get dumped to <Attribute name="gene">
|
|
1218 foreach my $gene ($ann->each_gene_name) {
|
|
1219 push @{$descRef}, ["gene" , $gene];
|
|
1220 }
|
|
1221
|
|
1222 # DBLinks get dumped to attributes, too
|
|
1223 foreach my $link ($ann->each_DBLink) {
|
|
1224 push @{$descRef}, ["db_xref" ,
|
|
1225 $link->database . ":" . $link->primary_id ];
|
|
1226 if (my $com = $link->comment) {
|
|
1227 push @{$descRef}, ["link" , $com->text ];
|
|
1228 }
|
|
1229 }
|
|
1230
|
|
1231 # References get produced and temporarily held
|
|
1232 foreach my $ref ($ann->each_Reference) {
|
|
1233 $self->_parse_reference( @_, -refobj => $ref );
|
|
1234 }
|
|
1235 }
|
|
1236 }
|
|
1237
|
|
1238 =head2 _add_page
|
|
1239
|
|
1240 Title : _add_page
|
|
1241 Usage : $obj->_add_page($xmlDocument, $xmlSequenceObject)
|
|
1242 Function: Adds a simple <Page> and <View> structure for a <Sequence>
|
|
1243 Returns : a reference to the newly created <Page>
|
|
1244 Args : 0 The DOM::Document being modified
|
|
1245 1 Reference to the <Sequence> object
|
|
1246
|
|
1247 =cut
|
|
1248
|
|
1249 sub _add_page {
|
|
1250 my $self = shift;
|
|
1251 my ($xml, $seq) = @_;
|
|
1252 my $disp = $xml->getElementsByTagName("Display")->item(0);
|
|
1253 my $page = $self->_addel($disp, "Page");
|
|
1254 my ($width, $height) = ( 7.8, 5.5);
|
|
1255 my $screen = $self->_addel($page, "Screen", {
|
|
1256 width => $width, height => $height, });
|
|
1257 # $screen->appendChild($xml->createComment("Must close explicitly"));
|
|
1258 my $view = $self->_addel($page, "View", {
|
|
1259 seqref => $seq->getAttribute('id'),
|
|
1260 title => $seq->getAttribute('title'),
|
|
1261 title1 => "{NAME}",
|
|
1262 title2 => "{LENGTH} {UNIT}",
|
|
1263 });
|
|
1264 $self->_addel($view, "View-line-widget", {
|
|
1265 shape => 'horizontal',
|
|
1266 hcenter => $width/2 + 0.7,
|
|
1267 'linear-length' => $width - 2,
|
|
1268 });
|
|
1269 $self->_addel($view, "View-axis-widget");
|
|
1270 return $page;
|
|
1271 }
|
|
1272
|
|
1273
|
|
1274 =head2 _addel
|
|
1275
|
|
1276 Title : _addel
|
|
1277 Usage : $obj->_addel($parentElem, 'ChildName',
|
|
1278 { anAttr => 'someValue', anotherAttr => 'aValue',})
|
|
1279 Function: Add an element with attribute values to a DOM tree
|
|
1280 Returns : a reference to the newly added element
|
|
1281 Args : 0 The DOM::Element parent that you want to add to
|
|
1282 1 The name of the new child element
|
|
1283 2 Optional hash reference containing
|
|
1284 attribute name => attribute value assignments
|
|
1285
|
|
1286 =cut
|
|
1287
|
|
1288 sub _addel {
|
|
1289 my $self = shift;
|
|
1290 my ($root, $name, $attr) = @_;
|
|
1291
|
|
1292 # Find the DOM::Document for the parent
|
|
1293 my $doc = $root->getOwnerDocument || $root;
|
|
1294 my $elem = $doc->createElement($name);
|
|
1295 foreach my $a (keys %{$attr}) {
|
|
1296 $elem->setAttribute($a, $attr->{$a});
|
|
1297 }
|
|
1298 $root->appendChild($elem);
|
|
1299 return $elem;
|
|
1300 }
|
|
1301
|
|
1302 =head2 _show_dna
|
|
1303
|
|
1304 Title : _show_dna
|
|
1305 Usage : $obj->_show_dna($newval)
|
|
1306 Function: (cut-and-pasted directly from embl.pm)
|
|
1307 Returns : value of _show_dna
|
|
1308 Args : newvalue (optional)
|
|
1309
|
|
1310 =cut
|
|
1311
|
|
1312 sub _show_dna {
|
|
1313 my $obj = shift;
|
|
1314 if( @_ ) {
|
|
1315 my $value = shift;
|
|
1316 $obj->{'_show_dna'} = $value;
|
|
1317 }
|
|
1318 return $obj->{'_show_dna'};
|
|
1319 }
|
|
1320
|
|
1321 =head2 _initialize
|
|
1322
|
|
1323 Title : _initialize
|
|
1324 Usage : $dom = $obj->_initialize(@args)
|
|
1325 Function: Coppied from embl.pm, and augmented with initialization of the
|
|
1326 XML DOM tree
|
|
1327 Returns :
|
|
1328 Args : -file => the XML file to be parsed
|
|
1329
|
|
1330 =cut
|
|
1331
|
|
1332 sub _initialize {
|
|
1333 my($self,@args) = @_;
|
|
1334
|
|
1335 $self->SUPER::_initialize(@args);
|
|
1336 # hash for functions for decoding keys.
|
|
1337 $self->{'_func_ftunit_hash'} = {};
|
|
1338 $self->_show_dna(1); # sets this to one by default. People can change it
|
|
1339
|
|
1340 my %param = @args; # From SeqIO.pm
|
|
1341 @param{ map { lc $_ } keys %param } = values %param; # lowercase keys
|
|
1342 if ( exists $param{-file} && $param{-file} !~ /^>/) {
|
|
1343 # Is it blasphemy to add your own keys to an object in another package?
|
|
1344 # domtree => the parsed DOM tree retruned by XML::DOM
|
|
1345 $self->{'domtree'} = $self->_parse_xml( $param{-file} );
|
|
1346 # current_node => the <Sequence> node next in line for next_seq
|
|
1347 $self->{'current_node'} = 0;
|
|
1348 }
|
|
1349
|
|
1350 $self->sequence_factory( new Bio::Seq::SeqFactory
|
|
1351 ( -verbose => $self->verbose(),
|
|
1352 -type => 'Bio::Seq::RichSeq'))
|
|
1353 if( ! defined $self->sequence_factory );
|
|
1354 }
|
|
1355
|
|
1356
|
|
1357 =head2 _parseparams
|
|
1358
|
|
1359 Title : _parseparams
|
|
1360 Usage : my $paramHash = $obj->_parseparams(@args)
|
|
1361 Function: Borrowed from Bio::Parse.pm, who borrowed it from CGI.pm
|
|
1362 Lincoln Stein -> Richard Resnick -> here
|
|
1363 Returns : A hash reference of the parameter keys (uppercase) pointing to
|
|
1364 their values.
|
|
1365 Args : An array of key, value pairs. Easiest to pass values as:
|
|
1366 -key1 => value1, -key2 => value2, etc
|
|
1367 Leading "-" are removed.
|
|
1368
|
|
1369 =cut
|
|
1370
|
|
1371 sub _parseparams {
|
|
1372 my $self = shift;
|
|
1373 my %hash = ();
|
|
1374 my @param = @_;
|
|
1375
|
|
1376 # Hacked out from Parse.pm
|
|
1377 # The next few lines strip out the '-' characters which
|
|
1378 # preceed the keys, and capitalizes them.
|
|
1379 for (my $i=0;$i<@param;$i+=2) {
|
|
1380 $param[$i]=~s/^\-//;
|
|
1381 $param[$i]=~tr/a-z/A-Z/;
|
|
1382 }
|
|
1383 pop @param if @param %2; # not an even multiple
|
|
1384 %hash = @param;
|
|
1385 return \%hash;
|
|
1386 }
|
|
1387
|
|
1388 =head2 _parse_xml
|
|
1389
|
|
1390 Title : _parse_xml
|
|
1391 Usage : $dom = $obj->_parse_xml($filename)
|
|
1392 Function: uses XML::DOM to construct a DOM tree from the BSML document
|
|
1393 Returns : a reference to the parsed DOM tree
|
|
1394 Args : 0 Path to the XML file needing to be parsed
|
|
1395
|
|
1396 =cut
|
|
1397
|
|
1398 sub _parse_xml {
|
|
1399 my $self = shift;
|
|
1400 my $file = shift;
|
|
1401
|
|
1402 unless (-e $file) {
|
|
1403 $self->throw("Could not parse non-existant XML file '$file'.");
|
|
1404 return undef;
|
|
1405 }
|
|
1406 my $parser = new XML::DOM::Parser;
|
|
1407 my $doc = $parser->parsefile ($file);
|
|
1408 return $doc;
|
|
1409 }
|
|
1410
|
|
1411 sub DESTROY {
|
|
1412 my $self = shift;
|
|
1413 # Reports off the net imply that DOM::Parser will memory leak if you
|
|
1414 # do not explicitly dispose of it:
|
|
1415 # http://aspn.activestate.com/ASPN/Mail/Message/perl-xml/788458
|
|
1416 my $dom = $self->{'domtree'};
|
|
1417 # For some reason the domtree can get undef-ed somewhere...
|
|
1418 $dom->dispose if ($dom);
|
|
1419 }
|
|
1420
|
|
1421
|
|
1422 =head1 TESTING SCRIPT
|
|
1423
|
|
1424 The following script may be used to test the conversion process. You
|
|
1425 will need a file of the format you wish to test. The script will
|
|
1426 convert the file to BSML, store it in /tmp/bsmltemp, read that file
|
|
1427 into a new SeqIO stream, and write it back as the original
|
|
1428 format. Comparison of this second file to the original input file
|
|
1429 will allow you to track where data may be lost or corrupted. Note
|
|
1430 that you will need to specify $readfile and $readformat.
|
|
1431
|
|
1432 use Bio::SeqIO;
|
|
1433 # Tests preservation of details during round-trip conversion:
|
|
1434 # $readformat -> BSML -> $readformat
|
|
1435 my $tempspot = "/tmp/bsmltemp"; # temp folder to hold generated files
|
|
1436 my $readfile = "rps4y.embl"; # The name of the file you want to test
|
|
1437 my $readformat = "embl"; # The format of the file being tested
|
|
1438
|
|
1439 system "mkdir $tempspot" unless (-d $tempspot);
|
|
1440 # Make Seq object from the $readfile
|
|
1441 my $biostream = Bio::SeqIO->new( -file => "$readfile" );
|
|
1442 my $seq = $biostream->next_seq();
|
|
1443
|
|
1444 # Write BSML from SeqObject
|
|
1445 my $bsmlout = Bio::SeqIO->new( -format => 'bsml',
|
|
1446 -file => ">$tempspot/out.bsml");
|
|
1447 warn "\nBSML written to $tempspot/out.bsml\n";
|
|
1448 $bsmlout->write_seq($seq);
|
|
1449 # Need to kill object for following code to work... Why is this so?
|
|
1450 $bsmlout = "";
|
|
1451
|
|
1452 # Make Seq object from BSML
|
|
1453 my $bsmlin = Bio::SeqIO->new( -file => "$tempspot/out.bsml",
|
|
1454 -format => 'bsml');
|
|
1455 my $seq2 = $bsmlin->next_seq();
|
|
1456
|
|
1457 # Write format back from Seq Object
|
|
1458 my $genout = Bio::SeqIO->new( -format => $readformat,
|
|
1459 -file => ">$tempspot/out.$readformat");
|
|
1460 $genout->write_seq($seq2);
|
|
1461 warn "$readformat written to $tempspot/out.$readformat\n";
|
|
1462
|
|
1463 # BEING LOST:
|
|
1464 # Join information (not possible in BSML 2.2)
|
|
1465 # Sequence type (??)
|
|
1466
|
|
1467 =cut
|
|
1468
|
|
1469
|
|
1470 1;
|