0
|
1 #
|
|
2 # EnsEMBL module for Bio::EnsEMBL::Funcgen::Parsers::GFF
|
|
3 #
|
|
4
|
|
5 =head1 LICENSE
|
|
6
|
|
7 Copyright (c) 1999-2011 The European Bioinformatics Institute and
|
|
8 Genome Research Limited. All rights reserved.
|
|
9
|
|
10 This software is distributed under a modified Apache license.
|
|
11 For license details, please see
|
|
12
|
|
13 http://www.ensembl.org/info/about/code_licence.html
|
|
14
|
|
15 =head1 CONTACT
|
|
16
|
|
17 Please email comments or questions to the public Ensembl
|
|
18 developers list at <ensembl-dev@ebi.ac.uk>.
|
|
19
|
|
20 Questions may also be sent to the Ensembl help desk at
|
|
21 <helpdesk@ensembl.org>.
|
|
22
|
|
23 #Could this be based on a Generic Flat file parser?
|
|
24
|
|
25 =head1 NAME
|
|
26
|
|
27 Bio::EnsEMBL::Funcgen::Parsers::GFF
|
|
28
|
|
29 =head1 SYNOPSIS
|
|
30
|
|
31 my $parser_type = "Bio::EnsEMBL::Funcgen::Parsers::GFF";
|
|
32 push @INC, $parser_type;
|
|
33 my $imp = $class->SUPER::new(@_);
|
|
34
|
|
35
|
|
36 =head1 DESCRIPTION
|
|
37
|
|
38 This is a definitions class which should not be instantiated directly; it is
|
|
39 normally set by the Importer as the parent class. GFF contains meta
|
|
40 data and methods specific to data in GFF format, to aid
|
|
41 parsing and importing of experimental data.
|
|
42
|
|
43 =cut
|
|
44
|
|
45 package Bio::EnsEMBL::Funcgen::Parsers::GFF;
|
|
46
|
|
47 use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate );
|
|
48 use Bio::EnsEMBL::Utils::Argument qw( rearrange );
|
|
49 use strict;
|
|
50
|
|
51
|
|
52 use vars qw(@ISA);
|
|
53 @ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::ExperimentalSet);
|
|
54
|
|
55 =head2 new
|
|
56
|
|
57 Example : my $self = $class->SUPER::new(@_);
|
|
58 Description: Constructor method for GFF class
|
|
59 Returntype : Bio::EnsEMBL::Funcgen::Parsers::GFF
|
|
60 Exceptions : None
|
|
61 Caller : Bio::EnsEMBL::Funcgen::Importer
|
|
62 Status : at risk
|
|
63
|
|
64 =cut
|
|
65
|
|
66
|
|
sub new {
  my $caller = shift;
  my $class  = ref($caller) || $caller;

  # Design notes carried over from the original implementation:
  #  - GFF column layout is:
  #    <seqname> <source> <feature> <start> <end> <score> <strand> <frame> [attributes] [comments]
  #  - A per-column field => handler mapping (fetch_slice, get_source,
  #    get_feature_type, -start, -end, -strand, frame, get_attributes) was
  #    considered and could be passed to the parent as -fields, but is not
  #    used at present.
  #  - Strand will most likely need converting from -/./+ to -1/0/1.
  #  - Display label generation from several fields/attributes
  #    (e.g. -display_label_format => ID+ACC) is deferred; a single field
  #    name is used for now.
  #  - Still to do: meta header handling (lines starting with '##') and
  #    skipping of '#' comments at the beginning or end of a line.
  #  - Possible future config, currently unset:
  #      $self->{'file_ext'}                => 'gff'
  #      $self->{'config'}{'results_data'}  => ["and_import_gff"]

  # All constructor arguments are forwarded untouched to the parent class.
  my $self = $class->SUPER::new(@_);

  # Pull out the optional -display_label_field argument. rearrange gives
  # undef when the argument was not supplied, so the default has to be
  # applied afterwards; assigning it beforehand would simply be overwritten.
  my ($display_label_field) = rearrange(['DISPLAY_LABEL_FIELD'], @_);
  $display_label_field = 'ID' if ! defined $display_label_field;

  # NOTE(review): display_label_field is not defined in this file;
  # presumably it is an accessor inherited from the parent class - confirm.
  $self->display_label_field($display_label_field);

  return $self;
}
|
|
144
|
|
145
|
|
146 =head2 set_config
|
|
147
|
|
148 Example : $self->set_config;
|
|
149 Description: Sets attribute dependent config
|
|
150 Returntype : None
|
|
151 Exceptions : None
|
|
152 Caller : Bio::EnsEMBL::Funcgen::Importer
|
|
153 Status : at risk
|
|
154
|
|
155 =cut
|
|
156
|
|
157
|
|
sub set_config {
  my ($self) = @_;

  # Delegate to the parent class implementation of set_config.
  $self->SUPER::set_config();

  # GFF specific config would go here; there is none at present.

  return;
}
|
|
167
|
|
168 #Need to implement this!
|
|
sub parse_line {
  my ($self, $line) = @_;

  # TODO: Not implemented yet. Outstanding items noted by the original author:
  #  - skip lines which should not be parsed (the match condition was never
  #    written);
  #  - map the GFF columns to handlers, e.g.
  #      0 => 'fetch_slice',      1 => 'get_source',
  #      2 => 'get_feature_type', 3 => '-start',
  #      4 => '-end',             5 => '-strand' (convert -/./+ to -1/0/1),
  #      6 => 'frame',            7 => 'get_attributes';
  #  - return feature_params and seq if defined?

  # Split the tab-delimited line into its first five columns.
  # NOTE(review): there is no explicit return, so callers receive the value
  # of this final list assignment (the five fields in list context, the
  # number of split elements in scalar context). The variable names do not
  # match the GFF column order listed above - confirm before implementing.
  my ($chr, $start, $end, $pid, $score) = split /\t/, $line;
}
|
|
192
|
|
193
|
|
194
|
|
195 1;
|