0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2011 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <ensembl-dev@ebi.ac.uk>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21 package Bio::EnsEMBL::Funcgen::Parsers::biotiffin;
|
|
22
|
|
23 use strict;
|
|
24
|
|
25 use File::Basename;
|
|
26
|
|
27 # To get files for bioTIFFIN, download the following GFF file (e.g. via wget):
|
|
28 #
|
|
29 # http://td-blade.gurdon.cam.ac.uk/tad26/fly-tiffinScan-tiffin12.dm3.gff.gz
|
|
30
|
|
31 # Thomas Down <thomas.down@gurdon.cam.ac.uk>
|
|
32
|
|
33 #
|
|
34 # 3R MotifScanner TIFDMEM0000001 936391 936401 0.0 + 0
|
|
35 # 3R MotifScanner TIFDMEM0000001 13455911 13455921 0.0 - 0
|
|
36 # 3R MotifScanner TIFDMEM0000001 17062830 17062840 0.0 + 0
|
|
37
|
|
38 use Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser;
|
|
39 use Bio::EnsEMBL::DBEntry;
|
|
40 use Bio::EnsEMBL::Funcgen::ExternalFeature;
|
|
41 use Bio::EnsEMBL::Utils::Exception qw( throw );
|
|
42 use Bio::EnsEMBL::Utils::Argument qw( rearrange );
|
|
43
|
|
44 use vars qw(@ISA);
|
|
45 @ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser);
|
|
46
|
|
47
|
|
# Constructor.
#
# Hands every caller-supplied argument to the parent constructor (with
# type => 'BioTiffin' appended), installs the BioTIFFIN defaults on the new
# object (feature type, analysis, feature set and input-file config), then
# calls validate_and_store_config and set_feature_sets.
#
# Returns the parser object.
sub new {
  my $caller = shift;
  my $class  = ref($caller) || $caller;
  my $self   = $class->SUPER::new(@_, type => 'BioTiffin');

  # One name is shared by the feature type, the analysis and the feature set.
  my $set_name = 'BioTIFFIN Motif';

  # Default feature_type and feature_set config.
  # Open question: the version/release/date of the external feature set
  # should be captured somewhere - nested in the description? Does
  # feature_set need a description of its own?
  my %feature_types = (
    $set_name => {
      name        => $set_name,
      class       => 'Regulatory Motif',
      description => 'BioTIFFIN motif',
    },
  );

  my %analyses = (
    $set_name => {
      -logic_name    => $set_name,
      -description   => 'BioTIFFIN regulatory motif database',
      -display_label => 'BioTIFFIN motifs',
      -displayable   => 1,
    },
  );

  my %feature_sets = (
    $set_name => {
      feature_set => {
        -feature_type => $set_name,
        -analysis     => $set_name,
      },
      xrefs => 0,
    },
  );

  $self->{static_config}{feature_types} = \%feature_types;
  $self->{static_config}{analyses}      = \%analyses;
  $self->{static_config}{feature_sets}  = \%feature_sets;

  # Open question: move the xrefs flag in here?
  my $default_file = $ENV{'EFG_DATA'}.'/input/BioTIFFIN/fly-tiffinScan-tiffin12.dm3.gff';

  $self->{config} = {
    $set_name => {
      file      => $default_file,
      gff_attrs => { 'ID' => 1 },
    },
  };

  my @set_names = keys %{ $self->{static_config}{feature_sets} };
  $self->validate_and_store_config(\@set_names);
  $self->set_feature_sets;

  return $self;
}
|
|
106
|
|
107
|
|
108
|
|
# Parse a BioTIFFIN GFF file and store its motif features in the database.
#
# Each data line is expected to hold tab-separated GFF columns:
#
#   seq_name, source, feature, start, end, score, strand, frame
#
# e.g.
#
#   3R  MotifScanner  TIFDMEM0000001  936391  936401  0.0  +  0
#
# For every distinct value in the feature column a FeatureType is fetched by
# name or, when none exists, created and stored with the class and
# description of the import FeatureSet's own feature type. Every usable line
# then becomes one ExternalFeature stored via the ExternalFeatureAdaptor.
#
# Arguments:
#   $files        - arrayref holding exactly one input file path
#   $old_assembly - optional; when true it is passed as the sixth argument
#                   of fetch_by_region (presumably the assembly version the
#                   file coordinates refer to - confirm against SliceAdaptor)
#   $new_assembly - optional; when true each feature is passed through
#                   project_feature, and features for which that returns
#                   undef are counted as skipped
#
# Returns nothing; the results are the stored records and the log output.
#
# Throws if $files does not hold exactly one path, if the DB has no species,
# if there is not exactly one import set, or if that import set has no
# feature set. Dies if the file cannot be opened.
sub parse_and_load {
  my ($self, $files, $old_assembly, $new_assembly) = @_;

  if (scalar(@$files) != 1) {
    # Double quotes so that \t really is a tab in the message.
    throw("You must provide a unique file path to load BioTIFFIN features from:\t".join(' ', @$files));
  }

  my %slice_cache;    # seq_region name -> Slice (undef when the fetch failed)
  my $extf_adaptor  = $self->db->get_ExternalFeatureAdaptor;
  my $ftype_adaptor = $self->db->get_FeatureTypeAdaptor;

  if (! $self->db->species) {
    throw('Must define a species to define the external_db');
  }

  if (scalar @{$self->import_sets} != 1) {
    throw('biotiffin parser currently only supports one import FeatureSet');
  }

  my ($import_set) = @{$self->import_sets};

  $self->log_header("Parsing $import_set data");

  my %motif_cache;    # motif name -> FeatureType
  my $fset = $self->{static_config}{feature_sets}{$import_set}{feature_set};

  if (! defined $fset) {
    throw("No feature set is configured for import set $import_set");
  }

  my $file        = $files->[0];
  my $skipped     = 0;
  my $motif_cnt   = 0;
  my $feature_cnt = 0;

  # Three-argument open with a lexical handle: the path is never parsed for
  # a mode and no global FILE handle is left behind.
  open(my $fh, '<', $file) || die("Can't open $file\n$!\n");

  LINE: while (my $line = <$fh>) {
    chomp $line;

    # Blank lines and '#' comment/pragma lines carry no feature.
    next LINE if $line =~ /^\s*(?:#|$)/;

    # seq_name, source, feature, start, end, score, strand, frame, [attrs]
    my ($chromosome, undef, $motif_name, $start, $end, undef, $strand) = split /\t/, $line;

    if (! defined $strand) {
      warn "Skipping malformed line $. of $file: $line\n";
      $skipped++;
      next LINE;
    }

    # Fetch each slice once; a failed fetch is cached as undef so it is
    # not retried for every line on the same seq_region.
    if (! exists $slice_cache{$chromosome}) {
      $slice_cache{$chromosome} = $old_assembly
        ? $self->slice_adaptor->fetch_by_region('chromosome', $chromosome, undef, undef, undef, $old_assembly)
        : $self->slice_adaptor->fetch_by_region('chromosome', $chromosome);
    }

    my $slice = $slice_cache{$chromosome};

    if (! defined $slice) {
      warn "Can't get slice $chromosome for motif $motif_name;\n";
      $skipped++;
      next LINE;
    }

    # Fetch or create the FeatureType named after this motif.
    if (! exists $motif_cache{$motif_name}) {
      my $ftype = $ftype_adaptor->fetch_by_name($motif_name);

      if (! defined $ftype) {
        ($ftype) = @{$ftype_adaptor->store(Bio::EnsEMBL::Funcgen::FeatureType->new
                                           (
                                            -name        => $motif_name,
                                            -class       => $fset->feature_type->class,
                                            -description => $fset->feature_type->description,
                                           ))};
        $motif_cnt++;
      }

      $motif_cache{$motif_name} = $ftype;
    }

    # Now build the actual feature.
    my $feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new
      (
       -display_label => $motif_name,
       -start         => $start,
       -end           => $end,
       -strand        => (($strand eq '+') ? 1 : -1),
       -feature_type  => $motif_cache{$motif_name},
       -feature_set   => $fset,
       -slice         => $slice,
      );

    # Project if necessary.
    if ($new_assembly) {
      $feature = $self->project_feature($feature, $new_assembly);

      if (! defined $feature) {
        $skipped++;
        next LINE;
      }
    }

    $extf_adaptor->store($feature);
    $feature_cnt++;
  }

  close $fh;

  $self->log("Loaded ".$fset->name);
  $self->log("$motif_cnt feature types");
  $self->log("$feature_cnt features");
  $self->log("Skipped $skipped features");

  return;
}
|
|
258
|
|
259 1;
|