0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2011 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <ensembl-dev@ebi.ac.uk>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21 package Bio::EnsEMBL::Funcgen::Parsers::redfly;
|
|
22
|
|
23 use strict;
|
|
24
|
|
25 use File::Basename;
|
|
26
|
|
27 # To get files for REDfly, download the following 2 GFF3 files (e.g. via wget):
|
|
28 #
|
|
29 # http://redfly.ccr.buffalo.edu/datadumps/tbfs_dump.gff
|
|
30 # http://redfly.ccr.buffalo.edu/datadumps/crm_dump.gff
|
|
31
|
|
32 #contact?
|
|
33
|
|
34 #TFBS
|
|
35 #2L REDfly regulatory_region 2456365 2456372 . . . ID="Unspecified_dpp:REDFLY:TF000068"; Dbxref="Flybase:FBgn0000490", "PMID:8543160", "REDfly:644, "FlyBase:"; Evidence="footprint/binding assay"; Factor="Unspecified"; Target="dpp";
|
|
36 #2L REDfly regulatory_region 2456352 2456369 . . . ID="dl_dpp:REDFLY:TF000069"; Dbxref="Flybase:FBgn0000490", "PMID:8458580", "REDfly:645, "FlyBase:FBgn0000463"; Evidence="footprint/binding assay"; Factor="dl"; Target="dpp";
|
|
37
|
|
38 #CRMs
|
|
39 #2L REDfly regulatory_region 2455781 2457764 . . . ID="dpp_intron2"; Dbxref="Flybase:FBgn0000490", "PMID:8167377", "REDfly:247; Evidence="reporter construct (in vivo)"; Ontology_term="FBbt:00005304";
|
|
40 #2L REDfly regulatory_region 2445769 2446581 . . . ID="dpp_dpp813"; Dbxref="Flybase:FBgn0000490", "PMID:7821226", "REDfly:246; Evidence="reporter construct (in vivo)"; Ontology_term="FBbt:00005653","FBbt:00001051";
|
|
41
|
|
42
|
|
43
|
|
44 use Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser;
|
|
45 use Bio::EnsEMBL::DBEntry;
|
|
46 use Bio::EnsEMBL::Funcgen::ExternalFeature;
|
|
47 use Bio::EnsEMBL::Utils::Exception qw( throw );
|
|
48 use Bio::EnsEMBL::Utils::Argument qw( rearrange );
|
|
49
|
|
50 use vars qw(@ISA);
|
|
51 @ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser);
|
|
52
|
|
53
|
|
54
|
|
55
|
|
56
|
|
57 sub new {
|
|
58 my $caller = shift;
|
|
59 my $class = ref($caller) || $caller;
|
|
60
|
|
61 my $self = $class->SUPER::new(@_, type => 'REDfly');
|
|
62
|
|
63 #Set default feature_type and feature_set config
|
|
64
|
|
65 #We need to capture version/release/data of external feature sets.
|
|
66 #This can be nested in the description? Need to add description to feature_set?
|
|
67
|
|
68 $self->{static_config}{feature_types} =
|
|
69 {
|
|
70 'REDfly TFBS' => {
|
|
71 -name => 'REDfly TFBS',
|
|
72 -class => 'Transcription Factor',
|
|
73 -description => 'REDfly transciption factor binding site',
|
|
74 },
|
|
75 'REDfly CRM' => {
|
|
76 -name => 'REDfly CRM',
|
|
77 -class => 'Regulatory Motif',
|
|
78 -description => 'REDfly cis regulatory motif',
|
|
79 },
|
|
80 };
|
|
81
|
|
82
|
|
83 $self->{static_config}{analyses} =
|
|
84 {
|
|
85 'REDfly TFBS' => {
|
|
86 -logic_name => 'REDfly TFBS',
|
|
87 -description => 'REDfly transcription factor binding sites (http://redfly.ccr.buffalo.edu/)',
|
|
88 -display_label => 'REDfly TFBS',
|
|
89 -displayable => 1,
|
|
90 },
|
|
91
|
|
92 'REDfly CRM' => {
|
|
93 -logic_name => 'REDfly CRM',
|
|
94 -description => 'REDfly cis regulatory motif (http://redfly.ccr.buffalo.edu/)',
|
|
95 -display_label => 'REDfly CRM',
|
|
96 -displayable => 1,
|
|
97 },
|
|
98 };
|
|
99
|
|
100 $self->{static_config}{feature_sets} =
|
|
101 {
|
|
102 'REDfly TFBSs' => {
|
|
103 feature_set =>
|
|
104 {
|
|
105 -feature_type => 'REDfly TFBS',
|
|
106 -analysis => 'REDfly TFBS',
|
|
107 },
|
|
108 xrefs => 1,
|
|
109 },
|
|
110
|
|
111 'REDfly CRMs' => {
|
|
112 feature_set =>
|
|
113 {
|
|
114 -feature_type => 'REDfly CRM',
|
|
115 -analysis => 'REDfly CRM',
|
|
116 },
|
|
117
|
|
118 xrefs => 1,
|
|
119 },
|
|
120 };
|
|
121
|
|
122
|
|
123 #Move xref flag here?
|
|
124 $self->{config} = {
|
|
125 'REDfly CRMs' => {
|
|
126 #file => $ENV{'EFG_DATA'}.'/input/REDFLY/redfly_crm.gff',
|
|
127 gff_attrs => {
|
|
128 'ID' => 1,
|
|
129 },
|
|
130 },
|
|
131
|
|
132 'REDfly TFBSs' => {
|
|
133 #file => $ENV{'EFG_DATA'}.'/input/REDFLY/redfly_tfbs.gff',
|
|
134 gff_attrs => {
|
|
135 'ID' => 1,
|
|
136 'Factor' => 1,
|
|
137 'Target' => 1,
|
|
138 },
|
|
139 desc_suffix => ' binding site',
|
|
140 }
|
|
141 };
|
|
142
|
|
143
|
|
144 $self->validate_and_store_config([keys %{$self->{static_config}{feature_sets}}]);
|
|
145 $self->set_feature_sets;
|
|
146
|
|
147 return $self;
|
|
148 }
|
|
149
|
|
150
|
|
151
|
|
152
|
|
153
|
|
154 # Parse file and return hashref containing:
|
|
155 #
|
|
156 # - arrayref of features
|
|
157 # - arrayref of factors
|
|
158
|
|
159
|
|
160
|
|
161
|
|
162 sub parse_and_load {
|
|
163 my ($self, $files, $old_assembly, $new_assembly) = @_;
|
|
164
|
|
165 if(scalar(@$files) != 2){
|
|
166 throw('You must currently define a crm and tfbs file to load redfly features from:\t'.join(' ', @$files));
|
|
167 }
|
|
168
|
|
169 #More validation of files here?
|
|
170 $self->{config}{'REDfly CRMs'}{file} = grep(/crm/, @$files);
|
|
171 $self->{config}{'REDfly TFBSs'}{file} = grep(/tfbs/, @$files);
|
|
172
|
|
173 my %slice_cache;
|
|
174 my $extf_adaptor = $self->db->get_ExternalFeatureAdaptor;
|
|
175 my $dbentry_adaptor = $self->db->get_DBEntryAdaptor;
|
|
176 my $ftype_adaptor = $self->db->get_FeatureTypeAdaptor;
|
|
177 # this object is only used for projection
|
|
178 my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'REDflyProjection');#do we need this?
|
|
179 my $species = $self->db->species;
|
|
180
|
|
181 if(! $species){
|
|
182 throw('Must define a species to define the external_db');
|
|
183 }
|
|
184
|
|
185 #Just to make sure we hav homo_sapiens and not Homo Sapiens
|
|
186 ($species = lc($species)) =~ s/ /_/;
|
|
187
|
|
188
|
|
189 foreach my $import_set(@{$self->import_sets}){
|
|
190 $self->log_header("Parsing $import_set data");
|
|
191
|
|
192 my %factor_cache; # name -> factor_id
|
|
193 my %target_cache;
|
|
194 my $config = $self->{'config'}{$import_set};
|
|
195 my $fset = $self->{static_config}{feature_sets}{$import_set}{feature_set};
|
|
196 my %gff_attrs = %{$config->{'gff_attrs'}};
|
|
197
|
|
198
|
|
199 # Parse motifs.txt file
|
|
200 my $file = $config->{'file'};
|
|
201 my $skipped = 0;
|
|
202 my $factor_cnt = 0;
|
|
203 my $factor_xref_cnt = 0;
|
|
204 my $feature_cnt = 0;
|
|
205 my $feature_target_cnt = 0;
|
|
206
|
|
207 open (FILE, "<$file") || die("Can't open $file\n$!\n");
|
|
208 <FILE>; # skip header
|
|
209
|
|
210 LINE: while (my $line = <FILE>) {
|
|
211 next if ($line =~ /^\s*\#/o || $line =~ /^\s*$/o);
|
|
212 chomp $line;
|
|
213 my %attr_cache;#Can we move this outside the loop and rely on it being reset each time?
|
|
214
|
|
215
|
|
216 #GFF3
|
|
217 #Is this format valid, missing " after REDfly xref
|
|
218 #2L REDfly regulatory_region 2456365 2456372 . . . ID="Unspecified_dpp:REDFLY:TF000068"; Dbxref="Flybase:FBgn0000490", "PMID:8543160", "REDfly:644, "FlyBase:"; Evidence="footprint/binding assay"; Factor="Unspecified"; Target="dpp";
|
|
219 #seq_name, source, feature, start, end, score, strand, frame, [attrs]
|
|
220 my ($chromosome, undef, $feature, $start, $end, undef, undef, undef, $attrs) = split /\t/o, $line;
|
|
221 my @attrs = split/\;\s+/o, $attrs;
|
|
222
|
|
223
|
|
224 #UCSC coords
|
|
225 $start ++;
|
|
226 $end ++;
|
|
227
|
|
228
|
|
229
|
|
230 foreach my $gff_attr(keys %gff_attrs){
|
|
231
|
|
232 if(($attr_cache{$gff_attr}) = grep {/^${gff_attr}\=/} @attrs){
|
|
233 $attr_cache{$gff_attr} =~ s/(^${gff_attr}\=\")(.*)(\")/$2/;
|
|
234
|
|
235 #warn "attr cache is $attr_cache{$gff_attr} ";
|
|
236
|
|
237 }
|
|
238 else{
|
|
239 warn "Skipping import, unable to find mandatory $gff_attr attribute in:\t$line";
|
|
240 next LINE;
|
|
241 }
|
|
242 }
|
|
243
|
|
244
|
|
245 #For TFBS
|
|
246 #Factor = coding gene name display_label
|
|
247 #Target = Target gene?
|
|
248 #Ignore other xrefs for name, just put ID in feature as display_label
|
|
249
|
|
250 #These are mixed up! and where not getting any coding xrefs!
|
|
251
|
|
252
|
|
253 #For CRM
|
|
254 #Can we split the ID and have Reguatory XREF?
|
|
255 #e.g. ID="dpp_dpp813"; => dpp
|
|
256
|
|
257
|
|
258
|
|
259
|
|
260 #This can be moved to the BaseExternalParser
|
|
261
|
|
262 if(! exists $slice_cache{$chromosome}){
|
|
263
|
|
264 if($old_assembly){
|
|
265 $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome',
|
|
266 $chromosome,
|
|
267 undef,
|
|
268 undef,
|
|
269 undef,
|
|
270 $old_assembly);
|
|
271 }else{
|
|
272 $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', $chromosome);
|
|
273 }
|
|
274 }
|
|
275
|
|
276 if(! defined $slice_cache{$chromosome}){
|
|
277 warn "Can't get slice $chromosome for motif $attr_cache{'ID'};\n";
|
|
278 $skipped++;
|
|
279 next;
|
|
280 }
|
|
281
|
|
282
|
|
283 #get feature_type first
|
|
284
|
|
285 #we are not maintaining this link in the DB!
|
|
286 #Do we need another xref for this or a different table?
|
|
287
|
|
288 my $feature_type;
|
|
289
|
|
290 #TFBSs
|
|
291 if(exists $attr_cache{'Factor'}){
|
|
292
|
|
293 if(! exists $factor_cache{$attr_cache{'Factor'}}){
|
|
294
|
|
295 $factor_cache{$attr_cache{'Factor'}} = $ftype_adaptor->fetch_by_name($attr_cache{'Factor'});
|
|
296
|
|
297 if(! defined $factor_cache{$attr_cache{'Factor'}}){
|
|
298
|
|
299 #Would need to add CODING DBEntry here!
|
|
300 #Will this work on a scalar ref to a hash?
|
|
301 my $desc = (exists $config->{'desc_suffix'}) ? $attr_cache{'Factor'}.$config->{'desc_suffix'} : undef;
|
|
302
|
|
303 ($factor_cache{$attr_cache{'Factor'}}) = @{$ftype_adaptor->store(Bio::EnsEMBL::Funcgen::FeatureType->new
|
|
304 (
|
|
305 -name => $attr_cache{'Factor'},
|
|
306 -class => $fset->feature_type->class,
|
|
307 -description => $desc,
|
|
308 ))};
|
|
309
|
|
310 $feature_type = $factor_cache{$attr_cache{'Factor'}};
|
|
311 $factor_cnt ++;
|
|
312 my $stable_id = $self->get_core_stable_id_by_display_name($self->db->dnadb, $attr_cache{'Factor'});
|
|
313
|
|
314 #Handle release/version in xref version as stable_id version?
|
|
315
|
|
316 if(! defined $stable_id){
|
|
317 warn "Could not generate CODING xref for feature_type:\t". $attr_cache{'Factor'}."\n";
|
|
318 }else{
|
|
319 #warn "got $stable_id for ".$attr_cache{'Factor'};
|
|
320 my $dbentry = Bio::EnsEMBL::DBEntry->new(
|
|
321 -dbname => $species.'_core_Gene',
|
|
322 #-release => $self->db->dnadb->dbc->dbname,
|
|
323 -status => 'KNOWNXREF',#This is for the external DB
|
|
324 #-display_label_linkable => 1,
|
|
325 -#db_display_name => $self->db->dnadb->dbc->dbname,
|
|
326 -db_display_name => 'EnsemblGene',
|
|
327 -type => 'MISC',#Is for the external_db
|
|
328 -primary_id => $stable_id,
|
|
329 -display_id => $attr_cache{'Factor'},
|
|
330 -info_type => 'MISC',
|
|
331 -into_text => 'GENE',
|
|
332 -linkage_annotation => 'REDfly Coding'
|
|
333 -analysis => $fset->analysis,
|
|
334
|
|
335 #-description => 'cisRED motif gene xref',#This is now generic and no longer resitricted to REDfly
|
|
336 #could have version here if we use the correct dnadb to build the cache
|
|
337 );
|
|
338
|
|
339 $dbentry_adaptor->store($dbentry, $factor_cache{$attr_cache{'Factor'}}->dbID, 'FeatureType', 1);#1 is ignore release flag
|
|
340 $factor_xref_cnt ++;
|
|
341 }
|
|
342 }
|
|
343 }
|
|
344 }
|
|
345 else{
|
|
346 #CRMs
|
|
347 $feature_type = $fset->feature_type;
|
|
348 }
|
|
349
|
|
350
|
|
351 #Now build actual feature
|
|
352 $feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new
|
|
353 (
|
|
354 -display_label => $attr_cache{'ID'},
|
|
355 -start => $start,
|
|
356 -end => $end,
|
|
357 -strand => 0,
|
|
358 -feature_type => $feature_type,
|
|
359 -feature_set => $fset,
|
|
360 -slice => $slice_cache{$chromosome},
|
|
361 );
|
|
362
|
|
363 # project if necessary
|
|
364 if ($new_assembly) {
|
|
365 $feature = $self->project_feature($feature, $new_assembly);
|
|
366
|
|
367 if(! defined $feature){
|
|
368 $skipped ++;
|
|
369 next;
|
|
370 }
|
|
371 }
|
|
372
|
|
373 ($feature) = @{$extf_adaptor->store($feature)};
|
|
374 $feature_cnt++;
|
|
375
|
|
376 my $target = (exists $attr_cache{'Target'}) ? $attr_cache{'Target'} : (split/_/, $attr_cache{'ID'})[0];
|
|
377 my $stable_id;
|
|
378
|
|
379 if($target ne 'Unspecified'){
|
|
380 $stable_id = $self->get_core_stable_id_by_display_name($self->db->dnadb, $target);
|
|
381 }
|
|
382
|
|
383
|
|
384 if(! defined $stable_id){
|
|
385 warn "Could not generate TARGET xref for feature:\t". $attr_cache{'ID'}."\n" if $target ne 'Unspecified';
|
|
386 }
|
|
387 else{
|
|
388 #Handle release/version in xref version as stable_id version?
|
|
389 my $dbentry = Bio::EnsEMBL::DBEntry->new(
|
|
390 -dbname => $species.'_core_Gene',
|
|
391 #-release => $self->db->dnadb->dbc->dbname,
|
|
392 -status => 'KNOWNXREF',
|
|
393 #-display_label_linkable => 1,
|
|
394 -#db_display_name => $self->db->dnadb->dbc->dbname,
|
|
395 -db_display_name => 'EnsemblGene',
|
|
396 -type => 'MISC',#
|
|
397 -primary_id => $stable_id,
|
|
398 -display_id => $target,
|
|
399 -info_type => 'MISC',
|
|
400 -info_text => 'GENE',
|
|
401 -linkage_annotation => $fset->feature_type->name.' Target',
|
|
402 -analysis => $fset->analysis,
|
|
403
|
|
404 #could have version here if we use the correct dnadb to build the cache
|
|
405 );
|
|
406
|
|
407 $dbentry_adaptor->store($dbentry, $feature->dbID, 'ExternalFeature', 1);#1 is ignore release flag
|
|
408
|
|
409 $feature_target_cnt ++;
|
|
410 }
|
|
411 }
|
|
412
|
|
413 close FILE;
|
|
414
|
|
415 $self->log("Loaded ".$fset->name);
|
|
416 $self->log("$factor_cnt feature types");
|
|
417 $self->log("$factor_xref_cnt feature type coding xrefs");
|
|
418 $self->log("$feature_cnt features");
|
|
419 $self->log("$feature_target_cnt feature target xrefs");
|
|
420 $self->log("Skipped $skipped features");
|
|
421
|
|
422 }
|
|
423
|
|
424 return;
|
|
425 }
|
|
426
|
|
427
|
|
428
|
|
429
|
|
430 1;
|