comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/redfly.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =head1 LICENSE
2
3 Copyright (c) 1999-2011 The European Bioinformatics Institute and
4 Genome Research Limited. All rights reserved.
5
6 This software is distributed under a modified Apache license.
7 For license details, please see
8
9 http://www.ensembl.org/info/about/code_licence.html
10
11 =head1 CONTACT
12
13 Please email comments or questions to the public Ensembl
14 developers list at <ensembl-dev@ebi.ac.uk>.
15
16 Questions may also be sent to the Ensembl help desk at
17 <helpdesk@ensembl.org>.
18
19 =cut
20
21 package Bio::EnsEMBL::Funcgen::Parsers::redfly;
22
23 use strict;
24
25 use File::Basename;
26
27 # To get files for REDfly, download the following 2 GFF3 files (e.g. via wget):
28 #
29 # http://redfly.ccr.buffalo.edu/datadumps/tbfs_dump.gff
30 # http://redfly.ccr.buffalo.edu/datadumps/crm_dump.gff
31
32 #contact?
33
34 #TFBS
35 #2L REDfly regulatory_region 2456365 2456372 . . . ID="Unspecified_dpp:REDFLY:TF000068"; Dbxref="Flybase:FBgn0000490", "PMID:8543160", "REDfly:644, "FlyBase:"; Evidence="footprint/binding assay"; Factor="Unspecified"; Target="dpp";
36 #2L REDfly regulatory_region 2456352 2456369 . . . ID="dl_dpp:REDFLY:TF000069"; Dbxref="Flybase:FBgn0000490", "PMID:8458580", "REDfly:645, "FlyBase:FBgn0000463"; Evidence="footprint/binding assay"; Factor="dl"; Target="dpp";
37
38 #CRMs
39 #2L REDfly regulatory_region 2455781 2457764 . . . ID="dpp_intron2"; Dbxref="Flybase:FBgn0000490", "PMID:8167377", "REDfly:247; Evidence="reporter construct (in vivo)"; Ontology_term="FBbt:00005304";
40 #2L REDfly regulatory_region 2445769 2446581 . . . ID="dpp_dpp813"; Dbxref="Flybase:FBgn0000490", "PMID:7821226", "REDfly:246; Evidence="reporter construct (in vivo)"; Ontology_term="FBbt:00005653","FBbt:00001051";
41
42
43
44 use Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser;
45 use Bio::EnsEMBL::DBEntry;
46 use Bio::EnsEMBL::Funcgen::ExternalFeature;
47 use Bio::EnsEMBL::Utils::Exception qw( throw );
48 use Bio::EnsEMBL::Utils::Argument qw( rearrange );
49
50 use vars qw(@ISA);
51 @ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser);
52
53
54
55
56
57 sub new {
58 my $caller = shift;
59 my $class = ref($caller) || $caller;
60
61 my $self = $class->SUPER::new(@_, type => 'REDfly');
62
63 #Set default feature_type and feature_set config
64
65 #We need to capture version/release/data of external feature sets.
66 #This can be nested in the description? Need to add description to feature_set?
67
68 $self->{static_config}{feature_types} =
69 {
70 'REDfly TFBS' => {
71 -name => 'REDfly TFBS',
72 -class => 'Transcription Factor',
73 -description => 'REDfly transciption factor binding site',
74 },
75 'REDfly CRM' => {
76 -name => 'REDfly CRM',
77 -class => 'Regulatory Motif',
78 -description => 'REDfly cis regulatory motif',
79 },
80 };
81
82
83 $self->{static_config}{analyses} =
84 {
85 'REDfly TFBS' => {
86 -logic_name => 'REDfly TFBS',
87 -description => 'REDfly transcription factor binding sites (http://redfly.ccr.buffalo.edu/)',
88 -display_label => 'REDfly TFBS',
89 -displayable => 1,
90 },
91
92 'REDfly CRM' => {
93 -logic_name => 'REDfly CRM',
94 -description => 'REDfly cis regulatory motif (http://redfly.ccr.buffalo.edu/)',
95 -display_label => 'REDfly CRM',
96 -displayable => 1,
97 },
98 };
99
100 $self->{static_config}{feature_sets} =
101 {
102 'REDfly TFBSs' => {
103 feature_set =>
104 {
105 -feature_type => 'REDfly TFBS',
106 -analysis => 'REDfly TFBS',
107 },
108 xrefs => 1,
109 },
110
111 'REDfly CRMs' => {
112 feature_set =>
113 {
114 -feature_type => 'REDfly CRM',
115 -analysis => 'REDfly CRM',
116 },
117
118 xrefs => 1,
119 },
120 };
121
122
123 #Move xref flag here?
124 $self->{config} = {
125 'REDfly CRMs' => {
126 #file => $ENV{'EFG_DATA'}.'/input/REDFLY/redfly_crm.gff',
127 gff_attrs => {
128 'ID' => 1,
129 },
130 },
131
132 'REDfly TFBSs' => {
133 #file => $ENV{'EFG_DATA'}.'/input/REDFLY/redfly_tfbs.gff',
134 gff_attrs => {
135 'ID' => 1,
136 'Factor' => 1,
137 'Target' => 1,
138 },
139 desc_suffix => ' binding site',
140 }
141 };
142
143
144 $self->validate_and_store_config([keys %{$self->{static_config}{feature_sets}}]);
145 $self->set_feature_sets;
146
147 return $self;
148 }
149
150
151
152
153
154 # Parse file and return hashref containing:
155 #
156 # - arrayref of features
157 # - arrayref of factors
158
159
160
161
162 sub parse_and_load {
163 my ($self, $files, $old_assembly, $new_assembly) = @_;
164
165 if(scalar(@$files) != 2){
166 throw('You must currently define a crm and tfbs file to load redfly features from:\t'.join(' ', @$files));
167 }
168
169 #More validation of files here?
170 $self->{config}{'REDfly CRMs'}{file} = grep(/crm/, @$files);
171 $self->{config}{'REDfly TFBSs'}{file} = grep(/tfbs/, @$files);
172
173 my %slice_cache;
174 my $extf_adaptor = $self->db->get_ExternalFeatureAdaptor;
175 my $dbentry_adaptor = $self->db->get_DBEntryAdaptor;
176 my $ftype_adaptor = $self->db->get_FeatureTypeAdaptor;
177 # this object is only used for projection
178 my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'REDflyProjection');#do we need this?
179 my $species = $self->db->species;
180
181 if(! $species){
182 throw('Must define a species to define the external_db');
183 }
184
185 #Just to make sure we hav homo_sapiens and not Homo Sapiens
186 ($species = lc($species)) =~ s/ /_/;
187
188
189 foreach my $import_set(@{$self->import_sets}){
190 $self->log_header("Parsing $import_set data");
191
192 my %factor_cache; # name -> factor_id
193 my %target_cache;
194 my $config = $self->{'config'}{$import_set};
195 my $fset = $self->{static_config}{feature_sets}{$import_set}{feature_set};
196 my %gff_attrs = %{$config->{'gff_attrs'}};
197
198
199 # Parse motifs.txt file
200 my $file = $config->{'file'};
201 my $skipped = 0;
202 my $factor_cnt = 0;
203 my $factor_xref_cnt = 0;
204 my $feature_cnt = 0;
205 my $feature_target_cnt = 0;
206
207 open (FILE, "<$file") || die("Can't open $file\n$!\n");
208 <FILE>; # skip header
209
210 LINE: while (my $line = <FILE>) {
211 next if ($line =~ /^\s*\#/o || $line =~ /^\s*$/o);
212 chomp $line;
213 my %attr_cache;#Can we move this outside the loop and rely on it being reset each time?
214
215
216 #GFF3
217 #Is this format valid, missing " after REDfly xref
218 #2L REDfly regulatory_region 2456365 2456372 . . . ID="Unspecified_dpp:REDFLY:TF000068"; Dbxref="Flybase:FBgn0000490", "PMID:8543160", "REDfly:644, "FlyBase:"; Evidence="footprint/binding assay"; Factor="Unspecified"; Target="dpp";
219 #seq_name, source, feature, start, end, score, strand, frame, [attrs]
220 my ($chromosome, undef, $feature, $start, $end, undef, undef, undef, $attrs) = split /\t/o, $line;
221 my @attrs = split/\;\s+/o, $attrs;
222
223
224 #UCSC coords
225 $start ++;
226 $end ++;
227
228
229
230 foreach my $gff_attr(keys %gff_attrs){
231
232 if(($attr_cache{$gff_attr}) = grep {/^${gff_attr}\=/} @attrs){
233 $attr_cache{$gff_attr} =~ s/(^${gff_attr}\=\")(.*)(\")/$2/;
234
235 #warn "attr cache is $attr_cache{$gff_attr} ";
236
237 }
238 else{
239 warn "Skipping import, unable to find mandatory $gff_attr attribute in:\t$line";
240 next LINE;
241 }
242 }
243
244
245 #For TFBS
246 #Factor = coding gene name display_label
247 #Target = Target gene?
248 #Ignore other xrefs for name, just put ID in feature as display_label
249
250 #These are mixed up! and where not getting any coding xrefs!
251
252
253 #For CRM
254 #Can we split the ID and have Reguatory XREF?
255 #e.g. ID="dpp_dpp813"; => dpp
256
257
258
259
260 #This can be moved to the BaseExternalParser
261
262 if(! exists $slice_cache{$chromosome}){
263
264 if($old_assembly){
265 $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome',
266 $chromosome,
267 undef,
268 undef,
269 undef,
270 $old_assembly);
271 }else{
272 $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', $chromosome);
273 }
274 }
275
276 if(! defined $slice_cache{$chromosome}){
277 warn "Can't get slice $chromosome for motif $attr_cache{'ID'};\n";
278 $skipped++;
279 next;
280 }
281
282
283 #get feature_type first
284
285 #we are not maintaining this link in the DB!
286 #Do we need another xref for this or a different table?
287
288 my $feature_type;
289
290 #TFBSs
291 if(exists $attr_cache{'Factor'}){
292
293 if(! exists $factor_cache{$attr_cache{'Factor'}}){
294
295 $factor_cache{$attr_cache{'Factor'}} = $ftype_adaptor->fetch_by_name($attr_cache{'Factor'});
296
297 if(! defined $factor_cache{$attr_cache{'Factor'}}){
298
299 #Would need to add CODING DBEntry here!
300 #Will this work on a scalar ref to a hash?
301 my $desc = (exists $config->{'desc_suffix'}) ? $attr_cache{'Factor'}.$config->{'desc_suffix'} : undef;
302
303 ($factor_cache{$attr_cache{'Factor'}}) = @{$ftype_adaptor->store(Bio::EnsEMBL::Funcgen::FeatureType->new
304 (
305 -name => $attr_cache{'Factor'},
306 -class => $fset->feature_type->class,
307 -description => $desc,
308 ))};
309
310 $feature_type = $factor_cache{$attr_cache{'Factor'}};
311 $factor_cnt ++;
312 my $stable_id = $self->get_core_stable_id_by_display_name($self->db->dnadb, $attr_cache{'Factor'});
313
314 #Handle release/version in xref version as stable_id version?
315
316 if(! defined $stable_id){
317 warn "Could not generate CODING xref for feature_type:\t". $attr_cache{'Factor'}."\n";
318 }else{
319 #warn "got $stable_id for ".$attr_cache{'Factor'};
320 my $dbentry = Bio::EnsEMBL::DBEntry->new(
321 -dbname => $species.'_core_Gene',
322 #-release => $self->db->dnadb->dbc->dbname,
323 -status => 'KNOWNXREF',#This is for the external DB
324 #-display_label_linkable => 1,
325 -#db_display_name => $self->db->dnadb->dbc->dbname,
326 -db_display_name => 'EnsemblGene',
327 -type => 'MISC',#Is for the external_db
328 -primary_id => $stable_id,
329 -display_id => $attr_cache{'Factor'},
330 -info_type => 'MISC',
331 -into_text => 'GENE',
332 -linkage_annotation => 'REDfly Coding'
333 -analysis => $fset->analysis,
334
335 #-description => 'cisRED motif gene xref',#This is now generic and no longer resitricted to REDfly
336 #could have version here if we use the correct dnadb to build the cache
337 );
338
339 $dbentry_adaptor->store($dbentry, $factor_cache{$attr_cache{'Factor'}}->dbID, 'FeatureType', 1);#1 is ignore release flag
340 $factor_xref_cnt ++;
341 }
342 }
343 }
344 }
345 else{
346 #CRMs
347 $feature_type = $fset->feature_type;
348 }
349
350
351 #Now build actual feature
352 $feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new
353 (
354 -display_label => $attr_cache{'ID'},
355 -start => $start,
356 -end => $end,
357 -strand => 0,
358 -feature_type => $feature_type,
359 -feature_set => $fset,
360 -slice => $slice_cache{$chromosome},
361 );
362
363 # project if necessary
364 if ($new_assembly) {
365 $feature = $self->project_feature($feature, $new_assembly);
366
367 if(! defined $feature){
368 $skipped ++;
369 next;
370 }
371 }
372
373 ($feature) = @{$extf_adaptor->store($feature)};
374 $feature_cnt++;
375
376 my $target = (exists $attr_cache{'Target'}) ? $attr_cache{'Target'} : (split/_/, $attr_cache{'ID'})[0];
377 my $stable_id;
378
379 if($target ne 'Unspecified'){
380 $stable_id = $self->get_core_stable_id_by_display_name($self->db->dnadb, $target);
381 }
382
383
384 if(! defined $stable_id){
385 warn "Could not generate TARGET xref for feature:\t". $attr_cache{'ID'}."\n" if $target ne 'Unspecified';
386 }
387 else{
388 #Handle release/version in xref version as stable_id version?
389 my $dbentry = Bio::EnsEMBL::DBEntry->new(
390 -dbname => $species.'_core_Gene',
391 #-release => $self->db->dnadb->dbc->dbname,
392 -status => 'KNOWNXREF',
393 #-display_label_linkable => 1,
394 -#db_display_name => $self->db->dnadb->dbc->dbname,
395 -db_display_name => 'EnsemblGene',
396 -type => 'MISC',#
397 -primary_id => $stable_id,
398 -display_id => $target,
399 -info_type => 'MISC',
400 -info_text => 'GENE',
401 -linkage_annotation => $fset->feature_type->name.' Target',
402 -analysis => $fset->analysis,
403
404 #could have version here if we use the correct dnadb to build the cache
405 );
406
407 $dbentry_adaptor->store($dbentry, $feature->dbID, 'ExternalFeature', 1);#1 is ignore release flag
408
409 $feature_target_cnt ++;
410 }
411 }
412
413 close FILE;
414
415 $self->log("Loaded ".$fset->name);
416 $self->log("$factor_cnt feature types");
417 $self->log("$factor_xref_cnt feature type coding xrefs");
418 $self->log("$feature_cnt features");
419 $self->log("$feature_target_cnt feature target xrefs");
420 $self->log("Skipped $skipped features");
421
422 }
423
424 return;
425 }
426
427
428
429
430 1;