Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/redfly.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2011 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <ensembl-dev@ebi.ac.uk>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =cut | |
20 | |
21 package Bio::EnsEMBL::Funcgen::Parsers::redfly; | |
22 | |
23 use strict; | |
24 | |
25 use File::Basename; | |
26 | |
27 # To get files for REDfly, download the following 2 GFF3 files (e.g. via wget): | |
28 # | |
29 # http://redfly.ccr.buffalo.edu/datadumps/tbfs_dump.gff | |
30 # http://redfly.ccr.buffalo.edu/datadumps/crm_dump.gff | |
31 | |
32 #contact? | |
33 | |
34 #TFBS | |
35 #2L REDfly regulatory_region 2456365 2456372 . . . ID="Unspecified_dpp:REDFLY:TF000068"; Dbxref="Flybase:FBgn0000490", "PMID:8543160", "REDfly:644, "FlyBase:"; Evidence="footprint/binding assay"; Factor="Unspecified"; Target="dpp"; | |
36 #2L REDfly regulatory_region 2456352 2456369 . . . ID="dl_dpp:REDFLY:TF000069"; Dbxref="Flybase:FBgn0000490", "PMID:8458580", "REDfly:645, "FlyBase:FBgn0000463"; Evidence="footprint/binding assay"; Factor="dl"; Target="dpp"; | |
37 | |
38 #CRMs | |
39 #2L REDfly regulatory_region 2455781 2457764 . . . ID="dpp_intron2"; Dbxref="Flybase:FBgn0000490", "PMID:8167377", "REDfly:247; Evidence="reporter construct (in vivo)"; Ontology_term="FBbt:00005304"; | |
40 #2L REDfly regulatory_region 2445769 2446581 . . . ID="dpp_dpp813"; Dbxref="Flybase:FBgn0000490", "PMID:7821226", "REDfly:246; Evidence="reporter construct (in vivo)"; Ontology_term="FBbt:00005653","FBbt:00001051"; | |
41 | |
42 | |
43 | |
44 use Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser; | |
45 use Bio::EnsEMBL::DBEntry; | |
46 use Bio::EnsEMBL::Funcgen::ExternalFeature; | |
47 use Bio::EnsEMBL::Utils::Exception qw( throw ); | |
48 use Bio::EnsEMBL::Utils::Argument qw( rearrange ); | |
49 | |
50 use vars qw(@ISA); | |
51 @ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser); | |
52 | |
53 | |
54 | |
55 | |
56 | |
57 sub new { | |
58 my $caller = shift; | |
59 my $class = ref($caller) || $caller; | |
60 | |
61 my $self = $class->SUPER::new(@_, type => 'REDfly'); | |
62 | |
63 #Set default feature_type and feature_set config | |
64 | |
65 #We need to capture version/release/data of external feature sets. | |
66 #This can be nested in the description? Need to add description to feature_set? | |
67 | |
68 $self->{static_config}{feature_types} = | |
69 { | |
70 'REDfly TFBS' => { | |
71 -name => 'REDfly TFBS', | |
72 -class => 'Transcription Factor', | |
73 -description => 'REDfly transciption factor binding site', | |
74 }, | |
75 'REDfly CRM' => { | |
76 -name => 'REDfly CRM', | |
77 -class => 'Regulatory Motif', | |
78 -description => 'REDfly cis regulatory motif', | |
79 }, | |
80 }; | |
81 | |
82 | |
83 $self->{static_config}{analyses} = | |
84 { | |
85 'REDfly TFBS' => { | |
86 -logic_name => 'REDfly TFBS', | |
87 -description => 'REDfly transcription factor binding sites (http://redfly.ccr.buffalo.edu/)', | |
88 -display_label => 'REDfly TFBS', | |
89 -displayable => 1, | |
90 }, | |
91 | |
92 'REDfly CRM' => { | |
93 -logic_name => 'REDfly CRM', | |
94 -description => 'REDfly cis regulatory motif (http://redfly.ccr.buffalo.edu/)', | |
95 -display_label => 'REDfly CRM', | |
96 -displayable => 1, | |
97 }, | |
98 }; | |
99 | |
100 $self->{static_config}{feature_sets} = | |
101 { | |
102 'REDfly TFBSs' => { | |
103 feature_set => | |
104 { | |
105 -feature_type => 'REDfly TFBS', | |
106 -analysis => 'REDfly TFBS', | |
107 }, | |
108 xrefs => 1, | |
109 }, | |
110 | |
111 'REDfly CRMs' => { | |
112 feature_set => | |
113 { | |
114 -feature_type => 'REDfly CRM', | |
115 -analysis => 'REDfly CRM', | |
116 }, | |
117 | |
118 xrefs => 1, | |
119 }, | |
120 }; | |
121 | |
122 | |
123 #Move xref flag here? | |
124 $self->{config} = { | |
125 'REDfly CRMs' => { | |
126 #file => $ENV{'EFG_DATA'}.'/input/REDFLY/redfly_crm.gff', | |
127 gff_attrs => { | |
128 'ID' => 1, | |
129 }, | |
130 }, | |
131 | |
132 'REDfly TFBSs' => { | |
133 #file => $ENV{'EFG_DATA'}.'/input/REDFLY/redfly_tfbs.gff', | |
134 gff_attrs => { | |
135 'ID' => 1, | |
136 'Factor' => 1, | |
137 'Target' => 1, | |
138 }, | |
139 desc_suffix => ' binding site', | |
140 } | |
141 }; | |
142 | |
143 | |
144 $self->validate_and_store_config([keys %{$self->{static_config}{feature_sets}}]); | |
145 $self->set_feature_sets; | |
146 | |
147 return $self; | |
148 } | |
149 | |
150 | |
151 | |
152 | |
153 | |
154 # Parse file and return hashref containing: | |
155 # | |
156 # - arrayref of features | |
157 # - arrayref of factors | |
158 | |
159 | |
160 | |
161 | |
162 sub parse_and_load { | |
163 my ($self, $files, $old_assembly, $new_assembly) = @_; | |
164 | |
165 if(scalar(@$files) != 2){ | |
166 throw('You must currently define a crm and tfbs file to load redfly features from:\t'.join(' ', @$files)); | |
167 } | |
168 | |
169 #More validation of files here? | |
170 $self->{config}{'REDfly CRMs'}{file} = grep(/crm/, @$files); | |
171 $self->{config}{'REDfly TFBSs'}{file} = grep(/tfbs/, @$files); | |
172 | |
173 my %slice_cache; | |
174 my $extf_adaptor = $self->db->get_ExternalFeatureAdaptor; | |
175 my $dbentry_adaptor = $self->db->get_DBEntryAdaptor; | |
176 my $ftype_adaptor = $self->db->get_FeatureTypeAdaptor; | |
177 # this object is only used for projection | |
178 my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'REDflyProjection');#do we need this? | |
179 my $species = $self->db->species; | |
180 | |
181 if(! $species){ | |
182 throw('Must define a species to define the external_db'); | |
183 } | |
184 | |
185 #Just to make sure we hav homo_sapiens and not Homo Sapiens | |
186 ($species = lc($species)) =~ s/ /_/; | |
187 | |
188 | |
189 foreach my $import_set(@{$self->import_sets}){ | |
190 $self->log_header("Parsing $import_set data"); | |
191 | |
192 my %factor_cache; # name -> factor_id | |
193 my %target_cache; | |
194 my $config = $self->{'config'}{$import_set}; | |
195 my $fset = $self->{static_config}{feature_sets}{$import_set}{feature_set}; | |
196 my %gff_attrs = %{$config->{'gff_attrs'}}; | |
197 | |
198 | |
199 # Parse motifs.txt file | |
200 my $file = $config->{'file'}; | |
201 my $skipped = 0; | |
202 my $factor_cnt = 0; | |
203 my $factor_xref_cnt = 0; | |
204 my $feature_cnt = 0; | |
205 my $feature_target_cnt = 0; | |
206 | |
207 open (FILE, "<$file") || die("Can't open $file\n$!\n"); | |
208 <FILE>; # skip header | |
209 | |
210 LINE: while (my $line = <FILE>) { | |
211 next if ($line =~ /^\s*\#/o || $line =~ /^\s*$/o); | |
212 chomp $line; | |
213 my %attr_cache;#Can we move this outside the loop and rely on it being reset each time? | |
214 | |
215 | |
216 #GFF3 | |
217 #Is this format valid, missing " after REDfly xref | |
218 #2L REDfly regulatory_region 2456365 2456372 . . . ID="Unspecified_dpp:REDFLY:TF000068"; Dbxref="Flybase:FBgn0000490", "PMID:8543160", "REDfly:644, "FlyBase:"; Evidence="footprint/binding assay"; Factor="Unspecified"; Target="dpp"; | |
219 #seq_name, source, feature, start, end, score, strand, frame, [attrs] | |
220 my ($chromosome, undef, $feature, $start, $end, undef, undef, undef, $attrs) = split /\t/o, $line; | |
221 my @attrs = split/\;\s+/o, $attrs; | |
222 | |
223 | |
224 #UCSC coords | |
225 $start ++; | |
226 $end ++; | |
227 | |
228 | |
229 | |
230 foreach my $gff_attr(keys %gff_attrs){ | |
231 | |
232 if(($attr_cache{$gff_attr}) = grep {/^${gff_attr}\=/} @attrs){ | |
233 $attr_cache{$gff_attr} =~ s/(^${gff_attr}\=\")(.*)(\")/$2/; | |
234 | |
235 #warn "attr cache is $attr_cache{$gff_attr} "; | |
236 | |
237 } | |
238 else{ | |
239 warn "Skipping import, unable to find mandatory $gff_attr attribute in:\t$line"; | |
240 next LINE; | |
241 } | |
242 } | |
243 | |
244 | |
245 #For TFBS | |
246 #Factor = coding gene name display_label | |
247 #Target = Target gene? | |
248 #Ignore other xrefs for name, just put ID in feature as display_label | |
249 | |
250 #These are mixed up! and where not getting any coding xrefs! | |
251 | |
252 | |
253 #For CRM | |
254 #Can we split the ID and have Reguatory XREF? | |
255 #e.g. ID="dpp_dpp813"; => dpp | |
256 | |
257 | |
258 | |
259 | |
260 #This can be moved to the BaseExternalParser | |
261 | |
262 if(! exists $slice_cache{$chromosome}){ | |
263 | |
264 if($old_assembly){ | |
265 $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', | |
266 $chromosome, | |
267 undef, | |
268 undef, | |
269 undef, | |
270 $old_assembly); | |
271 }else{ | |
272 $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', $chromosome); | |
273 } | |
274 } | |
275 | |
276 if(! defined $slice_cache{$chromosome}){ | |
277 warn "Can't get slice $chromosome for motif $attr_cache{'ID'};\n"; | |
278 $skipped++; | |
279 next; | |
280 } | |
281 | |
282 | |
283 #get feature_type first | |
284 | |
285 #we are not maintaining this link in the DB! | |
286 #Do we need another xref for this or a different table? | |
287 | |
288 my $feature_type; | |
289 | |
290 #TFBSs | |
291 if(exists $attr_cache{'Factor'}){ | |
292 | |
293 if(! exists $factor_cache{$attr_cache{'Factor'}}){ | |
294 | |
295 $factor_cache{$attr_cache{'Factor'}} = $ftype_adaptor->fetch_by_name($attr_cache{'Factor'}); | |
296 | |
297 if(! defined $factor_cache{$attr_cache{'Factor'}}){ | |
298 | |
299 #Would need to add CODING DBEntry here! | |
300 #Will this work on a scalar ref to a hash? | |
301 my $desc = (exists $config->{'desc_suffix'}) ? $attr_cache{'Factor'}.$config->{'desc_suffix'} : undef; | |
302 | |
303 ($factor_cache{$attr_cache{'Factor'}}) = @{$ftype_adaptor->store(Bio::EnsEMBL::Funcgen::FeatureType->new | |
304 ( | |
305 -name => $attr_cache{'Factor'}, | |
306 -class => $fset->feature_type->class, | |
307 -description => $desc, | |
308 ))}; | |
309 | |
310 $feature_type = $factor_cache{$attr_cache{'Factor'}}; | |
311 $factor_cnt ++; | |
312 my $stable_id = $self->get_core_stable_id_by_display_name($self->db->dnadb, $attr_cache{'Factor'}); | |
313 | |
314 #Handle release/version in xref version as stable_id version? | |
315 | |
316 if(! defined $stable_id){ | |
317 warn "Could not generate CODING xref for feature_type:\t". $attr_cache{'Factor'}."\n"; | |
318 }else{ | |
319 #warn "got $stable_id for ".$attr_cache{'Factor'}; | |
320 my $dbentry = Bio::EnsEMBL::DBEntry->new( | |
321 -dbname => $species.'_core_Gene', | |
322 #-release => $self->db->dnadb->dbc->dbname, | |
323 -status => 'KNOWNXREF',#This is for the external DB | |
324 #-display_label_linkable => 1, | |
325 -#db_display_name => $self->db->dnadb->dbc->dbname, | |
326 -db_display_name => 'EnsemblGene', | |
327 -type => 'MISC',#Is for the external_db | |
328 -primary_id => $stable_id, | |
329 -display_id => $attr_cache{'Factor'}, | |
330 -info_type => 'MISC', | |
331 -into_text => 'GENE', | |
332 -linkage_annotation => 'REDfly Coding' | |
333 -analysis => $fset->analysis, | |
334 | |
335 #-description => 'cisRED motif gene xref',#This is now generic and no longer resitricted to REDfly | |
336 #could have version here if we use the correct dnadb to build the cache | |
337 ); | |
338 | |
339 $dbentry_adaptor->store($dbentry, $factor_cache{$attr_cache{'Factor'}}->dbID, 'FeatureType', 1);#1 is ignore release flag | |
340 $factor_xref_cnt ++; | |
341 } | |
342 } | |
343 } | |
344 } | |
345 else{ | |
346 #CRMs | |
347 $feature_type = $fset->feature_type; | |
348 } | |
349 | |
350 | |
351 #Now build actual feature | |
352 $feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new | |
353 ( | |
354 -display_label => $attr_cache{'ID'}, | |
355 -start => $start, | |
356 -end => $end, | |
357 -strand => 0, | |
358 -feature_type => $feature_type, | |
359 -feature_set => $fset, | |
360 -slice => $slice_cache{$chromosome}, | |
361 ); | |
362 | |
363 # project if necessary | |
364 if ($new_assembly) { | |
365 $feature = $self->project_feature($feature, $new_assembly); | |
366 | |
367 if(! defined $feature){ | |
368 $skipped ++; | |
369 next; | |
370 } | |
371 } | |
372 | |
373 ($feature) = @{$extf_adaptor->store($feature)}; | |
374 $feature_cnt++; | |
375 | |
376 my $target = (exists $attr_cache{'Target'}) ? $attr_cache{'Target'} : (split/_/, $attr_cache{'ID'})[0]; | |
377 my $stable_id; | |
378 | |
379 if($target ne 'Unspecified'){ | |
380 $stable_id = $self->get_core_stable_id_by_display_name($self->db->dnadb, $target); | |
381 } | |
382 | |
383 | |
384 if(! defined $stable_id){ | |
385 warn "Could not generate TARGET xref for feature:\t". $attr_cache{'ID'}."\n" if $target ne 'Unspecified'; | |
386 } | |
387 else{ | |
388 #Handle release/version in xref version as stable_id version? | |
389 my $dbentry = Bio::EnsEMBL::DBEntry->new( | |
390 -dbname => $species.'_core_Gene', | |
391 #-release => $self->db->dnadb->dbc->dbname, | |
392 -status => 'KNOWNXREF', | |
393 #-display_label_linkable => 1, | |
394 -#db_display_name => $self->db->dnadb->dbc->dbname, | |
395 -db_display_name => 'EnsemblGene', | |
396 -type => 'MISC',# | |
397 -primary_id => $stable_id, | |
398 -display_id => $target, | |
399 -info_type => 'MISC', | |
400 -info_text => 'GENE', | |
401 -linkage_annotation => $fset->feature_type->name.' Target', | |
402 -analysis => $fset->analysis, | |
403 | |
404 #could have version here if we use the correct dnadb to build the cache | |
405 ); | |
406 | |
407 $dbentry_adaptor->store($dbentry, $feature->dbID, 'ExternalFeature', 1);#1 is ignore release flag | |
408 | |
409 $feature_target_cnt ++; | |
410 } | |
411 } | |
412 | |
413 close FILE; | |
414 | |
415 $self->log("Loaded ".$fset->name); | |
416 $self->log("$factor_cnt feature types"); | |
417 $self->log("$factor_xref_cnt feature type coding xrefs"); | |
418 $self->log("$feature_cnt features"); | |
419 $self->log("$feature_target_cnt feature target xrefs"); | |
420 $self->log("Skipped $skipped features"); | |
421 | |
422 } | |
423 | |
424 return; | |
425 } | |
426 | |
427 | |
428 | |
429 | |
430 1; |