Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/cisred.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2011 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <ensembl-dev@ebi.ac.uk>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =cut | |
20 | |
21 package Bio::EnsEMBL::Funcgen::Parsers::cisred; | |
22 | |
23 use strict; | |
24 | |
25 use File::Basename; | |
26 | |
27 # To get files for CisRed data, download the following 2 files (e.g. via wget): | |
28 # | |
29 # http://www.cisred.org/content/databases_methods/human_2/data_files/motifs.txt | |
30 # | |
31 # http://www.cisred.org/content/databases_methods/human_2/data_files/search_regions.txt | |
32 | |
33 | |
34 #No longer valid urls, now use the following for ensembl formats for all species: | |
35 #http://www.bcgsc.ca/downloads/cisred/temp/cisRED4Ensembl/ | |
36 #naminf may not be obvious, may have to cross reference with above previous urls to get build info | |
37 | |
38 # Format of motifs.txt (note group_name often blank) | |
39 | |
40 #name chromosome start end strand group_name ensembl_gene | |
41 #craHsap1 1 168129978 168129997 -1 1 ENSG00000000457 | |
42 #craHsap2 1 168129772 168129781 -1 2 ENSG00000000457 | |
43 #craHsap3 1 168129745 168129756 -1 3 ENSG00000000457 | |
44 #craHsap4 1 168129746 168129753 -1 4 ENSG00000000457 | |
45 #craHsap5 1 168129745 168129752 -1 5 ENSG00000000457 | |
46 #craHsap6 1 168129741 168129757 -1 6 ENSG00000000457 | |
47 | |
48 | |
49 # Format of search_regions.txt | |
50 # name chromosome start end strand ensembl_gene_id | |
51 # 1 17 39822200 39824467 -1 ENSG00000005961 | |
52 # 8 17 23151483 23153621 -1 ENSG00000007171 | |
53 # 14 1 166434638 166437230 -1 ENSG00000007908 | |
54 # 19 1 23602820 23605631 -1 ENSG00000007968 | |
55 | |
56 | |
57 use Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser; | |
58 use Bio::EnsEMBL::DBEntry; | |
59 use Bio::EnsEMBL::Funcgen::ExternalFeature; | |
60 use Bio::EnsEMBL::Utils::Exception qw( throw ); | |
61 | |
62 | |
63 use vars qw(@ISA); | |
64 @ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser); | |
65 | |
66 | |
67 | |
68 | |
69 | |
70 sub new { | |
71 my $caller = shift; | |
72 my $class = ref($caller) || $caller; | |
73 | |
74 my $self = $class->SUPER::new(@_, type => 'cisRED'); | |
75 | |
76 #Set default feature_type and feature_set config | |
77 $self->{static_config}{feature_types} = | |
78 { | |
79 'cisRED Search Region' => { | |
80 -name => 'cisRED Search Region', | |
81 -class => 'Search Region', | |
82 -description => 'cisRED search region', | |
83 }, | |
84 'cisRED Motif' => { | |
85 -name => 'cisRED Motif', | |
86 -class => 'Regulatory Motif', | |
87 -description => 'cisRED atomic motif', | |
88 }, | |
89 }; | |
90 | |
91 $self->{static_config}{analyses} = | |
92 { | |
93 cisRED => { | |
94 -logic_name => 'cisRED', | |
95 -description => 'cisRED motif search (www.cisred.org)', | |
96 -display_label => 'cisRED', | |
97 -displayable => 1, | |
98 }, | |
99 }; | |
100 | |
101 $self->{static_config}{feature_sets} = | |
102 { | |
103 'cisRED search regions' => | |
104 { | |
105 analyses => $self->{static_config}{analyses}, | |
106 feature_types => $self->{static_config}{feature_types}, | |
107 feature_set => { | |
108 #feature_type and analysis here are the keys from above | |
109 -feature_type => 'cisRED Search Region', | |
110 -display_label => 'cisRED searches', | |
111 -analysis => 'cisRED', | |
112 }, | |
113 xrefs => 1, | |
114 }, | |
115 | |
116 | |
117 'cisRED motifs' => | |
118 { | |
119 analyses => $self->{static_config}{analyses}, | |
120 feature_types => $self->{static_config}{feature_types}, | |
121 feature_set => { | |
122 #feature_type and analysis here are the keys from above | |
123 -feature_type => 'cisRED Motif', | |
124 -analysis => 'cisRED', | |
125 }, | |
126 xrefs => 1, | |
127 }, | |
128 }; | |
129 | |
130 #$self->validate_and_store_feature_types; | |
131 $self->validate_and_store_config([keys %{$self->{static_config}{feature_sets}}]); | |
132 $self->set_feature_sets; | |
133 | |
134 return $self; | |
135 } | |
136 | |
137 | |
138 | |
139 | |
140 | |
141 # Parse file and return hashref containing: | |
142 # | |
143 # - arrayref of features | |
144 # - arrayref of factors | |
145 | |
146 | |
147 #To do | |
148 # 1 This needs to take both file names, motifs, then search regions. Like the Bed/GFF importers do. | |
149 | |
150 | |
151 sub parse_and_load { | |
152 my ($self, $files, $old_assembly, $new_assembly) = @_; | |
153 $self->log_header("Parsing cisRED data"); | |
154 | |
155 if(scalar(@$files) != 2){ | |
156 throw('You must currently define a motif and search file to load cisRED features from:\t'.join(' ', @$files)); | |
157 } | |
158 | |
159 | |
160 my $analysis_adaptor = $self->db->get_AnalysisAdaptor(); | |
161 #my %features_by_group; # name -> factor_id | |
162 my %groups; | |
163 my %slice_cache; | |
164 my $extf_adaptor = $self->db->get_ExternalFeatureAdaptor; | |
165 my $dbentry_adaptor = $self->db->get_DBEntryAdaptor; | |
166 my $ftype_adaptor = $self->db->get_FeatureTypeAdaptor; | |
167 #my $display_name_cache = $self->build_display_name_cache('gene'); | |
168 # this object is only used for projection | |
169 my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'CisREDProjection'); | |
170 | |
171 # ---------------------------------------- | |
172 # We need a "blank" factor for those features which aren't assigned factors | |
173 # Done this way to maintain referential integrity | |
174 #my $blank_factor_id = $self->get_blank_factor_id($db_adaptor); | |
175 | |
176 #More validation of files here? | |
177 my ($motif_file) = grep(/motif/, @$files); | |
178 my ($search_file) = grep(/search/, @$files); | |
179 my $species = $self->db->species; | |
180 if(! $species){ | |
181 throw('Must define a species to define the external_db'); | |
182 } | |
183 #Just to make sure we hav homo_sapiens and not Homo Sapiens | |
184 ($species = lc($species)) =~ s/ /_/; | |
185 | |
186 | |
187 # Parse motifs.txt file | |
188 $self->log_header("Parsing cisRED motifs from $motif_file"); | |
189 my $skipped = 0; | |
190 my $skipped_xref = 0; | |
191 #my $coords_changed = 0; | |
192 my $cnt = 0; | |
193 my $set = $self->{static_config}{feature_sets}{'cisRED motifs'}{feature_set}; | |
194 | |
195 | |
196 | |
197 open (FILE, "<$motif_file") || die "Can't open $motif_file"; | |
198 <FILE>; # skip header | |
199 | |
200 while (<FILE>) { | |
201 next if ($_ =~ /^\s*\#/o || $_ =~ /^\s*$/o); | |
202 chomp; | |
203 | |
204 #name chromosome start end strand group_name ensembl_gene | |
205 #craHsap1 1 168129978 168129997 - crtHsap40066,crtHsap40060 ENSG00000000457 | |
206 #craHsap2 1 168129772 168129781 - crtHsap40068,crtHsap40193,crtHsap40130 ENSG00000000457 | |
207 | |
208 #So we only ever have one atomic motif, which may belong to several groups | |
209 #Do not store atmoic motifs as feature types as this is the actual feature | |
210 #simply use the feature_set->feature_type and store the atmoic motif id as the name | |
211 | |
212 | |
213 my ($motif_name, $chromosome, $start, $end, $strand, $groups, $gene_id) = split/\t/o; | |
214 #($gene_id) = $gene_id =~ /(ENS.*G\d{11})/; | |
215 my @group_names = split/,/, $groups; | |
216 | |
217 #These are stranded features, so either - or +, never 0; | |
218 $strand = ($strand eq '-') ? -1 : 1; | |
219 | |
220 if(! exists $slice_cache{$chromosome}){ | |
221 | |
222 if($old_assembly){ | |
223 $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', | |
224 $chromosome, | |
225 undef, | |
226 undef, | |
227 undef, | |
228 $old_assembly); | |
229 }else{ | |
230 $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', $chromosome); | |
231 } | |
232 } | |
233 | |
234 if(! defined $slice_cache{$chromosome}){ | |
235 warn "Can't get slice $chromosome for motif $motif_name\n"; | |
236 $skipped++; | |
237 next; | |
238 } | |
239 | |
240 | |
241 #get feature_type first | |
242 | |
243 #we are not maintaining this link in the DB! | |
244 #Do we need another xref for this or a different table? | |
245 | |
246 | |
247 #if ($group_name && $group_name ne '' && $group_name !~ /\s/o) { | |
248 # | |
249 # if(! exists $features_by_group{$group_name}){ | |
250 # $features_by_group{$group_name} = $ftype_adaptor->fetch_by_name('crtHsap'.$group_name); | |
251 # | |
252 # if(! defined $features_by_group{$group_name}){ | |
253 # ($features_by_group{$group_name}) = @{$ftype_adaptor->store(Bio::EnsEMBL::Funcgen::FeatureType->new | |
254 # ( | |
255 # -name => 'crtHsap'.$group_name, | |
256 # -class => 'Regulatory Motif', | |
257 # -description => 'cisRED group', | |
258 # ))}; | |
259 # } | |
260 # } | |
261 # } | |
262 #}else{ | |
263 # throw("Found cisRED feature $motif_name with no group_name, unable to defined feature_type"); | |
264 #} | |
265 | |
266 foreach my $group(@group_names){ | |
267 | |
268 next if exists $groups{$group}; | |
269 | |
270 #else store the new group as a feature_type and set $group to be the feature_type | |
271 ($group) = @{$ftype_adaptor->store(Bio::EnsEMBL::Funcgen::FeatureType->new | |
272 ( | |
273 -name => $group, | |
274 -class => 'Regulatory Motif', | |
275 -description => 'cisRED group', | |
276 ))}; | |
277 } | |
278 | |
279 | |
280 | |
281 #my $ftype = (defined $features_by_group{$group_name}) ? $features_by_group{$group_name} : $self->{'feature_sets'}{'cisRED group motifs'}->feature_type; | |
282 | |
283 | |
284 my $feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new | |
285 ( | |
286 -display_label => $motif_name, | |
287 -start => $start, | |
288 -end => $end, | |
289 -strand => $strand, | |
290 #-feature_type => $ftype, | |
291 -associated_feature_types => \@group_names, | |
292 -feature_set => $set, | |
293 -slice => $slice_cache{$chromosome}, | |
294 ); | |
295 | |
296 | |
297 | |
298 # project if necessary | |
299 if ($new_assembly) { | |
300 $feature = $self->project_feature($feature, $new_assembly); | |
301 | |
302 if(! defined $feature){ | |
303 $skipped ++; | |
304 next; | |
305 } | |
306 | |
307 } | |
308 | |
309 ($feature) = @{$extf_adaptor->store($feature)}; | |
310 $cnt++; | |
311 | |
312 | |
313 | |
314 #We don't care so much about loading features for retired Genes here | |
315 #as the Genes are only used to define the search regions | |
316 #Not a direct alignment as with the miRanda set | |
317 | |
318 #However, adding an xref will create dead link in the browser | |
319 | |
320 #Build Xref here | |
321 if (! $gene_id) { | |
322 warn("No xref available for motif $motif_name\n"); | |
323 $skipped_xref++; | |
324 next; | |
325 } | |
326 | |
327 my $display_name = $self->get_core_display_name_by_stable_id($self->db->dnadb, $gene_id, 'gene'); | |
328 | |
329 #Handle release/version in xref version as stable_id version? | |
330 | |
331 my $dbentry = Bio::EnsEMBL::DBEntry->new | |
332 ( | |
333 -dbname => $species.'_core_Gene', | |
334 #-release => $self->db->_get_schema_build($self->db->dnadb), | |
335 #-release => '49_36b',#harcoded for human | |
336 -release => '49_37b', #hardcoded for mouse | |
337 -status => 'KNOWNXREF', | |
338 #-display_label_linkable => 1, | |
339 -db_display_name => 'EnsemblGene', | |
340 -type => 'MISC',#this is external_db.type | |
341 -primary_id => $gene_id, | |
342 -display_id => $display_name, | |
343 -info_type => 'MISC', | |
344 -info_text => 'GENE', | |
345 -linkage_annotation => 'cisRED motif gene', | |
346 -analysis => $set->analysis, | |
347 #could have version here if we use the correct dnadb to build the cache | |
348 ); | |
349 $dbentry_adaptor->store($dbentry, $feature->dbID, 'ExternalFeature', 1);#1 is ignore release flag | |
350 } | |
351 | |
352 | |
353 close FILE; | |
354 | |
355 $self->log("Stored $cnt cisRED ExternalFeature motif"); | |
356 $self->log("Skipped $skipped cisRED ExternalFeature motif imports"); | |
357 $self->log("Skipped an additional $skipped_xref DBEntry imports"); | |
358 | |
359 #Now store states | |
360 foreach my $status(qw(DISPLAYABLE MART_DISPLAYABLE)){ | |
361 $set->adaptor->store_status($status, $set); | |
362 } | |
363 | |
364 | |
365 | |
366 # ---------------------------------------- | |
367 # Search regions | |
368 # read search_regions.txt from same location as $file | |
369 | |
370 #my $search_regions_file = dirname($file) . "/search_regions.txt"; | |
371 #my $search_file; | |
372 #($search_regions_file = $file) =~ s/motifs/searchregions/; | |
373 | |
374 $skipped = 0; | |
375 $cnt = 0; | |
376 $skipped_xref = 0; | |
377 $set = $self->{static_config}{feature_sets}{'cisRED search regions'}{feature_set}; | |
378 | |
379 $self->log_header("Parsing cisRED search regions from $search_file"); | |
380 open (SEARCH_REGIONS, "<$search_file") || die "Can't open $search_file"; | |
381 <SEARCH_REGIONS>; # skip header | |
382 | |
383 while (<SEARCH_REGIONS>) { | |
384 chomp; | |
385 my ($id, $chromosome, $start, $end, $strand, $gene_id) = split; | |
386 my $display_id = $self->get_core_display_name_by_stable_id($self->db->dnadb, $gene_id, 'gene'); | |
387 my $name = "CisRed_Search_$id"; | |
388 | |
389 if(! exists $slice_cache{$chromosome}){ | |
390 | |
391 if($old_assembly){ | |
392 $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', | |
393 $chromosome, | |
394 undef, | |
395 undef, | |
396 undef, | |
397 $old_assembly); | |
398 }else{ | |
399 $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', $chromosome); | |
400 } | |
401 } | |
402 | |
403 if(! defined $slice_cache{$chromosome}){ | |
404 warn "Can't get slice $chromosome for search region $name\n"; | |
405 next; | |
406 } | |
407 | |
408 | |
409 | |
410 | |
411 | |
412 my $search_feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new | |
413 ( | |
414 -display_label => $name, | |
415 -start => $start, | |
416 -end => $end, | |
417 -strand => $strand, | |
418 -feature_set => $set, | |
419 -slice => $slice_cache{$chromosome}, | |
420 ); | |
421 | |
422 | |
423 | |
424 # project if necessary | |
425 if ($new_assembly) { | |
426 $search_feature = $self->project_feature($search_feature); | |
427 | |
428 if(! defined $search_feature){ | |
429 $skipped ++; | |
430 next; | |
431 } | |
432 } | |
433 | |
434 $extf_adaptor->store($search_feature); | |
435 $cnt++; | |
436 | |
437 #Build Xref here | |
438 #need to validate gene_id here!! | |
439 | |
440 if (! $gene_id) { | |
441 warn("Can't get internal ID for $gene_id\n"); | |
442 $skipped_xref++; | |
443 next; | |
444 } | |
445 | |
446 my $display_name = $self->get_core_display_name_by_stable_id($self->db->dnadb, $gene_id, 'gene'); | |
447 | |
448 my $dbentry = Bio::EnsEMBL::DBEntry->new | |
449 ( | |
450 -dbname => $species.'_core_Gene', | |
451 #-release => $self->db->dnadb->dbc->dbname, | |
452 -status => 'KNOWNXREF', | |
453 #-display_label_linkable => 1, | |
454 #-db_display_name => $self->db->dnadb->dbc->dbname, | |
455 -db_display_name => 'EnsemblGene', | |
456 -type => 'MISC', | |
457 -primary_id => $gene_id, | |
458 -display_id => $display_name, | |
459 -info_type => 'MISC', | |
460 -info_text => 'GENE', | |
461 -linkage_annotation => 'cisRED search region gene',#omit? | |
462 -analysis => $set->analysis, | |
463 #could have version here if we use the correct dnadb to build the cache | |
464 ); | |
465 $dbentry_adaptor->store($dbentry, $search_feature->dbID, 'ExternalFeature', 1);#1 is ignore release flag | |
466 } | |
467 | |
468 close(SEARCH_REGIONS); | |
469 | |
470 | |
471 $self->log("Stored $cnt cisRED search region ExternalFeatures"); | |
472 $self->log("Skipped $skipped cisRED search region ExternalFeatures"); | |
473 $self->log("Skipped an additional $skipped_xref cisRED search region DBEntry imports"); | |
474 | |
475 #No MART_DISPLAYABLE here | |
476 $set->adaptor->store_status('DISPLAYABLE', $set); | |
477 | |
478 | |
479 #print "$coords_changed features had their co-ordinates changed as a result of assembly mapping.\n" if ($new_assembly); | |
480 | |
481 return; | |
482 | |
483 } | |
484 | |
485 | |
486 | |
487 1; |