0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2011 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <ensembl-dev@ebi.ac.uk>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21 package Bio::EnsEMBL::Funcgen::Parsers::cisred;
|
|
22
|
|
23 use strict;
|
|
24
|
|
25 use File::Basename;
|
|
26
|
|
27 # To get files for CisRed data, download the following 2 files (e.g. via wget):
|
|
28 #
|
|
29 # http://www.cisred.org/content/databases_methods/human_2/data_files/motifs.txt
|
|
30 #
|
|
31 # http://www.cisred.org/content/databases_methods/human_2/data_files/search_regions.txt
|
|
32
|
|
33
|
|
34 #No longer valid urls, now use the following for ensembl formats for all species:
|
|
35 #http://www.bcgsc.ca/downloads/cisred/temp/cisRED4Ensembl/
|
|
36 #naminf may not be obvious, may have to cross reference with above previous urls to get build info
|
|
37
|
|
38 # Format of motifs.txt (note group_name often blank)
|
|
39
|
|
40 #name chromosome start end strand group_name ensembl_gene
|
|
41 #craHsap1 1 168129978 168129997 -1 1 ENSG00000000457
|
|
42 #craHsap2 1 168129772 168129781 -1 2 ENSG00000000457
|
|
43 #craHsap3 1 168129745 168129756 -1 3 ENSG00000000457
|
|
44 #craHsap4 1 168129746 168129753 -1 4 ENSG00000000457
|
|
45 #craHsap5 1 168129745 168129752 -1 5 ENSG00000000457
|
|
46 #craHsap6 1 168129741 168129757 -1 6 ENSG00000000457
|
|
47
|
|
48
|
|
49 # Format of search_regions.txt
|
|
50 # name chromosome start end strand ensembl_gene_id
|
|
51 # 1 17 39822200 39824467 -1 ENSG00000005961
|
|
52 # 8 17 23151483 23153621 -1 ENSG00000007171
|
|
53 # 14 1 166434638 166437230 -1 ENSG00000007908
|
|
54 # 19 1 23602820 23605631 -1 ENSG00000007968
|
|
55
|
|
56
|
|
57 use Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser;
|
|
58 use Bio::EnsEMBL::DBEntry;
|
|
59 use Bio::EnsEMBL::Funcgen::ExternalFeature;
|
|
60 use Bio::EnsEMBL::Utils::Exception qw( throw );
|
|
61
|
|
62
|
|
63 use vars qw(@ISA);
|
|
64 @ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser);
|
|
65
|
|
66
|
|
67
|
|
68
|
|
69
|
|
70 sub new {
|
|
71 my $caller = shift;
|
|
72 my $class = ref($caller) || $caller;
|
|
73
|
|
74 my $self = $class->SUPER::new(@_, type => 'cisRED');
|
|
75
|
|
76 #Set default feature_type and feature_set config
|
|
77 $self->{static_config}{feature_types} =
|
|
78 {
|
|
79 'cisRED Search Region' => {
|
|
80 -name => 'cisRED Search Region',
|
|
81 -class => 'Search Region',
|
|
82 -description => 'cisRED search region',
|
|
83 },
|
|
84 'cisRED Motif' => {
|
|
85 -name => 'cisRED Motif',
|
|
86 -class => 'Regulatory Motif',
|
|
87 -description => 'cisRED atomic motif',
|
|
88 },
|
|
89 };
|
|
90
|
|
91 $self->{static_config}{analyses} =
|
|
92 {
|
|
93 cisRED => {
|
|
94 -logic_name => 'cisRED',
|
|
95 -description => 'cisRED motif search (www.cisred.org)',
|
|
96 -display_label => 'cisRED',
|
|
97 -displayable => 1,
|
|
98 },
|
|
99 };
|
|
100
|
|
101 $self->{static_config}{feature_sets} =
|
|
102 {
|
|
103 'cisRED search regions' =>
|
|
104 {
|
|
105 analyses => $self->{static_config}{analyses},
|
|
106 feature_types => $self->{static_config}{feature_types},
|
|
107 feature_set => {
|
|
108 #feature_type and analysis here are the keys from above
|
|
109 -feature_type => 'cisRED Search Region',
|
|
110 -display_label => 'cisRED searches',
|
|
111 -analysis => 'cisRED',
|
|
112 },
|
|
113 xrefs => 1,
|
|
114 },
|
|
115
|
|
116
|
|
117 'cisRED motifs' =>
|
|
118 {
|
|
119 analyses => $self->{static_config}{analyses},
|
|
120 feature_types => $self->{static_config}{feature_types},
|
|
121 feature_set => {
|
|
122 #feature_type and analysis here are the keys from above
|
|
123 -feature_type => 'cisRED Motif',
|
|
124 -analysis => 'cisRED',
|
|
125 },
|
|
126 xrefs => 1,
|
|
127 },
|
|
128 };
|
|
129
|
|
130 #$self->validate_and_store_feature_types;
|
|
131 $self->validate_and_store_config([keys %{$self->{static_config}{feature_sets}}]);
|
|
132 $self->set_feature_sets;
|
|
133
|
|
134 return $self;
|
|
135 }
|
|
136
|
|
137
|
|
138
|
|
139
|
|
140
|
|
141 # Parse file and return hashref containing:
|
|
142 #
|
|
143 # - arrayref of features
|
|
144 # - arrayref of factors
|
|
145
|
|
146
|
|
147 #To do
|
|
148 # 1 This needs to take both file names, motifs, then search regions. Like the Bed/GFF importers do.
|
|
149
|
|
150
|
|
151 sub parse_and_load {
|
|
152 my ($self, $files, $old_assembly, $new_assembly) = @_;
|
|
153 $self->log_header("Parsing cisRED data");
|
|
154
|
|
155 if(scalar(@$files) != 2){
|
|
156 throw('You must currently define a motif and search file to load cisRED features from:\t'.join(' ', @$files));
|
|
157 }
|
|
158
|
|
159
|
|
160 my $analysis_adaptor = $self->db->get_AnalysisAdaptor();
|
|
161 #my %features_by_group; # name -> factor_id
|
|
162 my %groups;
|
|
163 my %slice_cache;
|
|
164 my $extf_adaptor = $self->db->get_ExternalFeatureAdaptor;
|
|
165 my $dbentry_adaptor = $self->db->get_DBEntryAdaptor;
|
|
166 my $ftype_adaptor = $self->db->get_FeatureTypeAdaptor;
|
|
167 #my $display_name_cache = $self->build_display_name_cache('gene');
|
|
168 # this object is only used for projection
|
|
169 my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'CisREDProjection');
|
|
170
|
|
171 # ----------------------------------------
|
|
172 # We need a "blank" factor for those features which aren't assigned factors
|
|
173 # Done this way to maintain referential integrity
|
|
174 #my $blank_factor_id = $self->get_blank_factor_id($db_adaptor);
|
|
175
|
|
176 #More validation of files here?
|
|
177 my ($motif_file) = grep(/motif/, @$files);
|
|
178 my ($search_file) = grep(/search/, @$files);
|
|
179 my $species = $self->db->species;
|
|
180 if(! $species){
|
|
181 throw('Must define a species to define the external_db');
|
|
182 }
|
|
183 #Just to make sure we hav homo_sapiens and not Homo Sapiens
|
|
184 ($species = lc($species)) =~ s/ /_/;
|
|
185
|
|
186
|
|
187 # Parse motifs.txt file
|
|
188 $self->log_header("Parsing cisRED motifs from $motif_file");
|
|
189 my $skipped = 0;
|
|
190 my $skipped_xref = 0;
|
|
191 #my $coords_changed = 0;
|
|
192 my $cnt = 0;
|
|
193 my $set = $self->{static_config}{feature_sets}{'cisRED motifs'}{feature_set};
|
|
194
|
|
195
|
|
196
|
|
197 open (FILE, "<$motif_file") || die "Can't open $motif_file";
|
|
198 <FILE>; # skip header
|
|
199
|
|
200 while (<FILE>) {
|
|
201 next if ($_ =~ /^\s*\#/o || $_ =~ /^\s*$/o);
|
|
202 chomp;
|
|
203
|
|
204 #name chromosome start end strand group_name ensembl_gene
|
|
205 #craHsap1 1 168129978 168129997 - crtHsap40066,crtHsap40060 ENSG00000000457
|
|
206 #craHsap2 1 168129772 168129781 - crtHsap40068,crtHsap40193,crtHsap40130 ENSG00000000457
|
|
207
|
|
208 #So we only ever have one atomic motif, which may belong to several groups
|
|
209 #Do not store atmoic motifs as feature types as this is the actual feature
|
|
210 #simply use the feature_set->feature_type and store the atmoic motif id as the name
|
|
211
|
|
212
|
|
213 my ($motif_name, $chromosome, $start, $end, $strand, $groups, $gene_id) = split/\t/o;
|
|
214 #($gene_id) = $gene_id =~ /(ENS.*G\d{11})/;
|
|
215 my @group_names = split/,/, $groups;
|
|
216
|
|
217 #These are stranded features, so either - or +, never 0;
|
|
218 $strand = ($strand eq '-') ? -1 : 1;
|
|
219
|
|
220 if(! exists $slice_cache{$chromosome}){
|
|
221
|
|
222 if($old_assembly){
|
|
223 $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome',
|
|
224 $chromosome,
|
|
225 undef,
|
|
226 undef,
|
|
227 undef,
|
|
228 $old_assembly);
|
|
229 }else{
|
|
230 $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', $chromosome);
|
|
231 }
|
|
232 }
|
|
233
|
|
234 if(! defined $slice_cache{$chromosome}){
|
|
235 warn "Can't get slice $chromosome for motif $motif_name\n";
|
|
236 $skipped++;
|
|
237 next;
|
|
238 }
|
|
239
|
|
240
|
|
241 #get feature_type first
|
|
242
|
|
243 #we are not maintaining this link in the DB!
|
|
244 #Do we need another xref for this or a different table?
|
|
245
|
|
246
|
|
247 #if ($group_name && $group_name ne '' && $group_name !~ /\s/o) {
|
|
248 #
|
|
249 # if(! exists $features_by_group{$group_name}){
|
|
250 # $features_by_group{$group_name} = $ftype_adaptor->fetch_by_name('crtHsap'.$group_name);
|
|
251 #
|
|
252 # if(! defined $features_by_group{$group_name}){
|
|
253 # ($features_by_group{$group_name}) = @{$ftype_adaptor->store(Bio::EnsEMBL::Funcgen::FeatureType->new
|
|
254 # (
|
|
255 # -name => 'crtHsap'.$group_name,
|
|
256 # -class => 'Regulatory Motif',
|
|
257 # -description => 'cisRED group',
|
|
258 # ))};
|
|
259 # }
|
|
260 # }
|
|
261 # }
|
|
262 #}else{
|
|
263 # throw("Found cisRED feature $motif_name with no group_name, unable to defined feature_type");
|
|
264 #}
|
|
265
|
|
266 foreach my $group(@group_names){
|
|
267
|
|
268 next if exists $groups{$group};
|
|
269
|
|
270 #else store the new group as a feature_type and set $group to be the feature_type
|
|
271 ($group) = @{$ftype_adaptor->store(Bio::EnsEMBL::Funcgen::FeatureType->new
|
|
272 (
|
|
273 -name => $group,
|
|
274 -class => 'Regulatory Motif',
|
|
275 -description => 'cisRED group',
|
|
276 ))};
|
|
277 }
|
|
278
|
|
279
|
|
280
|
|
281 #my $ftype = (defined $features_by_group{$group_name}) ? $features_by_group{$group_name} : $self->{'feature_sets'}{'cisRED group motifs'}->feature_type;
|
|
282
|
|
283
|
|
284 my $feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new
|
|
285 (
|
|
286 -display_label => $motif_name,
|
|
287 -start => $start,
|
|
288 -end => $end,
|
|
289 -strand => $strand,
|
|
290 #-feature_type => $ftype,
|
|
291 -associated_feature_types => \@group_names,
|
|
292 -feature_set => $set,
|
|
293 -slice => $slice_cache{$chromosome},
|
|
294 );
|
|
295
|
|
296
|
|
297
|
|
298 # project if necessary
|
|
299 if ($new_assembly) {
|
|
300 $feature = $self->project_feature($feature, $new_assembly);
|
|
301
|
|
302 if(! defined $feature){
|
|
303 $skipped ++;
|
|
304 next;
|
|
305 }
|
|
306
|
|
307 }
|
|
308
|
|
309 ($feature) = @{$extf_adaptor->store($feature)};
|
|
310 $cnt++;
|
|
311
|
|
312
|
|
313
|
|
314 #We don't care so much about loading features for retired Genes here
|
|
315 #as the Genes are only used to define the search regions
|
|
316 #Not a direct alignment as with the miRanda set
|
|
317
|
|
318 #However, adding an xref will create dead link in the browser
|
|
319
|
|
320 #Build Xref here
|
|
321 if (! $gene_id) {
|
|
322 warn("No xref available for motif $motif_name\n");
|
|
323 $skipped_xref++;
|
|
324 next;
|
|
325 }
|
|
326
|
|
327 my $display_name = $self->get_core_display_name_by_stable_id($self->db->dnadb, $gene_id, 'gene');
|
|
328
|
|
329 #Handle release/version in xref version as stable_id version?
|
|
330
|
|
331 my $dbentry = Bio::EnsEMBL::DBEntry->new
|
|
332 (
|
|
333 -dbname => $species.'_core_Gene',
|
|
334 #-release => $self->db->_get_schema_build($self->db->dnadb),
|
|
335 #-release => '49_36b',#harcoded for human
|
|
336 -release => '49_37b', #hardcoded for mouse
|
|
337 -status => 'KNOWNXREF',
|
|
338 #-display_label_linkable => 1,
|
|
339 -db_display_name => 'EnsemblGene',
|
|
340 -type => 'MISC',#this is external_db.type
|
|
341 -primary_id => $gene_id,
|
|
342 -display_id => $display_name,
|
|
343 -info_type => 'MISC',
|
|
344 -info_text => 'GENE',
|
|
345 -linkage_annotation => 'cisRED motif gene',
|
|
346 -analysis => $set->analysis,
|
|
347 #could have version here if we use the correct dnadb to build the cache
|
|
348 );
|
|
349 $dbentry_adaptor->store($dbentry, $feature->dbID, 'ExternalFeature', 1);#1 is ignore release flag
|
|
350 }
|
|
351
|
|
352
|
|
353 close FILE;
|
|
354
|
|
355 $self->log("Stored $cnt cisRED ExternalFeature motif");
|
|
356 $self->log("Skipped $skipped cisRED ExternalFeature motif imports");
|
|
357 $self->log("Skipped an additional $skipped_xref DBEntry imports");
|
|
358
|
|
359 #Now store states
|
|
360 foreach my $status(qw(DISPLAYABLE MART_DISPLAYABLE)){
|
|
361 $set->adaptor->store_status($status, $set);
|
|
362 }
|
|
363
|
|
364
|
|
365
|
|
366 # ----------------------------------------
|
|
367 # Search regions
|
|
368 # read search_regions.txt from same location as $file
|
|
369
|
|
370 #my $search_regions_file = dirname($file) . "/search_regions.txt";
|
|
371 #my $search_file;
|
|
372 #($search_regions_file = $file) =~ s/motifs/searchregions/;
|
|
373
|
|
374 $skipped = 0;
|
|
375 $cnt = 0;
|
|
376 $skipped_xref = 0;
|
|
377 $set = $self->{static_config}{feature_sets}{'cisRED search regions'}{feature_set};
|
|
378
|
|
379 $self->log_header("Parsing cisRED search regions from $search_file");
|
|
380 open (SEARCH_REGIONS, "<$search_file") || die "Can't open $search_file";
|
|
381 <SEARCH_REGIONS>; # skip header
|
|
382
|
|
383 while (<SEARCH_REGIONS>) {
|
|
384 chomp;
|
|
385 my ($id, $chromosome, $start, $end, $strand, $gene_id) = split;
|
|
386 my $display_id = $self->get_core_display_name_by_stable_id($self->db->dnadb, $gene_id, 'gene');
|
|
387 my $name = "CisRed_Search_$id";
|
|
388
|
|
389 if(! exists $slice_cache{$chromosome}){
|
|
390
|
|
391 if($old_assembly){
|
|
392 $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome',
|
|
393 $chromosome,
|
|
394 undef,
|
|
395 undef,
|
|
396 undef,
|
|
397 $old_assembly);
|
|
398 }else{
|
|
399 $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', $chromosome);
|
|
400 }
|
|
401 }
|
|
402
|
|
403 if(! defined $slice_cache{$chromosome}){
|
|
404 warn "Can't get slice $chromosome for search region $name\n";
|
|
405 next;
|
|
406 }
|
|
407
|
|
408
|
|
409
|
|
410
|
|
411
|
|
412 my $search_feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new
|
|
413 (
|
|
414 -display_label => $name,
|
|
415 -start => $start,
|
|
416 -end => $end,
|
|
417 -strand => $strand,
|
|
418 -feature_set => $set,
|
|
419 -slice => $slice_cache{$chromosome},
|
|
420 );
|
|
421
|
|
422
|
|
423
|
|
424 # project if necessary
|
|
425 if ($new_assembly) {
|
|
426 $search_feature = $self->project_feature($search_feature);
|
|
427
|
|
428 if(! defined $search_feature){
|
|
429 $skipped ++;
|
|
430 next;
|
|
431 }
|
|
432 }
|
|
433
|
|
434 $extf_adaptor->store($search_feature);
|
|
435 $cnt++;
|
|
436
|
|
437 #Build Xref here
|
|
438 #need to validate gene_id here!!
|
|
439
|
|
440 if (! $gene_id) {
|
|
441 warn("Can't get internal ID for $gene_id\n");
|
|
442 $skipped_xref++;
|
|
443 next;
|
|
444 }
|
|
445
|
|
446 my $display_name = $self->get_core_display_name_by_stable_id($self->db->dnadb, $gene_id, 'gene');
|
|
447
|
|
448 my $dbentry = Bio::EnsEMBL::DBEntry->new
|
|
449 (
|
|
450 -dbname => $species.'_core_Gene',
|
|
451 #-release => $self->db->dnadb->dbc->dbname,
|
|
452 -status => 'KNOWNXREF',
|
|
453 #-display_label_linkable => 1,
|
|
454 #-db_display_name => $self->db->dnadb->dbc->dbname,
|
|
455 -db_display_name => 'EnsemblGene',
|
|
456 -type => 'MISC',
|
|
457 -primary_id => $gene_id,
|
|
458 -display_id => $display_name,
|
|
459 -info_type => 'MISC',
|
|
460 -info_text => 'GENE',
|
|
461 -linkage_annotation => 'cisRED search region gene',#omit?
|
|
462 -analysis => $set->analysis,
|
|
463 #could have version here if we use the correct dnadb to build the cache
|
|
464 );
|
|
465 $dbentry_adaptor->store($dbentry, $search_feature->dbID, 'ExternalFeature', 1);#1 is ignore release flag
|
|
466 }
|
|
467
|
|
468 close(SEARCH_REGIONS);
|
|
469
|
|
470
|
|
471 $self->log("Stored $cnt cisRED search region ExternalFeatures");
|
|
472 $self->log("Skipped $skipped cisRED search region ExternalFeatures");
|
|
473 $self->log("Skipped an additional $skipped_xref cisRED search region DBEntry imports");
|
|
474
|
|
475 #No MART_DISPLAYABLE here
|
|
476 $set->adaptor->store_status('DISPLAYABLE', $set);
|
|
477
|
|
478
|
|
479 #print "$coords_changed features had their co-ordinates changed as a result of assembly mapping.\n" if ($new_assembly);
|
|
480
|
|
481 return;
|
|
482
|
|
483 }
|
|
484
|
|
485
|
|
486
|
|
487 1;
|