0
|
1 =pod
|
|
2
|
|
3 =head1 LICENSE
|
|
4
|
|
5 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
6 Genome Research Limited. All rights reserved.
|
|
7
|
|
8 This software is distributed under a modified Apache license.
|
|
9 For license details, please see
|
|
10
|
|
11 http://www.ensembl.org/info/about/code_licence.html
|
|
12
|
|
13 =head1 CONTACT
|
|
14
|
|
15 Please email comments or questions to the public Ensembl
|
|
16 developers list at <dev@ensembl.org>.
|
|
17
|
|
18 Questions may also be sent to the Ensembl help desk at
|
|
19 <helpdesk@ensembl.org>.
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 Bio::EnsEMBL::Pipeline::FASTA::WuBlastIndexer
|
|
24
|
|
25 =head1 DESCRIPTION
|
|
26
|
|
27 Creates WUBlast indexes of the given GZipped file. The resulting index
|
|
28 is created under the parameter location I<base_path> in blast and then in a
|
|
29 directory defined by the type of dump. The type of dump also changes the file
|
|
30 name generated. Genomic dumps have their release number replaced with the
|
|
31 last repeat masked date.
|
|
32
|
|
33 Allowed parameters are:
|
|
34
|
|
35 =over 8
|
|
36
|
|
37 =item file - The file to index
|
|
38
|
|
39 =item program - The location of the xdformat program
|
|
40
|
|
41 =item molecule - The type of molecule to index. I<dna> and I<pep> are allowed
|
|
42
|
|
43 =item type - Type of index we are creating. I<genomic> and I<genes> are allowed
|
|
44
|
|
45 =item base_path - The base of the dumps
|
|
46
|
|
47 =item release - Required for correct DB naming
|
|
48
|
|
49 =back
|
|
50
|
|
51 =cut
|
|
52
|
|
53 package Bio::EnsEMBL::Pipeline::FASTA::WuBlastIndexer;
|
|
54
|
|
55 use strict;
|
|
56 use warnings;
|
|
57 use base qw/Bio::EnsEMBL::Pipeline::FASTA::Indexer/;
|
|
58
|
|
59 use Bio::EnsEMBL::Utils::Exception qw/throw/;
|
|
60 use File::Copy qw/copy/;
|
|
61 use File::Spec;
|
|
62 use POSIX qw/strftime/;
|
|
63
|
|
# Supplies the default parameter values for this runnable.
#
# Only 'program' has a default ('xdformat'). 'molecule' (pep or dna) and
# 'type' (genes or genomic) deliberately have none and must be provided
# by the caller.
#
# Returns: a reference to a newly built hash of defaults.
sub param_defaults {
  my ($self) = @_;
  my %defaults = (
    program => 'xdformat',
  );
  return \%defaults;
}
|
|
72
|
|
# Validates the job parameters before any indexing work is attempted.
#
# Checks, in order:
#   * 'molecule' is set and is either 'dna' or 'pep'
#   * 'type' is set and is either 'genomic' or 'genes'
#   * the configured 'program' and 'gunzip' pass assert_executable()
#
# Throws: if 'molecule' or 'type' is missing or holds any other value.
#         assert_executable() is provided elsewhere; how it reports a
#         missing executable is not visible from this file.
sub fetch_input {
  my ($self) = @_;

  # Test definedness first so that a missing parameter produces the intended
  # error without "uninitialized value" warnings from the string comparison.
  my $mol = $self->param('molecule');
  if(!defined $mol || ($mol ne 'dna' && $mol ne 'pep')) {
    throw("param 'molecule' must be set to 'dna' or 'pep'");
  }

  my $type = $self->param('type');
  if(!defined $type || ($type ne 'genomic' && $type ne 'genes')) {
    throw("param 'type' must be set to 'genomic' or 'genes'");
  }

  $self->assert_executable($self->param('program'));
  $self->assert_executable('gunzip');
}
|
|
86
|
|
# Passes the details of the finished index on to the next step.
#
# The current values of the 'species', 'type', 'molecule' and 'index_base'
# params are collected into one hash, which is handed to
# dataflow_output_id() together with the value 1 (presumably the branch
# number - confirm against the pipeline base class).
#
# Returns: nothing.
sub write_output {
  my ($self) = @_;
  my @flowed_params = qw/species type molecule index_base/;
  my %output_id = map { ($_, $self->param($_)) } @flowed_params;
  $self->dataflow_output_id(\%output_id, 1);
  return;
}
|
|
97
|
|
# Builds the index for $file by running the configured 'program' from
# inside the target directory, then deletes $file.
#
# The command is given '-n' when the 'molecule' param is 'dna' and '-p'
# otherwise, '-q1' unless debug() is true (then '-q0'), plus the database
# title, date and output name derived from $file.
#
# Params:  $file - path of the file to index; it is removed from the
#                  filesystem once the command has succeeded.
# Returns: nothing. The output location is stored in the 'index_base' param.
# Throws:  if the command cannot be started, is terminated by a signal or
#          exits with a non-zero code, or if $file cannot be removed.
sub index_file {
  my ($self, $file) = @_;
  my $molecule_arg = ($self->param('molecule') eq 'dna') ? '-n' : '-p' ;
  my $silence = ($self->debug()) ? 0 : 1;
  my $target_dir = $self->target_dir();
  my $target_file = $self->target_file($file);
  my $db_title = $self->db_title($file);
  my $date = $self->db_date();

  # The command goes through the shell (needed for 'cd ... &&' and '2>&1'),
  # so single-quote every interpolated value; an embedded single quote is
  # closed, escaped and re-opened. This keeps paths or titles containing
  # spaces or shell metacharacters intact.
  my $shell_quote = sub {
    my ($arg) = @_;
    $arg =~ s/'/'\\''/g;
    return qq{'$arg'};
  };

  my $cmd = sprintf(q{cd %s && %s %s -q%d -I -t %s -d %s -o %s %s },
    $shell_quote->($target_dir), $self->param('program'), $molecule_arg, $silence,
    $shell_quote->($db_title), $shell_quote->($date),
    $shell_quote->($target_file), $shell_quote->($file));

  $self->info('About to run "%s"', $cmd);
  my $output = `$cmd 2>&1`;
  # Capture both immediately; later calls may overwrite them.
  my ($status, $os_error) = ($?, $!);
  $output = q{} if ! defined $output;

  # $? is -1 when the command could not be started at all. Otherwise the low
  # 7 bits hold a terminating signal and the high byte the exit code. Looking
  # only at the exit code would treat a signalled command as a success and
  # go on to delete the source file.
  if($status == -1) {
    throw("Cannot run program '$cmd'. Failed to execute: $os_error");
  }
  if(my $signal = $status & 127) {
    throw("Cannot run program '$cmd'. Killed by signal ${signal}. Program output was $output");
  }
  my $rc = $status >> 8;
  throw("Cannot run program '$cmd'. Return code was ${rc}. Program output was $output") if $rc;

  unlink $file or throw("Cannot remove the file '$file' from the filesystem: $!");
  $self->param('index_base', $target_file);
  return;
}
|
|
118
|
|
# Works out the full path of the index produced for a source file.
#
# Params:  $file - path of the source file being indexed.
# Returns: target_dir() joined with target_filename($file) using
#          File::Spec->catfile.
sub target_file {
  my ($self, $file) = @_;
  # scalar() keeps both helpers in scalar context even though they are
  # called inside an argument list.
  return File::Spec->catfile(
    scalar($self->target_dir()),
    scalar($self->target_filename($file)),
  );
}
|
|
125
|
|
# Returns the directory the index is written to: whatever get_dir() yields
# for the components 'blast' and the value of the 'type' param. How get_dir()
# turns those components into a path is decided outside this file.
sub target_dir {
  my $self = shift;
  my @components = ('blast', $self->param('type'));
  return $self->get_dir(@components);
}
|
|
131
|
|
# Derives the database title from a source file: its file name with the
# release component removed.
#
# Example with release 68:
#   Homo_sapiens.GRCh37.68.dna.toplevel.fa -> Homo_sapiens.GRCh37.dna.toplevel.fa
#
# Params:  $source_file - path of the file being indexed; only its file name
#                         part is used.
# Returns: the file name without the first dot-delimited component equal to
#          the 'release' param (unchanged if there is no such component).
# Throws:  if the 'release' param is not set.
sub db_title {
  my ($self, $source_file) = @_;
  my ($vol, $dir, $file) = File::Spec->splitpath($source_file);
  my $release = $self->param('release');

  # An undefined release would reduce the pattern to a bare dot and silently
  # strip the first '.' from the name, so refuse to continue instead.
  throw("param 'release' must be set to build the database title") if ! defined $release;

  my $title = $file;
  # \Q..\E makes the release match literally. The look-behind only accepts a
  # match at the start of the name or directly after a dot, so the release
  # cannot match the tail of a longer component such as 'Sp68.'.
  $title =~ s/(?<![^.])\Q$release\E\.//;
  return $title;
}
|
|
140
|
|
# Returns the current UTC date formatted as DD-MM-YYYY, the value passed to
# the indexing command as the database date.
sub db_date {
  my ($self) = @_;
  my @utc_now = gmtime();
  return strftime('%d-%m-%Y', @utc_now);
}
|
|
145
|
|
# Works out the file name of the index built from a source file.
#
# For 'genomic' indexes the fourth-from-last dot-separated component of the
# name is replaced with the value of repeat_mask_date():
#   Source like   Homo_sapiens.GRCh37.68.dna.toplevel.fa
#   Filename like Homo_sapiens.GRCh37.20090401.dna.toplevel.fa
# For every other type the source file name is returned untouched.
#
# Params:  $source_file - path of the file being indexed; any volume and
#                         directory part is discarded.
# Returns: the file name to use for the index.
sub target_filename {
  my ($self, $source_file) = @_;
  my $basename = (File::Spec->splitpath($source_file))[2];

  return $basename if $self->param('type') ne 'genomic';

  my @parts = split(/\./, $basename);
  $parts[-4] = $self->repeat_mask_date();
  return join(q{.}, @parts);
}
|
|
159
|
|
160 1;
|