annotate variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/WuBlastIndexer.pm @ 3:d30fa12e4cc5 default tip

Merge heads 2:a5976b2dce6f and 1:09613ce8151e which were created as a result of a recently fixed bug.
author devteam <devteam@galaxyproject.org>
date Mon, 13 Jan 2014 10:38:30 -0500
parents 1f6dce3d34e0
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
1 =pod
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
2
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
3 =head1 LICENSE
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
4
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
5 Copyright (c) 1999-2012 The European Bioinformatics Institute and
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
6 Genome Research Limited. All rights reserved.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
7
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
8 This software is distributed under a modified Apache license.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
9 For license details, please see
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
10
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
11 http://www.ensembl.org/info/about/code_licence.html
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
12
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
13 =head1 CONTACT
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
14
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
15 Please email comments or questions to the public Ensembl
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
16 developers list at <dev@ensembl.org>.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
17
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
18 Questions may also be sent to the Ensembl help desk at
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
19 <helpdesk@ensembl.org>.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
20
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
21 =head1 NAME
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
22
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
23 Bio::EnsEMBL::Pipeline::FASTA::WuBlastIndexer
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
24
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
25 =head1 DESCRIPTION
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
26
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
27 Creates WUBlast indexes of the given GZipped file. The resulting index
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
28 is created under the parameter location I<base_path> in blast and then in a
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
29 directory defined by the type of dump. The type of dump also changes the file
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
30 name generated. Genomic dumps have their release number replaced with the
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
31 last repeat masked date.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
32
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
33 Allowed parameters are:
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
34
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
35 =over 8
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
36
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
37 =item file - The file to index
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
38
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
39 =item program - The location of the xdformat program
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
40
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
41 =item molecule - The type of molecule to index. I<dna> and I<pep> are allowed
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
42
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
43 =item type - Type of index we are creating. I<genomic> and I<genes> are allowed
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
44
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
45 =item base_path - The base of the dumps
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
46
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
47 =item release - Required for correct DB naming
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
48
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
49 =back
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
50
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
51 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
52
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
53 package Bio::EnsEMBL::Pipeline::FASTA::WuBlastIndexer;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
54
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
55 use strict;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
56 use warnings;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
57 use base qw/Bio::EnsEMBL::Pipeline::FASTA::Indexer/;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
58
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
59 use Bio::EnsEMBL::Utils::Exception qw/throw/;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
60 use File::Copy qw/copy/;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
61 use File::Spec;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
62 use POSIX qw/strftime/;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
63
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Default parameter values for this runnable.
# Only 'program' is given a default here. 'molecule' (pep or dna) and
# 'type' (genes or genomic) have no default, so they have to be supplied
# by the pipeline; fetch_input() rejects any other value.
# Returns a reference to a freshly built hash on every call.
sub param_defaults {
  my ($self) = @_;
  my %defaults = (
    program => 'xdformat',
  );
  return \%defaults;
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
72
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Validates the job parameters before any indexing is attempted.
# Throws when 'molecule' is neither 'dna' nor 'pep', or when 'type' is
# neither 'genomic' nor 'genes'. Afterwards assert_executable() is asked to
# check the configured indexing program and gunzip.
sub fetch_input {
  my ($self) = @_;

  my $molecule = $self->param('molecule');
  throw("param 'molecule' must be set to 'dna' or 'pep'")
    if $molecule ne 'dna' && $molecule ne 'pep';

  my $index_type = $self->param('type');
  throw("param 'type' must be set to 'genomic' or 'genes'")
    if $index_type ne 'genomic' && $index_type ne 'genes';

  $self->assert_executable($self->param('program'));
  # The result of the last check is handed back to the caller unchanged.
  return $self->assert_executable('gunzip');
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
86
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Flows a description of the finished index to the next analysis.
# The output id carries the 'species', 'type', 'molecule' and 'index_base'
# parameters (index_base is the value stored by index_file()) and is passed
# to dataflow_output_id() with 1 as its second argument.
# Returns nothing.
sub write_output {
  my ($self) = @_;
  my %index_details =
    map { $_ => $self->param($_) } qw/species type molecule index_base/;
  $self->dataflow_output_id(\%index_details, 1);
  return;
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
97
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Builds a WU-BLAST index of $file with the configured 'program'
# (xdformat by default), then removes $file and records the index location
# in the 'index_base' parameter.
#
# Arguments : $file - path of the FASTA file to index
# Returns   : nothing
# Throws    : if the command cannot be started, is killed by a signal or
#             exits with a non-zero code; also if $file cannot be removed.
sub index_file {
  my ($self, $file) = @_;

  my $molecule_arg = ($self->param('molecule') eq 'dna') ? '-n' : '-p';
  my $silence      = ($self->debug()) ? 0 : 1;
  my $target_dir   = $self->target_dir();
  my $target_file  = $self->target_file($file);
  my $db_title     = $self->db_title($file);
  my $date         = $self->db_date();

  # Wrap a value in single quotes for /bin/sh so that spaces or shell
  # metacharacters in a path or title cannot split or alter the command.
  my $quote = sub {
    my ($value) = @_;
    $value =~ s/'/'\\''/g;
    return "'$value'";
  };

  # 'program' is interpolated exactly as configured; NOTE(review): it is left
  # unquoted in case existing configurations pass options along with it.
  my $cmd = sprintf(q{cd %s && %s %s -q%d -I -t %s -d %s -o %s %s },
    $quote->($target_dir), $self->param('program'), $molecule_arg, $silence,
    $quote->($db_title), $quote->($date), $quote->($target_file), $quote->($file));

  $self->info('About to run "%s"', $cmd);
  my $output   = `$cmd 2>&1`;
  my $status   = $?;
  my $os_error = "$!";
  $output = q{} if ! defined $output;

  # Inspect the whole wait status. Looking only at ($? >> 8) reports success
  # when the child was killed by a signal, which would let the source file
  # be deleted although no usable index was written.
  if($status != 0) {
    my $reason;
    if($status == -1) {
      $reason = "Failed to execute: ${os_error}";
    }
    elsif($status & 127) {
      $reason = sprintf('Process died with signal %d', $status & 127);
    }
    else {
      $reason = sprintf('Return code was %d', $status >> 8);
    }
    throw("Cannot run program '$cmd'. ${reason}. Program output was $output");
  }

  unlink($file) or throw("Cannot remove the file '$file' from the filesystem: $!");
  $self->param('index_base', $target_file);
  return;
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
118
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Full path of the index to create for $file: the directory given by
# target_dir() joined with the name given by target_filename($file).
sub target_file {
  my ($self, $file) = @_;
  my @location = (
    scalar $self->target_dir(),
    scalar $self->target_filename($file),
  );
  return File::Spec->catfile(@location);
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
125
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Directory the index is written to: get_dir() called with 'blast' and the
# 'type' parameter. With the types accepted by fetch_input() that is
# presumably something like /nfs/path/to/blast/genes/XXX or
# /nfs/path/to/blast/genomic/XXX (get_dir() is inherited; confirm its layout).
sub target_dir {
  my ($self) = @_;
  my @dir_parts = ('blast', $self->param('type'));
  return $self->get_dir(@dir_parts);
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
131
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Title stored in the BLAST database: the source file's name with the
# release field removed, e.g. with release 68
#   Homo_sapiens.GRCh37.68.dna.toplevel.fa -> Homo_sapiens.GRCh37.dna.toplevel.fa
#
# Arguments : $source_file - path of the file being indexed
# Returns   : the file name (no directory) without the release field. The
#             name is returned unchanged when 'release' is undefined or
#             empty, or when no field equals the release.
sub db_title {
  my ($self, $source_file) = @_;
  my $title   = (File::Spec->splitpath($source_file))[2];
  my $release = $self->param('release');

  # Without a release there is nothing to strip; an empty pattern would
  # otherwise just delete the first dot of the name.
  if(!defined($release) || $release eq q{}) {
    return $title;
  }

  # Remove the first dot-delimited field that is exactly the release. The
  # release is matched literally (\Q..\E) and only at a field start, so
  # release 70 cannot eat the tail of a field such as "v70".
  $title =~ s{ (?: \A | (?<=[.]) ) \Q$release\E [.] }{}x;
  return $title;
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
140
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Date stamp passed to the indexer: the current UTC date formatted as
# DD-MM-YYYY.
sub db_date {
  my ($self) = @_;
  my @utc_now = gmtime();
  return strftime('%d-%m-%Y', @utc_now);
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
145
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# Name of the index file derived from the source file's name.
#   Source like   Homo_sapiens.GRCh37.68.dna.toplevel.fa
#   Filename like Homo_sapiens.GRCh37.20090401.dna.toplevel.fa
# For the 'genomic' type the fourth field from the end (the release in the
# example above) is replaced by the value of repeat_mask_date(). Every
# other type keeps the source file's name as it is.
sub target_filename {
  my ($self, $source_file) = @_;
  my $basename = (File::Spec->splitpath($source_file))[2];
  return $basename unless $self->param('type') eq 'genomic';

  my @fields    = split(/\./, $basename);
  my $masked_on = $self->repeat_mask_date();
  # Counted from the end, so leading fields such as species and assembly
  # may vary in number.
  $fields[-4] = $masked_on;
  return join(q{.}, @fields);
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
159
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
160 1;