0
|
1 =pod
|
|
2
|
|
3 =head1 LICENSE
|
|
4
|
|
5 Copyright (c) 1999-2012 The European Bioinformatics Institute and
|
|
6 Genome Research Limited. All rights reserved.
|
|
7
|
|
8 This software is distributed under a modified Apache license.
|
|
9 For license details, please see
|
|
10
|
|
11 http://www.ensembl.org/info/about/code_licence.html
|
|
12
|
|
13 =head1 CONTACT
|
|
14
|
|
15 Please email comments or questions to the public Ensembl
|
|
16 developers list at <dev@ensembl.org>.
|
|
17
|
|
18 Questions may also be sent to the Ensembl help desk at
|
|
19 <helpdesk@ensembl.org>.
|
|
20
|
|
21 =head1 NAME
|
|
22
|
|
23 Bio::EnsEMBL::Pipeline::FASTA::ConcatFiles
|
|
24
|
|
25 =head1 DESCRIPTION
|
|
26
|
|
27 Performs a find in the DNA dumps directory for the given species and then
|
|
28 concats files which match a specified name pattern. We only allow
|
|
29 three types of concats; DNA, RM DNA and SM DNA. The concat file is a series
|
|
30 of cat command calls from all other Gzipped FASTA dumps (allowed under
|
|
31 the GZip specification).
|
|
32
|
|
33 Allowed parameters are:
|
|
34
|
|
35 =over 8
|
|
36
|
|
37 =item release - Needed to build the target path
|
|
38
|
|
39 =item species - Required to indicate which species we are working with
|
|
40
|
|
41 =item data_type - The type of data to work with. Can be I<dna>, I<dna_sm> or I<dna_rm>
|
|
42
|
|
43 =item base_path - The base of the dumps
|
|
44
|
|
45 =back
|
|
46
|
|
47 =cut
|
|
48
|
|
49 package Bio::EnsEMBL::Pipeline::FASTA::ConcatFiles;
|
|
50
|
|
51 use strict;
|
|
52 use warnings;
|
|
53 use base qw/Bio::EnsEMBL::Pipeline::FASTA::Base/;
|
|
54
|
|
55 use File::Spec;
|
|
56 use File::stat;
|
|
57
|
|
# Default parameters for this module.
#
# Returns a hash reference with one entry per supported data_type (dna,
# dna_rm, dna_sm). Each entry is itself a hash whose 'regex' value is the
# pattern a dump file name has to match to belong to that data_type.
sub param_defaults {
    my ($self) = @_;

    my %defaults = (
        dna    => { regex => qr/.+\.dna\..+\.fa\.gz$/ },
        dna_rm => { regex => qr/.+\.dna_rm\..+\.fa\.gz$/ },
        dna_sm => { regex => qr/.+\.dna_sm\..+\.fa\.gz$/ },
    );

    return \%defaults;
}
|
|
72
|
|
# Validates the job's input before any work is done.
#
# Throws (via $self->throw) for the first of data_type, species, release or
# base_path whose param value is false. Returns nothing.
sub fetch_input {
    my ($self) = @_;

    my @required = qw/data_type species release base_path/;
    for my $key (@required) {
        next if $self->param($key);
        $self->throw("Cannot find the required parameter $key");
    }

    return;
}
|
|
80
|
|
# Sticks the matching dump files end to end into one big target file.
#
# The inputs come from get_dna_files() and the destination from target_file().
# An existing target is removed first because every input is appended (>>) to
# it. Each input is appended with the external "cat" command; the byte sizes
# of the inputs are summed and compared with the size of the finished target
# as a sanity check. On success the target path is stored in the
# 'target_file' param.
#
# Throws if no input files were found, if an existing target cannot be
# removed, if a file cannot be stat-ed, if cat returns a non-zero status or
# if the final size differs from the sum of the input sizes.
sub run {
    my ($self) = @_;

    my @file_list = @{$self->get_dna_files()};
    my $count = scalar(@file_list);
    my $running_total_size = 0;

    # Paths are handed to /bin/sh, so wrap each one in single quotes (closing
    # and re-opening the quotes around any embedded single quote). Without
    # this a path containing whitespace or shell metacharacters is split or
    # interpreted by the shell.
    my $shell_quote = sub {
        my ($path) = @_;
        (my $quoted = $path) =~ s/'/'\\''/g;
        return "'$quoted'";
    };

    if($count) {
        my $target_file = $self->target_file();
        $self->info("Concatting type %s with %d file(s) into %s", $self->param('data_type'), $count, $target_file);

        if(-f $target_file) {
            $self->info("Target already exists. Removing");
            unlink $target_file or $self->throw("Could not remove $target_file: $!");
        }

        $self->info('Running concat');
        foreach my $file (@file_list) {
            $self->fine('Processing %s', $file);

            # File::stat's stat() returns undef on failure; check it rather
            # than calling ->size on an undefined value.
            my $file_stat = stat($file)
                or $self->throw("Could not stat $file: $!");
            $running_total_size += $file_stat->size;

            my $command = sprintf('cat %s >> %s', $shell_quote->($file), $shell_quote->($target_file));
            system($command)
                and $self->throw( sprintf('Cannot concat %s into %s. RC %d', $file, $target_file, ($?>>8)));
        }

        $self->info("Catted files together");

        my $target_stat = stat($target_file)
            or $self->throw("Could not stat $target_file: $!");
        my $catted_size = $target_stat->size;

        if($running_total_size != $catted_size) {
            $self->throw(sprintf('The total size of the files catted together should be %d but was in fact %d. Failing as we expect the catted size to be the same', $running_total_size, $catted_size));
        }

        $self->param('target_file', $target_file);
    }
    else {
        $self->throw("Cannot continue as we found no files to concat");
    }
    return;
}
|
|
121
|
|
# Flows the concatenated file on to the next step.
#
# When the 'target_file' param holds a true value, a hash with that file and
# the 'species' param is passed to dataflow_output_id() on branch 1. When it
# does not, nothing is flowed. Returns nothing either way.
sub write_output {
    my ($self) = @_;

    my $concat_path = $self->param('target_file');
    return unless $concat_path;

    my %output_id = (
        file    => $concat_path,
        species => $self->param('species'),
    );
    $self->dataflow_output_id(\%output_id, 1);

    return;
}
|
|
130
|
|
# Finds the dump files that should be concatenated for the current data_type.
#
# Looks up the regex registered under the param named after data_type, then
# asks find_files() to search fasta_path('dna') with a filter that accepts a
# name only if it matches that regex and does not contain ".toplevel.".
# Throws if there is no param entry for the data_type.
#
# Returns an array reference of the matching files in default (string) sort
# order.
sub get_dna_files {
    my ($self) = @_;

    my $dna_dir   = $self->fasta_path('dna');
    my $data_type = $self->param('data_type');
    my $lookup    = $self->param($data_type);
    unless($lookup) {
        $self->throw("We do not have an entry for the data_type $data_type in our regex lookup hash. Edit this module");
    }

    my $pattern = $lookup->{regex};
    my $wanted = sub {
        my ($candidate) = @_;
        # Names containing ".toplevel." are never inputs, whatever else they match
        return 0 if $candidate =~ /\.toplevel\./;
        return 0 unless $candidate =~ $pattern;
        return 1;
    };

    my $found = $self->find_files($dna_dir, $wanted);
    my @ordered = sort @{$found};
    return \@ordered;
}
|
|
147
|
|
148
|
|
# Builds the full path of the concatenated output file.
#
# The file name is the dot-joined sequence
#   <web_name>.<assembly>.<release param>.<data_type param>.toplevel.fa.gz
#   e.g. Homo_sapiens.GRCh37.64.dna_rm.toplevel.fa.gz
# and it is placed in the directory returned by fasta_path('dna').
sub target_file {
    my ($self) = @_;

    my $file_name = join(
        '.',
        $self->web_name(),
        $self->assembly(),
        $self->param('release'),
        $self->param('data_type'),
        'toplevel',
        'fa', 'gz',
    );

    return File::Spec->catfile( $self->fasta_path('dna'), $file_name );
}
|
|
165
|
|
166 1;
|