Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Pipeline/FASTA/ConcatFiles.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =pod | |
2 | |
3 =head1 LICENSE | |
4 | |
5 Copyright (c) 1999-2012 The European Bioinformatics Institute and | |
6 Genome Research Limited. All rights reserved. | |
7 | |
8 This software is distributed under a modified Apache license. | |
9 For license details, please see | |
10 | |
11 http://www.ensembl.org/info/about/code_licence.html | |
12 | |
13 =head1 CONTACT | |
14 | |
15 Please email comments or questions to the public Ensembl | |
16 developers list at <dev@ensembl.org>. | |
17 | |
18 Questions may also be sent to the Ensembl help desk at | |
19 <helpdesk@ensembl.org>. | |
20 | |
21 =head1 NAME | |
22 | |
23 Bio::EnsEMBL::Pipeline::FASTA::ConcatFiles | |
24 | |
25 =head1 DESCRIPTION | |
26 | |
27 Performs a find in the DNA dumps directory for the given species and then | |
28 concatenates files which match a specified name pattern. We only allow | |
29 three types of concats: DNA (I<dna>), RM DNA (I<dna_rm>) and SM DNA (I<dna_sm>). | |
30 The concat file is built by a series of cat command calls over all other Gzipped | |
31 FASTA dumps (concatenating Gzip files is allowed under the GZip specification). | |
32 | |
33 Allowed parameters are: | |
34 | |
35 =over 8 | |
36 | |
37 =item release - Needed to build the target path | |
38 | |
39 =item species - Required to indicate which species we are working with | |
40 | |
41 =item data_type - The type of data to work with. Can be I<dna>, I<dna_sm> or I<dna_rm> | |
42 | |
43 =item base_path - The base of the dumps | |
44 | |
45 =back | |
46 | |
47 =cut | |
48 | |
49 package Bio::EnsEMBL::Pipeline::FASTA::ConcatFiles; | |
50 | |
51 use strict; | |
52 use warnings; | |
53 use base qw/Bio::EnsEMBL::Pipeline::FASTA::Base/; | |
54 | |
55 use File::Spec; | |
56 use File::stat; | |
57 | |
# Default parameters for this module: one entry per supported data_type,
# each holding the compiled regex that a dump file name must match to be
# included in the concatenation for that type.
#
# Returns a hash reference of the form { <data_type> => { regex => qr/.../ } }.
sub param_defaults {
    my ($self) = @_;

    my %regex_for_type = (
        dna    => { regex => qr/.+\.dna\..+\.fa\.gz$/ },
        dna_rm => { regex => qr/.+\.dna_rm\..+\.fa\.gz$/ },
        dna_sm => { regex => qr/.+\.dna_sm\..+\.fa\.gz$/ },
    );

    return \%regex_for_type;
}
72 | |
# Validates that every parameter this module depends on has been given a
# true value; throws on the first one that is missing. Returns nothing.
sub fetch_input {
    my ($self) = @_;

    my @required = qw/data_type species release base_path/;
    for my $name (@required) {
        if (!$self->param($name)) {
            $self->throw("Cannot find the required parameter $name");
        }
    }

    return;
}
80 | |
# Sticks the ends of all matching dump files together into one big file.
#
# The list of inputs comes from get_dna_files() and the output location from
# target_file(). Any pre-existing target is removed first, then each input is
# appended to the target in list order. Afterwards the size of the target is
# compared with the summed sizes of the inputs; any difference is treated as
# a failed concatenation. On success the target path is stored in the
# 'target_file' parameter for write_output().
#
# Throws if no input files were found, if an existing target cannot be
# removed, if an input or the target cannot be stat'ed, if cat fails, or if
# the final size check does not add up.
sub run {
    my ($self) = @_;

    my @file_list          = @{ $self->get_dna_files() };
    my $count              = scalar(@file_list);
    my $running_total_size = 0;

    if ($count) {
        my $target_file = $self->target_file();
        $self->info("Concatting type %s with %d file(s) into %s", $self->param('data_type'), $count, $target_file);

        if (-f $target_file) {
            $self->info("Target already exists. Removing");
            unlink $target_file or $self->throw("Could not remove $target_file: $!");
        }

        $self->info('Running concat');
        foreach my $file (@file_list) {
            $self->fine('Processing %s', $file);

            # stat() yields undef on failure; report that rather than dying
            # with an opaque "can't call method on undefined value" error.
            my $file_stat = stat($file)
                or $self->throw("Cannot stat $file: $!");
            $running_total_size += $file_stat->size;

            # The paths are handed to the shell as positional parameters
            # instead of being interpolated into the command text, so names
            # containing whitespace or shell metacharacters are appended
            # verbatim and never interpreted by the shell.
            system('sh', '-c', 'cat -- "$1" >> "$2"', 'sh', $file, $target_file)
                and $self->throw(sprintf('Cannot concat %s into %s. RC %d', $file, $target_file, ($? >> 8)));
        }

        $self->info("Catted files together");

        my $target_stat = stat($target_file)
            or $self->throw("Cannot stat $target_file: $!");
        my $catted_size = $target_stat->size;

        if ($running_total_size != $catted_size) {
            $self->throw(sprintf('The total size of the files catted together should be %d but was in fact %d. Failing as we expect the catted size to be the same', $running_total_size, $catted_size));
        }

        $self->param('target_file', $target_file);
    }
    else {
        $self->throw("Cannot continue as we found no files to concat");
    }
    return;
}
121 | |
# Flows the concatenated file onwards. When run() recorded a 'target_file'
# parameter, emits { file, species } on branch 1; otherwise does nothing.
sub write_output {
    my ($self) = @_;

    my $target = $self->param('target_file');
    return unless $target;

    my %output_id = (
        file    => $target,
        species => $self->param('species'),
    );
    $self->dataflow_output_id(\%output_id, 1);

    return;
}
130 | |
# Collects the dump files to concatenate for the configured data_type.
#
# Looks up the regex registered under the data_type parameter, asks
# find_files() for everything under the 'dna' FASTA path that matches it,
# and skips any file whose name contains '.toplevel.'. Throws when there is
# no lookup entry for the data_type.
#
# Returns an array reference of the matching paths in sorted order.
sub get_dna_files {
    my ($self) = @_;

    my $dna_dir = $self->fasta_path('dna');
    my $type    = $self->param('data_type');
    my $lookup  = $self->param($type);
    $self->throw("We do not have an entry for the data_type $type in our regex lookup hash. Edit this module")
        unless $lookup;

    my $pattern = $lookup->{regex};
    my $wanted  = sub {
        my ($candidate) = @_;
        return 0 unless $candidate =~ $pattern;
        return 0 if $candidate =~ /\.toplevel\./;
        return 1;
    };

    my @matched = @{ $self->find_files($dna_dir, $wanted) };
    return [ sort @matched ];
}
147 | |
148 | |
# Builds the full path of the concatenated output file inside the 'dna'
# FASTA directory.
#
# File name format looks like:
#   <species>.<assembly>.<release>.<sequence type>.<id type>.<id>.fa.gz
#   e.g. Homo_sapiens.GRCh37.64.dna_rm.toplevel.fa.gz
sub target_file {
    my ($self) = @_;

    my $basename = join(
        '.',
        $self->web_name(),
        $self->assembly(),
        $self->param('release'),
        $self->param('data_type'),
        'toplevel',
        'fa',
        'gz',
    );

    my $dna_dir = $self->fasta_path('dna');
    return File::Spec->catfile($dna_dir, $basename);
}
165 | |
166 1; |