1 #!/usr/bin/perl -w 2 # 3 # $RCSfile: InfoSequenceFiles.pl,v $ 4 # $Date: 2015/02/28 20:46:20 $ 5 # $Revision: 1.29 $ 6 # 7 # Author: Manish Sud <msud@san.rr.com> 8 # 9 # Copyright (C) 2015 Manish Sud. All rights reserved. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 use strict; 30 use FindBin; use lib "$FindBin::Bin/../lib"; 31 use Getopt::Long; 32 use File::Basename; 33 use Text::ParseWords; 34 use Benchmark; 35 use FileUtil; 36 use TextUtil; 37 use SequenceFileUtil; 38 use StatisticsUtil; 39 40 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime); 41 42 # Autoflush STDOUT 43 $| = 1; 44 45 # Starting message... 46 $ScriptName = basename($0); 47 print "\n$ScriptName: Starting...\n\n"; 48 $StartTime = new Benchmark; 49 50 # Get the options and setup script... 51 SetupScriptUsage(); 52 if ($Options{help} || @ARGV < 1) { 53 die GetUsageFromPod("$FindBin::Bin/$ScriptName"); 54 } 55 56 my(@SequenceFilesList); 57 @SequenceFilesList = ExpandFileNames(\@ARGV, "aln msf fasta fta pir"); 58 59 print "Processing options...\n"; 60 my(%OptionsInfo); 61 ProcessOptions(); 62 63 print "Checking input sequence file(s)...\n"; 64 my(%SequenceFilesInfo); 65 RetrieveSequenceFilesInfo(); 66 67 my($FileIndex); 68 if (@SequenceFilesList > 1) { 69 print "\nProcessing sequence files...\n"; 70 } 71 for $FileIndex (0 .. $#SequenceFilesList) { 72 if ($SequenceFilesInfo{FileOkay}[$FileIndex]) { 73 print "\nProcessing file $SequenceFilesList[$FileIndex]...\n"; 74 ListSequenceFileInfo($FileIndex); 75 } 76 } 77 ListTotalSizeOfFiles(); 78 79 print "\n$ScriptName:Done...\n\n"; 80 81 $EndTime = new Benchmark; 82 $TotalTime = timediff ($EndTime, $StartTime); 83 print "Total time: ", timestr($TotalTime), "\n"; 84 85 ############################################################################### 86 87 # List appropriate information... 88 sub ListSequenceFileInfo { 89 my($Index) = @_; 90 my($SequenceFile, $SequenceDataRef); 91 92 $SequenceFile = $SequenceFilesList[$Index]; 93 94 $SequenceDataRef = ReadSequenceFile($SequenceFile); 95 96 my($SequencesCount) = $SequenceDataRef->{Count}; 97 print "\nNumber of sequences: $SequencesCount\n"; 98 99 if ($OptionsInfo{ListShortestSequence} && ($SequencesCount > 1)) { 100 my($ShortestSeqID, $ShortestSeq, $ShortestSeqLen, $Description) = GetShortestSequence($SequenceDataRef, $OptionsInfo{IgnoreGaps}); 101 print "\nShortest sequence information:\nID: $ShortestSeqID; Length:$ShortestSeqLen\n"; 102 if ($OptionsInfo{DetailLevel} >= 2) { 103 print "Description: $Description\n"; 104 } 105 if ($OptionsInfo{DetailLevel} >= 3) { 106 print "Sequence: $ShortestSeq\n"; 107 } 108 } 109 if ($OptionsInfo{ListLongestSequence} && ($SequencesCount > 1)) { 110 my($LongestSeqID, $LongestSeq, $LongestSeqLen, $Description) = GetLongestSequence($SequenceDataRef, $OptionsInfo{IgnoreGaps}); 111 print "\nLongest sequence information:\nID: $LongestSeqID; Length: $LongestSeqLen\n"; 112 if ($OptionsInfo{DetailLevel} >= 2) { 113 print "Description: $Description\n"; 114 } 115 if ($OptionsInfo{DetailLevel} >= 3) { 116 print "Sequence: $LongestSeq\n"; 117 } 118 } 119 if ($OptionsInfo{FrequencyAnalysis} && ($SequencesCount > 1)) { 120 PerformLengthFrequencyAnalysis($SequenceDataRef); 121 } 122 if ($OptionsInfo{ListSequenceLengths}) { 123 ListSequenceLengths($SequenceDataRef); 124 } 125 126 # File size and modification information... 127 print "\nFile size: ", FormatFileSize($SequenceFilesInfo{FileSize}[$Index]), " \n"; 128 print "Last modified: ", $SequenceFilesInfo{FileLastModified}[$Index], " \n"; 129 } 130 131 # List information about sequence lengths... 132 sub ListSequenceLengths { 133 my($SequenceDataRef) = @_; 134 my($ID, $SeqLen, $Sequence, $Description); 135 136 print "\nSequence lengths information:\n"; 137 for $ID (@{$SequenceDataRef->{IDs}}) { 138 $Sequence = $SequenceDataRef->{Sequence}{$ID}; 139 $Description = $SequenceDataRef->{Description}{$ID}; 140 $SeqLen = GetSequenceLength($Sequence, $OptionsInfo{IgnoreGaps}); 141 if ($OptionsInfo{IgnoreGaps}) { 142 $Sequence = RemoveSequenceGaps($Sequence); 143 } 144 print "ID: $ID; Length:$SeqLen\n"; 145 if ($OptionsInfo{DetailLevel} >= 2) { 146 print "Description: $Description\n"; 147 } 148 if ($OptionsInfo{DetailLevel} >= 3) { 149 print "Sequence: $Sequence\n"; 150 } 151 if ($OptionsInfo{DetailLevel} >= 2) { 152 print "\n"; 153 } 154 } 155 } 156 157 # Total size of all the fiels... 158 sub ListTotalSizeOfFiles { 159 my($FileOkayCount, $TotalSize, $Index); 160 161 $FileOkayCount = 0; 162 $TotalSize = 0; 163 164 for $Index (0 .. $#SequenceFilesList) { 165 if ($SequenceFilesInfo{FileOkay}[$Index]) { 166 $FileOkayCount++; 167 $TotalSize += $SequenceFilesInfo{FileSize}[$Index]; 168 } 169 } 170 if ($FileOkayCount > 1) { 171 print "\nTotal size of $FileOkayCount files: ", FormatFileSize($TotalSize), "\n"; 172 } 173 } 174 175 176 # Perform frequency analysis of sequence lengths 177 sub PerformLengthFrequencyAnalysis { 178 my($SequenceDataRef, $SequenceLengthsRef) = @_; 179 my ($ID, $SeqLen, $Sequence, $SequenceLenBin, $LenBin, $SequenceLenCount, @SequenceLengths, %SequenceLenFrequency); 180 181 @SequenceLengths = (); 182 %SequenceLenFrequency = (); 183 for $ID (@{$SequenceDataRef->{IDs}}) { 184 $Sequence = $SequenceDataRef->{Sequence}{$ID}; 185 $SeqLen = GetSequenceLength($Sequence, $OptionsInfo{IgnoreGaps}); 186 push @SequenceLengths, $SeqLen; 187 } 188 if (@{$OptionsInfo{BinRange}}) { 189 %SequenceLenFrequency = Frequency(\@SequenceLengths, \@{$OptionsInfo{BinRange}}); 190 } 191 else { 192 %SequenceLenFrequency = Frequency(\@SequenceLengths, $OptionsInfo{NumOfBins}); 193 } 194 print "\nDistribution of sequence lengths (LengthBin => Count):\n"; 195 for $SequenceLenBin (sort { $a <=> $b} keys %SequenceLenFrequency) { 196 $SequenceLenCount = $SequenceLenFrequency{$SequenceLenBin}; 197 $LenBin = sprintf("%.1f", $SequenceLenBin) + 0; 198 print "$LenBin => $SequenceLenCount; "; 199 } 200 print "\n"; 201 } 202 203 # Retrieve information about sequence files... 204 sub RetrieveSequenceFilesInfo { 205 my($Index, $SequenceFile, $FileSupported, $FileFormat, $ModifiedTimeString, $ModifiedDateString); 206 207 %SequenceFilesInfo = (); 208 @{$SequenceFilesInfo{FileOkay}} = (); 209 @{$SequenceFilesInfo{FileFormat}} = (); 210 @{$SequenceFilesInfo{FileSize}} = (); 211 @{$SequenceFilesInfo{FileLastModified}} = (); 212 213 FILELIST: for $Index (0 .. $#SequenceFilesList) { 214 $SequenceFile = $SequenceFilesList[$Index]; 215 216 if (! open SEQUENCEFILE, "$SequenceFile") { 217 warn "Warning: Ignoring file $SequenceFile: Couldn't open it: $! \n"; 218 next FILELIST; 219 } 220 close SEQUENCEFILE; 221 222 $SequenceFilesInfo{FileOkay}[$Index] = 0; 223 $SequenceFilesInfo{FileFormat}[$Index] = 'NotSupported'; 224 $SequenceFilesInfo{FileSize}[$Index] = 0; 225 $SequenceFilesInfo{FileLastModified}[$Index] = ''; 226 227 ($FileSupported, $FileFormat) = IsSupportedSequenceFile($SequenceFile); 228 if (!$FileSupported) { 229 warn "Warning: Ignoring file $SequenceFile: Sequence file format is not supported.\n"; 230 next FILELIST; 231 } 232 233 $SequenceFilesInfo{FileOkay}[$Index] = 1; 234 $SequenceFilesInfo{FileFormat}[$Index] = $FileFormat; 235 $SequenceFilesInfo{FileSize}[$Index] = FileSize($SequenceFile); 236 237 ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($SequenceFile); 238 $SequenceFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString"; 239 } 240 } 241 242 # Process option values... 243 sub ProcessOptions { 244 245 $OptionsInfo{All} = defined $Options{all} ? $Options{all} : undef; 246 247 $OptionsInfo{Count} = defined $Options{count} ? $Options{count} : undef; 248 $OptionsInfo{DetailLevel} = $Options{detail}; 249 $OptionsInfo{Frequency} = defined $Options{frequency} ? $Options{frequency} : undef; 250 $OptionsInfo{FrequencyBins} = defined $Options{frequencybins} ? $Options{frequencybins} : undef; 251 $OptionsInfo{IgnoreGaps} = defined $Options{ignoregaps} ? $Options{ignoregaps} : undef; 252 $OptionsInfo{Longest} = defined $Options{longest} ? $Options{longest} : undef; 253 $OptionsInfo{Shortest} = defined $Options{shortest} ? $Options{shortest} : undef; 254 $OptionsInfo{SequenceLengths} = defined $Options{sequencelengths} ? $Options{sequencelengths} : undef; 255 256 $OptionsInfo{FrequencyAnalysis} = ($Options{all} || $Options{frequency}) ? 1 : 0; 257 $OptionsInfo{ListLongestSequence} = ($Options{all} || $Options{longest}) ? 1 : 0; 258 $OptionsInfo{ListShortestSequence} = ($Options{all} || $Options{shortest}) ? 1 : 0; 259 $OptionsInfo{ListSequenceLengths} = ($Options{all} || $Options{sequencelengths}) ? 1 : 0; 260 $OptionsInfo{IgnoreGaps} = ($Options{ignoregaps} =~ /Yes/i) ? 1 : 0; 261 262 # Setup frequency bin values... 263 $OptionsInfo{NumOfBins} = 4; 264 @{$OptionsInfo{BinRange}} = (); 265 266 if ($Options{frequencybins} =~ /\,/) { 267 my($BinValue, @SpecifiedBinRange); 268 @SpecifiedBinRange = split /\,/, $Options{frequencybins}; 269 if (@SpecifiedBinRange < 2) { 270 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain at least two values. \n"; 271 } 272 for $BinValue (@SpecifiedBinRange) { 273 if (!IsNumerical($BinValue)) { 274 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Contains non numeric values. \n"; 275 } 276 } 277 my($Index1, $Index2); 278 for $Index1 (0 .. $#SpecifiedBinRange) { 279 for $Index2 (($Index1 + 1) .. $#SpecifiedBinRange) { 280 if ($SpecifiedBinRange[$Index1] >= $SpecifiedBinRange[$Index2]) { 281 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain values in ascending order. \n"; 282 } 283 } 284 } 285 push @{$OptionsInfo{BinRange}}, @SpecifiedBinRange; 286 } 287 else { 288 $OptionsInfo{NumOfBins} = $Options{frequencybins}; 289 if (!IsPositiveInteger($OptionsInfo{NumOfBins})) { 290 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid. Allowed values: positive integer or \"number,number,[number]...\". \n"; 291 } 292 } 293 } 294 295 # Setup script usage and retrieve command line arguments specified using various options... 296 sub SetupScriptUsage { 297 298 # Retrieve all the options... 299 %Options = (); 300 $Options{detail} = 1; 301 $Options{ignoregaps} = 'no'; 302 $Options{frequencybins} = 10; 303 304 if (!GetOptions(\%Options, "all|a", "count|c", "detail|d=i", "frequency|f", "frequencybins=s", "help|h", "ignoregaps|i=s", "longest|l", "shortest|s", "sequencelengths", "workingdir|w=s")) { 305 die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n"; 306 } 307 if ($Options{workingdir}) { 308 if (! -d $Options{workingdir}) { 309 die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n"; 310 } 311 chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n"; 312 } 313 if (!IsPositiveInteger($Options{detail})) { 314 die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n"; 315 } 316 if ($Options{ignoregaps} !~ /^(yes|no)$/i) { 317 die "Error: The value specified, $Options{ignoregaps}, for option \"-i --IgnoreGaps\" is not valid. Allowed values: yes or no\n"; 318 } 319 } 320