MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: InfoSequenceFiles.pl,v $
   4 # $Date: 2015/02/28 20:46:20 $
   5 # $Revision: 1.29 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2015 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use Benchmark;
  35 use FileUtil;
  36 use TextUtil;
  37 use SequenceFileUtil;
  38 use StatisticsUtil;
  39 
  40 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  41 
  42 # Autoflush STDOUT
  43 $| = 1;
  44 
  45 # Starting message...
  46 $ScriptName = basename($0);
  47 print "\n$ScriptName: Starting...\n\n";
  48 $StartTime = new Benchmark;
  49 
  50 # Get the options and setup script...
  51 SetupScriptUsage();
  52 if ($Options{help} || @ARGV < 1) {
  53   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  54 }
  55 
  56 my(@SequenceFilesList);
  57 @SequenceFilesList = ExpandFileNames(\@ARGV, "aln msf fasta fta pir");
  58 
  59 print "Processing options...\n";
  60 my(%OptionsInfo);
  61 ProcessOptions();
  62 
  63 print "Checking input sequence file(s)...\n";
  64 my(%SequenceFilesInfo);
  65 RetrieveSequenceFilesInfo();
  66 
  67 my($FileIndex);
  68 if (@SequenceFilesList > 1) {
  69   print "\nProcessing sequence files...\n";
  70 }
  71 for $FileIndex (0 .. $#SequenceFilesList) {
  72   if ($SequenceFilesInfo{FileOkay}[$FileIndex]) {
  73     print "\nProcessing file $SequenceFilesList[$FileIndex]...\n";
  74     ListSequenceFileInfo($FileIndex);
  75   }
  76 }
  77 ListTotalSizeOfFiles();
  78 
  79 print "\n$ScriptName:Done...\n\n";
  80 
  81 $EndTime = new Benchmark;
  82 $TotalTime = timediff ($EndTime, $StartTime);
  83 print "Total time: ", timestr($TotalTime), "\n";
  84 
  85 ###############################################################################
  86 
  87 # List appropriate information...
  88 sub ListSequenceFileInfo {
  89   my($Index) = @_;
  90   my($SequenceFile, $SequenceDataRef);
  91 
  92   $SequenceFile = $SequenceFilesList[$Index];
  93 
  94   $SequenceDataRef = ReadSequenceFile($SequenceFile);
  95 
  96   my($SequencesCount) = $SequenceDataRef->{Count};
  97   print "\nNumber of sequences: $SequencesCount\n";
  98 
  99   if ($OptionsInfo{ListShortestSequence} && ($SequencesCount > 1)) {
 100     my($ShortestSeqID, $ShortestSeq, $ShortestSeqLen, $Description) = GetShortestSequence($SequenceDataRef, $OptionsInfo{IgnoreGaps});
 101     print "\nShortest sequence information:\nID: $ShortestSeqID; Length:$ShortestSeqLen\n";
 102     if ($OptionsInfo{DetailLevel} >= 2) {
 103       print "Description: $Description\n";
 104     }
 105     if ($OptionsInfo{DetailLevel} >= 3) {
 106       print "Sequence: $ShortestSeq\n";
 107     }
 108   }
 109   if ($OptionsInfo{ListLongestSequence} && ($SequencesCount > 1)) {
 110     my($LongestSeqID, $LongestSeq, $LongestSeqLen, $Description) = GetLongestSequence($SequenceDataRef, $OptionsInfo{IgnoreGaps});
 111     print "\nLongest sequence information:\nID: $LongestSeqID; Length: $LongestSeqLen\n";
 112     if ($OptionsInfo{DetailLevel} >= 2) {
 113       print "Description: $Description\n";
 114     }
 115     if ($OptionsInfo{DetailLevel} >= 3) {
 116       print "Sequence: $LongestSeq\n";
 117     }
 118   }
 119   if ($OptionsInfo{FrequencyAnalysis} && ($SequencesCount > 1)) {
 120     PerformLengthFrequencyAnalysis($SequenceDataRef);
 121   }
 122   if ($OptionsInfo{ListSequenceLengths}) {
 123     ListSequenceLengths($SequenceDataRef);
 124   }
 125 
 126   # File size and modification information...
 127   print "\nFile size: ", FormatFileSize($SequenceFilesInfo{FileSize}[$Index]), " \n";
 128   print "Last modified: ", $SequenceFilesInfo{FileLastModified}[$Index], " \n";
 129 }
 130 
 131 # List information about sequence lengths...
 132 sub ListSequenceLengths {
 133   my($SequenceDataRef) = @_;
 134   my($ID, $SeqLen, $Sequence, $Description);
 135 
 136   print "\nSequence lengths information:\n";
 137   for $ID (@{$SequenceDataRef->{IDs}}) {
 138     $Sequence = $SequenceDataRef->{Sequence}{$ID};
 139     $Description = $SequenceDataRef->{Description}{$ID};
 140     $SeqLen = GetSequenceLength($Sequence, $OptionsInfo{IgnoreGaps});
 141     if ($OptionsInfo{IgnoreGaps}) {
 142       $Sequence = RemoveSequenceGaps($Sequence);
 143     }
 144     print "ID: $ID; Length:$SeqLen\n";
 145     if ($OptionsInfo{DetailLevel} >= 2) {
 146       print "Description: $Description\n";
 147     }
 148     if ($OptionsInfo{DetailLevel} >= 3) {
 149       print "Sequence: $Sequence\n";
 150     }
 151     if ($OptionsInfo{DetailLevel} >= 2) {
 152       print "\n";
 153     }
 154   }
 155 }
 156 
 157 # Total size of all the fiels...
 158 sub ListTotalSizeOfFiles {
 159   my($FileOkayCount, $TotalSize, $Index);
 160 
 161   $FileOkayCount = 0;
 162   $TotalSize = 0;
 163 
 164   for $Index (0 .. $#SequenceFilesList) {
 165     if ($SequenceFilesInfo{FileOkay}[$Index]) {
 166       $FileOkayCount++;
 167       $TotalSize += $SequenceFilesInfo{FileSize}[$Index];
 168     }
 169   }
 170   if ($FileOkayCount > 1) {
 171     print "\nTotal size of $FileOkayCount files: ", FormatFileSize($TotalSize), "\n";
 172   }
 173 }
 174 
 175 
 176 # Perform frequency analysis of sequence lengths
 177 sub PerformLengthFrequencyAnalysis {
 178   my($SequenceDataRef, $SequenceLengthsRef) = @_;
 179   my ($ID, $SeqLen, $Sequence, $SequenceLenBin, $LenBin, $SequenceLenCount, @SequenceLengths, %SequenceLenFrequency);
 180 
 181   @SequenceLengths = ();
 182   %SequenceLenFrequency = ();
 183   for $ID (@{$SequenceDataRef->{IDs}}) {
 184     $Sequence = $SequenceDataRef->{Sequence}{$ID};
 185     $SeqLen = GetSequenceLength($Sequence, $OptionsInfo{IgnoreGaps});
 186     push @SequenceLengths, $SeqLen;
 187   }
 188   if (@{$OptionsInfo{BinRange}}) {
 189     %SequenceLenFrequency = Frequency(\@SequenceLengths, \@{$OptionsInfo{BinRange}});
 190   }
 191   else {
 192     %SequenceLenFrequency = Frequency(\@SequenceLengths, $OptionsInfo{NumOfBins});
 193   }
 194   print "\nDistribution of sequence lengths (LengthBin => Count):\n";
 195   for $SequenceLenBin (sort { $a <=> $b} keys %SequenceLenFrequency) {
 196     $SequenceLenCount = $SequenceLenFrequency{$SequenceLenBin};
 197     $LenBin = sprintf("%.1f", $SequenceLenBin) + 0;
 198     print "$LenBin => $SequenceLenCount; ";
 199   }
 200   print "\n";
 201 }
 202 
 203 # Retrieve information about sequence files...
 204 sub RetrieveSequenceFilesInfo {
 205   my($Index, $SequenceFile, $FileSupported, $FileFormat, $ModifiedTimeString, $ModifiedDateString);
 206 
 207   %SequenceFilesInfo = ();
 208   @{$SequenceFilesInfo{FileOkay}} = ();
 209   @{$SequenceFilesInfo{FileFormat}} = ();
 210   @{$SequenceFilesInfo{FileSize}} = ();
 211   @{$SequenceFilesInfo{FileLastModified}} = ();
 212 
 213   FILELIST: for $Index (0 .. $#SequenceFilesList) {
 214     $SequenceFile = $SequenceFilesList[$Index];
 215 
 216     if (! open SEQUENCEFILE, "$SequenceFile") {
 217       warn "Warning: Ignoring file $SequenceFile: Couldn't open it: $! \n";
 218       next FILELIST;
 219     }
 220     close SEQUENCEFILE;
 221 
 222     $SequenceFilesInfo{FileOkay}[$Index] = 0;
 223     $SequenceFilesInfo{FileFormat}[$Index] = 'NotSupported';
 224     $SequenceFilesInfo{FileSize}[$Index] = 0;
 225     $SequenceFilesInfo{FileLastModified}[$Index] = '';
 226 
 227     ($FileSupported, $FileFormat) = IsSupportedSequenceFile($SequenceFile);
 228     if (!$FileSupported) {
 229       warn "Warning: Ignoring file $SequenceFile: Sequence file format is not supported.\n";
 230       next FILELIST;
 231     }
 232 
 233     $SequenceFilesInfo{FileOkay}[$Index] = 1;
 234     $SequenceFilesInfo{FileFormat}[$Index] = $FileFormat;
 235     $SequenceFilesInfo{FileSize}[$Index] = FileSize($SequenceFile);
 236 
 237     ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($SequenceFile);
 238     $SequenceFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString";
 239   }
 240 }
 241 
 242 # Process option values...
 243 sub ProcessOptions {
 244 
 245   $OptionsInfo{All} = defined $Options{all} ? $Options{all} : undef;
 246 
 247   $OptionsInfo{Count} = defined $Options{count} ? $Options{count} : undef;
 248   $OptionsInfo{DetailLevel} = $Options{detail};
 249   $OptionsInfo{Frequency} = defined $Options{frequency} ? $Options{frequency} : undef;
 250   $OptionsInfo{FrequencyBins} = defined $Options{frequencybins} ? $Options{frequencybins} : undef;
 251   $OptionsInfo{IgnoreGaps} = defined $Options{ignoregaps} ? $Options{ignoregaps} : undef;
 252   $OptionsInfo{Longest} = defined $Options{longest} ? $Options{longest} : undef;
 253   $OptionsInfo{Shortest} = defined $Options{shortest} ? $Options{shortest} : undef;
 254   $OptionsInfo{SequenceLengths} = defined $Options{sequencelengths} ? $Options{sequencelengths} : undef;
 255 
 256   $OptionsInfo{FrequencyAnalysis} = ($Options{all} || $Options{frequency}) ? 1 : 0;
 257   $OptionsInfo{ListLongestSequence} = ($Options{all} || $Options{longest}) ? 1 : 0;
 258   $OptionsInfo{ListShortestSequence} = ($Options{all} || $Options{shortest}) ? 1 : 0;
 259   $OptionsInfo{ListSequenceLengths} = ($Options{all} || $Options{sequencelengths}) ? 1 : 0;
 260   $OptionsInfo{IgnoreGaps} = ($Options{ignoregaps} =~ /Yes/i) ? 1 : 0;
 261 
 262   # Setup frequency bin values...
 263   $OptionsInfo{NumOfBins} = 4;
 264   @{$OptionsInfo{BinRange}} = ();
 265 
 266   if ($Options{frequencybins} =~ /\,/) {
 267     my($BinValue, @SpecifiedBinRange);
 268     @SpecifiedBinRange = split /\,/,  $Options{frequencybins};
 269     if (@SpecifiedBinRange < 2) {
 270       die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain at least two values. \n";
 271     }
 272     for $BinValue (@SpecifiedBinRange) {
 273       if (!IsNumerical($BinValue)) {
 274         die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Contains non numeric values. \n";
 275       }
 276     }
 277     my($Index1, $Index2);
 278     for $Index1 (0 .. $#SpecifiedBinRange) {
 279       for $Index2 (($Index1 + 1) .. $#SpecifiedBinRange) {
 280         if ($SpecifiedBinRange[$Index1] >= $SpecifiedBinRange[$Index2]) {
 281           die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain values in ascending order. \n";
 282         }
 283       }
 284     }
 285     push @{$OptionsInfo{BinRange}}, @SpecifiedBinRange;
 286   }
 287   else {
 288     $OptionsInfo{NumOfBins} = $Options{frequencybins};
 289     if (!IsPositiveInteger($OptionsInfo{NumOfBins})) {
 290       die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid. Allowed values: positive integer or \"number,number,[number]...\". \n";
 291     }
 292   }
 293 }
 294 
 295 # Setup script usage  and retrieve command line arguments specified using various options...
 296 sub SetupScriptUsage {
 297 
 298   # Retrieve all the options...
 299   %Options = ();
 300   $Options{detail} = 1;
 301   $Options{ignoregaps} = 'no';
 302   $Options{frequencybins} = 10;
 303 
 304   if (!GetOptions(\%Options, "all|a", "count|c", "detail|d=i", "frequency|f", "frequencybins=s", "help|h", "ignoregaps|i=s", "longest|l", "shortest|s", "sequencelengths", "workingdir|w=s")) {
 305     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 306   }
 307   if ($Options{workingdir}) {
 308     if (! -d $Options{workingdir}) {
 309       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 310     }
 311     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 312   }
 313   if (!IsPositiveInteger($Options{detail})) {
 314     die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
 315   }
 316   if ($Options{ignoregaps} !~ /^(yes|no)$/i) {
 317     die "Error: The value specified, $Options{ignoregaps}, for option \"-i --IgnoreGaps\" is not valid. Allowed values: yes or no\n";
 318   }
 319 }
 320