MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: AnalyzeSequenceFilesData.pl,v $
   4 # $Date: 2015/02/28 20:46:04 $
   5 # $Revision: 1.33 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2015 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use Benchmark;
  35 use FileUtil;
  36 use TextUtil;
  37 use SequenceFileUtil;
  38 use AminoAcids;
  39 use NucleicAcids;
  40 
  41 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  42 
  43 # Autoflush STDOUT
  44 $| = 1;
  45 
  46 # Starting message...
  47 $ScriptName = basename($0);
  48 print "\n$ScriptName: Starting...\n\n";
  49 $StartTime = new Benchmark;
  50 
  51 # Setup script usage message...
  52 SetupScriptUsage();
  53 if ($Options{help} || @ARGV < 1) {
  54   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  55 }
  56 
  57 # Expand wild card file names...
  58 my(@SequenceFilesList);
  59 @SequenceFilesList = ExpandFileNames(\@ARGV, "aln msf fasta fta pir");
  60 
  61 print "Processing options...\n";
  62 my(%OptionsInfo);
  63 ProcessOptions();
  64 
  65 # Set up information about input files...
  66 print "Checking input sequence file(s)...\n";
  67 my(%SequenceFilesInfo);
  68 RetrieveSequenceFilesInfo();
  69 SetupSequenceRegionsData();
  70 
  71 # Process input files..
  72 my($FileIndex);
  73 if (@SequenceFilesList > 1) {
  74   print "\nProcessing sequence files...\n";
  75 }
  76 for $FileIndex (0 .. $#SequenceFilesList) {
  77   if ($SequenceFilesInfo{FilesOkay}[$FileIndex]) {
  78     print "\nProcessing file $SequenceFilesList[$FileIndex]...\n";
  79     AnalyzeSequenceFileData($FileIndex);
  80   }
  81 }
  82 print "\n$ScriptName:Done...\n\n";
  83 
  84 $EndTime = new Benchmark;
  85 $TotalTime = timediff ($EndTime, $StartTime);
  86 print "Total time: ", timestr($TotalTime), "\n";
  87 
  88 ###############################################################################
  89 
  90 # Analyze sequence file...
  91 sub AnalyzeSequenceFileData {
  92   my($FileIndex) = @_;
  93   my($SequenceFile, $SequenceDataRef);
  94 
  95   $SequenceFile = $SequenceFilesList[$FileIndex];
  96 
  97   open SEQUENCEFILE, "$SequenceFile" or die "Error: Can't open $SequenceFile: $! \n";
  98   $SequenceDataRef = ReadSequenceFile($SequenceFile);
  99   close SEQUENCEFILE;
 100 
 101   if ($OptionsInfo{CalculatePercentIdentityMatrix}) {
 102     CalculatePercentIdentityMatrix($FileIndex, $SequenceDataRef);
 103   }
 104   if ($OptionsInfo{PerformResidueFrequencyAnalysis}) {
 105     PerformResidueFrequencyAnalysis($FileIndex, $SequenceDataRef);
 106   }
 107 }
 108 
 109 # Calculate percent identity matrix...
 110 sub CalculatePercentIdentityMatrix {
 111   my($FileIndex, $SequenceDataRef) = @_;
 112   my($PercentIdentity, $PercentIdentityMatrixFile, $PercentIdentityMatrixRef, $RowID, $ColID, $Line, @LineWords);
 113 
 114   $PercentIdentityMatrixFile = $SequenceFilesInfo{OutFileRoot}[$FileIndex] . 'PercentIdentityMatrix.' . $SequenceFilesInfo{OutFileExt}[$FileIndex];
 115   $PercentIdentityMatrixRef = CalculatePercentSequenceIdentityMatrix($SequenceDataRef, $OptionsInfo{IgnoreGaps}, $OptionsInfo{Precision});
 116 
 117   print "Generating percent identity matrix file $PercentIdentityMatrixFile...\n";
 118   open OUTFILE, ">$PercentIdentityMatrixFile" or die "Can't open $PercentIdentityMatrixFile: $!\n";
 119 
 120   # Write out column labels...
 121   @LineWords = ();
 122   push @LineWords, '';
 123   for $ColID (@{$PercentIdentityMatrixRef->{IDs}}) {
 124     push @LineWords, $ColID;
 125   }
 126   $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 127   print OUTFILE "$Line\n";
 128 
 129   # Write out rows...
 130   for $RowID (@{$PercentIdentityMatrixRef->{IDs}}) {
 131     @LineWords = ();
 132     push @LineWords, $RowID;
 133     for $ColID (@{$PercentIdentityMatrixRef->{IDs}}) {
 134       $PercentIdentity = $PercentIdentityMatrixRef->{PercentIdentity}{$RowID}{$ColID};
 135       push @LineWords, $PercentIdentity;
 136     }
 137     $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 138     print OUTFILE "$Line\n";
 139   }
 140   close OUTFILE;
 141 }
 142 
 143 # Perform frequency analysis...
 144 sub PerformResidueFrequencyAnalysis {
 145   my($FileIndex, $SequenceDataRef) = @_;
 146 
 147   CountResiduesInRegions($FileIndex, $SequenceDataRef);
 148   CalculatePercentResidueFrequencyInRegions($FileIndex, $SequenceDataRef);
 149   GeneratePercentResidueFrequencyOutFilesForRegions($FileIndex, $SequenceDataRef);
 150 }
 151 
 152 # Count residues...
 153 sub CountResiduesInRegions {
 154   my($FileIndex, $SequenceDataRef) = @_;
 155 
 156   # Setup rerfernce sequence data...
 157   my($RefereceSequenceID, $RefereceSequence);
 158   $RefereceSequenceID = $SequenceFilesInfo{RefereceSequenceID}[$FileIndex];
 159   $RefereceSequence = $SequenceFilesInfo{RefereceSequence}[$FileIndex];
 160 
 161   # Count residues...
 162   my($RegionID, $StartResNum, $EndResNum, $ResNum, $ResIndex, $ID, $Sequence, $Residue);
 163   for $RegionID (@{$SequenceFilesInfo{RegionsData}[$FileIndex]{RegionIDs}}) {
 164     $StartResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{StartResNum};
 165     $EndResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{EndResNum};
 166     RESNUM: for $ResNum ($StartResNum .. $EndResNum) {
 167       $ResIndex = $ResNum - 1;
 168       if ($OptionsInfo{IgnoreGaps} && $SequenceFilesInfo{RefereceSequenceResNums}[$FileIndex]{IsGap}{$ResNum}) {
 169         next RESNUM;
 170       }
 171       # Go over residues in column $ResNum in all the sequences...
 172       ID: for $ID (@{$SequenceDataRef->{IDs}}) {
 173         $Sequence = $SequenceDataRef->{Sequence}{$ID};
 174         $Residue = substr($Sequence, $ResIndex, 1);
 175         if (IsGapResidue($Residue)) {
 176           $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{Gap} += 1;
 177         }
 178         else {
 179           if (exists $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{$Residue}) {
 180             $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{$Residue} += 1;
 181           }
 182           else {
 183             # Internal error...
 184             print "Warning: Ignoring residue $Residue in sequence $ID during ResidueFrequencyAnalysis calculation: Unknown residue...\n";
 185           }
 186         }
 187       }
 188     }
 189   }
 190 }
 191 
 192 # Calculate percent frequency for various residues in the sequence regions...
 193 sub CalculatePercentResidueFrequencyInRegions {
 194   my($FileIndex, $SequenceDataRef) = @_;
 195   my($RegionID, $StartResNum, $EndResNum, $ResNum, $Residue, $Count, $PercentCount, $MaxResiduesCount, $Precision);
 196 
 197   $MaxResiduesCount = $SequenceDataRef->{Count};
 198   $Precision = $OptionsInfo{Precision};
 199   for $RegionID (@{$SequenceFilesInfo{RegionsData}[$FileIndex]{RegionIDs}}) {
 200     $StartResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{StartResNum};
 201     $EndResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{EndResNum};
 202     RESNUM: for $ResNum ($StartResNum .. $EndResNum) {
 203       if ($OptionsInfo{IgnoreGaps} && $SequenceFilesInfo{RefereceSequenceResNums}[$FileIndex]{IsGap}{$ResNum}) {
 204         next RESNUM;
 205       }
 206       for $Residue (keys %{$SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}}) {
 207         $Count = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{$Residue};
 208         $PercentCount = ($Count / $MaxResiduesCount) * 100;
 209         $PercentCount = sprintf("%.${Precision}f", $PercentCount) + 0;
 210         $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{PercentCount}{$ResNum}{$Residue} = $PercentCount;
 211       }
 212     }
 213   }
 214 }
 215 
 216 # Generate output files...
 217 sub GeneratePercentResidueFrequencyOutFilesForRegions {
 218   my($FileIndex, $SequenceDataRef) = @_;
 219 
 220   # Setup rerfernce sequence data...
 221   my($RefereceSequenceID, $RefereceSequence);
 222   $RefereceSequenceID = $SequenceFilesInfo{RefereceSequenceID}[$FileIndex];
 223   $RefereceSequence = $SequenceFilesInfo{RefereceSequence}[$FileIndex];
 224 
 225   my($RegionID, $StartResNum, $EndResNum, $ResNum, $Count, $PercentCount, $Residue, $RegionNum, $RegionOutFile, $PercentRegionOutFile, $OutFileRoot, $OutFileExt, $Line, @LineWords, @PercentLineWords);
 226 
 227   $OutFileRoot = $SequenceFilesInfo{OutFileRoot}[$FileIndex];
 228   $OutFileExt = $SequenceFilesInfo{OutFileExt}[$FileIndex];
 229   $RegionNum = 0;
 230   for $RegionID (@{$SequenceFilesInfo{RegionsData}[$FileIndex]{RegionIDs}}) {
 231     $RegionNum++;
 232     $StartResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{StartResNum};
 233     $EndResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{EndResNum};
 234 
 235     $RegionOutFile = "${OutFileRoot}ResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt}";
 236     $PercentRegionOutFile = "${OutFileRoot}PercentResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt}";
 237 
 238     print "Generating $RegionOutFile and $PercentRegionOutFile...\n";
 239     open REGIONOUTFILE, ">$RegionOutFile" or die "Error: Can't open $RegionOutFile: $! \n";
 240     open PERCENTREGIONOUTFILE, ">$PercentRegionOutFile" or die "Error: Can't open $PercentRegionOutFile: $! \n";
 241 
 242     # Write out reference residue positions as column values....
 243     @LineWords = ();
 244     push @LineWords, '';
 245     RESNUM: for $ResNum ($StartResNum .. $EndResNum) {
 246       if ($OptionsInfo{IgnoreGaps} && $SequenceFilesInfo{RefereceSequenceResNums}[$FileIndex]{IsGap}{$ResNum}) {
 247         next RESNUM;
 248       }
 249       push @LineWords, $ResNum;
 250     }
 251     $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 252     print REGIONOUTFILE "$Line\n";
 253     print PERCENTREGIONOUTFILE "$Line\n";
 254 
 255 
 256     # Write out row data for each residue; Gap residue is written last...
 257     RESIDUE: for $Residue (sort @{$SequenceFilesInfo{ResidueCodes}[$FileIndex]}) {
 258       if ($Residue =~ /^Gap$/i) {
 259         next RESIDUE;
 260       }
 261       @LineWords = ();
 262       push @LineWords, $Residue;
 263       @PercentLineWords = ();
 264       push @PercentLineWords, $Residue;
 265 
 266       RESNUM: for $ResNum ($StartResNum .. $EndResNum) {
 267         if ($OptionsInfo{IgnoreGaps} && $SequenceFilesInfo{RefereceSequenceResNums}[$FileIndex]{IsGap}{$ResNum}) {
 268           next RESNUM;
 269         }
 270         $Count = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{$Residue};
 271         push @LineWords, $Count;
 272         $PercentCount = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{PercentCount}{$ResNum}{$Residue};
 273         push @PercentLineWords, $PercentCount;
 274       }
 275       $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 276       print REGIONOUTFILE "$Line\n";
 277 
 278       $Line = JoinWords(\@PercentLineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 279       print PERCENTREGIONOUTFILE "$Line\n";
 280     }
 281 
 282     # Write out data for gap...
 283     $Residue = 'Gap';
 284     @LineWords = ();
 285     push @LineWords, $Residue;
 286     @PercentLineWords = ();
 287     push @PercentLineWords, $Residue;
 288 
 289     RESNUM: for $ResNum ($StartResNum .. $EndResNum) {
 290       if ($OptionsInfo{IgnoreGaps} && $SequenceFilesInfo{RefereceSequenceResNums}[$FileIndex]{IsGap}{$ResNum}) {
 291         next RESNUM;
 292       }
 293       $Count = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{$Residue};
 294       push @LineWords, $Count;
 295 
 296       $PercentCount = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{PercentCount}{$ResNum}{$Residue};
 297       push @PercentLineWords, $PercentCount;
 298     }
 299     $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 300     print REGIONOUTFILE "$Line\n";
 301 
 302     $Line = JoinWords(\@PercentLineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 303     print PERCENTREGIONOUTFILE "$Line\n";
 304 
 305     close REGIONOUTFILE;
 306     close PERCENTREGIONOUTFILE;
 307   }
 308 }
 309 
 310 # Process option values...
 311 sub ProcessOptions {
 312   %OptionsInfo = ();
 313 
 314   # Setup analysis mode...
 315   $OptionsInfo{CalculatePercentIdentityMatrix} = ($Options{mode} =~ /^(PercentIdentityMatrix|All)$/i) ? 1 : 0;
 316   $OptionsInfo{PerformResidueFrequencyAnalysis} = ($Options{mode} =~ /^(ResidueFrequencyAnalysis|All)$/i) ? 1 : 0;
 317 
 318   # Setup delimiter and quotes...
 319   $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /tab/i ) ? "\t" : (($Options{outdelim} =~ /semicolon/i) ? "\;" : "\,");
 320   $OptionsInfo{OutQuote} = ($Options{quote} =~ /yes/i ) ? 1 : 0;
 321 
 322   # Setup reference sequence and regions for residue frequence analysis...
 323   $OptionsInfo{SpecifiedRefereceSequence} = $Options{referencesequence};
 324   $OptionsInfo{SpecifiedRegion} = $Options{region};
 325   @{$OptionsInfo{SpecifiedRegions}} = ();
 326 
 327   my(@SpecifiedRegions);
 328   @SpecifiedRegions = ();
 329   if ($Options{region} =~ /\,/) {
 330     @SpecifiedRegions = split /\,/, $OptionsInfo{SpecifiedRegion};
 331     if (@SpecifiedRegions % 2) {
 332       die "Error: The value specified, $Options{region}, for option \"--region\" is not valid. Allowed values: \"StartResNum,EndResNum,[StartResNum,EndResNum...]\" or UseCompleteSequence\n";
 333     }
 334     # Make sure EndResNum > StartResNum...
 335     my($StartResNum, $EndResNum, $Index, $RegionNum);
 336     $RegionNum = 0;
 337     for ($Index = 0; $Index <= $#SpecifiedRegions; $Index += 2) {
 338       $StartResNum = $SpecifiedRegions[$Index];
 339       $EndResNum = $SpecifiedRegions[$Index + 1];
 340       $RegionNum++;
 341       if (!IsPositiveInteger($StartResNum)) {
 342         die "Error: The value specified, $Options{region}, for option \"--region\" is not valid: The start residue number, $StartResNum, must be a positive integer for region $RegionNum.\n";
 343       }
 344       if (!IsPositiveInteger($EndResNum)) {
 345         die "Error: The value specified, $Options{region}, for option \"--region\" is not valid: The start residue number, $EndResNum, must be a positive integer for region $RegionNum.\n";
 346       }
 347       if ($StartResNum >= $EndResNum) {
 348         die "Error: The value specified, $Options{region}, for option \"--region\" is not valid: The start residue number, $StartResNum, must be smaller than end residue number, $EndResNum, for region $RegionNum.\n";
 349       }
 350     }
 351   }
 352   else {
 353     if ($Options{region} !~ /^UseCompleteSequence$/i) {
 354       die "Error: The value specified, $Options{region}, for option \"--region\" is not valid. Allowed values: \"StartResNum,EndResNum,[StartResNum,EndResNum...]\" or UseCompleteSequence\n";
 355     }
 356   }
 357   push @{$OptionsInfo{SpecifiedRegions}}, @SpecifiedRegions;
 358 
 359   # Miscellaneous options...
 360   $OptionsInfo{Precision} = $Options{precision};
 361   $OptionsInfo{IgnoreGaps} = ($Options{ignoregaps} =~ /Yes/i) ? 1 : 0;
 362   $OptionsInfo{RegionResiduesMode} = $Options{regionresiduesmode};
 363 
 364   $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
 365   $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
 366 }
 367 
 368 # Retrieve information about sequence files...
 369 sub RetrieveSequenceFilesInfo {
 370   my($Index, $SequenceFile, $FileSupported, $FileFormat, $SequenceCount, $RefereceSequence, $RefereceSequenceID, $RefereceSequenceLen, $RefereceSequenceWithNoGaps, $RefereceSequenceWithNoGapsLen, $RefereceSequenceRegionCount, $FileDir, $FileName, $FileExt, $OutFileRoot, $OutFileExt, $SequenceDataRef, $SpecifiedRefereceSequence, @SpecifiedRegions, @RefereceSequenceRegions);
 371 
 372   %SequenceFilesInfo = ();
 373   @{$SequenceFilesInfo{FilesOkay}} = ();
 374   @{$SequenceFilesInfo{OutFileRoot}} = ();
 375   @{$SequenceFilesInfo{OutFileExt}} = ();
 376   @{$SequenceFilesInfo{Format}} = ();
 377   @{$SequenceFilesInfo{SequenceCount}} = ();
 378   @{$SequenceFilesInfo{RefereceSequenceID}} = ();
 379   @{$SequenceFilesInfo{RefereceSequence}} = ();
 380   @{$SequenceFilesInfo{RefereceSequenceLen}} = ();
 381   @{$SequenceFilesInfo{RefereceSequenceWithNoGaps}} = ();
 382   @{$SequenceFilesInfo{RefereceSequenceWithNoGapsLen}} = ();
 383   @{$SequenceFilesInfo{RefereceSequenceRegions}} = ();
 384   @{$SequenceFilesInfo{RefereceSequenceRegionCount}} = ();
 385   @{$SequenceFilesInfo{ResidueCodes}} = ();
 386 
 387   FILELIST: for $Index (0 .. $#SequenceFilesList) {
 388     $SequenceFile = $SequenceFilesList[$Index];
 389     $SequenceFilesInfo{FilesOkay}[$Index] = 0;
 390     $SequenceFilesInfo{OutFileRoot}[$Index] = '';
 391     $SequenceFilesInfo{OutFileExt}[$Index] = '';
 392     $SequenceFilesInfo{Format}[$Index] = 'NotSupported';
 393     $SequenceFilesInfo{SequenceCount}[$Index] = 0;
 394     $SequenceFilesInfo{RefereceSequenceID}[$Index] = '';
 395     $SequenceFilesInfo{RefereceSequence}[$Index] = '';
 396     $SequenceFilesInfo{RefereceSequenceLen}[$Index] = '';
 397     $SequenceFilesInfo{RefereceSequenceWithNoGaps}[$Index] = '';
 398     $SequenceFilesInfo{RefereceSequenceWithNoGapsLen}[$Index] = '';
 399     @{$SequenceFilesInfo{RefereceSequenceRegions}[$Index]} = ();
 400     $SequenceFilesInfo{RefereceSequenceRegionCount}[$Index] = 0;
 401     @{$SequenceFilesInfo{ResidueCodes}[$Index]} = ();
 402 
 403     if (! open SEQUENCEFILE, "$SequenceFile") {
 404       warn "Warning: Ignoring file $SequenceFile: Couldn't open it: $! \n";
 405       next FILELIST;
 406     }
 407     close SEQUENCEFILE;
 408 
 409     ($FileSupported, $FileFormat) = IsSupportedSequenceFile($SequenceFile);
 410     if (!$FileSupported) {
 411       warn "Warning: Ignoring file $SequenceFile: Sequence file format is not supported.\n";
 412       next FILELIST;
 413     }
 414 
 415     $SequenceDataRef = ReadSequenceFile($SequenceFile);
 416 
 417     $SequenceCount = $SequenceDataRef->{Count};
 418     if (!$SequenceCount) {
 419       warn "Warning: Ignoring file $SequenceFile: Sequence data is missing.\n";
 420       next FILELIST;
 421     }
 422 
 423     # Make sure all sequence lengths are identical...
 424     if (!AreSequenceLengthsIdentical($SequenceDataRef)) {
 425       warn "Warning: Ignoring file $SequenceFile: Sequence legths are not identical.\n";
 426       next FILELIST;
 427     }
 428     $SpecifiedRefereceSequence = $OptionsInfo{SpecifiedRefereceSequence};
 429     # Make sure reference sequence ID is valid...
 430     if ($SpecifiedRefereceSequence =~ /^UseFirstSequenceID$/i) {
 431       $RefereceSequenceID = $SequenceDataRef->{IDs}[0];
 432     }
 433     else {
 434       if (!exists($SequenceDataRef->{Sequence}{$SpecifiedRefereceSequence})) {
 435         warn "Warning: Ignoring file $SequenceFile: Rreference sequence ID, $SpecifiedRefereceSequence, specified using option \"--referencesequence\" is missing.\n";
 436         next FILELIST;
 437       }
 438       $RefereceSequenceID = $SpecifiedRefereceSequence;
 439     }
 440 
 441     # Make sure sequence regions corresponding to reference sequence are valid...
 442     @RefereceSequenceRegions = ();
 443     $RefereceSequenceRegionCount = 0;
 444     $RefereceSequence = $SequenceDataRef->{Sequence}{$RefereceSequenceID};
 445     $RefereceSequenceLen = length $RefereceSequence;
 446 
 447     $RefereceSequenceWithNoGaps = RemoveSequenceGaps($RefereceSequence);
 448     $RefereceSequenceWithNoGapsLen = length $RefereceSequenceWithNoGaps;
 449 
 450     @SpecifiedRegions = ();
 451     push @SpecifiedRegions, @{$OptionsInfo{SpecifiedRegions}};
 452     if (@SpecifiedRegions) {
 453       # Make sure specified regions are valid...
 454       my($StartResNum, $EndResNum, $RegionIndex, $RegionNum);
 455       $RegionNum = 0;
 456       for ($RegionIndex = 0; $RegionIndex <= $#SpecifiedRegions; $RegionIndex += 2) {
 457         $StartResNum = $SpecifiedRegions[$RegionIndex];
 458         $EndResNum = $SpecifiedRegions[$RegionIndex + 1];
 459         $RegionNum++;
 460         if ($OptionsInfo{IgnoreGaps}) {
 461           if ($StartResNum > $RefereceSequenceWithNoGapsLen) {
 462             warn "Warning: Ignoring file $SequenceFile: The value specified, $Options{region}, for option \"--region\" is not valid: The start residue number, $StartResNum, must be smaller the sequence length, $RefereceSequenceWithNoGapsLen, of reference sequence ID,  $RefereceSequenceID, in region $RegionNum. The reference sequence residue numbers correspond to the sequence with no gaps. Specify \"No\" value for \"-i, --ignoregaps\" option to use residue numbers corresponding to reference sequence including gaps.\n";
 463             next FILELIST;
 464           }
 465           if ($EndResNum > $RefereceSequenceWithNoGapsLen) {
 466             warn "Warning: Ignoring file $SequenceFile: The value specified, $Options{region}, for option \"--region\" is not valid: The end residue number, $EndResNum, must be smaller the sequence length, $RefereceSequenceWithNoGapsLen, of reference sequence ID,  $RefereceSequenceID, in region $RegionNum. The reference sequence residue numbers correspond to the sequence with no gaps. Specify \"No\" value for \"-i, --ignoregaps\" option to use residue numbers corresponding to reference sequence including gaps.\n";
 467             next FILELIST;
 468           }
 469         }
 470         else {
 471           if ($StartResNum > $RefereceSequenceLen) {
 472             warn "Warning: Ignoring file $SequenceFile: The value specified, $Options{region}, for option \"--region\" is not valid: The start residue number, $StartResNum, must be smaller the sequence length, $RefereceSequenceLen, of reference sequence ID,  $RefereceSequenceID, in region $RegionNum.\n";
 473             next FILELIST;
 474           }
 475           if ($EndResNum > $RefereceSequenceLen) {
 476             warn "Warning: Ignoring file $SequenceFile: The value specified, $Options{region}, for option \"--region\" is not valid: The end residue number, $EndResNum, must be smaller the sequence length, $RefereceSequenceLen, of reference sequence ID,  $RefereceSequenceID, in region $RegionNum.\n";
 477             next FILELIST;
 478           }
 479         }
 480         push @RefereceSequenceRegions, ($StartResNum, $EndResNum);
 481       }
 482       $RefereceSequenceRegionCount = $RegionNum;
 483     }
 484     else {
 485       # Use complete sequence corresponding to reference sequence ID...
 486       if ($OptionsInfo{IgnoreGaps}) {
 487         push @RefereceSequenceRegions, (1, $RefereceSequenceWithNoGapsLen);
 488       }
 489       else {
 490         push @RefereceSequenceRegions, (1, $RefereceSequenceLen);
 491       }
 492       $RefereceSequenceRegionCount = 1;
 493     }
 494     # Setup output file names...
 495     $FileDir = ""; $FileName = ""; $FileExt = "";
 496     ($FileDir, $FileName, $FileExt) = ParseFileName($SequenceFile);
 497     $FileExt = "csv";
 498     if ($Options{outdelim} =~ /^tab$/i) {
 499       $FileExt = "tsv";
 500     }
 501     $OutFileExt = $FileExt;
 502     if ($OptionsInfo{OutFileRoot} && (@SequenceFilesList == 1)) {
 503       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
 504       if ($RootFileName && $RootFileExt) {
 505         $FileName = $RootFileName;
 506       }
 507       else {
 508         $FileName = $OptionsInfo{OutFileRoot};
 509       }
 510       $OutFileRoot = $FileName;
 511     }
 512     else {
 513       $OutFileRoot = $FileName;
 514     }
 515     if (!$OptionsInfo{OverwriteFiles}) {
 516       if ($OptionsInfo{CalculatePercentIdentityMatrix}) {
 517         if (-e "${OutFileRoot}PercentIdentityMatrix.${OutFileExt}") {
 518           warn "Warning: Ignoring file $SequenceFile: The file ${OutFileRoot}PercentIdentityMatrix.${OutFileExt} already exists\n";
 519           next FILELIST;
 520         }
 521       }
 522       if ($OptionsInfo{PerformResidueFrequencyAnalysis}) {
 523         my($RegionNum);
 524         for $RegionNum (1 .. $RefereceSequenceRegionCount) {
 525           if (-e "${OutFileRoot}ResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt}") {
 526             warn "Warning: Ignoring file $SequenceFile: The file ${OutFileRoot}ResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt} already exists\n";
 527             next FILELIST;
 528           }
 529           if (-e "${OutFileRoot}PercentResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt}") {
 530             warn "Warning: Ignoring file $SequenceFile: The file ${OutFileRoot}PercentResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt} already exists\n";
 531             next FILELIST;
 532           }
 533         }
 534       }
 535     }
 536 
 537     $SequenceFilesInfo{FilesOkay}[$Index] = 1;
 538     $SequenceFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
 539     $SequenceFilesInfo{OutFileExt}[$Index] = $OutFileExt;
 540 
 541     $SequenceFilesInfo{Format}[$Index] = $FileFormat;
 542     $SequenceFilesInfo{SequenceCount}[$Index] = $SequenceCount;
 543     $SequenceFilesInfo{RefereceSequenceID}[$Index] = $RefereceSequenceID;
 544     $SequenceFilesInfo{RefereceSequence}[$Index] = $RefereceSequence;
 545     $SequenceFilesInfo{RefereceSequenceLen}[$Index] = $RefereceSequenceLen;
 546     $SequenceFilesInfo{RefereceSequenceWithNoGaps}[$Index] = $RefereceSequenceWithNoGaps;
 547     $SequenceFilesInfo{RefereceSequenceWithNoGapsLen}[$Index] = $RefereceSequenceWithNoGapsLen;
 548     push @{$SequenceFilesInfo{RefereceSequenceRegions}[$Index]}, @RefereceSequenceRegions;
 549     $SequenceFilesInfo{RefereceSequenceRegionCount}[$Index] = $RefereceSequenceRegionCount;
 550 
 551     # Setup residue codes...
 552     SetupSequenceFileResidueCodes($SequenceDataRef, $Index);
 553   }
 554 }
 555 
 556 sub SetupSequenceFileResidueCodes {
 557   my($SequenceDataRef, $FileIndex) = @_;
 558   my($Residue, @ResidueCodesList, %ResidueCodesMap);
 559 
 560   # Initialize
 561   @{$SequenceFilesInfo{ResidueCodes}[$FileIndex]} = ();
 562 
 563   %ResidueCodesMap = ();
 564   @ResidueCodesList = ();
 565 
 566   # Setup standard amino acids and nucleic acids one letter codes...
 567   if ($OptionsInfo{RegionResiduesMode} =~ /^AminoAcids$/i) {
 568     @ResidueCodesList = AminoAcids::GetAminoAcids('OneLetterCode');
 569   }
 570   elsif ($OptionsInfo{RegionResiduesMode} =~ /^NucleicAcids$/i) {
 571     push @ResidueCodesList, ('A', 'G', 'T', 'U', 'C');
 572   }
 573   push @ResidueCodesList, 'Gap';
 574   for $Residue (@ResidueCodesList) {
 575     $ResidueCodesMap{$Residue} = $Residue;
 576   }
 577 
 578   # Go over all the residues in all the sequences and add missing ones to the list...
 579   my($ID, $Sequence, $ResIndex);
 580   for $ID (@{$SequenceDataRef->{IDs}}) {
 581     $Sequence = $SequenceDataRef->{Sequence}{$ID};
 582     RES: for $ResIndex (0 .. (length($Sequence) - 1)) {
 583       $Residue = substr($Sequence, $ResIndex, 1);
 584       if (IsGapResidue($Residue)) {
 585         next RES;
 586       }
 587       if (exists $ResidueCodesMap{$Residue}) {
 588         next RES;
 589       }
 590       push @ResidueCodesList, $Residue;
 591       $ResidueCodesMap{$Residue} = $Residue;
 592     }
 593   }
 594   push @{$SequenceFilesInfo{ResidueCodes}[$FileIndex]}, @ResidueCodesList;
 595 }
 596 
 597 # Setup regions data for performing residue frequency analysis...
 598 sub SetupSequenceRegionsData {
 599   my($Index, $RefereceSequence, $RefereceSequenceLen, $RegionID, $StartResNum, $EndResNum, $RegionIndex, $RegionNum, $NoGapResNum, $ResNum, $ResIndex, $Residue, $ResidueCode, @RefereceSequenceRegions);
 600 
 601 
 602   @{$SequenceFilesInfo{RefereceSequenceResNums}} = ();
 603   @{$SequenceFilesInfo{RegionsData}} = ();
 604 
 605   FILELIST: for $Index (0 .. $#SequenceFilesList) {
 606     %{$SequenceFilesInfo{RefereceSequenceResNums}[$Index]{IsGap}} = ();
 607     %{$SequenceFilesInfo{RefereceSequenceResNums}[$Index]{NoGapToGap}} = ();
 608     %{$SequenceFilesInfo{RefereceSequenceResNums}[$Index]{GapToNoGap}} = ();
 609     %{$SequenceFilesInfo{RegionsData}[$Index]} = ();
 610     @{$SequenceFilesInfo{RegionsData}[$Index]{RegionIDs}} = ();
 611 
 612     if (!$SequenceFilesInfo{FilesOkay}[$Index]) {
 613       next FILELIST;
 614     }
 615     if (!$OptionsInfo{PerformResidueFrequencyAnalysis}) {
 616       next FILELIST;
 617     }
 618 
 619     $RefereceSequence = $SequenceFilesInfo{RefereceSequence}[$Index];
 620     $RefereceSequenceLen = $SequenceFilesInfo{RefereceSequenceLen}[$Index];
 621 
 622     # Setup residue number mapping and gap status for refernece sequence...
 623     $NoGapResNum = 0;
 624     $ResNum = 0;
 625     for $ResIndex (0 .. ($RefereceSequenceLen - 1)) {
 626       $ResNum++;
 627       $Residue = substr($RefereceSequence, $ResIndex, 1);
 628       $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{IsGap}{$ResNum} = 1;
 629       if (!IsGapResidue($Residue)) {
 630         $NoGapResNum++;
 631         $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{IsGap}{$ResNum} = 0;
 632         $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{NoGapToGap}{$NoGapResNum} = $ResNum;
 633         $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{GapToNoGap}{$ResNum} = $NoGapResNum;
 634       }
 635     }
 636     # Map residue numbers for specified regions to the reference residue in input sequence/alignment files
 637     $RegionNum = 0;
 638     @RefereceSequenceRegions = ();
 639     push @RefereceSequenceRegions, @{$SequenceFilesInfo{RefereceSequenceRegions}[$Index]};
 640     for ($RegionIndex = 0; $RegionIndex <= $#RefereceSequenceRegions; $RegionIndex += 2) {
 641       $StartResNum = $RefereceSequenceRegions[$RegionIndex];
 642       $EndResNum = $RefereceSequenceRegions[$RegionIndex + 1];
 643       $RegionNum++;
 644       $RegionID = "Region${RegionNum}";
 645       if ($OptionsInfo{IgnoreGaps}) {
 646         # Map residue numbers to the reference sequence residue numbers to account for any ignored gaps...
 647         $StartResNum = $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{NoGapToGap}{$StartResNum};
 648         $EndResNum = $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{NoGapToGap}{$EndResNum};
 649       }
 650       push @{$SequenceFilesInfo{RegionsData}[$Index]{RegionIDs}}, $RegionID;
 651       $SequenceFilesInfo{RegionsData}[$Index]{$RegionID}{StartResNum} = $StartResNum;
 652       $SequenceFilesInfo{RegionsData}[$Index]{$RegionID}{EndResNum} = $EndResNum;
 653 
 654       # Initialize data for residue codes...
 655       for $ResNum ($StartResNum .. $EndResNum) {
 656         for $ResidueCode (@{$SequenceFilesInfo{ResidueCodes}[$Index]}) {
 657           $SequenceFilesInfo{RegionsData}[$Index]{$RegionID}{Count}{$ResNum}{$ResidueCode} = 0;
 658         }
 659       }
 660     }
 661   }
 662 }
 663 
 664 # Setup script usage  and retrieve command line arguments specified using various options...
 665 sub SetupScriptUsage {
 666 
 667   # Retrieve all the options...
 668   %Options = ();
 669   $Options{ignoregaps} = 'yes';
 670   $Options{regionresiduesmode} = 'None';
 671   $Options{mode} = 'PercentIdentityMatrix';
 672   $Options{outdelim} = 'comma';
 673   $Options{precision} = 2;
 674   $Options{quote} = 'yes';
 675   $Options{referencesequence} = 'UseFirstSequenceID';
 676   $Options{region} = 'UseCompleteSequence';
 677 
 678   if (!GetOptions(\%Options, "help|h", "ignoregaps|i=s", "mode|m=s", "outdelim=s", "overwrite|o", "precision|p=i", "quote|q=s", "referencesequence=s", "region=s", "regionresiduesmode=s", "root|r=s", "workingdir|w=s")) {
 679     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 680   }
 681   if ($Options{workingdir}) {
 682     if (! -d $Options{workingdir}) {
 683       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 684     }
 685     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 686   }
 687   if ($Options{ignoregaps} !~ /^(yes|no)$/i) {
 688     die "Error: The value specified, $Options{ignoregaps}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
 689   }
 690   if ($Options{regionresiduesmode} !~ /^(AminoAcids|NucleicAcids|None)$/i) {
 691     die "Error: The value specified, $Options{regionresiduesmode}, for option \"--regionresiduesmode\" is not valid. Allowed values: AminoAcids, NucleicAcids or None\n";
 692   }
 693   if ($Options{mode} !~ /^(PercentIdentityMatrix|ResidueFrequencyAnalysis|All)$/i) {
 694     die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: PercentIdentityMatrix, ResidueFrequencyAnalysis  or All\n";
 695   }
 696   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
 697     die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
 698   }
 699   if ($Options{quote} !~ /^(yes|no)$/i) {
 700     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
 701   }
 702   if (!IsPositiveInteger($Options{precision})) {
 703     die "Error: The value specified, $Options{precision}, for option \"-p --precision\" is not valid. Allowed values: > 0 \n";
 704   }
 705 }
 706