MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: ExtractFromSequenceFiles.pl,v $
   4 # $Date: 2015/02/28 20:46:19 $
   5 # $Revision: 1.23 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2015 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
   19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use Benchmark;
  35 use FileUtil;
  36 use TextUtil;
  37 use SequenceFileUtil;
  38 
  39 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  40 
  41 # Autoflush STDOUT
  42 $| = 1;
  43 
  44 # Starting message...
  45 $ScriptName = basename($0);
  46 print "\n$ScriptName: Starting...\n\n";
  47 $StartTime = new Benchmark;
  48 
  49 # Setup script usage message...
  50 SetupScriptUsage();
  51 if ($Options{help} || @ARGV < 1) {
  52   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  53 }
  54 
  55 # Expand wild card file names...
  56 my(@SequenceFilesList);
  57 @SequenceFilesList = ExpandFileNames(\@ARGV, "aln msf fasta fta pir");
  58 
  59 # Process options...
  60 print "Processing options...\n";
  61 my(%OptionsInfo);
  62 ProcessOptions();
  63 
  64 # Set up information about input files...
  65 print "Checking input sequence file(s)...\n";
  66 my(%SequenceFilesInfo);
  67 RetrieveSequenceFilesInfo();
  68 
  69 # Process input files..
  70 my($FileIndex);
  71 if (@SequenceFilesList > 1) {
  72   print "\nProcessing sequence files...\n";
  73 }
  74 for $FileIndex (0 .. $#SequenceFilesList) {
  75   if ($SequenceFilesInfo{FilesOkay}[$FileIndex]) {
  76     print "\nProcessing file $SequenceFilesList[$FileIndex]...\n";
  77     ExtractFromSequenceFiles($FileIndex);
  78   }
  79 }
  80 print "\n$ScriptName:Done...\n\n";
  81 
  82 $EndTime = new Benchmark;
  83 $TotalTime = timediff ($EndTime, $StartTime);
  84 print "Total time: ", timestr($TotalTime), "\n";
  85 
  86 ###############################################################################
  87 
  88 # Extract from sequence files...
  89 sub ExtractFromSequenceFiles {
  90   my($FileIndex) = @_;
  91   my($OutSequenceFile, $SequenceFile, $SequenceDataRef, $SpecifiedSequenceDataRef);
  92 
  93   # Read sequence file...
  94   $SequenceFile = $SequenceFilesList[$FileIndex];
  95   open SEQUENCEFILE, "$SequenceFile" or die "Error: Can't open $SequenceFile: $! \n";
  96   $SequenceDataRef = ReadSequenceFile($SequenceFile);
  97   close SEQUENCEFILE;
  98 
  99   $OutSequenceFile = $SequenceFilesInfo{OutFile}[$FileIndex];
 100   print "Generating sequence file $OutSequenceFile...\n";
 101 
 102   # Retrieve sequence data for specified sequences...
 103   $SpecifiedSequenceDataRef = GetSpecifiedSequenceData($SequenceDataRef);
 104 
 105   # Handle gaps...
 106   if ($OptionsInfo{IgnoreGaps}) {
 107     if (@{$SpecifiedSequenceDataRef->{IDs}} > 1) {
 108       if (AreSequenceLengthsIdentical($SpecifiedSequenceDataRef)) {
 109         $SpecifiedSequenceDataRef = RemoveSequenceAlignmentGapColumns($SpecifiedSequenceDataRef);
 110       }
 111     }
 112     else {
 113       # Remove the gaps from the sequence...
 114       my($ID, $Sequence);
 115       $ID = $SpecifiedSequenceDataRef->{IDs}[0];
 116       $Sequence = $SpecifiedSequenceDataRef->{Sequence}{$ID};
 117       $SpecifiedSequenceDataRef->{Sequence}{$ID} = RemoveSequenceGaps($Sequence);
 118     }
 119   }
 120 
 121   # Write out the file...
 122   WritePearsonFastaSequenceFile($OutSequenceFile, $SpecifiedSequenceDataRef, $OptionsInfo{MaxSequenceLength});
 123 }
 124 
 125 # Get specified sequence data...
 126 sub GetSpecifiedSequenceData {
 127   my($SequenceDataRef) = @_;
 128 
 129   if ($OptionsInfo{Mode} =~ /^SequenceID$/i) {
 130     return GetDataBySequenceIDs($SequenceDataRef);
 131   }
 132   elsif ($Options{mode} =~ /^SequenceNum$/i) {
 133     return GetDataBySequenceNums($SequenceDataRef);
 134   }
 135   elsif ($Options{mode} =~ /^SequenceNumRange$/i) {
 136     return GetDataBySequenceNumRange($SequenceDataRef);
 137   }
 138   else {
 139     return undef;
 140   }
 141 }
 142 
 143 # Get specified sequence data...
 144 sub GetDataBySequenceIDs {
 145   my($SequenceDataRef) = @_;
 146   my($ID, $SequenceCount, $IDMatched, $SpecifiedID, %SpecifiedSequenceDataMap);
 147 
 148   # Go over sequences and collect sequences for writing out a new sequence file...
 149   %SpecifiedSequenceDataMap = ();
 150   @{$SpecifiedSequenceDataMap{IDs}} = ();
 151   %{$SpecifiedSequenceDataMap{Description}} = ();
 152   %{$SpecifiedSequenceDataMap{Sequence}} = ();
 153 
 154   $SequenceCount = 0;
 155   ID: for $ID (@{$SequenceDataRef->{IDs}}) {
 156     if ($OptionsInfo{MatchExactSequenceIDs}) {
 157       if (!exists $OptionsInfo{SpecifiedSequenceIDsMap}{lc($ID)}) {
 158         next ID;
 159       }
 160       if ($SequenceCount >= scalar @{$OptionsInfo{SpecifiedSequenceIDs}}) {
 161         last ID;
 162       }
 163       $SequenceCount++;
 164     }
 165     else {
 166       # Does this ID contains specified ID as substring...
 167       $IDMatched = 0;
 168       SPECIFIEDID: for $SpecifiedID (@{$OptionsInfo{SpecifiedSequenceIDs}}) {
 169         if ($ID =~ /$SpecifiedID/i) {
 170           $IDMatched = 1;
 171           last SPECIFIEDID;
 172         }
 173       }
 174       if (!$IDMatched) {
 175         next ID;
 176       }
 177       $SequenceCount++;
 178     }
 179     # Collect sequence data...
 180     push @{$SpecifiedSequenceDataMap{IDs}}, $ID;
 181     $SpecifiedSequenceDataMap{Description}{$ID} = $SequenceDataRef->{Description}{$ID};
 182     $SpecifiedSequenceDataMap{Sequence}{$ID} = $SequenceDataRef->{Sequence}{$ID};
 183   }
 184 
 185   return \%SpecifiedSequenceDataMap;
 186 }
 187 
 188 # Get specified sequence data...
 189 sub GetDataBySequenceNums {
 190   my($SequenceDataRef) = @_;
 191   my($ID, $SequenceNum, $SequenceCount, %SpecifiedSequenceDataMap);
 192 
 193   # Go over sequences and collect sequences for writing out a new sequence file...
 194   %SpecifiedSequenceDataMap = ();
 195   @{$SpecifiedSequenceDataMap{IDs}} = ();
 196   %{$SpecifiedSequenceDataMap{Description}} = ();
 197   %{$SpecifiedSequenceDataMap{Sequence}} = ();
 198 
 199   $SequenceNum = 0;
 200   $SequenceCount = 0;
 201   ID: for $ID (@{$SequenceDataRef->{IDs}}) {
 202     $SequenceNum++;
 203     if (!exists $OptionsInfo{SpecifiedSequenceIDsMap}{$SequenceNum}) {
 204       next ID;
 205     }
 206     if ($SequenceCount >= scalar @{$OptionsInfo{SpecifiedSequenceIDs}}) {
 207       last ID;
 208     }
 209     $SequenceCount++;
 210 
 211     # Collect sequence data...
 212     push @{$SpecifiedSequenceDataMap{IDs}}, $ID;
 213     $SpecifiedSequenceDataMap{Description}{$ID} = $SequenceDataRef->{Description}{$ID};
 214     $SpecifiedSequenceDataMap{Sequence}{$ID} = $SequenceDataRef->{Sequence}{$ID};
 215   }
 216 
 217   return \%SpecifiedSequenceDataMap;
 218 }
 219 
 220 # Get specified sequence data...
 221 sub GetDataBySequenceNumRange {
 222   my($SequenceDataRef) = @_;
 223   my($ID, $SequenceNum, $SequenceCount, %SpecifiedSequenceDataMap);
 224 
 225   # Go over sequences and collect sequences for writing out a new sequence file...
 226   %SpecifiedSequenceDataMap = ();
 227   @{$SpecifiedSequenceDataMap{IDs}} = ();
 228   %{$SpecifiedSequenceDataMap{Description}} = ();
 229   %{$SpecifiedSequenceDataMap{Sequence}} = ();
 230 
 231   $SequenceNum = 0;
 232   $SequenceCount = 0;
 233   ID: for $ID (@{$SequenceDataRef->{IDs}}) {
 234     $SequenceNum++;
 235 
 236     if (!($SequenceNum >= $OptionsInfo{SpecifiedSequenceIDs}[0] && $SequenceNum <= $OptionsInfo{SpecifiedSequenceIDs}[1])) {
 237       next ID;
 238     }
 239     if ($SequenceNum > $OptionsInfo{SpecifiedSequenceIDs}[1]) {
 240       last ID;
 241     }
 242     $SequenceCount++;
 243     # Collect sequence data...
 244     push @{$SpecifiedSequenceDataMap{IDs}}, $ID;
 245     $SpecifiedSequenceDataMap{Description}{$ID} = $SequenceDataRef->{Description}{$ID};
 246     $SpecifiedSequenceDataMap{Sequence}{$ID} = $SequenceDataRef->{Sequence}{$ID};
 247   }
 248 
 249   return \%SpecifiedSequenceDataMap;
 250 }
 251 
 252 
 253 # Process option values...
 254 sub ProcessOptions {
 255   %OptionsInfo = ();
 256 
 257   # Miscellaneous options...
 258   $OptionsInfo{IgnoreGaps} = ($Options{ignoregaps} =~ /Yes/i) ? 1 : 0;
 259 
 260   $OptionsInfo{Mode} = $Options{mode};
 261   $OptionsInfo{MatchExactSequenceIDs} = $Options{sequenceidmatch} =~ /Exact/i ? 1 :0;
 262 
 263   # Check specified sequences value...
 264   $OptionsInfo{SpecifiedSequences} = $Options{sequences};
 265   @{$OptionsInfo{SpecifiedSequenceIDs}} = ();
 266   %{$OptionsInfo{SpecifiedSequenceIDsMap}} = ();
 267 
 268   my(@SpecifiedSequenceIDs) = ();
 269   if ($Options{mode} =~ /^SequenceID$/i) {
 270     if (!$Options{sequences}) {
 271       die "Error: No value specified for option \"-s, --Sequences\" during \"SequenceID\" of \"-m, --mode\" option\n";
 272     }
 273     @SpecifiedSequenceIDs = split /\,/, $Options{sequences};
 274   }
 275   elsif ($Options{mode} =~ /^SequenceNum$/i) {
 276     if ($Options{sequences}) {
 277       @SpecifiedSequenceIDs = split /\,/, $Options{sequences};
 278       my($SequenceNum);
 279       for $SequenceNum (@SpecifiedSequenceIDs) {
 280         if (!IsPositiveInteger($SequenceNum)) {
 281           die "Error: The value specified, $SequenceNum, in \"$Options{sequences}\" for option \"-s, --Sequences\" is not valid: Valid values: > 0\n";
 282         }
 283       }
 284     }
 285     else {
 286       push @SpecifiedSequenceIDs, "1";
 287     }
 288   }
 289   elsif ($Options{mode} =~ /^SequenceNumRange$/i) {
 290     if (!$Options{sequences}) {
 291       die "Error: No value specified for option \"-s, --Sequences\" during \"SequenceNumRange\" of \"-m, --mode\" option\n";
 292     }
 293     @SpecifiedSequenceIDs = split /\,/, $Options{sequences};
 294     if (@SpecifiedSequenceIDs != 2) {
 295       die "Error: The number of values", scalar @SpecifiedSequenceIDs, " specified, $Options{sequences}, for option \"-s, --Sequences\" are not valid. Number of values must be 2 to indicate starting and ending sequence number.\n";
 296     }
 297     my($SequenceNum);
 298     for $SequenceNum (@SpecifiedSequenceIDs) {
 299       if (!IsPositiveInteger($SequenceNum)) {
 300         die "Error: The value specified, $SequenceNum, in \"$Options{sequences}\" for option \"-s, --Sequences\" is not valid: Valid values: > 0\n";
 301       }
 302     }
 303     if ($SpecifiedSequenceIDs[0] > $SpecifiedSequenceIDs[1]) {
 304       die "Error: The value specified \"$Options{sequences}\" for option \"-s, --Sequences\" are not valid: Starting sequence number $SpecifiedSequenceIDs[0] must be smaller than ending sequence number $SpecifiedSequenceIDs[1]\n";
 305     }
 306   }
 307   push @{$OptionsInfo{SpecifiedSequenceIDs}}, @SpecifiedSequenceIDs;
 308   my($SequenceID);
 309   for $SequenceID (@SpecifiedSequenceIDs) {
 310     if ($Options{mode} =~ /^SequenceID$/i) {
 311       $OptionsInfo{SpecifiedSequenceIDsMap}{lc($SequenceID)} = $SequenceID;
 312     }
 313     else {
 314       $OptionsInfo{SpecifiedSequenceIDsMap}{$SequenceID} = $SequenceID;
 315     }
 316   }
 317 
 318   $OptionsInfo{MaxSequenceLength} = $Options{sequencelength};
 319   $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
 320   $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
 321 }
 322 
 323 # Retrieve information about sequence files...
 324 sub RetrieveSequenceFilesInfo {
 325   my($Index, $SequenceFile, $FileSupported, $FileFormat, $SequenceCount, $FileDir, $FileName, $FileExt, $OutFileRoot, $OutFileExt, $OutFileMode, $SequenceDataRef);
 326 
 327   %SequenceFilesInfo = ();
 328   @{$SequenceFilesInfo{FilesOkay}} = ();
 329   @{$SequenceFilesInfo{OutFileRoot}} = ();
 330   @{$SequenceFilesInfo{OutFileExt}} = ();
 331   @{$SequenceFilesInfo{OutFile}} = ();
 332   @{$SequenceFilesInfo{Format}} = ();
 333   @{$SequenceFilesInfo{SequenceCount}} = ();
 334 
 335   FILELIST: for $Index (0 .. $#SequenceFilesList) {
 336     $SequenceFile = $SequenceFilesList[$Index];
 337     $SequenceFilesInfo{FilesOkay}[$Index] = 0;
 338     $SequenceFilesInfo{OutFileRoot}[$Index] = '';
 339     $SequenceFilesInfo{OutFileExt}[$Index] = '';
 340     $SequenceFilesInfo{OutFile}[$Index] = '';
 341     $SequenceFilesInfo{Format}[$Index] = 'NotSupported';
 342     $SequenceFilesInfo{SequenceCount}[$Index] = 0;
 343 
 344     if (! open SEQUENCEFILE, "$SequenceFile") {
 345       warn "Warning: Ignoring file $SequenceFile: Couldn't open it: $! \n";
 346       next FILELIST;
 347     }
 348     close SEQUENCEFILE;
 349 
 350     ($FileSupported, $FileFormat) = IsSupportedSequenceFile($SequenceFile);
 351     if (!$FileSupported) {
 352       warn "Warning: Ignoring file $SequenceFile: Sequence file format is not supported.\n";
 353       next FILELIST;
 354     }
 355     $SequenceDataRef = ReadSequenceFile($SequenceFile);
 356 
 357     $SequenceCount = $SequenceDataRef->{Count};
 358     if (!$SequenceCount) {
 359       warn "Warning: Ignoring file $SequenceFile: Sequence data is missing.\n";
 360       next FILELIST;
 361     }
 362 
 363     # Setup output file names...
 364     $FileDir = ""; $FileName = ""; $FileExt = "";
 365     ($FileDir, $FileName, $FileExt) = ParseFileName($SequenceFile);
 366     $OutFileExt = 'fasta';
 367     if ($OptionsInfo{OutFileRoot} && (@SequenceFilesList == 1)) {
 368       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
 369       if ($RootFileName && $RootFileExt) {
 370         $FileName = $RootFileName;
 371       }
 372       else {
 373         $FileName = $OptionsInfo{OutFileRoot};
 374       }
 375       $OutFileRoot = $FileName;
 376     }
 377     else {
 378       $OutFileRoot = $FileName;
 379     }
 380     MODE: {
 381         if ($OptionsInfo{Mode} =~ /^SequenceID$/i) { $OutFileMode = 'SequenceID'; last MODE;}
 382         if ($OptionsInfo{Mode} =~ /^SequenceNum$/i) { $OutFileMode = 'SequenceNum'; last MODE;}
 383         if ($OptionsInfo{Mode} =~ /^SequenceNumRange$/i) { $OutFileMode = 'SequenceNumRange'; last MODE;}
 384         $OutFileMode = '';
 385     }
 386     if (!$OptionsInfo{OverwriteFiles}) {
 387       if (-e "${OutFileRoot}${OutFileMode}.${OutFileExt}") {
 388         warn "Warning: Ignoring file $SequenceFile: The file ${OutFileRoot}${OutFileMode}.${OutFileExt} already exists\n";
 389         next FILELIST;
 390       }
 391     }
 392 
 393     $SequenceFilesInfo{FilesOkay}[$Index] = 1;
 394     $SequenceFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
 395     $SequenceFilesInfo{OutFileExt}[$Index] = $OutFileExt;
 396     $SequenceFilesInfo{OutFile}[$Index] = "${OutFileRoot}${OutFileMode}.${OutFileExt}";
 397 
 398     $SequenceFilesInfo{Format}[$Index] = $FileFormat;
 399     $SequenceFilesInfo{SequenceCount}[$Index] = $SequenceCount;
 400   }
 401 }
 402 
 403 # Setup script usage  and retrieve command line arguments specified using various options...
 404 sub SetupScriptUsage {
 405 
 406   # Retrieve all the options...
 407   %Options = ();
 408   $Options{ignoregaps} = 'Yes';
 409   $Options{mode} = 'SequenceNum';
 410   $Options{sequenceidmatch} = 'Relaxed';
 411   $Options{sequencelength} = 80;
 412 
 413   if (!GetOptions(\%Options, "help|h", "ignoregaps|i=s", "mode|m=s", "overwrite|o", "root|r=s", "sequences|s=s", "sequenceidmatch=s", "sequencelength=i", "workingdir|w=s")) {
 414     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 415   }
 416   if ($Options{workingdir}) {
 417     if (! -d $Options{workingdir}) {
 418       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 419     }
 420     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 421   }
 422   if ($Options{ignoregaps} !~ /^(yes|no)$/i) {
 423     die "Error: The value specified, $Options{ignoregaps}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
 424   }
 425   if ($Options{mode} !~ /^(SequenceID|SequenceNum|SequenceNumRange)$/i) {
 426     die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: SequenceID, SequenceNum, or SequenceNumRange\n";
 427   }
 428   if ($Options{sequenceidmatch} !~ /^(Exact|Relaxed)$/i) {
 429     die "Error: The value specified, $Options{sequenceidmatch}, for option \"--SequenceIDMatch\" is not valid. Allowed values: Exact or Relaxed\n";
 430   }
 431   if (!IsPositiveInteger($Options{sequencelength})) {
 432     die "Error: The value specified, $Options{sequencelength}, for option \"--SequenceLength\" is not valid. Allowed values: >0\n";
 433   }
 434 }
 435