MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: InfoAminoAcids.pl,v $
   4 # $Date: 2015/02/28 20:46:20 $
   5 # $Revision: 1.26 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2015 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use Benchmark;
  35 use FileUtil;
  36 use TextUtil;
  37 use AminoAcids;
  38 
  # Script-level state: script name, raw command line options and timing data...
  my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

  # Autoflush STDOUT
  $| = 1;

  # Starting message...
  $ScriptName = basename($0);
  print "\n$ScriptName: Starting...\n\n";
  # Direct class method call rather than the indirect object form "new Benchmark",
  # whose parsing depends on what other names happen to be in scope...
  $StartTime = Benchmark->new;

  # Get the options and setup script...
  SetupScriptUsage();
  if ($Options{help}) {
    die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  }

  print "Processing options...\n";
  # Processed option values; reset and filled in by ProcessOptions()...
  my(%OptionsInfo);
  ProcessOptions();

  ListAminoAcidProperties();
  print "\n$ScriptName:Done...\n\n";

  # Report the elapsed time...
  $EndTime = Benchmark->new;
  $TotalTime = timediff ($EndTime, $StartTime);
  print "Total time: ", timestr($TotalTime), "\n";
  65 
  66 ###############################################################################
  67 
  # List data for an amino acid...
  #
  # Parameters:
  #   $DataLabelRef - reference to an array of property labels
  #   $DataValueRef - reference to an array of property values, in label order
  #
  # For AminoAcidRows output, the values are written as one delimited line;
  # otherwise one "label value" line is written per label. Lines go to OUTFILE
  # when $OptionsInfo{FileOutput} is set and to STDOUT otherwise.
  sub ListAminoAcidData {
    my($DataLabelRef, $DataValueRef) = @_;
    my($OutHandle, $Line);

    $OutHandle = $OptionsInfo{FileOutput} ? \*OUTFILE : \*STDOUT;

    if ($OptionsInfo{AminoAcidRowsOutput}) {
      # Format data...
      if ($OptionsInfo{OutQuote} || $Options{outdelim} !~ /^comma$/i) {
        $Line = JoinWords($DataValueRef, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      }
      else {
        # Unquoted comma delimited output: always quote values containing commas.
        # join() yields an empty line for an empty value list instead of reading a
        # nonexistent first element...
        $Line = join $OptionsInfo{OutDelim}, map { /\,/ ? qq("$_") : $_ } @{$DataValueRef};
      }
      print {$OutHandle} "$Line\n";
    }
    else {
      # Format and list data, one property per line...
      for my $Index (0 .. $#{$DataLabelRef}) {
        $Line = $DataLabelRef->[$Index] . ' ' . $DataValueRef->[$Index];
        print {$OutHandle} "$Line\n";
      }
    }
  }
 111 
 # List the header row...
 #
 # Joins the labels referenced by $DataLabelRef using the output delimiter and
 # quote settings in %OptionsInfo, drops every colon from the joined line and
 # writes it to OUTFILE for file output or to STDOUT otherwise.
 sub ListHeaderRowData {
   my($DataLabelRef) = @_;
   my($HeaderRow, $OutHandle);

   # Format data...
   $HeaderRow = JoinWords($DataLabelRef, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
   $HeaderRow =~ tr/://d;

   # List data...
   $OutHandle = $OptionsInfo{FileOutput} ? \*OUTFILE : \*STDOUT;
   print {$OutHandle} "$HeaderRow\n";
 }
 128 
 # List properties for amino acids...
 #
 # Goes over the amino acid IDs in $OptionsInfo{SpecifiedAminoAcidIDs} and lists,
 # for each of them, the value of every property named in
 # $OptionsInfo{SpecifiedProperies}. For file output, OUTFILE is opened here on
 # $OptionsInfo{OutFileName} and closed again once all the data is written; the
 # script dies when the file can't be opened or closed.
 sub ListAminoAcidProperties {
   my($AminoAcidID, $AminoAcidDataRef, $PropertyName, $PropertyValue, @PropertyLabels, @PropertyValues);

   print "Listing information for amino acid(s)...\n";

   if ($OptionsInfo{FileOutput}) {
     print "Generating file $OptionsInfo{OutFileName}...\n";
     # Three-argument open: the file name is used literally, so whitespace or
     # mode characters coming from the --root value can't change how it is opened...
     open OUTFILE, '>', $OptionsInfo{OutFileName} or die "Couldn't open $OptionsInfo{OutFileName}: $!\n";
   }

   # Setup property labels...
   @PropertyLabels = map { $_ . ':' } @{$OptionsInfo{SpecifiedProperies}};

   if ($OptionsInfo{AminoAcidRowsOutput}) {
     ListHeaderRowData(\@PropertyLabels);
   }

   # Go over specified amino acids...
   for $AminoAcidID (@{$OptionsInfo{SpecifiedAminoAcidIDs}}) {
     $AminoAcidDataRef = AminoAcids::GetAminoAcidPropertiesData($AminoAcidID);

     # Block style output gets a heading for each amino acid...
     if (!$OptionsInfo{AminoAcidRowsOutput}) {
       if ($OptionsInfo{FileOutput}) {
         print OUTFILE "\nListing properties for amino acid $AminoAcidID...\n\n";
       }
       else {
         print "\nListing properties for amino acid $AminoAcidID...\n\n";
       }
     }

     # Collect data..
     @PropertyValues = ();
     for $PropertyName (@{$OptionsInfo{SpecifiedProperies}}) {
       $PropertyValue = $AminoAcidDataRef->{$PropertyName};
       if (IsFloat($PropertyValue)) {
         # Round to the requested precision; adding 0 turns the formatted string
         # back into a number, which drops any trailing zeros...
         $PropertyValue = sprintf("%.$OptionsInfo{Precision}f", $PropertyValue) + 0;
       }
       push @PropertyValues, $PropertyValue;
     }
     # List data...
     ListAminoAcidData(\@PropertyLabels, \@PropertyValues);
   }
   if ($OptionsInfo{FileOutput}) {
     # Buffered write errors only surface at close, so check it...
     close OUTFILE or die "Couldn't close $OptionsInfo{OutFileName}: $!\n";
   }
   print "\n";
 }
 180 
 # Get property names from categories...
 #
 # Maps a category name (Basic, BasicPlus, BasicAndHydrophobicity or
 # BasicAndHydrophobicityPlus; matched case-insensitively) to its ordered list of
 # property names. Any other category name yields an empty list.
 sub GetPropertyNamesFromCategories {
   my($CategoryName) = @_;
   my(@IdentityNames, @StructureNames, @BasicNames, @CategoryTable, @PropertyNames);

   # Name groups shared by several categories...
   @IdentityNames = ('ThreeLetterCode', 'OneLetterCode', 'AminoAcid', 'DNACodons', 'RNACodons');
   @StructureNames = ('LinearStructure', 'LinearStructureAtpH7.4');
   @BasicNames = (@IdentityNames, 'ChemicalFormula', 'MolecularWeight', @StructureNames);

   # Category patterns paired with their property names; the first match wins...
   @CategoryTable = (
     [ qr/^Basic$/i,
       [ @BasicNames ] ],
     [ qr/^BasicPlus$/i,
       [ @IdentityNames,
         'AcidicBasic', 'PolarNonpolar', 'Charged', 'Aromatic', 'HydrophobicHydophilic',
         'IsoelectricPoint', 'pKCOOH', 'pKNH3+',
         'ChemicalFormula', 'MolecularWeight', 'ExactMass', 'ChemicalFormulaMinusH2O',
         'MolecularWeightMinusH2O(18.01524)', 'ExactMassMinusH2O(18.01056)',
         @StructureNames ] ],
     [ qr/^BasicAndHydrophobicity$/i,
       [ @BasicNames,
         'HydrophobicityEisenbergAndOthers', 'HydrophobicityHoppAndWoods', 'HydrophobicityJanin',
         'HydrophobicityKyteAndDoolittle', 'HydrophobicityRoseAndOthers',
         'HydrophobicityWolfendenAndOthers' ] ],
     [ qr/^BasicAndHydrophobicityPlus$/i,
       [ @BasicNames,
         'HydrophobicityAbrahamAndLeo', 'HydrophobicityBlack', 'HydrophobicityBullAndBreese',
         'HydrophobicityChothia', 'HydrophobicityEisenbergAndOthers',
         'HydrophobicityFauchereAndOthers', 'HydrophobicityGuy',
         'HydrophobicityHPLCAtpH3.4Cowan', 'HydrophobicityHPLCAtpH7.5Cowan',
         'HydrophobicityHPLCParkerAndOthers', 'HydrophobicityHPLCWilsonAndOthers',
         'HydrophobicityHoppAndWoods', 'HydrophobicityJanin', 'HydrophobicityKyteAndDoolittle',
         'HydrophobicityManavalanAndOthers', 'HydrophobicityMiyazawaAndOthers',
         'HydrophobicityOMHSweetAndOthers', 'HydrophobicityRaoAndArgos',
         'HydrophobicityRfMobility', 'HydrophobicityRoseAndOthers', 'HydrophobicityRoseman',
         'HydrophobicityWellingAndOthers', 'HydrophobicityWolfendenAndOthers' ] ],
   );

   @PropertyNames = ();
   ENTRY: for my $EntryRef (@CategoryTable) {
     my($CategoryPattern, $NamesRef) = @{$EntryRef};
     if ($CategoryName =~ $CategoryPattern) {
       @PropertyNames = @{$NamesRef};
       last ENTRY;
     }
   }

   return @PropertyNames;
 }
 199 
 # Process option values...
 #
 # Resets %OptionsInfo and fills it in from the command line values in %Options
 # and the amino acid IDs in @ARGV: output delimiter and quoting, output style
 # and destination, precision, the amino acid IDs and properties to list, and
 # the output file name. Dies when file output is requested, the output file
 # already exists and the overwrite option is not specified.
 sub ProcessOptions {
   %OptionsInfo = ();

   $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /^tab$/i ) ? "\t" : (($Options{outdelim} =~ /^semicolon$/i) ? "\;" : "\,");
   $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0;

   # Both stay undefined unless specified on the command line...
   $OptionsInfo{Overwrite} = $Options{overwrite};
   $OptionsInfo{OutFileRoot} = $Options{root};

   $OptionsInfo{OutputStyle} = $Options{outputstyle};

   $OptionsInfo{AminoAcidRowsOutput} = ($Options{outputstyle} =~ /^AminoAcidRows$/i) ? 1 : 0;
   $OptionsInfo{FileOutput} = ($Options{output} =~ /^File$/i) ? 1 : 0;

   $OptionsInfo{Precision} = $Options{precision};

   my($AminoAcidID, @AminoAcidIDs);

   @{$OptionsInfo{SpecifiedAminoAcidIDs}} = ();

   # Candidate amino acid IDs: command line parameters, or Ala when none is given...
   @AminoAcidIDs = ();

   if (@ARGV >= 1) {
     push @AminoAcidIDs, @ARGV;
   }
   else {
     # Setup mode specified default values...
     push @AminoAcidIDs, 'Ala';
   }

   # Generate list of amino acids...
   if (@ARGV == 1 && $ARGV[0] =~ /^All$/i) {
     # A lone "All" parameter selects every known amino acid...
     push @{$OptionsInfo{SpecifiedAminoAcidIDs}}, AminoAcids::GetAminoAcids();
   }
   else {
     # Keep the valid IDs and warn about the rest...
     for $AminoAcidID (@AminoAcidIDs) {
       if (AminoAcids::IsAminoAcid($AminoAcidID)) {
         push @{$OptionsInfo{SpecifiedAminoAcidIDs}}, $AminoAcidID;
       }
       else {
         warn "Ignoring amino acid ID, $AminoAcidID, specified using command line parameter option: Unknown amino acid ID...\n";
       }
     }
   }
   SetupSpecifiedProperties();

   # Setup output file name...
   $OptionsInfo{OutFileName} = '';
   if ($OptionsInfo{FileOutput}) {
     my($OutFileRoot, $OutFileExt);

     # Extension follows the delimiter: tsv for tab, csv otherwise...
     $OutFileRoot = '';
     $OutFileExt = "csv";
     if ($Options{outdelim} =~ /^tab$/i) {
       $OutFileExt = "tsv";
     }
     if ($Options{root}) {
       # When the root value parses into both a name and an extension, only the
       # name part is used; otherwise the root value is used as specified...
       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
       if ($RootFileName && $RootFileExt) {
         $OutFileRoot = $RootFileName;
       }
       else {
         $OutFileRoot = $Options{root};
       }
     }
     else {
       $OutFileRoot = 'AminoAcidsInfo';
     }
     $OptionsInfo{OutFileName} = $OutFileRoot . '.' . $OutFileExt;
     if (!$Options{overwrite}) {
       if (-e $OptionsInfo{OutFileName}) {
         die "Error: Output file, $OptionsInfo{OutFileName}, already exists.\nUse \"-o --overwrite\" option or specify a different name using \"-r --root\" option.\n";
       }
     }
   }
 }
 279 
 # Setup properties to list...
 #
 # Fills @{$OptionsInfo{SpecifiedProperies}} with the property names to list,
 # based on the properties, propertiesmode and propertieslisting options:
 #   . All mode: all property names; any specified properties value is ignored
 #     with a warning
 #   . No properties value: names in the Basic category
 #   . Categories mode: names in the specified category; dies for an invalid
 #     category name
 #   . Otherwise: the specified comma delimited names; unknown names are ignored
 #     with a warning
 sub SetupSpecifiedProperties {
   my($Specified);

   $OptionsInfo{Properties} = $Options{properties};
   $OptionsInfo{PropertiesMode} = $Options{propertiesmode};
   $OptionsInfo{PropertiesListing} = $Options{propertieslisting};

   # Start with an empty list; $Specified refers to the list stored in %OptionsInfo...
   @{$OptionsInfo{SpecifiedProperies}} = ();
   $Specified = $OptionsInfo{SpecifiedProperies};

   # All properties...
   if ($Options{propertiesmode} =~ /^All$/i) {
     if ($Options{properties}) {
       warn "Warning: Ignoring values specifed by \"-p --properties\" option: Not valid for All value of \"--propertiesmode\" option...\n";
     }
     push @{$Specified}, ($Options{propertieslisting} =~ /^Alphabetical$/i)
       ? AminoAcids::GetAminoAcidPropertiesNames('Alphabetical')
       : AminoAcids::GetAminoAcidPropertiesNames();
     return;
   }

   # Nothing specified: set default value...
   if (!$Options{properties}) {
     push @{$Specified}, GetPropertyNamesFromCategories('Basic');
     return;
   }

   # A category name...
   if ($Options{propertiesmode} =~ /^Categories$/i) {
     if ($Options{properties} !~ /^(Basic|BasicPlus|BasicAndHydrophobicity|BasicAndHydrophobicityPlus)$/i) {
       die "Error: The value specified, $Options{properties}, for option \"-p --properties\" in conjunction with \"Categories\" value for option \"--propertiesmode\" is not valid. Allowed values: Basic, BasicPlus, BasicAndHydrophobicity, and BasicAndHydrophobicityPlus\n";
     }
     push @{$Specified}, GetPropertyNamesFromCategories($Options{properties});
     return;
   }

   # Explicit property names: keep the known ones...
   for my $RawName (split /\,/, $Options{properties}) {
     my $PropertyName = RemoveLeadingAndTrailingWhiteSpaces($RawName);
     if (AminoAcids::IsAminoAcidProperty($PropertyName)) {
       push @{$Specified}, $PropertyName;
       next;
     }
     warn "Warning: Ignoring value, $RawName, specifed by \"-p --properties\" option: Unknown property name...\n";
   }

   if ($Options{propertieslisting} !~ /^Alphabetical$/i) {
     return;
   }

   # Alphabetical listing: ThreeLetterCode, OneLetterCode and AminoAcid are always
   # listed first, in that order, followed by the remaining names sorted, with
   # duplicates collapsed...
   my(@LeadingNames, %LeadingPresent, %RemainingNames);
   @LeadingNames = ('ThreeLetterCode', 'OneLetterCode', 'AminoAcid');
   NAME: for my $Name (@{$Specified}) {
     for my $LeadingName (@LeadingNames) {
       if ($Name =~ /^$LeadingName$/i) {
         $LeadingPresent{$LeadingName} = 1;
         next NAME;
       }
     }
     $RemainingNames{$Name} = $Name;
   }
   @{$Specified} = ((grep { $LeadingPresent{$_} } @LeadingNames), sort keys %RemainingNames);
   return;
 }
 370 
 # Setup script usage and retrieve command line arguments specified using various options...
 #
 # Resets %Options to the default option values, lets GetOptions() override them
 # from the command line, changes to the working directory when one is specified
 # and validates the option values. Dies with an error message on unrecognized
 # options or invalid values.
 sub SetupScriptUsage {

   # Default values...
   %Options = ();
   $Options{outdelim} = "comma";
   $Options{output} = "STDOUT";
   $Options{outputstyle} = "AminoAcidBlock";
   $Options{precision} = 4;
   $Options{propertiesmode} = "Categories";
   $Options{propertieslisting} = "ByGroup";
   $Options{quote} = "yes";

   # Retrieve all the options...
   if (!GetOptions(\%Options, "help|h", "outdelim=s", "output=s", "outputstyle=s", "overwrite|o", "precision=i", "properties|p=s", "propertieslisting=s", "propertiesmode=s", "quote|q=s", "root|r=s", "workingdir|w=s")) {
     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
   }
   if ($Options{workingdir}) {
     if (! -d $Options{workingdir}) {
       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
     }
     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
   }

   # Validate option values...
   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
     die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
   }
   if ($Options{output} !~ /^(STDOUT|File)$/i) {
     die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: STDOUT or File\n";
   }
   if ($Options{outputstyle} !~ /^(AminoAcidBlock|AminoAcidRows)$/i) {
     die "Error: The value specified, $Options{outputstyle}, for option \"--outputstyle\" is not valid. Allowed values: AminoAcidBlock or AminoAcidRows\n";
   }
   if (!IsPositiveInteger($Options{precision})) {
     # The precision option has no short form; -p is the short form of --properties...
     die "Error: The value specified, $Options{precision}, for option \"--precision\" is not valid. Allowed values: > 0 \n";
   }
   if ($Options{propertiesmode} !~ /^(Categories|Names|All)$/i) {
     die "Error: The value specified, $Options{propertiesmode}, for option \"--propertiesmode\" is not valid. Allowed values: Categories, Names, or All\n";
   }
   if ($Options{propertieslisting} !~ /^(ByGroup|Alphabetical)$/i) {
     die "Error: The value specified, $Options{propertieslisting}, for option \"--propertieslisting\" is not valid. Allowed values: ByGroup, or Alphabetical\n";
   }
   if ($Options{quote} !~ /^(yes|no)$/i) {
     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
   }
 }
 415