MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: InfoNucleicAcids.pl,v $
   4 # $Date: 2015/02/28 20:46:20 $
   5 # $Revision: 1.26 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2015 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use Benchmark;
  35 use FileUtil;
  36 use TextUtil;
  37 use NucleicAcids;
  38 
  39 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  40 
  41 # Autoflush STDOUT
  42 $| = 1;
  43 
  44 # Starting message...
  45 $ScriptName = basename($0);
  46 print "\n$ScriptName: Starting...\n\n";
  47 $StartTime = new Benchmark;
  48 
  49 # Get the options and setup script...
  50 SetupScriptUsage();
  51 if ($Options{help}) {
  52   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  53 }
  54 
  55 print "Processing options...\n";
  56 my(%OptionsInfo);
  57 ProcessOptions();
  58 
  59 ListNucleicAcidProperties();
  60 print "\n$ScriptName:Done...\n\n";
  61 
  62 $EndTime = new Benchmark;
  63 $TotalTime = timediff ($EndTime, $StartTime);
  64 print "Total time: ", timestr($TotalTime), "\n";
  65 
  66 ###############################################################################
  67 
  68 # List data for an nucleic acid...
  69 sub ListNucleicAcidData {
  70   my($DataLabelRef, $DataValueRef) = @_;
  71   my($Index, $Line, $Value);
  72 
  73   if ($OptionsInfo{NucleicAcidRowsOutput}) {
  74     $Line = '';
  75     # Format data...
  76     if ($OptionsInfo{OutQuote} || $Options{outdelim} !~ /^comma$/i) {
  77       $Line = JoinWords($DataValueRef, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  78     }
  79     else {
  80       # Always quote values containing commas...
  81       $Line = ($DataValueRef->[0] =~ /\,/) ? qq("$DataValueRef->[0]") : $DataValueRef->[0];
  82       for $Index (1 .. $#{$DataValueRef} ) {
  83         $Value = $DataValueRef->[$Index];
  84         if ($Value =~ /\,/) {
  85           $Value = qq("$Value");
  86         }
  87         $Line .= $OptionsInfo{OutDelim} . $Value;
  88       }
  89     }
  90     if ($OptionsInfo{FileOutput}) {
  91       print OUTFILE "$Line\n";
  92     }
  93     else {
  94       print "$Line\n";
  95     }
  96   }
  97   else {
  98     # Format and list data...
  99     $Line = '';
 100     for $Index (0 .. $#{$DataLabelRef} ) {
 101       $Line = $DataLabelRef->[$Index] . ' ' . $DataValueRef->[$Index];
 102       if ($OptionsInfo{FileOutput}) {
 103         print OUTFILE "$Line\n";
 104       }
 105       else {
 106         print "$Line\n";
 107       }
 108     }
 109   }
 110 }
 111 
 112 # List data for an nucleic acid...
 113 sub ListHeaderRowData {
 114   my($DataLabelRef) = @_;
 115   my($Line);
 116 
 117   # Format data...
 118   $Line = JoinWords($DataLabelRef, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 119   $Line =~ s/\://g;
 120   # List data...
 121   if ($OptionsInfo{FileOutput}) {
 122     print OUTFILE "$Line\n";
 123   }
 124   else {
 125     print "$Line\n";
 126   }
 127 }
 128 
 129 # List properties for nucleic acids...
 130 sub ListNucleicAcidProperties {
 131   my($NucleicAcidID, $NucleicAcidDataRef, $PropertyName, $PropertyValue, @PropertyLabels, @PropertyValues);
 132 
 133   print "Listing information for nucleic acid(s)...\n";
 134 
 135   if ($OptionsInfo{FileOutput}) {
 136     print "Generating file $OptionsInfo{OutFileName}...\n";
 137     open OUTFILE, ">$OptionsInfo{OutFileName}" or die "Couldn't open $OptionsInfo{OutFileName}: $!\n";
 138   }
 139 
 140   # Setup property labels...
 141   @PropertyLabels = ();
 142   for $PropertyName (@{$OptionsInfo{SpecifiedProperies}}) {
 143     push @PropertyLabels, ("$PropertyName:");
 144   }
 145 
 146   if ($OptionsInfo{NucleicAcidRowsOutput}) {
 147     ListHeaderRowData(\@PropertyLabels);
 148   }
 149 
 150   # Go over specified properties...
 151   for $NucleicAcidID (@{$OptionsInfo{SpecifiedNucleicAcidIDs}}) {
 152     $NucleicAcidDataRef = NucleicAcids::GetNucleicAcidPropertiesData($NucleicAcidID);
 153 
 154     if (!$OptionsInfo{NucleicAcidRowsOutput}) {
 155       if ($OptionsInfo{FileOutput}) {
 156         print OUTFILE "\nListing properties for nucleic acid $NucleicAcidID...\n\n";
 157       }
 158       else {
 159         print "\nListing properties for nucleic acid $NucleicAcidID...\n\n";
 160       }
 161     }
 162 
 163     # Collect data..
 164     @PropertyValues = ();
 165     for $PropertyName (@{$OptionsInfo{SpecifiedProperies}}) {
 166       $PropertyValue = $NucleicAcidDataRef->{$PropertyName};
 167       if (IsFloat($PropertyValue)) {
 168         $PropertyValue = sprintf("%.$OptionsInfo{Precision}f", $PropertyValue) + 0;
 169       }
 170       push @PropertyValues, $PropertyValue;
 171     }
 172     # List data...
 173     ListNucleicAcidData(\@PropertyLabels, \@PropertyValues);
 174   }
 175   if ($OptionsInfo{FileOutput}) {
 176     close OUTFILE;
 177   }
 178   print "\n";
 179 }
 180 
 181 # Get propery names from categories...
 182 sub GetPropertyNamesFromCategories {
 183   my($CategoryName) = @_;
 184   my(@PropertyNames);
 185 
 186   @PropertyNames = ();
 187   if ($CategoryName =~ /^Basic$/i) {
 188     @PropertyNames = ('Code', 'OtherCodes', 'Name', 'Type', 'MolecularFormula', 'MolecularWeight');
 189   } elsif ($CategoryName =~ /^BasicPlus$/i) {
 190     @PropertyNames = ('Code', 'OtherCodes', 'Name', 'Type', 'MolecularFormula', 'MolecularWeight', 'ExactMass', 'ElementalComposition');
 191   }
 192 
 193   return @PropertyNames;
 194 }
 195 
 196 # Process option values...
 197 sub ProcessOptions {
 198   %OptionsInfo = ();
 199 
 200   $OptionsInfo{Mode} = $Options{mode};
 201 
 202   $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /^tab$/i ) ? "\t" : (($Options{outdelim} =~ /^semicolon$/i) ? "\;" : "\,");
 203   $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0;
 204 
 205   $OptionsInfo{Overwrite} = defined $Options{overwrite} ? $Options{overwrite} : undef;
 206   $OptionsInfo{OutFileRoot} = defined $Options{root} ? $Options{root} : undef;
 207 
 208   $OptionsInfo{Output} = $Options{output};
 209   $OptionsInfo{OutputStyle} = $Options{outputstyle};
 210 
 211   $OptionsInfo{NucleicAcidRowsOutput} = ($Options{outputstyle} =~ /^NucleicAcidRows$/i) ? 1 : 0;
 212   $OptionsInfo{FileOutput} = ($Options{output} =~ /^File$/i) ? 1 : 0;
 213 
 214   $OptionsInfo{Precision} = $Options{precision};
 215 
 216   my($NucleicAcidID, @NucleicAcidIDs);
 217 
 218   @{$OptionsInfo{SpecifiedNucleicAcidIDs}} = ();
 219 
 220   # Set up Nucleic Acids IDs except for All mode...
 221   @NucleicAcidIDs = ();
 222 
 223   if (@ARGV >= 1) {
 224     push @NucleicAcidIDs, @ARGV;
 225   }
 226   else {
 227     # Setup mode specified default values...
 228     if ($Options{mode} =~ /NucleicAcidID/i) {
 229       push @NucleicAcidIDs, 'A';
 230     }
 231     elsif ($Options{mode} =~ /NucleicAcidType/i) {
 232       push @NucleicAcidIDs, 'Nucleoside';
 233     }
 234     else {
 235       push @NucleicAcidIDs, 'A';
 236     }
 237   }
 238 
 239   # Generate list of nucleic acids...
 240   if (@ARGV == 1 && $ARGV[0] =~ /^All$/i) {
 241     push @{$OptionsInfo{SpecifiedNucleicAcidIDs}}, NucleicAcids::GetNucleicAcids();
 242   }
 243   else {
 244     if ($Options{mode} =~ /NucleicAcidID/i) {
 245       ID: for $NucleicAcidID (@NucleicAcidIDs) {
 246         if (NucleicAcids::IsNucleicAcid($NucleicAcidID)) {
 247           push @{$OptionsInfo{SpecifiedNucleicAcidIDs}}, $NucleicAcidID;
 248         }
 249         else {
 250           warn "Ignoring nucleic acid ID, $NucleicAcidID, specified using command line parameter option: Unknown nucleic acid ID...\n";
 251           next ID;
 252         }
 253       }
 254     }
 255     elsif ($Options{mode} =~ /NucleicAcidType/i) {
 256       ID: for $NucleicAcidID (@NucleicAcidIDs) {
 257           if (!NucleicAcids::IsNucleicAcidType($NucleicAcidID)) {
 258             warn "Ignoring nucleic acid type, $NucleicAcidID, specified using command line parameter option: Unknown nucleic acid type...\n";
 259             next ID;
 260           }
 261           push @{$OptionsInfo{SpecifiedNucleicAcidIDs}}, NucleicAcids::GetNucleicAcidsByType($NucleicAcidID);
 262         }
 263       }
 264   }
 265   SetupSpecifiedProperties();
 266 
 267   # Setup output file name...
 268   $OptionsInfo{OutFileName} = '';
 269   if ($OptionsInfo{FileOutput}) {
 270     my($OutFileRoot, $OutFileExt);
 271 
 272     $OutFileRoot = '';
 273     $OutFileExt = "csv";
 274     if ($Options{outdelim} =~ /^tab$/i) {
 275       $OutFileExt = "tsv";
 276     }
 277     if ($Options{root}) {
 278       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
 279       if ($RootFileName && $RootFileExt) {
 280         $OutFileRoot = $RootFileName;
 281       }
 282       else {
 283         $OutFileRoot = $Options{root};
 284       }
 285     }
 286     else {
 287       $OutFileRoot = 'NucleicAcidsInfo';
 288     }
 289     $OptionsInfo{OutFileName} = $OutFileRoot . '.' . $OutFileExt;
 290     if (!$Options{overwrite}) {
 291       if (-e $OptionsInfo{OutFileName}) {
 292         die "Error: Output file, $OptionsInfo{OutFileName}, already exists.\nUse \-o --overwrite\ option or specify a different name using \"-r --root\" option.\n";
 293       }
 294     }
 295   }
 296 }
 297 
 298 # Setup properties to list...
 299 sub SetupSpecifiedProperties {
 300 
 301   $OptionsInfo{Properties} = defined $Options{properties} ? $Options{properties} : undef;
 302 
 303   $OptionsInfo{PropertiesMode} = $Options{propertiesmode};
 304   $OptionsInfo{PropertiesListing} = $Options{propertieslisting};
 305 
 306   # Make sure appropriate properties/category names are specified...
 307   @{$OptionsInfo{SpecifiedProperies}} = ();
 308   if ($Options{properties} && ($Options{propertiesmode} =~ /^All$/i) ) {
 309     warn "Warning: Ignoring values specifed by \"-p --properties\" option: Not valid for All value of \"--propertiesmode\" option...\n";
 310   }
 311   if ($Options{propertiesmode} =~ /^All$/i) {
 312     if ($Options{propertieslisting} =~ /^Alphabetical$/i) {
 313       push @{$OptionsInfo{SpecifiedProperies}}, NucleicAcids::GetNucleicAcidPropertiesNames('Alphabetical');
 314     }
 315     else {
 316       push @{$OptionsInfo{SpecifiedProperies}}, NucleicAcids::GetNucleicAcidPropertiesNames();
 317     }
 318   }
 319   else {
 320     if ($Options{properties}) {
 321       if ($Options{propertiesmode} =~ /^Categories$/i) {
 322         # Check category name...
 323         if ($Options{properties} !~ /^(Basic|BasicPlus)$/i) {
 324           die "Error: The value specified, $Options{properties}, for option \"-p --properties\" in conjunction with \"Categories\" value for option \"--propertiesmode\" is not valid. Allowed values: Basic and BasicPlus\n";
 325         }
 326         # Set propertynames...
 327         push @{$OptionsInfo{SpecifiedProperies}}, GetPropertyNamesFromCategories($Options{properties});
 328       }
 329       else {
 330         # Check property names..
 331         my($Name, $PropertyName, @Names);
 332         @Names = split /\,/, $Options{properties};
 333         NAME: for $Name (@Names) {
 334           $PropertyName = RemoveLeadingAndTrailingWhiteSpaces($Name);
 335           if (NucleicAcids::IsNucleicAcidProperty($PropertyName)) {
 336             push @{$OptionsInfo{SpecifiedProperies}}, $PropertyName;
 337           }
 338           else {
 339             warn "Warning: Ignoring value, $Name, specifed by \"-p --properties\" option: Unknown property name...\n";
 340           }
 341         }
 342         if ($Options{propertieslisting} =~ /^Alphabetical$/i) {
 343           # Code, OtherCodes and Name are always listed first...
 344           my($CodePresent, $OtherCodesPresent, $NamePresent,  @AlphabeticalProperties, %PropertiesMap);
 345           %PropertiesMap = ();
 346           @AlphabeticalProperties = ();
 347           $CodePresent = 0; $OtherCodesPresent = 0; $NamePresent = 0;
 348           NAME: for $Name (@{$OptionsInfo{SpecifiedProperies}}) {
 349             if ($Name =~ /^Code$/i) {
 350               $CodePresent = 1;
 351               next NAME;
 352             }
 353             if ($Name =~ /^OtherCodes$/i) {
 354               $OtherCodesPresent = 1;
 355               next NAME;
 356             }
 357             if ($Name =~ /^Name$/i) {
 358               $NamePresent = 1;
 359               next NAME;
 360             }
 361             $PropertiesMap{$Name} = $Name;
 362           }
 363           # Setup the alphabetical list...
 364           if ($CodePresent) {
 365             push @AlphabeticalProperties, 'Code';
 366           }
 367           if ($OtherCodesPresent) {
 368             push @AlphabeticalProperties, 'OtherCodesPresent';
 369           }
 370           if ($NamePresent) {
 371             push @AlphabeticalProperties, 'Name';
 372           }
 373           for $Name (sort keys %PropertiesMap) {
 374             push @AlphabeticalProperties, $Name;
 375           }
 376           @{$OptionsInfo{SpecifiedProperies}} = ();
 377           push @{$OptionsInfo{SpecifiedProperies}}, @AlphabeticalProperties;
 378         }
 379       }
 380     }
 381     else {
 382       # Set default value...
 383       push @{$OptionsInfo{SpecifiedProperies}}, GetPropertyNamesFromCategories('Basic');
 384     }
 385   }
 386 }
 387 
 388 # Setup script usage  and retrieve command line arguments specified using various options...
 389 sub SetupScriptUsage {
 390 
 391   # Retrieve all the options...
 392   %Options = ();
 393   $Options{mode} = "NucleicAcidID";
 394   $Options{outdelim} = "comma";
 395   $Options{output} = "STDOUT";
 396   $Options{outputstyle} = "NucleicAcidBlock";
 397   $Options{precision} = 4;
 398   $Options{propertiesmode} = "Categories";
 399   $Options{propertieslisting} = "ByGroup";
 400   $Options{quote} = "yes";
 401 
 402   if (!GetOptions(\%Options, "help|h", "mode|m=s", "outdelim=s", "output=s", "outputstyle=s", "overwrite|o", "precision=i", "properties|p=s", "propertieslisting=s", "propertiesmode=s", "quote|q=s", "root|r=s", "workingdir|w=s")) {
 403     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 404   }
 405   if ($Options{workingdir}) {
 406     if (! -d $Options{workingdir}) {
 407       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 408     }
 409     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 410   }
 411   if ($Options{mode} !~ /^(NucleicAcidID|NucleicAcidType)$/i) {
 412     die "Error: The value specified, $Options{mode}, for option \"--mode\" is not valid. Allowed values: NucleicAcidID or NucleicAcidType\n";
 413   }
 414   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
 415     die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
 416   }
 417   if ($Options{output} !~ /^(STDOUT|File)$/i) {
 418     die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: STDOUT or File\n";
 419   }
 420   if ($Options{outputstyle} !~ /^(NucleicAcidBlock|NucleicAcidRows)$/i) {
 421     die "Error: The value specified, $Options{outputstyle}, for option \"--outputstyle\" is not valid. Allowed values: NucleicAcidBlock or NucleicAcidRows\n";
 422   }
 423   if (!IsPositiveInteger($Options{precision})) {
 424     die "Error: The value specified, $Options{precision}, for option \"-p --precision\" is not valid. Allowed values: > 0 \n";
 425   }
 426   if ($Options{propertiesmode} !~ /^(Categories|Names|All)$/i) {
 427     die "Error: The value specified, $Options{propertiesmode}, for option \"--propertiesmode\" is not valid. Allowed values: Categories, Names, or All\n";
 428   }
 429   if ($Options{propertieslisting} !~ /^(ByGroup|Alphabetical)$/i) {
 430     die "Error: The value specified, $Options{propertieslisting}, for option \"--propertieslisting\" is not valid. Allowed values: ByGroup, or Alphabetical\n";
 431   }
 432   if ($Options{quote} !~ /^(yes|no)$/i) {
 433     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
 434   }
 435 }
 436