MayaChemTools

   1 package NucleicAcids;
   2 #
   3 # $RCSfile: NucleicAcids.pm,v $
   4 # $Date: 2015/02/28 20:47:18 $
   5 # $Revision: 1.25 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2015 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use Carp;
  31 use Text::ParseWords;
  32 use TextUtil;
  33 use FileUtil;
  34 
  use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);

  # Load Exporter explicitly: naming it in @ISA does not load it, so do not
  # depend on another module having loaded it first.
  use Exporter;

  @ISA = qw(Exporter);

  # Nothing is exported by default; the public functions are exported on request.
  @EXPORT = qw();
  @EXPORT_OK = qw(GetNucleicAcids GetNucleicAcidsByType GetNucleicAcidPropertiesData GetNucleicAcidPropertiesNames IsNucleicAcid IsNucleicAcidProperty IsNucleicAcidType);

  # The "all" tag covers everything listed in @EXPORT and @EXPORT_OK.
  %EXPORT_TAGS = (all  => [@EXPORT, @EXPORT_OK]);

  #
  # Load nucleic acids data...
  #
  # File-scoped tables filled in while loading:
  #
  #   %NucleicAcidDataMap           code => { property name => value }
  #   @NucleicAcidCodes             codes in the order they were loaded
  #   @NucleicAcidPropertyNames     property names in column order
  #   %NucleicAcidPropertyNamesMap  property name => property name
  #   %NucleicAcidCodeMap           lowercase code => code
  #   %NucleicAcidOtherCodeMap      lowercase other code => code
  #   %NucleicAcidNameMap           lowercase name => code
  #   %NucleicAcidTypesMap          lowercase type => type
  #
  my(%NucleicAcidDataMap, %NucleicAcidCodeMap, %NucleicAcidOtherCodeMap, %NucleicAcidNameMap, @NucleicAcidCodes, @NucleicAcidPropertyNames, %NucleicAcidPropertyNamesMap, %NucleicAcidTypesMap);
  _LoadNucleicAcidsData();
  48 
  #
  # Get all known nucleic acids, each identified by its code (the default) or,
  # when 'Name' (in any letter case) is passed as the first argument, by its
  # nucleic acid name...
  #
  # A list is returned in list context and a reference to an array otherwise.
  #
  sub GetNucleicAcids {
    my($NameType) = (@_ >= 1) ? $_[0] : 'Code';

    # Decide once whether names or codes are wanted...
    my $UseNames = ($NameType =~ /^Name$/i) ? 1 : 0;

    my @Identifiers = map { $UseNames ? $NucleicAcidDataMap{$_}{Name} : $_ } @NucleicAcidCodes;

    return (wantarray ? @Identifiers : \@Identifiers);
  }
  73 
  #
  # Get a list of all known nucleic acids by one of these specified types:
  # Nucleobase, Nucleoside, Deoxynucleoside, Nucleotide, Deoxynucleotide. Default: Nucleoside
  #
  # Arguments:
  #   Type     - nucleic acid type to select; compared without regard to letter case.
  #   NameType - 'Name' (any letter case) to get names; anything else gets codes.
  #
  # With one argument only Type is set; with two arguments both are set; any other
  # number of arguments leaves both defaults in place.
  #
  # A list is returned in list context and a reference to an array otherwise.
  #
  sub GetNucleicAcidsByType {
    my($Type, $NameType) = ('Nucleoside', 'Code');

    if (@_ == 2) {
      ($Type, $NameType) = @_;
    }
    elsif (@_ == 1) {
      ($Type) = @_;
    }

    my $UseNames = ($NameType =~ /^Name$/i) ? 1 : 0;

    # Collect names...
    my @NucleicAcidNames = ();
    for my $Code (@NucleicAcidCodes) {
      # \Q...\E makes the requested type match literally, so characters such as
      # '.', '(' or '|' in it are not treated as regular expression syntax.
      if ($NucleicAcidDataMap{$Code}{Type} !~ /^\Q$Type\E$/i) {
        next;
      }
      push @NucleicAcidNames, ($UseNames ? $NucleicAcidDataMap{$Code}{Name} : $Code);
    }

    return (wantarray ? @NucleicAcidNames : \@NucleicAcidNames);
  }
 105 
 #
 # Get all available properties data for a nucleic acid identified by any of
 # these: code, other code or name.
 #
 # A reference to a hash is returned with property names as keys and property
 # values as values; it refers to the data held by this module, not to a copy.
 # Undef is returned for an unknown nucleic acid.
 #
 sub GetNucleicAcidPropertiesData {
   my($NucleicAcidID) = @_;

   my $Code = _ValidateNucleicAcidID($NucleicAcidID);
   if (!$Code) {
     return undef;
   }

   return \%{$NucleicAcidDataMap{$Code}};
 }
 124 
 #
 # Get names of all available nucleic acid properties.
 #
 # An optional single argument selects the order: 'Alphabetical' (any letter
 # case) lists Code, OtherCodes and Name first and the remaining names in sorted
 # order; any other value, or no argument, lists the names in the order in which
 # they were loaded.
 #
 # A list is returned in list context and a reference to an array otherwise.
 #
 sub GetNucleicAcidPropertiesNames {
   my($Mode) = (@_ == 1) ? $_[0] : 'ByGroup';
   my(@PropertyNames);

   if ($Mode !~ /^Alphabetical$/i) {
     @PropertyNames = @NucleicAcidPropertyNames;
   }
   else {
     # Code, OtherCodes and Name are always listed first...
     @PropertyNames = (
       qw(Code OtherCodes Name),
       grep { !/^(Code|OtherCodes|Name)$/ } sort keys %NucleicAcidPropertyNamesMap
     );
   }

   return (wantarray ? @PropertyNames : \@PropertyNames);
 }
 154 
 #
 # Is it a known nucleic acid? The ID is checked the same way as by
 # _ValidateNucleicAcidID. Returns 1 when it is known and 0 otherwise.
 #
 sub IsNucleicAcid {
   my($NucleicAcidID) = @_;

   if (_ValidateNucleicAcidID($NucleicAcidID)) {
     return 1;
   }
   return 0;
 }
 166 
 #
 # Is it an available nucleic acid property? The name must match a loaded
 # property name exactly, including letter case. Returns 1 or 0.
 #
 sub IsNucleicAcidProperty {
   my($PropertyName) = @_;

   if (exists $NucleicAcidPropertyNamesMap{$PropertyName}) {
     return 1;
   }
   return 0;
 }
 178 
 #
 # Is it an available nucleic acid type? The type is lowercased before it is
 # looked up. Returns 1 or 0.
 #
 sub IsNucleicAcidType {
   my($Type) = @_;

   my $TypeKey = lc($Type);

   return (exists($NucleicAcidTypesMap{$TypeKey}) ? 1 : 0);
 }
 190 
 #
 # Implements GetNucleicAcid<PropertyName> for a valid property name.
 #
 # The first argument identifies the nucleic acid and is checked by
 # _ValidateNucleicAcidID. The value of the property is returned, or undef when
 # the nucleic acid is not known.
 #
 # Croaks when the function name doesn't start with Get or when the remainder of
 # the name, after GetNucleicAcid is removed, is not a loaded property name.
 #
 sub AUTOLOAD {
   my($NucleicAcidID) = @_;
   my($FunctionName, $PropertyName, $Code);

   # $AUTOLOAD holds the fully qualified name of the function that was called;
   # drop everything up to and including the last '::' to get the bare name.
   use vars qw($AUTOLOAD);
   $FunctionName = $AUTOLOAD;
   $FunctionName =~ s/.*:://;

   # Only Get<PropertyName> functions are supported...
   if ($FunctionName !~ /^Get/) {
     croak "Error: Function, NucleicAcids::$FunctionName, is not supported by AUTOLOAD in NucleicAcids module: Only Get<PropertyName> functions are implemented...";
   }

   $PropertyName = $FunctionName;
   $PropertyName =~ s/^GetNucleicAcid//;
   if (!exists $NucleicAcidPropertyNamesMap{$PropertyName}) {
     croak "Error: Function, NucleicAcids::$FunctionName, is not supported by AUTOLOAD in NucleicAcids module: Unknown nucleic acid property name, $PropertyName, specified...";
   }

   if (!($Code = _ValidateNucleicAcidID($NucleicAcidID))) {
     return undef;
   }

   return $NucleicAcidDataMap{$Code}{$PropertyName};
 }
 221 
 #
 # Load data/NucleicAcidsData.csv found under the directory reported by
 # GetMayaChemToolsLibDirName...
 #
 # Croaks when the file does not exist.
 #
 sub _LoadNucleicAcidsData {
   my $NucleicAcidsDataFile = GetMayaChemToolsLibDirName() . "/data/NucleicAcidsData.csv";

   unless (-e $NucleicAcidsDataFile) {
     croak "Error: MayaChemTools package file, $NucleicAcidsDataFile, is missing: Possible installation problems...";
   }

   _LoadData($NucleicAcidsDataFile);
 }
 238 
 #
 # Load nucleic acid property data from the specified CSV file into the
 # file-scoped maps and lists, replacing whatever was loaded before...
 #
 # Lines starting with '#' are skipped. The first remaining line supplies the
 # column labels, which become the property names; each line after it describes
 # one nucleic acid and is stored under the value in its first column.
 #
 # File Format:
 # "Code","OtherCodes","BasePair","Name","Type","ChemicalFormula","ChemicalFormulaAtpH7.5","MolecularWeight","ExactMass","ElementalComposition"
 #
 # Croaks when the file can't be opened or when a data line doesn't have as many
 # fields as the column labels line. A line whose code has already been loaded
 # is ignored with a warning.
 #
 sub _LoadData {
   my($NucleicAcidsDataFile) = @_;

   %NucleicAcidDataMap = ();
   @NucleicAcidCodes = ();
   @NucleicAcidPropertyNames = ();
   %NucleicAcidPropertyNamesMap = ();
   %NucleicAcidCodeMap = ();
   %NucleicAcidOtherCodeMap = ();
   %NucleicAcidNameMap = ();
   %NucleicAcidTypesMap = ();

   my($Code, $Line, $NumOfCols, $NumOfFields, $InDelim, $Name, @LineWords, @ColLabels);

   $InDelim = "\,";

   # Three-argument open: the file name is used as is and can never be taken
   # for an open mode or a command.
   open(NUCLEICACIDSDATAFILE, '<', $NucleicAcidsDataFile) or croak "Couldn't open $NucleicAcidsDataFile: $! ...";

   # Skip lines up to column labels...
   LINE: while ($Line = GetTextLine(\*NUCLEICACIDSDATAFILE)) {
     if ($Line !~ /^#/) {
       last LINE;
     }
   }
   @ColLabels = quotewords($InDelim, 0, $Line);
   $NumOfCols = @ColLabels;

   # Extract property names from column labels...
   for $Name (@ColLabels) {
     push @NucleicAcidPropertyNames, $Name;

     # Store property names...
     $NucleicAcidPropertyNamesMap{$Name} = $Name;
   }

   # Process nucleic acid data...
   LINE: while ($Line = GetTextLine(\*NUCLEICACIDSDATAFILE)) {
     if ($Line =~ /^#/) {
       next LINE;
     }
     @LineWords = quotewords($InDelim, 0, $Line);

     # Report the count of fields; interpolating @LineWords into the message
     # would print the field values instead.
     $NumOfFields = @LineWords;
     if ($NumOfFields != $NumOfCols) {
       croak "Error: The number of data fields, $NumOfFields, in $NucleicAcidsDataFile must be $NumOfCols.\nLine: $Line...";
     }

     $Code = $LineWords[0];
     if (exists $NucleicAcidDataMap{$Code}) {
       carp "Warning: Ignoring data for nucleic acid $Code: It has already been loaded.\nLine: $Line....";
       next LINE;
     }

     # Store all the values: the field count was checked above, so property
     # names and fields pair up one to one...
     push @NucleicAcidCodes, $Code;
     %{$NucleicAcidDataMap{$Code}} = ();
     @{$NucleicAcidDataMap{$Code}}{@NucleicAcidPropertyNames} = @LineWords;
   }
   close NUCLEICACIDSDATAFILE;

   # Setup code, other code, name and type maps...
   _SetupNucleicAcidIDMap();
 }
 313 
 #
 # Setup lowercase code, other code, name and type maps from the loaded data.
 # The code, other code and name maps point to the code as shown in the data
 # file; the type map points to the type as shown in the data file.
 #
 sub _SetupNucleicAcidIDMap {
   my($Code, @OtherCodes, $OtherCode, $NucleicAcidName, $NucleicAcidType, $TypeKey);

   %NucleicAcidCodeMap = ();
   %NucleicAcidOtherCodeMap = ();
   %NucleicAcidNameMap = ();
   %NucleicAcidTypesMap = ();

   # Go over the codes in sorted order so that, should two entries share a name,
   # an other code or a type, the same one wins on every run.
   for $Code (sort keys %NucleicAcidDataMap) {
     $NucleicAcidCodeMap{lc($Code)} = $Code;

     $NucleicAcidName = $NucleicAcidDataMap{$Code}{Name};
     $NucleicAcidNameMap{lc($NucleicAcidName)} = $Code;

     # Test and store under the same lowercase key, so the first spelling seen
     # for a type is the one that is kept.
     $NucleicAcidType = $NucleicAcidDataMap{$Code}{Type};
     $TypeKey = lc($NucleicAcidType);
     if (! exists $NucleicAcidTypesMap{$TypeKey}) {
       $NucleicAcidTypesMap{$TypeKey} = $NucleicAcidType;
     }

     @OtherCodes = split /\,/, $NucleicAcidDataMap{$Code}{OtherCodes};
     OTHERCODE: for $OtherCode (@OtherCodes) {
       if (!$OtherCode) {
         next OTHERCODE;
       }
       $OtherCode = RemoveLeadingAndTrailingWhiteSpaces($OtherCode);

       # An entry made up of white space only is empty once trimmed; don't
       # store it under an empty key.
       if (!defined($OtherCode) || $OtherCode eq '') {
         next OTHERCODE;
       }
       $NucleicAcidOtherCodeMap{lc($OtherCode)} = $Code;
     }
   }
 }
 347 
 # Validate nucleic acid ID and get its code as shown in the data file...
 #
 # The ID is lowercased and looked up as a code, then as an other code and then
 # as a name; the first map containing it supplies the code. Undef is returned
 # for an ID that is in none of them.
 sub _ValidateNucleicAcidID {
   my($NucleicAcidID) = @_;
   my($IDKey, $IDMapRef);

   $IDKey = lc($NucleicAcidID);

   for $IDMapRef (\%NucleicAcidCodeMap, \%NucleicAcidOtherCodeMap, \%NucleicAcidNameMap) {
     if (exists $IDMapRef->{$IDKey}) {
       return $IDMapRef->{$IDKey};
     }
   }
   return undef;
 }
 364 
 365