package NucleicAcids;
#
# $RCSfile: NucleicAcids.pm,v $
# $Date: 2015/02/28 20:47:18 $
# $Revision: 1.25 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2015 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use Carp;
# Exporter is loaded explicitly: @ISA below names it as the base class, and
# relying on another module to have loaded it first would make exporting
# depend on load order.
use Exporter;
use Text::ParseWords;
use TextUtil;
use FileUtil;

# Nothing is exported by default; every public function must be requested by
# name or through the ':all' tag.
our @ISA = qw(Exporter);
our @EXPORT = qw();
our @EXPORT_OK = qw(
  GetNucleicAcids
  GetNucleicAcidsByType
  GetNucleicAcidPropertiesData
  GetNucleicAcidPropertiesNames
  IsNucleicAcid
  IsNucleicAcidProperty
  IsNucleicAcidType
);

our %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]);

#
# Load nucleic acids data...
#
# These file-scoped lexicals hold all the loaded data and are shared by the
# functions defined below; they are filled once, when the module is loaded.
#
my(%NucleicAcidDataMap, %NucleicAcidCodeMap, %NucleicAcidOtherCodeMap, %NucleicAcidNameMap, @NucleicAcidCodes, @NucleicAcidPropertyNames, %NucleicAcidPropertyNamesMap, %NucleicAcidTypesMap);
_LoadNucleicAcidsData();

#
# Get a list of all known nucleic acids as one of these values:
# code or nucleic acid name...
#
# Arguments:
#   NameType - optional. 'Name' (case insensitive) selects nucleic acid names;
#              any other value, or no argument at all, selects codes.
#
# Returns the values in the order the codes were loaded: a list in list
# context, a reference to an array in scalar context.
#
sub GetNucleicAcids {
  my $NameType = @_ ? $_[0] : 'Code';
  my $UseNames = ($NameType =~ /^Name$/i) ? 1 : 0;

  my @NucleicAcidNames = ();
  for my $Code (@NucleicAcidCodes) {
    push @NucleicAcidNames, ($UseNames ? $NucleicAcidDataMap{$Code}{Name} : $Code);
  }

  return (wantarray ? @NucleicAcidNames : \@NucleicAcidNames);
}

#
# Get a list of all known nucleic acids by one of these specified types:
# Nucleobase, Nucleoside, Deoxynucleoside, Nucleotide, Deoxynucleotide. Default: Nucleoside
#
# Arguments:
#   Type     - optional. Type to select, compared case insensitively against
#              the Type property of each nucleic acid.
#   NameType - optional. 'Name' (case insensitive) selects names; anything
#              else selects codes. Default: Code
#
# Exactly one argument is taken as Type and exactly two as (Type, NameType);
# for any other argument count both defaults are used.
#
# Returns a list in list context and a reference to an array in scalar context.
#
sub GetNucleicAcidsByType {
  my($Type, $NameType) = ('Nucleoside', 'Code');
  if (@_ == 2) {
    ($Type, $NameType) = @_;
  }
  elsif (@_ == 1) {
    ($Type) = @_;
  }
  my $UseNames = ($NameType =~ /^Name$/i) ? 1 : 0;

  # The type is matched literally: \Q...\E keeps regex metacharacters in the
  # caller's string from changing what is matched or breaking the pattern.
  # The pattern is compiled once, outside the loop.
  my $TypePattern = qr/^\Q$Type\E$/i;

  my @NucleicAcidNames = ();
  CODE: for my $Code (@NucleicAcidCodes) {
    if ($NucleicAcidDataMap{$Code}{Type} !~ $TypePattern) {
      next CODE;
    }
    push @NucleicAcidNames, ($UseNames ? $NucleicAcidDataMap{$Code}{Name} : $Code);
  }

  return (wantarray ? @NucleicAcidNames : \@NucleicAcidNames);
}

#
# Get all available properties data for an nucleic acid using any of these symbols:
# code, other code or name.
#
# A reference to a hash array is returned with keys and values representing property
# name and its values respectively.
#
# The reference points at the module's own data for that nucleic acid, not at
# a copy. undef is returned for an unknown ID.
#
sub GetNucleicAcidPropertiesData {
  my($NucleicAcidID) = @_;

  my $Code = _ValidateNucleicAcidID($NucleicAcidID);
  if (!$Code) {
    # An explicit undef (rather than a bare return) is kept so that callers
    # using list context still receive exactly one element.
    return undef;
  }
  return $NucleicAcidDataMap{$Code};
}

#
# Get names of all available nucleic acid properties. A reference to an array containing
# names of all available properties is returned.
#
# Arguments:
#   Mode - optional. 'Alphabetical' (case insensitive) lists Code, OtherCodes
#          and Name first and the remaining names sorted; any other value
#          lists names in the order of the data file column labels.
#          Default: ByGroup
#
# Returns a list in list context and a reference to an array in scalar context.
#
sub GetNucleicAcidPropertiesNames {
  my $Mode = (@_ == 1) ? $_[0] : 'ByGroup';

  my @PropertyNames = ();
  if ($Mode =~ /^Alphabetical$/i) {
    # Code, OtherCodes and Name are always listed first...
    push @PropertyNames, qw(Code OtherCodes Name);
    push @PropertyNames, grep { !/^(?:Code|OtherCodes|Name)$/ } sort keys %NucleicAcidPropertyNamesMap;
  }
  else {
    push @PropertyNames, @NucleicAcidPropertyNames;
  }
  return (wantarray ? @PropertyNames : \@PropertyNames);
}

#
# Is it a known nucleic acid? Input is either a code or a name
#
# Returns 1 or 0.
#
sub IsNucleicAcid {
  my($NucleicAcidID) = @_;

  return _ValidateNucleicAcidID($NucleicAcidID) ? 1 : 0;
}

#
# Is it an available nucleic acid property?
#
# The name is compared case sensitively against the data file column labels.
# Returns 1 or 0.
#
sub IsNucleicAcidProperty {
  my($PropertyName) = @_;

  return (defined($PropertyName) && exists($NucleicAcidPropertyNamesMap{$PropertyName})) ? 1 : 0;
}

#
# Is it an available nucleic acid type?
#
# The type is compared case insensitively. Returns 1 or 0.
#
sub IsNucleicAcidType {
  my($Type) = @_;

  return (defined($Type) && exists($NucleicAcidTypesMap{lc($Type)})) ? 1 : 0;
}

#
# Implements GetNucleicAcid<PropertyName> for a valid property name.
#
# Arguments:
#   NucleicAcidID - code, other code or name of a nucleic acid.
#
# Returns the property value, or undef for an unknown nucleic acid ID. Croaks
# when the function name doesn't start with Get or doesn't correspond to a
# known property name.
#
sub AUTOLOAD {
  my($NucleicAcidID) = @_;

  our $AUTOLOAD;
  my $FunctionName = $AUTOLOAD;
  $FunctionName =~ s/.*:://;

  # Never treat an implicit destructor call as a property request...
  return if $FunctionName eq 'DESTROY';

  # Only Get<PropertyName> functions are supported...
  if ($FunctionName !~ /^Get/) {
    croak "Error: Function, NucleicAcids::$FunctionName, is not supported by AUTOLOAD in NucleicAcids module: Only Get<PropertyName> functions are implemented...";
  }

  my $PropertyName = $FunctionName;
  $PropertyName =~ s/^GetNucleicAcid//;
  if (!exists $NucleicAcidPropertyNamesMap{$PropertyName}) {
    croak "Error: Function, NucleicAcids::$FunctionName, is not supported by AUTOLOAD in NucleicAcids module: Unknown nucleic acid property name, $PropertyName, specified...";
  }

  my $Code = _ValidateNucleicAcidID($NucleicAcidID);
  if (!$Code) {
    return undef;
  }
  return $NucleicAcidDataMap{$Code}{$PropertyName};
}

#
# Load NucleicAcidsData.csv files from <MayaChemTools>/lib directory...
#
# Croaks when the data file doesn't exist.
#
sub _LoadNucleicAcidsData {
  my $MayaChemToolsLibDir = GetMayaChemToolsLibDirName();
  my $NucleicAcidsDataFile = "$MayaChemToolsLibDir" . "/data/NucleicAcidsData.csv";

  if (! -e $NucleicAcidsDataFile) {
    croak "Error: MayaChemTools package file, $NucleicAcidsDataFile, is missing: Possible installation problems...";
  }

  _LoadData($NucleicAcidsDataFile);
}

#
# Load NucleicAcidsData.csv file from <MayaChemTools>/lib directory...
#
# Arguments:
#   NucleicAcidsDataFile - path of the CSV data file to read.
#
# All module-level maps are cleared and rebuilt from the file. Croaks when the
# file can't be opened, has no column labels line, or contains a data line
# whose number of fields differs from the number of column labels. A repeated
# code is reported with carp and skipped.
#
sub _LoadData {
  my($NucleicAcidsDataFile) = @_;

  %NucleicAcidDataMap = ();
  @NucleicAcidCodes = ();
  @NucleicAcidPropertyNames = ();
  %NucleicAcidPropertyNamesMap = ();
  %NucleicAcidCodeMap = ();
  %NucleicAcidOtherCodeMap = ();
  %NucleicAcidNameMap = ();
  %NucleicAcidTypesMap = ();

  # Load property data for all nucleic acids...
  #
  # File Format:
  # "Code","OtherCodes","BasePair","Name","Type","ChemicalFormula","ChemicalFormulaAtpH7.5","MolecularWeight","ExactMass","ElementalComposition"
  #
  my $InDelim = "\,";

  # Three-argument open with a lexical handle: the file name can't be
  # misread as an open mode, and the handle is released automatically if
  # this function croaks part way through the file.
  open my $DataFileHandle, '<', $NucleicAcidsDataFile or croak "Couldn't open $NucleicAcidsDataFile: $! ...";

  # Skip lines up to column labels...
  my $Line;
  LINE: while ($Line = GetTextLine($DataFileHandle)) {
    if ($Line !~ /^#/) {
      last LINE;
    }
  }
  if (!$Line) {
    croak "Error: MayaChemTools package file, $NucleicAcidsDataFile, doesn't contain column labels: Possible installation problems...";
  }
  my @ColLabels = quotewords($InDelim, 0, $Line);
  my $NumOfCols = @ColLabels;

  # Extract property names from column labels...
  @NucleicAcidPropertyNames = @ColLabels;
  @NucleicAcidPropertyNamesMap{@ColLabels} = @ColLabels;

  # Process nucleic acid data...
  LINE: while ($Line = GetTextLine($DataFileHandle)) {
    if ($Line =~ /^#/) {
      next LINE;
    }
    my @LineWords = quotewords($InDelim, 0, $Line);
    my $NumOfFields = @LineWords;
    if ($NumOfFields != $NumOfCols) {
      # Report the number of fields found, as the message text promises...
      croak "Error: The number of data fields, $NumOfFields, in $NucleicAcidsDataFile must be $NumOfCols.\nLine: $Line...";
    }
    my $Code = $LineWords[0];
    if (exists $NucleicAcidDataMap{$Code}) {
      carp "Warning: Ignoring data for nucleic acid $Code: It has already been loaded.\nLine: $Line....";
      next LINE;
    }

    # Store all the values keyed by property name...
    push @NucleicAcidCodes, $Code;
    my %PropertyData = ();
    @PropertyData{@NucleicAcidPropertyNames} = @LineWords;
    $NucleicAcidDataMap{$Code} = \%PropertyData;
  }
  close $DataFileHandle;

  # Setup one letter and nucleic acid name maps...
  _SetupNucleicAcidIDMap();
}

#
# Setup lowercase other codes and name maps pointing
# to code as shown in data file.
#
# Codes are processed in the order they were loaded, so when two nucleic
# acids share a name or an other code the result doesn't depend on hash
# ordering: the one loaded last is kept. For types, the spelling seen first
# is kept.
#
sub _SetupNucleicAcidIDMap {
  %NucleicAcidCodeMap = ();
  %NucleicAcidOtherCodeMap = ();
  %NucleicAcidNameMap = ();
  %NucleicAcidTypesMap = ();

  for my $Code (@NucleicAcidCodes) {
    $NucleicAcidCodeMap{lc($Code)} = $Code;

    my $NucleicAcidName = $NucleicAcidDataMap{$Code}{Name};
    $NucleicAcidNameMap{lc($NucleicAcidName)} = $Code;

    # Check for the same lowercase key that is stored...
    my $NucleicAcidType = $NucleicAcidDataMap{$Code}{Type};
    if (!exists $NucleicAcidTypesMap{lc($NucleicAcidType)}) {
      $NucleicAcidTypesMap{lc($NucleicAcidType)} = $NucleicAcidType;
    }

    OTHERCODE: for my $OtherCode (split /\,/, $NucleicAcidDataMap{$Code}{OtherCodes}) {
      # Trim before testing for emptiness so that an entry containing only
      # white space isn't stored under an empty key...
      my $TrimmedCode = RemoveLeadingAndTrailingWhiteSpaces($OtherCode);
      if (!defined($TrimmedCode) || $TrimmedCode eq '') {
        next OTHERCODE;
      }
      $NucleicAcidOtherCodeMap{lc($TrimmedCode)} = $Code;
    }
  }
}

# Validate Nucleic acid ID...
#
# Arguments:
#   NucleicAcidID - code, other code or name; compared case insensitively.
#
# The code map is searched first, then the other codes map, then the names
# map. Returns the code as shown in the data file, or undef when the ID is
# undefined or unknown.
#
sub _ValidateNucleicAcidID {
  my($NucleicAcidID) = @_;

  if (!defined $NucleicAcidID) {
    return undef;
  }

  my $LookupKey = lc($NucleicAcidID);
  for my $IDMapRef (\%NucleicAcidCodeMap, \%NucleicAcidOtherCodeMap, \%NucleicAcidNameMap) {
    if (exists $IDMapRef->{$LookupKey}) {
      return $IDMapRef->{$LookupKey};
    }
  }
  return undef;
}
