view mayachemtools/lib/NucleicAcids.pm @ 0:73ae111cf86f draft

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 11:55:01 -0500
parents
children
line wrap: on
line source

package NucleicAcids;
#
# $RCSfile: NucleicAcids.pm,v $
# $Date: 2015/02/28 20:47:18 $
# $Revision: 1.25 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2015 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose.  See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use Carp;
use Text::ParseWords;
use TextUtil;
use FileUtil;

use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);

# Exporter is named in @ISA below, so load it explicitly here instead of
# depending on some other module having loaded it already.
require Exporter;

@ISA = qw(Exporter);
@EXPORT = qw();
@EXPORT_OK = qw(GetNucleicAcids GetNucleicAcidsByType GetNucleicAcidPropertiesData GetNucleicAcidPropertiesNames IsNucleicAcid IsNucleicAcidProperty IsNucleicAcidType);

# Nothing is exported by default; the ":all" tag pulls in everything listed in
# @EXPORT_OK.
%EXPORT_TAGS = (all  => [@EXPORT, @EXPORT_OK]);

#
# Load nucleic acids data...
#
# These file-scoped variables are filled once, when the module is compiled, and
# are read by the functions defined below.
#
my(%NucleicAcidDataMap, %NucleicAcidCodeMap, %NucleicAcidOtherCodeMap, %NucleicAcidNameMap, @NucleicAcidCodes, @NucleicAcidPropertyNames, %NucleicAcidPropertyNamesMap, %NucleicAcidTypesMap);
_LoadNucleicAcidsData();

#
# Return every known nucleic acid, identified either by its code or by its
# name.
#
# Parameters:
#   $NameType - optional; 'Name' (matched case-insensitively) selects nucleic
#               acid names, any other value selects codes. Default: 'Code'.
#
# Returns the list itself in list context and a reference to it otherwise.
# Entries follow the order of @NucleicAcidCodes.
#
sub GetNucleicAcids {
  my($NameType) = (@_ >= 1) ? $_[0] : 'Code';

  # Decide once whether names were requested instead of codes...
  my($UseNames) = ($NameType =~ /^Name$/i) ? 1 : 0;

  my(@NucleicAcidNames) = map { $UseNames ? $NucleicAcidDataMap{$_}{Name} : $_ } @NucleicAcidCodes;

  return (wantarray ? @NucleicAcidNames : \@NucleicAcidNames);
}

#
# Get a list of all known nucleic acids by one of these specified types:
# Nucleobase, Nucleoside, Deoxynucleoside, Nucleotide, Deoxynucleotide. Default: Nucleoside
#
# Parameters:
#   $Type     - optional; type to select, compared case-insensitively against
#               the Type property of each nucleic acid. Default: 'Nucleoside'.
#   $NameType - optional; 'Name' (matched case-insensitively) returns nucleic
#               acid names, any other value returns codes. Default: 'Code'.
#
# Arguments are only picked up when exactly one or two are passed; with any
# other argument count the defaults are used.
#
# Returns the list itself in list context and a reference to it otherwise.
#
sub GetNucleicAcidsByType {
  my($NameType, $Type, $Code, @NucleicAcidNames);

  $Type = 'Nucleoside';
  $NameType = 'Code';
  if (@_ == 2) {
    ($Type, $NameType) = @_;
  }
  elsif (@_ == 1) {
    ($Type) = @_;
  }

  # Collect names...
  @NucleicAcidNames = ();
  CODE: for $Code (@NucleicAcidCodes) {
    # \Q...\E makes the caller supplied type match literally: any regex
    # metacharacters in it are not interpreted as a pattern.
    if ($NucleicAcidDataMap{$Code}{Type} !~ /^\Q$Type\E$/i) {
      next CODE;
    }
    push @NucleicAcidNames, ($NameType =~ /^Name$/i) ? $NucleicAcidDataMap{$Code}{Name} : $Code;
  }

  return (wantarray ? @NucleicAcidNames : \@NucleicAcidNames);
}

#
# Get all available properties data for a nucleic acid identified by any of
# these symbols: code, other code or name.
#
# Returns a reference to a hash whose keys are property names and whose values
# are the corresponding property values, or undef when the ID is not recognized.
# The reference points at the hash held by this module, not at a copy.
#
sub GetNucleicAcidPropertiesData {
  my($NucleicAcidID) = @_;

  my $Code = _ValidateNucleicAcidID($NucleicAcidID);
  if (!$Code) {
    return undef;
  }

  return \%{ $NucleicAcidDataMap{$Code} };
}

#
# Get names of all available nucleic acid properties.
#
# Parameters:
#   $Mode - optional; 'Alphabetical' (matched case-insensitively) lists Code,
#           OtherCodes and Name first followed by the remaining names in sorted
#           order. Any other value returns the names in the order stored in
#           @NucleicAcidPropertyNames. Default: 'ByGroup'. The argument is only
#           picked up when exactly one argument is passed.
#
# Returns the list itself in list context and a reference to it otherwise.
#
sub GetNucleicAcidPropertiesNames {
  my($Mode) = (@_ == 1) ? $_[0] : 'ByGroup';
  my(@PropertyNames);

  if ($Mode !~ /^Alphabetical$/i) {
    # Hand back a copy of the stored names...
    @PropertyNames = @NucleicAcidPropertyNames;
  }
  else {
    # Code, OtherCodes and Name are always listed first...
    @PropertyNames = (
      qw(Code OtherCodes Name),
      grep { !/^(Code|OtherCodes|Name)$/ } sort keys %NucleicAcidPropertyNamesMap
    );
  }

  return (wantarray ? @PropertyNames : \@PropertyNames);
}

#
# Is it a known nucleic acid? The ID is resolved by _ValidateNucleicAcidID, so
# a code, an other code or a name is accepted.
#
# Returns 1 when the ID resolves to a code and 0 otherwise.
#
sub IsNucleicAcid {
  my($NucleicAcidID) = @_;

  return _ValidateNucleicAcidID($NucleicAcidID) ? 1 : 0;
}

#
# Is it an available nucleic acid property?
#
# The name is looked up as given, so the check is case sensitive.
#
# Returns 1 when the property name is known and 0 otherwise.
#
sub IsNucleicAcidProperty {
  my($PropertyName) = @_;

  if (exists $NucleicAcidPropertyNamesMap{$PropertyName}) {
    return 1;
  }
  return 0;
}

#
# Is it an available nucleic acid type?
#
# The type is lowercased before the lookup, so the check is case insensitive.
#
# Returns 1 when the type is known and 0 otherwise.
#
sub IsNucleicAcidType {
  my($Type) = @_;
  my($TypeKey) = lc($Type);

  return exists($NucleicAcidTypesMap{$TypeKey}) ? 1 : 0;
}

#
# Implements GetNucleicAcid<PropertyName> for a valid property name.
#
# Parameters:
#   $NucleicAcidID - code, other code or name of the nucleic acid.
#
# Returns the value of <PropertyName> for the nucleic acid, or undef when the
# ID is not recognized.
#
# Croaks when the requested function name doesn't start with Get or when the
# text left after removing a leading GetNucleicAcid is not a known property
# name.
#
sub AUTOLOAD {
  my($NucleicAcidID) = @_;
  my($FunctionName, $PropertyName, $Code);

  # $AUTOLOAD holds the fully qualified name of the function that was called;
  # drop the package part...
  use vars qw($AUTOLOAD);
  $FunctionName = $AUTOLOAD;
  $FunctionName =~ s/.*:://;

  # Only Get<PropertyName> functions are supported...
  if ($FunctionName !~ /^Get/) {
    croak "Error: Function, NucleicAcids::$FunctionName, is not supported by AUTOLOAD in NucleicAcids module: Only Get<PropertyName> functions are implemented...";
  }

  $PropertyName = $FunctionName;
  $PropertyName =~ s/^GetNucleicAcid//;
  if (!exists $NucleicAcidPropertyNamesMap{$PropertyName}) {
    croak "Error: Function, NucleicAcids::$FunctionName, is not supported by AUTOLOAD in NucleicAcids module: Unknown nucleic acid property name, $PropertyName, specified...";
  }

  if (!($Code = _ValidateNucleicAcidID($NucleicAcidID))) {
    return undef;
  }

  return $NucleicAcidDataMap{$Code}{$PropertyName};
}

#
# Build the path to data/NucleicAcidsData.csv under the directory reported by
# GetMayaChemToolsLibDirName and load that file...
#
# Croaks when the file doesn't exist.
#
sub _LoadNucleicAcidsData {
  my $LibDir = GetMayaChemToolsLibDirName();
  my $NucleicAcidsDataFile = "${LibDir}/data/NucleicAcidsData.csv";

  unless (-e $NucleicAcidsDataFile) {
    croak "Error: MayaChemTools package file, $NucleicAcidsDataFile, is missing: Possible installation problems...";
  }

  _LoadData($NucleicAcidsDataFile);
}

#
# Load the specified nucleic acids data file, replacing any data loaded
# earlier...
#
# Parameters:
#   $NucleicAcidsDataFile - path of the CSV data file.
#
# The first line not starting with # supplies the column labels, which become
# the property names. Each later line not starting with # describes one nucleic
# acid, keyed by the value in its first column.
#
# Croaks when the file can't be opened or when a data line doesn't have the
# same number of fields as the column labels line. Carps and skips a line
# whose code has already been loaded.
#
sub _LoadData {
  my($NucleicAcidsDataFile) = @_;

  %NucleicAcidDataMap = ();
  @NucleicAcidCodes = ();
  @NucleicAcidPropertyNames = ();
  %NucleicAcidPropertyNamesMap = ();
  %NucleicAcidCodeMap = ();
  %NucleicAcidOtherCodeMap = ();
  %NucleicAcidNameMap = ();
  %NucleicAcidTypesMap = ();

  # Load property data for all nucleic acids...
  #
  # File Format:
  # "Code","OtherCodes","BasePair","Name","Type","ChemicalFormula","ChemicalFormulaAtpH7.5","MolecularWeight","ExactMass","ElementalComposition"
  #
  my($Code, $Line, $NumOfCols, $NumOfFields, $InDelim, $Index, $Name, $DataFileHandle, @LineWords, @ColLabels);

  $InDelim = "\,";

  # Three argument open: the file name is never interpreted as an open mode...
  open $DataFileHandle, '<', $NucleicAcidsDataFile or croak "Couldn't open $NucleicAcidsDataFile: $! ...";

  # Skip lines up to column labels...
  LINE: while ($Line = GetTextLine($DataFileHandle)) {
    if ($Line !~ /^#/) {
      last LINE;
    }
  }
  @ColLabels = quotewords($InDelim, 0, $Line);
  $NumOfCols = @ColLabels;

  # Extract property names from column labels...
  for $Name (@ColLabels) {
    push @NucleicAcidPropertyNames, $Name;

    # Store property names...
    $NucleicAcidPropertyNamesMap{$Name} = $Name;
  }

  # Process nucleic acid data...
  LINE: while ($Line = GetTextLine($DataFileHandle)) {
    if ($Line =~ /^#/) {
      next LINE;
    }
    @LineWords = quotewords($InDelim, 0, $Line);
    $NumOfFields = @LineWords;
    if ($NumOfFields != $NumOfCols) {
      # Report how many fields were found, not the field values...
      croak "Error: The number of data fields, $NumOfFields, in $NucleicAcidsDataFile must be $NumOfCols.\nLine: $Line...";
    }

    # The code is always taken from the first column...
    $Code = $LineWords[0];
    if (exists $NucleicAcidDataMap{$Code}) {
      carp "Warning: Ignoring data for nucleic acid $Code: It has already been loaded.\nLine: $Line....";
      next LINE;
    }

    # Store all the values...
    push @NucleicAcidCodes, $Code;
    %{$NucleicAcidDataMap{$Code}} = ();
    for $Index (0 .. $#LineWords) {
      $NucleicAcidDataMap{$Code}{$NucleicAcidPropertyNames[$Index]} = $LineWords[$Index];
    }
  }
  close $DataFileHandle;

  # Setup code, other codes, name and type maps...
  _SetupNucleicAcidIDMap();
}

#
# Setup lowercase code, other codes, name and type maps. Code, other code and
# name keys point to the code as shown in the data file; type keys point to the
# type as shown in the data file.
#
# All four maps are rebuilt from %NucleicAcidDataMap on every call.
#
sub _SetupNucleicAcidIDMap {
  my($Code, @OtherCodes, $OtherCode, $NucleicAcidName, $NucleicAcidType, $TypeKey);

  %NucleicAcidCodeMap = ();
  %NucleicAcidOtherCodeMap = ();
  %NucleicAcidNameMap = ();
  %NucleicAcidTypesMap = ();

  # Go over the codes in sorted order: when two entries share a lowercase
  # name, other code or type, the one that ends up in the map is then the
  # same on every run instead of depending on hash ordering.
  for $Code (sort keys %NucleicAcidDataMap) {
    $NucleicAcidCodeMap{lc($Code)} = $Code;

    $NucleicAcidName = $NucleicAcidDataMap{$Code}{Name};
    $NucleicAcidNameMap{lc($NucleicAcidName)} = $Code;

    # Check and store using the same lowercase key...
    $NucleicAcidType = $NucleicAcidDataMap{$Code}{Type};
    $TypeKey = lc($NucleicAcidType);
    if (!exists $NucleicAcidTypesMap{$TypeKey}) {
      $NucleicAcidTypesMap{$TypeKey} = $NucleicAcidType;
    }

    @OtherCodes = split /\,/, $NucleicAcidDataMap{$Code}{OtherCodes};
    OTHERCODE: for $OtherCode (@OtherCodes) {
      # Skip empty and whitespace only entries, so that nothing is ever
      # registered under an empty key...
      if ($OtherCode !~ /\S/) {
        next OTHERCODE;
      }
      $OtherCode = RemoveLeadingAndTrailingWhiteSpaces($OtherCode);
      $NucleicAcidOtherCodeMap{lc($OtherCode)} = $Code;
    }
  }
}

#
# Validate nucleic acid ID...
#
# The lowercased ID is looked up in the code map, then the other code map and
# finally the name map; the first hit wins.
#
# Returns the code stored in the matching map, or undef when none of the maps
# contains the ID.
#
sub _ValidateNucleicAcidID {
  my($NucleicAcidID) = @_;
  my($LookupKey, $MapRef);

  $LookupKey = lc($NucleicAcidID);
  for $MapRef (\%NucleicAcidCodeMap, \%NucleicAcidOtherCodeMap, \%NucleicAcidNameMap) {
    if (exists $MapRef->{$LookupKey}) {
      return $MapRef->{$LookupKey};
    }
  }

  return undef;
}


1;

__END__

=head1 NAME

NucleicAcids

=head1 SYNOPSIS

use NucleicAcids;

use NucleicAcids qw(:all);

=head1 DESCRIPTION

B<NucleicAcids> module provides the following functions:

GetNucleicAcidPropertiesData, GetNucleicAcidPropertiesNames,
GetNucleicAcids, GetNucleicAcidsByType, IsNucleicAcid, IsNucleicAcidProperty,
IsNucleicAcidType

=head1 Functions

=over 4

=item B<GetNucleicAcids>

    (@Names) = GetNucleicAcids([$NameType]);
    $NamesRef = GetNucleicAcids([$NameType]);

Returns an array or a reference to an array containing names of nucleic acids
as a code or nucleic acid name controlled by optional parameter I<NameType>. By
default, nucleic acids names are returned as the code. Possible values for
I<NameType>: I<Code or Name>.

=item B<GetNucleicAcidsByType>

    (@Names) = GetNucleicAcidsByType([$Type, $NameType]);
    $NamesRef = GetNucleicAcidsByType([$Type, $NameType]);

Returns an array or a reference to an array containing names of nucleic acids
specified by parameter I<Type> as a code or name controlled by optional
parameter I<NameType>. Default value for I<Type>: I<Nucleoside>. Default value for
I<NameType>: I<Code>. Possible values for I<Type>: I<Nucleobase, Nucleoside, Deoxynucleoside,
Nucleotide, Deoxynucleotide>. Possible values for I<NameType>: I<Code or Name>.

=item B<GetNucleicAcidPropertiesData>

    $DataHashRef = GetNucleicAcidPropertiesData($NucleicAcidID);

Returns a reference to hash containing property names and values for a specified
I<NucleicAcidID>.

=item B<GetNucleicAcidPropertyName>

    $Value = GetNucleicAcid<PropertyName>($NucleicAcidID);

Returns nucleic acid property value for a specified I<NucleicAcidID>. This function is
implemented on-the-fly using Perl's AUTOLOAD functionality.

=item B<GetNucleicAcidPropertiesNames>

    @Names = GetNucleicAcidPropertiesNames([$Mode]);
    $NamesRef = GetNucleicAcidPropertiesNames([$Mode]);

Returns an array or a reference to an array containing names of properties for
nucleic acids. Order of nucleic acids properties is controlled by optional parameter
I<Mode>. Possible values for I<Mode>: I<Alphabetical or ByGroup>; Default: I<ByGroup>.

=item B<IsNucleicAcid>

    $Status = IsNucleicAcid($NucleicAcidID);

Returns 1 or 0 based on whether it's a known nucleic acid ID.

=item B<IsNucleicAcidProperty>

    $Status = IsNucleicAcidProperty($PropertyName);

Returns 1 or 0 based on whether it's a known nucleic acid property name.

=item B<IsNucleicAcidType>

    $Status = IsNucleicAcidType($Type);

Returns 1 or 0 based on whether it's a known nucleic acid type.

=back

=head1 AUTHOR

Manish Sud <msud@san.rr.com>

=head1 SEE ALSO

AminoAcids.pm, PeriodicTable.pm

=head1 COPYRIGHT

Copyright (C) 2015 Manish Sud. All rights reserved.

This file is part of MayaChemTools.

MayaChemTools is free software; you can redistribute it and/or modify it under
the terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.

=cut