diff mayachemtools/lib/Parsers/Lexer.pm @ 0:73ae111cf86f draft

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 11:55:01 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mayachemtools/lib/Parsers/Lexer.pm	Wed Jan 20 11:55:01 2016 -0500
@@ -0,0 +1,638 @@
+package Parsers::Lexer;
+#
+# $RCSfile: Lexer.pm,v $
+# $Date: 2015/02/28 20:50:55 $
+# $Revision: 1.10 $
+#
+# Author: Manish Sud <msud@san.rr.com>
+#
+# Copyright (C) 2015 Manish Sud. All rights reserved.
+#
+# This file is part of MayaChemTools.
+#
+# MayaChemTools is free software; you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation; either version 3 of the License, or (at your option) any
+# later version.
+#
+# MayaChemTools is distributed in the hope that it will be useful, but without
+# any warranty; without even the implied warranty of merchantability of fitness
+# for a particular purpose.  See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
+# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
+# Boston, MA, 02111-1307, USA.
+#
+
+use strict;
+use Carp;
+use Exporter;
+use Scalar::Util ();
+
+use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
+
+@ISA = qw(Exporter);
+@EXPORT = qw();
+@EXPORT_OK = qw();
+
+%EXPORT_TAGS = (all  => [@EXPORT, @EXPORT_OK]);
+
+# Setup class variables...
+my($ClassName);
+_InitializeClass();
+
+# Overload Perl functions...
+use overload '""' => 'StringifyLexer';
+
+# Class constructor...
+sub new {
+  my($Class, $Input, @TokensSpec) = @_;
+
+  # Initialize object...
+  my $This = {};
+  bless $This, ref($Class) || $Class;
+  $This->_InitializeLexer();
+
+  $This->_ValidateParametersAndGenerateLexer($Input, @TokensSpec);
+
+  return $This;
+}
+
+
+# Initialize class ...
+sub _InitializeClass {
+  #Class name...
+  $ClassName = __PACKAGE__;
+}
+
+# Initialize object data...
+#
+sub _InitializeLexer {
+  my($This) = @_;
+
+  # Input parameter used by lexer to retrieve text to be lexed. Supported parameter types:
+  #   . Reference to input iterator function
+  #   . Reference to an open file handle
+  #   . Text string
+  #
+  $This->{Input} = undef;
+
+  # Type of input paramater determined using Perl ref function:
+  #   . InputIterator - ref returns CODE
+  #   . FileStream - ref return GLOB and fileno is valid
+  #   . String - ref return an empty string
+  #
+  $This->{InputType} = '';
+
+  # Tokens specifications supplied by the caller. It's an array containing references
+  # to arrays with each containing TokenLabel and TokenMatchRegex pair along with
+  # an option reference to code to be executed after a matched.
+  #
+  # For example:
+  #
+  # @LexerTokensSpec = (
+  #			[ 'LETTER', qr/[a-zA-Z]/ ],
+  #			[ 'NUMBER', qr/\d+/ ],
+  #			[ 'SPACE', qr/[ ]*/, sub { my($This, $TokenLabel, $MatchedText) = @_; return ''; } ],
+  #			[ 'NEWLINE', qr/(?:\r\n|\r|\n)/, sub { my($This, $TokenLabel, $MatchedText) = @_;  return "\n"; } ],
+  #			[ 'CHAR', qr/[\.]/ ],
+  #		       );
+  #
+  @{$This->{TokensSpec}} = ();
+
+  # Refernce to chained lexer...
+  $This->{ChainedLexer} = undef;
+
+  return $This;
+}
+
+# Validate input parameters and generate a chained lexer...
+#
+sub _ValidateParametersAndGenerateLexer {
+  my($This, $Input, @TokensSpec) = @_;
+
+  #
+  # Validate input to be lexed...
+  if (!defined $Input) {
+    croak "Error: ${ClassName}->new: Object can't be instantiated: Input is not defined. Supported values: a reference to input iterator function, a reference to an open file handle or a text string...";
+  }
+  $This->{Input} = $Input;
+
+  # Check input parameter type...
+  my($InputType);
+
+  $InputType = ref $Input;
+  if ($InputType =~ /CODE/i) {
+    # Input iterator...
+    $This->{InputType} = "InputIterator";
+  }
+  elsif ($InputType =~ /GLOB/i && defined fileno $Input) {
+    # Input stream...
+    $This->{InputType} = "FileStream";
+  }
+  elsif ($InputType) {
+    # Perl ref function returns nonempty string for all other references...
+    croak "Error: ${ClassName}->new: Object can't be instantiated: Invalid input parameter type specified. Supported parameter types: a reference to input iterator function, a reference to an open file handle or a text string...";
+  }
+  else {
+    # Input string...
+    $This->{InputType} = "String";
+  }
+
+  # Check tokens specifications...
+  if (!@TokensSpec) {
+    croak "Error: ${ClassName}->new: TokensSpec is not defined or the array doesn't contain any values. Supported values: a reference to an array containg token label, regular expression to match and an option reference to function to modify matched values...";
+  }
+  @{$This->{TokensSpec}} = @TokensSpec;
+
+  $This->_GenerateLexer($Input, @TokensSpec);
+
+  return $This;
+}
+
+# Generate a lexer using reference to an input iterator function, an open file
+# handle or an input string passed as first parameter by the caller along
+# with token specifications as second paramater...
+#
+sub _GenerateLexer {
+  my($This, $Input, @TokensSpec) = @_;
+
+  if ($This->{InputType} =~ /^InputIterator$/i) {
+    $This->_GenerateInputIteratorLexer($Input, @TokensSpec);
+  }
+  elsif ($This->{InputType} =~ /^FileStream$/i) {
+    $This->_GenerateInputFileStreamLexer($Input, @TokensSpec);
+  }
+  elsif ($This->{InputType} =~ /^String$/i) {
+    $This->_GenerateInputStringLexer($Input, @TokensSpec);
+  }
+  else {
+    croak "Error: ${ClassName}->new: Object can't be instantiated: Invalid input parameter type specified. Supported parameter types: a reference to input iterator function, a reference to an open file handle or a text string...";
+  }
+
+  return $This;
+}
+
+# Generate a lexer using specifed input iterator...
+#
+sub _GenerateInputIteratorLexer {
+  my($This, $InputIteratorRef, @TokensSpec) = @_;
+
+  $This->_GenerateChainedLexer($InputIteratorRef, @TokensSpec);
+
+  return $This;
+}
+
+# Generate a lexer using specifed input file stream reference...
+#
+sub _GenerateInputFileStreamLexer {
+  my($This, $FileHandleRef, @TokensSpec) = @_;
+
+  # Iterator is a annoymous function reference and Perl keeps $FileHandleRef
+  # in scope during its execution.
+
+  $This->_GenerateChainedLexer( sub { return <$FileHandleRef>; }, @TokensSpec);
+
+  return $This;
+}
+
+# Generate a lexer using specifed input string...
+#
+sub _GenerateInputStringLexer {
+  my($This, $Text, @TokensSpec) = @_;
+  my(@InputText) = ($Text);
+
+  # Iterator is a annoymous function reference and Perl keeps @InputText
+  # in scope during its execution.
+
+  $This->_GenerateChainedLexer( sub { return shift @InputText; }, @TokensSpec);
+
+  return $This;
+}
+
+# Get next available token label and value pair as an array reference or unrecognized
+# text from input stream by either removing it from the input or simply peeking ahead...
+#
+# Supported mode values: Peek, Next. Default: Next
+#
+sub Lex {
+  my($This, $Mode) = @_;
+
+  return $This->{ChainedLexer}->($Mode)
+}
+
+# Get next available token label and value pair as an array reference or unrecognized
+# text from input stream by either removing it from the input stream...
+#
+sub Next {
+  my($This) = @_;
+
+  return $This->Lex();
+}
+
+# Get next available token label and value pair as an array reference or unrecognized
+# text from input stream by simply peeking ahead and without removing it from the input
+# stream..
+#
+sub Peek {
+  my($This) = @_;
+
+  return $This->Lex('Peek')
+}
+
+# Get a reference to lexer method to be used by the caller...
+#
+sub GetLex {
+  my($This) = @_;
+
+  return sub { $This->Lex(); };
+}
+
+# The chained lexer generation is implemented based on examples in Higher-order Perl
+# [ Ref 126 ] book.
+#
+# Generate a lexer using specified input iterator and chaining it with other lexers generated
+# for all token specifications. The lexer generated for first token specification uses input
+# iterator to retrieve any available input text; the subsequent chained lexeres for rest
+# of the tokens use lexers generated for previous token specifications to get next input
+# which might be unmatched input text or a reference to an array containing token and
+# matched text pair.
+#
+sub _GenerateChainedLexer {
+  my($This, $InputIteratorRef, @TokensSpec) = @_;
+  my($TokenSpecRef, $ChainedLexer);
+
+  $ChainedLexer = undef;
+  for $TokenSpecRef (@TokensSpec) {
+    $ChainedLexer = defined $ChainedLexer ? $This->_GenerateLexerForToken($ChainedLexer, @{$TokenSpecRef}) : $This->_GenerateLexerForToken($InputIteratorRef, @{$TokenSpecRef});
+  }
+
+  $This->{ChainedLexer} = $ChainedLexer;
+
+  return $This;
+}
+
+
+# Generate a lexer using specifed token specification using specified input or
+# input retrieved using another token lexer. The lexer retrieving input from the
+# specified input stream is at the bottom of the chain.
+#
+sub _GenerateLexerForToken {
+  my($This, $InputIteratorOrLexer, $TokenLabel, $RegexPattern, $TokenMatchActionRef) = @_;
+  my($TokenMatchAndSplitRef, $InputBuffer, @ProcessedTokens);
+
+  # Input buffer for a specific lexer in chained lexers containing unprocessed
+  # text for token specifications retrieved from a downstrean lexer or intial
+  # input...
+  #
+  $InputBuffer = "";
+
+  # @ProcessedTokens contains either references to an array containing token label
+  # and matched text or any unmatched input text string...
+  #
+  @ProcessedTokens = ();
+
+  # Setup a default annoymous function reference to generate an array reference
+  # containing $Token and text matched to $RegexPattern.
+  #
+  $TokenMatchActionRef = defined $TokenMatchActionRef ? $TokenMatchActionRef : sub { my($This, $Label, $MatchedText) = @_; return [$Label, $MatchedText]  };
+
+  # Setup an annoymous function to match and split input text using $RegexPattern for
+  # a specific token during its lexer invocation in chained lexers.
+  #
+  # The usage of parenthesis around $RegexPattern during split allows capturing of matched
+  # text, which is subsequently processed to retrieve matched $Token values. The split function
+  # inserts a "" separator in the returned array as first entry whenever $InputText starts with
+  # $RegexPattern. $InputText is returned as the only element for no match.
+  #
+  $TokenMatchAndSplitRef = sub { my($InputText) = @_; return split /($RegexPattern)/, $InputText; };
+
+  # Setup a lexer for $TokenLabel as an annoymous function and return its reference to caller
+  # which in turns chains the lexers for all $Tokens before returning a reference to a lexer
+  # at top of the lexer chain.
+  #
+  # Perl maintains scope of all variables defined with in the scope of the current function
+  # during invocation of annoymous function even after the return call.
+  #
+  return sub {
+    my($Mode) = @_;
+
+    # Currenly supported value for mode: Peek, Next
+    #
+    $Mode = defined $Mode ? $Mode : 'Next';
+
+    while (@ProcessedTokens == 0 && defined $InputBuffer ) {
+      # Get any new input....
+      my $NewInput = $InputIteratorOrLexer->();
+
+      if (ref $NewInput) {
+	# Input is an array reference containing matched token and text returned by
+	# a chained lexer downstream lexer...
+	#
+	# Match $RegexPattern in available buffer text to retieve any matched text
+	# for current $Token. $Separator might be "": $RegexPattern is at start of
+	# of $InputBuffer
+	#
+	# Process input buffer containing text to be matched for the current lexer
+	# which didn't get processed earlier during @NewTokens > 2  while loop:
+	# no match for current lexer or more input available. It maintains order
+	# of token matching in input stream.
+	#
+	my($Separator, $MatchedTokenRefOrText);
+
+	($Separator, $MatchedTokenRefOrText) = $TokenMatchAndSplitRef->($InputBuffer);
+	if (defined $MatchedTokenRefOrText) {
+	  $MatchedTokenRefOrText = $TokenMatchActionRef->($This, $TokenLabel, $MatchedTokenRefOrText);
+	}
+
+	# Collect valid token references or text...
+	push @ProcessedTokens, grep { defined $_ && $_ ne "" } ($Separator, $MatchedTokenRefOrText, $NewInput);
+
+	# Empty put buffer...
+	$InputBuffer = "";
+
+	# Get out of the loop as processed token refererences and/or text  are available...
+	last;
+      }
+
+      # Process input retrieved from downstream lexer or input iterator which hasn't
+      # been processed into tokens..
+      if (defined $NewInput) {
+	$InputBuffer .= $NewInput;
+      }
+
+      # Retrieve any matched tokens from available input for the current lexer...
+      #
+      my(@NewTokens) = $TokenMatchAndSplitRef->($InputBuffer);
+
+      while ( @NewTokens > 2 || @NewTokens && !defined $NewInput) {
+	# Scenario 1: Complete match
+	#   @NewTokens > 2 : Availability of separator, matched token text, separator.
+	#   The separator might correspond to token for a token for upstream lexer followed
+	#   by matched token from current lexer. It ends up getting passed to upsrteam
+	#   lexer for processing.
+	#
+	# Scenario 2: No more input available from iterator or downstream lexer
+	#   @NewTokens <= 2 and no more input implies any left over text in buffer. And
+	#   it ends up getting passed to upsrteam for processing.
+	#
+
+	# Take off any unprocessed input text that doesn't match off the buffer: It'll be
+	# passed to upstream chained lexer for processing...
+	#
+	push @ProcessedTokens, shift @NewTokens;
+
+	if (@NewTokens) {
+	  my $MatchedTokenText = shift @NewTokens;
+	  push @ProcessedTokens, $TokenMatchActionRef->($This, $TokenLabel, $MatchedTokenText);
+	}
+      }
+
+      # Retrieve any leftover text from NewTokens and put it back into InputBuffer for
+      # processing by current lexer. All token references have been taken out....
+      #
+      $InputBuffer = "";
+      if (@NewTokens) {
+	$InputBuffer = join "", @NewTokens;
+      }
+
+      if (!defined $NewInput) {
+	# No more input from the downstream lexer...
+	$InputBuffer = undef;
+      }
+
+      # Clean up any empty strings from ProcessedTokens containing token
+      # references or text...
+      @ProcessedTokens = grep { $_ ne "" } @ProcessedTokens;
+
+    }
+
+    # Return reference to an array containing token and matched text or just unmatched input text...
+    my $TokenRefOrText = undef;
+
+    if (@ProcessedTokens) {
+      # Get first available reference either by just peeking or removing it from the list
+      # of available tokens...
+      $TokenRefOrText = ($Mode =~ /^Peek$/i) ?  $ProcessedTokens[0] : shift @ProcessedTokens;
+    }
+
+    return $TokenRefOrText;
+  };
+}
+
+# Is it a lexer object?
+sub _IsLexer {
+  my($Object) = @_;
+
+  return (Scalar::Util::blessed($Object) && $Object->isa($ClassName)) ? 1 : 0;
+}
+
+# Return a string containing information about lexer...
+sub StringifyLexer {
+  my($This) = @_;
+  my($LexerString);
+
+  $LexerString = "Lexer: PackageName: $ClassName; " . $This->_GetLexerInfoString();
+
+  return $LexerString;
+}
+
+# Return a string containing information about lexer...
+sub _GetLexerInfoString {
+  my($This) = @_;
+  my($LexerInfoString, $TokensSpec, $TokenSpec, $TokenLabel, $TokenMatchRegex, $TokenMatchAction);
+
+  $LexerInfoString = "InputType: $This->{InputType}";
+
+  if ($This->{InputType} =~ /^String$/i) {
+    $LexerInfoString .= "; InputString: $This->{Input}";
+  }
+
+  $TokensSpec = "TokensSpecifications: <None>";
+  if (@{$This->{TokensSpec}}) {
+    $TokensSpec = "TokensSpecifications: < [Label, MatchRegex, MatchAction]:";
+    for $TokenSpec (@{$This->{TokensSpec}}) {
+      ($TokenLabel, $TokenMatchRegex) = @{$TokenSpec};
+      $TokenMatchAction = (@{$TokenSpec} == 3) ? "$TokenSpec->[2]" : "undefined";
+      $TokensSpec .= " [$TokenLabel, $TokenMatchRegex, $TokenMatchAction]";
+    }
+    $TokensSpec .= " >";
+  }
+
+  $LexerInfoString .= "; $TokensSpec";
+
+  return $LexerInfoString;
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+Parsers::Lexer
+
+=head1 SYNOPSIS
+
+use Parsers::Lexer;
+
+use Parsers::Lexer qw(:all);
+
+=head1 DESCRIPTION
+
+B<Lexer> class provides the following methods:
+
+new, GetLex, Lex, Next, Peek, StringifyLexer
+
+The object oriented chained B<Lexer> is implemented based on examples available in
+Higher-order Perl [ Ref 126 ] book by Mark J. Dominus. It is designed to be used
+both in standalone mode or as a base class for B<YYLexer>.
+
+A chained lexer is created by generating a lexer for for the first specified token
+specification using specified input and chaining it with other lexers generated for all
+subsequent token specifications. The lexer generated for the first token specification
+uses input iterator to retrieve any available input text; the subsequent chained lexeres
+for rest of the token specifications use lexers generated for previous token
+specifications to get next input, which might be unmatched input text or a reference
+to an array containing token and  matched text pair.
+
+=head2 METHODS
+
+=over 4
+
+=item B<new>
+
+    $Lexer = new Parsers::Lexer($Input, @TokensSpec);
+
+Using specified I<Input> and I<TokensSpec>, B<new> method generates a new lexer
+and returns a reference to newly created B<Lexer> object.
+
+Example:
+
+    # Tokens specifications supplied by the caller. It's an array containing references
+    # to arrays with each containing TokenLabel and TokenMatchRegex pair along with
+    # an option reference to code to be executed after a matched.
+    #
+    @LexerTokensSpec = (
+        [ 'LETTER', qr/[a-zA-Z]/ ],
+        [ 'NUMBER', qr/\d+/ ],
+        [ 'SPACE', qr/[ ]*/,
+            sub { my($This, $TokenLabel, $MatchedText) = @_; return ''; }
+        ],
+        [ 'NEWLINE', qr/(?:\r\n|\r|\n)/,
+            sub { my($This, $TokenLabel, $MatchedText) = @_;  return "\n"; }
+        ],
+        [ 'CHAR', qr/./ ]
+    );
+
+    # Input string...
+    $InputText = 'y = 3 + 4';
+    $Lexer = new Parsers::Lexer($InputText, @LexerTokensSpec);
+
+    # Process input stream...
+    while (defined($Token = $Lexer->Lex())) {
+        print "Token: " . ((ref $Token) ? "@{$Token}" : "$Token") . "\n";
+    }
+
+    # Input file...
+    $InputFile = "Input.txt";
+    open INPUTFILE, "$InputFile" or die "Couldn't open $InputFile: $!\n";
+    $Lexer = new Parsers::Lexer(\*INPUTFILE, @LexerTokensSpec);
+
+    # Input file iterator...
+    $InputFile = "TestSimpleCalcParser.txt";
+    open INPUTFILE, "$InputFile" or die "Couldn't open $InputFile: $!\n";
+    $InputIterator = sub { return <INPUTFILE>; };
+    $Lexer = new Parsers::Lexer($InputIterator, @LexerTokensSpec);
+
+    @LexerTokensSpec = (
+        [ 'VAR', qr/[[:alpha:]]+/ ],
+        [ 'NUM', qr/\d+/ ],
+        [ 'OP', qr/[-+=\/]/,
+            sub { my($This, $Label, $Value) = @_;
+                $Value .= "; ord: " . ord $Value;
+                return [$Label, $Value];
+            }
+        ],
+        [ 'NEWLINE', qr/(?:\r\n|\r|\n)/, sub { return [$_[1], 'NewLine']; } ],
+        [ 'SPACE', qr/\s*/, sub { return [$_[1], 'Space']; } ],
+    );
+
+    # Look ahead without removing...
+    $Token = $Lexer->Lex('Peek');
+    if (defined $Token && ref $Token) {
+        print "PEEK: Token: @{$Token}\n\n";
+    }
+
+    # Process input stream...
+    while (defined($Token = $Lexer->Lex())) {
+        print "Token: " . ((ref $Token) ? "@{$Token}" : "$Token") . "\n";
+    }
+
+=item B<GetLex>
+
+    $LexerRef = $Lexer->GetLex();
+
+Returns a refernece to I<Lexer> method to the caller for use in a specific B<YYLexer>.
+
+=item B<Lex>
+
+    $TokenRefOrText = $Lexer->Lex($Mode);
+    if (ref $TokenRefOrText) {
+        ($TokenLabel, $TokenValue) = @{$TokenRefOrText};
+    }
+    else {
+        $TokenText = $TokenRefOrText;
+    }
+
+Get next available token label and value pair as an array reference or unrecognized
+text from input stream by either removing it from the input or simply peeking ahead
+and without removing it from the input stream.
+
+Possible I<Mode> values: I<Peek, Next>. Default: I<Next>.
+
+=item B<Next>
+
+    $TokenRefOrText = $Lexer->Next();
+
+Get next available token label and value pair as an array reference or unrecognized
+text from input stream by removing it from the input stream.
+
+=item B<Peek>
+
+    $TokenRefOrText = $Lexer->Peek();
+
+Get next available token label and value pair as an array reference or unrecognized
+text from input stream by by simply peeking ahead and without removing it from the
+input stream.
+
+=item B<StringifyLexer>
+
+    $LexerString = $Lexer->StringifyLexer();
+
+Returns a string containing information about I<Lexer> object.
+
+=back
+
+=head1 AUTHOR
+
+Manish Sud <msud@san.rr.com>
+
+=head1 SEE ALSO
+
+YYLexer.pm, SimpleCalcYYLexer.pm, SimpleCalcParser.yy
+
+=head1 COPYRIGHT
+
+Copyright (C) 2015 Manish Sud. All rights reserved.
+
+This file is part of MayaChemTools.
+
+MayaChemTools is free software; you can redistribute it and/or modify it under
+the terms of the GNU Lesser General Public License as published by the Free
+Software Foundation; either version 3 of the License, or (at your option)
+any later version.
+
+=cut