MayaChemTools

   1 package Parsers::Lexer;
   2 #
   3 # $RCSfile: Lexer.pm,v $
   4 # $Date: 2015/02/28 20:50:55 $
   5 # $Revision: 1.10 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2015 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use Carp;
  31 use Exporter;
  32 use Scalar::Util ();
  33 
  34 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  35 
  36 @ISA = qw(Exporter);
  37 @EXPORT = qw();
  38 @EXPORT_OK = qw();
  39 
  40 %EXPORT_TAGS = (all  => [@EXPORT, @EXPORT_OK]);
  41 
  42 # Setup class variables...
  43 my($ClassName);
  44 _InitializeClass();
  45 
  46 # Overload Perl functions...
  47 use overload '""' => 'StringifyLexer';
  48 
  49 # Class constructor...
  50 sub new {
  51   my($Class, $Input, @TokensSpec) = @_;
  52 
  53   # Initialize object...
  54   my $This = {};
  55   bless $This, ref($Class) || $Class;
  56   $This->_InitializeLexer();
  57 
  58   $This->_ValidateParametersAndGenerateLexer($Input, @TokensSpec);
  59 
  60   return $This;
  61 }
  62 
  63 
  64 # Initialize class ...
  65 sub _InitializeClass {
  66   #Class name...
  67   $ClassName = __PACKAGE__;
  68 }
  69 
  70 # Initialize object data...
  71 #
  72 sub _InitializeLexer {
  73   my($This) = @_;
  74 
  75   # Input parameter used by lexer to retrieve text to be lexed. Supported parameter types:
  76   #   . Reference to input iterator function
  77   #   . Reference to an open file handle
  78   #   . Text string
  79   #
  80   $This->{Input} = undef;
  81 
  82   # Type of input paramater determined using Perl ref function:
  83   #   . InputIterator - ref returns CODE
  84   #   . FileStream - ref return GLOB and fileno is valid
  85   #   . String - ref return an empty string
  86   #
  87   $This->{InputType} = '';
  88 
  89   # Tokens specifications supplied by the caller. It's an array containing references
  90   # to arrays with each containing TokenLabel and TokenMatchRegex pair along with
  91   # an option reference to code to be executed after a matched.
  92   #
  93   # For example:
  94   #
  95   # @LexerTokensSpec = (
  96   #                        [ 'LETTER', qr/[a-zA-Z]/ ],
  97   #                        [ 'NUMBER', qr/\d+/ ],
  98   #                        [ 'SPACE', qr/[ ]*/, sub { my($This, $TokenLabel, $MatchedText) = @_; return ''; } ],
  99   #                        [ 'NEWLINE', qr/(?:\r\n|\r|\n)/, sub { my($This, $TokenLabel, $MatchedText) = @_;  return "\n"; } ],
 100   #                        [ 'CHAR', qr/[\.]/ ],
 101   #                       );
 102   #
 103   @{$This->{TokensSpec}} = ();
 104 
 105   # Refernce to chained lexer...
 106   $This->{ChainedLexer} = undef;
 107 
 108   return $This;
 109 }
 110 
 111 # Validate input parameters and generate a chained lexer...
 112 #
 113 sub _ValidateParametersAndGenerateLexer {
 114   my($This, $Input, @TokensSpec) = @_;
 115 
 116   #
 117   # Validate input to be lexed...
 118   if (!defined $Input) {
 119     croak "Error: ${ClassName}->new: Object can't be instantiated: Input is not defined. Supported values: a reference to input iterator function, a reference to an open file handle or a text string...";
 120   }
 121   $This->{Input} = $Input;
 122 
 123   # Check input parameter type...
 124   my($InputType);
 125 
 126   $InputType = ref $Input;
 127   if ($InputType =~ /CODE/i) {
 128     # Input iterator...
 129     $This->{InputType} = "InputIterator";
 130   }
 131   elsif ($InputType =~ /GLOB/i && defined fileno $Input) {
 132     # Input stream...
 133     $This->{InputType} = "FileStream";
 134   }
 135   elsif ($InputType) {
 136     # Perl ref function returns nonempty string for all other references...
 137     croak "Error: ${ClassName}->new: Object can't be instantiated: Invalid input parameter type specified. Supported parameter types: a reference to input iterator function, a reference to an open file handle or a text string...";
 138   }
 139   else {
 140     # Input string...
 141     $This->{InputType} = "String";
 142   }
 143 
 144   # Check tokens specifications...
 145   if (!@TokensSpec) {
 146     croak "Error: ${ClassName}->new: TokensSpec is not defined or the array doesn't contain any values. Supported values: a reference to an array containg token label, regular expression to match and an option reference to function to modify matched values...";
 147   }
 148   @{$This->{TokensSpec}} = @TokensSpec;
 149 
 150   $This->_GenerateLexer($Input, @TokensSpec);
 151 
 152   return $This;
 153 }
 154 
 155 # Generate a lexer using reference to an input iterator function, an open file
 156 # handle or an input string passed as first parameter by the caller along
 157 # with token specifications as second paramater...
 158 #
 159 sub _GenerateLexer {
 160   my($This, $Input, @TokensSpec) = @_;
 161 
 162   if ($This->{InputType} =~ /^InputIterator$/i) {
 163     $This->_GenerateInputIteratorLexer($Input, @TokensSpec);
 164   }
 165   elsif ($This->{InputType} =~ /^FileStream$/i) {
 166     $This->_GenerateInputFileStreamLexer($Input, @TokensSpec);
 167   }
 168   elsif ($This->{InputType} =~ /^String$/i) {
 169     $This->_GenerateInputStringLexer($Input, @TokensSpec);
 170   }
 171   else {
 172     croak "Error: ${ClassName}->new: Object can't be instantiated: Invalid input parameter type specified. Supported parameter types: a reference to input iterator function, a reference to an open file handle or a text string...";
 173   }
 174 
 175   return $This;
 176 }
 177 
 178 # Generate a lexer using specifed input iterator...
 179 #
 180 sub _GenerateInputIteratorLexer {
 181   my($This, $InputIteratorRef, @TokensSpec) = @_;
 182 
 183   $This->_GenerateChainedLexer($InputIteratorRef, @TokensSpec);
 184 
 185   return $This;
 186 }
 187 
 188 # Generate a lexer using specifed input file stream reference...
 189 #
 190 sub _GenerateInputFileStreamLexer {
 191   my($This, $FileHandleRef, @TokensSpec) = @_;
 192 
 193   # Iterator is a annoymous function reference and Perl keeps $FileHandleRef
 194   # in scope during its execution.
 195 
 196   $This->_GenerateChainedLexer( sub { return <$FileHandleRef>; }, @TokensSpec);
 197 
 198   return $This;
 199 }
 200 
 201 # Generate a lexer using specifed input string...
 202 #
 203 sub _GenerateInputStringLexer {
 204   my($This, $Text, @TokensSpec) = @_;
 205   my(@InputText) = ($Text);
 206 
 207   # Iterator is a annoymous function reference and Perl keeps @InputText
 208   # in scope during its execution.
 209 
 210   $This->_GenerateChainedLexer( sub { return shift @InputText; }, @TokensSpec);
 211 
 212   return $This;
 213 }
 214 
 215 # Get next available token label and value pair as an array reference or unrecognized
 216 # text from input stream by either removing it from the input or simply peeking ahead...
 217 #
 218 # Supported mode values: Peek, Next. Default: Next
 219 #
 220 sub Lex {
 221   my($This, $Mode) = @_;
 222 
 223   return $This->{ChainedLexer}->($Mode)
 224 }
 225 
 226 # Get next available token label and value pair as an array reference or unrecognized
 227 # text from input stream by either removing it from the input stream...
 228 #
 229 sub Next {
 230   my($This) = @_;
 231 
 232   return $This->Lex();
 233 }
 234 
 235 # Get next available token label and value pair as an array reference or unrecognized
 236 # text from input stream by simply peeking ahead and without removing it from the input
 237 # stream..
 238 #
 239 sub Peek {
 240   my($This) = @_;
 241 
 242   return $This->Lex('Peek')
 243 }
 244 
 245 # Get a reference to lexer method to be used by the caller...
 246 #
 247 sub GetLex {
 248   my($This) = @_;
 249 
 250   return sub { $This->Lex(); };
 251 }
 252 
 253 # The chained lexer generation is implemented based on examples in Higher-order Perl
 254 # [ Ref 126 ] book.
 255 #
 256 # Generate a lexer using specified input iterator and chaining it with other lexers generated
 257 # for all token specifications. The lexer generated for first token specification uses input
 258 # iterator to retrieve any available input text; the subsequent chained lexeres for rest
 259 # of the tokens use lexers generated for previous token specifications to get next input
 260 # which might be unmatched input text or a reference to an array containing token and
 261 # matched text pair.
 262 #
 263 sub _GenerateChainedLexer {
 264   my($This, $InputIteratorRef, @TokensSpec) = @_;
 265   my($TokenSpecRef, $ChainedLexer);
 266 
 267   $ChainedLexer = undef;
 268   for $TokenSpecRef (@TokensSpec) {
 269     $ChainedLexer = defined $ChainedLexer ? $This->_GenerateLexerForToken($ChainedLexer, @{$TokenSpecRef}) : $This->_GenerateLexerForToken($InputIteratorRef, @{$TokenSpecRef});
 270   }
 271 
 272   $This->{ChainedLexer} = $ChainedLexer;
 273 
 274   return $This;
 275 }
 276 
 277 
 278 # Generate a lexer using specifed token specification using specified input or
 279 # input retrieved using another token lexer. The lexer retrieving input from the
 280 # specified input stream is at the bottom of the chain.
 281 #
 282 sub _GenerateLexerForToken {
 283   my($This, $InputIteratorOrLexer, $TokenLabel, $RegexPattern, $TokenMatchActionRef) = @_;
 284   my($TokenMatchAndSplitRef, $InputBuffer, @ProcessedTokens);
 285 
 286   # Input buffer for a specific lexer in chained lexers containing unprocessed
 287   # text for token specifications retrieved from a downstrean lexer or intial
 288   # input...
 289   #
 290   $InputBuffer = "";
 291 
 292   # @ProcessedTokens contains either references to an array containing token label
 293   # and matched text or any unmatched input text string...
 294   #
 295   @ProcessedTokens = ();
 296 
 297   # Setup a default annoymous function reference to generate an array reference
 298   # containing $Token and text matched to $RegexPattern.
 299   #
 300   $TokenMatchActionRef = defined $TokenMatchActionRef ? $TokenMatchActionRef : sub { my($This, $Label, $MatchedText) = @_; return [$Label, $MatchedText]  };
 301 
 302   # Setup an annoymous function to match and split input text using $RegexPattern for
 303   # a specific token during its lexer invocation in chained lexers.
 304   #
 305   # The usage of parenthesis around $RegexPattern during split allows capturing of matched
 306   # text, which is subsequently processed to retrieve matched $Token values. The split function
 307   # inserts a "" separator in the returned array as first entry whenever $InputText starts with
 308   # $RegexPattern. $InputText is returned as the only element for no match.
 309   #
 310   $TokenMatchAndSplitRef = sub { my($InputText) = @_; return split /($RegexPattern)/, $InputText; };
 311 
 312   # Setup a lexer for $TokenLabel as an annoymous function and return its reference to caller
 313   # which in turns chains the lexers for all $Tokens before returning a reference to a lexer
 314   # at top of the lexer chain.
 315   #
 316   # Perl maintains scope of all variables defined with in the scope of the current function
 317   # during invocation of annoymous function even after the return call.
 318   #
 319   return sub {
 320     my($Mode) = @_;
 321 
 322     # Currenly supported value for mode: Peek, Next
 323     #
 324     $Mode = defined $Mode ? $Mode : 'Next';
 325 
 326     while (@ProcessedTokens == 0 && defined $InputBuffer ) {
 327       # Get any new input....
 328       my $NewInput = $InputIteratorOrLexer->();
 329 
 330       if (ref $NewInput) {
 331         # Input is an array reference containing matched token and text returned by
 332         # a chained lexer downstream lexer...
 333         #
 334         # Match $RegexPattern in available buffer text to retieve any matched text
 335         # for current $Token. $Separator might be "": $RegexPattern is at start of
 336         # of $InputBuffer
 337         #
 338         # Process input buffer containing text to be matched for the current lexer
 339         # which didn't get processed earlier during @NewTokens > 2  while loop:
 340         # no match for current lexer or more input available. It maintains order
 341         # of token matching in input stream.
 342         #
 343         my($Separator, $MatchedTokenRefOrText);
 344 
 345         ($Separator, $MatchedTokenRefOrText) = $TokenMatchAndSplitRef->($InputBuffer);
 346         if (defined $MatchedTokenRefOrText) {
 347           $MatchedTokenRefOrText = $TokenMatchActionRef->($This, $TokenLabel, $MatchedTokenRefOrText);
 348         }
 349 
 350         # Collect valid token references or text...
 351         push @ProcessedTokens, grep { defined $_ && $_ ne "" } ($Separator, $MatchedTokenRefOrText, $NewInput);
 352 
 353         # Empty put buffer...
 354         $InputBuffer = "";
 355 
 356         # Get out of the loop as processed token refererences and/or text  are available...
 357         last;
 358       }
 359 
 360       # Process input retrieved from downstream lexer or input iterator which hasn't
 361       # been processed into tokens..
 362       if (defined $NewInput) {
 363         $InputBuffer .= $NewInput;
 364       }
 365 
 366       # Retrieve any matched tokens from available input for the current lexer...
 367       #
 368       my(@NewTokens) = $TokenMatchAndSplitRef->($InputBuffer);
 369 
 370       while ( @NewTokens > 2 || @NewTokens && !defined $NewInput) {
 371         # Scenario 1: Complete match
 372         #   @NewTokens > 2 : Availability of separator, matched token text, separator.
 373         #   The separator might correspond to token for a token for upstream lexer followed
 374         #   by matched token from current lexer. It ends up getting passed to upsrteam
 375         #   lexer for processing.
 376         #
 377         # Scenario 2: No more input available from iterator or downstream lexer
 378         #   @NewTokens <= 2 and no more input implies any left over text in buffer. And
 379         #   it ends up getting passed to upsrteam for processing.
 380         #
 381 
 382         # Take off any unprocessed input text that doesn't match off the buffer: It'll be
 383         # passed to upstream chained lexer for processing...
 384         #
 385         push @ProcessedTokens, shift @NewTokens;
 386 
 387         if (@NewTokens) {
 388           my $MatchedTokenText = shift @NewTokens;
 389           push @ProcessedTokens, $TokenMatchActionRef->($This, $TokenLabel, $MatchedTokenText);
 390         }
 391       }
 392 
 393       # Retrieve any leftover text from NewTokens and put it back into InputBuffer for
 394       # processing by current lexer. All token references have been taken out....
 395       #
 396       $InputBuffer = "";
 397       if (@NewTokens) {
 398         $InputBuffer = join "", @NewTokens;
 399       }
 400 
 401       if (!defined $NewInput) {
 402         # No more input from the downstream lexer...
 403         $InputBuffer = undef;
 404       }
 405 
 406       # Clean up any empty strings from ProcessedTokens containing token
 407       # references or text...
 408       @ProcessedTokens = grep { $_ ne "" } @ProcessedTokens;
 409 
 410     }
 411 
 412     # Return reference to an array containing token and matched text or just unmatched input text...
 413     my $TokenRefOrText = undef;
 414 
 415     if (@ProcessedTokens) {
 416       # Get first available reference either by just peeking or removing it from the list
 417       # of available tokens...
 418       $TokenRefOrText = ($Mode =~ /^Peek$/i) ?  $ProcessedTokens[0] : shift @ProcessedTokens;
 419     }
 420 
 421     return $TokenRefOrText;
 422   };
 423 }
 424 
 425 # Is it a lexer object?
 426 sub _IsLexer {
 427   my($Object) = @_;
 428 
 429   return (Scalar::Util::blessed($Object) && $Object->isa($ClassName)) ? 1 : 0;
 430 }
 431 
 432 # Return a string containing information about lexer...
 433 sub StringifyLexer {
 434   my($This) = @_;
 435   my($LexerString);
 436 
 437   $LexerString = "Lexer: PackageName: $ClassName; " . $This->_GetLexerInfoString();
 438 
 439   return $LexerString;
 440 }
 441 
 442 # Return a string containing information about lexer...
 443 sub _GetLexerInfoString {
 444   my($This) = @_;
 445   my($LexerInfoString, $TokensSpec, $TokenSpec, $TokenLabel, $TokenMatchRegex, $TokenMatchAction);
 446 
 447   $LexerInfoString = "InputType: $This->{InputType}";
 448 
 449   if ($This->{InputType} =~ /^String$/i) {
 450     $LexerInfoString .= "; InputString: $This->{Input}";
 451   }
 452 
 453   $TokensSpec = "TokensSpecifications: <None>";
 454   if (@{$This->{TokensSpec}}) {
 455     $TokensSpec = "TokensSpecifications: < [Label, MatchRegex, MatchAction]:";
 456     for $TokenSpec (@{$This->{TokensSpec}}) {
 457       ($TokenLabel, $TokenMatchRegex) = @{$TokenSpec};
 458       $TokenMatchAction = (@{$TokenSpec} == 3) ? "$TokenSpec->[2]" : "undefined";
 459       $TokensSpec .= " [$TokenLabel, $TokenMatchRegex, $TokenMatchAction]";
 460     }
 461     $TokensSpec .= " >";
 462   }
 463 
 464   $LexerInfoString .= "; $TokensSpec";
 465 
 466   return $LexerInfoString;
 467 }
 468