1 package Parsers::Lexer; 2 # 3 # $RCSfile: Lexer.pm,v $ 4 # $Date: 2015/02/28 20:50:55 $ 5 # $Revision: 1.10 $ 6 # 7 # Author: Manish Sud <msud@san.rr.com> 8 # 9 # Copyright (C) 2015 Manish Sud. All rights reserved. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 use strict; 30 use Carp; 31 use Exporter; 32 use Scalar::Util (); 33 34 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); 35 36 @ISA = qw(Exporter); 37 @EXPORT = qw(); 38 @EXPORT_OK = qw(); 39 40 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]); 41 42 # Setup class variables... 43 my($ClassName); 44 _InitializeClass(); 45 46 # Overload Perl functions... 47 use overload '""' => 'StringifyLexer'; 48 49 # Class constructor... 50 sub new { 51 my($Class, $Input, @TokensSpec) = @_; 52 53 # Initialize object... 54 my $This = {}; 55 bless $This, ref($Class) || $Class; 56 $This->_InitializeLexer(); 57 58 $This->_ValidateParametersAndGenerateLexer($Input, @TokensSpec); 59 60 return $This; 61 } 62 63 64 # Initialize class ... 65 sub _InitializeClass { 66 #Class name... 67 $ClassName = __PACKAGE__; 68 } 69 70 # Initialize object data... 71 # 72 sub _InitializeLexer { 73 my($This) = @_; 74 75 # Input parameter used by lexer to retrieve text to be lexed. Supported parameter types: 76 # . Reference to input iterator function 77 # . Reference to an open file handle 78 # . Text string 79 # 80 $This->{Input} = undef; 81 82 # Type of input paramater determined using Perl ref function: 83 # . InputIterator - ref returns CODE 84 # . FileStream - ref return GLOB and fileno is valid 85 # . String - ref return an empty string 86 # 87 $This->{InputType} = ''; 88 89 # Tokens specifications supplied by the caller. It's an array containing references 90 # to arrays with each containing TokenLabel and TokenMatchRegex pair along with 91 # an option reference to code to be executed after a matched. 92 # 93 # For example: 94 # 95 # @LexerTokensSpec = ( 96 # [ 'LETTER', qr/[a-zA-Z]/ ], 97 # [ 'NUMBER', qr/\d+/ ], 98 # [ 'SPACE', qr/[ ]*/, sub { my($This, $TokenLabel, $MatchedText) = @_; return ''; } ], 99 # [ 'NEWLINE', qr/(?:\r\n|\r|\n)/, sub { my($This, $TokenLabel, $MatchedText) = @_; return "\n"; } ], 100 # [ 'CHAR', qr/[\.]/ ], 101 # ); 102 # 103 @{$This->{TokensSpec}} = (); 104 105 # Refernce to chained lexer... 106 $This->{ChainedLexer} = undef; 107 108 return $This; 109 } 110 111 # Validate input parameters and generate a chained lexer... 112 # 113 sub _ValidateParametersAndGenerateLexer { 114 my($This, $Input, @TokensSpec) = @_; 115 116 # 117 # Validate input to be lexed... 118 if (!defined $Input) { 119 croak "Error: ${ClassName}->new: Object can't be instantiated: Input is not defined. Supported values: a reference to input iterator function, a reference to an open file handle or a text string..."; 120 } 121 $This->{Input} = $Input; 122 123 # Check input parameter type... 124 my($InputType); 125 126 $InputType = ref $Input; 127 if ($InputType =~ /CODE/i) { 128 # Input iterator... 129 $This->{InputType} = "InputIterator"; 130 } 131 elsif ($InputType =~ /GLOB/i && defined fileno $Input) { 132 # Input stream... 133 $This->{InputType} = "FileStream"; 134 } 135 elsif ($InputType) { 136 # Perl ref function returns nonempty string for all other references... 137 croak "Error: ${ClassName}->new: Object can't be instantiated: Invalid input parameter type specified. Supported parameter types: a reference to input iterator function, a reference to an open file handle or a text string..."; 138 } 139 else { 140 # Input string... 141 $This->{InputType} = "String"; 142 } 143 144 # Check tokens specifications... 145 if (!@TokensSpec) { 146 croak "Error: ${ClassName}->new: TokensSpec is not defined or the array doesn't contain any values. Supported values: a reference to an array containg token label, regular expression to match and an option reference to function to modify matched values..."; 147 } 148 @{$This->{TokensSpec}} = @TokensSpec; 149 150 $This->_GenerateLexer($Input, @TokensSpec); 151 152 return $This; 153 } 154 155 # Generate a lexer using reference to an input iterator function, an open file 156 # handle or an input string passed as first parameter by the caller along 157 # with token specifications as second paramater... 158 # 159 sub _GenerateLexer { 160 my($This, $Input, @TokensSpec) = @_; 161 162 if ($This->{InputType} =~ /^InputIterator$/i) { 163 $This->_GenerateInputIteratorLexer($Input, @TokensSpec); 164 } 165 elsif ($This->{InputType} =~ /^FileStream$/i) { 166 $This->_GenerateInputFileStreamLexer($Input, @TokensSpec); 167 } 168 elsif ($This->{InputType} =~ /^String$/i) { 169 $This->_GenerateInputStringLexer($Input, @TokensSpec); 170 } 171 else { 172 croak "Error: ${ClassName}->new: Object can't be instantiated: Invalid input parameter type specified. Supported parameter types: a reference to input iterator function, a reference to an open file handle or a text string..."; 173 } 174 175 return $This; 176 } 177 178 # Generate a lexer using specifed input iterator... 179 # 180 sub _GenerateInputIteratorLexer { 181 my($This, $InputIteratorRef, @TokensSpec) = @_; 182 183 $This->_GenerateChainedLexer($InputIteratorRef, @TokensSpec); 184 185 return $This; 186 } 187 188 # Generate a lexer using specifed input file stream reference... 189 # 190 sub _GenerateInputFileStreamLexer { 191 my($This, $FileHandleRef, @TokensSpec) = @_; 192 193 # Iterator is a annoymous function reference and Perl keeps $FileHandleRef 194 # in scope during its execution. 195 196 $This->_GenerateChainedLexer( sub { return <$FileHandleRef>; }, @TokensSpec); 197 198 return $This; 199 } 200 201 # Generate a lexer using specifed input string... 202 # 203 sub _GenerateInputStringLexer { 204 my($This, $Text, @TokensSpec) = @_; 205 my(@InputText) = ($Text); 206 207 # Iterator is a annoymous function reference and Perl keeps @InputText 208 # in scope during its execution. 209 210 $This->_GenerateChainedLexer( sub { return shift @InputText; }, @TokensSpec); 211 212 return $This; 213 } 214 215 # Get next available token label and value pair as an array reference or unrecognized 216 # text from input stream by either removing it from the input or simply peeking ahead... 217 # 218 # Supported mode values: Peek, Next. Default: Next 219 # 220 sub Lex { 221 my($This, $Mode) = @_; 222 223 return $This->{ChainedLexer}->($Mode) 224 } 225 226 # Get next available token label and value pair as an array reference or unrecognized 227 # text from input stream by either removing it from the input stream... 228 # 229 sub Next { 230 my($This) = @_; 231 232 return $This->Lex(); 233 } 234 235 # Get next available token label and value pair as an array reference or unrecognized 236 # text from input stream by simply peeking ahead and without removing it from the input 237 # stream.. 238 # 239 sub Peek { 240 my($This) = @_; 241 242 return $This->Lex('Peek') 243 } 244 245 # Get a reference to lexer method to be used by the caller... 246 # 247 sub GetLex { 248 my($This) = @_; 249 250 return sub { $This->Lex(); }; 251 } 252 253 # The chained lexer generation is implemented based on examples in Higher-order Perl 254 # [ Ref 126 ] book. 255 # 256 # Generate a lexer using specified input iterator and chaining it with other lexers generated 257 # for all token specifications. The lexer generated for first token specification uses input 258 # iterator to retrieve any available input text; the subsequent chained lexeres for rest 259 # of the tokens use lexers generated for previous token specifications to get next input 260 # which might be unmatched input text or a reference to an array containing token and 261 # matched text pair. 262 # 263 sub _GenerateChainedLexer { 264 my($This, $InputIteratorRef, @TokensSpec) = @_; 265 my($TokenSpecRef, $ChainedLexer); 266 267 $ChainedLexer = undef; 268 for $TokenSpecRef (@TokensSpec) { 269 $ChainedLexer = defined $ChainedLexer ? $This->_GenerateLexerForToken($ChainedLexer, @{$TokenSpecRef}) : $This->_GenerateLexerForToken($InputIteratorRef, @{$TokenSpecRef}); 270 } 271 272 $This->{ChainedLexer} = $ChainedLexer; 273 274 return $This; 275 } 276 277 278 # Generate a lexer using specifed token specification using specified input or 279 # input retrieved using another token lexer. The lexer retrieving input from the 280 # specified input stream is at the bottom of the chain. 281 # 282 sub _GenerateLexerForToken { 283 my($This, $InputIteratorOrLexer, $TokenLabel, $RegexPattern, $TokenMatchActionRef) = @_; 284 my($TokenMatchAndSplitRef, $InputBuffer, @ProcessedTokens); 285 286 # Input buffer for a specific lexer in chained lexers containing unprocessed 287 # text for token specifications retrieved from a downstrean lexer or intial 288 # input... 289 # 290 $InputBuffer = ""; 291 292 # @ProcessedTokens contains either references to an array containing token label 293 # and matched text or any unmatched input text string... 294 # 295 @ProcessedTokens = (); 296 297 # Setup a default annoymous function reference to generate an array reference 298 # containing $Token and text matched to $RegexPattern. 299 # 300 $TokenMatchActionRef = defined $TokenMatchActionRef ? $TokenMatchActionRef : sub { my($This, $Label, $MatchedText) = @_; return [$Label, $MatchedText] }; 301 302 # Setup an annoymous function to match and split input text using $RegexPattern for 303 # a specific token during its lexer invocation in chained lexers. 304 # 305 # The usage of parenthesis around $RegexPattern during split allows capturing of matched 306 # text, which is subsequently processed to retrieve matched $Token values. The split function 307 # inserts a "" separator in the returned array as first entry whenever $InputText starts with 308 # $RegexPattern. $InputText is returned as the only element for no match. 309 # 310 $TokenMatchAndSplitRef = sub { my($InputText) = @_; return split /($RegexPattern)/, $InputText; }; 311 312 # Setup a lexer for $TokenLabel as an annoymous function and return its reference to caller 313 # which in turns chains the lexers for all $Tokens before returning a reference to a lexer 314 # at top of the lexer chain. 315 # 316 # Perl maintains scope of all variables defined with in the scope of the current function 317 # during invocation of annoymous function even after the return call. 318 # 319 return sub { 320 my($Mode) = @_; 321 322 # Currenly supported value for mode: Peek, Next 323 # 324 $Mode = defined $Mode ? $Mode : 'Next'; 325 326 while (@ProcessedTokens == 0 && defined $InputBuffer ) { 327 # Get any new input.... 328 my $NewInput = $InputIteratorOrLexer->(); 329 330 if (ref $NewInput) { 331 # Input is an array reference containing matched token and text returned by 332 # a chained lexer downstream lexer... 333 # 334 # Match $RegexPattern in available buffer text to retieve any matched text 335 # for current $Token. $Separator might be "": $RegexPattern is at start of 336 # of $InputBuffer 337 # 338 # Process input buffer containing text to be matched for the current lexer 339 # which didn't get processed earlier during @NewTokens > 2 while loop: 340 # no match for current lexer or more input available. It maintains order 341 # of token matching in input stream. 342 # 343 my($Separator, $MatchedTokenRefOrText); 344 345 ($Separator, $MatchedTokenRefOrText) = $TokenMatchAndSplitRef->($InputBuffer); 346 if (defined $MatchedTokenRefOrText) { 347 $MatchedTokenRefOrText = $TokenMatchActionRef->($This, $TokenLabel, $MatchedTokenRefOrText); 348 } 349 350 # Collect valid token references or text... 351 push @ProcessedTokens, grep { defined $_ && $_ ne "" } ($Separator, $MatchedTokenRefOrText, $NewInput); 352 353 # Empty put buffer... 354 $InputBuffer = ""; 355 356 # Get out of the loop as processed token refererences and/or text are available... 357 last; 358 } 359 360 # Process input retrieved from downstream lexer or input iterator which hasn't 361 # been processed into tokens.. 362 if (defined $NewInput) { 363 $InputBuffer .= $NewInput; 364 } 365 366 # Retrieve any matched tokens from available input for the current lexer... 367 # 368 my(@NewTokens) = $TokenMatchAndSplitRef->($InputBuffer); 369 370 while ( @NewTokens > 2 || @NewTokens && !defined $NewInput) { 371 # Scenario 1: Complete match 372 # @NewTokens > 2 : Availability of separator, matched token text, separator. 373 # The separator might correspond to token for a token for upstream lexer followed 374 # by matched token from current lexer. It ends up getting passed to upsrteam 375 # lexer for processing. 376 # 377 # Scenario 2: No more input available from iterator or downstream lexer 378 # @NewTokens <= 2 and no more input implies any left over text in buffer. And 379 # it ends up getting passed to upsrteam for processing. 380 # 381 382 # Take off any unprocessed input text that doesn't match off the buffer: It'll be 383 # passed to upstream chained lexer for processing... 384 # 385 push @ProcessedTokens, shift @NewTokens; 386 387 if (@NewTokens) { 388 my $MatchedTokenText = shift @NewTokens; 389 push @ProcessedTokens, $TokenMatchActionRef->($This, $TokenLabel, $MatchedTokenText); 390 } 391 } 392 393 # Retrieve any leftover text from NewTokens and put it back into InputBuffer for 394 # processing by current lexer. All token references have been taken out.... 395 # 396 $InputBuffer = ""; 397 if (@NewTokens) { 398 $InputBuffer = join "", @NewTokens; 399 } 400 401 if (!defined $NewInput) { 402 # No more input from the downstream lexer... 403 $InputBuffer = undef; 404 } 405 406 # Clean up any empty strings from ProcessedTokens containing token 407 # references or text... 408 @ProcessedTokens = grep { $_ ne "" } @ProcessedTokens; 409 410 } 411 412 # Return reference to an array containing token and matched text or just unmatched input text... 413 my $TokenRefOrText = undef; 414 415 if (@ProcessedTokens) { 416 # Get first available reference either by just peeking or removing it from the list 417 # of available tokens... 418 $TokenRefOrText = ($Mode =~ /^Peek$/i) ? $ProcessedTokens[0] : shift @ProcessedTokens; 419 } 420 421 return $TokenRefOrText; 422 }; 423 } 424 425 # Is it a lexer object? 426 sub _IsLexer { 427 my($Object) = @_; 428 429 return (Scalar::Util::blessed($Object) && $Object->isa($ClassName)) ? 1 : 0; 430 } 431 432 # Return a string containing information about lexer... 433 sub StringifyLexer { 434 my($This) = @_; 435 my($LexerString); 436 437 $LexerString = "Lexer: PackageName: $ClassName; " . $This->_GetLexerInfoString(); 438 439 return $LexerString; 440 } 441 442 # Return a string containing information about lexer... 443 sub _GetLexerInfoString { 444 my($This) = @_; 445 my($LexerInfoString, $TokensSpec, $TokenSpec, $TokenLabel, $TokenMatchRegex, $TokenMatchAction); 446 447 $LexerInfoString = "InputType: $This->{InputType}"; 448 449 if ($This->{InputType} =~ /^String$/i) { 450 $LexerInfoString .= "; InputString: $This->{Input}"; 451 } 452 453 $TokensSpec = "TokensSpecifications: <None>"; 454 if (@{$This->{TokensSpec}}) { 455 $TokensSpec = "TokensSpecifications: < [Label, MatchRegex, MatchAction]:"; 456 for $TokenSpec (@{$This->{TokensSpec}}) { 457 ($TokenLabel, $TokenMatchRegex) = @{$TokenSpec}; 458 $TokenMatchAction = (@{$TokenSpec} == 3) ? "$TokenSpec->[2]" : "undefined"; 459 $TokensSpec .= " [$TokenLabel, $TokenMatchRegex, $TokenMatchAction]"; 460 } 461 $TokensSpec .= " >"; 462 } 463 464 $LexerInfoString .= "; $TokensSpec"; 465 466 return $LexerInfoString; 467 } 468