comparison mayachemtools/lib/Parsers/Lexer.pm @ 0:73ae111cf86f draft

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 11:55:01 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:73ae111cf86f
1 package Parsers::Lexer;
2 #
3 # $RCSfile: Lexer.pm,v $
4 # $Date: 2015/02/28 20:50:55 $
5 # $Revision: 1.10 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability of fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use Carp;
31 use Exporter;
32 use Scalar::Util ();
33
34 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
35
36 @ISA = qw(Exporter);
37 @EXPORT = qw();
38 @EXPORT_OK = qw();
39
40 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]);
41
42 # Setup class variables...
43 my($ClassName);
44 _InitializeClass();
45
46 # Overload Perl functions...
47 use overload '""' => 'StringifyLexer';
48
49 # Class constructor...
50 sub new {
51 my($Class, $Input, @TokensSpec) = @_;
52
53 # Initialize object...
54 my $This = {};
55 bless $This, ref($Class) || $Class;
56 $This->_InitializeLexer();
57
58 $This->_ValidateParametersAndGenerateLexer($Input, @TokensSpec);
59
60 return $This;
61 }
62
63
64 # Initialize class ...
65 sub _InitializeClass {
66 #Class name...
67 $ClassName = __PACKAGE__;
68 }
69
70 # Initialize object data...
71 #
72 sub _InitializeLexer {
73 my($This) = @_;
74
75 # Input parameter used by lexer to retrieve text to be lexed. Supported parameter types:
76 # . Reference to input iterator function
77 # . Reference to an open file handle
78 # . Text string
79 #
80 $This->{Input} = undef;
81
82 # Type of input paramater determined using Perl ref function:
83 # . InputIterator - ref returns CODE
84 # . FileStream - ref return GLOB and fileno is valid
85 # . String - ref return an empty string
86 #
87 $This->{InputType} = '';
88
89 # Tokens specifications supplied by the caller. It's an array containing references
90 # to arrays with each containing TokenLabel and TokenMatchRegex pair along with
91 # an option reference to code to be executed after a matched.
92 #
93 # For example:
94 #
95 # @LexerTokensSpec = (
96 # [ 'LETTER', qr/[a-zA-Z]/ ],
97 # [ 'NUMBER', qr/\d+/ ],
98 # [ 'SPACE', qr/[ ]*/, sub { my($This, $TokenLabel, $MatchedText) = @_; return ''; } ],
99 # [ 'NEWLINE', qr/(?:\r\n|\r|\n)/, sub { my($This, $TokenLabel, $MatchedText) = @_; return "\n"; } ],
100 # [ 'CHAR', qr/[\.]/ ],
101 # );
102 #
103 @{$This->{TokensSpec}} = ();
104
105 # Refernce to chained lexer...
106 $This->{ChainedLexer} = undef;
107
108 return $This;
109 }
110
111 # Validate input parameters and generate a chained lexer...
112 #
113 sub _ValidateParametersAndGenerateLexer {
114 my($This, $Input, @TokensSpec) = @_;
115
116 #
117 # Validate input to be lexed...
118 if (!defined $Input) {
119 croak "Error: ${ClassName}->new: Object can't be instantiated: Input is not defined. Supported values: a reference to input iterator function, a reference to an open file handle or a text string...";
120 }
121 $This->{Input} = $Input;
122
123 # Check input parameter type...
124 my($InputType);
125
126 $InputType = ref $Input;
127 if ($InputType =~ /CODE/i) {
128 # Input iterator...
129 $This->{InputType} = "InputIterator";
130 }
131 elsif ($InputType =~ /GLOB/i && defined fileno $Input) {
132 # Input stream...
133 $This->{InputType} = "FileStream";
134 }
135 elsif ($InputType) {
136 # Perl ref function returns nonempty string for all other references...
137 croak "Error: ${ClassName}->new: Object can't be instantiated: Invalid input parameter type specified. Supported parameter types: a reference to input iterator function, a reference to an open file handle or a text string...";
138 }
139 else {
140 # Input string...
141 $This->{InputType} = "String";
142 }
143
144 # Check tokens specifications...
145 if (!@TokensSpec) {
146 croak "Error: ${ClassName}->new: TokensSpec is not defined or the array doesn't contain any values. Supported values: a reference to an array containg token label, regular expression to match and an option reference to function to modify matched values...";
147 }
148 @{$This->{TokensSpec}} = @TokensSpec;
149
150 $This->_GenerateLexer($Input, @TokensSpec);
151
152 return $This;
153 }
154
155 # Generate a lexer using reference to an input iterator function, an open file
156 # handle or an input string passed as first parameter by the caller along
157 # with token specifications as second paramater...
158 #
159 sub _GenerateLexer {
160 my($This, $Input, @TokensSpec) = @_;
161
162 if ($This->{InputType} =~ /^InputIterator$/i) {
163 $This->_GenerateInputIteratorLexer($Input, @TokensSpec);
164 }
165 elsif ($This->{InputType} =~ /^FileStream$/i) {
166 $This->_GenerateInputFileStreamLexer($Input, @TokensSpec);
167 }
168 elsif ($This->{InputType} =~ /^String$/i) {
169 $This->_GenerateInputStringLexer($Input, @TokensSpec);
170 }
171 else {
172 croak "Error: ${ClassName}->new: Object can't be instantiated: Invalid input parameter type specified. Supported parameter types: a reference to input iterator function, a reference to an open file handle or a text string...";
173 }
174
175 return $This;
176 }
177
178 # Generate a lexer using specifed input iterator...
179 #
180 sub _GenerateInputIteratorLexer {
181 my($This, $InputIteratorRef, @TokensSpec) = @_;
182
183 $This->_GenerateChainedLexer($InputIteratorRef, @TokensSpec);
184
185 return $This;
186 }
187
188 # Generate a lexer using specifed input file stream reference...
189 #
190 sub _GenerateInputFileStreamLexer {
191 my($This, $FileHandleRef, @TokensSpec) = @_;
192
193 # Iterator is a annoymous function reference and Perl keeps $FileHandleRef
194 # in scope during its execution.
195
196 $This->_GenerateChainedLexer( sub { return <$FileHandleRef>; }, @TokensSpec);
197
198 return $This;
199 }
200
201 # Generate a lexer using specifed input string...
202 #
203 sub _GenerateInputStringLexer {
204 my($This, $Text, @TokensSpec) = @_;
205 my(@InputText) = ($Text);
206
207 # Iterator is a annoymous function reference and Perl keeps @InputText
208 # in scope during its execution.
209
210 $This->_GenerateChainedLexer( sub { return shift @InputText; }, @TokensSpec);
211
212 return $This;
213 }
214
215 # Get next available token label and value pair as an array reference or unrecognized
216 # text from input stream by either removing it from the input or simply peeking ahead...
217 #
218 # Supported mode values: Peek, Next. Default: Next
219 #
220 sub Lex {
221 my($This, $Mode) = @_;
222
223 return $This->{ChainedLexer}->($Mode)
224 }
225
226 # Get next available token label and value pair as an array reference or unrecognized
227 # text from input stream by either removing it from the input stream...
228 #
229 sub Next {
230 my($This) = @_;
231
232 return $This->Lex();
233 }
234
235 # Get next available token label and value pair as an array reference or unrecognized
236 # text from input stream by simply peeking ahead and without removing it from the input
237 # stream..
238 #
239 sub Peek {
240 my($This) = @_;
241
242 return $This->Lex('Peek')
243 }
244
245 # Get a reference to lexer method to be used by the caller...
246 #
247 sub GetLex {
248 my($This) = @_;
249
250 return sub { $This->Lex(); };
251 }
252
253 # The chained lexer generation is implemented based on examples in Higher-order Perl
254 # [ Ref 126 ] book.
255 #
256 # Generate a lexer using specified input iterator and chaining it with other lexers generated
257 # for all token specifications. The lexer generated for first token specification uses input
258 # iterator to retrieve any available input text; the subsequent chained lexeres for rest
259 # of the tokens use lexers generated for previous token specifications to get next input
260 # which might be unmatched input text or a reference to an array containing token and
261 # matched text pair.
262 #
263 sub _GenerateChainedLexer {
264 my($This, $InputIteratorRef, @TokensSpec) = @_;
265 my($TokenSpecRef, $ChainedLexer);
266
267 $ChainedLexer = undef;
268 for $TokenSpecRef (@TokensSpec) {
269 $ChainedLexer = defined $ChainedLexer ? $This->_GenerateLexerForToken($ChainedLexer, @{$TokenSpecRef}) : $This->_GenerateLexerForToken($InputIteratorRef, @{$TokenSpecRef});
270 }
271
272 $This->{ChainedLexer} = $ChainedLexer;
273
274 return $This;
275 }
276
277
278 # Generate a lexer using specifed token specification using specified input or
279 # input retrieved using another token lexer. The lexer retrieving input from the
280 # specified input stream is at the bottom of the chain.
281 #
282 sub _GenerateLexerForToken {
283 my($This, $InputIteratorOrLexer, $TokenLabel, $RegexPattern, $TokenMatchActionRef) = @_;
284 my($TokenMatchAndSplitRef, $InputBuffer, @ProcessedTokens);
285
286 # Input buffer for a specific lexer in chained lexers containing unprocessed
287 # text for token specifications retrieved from a downstrean lexer or intial
288 # input...
289 #
290 $InputBuffer = "";
291
292 # @ProcessedTokens contains either references to an array containing token label
293 # and matched text or any unmatched input text string...
294 #
295 @ProcessedTokens = ();
296
297 # Setup a default annoymous function reference to generate an array reference
298 # containing $Token and text matched to $RegexPattern.
299 #
300 $TokenMatchActionRef = defined $TokenMatchActionRef ? $TokenMatchActionRef : sub { my($This, $Label, $MatchedText) = @_; return [$Label, $MatchedText] };
301
302 # Setup an annoymous function to match and split input text using $RegexPattern for
303 # a specific token during its lexer invocation in chained lexers.
304 #
305 # The usage of parenthesis around $RegexPattern during split allows capturing of matched
306 # text, which is subsequently processed to retrieve matched $Token values. The split function
307 # inserts a "" separator in the returned array as first entry whenever $InputText starts with
308 # $RegexPattern. $InputText is returned as the only element for no match.
309 #
310 $TokenMatchAndSplitRef = sub { my($InputText) = @_; return split /($RegexPattern)/, $InputText; };
311
312 # Setup a lexer for $TokenLabel as an annoymous function and return its reference to caller
313 # which in turns chains the lexers for all $Tokens before returning a reference to a lexer
314 # at top of the lexer chain.
315 #
316 # Perl maintains scope of all variables defined with in the scope of the current function
317 # during invocation of annoymous function even after the return call.
318 #
319 return sub {
320 my($Mode) = @_;
321
322 # Currenly supported value for mode: Peek, Next
323 #
324 $Mode = defined $Mode ? $Mode : 'Next';
325
326 while (@ProcessedTokens == 0 && defined $InputBuffer ) {
327 # Get any new input....
328 my $NewInput = $InputIteratorOrLexer->();
329
330 if (ref $NewInput) {
331 # Input is an array reference containing matched token and text returned by
332 # a chained lexer downstream lexer...
333 #
334 # Match $RegexPattern in available buffer text to retieve any matched text
335 # for current $Token. $Separator might be "": $RegexPattern is at start of
336 # of $InputBuffer
337 #
338 # Process input buffer containing text to be matched for the current lexer
339 # which didn't get processed earlier during @NewTokens > 2 while loop:
340 # no match for current lexer or more input available. It maintains order
341 # of token matching in input stream.
342 #
343 my($Separator, $MatchedTokenRefOrText);
344
345 ($Separator, $MatchedTokenRefOrText) = $TokenMatchAndSplitRef->($InputBuffer);
346 if (defined $MatchedTokenRefOrText) {
347 $MatchedTokenRefOrText = $TokenMatchActionRef->($This, $TokenLabel, $MatchedTokenRefOrText);
348 }
349
350 # Collect valid token references or text...
351 push @ProcessedTokens, grep { defined $_ && $_ ne "" } ($Separator, $MatchedTokenRefOrText, $NewInput);
352
353 # Empty put buffer...
354 $InputBuffer = "";
355
356 # Get out of the loop as processed token refererences and/or text are available...
357 last;
358 }
359
360 # Process input retrieved from downstream lexer or input iterator which hasn't
361 # been processed into tokens..
362 if (defined $NewInput) {
363 $InputBuffer .= $NewInput;
364 }
365
366 # Retrieve any matched tokens from available input for the current lexer...
367 #
368 my(@NewTokens) = $TokenMatchAndSplitRef->($InputBuffer);
369
370 while ( @NewTokens > 2 || @NewTokens && !defined $NewInput) {
371 # Scenario 1: Complete match
372 # @NewTokens > 2 : Availability of separator, matched token text, separator.
373 # The separator might correspond to token for a token for upstream lexer followed
374 # by matched token from current lexer. It ends up getting passed to upsrteam
375 # lexer for processing.
376 #
377 # Scenario 2: No more input available from iterator or downstream lexer
378 # @NewTokens <= 2 and no more input implies any left over text in buffer. And
379 # it ends up getting passed to upsrteam for processing.
380 #
381
382 # Take off any unprocessed input text that doesn't match off the buffer: It'll be
383 # passed to upstream chained lexer for processing...
384 #
385 push @ProcessedTokens, shift @NewTokens;
386
387 if (@NewTokens) {
388 my $MatchedTokenText = shift @NewTokens;
389 push @ProcessedTokens, $TokenMatchActionRef->($This, $TokenLabel, $MatchedTokenText);
390 }
391 }
392
393 # Retrieve any leftover text from NewTokens and put it back into InputBuffer for
394 # processing by current lexer. All token references have been taken out....
395 #
396 $InputBuffer = "";
397 if (@NewTokens) {
398 $InputBuffer = join "", @NewTokens;
399 }
400
401 if (!defined $NewInput) {
402 # No more input from the downstream lexer...
403 $InputBuffer = undef;
404 }
405
406 # Clean up any empty strings from ProcessedTokens containing token
407 # references or text...
408 @ProcessedTokens = grep { $_ ne "" } @ProcessedTokens;
409
410 }
411
412 # Return reference to an array containing token and matched text or just unmatched input text...
413 my $TokenRefOrText = undef;
414
415 if (@ProcessedTokens) {
416 # Get first available reference either by just peeking or removing it from the list
417 # of available tokens...
418 $TokenRefOrText = ($Mode =~ /^Peek$/i) ? $ProcessedTokens[0] : shift @ProcessedTokens;
419 }
420
421 return $TokenRefOrText;
422 };
423 }
424
425 # Is it a lexer object?
426 sub _IsLexer {
427 my($Object) = @_;
428
429 return (Scalar::Util::blessed($Object) && $Object->isa($ClassName)) ? 1 : 0;
430 }
431
432 # Return a string containing information about lexer...
433 sub StringifyLexer {
434 my($This) = @_;
435 my($LexerString);
436
437 $LexerString = "Lexer: PackageName: $ClassName; " . $This->_GetLexerInfoString();
438
439 return $LexerString;
440 }
441
442 # Return a string containing information about lexer...
443 sub _GetLexerInfoString {
444 my($This) = @_;
445 my($LexerInfoString, $TokensSpec, $TokenSpec, $TokenLabel, $TokenMatchRegex, $TokenMatchAction);
446
447 $LexerInfoString = "InputType: $This->{InputType}";
448
449 if ($This->{InputType} =~ /^String$/i) {
450 $LexerInfoString .= "; InputString: $This->{Input}";
451 }
452
453 $TokensSpec = "TokensSpecifications: <None>";
454 if (@{$This->{TokensSpec}}) {
455 $TokensSpec = "TokensSpecifications: < [Label, MatchRegex, MatchAction]:";
456 for $TokenSpec (@{$This->{TokensSpec}}) {
457 ($TokenLabel, $TokenMatchRegex) = @{$TokenSpec};
458 $TokenMatchAction = (@{$TokenSpec} == 3) ? "$TokenSpec->[2]" : "undefined";
459 $TokensSpec .= " [$TokenLabel, $TokenMatchRegex, $TokenMatchAction]";
460 }
461 $TokensSpec .= " >";
462 }
463
464 $LexerInfoString .= "; $TokensSpec";
465
466 return $LexerInfoString;
467 }
468
469 1;
470
471 __END__
472
473 =head1 NAME
474
475 Parsers::Lexer
476
477 =head1 SYNOPSIS
478
479 use Parsers::Lexer;
480
481 use Parsers::Lexer qw(:all);
482
483 =head1 DESCRIPTION
484
485 B<Lexer> class provides the following methods:
486
487 new, GetLex, Lex, Next, Peek, StringifyLexer
488
489 The object oriented chained B<Lexer> is implemented based on examples available in
490 Higher-order Perl [ Ref 126 ] book by Mark J. Dominus. It is designed to be used
491 both in standalone mode or as a base class for B<YYLexer>.
492
493 A chained lexer is created by generating a lexer for for the first specified token
494 specification using specified input and chaining it with other lexers generated for all
495 subsequent token specifications. The lexer generated for the first token specification
496 uses input iterator to retrieve any available input text; the subsequent chained lexeres
497 for rest of the token specifications use lexers generated for previous token
498 specifications to get next input, which might be unmatched input text or a reference
499 to an array containing token and matched text pair.
500
501 =head2 METHODS
502
503 =over 4
504
505 =item B<new>
506
507 $Lexer = new Parsers::Lexer($Input, @TokensSpec);
508
509 Using specified I<Input> and I<TokensSpec>, B<new> method generates a new lexer
510 and returns a reference to newly created B<Lexer> object.
511
512 Example:
513
514 # Tokens specifications supplied by the caller. It's an array containing references
515 # to arrays with each containing TokenLabel and TokenMatchRegex pair along with
516 # an option reference to code to be executed after a matched.
517 #
518 @LexerTokensSpec = (
519 [ 'LETTER', qr/[a-zA-Z]/ ],
520 [ 'NUMBER', qr/\d+/ ],
521 [ 'SPACE', qr/[ ]*/,
522 sub { my($This, $TokenLabel, $MatchedText) = @_; return ''; }
523 ],
524 [ 'NEWLINE', qr/(?:\r\n|\r|\n)/,
525 sub { my($This, $TokenLabel, $MatchedText) = @_; return "\n"; }
526 ],
527 [ 'CHAR', qr/./ ]
528 );
529
530 # Input string...
531 $InputText = 'y = 3 + 4';
532 $Lexer = new Parsers::Lexer($InputText, @LexerTokensSpec);
533
534 # Process input stream...
535 while (defined($Token = $Lexer->Lex())) {
536 print "Token: " . ((ref $Token) ? "@{$Token}" : "$Token") . "\n";
537 }
538
539 # Input file...
540 $InputFile = "Input.txt";
541 open INPUTFILE, "$InputFile" or die "Couldn't open $InputFile: $!\n";
542 $Lexer = new Parsers::Lexer(\*INPUTFILE, @LexerTokensSpec);
543
544 # Input file iterator...
545 $InputFile = "TestSimpleCalcParser.txt";
546 open INPUTFILE, "$InputFile" or die "Couldn't open $InputFile: $!\n";
547 $InputIterator = sub { return <INPUTFILE>; };
548 $Lexer = new Parsers::Lexer($InputIterator, @LexerTokensSpec);
549
550 @LexerTokensSpec = (
551 [ 'VAR', qr/[[:alpha:]]+/ ],
552 [ 'NUM', qr/\d+/ ],
553 [ 'OP', qr/[-+=\/]/,
554 sub { my($This, $Label, $Value) = @_;
555 $Value .= "; ord: " . ord $Value;
556 return [$Label, $Value];
557 }
558 ],
559 [ 'NEWLINE', qr/(?:\r\n|\r|\n)/, sub { return [$_[1], 'NewLine']; } ],
560 [ 'SPACE', qr/\s*/, sub { return [$_[1], 'Space']; } ],
561 );
562
563 # Look ahead without removing...
564 $Token = $Lexer->Lex('Peek');
565 if (defined $Token && ref $Token) {
566 print "PEEK: Token: @{$Token}\n\n";
567 }
568
569 # Process input stream...
570 while (defined($Token = $Lexer->Lex())) {
571 print "Token: " . ((ref $Token) ? "@{$Token}" : "$Token") . "\n";
572 }
573
574 =item B<GetLex>
575
576 $LexerRef = $Lexer->GetLex();
577
578 Returns a refernece to I<Lexer> method to the caller for use in a specific B<YYLexer>.
579
580 =item B<Lex>
581
582 $TokenRefOrText = $Lexer->Lex($Mode);
583 if (ref $TokenRefOrText) {
584 ($TokenLabel, $TokenValue) = @{$TokenRefOrText};
585 }
586 else {
587 $TokenText = $TokenRefOrText;
588 }
589
590 Get next available token label and value pair as an array reference or unrecognized
591 text from input stream by either removing it from the input or simply peeking ahead
592 and without removing it from the input stream.
593
594 Possible I<Mode> values: I<Peek, Next>. Default: I<Next>.
595
596 =item B<Next>
597
598 $TokenRefOrText = $Lexer->Next();
599
600 Get next available token label and value pair as an array reference or unrecognized
601 text from input stream by removing it from the input stream.
602
603 =item B<Peek>
604
605 $TokenRefOrText = $Lexer->Peek();
606
607 Get next available token label and value pair as an array reference or unrecognized
608 text from input stream by by simply peeking ahead and without removing it from the
609 input stream.
610
611 =item B<StringifyLexer>
612
613 $LexerString = $Lexer->StringifyLexer();
614
615 Returns a string containing information about I<Lexer> object.
616
617 =back
618
619 =head1 AUTHOR
620
621 Manish Sud <msud@san.rr.com>
622
623 =head1 SEE ALSO
624
625 YYLexer.pm, SimpleCalcYYLexer.pm, SimpleCalcParser.yy
626
627 =head1 COPYRIGHT
628
629 Copyright (C) 2015 Manish Sud. All rights reserved.
630
631 This file is part of MayaChemTools.
632
633 MayaChemTools is free software; you can redistribute it and/or modify it under
634 the terms of the GNU Lesser General Public License as published by the Free
635 Software Foundation; either version 3 of the License, or (at your option)
636 any later version.
637
638 =cut