#*

 This script parses and tries to translate to ruby the teaching language
 pl/0, which was defined by N. Wirth (of Pascal fame) for teaching
 compiler construction ("Compilerbau"). The script demonstrates the
 potential of the pep machine and language. pl/0 is so limited as not
 to be very useful for actual programming.

STATUS

 24 June 2022
   Some syntax is ok, but ruby requires a special syntax for global
   variables (a "$" prefix), which is a problem.

TESTING

 * test program parsing
 >> pep -f eg/plzero.ruby.pss -i 'const c=0; var a; proc x;var b; b:=1; !a ;'

 * check if variable declarations are parsing
 >> pep -f eg/plzero.ruby.pss -i "var a, b, c ;"

 * translate to golang and run, using fn in helpers.pars.sh
 >> pep.goff eg/plzero.ruby.pss test.plzero.primes.txt

NOTES

 A Challenge:

 The script, when it encounters a pl/0 variable name, needs to look
 up the variable in either 'globals' or 'locals'. These locations
 would contain a list of variable names such as ":x:y:z:", with some
 delimiter so that the script can search for ":varname:" etc. These 2
 locations can be named cells on the tape array (i.e. named with
 "mark 'globals';" etc). But the machine currently cannot test for a
 substring in the workspace or a tape cell. One partial solution is

 ----
   put; replace ":x:" "";
   (==) {
     swap;
     # workspace has ":x:"!
     # (written as a one-line comment here because multi-line
     # comments do not nest)
   }
 ,,,,

 But this only works where the variable name is known in advance.

 Variables can have scope in pl/0, so a basic function of a compiler is
 to decide if a variable or function has been defined before it is
 called. This is difficult for the "pep" system at the moment. One
 solution is to introduce a new command "replace ;" which replaces
 with the substring found in the tape cell.

 Also, the compiler needs to distinguish between global and local scope,
 or keep track of what scope a variable or procedure is defined at.
 This appears to require essentially a stack of scopes.

 This grammar doesn't do nested procedures at the moment, because they
 seem silly.
The semi-colon rules for pl/0 seem strange: that is, no semi-colon for
 a statement before "end" tokens.

GRAMMAR

 The pep script below does not follow strictly the grammar listed here.
 For example there is no "block" token in the script (and no nested
 procedures). The script also allows multiline comments in the form
 "{...}" and accepts "write" and "writeln" as synonyms for "!".
 Also, all keywords as parsed by the script are case-insensitive, since
 later versions of pl/0 allowed uppercase keywords (CONST, VAR etc).

 * the pl/0 grammar in wsn/ebnf form
 -------
   program = block "." .
   block =
     [ "const" ident "=" number {"," ident "=" number} ";"]
     [ "var" ident {"," ident} ";"]
     { "procedure" ident ";" block ";" } statement .
   statement =
     [ ident ":=" expression | "call" ident | "?" ident | "!" expression
       | "begin" statement {";" statement } "end"
       | "if" condition "then" statement
       | "while" condition "do" statement ].
   condition =
     "odd" expression |
     expression ("="|"#"|"<"|"<="|">"|">=") expression .
   expression = [ "+"|"-"] term { ("+"|"-") term}.
   term = factor {("*"|"/") factor}.
   factor = ident | number | "(" expression ")".
 ,,,

HISTORY

 24 june 2022
   A little extra work.
 8 sept 2021
   begun

*#

  # ------------------------------------------------------------
  # LEXING PHASE
  # Read one character (or one multi-character lexeme), store its
  # ruby translation in the current tape cell as the token
  # "attribute", and push a parse token ("name*") onto the stack.
  # ------------------------------------------------------------

  read;

  # make character numbers (for error messages) relative
  # to line numbers
  "\n" { clear; nochars; .reparse }
  [:space:] { clear; .reparse }

  "+","-" { put; clear; add "opadd*"; push; .reparse }
  # (a second, unreachable copy of this rule further down has been removed)
  "*","/" { put; clear; add "opmul*"; push; .reparse }

  # modern pascal style comments, but not (* ... *) because
  # they are tricky to parse.
  "{" {
    # save the line number for possible error message later
    clear; lines; put; clear; add "{"; until "}";
    E"}" {
      # can convert to another format
      put; clear;
      # create a "comment" parse token
      # comment-out this line to remove multiline comments from the
      # translated code
      # add "comment*"; push;
      .reparse
    }
    # make an unterminated multiline comment an error
    # to ease debugging of scripts.
    clear; add "[pl/0 error] Unterminated pascal comment { ... } \n";
    # (message typo "stating" corrected)
    add "starting at line number "; get; add "\n";
    print; clear; quit;
  }

  # literal tokens
  # '!' is the print command in pl/0
  # '?' is the read command in pl/0
  # '#' means !=
  # '.' marks the end of a program.
  ".",",","=",";","(",")" { put; add "*"; push; .reparse }
  "!" { clear; add "puts"; put; clear; add "!*"; push; .reparse }
  # the attribute is the ruby expression that reads one integer; it is
  # combined with the variable name in the "?*ident*" reduction below.
  "?" { clear; add "gets.to_i"; put; clear; add "?*"; push; .reparse }

  # also need to parse <= >= comparison operators
  # '=' is comparison and also constant assignment.
  "<",">","#" {
    while [=];
    !"<".!">".!">=".!"<=".!"#" {
      # error, message and quit
      put; clear;
      add "[pl/0 error] unrecognised operator "; get;
      add " at line/char "; lines; add "/"; chars; add "\n";
      print; quit;
    }
    "#" { clear; add "!="; }
    # .reparse added so this rule ends like every other token rule
    put; clear; add "compare*"; push; .reparse
  }

  ":" {
    read;
    ":=" { clear; add "="; put; clear; add ":=*"; push; .reparse }
    put; clear;
    add "[pl/0 error] unrecognised operator "; get;
    add " at line/char "; lines; add "/"; chars; add "\n";
    print; quit;
  }

  [0-9] {
    while [0-9]; put; clear; add "number*"; push; .reparse
  }

  [:alpha:] {
    while [:alpha:];
    # make keywords case insensitive: the tape keeps the original
    # spelling, the workspace holds the lower-case version
    put; lower;
    # synonym with !
    "write","writeln" {
      clear; add "puts"; put; clear; add "!*"; push; .reparse
    }
    # keywords in pl/0, translate to ruby
    "const","var","if","then","while","do","begin","end",
    "proc","procedure","call","odd" {
      put;
      # this can be simplified
      # fixed: the translation was cleared before being stored in the
      # tape cell (order of 'put' and 'clear' was swapped)
      "odd" { clear; add ".odd?"; put; clear; add "odd"; }
      # these keywords have no ruby text of their own: empty attribute
      "call" { clear; put; add "call"; }
      "begin" { clear; put; add "begin"; }
      "end" { clear; put; add "end"; }
      "var" { clear; put; add "var"; }
      "const" { clear; put; add "const"; }
      "procedure","proc" { clear; add "def"; put; clear; add "proc"; }
      add "*"; push; .reparse
    }
    # all vars are global in pl/0
    # restore the original (case-preserved) name as the attribute
    clear; get; put; clear; add "ident*"; push; .reparse
  }

  !"" {
    # error unrecognised character
    put; clear; lines; add ":"; chars;
    add " [pl/0 error] incorrect character '"; get; add "'\n";
    print; quit;
  }

# ------------------------------------------------------------
# PARSING PHASE
# Tokens are popped off the stack into the workspace, matched
# against grammar rules and replaced by the reduced token. The
# tape cells hold the ruby text built for each token.
# ------------------------------------------------------------
parse>

  # To visualise token reduction uncomment this below:
  # lines; add ":"; chars; add " "; print; clear;
  # add "\n"; unstack; print; clip; stack;

  # we will do these 5 token reductions first, so that
  # "statement*end*" does not have precedence
  pop; pop; pop; pop; pop;

  # sometimes statements are terminated by expressions, which
  # in turn must be terminated by something else, so there is a
  # trailing token there
  B"if*condition*then*statement*".!"if*condition*then*statement*" {
    replace "if*condition*then*statement*" "statement*";
    push; push;
    # indent the body of the statement
    clear; ++; add " "; get; replace "\n" "\n "; put; --; --; --;
    clear; get; add " "; ++; get; ++; add " "; get; add "\n";
    ++; get; add "\nend"; --; --; --; put; clear;
    # transfer 'invisible' token value
    ++; ++; ++; ++; get; --; --; --; put; --; clear; ++; ++;
    .reparse
  }

  B"while*condition*do*statement*".!"while*condition*do*statement*" {
    replace "while*condition*do*statement*" "statement*";
    push; push;
    # indent the body of the statement
    clear; ++; add " "; get; replace "\n" "\n "; put; --; --; --;
    clear; get; add " "; ++; get; ++; add " "; get; add "\n";
    ++; get; add "\nend"; --; --; --; put; clear;
    # transfer invisible token value, and realign tape pointer
    ++; ++; ++; ++; get; --; --; --; put; --; clear; ++; ++;
    .reparse
  }

  push; push; push; push; push;

  #-----------------
  # 1 token
  pop;

  # errors
  (eof).!".*" {
    clear; lines; add ":"; chars;
    add " [pl/0 error] Missing '.' at end of program ? \n";
    add " \n";
    print; quit;
  }

  #-----------------
  # 2 tokens
  pop;

  # Some errors
  "ident*ident*" {
    clear; lines; add ":"; chars;
    add " [pl/0 error] 2 variable names in a row (is there a \n";
    add " missing operator?) \n";
    print; quit;
  }

  "vardec*const*" {
    clear; lines; add ":"; chars;
    add " [pl/0 error] Constant declarations must precede variable \n";
    add " declarations in pl/0 \n";
    print; quit;
  }

  "condec*const*" {
    clear; lines; add ":"; chars;
    add " [pl/0 error] only 1 constant declaration block is allowed \n";
    add " in each scope \n";
    print; quit;
  }

  "vardec*var*" {
    clear; lines; add ":"; chars;
    add " [pl/0 error] only 1 variable declaration block is allowed \n";
    add " in each scope \n";
    print; quit;
  }

  ";*end*" {
    clear; lines; add ":"; chars;
    add " [pl/0 error] Last statement in block does not require \n";
    add " a semi-colon ';' \n";
    print; quit;
  }

  #*
  B"const*".!"const*".!E"ident*" {
    clear; lines; add ":"; chars;
    add " [pl/0 error] The 'const' keyword must be followed by \n";
    add " a variable name. \n";
    print; quit;
  }
  *#

  # add an empty procset* (set of procedures) if there is
  # not one already. This simplifies program reduction later
  # it doesnt matter if there is 1 or 2 tokens here.
  E"proc*".!B"procset*" {
    !"proc*" {
      # need to conserve the preceding token
      push;
      clear; get; ++; put; --; clear; put;
      add "procset*proc*"; push; push; .reparse
    }
    # only 1 token
    clear; get; ++; put; --; clear; put;
    add "procset*proc*"; push; push; .reparse
  }

  # append a completed procedure to the set of procedures
  "procset*proceed*" {
    clear; get; add "\n"; ++; get; --; put;
    clear; add "procset*"; push; .reparse
  }

  # "=" can be used for constant assignment but also as
  # a comparison operator
  "=*ident*","=*exp*" {
    # make ruby assigment '=' into comparison '=='
    swap; add "="; swap;
    replace "=*" "compare*"; push; push; .reparse
  }

  "exp*=*" {
    ++; swap; add "="; swap; --;
    replace "=*" "compare*"; push; push; .reparse
  }

  # expression transmog
  "opmul*ident*","opadd*ident*","opmul*number*","opadd*number*",
  "compare*ident*","compare*number*",":=*ident*",":=*number*",
  "if*ident*","if*number*","while*ident*","while*number*",
  "(*ident*","(*number*","!*ident*","!*number*" {
    push; clear; add "exp*"; push; .reparse
  }

  # expression transmutation
  "ident*opmul*","ident*opadd*","number*opmul*","number*opadd*",
  "ident*compare*","number*compare*" {
    replace "ident*" "exp*"; replace "number*" "exp*";
    push; push; .reparse
  }

  "const*conlist*" {
    # change!
    clear; get; add " "; ++; get; --; put;
    clear; add "condec*"; push; .reparse
  }

  # non-tail reduction for variables
  "var*ident*" {
    clear; get;
    # global vars use $ in ruby
    add ""; ++; get; add "=0"; --; put;
    clear; add "varlist*"; push; .reparse
  }

  # variable decs
  "varlist*;*" {
    clear; get; ++; get; --; put;
    clear; add "vardec*"; push; .reparse
  }

  "block*.*" {
    clear; add "program*"; push; .reparse
  }

  "call*ident*" {
    clear; ++; get; add "()"; --; put;
    clear; add "statement*"; push; .reparse
  }

  # read statement: "? x" becomes "x = gets.to_i"
  # (previously emitted "read x", which is not valid ruby)
  "?*ident*" {
    clear; ++; get; add " = "; --; get; put;
    clear; add "statement*"; push; .reparse
  }

  # a multi statement block, between begin/end
  "begin*statementset*" {
    clear; get;
    # no extra newline is required here.
    ++; get; --;
    # dont need to indent
    # replace "\n" "\n ";
    put;
    clear; add "statement*"; push; .reparse
  }

  # tail reduction for statementsets
  # the statement's text (already in the first cell) becomes the
  # text of the statementset
  "statement*end*" {
    clear;
    # get; ++; get; --; put;
    clear; add "statementset*"; push; .reparse
  }

  #-----------------
  # 3 tokens
  pop;

  # ! expression must be parsed while looked at the
  # trailing token, like all expression parsing
  B"!*exp*".!"!*exp*".!E"opmul*".!E"opadd*" {
    # need to conserve the "invisible" last token
    replace "!*exp*" "statement*"; push; push;
    # also transfer attributes
    --; --; get; add " "; ++; get; --; put; clear;
    ++; ++; get; --; put; ++; clear;
    .reparse
  }

  # procedure headers (name etc)
  "proc*ident*;*" {
    clear; get; add " "; ++; get; add "()"; --; put;
    clear; add "prochead*"; push; .reparse
  }

  # a complete procedure with no local declarations
  "prochead*statement*;*" {
    # indent the statement or statements
    clear; ++; add " "; get; replace "\n" "\n "; put; --;
    clear; get; add "\n"; ++; get; add "\nend"; --; put;
    clear; add "proceed*"; push; .reparse
  }

  # expressions, this could be the trickiest aspect of
  # the grammar. transmog ident/number to exp
  "exp*opmul*exp*", "exp*opadd*exp*" {
    clear; get; ++; get; ++; get; --; --; put;
    clear; add "exp*"; push; .reparse
  }

  "(*exp*)*" {
    clear; get; ++; get; ++; get; --; --; put;
    clear; add "exp*"; push; .reparse
  }

  "statement*;*statementset*" {
    clear; get; add "\n"; ++; ++; get; --; --; put;
    clear; add "statementset*"; push; .reparse
  }

  # variable decs
  "varlist*,*ident*" {
    clear; get; ++;
    # compose global variables for ruby
    add "; "; ++; get; add "=0"; --; --; put; clear;
    #get; add " "; ++; get; --; --; put; clear;
    add "varlist*"; push; .reparse
  }

  #-----------------
  # 4 tokens
  pop;

  # 4 token errors
  B"ident*=*number*".!E";*".!E",*" {
    clear; lines; add ":"; chars;
    add " [pl/0 error] incorrect assignment operator '=' \n";
    add " did you mean ':=' ? \n";
    print; quit;
  }

  # procedure headers (name etc). Need to indent the variable decs etc
  "prochead*vardec*statement*;*","prochead*condec*statement*;*" {
    # indent the variable/constant declaration
    clear; add " "; ++; get; replace "\n" "\n "; put; --;
    # also indent statments;
    clear; add " "; ++; ++; get; replace "\n" "\n "; put; --; --;
    clear; get; add "\n"; ++; get; add "\n"; ++; get; add "\nend";
    --; --; put;
    clear; add "proceed*"; push; .reparse
  }

  # ident and number have already been transmog'ed into exp*
  # and =* into compare*
  B"exp*compare*exp*".!"exp*compare*exp*".!E"opmul*".!E"opadd*" {
    # need to conserve the "invisible" last token
    replace "exp*compare*exp*" "condition*"; push; push;
    --; --; get; add " "; ++; get; add " "; ++; get; --; --; put; clear;
    # transfer trailing token value
    ++; ++; ++; get; --; --; put; ++; clear;
    .reparse
  }

  # also see the 5 token reduction, because of the trailing token
  # required by exp*
  "if*condition*then*statement*", "while*condition*do*statement*" {
    # indent statements for readability
    clear; ++; ++; ++; add " "; get; replace "\n" "\n "; put; --; --; --;
    clear; get; add " "; ++; get; add " "; ++; get; add "\n";
    ++; get; add "\nend"; --; --; --; put;
    clear; add "statement*"; push; .reparse
  }

  # lookahead for expressions in statements
  # the problem is: x:=4*3+1 will be parsed as
  # statement*+1 if no lookahead.
  # If the expression
  # is not followed by a */+- then it is a complete statement
  # and can be reduced. lets transmog ident and number to make simpler
  B"ident*:=*exp*".!"ident*:=*exp*".!E"opmul*".!E"opadd*" {
    # need to conserve the "invisible" last token
    replace "ident*:=*exp*" "statement*"; push; push;
    --; --; get; add " "; ++; get; add " "; ++; get; --; --; put; clear;
    # transfer trailing token value to its new cell and realign the
    # tape pointer (this step was missing here, unlike the parallel
    # "!*exp*" and "exp*compare*exp*" rules)
    ++; ++; ++; get; --; --; put; ++; clear;
    .reparse
  }

  # tail reduction for constant decs
  "ident*=*number*;*" {
    clear; get; add " "; ++; get; add " "; ++; get; ++; get;
    --; --; --; put;
    clear; add "conlist*"; push; .reparse
  }

  # "ident*;*" { clear; get; ++; get; --; put; clear; add "varlist*"; push; .reparse }

  #-----------------
  # 5 tokens
  pop;

  # procedure headers (name etc), need to indent condec vardec
  "prochead*condec*vardec*statement*;*" {
    # indent the variable and constant declarations
    clear; add " "; ++; get; replace "\n" "\n "; put;
    clear; add " "; ++; get; replace "\n" "\n "; put;
    clear; add " "; ++; get; replace "\n" "\n "; put; --; --; --;
    clear; get; add "\n"; ++; get; add "\n"; ++; get; add "\n";
    ++; get; add "\nend"; --; --; --; put;
    clear; add "proceed*"; push; .reparse
  }

  # constant declarations, tail reduction, but tail redux not
  # necessary
  "ident*=*number*,*conlist*" {
    clear; get; add " "; ++; get; add " "; ++; get; ++; get; add " ";
    ++; get; --; --; --; --; put;
    clear; add "conlist*"; push; .reparse
  }

  # program reduction
  # (no push here: the "program*" token stays in the workspace for
  # the final check below)
  (eof) {
    "statement*.*" {
      clear; add "\n# main program \n"; get; put;
      clear; add "program*";
    }
    "vardec*statement*.*",
    "condec*statement*.*",
    "procset*statement*.*" {
      clear; get; add "\n# main program \n"; ++; get; --; put;
      clear; add "program*";
    }
    "condec*vardec*statement*.*",
    "vardec*procset*statement*.*",
    "condec*procset*statement*.*" {
      clear; get; add "\n"; ++; get; add "\n# main program \n";
      ++; get; --; --; put;
      clear; add "program*";
    }
    "condec*vardec*procset*statement*.*" {
      clear; get; add "\n"; ++; get; add "\n"; ++; get;
      add "\n# main program \n"; ++; get; --; --; --; put;
      clear; add "program*";
    }
  }

  # end of input: either print the translated program or report
  # the tokens that could not be reduced
  (eof) {
    unstack;
    "program*" {
      clear;
      add "#!/usr/bin/ruby\n";
      add "# ok pl/0 \n";
      add "# code checked and transpiled by eg/plzero.ruby.pss \n";
      get; add "\n"; print; quit;
    }
    put; clear;
    add "Pl/0 program did not parse well \n";
    add "The final parse tokens were: "; get; add "\n";
    print; quit;
  }

  push; push; push; push; push;