#*
 Compile.quoteset.pss

 This is a parse-script which compiles parse-scripts (!). This is an old
 version of compile.pss which is just saved for the sake of posterity
 (24 august 2019)

 Structure of this first part of the script:

   LEXING   one character is read, and depending on that character a
            whole lexical unit is read into the workspace. The text of
            the unit is saved in the current tape cell (put) and a
            token name ending in '*' is pushed onto the stack.
   PARSING  starts at the parse label. Tokens are popped off the
            stack, and recognised token sequences are replaced by a
            single token while the matching tape cells are merged.

 Every lexing rule ends with .reparse so that the parsing phase (and the
 end-of-file clause at the very end of the script) is always reached.
*#

read;

#--------------
# whitespace between lexical units carries no meaning and is dropped
[:space:] { clear; .reparse }

#---------------
# We can elide all these single character tests, because
# the stack token is just the character itself with a *
"{", "}", ";", ",", "!", "B", "E" {
  put; add "*"; push; .reparse
}

#---------------
# format: "text"
# the attribute saved on the tape keeps its double quotes
"\"" {
  until "\""; put;
  clear; add "quote*"; push;
  .reparse
}

#---------------
# format: 'text', single quotes are converted to double quotes
# but we must escape embedded double quotes.
"'" {
  clear; until "'"; clip; escape '"'; put;
  clear; add "\""; get; add "\""; put;
  clear; add "quote*"; push;
  .reparse
}

#---------------
# formats: [:space:] [a-z] [abcd] [:alpha:] etc
"[" {
  until "]"; put;
  clear; add "class*"; push;
  .reparse
}

#---------------
# formats: (eof) (==) etc. I may change this syntax to just
# EOF and ==
# the brackets are stripped, so the attribute is just: eof or ==
"(" {
  clear; until ")"; clip; put;
  clear; add "state*"; push;
  .reparse
}

#---------------
# multiline and single line comments, eg #... and #* ... *#
"#" {
  clear; read;
  # An empty comment (the hash is directly followed by a newline) is
  # dropped. Calling .restart here was a bug, because the (eof) clause
  # would never be called and the script never written or printed
  # when the comment is the last thing in the input. So .reparse
  # is used instead: with an empty workspace the parse phase changes
  # nothing on the stack and then falls through to the (eof) test.
  "\n" { clear; .reparse }
  # checking for multiline comments of the form "#* \n\n\n *#"
  # these are just ignored at the moment (deleted)
  "*" {
    until "*#";
    E"*#" {
      # calling .restart is usually a bad idea (see above),
      # it is safer to call .reparse
      clear; .reparse
    }
    # make an unterminated multiline comment an error
    # to ease debugging of scripts.
    clear;
    add "unterminated multiline comment #* ... *# \n";
    print; clear;
    quit;
  }
  # single line comments. some will get lost.
  # The character already read is saved, and the comment text is
  # rebuilt (with its leading hash, without the final newline) and
  # stored as the attribute of a comment* token.
  put; clear; add "#"; get; until "\n"; clip; put;
  clear; add "comment*"; push;
  #clear; .restart
  clear; .reparse
}

#----------------------------------
# parse command words (and abbreviations)
# legal characters for keywords (commands)
# 'D' and 'Q' have to be in this list, otherwise the one letter
# abbreviations "D" (replace) and "Q" (bail) below can never be reached
# because they are rejected here as misplaced characters.
![abcdefghijklmnopqrstuvwxyzBEKGPRWSDQ+-<>0^.] {
  # error message about a misplaced character
  put; clear;
  add "!! Misplaced character '"; get;
  add "' in script near line "; ll;
  add " (character "; cc; add ") \n";
  print; clear;
  bail;
}

#while [:alpha:]
# my testclass implementation cannot handle complex lists
# eg [a-z+-] this is why I have to write out the whole alphabet
# (this list must stay the same as the list in the test just above)
while [abcdefghijklmnopqrstuvwxyzBEKGPRWSDQ+-<>0^.];

#----------------------------------
# KEYWORDS
# here we can test for all the keywords (command words) and their
# abbreviated one letter versions (eg: clip k, clop K etc). Then
# we can print an error message and abort if the word is not a
# legal keyword for the parse-edit language
# Each abbreviation is simply expanded to the full command word.
"a" { clear; add "add"; }
"k" { clear; add "clip"; }
"K" { clear; add "clop"; }
"D" { clear; add "replace"; }
"d" { clear; add "clear"; }
"t" { clear; add "print"; }
"p" { clear; add "pop"; }
"P" { clear; add "push"; }
"G" { clear; add "put"; }
"g" { clear; add "get"; }
"x" { clear; add "swap"; }
">" { clear; add "++"; }
"<" { clear; add "--"; }
"r" { clear; add "read"; }
"R" { clear; add "until"; }
"w" { clear; add "while"; }
"W" { clear; add "whilenot"; }

# we can probably omit tests and jumps since they are not
# designed to be used in scripts (only assembled parse programs).
#*
"b" { clear; add "jump"; }
"j" { clear; add "jumptrue"; }
"J" { clear; add "jumpfalse"; }
"=" { clear; add "testis"; }
"?" { clear; add "testclass"; }
"b" { clear; add "testbegins"; }
"B" { clear; add "testends"; }
"E" { clear; add "testeof"; }
"*" { clear; add "testtape"; }
*#

"n" { clear; add "count"; }
"+" { clear; add "a+"; }
"-" { clear; add "a-"; }
"0" { clear; add "zero"; }
"c" { clear; add "cc"; }
"l" { clear; add "ll"; }
"^" { clear; add "escape"; }
"v" { clear; add "unescape"; }
"S" { clear; add "state"; }
"q" { clear; add "quit"; }
"Q" { clear; add "bail"; }
"s" { clear; add "write"; }
"o" { clear; add "nop"; }

# any complete command word becomes a word* token, with the word
# itself as the attribute on the tape
"add","clip","clop","replace","clear","print",
"pop","push","put","get","swap","++","--","read",
"until","while","whilenot",
"jump","jumptrue","jumpfalse",
"testis","testclass","testbegins","testends",
"testeof","testtape",
"count","a+","a-","zero","cc","ll",
"escape","unescape","state","quit","bail",
"write","nop" {
  put; clear; add "word*"; push; .reparse
}

#------------
# the .restart command just jumps to the start: label
# (which is usually followed by a "read" command)
# The label is used, not instruction number 0, because the (eof)
# clause of this script writes any begin-block before "start:" and
# a begin-block must only be executed once.
".restart" {
  clear; add "jump start"; put;
  clear; add "command*"; push;
  .reparse
}

#------------
# the .reparse command and "parse label" is a simple way to
# make sure that all shift-reductions occur. It should be used inside
# a block test, so as not to create an infinite loop.
".reparse" {
  clear; add "jump parse"; put;
  clear; add "command*"; push;
  .reparse
}

"parse>" {
  clear; add "parse:"; put;
  clear; add "command*"; push;
  .reparse
}

# --------------------
# try to implement begin-blocks, which are only executed
# once, at the beginning of the script (similar to awk's BEGIN {} rules)
"begin" { put; add "*"; push; .reparse }

# anything which gets to here was made of legal characters but is
# not a known command word
add " << unknown command on line "; ll;
add " (char "; cc; add ")";
add " of source file. \n";
print; clear;
quit;

# ----------------------------------
# PARSING PHASE:
# the lexing phase finishes here, and below is the
# parse/compile phase of the script. Here we pop tokens
# off the stack and check for sequences of tokens eg word*semicolon*
# If we find a valid series of tokens, we "shift-reduce" or "resolve"
# the token series eg word*semicolon* --> command*
#
# At the same time, we manipulate (transform) the attributes on the
# tape, as required. So Tape=|pop|;| becomes |\npop| where the
# bars | indicate tape cells. (2 tapes cells are merged into 1).
#
# Each time the stack is reduced, the tape must also be reduced
#
parse>

#-------------------------------------
# 2 tokens
#-------------------------------------
pop; pop;

# All of the below are currently errors, but may not
# be in the future if we expand the syntax of the parse
# language. Also consider:
# begintext* endtext* quoteset* notclass*, !* ,* ;* B* E*
# It is nice to trap the errors here because we can emit some
# hopefully not-very-cryptic error messages with a line number.
# Otherwise the script writer has to debug with
# pp -a asm.pp scriptfile -I
#
# In each error rule the two tokens are pushed back first, so that
# the workspace is empty when the message is built.
"word*word*", "word*}*", "word*begintext*", "word*endtext*",
"word*!*", "word*,*", "quote*word*", "quote*class*",
"quote*state*", "quote*}*", "quote*begintext*", "quote*endtext*",
"class*word*", "class*quote*", "class*class*", "class*state*",
"class*}*", "class*begintext*", "class*endtext*", "class*!*",
"class*,*", "notclass*word*", "notclass*quote*", "notclass*class*",
"notclass*state*", "notclass*}*" {
  push; push;
  add "error near line "; ll;
  add " (char "; cc; add ")";
  add " of script (missing semicolon?) \n";
  print; clear;
  quit;
}

"{*;*", ";*;*", "}*;*" {
  push; push;
  add "error near line "; ll;
  add " (char "; cc; add ")";
  add " of script: misplaced semi-colon? ; \n";
  print; clear;
  quit;
}

",*{*" {
  push; push;
  add "error near line "; ll;
  add " (char "; cc; add ")";
  add " of script: extra comma in list? \n";
  print; clear;
  quit;
}

"!*{*" {
  push; push;
  add "error near line "; ll;
  add " (char "; cc; add ")";
  add " of script: misplaced negation operator (!)? \n";
  print; clear;
  quit;
}

"!*command*" {
  push; push;
  add "error near line "; ll;
  add " (at char "; cc; add ") \n";
  add " The negation operator (!) is not valid in this context. \n";
  print; clear;
  quit;
}

"!*begintext*", "!*endtext*" {
  push; push;
  add "error near line "; ll;
  add " (at char "; cc; add ") \n";
  add " The negation operator (!) has not been implemented \n";
  add " for 'begin-tests' and 'end-tests' (but maybe in the future) \n";
  print; clear;
  quit;
}

";*{*", "command*{*", "commandset*{*" {
  push; push;
  add "error near line "; ll;
  add " (char "; cc; add ")";
  add " of script: no test for brace block? \n";
  print; clear;
  quit;
}

"{*}*" {
  push; push;
  add "error near line "; ll;
  add " of script: empty braces {}. \n";
  print; clear;
  quit;
}

#-----------------------------------------
# compiling comments so as to transfer them to the compiled
# file. Improve this by just forming rules for
# command*comment* and comment*command* and comment*comment*
# implement these rules to conserve comments
# (these three rules do nothing yet, the two rules after them do
# the actual work)
"comment*command*" { nop; }
"command*comment*" { nop; }
"comment*comment*" { nop; }

# a comment after some other token: the comment text is appended
# (on a new line) to the attribute of that token and the comment*
# token is removed from the stack
E"comment*" {
  # just leave the other token on the stack, whatever it is
  push;
  # avoid an infinite loop if only one token on the stack
  "" { .restart }
  clear;
  --; get; ++; add "\n"; get; --; put; ++;
  clear;
  .reparse
}

#-----------------------------------------
# There is a problem, that attaching comments to { or } or
# other trivial tokens makes them disappear because we
# dont retrieve the attribute for those tokens.
# a comment before some other token: the comment text is put in front
# of the attribute of that token
B"comment*" {
  # just leave the other token on the stack, whatever it is
  push;
  # avoid an infinite loop if only one token on the stack
  "" { .restart }
  # some tricky juggling of the unknown token
  # get rid of comment* token but conserve other one.
  ++; put; pop; clear; ++; get; push;
  clear;
  --; --; --; get; ++; add "\n"; get; --; put; ++;
  clear;
  .reparse
}

# -----------------------
# negated tokens.
#
# This is a new more elegant way to negate a whole set of
# tests (tokens) where the negation logic is stored on the
# stack, not in the current tape cell.

#-----------------------------------------
# format: ![:alpha:] ![a-z] ![abcd] !"abc" !B"abc" !E"xyz"
# This format is used to indicate a negative class test for
# a brace block. eg: ![aeiou] { add "< not a vowel"; print; clear; }
# The 2 tokens are replaced by one token whose name is "not" followed
# by the name of the second token. The attribute of the second token
# becomes the attribute of the new token.
"!*quote*", "!*class*", "!*begintext*", "!*endtext*" {
  # save the second token (quote/class/begintext) on the
  # tape (but not the current tape cell which has other stuff in
  # it
  push;
  ++; put; --;
  clear;
  # throw away the negation token
  pop; clear;
  add "not";
  # extract the saved token name from the tape
  ++; ++; get; --; --;
  # now we have a token "notquote*" / "notclass*" / "notbegintext*
  push;
  # move the attribute (the text or class) one cell to the left
  get; --; put; ++;
  clear;
  .reparse
}

#*
"!*class*" {
  clear; add "notclass*"; push;
  get; --; put; ++; clear;
  .reparse
}
*#

#-----------------------------------------
# format: !"text"
# This format is used to indicate a negative workspace text test
# eg: !"" { add "not empty!!"; } print; clear;
#*
"!*quote*" {
  clear; add "notquote*"; push;
  get; --; put; ++; clear;
  .reparse
}
*#

#-----------------------------------------
# format: E"text" or E'text'
# This format is used to indicate a "workspace-ends-with" text before
# a brace block.
# The quoted text is moved into the tape cell of the new token.
"E*quote*" {
  clear; add "endtext*"; push;
  get; --; put; ++;
  clear;
  .reparse
}

#-----------------------------------------
# format: B"sometext" or B'sometext'
# A 'B' preceding some quoted text is used to indicate a
# 'workspace-begins-with' test, before a brace block.
# The quoted text is moved into the tape cell of the new begintext*
# token.
"B*quote*" {
  clear; add "begintext*"; push;
  get; --; put; ++;
  clear;
  .reparse
}

#--------------------------------------------
# ebnf: command := word, ';' ;
# formats: "pop; push; clear; print; " etc
# all commands need to end with a semi-colon except for
# .reparse and .restart
#
"word*;*" {
  clear;
  # check if command requires parameter
  get;
  "add", "until", "while", "whilenot", "escape", "unescape", "replace" {
    add " < command needs an argument, on line: "; ll;
    print; clear;
    quit;
  }
  clear; add "command*";
  # no need to format tape cells because current cell contains word
  push;
  .reparse
}

#-----------------------------------------
# ebnf: commandset := command, command ;
"command*command*", "commandset*command*" {
  clear; add "commandset*"; push;
  # format the tape attributes. Add the next command on a newline
  --; get; add "\n"; ++; get; --; put; ++;
  clear;
  .reparse
}

#-------------------
# 3 tokens
#-------------------
pop;

#--------------------------------------------
# ebnf: command := keyword , quoted-text , ";" ;
# format: add "text";
"word*quote*;*" {
  clear; get;
  "replace" {
    # error
    add "< command requires 2 parameters, not 1 \n";
    add "near line "; ll; add " of script. \n";
    print; clear;
    quit;
  }
  "add", "until", "while", "whilenot", "escape", "unescape" {
    clear; add "command*"; push;
    # a command plus argument, eg add "this"
    --; get; add " "; ++; get; --; put; ++;
    clear;
    .reparse
  }
  # error, superfluous argument
  add ": command does not take an argument \n";
  add "near line "; ll; add " of script. \n";
  print; clear;
  #state
  quit;
}

#----------------------------------
# format: "while [:alpha:] ;" or whilenot [a-z]
"word*class*;*" {
  clear; get;
  "while", "whilenot" {
    clear; add "command*"; push;
    # a command plus argument, eg while [a-z]
    --; get; add " "; ++; get; --; put; ++;
    clear;
    .reparse
  }
  # error
  add " < command cannot have a class argument \n";
  add "line "; ll; add ": error in script \n";
  print; clear;
  quit;
}

# -------------------------------
# 4 tokens
# -------------------------------
pop;

#-------------------------------------
# ebnf: command := replace , quote , quote , ";" ;
# example: replace "and" "AND" ;
"word*quote*quote*;*" {
  clear; get;
  "replace" {
    clear; add "command*"; push;
    #---------------------------
    # a command plus 2 arguments, eg replace "this" "that"
    --; get; add " "; ++; get; add " "; ++; get; --; --; put; ++;
    clear;
    .reparse
  }
  # error: only replace takes 2 arguments.
  # The message has to be printed before quitting (like all the other
  # error messages in this script), otherwise it is never seen.
  add " << command does not take 2 quoted arguments. \n";
  add " on line "; ll; add " of script.\n";
  print; clear;
  quit;
}

#-------------------------------------
# format: begin { #* commands *# }
# implementing begin blocks which are only executed once (they
# will be put before the "start:" label. They must come before
# all other commands.
# The compiled commands (third tape cell) become the attribute of
# the beginblock* token.
"begin*{*commandset*}*", "begin*{*command*}*" {
  clear;
  ++; ++; get; --; --; put;
  clear; add "beginblock*"; push;
  .reparse
}

#*
 it would be better to implement this as
   quoteset '{' := quote '{' .
   quoteset '{' := notquote '{' .
   quoteset '{' := begintext '{' .
   quoteset '{' := endtext '{' .
   quoteset '{' := class '{' .
   quoteset '{' := notclass '{' .
 This makes it easier to add other types of tests to quotesets.
*#

# This will be changed, see notes above.
# A new approach to quotesets, which eliminates the rabbit
# hops for true tests.
# an interesting "look-ahead" technique for compiling quotesets
# uses the accumulator to keep track of the jumptrue target
# This eliminates the need for "rabbit-hops" for the jumptrue code.
# The first 2 tests of a list start a quoteset. The accumulator is
# set to zero and then incremented 5 times.
"quote*,*quote*{*" {
  # set accumulator == 0
  zero;
  clear;
  add "testis "; get; add "\n";
  add "jumptrue 3 \n";
  ++; ++;
  add "testis "; get; add "\n";
  # dont add jumpfalse here
  # add "jumpfalse ";
  # the final jumpfalse + target will be added when
  # "quoteset*{*commandset*}*" is parsed
  --; --;
  put;
  a+; a+; a+; a+; a+;
  clear; add "quoteset*{*"; push; push;
  .reparse
}

# One more test in front of an existing quoteset. The current
# accumulator value is written as the jumptrue target, and then
# the accumulator is incremented 3 times.
"quote*,*quoteset*{*" {
  clear;
  add "testis "; get; add "\n";
  add "jumpfalse 2 \n";
  add "jumptrue "; count; add "\n";
  ++; ++; get; --; --;
  put;
  a+; a+; a+;
  clear; add "quoteset*{*"; push; push;
  .reparse
}

# but actually!!
# we should change the way quotesets are parsed, see above
# should be easy enough to include, begintests and endtest
# I think the quoteset code doesnt have to change at all.

# the same 2 rules, for lists of 'begins-with' tests
"begintext*,*begintext*{*" {
  # set accumulator == 0
  zero;
  clear;
  add "testbegins "; get; add "\n";
  add "jumptrue 3 \n";
  ++; ++;
  add "testbegins "; get; add "\n";
  # the final jumpfalse + target will be added when
  # "quoteset*{*commandset*}*" is parsed
  --; --;
  put;
  a+; a+; a+; a+; a+;
  clear; add "quoteset*{*"; push; push;
  .reparse
}

"begintext*,*quoteset*{*" {
  clear;
  add "testbegins "; get; add "\n";
  add "jumpfalse 2 \n";
  add "jumptrue "; count; add "\n";
  ++; ++; get; --; --;
  put;
  a+; a+; a+;
  clear; add "quoteset*{*"; push; push;
  .reparse
}

# and the same 2 rules again, for lists of 'ends-with' tests
"endtext*,*endtext*{*" {
  # set accumulator == 0
  zero;
  clear;
  add "testends "; get; add "\n";
  add "jumptrue 3 \n";
  ++; ++;
  add "testends "; get; add "\n";
  # the final jumpfalse + target will be added when
  # "quoteset*{*commandset*}*" is parsed
  --; --;
  put;
  a+; a+; a+; a+; a+;
  clear; add "quoteset*{*"; push; push;
  .reparse
}

"endtext*,*quoteset*{*" {
  clear;
  add "testends "; get; add "\n";
  add "jumpfalse 2 \n";
  add "jumptrue "; count; add "\n";
  ++; ++; get; --; --;
  put;
  a+; a+; a+;
  clear; add "quoteset*{*"; push; push;
  .reparse
}

#-------------------------------------
#*
 These rules will become unnecessary if the quoteset implementation
 above is written. Because all quotes/classes/begintexts etc will
 become quotesets. Because of nifty backwards lookahead compilation...!

 ebnf: command := quote , "{" , commandset , "}" ;
       command := quote , "{" , command "}" ;
 examples:
   "text" { add ":"; print; clear; }
   "text" { print; }
*#
# All the brace-block rules below work in the same way: the compiled
# commands in the third tape cell are indented, then the test, a
# conditional jump to a label, the commands and the label itself are
# joined together in the first tape cell. The label ends with the
# value given by 'cc'.
"quote*{*commandset*}*", "quote*{*command*}*" {
  # indent the assembled code for readability
  clear;
  ++; ++;
  add " "; get; replace "\n" "\n "; put;
  --; --;
  clear;
  add "testis "; get;
  add "\njumpfalse not.text."; cc; add "\n";
  ++; ++; get;
  add "\nnot.text."; cc; add ":";
  --; --;
  put;
  clear; add "command*"; push;
  # always reparse/compile
  .reparse
}

#-------------------------------------
# ebnf:
# command := quoteset , "{" , commandset , "}" ;
# command := quoteset , "{" , command , "}" ;
# examples:
# "text","more","and" { add ":"; print; clear; }
# "text","more","and" { print; }
"quoteset*{*commandset*}*", "quoteset*{*command*}*" {
  clear;
  # indent for readability
  ++; ++;
  add " "; get; replace "\n" "\n "; put;
  --; --;
  # quotesets compile differently to quote blocks
  # (the tests were already assembled by the quoteset rules)
  clear;
  get;
  add "\njumpfalse not.quoteset.text."; cc; add "\n";
  ++; ++; get;
  add "\nnot.quoteset.text."; cc; add ":";
  --; --;
  put;
  clear; add "command*"; push;
  # always reparse/compile
  .reparse
}

#-------------------------------------
# ebnf:
# command := begintext , "{" , commandset , "}" ;
# command := begintext , "{" , command , "}" ;
# examples:
# B"text" { add ":"; print; clear; }
# B"text" { print; }
"begintext*{*commandset*}*", "begintext*{*command*}*" {
  clear;
  # indent for readability
  ++; ++;
  add " "; get; replace "\n" "\n "; put;
  --; --;
  clear;
  add "testbegins "; get;
  add "\njumpfalse not.begins."; cc; add "\n";
  ++; ++; get;
  add "\nnot.begins."; cc; add ":";
  --; --;
  put;
  clear; add "command*"; push;
  .reparse
}

#-------------------------------------
# bnf: command <- endtext { commandset }
# bnf: command <- endtext { command }
# format: E"text" { add ":"; print; clear; }
# format: E"text" { print; }
"endtext*{*commandset*}*", "endtext*{*command*}*" {
  clear;
  # indent for readability
  ++; ++;
  add " "; get; replace "\n" "\n "; put;
  --; --;
  clear;
  add "testends "; get;
  add "\njumpfalse not.ends."; cc; add "\n";
  ++; ++; get;
  add "\nnot.ends."; cc; add ":";
  --; --;
  put;
  clear; add "command*"; push;
  .reparse
}

#-------------------------------------
# formats: [:alpha:] { add ":"; print; clear; }
# [a-z] { print; }
# this syntax pattern executes a brace-block of commands if
# the single letter in the workspace is in a particular
# character class, list or range.
"class*{*commandset*}*", "class*{*command*}*" {
  clear;
  # indent for readability
  ++; ++;
  add " "; get; replace "\n" "\n "; put;
  --; --;
  clear;
  add "testclass "; get;
  add "\njumpfalse not.in.class."; cc; add "\n";
  ++; ++; get;
  add "\nnot.in.class."; cc; add ":";
  --; --;
  put;
  clear; add "command*"; push;
  .reparse
}

#-------------------------------------
# formats: ![:alpha:] { add ":"; print; clear; }
# ![a-z] { print; }
# this syntax pattern executes a brace-block of commands if
# all the letters in the workspace are not in a particular
# character class, list or range.
# The same test is assembled as for a class block, but the jump
# over the block is a jumptrue and not a jumpfalse.
"notclass*{*commandset*}*", "notclass*{*command*}*" {
  clear;
  # indent for readability
  ++; ++;
  add " "; get; replace "\n" "\n "; put;
  --; --;
  clear;
  add "testclass "; get;
  add "\njumptrue is.in.class."; cc; add "\n";
  ++; ++; get;
  add "\nis.in.class."; cc; add ":";
  --; --;
  put;
  clear; add "command*"; push;
  .reparse
}

#-------------------------------------
# formats: !"text" { #*commands*# }
# this syntax pattern executes a brace-block of commands if
# the text in the workspace is not the same as the quoted text.
# The same test is assembled as for a quote block, but the jump
# over the block is a jumptrue and not a jumpfalse.
"notquote*{*commandset*}*", "notquote*{*command*}*" {
  clear;
  # indent for readability
  ++; ++;
  add " "; get; replace "\n" "\n "; put;
  --; --;
  clear;
  add "testis "; get;
  add "\njumptrue is.text."; cc; add "\n";
  ++; ++; get;
  add "\nis.text."; cc; add ":";
  --; --;
  put;
  clear; add "command*"; push;
  .reparse
}

#-------------------------------------
# format: (eof) { add ":"; print; clear; }
# format: (==) { print; }
# The attribute of the state* token is the text which was between
# the brackets, and it decides which test is assembled.
"state*{*commandset*}*", "state*{*command*}*" {
  clear;
  # indent for readability
  ++; ++;
  add " "; get; replace "\n" "\n "; put;
  --; --;
  clear;
  get;
  "eof" {
    clear;
    add "testeof ";
    add "\njumpfalse not.eof."; cc; add "\n";
    ++; ++; get;
    add "\nnot.eof."; cc; add ":";
    --; --;
    put;
    clear; add "command*"; push;
    .reparse
  }
  # ----------
  # compile to tape= test
  "==" {
    clear;
    add "testtape ";
    add "\njumpfalse not.testtape."; cc; add "\n";
    ++; ++; get;
    add "\nnot.testtape."; cc; add ":";
    --; --;
    put;
    clear; add "command*"; push;
    .reparse
  }
  # any other text between the brackets is an error
  add ": unknown state test near line "; ll;
  add " of script.\n";
  add " State tests are \n";
  add " (eof) test if end of stream reached. \n";
  add " (==) test if workspace is same as current tape cell \n";
  print; clear;
  quit;
}

# put the 4 (or less) tokens back on the stack
push; push; push; push;

# When all the input has been read, the assembled program is
# put together and printed (or else an error is reported).
(eof) {
  #add "end of script!! \n"
  print; clear;
  #---------------------
  # check if the script correctly parsed (there should only
  # be one token on the stack, namely "commandset*" or "command*"
  pop; pop;

  "commandset*", "command*" {
    push; --;
    add "# Assembled with the script 'compile.pss' \n";
    add "start:\n";
    get;
    # an extra space because of a bug in compile()
    add "\njump start \n";
    # put a copy of the final compilation into the tapecell
    # so it can be inspected interactively.
    put;
    print;
    # save the compiled script to 'sav.pp'
    write;
    clear;
    quit;
  }

  # a script which starts with a begin-block: the block is written
  # in front of the start: label
  "beginblock*commandset*", "beginblock*command*" {
    clear;
    add "# Assembled with the script 'compile.pss' \n";
    get; add "\n";
    ++;
    add "start:\n";
    get;
    # an extra space because of a bug in compile()
    add "\njump start \n";
    # put a copy of the final compilation into the tapecell
    # so it can be inspected interactively.
    put;
    print;
    # also save the compiled script to 'sav.pp'
    write;
    clear;
    quit;
  }

  # anything else left on the stack means that the script did
  # not parse correctly
  push; push;
  # state
  clear;
  add "After compiling with 'compile.pss' (at EOF): \n ";
  add " parse error in input script, check syntax or try \n ";
  add " 'pp -Ia asm.pp script' to debug compilation \n ";
  print; clear;
  # clear sav.pp because script could not be compiled
  write;
  # bail means exit with error
  bail;
}
# not eof
# there is an implicit .restart command here (jump start)