#*

  translate.cpp.pss

  This is a parse-script which translates parse-scripts into c++ code,
  using the 'pep' tool. The script will create a standalone compilable
  c++ program.

  The virtual machine and engine are implemented in plain c at
  http://bumble.sf.net/books/pars/gh.c. This implements a script language
  with a syntax reminiscent of sed and awk (much simpler than awk, but
  more complex than sed).

  This code was originally created in a straightforward manner by adapting
  the code in 'compile.js.pss', which compiles scripts to javascript.

STATUS

  just begun, adapted from tr.java.pss

NOTES

  We use labelled loops and break/continue to implement the parse> label
  and the .reparse and .restart commands. Breaks are also used to
  implement the quit and bail commands.

TODO

TESTING

  * testing the multiple escaped until bug
  >> pep.jas 'r;until"c";add".";t;d;' 'ab\\cab\cabc'

  ----
    pep -f translate.cpp.pss eg/mark.html.pss > eg/cpp/mark.html.cpp
    g++
    cat pars-book.txt |
  ,,,,

CPP SYNTAX

  * check if a string ends with a suffix
  ------
    if (fullString.length() >= ending.length()) {
      return (0 == fullString.compare (fullString.length() - ending.length(), ending.length(), ending));
    } else { return false; }
  ,,,,

  * another way (using compare is better, but more verbose)
  ----
    if(argument.substr(0, prefix.size()) == prefix) {
      std::string argumentValue = argument.substr(prefix.size());
    }
  ,,,,

  * check if string starts with text
  ----
    std::string s = "tititoto";
    if (s.rfind("titi", 0) == 0) { // pos=0 limits the search to the prefix
      // s starts with prefix
    }
  ,,,

  * read one character from stdin
  ---
    c = std::cin.get();
    str += c;
  ,,,

  >> std::cout << str3 << '\n';

  * bind 'out' to a file or stdout
  ---------
    std::streambuf * buf;
    std::ofstream of;
    if(!condition) { of.open("file.txt"); buf = of.rdbuf(); }
    else { buf = std::cout.rdbuf(); }
    std::ostream out(buf);
  ,,,

  * write string input to file
  ----
    std::ofstream out("output.txt");
    out << input;
    out.close();
  ,,,

GOTCHAS

BUGS

  unescape needs to walk the string.
It's a bit strange to talk about a multicharacter string being "escaped"
  (eg when calling 'until') but this is allowed in the pep engine.

  add "\{"; will generate an "illegal escape character" error when trying
  to compile the generated java code. I need to consider what to do
  in this situation (eg escape \ to \\ ?)

  check "go/mark" code. what happens if the mark is not found??
  throw error and exit I think.

SOLVED BUGS

  found a bug in "replace" code, which was returning from inline code.

  Found and fixed a bug in the (==) code ie in java (stringa == stringb)
  doesnt work.

  found and fixed a bug in java whilenot/while. The code exits if the
  character is not found, which is not correct.

TASKS

SEE ALSO

  At http://bumble.sf.net/books/pars/

  tr/translate.*.pss
    scripts for translating into other languages. eg c,ruby,python,java,tcl
  compile.pss
    compiles a script into an "assembly" format that can be loaded
    and run on the parse-machine with the -a switch. This performs
    the same function as "asm.pp"

HISTORY

  8 august 2021
    A bit more editing
  15 july 2021
    just started to adapt this from java code

*#

  # LEXING PHASE: read one character and decide which token it starts.
  read;

  #--------------
  # whitespace between tokens is discarded
  [:space:] { clear; .reparse }

  #---------------
  # We can elide all these single character tests, because
  # the stack token is just the character itself with a *
  # Braces {} are used for blocks of commands, ',' and '.' for concatenating
  # tests with OR or AND logic. 'B' and 'E' for begin and end
  # tests, '!' is used for negation, ';' is used to terminate a
  # command.
  "{", "}", ";", ",", ".", "!", "B", "E" {
    put; add "*"; push; .reparse
  }

  #---------------
  # format: "text"
  "\"" {
    # save the start line number (for error messages) in case
    # there is no terminating quote character.
# The tape cell now holds "line N (character M) " for the message below.
    clear; add "line "; lines; add " (character "; chars; add ") "; put; clear;
    # read up to the closing quote; a workspace that does not end with
    # a quote is reported as unterminated.
    add '"'; until '"';
    !E'"' {
      clear; add 'Unterminated quote character (") starting at ';
      get; add ' !\n'; print; quit;
    }
    # the quoted text (quotes included) becomes the token attribute
    put; clear;
    add "quote*"; push;
    .reparse
  }

  #---------------
  # format: 'text', single quotes are converted to double quotes
  # but we must escape embedded double quotes.
  "'" {
    # save the start line number (for error messages) in case
    # there is no terminating quote character.
    clear; add "line "; lines; add " (character "; chars; add ") "; put; clear;
    until "'";
    !E"'" {
      clear; add "Unterminated quote (') starting at ";
      get; add '!\n'; print; quit;
    }
    # drop the closing single quote, escape embedded double quotes and
    # wrap the text in double quotes
    clip; escape '"'; put; clear;
    add "\""; get; add "\""; put; clear;
    add "quote*"; push;
    .reparse
  }

  #---------------
  # formats: [:space:] [a-z] [abcd] [:alpha:] etc
  # should class tests really be multiline??!
  "[" {
    # save the start line number (for error messages) in case
    # there is no terminating bracket character.
    clear; add "line "; lines; add " (character "; chars; add ") "; put; clear;
    add "["; until "]";
    "[]" {
      clear; add "pep script error at line "; lines;
      add " (character "; chars; add "): \n";
      add " empty character class [] \n";
      print; quit;
    }
    !E"]" {
      clear; add "Unterminated class text ([...]) starting at ";
      get;
      add " class text can be used in tests or with the 'while' and 'whilenot' commands. For example: [:alpha:] { while [:alpha:]; print; clear; } ";
      print; quit;
    }
    # need to escape quotes so they dont interfere with the
    # quotes cpp needs for .matches("...") ?
    # check!
    escape '"';
    # the caret is not a negation operator in pep scripts
    replace "^" "\\\\^";
    # save the class on the tape
    put; clop; clop;
    # after removing "[" and the first class character the workspace
    # begins with "-" only for a range such as [a-z]
    !B"-" {
      # not a range class, eg [a-z] so need to escape '-' chars
      # java requires a double escape and cpp?
# (still inside the "not a range class" branch of the "[" block)
      clear; get; replace '-' '\\\\-'; put;
    }
    B"-" {
      # a range class, eg [a-z], check if it is correct
      # (after two clips only the "-" may remain)
      clip; clip;
      !"-" {
        clear; add "Error in pep script at line "; lines;
        add " (character "; chars; add "): \n";
        add " Incorrect character range class "; get;
        add " For example: [a-g] # correct [f-gh] # error! \n";
        print; clear; quit;
      }
    }
    clear; get; # restore class text
    B"[:".!E":]" {
      clear; add "malformed character class starting at ";
      get; add '!\n'; print; quit;
    }
    B"[:".!"[:]" {
      # strip "[:" and ":]" so that only the class name is left
      clip; clip; clop; clop;
      # unicode posix character classes
      # Also, abbreviations (not implemented in gh.c yet.)
      # NOTE(review): \p{...} is java regex syntax; if the generated c++
      # uses std::regex these names probably need to become [[:alnum:]]
      # etc - confirm when the class test is ported.
      "alnum","N" { clear; add "\\\\p{Alnum}"; }
      "alpha","A" { clear; add "\\\\p{Alpha}"; }
      "ascii","I" { clear; add "\\\\p{ASCII}"; }
      "blank","B" { clear; add "\\\\p{Blank}"; }
      "cntrl","C" { clear; add "\\\\p{Cntrl}"; }
      "digit","D" { clear; add "\\\\p{Digit}"; }
      "graph","G" { clear; add "\\\\p{Graph}"; }
      "lower","L" { clear; add "\\\\p{Lower}"; }
      "print","P" { clear; add "\\\\p{Print}"; }
      "punct","T" { clear; add "\\\\p{Punct}"; }
      "space","S" { clear; add "\\\\p{Space}"; }
      "upper","U" { clear; add "\\\\p{Upper}"; }
      "xdigit","X" { clear; add "\\\\p{Xdigit}"; }
      # anything that was not rewritten above is an unknown class name
      !B"\\\\p{" {
        put; clear;
        add "Pep script syntax error near line "; lines;
        add " (character "; chars; add "): \n";
        add "Unknown character class '"; get; add "'\n";
        print; clear; quit;
      }
    }
    #*
      alnum - alphanumeric like [0-9a-zA-Z]
      alpha - alphabetic like [a-zA-Z]
      blank - blank chars, space and tab
      cntrl - control chars, ascii 000 to 037 and 177 (del)
      digit - digits 0-9
      graph - graphical chars same as :alnum: and :punct:
      lower - lower case letters [a-z]
      print - printable chars ie :graph: + space
      punct - punctuation ie !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~.
space - all whitespace, eg \n\r\t vert tab, space, \f
      upper - upper case letters [A-Z]
      xdigit - hexadecimal digit ie [0-9a-fA-F]
    *#
    put; clear;
    # add quotes around the class and limits around the
    # class so it can be used with the string.matches() method
    # (must match the whole string, not just one character)
    add '"^'; get; add '+$"'; put; clear;
    add "class*"; push;
    .reparse
  }

  #---------------
  # formats: (eof) (EOF) (==) etc.
  "(" {
    clear; until ")"; clip; put;
    "eof","EOF" { clear; add "eof*"; push; .reparse }
    "==" { clear; add "tapetest*"; push; .reparse }
    add " << unknown test near line "; lines;
    add " of script.\n";
    add " bracket () tests are \n";
    add " (eof) test if end of stream reached. \n";
    add " (==) test if workspace is same as current tape cell \n";
    print; clear; quit;
  }

  #---------------
  # multiline and single line comments, eg #... and #* ... *#
  "#" {
    clear; read;
    "\n" { clear; .reparse }
    # checking for multiline comments of the form "#* \n\n\n *#"
    # these are just ignored at the moment (deleted)
    "*" {
      # save the line number for possible error message later
      clear; lines; put; clear;
      until "*#";
      E"*#" {
        # convert to /* ... */ c++ multiline comment
        clip; clip; put; clear;
        add "/*"; get; add "*/";
        # create a "comment" parse token
        put; clear;
        # comment-out this line to remove multiline comments from the
        # compiled code.
        # add "comment*"; push;
        .reparse
      }
      # make an unterminated multiline comment an error
      # to ease debugging of scripts.
      clear; add "unterminated multiline comment #* ... *# \n";
      # fixed typo in the message: "stating" -> "starting"
      add "starting at line number "; get; add "\n";
      print; clear; quit;
    }
    # single line comments. some will get lost.
    put; clear; add "//"; get; until "\n"; clip; put; clear;
    add "comment*"; push;
    .reparse
  }

  #----------------------------------
  # parse command words (and abbreviations)
  # legal characters for keywords (commands)
  # D, M and Q are listed so that the abbreviations "D" (replace) and
  # "M" (go) and the deprecated "Q" tested below can actually be reached.
  ![abcdefghijklmnopqrstuvwxyzBDEKGMPQRUWS+-<>0^] {
    # error message about a misplaced character
    put; clear;
    add "!! Misplaced character '"; get;
    add "' in script near line "; lines;
    add " (character "; chars; add ") \n";
    print; clear; quit;
  }

  # my testclass implementation cannot handle complex lists
  # eg [a-z+-] this is why I have to write out the whole alphabet
  # (E, O and F are included here so that the word <EOF> can be read)
  while [abcdefghijklmnopqrstuvwxyzBDEOFKGMPQRUWS+-<>0^];

  #----------------------------------
  # KEYWORDS
  # here we can test for all the keywords (command words) and their
  # abbreviated one letter versions (eg: clip k, clop K etc). Then
  # we can print an error message and abort if the word is not a
  # legal keyword for the parse-edit language

  # make ll an alias for "lines" and cc an alias for chars
  "ll" { clear; add "lines"; }
  "cc" { clear; add "chars"; }

  # one letter command abbreviations
  "a" { clear; add "add"; }
  "k" { clear; add "clip"; }
  "K" { clear; add "clop"; }
  "D" { clear; add "replace"; }
  "d" { clear; add "clear"; }
  "t" { clear; add "print"; }
  "p" { clear; add "pop"; }
  "P" { clear; add "push"; }
  "u" { clear; add "unstack"; }
  "U" { clear; add "stack"; }
  "G" { clear; add "put"; }
  "g" { clear; add "get"; }
  "x" { clear; add "swap"; }
  ">" { clear; add "++"; }
  "<" { clear; add "--"; }
  "m" { clear; add "mark"; }
  "M" { clear; add "go"; }
  "r" { clear; add "read"; }
  "R" { clear; add "until"; }
  "w" { clear; add "while"; }
  "W" { clear; add "whilenot"; }
  "n" { clear; add "count"; }
  "+" { clear; add "a+"; }
  "-" { clear; add "a-"; }
  "0" { clear; add "zero"; }
  "c" { clear; add "chars"; }
  "l" { clear; add "lines"; }
  "^" { clear; add "escape"; }
  "v" { clear; add "unescape"; }
  "z" { clear; add "delim"; }
  "S" { clear; add "state"; }
  "q" { clear; add "quit"; }
  "s" { clear; add "write"; }
  "o" { clear; add "nop"; }
  "rs" { clear; add "restart"; }
  "rp" { clear; add "reparse"; }

  # some extra syntax for testeof and testtape
  # (the angle-bracket words had been reduced to empty strings, which can
  # never match here because the workspace holds at least one character)
  "<eof>","<EOF>" { put; clear; add "eof*"; push; .reparse }
  # NOTE(review): '=' is not in the keyword character class above, so
  # this word does not look reachable as written - confirm.
  "<==>" { put; clear; add "tapetest*"; push; .reparse }

  "jump","jumptrue","jumpfalse",
  "testis","testclass","testbegins","testends",
  "testeof","testtape" {
    put; clear;
    add "The instruction '"; get; add "' near line "; lines;
    add " (character "; chars; add ")\n";
    add "can be used in pep assembly code but not scripts. \n";
    print; clear; quit;
  }

  # show information if these "deprecated" commands are used
  "Q","bail","state" {
    put; clear;
    add "The instruction '"; get; add "' near line "; lines;
    add " (character "; chars; add ")\n";
    add "is no longer part of the pep language (july 2020). \n";
    add "use 'quit' instead of 'bail', and use 'unstack; print;' \n";
    add "instead of 'state'. \n";
    print; clear; quit;
  }

  "add","clip","clop","replace","upper","lower","cap","clear","print",
  "pop","push","unstack","stack","put","get","swap",
  "++","--","mark","go","read","until","while","whilenot",
  "count","a+","a-","zero","chars","lines","nochars","nolines",
  "escape","unescape","delim","quit",
  "write","nop","reparse","restart" {
    put; clear; add "word*"; push; .reparse
  }

  #------------
  # the .reparse command and "parse label" is a simple way to
  # make sure that all shift-reductions occur. It should be used inside
  # a block test, so as not to create an infinite loop. There is
  # Is there a "goto" in c++ ?
  # implement .reparse/parse>
  "parse>" {
    clear; count;
    !"0" {
      clear; add "script error:\n";
      add " extra parse> label at line "; lines; add ".\n";
      print; quit;
    }
    clear; add "// parse>"; put; clear;
    add "parse>*"; push;
    # use accumulator to indicate after parse> label
    a+;
    .reparse
  }

  # --------------------
  # implement "begin-blocks", which are only executed
  # once, at the beginning of the script (similar to awk's BEGIN {} rules)
  "begin" { put; add "*"; push; .reparse }

  # none of the keyword tests matched
  add " << unknown command on line "; lines;
  add " (char "; chars; add ")";
  add " of source file. \n";
  print; clear; quit;

# ----------------------------------
# PARSING PHASE:
# Below is the parse/compile phase of the script. Here we pop tokens off the
# stack and check for sequences of tokens eg "word*semicolon*".
# If we find a valid series of tokens, we "shift-reduce" or "resolve" the
# token series eg
#   word*semicolon* --> command*
#
# At the same time, we manipulate (transform) the attributes on the tape, as
# required.
#
parse>

#-------------------------------------
# 2 tokens
#-------------------------------------
  pop; pop;

  # All of the patterns below are currently errors, but may not
  # be in the future if we expand the syntax of the parse
  # language. Also consider:
  #   begintext* endtext* quoteset* notclass*, !* ,* ;* B* E*
  # It is nice to trap the errors here because we can emit some
  # (hopefully not very cryptic) error messages with a line number.
  # Otherwise the script writer has to debug with
  #   pep -a asm.pp -I scriptfile
  #
  "word*word*","word*}*","word*begintext*","word*endtext*",
  "word*!*", "word*,*","quote*word*", "quote*class*",
  "quote*state*", "quote*}*", "quote*begintext*",
  "quote*endtext*", "class*word*", "class*quote*",
  "class*class*", "class*state*", "class*}*",
  "class*begintext*", "class*endtext*", "class*!*",
  "notclass*word*", "notclass*quote*", "notclass*class*",
  "notclass*state*", "notclass*}*" {
    # show both token names and both tape values to help debugging
    add " (Token stack) \nValue: \n"; get;
    add "\nValue: \n"; ++; get; --; add "\n";
    add "Error near line "; lines;
    add " (char "; chars; add ")";
    add " of pep script (missing semicolon?) \n";
    print; clear; quit;
  }

  "{*;*", ";*;*", "}*;*" {
    push; push;
    add "Error near line "; lines;
    add " (char "; chars; add ")";
    add " of pep script: misplaced semi-colon? ; \n";
    print; clear; quit;
  }

  ",*{*" {
    push; push;
    add "Error near line "; lines;
    add " (char "; chars; add ")";
    add " of script: extra comma in list? \n";
    print; clear; quit;
  }

  "command*;*","commandset*;*" {
    push; push;
    add "Error near line "; lines;
    add " (char "; chars; add ")";
    add " of script: extra semi-colon? \n";
    print; clear; quit;
  }

  "!*!*" {
    push; push;
    add "error near line "; lines;
    add " (char "; chars; add ")";
    # this message had been broken in two by a stray line break
    add " of script: \n double negation '!!' is not implemented \n";
    add " and probably won't be, because what would be the point? \n";
    print; clear; quit;
  }

  "!*{*","!*;*" {
    push; push;
    add "error near line "; lines;
    add " (char "; chars; add ")";
    add " of script: misplaced negation operator (!)? \n";
    add " The negation operator precedes tests, for example: \n";
    add " !B'abc'{ ... } or !(eof),!'abc'{ ... } \n";
    print; clear; quit;
  }

  ",*command*" {
    push; push;
    add "error near line "; lines;
    add " (char "; chars; add ")";
    add " of script: misplaced comma? \n";
    print; clear; quit;
  }

  "!*command*" {
    push; push;
    add "error near line "; lines;
    add " (at char "; chars; add ") \n";
    add " The negation operator (!) cannot precede a command \n";
    print; clear; quit;
  }

  ";*{*", "command*{*", "commandset*{*" {
    push; push;
    add "error near line "; lines;
    add " (char "; chars; add ")";
    add " of script: no test for brace block? \n";
    print; clear; quit;
  }

  "{*}*" {
    push; push;
    add "error near line "; lines;
    add " of script: empty braces {}. \n";
    print; clear; quit;
  }

  "B*class*","E*class*" {
    push; push;
    add "error near line "; lines;
    add " of script:\n classes ([a-z], [:space:] etc). \n";
    add " cannot use the 'begin' or 'end' modifiers (B/E) \n";
    print; clear; quit;
  }

  "comment*{*" {
    push; push;
    add "error near line "; lines;
    add " of script: comments cannot occur between \n";
    add " a test and a brace ({). \n";
    print; clear; quit;
  }

  "}*command*" {
    push; push;
    add "error near line "; lines;
    add " of script: extra closing brace '}' ?. \n";
    print; clear; quit;
  }

  #*
  E"begin*".!"begin*" {
    push; push;
    add "error near line "; lines;
    add " of script: Begin blocks must precede code \n";
    print; clear; quit;
  }
  *#

  #------------
  # The .restart command jumps to the first instruction after the
  # begin block (if there is a begin block), or the first instruction
  # of the script.
".*word*" {
    # replace the token names with the attribute of the word token
    # (held in the tape cell after the current one)
    clear; ++; get; --;
    # NOTE(review): "continue script;", "break lex;" and "continue parse;"
    # are labelled jumps as in java; c++ has no labelled break/continue,
    # so the emitted text will need a goto-based equivalent.
    "restart" {
      clear; add "continue script;";
      # not required because we have labelled loops,
      # continue script works both before and after the parse> label
      # "0" { clear; add "continue script;"; }
      # "1" { clear; add "break lex;"; }
      put; clear;
      add "command*"; push;
      .reparse
    }
    "reparse" {
      clear; count;
      # check accumulator to see if we are in the "lex" block
      # or the "parse" block and adjust the .reparse compilation
      # accordingly.
      "0" { clear; add "break lex;"; }
      "1" { clear; add "continue parse;"; }
      put; clear;
      add "command*"; push;
      .reparse
    }
    # any other word after a dot is an error
    push; push;
    add "error near line "; lines;
    add " (char "; chars; add ")";
    add " of script: \n";
    add " misplaced dot '.' (use for AND logic or in .reparse/.restart \n";
    print; clear; quit;
  }

  #---------------------------------
  # Compiling comments so as to transfer them to the cpp
  # (the two attributes are joined with a newline into the first cell)
  "comment*command*","command*comment*","commandset*comment*" {
    clear; get; add "\n"; ++; get; --; put; clear;
    add "command*"; push;
    .reparse
  }

  "comment*comment*" {
    clear; get; add "\n"; ++; get; --; put; clear;
    add "comment*"; push;
    .reparse
  }

  # -----------------------
  # negated tokens.
  #
  # This is a new more elegant way to negate a whole set of
  # tests (tokens) where the negation logic is stored on the
  # stack, not in the current tape cell. We just add "not" to
  # the stack token.
  # eg: ![:alpha:] ![a-z] ![abcd] !"abc" !B"abc" !E"xyz"
  # This format is used to indicate a negative test for
  # a brace block. eg: ![aeiou] { add "< not a vowel"; print; clear; }
  "!*quote*","!*class*","!*begintext*", "!*endtext*",
  "!*eof*","!*tapetest*" {
    # a simplification: store the token name "quote*/class*/..."
    # in the tape cell corresponding to the "!*" token.
    replace "!*" "not"; push;
    # this was a bug?? a missing ++; ??
    # now get the token-value
    # (copy the test's attribute into the cell of the new "not" token)
    get; --; put; ++; clear;
    .reparse
  }

  #-----------------------------------------
  # format: E"text" or E'text'
  # This format is used to indicate a "workspace-ends-with" text before
  # a brace block.
"E*quote*" {
    clear; add "endtext*"; push; get;
    '""' {
      # empty argument is an error
      clear; add "pep script error near line "; lines;
      add " (character "; chars; add "): \n";
      add ' empty argument for end-test (E"") \n';
      print; quit;
    }
    # move the quoted text into the cell of the new endtext* token
    --; put; ++;
    clear; .reparse
  }

  #-----------------------------------------
  # format: B"sometext" or B'sometext'
  # A 'B' preceding some quoted text is used to indicate a
  # 'workspace-begins-with' test, before a brace block.
  "B*quote*" {
    clear; add "begintext*"; push; get;
    '""' {
      # empty argument is an error
      clear; add "pep script error near line "; lines;
      add " (character "; chars; add "): \n";
      add ' empty argument for begin-test (B"") \n';
      print; quit;
    }
    # move the quoted text into the cell of the new begintext* token
    --; put; ++;
    clear; .reparse
  }

  #--------------------------------------------
  # ebnf: command := word, ';' ;
  # formats: "pop; push; clear; print; " etc
  # all commands need to end with a semi-colon except for
  # .reparse and .restart
  #
  # Each branch below replaces the command word in the tape cell with
  # the c++ text emitted for that command.
  "word*;*" {
    clear;
    # check if command requires parameter
    get;
    "add", "until", "while", "whilenot", "mark", "go",
    "escape", "unescape", "delim", "replace" {
      put; clear; add "'"; get; add "'";
      add " << command needs an argument, on line "; lines;
      add " of script.\n";
      print; clear; quit;
    }

    "clip" {
      clear;
      # are these length tests really necessary
      add "if (!mm.work.empty()) { /* clip */\n";
      add " mm.work.erase(mm.work.size() - 1); }";
      put;
    }
    "clop" {
      clear;
      add "if (!mm.work.empty()) { \n";
      add " mm.work.erase(mm.work.begin()); } /* clop */";
      put;
    }
    "clear" {
      clear; add 'mm.work.clear(); /* clear */'; put;
    }
    # upper/lower/cap are still placeholders that only print a message.
    # The emitted statements used to end with an unbalanced ")" which
    # made the generated c++ a syntax error; that paren is removed.
    "upper" {
      clear; add "out << \"upper not done!!\"; /* upper */"; put;
    }
    "lower" {
      clear; add "out << \"lower not done!!\"; /* lower */"; put;
    }
    "cap" {
      clear; add "out << \"cap not done!!\"; /* cap */"; put;
    }
    "print" {
      clear; add "out << mm.work; /* print */"; put;
    }
    "pop" { clear; add "mm.pop();"; put; }
    "push" { clear; add "mm.push();"; put; }
    "unstack" {
      clear; add "while (mm.pop()); /* unstack */"; put;
    }
    "stack" {
      clear; add "while(mm.push()); /* stack */"; put;
    }
    "put" {
      clear; add "mm.tape.at(mm.cell) = mm.work; /* put */"; put;
    }
    "get" {
      clear; add "mm.work += mm.tape.at(mm.cell); /* get */"; put;
    }
    "swap" { clear; add "mm.swap();"; put; }
    "++" { clear; add "mm.increment(); /* ++ */"; put; }
    "--" {
      clear; add "if (mm.cell > 0) { mm.cell--; } /* -- */"; put;
    }
    "read" { clear; add "mm.readNext(); /* read */"; put; }
    # count, chars and lines append a number to the workspace as text.
    # In c++ "string += int" appends a single character with that code,
    # so the number is converted with std::to_string first.
    "count" {
      clear; add "mm.work += std::to_string(mm.counter); /* count */"; put;
    }
    "a+" { clear; add "mm.counter++; /* a+ */"; put; }
    "a-" { clear; add "mm.counter--; /* a- */"; put; }
    "zero" { clear; add "mm.counter = 0; /* zero */"; put; }
    "chars" {
      clear; add "mm.work += std::to_string(mm.charsRead); /* chars */"; put;
    }
    "lines" {
      clear; add "mm.work += std::to_string(mm.linesRead); /* lines */"; put;
    }
    "nochars" { clear; add "mm.charsRead = 0; /* nochars */"; put; }
    "nolines" { clear; add "mm.linesRead = 0; /* nolines */"; put; }
    # use a labelled loop to quit script.
    # NOTE(review): "break script;" is a labelled break (java); c++ has
    # no labelled break, so this still needs a c++ equivalent.
    "quit" { clear; add "break script;"; put; }
    "write" { clear; add 'mm.writeToFile("sav.pp");'; put; }
    # just eliminate since it does nothing.
    "nop" { clear; add "/* nop: no-operation eliminated */"; put; }

    clear; add "command*"; push;
    .reparse
  }

  #-----------------------------------------
  # ebnf: commandset := command , command ;
  "command*command*", "commandset*command*" {
    clear; add "commandset*"; push;
    # format the tape attributes.
# Add the next command on a newline
    --; get; add "\n"; ++; get; --; put; ++;
    clear; .reparse
  }

  #-------------------
  # here we begin to parse "test*" and "ortestset*" and "andtestset*"
  #

  #-------------------
  # eg: B"abc" {} or E"xyz" {}
  # transform and markup the different test types
  #
  # The emitted c++ text is built as prefix + argument + ")":
  #   begintext  (mm.work.rfind("abc", 0) == 0)
  #   quote      (mm.work == "abc")
  #   tapetest   (mm.work == mm.tape[mm.cell])
  #   eof        mm.eof
  "begintext*,*","endtext*,*","quote*,*","class*,*",
  "eof*,*","tapetest*,*",
  "begintext*.*","endtext*.*","quote*.*","class*.*",
  "eof*.*","tapetest*.*",
  "begintext*{*","endtext*{*","quote*{*","class*{*",
  "eof*{*","tapetest*{*" {
    #//if (s.rfind("titi", 0) == 0) { // pos=0 limits the search to the prefix
    B"begin" { clear; add "(mm.work.rfind("; }
    # NOTE(review): endsWith() and matches() are not members of
    # std::string; these two tests still need a c++ equivalent.
    B"end" { clear; add "mm.work.endsWith("; }
    B"quote" { clear; add "(mm.work == "; }
    B"class" { clear; add "mm.work.matches("; }
    # clear the tapecell for testeof and testtape because
    # they take no arguments.
    B"eof" { clear; put; add "mm.eof"; }
    B"tapetest" {
      clear; put; add "(mm.work == mm.tape[mm.cell]";
    }
    get;
    # a bare rfind() result is not a boolean "starts with": limit the
    # search to position 0 and compare the result with 0.
    B"(mm.work.rfind(" { add ", 0) == 0"; }
    !B"mm.eof" { add ")"; }
    put;

    #*
    # maybe we could ellide the not tests by doing here
    B"not" { clear; add "!"; get; put; }
    *#

    clear; add "test*"; push;
    # the trick below pushes the right token back on the stack.
    get; add "*"; push;
    .reparse
  }

  #-------------------
  # negated tests
  # eg: !B"xyz {} !(eof) {} !(==) {}
  # !E"xyz" {}
  # !"abc" {}
  # ![a-z] {}
  #
  # Same scheme as the block above with the condition inverted:
  #   notbegintext  !(mm.work.rfind("abc", 0) == 0)
  #   notquote      (mm.work != "abc")
  #   nottapetest   (mm.work != mm.tape[mm.cell])
  #   noteof        !mm.eof
  "notbegintext*,*","notendtext*,*","notquote*,*","notclass*,*",
  "noteof*,*","nottapetest*,*",
  "notbegintext*.*","notendtext*.*","notquote*.*","notclass*.*",
  "noteof*.*","nottapetest*.*",
  "notbegintext*{*","notendtext*{*","notquote*{*","notclass*{*",
  "noteof*{*","nottapetest*{*" {
    B"notbegin" { clear; add "!(mm.work.rfind("; }
    # NOTE(review): endsWith() and matches() are not members of
    # std::string; these two tests still need a c++ equivalent.
    B"notend" { clear; add "!mm.work.endsWith("; }
    B"notquote" { clear; add "(mm.work != "; }
    B"notclass" { clear; add "!mm.work.matches("; }
    # clear the tapecell for testeof and testtape because
    # they take no arguments.
    B"noteof" { clear; put; add "!mm.eof"; }
    B"nottapetest" {
      clear; put; add "(mm.work != mm.tape[mm.cell]";
    }
    get;
    B"!(mm.work.rfind(" { add ", 0) == 0"; }
    !B"!mm.eof" { add ")"; }
    put;
    clear; add "test*"; push;
    # the trick below pushes the right token back on the stack.
    get; add "*"; push;
    .reparse
  }

#-------------------
# 3 tokens
#-------------------
  pop;

  #-----------------------------
  # some 3 token errors!!!
  # not a comprehensive list
  "{*quote*;*","{*begintext*;*","{*endtext*;*","{*class*;*",
  "commandset*quote*;*", "command*quote*;*" {
    push; push; push;
    add "[pep error]\n invalid syntax near line "; lines;
    add " (char "; chars; add ")";
    add " of script (misplaced semicolon?) \n";
    print; clear; quit;
  }

  # to simplify subsequent tests, transmogrify a single command
  # to a commandset (multiple commands).
  "{*command*}*" {
    clear; add "{*commandset*}*"; push; push; push;
    .reparse
  }

  # errors! mixing AND and OR concatenation
  ",*andtestset*{*", ".*ortestset*{*" {
    # push the tokens back to make debugging easier
    push; push; push;
    add " error: mixing AND (.) and OR (,) concatenation in \n";
    add " in pep script near line "; lines;
    add " (character "; chars; add ") \n";
    add ' For example: B".".!E"/".[abcd./] { print; } # Correct! B".".!E"/",[abcd./] { print; } # Error! \n';
    print; clear; quit;
  }

  #--------------------------------------------
  # ebnf: command := keyword , quoted-text , ";" ;
  # format: add "text";
  "word*quote*;*" {
    clear; get;
    "replace" {
      # error
      add "< command requires 2 parameters, not 1 \n";
      add "near line "; lines;
      add " of script. \n";
      print; clear; quit;
    }

    # check whether argument is single character, otherwise
    # throw an error
    "escape", "unescape", "while", "whilenot" {
      # This is trickier than I thought it would be.
# fetch the quoted argument from the next tape cell
      clear; ++; get; --;
      # check that arg not empty, (but an empty quote is ok
      # for the second arg of 'replace'
      '""' {
        clear; add "[pep error] near line "; lines;
        add " (or char "; chars; add "): \n";
        add " command '"; get;
        add '\' cannot have an empty argument ("") \n';
        print; quit;
      }
      # quoted text has the quotes still around it.
      # also handle escape characters like \n \r etc
      # NOTE(review): this strips up to five characters, so some
      # arguments longer than one character appear to pass - confirm.
      clip; clop; clop; clop;
      # B "\\" { clip; }
      clip;
      !"" {
        clear; add "Pep script error near line "; lines;
        add " (character "; chars; add "): \n";
        add " command '"; get;
        add "' takes only a single character argument. \n";
        print; quit;
      }
      # restore the command word for the tests below
      clear; get;
    }

    # mark: store the argument as the mark of the current tape cell.
    # The emitted text now uses std::string calls and the machine's
    # "cell" member (setLength() and tapePointer do not exist in the
    # c++ Machine).
    "mark" {
      clear;
      add "/* mark */ \n";
      add "mm.marks[mm.cell].clear(); // mark \n";
      add "mm.marks[mm.cell].append("; ++; get; --;
      add "); // mark";
      put; clear;
      add "command*"; push;
      .reparse
    }

    # go: move the tape pointer to the cell carrying the given mark.
    # ("var", ".length" and ".equals" were javascript/java leftovers.)
    "go" {
      clear;
      add "/* go */\n";
      add "for (int ii = 0; ii < static_cast<int>(mm.marks.size()); ii++) \n";
      add " if (mm.marks[ii] == "; ++; get; --;
      add ") \n";
      add " { mm.cell = ii; }";
      put; clear;
      add "command*"; push;
      .reparse
    }

    "delim" {
      clear;
      # this.delimiter.setCharAt(0, text.charAt(0));
      # only the first character of the delimiter argument is used.
# std::string has no setLength(); clear() empties the delimiter
      add "mm.delimiter.clear(); /* delim */\n";
      add "mm.delimiter.append("; ++; get; --; add "); ";
      put; clear;
      add "command*"; push;
      .reparse
    }

    "add" {
      clear;
      add "mm.work.append("; ++; get; --;
      # handle multiline text
      replace "\n" '"); \nmm.work.append("\\n';
      add "); /* add */";
      put; clear;
      add "command*"; push;
      .reparse
    }

    # while/whilenot with a quoted single character: the emitted loop
    # compares the next input character with the first character of the
    # literal ("x"[0]; ".charAt(0)" was java) and reads with readNext(),
    # the method the "read" command also emits.
    "while" {
      clear;
      add "while ((char) mm.peep == "; ++; get; --;
      add "[0]) /* while */\n ";
      add " { if (mm.eof) {break;} mm.readNext(); }";
      put; clear;
      add "command*"; push;
      .reparse
    }

    "whilenot" {
      clear;
      add "while ((char) mm.peep != "; ++; get; --;
      add "[0]) /* whilenot */\n ";
      add " { if (mm.eof) {break;} mm.readNext(); }";
      put; clear;
      add "command*"; push;
      .reparse
    }

    "until" {
      clear;
      add "mm.until("; ++; get; --;
      # error until cannot have empty argument
      'mm.until(""' {
        clear; add "Pep script error near line "; lines;
        add " (character "; chars; add "): \n";
        add " empty argument for 'until' \n";
        add " For example: until '.txt'; until \">\"; # correct until ''; until \"\"; # errors! \n";
        print; quit;
      }
      # handle multiline argument
      replace "\n" "\\n";
      add ');';
      put; clear;
      add "command*"; push;
      .reparse
    }

    # But really, can't the "replace" command just be used
    # instead of escape/unescape?? This seems a flaw in the
    # machine design.
    "escape","unescape" {
      clear;
      add "mm."; get; add "Char";
      # pass the first character of the literal, eg mm.escapeChar("x"[0]);
      add "("; ++; get; --; add '[0]);';
      put; clear;
      add "command*"; push;
      .reparse
    }

    # error, superfluous argument
    add ": command does not take an argument \n";
    add "near line "; lines;
    add " of script. \n";
    print; clear;
    #state
    quit;
  }

  #----------------------------------
  # format: "while [:alpha:] ;" or whilenot [a-z] ;
  # NOTE(review): Character.toString(...).matches(...) is java; the
  # class form of while/whilenot still needs a c++ equivalent.
  "word*class*;*" {
    clear; get;
    "while" {
      clear;
      add "/* while */ \n";
      add "while (Character.toString((char)mm.peep).matches("; ++; get; --;
      add ")) { if (mm.eof) { break; } mm.readNext(); }";
      put; clear;
      add "command*"; push;
      .reparse
    }

    "whilenot" {
      clear;
      add "/* whilenot */ \n";
      add "while (!Character.toString((char)mm.peep).matches("; ++; get; --;
      add ")) { if (mm.eof) { break; } mm.readNext(); }";
      put; clear;
      add "command*"; push;
      .reparse
    }
    # error
    add " < command cannot have a class argument \n";
    add "line "; lines; add ": error in script \n";
    print; clear; quit;
  }

  # arrange the parse> label loops
  (eof) {
    "commandset*parse>*commandset*","command*parse>*commandset*",
    "commandset*parse>*command*","command*parse>*command*" {
      clear;
      # indent both code blocks
      add " "; get; replace "\n" "\n "; put; clear; ++; ++;
      add " "; get; replace "\n" "\n "; put; clear; --; --;
      # add a block so that .reparse works before the parse> label.
# wrap the code before the parse> label in a "lex" block and the code
# after it in a "parse" loop.
# NOTE(review): "break parse;" is a labelled break (java); c++ has no
# labelled break, so this emitted text still needs a c++ equivalent.
      add "lex: { \n";
      get; add "\n}\n"; ++; ++;
      # indent code block
      # add " "; get; replace "\n" "\n "; put; clear;
      add "parse: \n";
      add "while (true) { \n"; get;
      add "\n break parse;\n}"; --; --; put;
      clear; add "commandset*"; push;
      .reparse
    }
  }

# -------------------------------
# 4 tokens
# -------------------------------
  pop;

  #-------------------------------------
  # bnf: command := replace , quote , quote , ";" ;
  # example: replace "and" "AND" ;
  "word*quote*quote*;*" {
    clear; get;
    "replace" {
      #---------------------------
      # a command plus 2 arguments, eg replace "this" "that"
      # NOTE(review): the emitted text assigns to "temp", which is not
      # declared in this fragment, and uses a two-argument replace() as
      # in java; std::string::replace has a different meaning - confirm
      # and port.
      clear;
      add "/* replace */ \n";
      add "if (!mm.work.empty()) { \n";
      add " temp = mm.work.replace(";
      ++; get; add ", ";
      ++; get; add ");\n";
      add " mm.work.clear(); \n";
      add " mm.work.append(temp);\n} ";
      --; --; put;
      clear;
      add "command*"; push;
      .reparse
    }
    add "pep script error on line "; lines;
    add " (character "; chars; add "): \n";
    add " command does not take 2 quoted arguments. \n";
    print; quit;
  }

  #-------------------------------------
  # format: begin { #* commands *# }
  # "begin" blocks which are only executed once (they
  # are assembled before the "start:" label). They must come before
  # all other commands.
  #
  # "begin*{*command*}*",
  "begin*{*commandset*}*" {
    # keep only the code of the block as the attribute of beginblock*
    clear;
    ++; ++; get; --; --; put; clear;
    add "beginblock*"; push;
    .reparse
  }

  # -------------
  # parses and compiles concatenated tests
  # eg: 'a',B'b',E'c',[def],[:space:],[g-k] { ...
  # these 2 tests should be all that is necessary
  "test*,*ortestset*{*", "test*,*test*{*" {
    # join the two conditions with the c++ logical OR
    clear; get; add " || "; ++; ++; get; --; --; put;
    clear; add "ortestset*{*"; push; push;
    .reparse
  }

  # dont mix AND and OR concatenations

  # -------------
  # AND logic
  # parses and compiles concatenated AND tests
  # eg: 'a',B'b',E'c',[def],[:space:],[g-k] { ...
  # it is possible to elide this block with the negated block
  # for compactness but maybe readability is not as good.
  # negated tests can be chained with non negated tests.
  # eg: B'http' . !E'.txt' { ...
# } (closes the example in the AND-logic comment that precedes this line)
  "test*.*andtestset*{*", "test*.*test*{*" {
    # join the two conditions with the c++ logical AND
    clear; get; add " && "; ++; ++; get; --; --; put;
    clear; add "andtestset*{*"; push; push;
    .reparse
  }

  #-------------------------------------
  # we should not have to check for the {*command*}* pattern
  # because that has already been transformed to {*commandset*}*
  "test*{*commandset*}*",
  "andtestset*{*commandset*}*",
  "ortestset*{*commandset*}*" {
    clear;
    # indent the generated code for readability
    ++; ++; add " "; get; replace "\n" "\n "; put; --; --;
    # emit: if (condition) { commands }
    clear; add "if ("; get; add ") {\n";
    ++; ++; get; add "\n}";
    --; --; put; clear;
    add "command*"; push;
    # always reparse/compile
    .reparse
  }

  # -------------
  # multi-token end-of-stream errors
  # not a comprehensive list of errors...
  (eof) {
    E"begintext*",E"endtext*",E"test*",E"ortestset*",E"andtestset*" {
      add " Error near end of script at line "; lines;
      add ". Test with no brace block? \n";
      print; clear; quit;
    }

    E"quote*",E"class*",E"word*" {
      put; clear;
      add "Error at end of pep script near line "; lines;
      add ": missing semi-colon? \n";
      add "Parse stack: "; get; add "\n";
      print; clear; quit;
    }

    E"{*", E"}*", E";*", E",*", E".*", E"!*", E"B*", E"E*" {
      put; clear;
      add "Error: misplaced terminal character at end of script! (line ";
      lines; add "). \n";
      add "Parse stack: "; get; add "\n";
      print; clear; quit;
    }
  }

  # put the 4 (or less) tokens back on the stack
  push; push; push; push;

  (eof) {
    print; clear;
    # create the virtual machine object code and save it
    # somewhere on the tape.
    # NOTE(review): the template text below continues beyond this line;
    # its #include targets and vector element types look as if text in
    # angle brackets has been lost - confirm against the original file.
    add ' /* C++ code generated by "translate.cpp.pss" */ // NOTE: script in developement. #include #include #include using namespace std; class Machine { // how does cpp do unicode? private: int counter; // counter for anything int peep; // next char in input stream int charsRead; // No. of chars read so far int linesRead; // No. of lines read so far string work; // text buffer for all manipulations. 
vector<string> stack; // parse token stack int LENGTH = 100; // current tape length vector<string> tape; // array of token attributes vector<string> marks; // tape marks int cell; // pointer to current cell Reader input; // text input stream bool eof; // end of stream reached? bool flag; /* set to false by the constructor, which used it without a declaration */ string escape; // char used to "escape" others "\\" string delimiter; // push/pop delimiter (default is "*") public: /** make a new machine with a character stream reader */ // and an output stream? Machine(Reader reader) { this.input = reader; this.eof = false; this.flag = false; this.charsRead = 0; this.linesRead = 1; this.escape = "\\\\"; this.delimiter = "*"; this.counter = 0; this.work = ""; //this.stack = vector(); ?? this.cell = 0; /* fill the tape and marks up to the declared tape length */ for (int ii = 0; ii < this.LENGTH; ii++) { this.tape.push_back(""); this.marks.push_back(""); } this.peep = this.input.read(); } /* init */ /** read one character from the input stream and update the machine. */ void readNext() { int iChar; if (this.eof) { exit(0); } this.charsRead++; // increment lines if ((char)this.peep == \'\\n\') { this.linesRead++; } this.work += this.peep; this.peep = this.input.read(); if (this.peep == -1) { this.eof = true; } } /** increment tape pointer by one, growing the tape by 50 cells when the pointer reaches the current tape length */ void increment() { this.cell++; if (this.cell >= this.LENGTH) { for (int ii = 0; ii < 50; ii++) { this.tape.push_back(""); this.marks.push_back(""); } /* record the growth so the tape is not extended again on every later call */ this.LENGTH += 50; } } /** remove escape character */ void unescapeChar(char c) { if (work.length() > 0) { string s = this.work.replace("\\\\"+c, c+""); this.work.setLength(0); work.append(s); } } /** add escape character */ void escapeChar(char c) { if (!this.work.empty()) { string s = this.work.replace(c+"", "\\\\"+c); work.setLength(0); work.append(s); } } /** whether trailing escapes \\\\ are even or odd */ // untested code. check! 
eg try: add "x \\\\"; print; etc /* returns true when an odd number of escape characters sits directly before the suffix at the end of ss */ bool isEscaped(string ss, string sSuffix) { int count = 0; if (ss.length() < 2) return false; if (ss.length() <= sSuffix.length()) return false; if (ss.indexOf(this.escape.charAt(0)) == -1) { return false; } /* start on the character just before the suffix; starting on the suffix itself never counted anything unless the suffix began with the escape character */ int pos = ss.length()-sSuffix.length()-1; while ((pos > -1) && (ss.charAt(pos) == this.escape.charAt(0))) { count++; pos--; } if (count % 2 == 0) return false; return true; } /* a helper to see how many trailing \\\\ escape chars */ int countEscaped(string sSuffix) { string s = ""; int count = 0; int index = this.work.lastIndexOf(sSuffix); // remove suffix if it exists if (index > 0) { s = this.work.substring(0, index); } while (s.endsWith(this.escape)) { count++; s = s.substring(0, s.lastIndexOf(this.escape)); } return count; } /** reads the input stream until the workspace end with text */ // can test this with void until(string sSuffix) { // read at least one character if (this.eof) return; this.readNext(); while (true) { if (this.eof) { return; } if (this.work.endsWith(sSuffix)) { if (this.countEscaped(sSuffix) % 2 == 0) { return; } } this.readNext(); } } /** pop the first token from the stack into the workspace */ bool pop() { if (this.stack.isEmpty()) { return false; } /* pop_back() returns nothing, so copy the top token before removing it */ this.work.insert(0, this.stack.back()); this.stack.pop_back(); if (this.cell > 0) { this.cell--; } return true; } /** push the first token from the workspace to the stack */ bool push() { string sItem = ""; // dont increment the tape pointer on an empty push if (this.work.empty()) { return false; } int iFirstStar = this.work.indexOf(this.delimiter); if (iFirstStar != -1) { sItem = this.work.substring(0, iFirstStar + 1); this.work.delete(0, iFirstStar + 1); } else { sItem = this.work; this.work = ""; } this.stack.push_back(sItem); this.increment(); return true; } /** swap current tape cell with the workspace */ // trivial in c++ ? 
void swap() { this.work.swap(this.tape[this.cell]); } /** save the workspace to a file */ void writeToFile(string filename) { std::ofstream out(filename); out << this.work; out.close(); } /** parse/check/compile the input */ void parse(InputStreamReader input) { //this is where the actual parsing/compiling code should go //but this means that all generated code must use //"this." not "mm." } }; /* end of machine class definition */ /* we could read from a file or stdin here */ int main() { string temp = ""; // this calls the default constructor in c++ Machine mm; \n'; # save the code in the current tape cell put; clear; #--------------------- # check if the script correctly parsed (there should only # be one token on the stack, namely "commandset*" or "command*"). pop; pop; "commandset*", "command*" { clear; # indent generated code (6 spaces) for readability. add " "; get; replace "\n" "\n "; put; clear; # restore the cpp preamble from the tape ++; get; --; add ' script: while (!mm.eof) {\n'; get; #* the preamble already closed the Machine class, so only the script loop and main() are still open: emit two closing braces, not three *# add "\n }"; add "\n}\n"; # put a copy of the final compilation into the tapecell # so it can be inspected interactively. put; print; clear; quit; } "beginblock*commandset*", "beginblock*command*" { clear; # indent begin block code add " "; get; replace "\n" "\n "; put; clear; # indent main code for readability. ++; add " "; get; replace "\n" "\n "; put; clear; --; # get cpp preamble from tape ++; ++; get; --; --; get; add "\n"; ++; # a labelled loop for "quit" (but quit can just exit?) add " script: \n"; add " while (!mm.eof) {\n"; get; #* as above: close the script loop and main() only *# add "\n }"; add "\n}\n"; # put a copy of the final compilation into the tapecell # for interactive debugging. 
put; print; clear; quit; } push; push; # try to explain some more errors unstack; B"parse>" { put; clear; add "[error] pep syntax error:\n"; add " The parse> label cannot be the 1st item \n"; add " of a script \n"; print; quit; } #* Generic failure report: the script did not reduce to a single commandset, so print a short message, then the unreduced parse stack, then two pep commands that help with debugging. *# put; clear; add "[error] compiling with 'translate.cpp.pss' (at EOF): \n "; add " parse error in input script. \n "; print; clear; unstack; put; clear; add "Parse stack: "; get; add "\n"; add " * debug script \n "; add " >> pep -If script -i 'some input' \n "; add " * debug compilation. \n "; add " >> pep -Ia asm.pp script \n "; print; clear; quit; } # not eof # there is an implicit .restart command here (jump start)