#*

  translate.rust.pss

  This is a parse-script which translates parse-scripts into rust
  code, using the 'pep' tool. The script creates a standalone
  compilable rust program.

  The virtual machine and engine are implemented in plain c at
  http://bumble.sf.net/books/pars/pep.c. This implements a script
  language with a syntax reminiscent of sed and awk (much simpler
  than awk, but more complex than sed).

STATUS

  Adapting code, but I am finding rust very confusing and
  unnecessarily complicated. May work on the cpp translator instead.

NOTES

  No multiline comments in rust? (There are: rust has /* ... */
  block comments, so multiline pep comments can be carried over.)

  We may use labelled loops and break/continue to implement the
  parse> label and the .reparse and .restart commands. Breaks are
  also used to implement the quit and bail commands.

TODO

  Convert the parsing code to a method which takes an input stream
  as a parameter. This way the same parser/compiler can be used with
  a string/file/stdin etc. and can also be used by other
  classes/objects.

SEE ALSO

  At http://bumble.sf.net/books/pars/

  translate.java.pss
    A very similar script for compiling scripts into java.

  And all the other translation scripts in pars/tr/

TESTING

  * testing the multiple escaped until bug
  >> pep.rsas 'r;until"c";add".";t;d;' 'ab\\cab\cabc'

  Complex scripts can be translated into java and work,
  including this script itself.

GOTCHAS

  I was trying to run
  >> pep -e "r;a'\\';print;d;" -i "abc"

  and I kept getting an unterminated quote message, which I thought
  I had fixed in machine.interp.c (until code). But the problem was
  actually the bash shell, which resolves \\ to \ in double quotes,
  but not in single quotes!

BUGS

  These are java bugs, not rust.

  Xdigit is not a valid class in java.

  It's a bit strange to talk about a multicharacter string being
  "escaped" (eg when calling 'until') but this is allowed in the
  pep engine.

  add "\{"; will generate an "illegal escape character" error when
  trying to compile the generated java code. I need to consider what
  to do in this situation (eg escape \ to \\ ?)

SOLVED BUGS

  check "go/mark" code.
what happens if the mark is not found?? throw error and exit I think.

  found a bug in "replace" code, which was returning from inline code.

RUST NOTES

  * delete a string
  >> s.clear()

  * a vector of chars
  >> let mut chars: Vec<char> = pangram.chars().collect();

  * append a char to a string
  ----
    s.push(c);
    // append string
    string.push_str(", ");
  ,,,

  * iterate an array
  -----
    for word in pangram.split_whitespace().rev() {
      println!("> {}", word);
    }
  ,,,

TASKS

HISTORY

  4 july 2022
    Began to adapt from tr/translate.java.pss
    Also working on eg/sed.tojava.pss

*#

# LEXING PHASE: read one character and decide which token it starts.
read;

#--------------
# whitespace between tokens is discarded
[:space:] { clear; .reparse }

#---------------
# We can elide all these single character tests, because
# the stack token is just the character itself with a *
# Braces {} are used for blocks of commands, ',' and '.' for concatenating
# tests with OR or AND logic. 'B' and 'E' for begin and end
# tests, '!' is used for negation, ';' is used to terminate a
# command.
"{", "}", ";", ",", ".", "!", "B", "E" {
  # the character is stored as the token attribute and "<char>*"
  # becomes the token on the stack
  put; add "*"; push; .reparse
}

#---------------
# format: "text"
"\"" {
  # save the start line number (for error messages) in case
  # there is no terminating quote character.
  clear; add "line "; lines; add " (character "; chars; add ") "; put; clear;
  # read up to the closing quote; the attribute keeps both quotes
  add '"'; until '"';
  !E'"' {
    clear; add 'Unterminated quote character (") starting at ';
    get; add ' !\n'; print; quit;
  }
  put; clear;
  add "quote*";
  push;
  .reparse
}

#---------------
# format: 'text', single quotes are converted to double quotes
# but we must escape embedded double quotes.
"'" {
  # save the start line number (for error messages) in case
  # there is no terminating quote character.
clear; add "line "; lines; add " (character "; chars; add ") "; put; clear;
  until "'";
  !E"'" {
    clear; add "Unterminated quote (') starting at ";
    get; add '!\n'; print; quit;
  }
  # drop the closing single quote, escape embedded double quotes and
  # re-wrap the text in double quotes before saving it as the attribute
  clip; escape '"'; put; clear;
  add "\""; get; add "\"";
  put; clear;
  add "quote*";
  push;
  .reparse
}

#---------------
# formats: [:space:] [a-z] [abcd] [:alpha:] etc
# should class tests really be multiline??!
"[" {
  # save the start line number (for error messages) in case
  # there is no terminating bracket character.
  clear; add "line "; lines; add " (character "; chars; add ") "; put; clear;
  add "["; until "]";
  "[]" {
    clear; add "pep script error at line "; lines;
    add " (character "; chars; add "): \n";
    add " empty character class [] \n";
    print; quit;
  }
  !E"]" {
    clear; add "Unterminated class text ([...]) starting at ";
    get;
    add " class text can be used in tests or with the 'while' and 'whilenot' commands. For example: [:alpha:] { while [:alpha:]; print; clear; } ";
    print; quit;
  }
  # NOTE(review): the escaping below was written for the java
  # translator (java regex strings); confirm what the rust target needs.
  # need to escape quotes so they dont interfere with the
  # quotes java needs for .matches("...")
  escape '"';
  # the caret is not a negation operator in pep scripts
  replace "^" "\\\\^";
  # save the class on the tape
  put;
  # remove "[" and the first class character so that a range
  # class (eg [a-z]) now begins with "-"
  clop; clop;
  !B"-" {
    # not a range class, eg [a-z] so need to escape '-' chars
    # java requires a double escape
    clear; get;
    replace '-' '\\\\-';
    put;
  }
  B"-" {
    # a range class, eg [a-z], check if it is correct
    clip; clip;
    !"-" {
      clear; add "Error in pep script at line "; lines;
      add " (character "; chars; add "): \n";
      add " Incorrect character range class "; get;
      add " For example: [a-g] # correct [f-gh] # error! \n";
      print; clear; quit;
    }
  }
  clear; get;
  # restore class text
  B"[:".!E":]" {
    clear; add "malformed character class starting at ";
    get; add '!\n'; print; quit;
  }
  B"[:".!"[:]" {
    # strip "[:" and ":]" to leave only the class name
    clip; clip; clop; clop;
    # unicode posix character classes in java
    # Also, abbreviations (not implemented in gh.c yet.)
    "alnum","N" { clear; add "\\\\p{Alnum}"; }
    "alpha","A" { clear; add "\\\\p{Alpha}"; }
    "ascii","I" { clear; add "\\\\p{ASCII}"; }
    "blank","B" { clear; add "\\\\p{Blank}"; }
    "cntrl","C" { clear; add "\\\\p{Cntrl}"; }
    "digit","D" { clear; add "\\\\p{Digit}"; }
    "graph","G" { clear; add "\\\\p{Graph}"; }
    "lower","L" { clear; add "\\\\p{Lower}"; }
    "print","P" { clear; add "\\\\p{Print}"; }
    "punct","T" { clear; add "\\\\p{Punct}"; }
    "space","S" { clear; add "\\\\p{Space}"; }
    "upper","U" { clear; add "\\\\p{Upper}"; }
    "xdigit","X" { clear; add "\\\\p{XDigit}"; }
    # anything that was not rewritten above is an unknown class name
    !B"\\\\p{" {
      put; clear; add "Pep script syntax error near line "; lines;
      add " (character "; chars; add "): \n";
      add "Unknown character class '"; get; add "'\n";
      print; clear; quit;
    }
  }
  #*
    alnum - alphanumeric like [0-9a-zA-Z]
    alpha - alphabetic like [a-zA-Z]
    blank - blank chars, space and tab
    cntrl - control chars, ascii 000 to 037 and 177 (del)
    digit - digits 0-9
    graph - graphical chars same as :alnum: and :punct:
    lower - lower case letters [a-z]
    print - printable chars ie :graph: + space
    punct - punctuation ie !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~.
    space - all whitespace, eg \n\r\t vert tab, space, \f
    upper - upper case letters [A-Z]
    xdigit - hexadecimal digit ie [0-9a-fA-F]
  *#
  put; clear;
  # add quotes around the class and limits around the
  # class so it can be used with the string.matches() method
  # (must match the whole string, not just one character)
  add '"^'; get; add '+$"'; put; clear;
  add "class*";
  push;
  .reparse
}

#---------------
# formats: (eof) (EOF) (==) etc.
"(" {
  # read to the closing bracket and keep only the text inside it
  clear; until ")"; clip; put;
  "eof","EOF" { clear; add "eof*"; push; .reparse }
  "==" { clear; add "tapetest*"; push; .reparse }
  add " << unknown test near line "; lines;
  add " of script.\n";
  add " bracket () tests are \n";
  add " (eof) test if end of stream reached. \n";
  add " (==) test if workspace is same as current tape cell \n";
  print; clear; quit;
}

#---------------
# multiline and single line comments, eg #... and #* ...
# ... *#
# Tokenise script comments. Single line comments become "comment*"
# tokens (emitted as // comments); multiline comments are dropped.
"#" {
  clear; read;
  # an empty comment line: nothing to keep
  "\n" { clear; .reparse }
  # checking for multiline comments of the form "#* \n\n\n *#"
  # these are just ignored at the moment (deleted)
  "*" {
    # save the line number for possible error message later
    clear; lines; put; clear;
    until "*#";
    E"*#" {
      # convert to /* ... */ java multiline comment
      clip; clip; put; clear; add "/*"; get; add "*/";
      # create a "comment" parse token
      put; clear;
      # comment-out this line to remove multiline comments from the
      # compiled java.
      # NOTE(review): the flattened source does not show whether the
      # next line is commented out; it is treated as disabled here,
      # matching "these are just ignored" above. Confirm upstream.
      # add "comment*"; push;
      .reparse
    }
    # make an unterminated multiline comment an error
    # to ease debugging of scripts.
    clear; add "unterminated multiline comment #* ... *# \n";
    add "starting at line number "; get; add "\n";
    print; clear; quit;
  }
  # single line comments. some will get lost.
  put; clear; add "//"; get; until "\n"; clip;
  put; clear; add "comment*"; push;
  .reparse
}

#----------------------------------
# parse command words (and abbreviations)

# legal characters for keywords (commands)
# D, M and Q are included so that the "D" (replace) and "M" (go)
# abbreviations and the deprecated "Q" message can be reached.
![abcdefghijklmnopqrstuvwxyzBDEKGMPQRUWS+-<>0^] {
  # error message about a misplaced character
  put; clear;
  add "!! Misplaced character '"; get;
  add "' in script near line "; lines;
  add " (character "; chars; add ") \n";
  print; clear; quit;
}

# my testclass implementation cannot handle complex lists
# eg [a-z+-] this is why I have to write out the whole alphabet
# (O and F appear here only so that an upper case EOF test can be read)
while [abcdefghijklmnopqrstuvwxyzBDEOFKGMPQRUWS+-<>0^];

#----------------------------------
# KEYWORDS
# here we can test for all the keywords (command words) and their
# abbreviated one letter versions (eg: clip k, clop K etc).
# Then we can print an error message and abort if the word is not a
# legal keyword for the parse-edit language

# make ll an alias for "lines" and cc an alias for chars
"ll" { clear; add "lines"; }
"cc" { clear; add "chars"; }

# one letter command abbreviations
# (each rule replaces the abbreviation with the full command word,
# which is then matched by the keyword rules further down)
"a" { clear; add "add"; }
"k" { clear; add "clip"; }
"K" { clear; add "clop"; }
"D" { clear; add "replace"; }
"d" { clear; add "clear"; }
"t" { clear; add "print"; }
"p" { clear; add "pop"; }
"P" { clear; add "push"; }
"u" { clear; add "unstack"; }
"U" { clear; add "stack"; }
"G" { clear; add "put"; }
"g" { clear; add "get"; }
"x" { clear; add "swap"; }
">" { clear; add "++"; }
"<" { clear; add "--"; }
"m" { clear; add "mark"; }
"M" { clear; add "go"; }
"r" { clear; add "read"; }
"R" { clear; add "until"; }
"w" { clear; add "while"; }
"W" { clear; add "whilenot"; }
"n" { clear; add "count"; }
"+" { clear; add "a+"; }
"-" { clear; add "a-"; }
"0" { clear; add "zero"; }
"c" { clear; add "chars"; }
"l" { clear; add "lines"; }
"^" { clear; add "escape"; }
"v" { clear; add "unescape"; }
"z" { clear; add "delim"; }
"S" { clear; add "state"; }
"q" { clear; add "quit"; }
"s" { clear; add "write"; }
"o" { clear; add "nop"; }
"rs" { clear; add "restart"; }
"rp" { clear; add "reparse"; }

# some extra syntax for testeof and testtape
# (the angle-bracket forms had been reduced to empty strings, which
# can never match because at least one character has been read)
"<eof>","<EOF>" { put; clear; add "eof*"; push; .reparse }
# NOTE(review): "=" is not in the keyword character class, so this
# form looks unreachable as written; confirm against upstream.
"<==>" { put; clear; add "tapetest*"; push; .reparse }

# assembly-only instructions are rejected with an explanation
"jump","jumptrue","jumpfalse",
"testis","testclass","testbegins","testends",
"testeof","testtape" {
  put; clear;
  add "The instruction '"; get; add "' near line "; lines;
  add " (character "; chars; add ")\n";
  add "can be used in pep assembly code but not scripts. \n";
  print; clear; quit;
}

# show information if these "deprecated" commands are used
"Q","bail","state" {
  put; clear;
  add "The instruction '"; get; add "' near line "; lines;
  add " (character "; chars; add ")\n";
  add "is no longer part of the pep language (july 2020). 
\n";
  add "use 'quit' instead of 'bail', and use 'unstack; print;' \n";
  add "instead of 'state'. \n";
  print; clear; quit;
}

# every legal command word becomes a "word*" token, with the
# word itself saved as the token attribute
"add","clip","clop","replace","upper","lower","cap","clear","print",
"pop","push","unstack","stack","put","get","swap",
"++","--","mark","go","read","until","while","whilenot",
"count","a+","a-","zero","chars","lines","nochars","nolines",
"escape","unescape","delim","quit",
"write","nop","reparse","restart" {
  put; clear;
  add "word*";
  push; .reparse
}

#------------
# the .reparse command and "parse label" is a simple way to
# make sure that all shift-reductions occur. It should be used inside
# a block test, so as not to create an infinite loop. There is
# no "goto" in java so we need to use labelled loops to
# implement .reparse/parse>
"parse>" {
  # the accumulator is non-zero once a parse> label has been seen
  clear; count;
  !"0" {
    clear;
    add "script error:\n";
    add " extra parse> label at line "; lines; add ".\n";
    print; quit;
  }
  clear; add "// parse>";
  put; clear; add "parse>*"; push;
  # use accumulator to indicate after parse> label
  a+;
  .reparse
}

# --------------------
# implement "begin-blocks", which are only executed
# once, at the beginning of the script (similar to awk's BEGIN {} rules)
"begin" { put; add "*"; push; .reparse }

# nothing above matched: the word is not part of the language
add " << unknown command on line "; lines;
add " (char "; chars; add ")";
add " of source file. \n";
print; clear; quit;

# ----------------------------------
# PARSING PHASE:
# Below is the parse/compile phase of the script. Here we pop tokens off the
# stack and check for sequences of tokens eg "word*semicolon*". If we find a
# valid series of tokens, we "shift-reduce" or "resolve" the token series eg
# word*semicolon* --> command*
#
# At the same time, we manipulate (transform) the attributes on the tape, as
# required.
#

parse>

#-------------------------------------
# 2 tokens
#-------------------------------------
pop; pop;

# All of the patterns below are currently errors, but may not
# be in the future if we expand the syntax of the parse
# language. Also consider:
# begintext* endtext* quoteset* notclass*, !* ,* ;* B* E*
# It is nice to trap the errors here because we can emit some
# (hopefully not very cryptic) error messages with a line number.
# Otherwise the script writer has to debug with
# pep -a asm.pp -I scriptfile
#
"word*word*","word*}*","word*begintext*","word*endtext*", "word*!*",
"word*,*","quote*word*", "quote*class*", "quote*state*", "quote*}*",
"quote*begintext*", "quote*endtext*",
"class*word*", "class*quote*", "class*class*", "class*state*", "class*}*",
"class*begintext*", "class*endtext*", "class*!*",
"notclass*word*", "notclass*quote*", "notclass*class*",
"notclass*state*", "notclass*}*" {
  # show both token attributes to help locate the mistake
  add " (Token stack) \nValue: \n"; get; add "\nValue: \n"; ++; get; --;
  add "\n";
  add "Error near line "; lines;
  add " (char "; chars; add ")";
  add " of pep script (missing semicolon?) \n";
  print; clear; quit;
}

"{*;*", ";*;*", "}*;*" {
  push; push;
  add "Error near line "; lines;
  add " (char "; chars; add ")";
  add " of pep script: misplaced semi-colon? ; \n";
  print; clear; quit;
}

",*{*" {
  push; push;
  add "Error near line "; lines;
  add " (char "; chars; add ")";
  add " of script: extra comma in list? \n";
  print; clear; quit;
}

"command*;*","commandset*;*" {
  push; push;
  add "Error near line "; lines;
  add " (char "; chars; add ")";
  add " of script: extra semi-colon? \n";
  print; clear; quit;
}

"!*!*" {
  push; push;
  add "error near line "; lines;
  add " (char "; chars; add ")";
  add " of script: \n double negation '!!' is not implemented \n";
  add " and probably won't be, because what would be the point? \n";
  print; clear; quit;
}

"!*{*","!*;*" {
  push; push;
  add "error near line "; lines;
  add " (char "; chars; add ")";
  add " of script: misplaced negation operator (!)? \n";
  add " The negation operator precedes tests, for example: \n";
  add " !B'abc'{ ... } or !(eof),!'abc'{ ... 
} \n";
  print; clear; quit;
}

",*command*" {
  push; push;
  add "error near line "; lines;
  add " (char "; chars; add ")";
  add " of script: misplaced comma? \n";
  print; clear; quit;
}

"!*command*" {
  push; push;
  add "error near line "; lines;
  add " (at char "; chars; add ") \n";
  add " The negation operator (!) cannot precede a command \n";
  print; clear; quit;
}

";*{*", "command*{*", "commandset*{*" {
  push; push;
  add "error near line "; lines;
  add " (char "; chars; add ")";
  add " of script: no test for brace block? \n";
  print; clear; quit;
}

"{*}*" {
  push; push;
  add "error near line "; lines;
  add " of script: empty braces {}. \n";
  print; clear; quit;
}

"B*class*","E*class*" {
  push; push;
  add "error near line "; lines;
  add " of script:\n classes ([a-z], [:space:] etc). \n";
  add " cannot use the 'begin' or 'end' modifiers (B/E) \n";
  print; clear; quit;
}

"comment*{*" {
  push; push;
  add "error near line "; lines;
  add " of script: comments cannot occur between \n";
  add " a test and a brace ({). \n";
  print; clear; quit;
}

"}*command*" {
  push; push;
  add "error near line "; lines;
  add " of script: extra closing brace '}' ?. \n";
  print; clear; quit;
}

#*
E"begin*".!"begin*" {
  push; push;
  add "error near line "; lines;
  add " of script: Begin blocks must precede code \n";
  print; clear; quit;
}
*#

#------------
# The .restart command jumps to the first instruction after the
# begin block (if there is a begin block), or the first instruction
# of the script.
# NOTE(review): rust labels need a leading apostrophe, so the emitted
# "continue script;", "break lex;" and "continue parse;" will not
# compile as they stand. They are left unchanged here because they
# must be changed together with the code that emits the label
# definitions.
".*word*" {
  clear; ++; get; --;
  "restart" {
    clear; add "continue script;";
    # not required because we have labelled loops,
    # continue script works both before and after the parse> label
    # "0" { clear; add "continue script;"; }
    # "1" { clear; add "break lex;"; }
    put; clear;
    add "command*";
    push; .reparse
  }
  "reparse" {
    clear; count;
    # check accumulator to see if we are in the "lex" block
    # or the "parse" block and adjust the .reparse compilation
    # accordingly.
    "0" { clear; add "break lex;"; }
    "1" { clear; add "continue parse;"; }
    put; clear;
    add "command*";
    push; .reparse
  }
  push; push;
  add "error near line "; lines;
  add " (char "; chars; add ")";
  add " of script: \n";
  add " misplaced dot '.' (use for AND logic or in .reparse/.restart \n";
  print; clear; quit;
}

#---------------------------------
# Compiling comments so as to transfer them to the java
"comment*command*","command*comment*","commandset*comment*" {
  clear; get; add "\n"; ++; get; --; put; clear;
  add "command*"; push; .reparse
}

"comment*comment*" {
  clear; get; add "\n"; ++; get; --; put; clear;
  add "comment*"; push; .reparse
}

# -----------------------
# negated tokens.
#
# This is a new more elegant way to negate a whole set of
# tests (tokens) where the negation logic is stored on the
# stack, not in the current tape cell. We just add "not" to
# the stack token.
# eg: ![:alpha:] ![a-z] ![abcd] !"abc" !B"abc" !E"xyz"
# This format is used to indicate a negative test for
# a brace block. eg: ![aeiou] { add "< not a vowel"; print; clear; }
"!*quote*","!*class*","!*begintext*", "!*endtext*",
"!*eof*","!*tapetest*" {
  # a simplification: store the token name "quote*/class*/..."
  # in the tape cell corresponding to the "!*" token.
  replace "!*" "not"; push;
  # this was a bug?? a missing ++; ??
  # now get the token-value
  get; --; put; ++; clear;
  .reparse
}

#-----------------------------------------
# format: E"text" or E'text'
# This format is used to indicate a "workspace-ends-with" text before
# a brace block.
"E*quote*" {
  clear; add "endtext*"; push; get;
  '""' {
    # empty argument is an error
    clear; add "pep script error near line "; lines;
    add " (character "; chars; add "): \n";
    add ' empty argument for end-test (E"") \n';
    print; quit;
  }
  --; put; ++;
  clear; .reparse
}

#-----------------------------------------
# format: B"sometext" or B'sometext'
# A 'B' preceding some quoted text is used to indicate a
# 'workspace-begins-with' test, before a brace block.
"B*quote*" {
  clear; add "begintext*"; push; get;
  '""' {
    # empty argument is an error
    clear; add "pep script error near line "; lines;
    add " (character "; chars; add "): \n";
    add ' empty argument for begin-test (B"") \n';
    print; quit;
  }
  --; put; ++;
  clear; .reparse
}

#--------------------------------------------
# ebnf: command := word, ';' ;
# formats: "pop; push; clear; print; " etc
# all commands need to end with a semi-colon except for
# .reparse and .restart
#
# Each test below replaces the command word with the rust statement
# that implements it and saves that text as the token attribute.
# The tape cell is indexed with "mm.cell as usize" because a Vec
# can only be indexed with usize.
"word*;*" {
  clear;
  # check if command requires parameter
  get;
  "add", "until", "while", "whilenot", "mark", "go",
  "escape", "unescape", "delim", "replace" {
    put; clear; add "'"; get; add "'";
    add " << command needs an argument, on line "; lines;
    add " of script.\n";
    print; clear; quit;
  }
  "clip" {
    clear; add "mm.work.pop(); /* clip */"; put;
  }
  "clop" {
    clear;
    add "if !mm.work.is_empty() { /* clop */\n";
    add " mm.work.remove(0); \n} ";
    put;
  }
  "clear" {
    clear; add "mm.work.clear(); /* clear */"; put;
  }
  "upper" {
    # plain assignment: "let mm.work = ..." is not a valid binding
    clear; add "mm.work = mm.work.to_uppercase(); /* upper */"; put;
  }
  "lower" {
    clear; add "mm.work = mm.work.to_lowercase(); /* lower */"; put;
  }
  "cap" {
    # upper-case the first character and re-attach the remainder
    clear;
    add "if !mm.work.is_empty() { /* cap */\n";
    add "mm.work = mm.work.remove(0).to_uppercase().to_string() + ";
    add " &mm.work; } ";
    put;
  }
  "print" {
    clear; add "print!(\"{}\", mm.work); /* print */"; put;
  }
  "pop" { clear; add "mm.pop();"; put; }
  "push" { clear; add "mm.push();"; put; }
  "unstack" {
    # a rust while loop needs a body, even an empty one
    clear; add "while mm.pop() {} /* unstack */"; put;
  }
  "stack" {
    clear; add "while mm.push() {} /* stack */"; put;
  }
  "put" {
    clear;
    add "mm.tape[mm.cell as usize].clear(); /* put */\n";
    add "mm.tape[mm.cell as usize].push_str(&mm.work); ";
    put;
  }
  "get" {
    clear;
    add "mm.work.push_str(&mm.tape[mm.cell as usize]); /* get */";
    put;
  }
  "swap" {
    # exchange the workspace with the current tape cell; the full
    # path is used so no "use std::mem" line is required
    clear;
    add "std::mem::swap(&mut mm.work, &mut mm.tape[mm.cell as usize]); /* swap */";
    put;
  }
  "++" { clear; add "mm.increment(); /* ++ */"; put; }
  "--" { clear; add "if mm.cell > 0 { mm.cell -= 1; } /* -- */"; put; }
  # NOTE(review): confirm that the machine template defines read()
  "read" { clear; add "mm.read(); /* read */"; put; }
  "count" {
    # numbers must be converted to text before they can be appended
    clear;
    add "mm.work.push_str(&mm.accumulator.to_string()); /* count */";
    put;
  }
  "a+" { clear; add "mm.accumulator += 1; /* a+ */"; put; }
  "a-" { clear; add "mm.accumulator -= 1; /* a- */"; put; }
  "zero" { clear; add "mm.accumulator = 0; /* zero */"; put; }
  "chars" {
    clear;
    add "mm.work.push_str(&mm.charsRead.to_string()); /* chars */";
    put;
  }
  "lines" {
    clear;
    add "mm.work.push_str(&mm.linesRead.to_string()); /* lines */";
    put;
  }
  "nochars" { clear; add "mm.charsRead = 0; /* nochars */"; put; }
  "nolines" { clear; add "mm.linesRead = 0; /* nolines */"; put; }
  # use a labelled loop to quit script.
  # NOTE(review): rust needs "break 'script;" here; change it together
  # with the code that emits the label definition.
  "quit" { clear; add "break script;"; put; }
  "write" {
    # borrow the workspace so that it is not moved into fs::write
    clear;
    add 'fs::write("sav.pp", &mm.work).expect("Unable to write file");';
    put;
  }
  # just eliminate since it does nothing.
  "nop" { clear; add "/* nop: no-operation eliminated */"; put; }
  clear; add "command*";
  push; .reparse
}

#-----------------------------------------
# ebnf: commandset := command , command ;
"command*command*", "commandset*command*" {
  clear;
  add "commandset*"; push;
  # format the tape attributes. Add the next command on a newline
  --; get; add "\n";
  ++; get; --;
  put; ++; clear;
  .reparse
}

#-------------------
# here we begin to parse "test*" and "ortestset*" and "andtestset*"
#

#-------------------
# eg: B"abc" {} or E"xyz" {}
# transform and markup the different test types
# NOTE(review): "mm.work.matches(" is carried over from the java
# translator; rust's str::matches is not a regex test. Confirm the
# intended replacement.
"begintext*,*","endtext*,*","quote*,*","class*,*",
"eof*,*","tapetest*,*",
"begintext*.*","endtext*.*","quote*.*","class*.*",
"eof*.*","tapetest*.*",
"begintext*{*","endtext*{*","quote*{*","class*{*",
"eof*{*","tapetest*{*" {
  B"begin" { clear; add "mm.work.starts_with("; }
  B"end" { clear; add "mm.work.ends_with("; }
  B"quote" { clear; add "mm.work.eq(&"; }
  B"class" { clear; add "mm.work.matches("; }
  # clear the tapecell for testeof and testtape because
  # they take no arguments.
B"eof" { clear; put; add "mm.eof"; }
  # the closing ")" is appended below, so only the index bracket is
  # closed here ("mm.cell as usize" because a Vec is indexed by usize)
  B"tapetest" { clear; put; add "(mm.work.eq(&mm.tape[mm.cell as usize])"; }
  get;
  !B"mm.eof" { add ")"; }
  put;
  #*
  # maybe we could ellide the not tests by doing here
  B"not" { clear; add "!"; get; put; }
  *#
  clear; add "test*"; push;
  # the trick below pushes the right token back on the stack.
  get; add "*"; push;
  .reparse
}

#-------------------
# negated tests
# eg: !B"xyz {} !(eof) {} !(==) {}
# !E"xyz" {}
# !"abc" {}
# ![a-z] {}
"notbegintext*,*","notendtext*,*","notquote*,*","notclass*,*",
"noteof*,*","nottapetest*,*",
"notbegintext*.*","notendtext*.*","notquote*.*","notclass*.*",
"noteof*.*","nottapetest*.*",
"notbegintext*{*","notendtext*{*","notquote*{*","notclass*{*",
"noteof*{*","nottapetest*{*" {
  B"notbegin" { clear; add "!mm.work.starts_with("; }
  B"notend" { clear; add "!mm.work.ends_with("; }
  B"notquote" { clear; add "!mm.work.eq(&"; }
  B"notclass" { clear; add "!mm.work.matches("; }
  # clear the tapecell for testeof and testtape because
  # they take no arguments.
  B"noteof" { clear; put; add "!mm.eof"; }
  B"nottapetest" { clear; put; add "(!mm.work.eq(&mm.tape[mm.cell as usize])"; }
  get;
  !B"!mm.eof" { add ")"; }
  put;
  clear; add "test*"; push;
  # the trick below pushes the right token back on the stack.
  get; add "*"; push;
  .reparse
}

#-------------------
# 3 tokens
#-------------------
pop;

#-----------------------------
# some 3 token errors!!!

# not a comprehensive list of 3 token errors
"{*quote*;*","{*begintext*;*","{*endtext*;*","{*class*;*",
"commandset*quote*;*", "command*quote*;*" {
  push; push; push;
  add "[pep error]\n invalid syntax near line "; lines;
  add " (char "; chars; add ")";
  add " of script (misplaced semicolon?) \n";
  print; clear; quit;
}

# to simplify subsequent tests, transmogrify a single command
# to a commandset (multiple commands).
"{*command*}*" {
  clear; add "{*commandset*}*"; push; push; push;
  .reparse
}

# errors! mixing AND and OR concatenation
",*andtestset*{*", ".*ortestset*{*" {
  # push the tokens back to make debugging easier
  push; push; push;
  add " error: mixing AND (.) and OR (,) concatenation in \n";
  add " in pep script near line "; lines;
  add " (character "; chars; add ") \n";
  add ' For example: B".".!E"/".[abcd./] { print; } # Correct! B".".!E"/",[abcd./] { print; } # Error! \n';
  print; clear; quit;
}

#--------------------------------------------
# ebnf: command := keyword , quoted-text , ";" ;
# format: add "text";
"word*quote*;*" {
  clear; get;
  "replace" {
    # error
    add "< command requires 2 parameters, not 1 \n";
    add "near line "; lines;
    add " of script. \n";
    print; clear; quit;
  }
  # check whether argument is single character, otherwise
  # throw an error
  "escape", "unescape", "while", "whilenot" {
    # This is trickier than I thought it would be.
    clear; ++; get; --;
    # check that arg not empty, (but an empty quote is ok
    # for the second arg of 'replace'
    '""' {
      clear; add "[pep error] near line "; lines;
      add " (or char "; chars; add "): \n";
      add " command '"; get; add '\' cannot have an empty argument ("") \n';
      print; quit;
    }
    # quoted text has the quotes still around it.
    # also handle escape characters like \n \r etc
    # NOTE(review): the flattened source does not show whether the
    # final "clip;" on the next line is live code or part of the
    # commented-out text; it is kept exactly as found. Confirm upstream.
    clip; clop; clop; clop; # B "\\" { clip; } clip;
    !"" {
      clear; add "Pep script error near line "; lines;
      add " (character "; chars; add "): \n";
      add " command '"; get; add "' takes only a single character argument. \n";
      print; quit;
    }
    # restore the command word for the tests below
    clear; get;
  }
  "mark" {
    clear;
    add "/* mark */ \n";
    add "mm.marks[mm.cell as usize].clear(); // mark \n";
    add "mm.marks[mm.cell as usize].push_str("; ++; get; --;
    add "); // mark";
    put; clear;
    add "command*";
    push; .reparse
  }
  "go" {
    # NOTE(review): confirm that the machine template defines goToMark()
    clear;
    add "mm.goToMark("; ++; get; --;
    add "); /* go */";
    put; clear;
    add "command*";
    push; .reparse
  }
  "delim" {
    clear;
    # this.delimiter.setCharAt(0, text.charAt(0));
    # only the first character of the delimiter argument is used.
add "mm.delimiter.clear(); /* delim */\n";
    add "mm.delimiter.push_str("; ++; get; --;
    add "); ";
    put; clear;
    add "command*";
    push; .reparse
  }
  "add" {
    clear;
    add "mm.work.push_str("; ++; get; --;
    # handle multiline text
    replace "\n" '"); \nmm.work.push_str("\\n';
    add "); /* add */";
    put; clear;
    add "command*";
    push; .reparse
  }
  # while/whilenot/escape/unescape take a one character argument which
  # is still a quoted string here, so the emitted rust extracts its
  # first char with chars().next() (rust strings have no charAt). The
  # argument was checked to be non-empty above, so unwrap() cannot fail.
  "while" {
    clear;
    add "while (mm.peep == "; ++; get; --;
    add ".chars().next().unwrap()) /* while */\n ";
    add " { if mm.eof {break;} mm.read(); }";
    put; clear;
    add "command*";
    push; .reparse
  }
  "whilenot" {
    clear;
    add "while (mm.peep != "; ++; get; --;
    add ".chars().next().unwrap()) /* whilenot */\n ";
    add " { if mm.eof {break;} mm.read(); }";
    put; clear;
    add "command*";
    push; .reparse
  }
  "until" {
    clear;
    add "mm.until("; ++; get; --;
    # error until cannot have empty argument
    'mm.until(""' {
      clear;
      add "Pep script error near line "; lines;
      add " (character "; chars; add "): \n";
      add " empty argument for 'until' \n";
      add " For example: until '.txt'; until \">\"; # correct until ''; until \"\"; # errors! \n";
      print; quit;
    }
    # handle multiline argument
    replace "\n" "\\n";
    add ');';
    put; clear;
    add "command*";
    push; .reparse
  }
  # But really, can't the "replace" command just be used
  # instead of escape/unescape?? This seems a flaw in the
  # machine design.
  "escape","unescape" {
    clear; add "mm."; get; add "Char";
    add "("; ++; get; --; add '.chars().next().unwrap());';
    put; clear;
    add "command*";
    push; .reparse
  }
  # error, superfluous argument
  add ": command does not take an argument \n";
  add "near line "; lines;
  add " of script. \n";
  print; clear;
  #state
  quit;
}

#----------------------------------
# format: "while [:alpha:] ;" or whilenot [a-z] ;
# NOTE(review): "mm.peep.matches(" comes from the java translator and
# rust's char has no such method; confirm the intended replacement.
"word*class*;*" {
  clear; get;
  "while" {
    clear;
    add "/* while */ \n";
    add "while (mm.peep.matches("; ++; get; --;
    add ")) { if mm.eof { break; } mm.read(); }";
    put; clear;
    add "command*";
    push; .reparse
  }
  "whilenot" {
    # the negation goes inside the outer bracket so that the two
    # closing brackets added below balance
    clear;
    add "/* whilenot */ \n";
    add "while (!mm.peep.matches("; ++; get; --;
    add ")) { if mm.eof { break; } mm.read(); }";
    put; clear;
    add "command*";
    push; .reparse
  }
  # error
  add " < command cannot have a class argument \n";
  add "line "; lines; add ": error in script \n";
  print; clear; quit;
}

# arrange the parse> label loops
# NOTE(review): rust labels are written 'lex: and 'parse: and used as
# "break 'parse;". The label text is left unchanged here because it
# must be changed together with the rules that emit "break lex;" and
# "continue parse;".
(eof) {
  "commandset*parse>*commandset*","command*parse>*commandset*",
  "commandset*parse>*command*","command*parse>*command*" {
    clear;
    # indent both code blocks
    add " "; get; replace "\n" "\n "; put; clear; ++; ++;
    add " "; get; replace "\n" "\n "; put; clear; --; --;
    # add a block so that .reparse works before the parse> label.
    add "lex: { \n"; get; add "\n}\n"; ++; ++;
    # indent code block
    # add " "; get; replace "\n" "\n "; put; clear;
    add "parse: \n";
    add "loop { \n"; get;
    add "\n break parse;\n}"; --; --;
    put; clear;
    add "commandset*";
    push; .reparse
  }
}

# -------------------------------
# 4 tokens
# -------------------------------
pop;

#-------------------------------------
# bnf: command := replace , quote , quote , ";" ;
# example: replace "and" "AND" ;
"word*quote*quote*;*" {
  clear; get;
  "replace" {
    #---------------------------
    # a command plus 2 arguments, eg replace "this" "that"
    # "temp" is declared with let and passed by reference, because
    # push_str takes a &str
    clear;
    add "/* replace */ \n";
    add "if !mm.work.is_empty() { \n";
    add " let temp = mm.work.replace("; ++; get; add ", "; ++; get; add ");\n";
    add " mm.work.clear(); \n";
    add " mm.work.push_str(&temp);\n} ";
    --; --;
    put; clear;
    add "command*";
    push; .reparse
  }
  add "pep script error on line "; lines;
  add " (character "; chars; add "): \n";
  add " command does not take 2 quoted arguments. 
\n";
  print; quit;
}

#-------------------------------------
# format: begin { #* commands *# }
# "begin" blocks which are only executed once (they
# are assembled before the "start:" label. They must come before
# all other commands.
#
"begin*{*command*}*",
"begin*{*commandset*}*" {
  # the block body (third token) becomes the beginblock attribute
  clear;
  ++; ++; get; --; --; put; clear;
  add "beginblock*";
  push; .reparse
}

# -------------
# parses and compiles concatenated tests
# eg: 'a',B'b',E'c',[def],[:space:],[g-k] { ...
# these 2 tests should be all that is necessary
"test*,*ortestset*{*",
"test*,*test*{*" {
  # join the first and third attributes with a logical OR
  clear; get; add " || "; ++; ++; get; --; --; put; clear;
  add "ortestset*{*";
  push; push;
  .reparse
}

# dont mix AND and OR concatenations

# -------------
# AND logic
# parses and compiles concatenated AND tests
# eg: 'a',B'b',E'c',[def],[:space:],[g-k] { ...
# it is possible to elide this block with the negated block
# for compactness but maybe readability is not as good.
# negated tests can be chained with non negated tests.
# eg: B'http' . !E'.txt' { ... }
"test*.*andtestset*{*",
"test*.*test*{*" {
  # join the first and third attributes with a logical AND
  clear; get; add " && "; ++; ++; get; --; --; put; clear;
  add "andtestset*{*";
  push; push;
  .reparse
}

#-------------------------------------
# we should not have to check for the {*command*}* pattern
# because that has already been transformed to {*commandset*}*
"test*{*commandset*}*",
"andtestset*{*commandset*}*",
"ortestset*{*commandset*}*" {
  clear;
  # indent the java code for readability
  ++; ++; add " "; get; replace "\n" "\n "; put; --; --;
  # wrap the indented commands in an if statement on the test
  clear; add "if ("; get; add ") {\n";
  ++; ++; get; add "\n}";
  --; --; put; clear;
  add "command*";
  push;
  # always reparse/compile
  .reparse
}

# -------------
# multi-token end-of-stream errors
# not a comprehensive list of errors...
(eof) {
  E"begintext*",E"endtext*",E"test*",E"ortestset*",E"andtestset*" {
    add " Error near end of script at line "; lines;
    add ". Test with no brace block? 
\n"; print; clear; quit;
    }
    # the script ends with a command argument or command word that
    # was never terminated.
    E"quote*",E"class*",E"word*"{
      # save the parse tokens so they can be shown after the message
      put; clear;
      add "Error at end of pep script near line "; lines;
      add ": missing semi-colon? \n";
      add "Parse stack: "; get; add "\n";
      print; clear; quit;
    }
    # the script ends with a punctuation token
    E"{*", E"}*", E";*", E",*", E".*", E"!*", E"B*", E"E*" {
      put; clear;
      add "Error: misplaced terminal character at end of script! (line ";
      lines; add "). \n";
      add "Parse stack: "; get; add "\n";
      print; clear; quit;
    }
  }

  # put the 4 (or less) tokens back on the stack
  push; push; push; push;

  (eof) {
    print; clear;
    # create the rust preamble (the machine struct, its methods and
    # the start of main) and save it in the current tape cell.
    # Inside this single quoted text every single quote must be
    # written as \' and every backslash as \\ or else the pep
    # string ends early.
    add '
/* Rust code generated by "translate.rust.pss" */

// The generated program keeps the camelCase machine api and may not
// use every helper, import or label, so those lints are silenced.
#![allow(non_snake_case, dead_code, unused_imports, unused_variables)]
#![allow(unused_mut, unused_labels, unused_parens)]

// use std::mem; // for swap
use std::io;
use std::io::Read;
use std::io::BufReader;
use std::io::BufRead;
use std::process;
use std::fs;
use std::fs::File;

/// The pep virtual machine: a workspace, a token stack, a tape
/// and a one character lookahead on the input stream.
pub struct Machine<R> {
    accumulator: i32,   // counter for anything
    peep: char,         // next char in input stream
    charsRead: u32,     // No. of chars read so far
    linesRead: u32,     // No. of lines read so far
    work: String,       // text accumulator
    stack: Vec<String>, // parse token stack
    LENGTH: usize,      // tape initial length
    // vectors are growable in rust
    tape: Vec<String>,  // array of token attributes, growable
    marks: Vec<String>, // tape marks
    cell: usize,        // pointer to current cell
    input: R,           // text input stream
    eof: bool,          // end of stream reached?
    flag: bool,         // not used here
    escape: String,     // char used to "escape" others "\\"
    delimiter: String   // push/pop delimiter (default is "*")
}

impl<R: BufRead> Machine<R> {

    // read from stdin or from a file or a string.
    //   BufReader::new(io::stdin())
    //   BufReader::new(fs::File::open(filename).unwrap())
    //   let mut streader = StringReader::new("Line 1\\nLine 2");
    //   let mut bufreader = BufReader::new(streader);
    //
    // a machine with input from stdin:
    //   Machine::new(BufReader::new(io::stdin()))
    // a machine with input from a string (stringreader is a crate):
    //   let mut reader = StringReader::new(input);
    //   Machine::new(BufReader::new(reader))

    /// make a new machine with a buffered stream reader
    pub fn new(reader: R) -> Self {
        let mut machine = Self {
            LENGTH: 100,
            input: reader,
            eof: false,
            flag: false,
            charsRead: 0,
            linesRead: 1,
            escape: String::from("\\\\"),
            delimiter: String::from("*"),
            accumulator: 0,
            work: String::new(),
            // the stack starts empty so that pop() can report that
            stack: Vec::new(),
            cell: 0,
            tape: vec![String::new(); 100],
            marks: vec![String::new(); 100],
            peep: char::from(0u8),
        };
        // prime the one character lookahead
        match machine.readChar() {
            Some(c) => machine.peep = c,
            None => machine.eof = true,
        }
        machine
    }

    /// read one utf8 encoded character from the input stream.
    /// Returns None at end of stream; a read error is treated
    /// as end of stream as well.
    fn readChar(&mut self) -> Option<char> {
        let mut bytes = [0u8; 4];
        match self.input.read(&mut bytes[..1]) {
            Ok(1) => {}
            _ => return None,
        }
        // the leading byte gives the length of the sequence
        let length = match bytes[0] {
            0x00..=0x7F => 1,
            0xC0..=0xDF => 2,
            0xE0..=0xEF => 3,
            0xF0..=0xF7 => 4,
            _ => return Some(char::REPLACEMENT_CHARACTER),
        };
        if length > 1 && self.input.read_exact(&mut bytes[1..length]).is_err() {
            return Some(char::REPLACEMENT_CHARACTER);
        }
        match std::str::from_utf8(&bytes[..length]) {
            Ok(text) => text.chars().next(),
            Err(_) => Some(char::REPLACEMENT_CHARACTER),
        }
    }

    /// read one character from the input stream and
    /// update the machine. Exits the program when called
    /// after the end of the stream.
    pub fn readNext(&mut self) {
        if self.eof { process::exit(0); }
        self.charsRead += 1;
        // increment lines
        if self.peep == \'\\n\' { self.linesRead += 1; }
        self.work.push(self.peep);
        match self.readChar() {
            Some(c) => self.peep = c,
            None => self.eof = true,
        }
    }

    /// increment tape pointer by one, growing the tape if needed
    pub fn increment(&mut self) {
        self.cell += 1;
        if self.cell >= self.LENGTH {
            self.tape.push(String::new());
            self.marks.push(String::new());
            self.LENGTH += 1;
        }
    }

    /// remove escape character
    pub fn unescapeChar(&mut self, c: char) {
        if !self.work.is_empty() {
            let escaped = format!("\\\\{}", c);
            self.work = self.work.replace(&escaped, &c.to_string());
        }
    }

    /// add escape character
    pub fn escapeChar(&mut self, c: char) {
        if !self.work.is_empty() {
            let escaped = format!("\\\\{}", c);
            self.work = self.work.replace(c, &escaped);
        }
    }

    /// whether the run of escape characters just before the
    /// suffix of ss is odd (escaped) or even (not escaped).
    /// Only the length of the suffix is used.
    // untested code. check! eg try: add "x \\\\"; print; etc
    pub fn isEscaped(&mut self, ss: String, sSuffix: String) -> bool {
        let mark = match self.escape.chars().next() {
            Some(first) => first,
            None => return false,
        };
        let total = ss.chars().count();
        let tail = sSuffix.chars().count();
        if total < 2 || total <= tail {
            return false;
        }
        let run = ss.chars().rev().skip(tail)
            .take_while(|&ch| ch == mark).count();
        run % 2 == 1
    }

    /// a helper to see how many escape chars come just before
    /// the last occurrence of the suffix in the workspace
    pub fn countEscaped(&mut self, sSuffix: String) -> u32 {
        // an empty escape would match forever below
        if self.escape.is_empty() { return 0; }
        // remove suffix if it exists
        let mut rest: &str = match self.work.rfind(sSuffix.as_str()) {
            Some(index) if index > 0 => &self.work[..index],
            _ => "",
        };
        let mut count: u32 = 0;
        while let Some(shorter) = rest.strip_suffix(self.escape.as_str()) {
            count += 1;
            rest = shorter;
        }
        count
    }

    /// reads the input stream until the workspace ends with the
    /// given text (and that text is not escaped)
    pub fn until(&mut self, sSuffix: String) {
        // read at least one character
        if self.eof { return; }
        self.readNext();
        loop {
            if self.eof { return; }
            if self.work.ends_with(sSuffix.as_str())
                && self.countEscaped(sSuffix.clone()) % 2 == 0 {
                return;
            }
            self.readNext();
        }
    }

    /// pop the last token from the stack onto the front of the
    /// workspace. Returns false if the stack is empty.
    pub fn pop(&mut self) -> bool {
        match self.stack.pop() {
            Some(token) => {
                self.work.insert_str(0, &token);
                if self.cell > 0 { self.cell -= 1; }
                true
            }
            None => false,
        }
    }

    /// push the first token from the workspace to the stack.
    /// Returns false (and does nothing) if the workspace is empty.
    pub fn push(&mut self) -> bool {
        // dont increment the tape pointer on an empty push
        if self.work.is_empty() { return false; }
        // a token ends with the delimiter; with no delimiter the
        // whole workspace is the token
        let token: String = match self.work.find(self.delimiter.as_str()) {
            Some(pos) => self.work.drain(..pos + self.delimiter.len()).collect(),
            None => std::mem::take(&mut self.work),
        };
        self.stack.push(token);
        self.increment();
        true
    }

    // swap not required, use mem::swap

    /// save the workspace to file "sav.pp"
    pub fn writeToFile(&mut self) {
        fs::write("sav.pp", &self.work).expect("Unable to write file");
    }

    /// move the tape pointer to the cell with the given mark,
    /// exit with an error message if there is no such mark.
    pub fn goToMark(&mut self, mark: String) {
        match self.marks.iter().position(|item| *item == mark) {
            Some(ii) => self.cell = ii,
            None => {
                print!("badmark \'{}\'!", mark);
                process::exit(1);
            }
        }
    }

    /// parse/check/compile the input
    pub fn parse(&mut self) {
        // this is where the actual parsing/compiling code should go
        // but this means that all generated code must use
        // "self." not "mm."
    }
}

fn main() -> io::Result<()> {
    // BufReader::new(io::stdin())
    let mut temp: String = String::new();
    let mut mm = Machine::new(BufReader::new(io::stdin()));
\n';
    # save the code in the current tape cell
    put; clear;

    #---------------------
    # check if the script correctly parsed (there should only
    # be one token on the stack, namely "commandset*" or "command*").
    pop; pop;

    "commandset*", "command*" {
      clear;
      # indent generated code (6 spaces) for readability.
      add "      "; get; replace "\n" "\n      "; put; clear;
      # restore the rust preamble from the tape
      ++; get; --;
      add '    \'script: while !mm.eof {\n';
      get;
      add "\n    }";
      # main returns io::Result so it has to end with Ok(())
      add "\n    Ok(())\n}\n";
      #add "\n}\n";
      # put a copy of the final compilation into the tapecell
      # so it can be inspected interactively.
      put; print; clear; quit;
    }

    "beginblock*commandset*", "beginblock*command*" {
      clear;
      # indent begin block code
      add "    "; get; replace "\n" "\n    "; put; clear;
      # indent main code for readability.
      ++; add "      "; get; replace "\n" "\n      "; put; clear; --;
      # get rust preamble from tape (2 cells ahead), then append the
      # begin block code from the current cell
      ++; ++; get; --; --;
      get; add "\n"; ++;
      # a labelled loop for "quit" (but quit can just exit?)
      add "    'script: \n";
      add "    while !mm.eof {\n"; get;
      add "\n    }";
      # main returns io::Result so it has to end with Ok(())
      add "\n    Ok(())\n}\n";
      #add "\n}\n";
      # put a copy of the final compilation into the tapecell
      # for interactive debugging.
put; print; clear; quit;
    }

    # neither pattern matched: put the 2 popped tokens back
    push; push;

    # try to explain some more errors
    # (unstack moves every token on the stack into the workspace)
    unstack;
    B"parse>" {
      put; clear;
      add "[error] pep syntax error:\n";
      add " The parse> label cannot be the 1st item \n";
      add " of a script \n";
      print; quit;
    }

    # keep the unparsed token sequence in the current tape cell so
    # that it can be shown after the general message.
    put; clear;
    add "After compiling with 'translate.rust.pss' (at EOF): \n ";
    add " parse error in input script. \n ";
    print; clear;

    # The stack is already empty at this point, so a second
    # "unstack; put;" here would overwrite the saved tokens with an
    # empty workspace and the parse stack would never be displayed.
    add "Parse stack: "; get; add "\n";
    add " * debug script ";
    add " >> pep -If script -i 'some input' \n ";
    add " * debug compilation. \n ";
    add " >> pep -Ia asm.pp script \n ";
    print; clear;
    quit;
  } # not eof

  # there is an implicit .restart command here (jump start)