#* translate.rust.pss Not working, in development. This is a parse-script which translates parse-scripts into rust code, using the 'pep' tool. The script creates a standalone compilable rust program. The virtual machine and engine is implemented in plain c at http://bumble.sf.net/books/pars/pep.c. This implements a script language with a syntax reminiscent of sed and awk (much simpler than awk, but more complex than sed). STATUS Adapting code, but I am finding rust very confusing and unnecessarily complicated. May work on the cpp translator instead. Much harder to learn than "go" in my opinion NOTES no multiline comments in rust? We may use labelled loops and break/continue to implement the parse> label and .reparse .restart commands. Breaks are also used to implement the quit and bail commands. TODO Convert the parsing code to a method which takes an input stream as a parameter. This way the same parser/compiler can be used with a string/file/stdin etc and can also be used by other classes/objects. SEE ALSO At http://bumble.sf.net/books/pars/ pars/tr/translate.java.pss A very similar script for compiling scripts into java And all the other translation scripts in pars/tr/ TESTING * testing the multiple escaped until bug >> pep.rsas 'r;until"c";add".";t;d;' 'ab\\cab\cabc' GOTCHAS I was trying to run >> pep -e "r;a'\\';print;d;" -i "abc" and I kept getting an unterminated quote message, which I thought I had fixed in machine.interp.c (until code). But the problem was actually the bash shell which resolves \\ to \ in double quotes, but not single quotes! BUGS These are java bugs, not rust. Xdigit in java not valid class. Its a bit strange to talk about a multicharacter string being "escaped" (eg when calling 'until') but this is allowed in the pep engine. add "\{"; will generate an "illegal escape character" error when trying to compile the generated java code. I need to consider what to do in this situation (eg escape \ to \\ ?) SOLVED BUGS check "go/mark" code. 
what happens if the mark is not found?? throw error and exit I think.
found a bug in "replace" code, which was returning from inline code.

RUST NOTES

  * delete a string
  >> s.clear()

  * a vector of chars
  >> let mut chars: Vec<char> = pangram.chars().collect();

  * append a char to a string
  ----
    s.push(c);
    // append string
    string.push_str(", ");
  ,,,

  * iterate an array
  -----
    for word in pangram.split_whitespace().rev() {
      println!("> {}", word);
    }
  ,,,

TASKS

HISTORY

  4 july 2022
    Began to adapt from tr/translate.java.pss
    Also working on eg/sed.tojava.pss

*#

#--------------
# LEXING PHASE
# Each cycle starts by reading one character. The rules below turn the
# input into parse tokens ("quote*", "class*", "word*" ...) which are
# pushed on the stack, with the token text saved in the tape cell.
read;

#--------------
[:space:] { clear; .reparse }

#---------------
# We can elide all these single character tests, because
# the stack token is just the character itself with a *
# Braces {} are used for blocks of commands, ',' and '.' for concatenating
# tests with OR or AND logic. 'B' and 'E' for begin and end
# tests, '!' is used for negation, ';' is used to terminate a
# command.
"{", "}", ";", ",", ".", "!", "B", "E" {
  put; add "*"; push; .reparse
}

#---------------
# format: "text"
"\"" {
  # save the start line number (for error messages) in case
  # there is no terminating quote character.
  clear; add "line "; lines; add " (character "; chars; add ") ";
  put; clear;
  add '"'; until '"';
  !E'"' {
    clear;
    add 'Unterminated quote character (") starting at ';
    get; add ' !\n';
    print; quit;
  }
  put; clear;
  add "quote*"; push; .reparse
}

#---------------
# format: 'text', single quotes are converted to double quotes
# but we must escape embedded double quotes.
"'" {
  # save the start line number (for error messages) in case
  # there is no terminating quote character.
  clear; add "line "; lines; add " (character "; chars; add ") ";
  put; clear;
  until "'";
  !E"'" {
    clear;
    add "Unterminated quote (') starting at ";
    get; add '!\n';
    print; quit;
  }
  clip; escape '"'; put; clear;
  add "\""; get; add "\""; put; clear;
  add "quote*"; push; .reparse
}

#---------------
# formats: [:space:] [a-z] [abcd] [:alpha:] etc
# should class tests really be multiline??!
"[" {
  # save the start line number (for error messages) in case
  # there is no terminating bracket character.
  clear; add "line "; lines; add " (character "; chars; add ") ";
  put; clear;
  add "["; until "]";
  "[]" {
    clear;
    add "pep script error at line "; lines;
    add " (character "; chars; add "): \n";
    add " empty character class [] \n";
    print; quit;
  }
  !E"]" {
    clear;
    add "Unterminated class text ([...]) starting at ";
    get;
    add " class text can be used in tests or with the 'while' and 'whilenot' commands. For example: [:alpha:] { while [:alpha:]; print; clear; } ";
    print; quit;
  }
  # need to escape quotes so they dont interfere with the
  # quotes java needs for .matches("...")
  # NOTE(review): the class text is still built as a java regex; the
  # emitted rust has no regex matcher in std - confirm the target form.
  escape '"';
  # the caret is not a negation operator in pep scripts
  replace "^" "\\\\^";
  # save the class on the tape
  put;
  clop; clop;
  !B"-" {
    # not a range class, eg [a-z] so need to escape '-' chars
    # java requires a double escape
    clear; get; replace '-' '\\\\-'; put;
  }
  B"-" {
    # a range class, eg [a-z], check if it is correct
    clip; clip;
    !"-" {
      clear;
      add "Error in pep script at line "; lines;
      add " (character "; chars; add "): \n";
      add " Incorrect character range class "; get;
      add " For example: [a-g] # correct [f-gh] # error! \n";
      print; clear; quit;
    }
  }
  # restore class text
  clear; get;
  B"[:".!E":]" {
    clear;
    add "malformed character class starting at ";
    get; add '!\n';
    print; quit;
  }
  B"[:".!"[:]" {
    clip; clip; clop; clop;
    # unicode posix character classes in java
    # Also, abbreviations (not implemented in gh.c yet.)
    "alnum","N" { clear; add "\\\\p{Alnum}"; }
    "alpha","A" { clear; add "\\\\p{Alpha}"; }
    "ascii","I" { clear; add "\\\\p{ASCII}"; }
    "blank","B" { clear; add "\\\\p{Blank}"; }
    "cntrl","C" { clear; add "\\\\p{Cntrl}"; }
    "digit","D" { clear; add "\\\\p{Digit}"; }
    "graph","G" { clear; add "\\\\p{Graph}"; }
    "lower","L" { clear; add "\\\\p{Lower}"; }
    "print","P" { clear; add "\\\\p{Print}"; }
    "punct","T" { clear; add "\\\\p{Punct}"; }
    "space","S" { clear; add "\\\\p{Space}"; }
    "upper","U" { clear; add "\\\\p{Upper}"; }
    "xdigit","X" { clear; add "\\\\p{XDigit}"; }
    !B"\\\\p{" {
      put; clear;
      add "Pep script syntax error near line "; lines;
      add " (character "; chars; add "): \n";
      add "Unknown character class '"; get; add "'\n";
      print; clear; quit;
    }
  }
  #*
    alnum - alphanumeric like [0-9a-zA-Z]
    alpha - alphabetic like [a-zA-Z]
    blank - blank chars, space and tab
    cntrl - control chars, ascii 000 to 037 and 177 (del)
    digit - digits 0-9
    graph - graphical chars same as :alnum: and :punct:
    lower - lower case letters [a-z]
    print - printable chars ie :graph: + space
    punct - punctuation ie !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~.
    space - all whitespace, eg \n\r\t vert tab, space, \f
    upper - upper case letters [A-Z]
    xdigit - hexadecimal digit ie [0-9a-fA-F]
  *#
  put; clear;
  # add quotes around the class and limits around the
  # class so it can be used with the string.matches() method
  # (must match the whole string, not just one character)
  add '"^'; get; add '+$"';
  put; clear;
  add "class*"; push; .reparse
}

#---------------
# formats: (eof) (EOF) (==) etc.
"(" {
  clear; until ")"; clip; put;
  "eof","EOF" { clear; add "eof*"; push; .reparse }
  "==" { clear; add "tapetest*"; push; .reparse }
  add " << unknown test near line "; lines;
  add " of script.\n";
  add " bracket () tests are \n";
  add " (eof) test if end of stream reached. \n";
  add " (==) test if workspace is same as current tape cell \n";
  print; clear; quit;
}

#---------------
# multiline and single line comments, eg #... and #* ... *#
"#" {
  clear; read;
  "\n" { clear; .reparse }
  # checking for multiline comments of the form "#* \n\n\n *#"
  # these are just ignored at the moment (deleted)
  "*" {
    # save the line number for possible error message later
    clear; lines; put; clear;
    until "*#";
    E"*#" {
      # convert to /* ... */ java multiline comment
      clip; clip; put; clear;
      add "/*"; get; add "*/";
      # create a "comment" parse token
      put; clear;
      # comment-out this line to remove multiline comments from the
      # compiled java.
      # add "comment*"; push;
      .reparse
    }
    # make an unterminated multiline comment an error
    # to ease debugging of scripts.
    clear;
    add "unterminated multiline comment #* ... *# \n";
    add "starting at line number "; get; add "\n";
    print; clear; quit;
  }
  # single line comments. some will get lost.
  put; clear;
  add "//"; get; until "\n"; clip;
  put; clear;
  add "comment*"; push; .reparse
}

#----------------------------------
# parse command words (and abbreviations)
# legal characters for keywords (commands)
# D, M and Q are included so that the abbreviations "D" (replace) and
# "M" (go), and the deprecation message for "Q", can be reached below.
![abcdefghijklmnopqrstuvwxyzBEDKGMPQRUWS+-<>0^] {
  # error message about a misplaced character
  put; clear;
  add "!! Misplaced character '"; get;
  add "' in script near line "; lines;
  add " (character "; chars; add ") \n";
  print; clear; quit;
}
# my testclass implementation cannot handle complex lists
# eg [a-z+-] this is why I have to write out the whole alphabet
while [abcdefghijklmnopqrstuvwxyzBEDOFKGMPQRUWS+-<>0^];

#----------------------------------
# KEYWORDS
# here we can test for all the keywords (command words) and their
# abbreviated one letter versions (eg: clip k, clop K etc).
# Then we can print an error message and abort if the word is not a
# legal keyword for the parse-edit language
#
# The workspace holds one whole word at this point. Abbreviations are
# expanded to the full command name first; the full names are then
# turned into a "word*" token further down.

# make ll an alias for "lines" and cc an alias for chars
"ll" { clear; add "lines"; }
"cc" { clear; add "chars"; }

# one letter command abbreviations
"a" { clear; add "add"; }
"k" { clear; add "clip"; }
"K" { clear; add "clop"; }
"D" { clear; add "replace"; }
"d" { clear; add "clear"; }
"t" { clear; add "print"; }
"p" { clear; add "pop"; }
"P" { clear; add "push"; }
"u" { clear; add "unstack"; }
"U" { clear; add "stack"; }
"G" { clear; add "put"; }
"g" { clear; add "get"; }
"x" { clear; add "swap"; }
">" { clear; add "++"; }
"<" { clear; add "--"; }
"m" { clear; add "mark"; }
"M" { clear; add "go"; }
"r" { clear; add "read"; }
"R" { clear; add "until"; }
"w" { clear; add "while"; }
"W" { clear; add "whilenot"; }
"n" { clear; add "count"; }
"+" { clear; add "a+"; }
"-" { clear; add "a-"; }
"0" { clear; add "zero"; }
"c" { clear; add "chars"; }
"l" { clear; add "lines"; }
"^" { clear; add "escape"; }
"v" { clear; add "unescape"; }
"z" { clear; add "delim"; }
"S" { clear; add "state"; }
"q" { clear; add "quit"; }
"s" { clear; add "write"; }
"o" { clear; add "nop"; }
"rs" { clear; add "restart"; }
"rp" { clear; add "reparse"; }

# some extra syntax for testeof and testtape
# (the angle-bracket spellings; the keyword character class allows
# '<', '>' and the letters E, O, F for exactly this purpose)
"<eof>","<EOF>" { put; clear; add "eof*"; push; .reparse }
"<==>" { put; clear; add "tapetest*"; push; .reparse }

"jump","jumptrue","jumpfalse",
"testis","testclass","testbegins","testends",
"testeof","testtape" {
  put; clear;
  add "The instruction '"; get; add "' near line "; lines;
  add " (character "; chars; add ")\n";
  add "can be used in pep assembly code but not scripts. \n";
  print; clear; quit;
}

# show information if these "deprecated" commands are used
"Q","bail","state" {
  put; clear;
  add "The instruction '"; get; add "' near line "; lines;
  add " (character "; chars; add ")\n";
  add "is no longer part of the pep language (july 2020). \n";
  add "use 'quit' instead of 'bail', and use 'unstack; print;' \n";
  add "instead of 'state'. \n";
  print; clear; quit;
}

"add","clip","clop","replace","upper","lower","cap","clear","print",
"pop","push","unstack","stack","put","get","swap",
"++","--","mark","go","read","until","while","whilenot",
"count","a+","a-","zero","chars","lines","nochars","nolines",
"escape","unescape","delim","quit",
"write","nop","reparse","restart" {
  put; clear;
  add "word*"; push; .reparse
}

#------------
# the .reparse command and "parse label" is a simple way to
# make sure that all shift-reductions occur. It should be used inside
# a block test, so as not to create an infinite loop. There is
# no "goto" in java so we need to use labelled loops to
# implement .reparse/parse>
"parse>" {
  clear; count;
  !"0" {
    clear;
    add "script error:\n";
    add " extra parse> label at line "; lines; add ".\n";
    print; quit;
  }
  clear; add "// parse>"; put;
  clear; add "parse>*"; push;
  # use accumulator to indicate after parse> label
  a+;
  .reparse
}

# --------------------
# implement "begin-blocks", which are only executed
# once, at the beginning of the script (similar to awk's BEGIN {} rules)
"begin" { put; add "*"; push; .reparse }

# any word which reaches this point is not a command
add " << unknown command on line "; lines;
add " (char "; chars; add ")";
add " of source file. \n";
print; clear; quit;

# ----------------------------------
# PARSING PHASE:
# Below is the parse/compile phase of the script. Here we pop tokens off the
# stack and check for sequences of tokens eg "word*semicolon*". If we find a
# valid series of tokens, we "shift-reduce" or "resolve" the token series eg
# word*semicolon* --> command*
#
# At the same time, we manipulate (transform) the attributes on the tape, as
# required.
#
parse>

#-------------------------------------
# 2 tokens
#-------------------------------------
pop; pop;

# All of the patterns below are currently errors, but may not
# be in the future if we expand the syntax of the parse
# language.
# Also consider:
# begintext* endtext* quoteset* notclass*, !* ,* ;* B* E*
# It is nice to trap the errors here because we can emit some
# (hopefully not very cryptic) error messages with a line number.
# Otherwise the script writer has to debug with
# pep -a asm.pp -I scriptfile
#
# The workspace holds the top two stack tokens at this point.
"word*word*","word*}*","word*begintext*","word*endtext*",
"word*!*", "word*,*","quote*word*", "quote*class*",
"quote*state*", "quote*}*", "quote*begintext*",
"quote*endtext*", "class*word*", "class*quote*",
"class*class*", "class*state*", "class*}*",
"class*begintext*", "class*endtext*", "class*!*",
"notclass*word*", "notclass*quote*", "notclass*class*",
"notclass*state*", "notclass*}*" {
  add " (Token stack) \nValue: \n"; get;
  add "\nValue: \n"; ++; get; --; add "\n";
  add "Error near line "; lines;
  add " (char "; chars; add ")";
  add " of pep script (missing semicolon?) \n";
  print; clear; quit;
}

"{*;*", ";*;*", "}*;*" {
  push; push;
  add "Error near line "; lines;
  add " (char "; chars; add ")";
  add " of pep script: misplaced semi-colon? ; \n";
  print; clear; quit;
}

",*{*" {
  push; push;
  add "Error near line "; lines;
  add " (char "; chars; add ")";
  add " of script: extra comma in list? \n";
  print; clear; quit;
}

"command*;*","commandset*;*" {
  push; push;
  add "Error near line "; lines;
  add " (char "; chars; add ")";
  add " of script: extra semi-colon? \n";
  print; clear; quit;
}

"!*!*" {
  push; push;
  add "error near line "; lines;
  add " (char "; chars; add ")";
  add " of script: \n double negation '!!' is not implemented \n";
  add " and probably won't be, because what would be the point? \n";
  print; clear; quit;
}

"!*{*","!*;*" {
  push; push;
  add "error near line "; lines;
  add " (char "; chars; add ")";
  add " of script: misplaced negation operator (!)? \n";
  add " The negation operator precedes tests, for example: \n";
  add " !B'abc'{ ... } or !(eof),!'abc'{ ... } \n";
  print; clear; quit;
}

",*command*" {
  push; push;
  add "error near line "; lines;
  add " (char "; chars; add ")";
  add " of script: misplaced comma? \n";
  print; clear; quit;
}

"!*command*" {
  push; push;
  add "error near line "; lines;
  add " (at char "; chars; add ") \n";
  add " The negation operator (!) cannot precede a command \n";
  print; clear; quit;
}

";*{*", "command*{*", "commandset*{*" {
  push; push;
  add "error near line "; lines;
  add " (char "; chars; add ")";
  add " of script: no test for brace block? \n";
  print; clear; quit;
}

"{*}*" {
  push; push;
  add "error near line "; lines;
  add " of script: empty braces {}. \n";
  print; clear; quit;
}

"B*class*","E*class*" {
  push; push;
  add "error near line "; lines;
  add " of script:\n classes ([a-z], [:space:] etc). \n";
  add " cannot use the 'begin' or 'end' modifiers (B/E) \n";
  print; clear; quit;
}

"comment*{*" {
  push; push;
  add "error near line "; lines;
  add " of script: comments cannot occur between \n";
  add " a test and a brace ({). \n";
  print; clear; quit;
}

"}*command*" {
  push; push;
  add "error near line "; lines;
  add " of script: extra closing brace '}' ?. \n";
  print; clear; quit;
}

#*
E"begin*".!"begin*" {
  push; push;
  add "error near line "; lines;
  add " of script: Begin blocks must precede code \n";
  print; clear; quit;
}
*#

#------------
# The .restart command jumps to the first instruction after the
# begin block (if there is a begin block), or the first instruction
# of the script.
# NOTE(review): the emitted "continue script;", "break lex;" and
# "continue parse;" use bare label names; rust labels are written with
# a leading quote ('script). They must be changed together with the
# code that declares the labels.
".*word*" {
  clear; ++; get; --;
  "restart" {
    clear; add "continue script;";
    # not required because we have labelled loops,
    # continue script works both before and after the parse> label
    # "0" { clear; add "continue script;"; }
    # "1" { clear; add "break lex;"; }
    put; clear;
    add "command*"; push; .reparse
  }
  "reparse" {
    clear; count;
    # check accumulator to see if we are in the "lex" block
    # or the "parse" block and adjust the .reparse compilation
    # accordingly.
    "0" { clear; add "break lex;"; }
    "1" { clear; add "continue parse;"; }
    put; clear;
    add "command*"; push; .reparse
  }
  push; push;
  add "error near line "; lines;
  add " (char "; chars; add ")";
  add " of script: \n";
  add " misplaced dot '.' (use for AND logic or in .reparse/.restart) \n";
  print; clear; quit;
}

#---------------------------------
# Compiling comments so as to transfer them to the java
"comment*command*","command*comment*","commandset*comment*" {
  clear; get; add "\n"; ++; get; --; put; clear;
  add "command*"; push; .reparse
}

"comment*comment*" {
  clear; get; add "\n"; ++; get; --; put; clear;
  add "comment*"; push; .reparse
}

# -----------------------
# negated tokens.
#
# This is a new more elegant way to negate a whole set of
# tests (tokens) where the negation logic is stored on the
# stack, not in the current tape cell. We just add "not" to
# the stack token.
# eg: ![:alpha:] ![a-z] ![abcd] !"abc" !B"abc" !E"xyz"
# This format is used to indicate a negative test for
# a brace block. eg: ![aeiou] { add "< not a vowel"; print; clear; }
"!*quote*","!*class*","!*begintext*", "!*endtext*",
"!*eof*","!*tapetest*" {
  # a simplification: store the token name "quote*/class*/..."
  # in the tape cell corresponding to the "!*" token.
  replace "!*" "not"; push;
  # this was a bug?? a missing ++; ??
  # now get the token-value
  get; --; put; ++; clear;
  .reparse
}

#-----------------------------------------
# format: E"text" or E'text'
# This format is used to indicate a "workspace-ends-with" text before
# a brace block.
"E*quote*" {
  clear; add "endtext*"; push; get;
  '""' {
    # empty argument is an error
    clear;
    add "pep script error near line "; lines;
    add " (character "; chars; add "): \n";
    add ' empty argument for end-test (E"") \n';
    print; quit;
  }
  --; put; ++;
  clear; .reparse
}

#-----------------------------------------
# format: B"sometext" or B'sometext'
# A 'B' preceding some quoted text is used to indicate a
# 'workspace-begins-with' test, before a brace block.
"B*quote*" {
  clear; add "begintext*"; push; get;
  '""' {
    # empty argument is an error
    clear;
    add "pep script error near line "; lines;
    add " (character "; chars; add "): \n";
    add ' empty argument for begin-test (B"") \n';
    print; quit;
  }
  --; put; ++;
  clear; .reparse
}

#--------------------------------------------
# ebnf: command := word, ';' ;
# formats: "pop; push; clear; print; " etc
# all commands need to end with a semi-colon except for
# .reparse and .restart
#
# Each argument-less command is replaced by the rust statement(s) which
# perform it on the machine "mm". The tape is indexed with
# "mm.cell as usize" because the machine declares the cell pointer
# as u32.
"word*;*" {
  clear;
  # check if command requires parameter
  get;
  # todo: allow the mark; and go; syntax here
  # also writefile; syntax (use current tape cell)
  "add", "until", "while", "whilenot", "mark", "go",
  "escape", "unescape", "delim", "replace" {
    put; clear; add "'"; get; add "'";
    add " << command needs an argument, on line "; lines;
    add " of script.\n";
    print; clear; quit;
  }
  "clip" {
    clear; add "mm.work.pop(); /* clip */"; put;
  }
  "clop" {
    clear;
    add "if !mm.work.is_empty() { /* clop */\n";
    add " mm.work.remove(0); \n} ";
    put;
  }
  "clear" {
    clear; add "mm.work.clear(); /* clear */"; put;
  }
  # plain assignment: "let" cannot bind a field path such as mm.work
  "upper" {
    clear; add "mm.work = mm.work.to_uppercase(); /* upper */"; put;
  }
  "lower" {
    clear; add "mm.work = mm.work.to_lowercase(); /* lower */"; put;
  }
  "cap" {
    clear;
    add "if !mm.work.is_empty() { /* cap */\n";
    add "mm.work = mm.work.remove(0).to_uppercase().to_string() + ";
    add " &mm.work; } ";
    put;
  }
  "print" {
    clear; add "print!(\"{}\", mm.work); /* print */"; put;
  }
  "pop" { clear; add "mm.pop();"; put; }
  "push" { clear; add "mm.push();"; put; }
  # a rust "while" needs a block, even an empty one
  "unstack" {
    clear; add "while mm.pop() {} /* unstack */"; put;
  }
  "stack" {
    clear; add "while mm.push() {} /* stack */"; put;
  }
  "put" {
    clear;
    add "mm.tape[mm.cell as usize].clear(); /* put */\n";
    add "mm.tape[mm.cell as usize].push_str(&mm.work); ";
    put;
  }
  "get" {
    clear;
    add "mm.work.push_str(&mm.tape[mm.cell as usize]); /* get */";
    put;
  }
  # swap exchanges the workspace and the current tape cell. The full
  # path is used because "use std::mem" is commented out in the
  # generated program header.
  "swap" {
    clear;
    add "std::mem::swap(&mut mm.work, &mut mm.tape[mm.cell as usize]); /* swap */";
    put;
  }
  "++" { clear; add "mm.increment(); /* ++ */"; put; }
  "--" {
    clear; add "if mm.cell > 0 { mm.cell -= 1; } /* -- */"; put;
  }
  # NOTE(review): the machine code visible in this script names its
  # reader "readNext" - confirm that a "read" method also exists.
  "read" { clear; add "mm.read(); /* read */"; put; }
  # the counters are integers, so they are converted to text first
  "count" {
    clear;
    add "mm.work.push_str(&mm.accumulator.to_string()); /* count */";
    put;
  }
  "a+" { clear; add "mm.accumulator += 1; /* a+ */"; put; }
  "a-" { clear; add "mm.accumulator -= 1; /* a- */"; put; }
  "zero" { clear; add "mm.accumulator = 0; /* zero */"; put; }
  "chars" {
    clear;
    add "mm.work.push_str(&mm.charsRead.to_string()); /* chars */";
    put;
  }
  "lines" {
    clear;
    add "mm.work.push_str(&mm.linesRead.to_string()); /* lines */";
    put;
  }
  "nochars" { clear; add "mm.charsRead = 0; /* nochars */"; put; }
  "nolines" { clear; add "mm.linesRead = 0; /* nolines */"; put; }
  # use a labelled loop to quit script.
  # NOTE(review): rust labels need a leading quote ('script); change
  # this together with the declaration of the label.
  "quit" { clear; add "break script;"; put; }
  # the workspace is borrowed so that it is not moved out of the machine
  "write" {
    clear;
    add 'fs::write("sav.pp", &mm.work).expect("Unable to write file");';
    put;
  }
  # just eliminate since it does nothing.
  "nop" { clear; add "/* nop: no-operation eliminated */"; put; }
  clear; add "command*"; push; .reparse
}

#-----------------------------------------
# ebnf: commandset := command , command ;
"command*command*", "commandset*command*" {
  clear; add "commandset*"; push;
  # format the tape attributes. Add the next command on a newline
  --; get; add "\n"; ++; get; --; put; ++;
  clear; .reparse
}

#-------------------
# here we begin to parse "test*" and "ortestset*" and "andtestset*"
#

#-------------------
# eg: B"abc" {} or E"xyz" {}
# transform and markup the different test types
# NOTE(review): class tests emit ".matches(...)" with a java style
# regex; str::matches in rust returns an iterator, not a bool.
"begintext*,*","endtext*,*","quote*,*","class*,*",
"eof*,*","tapetest*,*",
"begintext*.*","endtext*.*","quote*.*","class*.*",
"eof*.*","tapetest*.*",
"begintext*{*","endtext*{*","quote*{*","class*{*",
"eof*{*","tapetest*{*" {
  B"begin" { clear; add "mm.work.starts_with("; }
  B"end" { clear; add "mm.work.ends_with("; }
  B"quote" { clear; add "mm.work.eq(&"; }
  B"class" { clear; add "mm.work.matches("; }
  # clear the tapecell for testeof and testtape because
  # they take no arguments.
  B"eof" { clear; put; add "mm.eof"; }
  # the outer parenthesis is closed by the add ")" below
  B"tapetest" {
    clear; put;
    add "(mm.work.eq(&mm.tape[mm.cell as usize])";
  }
  get;
  !B"mm.eof" { add ")"; }
  put;
  #*
  # maybe we could ellide the not tests by doing here
  B"not" { clear; add "!"; get; put; }
  *#
  clear; add "test*"; push;
  # the trick below pushes the right token back on the stack.
  get; add "*"; push;
  .reparse
}

#-------------------
# negated tests
# eg: !B"xyz {} !(eof) {} !(==) {}
# !E"xyz" {}
# !"abc" {}
# ![a-z] {}
"notbegintext*,*","notendtext*,*","notquote*,*","notclass*,*",
"noteof*,*","nottapetest*,*",
"notbegintext*.*","notendtext*.*","notquote*.*","notclass*.*",
"noteof*.*","nottapetest*.*",
"notbegintext*{*","notendtext*{*","notquote*{*","notclass*{*",
"noteof*{*","nottapetest*{*" {
  B"notbegin" { clear; add "!mm.work.starts_with("; }
  B"notend" { clear; add "!mm.work.ends_with("; }
  B"notquote" { clear; add "!mm.work.eq(&"; }
  B"notclass" { clear; add "!mm.work.matches("; }
  # clear the tapecell for testeof and testtape because
  # they take no arguments.
  B"noteof" { clear; put; add "!mm.eof"; }
  # the outer parenthesis is closed by the add ")" below
  B"nottapetest" {
    clear; put;
    add "(!mm.work.eq(&mm.tape[mm.cell as usize])";
  }
  get;
  !B"!mm.eof" { add ")"; }
  put;
  clear; add "test*"; push;
  # the trick below pushes the right token back on the stack.
  get; add "*"; push;
  .reparse
}

#-------------------
# 3 tokens
#-------------------
pop;

#-----------------------------
# some 3 token errors!!!
# not a comprehensive list of 3 token errors
"{*quote*;*","{*begintext*;*","{*endtext*;*","{*class*;*",
"commandset*quote*;*", "command*quote*;*" {
  push; push; push;
  add "[pep error]\n invalid syntax near line "; lines;
  add " (char "; chars; add ")";
  add " of script (misplaced semicolon?) \n";
  print; clear; quit;
}

# to simplify subsequent tests, transmogrify a single command
# to a commandset (multiple commands).
"{*command*}*" {
  clear; add "{*commandset*}*"; push; push; push;
  .reparse
}

# errors!
# mixing AND and OR concatenation
",*andtestset*{*", ".*ortestset*{*" {
  # push the tokens back to make debugging easier
  push; push; push;
  add " error: mixing AND (.) and OR (,) concatenation in \n";
  add " in pep script near line "; lines;
  add " (character "; chars; add ") \n";
  add ' For example: B".".!E"/".[abcd./] { print; } # Correct! B".".!E"/",[abcd./] { print; } # Error! \n';
  print; clear; quit;
}

#--------------------------------------------
# ebnf: command := keyword , quoted-text , ";" ;
# format: add "text";
#
# The quoted argument (with its quotes) is in the tape cell after the
# command word, so it is pasted straight into the emitted rust code.
"word*quote*;*" {
  clear; get;
  "replace" {
    # error
    add "< command requires 2 parameters, not 1 \n";
    add "near line "; lines;
    add " of script. \n";
    print; clear; quit;
  }
  # check whether argument is single character, otherwise
  # throw an error
  "escape", "unescape", "while", "whilenot" {
    # This is trickier than I thought it would be.
    clear; ++; get; --;
    # check that arg not empty, (but an empty quote is ok
    # for the second arg of 'replace'
    '""' {
      clear;
      add "[pep error] near line "; lines;
      add " (or char "; chars; add "): \n";
      add " command '"; get;
      add '\' cannot have an empty argument ("") \n';
      print; quit;
    }
    # quoted text has the quotes still around it.
    # also handle escape characters like \n \r etc
    clip; clop; clop; clop;
    # B "\\" { clip; }
    # NOTE(review): line breaks were reconstructed here; the clip
    # below is taken to be live code, not part of the comment above.
    clip;
    !"" {
      clear;
      add "Pep script error near line "; lines;
      add " (character "; chars; add "): \n";
      add " command '"; get;
      add "' takes only a single character argument. \n";
      print; quit;
    }
    clear; get;
  }
  "mark" {
    clear;
    add "/* mark */ \n";
    add "mm.marks[mm.cell as usize].clear(); // mark \n";
    add "mm.marks[mm.cell as usize].push_str("; ++; get; --;
    add "); // mark";
    put; clear;
    add "command*"; push; .reparse
  }
  "go" {
    clear;
    add "mm.goToMark("; ++; get; --; add "); /* go */";
    put; clear;
    add "command*"; push; .reparse
  }
  "delim" {
    clear;
    # this.delimiter.setCharAt(0, text.charAt(0));
    # only the first character of the delimiter argument is used.
    add "mm.delimiter.clear(); /* delim */\n";
    add "mm.delimiter.push_str("; ++; get; --; add "); ";
    put; clear;
    add "command*"; push; .reparse
  }
  "add" {
    clear;
    add "mm.work.push_str("; ++; get; --;
    # handle multiline text
    replace "\n" '"); \nmm.work.push_str("\\n';
    add "); /* add */";
    put; clear;
    add "command*"; push; .reparse
  }
  # the first char of the (checked, non-empty) argument is taken with
  # chars().next(); a rust string has no charAt method
  "while" {
    clear;
    add "while (mm.peep == "; ++; get; --;
    add ".chars().next().unwrap()) /* while */\n ";
    add " { if mm.eof {break;} mm.read(); }";
    put; clear;
    add "command*"; push; .reparse
  }
  "whilenot" {
    clear;
    add "while (mm.peep != "; ++; get; --;
    add ".chars().next().unwrap()) /* whilenot */\n ";
    add " { if mm.eof {break;} mm.read(); }";
    put; clear;
    add "command*"; push; .reparse
  }
  "until" {
    clear;
    add "mm.until("; ++; get; --;
    # error until cannot have empty argument
    'mm.until(""' {
      clear;
      add "Pep script error near line "; lines;
      add " (character "; chars; add "): \n";
      add " empty argument for 'until' \n";
      add " For example: until '.txt'; until \">\"; # correct until ''; until \"\"; # errors! \n";
      print; quit;
    }
    # handle multiline argument
    replace "\n" "\\n";
    add ');';
    put; clear;
    add "command*"; push; .reparse
  }
  # But really, can't the "replace" command just be used
  # instead of escape/unescape?? This seems a flaw in the
  # machine design.
  "escape","unescape" {
    clear;
    add "mm."; get; add "Char";
    add "("; ++; get; --; add '.chars().next().unwrap());';
    put; clear;
    add "command*"; push; .reparse
  }
  # error, superfluous argument
  add ": command does not take an argument \n";
  add "near line "; lines;
  add " of script. \n";
  print; clear;
  #state
  quit;
}

#----------------------------------
# format: "while [:alpha:] ;" or whilenot [a-z] ;
# NOTE(review): the class is still a java style regex and a rust char
# has no matches method - the emitted test needs another form.
"word*class*;*" {
  clear; get;
  "while" {
    clear;
    add "/* while */ \n";
    add "while (mm.peep.matches("; ++; get; --;
    add ")) { if mm.eof { break; } mm.read(); }";
    put; clear;
    add "command*"; push; .reparse
  }
  "whilenot" {
    clear;
    add "/* whilenot */ \n";
    add "while (!mm.peep.matches("; ++; get; --;
    add ")) { if mm.eof { break; } mm.read(); }";
    put; clear;
    add "command*"; push; .reparse
  }
  # error
  add " < command cannot have a class argument \n";
  add "line "; lines; add ": error in script \n";
  print; clear; quit;
}

# arrange the parse> label loops
# NOTE(review): rust labels need a leading quote ('lex: / 'parse:);
# change these together with the .reparse code which breaks to them.
(eof) {
  "commandset*parse>*commandset*","command*parse>*commandset*",
  "commandset*parse>*command*","command*parse>*command*" {
    clear;
    # indent both code blocks
    add " "; get; replace "\n" "\n "; put; clear;
    ++; ++;
    add " "; get; replace "\n" "\n "; put; clear;
    --; --;
    # add a block so that .reparse works before the parse> label.
    add "lex: { \n"; get; add "\n}\n";
    ++; ++;
    # indent code block
    # add " "; get; replace "\n" "\n "; put; clear;
    add "parse: \n";
    add "loop { \n"; get;
    add "\n break parse;\n}";
    --; --;
    put; clear;
    add "commandset*"; push; .reparse
  }
}

# -------------------------------
# 4 tokens
# -------------------------------
pop;

#-------------------------------------
# bnf: command := replace , quote , quote , ";" ;
# example: replace "and" "AND" ;
"word*quote*quote*;*" {
  clear; get;
  "replace" {
    #---------------------------
    # a command plus 2 arguments, eg replace "this" "that"
    # "temp" is declared with let and then borrowed by push_str
    clear;
    add "/* replace */ \n";
    add "if !mm.work.is_empty() { \n";
    add " let temp = mm.work.replace("; ++; get; add ", ";
    ++; get; add ");\n";
    add " mm.work.clear(); \n";
    add " mm.work.push_str(&temp);\n} ";
    --; --;
    put; clear;
    add "command*"; push; .reparse
  }
  add "pep script error on line "; lines;
  add " (character "; chars; add "): \n";
  add " command does not take 2 quoted arguments. \n";
  print; quit;
}

#-------------------------------------
# format: begin { #* commands *# }
# "begin" blocks which are only executed once (they
# will are assembled before the "start:" label. They must come before
# all other commands.
#
"begin*{*command*}*", "begin*{*commandset*}*" {
  clear;
  ++; ++; get; --; --; put; clear;
  add "beginblock*"; push; .reparse
}

# -------------
# parses and compiles concatenated tests
# eg: 'a',B'b',E'c',[def],[:space:],[g-k] { ...
# these 2 tests should be all that is necessary
"test*,*ortestset*{*", "test*,*test*{*" {
  clear; get; add " || "; ++; ++; get; --; --; put; clear;
  add "ortestset*{*"; push; push;
  .reparse
}

# dont mix AND and OR concatenations

# -------------
# AND logic
# parses and compiles concatenated AND tests
# eg: 'a',B'b',E'c',[def],[:space:],[g-k] { ...
# it is possible to elide this block with the negated block
# for compactness but maybe readability is not as good.
# negated tests can be chained with non negated tests.
# eg: B'http' . !E'.txt' { ... }
"test*.*andtestset*{*", "test*.*test*{*" {
  clear; get; add " && "; ++; ++; get; --; --; put; clear;
  add "andtestset*{*"; push; push;
  .reparse
}

#-------------------------------------
# we should not have to check for the {*command*}* pattern
# because that has already been transformed to {*commandset*}*
"test*{*commandset*}*",
"andtestset*{*commandset*}*",
"ortestset*{*commandset*}*" {
  clear;
  # indent the java code for readability
  ++; ++; add " "; get; replace "\n" "\n "; put; --; --;
  clear; add "if ("; get; add ") {\n";
  ++; ++; get; add "\n}"; --; --;
  put; clear;
  add "command*"; push;
  # always reparse/compile
  .reparse
}

# -------------
# multi-token end-of-stream errors
# not a comprehensive list of errors...
(eof) {
  E"begintext*",E"endtext*",E"test*",E"ortestset*",E"andtestset*" {
    add " Error near end of script at line "; lines;
    add ". Test with no brace block? 
\n"; print; clear; quit; } E"quote*",E"class*",E"word*"{ put; clear; add "Error at end of pep script near line "; lines; add ": missing semi-colon? \n"; add "Parse stack: "; get; add "\n"; print; clear; quit; } E"{*", E"}*", E";*", E",*", E".*", E"!*", E"B*", E"E*" { put; clear; add "Error: misplaced terminal character at end of script! (line "; lines; add "). \n"; add "Parse stack: "; get; add "\n"; print; clear; quit; } } # put the 4 (or less) tokens back on the stack push; push; push; push; (eof) { print; clear; # create the virtual machine object code and save it # somewhere on the tape. add ' /* Rust code generated by "translate.rust.pss" */ // use std::mem; // for swap use std::io; use std::io::Read; use std::io::BufReader; use std::io::BufRead; use std::process; use std::fs; use std::fs::File; pub struct Machine { accumulator: i32, // counter for anything peep: char, // next char in input stream charsRead: u32, // No. of chars read so far linesRead: u32, // No. of lines read so far work: String, // text accumulator stack: Vec, // parse token stack LENGTH: u32, // tape initial length // vectors are growable in rust tape: Vec, // array of token attributes, growable marks: Vec, // tape marks cell: u32, // pointer to current cell input: BufReader, // text input stream eof: bool, // end of stream reached? flag: bool, // not used here escape: String, // char used to "escape" others "\\" delimiter: String // push/pop delimiter (default is "*") } impl Machine { // read from stdin or from a file or a string. // BufReader::new(io::stdin()) // BufReader::new(fs::File::open(filename).unwrap()) // let mut streader = StringReader::new("Line 1\\nLine 2"); // let mut bufreader = BufReader::new(streader); /** make a new machine with input from stdin */ /* pub fn new() -> Self { return Machine::new(BufReader::new(io::stdin())); } */ /** make a new machine with input from a string and output to a string */ /* stringreader is a crate. 
pub fn new(input: String, output: String) -> Self { let mut reader = StringReader::new(input); return Machine::new(BufReader::new(reader)); } */ /** make a new machine with a buffered stream reader */ pub fn new(reader: R) -> Self { Self { LENGTH: 100, // BufReader::new(io::stdin()) input: reader, eof: false, flag: false, charsRead: 0, linesRead: 1, escape: String::from("\\\\"), delimiter: String::from("*"), accumulator: 0, work: String::new(), stack: vec!["".to_string();100], cell: 0, tape: vec!["".to_string();100], marks: vec!["".to_string();100], peep: 'z' // peep: Self.input.read() } // self } /** read one character from the input stream and update the machine. */ pub fn readNext(&mut self) { //int iChar; if self.eof { process::exit(0); } self.charsRead += 1; // increment lines if self.peep == \'\\n\' { self.linesRead += 1; } self.work.push(self.peep); self.peep = self.input.read(); if self.peep == 'x' { self.eof = true; } } /** increment tape pointer by one */ pub fn increment(&mut self) { self.cell += 1; if self.cell >= self.LENGTH { self.tape.push(String::from("")); self.marks.push(String::from("")); self.LENGTH += 1; } } /** remove escape character */ pub fn unescapeChar(&mut self, c: char) { if !self.work.is_empty() { let s: String = self.work.replace("\\\\".push(c), c.to_string()); self.work.clear(); self.work.push_str(&s); } } /** add escape character */ pub fn escapeChar(&mut self, c: char) { if !self.work.is_empty() { let s: String = self.work.replace(c.to_string(), "\\\\".push(c)); self.work.clear(); self.work.push_str(&s); } } /** whether trailing escapes \\\\ are even or odd */ // untested code. check! 
eg try: add "x \\\\"; print; etc pub fn isEscaped(&mut self, ss: String, sSuffix: String) -> bool { let count: i32 = 0; if ss.chars().count() < 2 { return false; } if ss.chars().count() <= sSuffix.chars().count() { return false; } if ss.indexOf(self.escape.charAt(0)) == -1 { return false; } let pos: i32 = ss.chars().count()-sSuffix.length(); while (pos > -1) && (ss.charAt(pos) == self.escape.charAt(0)) { count += 1; pos -= 1; } if count % 2 == 0 { return false; } return true; } /* a helper to see how many trailing \\\\ escape chars */ pub fn countEscaped(&mut self, sSuffix: String) -> u32 { let mut s = String::new(); let count: i32 = 0; match s.strip_suffix(sSuffix) { Some(s) => s, None => s } // remove suffix if it exists if index > 0 { s = self.work.substring(0, index); } while s.ends_with(self.escape) { count += 1; s = s.substring(0, s.lastIndexOf(self.escape)); } return count; } /** reads the input stream until the work end with text */ // can test this with pub fn until(&mut self, sSuffix: String) { // read at least one character if self.eof { return; } self.readNext(); loop { if self.eof { return; } if self.work.ends_with(sSuffix) { if self.countEscaped(sSuffix) % 2 == 0 { return; } } self.readNext(); } } /** pop the first token from the stack into the workspace */ pub fn pop(&mut self) -> bool { if self.stack.len() == 0 { return false; } self.work.insert_str(0, self.stack.pop().as_str()); if self.cell > 0 { self.cell -= 1; } return true; } // push the first token from the workspace to the stack pub fn push(&mut self) -> bool { let sItem: String = String::new(); // dont increment the tape pointer on an empty push if self.work.is_empty() { return false; } // need to get this from self.delim not "*" let iFirstStar: u32 = self.work.indexOf(self.delimiter); if iFirstStar != -1 { sItem = self.work.substring(0, iFirstStar + 1); self.work.delete(0, iFirstStar + 1); } else { sItem = self.work; self.work.clear(); } self.stack.push(sItem); self.increment(); return 
true; } // swap not required, use mem::swap // save the workspace to file "sav.pp" */ // not required. pub fn writeToFile(&mut self) { fs::write("sav.pp", self.work).expect("Unable to write file"); } pub fn goToMark(&mut self, mark: String) { for (ii, thismark) in self.marks.iter().enumerate() { if thismark.eq(&mark) { self.cell = ii; return; } } print!("badmark \'{}\'!", mark); process::exit(1); } // Need to add an input stream and output stream to this method /** parse/check/compile the input */ pub fn parse(&mut self) { //this is where the actual parsing/compiling code should go //but this means that all generated code must use //"self." not "mm." let ii = 1; } } fn main() -> io::Result<()> { // BufReader::new(io::stdin()) let temp: String = String::new(); let mm: Machine = Machine::new(BufReader::new(io::stdin())); \n'; # save the code in the current tape cell put; clear; #--------------------- # check if the script correctly parsed (there should only # be one token on the stack, namely "commandset*" or "command*"). pop; pop; "commandset*", "command*" { clear; # indent generated code (6 spaces) for readability. add " "; get; replace "\n" "\n "; put; clear; # restore the rust preamble from the tape ++; get; --; add ' \'script: while !mm.eof {\n'; get; add "\n }"; add "\n }\n"; #add "\n}\n"; # put a copy of the final compilation into the tapecell # so it can be inspected interactively. put; print; clear; quit; } "beginblock*commandset*", "beginblock*command*" { clear; # indent begin block code add " "; get; replace "\n" "\n "; put; clear; # indent main code for readability. ++; add " "; get; replace "\n" "\n "; put; clear; --; # get rust preamble from tape ++; ++; get; --; --; get; add "\n"; ++; # a labelled loop for "quit" (but quit can just exit?) add " 'script: \n"; add " while !mm.eof {\n"; get; add "\n }"; add "\n }\n"; #add "\n}\n"; # put a copy of the final compilation into the tapecell # for interactive debugging. 
put; print; clear; quit; } push; push; # try to explain some more errors unstack; B"parse>" { put; clear; add "[error] pep syntax error:\n"; add " The parse> label cannot be the 1st item \n"; add " of a script \n"; print; quit; } put; clear; clear; add "After compiling with 'compile.java.pss' (at EOF): \n "; add " parse error in input script. \n "; print; clear; unstack; put; clear; add "Parse stack: "; get; add "\n"; add " * debug script "; add " >> pep -If script -i 'some input' \n "; add " * debug compilation. \n "; add " >> pep -Ia asm.pp script' \n "; print; clear; quit; } # not eof # there is an implicit .restart command here (jump start)