#* 

   An old version, too complicated

   This is a parse-script which translates parse-scripts into java
   code, using the 'pep' tool. The script creates a standalone 
   compilable java program.
   
   The virtual machine and engine is implemented in plain c at
   http://bumble.sf.net/books/pars/gh.c. This implements a script language
   with a syntax reminiscent of sed and awk (much simpler than awk, but
   more complex than sed).
   
   This code was originally created in a straightforward manner by adapting
   the code in 'compile.js.pss' which compiles scripts to javascript 

NOTES
   
   We use labelled loops and break/continue to implement the 
   parse> label and .reparse .restart commands. Breaks are also
   used to implement the quit and bail commands.

TODO

   Allow multiline arguments for the "add" command (since they are 
   allowed by the compile.pss script).

SEE ALSO
   
   At http://bumble.sf.net/books/pars/

   compile.js.pss
     A very similar script for compiling scripts into javascript

   compile.pss
     compiles a script into an "assembly" format that can be loaded
     and run on the parse-machine with the -a  switch. This performs
     the same function as "asm.pp" 

TESTING

   The script can be tested with something like
   ----
     pep -f compile.java.pss -i "r;[aeiou]{a '=vowel\n';t;}d;" > Script.java
     javac Script.java; 
     echo "abcdefhijklmnop" | java Script
   ,,, 

   The output will be java code which is equivalent to the 
   script provided to the -i switch.

BUGS
   
  * not sure how to unescape "\"... maybe unescape "\\ ";
  >> r; [a-z] { until "."; unescape "\\"; } t; d;

TASKS 

HISTORY
    
  23 july 2020
    
    Extensive revision of this script. rewriting methods as "inline".
    But revision is incomplete. This script should become 
    a good template for writing similar scripts in other languages.

  22 july 2020

    I need to change the stack code to use the java.util.Stack
    class.

    I am in the process of rethinking this script and reforming
    it. I would like to include the Machine class within the 
    output of the script, so that there are no dependencies on 
    external code. Also, I would like to remove trivial 
    methods from the class.

  Oct 2019
    Made functions ppjjs, ppjjss, ppjjf in helpers.pars.sh so that java
    scripts can be easily run.

  30 sept 2019
    basic scripts working. whilenotPeep and whilePeep need to 
    be written properly. Also, translate unicode categories in
    [:text:] format to java regex.

  27 sept 2019
    Began to adapt this script from compile.javascript.pss

*# 

  read;

  #--------------
  # Whitespace is not a token: discard it and jump to the parse> label.
  [:space:] { clear; .reparse }

  #---------------
  # Single-character tokens. No per-character test is needed because
  # the stack token is just the character itself followed by '*'.
  #   braces     delimit blocks of commands
  #   , and .    concatenate tests with OR or AND logic
  #   B and E    mark begin and end tests
  #   !          negates a test
  #   ;          terminates a command
  "B", "E", "!", ",", ".", ";", "{", "}" {
    put; 
    add "*"; 
    push; 
    .reparse 
  }

  #---------------
  # format: "text"
  "\"" {
    # Record the position where the quote opens, so that a missing
    # closing quote can be reported with a useful location.
    clear; 
    add "line "; lines; 
    add " (character "; chars; 
    add ") ";
    put; 

    # read the quoted text, keeping both quote characters
    clear; add '"'; until '"'; 
    !E'"' { 
      clear; 
      add 'Unterminated quote character (") starting at ';
      get; 
      add ' !\n'; 
      print; quit;
    }

    # the quoted text becomes the attribute of a quote* token
    put; clear;
    add "quote*"; 
    push;
    .reparse 
  }

 #---------------
 # format: 'text', single quotes are converted to double quotes
 # but we must escape embedded double quotes.
  "'" {
    # Record the position where the quote opens, so that a missing
    # closing quote can be reported with a useful location.
    clear; 
    add "line "; lines; 
    add " (character "; chars; 
    add ") ";
    put; 

    clear; until "'"; 
    !E"'" { 
      clear; 
      add "Unterminated quote (') starting at ";
      get; 
      add '!\n'; 
      print; quit;
    }

    # remove the closing quote, escape any embedded double quotes,
    # then wrap the text in double quotes for the java output.
    clip; escape '"'; 
    put; clear;
    add "\""; get; add "\"";
    put; clear;
    add "quote*"; 
    push;
    .reparse 
  }

  #---------------
  # formats: [:space:] [a-z] [abcd] [:alpha:] etc 
  # should class tests really be multiline??!
  "[" {
    # Lexes a bracketed character class and pushes a class* token whose
    # attribute is a double-quoted java regular expression.
    #
    # save the start line number (for error messages) in case 
    # there is no terminating bracket character.
    clear; add "line "; lines; add " (character "; chars; add ") ";
    put; clear; add "[";
    # read the rest of the class text, up to the closing bracket
    until "]"; 
    # an empty class can never match anything, so reject it
    "[]" {
      clear; add "pep script error at line "; lines;
      add " (character "; chars; add "): \n";
      add "  empty character class [] \n";
      print; quit;
    }
    # 'until' stopped without finding the closing bracket
    !E"]" { 
      clear; add "Unterminated class text ([...]) starting at "; get; 
      add "
      class text can be used in tests or with the 'while' and 
      'whilenot' commands. For example: 
        [:alpha:] { while [:alpha:]; print; clear; }
      ";
      print; quit;
    }
    # a named class must both start with [: and end with :]
    B"[:".!E":]" { 
      clear; add "malformed character class starting at ";
      get; add '!\n'; 
      print; quit;
    }
    B"[:" {
      # strip the leading [: and trailing :] to leave just the name
      clip; clip; clop; clop;
      # unicode posix character classes in java 
      # NOTE(review): a name that is not listed below is left in the
      # workspace unchanged and is emitted as the regex text - confirm
      # whether unknown names should be reported as an error instead.
      "alnum" { clear; add "\\p{Alnum}"; }
      "alpha" { clear; add "\\p{Alpha}"; }
      "ascii" { clear; add "\\p{ASCII}"; }
      "blank" { clear; add "\\p{Blank}"; }
      "cntrl" { clear; add "\\p{Cntrl}"; }
      "digit" { clear; add "\\p{Digit}"; }
      "graph" { clear; add "\\p{Graph}"; }
      "lower" { clear; add "\\p{Lower}"; }
      "print" { clear; add "\\p{Print}"; }
      "punct" { clear; add "\\p{Punct}"; }
      "space" { clear; add "\\p{Space}"; }
      "upper" { clear; add "\\p{Upper}"; }
      "xdigit" { clear; add "\\p{Xdigit}"; }
    }
    #*
     alnum - alphanumeric like [0-9a-zA-Z] 
     alpha - alphabetic like [a-zA-Z] 
     blank - blank chars, space and tab 
     cntrl - control chars, ascii 000 to 037 and 177 (del) 
     digit - digits 0-9 
     graph - graphical chars same as :alnum: and :punct: 
     lower - lower case letters [a-z] 
     print - printable chars ie :graph: + space 
     punct - punctuation ie !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~. 
     space - all whitespace, eg \n\r\t vert tab, space, \f 
     upper - upper case letters [A-Z] 
     xdigit - hexadecimal digit ie [0-9a-fA-F] 
    *#

    put; clear;
    # add quotes around the class
    add '"'; get; add '"'; put; clear;
    add "class*"; push;
    .reparse 
  }

 #---------------
 # formats: (eof) (EOF) (==) etc. 
  "(" {
    # read the text between the brackets; the closing bracket is dropped
    clear; until ")"; clip;
    put; 

    # workspace-equals-tape test
    "==" { 
      clear; add "tapetest*"; push; 
      .reparse 
    } 
    # end-of-stream test
    "eof","EOF" { 
      clear; add "eof*"; push; 
      .reparse 
    } 

    # anything else in brackets is an error
    add " << unknown test near line "; lines;
    add " of script.\n bracket () tests are \n";
    add "   (eof) test if end of stream reached. \n";
    add "   (==)  test if workspace is same as current tape cell \n";
    print; clear;
    quit;
  }

  #---------------
  # multiline and single line comments, eg #... and #* ... *#
  "#" {
    # Lexes single line and multiline comments. Single line comments
    # become comment* tokens holding a java '//' comment; multiline
    # comments are currently discarded.
    clear; read;
    # an empty comment: the hash is immediately followed by a newline
    "\n" { clear; .reparse }

    # checking for multiline comments of the form "#* \n\n\n *#"
    # these are just ignored at the moment (deleted) 
    "*" { 
      # save the line number for possible error message later
      clear; lines; put; clear;
      until "*#"; 
      E"*#" {
        # convert to /* ... */ java multiline comment
        clip; clip;
        put; clear; add "/*"; get; add "*/";
        # create a "comment" parse token
        put; clear; 
        # uncomment the next line to transfer multiline comments to 
        # the compiled java (no token is pushed while it is disabled).
        # add "comment*"; push; 
        .reparse  
      }
      # make an unterminated multiline comment an error
      # to ease debugging of scripts.
      clear; 
      add "unterminated multiline comment #* ... *# \n";
      add "starting at line number "; get; add "\n";
      print; clear;
      quit;
    }

    # single line comments. some will get lost.
    # The first character of the comment is already in the workspace.
    put; clear; add "//"; get; until "\n"; 
    # Only remove the trailing newline if there is one: a comment on
    # the last line of a script may end at end-of-file, and clipping
    # then would delete the final character of the comment text.
    E"\n" { clip; }
    put; clear; add "comment*"; push; 
    .reparse 
  }

 #----------------------------------
 # parse command words (and abbreviations)

 # legal characters for keywords (commands)
 ![abcdefghijklmnopqrstuvwxyzBDEKGMPQRUWS+-<>0^] {
   # The workspace holds a single character here. If it cannot start
   # a command word or abbreviation then report it and stop.
   # 'D', 'M' and 'Q' are included because they are the one letter
   # abbreviations for replace, go and bail (see the list below);
   # without them those abbreviations could never be reached.
   put; clear;
   add "!! Misplaced character '";
   get;
   add "' in script near line "; lines;
   add " (character "; chars; add ") \n";
   print; clear; bail;
 }

   # my testclass implementation cannot handle complex lists
   # eg [a-z+-] this is why I have to write out the whole alphabet

   # Read the rest of the word. 'O' and 'F' are needed for <EOF> and
   # '=' is needed for <==>, which are tested for further below.
   while [abcdefghijklmnopqrstuvwxyzBDEOFKGMPQRUWS+-<>=0^];
   #----------------------------------
   # KEYWORDS 
   # here we can test for all the keywords (command words) and their
   # abbreviated one letter versions (eg: clip k, clop K etc). Then
   # we can print an error message and abort if the word is not a 
   # legal keyword for the parse-edit language

   # make ll an alias for "lines" and cc an alias for chars
   # Normalise aliases and abbreviations to the canonical command
   # word. The tests are independent of each other: no replacement
   # text is itself one of the keys tested here.

   # long aliases: ll for "lines" and cc for "chars"
   "chars" { clear; add "cc"; }
   "lines" { clear; add "ll"; }

   # workspace editing
   "a" { clear; add "add"; }
   "d" { clear; add "clear"; }
   "D" { clear; add "replace"; }
   "k" { clear; add "clip"; }
   "K" { clear; add "clop"; }
   "t" { clear; add "print"; }
   "^" { clear; add "escape"; }
   "v" { clear; add "unescape"; }

   # stack and tape
   "p" { clear; add "pop"; }
   "P" { clear; add "push"; }
   "u" { clear; add "unstack"; }
   "U" { clear; add "stack"; }
   "g" { clear; add "get"; }
   "G" { clear; add "put"; }
   "x" { clear; add "swap"; }
   ">" { clear; add "++"; }
   "<" { clear; add "--"; }
   "m" { clear; add "mark"; }
   "M" { clear; add "go"; }

   # reading input
   "r" { clear; add "read"; }
   "R" { clear; add "until"; }
   "w" { clear; add "while"; }
   "W" { clear; add "whilenot"; }
   "z" { clear; add "delim"; }

   # counters
   "n" { clear; add "count"; }
   "+" { clear; add "a+"; }
   "-" { clear; add "a-"; }
   "0" { clear; add "zero"; }
   "c" { clear; add "cc"; }
   "l" { clear; add "ll"; }

   # control and miscellaneous
   "S" { clear; add "state"; }
   "q" { clear; add "quit"; }
   "Q" { clear; add "bail"; }
   "s" { clear; add "write"; }
   "o" { clear; add "nop"; }
   "rp" { clear; add "reparse"; }
   "rs" { clear; add "restart"; }

   # some extra syntax for testeof and testtape
   "<==>" { 
     put; clear; add "tapetest*"; push; 
     .reparse 
   }
   "<eof>","<EOF>" { 
     put; clear; add "eof*"; push; 
     .reparse 
   }

   # These words only exist in pep assembly, so reject them here.
   "testis","testclass","testbegins","testends",
   "testeof","testtape",
   "jump","jumptrue","jumpfalse" {
     put; clear;
     add "The instruction '"; get; 
     add "' near line "; lines; 
     add " (character "; chars; 
     add ")\ncan be used in pep assembly code but not scripts. \n";
     print; clear; quit;
   }
   
   # show information if these "deprecated" commands are used
   "state","bail" {
     # explain what to use instead of the removed command
     put; clear;
     add "The instruction '"; get; 
     add "' near line "; lines; 
     add " (character "; chars; 
     add ")\nis no longer part of the pep language (july 2020). \n";
     add "use 'quit' instead of 'bail', and use 'unstack; print;' \n";
     add "instead of 'state'.";
     print; clear; quit;
   }
   
   # Every legal command word becomes a word* token with the
   # command name as its attribute.
   "add","clip","clop","replace","clear","print",
   "escape","unescape","delim",
   "pop","push","unstack","stack",
   "put","get","swap","++","--","mark","go",
   "read","until","while","whilenot",
   "count","a+","a-","zero",
   "cc","ll", "nochars", "nolines",
   "state","quit","write","nop",
   "reparse","restart" {
     put; 
     clear; add "word*";
     push; 
     .reparse
   }
   
   #------------ 
   # the .reparse command and "parse label" is a simple way to 
   # make sure that all shift-reductions occur. It should be used inside
   # a block test, so as not to create an infinite loop. There is
   # no "goto" in java so we need to use labelled loops to 
   # implement .reparse/parse>

   "parse>" {
     # The accumulator is zero until the first parse> label is seen,
     # so any other value means the label has been repeated.
     clear; count;
     !"0" {
       clear; 
       add "script error:\n  extra parse> label at line "; lines; 
       add ".\n";
       print;
       quit;
     }
     clear; add "// parse>"; 
     put;
     clear; add "parse>*"; 
     push;
     # use accumulator to indicate after parse> label
     a+; 
     .reparse 
   }

   # --------------------
   # implement "begin-blocks", which are only executed
   # once, at the beginning of the script (similar to awk's BEGIN {} rules)
   "begin" {
     put; 
     add "*"; 
     push; 
     .reparse 
   }

   # No rule above recognised the word, so it is not a command.
   add " << unknown command on line "; lines; 
   add " (char "; chars; 
   add ") of source file. \n"; 
   print; clear; quit;

# ----------------------------------
# PARSING PHASE:

# Below is the parse/compile phase of the script. Here we pop tokens off the
# stack and check for sequences of tokens eg "word*semicolon*". If we find a
# valid series of tokens, we "shift-reduce" or "resolve" the token series eg
# word*semicolon* --> command*
#
# At the same time, we manipulate (transform) the attributes on the tape, as
# required. 
#

# Every .reparse in this script jumps here, so the rules below are
# retried after each new token and after each reduction.
parse>

#-------------------------------------
# 2 tokens
#-------------------------------------
  # bring the top two stack tokens into the workspace for matching
  pop; pop;

  # All of the patterns below are currently errors, but may not
  # be in the future if we expand the syntax of the parse
  # language. Also consider:
  #    begintext* endtext* quoteset* notclass*, !* ,* ;* B* E*
  # It is nice to trap the errors here because we can emit some
  # (hopefully not very cryptic) error messages with a line number.
  # Otherwise the script writer has to debug with
  #   pep -a asm.pp -I scriptfile 
  #

  # In each error rule the tokens are pushed back first, which leaves
  # the workspace empty so that only the message is printed.

  "word*word*", "word*}*", "word*begintext*", "word*endtext*",
  "word*!*", "word*,*", 
  "quote*word*", "quote*class*", "quote*state*", "quote*}*",
  "quote*begintext*", "quote*endtext*",
  "class*word*", "class*quote*", "class*class*", "class*state*", "class*}*",
  "class*begintext*", "class*endtext*", "class*!*", 
  "notclass*word*", "notclass*quote*", "notclass*class*", 
  "notclass*state*", "notclass*}*"
  {
    push; push;
    add "Error near line "; lines; 
    add " (char "; chars; 
    add ") of pep script (missing semicolon?) \n";
    print; clear; quit;
  }  

  "{*;*", ";*;*", "}*;*" {
    push; push;
    add "Error near line "; lines; 
    add " (char "; chars; 
    add ") of pep script: misplaced semi-colon? ; \n";
    print; clear; quit;
  }

  ",*{*" {
    push; push;
    add "Error near line "; lines; 
    add " (char "; chars; 
    add ") of script: extra comma in list? \n";
    print; clear; quit;
  }

  "command*;*","commandset*;*" {
    push; push;
    add "Error near line "; lines; 
    add " (char "; chars; 
    add ") of script: extra semi-colon? \n";
    print; clear; quit;
  }

  "!*!*" {
    push; push;
    add "error near line "; lines; 
    add " (char "; chars; 
    add ") of script: \n double negation '!!' is not implemented \n";
    add " and probably won't be, because what would be the point? \n";
    print; clear; quit;
  }

  "!*{*","!*;*" {
    push; push;
    add "error near line "; lines;
    add " (char "; chars; 
    add ") of script: misplaced negation operator (!)? \n";
    add " The negation operator precedes tests, for example: \n";
    add "   !B'abc'{ ... } or !(eof),!'abc'{ ... } \n";
    print; clear; quit;
  }

  ",*command*" {
    push; push;
    add "error near line "; lines;
    add " (char "; chars; 
    add ") of script: misplaced comma? \n";
    print; clear; quit;
  }

  "!*command*" {
    push; push;
    add "error near line "; lines;
    add " (at char "; chars; 
    add ") \n The negation operator (!) cannot precede a command \n";
    print; clear; quit;
  }

  ";*{*", "command*{*", "commandset*{*" {
    push; push;
    add "error near line "; lines;
    add " (char "; chars; 
    add ") of script: no test for brace block? \n";
    print; clear; quit;
  }

  "{*}*" {
    push; push;
    add "error near line "; lines;
    add " of script: empty braces {}. \n";
    print; clear; quit;
  }

  "B*class*","E*class*" {
    push; push;
    add "error near line "; lines;
    add " of script:\n  classes ([a-z], [:space:] etc). \n  cannot use the 'begin' or 'end' modifiers (B/E) \n";
    print; clear; quit;
  }

  "comment*{*" {
    push; push;
    add "error near line "; lines;
    add " of script: comments cannot occur between \n a test and a brace ({). \n";
    print; clear; quit;
  }

  "}*command*" {
    push; push;
    add "error near line "; lines;
    add " of script: extra closing brace '}' ?. \n";
    print; clear; quit;
  }

  # a begin* token is only legal as the very first token
  E"begin*".!"begin*" {
    push; push;
    add "error near line "; lines;
    add " of script: Begin blocks must precede code \n";
    print; clear; quit;
  }

  #------------ 
  # The .restart command jumps to the first instruction after the
  # begin block (if there is a begin block), or the first instruction
  # of the script.
  ".*word*" {
    # fetch the word that follows the dot
    clear; ++; get; --;

    "reparse" {
      # The accumulator tells us whether this .reparse occurs before
      # the parse> label (the "lex" block) or after it (the "parse"
      # block); the generated jump differs accordingly.
      clear; count; 
      "1" { clear; add "continue parse;"; }
      "0" { clear; add "break lex;"; }
      put; 
      clear; add "command*";
      push; 
      .reparse 
    }

    "restart" {
      clear; add "continue script;";
      put; 
      clear; add "command*";
      push; 
      .reparse 
    }

    # any other word after a dot is an error
    push; push;
    add "error near line "; lines;
    add " (char "; chars; 
    add ") of script:  \n";
    add " misplaced dot '.' (use for AND logic or in .reparse/.restart \n";
    print; clear; quit;
  }

  #---------------------------------
  # Compiling comments so as to transfer them to the java 
  # A comment next to a command is folded into that command: the two
  # attributes are joined with a newline, stored in the first tape
  # cell, and the token pair is reduced to a single command*.
  "comment*command*","command*comment*","commandset*comment*" {
    clear; get; add "\n"; ++; get; --; put; clear;
    add "command*"; push; .reparse
  }

  # Two adjacent comments are joined in the same way but the result
  # stays a comment* token.
  "comment*comment*" {
    clear; get; add "\n"; ++; get; --; put; clear;
    add "comment*"; push; .reparse
  }

  # -----------------------
  # negated tokens.
  #
  # This is a new more elegant way to negate a whole set of 
  # tests (tokens) where the negation logic is stored on the 
  # stack, not in the current tape cell. We just add "not" to 
  # the stack token.

  # eg: ![:alpha:] ![a-z] ![abcd] !"abc" !B"abc" !E"xyz"
  #  This format is used to indicate a negative test for 
  #  a brace block. eg: ![aeiou] { add "< not a vowel"; print; clear; }

  "!*quote*","!*class*","!*begintext*", "!*endtext*",
  "!*eof*","!*tapetest*" {
    # Fuse the two tokens into one by rewriting the "!*" prefix, for
    # example "!*quote*" becomes the single token "notquote*".
    # a simplification: store the token name "quote*/class*/..."
    # in the tape cell corresponding to the "!*" token. 
    replace "!*" "not"; push;
    # this was a bug?? a missing ++; ??
    # now get the token-value
    # The attribute of the test is fetched from the current cell and
    # copied into the previous cell, which now belongs to the fused
    # token. NOTE(review): this relies on 'push' having advanced the
    # tape pointer by one cell - confirm against the machine.
    get; --; put; ++; clear;
    .reparse
  }

  #-----------------------------------------
  # format: E"text" or E'text'
  #  This format is used to indicate a "workspace-ends-with" text before
  #  a brace block.
  "E*quote*" {
     # Reduce the token pair to one endtext* token and fetch the
     # quoted text so that it can be validated.
     clear; add "endtext*"; push; get; 
     '""' {
       # empty argument is an error
       clear;
       add "pep script error near line "; lines;
       add " (character "; chars; add "): \n";
       add '  empty argument for end-test (E"") \n';
       print; quit;
     }
     # store the quoted text in the previous tape cell, then move
     # the tape pointer forward again.
     --; put; ++;
     clear; .reparse
  } 

  #-----------------------------------------
  # format: B"sometext" or B'sometext' 
  #   A 'B' preceding some quoted text is used to indicate a 
  #   'workspace-begins-with' test, before a brace block.
  "B*quote*" {
     # Reduce the token pair to one begintext* token and fetch the
     # quoted text so that it can be validated.
     clear; add "begintext*"; push; get; 
     '""' {
       # empty argument is an error
       clear;
       add "pep script error near line "; lines;
       add " (character "; chars; add "): \n";
       add '  empty argument for begin-test (B"") \n';
       print; quit;
     }
     # store the quoted text in the previous tape cell, then move
     # the tape pointer forward again.
     --; put; ++;
     clear; .reparse
  } 

  #--------------------------------------------
  # ebnf: command := word, ';' ;
  # formats: "pop; push; clear; print; " etc
  # all commands need to end with a semi-colon except for 
  # .reparse and .restart
  #
  "word*;*" {
     # Compiles a command that has no argument. The command name in
     # the tape cell is replaced by the equivalent java statement(s)
     # and the two tokens are reduced to a single command* token.
     #
     # All generated java refers to the machine object as 'mm'. Some
     # branches used to emit 'this.' (left over from when the code
     # lived in methods of the machine class); they now use 'mm.'
     # like every other command compiled by this script.
     clear;
     # check if command requires parameter
     get;
     "add", "until", "while", "whilenot", "mark", "go",
     "escape", "unescape", "delim", "replace" {
       put; clear; add "'"; get; add "'";
       add " << command needs an argument, on line "; lines; 
       add " of script.\n";
       print; clear; quit;
     }

     "clip" { 
       clear; 
       add "/* clip */\n";
       add "if (mm.workspace.length() > 0) { \n";
       add "mm.workspace.delete(mm.workspace.length() - 1, \n";
       add "  mm.workspace.length()); }";
       put; 
     }
     "clop" { 
       clear; 
       add "mm.clop();"; put; 
     }
     "clear" { clear; add "mm.workspace.setLength(0); /* clear */"; put; }
     "print" { 
       clear; add "System.out.print(mm.workspace); /* print */"; put; 
     }
     "pop" { clear; add "mm.pop();"; put; }
     "push" { clear; add "mm.push();"; put; }
     "unstack" { clear; add "while (mm.pop()); // unstack"; put; }
     "stack" { clear; add "while(mm.push()); // stack"; put; }
     "put" { 
       clear; 
       add "mm.tape[mm.tapePointer].setLength(0); // put\n";
       add "mm.tape[mm.tapePointer].append(mm.workspace); // put";
       put; }
     "get" { 
       clear; add "mm.workspace.append(mm.tape[mm.tapePointer]); //get";
       put; }
     "swap" { clear; add "mm.swap();"; put; }
     "++" { clear; add "mm.increment();"; put; }
     "--" { 
       clear; 
       add "if (mm.tapePointer > 0) mm.tapePointer--; // --"; put;
     }
     "read" { clear; add "mm.read();"; put; }
     # It might be useful to do, after each read() 
     #   ww = mm.workspace.toString(); 

     # we can omit tests and jumps since they are not
     # designed to be used in scripts (only assembled parse programs).

     "count" { clear; add "mm.workspace.append(mm.accumulator);"; put; }
     "a+" { clear; add "mm.accumulator++; // a+"; put; }
     "a-" { clear; add "mm.accumulator--; // a-"; put; }
     "zero" { clear; add "mm.accumulator = 0; // zero"; put; }
     "cc" { clear; add "mm.workspace.append(mm.charsRead);"; put; }
     "ll" { clear; add "mm.workspace.append(mm.linesRead);"; put; }
     "nochars" { clear; add "mm.charsRead = 0; // nochars"; put; }
     "nolines" { clear; add "mm.linesRead = 0; // nolines"; put; }
     "state" { clear; add "mm.state();"; put; }
     # use a labelled loop to quit script.
     "quit" { clear; add "break script;"; put; }
     "bail" { clear; add "break script;"; put; }
     "write" { clear; add "mm.writeToFile();"; put; }
     # just eliminate since it does nothing.
     "nop" { clear; add "// nop (eliminated)"; put; }

     clear; add "command*";
     push; .reparse
   }

  #-----------------------------------------
  # ebnf: commandset := command , command ;
  # Two adjacent commands (or a command set followed by a command)
  # are reduced to one commandset* token whose attribute is the code
  # of both, separated by a newline.
  "command*command*", "commandset*command*" {
    clear;
    add "commandset*"; push;
    # format the tape attributes. Add the next command on a newline 
    # step back to the first cell, append a newline and the second
    # cell's code, then store the result in the first cell
    --; get; add "\n"; 
    ++; get; --;
    put; ++; clear; 
    .reparse
  } 

  #-------------------
  # here we begin to parse "test*" and "ortestset*" and "andtestset*"
  # 

  #-------------------
  # eg: B"abc" {} or E"xyz" {}
  # beginning to change this parsing technique
  # Turns a single test token into a test* token whose attribute is
  # a java boolean expression on the machine workspace.
  "begintext*,*","endtext*,*","quote*,*","class*,*",
  "eof*,*","tapetest*,*",
  "begintext*{*","endtext*{*","quote*{*","class*{*",
  "eof*{*","tapetest*{*" 
  {
    # NOTE(review): the reduction below always pushes "test*{*", so a
    # trailing ",*" token is turned into "{*". Confirm whether that
    # is intended, since the OR-list rules expect "test*,*...".

    # Choose the java method from the kind of test. Once a branch has
    # matched, the workspace holds java code and no later branch can
    # match.
    B"begin" { clear; add "mm.workspace.toString().startsWith("; }
    B"end" { clear; add "mm.workspace.toString().endsWith("; }
    B"quote" { clear; add "mm.workspace.toString().equals("; }
    B"class" { clear; add "mm.workspace.toString().matches("; }
    # clear the tapecell for testeof and testtape because
    # they take no arguments. 
    B"eof" { clear; put; add "mm.eof("; }
    B"tapetest" { 
      clear; put; 
      # Compare the text of the two buffers. The java '==' operator
      # would only test whether they are the same object.
      add "(mm.workspace.toString().equals(mm.tape[mm.tapePointer].toString())"; 
    }
    # append the argument (if any) and close the bracket
    get; add ")";

    put; 
    clear; add "test*{*";
    push; push; .reparse
  }

  #-------------------
  # negated tests
  # eg: !B"xyz {} 
  #     !E"xyz" {} 
  #     !"abc" {}
  #     ![a-z] {}
  # Turns a single negated test token into a test* token whose
  # attribute is a negated java boolean expression.
  "notbegintext*,*","notendtext*,*","notquote*,*","notclass*,*",
  "noteof*,*","nottapetest*,*",
  "notbegintext*{*","notendtext*{*","notquote*{*","notclass*{*",
  "noteof*{*","nottapetest*{*"
  {
    # NOTE(review): the reduction below always pushes "test*{*", so a
    # trailing ",*" token is turned into "{*". Confirm whether that
    # is intended, since the OR-list rules expect "test*,*...".

    B"notbegin" { clear; add "!mm.workspace.toString().startsWith("; }
    B"notend" { clear; add "!mm.workspace.toString().endsWith("; }
    B"notquote" { clear; add "!mm.workspace.toString().equals("; }
    B"notclass" { clear; add "!mm.workspace.toString().matches("; }
    # clear the tapecell for testeof and testtape because
    # they take no arguments. 
    B"noteof" { clear; put; add "!mm.eof("; }
    B"nottapetest" { 
      clear; put; 
      # Compare the text of the two buffers. The java '!=' operator
      # would only test whether they are different objects.
      add "(!mm.workspace.toString().equals(mm.tape[mm.tapePointer].toString())"; 
    }
    # append the argument (if any) and close the bracket
    get; add ")";

    put; clear; add "test*{*";
    push; push; .reparse
  }

  #-------------------
  # 3 tokens
  #-------------------

  # bring a third token into the workspace
  pop;

  #-----------------------------
  # A few three token errors. Many more are possible but they are
  # not all trapped here.
  "{*class*;*","{*quote*;*","{*begintext*;*","{*endtext*;*"
  {
    push; push; push;
    add "error near line "; lines;
    add " (char "; chars; 
    add ") of script (misplaced semicolon?) \n";
    print; clear; quit;
  }  

  # to simplify subsequent tests, transmogrify a single command
  # to a commandset (multiple commands).
  # A lone command inside braces is renamed to a commandset so that
  # later rules only need to deal with commandsets.
  "{*command*}*" {
    clear; 
    add "{*commandset*}*"; 
    push; push; push;
    .reparse
  }

  # rule 
  #',' ortestset ::= ',' test '{'
  # and 
  # '.' andtestset ::= '.' test '{'
  # The token before the test decides which kind of test set the
  # test is renamed to.

  # shouldnt be necessary
  ",*test*{*" {
    clear; 
    add ",*ortestset*{*"; 
    push; push; push;
    .reparse
  }

  # shouldnt be necessary now
  ".*test*{*" {
    clear; 
    add ".*andtestset*{*"; 
    push; push; push;
    .reparse
  }

  # errors! mixing AND and OR concatenation
  # A comma in front of an AND set, or a dot in front of an OR set,
  # means that the script mixes the two kinds of concatenation in
  # one test list. That is reported as an error.
  ",*andtestset*{*",
  ".*ortestset*{*" {
    # push the tokens back to make debugging easier
    push; push; push; 
    add " error: mixing AND (.) and OR (,) concatenation in \n";
    add " in script near line "; lines;
    add " (character "; chars; add ") \n";
    add ' 
  For example:
     B".".!E"/".[abcd./] { print; }  # Correct!
     B".".!E"/",[abcd./] { print; }  # Error! 
     ';
    print; clear; quit;
  }

  #--------------------------------------------
  # ebnf: command := keyword , quoted-text , ";" ;
  # format: add "text";

  "word*quote*;*" {
    # Compiles a command with one quoted argument. The command name
    # is in the current tape cell and the quoted text (with its
    # quotes) is in the next one. Each branch writes the java code
    # into the current cell and reduces the tokens to command*.
    clear; get;
    "replace" {
       # error 
       add "< command requires 2 parameters, not 1 \n";
       add "near line "; lines;
       add " of script. \n";
       print; clear; quit;
    }

    # check whether argument is single character, otherwise
    # throw and error
    "escape", "unescape", "while", "whilenot" {
      # This is tricker than I thought it would be.
      clear; ++; get; --; 
      # quoted text has the quotes still around it.
      # also handle escape characters like \n \r etc
      clip; clop; clip; clip;
      # B "\\\" { clip; } 
      clip; 
      !"" {
        clear; 
        add "Pep script error near line "; lines;
        add " (character "; chars; add "): \n"; 
        add "  command '"; get; 
        add "' takes only a single character argument. \n";
        print; quit;
      }
      # restore the command name for the tests below
      clear; get;
    }

    "mark" {
      clear;
      add "/* mark */ \n";
      add "mm.marks[mm.tapePointer].setLength(0); // mark \n"; 
      add "mm.marks[mm.tapePointer].append("; ++; get; --; 
      add "); // mark";
      put; clear; add "command*"; push; .reparse
    }

    "go" {
      clear;
      add "/* go */\n";
      add "for (var ii = 0; ii < mm.marks.length; ii++) \n";
      # Compare each mark with the quoted argument of the command.
      # The generated code used to refer to a java variable 'text',
      # which is not declared anywhere in the inlined code.
      add "  if (mm.marks[ii].toString().equals("; ++; get; --; 
      add ")) \n";
      add "    { mm.tapePointer = ii; }";
      put; clear; add "command*"; push; .reparse
    }

    "delim" {
      clear;
      # this.delimiter.setCharAt(0, text.charAt(0));
      # only the first character of the delimiter argument is used. 
      add "/* delim */\n";
      add "mm.delimiter.setLength(0); \n"; 
      add "mm.delimiter.append("; ++; get; --; 
      add "); ";
      put; clear; add "command*"; push; .reparse
    }

    "add" {
      clear;
      add "mm.workspace.append("; ++; get; --; 
      # handle multiline text
      replace "\n" '"); \nmm.workspace.append("\\n';
      add ");";
      put; clear; add "/* add */ \n"; get;
      put; clear; add "command*"; push; .reparse
    }

   
    "while" {
      clear;
      add "while ((char) mm.peep == "; ++; get; --;
      add ".charAt(0)) mm.read(); /* while */"; 
      put; clear; add "command*"; push; .reparse
    }

    "whilenot" {
      clear;
      add "/* whilenot */ \n";
      add "while ((char) mm.peep != "; ++; get; --;
      add ".charAt(0)) mm.read();"; 
      put; clear; add "command*"; push; .reparse
    }

    "until" {
       clear; add "mm.until("; 
       ++; get; --; 
       # error until cannot have empty argument
       # The whole workspace is compared, not just its ending, because
       # an argument that ends with an escaped quote (for example 
       # until "\"";) also ends with two double quote characters.
       'mm.until(""' { 
         clear; 
         add "pep script error near line "; lines;
         add " (character "; chars; add "): \n";
         add " empty argument for 'until' \n";
         print; quit;
       }
       # handle multiline argument
       replace "\n" "\\n";
       add ');'; put; clear;
       add "command*"; push; .reparse
     }

    "escape","unescape" {
       clear; add "mm."; get; add "Char"; 
       add "("; ++; get; --; add '.charAt(0));'; put; clear;
       add "command*"; push; .reparse
     }

     # error, superfluous argument
     add ": command does not take an argument \n";
     add "near line "; lines;
     add " of script. \n";
     print; clear;
     #state
     quit;
   }

   #----------------------------------
   # format: "while [:alpha:] ;" or whilenot [a-z] ;

   "word*class*;*" {
     # Only 'while' and 'whilenot' accept a class argument. The class
     # (a quoted java regex) is in the tape cell after the command.
     clear; get;

     "whilenot" {
       clear;
       add "/* whilenot */ \nwhile (!Character.toString((char)mm.peep).matches("; 
       ++; get; --;
       add ")) mm.read(); "; 
       put; 
       clear; add "command*"; push; 
       .reparse
     }

     "while" {
       clear;
       add "/* while */ \nwhile (Character.toString((char)mm.peep).matches("; 
       ++; get; --;
       add ")) mm.read(); "; 
       put; 
       clear; add "command*"; push; 
       .reparse
     }

     # any other command with a class argument is an error; the
     # command name is still in the workspace at this point
     add " < command cannot have a class argument \nline "; lines; 
     add ": error in script \n";
     print; clear; quit;
   }


  # arrange the parse> label loops
  # arrange the parse> label loops
  # This only happens at the end of the script, when the code before
  # the parse> label and the code after it have each been reduced to
  # one command or commandset token.
  (eof) {
    "commandset*parse>*commandset*","command*parse>*commandset*",
    "commandset*parse>*command*","command*parse>*command*" {
      clear; 
      # indent both code blocks
      # (the current cell and the cell two places further on)
      add "  "; get; replace "\n" "\n  "; put; clear; ++; ++;
      add "  "; get; replace "\n" "\n  "; put; clear; --; --;
      # add a block so that .reparse works before the parse> label.
      # ('break lex;' is what .reparse compiles to in that code)
      add "lex: { \n";
      get; add "\n}\n"; ++; ++;
      # indent code block
      # add "  "; get; replace "\n" "\n  "; put; clear;
      # the code after the label becomes a labelled loop, so that
      # 'continue parse;' restarts it; it runs once otherwise.
      add "parse: \n";
      add "while (true) { \n"; get;
      add "\n  break parse;\n}"; 
      # store the combined code in the first cell
      --; --; put; clear;
      add "commandset*"; push; .reparse
    }
  }

  # -------------------------------
  # 4 tokens
  # -------------------------------

  # bring a fourth token into the workspace
  pop;

  #-------------------------------------
  # bnf:     command := replace , quote , quote , ";" ;
  # example:  replace "and" "AND" ; 

  "word*quote*quote*;*" {
    clear; get;
    "replace" {
      #---------------------------
      # a command plus 2 arguments, eg replace "this" "that"
      # The generated java is inlined into the script loop, so:
      #  - it refers to the machine as 'mm', as all other commands do
      #  - it must not 'return' when the workspace is empty, since
      #    that would leave the enclosing method, not just skip the
      #    replacement
      #  - the temporary string lives in its own block so that two
      #    replace commands in one scope do not declare 's' twice.
      clear; 
      add "/* replace */ \n";
      add "if (mm.workspace.length() > 0) { \n";
      add "  String s = mm.workspace.toString().replace(";
      ++; get; add ", ";
      ++; get; add ");\n"; 
      add "  mm.workspace.setLength(0); \n";
      add "  mm.workspace.append(s); \n";
      add "}";
      --; --; put;
      clear; add "command*"; push; .reparse
    }

    # no other command takes two quoted arguments
    add "pep script error on line "; lines; 
    add " (character "; chars; add "): \n";
    add "  command does not take 2 quoted arguments. \n";
    print; quit;
  }

  #-------------------------------------
  # format: begin { #* commands *# }
  # "begin" blocks are only executed once and must come before all
  # other commands. Here the 4 tokens are reduced to a single
  # "beginblock*" token whose tape cell holds the compiled commands.

  # "begin*{*command*}*",
  "begin*{*commandset*}*" {
     clear; 
     # copy the compiled commandset (2 cells ahead) into the first cell
     ++; ++; 
     get; 
     --; --; 
     put; 
     clear;
     add "beginblock*";
     push; 
     .reparse
   }

   # -------------
   # parses and compiles concatenated tests (OR logic)
   # eg: 'a',B'b',E'c',[def],[:space:],[g-k] { ...
   # The leading test is compiled to a java boolean expression and
   # joined with " || " to the expression already compiled for the
   # test set which follows the comma.
   # NOTE(review): no branch below handles a plain "test*" token, so the
   # first 2 patterns look like a plan for later -- confirm.

   # these 2 tests should be all that is necessary
   "test*,*ortestset*{*",
   "test*,*test*{*",

   "begintext*,*ortestset*{*",
   "endtext*,*ortestset*{*",
   "quote*,*ortestset*{*",
   "class*,*ortestset*{*",
   "eof*,*ortestset*{*",
   "tapetest*,*ortestset*{*" {

     B"begin" { clear; add "mm.workspace.toString().startsWith("; }
     B"end" { clear; add "mm.workspace.toString().endsWith("; }
     B"quote" { clear; add "mm.workspace.toString().equals("; }
     B"class" { clear; add "mm.workspace.toString().matches("; }
     # put clears the tapecell since no arguments here 
     B"eof" { clear; put; add "mm.eof("; }
     B"tapetest" { 
       clear; put; 
       # compare the text of workspace and tape cell: "==" on two
       # StringBuffer objects would only compare the references.
       add "(mm.workspace.toString().equals(mm.tape[mm.tapePointer].toString())"; 
     }
     # append the argument of the test and close the expression
     get; add ") || ";

     # append the already compiled test set and save the result
     ++; ++; get; --; --; put; clear; 
     add "ortestset*{*";
     push; push;
     .reparse
   }

   # A collection of negated tests (OR logic).
   # The leading negated test is compiled to a java boolean expression
   # and joined with " || " to the expression already compiled for the
   # test set which follows the comma.
   # NOTE(review): the first 2 patterns repeat those of the preceding
   # block and no branch below handles a plain "test*" token -- confirm
   # whether they are still wanted here.

   # these 2 tests should be all that is necessary
   "test*,*ortestset*{*",
   "test*,*test*{*",

   "notbegintext*,*ortestset*{*",
   "notendtext*,*ortestset*{*",
   "notquote*,*ortestset*{*",
   "notclass*,*ortestset*{*",
   "noteof*,*ortestset*{*",
   "nottapetest*,*ortestset*{*" {

     # much of this code can be eliminated.
     B"notbegin" { clear; add "!mm.workspace.toString().startsWith("; }
     B"notend" { clear; add "!mm.workspace.toString().endsWith("; }
     B"notquote" { clear; add "!mm.workspace.toString().equals("; }
     # StringBuffer has no testClass method: use the same regex match
     # as the non-negated OR test.
     B"notclass" { clear; add "!mm.workspace.toString().matches("; }
     # put clears the tapecell since no arguments here 
     B"noteof" { clear; put; add "!mm.eof("; }
     B"nottapetest" { 
       clear; put; 
       # compare the text of workspace and tape cell: "!=" on two
       # StringBuffer objects would only compare the references.
       add "(!mm.workspace.toString().equals(mm.tape[mm.tapePointer].toString())"; 
     }
     # append the argument of the test and close the expression
     get; add ") || ";

     # append the already compiled test set and save the result
     ++; ++; get; --; --; put; clear; 
     # dont mix AND and OR concatenations 
     add "ortestset*{*";
     push; push; .reparse
   }

   # dont mix AND and OR concatenations 

   # -------------
   # AND logic 
   # parses and compiles concatenated AND tests
   # eg: 'a'.B'b'.E'c'.[def].[:space:].[g-k] { ...
   # it is possible to elide this block with the negated block
   # for compactness but maybe readability is not as good.
   # The leading test is compiled to a java boolean expression and
   # joined with " && " to the expression already compiled for the
   # test set which follows the dot.
   # NOTE(review): the first pattern names "ortestset*" although this
   # is the AND block, and no branch below handles a plain "test*"
   # token -- confirm the intent.

   # these 2 tests should be all that is necessary
   "test*.*ortestset*{*",
   "test*.*test*{*",

   "begintext*.*andtestset*{*",
   "endtext*.*andtestset*{*",
   "quote*.*andtestset*{*",
   "class*.*andtestset*{*",
   "eof*.*andtestset*{*",
   "tapetest*.*andtestset*{*" {

     B"begin" { clear; add "mm.workspace.toString().startsWith("; }
     B"end" { clear; add "mm.workspace.toString().endsWith("; }
     B"quote" { clear; add "mm.workspace.toString().equals("; }
     # but need to quote the class text "..."
     # StringBuffer has no testClass method: use the same regex match
     # as the OR test.
     B"class" { clear; add "mm.workspace.toString().matches("; }
     # put clears the tapecell since no arguments here 
     B"eof" { clear; put; add "mm.eof("; }
     B"tapetest" { 
       clear; put; 
       # compare the text of workspace and tape cell: "==" on two
       # StringBuffer objects would only compare the references.
       add "(mm.workspace.toString().equals(mm.tape[mm.tapePointer].toString())"; 
     }
     # append the argument of the test and close the expression
     get; add ") && ";

     # append the already compiled test set and save the result
     ++; ++; get; --; --; put; clear; 
     add "andtestset*{*";
     push; push; .reparse
   }

   # negated tests concatenated with AND logic (.). The 
   # negated tests can be chained with non negated tests.
   # eg: B'http' . !E'.txt' { ... }
   # The leading negated test is compiled to a java boolean expression
   # and joined with " && " to the expression already compiled for the
   # test set which follows the dot.
   # NOTE(review): the first 2 patterns repeat those of the preceding
   # AND block and no branch below handles a plain "test*" token --
   # confirm whether they are still wanted here.

   # these 2 tests should be all that is necessary
   "test*.*ortestset*{*",
   "test*.*test*{*",

   "notbegintext*.*andtestset*{*",
   "notendtext*.*andtestset*{*",
   "notquote*.*andtestset*{*",
   "notclass*.*andtestset*{*",
   "noteof*.*andtestset*{*",
   "nottapetest*.*andtestset*{*" {
     B"notbegin" { clear; add "!mm.workspace.toString().startsWith("; }
     B"notend" { clear; add "!mm.workspace.toString().endsWith("; }
     B"notquote" { clear; add "!mm.workspace.toString().equals("; }
     # StringBuffer has no testClass method: use the same regex match
     # as the OR test.
     B"notclass" { clear; add "!mm.workspace.toString().matches("; }
     # put clears the tapecell since no arguments here 
     # peep is an int, so it cannot be compared with null: emit the
     # same end-of-stream test as the negated OR block.
     B"noteof" { clear; put; add "!mm.eof("; }
     B"nottapetest" { 
       clear; put; 
       # compare the text of workspace and tape cell: "!=" on two
       # StringBuffer objects would only compare the references.
       add "(!mm.workspace.toString().equals(mm.tape[mm.tapePointer].toString())"; 
     }
     # append the argument of the test and close the expression
     get; add ") && ";
     # append the already compiled test set and save the result
     ++; ++; get; --; --; put; clear; 
     add "andtestset*{*";
     push; push; .reparse
   }

  #-------------------------------------
  # A test (or set of tests) followed by a brace block becomes a java
  # "if" statement and is reduced to one "command*" token.
  # we should not have to check for the {*command*}* pattern
  # because that has already been transformed to {*commandset*}*

  "test*{*commandset*}*",
  "andtestset*{*commandset*}*",
  "ortestset*{*commandset*}*" { 
     clear; 

     # indent the java code for readability
     ++; ++; 
     add "  "; get; 
     replace "\n" "\n  "; 
     put; 
     --; --; 

     # if (<compiled test>) { <indented commands> }
     clear; 
     add "if ("; get; add ") {\n";
     ++; ++; 
     get;
     add "\n}"; 
     --; --; 

     # keep the result in the first cell and reduce to a command
     put; clear;
     add "command*";
     push;
     # always reparse/compile
     .reparse
   }

  # -------------
  # multi-token end-of-stream errors
  # not a comprehensive list of errors...
  # Each check looks at the token which ends the workspace once the
  # input is finished, prints a message and quits.
  (eof) {
    # a test which was never followed by a brace block
    E"begintext*",E"endtext*",E"test*",E"ortestset*",E"andtestset*" {
      add "  Error near end of script at line "; 
      lines;
      add ". Test with no brace block? \n";
      print; clear; 
      quit;
    }

    # a command or argument which was never terminated
    E"quote*",E"class*",E"word*"{
      # keep the unparsed tokens in the tape cell for the report
      put; clear; 
      add "Error at end of pep script near line "; 
      lines; 
      add ": missing semi-colon? \n";
      add "Parse stack: "; get; add "\n";
      print; clear; 
      quit;
    }

    # a punctuation token left dangling
    E"{*", E"}*", E";*", E",*", E".*", E"!*", E"B*", E"E*" {
      # keep the unparsed tokens in the tape cell for the report
      put; clear; 
      add "Error: misplaced terminal character at end of script! (line "; 
      lines; 
      add "). \n";
      add "Parse stack: "; get; add "\n";
      print; clear; 
      quit;
    }
  }

  # put the 4 (or less) tokens back on the stack
  # (no pattern above matched, so nothing was reduced; each push moves
  # one token from the front of the workspace back onto the stack)
  push; push; push; push;

  # At end of input: emit the finished java program if the script was
  # reduced to a single commandset (optionally preceded by a begin
  # block), otherwise report a parse error together with the stack.
  (eof) {
    print; clear;
    #---------------------
    # check if the script correctly parsed (there should only
    # be one token on the stack, namely "commandset*" or "command*").
    pop; pop;

    "commandset*", "command*" {
      clear;
      # indent generated code (6 spaces) for readability.
      add "      "; get; 
      replace "\n" "\n      "; put; clear;
      # The java template below is a single quoted string, so it must
      # not contain a single quote character. Changes to the template:
      # the stack is typed, read() counts lines by comparing the code
      # point (the old string comparison with == was never true), and an
      # eof() method exists because the compiled tests call mm.eof().
      add '

 /* Java code generated by "compile.java.pss" */
 import java.io.*;
 import java.util.regex.*;
 import java.util.*;   // contains stack

 public class Machine {
   // using int instead of char so that all unicode code points are
   // available instead of just utf16. (emojis cant fit into utf16)
   private int accumulator;         // counter for anything
   private int peep;                // next char in input stream
   private int charsRead;           // No. of chars read so far
   private int linesRead;           // No. of lines read so far
   public StringBuffer workspace;   // text accumulator
   private Stack<String> stack;      // parse token stack
   private static int LENGTH = 100;  // tape maximum length
   private StringBuffer[] tape;      // array of token attributes 
   private StringBuffer[] marks;     // tape marks
   private int tapePointer;          // pointer to current cell
   private Reader input;             // text input stream
   private boolean eof;              // end of stream reached?
   private boolean flag;             // not used here
   private StringBuffer escape;    // char used to "escape" others "\\"
   private StringBuffer delimiter; // push/pop delimiter (default is "*")
   
   /** make a new machine with a character stream reader */
   public Machine(Reader reader) {
     this.input = reader;
     this.eof = false;
     this.flag = false;
     this.charsRead = 0; 
     this.linesRead = 1; 
     this.escape = new StringBuffer("\\\\");
     this.delimiter = new StringBuffer("*");
     this.accumulator = 0;
     this.workspace = new StringBuffer("");
     this.stack = new Stack<String>();
     this.tapePointer = 0;
     this.tape = new StringBuffer[LENGTH];
     this.marks = new StringBuffer[LENGTH];

     for (int ii = 0; ii < this.tape.length; ii++) {
       this.tape[ii] = new StringBuffer(); 
       this.marks[ii] = new StringBuffer(); 
     }

     try
     { this.peep = this.input.read(); } 
     catch (java.io.IOException ex) {
       System.out.println("read error");
       System.exit(-1);
     }
   }

   /** true when the end of the input stream has been reached */
   public boolean eof() {
     return this.eof;
   }

   /** read one character from the input stream and 
       update the machine. */
   public void read() {
     try {
       if (this.eof) { System.exit(0); }
       this.charsRead++;
       // increment lines (10 is the newline code point)
       if (this.peep == 10) { this.linesRead++; }
       this.workspace.append(Character.toChars(this.peep));
       this.peep = this.input.read(); 
       if (this.peep == -1) { this.eof = true; }
     }
     catch (IOException ex) {
       System.out.println("Error reading input stream" + ex);
       System.exit(-1);
     }
   }

   /** increment tape pointer by one */
   public void increment() {
     this.tapePointer++;
     if (this.tapePointer > Machine.LENGTH - 1) {
       System.out.println("Tape length exceeded [" + LENGTH + "]");
       System.exit(1);
     }
   }
   
   /** remove escape character  */
   public void unescapeChar(char c) {
     if (workspace.length() > 0) {
       String s = this.workspace.toString().replace("\\\\"+c, c+"");
       this.workspace.setLength(0); workspace.append(s);
     }
   }

   /** add escape character  */
   public void escapeChar(char c) {
     if (workspace.length() > 0) {
       String s = this.workspace.toString().replace(c+"", "\\\\"+c);
       workspace.setLength(0); workspace.append(s);
     }
   }

   /** reads the input stream until the workspace end with text */
   public void until(String sSuffix) {
     while (true) {
       if (this.eof) return;
       if (this.workspace.toString().endsWith(sSuffix) &&
          !this.workspace.toString().endsWith(this.escape.toString() + sSuffix))
         return;
       this.read();
     }
   }

   /** pop the first token from the stack into the workspace */
   public Boolean pop() {
     if (this.stack.isEmpty()) return false;
     this.workspace.insert(0, this.stack.pop());     
     if (this.tapePointer > 0) this.tapePointer--;
     return true;
   }

   /** push the first token from the workspace to the stack */
   public Boolean push() {
     String sItem;
     // dont increment the tape pointer on an empty push
     if (this.workspace.length() == 0) return false;
     int iFirstStar = this.workspace.indexOf("*");
     if (iFirstStar != -1) {
       sItem = this.workspace.toString().substring(0, iFirstStar + 1);
       this.workspace.delete(0, iFirstStar + 1);
     }
     else {
       sItem = this.workspace.toString();
       this.workspace.setLength(0);
     }
     this.stack.push(sItem);     
     this.increment(); 
     return true;
   }

  public static void main(String[] args) throws Exception { 
    Machine mm = new Machine(new InputStreamReader(System.in));
    script: 
    while (!mm.eof) {\n'; get;

      add "\n    }";
      add "\n  }";
      add "\n}\n";
      # put a copy of the final compilation into the tapecell
      # so it can be inspected interactively.
      put; print; clear; quit;
    }

    "beginblock*commandset*", "beginblock*command*" {
      clear; 
      # indent main code for readability.
      ++; add "      "; get; 
      replace "\n" "\n      "; put; clear; --;
      add "/* Assembled with the script 'compile.java.pss' */\n";
      add "import java.io.*; \n";
      # add "import java.util.regex.*; \n";
      add "public class Script { \n";
      add "  public static void main(String[] args) throws Exception { \n";
      add "    Machine mm = new Machine(new InputStreamReader(System.in));\n";
      # the begin block code goes before the main loop
      get; add "\n"; ++; 
      # a labelled loop for "quit" (but quit can just exit?)
      add "    script: \n";
      add "    while (mm.eof() != true) {\n"; get;
      add "\n    }";
      add "\n  }";
      add "\n}\n";
      # put a copy of the final compilation into the tapecell
      # for interactive debugging.
      put; print; clear; quit;
    }

    # anything else left on the stack means the script did not parse:
    # restore the tokens, then print the whole stack with some hints.
    push; push;
    clear;
    add "After compiling with 'compile.java.pss' (at EOF): \n ";
    add "  parse error in input script. \n ";
    print; clear; 
    unstack; put; clear;
    add "Parse stack: "; get; add "\n";
    add "   * debug script ";
    add "   >> pep -If script -i 'some input' \n ";
    add "   *  debug compilation. \n ";
    add "   >> pep -Ia asm.pp script' \n ";
    print; clear; 
    quit;

  } # end of the (eof) block

  # there is an implicit .restart command here (jump start)