#!/usr/bin/env python3
"""JSON syntax checker built on a small "pep" parsing virtual machine.

The script reads text (standard input when run as a program), splits it
into tokens such as ``string*`` or ``integer*``, and reduces the token
stack with shift-reduce style rules.  It prints either a syntax error
message or a note that the input appears to be valid JSON.
"""

# code generated by "translate.py.pss" a pep script
# bumble.sf.net/books/pars/

import io
import re
import sys
from unicodedata import category    # for matching classes

# may use, which could make the char class code easier
# import regex
# regex.findall(r'[[:graph:]]', 'a 0 a b z')


class Machine:
    """State of the parsing machine: workspace, stack, tape and input.

    Tokens are delimiter-terminated strings (for example ``"string*"``).
    They are moved between the workspace (``work``) and the ``stack``,
    while ``tape[cell]`` holds the attribute text of the token at the
    same position.
    """

    def __init__(self, source=None):
        """Make a new machine.

        ``source`` may be a string or an object with a ``read(n)``
        method.  When omitted the machine reads from ``sys.stdin``, as
        the original script did.
        """
        self.size = 300          # how many elements in stack/tape/marks
        self.eof = False         # end of stream reached?
        self.charsRead = 0       # how many chars already read
        self.linesRead = 1       # how many lines already read
        self.escape = "\\"
        self.delimiter = "*"     # push/pop delimiter (default "*")
        self.counter = 0         # a counter for anything
        self.work = ""           # the workspace
        self.stack = []          # stack for parse tokens
        self.cell = 0            # current tape cell
        self.tape = [""] * self.size     # a list of attribute for tokens
        self.marks = [""] * self.size    # marked tape cells
        self.input = None        # stream the characters are read from
        self.peep = ""           # the next, not yet consumed, character
        self.setInput(sys.stdin if source is None else source)

    def setInput(self, newInput):
        """Attach a new input and prime the one-character lookahead.

        A ``str`` is wrapped in ``io.StringIO``; anything else must offer
        ``read(n)``.  Only the reading state (``eof``, ``charsRead``,
        ``linesRead``, ``peep``) is reset; workspace, stack and tape are
        left alone.
        """
        if isinstance(newInput, str):
            newInput = io.StringIO(newInput)
        self.input = newInput
        self.eof = False
        self.charsRead = 0
        self.linesRead = 1
        self.peep = self.input.read(1)

    def read(self):
        """Append the peep character to the workspace and fetch the next.

        Updates the character and line counters and sets ``eof`` once the
        input is exhausted.  Reading again after ``eof`` ends the program
        through ``sys.exit(0)`` (the original called the non-existent
        ``System.exit``).
        """
        if self.eof:
            sys.exit(0)
        self.charsRead += 1
        # increment lines
        if self.peep == "\n":
            self.linesRead += 1
        self.work += self.peep
        self.peep = self.input.read(1)
        if not self.peep:
            self.eof = True

    def increment(self):
        """Advance the tape pointer (command ++), growing tape and marks
        when the pointer moves past the current size."""
        self.cell += 1
        if self.cell >= self.size:
            self.tape.append("")
            self.marks.append("")
            self.size += 1

    # test if all chars in the text are in the unicode category
    # no! bug! because while checks mm.peep, but class test
    # checks mm.work. so have to adapt this function for either.
    def isInCategory(self, cat, text):
        """Return True when every character of ``text`` has a unicode
        category starting with ``cat`` (True for empty text)."""
        for ch in text:
            if not category(ch).startswith(cat):
                return False
        return True

    def unescapeChar(self, c):
        """Remove the backslash in front of every escaped ``c`` in the
        workspace."""
        if len(self.work) > 0:
            self.work = self.work.replace("\\" + c, c)

    def escapeChar(self, c):
        """Put a backslash in front of every ``c`` in the workspace."""
        if len(self.work) > 0:
            self.work = self.work.replace(c, "\\" + c)

    def countEscaped(self, suffix):
        """Count the escape strings directly in front of a trailing
        ``suffix`` in the workspace.

        Returns 0 when the workspace does not end with ``suffix``.  The
        text is sliced by hand so that the method does not need
        ``str.removesuffix`` (Python 3.9+), and an empty escape string
        yields 0 instead of looping forever.
        """
        if not self.work.endswith(suffix):
            return 0
        if not self.escape:
            return 0
        s = self.work[:len(self.work) - len(suffix)]
        width = len(self.escape)
        count = 0
        while s.endswith(self.escape):
            count += 1
            s = s[:-width]
        return count

    def until(self, suffix):
        """Read input until the workspace ends with an unescaped
        ``suffix`` or the input is exhausted.

        At least one character is read unless ``eof`` is already set.
        """
        # read at least one character
        if self.eof:
            return
        self.read()
        while True:
            if self.eof:
                return
            # count the trailing escape chars, odd=continue, even=stop
            if self.work.endswith(suffix):
                if self.countEscaped(suffix) % 2 == 0:
                    return
            self.read()

    def pop(self):
        """Move the top stack token to the front of the workspace.

        Returns False when the stack is empty, otherwise True.  The tape
        pointer is moved back one cell, but never below zero.
        """
        if len(self.stack) == 0:
            return False
        # the original read the global "mm" here instead of self
        self.work = self.stack.pop() + self.work
        if self.cell > 0:
            self.cell -= 1
        return True

    def push(self):
        """Move the first token of the workspace onto the stack.

        Returns False (and does nothing) for an empty workspace.  When
        the workspace holds no delimiter, all of it is pushed and the
        tape pointer is not advanced.
        """
        # dont increment the tape pointer on an empty push
        if len(self.work) == 0:
            return False
        # need to get this from the delimiter.
        iFirst = self.work.find(self.delimiter)
        if iFirst == -1:
            self.stack.append(self.work)
            self.work = ""
            return True
        self.stack.append(self.work[0:iFirst + 1])
        self.work = self.work[iFirst + 1:]
        self.increment()
        return True

    # this function is not used (the code is "inlined")
    def swap(self):
        """Exchange the workspace with the current tape cell."""
        self.work, self.tape[self.cell] = self.tape[self.cell], self.work

    def goToMark(self, mark):
        """Move the tape pointer to the cell marked ``mark``.

        When several cells carry the mark the last one wins.  An unknown
        mark prints a message and ends the program.
        """
        markFound = False
        for ii, name in enumerate(self.marks):
            # the original used the global "mm" here instead of self
            if name == mark:
                self.cell = ii
                markFound = True
        if not markFound:
            print("badmark '" + mark + "'!")
            sys.exit()

    def writeToFile(self):
        """Write the workspace to the file "sav.pp" in the current
        directory."""
        with open("sav.pp", "w") as f:
            f.write(self.work)

    def printState(self):
        """Print stack, workspace, peep and the counters to stdout."""
        print("Stack[" + ",".join(self.stack) +
              "] Work[" + self.work + "] Peep[" + self.peep + "]")
        print("Acc:" + str(self.counter) + " Esc:" + self.escape +
              " Delim:" + self.delimiter + " Chars:" + str(self.charsRead) +
              " Lines:" + str(self.linesRead) + " Cell:" + str(self.cell))

    def parse(self, s):
        """Check ``s`` (a string or a readable stream) for JSON syntax.

        The input is attached with ``setInput``, workspace, stack and
        tape pointer are cleared, and the grammar in ``parseJson`` is
        run.  Like the script itself this prints its verdict and may end
        the program through ``sys.exit``.  The original body referred to
        the Python 2 names ``file``, ``string`` and ``StringIO`` and
        could not run.
        """
        self.setInput(s)
        self.work = ""
        self.stack = []
        self.cell = 0
        parseJson(self)

# end of Machine class definition


def fail(mm, text):
    """Put ``text`` in the workspace, print it and end the program.

    ``sys.exit()`` is called without a status, which keeps the exit
    status of the original ``exit()`` calls.
    """
    mm.work = text
    sys.stdout.write(mm.work)    # print
    sys.exit()


def lexToken(mm):
    """Read one lexical token from the input and push it on the stack.

    Whitespace is discarded.  The token text is stored in the current
    tape cell, the token name (``integer*``, ``value*``, ``E*``,
    ``string*`` or a literal such as ``[*``) goes on the stack.  Unknown
    words, unterminated strings and stray characters end the program
    with a message.
    """
    mm.read()
    # Unlike Crockfords grammar, I will just completely ignore whitespace,
    # but this may not be acceptable in a rigorous application. Also, I
    # am just using the ctype.h definition of whitespace, whatever that
    # may be.
    if re.match(r"^[\s]+$", mm.work):
        mm.work = ''
        return

    if re.match(r"^[0-9]+$", mm.work):
        while re.match(r"^[0-9]+$", mm.peep):
            if mm.eof:
                break
            mm.read()
        mm.tape[mm.cell] = mm.work
        mm.work = "integer*"
        mm.push()
        return

    if re.match(r"^[a-z]+$", mm.work) or re.match(r"^[A-Z]+$", mm.work):
        while re.match(r"^[a-z]+$", mm.peep):
            if mm.eof:
                break
            mm.read()
        if mm.work not in ("true", "false", "null", "e", "E"):
            # handle error
            mm.tape[mm.cell] = mm.work
            fail(mm,
                 "Unknown value '" + mm.tape[mm.cell]
                 + "' at line " + str(mm.linesRead)
                 + " (character " + str(mm.charsRead) + ").\n")
        mm.tape[mm.cell] = mm.work
        if mm.work in ("e", "E"):
            mm.work = "E*"
            mm.push()
            return
        mm.work = "value*"
        mm.push()
        return

    if mm.work == "\"":
        # save line number for error message
        mm.tape[mm.cell] = str(mm.linesRead)
        mm.work = ''
        mm.until("\"")
        # The string is terminated only when the workspace really ends
        # with an unescaped quote.  Testing mm.eof alone (as the original
        # did) wrongly rejects a string whose closing quote is the very
        # last character of the input.
        terminated = (mm.work.endswith("\"")
                      and mm.countEscaped("\"") % 2 == 0)
        if not terminated:
            fail(mm,
                 "Unterminated quote (\") char, starting at line "
                 + mm.tape[mm.cell] + "\n")
        mm.work = mm.work[:-1]       # clip the closing quote
        mm.tape[mm.cell] = mm.work
        mm.work = "string*"
        mm.push()
        return

    # literal tokens
    if mm.work in (".", ",", ":", "-", "+", "[", "]", "{", "}"):
        mm.tape[mm.cell] = mm.work
        mm.work += "*"
        mm.push()
        return

    # here check if the workspace is empty. If not it is an error.
    if mm.work != "":
        mm.tape[mm.cell] = mm.work
        fail(mm,
             "JSON syntax error at line " + str(mm.linesRead)
             + ", char " + str(mm.charsRead)
             + ": unquoted '" + mm.tape[mm.cell] + "' character.\n")


def reduceStack(mm):
    """Apply the grammar rules to the tokens on top of the stack.

    Up to five tokens are popped into the workspace and compared with
    the error patterns and reduction rules.  After a reduction the rules
    are tried again from the start; when nothing matches the tokens are
    pushed back.  At end of input the verdict is printed.
    """
    while True:
        # This is for visualising stack reductions when debugging
        # unstack; add "\n"; print; clip; stack;

        # The parse/compile phase
        # --------------
        # 2 tokens
        mm.pop()
        mm.pop()

        # -----------
        # Two token errors (not necessarily a complete list)

        # comma errors
        if mm.work in ("{*,*", ",*}*", "[*,*", ",*,*", ",*]*"):
            fail(mm,
                 "JSON syntax error at line " + str(mm.linesRead)
                 + ", char " + str(mm.charsRead)
                 + " (extra or misplaced ',' comma)\n")

        # exponent errors (e/E must be followed by an int or signed int)
        if (mm.work != "E*" and mm.work.startswith("E*")
                and not mm.work.endswith(
                    ("integer*", "-*", "+*", "number*"))):
            fail(mm,
                 "JSON syntax error at line " + str(mm.linesRead)
                 + " (char " + str(mm.charsRead)
                 + "): misplaced exponent 'e' or 'E' \n"
                 + "In JSON syntax, e/E may only precede an int or "
                 + "signed int.\n"
                 + "for example: 33e+01 \n")

        # exponent errors (e/E must be preceded by an int, signed int
        # or decimal)
        if (mm.work != "E*" and mm.work.endswith("E*")
                and not mm.work.startswith(
                    ("integer*", "sign.integer*", "decimal*"))):
            fail(mm,
                 "JSON syntax error at line " + str(mm.linesRead)
                 + " (char " + str(mm.charsRead)
                 + "): misplaced exponent 'e' or 'E' \n"
                 + "In JSON syntax, e/E may only be preceded by an int, "
                 + "signed int.\n"
                 + "or decimal eg: 33e+01 \n")

        # sign errors (+/- must be followed by an integer)
        if (mm.work != "-*" and mm.work.startswith("-*")
                and not mm.work.endswith("integer*")):
            fail(mm,
                 "Json syntax error at line " + str(mm.linesRead)
                 + " (char " + str(mm.charsRead)
                 + "): misplaced negative '-' sign\n"
                 + "In JSON syntax, - may only precede a number \n"
                 + "for example: -33.01 \n")

        # sign errors (+/- must be followed by an integer)
        if (mm.work != "+*" and mm.work.startswith("+*")
                and not mm.work.endswith("integer*")):
            fail(mm,
                 "Json syntax error at line " + str(mm.linesRead)
                 + " (char " + str(mm.charsRead)
                 + "): misplaced positive '+' sign\n"
                 + "In JSON syntax, + may only precede a number \n"
                 + "for example: +33.01 \n")

        # dot errors (. must be followed by an integer)
        if (mm.work != ".*" and mm.work.startswith(".*")
                and not mm.work.endswith("integer*")):
            fail(mm,
                 "Json syntax error at line " + str(mm.linesRead)
                 + " (char " + str(mm.charsRead)
                 + "): misplaced dot '.' sign\n"
                 + "In JSON syntax, dots may only be used in decimal "
                 + "numbers \n"
                 + "for example: -33.01 \n")

        # dot errors (. must be preceded by an integer or signed integer)
        if (mm.work != ".*" and mm.work.endswith(".*")
                and not mm.work.startswith(("integer*", "sign.integer*"))):
            fail(mm,
                 "JSON syntax error at line " + str(mm.linesRead)
                 + " (char " + str(mm.charsRead)
                 + "): misplaced dot '.' sign\n"
                 + "In JSON syntax, dots may only be used in decimal "
                 + "numbers \n"
                 + "for example: -33.01, but .44 is not a legal JSON "
                 + "number \n")

        # eg errors "items*:*","members*:*",",*:*","[*:*","{*:*"
        # A colon must be preceded by a string. Using logic
        if (mm.work.endswith(":*") and mm.work != ":*"
                and not mm.work.startswith("string*")):
            fail(mm,
                 "Json syntax error near line " + str(mm.linesRead)
                 + ", char " + str(mm.charsRead)
                 + " (misplaced colon ':') \n"
                 + "A ':' can only occur after a string key in an object "
                 + "structure \n"
                 + "Example: {\"cancelled\":true} \n")

        # more colon errors
        if mm.work != ":*" and mm.work.startswith(":*"):
            if mm.work.endswith(("}*", ",*", "]*")):
                fail(mm,
                     "JSON syntax error near line " + str(mm.linesRead)
                     + ", char " + str(mm.charsRead)
                     + " (misplaced colon ':' or missing value?) \n"
                     + "A ':' only occur as part of an object member \n"
                     + "Example: {\"cancelled\":true} \n")

        # catch object member errors
        # also need to check that not only 1 token in on the stack
        # hence the !"member*" construct
        if mm.work.startswith(("member*", "members*")):
            if (mm.work != "member*" and mm.work != "members*"
                    and not mm.work.endswith((",*", "}*"))):
                fail(mm,
                     "JSON syntax error after object member near line "
                     + str(mm.linesRead)
                     + " (char " + str(mm.charsRead) + ")\n")

        # catch array errors
        if (mm.work.startswith("items*") and mm.work != "items*"
                and not mm.work.endswith((",*", "]*"))):
            fail(mm,
                 "Error after an array item near line "
                 + str(mm.linesRead)
                 + " (char " + str(mm.charsRead) + ")\n")

        if mm.work.startswith(("array*", "object*")):
            if (mm.work != "array*" and mm.work != "object*"
                    and not mm.work.endswith((",*", "}*", "]*"))):
                fail(mm,
                     "JSON syntax error near line " + str(mm.linesRead)
                     + " char " + str(mm.charsRead) + ")\n")

        # invalid string sequence
        if mm.work.startswith("string*"):
            if (mm.work != "string*"
                    and not mm.work.endswith((",*", "]*", "}*", ":*"))):
                fail(mm,
                     "JSON syntax error after a string near line "
                     + str(mm.linesRead)
                     + " (char " + str(mm.charsRead) + ")\n")

        # transmogrify into array item, start array
        if mm.work in ("[*number*", "[*string*", "[*value*",
                       "[*array*", "[*object*"):
            mm.work = "[*items*"
            mm.push()
            mm.push()
            continue

        # exponents (e-403, E+120, E04), this slightly simplifies
        # number parsing
        if mm.work in ("E*sign.integer*", "E*integer*"):
            mm.work = "^"
            mm.increment()                   # ++
            mm.work += mm.tape[mm.cell]      # get
            if mm.cell > 0:
                mm.cell -= 1                 # --
            mm.tape[mm.cell] = mm.work       # put
            mm.work = "exponent*"
            mm.push()
            continue

        # JSON scientific format (23e-10, -201E+33)
        if mm.work in ("integer*exponent*", "sign.integer*exponent*"):
            mm.work = mm.tape[mm.cell]       # get
            # enforce multidigit zero rules
            # But is "0e44" legal JSON number syntax? That would seem odd
            # if it is.
            if mm.work.startswith("+"):
                fail(mm,
                     "JSON syntax error at line " + str(mm.linesRead)
                     + " (char " + str(mm.charsRead) + "): \n"
                     + "In JSON syntax, the number part may not have a "
                     + "positive sign \n"
                     + "eg: +0.12e34 (error!) \n"
                     + "eg: 0.12e+34 (OK!) \n")
            if mm.work.startswith("-"):
                mm.work = mm.work[1:]        # clop
            if mm.work != "0" and mm.work.startswith("0"):
                fail(mm,
                     "Json syntax error at line " + str(mm.linesRead)
                     + " (char " + str(mm.charsRead) + "): \n"
                     + "In JSON syntax, multidigit numbers must begin "
                     + "with 1-9 \n"
                     + "eg: -0234.01E+9 (error) \n")
            mm.work = mm.tape[mm.cell]       # get
            mm.increment()                   # ++
            mm.work += mm.tape[mm.cell]      # get
            if mm.cell > 0:
                mm.cell -= 1                 # --
            mm.tape[mm.cell] = mm.work       # put
            mm.work = "number*"
            mm.push()
            continue

        # JSON scientific format (-0.23e10, 10.2E+33)
        if mm.work == "decimal*exponent*":
            mm.work = mm.tape[mm.cell]       # get
            mm.increment()                   # ++
            mm.work += mm.tape[mm.cell]      # get
            if mm.cell > 0:
                mm.cell -= 1                 # --
            mm.tape[mm.cell] = mm.work       # put
            mm.work = "number*"
            mm.push()
            continue

        # where does a number terminate, this is the problem
        # It terminates at the tokens ,* }* ]* and maybe space but
        # this script doesnt have a space* token.
        if mm.work in ("sign.integer*,*", "integer*,*"):
            mm.work = "number*,*"
            mm.push()
            mm.push()
            continue

        # transmog
        if mm.work in ("sign.integer*]*", "integer*]*"):
            mm.work = "items*]*"
            mm.push()
            mm.push()
            continue

        if mm.work in ("sign.integer*}*", "integer*}*"):
            mm.work = "number*}*"
            mm.push()
            mm.push()
            continue

        # convert decimals to numbers with token lookahead
        if mm.work in ("decimal*}*", "decimal*]*", "decimal*,*"):
            mm.work = mm.work.replace("decimal*", "number*")
            mm.push()
            mm.push()
            continue

        # signed numbers (this rule was emitted twice in the original;
        # the second copy could never run and has been dropped)
        if mm.work in ("-*integer*", "+*integer*"):
            mm.work = mm.tape[mm.cell]       # get
            mm.increment()                   # ++
            mm.work += mm.tape[mm.cell]      # get
            if mm.cell > 0:
                mm.cell -= 1                 # --
            mm.tape[mm.cell] = mm.work       # put
            mm.work = "sign.integer*"
            mm.push()
            continue

        # empty arrays are legal json
        if mm.work == "[*]*":
            mm.work = "array*"
            mm.push()
            continue

        # empty objects are legal json
        if mm.work == "{*}*":
            mm.work = "object*"
            mm.push()
            continue

        # --------------
        # 3 tokens
        mm.pop()

        # ---------------
        # Some three token errors

        # Object errors
        # A negative logic doesnt work because of the lookahead required
        # for numbers
        if mm.work in ("{*string*}*", "{*integer*}*", "{*sign.integer*}*",
                       "{*array*}*", "{*object*}*", "{*value*}*",
                       "{*decimal*}*"):
            fail(mm,
                 "Json syntax error near line " + str(mm.linesRead)
                 + ", char " + str(mm.charsRead)
                 + " (misplaced brace '}' or bad object) \n"
                 + "A '}' can only occur to terminate an object "
                 + "structure \n"
                 + "Example: {\"hour\":21.00, \"cancelled\":true} \n")

        # transmogrify number into array item
        if mm.work == "[*number*,*":
            mm.work = "[*items*,*"
            mm.push()
            mm.push()
            mm.push()
            continue

        # decimal numbers eg -4.334 or +4.3 or 0.1
        if mm.work == "sign.integer*.*integer*":
            mm.work = mm.tape[mm.cell]       # get
            # The original appended the two messages below to the number
            # text instead of replacing it, so the number was printed in
            # front of the message.
            if mm.work.startswith("+"):
                # error, no positive signed decimals in JSON
                fail(mm,
                     "Json syntax error at line " + str(mm.linesRead)
                     + " (char " + str(mm.charsRead)
                     + "): misplaced positive '+' sign\n"
                     + "In JSON syntax, decimal numbers are not "
                     + "positively signed\n"
                     + "eg: +33.01 (error) \n")
            if mm.work.startswith("-0") and mm.work != "-0":
                fail(mm,
                     "Json syntax error at line " + str(mm.linesRead)
                     + " (char " + str(mm.charsRead) + "): \n"
                     + "In JSON syntax, multidigit numbers must begin "
                     + "with 1-9 \n"
                     + "eg: -0234.01E+9 (error) \n")
            mm.work = "decimal*"
            mm.push()
            continue

        # decimal numbers eg -4.334 or +4.3 or 0.1
        if mm.work == "integer*.*integer*":
            mm.work = "decimal*"
            mm.push()
            continue

        # arrays,
        if mm.work in ("[*items*]*", "[*number*]*"):
            mm.work = "array*"
            mm.push()
            continue

        if mm.work in ("items*,*string*", "items*,*value*",
                       "items*,*array*", "items*,*object*",
                       "items*,*number*"):
            mm.work = "items*"
            mm.push()
            continue

        # object members
        # "string*:*integer*",
        if mm.work in ("string*:*number*", "string*:*string*",
                       "string*:*value*", "string*:*object*",
                       "string*:*array*"):
            mm.work = "member*"
            mm.push()
            continue

        # multiple elements of an object
        if mm.work in ("member*,*member*", "members*,*member*"):
            mm.work = "members*"
            mm.push()
            continue

        if mm.work in ("{*members*}*", "{*member*}*"):
            mm.work = "object*"
            mm.push()
            continue

        mm.pop()

        # --------------
        # 4 tokens
        if mm.work in ("items*,*items*,*", "items*,*number*,*"):
            mm.work = "items*,*"
            mm.push()
            mm.push()
            continue

        # numbers require a lookahead token, unfortunately
        if mm.work == "string*:*number*,*":
            mm.work = "member*,*"
            mm.push()
            mm.push()
            continue

        # numbers require a lookahead token, unfortunately
        if mm.work == "string*:*number*}*":
            mm.work = "member*}*"
            mm.push()
            mm.push()
            continue

        # multiple elements of an object with lookahead
        if mm.work in ("member*,*member*,*", "members*,*member*,*"):
            mm.work = "members*,*"
            mm.push()
            mm.push()
            continue

        # multiple elements of an object with lookahead
        if mm.work in ("member*,*member*}*", "members*,*member*}*"):
            mm.work = "members*}*"
            mm.push()
            mm.push()
            continue

        mm.pop()

        # --------------
        # 5 tokens
        # need this clumsy rule for numbers which get resolved when
        # a ] is seen. This is the lookahead
        if mm.work in ("[*items*,*items*]*", "[*items*,*number*]*"):
            mm.work = "array*"
            mm.push()
            continue

        # no rule matched: put the tokens back on the stack
        mm.push()
        mm.push()
        mm.push()
        mm.push()
        mm.push()

        if mm.eof:
            while mm.pop():
                continue                 # unstack
            if mm.work in ("object*", "array*", "value*", "string*",
                           "integer*", "decimal*", "number*"):
                while mm.push():
                    continue             # stack
                mm.work += ("(Appears to be) valid JSON syntax. "
                            "Top level structure was '")
                sys.stdout.write(mm.work)    # print
                mm.work = ''
                mm.pop()
                mm.work = mm.work[:-1]       # clip the delimiter
                mm.work += "'\n"
                sys.stdout.write(mm.work)    # print
                mm.work = ''
                sys.exit()
            while mm.push():
                continue                 # stack
            mm.work += "(maybe) Invalid JSON \n"
            mm.work += "The parse stack was \n"
            sys.stdout.write(mm.work)        # print
            mm.work = ''
            while mm.pop():
                continue                 # unstack
            mm.work += "\n"
            sys.stdout.write(mm.work)        # print
        return


def parseJson(mm):
    """Run the lexer and the grammar on machine ``mm`` until its input
    is exhausted (or a rule ends the program)."""
    while not mm.eof:
        lexToken(mm)
        reduceStack(mm)


def main():
    """Check the JSON text on standard input."""
    mm = Machine()
    parseJson(mm)


if __name__ == "__main__":
    main()

# end of code generated by tr/translate.py.pss