#!/usr/bin/ruby
# code generated by "translate.ruby.pss" a pep script
# http://bumble.sf.net/books/pars/tr/

# require 'something'

# The virtual machine driven by the generated parser below. It holds a
# workspace string, a one-character lookahead ("peep"), a stack of parse
# tokens and a tape of token attributes addressed by the current cell.
class Machine
  attr_accessor :work, :charsRead, :linesRead, :escape, :delimiter,
                :counter, :stack, :tape, :cell, :marks, :eof, :peep

  # Make a new machine.
  #
  # input - any object responding to #getc (returns one character, or nil
  #         at end of stream). Defaults to ARGF (files named on the command
  #         line, or standard input).
  def initialize(input = ARGF)
    @input = input
    @size = 300          # how many elements in stack/tape/marks
    @eof = false         # end of stream reached?
    @charsRead = 0       # how many chars already read
    @linesRead = 1       # how many lines already read
    @escape = "\\"
    @delimiter = "*"     # push/pop delimiter (default "*")
    @counter = 0         # a counter for anything
    @work = ""           # the workspace
    @stack = []          # stack for parse tokens
    @cell = 0            # current tape cell
    @tape = Array.new(@size) { String.new }   # a list of attributes for tokens
    @marks = Array.new(@size) { String.new }  # marked tape cells
    # getc returns nil at end of input, whereas readchar raises EOFError,
    # which made the nil test in #read unreachable.
    @peep = @input.getc
    # empty input: there is nothing to read, so flag end of stream now
    @eof = true if @peep.nil?
  end

  # Print a diagnostic for an exhausted tape.
  # multiline strings are ok in ruby
  def printSizeError
    puts "
      Tape max size exceeded!
        tape maximum size = #{@size}
        tape cell (current) = #{@cell}
      You can increase the @size value in the ruby script
      but normally this error indicates an error in your parsing
      script. The only exception would be massively nested structures
      in the source data."
  end

  # Placeholder: switching the input of an existing machine is not
  # implemented (pass the stream to Machine.new instead).
  def setInput(newInput)
    puts "to be implemented"
  end

  # Read one character from the input stream and update the machine:
  # the peep character is appended to the workspace and a new peep is
  # fetched. Exits the program when called after end of stream.
  def read
    exit if @eof
    @charsRead += 1
    # increment lines
    @linesRead += 1 if @peep == "\n"
    # += (not <<) on purpose: tape cells may reference the old workspace
    # string and must not be modified through it.
    @work += @peep
    @peep = @input.getc
    @eof = true if @peep.nil?
  end

  # Test if all chars in workspace are in a unicode category.
  # Not implemented: always returns nil.
  def isInCategory(cat)
    # for ch in @work
    #   if not category(ch).start_with?(cat) then return false end
    # return True
  end

  # Remove the escape character in front of every occurrence of c in the
  # workspace. This needs to actually walk the string, eg "abcab\cabc";
  # not trivial, so the simple substitution is only an approximation.
  def unescapeChar(c)
    @work.gsub!("#{@escape}#{c}", c)
  end

  # Put the escape character in front of every occurrence of c in the
  # workspace.
  def escapeChar(c)
    @work.gsub!(c, @escape + c)
  end

  # A helper for the multiescape until bug: counts the consecutive escape
  # characters that directly precede suffix at the end of the workspace.
  def countEscaped(suffix)
    # an empty escape string would match forever below
    return 0 if @escape.empty?

    count = 0
    s = @work.delete_suffix(suffix)
    while s.end_with?(@escape)
      count += 1
      s.delete_suffix!(@escape)
    end
    count
  end

  # Read the input stream until the workspace ends with an unescaped
  # suffix, or until the end of the stream. At least one character is read
  # unless the stream is already exhausted.
  def until(suffix)
    return if @eof

    read
    loop do
      return if @eof
      # count the @escape chars preceding suffix:
      # if odd, keep reading, if even, stop
      return if @work.end_with?(suffix) && countEscaped(suffix).even?

      read
    end
  end

  # Pop the last token from the stack onto the front of the workspace and
  # step the tape pointer back. Returns false when the stack is empty.
  def pop
    return false if @stack.empty?

    @work = @stack.pop + @work
    @cell -= 1 if @cell > 0
    true
  end

  # Push the first token of the workspace (up to and including the first
  # delimiter) onto the stack and advance the tape pointer. Returns false
  # when the workspace is empty.
  def push
    # dont increment the tape pointer on an empty push
    return false if @work == ""

    iFirst = @work.index(@delimiter)
    if iFirst.nil?
      # no delimiter: the whole workspace is the token
      @stack.push(@work)
      @work = ""
      return true
    end
    # s[i..j] means all chars from i to j
    @stack.push(@work[0..iFirst])
    @work = @work[iFirst + 1..-1]
    # the tape has @size cells, so the last valid index is @size - 1
    if @cell + 1 < @size
      @cell += 1
    else
      printSizeError
      exit
    end
    true
  end

  # Pop every token off the stack into the workspace (pep "unstack").
  def unstackAll
    nil while pop
  end

  # Push every token in the workspace onto the stack (pep "stack").
  def pushAll
    nil while push
  end

  # Print the machine registers, for debugging.
  def printState
    puts "Stack[#{@stack.join(', ')}] Work[#{@work}] Peep[#{@peep}]"
    puts "Acc:#{@counter} Esc:#{@escape} Delim:#{@delimiter} Chars:#{@charsRead}" +
         " Lines:#{@linesRead} Cell:#{@cell}"
  end

  # Placeholder. This is where the actual parsing/compiling code should go
  # so that it can be used by other ruby classes/objects; it should also
  # take a stream argument. Currently does nothing.
  def parse(s)
    # a reset or "setinput()" method would be useful to parse a
    # different string/file/stream, without creating a new
    # machine object.
    # puts "not implemented"
  end
end
# end of Machine class definition

# Print a message and terminate the program (used for all syntax errors).
def report_and_exit(message)
  print message
  exit
end

# Run the generated JSON syntax checker on the input of machine mm.
#
# The lex block turns characters into tokens ("integer*", "string*", "[*"
# ...) on the stack; the parse block repeatedly pops up to five tokens and
# reduces them by grammar rules. The parse stack is printed on every parse
# iteration (debug output of the generated script). When the input is
# exhausted a verdict is printed. Syntax errors print a message and exit.
def run_json_checker(mm)
  # the restart flag, which allows .restart to work before the
  # parse label, in languages (like ruby) that dont have
  # labelled loops
  restart = false

  # replace the tokens in the workspace by a reduction and restack them
  reduce = lambda do |tokens|
    mm.work = tokens
    mm.pushAll
  end

  while !mm.eof
    # ---------------
    # lex block (single pass; "break" jumps to the parse block)
    loop do
      mm.read

      # Unlike Crockfords grammar, I will just completely ignore whitespace,
      # but this may not be acceptable in a rigorous application.
      if mm.work.match?(/^[[:space:]]+$/)
        mm.work = ''
        break
      end

      if mm.work.match?(/^[0-9]+$/)
        while /^[0-9]+$/.match?(mm.peep)
          break if mm.eof
          mm.read
        end
        mm.tape[mm.cell] = mm.work
        mm.work = "integer*"
        mm.push
        break
      end

      if mm.work.match?(/^[a-z]+$/) || mm.work.match?(/^[A-Z]+$/)
        while /^[a-z]+$/.match?(mm.peep)
          break if mm.eof
          mm.read
        end
        unless %w[true false null e E].include?(mm.work)
          report_and_exit("Unknown value '#{mm.work}' at line #{mm.linesRead}" \
                          " (character #{mm.charsRead}).\n")
        end
        mm.tape[mm.cell] = mm.work
        mm.work = %w[e E].include?(mm.work) ? "E*" : "value*"
        mm.push
        break
      end

      if mm.work == "\""
        # save line number for error message
        mm.tape[mm.cell] = mm.linesRead.to_s
        mm.work = ''
        mm.until("\"")
        # Testing mm.eof alone is wrong: eof is already set when the closing
        # quote is the very last character of the input.
        closed = mm.work.end_with?("\"") && mm.countEscaped("\"").even?
        unless closed
          report_and_exit("Unterminated quote (\") char, starting at line " \
                          "#{mm.tape[mm.cell]}\n")
        end
        mm.work = mm.work[0..-2] # clip the closing quote
        mm.tape[mm.cell] = mm.work
        mm.work = "string*"
        mm.push
        break
      end

      # literal tokens
      if [".", ",", ":", "-", "+", "[", "]", "{", "}"].include?(mm.work)
        mm.tape[mm.cell] = mm.work
        mm.work += "*"
        mm.push
        break
      end

      # here check if the workspace is empty. If not it is an error.
      if mm.work != ""
        report_and_exit("JSON syntax error at line #{mm.linesRead}, char " \
                        "#{mm.charsRead}: unquoted '#{mm.work}' character.\n")
      end
      break
    end

    if restart
      restart = false
      next
    end

    # ---------------
    # parse block
    loop do
      # position used by the error messages (no input is read in this block)
      ln = mm.linesRead
      ch = mm.charsRead

      # just for debugging: print the whole parse stack
      mm.unstackAll
      mm.work += "\n"
      print mm.work
      mm.work = mm.work[0..-2]
      mm.pushAll

      # The parse/compile phase
      # --------------
      # 2 tokens
      mm.pop
      mm.pop

      #-----------
      # Two token errors (not necessarily a complete list)

      # comma errors
      if ["{*,*", ",*}*", "[*,*", ",*,*", ",*]*"].include?(mm.work)
        report_and_exit("JSON syntax error at line #{ln}, char #{ch}" \
                        " (extra or misplaced ',' comma)\n")
      end

      # exponent errors (e/E must be followed by an int or signed int)
      if mm.work != "E*" && mm.work.start_with?("E*") &&
         !mm.work.end_with?("integer*") && !mm.work.end_with?("-*") &&
         !mm.work.end_with?("+*") && !mm.work.end_with?("number*")
        report_and_exit("JSON syntax error at line #{ln} (char #{ch}):" \
                        " misplaced exponent 'e' or 'E' \n" \
                        "In JSON syntax, e/E may only precede an int or signed int.\n" \
                        "for example: 33e+01 \n")
      end

      # exponent errors (e/E must be preceded by an int, signed int or decimal)
      if mm.work != "E*" && mm.work.end_with?("E*") &&
         !mm.work.start_with?("integer*") &&
         !mm.work.start_with?("sign.integer*") &&
         !mm.work.start_with?("decimal*")
        report_and_exit("JSON syntax error at line #{ln} (char #{ch}):" \
                        " misplaced exponent 'e' or 'E' \n" \
                        "In JSON syntax, e/E may only be preceded by an int, signed int.\n" \
                        "or decimal eg: 33e+01 \n")
      end

      # sign errors (+/- must be followed by an integer)
      if mm.work != "-*" && mm.work.start_with?("-*") &&
         !mm.work.end_with?("integer*")
        report_and_exit("Json syntax error at line #{ln} (char #{ch}):" \
                        " misplaced negative '-' sign\n" \
                        "In JSON syntax, - may only precede a number \n" \
                        "for example: -33.01 \n")
      end

      if mm.work != "+*" && mm.work.start_with?("+*") &&
         !mm.work.end_with?("integer*")
        report_and_exit("Json syntax error at line #{ln} (char #{ch}):" \
                        " misplaced positive '+' sign\n" \
                        "In JSON syntax, + may only precede a number \n" \
                        "for example: +33.01 \n")
      end

      # dot errors (. must be followed by an integer)
      if mm.work != ".*" && mm.work.start_with?(".*") &&
         !mm.work.end_with?("integer*")
        report_and_exit("Json syntax error at line #{ln} (char #{ch}):" \
                        " misplaced dot '.' sign\n" \
                        "In JSON syntax, dots may only be used in decimal numbers \n" \
                        "for example: -33.01 \n")
      end

      # dot errors (. must be preceded by an integer or signed integer)
      if mm.work != ".*" && mm.work.end_with?(".*") &&
         !mm.work.start_with?("integer*") &&
         !mm.work.start_with?("sign.integer*")
        report_and_exit("JSON syntax error at line #{ln} (char #{ch}):" \
                        " misplaced dot '.' sign\n" \
                        "In JSON syntax, dots may only be used in decimal numbers \n" \
                        "for example: -33.01, but .44 is not a legal JSON number \n")
      end

      # eg errors "items*:*","members*:*",",*:*","[*:*","{*:*"
      # A colon must be preceded by a string. Using logic
      if mm.work.end_with?(":*") && mm.work != ":*" &&
         !mm.work.start_with?("string*")
        report_and_exit("Json syntax error near line #{ln}, char #{ch}" \
                        " (misplaced colon ':') \n" \
                        "A ':' can only occur after a string key in an object structure \n" \
                        "Example: {\"cancelled\":true} \n")
      end

      # more colon errors
      if mm.work != ":*" && mm.work.start_with?(":*") &&
         (mm.work.end_with?("}*") || mm.work.end_with?(",*") ||
          mm.work.end_with?("]*"))
        report_and_exit("JSON syntax error near line #{ln}, char #{ch}" \
                        " (misplaced colon ':' or missing value?) \n" \
                        "A ':' only occur as part of an object member \n" \
                        "Example: {\"cancelled\":true} \n")
      end

      # catch object member errors
      # also need to check that not only 1 token is on the stack
      # hence the !"member*" construct
      if (mm.work.start_with?("member*") || mm.work.start_with?("members*")) &&
         mm.work != "member*" && mm.work != "members*" &&
         !mm.work.end_with?(",*") && !mm.work.end_with?("}*")
        report_and_exit("JSON syntax error after object member near line " \
                        "#{ln} (char #{ch})\n")
      end

      # catch array errors
      if mm.work.start_with?("items*") && mm.work != "items*" &&
         !mm.work.end_with?(",*") && !mm.work.end_with?("]*")
        report_and_exit("Error after an array item near line #{ln} (char #{ch})\n")
      end

      if (mm.work.start_with?("array*") || mm.work.start_with?("object*")) &&
         mm.work != "array*" && mm.work != "object*" &&
         !mm.work.end_with?(",*") && !mm.work.end_with?("}*") &&
         !mm.work.end_with?("]*")
        report_and_exit("JSON syntax error near line #{ln} char #{ch})\n")
      end

      # invalid string sequence
      if mm.work.start_with?("string*") && mm.work != "string*" &&
         !mm.work.end_with?(",*") && !mm.work.end_with?("]*") &&
         !mm.work.end_with?("}*") && !mm.work.end_with?(":*")
        report_and_exit("JSON syntax error after a string near line " \
                        "#{ln} (char #{ch})\n")
      end

      # transmogrify into array item, start array
      if ["[*number*", "[*string*", "[*value*", "[*array*",
          "[*object*"].include?(mm.work)
        reduce.call("[*items*")
        next
      end

      # exponents (e-403, E+120, E04), this slightly simplifies number parsing
      if mm.work == "E*sign.integer*" || mm.work == "E*integer*"
        mm.tape[mm.cell] = "^" + mm.tape[mm.cell + 1]
        reduce.call("exponent*")
        next
      end

      # JSON scientific format (23e-10, -201E+33)
      if mm.work == "integer*exponent*" || mm.work == "sign.integer*exponent*"
        digits = mm.tape[mm.cell]
        # enforce multidigit zero rules
        # But is "0e44" legal JSON number syntax? That would seem odd
        # if it is.
        if digits.start_with?("+")
          report_and_exit("JSON syntax error at line #{ln} (char #{ch}): \n" \
                          "In JSON syntax, the number part may not have a positive sign \n" \
                          "eg: +0.12e34 (error!) \n" \
                          "eg: 0.12e+34 (OK!) \n")
        end
        digits = digits[1..-1] if digits.start_with?("-")
        if digits != "0" && digits.start_with?("0")
          report_and_exit("Json syntax error at line #{ln} (char #{ch}): \n" \
                          "In JSON syntax, multidigit numbers must begin with 1-9 \n" \
                          "eg: -0234.01E+9 (error) \n")
        end
        # join mantissa and exponent attributes in the current tape cell
        mm.tape[mm.cell] = mm.tape[mm.cell] + mm.tape[mm.cell + 1]
        reduce.call("number*")
        next
      end

      # JSON scientific format (-0.23e10, 10.2E+33)
      if mm.work == "decimal*exponent*"
        mm.tape[mm.cell] = mm.tape[mm.cell] + mm.tape[mm.cell + 1]
        reduce.call("number*")
        next
      end

      # where does a number terminate, this is the problem
      # It terminates at the tokens ,* }* ]* and maybe space but
      # this script doesnt have a space* token.
      if mm.work == "sign.integer*,*" || mm.work == "integer*,*"
        reduce.call("number*,*")
        next
      end

      # transmog
      if mm.work == "sign.integer*]*" || mm.work == "integer*]*"
        reduce.call("items*]*")
        next
      end

      if mm.work == "sign.integer*}*" || mm.work == "integer*}*"
        reduce.call("number*}*")
        next
      end

      # convert decimals to numbers with token lookahead
      if ["decimal*}*", "decimal*]*", "decimal*,*"].include?(mm.work)
        reduce.call(mm.work.gsub("decimal*", "number*"))
        next
      end

      # signed numbers: join the sign and the digits in the current tape cell
      if mm.work == "-*integer*" || mm.work == "+*integer*"
        mm.tape[mm.cell] = mm.tape[mm.cell] + mm.tape[mm.cell + 1]
        reduce.call("sign.integer*")
        next
      end

      # empty arrays are legal json
      if mm.work == "[*]*"
        reduce.call("array*")
        next
      end

      # empty objects are legal json
      if mm.work == "{*}*"
        reduce.call("object*")
        next
      end

      # --------------
      # 3 tokens
      mm.pop

      #---------------
      # Some three token errors

      # Object errors
      # A negative logic doesnt work because of the lookahead required for numbers
      if ["{*string*}*", "{*integer*}*", "{*sign.integer*}*", "{*array*}*",
          "{*object*}*", "{*value*}*", "{*decimal*}*"].include?(mm.work)
        report_and_exit("Json syntax error near line #{ln}, char #{ch}" \
                        " (misplaced brace '}' or bad object) \n" \
                        "A '}' can only occur to terminate an object structure \n" \
                        "Example: {\"hour\":21.00, \"cancelled\":true} \n")
      end

      # transmogrify number into array item
      if mm.work == "[*number*,*"
        reduce.call("[*items*,*")
        next
      end

      # decimal numbers eg -4.334 or +4.3 or 0.1
      if mm.work == "sign.integer*.*integer*"
        signed = mm.tape[mm.cell]
        if signed.start_with?("+")
          # error, no positive signed decimals in JSON
          report_and_exit("Json syntax error at line #{ln} (char #{ch}):" \
                          " misplaced positive '+' sign\n" \
                          "In JSON syntax, decimal numbers are not positively signed\n" \
                          "eg: +33.01 (error) \n")
        end
        if signed.start_with?("-0") && signed != "-0"
          report_and_exit("Json syntax error at line #{ln} (char #{ch}): \n" \
                          "In JSON syntax, multidigit numbers must begin with 1-9 \n" \
                          "eg: -0234.01E+9 (error) \n")
        end
        reduce.call("decimal*")
        next
      end

      # decimal numbers eg 4.334 or 0.1
      if mm.work == "integer*.*integer*"
        reduce.call("decimal*")
        next
      end

      # arrays
      if mm.work == "[*items*]*" || mm.work == "[*number*]*"
        reduce.call("array*")
        next
      end

      # array items
      if ["items*,*string*", "items*,*value*", "items*,*array*",
          "items*,*object*", "items*,*number*"].include?(mm.work)
        reduce.call("items*")
        next
      end

      # object members
      # "string*:*integer*",
      if ["string*:*number*", "string*:*string*", "string*:*value*",
          "string*:*object*", "string*:*array*"].include?(mm.work)
        reduce.call("member*")
        next
      end

      # multiple elements of an object
      if mm.work == "member*,*member*" || mm.work == "members*,*member*"
        reduce.call("members*")
        next
      end

      # objects
      if mm.work == "{*members*}*" || mm.work == "{*member*}*"
        reduce.call("object*")
        next
      end

      # --------------
      # 4 tokens
      mm.pop

      if mm.work == "items*,*items*,*" || mm.work == "items*,*number*,*"
        reduce.call("items*,*")
        next
      end

      # numbers require a lookahead token, unfortunately
      if mm.work == "string*:*number*,*"
        reduce.call("member*,*")
        next
      end

      # numbers require a lookahead token, unfortunately
      if mm.work == "string*:*number*}*"
        reduce.call("member*}*")
        next
      end

      # multiple elements of an object with lookahead
      if mm.work == "member*,*member*,*" || mm.work == "members*,*member*,*"
        reduce.call("members*,*")
        next
      end

      # multiple elements of an object with lookahead
      if mm.work == "member*,*member*}*" || mm.work == "members*,*member*}*"
        reduce.call("members*}*")
        next
      end

      # --------------
      # 5 tokens
      mm.pop

      # need this clumsy rule for numbers which get resolved when
      # a ] is seen. This is the lookahead
      if mm.work == "[*items*,*items*]*" || mm.work == "[*items*,*number*]*"
        reduce.call("array*")
        next
      end

      # no rule matched: put the (at most five) popped tokens back
      mm.pushAll

      if mm.eof
        mm.unstackAll
        if %w[object* array* value* string* integer* decimal*
              number*].include?(mm.work)
          mm.pushAll
          mm.work += "(Appears to be) valid JSON syntax. Top level structure was '"
          print mm.work
          mm.work = ''
          mm.pop
          mm.work = mm.work[0..-2] # clip the token delimiter
          mm.work += "'\n"
          print mm.work
          mm.work = ''
          exit
        end
        mm.pushAll
        mm.work += "(maybe) Invalid JSON \n"
        mm.work += "The parse stack was \n"
        print mm.work
        mm.work = ''
        mm.unstackAll
        mm.work += "\n"
        print mm.work
      end
      break
    end
    # parse
  end
end

# will become:
#   mm.parse(sys.stdin) or
#   mm.parse("abcdef") or
#   open f; mm.parse(f)
run_json_checker(Machine.new)

# end of generated code