#-- attempts to tokenise or to lexically analyse
#-- a grammar file

#p;
#l;
#-- Under DOS it may be necessary to remove the
#-- final \r character
# s/.$//;
#-- Lines that begin with a hash in the grammar file are comments.
#-- They are echoed with a "# Input Line:" prefix and produce no
#-- tokens. The "d" command ends the cycle immediately, so it has
#-- to be the last command in the block: an "i\" (Ignored as
#-- comment) that used to follow it could never run and has been
#-- removed, which leaves the output exactly as it was.
/^#/{
 s/^/# Input Line:/;
 p;
 d;
}

#-- Echo the raw input line to the output, marked with a
#-- "# Input Line:" prefix, then carry on with the untouched line.
#-- Save a copy of the raw line in the hold space.
h;
s/^/# Input Line:/;
#-- Print the prefixed line in the "l" (unambiguous) form.
#-- NOTE(review): unlike "p", "l" escapes backslashes, appends a
#-- "$" and may fold long lines - confirm that whatever reads
#-- this echo expects that.
l;
#-- Swap the spaces: the raw line returns to the pattern space and
#-- the prefixed copy is left behind in the hold space.
x;

#-- Append a space so that an identifier at the very end of the
#-- line is still followed by the space that delimits it.
s/$/ /;
#-- Replace round brackets that come after a double quote (with
#-- no other quote or bracket of the same kind in between) by the
#-- fake entities &rbrak; and &lbrak;. Each command changes one
#-- bracket per pass and "t" repeats the pass until nothing is
#-- substituted. The first "t" always branches once because the
#-- substitutions above have already succeeded; that extra pass
#-- is harmless.
#-- NOTE(review): the patterns do not tell an opening quote from
#-- a closing one, so a bracket that follows a closing quote is
#-- converted as well - confirm that this is acceptable.
: convertBrackets
s/\("[^)"]*\))/\1\&rbrak;/;
s/\("[^("]*\)(/\1\&lbrak;/;
t convertBrackets
#-- Wrap every double-quoted string as QUOTED-STRING(...) followed
#-- by the bar that terminates a token.
s/"\([^"]*\)"/QUOTED-STRING(\1)|/g;

#-- Within a quoted string, characters that are also grammar
#-- symbols are swapped for [:name:] placeholders so that the
#-- token rules below cannot match them. Each command replaces
#-- one occurrence and "t convert" repeats the pass until none
#-- is left. The vertical bar is protected too: it is used as the
#-- token terminator further down and a bar inside a quoted
#-- string would otherwise split that string over two lines.
: convert
s/\(QUOTED-STRING([^);]*\);/\1[:semi-colon:]/;
s/\(QUOTED-STRING([^ )]*\) /\1[:space:]/;
s/\(QUOTED-STRING([^{)]*\){/\1[:lbrace:]/;
s/\(QUOTED-STRING([^})]*\)}/\1[:rbrace:]/;
s/\(QUOTED-STRING([^)]*\):=/\1[:colon-equals:]/;
s/\(QUOTED-STRING([^)\\]*\)\\/\1[:back-slash:]/;
s/\(QUOTED-STRING([^)]*\)-->/\1[:arrow:]/;
s/\(QUOTED-STRING([^)|]*\)|/\1[:bar:]/;
t convert

#-- Since an identifier is end-delimited by a space
#-- we place a space in front of other tokens to avoid
#-- elision

s/:=/ :=/g;
s/{/ {/g;
s/}/ }/g;
s/\(\\[1-9]\)/ \1/g;
s/;/ ;/g;
s/-->/ -->/g;

#-- Replace each symbol by its token name. Every token ends
#-- with a bar, which is turned into a newline at the end.
s/:=/ASSIGNMENT-EQUALS|/g;
s/\(\\[1-9]\)/ATTRIBUTE-REFERENCE(\1)|/g;
s/{/LEFT-BRACE|/g;
s/}/RIGHT-BRACE|/g;
s/-->/ARROW|/g;
s/;/SEMI-COLON|/g;

#-- A run of letters and hyphens followed by a space is an
#-- identifier.
s/\([-a-zA-Z][-a-zA-Z]*\) /IDENTIFIER(\1)|/g;
#-- The spaces that are left are separators only (spaces inside
#-- quoted strings are placeholders by now); remove them.
s/ //g;
#-- Convert special characters in quoted strings
#-- back to their original form. We will leave the
#-- brackets in their fake html entity form
s/\[:space:\]/ /g;
s/\[:lbrace:\]/{/g;
s/\[:rbrace:\]/}/g;
s/\[:colon-equals:\]/:=/g;
s/\[:back-slash:\]/\\/g;
s/\[:arrow:\]/-->/g;
s/\[:semi-colon:\]/;/g;
#-- Convert bars to new line characters, and
#-- thus put each token on a new line. This technique
#-- works with GNU sed version 4.0.8 and version 3.0.2
s/|/\
/g;
#-- Only now restore the bars that belong to quoted strings;
#-- doing it earlier would turn them into newlines as well.
s/\[:bar:\]/|/g;
p;