#-- attempts to tokenise or to lexically analyse
#-- a grammar file

#p;
#l;
#-- Under DOS it may be necessary to remove the
#-- final \r character
# s/.$//;
#-- Lines that begin with a hash in the grammar file are comments:
#-- they are echoed with a "# Input Line:" prefix and yield no tokens.
#-- "d" discards the pattern space and starts the next cycle at once,
#-- so it has to be the last command of the group; nothing placed
#-- after it inside the braces can ever execute.
/^#/{
 s/^/# Input Line:/;
 p;
 d;
}

#-- Echo the raw input line ahead of its tokens.
#-- Keep a copy of the untouched line in the hold space
h
#-- Tag the working copy so the echo is recognisable in the output
s/^/# Input Line:/
#-- Write the tagged copy in sed's unambiguous "l" form
l
#-- Swap the buffers: the pattern space is the untouched line again
#-- (the tagged copy is left behind in the hold space)
x

#-- Append a space so that an identifier sitting at the very end of
#-- the line is still followed by its delimiting space
s/$/ /
#-- The bar is used further down as the end-of-token marker, so a bar
#-- that follows a double quote (with no other quote or bar between
#-- them) is replaced by the placeholder [:bar:], one bar per pass,
#-- until no such bar is left.
#-- NOTE: the match can also start at a closing quote, so a bar that
#-- comes after a quoted string on the same line is converted as well.
: bars
s/\("[^|"]*\)|/\1[:bar:]/
t bars
#-- Turn every "..." into Quoted-String:...| (marker, text, bar)
s/"\([^"]*\)"/Quoted-String:\1|/g

#-- Characters that are meaningful to the tokeniser must not be seen
#-- as tokens when they occur inside a quoted string. Every command
#-- below rewrites one such character (or sequence) found between a
#-- "Quoted-String:" marker and its closing bar into a placeholder of
#-- the form [:name:]. Each pass handles at most one occurrence per
#-- kind, so the group is repeated until a pass changes nothing.
: escape
#-- semi-colon
s/\(Quoted-String:[^|;]*\);/\1[:semi-colon:]/
#-- space
s/\(Quoted-String:[^ |]*\) /\1[:space:]/
#-- left and right brace
s/\(Quoted-String:[^{|]*\){/\1[:lbrace:]/
s/\(Quoted-String:[^}|]*\)}/\1[:rbrace:]/
#-- the assignment sequence
s/\(Quoted-String:[^|]*\):=/\1[:colon-equals:]/
#-- back-slash
s/\(Quoted-String:[^|\\]*\)\\/\1[:back-slash:]/
#-- arrow
s/\(Quoted-String:[^|]*\)-->/\1[:arrow:]/
t escape

#-- A quoted string whose text starts with "=" reads
#-- "Quoted-String:=..." at this point; the colon of the marker and
#-- the "=" would be mistaken for the := token below. Hide that "="
#-- behind a placeholder for the duration of this block.
s/Quoted-String:=/Quoted-String:[:equals:]/g;

#-- Since an identifier is end-delimited by a space
#-- we place a space in front of other tokens to avoid
#-- elision

s/:=/ :=/g;
s/{/ {/g;
s/}/ }/g;
s/\(\\[1-9]\)/ \1/g;
s/;/ ;/g;
s/-->/ -->/g;

#-- Replace each symbol by its token name followed by a bar, the
#-- end-of-token marker. An attribute reference keeps its text
#-- (back-slash and digit) after the token name.
s/:=/Assignment-Equals|/g;
s/\(\\[1-9]\)/Attribute-Reference:\1|/g;
s/{/Left-Brace|/g;
s/}/Right-Brace|/g;
s/-->/Arrow|/g;
s/;/Semi-Colon|/g;

#-- Give quoted strings their leading "=" back
s/Quoted-String:\[:equals:\]/Quoted-String:=/g;

#-- A run of letters and hyphens that is followed by a space is an
#-- identifier; the space is replaced by the end-of-token bar
s/\([-a-zA-Z][-a-zA-Z]*\) /Identifier:\1|/g
#-- Every space still present is only a separator: drop them all
s/ //g
#-- Convert special characters in quoted strings
#-- back to their original form. The [:bar:] placeholder is
#-- left as it is: a real bar would be turned into a new line
#-- by the final substitution
s/\[:space:\]/ /g
s/\[:lbrace:\]/{/g
s/\[:rbrace:\]/}/g
s/\[:colon-equals:\]/:=/g
s/\[:back-slash:\]/\\/g
s/\[:arrow:\]/-->/g
s/\[:semi-colon:\]/;/g
#-- Convert bars to new line characters, and
#-- thus put each token on a new line. This technique
#-- works with GNU sed version 4.0.8 and version 3.0.2
s/|/\
/g
#-- Write out the token list
p