#-- Tokenise (lexically analyse) one line of a grammar file.
#--
#-- Each input line is first echoed as a "# Input Line:" comment and is
#-- then rewritten as a stream of tokens, one token per output line:
#--   QUOTED-STRING(...)   ASSIGNMENT-EQUALS   ATTRIBUTE-REFERENCE(\N)
#--   LEFT-BRACE   RIGHT-BRACE   ARROW   SEMI-COLON   IDENTIFIER(...)
#--
#-- NOTE(review): all output is produced by explicit p/l commands, so
#-- this script is presumably meant to be run with "sed -n"; confirm
#-- against the caller.

#-- Debugging aids, normally disabled.
#p;
#l;

#-- Under DOS it may be necessary to remove the final \r character.
# s/.$//;

#-- Lines that begin with a hash in the grammar file are comments:
#-- echo them with the "# Input Line:" prefix and produce no tokens.
/^#/{
  s/^/# Input Line:/
  p
  #-- The original script had an "i\ Ignored as comment" command after
  #-- the "d" below. Because "d" ends the cycle at once, that command
  #-- could never run; it has been removed and the output is unchanged.
  d
}

#-- Echo the raw input line, keeping the untouched copy for tokenising:
#-- save it in the hold space, print the prefixed version with "l"
#-- (sed's unambiguous, escaped listing format), then swap it back.
h
s/^/# Input Line:/
l
x

#-- Identifiers are recognised by a trailing space, so make sure the
#-- last word on the line has one too.
s/$/ /

#-- Replace round brackets that follow a double quote with fake html
#-- entities, one bracket per pass, until no more can be found. This
#-- keeps ")" out of the text that will be wrapped in QUOTED-STRING(...).
:convertBrackets
s/\("[^)"]*\))/\1\&rbrak;/
s/\("[^("]*\)(/\1\&lbrak;/
t convertBrackets

#-- Wrap each double-quoted string as a QUOTED-STRING token.
s/"\([^"]*\)"/QUOTED-STRING(\1)|/g

#-- Within a quoted string, special symbol characters need to be
#-- converted to placeholders so that the token rules further down do
#-- not act on them. One occurrence is replaced per pass; loop until
#-- nothing changes.
:convert
s/\(QUOTED-STRING([^);]*\);/\1[:semi-colon:]/
s/\(QUOTED-STRING([^ )]*\) /\1[:space:]/
s/\(QUOTED-STRING([^{)]*\){/\1[:lbrace:]/
s/\(QUOTED-STRING([^})]*\)}/\1[:rbrace:]/
s/\(QUOTED-STRING([^)]*\):=/\1[:colon-equals:]/
s/\(QUOTED-STRING([^)\\]*\)\\/\1[:back-slash:]/
s/\(QUOTED-STRING([^)]*\)-->/\1[:arrow:]/
t convert

#-- Since an identifier is end-delimited by a space, place a space in
#-- front of every other token so that an identifier written directly
#-- against a symbol (for example "name:=") is still recognised.
s/:=/ :=/g
s/{/ {/g
s/}/ }/g
s/\(\\[1-9]\)/ \1/g
s/;/ ;/g
s/-->/ -->/g

#-- Replace each symbol with its token name. Every token is terminated
#-- by a bar, which is turned into a newline at the end of the script.
#-- Identifiers are handled last, after the symbols have been replaced.
s/:=/ASSIGNMENT-EQUALS|/g
s/\(\\[1-9]\)/ATTRIBUTE-REFERENCE(\1)|/g
s/{/LEFT-BRACE|/g
s/}/RIGHT-BRACE|/g
s/-->/ARROW|/g
s/;/SEMI-COLON|/g
s/\([-a-zA-Z][-a-zA-Z]*\) /IDENTIFIER(\1)|/g

#-- Remove all remaining spaces. Spaces inside quoted strings are
#-- currently held as [:space:] placeholders and so are not affected.
s/ //g

#-- Convert special characters in quoted strings back to their
#-- original form. The brackets are left in their fake html entity form.
s/\[:space:\]/ /g
s/\[:lbrace:\]/{/g
s/\[:rbrace:\]/}/g
s/\[:colon-equals:\]/:=/g
s/\[:back-slash:\]/\\/g
s/\[:arrow:\]/-->/g
s/\[:semi-colon:\]/;/g

#-- Convert bars to newline characters, and thus put each token on a
#-- new line. This technique works with GNU sed version 4.0.8 and
#-- version 3.0.2.
s/|/\
/g

#-- Emit the token lines for this input line.
p