#-- Tokenises (lexically analyses) a grammar file, one input line
#-- per cycle.
#--
#-- For every input line the script prints:
#--   * an echo of the line prefixed with "# Input Line:", and
#--   * for non-comment lines, one token per output line, using the
#--     labels Identifier:<name>, Quoted-String:<text>,
#--     Attribute-Reference:\<digit>, Assignment-Equals, Left-Brace,
#--     Right-Brace, Arrow and Semi-Colon.
#--
#-- NOTE(review): every print is explicit (p / l), so this is
#-- presumably meant to be run with "sed -n"; without -n each
#-- pattern space would be printed a second time. Confirm against
#-- the caller.

#-- Debugging aids, disabled:
#p;
#l;

#-- Under DOS it may be necessary to remove the final \r character.
#-- (The disabled command below removes ANY final character; with
#-- GNU sed "s/\r$//" is the safer form.)
# s/.$//;

#-- Lines that begin with a hash in the grammar file are comments:
#-- echo them with the "# Input Line:" prefix and start the next
#-- cycle. An "i\ Ignored as comment" command used to follow the
#-- "d" here; "d" ends the cycle, so it could never run and has been
#-- dropped. The output is unchanged by its removal.
/^#/{
  s/^/# Input Line:/;
  p;
  d;
}

#-- Echo the line being tokenised. The untouched line is parked in
#-- the hold space while the prefixed copy is printed with "l"
#-- (unambiguous form, "$" marks the end of the line), then swapped
#-- back in.
h;
s/^/# Input Line:/;
l;
x;

#-- An identifier is recognised by the space that follows it, so
#-- make sure the last token on the line has one too.
s/$/ /;

#-- "|" is used below as the internal token separator, so bars that
#-- follow a double quote are hidden first, one per pass. They are
#-- never restored and stay as [:bar:] in the output.
#-- ("t" also fires once for the substitutions made above; that
#-- costs one extra, harmless pass.)
: convertBars
s/\("[^|"]*\)|/\1[:bar:]/;
t convertBars

#-- Label each quoted string and terminate it with the separator.
s/"\([^"]*\)"/Quoted-String:\1|/g;

#-- Within a quoted string, special symbol characters need to be
#-- hidden behind placeholders so that the token rules further down
#-- do not fire on them. Each pass converts at most one occurrence
#-- per rule; loop until nothing changes.
: convert
#-- A string that starts with "=" would otherwise join the colon of
#-- the "Quoted-String:" label to form ":=" and be split as an
#-- assignment token.
s/\(Quoted-String:\)=/\1[:equals:]/g;
s/\(Quoted-String:[^|;]*\);/\1[:semi-colon:]/;
s/\(Quoted-String:[^ |]*\) /\1[:space:]/;
s/\(Quoted-String:[^{|]*\){/\1[:lbrace:]/;
s/\(Quoted-String:[^}|]*\)}/\1[:rbrace:]/;
s/\(Quoted-String:[^|]*\):=/\1[:colon-equals:]/;
s/\(Quoted-String:[^|\\]*\)\\/\1[:back-slash:]/;
s/\(Quoted-String:[^|]*\)-->/\1[:arrow:]/;
t convert

#-- Since an identifier is end-delimited by a space we place a
#-- space in front of the other tokens to avoid elision.
s/:=/ :=/g;
s/{/ {/g;
s/}/ }/g;
s/\(\\[1-9]\)/ \1/g;
s/;/ ;/g;
s/-->/ -->/g;

#-- Replace each symbol with its token label followed by the
#-- separator. Identifiers go last: a run of letters and hyphens
#-- counts only when a space follows it, and the labels written
#-- above are followed by "|" or ":" instead.
s/:=/Assignment-Equals|/g;
s/\(\\[1-9]\)/Attribute-Reference:\1|/g;
s/{/Left-Brace|/g;
s/}/Right-Brace|/g;
s/-->/Arrow|/g;
s/;/Semi-Colon|/g;
s/\([-a-zA-Z][-a-zA-Z]*\) /Identifier:\1|/g;

#-- Every space still present lies between tokens (spaces inside
#-- quoted strings are hidden as [:space:]), so drop them all.
s/ //g;

#-- Convert special characters in quoted strings back to their
#-- original form. Bars are deliberately left in their fake html
#-- entity form, [:bar:].
s/\[:space:\]/ /g;
s/\[:lbrace:\]/{/g;
s/\[:rbrace:\]/}/g;
s/\[:colon-equals:\]/:=/g;
s/\[:back-slash:\]/\\/g;
s/\[:arrow:\]/-->/g;
s/\[:semi-colon:\]/;/g;
s/\[:equals:\]/=/g;

#-- Convert bars to new line characters, and thus put each token
#-- on a new line. This technique (backslash followed by a literal
#-- newline in the replacement) works with GNU sed version 4.0.8
#-- and version 3.0.2.
s/|/\
/g;

p;