#* This script does some simple word/line parsing. It could be
   expanded to recognise multiword patterns (a sketch of one such
   rule is at the end of the script).

   NOTES
     Patterns to recognise eventually:
       "name.ext" [code|script|file|image|document]
       name.ext [code|script|file|image|document]
       [code|script|file|image|document] "name.ext"
       [code|script|file|image|document] name.ext

   HISTORY
     19 june 2022: began, roughly working
*#

read;

# ignore \r
[\r] {
  clear;
  (eof) { .reparse }
  .restart
}

# don't tokenize non-leading space. One space will be printed
# between each word.
[ \t\f] {
  while [ \t\f]; clear;
  (eof) { .reparse }
  .restart
}

[\n] {
  # make the character count relative to the line.
  nochars;
  # save the leading space in the nl* token
  while [:space:]; put; clear;
  add "nl*"; push;
  .reparse
}

# everything else is a word
!"" {
  whilenot [:space:]; put; clear;
  add "word*"; push;
  .reparse
}

parse>

# to visualise the parse-token reductions
add "line "; lines; add " char "; chars; add ": ";
print; clear;
unstack; add "\n"; print; clip; stack;

#-------
# 1 token

pop;

# the last line of the file, with no final newline char
(eof) {
  "word*","text*" {
    clear; add "line*"; push; .reparse
  }
}

#-------
# 2 tokens

pop;

# I want to recognise 2-word structures, so the text*word* reduction
# needs to be kept separate from the word*word* rule.
# Is there any need for a file* token, a link* token, etc.?
"word*word*","text*word*" {
  clear; get; add " "; ++; get; --; put;
  clear; add "text*"; push;
  .reparse
}

"word*nl*","text*nl*" {
  clear; get; ++; get; --; put;
  clear; add "line*"; push;
  .reparse
}

"line*line*","lineset*line*" {
  clear; get; ++; get; --; put;
  clear; add "lineset*"; push;
  .reparse
}

push; push;

(eof) {
  pop;
  "word*","text*","line*","lineset*" {
    clear; get; add "\n"; print; clear;
    quit;
  }
}
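
#* SKETCH (untested, commented out): one possible 2-token reduction
   for the first two NOTES patterns (a name followed by a keyword),
   assuming a new file* token and the same workspace/tape conventions
   as the rules above. To take effect it would have to be placed
   before the word*word* -> text* rule, since both match on the same
   word*word* workspace.

   "word*word*" {
     # inspect the attribute of the second word
     ++; clear; get;
     "code","script","file","image","document" {
       # the second word is a keyword: join the two attributes
       # and reduce the pair to a single file* token
       clear; --; get; add " "; ++; get; --; put;
       clear; add "file*"; push;
       .reparse
     }
     # not a keyword: restore the workspace and tape pointer so the
     # ordinary word*word* -> text* reduction can still apply
     clear; --; add "word*word*";
   }
*#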