#  Description:
#    A script to reformat a plain text file document which contains a
#    set of urls and descriptions of those urls into some kind of html. The resulting
#    HTML also contains a form which allows the user to make comments about
#    the source document. This script can be used in conjunction with the
#    'add-comment' script, which processes the form values and updates the
#    text file upon which the HTML file is based.
#
#    The script recognises some special structures within the plain text
#    document. For example:
#
#    The '=' character, when the first non-whitespace character on a
#    line indicates that all the following text on the line should be
#    formatted as a 'heading' or 'page title'.
#
#    The '*' character, indicates that the following white-space
#    delimited text should be formatted as an Html hyperlink, with the
#    text content of the hyperlink being the url itself.
#
#    This script also accepts the format:
#      [Beginning Of Line][spaces]*[spaces]The Document Title|Url-Or-Path/to/Html/File|Url-Or-Path/To/Text/File|
# 
#    This script also accepts the format (all on one line):
#      [Beginning Of Line][spaces]*[spaces]The Document Title|Url-Or-Path/to/Html/File|Url-Or-Path/To/Text/File|
#          |Url-Or-Path/To/Pdf/File|
#
#    This script also accepts the format:
#      [Beginning Of Line][spaces]*[spaces]The Document Title|Url-Or-Path/to/Base/FileName||||
#    An example of this format would be
#       * A Interesting Analysis|/alexis-info/docs/the-ramble||||
#    This example assumes that there are files
#       /alexis-info/docs/the-ramble.html
#       /alexis-info/docs/the-ramble.txt
#       /alexis-info/docs/the-ramble.pdf
#
#    This format is useful when all the different 'versions' (that is, document formats)
#    have the same base name and directory location, but have the appropriate file name
#    extension for their documents type. The script will automatically generate links
#    to each of these document formats in the order: html, text, pdf
#
#
#    The script also accepts the format (all on one line)
#      [Beginning Of Line][spaces]*
#        [spaces]The Document Title|Url-Or-Path/to/Base/FileName|extension|extension|extension|
#    Where 'extension' is any file name extension
#    An example of this format would be
#       * A Interesting Analysis|/alexis-info/docs/the-ramble|txt|html|doc|
#    This example assumes that there are files
#       /alexis-info/docs/the-ramble.html
#       /alexis-info/docs/the-ramble.txt
#       /alexis-info/docs/the-ramble.doc
#
#    For the sake of the 'readability' of the text file, this format is preferred to the previous
#    one. Both of these formats can also be used with two file name extensions instead of three.
#
#    This script also accepts the format:
#      [Beginning Of Line][spaces]*[spaces]The Document Title|Url-Or-Path/to/Base/FileName|||
#    This produces the same results as the format above except that no link to an Adobe 'pdf'
#    file is created.
#    
#    The script also accepts the format (all on one line)
#      [Beginning Of Line][spaces]*
#        [spaces]The Document Title|Url-Or-Path/to/Base/FileName|extension|extension|
#    Where 'extension' is any file name extension
#    An example of this format would be
#       * A Interesting Analysis|/alexis-info/docs/the-ramble|txt|doc|
#    This example assumes that there are files
#       /alexis-info/docs/the-ramble.txt
#       /alexis-info/docs/the-ramble.doc
#
#    This script also accepts the format (All on one line):
#      [Beginning Of Line][spaces]*[spaces]
#        The Document/Link Title|Url-Or-Path/to/File|
# 
#    The script also accepts the format:
#      [Beginning Of Line][spaces]http://blah    
#    
#    The script will also format blocks of text between the strings -->> and --<<
#    (where they are the first string on the line) as an HTML <pre> block
# 
#    This filter script also ignores lines starting with a '#' character. That is,
#    those lines will not be rendered into Html.
# 
#    Please see the file /var/www/alexis-info/docs/resources.txt for an
#    example of a file which utilizes some of the formats described above.
#
# Example:
#    ./linkdoc2html.sh aRave.txt > aRave.html
#     
# Parameters:
#   textFileName
#     The name of the text file which is to be transformed from text into html
#   [notran]
#     If the second parameter is the string 'notran' then the javascript links
#     to the google automatic language translation engine will NOT be inserted
#     into the HTML page. This is useful, for example, when the HTML page is 
#     going to be located within a 'password-protected' directory, because
#     the Google translation engine will not be able to access the page, and
#     therefore the translation links will not work.
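#   [notoc]
#     If the third parameter is the string 'notoc' then the table of contents
#     will NOT be inserted into the HTML page.
#   [forum-processor-url]
#     The url of the cgi program which processes the comment form. If this
#     fourth parameter is omitted, a default 'add-comment' url is used.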
#    
# Notes:
#   The idea of this script is to allow the text file to be as free of 'mark-up'
#   as is possible. This can allow the simple maintenance of the text file, although
#   the precision and utility of a system such as XML is not available. 
#   It should be possible to modify this script to produce XML instead of HTML
#
#   This script has been successfully run on the debian linux bash shell.
#   It is possible that it would also run on a Microsoft Windows bash shell,
#   such as the Cygwin Bash shell.
#   
#   There is a GPL perl program called text2html which performs a similar task
#   to this script.
#
#   The HTML produced by this script is NOT friendly to Lynx, the text browser
#   because it uses an HTML table to create a 'left margin' for the document
#   A style sheet should be used instead.
#
#   The code which used 'mawk' or 'awk' or 'gawk' in order to number certain lines
#   which matched a regular expression has been removed and replaced with code
#   which uses the 'nl' program. For some reason 'nl' places empty lines in between
#   every line in the file when it uses a regular expression to number lines. These
#   'empty' lines actually contain a series of spaces and nothing else.
#   
#   For this reason, some extra 'sed' lines are necessary in order to get rid of these
#   unwanted blank lines.
#
#  See Also:
#    txtdoc2html.sh, diary2html.sh, plaintext2html.sh
#    plaintext2pdf.sh, plaintext2html-forum.sh, linkdoc2html.sh
#    linkdoc2html-index.sh
#  Author:
#   m.j.bishop

 #-- Without a source file name there is nothing to convert: print the usage
 #-- line, then every comment line of this script (the header comment doubles
 #-- as the manual), and exit with a failure status.
 if [ -z "$1" ]
 then
   echo "usage: $0 textFileName [notran] [notoc] [forum-processor-url]"
   #-- "$0" is quoted so that a script path containing spaces is still read,
   #-- and sed reads the file itself instead of going through 'cat'.
   sed -n "/^[ ]*#/p" "$0"
   exit 1
 fi


 #-- Heading detection settings, used by the table of contents and by the
 #-- document body further down.
 #--
 #-- Only lines which look like 'section headings' are meant to be numbered.
 #-- An earlier 'awk' based version could not share the pattern below because
 #-- awk did not seem to accept the \{n,\} interval notation.
 #--
 #-- The pipelines further down also take the apostrophe out of contractions
 #-- such as "can't" and "won't", so that those quote characters do not get
 #-- in the way of formatting which is applied to quoted text.

 #-- The language of the source document; selects the wording of the page.
 sLanguage='ENGLISH'

 #-- What counts as a 'section heading': a line made of capitals, digits,
 #-- spaces, dots and slashes which holds a run of at least three capitals.
 sHeadingPattern='[ A-Z0-9.\/\\]*[A-Z]\{3,\}[ A-Z0-9.\/\\]*'

#-- I have disabled the code below because in a cgi environment, this script doesn't
#-- seem to have permission to create a file.
#-- This is a real gotcha. If the file $1.temp already exists and is not writable
#-- by 'other' then the 'add-comment' script falls over because it cant successfully
#-- call this script. This problem only arises in a CGI environment where the 
#-- Web server does not have root permissions. If the $1.temp file cannot be
#-- created then this script wont work. One solution is to manually give 
#-- write permission to 'other'.
#-- This script (and the 'add-comment' script) will succeed the FIRST time in
#-- a cgi environment if the $1.temp file does not exist at all. This is 
#-- because if the file does not exist then the Web Server has sufficient
#-- permissions to create it. HOWEVER, the second time and afterwards this
#-- script and the 'add-comment' script will FAIL because when the web
#-- Server creates the $1.temp file the first time it creates it without 
#-- write permission for 'other'. That is to say, the Web Server essentially
#-- is able to create a file which it is not allowed to subsequently 
#-- modify (nor re-create). Actually this whole second part may not be true
#-- The web server creates the file as 'mbishop' and probably cant write to 
#-- it.
#--
#-- There are, no doubt, various solutions to this problem, including giving the
#-- web server sufficient permissions to recreate the file. etc. However the
#-- simplest solution is just to not use $1.temp. It is/was only used in three 
#-- places. Removing it may or may not slow the script down. I dont know

# cat $1 | expand | \
#   mawk '/^[ A-Z0-9.\/\\]*[A-Z]+[ A-Z0-9.\/\\]*$/{ii++; print ii $0}!/^[ A-Z0-9.\/\\]*[A-Z]+[ A-Z0-9.\/\\]*$/' | \
#   sed "s/\([a-zA-Z]\{2,\}\)n[\"']t/\1nt/g" > $1.temp
   
 #-- Emit the document prologue: the doctype, the <head> section (meta data,
 #-- provenance comments, style sheet link and the javascript helper used by
 #-- the translation links) and the opening <body> tag.
 #--
 #-- The <head> tag is opened before the META/LINK elements so that they are
 #-- actually inside the head section. The script name is worked out once,
 #-- with "$0" quoted so that a path containing spaces does not break it.
 sScriptName=$(basename "$0")

 #-- The here-document is unquoted on purpose: $sScriptName, $1 and $(date)
 #-- must be expanded. Its text starts in column 0 because it is copied
 #-- verbatim to the output.
 cat <<END_OF_PROLOGUE
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN">
<html>
<head>
 <META HTTP-EQUIV="Content-Type" CONTENT="text/html; CHARSET=iso-8859-1">
 <META HTTP-EQUIV="Keywords"
          CONTENT="">
 <META HTTP-EQUIV="Description"
          CONTENT="">
        <LINK REV="made" HREF="mailto:webmaster@ella-associates.org">
<!-- html generated by the "$sScriptName" script         -->
<!-- From the File: "$1"  -->
<!-- On the Date: $(date)  -->
<!-- see http://www.ella-associates.org/utils/$sScriptName -->
<link   rel = "stylesheet"  type = "text/css"
       href = "/stylesheets/swish-style.css">
<script language = "javascript">

 <!--
   function redirectToGoogleTranslation(sSourceLanguage, sTargetLanguage)
   {
     var sTranslationUrl = 'http://translate.google.com/translate?u=';

     sTranslationUrl += escape(document.location.href);
     sTranslationUrl += '&langpair=' + sSourceLanguage + '|' + sTargetLanguage;
     sTranslationUrl += '&hl=' + sSourceLanguage;
     // document.testForm.test.value=sTranslationUrl;
     window.location = sTranslationUrl;
   } //-- redirectToGoogleTranslation()
 -->
</script>
</head>
<body>
END_OF_PROLOGUE
 #-- Translation bar at the top of the page.
 #-- The Google automatic translation links are left out when the second
 #-- parameter is 'notran', because they will not work from within a password
 #-- protected directory: Google does not have permission to view it.
 #--
 #-- Accented letters are written as HTML entities so that the output does
 #-- not depend on the character encoding in which this script is stored
 #-- (the page declares iso-8859-1).
 if [ "$2" != "notran" ]
 then
   echo "<center>"
   if [ "$sLanguage" = "ENGLISH" ]
   then
     echo "See this page in (approximate):"
     echo "<a  href=\"javascript:redirectToGoogleTranslation('en', 'es');\">Espa&ntilde;ol</a>|"
     echo "<a  href=\"javascript:redirectToGoogleTranslation('en', 'fr');\">Fran&ccedil;ais</a>|"
     echo "<a  href=\"javascript:redirectToGoogleTranslation('en', 'it');\">Italiano</a>|"
     echo "<a  href=\"javascript:redirectToGoogleTranslation('en', 'de');\">Deutsch</a>|"
     echo "<a  href=\"javascript:redirectToGoogleTranslation('en', 'pt');\">Portugu&ecirc;s</a>"
   else
     echo "Vea esta p&aacute;gina en (aproximado):"
     #-- This link translates from Spanish to English, so it is labelled
     #-- with the target language.
     echo "<a  href=\"javascript:redirectToGoogleTranslation('es', 'en');\">English</a>"
   fi
   #-- Close the element opened above; it used to be left open.
   echo "</center>"
 fi
#---- The file below contains a colorized table of the links 
#---- cat /var/www/utils/translator-bar.html


#-- Put the page heading before the table of contents.
#-- A title line is one whose first non-blank character is a single '='.
#-- Every other line is dropped, contraction apostrophes are removed,
#-- '<' and '>' are encoded as HTML entities and the title text is wrapped
#-- in a centred <h2>.
#-- "$1" is quoted so that file names containing spaces work, and the steps
#-- run in a single sed process instead of five.
 expand "$1" | \
   sed -e "/^[ ]*=[ ]*[^=].*/!d" \
       -e "s/\([a-zA-Z]\{2,\}\)n[\"']t/\1nt/g" \
       -e "s/</\&lt;/g" -e "s/>/\&gt;/g" \
       -e "s/^[ ]*=[ ]*\([^=].*\)/<center><h2>\1<\/h2><\/center>/gi"

#- Open the table which gives the document its left and right margins and
#- print the link which jumps down to the comment form.
#- The table is not 'lynx friendly'; a style sheet should be used instead.
cat <<'END_OF_TABLE_OPEN'
<table align="center" width = "90%"><tr><td>

  <strong><center>
    <a href="#commentForm" class = "t">[make a comment about (or add to) this document]</a>
  </center></strong>
END_OF_TABLE_OPEN


#-- Insert the table of contents, unless the third parameter is 'notoc'.
#--
#-- How the pipeline works:
#--   1. 'nl' numbers only the lines which look like section headings.
#--   2. The whitespace-only lines left in the nl output are deleted and the
#--      number is glued to its heading ("     1 TITLE" becomes "1TITLE").
#--   3. Contraction apostrophes are removed, everything which is not a
#--      numbered heading is dropped, and each heading becomes a link to the
#--      'itemN' anchor which the document body defines.
#--
#-- The 'nl' pattern is a POSIX basic regular expression, in which '+' is an
#-- ordinary character. "At least three capitals" is therefore spelled
#-- [A-Z][A-Z][A-Z][A-Z]* ; with the previous [A-Z][A-Z][A-Z]+ no heading was
#-- numbered and the table of contents came out empty.
#-- NOTE(review): the nl pattern accepts ':' in a heading but $sHeadingPattern
#-- does not, so a heading containing ':' is numbered but not listed here.
if [ "$3" != "notoc" ]
then
  #-- No temporary file is needed for this, unlike the disabled code further up.
  echo "<a name = \"toc\"></a><table align=\"center\" width = \"70%\"><tr><td>"
  #-- "$1" is quoted so that file names containing spaces work.
  expand "$1" | \
    sed "s/^[ ]*$//g" | \
    nl -s" " -bp'^[ A-Z0-9.\/\\:]*[A-Z][A-Z][A-Z][A-Z]*[ A-Z0-9.\/\\:]*$' | \
    sed "/^[ ]\+$/d" | \
    sed "s/^[ ]*\([1-9][0-9]*\) /\1/g" | \
    sed "s/\([a-zA-Z]\{2,\}\)n[\"']t/\1nt/g" | \
    sed "/^[0-9]\{1,\}$sHeadingPattern$/!d" | \
    sed "s/^\([0-9]\{1,\}\)\($sHeadingPattern\)$/<br><a href=\"#item\1\" class = \"t\">\1. \2<\/a>/g"
  echo "</td></tr></table>"
fi

 #-- Render the body of the document.
 #-- The text file is pushed through one long pipeline; every stage is
 #-- described by the comment above it. The order of the stages matters: the
 #-- '* title|path|...' formats are tried from the most specific to the least
 #-- specific, and the HTML encoding of '<' and '>' has to happen before any
 #-- markup is generated.
 #--
 #-- "$1" is quoted so that file names containing spaces work.
 expand "$1" | \
   #-- Turn whitespace-only lines into truly empty lines
   sed "s/^[ ]*$//g" | \
   #-- Number all lines that are 'section headings'. The pattern is a POSIX
   #-- basic regular expression, in which '+' is an ordinary character, so
   #-- "three or more capitals" is spelled [A-Z][A-Z][A-Z][A-Z]*
   nl -s" " -bp'^[ A-Z0-9.\/\\:]*[A-Z][A-Z][A-Z][A-Z]*[ A-Z0-9.\/\\:]*$' | \
   #-- Get rid of the 'blank' lines which nl puts into the output
   sed  "/^[ ]\+$/d" | \
   #-- Reformat the numbered section headings
   sed "s/^[ ]*\([1-9][0-9]*\) /\1/g" | \
   #-- Get rid of contraction apostrophes (like in don't, can't, isn't etc). This is not really required
   sed "s/\([a-zA-Z]\{2,\}\)n[\"']t/\1nt/g" | \
   #-- Delete all comments lines (beginning in a hash symbol)
   sed "/^[ ]*#/d" | \
   #-- Delete the page title because its already been output
   sed "/^[ ]*=[ ]*\([^=].*\)$/d" | \
   #-- Encode special characters '<>' as HTML entities
   sed -e "s/</\&lt;/g" -e "s/>/\&gt;/g"  | \
   #-- Do a trick to get the '-->>' and '--<<' blocks of text to work
   sed -e "s/^[ ]*\-\-\&gt;\&gt;/<pre>/g" -e "s/^[ ]*\-\-\&lt;\&lt;/<\/pre>/g" | \
   #-- Make each 'section heading' into an HTML anchor to work with the 'Table of Contents'
   sed "s/^\([0-9]\{1,\}\)\($sHeadingPattern\)$/<strong><tt><a name=\"item\1\" class = \"section-heading\">\1. \2<\/a><\/tt><\/strong> <a href=\"#toc\">[TOC]<\/a>/g" | \
   #-- Hyperlink URL style pieces of text (http:// as well as https://)
   sed "s/^[ ]*\(https\{0,1\}:\/\/[^ ]\{3,\}\)/<a href=\"\1\">\1<\/a>/gi" | \
   #-- Hyperlink email addresses with a 'mailto:' link
   sed "/<pre/,/<\/pre>/!s/\([^ ]\{2,\}@[^ \"']\{2,\}\)/<a href=\"mailto:\1\">\1<\/a>/g" | \
   #-- Example of Format Below: * My Title|/my/path/to/file||||
   sed "s/^[ ]*\*[ ]*\([^|]*\)|\([^|]*\)||||/<b>\1<\/b> (<em>Formats:<\/em> <a href='\2\.html'>html<\/a> | <a href='\2\.txt'>text<\/a> | <a href='\2\.pdf'>pdf<\/a>)/gi" | \
   #-- Example of Format Below: * My Title|/my/path/to/file|html|txt|pdf|
   sed "s/^[ ]*\*[ ]*\([^|]*\)|\([^|]*\)|\([a-zA-Z]\{1,8\}\)|\([a-zA-Z]\{1,8\}\)|\([a-zA-Z]\{1,8\}\)|/<b>\1<\/b> (<em>Formats:<\/em> <a href='\2\.\3'>\3<\/a> | <a href='\2\.\4'>\4<\/a> | <a href='\2\.\5'>\5<\/a>)/gi" | \
   #-- Example of Format Below: * My Title|/my/path/to/file|||
   sed "s/^[ ]*\*[ ]*\([^|]*\)|\([^|]*\)|||/<b>\1<\/b> (<em>Formats:<\/em> <a href='\2\.html'>html<\/a> | <a href='\2\.txt'>text<\/a>)/gi" | \
   #-- Example of Format Below: * My Title|/my/path/to/file|pdf|html|
   sed "s/^[ ]*\*[ ]*\([^|]*\)|\([^|]*\)|\([a-zA-Z]\{1,8\}\)|\([a-zA-Z]\{1,8\}\)|/<b>\1<\/b> (<em>Formats:<\/em> <a href='\2\.\3'>\3<\/a> | <a href='\2\.\4'>\4<\/a>)/gi" | \
   #-- Example of Format Below: * My Title|/full/path/to/htmlfile|/full/path/to/text/file|/full/path/to/pdffile|
   sed "s/^[ ]*\*[ ]*\([^|]*\)|\([^|]*\)|\([^|]*\)|\([^|]*\)|/<b>\1<\/b> (<em>Formats:<\/em> <a href='\2'>html<\/a> | <a href='\3'>text<\/a> | <a href='\4'>pdf<\/a>)/gi" | \
   #-- Example of Format Below: * My Title|/full/path/to/htmlfile|/full/path/to/text/file|
   sed "s/^[ ]*\*[ ]*\([^|]*\)|\([^|]*\)|\([^|]*\)|/<b>\1<\/b> (<em>Formats:<\/em> <a href='\2'>html<\/a> | <a href='\3'>text<\/a>)/gi" | \
   #-- Trick to make 'txt' links into 'text' links for readability
   sed "s/>txt<\/a>/>text<\/a>/gi" | \
   #-- Example of Format Below: * My Title|/full/path/to/any-old-file|
   sed "s/^[ ]*\*[ ]*\([^|]*\)|\([^|]*\)|/<b>\1<\/b>(<a href='\2'>\2<\/a>)/gi" | \
   #-- Example of Format Below: * /full/path/to/any-old-file
   sed "s/^[ ]*\*[ ]*\([^ ]\{2,\}\)/<a href='\1'>\1<\/a>/gi" | \
   #-- Format comments added by web-users
   sed "s/^\([ ]*added[ ]\{0,4\}by:\)\([^,]\{1,\}\)\,[ ]*on[ ]*\(.*\)/<u><em>\1<\/em><tt> \2<\/tt><em> on \3<\/em><\/u>/gi" | \
   #-- Turn spaces into non-breaking-spaces unless they are between 'pre' tags
   sed "/<pre/,/<\/pre>/!s/[ ]\{2\}/\&nbsp;\&nbsp;/g" | \
   #-- Turn line breaks into <br> tags unless they are between 'pre' tags
   sed "/<pre/,/<\/pre>/!s/^/<br>/g"
 #-- Close the margin table which wraps the document
 echo "<br>"
 echo "</td></tr></table>"


 #-- Define the cgi program which will handle the adding of
 #-- comments to a particular text file: the fourth parameter when it is
 #-- given and not empty, otherwise the built-in default below.
 #--
 #-- The default uses an IP address rather than the domain name
 #-- (http://www.ella-associates.org/cgi-bin/add-comment), which means that
 #-- the form would still work even if the DNS configuration failed. I am
 #-- not sure if this is really a good idea or not.
 sProcessorUrl=${4:-http://63.105.73.195/cgi-bin/add-comment}
 #-- Work out the full path name of the source text file.
 #-- This is necessary because the target processor (the cgi program) is not
 #-- in the same directory as the source document, so a relative name would
 #-- mean nothing to it.
 #--
 #-- A name which starts with '/' is already absolute and is used as it is;
 #-- any other name is taken to be relative to the current directory. The
 #-- decision looks at "$1" directly, which avoids passing an unquoted file
 #-- name to 'dirname' and 'echo' (that broke on spaces and on names which
 #-- start with '-').
 case "$1" in
   /*) sFullPathName="$1" ;;
   *)  sFullPathName="$(pwd)/$1" ;;
 esac
 # echo $sFullPathName
 # echo $sFullPathName
 
 #-- Open the comment form.
 #-- The hidden fields carry the full path of the source text file and the
 #-- document type to the cgi program named by $sProcessorUrl. The
 #-- 'commentForm' anchor is the target of the link near the top of the page.
 #-- One argument is printed per output line; the first one is empty because
 #-- the block starts with a blank line.
 printf '%s\n' \
   "" \
   "    <form action = \"$sProcessorUrl\" " \
   "          method = \"post\">" \
   "    <input  name = \"filename\" " \
   "            type = \"hidden\"" \
   "           value = \"$sFullPathName\">" \
   "    <input  name = \"documenttype\" " \
   "            type = \"hidden\"" \
   "           value = \"linkdoc\">" \
   "      " \
   "    <hr><a name = \"commentForm\"></a>" \
   "    <center>" \
   "    <small><strong>" \
   "    <a href=\"#toc\">BACK TO THE TABLE OF CONTENTS</a>" \
   "    </strong></small>" \
   "    <br>" \
   "    <em>"
 
 # Invitation to comment, shown above the form.
 # Choosing the wording by language is an attempt to slightly
 # 'internationalize' this script: English when $sLanguage is ENGLISH,
 # Spanish for anything else.
 case "$sLanguage" in
   ENGLISH)
     echo "
      If you wish, you may add a comment, suggestion or other contribution which will appear at the
      end of this document.
      <br>Any input you make is greatly appreciated."
     ;;
   *)
     echo "
      Si usted desea, usted puede agregar un comentario, sugerencia o otra contribuci&oacute;n que
      aparecer&aacute; al final de este documento
      <br>Cualquier comentario que usted haga se aprecia grandemente.
      "
     ;;
 esac
 
 #-- Close the emphasised invitation text and leave some vertical space
 #-- before the form fields.
 #-- A Spanish sentence used to be printed here whatever the language was;
 #-- it duplicated the language specific label which is printed right after
 #-- this, so it has been removed.
 echo "</em>

    <br><br>"

 #-- Label for the comment box, in the language of the document
 #-- (English when $sLanguage is ENGLISH, Spanish for anything else).
 case "$sLanguage" in
   ENGLISH)
     echo "<strong>Your Comment (or other contribution to this document)</strong><br>"
     ;;
   *)
     echo "<strong>Tu comentario (o otra contribuci&oacute;n a este documento)</strong><br>"
     ;;
 esac

 #-- The form fields: the comment box, the (optional) name box and the
 #-- submit button, followed by the closing </center> and </form> tags.
 #--
 #-- The language test for the name label has to sit BETWEEN two echo
 #-- commands. It used to be inside the quoted text of a single echo, where
 #-- its quote characters ended the string early; the shell then read the
 #-- HTML tags as redirections and stopped with a syntax error, so the rest
 #-- of the page was never written.
 echo "
    <textarea name = \"comment\"  cols = \"70\"
              rows = \"5\"></textarea>
    <br><br>"

 if [ "$sLanguage" = "ENGLISH" ]
 then
    echo "<strong>Your Name <small>[OPTIONAL BUT NICE]</small></strong><br>"
 else
    echo "<strong>Tu nombre <small>[OPCIONAL PERO AGRADABLE]</small></strong><br>"
 fi

 echo "
    <textarea name = \"username\"  cols = \"70\"
              rows = \"1\"></textarea>
    <br><br>
    <input   type = \"submit\"  value = \"S U B M I T   Y O U R   C O M M E N T\">
    </center>
    </form>"
      
 #-- Translation bar at the bottom of the page; it mirrors the one at the top.
 #-- It is left out when the second parameter is 'notran' (the links cannot
 #-- work for pages inside a password protected directory).
 #--
 #-- An English document offers translations from English. Any other
 #-- document is treated as Spanish, as at the top of the page, and offers a
 #-- translation into English; it used to get the 'en' to X links as well,
 #-- which name the wrong source language.
 #-- Accented letters are written as HTML entities so that the output does
 #-- not depend on the character encoding in which this script is stored.
 if [ "$2" != "notran" ]
 then
   echo "<center>"
   if [ "$sLanguage" = "ENGLISH" ]
   then
     echo "See this page in (approximate):"
     echo "<a  href=\"javascript:redirectToGoogleTranslation('en', 'es');\">Espa&ntilde;ol</a>|"
     echo "<a  href=\"javascript:redirectToGoogleTranslation('en', 'fr');\">Fran&ccedil;ais</a>|"
     echo "<a  href=\"javascript:redirectToGoogleTranslation('en', 'it');\">Italiano</a>|"
     echo "<a  href=\"javascript:redirectToGoogleTranslation('en', 'de');\">Deutsch</a>|"
     echo "<a  href=\"javascript:redirectToGoogleTranslation('en', 'pt');\">Portugu&ecirc;s</a>"
   else
     echo "Vea esta p&aacute;gina en (aproximado):"
     echo "<a  href=\"javascript:redirectToGoogleTranslation('es', 'en');\">English</a>"
   fi
   echo "</center>"
 fi
 #-- Close the document: one closing tag per output line.
 printf '%s\n' "</body>" "</html>"