# Description:
#   A script to reformat a plain text document which contains a diary of
#   some sort into some kind of HTML. By a 'diary' I mean a series of
#   entries for particular dates. Specifically, the text file should
#   contain dates in a format like '* 3 march 2003, Saturday' or something
#   similar, and each date should be followed by some sort of descriptive
#   text relating to that date. The script recognises some special
#   structures within the plain text document. For example:
#
#   The diary entry dates should be on a line by themselves and should
#   begin with a * character like this:
#     * 3 January 1992, Friday
#
#   The '=' character, when it is the first non-whitespace character on a
#   line, indicates that all the following text on the line should be
#   formatted as a 'heading'. Also, url style strings should be recognised
#   and given a hyperlink token in front of them, such as '[*]'. I prefer
#   this to underlining the entire url, because I find that the underlining
#   tends to interfere with the readability of the text. Some people would
#   say, "use style-sheets", but to them I would reply that the 'heraldic'
#   visual pattern of the underlined hyperlink is imprinted in many internet
#   users' brains, and to change that 'iconography' can lead to unnecessary
#   confusion.
#
# Examples:
#   ./diary2html.sh mjb-work.txt notran > mjb-work.html
#     This command line, executed in some kind of a bash shell, will
#     transform a plain text file which has 'diary' style entries into an
#     HTML file (that is, it will create a new HTML file and leave the
#     original text file unchanged) and will not display the automatic
#     translation links to Google. Also an HTML table of contents (with
#     one entry for each date) will be inserted in the HTML document.
#
#   ./diary2html.sh mjb-work.txt notran notoc > mjb-work.html
#     The text file will be transformed into HTML but no table of contents
#     will be inserted nor any translation links.
#
#   ./diary2html.sh mjb-work.txt blah notoc > mjb-work.html
#     If translation links are desired but no table of contents, use a
#     command line similar to the one above. The string 'blah' could be
#     anything as long as it is not 'notran'. This slightly dodgy 'feature'
#     is owing to the fact that I am not using any 'getopt' style option
#     parsing.
#
# Parameters:
#   textFileName [required]
#     The name of the text file which is to be transformed from text into
#     html.
#   notran [optional]
#     If the second parameter is the string 'notran' then the javascript
#     links to the Google automatic language translation engine will NOT
#     be inserted into the HTML page. This is useful, for example, when the
#     HTML page is going to be located within a 'password-protected'
#     directory, because the Google translation engine will not be able to
#     access the page, and therefore the translation links will not work.
#   notoc [optional]
#     If the third parameter is the string "notoc", then no HTML table of
#     contents will be generated.
#
# Notes:
#   This script contains an improved url detection regular expression,
#   better than that in, say, txtdoc2html.sh. But the url pattern matcher
#   still has a problem when somebody puts a full stop after a url: it
#   thinks that the dot is part of the url.
#
# See Also:
#   txtdoc2html.sh, linkdoc2html.sh, plaintext2html.sh
#
# Author:
#   m.j.bishop

#-- Without a file name there is nothing to convert: print a one line
#-- synopsis followed by every comment line of this script (which doubles
#-- as the help text) and stop with a failure status.
if [ -z "$1" ]; then
  echo "usage: $0 textFileName [notran] [notoc]"
  sed -n "/^[ ]*#/p" < "$0"
  exit 1
fi

#-- The section below creates the table of contents for the diary.
#-- This step is designed to only number lines which match a pattern.
#-- In theory 'nl -bpPATTERN' should also do this, but it insisted on
#-- 'double-spacing' the output.

#-- diary_number_entries FILE
#--   Print FILE with tabs expanded to spaces. Every diary date line (first
#--   non-blank character is '*', followed by some text) gets a running
#--   entry number glued to its front, so '* 3 May' becomes '1* 3 May'.
#--   All other lines are printed unchanged.
diary_number_entries() {
  local awk_cmd=awk
  #-- mawk is what this script was written against; plain awk is only a
  #-- fallback for machines where mawk is not installed.
  if command -v mawk > /dev/null 2>&1; then
    awk_cmd=mawk
  fi
  expand < "$1" |
    "$awk_cmd" '/^[ ]*\*[ ]*[^ ]+/{ii++; print ii $0}!/^[ ]*\*[ ]*([^ ]+)/'
}

#-- diary_build_toc NUMBERED_FILE
#--   Print the HTML table of contents: one link per numbered date line,
#--   pointing at the anchor 'itemN'. Day names are stripped from the
#--   link text. Uses GNU sed extensions (\| and the 'i' flag).
diary_build_toc() {
  echo "<center><a name = \"toc\"></a>"
  sed \
    -e '/^[ ]*\([0-9]\{1,\}\)[ ]*\*\(.*\)/!d' \
    -e 's/\(monday\|tuesday\|wednesday\|thursday\|friday\|saturday\|sunday\)//gi' \
    -e 's/^[ ]*\([0-9]\{1,\}\)[ ]*\*\(.*\)/<a href="#item\1" class = "t">\2<\/a> | /' \
    < "$1"
  echo "</center>"
}

#-- diary_page_heading NUMBERED_FILE
#--   Print every '=' heading line of the document as a centred <h2>.
#--   '<' and '>' in the heading text are turned into entities first so
#--   that diary text cannot inject markup into the page.
diary_page_heading() {
  sed \
    -e '/^[ ]*=[ ]*.*/!d' \
    -e 's/</\&lt;/g' -e 's/>/\&gt;/g' \
    -e 's/^[ ]*=[ ]*\(.*\)/<center><h2>\1<\/h2><\/center>/' \
    < "$1"
}

#-- Working files: the numbered copy of the diary lives next to the input
#-- file, the table of contents in the current directory. The trap makes
#-- sure both disappear even when the script stops early.
diary_numbered="$1.temp"
diary_script_name=$(basename -- "$0")
trap 'rm -f -- diary-toc.temp "$diary_numbered"' EXIT

diary_number_entries "$1" > "$diary_numbered"
diary_build_toc "$diary_numbered" > diary-toc.temp

#-- NOTE(review): the META/LINK tags are emitted before <head>, exactly as
#-- this script has always done; browsers tolerate it.
cat <<EOF
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN">
<html>
 <META HTTP-EQUIV="Content-Type" CONTENT="text/html; CHARSET=iso-8859-1">
 <META HTTP-EQUIV="Keywords"
       CONTENT="">
 <META HTTP-EQUIV="Description"
       CONTENT="">
 <LINK REV="made" HREF="mailto:webmaster@ella-associates.org">

<!-- HTML generated by the "${diary_script_name}" script -->
<!-- From the File: "$1" -->
<!-- On the Date: $(date) -->
<!-- see http://www.ella-associates.org/utils/${diary_script_name} -->
<link rel = "stylesheet" type = "text/css"
      href = "/stylesheets/swish-style.css">
<head>
<script language = "javascript">
 <!--
   function redirectToGoogleTranslation(sSourceLanguage, sTargetLanguage)
   {
     var sTranslationUrl = 'http://translate.google.com/translate?u=';
     sTranslationUrl += escape(document.location.href);
     sTranslationUrl += '&langpair=' + sSourceLanguage + '|' + sTargetLanguage;
     sTranslationUrl += '&hl=' + sSourceLanguage;
     // document.testForm.test.value=sTranslationUrl;
     window.location = sTranslationUrl;
   }
   //-- redirectToGoogleTranslation() -->
</script>
</head>
<body>
EOF

#-- The Google automatic translation links below are sometimes disabled
#-- because they will not work from within a password protected directory,
#-- since Google does not have permission to view that directory.
#-- The language names use character entities so that they display
#-- correctly whatever encoding this script file itself is saved in.
if [ "$2" != "notran" ]; then
  cat <<'EOF'
<center>
See this page in (approximate):
<a href="javascript:redirectToGoogleTranslation('en', 'es');">Espa&ntilde;ol</a>|
<a href="javascript:redirectToGoogleTranslation('en', 'fr');">Fran&ccedil;ais</a>|
<a href="javascript:redirectToGoogleTranslation('en', 'it');">Italiano</a>|
<a href="javascript:redirectToGoogleTranslation('en', 'de');">Deutsch</a>|
<a href="javascript:redirectToGoogleTranslation('en', 'pt');">Portugu&ecirc;s</a>
</center>
EOF
fi

#-- This was the old regular expression used to find dates, but now I am
#-- using the '*' format:
#--
#- sed "s/^[ 0-9,]*\(jan\|feb\|mar\|apr\|may\|jun\|jul\
#      |aug\|sep\|oct\|nov\|dec\)[a-z]*[ 0-9,]*.*/<strong>&<\/strong>/gi" | \

#-- Put the page heading before the table of contents.
diary_page_heading "$diary_numbered"

echo "<table align=\"center\" width = \"90%\"><tr><td>"

#-- Insert the table of contents.
if [ "$3" != "notoc" ]; then
  cat diary-toc.temp
fi

#-- Transform the text to HTML, insert anchors.
#-- Also delete the heading line which has already been inserted in the HTML.
#-- But, the line will also delete lines beginning in == or === etc, which
#-- may not be desirable.
#-- diary_body_to_html NUMBERED_FILE
#--   Print the HTML body for the numbered diary text in NUMBERED_FILE.
#--   Per line, in this order:
#--     1. '=' heading lines are dropped (they are shown at the page top).
#--     2. '<' and '>' become entities.
#--     3. A line starting with '-->>' opens a <pre class = "sed"> block and
#--        a line starting with '--<<' closes it (the markers are matched
#--        in their already-escaped form).
#--     4. Numbered date lines 'N* text' become an anchor 'itemN' plus a
#--        link back to the table of contents.
#--     5. http://, https:// and bare www. urls get a '[*]' hyperlink token
#--        in front of them. The character before a 'www.' url is kept.
#--     6. Outside <pre> blocks, pairs of spaces become &nbsp; pairs and
#--        every line is prefixed with <br>.
#--   Uses GNU sed extensions (\| and the 'i' flag).
#--   Known limitation: a full stop directly after a url is taken to be
#--   part of the url.
diary_body_to_html() {
  #-- Characters accepted inside a url (bracket expression contents).
  local url_chars="-a-z%0-9~\\/\"'.@"
  #-- '#' is the delimiter of the url rules so that the '/' characters in
  #-- the patterns need no escaping.
  expand < "$1" |
    sed \
      -e '/^[ ]*=[ ]*\(.*\)/d' \
      -e 's/</\&lt;/g' -e 's/>/\&gt;/g' \
      -e 's/^[ ]*--&gt;&gt;/<pre class = "sed">/' \
      -e 's/^[ ]*--&lt;&lt;/<\/pre>/' \
      -e 's/^[ ]*\([0-9]\{1,\}\)[ ]*\*\(.*\)/<br><u><strong><a name="item\1">\2<\/a><\/strong><\/u> <a href="#toc">[TOC]<\/a>/' \
      -e "s#\(https\{0,1\}://[${url_chars}]\{3,\}\)#<a href='\1' class = \"t\">[*]</a><tt> \1</tt>#gi" \
      -e "s#\(^\|[^a-zA-Z/]\)\(www\.[${url_chars}]\{2,\}\)#\1<a href='http://\2'>[*]</a><tt> \2</tt>#gi" \
      -e '/<pre class = "sed">/,/<\/pre>/!s/[ ]\{2\}/\&nbsp;\&nbsp;/g' \
      -e '/<pre class = "sed">/,/<\/pre>/!s/^/<br>/'
}

diary_body_to_html "$1.temp"

echo "<br>"
echo "</td></tr></table>"

#-- Repeat the translation links at the bottom of the page unless they
#-- were switched off with 'notran'. The language names use character
#-- entities so that they display correctly whatever encoding this script
#-- file itself is saved in.
if [ "$2" != "notran" ]; then
  cat <<'EOF'
<center>
See this page in (approximate):
<a href="javascript:redirectToGoogleTranslation('en', 'es');">Espa&ntilde;ol</a>|
<a href="javascript:redirectToGoogleTranslation('en', 'fr');">Fran&ccedil;ais</a>|
<a href="javascript:redirectToGoogleTranslation('en', 'it');">Italiano</a>|
<a href="javascript:redirectToGoogleTranslation('en', 'de');">Deutsch</a>|
<a href="javascript:redirectToGoogleTranslation('en', 'pt');">Portugu&ecirc;s</a>
</center>
EOF
fi

echo "</body>"
echo "</html>"

#-- Remove the working files.
rm -f -- diary-toc.temp "$1.temp"