% text/plain
% -------------------------------------------
% latex generated by: booktolatex.cgi
% from source file : ../htdocs/books/awk/awk-book.txt
% on: 04 March 2024, 8:10pm
% querystring: books/awk/awk-book.txt
% document-root: /var/www/html
% script-name: /cgi-bin/booktopdf.cgi
% Server-name: bumble.sourceforge.net
% Sed-script: booktolatex.sed
% -------------------------------------------

\documentclass[a4paper,12pt]{article}
\usepackage[margin=0.4cm,noheadfoot]{geometry}
\usepackage{color} %% to use colours, use "xcolor" for more
\usepackage{multicol} %% for multiple columns
\usepackage{keystroke} %% for keyboard key images
\usepackage[toc]{multitoc} %% for multi column table of contents
\usepackage{tocloft} %% to customize the table of contents
\setcounter{tocdepth}{2} %% only display 2 levels in the contents
\setlength{\cftbeforesecskip}{0cm} %% make the toc more compact
\usepackage{listings} %% for nice code listings
%\lstset{language={},
\lstset{language=awk,
  %% define special comment delimiters '~(' and ')'
  moredelim=[s][\color{grey}\itshape\footnotesize\ttfamily]{~(}{)},
  basicstyle=\ttfamily, %% fixed pitch font
  xleftmargin=1cm, %% margin on the left outside the frames
  breaklines=true, %% break long code lines
  breakatwhitespace=false, %% break long code lines anywhere
  breakindent=10pt, %% reduce the indent from 20pt to 10
  postbreak=\mbox{{\color{blue}\small$\Rightarrow$\space}}, %% mark with arrow
  showstringspaces=false, %% dont show spaces within strings
  framerule=5pt, %% thickness of the frames
  rulecolor=\color{lightgrey},
  frame=l} %% source code settings
\usepackage{graphicx} %% to include images
\usepackage{fancybox} %% boxes with rounded corners
\usepackage{wrapfig} %% flow text around tables, images
\usepackage{tabularx} %% change width of tables
\usepackage[table]{xcolor} %% alternate row colour tables
\usepackage{booktabs} %% for heavier rules in tables
\usepackage[small,compact]{titlesec} %% sections more compact, less space
\usepackage{enumitem} %% more compact and better lists
\setlist{noitemsep} %% reduce list item spacing
\usepackage{hyperref} %% make urls into hyperlinks
\hypersetup{ %% add "pdftex," if only pdf output is required
  colorlinks=true, %% set up the colours for the hyperlinks
  linkcolor=black, %% internal document links black
  urlcolor=black, %% url links black
  filecolor=red,
  citecolor=red,
  bookmarks=true,
  pdfpagemode=UseOutlines}

% define some colours to use
\definecolor{lightgrey}{gray}{0.70}
\definecolor{grey}{gray}{0.30}

\titleformat{\section}[frame] %% titlesec: create framed section headings
  {\normalfont}
  {\filleft \footnotesize \enspace Section \thesection\enspace\enspace}
  {3pt}
  {\bfseries\itshape\filright}

\title{The Awk Text Processing Language}
\author{}
\date{28 April 2015, 11:36pm}

\setlength{\parindent}{0pt}
% \setlength{\parskip}{1ex}
% label lists with stars
\renewcommand{\labelitemi}{$\star$}

\begin{document}
\centerline{\Large \bfseries The Awk Text Processing Language}
\medskip
\begin{center}
{\huge ``}\textit{}{\huge ''} \textsc{}
\end{center}

% -----------------------------------
% the toc should be 2 columns because of the \multitoc package
\tableofcontents

Awk is a Unix tool, or programming language, designed to process and transform text files which are arranged in a series of `fields' (chunks of text separated by spaces, or any other delimiter) and records.
This document is mainly about the `mawk' variant of `awk'.
\emph{ Find out the version of mawk }
\begin{lstlisting}
mawk -W version
\end{lstlisting}

\arrayrulecolor{gray}
\begin{center}
\begin{tabular}{ |rl| }
\multicolumn{2}{c}{\textbf{ helpful man pages for awk }} \\
\hline
\texttt{ man gawk } & The GNU awk man page \\
\texttt{ man ed } & Contains regular expression examples \\
\texttt{ man mawk } & Contains good examples \\
\texttt{ man regex } & Regular expression syntax \\
\hline
\end{tabular}
\end{center}

\begin{description}[labelindent=1cm, leftmargin=2cm, style=nextline]
\item[\url{http://sparky.rice.edu/awk.html}] more awk one liners
\end{description}

\section{Gotchas}

[+] The so-called ``gotchas'' are small but potentially frustrating problems which arise and which stop a program from working or which make the awk program work in an unexpected way.
Gotcha derives from the contraction of the English phrase ``got you''.

\begin{itemize}
\item On a Unix system the awk phrase \lstinline|awk "{print $1}"| doesn't work as expected because the Unix (bash) shell expands or ``interpolates'' the ``\$1'' variable.
  It is necessary to write \lstinline|awk '{print $1}'|.
\end{itemize}

\emph{ BEGIN and variables such as FS must be uppercase }
\begin{lstlisting}
begin{FS=","}{print $2} ~(No!! this doesnt work)
\end{lstlisting}

\section{Simple Usage}

\emph{ Simple usage of awk on different operating systems. }
\begin{lstlisting}
   Unix: awk '/pattern/ {print "$1"}'    # standard Unix shells
DOS/Win: awk '/pattern/ {print "$1"}'    # compiled with DJGPP, Cygwin
         awk "/pattern/ {print \"$1\"}"  # GnuWin32, UnxUtils, Mingw
\end{lstlisting}

Users of MS-DOS or Microsoft Windows must remember that the percent sign (\%) is used to indicate environment variables, so this symbol must be doubled (\%\%) to yield a single percent sign visible to awk.
\emph{ Run an awk script }
\begin{lstlisting}
cat file1 | awk -f a.awk > file2
\end{lstlisting}
\begin{lstlisting}
awk -f a.awk file1 > file2 ~(the same)
\end{lstlisting}

\section{Strings}

\subsection{Concatenation Of Strings}

Concatenation is the fancy term for joining 2 strings (bits of text) together.

\emph{ Print the first two columns of the space/tab delimited file `data.txt' }
\begin{lstlisting}
awk '{print $1 $2}' data.txt ~($1 and $2 are printed with no space between)
\end{lstlisting}
\begin{lstlisting}
awk '{print $1$2}' data.txt ~(the same, at least on my mawk version)
\end{lstlisting}
\begin{lstlisting}
awk '{print $1 $2;}' data.txt ~(the same again)
\end{lstlisting}
\begin{lstlisting}
awk '{print $1 "" $2}' data.txt ~(the same again, but why would you?)
\end{lstlisting}

\emph{ Awk doesn't have variable `interpolation' in strings }
\begin{lstlisting}
awk '{print "$1 ..."}' data.txt ~(this prints '$1 ...' literally)
\end{lstlisting}

\emph{ Print the first column of `data.txt' with 3 dots `...' appended to it }
\begin{lstlisting}
awk '{print $1 "..."}' data.txt
\end{lstlisting}

\emph{ Append a string to itself (string concatenation) }
\begin{lstlisting}
s = s "xxx"; ~(this appends 3 x's to the end of the string 's')
\end{lstlisting}

\subsection{Matching Patterns}

\emph{ Determine if the variable ``s'' contains the letter ``r'' }
\begin{lstlisting}
s ~ /r/
\end{lstlisting}

\emph{ Print the first field of each line if it does *not* contain ``a'' or ``b'' }
\begin{lstlisting}
$1 !~ /(a|b)/ { print $1 }
\end{lstlisting}
\begin{lstlisting}
$1 !~ /[ab]/ { print $1 } ~(the same)
\end{lstlisting}

\emph{ Add an ``X'' between every letter of every line }
\begin{lstlisting}
{ gsub(//, "X") ; print }
\end{lstlisting}

\emph{ Split the string ``s'' into the array A using the pattern ``r'' }
\begin{lstlisting}
split(s, A, r)
\end{lstlisting}

\subsection{Printing Strings}

\emph{ Make a multiline string.. }
\begin{lstlisting}
print "\
\n\
\n\
"
\end{lstlisting}

\emph{ Print multiple expressions }
\begin{lstlisting}
print "variable a is " a "."
\end{lstlisting}

\emph{ It's not possible to break printing expressions across lines }
\begin{lstlisting}
print "variable a is"
  a ".";
\end{lstlisting}
(this doesn't work, at least not with mawk 1.3.3)

\subsection{Newlines}

\emph{ Display the file `days.txt' with all newline characters removed }
\begin{lstlisting}
awk '{ printf "%s", $0 }' days.txt
\end{lstlisting}
\begin{lstlisting}
cat days.txt | awk '{ printf "%s", $0 }' ~(the same)
\end{lstlisting}

\emph{ Display `days.txt' with newline characters replaced with spaces }
\begin{lstlisting}
awk '{ printf "%s ", $0 }' days.txt
\end{lstlisting}
\begin{lstlisting}
cat days.txt | awk '{ printf "%s ", $0 }'
\end{lstlisting}

\section{Arrays}

\emph{ Delete an array called record }
\begin{lstlisting}
delete record
\end{lstlisting}

\emph{ Assign a value to an associative style array }
\begin{lstlisting}
a["cars"] = 3
\end{lstlisting}

\section{Regular Expressions}

\emph{ Regular expression meta-characters: \textasciicircum{} \$ . [ ] \textbar{} ( ) * + ?
}

\emph{ Print all lines which start with an awk identifier }
\begin{lstlisting}
BEGIN { identifier = "[_a-zA-Z][_a-zA-Z0-9]*" }
$0 ~ "^" identifier
\end{lstlisting}

\subsection{Case Insensitive Matching}

\emph{ Use tolower }
\begin{lstlisting}
tolower($0) ~ /bhp/ {print $0}
\end{lstlisting}

\emph{ Set the IGNORECASE variable (gawk only) }
\begin{lstlisting}
BEGIN {IGNORECASE=1}
/bhp/ {print $0}
\end{lstlisting}

\section{Loops}

\emph{ Loop through each field of each record }
\begin{lstlisting}
awk '{ for(i = 1 ; i <= NF ; i++) print $i }' /usr/share/dict/words
\end{lstlisting}

\emph{ Print each element of an array }
\begin{lstlisting}
for ( i in aa ) print aa[i]
\end{lstlisting}

\section{Splitting Data Fields}

The field separator variable FS is interpreted as a regular expression.

\emph{ Split fields with any character followed by a colon ``:'' character }
\begin{lstlisting}
BEGIN {FS=".:"}
\end{lstlisting}

\emph{ Split quoted comma delimited fields (csv) }
\begin{lstlisting}
BEGIN {FS="\" *, *\""}
\end{lstlisting}

\arrayrulecolor{gray}
\begin{center}
\begin{tabular}{ |rl| }
\multicolumn{2}{c}{\textbf{ awk built in variables }} \\
\hline
\texttt{ ARGC } & Number of command line arguments. \\
\texttt{ ARGV } & Array of command line arguments, 0..ARGC-1. \\
\texttt{ CONVFMT } & Format for conversion of numbers to string, default ``\%.6g''. \\
\texttt{ ENVIRON } & Array indexed by environment variables. An environment string, var=value is stored as ENVIRON[var] = value. \\
\texttt{ FILENAME } & Name of the current input file. \\
\texttt{ FNR } & Current record number in FILENAME. \\
\texttt{ FS } & Splits records into fields as a regular expression. \\
\texttt{ NF } & Number of fields in the current record. \\
\texttt{ NR } & Current record number in the total input stream. \\
\texttt{ OFMT } & Format for printing numbers; initially = ``\%.6g''. \\
\texttt{ OFS } & Inserted between fields on output, initially = `` ''.
\\
\texttt{ ORS } & Terminates each record on output, initially = ``\textbackslash n''. \\
\texttt{ RLENGTH } & Length set by the last call to the built-in function, match(). \\
\texttt{ RS } & Input record separator, initially = ``\textbackslash n''. \\
\texttt{ RSTART } & Index set by the last call to match(). \\
\texttt{ SUBSEP } & Used to build multiple array subscripts, initially = ``\textbackslash 034''. \\
\hline
\end{tabular}
\end{center}

\section{Range Of Fields}

Awk has no simple way to print a range of fields such as \$[1-4].
A `for' loop must be used to loop through the range and print each one.
One may use cut instead.

\emph{ Use `cut' to print fields 1 to 5 from a comma delimited file }
\begin{lstlisting}
cut -d, -f1-5
\end{lstlisting}

\section{Awk One Line Recipes}

These one line scripts were taken from \url{http://www.pement.org/awk/awk1line.txt}

\emph{ 30 April 2008, by Eric Pement - eric [at] pement.org, version 0.27 }

\begin{description}[labelindent=1cm, leftmargin=2cm, style=nextline]
\item[\url{http://www.pement.org/awk/awk1line.txt}] Latest version of the Eric Pement one line scripts (in English)
\item[\url{http://ximix.org/translation/awk1line_zh-CN.txt}] Chinese version of these one line scripts
\end{description}

\section{File Spacing}

\emph{ Double space a file }
\begin{lstlisting}
awk '1;{print ""}'
\end{lstlisting}
\begin{lstlisting}
awk 'BEGIN{ORS="\n\n"};1' ~(another way)
\end{lstlisting}

\emph{ Double space a file which already has blank lines in it. Output file }
\emph{ should contain no more than one blank line between lines of text. }
\emph{ NOTE: On Unix systems, DOS lines which have only CRLF (\textbackslash r\textbackslash n) are }
\emph{ often treated as non-blank, and thus `NF' alone will return TRUE.
}
\begin{lstlisting}
awk 'NF{print $0 "\n"}'
\end{lstlisting}

\emph{ Triple space a file }
\begin{lstlisting}
awk '1;{print "\n"}'
\end{lstlisting}

\section{Summing Numeric Columns}

\emph{ Sum up all the numbers in column 2 and print out the total at the end }
\begin{lstlisting}
awk '{ a+=$2 } END { print "total=" a }' data.txt
\end{lstlisting}

\emph{ Sum a column between 2 lines in a file (with help from sed) }
\begin{lstlisting}
sed -n '/#1/,/#2/p' data.txt | awk -F, '{a+=$2; print $2, a}' | less
\end{lstlisting}

\section{Line Numbering}

\emph{ Precede each line by its line number FOR THAT FILE (left alignment). }
\emph{ Using a tab (\textbackslash t) instead of space will preserve margins. }
\begin{lstlisting}
awk '{print FNR "\t" $0}' files*
\end{lstlisting}

\emph{ Precede each line by its line number FOR ALL FILES TOGETHER, with tab. }
\begin{lstlisting}
awk '{print NR "\t" $0}' files*
\end{lstlisting}

\emph{ Number each line of a file (number on left, right-aligned) }
\emph{ Double the percent signs if typing from the DOS command prompt. }
\begin{lstlisting}
awk '{printf("%5d : %s\n", NR,$0)}'
\end{lstlisting}

\emph{ Number each line of file, but only print numbers if line is not blank }
\emph{ Remember caveats about Unix treatment of \textbackslash r (mentioned above) }
\begin{lstlisting}
awk 'NF{$0=++a " :" $0};1'
\end{lstlisting}
\begin{lstlisting}
awk '{print (NF?
++a " :" :"") $0}'
\end{lstlisting}

\emph{ Count lines (emulates ``wc -l'') }
\begin{lstlisting}
awk 'END{print NR}'
\end{lstlisting}

\emph{ Print the sums of the fields of every line }
\begin{lstlisting}
awk '{s=0; for (i=1; i<=NF; i++) s=s+$i; print s}'
\end{lstlisting}

\emph{ Add all fields in all lines and print the sum }
\begin{lstlisting}
awk '{for (i=1; i<=NF; i++) s=s+$i}; END{print s}'
\end{lstlisting}

\emph{ Print every line after replacing each field with its absolute value }
\begin{lstlisting}
awk '{for (i=1; i<=NF; i++) if ($i < 0) $i = -$i; print }'
\end{lstlisting}
\begin{lstlisting}
awk '{for (i=1; i<=NF; i++) $i = ($i < 0) ? -$i : $i; print }'
\end{lstlisting}

\emph{ Print the total number of fields (``words'') in all lines }
\begin{lstlisting}
awk '{ total = total + NF }; END {print total}' file
\end{lstlisting}

\emph{ Print the total number of lines that contain ``Beth'' }
\begin{lstlisting}
awk '/Beth/{n++}; END {print n+0}' file
\end{lstlisting}

\emph{ Print the largest first field and the line that contains it }
\emph{ Intended for finding the longest string in field \#1 }
\begin{lstlisting}
awk '$1 > max {max=$1; maxline=$0}; END{ print max, maxline}'
\end{lstlisting}

\section{The Number Of Fields}

\emph{ Print the number of fields in each line, followed by the line }
\begin{lstlisting}
awk '{ print NF ":" $0 } '
\end{lstlisting}

\emph{ Print the last field of each line }
\begin{lstlisting}
awk '{ print $NF }'
\end{lstlisting}

\emph{ Print the last field of the last line }
\begin{lstlisting}
awk '{ field = $NF }; END{ print field }'
\end{lstlisting}

\emph{ Print every line with more than 4 fields }
\begin{lstlisting}
awk 'NF > 4'
\end{lstlisting}

\emph{ Print every line where the value of the last field is $>$ 4 }
\begin{lstlisting}
awk '$NF > 4'
\end{lstlisting}

\section{String Creation}

\emph{ Create a string of a specific length (e.g., generate 513 spaces) }
\begin{lstlisting}
awk 'BEGIN{while (a++<513) s=s " "; print s}'
\end{lstlisting}
\emph{ Insert a string of specific length at a certain character position }
\emph{ Example: insert 49 spaces after column \#6 of each input line. }
\begin{lstlisting}
gawk --re-interval 'BEGIN{while(a++<49)s=s " "};{sub(/^.{6}/,"&" s)};1'
\end{lstlisting}

\section{Array Creation}

\emph{ These next 2 entries are not one-line scripts, but the technique }
\emph{ is so handy that it merits inclusion here. }

\emph{ Create an array named ``month'', indexed by numbers, so that month[1] }
\emph{ is `Jan', month[2] is `Feb', month[3] is `Mar' and so on. }
\begin{lstlisting}
split("Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec", month, " ")
\end{lstlisting}

\emph{ Create an array named ``mdigit'', indexed by strings, so that }
\emph{ mdigit[``Jan''] is 1, mdigit[``Feb''] is 2, etc. Requires ``month'' array }
\begin{lstlisting}
for (i=1; i<=12; i++) mdigit[month[i]] = i
\end{lstlisting}

\section{Text Conversion And Substitution}

\emph{ IN UNIX ENVIRONMENT: convert DOS newlines (CR/LF) to Unix format }
\begin{lstlisting}
awk '{sub(/\r$/,"")};1' # assumes EACH line ends with Ctrl-M
\end{lstlisting}

\emph{ IN UNIX ENVIRONMENT: convert Unix newlines (LF) to DOS format }
\begin{lstlisting}
awk '{sub(/$/,"\r")};1'
\end{lstlisting}

\emph{ IN DOS ENVIRONMENT: convert Unix newlines (LF) to DOS format }
\begin{lstlisting}
awk 1
\end{lstlisting}

\emph{ IN DOS ENVIRONMENT: convert DOS newlines (CR/LF) to Unix format }
\emph{ Cannot be done with DOS versions of awk, other than gawk }
\begin{lstlisting}
gawk -v BINMODE="w" '1' infile >outfile
\end{lstlisting}

\emph{ Use ``tr'' instead. }
\begin{lstlisting}
tr -d \r