% -*- mode: Noweb; noweb-code-mode: sml-mode -*-
% $Id: unparse.nw,v 1.17 2006-02-02 19:27:46 dias Exp $
\let\nocover1
\documentclass[twoside]{article}
\usepackage{alltt,array,tabularx,epic,latexsym}
\usepackage{spe2e}
\def\sysdetails{}
\usepackage{noweb}
% l2h substitution div div
% l2h substitution mod mod
\usepackage{pstricks,pst-node,pst-tree,pst-char}
\overfullrule=10pt
\noweboptions{shortxref}
\def\nwdocspar{\vskip0pt plus1.5in\penalty-200\vskip0pt plus -1.5in}
%\def\nwdocspar{\par\penalty-1000}
\def\nwendcode{\endtrivlist\endgroup} % ditches filbreak
%\def\nwendcode{\endtrivlist\endgroup\vskip\nwextraglue} % page tuning
\let\ostartdef=\nwstartdeflinemarkup
\let\oenddef=\nwenddeflinemarkup
\newtoks\tyboxes
\newif\ifboxespending
\boxespendingfalse
\newcommand\dpsmash[1]{% kill descenders to make trees look better
\relax\setbox0\hbox{#1}\dp0=0pt\box0\relax}
\makeatletter
\newcommand\nwnospecials{\@makeother\'\@makeother\_\obeyspaces}
\makeatother
\newcommand\addbox{%
\begingroup\nwnospecials\doaddbox
}
\newcommand\doaddbox[1]{%
\global\boxespendingtrue
\global\tyboxes=\expandafter{\the\tyboxes#1\\}%
\endgroup
\ignorespaces
}
\setlength\fboxsep{1pt}
\newdimen\deeperboxspace
\newcommand\deeperbox[1]{\deeperboxspace=#1\relax\advance\deeperboxspace by 5.5pt}
\deeperboxspace=0pt
\def\nwstartdeflinemarkup{\ostartdef
\ifboxespending
{\small
\penalty500\hskip -1em\mbox{}\nobreak\hfill\nobreak
\nwnospecials\nobreak
\smash{\fbox{\begin{tabular}[t]{@{\,}l@{\,}}\the\tyboxes\end{tabular}}}}%
\nobreak\vrule width0pt height10.5pt depth\deeperboxspace
\fi
\global\boxespendingfalse
\global\tyboxes={}%
}
\def\nwenddeflinemarkup{\nobreak\hskip \nwdefspace\nobreak}
\def\nwlbrace{\char123}
\def\nwrbrace{\char125}
\newcommand{\fig}[1]{Figure~\ref{fig:#1}}
\newcommand\sfilbreak[1]{\vskip 0pt plus#1\penalty-200\vskip 0pt plus -#1}
\setcounter{secnumdepth}{1}
\let\orghbox=\hbox
\newcommand\nohbox{\let\hbox=\orghbox\relax}
\providecommand\citeN[1]{\nocite{#1}Reference~\csname b@#1\endcsname}
\providecommand\citeNpair[2]{\nocite{#1}\nocite{#2}%
References \csname b@#1\endcsname~and~\csname b@#2\endcsname}
\newcommand\refno[1]{\nocite{#1}\csname b@#1\endcsname}
\title{Unparsing Expressions \penalty-200 With Prefix and Postfix Operators}
%\author{Norman Ramsey}
\affiliation{Department of Computer Science, University of Virginia,
Charlottesville, VA 22903}
\author{Norman Ramsey \\ \affilsize\itshape\centering
Department of Computer Science, University of Virginia,
Charlottesville, VA 22903}
\titlehead{\uppercase{Unparsing with prefix and postfix operators}}
\authorhead{Norman Ramsey}
\received{29 August 1997}
\revised{27 March 1998}
\spe{0}{0}{MONTH}{98}{1}{30}
\def\remark#1{\marginpar{\raggedright\hbadness=10000
\def\baselinestretch{0.8}\scriptsize
\it #1\par}}
\makeatletter
\newcommand\missing[2][Stuff]
{\@latex@warning{#1 is missing}%
\marginpar{\small\itshape\raggedright #1 is missing here.}%
\textsl{#2}}
\makeatother
% all this goo is for writing grammars. first symbols
\newcommand{\lit}{\begingroup\catcode`\_=12\relax\dolit}
\newcommand{\dolit}[1]{\texttt{\textup{#1}}\endgroup}
\newcommand{\term}[1]{\textsl{#1}}
\newcommand{\nt}[1]{\textit{#1}}
\newcommand\litbar{\texttt{|}}
% now the metasymbols
\newcommand{\produces}{\mbox{$\Rightarrow$}}
\newcommand{\gdelim}{\big}
\newcommand{\vbar}{\mbox{$\gdelim|$}}
\newcommand{\makevbar}{}
{\catcode`\|=\active \gdef\makevbar{\begingroup\catcode`\|=\active \let|=\vbar}}
\newcommand{\sequence}{\makevbar\dosequence}
\newcommand{\optional}{\makevbar\dooptional}
\newcommand{\alternate}{\makevbar\doalternate}
\newcommand{\dosequence}[1]{\mbox{$\gdelim\{$}#1\mbox{$\gdelim\}$}\endgroup}
\newcommand{\dooptional}[1]{\mbox{$\gdelim[$}#1\mbox{$\gdelim]$}\endgroup}
\newcommand{\doalternate}[1]{\mbox{$\gdelim($}#1\mbox{$\gdelim)$}\endgroup}
% \specindex is used to index stuff in the specification -- I'll have
% to read up on makeindex to get the font to come out right.
\newcommand{\indexlit}{\begingroup\catcode`\_=12\relax\doindexlit}
\newcommand{\doindexlit}[1]{\index{#1@\protect\lit{#1}}\endgroup}
\newcommand{\indexedlit}{\begingroup\catcode`\_=12\relax\doindexedlit}
\newcommand{\doindexedlit}[1]{\index{#1@\protect\lit{#1}}\dolit{#1}}
\newcommand{\indexnt}[1]{\index{#1@\protect\nt{#1}}}
\newcommand{\indexednt}[1]{\nt{#1}\indexnt{#1}}
%
\newcommand{\libindex}[1]{\index{#1@\protect\lit{#1}}}
%
\newcommand\commentstart{\relax} % was \#~
\newcommand{\comment}[1]{\commentstart #1}
\newcommand{\lbr}{}
\newcommand{\rbr}{}
\chardef\lbr`\{
\chardef\rbr`\}
% Now for productions in the grammar: In the production environment, |
% is active.
\newcommand\productionvbar{\mbox{$|$}}
\newcommand\productionor{\\&\productionvbar&}
\newcommand\makeproductionvbar{}
{\catcode`\|=\active
\gdef\makeproductionvbar{\catcode`\|=\active \let|=\productionor}}
\newenvironment{production}[1]
{\list{}{\leftmargin=\parindent}
\makeproductionvbar
% \renewcommand\arraystretch{1.10}
\item[]\begin{tabular}{l
@{\hspace{0.5\tabcolsep}}c@{\hspace{0.5\tabcolsep}}
l
>{\commentstart}l}%
\indexednt{#1}&\produces&}
{\\\end{tabular}\endlist}
% insert \productionglue between two consecutive production
% environments
\newcommand\productionglue{\vskip -1.1\baselineskip\relax}
\renewcommand\productionglue{\kern -1.1\baselineskip\relax}
\renewcommand\prec{\mathop{\mathrm{precedence}}}
\newcommand\assoc{\mathop{\mathrm{associativity}}}
\newcommand\implies{\mathrel{\Rightarrow}}
\newcommand\impliedby{\mathrel{\Leftarrow}}
\newcommand\dotlt{\mathrel{<\!\!\cdot}}
\newcommand\dotgt{\mathrel{\cdot\!\!>}}
\newtheorem{defn}{Definition}
\newtheorem{lemma}{Lemma}
\newtheorem{prop}{Proposition}
\newenvironment{proof}{\begin{trivlist}\item[]\textbf{Proof} \ignorespaces}
{\hskip2em plus1filll\hbox{$\Diamond$}\end{trivlist}}
\def\[#1\]{$\mathtt{#1}$}
\def\D\[#1\]{$$\mathtt{#1}$$}
% l2h macro [ 0
% l2h macro ] 0
% l2h ignore D
% N.B. {\LaTeX} subverts the resetting of \hangindent by \par.
\newenvironment{proofstep}
{\begin{trivlist}\item[]\leftskip=2em\global\hangindent=0pt % the big hammer
$\displaystyle\begin{array}{l}}
{ \end{array}$\end{trivlist}}
\newenvironment{proofstep*}
{\begin{trivlist}\item[]\leftskip=2em\global\hangindent=0pt % the big hammer
\begin{tabular}{l@{\,}l}}
{\end{tabular}\end{trivlist}}
\newcommand\proofhang{}
\def\proofhang#1({%
\setbox0=\hbox{#1(}%
\noindent
\hangindent=\wd0
\hangafter=1
\box0
}
\begin{document}
\maketitle
\ifx\stretchme\undefined\else\def\baselinestretch{1.5}\fi
\begin{abstract}
Unparsing is the problem of transforming an internal representation of
a program into an external, concrete syntax.
In conjunction with prettyprinting, it is useful for
generating readable programs from internal representations.
If the target language uses prefix and postfix operators,
the problem is nontrivial.
This paper shows how to unparse expressions using a simple, bottom-up
tree walk, which keeps track of the least tightly binding operator not
enclosed by parentheses.
During the tree walk, this operator is compared with the operator of
the parent expression, and parentheses are inserted based on the
precedence, associativity, and {fixity} (infix, prefix, or
postfix) of the two operators.
The paper is a literate program.
It includes code for the
unparser and for its inverse parser, both of which can handle
operators of dynamically chosen precedence and associativity.
Supporting such operators is useful for languages like~ML, in which
programmers may
assign precedence and associativity to their own functions.
\end{abstract}
\section*{Key Words}
\keywords{Unparsing\quad Parsing\quad Prettyprinting\quad Literate Programming\quad Standard~ML}
\section{Introduction}
% Symbolic-computing systems that manipulate algebraic expressions may
% need to present such expressions in a readable form.
Automatic generation of computer programs from high-level
specifications is a well-known technique, which has been used to
create
scanners, parsers, code generators, tree rewriters, packet filters,
dialogue handlers,
machine-code recognizers, and other tools.
To make it easier to debug an application generator, and to help
convince
prospective users that the generated code can be trusted,
the generated code should be as idiomatic and readable as possible.
It is therefore desirable, and sometimes essential, that the generated
code use
infix, prefix, and postfix operators, and that expressions be presented
without unnecessary parentheses.
Another helpful technique is to use a prettyprinter to automate
indentation and line breaking.\cite{oppen:prettyprinting,hughes:pretty-printing}
This paper presents a method of automatically parenthesizing
expressions that use prefix, postfix, and implicit operators; the
method is compatible with automatic prettyprinting.
The paper also shows how to parse expressions that use such operators.
The parsing algorithm can be used even if operator precedences are not
known at compile time, so it can be used with
arbitrarily many user-defined precedences.
%% For example, the New Jersey Machine-Code Toolkit is an application
%% generator that writes code to help encode and decode machine
%% instructions \cite{ramsey:jersey}.
%% The generated code may be written in C or in Modula-3.
%% \emph{What about compiling to C???}
This paper is a ``literate program,''\cite{knuth:literate} which means
that a single
source file is used to prepare both manuscript and code.
Including code provides a formal, precise, testable statement of the
unparsing algorithm.
The code is written in the purely functional subset of
Standard~ML.\cite{milner:definition}
ML is a good choice for several reasons.
Referential transparency makes the subset easy to reason about, so we
can show the correctness of simple unparsing algorithms.
It is possible to express some properties of the data as type
properties, which are checked by the compiler.
Finally, SML's parameterized modules make the code reusable
in several different contexts---the code that appears in the paper is
the code used in both the SLED and the $\lambda$-RTL
compilers,\cite{ramsey:specifying,ramsey:embedded} for example.
The paper explains many ML constructs as they are used, but
readers not familiar with ML may still wish to consult
References~\nocite{ullman:elements}\nocite{paulson:ml}\nocite{felleisen:mler}\refno{ullman:elements}--\refno{felleisen:mler}
for an introduction.
The paper presents algorithms for two versions of the unparsing problem.
In the first version, all operators are binary infix operators;
this version of the unparsing problem is well understood, and it is simple
enough to make a proof of correctness tractable.
The invariants and insights from this proof lead to an
algorithm for a
more general version, which handles prefix and postfix operators, as
well as non-associative infix operators of arbitrary arity.
The algorithm can also handle some ``mixfix'' operators, like the array-indexing
operator~$\cdot[\cdot]$.
The algorithm is the main contribution of this paper.
It is presented as executable code, which uses
parameterized modules to make it
independent of the representation of the unparsed form.
\section{Parsing and unparsing}
Parsers and unparsers translate between concrete and abstract syntax.
\iffalse
It is essential to avoid ambiguities when translating from an internal
form to a concrete syntax.
When emitting declarations and statements, there is little problem
with ambiguity; the only common pitfall is the
\textbf{if}-\textbf{then}-\textbf{else} ambiguity present in C and
some other languages \cite[page number?]{aho:compilers}, and there
it is sufficient to make sure that every
\textbf{if} statement has an \textbf{else} branch.
Expressions are another story.
\fi
The concrete syntax used to
represent expressions is usually ambiguous;
ambiguities are resolved using operator precedence and parentheses.
Internal representations are unambiguous but not suitable as
concrete syntax.
For example, expressions are often represented as simple trees, in which an
expression is either an ``atom'' (constant or variable) or an operator
applied to a list of expressions.
%
We can describe such trees by the following grammar,
written in extended BNF,\cite{wirth:ebnf} in which
nonterminals appear in \nt{italic} and terminals in \term{slant}
fonts:
\begin{production}{expression}
\term{atom} | \term{operator} \sequence{\nt{expression}}
\end{production}
LISP and related languages use a nearly identical concrete syntax,
simply by requiring parentheses around each application.
The designers of other languages have preferred other concrete syntax,
using
%%
%% A na\"\ive translation of internal forms to concrete syntax can be
%% incorrect, because the internal representations tend to be too simple.
%%
%% ; for example, a tree representing the product of $x+y$ and
%% $z$ can't simply be represented as $x+y\times z$.
%%
infix binary
operators, prefix and postfix unary operators, and possibly ``mixfix''
operators, as described by the following grammar:
\begin{production}{expression}
\term{atom}
| \nt{expression} \term{infix-operator} \nt{expression}
| \term{prefix-operator} \nt{expression}
| \nt{expression} \term{postfix-operator}
| \nt{expression} \sequence{\term{nary-operator} \nt{expression}}
| \nt{other form of expression\ldots}
\end{production}
Even if the internal form distinguishes these kinds of operators, this
grammar is unsuitable for parsing, because it is ambiguous.
Grammars intended for parsing use various levels of precedence and associativity
to resolve ambiguities.
Once precedence and associativity have been determined, they can be
encoded by introducing new nonterminals for each level
of precedence.
Users of shift-reduce parsers can specify precedence and associativity
directly, then have the parser use them to resolve shift-reduce
conflicts.
The parsing problem is to translate from concrete syntax to internal
representation.
The unparsing problem is to take an internal representation
and to produce a concrete syntax that, when parsed,
reproduces the original internal representation.
That is, an unparser is correct only if it is a right inverse of its
parser.
For example, if our internal representation is a tree representing the
product of $x+y$ with~$z$, we cannot simply walk the tree and emit
the concrete representation $x+y\times z$; we must emit $(x+y)\times
z$ instead.
As stated, the unparsing problem does not have a unique solution;
in general, there are many possible concrete syntaxes that,
when parsed, reproduce the original internal representation.\label{corr:inverse+}
For example, we could choose a concrete syntax that parenthesizes
every expression, as in $(((x)+(y))\times (z))$.
The LISP rule parenthesizes every application, as in
$((x+y)\times z)$.
Rules like these use too many parentheses.
Code with extra parentheses is unidiomatic and unreadable, like
this C~code, which has been emitted using something like the LISP rule:
{\hfuzz=28pt
\begin{verbatim}
if ((((Mem.u).Index8).index) != 4)
if (!((unsigned(((Mem.u).Index8).d)) < 0x100))
fail((("((Mem.u).Index8).d = 0x%x won't fit in 8 unsigned bits"),
(((Mem.u).Index8).d)));
else
{
emit((((7 | (15 @<< 4)) | (1 @<< 3)), 1));
emit((((4 | (3 @<< 3)) | (1 @<< 6)), 1));
emit((((((((Mem.u).Index8).ss) @<< 6) |
((((Mem.u).Index8).index) @<< 3)) |
(((Mem.u).Index8).base)), 1));
emit((((((Mem.u).Index8).d) & 0xff), 1));
}
else
fail(("Conditions not satisfied for constructor CALL.Epod"));
\end{verbatim}
The unparser in this paper is not merely a parser's right inverse; it
also produces results without extra parentheses:
\begin{verbatim}
if (Mem.u.Index8.index != 4)
if (!((unsigned)Mem.u.Index8.d < 0x100))
fail("Mem.u.Index8.d = 0x%x won't fit in 8 unsigned bits",
Mem.u.Index8.d);
else
{
emit(7 | 15 @<< 4 | 1 @<< 3, 1);
emit(4 | 3 @<< 3 | 1 @<< 6, 1);
emit(Mem.u.Index8.ss @<< 6 | Mem.u.Index8.index @<< 3 |
Mem.u.Index8.base, 1);
emit(Mem.u.Index8.d & 0xff, 1);
}
else
fail("Conditions not satisfied for constructor CALL.Epod");
\end{verbatim}
\par}
The unparsing method uses information about precedence,
associativity,
and ``fixity'' of operators to transform an internal form into
a concrete syntax.
The method works from the bottom up and from the
inside out.
In each expression, it finds the operator, and it considers the
subexpressions and their positions, left or right, relative to the
operator.
It decides whether to parenthesize subexpressions by comparing
precedence and associativity of the current operator with the
precedences and associativities of the most loosely binding operators
in the subexpressions.
Operators that are ``covered'' by parentheses do not participate in
the comparison.
Here are four example trees:\\
\centerline{%
\psset{linewidth=0.4pt}%
\setlength\tabcolsep{1.5em}%
\begin{tabular}{cccc}
\pstree[nodesep=2pt,levelsep=15pt]
{\TR{$\times$}}
{\pstree{\TR{$+$}}{\TR{$x$}\TR{$y$}}
\TR{$z$}}
&
\pstree[nodesep=2pt,levelsep=15pt]
{\TR{$+$}}
{\pstree{\TR{$\times$}}{\TR{$x$}\TR{$y$}}
\pstree{\TR{$+$}}{\TR{$z$}\TR{$w$}}}
&
\pstree[nodesep=2pt,levelsep=15pt]
{\TR{\dpsmash{postfix [[++]]}}}
{\pstree{\TR{[[*]]}}{\TR{[[p]]}}}
&
\pstree[nodesep=2pt,levelsep=15pt]
{\TR{\dpsmash{prefix [[++]]}}}
{\pstree{\TR{[[*]]}}{\TR{[[p]]}}}\\
\noalign{\medskip}
(a)&(b)&(c)&(d)\label{corr:trees}
\end{tabular}}\\[0.5ex]
In tree (a), discussed above, the unparser renders the left
subtree as $x+y$, without parentheses, because the subexpressions
$x$~and~$y$ contain no operators.
At the root, because operator~$\times$ has higher precedence than
operator~$+$, the unparser parenthesizes the left subexpression, and
the final result is $(x+y)\times z$.
In tree~(b), $+$~has lower precedence than~$\times$, so it is not
necessary to parenthesize the left subtree.
Because $+$~is left-associative, however, it is necessary to
parenthesize the right subtree, because it is a right child of a
parent with the same precedence and left associativity.
The final result is therefore $x \times y + (z + w)$.
Trees (c)~and~(d) show prefix and postfix operators as used in~C.
In tree~(c), [[++]]~and~[[*]] have opposite ``fixity'' (one is postfix
and one is prefix), and because [[++]]~has higher precedence
than~[[*]], it is necessary to parenthesize the subexpression,
resulting in [[(*p)++]].
In tree~(d), [[++]]~and~[[*]] have the same fixity, so precedence is
irrelevant, and the expression is unparsed as~[[++*p]].
Although the full unparsing method handles prefix, postfix, and infix operators,
it is easier to understand if we begin with the simpler
case in which all operators are
binary infix operators.
\section{Parsing and unparsing with infix operators}
\subsection{Abstract and concrete syntax for infix operators}
Type [[rator]] represents a binary infix operator, which has a text
representation, a precedence, and an
associativity:
<>=
type precedence = int
datatype associativity = LEFT | RIGHT | NONASSOC
type rator = string * precedence * associativity
@ This ML code uses simple integers ([[int]]) to represent precedence,
an enumeration to represent associativity, and a triple to represent
an operator.
(In the context of an ML type definition, a~[[*]] does not represent
multiplication; it connects elements of a tuple.)
The more general unparser, presented below, shows how to use an
arbitrary type, not just [[string]], as an operator's concrete
representation.
Precedence and associativity determine how infix expressions are
parsed into trees, or equivalently, how they are parenthesized.
For example,
if operator~$\otimes$ has higher precedence than operator~$\oplus$,
then
\mbox{$x \oplus y \otimes z = x \oplus (y \otimes z)$} and
\mbox{$x \otimes y \oplus z = (x \otimes y) \oplus z$}.
When two operators have the same precedence, associativity is used to
disambiguate.
If $\oplus$~is left-associative,
\mbox{$x \oplus y \oplus z = (x \oplus y) \oplus z$}; if it is
right-associative,
\mbox{$x \oplus y \oplus z = x \oplus (y \oplus z)$}.
{\hfuzz=2pt
Some languages have \emph{non-associative} operators;
if $\oplus$~is non-associative,
then $$(x \oplus y) \oplus z \ne x \oplus y \oplus z \ne x \oplus
(y \oplus z),$$ and \mbox{$x \oplus y \oplus z$} may be illegal.
The comma that separates parameters in a C~function call is an
example of a non-associative operator.
\par}
Many expressions are obtained by applying operators to other
expressions, but there must always be indivisible constituents of
expressions.
We call such constituents \emph{atoms}.
They appear at the leaves of abstract syntax trees and as the
non-operator tokens in concrete syntax.
In most languages the atoms include identifiers,
integer and real literals, string literals, and so on.
In our initial, simple model,
an expression is either an atom or an infix operator applied to two expressions:
<>=
datatype ast = ATOM of string
| APP of ast * rator * ast
@ Type [[ast]] represents an expression's abstract syntax tree.
This ML [[datatype]] definition is something like a production in a context-free
grammar; it gives two alternative ways of constructing an [[ast]].
These alternatives are always given names ([[ATOM]] and
[[APP]]), which are called \emph{constructors}.
An ML datatype may contain any number of constructors, and there may
be data associated with each one.
Here, for an atom, we care only about the atom's string
representation.
For an application, we want the operator and the [[ast]]s representing
the two operands.
Because [[ast]] is used in its own definition, it is a recursive type,
and values of type [[ast]] are trees.
@
Type type [[ast]] represents the input to the unparser; we also need a
type to represent the unparser's output.
For simplicity, we treat this output as a sequence of lexemes, where a
lexeme represents an atom, an operator, or a parenthesis.
Moreover, we undertake to emit only sequences in which atoms and
operators alternate, or in which parenthesized sequences take the
place of atoms.
Let us call such a sequence an [[image]] and use a representation that
forces it to satisfy the following grammar:
\begin{production}{image}\nt{lexical-atom} \sequence{\term{operator} \nt{lexical-atom}}\end{production}
\begin{production}{lexical-atom}\term{atom} \vbar\ \lit( \nt{image} \lit)
\end{production}
@
In the corresponding ML, the sequence in braces becomes
the type [[image']]:
<>=
datatype image = IMAGE of lexical_atom * image'
and image' = EOI (* end of image *)
| INFIX of rator * lexical_atom * image'
and lexical_atom = LEX_ATOM of string
| PARENS of image
@ [[EOI]] represents the end of the image, and [[PARENS]] represents
an image in parentheses.
This representation enforces the invariants that expressions and
operators alternate,
and that the first and last elements of an image are expressions.
@
\subsection{Parsing infix expressions}
To be correct, an unparser must produce a sequence that parses back
into the original abstract syntax tree.
We develop an unparsing algorithm by thinking about parsing.
To understand how to minimize parentheses, we need to consider where
parentheses are needed to get the correct parse.
Suppose we have an abstract syntax tree that has been obtained by
parsing an image without
any parentheses.
Then wherever the syntax tree has an [[APP]] node whose parent is also
an [[APP]] node, there are two cases: the child may be the left child
or the right child of its parent:
\begin{center}\tt
\setlength{\unitlength}{0.05in}%
\begin{picture}(14,15)(47,-29)
\put(55,-15){\makebox(0,0)[b]{outer}}
\put(50,-28){\makebox(0,0)[b]{inner}}
\drawline(54,-16)(50,-26)
\dashline{1}(56,-16)(60,-26)
\end{picture}%
\qquad\qquad
\begin{picture}(14,15)(47,-29)
\put(55,-15){\makebox(0,0)[b]{outer}}
\put(60,-28){\makebox(0,0)[b]{inner}}
\dashline{1}(54,-16)(50,-26)
\drawline(56,-16)(60,-26)
\end{picture}%
\end{center}
Because this tree was obtained by parsing parenthesis-free syntax,
either the inner operator has higher precedence than the outer, or
they have the same precedence and associativity and the associativity
is to the left (right) if the inner is a left (right) child of the
outer.
We formalize this condition using the predicate
\mbox{[[noparens(inner, outer, ]]\emph{side}[[)]]}, where \emph{side}
is [[LEFT]] or [[RIGHT]] and
[[noparens]] is defined
as follows:
\addbox{noparens : rator * rator * associativity -> bool}
\deeperbox{2pt}
<>=
fun noparens (inner as (_, pi, ai), outer as (_, po, ao), side) =
pi > po
orelse pi = po andalso ai = ao andalso ao = side
(*unboxval*)
@ where [[pi]], [[ai]], [[po]], and [[ao]] stand for precedence or
associativity of inner or outer operators.
This ML code defines a function [[noparens]].
Its name and type are given in the box at the beginning of the chunk.
Type assertions in boxes are inserted into the code and checked by the
compiler.
The type of [[noparens]] shows that it is a Boolean function
of three arguments (two operators and an
associativity).
The structure of the definition is
% \begin{quote}
\mbox{[[fun]] \term{name} \nt{binding} [[=]] \nt{body}}.
% \end{quote}
The \nt{binding} associates names to the arguments; these names are
used in the body.
\mbox{[[inner as (_, pi, ai)]]} is a nested binding; it binds [[pi]]
to the precedence of~[[inner]] and [[ai]] to the associativity
of~[[inner]].
The underscore~[[_]] is a ``wildcard,'' which indicates that the
text representation of [[inner]] is to be ignored and not bound to any
name.
The process of using the \nt{binding} to associate names with the
arguments and parts of the arguments is called \emph{ML pattern matching}.
@
Readers familiar with the treatment of operator-precedence
parsing in Section~4.6 of \citeN{aho:compilers} may recognize that
\begin{eqnarray*}
\mathtt{noparens}(i, o, \mathtt{LEFT}) &\equiv& i \dotgt o\\
\mathtt{noparens}(i, o, \mathtt{RIGHT}) &\equiv& o \dotlt i
\end{eqnarray*}
We use [[noparens]] during unparsing to decide when parentheses are
needed and during parsing to decide whether to shift or reduce.
@
\newcommand\rator{\oplus}
\newcommand\rators{\rator_s}
\newcommand\ratori{\rator_i}
\renewcommand\rators{\rator_{\mathtt{stack}}}
\renewcommand\ratori{\rator_{\mathtt{ipts}}}
\newcommand\ratorim{{\rator_{\mathtt{im}}}}
\newcommand\op{\oplus}
To prove correctness of the unparser, I introduce a simple LR~parser.
The unparser is correct if \mbox{[[parse (unparse e) = e]]}
for any syntax tree [[e]].
In other words, if we begin with an abstract syntax tree~[[e]],
unparse it, and then parse the resulting lexemes, we get back the
original abstract syntax tree.
@
In the parser's stack, expressions and operators
alternate. Unless the stack is empty, there is an operator on the
top, which is written to the right.
<>=
datatype stack = BOT
| PUSH of stack * ast * rator
@ This recursive datatype is isomorphic to a list of (operator, ast)
pairs, but it is helpful to define a special type because the
constructors [[BOT]] and
[[PUSH]] distinguish parser stacks from other kinds of lists.
@
The parser itself can be simplified by a bit of trickery.
Instead of treating ``bottom of stack'' or ``end of input'' as special
cases, treat them as occurrences of a sentinel operator,
[[minrator]].
[[minrator]] has precedence [[minprec]], which
must be lower than the precedence of any real
operator.
Using this trick, we define functions that return the operator on
the top of the stack and the operator about to be scanned in the input.
\addbox{srator : stack -> rator}
\addbox{irator : image' -> rator}
\deeperbox{2pt}
<>=
val minrator : rator =
("", minprec, NONASSOC)
fun srator (PUSH (_, _, $)) = $
| srator BOT = minrator
fun irator (INFIX ($, _, _)) = $
| irator EOI = minrator
(*unboxval*)
@ Given a stack [[stack]] and an input sequence of tokens [[ipts]], we
may
\iffalse % bad line break
write $\rators$~for \mbox{[[srator stack]]} and
$\ratori$~for \mbox{[[irator ipts]]}.
\else
abbreviate \mbox{[[srator stack]]} as~$\rators$ and
abbreviate \mbox{[[irator ipts]]} as~$\ratori$.
\fi
@
These function definitions show a new ML construct---we define
functions on [[datatype]]s by pattern-matching against the
datatype constructors.
We provide a \nt{binding} and a \nt{body} for each
constructor.
This ability to use pattern matching to do case analysis is one of the
advantages of using~ML, especially because the compiler ensures that
all cases are covered.
This code also illustrates an unusual syntactic feature of ML: we can use
strings of symbols, as well as strings of letters, to make identifiers.
In this paper, by convention, the identifier~[[$]] stands for an
arbitrary operator.
@
We now have enough machinery to write a shift-reduce LR~parser,
which continues parsing until its stack
and input are empty.
The parser's state includes a stack with an
operator~$\rators$ on top, a current expression~[[e]], and the input, which
begins with the operator~$\ratori$.
This state is represented by three parameters to the function [[parse']].
Parsing is complete when the stack is empty ([[BOT]]) and the input is
empty ([[EOI]]).
\addbox{parse' : stack * ast * image' -> ast}
\addbox{parse : image -> ast}
\addbox{parseAtom : lexical_atom -> ast}
<>=
exception Associativity
local
exception Impossible
fun parse' (BOT, e, EOI) = e
| parse' (stack, e, ipts) =
<>
and parse (IMAGE(a, tail)) = parse'(BOT, parseAtom a, tail)
and parseAtom (LEX_ATOM a) = ATOM a
| parseAtom (PARENS im) = parse im
(*unboxval*)
in
val parse = parse
end
@ The ML code uses two new constructs, and it shows a new way to use
pattern matching.
The declaration \mbox{[[local]] \nt{private-declarations} [[in]]
\nt{public-declarations} [[end]]} hides information.
Private declarations are invisible outside the [[local]]
construct; only public declarations escape.
The connective [[and]] is used with [[fun]] to define a nest of mutually
recursive functions.
The first match in the definition of [[parse']] handles the case when
both the stack and the input are empty.
The next match handles all other cases; this works because a variable
in a \nt{binding} matches any datatype constructor.
@
The interesting part of the parser is deciding whether to shift or
reduce.
Shifting pushes [[e]]~and~$\ratori$ onto the stack.
Reducing pops $\rators$ and the expression
below it, creating a new
[[APP]] node with operator $\rators$.
Shifting $\ratori$ guarantees that $\ratori$ will be reduced before
$\rators$, and therefore that $\ratori$~will eventually be a right
descendant of $\rators$.
Reducing guarantees that $\rators$ will eventually be a left
descendant of $\ratori$.
The parser uses [[noparens]] to choose whichever alternative is
correct without parentheses.
The choice is deterministic because the conditions for shifting and
reducing are mutually
exclusive; the proof of correctness relies on that fact.
<>=
if noparens(irator ipts, srator stack, RIGHT) then (* shift *)
case ipts
of INFIX ($, a, ipts') =>
parse'(PUSH(stack, e, $), parseAtom a, ipts')
| EOI => raise Impossible (* EOI has lowest precedence *)
else if noparens(srator stack, irator ipts, LEFT) then (* reduce *)
case stack
of PUSH (stack', e', $) => parse'(stack', APP(e', $, e), ipts)
| BOT => raise Impossible (* BOT has lowest precedence *)
else
raise Associativity
(* saw a + b + c, for some nonassociative + *)
@ Pattern
matching can be used in [[case]] statements as well as in function definitions.
Here it is used to extract the first two inputs or the expression and
operator on top of the stack.
Because [[srator]] and [[irator]] return [[minrator]] for [[BOT]] and
[[EOI]], the parser shifts when the stack is empty and
reduces when there are no more inputs.
Therefore,
the lines that raise \mbox{[[Impossible]]} are never
executed; they are included only to keep the compiler from complaining
about cases that are not covered.
@
The parser uses two different kinds of recursion.
The direct recursive calls from [[parse']] to itself are all tail
calls; such calls are ML programmers' way of writing loops.
In an imperative language, [[parse']] would be written with a loop,
and it would modify three variables: the stack, the current
expression, and the inputs. When the stack and
inputs were empty, it would exit the loop by returning the current
expression.
An imperative version of
[[parse']] would keep the indirect recursive call through
[[parseAtom]] and [[parse]].
@
\subsection{Unparsing infix expressions}
Before getting into the details of unparsing, we define functions that
make images from atoms and put parentheses around images.
\addbox{image : string -> image}
\addbox{parenthesize : image -> image}
\deeperbox{2pt}
<>=
fun image a = IMAGE (LEX_ATOM a, EOI)
fun parenthesize im = IMAGE (PARENS im, EOI)
(*unboxval*)
@ [[infixImage]] combines two images by putting an infix operator
between them. It uses [[append]] as an auxiliary function.
\addbox{append : image' * image' -> image'}
\addbox{infixImage : image * rator * image -> image}
\deeperbox{2pt}
<>=
local
fun append (EOI, image') = image'
| append (INFIX($, a, tail), image') =
INFIX($, a, append(tail, image'))
in
fun infixImage (IMAGE(a, tail), $, IMAGE(a', tail')) =
IMAGE(a, append(tail, INFIX($, a', tail')))
(*unboxval*)
end
@
To unparse an expression, we produce a \emph{fragment} containing
not only
the image of the expression, but also the lowest-precedence operator
used in the expression.
That operator tells us enough to decide whether
to parenthesize that expression when it is used.
<>=
type fragment = rator * image
@
The \emph{top-level operators} of an image are the
operators that appear outside of parentheses.
If an image has top-level operators, then the top-level operators
of least precedence must all have the same associativity, which must
be [[LEFT]] or [[RIGHT]].
Otherwise, one of those operators would have to be in parentheses,
which contradicts the assumption that they are top-level operators.
@
The function [[bracket(frag, side, rator)]] parenthesizes a fragment [[frag]]
that is to appear next to an operator [[rator]] on the side labelled [[side]]:
\addbox{bracket : fragment * associativity * rator -> image}
<>=
fun bracket((inner, image), side, outer) =
if noparens(inner, outer, side) then image else parenthesize image
@ Given the ability to bracket fragments, unparsing is straightforward.
We create another sentinel operator [[maxrator]], having precedence [[maxprec]],
which must be higher than the precedence of any true operator, so we
can use it in fragments made from atoms.
\addbox{unparse' : ast -> rator * image}
\addbox{unparse : ast -> image}
%%% type fragment = rator * image
\deeperbox{2pt}
<>=
local
val maxrator = ("", maxprec, NONASSOC)
fun unparse' (ATOM a) = (maxrator, image a)
| unparse' (APP(l, $, r)) =
let val l' = bracket (unparse' l, LEFT, $)
val r' = bracket (unparse' r, RIGHT, $)
in ($, infixImage(l', $, r'))
end
in
fun unparse e =
let val ($, im) = unparse' e
in im
end
(*unboxval*)
end
@ Using [[bracket]] maintains the precedence and associativity
invariants of the fragments.
[[unparse]] first computes a fragment, then discards the operator,
leaving only the image.
@
\subsection{Proof of correctness}
\def\dollar{\mbox{[[$]]}}%
Proving the simple unparser correct helps clarify how the parser and
unparser work together.
The result giving the most insight is Proposition~\ref{prop:nonassoc},
which gives
the properties of the top-level operators produced during unparsing.
Most of the proof is omitted from the body of the paper; it appears
instead in Appendix~\ref{app:proof}.
\newcommand{\parser}[3]{\[parse'(#1, #2, #3)\]}
\long\def\proofstext{
\setcounter{prop}{0}
\setcounter{defn}{0}
\setcounter{lemma}{0}
\ifshortproof
\else
Proofs are written in a goal-directed, calculational
style.
A goal-directed proof begins with a conclusion to be proved, then
reasons backwards towards premises.
The backwards arrow~$\impliedby$ is pronounced
``is implied by.''
\fi
\begin{prop}
[[parse (unparse e)]] $=$ [[e]].
\end{prop}
\ifshortproof
This proposition follows from
Proposition~\ref{e-induction}.
\else
\begin{proof}
Rewrite [[parse (unparse e)]] into an
expression more easily proved equal to~[[e]].
\begin{proofstep*}
[[parse (unparse e)]]
\end{proofstep*}
$=$
(by choosing suitable [[a]] and [[tail]] for the result of
[[unparse e]])
\begin{proofstep*}
[[parse (IMAGE (a, tail))]]
\end{proofstep*}
$=$
(by the definition of [[parse]])
\begin{proofstep*}
[[parse' (BOT, parseAtom a, tail)]]
\end{proofstep*}
$=$
(by the definition of [[append]])
\begin{proofstep*}
[[parse' (BOT, parseAtom a, append(tail, EOI))]]
\end{proofstep*}
$=$
(by Proposition~\ref{e-induction})
\begin{proofstep*}
[[parse'(BOT, e, EOI)]]
\end{proofstep*}
$=$
(by definition of [[parse']])
\begin{proofstep*}
[[e]]
\end{proofstep*}
\end{proof}
\fi
\begin{prop}[Correctness of unparsing in context]
\ifshortproof\label{e-induction}\fi
If [[e]] is any expression, and if\/ [[a]]~and\/~[[tail]] are chosen such
that \mbox{[[unparse e = IMAGE(a, tail)]]},
and if [[s]] is a parser stack
and [[i]]~an input sequence of type [[image']] such that
for all top-level operators $\op'$ in [[tail]],
\D\[noparens(\op', srator~s, RIGHT) \textrm{~and~} noparens(\op', irator~i, LEFT),\]
then [[parse' (s, parseAtom a, append(tail, i)) = parse' (s, e, i)]].
\end{prop}
\ifshortproof
In other words, even when [[e]]~is a fragment of a
larger expression, we can parse the concrete syntax
[[(a, tail)]] after having
parsed a prefix (stack~[[s]]) and before parsing
a suffix (inputs~[[i]]), and in this context we
recover the original expression~[[e]], leaving stack~[[s]] and
inputs~[[i]] to be processed.
The conditions involving [[noparens]] say that the
top-level operators in \mbox{[[unparse e]]} bind at least as tightly
as~$\rators$, which is on top of the stack, and at least as tightly as~$\ratori$, which
follows~[[e]];
furthermore, when the binding powers are the same, the associativity
forces the parser to shift or reduce as appropriate.
\fi
\ifshortproof
\begin{defn}[Binds as tightly as]
\label{bind-defn}
An operator $\rator$ \emph{binds as tightly as} another operator
[[rator]] if and only if
$$\begin{array}{c@{\,}l}
&\prec\rator > \prec{[[rator]]}\\
\lor&\prec\rator = \prec{[[rator]]} \land \assoc\rator = \assoc{[[rator]]}.
\end{array}$$
\end{defn}
This relation is like [[noparens]], except it ignores
the relative positions of
operators of the same precedence and associativity.
This weakening of [[noparens]] yields a reflexive, transitive, binary
relation, which is in fact a slight variation on ``has precedence at
least as great.''
\else
\stepcounter{defn}
\fi
\ifshortproof
\newcommand\parseAtom[1]{parseAtom~##1}
The proof of Proposition~\ref{e-induction} is by structural induction on~[[e]].
The base case is straightforward.
To get the idea of the proof of the induction step, we use textual
juxtaposition to write expressions, inputs, and parser stacks.
If~\[e = l \op r\],
we can write \[unparse~e=a_l t_l \op a_r t_r\], where \[a\]'s are
atoms and \[t\]'s (tails) are sequences of atoms and operators.
Writing
\parser{s}{\parseAtom a_l}{ t_l \op a_r t_r i}
for the state of the parser, we prove Proposition~\ref{e-induction} by
showing
\begin{proofstep*}
\parser{s}{\parseAtom a_l}{ t_l \op a_r t_r i}
\end{proofstep*}
$=$ (by the induction hypothesis applied to \[l\])
\begin{proofstep*}
\parser{s}{l}{\op a_r t_r i}
\end{proofstep*}
$=$ (by hypothesis, \[noparens(\op, srator~s, RIGHT)\], and the parser shifts)
\begin{proofstep*}
\parser{s l \op}{\parseAtom a_r}{t_r i}
\end{proofstep*}
$=$ (by the induction hypothesis applied to \[r\])
\begin{proofstep*}
\parser{s l \op}{r}{i}
\end{proofstep*}
$=$ (by hypothesis, \[noparens(\op, irator~i, LEFT)\], and the parser reduces)
\begin{proofstep*}
\parser{s}{l \op r}{i} $=$ \parser{s}{e}{i}
\end{proofstep*}
The details involve showing that the preconditions hold
and it is safe to apply the induction hypothesis.
We need facts about the
results returned by [[unparse]] and [[unparse']], as stated in the
next two propositions.
\else
\begin{proof}
By structural induction on~[[e]].
If [[e]] is an atom, write [[e = ATOM x]], and
\begin{proofstep*}
[[parse' (s, parseAtom a, append(tail, i))]]
\end{proofstep*}
$=$
(because by assumption [[unparse e = IMAGE (LEX_ATOM x, EOI)]])
\begin{proofstep*}
[[parse' (s, parseAtom (LEX_ATOM x), append(EOI, i))]]
\end{proofstep*}
$=$
(by definitions of [[parseAtom]] and [[append]])
\begin{proofstep*}
[[parse' (s, ATOM x, i)]]
\end{proofstep*}
$=$
(by assumption)
\begin{proofstep*}
[[parse' (s, e, i)]]
\end{proofstep*}
For the induction step, \[e\] must have the form
\[e = APP(l, \op, r)\], and we can choose \[l'\]~and~\[r'\] as in
\[unparse'\], so
\[unparse~e = infixImage(l', \op, r')\].
Because both \[l'\] and \[r'\] have type [[image]], we can
write \[l' = IMAGE(a_l, t_l)\] and \[r' = IMAGE(a_r, t_r)\].
By using juxtaposition to abbreviate appending, we
write
\[unparse~e = a_l t_l \op a_r t_r\].%
\footnote{This shorthand abbreviates % the mouthful
\[unparse~e = IMAGE(a_l, append(t_l, INFIX(\op, a_r, t_r)))\].}
We therefore consider
\begin{proofstep*}
\[parse'(s, [[parseAtom]]~a_l, t_l \op a_r t_r i)\]
\end{proofstep*}
$=$
(by applying the induction hypothesis to~\[l'\], as justified below)
\begin{proofstep*}
\[parse'(s, l, \op a_r t_r i)\]
\end{proofstep*}
\proofhang $=$
(by definition of [[parse']], the parser shifts, because by
hypothesis, $\op$~is a top-level operator from \mbox{[[unparse e]]}
and therefore
\[noparens(\op, srator~s, RIGHT)\])
\begin{proofstep*}
\[parse'(s l \op, [[parseAtom]]~a_r, t_r i)\]
\end{proofstep*}
$=$
(by applying the induction hypothesis to~\[r'\], as justified below)
\begin{proofstep*}
\[parse'(s l \op, r, i)\]
\end{proofstep*}
\proofhang $=$
(by definition of [[parse']], the parser reduces, because by
hypothesis, $\op$~is a top-level operator from \mbox{[[unparse e]]}
and therefore
\[noparens(\op, irator~i, LEFT)\])
\begin{proofstep*}
\[parse'(s, APP(l, \op, r), i)\]
\end{proofstep*}
$=$
(by assumption)
\begin{proofstep*}
\[parse'(s, e, i)\]
\end{proofstep*}
It remains to justify applying the induction hypothesis to
\[l'\]~and~\[r'\].
By definition of [[unparse]] and [[bracket]], either
\D\[l' = unparse~l \textrm{\qquad or\qquad}l' = parenthesize(unparse~l).\]
In the latter case, \[l' = IMAGE(PARENS(unparse~l), EOI)\], so by the
definitions of \[a_l\]~and~\[t_l\], we have
\[a_l = PARENS(unparse~l)\] and \[t_l = EOI\], and
\begin{proofstep*}
\[parse'(s, [[parseAtom]]~a_l, t_l \op a_r t_r i)\]
\end{proofstep*}
$=$ (by assumption and by definition of [[parseAtom]])
\begin{proofstep*}
\[parse'(s, parse(unparse~l), \op a_r t_r i)\]
\end{proofstep*}
\proofhang
$=$ (by the induction hypothesis applied to~\[l\], with empty stack and inputs)
\begin{proofstep*}
\[parse'(s, l, \op a_r t_r i)\]
\end{proofstep*}
which is the step made above.
The case where \[l' = unparse~l\] is the interesting case.
\begin{proofstep*}
\[parse'(s, [[parseAtom]]~a_l, t_l \op a_r t_r i)\]
\end{proofstep*}
\proofhang $=$ (For all top-level operators~$\op'$ in~\[t_l\],
\[noparens(\op', \op, LEFT)\], and we apply the induction hypothesis
with the inputs \[\op r' ~ i\], concluding that)
\begin{proofstep*}
\[parse'(s, [[parseAtom]]~a_l, t_l \op a_r t_r i)\]
\end{proofstep*}
\noindent
We must show \[noparens(\op', \op, LEFT)\] for any top-level $\op'$
from~\[t_l\].
If~$\op$ is non- or right-associative, then by
Proposition~\ref{prop:nonassoc}, $\prec \op' > \prec \op$, and
\[noparens(\op', \op, LEFT)\] follows.
If~$\op$ is left-associative, then
\begin{proofstep*}
\[noparens(\op', \op, LEFT)\]
\end{proofstep*}
$\impliedby$ (by definition of [[noparens]] and left-associativity of~$\op$)
\begin{proofstep*}
$\op'$ binds as tightly as $\op$
\end{proofstep*}
$\impliedby$ (transitivity)
\begin{proofstep*}
$\mbox{$\op'$ binds as tightly as $\op_l$} \land
\mbox{$\op_l$ binds as tightly as $\op$}$
\end{proofstep*}
Where \[unparse'~ l = (\op_l, a_l t_l)\] and the two clauses follow by
applying Proposition~\ref{unparse-covered} to \[l\]~and~\[e\].
\end{proof}
\fi
\begin{prop}
\ifshortproof\label{prop:nonassoc}\fi
If an expression \[e = APP(l, \op, r)\], then we can choose \[l'\] and
\[r'\] as in \[unparse'\], so that
\[unparse~e = l' \op r'\].
\begin{enumerate}
\item
If $\op$ is non-associative, then every top-level operator in \[l'\] and \[r'\] has a
precedence greater than that of $\op$.
\item
If $\op$ is left-associative, then every top-level operator in \[r'\] has a
precedence greater than that of $\op$.
\item
If $\op$ is right-associative, then every top-level operator in \[l'\] has a
precedence greater than that of $\op$.
\end{enumerate}
\end{prop}
\ifshortproof
\else
\newcommand\preland{\llap{$\land$~}\ignorespaces}
\begin{proof}
First, do case
analysis on \[l' = bracket(unparse'~l, LEFT, \op)\].
If \[l'\]~is parenthesized, then (3) and half of~(1) hold vacuously
because \[l'\]~has no top-level operators.
Otherwise, we can assume that \[unparse'~l = (\op_l, l')\], and
\begin{proofstep*}
For all top-level operators \[\op'\] in \[l'\], $\prec{\op'}>\prec\op$
\end{proofstep*}
$\impliedby$ (introducing \[\op_l\])
\begin{proofstep*}
For all top-level operators \[\op'\] in \[l'\], \rlap{$\prec{\op'}\ge\prec{\op_l}$}\\
\preland
\[\prec{\op_l} > \prec\op\]
\end{proofstep*}
$\impliedby$ (definitions of ``covered'' and ``binds as tightly as'')
\begin{proofstep*}
Fragment \[(\op_l, l')\] is covered
%%% \\$\land$&
$\land$
\[\prec{\op_l} > \prec\op\]
\end{proofstep*}
$\impliedby$ (definition of [[noparens]])
\begin{proofstep*}
Fragment \[(\op_l, l')\] is covered
\\\preland
\[noparens((\op_l, l'), \op, LEFT) \land \assoc\op \ne LEFT\]
\end{proofstep*}
$\impliedby$ (definition of [[bracket]])
\begin{proofstep*}
Fragment \[(\op_l, l')\] is covered
\\\preland
\[bracket((\op_l, l'), \op, LEFT)\] is not parenthesized
$ \land$ \rlap{\[ \assoc\op \ne LEFT\]}
\end{proofstep*}
$=$ (by assumption)
\begin{proofstep*}
[[unparse l]] is covered
\\\preland
\[l'\] is not parenthesized
$ \land$ \[ \assoc\op \ne LEFT\]
\end{proofstep*}
$\impliedby$ (Proposition~\ref{unparse-covered}, assumption)
\begin{proofstep*}
\[ \assoc\op \ne LEFT\]
\end{proofstep*}
Which proves (3) and half of~(1). A similar argument proves (2)~and the
other half of~(1).
\end{proof}
\fi
\begin{prop}\ifshortproof\label{unparse-covered}\fi
For any expression [[e]], [[unparse' e]] is properly covered%
\ifshortproof , as defined below\fi.
\end{prop}
\ifshortproof
\begin{defn}[Covered, properly covered]
\label{defn:covered}
A fragment [[frag = (rator, im)]] is \emph{covered} if and only
if every top-level operator in [[im]] binds as tightly as [[rator]].
A covered fragment is \emph{properly covered} if and only if
[[rator = maxrator]] or there is at least
one top-level operator in [[im]].
\end{defn}
The intuition behind a covered fragment [[(rator, im)]] is that
low-precedence operators in [[im]], which do not bind as tightly
as~[[rator]], are safely ``covered'' with parentheses.
Proper coverage avoids pathology for fragments that don't have any
top-level operators.
\fi
\ifshortproof
Proposition~\ref{prop:nonassoc} is proved by case analysis on the
result returned by~[[bracket]].
Proposition~\ref{unparse-covered} is proved by structural induction on
the expression unparsed.
The proof uses this lemma:
\fi
\ifshortproof
\else
%
\begin{proof}
By structural induction.
The base case is trivial.
For the induction step, we must prove
\begin{proofstep*}
[[unparse' e]] is properly covered
\end{proofstep*}
\proofhang
$=$ (letting \[e = l \op r\]
and \[unparse~e = l' \op r'\] as above)
\begin{proofstep*}
\[(\op, l' \op r')\] is properly covered
\end{proofstep*}
$\impliedby$
($\op$ is a top-level operator of \[l' \op r'\])
\begin{proofstep*}
\[(\op, l' \op r')\] is covered
\end{proofstep*}
$=$ (definition of ``covered'')
\begin{proofstep*}
\[\mbox{For all top-level operators } \op' \mbox{~in~} l' \op r',
\op' \mbox{~binds as tightly as~} \op
\]
\end{proofstep*}
$=$ (distributing)
\begin{proofstep*}
\[\mbox{For all top-level operators } \op' \mbox{~in~} l',
\op' \mbox{~binds as tightly as~} \op
\]\\
\preland \[\op \mbox{~binds as tightly as~} \op\]\\
\preland
\[\mbox{For all top-level operators } \op' \mbox{~in~} r',
\op' \mbox{~binds as tightly as~} \op
\]
\end{proofstep*}
$=$ (reflexivity of ``binds as tightly as,'' definition of
``covered'')
\begin{proofstep*}
\[(\op, l')\] is covered $\land$ \[(\op, r')\] is covered
\end{proofstep*}
$=$ (body of [[unparse']])
\begin{proofstep*}
\[(\op, bracket(unparse'~l, LEFT, \op))\] is covered
\\ \preland
\[(\op, bracket(unparse'~r, RIGHT, \op))\] is covered
\end{proofstep*}
$\impliedby$ (bracketing lemma)
\begin{proofstep*}
\[unparse'~l\] properly covered and \[unparse'~r\] properly covered
\end{proofstep*}
which is the induction hypothesis.
\end{proof}
%
\fi
%%%%%
%%%
{\hfuzz=3.8pt
\begin{lemma}[Bracketing lemma]
\ifshortproof\label{bracket-lemma}\fi
If fragment [[f]] is properly covered, then for any operator [[rator]] and
associativity~[[a]], [[bf = (rator, bracket(f, a, rator))]] is covered.
\end{lemma}
\ifshortproof
The lemma is proved by case analysis on whether [[bracket]]
parenthesizes its
fragment.
The proof uses transitivity of ``binds as tightly as.''
\else
\begin{proof}
By case analysis on the result of~[[bracket]].
If the result is parenthesized, then [[bf]] is trivially covered,
because it contains no top-level operators.
Otherwise, let us write [[f = ($, im)]].
Our goal is to show that
\begin{proofstep}
\mbox{[[bf = (rator, bracket(($, im), a, rator)]] is covered.}
\end{proofstep}
$\impliedby$ (by the assumption that the result of [[bracket]] is
[[im]])
\begin{proofstep}
\mbox{[[(rator, im)]] is covered}
\end{proofstep}
$\equiv$ (by Definition~\ref{defn:covered})
\newcommand\forallrator{\mbox{For all top-level operators $\rator$ in [[im]]}}%
\newcommand\allrator{\multicolumn{2}{l}{\forallrator}}%
\begin{proofstep}
\forallrator, \rator \,\mbox{binds as tightly as [[rator]]}
\end{proofstep}
$\impliedby$ (by transitivity of ``binds as tightly as'')
\begin{proofstep}
\forallrator, \phantom{\land} \rator \mbox{ binds as tightly as [[$]]}\\
\phantom{\forallrator, } \land \mbox{ [[$]] binds as tightly as [[rator]]}\\
\end{proofstep}
$\impliedby$ (pulling second clause out of $\forall$ and strengthening it)
\begin{proofstep}
(\forallrator,\rator \mbox{ binds as tightly as [[$]]})\\
\preland \mbox{[[$]] binds as tightly as [[rator]]} \land
\assoc\dollar = \mbox{[[a]]}
\end{proofstep}
$\equiv$ (by definitions of ``covered'' and [[noparens]])
\begin{proofstep}
\mbox{[[($, im)]] is covered}\\
\preland \mbox{[[noparens($, rator, a)]]}
\end{proofstep}
$\impliedby$ (by definition of ``properly covered'' and of [[bracket]])
\begin{proofstep}
\mbox{[[($, im)]] is properly covered}\\
\preland \mbox{[[bracket(($, im), ]]$\assoc\dollar$[[, rator)]]
is not parenthesized}
\end{proofstep}
both of which are true by assumption.
\end{proof}
\fi
} % end of \hfuzz
} %%%%%%%%%%%%%% end of \proofstext
\newif\ifshortproof
\shortprooftrue
\proofstext
The proof is not as simple as it would have been if [[parse]] and
[[unparse]]
had been designed in tandem with a proof,
but [[parse]] and
[[unparse]] were designed to have simple implementations, not a simple proof.
One reason the proof is complex is that
the functions are not full inverses; because parsing leaves no record
of redundant parentheses, \[unparse(parse~e)\ne e\].
Moreover, although the functions are partial inverses, the
computations are not inverses.
The parser is an LR~parser, working left to right with an
explicit stack.
The unparser works not left to right but bottom-up, with
an implicit stack created by recursive calls.
An alternative approach to unparsing might begin
with a parser and attempt to derive an unparser using
the program-transformation techniques known as
\emph{program inversion}.\cite{gries:science,chen:inversion,von-wright:inversion}\label{corr:inversion}
If successful, this attempt would produce an unparser that would be correct
by construction, but it would look substantially different from the
one presented here.
Program-inversion rules have been developed for simple programming
calculi, not real programming languages, so the unparser would have to
be translated into a programming language in order to be executed.
The unparsers in this paper are executable directly, and they are
designed to be easy to understand, even if that design makes the proof
of correctness harder to understand.
\section{Adding prefix and postfix operators}
Simple
infix expressions are inadequate for many real programming languages,
including C and ML.
% Java?
In this section, we add unary
prefix and postfix
operators,
as well as infix operators of arbitrary arity.
Moreover, we use the Standard ML modules system to generalize our idea
of what an image is, so we can produce output for a prettyprinter, not
just text.
\emph{Fixity} distinguishes prefix and postfix operators from infix
operators.
Only infix operators have associativity.
<>=
structure Assoc = struct
datatype associativity = LEFT | RIGHT | NONASSOC
datatype fixity = PREFIX | POSTFIX | INFIX of associativity
end
@ From these definitions,
the possible values of type [[fixity]] are [[PREFIX]],
[[POSTFIX]], \mbox{[[INFIX LEFT]]}, \mbox{[[INFIX RIGHT]]}, and \mbox{[[INFIX NONASSOC]]}.
Having prefix and postfix operators means that precedence alone no
longer determines
the correct parse, because the order of two prefix or of two postfix
operators overrides precedence.
For example, if [[++]] and [[*]] are prefix operators and [[p]]
is an atom, then no matter what the precedences of [[++]]
and [[*]],
[[++*p]] and [[*++p]] are both
correct and unambiguous, equivalent to [[++(*p)]] and
[[*(++p)]], respectively.
Infix operators of arbitrary arity broaden our
view of non-associative operators.
For example, in both ML and C, the following three expressions are all
legal, but all
different:
\begin{quote}
$[[f((a, b), c)]] \ne [[f(a, b, c)]] \ne [[f(a, (b, c))]]$
\end{quote}
The comma operator is an
$n$-ary, infix, non-associative operator.
@
Operators now have a concrete representation, a precedence, and a fixity.
<>=
type precedence = Atom.Precedence.precedence
type rator = Atom.atom * precedence * Assoc.fixity
@ In expressions,
operators may be unary, binary, or $n$-ary.
<>=
datatype ast = ATOM of Atom.atom
| UNARY of rator * ast
| BINARY of ast * rator * ast
| NARY of rator * ast list
@ List types are built in to ML.
Type [['a list]] means ``a list of values of type~[['a]],'' for any
type~[['a]].
It is convenient to treat \texttt{NARY($\oplus$, [e])} as
equivalent to \texttt{e} for any~$\oplus$; [[[e]]] is the singleton
list containing~[[e]].
@
In this new implementation, atomic components of
expressions are no longer limited to strings, but may be any values of
any type.
That type is given the name
[[Atom.atom]], but it is not otherwise specified.
The user of the unparser can pick any convenient type, e.g.,
prettyprinter directives.
The structure [[Atom]] is a
parameter to the unparser module.
It must
provide the following types and values:
<>=
type atom (* representation of operator or atomic expression *)
val parenthesize : atom list -> atom
@ The unparser uses [[Atom.parenthesize]] to parenthesize a list of atoms.
@
The chunk [[<>]] exploits a feature
of the Standard~ML modules system; in a \emph{signature}, we can name
types and values without giving their definitions.
The unparser module is parameterized by a module called [[Atom]], and
it works with \emph{any} [[Atom]] that provides a type~[[atom]], a
function [[parenthesize]], and the other values listed in
[[<>]].
This parameterization makes the unparser general and easy to reuse.
@
To make this general capability as useful as possible,
we make it possible to have the unparser add ``decorations'' to
list of atoms.
We do so by adding another kind of expression.
<>=
| DECORATE of (Atom.atom list -> Atom.atom) * ast
@ The [[DECORATE]] constructor attaches a ``decoration function'' to an expression.
% , which has the type \mbox{[[Atom.atom list -> Atom.atom]]},
The decoration function must transform a list of atoms into a single
atom in a way that is indetectible by the parser.
For example, a decoration function might add values of type
[[Atom.atom]] that indicate
changes in indentation, color, or font of
the unparsed text.
These values would appear to the parser as white space.
@
Finally, the user of the unparser must supply minimum and
maximum precedences, which are used to define the sentinels [[minrator]]
and [[maxrator]].
<>=
structure Precedence : sig
type precedence
val min : precedence (* precedence below that of any operator *)
val max : precedence (* precedence above that of any operator *)
val compare : precedence * precedence -> order
end
val bogus : atom (* bogus atom used to make sentinel operators *)
@ In many applications, [[minprec]] and [[maxprec]] can be computed
automatically.
@
The datatype [[ast]] and the structure [[Atom]] form the external
interfaces to the unparser.
In the implementation, images are more general---operators and atomic
terms no longer alternate, but they are
still properly parenthesized:
\begin{production}{image}
\sequence{\term{operator} |\ \nt{lexical-atom} |\ \nt{decorated-image}}
\end{production}
\productionglue
\begin{production}{lexical-atom}
\term{atom} \vbar\ \lit( \nt{image} \lit)\end{production}
The corresponding ML types are:
% these types really shouldn't be exported, but this may be temporary
% (to compare parsers)
<>=
datatype image = IMAGE of lexeme list
and lexeme = RATOR of rator
| EXP of lexical_atom
| DECORATED of (Atom.atom list -> Atom.atom) * image
and lexical_atom = LEX_ATOM of Atom.atom
| PARENS of image
@ where [[lexeme]] corresponds to
\alternate{\term{operator} |\ \nt{lexical-atom} |\ \nt{decorated-image}}.
@
Type [[image]] is internal;
the unparser returns a list of atoms.
It uses [[flatten]] to convert an [[image]] into such a list.
It uses [[Atom.parenthesize]] for parenthesization.
\addbox{image : image -> Atom.atom list}
\addbox{lexeme : lexeme -> Atom.atom}
\addbox{lexical_atom : lexical_atom -> Atom.atom}
\addbox{flatten : image -> Atom.atom list}
<>=
local
fun image (IMAGE l) = map lexeme l
and lexeme (RATOR (atom, _, _)) = atom
| lexeme (EXP a) = lexical_atom a
| lexeme (DECORATED (f, im)) = f (image im)
and lexical_atom (LEX_ATOM atom) = atom
| lexical_atom (PARENS im) = Atom.parenthesize (image im)
in
val flatten = image
(*unboxval*)
end
@ Each local function computes an atom or list of atoms that
corresponds to a value of one type.\label{corr:flatten}
By programming convention, the name of each function is the name of
the type it operates on.
Because [[local]] hides these functions, [[flatten]] is the only name visible
outside this code chunk.
@
\subsection{Parenthesizing general expressions}
As in the simpler case, development of an unparser begins
with the invariants that apply to
an abstract syntax tree produced by parsing an image without
parentheses.
There are now three cases to consider: an operator may be the left child
of a binary infix operator, the right child of a binary infix
operator, or the only child of a unary prefix or postfix operator.
\begin{center}\tt
\setlength{\unitlength}{0.05in}%
\begin{picture}(14,19)(47,-32)
\put(55,-15){\makebox(0,0)[b]{$\op$}}
\put(50,-30){\makebox(0,0)[b]{\[\op_l\]}}
\drawline(54,-16)(50,-26)
\dashline{1}(56,-16)(60,-26)
\put(55,-34){\makebox(0,0)[b]{\small\textrm{(a)}}}
\end{picture}%
\qquad\qquad
\begin{picture}(14,19)(47,-32)
\put(55,-15){\makebox(0,0)[b]{$\op$}}
\put(61,-30){\makebox(0,0)[b]{\[\op_r\]}}
\put(55,-34){\makebox(0,0)[b]{\small\textrm{(b)}}}
\dashline{1}(54,-16)(50,-26)
\drawline(56,-16)(60,-26)
\end{picture}%
\qquad\qquad
\begin{picture}(14,19)(47,-32)
\put(55,-15){\makebox(0,0)[b]{$\op$}}
\put(55,-30){\makebox(0,0)[b]{\rlap{$\op'$}\phantom{$\op$}}} % was (55,-30)
\drawline(55,-16)(55,-26)
\put(55,-34){\makebox(0,0)[b]{\small\textrm{(c)}}}
\end{picture}%
\end{center}
As before, the rules for parenthesization are encapsulated in a
[[noparens]] function.
Because
these trees are obtained by parsing parenthesis-free syntax,
in case~(a), \[noparens(\op_l, \op, LEFT)\];
in case~(b), \[noparens(\op_r, \op, RIGHT)\]; and
in case~(c), \[noparens(\op', \op, NONASSOC)\].
\addbox{noparens : rator * rator * Assoc.associativity -> bool}
\deeperbox{2pt}
<>=
fun noparens(inner as (_, pi, fi), outer as (_, po, fo), side) =
case Atom.Precedence.compare (pi, po)
of GREATER => true (* (a), (b), or (c) *)
| pdiff =>
case (fi, side)
of (POSTFIX, LEFT) => true (* (a) *)
| (PREFIX, RIGHT) => true (* (b) *)
| (INFIX LEFT, LEFT) =>
pdiff = EQUAL andalso fo = INFIX LEFT (* (a) *)
| (INFIX RIGHT, RIGHT) =>
pdiff = EQUAL andalso fo = INFIX RIGHT (* (b) *)
| (_, NONASSOC) => fi = fo (* (c) *)
| _ => false
(*unboxval*)
@ These rules can be understood as follows:
\begin{itemize}
\item
If the inner operator has higher precedence, parentheses are not
needed.
\item
If a postfix operator appears to the left of an infix operator~(a), it
always applies to the preceding expression, and parentheses are not
needed.
Similarly when a prefix operator appears to the right of an infix
operator~(b).
\item
In case~(a), if $\op_l$ has lower precedence than~$\op$, then
parentheses are needed, but if the two have the same precedence, then
parentheses can be avoided, \emph{provided} both are left-associative.
A similar argument applies to case~(b) when both operators are
right-associative.
\item
Finally, in case~(c), if the precedence of $\op'$ is not greater than
the precedence of~$\op$, then parentheses can be avoided if and only
if both operators have the same fixity, i.e., both are prefix
operators or both are postfix operators.
This is the case in the example of [[++*p]] used to
introduce this section.
\end{itemize}
These rules for parenthesization, as embodied in the [[noparens]]
predicate, are the key to understanding both parsing and unparsing algorithms.
The [[noparens]] function is used in an unparser, which appears
below, and in a parser, which appears in Appendix~\ref{parser}.
@
As before, the unparser needs auxiliary functions to manipulate images.
Because an image is now simply a list of lexemes,
these functions are a bit simpler.
\addbox{image : Atom.atom -> image}
\addbox{parenthesize : image -> image}
\addbox{infixImage : image * rator * image -> image}
\addbox{prefixImage : rator * image -> image}
\addbox{postfixImage : image * rator -> image}
<>=
fun image a = IMAGE [EXP (LEX_ATOM a)]
fun parenthesize image = IMAGE [EXP (PARENS image)]
fun infixImage (IMAGE l, $, IMAGE r) = IMAGE (l @ RATOR $ :: r)
fun prefixImage ( $, IMAGE r) = IMAGE ( RATOR $ :: r)
fun postfixImage(IMAGE l, $ ) = IMAGE (l @ RATOR $ :: [])
(*unboxval*)
@ where [[@]], [[::]], and [[[]]] are list append, cons, and the empty
list, which are all predefined in ML.
@
We use the same [[bracket]] to parenthesize sub-expressions.
\addbox{bracket : fragment * Assoc.associativity * rator -> image}
\deeperbox{2pt}
<>=
type fragment = rator * image
fun bracket((inner, image), side, outer) =
if noparens(inner, outer, side) then image else parenthesize image
(*unboxval*)
@
The structure of the unparser is as before:
\addbox{unparse : ast -> Atom.atom list}
<>=
local
val maxrator = (Atom.bogus, Atom.Precedence.max, INFIX NONASSOC)
exception Impossible
<>
in
fun unparse e =
let val ($, im) = unparse' e
in flatten im
end
(*unboxval*)
end
@
The unparsing function itself has new cases: the unary operator,
the $n$-ary operator, and the decorated expression.
The unary operator appears before or after its operand, depending on
its fixity.
\addbox{unparse' : ast -> fragment}
<>=
fun unparse' (ATOM a) = (maxrator, image a)
| unparse' (BINARY(l, $, r)) =
let val l' = bracket (unparse' l, LEFT, $)
val r' = bracket (unparse' r, RIGHT, $)
in ($, infixImage(l', $, r'))
end
| unparse' (UNARY($, e)) =
let val e' = bracket (unparse' e, NONASSOC, $)
val (_, _, fixity) = $
in ($, case fixity of PREFIX => prefixImage ($, e')
| POSTFIX => postfixImage(e', $)
| INFIX _ => raise Impossible)
end
| unparse' (NARY(_, [])) = raise Impossible
| unparse' (NARY($, [e])) = unparse' e
| unparse' (NARY($, e::es)) =
let val leftmost = bracket(unparse' e, LEFT, $)
fun addOne(r, leftPrefix) =
infixImage(leftPrefix, $,
bracket(unparse' r, RIGHT, $))
in ($, foldl addOne leftmost es)
end
| unparse' (DECORATE (f, e)) =
let val ($, im) = unparse' e
in ($, IMAGE [DECORATED (f, im)])
end
(*unboxval*)
@ The interesting case for the $n$-ary operator is covered by the
pattern [[NARY($, e::es)]],
which represents the operator~[[$]] applied to more than one operand.
The code inserts~[[$]] between the operands.
The first operand,~[[e]], is used to make [[leftmost]], an initial
``left prefix.''
The [[addOne]] function extends the left prefix, adding operator and
operand on the right.
The built-in [[foldl]] function walks the list~[[es]], using
[[addOne]] to extend the left prefix with
the remaining operands.
Because both [[LEFT]] and [[RIGHT]] are used in the arguments to
[[bracket]], operator must be non-associative for the operands to be
parenthesized properly.
Of course, no other associativity makes sense, because a left- or
right-associative operator can always be treated as an infix binary
operator.
The $n$-ary case is needed only for non-associative operators.
@
\section{Using the unparser}
Before the unparser can be used, it has to be instantiated with a
suitable [[Atom]] module.
Once the user creates an [[Atom]] module, he or she combines it with
the unparser's implementation to create [[Unparser]], an instance of
the unparser
that is specialized to that particular [[Atom]].\label{corr:Unparser}
This instance provides the datatype [[ast]], with its
contructors [[ATOM]], [[UNARY]], etc.
It also provides an unparsing function.
<>=
val unparse : ast -> Atom.atom list
@
Once the unparser is instantiated, it can be used in a real tool, where
unparsing is usually one of a series of transformations.
In a typical application generator, a program being generated passes
through several representations:
\begin{itemize}
\item an application-specific internal representation,
\item an unparsing tree of type [[Unparser.ast]],
\item
a list of atoms of type [[Atom.atom list]], which typically
includes prettyprinting directives as well as text, and
\item
the final program, represented as nicely indented text in the target
language.
\end{itemize}
This section discusses transformation from an internal representation
into an unparsing tree, using examples from
the New Jersey Machine-Code Toolkit.\cite{ramsey:jersey}
The first part of this paper discusses
transformation from an
unparsing tree to a list of atoms.
References \refno{oppen:prettyprinting}~and~\refno{hughes:pretty-printing}
discuss transformation of
prettyprinting directives to indented text.
%% uses the
%% unparser to emit C~code.\remark{What is [[Unparser]], [[unparse]]? Sig?}
%% Most of the expressions in the C~code come from solving equations
%% \cite{ramsey:solver}.
%% Internally, the toolkit represents expressions using a different
%% ML~constructor for each operator; this representation facilitates
%% algebraic simplification.
%% Emitting C~code therefore takes three steps: transforming the internal
%% representation of expressions to the simpler representation used by
%% the unparser, running the unparser to get input for a prettyprinter,
%% and finally running the prettyprinter to get C~source code.
%% The prettyprinter uses a model like that of \citeN{pugh:pretty}, but it
%% uses a dynamic-programming algorithm to convert to text.
A preliminary step in creating unparsing trees is assigning precedence and
associativity to the operators of the target language.
This can be done easily and reliably by putting
operators in a data structure that indicates precedence implicitly and
fixity (which includes associativity) explicitly.
A fragment of the structure used for~C is
\begin{alltt}
\vdots
(L, [">>", "@<<"]),
(L, ["+", "-"]),
(L, ["%", "/", "*"]),
\vdots
\end{alltt}
where [[L]] is an abbreviation for [[Assoc.INFIX Assoc.LEFT]],
``infix left'' being one of the five possible fixities.
Precedence increases as we move down the structure.
This data structure is transformed into functions [[Cprec]] and
[[Cfixity]] that return the precedence and fixity of given
operators.
It is also used
to compute the values of [[minprec]] and [[maxprec]], which are needed
to create an unparser.
%% These functions are used in turn to define functions that create
%% values of type [[Unparser.ast]], e.g.,
%% \begin{alltt}\hfuzz=23pt
%% fun binary $ =
%% let val optext = pptext (" " ^ $ ^ " ")
%% in fn (l, r) => Unparser.BINARY(l, (optext, Cprec $, Cfixity $), r)
%% end
%% val />>/ = binary ">>"
%% \vdots
%% \end{alltt}
%% where [[pptext]] turns a string into input for the prettyprinter.
Supplying [[minprec]] and [[maxprec]] to the unparser may seem
awkward.\label{corr:minprec}
One could supply instead a list of all operators to be used, letting
the unparser compute [[minprec]] and [[maxprec]].
This technique seems unnecessarily restrictive; it would limit the
unparser not only to a single kind of atom, but also to a single set
of operators.
As written, one unparser can be (and is) used to
emit code in different target languages, using different operator
sets.
Another way to eliminate [[minprec]] and [[maxprec]] would be to use a
different representation of
precedence inside the unparser, e.g.,
to represent precedence by an ML datatype that included values
$+\infty$ and $-\infty$ in addition to all the integers.
In a production environment, eliminating [[minprec]] and [[maxprec]]
from the interface would justify a more complex
implementation.
For expository purposes, however, another user-defined
datatype and special comparison functions would distract
readers from the explanation of unparsing.
%%
%%
%% Functions like [[/>>/]] can be declared infix in~ML; by careful use of
%% such declarations, we can generate C~code by using ML~code that is
%% reminiscent of C~code.
%% For example, this code fragment transforms the toolkit's internal
%% representation of a range test into the [[Unparser.ast]]
%% representation of the ultimate C~code.
%% The internal form [[U.INRANGE(e, {lo, hi})]] represents the predicate
%% \mbox{$\mathtt{lo} \le \mathtt{e} < \mathtt{hi}$}, in which
%% [[e]] is an arbitrary expression, and [[lo]] and [[hi]] are integers
%% known at transformation time.
%% \vskip 0pt plus15pt\penalty-20\vskip0pt plus -15pt
%% >=
%% fun exp(U.INRANGE(e, {lo, hi})) =
%% if lo + 1 = hi then exp e /==/ ppcon lo
%% else ppcon lo /<=/ exp e /&&/ exp e / ppcon hi
%% @ where [[ppcon]] converts an integer to a fragment of prettyprinter input,
%% which is of the proper type [[Atom.atom]].\remark{builds ast}
The fixities of operators can be surprising.
For example, the C operators [[.]]~and~[[->]], which are used to
select members from structures and unions, are superficially binary
infix operators, but their right-hand ``arguments'' are not
expressions but identifiers, and semantically they are unary
operators.
The Toolkit in fact treats them as unary postfix operators, choosing
their representations dynamically according to what member is being
selected.
For example, to create an unparsing tree corresponding to the
selection of member [[tag]] from structure~[[e]], it uses the function
\addbox{dot : Unparser.ast * string -> Unparser.ast}
<>=
fun dot (e, tag) =
Unparser.UNARY((pptext ("." ^ tag), Cprec ".", Assoc.POSTFIX), e)
@ Here the ML~[[^]] operator is string concatenation.
The
[[pptext]] function takes a string and converts it to an
[[Atom.atom]], making it intelligible to the prettyprinter.
@
Handling mixfix operators may involve unparsing some trees during the
construction of other trees.
For example, to translate an array-indexing expression into~C,
one must put the subscript in
square brackets.
The subscript is unparsed independently and turned into an atom by
applying [[pplist]].
It is then wrapped in square brackets, and
the indexing operation itself is represented by the empty string, but
with proper precedence and associativity, which, for
example, enable the unparser to produce either [[*p[n]]] or
[[(*p)[n]]] as required.
A similar trick is used to unparse function calls after applying an
$n$-ary comma operator to the arguments.
\addbox{subscript : Unparser.ast * Unparser.ast -> Unparser.ast}
<>=
fun subscript(array, index) =
let val index = pplist (Unparser.unparse index)
val index = pplist [pptext "[", index, pptext "]"]
in Unparser.BINARY ( array
, (pptext "", Cprec "subscript", L)
, Unparser.ATOM index
)
end
(*unboxval*)
@ [[pplist]] concatenates atoms;
it is also used to define [[Atom.parenthesize]]:
\begin{verbatim}
fun parenthesize l = pplist [pptext "(", pplist l, pptext ")"]
\end{verbatim}
\section{Use and availability}
The unparser described in this paper is used in two application
generators.\cite{ramsey:embedded,ramsey:jersey}
Because the paper is a literate program, the code used in the
generators is extracted directly from the source text of the paper,
and it is available from the
author's Web site, \verb+http://www.cs.virginia.edu/~nr+.
Practitioners who can't use this code directly can build unparsers
by implementing the [[noparens]] predicate and writing [[unparse]] as
a bottom-up tree walk.
Appendix~\ref{parser} of this paper presents the inverse of the
general unparser, i.e., a parser that handles
prefix and postfix as well as infix operators, all of dynamically
chosen precedence and associativity.
This parser, which is used in one of the application generators, is
also available online.
Appendix~\ref{app:proof} presents the full proof of correctness of the
simple unparser.
\acknowledgements
This work was supported in part by NSF grant ASC-9612756 and
by the US Department of Defense under contract MDA904-97-C-0247 and DARPA
order number E381.
The views expressed are my own and should not be taken to
be those of the DoD.
I thank Referee~A for demanding improvements in presentation.
\appendix
\def\nwdocspar{\vskip0pt plus3in\penalty-800\vskip0pt plus -3in}
\section{A parser for general expressions}
\label{parser}
The parser shown here is the inverse of the unparser in the body of
the paper.
It handles prefix, postfix, and infix operators.
It also supports concrete syntax in which concatenation of two
expressions signifies the application of a ``juxtaposition''
operator to the two expressions.
Like other binary operators, juxtaposition has
precedence and associativity.
For example, in Standard ML, juxtaposition denotes function
application, it associates to the left, and it has a precedence higher
than that of any explicit infix operator.\cite{milner:definition}
In awk, juxtaposition denotes string concatenation, and it has a
precedence higher than the comparison operators but lower than the
arithmetic operators.\cite{aho:awk}
The parser handles juxtaposition by inserting an implicit
``juxtaposition operator'' whenever two expressions are juxtaposed in
the input.
(The unparser handles juxtaposition by using a binary infix
operator whose representation is white space.)
The user supplies a specification of this operator as part of the
structure [[Atom]].
The specification includes an associativity, not a fixity, because
juxtaposition is infix by definition.
<>=
val juxtapositionSpec : atom * Precedence.precedence * Assoc.associativity
@ Internally, the parser converts this specification into an infix
operator.
<>=
local
val (rep, prec, assoc) = Atom.juxtapositionSpec
in
val juxtarator = (rep, prec, Assoc.INFIX assoc)
end
@
The parser's main data structure is a stack of operators and
expressions satisfying these
invariants:
\begin{enumerate}
\item
No postfix operator appears on the stack.
\item
If a binary operator is on the stack, its left argument is below it.
\item
An expression on the stack is preceded by a list of prefix operators,
then either an infix operator or the bottom of the stack.
\end{enumerate}
These invariants are only partially enforced by the type system.
All operators have the same type, so
[[prefixop]] and [[infixop]] are only hints.
<>=
type infixop = rator
type prefixop = rator
datatype stack = BOT
| BIN of stack * prefixop list * ast * infixop
type stack_top = (prefixop list * ast)
@ The sentinel operator [[minrator]] appears on the
bottom of the stack:
\addbox{srator : stack -> rator}
<>=
val minrator = (Atom.bogus, Atom.Precedence.min, INFIX NONASSOC)
fun srator (BIN (_, _, _, $)) = $
| srator BOT = minrator
(*unboxval*)
@
The parser's input looks something like this:
{\hfuzz=35pt
\begin{production}{image}
\sequence{\term{prefix}}\nt{lexical-atom}\sequence{\term{postfix}}
\sequence{\optional{\term{infix}}
\sequence{\term{prefix}}\nt{lexical-atom}\sequence{\term{postfix}}}
\end{production}
\par}
In parsing,
we assume that we can distinguish prefix, postfix, and infix
operators.
If juxtaposition is forbidden, these restrictions can be relaxed
somewhat; it is then sufficient to be able to distinguish postfix from
infix operators.
The adjustments needed are left as an exercise.
[[isPrefix]] identifies prefix operators.
\addbox{isPrefix : rator -> bool}
<>=
fun isPrefix (_, _, f) = f = PREFIX
(*unboxval*)
@
The parser uses two functions, [[parsePrefix]] and [[parsePostfix]],
depending on whether it is scanning before or
after an atomic input.
The overall structure of the parser is
\addbox{parse : image -> ast}
<>=
exception ParseError of string * rator list
local
exception Impossible
<>
in
val parse = parse
(*unboxval*)
end
@ There are many cases.
[[parsePrefix]] maintains a list of prefix operators, which is saves
until it reaches an atomic
input.
It is not safe to reduce the prefix operators, because the atomic
input might be followed by a postfix operator of higher precedence.
{\hfuzz=4pt
If [[parsePrefix]] encounters a non-prefix operator or end of file,
the input is not correct.
When [[parsePrefix]] encounters an atomic input, it passes the input and
the saved prefixes to [[parsePostfix]].
\addbox{parsePrefix : stack * prefixop list * lexeme list -> ast}
<>=
fun parsePrefix(stack, prefixes, RATOR $ :: ipts') =
if isPrefix $ then
parsePrefix(stack, $ :: prefixes, ipts')
else
raise ParseError("%s is not a prefix operator", [$])
| parsePrefix(_, _, []) =
raise ParseError ("premature end of expression", [])
| parsePrefix(stack, prefixes, EXP a :: ipts') =
parsePostfix(stack, (parseAtom a, prefixes), ipts')
| parsePrefix(stack, prefixes, DECORATED (_, IMAGE l) :: ipts') =
parsePrefix(stack, prefixes, l @ ipts')
@ If it encounters decorated lexemes, [[parsePrefix]] ignores the decoration.
\par
}
@
In addition to the stack and the input, [[parsePostfix]] maintains a
current expression~[[e]], and a list [[prefixes]] containing unreduced
prefix
operators that immediately precede~[[e]].
The idea is that operators arriving in the input must be
tested either against the nearest unreduced prefix operator, or
if there are no unreduced prefix operators, against the operator on
the top of the stack.
In the first case, the nearest unreduced prefix operator is
called~[[$]], and it may be compared with an operator [[irator]] from
the input.
If~[[$]] has higher precedence, it is reduced, by using [[UNARY]] to
apply it to the current expression~[[e]], and it becomes the new
expression.
Otherwise, the parser's behavior depends on the fixity of [[irator]];
it may reduce [[irator]], shift it, or insert [[juxtarator]].
\addbox{parsePostfix : stack * (ast * prefixop list) * lexeme list -> ast}
<>=
and parsePostfix(stack, (e, $ :: prefixes),
ipts as RATOR (irator as (_, _, ifixity)) :: ipts') =
if noparens($, irator, NONASSOC) then (* reduce prefix $ *)
parsePostfix(stack, (UNARY($, e), prefixes), ipts)
else if noparens(irator, $, NONASSOC) then
(* irator has higher precedence *)
case ifixity
of POSTFIX => (* reduce postfix rator *)
parsePostfix(stack,
(UNARY(irator, e), $ :: prefixes),
ipts')
| INFIX _ => (* shift, look for prefixes *)
parsePrefix(BIN(stack, $ :: prefixes, e, irator),
[], ipts')
| PREFIX => <>
else
raise ParseError
("can't parse (%s e %s); " ^
"operators have equal precedence", [$, irator])
@ If [[parsePostfix]] encounters an atom in the input, it inserts
[[juxtarator]] no matter what the state of unreduced prefixes, since
consuming atoms is the purview of [[parsePrefix]].
<>=
| parsePostfix(stack, (e, prefixes), ipts as EXP _ :: _) =
<>
@ The insertion itself is straightforward.
<>=
parsePostfix(stack, (e, prefixes), RATOR juxtarator :: ipts)
@
% would prefer no paragraph, but bad page break
The other major case for [[parsePostfix]] occurs when there are no
more unreduced prefix operators.
In that case, comparison must be made with [[srator stack]], the
operator on top of the stack.
<>=
| parsePostfix(stack, (e, prefixes as []),
ipts as RATOR (irator as (_, _, ifixity)) :: ipts') =
if noparens(srator stack, irator, LEFT) then
(* reduce infix on stack *)
case stack
of BIN (stack', prefixes, e', $) =>
parsePostfix(stack', (BINARY(e', $, e), prefixes), ipts)
| BOT => raise Impossible (* BOT has lowest prec *)
else if noparens(irator, srator stack, RIGHT) then
case ifixity
of POSTFIX => (* reduce postfix rator *)
parsePostfix(stack, (UNARY(irator, e), []), ipts')
| INFIX _ => (* shift, look for prefixes *)
parsePrefix(BIN(stack, [], e, irator), [], ipts')
| PREFIX => <>
else
raise ParseError ("%s is non-associative", [irator])
@
As above, decorations are ignored.
<>=
| parsePostfix (stack, (e, prefixes),
DECORATED (_, IMAGE l) :: ipts') =
parsePostfix(stack, (e, prefixes), l @ ipts')
@
When the input is exhausted, the parser reduces first the prefixes, then the stack,
until it reaches the result.
<>=
| parsePostfix(stack, (e, $ :: prefixes), []) =
(* reduce prefix $ *)
parsePostfix(stack, (UNARY($, e), prefixes), [])
| parsePostfix(BIN (stack', prefixes, e', $), (e, []), []) =
(* reduce stack *)
parsePostfix(stack', (BINARY(e', $, e), prefixes), [])
| parsePostfix(BOT, (e, []), []) = e
@
We complete the parser with functions that parse an entire input, or an
input in parentheses.
\addbox{parse : image -> ast}
\addbox{parseAtom : lexical_atom -> ast}
\deeperbox{2pt}
<>=
and parse(IMAGE(l)) = parsePrefix(BOT, [], l)
and parseAtom (LEX_ATOM a) = ATOM a
| parseAtom (PARENS im) = parse im
(*unboxval*)
@
\section{Proofs of correctness}
\label{app:proof}
\shortprooffalse
\proofstext
\bibliographystyle{spe}
%\vskip 0pt plus 1in\penalty-600\vskip 0pt plus-1in
\bibliography{cs,ml,ramsey,web}
\ifx\nocover\undefined
\cleardoublepage
\thispagestyle{realempty}
\centerline{\large\textbf{Unparsing Expressions --- Details of Corrections}}
\bigskip
\newcommand\corr[1]{page~\pageref{corr:#1}}
\newcommand\Corr[1]{Page~\pageref{corr:#1}}
\newcommand\shrinkit{\itemsep=0.2\itemsep \parsep=0.2\parsep}
\renewcommand\shrinkit{\relax}
\noindent
Referee~A:
\begin{enumerate}
\shrinkit
\item
More examples appear early, especially on \corr{trees}.
\item
The relationship with program inversion is discussed on
\corr{inversion}.
\item
The proofs have been rewritten in a top-down, goal-directed,
calculational style.
Because proofs written in this style are substantially longer than the
original
proofs, I have moved them to an
appendix. The main body of the paper sketches the proofs.
\item
Code has been clarified, and the types of all functions are now
given explicitly. These types are checked by the compiler.
\item
On \corr{inverse+}, the specification of unparsing has been completed as
requested.
\item
I have added examples to section~2 as requested.
\item
To reduce confusion, I have delayed the introduction of the
existential type [[atom]] to section~4, where it is properly
explained.
\item
The propositions and proofs have been completely rewritten in the
style requested by the referee.
The Boolean formula in [[noparens]] is now consistent with
Definition~\ref{bind-defn} and with Lemma~\ref{bracket-lemma}.
\item
On \corr{inversion}, the paper now explains why the proofs are not
shorter and more elegant. It also relates the work to program inversion.
\item
Section~4 is revised along the lines requested, except
that mention of [[juxtarator]] is moved to the appendix on parsing,
because [[juxtarator]] is not needed for unparsing.
\item
\Corr{minprec} explains why I have supplied [[minprec]] and
[[maxprec]] explicitly.
\item
\Corr{flatten} explains the mystery of the two [[image]]s.
\item
In the definition of [[unparse']], I have not eliminated
[[infixImage]]; instead, for consistency, I have added functions
[[prefixImage]] and
[[postfixImage]].
These are not useless extra declarations; they abstract away from the
representation of images, and they make the code easier to read for
those who are not familiar with~ML.
\item
The error handling in [[unparse']] is now consistent.
\item
The abbreviation [[L = Assoc.INFIX Assoc.LEFT]] is better explained.
\item
The structure [[Unparser]] is introduced and explained on
\corr{Unparser}.
\item
I have reduced the confusing transformations in Section~5.
\global\edef\lastcorrection{\arabic{enumi}}
\end{enumerate}
Referee~B:
\begin{enumerate}
\shrinkit
\setcounter{enumi}{\lastcorrection}
\item
Each new ML construct is now explained immediately after its
introduction.
I have also added some brief discussion of how crucial parts would
be different in an imperative implementation.
\end{enumerate}
\fi
\end{document}
<>=
val minprec = ~1
val maxprec = 101
@
<>=
structure InfixUnparse = struct
<>
<>
infix 7 **
infix 6 ++
infix 4 ==
infixr 3 ::==
val mul = ("*", 7, LEFT)
val add = ("+", 6, LEFT)
val eq = ("=", 4, NONASSOC)
val gets = (":=", 3, RIGHT)
fun apply $ (l, r) = APP (l, $, r)
val op ** = apply mul
val op ++ = apply add
val op == = apply eq
val op ::== = apply gets
fun imstring (IMAGE(a, tail)) = astring a ^ imstring' tail
and imstring' EOI = ""
| imstring' (INFIX ((t, _, _), a, tail)) = " " ^ t ^ " " ^ astring a ^ imstring' tail
and astring (LEX_ATOM a) = a
| astring (PARENS im) = "(" ^ imstring im ^ ")"
val x = ATOM "x"
val y = ATOM "y"
val z = ATOM "z"
fun E (n:int) = ATOM (Int.toString n)
val junk = imstring (unparse (z ::== y ::== x ++ y ** z ++ E 2 == E 99 == x))
val junk' = imstring (unparse (z ::== y ::== (x ++ y) ** (z ++ E 2) == E 99 == x))
val left = imstring (unparse ( (x ++ y) ++ z ))
val right = imstring (unparse ( x ++ (y ++ z )))
val left' = imstring (unparse ( (x ::== y) ::== z ))
val right' = imstring (unparse ( x ::== (y ::== z )))
val left'' = imstring (unparse ( (x == y) == z ))
val right'' = imstring (unparse ( x == (y == z )))
end
@
<>=
signature UNPARSEABLE_ATOM = sig
<>
end
signature UNPARSEABLE_PRECEDENCE = sig
type precedence
val min : precedence (* precedence below that of any operator *)
val max : precedence (* precedence above that of any operator *)
val compare : precedence * precedence -> order
end
signature UNPARSER = sig
structure Atom : UNPARSEABLE_ATOM
<>
<>
<>
end
@
<>=
functor UnparserFun(Atom: UNPARSEABLE_ATOM) = struct
structure Atom = Atom
open Assoc
<>
<>
<>
<>
end
@
I really bend over backwards, including paying a run-time cost, to use
this very concrete parser with more abstract streams.
<>=
signature EXP_STREAM = sig
type operator
val juxtarator : operator
val fixity : operator -> Assoc.fixity
structure Precedence : UNPARSEABLE_PRECEDENCE
val precedence : operator -> Precedence.precedence
type exp
val unary : operator * exp -> exp
val binary : exp * operator * exp -> exp
val nary : operator * exp list -> exp
datatype token = RATOR of operator | EXP of exp
end
<>=
functor ParserFun(structure ExpStream : EXP_STREAM) = struct
structure ES = ExpStream
exception Impossible
structure A = struct (* used for types, never for unparsing *)
datatype atom = RATOR of ES.operator
| EXP of ES.exp
fun parenthesize l = raise Impossible
structure Precedence = ExpStream.Precedence
val bogus = RATOR (ES.juxtarator)
val juxAtom = bogus
val juxPrec = ES.precedence ES.juxtarator
val juxAssoc = case ES.fixity ES.juxtarator
of Assoc.INFIX a => a
| _ => raise Impossible
val juxtapositionSpec = (juxAtom, juxPrec, juxAssoc)
end
structure U = UnparserFun(A)
structure P = struct
open U
open Assoc
<>
end
exception ParseError of string * ES.operator list
fun maptoken (ES.RATOR $) =
P.RATOR (A.RATOR $, ES.precedence $, ES.fixity $)
| maptoken (ES.EXP e) = P.EXP (P.LEX_ATOM (A.EXP e))
fun unrator (A.RATOR $, _, _) = $
| unrator _ = raise ParseError ("This can't happen --- operator is an exp", [])
fun unwrap (P.ATOM (A.EXP e)) = e
| unwrap (P.ATOM _) =
raise ParseError ("This can't happen --- exp is an operator", [])
| unwrap (P.UNARY ($, e)) = ES.unary (unrator $, unwrap e)
| unwrap (P.BINARY (e1, $, e2)) = ES.binary(unwrap e1, unrator $, unwrap e2)
| unwrap (P.NARY ($, l)) = ES.nary (unrator$, map unwrap l)
| unwrap (P.DECORATE (_, e)) = unwrap e
fun parse l =
(unwrap o P.parse o P.IMAGE o map maptoken) l
handle P.ParseError (s, l) => raise ParseError (s, map unrator l)
end
<>=
structure TextAtom = struct
type atom = string
fun parenthesize l = String.concat ("(" :: l) ^ ")"
structure Precedence = struct
type precedence = int
val min = ~1
val max = 101
val compare = Int.compare
end
val bogus = "bogus sentinel operator"
(* juxtaposition is an implicit call *)
val juxAtom = " "
val juxPrec = Precedence.max - 1
val juxAssoc = Assoc.LEFT
val juxtapositionSpec = (juxAtom, juxPrec, juxAssoc)
end
structure UnparseTest = struct
structure U = UnparserFun(TextAtom)
local
structure Unparser = U
fun pptext t = t
val pplist = String.concat
val L = Assoc.INFIX Assoc.LEFT
fun Cprec (x : string) = 99
in
<>
end
open U
open Atom
infix 7 **
infix 6 ++
infix 4 ==
infixr 3 ::==
val mul = ("*", 7, INFIX LEFT)
val add = ("+", 6, INFIX LEFT)
val eq = ("=", 4, INFIX NONASSOC)
val gets = (":=", 3, INFIX RIGHT)
val ince = ("++", 9, PREFIX)
val inco = ("++", 9, POSTFIX)
val dece = ("--", 9, PREFIX)
val deco = ("--", 9, POSTFIX)
val star = ("*", 8, PREFIX)
val tuple = ("*", 1, INFIX NONASSOC)
fun binary $ (l, r) = BINARY (l, $, r)
fun unary $ e = UNARY($, e)
fun nary $ es = NARY($, es)
val op ** = binary mul
val op ++ = binary add
val op == = binary eq
val op ::== = binary gets
val dece = unary dece
val deco = unary deco
val star = unary star
val /*/ = nary tuple
val imstring = String.concat
val x = ATOM "x"
val y = ATOM "y"
val z = ATOM "z"
fun E n = ATOM (Int.toString n)
val junk = imstring (unparse (z ::== y ::== x ++ y ** z ++ E 2 == E 99 == x))
val junk' = imstring (unparse (z ::== y ::== (x ++ y) ** (z ++ E 2) == E 99 == x))
val left = imstring (unparse ( (x ++ y) ++ z ))
val right = imstring (unparse ( x ++ (y ++ z )))
val left' = imstring (unparse ( (x ::== y) ::== z ))
val right' = imstring (unparse ( x ::== (y ::== z )))
val left'' = imstring (unparse ( (x == y) == z ))
val right'' = imstring (unparse ( x == (y == z )))
val pp = imstring (unparse (E 2 ++ star (deco x)))
val pp' = imstring (unparse (E 2 ++ deco (star x)))
val tuplel = imstring (unparse (/*/ [/*/ [x, y], z]))
val tupler = imstring (unparse (/*/ [x, /*/ [y, z]]))
val tuplen = imstring (unparse (/*/ [x, y, z]))
end
@
\section{Index}
\subsection{Chunks}
\nowebchunks
\subsection{Identifiers}
\nowebindex
@