%
%                             E T S E T
%
%                           by John Walker
%                      http://www.fourmilab.ch/
%
%   What's all this, you ask?  Well, this is a "literate program",
%   written in the CWEB language created by Donald E. Knuth and
%   Silvio Levy.  This file includes both the C source code for
%   the program and internal documentation in TeX.  Processing
%   this file with the CTANGLE utility produces the C source file,
%   while the CWEAVE program emits documentation in TeX.  The
%   current version of these programs may be downloaded from:
%
%       http://www-cs-faculty.stanford.edu/~knuth/cweb.html
%
%   where you will find additional information on literate
%   programming and examples of other programs written in this
%   manner.
%
%   If you don't want to wade through all these details, don't
%   worry; this distribution includes a .c file already
%   extracted and ready to compile.  If "make" complains that it
%   can't find "ctangle" or "cweave", just "touch *.c"
%   and re-make--apparently the process of extracting the files
%   from the archive messed up the date and time, misleading
%   make into believing it needed to rebuild those files.

%   How to talk about LaTeX without actually ever using it
\def\LaTeX{L\kern-.36em\raise.40ex\hbox{\sevenrm A}\kern-.15em\TeX}

% This verbatim mode assumes that ! marks are !! in the text being copied.
% Borrowed from the CWEB manual: cwebman.tex
\def\verbatim{\begingroup
  \def\do##1{\catcode`##1=12 } \dospecials
  \parskip 0pt \parindent 0pt \let\!=!
  \catcode`\ =13 \catcode`\^^M=13
  \tt \catcode`\!=0 \verbatimdefs \verbatimgobble}
{\catcode`\^^M=13{\catcode`\ =13\gdef\verbatimdefs{\def^^M{\ \par}\let =\ }} %
  \gdef\verbatimgobble#1^^M{}}

\def\CPP/{{\mc C++}}	% Macro for C++, like \CEE/ and \UNIX/
\def\breakOK{\penalty 0}

\def\vbar{\char124} 	% Macros for characters difficult to quote in certain contexts
\def\bslash{\char92}
\def\atsign{\char64}
\def\caret{\char94}
\def\uline{\char95}
\def\realspace{\char32}
  
@i cweb/c++lib.w

@** Introduction.

\vskip 15pt
\centerline{\ttitlefont ETSET}
\vskip 10pt
\centerline{\titlefont Typeset an Electronic Text}
\vskip 15pt
\centerline{\pdfURL{by John Walker}{http://www.fourmilab.ch/}}

\vskip 15pt
\centerline{This program is in the public domain.}

\vskip 30pt
@d PRODUCT "etset"
@d VERSION "3.2"
@d REVDATE "2006-05-20"

@*1 Command line.
\bigskip
\.{ETSET} is invoked with a command line as follows:
\medskip
\hskip 6em \.{etset} {\it options input\_file output\_file}
\medskip
\noindent where {\it options} specify processing modes as defined below
and are either long names beginning with two hyphens or single
letter abbreviations introduced by a single hyphen.  If no
{\it input\_file} is specified, or ``\.{-}'' is given for
its name, input is read from standard input.  Similarly,
output is written to standard output if {\it  output\_file}
is omitted or ``\.{-}'' is specified.  When generating
HTML, an {\it  output\_file} name {\it must} be specified;
it is the ``base name'' used to generate the various
HTML files making up the document tree, which are created
in the current directory.

@*1 Options.

Options are specified on the command line prior to the input and
output file names (if any).  Options may appear in any order.  Long
options beginning with ``\.{--}'' may be abbreviated to any
unambiguous prefix; single-letter options introduced by a single ``\.{-}''
may be aggregated.

\bigskip

\def\opt#1#2{\vbox{\noindent {\.{#1}}\par
    \vbox{\leftskip=8em\noindent#2}\smallskip}}

\opt{--ascii-only}{Check for the presence of any characters not part of the
    7-bit ASCII set (for example, accented letters belonging to the
    ISO 8859-1 set), and generate warning messages identifying
    them.}

\opt{--babel {\it lang}}{Use the \LaTeX\ \.{babel} package for language {\it lang}.}

\opt{--check}{Check text for publication.  Report any invalid characters
    or formatting errors to standard error.}
    
\opt{--clean}{Clean up text for publication: expand tab characters to
    spaces, remove trailing blanks from lines.}
    
\opt{--copyright}{Print copying information.}

\opt{--debug-parser {\it file}}{Write parser debugging information to
    {\it file}.  Each line in the body of the text is labeled with the
    identification assigned it by the parser.}
    
\opt{--dos-characters}{Translate MS-DOS Code Page 850 character set
    to ISO 8859-1 and remove carriage returns from the ends of lines.}
    
\opt{--flatten-iso}{ISO 8859-1 8-bit characters are replaced with
    their closest 7-bit ASCII equivalent (for example, accented letters
    are changed to unaccented characters).  This is a {\it destructive}
    transformation, and should be performed only when a text must be
    displayed on a device which cannot accept 8-bit characters.}
    
\opt{--french-punctuation}{Insert nonbreaking spaces around punctuation
    as normally done when typesetting French.  Guillemets, colons, semicolons,
    question marks, and exclamation points are set off from the adjoining
    text by a space.  This mode is unnecessary when typesetting
    French with the ``\.{\hbox{--babel\realspace francais}}'' option.}
    
\opt{--help{\rm, }-u}{Print how-to-call information including a
    list of options.}
    
\opt{--html{\rm, }-h}{Generate HTML output.  By default, a document tree
    is generated with an index document which links to individual
    chapter documents, each of which contains navigation links.  If the
    \.{--single-file} option is specified, a single HTML document
    containing the entire text is generated.  HTML files are written to
    the current directory.}
    
\opt{--latex{\rm, }-l}{Generate a \LaTeX\ file to typeset the document.  If
    the document is in a language other than English, you may also wish
    to use the \.{--babel} option to invoke formatting appropriate
    for the language.}
    
\opt{--palm{\rm, }-p}{Generate a file in Palm Markup Language to
    create a document for Palm Reader on handheld platforms.}
    
\opt{--save-epilogue {\it file}}{The document epilogue is written to the
    designated {\it file}.}
    
\opt{--save-prologue {\it file}}{The document prologue is written to the
    designated {\it file}.}
    
\opt{--single-file}{Generate a single HTML file containing all chapters,
    as opposed to the default of a document tree with a separate file for
    each chapter.}
    
\opt{--special-strip}{Remove all format-specific special commands from
    the document, and blank lines following a special command if they would
    result in consecutive blank lines in the document.  This option may
    be used in conjunction with the \.{--clean} option when preparing
    a text for publication in ``Plain ASCII'' format.}
    
\opt{--strict}{Generate HTML compatible with the XHTML 1.0 Strict Document
    Type Definition.  Note that this will make extensive use of Cascading
    Style Sheets (CSS) in the document, which may cause compatibility problems
    with older browsers which do not support or incorrectly implement
    style specifications.}
    
\opt{--unicode}{Generate XHTML Unicode text entities for characters
    (for example, opening and closing quotation marks and dashes)
    not present in the ISO-8859 Latin-1 character set.}
   
\opt{--verbose{\rm, }-v}{Print information regarding processing of the
    document, including the number of lines read and written.}
    
\opt{--version}{Print program version information.}

@*1 Input format.

@i etsetfmt.w

@** Program global context.

@c

@h

@<System include files@>@/
@<Program implementation@>@/

@
The following classes are defined and their
implementations provided.

@<Program implementation@>=
@<Global variables@>@/
@<Global functions@>@/
@<Class definitions@>@/
@<Main program@>@/

@
The following definitions describe the formatting of input body
copy.  Note that column numbers cited below assume the first column
of a line is 0.

@d FormatWidth 70               // Format width of original text
@d RaggedRightIndent 1	    	// Indentation for ragged right copy
@d PreformattedTableIndent 2	// Indentation for preformatted tables
@d QuoteIndent 4                // Indentation for block quotes
@d TitleMarkerCharacter '='	// Character identifying document title/author sequences
@d ChapterMarkerCharacter '-'	// Character identifying chapter number/title sequences
@d MarkerMinimumLength 3    	// Minimum length of title and chapter markers
@d SpecialMarker "<><><>"   	// Special text line marker (start and end of line)
@d SpecialPrefix (SpecialMarker @, "Special:") // Special text line prefix
@d PUNCTUATION ("?!:;" @, RIGHT_POINTING_DOUBLE_ANGLE_QUOTATION_MARK)
    	    	    	    	/* Punctuation set after a space in French
                                   text processed with the \.{-f} option */
@d Iabs(x)   (((x) < 0) ? (-(x)) : (x)) // Absolute value

@** Text processing components.

The |textComponent| class is the abstract superclass of all of the
text source, sink, and filter classes.  A source is simply a
filter whose input is not a component, and a sink a filter whose
output is not a component.

@<Class definitions@>=
/* |fTypeName| supplies a printable label for each |filterType|
   code.  It would more naturally be a |static| member of
   |textComponent|, but is kept at file scope instead. */

static const string fTypeName[4] = { "Undefined", "Source", "Filter", "Sink" };

class textComponent {
protected:@/
    textComponent *output;          // Component which receives what we emit
    textComponent *source;          // Component at the head of the pipeline
    int lineNumber;                 // Output line counter
    enum filterType { UndefinedType = 0, SourceType = 1, FilterType = 2, SinkType = 3 };
    filterType fType;               // Role of this component; indexes |fTypeName|
public:@/

    //  A new component is unconnected and of undefined type
    textComponent() : output(NULL), source(NULL), lineNumber(0), fType(UndefinedType) {
    }

    virtual ~textComponent() { }

    virtual string componentName(void) = 0;    // Name of the concrete component

    virtual void put(string s) = 0;     // Accept one line of input

    @<Connect components in pipeline@>;

    @<Emit output to next component in pipeline@>;

    @<Handle end of file notification@>;

    //  Head of the pipeline; the component must already be wired to one
    textComponent *getSource(void) {
        assert(source != NULL);
        return source;
    }

    //  Value of this component's own line counter
    int getLineNumber(void) {
        return lineNumber;
    }

    //  Line counter of the component at the head of the pipeline
    int getSourceLineNumber(void) {
        textComponent *head = getSource();

        return head->getLineNumber();
    }

    //  Write |msg| to |of|, prefixed with the source line number
    virtual void issueMessage(string msg, ostream &of = cerr) {
        int line = getSourceLineNumber();

        of << line << ": " << msg << "\n";
    }

    //  Write the component's type label and name to |of|
    virtual void writeDescription(ostream &of) {
        of << fTypeName[fType] << ": " << componentName() << "\n";
    }
};

@
Every text processing pipeline must start with a source
(|fType| of |SourceType|) and terminate with a sink
(|fType| of |SinkType|).  Any number of
filters (|fType| of |FilterType|) may be interposed
between these ends.  Successive components in a pipeline
are connected to one another by calling the |setOutput|
method of each component, starting with the source, in the
order in which they appear in the pipeline, giving the
next component as the argument.  The sink at the end of
the pipeline delivers the result to the ultimate output, so
|setOutput| is not called for it.

For example, suppose you have a three component pipeline
consisting of a source named \.{faucet}, a filter \.{strainer},
and a sink \.{sewer}.  To plumb these three components
into a pipeline, you could make the following function
calls:

\verbatim
!   	faucet.setOutput(strainer);
!   	strainer.setOutput(sewer);
!endgroup

We overload the \.{\char124} operator to allow connecting pipelines in
a less verbose fashion, one familiar to users of \UNIX/
shell commands.  Using this operator, the three component
pipeline can be connected with the single expression:

\verbatim
!   	faucet !char124 strainer !char124 sewer;
!endgroup

Each component in a pipeline contains a link back to the first
(|SourceType|) component.  The source component points to itself,
and the link to the start of the pipeline is propagated as each
additional component is added.  To obtain the source link, use
the |getSource| method of any component in the pipeline.  This
is frequently used when a downstream component wishes to label
a diagnostic message with the line number of the original
source line from which the text it's processing was derived.
This is needed so frequently, in fact, that the
|getSourceLineNumber| method is provided to directly obtain
this value.

@<Connect components in pipeline@>=

    //  Attach |ofilt| as the component which receives our output
    virtual void setOutput(textComponent &ofilt) {
        ofilt.source = source;          // Downstream component inherits our pipeline head
        output = &ofilt;
    }

    //  Pipe operator: connect |dest| after this component and return
    //  |dest| so that further components can be chained onto it
    textComponent & operator | (textComponent &dest) {
        this->setOutput(dest);
        return dest;
    }
@
Each component in the pipeline receives lines through its
|put| method (a source, having no upstream component, obtains them
from its own |get| method instead), performs whatever processing
is in order, then passes them down the pipe to the next component
with the |emit| method, which also keeps track of the number of
lines generated by this component.  For the |textSource|
component at the start of the pipeline, this automatically
counts lines in the input stream.

Output from a component is normally emitted to the next
component in the pipeline, designated by |output|, but
may be directed to another component by supplying a pointer
to it as the second argument.  This permits components to
have multiple outputs and hence forks in pipelines.  Note
that the |lineNumber| in the component counts {\it all}
lines emitted, regardless of the destination.

@<Emit output to next component in pipeline@>=   
    //  Pass line |s| to |destination|, or to |output| when none is given
    virtual void emit(string s, textComponent *destination = NULL) {
        textComponent *target = (destination != NULL) ? destination : output;

        if (target == NULL) {
            throw(invalid_argument("void destination in emit()"));
        }
        lineNumber++;                   // Counted whatever the destination
        target->put(s);
    }

@
When the source at the head of the pipeline reaches the end
of the input to be processed, it performs whatever end of
file processing is appropriate and passes an end of file
notification down the pipeline.  The default mechanism for
handling this is the |eof| method defined in the superclass,
which does nothing except forward the notification onward.

If a component needs to perform local cleanup at end of
file (for example, if it's buffered look ahead data which
needs to be flushed out), it should override the default
|eof| method with one which does whatever local processing
is needed, then calls |eof| in its parent class to pass the
notification down the pipe.

@<Handle end of file notification@>=
    virtual void eof(void) {
    	output->eof();
    }


@*1 Source components.

The head end of a filter pipeline must be a |textSource|.  It
obtains its input from some external source and passes it to the
next component in the pipeline.  A source drives the pipeline
when |send| is called; this reads successive lines from the
source and passes them to the next item in the pipeline.

@<Class definitions@>=
class textSource : public textComponent {
protected:@/
    virtual bool get(string &s) = 0;   // Get next string from source

public:@/
    textSource() {
    	fType = SourceType;
	source = this;	    	    // A source is its very own source, of course
    }

    void put(string s) {
        throw(invalid_argument("cannot put to a source"));
    }

    virtual void send(void) {	    // Send lines from source to next in chain
        string s;

        while (get(s)) {
            emit(s);
        }
	eof();	    	    	    // Notify downstream components of end of file
    }
};

@*2 Stream source.

The |streamSource| is a source which reads text lines from an
input stream.  The |setStripEOL| method may be used to set a mode which
causes MS-DOS carriage returns left on the ends of lines to be
removed.

@<Class definitions@>=
class streamSource : public textSource {
private:@/
    istream *i;
    bool strip;

protected:@/
    bool get(string &s) {
        return getline(*i, s);
    }

public:@/
    string componentName(void) {
        return "streamSource";
    }
    
    void openFile(string pathName)
    //	Bind an input file to the stream source
    {
    	if (pathName == "-") {
	    i = &cin;
	} else {
    	    i = new ifstream(pathName.c_str(), ios::in);
	    if (!(*i)) {
	    	throw(invalid_argument("Cannot open input file \"" + pathName + "\"."));
	    }
	}
    }

    streamSource(istream &is = cin)
    //	Construct a stream source from an existing input stream
    {
        i = &is;
	strip = false;
    }
    
    streamSource(string pathName)
    //	Construct a stream to read from specified |pathName|; ``\.{-}'' denotes standard input
    {
    	openFile(pathName);
	strip = false;
    }
    
    void setStripEOL(bool dostrip) {
    	strip = dostrip;
    }
    
    bool getStripEOL(void) {
    	return strip;
    }
    
    virtual void emit(string s, textComponent *destination = NULL) {
    	if (strip) {
	    if (s[s.length() - 1] == '\r') {
	    	s.erase(s.length() - 1, 1);
	    }
	}
	textSource::emit(s, destination);
    }
};

@*1 Sink components.

A |textSink| forms the tail of a filter pipeline.  It consumes lines
from the pipeline and writes them to the ultimate destination.

@<Class definitions@>=
class textSink : public textComponent {

public:@/
    textSink() {
        fType = SinkType;
    }

    //  A sink ends the pipeline, so nothing may be connected after it
    void setOutput(textComponent &ofilt) {
        throw(invalid_argument("cannot setOutput of a sink"));
    }

    //  The default |put| only counts the line; subclasses which write
    //  the text somewhere call it to keep the counter up to date
    virtual void put(string s)
    {
        ++lineNumber;
    }

    //  Nothing is downstream of a sink, so end of file is not forwarded
    virtual void eof(void)
    {
    }
};

@*2 Stream sink.

A |streamSink| writes output sent it to an output stream.  Two
constructors permit you to create a |streamSink| to write to an
already open |ostream| or to a specified file name or
standard output.

@<Class definitions@>=
class streamSink : public textSink {
private:@/
    ostream *o;                     // Stream to which lines are written
    bool closeStream;               // Did we allocate |o|, so we must delete it?

public:@/
    string componentName(void) {
        return "streamSink";
    }

    streamSink(ostream &os) : o(&os), closeStream(false)
    //  Construct a stream sink that writes to an existing output stream
    {
    }

    streamSink(string pathName) : o(NULL), closeStream(false)
    //  Construct a stream sink that writes to a named |pathName|; ``\.{-}'' denotes standard output
    {
        if (pathName == "-") {
            o = &cout;
        } else {
            o = new ofstream(pathName.c_str(), ios::out);
            if (!(*o)) {
                /* Fail loudly, as |streamSource| does for its input
                   file, instead of silently discarding everything
                   written to the sink.  The destructor does not run
                   when a constructor throws, so release the stream
                   here. */
                delete o;
                o = NULL;
                throw(invalid_argument("Cannot open output file \"" + pathName + "\"."));
            }
            closeStream = true;
        }
    }

    ~streamSink() {
        if (closeStream) {
            delete o;
        }
    }

    void put(string s) {
        /* |s| is passed by value, so its address is never |NULL|
           and no test for that is required. */
        *o << s << "\n";
        textSink::put(s);               // Call parent to update line counter
    }
};

@*2 Heat sink.

A |heatSink| discards all data sent to it.
Because the process of erasing its input is necessarily dissipative,
|heatSink| thermalises the information content it
receives, increasing the entropy of the universe.  See:
Bennett, C.H. ``The Thermodynamics of Computation---a Review''.
{\it Int.\ J. Theor.\ Phys.} {\bf 21:}905--940 (1982).

You can use |heatSink| as the final component in a pipeline
where the desired output is a side effect of an earlier
component, for example, the diagnostic messages produced by
|auditFilter|.  On a \UNIX/ system you could use |streamSink|
with a destination of \.{/dev/null} for this purpose, but
that will not work on other operating systems.

@<Class definitions@>=
class heatSink : public textSink {
public:@/
    string componentName(void)
    {
        return "heatSink";
    }

    //  Discard the line.  The parent |put| is not called, so the
    //  line counter does not advance.
    void put(string s)
    {
    }
};

@*1 Filter components.

Each filter receives its input through its |put| method and
delivers output to the next item in the pipeline by calling the
|put| method of its designated |output|.

@<Class definitions@>=
class textFilter : public textComponent {

public:@/
    //  Common base of the filters; all it does is mark the component type
    textFilter()
    {
        this->fType = FilterType;
    }
};

@*2 Trim filter.

A |trimFilter| removes any blank space from the end of strings
which pass through it.

@<Class definitions@>=
class trimFilter : public textFilter {
public:@/
    string componentName(void) {
        return "trimFilter";
    }

    void put(string s) {
        while (s.length() > 0 && isspace(*(s.end() - 1))) {
            s.erase(s.end() - 1);
        }
        emit(s);
    }
};

@*2 Tab expander filter.

A |tabExpanderFilter| replaces tab characters with spaces to
align to the specified |tabInterval|.  We assume tab stops
are set at uniform intervals.

@<Class definitions@>=
class tabExpanderFilter : public textFilter {
private:@/
    int tabInterval;

public:@/
    string componentName(void) {
        return "tabExpanderFilter";
    }

    tabExpanderFilter(int interval = 8) {
        setTabInterval(interval);
    }

    void setTabInterval(int interval) {
        tabInterval = interval;
    }

    void put(string s) {
        if (s.find('\t') != string::npos) {
	    @<Expand tabs in text line@>;
        }
assert(s.find('\t') == string::npos);
        emit(s);
    }
};

@
Given a string |s| which may contain horizontal tab characters,
replace the tabs with spaces to achieve the same alignment,
assuming tab stops are set every |tabInterval| columns.

@<Expand tabs in text line@>=
    string os;

    string::iterator p;
    int n = 0;

    for (p = s.begin(); p != s.end(); p++) {
	if (*p == '\t') {
            do {
        	os += ' ';
        	n++;
            } while ((n % tabInterval) != 0);
	} else {
            os += *p;
            n++;
	}
    }
    s = os;

@*2 Flatten ISO characters filter.

A |flattenISOCharactersFilter| replaces ISO-8859/1
characters with their closest 7-bit ASCII representation.
This butchers any text containing accented characters, but
if the user asks for it, ya gotta do what ya gotta do.

@<Class definitions@>=
class flattenISOCharactersFilter : public textFilter {

public:@/
    string componentName(void)
    {
        return "flattenISOCharactersFilter";
    }

    //  Replace the 8-bit characters in |s| and pass the result on
    void put(string s)
    {
        @<Flatten ISO 8859 characters to 7-bit ASCII@>;
        emit(s);
    }
};

@
Given a string |s| which may contain ISO-8859/1 characters
with codes between |0xA0|--|0xFF|, return a string with
all such characters replaced by the closest ASCII
equivalents.

@<Flatten ISO 8859 characters to 7-bit ASCII@>=
    string os;
    string::iterator p;
    int c;

    for (p = s.begin(); p != s.end(); p++) {
    	c = (*p) & 0xFF;
	if ((c >= 0xA0) && (c <= 0xFF)) {
    	    os += flattenISO[c - 0xA0];
	} else {
    	    os += c;
    	}
    }
    s = os;

@*2 Convert foreign character set to ISO filter.

A |convertForeignCharacterSetToISOFilter| converts characters
in a foreign character set to ISO 8859-1.  It is driven by a
conversion table provided when the filter is instantiated
or set by the |setConversionTable| method.

@<Class definitions@>=
class convertForeignCharacterSetToISOFilter : public textFilter {

private:@/
    unsigned char *conversionTable;

public:@/
    
    void setConversionTable(unsigned char *tbl) {
    	conversionTable = tbl;
    }

    convertForeignCharacterSetToISOFilter(unsigned char *tbl) {
    	setConversionTable(tbl);
    }
    
    string componentName(void) {
        return "convertForeignCharacterSetToISOFilter";
    }
    
    void identityTransform(void) {
    	int i;
	
    	conversionTable = new unsigned char[256];
	for (i = 0; i < 256; i++) {
	    conversionTable[i] = i;
	}
    }
    
    unsigned char convert(unsigned char from) {
    	return conversionTable[from];
    }
    
    void setTranslation(unsigned char from, unsigned char to) {
    	conversionTable[from] = to;
    }

    void put(string s) {
	string::iterator p;

	for (p = s.begin(); p != s.end(); p++) {
    	    *p = convert((*p) & 0xFF);
	}
        emit(s);
    }
};

@*2 Section separator squid.

An Etext is divided into three sections, the {\it prologue},
{\it body}, and {\it epilogue}, delimited by the |sectionSep|
marker which consists of a 68 character line filled with
the sequence \.{<><><>}$\ldots$\.{<><><>}.  The section
separator processes lines of the input stream in sequence,
testing each against the section separator.  Lines prior
to the first section separator are emitted to the
|prologueProcessor| component, lines within the body to the
regular |output| of the component, and lines following the
separator at the end of the body (if any) to the |epilogueProcessor|
component.  If the |prologueProcessor| or |epilogueProcessor|
pointers are |NULL|, output for the corresponding section will
be discarded.

This is, thus, a component with one input and multiple
outputs, creating a three-way fork in the pipeline,
permitting arbitrary components to be attached to each
output.  In the spirit of
\pdfURL{UNIVAC}{http://www.fourmilab.ch/documents/univac/}~1004
plugboard wiring, this is referred to as a {\it squid}.

@d sectionSep "<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>"

@<Class definitions@>=
class sectionSeparatorSquid : public textFilter {
private:@/
    textComponent *prologueProcessor,   // Destination for lines before the body
                  *epilogueProcessor,   // Destination for lines after the body
                  *currentOutput;       // Destination for the section now being read
    int nsep;                           // Section separators seen so far

public:@/
    //  Input starts in the prologue, so that is where output goes first
    sectionSeparatorSquid(textComponent *proP = NULL, textComponent *epiP = NULL) :
        prologueProcessor(proP), epilogueProcessor(epiP),
        currentOutput(proP), nsep(0) {
    }

    string componentName(void) {
        return "sectionSeparatorSquid";
    }

    //  Attach the prologue destination; it may only be set once
    void setPrologueProcessor(textComponent *proP) {
        assert(prologueProcessor == NULL && currentOutput == NULL);
        prologueProcessor = proP;
        currentOutput = proP;
    }

    //  Attach the epilogue destination; it may only be set once
    void setEpilogueProcessor(textComponent *epiP) {
        assert(epilogueProcessor == NULL);
        epilogueProcessor = epiP;
    }

    @<Section separator squid end of file handling@>;

    //  Route line |s| to the destination for the current section
    void put(string s) {
        if (s == sectionSep) {
            @<Handle section separator@>;
        }
        if (currentOutput == NULL) {
            return;                     // This section is being discarded
        }
        emit(s, currentOutput);
    }
};

@
The section separator squid is rather flexible in the ways it
permits you to direct contents of the sections, and this makes for
a modicum of complexity when we see a section separator and wish
to redirect the incoming stream.  First of all, any of the three
output branches---prologue, body, or epilogue---may be discarded
by directing them to a |NULL| component pointer.  Further, you
may specify the same component as output for more than one
branch; for example, if you wish to concatenate the prologue
and epilogue into one file.

We need to provide the conventional end of file notification
by calling our output components' |eof| methods after they've
received the last line of output, but since a component may be
attached to more than one branch, when we're switching branches
we only want to call |eof| when the component does not appear in
a subsequent branch.

@<Handle section separator@>=
    switch (nsep) {
	case 0:@/
	    nsep++; 	    // Advance to body
	    if ((currentOutput != NULL) && (currentOutput != output) &&
	    	(currentOutput != epilogueProcessor)) {
		currentOutput->eof();
	    }
	    currentOutput = output; // Direct output to main component output
	    return; 	    // Discard section separator

	case 1:@/
	    nsep++;
	    if ((currentOutput != NULL) && (currentOutput != epilogueProcessor)) {
		currentOutput->eof();
	    }
	    currentOutput = epilogueProcessor; // Direct output to epilogue processor
	    return; 	    // Discard section separator

	case 2:@/
	    //	Extra |sectionSep| in epilogue.  Treat as part of epilogue.
	    break;
    }

@
Our much-vaunted ``flexibility'' in output arrangements also has
consequences for end of file processing.  When we receive an end
of file notification, we can be in any of the three sections,
emitting or discarding output, and with potentially identical
destinations for sections subsequent to the one which contained
the end of file.  We thus need to guarantee that not only
the current section destination is notified of the end of
file (unless it's |NULL|), but also that destinations for
subsequent sections which will never receive any lines
are notified {\it unless they are the same as the destination
for an earlier section which has been notified}.

@<Section separator squid end of file handling@>=
    void eof(void) {
        if (currentOutput != NULL) {
            //  Notify current destination unless it's |NULL|
            currentOutput->eof();
        }

        switch (nsep) {
            case 0:@/
                /*  In prologue.  If the epilogue destination is the same
                    as that of the prologue, which has just been notified,
                    |NULL| it out so it isn't notified twice.  This must be
                    done whether or not the body destination is notified
                    below.  Otherwise a |NULL| body destination would let
                    the shared prologue/epilogue destination receive a
                    second |eof| in the next case.  */
                if ((currentOutput != NULL) && (epilogueProcessor == currentOutput)) {
                    epilogueProcessor = NULL;   // |eof| already sent
                }
                /*  Notify body of |eof| unless it's |NULL| or the same
                    destination as prologue.  */
                if ((currentOutput != output) && (output != NULL)) {
                    output->eof();
                }
                currentOutput = output;@/@,
                //  Deliberate fall-through$\ldots$

            case 1:@/
                /*  End of file encountered in the body.  Notify the
                    epilogue destination it's not going to be getting
                    any output unless it's |NULL| or the same destination
                    as the body, which has already been notified.  */
                if ((currentOutput != epilogueProcessor) &&
                    (epilogueProcessor != NULL)) {
                        epilogueProcessor->eof();
                }

            case 2:@/
                /*  End of file in the epilogue.  No special handling is
                    required.  */
                break;
        }
    }

@*2 Tee squid.

The tee squid makes a simple fork in a pipeline.  It copies
everything it receives to both the component next in the
pipeline and the component designated as its |secondDestination|.

@<Class definitions@>=
class teeSquid : public textFilter {
private:@/
    textComponent *secondDestination;   // Extra recipient of every line; may be |NULL|

public:@/
    teeSquid(textComponent *secP) {
        secondDestination = secP;
    }

    string componentName(void) {
        return "teeSquid";
    }

    //  Pass the end of file notification to both destinations
    void eof(void) {
        if (secondDestination != NULL) {
            secondDestination->eof();
        }
        textFilter::eof();
    }

    //  Copy line |s| to the second destination and to the regular output
    void put(string s) {
        /* |emit| treats a |NULL| destination as a request to use the
           regular |output|, so passing a |NULL| |secondDestination|
           through would deliver every line to |output| twice. */
        if (secondDestination != NULL) {
            emit(s, secondDestination);
        }
        emit(s);
    }
};
    
@*2 Etext body parser filter.

This filter processes the body of an Etext (if the source document
contains a prologue and epilogue, this filter should be placed
downstream of a |sectionSeparatorSquid|), identifying components
in the text and passing them down the pipeline tagged with their
type.  The body parser is implemented as a state machine, driven
by the lines of body copy it receives through its |put| method.

@<Class definitions@>=
class etextBodyParserFilter : public textFilter {

private:@/
    bodyState state;                    // State the parser is in
    queue <string> lq;                  // Lines held back during look-ahead
    string specialFilter;               // Special command type to keep; ``\.{*}'' keeps all

    //  Emit |text| prefixed by the encoded state |s| and the |bracket| character
    void emits(bodyState s, char bracket, string text = "") {
        emit(EncodeBodyState(s) + string(1, bracket) + text);
    }

    void emitQueuedLines(bodyState s);  // Emit lines in |lq| with bracketed state |s|

public:@/
    etextBodyParserFilter() : state(BeginText), specialFilter("") {
    }

    virtual ~etextBodyParserFilter() {
    }

    string componentName(void) {
        return "etextBodyParserFilter";
    }

    void setSpecialFilter(string f) {
        specialFilter = f;
    }

    string getSpecialFilter(void) {
        return specialFilter;
    }

    //  Mark the end of the text, then forward the end of file notification
    void eof(void) {
        emits(EndOfText, Void);
        textFilter::eof();
    }

    //  Classify line |s| and run it through the parser state machine
    void put(string s) {
        bodyState lineClass = classifyLine(s);

        //  Unless every special is accepted, discard special lines
        //  whose type does not match the filter
        if ((specialFilter != "*") && isLineSpecial(s) &&
            (specialType(s) != specialFilter)) {
            return;
        }
        @<Parser state machine@>;
    }

    static bodyState classifyLine(string s);    // Classify line by justification type

    static bool isLineSpecial(string s);        // Test for special command

    static string specialType(string s);        // Extract type of special command

    static string specialCommand(string s);     // Extract body of special command
};

@
We enter the parser state machine with two pieces of information:
the current |state| of the machine and the |lineClass| of the
line just passed to the |put| method.  The state machine consists
of a |switch| statement with cases for each possible state, wrapped
in an endless loop which permits cycling the machine with the
same input line after a state change with a simple |continue|
statement.  This means, of course, that we need to break out
of the machine explicitly when we've consumed the input line, but
this is simply accomplished with a |break| at the bottom of the loop.

@<Parser state machine@>=
    while (true) {
    	switch (state) {
	    @<BeginText state@>;
	    @<BeforeTitle state@>;
	    @<Declarations state@>;
	    @<PossibleTitle state@>;
	    @<TitleMarker state@>;
	    @<Author state@>;
	    
	    @<BetweenParagraphs state@>;
	    @<Within aligned paragraph state@>;
	    @<Within preformatted table state@>;
	    @<PossibleChapterNumber state@>;
	    @<ChapterMarker state@>;
	    @<ChapterName state@>;
	    
	    default:
	    	cerr << "Internal error: state \"" << stateNames[state] <<
		    	"\" not handled in etextBodyParserFilter.\n";
		exit(1);
	}
	break;
    }
    
@
The state machine starts in |BeginText| state.  All we do is emit
the corresponding marker to identify the start of the text and
drop into |BeforeTitle| state to process the line.

@<BeginText state@>=
    case BeginText:
    	emits(BeginText, Void);
    	state = BeforeTitle;
	continue;

@
Once we've output the |BeginText| marker we arrive in this
state.  At this point we're waiting to encounter either the
title and author sequence or the start of document body if no
such sequence exists.

@<BeforeTitle state@>=
    case BeforeTitle:
    	if (lineClass != BetweenParagraphs) {	// Discard blank lines before title/start of text
	    if (isLineSpecial(s)) {
	    	emits(Declarations, Begin);
		state = Declarations;
		continue;
	    }
	    
	    if (lineClass == InCentred) {
	    	state = PossibleTitle;
		lq.push(s);
		break;
	    }
	    
	    if (lineClass == TitleMarker) { 	// Weird--title marker with no title
	    	state = TitleMarker;	    	// Set state to accept author
		emits(DocumentTitle, Void); 	// Indicate no document title
		break;
	    }
	    
	    @/@,
	    //	Anything else is start of document with no title or author specified
	    
	    emits(DocumentTitle, Void);
	    emits(Author, Void);
	    
	    state = BetweenParagraphs;
	    continue;	    	    	    	// Re-parse line in |BetweenParagraphs| state
	}
    	break;
	
@
One or more format-specific special commands may appear before the
document title.  These are generally used for document-wide
declarations which need to appear before the body of
the text.  They are returned in a Declarations block consisting
of all consecutive special commands which appear before the
title.  If you separate blocks of declarations by blank lines,
multiple declarations blocks will be returned; this
is generally a dopey thing to do.

@<Declarations state@>=
    case Declarations:
	if (isLineSpecial(s)) {
	    emits(Declarations, Body, s);
	    break;
	}
	emits(Declarations, End);
	state = BeforeTitle;
	continue;
    	
@
We have seen a centred line at the start of the document.  This
may be a title, or it may simply be centred text which happens
to be at the start of a document with no title.  We save centred
lines in the |lq| queue until we either encounter a line which
isn't centred or a title marker.

@<PossibleTitle state@>=
    case PossibleTitle:
        if (lineClass == InCentred) {
            lq.push(s);                 // Still undecided: keep collecting centred lines
            break;
        }

        if (lineClass == TitleMarker) { // The queued lines were the document title
            state = TitleMarker;
            emitQueuedLines(DocumentTitle);
            break;
        }

        @/@,
        /*  In every remaining case the document has no title
            specification, so a void title and author come first.  */

        emits(DocumentTitle, Void);
        emits(Author, Void);

        if (lineClass == ChapterMarker) {
            @/@,
            /*  The document opens directly with a chapter break:
                the queued centred lines are the chapter number.  */

            emitQueuedLines(ChapterNumber);
            state = ChapterMarker;
            break;
        }

        @/@,
        /*  Otherwise the queue holds ordinary centred text at the
            start of the document.  Emit it and re-parse this line.  */

        emitQueuedLines(InCentred);
        state = BetweenParagraphs;
        continue;
	
@
We have seen and processed a title marker.  Subsequent centred lines are
the author specification and will be output as such.  The author sequence
is terminated by any non-centred line, but blank lines are permitted
between the title marker and the first line of the author specification.

@<TitleMarker state@>=
    case TitleMarker:
	if (lineClass == InCentred) {
    	    emits(Author, Begin);
	    emits(Author, Body, s);
	    state = Author;
	    break;
	}
	
	if (lineClass == BetweenParagraphs) {
	    break;	    	    	// Discard blank line after title marker
	}

	//	No author specification.  Emit void author and process as text

	emits(Author, Void);
	state = BetweenParagraphs;
	continue;
    
@
One or more lines of the author specification have been output.
Successive centred lines are continuations of the author,
while anything else ends the author specification.

@<Author state@>=
    case Author:
	if (lineClass == InCentred) {
	    emits(Author, Body, s);
	    break;
	}
	emits(Author, End);
	state = BetweenParagraphs;
	continue;	

@
|BetweenParagraphs| is the ground state while processing the bulk of
the document.  Here we have no object open or pending, and we're
ignoring blank lines waiting to see something whose alignment
determines the handling of the next item we're to process.
If it's text (justified, ragged right, ragged left, or block
quote), we begin a sequence of that type and set the state
to accrue subsequent lines of the same kind.  A centred line,
however, may be the first line of a chapter break, so we
must save it in the |lq| queue and go into |PossibleChapterNumber|
state pending examination of the next line.

@<BetweenParagraphs state@>=
    case BetweenParagraphs:
    	switch (lineClass) {
	    case BetweenParagraphs:@/
	    	break;  	    	    	// Nugatory blank line
	    @#	
	    case InTextParagraph:@;
	    case InRaggedRight:@;
	    case InBlockQuote:@;
	    case InRaggedLeft:@;
	    case InPreformattedTable:@;
	    	emits(lineClass, Begin);    	// Emit begin of aligned block
		emits(lineClass, Body, s);
		state = lineClass;
		break;@#
		
	    case InCentred:@;	    	    	// Regular centred line
	    case TitleMarker:@;	    	    	// Doesn't belong here, but who knows?
	    	lq.push(s);
		state = PossibleChapterNumber;
		break;@#
		
	    case ChapterMarker:@;     	    	// Chapter marker without preceding number
		emits(ChapterNumber, Void);
		state = ChapterMarker;
	    	break;@#
		
	    default:@;	    	    	    	// Ignore anything else
	    	break;@#
	}
    	break;

@
This section handles all kinds of aligned paragraphs: justified, ragged
left and right, and block quote.  As long as we continue to
receive lines with same alignment as the state we're in, we simply
emit them as continuations of the current paragraph.  Upon encountering
a line with a different classification, we close the paragraph, revert
to |BetweenParagraphs| state, and re-parse the line in that state.

Note how the fact that |classifyLine| uses the same codes for its
alignment classes as we use for state when within a paragraph with
that alignment pays a big dividend of simplification here.
	
@<Within aligned paragraph state@>=
    case InTextParagraph:@;
    case InRaggedRight:@;
    case InBlockQuote:@;
    case InRaggedLeft:@;
    	if (lineClass == state) {
	    emits(state, Body, s);
	    break;
	}
	
	emits(state, End);
	state = BetweenParagraphs;
	continue;

@
To give the maximum latitude for formatting in preformatted tables,
once we've identified the first line as beginning in
the |PreformattedTableIndent| column and containing at least one
sequence of three or more spaces, we stay in preformatted table
state until we encounter a blank line.

@<Within preformatted table state@>=
    case InPreformattedTable:@;
    	if (lineClass != BetweenParagraphs) {
	    emits(state, Body, s);
	    break;
	}
	
	emits(state, End);
	state = BetweenParagraphs;
	break;

@
When we encounter a centred line, there are two possibilities.  It may
simply be the first of one or more centred lines in the document, or
it may be the first line of a chapter break, in which case it belongs
to a chapter number specification.  We can't distinguish these alternatives
until we see either a chapter marker or something other than a line of
centred text (including a blank line).  As long as we continue to
receive centred lines, add them to the |lq| queue.  If we get a chapter
marker, output the lines in the queue as a chapter number and change
state to process the chapter name; otherwise, emit the queued lines as
a centred paragraph, reset to |BetweenParagraphs| state, and re-parse
the non-centred line.

@<PossibleChapterNumber state@>=
    case PossibleChapterNumber:
	if (lineClass == InCentred) {	
	    lq.push(s);
	    break;
	}
	
	if (lineClass == ChapterMarker) {
	    emitQueuedLines(ChapterNumber);
	    state = ChapterMarker;
	    break;
	}

	emitQueuedLines(InCentred);
	state = BetweenParagraphs;
    	continue;

@
We've identified a chapter marker and processed the preceding chapter
number, if any.  Centred lines following the chapter number are
output as the chapter name.  Any non-centred line, including a blank
one, terminates the chapter name.  Hence, a chapter marker followed
by a blank line denotes a chapter with no title.
	
@<ChapterMarker state@>=
    case ChapterMarker:
	if (lineClass == InCentred) {
	    emits(ChapterName, Begin);
	    emits(ChapterName, Body, s);
	    state = ChapterName;
	    break;
	}
	
	emits(ChapterName, Void);
	state = BetweenParagraphs;
	continue;
	
@
Once we've seen a centred chapter name line following a chapter mark,
we consider subsequent centred lines as continuations of the
chapter name.  Anything else (including a blank line) terminates
the chapter name and is re-parsed in |BetweenParagraphs| state.

@<ChapterName state@>=
    case ChapterName:
	if (lineClass == InCentred) {
	    emits(ChapterName, Body, s);
	    break;
	}
	
	emits(ChapterName, End);
	state = BetweenParagraphs;
	continue;

@
The |classifyLine| function examines a line of the body copy and
classifies it based on its ``heuristic'' justification and content,
returning a context-free subset of the parser's |bodyState| values.
If you need to modify how the program decides a line should be
justified based on how it's aligned in the input text, here is
where you should be looking.

@<Class definitions@>=
bodyState@, etextBodyParserFilter::classifyLine(string s) {
    bodyState classification;

    //  Classify |s| purely from its own layout.  The result is one of
    //  the line classes shared with the parser states.

    if (s.length() == 0) {
        classification = BetweenParagraphs;     // Blank line
    } else if (s[0] != ' ') {
        classification = InTextParagraph;       // Justified body copy
    } else {
        int i = s.find_first_not_of(' ');       // Column of first non-blank character

        if (i == RaggedRightIndent) {
            classification = InRaggedRight;     // Ragged right text
        } else if ((i == PreformattedTableIndent) &&
                   (s.find("   ", i) != string::npos)) {
            @/@,
            /*  A table line must contain a run of three or more spaces
                after its indentation.  We search for the run as a
                substring starting at column |i|; a character set search
                would match the very first leading blank and make this
                test always succeed.  */
            classification = InPreformattedTable;   // Preformatted table
        } else if (i == QuoteIndent) {
            classification = InBlockQuote;      // Block quotation
        } else if (s.length() == FormatWidth) {
            classification = InRaggedLeft;      // Ragged left
        } else {
            @<Classify centred line@>;
        }
    }

    return classification;
}

@
An indented non-blank line which begins in neither the |RaggedRightIndent|
nor |QuoteIndent| columns is taken to be centred.  We further classify
centred lines as either regular text or separators
between document title and author lines or chapter number and name
specifications which are denoted by centred lines exclusively
composed of, respectively, equal (\.{=}) or minus (\.{-}) signs.

The way in which we recognise these markers looks a little
cowboy style unless you realise we already know several important
things about the string |s| before we arrive here: it is guaranteed
to have no trailing white space, to have at least one blank at the
beginning and at least one non-blank thereafter, and to contain no
white space characters other than spaces.  All of these conditions
are guaranteed either by tests within this filter or transformations
performed on the input by previous components in the pipeline.

@<Classify centred line@>=
    char tail = s[s.length() - 1];      // A marker line necessarily ends in its marker character

    classification = InCentred;         // Centred text unless shown to be a marker

    if ((tail == TitleMarkerCharacter) || (tail == ChapterMarkerCharacter)) {
        int lead = s.find_first_not_of(' ');    // Where the non-blank part starts
        bool longEnough = (s.length() - lead) >= MarkerMinimumLength;

        //  It is a marker only if nothing but |tail| appears from |lead| onward

        if (longEnough && (s.find_first_not_of(tail, lead) == string::npos)) {
            if (tail == TitleMarkerCharacter) {
                classification = TitleMarker;
            } else {
                classification = ChapterMarker;
            }
        }
    }

@
``Special'' commands are text lines interpreted by a specific output
format generator.  Such commands may be used, for example, to include
image files in the generated document.  Special commands follow
the heuristic justification rules of regular text, and are
identified by beginning and ending with the |SpecialMarker| sentinel,
with the complete |SpecialPrefix| at the start.
The |isLineSpecial| function tests whether a line is so marked
and should be interpreted as a special command.  The
function is |public| and |static|, and may be called by
downstream components to determine whether a line they
have received is a special command.

@<Class definitions@>=    
bool etextBodyParserFilter::isLineSpecial(string s)
{
    //  Return |true| if the first non-blank text of |s| is the
    //  |SpecialPrefix| and the line ends with the |SpecialMarker|.
    //  The index is kept in |string::size_type| so that comparison
    //  with |string::npos| is exact where that type is wider than
    //  |unsigned int|.

    string::size_type first = s.find_first_not_of(' ');

    if (first == string::npos) {
        return false;                   // Empty or all-blank line
    }

    return (s.find(SpecialPrefix) == first) &&
           (s.rfind(SpecialMarker) == (s.length() - ((sizeof SpecialMarker) - 1)));
}

@
Each special command contains a type which identifies which output
format generators are interested in it.  This function, which assumes
the line is a special command (|isLineSpecial| returns |true| for it),
extracts the type from the command and returns it.  This is cowboy
code---if you have trouble, try adding an
|assert(isLineSpecial(s))| at the top of the function and see if
it pops.

@<Class definitions@>=    
string etextBodyParserFilter::specialType(string s)
{
    //  Return the type field of special command |s|: the text between
    //  the |SpecialPrefix| and the next space (or the end of the line
    //  if there is no space).  If |s| does not contain the prefix, the
    //  placeholder string below is returned.

    string o = " - invalid -";
    string::size_type first = s.find(SpecialPrefix), last;

    //  |first| must be a |string::size_type|: stored in a narrower
    //  type, a failed search no longer compares equal to |string::npos|
    //  and the |substr| below is asked for a position beyond the end.

    if (first != string::npos) {
        first += (sizeof SpecialPrefix) - 1;    // Step over the prefix
        last = s.find(' ', first);
        if (last == string::npos) {
            o = s.substr(first);                // Type runs to end of line
        } else {
            o = s.substr(first, last - first);
        }
    }
    return o;
}

@
Extract the output format specific body from a special command.
If given an invalid special command, an empty string will
be returned.  |specialCommand| may be called by downstream
components to extract the command body from a special
command it has received.

@<Class definitions@>=    
string etextBodyParserFilter::specialCommand(string s)
{
    string o = "";
    unsigned int first = s.find(SpecialPrefix), last;
    
    if (first != string::npos) {
    	first += (sizeof SpecialPrefix);
	first = s.find(' ', first);
	if (first != string::npos) {
	    last = s.rfind(SpecialMarker);
	    if (last != string::npos) {
	    	o = s.substr(first + 1, (last - first) - 1);
	    }
	}
    }
    return o;
}

@
This little helper function emits lines stored in the look-ahead queue |lq|
as a block of lines of a given type |s|, complete with |Begin|
and |End| brackets.

@<Class definitions@>=
void etextBodyParserFilter::emitQueuedLines(bodyState s) {
    //  Drain the look-ahead queue, oldest line first, wrapping the
    //  lines in |Begin| and |End| records of type |s|.  The brackets
    //  are emitted even if the queue is empty.

    emits(s, Begin);
    for (; !lq.empty(); lq.pop()) {
        emits(s, Body, lq.front());
    }
    emits(s, End);
}

@
These are the states among which the Etext body parser transitions
as it processes lines of the text.  Note that some of these states
are also used by |classifyLine| to denote the justification
of individual lines.

@d EncodeBodyState(s)	((char) ('A' + (s)))	/* state to tag character: state 0 is |'A'| */
@d DecodeBodyState(c)	((bodyState) ((c) - 'A'))	/* tag character back to state; no range check */

@<Global variables@>=
    enum bodyState {	    	    	// Body parser current state
	//  The order of these values matters: they index the
	//  |stateNames| table and are encoded as consecutive letters
	//  by |EncodeBodyState|.  Add a name to |stateNames| whenever
	//  a state is added here.
    	BeginText,	    	    	// Begin text pseudo-marker
    	BeforeTitle,	    	    	// Title not yet seen
	Declarations,	    	    	// Special declarations before title
	PossibleTitle,	    	    	// Centred text which may be the title
	TitleMarker,     	    	// Separator between title and author
    	DocumentTitle,	    	    	// Document title
	Author,     	    	    	// Author information after title separator
	BetweenParagraphs,  	    	// Blank space between paragraphs
	InTextParagraph,    	    	// In regular text paragraph
	InBlockQuote,	    	    	// In indented block quotation paragraph
	InRaggedRight,	    	    	// In ragged right paragraph
    	InRaggedLeft,	    	    	// In ragged left paragraph
	InPreformattedTable,	    	// In preformatted table
	PossibleChapterNumber,   	// Centred text which may be chapter number
	InCentred,  	    	    	// In centred text
    	ChapterNumber,	    	    	// Chapter number
	ChapterMarker,	    	    	// Marker after chapter number
	ChapterName,	    	    	// Chapter name
	EndOfParagraph,     	    	// End of paragraph pseudo-marker
	EndOfText   	    	    	// End of text pseudo-marker
    };

@
Each of the syntactic elements recognised by the parser is output
to the component downstream with brackets which mark the beginning,
body, and end of each element.  The |Begin| and |End| markers are
sent with the state code identifying the element but no text.  If
an element such as the title or author is omitted, a |Void|
record is output to indicate its absence.

@<Global variables@>=
    static const char Begin = '{',  	// Structure nesting flags: element opens
    	    	      Body = ' ',	// Line of text within an element
		      End = '}',	// Element closes
		      Void = '-';	// Element is absent, or a marker with no text


@
For debugging, it's nice to be able to dump the parser states (particularly
those with which lines emitted by the parser are tagged).  Strings in the
following table correspond to the states in |bodyState| and are used by
the |parserDiagnosticFilter| to generate its output.

@<Global variables@>=
    static string const stateNames[] = {
	//  One name per |bodyState| value, in the same order as the
	//  enumeration.  The |parserDiagnosticFilter| pads each name
	//  to 12 columns, so keep names to 12 characters or fewer.
    	"Begin text",
    	"B4 Title",
	"Declarations",
    	"Poss Title",
    	"Title mark",
    	"Title",
    	"Author",
    	"Par break",
    	"Text",
    	"Blockquote",
    	"Rag right",
    	"Rag left",
	"Table",
    	"Poss Chap",
    	"Centred",
    	"Chap num",
    	"Chap mark",
    	"Chap name",
    	"End para",
    	"End text"
    };

@*2 Strip special commands filter.

This filter scans its input for special commands (identified by
the |etextBodyParserFilter::isLineSpecial| function) and
removes them from the stream passed down the pipeline.  If
removal of special commands would result in two consecutive
blank lines in the output, the extra blank line is also
elided.  This filter assumes that its input contains no
tab characters nor trailing white space (and hence that
any blank line is a zero length string).

@<Class definitions@>=
class stripSpecialCommandsFilter : public textFilter {
private:@/
    bool lastBlank;                     // Most recent blank line seen, with no text line since
    bool lastStripped;                  // Special command removed since the last line passed on

public:@/
    stripSpecialCommandsFilter() : lastBlank(false), lastStripped(false) {
    }

    string componentName(void) {
        return "stripSpecialCommandsFilter";
    }

    //  Pass |s| downstream unless it is a special command, or a blank
    //  line which the removal of special commands has left directly
    //  after another blank line.
    void put(string s) {
        if (etextBodyParserFilter::isLineSpecial(s)) {
            lastStripped = true;        // Swallow the command and remember we did
            return;
        }

        if (s.length() > 0) {           // Ordinary text always passes
            emit(s);
            lastStripped = lastBlank = false;
            return;
        }

        //  Blank line: suppress it only when a stripped command sits
        //  between it and the previous blank line.

        if (!(lastStripped && lastBlank)) {
            emit(s);
        }
        lastStripped = false;
        lastBlank = true;
    }
};

@*2 Audit filter.

The |auditFilter| performs a variety of tests on lines which pass
through it.  The tests are selected by a bit mask of |audit_criteria|
passed to the constructor or set by |setAuditCriteria| (the default is
to enable all tests).  The |auditFilter| passes input unchanged to the
component downstream.  Error messages are written to an |ostream|
log which defaults to |cerr|.  For complete generality, this should
probably be replaced with a |textComponent| transforming |auditFilter|
into a squid.

@<Class definitions@>=
class auditFilter : public textFilter {
public:@/
    enum audit_criteria {
    	trailing_blanks = 1,
	embedded_tabs = 2,
	exceeds_maximum_length = 4,
	invalid_characters = 8,
	dubious_justification = 16,
	improper_embedded_blanks = 32,
	consecutive_blank_lines = 64,
	special_commands_present = 128,
	permit_8_bit_ISO_characters = 256,
	trailing_hyphen = 512,
	
	everything = ~0
    };

private:@/
    static const int DefaultCentringTolerance = 2;  // If you're picky, you can set this to 1
    
    unsigned int maxLineLength;
    ostream *log;
    bool lastBlank;
    bool inTable;
    enum audit_criteria check;
    int centringTolerance;

public:@/
    string componentName(void) {
        return "auditFilter";
    }

    void setMaxLength(unsigned int maxlen) {
        maxLineLength = maxlen;
    }

    void setLogStream(ostream &s) {
        log = &s;
    }
    
    void setAuditCriteria(int check_for) {
    	check = (audit_criteria) check_for;
    }

    audit_criteria getAuditCriteria(void) {
    	return check;
    }
    
    void enableAuditCriteria(int check_for) {
    	check = (audit_criteria) (check | check_for);
    }
    
    void disableAuditCriteria(int check_for) {
    	check = (audit_criteria) (check & (~check_for));
    }
    
    void setCentringTolerance(int ct = DefaultCentringTolerance) {
    	centringTolerance = ct;
    }
    
    int getCentringTolerance(void) {
    	return centringTolerance;
    }
    
    auditFilter(unsigned int maxlen = FormatWidth, ostream &os = cerr,
    	    	audit_criteria check_for = everything) {
        setMaxLength(maxlen);
        setLogStream(os);
	lastBlank = false;
	inTable = false;
	setAuditCriteria(check_for);
	setCentringTolerance();
    }
    
    static bool isCharacterPermissible(unsigned int c);
    
    static string quoteArbitraryString(string s);
    
    static bool isISOletter(int c) {
    	assert((c >= 0) && (c <= 0xFF));
    	return ((c >= 'A') && (c <= 'Z')) ||
	       ((c >= 'a') && (c <= 'z')) ||
	       ((c >= 0xC0) && (c <= 0xD6)) ||
	       ((c >= 0xD8) && (c <= 0xF6)) ||
	        (c >= 0xF8);
    }

    void put(string s) {
        unsigned int i, n;
	bool err = false;
	const string sentenceEnd = ".?!\"\'";
	bodyState lclass;
	bool special = etextBodyParserFilter::isLineSpecial(s);

    	@<Check for line with trailing white space@>;
	@<Check for line with trailing hyphen@>;
    	@<Check for line with embedded tab characters@>;
	@<Check for line that exceeds maximum text length@>;
	@<Check for invalid characters in text@>;
	@<Check for justification-related problems@>;
	@<Check for line with improper embedded white space@>;
	@<Check for consecutive blank lines@>;
	@<Check for special commands present@>;
	if (err) {
	    issueMessage(quoteArbitraryString(s), *log);
	}
        emit(s);
    }
};

@
In the interest of visual fidelity as well as minimising file
size, we don't want to include any lines with nugatory white
space between the last printable character and the end of line.
If any have crept in, generate a warning.

@<Check for line with trailing white space@>=
    if (check & trailing_blanks) {
    	int j;
	
	n = 0;
	for (j = s.length() - 1; j >= 0; j--) {
    	    if (!isspace(s[j])) {
		break;
	    }
	    n++;
	}
	if (n > 0) {
	    ostringstream em;

	    em << "Line contains " << n << " white space character" <<
	    	  (n == 1 ? "" : "s") << " at the end.";
            issueMessage(em.str(), *log);
    	    err = true;
	}
    }

@
One common problem in scanned documents is hyphenated
lines which were not joined in the editing phase.  This check
attempts to detect such lines.  We only issue the warning if
the character that precedes the hyphen is alphabetic
(including ISO accented letters), as trailing em-dashes
and minus signs in mathematics are perfectly valid.

@<Check for line with trailing hyphen@>=
    if ((check & trailing_hyphen) && (s.length() >= 2)) {
        string::size_type last = s.length() - 1;

        //  Complain only about a final hyphen which directly follows a letter

        if ((s[last] == '-') && isISOletter(s[last - 1] & 0xFF)) {
            ostringstream em;

            em << "Line contains an apparent hyphen at the end.";
            issueMessage(em.str(), *log);
            err = true;
        }
    }

@
By the time we audit the text, any tab characters which may
have appeared in the input should have been expanded to
spaces.  Tab characters presume tab stop settings which,
while usually defaulting to 8 characters, are nowhere
specified in a standard.  Leaving tabs in an Etext runs
the risk that carefully-aligned material may be
scrambled if viewed on a system with different tab
stops.  Here we verify that no tabs remain.  Note that
a tab will also fail |@<Check for invalid characters in text@>|,
but making a special check here makes the diagnostic for this
common case more comprehensible.

@<Check for line with embedded tab characters@>=
    if (check & embedded_tabs) {
	if ((i = s.find('\t')) != string::npos) {
	    for (; i < s.length(); i++) {
		if (s[i] == '\t') {
		    ostringstream em;

		    em << "Line contains tab character in column " << (i + 1) << ".";
        	    issueMessage(em.str(), *log);
    		    err = true;
		}
	    }
	}
    }

@
Lines should not have more than one space between words except after
sentence-ending punctuation.  Such lines may result from attempts
to justify text by adding space, and may be propagated even if
the text is re-aligned.  Within a preformatted table embedded
spaces are allowed and this test is skipped.
    
@<Check for line with improper embedded white space@>=
    if ((check & improper_embedded_blanks) && (!inTable) && (!special)) {
	i = s.find_first_not_of(' ');   	    // Ignore leading spaces
	if (i != string::npos) {
	    while ((i = s.find("  ", i)) != string::npos) {
		if ((i > 0) && (sentenceEnd.find(s[i - 1]) != string::npos)) {
	    	    if (s.substr(i + 2, 1) == " ") {
			ostringstream em;

			em << "Line contains extra embedded space after sentence end in column " <<
		    	      (i + 1) << ".";
        		issueMessage(em.str(), *log);
    			err = true;
			i += 3;
		    } else {
			i += 2;
		    }
		} else {
		    ostringstream em;

		    em << "Line contains extra embedded space in column " << (i + 1) << ".";
        	    issueMessage(em.str(), *log);
    		    err = true;
		    i += 2;
		}
	    }
	}
    }

@
In the interest of human readability we restrict the maximum
length of text lines in the document to |maxLineLength|
characters.  If the line exceeds that limit, issue a
warning.  Note that we've already tested for lines which
still contain embedded tab characters or trailing white
space at this point.  Special commands are exempted from this
check.

@<Check for line that exceeds maximum text length@>=
    if ((check & exceeds_maximum_length) && (!special)) {
        if (s.length() > maxLineLength) {       // Too long for comfortable reading
            ostringstream em;

            em << "Line (length " << s.length() << ") exceeds maximum of "
               << maxLineLength << " characters.";
            issueMessage(em.str(), *log);
            err = true;
        }
    }


@
Scan the text line to ensure it contains no impermissible characters
as defined by |isCharacterPermissible|.  One little detail:
if we're explicitly checking for |embedded_tabs|, there's no
need to report them a second time as invalid characters.  Finally,
if |permit_8_bit_ISO_characters| is not set, we require the
input to consist of exclusively 7-bit ASCII characters; ISO
characters are reported as errors in this mode.

@<Check for invalid characters in text@>=
    if (check & invalid_characters) {
	for (i = 0; i < s.length(); i++) {
	    if ((!isCharacterPermissible(s[i])) ||
	    	(((!(check & permit_8_bit_ISO_characters)) &&
		    ((s[i] & 0xFF) >= 127)))) {
	    	if ((s[i] != '\t') || (!(check & embedded_tabs))) {
		    ostringstream em;

		    em << "Invalid character 0x" << hex << (s[i] & 0xFF) <<
			dec <<" in column " << (i + 1) << ".";
        	    issueMessage(em.str(), *log);
		    err = true;
		}
	    }
	}
    }

@
Use the ``heuristic justification'' classifier of the
|etextBodyParserFilter| to evaluate the line, then verify if
the actual content of the line is consistent with its
evaluation.  We also keep track of whether we're currently
in a preformatted table, within which embedded spaces are
permitted.  The test for ``dubious centred lines'' catches
a multitude of sins, in particular ragged right, block quote,
and ragged left lines which do not start or end in the
prescribed columns.  Special commands are exempted from this
check.

@<Check for justification-related problems@>=
    if ((check & dubious_justification) && !special) {
        lclass = etextBodyParserFilter::classifyLine(s);
        switch (lclass) {
            case InPreformattedTable:
                inTable = true;             // Table begins, or continues
                break;

            case BetweenParagraphs:
                inTable = false;            // Only blank line ends table
                break;

            case InCentred:
                if (!inTable) {
                    int leftPad = s.find_first_not_of(' ');     // Number of leading spaces
                    int rightPad = maxLineLength - s.length();  // Number of (virtual) trailing spaces

                    if (Iabs(leftPad - rightPad) > centringTolerance) {
                        ostringstream em;

                        em << "Dubious centred line.  " << leftPad << " spaces at left, "
                           << rightPad << " spaces at right.";
                        issueMessage(em.str(), *log);
                        err = true;
                    }
                }
                break;

            default:                        // Other classes are not examined
                break;
        }
    }

@
There's no reason for more than one consecutive blank line to
appear in the text.  Multiple consecutive blank lines are most
likely editing errors which would render the raw text less
readable.

@<Check for consecutive blank lines@>=
    if (check & consecutive_blank_lines) {
        bool blank = (s.find_first_not_of(' ') == string::npos);    // Empty or nothing but spaces

        if (blank && lastBlank) {
            issueMessage("This and previous line are both blank.", *log);
            err = true;
        }
        lastBlank = blank;              // Remember for the next line
    }

@
Output format specific \.{Special} commands are included in Etexts
to facilitate the production of published editions in various
formats and media, but should be removed prior to distribution
of an Etext in ``Plain ASCII'' form.  (This can be accomplished
by passing the Etext through the |stripSpecialCommandsFilter|.)
This test detects specials inadvertently left in an Etext
intended for publication.
    
@<Check for special commands present@>=
    if (special) {
        if (check & special_commands_present) {     // Specials are not wanted in this text
            err = true;
            issueMessage("Special command present in text.", *log);
        }
    }

@
The body of an Etext must contain nothing other than the
ISO-8859/1 printable characters, blanks, and end of line
delimiters.  You'd think this wouldn't be a problem, but
thanks to Microsoft's little collection of incompatible
horrors jammed right in the middle of the ISO (and Unicode)
8 bit control set, plus editors who amuse themselves by
jamming form feeds, vertical tabs, etc. into documents,
it pays to be sure, since treating any such nonsense as
legitimate text characters may lead to disaster downstream.

The following static helper function determines if its
character argument is permissible in Etext body copy.
At the time this function is called we assume that any
trailing white space including end of line sequences has
been deleted and that horizontal tabs have been expanded
to spaces.  Placing |trimFilter| and |tabExpanderFilter|
in the pipeline before |auditFilter| will guarantee
these criteria are met.

@<Class definitions@>=
bool auditFilter::isCharacterPermissible(unsigned int c)
{
    if (c < ' ') {
    	return false;	    	    // ASCII control characters not permitted
    }
    if (c >= 127 && c < 161) {
    	return false;	    	    // DEL, ISO control characters, or non-breaking space prohibited
    }
    return true;
}

@
When issuing an error message for a string which may contain invalid
and/or non-printing characters, we need to quote those characters
so they're apparent.  This function takes a string containing
arbitrary 8 bit characters and returns a string in which all
characters other than ASCII and ISO graphics are quoted as \CEE/
hexadecimal escapes.

@<Class definitions@>=
string auditFilter::quoteArbitraryString(string s)
{
    string o = "";

    // Locate the last non-blank character once, rather than rescanning the
    // remainder of the string for every blank encountered.  Any blank beyond
    // this position is a trailing blank; if there is no non-blank character
    // at all, every blank in the string is trailing.
    const string::size_type lastNonBlank = s.find_last_not_of(' ');

    for (string::size_type i = 0; i < s.length(); i++) {
    	const unsigned int c = s[i] & 0xFF; // Treat the character as an unsigned 8 bit code

	if (isCharacterPermissible(c)) {
	    const bool trailingBlank = (c == ' ') &&
	    	((lastNonBlank == string::npos) || (i > lastNonBlank));

	    if (trailingBlank) {
		o += "\\x20";	    	    // Quote trailing blanks so they are visible
	    } else {
	    	o += static_cast<char>(c);  // Permissible character: copy unchanged
	    }
	} else {
	    // Impermissible character: emit as a two digit hexadecimal escape
	    ostringstream eh;

	    eh << "\\x" << hex << setw(2) << setfill('0') << c;
	    o += eh.str();
	}
    }

    return o;
}

@*2 Parser diagnostic filter.

This filter processes the body of an Etext (if the source document
contains a prologue and epilogue, this filter should be placed
downstream of a |sectionSeparatorSquid|), identifying components
in the text and passing them down the pipeline tagged with their
type.  The body parser is implemented as a state machine, driven
by the lines of body copy it receives through its |put| method.

@<Class definitions@>=
class parserDiagnosticFilter : public textFilter {
private:@/

public:@/

    string componentName(void) {
        return "parserDiagnosticFilter";
    }
    
    void put(string s) {
   	bodyState rtype = DecodeBodyState(s[0]);
    	string spaces = "            ",
	       stateName = "";

	stateName += stateNames[rtype];
    	emit(s.substr(1,1) + " " + stateName + spaces.substr(0, 12 - stateName.length()) +
	    	 s.substr(2));
    }
};

@*1 Utilities.

The following are not full-fledged pipeline components, but rather utilities
which provide services to text processing components.

@*2 Text substituter.

The |textSubstituter| performs replacement of substrings in
text with defined substitutes.

@<Class definitions@>=
class textSubstituter {
private:@/
    deque <string> fromString;
    deque <string> toString;
    
public:@/
    void addSubstitution(string from, string to) {
    	fromString.push_back(from);
	toString.push_back(to);
    }
    
    string substitute(string s);
};

@
The |substitute| method applies all of the substitution rules
of the |textSubstituter| to its argument string and returns
the result.  Note that substitutions are not re-scanned, and hence
cannot result in infinite expansion loops.

@<Class definitions@>=
string textSubstituter::substitute(string s) {
    deque <string>::iterator f = fromString.begin();
    deque <string>::iterator t = toString.begin();
    string o = s;

    // Rules are applied in the order they were added; |f| and |t| advance together
    for (; f != fromString.end(); f++, t++) {
    	if (f->empty()) {
	    continue;	    	    // An empty pattern matches at every position and would never terminate
	}

    	// Positions must be kept in |string::size_type|.  Storing the result of
	// |find| in an |unsigned int| truncates |string::npos| wherever |size_t|
	// is wider, so the comparison below would never succeed and |replace|
	// would be called with a position beyond the end of the string.
    	string::size_type i = 0, n;

	while ((n = o.find(*f, i)) != string::npos) {
	    o.replace(n, f->length(), *t);
	    i = n + t->length();    // Resume after the replacement so it is not re-scanned
	}
    }

    return o;
}

@q  The LaTeX, HTML, and Palm Markup Language generator @>
@q  components are kept in separate CWEB files for ease of @>
@q  maintenance.  We include them here.  @>
    
@i latex.w
@i html.w
@i palm.w

@** Main program.

The \.{etset} program is a filter which processes both its
input and output in a strictly serial fashion, permitting it to be used
as part of a pipeline.  (The program does need to look ahead, but
handles this internally.)

@<Main program@>=
int main(int argc, char *argv[])
{
    // Process the command line, assemble a pipeline of text components
    // selected by the options, then push the input file through it.
    // Returns 0 on success and 2 if the input file cannot be opened.
    int i, f = 0, opt;	    	    // |argv| index, count of file names seen, current option code
    char *cp;	    	    	    // Current file name argument

    @<Process command-line options@>;
    @<Parse command-line file arguments@>;
    
    // Components with automatic storage are always constructed; those declared
    // as pointers are allocated only in the branch which plumbs them in.
    streamSource insource;
    trimFilter tfilt;
    tabExpanderFilter tabf(8);
    flattenISOCharactersFilter *fiso;
    convertForeignCharacterSetToISOFilter *dosconv;
    auditFilter afilt(FormatWidth);
    sectionSeparatorSquid squiddley;
    etextBodyParserFilter bodyParser;
    stripSpecialCommandsFilter *ssc;
    LaTeXGenerationFilter *lf;
    PalmGenerationFilter *pf;
    streamSink *os = NULL;	    // Only pointer initialised, as it is tested before deletion below
    heatSink *hs;
    HTMLGenerationSink *hgs;

// Connect |component| downstream of the current end of the pipeline and make
// it the new end.  This expands to two statements, so use it only within braces.
#define Plumb(component)    *pipeEnd | component; pipeEnd = &component

    try {
    	insource.openFile(infile);
    } catch (invalid_argument &e) {
    	cerr << e.what() << "\n";
    	return 2;
    }
    textComponent *pipeEnd = &insource; // Pipeline begins with input file source
    if (dosCharacters) {
    	insource.setStripEOL(true);
    	dosconv = new convertForeignCharacterSetToISOFilter(cp850_to_ISO);
	Plumb(*dosconv);
    }
    // With \.{--check} the trim and tab expansion stages are omitted; presumably
    // so the audit sees the text exactly as supplied (to be confirmed)
    if (!checkText) {
    	Plumb(tfilt);	    // Trim trailing white space...
	Plumb(tabf);	    // ...and expand tabs to spaces.
    }
    if (specialStrip) {
    	ssc = new stripSpecialCommandsFilter;
	Plumb(*ssc);
    }
    if (flattenISOchars) {
    	fiso = new flattenISOCharactersFilter;
	Plumb(*fiso);
    }
    if (cleanText || checkText) {
	// Clean or check mode: audit the whole file; no section splitting or typesetting
	afilt.setAuditCriteria(auditFilter::trailing_blanks |
	    	    	       auditFilter::embedded_tabs |
			       auditFilter::exceeds_maximum_length |
			       auditFilter::invalid_characters |
			       auditFilter::special_commands_present |
			       (asciiOnly ? 0 : auditFilter::permit_8_bit_ISO_characters));
	Plumb(afilt);
	if (checkText) {
    	    hs = new heatSink;	    // Checking only: the audited text goes to a |heatSink|, not to |outfile|
	    Plumb(*hs);
	} else {
    	    os = new streamSink(outfile);   // Cleaning: audited text is written to |outfile|
	    Plumb(*os);
	}
    } else {
	// Typesetting mode: separate the sections, audit and parse the body,
	// then generate the requested output format
	Plumb(squiddley);   // ...and split the input file into sections.
	@<Configure prologue and epilogue processing@>;

    	if (asciiOnly) {
	    afilt.disableAuditCriteria(auditFilter::permit_8_bit_ISO_characters);
	}
	Plumb(afilt);   	// The Etext body section is audited for errors,
	afilt.disableAuditCriteria(auditFilter::special_commands_present);
	    	    	    // permitting special commands,
	Plumb(bodyParser);  // then fed to the body parser.

	@<Set up parser debugging if requested@>;

	// Each format tells the body parser which special commands to pass, by name
	if (ofmt == LaTeX) {
	    lf = new LaTeXGenerationFilter;
	    os = new streamSink(outfile);

    	    bodyParser.setSpecialFilter("LaTeX");
	    Plumb(*lf);
	    Plumb(*os);
	} else if (ofmt == HTML) {
	    // The HTML generator is itself a sink constructed from |outfile|; no |streamSink| follows it
	    hgs = new HTMLGenerationSink(outfile, singleFileHTML);

    	    bodyParser.setSpecialFilter("HTML");
    	    Plumb(*hgs);
	} else if (ofmt == Palm) {
	    pf = new PalmGenerationFilter;
	    os = new streamSink(outfile);

    	    bodyParser.setSpecialFilter("Palm");
	    Plumb(*pf);
	    Plumb(*os);
	}
    }

    insource.send();	    	    // Run the pipeline

    if (verbose) {
	cerr << insource.getSourceLineNumber() << " input lines processed.\n";
    }
    
    // Only the output |streamSink| is deleted explicitly (presumably to flush
    // and close the output file; to be confirmed).  The other components
    // allocated above are left for process exit to reclaim.
    if (os != NULL) {
    	delete os;
    }
    
    return 0;
}

@
The prologue and epilogue of the input file are usually discarded,
with only the body of the Etext being processed.  The user can,
by specifying the \.{--save-prologue} and/or \.{--save-epilogue}
options, each of which takes a file name argument, direct these
portions of the input to the designated file.  The same file name
may be specified for both the prologue and epilogue: the
|sectionSeparatorSquid| goes to great pains to ensure this will
work.

@<Configure prologue and epilogue processing@>=
    textComponent *prodest = NULL;  // Prologue sink, retained so the epilogue can share it

    if (!savePrologueFile.empty()) {
    	prodest = new streamSink(savePrologueFile);
    	squiddley.setPrologueProcessor(prodest);
    }
    if (!saveEpilogueFile.empty()) {
    	// Same file named for both sections: reuse the prologue sink
    	// instead of opening the file a second time
    	textComponent *epidest = prodest;

    	if (saveEpilogueFile != savePrologueFile) {
	    epidest = new streamSink(saveEpilogueFile);
	}
    	squiddley.setEpilogueProcessor(epidest);
    }

@
If the \.{--debug-parser} option is set, we insert a |teeSquid|
into the pipeline after the |etextBodyParserFilter| with its
secondary output directed to a |parserDiagnosticFilter| which
is in turn plumbed to a |streamSink| which writes the parser
diagnostic information on the |debugParserFile| given as the
argument to the option.

@<Set up parser debugging if requested@>=
    if (debugParser) {
	// Side branch: diagnostic formatter writing to the debug log file
	parserDiagnosticFilter *diagFormatter = new parserDiagnosticFilter;
	streamSink *diagSink = new streamSink(debugParserFile);
	// The tee is created with the formatter as its secondary destination
	teeSquid *diagTee = new teeSquid(diagFormatter);

	*diagFormatter | *diagSink;
	Plumb(*diagTee);    	    // Tee becomes the new end of the main pipeline
    }
    
@** Application plumbing.

Every application needs a modicum of clanking machinery beneath the waterline
to get its job done and conform to contemporary community standards.  I've
relegated these gory and boring details to the end, where you're
most sincerely encouraged to ignore them.

@
The following include files provide access to system and
library components.

@<System include files@>=
#include "config.h"

#include <iostream>
#include <iomanip>
#include <fstream>
#include <sstream>
#include <cstdlib>
#include <exception>
#include <stdexcept>
#include <string>
#include <vector>
#include <queue>
#include <map>
#include <algorithm>
using namespace std;

#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <time.h>
#include <assert.h>

//  Twiddle definitions if building on WIN32 to avoid need to reconfigure
#ifdef WIN32
#ifdef HAVE_UNISTD_H
#undef HAVE_UNISTD_H
#endif
#ifdef HAVE_STAT
#undef HAVE_STAT
#endif
#define __GNU_LIBRARY__ 1
#define __STDC__ 1
#endif

#ifdef HAVE_STAT
#include <sys/stat.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif

#include "getopt.h"     // Use our own |getopt|, which supports |getopt_long|

@
Here are the global variables we use to keep track of command
line options.

@<Global variables@>=

typedef enum { LaTeX, HTML, Palm } outputFormat;    // Output formats selectable on the command line

static outputFormat ofmt = LaTeX;   // Output format; \LaTeX\ unless \.{-h} or \.{-p} is given
static bool asciiOnly = false;	    // Permit only 7-bit ASCII in input
static bool babelon = false;	    // Use \LaTeX\ \.{babel} package
static string babelang;     	    // Language specification for \.{babel}
static bool singleFileHTML = false; // Generate single file for HTML output
static bool debugParser = false;    // Generate debug output from body parser ?
static bool dosCharacters = false;  // Translate MS-DOS characters to ISO ?
static string debugParserFile = ""; // Log file for parser debugging output
static bool flattenISOchars = false;	// Flatten ISO 8859-1 8-bit codes to ASCII
static bool frenchPunct = false;    // Use nonbreaking spaces for French punctuation
static string savePrologueFile = ""; // File to save prologue; empty means discard it
static string saveEpilogueFile = ""; // File to save epilogue; empty means discard it
static bool specialStrip = false;   // Strip special commands
static bool strictCompliance = false; // Strict compliance with XHTML 1.0 Strict DTD
static bool unicodeChars = false;   // Use Unicode text entities for special characters
static bool cleanText = false;	    // Clean text for shipment (de-tab, trim trailing spaces)
static bool checkText = false;	    // Check text for shipment
static bool verbose = false;	    // Print verbose processing information

@ Procedure |usage|
prints how-to-call information.  This serves as a reference for the
option processing code which follows.  Don't forget to update
|usage| when you add an option!

@<Global functions@>=
static void usage(void)
{
    cout << PRODUCT << "  --  Typeset ISO 8859 Latin-1 Etext.  Call\n";
    cout << "           with "<< PRODUCT << " [input [output]]\n";
    cout << "\n";
    cout << "Options:\n";
    cout << "    --ascii-only           Permit only 7-bit ASCII characters in input\n";
    cout << "    --babel lang           Use LaTeX babel package for lang\n";
    cout << "    --check                Check text for publication\n";
    cout << "    --clean                Clean: expand tabs, remove trailing white space\n";
    cout << "    --copyright            Print copyright information\n";
    cout << "    --debug-parser file    Write parser debugging log to file\n";
    cout << "    --dos-characters       Translate MS-DOS characters to ISO 8859\n";
    cout << "    --flatten-iso          Flatten ISO 8859-1 8-bit codes to ASCII\n";
    cout << "    --french-punctuation   Use nonbreaking spaces for French punctuation\n";
    cout << "    --help, -u             Print this message\n";
    cout << "    --html, -h             Generate HTML\n";
    cout << "    --latex, -l            Generate LaTeX\n";
    cout << "    --palm, -p             Generate Palm" @, REGISTERED_SIGN @, " Reader document\n";
    cout << "    --save-epilogue file   Save epilogue in file\n";
    cout << "    --save-prologue file   Save prologue in file\n";
    cout << "    --single-file          Single file for HTML output\n";
    cout << "    --special-strip        Strip format-specific special commands\n";
    cout << "    --strict               Strict compliance with XHTML 1.0 Strict DTD\n";
    cout << "    --unicode              Use XHTML Unicode text entities for special characters\n";
    cout << "    --verbose, -v          Print processing information\n";
    cout << "    --version              Print version number\n";
    cout << "\n";
    cout << "by John Walker\n";
    cout << "http://www.fourmilab.ch/\n";
}

@
We use |getopt_long| to process command line options.  This
permits aggregation of single letter options without arguments and
both \.{-d}{\it arg} and \.{-d} {\it arg} syntax.  Long options,
preceded by \.{--}, are provided as alternatives for all single
letter options and are used exclusively for less frequently used
facilities.

@<Process command-line options@>=
    // Long option table.  Each entry gives the option name, 1 if the option
    // takes an argument (0 if not), an unused flag pointer, and the code
    // |getopt_long| returns for it.  Options without a single letter
    // equivalent are assigned codes from 200 upward.
    static const struct option long_options[] = {@/
    	{ "ascii-only", 0, NULL, 210 },@/
    	{ "babel", 1, NULL, 202 },@/
	{ "check", 0, NULL, 209 },@/
	{ "clean", 0, NULL, 208 },@/
    	{ "copyright", 0, NULL, 200 },@/
	{ "debug-parser", 1, NULL, 205 },@/
	{ "dos-characters", 0, NULL, 213 },@/
	{ "flatten-iso", 0, NULL, 212 },@/
	{ "french-punctuation", 0, NULL, 203 },@/
	{ "help", 0, NULL, 'u' },@/
	{ "html", 0, NULL, 'h' },@/
	{ "latex", 0, NULL, 'l' },@/
	{ "palm", 0, NULL, 'p' },@/
	{ "save-epilogue", 1, NULL, 206 },@/
	{ "save-prologue", 1, NULL, 207 },@/
	{ "single-file", 0, NULL, 204 },@/
	{ "special-strip", 0, NULL, 211 },@/
	{ "strict", 0, NULL, 214 },@/
	{ "unicode", 0, NULL, 215 },@/
	{ "verbose", 0, NULL, 'v' },@/
	{ "version", 0, NULL, 201 },@/
	{ 0, 0, 0, 0 }@/
    };
    int option_index = 0;
    
    while ((opt = getopt_long(argc, argv, "hlpuv", long_options, &option_index)) != -1) {
        switch (opt) {
	    case 210:	    	    // \.{--ascii-only}  Permit only 7-bit ASCII characters in input
	    	asciiOnly = true;
		break;
		
    	    case 202:	    	    // \.{--babel} {\it language}  Use \.{babel} package with \LaTeX
	    	babelon = true;
		babelang = optarg;
		break;
		
	    case 209:	    	    // \.{--check}  Check complete text ready for publication
	    	checkText = true;
		break;
		
	    case 208:	    	    // \.{--clean}  Expand tabs, trim trailing white space
	    	cleanText = true;
		break;

            case 200:	    	    // \.{--copyright}  Print copyright information
                cout << "This program is in the public domain.\n";
                return 0;
		
    	    case 205:	    	    // \.{--debug-parser} {\it file}  Write parser debug output {\it file}
	    	debugParser = true;
		debugParserFile = optarg;
		break;
		
	    case 213:	    	    // \.{--dos-characters}  Translate MS-DOS character set to ISO 8859-1
	    	dosCharacters = true;
		break;
		
	    case 212:	    	    // \.{--flatten-iso}  Flatten ISO 8859-1 8-bit codes to ASCII
	    	flattenISOchars = true;
		break;
	
    	    case 203:	    	    // \.{--french-punctuation}  French-style spacing for punctuation
               frenchPunct = true;
               break;

    	    case 'h':	    	    // \.{-h}, \.{--html}  Generate HTML output
               ofmt = HTML;
               break;
	       
	    case 'l':	    	    // \.{-l}, \.{--latex}  Generate \LaTeX\ output
	    	ofmt = LaTeX;
		break;
	       
	    case 'p':	    	    // \.{-p}, \.{--palm}  Generate Palm Reader document
	    	ofmt = Palm;
		break;
		
	    case 206:	    	    // \.{--save-epilogue} {\it file}  Save epilogue in {\it file}
	    	saveEpilogueFile = optarg;
		break;
		
	    case 207:	    	    // \.{--save-prologue} {\it file}  Save prologue in {\it file}
	    	savePrologueFile = optarg;
		break;
		
	    case 204:	    	    // \.{--single-file}  Single file HTML output
	    	singleFileHTML = true;
		break;
		
	    case 211:	    	    // \.{--special-strip}  Strip special commands
	    	specialStrip = true;
		break;
	
    	    case 214:	    	    // \.{--strict}  Strict compliance with XHTML 1.0 Strict DTD
               strictCompliance = true;
               break;
	
    	    case 215:	    	    // \.{--unicode}  Use XHTML Unicode text entities for special chars
               unicodeChars = true;
               break;

            case 'u':	    	    // \.{-u}, \.{--help}  Print how-to-call information
                usage();
                return 0;

            case '?':	    	    // Unrecognised option or missing option argument
                usage();
                return 2;   	    // Fail with the status used for other command line errors
		
	    case 'v':	    	    // \.{-v}, \.{--verbose}  Print processing information
	    	verbose = true;
		break;

            case 201:	    	    // \.{--version}  Print version information
                cout << PRODUCT @, " " @, VERSION @, "\n";
                cout << "Last revised: " @, REVDATE @, "\n";
                cout << "The latest version is always available\n";
                cout << "at http://www.fourmilab.ch/etexts/etset\n";
		cout << "Please report bugs to bugs@@fourmilab.ch\n";
                return 0;
		
	    default:
	    	cerr << "***Internal error: unhandled case " << opt << " in option processing.\n";
		return 1;
        }
    }

@
Some more global variables to keep track of file name arguments on
the command line$\ldots$.

@<Global variables@>=
static string infile = "-", 	    // Input file name; "-" means standard input
    	      outfile = "-";	    // Output file name; "-" means standard output

@
If no file names are specified on the command line, we act as a
filter from standard input to standard output.  An input
and output file name may be specified.  For HTML format output,
both input and output file names must be given.

@<Parse command-line file arguments@>=
    // Arguments remaining after the options are file names; |f| counts
    // how many have been accepted so far.
    for (i = optind; i < argc; i++) {
        cp = argv[i];
        if (f == 0) {
	    infile = cp;    	    // First name: input file
	} else if (f == 1) {
	    outfile = cp;   	    // Second name: output file
	} else {
            cerr << "Too many file names arguments specified.\n";
	    return 2;
	}
	f++;
    }

    // HTML output requires an explicitly named output file
    if (ofmt == HTML) {
    	if ((f < 2) || (outfile == "-")) {
            cerr << "Must specify output file name for HTML.\n";
            return 2;
	}
    }
    
    @<Check for input and output files the same@>;
    
@
One of the most common (and disastrous) fat-fingers in invoking this
program is specifying the same name for the input and output file.
If undetected, the open of the output file will truncate the
input file, destroying it.  Here we check for this condition and,
if it obtains, bail before doing any damage.  We don't perform
this check for HTML format output, since HTML generates its own
file names based on the specified {\it basename}, and it's
implausible that the input file would have the extension
\.{.html}.  Obviously, if input or output is standard I/O,
we needn't perform this check.

On systems with a Unix-like |stat| function, if the input and
output files both exist, we compare the device and inode numbers
to check for aliased file names (due to hard or symbolic links,
or a specification such as ``\.{./zot.txt}'').

@<Check for input and output files the same@>=
    if ((ofmt != HTML) && (f == 2) && (infile != "-") && (outfile != "-")) {
    	bool io_dup = (infile == outfile);  // File names lexically equal ?

#ifdef HAVE_STAT
    	if (!io_dup) {
	    // Names differ: see whether both exist and refer to the same
	    // device and inode, which reveals aliased names
	    struct stat instat, outstat;

	    io_dup = (stat(infile.c_str(), &instat) == 0) &&
	    	     (stat(outfile.c_str(), &outstat) == 0) &&
		     (instat.st_dev == outstat.st_dev) &&
		     (instat.st_ino == outstat.st_ino);
	}
#endif

	if (io_dup) {
	    cerr << "Input and output may not be the same file.\n";
	    return 2;
	}
    }
    
@** Character set definitions and translation tables.

The following sections define the character set used in the
program and provide translation tables among various representations
used in formats we emit.

@*1 ISO 8859-1 special characters.

We use the following definitions where ISO 8859-1 characters are required
as strings in the program.  Most modern compilers have no difficulty with
such characters embedded in string literals, but it's surprisingly
difficult to arrange for Plain \TeX\ (as opposed to \LaTeX) to
render them correctly.  Since CWEB produces Plain \TeX, the path of
least resistance is to use escapes for these characters, which
also guarantees the generated documentation will work on even the
most basic \TeX\ installation.  Characters are given their Unicode
names with spaces and hyphens replaced by underscores.  Characters
defined with single quotes as |char| have names beginning with
|C_|.

@d REGISTERED_SIGN  	    	    	    	"\xAE"
@d C_LEFT_POINTING_DOUBLE_ANGLE_QUOTATION_MARK	0xAB
@d C_RIGHT_POINTING_DOUBLE_ANGLE_QUOTATION_MARK	0xBB
@d RIGHT_POINTING_DOUBLE_ANGLE_QUOTATION_MARK	"\xBB"

@*1 LaTeX representation of ISO graphic characters.

The following table is indexed by ISO codes 161 to 255, and gives
the \LaTeX\ rendering of the ISO character.  Using this table
permits compatibility with even the oldest \LaTeX\ versions but,
if you're preparing documents in languages which extensively use
the ISO character set, it's much wiser to use the
\.{--babel}~{\it language} option to enable the \.{babel} package
for the specified {\it language}.

@<Global variables@>=
// \LaTeX\ markup for ISO codes |0xA1| (161) through |0xFF| (255), 95 entries
// in all: subscript the table with the character code minus 161.
static const char * const texform[] = {
    "!`", "\\makebox{\\rm\\rlap/c}", "\\pounds", "$\\otimes$",
    "\\makebox{\\rm\\rlap Y{\\hspace*{0.07em}\\scriptsize =}}", "$|$",
    "{\\S}", "\\\"{}", "{\\copyright}", "\\b{a}",
    "{\\raisebox{0.3ex}{\\tiny$\\ll$~}}",
    "$\\neg$", "$-$",
    "{\\ooalign{\\hfil\\raise.07ex\\hbox{\\sc r}\\hfil\\crcr\\mathhexbox20D}}",
    "-", "$^{\\circ}$", "$\\pm$", "$^2$", "$^3$", "\\'{}", "$\\mu$",
    "{\\P}", "$\\cdot$", "\\c{}", "$^1$", "\\b{o}",
    "{\\raisebox{0.3ex}{\\tiny~$\\gg$}}",
    "{\\small $1/4$}", "{\\small $1/2$}", "{\\small $3/4$}", "?`",
    "\\`{A}", "\\'{A}", "\\^{A}", "\\~{A}", "\\\"{A}", "{\\AA}",
    "{\\AE}", "\\c{C}", "\\`{E}", "\\'{E}", "\\^{E}", "\\\"{E}",
    "\\`{I}", "\\'{I}", "\\^{I}", "\\\"{I}", "Eth", "\\~{N}",
    "\\`{O}", "\\'{O}", "\\^{O}", "\\~{O}", "\\\"{O}", "$\\times$",
    "{\\O}", "\\`{U}", "\\'{U}", "\\^{U}", "\\\"{U}", "\\'{Y}",
    "Thorn", "{\\ss}", "\\`{a}", "\\'{a}", "\\^{a}", "\\~{a}",
    "\\\"{a}", "{\\aa}", "{\\ae}", "\\c{c}", "\\`{e}", "\\'{e}",
    "\\^{e}", "\\\"{e}", "\\`{\\i}", "\\'{\\i}", "\\^{\\i}",
    "\\\"{\\i}", "eth", "\\~{n}", "\\`{o}", "\\'{o}", "\\^{o}",
    "\\~{o}", "\\\"{o}", "$\\div$", "{\\o}", "\\`{u}", "\\'{u}",
    "\\^{u}", "\\\"{u}", "\\'{y}", "thorn", "\\\"{y}"
};

@*1 MS-DOS code page 850 to ISO translation table.

The following table translates characters in the MS-DOS
code page 850 set to the ISO 8859-1 set we work with.  This
table is included from an external file because its comments
use ISO characters which are painful to express in Plain
\TeX\ and, in any case, the translation table is of
interest only to historians, geeks, and folks chasing
bugs therein.  The include file defines the array
|cp850_to_ISO| which may be used as a translation table
with |convertForeignCharacterSetToISOFilter|.

@<Global variables@>=

#include "cp850.h"

@*1 Flat 7-bit ASCII approximation of ISO characters.

The following table is indexed by ISO codes 160 to 255
(|0xA0|--|0xFF|), and gives the flat ASCII rendering of
each ISO character.  For accented characters, these are
simply the characters with the accents removed; for more
esoteric characters the translations may be rather
eccentric.

@<Global variables@>=
		      /* Latin 1/Unicode Hex   Description */
static const char *const flattenISO[] = {
    " ",                              /* |0xA0| Non-breaking space */
    "!",                              /* |0xA1| Spanish open exclamation */
    "cents",                          /* |0xA2| Cent sign */
    "GBP",                            /* |0xA3| Pounds Sterling */
    "$",                              /* |0xA4| Universal currency symbol */
    "JPY",                            /* |0xA5| Japanese Yen */
    "|",                              /* |0xA6| Broken vertical bar */
    "Sec.",                           /* |0xA7| Section sign */
    "''",                             /* |0xA8| diaeresis */
    "(C)",                            /* |0xA9| Copyright */
    "a",                              /* |0xAA| Spanish feminine ordinal indicator */
    "<<",                             /* |0xAB| Left pointing guillemet */
    "NOT",                            /* |0xAC| Logical not */
    "",                               /* |0xAD| Soft (discretionary) hyphen */
    "(R)",                            /* |0xAE| Registered trademark */
    "-",                              /* |0xAF| Overbar */
    "o",                              /* |0xB0| Degree sign */
    "+/-",                            /* |0xB1| Plus or minus */
    "^2",                             /* |0xB2| Superscript 2 */
    "^3",                             /* |0xB3| Superscript 3 */
    "'",                              /* |0xB4| Acute accent */
    "mu",                             /* |0xB5| Micro sign */
    "PP.",                            /* |0xB6| Paragraph sign */
    ".",                              /* |0xB7| Middle dot */
    ",",                              /* |0xB8| Spacing cedilla */
    "^1",                             /* |0xB9| Superscript 1 */
    "o",                              /* |0xBA| Spanish masculine ordinal indicator */
    ">>",                             /* |0xBB| Right pointing guillemet */
    "1/4",                            /* |0xBC| Fraction one quarter */
    "1/2",                            /* |0xBD| Fraction one half */
    "3/4",                            /* |0xBE| Fraction three quarters */
    "?",                              /* |0xBF| Spanish open question */
    "A",                              /* |0xC0| Accented capital A grave */
    "A",                              /* |0xC1|                    acute */
    "A",                              /* |0xC2|                    circumflex */
    "A",                              /* |0xC3|                    tilde */
    "A",                              /* |0xC4|                    diaeresis */
    "A",                              /* |0xC5| Capital A ring / Angstrom symbol */
    "Ae",                             /* |0xC6| Capital Ae */
    "C",                              /* |0xC7| Capital C cedilla */
    "E",                              /* |0xC8| Accented capital E grave */
    "E",                              /* |0xC9|                    acute */
    "E",                              /* |0xCA|                    circumflex */
    "E",                              /* |0xCB|                    diaeresis */
    "I",                              /* |0xCC| Accented capital I grave */
    "I",                              /* |0xCD|                    acute */
    "I",                              /* |0xCE|                    circumflex */
    "I",                              /* |0xCF|                    diaeresis */
    "Th",                             /* |0xD0| Capital Eth */
    "N",                              /* |0xD1| Capital N tilde */
    "O",                              /* |0xD2| Accented capital O grave */
    "O",                              /* |0xD3|                    acute */
    "O",                              /* |0xD4|                    circumflex */
    "O",                              /* |0xD5|                    tilde */
    "O",                              /* |0xD6|                    diaeresis */
    "x",                              /* |0xD7| Multiplication sign */
    "O",                              /* |0xD8| Capital O slash */
    "U",                              /* |0xD9| Accented capital U grave */
    "U",                              /* |0xDA|                    acute */
    "U",                              /* |0xDB|                    circumflex */
    "U",                              /* |0xDC|                    diaeresis */
    "Y",                              /* |0xDD| Capital Y acute */
    "Th",                             /* |0xDE| Capital thorn */
    "ss",                             /* |0xDF| German small sharp s */
    "a",                              /* |0xE0| Accented small a grave */
    "a",                              /* |0xE1|                  acute */
    "a",                              /* |0xE2|                  circumflex */
    "a",                              /* |0xE3|                  tilde */
    "a",                              /* |0xE4|                  diaeresis */
    "a",                              /* |0xE5| Small a ring */
    "ae",                             /* |0xE6| Small ae */
    "c",                              /* |0xE7| Small c cedilla */
    "e",                              /* |0xE8| Accented small e grave */
    "e",                              /* |0xE9|                  acute */
    "e",                              /* |0xEA|                  circumflex */
    "e",                              /* |0xEB|                  diaeresis */
    "i",                              /* |0xEC| Accented small i grave */
    "i",                              /* |0xED|                  acute */
    "i",                              /* |0xEE|                  circumflex */
    "i",                              /* |0xEF|                  diaeresis */
    "th",                             /* |0xF0| Small eth */
    "n",                              /* |0xF1| Small n tilde */
    "o",                              /* |0xF2| Accented small o grave */
    "o",                              /* |0xF3|                  acute */
    "o",                              /* |0xF4|                  circumflex */
    "o",                              /* |0xF5|                  tilde */
    "o",                              /* |0xF6|                  diaeresis */
    "/",                              /* |0xF7| Division sign */
    "o",                              /* |0xF8| Small o slash */
    "u",                              /* |0xF9| Accented small u grave */
    "u",                              /* |0xFA|                  acute */
    "u",                              /* |0xFB|                  circumflex */
    "u",                              /* |0xFC|                  diaeresis */
    "y",                              /* |0xFD| Small y acute */
    "th",                             /* |0xFE| Small thorn */
    "y"                               /* |0xFF| Small y diaeresis */
};

@q  Release History and Development Log  @>

@i log.w

@** Index.
The following is a cross-reference table for \.{etset}.
Single-character identifiers are not indexed, nor are
reserved words.  Underlined entries indicate where
an identifier was declared.
