// XmlParser.java: the main parser class. // NO WARRANTY! See README, and copyright below. // $Id: XmlParser.java,v 1.1 2000/09/15 16:15:52 malcolm Exp $ package com.microstar.xml; import java.io.BufferedInputStream; import java.io.EOFException; import java.io.InputStream; import java.io.Reader; import java.net.URL; import java.net.URLConnection; import java.util.Enumeration; import java.util.Hashtable; import java.util.Stack; /** * Parse XML documents and return parse events through call-backs. *
 * <p>You need to define a class implementing the <code>XmlHandler</code>
 * interface: an object belonging to this class will receive the
 * callbacks for the events.  (As an alternative to implementing
 * the full XmlHandler interface, you can simply extend the
 * <code>HandlerBase</code> convenience class.)
*
 * <p>Usage (assuming that <code>MyHandler</code> is your implementation
 * of the <code>XmlHandler</code> interface):
 *
 * <pre>
 * XmlHandler handler = new MyHandler();
 * XmlParser parser = new XmlParser();
 * parser.setHandler(handler);
 * try {
 *   parser.parse("http://www.host.com/doc.xml", null, null);
 * } catch (Exception e) {
 *   [do something interesting]
 * }
 * </pre>
 * <p>Alternatively, you can use the standard SAX interfaces
 * with the <code>SAXDriver</code> class as your entry point.
* @author Copyright (c) 1997, 1998 by Microstar Software Ltd.
 * @author Written by David Megginson &lt;dmeggins@microstar.com&gt;
* @version 1.1
* @see XmlHandler
* @see HandlerBase
* @see SAXDriver
*/
public class XmlParser {
//
// Performance switch.  When enabled, the parser uses special cheats that
// speed up the code (currently about 50%), but that may cause problems
// with future maintenance and add about 500 bytes to the class file.
//
private static final boolean USE_CHEATS = true;
//////////////////////////////////////////////////////////////////////
// Constructors.
////////////////////////////////////////////////////////////////////////
/**
 * Create a parser that does not yet have a handler attached.
 * <p>Register the object that should receive callbacks with
 * <code>setHandler</code>, then call one of the <code>parse</code>
 * methods.
 * @see #setHandler
 * @see #parse
 */
public XmlParser() {
    // Nothing to set up here.
}
/**
 * Register the object that will be notified of parsing events.
 * <p>The handler is only stored here; nothing is parsed until one of
 * the <code>parse</code> methods is called.
 * @param handler The handler to receive callback events.
 * @see #parse
 * @see XmlHandler
 */
public void setHandler(XmlHandler handler) {
    this.handler = handler;
}
/**
 * Parse the XML document found at a URI.
 * <p>A document may be parsed more than once, but only one thread
 * may call this method for an object at one time.
 * @param systemId The URI of the document.
 * @param publicId The public identifier of the document, or null.
 * @param encoding The suggested encoding, or null if unknown.
 * @exception java.lang.Exception Any exception thrown by your
 *            own handlers, or any derivation of java.io.IOException
 *            thrown by the parser itself.
 */
public void parse(String systemId, String publicId, String encoding)
    throws java.lang.Exception {
    // No pre-opened source is supplied: both the reader and the byte
    // stream are passed to doParse as null.
    final Reader noReader = null;
    final InputStream noStream = null;
    doParse(systemId, publicId, noReader, noStream, encoding);
}

/**
 * Parse an XML document from a byte stream.
 *
The URI that you supply will become the base URI for * resolving relative links, but Ælfred will actually read * the document from the supplied input stream. *
You may parse a document more than once, but only one thread * may call this method for an object at one time. * @param systemId The base URI of the document, or null if not * known. * @param publicId The public identifier of the document, or null * if not known. * @param stream A byte input stream. * @param encoding The suggested encoding, or null if unknown. * @exception java.lang.Exception Any exception thrown by your * own handlers, or any derivation of java.io.IOException * thrown by the parser itself. */ public void parse (String systemId, String publicId, InputStream stream, String encoding) throws java.lang.Exception { doParse(systemId, publicId, null, stream, encoding); } /** * Parse an XML document from a character stream. *
The URI that you supply will become the base URI for * resolving relative links, but Ælfred will actually read * the document from the supplied input stream. *
You may parse a document more than once, but only one thread * may call this method for an object at one time. * @param systemId The base URI of the document, or null if not * known. * @param publicId The public identifier of the document, or null * if not known. * @param reader A character stream. * @exception java.lang.Exception Any exception thrown by your * own handlers, or any derivation of java.io.IOException * thrown by the parser itself. */ public void parse (String systemId, String publicId, Reader reader) throws java.lang.Exception { doParse(systemId, publicId, reader, null, null); } private synchronized void doParse (String systemId, String publicId, Reader reader, InputStream stream, String encoding) throws java.lang.Exception { basePublicId = publicId; baseURI = systemId; baseReader = reader; baseInputStream = stream; initializeVariables(); // Set the default entities here. setInternalEntity(intern("amp"), "&"); setInternalEntity(intern("lt"), "<"); setInternalEntity(intern("gt"), ">"); setInternalEntity(intern("apos"), "'"); setInternalEntity(intern("quot"), """); if (handler != null) { handler.startDocument(); } pushURL("[document]", basePublicId, baseURI, baseReader, baseInputStream, encoding); parseDocument(); if (handler != null) { handler.endDocument(); } cleanupVariables(); } //////////////////////////////////////////////////////////////////////// // Constants. //////////////////////////////////////////////////////////////////////// // // Constants for element content type. // /** * Constant: an element has not been declared. * @see #getElementContentType */ public final static int CONTENT_UNDECLARED = 0; /** * Constant: the element has a content model of ANY. * @see #getElementContentType */ public final static int CONTENT_ANY = 1; /** * Constant: the element has declared content of EMPTY. * @see #getElementContentType */ public final static int CONTENT_EMPTY = 2; /** * Constant: the element has mixed content. 
* @see #getElementContentType */ public final static int CONTENT_MIXED = 3; /** * Constant: the element has element content. * @see #getElementContentType */ public final static int CONTENT_ELEMENTS = 4; // // Constants for the entity type. // /** * Constant: the entity has not been declared. * @see #getEntityType */ public final static int ENTITY_UNDECLARED = 0; /** * Constant: the entity is internal. * @see #getEntityType */ public final static int ENTITY_INTERNAL = 1; /** * Constant: the entity is external, non-XML data. * @see #getEntityType */ public final static int ENTITY_NDATA = 2; /** * Constant: the entity is external XML data. * @see #getEntityType */ public final static int ENTITY_TEXT = 3; // // Constants for attribute type. // /** * Constant: the attribute has not been declared for this element type. * @see #getAttributeType */ public final static int ATTRIBUTE_UNDECLARED = 0; /** * Constant: the attribute value is a string value. * @see #getAttributeType */ public final static int ATTRIBUTE_CDATA = 1; /** * Constant: the attribute value is a unique identifier. * @see #getAttributeType */ public final static int ATTRIBUTE_ID = 2; /** * Constant: the attribute value is a reference to a unique identifier. * @see #getAttributeType */ public final static int ATTRIBUTE_IDREF = 3; /** * Constant: the attribute value is a list of ID references. * @see #getAttributeType */ public final static int ATTRIBUTE_IDREFS = 4; /** * Constant: the attribute value is the name of an entity. * @see #getAttributeType */ public final static int ATTRIBUTE_ENTITY = 5; /** * Constant: the attribute value is a list of entity names. * @see #getAttributeType */ public final static int ATTRIBUTE_ENTITIES = 6; /** * Constant: the attribute value is a name token. * @see #getAttributeType */ public final static int ATTRIBUTE_NMTOKEN = 7; /** * Constant: the attribute value is a list of name tokens. 
* @see #getAttributeType */ public final static int ATTRIBUTE_NMTOKENS = 8; /** * Constant: the attribute value is a token from an enumeration. * @see #getAttributeType */ public final static int ATTRIBUTE_ENUMERATED = 9; /** * Constant: the attribute is the name of a notation. * @see #getAttributeType */ public final static int ATTRIBUTE_NOTATION = 10; // // When the class is loaded, populate the hash table of // attribute types. // /** * Hash table of attribute types. */ private static Hashtable attributeTypeHash; static { attributeTypeHash = new Hashtable(); attributeTypeHash.put("CDATA", new Integer(ATTRIBUTE_CDATA)); attributeTypeHash.put("ID", new Integer(ATTRIBUTE_ID)); attributeTypeHash.put("IDREF", new Integer(ATTRIBUTE_IDREF)); attributeTypeHash.put("IDREFS", new Integer(ATTRIBUTE_IDREFS)); attributeTypeHash.put("ENTITY", new Integer(ATTRIBUTE_ENTITY)); attributeTypeHash.put("ENTITIES", new Integer(ATTRIBUTE_ENTITIES)); attributeTypeHash.put("NMTOKEN", new Integer(ATTRIBUTE_NMTOKEN)); attributeTypeHash.put("NMTOKENS", new Integer(ATTRIBUTE_NMTOKENS)); attributeTypeHash.put("NOTATION", new Integer(ATTRIBUTE_NOTATION)); } // // Constants for supported encodings. // private final static int ENCODING_UTF_8 = 1; private final static int ENCODING_ISO_8859_1 = 2; private final static int ENCODING_UCS_2_12 = 3; private final static int ENCODING_UCS_2_21 = 4; private final static int ENCODING_UCS_4_1234 = 5; private final static int ENCODING_UCS_4_4321 = 6; private final static int ENCODING_UCS_4_2143 = 7; private final static int ENCODING_UCS_4_3412 = 8; // // Constants for attribute default value. // /** * Constant: the attribute is not declared. * @see #getAttributeDefaultValueType */ public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 0; /** * Constant: the attribute has a literal default value specified. 
* @see #getAttributeDefaultValueType * @see #getAttributeDefaultValue */ public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 1; /** * Constant: the attribute was declared #IMPLIED. * @see #getAttributeDefaultValueType */ public final static int ATTRIBUTE_DEFAULT_IMPLIED = 2; /** * Constant: the attribute was declared #REQUIRED. * @see #getAttributeDefaultValueType */ public final static int ATTRIBUTE_DEFAULT_REQUIRED = 3; /** * Constant: the attribute was declared #FIXED. * @see #getAttributeDefaultValueType * @see #getAttributeDefaultValue */ public final static int ATTRIBUTE_DEFAULT_FIXED = 4; // // Constants for input. // private final static int INPUT_NONE = 0; private final static int INPUT_INTERNAL = 1; private final static int INPUT_EXTERNAL = 2; private final static int INPUT_STREAM = 3; private final static int INPUT_BUFFER = 4; private final static int INPUT_READER = 5; // // Flags for reading literals. // private final static int LIT_CHAR_REF = 1; private final static int LIT_ENTITY_REF = 2; private final static int LIT_PE_REF = 4; private final static int LIT_NORMALIZE = 8; // // Flags for parsing context. // private final static int CONTEXT_NONE = 0; private final static int CONTEXT_DTD = 1; private final static int CONTEXT_ENTITYVALUE = 2; private final static int CONTEXT_ATTRIBUTEVALUE = 3; ////////////////////////////////////////////////////////////////////// // Error reporting. ////////////////////////////////////////////////////////////////////// /** * Report an error. * @param message The error message. * @param textFound The text that caused the error (or null). 
* @see XmlHandler#error * @see #line */ void error (String message, String textFound, String textExpected) throws java.lang.Exception { errorCount++; if (textFound != null) { message = message + " (found \"" + textFound + "\")"; } if (textExpected != null) { message = message + " (expected \"" + textExpected + "\")"; } if (handler != null) { String uri = null; if (externalEntity != null) { uri = externalEntity.getURL().toString(); } handler.error(message, uri, line, column); } } /** * Report a serious error. * @param message The error message. * @param textFound The text that caused the error (or null). */ void error (String message, char textFound, String textExpected) throws java.lang.Exception { error(message, new Character(textFound).toString(), textExpected); } ////////////////////////////////////////////////////////////////////// // Major syntactic productions. ////////////////////////////////////////////////////////////////////// /** * Parse an XML document. *
* [1] document ::= prolog element Misc* **
This is the top-level parsing function for a single XML * document. As a minimum, a well-formed document must have * a document element, and a valid document must have a prolog * as well. */ void parseDocument () throws java.lang.Exception { char c; parseProlog(); require('<'); parseElement(); try { parseMisc(); //skip all white, PIs, and comments c=readCh(); //if this doesn't throw an exception... error("unexpected characters after document end",c,null); } catch (EOFException e) {return;} } /** * Skip a comment. *
 * <pre>
 * [18] Comment ::= '&lt;!--' ((Char - '-') | ('-' (Char - '-')))* "--&gt;"
 * </pre>
 * <p>(The <code>&lt;!--</code> has already been read.)
 */
void parseComment() throws java.lang.Exception {
    // The opening delimiter has already been read, so all that is left
    // is to skip ahead to the closing one.
    final String commentEnd = "-->";
    skipUntil(commentEnd);
}
/**
 * Parse a processing instruction and report it to the handler.
 * <pre>
 * [19] PI ::= '&lt;?' Name (S (Char* - (Char* '?&gt;' Char*)))? '?&gt;'
 * </pre>
 * <p>(The <code>&lt;?</code> has already been read.)
 * <p>An XML processing instruction must begin with
 * a Name, which is the instruction's target.
 */
void parsePI() throws java.lang.Exception {
    final String target = readNmtoken(true);

    // Unless the instruction closes right after the target, whitespace
    // must follow, and then everything up to the closing delimiter is
    // parsed.
    final boolean closedImmediately = tryRead("?>");
    if (!closedImmediately) {
        requireWhitespace();
        parseUntil("?>");
    }

    if (handler == null) {
        return;
    }
    handler.processingInstruction(target, dataBufferToString());
}

/**
 * Parse a CDATA marked section.
 *
* [20] CDSect ::= CDStart CData CDEnd * [21] CDStart ::= '<![CDATA[' * [22] CData ::= (Char* - (Char* ']]>' Char*)) * [23] CDEnd ::= ']]>' **
(The '<![CDATA[' has already been read.) *
Note that this just appends characters to the dataBuffer, * without actually generating an event. */ void parseCDSect () throws java.lang.Exception { parseUntil("]]>"); } /** * Parse the prolog of an XML document. *
* [24] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)? **
There are a couple of tricks here. First, it is necessary to * declare the XML default attributes after the DTD (if present) * has been read. Second, it is not possible to expand general * references in attribute value literals until after the entire * DTD (if present) has been parsed. *
We do not look for the XML declaration here, because it is * handled by pushURL(). * @see pushURL */ void parseProlog () throws java.lang.Exception { parseMisc(); if (tryRead(" * [25] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' * [26] VersionInfo ::= S 'version' Eq ('"1.0"' | "'1.0'") * [33] SDDecl ::= S 'standalone' Eq "'" ('yes' | 'no') "'" * | S 'standalone' Eq '"' ("yes" | "no") '"' * [78] EncodingDecl ::= S 'encoding' Eq QEncoding * *
([80] to [82] are also significant.) *
(The <?xml
and whitespace have already been read.)
*
TODO: validate value of standalone. * @see #parseTextDecl * @see #checkEncoding */ void parseXMLDecl (boolean ignoreEncoding) throws java.lang.Exception { String version; String encodingName = null; String standalone = null; // Read the version. require("version"); parseEq(); version = readLiteral(0); if (!version.equals("1.0")) { error("unsupported XML version", version, "1.0"); } // Try reading an encoding declaration. skipWhitespace(); if (tryRead("encoding")) { parseEq(); encodingName = readLiteral(0); checkEncoding(encodingName, ignoreEncoding); } // Try reading a standalone declaration skipWhitespace(); if (tryRead("standalone")) { parseEq(); standalone = readLiteral(0); } skipWhitespace(); require("?>"); } /** * Parse the Encoding PI. *
 * <pre>
 * [78] EncodingDecl ::= S 'encoding' Eq QEncoding
 * [79] EncodingPI ::= '&lt;?xml' S 'encoding' Eq QEncoding S? '?&gt;'
 * [80] QEncoding ::= '"' Encoding '"' | "'" Encoding "'"
 * [81] Encoding ::= LatinName
 * [82] LatinName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
 * </pre>
 * <p>(The <code>&lt;?xml</code> and whitespace have already been read.)
 * @see #parseXMLDecl
 * @see #checkEncoding
 */
void parseTextDecl(boolean ignoreEncoding) throws java.lang.Exception {
    // The version is optional here; when present it must be followed
    // by whitespace before the encoding declaration.
    if (tryRead("version")) {
        parseEq();
        final String version = readLiteral(0);
        if (!version.equals("1.0")) {
            error("unsupported XML version", version, "1.0");
        }
        requireWhitespace();
    }

    // The encoding declaration is mandatory.
    require("encoding");
    parseEq();
    checkEncoding(readLiteral(0), ignoreEncoding);

    skipWhitespace();
    require("?>");
}
/**
* Check that the encoding specified makes sense.
*
Compare what the author has specified in the XML declaration * or encoding PI with what we have detected. *
This is also important for distinguishing among the various * 7- and 8-bit encodings, such as ISO-LATIN-1 (I cannot autodetect * those). * @param encodingName The name of the encoding specified by the user. * @see #parseXMLDecl * @see #parseTextDecl */ void checkEncoding (String encodingName, boolean ignoreEncoding) throws java.lang.Exception { encodingName = encodingName.toUpperCase(); if (ignoreEncoding) { return; } switch (encoding) { // 8-bit encodings case ENCODING_UTF_8: if (encodingName.equals("ISO-8859-1")) { encoding = ENCODING_ISO_8859_1; } else if (!encodingName.equals("UTF-8")) { error("unsupported 8-bit encoding", encodingName, "UTF-8 or ISO-8859-1"); } break; // 16-bit encodings case ENCODING_UCS_2_12: case ENCODING_UCS_2_21: if (!encodingName.equals("ISO-10646-UCS-2") && !encodingName.equals("UTF-16")) { error("unsupported 16-bit encoding", encodingName, "ISO-10646-UCS-2"); } break; // 32-bit encodings case ENCODING_UCS_4_1234: case ENCODING_UCS_4_4321: case ENCODING_UCS_4_2143: case ENCODING_UCS_4_3412: if (!encodingName.equals("ISO-10646-UCS-4")) { error("unsupported 32-bit encoding", encodingName, "ISO-10646-UCS-4"); } } } /** * Parse miscellaneous markup outside the document element and DOCTYPE * declaration. *
* [27] Misc ::= Comment | PI | S **/ void parseMisc () throws java.lang.Exception { while (true) { skipWhitespace(); if (tryRead("")) {parsePI();} else if (tryRead("