//-------------------------74-columns-wide-------------------------------|
/*
* Copyright (c) 2001 Extreme! Lab, Indiana University. All rights
* reserved.
*
* This software is open source.
* See the bottom of this file for the licence.
*
* $Id: XmlTokenizer.java,v 1.36 2001/08/15 21:37:24 aslom Exp $
*/
package sxt;
import java.io.*;
/**
* Simple XML Tokenizer (SXT) performs input stream tokenizing.
*
* Advantages:
* - utility class to simplify creation of XML parsers, especially
* suited for pull event model but can support also push (SAX2)
*
- small footprint: whole parser is in one file
*
- minimal memory utilization: does not use memory except for input
* and content buffer (that can grow in size)
*
- fast: all parsing done in one function (simple automata)
*
- supports most of XML 1.0 (except validation and external entities)
*
- low level: supports on demand parsing of Characters,
* CDSect, Comments, PIs etc.)
*
- parsed content: supports providing on demand
* parsed content to application (standard entities expanded
* all CDATA sections inserted, Comments and PIs removed)
* not for attribute values and element content
*
- mixed content: allow to dynamically disable mixed content
*
- small - total compiled size around 15K
*
*
* Limitations:
* - it is just a tokenizer - does not enforce grammar
*
- readName() is using Java identifier rules not XML
*
- does not parse DOCTYPE declaration (skips everything in [...])
*
*
* @author Aleksander Slominski [aslom@extreme.indiana.edu]
*/
public class XmlTokenizer {
// Enumeration of tokens that can be returned by next().
// The numeric values are distinct byte codes; callers compare the value
// returned by next() against these constants.
public final static byte END_DOCUMENT = 2;
// CONTENT is returned by next() only when parsed content is enabled
// (see setParseContent).
public final static byte CONTENT = 10;
// CHARACTERS is returned by next() only when paramNotifyCharacters is set.
public final static byte CHARACTERS = 20;
public final static byte CDSECT = 30;
public final static byte COMMENT = 40;
public final static byte DOCTYPE = 50;
public final static byte PI = 60;
public final static byte ENTITY_REF = 70;
public final static byte CHAR_REF = 75;
public final static byte ETAG_NAME = 110;
public final static byte EMPTY_ELEMENT = 111;
public final static byte STAG_END = 112;
public final static byte STAG_NAME = 120;
public final static byte ATTR_NAME = 122;
public final static byte ATTR_CHARACTERS = 124;
public final static byte ATTR_CONTENT = 127;
// Parameters controlling tokenizer behaviour.
// Each flag can be set individually; setNotifyAll(boolean) sets all of
// them to the same value.
public boolean paramNotifyCharacters;
public boolean paramNotifyComment;
public boolean paramNotifyCDSect;
public boolean paramNotifyDoctype;
public boolean paramNotifyPI;
public boolean paramNotifyCharRef;
public boolean paramNotifyEntityRef;
public boolean paramNotifyAttValue;
// Input buffer. setInput(char[]) replaces it with the caller's array.
public char[] buf = new char[BUF_SIZE];
// Read position in buf; next() uses (pos - 1) as the index of the
// character it has just obtained from more().
public int pos;
// Start (inclusive) and end (exclusive) of the current token's raw text
// in buf; getPosDesc() reads buf up to posEnd when building its fragment.
public int posStart;
public int posEnd;
// Reset to -1 by resetState().
// NOTE(review): presumably the index of the namespace colon in the last
// scanned name - confirm against readName().
public int posNsColon;
// NOTE(review): presumably the number of colons seen in the last scanned
// name - not written anywhere in the visible part of the file; confirm.
public int nsColonCount;
// Set to true by next() once an '&' or a non-whitespace character has
// been seen in element content; cleared by resetState().
public boolean seenContent;
// Set by next() to (pcEnd != pcStart) when it reports CONTENT; when true,
// getPosDesc() describes pc[..pcEnd) instead of buf[..posEnd).
public boolean parsedContent;
// Parsed content buffer and the start/end of the parsed content in it.
// Re-allocated by setInput(char[]) and setParseContent(true) when it is
// smaller than the input buffer size.
public char[] pc = new char[BUF_SIZE];
public int pcStart;
public int pcEnd;
/**
 * Create a tokenizer. The constructor body is empty: all state comes
 * from the field initializers, and input must be supplied afterwards
 * with one of the setInput(...) methods.
 */
public XmlTokenizer() {
}
/**
 * Prepare the tokenizer for reuse.
 * When the previous input came from a caller supplied char array
 * (reading is false) resize(...) is called first, with the soft limit
 * if one is set and with BUF_SIZE otherwise, and then all parsing
 * state is reset.
 */
public void reset() {
    final boolean inputWasCharArray = !reading;
    if(inputWasCharArray) {
        // release buffer that may have been used by setInput(char[])
        final int targetSize = (softLimit != -1) ? softLimit : BUF_SIZE;
        resize(targetSize);
    }
    resetState();
}
/**
 * Put every piece of parsing state back to its initial value.
 * The buffers themselves are kept; only positions, flags and the
 * automaton state are reset.
 */
private void resetState() {
    // automaton
    state = STATE_INIT;
    previousState = -1;
    backtracking = false;
    reachedEnd = false;
    reading = true;

    // input buffer bookkeeping
    bufSize = buf.length;
    bufEnd = 0;
    pos = 0;
    posStart = pos;
    posEnd = posStart;
    posNsColon = -1;

    // line/column tracking
    prevCh = '\0';
    posRow = 1;
    posCol = posRow;

    // parsed content bookkeeping
    pcStart = 0;
    pcEnd = pcStart;
    seenContent = false;
}
/**
 * Reset tokenizer state and use the given char array as the whole input.
 * The array is used directly as the input buffer (it is not copied), and
 * the parsed content buffer is re-allocated when parsed content is
 * enabled and the current one is smaller than the input.
 *
 * @param data characters to tokenize
 */
public void setInput(char[] data) {
    resetState();
    reading = false;
    buf = data;
    final int length = buf.length;
    bufEnd = length;
    bufSize = bufEnd;
    if(!paramPC || pc.length >= bufSize) {
        return; // parsed content disabled or buffer already big enough
    }
    pc = new char[bufSize];
    if(TEST_VALIDATING) {
        int idx = 0;
        while(idx < bufSize) {
            pc[idx] = 'X';
            ++idx;
        }
    }
}
/**
 * Reset tokenizer state and read all further input from the given reader.
 *
 * @param r source of characters to tokenize
 */
public void setInput(Reader r) {
    reset();
    reader = r;
    bufEnd = 0;
    reading = true;
}
/**
 * Switch notification of all XML content tokens on or off at once:
 * Characters, Comment, CDSect, Doctype, PI, EntityRef, CharRef, AttValue
 * (tokens for STag, ETag and Attribute are always sent).
 *
 * @param enable value assigned to every paramNotify* flag
 */
public void setNotifyAll(boolean enable) {
    paramNotifyCharacters
        = paramNotifyComment
        = paramNotifyCDSect
        = paramNotifyDoctype
        = paramNotifyPI
        = paramNotifyEntityRef
        = paramNotifyCharRef
        = paramNotifyAttValue
        = enable;
}
/**
 * Allow reporting parsed content for element content
 * and attribute content (no need to deal with low level
 * tokens such as in setNotifyAll).
 * When enabled, the parsed content buffer is re-allocated if it is
 * smaller than the current input buffer size.
 *
 * @param enable true to report parsed content
 */
public void setParseContent(boolean enable) {
    paramPC = enable;
    if(!enable) {
        return;
    }
    final boolean pcTooSmall = pc.length < bufSize;
    if(pcTooSmall) {
        pc = new char[bufSize];
    }
}
/**
 * Set support for mixed content. If mixed content is
 * disabled the tokenizer will do its best to ensure that
 * no element has a mixed content model; ignorable whitespace
 * will also not be reported as element content.
 *
 * @param enable true to allow mixed content, false to disallow it
 */
public void setMixedContent(boolean enable) {
    // the flag is stored inverted ("no mixed content")
    final boolean disallowMixed = !enable;
    paramNoMixContent = disallowMixed;
}
/**
 * Set soft limit on internal buffer size.
 * That means the suggested size that the tokenizer will try to keep.
 * The value -1 means "no soft limit".
 *
 * The limit may be changed after parsing has started (an earlier check
 * that forbade this is intentionally disabled).
 *
 * @param value new soft limit, or -1 to clear it
 * @throws XmlTokenizerException if the input is a char array (limits
 *         only apply to Reader input), or if a hard limit is set and
 *         value is bigger than half of it
 */
public void setSoftLimit(int value) throws XmlTokenizerException {
    if(!reading) {
        // message used to say "hard limit" (copied from setHardLimit)
        throw new XmlTokenizerException(
            "soft limit can not be set for char array input"
        );
    }
    // long arithmetic so that doubling a large value can not overflow
    if((value != -1) && (hardLimit != -1) && (2L * value > hardLimit)) {
        throw new XmlTokenizerException(
            "soft limit can not be bigger than half of hard limit,"
                +" current hard limit "+hardLimit
        );
    }
    softLimit = value;
    if(softLimit != -1) {
        posSafe = softLimit;
    } else if(hardLimit != -1) {
        posSafe = hardLimit / 2;
    } else {
        posSafe = (int)(loadFactor * bufSize); //restore default
    }
}
/**
 * Set hard limit on internal buffer size.
 * That means that if input (such as element content) is bigger than
 * hard limit size tokenizer will throw XmlTokenizerBufferOverflowException.
 * The value -1 means "no hard limit".
 *
 * @param value new hard limit, or -1 to remove it
 * @throws XmlTokenizerException if the input is a char array, if parsing
 *         has started and value is smaller than the current hard limit,
 *         if no soft limit is set, or if value is less than twice the
 *         soft limit
 */
public void setHardLimit(int value) throws XmlTokenizerException {
    if(!reading) {
        throw new XmlTokenizerException(
            "hard limit can not be set for char array input"
        );
    }
    // -1 is the "no limit" sentinel, so removing the limit is never a
    // shrink and must not be rejected by the numeric comparison.
    // NOTE(review): going from "no limit" (-1) to a positive value during
    // parsing is still accepted, as before - confirm that is intended.
    if(state != STATE_INIT && value != -1 && value < hardLimit) {
        throw new XmlTokenizerException(
            "hard limit on buffer size can not be shrunk during parsing"
        );
    }
    if(softLimit == -1 && value != -1) {
        throw new XmlTokenizerException(
            "soft limit must be set to non -1 before setting hard limit"
                +getPosDesc(), getLineNumber(), getColumnNumber()
        );
    }
    // "at least twice": exactly twice the soft limit is accepted, which
    // matches the message and the check in setSoftLimit(); long
    // arithmetic avoids overflow when doubling.
    if((value != -1) && ((2L * softLimit) > value)) {
        throw new XmlTokenizerException(
            "hard limit must be at least twice the size of soft limit,"
                +" current soft limit "+softLimit+" and hard limit "+value
                +getPosDesc(), getLineNumber(), getColumnNumber()
        );
    }
    hardLimit = value;
    // bring the buffer back to the soft limit if it is currently bigger
    if(softLimit != -1 && softLimit < bufSize) {
        resize(softLimit);
    }
}
/**
 * Find where a short context fragment that ends at j should start.
 * Scans backwards from index i - 1 and stops at the first '<' or as
 * soon as the fragment would be longer than 55 characters.
 *
 * The result is never negative: when the scan reaches the beginning of
 * the buffer without stopping, 0 is returned (previously -1 leaked out
 * and made new String(b, -1, ...) in the caller throw).
 *
 * @param b buffer to scan
 * @param i index in b where the backward scan starts (exclusive)
 * @param j end of the fragment, used only to bound its length
 * @return index in b at which the fragment should start, always >= 0
 */
private int findFragment(char[] b, int i, int j) {
    final int maxFragmentLength = 55;
    if(i <= 0) {
        return 0;
    }
    int start = i - 1;
    // index 0 is the lowest possible answer, so there is no need to
    // inspect it: the loop ends there and 0 is returned
    while(start > 0) {
        if((j - start) > maxFragmentLength) break;
        if(b[start] == '<') break;
        --start;
    }
    return start;
}
/**
 * Return string describing current position of the parser as
 * text ' at line %d and column %d[ seen %s...]'.
 * The optional "seen" part is a short fragment of the most recent input,
 * taken from the parsed content buffer when parsedContent is set and
 * from the input buffer otherwise.
 *
 * @return position description, starting with a space
 */
public String getPosDesc() {
    // pick the buffer and the window that describe the latest input
    final char[] source;
    final int from;
    final int to;
    if(parsedContent) {
        source = pc;
        from = pcStart;
        to = pcEnd;
    } else {
        source = buf;
        from = posStart;
        to = posEnd;
    }

    String seen = null;
    if(from <= to) {
        final int begin = findFragment(source, from, to);
        seen = new String(source, begin, to - begin);
        if(begin > 0) {
            // fragment does not start at the beginning of the buffer
            seen = "..." + seen;
        }
    }

    String desc = " at line "+posRow
        +" and column "+(posCol-1);
    if(seen != null) {
        desc = desc + " seen "+printable(seen)+"...";
    }
    return desc;
}
/**
 * Return the current line (row) of the tokenizer, as tracked in posRow.
 */
public int getLineNumber() {
    return posRow;
}
/**
 * Return the current column of the tokenizer, which is posCol minus one.
 */
public int getColumnNumber() {
    final int column = posCol - 1;
    return column;
}
/**
* Return next recognized token or END_DOCUMENT if no more input.
*
* This is simple automata (in pseudo-code):
*
* byte next() {
* while(state != END_DOCUMENT) {
* ch = more(); // read character from input
* state = func(ch, state); // do transition
* if(state is accepting)
* return state; // return token to caller
* }
* }
*
*
* For simplicity it is using few procedures such as readName() or isS().
*
*
*
*/
public byte next() throws XmlTokenizerException, IOException {
if(state == STATE_FINISHED)
throw new XmlTokenizerException("attempt to read beyond end of input");
parsedContent = false;
LOOP:
while(true) {
if(reachedEnd) {
if(state != STATE_FINISH) {
if(state != STATE_CONTENT && state != STATE_CONTENT_INIT
&& state != STATE_CONTENT_CONTINUED) {
throw new XmlTokenizerException(
"unexpected end of stream (state="+state+")");
}
if(state == STATE_CONTENT_INIT || state == STATE_CONTENT_CONTINUED) {
if(state == STATE_CONTENT_INIT) {
pcEnd = pcStart = pos - 1;
}
posEnd = posStart = pos - 1;
}
state = STATE_FINISH;
if(paramPC && (pcStart != pcEnd || posEnd != posStart)) {
// if(pcEnd == pcStart) {
// pcStart = posStart;
// pcEnd = posEnd;
// pc = buf;
// } else {
// pc = pc;
// }
parsedContent = (pcEnd != pcStart);
if(paramNoMixContent == false || seenContent == false)
return CONTENT;
else if(parsedContent)
throw new XmlTokenizerException(
"no element content allowed before end of stream");
}
}
state = STATE_FINISHED;
if(TRACE_SIZING) {
System.err.println("bufEnd="+bufEnd+" bufSize="+bufSize
+" softLimit="+softLimit+" hardLimit="+hardLimit);
}
return END_DOCUMENT;
}
char ch = more();
// 2.11 End-of-Line Handling: "\r\n" -> "\n"; "\rX" -> "\nX"
//XXX
if(NORMALIZE_LINE_BREAKS) {
if(ch == '\r') {
// TODO: joinPC()
if(pcStart == pcEnd && posEnd > posStart) {
int len = posEnd - posStart;
System.arraycopy(buf, posStart, pc, pcEnd, len);
pcEnd += len;
}
//ch = '\n';
} else if(prevPrevCh == '\r' && ch == '\n') {
continue LOOP; //ask for more chars --> ch = more();
// it can not be break as we are not yet in switch(..)
}
}
switch(state) {
case STATE_INIT:
// detect BOM and frop it (Unicode Byte Order Mark)
if(ch == '\uFFFE') {
throw new XmlTokenizerException(
"first character in input was UNICODE noncharacter (0xFFFE)"+
"- input requires byte swapping");
}
if(ch == '\uFEFF') {
// skupping UNICODE BOM
state = STATE_CONTENT_INIT;
break;
}
; // fall through
case STATE_CONTENT_INIT:
pcEnd = pcStart = pos - 1;
; // fall through
case STATE_CONTENT_CONTINUED:
posEnd = posStart = pos - 1;
state = STATE_CONTENT;
; // fall through
case STATE_CONTENT:
if(ch == '<') {
state = STATE_SEEN_LT;
if(paramNotifyCharacters && posStart != posEnd)
return CHARACTERS;
} else if(ch == '&') {
if(paramPC && pcStart == pcEnd && posEnd > posStart) {
// TODO: joinPC()
int len = posEnd - posStart;
System.arraycopy(buf, posStart, pc, pcEnd, len);
pcEnd += len;
}
if(!seenContent) {
seenContent = true;
if(paramNoMixContent && !mixInElement)
throw new XmlTokenizerException(
"mixed content disallowed outside element"+getPosDesc(), getLineNumber(), getColumnNumber());
}
state = STATE_SEEN_AMP;
previousState = STATE_CONTENT_CONTINUED;
posStart = pos - 1;
} else {
if(!seenContent && !isS(ch)) {
seenContent = true;
if(paramNoMixContent && !mixInElement)
throw new XmlTokenizerException(
"mixed content disallowed outside element, "
+"character '"+printable(ch)+"'"
+" ("+(int)ch+")"+getPosDesc(), getLineNumber(), getColumnNumber());
}
posEnd = pos;
if(paramPC &&
((pcStart != pcEnd) || (NORMALIZE_LINE_BREAKS && ch == '\r'))
) {
//NOTE: normalization is done at beginning of LOOP
//but it only will prepare pc buffer that is only now filled
// CONSIDER: worst case when \r is first character!!!!
if(NORMALIZE_LINE_BREAKS && ch == '\r')
pc[pcEnd++] = '\n';
else
pc[pcEnd++] = ch;
}
//if(NORMALIZE_LINE_BREAKS && ch == '\r') {
// throw new IllegalStateException(
// "end-of-line normalization failed"+getPosDesc());
//}
if(paramNotifyCharacters && reachedEnd)
return CHARACTERS;
}
break;
case STATE_SEEN_LT:
if(ch == '!') {
state = STATE_SEEN_LT_BANG;
} else if(ch == '?') {
state = STATE_PI;
} else { // it must be STag or ETag
boolean prevMixSeenContent = seenContent;
boolean prevMixInElement = mixInElement;
if(ch == '/') {
state = STATE_SCAN_ETAG_NAME;
mixInElement = false;
} else {
state = STATE_SCAN_STAG_NAME;
if(paramNoMixContent && seenContent)
throw new XmlTokenizerException("mixed content disallowed"
+" inside element and before start tag"+getPosDesc(), getLineNumber(), getColumnNumber());
mixInElement = true;
}
//TODO TESTME
if(paramPC /*&& (pcStart != pcEnd || posEnd != posStart)*/) {
parsedContent = (pcEnd != pcStart);
if(paramNoMixContent == false
|| (paramNoMixContent && state == STATE_SCAN_ETAG_NAME
//&& prevMixInElement && prevMixSeenContent)) {
&& prevMixInElement)) {
return CONTENT;
}
}
}
// gather parsed content - so we have what was before comments etc.
if(paramPC && state != STATE_SCAN_STAG_NAME
&& state != STATE_SCAN_ETAG_NAME)
{
// TODO: joinPC()
if(pcStart == pcEnd && posEnd > posStart) {
int len = posEnd - posStart;
System.arraycopy(buf, posStart, pc, pcEnd, len);
pcEnd += len;
}
}
posStart = pos; // to make PI start content
break;
case STATE_SEEN_LT_BANG:
if(ch == '-') {
ch = more();
if(ch != '-')
throw new XmlTokenizerException(
"expected - for start of comment