//-------------------------74-columns-wide-------------------------------| /* * Copyright (c) 2001 Extreme! Lab, Indiana University. All rights * reserved. * * This software is open source. * See the bottom of this file for the licence. * * $Id: XmlTokenizer.java,v 1.36 2001/08/15 21:37:24 aslom Exp $ */ package sxt; import java.io.*; /** * Simpe XML Tokenizer (SXT) performs input stream tokenizing. * * Advantages: * * Limitations: * * @author Aleksander Slominski [aslom@extreme.indiana.edu] */ public class XmlTokenizer { //enumeration of tokens that can be returned public final static byte END_DOCUMENT = 2; public final static byte CONTENT = 10; public final static byte CHARACTERS = 20; public final static byte CDSECT = 30; public final static byte COMMENT = 40; public final static byte DOCTYPE = 50; public final static byte PI = 60; public final static byte ENTITY_REF = 70; public final static byte CHAR_REF = 75; public final static byte ETAG_NAME = 110; public final static byte EMPTY_ELEMENT = 111; public final static byte STAG_END = 112; public final static byte STAG_NAME = 120; public final static byte ATTR_NAME = 122; public final static byte ATTR_CHARACTERS = 124; public final static byte ATTR_CONTENT = 127; // parameters controlling tokenizer behaviour public boolean paramNotifyCharacters; public boolean paramNotifyComment; public boolean paramNotifyCDSect; public boolean paramNotifyDoctype; public boolean paramNotifyPI; public boolean paramNotifyCharRef; public boolean paramNotifyEntityRef; public boolean paramNotifyAttValue; public char[] buf = new char[BUF_SIZE]; public int pos; public int posStart; public int posEnd; public int posNsColon; public int nsColonCount; public boolean seenContent; public boolean parsedContent; public char[] pc = new char[BUF_SIZE]; public int pcStart; public int pcEnd; public XmlTokenizer() { } public void reset() { // release buffer that may have been used by setInput(char[])... 
if(!reading) { // data was taken from input char[] //|| ((hardLimit != -1) && (buf.length < hardLimit)) //) { //if(hardLimit != -1) { if(softLimit != -1) { resize(softLimit); } else { resize(BUF_SIZE); } } resetState(); } private void resetState() { reading = true; bufSize = buf.length; bufEnd = 0; posEnd = posStart = pos = 0; posNsColon = -1; state = STATE_INIT; prevCh = '\0'; posCol = posRow = 1; reachedEnd = false; pcEnd = pcStart = 0; previousState = -1; backtracking = false; seenContent = false; } /** Reset tokenizer state and set new input source */ public void setInput(char[] data) { resetState(); reading = false; buf = data; bufSize = bufEnd = buf.length; if(paramPC && pc.length < bufSize) { pc = new char[bufSize]; if(TEST_VALIDATING) for(int i = 0; i < bufSize; ++i) pc[i]='X'; } } /** Reset tokenizer state and set new input source */ public void setInput(Reader r) { reset(); reading = true; reader = r; bufEnd = 0; } /** * Set notification of all XML content tokens: * Characters, Comment, CDSect, Doctype, PI, EntityRef, CharRef, AttValue * (tokens for STag, ETag and Attribute are always sent). */ public void setNotifyAll(boolean enable) { paramNotifyCharacters = enable; paramNotifyComment = enable; paramNotifyCDSect = enable; paramNotifyDoctype = enable; paramNotifyPI = enable; paramNotifyEntityRef = enable; paramNotifyCharRef = enable; paramNotifyAttValue = enable; } /** * Allow reporting parsed content for element content * and attribute content (no need to deal with low level * tokens such as in setNotifyAll). */ public void setParseContent(boolean enable) { paramPC = enable; if(paramPC && pc.length < bufSize) { pc = new char[bufSize]; } } /** * Set support for mixed conetent. If mixed content is * disabled tokenizer will do its best to ensure that * no element has mixed content model also ignorable whitespaces * will not be reported as element content. 
*/ public void setMixedContent(boolean enable) { paramNoMixContent = !enable; } /** * Set soft limit on internal buffer size. * That means suggested size that tokznzier will try to keep. */ public void setSoftLimit(int value) throws XmlTokenizerException { //if(state != STATE_INIT) { // throw new XmlTokenizerException( // "soft limit can not be changed after parsing started"); //} if(!reading) { throw new XmlTokenizerException( "hard limit can not be set for char array input" ); } if((value != -1) && (hardLimit != -1) && (2 * value > hardLimit)) { throw new XmlTokenizerException( "soft limit can no tbe bigger than half of hard limit" +"current hard limit "+hardLimit ); } softLimit = value; if(softLimit != -1) { posSafe = softLimit; } else if(hardLimit != -1) { posSafe = hardLimit / 2; } else { posSafe = (int)(loadFactor * bufSize); //restore default } } /** * Set hard limit on internal buffer size. * That means that if input (such as element content) is bigger than * hard limit size tokenizer will throw XmlTokenizerBufferOverflowException. 
*/
    public void setHardLimit(int value) throws XmlTokenizerException {
        if(!reading) {
            throw new XmlTokenizerException(
                "hard limit can not be set for char array input"
            );
        }
        if(state != STATE_INIT && value < hardLimit) {
            throw new XmlTokenizerException(
                "hard limit on buffer size can not be shrunk during parsing"
            );
        }
        if(softLimit == -1 && value != -1) {
            throw new XmlTokenizerException(
                "soft limit must be set to non -1 before setting hard limit"
                    +getPosDesc(), getLineNumber(), getColumnNumber()
            );
        }
        // "at least twice" - exactly twice the soft limit is acceptable,
        // which is also what setSoftLimit allows for the same pair of values;
        // long arithmetic so that doubling can not overflow
        if((value != -1) && (2L * softLimit > value)) {
            throw new XmlTokenizerException(
                "hard limit must be at least twice the size of soft limit"
                    +", current soft limit "+softLimit+" and hard limit "+value
                    +getPosDesc(), getLineNumber(), getColumnNumber()
            );
        }
        hardLimit = value;
        // NOTE(review): the original comment here read "resize buffer to new
        // hard limit", but the buffer is shrunk to the soft limit - confirm
        if(softLimit != -1 && softLimit < bufSize) {
            resize(softLimit);
        }
    }

    /**
     * Walk backwards from index i in b looking for a good place to start
     * a diagnostic fragment that ends at j: stops at a '<' character or
     * once the fragment would be longer than 55 characters.
     *
     * @return start index of the fragment; never negative for i >= 0
     */
    private int findFragment(char[] b, int i, int j) {
        // decrement inside the loop: the former while(i-- > 0) form left
        // i == -1 when the beginning of the buffer was reached
        while(i > 0) {
            --i;
            if((j - i) > 55) break;
            if(b[i] == '<') break;
        }
        return i;
    }

    /**
     * Build the text fragment used in position descriptions for the
     * region of source that ends at 'to'.
     *
     * @return the fragment (prefixed with "..." when it does not start at
     *         index 0), or null when from is greater than to
     */
    private String seenFragment(char[] source, int from, int to) {
        if(from > to) {
            return null;
        }
        int start = findFragment(source, from, to);
        String fragment = new String(source, start, to - start);
        return (start > 0) ? "..." + fragment : fragment;
    }

    /**
     * Return string describing current position of parser as
     * text ' at line %d (row) and column %d (column) [ seen %s...]'.
     * The fragment is taken from the parsed content buffer when
     * parsedContent is set, otherwise from the raw input buffer.
     */
    public String getPosDesc() {
        String fragment = parsedContent
            ? seenFragment(pc, pcStart, pcEnd)
            : seenFragment(buf, posStart, posEnd);
        return " at line "+posRow
            +" and column "+(posCol-1)
            +(fragment != null ? " seen "+printable(fragment)+"..." : "");
    }

    /** Return current line number (posRow). */
    public int getLineNumber() {
        return posRow;
    }

    /** Return current column number (posCol - 1). */
    public int getColumnNumber() {
        return posCol-1;
    }

    /**
     * Return next recognized token or END_DOCUMENT if no more input.
     *
     *

This is a simple automaton (in pseudo-code): *

   * byte next() {
   *    while(state != END_DOCUMENT) {
   *      ch = more();  // read character from input
   *      state = func(ch, state); // do transition
   *      if(state is accepting)
   *        return state;  // return token to caller
   *    }
   * }  
   * 
* *

For simplicity it is using few procedures such as readName() or isS(). * * * */ public byte next() throws XmlTokenizerException, IOException { if(state == STATE_FINISHED) throw new XmlTokenizerException("attempt to read beyond end of input"); parsedContent = false; LOOP: while(true) { if(reachedEnd) { if(state != STATE_FINISH) { if(state != STATE_CONTENT && state != STATE_CONTENT_INIT && state != STATE_CONTENT_CONTINUED) { throw new XmlTokenizerException( "unexpected end of stream (state="+state+")"); } if(state == STATE_CONTENT_INIT || state == STATE_CONTENT_CONTINUED) { if(state == STATE_CONTENT_INIT) { pcEnd = pcStart = pos - 1; } posEnd = posStart = pos - 1; } state = STATE_FINISH; if(paramPC && (pcStart != pcEnd || posEnd != posStart)) { // if(pcEnd == pcStart) { // pcStart = posStart; // pcEnd = posEnd; // pc = buf; // } else { // pc = pc; // } parsedContent = (pcEnd != pcStart); if(paramNoMixContent == false || seenContent == false) return CONTENT; else if(parsedContent) throw new XmlTokenizerException( "no element content allowed before end of stream"); } } state = STATE_FINISHED; if(TRACE_SIZING) { System.err.println("bufEnd="+bufEnd+" bufSize="+bufSize +" softLimit="+softLimit+" hardLimit="+hardLimit); } return END_DOCUMENT; } char ch = more(); // 2.11 End-of-Line Handling: "\r\n" -> "\n"; "\rX" -> "\nX" //XXX if(NORMALIZE_LINE_BREAKS) { if(ch == '\r') { // TODO: joinPC() if(pcStart == pcEnd && posEnd > posStart) { int len = posEnd - posStart; System.arraycopy(buf, posStart, pc, pcEnd, len); pcEnd += len; } //ch = '\n'; } else if(prevPrevCh == '\r' && ch == '\n') { continue LOOP; //ask for more chars --> ch = more(); // it can not be break as we are not yet in switch(..) 
} } switch(state) { case STATE_INIT: // detect BOM and frop it (Unicode Byte Order Mark) if(ch == '\uFFFE') { throw new XmlTokenizerException( "first character in input was UNICODE noncharacter (0xFFFE)"+ "- input requires byte swapping"); } if(ch == '\uFEFF') { // skupping UNICODE BOM state = STATE_CONTENT_INIT; break; } ; // fall through case STATE_CONTENT_INIT: pcEnd = pcStart = pos - 1; ; // fall through case STATE_CONTENT_CONTINUED: posEnd = posStart = pos - 1; state = STATE_CONTENT; ; // fall through case STATE_CONTENT: if(ch == '<') { state = STATE_SEEN_LT; if(paramNotifyCharacters && posStart != posEnd) return CHARACTERS; } else if(ch == '&') { if(paramPC && pcStart == pcEnd && posEnd > posStart) { // TODO: joinPC() int len = posEnd - posStart; System.arraycopy(buf, posStart, pc, pcEnd, len); pcEnd += len; } if(!seenContent) { seenContent = true; if(paramNoMixContent && !mixInElement) throw new XmlTokenizerException( "mixed content disallowed outside element"+getPosDesc(), getLineNumber(), getColumnNumber()); } state = STATE_SEEN_AMP; previousState = STATE_CONTENT_CONTINUED; posStart = pos - 1; } else { if(!seenContent && !isS(ch)) { seenContent = true; if(paramNoMixContent && !mixInElement) throw new XmlTokenizerException( "mixed content disallowed outside element, " +"character '"+printable(ch)+"'" +" ("+(int)ch+")"+getPosDesc(), getLineNumber(), getColumnNumber()); } posEnd = pos; if(paramPC && ((pcStart != pcEnd) || (NORMALIZE_LINE_BREAKS && ch == '\r')) ) { //NOTE: normalization is done at beginning of LOOP //but it only will prepare pc buffer that is only now filled // CONSIDER: worst case when \r is first character!!!! 
if(NORMALIZE_LINE_BREAKS && ch == '\r') pc[pcEnd++] = '\n'; else pc[pcEnd++] = ch; } //if(NORMALIZE_LINE_BREAKS && ch == '\r') { // throw new IllegalStateException( // "end-of-line normalization failed"+getPosDesc()); //} if(paramNotifyCharacters && reachedEnd) return CHARACTERS; } break; case STATE_SEEN_LT: if(ch == '!') { state = STATE_SEEN_LT_BANG; } else if(ch == '?') { state = STATE_PI; } else { // it must be STag or ETag boolean prevMixSeenContent = seenContent; boolean prevMixInElement = mixInElement; if(ch == '/') { state = STATE_SCAN_ETAG_NAME; mixInElement = false; } else { state = STATE_SCAN_STAG_NAME; if(paramNoMixContent && seenContent) throw new XmlTokenizerException("mixed content disallowed" +" inside element and before start tag"+getPosDesc(), getLineNumber(), getColumnNumber()); mixInElement = true; } //TODO TESTME if(paramPC /*&& (pcStart != pcEnd || posEnd != posStart)*/) { parsedContent = (pcEnd != pcStart); if(paramNoMixContent == false || (paramNoMixContent && state == STATE_SCAN_ETAG_NAME //&& prevMixInElement && prevMixSeenContent)) { && prevMixInElement)) { return CONTENT; } } } // gather parsed content - so we have what was before comments etc. if(paramPC && state != STATE_SCAN_STAG_NAME && state != STATE_SCAN_ETAG_NAME) { // TODO: joinPC() if(pcStart == pcEnd && posEnd > posStart) { int len = posEnd - posStart; System.arraycopy(buf, posStart, pc, pcEnd, len); pcEnd += len; } } posStart = pos; // to make PI start content break; case STATE_SEEN_LT_BANG: if(ch == '-') { ch = more(); if(ch != '-') throw new XmlTokenizerException( "expected - for start of comment