XMLScanner.cpp

/*
 * Copyright 1999-2002,2004 The Apache Software Foundation.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * $Id$
 */


// ---------------------------------------------------------------------------
//  Includes
// ---------------------------------------------------------------------------
#include <xercesc/internal/XMLScanner.hpp>
#include <xercesc/internal/ValidationContextImpl.hpp>
#include <xercesc/util/Janitor.hpp>
#include <xercesc/util/Mutexes.hpp>
#include <xercesc/util/RuntimeException.hpp>
#include <xercesc/util/UnexpectedEOFException.hpp>
#include <xercesc/util/XMLMsgLoader.hpp>
#include <xercesc/util/XMLRegisterCleanup.hpp>
#include <xercesc/util/XMLInitializer.hpp>
#include <xercesc/framework/LocalFileInputSource.hpp>
#include <xercesc/framework/URLInputSource.hpp>
#include <xercesc/framework/XMLDocumentHandler.hpp>
#include <xercesc/framework/XMLEntityHandler.hpp>
#include <xercesc/framework/XMLPScanToken.hpp>
#include <xercesc/framework/XMLValidator.hpp>
#include <xercesc/internal/EndOfEntityException.hpp>
#include <xercesc/validators/DTD/DocTypeHandler.hpp>
#include <xercesc/validators/common/GrammarResolver.hpp>
#include <xercesc/util/OutOfMemoryException.hpp>
#include <xercesc/util/XMLResourceIdentifier.hpp>

XERCES_CPP_NAMESPACE_BEGIN

// ---------------------------------------------------------------------------
//  Local static data
// ---------------------------------------------------------------------------
static XMLUInt32       gScannerId;
static bool            sRegistered = false;

static XMLMutex*       sScannerMutex = 0;
static XMLRegisterCleanup scannerMutexCleanup;

static XMLMsgLoader*   gMsgLoader = 0;
static XMLRegisterCleanup cleanupMsgLoader;


// ---------------------------------------------------------------------------
//  Local, static functions
// ---------------------------------------------------------------------------

//  Cleanup for the message loader
void XMLScanner::reinitMsgLoader()
{
	delete gMsgLoader;
	gMsgLoader = 0;
}

//  Cleanup for the scanner mutex
void XMLScanner::reinitScannerMutex()
{
    delete sScannerMutex;
    sScannerMutex = 0;
    sRegistered = false;
}

//
//  We need to fault in this mutex. But, since its used for synchronization
//  itself, we have to do this the low level way using a compare and swap.
//
static XMLMutex& gScannerMutex()
{
    if (!sRegistered)
    {
        XMLMutexLock lockInit(XMLPlatformUtils::fgAtomicMutex);

        if (!sRegistered)
        {
            sScannerMutex = new XMLMutex;
            scannerMutexCleanup.registerCleanup(XMLScanner::reinitScannerMutex);
            sRegistered = true;
        }
    }
    return *sScannerMutex;
}

static XMLMsgLoader& gScannerMsgLoader()
{
    if (!gMsgLoader)
    {
        XMLMutexLock lockInit(&gScannerMutex());

        // If we haven't loaded our message yet, then do that
        if (!gMsgLoader)
        {
            gMsgLoader = XMLPlatformUtils::loadMsgSet(XMLUni::fgXMLErrDomain);
            if (!gMsgLoader)
                XMLPlatformUtils::panic(PanicHandler::Panic_CantLoadMsgDomain);

            // Register this object to be cleaned up at termination
            cleanupMsgLoader.registerCleanup(XMLScanner::reinitMsgLoader);
        }
    }

    return *gMsgLoader;
}

void XMLInitializer::initializeScannerMsgLoader()
{
    gMsgLoader = XMLPlatformUtils::loadMsgSet(XMLUni::fgXMLErrDomain);

    // Register this object to be cleaned up at termination
    if (gMsgLoader) {
        cleanupMsgLoader.registerCleanup(XMLScanner::reinitMsgLoader);
    }

    sScannerMutex = new XMLMutex;
    if (sScannerMutex) {
        scannerMutexCleanup.registerCleanup(XMLScanner::reinitScannerMutex);
        sRegistered = true;
    }
}

// ---------------------------------------------------------------------------
//  XMLScanner: Constructors and Destructor
// ---------------------------------------------------------------------------
XMLScanner::XMLScanner(XMLValidator* const valToAdopt,
                       GrammarResolver* const grammarResolver,
                       MemoryManager* const manager)
    : fBufferSize(1024 * 1024)
    , fStandardUriConformant(false)
    , fCalculateSrcOfs(false)
    , fDoNamespaces(false)
    , fExitOnFirstFatal(true)
    , fValidationConstraintFatal(false)
    , fInException(false)
    , fStandalone(false)
    , fHasNoDTD(true)
    , fValidate(false)
    , fValidatorFromUser(false)
    , fDoSchema(false)
    , fSchemaFullChecking(false)
    , fIdentityConstraintChecking(true)
    , fToCacheGrammar(false)
    , fUseCachedGrammar(false)
    , fLoadExternalDTD(true)
    , fNormalizeData(true)
    , fGenerateSyntheticAnnotations(false)
    , fValidateAnnotations(false)
    , fErrorCount(0)
    , fEntityExpansionLimit(0)
    , fEntityExpansionCount(0)
    , fEmptyNamespaceId(0)
    , fUnknownNamespaceId(0)
    , fXMLNamespaceId(0)
    , fXMLNSNamespaceId(0)
    , fSchemaNamespaceId(0)
    , fUIntPool(0)
    , fUIntPoolRow(0)
    , fUIntPoolCol(0)
    , fUIntPoolRowTotal(2)
    , fScannerId(0)
    , fSequenceId(0)
    , fAttrList(0)
    , fAttrDupChkRegistry(0)
    , fDocHandler(0)
    , fDocTypeHandler(0)
    , fEntityHandler(0)
    , fErrorReporter(0)
    , fErrorHandler(0)
    , fPSVIHandler(0)
    , fValidationContext(0)
    , fEntityDeclPoolRetrieved(false)
    , fReaderMgr(manager)
    , fValidator(valToAdopt)
    , fValScheme(Val_Never)
    , fGrammarResolver(grammarResolver)
    , fGrammarPoolMemoryManager(grammarResolver->getGrammarPoolMemoryManager())
    , fGrammar(0)
    , fRootGrammar(0)
    , fURIStringPool(0)
    , fRootElemName(0)
    , fExternalSchemaLocation(0)
    , fExternalNoNamespaceSchemaLocation(0)    
    , fSecurityManager(0)
    , fXMLVersion(XMLReader::XMLV1_0)
    , fMemoryManager(manager)
    , fBufMgr(manager)
    , fAttNameBuf(1023, manager)
    , fAttValueBuf(1023, manager)
    , fCDataBuf(1023, manager)
    , fQNameBuf(1023, manager)
    , fPrefixBuf(1023, manager)
    , fURIBuf(1023, manager)
    , fElemStack(manager)   
{
   commonInit();

   if (fValidator) {
       fValidatorFromUser = true;
       initValidator(fValidator);
   }
}

XMLScanner::XMLScanner( XMLDocumentHandler* const  docHandler
                          , DocTypeHandler* const    docTypeHandler
                          , XMLEntityHandler* const  entityHandler
                          , XMLErrorReporter* const  errHandler
                          , XMLValidator* const      valToAdopt
                          , GrammarResolver* const   grammarResolver
                          , MemoryManager* const     manager)

    : fBufferSize(1024 * 1024)
    , fStandardUriConformant(false)
    , fCalculateSrcOfs(false)
    , fDoNamespaces(false)
    , fExitOnFirstFatal(true)
    , fValidationConstraintFatal(false)
    , fInException(false)
    , fStandalone(false)
    , fHasNoDTD(true)
    , fValidate(false)
    , fValidatorFromUser(false)
    , fDoSchema(false)
    , fSchemaFullChecking(false)
    , fIdentityConstraintChecking(true)
    , fToCacheGrammar(false)
    , fUseCachedGrammar(false)
	, fLoadExternalDTD(true)
    , fNormalizeData(true)
    , fGenerateSyntheticAnnotations(false)
    , fValidateAnnotations(false)
    , fErrorCount(0)
    , fEntityExpansionLimit(0)
    , fEntityExpansionCount(0)
    , fEmptyNamespaceId(0)
    , fUnknownNamespaceId(0)
    , fXMLNamespaceId(0)
    , fXMLNSNamespaceId(0)
    , fSchemaNamespaceId(0)
    , fUIntPool(0)
    , fUIntPoolRow(0)
    , fUIntPoolCol(0)
    , fUIntPoolRowTotal(2)
    , fScannerId(0)
    , fSequenceId(0)
    , fAttrList(0)
    , fAttrDupChkRegistry(0)
    , fDocHandler(docHandler)
    , fDocTypeHandler(docTypeHandler)
    , fEntityHandler(entityHandler)
    , fErrorReporter(errHandler)
    , fErrorHandler(0)
    , fPSVIHandler(0)
    , fValidationContext(0)
    , fEntityDeclPoolRetrieved(false)
    , fReaderMgr(manager)
    , fValidator(valToAdopt)
    , fValScheme(Val_Never)
    , fGrammarResolver(grammarResolver)
    , fGrammarPoolMemoryManager(grammarResolver->getGrammarPoolMemoryManager())
    , fGrammar(0)
    , fRootGrammar(0)
    , fURIStringPool(0)
    , fRootElemName(0)
    , fExternalSchemaLocation(0)
    , fExternalNoNamespaceSchemaLocation(0)    
    , fSecurityManager(0)
    , fXMLVersion(XMLReader::XMLV1_0)
    , fMemoryManager(manager)
    , fBufMgr(manager)
    , fAttNameBuf(1023, manager)
    , fAttValueBuf(1023, manager)
    , fCDataBuf(1023, manager)
    , fQNameBuf(1023, manager)
    , fPrefixBuf(1023, manager)
    , fURIBuf(1023, manager)
    , fElemStack(manager)
{
   commonInit();

   if (valToAdopt){
       fValidatorFromUser = true;
       initValidator(fValidator);
   }
}

XMLScanner::~XMLScanner()
{
    delete fAttrList;
    delete fAttrDupChkRegistry;
    delete fValidationContext;
    fMemoryManager->deallocate(fRootElemName);//delete [] fRootElemName;
    fMemoryManager->deallocate(fExternalSchemaLocation);//delete [] fExternalSchemaLocation;
    fMemoryManager->deallocate(fExternalNoNamespaceSchemaLocation);//delete [] fExternalNoNamespaceSchemaLocation;
    // delete fUIntPool
    for (unsigned int i=0; i<=fUIntPoolRow; i++)
    {
        fMemoryManager->deallocate(fUIntPool[i]);
    }
    fMemoryManager->deallocate(fUIntPool);
}


void XMLScanner::setValidator(XMLValidator* const valToAdopt)
{
    if (fValidatorFromUser)
        delete fValidator;
    fValidator = valToAdopt;
    fValidatorFromUser = true;
    initValidator(fValidator);
}


// ---------------------------------------------------------------------------
//  XMLScanner: Main entry point to scan a document
// ---------------------------------------------------------------------------
void XMLScanner::scanDocument(  const   XMLCh* const    systemId)
{
    //  First we try to parse it as a URL. If that fails, we assume its
    //  a file and try it that way.
    InputSource* srcToUse = 0;
    try
    {
        //  Create a temporary URL. Since this is the primary document,
        //  it has to be fully qualified. If not, then assume we are just
        //  mistaking a file for a URL.
        XMLURL tmpURL(fMemoryManager);

        if (XMLURL::parse(systemId, tmpURL)) {

            if (tmpURL.isRelative()) {
                if (!fStandardUriConformant)
                    srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager);
                else {
                    // since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr
                    // emit the error directly
                    MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_NoProtocolPresent, fMemoryManager);
                    fInException = true;
                    emitError
                    (
                        XMLErrs::XMLException_Fatal
                        , e.getType()
                        , e.getMessage()
                    );
                    return;
                }
            }
            else
            {
                if (fStandardUriConformant && tmpURL.hasInvalidChar()) {
                    MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL, fMemoryManager);
                    fInException = true;
                    emitError
                    (
                        XMLErrs::XMLException_Fatal
                        , e.getType()
                        , e.getMessage()
                    );
                    return;
                }
                srcToUse = new (fMemoryManager) URLInputSource(tmpURL, fMemoryManager);
            }
        }
        else {

            if (!fStandardUriConformant)
                srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager);
            else {
                // since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr
                // emit the error directly
                // lazy bypass ... since all MalformedURLException are fatal, no need to check the type
                MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL, fMemoryManager);
                fInException = true;
                emitError
                (
                    XMLErrs::XMLException_Fatal
                    , e.getType()
                    , e.getMessage()
                );
                return;
            }
        }
    }
    catch(const XMLException& excToCatch)
    {
        //  For any other XMLException,
        //  emit the error and catch any user exception thrown from here.
        fInException = true;
        if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
            emitError
            (
                XMLErrs::XMLException_Warning
                , excToCatch.getType()
                , excToCatch.getMessage()
            );
        else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
            emitError
            (
                XMLErrs::XMLException_Fatal
                , excToCatch.getType()
                , excToCatch.getMessage()
            );
        else
            emitError
            (
                XMLErrs::XMLException_Error
                , excToCatch.getType()
                , excToCatch.getMessage()
            );
        return;
    }

    Janitor<InputSource> janSrc(srcToUse);
    scanDocument(*srcToUse);
}

void XMLScanner::scanDocument(  const   char* const systemId)
{
    // We just delegate this to the XMLCh version after transcoding
    XMLCh* tmpBuf = XMLString::transcode(systemId, fMemoryManager);
    ArrayJanitor<XMLCh> janBuf(tmpBuf, fMemoryManager);
    scanDocument(tmpBuf);
}


//  This method begins a progressive parse. It scans through the prolog and
//  returns a token to be used on subsequent scanNext() calls. If the return
//  value is true, then the token is legal and ready for further use. If it
//  returns false, then the scan of the prolog failed and the token is not
//  going to work on subsequent scanNext() calls.
bool XMLScanner::scanFirst( const   XMLCh* const    systemId
                            ,       XMLPScanToken&  toFill)
{
    //  First we try to parse it as a URL. If that fails, we assume its
    //  a file and try it that way.
    InputSource* srcToUse = 0;
    try
    {
        //  Create a temporary URL. Since this is the primary document,
        //  it has to be fully qualified. If not, then assume we are just
        //  mistaking a file for a URL.
        XMLURL tmpURL(fMemoryManager);
        if (XMLURL::parse(systemId, tmpURL)) {        
            if (tmpURL.isRelative()) {
                if (!fStandardUriConformant)
                    srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager);
                else {
                    // since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr
                    // emit the error directly
                    MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_NoProtocolPresent, fMemoryManager);
                    fInException = true;
                    emitError
                    (
                        XMLErrs::XMLException_Fatal
                        , e.getType()
                        , e.getMessage()
                    );
                    return false;
                }
            }
            else
            {
                if (fStandardUriConformant && tmpURL.hasInvalidChar()) {
                    MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL, fMemoryManager);
                    fInException = true;
                    emitError
                    (
                        XMLErrs::XMLException_Fatal
                        , e.getType()
                        , e.getMessage()
                    );
                    return false;
                }
                srcToUse = new (fMemoryManager) URLInputSource(tmpURL, fMemoryManager);
            }
        }
        else {
            if (!fStandardUriConformant)
                srcToUse = new (fMemoryManager) LocalFileInputSource(systemId,  fMemoryManager);
            else {
                // since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr
                // emit the error directly
                // lazy bypass ... since all MalformedURLException are fatal, no need to check the type
                MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL);
                fInException = true;
                emitError
                (
                    XMLErrs::XMLException_Fatal
                    , e.getType()
                    , e.getMessage()
                );
                return false;
            }
        }
    }
    catch(const XMLException& excToCatch)
    {
        //  For any other XMLException,
        //  emit the error and catch any user exception thrown from here.
        fInException = true;
        if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
            emitError
            (
                XMLErrs::XMLException_Warning
                , excToCatch.getType()
                , excToCatch.getMessage()
            );
        else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
            emitError
            (
                XMLErrs::XMLException_Fatal
                , excToCatch.getType()
                , excToCatch.getMessage()
            );
        else
            emitError
            (
                XMLErrs::XMLException_Error
                , excToCatch.getType()
                , excToCatch.getMessage()
            );
        return false;
    }

    Janitor<InputSource> janSrc(srcToUse);
    return scanFirst(*srcToUse, toFill);
}

bool XMLScanner::scanFirst( const   char* const     systemId
                            ,       XMLPScanToken&  toFill)
{
    // We just delegate this to the XMLCh version after transcoding
    XMLCh* tmpBuf = XMLString::transcode(systemId, fMemoryManager);
    ArrayJanitor<XMLCh> janBuf(tmpBuf, fMemoryManager);
    return scanFirst(tmpBuf, toFill);
}

bool XMLScanner::scanFirst( const   InputSource&    src
                           ,       XMLPScanToken&  toFill)
{
    //  Bump up the sequence id for this new scan cycle. This will invalidate
    //  any previous tokens we've returned.
    fSequenceId++;

    // Reset the scanner and its plugged in stuff for a new run.  This
    // resets all the data structures, creates the initial reader and
    // pushes it on the stack, and sets up the base document path
    scanReset(src);

    // If we have a document handler, then call the start document
    if (fDocHandler)
        fDocHandler->startDocument();

    try
    {
        //  Scan the prolog part, which is everything before the root element
        //  including the DTD subsets. This is all that is done on the scan
        //  first.
        scanProlog();

        //  If we got to the end of input, then its not a valid XML file.
        //  Else, go on to scan the content.
        if (fReaderMgr.atEOF())
        {
            emitError(XMLErrs::EmptyMainEntity);
        }
    }
    //  NOTE:
    //
    //  In all of the error processing below, the emitError() call MUST come
    //  before the flush of the reader mgr, or it will fail because it tries
    //  to find out the position in the XML source of the error.
    catch(const XMLErrs::Codes)
    {
        // This is a 'first failure' exception so reset and return a failure
        fReaderMgr.reset();
        return false;
    }
    catch(const XMLValid::Codes)
    {
        // This is a 'first fatal error' type exit, so reset and reuturn failure
        fReaderMgr.reset();
        return false;
    }
    catch(const XMLException& excToCatch)
    {
        //  Emit the error and catch any user exception thrown from here. Make
        //  sure in all cases we flush the reader manager.
        fInException = true;
        try
        {
            if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
                emitError
                (
                    XMLErrs::XMLException_Warning
                    , excToCatch.getType()
                    , excToCatch.getMessage()
                );
            else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
                emitError
                (
                    XMLErrs::XMLException_Fatal
                    , excToCatch.getType()
                    , excToCatch.getMessage()
                );
            else
                emitError
                (
                    XMLErrs::XMLException_Error
                    , excToCatch.getType()
                    , excToCatch.getMessage()
                );
        }
        catch(const OutOfMemoryException&)
        {
            throw;
        }
        catch(...)
        {
            // Reset and rethrow the user error
            fReaderMgr.reset();
            throw;
        }

        // Reset and return a failure
        fReaderMgr.reset();
        return false;
    }
    catch(const OutOfMemoryException&)
    {
        throw;
    }
    catch(...)
    {
        // Reset and rethrow original error
        fReaderMgr.reset();
        throw;
    }

    // Fill in the caller's token to make it legal and return success
    toFill.set(fScannerId, fSequenceId);
    return true;
}


void XMLScanner::scanReset(XMLPScanToken& token)
{
    // Make sure this token is still legal
    if (!isLegalToken(token))
        ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Scan_BadPScanToken, fMemoryManager);

    // Reset the reader manager
    fReaderMgr.reset();

    // And invalidate any tokens by bumping our sequence number
    fSequenceId++;

    // Reset our error count
    fErrorCount = 0;
}

void XMLScanner::setParseSettings(XMLScanner* const refScanner)
{
    setDocHandler(refScanner->getDocHandler());
    setDocTypeHandler(refScanner->getDocTypeHandler());
    setErrorHandler(refScanner->getErrorHandler());
    setErrorReporter(refScanner->getErrorReporter());
    setEntityHandler(refScanner->getEntityHandler());
    setDoNamespaces(refScanner->getDoNamespaces());
    setDoSchema(refScanner->getDoSchema());
    setCalculateSrcOfs(refScanner->getCalculateSrcOfs());
    setStandardUriConformant(refScanner->getStandardUriConformant());
    setExitOnFirstFatal(refScanner->getExitOnFirstFatal());
    setValidationConstraintFatal(refScanner->getValidationConstraintFatal());
    setIdentityConstraintChecking(refScanner->getIdentityConstraintChecking());
    setValidationSchemaFullChecking(refScanner->getValidationSchemaFullChecking());
    cacheGrammarFromParse(refScanner->isCachingGrammarFromParse());
    useCachedGrammarInParse(refScanner->isUsingCachedGrammarInParse());
    setLoadExternalDTD(refScanner->getLoadExternalDTD());
    setNormalizeData(refScanner->getNormalizeData());
    setExternalSchemaLocation(refScanner->getExternalSchemaLocation());
    setExternalNoNamespaceSchemaLocation(refScanner->getExternalNoNamespaceSchemaLocation());
    setValidationScheme(refScanner->getValidationScheme());
    setSecurityManager(refScanner->getSecurityManager());
    setPSVIHandler(refScanner->getPSVIHandler());
}

// ---------------------------------------------------------------------------
//  XMLScanner: Private helper methods.
// ---------------------------------------------------------------------------

//  This method handles the common initialization, to avoid having to do
//  it redundantly in multiple constructors.
void XMLScanner::commonInit()
{
    //  We have to do a little init that involves statics, so we have to
    //  use the mutex to protect it.
    {
        XMLMutexLock lockInit(&gScannerMutex());

        // And assign ourselves the next available scanner id
        fScannerId = ++gScannerId;
    }

    //  Create the attribute list, which is used to store attribute values
    //  during start tag processing. Give it a reasonable initial size that
    //  will serve for most folks, though it will grow as required.
    fAttrList = new (fMemoryManager) RefVectorOf<XMLAttr>(32, true, fMemoryManager);

    //  Create the id ref list. This is used to enforce XML 1.0 ID ref
    //  semantics, i.e. all id refs must refer to elements that exist
    fValidationContext = new (fMemoryManager) ValidationContextImpl(fMemoryManager);

    //  Create the GrammarResolver
    //fGrammarResolver = new GrammarResolver();

    // create initial, 64-element, fUIntPool
    fUIntPool = (unsigned int **)fMemoryManager->allocate(sizeof(unsigned int *) *fUIntPoolRowTotal);
    fUIntPool[0] = (unsigned int *)fMemoryManager->allocate(sizeof(unsigned int) << 6);
    memset(fUIntPool[0], 0, sizeof(unsigned int) << 6);
    fUIntPool[1] = 0;

    // Register self as handler for XMLBufferFull events on the CDATA buffer
    fCDataBuf.setFullHandler(this, fBufferSize);
}


void XMLScanner::initValidator(XMLValidator* theValidator) {

    //  Tell the validator about the stuff it needs to know in order to
    //  do its work.
    theValidator->setScannerInfo(this, &fReaderMgr, &fBufMgr);
    theValidator->setErrorReporter(fErrorReporter);
}

// ---------------------------------------------------------------------------
//  XMLScanner: Error emitting methods
// ---------------------------------------------------------------------------

//  These methods are called whenever the scanner wants to emit an error.
//  It handles getting the message loaded, doing token replacement, etc...
//  and then calling the error handler, if its installed.
bool XMLScanner::emitErrorWillThrowException(const XMLErrs::Codes toEmit)
{
    if (XMLErrs::isFatal(toEmit) && fExitOnFirstFatal && !fInException)
        return true;
    return false;
}

void XMLScanner::emitError(const XMLErrs::Codes toEmit)
{
    // Bump the error count if it is not a warning
    if (XMLErrs::errorType(toEmit) != XMLErrorReporter::ErrType_Warning)
        incrementErrorCount();

    if (fErrorReporter)
    {
        // Load the message into a local for display
        const unsigned int msgSize = 1023;
        XMLCh errText[msgSize + 1];

        if (!gScannerMsgLoader().loadMsg(toEmit, errText, msgSize))
        {
                // <TBD> Probably should load a default msg here
        }

        //  Create a LastExtEntityInfo structure and get the reader manager
        //  to fill it in for us. This will give us the information about
        //  the last reader on the stack that was an external entity of some
        //  sort (i.e. it will ignore internal entities.
        ReaderMgr::LastExtEntityInfo lastInfo;
        fReaderMgr.getLastExtEntityInfo(lastInfo);

        fErrorReporter->error
        (
            toEmit
            , XMLUni::fgXMLErrDomain
            , XMLErrs::errorType(toEmit)
            , errText
            , lastInfo.systemId
            , lastInfo.publicId
            , lastInfo.lineNumber
            , lastInfo.colNumber
        );
    }

    // Bail out if its fatal an we are to give up on the first fatal error
    if (emitErrorWillThrowException(toEmit))
        throw toEmit;
}

void XMLScanner::emitError( const   XMLErrs::Codes    toEmit
                            , const XMLCh* const        text1
                            , const XMLCh* const        text2
                            , const XMLCh* const        text3
                            , const XMLCh* const        text4)
{
    // Bump the error count if it is not a warning
    if (XMLErrs::errorType(toEmit) != XMLErrorReporter::ErrType_Warning)
        incrementErrorCount();

    if (fErrorReporter)
    {
        //  Load the message into alocal and replace any tokens found in
        //  the text.
        const unsigned int maxChars = 2047;
        XMLCh errText[maxChars + 1];

        if (!gScannerMsgLoader().loadMsg(toEmit, errText, maxChars, text1, text2, text3, text4, fMemoryManager))
        {
                // <TBD> Should probably load a default message here
        }

        //  Create a LastExtEntityInfo structure and get the reader manager
        //  to fill it in for us. This will give us the information about
        //  the last reader on the stack that was an external entity of some
        //  sort (i.e. it will ignore internal entities.
        ReaderMgr::LastExtEntityInfo lastInfo;
        fReaderMgr.getLastExtEntityInfo(lastInfo);

        fErrorReporter->error
        (
            toEmit
            , XMLUni::fgXMLErrDomain
            , XMLErrs::errorType(toEmit)
            , errText
            , lastInfo.systemId
            , lastInfo.publicId
            , lastInfo.lineNumber
            , lastInfo.colNumber
        );
    }

    // Bail out if its fatal an we are to give up on the first fatal error
    if (emitErrorWillThrowException(toEmit))
        throw toEmit;
}

void XMLScanner::emitError( const   XMLErrs::Codes    toEmit
                            , const char* const         text1
                            , const char* const         text2
                            , const char* const         text3
                            , const char* const         text4)
{
    // Bump the error count if it is not a warning
    if (XMLErrs::errorType(toEmit) != XMLErrorReporter::ErrType_Warning)
        incrementErrorCount();

    if (fErrorReporter)
    {
        //  Load the message into alocal and replace any tokens found in
        //  the text.
        const unsigned int maxChars = 2047;
        XMLCh errText[maxChars + 1];

        if (!gScannerMsgLoader().loadMsg(toEmit, errText, maxChars, text1, text2, text3, text4, fMemoryManager))
        {
                // <TBD> Should probably load a default message here
        }

        //  Create a LastExtEntityInfo structure and get the reader manager
        //  to fill it in for us. This will give us the information about
        //  the last reader on the stack that was an external entity of some
        //  sort (i.e. it will ignore internal entities.
        ReaderMgr::LastExtEntityInfo lastInfo;
        fReaderMgr.getLastExtEntityInfo(lastInfo);

        fErrorReporter->error
        (
            toEmit
            , XMLUni::fgXMLErrDomain
            , XMLErrs::errorType(toEmit)
            , errText
            , lastInfo.systemId
            , lastInfo.publicId
            , lastInfo.lineNumber
            , lastInfo.colNumber
        );
    }

    // Bail out if its fatal an we are to give up on the first fatal error
    if (emitErrorWillThrowException(toEmit))
        throw toEmit;
}


// ---------------------------------------------------------------------------
//  XMLScanner: Getter methods
// ---------------------------------------------------------------------------

//  This method allows the caller to query the current location of the scanner.
//  It will return the sys/public ids of the current entity, and the line/col
//  position within it.
//
//  NOTE: This API returns the location with the last external file. So if its
//  currently scanning an entity, the position returned will be the end of
//  the entity reference in the file that had the reference.
//
/*bool
XMLScanner::getLastExtLocation(         XMLCh* const    sysIdToFill
                                , const unsigned int    maxSysIdChars
                                ,       XMLCh* const    pubIdToFill
                                , const unsigned int    maxPubIdChars
                                ,       XMLSSize_t&     lineToFill
                                ,       XMLSSize_t&     colToFill) const
{
    // Create a local info object and get it filled in by the reader manager
    ReaderMgr::LastExtEntityInfo lastInfo;
    fReaderMgr.getLastExtEntityInfo(lastInfo);

    // Fill in the line and column number
    lineToFill = lastInfo.lineNumber;
    colToFill = lastInfo.colNumber;

    // And copy over as much of the ids as will fit
    sysIdToFill[0] = 0;
    if (lastInfo.systemId)
    {
        if (XMLString::stringLen(lastInfo.systemId) > maxSysIdChars)
            return false;
        XMLString::copyString(sysIdToFill, lastInfo.systemId);
    }

    pubIdToFill[0] = 0;
    if (lastInfo.publicId)
    {
        if (XMLString::stringLen(lastInfo.publicId) > maxPubIdChars)
            return false;
        XMLString::copyString(pubIdToFill, lastInfo.publicId);
    }
    return true;
}*/


// ---------------------------------------------------------------------------
//  XMLScanner: Private scanning methods
// ---------------------------------------------------------------------------

//  This method is called after the end of the root element, to handle
//  any miscellaneous stuff hanging around.
void XMLScanner::scanMiscellaneous()
{
    // Get a buffer for this work
    XMLBufBid bbCData(&fBufMgr);

    while (true)
    {
        try
        {
            const XMLCh nextCh = fReaderMgr.peekNextChar();

            // Watch for end of file and break out
            if (!nextCh)
                break;

            if (nextCh == chOpenAngle)
            {
                if (checkXMLDecl(true))
                {
                    // Can't have an XML decl here
                    emitError(XMLErrs::NotValidAfterContent);
                    fReaderMgr.skipPastChar(chCloseAngle);
                }
                else if (fReaderMgr.skippedString(XMLUni::fgPIString))
                {
                    scanPI();
                }
                 else if (fReaderMgr.skippedString(XMLUni::fgCommentString))
                {
                    scanComment();
                }
                else
                {
                    // This can't be possible, so just give up
                    emitError(XMLErrs::ExpectedCommentOrPI);
                    fReaderMgr.skipPastChar(chCloseAngle);
                }
            }
            else if (fReaderMgr.getCurrentReader()->isWhitespace(nextCh))
            {
                //  If we have a doc handler, then gather up the spaces and
                //  call back. Otherwise, just skip over whitespace.
                if (fDocHandler)
                {
                    fReaderMgr.getSpaces(bbCData.getBuffer());
                    fDocHandler->ignorableWhitespace
                    (
                        bbCData.getRawBuffer()
                        , bbCData.getLen()
                        , false
                    );
                }
                else
                {
                    fReaderMgr.skipPastSpaces();
                }
            }
            else
            {
                emitError(XMLErrs::ExpectedCommentOrPI);
                fReaderMgr.skipPastChar(chCloseAngle);
            }
        }
        catch(const EndOfEntityException&)
        {
            //  Some entity leaked out of the content part of the document. Issue
            //  a warning and keep going.
            emitError(XMLErrs::EntityPropogated);
        }
    }
}


//  Scans a PI and calls the appropriate callbacks. At entry we have just
//  scanned the <? part, and need to now start on the PI target name.
void XMLScanner::scanPI()
{
    const XMLCh* namePtr = 0;
    const XMLCh* targetPtr = 0;

    //  If there are any spaces here, then warn about it. If we aren't in
    //  'first error' mode, then we'll come back and can easily pick up
    //  again by just skipping them.
    if (fReaderMgr.lookingAtSpace())
    {
        emitError(XMLErrs::PINameExpected);
        fReaderMgr.skipPastSpaces();
    }

    // Get a buffer for the PI name and scan it in
    XMLBufBid bbName(&fBufMgr);
    if (!fReaderMgr.getName(bbName.getBuffer()))
    {
        emitError(XMLErrs::PINameExpected);
        fReaderMgr.skipPastChar(chCloseAngle);
        return;
    }

    // Point the name pointer at the raw data
    namePtr = bbName.getRawBuffer();

    // See if it is some form of 'xml' and emit a warning
    if (!XMLString::compareIString(namePtr, XMLUni::fgXMLString))
        emitError(XMLErrs::NoPIStartsWithXML);

    // If namespaces are enabled, then no colons allowed
    if (fDoNamespaces)
    {
        if (XMLString::indexOf(namePtr, chColon) != -1)
            emitError(XMLErrs::ColonNotLegalWithNS);
    }

    //  If we don't hit a space next, then the PI has no target. If we do
    //  then get out the target. Get a buffer for it as well
    XMLBufBid bbTarget(&fBufMgr);
    if (fReaderMgr.skippedSpace())
    {
        // Skip any leading spaces
        fReaderMgr.skipPastSpaces();

        bool gotLeadingSurrogate = false;

        // It does have a target, so lets move on to deal with that.
        while (1)
        {
            const XMLCh nextCh = fReaderMgr.getNextChar();

            // Watch for an end of file, which is always bad here
            if (!nextCh)
            {
                emitError(XMLErrs::UnterminatedPI);
                ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
            }

            // Watch for potential terminating character
            if (nextCh == chQuestion)
            {
                // It must be followed by '>' to be a termination of the target
                if (fReaderMgr.skippedChar(chCloseAngle))
                    break;
            }

            // Check for correct surrogate pairs
            if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
            {
                if (gotLeadingSurrogate)
                    emitError(XMLErrs::Expected2ndSurrogateChar);
                else
                    gotLeadingSurrogate = true;
            }
             else
            {
                if (gotLeadingSurrogate)
                {
                    if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
                        emitError(XMLErrs::Expected2ndSurrogateChar);
                }
                // Its got to at least be a valid XML character
                else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) {

                    XMLCh tmpBuf[9];
                    XMLString::binToText
                    (
                        nextCh
                        , tmpBuf
                        , 8
                        , 16
                        , fMemoryManager
                    );
                    emitError(XMLErrs::InvalidCharacter, tmpBuf);
                }

                gotLeadingSurrogate = false;
            }

            bbTarget.append(nextCh);
        }
    }
    else
    {
        // No target, but make sure its terminated ok
        if (!fReaderMgr.skippedChar(chQuestion))
        {
            emitError(XMLErrs::UnterminatedPI);
            fReaderMgr.skipPastChar(chCloseAngle);
            return;
        }

        if (!fReaderMgr.skippedChar(chCloseAngle))
        {
            emitError(XMLErrs::UnterminatedPI);
            fReaderMgr.skipPastChar(chCloseAngle);
            return;
        }
    }

    // Point the target pointer at the raw data
    targetPtr = bbTarget.getRawBuffer();

    // If we have a handler, then call it
    if (fDocHandler)
    {
        fDocHandler->docPI
        (
            namePtr
            , targetPtr
       );
    }

    //mark PI is seen within the current element
    if (! fElemStack.isEmpty())
        fElemStack.setCommentOrPISeen();

}

//  Scans all the input from the start of the file to the root element.
//  There does not have to be anything in the prolog necessarily, but usually
//  there is at least an XMLDecl.
//
//  On exit from here we are either at the end of the file or about to read
//  the opening < of the root element.
void XMLScanner::scanProlog()
{
    // Get a buffer for whitespace processing
    XMLBufBid bbCData(&fBufMgr);

    //  Loop through the prolog. If there is no content, this could go all
    //  the way to the end of the file.
    try
    {
        while (true)
        {
            const XMLCh nextCh = fReaderMgr.peekNextChar();

            if (nextCh == chOpenAngle)
            {
                //  Ok, it could be the xml decl, a comment, the doc type line,
                //  or the start of the root element.
                if (checkXMLDecl(true))
                {
                    // There shall be at lease --ONE-- space in between
                    // the tag '<?xml' and the VersionInfo.
                    //
                    //  If we are not at line 1, col 6, then the decl was not
                    //  the first text, so its invalid.
                    const XMLReader* curReader = fReaderMgr.getCurrentReader();
                    if ((curReader->getLineNumber() != 1)
                    ||  (curReader->getColumnNumber() != 7))
                    {
                        emitError(XMLErrs::XMLDeclMustBeFirst);
                    }

                    scanXMLDecl(Decl_XML);
                }
                else if (fReaderMgr.skippedString(XMLUni::fgPIString))
                {
                    scanPI();
                }
                 else if (fReaderMgr.skippedString(XMLUni::fgCommentString))
                {
                    scanComment();
                }
                 else if (fReaderMgr.skippedString(XMLUni::fgDocTypeString))
                {
                    scanDocTypeDecl();

                    // if reusing grammar, this has been validated already in first scan
                    // skip for performance
                    if (fValidate && !fGrammar->getValidated()) {
                        //  validate the DTD scan so far
                        fValidator->preContentValidation(fUseCachedGrammar, true);
                    }
                }
                else
                {
                    // Assume its the start of the root element
                    return;
                }
            }
            else if (fReaderMgr.getCurrentReader()->isWhitespace(nextCh))
            {
                //  If we have a document handler then gather up the
                //  whitespace and call back. Otherwise just skip over spaces.
                if (fDocHandler)
                {
                    fReaderMgr.getSpaces(bbCData.getBuffer());
                    fDocHandler->ignorableWhitespace
                    (
                        bbCData.getRawBuffer()
                        , bbCData.getLen()
                        , false
                    );
                }
                 else
                {
                    fReaderMgr.skipPastSpaces();
                }
            }
             else
            {
                emitError(XMLErrs::InvalidDocumentStructure);

                // Watch for end of file and break out
                if (!nextCh)
                    break;
                else
                    fReaderMgr.skipPastChar(chCloseAngle);
            }

        }
    }
    catch(const EndOfEntityException&)
    {
        //  We should never get an end of entity here. They should only
        //  occur within the doc type scanning method, and not leak out to
        //  here.
        emitError
        (
            XMLErrs::UnexpectedEOE
            , "in prolog"
        );
    }
}


//  Scans the <?xml .... ?> line. This stuff is all sequential so we don't
//  do any state machine loop here. We just bull straight through it. It ends
//  past the closing bracket. If there is a document handler, then its called
//  on the XMLDecl callback.
//
//  On entry, the <?xml has been scanned, and we pick it up from there.
//
//  NOTE: In order to provide good recovery from bad XML here, we try to be
//  very flexible. No matter what order the stuff is in, we'll keep going
//  though we'll issue errors.
//
//  The parameter tells us which type of decl we should expect, Text or XML.
//    [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
//    [77] TextDecl::= '<?xml' VersionInfo? EncodingDecl S? '?>'
void XMLScanner::scanXMLDecl(const DeclTypes type)
{
    // Get us some buffers to use
    XMLBufBid bbVersion(&fBufMgr);
    XMLBufBid bbEncoding(&fBufMgr);
    XMLBufBid bbStand(&fBufMgr);
    XMLBufBid bbDummy(&fBufMgr);
    XMLBufBid bbName(&fBufMgr);

    //  We use this little enum and array to keep up with what we found
    //  and what order we found them in. This lets us get them free form
    //  without too much overhead, but still know that they were in the
    //  wrong order.
    enum Strings
    {
        VersionString
        , EncodingString
        , StandaloneString
        , UnknownString

        , StringCount
    };
    int flags[StringCount] = { -1, -1, -1, -1 };

    //  Also set up a list of buffers in the right order so that we know
    //  where to put stuff.
    XMLBuffer* buffers[StringCount] ;
    buffers[0] = &bbVersion.getBuffer();
    buffers[1] = &bbEncoding.getBuffer();
    buffers[2] = &bbStand.getBuffer();
    buffers[3] = &bbDummy.getBuffer();

    int curCount = 0;
    Strings curString;
    XMLBuffer& nameBuf = bbName.getBuffer();
    while (true)
    {
        // Skip any spaces
        const unsigned int spaceCount = fReaderMgr.skipPastSpaces(true);

        // If we are looking at a question mark, then break out
        if (fReaderMgr.lookingAtChar(chQuestion))
            break;

        // If this is not the first string, then we require the spaces
        if (!spaceCount && curCount)
            emitError(XMLErrs::ExpectedWhitespace);

        //  Get characters up to the next whitespace or equal's sign.
        if (!scanUpToWSOr(nameBuf, chEqual))
            emitError(XMLErrs::ExpectedDeclString);

        // See if it matches any of our expected strings
        if (XMLString::equals(nameBuf.getRawBuffer(), XMLUni::fgVersionString))
            curString = VersionString;
        else if (XMLString::equals(nameBuf.getRawBuffer(), XMLUni::fgEncodingString))
            curString = EncodingString;
        else if (XMLString::equals(nameBuf.getRawBuffer(), XMLUni::fgStandaloneString))
            curString = StandaloneString;
        else
            curString = UnknownString;

        //  If its an unknown string, then give that error. Else check to
        //  see if this one has been done already and give that error.
        if (curString == UnknownString)
            emitError(XMLErrs::ExpectedDeclString, nameBuf.getRawBuffer());
        else if (flags[curString] != -1)
            emitError(XMLErrs::DeclStringRep, nameBuf.getRawBuffer());
        else if (flags[curString] == -1)
            flags[curString] = ++curCount;

        //  Scan for an equal's sign. If we don't find it, issue an error
        //  but keep trying to go on.
        if (!scanEq(true))
            emitError(XMLErrs::ExpectedEqSign);

        //  Get a quote string into the buffer for the string that we are
        //  currently working on.
        if (!getQuotedString(*buffers[curString]))
        {
            emitError(XMLErrs::ExpectedQuotedString);
            fReaderMgr.skipPastChar(chCloseAngle);
            return;
        }

        // And validate the value according which one it was
        const XMLCh* rawValue = buffers[curString]->getRawBuffer();
        if (curString == VersionString)
        {
            if (XMLString::equals(rawValue, XMLUni::fgVersion1_1)) {
                if (type == Decl_XML) {
                	fXMLVersion = XMLReader::XMLV1_1;
                    fReaderMgr.setXMLVersion(XMLReader::XMLV1_1);
                }
                else {
            	    if (fXMLVersion != XMLReader::XMLV1_1)
            	        emitError(XMLErrs::UnsupportedXMLVersion, rawValue);                
            	}
            }
            else if (XMLString::equals(rawValue, XMLUni::fgVersion1_0)) {
                if (type == Decl_XML) {
                	fXMLVersion = XMLReader::XMLV1_0;
                    fReaderMgr.setXMLVersion(XMLReader::XMLV1_0);                    
                }
            }
            else
                emitError(XMLErrs::UnsupportedXMLVersion, rawValue);
        }
         else if (curString == EncodingString)
        {
            if (!XMLString::isValidEncName(rawValue))
                emitError(XMLErrs::BadXMLEncoding, rawValue);
        }
         else if (curString == StandaloneString)
        {
            if (XMLString::equals(rawValue, XMLUni::fgYesString))
                fStandalone = true;
            else if (XMLString::equals(rawValue, XMLUni::fgNoString))
                fStandalone = false;
            else
            {
                emitError(XMLErrs::BadStandalone);
                if (!XMLString::compareIString(rawValue, XMLUni::fgYesString))
                    fStandalone = true;
                else if (!XMLString::compareIString(rawValue, XMLUni::fgNoString))
                    fStandalone = false;
            }
        }
    }

    //  Make sure that the strings present are in order. We don't care about
    //  which ones are present at this point, just that any there are in the
    //  right order.
    int curTop = 0;
    for (int index = VersionString; index < StandaloneString; index++)
    {
        if (flags[index] != -1)
        {
            if (flags[index] !=  curTop + 1)
            {
                emitError(XMLErrs::DeclStringsInWrongOrder);
                break;
            }
            curTop = flags[index];
        }
    }

    //  If its an XML decl, the version must be present.
    //  If its a Text decl, then encoding must be present AND standalone must not be present.
    if ((type == Decl_XML) && (flags[VersionString] == -1))
        emitError(XMLErrs::XMLVersionRequired);
    else if (type == Decl_Text) {
        if (flags[StandaloneString] != -1)
            emitError(XMLErrs::StandaloneNotLegal);
        if (flags[EncodingString] == -1)
            emitError(XMLErrs::EncodingRequired);
    }

    if (!fReaderMgr.skippedChar(chQuestion))
    {
        emitError(XMLErrs::UnterminatedXMLDecl);
        fReaderMgr.skipPastChar(chCloseAngle);
    }
     else if (!fReaderMgr.skippedChar(chCloseAngle))
    {
        emitError(XMLErrs::UnterminatedXMLDecl);
        fReaderMgr.skipPastChar(chCloseAngle);
    }

    //  Do this before we possibly update the reader with the
    //  actual encoding string. Otherwise, we will pass the wrong thing
    //  for the last parameter!
    const XMLCh* actualEnc = fReaderMgr.getCurrentEncodingStr();

    //  Ok, we've now seen the real encoding string, if there was one, so
    //  lets call back on the current reader and tell it what the real
    //  encoding string was. If it fails, that's because it represents some
    //  sort of contradiction with the autosensed format, and it keeps the
    //  original encoding.
    //
    //  NOTE: This can fail for a number of reasons, such as a bogus encoding
    //  name or because its in flagrant contradiction of the auto-sensed
    //  format.
    if (flags[EncodingString] != -1)
    {
        if (!fReaderMgr.getCurrentReader()->setEncoding(bbEncoding.getRawBuffer()))
            emitError(XMLErrs::ContradictoryEncoding, bbEncoding.getRawBuffer());
        else
            actualEnc = bbEncoding.getRawBuffer();
    }

    //  If we have a document handler then call the XML Decl callback.
    if (type == Decl_XML)
    {
        if (fDocHandler)
            fDocHandler->XMLDecl
            (
                bbVersion.getRawBuffer()
                , bbEncoding.getRawBuffer()
                , bbStand.getRawBuffer()
                , actualEnc
            );
    }
    else if (type == Decl_Text)
    {
        if (fDocTypeHandler)
            fDocTypeHandler->TextDecl
            (
                bbVersion.getRawBuffer()
                , bbEncoding.getRawBuffer()
            );
    }
}

const XMLCh* XMLScanner::getURIText(const   unsigned int    uriId) const
{
    if (fURIStringPool->exists(uriId)) {
        // Look up the URI in the string pool and return its id
        const XMLCh* value = fURIStringPool->getValueForId(uriId);
        if (!value)
            return XMLUni::fgZeroLenString;

        return value;
    }
    else
        return XMLUni::fgZeroLenString;
}

bool XMLScanner::getURIText(  const   unsigned int    uriId
                      ,       XMLBuffer&      uriBufToFill) const
{
    if (fURIStringPool->exists(uriId)) {
        // Look up the URI in the string pool and return its id
        const XMLCh* value = fURIStringPool->getValueForId(uriId);
        if (!value)
            return false;

        uriBufToFill.set(value);
        return true;
    }
    else
        return false;
}

bool XMLScanner::checkXMLDecl(bool startWithAngle) {

    // [23] XMLDecl     ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
    // [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
    //
    // [3]  S           ::= (#x20 | #x9 | #xD | #xA)+
    if (startWithAngle) {
        if (fReaderMgr.peekString(XMLUni::fgXMLDeclString)) {
            if (fReaderMgr.skippedString(XMLUni::fgXMLDeclStringSpace)
               || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringHTab)
               || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringLF)
               || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringCR))
            {
                return true;
            }
            else if (fReaderMgr.skippedString(XMLUni::fgXMLDeclStringSpaceU)
               || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringHTabU)
               || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringLFU)
               || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringCRU))
            {
                //  Just in case, check for upper case. If found, issue
                //  an error, but keep going.
                emitError(XMLErrs::XMLDeclMustBeLowerCase);
                return true;
            }
        }
    }
    else {
        if (fReaderMgr.peekString(XMLUni::fgXMLString)) {
            if (fReaderMgr.skippedString(XMLUni::fgXMLStringSpace)
               || fReaderMgr.skippedString(XMLUni::fgXMLStringHTab)
               || fReaderMgr.skippedString(XMLUni::fgXMLStringLF)
               || fReaderMgr.skippedString(XMLUni::fgXMLStringCR))
            {
                return true;
            }
            else if (fReaderMgr.skippedString(XMLUni::fgXMLStringSpaceU)
               || fReaderMgr.skippedString(XMLUni::fgXMLStringHTabU)
               || fReaderMgr.skippedString(XMLUni::fgXMLStringLFU)
               || fReaderMgr.skippedString(XMLUni::fgXMLStringCRU))
            {
                //  Just in case, check for upper case. If found, issue
                //  an error, but keep going.
                emitError(XMLErrs::XMLDeclMustBeLowerCase);
                return true;
            }
        }
    }

    return false;
}


// ---------------------------------------------------------------------------
//  XMLScanner: Grammar preparsing
// ---------------------------------------------------------------------------
Grammar* XMLScanner::loadGrammar(const   XMLCh* const systemId
                                 , const short        grammarType
                                 , const bool         toCache)
{
    InputSource* srcToUse = 0;

    if (fEntityHandler){
        ReaderMgr::LastExtEntityInfo lastInfo;
        fReaderMgr.getLastExtEntityInfo(lastInfo);
        XMLResourceIdentifier resourceIdentifier(XMLResourceIdentifier::ExternalEntity,
                            systemId, 0, XMLUni::fgZeroLenString, lastInfo.systemId);
        srcToUse = fEntityHandler->resolveEntity(&resourceIdentifier);
    }

    //  First we try to parse it as a URL. If that fails, we assume its
    //  a file and try it that way.
    if (!srcToUse) {

        try
        {
            //  Create a temporary URL. Since this is the primary document,
            //  it has to be fully qualified. If not, then assume we are just
            //  mistaking a file for a URL.
            XMLURL tmpURL(fMemoryManager);

            if (XMLURL::parse(systemId, tmpURL)) {            

                if (tmpURL.isRelative())
                {
                    if (!fStandardUriConformant)
                        srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager);
                    else {
                        // since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr
                        // emit the error directly
                        MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_NoProtocolPresent, fMemoryManager);
                        fInException = true;
                        emitError
                        (
                            XMLErrs::XMLException_Fatal
                            , e.getType()
                            , e.getMessage()
                        );
                        return 0;
                    }
                }
                else
                {
                    if (fStandardUriConformant && tmpURL.hasInvalidChar()) {
                        MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL, fMemoryManager);
                        fInException = true;
                        emitError
                        (
                            XMLErrs::XMLException_Fatal
                            , e.getType()
                            , e.getMessage()
                        );
                        return 0;
                    }
                    srcToUse = new (fMemoryManager) URLInputSource(tmpURL, fMemoryManager);
                }
            }
            else         
            {
                if (!fStandardUriConformant)
                    srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager);
                else {
                    // since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr
                    // emit the error directly
                    // lazy bypass ... since all MalformedURLException are fatal, no need to check the type
                    MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL);
                    fInException = true;
                    emitError
                    (
                        XMLErrs::XMLException_Fatal
                        , e.getType()
                        , e.getMessage()
                    );
                    return 0;
                }
            }
        }
        catch(const XMLException& excToCatch)
        {
            //  For any other XMLException,
            //  emit the error and catch any user exception thrown from here.
            fInException = true;
            if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
                emitError
                (
                    XMLErrs::XMLException_Warning
                    , excToCatch.getType()
                    , excToCatch.getMessage()
                );
            else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
                emitError
                (
                    XMLErrs::XMLException_Fatal
                    , excToCatch.getType()
                    , excToCatch.getMessage()
                );
            else
                emitError
                (
                    XMLErrs::XMLException_Error
                    , excToCatch.getType()
                    , excToCatch.getMessage()
                );
                return 0;
        }
    }

    Janitor<InputSource> janSrc(srcToUse);
    return loadGrammar(*srcToUse, grammarType, toCache);
}

Grammar* XMLScanner::loadGrammar(const   char* const systemId
                                 , const short       grammarType
                                 , const bool        toCache)
{
    // We just delegate this to the XMLCh version after transcoding
    XMLCh* tmpBuf = XMLString::transcode(systemId, fMemoryManager);
    ArrayJanitor<XMLCh> janBuf(tmpBuf, fMemoryManager);
    return loadGrammar(tmpBuf, grammarType, toCache);
}


// ---------------------------------------------------------------------------
//  XMLScanner: Setter methods
// ---------------------------------------------------------------------------
void XMLScanner::setURIStringPool(XMLStringPool* const stringPool)
{
    fURIStringPool = stringPool;
    fEmptyNamespaceId   = fURIStringPool->addOrFind(XMLUni::fgZeroLenString);
    fUnknownNamespaceId = fURIStringPool->addOrFind(XMLUni::fgUnknownURIName);
    fXMLNamespaceId     = fURIStringPool->addOrFind(XMLUni::fgXMLURIName);
    fXMLNSNamespaceId   = fURIStringPool->addOrFind(XMLUni::fgXMLNSURIName);
}

// ---------------------------------------------------------------------------
//  XMLScanner: Private helper methods
// ---------------------------------------------------------------------------

/***
 * In reusing grammars (cacheing grammar from parse, or use cached grammar), internal
 * dtd is allowed conditionally.
 *
 * In the case of cacheing grammar from parse, it is NOT allowed.
 *
 * In the case of use cached grammar,
 *   if external dtd is present and it is parsed before, then it is not allowed,
 *   otherwise it is allowed.
 *
 ***/
void XMLScanner::checkInternalDTD(bool hasExtSubset
                                 ,const XMLCh* const sysId
                                 ,const XMLCh* const pubId)
{
    if (fToCacheGrammar)
        ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Val_CantHaveIntSS, fMemoryManager);

    if (fUseCachedGrammar && hasExtSubset )
    {
        InputSource* sysIdSrc = resolveSystemId(sysId, pubId);
        Janitor<InputSource> janSysIdSrc(sysIdSrc);
        Grammar* grammar = fGrammarResolver->getGrammar(sysIdSrc->getSystemId());

        if (grammar && grammar->getGrammarType() == Grammar::DTDGrammarType) 
        {
            ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Val_CantHaveIntSS, fMemoryManager);
        }
    }

}

//  This method is called after the content scan to insure that all the
//  ID/IDREF attributes match up (i.e. that all IDREFs refer to IDs.) This is
//  an XML 1.0 rule, so we can do here in the core.

void XMLScanner::checkIDRefs()
{
    //  Iterate the id ref list. If we find any entries here which are used
    //  but not declared, then that's an error.
    RefHashTableOfEnumerator<XMLRefInfo> refEnum(fValidationContext->getIdRefList(), false, fMemoryManager);
    while (refEnum.hasMoreElements())
    {
        // Get a ref to the current element
        const XMLRefInfo& curRef = refEnum.nextElement();

        // If its used but not declared, then its an error
        if (!curRef.getDeclared() && curRef.getUsed() && fValidate)
            fValidator->emitError(XMLValid::IDNotDeclared, curRef.getRefName());
    }
}


//  This just does a simple check that the passed progressive scan token is
//  legal for this scanner.
bool XMLScanner::isLegalToken(const XMLPScanToken& toCheck)
{
    return ((fScannerId == toCheck.fScannerId)
    &&      (fSequenceId == toCheck.fSequenceId));
}


//  This method will handle figuring out what the next top level token is
//  in the input stream. It will return an enumerated value that indicates
//  what it believes the next XML level token must be. It will eat as many
//  chars are required to figure out what is next.
XMLScanner::XMLTokens XMLScanner::senseNextToken(unsigned int& orgReader)
{
    //  Get the next character and use it to guesstimate what the next token
    //  is going to be. We turn on end of entity exceptions when we do this
    //  in order to catch the scenario where the current entity ended at
    //  the > of some markup.
    XMLCh nextCh;
    {
        ThrowEOEJanitor janMgr(&fReaderMgr, true);
        nextCh = fReaderMgr.peekNextChar();
    }

    //  Check for special chars. Start with the most
    //  obvious end of file, which should be legal here at top level.
    if (!nextCh)
        return Token_EOF;


    //  If it's not a '<' we must be in content.
    //
    //  This includes entity references '&' of some sort. These must
    //  be character data because that's the only place a reference can
    //  occur in content.
    if (nextCh != chOpenAngle)
        return Token_CharData;

    //  Ok it had to have been a '<' character. So get it out of the reader
    //  and store the reader number where we saw it, passing it back to the
    //  caller.
    fReaderMgr.getNextChar();
    orgReader = fReaderMgr.getCurrentReaderNum();

    //  Ok, so lets go through the things that it could be at this point which
    //  are all some form of markup.
    nextCh = fReaderMgr.peekNextChar();

    if (nextCh == chForwardSlash)
    {
        fReaderMgr.getNextChar();
        return Token_EndTag;
    }
    else if (nextCh == chBang)
    {
        static const XMLCh gCDATAStr[] =
        {
                chBang, chOpenSquare, chLatin_C, chLatin_D, chLatin_A
            ,   chLatin_T, chLatin_A, chNull
        };

        static const XMLCh gCommentString[] =
        {
            chBang, chDash, chDash, chNull
        };

        if (fReaderMgr.skippedString(gCDATAStr))
            return Token_CData;

        if (fReaderMgr.skippedString(gCommentString))
            return Token_Comment;

        emitError(XMLErrs::ExpectedCommentOrCDATA);
        return Token_Unknown;
    }
    else if (nextCh == chQuestion)
    {
        // It must be a PI
        fReaderMgr.getNextChar();
        return Token_PI;
    }

    //  Assume its an element name, so return with a start tag token. If it
    //  turns out not to be, then it will fail when it cannot get a valid tag.
    return Token_StartTag;
}

// ---------------------------------------------------------------------------
//  XMLScanner: Private parsing methods
// ---------------------------------------------------------------------------

//  This guy just scans out a single or double quoted string of characters.
//  It does not pass any judgement on the contents and assumes that it is
//  illegal to have another quote of the same kind inside the string's
//  contents.
//
//  NOTE: This is for simple stuff like the strings in the XMLDecl which
//  cannot have any entities inside them. So this guy does not handle any
//  end of entity stuff.
bool XMLScanner::getQuotedString(XMLBuffer& toFill)
{
    // Reset the target buffer
    toFill.reset();

    // Get the next char which must be a single or double quote
    XMLCh quoteCh;
    if (!fReaderMgr.skipIfQuote(quoteCh))
        return false;

    while (true)
    {
        // Get another char
        const XMLCh nextCh = fReaderMgr.getNextChar();

        // See if it matches the starting quote char
        if (nextCh == quoteCh)
            break;

        //  We should never get either an end of file null char here. If we
        //  do, just fail. It will be handled more gracefully in the higher
        //  level code that called us.
        if (!nextCh)
            return false;

        // Else add it to the buffer
        toFill.append(nextCh);
    }
    return true;
}


//  This method scans a character reference and returns the character that
//  was refered to. It assumes that we've already scanned the &# characters
//  that prefix the numeric code.
bool XMLScanner::scanCharRef(XMLCh& toFill, XMLCh& second)
{
    bool gotOne = false;
    unsigned int value = 0;

    //  Set the radix. Its supposed to be a lower case x if hex. But, in
    //  order to recover well, we check for an upper and put out an error
    //  for that.
    unsigned int radix = 10;
    if (fReaderMgr.skippedChar(chLatin_x))
    {
        radix = 16;
    }
    else if (fReaderMgr.skippedChar(chLatin_X))
    {
        emitError(XMLErrs::HexRadixMustBeLowerCase);
        radix = 16;
    }

    while (true)
    {
        const XMLCh nextCh = fReaderMgr.peekNextChar();

        // Watch for EOF
        if (!nextCh)
            ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);

        // Break out on the terminating semicolon
        if (nextCh == chSemiColon)
        {
            fReaderMgr.getNextChar();
            break;
        }

        //  Convert this char to a binary value, or bail out if its not
        //  one.
        unsigned int nextVal;
        if ((nextCh >= chDigit_0) && (nextCh <= chDigit_9))
            nextVal = (unsigned int)(nextCh - chDigit_0);
        else if ((nextCh >= chLatin_A) && (nextCh <= chLatin_F))
            nextVal= (unsigned int)(10 + (nextCh - chLatin_A));
        else if ((nextCh >= chLatin_a) && (nextCh <= chLatin_f))
            nextVal = (unsigned int)(10 + (nextCh - chLatin_a));
        else
        {
            // Return a zero
            toFill = 0;

            //  If we got at least a sigit, then do an unterminated ref error.
            //  Else, do an expected a numerical ref thing.
            if (gotOne)
                emitError(XMLErrs::UnterminatedCharRef);
            else
                emitError(XMLErrs::ExpectedNumericalCharRef);

            // Return failure
            return false;
        }

        //  Make sure its valid for the radix. If not, then just eat the
        //  digit and go on after issueing an error. Else, update the
        //  running value with this new digit.
        if (nextVal >= radix)
        {
            XMLCh tmpStr[2];
            tmpStr[0] = nextCh;
            tmpStr[1] = chNull;
            emitError(XMLErrs::BadDigitForRadix, tmpStr);
        }
        else
        {
            value = (value * radix) + nextVal;
            // Guard against overflow.
            if (value > 0x10FFFF) {
                // Character reference was not in the valid range
                emitError(XMLErrs::InvalidCharacterRef);
                return false;
            }
        }

        // Indicate that we got at least one good digit
        gotOne = true;

        // And eat the last char
        fReaderMgr.getNextChar();
    }

    // Return the char (or chars)
    // And check if the character expanded is valid or not
    if (value >= 0x10000 && value <= 0x10FFFF)
    {
        value -= 0x10000;
        toFill = XMLCh((value >> 10) + 0xD800);
        second = XMLCh((value & 0x3FF) + 0xDC00);
    }
    else if (value <= 0xFFFD)
    {
        toFill = XMLCh(value);
        second = 0;
        if (!fReaderMgr.getCurrentReader()->isXMLChar(toFill) && !fReaderMgr.getCurrentReader()->isControlChar(toFill)) {
            // Character reference was not in the valid range
            emitError(XMLErrs::InvalidCharacterRef);
            return false;
        }
    }
    else {
        // Character reference was not in the valid range
        emitError(XMLErrs::InvalidCharacterRef);
        return false;
    }

    return true;
}


//  We get here after the '<!--' part of the comment. We scan past the
//  terminating '-->' It will calls the appropriate handler with the comment
//  text, if one is provided. A comment can be in either the document or
//  the DTD, so the fInDocument flag is used to know which handler to send
//  it to.
void XMLScanner::scanComment()
{

    enum States
    {
        InText
        , OneDash
        , TwoDashes
    };

    // Get a buffer for this
    XMLBufBid bbComment(&fBufMgr);

    //  Get the comment text into a temp buffer. Be sure to use temp buffer
    //  two here, since its to be used for stuff that is potentially longer
    //  than just a name.
    States curState = InText;
    bool gotLeadingSurrogate = false;
    while (true)
    {
        // Get the next character
        const XMLCh nextCh = fReaderMgr.getNextChar();

        //  Watch for an end of file
        if (!nextCh)
        {
            emitError(XMLErrs::UnterminatedComment);
            ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
        }

        // Check for correct surrogate pairs
        if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
        {
            if (gotLeadingSurrogate)
                emitError(XMLErrs::Expected2ndSurrogateChar);
            else
                gotLeadingSurrogate = true;
        }
        else
        {
            if (gotLeadingSurrogate)
            {
                if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
                    emitError(XMLErrs::Expected2ndSurrogateChar);
            }
            // Its got to at least be a valid XML character
            else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) {

                XMLCh tmpBuf[9];
                XMLString::binToText
                (
                    nextCh
                    , tmpBuf
                    , 8
                    , 16
                    , fMemoryManager
                );
                emitError(XMLErrs::InvalidCharacter, tmpBuf);
            }

            gotLeadingSurrogate = false;
        }

        if (curState == InText)
        {
            // If its a dash, go to OneDash state. Otherwise take as text
            if (nextCh == chDash)
                curState = OneDash;
            else
                bbComment.append(nextCh);
        }
        else if (curState == OneDash)
        {
            //  If its another dash, then we change to the two dashes states.
            //  Otherwise, we have to put in the deficit dash and the new
            //  character and go back to InText.
            if (nextCh == chDash)
            {
                curState = TwoDashes;
            }
            else
            {
                bbComment.append(chDash);
                bbComment.append(nextCh);
                curState = InText;
            }
        }
        else if (curState == TwoDashes)
        {
            // The next character must be the closing bracket
            if (nextCh != chCloseAngle)
            {
                emitError(XMLErrs::IllegalSequenceInComment);
                fReaderMgr.skipPastChar(chCloseAngle);
                return;
            }
            break;
        }
    }

    // If we have an available handler, call back with the comment.
    if (fDocHandler)
    {
        fDocHandler->docComment
        (
            bbComment.getRawBuffer()
        );
    }

    //mark comment is seen within the current element
    if (! fElemStack.isEmpty())
        fElemStack.setCommentOrPISeen();

}


//  Most equal signs can have white space around them, so this little guy
//  just makes the calling code cleaner by eating whitespace.
bool XMLScanner::scanEq(bool inDecl)
{
    fReaderMgr.skipPastSpaces(inDecl);
    if (fReaderMgr.skippedChar(chEqual))
    {
        fReaderMgr.skipPastSpaces(inDecl);
        return true;
    }
    return false;
}


unsigned int
XMLScanner::scanUpToWSOr(XMLBuffer& toFill, const XMLCh chEndChar)
{
    fReaderMgr.getUpToCharOrWS(toFill, chEndChar);
    return toFill.getLen();
}

unsigned int *XMLScanner::getNewUIntPtr()
{
    // this method hands back a new pointer initialized to 0
    unsigned int *retVal;
    if(fUIntPoolCol < 64)
    {
        retVal = fUIntPool[fUIntPoolRow]+fUIntPoolCol;
        fUIntPoolCol++;
        return retVal;
    }
    // time to grow the pool...
    if(fUIntPoolRow+1 == fUIntPoolRowTotal)
    {
        // and time to add some space for new rows:
        fUIntPoolRowTotal <<= 1;
        unsigned int **newArray = (unsigned int **)fMemoryManager->allocate(sizeof(unsigned int *) * fUIntPoolRowTotal );
        memcpy(newArray, fUIntPool, (fUIntPoolRow+1) * sizeof(unsigned int *));
        fMemoryManager->deallocate(fUIntPool);
        fUIntPool = newArray;