XMLScanner.cpp

            originalExceptCode
            , XMLUni::fgExceptDomain    //fgXMLErrDomain
            , XMLErrs::errorType(toEmit)
            , errText
            , lastInfo.systemId
            , lastInfo.publicId
            , lastInfo.lineNumber
            , lastInfo.colNumber
        );
    }

    // Bail out if its fatal an we are to give up on the first fatal error
    if (emitErrorWillThrowException(toEmit))
        throw toEmit;
}

// ---------------------------------------------------------------------------
//  XMLScanner: Getter methods
// ---------------------------------------------------------------------------

//  This method allows the caller to query the current location of the scanner.
//  It will return the sys/public ids of the current entity, and the line/col
//  position within it.
//
//  NOTE: This API returns the location with the last external file. So if its
//  currently scanning an entity, the position returned will be the end of
//  the entity reference in the file that had the reference.
//
/*bool
XMLScanner::getLastExtLocation(         XMLCh* const    sysIdToFill
                                , const unsigned int    maxSysIdChars
                                ,       XMLCh* const    pubIdToFill
                                , const unsigned int    maxPubIdChars
                                ,       XMLSSize_t&     lineToFill
                                ,       XMLSSize_t&     colToFill) const
{
    // Create a local info object and get it filled in by the reader manager
    ReaderMgr::LastExtEntityInfo lastInfo;
    fReaderMgr.getLastExtEntityInfo(lastInfo);

    // Fill in the line and column number
    lineToFill = lastInfo.lineNumber;
    colToFill = lastInfo.colNumber;

    // And copy over as much of the ids as will fit
    sysIdToFill[0] = 0;
    if (lastInfo.systemId)
    {
        if (XMLString::stringLen(lastInfo.systemId) > maxSysIdChars)
            return false;
        XMLString::copyString(sysIdToFill, lastInfo.systemId);
    }

    pubIdToFill[0] = 0;
    if (lastInfo.publicId)
    {
        if (XMLString::stringLen(lastInfo.publicId) > maxPubIdChars)
            return false;
        XMLString::copyString(pubIdToFill, lastInfo.publicId);
    }
    return true;
}*/


// ---------------------------------------------------------------------------
//  XMLScanner: Private scanning methods
// ---------------------------------------------------------------------------

//  This method is called after the end of the root element, to handle
//  any miscellaneous stuff hanging around.
void XMLScanner::scanMiscellaneous()
{
    // Get a buffer for this work
    XMLBufBid bbCData(&fBufMgr);

    while (true)
    {
        try
        {
            const XMLCh nextCh = fReaderMgr.peekNextChar();

            // Watch for end of file and break out
            if (!nextCh)
                break;

            if (nextCh == chOpenAngle)
            {
                if (checkXMLDecl(true))
                {
                    // Can't have an XML decl here
                    emitError(XMLErrs::NotValidAfterContent);
                    fReaderMgr.skipPastChar(chCloseAngle);
                }
                else if (fReaderMgr.skippedString(XMLUni::fgPIString))
                {
                    scanPI();
                }
                 else if (fReaderMgr.skippedString(XMLUni::fgCommentString))
                {
                    scanComment();
                }
                else
                {
                    // This can't be possible, so just give up
                    emitError(XMLErrs::ExpectedCommentOrPI);
                    fReaderMgr.skipPastChar(chCloseAngle);
                }
            }
            else if (fReaderMgr.getCurrentReader()->isWhitespace(nextCh))
            {
                //  If we have a doc handler, then gather up the spaces and
                //  call back. Otherwise, just skip over whitespace.
                if (fDocHandler)
                {
                    fReaderMgr.getSpaces(bbCData.getBuffer());
                    fDocHandler->ignorableWhitespace
                    (
                        bbCData.getRawBuffer()
                        , bbCData.getLen()
                        , false
                    );
                }
                else
                {
                    fReaderMgr.skipPastSpaces();
                }
            }
            else
            {
                emitError(XMLErrs::ExpectedCommentOrPI);
                fReaderMgr.skipPastChar(chCloseAngle);
            }
        }
        catch(const EndOfEntityException&)
        {
            //  Some entity leaked out of the content part of the document. Issue
            //  a warning and keep going.
            emitError(XMLErrs::EntityPropogated);
        }
    }
}


//  Scans a PI and calls the appropriate callbacks. At entry we have just
//  scanned the <? part, and need to now start on the PI target name.
void XMLScanner::scanPI()
{
    const XMLCh* namePtr = 0;
    const XMLCh* targetPtr = 0;

    //  If there are any spaces here, then warn about it. If we aren't in
    //  'first error' mode, then we'll come back and can easily pick up
    //  again by just skipping them.
    if (fReaderMgr.lookingAtSpace())
    {
        emitError(XMLErrs::PINameExpected);
        fReaderMgr.skipPastSpaces();
    }

    // Get a buffer for the PI name and scan it in
    XMLBufBid bbName(&fBufMgr);
    if (!fReaderMgr.getName(bbName.getBuffer()))
    {
        emitError(XMLErrs::PINameExpected);
        fReaderMgr.skipPastChar(chCloseAngle);
        return;
    }

    // Point the name pointer at the raw data
    namePtr = bbName.getRawBuffer();

    // See if it is some form of 'xml' and emit a warning
    //if (!XMLString::compareIString(namePtr, XMLUni::fgXMLString))
    if (bbName.getLen() == 3 &&
        (((namePtr[0] == chLatin_x) || (namePtr[0] == chLatin_X)) &&
         ((namePtr[1] == chLatin_m) || (namePtr[1] == chLatin_M)) &&
         ((namePtr[2] == chLatin_l) || (namePtr[2] == chLatin_L))))   
        emitError(XMLErrs::NoPIStartsWithXML);

    // If namespaces are enabled, then no colons allowed
    if (fDoNamespaces)
    {
        if (XMLString::indexOf(namePtr, chColon) != -1)
            emitError(XMLErrs::ColonNotLegalWithNS);
    }

    //  If we don't hit a space next, then the PI has no target. If we do
    //  then get out the target. Get a buffer for it as well
    XMLBufBid bbTarget(&fBufMgr);
    if (fReaderMgr.skippedSpace())
    {
        // Skip any leading spaces
        fReaderMgr.skipPastSpaces();

        bool gotLeadingSurrogate = false;

        // It does have a target, so lets move on to deal with that.
        while (1)
        {
            const XMLCh nextCh = fReaderMgr.getNextChar();

            // Watch for an end of file, which is always bad here
            if (!nextCh)
            {
                emitError(XMLErrs::UnterminatedPI);
                ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
            }

            // Watch for potential terminating character
            if (nextCh == chQuestion)
            {
                // It must be followed by '>' to be a termination of the target
                if (fReaderMgr.skippedChar(chCloseAngle))
                    break;
            }

            // Check for correct surrogate pairs
            if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
            {
                if (gotLeadingSurrogate)
                    emitError(XMLErrs::Expected2ndSurrogateChar);
                else
                    gotLeadingSurrogate = true;
            }
             else
            {
                if (gotLeadingSurrogate)
                {
                    if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
                        emitError(XMLErrs::Expected2ndSurrogateChar);
                }
                // Its got to at least be a valid XML character
                else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) {

                    XMLCh tmpBuf[9];
                    XMLString::binToText
                    (
                        nextCh
                        , tmpBuf
                        , 8
                        , 16
                        , fMemoryManager
                    );
                    emitError(XMLErrs::InvalidCharacter, tmpBuf);
                }

                gotLeadingSurrogate = false;
            }

            bbTarget.append(nextCh);
        }
    }
    else
    {
        // No target, but make sure its terminated ok
        if (!fReaderMgr.skippedChar(chQuestion))
        {
            emitError(XMLErrs::UnterminatedPI);
            fReaderMgr.skipPastChar(chCloseAngle);
            return;
        }

        if (!fReaderMgr.skippedChar(chCloseAngle))
        {
            emitError(XMLErrs::UnterminatedPI);
            fReaderMgr.skipPastChar(chCloseAngle);
            return;
        }
    }

    // Point the target pointer at the raw data
    targetPtr = bbTarget.getRawBuffer();

    // If we have a handler, then call it
    if (fDocHandler)
    {
        fDocHandler->docPI
        (
            namePtr
            , targetPtr
       );
    }

    //mark PI is seen within the current element
    if (! fElemStack.isEmpty())
        fElemStack.setCommentOrPISeen();

}

//  Scans all the input from the start of the file to the root element.
//  There does not have to be anything in the prolog necessarily, but usually
//  there is at least an XMLDecl.
//
//  On exit from here we are either at the end of the file or about to read
//  the opening < of the root element.
void XMLScanner::scanProlog()
{
    bool sawDocTypeDecl = false;
    // Get a buffer for whitespace processing
    XMLBufBid bbCData(&fBufMgr);

    //  Loop through the prolog. If there is no content, this could go all
    //  the way to the end of the file.
    try
    {
        while (true)
        {
            const XMLCh nextCh = fReaderMgr.peekNextChar();

            if (nextCh == chOpenAngle)
            {
                //  Ok, it could be the xml decl, a comment, the doc type line,
                //  or the start of the root element.
                if (checkXMLDecl(true))
                {
                    // There shall be at lease --ONE-- space in between
                    // the tag '<?xml' and the VersionInfo.
                    //
                    //  If we are not at line 1, col 6, then the decl was not
                    //  the first text, so its invalid.
                    const XMLReader* curReader = fReaderMgr.getCurrentReader();
                    if ((curReader->getLineNumber() != 1)
                    ||  (curReader->getColumnNumber() != 7))
                    {
                        emitError(XMLErrs::XMLDeclMustBeFirst);
                    }

                    scanXMLDecl(Decl_XML);
                }
                else if (fReaderMgr.skippedString(XMLUni::fgPIString))
                {
                    scanPI();
                }
                 else if (fReaderMgr.skippedString(XMLUni::fgCommentString))
                {
                    scanComment();
                }
                 else if (fReaderMgr.skippedString(XMLUni::fgDocTypeString))
                {
                    if (sawDocTypeDecl) {
                        emitError(XMLErrs::DuplicateDocTypeDecl);
                    }
                    scanDocTypeDecl();
                    sawDocTypeDecl = true;

                    // if reusing grammar, this has been validated already in first scan
                    // skip for performance
                    if (fValidate && !fGrammar->getValidated()) {
                        //  validate the DTD scan so far
                        fValidator->preContentValidation(fUseCachedGrammar, true);
                    }
                }
                else
                {
                    // Assume its the start of the root element
                    return;
                }
            }
            else if (fReaderMgr.getCurrentReader()->isWhitespace(nextCh))
            {
                //  If we have a document handler then gather up the
                //  whitespace and call back. Otherwise just skip over spaces.
                if (fDocHandler)
                {
                    fReaderMgr.getSpaces(bbCData.getBuffer());
                    fDocHandler->ignorableWhitespace
                    (
                        bbCData.getRawBuffer()
                        , bbCData.getLen()
                        , false
                    );
                }
                 else
                {
                    fReaderMgr.skipPastSpaces();
                }
            }
             else
            {
                emitError(XMLErrs::InvalidDocumentStructure);

                // Watch for end of file and break out
                if (!nextCh)
                    break;
                else
                    fReaderMgr.skipPastChar(chCloseAngle);
            }

        }
    }
    catch(const EndOfEntityException&)
    {
        //  We should never get an end of entity here. They should only
        //  occur within the doc type scanning method, and not leak out to
        //  here.
        emitError
        (
            XMLErrs::UnexpectedEOE
            , "in prolog"
        );
    }
}


//  Scans the <?xml .... ?> line. This stuff is all sequential so we don't
//  do any state machine loop here. We just bull straight through it. It ends
//  past the closing bracket. If there is a document handler, then its called
//  on the XMLDecl callback.
//
//  On entry, the <?xml has been scanned, and we pick it up from there.
//
//  NOTE: In order to provide good recovery from bad XML here, we try to be
//  very flexible. No matter what order the stuff is in, we'll keep going
//  though we'll issue errors.
//
//  The parameter tells us which type of decl we should expect, Text or XML.
//    [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
//    [77] TextDecl::= '<?xml' VersionInfo? EncodingDecl S? '?>'
void XMLScanner::scanXMLDecl(const DeclTypes type)
{
    // Get us some buffers to use
    XMLBufBid bbVersion(&fBufMgr);
    XMLBufBid bbEncoding(&fBufMgr);
    XMLBufBid bbStand(&fBufMgr);
    XMLBufBid bbDummy(&fBufMgr);
    XMLBufBid bbName(&fBufMgr);

    //  We use this little enum and array to keep up with what we found
    //  and what order we found them in. This lets us get them free form
    //  without too much overhead, but still know that they were in the
    //  wrong order.
    enum Strings
    {
        VersionString
        , EncodingString
        , StandaloneString
        , UnknownString

        , StringCount
    };
    int flags[StringCount] = { -1, -1, -1, -1 };

    //  Also set up a list of buffers in the right order so that we know
    //  where to put stuff.
    XMLBuffer* buffers[StringCount] ;
    buffers[0] = &bbVersion.getBuffer();
    buffers[1] = &bbEncoding.getBuffer();
    buffers[2] = &bbStand.getBuffer();
    buffers[3] = &bbDummy.getBuffer();

    int curCount = 0;
    Strings curString;
    XMLBuffer& nameBuf = bbName.getBuffer();
    while (true)
    {
        // Skip any spaces
        const bool spaceCount = fReaderMgr.skipPastSpaces(true);

        // If we are looking at a question mark, then break out
        if (fReaderMgr.lookingAtChar(chQuestion))
            break;

        // If this is not the first string, then we require the spaces
        if (!spaceCount && curCount)
            emitError(XMLErrs::ExpectedWhitespace);

        //  Get characters up to the next whitespace or equal's sign.
        if (!scanUpToWSOr(nameBuf, chEqual))
            emitError(XMLErrs::ExpectedDeclString);

        // See if it matches any of our expected strings
        if (XMLString::equals(nameBuf.getRawBuffer(), XMLUni::fgVersionString))
            curString = VersionString;
        else if (XMLString::equals(nameBuf.getRawBuffer(), XMLUni::fgEncodingString))
            curString = EncodingString;
        else if (XMLString::equals(nameBuf.getRawBuffer(), XMLUni::fgStandaloneString))
            curString = StandaloneString;
        else
            curString = UnknownString;

        //  If its an unknown string, then give that error. Else check to
        //  see if this one has been done already and give that error.
        if (curString == UnknownString)
            emitError(XMLErrs::ExpectedDeclString, nameBuf.getRawBuffer());
        else if (flags[curString] != -1)
            emitError(XMLErrs::DeclStringRep, nameBuf.getRawBuffer());
        else if (flags[curString] == -1)
            flags[curString] = ++curCount;

        //  Scan for an equal's sign. If we don't find it, issue an error
        //  but keep trying to go on.
        if (!scanEq(true))
            emitError(XMLErrs::ExpectedEqSign);

        //  Get a quote string into the buffer for the string that we are
        //  currently working on.
        if (!getQuotedString(*buffers[curString]))
        {
            emitError(XMLErrs::ExpectedQuotedString);
            fReaderMgr.skipPastChar(chCloseAngle);
            return;
        }

        // And validate the value according which one it was
        const XMLCh* rawValue = buffers[curString]->getRawBuffer();
        if (curString == VersionString)
        {
            if (XMLString::equals(rawValue, XMLUni::fgVersion1_1)) {
                if (type == Decl_XML) {
                	fXMLVersion = XMLReader::XMLV1_1;
                    fReaderMgr.setXMLVersion(XMLReader::XMLV1_1);
                }
                else {
            	    if (fXMLVersion != XMLReader::XMLV1_1)
            	        emitError(XMLErrs::UnsupportedXMLVersion, rawValue);                
            	}
            }
            else if (XMLString::equals(rawValue, XMLUni::fgVersion1_0)) {
                if (type == Decl_XML) {
                	fXMLVersion = XMLReader::XMLV1_0;
                    fReaderMgr.setXMLVersion(XMLReader::XMLV1_0);                    
                }
            }
            else
                emitError(XMLErrs::UnsupportedXMLVersion, rawValue);
        }
         else if (curString == EncodingString)
        {
            if (!XMLString::isValidEncName(rawValue))
                emitError(XMLErrs::BadXMLEncoding, rawValue);
        }
         else if (curString == StandaloneString)
        {
            if (XMLString::equals(rawValue, XMLUni::fgYesString))
                fStandalone = true;
            else if (XMLString::equals(rawValue, XMLUni::fgNoString))
                fStandalone = false;
            else
            {
                emitError(XMLErrs::BadStandalone);
                //if (!XMLString::compareIString(rawValue, XMLUni::fgYesString))
                //else if (!XMLString::compareIString(rawValue, XMLUni::fgNoString))
                if (buffers[curString]->getLen() == 3 &&
                    (((rawValue[0] == chLatin_y) || (rawValue[0] == chLatin_Y)) &&
                     ((rawValue[1] == chLatin_e) || (rawValue[1] == chLatin_E)) &&
                     ((rawValue[2] == chLatin_s) || (rawValue[2] == chLatin_S))))   
                    fStandalone = true;                
                else if (buffers[curString]->getLen() == 2 &&
                    (((rawValue[0] == chLatin_n) || (rawValue[0] == chLatin_N)) &&
                     ((rawValue[1] == chLatin_o) || (rawValue[1] == chLatin_O))))   
                    fStandalone = false;
            }
        }
    }

    //  Make sure that the strings present are in order. We don't care about
    //  which ones are present at this point, just that any there are in the
    //  right order.
    int curTop = 0;
    for (int index = VersionString; index < StandaloneString; index++)
    {
        if (flags[index] != -1)
        {
            if (flags[index] !=  curTop + 1)
            {
                emitError(XMLErrs::DeclStringsInWrongOrder);
                break;
            }
            curTop = flags[index];
        }
    }

    //  If its an XML decl, the version must be present.
    //  If its a Text decl, then encoding must be present AND standalone must not be present.
    if ((type == Decl_XML) && (flags[VersionString] == -1))
        emitError(XMLErrs::XMLVersionRequired);
    else if (type == Decl_Text) {
        if (flags[StandaloneString] != -1)
            emitError(XMLErrs::StandaloneNotLegal);
        if (flags[EncodingString] == -1)
            emitError(XMLErrs::EncodingRequired);
    }

    if (!fReaderMgr.skippedChar(chQuestion))
    {
        emitError(XMLErrs::UnterminatedXMLDecl);
        fReaderMgr.skipPastChar(chCloseAngle);
    }
     else if (!fReaderMgr.skippedChar(chCloseAngle))
    {
        emitError(XMLErrs::UnterminatedXMLDecl);
        fReaderMgr.skipPastChar(chCloseAngle);
    }

    //  Do this before we possibly update the reader with the
    //  actual encoding string. Otherwise, we will pass the wrong thing
    //  for the last parameter!
    const XMLCh* actualEnc = fReaderMgr.getCurrentEncodingStr();

    //  Ok, we've now seen the real encoding string, if there was one, so
    //  lets call back on the current reader and tell it what the real
    //  encoding string was. If it fails, that's because it represents some
    //  sort of contradiction with the autosensed format, and it keeps the
    //  original encoding.
    //
    //  NOTE: This can fail for a number of reasons, such as a bogus encoding
    //  name or because its in flagrant contradiction of the auto-sensed
    //  format.
    if (flags[EncodingString] != -1)
    {
        if (!fReaderMgr.getCurrentReader()->setEncoding(bbEncoding.getRawBuffer()))
            emitError(XMLErrs::ContradictoryEncoding, bbEncoding.getRawBuffer());
        else
            actualEnc = bbEncoding.getRawBuffer();
    }

    //  If we have a document handler then call the XML Decl callback.
    if (type == Decl_XML)
    {
        if (fDocHandler)
            fDocHandler->XMLDecl
            (
                bbVersion.getRawBuffer()
                , bbEncoding.getRawBuffer()
                , bbStand.getRawBuffer()
                , actualEnc
            );
    }
    else if (type == Decl_Text)
    {
        if (fDocTypeHandler)
            fDocTypeHandler->TextDecl
            (
                bbVersion.getRawBuffer()
                , bbEncoding.getRawBuffer()
            );
    }
}

const XMLCh* XMLScanner::getURIText(const   unsigned int    uriId) const
{
    if (fURIStringPool->exists(uriId)) {
        // Look up the URI in the string pool and return its id
        const XMLCh* value = fURIStringPool->getValueForId(uriId);
        if (!value)
            return XMLUni::fgZeroLenString;

        return value;
    }
    else
        return XMLUni::fgZeroLenString;
}

bool XMLScanner::getURIText(  const   unsigned int    uriId
                      ,       XMLBuffer&      uriBufToFill) const
{
    if (fURIStringPool->exists(uriId)) {
        // Look up the URI in the string pool and return its id
        const XMLCh* value = fURIStringPool->getValueForId(uriId);
        if (!value)
            return false;

        uriBufToFill.set(value);
        return true;
    }
    else
        return false;
}

bool XMLScanner::checkXMLDecl(bool startWithAngle) {

    // [23] XMLDecl     ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
    // [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
    //
    // [3]  S           ::= (#x20 | #x9 | #xD | #xA)+
    if (startWithAngle) {
        if (fReaderMgr.peekString(XMLUni::fgXMLDeclString)) {
            if (fReaderMgr.skippedString(XMLUni::fgXMLDeclStringSpace)
               || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringHTab)
               || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringLF)
               || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringCR))
            {
                return true;
            }
        }
        else if (fReaderMgr.skippedString(XMLUni::fgXMLDeclStringSpaceU)
           || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringHTabU)
           || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringLFU)
           || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringCRU))
        {
            //  Just in case, check for upper case. If found, issue
            //  an error, but keep going.
            emitError(XMLErrs::XMLDeclMustBeLowerCase);
            return true;
        }
    }
    else {
        if (fReaderMgr.peekString(XMLUni::fgXMLString)) {
            if (fReaderMgr.skippedString(XMLUni::fgXMLStringSpace)
               || fReaderMgr.skippedString(XMLUni::fgXMLStringHTab)
               || fReaderMgr.skippedString(XMLUni::fgXMLStringLF)
               || fReaderMgr.skippedString(XMLUni::fgXMLStringCR))
            {
                return true;
            }
        }
        else if (fReaderMgr.skippedString(XMLUni::fgXMLStringSpaceU)
           || fReaderMgr.skippedString(XMLUni::fgXMLStringHTabU)
           || fReaderMgr.skippedString(XMLUni::fgXMLStringLFU)
           || fReaderMgr.skippedString(XMLUni::fgXMLStringCRU))
        {
            //  Just in case, check for upper case. If found, issue
            //  an error, but keep going.
            emitError(XMLErrs::XMLDeclMustBeLowerCase);
            return true;
        }
    }

    return false;
}


// ---------------------------------------------------------------------------
//  XMLScanner: Grammar preparsing
// ---------------------------------------------------------------------------
Grammar* XMLScanner::loadGrammar(const   XMLCh* const systemId
                                 , const short        grammarType
                                 , const bool         toCache)
{
    InputSource* srcToUse = 0;

    if (fEntityHandler){
        ReaderMgr::LastExtEntityInfo lastInfo;
        fReaderMgr.getLastExtEntityInfo(lastInfo);
        XMLResourceIdentifier resourceIdentifier(XMLResourceIdentifier::ExternalEntity,
                            systemId, 0, XMLUni::fgZeroLenString, lastInfo.systemId,
                            &fReaderMgr);
        srcToUse = fEntityHandler->resolveEntity(&resourceIdentifier);
    }

    //  First we try to parse it as a URL. If that fails, we assume its
    //  a file and try it that way.
    if (!srcToUse) {
        if (fDisableDefaultEntityResolution)
            return 0;

        try
        {
            //  Create a temporary URL. Since this is the primary document,
            //  it has to be fully qualified. If not, then assume we are just
            //  mistaking a file for a URL.
            XMLURL tmpURL(fMemoryManager);

            if (XMLURL::parse(systemId, tmpURL)) {            

                if (tmpURL.isRelative())
                {
                    if (!fStandardUriConformant)
                        srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager);
                    else {
                        // since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr
                        // emit the error directly
                        MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_NoProtocolPresent, fMemoryManager);
                        fInException = true;
                        emitError
                        (
                            XMLErrs::XMLException_Fatal
                            , e.getCode()
                            , e.getType()
                            , e.getMessage()
                        );
                        return 0;
                    }
                }
                else
                {
                    if (fStandardUriConformant && tmpURL.hasInvalidChar()) {
                        MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL, fMemoryManager);
                        fInException = true;
                        emitError
                        (
                            XMLErrs::XMLException_Fatal
                            , e.getCode()
                            , e.getType()
                            , e.getMessage()
                        );
                        return 0;
                    }
                    srcToUse = new (fMemoryManager) URLInputSource(tmpURL, fMemoryManager);
                }
            }
            else         
            {
                if (!fStandardUriConformant)
                    srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager);
                else {
                    // since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr
                    // emit the error directly
                    // lazy bypass ... since all MalformedURLException are fatal, no need to check the type
                    MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL);
                    fInException = true;
                    emitError
                    (
                        XMLErrs::XMLException_Fatal
                        , e.getCode()
                        , e.getType()
                        , e.getMessage()
                    );
                    return 0;
                }
            }
        }
        catch(const XMLException& excToCatch)
        {
            //  For any other XMLException,
            //  emit the error and catch any user exception thrown from here.
            fInException = true;
            if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
                emitError
                (
                    XMLErrs::XMLException_Warning
                    , excToCatch.getCode()
                    , excToCatch.getType()
                    , excToCatch.getMessage()
                );
            else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
                emitError
                (
                    XMLErrs::XMLException_Fatal
                    , excToCatch.getCode()
                    , excToCatch.getType()
                    , excToCatch.getMessage()
                );
            else
                emitError
                (
                    XMLErrs::XMLException_Error
                    , excToCatch.getCode()
                    , excToCatch.getType()
                    , excToCatch.getMessage()
                );
                return 0;
        }
    }

    Janitor<InputSource> janSrc(srcToUse);
    return loadGrammar(*srcToUse, grammarType, toCache);
}

Grammar* XMLScanner::loadGrammar(const   char* const systemId
                                 , const short       grammarType
                                 , const bool        toCache)
{
    // We just delegate this to the XMLCh version after transcoding
    XMLCh* tmpBuf = XMLString::transcode(systemId, fMemoryManager);
    ArrayJanitor<XMLCh> janBuf(tmpBuf, fMemoryManager);
    return loadGrammar(tmpBuf, grammarType, toCache);
}


// ---------------------------------------------------------------------------
//  XMLScanner: Setter methods
// ---------------------------------------------------------------------------
void XMLScanner::setURIStringPool(XMLStringPool* const stringPool)
{
    fURIStringPool = stringPool;
    fEmptyNamespaceId   = fURIStringPool->addOrFind(XMLUni::fgZeroLenString);
    fUnknownNamespaceId = fURIStringPool->addOrFind(XMLUni::fgUnknownURIName);
    fXMLNamespaceId     = fURIStringPool->addOrFind(XMLUni::fgXMLURIName);
    fXMLNSNamespaceId   = fURIStringPool->addOrFind(XMLUni::fgXMLNSURIName);
}

// ---------------------------------------------------------------------------
//  XMLScanner: Private helper methods
// ---------------------------------------------------------------------------

/***
 * In reusing grammars (cacheing grammar from parse, or use cached grammar), internal
 * dtd is allowed conditionally.
 *
 * In the case of cacheing grammar from parse, it is NOT allowed.
 *
 * In the case of use cached grammar,
 *   if external dtd is present and it is parsed before, then it is not allowed,
 *   otherwise it is allowed.
 *
 ***/
void XMLScanner::checkInternalDTD(bool hasExtSubset
                                 ,const XMLCh* const sysId
                                 ,const XMLCh* const pubId)
{
    if (fToCacheGrammar)
        ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Val_CantHaveIntSS, fMemoryManager);

    if (fUseCachedGrammar && hasExtSubset && !fIgnoreCachedDTD)
    {
        InputSource* sysIdSrc = resolveSystemId(sysId, pubId);
        if (sysIdSrc) {
            Janitor<InputSource> janSysIdSrc(sysIdSrc);
            Grammar* grammar = fGrammarResolver->getGrammar(sysIdSrc->getSystemId());

            if (grammar && grammar->getGrammarType() == Grammar::DTDGrammarType) 
            {
                ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Val_CantHaveIntSS, fMemoryManager);
            }
        }
    }

}

//  This method is called after the content scan to insure that all the
//  ID/IDREF attributes match up (i.e. that all IDREFs refer to IDs.) This is
//  an XML 1.0 rule, so we can do here in the core.

void XMLScanner::checkIDRefs()
{
    //  Iterate the id ref list. If we find any entries here which are used
    //  but not declared, then that's an error.
    RefHashTableOfEnumerator<XMLRefInfo> refEnum(fValidationContext->getIdRefList(), false, fMemoryManager);
    while (refEnum.hasMoreElements())
    {
        // Get a ref to the current element
        const XMLRefInfo& curRef = refEnum.nextElement();

        // If its used but not declared, then its an error
        if (!curRef.getDeclared() && curRef.getUsed() && fValidate)
            fValidator->emitError(XMLValid::IDNotDeclared, curRef.getRefName());
    }
}


//  This just does a simple check that the passed progressive scan token is
//  legal for this scanner.
bool XMLScanner::isLegalToken(const XMLPScanToken& toCheck)
{
    return ((fScannerId == toCheck.fScannerId)
    &&      (fSequenceId == toCheck.fSequenceId));
}


//  This method will handle figuring out what the next top level token is
//  in the input stream. It will return an enumerated value that indicates
//  what it believes the next XML level token must be. It will eat as many
//  chars are required to figure out what is next.
XMLScanner::XMLTokens XMLScanner::senseNextToken(unsigned int& orgReader)
{
    //  Get the next character and use it to guesstimate what the next token
    //  is going to be. We turn on end of entity exceptions when we do this
    //  in order to catch the scenario where the current entity ended at
    //  the > of some markup.
    XMLCh nextCh;

    // avoid setting up the ThrowEOEJanitor if we know that we have data in the current reader
    if(fReaderMgr.getCurrentReader() && fReaderMgr.getCurrentReader()->charsLeftInBuffer()>0)
        nextCh = fReaderMgr.peekNextChar();
    else
    {
        ThrowEOEJanitor janMgr(&fReaderMgr, true);
        nextCh = fReaderMgr.peekNextChar();
    }

    //  Check for special chars. Start with the most
    //  obvious end of file, which should be legal here at top level.
    if (!nextCh)
        return Token_EOF;


    //  If it's not a '<' we must be in content.
    //
    //  This includes entity references '&' of some sort. These must
    //  be character data because that's the only place a reference can
    //  occur in content.
    if (nextCh != chOpenAngle)
        return Token_CharData;

    //  Ok it had to have been a '<' character. So get it out of the reader
    //  and store the reader number where we saw it, passing it back to the
    //  caller.
    fReaderMgr.getNextChar();
    orgReader = fReaderMgr.getCurrentReaderNum();

    //  Ok, so lets go through the things that it could be at this point which
    //  are all some form of markup.
    nextCh = fReaderMgr.peekNextChar();

    if (nextCh == chForwardSlash)
    {
        fReaderMgr.getNextChar();
        return Token_EndTag;
    }
    else if (nextCh == chBang)
    {
        static const XMLCh gCDATAStr[] =
        {
                chBang, chOpenSquare, chLatin_C, chLatin_D, chLatin_A
            ,   chLatin_T, chLatin_A, chNull
        };

        static const XMLCh gCommentString[] =
        {