XMLScanner.cpp

    if (!XMLString::compareIString(namePtr, XMLUni::fgXMLString))
        emitError(XMLErrs::NoPIStartsWithXML);

    // If namespaces are enabled, then no colons allowed
    if (fDoNamespaces)
    {
        if (XMLString::indexOf(namePtr, chColon) != -1)
            emitError(XMLErrs::ColonNotLegalWithNS);
    }

    //  If we don't hit a space next, then the PI has no target. If we do
    //  then get out the target. Get a buffer for it as well
    XMLBufBid bbTarget(&fBufMgr);
    if (fReaderMgr.skippedSpace())
    {
        // Skip any leading spaces
        fReaderMgr.skipPastSpaces();

        bool gotLeadingSurrogate = false;

        // It does have a target, so lets move on to deal with that.
        while (1)
        {
            const XMLCh nextCh = fReaderMgr.getNextChar();

            // Watch for an end of file, which is always bad here
            if (!nextCh)
            {
                emitError(XMLErrs::UnterminatedPI);
                ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
            }

            // Watch for potential terminating character
            if (nextCh == chQuestion)
            {
                // It must be followed by '>' to be a termination of the target
                if (fReaderMgr.skippedChar(chCloseAngle))
                    break;
            }

            // Check for correct surrogate pairs
            if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
            {
                if (gotLeadingSurrogate)
                    emitError(XMLErrs::Expected2ndSurrogateChar);
                else
                    gotLeadingSurrogate = true;
            }
             else
            {
                if (gotLeadingSurrogate)
                {
                    if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
                        emitError(XMLErrs::Expected2ndSurrogateChar);
                }
                // Its got to at least be a valid XML character
                else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) {

                    XMLCh tmpBuf[9];
                    XMLString::binToText
                    (
                        nextCh
                        , tmpBuf
                        , 8
                        , 16
                    );
                    emitError(XMLErrs::InvalidCharacter, tmpBuf);
                }

                gotLeadingSurrogate = false;
            }

            bbTarget.append(nextCh);
        }
    }
    else
    {
        // No target, but make sure its terminated ok
        if (!fReaderMgr.skippedChar(chQuestion))
        {
            emitError(XMLErrs::UnterminatedPI);
            fReaderMgr.skipPastChar(chCloseAngle);
            return;
        }

        if (!fReaderMgr.skippedChar(chCloseAngle))
        {
            emitError(XMLErrs::UnterminatedPI);
            fReaderMgr.skipPastChar(chCloseAngle);
            return;
        }
    }

    // Point the target pointer at the raw data
    targetPtr = bbTarget.getRawBuffer();

    // If we have a handler, then call it
    if (fDocHandler)
    {
        fDocHandler->docPI
        (
            namePtr
            , targetPtr
       );
    }
}

//  Scans all the input from the start of the file to the root element.
//  There does not have to be anything in the prolog necessarily, but usually
//  there is at least an XMLDecl.
//
//  On exit from here we are either at the end of the file or about to read
//  the opening < of the root element.
void XMLScanner::scanProlog()
{
    // Get a buffer for whitespace processing
    XMLBufBid bbCData(&fBufMgr);

    //  Loop through the prolog. If there is no content, this could go all
    //  the way to the end of the file.
    try
    {
        while (true)
        {
            const XMLCh nextCh = fReaderMgr.peekNextChar();

            if (nextCh == chOpenAngle)
            {
                //  Ok, it could be the xml decl, a comment, the doc type line,
                //  or the start of the root element.
                if (checkXMLDecl(true))
                {
                    // There shall be at lease --ONE-- space in between
                    // the tag '<?xml' and the VersionInfo.
                    //
                    //  If we are not at line 1, col 6, then the decl was not
                    //  the first text, so its invalid.
                    const XMLReader* curReader = fReaderMgr.getCurrentReader();
                    if ((curReader->getLineNumber() != 1)
                    ||  (curReader->getColumnNumber() != 7))
                    {
                        emitError(XMLErrs::XMLDeclMustBeFirst);
                    }

                    scanXMLDecl(Decl_XML);
                }
                else if (fReaderMgr.skippedString(XMLUni::fgPIString))
                {
                    scanPI();
                }
                 else if (fReaderMgr.skippedString(XMLUni::fgCommentString))
                {
                    scanComment();
                }
                 else if (fReaderMgr.skippedString(XMLUni::fgDocTypeString))
                {
                    scanDocTypeDecl();

                    // if reusing grammar, this has been validated already in first scan
                    // skip for performance
                    if (fValidate && !fGrammar->getValidated()) {
                        //  validate the DTD scan so far
                        fValidator->preContentValidation(fUseCachedGrammar, true);
                    }
                }
                else
                {
                    // Assume its the start of the root element
                    return;
                }
            }
            else if (fReaderMgr.getCurrentReader()->isWhitespace(nextCh))
            {
                //  If we have a document handler then gather up the
                //  whitespace and call back. Otherwise just skip over spaces.
                if (fDocHandler)
                {
                    fReaderMgr.getSpaces(bbCData.getBuffer());
                    fDocHandler->ignorableWhitespace
                    (
                        bbCData.getRawBuffer()
                        , bbCData.getLen()
                        , false
                    );
                }
                 else
                {
                    fReaderMgr.skipPastSpaces();
                }
            }
             else
            {
                emitError(XMLErrs::InvalidDocumentStructure);

                // Watch for end of file and break out
                if (!nextCh)
                    break;
                else
                    fReaderMgr.skipPastChar(chCloseAngle);
            }

        }
    }
    catch(const EndOfEntityException&)
    {
        //  We should never get an end of entity here. They should only
        //  occur within the doc type scanning method, and not leak out to
        //  here.
        emitError
        (
            XMLErrs::UnexpectedEOE
            , "in prolog"
        );
    }
}


//  Scans the <?xml .... ?> line. This stuff is all sequential so we don't
//  do any state machine loop here. We just bull straight through it. It ends
//  past the closing bracket. If there is a document handler, then its called
//  on the XMLDecl callback.
//
//  On entry, the <?xml has been scanned, and we pick it up from there.
//
//  NOTE: In order to provide good recovery from bad XML here, we try to be
//  very flexible. No matter what order the stuff is in, we'll keep going
//  though we'll issue errors.
//
//  The parameter tells us which type of decl we should expect, Text or XML.
//    [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
//    [77] TextDecl::= '<?xml' VersionInfo? EncodingDecl S? '?>'
void XMLScanner::scanXMLDecl(const DeclTypes type)
{
    // Get us some buffers to use
    XMLBufBid bbVersion(&fBufMgr);
    XMLBufBid bbEncoding(&fBufMgr);
    XMLBufBid bbStand(&fBufMgr);
    XMLBufBid bbDummy(&fBufMgr);
    XMLBufBid bbName(&fBufMgr);

    //  We use this little enum and array to keep up with what we found
    //  and what order we found them in. This lets us get them free form
    //  without too much overhead, but still know that they were in the
    //  wrong order.
    enum Strings
    {
        VersionString
        , EncodingString
        , StandaloneString
        , UnknownString

        , StringCount
    };
    int flags[StringCount] = { -1, -1, -1, -1 };

    //  Also set up a list of buffers in the right order so that we know
    //  where to put stuff.
    XMLBuffer* buffers[StringCount] ;
    buffers[0] = &bbVersion.getBuffer();
    buffers[1] = &bbEncoding.getBuffer();
    buffers[2] = &bbStand.getBuffer();
    buffers[3] = &bbDummy.getBuffer();

    int curCount = 0;
    Strings curString;
    XMLBuffer& nameBuf = bbName.getBuffer();
    while (true)
    {
        // Skip any spaces
        const unsigned int spaceCount = fReaderMgr.skipPastSpaces();

        // If we are looking at a question mark, then break out
        if (fReaderMgr.lookingAtChar(chQuestion))
            break;

        // If this is not the first string, then we require the spaces
        if (!spaceCount && curCount)
            emitError(XMLErrs::ExpectedWhitespace);

        //  Get characters up to the next whitespace or equal's sign.
        if (!scanUpToWSOr(nameBuf, chEqual))
            emitError(XMLErrs::ExpectedDeclString);

        // See if it matches any of our expected strings
        if (XMLString::equals(nameBuf.getRawBuffer(), XMLUni::fgVersionString))
            curString = VersionString;
        else if (XMLString::equals(nameBuf.getRawBuffer(), XMLUni::fgEncodingString))
            curString = EncodingString;
        else if (XMLString::equals(nameBuf.getRawBuffer(), XMLUni::fgStandaloneString))
            curString = StandaloneString;
        else
            curString = UnknownString;

        //  If its an unknown string, then give that error. Else check to
        //  see if this one has been done already and give that error.
        if (curString == UnknownString)
            emitError(XMLErrs::ExpectedDeclString, nameBuf.getRawBuffer());
        else if (flags[curString] != -1)
            emitError(XMLErrs::DeclStringRep, nameBuf.getRawBuffer());
        else if (flags[curString] == -1)
            flags[curString] = ++curCount;

        //  Scan for an equal's sign. If we don't find it, issue an error
        //  but keep trying to go on.
        if (!scanEq())
            emitError(XMLErrs::ExpectedEqSign);

        //  Get a quote string into the buffer for the string that we are
        //  currently working on.
        if (!getQuotedString(*buffers[curString]))
        {
            emitError(XMLErrs::ExpectedQuotedString);
            fReaderMgr.skipPastChar(chCloseAngle);
            return;
        }

        // And validate the value according which one it was
        const XMLCh* rawValue = buffers[curString]->getRawBuffer();
        if (curString == VersionString)
        {
            if (XMLString::equals(rawValue, XMLUni::fgVersion1_1)) {
                if (type == Decl_XML) {
                    fReaderMgr.setXMLVersion(XMLReader::XMLV1_1);
                }
            }
            else if (XMLString::equals(rawValue, XMLUni::fgVersion1_0)) {
                if (type == Decl_XML) {
                    fReaderMgr.setXMLVersion(XMLReader::XMLV1_0);
                }
            }
            else
                emitError(XMLErrs::UnsupportedXMLVersion, rawValue);
        }
         else if (curString == EncodingString)
        {
            if (!XMLString::isValidEncName(rawValue))
                emitError(XMLErrs::BadXMLEncoding, rawValue);
        }
         else if (curString == StandaloneString)
        {
            if (XMLString::equals(rawValue, XMLUni::fgYesString))
                fStandalone = true;
            else if (XMLString::equals(rawValue, XMLUni::fgNoString))
                fStandalone = false;
            else
            {
                emitError(XMLErrs::BadStandalone);
                if (!XMLString::compareIString(rawValue, XMLUni::fgYesString))
                    fStandalone = true;
                else if (!XMLString::compareIString(rawValue, XMLUni::fgNoString))
                    fStandalone = false;
            }
        }
    }

    //  Make sure that the strings present are in order. We don't care about
    //  which ones are present at this point, just that any there are in the
    //  right order.
    int curTop = 0;
    for (int index = VersionString; index < StandaloneString; index++)
    {
        if (flags[index] != -1)
        {
            if (flags[index] !=  curTop + 1)
            {
                emitError(XMLErrs::DeclStringsInWrongOrder);
                break;
            }
            curTop = flags[index];
        }
    }

    //  If its an XML decl, the version must be present.
    //  If its a Text decl, then encoding must be present AND standalone must not be present.
    if ((type == Decl_XML) && (flags[VersionString] == -1))
        emitError(XMLErrs::XMLVersionRequired);
    else if (type == Decl_Text) {
        if (flags[StandaloneString] != -1)
            emitError(XMLErrs::StandaloneNotLegal);
        if (flags[EncodingString] == -1)
            emitError(XMLErrs::EncodingRequired);
    }

    if (!fReaderMgr.skippedChar(chQuestion))
    {
        emitError(XMLErrs::UnterminatedXMLDecl);
        fReaderMgr.skipPastChar(chCloseAngle);
    }
     else if (!fReaderMgr.skippedChar(chCloseAngle))
    {
        emitError(XMLErrs::UnterminatedXMLDecl);
        fReaderMgr.skipPastChar(chCloseAngle);
    }

    //  Do this before we possibly update the reader with the
    //  actual encoding string. Otherwise, we will pass the wrong thing
    //  for the last parameter!
    const XMLCh* actualEnc = fReaderMgr.getCurrentEncodingStr();

    //  Ok, we've now seen the real encoding string, if there was one, so
    //  lets call back on the current reader and tell it what the real
    //  encoding string was. If it fails, that's because it represents some
    //  sort of contradiction with the autosensed format, and it keeps the
    //  original encoding.
    //
    //  NOTE: This can fail for a number of reasons, such as a bogus encoding
    //  name or because its in flagrant contradiction of the auto-sensed
    //  format.
    if (flags[EncodingString] != -1)
    {
        if (!fReaderMgr.getCurrentReader()->setEncoding(bbEncoding.getRawBuffer()))
            emitError(XMLErrs::ContradictoryEncoding, bbEncoding.getRawBuffer());
        else
            actualEnc = bbEncoding.getRawBuffer();
    }

    //  If we have a document handler then call the XML Decl callback.
    if (type == Decl_XML)
    {
        if (fDocHandler)
            fDocHandler->XMLDecl
            (
                bbVersion.getRawBuffer()
                , bbEncoding.getRawBuffer()
                , bbStand.getRawBuffer()
                , actualEnc
            );
    }
    else if (type == Decl_Text)
    {
        if (fDocTypeHandler)
            fDocTypeHandler->TextDecl
            (
                bbVersion.getRawBuffer()
                , bbEncoding.getRawBuffer()
            );
    }
}

const XMLCh* XMLScanner::getURIText(const   unsigned int    uriId) const
{
    if (fURIStringPool->exists(uriId)) {
        // Look up the URI in the string pool and return its id
        const XMLCh* value = fURIStringPool->getValueForId(uriId);
        if (!value)
            return XMLUni::fgZeroLenString;

        return value;
    }
    else
        return XMLUni::fgZeroLenString;
}

bool XMLScanner::getURIText(  const   unsigned int    uriId
                      ,       XMLBuffer&      uriBufToFill) const
{
    if (fURIStringPool->exists(uriId)) {
        // Look up the URI in the string pool and return its id
        const XMLCh* value = fURIStringPool->getValueForId(uriId);
        if (!value)
            return false;

        uriBufToFill.set(value);
        return true;
    }
    else
        return false;
}

bool XMLScanner::checkXMLDecl(bool startWithAngle) {

    // [23] XMLDecl     ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
    // [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
    //
    // [3]  S           ::= (#x20 | #x9 | #xD | #xA)+
    if (startWithAngle) {
        if (fReaderMgr.peekString(XMLUni::fgXMLDeclString)) {
            if (fReaderMgr.skippedString(XMLUni::fgXMLDeclStringSpace)
               || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringHTab)
               || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringLF)
               || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringCR))
            {
                return true;
            }
            else if (fReaderMgr.skippedString(XMLUni::fgXMLDeclStringSpaceU)
               || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringHTabU)
               || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringLFU)
               || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringCRU))
            {
                //  Just in case, check for upper case. If found, issue
                //  an error, but keep going.
                emitError(XMLErrs::XMLDeclMustBeLowerCase);
                return true;
            }
        }
    }
    else {
        if (fReaderMgr.peekString(XMLUni::fgXMLString)) {
            if (fReaderMgr.skippedString(XMLUni::fgXMLStringSpace)
               || fReaderMgr.skippedString(XMLUni::fgXMLStringHTab)
               || fReaderMgr.skippedString(XMLUni::fgXMLStringLF)
               || fReaderMgr.skippedString(XMLUni::fgXMLStringCR))
            {
                return true;
            }
            else if (fReaderMgr.skippedString(XMLUni::fgXMLStringSpaceU)
               || fReaderMgr.skippedString(XMLUni::fgXMLStringHTabU)
               || fReaderMgr.skippedString(XMLUni::fgXMLStringLFU)
               || fReaderMgr.skippedString(XMLUni::fgXMLStringCRU))
            {
                //  Just in case, check for upper case. If found, issue
                //  an error, but keep going.
                emitError(XMLErrs::XMLDeclMustBeLowerCase);
                return true;
            }
        }
    }

    return false;
}


// ---------------------------------------------------------------------------
//  XMLScanner: Grammar preparsing
// ---------------------------------------------------------------------------
Grammar* XMLScanner::loadGrammar(const   XMLCh* const systemId
                                 , const short        grammarType
                                 , const bool         toCache)
{
    InputSource* srcToUse = 0;

    if (fEntityHandler){
        srcToUse = fEntityHandler->resolveEntity(XMLUni::fgZeroLenString, systemId);
    }

    //  First we try to parse it as a URL. If that fails, we assume its
    //  a file and try it that way.
    if (!srcToUse) {

        try
        {
            //  Create a temporary URL. Since this is the primary document,
            //  it has to be fully qualified. If not, then assume we are just
            //  mistaking a file for a URL.
            XMLURL tmpURL(systemId);
            if (tmpURL.isRelative())
            {
                if (!fStandardUriConformant)
                    srcToUse = new LocalFileInputSource(systemId);
                else {
                    // since this is the top of the try/catch, cannot call ThrowXML
                    // emit the error directly
                    MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_NoProtocolPresent);
                    fInException = true;
                    emitError
                    (
                        XMLErrs::XMLException_Fatal
                        , e.getType()
                        , e.getMessage()
                    );
                    return 0;
                }
            }
            else
            {
                if (fStandardUriConformant && tmpURL.hasInvalidChar()) {
                    MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL);
                    fInException = true;
                    emitError
                    (
                        XMLErrs::XMLException_Fatal
                        , e.getType()
                        , e.getMessage()
                    );
                    return 0;
                }
                srcToUse = new URLInputSource(tmpURL);
            }
        }
        catch(const MalformedURLException& e)
        {
            if (!fStandardUriConformant)
                srcToUse = new LocalFileInputSource(systemId);
            else {
                // since this is the top of the try/catch, cannot call ThrowXML
                // emit the error directly
                // lazy bypass ... since all MalformedURLException are fatal, no need to check the type
                fInException = true;
                emitError
                (
                    XMLErrs::XMLException_Fatal
                    , e.getType()
                    , e.getMessage()
                );
                return 0;
            }
        }
        catch(const XMLException& excToCatch)
        {
            //  For any other XMLException,
            //  emit the error and catch any user exception thrown from here.
            fInException = true;
            if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
                emitError
                (
                    XMLErrs::XMLException_Warning
                    , excToCatch.getType()
                    , excToCatch.getMessage()
                );
            else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
                emitError
                (
                    XMLErrs::XMLException_Fatal
                    , excToCatch.getType()
                    , excToCatch.getMessage()
                );
            else
                emitError
                (
                    XMLErrs::XMLException_Error
                    , excToCatch.getType()
                    , excToCatch.getMessage()
                );
                return 0;
        }
        catch(...)
        {
            // Just rethrow this, since its not our problem
            throw;
        }
    }

    Janitor<InputSource> janSrc(srcToUse);
    return loadGrammar(*srcToUse, grammarType, toCache);
}

Grammar* XMLScanner::loadGrammar(const   char* const systemId
                                 , const short       grammarType
                                 , const bool        toCache)
{
    // We just delegate this to the XMLCh version after transcoding
    XMLCh* tmpBuf = XMLString::transcode(systemId);
    ArrayJanitor<XMLCh> janBuf(tmpBuf);
    return loadGrammar(tmpBuf, grammarType, toCache);
}


// ---------------------------------------------------------------------------
//  XMLScanner: Setter methods
// ---------------------------------------------------------------------------
void XMLScanner::setURIStringPool(XMLStringPool* const stringPool)
{
    fURIStringPool = stringPool;
    fEmptyNamespaceId   = fURIStringPool->addOrFind(XMLUni::fgZeroLenString);
    fUnknownNamespaceId = fURIStringPool->addOrFind(XMLUni::fgUnknownURIName);
    fXMLNamespaceId     = fURIStringPool->addOrFind(XMLUni::fgXMLURIName);
    fXMLNSNamespaceId   = fURIStringPool->addOrFind(XMLUni::fgXMLNSURIName);
}

// ---------------------------------------------------------------------------
//  XMLScanner: Private helper methods
// ---------------------------------------------------------------------------

//  This method is called after the content scan to insure that all the
//  ID/IDREF attributes match up (i.e. that all IDREFs refer to IDs.) This is
//  an XML 1.0 rule, so we can do here in the core.
void XMLScanner::checkIDRefs()
{
    //  Iterate the id ref list. If we find any entries here which are used
    //  but not declared, then that's an error.
    RefHashTableOfEnumerator<XMLRefInfo> refEnum(fIDRefList);
    while (refEnum.hasMoreElements())
    {
        // Get a ref to the current element
        const XMLRefInfo& curRef = refEnum.nextElement();

        // If its used but not declared, then its an error
        if (!curRef.getDeclared() && curRef.getUsed() && fValidate)
            fValidator->emitError(XMLValid::IDNotDeclared, curRef.getRefName());
    }
}


//  This just does a simple check that the passed progressive scan token is
//  legal for this scanner.
bool XMLScanner::isLegalToken(const XMLPScanToken& toCheck)
{
    return ((fScannerId == toCheck.fScannerId)
    &&      (fSequenceId == toCheck.fSequenceId));
}


//  This method will handle figuring out what the next top level token is
//  in the input stream. It will return an enumerated value that indicates
//  what it believes the next XML level token must be. It will eat as many
//  chars are required to figure out what is next.
XMLScanner::XMLTokens XMLScanner::senseNextToken(unsigned int& orgReader)
{
    //  Get the next character and use it to guesstimate what the next token
    //  is going to be. We turn on end of entity exceptions when we do this
    //  in order to catch the scenario where the current entity ended at
    //  the > of some markup.
    XMLCh nextCh;
    {
        ThrowEOEJanitor janMgr(&fReaderMgr, true);
        nextCh = fReaderMgr.peekNextChar();
    }

    //  Check for special chars. Start with the most
    //  obvious end of file, which should be legal here at top level.
    if (!nextCh)
        return Token_EOF;


    //  If it's not a '<' we must be in content.
    //
    //  This includes entity references '&' of some sort. These must
    //  be character data because that's the only place a reference can
    //  occur in content.
    if (nextCh != chOpenAngle)
        return Token_CharData;

    //  Ok it had to have been a '<' character. So get it out of the reader
    //  and store the reader number where we saw it, passing it back to the
    //  caller.
    fReaderMgr.getNextChar();
    orgReader = fReaderMgr.getCurrentReaderNum();

    //  Ok, so lets go through the things that it could be at this point which
    //  are all some form of markup.
    nextCh = fReaderMgr.peekNextChar();

    if (nextCh == chForwardSlash)
    {
        fReaderMgr.getNextChar();
        return Token_EndTag;
    }
    else if (nextCh == chBang)
    {
        static const XMLCh gCDATAStr[] =
        {
                chBang, chOpenSquare, chLatin_C, chLatin_D, chLatin_A
            ,   chLatin_T, chLatin_A, chNull
        };

        static const XMLCh gCommentString[] =
        {
            chBang, chDash, chDash, chNull
        };

        if (fReaderMgr.skippedString(gCDATAStr))
            return Token_CData;

        if (fReaderMgr.skippedString(gCommentString))
            return Token_Comment;

        emitError(XMLErrs::ExpectedCommentOrCDATA);
        return Token_Unknown;
    }
    else if (nextCh == chQuestion)
    {
        // It must be a PI
        fReaderMgr.getNextChar();
        return Token_PI;
    }

    //  Assume its an element name, so return with a start tag token. If it
    //  turns out not to be, then it will fail when it cannot get a valid tag.
    return Token_StartTag;
}

// ---------------------------------------------------------------------------
//  XMLScanner: Private parsing methods
// ---------------------------------------------------------------------------

//  This guy just scans out a single or double quoted string of characters.
//  It does not pass any judgement on the contents and assumes that it is
//  illegal to have another quote of the same kind inside the string's
//  contents.
//
//  NOTE: This is for simple stuff like the strings in the XMLDecl which
//  cannot have any entities inside them. So this guy does not handle any
//  end of entity stuff.
bool XMLScanner::getQuotedString(XMLBuffer& toFill)
{
    // Reset the target buffer
    toFill.reset();

    // Get the next char which must be a single or double quote
    XMLCh quoteCh;
    if (!fReaderMgr.skipIfQuote(quoteCh))
        return false;

    while (true)
    {
        // Get another char
        const XMLCh nextCh = fReaderMgr.getNextChar();

        // See if it matches the starting quote char
        if (nextCh == quoteCh)
            break;

        //  We should never get either an end of file null char here. If we
        //  do, just fail. It will be handled more gracefully in the higher
        //  level code that called us.
        if (!nextCh)
            return false;

        // Else add it to the buffer
        toFill.append(nextCh);
    }
    return true;
}


//  This method scans a character reference and returns the character that
//  was refered to. It assumes that we've already scanned the &# characters
//  that prefix the numeric code.
bool XMLScanner::scanCharRef(XMLCh& toFill, XMLCh& second)
{
    bool gotOne = false;
    unsigned int value = 0;

    //  Set the radix. Its supposed to be a lower case x if hex. But, in
    //  order to recover well, we check for an upper and put out an error
    //  for that.
    unsigned int radix = 10;
    if (fReaderMgr.skippedChar(chLatin_x))
    {
        radix = 16;
    }
    else if (fReaderMgr.skippedChar(chLatin_X))
    {
        emitError(XMLErrs::HexRadixMustBeLowerCase);
        radix = 16;
    }

    while (true)
    {
        const XMLCh nextCh = fReaderMgr.peekNextChar();

        // Watch for EOF
        if (!nextCh)
            ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);

        // Break out on the terminating semicolon
        if (nextCh == chSemiColon)
        {
            fReaderMgr.getNextChar();
            break;
        }

        //  Convert this char to a binary value, or bail out if its not
        //  one.
        unsigned int nextVal;
        if ((nextCh >= chDigit_0) && (nextCh <= chDigit_9))
            nextVal = (unsigned int)(nextCh - chDigit_0);
        else if ((nextCh >= chLatin_A) && (nextCh <= chLatin_F))
            nextVal= (unsigned int)(10 + (nextCh - chLatin_A));
        else if ((nextCh >= chLatin_a) && (nextCh <= chLatin_f))
            nextVal = (unsigned int)(10 + (nextCh - chLatin_a));
        else
        {
            // Return a zero
            toFill = 0;

            //  If we got at least a sigit, then do an unterminated ref error.
            //  Else, do an expected a numerical ref thing.
            if (gotOne)
                emitError(XMLErrs::UnterminatedCharRef);
            else
                emitError(XMLErrs::ExpectedNumericalCharRef);

            // Return failure
            return false;
        }

        //  Make sure its valid for the radix. If not, then just eat the
        //  digit and go on after issueing an error. Else, update the
        //  running value with this new digit.
        if (nextVal >= radix)
        {
            XMLCh tmpStr[2];
            tmpStr[0] = nextCh;
            tmpStr[1] = chNull;
            emitError(XMLErrs::BadDigitForRadix, tmpStr);
        }
        else
        {
            value = (value * radix) + nextVal;
        }

        // Indicate that we got at least one good digit
        gotOne = true;

        // And eat the last char
        fReaderMgr.getNextChar();
    }

    // Return the char (or chars)
    // And check if the character expanded is valid or not
    if (value >= 0x10000 && value <= 0x10FFFF)
    {
        value -= 0x10000;
        toFill = XMLCh((value >> 10) + 0xD800);
        second = XMLCh((value & 0x3FF) + 0xDC00);
    }
    else if (value <= 0xFFFD)
    {
        toFill = XMLCh(value);
        second = 0;
        if (!fReaderMgr.getCurrentReader()->isXMLChar(toFill) && !fReaderMgr.getCurrentReader()->isControlChar(toFill)) {
            // Character reference was not in the valid range
            emitError(XMLErrs::InvalidCharacterRef);
            return false;
        }
    }
    else {
        // Character reference was not in the valid range
        emitError(XMLErrs::InvalidCharacterRef);
        return false;
    }

    return true;
}


//  We get here after the '<!--' part of the comment. We scan past the
//  terminating '-->' It will calls the appropriate handler with the comment
//  text, if one is provided. A comment can be in either the document or
//  the DTD, so the fInDocument flag is used to know which handler to send
//  it to.
void XMLScanner::scanComment()
{
    enum States
    {
        InText
        , OneDash
        , TwoDashes
    };

    // Get a buffer for this
    XMLBufBid bbComment(&fBufMgr);

    //  Get the comment text into a temp buffer. Be sure to use temp buffer
    //  two here, since its to be used for stuff that is potentially longer
    //  than just a name.
    States curState = InText;
    bool gotLeadingSurrogate = false;
    while (true)
    {
        // Get the next character
        const XMLCh nextCh = fReaderMgr.getNextChar();

        //  Watch for an end of file
        if (!nextCh)
        {
            emitError(XMLErrs::UnterminatedComment);
            ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
        }

        // Check for correct surrogate pairs
        if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
        {
            if (gotLeadingSurrogate)
                emitError(XMLErrs::Expected2ndSurrogateChar);
            else
                gotLeadingSurrogate = true;
        }
        else
        {
            if (gotLeadingSurrogate)
            {
                if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
                    emitError(XMLErrs::Expected2ndSurrogateChar);
            }
            // Its got to at least be a valid XML character
            else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) {

                XMLCh tmpBuf[9];
                XMLString::binToText
                (
                    nextCh
                    , tmpBuf
                    , 8
                    , 16
                );
                emitError(XMLErrs::InvalidCharacter, tmpBuf);
            }

            gotLeadingSurrogate = false;
        }

        if (curState == InText)
        {
            // If its a dash, go to OneDash state. Otherwise take as text
            if (nextCh == chDash)
                curState = OneDash;
            else
                bbComment.append(nextCh);