XMLScanner.cpp

//  NOTE: This is for simple stuff like the strings in the XMLDecl which
//  cannot have any entities inside them. So this guy does not handle any
//  end of entity stuff.
bool XMLScanner::getQuotedString(XMLBuffer& toFill)
{
    // Reset the target buffer
    toFill.reset();

    // Get the next char which must be a single or double quote
    XMLCh quoteCh;
    if (!fReaderMgr.skipIfQuote(quoteCh))
        return false;

    while (true)
    {
        // Get another char
        const XMLCh nextCh = fReaderMgr.getNextChar();

        // See if it matches the starting quote char
        if (nextCh == quoteCh)
            break;

        //  We should never get either an end of file null char here. If we
        //  do, just fail. It will be handled more gracefully in the higher
        //  level code that called us.
        if (!nextCh)
            return false;

        // Else add it to the buffer
        toFill.append(nextCh);
    }
    return true;
}


//  This method scans a character reference and returns the character that
//  was refered to. It assumes that we've already scanned the &# characters
//  that prefix the numeric code.
bool XMLScanner::scanCharRef(XMLCh& toFill, XMLCh& second)
{
    bool gotOne = false;
    unsigned int value = 0;

    //  Set the radix. Its supposed to be a lower case x if hex. But, in
    //  order to recover well, we check for an upper and put out an error
    //  for that.
    unsigned int radix = 10;
    if (fReaderMgr.skippedChar(chLatin_x))
    {
        radix = 16;
    }
    else if (fReaderMgr.skippedChar(chLatin_X))
    {
        emitError(XMLErrs::HexRadixMustBeLowerCase);
        radix = 16;
    }

    while (true)
    {
        const XMLCh nextCh = fReaderMgr.peekNextChar();

        // Watch for EOF
        if (!nextCh)
            ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);

        // Break out on the terminating semicolon
        if (nextCh == chSemiColon)
        {
            fReaderMgr.getNextChar();
            break;
        }

        //  Convert this char to a binary value, or bail out if its not
        //  one.
        unsigned int nextVal;
        if ((nextCh >= chDigit_0) && (nextCh <= chDigit_9))
            nextVal = (unsigned int)(nextCh - chDigit_0);
        else if ((nextCh >= chLatin_A) && (nextCh <= chLatin_F))
            nextVal= (unsigned int)(10 + (nextCh - chLatin_A));
        else if ((nextCh >= chLatin_a) && (nextCh <= chLatin_f))
            nextVal = (unsigned int)(10 + (nextCh - chLatin_a));
        else
        {
            // Return a zero
            toFill = 0;

            //  If we got at least a sigit, then do an unterminated ref error.
            //  Else, do an expected a numerical ref thing.
            if (gotOne)
                emitError(XMLErrs::UnterminatedCharRef);
            else
                emitError(XMLErrs::ExpectedNumericalCharRef);

            // Return failure
            return false;
        }

        //  Make sure its valid for the radix. If not, then just eat the
        //  digit and go on after issueing an error. Else, update the
        //  running value with this new digit.
        if (nextVal >= radix)
        {
            XMLCh tmpStr[2];
            tmpStr[0] = nextCh;
            tmpStr[1] = chNull;
            emitError(XMLErrs::BadDigitForRadix, tmpStr);
        }
        else
        {
            value = (value * radix) + nextVal;
            // Guard against overflow.
            if (value > 0x10FFFF) {
                // Character reference was not in the valid range
                emitError(XMLErrs::InvalidCharacterRef);
                return false;
            }
        }

        // Indicate that we got at least one good digit
        gotOne = true;

        // And eat the last char
        fReaderMgr.getNextChar();
    }

    // Return the char (or chars)
    // And check if the character expanded is valid or not
    if (value >= 0x10000 && value <= 0x10FFFF)
    {
        value -= 0x10000;
        toFill = XMLCh((value >> 10) + 0xD800);
        second = XMLCh((value & 0x3FF) + 0xDC00);
    }
    else if (value <= 0xFFFD)
    {
        toFill = XMLCh(value);
        second = 0;
        if (!fReaderMgr.getCurrentReader()->isXMLChar(toFill) && !fReaderMgr.getCurrentReader()->isControlChar(toFill)) {
            // Character reference was not in the valid range
            emitError(XMLErrs::InvalidCharacterRef);
            return false;
        }
    }
    else {
        // Character reference was not in the valid range
        emitError(XMLErrs::InvalidCharacterRef);
        return false;
    }

    return true;
}


//  We get here after the '<!--' part of the comment. We scan past the
//  terminating '-->' It will calls the appropriate handler with the comment
//  text, if one is provided. A comment can be in either the document or
//  the DTD, so the fInDocument flag is used to know which handler to send
//  it to.
void XMLScanner::scanComment()
{

    enum States
    {
        InText
        , OneDash
        , TwoDashes
    };

    // Get a buffer for this
    XMLBufBid bbComment(&fBufMgr);

    //  Get the comment text into a temp buffer. Be sure to use temp buffer
    //  two here, since its to be used for stuff that is potentially longer
    //  than just a name.
    States curState = InText;
    bool gotLeadingSurrogate = false;
    while (true)
    {
        // Get the next character
        const XMLCh nextCh = fReaderMgr.getNextChar();

        //  Watch for an end of file
        if (!nextCh)
        {
            emitError(XMLErrs::UnterminatedComment);
            ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
        }

        // Check for correct surrogate pairs
        if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
        {
            if (gotLeadingSurrogate)
                emitError(XMLErrs::Expected2ndSurrogateChar);
            else
                gotLeadingSurrogate = true;
        }
        else
        {
            if (gotLeadingSurrogate)
            {
                if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
                    emitError(XMLErrs::Expected2ndSurrogateChar);
            }
            // Its got to at least be a valid XML character
            else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) {

                XMLCh tmpBuf[9];
                XMLString::binToText
                (
                    nextCh
                    , tmpBuf
                    , 8
                    , 16
                    , fMemoryManager
                );
                emitError(XMLErrs::InvalidCharacter, tmpBuf);
            }

            gotLeadingSurrogate = false;
        }

        if (curState == InText)
        {
            // If its a dash, go to OneDash state. Otherwise take as text
            if (nextCh == chDash)
                curState = OneDash;
            else
                bbComment.append(nextCh);
        }
        else if (curState == OneDash)
        {
            //  If its another dash, then we change to the two dashes states.
            //  Otherwise, we have to put in the deficit dash and the new
            //  character and go back to InText.
            if (nextCh == chDash)
            {
                curState = TwoDashes;
            }
            else
            {
                bbComment.append(chDash);
                bbComment.append(nextCh);
                curState = InText;
            }
        }
        else if (curState == TwoDashes)
        {
            // The next character must be the closing bracket
            if (nextCh != chCloseAngle)
            {
                emitError(XMLErrs::IllegalSequenceInComment);
                fReaderMgr.skipPastChar(chCloseAngle);
                return;
            }
            break;
        }
    }

    // If we have an available handler, call back with the comment.
    if (fDocHandler)
    {
        fDocHandler->docComment
        (
            bbComment.getRawBuffer()
        );
    }

    //mark comment is seen within the current element
    if (! fElemStack.isEmpty())
        fElemStack.setCommentOrPISeen();

}


//  Most equal signs can have white space around them, so this little guy
//  just makes the calling code cleaner by eating whitespace.
bool XMLScanner::scanEq(bool inDecl)
{
    fReaderMgr.skipPastSpaces(inDecl);
    if (fReaderMgr.skippedChar(chEqual))
    {
        fReaderMgr.skipPastSpaces(inDecl);
        return true;
    }
    return false;
}


unsigned int
XMLScanner::scanUpToWSOr(XMLBuffer& toFill, const XMLCh chEndChar)
{
    fReaderMgr.getUpToCharOrWS(toFill, chEndChar);
    return toFill.getLen();
}

unsigned int *XMLScanner::getNewUIntPtr()
{
    // this method hands back a new pointer initialized to 0
    unsigned int *retVal;
    if(fUIntPoolCol < 64)
    {
        retVal = fUIntPool[fUIntPoolRow]+fUIntPoolCol;
        fUIntPoolCol++;
        return retVal;
    }
    // time to grow the pool...
    if(fUIntPoolRow+1 == fUIntPoolRowTotal)
    {
        // and time to add some space for new rows:
        fUIntPoolRowTotal <<= 1;
        unsigned int **newArray = (unsigned int **)fMemoryManager->allocate(sizeof(unsigned int *) * fUIntPoolRowTotal );
        memcpy(newArray, fUIntPool, (fUIntPoolRow+1) * sizeof(unsigned int *));
        fMemoryManager->deallocate(fUIntPool);
        fUIntPool = newArray;
        // need to 0 out new elements we won't need:
        for (unsigned int i=fUIntPoolRow+2; i<fUIntPoolRowTotal; i++)
            fUIntPool[i] = 0;
    }
    // now to add a new row; we just made sure we have space
    fUIntPoolRow++;
    fUIntPool[fUIntPoolRow] = (unsigned int *)fMemoryManager->allocate(sizeof(unsigned int) << 6);
    memset(fUIntPool[fUIntPoolRow], 0, sizeof(unsigned int) << 6);
    // point to next element
    fUIntPoolCol = 1; 
    return fUIntPool[fUIntPoolRow];
}

void XMLScanner::resetUIntPool()
{
    // to reuse the unsigned int pool--and the hashtables that use it--
    // simply reinitialize everything to 0's
    for(unsigned int i = 0; i<= fUIntPoolRow; i++)
        memset(fUIntPool[i], 0, sizeof(unsigned int) << 6);
}

void XMLScanner::recreateUIntPool()
{
    // this allows a bloated unsigned int pool to be dispensed with

    // first, delete old fUIntPool
    for (unsigned int i=0; i<=fUIntPoolRow; i++)
    {
        fMemoryManager->deallocate(fUIntPool[i]);
    }
    fMemoryManager->deallocate(fUIntPool);

    fUIntPoolRow = fUIntPoolCol = 0;
    fUIntPoolRowTotal = 2;
    fUIntPool = (unsigned int **)fMemoryManager->allocate(sizeof(unsigned int *) * fUIntPoolRowTotal);
    fUIntPool[0] = (unsigned int *)fMemoryManager->allocate(sizeof(unsigned int) << 6);
    memset(fUIntPool[fUIntPoolRow], 0, sizeof(unsigned int) << 6);
    fUIntPool[1] = 0;
}

XERCES_CPP_NAMESPACE_END