Skip to content
Snippets Groups Projects
XMLReader.cpp 59.4 KiB
Newer Older

            }
            else
            {
                skippedSomething = (orgLine != fCurLine) || (orgCol != fCurCol);
                return true;
            }
        }

        //  We've eaten up the current buffer, so lets try to reload it. If
        //  we don't get anything new, then break out. If we do, then we go
        //  back to the top to keep getting spaces.
        if (!refreshCharBuffer())
            break;
    }

    // We never hit any non-space and ate up the whole reader
    skippedSomething = (orgLine != fCurLine) || (orgCol != fCurCol);
    return false;
}
PeiYong Zhang's avatar
PeiYong Zhang committed

//
//  Consumes the next character only if it is exactly 'toSkip'. Returns
//  true when the character was eaten, false when the next character is
//  something else or when no more characters could be loaded.
//
bool XMLReader::skippedChar(const XMLCh toSkip)
{
    // Out of buffered chars? Ask for more; if nothing comes, no match.
    if ((fCharIndex == fCharsAvail) && !refreshCharBuffer())
        return false;

    // Not the char the caller is after, so leave it where it is
    if (fCharBuf[fCharIndex] != toSkip)
        return false;

    // It matched, so step over it and account for it in the column
    ++fCharIndex;
    ++fCurCol;
    return true;
}


bool XMLReader::skippedSpace()
{
    //
    //  If the buffer is empty, then try to reload it. If we still get
    //  nothing, then return false.
    //
    if (fCharIndex == fCharsAvail)
    {
        if (!refreshCharBuffer())
            return false;
    }

    //
    //  See if the current char is a whitespace. If so, then we need to eat
    //  it and return true.
    //
    const XMLCh curCh = fCharBuf[fCharIndex];
Tinny Ng's avatar
Tinny Ng committed
    if (isWhitespace(curCh))
PeiYong Zhang's avatar
PeiYong Zhang committed
    {
        // Eat the character
        fCharIndex++;

        //
        //  'curCh' is a whitespace(x20|x9|xD|xA), so we only can have
        //  end-of-line combinations with a leading chCR(xD) or chLF(xA)
        //
        //  100000 x20
        //  001001 x9
        //  001010 chLF
        //  001101 chCR
        //  -----------
        //  000110 == (chCR|chLF) & ~(0x9|0x20)
        //
        //  if the result of the logical-& operation is
        //  true  : 'curCh' must be xA  or xD
        //  false : 'curCh' must be x20 or x9
        //
        if ( ( curCh & (chCR|chLF) & ~(0x9|0x20) ) == 0 )
        {
            fCurCol++;
        } else
        {
            handleEOL((XMLCh&)curCh, false);
        }
PeiYong Zhang's avatar
PeiYong Zhang committed

        return true;
    }
    return false;
}


//
//  Consumes the upcoming characters if they match 'toSkip' exactly.
//  Returns true (and advances fCharIndex and fCurCol by the string length)
//  on a match, false otherwise.
//
bool XMLReader::skippedString(const XMLCh* const toSkip)
{
    // Get the length of the string to skip
    const unsigned int srcLen = XMLString::stringLen(toSkip);
    unsigned int charsLeft = charsLeftInBuffer();

    if (srcLen <= fCharsAvail) {
        //
        //  See if the current reader has enough chars to test against this
        //  string. If not, then ask it to reload its buffer. If that does not
        //  get us enough, then it cannot match.
        //
        //  NOTE: This works because strings never have to cross a reader! And
        //  a string to skip will never have a new line in it, so we will never
        //  miss adjusting the current line.
        //
        while (charsLeft < srcLen)
        {
            refreshCharBuffer();
            const unsigned int t = charsLeftInBuffer();
            if (t == charsLeft)   // if the refreshCharBuf() did not add anything new
                return false;     //   give up and return.
            charsLeft = t;
        }

        //
        //  Ok, now we know that the current reader has at least srcLen chars
        //  available from fCharIndex on, so we can do a quick and dirty
        //  comparison straight to its buffer. Nothing has been consumed yet,
        //  so there is no requirement to unget if it fails.
        //
        if (memcmp(&fCharBuf[fCharIndex], toSkip, srcLen*sizeof(XMLCh)))
            return false;

        //
        //  And get the character buffer index back right by just adding the
        //  source len to it.
        //
        fCharIndex += srcLen;
    }
    else {
        //
        //  The string is longer than what the buffer holds, so it has to be
        //  compared piece by piece, reloading the buffer between pieces.
        //  Note that a mismatch in a later piece returns false with the
        //  earlier pieces already consumed.
        //
        if (charsLeft == 0) {
            refreshCharBuffer();
            charsLeft = charsLeftInBuffer();
            if (charsLeft == 0)
                return false; // error situation
        }

        // First piece: whatever is left in the current buffer
        if (memcmp(&fCharBuf[fCharIndex], toSkip, charsLeft*sizeof(XMLCh)))
            return false;

        fCharIndex += charsLeft;

        unsigned int offset = charsLeft;
        unsigned int remainingLen = srcLen - charsLeft;

        while (remainingLen > 0) {
            refreshCharBuffer();
            charsLeft = charsLeftInBuffer();
            if (charsLeft == 0)
                return false; // error situation

            // Never compare past the end of the string to skip
            if (charsLeft > remainingLen)
                charsLeft = remainingLen;
            if (memcmp(&fCharBuf[fCharIndex], toSkip+offset, charsLeft*sizeof(XMLCh)))
                return false;
            offset += charsLeft;
            remainingLen -= charsLeft;
            fCharIndex += charsLeft;
        }
    }

    // Add the source length to the current column to get it back right
    fCurCol += srcLen;

    return true;
}

//
//  This is just to peek whether the upcoming characters match the string
//  toPeek. It is similar to skippedString, except that this method does
//  not itself advance fCharIndex or fCurCol. Returns true on a match,
//  false on a mismatch or when not enough characters could be buffered.
//
bool XMLReader::peekString(const XMLCh* const toPeek)
{
    // Get the length of the string to peek at
    const unsigned int srcLen = XMLString::stringLen(toPeek);

    //
    //  See if the current reader has enough chars to test against this
    //  string. If not, then ask it to reload its buffer. If that does not
    //  get us enough, then it cannot match.
    //
    //  NOTE: This works because strings never have to cross a reader! And
    //  a string to peek will never have a new line in it, so we will never
    //  miss adjusting the current line.
    //
    unsigned int charsLeft = charsLeftInBuffer();
    while (charsLeft < srcLen)
    {
        refreshCharBuffer();
        const unsigned int t = charsLeftInBuffer();
        if (t == charsLeft)   // if the refreshCharBuf() did not add anything new
            return false;     //   give up and return.
        charsLeft = t;
    }

    //
    //  Ok, now we know that the current reader has at least srcLen chars
    //  available from fCharIndex on, so we can do a quick and dirty
    //  comparison straight to its buffer.
    //
    return memcmp(&fCharBuf[fCharIndex], toPeek, srcLen*sizeof(XMLCh)) == 0;
}


// ---------------------------------------------------------------------------
//  XMLReader: Setter methods (most are inlined)
// ---------------------------------------------------------------------------
bool XMLReader::setEncoding(const XMLCh* const newEncoding)
{
    //
    //  If the encoding was forced, then we ignore the new value and just
    //  return with success. If it was forced, then we are to use that
    //  encoding without question. Note that, if we are forced, we created
    //  a transcoder up front so there is no need to do one here in that
    //  case.
    //
    if (fForcedEncoding)
        return true;

Tinny Ng's avatar
Tinny Ng committed
    //
    // upperCase the newEncoding first for better performance
    //
    XMLCh* inputEncoding = XMLString::replicate(newEncoding, fMemoryManager);
PeiYong Zhang's avatar
PeiYong Zhang committed

    XMLRecognizer::Encodings newBaseEncoding;
PeiYong Zhang's avatar
PeiYong Zhang committed
    //
    //  Check for non-endian specific UTF-16 or UCS-4. If so, and if we
    //  are already in one of the endian versions of those encodings,
    //  then just keep it and go on. Otherwise, its not valid.
PeiYong Zhang's avatar
PeiYong Zhang committed
    //
    if (!XMLString::compareString(inputEncoding, XMLUni::fgUTF16EncodingString)
    ||  !XMLString::compareString(inputEncoding, XMLUni::fgUTF16EncodingString2)
    ||  !XMLString::compareString(inputEncoding, XMLUni::fgUTF16EncodingString3)
    ||  !XMLString::compareString(inputEncoding, XMLUni::fgUTF16EncodingString4)
    ||  !XMLString::compareString(inputEncoding, XMLUni::fgUTF16EncodingString5)
    ||  !XMLString::compareString(inputEncoding, XMLUni::fgUTF16EncodingString6)
    ||  !XMLString::compareString(inputEncoding, XMLUni::fgUTF16EncodingString7))
PeiYong Zhang's avatar
PeiYong Zhang committed
    {
        fMemoryManager->deallocate(inputEncoding);
Tinny Ng's avatar
Tinny Ng committed

        if ((fEncoding != XMLRecognizer::UTF_16L)
        &&  (fEncoding != XMLRecognizer::UTF_16B))
        {
            return false;
        }
PeiYong Zhang's avatar
PeiYong Zhang committed

        // Override with the original endian specific encoding
        newBaseEncoding = fEncoding;
PeiYong Zhang's avatar
PeiYong Zhang committed

        if (fEncoding == XMLRecognizer::UTF_16L) {
            fMemoryManager->deallocate(fEncodingStr);
            fEncodingStr = XMLString::replicate(XMLUni::fgUTF16LEncodingString, fMemoryManager);
PeiYong Zhang's avatar
PeiYong Zhang committed
        }
        else {
            fMemoryManager->deallocate(fEncodingStr);
            fEncodingStr = XMLString::replicate(XMLUni::fgUTF16BEncodingString, fMemoryManager);
        }
    }
    else if (!XMLString::compareString(inputEncoding, XMLUni::fgUCS4EncodingString)
         ||  !XMLString::compareString(inputEncoding, XMLUni::fgUCS4EncodingString2)
         ||  !XMLString::compareString(inputEncoding, XMLUni::fgUCS4EncodingString3)
         ||  !XMLString::compareString(inputEncoding, XMLUni::fgUCS4EncodingString4))
    {
        fMemoryManager->deallocate(inputEncoding);
Tinny Ng's avatar
Tinny Ng committed

        if ((fEncoding != XMLRecognizer::UCS_4L)
        &&  (fEncoding != XMLRecognizer::UCS_4B))
        {
            return false;
        }
PeiYong Zhang's avatar
PeiYong Zhang committed

        // Override with the original endian specific encoding
        newBaseEncoding = fEncoding;
PeiYong Zhang's avatar
PeiYong Zhang committed

        if (fEncoding == XMLRecognizer::UCS_4L) {
PeiYong Zhang's avatar
PeiYong Zhang committed

            fMemoryManager->deallocate(fEncodingStr);
            fEncodingStr = XMLString::replicate(XMLUni::fgUCS4LEncodingString, fMemoryManager);
        }
        else {
PeiYong Zhang's avatar
PeiYong Zhang committed

            fMemoryManager->deallocate(fEncodingStr);
            fEncodingStr = XMLString::replicate(XMLUni::fgUCS4BEncodingString, fMemoryManager);
PeiYong Zhang's avatar
PeiYong Zhang committed
        }
    }
     else
    {
        //
        //  Try to map the string to one of our standard encodings. If its not
        //  one of them, then it has to be one of the non-intrinsic encodings,
        //  in which case we have to delete our intrinsic encoder and create a
        //  new one.
        //
        newBaseEncoding = XMLRecognizer::encodingForName(inputEncoding);

        //
        //  If it does not come back as one of the auto-sensed encodings, then we
        //  have to possibly replace it and at least check a few things.
        //
        if (newBaseEncoding == XMLRecognizer::OtherEncoding)
PeiYong Zhang's avatar
PeiYong Zhang committed
        {
Tinny Ng's avatar
Tinny Ng committed
            //
            // We already know it's none of those non-endian special cases, 
            // so just replicate the new name and use it directly to create the transcoder
Tinny Ng's avatar
Tinny Ng committed
            //
            fMemoryManager->deallocate(fEncodingStr);
Tinny Ng's avatar
Tinny Ng committed
            fEncodingStr = inputEncoding;

            XMLTransService::Codes failReason;
            fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
            (
                fEncodingStr
                , failReason
                , kCharBufSize
Tinny Ng's avatar
Tinny Ng committed
            );
PeiYong Zhang's avatar
PeiYong Zhang committed
        }
        else
        {
            // Store the new encoding string since it is just an intrinsic
            fMemoryManager->deallocate(fEncodingStr);
            fEncodingStr = inputEncoding;
        }
Tinny Ng's avatar
Tinny Ng committed
    if (!fTranscoder) {
        //
        //  Now we can create a transcoder using the recognized fEncoding.  We
        //  might get back a transcoder for an intrinsically supported encoding,
        //  or we might get one from the underlying transcoding service.
        //
        XMLTransService::Codes failReason;
        fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
        (
            newBaseEncoding
            , failReason
            , kCharBufSize
Tinny Ng's avatar
Tinny Ng committed
        );
PeiYong Zhang's avatar
PeiYong Zhang committed

Tinny Ng's avatar
Tinny Ng committed
        if (!fTranscoder)
            ThrowXMLwithMemMgr1(TranscodingException, XMLExcepts::Trans_CantCreateCvtrFor, fEncodingStr, fMemoryManager);
Tinny Ng's avatar
Tinny Ng committed
    }
PeiYong Zhang's avatar
PeiYong Zhang committed

    // Update the base encoding member with the new base encoding found
    fEncoding = newBaseEncoding;

    // Looks ok to us
    return true;
}


// ---------------------------------------------------------------------------
//  XMLReader: Private helper methods
// ---------------------------------------------------------------------------

//
//  This is called when the encoding flag is set and just sets the fSwapped
//  flag appropriately: it is true when the current encoding is a UTF-16 or
//  UCS-4 variant whose endianness is the opposite of what
//  XMLPlatformUtils::fgXMLChBigEndian reports, and false for everything
//  else.
//
void XMLReader::checkForSwapped()
{
    if (XMLPlatformUtils::fgXMLChBigEndian)
    {
        // Big endian here, so the little endian encodings need swapping
        fSwapped = (fEncoding == XMLRecognizer::UTF_16L)
                || (fEncoding == XMLRecognizer::UCS_4L);
    }
    else
    {
        // Little endian here, so the big endian encodings need swapping
        fSwapped = (fEncoding == XMLRecognizer::UTF_16B)
                || (fEncoding == XMLRecognizer::UCS_4B);
    }
}


//
//  This is called from the constructor when the encoding is not forced.
//  We assume that the encoding has been auto-sensed at this point and that
//  fSwapped is set correctly.
//
//  In the case of UCS-4 and EBCDIC, we don't have to check for a decl.
//  The fact that we got here, means that there is one, because that's the
//  only way we can autosense those.
//
void XMLReader::doInitDecode()
{
    switch(fEncoding)
    {
        case XMLRecognizer::UCS_4B :
        case XMLRecognizer::UCS_4L :
        {
            // Remove bom if any
            if (((fRawByteBuf[0] == 0x00) && (fRawByteBuf[1] == 0x00) && (fRawByteBuf[2] == 0xFE) && (fRawByteBuf[3] == 0xFF)) ||
                ((fRawByteBuf[0] == 0xFF) && (fRawByteBuf[1] == 0xFE) && (fRawByteBuf[2] == 0x00) && (fRawByteBuf[3] == 0x00))  )
            {
                for (unsigned int i = 0; i < fRawBytesAvail; i++)
                    fRawByteBuf[i] = fRawByteBuf[i+4];

                fRawBytesAvail -=4;
            }

PeiYong Zhang's avatar
PeiYong Zhang committed
            // Look at the raw buffer as UCS4 chars
            const UCS4Ch* asUCS = (const UCS4Ch*)fRawByteBuf;

            while (fRawBufIndex < fRawBytesAvail)
            {
                // Get out the current 4 byte value and inc our raw buf index
                UCS4Ch curVal = *asUCS++;
                fRawBufIndex += sizeof(UCS4Ch);

                // Swap if that is required for this machine
                if (fSwapped)
                    curVal = BitOps::swapBytes(curVal);

                // Make sure its at least semi legal. If not, undo and throw
                if (curVal > 0xFFFF)
                {
                    fCharsAvail = 0;
                    fRawBufIndex = 0;
                    fMemoryManager->deallocate(fPublicId);
                    fMemoryManager->deallocate(fEncodingStr);
                    ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
PeiYong Zhang's avatar
PeiYong Zhang committed
                    (
                        TranscodingException
                        , XMLExcepts::Reader_CouldNotDecodeFirstLine
                        , fSystemId
PeiYong Zhang's avatar
PeiYong Zhang committed
                    );
                }

                // Convert the value to an XML char and store it
                fCharSizeBuf[fCharsAvail] = 4;
                fCharBuf[fCharsAvail++] = XMLCh(curVal);

                // Break out on the > character
                if (curVal == chCloseAngle)
                    break;
            }
            break;
        }

        case XMLRecognizer::UTF_8 :
        {
            // If there's a utf-8 BOM  (0xEF 0xBB 0xBF), skip past it.
            //   Don't move to char buf - no one wants to see it.
            //   Note: this causes any encoding= declaration to override
            //         the BOM's attempt to say that the encoding is utf-8.

            // Look at the raw buffer as short chars
            const char* asChars = (const char*)fRawByteBuf;

            if (fRawBytesAvail > XMLRecognizer::fgUTF8BOMLen &&
                XMLString::compareNString(  asChars
                                            , XMLRecognizer::fgUTF8BOM
                                            , XMLRecognizer::fgUTF8BOMLen) == 0)
            {
                fRawBufIndex += XMLRecognizer::fgUTF8BOMLen;
                asChars      += XMLRecognizer::fgUTF8BOMLen;
            }

            //
            //  First check that there are enough bytes to even see the
            //  decl indentifier. If not, get out now with no action since
            //  there is no decl.
            //
            if (fRawBytesAvail < XMLRecognizer::fgASCIIPreLen)
                break;

            // Check for the opening sequence. If not, then no decl
            if (XMLString::compareNString(  asChars
                                            , XMLRecognizer::fgASCIIPre
                                            , XMLRecognizer::fgASCIIPreLen))
            {
                break;
            }

            while (fRawBufIndex < fRawBytesAvail)
            {
                const char curCh = *asChars++;
                fRawBufIndex++;

                // Looks ok, so store it
                fCharSizeBuf[fCharsAvail] = 1;
                fCharBuf[fCharsAvail++] = XMLCh(curCh);

                // Break out on a > character
                if (curCh == chCloseAngle)
                    break;

                //
                //  A char greater than 0x7F is not allowed in this case. If
                //  so, undo and throw.
                //
                if (curCh & 0x80)
                {
                    fCharsAvail = 0;
                    fRawBufIndex = 0;
                    fMemoryManager->deallocate(fPublicId);
                    fMemoryManager->deallocate(fEncodingStr);
                    ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
PeiYong Zhang's avatar
PeiYong Zhang committed
                    (
                        TranscodingException
                        , XMLExcepts::Reader_CouldNotDecodeFirstLine
                        , fSystemId
PeiYong Zhang's avatar
PeiYong Zhang committed
                    );
                }
            }
            break;
        }

        case XMLRecognizer::UTF_16B :
        case XMLRecognizer::UTF_16L :
        {
            //
            //  If there is a decl here, we just truncate back the characters
            //  as we go. No surrogate creation would be allowed here in legal
            //  XML, so we consider it a transoding error if we find one.
            //
            if (fRawBytesAvail < 2)
                break;

PeiYong Zhang's avatar
PeiYong Zhang committed
            const UTF16Ch* asUTF16 = (const UTF16Ch*)&fRawByteBuf[fRawBufIndex];
            if ((*asUTF16 == chUnicodeMarker) || (*asUTF16 == chSwappedUnicodeMarker))
            {
                fRawBufIndex += sizeof(UTF16Ch);
                asUTF16++;
PeiYong Zhang's avatar
PeiYong Zhang committed
            }

            //  First check that there are enough raw bytes for there to even
            //  be a decl indentifier. If not, then nothing to do.
            //
            if (fRawBytesAvail - fRawBufIndex < XMLRecognizer::fgUTF16PreLen)
            {
PeiYong Zhang's avatar
PeiYong Zhang committed
                break;
            }

            //
            //  See we get a match on the prefix. If not, then reset and
            //  break out.
            //
            if (fEncoding == XMLRecognizer::UTF_16B)
            {
                if (memcmp(asUTF16, XMLRecognizer::fgUTF16BPre, XMLRecognizer::fgUTF16PreLen))
                {
PeiYong Zhang's avatar
PeiYong Zhang committed
                    break;
                }
            }
             else
            {
                if (memcmp(asUTF16, XMLRecognizer::fgUTF16LPre, XMLRecognizer::fgUTF16PreLen))
                {
PeiYong Zhang's avatar
PeiYong Zhang committed
                    break;
                }
            }

            while (fRawBufIndex < fRawBytesAvail)
            {
                // Get out the current 2 byte value
                UTF16Ch curVal = *asUTF16++;
                fRawBufIndex += sizeof(UTF16Ch);

                // Swap if that is required for this machine
                if (fSwapped)
                    curVal = BitOps::swapBytes(curVal);

                //
                //  Store it and bump the target index, implicitly converting
                //  if UTF16Ch and XMLCh are not the same size.
                //
                fCharSizeBuf[fCharsAvail] = 2;
                fCharBuf[fCharsAvail++] = curVal;

                // Break out on a > char
                if (curVal == chCloseAngle)
                    break;
            }
            break;
        }

        case XMLRecognizer::EBCDIC :
        {
            //
            //  We use special support in the intrinsic EBCDIC-US transcoder
            //  to go through one char at a time.
            //
            const XMLByte* srcPtr = fRawByteBuf;
            while (1)
            {
                // Transcode one char from the source
                const XMLCh chCur = XMLEBCDICTranscoder::xlatThisOne(*srcPtr++);
                fRawBufIndex++;

                //
                //  And put it into the character buffer. This stuff has to
                //  look like it was normally transcoded.
                //
                fCharSizeBuf[fCharsAvail] = 1;
                fCharBuf[fCharsAvail++] = chCur;

                // If its a > char, then break out
                if (chCur == chCloseAngle)
                    break;

                // Watch for using up all input and get out
                if (fRawBufIndex == fRawBytesAvail)
                    break;
            }
            break;
        }

        default :
            // It should never be anything else here
            fMemoryManager->deallocate(fPublicId);
            fMemoryManager->deallocate(fEncodingStr);                    
            fMemoryManager->deallocate(fSystemId);
            ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Reader_BadAutoEncoding, fMemoryManager);
PeiYong Zhang's avatar
PeiYong Zhang committed
            break;
    }

    //
    //  Ok, by the time we get here, if its a legal XML file we have eaten
    //  the XML/TextDecl. So, if we are a PE and are being referenced from
    //  outside a literal, then we need to throw in an arbitrary space that
    //  is required by XML.
    //
    if ((fType == Type_PE) && (fRefFrom == RefFrom_NonLiteral))
        fCharBuf[fCharsAvail++] = chSpace;
    
    //  Calculate fCharOfsBuf buffer using the elements from fCharBufSize
    if (fCalculateSrcOfs)
    {
        fCharOfsBuf[0] = 0;
        for (unsigned int index = 1; index < fCharsAvail; ++index) {
            fCharOfsBuf[index] = fCharOfsBuf[index-1]+fCharSizeBuf[index-1];
        }
    }
PeiYong Zhang's avatar
PeiYong Zhang committed
}


//
//  Called internally once the raw byte buffer has been used up. Whatever
//  bytes have not been consumed yet are kept, the rest of the buffer is
//  refilled from the stream, and the count of available bytes is updated.
//
void XMLReader::refreshRawBuffer()
{
    // How many bytes sit between the read position and the end of the data
    const unsigned int leftover = fRawBytesAvail - fRawBufIndex;

    // Slide those unconsumed bytes down to the front of the buffer
    unsigned int moved = 0;
    while (moved < leftover)
    {
        fRawByteBuf[moved] = fRawByteBuf[fRawBufIndex + moved];
        ++moved;
    }

    //
    //  Fill the space after the kept bytes from the stream. The request is
    //  reduced by the number of kept bytes, and that same number is added
    //  to what the stream delivered to get the new total.
    //
    fRawBytesAvail = fStream->readBytes
    (
        &fRawByteBuf[leftover], kRawBufSize - leftover
    ) + leftover;

    // The kept bytes now start the buffer, so reading resumes at the front
    fRawBufIndex = 0;
}


//
//  This method is called internally when we run out of characters in the
//  transcoded character buffer. We transcode up to another maxChars chars
//  from the raw byte buffer into bufToFill, with the per-character sizes
//  going into charSizes, reloading the raw byte buffer first if it is
//  running low.
//
//  Returns the number of characters transcoded, or zero if there are no
//  more raw bytes to be had.
//
unsigned int
XMLReader::xcodeMoreChars(          XMLCh* const            bufToFill
                            ,       unsigned char* const    charSizes
                            , const unsigned int            maxChars)
{
    // If we are plain tuckered out, then return zero now
    if (!fRawBytesAvail)
        return 0;

    //
    //  If our raw buffer is low, then lets load up another batch of
    //  raw bytes now.  We can't check for exactly zero bytes left because
    //  transcoding of multi-byte encodings may have left a few bytes
    //  representing a partial character in the buffer that can't be
    //  used until the next buffer (and the rest of the character)
    //  is read.
    //
    const unsigned int bytesLeft = fRawBytesAvail - fRawBufIndex;
    if (bytesLeft < 100)
    {
        refreshRawBuffer();

        // If we didn't get anything more just return a zero now
        if (!fRawBytesAvail)
            return 0;
    }

    // Ask the transcoder to internalize another batch of chars
    XMLSize_t bytesEaten = 0;
    const XMLSize_t charsDone = fTranscoder->transcodeFrom
    (
        &fRawByteBuf[fRawBufIndex]
        , fRawBytesAvail - fRawBufIndex
        , bufToFill
        , maxChars
        , bytesEaten
        , charSizes
    );

    // Update the raw buffer index
    fRawBufIndex += bytesEaten;

    return charsDone;
}

/***
 *
 * XML1.1
 *
 * 2.11 End-of-Line Handling
 *
 *    XML parsed entities are often stored in computer files which, for editing 
 *    convenience, are organized into lines. These lines are typically separated 
 *    by some combination of the characters CARRIAGE RETURN (#xD) and LINE FEED (#xA).
 *
 *    To simplify the tasks of applications, the XML processor MUST behave as if 
 *    it normalized all line breaks in external parsed entities (including the document 
 *    entity) on input, before parsing, by translating all of the following to a single 
 *    #xA character:
 *
 *  1. the two-character sequence #xD #xA
 *  2. the two-character sequence #xD #x85
 *  3. the single character #x85
 *  4. the single character #x2028
 *  5. any #xD character that is not immediately followed by #xA or #x85.
 *
 *
 ***/
//
//  Updates the line/column counters for 'curCh' and, for external sources,
//  rewrites a line break in 'curCh' to a single chLF. The char at
//  fCharIndex is treated as the one following 'curCh'. When 'inDecl' is
//  set and the document is XML 1.1, a #x85 or #x2028 raises a
//  TranscodingException.
//
void XMLReader::handleEOL(XMLCh& curCh, bool inDecl)
{
    // A plain #xA is already in normalized form: just start a new line
    if (curCh == chLF)
    {
        fCurLine++;
        fCurCol = 1;
        return;
    }

    // 1. the two-character sequence #xD #xA
    // 2. the two-character sequence #xD #x85
    // 5. any #xD character that is not immediately followed by #xA or #x85.
    if (curCh == chCR)
    {
        fCurLine++;
        fCurCol = 1;

        // Internal content is taken as already normalized
        if (fSource != Source_External)
            return;

        //
        //  Swallow the second half of a two-character line break, if there
        //  is a next char to look at, and then report the break as an LF.
        //
        if ((fCharIndex < fCharsAvail) || refreshCharBuffer())
        {
            const XMLCh following = fCharBuf[fCharIndex];
            const bool pairedNEL = (following == chNEL) && fNEL;
            if ((following == chLF) || pairedNEL)
                fCharIndex++;
        }
        curCh = chLF;
        return;
    }

    // Anything that is not #x85 or #x2028 is an ordinary char on this line
    if ((curCh != chNEL) && (curCh != chLineSeparator))
    {
        fCurCol++;
        return;
    }

    // 3. the single character #x85
    // 4. the single character #x2028
    if (inDecl && fXMLVersion == XMLV1_1)
    {
        /***
         * XML1.1
         *
         * 2.11 End-of-Line Handling
         *  ...
         *   The characters #x85 and #x2028 cannot be reliably recognized and translated
         *   until an entity's encoding declaration (if present) has been read.
         *   Therefore, it is a fatal error to use them within the XML declaration or
         *   text declaration.
         *
         ***/
        ThrowXMLwithMemMgr1
            (
            TranscodingException
            , XMLExcepts::Reader_NelLsepinDecl
            , fSystemId
            , fMemoryManager
            );
    }

    // Only treated as a line break when NEL handling is on for external input
    if (fNEL && fSource == Source_External)
    {
        fCurLine++;
        fCurCol = 1;
        curCh = chLF;
    }
}

Tinny Ng's avatar
Tinny Ng committed
XERCES_CPP_NAMESPACE_END