XMLReader.cpp


    //
    // upperCase the newEncoding first for better performance
    //
    XMLCh* inputEncoding = XMLString::replicate(newEncoding, fMemoryManager);
    XMLString::upperCase(inputEncoding);

    //
    //  Try to map the string to one of our standard encodings. If its not
    //  one of them, then it has to be one of the non-intrinsic encodings,
    //  in which case we have to delete our intrinsic encoder and create a
    //  new one.
    //
    XMLRecognizer::Encodings newBaseEncoding = XMLRecognizer::encodingForName
    (
        inputEncoding
    );

    //
    //  If it does not come back as one of the auto-sensed encodings, then we
    //  have to possibly replace it and at least check a few things.
    //
    if (newBaseEncoding == XMLRecognizer::OtherEncoding)
    {
        //
        //  Check for non-endian specific UTF-16 or UCS-4. If so, and if we
        //  are already in one of the endian versions of those encodings,
        //  then just keep it and go on. Otherwise, its not valid.
        //
        if (!XMLString::compareString(inputEncoding, XMLUni::fgUTF16EncodingString)
        ||  !XMLString::compareString(inputEncoding, XMLUni::fgUTF16EncodingString2)
        ||  !XMLString::compareString(inputEncoding, XMLUni::fgUTF16EncodingString3)
        ||  !XMLString::compareString(inputEncoding, XMLUni::fgUTF16EncodingString4)
        ||  !XMLString::compareString(inputEncoding, XMLUni::fgUTF16EncodingString5))
        {
            fMemoryManager->deallocate(inputEncoding);

            if ((fEncoding != XMLRecognizer::UTF_16L)
            &&  (fEncoding != XMLRecognizer::UTF_16B))
            {
                return false;
            }

            // Override with the original endian specific encoding
            newBaseEncoding = fEncoding;

            if (fEncoding == XMLRecognizer::UTF_16L) {
                fMemoryManager->deallocate(fEncodingStr);
                fEncodingStr = XMLString::replicate(XMLUni::fgUTF16LEncodingString, fMemoryManager);
            }
            else {
                fMemoryManager->deallocate(fEncodingStr);
                fEncodingStr = XMLString::replicate(XMLUni::fgUTF16BEncodingString, fMemoryManager);
            }
        }
        else if (!XMLString::compareString(inputEncoding, XMLUni::fgUCS4EncodingString)
             ||  !XMLString::compareString(inputEncoding, XMLUni::fgUCS4EncodingString2)
             ||  !XMLString::compareString(inputEncoding, XMLUni::fgUCS4EncodingString3))
        {
            fMemoryManager->deallocate(inputEncoding);

            if ((fEncoding != XMLRecognizer::UCS_4L)
            &&  (fEncoding != XMLRecognizer::UCS_4B))
            {
                return false;
            }

            // Override with the original endian specific encoding
            newBaseEncoding = fEncoding;

            if (fEncoding == XMLRecognizer::UCS_4L) {

                fMemoryManager->deallocate(fEncodingStr);
                fEncodingStr = XMLString::replicate(XMLUni::fgUCS4LEncodingString, fMemoryManager);
            }
            else {

                fMemoryManager->deallocate(fEncodingStr);
                fEncodingStr = XMLString::replicate(XMLUni::fgUCS4BEncodingString, fMemoryManager);
            }
        }
         else
        {
            //
            // None of those special cases, so just replicate the new name
            // and use it directly to create the transcoder
            //
            fMemoryManager->deallocate(fEncodingStr);
            fEncodingStr = inputEncoding;

            XMLTransService::Codes failReason;
            fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
            (
                fEncodingStr
                , failReason
                , kCharBufSize
                , fMemoryManager
            );
        }
    }
     else
    {
        // Store the new encoding string since it is just an intrinsic
        fMemoryManager->deallocate(fEncodingStr);
        fEncodingStr = inputEncoding;
    }

    if (!fTranscoder) {
        //
        //  Now we can create a transcoder using the recognized fEncoding.  We
        //  might get back a transcoder for an intrinsically supported encoding,
        //  or we might get one from the underlying transcoding service.
        //
        XMLTransService::Codes failReason;
        fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
        (
            newBaseEncoding
            , failReason
            , kCharBufSize
            , fMemoryManager
        );

        if (!fTranscoder)
            ThrowXMLwithMemMgr1(TranscodingException, XMLExcepts::Trans_CantCreateCvtrFor, fEncodingStr, fMemoryManager);
    }

    // Update the base encoding member with the new base encoding found
    fEncoding = newBaseEncoding;

    // Looks ok to us
    return true;
}


// ---------------------------------------------------------------------------
//  XMLReader: Private helper methods
// ---------------------------------------------------------------------------

//
//  This is called when the encoding flag is set and just sets the fSwapped
//  flag appropriately.
//
void XMLReader::checkForSwapped()
{
    // Assume not swapped
    fSwapped = false;

    #if defined(ENDIANMODE_LITTLE)

        if ((fEncoding == XMLRecognizer::UTF_16B)
        ||  (fEncoding == XMLRecognizer::UCS_4B))
        {
            fSwapped = true;
        }

    #elif defined(ENDIANMODE_BIG)

        if ((fEncoding == XMLRecognizer::UTF_16L)
        ||  (fEncoding == XMLRecognizer::UCS_4L))
        {
            fSwapped = true;
        }

    #endif
}


//
//  This is called from the constructor when the encoding is not forced.
//  We assume that the encoding has been auto-sensed at this point and that
//  fSwapped is set correctly.
//
//  In the case of UCS-4 and EBCDIC, we don't have to check for a decl.
//  The fact that we got here, means that there is one, because that's the
//  only way we can autosense those.
//
void XMLReader::doInitDecode()
{
    switch(fEncoding)
    {
        case XMLRecognizer::UCS_4B :
        case XMLRecognizer::UCS_4L :
        {
            // Remove bom if any
            if (((fRawByteBuf[0] == 0x00) && (fRawByteBuf[1] == 0x00) && (fRawByteBuf[2] == 0xFE) && (fRawByteBuf[3] == 0xFF)) ||
                ((fRawByteBuf[0] == 0xFF) && (fRawByteBuf[1] == 0xFE) && (fRawByteBuf[2] == 0x00) && (fRawByteBuf[3] == 0x00))  )
            {
                for (unsigned int i = 0; i < fRawBytesAvail; i++)
                    fRawByteBuf[i] = fRawByteBuf[i+4];

                fRawBytesAvail -=4;
            }

            // Look at the raw buffer as UCS4 chars
            const UCS4Ch* asUCS = (const UCS4Ch*)fRawByteBuf;

            while (fRawBufIndex < fRawBytesAvail)
            {
                // Get out the current 4 byte value and inc our raw buf index
                UCS4Ch curVal = *asUCS++;
                fRawBufIndex += sizeof(UCS4Ch);

                // Swap if that is required for this machine
                if (fSwapped)
                    curVal = BitOps::swapBytes(curVal);

                // Make sure its at least semi legal. If not, undo and throw
                if (curVal > 0xFFFF)
                {
                    fCharsAvail = 0;
                    fRawBufIndex = 0;
                    fMemoryManager->deallocate(fPublicId);
                    fMemoryManager->deallocate(fEncodingStr);
                    ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
                    ThrowXMLwithMemMgr1
                    (
                        TranscodingException
                        , XMLExcepts::Reader_CouldNotDecodeFirstLine
                        , fSystemId
                        , fMemoryManager
                    );
                }

                // Convert the value to an XML char and store it
                fCharSizeBuf[fCharsAvail] = 4;
                fCharBuf[fCharsAvail++] = XMLCh(curVal);

                // Break out on the > character
                if (curVal == chCloseAngle)
                    break;
            }
            break;
        }

        case XMLRecognizer::UTF_8 :
        {
            // If there's a utf-8 BOM  (0xEF 0xBB 0xBF), skip past it.
            //   Don't move to char buf - no one wants to see it.
            //   Note: this causes any encoding= declaration to override
            //         the BOM's attempt to say that the encoding is utf-8.

            // Look at the raw buffer as short chars
            const char* asChars = (const char*)fRawByteBuf;

            if (fRawBytesAvail > XMLRecognizer::fgUTF8BOMLen &&
                XMLString::compareNString(  asChars
                                            , XMLRecognizer::fgUTF8BOM
                                            , XMLRecognizer::fgUTF8BOMLen) == 0)
            {
                fRawBufIndex += XMLRecognizer::fgUTF8BOMLen;
                asChars      += XMLRecognizer::fgUTF8BOMLen;
            }

            //
            //  First check that there are enough bytes to even see the
            //  decl indentifier. If not, get out now with no action since
            //  there is no decl.
            //
            if (fRawBytesAvail < XMLRecognizer::fgASCIIPreLen)
                break;

            // Check for the opening sequence. If not, then no decl
            if (XMLString::compareNString(  asChars
                                            , XMLRecognizer::fgASCIIPre
                                            , XMLRecognizer::fgASCIIPreLen))
            {
                break;
            }

            while (fRawBufIndex < fRawBytesAvail)
            {
                const char curCh = *asChars++;
                fRawBufIndex++;

                // Looks ok, so store it
                fCharSizeBuf[fCharsAvail] = 1;
                fCharBuf[fCharsAvail++] = XMLCh(curCh);

                // Break out on a > character
                if (curCh == chCloseAngle)
                    break;

                //
                //  A char greater than 0x7F is not allowed in this case. If
                //  so, undo and throw.
                //
                if (curCh & 0x80)
                {
                    fCharsAvail = 0;
                    fRawBufIndex = 0;
                    fMemoryManager->deallocate(fPublicId);
                    fMemoryManager->deallocate(fEncodingStr);
                    ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
                    ThrowXMLwithMemMgr1
                    (
                        TranscodingException
                        , XMLExcepts::Reader_CouldNotDecodeFirstLine
                        , fSystemId
                        , fMemoryManager
                    );
                }
            }
            break;
        }

        case XMLRecognizer::UTF_16B :
        case XMLRecognizer::UTF_16L :
        {
            //
            //  If there is a decl here, we just truncate back the characters
            //  as we go. No surrogate creation would be allowed here in legal
            //  XML, so we consider it a transoding error if we find one.
            //
            if (fRawBytesAvail < 2)
                break;

            unsigned int postBOMIndex = 0;
            const UTF16Ch* asUTF16 = (const UTF16Ch*)&fRawByteBuf[fRawBufIndex];
            if ((*asUTF16 == chUnicodeMarker) || (*asUTF16 == chSwappedUnicodeMarker))
            {
                fRawBufIndex += sizeof(UTF16Ch);
                asUTF16++;
                postBOMIndex = fRawBufIndex;
            }

            //  First check that there are enough raw bytes for there to even
            //  be a decl indentifier. If not, then nothing to do.
            //
            if (fRawBytesAvail - fRawBufIndex < XMLRecognizer::fgUTF16PreLen)
            {
                fRawBufIndex = postBOMIndex;
                break;
            }

            //
            //  See we get a match on the prefix. If not, then reset and
            //  break out.
            //
            if (fEncoding == XMLRecognizer::UTF_16B)
            {
                if (memcmp(asUTF16, XMLRecognizer::fgUTF16BPre, XMLRecognizer::fgUTF16PreLen))
                {
                    fRawBufIndex = postBOMIndex;
                    break;
                }
            }
             else
            {
                if (memcmp(asUTF16, XMLRecognizer::fgUTF16LPre, XMLRecognizer::fgUTF16PreLen))
                {
                    fRawBufIndex = postBOMIndex;
                    break;
                }
            }

            while (fRawBufIndex < fRawBytesAvail)
            {
                // Get out the current 2 byte value
                UTF16Ch curVal = *asUTF16++;
                fRawBufIndex += sizeof(UTF16Ch);

                // Swap if that is required for this machine
                if (fSwapped)
                    curVal = BitOps::swapBytes(curVal);

                //
                //  Store it and bump the target index, implicitly converting
                //  if UTF16Ch and XMLCh are not the same size.
                //
                fCharSizeBuf[fCharsAvail] = 2;
                fCharBuf[fCharsAvail++] = curVal;

                // Break out on a > char
                if (curVal == chCloseAngle)
                    break;
            }
            break;
        }

        case XMLRecognizer::EBCDIC :
        {
            //
            //  We use special support in the intrinsic EBCDIC-US transcoder
            //  to go through one char at a time.
            //
            const XMLByte* srcPtr = fRawByteBuf;
            while (1)
            {
                // Transcode one char from the source
                const XMLCh chCur = XMLEBCDICTranscoder::xlatThisOne(*srcPtr++);
                fRawBufIndex++;

                //
                //  And put it into the character buffer. This stuff has to
                //  look like it was normally transcoded.
                //
                fCharSizeBuf[fCharsAvail] = 1;
                fCharBuf[fCharsAvail++] = chCur;

                // If its a > char, then break out
                if (chCur == chCloseAngle)
                    break;

                // Watch for using up all input and get out
                if (fRawBufIndex == fRawBytesAvail)
                    break;
            }
            break;
        }

        default :
            // It should never be anything else here
            fMemoryManager->deallocate(fPublicId);
            fMemoryManager->deallocate(fEncodingStr);                    
            fMemoryManager->deallocate(fSystemId);
            ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Reader_BadAutoEncoding, fMemoryManager);
            break;
    }

    //
    //  Ok, by the time we get here, if its a legal XML file we have eaten
    //  the XML/TextDecl. So, if we are a PE and are being referenced from
    //  outside a literal, then we need to throw in an arbitrary space that
    //  is required by XML.
    //
    if ((fType == Type_PE) && (fRefFrom == RefFrom_NonLiteral))
        fCharBuf[fCharsAvail++] = chSpace;
    
    //  Calculate fCharOfsBuf buffer using the elements from fCharBufSize
    if (fCalculateSrcOfs)
    {
        fCharOfsBuf[0] = 0;
        for (unsigned int index = 1; index < fCharsAvail; ++index) {
            fCharOfsBuf[index] = fCharOfsBuf[index-1]+fCharSizeBuf[index-1];
        }
    }
}


//
//  This method is called internally when we run out of bytes in the raw
//  buffer. We just read as many bytes as we can into the raw buffer again
//  and store the number of bytes we got.
//
void XMLReader::refreshRawBuffer()
{
    //
    //  If there are any bytes left, move them down to the start. There
    //  should only ever be (max bytes per char - 1) at the most.
    //
    const unsigned int bytesLeft = fRawBytesAvail - fRawBufIndex;

    // Move the existing ones down
    for (unsigned int index = 0; index < bytesLeft; index++)
        fRawByteBuf[index] = fRawByteBuf[fRawBufIndex + index];

    //
    //  And then read into the buffer past the existing bytes. Add back in
    //  that many to the bytes read, and subtract that many from the bytes
    //  requested.
    //
    fRawBytesAvail = fStream->readBytes
    (
        &fRawByteBuf[bytesLeft], kRawBufSize - bytesLeft
    ) + bytesLeft;

    //
    //  We need to reset the buffer index back to the start in all cases,
    //  since any trailing data was copied down to the start.
    //
    fRawBufIndex = 0;
}


//
//  This method is called internally when we run out of characters in the
//  trancoded character buffer. We transcode up to another maxChars chars
//  from the
//
unsigned int
XMLReader::xcodeMoreChars(          XMLCh* const            bufToFill
                            ,       unsigned char* const    charSizes
                            , const unsigned int            maxChars)
{
    // If we are plain tuckered out, then return zero now
    if (!fRawBytesAvail)
        return 0;

    //
    //  If our raw buffer is low, then lets load up another batch of
    //  raw bytes now.  We can't check for exactly zero bytes left because
    //  transcoding of multi-byte encodings may have left a few bytes
    //  representing a partial character in the buffer that can't be
    //  used until the next buffer (and the rest of the character)
    //  is read.
    //
    unsigned int bytesLeft = fRawBytesAvail - fRawBufIndex;
    if (bytesLeft < 100)
    {
        refreshRawBuffer();

        // If we didn't get anything more just return a zero now
        if (!fRawBytesAvail)
            return 0;
    }

    // Ask the transcoder to internalize another batch of chars
    unsigned int bytesEaten;
    const unsigned int charsDone = fTranscoder->transcodeFrom
    (
        &fRawByteBuf[fRawBufIndex]
        , fRawBytesAvail - fRawBufIndex
        , bufToFill
        , maxChars
        , bytesEaten
        , charSizes
    );

    // Update the raw buffer index
    fRawBufIndex += bytesEaten;

    return charsDone;
}

XERCES_CPP_NAMESPACE_END