Skip to content
Snippets Groups Projects
DGXMLScanner.cpp 116 KiB
Newer Older
Khaled Noaman's avatar
Khaled Noaman committed
                    // If we are validating and its required, then an error
                    if (defType == XMLAttDef::Required)
                    {
                        fValidator->emitError
                        (
                            XMLValid::RequiredAttrNotProvided
                            , curDef.getFullName()
                        );
                    }
                    else if ((defType == XMLAttDef::Default) ||
		                       (defType == XMLAttDef::Fixed)  )
                    {
                        if (fStandalone && curDef.isExternal())
                        {
                            // XML 1.0 Section 2.9
                            // Document is standalone, so attributes must not be defaulted.
                            fValidator->emitError(XMLValid::NoDefAttForStandalone, curDef.getFullName(), elemDecl->getFullName());
                        }
                    }
                }

                // Fault in the value if needed, and bump the att count
                if ((defType == XMLAttDef::Default)
                ||  (defType == XMLAttDef::Fixed))
                {
                    // Let the validator pass judgement on the attribute value
                    if (fValidate)
                    {
                        fValidator->validateAttrValue
                        (
                            &curDef
                            , curDef.getValue()
                            , false
                            , elemDecl
                        );
                    }

                    XMLAttr* curAtt;
                    if (retCount >= curAttListSize)
                    {
                        if (fDoNamespaces)
                        {
                            curAtt = new (fMemoryManager) XMLAttr
Khaled Noaman's avatar
Khaled Noaman committed
                            (
                                -1
                                , curDef.getFullName()
                                , XMLUni::fgZeroLenString
                                , curDef.getValue()
                                , curDef.getType()
                                , false
                            curAtt = new (fMemoryManager) XMLAttr
Khaled Noaman's avatar
Khaled Noaman committed
                            (
                                -1
                                , curDef.getFullName()
                                , XMLUni::fgZeroLenString
                                , curDef.getValue()
                                , curDef.getType()
                                , false
Khaled Noaman's avatar
Khaled Noaman committed
                            );
                        }

                        fAttrList->addElement(curAtt);
                    }
                    else
                    {
                        curAtt = fAttrList->elementAt(retCount);
                        if (fDoNamespaces)
                        {
                            curAtt->set
                            (
                                fEmptyNamespaceId
                                , curDef.getFullName()
                                , curDef.getValue()
                                , curDef.getType()
                            );
                        }
                        else
                        {
                            curAtt->set
                            (
                                -1
                                , curDef.getFullName()
                                , XMLUni::fgZeroLenString
                                , curDef.getValue()
                                , curDef.getType()
                            );
                        }
                        curAtt->setSpecified(false);
                    }

                    if (fDoNamespaces)
                    {
                        //  Map the new attribute's prefix to a URI id and store
                        //  that in the attribute object.
                        const XMLCh* attPrefix = curAtt->getPrefix();
                        if (attPrefix && *attPrefix) {
                            curAtt->setURIId
                            (
                                resolvePrefix(attPrefix, ElemStack::Mode_Attribute)
                            );
                        }
                    }

                    retCount++;
                }
            }
        }
    }

    for (unsigned int i=0; i < fAttrNSList->size(); i++) {

        XMLAttr* providedAttr = fAttrNSList->elementAt(i);

        providedAttr->setURIId
        (
	        resolvePrefix
            (
                providedAttr->getPrefix(),
                ElemStack::Mode_Attribute
            )
        );
    }

    fAttrNSList->removeAllElements();

    return retCount;
}


unsigned int
DGXMLScanner::resolvePrefix(  const   XMLCh* const        prefix
                              , const ElemStack::MapModes mode)
{
    //  Watch for the special namespace prefixes. We always map these to
    //  special URIs. 'xml' gets mapped to the official URI that its defined
    //  to map to by the NS spec. xmlns gets mapped to a special place holder
    //  URI that we define (so that it maps to something checkable.)
    if (XMLString::equals(prefix, XMLUni::fgXMLNSString))
        return fXMLNSNamespaceId;
    else if (XMLString::equals(prefix, XMLUni::fgXMLString))
        return fXMLNamespaceId;

    //  Ask the element stack to search up itself for a mapping for the
    //  passed prefix.
    bool unknown;
    unsigned int uriId = fElemStack.mapPrefixToURI(prefix, mode, unknown);

    // If it was unknown, then the URI was faked in but we have to issue an error
    if (unknown)
        emitError(XMLErrs::UnknownPrefix, prefix);

    return uriId;
}

//  This method will reset the scanner data structures, and related plugged
//  in stuff, for a new scan session. We get the input source for the primary
//  XML entity, create the reader for it, and push it on the stack so that
//  upon successful return from here we are ready to go.
void DGXMLScanner::scanReset(const InputSource& src)
{

    //  This call implicitly tells us that we are going to reuse the scanner
    //  if it was previously used. So tell the validator to reset itself.
    //
    //  But, if the fUseCacheGrammar flag is set, then don't reset it.
    //
    //  NOTE:   The ReaderMgr is flushed on the way out, because that is
    //          required to insure that files are closed.
    fGrammarResolver->cacheGrammarFromParse(fToCacheGrammar);
    fGrammarResolver->useCachedGrammarInParse(fUseCachedGrammar);

    fDTDGrammar = new (fGrammarPoolMemoryManager) DTDGrammar(fGrammarPoolMemoryManager);
    fGrammarResolver->putGrammar(fDTDGrammar);
Khaled Noaman's avatar
Khaled Noaman committed
    fGrammar = fDTDGrammar;
    fRootGrammar = 0;
    fValidator->setGrammar(fGrammar);

    // Reset validation
    fValidate = (fValScheme == Val_Always) ? true : false;
Khaled Noaman's avatar
Khaled Noaman committed

    //  And for all installed handlers, send reset events. This gives them
    //  a chance to flush any cached data.
    if (fDocHandler)
        fDocHandler->resetDocument();
    if (fEntityHandler)
        fEntityHandler->resetEntities();
    if (fErrorReporter)
        fErrorReporter->resetErrors();

    // Clear out the id reference list
    resetValidationContext();
Khaled Noaman's avatar
Khaled Noaman committed

    // Reset the Root Element Name
    fMemoryManager->deallocate(fRootElemName);//delete [] fRootElemName;
Khaled Noaman's avatar
Khaled Noaman committed
    fRootElemName = 0;

    //  Reset the element stack, and give it the latest ids for the special
    //  URIs it has to know about.
    fElemStack.reset
    (
        fEmptyNamespaceId
        , fUnknownNamespaceId
        , fXMLNamespaceId
        , fXMLNSNamespaceId
    );

    // Reset some status flags
    fInException = false;
    fStandalone = false;
    fErrorCount = 0;
    fHasNoDTD = true;

    // Reset the validators
    fDTDValidator->reset();
    fDTDValidator->setErrorReporter(fErrorReporter);
    if (fValidatorFromUser)
        fValidator->reset();

    //  Handle the creation of the XML reader object for this input source.
    //  This will provide us with transcoding and basic lexing services.
    XMLReader* newReader = fReaderMgr.createReader
    (
        src
        , true
        , XMLReader::RefFrom_NonLiteral
        , XMLReader::Type_General
        , XMLReader::Source_External
        , fCalculateSrcOfs
    );

    if (!newReader) {
        if (src.getIssueFatalErrorIfNotFound())
            ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource, src.getSystemId(), fMemoryManager);
Khaled Noaman's avatar
Khaled Noaman committed
        else
            ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource_Warning, src.getSystemId(), fMemoryManager);
Khaled Noaman's avatar
Khaled Noaman committed
    }

    // Push this read onto the reader manager
    fReaderMgr.pushReader(newReader, 0);

    // and reset security-related things if necessary:
    if(fSecurityManager != 0) 
    {
        fEntityExpansionLimit = fSecurityManager->getEntityExpansionLimit();
        fEntityExpansionCount = 0;
    }
    if(fUIntPoolRowTotal >= 32) 
    { // 8 KB tied up with validating attributes...
        fAttDefRegistry->removeAll();
        fUndeclaredAttrRegistry->removeAll();
        recreateUIntPool();
    }
    else
    {
        // note that this will implicitly reset the values of the hashtables,
        // though their buckets will still be tied up
        resetUIntPool();
    }
Khaled Noaman's avatar
Khaled Noaman committed
}


//  This method is called between markup in content. It scans for character
//  data that is sent to the document handler. It watches for any markup
//  characters that would indicate that the character data has ended. It also
//  handles expansion of general and character entities.
//
//  sendData() is a local static helper for this method which handles some
//  code that must be done in three different places here.
void DGXMLScanner::sendCharData(XMLBuffer& toSend)
{
    // If no data in the buffer, then nothing to do
    if (toSend.isEmpty())
        return;

    //  We do different things according to whether we are validating or
    //  not. If not, its always just characters; else, it depends on the
    //  current element's content model.
    if (fValidate)
    {
        // Get the raw data we need for the callback
        const XMLCh* const rawBuf = toSend.getRawBuffer();
        const unsigned int len = toSend.getLen();

        // And see if the current element is a 'Children' style content model
        const ElemStack::StackElem* topElem = fElemStack.topElement();

        // Get the character data opts for the current element
        XMLElementDecl::CharDataOpts charOpts = topElem->fThisElement->getCharDataOpts();

        if (charOpts == XMLElementDecl::NoCharData)
        {
            // They definitely cannot handle any type of char data
            fValidator->emitError(XMLValid::NoCharDataInCM);
        }
Tinny Ng's avatar
Tinny Ng committed
        else if (fReaderMgr.getCurrentReader()->isAllSpaces(rawBuf, len))
Khaled Noaman's avatar
Khaled Noaman committed
        {
            //  Its all spaces. So, if they can take spaces, then send it
            //  as ignorable whitespace. If they can handle any char data
            //  send it as characters.
            if (charOpts == XMLElementDecl::SpacesOk) {
                if (fDocHandler)
                    fDocHandler->ignorableWhitespace(rawBuf, len, false);
            }
            else if (charOpts == XMLElementDecl::AllCharData)
            {
                if (fDocHandler)
                    fDocHandler->docCharacters(rawBuf, len, false);
            }
        }
        else
        {
            //  If they can take any char data, then send it. Otherwise, they
            //  can only handle whitespace and can't handle this stuff so
            //  issue an error.
            if (charOpts == XMLElementDecl::AllCharData)
            {
                if (fDocHandler)
                    fDocHandler->docCharacters(rawBuf, len, false);
            }
            else
            {
                fValidator->emitError(XMLValid::NoCharDataInCM);
            }
        }
    }
    else
    {
        // Always assume its just char data if not validating
        if (fDocHandler)
            fDocHandler->docCharacters(toSend.getRawBuffer(), toSend.getLen(), false);
    }

    // Reset buffer
    toSend.reset();
}



//  This method is called with a key/value string pair that represents an
//  xmlns="yyy" or xmlns:xxx="yyy" attribute. This method will update the
//  current top of the element stack based on this data. We know that when
//  we get here, that it is one of these forms, so we don't bother confirming
//  it.
//
//  But we have to ensure
//      1. xxx is not xmlns
//      2. if xxx is xml, then yyy must match XMLUni::fgXMLURIName, and vice versa
//      3. yyy is not XMLUni::fgXMLNSURIName
//      4. if xxx is not null, then yyy cannot be an empty string.
void DGXMLScanner::updateNSMap(const    XMLCh* const attrPrefix
                               , const  XMLCh* const attrLocalName
                               , const  XMLCh* const attrValue)
{
    //  We either have the default prefix (""), or we point it into the attr
    //  name parameter. Note that the xmlns is not the prefix we care about
    //  here. To us, the 'prefix' is really the local part of the attrName
    //  parameter.
    //
    //  Check 1. xxx is not xmlns
    //        2. if xxx is xml, then yyy must match XMLUni::fgXMLURIName, and vice versa
    //        3. yyy is not XMLUni::fgXMLNSURIName
    //        4. if xxx is not null, then yyy cannot be an empty string.
    if (attrPrefix && *attrPrefix) {

        if (XMLString::equals(attrLocalName, XMLUni::fgXMLNSString))
            emitError(XMLErrs::NoUseOfxmlnsAsPrefix);
        else if (XMLString::equals(attrLocalName, XMLUni::fgXMLString)) {
            if (!XMLString::equals(attrValue, XMLUni::fgXMLURIName))
                emitError(XMLErrs::PrefixXMLNotMatchXMLURI);
        }

        if (!attrValue)
            emitError(XMLErrs::NoEmptyStrNamespace, attrLocalName);
        else if(!*attrValue && fXMLVersion == XMLReader::XMLV1_0)
Khaled Noaman's avatar
Khaled Noaman committed
            emitError(XMLErrs::NoEmptyStrNamespace, attrLocalName);
    }

    if (XMLString::equals(attrValue, XMLUni::fgXMLNSURIName))
        emitError(XMLErrs::NoUseOfxmlnsURI);
    else if (XMLString::equals(attrValue, XMLUni::fgXMLURIName)) {
        if (!XMLString::equals(attrLocalName, XMLUni::fgXMLString))
            emitError(XMLErrs::XMLURINotMatchXMLPrefix);
    }

    //  Ok, we have to get the unique id for the attribute value, which is the
    //  URI that this value should be mapped to. The validator has the
    //  namespace string pool, so we ask him to find or add this new one. Then
    //  we ask the element stack to add this prefix to URI Id mapping.
    fElemStack.addPrefix
    (
        attrLocalName
        , fURIStringPool->addOrFind(attrValue)
    );
}

void DGXMLScanner::scanAttrListforNameSpaces(RefVectorOf<XMLAttr>* theAttrList, int attCount, 
                                                XMLElementDecl*       elemDecl)
Khaled Noaman's avatar
Khaled Noaman committed
{
    //  Make an initial pass through the list and find any xmlns attributes or
    //  schema attributes.
    //  When we find one, send it off to be used to update the element stack's
    //  namespace mappings.
    for (int index = 0; index < attCount; index++)
    {
        // each attribute has the prefix:suffix="value"
        XMLAttr* curAttr = theAttrList->elementAt(index);
        const XMLCh* attPrefix = curAttr->getPrefix();
        const XMLCh* attLocalName = curAttr->getName();

        if (attPrefix && *attPrefix)
        {
            if (XMLString::equals(attPrefix, XMLUni::fgXMLString)) {
                curAttr->setURIId(fXMLNamespaceId);
            }
            else if (XMLString::equals(attPrefix, XMLUni::fgXMLNSString)) {

                curAttr->setURIId(fXMLNSNamespaceId);
                updateNSMap(attPrefix, attLocalName, curAttr->getValue());
            }
            else {
                fAttrNSList->addElement(curAttr);
            }
        }
        else if (XMLString::equals(XMLUni::fgXMLNSString, attLocalName))
        {
            updateNSMap(attPrefix, XMLUni::fgZeroLenString, curAttr->getValue());
        }
		
        // check for duplicate namespace attributes:
        // by checking for qualified names with the same local part and with prefixes 
        // which have been bound to namespace names that are identical.         
        XMLAttr* loopAttr;
        for (int attrIndex=0; attrIndex < index; attrIndex++) {
            loopAttr = theAttrList->elementAt(attrIndex);
            if (loopAttr->getURIId() == curAttr->getURIId() &&
                XMLString::equals(loopAttr->getName(), curAttr->getName())) {
                emitError
                ( 
                    XMLErrs::AttrAlreadyUsedInSTag
                    , curAttr->getName()
                    , elemDecl->getFullName()
                );
            }
Khaled Noaman's avatar
Khaled Noaman committed
    }
}

InputSource* DGXMLScanner::resolveSystemId(const XMLCh* const sysId)
{
    // Create a buffer for expanding the system id
    XMLBufBid bbSys(&fBufMgr);
    XMLBuffer& expSysId = bbSys.getBuffer();

    //  Allow the entity handler to expand the system id if they choose
    //  to do so.
    InputSource* srcToFill = 0;
    if (fEntityHandler)
    {
        if (!fEntityHandler->expandSystemId(sysId, expSysId))
            expSysId.set(sysId);

        XMLResourceIdentifier resourceIdentifier(XMLResourceIdentifier::ExternalEntity,
                            expSysId.getRawBuffer());
        srcToFill = fEntityHandler->resolveEntity(&resourceIdentifier);
Khaled Noaman's avatar
Khaled Noaman committed
    }
    else
    {
        expSysId.set(sysId);
    }

    //  If they didn't create a source via the entity handler, then we
    //  have to create one on our own.
    if (!srcToFill)
    {
        ReaderMgr::LastExtEntityInfo lastInfo;
        fReaderMgr.getLastExtEntityInfo(lastInfo);

        XMLURL urlTmp(fMemoryManager);
        if ((!urlTmp.setURL(lastInfo.systemId, expSysId.getRawBuffer(), urlTmp)) ||
            (urlTmp.isRelative()))
Khaled Noaman's avatar
Khaled Noaman committed
        {
                srcToFill = new (fMemoryManager) LocalFileInputSource
                (
                    lastInfo.systemId
                    , expSysId.getRawBuffer()
                ThrowXMLwithMemMgr(MalformedURLException, XMLExcepts::URL_MalformedURL, fMemoryManager);            
Khaled Noaman's avatar
Khaled Noaman committed
        }
        else
        {
            if (fStandardUriConformant && urlTmp.hasInvalidChar())
                ThrowXMLwithMemMgr(MalformedURLException, XMLExcepts::URL_MalformedURL, fMemoryManager);
            srcToFill = new (fMemoryManager) URLInputSource(urlTmp, fMemoryManager);
        }        
Khaled Noaman's avatar
Khaled Noaman committed
    }

    return srcToFill;
}

// ---------------------------------------------------------------------------
//  DGXMLScanner: Private parsing methods
// ---------------------------------------------------------------------------
bool DGXMLScanner::scanAttValue(  const   XMLAttDef* const    attDef
Khaled Noaman's avatar
Khaled Noaman committed
                                  ,       XMLBuffer&          toFill)
{
    enum States
    {
        InWhitespace
        , InContent
    };

    // Get the type and name
    const XMLAttDef::AttTypes type = (attDef)
                        ?attDef->getType()
                        :XMLAttDef::CData;
Khaled Noaman's avatar
Khaled Noaman committed

    // Reset the target buffer
    toFill.reset();

    // Get the next char which must be a single or double quote
    XMLCh quoteCh;
    if (!fReaderMgr.skipIfQuote(quoteCh))
        return false;

    //  We have to get the current reader because we have to ignore closing
    //  quotes until we hit the same reader again.
    const unsigned int curReader = fReaderMgr.getCurrentReaderNum();

    // Get attribute def - to check to see if it's declared externally or not
    bool  isAttExternal = (attDef)
                        ?attDef->isExternal()
                        :false;
Khaled Noaman's avatar
Khaled Noaman committed

    //  Loop until we get the attribute value. Note that we use a double
    //  loop here to avoid the setup/teardown overhead of the exception
    //  handler on every round.
    XMLCh   nextCh;
    XMLCh   secondCh = 0;
    States  curState = InContent;
    bool    firstNonWS = false;
    bool    gotLeadingSurrogate = false;
    bool    escaped;
    while (true)
    {
    try
    {
        while(true)
        {
            nextCh = fReaderMgr.getNextChar();
Khaled Noaman's avatar
Khaled Noaman committed

            if (!nextCh)
                ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
Khaled Noaman's avatar
Khaled Noaman committed

            // Check for our ending quote in the same entity
            if (nextCh == quoteCh)
            {
                if (curReader == fReaderMgr.getCurrentReaderNum())
                    return true;

                // Watch for spillover into a previous entity
                if (curReader > fReaderMgr.getCurrentReaderNum())
                {
                    emitError(XMLErrs::PartialMarkupInEntity);
                    return false;
                }
            }

            //  Check for an entity ref now, before we let it affect our
            //  whitespace normalization logic below. We ignore the empty flag
            //  in this one.
            escaped = false;
            if (nextCh == chAmpersand)
            {
                if (scanEntityRef(true, nextCh, secondCh, escaped) != EntityExp_Returned)
                {
                    gotLeadingSurrogate = false;
                    continue;
                }
            }
            else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
Khaled Noaman's avatar
Khaled Noaman committed
            {
                // Deal with surrogate pairs
Khaled Noaman's avatar
Khaled Noaman committed
                //  Its a leading surrogate. If we already got one, then
                //  issue an error, else set leading flag to make sure that
                //  we look for a trailing next time.
                if (gotLeadingSurrogate)
                    emitError(XMLErrs::Expected2ndSurrogateChar);
Khaled Noaman's avatar
Khaled Noaman committed
                    gotLeadingSurrogate = true;
            }
            else
            {
                //  If its a trailing surrogate, make sure that we are
                //  prepared for that. Else, its just a regular char so make
                //  sure that we were not expected a trailing surrogate.
                if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
                {
                    // Its trailing, so make sure we were expecting it
                    if (!gotLeadingSurrogate)
                        emitError(XMLErrs::Unexpected2ndSurrogateChar);
                }
                else
                {
                    //  Its just a char, so make sure we were not expecting a
                    //  trailing surrogate.
                    if (gotLeadingSurrogate)
                        emitError(XMLErrs::Expected2ndSurrogateChar);

                    // Its got to at least be a valid XML character
Tinny Ng's avatar
Tinny Ng committed
                    if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh))
Khaled Noaman's avatar
Khaled Noaman committed
                    {
                        XMLCh tmpBuf[9];
                        XMLString::binToText
                        (
                            nextCh
                            , tmpBuf
                            , 8
                            , 16
                        );
                        emitError(XMLErrs::InvalidCharacterInAttrValue, attrName, tmpBuf);
Khaled Noaman's avatar
Khaled Noaman committed
                    }
                }
                gotLeadingSurrogate = false;
            }

            //  If its not escaped, then make sure its not a < character, which
            //  is not allowed in attribute values.
            if (!escaped && (nextCh == chOpenAngle))
                emitError(XMLErrs::BracketInAttrValue, attrName);

            //  If the attribute is a CDATA type we do simple replacement of
            //  tabs and new lines with spaces, if the character is not escaped
            //  by way of a char ref.
            //
            //  Otherwise, we do the standard non-CDATA normalization of
            //  compressing whitespace to single spaces and getting rid of leading
            //  and trailing whitespace.
            if (type == XMLAttDef::CData)
            {
                if (!escaped)
                {
                    if ((nextCh == 0x09) || (nextCh == 0x0A) || (nextCh == 0x0D))
                    {
                        // Check Validity Constraint for Standalone document declaration
                        // XML 1.0, Section 2.9
                        if (fStandalone && fValidate && isAttExternal)
                        {
                             // Can't have a standalone document declaration of "yes" if  attribute
                             // values are subject to normalisation
                             fValidator->emitError(XMLValid::NoAttNormForStandalone, attrName);
                        }
                        nextCh = chSpace;
                    }
                }
            }
            else
            {
                if (curState == InWhitespace)
                {
Tinny Ng's avatar
Tinny Ng committed
                    if ((escaped && nextCh != chSpace) || !fReaderMgr.getCurrentReader()->isWhitespace(nextCh))
Khaled Noaman's avatar
Khaled Noaman committed
                    {
                        if (firstNonWS)
                            toFill.append(chSpace);
                        curState = InContent;
                        firstNonWS = true;
                    }
                    else
                    {
                        continue;
                    }
                }
                else if (curState == InContent)
                {
                    if ((nextCh == chSpace) ||
Tinny Ng's avatar
Tinny Ng committed
                        (fReaderMgr.getCurrentReader()->isWhitespace(nextCh) && !escaped))
Khaled Noaman's avatar
Khaled Noaman committed
                    {
                        curState = InWhitespace;

                        // Check Validity Constraint for Standalone document declaration
                        // XML 1.0, Section 2.9
                        if (fStandalone && fValidate && isAttExternal)
                        {
                            if (!firstNonWS || (nextCh != chSpace) || (fReaderMgr.lookingAtSpace()))
                            {
                                 // Can't have a standalone document declaration of "yes" if  attribute
                                 // values are subject to normalisation
                                 fValidator->emitError(XMLValid::NoAttNormForStandalone, attrName);
                            }
                        }
                        continue;
                    }
                    firstNonWS = true;
                }
            }

            // Else add it to the buffer
            toFill.append(nextCh);

            if (secondCh)
                toFill.append(secondCh);
Khaled Noaman's avatar
Khaled Noaman committed
        }
    }
    catch(const EndOfEntityException&)
    {
        // Just eat it and continue.
        gotLeadingSurrogate = false;
        escaped = false;
    }
    }
    return true;
}


//  This method scans a CDATA section. It collects the character into one
//  of the temp buffers and calls the document handler, if any, with the
//  characters. It assumes that the <![CDATA string has been scanned before
//  this call.
void DGXMLScanner::scanCDSection()
{
    static const XMLCh CDataClose[] =
    {
            chCloseSquare, chCloseAngle, chNull
    };

    //  The next character should be the opening square bracket. If not
    //  issue an error, but then try to recover by skipping any whitespace
    //  and checking again.
    if (!fReaderMgr.skippedChar(chOpenSquare))
    {
        emitError(XMLErrs::ExpectedOpenSquareBracket);
        fReaderMgr.skipPastSpaces();

        // If we still don't find it, then give up, else keep going
        if (!fReaderMgr.skippedChar(chOpenSquare))
            return;
    }

    // Get a buffer for this
    XMLBufBid bbCData(&fBufMgr);

    //  We just scan forward until we hit the end of CDATA section sequence.
    //  CDATA is effectively a big escape mechanism so we don't treat markup
    //  characters specially here.
    bool            emittedError = false;
Tinny Ng's avatar
Tinny Ng committed
    bool     gotLeadingSurrogate = false;

    // Get the character data opts for the current element
    const ElemStack::StackElem* topElem = fElemStack.topElement();
    XMLElementDecl::CharDataOpts charOpts =  topElem->fThisElement->getCharDataOpts();

Khaled Noaman's avatar
Khaled Noaman committed
    while (true)
    {
        const XMLCh nextCh = fReaderMgr.getNextChar();

        // Watch for unexpected end of file
        if (!nextCh)
        {
            emitError(XMLErrs::UnterminatedCDATASection);
            ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
Tinny Ng's avatar
Tinny Ng committed
        if (fValidate && fStandalone && (fReaderMgr.getCurrentReader()->isWhitespace(nextCh)))
Khaled Noaman's avatar
Khaled Noaman committed
        {
            // This document is standalone; this ignorable CDATA whitespace is forbidden.
            // XML 1.0, Section 2.9
            // And see if the current element is a 'Children' style content model
            if (topElem->fThisElement->isExternal()) {

                if (charOpts == XMLElementDecl::SpacesOk) // Element Content
                {
                    // Error - standalone should have a value of "no" as whitespace detected in an
                    // element type with element content whose element declaration was external
                    fValidator->emitError(XMLValid::NoWSForStandalone);
                }
            }
        }

        //  If this is a close square bracket it could be our closing
        //  sequence.
        if (nextCh == chCloseSquare && fReaderMgr.skippedString(CDataClose))
        {
            //  make sure we were not expecting a trailing surrogate.
            if (gotLeadingSurrogate)
                emitError(XMLErrs::Expected2ndSurrogateChar);

            if (fValidate) {

                if (charOpts != XMLElementDecl::AllCharData)
                {
                    // They definitely cannot handle any type of char data
                    fValidator->emitError(XMLValid::NoCharDataInCM);
                }
            }

Khaled Noaman's avatar
Khaled Noaman committed
            // If we have a doc handler, call it
            if (fDocHandler)
            {
                fDocHandler->docCharacters
                    (
                    bbCData.getRawBuffer()
                    , bbCData.getLen()
                    , true
                    );
            }

            // And we are done
            break;
        }

        //  Make sure its a valid character. But if we've emitted an error
        //  already, don't bother with the overhead since we've already told
        //  them about it.
        if (!emittedError)
        {
Tinny Ng's avatar
Tinny Ng committed
            // Deal with surrogate pairs
            if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
Khaled Noaman's avatar
Khaled Noaman committed
            {
Tinny Ng's avatar
Tinny Ng committed
                //  Its a leading surrogate. If we already got one, then
                //  issue an error, else set leading flag to make sure that
                //  we look for a trailing next time.
                if (gotLeadingSurrogate)
                    emitError(XMLErrs::Expected2ndSurrogateChar);
                else
                    gotLeadingSurrogate = true;
            }
            else
            {
                //  If its a trailing surrogate, make sure that we are
                //  prepared for that. Else, its just a regular char so make
                //  sure that we were not expected a trailing surrogate.
                if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
                {
                    // Its trailing, so make sure we were expecting it
                    if (!gotLeadingSurrogate)
                        emitError(XMLErrs::Unexpected2ndSurrogateChar);
                }
                else
                {
                    //  Its just a char, so make sure we were not expecting a
                    //  trailing surrogate.
                    if (gotLeadingSurrogate)
                        emitError(XMLErrs::Expected2ndSurrogateChar);

                    // Its got to at least be a valid XML character
                    else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh))
                    {
                        XMLCh tmpBuf[9];
                        XMLString::binToText
                        (
                            nextCh
                            , tmpBuf
                            , 8
                            , 16
Tinny Ng's avatar
Tinny Ng committed
                        );
                        emitError(XMLErrs::InvalidCharacter, tmpBuf);
                        emittedError = true;
                    }
                }
                gotLeadingSurrogate = false;
Khaled Noaman's avatar
Khaled Noaman committed
            }
        }

        // Add it to the buffer
        bbCData.append(nextCh);
    }
}


void DGXMLScanner::scanCharData(XMLBuffer& toUse)
{
    //  We have to watch for the stupid ]]> sequence, which is illegal in
    //  character data. So this is a little state machine that handles that.
    enum States
    {
        State_Waiting
        , State_GotOne
        , State_GotTwo
    };

    // Reset the buffer before we start
    toUse.reset();

    // Turn on the 'throw at end' flag of the reader manager
    ThrowEOEJanitor jan(&fReaderMgr, true);

    //  In order to be more efficient we have to use kind of a deeply nested
    //  set of blocks here. The outer block puts on a try and catches end of
    //  entity exceptions. The inner loop is the per-character loop. If we
    //  put the try inside the inner loop, it would work but would require
    //  the exception handling code setup/teardown code to be invoked for
    //  each character.
    XMLCh   nextCh;
    XMLCh   secondCh = 0;
    States  curState = State_Waiting;
    bool    escaped = false;
    bool    gotLeadingSurrogate = false;
    bool    notDone = true;
    while (notDone)
    {
        try
        {
            while (true)
            {
                //  Eat through as many plain content characters as possible without
                //  needing special handling.  Moving most content characters here,
                //  in this one call, rather than running the overall loop once
                //  per content character, is a speed optimization.
                if (curState == State_Waiting  &&  !gotLeadingSurrogate)
Khaled Noaman's avatar
Khaled Noaman committed
                {
                     fReaderMgr.movePlainContentChars(toUse);
                // Try to get another char from the source
                //   The code from here on down covers all contengencies,
                if (!fReaderMgr.getNextCharIfNot(chOpenAngle, nextCh))
                {
                    // If we were waiting for a trailing surrogate, its an error
                    if (gotLeadingSurrogate)
                        emitError(XMLErrs::Expected2ndSurrogateChar);
Khaled Noaman's avatar
Khaled Noaman committed

Khaled Noaman's avatar
Khaled Noaman committed
                }

                //  Watch for a reference. Note that the escapement mechanism
                //  is ignored in this content.
Khaled Noaman's avatar
Khaled Noaman committed
                if (nextCh == chAmpersand)
                {
                    sendCharData(toUse);

                    // Turn off the throwing at the end of entity during this
                    ThrowEOEJanitor jan(&fReaderMgr, false);

                    if (scanEntityRef(false, nextCh, secondCh, escaped) != EntityExp_Returned)
                    {
                        gotLeadingSurrogate = false;
                        continue;
                    }
                }
                else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
Khaled Noaman's avatar
Khaled Noaman committed
                {
                    // Deal with surrogate pairs
Khaled Noaman's avatar
Khaled Noaman committed
                    //  Its a leading surrogate. If we already got one, then
                    //  issue an error, else set leading flag to make sure that
                    //  we look for a trailing next time.
                    if (gotLeadingSurrogate)
                        emitError(XMLErrs::Expected2ndSurrogateChar);
                    else
                        gotLeadingSurrogate = true;
                }
                else
                {
                    //  If its a trailing surrogate, make sure that we are
                    //  prepared for that. Else, its just a regular char so make
                    //  sure that we were not expected a trailing surrogate.
                    if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
                    {
                        // Its trailing, so make sure we were expecting it
                        if (!gotLeadingSurrogate)
                            emitError(XMLErrs::Unexpected2ndSurrogateChar);
                    }
                    else
                    {
                        //  Its just a char, so make sure we were not expecting a
                        //  trailing surrogate.
                        if (gotLeadingSurrogate)
                            emitError(XMLErrs::Expected2ndSurrogateChar);

                        // Make sure the returned char is a valid XML char
Tinny Ng's avatar
Tinny Ng committed
                        if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh))
Khaled Noaman's avatar
Khaled Noaman committed
                        {
                            XMLCh tmpBuf[9];
                            XMLString::binToText
                            (
                                nextCh
                                , tmpBuf
                                , 8
                                , 16
                            );
                            emitError(XMLErrs::InvalidCharacter, tmpBuf);
Khaled Noaman's avatar
Khaled Noaman committed
                        }
                    }
                    gotLeadingSurrogate = false;
                }

                 // Keep the state machine up to date