Skip to content
Snippets Groups Projects
XMLScanner.cpp 146 KiB
Newer Older
PeiYong Zhang's avatar
PeiYong Zhang committed
    //
    int curTop = 0;
    for (int index = VersionString; index < StandaloneString; index++)
    {
        if (flags[index] != -1)
        {
            if (flags[index] !=  curTop + 1)
            {
                emitError(XMLErrs::DeclStringsInWrongOrder);
                break;
            }
            curTop = flags[index];
        }
    }

    //
    //  If its an XML decl, the version must be present.
    //  If its a Text decl, then encoding must be present AND standalone must not be present.
    //
    if ((type == Decl_XML) && (flags[VersionString] == -1))
        emitError(XMLErrs::XMLVersionRequired);
    else if (type == Decl_Text) {
        if (flags[StandaloneString] != -1)
            emitError(XMLErrs::StandaloneNotLegal);
        if (flags[EncodingString] == -1)
            emitError(XMLErrs::EncodingRequired);
    }

    if (!fReaderMgr.skippedChar(chQuestion))
    {
        emitError(XMLErrs::UnterminatedXMLDecl);
        fReaderMgr.skipPastChar(chCloseAngle);
    }
     else if (!fReaderMgr.skippedChar(chCloseAngle))
    {
        emitError(XMLErrs::UnterminatedXMLDecl);
        fReaderMgr.skipPastChar(chCloseAngle);
    }

    //
    //  If we have a document handler then call the XML Decl callback.
    //
    //  !NOTE! Do this before we possibly update the reader with the
    //  actual encoding string. Otherwise, we will pass the wrong thing
    //  for the last parameter!
    //
    if (fDocHandler)
    {
        fDocHandler->XMLDecl
        (
            bbVersion.getRawBuffer()
            , bbEncoding.getRawBuffer()
            , bbStand.getRawBuffer()
            , fReaderMgr.getCurrentEncodingStr()
        );
    }

    //
    //  Ok, we've now seen the real encoding string, if there was one, so
    //  lets call back on the current reader and tell it what the real
    //  encoding string was. If it fails, that's because it represents some
    //  sort of contradiction with the autosensed format, and it keeps the
    //  original encoding.
    //
    //  NOTE: This can fail for a number of reasons, such as a bogus encoding
    //  name or because its in flagrant contradiction of the auto-sensed
    //  format.
    //
    if (flags[EncodingString] != -1)
    {
        if (!fReaderMgr.getCurrentReader()->setEncoding(bbEncoding.getRawBuffer()))
            emitError(XMLErrs::ContradictoryEncoding, bbEncoding.getRawBuffer());
    }
}

const XMLCh* XMLScanner::getURIText(const   unsigned int    uriId) const
{
    if (fURIStringPool->exists(uriId)) {
        // Look up the URI in the string pool and return its id
        const XMLCh* value = fURIStringPool->getValueForId(uriId);
        if (!value)
            return XMLUni::fgZeroLenString;

        return value;
    }
    else
        return XMLUni::fgZeroLenString;
}

bool XMLScanner::getURIText(  const   unsigned int    uriId
                      ,       XMLBuffer&      uriBufToFill) const
{
    if (fURIStringPool->exists(uriId)) {
        // Look up the URI in the string pool and return its id
        const XMLCh* value = fURIStringPool->getValueForId(uriId);
        if (!value)
            return false;

        uriBufToFill.set(value);
        return true;
    }
    else
        return false;
}

unsigned int
XMLScanner::resolveQName(   const   XMLCh* const        qName
                            ,       XMLBuffer&          nameBuf
                            ,       XMLBuffer&          prefixBuf
                            , const ElemStack::MapModes mode)
{
    // Reset both target buffers in case we don't get anything for either
    nameBuf.reset();
    prefixBuf.reset();

    //
    //  Lets split out the qName into a URI and name buffer first. The URI
    //  can be empty.
    //
    const int colonPos = XMLString::indexOf(qName, chColon);
    unsigned int uriId = 0;
    if (colonPos == -1)
    {
        //
        //  Its all name with no prefix, so put the whole thing into the name
        //  buffer. Then map the empty string to a URI, since the empty string
        //  represents the default namespace. This will either return some
        //  explicit URI which the default namespace is mapped to, or the
        //  the default global namespace.
        //
        nameBuf.append(qName);
        bool unknown;
        uriId = fElemStack.mapPrefixToURI(prefixBuf.getRawBuffer(), mode, unknown);

        #if defined(XERCES_DEBUG)
        if (unknown)
        {
            // <TBD> This one should never be unknown
        }
        #endif
    }
     else
    {
        //
        //  Copy the chars up to but not including the colon into the prefix
        //  buffer.
        //
        prefixBuf.append(qName, colonPos);

        // And copy over the rest of the chars to the name buffer
        nameBuf.append(&qName[colonPos+1]);

        //
        //  Watch for the special namespace prefixes. We always map these to
        //  special URIs. 'xml' gets mapped to the official URI that its defined
        //  to map to by the NS spec. xmlns gets mapped to a special place holder
        //  URI that we define (so that it maps to something checkable.)
        //
        if (!XMLString::compareString(prefixBuf.getRawBuffer(), XMLUni::fgXMLNSString))
            uriId = fXMLNSNamespaceId;
        else if (!XMLString::compareString(prefixBuf.getRawBuffer(), XMLUni::fgXMLString))
            uriId = fXMLNamespaceId;
        else
        {
            bool unknown;
            uriId = fElemStack.mapPrefixToURI(prefixBuf.getRawBuffer(), mode, unknown);
            if (unknown)
                emitError(XMLErrs::UnknownPrefix, prefixBuf.getRawBuffer());
        }
    }
    return uriId;
}

bool XMLScanner::checkXMLDecl(bool startWithAngle) {
    //
    // [23] XMLDecl     ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
    // [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
    //
    // [3]  S           ::= (#x20 | #x9 | #xD | #xA)+
    //

    if (startWithAngle) {
        if (fReaderMgr.peekString(XMLUni::fgXMLDeclString)) {
            if (fReaderMgr.skippedString(XMLUni::fgXMLDeclStringSpace)
               || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringHTab)
               || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringLF)
               || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringCR))
            {
                return true;
            }
            else if (fReaderMgr.skippedString(XMLUni::fgXMLDeclStringSpaceU)
               || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringHTabU)
               || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringLFU)
               || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringCRU))
            {
                //
                //  Just in case, check for upper case. If found, issue
                //  an error, but keep going.
                //
                emitError(XMLErrs::XMLDeclMustBeLowerCase);
                return true;
            }
        }
    }
    else {
        if (fReaderMgr.peekString(XMLUni::fgXMLString)) {
            if (fReaderMgr.skippedString(XMLUni::fgXMLStringSpace)
               || fReaderMgr.skippedString(XMLUni::fgXMLStringHTab)
               || fReaderMgr.skippedString(XMLUni::fgXMLStringLF)
               || fReaderMgr.skippedString(XMLUni::fgXMLStringCR))
            {
                return true;
            }
            else if (fReaderMgr.skippedString(XMLUni::fgXMLStringSpaceU)
               || fReaderMgr.skippedString(XMLUni::fgXMLStringHTabU)
               || fReaderMgr.skippedString(XMLUni::fgXMLStringLFU)
               || fReaderMgr.skippedString(XMLUni::fgXMLStringCRU))
            {
                //
                //  Just in case, check for upper case. If found, issue
                //  an error, but keep going.
                //
                emitError(XMLErrs::XMLDeclMustBeLowerCase);
                return true;
            }
        }
    }

    return false;
}


// ---------------------------------------------------------------------------
//  XMLScanner: Helper methos
// ---------------------------------------------------------------------------
void XMLScanner::resizeElemState() {

    unsigned int newSize = fElemStateSize * 2;
    unsigned int* newElemState = new unsigned int[newSize];

    // Copy the existing values
    unsigned int index = 0;
    for (; index < fElemStateSize; index++)
        newElemState[index] = fElemState[index];

    for (; index < newSize; index++)
        newElemState[index] = 0;

    // Delete the old array and udpate our members
    delete [] fElemState;
    fElemState = newElemState;
    fElemStateSize = newSize;
}

// ---------------------------------------------------------------------------
//  XMLScanner: IC activation methos
// ---------------------------------------------------------------------------
void XMLScanner::activateSelectorFor(IdentityConstraint* const ic) {

    IC_Selector* selector = ic->getSelector();

    if (!selector)
        return;

    XPathMatcher* matcher = selector->createMatcher(fFieldActivator);

    fMatcherStack->addMatcher(matcher);
    matcher->startDocumentFragment();
}

// ---------------------------------------------------------------------------
//  XMLScanner: Grammar preparsing
// ---------------------------------------------------------------------------
Grammar* XMLScanner::loadGrammar(const   XMLCh* const systemId
                                 , const short        grammarType
                                 , const bool         toCache)
{
    InputSource* srcToUse = 0;

    if (fEntityHandler){
        srcToUse = fEntityHandler->resolveEntity(XMLUni::fgZeroLenString, systemId);
    }

    //
    //  First we try to parse it as a URL. If that fails, we assume its
    //  a file and try it that way.
    //
    if (!srcToUse) {

        try
        {
            //
            //  Create a temporary URL. Since this is the primary document,
            //  it has to be fully qualified. If not, then assume we are just
            //  mistaking a file for a URL.
            //
            XMLURL tmpURL(systemId);
            if (tmpURL.isRelative()) {
                srcToUse = new LocalFileInputSource(systemId);
            }
            else
            {
                srcToUse = new URLInputSource(tmpURL);
            }
        }
        catch(const MalformedURLException&)
        {
            srcToUse = new LocalFileInputSource(systemId);
        }
        catch(...)
        {
            // Just rethrow this, since its not our problem
            throw;
        }
    }

    Janitor<InputSource> janSrc(srcToUse);
    return loadGrammar(*srcToUse, grammarType, toCache);
}

Grammar* XMLScanner::loadGrammar(const   char* const systemId
                                 , const short       grammarType
                                 , const bool        toCache)
{
    // We just delegate this to the XMLCh version after transcoding
    XMLCh* tmpBuf = XMLString::transcode(systemId);
    ArrayJanitor<XMLCh> janBuf(tmpBuf);
    return loadGrammar(tmpBuf, grammarType, toCache);
}


Grammar* XMLScanner::loadGrammar(const   InputSource& src
                                 , const short        grammarType
                                 , const bool         toCache)
{
    try
    {
        fGrammarResolver->cacheGrammarFromParse(false);
        fGrammarResolver->useCachedGrammarInParse(false);
        fRootGrammar = 0;

        if (fValScheme == Val_Auto) {
            fValidate = true;
        }

        // Reset some status flags
        fInException = false;
        fStandalone = false;
        fErrorCount = 0;
        fHasNoDTD = true;
        fSeeXsi = false;

        if (grammarType == Grammar::SchemaGrammarType) {
            return loadXMLSchemaGrammar(src, toCache);
        }
        else if (grammarType == Grammar::DTDGrammarType) {
            return loadDTDGrammar(src, toCache);
        }

        // Reset the reader manager to close all files, sockets, etc...
        fReaderMgr.reset();
    }

    //
    //  NOTE:
    //
    //  In all of the error processing below, the emitError() call MUST come
    //  before the flush of the reader mgr, or it will fail because it tries
    //  to find out the position in the XML source of the error.
    //
    catch(const XMLErrs::Codes)
    {
        // This is a 'first fatal error' type exit, so reset and fall through
        fReaderMgr.reset();
    }

    catch(const XMLValid::Codes)
    {
        // This is a 'first fatal error' type exit, so reset and fall through
        fReaderMgr.reset();
       
    }

    catch(const XMLException& excToCatch)
    {
        //
        //  Emit the error and catch any user exception thrown from here. Make
        //  sure in all cases we flush the reader manager.
        //
        fInException = true;
        try
        {
            if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
                emitError
                (
                    XMLErrs::DisplayErrorMessage
                    , excToCatch.getMessage()
                );
            else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
                emitError
                (
                    XMLErrs::XMLException_Fatal
                    , excToCatch.getType()
                    , excToCatch.getMessage()
                );
            else
                emitError
                (
                    XMLErrs::XMLException_Error
                    , excToCatch.getType()
                    , excToCatch.getMessage()
                );
        }

        catch(...)
        {
            // Flush the reader manager and rethrow user's error
            fReaderMgr.reset();
            throw;
        }

        // If it returned, then reset the reader manager and fall through
        fReaderMgr.reset();
    }

    catch(...)
    {
        // Reset and rethrow
        fReaderMgr.reset();
        throw;
    }

    return 0;
}

Grammar* XMLScanner::loadDTDGrammar(const InputSource& src,
                                    const bool toCache)
{
   // Reset the validators
    fDTDValidator->reset();
    if (fValidatorFromUser)
        fValidator->reset();

    if (!fValidator->handlesDTD()) {
        if (fValidatorFromUser && fValidate)
            ThrowXML(RuntimeException, XMLExcepts::Gen_NoDTDValidator);
        else {
            fValidator = fDTDValidator;
        }
    }

    fDTDGrammar = new DTDGrammar();
    fGrammarResolver->putGrammar(XMLUni::fgDTDEntityString, fDTDGrammar);
    fGrammar = fDTDGrammar;
    fGrammarType = fGrammar->getGrammarType();
    fValidator->setGrammar(fGrammar);

    //
    //  And for all installed handlers, send reset events. This gives them
    //  a chance to flush any cached data.
    //
    if (fDocHandler)
        fDocHandler->resetDocument();
    if (fEntityHandler)
        fEntityHandler->resetEntities();
    if (fErrorReporter)
        fErrorReporter->resetErrors();

    // Clear out the id reference list
    fIDRefList->removeAll();

    if (toCache) {

        unsigned int sysId = fGrammarResolver->getStringPool()->addOrFind(src.getSystemId());
        const XMLCh* sysIdStr = fGrammarResolver->getStringPool()->getValueForId(sysId);

        fGrammarResolver->orphanGrammar(XMLUni::fgDTDEntityString);
        fGrammarResolver->putGrammar(sysIdStr, fGrammar);
    }

    //
    //  Handle the creation of the XML reader object for this input source.
    //  This will provide us with transcoding and basic lexing services.
    //
    XMLReader* newReader = fReaderMgr.createReader
    (
        src
        , false
        , XMLReader::RefFrom_NonLiteral
        , XMLReader::Type_General
        , XMLReader::Source_External
    );
    if (!newReader) {
        if (src.getIssueFatalErrorIfNotFound())
            ThrowXML1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource, src.getSystemId());
        else
            ThrowXML1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource_Warning, src.getSystemId());
    }

    //
    //  In order to make the processing work consistently, we have to
    //  make this look like an external entity. So create an entity
    //  decl and fill it in and push it with the reader, as happens
    //  with an external entity. Put a janitor on it to insure it gets
    //  cleaned up. The reader manager does not adopt them.
    //
    const XMLCh gDTDStr[] = { chLatin_D, chLatin_T, chLatin_D , chNull };
    DTDEntityDecl* declDTD = new DTDEntityDecl(gDTDStr);
    declDTD->setSystemId(src.getSystemId());
    Janitor<DTDEntityDecl> janDecl(declDTD);

    // Mark this one as a throw at end
    newReader->setThrowAtEnd(true);

    // And push it onto the stack, with its pseudo name
    fReaderMgr.pushReader(newReader, declDTD);

    //
    //  If we have a doc type handler and advanced callbacks are enabled,
    //  call the doctype event.
    //
    if (fDocTypeHandler) {

        // Create a dummy root
        DTDElementDecl* rootDecl = new DTDElementDecl(gDTDStr, fEmptyNamespaceId);
        rootDecl->setCreateReason(DTDElementDecl::AsRootElem);
        rootDecl->setExternalElemDeclaration(true);
        Janitor<DTDElementDecl> janSrc(rootDecl);

        fDocTypeHandler->doctypeDecl(*rootDecl, src.getPublicId(), src.getSystemId(), false);
    }

    // Create DTDScanner
    DTDScanner dtdScanner((DTDGrammar*)fGrammar, fDocTypeHandler);
    dtdScanner.setScannerInfo(this, &fReaderMgr, &fBufMgr);

    // Tell it its not in an include section
    dtdScanner.scanExtSubsetDecl(false);

    if (fValidate) {
        //  validate the DTD scan so far
        fValidator->preContentValidation(false, true);
    }

    if (toCache)
        fGrammarResolver->cacheGrammars();

    return fDTDGrammar;
}

// ---------------------------------------------------------------------------
//  XMLScanner: Rest pool of cached grammars
// ---------------------------------------------------------------------------
void XMLScanner::resetCachedGrammarPool()
{
    fGrammarResolver->resetCachedGrammar();
}