/*
* Copyright 1999-2002,2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// ---------------------------------------------------------------------------
// Includes
// ---------------------------------------------------------------------------
#include <xercesc/internal/ValidationContextImpl.hpp>
#include <xercesc/util/Janitor.hpp>
#include <xercesc/util/Mutexes.hpp>
#include <xercesc/util/RuntimeException.hpp>
#include <xercesc/util/UnexpectedEOFException.hpp>
#include <xercesc/util/XMLMsgLoader.hpp>
#include <xercesc/util/XMLRegisterCleanup.hpp>
#include <xercesc/util/XMLInitializer.hpp>
#include <xercesc/framework/LocalFileInputSource.hpp>
#include <xercesc/framework/URLInputSource.hpp>
#include <xercesc/framework/XMLDocumentHandler.hpp>
#include <xercesc/framework/XMLEntityHandler.hpp>
#include <xercesc/framework/XMLPScanToken.hpp>
#include <xercesc/framework/XMLValidator.hpp>
#include <xercesc/internal/EndOfEntityException.hpp>
#include <xercesc/validators/DTD/DocTypeHandler.hpp>
#include <xercesc/validators/common/GrammarResolver.hpp>
Neil Graham
committed
#include <xercesc/util/OutOfMemoryException.hpp>
#include <xercesc/util/XMLResourceIdentifier.hpp>
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
// ---------------------------------------------------------------------------
// Local static data
// ---------------------------------------------------------------------------
// Last scanner id handed out. commonInit() pre-increments it while holding
// the scanner mutex and stores the result in the new scanner's fScannerId.
static XMLUInt32 gScannerId;
// Set to true once sScannerMutex has been created and its cleanup registered;
// set back to false by XMLScanner::reinitScannerMutex().
static bool sRegistered = false;
// Mutex returned by gScannerMutex(); created on demand or by
// XMLInitializer::initializeScannerMsgLoader().
static XMLMutex* sScannerMutex = 0;
// Cleanup registration object paired with XMLScanner::reinitScannerMutex.
static XMLRegisterCleanup scannerMutexCleanup;
// Message loader for the XMLUni::fgXMLErrDomain message set; 0 until loaded.
static XMLMsgLoader* gMsgLoader = 0;
// Cleanup registration object paired with XMLScanner::reinitMsgLoader.
static XMLRegisterCleanup cleanupMsgLoader;
// ---------------------------------------------------------------------------
// Local, static functions
// ---------------------------------------------------------------------------
// Cleanup for the message loader
void XMLScanner::reinitMsgLoader()
{
delete gMsgLoader;
gMsgLoader = 0;
}
// Cleanup for the scanner mutex
void XMLScanner::reinitScannerMutex()
{
delete sScannerMutex;
sScannerMutex = 0;
sRegistered = false;
}
//
//  We need to fault in this mutex. Since it is itself used for
//  synchronization, its creation is serialized on the platform-wide
//  XMLPlatformUtils::fgAtomicMutex instead of on a mutex owned by this file.
//
//  Returns a reference to the scanner mutex. On first use the mutex is
//  created and XMLScanner::reinitScannerMutex is registered for cleanup.
//
static XMLMutex& gScannerMutex()
{
    if (!sRegistered)
    {
        XMLMutexLock lockInit(XMLPlatformUtils::fgAtomicMutex);

        // Check again now that we hold the lock: another thread may have
        // completed the initialization while we were waiting, and creating
        // a second mutex would leak the first and register cleanup twice.
        if (!sRegistered)
        {
            sScannerMutex = new XMLMutex;
            scannerMutexCleanup.registerCleanup(XMLScanner::reinitScannerMutex);
            sRegistered = true;
        }
    }
    return *sScannerMutex;
}
//  Returns the message loader for the XMLUni::fgXMLErrDomain message set,
//  loading it on first use. If the message set cannot be loaded the platform
//  panic handler is invoked with Panic_CantLoadMsgDomain.
static XMLMsgLoader& gScannerMsgLoader()
{
    if (!gMsgLoader)
    {
        XMLMutexLock lockInit(&gScannerMutex());

        // If we haven't loaded our message yet, then do that. The test is
        // repeated under the lock so that only one thread performs the load.
        if (!gMsgLoader)
        {
            gMsgLoader = XMLPlatformUtils::loadMsgSet(XMLUni::fgXMLErrDomain);
            if (!gMsgLoader)
                XMLPlatformUtils::panic(PanicHandler::Panic_CantLoadMsgDomain);

            // Register this object to be cleaned up at termination
            cleanupMsgLoader.registerCleanup(XMLScanner::reinitMsgLoader);
        }
    }
    return *gMsgLoader;
}
void XMLInitializer::initializeScannerMsgLoader()
{
gMsgLoader = XMLPlatformUtils::loadMsgSet(XMLUni::fgXMLErrDomain);
// Register this object to be cleaned up at termination
if (gMsgLoader) {
cleanupMsgLoader.registerCleanup(XMLScanner::reinitMsgLoader);
}
sScannerMutex = new XMLMutex;
if (sScannerMutex) {
scannerMutexCleanup.registerCleanup(XMLScanner::reinitScannerMutex);
sRegistered = true;
}
}
// ---------------------------------------------------------------------------
// XMLScanner: Constructors and Destructor
// ---------------------------------------------------------------------------
//  Constructs a scanner with no document, doc type, entity or error handlers.
//
//    valToAdopt       validator to use, or 0 for none. When non-null it is
//                     recorded as user supplied and initialized for this
//                     scanner via initValidator().
//    grammarResolver  must be non-null; it is dereferenced here to obtain the
//                     grammar pool memory manager.
//    manager          memory manager handed to the reader manager, buffer
//                     manager, the internal buffers and the element stack.
XMLScanner::XMLScanner(XMLValidator* const valToAdopt,
                       GrammarResolver* const grammarResolver,
                       MemoryManager* const manager)
    : fBufferSize(1024 * 1024)
    , fStandardUriConformant(false)
    , fCalculateSrcOfs(false)
    , fExitOnFirstFatal(true)
    , fValidationConstraintFatal(false)
    , fInException(false)
    , fStandalone(false)
    , fHasNoDTD(true)
    , fValidate(false)
    , fValidatorFromUser(false)
    , fDoSchema(false)
    , fSchemaFullChecking(false)
    , fToCacheGrammar(false)
    , fUseCachedGrammar(false)
    , fLoadExternalDTD(true)
    , fNormalizeData(true)
    , fGenerateSyntheticAnnotations(false)
    , fValidateAnnotations(false)
    , fEntityExpansionLimit(0)
    , fEntityExpansionCount(0)
    , fEmptyNamespaceId(0)
    , fUnknownNamespaceId(0)
    , fXMLNamespaceId(0)
    , fXMLNSNamespaceId(0)
    , fSchemaNamespaceId(0)
    , fUIntPool(0)
    , fUIntPoolRow(0)
    , fUIntPoolCol(0)
    , fUIntPoolRowTotal(2)
    , fScannerId(0)
    , fSequenceId(0)
    , fAttrList(0)
    , fAttrDupChkRegistry(0)
    , fDocHandler(0)
    , fDocTypeHandler(0)
    , fEntityHandler(0)
    , fErrorReporter(0)
    , fErrorHandler(0)
    , fPSVIHandler(0)
    , fValidationContext(0)
    , fEntityDeclPoolRetrieved(false)
    , fReaderMgr(manager)
    // Store the validator handed in; the body below tests and uses fValidator,
    // so it must not be left uninitialized.
    , fValidator(valToAdopt)
    , fGrammarResolver(grammarResolver)
    , fGrammarPoolMemoryManager(grammarResolver->getGrammarPoolMemoryManager())
    , fURIStringPool(0)
    , fRootElemName(0)
    , fExternalSchemaLocation(0)
    , fExternalNoNamespaceSchemaLocation(0)
    , fMemoryManager(manager)
    , fBufMgr(manager)
    , fAttNameBuf(1023, manager)
    , fAttValueBuf(1023, manager)
    , fCDataBuf(1023, manager)
    , fQNameBuf(1023, manager)
    , fPrefixBuf(1023, manager)
    , fURIBuf(1023, manager)
    , fElemStack(manager)
{
    commonInit();

    if (fValidator) {
        fValidatorFromUser = true;
        initValidator(fValidator);
    }
}
//  Constructs a scanner with the given handlers installed.
//
//    docHandler, docTypeHandler, entityHandler, errHandler
//                     stored as given (errHandler becomes the error
//                     reporter); any of them may be 0.
//    valToAdopt       validator to use, or 0 for none. When non-null it is
//                     recorded as user supplied and initialized for this
//                     scanner via initValidator().
//    grammarResolver  must be non-null; it is dereferenced here to obtain the
//                     grammar pool memory manager.
//    manager          memory manager handed to the reader manager, buffer
//                     manager, the internal buffers and the element stack.
XMLScanner::XMLScanner( XMLDocumentHandler* const  docHandler
                      , DocTypeHandler* const      docTypeHandler
                      , XMLEntityHandler* const    entityHandler
                      , XMLErrorReporter* const    errHandler
                      , XMLValidator* const        valToAdopt
                      , GrammarResolver* const     grammarResolver
                      , MemoryManager* const       manager)
    : fBufferSize(1024 * 1024)
    , fStandardUriConformant(false)
    , fCalculateSrcOfs(false)
    , fExitOnFirstFatal(true)
    , fValidationConstraintFatal(false)
    , fInException(false)
    , fStandalone(false)
    , fHasNoDTD(true)
    , fValidate(false)
    , fValidatorFromUser(false)
    , fDoSchema(false)
    , fSchemaFullChecking(false)
    , fToCacheGrammar(false)
    , fUseCachedGrammar(false)
    , fLoadExternalDTD(true)
    , fNormalizeData(true)
    , fGenerateSyntheticAnnotations(false)
    , fValidateAnnotations(false)
    , fEntityExpansionLimit(0)
    , fEntityExpansionCount(0)
    , fEmptyNamespaceId(0)
    , fUnknownNamespaceId(0)
    , fXMLNamespaceId(0)
    , fXMLNSNamespaceId(0)
    , fSchemaNamespaceId(0)
    , fUIntPool(0)
    , fUIntPoolRow(0)
    , fUIntPoolCol(0)
    , fUIntPoolRowTotal(2)
    , fScannerId(0)
    , fSequenceId(0)
    , fAttrList(0)
    , fAttrDupChkRegistry(0)
    , fDocHandler(docHandler)
    , fDocTypeHandler(docTypeHandler)
    , fEntityHandler(entityHandler)
    , fErrorReporter(errHandler)
    , fErrorHandler(0)
    , fPSVIHandler(0)
    , fValidationContext(0)
    , fEntityDeclPoolRetrieved(false)
    , fReaderMgr(manager)
    // Store the validator handed in; the body below passes fValidator to
    // initValidator(), so it must not be left uninitialized.
    , fValidator(valToAdopt)
    , fGrammarResolver(grammarResolver)
    , fGrammarPoolMemoryManager(grammarResolver->getGrammarPoolMemoryManager())
    , fURIStringPool(0)
    , fRootElemName(0)
    , fExternalSchemaLocation(0)
    , fExternalNoNamespaceSchemaLocation(0)
    , fMemoryManager(manager)
    , fBufMgr(manager)
    , fAttNameBuf(1023, manager)
    , fAttValueBuf(1023, manager)
    , fCDataBuf(1023, manager)
    , fQNameBuf(1023, manager)
    , fPrefixBuf(1023, manager)
    , fURIBuf(1023, manager)
    // Same as the other constructor: the element stack uses the caller's
    // memory manager.
    , fElemStack(manager)
{
    commonInit();

    if (fValidator) {
        fValidatorFromUser = true;
        initValidator(fValidator);
    }
}
//  Releases everything the scanner allocated for itself: the attribute list,
//  the duplicate-attribute registry, the validation context created in
//  commonInit(), the copied strings and the unsigned int pool.
XMLScanner::~XMLScanner()
{
    delete fAttrList;
    delete fAttrDupChkRegistry;
    // Allocated in commonInit(); without this it is never released.
    delete fValidationContext;

    fMemoryManager->deallocate(fRootElemName);//delete [] fRootElemName;
    fMemoryManager->deallocate(fExternalSchemaLocation);//delete [] fExternalSchemaLocation;
    fMemoryManager->deallocate(fExternalNoNamespaceSchemaLocation);//delete [] fExternalNoNamespaceSchemaLocation;

    // delete fUIntPool: rows 0..fUIntPoolRow are the ones in use, then the
    // row table itself.
    for (unsigned int i=0; i<=fUIntPoolRow; i++)
    {
        fMemoryManager->deallocate(fUIntPool[i]);
    }
    fMemoryManager->deallocate(fUIntPool);
}
//  Installs a new validator. A previously installed user-supplied validator
//  is deleted first; the new one is marked as user supplied and is
//  initialized for this scanner.
void XMLScanner::setValidator(XMLValidator* const valToAdopt)
{
    if (fValidatorFromUser)
    {
        delete fValidator;
    }

    fValidatorFromUser = true;
    fValidator = valToAdopt;
    initValidator(valToAdopt);
}
// ---------------------------------------------------------------------------
// XMLScanner: Main entry point to scan a document
// ---------------------------------------------------------------------------
void XMLScanner::scanDocument( const XMLCh* const systemId)
{
// First we try to parse it as a URL. If that fails, we assume its
// a file and try it that way.
InputSource* srcToUse = 0;
try
{
// Create a temporary URL. Since this is the primary document,
// it has to be fully qualified. If not, then assume we are just
// mistaking a file for a URL.
XMLURL tmpURL(fMemoryManager);
if (XMLURL::parse(systemId, tmpURL)) {
if (tmpURL.isRelative()) {
if (!fStandardUriConformant)
srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager);
else {
David Abram Cargill
committed
// since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr
// emit the error directly
David Abram Cargill
committed
MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_NoProtocolPresent, fMemoryManager);
fInException = true;
emitError
(
XMLErrs::XMLException_Fatal
, e.getType()
, e.getMessage()
);
return;
}
}
else
{
if (fStandardUriConformant && tmpURL.hasInvalidChar()) {
David Abram Cargill
committed
MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL, fMemoryManager);
fInException = true;
emitError
(
XMLErrs::XMLException_Fatal
, e.getType()
, e.getMessage()
);
return;
}
srcToUse = new (fMemoryManager) URLInputSource(tmpURL, fMemoryManager);
}
}
else {
if (!fStandardUriConformant)
Khaled Noaman
committed
srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager);
else {
David Abram Cargill
committed
// since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr
// emit the error directly
// lazy bypass ... since all MalformedURLException are fatal, no need to check the type
David Abram Cargill
committed
MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL, fMemoryManager);
fInException = true;
emitError
(
XMLErrs::XMLException_Fatal
, e.getType()
, e.getMessage()
);
return;
}
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
catch(const XMLException& excToCatch)
{
// For any other XMLException,
// emit the error and catch any user exception thrown from here.
fInException = true;
if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
emitError
(
XMLErrs::XMLException_Warning
, excToCatch.getType()
, excToCatch.getMessage()
);
else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
emitError
(
XMLErrs::XMLException_Fatal
, excToCatch.getType()
, excToCatch.getMessage()
);
else
emitError
(
XMLErrs::XMLException_Error
, excToCatch.getType()
, excToCatch.getMessage()
);
return;
}
scanDocument(*srcToUse);
void XMLScanner::scanDocument( const char* const systemId)
{
// We just delegate this to the XMLCh version after transcoding
XMLCh* tmpBuf = XMLString::transcode(systemId, fMemoryManager);
ArrayJanitor<XMLCh> janBuf(tmpBuf, fMemoryManager);
scanDocument(tmpBuf);
}
// This method begins a progressive parse. It scans through the prolog and
// returns a token to be used on subsequent scanNext() calls. If the return
// value is true, then the token is legal and ready for further use. If it
// returns false, then the scan of the prolog failed and the token is not
// going to work on subsequent scanNext() calls.
bool XMLScanner::scanFirst( const XMLCh* const systemId
, XMLPScanToken& toFill)
{
// First we try to parse it as a URL. If that fails, we assume its
// a file and try it that way.
InputSource* srcToUse = 0;
try
{
// Create a temporary URL. Since this is the primary document,
// it has to be fully qualified. If not, then assume we are just
// mistaking a file for a URL.
David Abram Cargill
committed
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
XMLURL tmpURL(fMemoryManager);
if (XMLURL::parse(systemId, tmpURL)) {
if (tmpURL.isRelative()) {
if (!fStandardUriConformant)
srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager);
else {
// since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr
// emit the error directly
MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_NoProtocolPresent, fMemoryManager);
fInException = true;
emitError
(
XMLErrs::XMLException_Fatal
, e.getType()
, e.getMessage()
);
return false;
}
}
else
{
if (fStandardUriConformant && tmpURL.hasInvalidChar()) {
MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL, fMemoryManager);
fInException = true;
emitError
(
XMLErrs::XMLException_Fatal
, e.getType()
, e.getMessage()
);
return false;
}
srcToUse = new (fMemoryManager) URLInputSource(tmpURL, fMemoryManager);
}
}
else {
if (!fStandardUriConformant)
David Abram Cargill
committed
srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager);
else {
David Abram Cargill
committed
// since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr
// emit the error directly
David Abram Cargill
committed
// lazy bypass ... since all MalformedURLException are fatal, no need to check the type
MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL);
fInException = true;
emitError
(
XMLErrs::XMLException_Fatal
, e.getType()
, e.getMessage()
);
return false;
}
}
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
catch(const XMLException& excToCatch)
{
// For any other XMLException,
// emit the error and catch any user exception thrown from here.
fInException = true;
if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
emitError
(
XMLErrs::XMLException_Warning
, excToCatch.getType()
, excToCatch.getMessage()
);
else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
emitError
(
XMLErrs::XMLException_Fatal
, excToCatch.getType()
, excToCatch.getMessage()
);
else
emitError
(
XMLErrs::XMLException_Error
, excToCatch.getType()
, excToCatch.getMessage()
);
return false;
}
return scanFirst(*srcToUse, toFill);
}
bool XMLScanner::scanFirst( const char* const systemId
, XMLPScanToken& toFill)
{
// We just delegate this to the XMLCh version after transcoding
XMLCh* tmpBuf = XMLString::transcode(systemId, fMemoryManager);
ArrayJanitor<XMLCh> janBuf(tmpBuf, fMemoryManager);
return scanFirst(tmpBuf, toFill);
}
bool XMLScanner::scanFirst( const InputSource& src
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
{
// Bump up the sequence id for this new scan cycle. This will invalidate
// any previous tokens we've returned.
fSequenceId++;
// Reset the scanner and its plugged in stuff for a new run. This
// resets all the data structures, creates the initial reader and
// pushes it on the stack, and sets up the base document path
scanReset(src);
// If we have a document handler, then call the start document
if (fDocHandler)
fDocHandler->startDocument();
try
{
// Scan the prolog part, which is everything before the root element
// including the DTD subsets. This is all that is done on the scan
// first.
scanProlog();
// If we got to the end of input, then its not a valid XML file.
// Else, go on to scan the content.
if (fReaderMgr.atEOF())
{
emitError(XMLErrs::EmptyMainEntity);
}
}
// NOTE:
//
// In all of the error processing below, the emitError() call MUST come
// before the flush of the reader mgr, or it will fail because it tries
// to find out the position in the XML source of the error.
catch(const XMLErrs::Codes)
{
// This is a 'first failure' exception so reset and return a failure
fReaderMgr.reset();
return false;
}
catch(const XMLValid::Codes)
{
// This is a 'first fatal error' type exit, so reset and reuturn failure
fReaderMgr.reset();
return false;
}
catch(const XMLException& excToCatch)
{
// Emit the error and catch any user exception thrown from here. Make
// sure in all cases we flush the reader manager.
fInException = true;
try
{
if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
emitError
(
XMLErrs::XMLException_Warning
, excToCatch.getType()
, excToCatch.getMessage()
);
else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
emitError
(
XMLErrs::XMLException_Fatal
, excToCatch.getType()
, excToCatch.getMessage()
);
else
emitError
(
XMLErrs::XMLException_Error
, excToCatch.getType()
, excToCatch.getMessage()
);
}
Neil Graham
committed
catch(const OutOfMemoryException&)
{
throw;
}
catch(...)
{
// Reset and rethrow the user error
fReaderMgr.reset();
throw;
}
// Reset and return a failure
fReaderMgr.reset();
return false;
}
Neil Graham
committed
catch(const OutOfMemoryException&)
{
throw;
}
catch(...)
{
// Reset and rethrow original error
fReaderMgr.reset();
throw;
}
// Fill in the caller's token to make it legal and return success
toFill.set(fScannerId, fSequenceId);
return true;
}
//  Abandons the progressive parse identified by token.
//
//  Throws RuntimeException (Scan_BadPScanToken) if the token is not legal
//  for this scanner. Otherwise resets the reader manager, bumps the sequence
//  id so outstanding tokens become invalid, and zeroes the error count.
void XMLScanner::scanReset(XMLPScanToken& token)
{
    // Make sure this token is still legal
    if (!isLegalToken(token))
        ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Scan_BadPScanToken, fMemoryManager);

    // Reset the reader manager
    fReaderMgr.reset();

    // And invalidate any tokens by bumping our sequence number
    fSequenceId++;

    // Reset our error count
    fErrorCount = 0;
}
//  Copies the parse settings of refScanner into this scanner by reading each
//  value through refScanner's getter and applying it with the matching
//  setter here. refScanner is dereferenced unconditionally.
void XMLScanner::setParseSettings(XMLScanner* const refScanner)
{
    // Handlers and reporters
    setDocHandler(refScanner->getDocHandler());
    setDocTypeHandler(refScanner->getDocTypeHandler());
    setErrorHandler(refScanner->getErrorHandler());
    setErrorReporter(refScanner->getErrorReporter());
    setEntityHandler(refScanner->getEntityHandler());

    // Feature flags
    setDoNamespaces(refScanner->getDoNamespaces());
    setDoSchema(refScanner->getDoSchema());
    setCalculateSrcOfs(refScanner->getCalculateSrcOfs());
    setStandardUriConformant(refScanner->getStandardUriConformant());
    setExitOnFirstFatal(refScanner->getExitOnFirstFatal());
    setValidationConstraintFatal(refScanner->getValidationConstraintFatal());
    setIdentityConstraintChecking(refScanner->getIdentityConstraintChecking());
    setValidationSchemaFullChecking(refScanner->getValidationSchemaFullChecking());
    cacheGrammarFromParse(refScanner->isCachingGrammarFromParse());
    useCachedGrammarInParse(refScanner->isUsingCachedGrammarInParse());
    setLoadExternalDTD(refScanner->getLoadExternalDTD());
    setNormalizeData(refScanner->getNormalizeData());

    // Schema locations, validation scheme and remaining collaborators
    setExternalSchemaLocation(refScanner->getExternalSchemaLocation());
    setExternalNoNamespaceSchemaLocation(refScanner->getExternalNoNamespaceSchemaLocation());
    setValidationScheme(refScanner->getValidationScheme());
    setSecurityManager(refScanner->getSecurityManager());
    setPSVIHandler(refScanner->getPSVIHandler());
}
// ---------------------------------------------------------------------------
// XMLScanner: Private helper methods.
// ---------------------------------------------------------------------------
//  This method handles the common initialization, to avoid having to do
//  it redundantly in multiple constructors. It assigns the scanner id and
//  allocates the attribute list, the validation context and the first row
//  of the unsigned int pool, then hooks up the CDATA buffer-full handler.
void XMLScanner::commonInit()
{
    //  We have to do a little init that involves statics, so we have to
    //  use the mutex to protect it.
    {
        XMLMutexLock lockInit(&gScannerMutex());

        // And assign ourselves the next available scanner id
        fScannerId = ++gScannerId;
    }

    //  Create the attribute list, which is used to store attribute values
    //  during start tag processing. Give it a reasonable initial size that
    //  will serve for most folks, though it will grow as required.
    fAttrList = new (fMemoryManager) RefVectorOf<XMLAttr>(32, true, fMemoryManager);

    //  Create the validation context object.
    fValidationContext = new (fMemoryManager) ValidationContextImpl(fMemoryManager);

    //  Create the GrammarResolver
    //fGrammarResolver = new GrammarResolver();

    // create initial, 64-element, fUIntPool
    // (sizeof(unsigned int) << 6 is 64 elements' worth of bytes; the row
    // table has fUIntPoolRowTotal slots and only row 0 is populated here)
    fUIntPool = (unsigned int **)fMemoryManager->allocate(sizeof(unsigned int *) *fUIntPoolRowTotal);
    fUIntPool[0] = (unsigned int *)fMemoryManager->allocate(sizeof(unsigned int) << 6);
    memset(fUIntPool[0], 0, sizeof(unsigned int) << 6);
    fUIntPool[1] = 0;

    // Register self as handler for XMLBufferFull events on the CDATA buffer
    fCDataBuf.setFullHandler(this, fBufferSize);
}
//  Wires a validator up to this scanner: gives it the scanner, reader
//  manager and buffer manager it works with, and the current error reporter.
void XMLScanner::initValidator(XMLValidator* theValidator)
{
    ReaderMgr* const    readerMgr = &fReaderMgr;
    XMLBufferMgr* const bufferMgr = &fBufMgr;

    theValidator->setScannerInfo(this, readerMgr, bufferMgr);
    theValidator->setErrorReporter(fErrorReporter);
}
// ---------------------------------------------------------------------------
// XMLScanner: Error emitting methods
// ---------------------------------------------------------------------------
// These methods are called whenever the scanner wants to emit an error.
// It handles getting the message loaded, doing token replacement, etc...
// and then calling the error handler, if its installed.
//  Tells whether emitError(toEmit) will finish by throwing the code: true
//  only for a fatal code while exit-on-first-fatal is enabled and we are not
//  already reporting an exception.
bool XMLScanner::emitErrorWillThrowException(const XMLErrs::Codes toEmit)
{
    return XMLErrs::isFatal(toEmit) && fExitOnFirstFatal && !fInException;
}
//  Emits an error that has no replacement text.
//
//  Bumps the error count unless the code is a warning. If an error reporter
//  is installed, loads the message text and reports it together with the
//  position of the last external entity. Finally throws toEmit when
//  emitErrorWillThrowException() says this error ends the scan.
void XMLScanner::emitError(const XMLErrs::Codes toEmit)
{
    // Bump the error count if it is not a warning
    if (XMLErrs::errorType(toEmit) != XMLErrorReporter::ErrType_Warning)
        incrementErrorCount();

    if (fErrorReporter)
    {
        // Load the message into a local for display
        const unsigned int msgSize = 1023;
        XMLCh errText[msgSize + 1];

        // Start with an empty string so the reporter never sees an
        // uninitialized buffer when the message cannot be loaded.
        errText[0] = 0;

        if (!gScannerMsgLoader().loadMsg(toEmit, errText, msgSize))
        {
            // <TBD> Probably should load a default msg here
        }

        //  Create a LastExtEntityInfo structure and get the reader manager
        //  to fill it in for us. This will give us the information about
        //  the last reader on the stack that was an external entity of some
        //  sort (i.e. it will ignore internal entities.
        ReaderMgr::LastExtEntityInfo lastInfo;
        fReaderMgr.getLastExtEntityInfo(lastInfo);

        fErrorReporter->error
        (
            toEmit
            , XMLUni::fgXMLErrDomain
            , XMLErrs::errorType(toEmit)
            , errText
            , lastInfo.systemId
            , lastInfo.publicId
            , lastInfo.lineNumber
            , lastInfo.colNumber
        );
    }

    // Bail out if its fatal an we are to give up on the first fatal error
    if (emitErrorWillThrowException(toEmit))
        throw toEmit;
}
//  Emits an error whose message takes up to four XMLCh replacement texts.
//
//  Bumps the error count unless the code is a warning. If an error reporter
//  is installed, loads the message with text1..text4 substituted and reports
//  it together with the position of the last external entity. Finally throws
//  toEmit when emitErrorWillThrowException() says this error ends the scan.
void XMLScanner::emitError( const   XMLErrs::Codes    toEmit
                            , const XMLCh* const        text1
                            , const XMLCh* const        text2
                            , const XMLCh* const        text3
                            , const XMLCh* const        text4)
{
    // Bump the error count if it is not a warning
    if (XMLErrs::errorType(toEmit) != XMLErrorReporter::ErrType_Warning)
        incrementErrorCount();

    if (fErrorReporter)
    {
        //  Load the message into a local and replace any tokens found in
        //  the text.
        const unsigned int maxChars = 2047;
        XMLCh errText[maxChars + 1];

        // Start with an empty string so the reporter never sees an
        // uninitialized buffer when the message cannot be loaded.
        errText[0] = 0;

        if (!gScannerMsgLoader().loadMsg(toEmit, errText, maxChars, text1, text2, text3, text4, fMemoryManager))
        {
            // <TBD> Should probably load a default message here
        }

        //  Create a LastExtEntityInfo structure and get the reader manager
        //  to fill it in for us. This will give us the information about
        //  the last reader on the stack that was an external entity of some
        //  sort (i.e. it will ignore internal entities.
        ReaderMgr::LastExtEntityInfo lastInfo;
        fReaderMgr.getLastExtEntityInfo(lastInfo);

        fErrorReporter->error
        (
            toEmit
            , XMLUni::fgXMLErrDomain
            , XMLErrs::errorType(toEmit)
            , errText
            , lastInfo.systemId
            , lastInfo.publicId
            , lastInfo.lineNumber
            , lastInfo.colNumber
        );
    }

    // Bail out if its fatal an we are to give up on the first fatal error
    if (emitErrorWillThrowException(toEmit))
        throw toEmit;
}
//  Emits an error whose message takes up to four local code page (char)
//  replacement texts.
//
//  Bumps the error count unless the code is a warning. If an error reporter
//  is installed, loads the message with text1..text4 substituted and reports
//  it together with the position of the last external entity. Finally throws
//  toEmit when emitErrorWillThrowException() says this error ends the scan.
void XMLScanner::emitError( const   XMLErrs::Codes    toEmit
                            , const char* const         text1
                            , const char* const         text2
                            , const char* const         text3
                            , const char* const         text4)
{
    // Bump the error count if it is not a warning
    if (XMLErrs::errorType(toEmit) != XMLErrorReporter::ErrType_Warning)
        incrementErrorCount();

    if (fErrorReporter)
    {
        //  Load the message into a local and replace any tokens found in
        //  the text.
        const unsigned int maxChars = 2047;
        XMLCh errText[maxChars + 1];

        // Start with an empty string so the reporter never sees an
        // uninitialized buffer when the message cannot be loaded.
        errText[0] = 0;

        if (!gScannerMsgLoader().loadMsg(toEmit, errText, maxChars, text1, text2, text3, text4, fMemoryManager))
        {
            // <TBD> Should probably load a default message here
        }

        //  Create a LastExtEntityInfo structure and get the reader manager
        //  to fill it in for us. This will give us the information about
        //  the last reader on the stack that was an external entity of some
        //  sort (i.e. it will ignore internal entities.
        ReaderMgr::LastExtEntityInfo lastInfo;
        fReaderMgr.getLastExtEntityInfo(lastInfo);

        fErrorReporter->error
        (
            toEmit
            , XMLUni::fgXMLErrDomain
            , XMLErrs::errorType(toEmit)
            , errText
            , lastInfo.systemId
            , lastInfo.publicId
            , lastInfo.lineNumber
            , lastInfo.colNumber
        );
    }

    // Bail out if its fatal an we are to give up on the first fatal error
    if (emitErrorWillThrowException(toEmit))
        throw toEmit;
}
// ---------------------------------------------------------------------------
// XMLScanner: Getter methods
// ---------------------------------------------------------------------------
// This method allows the caller to query the current location of the scanner.
// It will return the sys/public ids of the current entity, and the line/col
// position within it.
//
// NOTE: This API returns the location with the last external file. So if its
// currently scanning an entity, the position returned will be the end of
// the entity reference in the file that had the reference.
//
XMLScanner::getLastExtLocation( XMLCh* const sysIdToFill
, const unsigned int maxSysIdChars
, XMLCh* const pubIdToFill
, const unsigned int maxPubIdChars
Tinny Ng
committed
, XMLSSize_t& lineToFill
, XMLSSize_t& colToFill) const
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
{
// Create a local info object and get it filled in by the reader manager
ReaderMgr::LastExtEntityInfo lastInfo;
fReaderMgr.getLastExtEntityInfo(lastInfo);
// Fill in the line and column number
lineToFill = lastInfo.lineNumber;
colToFill = lastInfo.colNumber;
// And copy over as much of the ids as will fit
sysIdToFill[0] = 0;
if (lastInfo.systemId)
{
if (XMLString::stringLen(lastInfo.systemId) > maxSysIdChars)
return false;
XMLString::copyString(sysIdToFill, lastInfo.systemId);
}
pubIdToFill[0] = 0;
if (lastInfo.publicId)
{
if (XMLString::stringLen(lastInfo.publicId) > maxPubIdChars)
return false;
XMLString::copyString(pubIdToFill, lastInfo.publicId);
}
return true;
// ---------------------------------------------------------------------------
// XMLScanner: Private scanning methods
// ---------------------------------------------------------------------------
// This method is called after the end of the root element, to handle
// any miscellaneous stuff hanging around.
//
// NOTE(review): this listing of the function appears to have lost lines:
// the opening brace, the enclosing try and loop, several if/else headers
// and most braces are not present. The statements below therefore cannot
// be read as a complete body; restore them from the upstream file before
// changing any logic here.
void XMLScanner::scanMiscellaneous()
// Get a buffer for this work
XMLBufBid bbCData(&fBufMgr);
// Peek (do not consume) the next character to decide what follows
const XMLCh nextCh = fReaderMgr.peekNextChar();
// Watch for end of file and break out
if (!nextCh)
break;
// Can't have an XML decl here
emitError(XMLErrs::NotValidAfterContent);
fReaderMgr.skipPastChar(chCloseAngle);
else if (fReaderMgr.skippedString(XMLUni::fgPIString))
else if (fReaderMgr.skippedString(XMLUni::fgCommentString))
// This can't be possible, so just give up
emitError(XMLErrs::ExpectedCommentOrPI);
fReaderMgr.skipPastChar(chCloseAngle);
// If we have a doc handler, then gather up the spaces and
// call back. Otherwise, just skip over whitespace.
if (fDocHandler)
fReaderMgr.getSpaces(bbCData.getBuffer());
fDocHandler->ignorableWhitespace
bbCData.getRawBuffer()
, bbCData.getLen()
, false
emitError(XMLErrs::ExpectedCommentOrPI);
fReaderMgr.skipPastChar(chCloseAngle);
catch(const EndOfEntityException&)
{
// Some entity leaked out of the content part of the document. Issue
// a warning and keep going.
emitError(XMLErrs::EntityPropogated);
// Scans a PI and calls the appropriate callbacks. At entry we have just
// scanned the <? part, and need to now start on the PI target name.
//
// NOTE(review): this listing of the function appears to have lost lines:
// the opening brace, many braces and else branches, the code that appends
// characters to the target buffer and the document handler call are not
// present, and page residue lines are interleaved. The statements below
// cannot be read as a complete body; restore them from the upstream file
// before changing any logic here.
void XMLScanner::scanPI()
const XMLCh* namePtr = 0;
const XMLCh* targetPtr = 0;
// If there are any spaces here, then warn about it. If we aren't in
// 'first error' mode, then we'll come back and can easily pick up
// again by just skipping them.
if (fReaderMgr.lookingAtSpace())
emitError(XMLErrs::PINameExpected);
fReaderMgr.skipPastSpaces();
// Get a buffer for the PI name and scan it in
XMLBufBid bbName(&fBufMgr);
if (!fReaderMgr.getName(bbName.getBuffer()))
emitError(XMLErrs::PINameExpected);
fReaderMgr.skipPastChar(chCloseAngle);
return;
// Point the name pointer at the raw data
namePtr = bbName.getRawBuffer();
// See if it is some form of 'xml' and emit a warning
// (compareIString returns 0 on a match, hence the negation)
if (!XMLString::compareIString(namePtr, XMLUni::fgXMLString))
emitError(XMLErrs::NoPIStartsWithXML);
// If namespaces are enabled, then no colons allowed
if (fDoNamespaces)
if (XMLString::indexOf(namePtr, chColon) != -1)
emitError(XMLErrs::ColonNotLegalWithNS);
// If we don't hit a space next, then the PI has no target. If we do
// then get out the target. Get a buffer for it as well
XMLBufBid bbTarget(&fBufMgr);
if (fReaderMgr.skippedSpace())
// Skip any leading spaces
fReaderMgr.skipPastSpaces();
bool gotLeadingSurrogate = false;
// It does have a target, so lets move on to deal with that.
while (1)
const XMLCh nextCh = fReaderMgr.getNextChar();
// Watch for an end of file, which is always bad here
if (!nextCh)
{
emitError(XMLErrs::UnterminatedPI);
David Abram Cargill
committed
ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
// Watch for potential terminating character
if (nextCh == chQuestion)
// It must be followed by '>' to be a termination of the target
if (fReaderMgr.skippedChar(chCloseAngle))
break;
// Check for correct surrogate pairs
// (0xD800-0xDBFF is the leading surrogate range)
if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
{
if (gotLeadingSurrogate)
emitError(XMLErrs::Expected2ndSurrogateChar);
else
gotLeadingSurrogate = true;
}
else
{
if (gotLeadingSurrogate)
{
// (0xDC00-0xDFFF is the trailing surrogate range)
if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
emitError(XMLErrs::Expected2ndSurrogateChar);
}
// Its got to at least be a valid XML character
XMLCh tmpBuf[9];
XMLString::binToText
(
nextCh
, tmpBuf
, 8
, 16
David Abram Cargill
committed
, fMemoryManager
);
emitError(XMLErrs::InvalidCharacter, tmpBuf);
// No target, but make sure its terminated ok
if (!fReaderMgr.skippedChar(chQuestion))
emitError(XMLErrs::UnterminatedPI);
fReaderMgr.skipPastChar(chCloseAngle);
return;
if (!fReaderMgr.skippedChar(chCloseAngle))
{
emitError(XMLErrs::UnterminatedPI);
fReaderMgr.skipPastChar(chCloseAngle);
return;
// Point the target pointer at the raw data
targetPtr = bbTarget.getRawBuffer();
// If we have a handler, then call it
if (fDocHandler)
{
//mark PI is seen within the current element
if (! fElemStack.isEmpty())
fElemStack.setCommentOrPISeen();
// Scans all the input from the start of the file to the root element.
// There does not have to be anything in the prolog necessarily, but usually
// there is at least an XMLDecl.
//
// On exit from here we are either at the end of the file or about to read
// the opening < of the root element.
//
// NOTE(review): this listing of the function looks incomplete -- several
// braces and at least one branch condition (the one guarding the whitespace
// handling further down) are not visible, and a run of bare numbers sits
// inside the body. Confirm against the canonical file before relying on the
// comments below.
void XMLScanner::scanProlog()
{
// Get a buffer for whitespace processing
XMLBufBid bbCData(&fBufMgr);
// Loop through the prolog. If there is no content, this could go all
// the way to the end of the file.
try
{
while (true)
// Only peek here; each branch below consumes what it recognizes.
const XMLCh nextCh = fReaderMgr.peekNextChar();
if (nextCh == chOpenAngle)
{
// Ok, it could be the xml decl, a comment, the doc type line,
// or the start of the root element.
if (checkXMLDecl(true))
{
// There shall be at least --ONE-- space in between
// the tag '<?xml' and the VersionInfo.
//
// If we are not at line 1, col 6, then the decl was not
// the first text, so its invalid.
// NOTE(review): the text above says col 6 while the test below uses
// column 7 -- presumably because checkXMLDecl() has already consumed
// the whitespace that follows '<?xml'; confirm.
const XMLReader* curReader = fReaderMgr.getCurrentReader();
if ((curReader->getLineNumber() != 1)
|| (curReader->getColumnNumber() != 7))
{
emitError(XMLErrs::XMLDeclMustBeFirst);
scanXMLDecl(Decl_XML);
}
else if (fReaderMgr.skippedString(XMLUni::fgPIString))
{
scanPI();
}
else if (fReaderMgr.skippedString(XMLUni::fgCommentString))
{
scanComment();
}
else if (fReaderMgr.skippedString(XMLUni::fgDocTypeString))
{
scanDocTypeDecl();
// if reusing grammar, this has been validated already in first scan
// skip for performance
if (fValidate && !fGrammar->getValidated()) {
// validate the DTD scan so far
fValidator->preContentValidation(fUseCachedGrammar, true);
else
{
// Assume its the start of the root element
return;
{
// If we have a document handler then gather up the
// whitespace and call back. Otherwise just skip over spaces.
if (fDocHandler)
{
fReaderMgr.getSpaces(bbCData.getBuffer());
fDocHandler->ignorableWhitespace
(
bbCData.getRawBuffer()
, bbCData.getLen()
, false
);
}
else
{
fReaderMgr.skipPastSpaces();
else
{
// Anything else is not allowed in the prolog: report it, then either
// stop at end of input or resynchronize after the next '>'.
emitError(XMLErrs::InvalidDocumentStructure);
// Watch for end of file and break out
if (!nextCh)
break;
else
fReaderMgr.skipPastChar(chCloseAngle);
}
catch(const EndOfEntityException&)
{
// We should never get an end of entity here. They should only
// occur within the doc type scanning method, and not leak out to
// here.
emitError
(
XMLErrs::UnexpectedEOE
, "in prolog"
);
}
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
}
// Scans the <?xml .... ?> line. This stuff is all sequential so we don't
// do any state machine loop here. We just bull straight through it. It ends
// past the closing bracket. If there is a document handler, then its called
// on the XMLDecl callback.
//
// On entry, the <?xml has been scanned, and we pick it up from there.
//
// NOTE: In order to provide good recovery from bad XML here, we try to be
// very flexible. No matter what order the stuff is in, we'll keep going
// though we'll issue errors.
//
// The parameter tells us which type of decl we should expect, Text or XML.
// [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
// [77] TextDecl::= '<?xml' VersionInfo? EncodingDecl S? '?>'
//
// NOTE(review): this listing of the function looks incomplete -- several
// statements (for example the bodies of the first two name comparisons, the
// test in front of the ExpectedEqSign error and parts of the version and
// standalone handling) are not visible, and runs of bare numbers sit inside
// the body. Confirm against the canonical file.
void XMLScanner::scanXMLDecl(const DeclTypes type)
{
// Get us some buffers to use
XMLBufBid bbVersion(&fBufMgr);
XMLBufBid bbEncoding(&fBufMgr);
XMLBufBid bbStand(&fBufMgr);
XMLBufBid bbDummy(&fBufMgr);
XMLBufBid bbName(&fBufMgr);
// We use this little enum and array to keep up with what we found
// and what order we found them in. This lets us get them free form
// without too much overhead, but still know that they were in the
// wrong order.
enum Strings
{
VersionString
, EncodingString
, StandaloneString
, UnknownString
, StringCount
};
// flags[i] is -1 until string i has been seen; after that it holds the
// 1-based position at which it was first seen (assigned from ++curCount).
int flags[StringCount] = { -1, -1, -1, -1 };
// Also set up a list of buffers in the right order so that we know
// where to put stuff.
XMLBuffer* buffers[StringCount] ;
buffers[0] = &bbVersion.getBuffer();
buffers[1] = &bbEncoding.getBuffer();
buffers[2] = &bbStand.getBuffer();
buffers[3] = &bbDummy.getBuffer();
int curCount = 0;
Strings curString;
XMLBuffer& nameBuf = bbName.getBuffer();
while (true)
{
// Skip any spaces
const unsigned int spaceCount = fReaderMgr.skipPastSpaces(true);
// If we are looking at a question mark, then break out
if (fReaderMgr.lookingAtChar(chQuestion))
break;
// If this is not the first string, then we require the spaces
if (!spaceCount && curCount)
emitError(XMLErrs::ExpectedWhitespace);
// Get characters up to the next whitespace or equal's sign.
if (!scanUpToWSOr(nameBuf, chEqual))
emitError(XMLErrs::ExpectedDeclString);
// See if it matches any of our expected strings
if (XMLString::equals(nameBuf.getRawBuffer(), XMLUni::fgVersionString))
else if (XMLString::equals(nameBuf.getRawBuffer(), XMLUni::fgEncodingString))
else if (XMLString::equals(nameBuf.getRawBuffer(), XMLUni::fgStandaloneString))
curString = StandaloneString;
else
curString = UnknownString;
// If its an unknown string, then give that error. Else check to
// see if this one has been done already and give that error.
if (curString == UnknownString)
emitError(XMLErrs::ExpectedDeclString, nameBuf.getRawBuffer());
else if (flags[curString] != -1)
emitError(XMLErrs::DeclStringRep, nameBuf.getRawBuffer());
else if (flags[curString] == -1)
flags[curString] = ++curCount;
// Scan for an equal's sign. If we don't find it, issue an error
// but keep trying to go on.
emitError(XMLErrs::ExpectedEqSign);
// Get a quote string into the buffer for the string that we are
// currently working on.
if (!getQuotedString(*buffers[curString]))
{
emitError(XMLErrs::ExpectedQuotedString);
fReaderMgr.skipPastChar(chCloseAngle);
return;
}
// And validate the value according which one it was
const XMLCh* rawValue = buffers[curString]->getRawBuffer();
if (curString == VersionString)
{
if (XMLString::equals(rawValue, XMLUni::fgVersion1_1)) {
if (type == Decl_XML) {
else {
if (fXMLVersion != XMLReader::XMLV1_1)
emitError(XMLErrs::UnsupportedXMLVersion, rawValue);
}
else if (XMLString::equals(rawValue, XMLUni::fgVersion1_0)) {
if (type == Decl_XML) {
fXMLVersion = XMLReader::XMLV1_0;
fReaderMgr.setXMLVersion(XMLReader::XMLV1_0);
emitError(XMLErrs::UnsupportedXMLVersion, rawValue);
}
else if (curString == EncodingString)
{
if (!XMLString::isValidEncName(rawValue))
emitError(XMLErrs::BadXMLEncoding, rawValue);
}
else if (curString == StandaloneString)
{
if (XMLString::equals(rawValue, XMLUni::fgYesString))
else if (XMLString::equals(rawValue, XMLUni::fgNoString))
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
fStandalone = false;
else
{
// Neither exact "yes" nor exact "no": report it, but still accept a
// case-insensitive match so scanning can carry on with a usable value.
emitError(XMLErrs::BadStandalone);
if (!XMLString::compareIString(rawValue, XMLUni::fgYesString))
fStandalone = true;
else if (!XMLString::compareIString(rawValue, XMLUni::fgNoString))
fStandalone = false;
}
}
}
// Make sure that the strings present are in order. We don't care about
// which ones are present at this point, just that any there are in the
// right order.
// NOTE(review): the loop bound is `index < StandaloneString`, so only the
// version and encoding entries are examined here; the recorded position of
// the standalone string is not checked by this loop -- confirm intended.
int curTop = 0;
for (int index = VersionString; index < StandaloneString; index++)
{
if (flags[index] != -1)
{
if (flags[index] != curTop + 1)
{
emitError(XMLErrs::DeclStringsInWrongOrder);
break;
}
curTop = flags[index];
}
}
// If its an XML decl, the version must be present.
// If its a Text decl, then encoding must be present AND standalone must not be present.
if ((type == Decl_XML) && (flags[VersionString] == -1))
emitError(XMLErrs::XMLVersionRequired);
else if (type == Decl_Text) {
if (flags[StandaloneString] != -1)
emitError(XMLErrs::StandaloneNotLegal);
if (flags[EncodingString] == -1)
emitError(XMLErrs::EncodingRequired);
}
// Expect the closing "?>". Whichever half is missing, report it and
// resynchronize just past the next '>'.
if (!fReaderMgr.skippedChar(chQuestion))
{
emitError(XMLErrs::UnterminatedXMLDecl);
fReaderMgr.skipPastChar(chCloseAngle);
}
else if (!fReaderMgr.skippedChar(chCloseAngle))
{
emitError(XMLErrs::UnterminatedXMLDecl);
fReaderMgr.skipPastChar(chCloseAngle);
}
// Do this before we possibly update the reader with the
// actual encoding string. Otherwise, we will pass the wrong thing
// for the last parameter!
const XMLCh* actualEnc = fReaderMgr.getCurrentEncodingStr();
// Ok, we've now seen the real encoding string, if there was one, so
// lets call back on the current reader and tell it what the real
// encoding string was. If it fails, that's because it represents some
// sort of contradiction with the autosensed format, and it keeps the
// original encoding.
//
// NOTE: This can fail for a number of reasons, such as a bogus encoding
// name or because its in flagrant contradiction of the auto-sensed
// format.
if (flags[EncodingString] != -1)
{
if (!fReaderMgr.getCurrentReader()->setEncoding(bbEncoding.getRawBuffer()))
emitError(XMLErrs::ContradictoryEncoding, bbEncoding.getRawBuffer());
else
actualEnc = bbEncoding.getRawBuffer();
}
// If we have a document handler then call the XML Decl callback.
// An XML decl is reported through fDocHandler->XMLDecl, a Text decl through
// fDocTypeHandler->TextDecl (version and encoding only).
Tinny Ng
committed
if (type == Decl_XML)
Tinny Ng
committed
if (fDocHandler)
fDocHandler->XMLDecl
(
bbVersion.getRawBuffer()
, bbEncoding.getRawBuffer()
, bbStand.getRawBuffer()
, actualEnc
);
}
else if (type == Decl_Text)
{
if (fDocTypeHandler)
fDocTypeHandler->TextDecl
(
bbVersion.getRawBuffer()
, bbEncoding.getRawBuffer()
);
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
}
}
// Returns the text stored in the URI string pool for uriId.
//
// When the pool does not know the id, or the pool holds a null value for
// it, XMLUni::fgZeroLenString is returned instead.
const XMLCh* XMLScanner::getURIText(const unsigned int uriId) const
{
    // Ids the pool does not contain have no text to hand back.
    if (!fURIStringPool->exists(uriId))
        return XMLUni::fgZeroLenString;

    // Substitute the zero-length string for a null pool entry.
    const XMLCh* const uriText = fURIStringPool->getValueForId(uriId);
    return uriText ? uriText : XMLUni::fgZeroLenString;
}
// Overload of getURIText that takes an output buffer (uriBufToFill) and
// returns a bool instead of returning the text directly.
//
// NOTE(review): only the start of this body is visible in this listing: the
// pool lookup and the null test on its result. What is written to
// uriBufToFill and which value is returned on each path cannot be seen from
// here -- confirm against the canonical file.
bool XMLScanner::getURIText( const unsigned int uriId
, XMLBuffer& uriBufToFill) const
{
if (fURIStringPool->exists(uriId)) {
// Fetch the text the string pool holds for this id
const XMLCh* value = fURIStringPool->getValueForId(uriId);
if (!value)
}
// Tests whether the reader is positioned on the introducer of an XML or
// Text declaration and, if it is, skips the introducer together with the
// one whitespace character that follows it.
//
//  [23] XMLDecl     ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
//  [24] VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
//
//  [3]  S           ::= (#x20 | #x9 | #xD | #xA)+
//
// startWithAngle selects which family of XMLUni strings is matched: the
// fgXMLDeclString* family when true, the fgXMLString* family when false
// (presumably the forms with and without the leading "<?" -- confirm
// against XMLUni).
//
// Returns true when an introducer was skipped. An upper-case spelling is
// also accepted, after emitting XMLDeclMustBeLowerCase. Returns false when
// nothing matched; no skippedString() call succeeded in that case.
bool XMLScanner::checkXMLDecl(bool startWithAngle) {
    if (startWithAngle) {
        // Cheap look-ahead first; none of the skips is attempted without it.
        if (!fReaderMgr.peekString(XMLUni::fgXMLDeclString))
            return false;

        const bool skippedLowerCase =
               fReaderMgr.skippedString(XMLUni::fgXMLDeclStringSpace)
            || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringHTab)
            || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringLF)
            || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringCR);
        if (skippedLowerCase)
            return true;

        const bool skippedUpperCase =
               fReaderMgr.skippedString(XMLUni::fgXMLDeclStringSpaceU)
            || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringHTabU)
            || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringLFU)
            || fReaderMgr.skippedString(XMLUni::fgXMLDeclStringCRU);
        if (!skippedUpperCase)
            return false;

        // Upper case is wrong, but report it and carry on as if it were a
        // declaration so that scanning can recover.
        emitError(XMLErrs::XMLDeclMustBeLowerCase);
        return true;
    }

    // Same procedure for the variant selected by startWithAngle == false.
    if (!fReaderMgr.peekString(XMLUni::fgXMLString))
        return false;

    const bool skippedLowerCase =
           fReaderMgr.skippedString(XMLUni::fgXMLStringSpace)
        || fReaderMgr.skippedString(XMLUni::fgXMLStringHTab)
        || fReaderMgr.skippedString(XMLUni::fgXMLStringLF)
        || fReaderMgr.skippedString(XMLUni::fgXMLStringCR);
    if (skippedLowerCase)
        return true;

    const bool skippedUpperCase =
           fReaderMgr.skippedString(XMLUni::fgXMLStringSpaceU)
        || fReaderMgr.skippedString(XMLUni::fgXMLStringHTabU)
        || fReaderMgr.skippedString(XMLUni::fgXMLStringLFU)
        || fReaderMgr.skippedString(XMLUni::fgXMLStringCRU);
    if (!skippedUpperCase)
        return false;

    // Upper case is wrong, but report it and carry on.
    emitError(XMLErrs::XMLDeclMustBeLowerCase);
    return true;
}
// ---------------------------------------------------------------------------
// XMLScanner: Grammar preparsing
// ---------------------------------------------------------------------------
// Loads the grammar identified by systemId by setting up an input source
// for it and delegating to the InputSource overload of loadGrammar.
//
// The input source is chosen as follows:
//   1. If an entity handler is installed, it is asked to resolve systemId
//      first; a non-null result is used as is.
//   2. Otherwise systemId is parsed as a URL. A non-relative URL is read
//      through a URLInputSource. A relative URL, or text that does not
//      parse as a URL at all, is read through a LocalFileInputSource --
//      unless fStandardUriConformant is set, in which case a
//      MalformedURLException is reported as a fatal error instead.
//
// Parameters:
//   systemId    - system id of the grammar to load
//   grammarType - passed through unchanged to the InputSource overload
//   toCache     - passed through unchanged to the InputSource overload
//
// Returns the result of the InputSource overload, or 0 when no input source
// could be set up; in that case fInException has been set and an error has
// been emitted.
Grammar* XMLScanner::loadGrammar(const XMLCh* const systemId
                                 , const short grammarType
                                 , const bool toCache)
{
    InputSource* srcToUse = 0;

    // Give an installed entity handler the first chance to supply the source.
    if (fEntityHandler){
        ReaderMgr::LastExtEntityInfo lastInfo;
        fReaderMgr.getLastExtEntityInfo(lastInfo);
        XMLResourceIdentifier resourceIdentifier(XMLResourceIdentifier::ExternalEntity,
                            systemId, 0, XMLUni::fgZeroLenString, lastInfo.systemId);
        srcToUse = fEntityHandler->resolveEntity(&resourceIdentifier);
    }

    // First we try to parse it as a URL. If that fails, we assume its
    // a file and try it that way.
    if (!srcToUse) {
        try
        {
            // Create a temporary URL. Since this is the primary document,
            // it has to be fully qualified. If not, then assume we are just
            // mistaking a file for a URL.
            XMLURL tmpURL(fMemoryManager);
            if (XMLURL::parse(systemId, tmpURL)) {
                if (tmpURL.isRelative())
                {
                    if (!fStandardUriConformant)
                        srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager);
                    else {
                        // since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr
                        // emit the error directly
                        MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_NoProtocolPresent, fMemoryManager);
                        fInException = true;
                        emitError
                        (
                            XMLErrs::XMLException_Fatal
                            , e.getType()
                            , e.getMessage()
                        );
                        return 0;
                    }
                }
                else
                {
                    if (fStandardUriConformant && tmpURL.hasInvalidChar()) {
                        MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL, fMemoryManager);
                        fInException = true;
                        emitError
                        (
                            XMLErrs::XMLException_Fatal
                            , e.getType()
                            , e.getMessage()
                        );
                        return 0;
                    }
                    srcToUse = new (fMemoryManager) URLInputSource(tmpURL, fMemoryManager);
                }
            }
            else
            {
                if (!fStandardUriConformant)
                    srcToUse = new (fMemoryManager) LocalFileInputSource(systemId, fMemoryManager);
                else {
                    // since this is the top of the try/catch, cannot call ThrowXMLwithMemMgr
                    // emit the error directly
                    // lazy bypass ... since all MalformedURLException are fatal, no need to check the type
                    //
                    // The exception is built with fMemoryManager, exactly as
                    // at the two sites above, so that all three error paths
                    // use the scanner's memory manager.
                    MalformedURLException e(__FILE__, __LINE__, XMLExcepts::URL_MalformedURL, fMemoryManager);
                    fInException = true;
                    emitError
                    (
                        XMLErrs::XMLException_Fatal
                        , e.getType()
                        , e.getMessage()
                    );
                    return 0;
                }
            }
        }
        catch(const XMLException& excToCatch)
        {
            // For any other XMLException,
            // emit the error and catch any user exception thrown from here.
            // The exception's own error type selects which of the three
            // XMLException_* error codes is reported.
            fInException = true;
            if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
                emitError
                (
                    XMLErrs::XMLException_Warning
                    , excToCatch.getType()
                    , excToCatch.getMessage()
                );
            else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
                emitError
                (
                    XMLErrs::XMLException_Fatal
                    , excToCatch.getType()
                    , excToCatch.getMessage()
                );
            else
                emitError
                (
                    XMLErrs::XMLException_Error
                    , excToCatch.getType()
                    , excToCatch.getMessage()
                );
            return 0;
        }
    }

    // Hand the source to a Janitor for the duration of the delegated call.
    Janitor<InputSource> janSrc(srcToUse);
    return loadGrammar(*srcToUse, grammarType, toCache);
}
// Convenience overload taking the system id as a `const char*` string.
// The id is transcoded to XMLCh and the call is forwarded to the XMLCh
// overload with grammarType and toCache unchanged.
Grammar* XMLScanner::loadGrammar(const char* const systemId
                                 , const short grammarType
                                 , const bool toCache)
{
    // The transcoded copy is allocated through fMemoryManager and handed to
    // an ArrayJanitor created with that same manager.
    XMLCh* wideSystemId = XMLString::transcode(systemId, fMemoryManager);
    ArrayJanitor<XMLCh> wideSystemIdJanitor(wideSystemId, fMemoryManager);

    return loadGrammar(wideSystemId, grammarType, toCache);
}
// ---------------------------------------------------------------------------
// XMLScanner: Setter methods
// ---------------------------------------------------------------------------
// Installs stringPool as this scanner's URI string pool and caches the pool
// ids of four well-known URIs: the zero-length string, the "unknown" URI
// name, the XML URI name and the XMLNS URI name. addOrFind() is used for
// each of them.
//
// stringPool must not be null: it is dereferenced immediately.
void XMLScanner::setURIStringPool(XMLStringPool* const stringPool)
{
    fURIStringPool = stringPool;

    fEmptyNamespaceId   = fURIStringPool->addOrFind(XMLUni::fgZeroLenString);
    fUnknownNamespaceId = fURIStringPool->addOrFind(XMLUni::fgUnknownURIName);
    fXMLNamespaceId     = fURIStringPool->addOrFind(XMLUni::fgXMLURIName);
    fXMLNSNamespaceId   = fURIStringPool->addOrFind(XMLUni::fgXMLNSURIName);
}
// ---------------------------------------------------------------------------
// XMLScanner: Private helper methods
// ---------------------------------------------------------------------------
/***
* In reusing grammars (cacheing grammar from parse, or use cached grammar), internal
* dtd is allowed conditionally.
*
* In the case of cacheing grammar from parse, it is NOT allowed.
*
* In the case of use cached grammar,
* if external dtd is present and it is parsed before, then it is not allowed,
* otherwise it is allowed.
*
***/
David Abram Cargill
committed
void XMLScanner::checkInternalDTD(bool hasExtSubset
,const XMLCh* const sysId
,const XMLCh* const pubId)
{
if (fToCacheGrammar)
ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Val_CantHaveIntSS, fMemoryManager);
if (fUseCachedGrammar && hasExtSubset )
{
David Abram Cargill
committed
InputSource* sysIdSrc = resolveSystemId(sysId, pubId);
Janitor<InputSource> janSysIdSrc(sysIdSrc);
Grammar* grammar = fGrammarResolver->getGrammar(sysIdSrc->getSystemId());
if (grammar && grammar->getGrammarType() == Grammar::DTDGrammarType)
{
ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Val_CantHaveIntSS, fMemoryManager);
}
}
}
// This method is called after the content scan to ensure that all the
// ID/IDREF attributes match up (i.e. that all IDREFs refer to IDs.) This is
// an XML 1.0 rule, so we can do it here in the core.
//
// NOTE(review): the loop header that should drive the enumerator is not
// visible in this listing (there is an unmatched closing brace at the end of
// the body) -- confirm against the canonical file.
void XMLScanner::checkIDRefs()
{
// Iterate the id ref list. If we find any entries here which are used
// but not declared, then that's an error.
David Abram Cargill
committed
RefHashTableOfEnumerator<XMLRefInfo> refEnum(fValidationContext->getIdRefList(), false, fMemoryManager);
// Get a ref to the current element
const XMLRefInfo& curRef = refEnum.nextElement();
// If it is used but not declared, then it is an error. The error is only
// emitted when validation is on (fValidate), and goes through fValidator.
if (!curRef.getDeclared() && curRef.getUsed() && fValidate)
fValidator->emitError(XMLValid::IDNotDeclared, curRef.getRefName());
}
}
// Tells whether a progressive scan token is legal for this scanner: both
// the scanner id and the sequence id stored in the token must equal this
// scanner's current fScannerId and fSequenceId.
bool XMLScanner::isLegalToken(const XMLPScanToken& toCheck)
{
    // Wrong scanner id: no need to look at the sequence id.
    if (fScannerId != toCheck.fScannerId)
        return false;

    return fSequenceId == toCheck.fSequenceId;
}
// This method will handle figuring out what the next top level token is
// in the input stream. It will return an enumerated value that indicates
// what it believes the next XML level token must be. It will eat as many
// chars are required to figure out what is next.
//
// orgReader is an output: once a '<' has been consumed it is set to the
// current reader number. It is left untouched when Token_EOF or
// Token_CharData is returned.
//
// NOTE(review): this listing of the function looks incomplete -- several
// braces (including the opener of the gCDATAStr initializer and the final
// closing brace) are not visible, and gCommentString is not declared in the
// visible part. Confirm against the canonical file.
XMLScanner::XMLTokens XMLScanner::senseNextToken(unsigned int& orgReader)
{
// Get the next character and use it to guesstimate what the next token
// is going to be. We turn on end of entity exceptions when we do this
// in order to catch the scenario where the current entity ended at
// the > of some markup.
XMLCh nextCh;
ThrowEOEJanitor janMgr(&fReaderMgr, true);
nextCh = fReaderMgr.peekNextChar();
// Check for special chars. Start with the most
// obvious end of file, which should be legal here at top level.
if (!nextCh)
return Token_EOF;
// If it's not a '<' we must be in content.
//
// This includes entity references '&' of some sort. These must
// be character data because that's the only place a reference can
// occur in content.
if (nextCh != chOpenAngle)
return Token_CharData;
// Ok it had to have been a '<' character. So get it out of the reader
// and store the reader number where we saw it, passing it back to the
// caller.
fReaderMgr.getNextChar();
orgReader = fReaderMgr.getCurrentReaderNum();
// Ok, so lets go through the things that it could be at this point which
// are all some form of markup.
nextCh = fReaderMgr.peekNextChar();
if (nextCh == chForwardSlash)
// "</": consume the slash and report an end tag.
fReaderMgr.getNextChar();
return Token_EndTag;
}
else if (nextCh == chBang)
{
// "<!": either a CDATA section or a comment. The '!' itself has only been
// peeked, so the strings matched below start with it.
static const XMLCh gCDATAStr[] =
chBang, chOpenSquare, chLatin_C, chLatin_D, chLatin_A
, chLatin_T, chLatin_A, chNull
};
if (fReaderMgr.skippedString(gCDATAStr))
return Token_CData;
if (fReaderMgr.skippedString(gCommentString))
return Token_Comment;
emitError(XMLErrs::ExpectedCommentOrCDATA);
return Token_Unknown;
}
else if (nextCh == chQuestion)
// It must be a PI
fReaderMgr.getNextChar();
return Token_PI;
// Assume its an element name, so return with a start tag token. If it
// turns out not to be, then it will fail when it cannot get a valid tag.
return Token_StartTag;
// ---------------------------------------------------------------------------
// XMLScanner: Private parsing methods
// ---------------------------------------------------------------------------
// Scans a single or double quoted string of characters into toFill.
// It does not pass any judgement on the contents and assumes that it is
// illegal to have another quote of the same kind inside the string's
// contents: the first repeat of the opening quote ends the string.
//
// NOTE: This is for simple stuff like the strings in the XMLDecl which
// cannot have any entities inside them. So this method does not handle any
// end of entity stuff.
//
// toFill is reset first and receives the characters between the quotes;
// the quotes themselves are not stored.
//
// Returns true when a complete quoted string was read. Returns false when
// the next character is not a quote, or when a null character (end of
// input) is read before the closing quote; in the latter case toFill holds
// the characters read so far.
bool XMLScanner::getQuotedString(XMLBuffer& toFill)
{
    // Reset the target buffer
    toFill.reset();

    // Get the next char which must be a single or double quote
    XMLCh quoteCh;
    if (!fReaderMgr.skipIfQuote(quoteCh))
        return false;

    while (true)
    {
        // Get another char
        const XMLCh nextCh = fReaderMgr.getNextChar();

        // See if it matches the starting quote char
        if (nextCh == quoteCh)
            break;

        // We should never get an end of file null char here. If we do,
        // just fail. It will be handled more gracefully in the higher
        // level code that called us.
        if (!nextCh)
            return false;

        // Else add it to the buffer
        toFill.append(nextCh);
    }
    return true;
}
// This method scans a character reference and returns the character that
// was referred to. It assumes that we've already scanned the &# characters
// that prefix the numeric code.
//
// Outputs: for a value in 0x10000..0x10FFFF, toFill and second receive the
// leading and trailing surrogate of the pair; for a value up to 0xFFFD,
// toFill receives the character and second is set to 0.
//
// Returns true on success. Returns false after emitting an error when the
// reference is malformed or the value is not acceptable.
//
// NOTE(review): this listing of the function looks incomplete -- some braces
// and the `else` that should introduce the "not a digit" handling are not
// visible. Confirm against the canonical file.
bool XMLScanner::scanCharRef(XMLCh& toFill, XMLCh& second)
{
bool gotOne = false;
unsigned int value = 0;
// Set the radix. Its supposed to be a lower case x if hex. But, in
// order to recover well, we check for an upper and put out an error
// for that.
unsigned int radix = 10;
if (fReaderMgr.skippedChar(chLatin_x))
{
radix = 16;
}
else if (fReaderMgr.skippedChar(chLatin_X))
{
emitError(XMLErrs::HexRadixMustBeLowerCase);
radix = 16;
while (true)
{
const XMLCh nextCh = fReaderMgr.peekNextChar();
// Watch for EOF
if (!nextCh)
David Abram Cargill
committed
ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
// Break out on the terminating semicolon
if (nextCh == chSemiColon)
{
fReaderMgr.getNextChar();
break;
}
// Convert this char to a binary value, or bail out if its not
// one.
unsigned int nextVal;
if ((nextCh >= chDigit_0) && (nextCh <= chDigit_9))
nextVal = (unsigned int)(nextCh - chDigit_0);
else if ((nextCh >= chLatin_A) && (nextCh <= chLatin_F))
nextVal= (unsigned int)(10 + (nextCh - chLatin_A));
else if ((nextCh >= chLatin_a) && (nextCh <= chLatin_f))
nextVal = (unsigned int)(10 + (nextCh - chLatin_a));
// If we got at least a digit, then do an unterminated ref error.
// Else, do an expected a numerical ref thing.
if (gotOne)
emitError(XMLErrs::UnterminatedCharRef);
else
emitError(XMLErrs::ExpectedNumericalCharRef);
// Return failure
return false;
}
// Make sure its valid for the radix. If not, then just eat the
// digit and go on after issuing an error. Else, update the
// running value with this new digit.
if (nextVal >= radix)
{
XMLCh tmpStr[2];
tmpStr[0] = nextCh;
tmpStr[1] = chNull;
emitError(XMLErrs::BadDigitForRadix, tmpStr);
}
else
{
value = (value * radix) + nextVal;
// Guard against overflow. value is rejected as soon as it exceeds
// 0x10FFFF, so it never exceeds that limit when the next digit is added.
if (value > 0x10FFFF) {
// Character reference was not in the valid range
emitError(XMLErrs::InvalidCharacterRef);
return false;
}
// Indicate that we got at least one good digit
gotOne = true;
// And eat the last char
fReaderMgr.getNextChar();
}
// Return the char (or chars)
// And check if the character expanded is valid or not
if (value >= 0x10000 && value <= 0x10FFFF)
{
// Split into a surrogate pair: subtract 0x10000, then the high 10 bits go
// on top of 0xD800 and the low 10 bits on top of 0xDC00.
value -= 0x10000;
toFill = XMLCh((value >> 10) + 0xD800);
second = XMLCh((value & 0x3FF) + 0xDC00);
}
else if (value <= 0xFFFD)
{
toFill = XMLCh(value);
second = 0;
if (!fReaderMgr.getCurrentReader()->isXMLChar(toFill) && !fReaderMgr.getCurrentReader()->isControlChar(toFill)) {
// Character reference was not in the valid range
emitError(XMLErrs::InvalidCharacterRef);
return false;
}
}
else {
// Character reference was not in the valid range
emitError(XMLErrs::InvalidCharacterRef);
return false;
return true;
}
// We get here after the '<!--' part of the comment. We scan past the
// terminating '-->'. It calls the appropriate handler with the comment
// text, if one is provided. A comment can be in either the document or
// the DTD, so the fInDocument flag is used to know which handler to send
// it to.
//
// NOTE(review): this listing of the function looks incomplete -- the test
// that should guard the InvalidCharacter report, some braces and the closing
// braces of the function are not visible, and runs of bare numbers sit
// inside the body. The visible code only calls fDocHandler->docComment; no
// use of fInDocument can be seen here. Confirm against the canonical file.
void XMLScanner::scanComment()
{
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
// States of the small machine that recognizes the closing "-->".
enum States
{
InText
, OneDash
, TwoDashes
};
// Get a buffer for this
XMLBufBid bbComment(&fBufMgr);
// Get the comment text into a temp buffer. Be sure to use temp buffer
// two here, since its to be used for stuff that is potentially longer
// than just a name.
States curState = InText;
bool gotLeadingSurrogate = false;
while (true)
{
// Get the next character
const XMLCh nextCh = fReaderMgr.getNextChar();
// Watch for an end of file
if (!nextCh)
{
emitError(XMLErrs::UnterminatedComment);
David Abram Cargill
committed
ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
}
// Check for correct surrogate pairs
// 0xD800..0xDBFF is a leading surrogate; it must be followed by a trailing
// surrogate in 0xDC00..0xDFFF.
if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
{
if (gotLeadingSurrogate)
emitError(XMLErrs::Expected2ndSurrogateChar);
else
gotLeadingSurrogate = true;
}
else
{
if (gotLeadingSurrogate)
{
if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))
emitError(XMLErrs::Expected2ndSurrogateChar);
}
// Its got to at least be a valid XML character
// The offending code unit is rendered in radix 16 for the error message.
XMLCh tmpBuf[9];
XMLString::binToText
(
nextCh
, tmpBuf
, 8
, 16
David Abram Cargill
committed
, fMemoryManager
);
emitError(XMLErrs::InvalidCharacter, tmpBuf);
}
gotLeadingSurrogate = false;
}
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
if (curState == InText)
{
// If its a dash, go to OneDash state. Otherwise take as text
if (nextCh == chDash)
curState = OneDash;
else
bbComment.append(nextCh);
}
else if (curState == OneDash)
{
// If its another dash, then we change to the two dashes states.
// Otherwise, we have to put in the deficit dash and the new
// character and go back to InText.
if (nextCh == chDash)
{
curState = TwoDashes;
}
else
{
bbComment.append(chDash);
bbComment.append(nextCh);
curState = InText;
}
}
else if (curState == TwoDashes)
{
// The next character must be the closing bracket
// Otherwise report it, resynchronize after the next '>' and give up on
// this comment without calling the handler.
if (nextCh != chCloseAngle)
{
emitError(XMLErrs::IllegalSequenceInComment);
fReaderMgr.skipPastChar(chCloseAngle);
return;
}
break;
}
}
// If we have an available handler, call back with the comment.
if (fDocHandler)
{
fDocHandler->docComment
(
bbComment.getRawBuffer()
);
//mark comment is seen within the current element
if (! fElemStack.isEmpty())
fElemStack.setCommentOrPISeen();
// Reads characters from the reader manager into toFill by calling
// getUpToCharOrWS() with chEndChar (going by the callee's name: up to
// chEndChar or whitespace -- confirm the stop rules and whether toFill is
// reset against ReaderMgr).
//
// Returns toFill's length after the call, so a result of zero can be tested
// by the caller as "nothing was read".
unsigned int
XMLScanner::scanUpToWSOr(XMLBuffer& toFill, const XMLCh chEndChar)
{
    fReaderMgr.getUpToCharOrWS(toFill, chEndChar);
    return toFill.getLen();
}
unsigned int *XMLScanner::getNewUIntPtr()
{
// this method hands back a new pointer initialized to 0
unsigned int *retVal;
if(fUIntPoolCol < 64)
{
retVal = fUIntPool[fUIntPoolRow]+fUIntPoolCol;
fUIntPoolCol++;
return retVal;
}
// time to grow the pool...
if(fUIntPoolRow+1 == fUIntPoolRowTotal)
{
// and time to add some space for new rows:
fUIntPoolRowTotal <<= 1;
unsigned int **newArray = (unsigned int **)fMemoryManager->allocate(sizeof(unsigned int *) * fUIntPoolRowTotal );
memcpy(newArray, fUIntPool, (fUIntPoolRow+1) * sizeof(unsigned int *));
fMemoryManager->deallocate(fUIntPool);
fUIntPool = newArray;
Loading
Loading full blame...