/*
* Copyright 2002,2003-2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Id$
*/
// ---------------------------------------------------------------------------
// Includes
// ---------------------------------------------------------------------------
#include <xercesc/internal/WFXMLScanner.hpp>
#include <xercesc/util/Janitor.hpp>
#include <xercesc/util/RuntimeException.hpp>
#include <xercesc/util/UnexpectedEOFException.hpp>
#include <xercesc/sax/InputSource.hpp>
#include <xercesc/framework/XMLDocumentHandler.hpp>
#include <xercesc/framework/XMLEntityHandler.hpp>
#include <xercesc/framework/XMLPScanToken.hpp>
#include <xercesc/framework/XMLValidityCodes.hpp>
#include <xercesc/internal/EndOfEntityException.hpp>
Neil Graham
committed
#include <xercesc/util/OutOfMemoryException.hpp>
XERCES_CPP_NAMESPACE_BEGIN
// ---------------------------------------------------------------------------
// WFXMLScanner: Constructors and Destructor
// ---------------------------------------------------------------------------
// Constructor taking only the validator to adopt, the grammar resolver and the
// memory manager. All three are forwarded to the XMLScanner base class, and
// the members named in the initializer list below start out as 0.
//
// NOTE(review): this view of the constructor is incomplete. The opening of
// the body that should precede the first catch clause, and everything that
// should follow that clause, are not visible here. The "Neil Graham" /
// "committed" lines are page residue, not code. Restore the body from the
// upstream file before building.
WFXMLScanner::WFXMLScanner( XMLValidator* const valToAdopt
, GrammarResolver* const grammarResolver
, MemoryManager* const manager) :
XMLScanner(valToAdopt, grammarResolver, manager)
, fElementIndex(0)
, fElements(0)
, fEntityTable(0)
, fAttrNameHashList(0)
, fAttrNSList(0)
Neil Graham
committed
// Out-of-memory conditions are rethrown untouched.
catch(const OutOfMemoryException&)
{
throw;
}
// Constructor taking the full set of handlers (document, doctype, entity,
// error) plus the validator to adopt, the grammar resolver and the memory
// manager. Everything is forwarded to the XMLScanner base class, and the
// members named in the initializer list below start out as 0.
//
// NOTE(review): this view of the constructor is incomplete. The opening of
// the body that should precede the first catch clause is missing, as is the
// header of the handler that owns the trailing "cleanUp(); throw;" pair. The
// "Neil Graham" / "committed" lines are page residue, not code. Restore from
// the upstream file before building.
WFXMLScanner::WFXMLScanner( XMLDocumentHandler* const docHandler
, DocTypeHandler* const docTypeHandler
, XMLEntityHandler* const entityHandler
, XMLErrorReporter* const errHandler
, XMLValidator* const valToAdopt
, GrammarResolver* const grammarResolver
, MemoryManager* const manager) :
XMLScanner(docHandler, docTypeHandler, entityHandler, errHandler, valToAdopt, grammarResolver, manager)
, fElementIndex(0)
, fElements(0)
, fEntityTable(0)
, fAttrNameHashList(0)
, fAttrNSList(0)
Neil Graham
committed
// Out-of-memory conditions are rethrown untouched.
catch(const OutOfMemoryException&)
{
throw;
}
// Visible tail of a handler whose header is missing from this view: release
// the containers through cleanUp(), then rethrow the active exception.
cleanUp();
throw;
}
}
WFXMLScanner::~WFXMLScanner()
{
    // All teardown is delegated to cleanUp(), which deletes the containers
    // held by this scanner.
    this->cleanUp();
}
// ---------------------------------------------------------------------------
// XMLScanner: Getter methods
// ---------------------------------------------------------------------------
NameIdPool<DTDEntityDecl>* WFXMLScanner::getEntityDeclPool()
{
    // This scanner always reports a null entity declaration pool.
    NameIdPool<DTDEntityDecl>* const noPool = 0;
    return noPool;
}
const NameIdPool<DTDEntityDecl>* WFXMLScanner::getEntityDeclPool() const
{
    // Const flavour of the getter: likewise always a null pool.
    const NameIdPool<DTDEntityDecl>* const noPool = 0;
    return noPool;
}
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
// ---------------------------------------------------------------------------
// WFXMLScanner: Main entry point to scan a document
// ---------------------------------------------------------------------------
void WFXMLScanner::scanDocument(const InputSource& src)
{
// Bump up the sequence id for this parser instance. This will invalidate
// any previous progressive scan tokens.
fSequenceId++;
try
{
// Reset the scanner and its plugged in stuff for a new run. This
// resets all the data structures, creates the initial reader and
// pushes it on the stack, and sets up the base document path.
scanReset(src);
// If we have a document handler, then call the start document
if (fDocHandler)
fDocHandler->startDocument();
// Scan the prolog part, which is everything before the root element
// including the DTD subsets.
scanProlog();
// If we got to the end of input, then its not a valid XML file.
// Else, go on to scan the content.
if (fReaderMgr.atEOF())
{
emitError(XMLErrs::EmptyMainEntity);
}
else
{
// Scan content, and tell it its not an external entity
David Abram Cargill
committed
if (scanContent())
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
{
// That went ok, so scan for any miscellaneous stuff
if (!fReaderMgr.atEOF())
scanMiscellaneous();
}
}
// If we have a document handler, then call the end document
if (fDocHandler)
fDocHandler->endDocument();
// Reset the reader manager to close all files, sockets, etc...
fReaderMgr.reset();
}
// NOTE:
//
// In all of the error processing below, the emitError() call MUST come
// before the flush of the reader mgr, or it will fail because it tries
// to find out the position in the XML source of the error.
catch(const XMLErrs::Codes)
{
// This is a 'first fatal error' type exit, so reset and fall through
fReaderMgr.reset();
}
catch(const XMLValid::Codes)
{
// This is a 'first fatal error' type exit, so reset and fall through
fReaderMgr.reset();
}
catch(const XMLException& excToCatch)
{
// Emit the error and catch any user exception thrown from here. Make
// sure in all cases we flush the reader manager.
fInException = true;
try
{
if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
emitError
(
XMLErrs::XMLException_Warning
, excToCatch.getType()
, excToCatch.getMessage()
);
else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
emitError
(
XMLErrs::XMLException_Fatal
, excToCatch.getType()
, excToCatch.getMessage()
);
else
emitError
(
XMLErrs::XMLException_Error
, excToCatch.getType()
, excToCatch.getMessage()
);
}
Neil Graham
committed
catch(const OutOfMemoryException&)
{
throw;
}
catch(...)
{
// Flush the reader manager and rethrow user's error
fReaderMgr.reset();
throw;
}
// If it returned, then reset the reader manager and fall through
fReaderMgr.reset();
}
Neil Graham
committed
catch(const OutOfMemoryException&)
{
throw;
}
catch(...)
{
// Reset and rethrow
fReaderMgr.reset();
throw;
}
}
// Progressive-scan step: senses the next token in the input and handles it.
//
// token - the progressive scan token; it is checked with isLegalToken()
//         before anything else is done.
//
// Returns true after character data or markup has been handled, and false
// when end of file was sensed or when one of the catch clauses below that
// "return false" is taken. Exceptions not absorbed below are rethrown.
//
// NOTE(review): this view of the function is damaged. Runs of bare numbers
// and the "<author>" / "committed" lines are page residue, not code, and the
// opening lines of the call made in the Token_EOF branch are missing.
// Restore from the upstream file before building.
bool WFXMLScanner::scanNext(XMLPScanToken& token)
{
// Make sure this token is still legal
if (!isLegalToken(token))
David Abram Cargill
committed
ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Scan_BadPScanToken, fMemoryManager);
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
// Find the next token and remember the reader id
unsigned int orgReader;
XMLTokens curToken;
bool retVal = true;
try
{
while (true)
{
// We have to handle any end of entity exceptions that happen here.
// We could be at the end of X nested entities, each of which will
// generate an end of entity exception as we try to move forward.
try
{
curToken = senseNextToken(orgReader);
break;
}
catch(const EndOfEntityException& toCatch)
{
// Send an end of entity reference event
if (fDocHandler)
fDocHandler->endEntityReference(toCatch.getEntity());
}
}
if (curToken == Token_CharData)
{
scanCharData(fCDataBuf);
}
else if (curToken == Token_EOF)
{
// End of input: if elements are still open, the top one is popped so
// its full name can be passed to the call below.
if (!fElemStack.isEmpty())
{
const ElemStack::StackElem* topElem = fElemStack.popTop();
// NOTE(review): the start of this call (presumably an emitError with an
// error code as first argument) is missing from this view.
, topElem->fThisElement->getFullName()
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
);
}
retVal = false;
}
else
{
// Its some sort of markup
bool gotData = true;
switch(curToken)
{
case Token_CData :
// Make sure we are within content
if (fElemStack.isEmpty())
emitError(XMLErrs::CDATAOutsideOfContent);
scanCDSection();
break;
case Token_Comment :
scanComment();
break;
case Token_EndTag :
scanEndTag(gotData);
break;
case Token_PI :
scanPI();
break;
case Token_StartTag :
if (fDoNamespaces)
scanStartTagNS(gotData);
else
scanStartTag(gotData);
break;
default :
fReaderMgr.skipToChar(chOpenAngle);
break;
}
// The markup must end in the same reader it started in
if (orgReader != fReaderMgr.getCurrentReaderNum())
emitError(XMLErrs::PartialMarkupInEntity);
// If we hit the end, then do the miscellaneous part
if (!gotData)
{
// That went ok, so scan for any miscellaneous stuff
scanMiscellaneous();
if (fDocHandler)
fDocHandler->endDocument();
}
}
}
// NOTE:
//
// In all of the error processing below, the emitError() call MUST come
// before the flush of the reader mgr, or it will fail because it tries
// to find out the position in the XML source of the error.
catch(const XMLErrs::Codes)
{
// This is a 'first failure' exception, so reset and return failure
fReaderMgr.reset();
return false;
}
catch(const XMLValid::Codes)
{
// This is a 'first fatal error' type exit, so reset and return failure
fReaderMgr.reset();
return false;
}
catch(const XMLException& excToCatch)
{
// Emit the error and catch any user exception thrown from here. Make
// sure in all cases we flush the reader manager.
fInException = true;
try
{
if (excToCatch.getErrorType() == XMLErrorReporter::ErrType_Warning)
emitError
(
XMLErrs::XMLException_Warning
, excToCatch.getType()
, excToCatch.getMessage()
);
else if (excToCatch.getErrorType() >= XMLErrorReporter::ErrType_Fatal)
emitError
(
XMLErrs::XMLException_Fatal
, excToCatch.getType()
, excToCatch.getMessage()
);
else
emitError
(
XMLErrs::XMLException_Error
, excToCatch.getType()
, excToCatch.getMessage()
);
}
Neil Graham
committed
catch(const OutOfMemoryException&)
{
throw;
}
catch(...)
{
// Reset and rethrow user error
fReaderMgr.reset();
throw;
}
// Reset and return failure
fReaderMgr.reset();
return false;
}
Neil Graham
committed
catch(const OutOfMemoryException&)
{
throw;
}
catch(...)
{
// Reset and rethrow original error
fReaderMgr.reset();
throw;
}
// If we hit the end, then flush the reader manager
if (!retVal)
fReaderMgr.reset();
return retVal;
}
// ---------------------------------------------------------------------------
// WFXMLScanner: Private helper methods.
// ---------------------------------------------------------------------------
// This method handles the common initialization, to avoid having to do
// it redundantly in multiple constructors.
void WFXMLScanner::commonInit()
{
fEntityTable = new (fMemoryManager) ValueHashTableOf<XMLCh>(11, fMemoryManager);
fAttrNameHashList = new (fMemoryManager)ValueVectorOf<unsigned int>(16, fMemoryManager);
fAttrNSList = new (fMemoryManager) ValueVectorOf<XMLAttr*>(8, fMemoryManager);
fElements = new (fMemoryManager) RefVectorOf<XMLElementDecl>(32, true, fMemoryManager);
fElementLookup = new (fMemoryManager) RefHashTableOf<XMLElementDecl>(109, false, fMemoryManager);
// Add the default entity entries for the character refs that must always
// be present.
fEntityTable->put((void*) XMLUni::fgAmp, chAmpersand);
fEntityTable->put((void*) XMLUni::fgLT, chOpenAngle);
fEntityTable->put((void*) XMLUni::fgGT, chCloseAngle);
fEntityTable->put((void*) XMLUni::fgQuot, chDoubleQuote);
fEntityTable->put((void*) XMLUni::fgApos, chSingleQuote);
}
// Releases the containers held by this scanner.
//
// Each member is set back to 0 once deleted so that no dangling pointer is
// left behind and a repeated call is harmless (deleting a null pointer has
// no effect). The deletion order is unchanged.
void WFXMLScanner::cleanUp()
{
    delete fEntityTable;
    fEntityTable = 0;

    delete fAttrNameHashList;
    fAttrNameHashList = 0;

    delete fAttrNSList;
    fAttrNSList = 0;

    delete fElementLookup;
    fElementLookup = 0;

    delete fElements;
    fElements = 0;
}
// Maps a namespace prefix to its URI id.
//
// prefix - the prefix to resolve.
// mode   - mapping mode handed through to the element stack.
//
// Returns fXMLNSNamespaceId for the 'xmlns' prefix, fXMLNamespaceId for the
// 'xml' prefix, and otherwise whatever id the element stack yields for the
// prefix. When the element stack reports the prefix as unknown, an
// UnknownPrefix error is emitted and the id it supplied is still returned.
unsigned int
WFXMLScanner::resolvePrefix(const XMLCh* const prefix
                            , const ElemStack::MapModes mode)
{
    // Watch for the special namespace prefixes. We always map these to
    // special URIs. 'xml' gets mapped to the official URI that its defined
    // to map to by the NS spec. xmlns gets mapped to a special place holder
    // URI that we define (so that it maps to something checkable.)
    if (XMLString::equals(prefix, XMLUni::fgXMLNSString))
        return fXMLNSNamespaceId;
    else if (XMLString::equals(prefix, XMLUni::fgXMLString))
        return fXMLNamespaceId;

    // Ask the element stack to search up itself for a mapping for the
    // passed prefix. The flag starts out false so it never holds an
    // indeterminate value.
    bool unknown = false;
    unsigned int uriId = fElemStack.mapPrefixToURI(prefix, mode, unknown);

    // If it was unknown, then the URI was faked in but we have to issue an error
    if (unknown)
        emitError(XMLErrs::UnknownPrefix, prefix);

    return uriId;
}
// This method will reset the scanner data structures, and related plugged
// in stuff, for a new scan session. We get the input source for the primary
// XML entity, create the reader for it, and push it on the stack so that
// upon successful return from here we are ready to go.
void WFXMLScanner::scanReset(const InputSource& src)
{
// For all installed handlers, send reset events. This gives them
// a chance to flush any cached data.
if (fDocHandler)
fDocHandler->resetDocument();
if (fEntityHandler)
fEntityHandler->resetEntities();
if (fErrorReporter)
fErrorReporter->resetErrors();
// Reset the element stack, and give it the latest ids for the special
// URIs it has to know about.
fElemStack.reset
(
fEmptyNamespaceId
, fUnknownNamespaceId
, fXMLNamespaceId
, fXMLNSNamespaceId
);
// Reset some status flags
fInException = false;
fStandalone = false;
fErrorCount = 0;
fHasNoDTD = true;
fElementIndex = 0;
// Reset elements lookup table
fElementLookup->removeAll();
// Handle the creation of the XML reader object for this input source.
// This will provide us with transcoding and basic lexing services.
XMLReader* newReader = fReaderMgr.createReader
(
src
, true
, XMLReader::RefFrom_NonLiteral
, XMLReader::Type_General
, XMLReader::Source_External
, fCalculateSrcOfs
);
if (!newReader) {
if (src.getIssueFatalErrorIfNotFound())
David Abram Cargill
committed
ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource, src.getSystemId(), fMemoryManager);
David Abram Cargill
committed
ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Scan_CouldNotOpenSource_Warning, src.getSystemId(), fMemoryManager);
}
// Push this read onto the reader manager
fReaderMgr.pushReader(newReader, 0);
// and reset security-related things if necessary:
if(fSecurityManager != 0)
{
fEntityExpansionLimit = fSecurityManager->getEntityExpansionLimit();
fEntityExpansionCount = 0;
}
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
}
// This method is called between markup in content. It scans for character
// data that is sent to the document handler. It watches for any markup
// characters that would indicate that the character data has ended. It also
// handles expansion of general and character entities.
//
// sendData() is a local static helper for this method which handles some
// code that must be done in three different places here.
void WFXMLScanner::sendCharData(XMLBuffer& toSend)
{
    // Hands the text accumulated in toSend to the document handler (when
    // one is installed) and then empties the buffer. A buffer that is
    // already empty is left alone and produces no callback.
    if (!toSend.isEmpty())
    {
        // Not validating, so always assume it is just char data
        if (fDocHandler)
        {
            fDocHandler->docCharacters
            (
                toSend.getRawBuffer()
                , toSend.getLen()
                , false
            );
        }

        // Leave the buffer ready for the next run of text
        toSend.reset();
    }
}
// ---------------------------------------------------------------------------
// WFXMLScanner: Private scanning methods
// ---------------------------------------------------------------------------
// This method will kick off the scanning of the primary content of the
// document, i.e. the elements.
David Abram Cargill
committed
// Scans the content of the document: repeatedly senses the next token and
// dispatches to the matching scan routine until gotData becomes false. It
// is cleared here at end of file, and is also handed by reference to the
// start/end tag scanners. An EndOfEntityException raised along the way is
// handled by reporting partial markup if markup was in progress, notifying
// the document handler (when installed), and resuming the loop.
//
// The visible code always returns true.
//
// NOTE(review): this view of the function is damaged. Runs of bare numbers
// are page residue, not code, and the opening lines of the call made in the
// Token_EOF branch are missing. Restore from the upstream file before
// building.
bool WFXMLScanner::scanContent()
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
{
// Go into a loop until we hit the end of the root element, or we fall
// out because there is no root element.
//
// We have to do kind of a deeply nested double loop here in order to
// avoid doing the setup/teardown of the exception handler on each
// round. Doing it this way we only do it when an exception actually
// occurs.
bool gotData = true;
bool inMarkup = false;
while (gotData)
{
try
{
while (gotData)
{
// Sense what the next top level token is. According to what
// this tells us, we will call something to handle that kind
// of thing.
unsigned int orgReader;
const XMLTokens curToken = senseNextToken(orgReader);
// Handle character data and end of file specially. Char data
// is not markup so we don't want to handle it in the loop
// below.
if (curToken == Token_CharData)
{
// Scan the character data and call appropriate events. Let
// him use our local character data buffer for efficiency.
scanCharData(fCDataBuf);
continue;
}
else if (curToken == Token_EOF)
{
// The element stack better be empty at this point or we
// ended prematurely before all elements were closed.
if (!fElemStack.isEmpty())
{
const ElemStack::StackElem* topElem = fElemStack.popTop();
// NOTE(review): the start of this call (presumably an emitError with an
// error code as first argument) is missing from this view.
, topElem->fThisElement->getFullName()
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
);
}
// Its the end of file, so clear the got data flag
gotData = false;
continue;
}
// We are in some sort of markup now
inMarkup = true;
// According to the token we got, call the appropriate
// scanning method.
switch(curToken)
{
case Token_CData :
// Make sure we are within content
if (fElemStack.isEmpty())
emitError(XMLErrs::CDATAOutsideOfContent);
scanCDSection();
break;
case Token_Comment :
scanComment();
break;
case Token_EndTag :
scanEndTag(gotData);
break;
case Token_PI :
scanPI();
break;
case Token_StartTag :
if (fDoNamespaces)
scanStartTagNS(gotData);
else
scanStartTag(gotData);
break;
default :
fReaderMgr.skipToChar(chOpenAngle);
break;
}
// The markup must end in the same reader it started in
if (orgReader != fReaderMgr.getCurrentReaderNum())
emitError(XMLErrs::PartialMarkupInEntity);
// And we are back out of markup again
inMarkup = false;
}
}
catch(const EndOfEntityException& toCatch)
{
// If we were in some markup when this happened, then its a
// partial markup error.
if (inMarkup)
emitError(XMLErrs::PartialMarkupInEntity);
// Send an end of entity reference event
if (fDocHandler)
fDocHandler->endEntityReference(toCatch.getEntity());
inMarkup = false;
}
}
// It went ok, so return success
return true;
}
// Scans an end tag and pops the matching element off the element stack.
//
// gotData - out parameter: set to true on entry and, once the tag has been
//           fully processed, to false only when the element just closed was
//           the root (the element stack became empty).
//
// When the element stack is already empty on entry, MoreEndThanStartTags is
// emitted, input is skipped past the next '>' and a RuntimeException
// (Scan_UnbalancedStartEnd) is raised through the ThrowXMLwithMemMgr macro.
//
// NOTE(review): this view of the function is damaged. The opening lines of
// the call made when the tag name does not match, and the leading arguments
// of the endElement() call, are missing; the "<author>" / "committed" lines
// are page residue. Restore from the upstream file before building.
void WFXMLScanner::scanEndTag(bool& gotData)
{
// Assume we will still have data until proven otherwise. It will only
// ever be false if this is the end of the root element.
gotData = true;
// Check if the element stack is empty. If so, then this is an unbalanced
// element (i.e. more ends than starts, perhaps because of bad text
// causing one to be skipped.)
if (fElemStack.isEmpty())
{
emitError(XMLErrs::MoreEndThanStartTags);
fReaderMgr.skipPastChar(chCloseAngle);
David Abram Cargill
committed
ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Scan_UnbalancedStartEnd, fMemoryManager);
}
// Pop the stack of the element we are supposed to be ending. Remember
// that we don't own this. The stack just keeps them and reuses them.
unsigned int uriId = (fDoNamespaces)
? fElemStack.getCurrentURI() : fEmptyNamespaceId;
const ElemStack::StackElem* topElem = fElemStack.popTop();
// See if it was the root element, to avoid multiple calls below
const bool isRoot = fElemStack.isEmpty();
// Make sure that its the end of the element that we expect
if (!fReaderMgr.skippedString(topElem->fThisElement->getFullName()))
{
// NOTE(review): the start of this call (presumably an emitError with an
// error code as first argument) and its closing parenthesis are missing
// from this view.
, topElem->fThisElement->getFullName()
fReaderMgr.skipPastChar(chCloseAngle);
return;
}
// Make sure we are back on the same reader as where we started
if (topElem->fReaderNum != fReaderMgr.getCurrentReaderNum())
emitError(XMLErrs::PartialTagMarkupError);
// Skip optional whitespace
fReaderMgr.skipPastSpaces();
// Make sure we find the closing bracket
if (!fReaderMgr.skippedChar(chCloseAngle))
{
emitError
(
XMLErrs::UnterminatedEndTag
, topElem->fThisElement->getFullName()
);
}
// If we have a doc handler, tell it about the end tag
if (fDocHandler)
{
fDocHandler->endElement
(
// NOTE(review): the leading argument(s) of this call are missing from this
// view; uriId and isRoot computed above are presumably among them.
, topElem->fThisElement->getElementName()->getPrefix()
);
}
// If this was the root, then done with content
gotData = !isRoot;
}
// Skips over a DOCTYPE declaration without interpreting it: input is skipped
// up to the first character found in the doctypeIE list; if that character
// is '[', input is further skipped past the matching-looking ']' and, in
// either case, past the next '>'.
//
// NOTE(review): the line that declares the doctypeIE array (its type, name
// and opening brace) is missing from this view; only its initializer values
// and closing brace remain. Restore from the upstream file before building.
void WFXMLScanner::scanDocTypeDecl()
{
// Just skips over it
// REVISIT: Should we issue a warning
chOpenSquare, chCloseAngle, chNull
};
XMLCh nextCh = fReaderMgr.skipUntilIn(doctypeIE);
// An internal subset is opened by '[': skip to its closing ']'
if (nextCh == chOpenSquare)
fReaderMgr.skipPastChar(chCloseSquare);
fReaderMgr.skipPastChar(chCloseAngle);
}
// Scans a start tag when namespace processing is off: reads the element
// name, finds or creates its element declaration, pushes it on the element
// stack, scans the attribute/value pairs into fAttrList and finally reports
// the tag to the document handler (when one is installed).
//
// gotData - out parameter: set to true on entry and to false only when the
//           tag is an empty element that is also the root.
//
// Returns false on the error paths below that abandon the tag, and true
// otherwise.
//
// NOTE(review): this view of the function is damaged. Runs of bare numbers
// and the "<author>" / "committed" lines are page residue, not code, and a
// number of code lines are missing (several conditions, the declarations of
// attNameRawBuf and curAtt, and parts of some calls). The individual gaps
// are flagged below. Restore from the upstream file before building.
bool WFXMLScanner::scanStartTag(bool& gotData)
{
// Assume we will still have data until proven otherwise. It will only
// ever be false if this is the root and its empty.
gotData = true;
// Get the QName. In this case, we are not doing namespaces, so we just
// use it as is and don't have to break it into parts.
if (!fReaderMgr.getName(fQNameBuf))
{
emitError(XMLErrs::ExpectedElementName);
fReaderMgr.skipToChar(chOpenAngle);
return false;
}
// Assume it won't be an empty tag
bool isEmpty = false;
// See if its the root element
const bool isRoot = fElemStack.isEmpty();
// Lets try to look up the element
const XMLCh* qnameRawBuf = fQNameBuf.getRawBuffer();
XMLElementDecl* elemDecl = fElementLookup->get(qnameRawBuf);
if (!elemDecl) {
// Not seen yet in this document: reuse a declaration object left in
// fElements if one is available at fElementIndex, else allocate one.
if (fElementIndex < fElements->size()) {
elemDecl = fElements->elementAt(fElementIndex);
}
else {
// NOTE(review): the parentheses around the constructor argument are missing
// from this view.
elemDecl = new (fGrammarPoolMemoryManager) DTDElementDecl
fGrammarPoolMemoryManager
fElements->addElement(elemDecl);
}
elemDecl->setElementName(XMLUni::fgZeroLenString, qnameRawBuf, fEmptyNamespaceId);
fElementLookup->put((void*)elemDecl->getFullName(), elemDecl);
fElementIndex++;
}
// Expand the element stack and add the new element
fElemStack.addLevel(elemDecl, fReaderMgr.getCurrentReaderNum());
// Skip any whitespace after the name
fReaderMgr.skipPastSpaces();
// We loop until we either see a /> or >, handling attribute/value
// pairs until we get there.
unsigned int attCount = 0;
unsigned int curAttListSize = fAttrList->size();
while (true)
{
// And get the next non-space character
XMLCh nextCh = fReaderMgr.peekNextChar();
// If the next character is not a slash or closed angle bracket,
// then it must be whitespace, since whitespace is required
// between the end of the last attribute and the name of the next
// one.
if (attCount)
{
if ((nextCh != chForwardSlash) && (nextCh != chCloseAngle))
{
// NOTE(review): the condition guarding the next block and the "else" before
// the block after it are missing from this view.
{
// Ok, skip by them and peek another char
fReaderMgr.skipPastSpaces();
nextCh = fReaderMgr.peekNextChar();
}
{
// Emit the error but keep on going
emitError(XMLErrs::ExpectedWhitespace);
}
}
}
// Ok, here we first check for any of the special case characters.
// If its not one, then we do the normal case processing, which
// assumes that we've hit an attribute value, Otherwise, we do all
// the special case checks.
// NOTE(review): the condition that opens the attribute-processing block
// below is missing from this view.
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
{
// Assume its going to be an attribute, so get a name from
// the input.
if (!fReaderMgr.getName(fAttNameBuf))
{
emitError(XMLErrs::ExpectedAttrName);
fReaderMgr.skipPastChar(chCloseAngle);
return false;
}
// And next must be an equal sign
if (!scanEq())
{
static const XMLCh tmpList[] =
{
chSingleQuote, chDoubleQuote, chCloseAngle
, chOpenAngle, chForwardSlash, chNull
};
emitError(XMLErrs::ExpectedEqSign);
// Try to sync back up by skipping forward until we either
// hit something meaningful.
const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList);
if ((chFound == chCloseAngle) || (chFound == chForwardSlash))
{
// Jump back to top for normal processing of these
continue;
}
// NOTE(review): the last term and closing parenthesis of this condition are
// missing from this view.
else if ((chFound == chSingleQuote)
|| (chFound == chDoubleQuote)
{
// Just fall through assuming that the value is to follow
}
else if (chFound == chOpenAngle)
{
// Assume a malformed tag and that new one is starting
emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
return false;
}
else
{
// Something went really wrong
return false;
}
}
// See if this attribute is declared more than one for this element.
// NOTE(review): the declaration of attNameRawBuf is missing from this view.
David Abram Cargill
committed
unsigned int attNameHash = XMLString::hash(attNameRawBuf, 109, fMemoryManager);
if (attCount) {
// Compare hashes first; the names are compared only when the hashes match.
for (unsigned int k=0; k < attCount; k++) {
if (fAttrNameHashList->elementAt(k) == attNameHash) {
if (
XMLString::equals
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
fAttrList->elementAt(k)->getName()
, attNameRawBuf
)
)
{
emitError
(
XMLErrs::AttrAlreadyUsedInSTag
, attNameRawBuf
, qnameRawBuf
);
break;
}
}
}
}
// Skip any whitespace before the value and then scan the att
// value. This will come back normalized with entity refs and
// char refs expanded.
fReaderMgr.skipPastSpaces();
if (!scanAttValue(attNameRawBuf, fAttValueBuf))
{
static const XMLCh tmpList[] =
{
chCloseAngle, chOpenAngle, chForwardSlash, chNull
};
emitError(XMLErrs::ExpectedAttrValue);
// It failed, so lets try to get synced back up. We skip
// forward until we find some whitespace or one of the
// chars in our list.
const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList);
// NOTE(review): the last term and closing parenthesis of this condition are
// missing from this view.
if ((chFound == chCloseAngle)
|| (chFound == chForwardSlash)
{
// Just fall through and process this attribute, though
// the value will be "".
}
else if (chFound == chOpenAngle)
{
// Assume a malformed tag and that new one is starting
emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
return false;
}
else
{
// Something went really wrong
return false;
}
}
// Add this attribute to the attribute list that we use to
// pass them to the handler. We reuse its existing elements
// but expand it as required.
// NOTE(review): the declaration of curAtt, the condition choosing between
// creating a new XMLAttr and reusing an existing one, the parentheses of the
// constructor call and the statement adding the new attribute to the list
// are missing from this view.
curAtt = new (fMemoryManager) XMLAttr
-1
, attNameRawBuf
, XMLUni::fgZeroLenString
, fAttValueBuf.getRawBuffer()
, XMLAttDef::CData
, true
, fMemoryManager
fAttrNameHashList->addElement(attNameHash);
}
else
{
curAtt = fAttrList->elementAt(attCount);
curAtt->set
(
-1
, attNameRawBuf
, XMLUni::fgZeroLenString
, fAttValueBuf.getRawBuffer()
);
curAtt->setSpecified(true);
fAttrNameHashList->setElementAt(attNameHash, attCount);
}
attCount++;
// And jump back to the top of the loop
continue;
}
// It was some special case character so do all of the checks and
// deal with it.
if (!nextCh)
David Abram Cargill
committed
ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
if (nextCh == chForwardSlash)
{
fReaderMgr.getNextChar();
isEmpty = true;
if (!fReaderMgr.skippedChar(chCloseAngle))
emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
break;
}
else if (nextCh == chCloseAngle)
{
fReaderMgr.getNextChar();
break;
}
else if (nextCh == chOpenAngle)
{
// Check for this one specially, since its going to be common
// and it is kind of auto-recovering since we've already hit the
// next open bracket, which is what we would have seeked to (and
// skipped this whole tag.)
emitError(XMLErrs::UnterminatedStartTag, elemDecl->getFullName());
break;
}
else if ((nextCh == chSingleQuote) || (nextCh == chDoubleQuote))
{
// Check for this one specially, which is probably a missing
// attribute name, e.g. ="value". Just issue expected name
// error and eat the quoted string, then jump back to the
// top again.
emitError(XMLErrs::ExpectedAttrName);
fReaderMgr.getNextChar();
fReaderMgr.skipQuotedString(nextCh);
fReaderMgr.skipPastSpaces();
continue;
}
}
// If empty, validate content right now if we are validating and then
// pop the element stack top. Else, we have to update the current stack
// top's namespace mapping elements.
if (isEmpty)
{
// Pop the element stack back off since it'll never be used now
fElemStack.popTop();
// If the elem stack is empty, then it was an empty root
if (isRoot)
gotData = false;
}
// If we have a document handler, then tell it about this start tag. We
// don't have any URI id to send along, so send fEmptyNamespaceId. We also do not send
// any prefix since its just one big name if we are not doing namespaces.
if (fDocHandler)
{
fDocHandler->startElement
(
// NOTE(review): the first argument of this call is missing from this view.
, fEmptyNamespaceId
, 0
, *fAttrList
, attCount
, isEmpty
, isRoot
);
}
return true;
}
// This method is called to scan a start tag when we are processing
// namespaces. There are two different versions of this method, one for
// namespace aware processing and one for non-namespace aware processing.
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
//
// This method is called after we've scanned the < of a start tag. So we
// have to get the element name, then scan the attributes, after which
// we are either going to see >, />, or attributes followed by one of those
// sequences.
bool WFXMLScanner::scanStartTagNS(bool& gotData)
{
// Assume we will still have data until proven otherwise. It will only
// ever be false if this is the root and its empty.
gotData = true;
// The current position is after the open bracket, so we need to read in
// in the element name.
if (!fReaderMgr.getName(fQNameBuf))
{
emitError(XMLErrs::ExpectedElementName);
fReaderMgr.skipToChar(chOpenAngle);
return false;
}
// See if its the root element
const bool isRoot = fElemStack.isEmpty();
// Assume it won't be an empty tag
bool isEmpty = false;
// Skip any whitespace after the name
fReaderMgr.skipPastSpaces();
// Lets try to look up the element
XMLElementDecl* elemDecl = fElementLookup->get(qnameRawBuf);
if (!elemDecl) {
if (!XMLString::compareNString(qnameRawBuf, XMLUni::fgXMLNSColonString, 6))
emitError(XMLErrs::NoXMLNSAsElementPrefix, qnameRawBuf);
if (fElementIndex < fElements->size()) {
elemDecl = fElements->elementAt(fElementIndex);
}
else {
elemDecl = new (fGrammarPoolMemoryManager) DTDElementDecl
fGrammarPoolMemoryManager
fElements->addElement(elemDecl);
}
elemDecl->setElementName(qnameRawBuf, fEmptyNamespaceId);
fElementLookup->put((void*)elemDecl->getFullName(), elemDecl);
fElementIndex++;
}
// Expand the element stack and add the new element
fElemStack.addLevel(elemDecl, fReaderMgr.getCurrentReaderNum());
// reset NS attribute list
fAttrNSList->removeAllElements();
// We loop until we either see a /> or >, handling attribute/value
// pairs until we get there.
unsigned int attCount = 0;
unsigned int curAttListSize = fAttrList->size();
while (true)
{
// And get the next non-space character
XMLCh nextCh = fReaderMgr.peekNextChar();
// If the next character is not a slash or closed angle bracket,
// then it must be whitespace, since whitespace is required
// between the end of the last attribute and the name of the next
// one.
if (attCount)
{
if ((nextCh != chForwardSlash) && (nextCh != chCloseAngle))
{
{
// Ok, skip by them and peek another char
fReaderMgr.skipPastSpaces();
nextCh = fReaderMgr.peekNextChar();
}
else
{
// Emit the error but keep on going
emitError(XMLErrs::ExpectedWhitespace);
}
}
}
// Ok, here we first check for any of the special case characters.
// If its not one, then we do the normal case processing, which
// assumes that we've hit an attribute value, Otherwise, we do all
// the special case checks.
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
{
// Assume its going to be an attribute, so get a name from
// the input.
if (!fReaderMgr.getName(fAttNameBuf))
{
emitError(XMLErrs::ExpectedAttrName);
fReaderMgr.skipPastChar(chCloseAngle);
return false;
}
// And next must be an equal sign
if (!scanEq())
{
static const XMLCh tmpList[] =
{
chSingleQuote, chDoubleQuote, chCloseAngle
, chOpenAngle, chForwardSlash, chNull
};
emitError(XMLErrs::ExpectedEqSign);
// Try to sync back up by skipping forward until we either
// hit something meaningful.
const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList);
if ((chFound == chCloseAngle) || (chFound == chForwardSlash))
{
// Jump back to top for normal processing of these
continue;
}
else if ((chFound == chSingleQuote)
|| (chFound == chDoubleQuote)
{
// Just fall through assuming that the value is to follow
}
else if (chFound == chOpenAngle)
{
// Assume a malformed tag and that new one is starting
emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
return false;
}
else
{
// Something went really wrong
return false;
}
}
// See if this attribute is declared more than one for this element.
David Abram Cargill
committed
unsigned int attNameHash = XMLString::hash(attNameRawBuf, 109, fMemoryManager);
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
if (attCount) {
for (unsigned int k=0; k < attCount; k++) {
if (fAttrNameHashList->elementAt(k) == attNameHash) {
if (XMLString::equals(
fAttrList->elementAt(k)->getQName()
, attNameRawBuf))
{
emitError
(
XMLErrs::AttrAlreadyUsedInSTag
, attNameRawBuf
, qnameRawBuf
);
break;
}
}
}
}
// Skip any whitespace before the value and then scan the att
// value. This will come back normalized with entity refs and
// char refs expanded.
fReaderMgr.skipPastSpaces();
if (!scanAttValue(attNameRawBuf, fAttValueBuf))
{
static const XMLCh tmpList[] =
{
chCloseAngle, chOpenAngle, chForwardSlash, chNull
};
emitError(XMLErrs::ExpectedAttrValue);
// It failed, so lets try to get synced back up. We skip
// forward until we find some whitespace or one of the
// chars in our list.
const XMLCh chFound = fReaderMgr.skipUntilInOrWS(tmpList);
if ((chFound == chCloseAngle)
|| (chFound == chForwardSlash)
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
{
// Just fall through and process this attribute, though
// the value will be "".
}
else if (chFound == chOpenAngle)
{
// Assume a malformed tag and that new one is starting
emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
return false;
}
else
{
// Something went really wrong
return false;
}
}
// Add this attribute to the attribute list that we use to
// pass them to the handler. We reuse its existing elements
// but expand it as required.
const XMLCh* attValueRawBuf = fAttValueBuf.getRawBuffer();
XMLAttr* curAtt = 0;
if (attCount >= curAttListSize)
{
curAtt = new (fMemoryManager) XMLAttr
(
fEmptyNamespaceId
, attNameRawBuf
, attValueRawBuf
, XMLAttDef::CData
, true
, fMemoryManager
);
fAttrList->addElement(curAtt);
fAttrNameHashList->addElement(attNameHash);
}
else
{
curAtt = fAttrList->elementAt(attCount);
curAtt->set
(
fEmptyNamespaceId
, attNameRawBuf
, attValueRawBuf
);
fAttrNameHashList->setElementAt(attNameHash, attCount);
}
// Make sure that the name is basically well formed for namespace
// enabled rules. It either has no colons, or it has one which
// is neither the first or last char.
const int colonFirst = XMLString::indexOf(attNameRawBuf, chColon);
if (colonFirst != -1)
{
const int colonLast = XMLString::lastIndexOf(attNameRawBuf, chColon);
if (colonFirst != colonLast)
{
emitError(XMLErrs::TooManyColonsInName);
continue;
}
else if ((colonFirst == 0)
|| (colonLast == (int)fAttNameBuf.getLen() - 1))
{
emitError(XMLErrs::InvalidColonPos);
continue;
}
}
// Map prefix to namespace
const XMLCh* attPrefix = curAtt->getPrefix();
const XMLCh* attLocalName = curAtt->getName();
const XMLCh* namespaceURI = fAttValueBuf.getRawBuffer();
if (attPrefix && *attPrefix) {
if (XMLString::equals(attPrefix, XMLUni::fgXMLString)) {
curAtt->setURIId(fXMLNamespaceId);
}
else if (XMLString::equals(attPrefix, XMLUni::fgXMLNSString)) {
if (XMLString::equals(attLocalName, XMLUni::fgXMLNSString))
emitError(XMLErrs::NoUseOfxmlnsAsPrefix);
else if (XMLString::equals(attLocalName, XMLUni::fgXMLString)) {
if (!XMLString::equals(namespaceURI, XMLUni::fgXMLURIName))
emitError(XMLErrs::PrefixXMLNotMatchXMLURI);
}
Neil Graham
committed
if (!namespaceURI)
emitError(XMLErrs::NoEmptyStrNamespace, attNameRawBuf);
else if(!*namespaceURI && fXMLVersion == XMLReader::XMLV1_0)
emitError(XMLErrs::NoEmptyStrNamespace, attNameRawBuf);
, fURIStringPool->addOrFind(namespaceURI)
);
curAtt->setURIId(fXMLNSNamespaceId);
}
else {
fAttrNSList->addElement(curAtt);
}
}
else {
if (XMLString::equals(XMLUni::fgXMLNSString, attLocalName)) {
if (XMLString::equals(namespaceURI, XMLUni::fgXMLNSURIName))
emitError(XMLErrs::NoUseOfxmlnsURI);
else if (XMLString::equals(namespaceURI, XMLUni::fgXMLURIName))
emitError(XMLErrs::XMLURINotMatchXMLPrefix);
fElemStack.addPrefix
(
XMLUni::fgZeroLenString
, fURIStringPool->addOrFind(namespaceURI)
);
}
}
// increment attribute count
attCount++;
// And jump back to the top of the loop
continue;
}
// It was some special case character so do all of the checks and
// deal with it.
if (!nextCh)
David Abram Cargill
committed
ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
if (nextCh == chForwardSlash)
{
fReaderMgr.getNextChar();
isEmpty = true;
if (!fReaderMgr.skippedChar(chCloseAngle))
emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
break;
}
else if (nextCh == chCloseAngle)
{
fReaderMgr.getNextChar();
break;
}
else if (nextCh == chOpenAngle)
{
// Check for this one specially, since its going to be common
// and it is kind of auto-recovering since we've already hit the
// next open bracket, which is what we would have seeked to (and
// skipped this whole tag.)
emitError(XMLErrs::UnterminatedStartTag, qnameRawBuf);
break;
}
else if ((nextCh == chSingleQuote) || (nextCh == chDoubleQuote))
{
// Check for this one specially, which is probably a missing
// attribute name, e.g. ="value". Just issue expected name
// error and eat the quoted string, then jump back to the
// top again.
emitError(XMLErrs::ExpectedAttrName);
fReaderMgr.getNextChar();
fReaderMgr.skipQuotedString(nextCh);
fReaderMgr.skipPastSpaces();
continue;
}
}
// Handle provided attributes that we did not map their prefixes
for (unsigned int i=0; i < fAttrNSList->size(); i++) {
XMLAttr* providedAttr = fAttrNSList->elementAt(i);
resolvePrefix
(
providedAttr->getPrefix(),
ElemStack::Mode_Attribute
)
Alberto Massari
committed
if(attCount) {
//
// Decide if to use hash table to do duplicate checking
//
bool toUseHashTable = false;
setAttrDupChkRegistry(attCount, toUseHashTable);
Alberto Massari
committed
// check for duplicate namespace attributes:
// by checking for qualified names with the same local part and with prefixes
// which have been bound to namespace names that are identical.
XMLAttr* loopAttr;
XMLAttr* curAtt;
for (unsigned int attrIndex=0; attrIndex < attCount-1; attrIndex++) {
loopAttr = fAttrList->elementAt(attrIndex);
if (!toUseHashTable)
{
for (unsigned int curAttrIndex = attrIndex+1; curAttrIndex < attCount; curAttrIndex++) {
curAtt = fAttrList->elementAt(curAttrIndex);
if (curAtt->getURIId() == loopAttr->getURIId() &&
XMLString::equals(curAtt->getName(), loopAttr->getName())) {
emitError
(
XMLErrs::AttrAlreadyUsedInSTag
Alberto Massari
committed
, curAtt->getName()
, elemDecl->getFullName()
);
}
}
}
else
{
if (fAttrDupChkRegistry->containsKey((void*)loopAttr->getName(), loopAttr->getURIId()))
{
emitError
(
XMLErrs::AttrAlreadyUsedInSTag
, loopAttr->getName()
, elemDecl->getFullName()
Alberto Massari
committed
);
}
fAttrDupChkRegistry->put((void*)loopAttr->getName(), loopAttr->getURIId(), loopAttr);
Alberto Massari
committed
}
}
// Resolve the qualified name to a URI.
unsigned int uriId = resolvePrefix
(
elemDecl->getElementName()->getPrefix()
, ElemStack::Mode_Element
);
// Now we can update the element stack
fElemStack.setCurrentURI(uriId);
// Tell the document handler about this start tag
if (fDocHandler)
{
, elemDecl->getElementName()->getPrefix()
, *fAttrList
, attCount
, false
, isRoot
);
}
// If empty, validate content right now if we are validating and then
// pop the element stack top. Else, we have to update the current stack
// top's namespace mapping elements.
if (isEmpty)
{
// Pop the element stack back off since it'll never be used now
fElemStack.popTop();
// If we have a doc handler, tell it about the end tag
if (fDocHandler)
{
fDocHandler->endElement
(
, elemDecl->getElementName()->getPrefix()
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
);
}
// If the elem stack is empty, then it was an empty root
if (isRoot)
gotData = false;
}
return true;
}
unsigned int
WFXMLScanner::resolveQName(const XMLCh* const qName
, XMLBuffer& prefixBuf
, const short mode
, int& prefixColonPos)
{
// Lets split out the qName into a URI and name buffer first. The URI
// can be empty.
prefixColonPos = XMLString::indexOf(qName, chColon);
if (prefixColonPos == -1)
{
// Its all name with no prefix, so put the whole thing into the name
// buffer. Then map the empty string to a URI, since the empty string
// represents the default namespace. This will either return some
// explicit URI which the default namespace is mapped to, or the
// the default global namespace.
bool unknown = false;
prefixBuf.reset();
return fElemStack.mapPrefixToURI(XMLUni::fgZeroLenString, (ElemStack::MapModes) mode, unknown);
}
else
{
// Copy the chars up to but not including the colon into the prefix
// buffer.
prefixBuf.set(qName, prefixColonPos);
// Watch for the special namespace prefixes. We always map these to
// special URIs. 'xml' gets mapped to the official URI that its defined
// to map to by the NS spec. xmlns gets mapped to a special place holder
// URI that we define (so that it maps to something checkable.)
const XMLCh* prefixRawBuf = prefixBuf.getRawBuffer();
if (XMLString::equals(prefixRawBuf, XMLUni::fgXMLNSString)) {
// if this is an element, it is an error to have xmlns as prefix
if (mode == ElemStack::Mode_Element)
emitError(XMLErrs::NoXMLNSAsElementPrefix, qName);
return fXMLNSNamespaceId;
}
else if (XMLString::equals(prefixRawBuf, XMLUni::fgXMLString)) {
unsigned int uriId = fElemStack.mapPrefixToURI(prefixRawBuf, (ElemStack::MapModes) mode, unknown);
emitError(XMLErrs::UnknownPrefix, prefixRawBuf);
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
return uriId;
}
}
}
// ---------------------------------------------------------------------------
// XMLScanner: Private parsing methods
// ---------------------------------------------------------------------------
bool WFXMLScanner::scanAttValue(const XMLCh* const attrName
, XMLBuffer& toFill)
{
// Reset the target buffer
toFill.reset();
// Get the next char which must be a single or double quote
XMLCh quoteCh;
if (!fReaderMgr.skipIfQuote(quoteCh))
return false;
// We have to get the current reader because we have to ignore closing
// quotes until we hit the same reader again.
const unsigned int curReader = fReaderMgr.getCurrentReaderNum();
// Loop until we get the attribute value. Note that we use a double
// loop here to avoid the setup/teardown overhead of the exception
// handler on every round.
XMLCh nextCh;
XMLCh secondCh = 0;
bool gotLeadingSurrogate = false;
bool escaped;
while (true)
{
try
{
while(true)
{
nextCh = fReaderMgr.getNextChar();
David Abram Cargill
committed
ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
// Check for our ending quote in the same entity
if (nextCh == quoteCh)
{
if (curReader == fReaderMgr.getCurrentReaderNum())
return true;
// Watch for spillover into a previous entity
if (curReader > fReaderMgr.getCurrentReaderNum())
{
emitError(XMLErrs::PartialMarkupInEntity);
return false;
}
}
// Check for an entity ref now, before we let it affect our
// whitespace normalization logic below. We ignore the empty flag
// in this one.
escaped = false;
if (nextCh == chAmpersand)
{
if (scanEntityRef(true, nextCh, secondCh, escaped) != EntityExp_Returned)
{
gotLeadingSurrogate = false;
continue;
}
}
else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
// Deal with surrogate pairs
// Its a leading surrogate. If we already got one, then
// issue an error, else set leading flag to make sure that
// we look for a trailing next time.
if (gotLeadingSurrogate)
{
emitError(XMLErrs::Expected2ndSurrogateChar);
gotLeadingSurrogate = true;
}
else
{
// If its a trailing surrogate, make sure that we are
// prepared for that. Else, its just a regular char so make
// sure that we were not expected a trailing surrogate.
if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
// Its trailing, so make sure we were expecting it
if (!gotLeadingSurrogate)
emitError(XMLErrs::Unexpected2ndSurrogateChar);
}
else
{
// Its just a char, so make sure we were not expecting a
// trailing surrogate.
if (gotLeadingSurrogate) {
emitError(XMLErrs::Expected2ndSurrogateChar);
// Its got to at least be a valid XML character
else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh))
XMLCh tmpBuf[9];
XMLString::binToText
(
nextCh
, tmpBuf
, 8
, 16
David Abram Cargill
committed
, fMemoryManager
);
emitError(XMLErrs::InvalidCharacterInAttrValue, attrName, tmpBuf);
}
// If its not escaped, then make sure its not a < character, which
// is not allowed in attribute values.
if (!escaped) {
if (nextCh == chOpenAngle)
nextCh = chSpace;
}
// Else add it to the buffer
toFill.append(nextCh);
if (secondCh)
Alberto Massari
committed
{
toFill.append(secondCh);
secondCh=0;
}
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
}
}
catch(const EndOfEntityException&)
{
// Just eat it and continue.
gotLeadingSurrogate = false;
escaped = false;
}
}
return true;
}
// This method scans a CDATA section. It collects the character into one
// of the temp buffers and calls the document handler, if any, with the
// characters. It assumes that the <![CDATA string has been scanned before
// this call.
void WFXMLScanner::scanCDSection()
{
static const XMLCh CDataClose[] =
{
chCloseSquare, chCloseAngle, chNull
};
// The next character should be the opening square bracket. If not
// issue an error, but then try to recover by skipping any whitespace
// and checking again.
if (!fReaderMgr.skippedChar(chOpenSquare))
{
emitError(XMLErrs::ExpectedOpenSquareBracket);
fReaderMgr.skipPastSpaces();
// If we still don't find it, then give up, else keep going
if (!fReaderMgr.skippedChar(chOpenSquare))
return;
}
// Get a buffer for this
XMLBufBid bbCData(&fBufMgr);
// We just scan forward until we hit the end of CDATA section sequence.
// CDATA is effectively a big escape mechanism so we don't treat markup
// characters specially here.
bool emittedError = false;
while (true)
{
const XMLCh nextCh = fReaderMgr.getNextChar();
// Watch for unexpected end of file
if (!nextCh)
{
emitError(XMLErrs::UnterminatedCDATASection);
David Abram Cargill
committed
ThrowXMLwithMemMgr(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF, fMemoryManager);
}
// If this is a close square bracket it could be our closing
// sequence.
if (nextCh == chCloseSquare && fReaderMgr.skippedString(CDataClose))
{
// make sure we were not expecting a trailing surrogate.
if (gotLeadingSurrogate)
emitError(XMLErrs::Expected2ndSurrogateChar);
// If we have a doc handler, call it
if (fDocHandler)
{
fDocHandler->docCharacters
(
bbCData.getRawBuffer()
, bbCData.getLen()
, true
);
}
// And we are done
break;
}
// Make sure its a valid character. But if we've emitted an error
// already, don't bother with the overhead since we've already told
// them about it.
if (!emittedError)
{
// Deal with surrogate pairs
if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
// Its a leading surrogate. If we already got one, then
// issue an error, else set leading flag to make sure that
// we look for a trailing next time.
if (gotLeadingSurrogate)
emitError(XMLErrs::Expected2ndSurrogateChar);
else
gotLeadingSurrogate = true;
}
else
{
// If its a trailing surrogate, make sure that we are
// prepared for that. Else, its just a regular char so make
// sure that we were not expected a trailing surrogate.
if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
{
// Its trailing, so make sure we were expecting it
if (!gotLeadingSurrogate)
emitError(XMLErrs::Unexpected2ndSurrogateChar);
}
else
{
// Its just a char, so make sure we were not expecting a
// trailing surrogate.
if (gotLeadingSurrogate)
emitError(XMLErrs::Expected2ndSurrogateChar);
// Its got to at least be a valid XML character
else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh))
{
XMLCh tmpBuf[9];
XMLString::binToText
(
nextCh
, tmpBuf
, 8
, 16
David Abram Cargill
committed
, fMemoryManager
);
emitError(XMLErrs::InvalidCharacter, tmpBuf);
emittedError = true;
}
}
gotLeadingSurrogate = false;
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
}
}
// Add it to the buffer
bbCData.append(nextCh);
}
}
// Scans character content up to (but not including) the next '<',
// accumulating it in 'toUse' and flushing it through sendCharData().
//
//  toUse   work buffer; reset on entry. Accumulated text is sent before each
//          entity reference is scanned, whenever an EndOfEntityException is
//          caught, and once more before returning.
//
// While scanning, unpaired surrogates, characters rejected by isXMLChar()
// and an unescaped "]]>" sequence are reported through emitError().
void WFXMLScanner::scanCharData(XMLBuffer& toUse)
{
    //  We have to watch for the stupid ]]> sequence, which is illegal in
    //  character data. So this is a little state machine that handles that.
    enum States
    {
        State_Waiting
        , State_GotOne
        , State_GotTwo
    };

    // Reset the buffer before we start
    toUse.reset();

    // Turn on the 'throw at end' flag of the reader manager
    ThrowEOEJanitor jan(&fReaderMgr, true);

    //  In order to be more efficient we have to use kind of a deeply nested
    //  set of blocks here. The outer block puts on a try and catches end of
    //  entity exceptions. The inner loop is the per-character loop. If we
    //  put the try inside the inner loop, it would work but would require
    //  the exception handling code setup/teardown code to be invoked for
    //  each character.
    XMLCh   nextCh;
    XMLCh   secondCh = 0;
    States  curState = State_Waiting;
    bool    escaped = false;
    bool    gotLeadingSurrogate = false;
    bool    notDone = true;
    while (notDone)
    {
        try
        {
            while (true)
            {
                //  Eat through as many plain content characters as possible without
                //  needing special handling.  Moving most content characters here,
                //  in this one call, rather than running the overall loop once
                //  per content character, is a speed optimization.
                if (curState == State_Waiting  &&  !gotLeadingSurrogate)
                    fReaderMgr.movePlainContentChars(toUse);

                //  Try to get another char from the source
                //  The code from here on down covers all contingencies,
                if (!fReaderMgr.getNextCharIfNot(chOpenAngle, nextCh))
                {
                    // If we were waiting for a trailing surrogate, its an error
                    if (gotLeadingSurrogate)
                        emitError(XMLErrs::Expected2ndSurrogateChar);

                    notDone = false;
                    break;
                }

                //  Watch for a reference. Note that the escapement mechanism
                //  is ignored in this content.
                escaped = false;
                if (nextCh == chAmpersand)
                {
                    sendCharData(toUse);

                    // Turn off the throwing at the end of entity during this
                    ThrowEOEJanitor jan(&fReaderMgr, false);

                    if (scanEntityRef(false, nextCh, secondCh, escaped) != EntityExp_Returned)
                    {
                        gotLeadingSurrogate = false;
                        continue;
                    }
                }
                else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
                {
                    // Deal with surrogate pairs
                    //  Its a leading surrogate. If we already got one, then
                    //  issue an error, else set leading flag to make sure that
                    //  we look for a trailing next time.
                    if (gotLeadingSurrogate)
                    {
                        emitError(XMLErrs::Expected2ndSurrogateChar);
                    }
                    else
                    {
                        gotLeadingSurrogate = true;
                    }
                }
                else
                {
                    //  If its a trailing surrogate, make sure that we are
                    //  prepared for that. Else, its just a regular char so make
                    //  sure that we were not expected a trailing surrogate.
                    if ((nextCh >= 0xDC00) && (nextCh <= 0xDFFF))
                    {
                        // Its trailing, so make sure we were expecting it
                        if (!gotLeadingSurrogate)
                            emitError(XMLErrs::Unexpected2ndSurrogateChar);
                    }
                    else
                    {
                        //  Its just a char, so make sure we were not expecting a
                        //  trailing surrogate.
                        if (gotLeadingSurrogate) {
                            emitError(XMLErrs::Expected2ndSurrogateChar);
                        }
                        // Its got to at least be a valid XML character
                        else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh))
                        {
                            // Report the offending code unit as hex text
                            XMLCh tmpBuf[9];
                            XMLString::binToText
                            (
                                nextCh
                                , tmpBuf
                                , 8
                                , 16
                                , fMemoryManager
                            );
                            emitError(XMLErrs::InvalidCharacter, tmpBuf);
                        }
                    }

                    // Whatever it was, a pending leading surrogate is consumed
                    gotLeadingSurrogate = false;
                }

                // Keep the state machine up to date
                if (!escaped)
                {
                    if (nextCh == chCloseSquare)
                    {
                        if (curState == State_Waiting)
                            curState = State_GotOne;
                        else if (curState == State_GotOne)
                            curState = State_GotTwo;
                    }
                    else if (nextCh == chCloseAngle)
                    {
                        // "]]" followed by '>' is the forbidden sequence
                        if (curState == State_GotTwo)
                            emitError(XMLErrs::BadSequenceInCharData);
                        curState = State_Waiting;
                    }
                    else
                    {
                        curState = State_Waiting;
                    }
                }
                else
                {
                    // Characters that came from a reference never count
                    curState = State_Waiting;
                }

                // Add this char to the buffer
                toUse.append(nextCh);

                // A reference may have produced a second code unit
                if (secondCh)
                {
                    toUse.append(secondCh);
                    secondCh=0;
                }
            }
        }
        catch(const EndOfEntityException& toCatch)
        {
            //  Some entity ended, so we have to send any accumulated
            //  chars and send an end of entity event.
            sendCharData(toUse);
            gotLeadingSurrogate = false;

            if (fDocHandler)
                fDocHandler->endEntityReference(toCatch.getEntity());
        }
    }

    // Send any char data that we accumulated into the buffer
    sendCharData(toUse);
}
// Both identifiers are ignored (the parameters are commented out) and a null
// pointer is returned.
// NOTE(review): the body was absent from this copy of the file; the minimal
// 'return 0;' body is restored here - confirm against the upstream sources.
InputSource* WFXMLScanner::resolveSystemId(const XMLCh* const /*sysId*/
                                          ,const XMLCh* const /*pubId*/)
{
    return 0;
}
//  This method scans a general/character entity ref, the leading '&' of
//  which has already been consumed by the caller. A character reference is
//  expanded via scanCharRef(); a named reference is looked up in
//  fEntityTable and its single replacement character is returned.
//
//  The first (unnamed) bool parameter is not used by this scanner.
//
//  firstCh     receives the (first) replacement character
//  secondCh    always cleared on entry; only scanCharRef() may fill it
//  escaped     cleared on entry and set to true whenever a value is
//              returned, so callers treat the result as literal data
//
//  Returns EntityExp_Returned when firstCh/secondCh hold the expansion and
//  EntityExp_Failed when the reference could not be scanned or the name is
//  not present in fEntityTable.
XMLScanner::EntityExpRes
WFXMLScanner::scanEntityRef(const   bool
                            ,       XMLCh&  firstCh
                            ,       XMLCh&  secondCh
                            ,       bool&   escaped)
{
    // Assume no escape
    secondCh = 0;
    escaped = false;

    // We have to insure that its all in one entity
    const unsigned int curReader = fReaderMgr.getCurrentReaderNum();

    //  If the next char is a pound, then its a character reference and we
    //  need to expand it always.
    if (fReaderMgr.skippedChar(chPound))
    {
        //  Its a character reference, so scan it and get back the numeric
        //  value it represents.
        if (!scanCharRef(firstCh, secondCh))
            return EntityExp_Failed;

        escaped = true;

        // The reference must end in the reader it started in
        if (curReader != fReaderMgr.getCurrentReaderNum())
            emitError(XMLErrs::PartialMarkupInEntity);

        return EntityExp_Returned;
    }

    // Expand it since its a normal entity ref
    XMLBufBid bbName(&fBufMgr);
    if (!fReaderMgr.getName(bbName.getBuffer()))
    {
        emitError(XMLErrs::ExpectedEntityRefName);
        return EntityExp_Failed;
    }

    //  Next char must be a semi-colon. But if its not, just emit
    //  an error and try to continue.
    if (!fReaderMgr.skippedChar(chSemiColon))
        emitError(XMLErrs::UnterminatedEntityRef, bbName.getRawBuffer());

    // Make sure we ended up on the same entity reader as the & char
    if (curReader != fReaderMgr.getCurrentReaderNum())
        emitError(XMLErrs::PartialMarkupInEntity);

    // Look up the name in the general entity pool
    // If it does not exist, then obviously an error
    if (!fEntityTable->containsKey(bbName.getRawBuffer()))
    {
        // XML 1.0 Section 4.1
        // Well-formedness Constraint for entity not found:
        //   In a document without any DTD, a document with only an internal DTD subset which contains no parameter entity references,
        //      or a document with "standalone='yes'", for an entity reference that does not occur within the external subset
        //      or a parameter entity
        if (fStandalone || fHasNoDTD)
            emitError(XMLErrs::EntityNotFound, bbName.getRawBuffer());

        return EntityExp_Failed;
    }

    // here's where we need to check if there's a SecurityManager,
    // how many entity references we've had
    if(fSecurityManager != 0 && ++fEntityExpansionCount > fEntityExpansionLimit) {
        // Render the limit as decimal text for the error message
        XMLCh expLimStr[16];
        XMLString::binToText(fEntityExpansionLimit, expLimStr, 15, 10, fMemoryManager);
        emitError
        (
            XMLErrs::EntityExpansionLimitExceeded
            , expLimStr
        );
        // there seems nothing better to be done than to reset the entity expansion counter
        fEntityExpansionCount = 0;
    }

    firstCh = fEntityTable->get(bbName.getRawBuffer());
    escaped = true;
    return EntityExp_Returned;
}
// ---------------------------------------------------------------------------
// WFXMLScanner: Grammar preparsing
// ---------------------------------------------------------------------------
// Grammar preparsing entry point. All three arguments are left unnamed and
// unused here; the call always hands back a null Grammar pointer.
Grammar* WFXMLScanner::loadGrammar(const   InputSource&
                                   , const short
                                   , const bool)
{
    // REVISIT: emit a warning or throw an exception
    Grammar* const noGrammar = 0;
    return noGrammar;
}
XERCES_CPP_NAMESPACE_END