diff --git a/src/xercesc/util/XMLString.hpp b/src/xercesc/util/XMLString.hpp index 2db9e94695a80ed8b5efb28c53567d3ebfabdd72..852ee8b16f0980fbcfc2ac0e94b5880783940d70 100644 --- a/src/xercesc/util/XMLString.hpp +++ b/src/xercesc/util/XMLString.hpp @@ -1532,19 +1532,14 @@ inline void XMLString::moveChars( XMLCh* const targetStr inline XMLSize_t XMLString::stringLen(const XMLCh* const src) { - if (src == 0 || *src == 0) - { + if (src == 0) return 0; - } - else - { - const XMLCh* pszTmp = src + 1; - while (*pszTmp) - ++pszTmp; + const XMLCh* pszTmp = src; - return (pszTmp - src); - } + while (*pszTmp++) ; + + return (pszTmp - src - 1); } inline XMLCh* XMLString::replicate(const XMLCh* const toRep, diff --git a/src/xercesc/validators/common/CMStateSet.hpp b/src/xercesc/validators/common/CMStateSet.hpp index 25cc29086ad238ea257be80be8db3e07e764a63c..0b3c693f32e3a6eb3a3176bd6541fa7fa57a6b4b 100644 --- a/src/xercesc/validators/common/CMStateSet.hpp +++ b/src/xercesc/validators/common/CMStateSet.hpp @@ -26,7 +26,7 @@ // // This class is a specialized bitset class for the content model code of // the validator. It assumes that its never called with two objects of -// different bit counts, and that bit sets smaller than 64 bits are far +// different bit counts, and that bit sets smaller than a threshold are far // and away the most common. So it can be a lot more optimized than a general // purpose utility bitset class // @@ -41,84 +41,112 @@ XERCES_CPP_NAMESPACE_BEGIN class CMStateSetEnumerator; +#define CMSTATE_CACHED_INT32_SIZE 4 + +#define CMSTATE_BITFIELD_CHUNK 1024 +#define CMSTATE_BITFIELD_INT32_SIZE (1024 / 32) + +struct CMDynamicBuffer +{ + // fArraySize + // This indicates the number of elements of the fBitArray vector + // + // fBitArray + // A vector of arrays of XMLInt32; each array is allocated on demand + // if a bit needs to be set in that range + // + // fMemoryManager + // The memory manager used to allocate and deallocate memory + // + XMLSize_t fArraySize; + XMLInt32** fBitArray; + MemoryManager* fMemoryManager; +}; + class CMStateSet : public XMemory { public : // ----------------------------------------------------------------------- // Constructors and Destructor // ----------------------------------------------------------------------- - CMStateSet( const unsigned int bitCount + CMStateSet( const XMLSize_t bitCount , MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) : fBitCount(bitCount) - , fBitArray(0) - , fMemoryManager(manager) + , fDynamicBuffer(0) { // // See if we need to allocate the byte array or whether we can live - // within the 64 bit high performance scheme. + // within the cached bit high performance scheme. // - if (fBitCount > 64) + if (fBitCount > (CMSTATE_CACHED_INT32_SIZE * 32)) { - fArraySize = fBitCount / 32; - if (fBitCount % 32) - fArraySize++; - fBitArray = (XMLInt32*) fMemoryManager->allocate(fArraySize*sizeof(XMLInt32)); + fDynamicBuffer = (CMDynamicBuffer*)manager->allocate(sizeof(CMDynamicBuffer)); + fDynamicBuffer->fMemoryManager = manager; + // allocate an array of vectors, each one containing CMSTATE_BITFIELD_CHUNK bits + fDynamicBuffer->fArraySize = fBitCount / CMSTATE_BITFIELD_CHUNK; + if (fBitCount % CMSTATE_BITFIELD_CHUNK) + fDynamicBuffer->fArraySize++; + fDynamicBuffer->fBitArray = (XMLInt32**) fDynamicBuffer->fMemoryManager->allocate(fDynamicBuffer->fArraySize*sizeof(XMLInt32*)); + for(XMLSize_t index = 0; index < fDynamicBuffer->fArraySize; index++) + fDynamicBuffer->fBitArray[index]=NULL; } else { - fArraySize = 2; - fBitArray = fBits; + for (XMLSize_t index = 0; index < CMSTATE_CACHED_INT32_SIZE; index++) + fBits[index] = 0; } - - // Init all the bits to zero - zeroBits(); } - - /* - * This method with the 'for' statement (commented out) cannot be made inline - * because the antiquated CC (CFront) compiler under HPUX 10.20 does not allow - * the 'for' statement inside any inline method. Unfortunately, - * we have to support it. So instead, we use memcpy(). - */ - CMStateSet(const CMStateSet& toCopy) : XMemory(toCopy) , fBitCount(toCopy.fBitCount) - , fBitArray(0) - , fMemoryManager(toCopy.fMemoryManager) + , fDynamicBuffer(0) { // // See if we need to allocate the byte array or whether we can live - // within the 64 bit high performance scheme. + // within the cahced bit high performance scheme. // - if (fBitCount > 64) + if (fBitCount > (CMSTATE_CACHED_INT32_SIZE * 32)) { - fArraySize = fBitCount / 32; - if (fBitCount % 32) - fArraySize++; - fBitArray = (XMLInt32*) fMemoryManager->allocate(fArraySize*sizeof(XMLInt32)); + fDynamicBuffer = (CMDynamicBuffer*) toCopy.fDynamicBuffer->fMemoryManager->allocate(sizeof(CMDynamicBuffer)); + fDynamicBuffer->fMemoryManager = toCopy.fDynamicBuffer->fMemoryManager; + fDynamicBuffer->fArraySize = fBitCount / CMSTATE_BITFIELD_CHUNK; + if (fBitCount % CMSTATE_BITFIELD_CHUNK) + fDynamicBuffer->fArraySize++; + fDynamicBuffer->fBitArray = (XMLInt32**) fDynamicBuffer->fMemoryManager->allocate(fDynamicBuffer->fArraySize*sizeof(XMLInt32*)); + for(XMLSize_t index = 0; index < fDynamicBuffer->fArraySize; index++) + { + if(toCopy.fDynamicBuffer->fBitArray[index]!=NULL) + { + fDynamicBuffer->fBitArray[index]=(XMLInt32*)fDynamicBuffer->fMemoryManager->allocate(CMSTATE_BITFIELD_INT32_SIZE * sizeof(XMLInt32)); + memcpy((void *) fDynamicBuffer->fBitArray[index], + (const void *) toCopy.fDynamicBuffer->fBitArray[index], + CMSTATE_BITFIELD_INT32_SIZE * sizeof(XMLInt32)); + } + else + fDynamicBuffer->fBitArray[index]=NULL; + } } else { - fArraySize = 2; - fBitArray = fBits; + memcpy((void *) fBits, + (const void *) toCopy.fBits, + CMSTATE_CACHED_INT32_SIZE * sizeof(XMLInt32)); } - - - memcpy((void *) fBitArray, - (const void *) toCopy.fBitArray, - fArraySize * sizeof(XMLInt32)); - - // for (unsigned int index = 0; index < fArraySize; index++) - // fBitArray[index] = toCopy.fBitArray[index]; } ~CMStateSet() { - if (fBitArray!=fBits) - fMemoryManager->deallocate(fBitArray); + if(fDynamicBuffer) + { + for(XMLSize_t index = 0; index < fDynamicBuffer->fArraySize; index++) + { + if(fDynamicBuffer->fBitArray[index]!=NULL) + fDynamicBuffer->fMemoryManager->deallocate(fDynamicBuffer->fBitArray[index]); + } + fDynamicBuffer->fMemoryManager->deallocate(fDynamicBuffer); + } } @@ -127,8 +155,40 @@ public : // ----------------------------------------------------------------------- void operator|=(const CMStateSet& setToOr) { - for (unsigned int index = 0; index < fArraySize; index++) - fBitArray[index] |= setToOr.fBitArray[index]; + if(fDynamicBuffer==0) + { + for (XMLSize_t index = 0; index < CMSTATE_CACHED_INT32_SIZE; index++) + if(setToOr.fBits[index]) + if(fBits[index]) + fBits[index] |= setToOr.fBits[index]; + else + fBits[index] = setToOr.fBits[index]; + } + else + { + for (XMLSize_t index = 0; index < fDynamicBuffer->fArraySize; index++) + if(setToOr.fDynamicBuffer->fBitArray[index]!=NULL) + { + // if we haven't allocated the subvector yet, allocate it and copy + if(fDynamicBuffer->fBitArray[index]==NULL) + { + fDynamicBuffer->fBitArray[index]=(XMLInt32*)fDynamicBuffer->fMemoryManager->allocate(CMSTATE_BITFIELD_INT32_SIZE * sizeof(XMLInt32)); + memcpy((void *) fDynamicBuffer->fBitArray[index], + (const void *) setToOr.fDynamicBuffer->fBitArray[index], + CMSTATE_BITFIELD_INT32_SIZE * sizeof(XMLInt32)); + } + else + { + // otherwise, merge them + for(XMLSize_t subIndex = 0; subIndex < CMSTATE_BITFIELD_INT32_SIZE; subIndex++) + if(setToOr.fDynamicBuffer->fBitArray[index][subIndex]) + if(fDynamicBuffer->fBitArray[index][subIndex]) + fDynamicBuffer->fBitArray[index][subIndex] |= setToOr.fDynamicBuffer->fBitArray[index][subIndex]; + else + fDynamicBuffer->fBitArray[index][subIndex] = setToOr.fDynamicBuffer->fBitArray[index][subIndex]; + } + } + } } bool operator==(const CMStateSet& setToCompare) const @@ -136,10 +196,29 @@ public : if (fBitCount != setToCompare.fBitCount) return false; - for (unsigned int index = 0; index < fArraySize; index++) + if(fDynamicBuffer==0) { - if (fBitArray[index] != setToCompare.fBitArray[index]) - return false; + for (XMLSize_t index = 0; index < CMSTATE_CACHED_INT32_SIZE; index++) + { + if (fBits[index] != setToCompare.fBits[index]) + return false; + } + } + else + { + for (XMLSize_t index = 0; index < fDynamicBuffer->fArraySize; index++) + { + if(fDynamicBuffer->fBitArray[index]==NULL && setToCompare.fDynamicBuffer->fBitArray[index]==NULL) + continue; + else if(fDynamicBuffer->fBitArray[index]==NULL || setToCompare.fDynamicBuffer->fBitArray[index]==NULL) // the other should have been empty too + return false; + else + { + for(XMLSize_t subIndex = 0; subIndex < CMSTATE_BITFIELD_INT32_SIZE; subIndex++) + if(fDynamicBuffer->fBitArray[index][subIndex]!=setToCompare.fDynamicBuffer->fBitArray[index][subIndex]) + return false; + } + } } return true; } @@ -151,60 +230,207 @@ public : // They have to be the same size if (fBitCount != srcSet.fBitCount) - ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Bitset_NotEqualSize, fMemoryManager); - - for (unsigned int index = 0; index < fArraySize; index++) - fBitArray[index] = srcSet.fBitArray[index]; + if(fDynamicBuffer) + ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Bitset_NotEqualSize, fDynamicBuffer->fMemoryManager); + else + ThrowXML(RuntimeException, XMLExcepts::Bitset_NotEqualSize); + if(fDynamicBuffer==0) + { + for (XMLSize_t index = 0; index < CMSTATE_CACHED_INT32_SIZE; index++) + fBits[index] = srcSet.fBits[index]; + } + else + { + for (XMLSize_t index = 0; index < fDynamicBuffer->fArraySize; index++) + if(srcSet.fDynamicBuffer->fBitArray[index]==NULL) + { + // delete this subentry + if(fDynamicBuffer->fBitArray[index]!=NULL) + { + fDynamicBuffer->fMemoryManager->deallocate(fDynamicBuffer->fBitArray[index]); + fDynamicBuffer->fBitArray[index]=NULL; + } + } + else + { + // if we haven't allocated the subvector yet, allocate it and copy + if(fDynamicBuffer->fBitArray[index]==NULL) + fDynamicBuffer->fBitArray[index]=(XMLInt32*)fDynamicBuffer->fMemoryManager->allocate(CMSTATE_BITFIELD_INT32_SIZE * sizeof(XMLInt32)); + memcpy((void *) fDynamicBuffer->fBitArray[index], + (const void *) srcSet.fDynamicBuffer->fBitArray[index], + CMSTATE_BITFIELD_INT32_SIZE * sizeof(XMLInt32)); + } + } return *this; } + XMLSize_t getBitCountInRange(XMLSize_t start, XMLSize_t end) const + { + XMLSize_t count = 0; + end /= 32; + if(fDynamicBuffer==0) + { + if(end > CMSTATE_CACHED_INT32_SIZE) + end = CMSTATE_CACHED_INT32_SIZE; + for (XMLSize_t index = start / 32; index < end; index++) + { + if (fBits[index] != 0) + for(int i=0;i<32;i++) + { + XMLInt32 mask=(1UL << i); + if(fBits[index] & mask) + count++; + } + } + } + else + { + if(end > fDynamicBuffer->fArraySize) + end = fDynamicBuffer->fArraySize; + for (XMLSize_t index = start / 32; index < end; index++) + { + if(fDynamicBuffer->fBitArray[index]==NULL) + continue; + for(XMLSize_t subIndex=0;subIndex < CMSTATE_BITFIELD_INT32_SIZE; subIndex++) + { + if (fDynamicBuffer->fBitArray[index][subIndex] != 0) + for(int i=0;i<32;i++) + { + XMLInt32 mask=(1UL << i); + if(fDynamicBuffer->fBitArray[index][subIndex] & mask) + count++; + } + } + } + } + return count; + } - bool getBit(const unsigned int bitToGet) const + bool getBit(const XMLSize_t bitToGet) const { if (bitToGet >= fBitCount) - ThrowXMLwithMemMgr(ArrayIndexOutOfBoundsException, XMLExcepts::Bitset_BadIndex, fMemoryManager); + if(fDynamicBuffer) + ThrowXMLwithMemMgr(ArrayIndexOutOfBoundsException, XMLExcepts::Bitset_BadIndex, fDynamicBuffer->fMemoryManager); + else + ThrowXML(ArrayIndexOutOfBoundsException, XMLExcepts::Bitset_BadIndex); - const XMLInt32 mask = (0x1UL << (bitToGet % 32)); - const unsigned int byteOfs = bitToGet / 32; // And access the right bit and byte - return (fBitArray[byteOfs]!=0 && (fBitArray[byteOfs] & mask) != 0); + if(fDynamicBuffer==0) + { + const XMLInt32 mask = (0x1UL << (bitToGet % 32)); + const XMLSize_t byteOfs = bitToGet / 32; + return (fBits[byteOfs]!=0 && (fBits[byteOfs] & mask) != 0); + } + else + { + const XMLSize_t vectorOfs = bitToGet / CMSTATE_BITFIELD_CHUNK; + if(fDynamicBuffer->fBitArray[vectorOfs]==NULL) + return false; + const XMLInt32 mask = (0x1UL << (bitToGet % 32)); + const XMLSize_t byteOfs = (bitToGet % CMSTATE_BITFIELD_CHUNK) / 32; + return (fDynamicBuffer->fBitArray[vectorOfs][byteOfs]!=0 && (fDynamicBuffer->fBitArray[vectorOfs][byteOfs] & mask) != 0); + } } bool isEmpty() const { - for (unsigned int index = 0; index < fArraySize; index++) + if(fDynamicBuffer==0) { - if (fBitArray[index] != 0) - return false; + for (XMLSize_t index = 0; index < CMSTATE_CACHED_INT32_SIZE; index++) + { + if (fBits[index] != 0) + return false; + } + } + else + { + for (XMLSize_t index = 0; index < fDynamicBuffer->fArraySize; index++) + { + if(fDynamicBuffer->fBitArray[index]==NULL) + continue; + for(XMLSize_t subIndex=0;subIndex < CMSTATE_BITFIELD_INT32_SIZE; subIndex++) + { + if (fDynamicBuffer->fBitArray[index][subIndex] != 0) + return false; + } + } } return true; } - void setBit(const unsigned int bitToSet) + void setBit(const XMLSize_t bitToSet) { if (bitToSet >= fBitCount) - ThrowXMLwithMemMgr(ArrayIndexOutOfBoundsException, XMLExcepts::Bitset_BadIndex, fMemoryManager); + if(fDynamicBuffer) + ThrowXMLwithMemMgr(ArrayIndexOutOfBoundsException, XMLExcepts::Bitset_BadIndex, fDynamicBuffer->fMemoryManager); + else + ThrowXML(ArrayIndexOutOfBoundsException, XMLExcepts::Bitset_BadIndex); const XMLInt32 mask = (0x1UL << (bitToSet % 32)); - const unsigned int byteOfs = bitToSet / 32; // And access the right bit and byte - fBitArray[byteOfs] &= ~mask; - fBitArray[byteOfs] |= mask; + if(fDynamicBuffer==0) + { + const XMLSize_t byteOfs = bitToSet / 32; + fBits[byteOfs] &= ~mask; + fBits[byteOfs] |= mask; + } + else + { + const XMLSize_t vectorOfs = bitToSet / CMSTATE_BITFIELD_CHUNK; + if(fDynamicBuffer->fBitArray[vectorOfs]==NULL) + { + fDynamicBuffer->fBitArray[vectorOfs]=(XMLInt32*)fDynamicBuffer->fMemoryManager->allocate(CMSTATE_BITFIELD_INT32_SIZE * sizeof(XMLInt32)); + for(XMLSize_t index=0;index < CMSTATE_BITFIELD_INT32_SIZE; index++) + fDynamicBuffer->fBitArray[vectorOfs][index]=0; + } + const XMLSize_t byteOfs = (bitToSet % CMSTATE_BITFIELD_CHUNK) / 32; + fDynamicBuffer->fBitArray[vectorOfs][byteOfs] &= ~mask; + fDynamicBuffer->fBitArray[vectorOfs][byteOfs] |= mask; + } } void zeroBits() { - for (unsigned int index = 0; index < fArraySize; index++) - fBitArray[index] = 0; + if(fDynamicBuffer==0) + { + for (XMLSize_t index = 0; index < CMSTATE_CACHED_INT32_SIZE; index++) + fBits[index] = 0; + } + else + { + for (XMLSize_t index = 0; index < fDynamicBuffer->fArraySize; index++) + // delete this subentry + if(fDynamicBuffer->fBitArray[index]!=NULL) + { + fDynamicBuffer->fMemoryManager->deallocate(fDynamicBuffer->fBitArray[index]); + fDynamicBuffer->fBitArray[index]=NULL; + } + } } XMLSize_t hashCode() const { XMLSize_t hash = 0; - for (XMLSize_t index = 0; index<fArraySize; index++) - hash = fBitArray[index] + hash * 31; + if(fDynamicBuffer==0) + { + for (XMLSize_t index = 0; index<CMSTATE_CACHED_INT32_SIZE; index++) + hash = fBits[index] + hash * 31; + } + else + { + for (XMLSize_t index = 0; index<fDynamicBuffer->fArraySize; index++) + { + if(fDynamicBuffer->fBitArray[index]==NULL) + // simulates the iteration on the missing bits + for(XMLSize_t subIndex=0;subIndex < CMSTATE_BITFIELD_INT32_SIZE; subIndex++) + hash *= 31; + else + for(XMLSize_t subIndex=0;subIndex < CMSTATE_BITFIELD_INT32_SIZE; subIndex++) + hash = fDynamicBuffer->fBitArray[index][subIndex] + hash * 31; + } + } return hash; } @@ -222,26 +448,21 @@ private : // The count of bits that the outside world wants to support, // so its the max bit index plus one. // - // fArraySize - // If the bit count is > 64, then we use the fBitArray member to - // store the bits, and this indicates its size in bytes. Otherwise - // its value is meaningless and unset. - // // fBits - // When the bit count is <= 64 (very common), these hold the bits. - // Otherwise, the fBitArray member holds htem. + // When the bit count is less than a threshold (very common), these hold the bits. + // Otherwise, the fDynamicBuffer member holds htem. // - // fBitArray - // The array of bytes used when the bit count is > 64. It is - // allocated as required. + // fDynamicBuffer + // If the bit count is greater than the threshold, then we allocate this structure to + // store the bits, the length, and the memory manager to allocate/deallocate + // the memory + // // ----------------------------------------------------------------------- - unsigned int fBitCount; - unsigned int fArraySize; - XMLInt32 fBits[2]; - XMLInt32* fBitArray; - MemoryManager* fMemoryManager; + XMLSize_t fBitCount; + XMLInt32 fBits[CMSTATE_CACHED_INT32_SIZE]; + CMDynamicBuffer* fDynamicBuffer; - friend class CMStateSetEnumerator ; + friend class CMStateSetEnumerator; }; class CMStateSetEnumerator : public XMemory @@ -249,9 +470,8 @@ class CMStateSetEnumerator : public XMemory public: CMStateSetEnumerator(const CMStateSet* toEnum) : fToEnum(toEnum), - fIndexCount((unsigned int)-1), - fLastValue(0), - fByteArrayCursor(0) + fIndexCount((XMLSize_t)-1), + fLastValue(0) { findNext(); } @@ -269,7 +489,7 @@ public: if(fLastValue & mask) { fLastValue &= ~mask; - unsigned int retVal=fIndexCount+i; + unsigned int retVal=(unsigned int)fIndexCount+i; if(fLastValue==0) findNext(); return retVal; @@ -281,22 +501,43 @@ public: private: void findNext() { - unsigned int nOffset=((fIndexCount==(unsigned int)-1)?0:(fIndexCount/32)+1), i; - for(i=nOffset;i<fToEnum->fArraySize;i++) + if(fToEnum->fDynamicBuffer==0) + { + XMLSize_t nOffset=((fIndexCount==(XMLSize_t)-1)?0:(fIndexCount/32)+1); + for(XMLSize_t index=nOffset;index<CMSTATE_CACHED_INT32_SIZE;index++) + { + if(fToEnum->fBits[index]!=0) + { + fIndexCount=index*32; + fLastValue=fToEnum->fBits[index]; + return; + } + } + } + else { - if(fToEnum->fBitArray[i]!=0) + XMLSize_t nOffset=((fIndexCount==(XMLSize_t)-1)?0:(fIndexCount/CMSTATE_BITFIELD_CHUNK)); + XMLSize_t nSubOffset=((fIndexCount==(XMLSize_t)-1)?0:((fIndexCount % CMSTATE_BITFIELD_CHUNK) /32)+1); + for (XMLSize_t index = nOffset; index<fToEnum->fDynamicBuffer->fArraySize; index++) { - fIndexCount=i*32; - fLastValue=fToEnum->fBitArray[i]; - break; + if(fToEnum->fDynamicBuffer->fBitArray[index]!=NULL) + { + for(XMLSize_t subIndex=nSubOffset;subIndex < CMSTATE_BITFIELD_INT32_SIZE; subIndex++) + if(fToEnum->fDynamicBuffer->fBitArray[index][subIndex]!=0) + { + fIndexCount=index*CMSTATE_BITFIELD_CHUNK + subIndex*32; + fLastValue=fToEnum->fDynamicBuffer->fBitArray[index][subIndex]; + return; + } + } + nSubOffset = 0; // next chunks will be processed from the beginning } } } - const CMStateSet* fToEnum; - unsigned int fIndexCount; - XMLInt32 fLastValue; - unsigned int fByteArrayCursor; + const CMStateSet* fToEnum; + XMLSize_t fIndexCount; + XMLInt32 fLastValue; }; XERCES_CPP_NAMESPACE_END diff --git a/src/xercesc/validators/common/DFAContentModel.cpp b/src/xercesc/validators/common/DFAContentModel.cpp index df424b225867b3ba4034252ae64bb0bbbf795e33..97d8e9bdaac942f75274eaad46c4b435938278be 100644 --- a/src/xercesc/validators/common/DFAContentModel.cpp +++ b/src/xercesc/validators/common/DFAContentModel.cpp @@ -40,6 +40,7 @@ #include <xercesc/validators/schema/XercesElementWildcard.hpp> #include <xercesc/util/RefHashTableOf.hpp> #include <xercesc/util/XMLInteger.hpp> +#include <math.h> XERCES_CPP_NAMESPACE_BEGIN @@ -959,9 +960,11 @@ void DFAContentModel::buildDFA(ContentSpecNode* const curNode) // Bump up the unmarked state count, marking this state done unmarkedState++; +#ifdef OPTIMIZED_BUT_STILL_LINEAR_SEARCH // Optimization(Jan, 2001) unsigned int sorterIndex = 0; // Optimization(Jan, 2001) +#endif // Loop through each possible input symbol in the element map for (unsigned int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++) @@ -1034,30 +1037,68 @@ void DFAContentModel::buildDFA(ContentSpecNode* const curNode) unsigned int fNumItems=fLeafIndexes[0]; if(fNumItems!=0) { - CMStateSetEnumerator enumBits(setT); - while(enumBits.hasMoreElements()) + // The algorithm requires finding the leaf that is present both in the bitfield of the current state, and in the + // list of places where the currently tested item can appear. When this occurs, the follow list of this parent item + // is added to the bitfield representing the next state. + // Both the bitfield and the list of places are sorted, so we can analyze them in two ways; either iterating over the + // parent items, testing the bitfield for the existence of the parent (N times a constant Tb), or by iterating over the + // bitfield (restricted to the range of the sorted list of places), using a binary search to locate the leaf in the + // sorted list of places (M times log(N) testing operations Ts) + // Assuming that the time to test a bit is roughly the same of the time needed to compute the average of two integers, + // plus a couple of comparisons and additions, we compare N agains M*log(N) to decide which algorithm should be faster given + // the two sets + if(fNumItems <= setT->getBitCountInRange(fLeafIndexes[1], fLeafIndexes[fNumItems])*log((float)fNumItems)) { - unsigned int bitIndex=enumBits.nextElement(); - // Check if this leaf index (DFA position) is in the current set - // (using binary search: the indexes are sorted) - unsigned int first=1,last=fNumItems,i; - while(first<=last) - { - i=(first+last)/2; - if(fLeafIndexes[i]>bitIndex) - last=i-1; - else if(fLeafIndexes[i]<bitIndex) - first=i+1; - else + for(unsigned int i=1; i<=fNumItems; ++i) + if(setT->getBit(fLeafIndexes[i])) { - // found! // // If this leaf is the current input symbol, then we // want to add its follow list to the set of states to // transition to from the current state. // - *newSet |= *fFollowList[bitIndex]; + *newSet |= *fFollowList[ fLeafIndexes[i] ]; + } + } + else + { + // Further optimization: given that the bitfield enumerator returns the numbers in order, + // every time we raise the lower marker we know it will true also for the next bits, so + // the next binary search will not start from 1 but from this index + unsigned int lowIndex = 1; + CMStateSetEnumerator enumBits(setT); + while(enumBits.hasMoreElements()) + { + unsigned int bitIndex=enumBits.nextElement(); + // if this leaf has an index less than the first index in the sorted list of places, + // it cannot be here, so avoid starting the binary search + if(bitIndex < fLeafIndexes[1]) + continue; + // if this leaf is greater than the last index in the sorted list of places, + // nothing can be found from now on, so get out of here + else if(bitIndex > fLeafIndexes[fNumItems]) break; + + // Check if this leaf index (DFA position) is in the current set + // (using binary search: the indexes are sorted) + unsigned int first=lowIndex,last=fNumItems,i; + while(first<=last) + { + i=(first+last)/2; + if(fLeafIndexes[i]>bitIndex) + last=i-1; + else if(fLeafIndexes[i]<bitIndex) + lowIndex=first=i+1; + else + { + // + // If this leaf is the current input symbol, then we + // want to add its follow list to the set of states to + // transition to from the current state. + // + *newSet |= *fFollowList[bitIndex]; + break; + } } } } diff --git a/tests/src/DOM/DOMTest/DTest.cpp b/tests/src/DOM/DOMTest/DTest.cpp index 37f20963ab5ea23fb06485be235dbc86507f9b0c..5c2da0fdd87374b850a6a2dd2ea67d5758d1d58e 100644 --- a/tests/src/DOM/DOMTest/DTest.cpp +++ b/tests/src/DOM/DOMTest/DTest.cpp @@ -39,6 +39,7 @@ #include <xercesc/dom/DOMLSParserFilter.hpp> #include <xercesc/util/OutOfMemoryException.hpp> #include <xercesc/framework/MemBufInputSource.hpp> +#include <xercesc/validators/common/CMStateSet.hpp> #define UNUSED(x) { if(x!=0){} } @@ -1751,7 +1752,7 @@ bool DOMTest::testCharacterData(DOMDocument* document) // EXCEPTIONSTEST(charData->deleteData(2, -1), DOMException::INDEX_SIZE_ERR, OK, 102 ); EXCEPTIONSTEST(charData->deleteData(100, 5), DOMException::INDEX_SIZE_ERR, OK,103 ); -//can't set negative unsigned int in c++ compiler +//can't set negative XMLSize_t in c++ compiler // EXCEPTIONSTEST(charData->insertData(-1, "Stuff inserted"), DOMException::INDEX_SIZE_ERR, OK, 104 ); XMLString::transcode("Stuff inserted", tempStr, 3999); @@ -5608,5 +5609,110 @@ bool DOMTest::testUtilFunctions() XMLString::removeWS(tempStr); TEST_STRING(tempStr, tempStr2); + if(XMLString::stringLen((XMLCh*)0)!=0) + { + fprintf(stderr, "strLen test failed at line %i\n", __LINE__); + OK = false; + } + if(XMLString::stringLen(XMLUni::fgZeroLenString)!=0) + { + fprintf(stderr, "strLen test failed at line %i\n", __LINE__); + OK = false; + } + XMLCh one[2]={ chLatin_A, chNull }; + if(XMLString::stringLen(one)!=1) + { + fprintf(stderr, "strLen test failed at line %i\n", __LINE__); + OK = false; + } + XMLCh two[3]={ chLatin_A, chLatin_B, chNull }; + if(XMLString::stringLen(two)!=2) + { + fprintf(stderr, "strLen test failed at line %i\n", __LINE__); + OK = false; + } + + // this tests the cached bit storage + CMStateSet setT(60); + setT.setBit(8); + setT.setBit(52); + setT.setBit(34); + + if(!setT.getBit(8) || !setT.getBit(52) || !setT.getBit(34)) + { + fprintf(stderr, "bitset test failed at line %i\n", __LINE__); + OK = false; + } + + CMStateSetEnumerator enumT(&setT); + if(!enumT.hasMoreElements() || enumT.nextElement()!=8) + { + fprintf(stderr, "bitset test failed at line %i\n", __LINE__); + OK = false; + } + if(!enumT.hasMoreElements() || enumT.nextElement()!=34) + { + fprintf(stderr, "bitset test failed at line %i\n", __LINE__); + OK = false; + } + if(!enumT.hasMoreElements() || enumT.nextElement()!=52) + { + fprintf(stderr, "bitset test failed at line %i\n", __LINE__); + OK = false; + } + if(enumT.hasMoreElements()) + { + fprintf(stderr, "bitset test failed at line %i\n", __LINE__); + OK = false; + } + + // this tests the dynamic bit storage + CMStateSet setT2(3 * CMSTATE_BITFIELD_CHUNK); + setT2.setBit(0); // first block, begin + setT2.setBit(CMSTATE_BITFIELD_CHUNK/2 -1); // first block, middle + setT2.setBit(CMSTATE_BITFIELD_CHUNK/2); // first block, middle + setT2.setBit(CMSTATE_BITFIELD_CHUNK/2 +1); // first block, middle + setT2.setBit(CMSTATE_BITFIELD_CHUNK-1); // first block, end + setT2.setBit(2*CMSTATE_BITFIELD_CHUNK); // last block, begin + setT2.setBit(2*CMSTATE_BITFIELD_CHUNK + CMSTATE_BITFIELD_CHUNK/2 -1); // last block, middle + setT2.setBit(2*CMSTATE_BITFIELD_CHUNK + CMSTATE_BITFIELD_CHUNK/2); // last block, middle + setT2.setBit(2*CMSTATE_BITFIELD_CHUNK + CMSTATE_BITFIELD_CHUNK/2 +1); // last block, middle + setT2.setBit(3*CMSTATE_BITFIELD_CHUNK-1); // last block, end + + // test just a few ones + if(!setT2.getBit(0) || !setT2.getBit(CMSTATE_BITFIELD_CHUNK-1) || !setT2.getBit(2*CMSTATE_BITFIELD_CHUNK + CMSTATE_BITFIELD_CHUNK/2 +1)) + { + fprintf(stderr, "bitset test failed at line %i\n", __LINE__); + OK = false; + } + + if(setT2.getBitCountInRange(0, 3*CMSTATE_BITFIELD_CHUNK)!=10) + { + fprintf(stderr, "bitset test failed at line %i\n", __LINE__); + OK = false; + } + CMStateSetEnumerator enumT2(&setT2); + XMLSize_t count=0; + while(enumT2.hasMoreElements()) + { + count++; + enumT2.nextElement(); + } + if(count!=10) + { + fprintf(stderr, "bitset test failed at line %i\n", __LINE__); + OK = false; + } + + // this tests the hash generator + CMStateSet setT3(3 * CMSTATE_BITFIELD_CHUNK), setT4(3 * CMSTATE_BITFIELD_CHUNK); + // these two sets will have a single bit set at the beginning of a chunk + setT3.setBit(0); + setT4.setBit(CMSTATE_BITFIELD_CHUNK); + if(setT3.hashCode()==setT4.hashCode()) + { + fprintf(stderr, "bitset test failed at line %i\n", __LINE__); + OK = false; + } return OK; }