From 29a5f5dac92b1fa1891a71d1dcbdb75b5eae62ed Mon Sep 17 00:00:00 2001 From: David N Bertoni <dbertoni@apache.org> Date: Wed, 27 Jul 2005 18:56:33 +0000 Subject: [PATCH] Fixes for Jira issue XERCESC-1463. git-svn-id: https://svn.apache.org/repos/asf/xerces/c/trunk@225575 13f79535-47bb-0310-9956-ffa450edef68 --- .../util/Transcoders/ICU/ICUTransService.cpp | 168 ++++++++++++------ src/xercesc/util/regx/RangeToken.cpp | 53 +++++- src/xercesc/util/regx/RegularExpression.cpp | 80 ++++++--- src/xercesc/util/regx/RegularExpression.hpp | 12 +- src/xercesc/util/regx/RegxParser.cpp | 9 + src/xercesc/util/regx/RegxUtil.cpp | 7 +- src/xercesc/util/regx/RegxUtil.hpp | 9 + 7 files changed, 244 insertions(+), 94 deletions(-) diff --git a/src/xercesc/util/Transcoders/ICU/ICUTransService.cpp b/src/xercesc/util/Transcoders/ICU/ICUTransService.cpp index 9ba1aa819..3f9c90053 100644 --- a/src/xercesc/util/Transcoders/ICU/ICUTransService.cpp +++ b/src/xercesc/util/Transcoders/ICU/ICUTransService.cpp @@ -161,7 +161,7 @@ ICUTransService::~ICUTransService() * if we clean up here, users' code may crash * #if (U_ICU_VERSION_MAJOR_NUM >= 2) - // release all lasily allocated data + // release all lazily allocated data u_cleanup(); #endif */ @@ -174,29 +174,37 @@ ICUTransService::~ICUTransService() int ICUTransService::compareIString(const XMLCh* const comp1 , const XMLCh* const comp2) { - const XMLCh* psz1 = comp1; - const XMLCh* psz2 = comp2; + size_t i = 0; + size_t j = 0; - unsigned int curCount = 0; - while (true) + for(;;) { - // - // If an inequality, then return the difference. Note that the XMLCh - // might be bigger physically than UChar, but it won't hold anything - // larger than 0xFFFF, so our cast here will work for both possible - // sizes of XMLCh. - // - if (u_toupper(UChar(*psz1)) != u_toupper(UChar(*psz2))) - return int(*psz1) - int(*psz2); + UChar32 ch1; + UChar32 ch2; - // If either has ended, then they both ended, so equal - if (!*psz1 || !*psz2) - break; + U16_NEXT_UNSAFE(comp1, i, ch1); + U16_NEXT_UNSAFE(comp2, j, ch2); + + const UChar32 folded1 = + u_foldCase(ch1, U_FOLD_CASE_DEFAULT); - // Move upwards for the next round - psz1++; - psz2++; + const UChar32 folded2 = + u_foldCase(ch2, U_FOLD_CASE_DEFAULT); + + if (folded1 != + folded2) + { + return folded1 - folded2; + } + else if (ch1 == 0) + { + // If ch1 is 0, the ch2 must also be + // 0. Otherwise, the previous if + // would have failed. + break; + } } + return 0; } @@ -205,38 +213,49 @@ int ICUTransService::compareNIString(const XMLCh* const comp1 , const XMLCh* const comp2 , const unsigned int maxChars) { - const XMLCh* psz1 = comp1; - const XMLCh* psz2 = comp2; - - unsigned int curCount = 0; - while (true) + if (maxChars > 0) { - // - // If an inequality, then return the difference. Note that the XMLCh - // might be bigger physically than UChar, but it won't hold anything - // larger than 0xFFFF, so our cast here will work for both possible - // sizes of XMLCh. - // - if (u_toupper(UChar(*psz1)) != u_toupper(UChar(*psz2))) - return int(*psz1) - int(*psz2); + // Note that this function has somewhat broken semantics, as it's + // possible for two strings of different lengths to compare as equal + // in a case-insensitive manner, since one character could be + // represented as a surrogate pair. + size_t i = 0; + size_t j = 0; + + for(;;) + { + UChar32 ch1; + UChar32 ch2; - // If either ended, then both ended, so equal - if (!*psz1 || !*psz2) - break; + U16_NEXT_UNSAFE(comp1, i, ch1); + U16_NEXT_UNSAFE(comp2, j, ch2); - // Move upwards to next chars - psz1++; - psz2++; + const UChar32 folded1 = + u_foldCase(ch1, U_FOLD_CASE_DEFAULT); - // - // Bump the count of chars done. If it equals the count then we - // are equal for the requested count, so break out and return - // equal. - // - curCount++; - if (maxChars == curCount) - break; + const UChar32 folded2 = + u_foldCase(ch2, U_FOLD_CASE_DEFAULT); + + if (folded1 != folded2) + { + return folded1 - folded2; + } + else if (i == maxChars) + { + // If we're at the end of both strings, return 0. + // Otherwise, we've run out of characters in the + // left string, so return -1. + return j == maxChars ? 0 : -1; + } + else if (j == maxChars) + { + // We've run out of characters in the right string, + // but not the left, so return 1. + return 1; + } + } } + return 0; } @@ -289,24 +308,59 @@ bool ICUTransService::supportsSrcOfs() const } -void ICUTransService::upperCase(XMLCh* const toUpperCase) const +template <class FunctionType> +static void +doCaseConvert( + XMLCh* convertString, + FunctionType caseFunction) { - XMLCh* outPtr = toUpperCase; - while (*outPtr) + // Note the semantics of this function are broken, since it's + // possible that changing the case of a string could increase + // its length, but there's no way to handle such a situation. + const unsigned int len = + XMLString::stringLen(convertString); + + size_t readPos = 0; + size_t writePos = 0; + + while(readPos < len) { - *outPtr = XMLCh(u_toupper(UChar(*outPtr))); - outPtr++; + UChar32 original; + + // Get the next Unicode code point. + U16_NEXT_UNSAFE(convertString, readPos, original); + + // Convert the code point + const UChar32 converted = caseFunction(original); + + // OK, now here's where it gets ugly. + if (!U_IS_BMP(converted) && U_IS_BMP(original) && + readPos - writePos == 1) + { + // We do not have room to convert the + // character without overwriting the next + // character, so we will just stop. + break; + } + else + { + U16_APPEND_UNSAFE(convertString, writePos, converted); + } } + + convertString[writePos] = 0; +} + + + +void ICUTransService::upperCase(XMLCh* const toUpperCase) const +{ + doCaseConvert(toUpperCase, u_toupper); } void ICUTransService::lowerCase(XMLCh* const toLowerCase) const { - XMLCh* outPtr = toLowerCase; - while (*outPtr) - { - *outPtr = XMLCh(u_tolower(UChar(*outPtr))); - outPtr++; - } + doCaseConvert(toLowerCase, u_tolower); } diff --git a/src/xercesc/util/regx/RangeToken.cpp b/src/xercesc/util/regx/RangeToken.cpp index 7dd947871..3980e0922 100644 --- a/src/xercesc/util/regx/RangeToken.cpp +++ b/src/xercesc/util/regx/RangeToken.cpp @@ -27,6 +27,12 @@ #include <xercesc/util/regx/TokenFactory.hpp> #include <xercesc/util/IllegalArgumentException.hpp> +#if defined(XML_USE_ICU_TRANSCODER) || defined (XML_USE_UNICONV390_TRANSCODER) +#include <unicode/uchar.h> +#else +#include <xercesc/util/XMLUniDefs.hpp> +#endif + XERCES_CPP_NAMESPACE_BEGIN // --------------------------------------------------------------------------- @@ -66,15 +72,55 @@ RangeToken::~RangeToken() { // --------------------------------------------------------------------------- RangeToken* RangeToken::getCaseInsensitiveToken(TokenFactory* const tokFactory) { - // REVIST - // We will not build a token with case insenstive ranges - // For now we will return a copy of ourselves. if (fCaseIToken == 0 && tokFactory) { bool isNRange = (getTokenType() == T_NRANGE) ? true : false; RangeToken* lwrToken = tokFactory->createRange(isNRange); + for (unsigned int i = 0; i < fElemCount - 1; i += 2) { + for (XMLInt32 ch = fRanges[i]; ch <= fRanges[i + 1]; ++ch) { +#if defined(XML_USE_ICU_TRANSCODER) || defined (XML_USE_UNICONV390_TRANSCODER) + const XMLInt32 upperCh = u_toupper(ch); + + if (upperCh != ch) + { + lwrToken->addRange(upperCh, upperCh); + } + + const XMLInt32 lowerCh = u_tolower(ch); + + if (lowerCh != ch) + { + lwrToken->addRange(lowerCh, lowerCh); + } + + const XMLInt32 titleCh = u_totitle(ch); + + if (titleCh != ch && titleCh != upperCh) + { + lwrToken->addRange(titleCh, titleCh); + } +#else + if (ch >= chLatin_A && ch <= chLatin_Z) + { + ch += chLatin_a - chLatin_A; + + lwrToken->addRange(ch, ch); + } + else if (ch >= chLatin_a && ch <= chLatin_z) + { + ch -= chLatin_a - chLatin_A; + + lwrToken->addRange(ch, ch); + } +#endif + } + } + lwrToken->mergeRanges(this); + lwrToken->compactRanges(); + lwrToken->createMap(); + fCaseIToken = lwrToken; } @@ -259,6 +305,7 @@ void RangeToken::mergeRanges(const Token *const tok) { } fElemCount = rangeTok->fElemCount; + fSorted = true; return; } diff --git a/src/xercesc/util/regx/RegularExpression.cpp b/src/xercesc/util/regx/RegularExpression.cpp index de420f097..d8927a53d 100644 --- a/src/xercesc/util/regx/RegularExpression.cpp +++ b/src/xercesc/util/regx/RegularExpression.cpp @@ -23,7 +23,6 @@ // --------------------------------------------------------------------------- #include <xercesc/util/regx/RegularExpression.hpp> #include <xercesc/util/PlatformUtils.hpp> -#include <xercesc/util/regx/RegxUtil.hpp> #include <xercesc/util/regx/Match.hpp> #include <xercesc/util/regx/RangeToken.hpp> #include <xercesc/util/regx/RegxDefs.hpp> @@ -36,6 +35,7 @@ #include <xercesc/util/OutOfMemoryException.hpp> #include <xercesc/util/XMLInitializer.hpp> #include <xercesc/util/XMLRegisterCleanup.hpp> +#include <xercesc/util/XMLUniDefs.hpp> XERCES_CPP_NAMESPACE_BEGIN @@ -69,6 +69,55 @@ static XMLRegisterCleanup WordRangeCleanup; +bool RegularExpression::matchIgnoreCase(const XMLInt32 ch1, + const XMLInt32 ch2) +{ + if (ch1 >= 0x10000) + { + XMLCh string1[2]; + XMLCh string2[2]; + + RegxUtil::decomposeToSurrogates(ch1, string1[0], string1[1]); + + if (ch2 >= 0x10000) + { + RegxUtil::decomposeToSurrogates(ch2, string2[0], string2[1]); + } + else + { + // XMLString::compareNIString is broken, because it assume the + // two strings must be of the same length. Note that two strings + // of different length could compare as equal, because there is no + // guarantee that a Unicode code point that is encoded in UTF-16 as + // a surrogate pair does not have a case mapping to a code point + // that is not in the surrogate range. Just to be safe, we pad the + // shorter string with a space, which cannot hvae a case mapping. + string2[0] = (XMLCh)ch2; + string2[1] = chSpace; + } + + return (0==XMLString::compareNIString(string1, string2, 2)); + } + else if (ch2 >= 0x10000) + { + const XMLCh string1[2] = { (XMLCh)ch1, chSpace }; + XMLCh string2[2]; + + RegxUtil::decomposeToSurrogates(ch2, string2[0], string2[1]); + + return (0==XMLString::compareNIString(string1, string2, 2)); + } + else + { + const XMLCh char1 = (XMLCh)ch1; + const XMLCh char2 = (XMLCh)ch2; + + return (0==XMLString::compareNIString(&char1, &char2, 1)); + } + } + + + // --------------------------------------------------------------------------- // RegularExpression::Context: Constructors and Destructor // --------------------------------------------------------------------------- @@ -540,11 +589,6 @@ bool RegularExpression::matches(const XMLCh* const expression, const int start, if (!range->match(ch)) { - if (!ignoreCase) - continue; - - // Perform case insensitive match - // REVISIT continue; } @@ -1098,21 +1142,10 @@ bool RegularExpression::matchRange(Context* const context, const Op* const op, bool match = false; if (ignoreCase) { - - //REVISIT we should match ignoring case, but for now - //we will do a normal match - //tok = tok->getCaseInsensitiveToken(); - //if (!token->match(strCh)) { - - // if (strCh > 0x10000) - // return -1; - // Do case insensitive matching - uppercase match - // or lowercase match - //} - match = tok->match(strCh); + tok = tok->getCaseInsensitiveToken(fTokenFactory); } - else - match = tok->match(strCh); + + match = tok->match(strCh); if (!match) return false; @@ -1498,7 +1531,12 @@ void RegularExpression::prepare() { } rangeTok->createMap(); - } + + if (isSet(fOptions, IGNORE_CASE)) + { + rangeTok->getCaseInsensitiveToken(fTokenFactory); + } + } if (fOperations != 0 && fOperations->getNextOp() == 0 && (fOperations->getOpType() == Op::O_STRING || diff --git a/src/xercesc/util/regx/RegularExpression.hpp b/src/xercesc/util/regx/RegularExpression.hpp index ec9ca8413..29e73aa00 100644 --- a/src/xercesc/util/regx/RegularExpression.hpp +++ b/src/xercesc/util/regx/RegularExpression.hpp @@ -33,6 +33,7 @@ #include <xercesc/util/regx/ModifierToken.hpp> #include <xercesc/util/regx/ConditionToken.hpp> #include <xercesc/util/regx/OpFactory.hpp> +#include <xercesc/util/regx/RegxUtil.hpp> XERCES_CPP_NAMESPACE_BEGIN @@ -147,6 +148,8 @@ public: static void staticCleanup(); + static bool isSet(const int options, const int flag); + private: // ----------------------------------------------------------------------- // Private data types @@ -195,7 +198,6 @@ private: // ----------------------------------------------------------------------- void prepare(); int parseOptions(const XMLCh* const options); - bool isSet(const int options, const int flag); unsigned short getWordType(const XMLCh* const target, const int begin, const int end, const int offset); unsigned short getCharType(const XMLCh ch); @@ -605,14 +607,6 @@ private: return ret; } - inline bool RegularExpression::matchIgnoreCase(const XMLInt32 ch1, - const XMLInt32 ch2) - { - - return (0==XMLString::compareNIString((const XMLCh*)&ch1,(const XMLCh*)&ch2, 1)); - } - - XERCES_CPP_NAMESPACE_END #endif diff --git a/src/xercesc/util/regx/RegxParser.cpp b/src/xercesc/util/regx/RegxParser.cpp index 6b67d6cb4..ba29a797d 100644 --- a/src/xercesc/util/regx/RegxParser.cpp +++ b/src/xercesc/util/regx/RegxParser.cpp @@ -1175,6 +1175,15 @@ RangeToken* RegxParser::parseCharacterClass(const bool useNRange) { tok->sortRanges(); tok->compactRanges(); + + // If the case-insensitive option is enabled, we need to + // have the new RangeToken instance build its internal + // case-insensitive RangeToken. + if (RegularExpression::isSet(fOptions, RegularExpression::IGNORE_CASE)) + { + tok->getCaseInsensitiveToken(fTokenFactory); + } + setParseContext(S_NORMAL); processNext(); diff --git a/src/xercesc/util/regx/RegxUtil.cpp b/src/xercesc/util/regx/RegxUtil.cpp index 3cd2921d1..9e892910d 100644 --- a/src/xercesc/util/regx/RegxUtil.cpp +++ b/src/xercesc/util/regx/RegxUtil.cpp @@ -31,10 +31,9 @@ XMLCh* RegxUtil::decomposeToSurrogates(XMLInt32 ch, XMLCh* pszStr = (XMLCh*) manager->allocate(3 * sizeof(XMLCh));//new XMLCh[3]; - ch -= 0x10000; - pszStr[0] = XMLCh((ch >> 10) + 0xD800); - pszStr[1] = XMLCh((ch & 0x03FF) + 0xDC00); - pszStr[2] = chNull; + decomposeToSurrogates(ch, pszStr[0], pszStr[1]); + + pszStr[2] = chNull; return pszStr; } diff --git a/src/xercesc/util/regx/RegxUtil.hpp b/src/xercesc/util/regx/RegxUtil.hpp index 1e211130f..8f07f2deb 100644 --- a/src/xercesc/util/regx/RegxUtil.hpp +++ b/src/xercesc/util/regx/RegxUtil.hpp @@ -44,6 +44,8 @@ public: static bool isWordChar(const XMLCh); static bool isLowSurrogate(const XMLCh ch); static bool isHighSurrogate(const XMLCh ch); + static void decomposeToSurrogates(XMLInt32 ch, XMLCh& high, XMLCh& low); + static XMLCh* decomposeToSurrogates(XMLInt32 ch, MemoryManager* const manager); static XMLCh* stripExtendedComment(const XMLCh* const expression, @@ -78,6 +80,13 @@ inline bool RegxUtil::isHighSurrogate(const XMLCh ch) { return (ch & 0xFC00) == 0xD800; } +inline void RegxUtil::decomposeToSurrogates(XMLInt32 ch, XMLCh& high, XMLCh& low) { + + ch -= 0x10000; + high = XMLCh((ch >> 10) + 0xD800); + low = XMLCh((ch & 0x03FF) + 0xDC00); +} + inline bool RegxUtil::isWordChar(const XMLCh ch) { if ((ch == chUnderscore) -- GitLab