From 29a5f5dac92b1fa1891a71d1dcbdb75b5eae62ed Mon Sep 17 00:00:00 2001
From: David N Bertoni <dbertoni@apache.org>
Date: Wed, 27 Jul 2005 18:56:33 +0000
Subject: [PATCH] Fixes for Jira issue XERCESC-1463.

git-svn-id: https://svn.apache.org/repos/asf/xerces/c/trunk@225575 13f79535-47bb-0310-9956-ffa450edef68
---
 .../util/Transcoders/ICU/ICUTransService.cpp  | 168 ++++++++++++------
 src/xercesc/util/regx/RangeToken.cpp          |  53 +++++-
 src/xercesc/util/regx/RegularExpression.cpp   |  80 ++++++---
 src/xercesc/util/regx/RegularExpression.hpp   |  12 +-
 src/xercesc/util/regx/RegxParser.cpp          |   9 +
 src/xercesc/util/regx/RegxUtil.cpp            |   7 +-
 src/xercesc/util/regx/RegxUtil.hpp            |   9 +
 7 files changed, 244 insertions(+), 94 deletions(-)

diff --git a/src/xercesc/util/Transcoders/ICU/ICUTransService.cpp b/src/xercesc/util/Transcoders/ICU/ICUTransService.cpp
index 9ba1aa819..3f9c90053 100644
--- a/src/xercesc/util/Transcoders/ICU/ICUTransService.cpp
+++ b/src/xercesc/util/Transcoders/ICU/ICUTransService.cpp
@@ -161,7 +161,7 @@ ICUTransService::~ICUTransService()
      * if we clean up here, users' code may crash
      *
     #if (U_ICU_VERSION_MAJOR_NUM >= 2)
-        // release all lasily allocated data
+        // release all lazily allocated data
         u_cleanup();
     #endif
     */
@@ -174,29 +174,37 @@ ICUTransService::~ICUTransService()
 int ICUTransService::compareIString(const   XMLCh* const    comp1
                                     , const XMLCh* const    comp2)
 {
-    const XMLCh* psz1 = comp1;
-    const XMLCh* psz2 = comp2;
+    size_t  i = 0;
+    size_t  j = 0;
 
-    unsigned int curCount = 0;
-    while (true)
+    for(;;)
     {
-        //
-        //  If an inequality, then return the difference. Note that the XMLCh
-        //  might be bigger physically than UChar, but it won't hold anything
-        //  larger than 0xFFFF, so our cast here will work for both possible
-        //  sizes of XMLCh.
-        //
-        if (u_toupper(UChar(*psz1)) != u_toupper(UChar(*psz2)))
-            return int(*psz1) - int(*psz2);
+        UChar32 ch1;
+        UChar32 ch2;
 
-        // If either has ended, then they both ended, so equal
-        if (!*psz1 || !*psz2)
-            break;
+        U16_NEXT_UNSAFE(comp1, i, ch1);
+        U16_NEXT_UNSAFE(comp2, j, ch2);
+
+        const UChar32   folded1 =
+            u_foldCase(ch1, U_FOLD_CASE_DEFAULT);
 
-        // Move upwards for the next round
-        psz1++;
-        psz2++;
+        const UChar32   folded2 =
+            u_foldCase(ch2, U_FOLD_CASE_DEFAULT);
+
+        if (folded1 !=
+            folded2)
+        {
+            return folded1 - folded2;
+        }
+        else if (ch1 == 0)
+        {
+            // If ch1 is 0, then ch2 must also be
+            // 0.  Otherwise, the previous if
+            // would have returned a difference.
+            break;
+        }
     }
+
     return 0;
 }
 
@@ -205,38 +213,49 @@ int ICUTransService::compareNIString(const  XMLCh* const    comp1
                                     , const XMLCh* const    comp2
                                     , const unsigned int    maxChars)
 {
-    const XMLCh* psz1 = comp1;
-    const XMLCh* psz2 = comp2;
-
-    unsigned int curCount = 0;
-    while (true)
+    if (maxChars > 0)
     {
-        //
-        //  If an inequality, then return the difference. Note that the XMLCh
-        //  might be bigger physically than UChar, but it won't hold anything
-        //  larger than 0xFFFF, so our cast here will work for both possible
-        //  sizes of XMLCh.
-        //
-        if (u_toupper(UChar(*psz1)) != u_toupper(UChar(*psz2)))
-            return int(*psz1) - int(*psz2);
+        // Case-insensitively compares at most maxChars UTF-16 code units of
+        // two NUL-terminated strings; returns <0, 0 or >0.  The semantics are
+        // somewhat broken: strings of different lengths can compare equal,
+        // since one character could be represented as a surrogate pair.
+        size_t  i = 0;
+        size_t  j = 0;
+
+        for(;;)
+        {
+            UChar32 ch1;
+            UChar32 ch2;
 
-        // If either ended, then both ended, so equal
-        if (!*psz1 || !*psz2)
-            break;
+            U16_NEXT(comp1, i, maxChars, ch1);
+            U16_NEXT(comp2, j, maxChars, ch2);
 
-        // Move upwards to next chars
-        psz1++;
-        psz2++;
+            const UChar32   folded1 =
+                u_foldCase(ch1, U_FOLD_CASE_DEFAULT);
 
-        //
-        //  Bump the count of chars done. If it equals the count then we
-        //  are equal for the requested count, so break out and return
-        //  equal.
-        //
-        curCount++;
-        if (maxChars == curCount)
-            break;
+            const UChar32   folded2 =
+                u_foldCase(ch2, U_FOLD_CASE_DEFAULT);
+
+            if (folded1 != folded2)
+            {
+                return folded1 - folded2;
+            }
+            else if (ch1 == 0 || (i >= maxChars && j >= maxChars))
+            {
+                // Equal so far, and either both strings just ended (the
+                // folds match, so ch2 is 0 too) or both hit the limit.
+                // Stop here rather than read past a terminator.
+                return 0;
+            }
+            else if (i >= maxChars || j >= maxChars)
+            {
+                // Only one side has consumed maxChars code units: -1 when
+                // the left ran out first, 1 when the right did.
+                return i >= maxChars ? -1 : 1;
+            }
+        }
     }
+
     return 0;
 }
 
@@ -289,24 +308,59 @@ bool ICUTransService::supportsSrcOfs() const
 }
 
 
-void ICUTransService::upperCase(XMLCh* const toUpperCase) const
+template <class FunctionType>
+static void
+doCaseConvert(
+            XMLCh*          convertString,
+            FunctionType    caseFunction)
 {
-    XMLCh* outPtr = toUpperCase;
-    while (*outPtr)
+    // Converts convertString in place, one code point at a time, using
+    // caseFunction.  The semantics are broken: a case change could make
+    // the string longer, and there is no way to handle that here.
+    const unsigned int  len =
+            XMLString::stringLen(convertString);
+
+    size_t  readPos = 0;
+    size_t  writePos = 0;
+
+    while(readPos < len)
     {
-        *outPtr = XMLCh(u_toupper(UChar(*outPtr)));
-        outPtr++;
+        UChar32     original;
+
+        // Get the next code point; never pair a lead surrogate past len.
+        U16_NEXT(convertString, readPos, len, original);
+
+        // Convert the code point
+        const UChar32   converted = caseFunction(original);
+
+        // OK, now here's where it gets ugly.
+        if (!U_IS_BMP(converted) && U_IS_BMP(original) &&
+            readPos - writePos == 1)
+        {
+            // We do not have room to convert the
+            // character without overwriting the next
+            // one, so stop; the string is cut here.
+            break;
+        }
+        else
+        {
+            U16_APPEND_UNSAFE(convertString, writePos, converted);
+        }
     }
+
+    convertString[writePos] = 0;
+}
+
+// upperCase() and lowerCase() convert a string in place by passing
+// u_toupper / u_tolower to doCaseConvert().
+void ICUTransService::upperCase(XMLCh* const toUpperCase) const
+{
+    doCaseConvert(toUpperCase, u_toupper);
 }
 
 void ICUTransService::lowerCase(XMLCh* const toLowerCase) const
 {
-    XMLCh* outPtr = toLowerCase;
-    while (*outPtr)
-    {
-        *outPtr = XMLCh(u_tolower(UChar(*outPtr)));
-        outPtr++;
-    }
+    doCaseConvert(toLowerCase, u_tolower);
 }
 
 
diff --git a/src/xercesc/util/regx/RangeToken.cpp b/src/xercesc/util/regx/RangeToken.cpp
index 7dd947871..3980e0922 100644
--- a/src/xercesc/util/regx/RangeToken.cpp
+++ b/src/xercesc/util/regx/RangeToken.cpp
@@ -27,6 +27,12 @@
 #include <xercesc/util/regx/TokenFactory.hpp>
 #include <xercesc/util/IllegalArgumentException.hpp>
 
+#if defined(XML_USE_ICU_TRANSCODER) || defined (XML_USE_UNICONV390_TRANSCODER)
+#include <unicode/uchar.h>
+#else
+#include <xercesc/util/XMLUniDefs.hpp>
+#endif
+
 XERCES_CPP_NAMESPACE_BEGIN
 
 // ---------------------------------------------------------------------------
@@ -66,15 +72,55 @@ RangeToken::~RangeToken() {
 // ---------------------------------------------------------------------------
 RangeToken* RangeToken::getCaseInsensitiveToken(TokenFactory* const tokFactory) {
 
-    // REVIST
-    // We will not build a token with case insenstive ranges
-    // For now we will return a copy of ourselves.
     if (fCaseIToken == 0 && tokFactory) {
 
         bool isNRange = (getTokenType() == T_NRANGE) ? true : false;
         RangeToken* lwrToken = tokFactory->createRange(isNRange);
 
+        for (unsigned int i = 0;  i + 1 < fElemCount;  i += 2) {  // safe when empty
+            for (XMLInt32 ch = fRanges[i];  ch <= fRanges[i + 1];  ++ch) {
+#if defined(XML_USE_ICU_TRANSCODER) || defined (XML_USE_UNICONV390_TRANSCODER)
+                const XMLInt32  upperCh = u_toupper(ch);
+                // Add every case variant of ch that differs from ch itself.
+                if (upperCh != ch)
+                {
+                    lwrToken->addRange(upperCh, upperCh);
+                }
+
+                const XMLInt32  lowerCh = u_tolower(ch);
+
+                if (lowerCh != ch)
+                {
+                    lwrToken->addRange(lowerCh, lowerCh);
+                }
+
+                const XMLInt32  titleCh = u_totitle(ch);
+                // Skip the title case when it repeats the upper case above.
+                if (titleCh != ch && titleCh != upperCh)
+                {
+                    lwrToken->addRange(titleCh, titleCh);
+                }
+#else
+                // Derive the other-case letter without touching ch, which
+                // still drives the scan over this range.
+                if (ch >= chLatin_A && ch <= chLatin_Z)
+                {
+                    const XMLInt32  lowerCh = ch + (chLatin_a - chLatin_A);
+                    lwrToken->addRange(lowerCh, lowerCh);
+                }
+                else if (ch >= chLatin_a && ch <= chLatin_z)
+                {
+                    const XMLInt32  upperCh = ch - (chLatin_a - chLatin_A);
+                    lwrToken->addRange(upperCh, upperCh);
+                }
+#endif
+            }
+        }
+
         lwrToken->mergeRanges(this);
+        lwrToken->compactRanges();
+        lwrToken->createMap();
+
         fCaseIToken = lwrToken;
     }
 
@@ -259,6 +305,7 @@ void RangeToken::mergeRanges(const Token *const tok) {
         }
 
         fElemCount = rangeTok->fElemCount;
+        fSorted = true;
         return;
     }
 
diff --git a/src/xercesc/util/regx/RegularExpression.cpp b/src/xercesc/util/regx/RegularExpression.cpp
index de420f097..d8927a53d 100644
--- a/src/xercesc/util/regx/RegularExpression.cpp
+++ b/src/xercesc/util/regx/RegularExpression.cpp
@@ -23,7 +23,6 @@
 // ---------------------------------------------------------------------------
 #include <xercesc/util/regx/RegularExpression.hpp>
 #include <xercesc/util/PlatformUtils.hpp>
-#include <xercesc/util/regx/RegxUtil.hpp>
 #include <xercesc/util/regx/Match.hpp>
 #include <xercesc/util/regx/RangeToken.hpp>
 #include <xercesc/util/regx/RegxDefs.hpp>
@@ -36,6 +35,7 @@
 #include <xercesc/util/OutOfMemoryException.hpp>
 #include <xercesc/util/XMLInitializer.hpp>
 #include <xercesc/util/XMLRegisterCleanup.hpp>
+#include <xercesc/util/XMLUniDefs.hpp>
 
 XERCES_CPP_NAMESPACE_BEGIN
 
@@ -69,6 +69,55 @@ static XMLRegisterCleanup WordRangeCleanup;
 
 
 
+bool RegularExpression::matchIgnoreCase(const XMLInt32 ch1,
+                                        const XMLInt32 ch2)
+{
+    // Code points at or above 0x10000 take two UTF-16 code units.
+    const bool  firstIsPair = (ch1 >= 0x10000);
+    const bool  secondIsPair = (ch2 >= 0x10000);
+
+    if (!firstIsPair && !secondIsPair)
+    {
+        // Both fit in one code unit, so compare one XMLCh from each.
+        const XMLCh  unit1 = (XMLCh)ch1;
+        const XMLCh  unit2 = (XMLCh)ch2;
+        return XMLString::compareNIString(&unit1, &unit2, 1) == 0;
+    }
+
+    // At least one side is a surrogate pair, so two code units are compared
+    // from each side.  XMLString::compareNIString is broken in that it
+    // assumes both strings have the same length, although nothing
+    // guarantees that a code point encoded as a surrogate pair has no case
+    // mapping to a code point outside the surrogate range.  To be safe, a
+    // single-unit side is padded with a space, which cannot have a case
+    // mapping.
+    XMLCh   units1[2];
+    XMLCh   units2[2];
+
+    if (firstIsPair)
+    {
+        RegxUtil::decomposeToSurrogates(ch1, units1[0], units1[1]);
+    }
+    else
+    {
+        units1[0] = (XMLCh)ch1;
+        units1[1] = chSpace;
+    }
+    if (secondIsPair)
+    {
+        RegxUtil::decomposeToSurrogates(ch2, units2[0], units2[1]);
+    }
+    else
+    {
+        units2[0] = (XMLCh)ch2;
+        units2[1] = chSpace;
+    }
+
+    return XMLString::compareNIString(units1, units2, 2) == 0;
+}
+
+
+
 // ---------------------------------------------------------------------------
 //  RegularExpression::Context: Constructors and Destructor
 // ---------------------------------------------------------------------------
@@ -540,11 +589,6 @@ bool RegularExpression::matches(const XMLCh* const expression, const int start,
 
 				if (!range->match(ch)) {
 
-					if (!ignoreCase)
-						continue;
-
-					// Perform case insensitive match
-					// REVISIT
 					continue;
 				}
 
@@ -1098,21 +1142,10 @@ bool RegularExpression::matchRange(Context* const context, const Op* const op,
 	bool match = false;
 
 	if (ignoreCase) {
-
-		//REVISIT we should match ignoring case, but for now
-		//we will do a normal match
-		//tok = tok->getCaseInsensitiveToken();
-		//if (!token->match(strCh)) {
-
-		//	if (strCh > 0x10000)
-		//		return -1;
-			// Do case insensitive matching - uppercase match
-			// or lowercase match
-		//}
-		match = tok->match(strCh);
+		tok = tok->getCaseInsensitiveToken(fTokenFactory);
 	}
-	else
-		match = tok->match(strCh);
+
+    match = tok->match(strCh);
 
 	if (!match)
 		return false;
@@ -1498,7 +1531,12 @@ void RegularExpression::prepare() {
 		}
 
         rangeTok->createMap();
-	}
+
+    	if (isSet(fOptions, IGNORE_CASE))
+        {
+            rangeTok->getCaseInsensitiveToken(fTokenFactory);
+        }
+    }
 
 	if (fOperations != 0 && fOperations->getNextOp() == 0 &&
 		(fOperations->getOpType() == Op::O_STRING ||
diff --git a/src/xercesc/util/regx/RegularExpression.hpp b/src/xercesc/util/regx/RegularExpression.hpp
index ec9ca8413..29e73aa00 100644
--- a/src/xercesc/util/regx/RegularExpression.hpp
+++ b/src/xercesc/util/regx/RegularExpression.hpp
@@ -33,6 +33,7 @@
 #include <xercesc/util/regx/ModifierToken.hpp>
 #include <xercesc/util/regx/ConditionToken.hpp>
 #include <xercesc/util/regx/OpFactory.hpp>
+#include <xercesc/util/regx/RegxUtil.hpp>
 
 XERCES_CPP_NAMESPACE_BEGIN
 
@@ -147,6 +148,8 @@ public:
     static void
     staticCleanup();
 
+    static bool isSet(const int options, const int flag);
+
 private:
     // -----------------------------------------------------------------------
     //  Private data types
@@ -195,7 +198,6 @@ private:
     // -----------------------------------------------------------------------
     void prepare();
     int parseOptions(const XMLCh* const options);
-    bool isSet(const int options, const int flag);
     unsigned short getWordType(const XMLCh* const target, const int begin,
                                const int end, const int offset);
     unsigned short getCharType(const XMLCh ch);
@@ -605,14 +607,6 @@ private:
       return ret;
   }
 
-  inline bool RegularExpression::matchIgnoreCase(const XMLInt32 ch1,
-                                                 const XMLInt32 ch2)
-  {
-
-      return (0==XMLString::compareNIString((const XMLCh*)&ch1,(const XMLCh*)&ch2, 1));
-  }
-
-
 XERCES_CPP_NAMESPACE_END
 
 #endif
diff --git a/src/xercesc/util/regx/RegxParser.cpp b/src/xercesc/util/regx/RegxParser.cpp
index 6b67d6cb4..ba29a797d 100644
--- a/src/xercesc/util/regx/RegxParser.cpp
+++ b/src/xercesc/util/regx/RegxParser.cpp
@@ -1175,6 +1175,15 @@ RangeToken* RegxParser::parseCharacterClass(const bool useNRange) {
 
     tok->sortRanges();
     tok->compactRanges();
+
+    // If the case-insensitive option is enabled, we need to
+    // have the new RangeToken instance build its internal
+    // case-insensitive RangeToken.
+    if (RegularExpression::isSet(fOptions, RegularExpression::IGNORE_CASE))
+    {
+        tok->getCaseInsensitiveToken(fTokenFactory);
+    }
+
     setParseContext(S_NORMAL);
     processNext();
 
diff --git a/src/xercesc/util/regx/RegxUtil.cpp b/src/xercesc/util/regx/RegxUtil.cpp
index 3cd2921d1..9e892910d 100644
--- a/src/xercesc/util/regx/RegxUtil.cpp
+++ b/src/xercesc/util/regx/RegxUtil.cpp
@@ -31,10 +31,9 @@ XMLCh* RegxUtil::decomposeToSurrogates(XMLInt32 ch,
 
 	XMLCh* pszStr = (XMLCh*) manager->allocate(3 *  sizeof(XMLCh));//new XMLCh[3];
 
-	ch -= 0x10000;
-	pszStr[0] = XMLCh((ch >> 10) + 0xD800);
-	pszStr[1] = XMLCh((ch & 0x03FF) + 0xDC00);
-	pszStr[2] = chNull;
+    decomposeToSurrogates(ch, pszStr[0], pszStr[1]);
+
+    pszStr[2] = chNull;
 
 	return pszStr;
 }
diff --git a/src/xercesc/util/regx/RegxUtil.hpp b/src/xercesc/util/regx/RegxUtil.hpp
index 1e211130f..8f07f2deb 100644
--- a/src/xercesc/util/regx/RegxUtil.hpp
+++ b/src/xercesc/util/regx/RegxUtil.hpp
@@ -44,6 +44,8 @@ public:
 	static bool isWordChar(const XMLCh);
 	static bool isLowSurrogate(const XMLCh ch);
 	static bool isHighSurrogate(const XMLCh ch);
+	static void decomposeToSurrogates(XMLInt32 ch, XMLCh& high, XMLCh& low);
+
 	static XMLCh* decomposeToSurrogates(XMLInt32 ch,
                                         MemoryManager* const manager);
 	static XMLCh* stripExtendedComment(const XMLCh* const expression,
@@ -78,6 +80,13 @@ inline bool RegxUtil::isHighSurrogate(const XMLCh ch) {
 	return (ch & 0xFC00) == 0xD800;
 }
 
+inline void RegxUtil::decomposeToSurrogates(XMLInt32 ch, XMLCh& high, XMLCh& low) {
+    // Split the offset above 0x10000 into its upper bits and low 10 bits.
+    const XMLInt32 offset = ch - 0x10000;
+    high = XMLCh(0xD800 + (offset >> 10));
+    low = XMLCh(0xDC00 + (offset & 0x03FF));
+}
+
 inline bool RegxUtil::isWordChar(const XMLCh ch) {
 
 	if ((ch == chUnderscore)
-- 
GitLab