diff --git a/src/xercesc/util/regx/BMPattern.cpp b/src/xercesc/util/regx/BMPattern.cpp index c057bd7686de1d0080fa8dd2ba94c3577261d406..1c64b044b2b556249c999a8bac8f6ea62b9592b4 100644 --- a/src/xercesc/util/regx/BMPattern.cpp +++ b/src/xercesc/util/regx/BMPattern.cpp @@ -99,7 +99,7 @@ BMPattern::~BMPattern() { // --------------------------------------------------------------------------- // BMPattern: matches methods // --------------------------------------------------------------------------- -int BMPattern::matches(const XMLCh* const content, XMLSize_t start, XMLSize_t limit) { +int BMPattern::matches(const XMLCh* const content, XMLSize_t start, XMLSize_t limit) const { const XMLSize_t patternLen = XMLString::stringLen(fPattern); // Uppercase Content diff --git a/src/xercesc/util/regx/BMPattern.hpp b/src/xercesc/util/regx/BMPattern.hpp index 8686fddda451079baf6402a388bad2c11fb305b0..b15283f504b27ce4a930794fa28f32f81d14ff1b 100644 --- a/src/xercesc/util/regx/BMPattern.hpp +++ b/src/xercesc/util/regx/BMPattern.hpp @@ -100,7 +100,7 @@ public: * This method will perform a match of the given content against a * predefined pattern. 
*/ - int matches(const XMLCh* const content, XMLSize_t start, XMLSize_t limit); + int matches(const XMLCh* const content, XMLSize_t start, XMLSize_t limit) const; //@} diff --git a/src/xercesc/util/regx/RegularExpression.cpp b/src/xercesc/util/regx/RegularExpression.cpp index 4cdde2f083c52cb683f113744b3e627fbe142b5d..b964f34ab86364c83e0080e9268b06fc2d8e45c3 100644 --- a/src/xercesc/util/regx/RegularExpression.cpp +++ b/src/xercesc/util/regx/RegularExpression.cpp @@ -57,7 +57,7 @@ const unsigned int RegularExpression::SPECIAL_COMMA = 1024; RangeToken* RegularExpression::fWordRange = 0; bool RegularExpression::matchIgnoreCase(const XMLInt32 ch1, - const XMLInt32 ch2) + const XMLInt32 ch2) const { if (ch1 >= 0x10000) { @@ -118,6 +118,7 @@ RegularExpression::Context::Context(MemoryManager* const manager) : , fOffsets(0) , fMatch(0) , fString(0) + , fOptions(0) , fMemoryManager(manager) { } @@ -132,6 +133,7 @@ RegularExpression::Context::Context(Context* src) : , fOffsets(0) , fMatch(0) , fString(src->fString) + , fOptions(src->fOptions) , fMemoryManager(src->fMemoryManager) { if(src->fOffsets) @@ -155,6 +157,7 @@ RegularExpression::Context& RegularExpression::Context::operator= (const Regular fSize=other.fSize; fStringMaxLen=other.fStringMaxLen; fString=other.fString; + fOptions=other.fOptions; if (fOffsets) fMemoryManager->deallocate(fOffsets);//delete [] fOffsets; fOffsets=0; @@ -194,7 +197,8 @@ void RegularExpression::Context::reset(const XMLCh* const string , const XMLSize_t stringLen , const XMLSize_t start , const XMLSize_t limit - , const int noClosures) + , const int noClosures + , const unsigned int options) { fString = string; fStringMaxLen = stringLen; @@ -212,6 +216,7 @@ void RegularExpression::Context::reset(const XMLCh* const string } fSize = noClosures; + fOptions = options; for (int i = 0; i< fSize; i++) fOffsets[i] = -1; @@ -441,7 +446,7 @@ void RegularExpression::setPattern(const XMLCh* const pattern, // RegularExpression: Matching methods // 
--------------------------------------------------------------------------- bool RegularExpression::matches(const char* const expression - , MemoryManager* const manager) + , MemoryManager* const manager) const { XMLCh* tmpBuf = XMLString::transcode(expression, manager); ArrayJanitor<XMLCh> janBuf(tmpBuf, manager); @@ -450,7 +455,7 @@ bool RegularExpression::matches(const char* const expression bool RegularExpression::matches(const char* const expression , const XMLSize_t start, const XMLSize_t end - , MemoryManager* const manager) + , MemoryManager* const manager) const { XMLCh* tmpBuf = XMLString::transcode(expression, manager); @@ -460,7 +465,7 @@ bool RegularExpression::matches(const char* const expression bool RegularExpression::matches(const char* const expression , Match* const match - , MemoryManager* const manager) + , MemoryManager* const manager) const { XMLCh* tmpBuf = XMLString::transcode(expression, manager); @@ -470,7 +475,7 @@ bool RegularExpression::matches(const char* const expression bool RegularExpression::matches(const char* const expression, const XMLSize_t start , const XMLSize_t end, Match* const pMatch - , MemoryManager* const manager) + , MemoryManager* const manager) const { XMLCh* tmpBuf = XMLString::transcode(expression, manager); @@ -482,34 +487,34 @@ bool RegularExpression::matches(const char* const expression, const XMLSize_t st // --------------------------------------------------------------------------- // RegularExpression: Matching methods - Wide char version // --------------------------------------------------------------------------- -bool RegularExpression::matches(const XMLCh* const expression, MemoryManager* const manager) +bool RegularExpression::matches(const XMLCh* const expression, MemoryManager* const manager) const { return matches(expression, 0, XMLString::stringLen(expression), 0, manager); } bool RegularExpression::matches(const XMLCh* const expression , const XMLSize_t start, const XMLSize_t end - , 
MemoryManager* const manager) + , MemoryManager* const manager) const { return matches(expression, start, end, 0, manager); } bool RegularExpression::matches(const XMLCh* const expression , Match* const match - , MemoryManager* const manager) + , MemoryManager* const manager) const { return matches(expression, 0, XMLString::stringLen(expression), match, manager); } bool RegularExpression::matches(const XMLCh* const expression, const XMLSize_t start , const XMLSize_t end, Match* const pMatch - , MemoryManager* const manager) + , MemoryManager* const manager) const { Context context(manager); XMLSize_t strLength = XMLString::stringLen(expression); - context.reset(expression, strLength, start, end, fNoClosures); + context.reset(expression, strLength, start, end, fNoClosures, fOptions); bool adoptMatch = false; Match* lMatch = pMatch; @@ -518,7 +523,7 @@ bool RegularExpression::matches(const XMLCh* const expression, const XMLSize_t s lMatch->setNoGroups(fNoGroups); } else if (fHasBackReferences) { - lMatch = new (fMemoryManager) Match(fMemoryManager); + lMatch = new (manager) Match(manager); lMatch->setNoGroups(fNoGroups); adoptMatch = true; } @@ -669,21 +674,23 @@ bool RegularExpression::matches(const XMLCh* const expression, const XMLSize_t s // --------------------------------------------------------------------------- // RegularExpression: Tokenize methods // --------------------------------------------------------------------------- -RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const char* const expression) +RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const char* const expression, + MemoryManager* const manager) const { - XMLCh* tmpBuf = XMLString::transcode(expression, fMemoryManager); - ArrayJanitor<XMLCh> janBuf(tmpBuf, fMemoryManager); - return tokenize(tmpBuf, 0, XMLString::stringLen(tmpBuf)); + XMLCh* tmpBuf = XMLString::transcode(expression, manager); + ArrayJanitor<XMLCh> janBuf(tmpBuf, manager); + return tokenize(tmpBuf, 0, 
XMLString::stringLen(tmpBuf), manager); } RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const char* const expression, - const XMLSize_t start, const XMLSize_t end) + const XMLSize_t start, const XMLSize_t end, + MemoryManager* const manager) const { - XMLCh* tmpBuf = XMLString::transcode(expression, fMemoryManager); - ArrayJanitor<XMLCh> janBuf(tmpBuf, fMemoryManager); - return tokenize(tmpBuf, start, end); + XMLCh* tmpBuf = XMLString::transcode(expression, manager); + ArrayJanitor<XMLCh> janBuf(tmpBuf, manager); + return tokenize(tmpBuf, start, end, manager); } @@ -691,126 +698,75 @@ RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const char* const expressio // --------------------------------------------------------------------------- // RegularExpression: Tokenize methods - Wide char version // --------------------------------------------------------------------------- -RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const XMLCh* const expression) -{ - return tokenize(expression, 0, XMLString::stringLen(expression), 0); -} - RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const XMLCh* const expression, - const XMLSize_t start, const XMLSize_t end) + MemoryManager* const manager) const { - return tokenize(expression, start, end, 0); + return tokenize(expression, 0, XMLString::stringLen(expression), manager); } -RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const XMLCh* const expression, +RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const XMLCh* const matchString, const XMLSize_t start, const XMLSize_t end, - RefVectorOf<Match> *subEx) + MemoryManager* const manager) const { - - RefArrayVectorOf<XMLCh>* tokenStack = new (fMemoryManager) RefArrayVectorOf<XMLCh>(16, true, fMemoryManager); - - Context context(fMemoryManager); - - XMLSize_t strLength = XMLString::stringLen(expression); - - context.reset(expression, strLength, start, end, fNoClosures); - - Match* lMatch = 0; - bool adoptMatch = false; - - if (subEx || 
fHasBackReferences) { - lMatch = new (fMemoryManager) Match(fMemoryManager); - adoptMatch = true; - lMatch->setNoGroups(fNoGroups); + // check if matches zero length string - throw error if so + if(matches(XMLUni::fgZeroLenString, manager)){ + ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Regex_RepPatMatchesZeroString, manager); } + + RefVectorOf<Match> *subEx = new (manager) RefVectorOf<Match>(10, true, manager); + Janitor<RefVectorOf<Match> > janSubEx(subEx); - if (context.fAdoptMatch) - delete context.fMatch; - - context.fMatch = lMatch; - context.fAdoptMatch = adoptMatch; + allMatches(matchString, start, end, subEx, manager); + RefArrayVectorOf<XMLCh> *tokens = new (manager) RefArrayVectorOf<XMLCh>(16, true, manager); XMLSize_t tokStart = start; - XMLSize_t matchStart = start; - for (; matchStart <= end; matchStart++) { + unsigned int i = 0; + for(; i < subEx->size(); ++i) { + Match *match = subEx->elementAt(i); + XMLSize_t matchStart = match->getStartPos(0); - int iMatchEnd = match(&context, fOperations, matchStart, 1); + XMLCh *token = (XMLCh*)manager->allocate((matchStart + 1 - tokStart) * sizeof(XMLCh)); + XMLString::subString(token, matchString, tokStart, matchStart, manager); + tokens->addElement(token); - if (iMatchEnd != -1) { - XMLSize_t matchEnd=iMatchEnd; - - if (context.fMatch != 0) { - context.fMatch->setStartPos(0, (int)context.fStart); - context.fMatch->setEndPos(0, (int)matchEnd); - } + tokStart = match->getEndPos(0); + } - if (subEx){ - subEx->addElement(context.fMatch); - lMatch = new (fMemoryManager) Match(*(context.fMatch)); - adoptMatch = true; + XMLCh *token = (XMLCh*)manager->allocate((end + 1 - tokStart) * sizeof(XMLCh)); + XMLString::subString(token, matchString, tokStart, end, manager); + tokens->addElement(token); - context.fAdoptMatch = adoptMatch; - context.fMatch = lMatch; - } + return tokens; +} - XMLCh* token; - if (tokStart == matchStart){ +void RegularExpression::allMatches(const XMLCh* const matchString, const XMLSize_t 
start, const XMLSize_t end, + RefVectorOf<Match> *subEx, MemoryManager* const manager) const +{ + Context context(manager); + context.reset(matchString, XMLString::stringLen(matchString), start, end, fNoClosures, fOptions); - if (tokStart == strLength){ - tokStart--; - break; - } + context.fMatch = new (manager) Match(manager); + context.fMatch->setNoGroups(fNoGroups); + context.fAdoptMatch = true; - token = (XMLCh*) fMemoryManager->allocate(sizeof(XMLCh));//new XMLCh[1]; - token[0] = chNull; - - // When you tokenize using zero string, will return each - // token in the string. Since the zero string will also - // match the start/end characters, resulting in empty - // tokens, we ignore them and do not add them to the stack. - if (!XMLString::equals(fPattern, &chNull)) - tokenStack->addElement(token); - else - fMemoryManager->deallocate(token);//delete[] token; - - } else { - token = (XMLCh*) fMemoryManager->allocate - ( - (matchStart + 1 - tokStart) * sizeof(XMLCh) - );//new XMLCh[matchStart + 1 - tokStart]; - XMLString::subString(token, expression, tokStart, matchStart, fMemoryManager); - tokenStack->addElement(token); - } + XMLSize_t matchStart = start; + while(matchStart <= end) { + XMLSize_t matchEnd = match(&context, fOperations, matchStart, 1); + if(matchEnd != -1) { + context.fMatch->setStartPos(0, matchStart); + context.fMatch->setEndPos(0, matchEnd); - tokStart = matchEnd; + subEx->addElement(context.fMatch); + + context.fMatch = new (manager) Match(*(context.fMatch)); + context.fAdoptMatch = true; - //decrement matchStart as will increment it at the top of the loop - if (matchStart < matchEnd - 1) - matchStart = matchEnd - 1; + matchStart = matchEnd; + } else { + ++matchStart; } } - - XMLCh* token; - - if (matchStart == tokStart + 1){ - token = (XMLCh*) fMemoryManager->allocate(sizeof(XMLCh));//new XMLCh[1]; - token[0] = chNull; - - } else { - token = (XMLCh*) fMemoryManager->allocate - ( - (strLength + 1 - tokStart) * sizeof(XMLCh) - );//new 
XMLCh[strLength + 1 - tokStart]; - XMLString::subString(token, expression, tokStart, strLength, fMemoryManager); - } - - if (!XMLString::equals(fPattern, &chNull)) - tokenStack->addElement(token); - else - fMemoryManager->deallocate(token);//delete[] token; - - return tokenStack; } @@ -818,28 +774,30 @@ RefArrayVectorOf<XMLCh>* RegularExpression::tokenize(const XMLCh* const expressi // RegularExpression: Replace methods // ----------------------------------------------------------------------- XMLCh* RegularExpression::replace(const char* const matchString, - const char* const replaceString) + const char* const replaceString, + MemoryManager* const manager) const { - XMLCh* tmpBuf = XMLString::transcode(matchString, fMemoryManager); - ArrayJanitor<XMLCh> janBuf(tmpBuf, fMemoryManager); - XMLCh* tmpBuf2 = XMLString::transcode(replaceString, fMemoryManager); - ArrayJanitor<XMLCh> janBuf2(tmpBuf2, fMemoryManager); + XMLCh* tmpBuf = XMLString::transcode(matchString, manager); + ArrayJanitor<XMLCh> janBuf(tmpBuf, manager); + XMLCh* tmpBuf2 = XMLString::transcode(replaceString, manager); + ArrayJanitor<XMLCh> janBuf2(tmpBuf2, manager); - return replace(tmpBuf, tmpBuf2, 0, XMLString::stringLen(tmpBuf)); + return replace(tmpBuf, tmpBuf2, 0, XMLString::stringLen(tmpBuf), manager); } XMLCh* RegularExpression::replace(const char* const matchString, const char* const replaceString, - const XMLSize_t start, const XMLSize_t end) + const XMLSize_t start, const XMLSize_t end, + MemoryManager* const manager) const { - XMLCh* tmpBuf = XMLString::transcode(matchString, fMemoryManager); - ArrayJanitor<XMLCh> janBuf(tmpBuf, fMemoryManager); - XMLCh* tmpBuf2 = XMLString::transcode(replaceString, fMemoryManager); - ArrayJanitor<XMLCh> janBuf2(tmpBuf2, fMemoryManager); + XMLCh* tmpBuf = XMLString::transcode(matchString, manager); + ArrayJanitor<XMLCh> janBuf(tmpBuf, manager); + XMLCh* tmpBuf2 = XMLString::transcode(replaceString, manager); + ArrayJanitor<XMLCh> janBuf2(tmpBuf2, manager); 
- return replace(tmpBuf, tmpBuf2, start, end); + return replace(tmpBuf, tmpBuf2, start, end, manager); } @@ -847,60 +805,114 @@ XMLCh* RegularExpression::replace(const char* const matchString, // RegularExpression: Replace methods - Wide char version // --------------------------------------------------------------------------- XMLCh* RegularExpression::replace(const XMLCh* const matchString, - const XMLCh* const replaceString) + const XMLCh* const replaceString, + MemoryManager* const manager) const { return replace(matchString, replaceString, 0, - XMLString::stringLen(matchString)); + XMLString::stringLen(matchString), manager); } XMLCh* RegularExpression::replace(const XMLCh* const matchString, const XMLCh* const replaceString, - const XMLSize_t start, const XMLSize_t end) + const XMLSize_t start, const XMLSize_t end, + MemoryManager* const manager) const { + // check if matches zero length string - throw error if so + if(matches(XMLUni::fgZeroLenString, manager)){ + ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Regex_RepPatMatchesZeroString, manager); + } + + RefVectorOf<Match> *subEx = new (manager) RefVectorOf<Match>(10, true, manager); + Janitor<RefVectorOf<Match> > janSubEx(subEx); + + allMatches(matchString, start, end, subEx, manager); - //check if matches zero length string - throw error if so - if (matches(XMLUni::fgZeroLenString, fMemoryManager)){ - ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Regex_RepPatMatchesZeroString, fMemoryManager); + XMLBuffer result(1023, manager); + int tokStart = start; + + unsigned int i = 0; + for(; i < subEx->size(); ++i) { + Match *match = subEx->elementAt(i); + int matchStart = match->getStartPos(0); + + if(matchStart > tokStart) + result.append(matchString + tokStart, matchStart - tokStart); + subInExp(replaceString, matchString, match, result, manager); + + tokStart = match->getEndPos(0); } - RefVectorOf<Match> *subEx = new (fMemoryManager) RefVectorOf<Match>(10, true, fMemoryManager); - 
Janitor<RefVectorOf<Match> > janSubEx(subEx); + if(end > tokStart) + result.append(matchString + tokStart, end - tokStart); - //Call to tokenize with Match vector so that we keep track of the locations - //of the subExpression within each of the matches - RefArrayVectorOf<XMLCh>* tokenStack = tokenize(matchString, start, end, subEx); - Janitor<RefArrayVectorOf<XMLCh> > janTokStack(tokenStack); + return XMLString::replicate(result.getRawBuffer(), manager); +} - XMLBuffer result(1023, fMemoryManager); +/* + * Helper for Replace. This method prepares the replacement string by substituting + * in actual values for parenthesized sub expressions. + * + * An error will be thrown if: + * 1) there is chBackSlash not followed by a chDollarSign or chBackSlash + * 2) there is an unescaped chDollarSign which is not followed by a digit + * + */ +void RegularExpression::subInExp(const XMLCh* const repString, + const XMLCh* const origString, + const Match* subEx, + XMLBuffer &result, + MemoryManager* const manager) const +{ + int numSubExp = subEx->getNoGroups() - 1; - int numSubEx = 0; + for(const XMLCh *ptr = repString; *ptr != chNull; ++ptr) { + if(*ptr == chDollarSign) { + ++ptr; + + // check that after the $ is a digit + if(!XMLString::isDigit(*ptr)) { + // invalid replace string - $ must be followed by a digit + ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Regex_InvalidRepPattern, manager); + } - if (subEx && subEx->size() > 0) - numSubEx = subEx->elementAt(0)->getNoGroups() - 1; + int index = *ptr - chDigit_0; - int tokStackSize = tokenStack->size(); - const XMLCh* curRepString = XMLString::replicate(replaceString, fMemoryManager); + const XMLCh *dig = ptr + 1; + while(XMLString::isDigit(*dig)) { + int newIndex = index * 10 + (*dig - chDigit_0); + if(newIndex > numSubExp) break; - for (int i = 0; i < tokStackSize; i++){ + index = newIndex; + ptr = dig; + ++dig; + } - result.append(tokenStack->elementAt(i)); + // now check that the index is legal + if(index <= numSubExp) 
{ + int start = subEx->getStartPos(index); + int end = subEx->getEndPos(index); - if (i != tokStackSize - 1) { + // now copy the substring into the new string + if(start < end) { + result.append(origString + start, end - start); + } + } + + } else { + if(*ptr == chBackSlash) { + ++ptr; - //if there are subExpressions, then determine the string we want to - //substitute in. - if (numSubEx != 0) { - fMemoryManager->deallocate((XMLCh*)curRepString); - curRepString = subInExp(replaceString, matchString, subEx->elementAt(i)); + // if you have a backslash and then a character that's not a $ or a backslash, + // then it's an invalid replace string + if(*ptr != chDollarSign && *ptr != chBackSlash) { + ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Regex_InvalidRepPattern, manager); + } } - result.append(curRepString); + + result.append(*ptr); } } - - fMemoryManager->deallocate((XMLCh*)curRepString); - return XMLString::replicate(result.getRawBuffer(), fMemoryManager); - } @@ -981,8 +993,8 @@ struct RE_RuntimeContext { RE_RuntimeContext(const Op *op, XMLSize_t offs) : op_(op), offs_(offs) { } }; -int RegularExpression::match(Context* const context, const Op* const operations - , XMLSize_t offset, const short direction) +int RegularExpression::match(Context* const context, const Op* const operations, + XMLSize_t offset, const short direction) const { ValueStackOf<RE_RuntimeContext>* opStack=NULL; Janitor<ValueStackOf<RE_RuntimeContext> > janStack(NULL); @@ -992,7 +1004,7 @@ int RegularExpression::match(Context* const context, const Op* const operations janStack.reset(opStack); } const Op* tmpOp = operations; - bool ignoreCase = isSet(fOptions, IGNORE_CASE); + bool ignoreCase = isSet(context->fOptions, IGNORE_CASE); int doReturn; while (tmpOp != 0) { @@ -1248,7 +1260,7 @@ int RegularExpression::match(Context* const context, const Op* const operations bool RegularExpression::matchChar(Context* const context, const XMLInt32 ch, XMLSize_t& offset, - const short direction, const bool 
ignoreCase) + const short direction, const bool ignoreCase) const { if(direction < 0 && offset==0) return false; @@ -1274,7 +1286,7 @@ bool RegularExpression::matchChar(Context* const context, } bool RegularExpression::matchDot(Context* const context, XMLSize_t& offset, - const short direction) + const short direction) const { if(direction < 0 && offset==0) return false; @@ -1289,7 +1301,7 @@ bool RegularExpression::matchDot(Context* const context, XMLSize_t& offset, if (!context->nextCh(strCh, tmpOffset, direction)) return false; - if (!isSet(fOptions, SINGLE_LINE)) { + if (!isSet(context->fOptions, SINGLE_LINE)) { if (direction > 0 && RegxUtil::isEOLChar(strCh)) return false; @@ -1304,7 +1316,7 @@ bool RegularExpression::matchDot(Context* const context, XMLSize_t& offset, bool RegularExpression::matchRange(Context* const context, const Op* const op, XMLSize_t& offset, const short direction, - const bool ignoreCase) + const bool ignoreCase) const { if(direction < 0 && offset==0) return false; @@ -1337,7 +1349,7 @@ bool RegularExpression::matchRange(Context* const context, const Op* const op, } bool RegularExpression::matchAnchor(Context* const context, const XMLInt32 ch, - const XMLSize_t offset) + const XMLSize_t offset) const { switch ((XMLCh) ch) { case chLatin_A: @@ -1348,10 +1360,10 @@ bool RegularExpression::matchAnchor(Context* const context, const XMLInt32 ch, if (context->fLength == 0) break; { - wordType after = getWordType(context->fString, context->fStart, + wordType after = getWordType(context, context->fString, context->fStart, context->fLimit, offset); if (after == wordTypeIgnore - || after == getPreviousWordType(context->fString, + || after == getPreviousWordType(context, context->fString, context->fStart, context->fLimit, offset)) break; @@ -1361,10 +1373,10 @@ bool RegularExpression::matchAnchor(Context* const context, const XMLInt32 ch, if (context->fLength == 0) return false; { - wordType after = getWordType(context->fString, context->fStart, + 
wordType after = getWordType(context, context->fString, context->fStart, context->fLimit, offset); if (after == wordTypeIgnore - || after == getPreviousWordType(context->fString, + || after == getPreviousWordType(context, context->fString, context->fStart, context->fLimit, offset)) return false; @@ -1372,7 +1384,7 @@ bool RegularExpression::matchAnchor(Context* const context, const XMLInt32 ch, break; case chLatin_Z: case chDollarSign: - if ( (XMLCh) ch == chDollarSign && isSet(fOptions, MULTIPLE_LINE)) { + if ( (XMLCh) ch == chDollarSign && isSet(context->fOptions, MULTIPLE_LINE)) { if (!(offset == context->fLimit || (offset < context->fLimit && RegxUtil::isEOLChar(context->fString[offset])))) return false; @@ -1394,7 +1406,7 @@ bool RegularExpression::matchAnchor(Context* const context, const XMLInt32 ch, break; case chAt: case chCaret: - if ( (XMLCh) ch == chCaret && !isSet(fOptions, MULTIPLE_LINE)) { + if ( (XMLCh) ch == chCaret && !isSet(context->fOptions, MULTIPLE_LINE)) { if (offset != context->fStart) return false; @@ -1410,9 +1422,9 @@ bool RegularExpression::matchAnchor(Context* const context, const XMLInt32 ch, if (context->fLength == 0 || offset == context->fLimit) return false; - if (getWordType(context->fString, context->fStart, context->fLimit, + if (getWordType(context, context->fString, context->fStart, context->fLimit, offset) != wordTypeLetter - || getPreviousWordType(context->fString, context->fStart, + || getPreviousWordType(context, context->fString, context->fStart, context->fLimit, offset) != wordTypeOther) return false; break; @@ -1420,9 +1432,9 @@ bool RegularExpression::matchAnchor(Context* const context, const XMLInt32 ch, if (context->fLength == 0 || offset == context->fStart) return false; - if (getWordType(context->fString, context->fStart, context->fLimit, + if (getWordType(context, context->fString, context->fStart, context->fLimit, offset) != wordTypeOther - || getPreviousWordType(context->fString, context->fStart, + || 
getPreviousWordType(context, context->fString, context->fStart, context->fLimit, offset) != wordTypeLetter) return false; break; @@ -1434,10 +1446,10 @@ bool RegularExpression::matchAnchor(Context* const context, const XMLInt32 ch, bool RegularExpression::matchBackReference(Context* const context, const XMLInt32 refNo, XMLSize_t& offset, const short direction, - const bool ignoreCase) + const bool ignoreCase) const { if (refNo <=0 || refNo >= fNoGroups) - ThrowXMLwithMemMgr(IllegalArgumentException, XMLExcepts::Regex_BadRefNo, fMemoryManager); + ThrowXMLwithMemMgr(IllegalArgumentException, XMLExcepts::Regex_BadRefNo, context->fMemoryManager); if (context->fMatch->getStartPos(refNo) < 0 || context->fMatch->getEndPos(refNo) < 0) @@ -1469,7 +1481,7 @@ bool RegularExpression::matchBackReference(Context* const context, bool RegularExpression::matchString(Context* const context, const XMLCh* const literal, XMLSize_t& offset, - const short direction, const bool ignoreCase) + const short direction, const bool ignoreCase) const { XMLSize_t length = XMLString::stringLen(literal); if(direction < 0 && offset<length) @@ -1494,7 +1506,7 @@ bool RegularExpression::matchString(Context* const context, } int RegularExpression::matchCapture(Context* const context, const Op* const op, - XMLSize_t offset, const short direction) + XMLSize_t offset, const short direction) const { // No check is made for nullness of fMatch as the function is only called if // fMatch is not null. 
@@ -1520,7 +1532,7 @@ int RegularExpression::matchCapture(Context* const context, const Op* const op, int RegularExpression::matchUnion(Context* const context, const Op* const op, XMLSize_t offset, - const short direction) + const short direction) const { unsigned int opSize = op->getSize(); @@ -1546,7 +1558,7 @@ int RegularExpression::matchUnion(Context* const context, bool RegularExpression::matchCondition(Context* const context, const Op* const op, XMLSize_t offset, - const short direction) + const short direction) const { int refNo = op->getRefNo(); @@ -1643,87 +1655,6 @@ Op* RegularExpression::compile(const Token* const token, Op* const next, return ret; } -/* - * Helper for Replace. This method prepares the replacement string by substituting - * in actual values for parenthesized sub expressions. - * - * An error will be thrown if: - * 1) repString references an undefined subExpression - * 2) there is an unescaped chDollar which is not followed by a digit - * - */ -const XMLCh* RegularExpression::subInExp(const XMLCh* const repString, - const XMLCh* const origString, - const Match* subEx) -{ - - int numSubExp = subEx->getNoGroups() - 1; - - if (numSubExp == 0) - return XMLString::replicate(repString, fMemoryManager); - - bool notEscaped = true; - - XMLBuffer newString(1023, fMemoryManager); - - XMLCh indexStr[2]; //holds the string rep of a - - indexStr[1] = chNull; - int index = -1; - - for (const XMLCh* ptr = repString; *ptr != chNull; ptr++){ - - if ((*ptr == chDollarSign) && notEscaped) { - - ptr++; - - //check that after the $ is a digit - if (!XMLString::isDigit(*ptr)){ - - //invalid replace string - $ must be followed by a digit - ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Regex_InvalidRepPattern, fMemoryManager); - } - - indexStr[0] = *ptr; //get the digit - index = XMLString::parseInt(indexStr, fMemoryManager); //convert it to an int - - //now check that the index is legal - if (index > numSubExp){ - ThrowXMLwithMemMgr(RuntimeException, 
XMLExcepts::Regex_InvalidRepPattern, fMemoryManager); - } - - int start = subEx->getStartPos(index); - int end = subEx->getEndPos(index); - - //now copy the substring into the new string - for (int i=start; i<end; i++){ - newString.append(origString[i]); - } - - } else { - - //if you have a slash and then a character that's not a $ or /, - //then it's an invalid replace string - if (!notEscaped && (*ptr != chDollarSign && *ptr != chBackSlash)){ - ThrowXMLwithMemMgr(RuntimeException, XMLExcepts::Regex_InvalidRepPattern, fMemoryManager); - } - - if (*ptr == chBackSlash){ - notEscaped = false; - continue; - - }else - notEscaped = true; - - newString.append(*ptr); - } - } - - return XMLString::replicate(newString.getRawBuffer(), fMemoryManager); - -} - - /* * Prepares for matching. This method is called during construction. */ @@ -1812,17 +1743,17 @@ void RegularExpression::prepare() { } } -RegularExpression::wordType RegularExpression::getCharType(const XMLCh ch) { - - if (!isSet(fOptions, UNICODE_WORD_BOUNDARY)) { +RegularExpression::wordType RegularExpression::getCharType(Context* const context, const XMLCh ch) const +{ + if (!isSet(context->fOptions, UNICODE_WORD_BOUNDARY)) { - if (isSet(fOptions, USE_UNICODE_CATEGORY)) { + if (isSet(context->fOptions, USE_UNICODE_CATEGORY)) { if (fWordRange == 0) { fWordRange = fTokenFactory->getRange(fgUniIsWord); if (fWordRange == 0) - ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Regex_RangeTokenGetError, fgUniIsWord, fMemoryManager); + ThrowXMLwithMemMgr1(RuntimeException, XMLExcepts::Regex_RangeTokenGetError, fgUniIsWord, context->fMemoryManager); } return fWordRange->match(ch) ? 
wordTypeLetter : wordTypeOther; diff --git a/src/xercesc/util/regx/RegularExpression.hpp b/src/xercesc/util/regx/RegularExpression.hpp index ce162519cfbf7be5c3ae3469ca2b2ab91bb52d6e..ae5fe376a7f83b7d1c3d5e82c8c343712d567e75 100644 --- a/src/xercesc/util/regx/RegularExpression.hpp +++ b/src/xercesc/util/regx/RegularExpression.hpp @@ -104,45 +104,57 @@ public: // ----------------------------------------------------------------------- // Matching methods // ----------------------------------------------------------------------- - bool matches(const char* const matchString, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager); - bool matches(const char* const matchString, const XMLSize_t start, - const XMLSize_t end, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager); - bool matches(const char* const matchString, Match* const pMatch, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager); - bool matches(const char* const matchString, const XMLSize_t start, - const XMLSize_t end, Match* const pMatch, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager); - - bool matches(const XMLCh* const matchString, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager); - bool matches(const XMLCh* const matchString, const XMLSize_t start, - const XMLSize_t end, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager); - bool matches(const XMLCh* const matchString, Match* const pMatch, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager); - bool matches(const XMLCh* const matchString, const XMLSize_t start, - const XMLSize_t end, Match* const pMatch, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager); + bool matches(const char* const matchString, + MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; + bool matches(const char* const matchString, const XMLSize_t start, const XMLSize_t end, + MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; + 
bool matches(const char* const matchString, Match* const pMatch, + MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; + bool matches(const char* const matchString, const XMLSize_t start, const XMLSize_t end, + Match* const pMatch, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; + + bool matches(const XMLCh* const matchString, + MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; + bool matches(const XMLCh* const matchString, const XMLSize_t start, const XMLSize_t end, + MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; + bool matches(const XMLCh* const matchString, Match* const pMatch, + MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; + bool matches(const XMLCh* const matchString, const XMLSize_t start, const XMLSize_t end, + Match* const pMatch, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; + void allMatches(const XMLCh* const matchString, const XMLSize_t start, const XMLSize_t end, + RefVectorOf<Match> *subEx, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; // ----------------------------------------------------------------------- // Tokenize methods // ----------------------------------------------------------------------- // Note: The caller owns the string vector that is returned, and is responsible // for deleting it. 
- RefArrayVectorOf<XMLCh> *tokenize(const char* const matchString); - RefArrayVectorOf<XMLCh> *tokenize(const char* const matchString, const XMLSize_t start, - const XMLSize_t end); + RefArrayVectorOf<XMLCh> *tokenize(const char* const matchString, + MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; + RefArrayVectorOf<XMLCh> *tokenize(const char* const matchString, const XMLSize_t start, const XMLSize_t end, + MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; - RefArrayVectorOf<XMLCh> *tokenize(const XMLCh* const matchString); RefArrayVectorOf<XMLCh> *tokenize(const XMLCh* const matchString, - const XMLSize_t start, const XMLSize_t end); + MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; + RefArrayVectorOf<XMLCh> *tokenize(const XMLCh* const matchString, const XMLSize_t start, const XMLSize_t end, + MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; // ----------------------------------------------------------------------- // Replace methods // ----------------------------------------------------------------------- // Note: The caller owns the XMLCh* that is returned, and is responsible for // deleting it. 
- XMLCh *replace(const char* const matchString, const char* const replaceString); XMLCh *replace(const char* const matchString, const char* const replaceString, - const XMLSize_t start, const XMLSize_t end); + MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; + XMLCh *replace(const char* const matchString, const char* const replaceString, + const XMLSize_t start, const XMLSize_t end, + MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; - XMLCh *replace(const XMLCh* const matchString, const XMLCh* const replaceString); XMLCh *replace(const XMLCh* const matchString, const XMLCh* const replaceString, - const XMLSize_t start, const XMLSize_t end); + MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; + XMLCh *replace(const XMLCh* const matchString, const XMLCh* const replaceString, + const XMLSize_t start, const XMLSize_t end, + MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager) const; // ----------------------------------------------------------------------- // Static initialize and cleanup methods @@ -182,7 +194,8 @@ private: Context& operator= (const Context& other); inline const XMLCh* getString() const { return fString; } void reset(const XMLCh* const string, const XMLSize_t stringLen, - const XMLSize_t start, const XMLSize_t limit, const int noClosures); + const XMLSize_t start, const XMLSize_t limit, const int noClosures, + const unsigned int options); bool nextCh(XMLInt32& ch, XMLSize_t& offset, const short direction); bool fAdoptMatch; @@ -194,6 +207,7 @@ private: int* fOffsets; Match* fMatch; const XMLCh* fString; + unsigned int fOptions; MemoryManager* fMemoryManager; }; @@ -208,65 +222,53 @@ private: // ----------------------------------------------------------------------- void prepare(); int parseOptions(const XMLCh* const options); - wordType getWordType(const XMLCh* const target, const XMLSize_t begin, - const XMLSize_t end, const XMLSize_t offset); - wordType 
getCharType(const XMLCh ch); - wordType getPreviousWordType(const XMLCh* const target, - const XMLSize_t start, const XMLSize_t end, - XMLSize_t offset); + wordType getWordType(Context* const context, const XMLCh* const target, + const XMLSize_t begin, const XMLSize_t end, + const XMLSize_t offset) const; + wordType getCharType(Context* const context, const XMLCh ch) const; + wordType getPreviousWordType(Context* const context, const XMLCh* const target, + const XMLSize_t start, const XMLSize_t end, + XMLSize_t offset) const; /** * Matching helpers */ int match(Context* const context, const Op* const operations, XMLSize_t offset, - const short direction); - bool matchIgnoreCase(const XMLInt32 ch1, const XMLInt32 ch2); + const short direction) const; + bool matchIgnoreCase(const XMLInt32 ch1, const XMLInt32 ch2) const; /** * Helper methods used by match(Context* ...) */ bool matchChar(Context* const context, const XMLInt32 ch, XMLSize_t& offset, - const short direction, const bool ignoreCase); - bool matchDot(Context* const context, XMLSize_t& offset, const short direction); + const short direction, const bool ignoreCase) const; + bool matchDot(Context* const context, XMLSize_t& offset, const short direction) const; bool matchRange(Context* const context, const Op* const op, - XMLSize_t& offset, const short direction, const bool ignoreCase); + XMLSize_t& offset, const short direction, const bool ignoreCase) const; bool matchAnchor(Context* const context, const XMLInt32 ch, - const XMLSize_t offset); + const XMLSize_t offset) const; bool matchBackReference(Context* const context, const XMLInt32 ch, XMLSize_t& offset, const short direction, - const bool ignoreCase); + const bool ignoreCase) const; bool matchString(Context* const context, const XMLCh* const literal, - XMLSize_t& offset, const short direction, const bool ignoreCase); + XMLSize_t& offset, const short direction, const bool ignoreCase) const; int matchUnion(Context* const context, const Op* const op, 
XMLSize_t offset, - const short direction); + const short direction) const; int matchCapture(Context* const context, const Op* const op, XMLSize_t offset, - const short direction); + const short direction) const; bool matchCondition(Context* const context, const Op* const op, XMLSize_t offset, - const short direction); + const short direction) const; int matchModifier(Context* const context, const Op* const op, XMLSize_t offset, - const short direction); + const short direction) const; - /** - * Tokenize helper - * - * This overloaded tokenize is for internal use only. It provides a way to - * keep track of the sub-expressions in each match of the pattern. - * - * It is called by the other tokenize methods, and by the replace method. - * The caller is responsible for the deletion of the returned - * RefArrayVectorOf<XMLCh*> - */ - RefArrayVectorOf<XMLCh> *tokenize(const XMLCh* const matchString, - const XMLSize_t start, const XMLSize_t end, - RefVectorOf<Match> *subEx); /** * Replace helpers - * - * Note: the caller owns the XMLCh* that is returned */ - const XMLCh *subInExp(const XMLCh* const repString, - const XMLCh* const origString, - const Match* subEx); + void subInExp(const XMLCh* const repString, + const XMLCh* const origString, + const Match* subEx, + XMLBuffer &result, + MemoryManager* const manager) const; /** * Converts a token tree into an operation tree */ @@ -302,10 +304,10 @@ private: XMLSize_t fMinLength; unsigned int fNoClosures; unsigned int fOptions; - BMPattern* fBMPattern; + const BMPattern* fBMPattern; XMLCh* fPattern; XMLCh* fFixedString; - Op* fOperations; + const Op* fOperations; Token* fTokenTree; RangeToken* fFirstChar; static RangeToken* fWordRange; @@ -574,40 +576,42 @@ private: inline int RegularExpression::matchModifier(Context* const context, const Op* const op, XMLSize_t offset, - const short direction) + const short direction) const { - int saveOptions = fOptions; + int saveOptions = context->fOptions; /* snapshot the context's options on entry; this is the value written back to context->fOptions after the child match */ - fOptions |= (int) op->getData(); - fOptions &= (int) 
~op->getData2(); + context->fOptions |= (int) op->getData(); + context->fOptions &= (int) ~op->getData2(); int ret = match(context, op->getChild(), offset, direction); - fOptions = saveOptions; + context->fOptions = saveOptions; return ret; } - inline RegularExpression::wordType RegularExpression::getWordType(const XMLCh* const target + inline RegularExpression::wordType RegularExpression::getWordType(Context* const context + , const XMLCh* const target , const XMLSize_t begin , const XMLSize_t end - , const XMLSize_t offset) + , const XMLSize_t offset) const { if (offset < begin || offset >= end) return wordTypeOther; - return getCharType(target[offset]); + return getCharType(context, target[offset]); } inline - RegularExpression::wordType RegularExpression::getPreviousWordType(const XMLCh* const target + RegularExpression::wordType RegularExpression::getPreviousWordType(Context* const context + , const XMLCh* const target , const XMLSize_t start , const XMLSize_t end - , XMLSize_t offset) + , XMLSize_t offset) const { - wordType ret = getWordType(target, start, end, --offset); + wordType ret = getWordType(context, target, start, end, --offset); while (ret == wordTypeIgnore) { - ret = getWordType(target, start, end, --offset); + ret = getWordType(context, target, start, end, --offset); } return ret;