Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
/*
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Xerces" and "Apache Software Foundation" must
* not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact apache\@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* nor may "Apache" appear in their name, without prior written
* permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation, and was
* originally based on software copyright (c) 2001, International
* Business Machines, Inc., http://www.ibm.com . For more information
* on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/*
* $Id$
*/
#if !defined(REGULAREXPRESSION_HPP)
#define REGULAREXPRESSION_HPP
// ---------------------------------------------------------------------------
// Includes
// ---------------------------------------------------------------------------
#include <xercesc/util/XMLUniDefs.hpp>
Gareth Reakes
committed
#include <xercesc/util/RefArrayVectorOf.hpp>
#include <xercesc/util/XMLString.hpp>
#include <xercesc/util/regx/Op.hpp>
#include <xercesc/util/regx/TokenFactory.hpp>
#include <xercesc/util/regx/BMPattern.hpp>
#include <xercesc/util/Janitor.hpp>
#include <xercesc/util/Mutexes.hpp>
#include <xercesc/util/regx/ModifierToken.hpp>
#include <xercesc/util/regx/ConditionToken.hpp>
#include <xercesc/util/regx/OpFactory.hpp>
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
// ---------------------------------------------------------------------------
// Forward Declaration
// ---------------------------------------------------------------------------
class Token;
class BMPattern;
class RangeToken;
class Match;
class TokenFactory;
class XMLUTIL_EXPORT RegularExpression {
public:
// -----------------------------------------------------------------------
// Public Constructors and Destructor
// -----------------------------------------------------------------------
RegularExpression(const char* const pattern);
RegularExpression(const char* const pattern, const char* const options);
RegularExpression(const XMLCh* const pattern);
RegularExpression(const XMLCh* const pattern, const XMLCh* const options);
~RegularExpression();
// -----------------------------------------------------------------------
// Public Constants
// -----------------------------------------------------------------------
static const unsigned int MARK_PARENS;
static const unsigned int IGNORE_CASE;
static const unsigned int SINGLE_LINE;
static const unsigned int MULTIPLE_LINE;
static const unsigned int EXTENDED_COMMENT;
static const unsigned int USE_UNICODE_CATEGORY;
static const unsigned int UNICODE_WORD_BOUNDARY;
static const unsigned int PROHIBIT_HEAD_CHARACTER_OPTIMIZATION;
static const unsigned int PROHIBIT_FIXED_STRING_OPTIMIZATION;
static const unsigned int XMLSCHEMA_MODE;
static const unsigned int SPECIAL_COMMA;
static const unsigned short WT_IGNORE;
static const unsigned short WT_LETTER;
static const unsigned short WT_OTHER;
// -----------------------------------------------------------------------
// Public Helper methods
// -----------------------------------------------------------------------
static int getOptionValue(const XMLCh ch);
// -----------------------------------------------------------------------
// Matching methods
// -----------------------------------------------------------------------
bool matches(const char* const matchString);
bool matches(const char* const matchString, const int start,
const int end);
bool matches(const char* const matchString, Match* const pMatch);
bool matches(const char* const matchString, const int start,
const int end, Match* const pMatch);
bool matches(const XMLCh* const matchString);
bool matches(const XMLCh* const matchString, const int start,
const int end);
bool matches(const XMLCh* const matchString, Match* const pMatch);
bool matches(const XMLCh* const matchString, const int start,
const int end, Match* const pMatch);
Gareth Reakes
committed
// -----------------------------------------------------------------------
// Tokenize methods
// -----------------------------------------------------------------------
// Note: The caller owns the string vector that is returned, and is responsible
// for deleting it.
Gareth Reakes
committed
RefArrayVectorOf<XMLCh> *tokenize(const char* const matchString);
RefArrayVectorOf<XMLCh> *tokenize(const char* const matchString, const int start,
const int end);
RefArrayVectorOf<XMLCh> *tokenize(const XMLCh* const matchString);
RefArrayVectorOf<XMLCh> *tokenize(const XMLCh* const matchString,
Gareth Reakes
committed
const int start, const int end);
// -----------------------------------------------------------------------
// Replace methods
// -----------------------------------------------------------------------
// Note: The caller owns the XMLCh* that is returned, and is responsible for
// deleting it.
Gareth Reakes
committed
XMLCh *replace(const char* const matchString, const char* const replaceString);
XMLCh *replace(const char* const matchString, const char* const replaceString,
const int start, const int end);
XMLCh *replace(const XMLCh* const matchString, const XMLCh* const replaceString);
XMLCh *replace(const XMLCh* const matchString, const XMLCh* const replaceString,
Gareth Reakes
committed
const int start, const int end);
private:
// -----------------------------------------------------------------------
// Private data types
// -----------------------------------------------------------------------
class XMLUTIL_EXPORT Context
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
{
public :
Context();
~Context();
inline const XMLCh* getString() const { return fString; }
void reset(const XMLCh* const string, const int start,
const int limit, const int noClosures);
bool nextCh(XMLInt32& ch, int& offset, const short direction);
bool fInUse;
bool fAdoptMatch;
int fStart;
int fLimit;
int fLength;
int fSize;
int* fOffsets;
Match* fMatch;
XMLCh* fString;
friend class Janitor<Context>;
};
// -----------------------------------------------------------------------
// Unimplemented constructors and operators
// -----------------------------------------------------------------------
RegularExpression(const RegularExpression&);
void operator=(const RegularExpression&);
// -----------------------------------------------------------------------
// Cleanup methods
// -----------------------------------------------------------------------
void cleanUp();
// -----------------------------------------------------------------------
// Setter methods
// -----------------------------------------------------------------------
void setPattern(const XMLCh* const pattern, const XMLCh* const options=0);
// -----------------------------------------------------------------------
// Private Helper methods
// -----------------------------------------------------------------------
void prepare();
int parseOptions(const XMLCh* const options);
bool isSet(const int options, const int flag);
unsigned short getWordType(const XMLCh* const target, const int begin,
const int end, const int offset);
unsigned short getCharType(const XMLCh ch);
unsigned short getPreviousWordType(const XMLCh* const target,
const int start, const int end,
int offset);
/**
* Matching helpers
*/
int match(Context* const context, const Op* const operations, int offset,
const short direction);
bool matchIgnoreCase(const XMLInt32 ch1, const XMLInt32 ch2);
/**
* Helper methods used by match(Context* ...)
*/
bool matchChar(Context* const context, const XMLInt32 ch, int& offset,
const short direction, const bool ignoreCase);
bool matchDot(Context* const context, int& offset, const short direction);
bool matchRange(Context* const context, const Op* const op,
int& offset, const short direction, const bool ignoreCase);
bool matchAnchor(Context* const context, const XMLInt32 ch,
const int offset);
bool matchBackReference(Context* const context, const XMLInt32 ch,
int& offset, const short direction,
const bool ignoreCase);
bool matchString(Context* const context, const XMLCh* const literal,
int& offset, const short direction, const bool ignoreCase);
int matchUnion(Context* const context, const Op* const op, int offset,
const short direction);
int matchCapture(Context* const context, const Op* const op, int offset,
const short direction);
bool matchCondition(Context* const context, const Op* const op, int offset,
const short direction);
int matchModifier(Context* const context, const Op* const op, int offset,
const short direction);
/**
Gareth Reakes
committed
* Tokenize helper
Gareth Reakes
committed
* This overloaded tokenize is for internal use only. It provides a way to
* keep track of the sub-expressions in each match of the pattern.
Gareth Reakes
committed
* It is called by the other tokenize methods, and by the replace method.
* The caller is responsible for the deletion of the returned
Gareth Reakes
committed
* RefArrayVectorOf<XMLCh*>
*/
RefArrayVectorOf<XMLCh> *tokenize(const XMLCh* const matchString,
const int start, const int end,
Gareth Reakes
committed
RefVectorOf<Match> *subEx);
/**
* Replace helpers
*
* Note: the caller owns the XMLCh* that is returned
*/
const XMLCh *subInExp(const XMLCh* const repString,
const XMLCh* const origString,
Gareth Reakes
committed
const Match* subEx);
/**
* Converts a token tree into an operation tree
*/
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
void compile(const Token* const token);
Op* compile(const Token* const token, Op* const next,
const bool reverse);
/**
* Helper methods used by compile
*/
Op* compileSingle(const Token* const token, Op* const next,
const unsigned short tokType);
Op* compileUnion(const Token* const token, Op* const next,
const bool reverse);
Op* compileCondition(const Token* const token, Op* const next,
const bool reverse);
Op* compileParenthesis(const Token* const token, Op* const next,
const bool reverse);
Op* compileLook(const Token* const token, const Op* const next,
const bool reverse, const unsigned short tokType);
Op* compileConcat(const Token* const token, Op* const next,
const bool reverse);
Op* compileClosure(const Token* const token, Op* const next,
const bool reverse, const unsigned short tokType);
// -----------------------------------------------------------------------
// Private data members
// -----------------------------------------------------------------------
bool fHasBackReferences;
bool fFixedStringOnly;
int fNoGroups;
int fMinLength;
int fNoClosures;
unsigned int fOptions;
Context* fContext;
BMPattern* fBMPattern;
XMLCh* fPattern;
XMLCh* fFixedString;
Op* fOperations;
Token* fTokenTree;
RangeToken* fFirstChar;
static RangeToken* fWordRange;
OpFactory fOpFactory;
XMLMutex fMutex;
TokenFactory* fTokenFactory;
};
// ---------------------------------------------------------------------------
//  RegularExpression: Cleanup methods
// ---------------------------------------------------------------------------
inline void RegularExpression::cleanUp() {
Gareth Reakes
committed
delete [] fPattern;
delete [] fFixedString;
delete fContext;
delete fBMPattern;
delete fTokenFactory;
}
// ---------------------------------------------------------------------------
//  RegularExpression: Helper methods
// ---------------------------------------------------------------------------
/**
 * Tells whether every bit of 'flag' is present in 'options'.
 *
 * @param options  the option bit set to inspect
 * @param flag     the bit (or bits) to look for
 * @return true when all bits of 'flag' are set in 'options'
 */
inline bool RegularExpression::isSet(const int options, const int flag) {

    return (options & flag) == flag;
}
Gareth Reakes
committed
/**
 * Compiles a look-ahead / look-behind, independent or modifier group token.
 *
 * The first child of 'token' is compiled on its own (with no successor)
 * and wrapped in the operation that corresponds to 'tokType'.
 *
 * @param token    the group token; for T_MODIFIERGROUP it is treated as a
 *                 ModifierToken
 * @param next     the operation that follows the group
 * @param reverse  passed through to compile() for the child
 * @param tokType  the token type selecting which operation is created
 * @return the created operation, or 0 when 'tokType' is none of the
 *         handled types
 */
inline Op* RegularExpression::compileLook(const Token* const token,
                                          const Op* const next,
                                          const bool reverse,
                                          const unsigned short tokType) {

    Op* ret = 0;
    Op* result = compile(token->getChild(0), 0, reverse);

    switch(tokType) {
    case Token::T_LOOKAHEAD:
        ret = fOpFactory.createLookOp(Op::O_LOOKAHEAD, next, result);
        break;
    case Token::T_NEGATIVELOOKAHEAD:
        ret = fOpFactory.createLookOp(Op::O_NEGATIVELOOKAHEAD, next, result);
        break;
    case Token::T_LOOKBEHIND:
        ret = fOpFactory.createLookOp(Op::O_LOOKBEHIND, next, result);
        break;
    case Token::T_NEGATIVELOOKBEHIND:
        ret = fOpFactory.createLookOp(Op::O_NEGATIVELOOKBEHIND, next, result);
        break;
    case Token::T_INDEPENDENT:
        ret = fOpFactory.createIndependentOp(next, result);
        break;
    case Token::T_MODIFIERGROUP:
        ret = fOpFactory.createModifierOp(next, result,
                                   ((ModifierToken *) token)->getOptions(),
                                   ((ModifierToken *) token)->getOptionsMask());
        break;
    }

    return ret;
}
/**
 * Compiles a leaf token (dot, char, anchor, range, string, back reference
 * or empty) into a single operation chained in front of 'next'.
 *
 * @param token    the leaf token to compile
 * @param next     the operation that follows; for T_EMPTY it is returned
 *                 unchanged
 * @param tokType  the token type selecting which operation is created
 * @return the created operation (or 'next' for T_EMPTY); 0 when 'tokType'
 *         is none of the handled types
 */
inline Op* RegularExpression::compileSingle(const Token* const token,
                                            Op* const next,
                                            const unsigned short tokType) {

    Op* ret = 0;

    switch (tokType) {
    case Token::T_DOT:
        ret = fOpFactory.createDotOp();
        break;
    case Token::T_CHAR:
        ret = fOpFactory.createCharOp(token->getChar());
        break;
    case Token::T_ANCHOR:
        ret = fOpFactory.createAnchorOp(token->getChar());
        break;
    case Token::T_RANGE:
    case Token::T_NRANGE:
        ret = fOpFactory.createRangeOp(token);
        break;
    case Token::T_EMPTY:
        ret = next;
        break;
    case Token::T_STRING:
        ret = fOpFactory.createStringOp(token->getString());
        break;
    case Token::T_BACKREFERENCE:
        ret = fOpFactory.createBackReferenceOp(token->getReferenceNo());
        break;
    }

    // T_EMPTY already *is* 'next', so it must not be linked to itself.
    // 'ret' stays 0 when no case above matched; do not dereference it then.
    if (ret != 0 && tokType != Token::T_EMPTY)
        ret->setNextOp(next);

    return ret;
}
/**
 * Compiles a union (alternation) token.
 *
 * Each child of 'token' is compiled with the same successor 'next' and
 * added, in child order, to a newly created union operation.
 *
 * @param token    the union token
 * @param next     the operation that follows every alternative
 * @param reverse  passed through to compile() for each child
 * @return the union operation holding the compiled alternatives
 */
inline Op* RegularExpression::compileUnion(const Token* const token,
                                           Op* const next,
                                           const bool reverse) {

    int tokSize = token->size();
    UnionOp* uniOp = fOpFactory.createUnionOp(tokSize);

    for (int i=0; i<tokSize; i++) {

        uniOp->addElement(compile(token->getChild(i), next, reverse));
    }

    return uniOp;
}
Gareth Reakes
committed
/**
 * Compiles a condition token into a condition operation.
 *
 * 'token' is treated as a ConditionToken. Its condition token (if any) is
 * compiled without a successor; its first child (the "yes" branch) and
 * its second child (the "no" branch, if any) are compiled with 'next' as
 * successor.
 *
 * @param token    the condition token
 * @param next     the operation that follows the condition
 * @param reverse  passed through to compile() for the sub-tokens
 * @return the condition operation
 */
inline Op* RegularExpression::compileCondition(const Token* const token,
                                               Op* const next,
                                               const bool reverse) {

    Token* condTok = ((ConditionToken*) token)->getConditionToken();
    Token* yesTok  = token->getChild(0);
    Token* noTok   = token->getChild(1);
    int    refNo   = token->getReferenceNo();
    // The condition and the "no" branch are optional; a missing one maps to 0.
    Op*    condOp  = (condTok == 0) ? 0 : compile(condTok, 0, reverse);
    Op*    yesOp   = compile(yesTok, next, reverse);
    Op*    noOp    = (noTok == 0) ? 0 : compile(noTok, next, reverse);

    return fOpFactory.createConditionOp(next, refNo, condOp, yesOp, noOp);
}
Gareth Reakes
committed
/**
 * Compiles a parenthesis token.
 *
 * When the token's paren number is 0 the child is compiled directly.
 * Otherwise the compiled child is enclosed between two capture
 * operations, one created with the paren number and one with its
 * negation; which of the two comes first depends on 'reverse'.
 *
 * @param token    the parenthesis token
 * @param next     the operation that follows the group
 * @param reverse  selects the order of the two capture operations and is
 *                 passed through to compile() for the child
 * @return the first operation of the compiled group
 */
inline Op* RegularExpression::compileParenthesis(const Token* const token,
                                                 Op* const next,
                                                 const bool reverse) {

    if (token->getNoParen() == 0)
        return compile(token->getChild(0), next, reverse);

    Op* captureOp = 0;

    if (reverse) {

        captureOp = fOpFactory.createCaptureOp(token->getNoParen(), next);
        captureOp = compile(token->getChild(0), captureOp, reverse);

        return fOpFactory.createCaptureOp(-token->getNoParen(), captureOp);
    }

    captureOp = fOpFactory.createCaptureOp(-token->getNoParen(), next);
    captureOp = compile(token->getChild(0), captureOp, reverse);

    return fOpFactory.createCaptureOp(token->getNoParen(), captureOp);
}
Gareth Reakes
committed
/**
 * Compiles a concatenation token.
 *
 * The children are compiled one after another, each receiving the result
 * of the previous compilation as its successor. When 'reverse' is false
 * the children are visited from last to first; when it is true they are
 * visited from first to last.
 *
 * @param token    the concatenation token
 * @param next     the operation that follows the whole sequence
 * @param reverse  selects the visiting order and is passed to compile()
 * @return the first operation of the compiled sequence ('next' when the
 *         token has no children)
 */
inline Op* RegularExpression::compileConcat(const Token* const token,
                                            Op* const next,
                                            const bool reverse) {

    Op* ret = next;
    int tokSize = token->size();

    if (!reverse) {

        for (int i= tokSize - 1; i>=0; i--) {
            ret = compile(token->getChild(i), ret, false);
        }
    }
    else {

        for (int i= 0; i< tokSize; i++) {
            ret = compile(token->getChild(i), ret, true);
        }
    }

    return ret;
}
Gareth Reakes
committed
/**
 * Compiles a closure (repetition) token.
 *
 * Three shapes are produced, depending on the token's min/max values:
 *   - min == max (and min >= 0): the child is compiled 'min' times in a
 *     row in front of 'next';
 *   - a positive max: 'max' (less 'min', when both are positive) nested
 *     question operations are built, followed by 'min' leading copies of
 *     the child;
 *   - otherwise: a single closure operation whose child loops back to the
 *     closure itself, again preceded by 'min' leading copies of the child.
 *
 * @param token    the closure token
 * @param next     the operation that follows the repetition
 * @param reverse  passed through to compile() for the child
 * @param tokType  T_NONGREEDYCLOSURE selects the non-greedy operations
 * @return the first operation of the compiled repetition
 */
inline Op* RegularExpression::compileClosure(const Token* const token,
                                             Op* const next,
                                             const bool reverse,
                                             const unsigned short tokType) {

    Op*    ret      = 0;
    Token* childTok = token->getChild(0);
    int    min      = token->getMin();
    int    max      = token->getMax();

    // Exact repetition count: just unroll the child 'min' times.
    if (min >= 0 && min == max) {

        ret = next;
        for (int i=0; i< min; i++) {
            ret = compile(childTok, ret, reverse);
        }

        return ret;
    }

    // From here on 'max' counts only the repetitions beyond 'min'.
    if (min > 0 && max > 0)
        max -= min;

    if (max > 0) {

        ret = next;
        for (int i=0; i<max; i++) {

            ChildOp* childOp = fOpFactory.createQuestionOp(
                tokType == Token::T_NONGREEDYCLOSURE);

            childOp->setNextOp(next);
            childOp->setChild(compile(childTok, ret, reverse));
            ret = childOp;
        }
    }
    else {

        ChildOp* childOp = 0;

        if (tokType == Token::T_NONGREEDYCLOSURE) {
            childOp = fOpFactory.createNonGreedyClosureOp();
        }
        else {

            // Only a child that may match the empty string gets its own
            // closure number; every other closure is created with -1.
            if (childTok->getMinLength() == 0)
                childOp = fOpFactory.createClosureOp(fNoClosures++);
            else
                childOp = fOpFactory.createClosureOp(-1);
        }

        childOp->setNextOp(next);
        // The child's successor is the closure itself, forming the loop.
        childOp->setChild(compile(childTok, childOp, reverse));
        ret = childOp;
    }

    // Prepend the mandatory repetitions.
    if (min > 0) {

        for (int i=0; i< min; i++) {
            ret = compile(childTok, ret, reverse);
        }
    }

    return ret;
}
Gareth Reakes
committed
/**
 * Tries each alternative of a union operation in turn.
 *
 * An alternative is accepted only when matching it returns the context's
 * limit (context->fLimit); the first such alternative wins.
 *
 * @param context    the current match context
 * @param op         the union operation whose elements are tried
 * @param offset     the offset at which every alternative is started
 * @param direction  passed through to match()
 * @return context->fLimit when an alternative is accepted, -1 otherwise
 */
inline int RegularExpression::matchUnion(Context* const context,
                                         const Op* const op, int offset,
                                         const short direction)
{
    unsigned int opSize = op->getSize();
    int ret = -1;

    for (unsigned int i=0; i < opSize; i++) {

        ret = match(context, op->elementAt(i), offset, direction);

        if (ret == context->fLimit)
            return ret;
    }

    return -1;
}
Gareth Reakes
committed
/**
 * Matches the child of a modifier operation under temporarily changed
 * options.
 *
 * The bits from op->getData() are switched on and the bits from
 * op->getData2() are switched off in fOptions for the duration of the
 * child match; the previous options are put back afterwards.
 *
 * @param context    the current match context
 * @param op         the modifier operation
 * @param offset     the offset at which the child match starts
 * @param direction  passed through to match()
 * @return whatever match() returns for the child
 */
inline int RegularExpression::matchModifier(Context* const context,
                                            const Op* const op, int offset,
                                            const short direction)
{
    int saveOptions = fOptions;
    fOptions |= (int) op->getData();
    fOptions &= (int) ~op->getData2();

    int ret = match(context, op->getChild(), offset, direction);

    fOptions = saveOptions;

    return ret;
}
Gareth Reakes
committed
inline unsigned short RegularExpression::getWordType(const XMLCh* const target
, const int begin
, const int end
, const int offset)
{
if (offset < begin || offset >= end)
return WT_OTHER;
Gareth Reakes
committed
return getCharType(target[offset]);
}
Gareth Reakes
committed
inline
unsigned short RegularExpression::getPreviousWordType(const XMLCh* const target
, const int start
, const int end
, int offset)
{
unsigned short ret = getWordType(target, start, end, --offset);
Gareth Reakes
committed
while (ret == WT_IGNORE) {
ret = getWordType(target, start, end, --offset);
}
Gareth Reakes
committed
return ret;
}
Gareth Reakes
committed
inline bool RegularExpression::matchIgnoreCase(const XMLInt32 ch1,
Gareth Reakes
committed
return (0==XMLString::compareNIString((XMLCh*)&ch1,(XMLCh*)&ch2, 1));
Gareth Reakes
committed
#endif
/**
* End of file RegularExpression.hpp
*/