Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
/*
* The Apache Software License, Version 1.1
*
* Copyright (c) 1999-2000 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Xerces" and "Apache Software Foundation" must
* not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact apache\@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* nor may "Apache" appear in their name, without prior written
* permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation, and was
* originally based on software copyright (c) 1999, International
* Business Machines, Inc., http://www.ibm.com . For more information
* on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
/**
* $Id$
*/
// ---------------------------------------------------------------------------
// Includes
// ---------------------------------------------------------------------------
#include <xercesc/util/PlatformUtils.hpp>
#include <xercesc/util/XMLUniDefs.hpp>
#include <xercesc/util/XMLString.hpp>
#include <xercesc/util/TransService.hpp>
#include <xercesc/util/TranscodingException.hpp>
#include <xercesc/util/XMLExceptMsgs.hpp>
#include <xercesc/framework/XMLFormatter.hpp>
#include <xercesc/util/Janitor.hpp>
#include <xercesc/util/XMLChar.hpp>
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
// ---------------------------------------------------------------------------
// Local data
//
// gXXXRef
// These are hard coded versions of the char refs we put out for the
// standard char refs.
//
// gEscapeChars
// For each style of escape, we have a list of the chars that must
// be escaped for that style. The first null hit in each list indicates
// no more valid entries in that list. The first entry is a dummy for
// the NoEscapes style.
// ---------------------------------------------------------------------------
// Text of the entity reference "&amp;", terminated by chNull.
static const XMLCh gAmpRef[] =
{
chAmpersand, chLatin_a, chLatin_m, chLatin_p, chSemiColon, chNull
};
// Text of the entity reference "&apos;", terminated by chNull.
static const XMLCh gAposRef[] =
{
chAmpersand, chLatin_a, chLatin_p, chLatin_o, chLatin_s, chSemiColon, chNull
};
// Text of the entity reference "&gt;", terminated by chNull.
static const XMLCh gGTRef[] =
{
chAmpersand, chLatin_g, chLatin_t, chSemiColon, chNull
};
// Text of the entity reference "&lt;", terminated by chNull.
static const XMLCh gLTRef[] =
{
chAmpersand, chLatin_l, chLatin_t, chSemiColon, chNull
};
// Text of the entity reference "&quot;", terminated by chNull.
static const XMLCh gQuoteRef[] =
{
chAmpersand, chLatin_q, chLatin_u, chLatin_o, chLatin_t, chSemiColon, chNull
};
// Width of every row of gEscapeChars: room for the five escapable characters
// plus a terminating chNull.
static const unsigned int kEscapeCount = 6;
// One row per escape style; inEscapeList() indexes this table directly with
// the EscapeFlags value and scans the row up to the first chNull.
// NOTE(review): rows two to four presumably correspond to the StdEscapes,
// AttrEscapes and CharEscapes enumerators, in that order -- confirm against
// the EscapeFlags declaration in XMLFormatter.hpp.
static const XMLCh gEscapeChars[XMLFormatter::EscapeFlags_Count][kEscapeCount] =
{
// Dummy row: nothing is escaped.
{ chNull , chNull , chNull , chNull , chNull , chNull }
// Escapes ampersand, close angle, double quote, open angle and single quote.
, { chAmpersand , chCloseAngle , chDoubleQuote , chOpenAngle , chSingleQuote , chNull }
// Escapes ampersand, open angle and double quote.
, { chAmpersand , chOpenAngle , chDoubleQuote , chNull , chNull , chNull }
// Escapes ampersand, open angle and close angle.
, { chAmpersand , chOpenAngle , chCloseAngle , chNull , chNull , chNull }
};
// ---------------------------------------------------------------------------
// Local methods
// ---------------------------------------------------------------------------
//
//  Decides whether a character must be written in escaped form.
//
//  escStyle  Selects the row of gEscapeChars to consult. The value is used
//            directly as an array index, so it must be a concrete style
//            below EscapeFlags_Count (formatBuf resolves DefaultEscape to
//            the formatter's own setting before calling this).
//  toCheck   The character being examined.
//
//  Returns true if toCheck appears in the escape list for escStyle, or if
//  the document is XML 1.1 and toCheck is a non-whitespace control
//  character; false otherwise.
//
bool XMLFormatter::inEscapeList(const XMLFormatter::EscapeFlags escStyle
                              , const XMLCh                     toCheck)
{
    // Each row is terminated by the first chNull entry.
    for (const XMLCh* escList = gEscapeChars[escStyle]; *escList; ++escList)
    {
        if (*escList == toCheck)
            return true;
    }

    /***
     *  XML1.1
     *
     *  Finally, there is considerable demand to define a standard representation of
     *  arbitrary Unicode characters in XML documents. Therefore, XML 1.1 allows the
     *  use of character references to the control characters #x1 through #x1F,
     *  most of which are forbidden in XML 1.0. For reasons of robustness, however,
     *  these characters still cannot be used directly in documents.
     *  In order to improve the robustness of character encoding detection, the
     *  additional control characters #x7F through #x9F, which were freely allowed in
     *  XML 1.0 documents, now must also appear only as character references.
     *  (Whitespace characters are of course exempt.) The minor sacrifice of backward
     *  compatibility is considered not significant.
     *  Due to potential problems with APIs, #x0 is still forbidden both directly and
     *  as a character reference.
     *
     ***/
    // Outside the explicit list, only XML 1.1 control characters that are
    // not whitespace need a character reference.
    return fIsXML11
        && XMLChar1_1::isControlChar(toCheck, 0)
        && !XMLChar1_1::isWhitespace(toCheck, 0);
}
// ---------------------------------------------------------------------------
// XMLFormatter: Constructors and Destructor
// ---------------------------------------------------------------------------
XMLFormatter::XMLFormatter( const char* const outEncoding
, const char* const docVersion
, XMLFormatTarget* const target
, const EscapeFlags escapeFlags
Khaled Noaman
committed
, const UnRepFlags unrepFlags
, MemoryManager* const manager)
: fEscapeFlags(escapeFlags)
, fOutEncoding(0)
, fTarget(target)
, fUnRepFlags(unrepFlags)
, fIsXML11(false)
Khaled Noaman
committed
, fMemoryManager(manager)
fOutEncoding = XMLString::transcode(outEncoding, fMemoryManager);
// Try to create a transcoder for this encoding
XMLTransService::Codes resCode;
fXCoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
(
fOutEncoding
, resCode
, kTmpBufSize
, fMemoryManager
fMemoryManager->deallocate(fOutEncoding); //delete [] fOutEncoding;
ThrowXML1
(
TranscodingException
, XMLExcepts::Trans_CantCreateCvtrFor
, outEncoding
);
}
XMLCh* const tmpDocVer = XMLString::transcode(docVersion, fMemoryManager);
ArrayJanitor<XMLCh> jname(tmpDocVer, fMemoryManager);
fIsXML11 = XMLString::equals(tmpDocVer, XMLUni::fgVersion1_1);
}
XMLFormatter::XMLFormatter( const XMLCh* const outEncoding
, const XMLCh* const docVersion
, XMLFormatTarget* const target
, const EscapeFlags escapeFlags
Khaled Noaman
committed
, const UnRepFlags unrepFlags
, MemoryManager* const manager)
: fEscapeFlags(escapeFlags)
, fOutEncoding(0)
, fTarget(target)
, fUnRepFlags(unrepFlags)
, fAposLen(0)
, fAmpRef(0)
, fAmpLen(0)
, fQuoteLen(0)
, fIsXML11(false)
Khaled Noaman
committed
, fMemoryManager(manager)
{
// Try to create a transcoder for this encoding
XMLTransService::Codes resCode;
fXCoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
(
, fMemoryManager
);
if (!fXCoder)
{
ThrowXML1
(
TranscodingException
, XMLExcepts::Trans_CantCreateCvtrFor
, outEncoding
);
}
// Copy the encoding string
fOutEncoding = XMLString::replicate(outEncoding, fMemoryManager);
fIsXML11 = XMLString::equals(docVersion, XMLUni::fgVersion1_1);
XMLFormatter::XMLFormatter( const char* const outEncoding
, XMLFormatTarget* const target
, const EscapeFlags escapeFlags
, const UnRepFlags unrepFlags
, MemoryManager* const manager)
Neil Graham
committed
: fEscapeFlags(escapeFlags)
, fOutEncoding(0)
, fTarget(target)
, fUnRepFlags(unrepFlags)
, fXCoder(0)
, fAposRef(0)
, fAposLen(0)
, fAmpRef(0)
, fAmpLen(0)
, fGTRef(0)
, fGTLen(0)
, fLTRef(0)
, fLTLen(0)
, fQuoteRef(0)
, fQuoteLen(0)
, fIsXML11(false)
, fMemoryManager(manager)
Neil Graham
committed
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
// this constructor uses "1.0" for the docVersion
// Transcode the encoding string
fOutEncoding = XMLString::transcode(outEncoding, fMemoryManager);
// Try to create a transcoder for this encoding
XMLTransService::Codes resCode;
fXCoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
(
fOutEncoding
, resCode
, kTmpBufSize
, fMemoryManager
);
if (!fXCoder)
{
fMemoryManager->deallocate(fOutEncoding); //delete [] fOutEncoding;
ThrowXML1
(
TranscodingException
, XMLExcepts::Trans_CantCreateCvtrFor
, outEncoding
);
}
//XMLCh* const tmpDocVer = XMLString::transcode("1.0", fMemoryManager);
//ArrayJanitor<XMLCh> jname(tmpDocVer, fMemoryManager);
//fIsXML11 = XMLString::equals(tmpDocVer, XMLUni::fgVersion1_1);
fIsXML11 = false; // docVersion 1.0 is not 1.1!
}
XMLFormatter::XMLFormatter( const XMLCh* const outEncoding
, XMLFormatTarget* const target
, const EscapeFlags escapeFlags
, const UnRepFlags unrepFlags
, MemoryManager* const manager)
Neil Graham
committed
: fEscapeFlags(escapeFlags)
, fOutEncoding(0)
, fTarget(target)
, fUnRepFlags(unrepFlags)
, fXCoder(0)
, fAposRef(0)
, fAposLen(0)
, fAmpRef(0)
, fAmpLen(0)
, fGTRef(0)
, fGTLen(0)
, fLTRef(0)
, fLTLen(0)
, fQuoteRef(0)
, fQuoteLen(0)
, fIsXML11(false)
, fMemoryManager(manager)
Neil Graham
committed
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
// this constructor uses XMLUni::fgVersion1_0 for the docVersion
// Try to create a transcoder for this encoding
XMLTransService::Codes resCode;
fXCoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
(
outEncoding
, resCode
, kTmpBufSize
, fMemoryManager
);
if (!fXCoder)
{
ThrowXML1
(
TranscodingException
, XMLExcepts::Trans_CantCreateCvtrFor
, outEncoding
);
}
// Copy the encoding string
fOutEncoding = XMLString::replicate(outEncoding, fMemoryManager);
//fIsXML11 = XMLString::equals(docVersion, XMLUni::fgVersion1_1);
fIsXML11 = false; // docVersion 1.0 is not 1.1!
fMemoryManager->deallocate(fAposRef); //delete [] fAposRef;
fMemoryManager->deallocate(fAmpRef); //delete [] fAmpRef;
fMemoryManager->deallocate(fGTRef); //delete [] fGTRef;
fMemoryManager->deallocate(fLTRef); //delete [] fLTRef;
fMemoryManager->deallocate(fQuoteRef); //delete [] fQuoteRef;
fMemoryManager->deallocate(fOutEncoding); //delete [] fOutEncoding;
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
delete fXCoder;
// We DO NOT own the target object!
}
// ---------------------------------------------------------------------------
// XMLFormatter: Formatting methods
// ---------------------------------------------------------------------------
void
XMLFormatter::formatBuf(const XMLCh* const toFormat
, const unsigned int count
, const EscapeFlags escapeFlags
, const UnRepFlags unrepFlags)
{
//
// Figure out the actual escape flag value. If the parameter is not
// the default, then take it. Else take the current default.
//
const EscapeFlags actualEsc = (escapeFlags == DefaultEscape)
? fEscapeFlags : escapeFlags;
// And do the same for the unrep flags
const UnRepFlags actualUnRep = (unrepFlags == DefaultUnRep)
? fUnRepFlags : unrepFlags;
//
// If the actual unrep action is that they want to provide char refs
// for unrepresentable chars, then this one is a much more difficult
// one to do cleanly, and we handle it separately.
//
if (actualUnRep == UnRep_CharRef)
{
specialFormat(toFormat, count, actualEsc);
return;
}
//
// If we don't have any escape flags set, then we can do the most
// efficient loop, else we have to do it the hard way.
//
const XMLCh* srcPtr = toFormat;
const XMLCh* endPtr = toFormat + count;
if (actualEsc == NoEscapes)
{
//
// Just do a whole buffer at a time into the temp buffer, cap
// it off, and send it to the target.
if (srcPtr < endPtr)
srcPtr += handleUnEscapedChars(srcPtr, endPtr - srcPtr, actualUnRep);
// Escape chars that require it according to the escape flags
// we were given. For the others, try to accumulate them and
// format them in as big a bulk as we can.
//
while (srcPtr < endPtr)
{
//
// Run a temp pointer up until we hit a character that we have
// to escape. Then we can convert all the chars between our
// current source pointer and here all at once.
//
const XMLCh* tmpPtr = srcPtr;
while ((tmpPtr < endPtr) && !inEscapeList(actualEsc, *tmpPtr))
tmpPtr++;
//
// If we got any chars, then lets convert them and write them
// out.
//
if (tmpPtr > srcPtr)
srcPtr += handleUnEscapedChars(srcPtr, tmpPtr - srcPtr,
actualUnRep);
else if (tmpPtr < endPtr)
{
//
// Ok, so we've hit a char that must be escaped. So do
// this one specially.
//
const XMLByte * theChars;
switch (*srcPtr) {
theChars = getCharRef(fAmpLen, fAmpRef, gAmpRef);
fTarget->writeChars(theChars, fAmpLen, this);
theChars = getCharRef(fAposLen, fAposRef, gAposRef);
fTarget->writeChars(theChars, fAposLen, this);
theChars = getCharRef(fQuoteLen, fQuoteRef, gQuoteRef);
fTarget->writeChars(theChars, fQuoteLen, this);
theChars = getCharRef(fGTLen, fGTRef, gGTRef);
fTarget->writeChars(theChars, fGTLen, this);
theChars = getCharRef(fLTLen, fLTRef, gLTRef);
fTarget->writeChars(theChars, fLTLen, this);
// control characters
writeCharRef(*srcPtr);
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
unsigned int
XMLFormatter::handleUnEscapedChars(const XMLCh * srcPtr,
const unsigned int oCount,
const UnRepFlags actualUnRep)
{
//
// Use that to figure out what I should pass to the transcoder. If we
// are doing character references or failing for unrepresentable chars,
// then we just throw, since we should never get a call for something
// we cannot represent. Else, we tell it to just use the replacement
// char.
//
const XMLTranscoder::UnRepOpts unRepOpts = (actualUnRep == UnRep_Replace)
? XMLTranscoder::UnRep_RepChar
: XMLTranscoder::UnRep_Throw;
unsigned int charsEaten;
unsigned int count = oCount;
while (count) {
const unsigned srcChars
= count > kTmpBufSize ? kTmpBufSize : count;
const unsigned int outBytes
= fXCoder->transcodeTo(srcPtr, srcChars,
fTmpBuf, kTmpBufSize,
charsEaten, unRepOpts);
if (outBytes) {
fTmpBuf[outBytes] = 0; fTmpBuf[outBytes + 1] = 0;
fTmpBuf[outBytes + 2] = 0; fTmpBuf[outBytes + 3] = 0;
fTarget->writeChars(fTmpBuf, outBytes, this);
}
srcPtr += charsEaten;
count -= charsEaten;
}
return oCount; // This should be an assertion that count == 0.
}
//
//  Formats a null-terminated string, leaving the escape and unrep flags at
//  formatBuf's default arguments. Returns this formatter so that insertions
//  can be chained.
//
XMLFormatter& XMLFormatter::operator<<(const XMLCh* const toFormat)
{
    formatBuf(toFormat, XMLString::stringLen(toFormat));
    return *this;
}
//
//  Formats a single character, leaving the escape and unrep flags at
//  formatBuf's default arguments. Returns this formatter so that insertions
//  can be chained.
//
XMLFormatter& XMLFormatter::operator<<(const XMLCh toFormat)
{
    // Wrap the character in a one-character, null-terminated string
    const XMLCh oneChar[2] = { toFormat, 0 };

    formatBuf(oneChar, 1);
    return *this;
}
/**
* Writes a byte order mark to the format target exactly as given, without
* transcoding or escaping.
*
* The explicit count parameter is needed because stringLen() does not
* work on a BOM such as "0xFE0xFF0x000x00" or "0x000x000xFF0xFE".
*
* @param toFormat the BOM bytes to write
* @param count the number of bytes in toFormat
**/
void XMLFormatter::writeBOM(const XMLByte* const toFormat
, const unsigned int count)
{
// The bytes are handed to the target directly; formatBuf is not involved.
fTarget->writeChars(toFormat, count, this);
}
// ---------------------------------------------------------------------------
// XMLFormatter: Private helper methods
// ---------------------------------------------------------------------------
void XMLFormatter::writeCharRef(const XMLCh &toWrite)
{
XMLCh tmpBuf[32];
tmpBuf[0] = chAmpersand;
tmpBuf[1] = chPound;
tmpBuf[2] = chLatin_x;
// Build a char ref for the current char
XMLString::binToText(toWrite, &tmpBuf[3], 8, 16);
const unsigned int bufLen = XMLString::stringLen(tmpBuf);
tmpBuf[bufLen] = chSemiColon;
tmpBuf[bufLen+1] = chNull;
// write it out
formatBuf(tmpBuf
, bufLen + 1
, XMLFormatter::NoEscapes
, XMLFormatter::UnRep_Fail);
}
void XMLFormatter::writeCharRef(unsigned long toWrite)
{
XMLCh tmpBuf[32];
tmpBuf[0] = chAmpersand;
tmpBuf[1] = chPound;
tmpBuf[2] = chLatin_x;
// Build a char ref for the current char
XMLString::binToText(toWrite, &tmpBuf[3], 8, 16);
const unsigned int bufLen = XMLString::stringLen(tmpBuf);
tmpBuf[bufLen] = chSemiColon;
tmpBuf[bufLen+1] = chNull;
// write it out
formatBuf(tmpBuf
, bufLen + 1
, XMLFormatter::NoEscapes
, XMLFormatter::UnRep_Fail);
}
const XMLByte* XMLFormatter::getCharRef(unsigned int & count,
const XMLCh * stdRef)
unsigned int charsEaten;
const unsigned int outBytes =
fXCoder->transcodeTo(stdRef, XMLString::stringLen(stdRef),
fTmpBuf, kTmpBufSize, charsEaten,
XMLTranscoder::UnRep_Throw);
fTmpBuf[outBytes] = 0;
fTmpBuf[outBytes + 1] = 0;
fTmpBuf[outBytes + 2] = 0;
fTmpBuf[outBytes + 3] = 0;
ref = (XMLByte*) fMemoryManager->allocate
(
(outBytes + 4) * sizeof(XMLByte)
);//new XMLByte[outBytes + 4];
memcpy(ref, fTmpBuf, outBytes + 4);
count = outBytes;
}
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
}
//
//  Formats a buffer under the "character reference" policy for
//  unrepresentable characters.
//
//  The input is split into alternating runs. A run the transcoder can
//  represent is passed to formatBuf in one go, using the given escape flags
//  and UnRep_Fail. A run it cannot represent is written character by
//  character as hexadecimal character references, with a UTF-16 surrogate
//  pair recombined into one code point first.
//
//  toFormat     Start of the characters to format.
//  count        Number of XMLCh units in toFormat.
//  escapeFlags  Escape style applied to the representable runs.
//
//  Nothing at or beyond toFormat + count is ever read. A high surrogate
//  that is not followed, inside the buffer, by a low surrogate is written
//  as its own character reference rather than being merged with whatever
//  unit comes next.
//
void XMLFormatter::specialFormat(const XMLCh* const    toFormat
                                , const unsigned int    count
                                , const EscapeFlags     escapeFlags)
{
    const XMLCh*        srcPtr = toFormat;
    const XMLCh* const  endPtr = toFormat + count;

    while (srcPtr < endPtr)
    {
        // Find the end of the run that can be transcoded normally
        const XMLCh* tmpPtr = srcPtr;
        while ((tmpPtr < endPtr) && fXCoder->canTranscodeTo(*tmpPtr))
            tmpPtr++;

        if (tmpPtr > srcPtr)
        {
            // We got at least some chars that can be done normally
            formatBuf
            (
                srcPtr
                , tmpPtr - srcPtr
                , escapeFlags
                , XMLFormatter::UnRep_Fail
            );

            // Update the source pointer to our new spot
            srcPtr = tmpPtr;
        }
        else
        {
            //
            //  We hit something unrepresentable. So continue forward doing
            //  char refs until we hit something representable again or the
            //  end of input.
            //
            while (srcPtr < endPtr)
            {
                const bool isHighSurrogate = ((*srcPtr & 0xFC00) == 0xD800);

                if (isHighSurrogate
                &&  (srcPtr + 1 < endPtr)
                &&  ((srcPtr[1] & 0xFC00) == 0xDC00))
                {
                    //
                    //  A complete surrogate pair: recombine it and use the
                    //  writeCharRef overload taking unsigned long, so that
                    //  values above 0xFFFF are written in full.
                    //
                    const unsigned long codePoint =
                        0x10000UL
                        + (static_cast<unsigned long>(*srcPtr - 0xD800) << 10)
                        + static_cast<unsigned long>(srcPtr[1] - 0xDC00);

                    writeCharRef(codePoint);
                    srcPtr += 2;
                }
                else
                {
                    // Ordinary character, or a surrogate with no partner
                    writeCharRef(*srcPtr);
                    srcPtr++;
                }

                //
                //  Stop at the end of input, or as soon as the next char
                //  is representable again. The bounds test must come
                //  first so that endPtr itself is never dereferenced.
                //
                if ((srcPtr >= endPtr) || fXCoder->canTranscodeTo(*srcPtr))
                    break;
            }
        }
    }
}