Newer
Older
* Copyright (c) 1999-2000 The Apache Software Foundation. All rights
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Xerces" and "Apache Software Foundation" must
* not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact apache\@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* nor may "Apache" appear in their name, without prior written
* permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation, and was
* originally based on software copyright (c) 1999, International
* Business Machines, Inc., http://www.ibm.com . For more information
* on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
*/
// ---------------------------------------------------------------------------
// This sample program which invokes the DOMParser to build a DOM tree for
// the specified input file. It then walks the tree, and prints out the data
// as an XML file.
//
// Limitations:
// 1. The encoding="xxx" clause in the XML header should reflect
// the system local code page, but does not.
// 2. Cases where the XML data contains characters that can not
// be represented in the system local code page are not handled.
// 3. Enabled namespace processing won't affect the output, since
// DOM doesn't do namespace yet. But it will confirm that all
// prefixes are correctly mapped, else you'll get errors.
// ---------------------------------------------------------------------------
// ---------------------------------------------------------------------------
// Includes
// ---------------------------------------------------------------------------
#include <util/PlatformUtils.hpp>
#include <util/XMLUniDefs.hpp>
#include <framework/XMLFormatter.hpp>
#include <util/TranscodingException.hpp>
#include "DOMTreeErrorReporter.hpp"
#include <string.h>
#include <stdlib.h>
// ---------------------------------------------------------------------------
// Local const data
//
// Note: This is the 'safe' way to do these strings. If you compiler supports
// L"" style strings, and portability is not a concern, you can use
// those types constants directly.
// ---------------------------------------------------------------------------
// "</" -- written in front of the element name when closing an element.
static const XMLCh gEndElement[] =
{
    chOpenAngle, chForwardSlash, chNull
};

// "?>" -- terminates a processing instruction.
static const XMLCh gEndPI[] =
{
    chQuestion, chCloseAngle, chNull
};

// "<?" -- introduces a processing instruction.
static const XMLCh gStartPI[] =
{
    chOpenAngle, chQuestion, chNull
};
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
// The four pieces below are wrapped around the version, encoding and
// standalone values when an XML declaration is written out.

// <?xml version="
static const XMLCh gXMLDecl1[] =
{
    chOpenAngle, chQuestion, chLatin_x, chLatin_m, chLatin_l,
    chSpace, chLatin_v, chLatin_e, chLatin_r, chLatin_s, chLatin_i,
    chLatin_o, chLatin_n, chEqual, chDoubleQuote, chNull
};

// " encoding="
static const XMLCh gXMLDecl2[] =
{
    chDoubleQuote, chSpace, chLatin_e, chLatin_n, chLatin_c,
    chLatin_o, chLatin_d, chLatin_i, chLatin_n, chLatin_g, chEqual,
    chDoubleQuote, chNull
};

// " standalone="
static const XMLCh gXMLDecl3[] =
{
    chDoubleQuote, chSpace, chLatin_s, chLatin_t, chLatin_a,
    chLatin_n, chLatin_d, chLatin_a, chLatin_l, chLatin_o,
    chLatin_n, chLatin_e, chEqual, chDoubleQuote, chNull
};

// "?> followed by CR LF
static const XMLCh gXMLDecl4[] =
{
    chDoubleQuote, chQuestion, chCloseAngle,
    chCR, chLF, chNull
};

// <![CDATA[
static const XMLCh gStartCDATA[] =
{
    chOpenAngle, chBang, chOpenSquare,
    chLatin_C, chLatin_D, chLatin_A, chLatin_T, chLatin_A,
    chOpenSquare, chNull
};

// ]]>
static const XMLCh gEndCDATA[] =
{
    chCloseSquare, chCloseSquare, chCloseAngle, chNull
};

// <!--
static const XMLCh gStartComment[] =
{
    chOpenAngle, chBang, chDash, chDash, chNull
};

// -->
static const XMLCh gEndComment[] =
{
    chDash, chDash, chCloseAngle, chNull
};

// <!DOCTYPE followed by a space
static const XMLCh gStartDoctype[] =
{
    chOpenAngle, chBang,
    chLatin_D, chLatin_O, chLatin_C, chLatin_T, chLatin_Y, chLatin_P, chLatin_E,
    chSpace, chNull
};

// PUBLIC followed by a space and an opening double quote
static const XMLCh gPublic[] =
{
    chLatin_P, chLatin_U, chLatin_B, chLatin_L, chLatin_I, chLatin_C,
    chSpace, chDoubleQuote, chNull
};

// SYSTEM followed by a space and an opening double quote
static const XMLCh gSystem[] =
{
    chLatin_S, chLatin_Y, chLatin_S, chLatin_T, chLatin_E, chLatin_M,
    chSpace, chDoubleQuote, chNull
};

// <!ENTITY followed by a space
static const XMLCh gStartEntity[] =
{
    chOpenAngle, chBang,
    chLatin_E, chLatin_N, chLatin_T, chLatin_I, chLatin_T, chLatin_Y,
    chSpace, chNull
};

// NDATA followed by a space and an opening double quote; written in front
// of an entity's notation name.
static const XMLCh gNotation[] =
{
    chLatin_N, chLatin_D, chLatin_A, chLatin_T, chLatin_A,
    chSpace, chDoubleQuote, chNull
};
// ---------------------------------------------------------------------------
// Local classes
// ---------------------------------------------------------------------------
// Format target handed to the XMLFormatter: every chunk of bytes it receives
// is written to standard output.
class DOMPrintFormatTarget : public XMLFormatTarget
{
public:
    DOMPrintFormatTarget() {}
    ~DOMPrintFormatTarget() {}

    // -----------------------------------------------------------------------
    //  XMLFormatTarget interface
    // -----------------------------------------------------------------------

    // Writes toWrite to cout, treating it as a NUL-terminated character
    // string.
    void writeChars(const XMLByte* const toWrite)
    {
        // The bytes have to reach the stream as a char*. Per the original
        // author's note, without the explicit cast Solaris printed the
        // pointer value in hex instead of the text.
        char* const asText = (char *) toWrite;
        cout << asText;
    }

private:
    // -----------------------------------------------------------------------
    //  Copying is not supported: declared but never defined.
    // -----------------------------------------------------------------------
    DOMPrintFormatTarget(const DOMPrintFormatTarget& other);
    void operator=(const DOMPrintFormatTarget& rhs);
};
// ---------------------------------------------------------------------------
// Local data
//
// gXmlFile
// The path to the file to parse. Set via command line.
//
// gDoNamespaces
// Indicates whether namespace processing should be done.
//
// gDoExpand
// Indicates whether entity references need to be expanded or not.
// Defaults to false.
//
// gEncodingName
// The encoding we are to output in. If not set on the command line,
// then it defaults to the encoding of the input XML file.
//
// gValScheme
// Indicates what validation scheme to use. It defaults to 'auto', but
// can be set via the -v= command.
//
// ---------------------------------------------------------------------------
static char* gXmlFile = 0;
// Passed to DOMParser::setDoNamespaces() in main().
static bool gDoNamespaces = false;
// Passed to DOMParser::setExpandEntityReferences() in main().
static bool gDoExpand = false;
// Output encoding name. Set from -x=; when still 0 after parsing, main()
// fills it in from the document (falling back to "UTF-8").
static XMLCh* gEncodingName = 0;
// What the formatter does with characters it cannot represent in the output
// encoding. Set from -u= (fail | rep | ref); character references by default.
static XMLFormatter::UnRepFlags gUnRepFlags = XMLFormatter::UnRep_CharRef;
// Validation scheme handed to the parser. Set from -v= (never | auto | always).
static DOMParser::ValSchemes gValScheme = DOMParser::Val_Auto;
// Formatter that the output operators below write through. Created in main()
// once the output encoding is known; it is 0 until then.
static XMLFormatter* gFormatter = 0;
// ---------------------------------------------------------------------------
// Forward references
// ---------------------------------------------------------------------------
// Prints the command-line help text.
void usage();
// Streams a DOMString after transcoding it to char* form.
ostream& operator<<(ostream& target, const DOMString& toWrite);
// Writes a DOM node, and recursively its children, as XML through gFormatter.
ostream& operator<<(ostream& target, DOM_Node& toWrite);
// Sends a DOMString through an XMLFormatter.
XMLFormatter& operator<< (XMLFormatter& strm, const DOMString& s);
// ---------------------------------------------------------------------------
//
//  usage()
//
//  Writes the command-line synopsis, the option list and the names of the
//  intrinsically supported encodings to standard output.
//
// ---------------------------------------------------------------------------
void usage()
{
    // The adjacent literals are joined by the compiler into one string.
    const char* const helpText =
        "\nUsage: DOMPrint [options] file\n\n"
        "This program invokes the Xerces-C DOM parser and builds the DOM\n"
        "tree. It then traverses the DOM tree and prints the contents \n"
        "of the tree. Options are NOT case sensitive.\n\n"
        "Options:\n"
        " -e Expand entity references. Default is no expansion.\n"
        " -u=xxx Handle unrepresentable chars [fail | rep | ref*]\n"
        " -v=xxx Validation scheme [always | never | auto*]\n"
        " -n Enable namespace processing. Default is off.\n"
        " -x=XXX Use a particular encoding for output. Default is\n"
        " the same encoding as the input XML file. UTF-8 if\n"
        " input XML file has not XML declaration.\n"
        " -? Show this help (must be the only parameter)\n\n"
        " * = Default if not provided explicitly\n\n"
        "The parser has intrinsic support for the following encodings:\n"
        " UTF-8, USASCII, ISO8859-1, UTF-16[BL]E, UCS-4[BL]E,\n"
        " WINDOWS-1252, IBM1140, IBM037\n";

    cout << helpText << endl;
}
// ---------------------------------------------------------------------------
//
// main
//
// Initializes the Xerces-C platform layer, reads the command-line options
// into the file-level settings (gValScheme, gEncodingName, gUnRepFlags),
// configures a DOMParser from them and, when no parse error was flagged,
// writes the resulting document to standard output through gFormatter.
//
// Return values visible in this listing: 1 when initialization fails or no
// arguments are given, 2 for the "-?" help request or an unknown -v= / -u=
// value. retval is set to 3 when creating the formatter or writing the
// document throws an XMLException.
//
// NOTE(review): this listing looks incomplete -- braces do not balance,
// errReporter and retval are used without a visible declaration, the try
// block that should run the parse is empty, and stray non-code text
// ("Unknown (aruna1)" / "committed", runs of bare numbers) is interleaved
// with the code. Confirm against the repository copy before changing it.
//
// ---------------------------------------------------------------------------
int main(int argC, char* argV[])
{
// Initialize the Xerces-C platform layer; nothing else may be used before this.
try
{
XMLPlatformUtils::Initialize();
}
catch(const XMLException& toCatch)
{
cerr << "Error during Xerces-c Initialization.\n"
<< DOMString(toCatch.getMessage()) << endl;
return 1;
}
// Check command line and extract arguments.
if (argC < 2)
{
usage();
XMLPlatformUtils::Terminate();
return 1;
}
// Watch for special case help request
if (!strcmp(argV[1], "-?"))
XMLPlatformUtils::Terminate();
return 2;
}
// Walk the leading '-' options. parmInd is left at the index of the first
// argument that is not an option.
int parmInd;
for (parmInd = 1; parmInd < argC; parmInd++)
{
// Break out on first parm not starting with a dash
if (argV[parmInd][0] != '-')
break;
// -v=<never|auto|always>: choose the validation scheme.
if (!strncmp(argV[parmInd], "-v=", 3)
|| !strncmp(argV[parmInd], "-V=", 3))
const char* const parm = &argV[parmInd][3];
if (!strcmp(parm, "never"))
gValScheme = DOMParser::Val_Never;
else if (!strcmp(parm, "auto"))
gValScheme = DOMParser::Val_Auto;
else if (!strcmp(parm, "always"))
gValScheme = DOMParser::Val_Always;
else
{
cerr << "Unknown -v= value: " << parm << endl;
return 2;
}
}
else if (!strcmp(argV[parmInd], "-n")
|| !strcmp(argV[parmInd], "-N"))
{
Unknown (aruna1)
committed
}
else if (!strcmp(argV[parmInd], "-e")
|| !strcmp(argV[parmInd], "-E"))
{
else if (!strncmp(argV[parmInd], "-x=", 3)
|| !strncmp(argV[parmInd], "-X=", 3))
// Transcode the requested output encoding name (the text after "-x=").
gEncodingName = XMLString::transcode( &(argV[parmInd][3]) );
// -u=<fail|rep|ref>: choose how unrepresentable characters are handled.
else if (!strncmp(argV[parmInd], "-u=", 3)
|| !strncmp(argV[parmInd], "-U=", 3))
const char* const parm = &argV[parmInd][3];
if (!strcmp(parm, "fail"))
gUnRepFlags = XMLFormatter::UnRep_Fail;
else if (!strcmp(parm, "rep"))
gUnRepFlags = XMLFormatter::UnRep_Replace;
else if (!strcmp(parm, "ref"))
gUnRepFlags = XMLFormatter::UnRep_CharRef;
else
{
cerr << "Unknown -u= value: " << parm << endl;
return 2;
}
// else if (!strcmp(argV[parmInd], "-NoEscape"))
// {
// gDoEscapes = false;
// }
cerr << "Unknown option '" << argV[parmInd]
<< "', ignoring it.\n" << endl;
}
}
//
// And now we have to have only one parameter left and it must be
// the file name.
//
if (parmInd + 1 != argC)
{
usage();
XMLPlatformUtils::Terminate();
// Create our parser, then attach an error handler to the parser.
// The parser will call back to methods of the ErrorHandler if it
// discovers errors during the course of parsing the XML document.
//
DOMParser *parser = new DOMParser;
parser->setValidationScheme(gValScheme);
parser->setDoNamespaces(gDoNamespaces);
parser->setErrorHandler(errReporter);
parser->setExpandEntityReferences(gDoExpand);
parser->setToCreateXMLDeclTypeNode(true);
//
// Parse the XML file, catching any XML exceptions that might propagate
// out of it.
//
bool errorsOccured = false;
try
{
}
catch (const XMLException& e)
{
cerr << "An error occured during parsing\n Message: "
<< DOMString(e.getMessage()) << endl;
catch (...)
{
cerr << "An error occured during parsing\n " << endl;
errorsOccured = true;
}
// If the parse was successful, output the document data from the DOM tree
if (!errorsOccured)
{
DOM_Node doc = parser->getDocument();
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
DOMPrintFormatTarget* formatTarget = new DOMPrintFormatTarget();
// No -x= was given: default to "UTF-8", or to the encoding reported by the
// document's first child when that child is an XML declaration node with a
// non-empty encoding.
if (gEncodingName == 0)
{
DOMString encNameStr("UTF-8");
DOM_Node aNode = doc.getFirstChild();
if (aNode.getNodeType() == DOM_Node::XML_DECL_NODE)
{
DOMString aStr = ((DOM_XMLDecl &)aNode).getEncoding();
if (aStr != "")
{
encNameStr = aStr;
}
}
// Copy the name into a freshly allocated, explicitly terminated buffer.
unsigned int lent = encNameStr.length();
gEncodingName = new XMLCh[lent + 1];
XMLString::copyNString(gEncodingName, encNameStr.rawBuffer(), lent);
gEncodingName[lent] = 0;
}
// Build the formatter for the chosen encoding and unrepresentable-character
// policy, then stream the document node; that invokes the DOM_Node output
// operator, which writes through gFormatter.
try
{
gFormatter = new XMLFormatter(gEncodingName, formatTarget,
XMLFormatter::NoEscapes, gUnRepFlags);
cout << doc << endl;
}
catch (XMLException& e)
{
cerr << "An error occurred during creation of output transcoder. Msg is:"
<< endl
<< DOMString(e.getMessage()) << endl;
retval = 3;
}
// Clean up the error handler. The parser does not adopt handlers
// since they could be many objects or one object installed for multiple
// handlers.
//
delete errReporter;
//
// Delete the parser itself. Must be done prior to calling Terminate, below.
//
delete parser;
// And call the termination method
XMLPlatformUtils::Terminate();
// DomMemDebug().print();
// The DOM document and its contents are reference counted, and need
// no explicit deletion.
}
// ---------------------------------------------------------------------------
// ostream << DOM_Node
//
// Stream out a DOM node, and, recursively, all of its children. This
// function is the heart of writing a DOM tree out as XML source. Give it
// a document node and it will do the whole thing.
//
// The markup itself is written through the file-level gFormatter, which
// must have been created before this operator is called; target is only
// used for the recursive calls on child nodes and for line ends.
//
// NOTE(review): this listing looks incomplete -- the switch header and most
// case labels are not visible, braces do not balance, stray non-code text
// ("Unknown (aruna1)" / "committed") is interleaved with the code, and no
// return statement is visible. Confirm against the repository copy before
// changing it.
// ---------------------------------------------------------------------------
ostream& operator<<(ostream& target, DOM_Node& toWrite)
{
// Get the name and value out for convenience
DOMString nodeName = toWrite.getNodeName();
DOMString nodeValue = toWrite.getNodeValue();
unsigned long lent = nodeValue.length();
// Write the node value with character escaping applied.
gFormatter->formatBuf(nodeValue.rawBuffer(),
lent, XMLFormatter::CharEscapes);
// Processing instruction: "<?" name, then a space and the value when the
// value is non-empty, then "?>".
*gFormatter << XMLFormatter::NoEscapes << gStartPI << nodeName;
if (lent > 0)
{
*gFormatter << chSpace << nodeValue;
}
*gFormatter << XMLFormatter::NoEscapes << gEndPI;
// Stream every child in order, each followed by a line end.
DOM_Node child = toWrite.getFirstChild();
while( child != 0)
{
target << child << endl;
child = child.getNextSibling();
}
break;
}
// The name has to be representable without any escapes
*gFormatter << XMLFormatter::NoEscapes
<< chOpenAngle << nodeName;
// Output the element start tag.
// Output any attributes on this element
DOM_NamedNodeMap attributes = toWrite.getAttributes();
int attrCount = attributes.getLength();
for (int i = 0; i < attrCount; i++)
{
DOM_Node attribute = attributes.item(i);
//
// Again the name has to be completely representable. But the
// attribute can have refs and requires the attribute style
// escaping.
//
*gFormatter << XMLFormatter::NoEscapes
<< chSpace << attribute.getNodeName()
<< chEqual << chDoubleQuote
<< XMLFormatter::AttrEscapes
<< attribute.getNodeValue()
<< XMLFormatter::NoEscapes
<< chDoubleQuote;
}
//
// Test for the presence of children, which includes both
// text content and nested elements.
//
DOM_Node child = toWrite.getFirstChild();
if (child != 0)
{
// There are children. Close start-tag, and output children.
// No escapes are legal here
*gFormatter << XMLFormatter::NoEscapes << chCloseAngle;
while( child != 0)
{
target << child;
child = child.getNextSibling();
}
//
// Write the matching end tag: "</" name ">".
*gFormatter << XMLFormatter::NoEscapes << gEndElement
<< nodeName << chCloseAngle;
// There were no children. Output the short form close of
// the element start tag, making it an empty-element tag.
*gFormatter << XMLFormatter::NoEscapes << chForwardSlash << chCloseAngle;
case DOM_Node::ENTITY_REFERENCE_NODE:
{
DOM_Node child;
for (child = toWrite.getFirstChild();
child != 0;
child = child.getNextSibling())
{
// The node value wrapped in the CDATA section delimiters.
*gFormatter << XMLFormatter::NoEscapes << gStartCDATA
<< nodeValue << gEndCDATA;
// The node value wrapped in the comment delimiters.
*gFormatter << XMLFormatter::NoEscapes << gStartComment
<< nodeValue << gEndComment;
Unknown (aruna1)
committed
// Document type: the doctype opener and name, then the public and/or system
// identifiers when present, then the internal subset in square brackets
// when present, then the closing angle bracket.
case DOM_Node::DOCUMENT_TYPE_NODE:
{
DOM_DocumentType doctype = (DOM_DocumentType &)toWrite;;
*gFormatter << XMLFormatter::NoEscapes << gStartDoctype
<< nodeName;
DOMString id = doctype.getPublicId();
if (id != 0)
*gFormatter << XMLFormatter::NoEscapes << chSpace << gPublic
<< id << chDoubleQuote;
id = doctype.getSystemId();
if (id != 0)
{
*gFormatter << XMLFormatter::NoEscapes << chSpace
<< chDoubleQuote << id << chDoubleQuote;
}
}
else
id = doctype.getSystemId();
if (id != 0)
{
*gFormatter << XMLFormatter::NoEscapes << chSpace << gSystem
<< id << chDoubleQuote;
}
}
id = doctype.getInternalSubset();
if (id !=0)
*gFormatter << XMLFormatter::NoEscapes << chOpenSquare
<< id << chCloseSquare;
*gFormatter << XMLFormatter::NoEscapes << chCloseAngle;
Unknown (aruna1)
committed
break;
}
Unknown (aruna1)
committed
{
// Entity declaration: the entity opener and name, then whichever of the
// public id, system id and notation name are present, then the closing
// angle bracket and CR LF.
*gFormatter << XMLFormatter::NoEscapes << gStartEntity
<< nodeName;
DOMString id = ((DOM_Entity &)toWrite).getPublicId();
if (id != 0)
*gFormatter << XMLFormatter::NoEscapes << gPublic
<< id << chDoubleQuote;
id = ((DOM_Entity &)toWrite).getSystemId();
if (id != 0)
*gFormatter << XMLFormatter::NoEscapes << gSystem
<< id << chDoubleQuote;
id = ((DOM_Entity &)toWrite).getNotationName();
if (id != 0)
*gFormatter << XMLFormatter::NoEscapes << gNotation
<< id << chDoubleQuote;
*gFormatter << XMLFormatter::NoEscapes << chCloseAngle << chCR << chLF;
Unknown (aruna1)
committed
break;
}
// XML declaration: the version comes from the node, but the encoding written
// is gEncodingName (the output encoding), not the node's own encoding. The
// standalone part is written only when the node has one.
case DOM_Node::XML_DECL_NODE:
{
DOMString str;
*gFormatter << gXMLDecl1 << ((DOM_XMLDecl &)toWrite).getVersion();
*gFormatter << gXMLDecl2 << gEncodingName;
str = ((DOM_XMLDecl &)toWrite).getStandalone();
if (str != 0)
*gFormatter << gXMLDecl3 << str;
*gFormatter << gXMLDecl4;
break;
}
// Any other node type is reported on cerr.
default:
cerr << "Unrecognized node type = "
<< (long)toWrite.getNodeType() << endl;
}
}
// ---------------------------------------------------------------------------
// ostream << DOMString
// Stream out a DOM string. Doing this requires that we first transcode
// to char * form in the default code page for the system
//
// NOTE(review): the function's braces are not visible in this listing, and
// neither is any release of the buffer returned by transcode() or a return
// of target. Presumably those lines were lost -- confirm against the
// repository copy.
// ---------------------------------------------------------------------------
ostream& operator<< (ostream& target, const DOMString& s)
// Transcode to a char* buffer and write that to the stream.
char *p = s.transcode();
target << p;
XMLFormatter& operator<< (XMLFormatter& strm, const DOMString& s)
{
unsigned int lent = s.length();
if (lent <= 0)
return strm;