From 7a89380e42ecc451b62b8fdfa4322afb1847f55d Mon Sep 17 00:00:00 2001 From: "Unknown (roddey)" <dev-null@apache.org> Date: Fri, 17 Mar 2000 23:58:56 +0000 Subject: [PATCH] New utility for munging ICU UCM files and spitting out tables for our intrinsic encoders. git-svn-id: https://svn.apache.org/repos/asf/xerces/c/trunk@171961 13f79535-47bb-0310-9956-ffa450edef68 --- .../Unsupported/IntVC6/ICUData/ICUData.dsp | 96 +++ Projects/Win32/Unsupported/IntVC6/XML4C3.dsw | 12 + tools/ICUData/ICUData.cpp | 581 ++++++++++++++++++ 3 files changed, 689 insertions(+) create mode 100644 Projects/Win32/Unsupported/IntVC6/ICUData/ICUData.dsp create mode 100644 tools/ICUData/ICUData.cpp diff --git a/Projects/Win32/Unsupported/IntVC6/ICUData/ICUData.dsp b/Projects/Win32/Unsupported/IntVC6/ICUData/ICUData.dsp new file mode 100644 index 000000000..a262f9666 --- /dev/null +++ b/Projects/Win32/Unsupported/IntVC6/ICUData/ICUData.dsp @@ -0,0 +1,96 @@ +# Microsoft Developer Studio Project File - Name="ICUData" - Package Owner=<4> +# Microsoft Developer Studio Generated Build File, Format Version 6.00 +# ** DO NOT EDIT ** + +# TARGTYPE "Win32 (x86) Console Application" 0x0103 + +CFG=ICUData - Win32 Debug +!MESSAGE This is not a valid makefile. To build this project using NMAKE, +!MESSAGE use the Export Makefile command and run +!MESSAGE +!MESSAGE NMAKE /f "ICUData.mak". +!MESSAGE +!MESSAGE You can specify a configuration when running NMAKE +!MESSAGE by defining the macro CFG on the command line. 
For example: +!MESSAGE +!MESSAGE NMAKE /f "ICUData.mak" CFG="ICUData - Win32 Debug" +!MESSAGE +!MESSAGE Possible choices for configuration are: +!MESSAGE +!MESSAGE "ICUData - Win32 Release" (based on "Win32 (x86) Console Application") +!MESSAGE "ICUData - Win32 Debug" (based on "Win32 (x86) Console Application") +!MESSAGE + +# Begin Project +# PROP AllowPerConfigDependencies 0 +# PROP Scc_ProjName "" +# PROP Scc_LocalPath "" +CPP=cl.exe +RSC=rc.exe + +!IF "$(CFG)" == "ICUData - Win32 Release" + +# PROP BASE Use_MFC 0 +# PROP BASE Use_Debug_Libraries 0 +# PROP BASE Output_Dir "Release" +# PROP BASE Intermediate_Dir "Release" +# PROP BASE Target_Dir "" +# PROP Use_MFC 0 +# PROP Use_Debug_Libraries 0 +# PROP Output_Dir "..\..\..\..\..\Build\Win32\VC6\Release\" +# PROP Intermediate_Dir "..\..\..\..\..\Build\Win32\VC6\Release\Obj" +# PROP Ignore_Export_Lib 0 +# PROP Target_Dir "" +# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c +# ADD CPP /nologo /G6 /MD /Ze /W3 /GX /O2 /Ob2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /FD /c +# SUBTRACT CPP /YX +# ADD BASE RSC /l 0x409 /d "NDEBUG" +# ADD RSC /l 0x409 /d "NDEBUG" +BSC32=bscmake.exe +# ADD BASE BSC32 /nologo +# ADD BSC32 /nologo +LINK32=link.exe +# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 +# ADD LINK32 kernel32.lib user32.lib /nologo /version:3.0 /subsystem:console /machine:IX86 /machine:I386" + +!ELSEIF "$(CFG)" == "ICUData - Win32 Debug" + +# PROP BASE Use_MFC 0 +# PROP BASE Use_Debug_Libraries 1 +# PROP BASE Output_Dir "Debug" +# PROP BASE Intermediate_Dir "Debug" +# PROP BASE Target_Dir "" +# PROP Use_MFC 0 +# PROP Use_Debug_Libraries 1 +# PROP Output_Dir 
"..\..\..\..\..\Build\Win32\VC6\Debug\" +# PROP Intermediate_Dir "..\..\..\..\..\Build\Win32\VC6\Debug\Obj" +# PROP Ignore_Export_Lib 0 +# PROP Target_Dir "" +# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c +# ADD CPP /nologo /G6 /MDd /Ze /W3 /GX /Zi /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /FD /GZ /c +# SUBTRACT CPP /YX +# ADD BASE RSC /l 0x409 /d "_DEBUG" +# ADD RSC /l 0x409 /d "_DEBUG" +BSC32=bscmake.exe +# ADD BASE BSC32 /nologo +# ADD BSC32 /nologo +LINK32=link.exe +# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept +# ADD LINK32 kernel32.lib user32.lib /nologo /version:3.0 /subsystem:console /debug /machine:I386 /pdbtype:sept" + +!ENDIF + +# Begin Target + +# Name "ICUData - Win32 Release" +# Name "ICUData - Win32 Debug" +# Begin Group "Source Files" + +# PROP Default_Filter "cpp" +# Begin Source File + +SOURCE=..\..\..\..\..\tools\ICUData\ICUData.cpp +# End Source File +# End Group +# End Target +# End Project diff --git a/Projects/Win32/Unsupported/IntVC6/XML4C3.dsw b/Projects/Win32/Unsupported/IntVC6/XML4C3.dsw index 6ca338c1d..aec35c1a7 100644 --- a/Projects/Win32/Unsupported/IntVC6/XML4C3.dsw +++ b/Projects/Win32/Unsupported/IntVC6/XML4C3.dsw @@ -99,6 +99,18 @@ Package=<4> ############################################################################### +Project: "ICUData"=".\ICUData\ICUData.dsp" - Package Owner=<4> + +Package=<5> +{{{ +}}} + +Package=<4> +{{{ +}}} + +############################################################################### + Project: "MemParse"=".\MemParse\MemParse.dsp" - Package Owner=<4> Package=<5> diff --git a/tools/ICUData/ICUData.cpp b/tools/ICUData/ICUData.cpp new 
file mode 100644 index 000000000..ba3af8b97 --- /dev/null +++ b/tools/ICUData/ICUData.cpp @@ -0,0 +1,581 @@ +/* + * The Apache Software License, Version 1.1 + * + * Copyright (c) 1999-2000 The Apache Software Foundation. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. The end-user documentation included with the redistribution, + * if any, must include the following acknowledgment: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." + * Alternately, this acknowledgment may appear in the software itself, + * if and wherever such third-party acknowledgments normally appear. + * + * 4. The names "Xerces" and "Apache Software Foundation" must + * not be used to endorse or promote products derived from this + * software without prior written permission. For written + * permission, please contact apache\@apache.org. + * + * 5. Products derived from this software may not be called "Apache", + * nor may "Apache" appear in their name, without prior written + * permission of the Apache Software Foundation. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation, and was + * originally based on software copyright (c) 1999, International + * Business Machines, Inc., http://www.ibm.com . For more information + * on the Apache Software Foundation, please see + * <http://www.apache.org/>. + */ + + +/* + * $Log$ + * Revision 1.1 2000/03/17 23:58:00 roddey + * New utility for munging ICU UCM files and spitting out tables for + * our intrinsic encoders. + * + */ + +// --------------------------------------------------------------------------- +// This program is designed to parse a standard ICU .UCM file and spit out +// a C++ code fragment that represents the tables required by the intrinsic +// XML parser transcoders. +// +// The file format is pretty simple and this program is not intended to be +// industrial strength by any means. Its use by anyone but the author is +// at the user's own risk. +// +// The code looks for the min/max bytes per character to know what kind of +// table to spit out, but for now only handles single char sets. 
// ---------------------------------------------------------------------------


// ---------------------------------------------------------------------------
//  Includes
//
//  NOTE: The pre-standard <iostream.h> header is not shipped by current
//  compilers, so the standard <iostream> header is used and the two names
//  this file needs are pulled in explicitly.
// ---------------------------------------------------------------------------
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iostream>

using std::cout;
using std::endl;


// ---------------------------------------------------------------------------
//  Const data
//
//  gMaxInRecs
//      The capacity of the main translation table. loadTable() refuses to
//      load more entries than this.
// ---------------------------------------------------------------------------
static const unsigned int gMaxInRecs = 1024;


// ---------------------------------------------------------------------------
//  Local data types
//
//  XlatRec
//      One mapping read from the source file: a Unicode code point and the
//      single byte code page value it maps to.
// ---------------------------------------------------------------------------
struct XlatRec
{
    unsigned short  uniVal;
    unsigned char   cpVal;
};


// ---------------------------------------------------------------------------
//  Local data
//
//  gInFile
//  gOutFile
//      These are the file streams for the input UCM file and the output file
//      that we write the C++ code to.
//
//  fLineNum
//      Used to track the current line number in the source file, for error
//      reporting.
//
//  gMainTable
//  gMainTableSz
//      This is the table that is filled in from the original source document.
//      We don't know how big it will be, but it's not likely to be much more
//      than 300 entries or so (256 output code points with some multiply
//      mapped Unicode code points.) So we make it extra large and watch for
//      possible overflow.
//
//      The size value is bumped up as we load entries into it during the
//      parse of the file.
//
//  gMaxChar
//  gMinChar
//      The min/max bytes that are used to represent a character. These are
//      read from the header of the input file.
//
//  gRepChar
//      The replacement character to be used. This is read from the header of
//      the input file.
// ---------------------------------------------------------------------------
static FILE*            gInFile;
static FILE*            gOutFile;
static unsigned int     fLineNum;
static XlatRec          gMainTable[gMaxInRecs];
static unsigned int     gMainTableSz = 0;
static unsigned int     gMaxChar;
static unsigned int     gMinChar;
static unsigned char    gRepChar = 1;


// ---------------------------------------------------------------------------
//  Local functions
// ---------------------------------------------------------------------------

//
//  Reads the next line from gInFile that still has content after comments
//  (everything from a # to the end of the line) and trailing whitespace have
//  been removed. Lines that end up empty are skipped, so the returned length
//  is never zero.
//
//  toFill      The buffer to fill. It must hold at least maxChars chars.
//  maxChars    The size of toFill, including the terminating null.
//  eofOk       If true, the end of input is reported by returning ~0U. If
//              false, hitting the end of input is a fatal error.
//
//  Returns the number of chars left in toFill, or ~0U at the end of input
//  when eofOk is set. Read errors terminate the program.
//
static unsigned int getLine(        char* const     toFill
                            , const unsigned int    maxChars
                            , const bool            eofOk = false)
{
    while (true)
    {
        if (!fgets(toFill, (int)maxChars, gInFile))
        {
            if (!feof(gInFile))
            {
                cout << "Error processing input at line: " << fLineNum << endl;
                exit(1);
            }

            if (eofOk)
                return ~0U;

            //
            //  The buffer contents are stale at this point, so we cannot
            //  fall through and pretend that we got a line.
            //
            cout << "Unexpected end of input at line: " << fLineNum << endl;
            exit(1);
        }
        fLineNum++;

        //
        //  Cut the line at the first # char. This handles both whole line
        //  comments and trailing comments.
        //
        char* const hashPtr = strchr(toFill, '#');
        if (hashPtr)
            *hashPtr = 0;

        //
        //  Strip trailing whitespace, which includes the new line. We work
        //  with a length so that we never step in front of the buffer when
        //  the line is empty or all whitespace.
        //
        size_t curLen = strlen(toFill);
        while (curLen && isspace((unsigned char)toFill[curLen - 1]))
            curLen--;
        toFill[curLen] = 0;

        // If anything is left, then it's a line the caller wants to see
        if (curLen)
            return (unsigned int)curLen;
    }
}


//
//  Converts the text of a header value into a number. Leading whitespace is
//  skipped. If the value starts with \, it must be a hex value in the form
//  \xXX. Otherwise it is taken as a decimal value. Any failure to convert
//  the whole string is reported and terminates the program.
//
//  srcStr      The null terminated text to convert.
//
//  Returns the converted value.
//
static unsigned int extractVal(char* const srcStr)
{
    char* srcPtr = srcStr;

    // Run forward to the first non-space
    while (isspace((unsigned char)*srcPtr))
        srcPtr++;

    if (!*srcPtr)
    {
        cout << "Invalid numeric value on line: " << fLineNum << endl;
        exit(1);
    }

    int radix = 10;
    if (*srcPtr == '\\')
    {
        //
        //  Make sure there really is an x to skip, so that we don't step
        //  over the terminating null.
        //
        if ((srcPtr[1] != 'x') && (srcPtr[1] != 'X'))
        {
            cout << "Invalid numeric value on line: " << fLineNum << endl;
            exit(1);
        }

        // Skip the \x and interpret as a hex value
        srcPtr += 2;
        radix = 16;
    }

    //
    //  We should have translated at least one digit, and up to the end of
    //  the string.
    //
    char* endPtr;
    const unsigned long rawVal = strtoul(srcPtr, &endPtr, radix);
    if ((endPtr == srcPtr) || *endPtr)
    {
        cout << "Invalid numeric value on line: " << fLineNum << endl;
        exit(1);
    }

    return (unsigned int)rawVal;
}


//
//  Parses the input file. The header values we care about are stored in
//  gMaxChar, gMinChar and gRepChar, and each entry of the character map is
//  appended to gMainTable. Any malformed input is reported and terminates
//  the program.
//
static void loadTable()
{
    //
    //  The buffer is exactly as big as the size that we tell getLine()
    //  about.
    //
    const unsigned int tmpBufSz = 2048;
    char tmpBuf[tmpBufSz];

    //
    //  Just loop, reading lines at a time, until we find the start of the
    //  character table. Along the way, we should see a few header values
    //  that we store away. Hitting the end of the file here is an error,
    //  which getLine() reports for us.
    //
    while (getLine(tmpBuf, tmpBufSz))
    {
        //
        //  Check for one of the special values we are interested in. If
        //  it's CHARMAP, then we fall out of this loop.
        //
        if (!strcmp(tmpBuf, "CHARMAP"))
            break;

        if (!strncmp(tmpBuf, "<mb_cur_max>", 12))
        {
            gMaxChar = extractVal(&tmpBuf[12]);
        }
        else if (!strncmp(tmpBuf, "<mb_cur_min>", 12))
        {
            gMinChar = extractVal(&tmpBuf[12]);
        }
        else if (!strncmp(tmpBuf, "<subchar>", 9))
        {
            gRepChar = (unsigned char)extractVal(&tmpBuf[9]);
        }
    }

    //
    //  Ok, now we just run till we hit the "END CHARMAP" line. Each entry
    //  will be in the form:
    //
    //      <UXXXX> \xXX
    //
    //  Where X is a hex digit.
    //
    char* endPtr;
    while (getLine(tmpBuf, tmpBufSz))
    {
        // Watch for the end of table
        if (!strcmp(tmpBuf, "END CHARMAP"))
            break;

        // The absolute minimum it could be is 12 chars
        if (strlen(tmpBuf) < 12)
        {
            cout << "Line " << fLineNum << " is too short to hold a valid entry"
                 << endl;
            exit(1);
        }

        // Make sure the first token meets the criteria
        if ((tmpBuf[0] != '<')
        ||  (tmpBuf[1] != 'U')
        ||  (tmpBuf[6] != '>'))
        {
            cout << "Line " << fLineNum << " has a badly formed Unicode value"
                 << endl;
            exit(1);
        }

        //
        //  Looks reasonable so let's try to convert it. We can play tricks
        //  with this buffer, so put a null over the > char.
        //
        tmpBuf[6] = 0;
        const unsigned long uniVal = strtoul(&tmpBuf[2], &endPtr, 16);
        if ((endPtr == &tmpBuf[2]) || *endPtr)
        {
            cout << "Invalid Unicode value on line " << fLineNum << endl;
            exit(1);
        }

        //
        //  Ok, let's search over to the second token. We have to find a \
        //  character followed by an x.
        //
        char* srcPtr = &tmpBuf[7];
        while (*srcPtr && (*srcPtr != '\\'))
            srcPtr++;

        // If we never found it, it's in error
        if (!*srcPtr)
        {
            cout << "Never found second token on line " << fLineNum << endl;
            exit(1);
        }

        if ((srcPtr[1] != 'x') && (srcPtr[1] != 'X'))
        {
            cout << "Invalid code page value on line " << fLineNum << endl;
            exit(1);
        }

        // Try to translate it
        srcPtr += 2;
        const unsigned long cpVal = strtoul(srcPtr, &endPtr, 16);
        if ((endPtr == srcPtr) || *endPtr)
        {
            cout << "Invalid code page value on line " << fLineNum << endl;
            exit(1);
        }

        // Make sure that the values are within range
        if (uniVal > 0xFFFF)
        {
            cout << "Unicode value is too big on line " << fLineNum << endl;
            exit(1);
        }

        if (cpVal > 0xFF)
        {
            cout << "Code page value is too big on line " << fLineNum << endl;
            exit(1);
        }

        // Don't let a huge input file run us off the end of the table
        if (gMainTableSz >= gMaxInRecs)
        {
            cout << "Too many table entries at line " << fLineNum
                 << ", the maximum is " << gMaxInRecs << endl;
            exit(1);
        }

        // Looks reasonable, so add a new entry to the global table
        gMainTable[gMainTableSz].uniVal = (unsigned short)uniVal;
        gMainTable[gMainTableSz].cpVal = (unsigned char)cpVal;
        gMainTableSz++;
    }
}


//
//  A qsort() comparator that orders XlatRec objects by their Unicode value.
//  This is the order needed for the 'to' table.
//
int compFuncTo(const void* p1, const void* p2)
{
    const XlatRec* rec1 = (const XlatRec*)p1;
    const XlatRec* rec2 = (const XlatRec*)p2;

    // Both values fit easily in an int, so the difference cannot overflow
    return (int)rec1->uniVal - (int)rec2->uniVal;
}


//
//  A qsort() comparator that orders XlatRec objects by their code page
//  value. This is the order needed for the 'from' table.
//
int compFuncFrom(const void* p1, const void* p2)
{
    const XlatRec* rec1 = (const XlatRec*)p1;
    const XlatRec* rec2 = (const XlatRec*)p2;

    //
    //  Since there can be multiple Unicode chars that map to a single
    //  code page char, we have to handle the situation where they are
    //  equal specially. If the code page vals are equal, then the one
    //  with the smaller Unicode code point is considered smaller.
    //
    if (rec1->cpVal == rec2->cpVal)
        return (int)rec1->uniVal - (int)rec2->uniVal;

    // Else use the code page value for sorting
    return (int)rec1->cpVal - (int)rec2->cpVal;
}


//
//  Writes the tables for a single byte encoding to gOutFile:
//
//      gFromTable  256 Unicode values, indexed by code page value. Code
//                  page values that have no mapping get 0xFFFF.
//      gToTable    The Unicode/code page pairs, sorted by Unicode value.
//      gToTableSz  The number of entries in gToTable.
//
//  gMainTable is re-sorted as a side effect. If the header did not describe
//  a single byte encoding, the program is terminated.
//
static void formatSBTables()
{
    // For now, only handle single byte char sets
    if ((gMinChar != 1) || (gMaxChar != 1))
    {
        cout << "formatSBTables can only handle single byte encodings"
             << endl;
        exit(1);
    }

    //
    //  First, we want to sort the table by the code page value field. This
    //  is the order required for the 'from' table to convert from the code
    //  page to the internal Unicode format.
    //
    qsort(gMainTable, gMainTableSz, sizeof(gMainTable[0]), compFuncFrom);

    //
    //  Now spit out the header for the table. This is the same for all
    //  of them, since they are static to the file and can just all have
    //  the same name.
    //
    fprintf
    (
        gOutFile
        , "static const XMLCh gFromTable[256] =\n{\n "
    );

    //
    //  Now put out one value for each of the 256 possible code page values.
    //  The sorted table is walked in step with the code page value. The
    //  table index only moves forward when the entry it points at has been
    //  consumed, so a gap in the code page values never costs us an entry.
    //
    //  Since we sorted them such that dups have the one with the smaller
    //  Unicode value in the lower index, we always hit the desired value
    //  first, and then skip over however many duplicates follow it.
    //
    const unsigned int slotCount = 256;
    unsigned int index = 0;
    unsigned int curValue;
    for (curValue = 0; curValue < slotCount; curValue++)
    {
        if (curValue)
        {
            if (!(curValue % 8))
                fprintf(gOutFile, "\n , ");
            else
                fprintf(gOutFile, ", ");
        }

        if ((index < gMainTableSz) && (gMainTable[index].cpVal == curValue))
        {
            fprintf(gOutFile, "0x%04X", (unsigned int)gMainTable[index].uniVal);

            // Move past this entry and any dups of it
            while ((index < gMainTableSz)
            &&     (gMainTable[index].cpVal == curValue))
            {
                index++;
            }
        }
        else
        {
            // Nothing maps from this code page value
            fprintf(gOutFile, "0xFFFF");
        }
    }

    // And print the trailer for this table
    fprintf(gOutFile, "\n};\n\n");


    //
    //  Now let's sort by the Unicode value field. This sort is used for
    //  the 'to' table. The Unicode value is found by binary search and
    //  used to map to the right output encoding value.
    //
    qsort(gMainTable, gMainTableSz, sizeof(gMainTable[0]), compFuncTo);

    // Output the table header for this one
    fprintf
    (
        gOutFile
        , "static const XMLTransService::TransRec gToTable[] =\n{\n "
    );

    for (index = 0; index < gMainTableSz; index++)
    {
        if (index)
        {
            if (!(index % 4))
                fprintf(gOutFile, "\n , ");
            else
                fprintf(gOutFile, ", ");
        }

        fprintf
        (
            gOutFile
            , "{ 0x%04X, 0x%02X }"
            , (unsigned int)gMainTable[index].uniVal
            , (unsigned int)gMainTable[index].cpVal
        );
    }

    // Print the trailer for this table
    fprintf(gOutFile, "\n};\n");

    // And print out the table size constant
    fprintf(gOutFile, "static const unsigned int gToTableSz = %u;\n", gMainTableSz);
}


//
//  Shows the command line syntax of the program.
//
static void showUsage()
{
    cout << "ICUData inputUCMfile outputfile\n" << endl;
}



// ---------------------------------------------------------------------------
//  The parameters are:
//
//  argV[1] = The source UCM file
//  argV[2] = The path to the output file
//
//  The return value is 0 if it worked, else 1.
// ---------------------------------------------------------------------------
int main(int argC, char** argV)
{
    // We have to have 3 parameters
    if (argC != 3)
    {
        showUsage();
        return 1;
    }

    // Try to open the first file for input
    gInFile = fopen(argV[1], "rt");
    if (!gInFile)
    {
        cout << "Could not find input file: " << argV[1] << endl;
        return 1;
    }

    // Try to open the second file for output (truncated)
    gOutFile = fopen(argV[2], "wt+");
    if (!gOutFile)
    {
        cout << "Could not create output file: " << argV[2] << endl;
        fclose(gInFile);
        return 1;
    }

    //
    //  This will parse the file and load the table. It will also look for
    //  a couple of key fields in the file header and store that data into
    //  globals.
    //
    loadTable();

    // If we didn't get any table entries, then give up
    if (!gMainTableSz)
    {
        cout << "No translation table entries were found in the file" << endl;
        fclose(gInFile);
        fclose(gOutFile);
        return 1;
    }

    //
    //  Ok, we got the data loaded. Now let's output the tables. This method
    //  spits out both tables to the output file, in a format ready to be
    //  incorporated directly into the source code.
    //
    formatSBTables();

    //
    //  Close our files. The output is buffered, so a failure to write it
    //  might not show up until we close the file.
    //
    fclose(gInFile);

    const bool writeFailed = (ferror(gOutFile) != 0);
    if ((fclose(gOutFile) != 0) || writeFailed)
    {
        cout << "Error writing output file: " << argV[2] << endl;
        return 1;
    }

    return 0;
}