Newer
Older
//
// upperCase the newEncoding first for better performance
//
XMLCh* inputEncoding = XMLString::replicate(newEncoding, fMemoryManager);
//
// Try to map the string to one of our standard encodings. If its not
// one of them, then it has to be one of the non-intrinsic encodings,
// in which case we have to delete our intrinsic encoder and create a
// new one.
//
XMLRecognizer::Encodings newBaseEncoding = XMLRecognizer::encodingForName
(
);
//
// If it does not come back as one of the auto-sensed encodings, then we
// have to possibly replace it and at least check a few things.
//
if (newBaseEncoding == XMLRecognizer::OtherEncoding)
{
//
// Check for non-endian specific UTF-16 or UCS-4. If so, and if we
// are already in one of the endian versions of those encodings,
// then just keep it and go on. Otherwise, its not valid.
//
if (!XMLString::compareString(inputEncoding, XMLUni::fgUTF16EncodingString)
|| !XMLString::compareString(inputEncoding, XMLUni::fgUTF16EncodingString2)
|| !XMLString::compareString(inputEncoding, XMLUni::fgUTF16EncodingString3)
|| !XMLString::compareString(inputEncoding, XMLUni::fgUTF16EncodingString4)
|| !XMLString::compareString(inputEncoding, XMLUni::fgUTF16EncodingString5))
fMemoryManager->deallocate(inputEncoding);
if ((fEncoding != XMLRecognizer::UTF_16L)
&& (fEncoding != XMLRecognizer::UTF_16B))
{
return false;
}
// Override with the original endian specific encoding
newBaseEncoding = fEncoding;
if (fEncoding == XMLRecognizer::UTF_16L) {
fMemoryManager->deallocate(fEncodingStr);
fEncodingStr = XMLString::replicate(XMLUni::fgUTF16LEncodingString, fMemoryManager);
fMemoryManager->deallocate(fEncodingStr);
fEncodingStr = XMLString::replicate(XMLUni::fgUTF16BEncodingString, fMemoryManager);
else if (!XMLString::compareString(inputEncoding, XMLUni::fgUCS4EncodingString)
|| !XMLString::compareString(inputEncoding, XMLUni::fgUCS4EncodingString2)
|| !XMLString::compareString(inputEncoding, XMLUni::fgUCS4EncodingString3))
fMemoryManager->deallocate(inputEncoding);
if ((fEncoding != XMLRecognizer::UCS_4L)
&& (fEncoding != XMLRecognizer::UCS_4B))
{
return false;
}
// Override with the original endian specific encoding
newBaseEncoding = fEncoding;
if (fEncoding == XMLRecognizer::UCS_4L) {
fMemoryManager->deallocate(fEncodingStr);
fEncodingStr = XMLString::replicate(XMLUni::fgUCS4LEncodingString, fMemoryManager);
fMemoryManager->deallocate(fEncodingStr);
fEncodingStr = XMLString::replicate(XMLUni::fgUCS4BEncodingString, fMemoryManager);
// None of those special cases, so just replicate the new name
fMemoryManager->deallocate(fEncodingStr);
fEncodingStr = inputEncoding;
XMLTransService::Codes failReason;
fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
(
fEncodingStr
, failReason
, kCharBufSize
, fMemoryManager
}
}
else
{
// Store the new encoding string since it is just an intrinsic
fMemoryManager->deallocate(fEncodingStr);
if (!fTranscoder) {
//
// Now we can create a transcoder using the recognized fEncoding. We
// might get back a transcoder for an intrinsically supported encoding,
// or we might get one from the underlying transcoding service.
//
XMLTransService::Codes failReason;
fTranscoder = XMLPlatformUtils::fgTransService->makeNewTranscoderFor
(
newBaseEncoding
, failReason
, kCharBufSize
, fMemoryManager
David Abram Cargill
committed
ThrowXMLwithMemMgr1(TranscodingException, XMLExcepts::Trans_CantCreateCvtrFor, fEncodingStr, fMemoryManager);
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
// Update the base encoding member with the new base encoding found
fEncoding = newBaseEncoding;
// Looks ok to us
return true;
}
// ---------------------------------------------------------------------------
// XMLReader: Private helper methods
// ---------------------------------------------------------------------------
//
// This is called when the encoding flag is set and just sets the fSwapped
// flag appropriately.
//
void XMLReader::checkForSwapped()
{
    //
    //  Work out, for the byte order this build was configured with, whether
    //  the current encoding uses the opposite byte order. Only the UTF-16
    //  and UCS-4 endian specific encodings can ever require swapping.
    //
#if defined(ENDIANMODE_LITTLE)
    const bool oppositeOrder = (fEncoding == XMLRecognizer::UTF_16B)
                            || (fEncoding == XMLRecognizer::UCS_4B);
#elif defined(ENDIANMODE_BIG)
    const bool oppositeOrder = (fEncoding == XMLRecognizer::UTF_16L)
                            || (fEncoding == XMLRecognizer::UCS_4L);
#else
    // Neither endian mode macro is defined, so never report swapped
    const bool oppositeOrder = false;
#endif

    fSwapped = oppositeOrder;
}
//
// This is called from the constructor when the encoding is not forced.
// We assume that the encoding has been auto-sensed at this point and that
// fSwapped is set correctly.
//
// In the case of UCS-4 and EBCDIC, we don't have to check for a decl.
// The fact that we got here, means that there is one, because that's the
// only way we can autosense those.
//
void XMLReader::doInitDecode()
{
switch(fEncoding)
{
case XMLRecognizer::UCS_4B :
case XMLRecognizer::UCS_4L :
{
// Remove bom if any
if (((fRawByteBuf[0] == 0x00) && (fRawByteBuf[1] == 0x00) && (fRawByteBuf[2] == 0xFE) && (fRawByteBuf[3] == 0xFF)) ||
((fRawByteBuf[0] == 0xFF) && (fRawByteBuf[1] == 0xFE) && (fRawByteBuf[2] == 0x00) && (fRawByteBuf[3] == 0x00)) )
{
for (unsigned int i = 0; i < fRawBytesAvail; i++)
fRawByteBuf[i] = fRawByteBuf[i+4];
fRawBytesAvail -=4;
}
// Look at the raw buffer as UCS4 chars
const UCS4Ch* asUCS = (const UCS4Ch*)fRawByteBuf;
while (fRawBufIndex < fRawBytesAvail)
{
// Get out the current 4 byte value and inc our raw buf index
UCS4Ch curVal = *asUCS++;
fRawBufIndex += sizeof(UCS4Ch);
// Swap if that is required for this machine
if (fSwapped)
curVal = BitOps::swapBytes(curVal);
// Make sure its at least semi legal. If not, undo and throw
if (curVal > 0xFFFF)
{
fCharsAvail = 0;
fRawBufIndex = 0;
fMemoryManager->deallocate(fPublicId);
fMemoryManager->deallocate(fEncodingStr);
ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
David Abram Cargill
committed
ThrowXMLwithMemMgr1
(
TranscodingException
, XMLExcepts::Reader_CouldNotDecodeFirstLine
, fSystemId
David Abram Cargill
committed
, fMemoryManager
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
);
}
// Convert the value to an XML char and store it
fCharSizeBuf[fCharsAvail] = 4;
fCharBuf[fCharsAvail++] = XMLCh(curVal);
// Break out on the > character
if (curVal == chCloseAngle)
break;
}
break;
}
case XMLRecognizer::UTF_8 :
{
// If there's a utf-8 BOM (0xEF 0xBB 0xBF), skip past it.
// Don't move to char buf - no one wants to see it.
// Note: this causes any encoding= declaration to override
// the BOM's attempt to say that the encoding is utf-8.
// Look at the raw buffer as short chars
const char* asChars = (const char*)fRawByteBuf;
if (fRawBytesAvail > XMLRecognizer::fgUTF8BOMLen &&
XMLString::compareNString( asChars
, XMLRecognizer::fgUTF8BOM
, XMLRecognizer::fgUTF8BOMLen) == 0)
{
fRawBufIndex += XMLRecognizer::fgUTF8BOMLen;
asChars += XMLRecognizer::fgUTF8BOMLen;
}
//
// First check that there are enough bytes to even see the
// decl indentifier. If not, get out now with no action since
// there is no decl.
//
if (fRawBytesAvail < XMLRecognizer::fgASCIIPreLen)
break;
// Check for the opening sequence. If not, then no decl
if (XMLString::compareNString( asChars
, XMLRecognizer::fgASCIIPre
, XMLRecognizer::fgASCIIPreLen))
{
break;
}
while (fRawBufIndex < fRawBytesAvail)
{
const char curCh = *asChars++;
fRawBufIndex++;
// Looks ok, so store it
fCharSizeBuf[fCharsAvail] = 1;
fCharBuf[fCharsAvail++] = XMLCh(curCh);
// Break out on a > character
if (curCh == chCloseAngle)
break;
//
// A char greater than 0x7F is not allowed in this case. If
// so, undo and throw.
//
if (curCh & 0x80)
{
fCharsAvail = 0;
fRawBufIndex = 0;
fMemoryManager->deallocate(fPublicId);
fMemoryManager->deallocate(fEncodingStr);
ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);
David Abram Cargill
committed
ThrowXMLwithMemMgr1
(
TranscodingException
, XMLExcepts::Reader_CouldNotDecodeFirstLine
, fSystemId
David Abram Cargill
committed
, fMemoryManager
);
}
}
break;
}
case XMLRecognizer::UTF_16B :
case XMLRecognizer::UTF_16L :
{
//
// If there is a decl here, we just truncate back the characters
// as we go. No surrogate creation would be allowed here in legal
// XML, so we consider it a transoding error if we find one.
//
if (fRawBytesAvail < 2)
break;
unsigned int postBOMIndex = 0;
const UTF16Ch* asUTF16 = (const UTF16Ch*)&fRawByteBuf[fRawBufIndex];
if ((*asUTF16 == chUnicodeMarker) || (*asUTF16 == chSwappedUnicodeMarker))
{
fRawBufIndex += sizeof(UTF16Ch);
asUTF16++;
postBOMIndex = fRawBufIndex;
}
// First check that there are enough raw bytes for there to even
// be a decl indentifier. If not, then nothing to do.
//
if (fRawBytesAvail - fRawBufIndex < XMLRecognizer::fgUTF16PreLen)
{
fRawBufIndex = postBOMIndex;
break;
}
//
// See we get a match on the prefix. If not, then reset and
// break out.
//
if (fEncoding == XMLRecognizer::UTF_16B)
{
if (memcmp(asUTF16, XMLRecognizer::fgUTF16BPre, XMLRecognizer::fgUTF16PreLen))
{
fRawBufIndex = postBOMIndex;
break;
}
}
else
{
if (memcmp(asUTF16, XMLRecognizer::fgUTF16LPre, XMLRecognizer::fgUTF16PreLen))
{
fRawBufIndex = postBOMIndex;
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
break;
}
}
while (fRawBufIndex < fRawBytesAvail)
{
// Get out the current 2 byte value
UTF16Ch curVal = *asUTF16++;
fRawBufIndex += sizeof(UTF16Ch);
// Swap if that is required for this machine
if (fSwapped)
curVal = BitOps::swapBytes(curVal);
//
// Store it and bump the target index, implicitly converting
// if UTF16Ch and XMLCh are not the same size.
//
fCharSizeBuf[fCharsAvail] = 2;
fCharBuf[fCharsAvail++] = curVal;
// Break out on a > char
if (curVal == chCloseAngle)
break;
}
break;
}
case XMLRecognizer::EBCDIC :
{
//
// We use special support in the intrinsic EBCDIC-US transcoder
// to go through one char at a time.
//
const XMLByte* srcPtr = fRawByteBuf;
while (1)
{
// Transcode one char from the source
const XMLCh chCur = XMLEBCDICTranscoder::xlatThisOne(*srcPtr++);
fRawBufIndex++;
//
// And put it into the character buffer. This stuff has to
// look like it was normally transcoded.
//
fCharSizeBuf[fCharsAvail] = 1;
fCharBuf[fCharsAvail++] = chCur;
// If its a > char, then break out
if (chCur == chCloseAngle)
break;
// Watch for using up all input and get out
if (fRawBufIndex == fRawBytesAvail)
break;
}
break;
}
default :
// It should never be anything else here
fMemoryManager->deallocate(fPublicId);
fMemoryManager->deallocate(fEncodingStr);
fMemoryManager->deallocate(fSystemId);
David Abram Cargill
committed
ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Reader_BadAutoEncoding, fMemoryManager);
break;
}
//
// Ok, by the time we get here, if its a legal XML file we have eaten
// the XML/TextDecl. So, if we are a PE and are being referenced from
// outside a literal, then we need to throw in an arbitrary space that
// is required by XML.
//
if ((fType == Type_PE) && (fRefFrom == RefFrom_NonLiteral))
fCharBuf[fCharsAvail++] = chSpace;
// Calculate fCharOfsBuf buffer using the elements from fCharBufSize
if (fCalculateSrcOfs)
{
fCharOfsBuf[0] = 0;
for (unsigned int index = 1; index < fCharsAvail; ++index) {
fCharOfsBuf[index] = fCharOfsBuf[index-1]+fCharSizeBuf[index-1];
}
}
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
}
//
// This method is called internally when we run out of bytes in the raw
// buffer. We just read as many bytes as we can into the raw buffer again
// and store the number of bytes we got.
//
void XMLReader::refreshRawBuffer()
{
    //
    //  Work out how many unconsumed bytes are still sitting at the tail of
    //  the raw buffer and slide them down to the front.
    //
    const unsigned int carried = fRawBytesAvail - fRawBufIndex;

    unsigned int dest = 0;
    while (dest < carried)
    {
        fRawByteBuf[dest] = fRawByteBuf[fRawBufIndex + dest];
        ++dest;
    }

    //
    //  Top the buffer up from the stream, reading in just after the carried
    //  over bytes and asking only for as much as still fits. The new count
    //  of available bytes is whatever we carried over plus whatever the
    //  stream handed back.
    //
    fRawBytesAvail = fStream->readBytes
    (
        fRawByteBuf + carried
        , kRawBufSize - carried
    ) + carried;

    //
    //  Any left over data now lives at the start of the buffer, so the
    //  read position always goes back to zero.
    //
    fRawBufIndex = 0;
}
//
// This method is called internally when we run out of characters in the
// transcoded character buffer. We transcode up to another maxChars chars
// from the raw byte buffer.
//
unsigned int
XMLReader::xcodeMoreChars(          XMLCh* const            bufToFill
                            ,       unsigned char* const    charSizes
                            , const unsigned int            maxChars)
{
    // Nothing at all in the raw buffer means there is nothing to transcode
    if (fRawBytesAvail == 0)
        return 0;

    //
    //  Top up the raw buffer when it is running low. The test is against a
    //  small threshold rather than zero, because a multi-byte encoding can
    //  leave the leading bytes of a partial character behind that cannot be
    //  used until the rest of that character has been read in.
    //
    const unsigned int unreadBytes = fRawBytesAvail - fRawBufIndex;
    if (unreadBytes < 100)
    {
        refreshRawBuffer();

        // The refresh came back with nothing at all, so we are done
        if (fRawBytesAvail == 0)
            return 0;
    }

    //
    //  Hand the unread part of the raw buffer to the transcoder. The count
    //  is recomputed here since a refresh will have changed both members.
    //
    const XMLByte* const rawStart = &fRawByteBuf[fRawBufIndex];
    const unsigned int   rawCount = fRawBytesAvail - fRawBufIndex;

    // Out parameter of the call below (presumably the raw bytes it consumed)
    unsigned int bytesEaten;
    const unsigned int producedChars = fTranscoder->transcodeFrom
    (
        rawStart
        , rawCount
        , bufToFill
        , maxChars
        , bytesEaten
        , charSizes
    );

    // Step the raw index past whatever the transcoder used up
    fRawBufIndex += bytesEaten;

    return producedChars;
}