diff --git a/src/xercesc/internal/DGXMLScanner.cpp b/src/xercesc/internal/DGXMLScanner.cpp index 8875f50d8d465b7d4cdf27d61a23a35c9016d846..ef293488606b178b3ca737f23717ba695227dac0 100644 --- a/src/xercesc/internal/DGXMLScanner.cpp +++ b/src/xercesc/internal/DGXMLScanner.cpp @@ -2326,23 +2326,13 @@ bool DGXMLScanner::scanAttValue( const XMLAttDef* const attDef bool firstNonWS = false; bool gotLeadingSurrogate = false; bool escaped; - bool charref_expanded = false; while (true) { try { while(true) { - // Get another char. Use second char if one is waiting - if (secondCh) - { - nextCh = secondCh; - secondCh = 0; - } - else - { - nextCh = fReaderMgr.getNextChar(); - } + nextCh = fReaderMgr.getNextChar(); if (!nextCh) ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF); @@ -2372,18 +2362,16 @@ bool DGXMLScanner::scanAttValue( const XMLAttDef* const attDef gotLeadingSurrogate = false; continue; } - charref_expanded = true; } - - // Deal with surrogate pairs - if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) + else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) { + // Deal with surrogate pairs // Its a leading surrogate. If we already got one, then // issue an error, else set leading flag to make sure that // we look for a trailing next time. if (gotLeadingSurrogate) emitError(XMLErrs::Expected2ndSurrogateChar); - else + else gotLeadingSurrogate = true; } else @@ -2407,22 +2395,17 @@ bool DGXMLScanner::scanAttValue( const XMLAttDef* const attDef // Its got to at least be a valid XML character if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) { - // if it was a character reference and is control char, then it's ok - if (!(charref_expanded && fReaderMgr.getCurrentReader()->isControlChar(nextCh))) - { - XMLCh tmpBuf[9]; - XMLString::binToText - ( - nextCh - , tmpBuf - , 8 - , 16 - ); - emitError(XMLErrs::InvalidCharacterInAttrValue, attrName, tmpBuf); - } + XMLCh tmpBuf[9]; + XMLString::binToText + ( + nextCh + , tmpBuf + , 8 + , 16 + ); + emitError(XMLErrs::InvalidCharacterInAttrValue, attrName, tmpBuf); } } - charref_expanded = false; gotLeadingSurrogate = false; } @@ -2498,6 +2481,9 @@ bool DGXMLScanner::scanAttValue( const XMLAttDef* const attDef // Else add it to the buffer toFill.append(nextCh); + + if (secondCh) + toFill.append(secondCh); } } catch(const EndOfEntityException&) @@ -2704,44 +2690,36 @@ void DGXMLScanner::scanCharData(XMLBuffer& toUse) bool escaped = false; bool gotLeadingSurrogate = false; bool notDone = true; - bool charref_expanded = false; while (notDone) { try { while (true) { - if (secondCh) + // Eat through as many plain content characters as possible without + // needing special handling. Moving most content characters here, + // in this one call, rather than running the overall loop once + // per content character, is a speed optimization. + if (curState == State_Waiting && !gotLeadingSurrogate) { - nextCh = secondCh; - secondCh = 0; + fReaderMgr.movePlainContentChars(toUse); } - else - { - // Eat through as many plain content characters as possible without - // needing special handling. Moving most content characters here, - // in this one call, rather than running the overall loop once - // per content character, is a speed optimization. - if (curState == State_Waiting && !gotLeadingSurrogate) - { - fReaderMgr.movePlainContentChars(toUse); - } - // Try to get another char from the source - // The code from here on down covers all contengencies, - if (!fReaderMgr.getNextCharIfNot(chOpenAngle, nextCh)) - { - // If we were waiting for a trailing surrogate, its an error - if (gotLeadingSurrogate) - emitError(XMLErrs::Expected2ndSurrogateChar); + // Try to get another char from the source + // The code from here on down covers all contengencies, + if (!fReaderMgr.getNextCharIfNot(chOpenAngle, nextCh)) + { + // If we were waiting for a trailing surrogate, its an error + if (gotLeadingSurrogate) + emitError(XMLErrs::Expected2ndSurrogateChar); - notDone = false; - break; - } + notDone = false; + break; } // Watch for a reference. Note that the escapement mechanism // is ignored in this content. + escaped = false; if (nextCh == chAmpersand) { sendCharData(toUse); @@ -2754,42 +2732,10 @@ void DGXMLScanner::scanCharData(XMLBuffer& toUse) gotLeadingSurrogate = false; continue; } - charref_expanded = true; } - else - { - escaped = false; - } - - // Keep the state machine up to date - if (!escaped) - { - if (nextCh == chCloseSquare) - { - if (curState == State_Waiting) - curState = State_GotOne; - else if (curState == State_GotOne) - curState = State_GotTwo; - } - else if (nextCh == chCloseAngle) - { - if (curState == State_GotTwo) - emitError(XMLErrs::BadSequenceInCharData); - curState = State_Waiting; - } - else - { - curState = State_Waiting; - } - } - else - { - curState = State_Waiting; - } - - // Deal with surrogate pairs - if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) + else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) { + // Deal with surrogate pairs // Its a leading surrogate. If we already got one, then // issue an error, else set leading flag to make sure that // we look for a trailing next time. @@ -2819,27 +2765,51 @@ void DGXMLScanner::scanCharData(XMLBuffer& toUse) // Make sure the returned char is a valid XML char if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) { - // if it was a character reference and is control char, then it's ok - if (!(charref_expanded && fReaderMgr.getCurrentReader()->isControlChar(nextCh))) - { - XMLCh tmpBuf[9]; - XMLString::binToText - ( - nextCh - , tmpBuf - , 8 - , 16 - ); - emitError(XMLErrs::InvalidCharacter, tmpBuf); - } + XMLCh tmpBuf[9]; + XMLString::binToText + ( + nextCh + , tmpBuf + , 8 + , 16 + ); + emitError(XMLErrs::InvalidCharacter, tmpBuf); } } - charref_expanded = false; gotLeadingSurrogate = false; } + // Keep the state machine up to date + if (!escaped) + { + if (nextCh == chCloseSquare) + { + if (curState == State_Waiting) + curState = State_GotOne; + else if (curState == State_GotOne) + curState = State_GotTwo; + } + else if (nextCh == chCloseAngle) + { + if (curState == State_GotTwo) + emitError(XMLErrs::BadSequenceInCharData); + curState = State_Waiting; + } + else + { + curState = State_Waiting; + } + } + else + { + curState = State_Waiting; + } + // Add this char to the buffer toUse.append(nextCh); + + if (secondCh) + toUse.append(secondCh); } } catch(const EndOfEntityException& toCatch) diff --git a/src/xercesc/internal/IGXMLScanner2.cpp b/src/xercesc/internal/IGXMLScanner2.cpp index 293ac5508f1b824977f3a955178ce67b55698668..45297c2450a554c6db7f106223e755c764acc795 100644 --- a/src/xercesc/internal/IGXMLScanner2.cpp +++ b/src/xercesc/internal/IGXMLScanner2.cpp @@ -1542,23 +1542,13 @@ bool IGXMLScanner::basicAttrValueScan(const XMLCh* const attrName, XMLBuffer& to XMLCh secondCh = 0; bool gotLeadingSurrogate = false; bool escaped; - bool charref_expanded = false; while (true) { try { while(true) { - // Get another char. Use second char if one is waiting - if (secondCh) - { - nextCh = secondCh; - secondCh = 0; - } - else - { - nextCh = fReaderMgr.getNextChar(); - } + nextCh = fReaderMgr.getNextChar(); if (!nextCh) ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF); @@ -1589,12 +1579,10 @@ bool IGXMLScanner::basicAttrValueScan(const XMLCh* const attrName, XMLBuffer& to gotLeadingSurrogate = false; continue; } - charref_expanded = true; } - - // Deal with surrogate pairs - if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) + else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) { + // Deal with surrogate pairs // Its a leading surrogate. If we already got one, then // issue an error, else set leading flag to make sure that // we look for a trailing next time. @@ -1626,23 +1614,17 @@ bool IGXMLScanner::basicAttrValueScan(const XMLCh* const attrName, XMLBuffer& to // Its got to at least be a valid XML character else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) { - // if it was a character reference and is control char, then it's ok - if (!(charref_expanded && fReaderMgr.getCurrentReader()->isControlChar(nextCh))) - { - - XMLCh tmpBuf[9]; - XMLString::binToText - ( - nextCh - , tmpBuf - , 8 - , 16 - ); - emitError(XMLErrs::InvalidCharacterInAttrValue, attrName, tmpBuf); - } + XMLCh tmpBuf[9]; + XMLString::binToText + ( + nextCh + , tmpBuf + , 8 + , 16 + ); + emitError(XMLErrs::InvalidCharacterInAttrValue, attrName, tmpBuf); } } - charref_expanded = false; gotLeadingSurrogate = false; } @@ -1655,6 +1637,9 @@ bool IGXMLScanner::basicAttrValueScan(const XMLCh* const attrName, XMLBuffer& to // Else add it to the buffer toFill.append(nextCh); + + if (secondCh) + toFill.append(secondCh); } } catch(const EndOfEntityException&) @@ -1705,23 +1690,13 @@ bool IGXMLScanner::scanAttValue( const XMLAttDef* const attDef bool firstNonWS = false; bool gotLeadingSurrogate = false; bool escaped; - bool charref_expanded = false; while (true) { try { while(true) { - // Get another char. Use second char if one is waiting - if (secondCh) - { - nextCh = secondCh; - secondCh = 0; - } - else - { - nextCh = fReaderMgr.getNextChar(); - } + nextCh = fReaderMgr.getNextChar(); if (!nextCh) ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF); @@ -1751,12 +1726,10 @@ bool IGXMLScanner::scanAttValue( const XMLAttDef* const attDef gotLeadingSurrogate = false; continue; } - charref_expanded = true; } - - // Deal with surrogate pairs - if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) + else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) { + // Deal with surrogate pairs // Its a leading surrogate. If we already got one, then // issue an error, else set leading flag to make sure that // we look for a trailing next time. @@ -1786,22 +1759,17 @@ bool IGXMLScanner::scanAttValue( const XMLAttDef* const attDef // Its got to at least be a valid XML character if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) { - // if it was a character reference and is control char, then it's ok - if (!(charref_expanded && fReaderMgr.getCurrentReader()->isControlChar(nextCh))) - { - XMLCh tmpBuf[9]; - XMLString::binToText - ( - nextCh - , tmpBuf - , 8 - , 16 - ); - emitError(XMLErrs::InvalidCharacterInAttrValue, attrName, tmpBuf); - } + XMLCh tmpBuf[9]; + XMLString::binToText + ( + nextCh + , tmpBuf + , 8 + , 16 + ); + emitError(XMLErrs::InvalidCharacterInAttrValue, attrName, tmpBuf); } } - charref_expanded = false; gotLeadingSurrogate = false; } @@ -1877,6 +1845,9 @@ bool IGXMLScanner::scanAttValue( const XMLAttDef* const attDef // Else add it to the buffer toFill.append(nextCh); + + if (secondCh) + toFill.append(secondCh); } } catch(const EndOfEntityException&) @@ -2093,44 +2064,36 @@ void IGXMLScanner::scanCharData(XMLBuffer& toUse) bool escaped = false; bool gotLeadingSurrogate = false; bool notDone = true; - bool charref_expanded = false; while (notDone) { try { while (true) { - if (secondCh) + // Eat through as many plain content characters as possible without + // needing special handling. Moving most content characters here, + // in this one call, rather than running the overall loop once + // per content character, is a speed optimization. + if (curState == State_Waiting && !gotLeadingSurrogate) { - nextCh = secondCh; - secondCh = 0; + fReaderMgr.movePlainContentChars(toUse); } - else - { - // Eat through as many plain content characters as possible without - // needing special handling. Moving most content characters here, - // in this one call, rather than running the overall loop once - // per content character, is a speed optimization. - if (curState == State_Waiting && !gotLeadingSurrogate) - { - fReaderMgr.movePlainContentChars(toUse); - } - // Try to get another char from the source - // The code from here on down covers all contengencies, - if (!fReaderMgr.getNextCharIfNot(chOpenAngle, nextCh)) - { - // If we were waiting for a trailing surrogate, its an error - if (gotLeadingSurrogate) - emitError(XMLErrs::Expected2ndSurrogateChar); + // Try to get another char from the source + // The code from here on down covers all contengencies, + if (!fReaderMgr.getNextCharIfNot(chOpenAngle, nextCh)) + { + // If we were waiting for a trailing surrogate, its an error + if (gotLeadingSurrogate) + emitError(XMLErrs::Expected2ndSurrogateChar); - notDone = false; - break; - } + notDone = false; + break; } // Watch for a reference. Note that the escapement mechanism // is ignored in this content. + escaped = false; if (nextCh == chAmpersand) { sendCharData(toUse); @@ -2143,42 +2106,10 @@ void IGXMLScanner::scanCharData(XMLBuffer& toUse) gotLeadingSurrogate = false; continue; } - charref_expanded = true; } - else - { - escaped = false; - } - - // Keep the state machine up to date - if (!escaped) - { - if (nextCh == chCloseSquare) - { - if (curState == State_Waiting) - curState = State_GotOne; - else if (curState == State_GotOne) - curState = State_GotTwo; - } - else if (nextCh == chCloseAngle) - { - if (curState == State_GotTwo) - emitError(XMLErrs::BadSequenceInCharData); - curState = State_Waiting; - } - else - { - curState = State_Waiting; - } - } - else - { - curState = State_Waiting; - } - - // Deal with surrogate pairs - if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) + else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) { + // Deal with surrogate pairs // Its a leading surrogate. If we already got one, then // issue an error, else set leading flag to make sure that // we look for a trailing next time. @@ -2208,27 +2139,51 @@ void IGXMLScanner::scanCharData(XMLBuffer& toUse) // Make sure the returned char is a valid XML char if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) { - // if it was a character reference and is control char, then it's ok - if (!(charref_expanded && fReaderMgr.getCurrentReader()->isControlChar(nextCh))) - { - XMLCh tmpBuf[9]; - XMLString::binToText - ( - nextCh - , tmpBuf - , 8 - , 16 - ); - emitError(XMLErrs::InvalidCharacter, tmpBuf); - } + XMLCh tmpBuf[9]; + XMLString::binToText + ( + nextCh + , tmpBuf + , 8 + , 16 + ); + emitError(XMLErrs::InvalidCharacter, tmpBuf); } } - charref_expanded = false; gotLeadingSurrogate = false; } + // Keep the state machine up to date + if (!escaped) + { + if (nextCh == chCloseSquare) + { + if (curState == State_Waiting) + curState = State_GotOne; + else if (curState == State_GotOne) + curState = State_GotTwo; + } + else if (nextCh == chCloseAngle) + { + if (curState == State_GotTwo) + emitError(XMLErrs::BadSequenceInCharData); + curState = State_Waiting; + } + else + { + curState = State_Waiting; + } + } + else + { + curState = State_Waiting; + } + // Add this char to the buffer toUse.append(nextCh); + + if (secondCh) + toUse.append(secondCh); } } catch(const EndOfEntityException& toCatch) diff --git a/src/xercesc/internal/SGXMLScanner.cpp b/src/xercesc/internal/SGXMLScanner.cpp index 552082abf948204bb36fa95f29d649219a7e41f3..f8707041bc14ded3f68bd17c8e1f39d6ce4dde4d 100644 --- a/src/xercesc/internal/SGXMLScanner.cpp +++ b/src/xercesc/internal/SGXMLScanner.cpp @@ -3224,23 +3224,13 @@ bool SGXMLScanner::basicAttrValueScan(const XMLCh* const attrName, XMLBuffer& to XMLCh secondCh = 0; bool gotLeadingSurrogate = false; bool escaped; - bool charref_expanded = false; while (true) { try { while(true) { - // Get another char. Use second char if one is waiting - if (secondCh) - { - nextCh = secondCh; - secondCh = 0; - } - else - { - nextCh = fReaderMgr.getNextChar(); - } + nextCh = fReaderMgr.getNextChar(); if (!nextCh) ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF); @@ -3271,12 +3261,10 @@ bool SGXMLScanner::basicAttrValueScan(const XMLCh* const attrName, XMLBuffer& to gotLeadingSurrogate = false; continue; } - charref_expanded = true; } - - // Deal with surrogate pairs - if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) + else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) { + // Deal with surrogate pairs // Its a leading surrogate. If we already got one, then // issue an error, else set leading flag to make sure that // we look for a trailing next time. @@ -3308,22 +3296,17 @@ bool SGXMLScanner::basicAttrValueScan(const XMLCh* const attrName, XMLBuffer& to // Its got to at least be a valid XML character else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) { - // if it was a character reference and is control char, then it's ok - if (!(charref_expanded && fReaderMgr.getCurrentReader()->isControlChar(nextCh))) - { - XMLCh tmpBuf[9]; - XMLString::binToText - ( - nextCh - , tmpBuf - , 8 - , 16 - ); - emitError(XMLErrs::InvalidCharacterInAttrValue, attrName, tmpBuf); - } + XMLCh tmpBuf[9]; + XMLString::binToText + ( + nextCh + , tmpBuf + , 8 + , 16 + ); + emitError(XMLErrs::InvalidCharacterInAttrValue, attrName, tmpBuf); } } - charref_expanded = false; gotLeadingSurrogate = false; } @@ -3336,6 +3319,9 @@ bool SGXMLScanner::basicAttrValueScan(const XMLCh* const attrName, XMLBuffer& to // Else add it to the buffer toFill.append(nextCh); + + if (secondCh) + toFill.append(secondCh); } } catch(const EndOfEntityException&) @@ -3553,44 +3539,36 @@ void SGXMLScanner::scanCharData(XMLBuffer& toUse) bool escaped = false; bool gotLeadingSurrogate = false; bool notDone = true; - bool charref_expanded = false; while (notDone) { try { while (true) { - if (secondCh) + // Eat through as many plain content characters as possible without + // needing special handling. Moving most content characters here, + // in this one call, rather than running the overall loop once + // per content character, is a speed optimization. + if (curState == State_Waiting && !gotLeadingSurrogate) { - nextCh = secondCh; - secondCh = 0; + fReaderMgr.movePlainContentChars(toUse); } - else - { - // Eat through as many plain content characters as possible without - // needing special handling. Moving most content characters here, - // in this one call, rather than running the overall loop once - // per content character, is a speed optimization. - if (curState == State_Waiting && !gotLeadingSurrogate) - { - fReaderMgr.movePlainContentChars(toUse); - } - // Try to get another char from the source - // The code from here on down covers all contengencies, - if (!fReaderMgr.getNextCharIfNot(chOpenAngle, nextCh)) - { - // If we were waiting for a trailing surrogate, its an error - if (gotLeadingSurrogate) - emitError(XMLErrs::Expected2ndSurrogateChar); + // Try to get another char from the source + // The code from here on down covers all contengencies, + if (!fReaderMgr.getNextCharIfNot(chOpenAngle, nextCh)) + { + // If we were waiting for a trailing surrogate, its an error + if (gotLeadingSurrogate) + emitError(XMLErrs::Expected2ndSurrogateChar); - notDone = false; - break; - } + notDone = false; + break; } // Watch for a reference. Note that the escapement mechanism // is ignored in this content. + escaped = false; if (nextCh == chAmpersand) { sendCharData(toUse); @@ -3603,42 +3581,10 @@ void SGXMLScanner::scanCharData(XMLBuffer& toUse) gotLeadingSurrogate = false; continue; } - charref_expanded = true; - } - else - { - escaped = false; - } - - // Keep the state machine up to date - if (!escaped) - { - if (nextCh == chCloseSquare) - { - if (curState == State_Waiting) - curState = State_GotOne; - else if (curState == State_GotOne) - curState = State_GotTwo; - } - else if (nextCh == chCloseAngle) - { - if (curState == State_GotTwo) - emitError(XMLErrs::BadSequenceInCharData); - curState = State_Waiting; - } - else - { - curState = State_Waiting; - } - } - else - { - curState = State_Waiting; } - - // Deal with surrogate pairs - if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) + else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) { + // Deal with surrogate pairs // Its a leading surrogate. If we already got one, then // issue an error, else set leading flag to make sure that // we look for a trailing next time. @@ -3668,27 +3614,51 @@ void SGXMLScanner::scanCharData(XMLBuffer& toUse) // Make sure the returned char is a valid XML char if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) { - // if it was a character reference and is control char, then it's ok - if (!(charref_expanded && fReaderMgr.getCurrentReader()->isControlChar(nextCh))) - { - XMLCh tmpBuf[9]; - XMLString::binToText - ( - nextCh - , tmpBuf - , 8 - , 16 - ); - emitError(XMLErrs::InvalidCharacter, tmpBuf); - } + XMLCh tmpBuf[9]; + XMLString::binToText + ( + nextCh + , tmpBuf + , 8 + , 16 + ); + emitError(XMLErrs::InvalidCharacter, tmpBuf); } } - charref_expanded = false; gotLeadingSurrogate = false; } + // Keep the state machine up to date + if (!escaped) + { + if (nextCh == chCloseSquare) + { + if (curState == State_Waiting) + curState = State_GotOne; + else if (curState == State_GotOne) + curState = State_GotTwo; + } + else if (nextCh == chCloseAngle) + { + if (curState == State_GotTwo) + emitError(XMLErrs::BadSequenceInCharData); + curState = State_Waiting; + } + else + { + curState = State_Waiting; + } + } + else + { + curState = State_Waiting; + } + // Add this char to the buffer toUse.append(nextCh); + + if (secondCh) + toUse.append(secondCh); } } catch(const EndOfEntityException& toCatch) diff --git a/src/xercesc/internal/WFXMLScanner.cpp b/src/xercesc/internal/WFXMLScanner.cpp index 12099dc8fb111c0ef6ae84381945a6b1969eb44a..d52e0b65efe3fd7fb99dd43440f47cf2092f80d8 100644 --- a/src/xercesc/internal/WFXMLScanner.cpp +++ b/src/xercesc/internal/WFXMLScanner.cpp @@ -1522,23 +1522,13 @@ bool WFXMLScanner::scanAttValue(const XMLCh* const attrName bool firstNonWS = false; bool gotLeadingSurrogate = false; bool escaped; - bool charref_expanded = false; while (true) { try { while(true) { - // Get another char. Use second char if one is waiting - if (secondCh) - { - nextCh = secondCh; - secondCh = 0; - } - else - { - nextCh = fReaderMgr.getNextChar(); - } + nextCh = fReaderMgr.getNextChar(); if (!nextCh) ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF); @@ -1568,12 +1558,10 @@ bool WFXMLScanner::scanAttValue(const XMLCh* const attrName gotLeadingSurrogate = false; continue; } - charref_expanded = true; } - - // Deal with surrogate pairs - if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) + else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) { + // Deal with surrogate pairs // Its a leading surrogate. If we already got one, then // issue an error, else set leading flag to make sure that // we look for a trailing next time. @@ -1605,30 +1593,24 @@ bool WFXMLScanner::scanAttValue(const XMLCh* const attrName // Its got to at least be a valid XML character else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) { - // if it was a character reference and is control char, then it's ok - if (!(charref_expanded && fReaderMgr.getCurrentReader()->isControlChar(nextCh))) - { - - XMLCh tmpBuf[9]; - XMLString::binToText - ( - nextCh - , tmpBuf - , 8 - , 16 - ); - emitError(XMLErrs::InvalidCharacterInAttrValue, attrName, tmpBuf); - } + XMLCh tmpBuf[9]; + XMLString::binToText + ( + nextCh + , tmpBuf + , 8 + , 16 + ); + emitError(XMLErrs::InvalidCharacterInAttrValue, attrName, tmpBuf); } } - charref_expanded = false; gotLeadingSurrogate = false; } // If its not escaped, then make sure its not a < character, which // is not allowed in attribute values. if (!escaped) { - if (nextCh == chOpenAngle) + if (nextCh == chOpenAngle) emitError(XMLErrs::BracketInAttrValue, attrName); else if (fReaderMgr.getCurrentReader()->isWhitespace(nextCh)) nextCh = chSpace; @@ -1636,6 +1618,9 @@ bool WFXMLScanner::scanAttValue(const XMLCh* const attrName // Else add it to the buffer toFill.append(nextCh); + + if (secondCh) + toFill.append(secondCh); } } catch(const EndOfEntityException&) @@ -1807,44 +1792,36 @@ void WFXMLScanner::scanCharData(XMLBuffer& toUse) bool escaped = false; bool gotLeadingSurrogate = false; bool notDone = true; - bool charref_expanded = false; while (notDone) { try { while (true) { - if (secondCh) + // Eat through as many plain content characters as possible without + // needing special handling. Moving most content characters here, + // in this one call, rather than running the overall loop once + // per content character, is a speed optimization. + if (curState == State_Waiting && !gotLeadingSurrogate) { - nextCh = secondCh; - secondCh = 0; + fReaderMgr.movePlainContentChars(toUse); } - else - { - // Eat through as many plain content characters as possible without - // needing special handling. Moving most content characters here, - // in this one call, rather than running the overall loop once - // per content character, is a speed optimization. - if (curState == State_Waiting && !gotLeadingSurrogate) - { - fReaderMgr.movePlainContentChars(toUse); - } - // Try to get another char from the source - // The code from here on down covers all contengencies, - if (!fReaderMgr.getNextCharIfNot(chOpenAngle, nextCh)) - { - // If we were waiting for a trailing surrogate, its an error - if (gotLeadingSurrogate) - emitError(XMLErrs::Expected2ndSurrogateChar); + // Try to get another char from the source + // The code from here on down covers all contengencies, + if (!fReaderMgr.getNextCharIfNot(chOpenAngle, nextCh)) + { + // If we were waiting for a trailing surrogate, its an error + if (gotLeadingSurrogate) + emitError(XMLErrs::Expected2ndSurrogateChar); - notDone = false; - break; - } + notDone = false; + break; } // Watch for a reference. Note that the escapement mechanism // is ignored in this content. + escaped = false; if (nextCh == chAmpersand) { sendCharData(toUse); @@ -1857,42 +1834,10 @@ void WFXMLScanner::scanCharData(XMLBuffer& toUse) gotLeadingSurrogate = false; continue; } - charref_expanded = true; - } - else - { - escaped = false; - } - - // Keep the state machine up to date - if (!escaped) - { - if (nextCh == chCloseSquare) - { - if (curState == State_Waiting) - curState = State_GotOne; - else if (curState == State_GotOne) - curState = State_GotTwo; - } - else if (nextCh == chCloseAngle) - { - if (curState == State_GotTwo) - emitError(XMLErrs::BadSequenceInCharData); - curState = State_Waiting; - } - else - { - curState = State_Waiting; - } } - else - { - curState = State_Waiting; - } - - // Deal with surrogate pairs - if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) + else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) { + // Deal with surrogate pairs // Its a leading surrogate. If we already got one, then // issue an error, else set leading flag to make sure that // we look for a trailing next time. @@ -1924,29 +1869,51 @@ void WFXMLScanner::scanCharData(XMLBuffer& toUse) // Its got to at least be a valid XML character else if (!fReaderMgr.getCurrentReader()->isXMLChar(nextCh)) { - // if it was a character reference and is control char, then it's ok - if (!(charref_expanded && fReaderMgr.getCurrentReader()->isControlChar(nextCh))) - { - - XMLCh tmpBuf[9]; - XMLString::binToText - ( - nextCh - , tmpBuf - , 8 - , 16 - ); - emitError(XMLErrs::InvalidCharacter, tmpBuf); - } + XMLCh tmpBuf[9]; + XMLString::binToText + ( + nextCh + , tmpBuf + , 8 + , 16 + ); + emitError(XMLErrs::InvalidCharacter, tmpBuf); } } - charref_expanded = false; gotLeadingSurrogate = false; } + // Keep the state machine up to date + if (!escaped) + { + if (nextCh == chCloseSquare) + { + if (curState == State_Waiting) + curState = State_GotOne; + else if (curState == State_GotOne) + curState = State_GotTwo; + } + else if (nextCh == chCloseAngle) + { + if (curState == State_GotTwo) + emitError(XMLErrs::BadSequenceInCharData); + curState = State_Waiting; + } + else + { + curState = State_Waiting; + } + } + else + { + curState = State_Waiting; + } // Add this char to the buffer toUse.append(nextCh); + + if (secondCh) + toUse.append(secondCh); } } catch(const EndOfEntityException& toCatch) diff --git a/src/xercesc/internal/XMLScanner.cpp b/src/xercesc/internal/XMLScanner.cpp index 6be0688cef0009cb59512529e158eff105106eec..54296478b4a7d9da79b8db7e779c876993e54221 100644 --- a/src/xercesc/internal/XMLScanner.cpp +++ b/src/xercesc/internal/XMLScanner.cpp @@ -1725,16 +1725,27 @@ bool XMLScanner::scanCharRef(XMLCh& toFill, XMLCh& second) } // Return the char (or chars) - if (value >= 0x10000) + // And check if the character expanded is valid or not + if (value >= 0x10000 && value <= 0x10FFFF) { value -= 0x10000; toFill = XMLCh((value >> 10) + 0xD800); second = XMLCh((value & 0x3FF) + 0xDC00); } - else + else if (value <= 0xFFFD) { toFill = XMLCh(value); second = 0; + if (!fReaderMgr.getCurrentReader()->isXMLChar(toFill) && !fReaderMgr.getCurrentReader()->isControlChar(toFill)) { + // Character reference was not in the valid range + emitError(XMLErrs::InvalidCharacterRef); + return false; + } + } + else { + // Character reference was not in the valid range + emitError(XMLErrs::InvalidCharacterRef); + return false; } return true; diff --git a/src/xercesc/validators/DTD/DTDScanner.cpp b/src/xercesc/validators/DTD/DTDScanner.cpp index 4f0e8eb16fe03237c97286c54d585bcfb4da4cff..0631696820f3c4c9220faebcde5ac93dab2a8ebc 100644 --- a/src/xercesc/validators/DTD/DTDScanner.cpp +++ b/src/xercesc/validators/DTD/DTDScanner.cpp @@ -56,6 +56,9 @@ /* * $Log$ + * Revision 1.20 2002/12/24 16:12:19 tng + * For performance reason, move the character check to scancharref. + * * Revision 1.19 2002/12/20 22:10:47 tng * XML 1.1 * @@ -918,23 +921,13 @@ bool DTDScanner::scanAttValue(const XMLCh* const attrName bool firstNonWS = false; bool gotLeadingSurrogate = false; bool escaped; - bool charref_expanded = false; while (true) { try { while(true) { - // Get another char. Use second char from prevous is its there - if (secondCh) - { - nextCh = secondCh; - secondCh = 0; - } - else - { - nextCh = fReaderMgr->getNextChar(); - } + nextCh = fReaderMgr->getNextChar(); if (!nextCh) ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF); @@ -966,12 +959,10 @@ bool DTDScanner::scanAttValue(const XMLCh* const attrName gotLeadingSurrogate = false; continue; } - charref_expanded = true; } - - // Check for correct surrogate pairs - if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) + else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) { + // Check for correct surrogate pairs if (gotLeadingSurrogate) fScanner->emitError(XMLErrs::Expected2ndSurrogateChar); else @@ -985,29 +976,25 @@ bool DTDScanner::scanAttValue(const XMLCh* const attrName fScanner->emitError(XMLErrs::Expected2ndSurrogateChar); } // Its got to at least be a valid XML character - else if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh)) { - // if it was a character reference and is control char, then it's ok - if (!(charref_expanded && fReaderMgr->getCurrentReader()->isControlChar(nextCh))) - { - XMLCh tmpBuf[9]; - XMLString::binToText - ( - nextCh - , tmpBuf - , 8 - , 16 - ); - fScanner->emitError - ( - XMLErrs::InvalidCharacterInAttrValue - , attrName - , tmpBuf - ); - } + else if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh)) + { + XMLCh tmpBuf[9]; + XMLString::binToText + ( + nextCh + , tmpBuf + , 8 + , 16 + ); + fScanner->emitError + ( + XMLErrs::InvalidCharacterInAttrValue + , attrName + , tmpBuf + ); } gotLeadingSurrogate = false; - charref_expanded = false; } // @@ -1063,6 +1050,9 @@ bool DTDScanner::scanAttValue(const XMLCh* const attrName // Else add it to the buffer toFill.append(nextCh); + + if (secondCh) + toFill.append(secondCh); } } @@ -1164,17 +1154,29 @@ bool DTDScanner::scanCharRef(XMLCh& first, XMLCh& second) } // Return the char (or chars) - if (value >= 0x10000) + // And check if the character expanded is valid or not + if (value >= 0x10000 && value <= 0x10FFFF) { value -= 0x10000; - first = XMLCh((value >> 10) + 0xD800); + first = XMLCh((value >> 10) + 0xD800); second = XMLCh((value & 0x3FF) + 0xDC00); } - else + else if (value <= 0xFFFD) { - first = XMLCh(value); + first = XMLCh(value); second = 0; + if (!fReaderMgr->getCurrentReader()->isXMLChar(first) && !fReaderMgr->getCurrentReader()->isControlChar(first)) { + // Character reference was not in the valid range + fScanner->emitError(XMLErrs::InvalidCharacterRef); + return false; + } } + else { + // Character reference was not in the valid range + fScanner->emitError(XMLErrs::InvalidCharacterRef); + return false; + } + return true; } @@ -2156,19 +2158,9 @@ bool DTDScanner::scanEntityLiteral(XMLBuffer& toFill, const bool isPE) XMLCh nextCh; XMLCh secondCh = 0; bool gotLeadingSurrogate = false; - bool charref_expanded = false; while (true) { - // Get the second char if we have one, else get another - if (secondCh) - { - nextCh = secondCh; - secondCh = 0; - } - else - { - nextCh = fReaderMgr->getNextChar(); - } + nextCh = fReaderMgr->getNextChar(); // // Watch specifically for EOF and issue a more meaningful error @@ -2226,8 +2218,6 @@ bool DTDScanner::scanEntityLiteral(XMLBuffer& toFill, const bool isPE) gotLeadingSurrogate = false; continue; } - charref_expanded = true; - } else { @@ -2262,8 +2252,7 @@ bool DTDScanner::scanEntityLiteral(XMLBuffer& toFill, const bool isPE) gotLeadingSurrogate = false; } } - - if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) + else if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF)) { if (gotLeadingSurrogate) fScanner->emitError(XMLErrs::Expected2ndSurrogateChar); @@ -2279,28 +2268,26 @@ bool DTDScanner::scanEntityLiteral(XMLBuffer& toFill, const bool isPE) } else if (!fReaderMgr->getCurrentReader()->isXMLChar(nextCh)) { - // if it was a character reference and is control char, then it's ok - if (!(charref_expanded && fReaderMgr->getCurrentReader()->isControlChar(nextCh))) - { - XMLCh tmpBuf[9]; - XMLString::binToText - ( - nextCh - , tmpBuf - , 8 - , 16 - ); - fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf); - fReaderMgr->skipPastChar(quoteCh); - return false; - } + XMLCh tmpBuf[9]; + XMLString::binToText + ( + nextCh + , tmpBuf + , 8 + , 16 + ); + fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf); + fReaderMgr->skipPastChar(quoteCh); + return false; } - charref_expanded = false; gotLeadingSurrogate = false; } // Looks ok, so add it to the literal toFill.append(nextCh); + + if (secondCh) + toFill.append(secondCh); } //