From e00cbff98ad52be874584ff99d0634b1443ee0e4 Mon Sep 17 00:00:00 2001
From: Khaled Noaman <knoaman@apache.org>
Date: Fri, 14 Dec 2001 20:21:37 +0000
Subject: [PATCH] Add surrogate support to comments and processing
 instructions.

git-svn-id: https://svn.apache.org/repos/asf/xerces/c/trunk@173390 13f79535-47bb-0310-9956-ffa450edef68
---
 src/internal/XMLScanner.cpp       | 42 ++++++++++----
 src/internal/XMLScanner2.cpp      | 41 ++++++++++----
 src/validators/DTD/DTDScanner.cpp | 94 ++++++++++++++++++++++---------
 3 files changed, 129 insertions(+), 48 deletions(-)

diff --git a/src/internal/XMLScanner.cpp b/src/internal/XMLScanner.cpp
index 009389c1e..1b293ba32 100644
--- a/src/internal/XMLScanner.cpp
+++ b/src/internal/XMLScanner.cpp
@@ -2066,6 +2066,8 @@ void XMLScanner::scanPI()
         // Skip any leading spaces
         fReaderMgr.skipPastSpaces();
 
+        bool gotLeadingSurrogate = false;
+
         // It does have a target, so lets move on to deal with that.
         while (1)
         {
@@ -2086,18 +2088,36 @@ void XMLScanner::scanPI()
                     break;
             }
 
-            // Watch for invalid chars but try to keep going
-            if (!XMLReader::isXMLChar(nextCh))
+            // Check for correct surrogate pairs
+            if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
             {
-                XMLCh tmpBuf[9];
-                XMLString::binToText
-                (
-                    nextCh
-                    , tmpBuf
-                    , 8
-                    , 16
-                );
-                emitError(XMLErrs::InvalidCharacter, tmpBuf);
+                if (gotLeadingSurrogate)
+                    emitError(XMLErrs::Expected2ndSurrogateChar);
+                else
+                    gotLeadingSurrogate = true;
+            }
+             else
+            {
+                if (gotLeadingSurrogate)   // previous char was a lead surrogate (0xD800-0xDBFF)
+                {
+                    if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))   // outside the trail range 0xDC00-0xDFFF
+                        emitError(XMLErrs::Expected2ndSurrogateChar);
+                }
+                // Its got to at least be a valid XML character
+                else if (!XMLReader::isXMLChar(nextCh)) {
+
+                    XMLCh tmpBuf[9];
+                    XMLString::binToText
+                    (
+                        nextCh
+                        , tmpBuf
+                        , 8
+                        , 16
+                    );
+                    emitError(XMLErrs::InvalidCharacter, tmpBuf);
+                }
+
+                gotLeadingSurrogate = false;
             }
 
             bbTarget.append(nextCh);
diff --git a/src/internal/XMLScanner2.cpp b/src/internal/XMLScanner2.cpp
index 6af930f5b..01f4090ea 100644
--- a/src/internal/XMLScanner2.cpp
+++ b/src/internal/XMLScanner2.cpp
@@ -2615,6 +2615,7 @@ void XMLScanner::scanComment()
     //  than just a name.
     //
     States curState = InText;
+    bool gotLeadingSurrogate = false;
     while (true)
     {
         // Get the next character
@@ -2627,18 +2628,36 @@ void XMLScanner::scanComment()
             ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
         }
 
-        // Make sure its a valid XML character
-        if (!XMLReader::isXMLChar(nextCh))
+        // Check for correct surrogate pairs
+        if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
         {
-            XMLCh tmpBuf[9];
-            XMLString::binToText
-            (
-                nextCh
-                , tmpBuf
-                , 8
-                , 16
-            );
-            emitError(XMLErrs::InvalidCharacter, tmpBuf);
+            if (gotLeadingSurrogate)
+                emitError(XMLErrs::Expected2ndSurrogateChar);
+            else
+                gotLeadingSurrogate = true;
+        }
+        else
+        {
+            if (gotLeadingSurrogate)   // previous char was a lead surrogate (0xD800-0xDBFF)
+            {
+                if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))   // outside the trail range 0xDC00-0xDFFF
+                    emitError(XMLErrs::Expected2ndSurrogateChar);
+            }
+            // Its got to at least be a valid XML character
+            else if (!XMLReader::isXMLChar(nextCh)) {
+
+                XMLCh tmpBuf[9];
+                XMLString::binToText
+                (
+                    nextCh
+                    , tmpBuf
+                    , 8
+                    , 16
+                );
+                emitError(XMLErrs::InvalidCharacter, tmpBuf);
+            }
+
+            gotLeadingSurrogate = false;
         }
 
         if (curState == InText)
diff --git a/src/validators/DTD/DTDScanner.cpp b/src/validators/DTD/DTDScanner.cpp
index 4e659ead4..7cd9075b6 100644
--- a/src/validators/DTD/DTDScanner.cpp
+++ b/src/validators/DTD/DTDScanner.cpp
@@ -56,6 +56,9 @@
 
 /*
  * $Log$
+ * Revision 1.23  2001/12/14 20:21:37  knoaman
+ * Add surrogate support to comments and processing instructions.
+ *
  * Revision 1.22  2001/12/06 17:51:18  tng
  * Performance Enhancement. The ContentSpecNode constructor always copied the QName
  * that was passed to it.  Added a second constructor that allows the QName to be just assigned, not copied.
@@ -884,11 +887,9 @@ bool DTDScanner::scanAttValue(const   XMLCh* const        attrName
                     if ((nextCh < 0xDC00) && (nextCh > 0xDFFF))
                         fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
                 }
-                gotLeadingSurrogate = false;
-
                 // Its got to at least be a valid XML character
-                if (!XMLReader::isXMLChar(nextCh))
-                {
+                else if (!XMLReader::isXMLChar(nextCh)) {
+
                     XMLCh tmpBuf[9];
                     XMLString::binToText
                     (
@@ -904,6 +905,8 @@ bool DTDScanner::scanAttValue(const   XMLCh* const        attrName
                         , tmpBuf
                     );
                 }
+
+                gotLeadingSurrogate = false;
             }
 
             //
@@ -1384,6 +1387,7 @@ void DTDScanner::scanComment()
     //  two here, since its to be used for stuff that is potentially longer
     //  than just a name.
     //
+    bool   gotLeadingSurrogate = false;
     States curState = InText;
     while (true)
     {
@@ -1397,18 +1401,36 @@ void DTDScanner::scanComment()
             ThrowXML(UnexpectedEOFException, XMLExcepts::Gen_UnexpectedEOF);
         }
 
-        // Make sure its a valid XML character
-        if (!XMLReader::isXMLChar(nextCh))
+        // Check for correct surrogate pairs
+        if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
         {
-            XMLCh tmpBuf[9];
-            XMLString::binToText
-            (
-                nextCh
-                , tmpBuf
-                , 8
-                , 16
-            );
-            fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf);
+            if (gotLeadingSurrogate)
+                fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
+            else
+                gotLeadingSurrogate = true;
+        }
+        else
+        {
+            if (gotLeadingSurrogate)   // previous char was a lead surrogate (0xD800-0xDBFF)
+            {
+                if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))   // outside the trail range 0xDC00-0xDFFF
+                    fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
+            }
+            // Its got to at least be a valid XML character
+            else if (!XMLReader::isXMLChar(nextCh)) {
+
+                XMLCh tmpBuf[9];
+                XMLString::binToText
+                (
+                    nextCh
+                    , tmpBuf
+                    , 8
+                    , 16
+                );
+                fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf);
+            }
+
+            gotLeadingSurrogate = false;
         }
 
         if (curState == InText)
@@ -3629,6 +3651,8 @@ void DTDScanner::scanPI()
         // Skip any leading spaces
         fReaderMgr->skipPastSpaces();
 
+        bool gotLeadingSurrogate = false;
+
         // It does have a target, so lets move on to deal with that.
         while (1)
         {
@@ -3649,18 +3673,36 @@ void DTDScanner::scanPI()
                     break;
             }
 
-            // Watch for invalid chars but try to keep going
-            if (!XMLReader::isXMLChar(nextCh))
+            // Check for correct surrogate pairs
+            if ((nextCh >= 0xD800) && (nextCh <= 0xDBFF))
             {
-                XMLCh tmpBuf[9];
-                XMLString::binToText
-                (
-                    nextCh
-                    , tmpBuf
-                    , 8
-                    , 16
-                );
-                fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf);
+                if (gotLeadingSurrogate)
+                    fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
+                else
+                    gotLeadingSurrogate = true;
+            }
+             else
+            {
+                if (gotLeadingSurrogate)   // previous char was a lead surrogate (0xD800-0xDBFF)
+                {
+                    if ((nextCh < 0xDC00) || (nextCh > 0xDFFF))   // outside the trail range 0xDC00-0xDFFF
+                        fScanner->emitError(XMLErrs::Expected2ndSurrogateChar);
+                }
+                // Its got to at least be a valid XML character
+                else if (!XMLReader::isXMLChar(nextCh)) {
+
+                    XMLCh tmpBuf[9];
+                    XMLString::binToText
+                    (
+                        nextCh
+                        , tmpBuf
+                        , 8
+                        , 16
+                    );
+                    fScanner->emitError(XMLErrs::InvalidCharacter, tmpBuf);
+                }
+
+                gotLeadingSurrogate = false;
             }
             bbTarget.append(nextCh);
         }
-- 
GitLab