jsar-project · yorkie · Jul 18, 2025 · Jul 18, 2025 · Jul 18, 2025 · Jul 18, 2025
diff --git a/fixtures/html/unquoted-attributes-demo.html b/fixtures/html/unquoted-attributes-demo.html
@@ -0,0 +1,48 @@
+<!DOCTYPE html>
+<html>
+<head>
+  <title>Unquoted Attributes Demo</title>
+  <meta charset=UTF-8>
+</head>
+<body style="background-color: #fff;">
+  <h1>HTML Unquoted Attributes Support Demo</h1>
+
+  <p>This page demonstrates various HTML elements with unquoted attributes that are now supported:</p>
+
+  <!-- Basic unquoted attribute -->
+  <a href=foobar>Basic unquoted href</a>
+
+  <!-- Multiple unquoted attributes -->
+  <div class=container id=main style=color:red>
+    Container with multiple unquoted attributes
+  </div>
+
+  <!-- Mixed quoted and unquoted attributes -->
+  <img src="images/jsar-logo-00.png" alt="quoted title" title='single-quoted' width=100 height=100>
+
+  <!-- Self-closing tags with unquoted attributes; the second input also carries a valueless (boolean) attribute -->
+  <input type=text name=username placeholder=username />
+  <input type=checkbox checked name=remember value=true />
+
+  <!-- Form elements -->
+  <form action=submit.php method=post>
+    <input type=email name=email required>
+    <input type=password name=password minlength=8>
+    <button type=submit>Submit</button>
+  </form>
+
+  <!-- Complex nested HTML with unquoted attributes -->
+  <section class=content id=main-content>
+    <article class=post data-id=123>
+      <header class=post-header>
+        <h2 class=post-title>Article Title</h2>
+        <time datetime=2023-01-01 class=post-date>January 1, 2023</time>
+      </header>
+      <div class=post-content>
+        <p style=font-size:14px>This paragraph has inline styles with unquoted attributes.</p>
+        <a href=page.html class=link-external target=_blank>External Link</a>
+      </div>
+    </article>
+  </section>
+</body>
+</html>
diff --git a/src/client/dom/document.cpp b/src/client/dom/document.cpp
@@ -173,6 +173,8 @@ namespace dom
     auto flag = pugi::parse_default | pugi::parse_ws_pcdata | pugi::parse_comments;
     if (isFragment)
       flag |= pugi::parse_fragment;
+    if (documentType == DocumentType::kHTML)
+      flag |= pugi::parse_unquoted_attributes;
 
     auto r = doc_internal_->load_string(inputText.c_str(), flag);
     if (r.status != pugi::xml_parse_status::status_ok) [[unlikely]]

diff --git a/src/pugixml/pugixml.cpp b/src/pugixml/pugixml.cpp
@@ -2913,6 +2913,84 @@ PUGI_IMPL_NS_BEGIN
 		}
 	};
 
+	// Terminates an unquoted attribute value at 'at' while keeping the character stored there:
+	// everything from 'at' up to and including the terminating zero is moved one character to the right,
+	// then zero is written into the freed slot. Returns a pointer to the moved copy of the original *at.
+	// The move is linear in the length of the remaining input.
+	// NOTE(review): the move lengthens the zero-terminated string by one character, i.e. it writes one char_t
+	// past the old terminator, and nothing here can tell whether the buffer has room for that. Presumably the
+	// parse buffer is sized exactly for the input - please confirm; if so this is an out-of-bounds write and
+	// the termination should move into the caller (save the delimiter, overwrite it with zero, dispatch on
+	// the saved character) so that no shifting is needed at all.
+	PUGI_IMPL_FN char_t* terminate_unquoted_by_shift(char_t* at)
+	{
+		char_t* end = at;
+		while (*end) ++end;
+
+		// +1 so that the terminating zero is moved together with the text
+		memmove(at + 1, at, static_cast<size_t>(end - at + 1) * sizeof(char_t));
+
+		*at = 0;
+
+		return at + 1;
+	}
+
+	// Scans an unquoted (HTML-style) attribute value that starts at 's' and zero-terminates it in place.
+	// The value ends at the first whitespace, quote, '=', '<', '>' or '/' character, or at the end of the buffer.
+	// 'strconv_attribute' is not used: the value is stored verbatim, this function performs no conversion on it.
+	// Returns NULL if the value is empty, otherwise the position at which the caller should continue parsing the tag.
+	PUGI_IMPL_FN char_t* parse_unquoted_attribute(char_t* s, strconv_attribute_t strconv_attribute)
+	{
+		(void)strconv_attribute;
+
+		char_t* value_start = s;
+
+		while (*s && !PUGI_IMPL_IS_CHARTYPE(*s, ct_space) &&
+		       *s != '"' && *s != '\'' && *s != '=' && *s != '<' && *s != '>' && *s != '/')
+		{
+			++s;
+		}
+
+		// An empty value (e.g. 'href=>') is an error
+		if (s == value_start) return NULL;
+
+		// End of buffer: the value is already zero-terminated
+		if (*s == 0) return s;
+
+		// The value ended at whitespace and the next character cannot start an attribute name (more whitespace,
+		// '/', '>', end of buffer, ...): the whitespace itself can become the terminator, nothing has to move.
+		// s[1] is readable because *s is non-zero, so at worst it is the terminating zero.
+		if (PUGI_IMPL_IS_CHARTYPE(*s, ct_space) && !PUGI_IMPL_IS_CHARTYPE(s[1], ct_start_symbol))
+		{
+			*s = 0;
+			return s + 1;
+		}
+
+		// Either a single whitespace character separates the value from the next attribute name, or the value is
+		// directly followed by a delimiter such as '>' or '/'. The caller still needs to see that character after
+		// the value has been terminated, so room for the terminator is made by shifting the rest of the buffer.
+		return terminate_unquoted_by_shift(s);
+	}
+
+
 	PUGI_IMPL_FN strconv_attribute_t get_strconv_attribute(unsigned int optmask)
 	{
 		PUGI_IMPL_STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_wconv_attribute == 0x40 && parse_wnorm_attribute == 0x80);
@@ -3442,6 +3520,18 @@ PUGI_IMPL_NS_BEGIN
 											// everything else will be detected
 											if (PUGI_IMPL_IS_CHARTYPE(*s, ct_start_symbol)) PUGI_IMPL_THROW_ERROR(status_bad_attribute, s);
 										}
+										else if (PUGI_IMPL_OPTSET(parse_unquoted_attributes))
+										{
+											// Handle unquoted attribute values (HTML-style)
+											a->value = s; // Save the offset.
+
+											s = parse_unquoted_attribute(s, strconv_attribute);
+
+											if (!s) PUGI_IMPL_THROW_ERROR(status_bad_attribute, a->value);
+
+											// Check for invalid characters after unquoted value
+											if (PUGI_IMPL_IS_CHARTYPE(*s, ct_start_symbol)) PUGI_IMPL_THROW_ERROR(status_bad_attribute, s);
+										}
 										else PUGI_IMPL_THROW_ERROR(status_bad_attribute, s);
 									}
 									else // '<... #="..." ...>

diff --git a/src/pugixml/pugixml.hpp b/src/pugixml/pugixml.hpp
@@ -219,6 +219,9 @@ namespace pugi
 	// This flag is off by default.
 	const unsigned int parse_merge_pcdata = 0x4000;
 
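+	// This flag determines if unquoted attribute values (HTML-style, e.g. <a href=foo>) are accepted in addition to quoted ones.
+	// An unquoted value ends at the first whitespace, quote, '=', '<', '>' or '/' character.
+	// This flag is off by default.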
+	const unsigned int parse_unquoted_attributes = 0x8000;
+
 	// The default parsing mode.
 	// Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded,
 	// End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.

diff --git a/tests/client/unquoted_attributes_tests.cpp b/tests/client/unquoted_attributes_tests.cpp
@@ -0,0 +1,123 @@
+#define CATCH_CONFIG_MAIN
+#include "../catch2/catch_amalgamated.hpp"
+#include <pugixml/pugixml.hpp>
+#include <string>
+
+using namespace std;
+
+// Exercises pugi::parse_unquoted_attributes directly against pugixml.
+// Every section loads a literal document and checks the parse status and the resulting values.
+// All documents are well-formed apart from the unquoted values, so that a failure points at
+// attribute parsing and not at unrelated tag-matching rules.
+TEST_CASE("pugixml unquoted attributes parsing", "[HTML][Parsing]")
+{
+  const unsigned int unquotedFlags = pugi::parse_default | pugi::parse_unquoted_attributes;
+
+  SECTION("Basic unquoted attribute")
+  {
+    pugi::xml_document doc;
+    pugi::xml_parse_result result = doc.load_string("<a href=foobar></a>", unquotedFlags);
+
+    REQUIRE(result);
+    REQUIRE(std::string(doc.child("a").attribute("href").value()) == "foobar");
+  }
+
+  SECTION("Multiple unquoted attributes")
+  {
+    pugi::xml_document doc;
+    pugi::xml_parse_result result = doc.load_string("<div class=container id=main></div>", unquotedFlags);
+
+    REQUIRE(result);
+    auto div = doc.child("div");
+    REQUIRE(std::string(div.attribute("class").value()) == "container");
+    REQUIRE(std::string(div.attribute("id").value()) == "main");
+  }
+
+  SECTION("Several whitespace characters between unquoted attributes")
+  {
+    pugi::xml_document doc;
+    pugi::xml_parse_result result = doc.load_string("<div class=container   id=main></div>", unquotedFlags);
+
+    REQUIRE(result);
+    auto div = doc.child("div");
+    REQUIRE(std::string(div.attribute("class").value()) == "container");
+    REQUIRE(std::string(div.attribute("id").value()) == "main");
+  }
+
+  SECTION("Mixed quoted and unquoted attributes")
+  {
+    pugi::xml_document doc;
+    pugi::xml_parse_result result = doc.load_string("<a href=foobar title=\"quoted title\" class='single-quoted'></a>", unquotedFlags);
+
+    REQUIRE(result);
+    auto a = doc.child("a");
+    REQUIRE(std::string(a.attribute("href").value()) == "foobar");
+    REQUIRE(std::string(a.attribute("title").value()) == "quoted title");
+    REQUIRE(std::string(a.attribute("class").value()) == "single-quoted");
+  }
+
+  SECTION("Self-closing tag with unquoted attributes")
+  {
+    pugi::xml_document doc;
+    pugi::xml_parse_result result = doc.load_string("<input type=text />", unquotedFlags);
+
+    REQUIRE(result);
+    auto input = doc.child("input");
+    REQUIRE(std::string(input.attribute("type").value()) == "text");
+  }
+
+  SECTION("Unquoted value directly followed by the self-closing delimiter")
+  {
+    // '/' terminates an unquoted value, so it is not part of the value here.
+    pugi::xml_document doc;
+    pugi::xml_parse_result result = doc.load_string("<input type=text/>", unquotedFlags);
+
+    REQUIRE(result);
+    REQUIRE(std::string(doc.child("input").attribute("type").value()) == "text");
+  }
+
+  SECTION("Content after an unquoted attribute is preserved")
+  {
+    // The value is terminated in place; the rest of the document must stay intact.
+    pugi::xml_document doc;
+    pugi::xml_parse_result result = doc.load_string("<a href=foobar>link text</a>", unquotedFlags);
+
+    REQUIRE(result);
+    auto a = doc.child("a");
+    REQUIRE(std::string(a.attribute("href").value()) == "foobar");
+    REQUIRE(std::string(a.text().get()) == "link text");
+  }
+
+  SECTION("Missing unquoted value is rejected")
+  {
+    pugi::xml_document doc;
+    pugi::xml_parse_result result = doc.load_string("<a href=></a>", unquotedFlags);
+
+    REQUIRE(!result);
+    REQUIRE(result.status == pugi::status_bad_attribute);
+  }
+
+  // TODO: valueless (boolean) attributes such as `checked` are not supported yet;
+  // enable this section once they are.
+  /*
+  SECTION("Boolean attributes remain unchanged")
+  {
+    pugi::xml_document doc;
+    pugi::xml_parse_result result = doc.load_string("<input type=checkbox checked />", unquotedFlags);
+
+    REQUIRE(result);
+    auto input = doc.child("input");
+    REQUIRE(std::string(input.attribute("type").value()) == "checkbox");
+    REQUIRE(std::string(input.attribute("checked").value()) == "checked");
+  }
+  */
+
+  SECTION("Already quoted attributes remain unchanged")
+  {
+    // The element is self-closed so that the result does not depend on how unclosed tags are treated.
+    pugi::xml_document doc;
+    pugi::xml_parse_result result = doc.load_string("<img src=\"image.jpg\" alt=\"test image\" />", unquotedFlags);
+
+    REQUIRE(result);
+    auto img = doc.child("img");
+    REQUIRE(std::string(img.attribute("src").value()) == "image.jpg");
+    REQUIRE(std::string(img.attribute("alt").value()) == "test image");
+  }
+
+  SECTION("Without unquoted attributes flag should fail")
+  {
+    pugi::xml_document doc;
+    pugi::xml_parse_result result = doc.load_string("<a href=foobar></a>", pugi::parse_default);
+
+    REQUIRE(!result);
+    REQUIRE(result.status == pugi::status_bad_attribute);
+  }
+
+  SECTION("Unquoted attribute with special characters")
+  {
+    pugi::xml_document doc;
+    pugi::xml_parse_result result = doc.load_string("<a href=foo-bar_baz.html></a>", unquotedFlags);
+
+    REQUIRE(result);
+    REQUIRE(std::string(doc.child("a").attribute("href").value()) == "foo-bar_baz.html");
+  }
+
+  SECTION("Empty string")
+  {
+    pugi::xml_document doc;
+    pugi::xml_parse_result result = doc.load_string("", unquotedFlags);
+
+    // Empty string should fail - it's not a valid XML document
+    REQUIRE(!result);
+    REQUIRE(result.status == pugi::status_no_document_element);
+  }
+
+  SECTION("No attributes")
+  {
+    pugi::xml_document doc;
+    pugi::xml_parse_result result = doc.load_string("<div>content</div>", unquotedFlags);
+
+    REQUIRE(result);
+    REQUIRE(std::string(doc.child("div").text().get()) == "content");
+  }
+}