Skip to content
48 changes: 48 additions & 0 deletions fixtures/html/unquoted-attributes-demo.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
<!DOCTYPE html>
<html>
<head>
<title>Unquoted Attributes Demo</title>
<meta charset=UTF-8>
</head>
<body style="background-color: #fff;">
<h1>HTML Unquoted Attributes Support Demo</h1>

<p>This page demonstrates various HTML elements with unquoted attributes that are now supported:</p>

<!-- Basic unquoted attribute -->
<a href=foobar>Basic unquoted href</a>

<!-- Multiple unquoted attributes -->
<div class=container id=main style=color:red>
Container with multiple unquoted attributes
</div>

<!-- Mixed quoted and unquoted attributes -->
<img src="images/jsar-logo-00.png" alt="quoted title" title='single-quoted' width=100 height=100>

<!-- Self-closing tags with unquoted attributes -->
<input type=text name=username placeholder=username />
<input type=checkbox checked name=remember value=true />

<!-- Form elements -->
<form action=submit.php method=post>
<input type=email name=email required>
<input type=password name=password minlength=8>
<button type=submit>Submit</button>
</form>

<!-- Complex nested HTML with unquoted attributes -->
<section class=content id=main-content>
<article class=post data-id=123>
<header class=post-header>
<h2 class=post-title>Article Title</h2>
<time datetime=2023-01-01 class=post-date>January 1, 2023</time>
</header>
<div class=post-content>
<p style=font-size:14px>This paragraph has inline styles with unquoted attributes.</p>
<a href=page.html class=link-external target=_blank>External Link</a>
</div>
</article>
</section>
</body>
</html>
2 changes: 2 additions & 0 deletions src/client/dom/document.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,8 @@ namespace dom
auto flag = pugi::parse_default | pugi::parse_ws_pcdata | pugi::parse_comments;
if (isFragment)
flag |= pugi::parse_fragment;
if (documentType == DocumentType::kHTML)
flag |= pugi::parse_unquoted_attributes;

auto r = doc_internal_->load_string(inputText.c_str(), flag);
if (r.status != pugi::xml_parse_status::status_ok) [[unlikely]]
Expand Down
90 changes: 90 additions & 0 deletions src/pugixml/pugixml.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2913,6 +2913,84 @@ PUGI_IMPL_NS_BEGIN
}
};

// Parses an unquoted (HTML-style) attribute value in place.
//
// s points at the first character of the value inside the mutable, zero-terminated parse buffer.
// The value runs up to (not including) the first whitespace character, quote ('"' or '\''),
// '=', '<', '>', '/' or the terminating zero.
//
// Returns NULL if the value is empty. Otherwise the value is zero-terminated inside the buffer and
// the returned pointer is the position where the caller resumes scanning the tag:
//   - the terminating zero itself when the value ran to the end of the buffer;
//   - the character following the overwritten whitespace when that is enough to separate the value;
//   - the original delimiter / whitespace (moved one slot to the right) otherwise.
//
// strconv_attribute is currently unused: the value is stored verbatim, so whatever conversion that
// callback performs for quoted values is not applied to unquoted ones.
//
// NOTE(review): the shifting fallback moves the rest of the buffer, including its terminating zero,
// one element to the right. It therefore needs one spare char_t after the terminator for every
// shifted value, which this function has no way to verify, and it is linear in the remaining input.
// Getting rid of it requires the call site to take over termination (e.g. remember the delimiter
// before overwriting it) - confirm the buffer capacity assumption or restructure the caller.
PUGI_IMPL_FN char_t* parse_unquoted_attribute(char_t* s, strconv_attribute_t strconv_attribute)
{
	(void)strconv_attribute; // not applied to unquoted values (see above)

	char_t* value_start = s;

	// Scan until whitespace, quotes, '=', '<', '>', '/', or end of string
	while (*s && !PUGI_IMPL_IS_CHARTYPE(*s, ct_space) &&
		*s != '"' && *s != '\'' && *s != '=' && *s != '<' && *s != '>' && *s != '/')
	{
		++s;
	}

	// An empty value is an error for the caller to report
	if (s == value_start) return NULL;

	// Value ran to the end of the buffer - it is already zero-terminated
	if (*s == 0) return s;

	// A whitespace terminator can be overwritten directly unless the very next character starts an
	// attribute name: the caller rejects a result that points straight at a name start character,
	// so in that case the whitespace has to survive as a separator.
	// Reading s[1] is safe here because *s is a non-zero character of a zero-terminated buffer.
	if (PUGI_IMPL_IS_CHARTYPE(*s, ct_space) && !PUGI_IMPL_IS_CHARTYPE(s[1], ct_start_symbol))
	{
		*s = 0;
		return s + 1;
	}

	// Fallback: the character at s (a delimiter such as '>' or '/', or a separating whitespace) must
	// stay visible to the caller, so open a one-element gap in front of it for the terminator.
	char_t* end = s;
	while (*end) ++end;

	// Moves [s, end] (terminator included) to [s + 1, end + 1]; see NOTE(review) above.
	memmove(s + 1, s, (end - s + 1) * sizeof(char_t));

	*s = 0;

	// s + 1 now holds the character that originally terminated the value
	return s + 1;
}

PUGI_IMPL_FN strconv_attribute_t get_strconv_attribute(unsigned int optmask)
{
PUGI_IMPL_STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_wconv_attribute == 0x40 && parse_wnorm_attribute == 0x80);
Expand Down Expand Up @@ -3442,6 +3520,18 @@ PUGI_IMPL_NS_BEGIN
// everything else will be detected
if (PUGI_IMPL_IS_CHARTYPE(*s, ct_start_symbol)) PUGI_IMPL_THROW_ERROR(status_bad_attribute, s);
}
else if (PUGI_IMPL_OPTSET(parse_unquoted_attributes))
{
// Handle unquoted attribute values (HTML-style)
a->value = s; // Save the offset.

s = parse_unquoted_attribute(s, strconv_attribute);

if (!s) PUGI_IMPL_THROW_ERROR(status_bad_attribute, a->value);

// Check for invalid characters after unquoted value
if (PUGI_IMPL_IS_CHARTYPE(*s, ct_start_symbol)) PUGI_IMPL_THROW_ERROR(status_bad_attribute, s);
}
else PUGI_IMPL_THROW_ERROR(status_bad_attribute, s);
}
else // '<... #="..." ...>
Expand Down
3 changes: 3 additions & 0 deletions src/pugixml/pugixml.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,9 @@ namespace pugi
// This flag is off by default.
const unsigned int parse_merge_pcdata = 0x4000;

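// This flag determines if unquoted attribute values (HTML-style, e.g. <a href=foobar>) are accepted in addition to quoted ones.
// An unquoted value ends at the first whitespace, quote, '=', '<', '>' or '/' character. This flag is off by default.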
const unsigned int parse_unquoted_attributes = 0x8000;

// The default parsing mode.
// Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded,
// End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.
Expand Down
123 changes: 123 additions & 0 deletions tests/client/unquoted_attributes_tests.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#define CATCH_CONFIG_MAIN
#include "../catch2/catch_amalgamated.hpp"
#include <pugixml/pugixml.hpp>
#include <string>

using namespace std;

TEST_CASE("pugixml unquoted attributes parsing", "[HTML][Parsing]")
{
  // Parse options shared by every section that expects unquoted values to be accepted.
  const unsigned int unquoted_flags = pugi::parse_default | pugi::parse_unquoted_attributes;

  SECTION("Basic unquoted attribute")
  {
    pugi::xml_document doc;
    pugi::xml_parse_result result = doc.load_string("<a href=foobar></a>", unquoted_flags);
    REQUIRE(result);

    pugi::xml_node anchor = doc.child("a");
    REQUIRE(std::string(anchor.attribute("href").value()) == "foobar");
  }

  SECTION("Multiple unquoted attributes")
  {
    pugi::xml_document doc;
    pugi::xml_parse_result result = doc.load_string("<div class=container id=main></div>", unquoted_flags);
    REQUIRE(result);

    pugi::xml_node div = doc.child("div");
    REQUIRE(std::string(div.attribute("class").value()) == "container");
    REQUIRE(std::string(div.attribute("id").value()) == "main");
  }

  SECTION("Mixed quoted and unquoted attributes")
  {
    pugi::xml_document doc;
    pugi::xml_parse_result result =
      doc.load_string("<a href=foobar title=\"quoted title\" class='single-quoted'></a>", unquoted_flags);
    REQUIRE(result);

    pugi::xml_node anchor = doc.child("a");
    REQUIRE(std::string(anchor.attribute("href").value()) == "foobar");
    REQUIRE(std::string(anchor.attribute("title").value()) == "quoted title");
    REQUIRE(std::string(anchor.attribute("class").value()) == "single-quoted");
  }

  SECTION("Self-closing tag with unquoted attributes")
  {
    pugi::xml_document doc;
    pugi::xml_parse_result result = doc.load_string("<input type=text />", unquoted_flags);
    REQUIRE(result);

    pugi::xml_node input = doc.child("input");
    REQUIRE(std::string(input.attribute("type").value()) == "text");
  }

  // Disabled: valueless (boolean) attributes such as `checked` are not yet supported
  // together with unquoted attribute parsing. Re-enable once they are.
  /*
  SECTION("Boolean attributes remain unchanged")
  {
    pugi::xml_document doc;
    pugi::xml_parse_result result = doc.load_string("<input type=checkbox checked>", unquoted_flags);
    REQUIRE(result);

    pugi::xml_node input = doc.child("input");
    REQUIRE(std::string(input.attribute("type").value()) == "checkbox");
    REQUIRE(std::string(input.attribute("checked").value()) == "checked");
  }
  */

  SECTION("Already quoted attributes remain unchanged")
  {
    pugi::xml_document doc;
    pugi::xml_parse_result result =
      doc.load_string("<img src=\"image.jpg\" alt=\"test image\">", unquoted_flags);
    REQUIRE(result);

    pugi::xml_node img = doc.child("img");
    REQUIRE(std::string(img.attribute("src").value()) == "image.jpg");
    REQUIRE(std::string(img.attribute("alt").value()) == "test image");
  }

  SECTION("Without unquoted attributes flag should fail")
  {
    // Same input as the basic case, but parsed with the default options only.
    pugi::xml_document doc;
    pugi::xml_parse_result result = doc.load_string("<a href=foobar></a>", pugi::parse_default);

    REQUIRE(!result);
    REQUIRE(result.status == pugi::status_bad_attribute);
  }

  SECTION("Unquoted attribute with special characters")
  {
    pugi::xml_document doc;
    pugi::xml_parse_result result = doc.load_string("<a href=foo-bar_baz.html></a>", unquoted_flags);
    REQUIRE(result);

    pugi::xml_node anchor = doc.child("a");
    REQUIRE(std::string(anchor.attribute("href").value()) == "foo-bar_baz.html");
  }

  SECTION("Empty string")
  {
    pugi::xml_document doc;
    pugi::xml_parse_result result = doc.load_string("", unquoted_flags);

    // An empty input has no document element, so loading must report a failure.
    REQUIRE(!result);
    REQUIRE(result.status == pugi::status_no_document_element);
  }

  SECTION("No attributes")
  {
    pugi::xml_document doc;
    pugi::xml_parse_result result = doc.load_string("<div>content</div>", unquoted_flags);
    REQUIRE(result);

    pugi::xml_node div = doc.child("div");
    REQUIRE(std::string(div.text().get()) == "content");
  }
}