From 360519fd1d366c4f772463ad543fce50cf46a0d3 Mon Sep 17 00:00:00 2001
From: Marko Kreen <markokr@gmail.com>
Date: Sun, 15 Jun 2014 00:25:46 +0300
Subject: [PATCH] utf8: validator

For sanity-checking UTF8 without decoding.
---
 test/test_utf8.c | 69 +++++++++++++++++++++++++++++++++++++++++
 usual/utf8.c     | 80 ++++++++++++++++++++++++++++++++++++++++++++++++
 usual/utf8.h     |  5 +++
 3 files changed, 154 insertions(+)

diff --git a/test/test_utf8.c b/test/test_utf8.c
index 63eb035..e40321b 100644
--- a/test/test_utf8.c
+++ b/test/test_utf8.c
@@ -34,6 +34,24 @@ static int uget4(int a, int b, int c, int d)
 	return utf8_get_char(&p, buf + 4);
 }
 
+/* Encode c as exactly n bytes of UTF-8 framing in a static buffer; no validity checks. */
+static const char *mkseq(uint32_t c, int n)
+{
+	static const uint8_t lead[] = { 0, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
+	static char out[8];
+	int pos;
+	/* tail bytes take 6 payload bits each, filled from the last byte backwards */
+	for (pos = n - 1; pos > 0; pos--, c >>= 6)
+		out[pos] = 0x80 | (c & 0x3F);
+	out[0] = lead[n - 1] | c;	/* leftover high bits share the lead byte with the length marker */
+	return out;
+}
+
+static int readseq(uint32_t c, int n)
+{
+	const char *seq = mkseq(c, n);	/* n-byte encoding of c, held in mkseq()'s static buffer */
+	return utf8_get_char(&seq, &seq[n]);	/* decode it again; limit is one past the n bytes */
+}
 
 static void test_utf8_char_size(void *p)
 {
@@ -87,6 +105,26 @@ static void test_utf8_get_char(void *p)
 	int_check(uget1(0xC2), -0xC2);
 	int_check(uget2(0xE2, 0x82), -0xE2);
 	int_check(uget3(0xF0, 0xA4, 0xAD), -0xF0);
+
+	/* good boundaries */
+	int_check(readseq(0x7f, 1), 0x7f);
+	int_check(readseq(0x80, 2), 0x80);
+	int_check(readseq(0x7ff, 2), 0x7ff);
+	int_check(readseq(0x800, 3), 0x800);
+	int_check(readseq(0xffff, 3), 0xffff);
+	int_check(readseq(0x10000, 4), 0x10000);
+	int_check(readseq(0x10ffff, 4), 0x10ffff);
+	int_check(readseq(0xd7ff, 3), 0xd7ff);
+	int_check(readseq(0xe000, 3), 0xe000);
+
+	/* bad boundaries */
+	int_check(readseq(0x7f, 2), -193);
+	int_check(readseq(0x7ff, 3), -224);
+	int_check(readseq(0xffff, 4), -240);
+	int_check(readseq(0x110000, 4), -244);
+	int_check(readseq(0x10ffff, 5), -248);
+	int_check(readseq(0xd800, 3), -237);
+	int_check(readseq(0xdfff, 3), -237);
 end:;
 }
 
@@ -140,6 +178,36 @@ static void test_utf8_put_char(void *p)
 end:;
 }
 
+static int validseq(uint32_t c, int n)
+{
+	const char *seq = mkseq(c, n);	/* n-byte encoding of c, held in mkseq()'s static buffer */
+	return utf8_validate_seq(seq, &seq[n]);	/* limit is one past the n encoded bytes */
+}
+
+static void test_utf8_validate_seq(void *p)	/* boundary cases for utf8_validate_seq(); p is not used here */
+{
+	/* good boundaries: smallest and largest value per length; expected result is the length */
+	int_check(validseq(0x7f, 1), 1);
+	int_check(validseq(0x80, 2), 2);
+	int_check(validseq(0x7ff, 2), 2);
+	int_check(validseq(0x800, 3), 3);
+	int_check(validseq(0xffff, 3), 3);
+	int_check(validseq(0x10000, 4), 4);
+	int_check(validseq(0x10ffff, 4), 4);
+	int_check(validseq(0xd7ff, 3), 3);	/* just below the surrogate range */
+	int_check(validseq(0xe000, 3), 3);	/* just above the surrogate range */
+
+	/* bad boundaries: every one of these must be rejected with 0 */
+	int_check(validseq(0x7f, 2), 0);	/* overlong: value fits in 1 byte */
+	int_check(validseq(0x7ff, 3), 0);	/* overlong: value fits in 2 bytes */
+	int_check(validseq(0xffff, 4), 0);	/* overlong: value fits in 3 bytes */
+	int_check(validseq(0x110000, 4), 0);	/* one past the 0x10ffff maximum */
+	int_check(validseq(0x10ffff, 5), 0);	/* 5-byte lead (0xF8) is never accepted */
+	int_check(validseq(0xd800, 3), 0);	/* first surrogate */
+	int_check(validseq(0xdfff, 3), 0);	/* last surrogate */
+end:;	/* NOTE(review): presumably the failure target of int_check (framework macro not shown) */
+}
+
 /*
  * Describe
  */
@@ -149,6 +217,7 @@ struct testcase_t utf8_tests[] = {
 	{ "utf8_seq_size", test_utf8_seq_size },
 	{ "utf8_get_char", test_utf8_get_char },
 	{ "utf8_put_char", test_utf8_put_char },
+	{ "utf8_validate_seq", test_utf8_validate_seq },
 	END_OF_TESTCASES
 };
 
diff --git a/usual/utf8.c b/usual/utf8.c
index 8902c15..24f965c 100644
--- a/usual/utf8.c
+++ b/usual/utf8.c
@@ -17,6 +17,7 @@
  */
 
 #include <usual/utf8.h>
+#include <usual/err.h>
 
 #define u8head(c, mask)	(((c) & (mask | (mask >> 1))) == mask)
 #define u8tail(c)	u8head(c, 0x80)
@@ -138,3 +139,82 @@ int utf8_seq_size(unsigned char b)
 	return 0;
 }
 
+/*
+ * Check one UTF-8 sequence at src without decoding; never reads at or past
+ * srcend.  Returns its length (1..4), or 0 for an empty range, NUL byte,
+ * stray tail byte, truncated or overlong sequence, surrogate (d800..dfff)
+ * or value above 10ffff.  Boundary encodings, (+1) = overlong, rejected:
+ *     7f: c1bf (+1)
+ *     80: c280
+ *    7ff: dfbf
+ *    7ff: e09fbf (+1)
+ *    800: e0a080
+ *   ffff: efbfbf
+ *   ffff: f08fbfbf (+1)
+ *  10000: f0908080
+ * 10ffff: f48fbfbf
+ */
+int utf8_validate_seq(const char *src, const char *srcend)
+{
+	const unsigned char *u = (const unsigned char *)src;
+	const unsigned char *uend = (const unsigned char *)srcend;
+
+	if (u >= uend) /* empty range: there is no lead byte to read */
+		goto invalid;
+
+	if (u[0] < 0x80) { /* ascii */
+		if (u[0] == 0)
+			goto invalid;
+		return 1;
+	} else if (u[0] < 0xC2) { /* tail byte as first byte, or overlong lead C0/C1 */
+		goto invalid;
+	} else if (u[0] < 0xE0) { /* 1 tail byte */
+		if (uend - u < 2) /* compare distances; u + 2 could point past the buffer */
+			goto invalid;
+		if ((u[1] & 0xC0) != 0x80)
+			goto invalid;
+		return 2;
+	} else if (u[0] < 0xF0) { /* 2 tail bytes */
+		if (uend - u < 3)
+			goto invalid;
+		if (u[0] == 0xE0 && u[1] < 0xA0) /* overlong, below 800 */
+			goto invalid;
+		if (u[0] == 0xED && u[1] >= 0xA0) /* surrogate d800..dfff */
+			goto invalid;
+		if ((u[1] & 0xC0) != 0x80 || (u[2] & 0xC0) != 0x80)
+			goto invalid;
+		return 3;
+	} else if (u[0] < 0xF5) { /* 3 tail bytes */
+		if (uend - u < 4)
+			goto invalid;
+		if (u[0] == 0xF0 && u[1] < 0x90) /* overlong, below 10000 */
+			goto invalid;
+		if (u[0] == 0xF4 && u[1] > 0x8F) /* above 10ffff */
+			goto invalid;
+		if ((u[1] & 0xC0) != 0x80 || (u[2] & 0xC0) != 0x80 ||
+		    (u[3] & 0xC0) != 0x80)
+			goto invalid;
+		return 4;
+	}
+invalid:
+	return 0;
+}
+
+/* True when [src, end) holds only valid UTF-8 sequences and no NUL byte. */
+bool utf8_validate_string(const char *src, const char *end)
+{
+	const char *pos;
+	int len;
+
+	for (pos = src; pos < end; pos += len) {
+		len = 1;	/* plain ASCII advances one byte */
+		if (*pos == '\0')
+			return false;
+		if (*pos & 0x80)	/* high bit set: let the sequence validator report the length */
+			len = utf8_validate_seq(pos, end);
+		if (len == 0)
+			return false;
+	}
+	return true;
+}
+
diff --git a/usual/utf8.h b/usual/utf8.h
index 1aedfb4..bbf4380 100644
--- a/usual/utf8.h
+++ b/usual/utf8.h
@@ -53,5 +53,10 @@ int utf8_char_size(unsigned int c);
 /** Return UTF8 seq length based on first byte */
 int utf8_seq_size(unsigned char c);
 
+/** Return sequence length if all bytes are valid, 0 otherwise (also 0 when cut short by srcend). */
+int utf8_validate_seq(const char *src, const char *srcend);
+
+/** Return true if [src, end) consists only of valid UTF-8 sequences and contains no NUL byte. */
+bool utf8_validate_string(const char *src, const char *end);
 #endif
 
-- 
2.39.5