/*
 * utf8: validator
 *
 * For sanity-checking UTF8 without decoding.
 *
 * Origin: commit 360519fd1d366c4f772463ad543fce50cf46a0d3
 * (Marko Kreen, 2014-06-15), which adds these two functions to
 * usual/utf8.c and declares them in usual/utf8.h.
 */

#include <stdbool.h>
#include <stddef.h>

/** Return sequence length if all bytes are valid, 0 otherwise. */
int utf8_validate_seq(const char *src, const char *srcend);

/** Return true if [src, end) consists only of valid, non-NUL UTF8 sequences. */
bool utf8_validate_string(const char *src, const char *end);

/* True if c is a UTF8 tail (continuation) byte: 10xxxxxx. */
static bool utf8_is_tail(unsigned char c)
{
	return (c & 0xC0) == 0x80;
}

/*
 * Boundaries of the shortest-form encodings (code point: bytes).
 * Entries marked (+1) are overlong forms - the same code point encoded
 * with one byte more than needed - and must be rejected:
 *
 *     7f: c1bf      (+1)
 *     80: c280
 *    7ff: dfbf
 *    7ff: e09fbf    (+1)
 *    800: e0a080
 *   ffff: efbfbf
 *   ffff: f08fbfbf  (+1)
 *  10000: f0908080
 * 10ffff: f48fbfbf
 *
 * Surrogates d800..dfff (eda080..edbfbf) and code points above 10ffff
 * (f4908080 and up, lead bytes f5..ff) are rejected as well.
 */

/*
 * Validate a single UTF8 sequence starting at src, without decoding it.
 *
 * src    - first byte of the sequence
 * srcend - one past the last readable byte; no byte at or after srcend
 *          is ever read
 *
 * Returns the sequence length (1..4) when the bytes form a well-formed,
 * shortest-form, non-surrogate encoding of a code point <= 0x10ffff.
 * Returns 0 when the range is empty, the sequence is truncated by srcend,
 * the sequence is malformed, or the first byte is NUL.
 */
int utf8_validate_seq(const char *src, const char *srcend)
{
	const unsigned char *u = (const unsigned char *)src;
	/* allowed range of the first tail byte; narrowed for some lead bytes */
	unsigned char lo = 0x80;
	unsigned char hi = 0xBF;
	size_t avail;
	size_t len;
	size_t i;

	/* nothing to read: must be checked before touching u[0] */
	if (src >= srcend)
		return 0;
	avail = (size_t)(srcend - src);

	if (u[0] < 0x80)		/* ascii, NUL is not accepted */
		return u[0] != 0 ? 1 : 0;

	if (u[0] < 0xC2) {		/* tail byte as first byte, or overlong c0/c1 */
		return 0;
	} else if (u[0] < 0xE0) {	/* 1 tail byte */
		len = 2;
	} else if (u[0] < 0xF0) {	/* 2 tail bytes */
		len = 3;
		if (u[0] == 0xE0)
			lo = 0xA0;	/* below e0a080 is overlong */
		else if (u[0] == 0xED)
			hi = 0x9F;	/* eda080..edbfbf are surrogates */
	} else if (u[0] < 0xF5) {	/* 3 tail bytes */
		len = 4;
		if (u[0] == 0xF0)
			lo = 0x90;	/* below f0908080 is overlong */
		else if (u[0] == 0xF4)
			hi = 0x8F;	/* above f48fbfbf is past 10ffff */
	} else {			/* f5..ff never start a sequence */
		return 0;
	}

	/* compare lengths instead of forming a pointer beyond srcend */
	if (avail < len)
		return 0;

	if (u[1] < lo || u[1] > hi)
		return 0;
	for (i = 2; i < len; i++) {
		if (!utf8_is_tail(u[i]))
			return 0;
	}
	return (int)len;
}

/*
 * Validate every sequence in [src, end).
 *
 * Returns true when the whole range is made of valid sequences and
 * contains no NUL byte; an empty range is valid.  Returns false at the
 * first invalid, truncated or NUL byte.
 */
bool utf8_validate_string(const char *src, const char *end)
{
	while (src < end) {
		int n;

		if ((unsigned char)*src >= 0x80) {
			/* multi-byte: let the sequence validator decide */
			n = utf8_validate_seq(src, end);
			if (n == 0)
				return false;
			src += n;
		} else if (*src == '\0') {
			return false;
		} else {
			/* plain ascii fast path */
			src++;
		}
	}
	return true;
}