@@ -1855,25 +1855,56 @@ utf8_to_uv(p, lenp)
1855
1855
char * p ;
1856
1856
long * lenp ;
1857
1857
{
1858
- int c = (* p ++ )& 0xff ;
1859
- unsigned long uv ;
1860
- long n = 1 ;
1861
-
1862
- if (c < 0xc0 ) n = 1 ;
1863
- else if (c < 0xe0 ) n = 2 ;
1864
- else if (c < 0xf0 ) n = 3 ;
1865
- else if (c < 0xf8 ) n = 4 ;
1866
- else if (c < 0xfc ) n = 5 ;
1867
- else if (c < 0xfe ) n = 6 ;
1868
- else if (c == 0xfe ) n = 7 ;
1869
- if (n > * lenp ) return 0 ;
1858
+ int c = * p ++ & 0xff ;
1859
+ unsigned long uv = c ;
1860
+ long n ;
1861
+
1862
+ if (!(uv & 0x80 )) {
1863
+ * lenp = 1 ;
1864
+ return uv ;
1865
+ }
1866
+ if (!(uv & 0x40 )) {
1867
+ rb_warning ("malformed UTF-8 character" );
1868
+ * lenp = 1 ;
1869
+ return uv ;
1870
+ }
1871
+
1872
+ if (!(uv & 0x20 )) { n = 2 ; uv &= 0x1f ; }
1873
+ else if (!(uv & 0x10 )) { n = 3 ; uv &= 0x0f ; }
1874
+ else if (!(uv & 0x08 )) { n = 4 ; uv &= 0x07 ; }
1875
+ else if (!(uv & 0x04 )) { n = 5 ; uv &= 0x03 ; }
1876
+ else if (!(uv & 0x02 )) { n = 6 ; uv &= 0x01 ; }
1877
+ else if (!(uv & 0x01 )) { n = 7 ; uv = 0 ; }
1878
+ else { n = 13 ; uv = 0 ; }
1879
+ if (n > * lenp ) {
1880
+ rb_warning ("malformed UTF-8 character (expected %d bytes, given %d bytes)" ,
1881
+ n , * lenp );
1882
+ return 0xfffd ;
1883
+ }
1870
1884
* lenp = n -- ;
1871
1885
1872
- uv = c ;
1873
1886
if (n != 0 ) {
1874
- uv &= (1 <<(BYTEWIDTH - 2 - n )) - 1 ;
1875
1887
while (n -- ) {
1876
- uv = uv << 6 | (* p ++ & ((1 <<6 )- 1 ));
1888
+ c = * p ++ & 0xff ;
1889
+ if ((c & 0xc0 ) != 0x80 ) {
1890
+ rb_warning ("malformed UTF-8 character" );
1891
+ * lenp -= n + 1 ;
1892
+ return 0xfffd ;
1893
+ }
1894
+ else {
1895
+ c &= 0x3f ;
1896
+ if (uv == 0 && c == 0 ) {
1897
+ int i ;
1898
+
1899
+ for (i = 0 ; n - i > 0 && (p [i ] & 0x3f ) == 0 ; i ++ )
1900
+ ;
1901
+ rb_warning ("redundant UTF-8 sequence (skip %d bytes)" , i + 1 );
1902
+ n -= i ;
1903
+ p += i ;
1904
+ continue ;
1905
+ }
1906
+ uv = uv << 6 | c ;
1907
+ }
1877
1908
}
1878
1909
}
1879
1910
return uv ;
0 commit comments