Update UTF-8 base code
Since RFC 3629 (2003), UTF-8 encodes characters in one to four bytes. Because 0 is a valid code point, the decode function utf8_ord() should return -1, not 0, on error. As a consequence, utf8_width() should return 0 for a continuation byte (as it did previously).

Signed-off-by: Lukas Fleischer <lfleischer@calcurse.org>
This commit is contained in:
parent
edc44d613b
commit
95c5d576fa
@ -225,13 +225,10 @@
|
|||||||
#define TOSTRING(x) STRINGIFY(x)
|
#define TOSTRING(x) STRINGIFY(x)
|
||||||
#define __FILE_POS__ __FILE__ ":" TOSTRING(__LINE__)
|
#define __FILE_POS__ __FILE__ ":" TOSTRING(__LINE__)
|
||||||
|
|
||||||
#define UTF8_MAXLEN 6
|
#define UTF8_MAXLEN 4
|
||||||
#define UTF8_LENGTH(ch) ((unsigned char)ch >= 0xFC ? 6 : \
|
#define UTF8_LENGTH(ch) ((unsigned char)ch >= 0xF0 ? 4 : \
|
||||||
((unsigned char)ch >= 0xF8 ? 5 : \
|
|
||||||
((unsigned char)ch >= 0xF0 ? 4 : \
|
|
||||||
((unsigned char)ch >= 0xE0 ? 3 : \
|
((unsigned char)ch >= 0xE0 ? 3 : \
|
||||||
((unsigned char)ch >= 0xC0 ? 2 : 1)))))
|
((unsigned char)ch >= 0xC0 ? 2 : 1)))
|
||||||
#define UTF8_ISMULTI(ch) ((unsigned char)ch >= 0x80)
|
|
||||||
#define UTF8_ISCONT(ch) ((unsigned char)ch >= 0x80 && \
|
#define UTF8_ISCONT(ch) ((unsigned char)ch >= 0x80 && \
|
||||||
(unsigned char)ch <= 0xBF)
|
(unsigned char)ch <= 0xBF)
|
||||||
|
|
||||||
|
23
src/utf8.c
23
src/utf8.c
@ -269,11 +269,11 @@ static const struct utf8_range utf8_widthtab[] = {
|
|||||||
{0xe0100, 0xe01ef, 0}
|
{0xe0100, 0xe01ef, 0}
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Decode a UTF-8 code point. */
|
/* Decode a UTF-8 encoded character. Return the Unicode code point. */
|
||||||
int utf8_ord(const char *s)
|
int utf8_ord(const char *s)
|
||||||
{
|
{
|
||||||
if (UTF8_ISCONT(*s))
|
if (UTF8_ISCONT(*s))
|
||||||
return 0;
|
return -1;
|
||||||
|
|
||||||
switch (UTF8_LENGTH(*s)) {
|
switch (UTF8_LENGTH(*s)) {
|
||||||
case 1:
|
case 1:
|
||||||
@ -285,17 +285,9 @@ int utf8_ord(const char *s)
|
|||||||
(s[0] & 0x0f) << 12;
|
(s[0] & 0x0f) << 12;
|
||||||
case 4:
|
case 4:
|
||||||
return (((s[3] & 0x3f) | (s[2] & 0x3f) << 6) |
|
return (((s[3] & 0x3f) | (s[2] & 0x3f) << 6) |
|
||||||
(s[1] & 0x3f) << 12) | (s[0] & 0x3f) << 18;
|
(s[1] & 0x3f) << 12) | (s[0] & 0x7) << 18;
|
||||||
case 5:
|
|
||||||
return ((((s[4] & 0x3f) | (s[3] & 0x3f) << 6) |
|
|
||||||
(s[2] & 0x3f) << 12) | (s[1] & 0x3f) << 18) |
|
|
||||||
(s[0] & 0x3f) << 24;
|
|
||||||
case 6:
|
|
||||||
return (((((s[5] & 0x3f) | (s[4] & 0x3f) << 6) |
|
|
||||||
(s[3] & 0x3f) << 12) | (s[2] & 0x3f) << 18) |
|
|
||||||
(s[1] & 0x3f) << 24) | (s[0] & 0x3f) << 30;
|
|
||||||
default:
|
default:
|
||||||
return 0;
|
return -1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -304,6 +296,8 @@ int utf8_width(char *s)
|
|||||||
{
|
{
|
||||||
int val, low, high, cur;
|
int val, low, high, cur;
|
||||||
|
|
||||||
|
if (UTF8_ISCONT(*s))
|
||||||
|
return 0;
|
||||||
val = utf8_ord(s);
|
val = utf8_ord(s);
|
||||||
low = 0;
|
low = 0;
|
||||||
high = ARRAY_SIZE(utf8_widthtab);
|
high = ARRAY_SIZE(utf8_widthtab);
|
||||||
@ -328,11 +322,8 @@ int utf8_strwidth(char *s)
|
|||||||
{
|
{
|
||||||
int width = 0;
|
int width = 0;
|
||||||
|
|
||||||
for (; s && *s; s++) {
|
for (; *s; s++)
|
||||||
if (!UTF8_ISCONT(*s))
|
|
||||||
width += utf8_width(s);
|
width += utf8_width(s);
|
||||||
}
|
|
||||||
|
|
||||||
return width;
|
return width;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user