(svn r9012) -Fix/Feature (UTF8): When cutting strings into multiple lines, also take into consideration whitespace characters that are more than 1 byte long (e.g. IDEOGRAPHIC SPACE; see the IsWhitespace() function). When trimming such strings, account for multi-byte sequences by using *Utf8PrevChar(v) = '\0'.
-Codechange: Add a function Utf8TrimString() that properly trims a string at a UTF8 encoding boundary instead of somewhere in the wild (and use it in the chat area)
This commit is contained in:
33
src/string.h
33
src/string.h
@@ -74,6 +74,7 @@ bool IsValidChar(WChar key, CharSetFilter afilter);
|
||||
|
||||
size_t Utf8Decode(WChar *c, const char *s);
|
||||
size_t Utf8Encode(char *buf, WChar c);
|
||||
/* Trim the string 's' so that it ends on a UTF-8 encoding boundary instead of
 * in the middle of a multi-byte sequence (per the commit description).
 * NOTE(review): the definition is not visible here; the exact meaning of
 * 'maxlen' and of the returned size_t should be confirmed against it. */
size_t Utf8TrimString(char *s, size_t maxlen);
|
||||
|
||||
|
||||
static inline WChar Utf8Consume(const char **s)
|
||||
@@ -100,6 +101,23 @@ static inline size_t Utf8CharLen(WChar c)
|
||||
}
|
||||
|
||||
|
||||
/**
 * Return the length in bytes of a UTF-8 encoded character, judged solely from
 * its first byte.
 *
 * Recognised lead-byte patterns:
 *   0xxxxxxx -> 1 byte
 *   110xxxxx -> 2 bytes
 *   1110xxxx -> 3 bytes
 *   11110xxx -> 4 bytes
 *
 * @param c the first byte of the UTF-8 sequence
 * @return the sequence length (1 to 4), or 0 when 'c' does not match any of
 *         the lead-byte patterns above (e.g. a continuation byte 10xxxxxx)
 */
static inline size_t Utf8EncodedCharLen(char c)
{
	/* Plain 'char' may be signed; look at the raw bit pattern through an
	 * unsigned byte so the checks never depend on how a negative value
	 * behaves when shifted or promoted. */
	const unsigned char lead = (unsigned char)c;

	if ((lead & 0xF8) == 0xF0) return 4;
	if ((lead & 0xF0) == 0xE0) return 3;
	if ((lead & 0xE0) == 0xC0) return 2;
	if ((lead & 0x80) == 0x00) return 1;

	/* Invalid UTF8 start encoding */
	return 0;
}
|
||||
|
||||
|
||||
/* Check if the given character is part of a UTF8 sequence */
|
||||
static inline bool IsUtf8Part(char c)
|
||||
{
|
||||
@@ -129,5 +147,20 @@ static inline bool IsPrintable(WChar c)
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check whether UNICODE character is whitespace or not
|
||||
* @param c UNICODE character to check
|
||||
* @return a boolean value whether 'c' is a whitespace character or not
|
||||
* @see http://www.fileformat.info/info/unicode/category/Zs/list.htm
|
||||
*/
|
||||
static inline bool IsWhitespace(WChar c)
|
||||
{
|
||||
return
|
||||
c == 0x0020 /* SPACE */ ||
|
||||
c == 0x00A0 /* NO-BREAK SPACE */ ||
|
||||
c == 0x3000 /* IDEOGRAPHIC SPACE */
|
||||
;
|
||||
}
|
||||
|
||||
|
||||
#endif /* STRING_H */
|
||||
|
Reference in New Issue
Block a user