Commit 2fc7ef84 authored by Chris Müller's avatar Chris Müller

lib: add low-level utf8 routines.

parent f5e47cae
set(SOURCES set(SOURCES
unittest.c unittest.c
utf8.c
structures/single_linked_list.c structures/single_linked_list.c
structures/list.c structures/list.c
structures/red_black_tree.c structures/red_black_tree.c
...@@ -12,6 +13,7 @@ set(HEADER ...@@ -12,6 +13,7 @@ set(HEADER
standard.h standard.h
unix_colors.h unix_colors.h
unittest.h unittest.h
utf8.h
structures/structures.h structures/structures.h
structures/array.h structures/array.h
structures/stack.h structures/stack.h
......
...@@ -42,6 +42,9 @@ ...@@ -42,6 +42,9 @@
/** General error code indicating all is fine */ /** General error code indicating all is fine */
#define CRY_OKAY 0 #define CRY_OKAY 0
/** General error code indicating that a failure occurred */
/* Parenthesized so the negative literal stays one operand wherever the macro expands. */
#define CRY_FAIL (-1)
/** typedef to hide void* pointers for clearer interfaces */ /** typedef to hide void* pointers for clearer interfaces */
typedef void* pointer; typedef void* pointer;
...@@ -59,6 +62,12 @@ crydefine__tuple(long, tuple_long); ...@@ -59,6 +62,12 @@ crydefine__tuple(long, tuple_long);
crydefine__tuple(double, tuple_double); crydefine__tuple(double, tuple_double);
crydefine__tuple(float, tuple_float); crydefine__tuple(float, tuple_float);
/** Short alias for uint8_t; the utf8 routines added in this change take their strings as `const byte*` */
typedef uint8_t byte;
/** Holds one Unicode code point as an unsigned 32-bit value */
typedef uint32_t unicode;
/** /**
* Crystal's malloc macro for allocating memory based on a type * Crystal's malloc macro for allocating memory based on a type
* @param Type Set directly the type for this memory block (dont use sizeof) * @param Type Set directly the type for this memory block (dont use sizeof)
......
...@@ -16,6 +16,320 @@ ...@@ -16,6 +16,320 @@
* *
*/ */
#include <assert.h>
#include "utf8.h" #include "utf8.h"
#include "icu.h"
/**
 * Number of trail bytes that follow a given UTF-8 lead byte.
 *
 * Yields 0 for bytes below 0xc0, 1 for 0xc0..0xdf, 2 for 0xe0..0xef and
 * 3 for everything from 0xf0 upwards. No validation is performed; the
 * highest lead byte of well-formed UTF-8 is 0xf4 (U+10FFFF).
 * The argument may be evaluated more than once, so it must be free of
 * side effects.
 *
 * Internal helper: not meant for external clients, but other macros in
 * this file build on it, so its result must stay stable.
 *
 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
 * @internal
 */
#define UTF8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \
    ((leadByte) < 0xc0 ? 0 : (leadByte) < 0xe0 ? 1 : (leadByte) < 0xf0 ? 2 : 3)
/**
 * Tells whether a code unit (byte) is a complete code point on its own,
 * i.e. lies in the US-ASCII range 0..0x7f (top bit clear).
 * @param c 8-bit code unit (byte)
 * @return non-zero if c is a single-byte code point, 0 otherwise
 * @internal
 */
#define UTF8_IS_SINGLE(c) ((uint8_t)(c) < 0x80)
/**
 * Is this code unit (byte) a UTF-8 lead byte?
 *
 * Only the lead bytes that can start a well-formed sequence are accepted:
 * 0xc2..0xf4. The bytes 0xc0 and 0xc1 (which could only start overlong
 * 2-byte forms) and 0xf5..0xff (which would encode values above U+10FFFF)
 * are rejected.
 *
 * @param c 8-bit code unit (byte)
 * @return non-zero if c is a lead byte, 0 otherwise
 * @internal
 */
#define UTF8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<=0x32)
/**
 * Tells whether a code unit (byte) is a UTF-8 trail (continuation) byte,
 * i.e. its two most significant bits are `10` (0x80..0xbf).
 * @param c 8-bit code unit (byte)
 * @return non-zero if c is a trail byte, 0 otherwise
 * @internal
 */
#define UTF8_IS_TRAIL(c) (((uint8_t)(c) >> 6) == 2)
/**
 * Advance the string pointer from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8: the step width is derived
 * from the lead byte alone, the trail bytes are not inspected.
 *
 * Expands to a single statement (do/while(0)), so it can be used as the
 * body of an unbraced if/else or loop; terminate it with a semicolon.
 * The argument is evaluated more than once.
 *
 * @param s const uint8_t * string (modifiable lvalue, updated in place)
 * @internal
 */
#define UTF8_FWD_1_UNSAFE(s) do { \
    (s) += 1 + UTF8_COUNT_TRAIL_BYTES_UNSAFE(*(s)); \
} while(0)
/**
 * Move the string pointer from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input pointer may point at the end of the string (the terminator).
 * "Unsafe" macro, assumes well-formed UTF-8: it keeps stepping back while
 * it sees trail bytes and performs no lower-bound check, so the caller
 * must guarantee that a code point starts somewhere before s.
 *
 * Expands to a single statement (do/while(0)), so it can be used as the
 * body of an unbraced if/else or loop; terminate it with a semicolon.
 *
 * @param s const uint8_t * string (modifiable lvalue, updated in place)
 * @internal
 */
#define UTF8_BACK_1_UNSAFE(s) do { \
    while(UTF8_IS_TRAIL(*--(s))) {} \
} while(0)
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The offset may point to the lead byte of a multi-byte sequence,
 * in which case the macro will read the whole sequence.
 * The result is undefined if the offset points to a trail byte
 * or an illegal UTF-8 sequence.
 *
 * Expands to a single statement (do/while(0)), so it can be used as the
 * body of an unbraced if/else or loop; terminate it with a semicolon.
 * All arguments are evaluated more than once.
 *
 * @param s const uint8_t * string
 * @param i string offset (modifiable integer lvalue, advanced in place)
 * @param c output variable receiving the code point (at least 21 bits wide)
 * @internal
 */
#define UTF8_NEXT_UNSAFE(s, i, c) do { \
    (c)=(uint8_t)(s)[(i)++]; \
    if((c)>=0x80) { \
        if((c)<0xe0) { \
            /* 2-byte sequence: 5 payload bits from the lead byte, 6 from the trail byte */ \
            (c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \
        } else if((c)<0xf0) { \
            /* 3-byte sequence: no need for (c&0xf), the upper bits are cut off by the uint16_t cast */ \
            (c)=(uint16_t)(((c)<<12)|(((s)[(i)]&0x3f)<<6)|((s)[(i)+1]&0x3f)); \
            (i)+=2; \
        } else { \
            /* 4-byte sequence: 3 payload bits from the lead byte, 6 from each trail byte */ \
            (c)=(((c)&7)<<18)|(((s)[(i)]&0x3f)<<12)|(((s)[(i)+1]&0x3f)<<6)|((s)[(i)+2]&0x3f); \
            (i)+=3; \
        } \
    } \
} while(0)
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a multi-byte sequence, then the macro will read
 * the whole sequence.
 * If the offset is behind a lead byte, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset is behind an illegal UTF-8 sequence.
 *
 * Relies only on macros defined in this file. Expands to a single
 * statement (do/while(0)); terminate it with a semicolon.
 * All arguments are evaluated more than once.
 *
 * @param s const uint8_t * string
 * @param i string offset (modifiable integer lvalue, moved back in place)
 * @param c output variable receiving the code point (at least 21 bits wide)
 * @internal
 */
#define UTF8_PREV_UNSAFE(s, i, c) do { \
    (c)=(uint8_t)(s)[--(i)]; \
    if(UTF8_IS_TRAIL(c)) { \
        uint8_t utf8_prev_b_, utf8_prev_count_=1, utf8_prev_shift_=6; \
\
        /* c is a trail byte */ \
        (c)&=0x3f; \
        for(;;) { \
            utf8_prev_b_=(uint8_t)(s)[--(i)]; \
            if(utf8_prev_b_>=0xc0) { \
                /* lead byte reached: keep its (6 - trail count) payload bits only */ \
                utf8_prev_b_&=(uint8_t)((1u<<(6-utf8_prev_count_))-1u); \
                (c)|=(uint32_t)utf8_prev_b_<<utf8_prev_shift_; \
                break; \
            } else { \
                /* another trail byte: contributes 6 more payload bits */ \
                (c)|=(uint32_t)(utf8_prev_b_&0x3f)<<utf8_prev_shift_; \
                ++utf8_prev_count_; \
                utf8_prev_shift_+=6; \
            } \
        } \
    } \
} while(0)
/**
 * Get the code point that starts at the given string pointer,
 * without changing the pointer.
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * s must point to a code point boundary (a single byte or a lead byte);
 * the macro decodes forward from s[0] using UTF8_NEXT_UNSAFE on a private
 * offset. The result is undefined if s points to a trail byte or to an
 * illegal UTF-8 byte sequence.
 * Iterating with UTF8_NEXT_UNSAFE directly is more efficient when the
 * position has to advance anyway.
 *
 * Expands to a single statement (do/while(0)); terminate it with a
 * semicolon.
 *
 * @param s const uint8_t * string, positioned at a code point boundary
 * @param c output variable receiving the code point (at least 21 bits wide)
 * @internal
 */
#define UTF8_GET_UNSAFE(s, c) do { \
    int32_t utf8_get_unsafe_index_=(int32_t) 0; \
    UTF8_NEXT_UNSAFE(s, utf8_get_unsafe_index_, c); \
} while(0)
/**
 * Locate the first occurrence of a code point in a UTF-8 string.
 *
 * The string is walked code point by code point up to (not including) the
 * terminating NUL, using the "unsafe" macros, so well-formed input is assumed.
 *
 * @param str        NUL-terminated UTF-8 string; must not be null and must
 *                   start on a code point boundary (checked by assert)
 * @param character  code point to look for
 * @return pointer to the first byte of the matching code point, or 0 if the
 *         code point does not occur before the terminator
 */
byte*
cry_utf8_strchr(const byte* str, unicode character)
{
    assert(str != 0 && (UTF8_IS_SINGLE(*str) || UTF8_IS_LEAD(*str)));

    for(const byte* cursor = str; *cursor != 0; ) {
        unicode decoded = 0;

        UTF8_GET_UNSAFE(cursor, decoded);
        if(decoded == character) {
            return cry_cast(byte*, cursor);
        }
        UTF8_FWD_1_UNSAFE(cursor);
    }

    return 0;
}
/**
 * Locate the last occurrence of a code point in a UTF-8 string.
 *
 * The string is scanned once from front to back while remembering the most
 * recent match. No byte before `str` is ever read; a backward walk from the
 * end would have to step below the start address to detect termination.
 * The terminating NUL takes part in the search, so looking for code point 0
 * yields a pointer to the terminator.
 * Well-formed UTF-8 is assumed (the "unsafe" macros are used).
 *
 * @param str        NUL-terminated UTF-8 string; must not be null and must
 *                   start on a code point boundary (checked by assert)
 * @param character  code point to look for
 * @return pointer to the first byte of the last matching code point, or 0
 *         if the code point does not occur
 */
byte*
cry_utf8_strrchr(const byte* str, unicode character)
{
    assert(str != 0 && (UTF8_IS_SINGLE(*str) || UTF8_IS_LEAD(*str)));

    const byte* last = 0;
    unicode ch = 0;

    for(;;) {
        UTF8_GET_UNSAFE(str, ch);
        if(ch == character) {
            last = str;
        }
        // the terminator itself has been compared above; stop after it
        if(*str == 0) {
            break;
        }
        UTF8_FWD_1_UNSAFE(str);
    }

    if(last == 0) {
        return 0;
    }
    return cry_cast(byte*, last);
}
byte*
cry_utf8_strstr(const byte* str1, const byte* str2)
{
assert(str1 != 0 && (UTF8_IS_SINGLE(*str1) || UTF8_IS_LEAD(*str1)));
return 0;
}
/**
 * Count the code points of a UTF-8 string (not its bytes).
 *
 * Steps from lead byte to lead byte until the terminating NUL is reached;
 * well-formed UTF-8 is assumed.
 *
 * @param str  NUL-terminated UTF-8 string; must not be null and must start
 *             on a code point boundary (checked by assert)
 * @return number of code points before the terminator
 */
size_t
cry_utf8_strlen(const byte* str)
{
    assert(str != 0 && (UTF8_IS_SINGLE(*str) || UTF8_IS_LEAD(*str)));

    size_t code_points = 0;
    for(const byte* cursor = str; *cursor != 0; ++code_points) {
        UTF8_FWD_1_UNSAFE(cursor);
    }

    return code_points;
}
/**
 * Decode the code point that starts at the given position.
 *
 * Well-formed UTF-8 is assumed; the position itself is not changed.
 *
 * @param str  pointer into a UTF-8 string; must not be null and must point
 *             at a code point boundary (checked by assert)
 * @return the decoded code point
 */
unicode
cry_utf8_get(const byte* str)
{
    assert(str != 0 && (UTF8_IS_SINGLE(*str) || UTF8_IS_LEAD(*str)));

    unicode decoded = 0;
    int32_t offset = 0;

    // decode forward from str[0]; the advanced offset is not needed
    UTF8_NEXT_UNSAFE(str, offset, decoded);

    return decoded;
}
/**
 * Length in bytes of the UTF-8 sequence that starts at the given position.
 *
 * The value is derived from the lead byte alone (lead byte plus its number
 * of trail bytes); the trail bytes themselves are not inspected.
 *
 * @param str  pointer into a UTF-8 string; must not be null and must point
 *             at a code point boundary (checked by assert)
 * @return 1 for a single byte, otherwise 2, 3 or 4 depending on the lead byte
 */
size_t
cry_utf8_codepoints(const byte* str)
{
    assert(str != 0 && (UTF8_IS_SINGLE(*str) || UTF8_IS_LEAD(*str)));

    const byte lead = *str;
    const size_t trail_count = UTF8_COUNT_TRAIL_BYTES_UNSAFE(lead);

    return trail_count + 1;
}
/**
 * Step to the code point following the one at the given position.
 *
 * Well-formed UTF-8 is assumed: the step width comes from the lead byte only.
 *
 * @param str  pointer into a NUL-terminated UTF-8 string; must not be null
 *             and must point at a code point boundary (checked by assert)
 * @return pointer to the start of the next code point (which may be the
 *         terminator), or 0 if str already points at the terminator
 */
byte*
cry_utf8_next(const byte* str)
{
    assert(str != 0 && (UTF8_IS_SINGLE(*str) || UTF8_IS_LEAD(*str)));

    const byte* following = str;
    if(*following != 0) {
        // skip the lead byte plus all of its trail bytes
        following += 1 + UTF8_COUNT_TRAIL_BYTES_UNSAFE(*following);
        return cry_cast(byte*, following);
    }

    return 0;
}
/**
 * Step to the code point preceding the given position.
 *
 * Moves back at least one byte and continues while trail bytes are seen.
 * There is no lower-bound check: the bytes before str are read, so str must
 * not be the first byte of the buffer. Well-formed UTF-8 is assumed.
 *
 * @param str  pointer into a UTF-8 string; must not be null and must point
 *             at a code point boundary (checked by assert)
 * @return pointer to the first byte of the preceding code point
 */
byte*
cry_utf8_prev(const byte* str)
{
    assert(str != 0 && (UTF8_IS_SINGLE(*str) || UTF8_IS_LEAD(*str)));

    const byte* cursor = str;
    do {
        --cursor;
    } while(UTF8_IS_TRAIL(*cursor));

    return cry_cast(byte*, cursor);
}
/**
 * Size of a UTF-8 string in bytes, excluding the terminating NUL.
 *
 * The string is traversed sequence by sequence (lead byte plus its trail
 * bytes as announced by the lead byte); well-formed UTF-8 is assumed.
 *
 * @param str  NUL-terminated UTF-8 string; must not be null and must start
 *             on a code point boundary (checked by assert)
 * @return number of bytes covered by the traversal up to the terminator
 */
size_t
cry_utf8_strsize(const byte* str)
{
    assert(str != 0 && (UTF8_IS_SINGLE(*str) || UTF8_IS_LEAD(*str)));

    const byte* cursor = str;
    while(*cursor != 0) {
        // jump over one complete sequence
        cursor += 1 + UTF8_COUNT_TRAIL_BYTES_UNSAFE(*cursor);
    }

    // the distance travelled equals the sum of all sequence lengths
    return (size_t)(cursor - str);
}
/**
 * Check that a NUL-terminated byte string is well-formed UTF-8.
 *
 * Every sequence is checked completely:
 *  - the lead byte must be 0x00..0x7f or 0xc2..0xf4,
 *  - it must be followed by exactly the announced number of trail bytes
 *    (a non-trail byte or the terminator inside a sequence is an error),
 *  - the first trail byte is range-restricted after 0xe0 / 0xf0 (overlong
 *    forms), 0xed (surrogates U+D800..U+DFFF) and 0xf4 (above U+10FFFF).
 *
 * This function is safe to call on arbitrary bytes; it never reads past the
 * terminating NUL.
 *
 * @param str  NUL-terminated byte string; must not be null (checked by assert)
 * @return CRY_OKAY if the whole string is well-formed UTF-8, CRY_FAIL otherwise
 */
int
cry_utf8_validate(const byte* str)
{
    assert(str != 0);

    while(*str != 0) {
        const byte lead = *str++;
        // permitted range of the first trail byte; narrowed for special leads
        byte first_min = 0x80;
        byte first_max = 0xbf;
        size_t trails = 0;

        if(lead < 0x80) {
            continue;   // US-ASCII, complete on its own
        }

        if(lead >= 0xc2 && lead <= 0xdf) {
            trails = 1;
        } else if(lead >= 0xe0 && lead <= 0xef) {
            trails = 2;
            if(lead == 0xe0) {
                first_min = 0xa0;   // below would be an overlong 3-byte form
            } else if(lead == 0xed) {
                first_max = 0x9f;   // above would encode a surrogate
            }
        } else if(lead >= 0xf0 && lead <= 0xf4) {
            trails = 3;
            if(lead == 0xf0) {
                first_min = 0x90;   // below would be an overlong 4-byte form
            } else if(lead == 0xf4) {
                first_max = 0x8f;   // above would exceed U+10FFFF
            }
        } else {
            // stray trail byte, 0xc0/0xc1, or 0xf5..0xff
            return CRY_FAIL;
        }

        // the terminator (0) is below first_min, so truncation is caught here
        if(*str < first_min || *str > first_max) {
            return CRY_FAIL;
        }
        ++str;

        for(--trails; trails > 0; --trails) {
            if(!UTF8_IS_TRAIL(*str)) {
                return CRY_FAIL;
            }
            ++str;
        }
    }

    return CRY_OKAY;
}
...@@ -18,6 +18,19 @@ ...@@ -18,6 +18,19 @@
#include "standard.h" #include "standard.h"
/* All routines below take NUL-terminated UTF-8 byte strings. Except for
 * cry_utf8_validate they assume well-formed input and assert that the
 * pointer is non-null and positioned on a code point boundary. */

/** Checks a byte string for UTF-8 validity; returns CRY_OKAY or CRY_FAIL. */
typedef uint8_t byte; int cry_utf8_validate(const byte* str);
/** Returns a pointer to the first occurrence of `character` in str, or 0 if there is none. */
byte* cry_utf8_strchr(const byte* str, unicode character);
/** Returns a pointer to the last occurrence of `character` in str, or 0 if there is none. */
byte* cry_utf8_strrchr(const byte* str, unicode character);
/** Presumably meant to locate str2 inside str1 (strstr-like) - TODO confirm against the implementation in utf8.c. */
byte* cry_utf8_strstr(const byte* str1, const byte* str2);
/** Returns the number of bytes in str before the terminating NUL. */
size_t cry_utf8_strsize(const byte* str);
/** Returns the number of code points in str before the terminating NUL. */
size_t cry_utf8_strlen(const byte* str);
/** Decodes and returns the code point that starts at str. */
unicode cry_utf8_get(const byte* str);
/** Returns the encoded length in bytes (1 + trail bytes of the lead byte) of the sequence starting at str. */
size_t cry_utf8_codepoints(const byte* str);
/** Returns a pointer to the code point after the one at str, or 0 if str points at the terminator. */
byte* cry_utf8_next(const byte* str);
/** Returns a pointer to the code point before str; reads bytes located before str without a lower-bound check. */
byte* cry_utf8_prev(const byte* str);
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment