Commit e891415e authored by Chris Müller's avatar Chris Müller
Browse files

add utf8 and unicode functions

parent a7eb642e
......@@ -15,14 +15,14 @@
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include <stdlib.h>
#include <stdint.h>
/* Status codes returned by functions that report success or failure. */
#define CHERRY_OK 0
/* Parenthesised so the negative literal cannot merge with neighbouring tokens. */
#define CHERRY_FAIL (-1)
/* Boolean constants. */
#define TRUE (1)
#define FALSE (0)
......@@ -30,6 +30,11 @@ typedef void* pointer;
/* Read-only counterpart of the generic `pointer` type. */
typedef const void* const_pointer;
/* One 8-bit unit; the cy_utf8_* functions take their strings as `const byte*`. */
typedef uint8_t byte;
/* 32-bit value holding a single code point (the type cy_utf8_get() returns). */
typedef uint32_t unicode;
/* Callback comparing two read-only elements and returning an int.
 * NOTE(review): the sign convention is not visible here - presumably strcmp-like; confirm. */
typedef int (*cy_ordering_funptr)(const_pointer a, const_pointer b);
/* Callback that receives a data pointer; presumably releases it - confirm against callers. */
typedef void (*cy_free_funptr)(pointer data);
/**
* Crystal's malloc macro for allocating memory based on a type
......
/*
* Cherry programming language
* Copyright (C) 2013 Christoph Mueller
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <assert.h>
#include <ctype.h>
#include "unicode.h"
/**
 * Tests whether a code point is a blank character, as classified by the
 * C library's isblank() in the current locale.
 *
 * The <ctype.h> functions have undefined behaviour for arguments that are
 * neither EOF nor representable as unsigned char, so code points above 0xFF
 * are rejected here instead of being forwarded.
 *
 * @param ch code point to classify
 * @return non-zero if ch is a blank character, 0 otherwise
 */
int
cy_unicode_isblank(unicode ch)
{
    if(ch > 0xFFu)
        return 0;
    return isblank((int) ch);
}
/**
 * Tests whether a code point is a white-space character, as classified by
 * the C library's isspace() in the current locale.
 *
 * Code points above 0xFF are rejected up front: passing them to a <ctype.h>
 * function would be undefined behaviour.
 *
 * @param ch code point to classify
 * @return non-zero if ch is white space, 0 otherwise
 */
int
cy_unicode_isspace(unicode ch)
{
    if(ch > 0xFFu)
        return 0;
    return isspace((int) ch);
}
/**
 * Tests whether a code point is alphabetic, as classified by the C library's
 * isalpha() in the current locale.
 *
 * Code points above 0xFF are rejected up front: passing them to a <ctype.h>
 * function would be undefined behaviour.
 *
 * @param ch code point to classify
 * @return non-zero if ch is alphabetic, 0 otherwise
 */
int
cy_unicode_isalpha(unicode ch)
{
    if(ch > 0xFFu)
        return 0;
    return isalpha((int) ch);
}
/**
 * Tests whether a code point is alphanumeric, as classified by the C
 * library's isalnum() in the current locale.
 *
 * Code points above 0xFF are rejected up front: passing them to a <ctype.h>
 * function would be undefined behaviour.
 *
 * @param ch code point to classify
 * @return non-zero if ch is a letter or digit, 0 otherwise
 */
int
cy_unicode_isalnum(unicode ch)
{
    if(ch > 0xFFu)
        return 0;
    return isalnum((int) ch);
}
/**
 * Tests whether a code point is a decimal digit, using the C library's
 * isdigit().
 *
 * Code points above 0xFF are rejected up front: passing them to a <ctype.h>
 * function would be undefined behaviour.
 *
 * @param ch code point to classify
 * @return non-zero if ch is a decimal digit, 0 otherwise
 */
int
cy_unicode_isdigit(unicode ch)
{
    if(ch > 0xFFu)
        return 0;
    return isdigit((int) ch);
}
/**
 * Tests whether a code point is a hexadecimal digit: '0'..'9', 'a'..'f'
 * or 'A'..'F'.
 *
 * @param ch code point to classify
 * @return 1 if ch is a hexadecimal digit, 0 otherwise
 */
int
cy_unicode_ishex(unicode ch)
{
    return ('0' <= ch && ch <= '9')
        || ('a' <= ch && ch <= 'f')
        || ('A' <= ch && ch <= 'F');
}
/**
 * Tests whether a code point is a control character, as classified by the
 * C library's iscntrl() in the current locale.
 *
 * Code points above 0xFF are rejected up front: passing them to a <ctype.h>
 * function would be undefined behaviour.
 *
 * @param ch code point to classify
 * @return non-zero if ch is a control character, 0 otherwise
 */
int
cy_unicode_iscntrl(unicode ch)
{
    if(ch > 0xFFu)
        return 0;
    return iscntrl((int) ch);
}
/**
 * Tests whether a code point is printable (space included), as classified
 * by the C library's isprint() in the current locale.
 *
 * Code points above 0xFF are rejected up front: passing them to a <ctype.h>
 * function would be undefined behaviour.
 *
 * @param ch code point to classify
 * @return non-zero if ch is printable, 0 otherwise
 */
int
cy_unicode_isprint(unicode ch)
{
    if(ch > 0xFFu)
        return 0;
    return isprint((int) ch);
}
/**
 * Tests whether a code point has a graphical representation (printable and
 * not a space), as classified by the C library's isgraph() in the current
 * locale.
 *
 * Code points above 0xFF are rejected up front: passing them to a <ctype.h>
 * function would be undefined behaviour.
 *
 * @param ch code point to classify
 * @return non-zero if ch is a graphic character, 0 otherwise
 */
int
cy_unicode_isgraph(unicode ch)
{
    if(ch > 0xFFu)
        return 0;
    return isgraph((int) ch);
}
/**
 * Tests whether a code point is a punctuation character, as classified by
 * the C library's ispunct() in the current locale.
 *
 * Code points above 0xFF are rejected up front: passing them to a <ctype.h>
 * function would be undefined behaviour.
 *
 * @param ch code point to classify
 * @return non-zero if ch is punctuation, 0 otherwise
 */
int
cy_unicode_ispunct(unicode ch)
{
    if(ch > 0xFFu)
        return 0;
    return ispunct((int) ch);
}
/**
 * Placeholder classifier: ignores its argument and always reports 0.
 *
 * NOTE(review): this name has no prototype in unicode.h, and
 * cy_unicode_iscntrl() is the function that wraps iscntrl(). Confirm whether
 * this stub is still needed.
 *
 * @param ch code point (unused)
 * @return always 0
 */
int
cy_unicode_isctrl(unicode ch)
{
    (void) ch;
    return 0;
}
/**
 * Number of trail bytes that follow a given UTF-8 lead byte.
 *
 * Evaluates to 0 for bytes below 0xc0, 1 for 0xc0..0xdf, 2 for 0xe0..0xef
 * and 3 for 0xf0 and above. The highest lead byte of a valid sequence is
 * 0xf4 (U+10FFFF). No validation is performed, hence "unsafe".
 *
 * The argument is expanded three times, so it must be free of side effects.
 *
 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
 * @internal
 */
#define UTF8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \
    (((leadByte) >= 0xf0) + ((leadByte) >= 0xe0) + ((leadByte) >= 0xc0))
/**
 * Is this byte a complete code point on its own, i.e. US-ASCII 0..0x7f?
 * True exactly when the top bit (0x80) is clear.
 *
 * @param c 8-bit code unit (byte)
 * @return 1 if the byte stands alone, 0 otherwise
 * @internal
 */
#define UTF8_IS_SINGLE(c) (!((c) & 0x80))
/**
 * Is this code unit (byte) a UTF-8 lead byte?
 *
 * Only 0xc2..0xf4 are accepted. 0xc0 and 0xc1 can only start overlong
 * two-byte forms, and 0xf5 and above would encode values beyond U+10FFFF,
 * so none of them ever begins a well-formed sequence.
 *
 * The subtraction is narrowed to uint8_t so that bytes below 0xc2 wrap
 * around to large values and fail the single comparison.
 *
 * @param c 8-bit code unit (byte)
 * @return TRUE or FALSE
 * @internal
 */
#define UTF8_IS_LEAD(c) ((uint8_t)((c)-0xc2)<=0x32)
/**
 * Is this byte a UTF-8 trail (continuation) byte, i.e. of the form 10xxxxxx?
 * Flipping bit 7 turns the expected "10" prefix into "00", so a single mask
 * test against zero suffices.
 *
 * @param c 8-bit code unit (byte)
 * @return 1 for a trail byte, 0 otherwise
 * @internal
 */
#define UTF8_IS_TRAIL(c) ((((c) ^ 0x80) & 0xc0) == 0)
/**
 * Advance the string pointer from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8: the step width is derived from
 * the lead byte alone and the skipped bytes are not inspected.
 *
 * Wrapped in do { } while(0) so the macro behaves as a single statement
 * (safe in an unbraced if/else); invoke it with a trailing semicolon.
 *
 * @param s const uint8_t * string; modified in place, expanded more than once
 * @internal
 */
#define UTF8_FWD_1_UNSAFE(s) do { \
    (s) += 1 + UTF8_COUNT_TRAIL_BYTES_UNSAFE(*(s)); \
} while(0)
/**
 * Move the string pointer from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input pointer may be at the end of the string (on the terminator).
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * There is no lower bound check: the caller must guarantee that a code point
 * precedes s, otherwise the macro reads before the start of the buffer.
 *
 * Wrapped in do { } while(0) so the macro behaves as a single statement
 * (safe in an unbraced if/else); invoke it with a trailing semicolon.
 *
 * @param s const uint8_t * string; modified in place
 * @internal
 */
#define UTF8_BACK_1_UNSAFE(s) do { \
    while(UTF8_IS_TRAIL(*--(s))) {} \
} while(0)
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The offset may point to the lead byte of a multi-byte sequence,
 * in which case the macro will read the whole sequence.
 * The result is undefined if the offset points to a trail byte
 * or an illegal UTF-8 sequence.
 *
 * Wrapped in do { } while(0) so the macro behaves as a single statement
 * (safe in an unbraced if/else); invoke it with a trailing semicolon.
 * All three arguments are expanded several times and must be free of
 * side effects.
 *
 * @param s const uint8_t * string
 * @param i string offset (integer lvalue, advanced past the sequence)
 * @param c output code point variable (lvalue wide enough for 21 bits)
 * @internal
 */
#define UTF8_NEXT_UNSAFE(s, i, c) do { \
    (c) = (uint8_t)(s)[(i)++]; \
    if((c) >= 0x80) { \
        if((c) < 0xe0) { \
            /* two bytes: 5 payload bits in the lead byte, 6 in the trail byte */ \
            (c) = (((c) & 0x1f) << 6) | ((s)[(i)++] & 0x3f); \
        } else if((c) < 0xf0) { \
            /* three bytes: 4 payload bits in the lead byte */ \
            (c) = (((c) & 0x0f) << 12) | (((s)[(i)] & 0x3f) << 6) | ((s)[(i) + 1] & 0x3f); \
            (i) += 2; \
        } else { \
            /* four bytes: 3 payload bits in the lead byte */ \
            (c) = (((c) & 0x07) << 18) | (((s)[(i)] & 0x3f) << 12) | (((s)[(i) + 1] & 0x3f) << 6) | ((s)[(i) + 2] & 0x3f); \
            (i) += 3; \
        } \
    } \
} while(0)
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a multi-byte sequence, then the macro will read
 * the whole sequence.
 * If the offset is behind a lead byte, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset is behind an illegal UTF-8 sequence.
 *
 * Wrapped in do { } while(0) so the macro behaves as a single statement;
 * invoke it with a trailing semicolon. The arguments are expanded several
 * times and must be free of side effects.
 *
 * @param s const uint8_t * string
 * @param i string offset (integer lvalue, moved back to the lead byte)
 * @param c output code point variable (lvalue wide enough for 21 bits)
 * @internal
 */
#define UTF8_PREV_UNSAFE(s, i, c) do { \
    (c)=(uint8_t)(s)[--(i)]; \
    if(UTF8_IS_TRAIL(c)) { \
        uint8_t utf8_prev_b_, utf8_prev_count_=1, utf8_prev_shift_=6; \
        \
        /* c is a trail byte: keep its 6 payload bits as the lowest bits */ \
        (c)&=0x3f; \
        for(;;) { \
            utf8_prev_b_=(uint8_t)(s)[--(i)]; \
            if(utf8_prev_b_>=0xc0) { \
                /* lead byte: with N trail bytes seen it carries 6-N payload bits */ \
                utf8_prev_b_&=(uint8_t)((1<<(6-utf8_prev_count_))-1); \
                (c)|=(uint32_t)utf8_prev_b_<<utf8_prev_shift_; \
                break; \
            } else { \
                /* another trail byte: 6 more payload bits, one position higher */ \
                (c)|=(uint32_t)(utf8_prev_b_&0x3f)<<utf8_prev_shift_; \
                ++utf8_prev_count_; \
                utf8_prev_shift_+=6; \
            } \
        } \
    } \
} while(0)
/**
 * Decode the code point that starts at the beginning of a string,
 * without modifying the string pointer.
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * s must point at a code point boundary (a single byte or a lead byte);
 * decoding always starts at s[0] and reads as many trail bytes as the lead
 * byte announces. The result is undefined if s points to a trail byte or
 * to an illegal UTF-8 byte sequence.
 *
 * Wrapped in do { } while(0) so the macro behaves as a single statement;
 * invoke it with a trailing semicolon.
 *
 * @param s const uint8_t * string
 * @param c output code point variable (lvalue wide enough for 21 bits)
 * @internal
 */
#define UTF8_GET_UNSAFE(s, c) do { \
    int32_t utf8_get_unsafe_index_ = 0; \
    UTF8_NEXT_UNSAFE(s, utf8_get_unsafe_index_, c); \
} while(0)
/**
 * Finds the first occurrence of a code point in a NUL-terminated UTF-8 string.
 *
 * The string is walked code point by code point with the "unsafe" macros,
 * so it must be well-formed UTF-8. The terminator itself is never matched:
 * searching for code point 0 yields 0.
 *
 * @param str       string to search; must start on a code point boundary
 * @param character code point to look for
 * @return pointer to the first byte of the match, or 0 if there is none
 */
byte*
cy_utf8_chr(const byte* str, unicode character)
{
    assert(str != 0 && (UTF8_IS_SINGLE(*str) || UTF8_IS_LEAD(*str)));

    const byte* cursor = str;
    while(*cursor != 0) {
        unicode decoded = 0;
        UTF8_GET_UNSAFE(cursor, decoded);
        if(decoded != character) {
            /* no match here: step to the next code point boundary */
            UTF8_FWD_1_UNSAFE(cursor);
            continue;
        }
        return cy_cast(byte*, cursor);
    }
    return 0;
}
/**
 * Finds the last occurrence of a code point in a NUL-terminated UTF-8 string.
 *
 * The string is scanned once from front to back while remembering the most
 * recent match. A backward scan would need an explicit lower bound to avoid
 * stepping in front of the buffer; the forward pass never leaves the string.
 *
 * The string must be well-formed UTF-8. Searching for code point 0 returns
 * a pointer to the terminator.
 *
 * @param str       string to search; must start on a code point boundary
 * @param character code point to look for
 * @return pointer to the first byte of the last match, or 0 if there is none
 */
byte*
cy_utf8_rchr(const byte* str, unicode character)
{
    assert(str != 0 && (UTF8_IS_SINGLE(*str) || UTF8_IS_LEAD(*str)));
    const byte* last = 0;
    unicode ch = 0;

    while(*str != 0) {
        UTF8_GET_UNSAFE(str, ch);
        if(ch == character) {
            last = str;
        }
        UTF8_FWD_1_UNSAFE(str);
    }

    /* str now rests on the terminator, which is the only place 0 can match */
    if(character == 0)
        return cy_cast(byte*, str);

    if(last == 0)
        return 0;
    return cy_cast(byte*, last);
}
byte*
cy_utf8_str(const byte* str1, const byte* str2)
{
assert(str1 != 0 && (UTF8_IS_SINGLE(*str1) || UTF8_IS_LEAD(*str1)));
const byte* cp = str1;
const byte* s1 = 0;
const byte* s2 = 0;
if( !*str2 )
return cy_cast(byte*, str1);
while(*cp) {
s1 = cp;
s2 = str2;
while(*s1 && *s2 && !(*s1 - *s2))
s1++, s2++;
if( !*s2)
return cy_cast(byte*, cp);
++cp;
}
return 0;
}
/**
 * Compares two NUL-terminated byte strings, byte by byte.
 *
 * Walks both strings in lock step and stops at the first position where the
 * bytes differ or where str1 ends.
 *
 * @param str1 first string
 * @param str2 second string
 * @return difference of the two bytes at the stopping position (str1's byte
 *         minus str2's byte): 0 if the strings are equal, negative if str1
 *         sorts first, positive otherwise
 */
int
cy_utf8_compare(const byte* str1, const byte* str2)
{
    for(;; ++str1, ++str2) {
        const byte left = *str1;
        const byte right = *str2;

        if(left != right || left == '\0')
            return left - right;
    }
}
/**
 * Counts the code points in a NUL-terminated UTF-8 string.
 *
 * Each step skips one whole sequence based on its lead byte, so the string
 * must be well-formed UTF-8.
 *
 * @param str string to measure; must start on a code point boundary
 * @return number of code points before the terminator
 */
size_t
cy_utf8_len(const byte* str)
{
    assert(str != 0 && (UTF8_IS_SINGLE(*str) || UTF8_IS_LEAD(*str)));

    size_t total = 0;
    for(const byte* cursor = str; *cursor != 0; ++total) {
        UTF8_FWD_1_UNSAFE(cursor);
    }
    return total;
}
/**
 * Decodes the code point at the start of a UTF-8 string.
 *
 * Uses the "unsafe" decoder, so the sequence at str must be well-formed.
 *
 * @param str string positioned on a code point boundary
 * @return the decoded code point (0 for an empty string)
 */
unicode
cy_utf8_get(const byte* str)
{
    assert(str != 0 && (UTF8_IS_SINGLE(*str) || UTF8_IS_LEAD(*str)));

    unicode decoded = 0;
    int32_t offset = 0; /* scratch index; the caller's pointer is left alone */
    UTF8_NEXT_UNSAFE(str, offset, decoded);
    return decoded;
}
/**
 * Returns how many bytes the sequence starting at str occupies, judged from
 * its first byte alone: 1 plus the number of trail bytes that byte announces.
 *
 * NOTE(review): despite the name, this is a byte count for a single code
 * point, not a count of code points.
 *
 * @param str string positioned on a code point boundary
 * @return length in bytes (1..4) of the first sequence
 */
size_t
cy_utf8_codepoints(const byte* str)
{
    assert(str != 0 && (UTF8_IS_SINGLE(*str) || UTF8_IS_LEAD(*str)));

    const byte lead = *str;
    return 1 + UTF8_COUNT_TRAIL_BYTES_UNSAFE(lead);
}
/**
 * Returns a pointer to the code point following the one at str.
 *
 * The step width comes from the lead byte alone, so the string must be
 * well-formed UTF-8.
 *
 * @param str string positioned on a code point boundary
 * @return pointer to the next code point boundary, or 0 if str is already
 *         at the terminator
 */
byte*
cy_utf8_next(const byte* str)
{
    assert(str != 0 && (UTF8_IS_SINGLE(*str) || UTF8_IS_LEAD(*str)));

    if(*str != 0) {
        UTF8_FWD_1_UNSAFE(str);
        return cy_cast(byte*, str);
    }
    return 0;
}
/**
 * Returns a pointer to the code point preceding the one at str.
 *
 * Steps back one byte at a time until a non-trail byte is reached. There is
 * no lower bound: the caller must ensure a code point really precedes str,
 * otherwise bytes in front of the buffer are read.
 *
 * @param str string positioned on a code point boundary (may be the terminator)
 * @return pointer to the first byte of the previous code point
 */
byte*
cy_utf8_prev(const byte* str)
{
    assert(str != 0 && (UTF8_IS_SINGLE(*str) || UTF8_IS_LEAD(*str)));

    do {
        --str;
    } while(UTF8_IS_TRAIL(*str));

    return cy_cast(byte*, str);
}
/**
 * Measures a NUL-terminated UTF-8 string in bytes, terminator excluded.
 *
 * The string is traversed sequence by sequence (lead byte plus announced
 * trail bytes), so it must be well-formed UTF-8.
 *
 * @param str string to measure; must start on a code point boundary
 * @return number of bytes covered by the traversal
 */
size_t
cy_utf8_size(const byte* str)
{
    assert(str != 0 && (UTF8_IS_SINGLE(*str) || UTF8_IS_LEAD(*str)));

    const byte* cursor = str;
    while(*cursor != 0) {
        /* width of this sequence: the lead byte plus its trail bytes */
        cursor += 1 + UTF8_COUNT_TRAIL_BYTES_UNSAFE(*cursor);
    }
    return (size_t) (cursor - str);
}
/**
 * Checks the lead/trail byte structure of a NUL-terminated UTF-8 string.
 *
 * Every byte must be a single (ASCII) byte, a lead byte, or one of the trail
 * bytes announced by the preceding lead byte. A sequence interrupted by any
 * other byte, a stray trail byte, and a sequence cut short by the terminator
 * are all rejected.
 *
 * Only the byte structure is checked; overlong three- and four-byte forms
 * and surrogate code points are not detected.
 *
 * @param str string to check; must not be null
 * @return CHERRY_OK if the structure is sound, CHERRY_FAIL otherwise
 */
int
cy_utf8_validate(const byte* str)
{
    assert(str != 0);
    size_t trails = 0; /* trail bytes still owed by the current sequence */

    while(*str != 0) {
        if(trails > 0) {
            /* inside a sequence: anything but a trail byte breaks it */
            if(!UTF8_IS_TRAIL(*str))
                return CHERRY_FAIL;
            --trails;
        } else if(UTF8_IS_LEAD(*str)) {
            trails = UTF8_COUNT_TRAIL_BYTES_UNSAFE(*str);
        } else if(!UTF8_IS_SINGLE(*str)) {
            /* stray trail byte or a byte that can never start a sequence */
            return CHERRY_FAIL;
        }
        str++;
    }

    /* the terminator must not arrive in the middle of a sequence */
    return (trails > 0) ? CHERRY_FAIL : CHERRY_OK;
}
/*
* Cherry programming language
* Copyright (C) 2013 Christoph Mueller
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include "standard.h"
/*
 * Code point classification. Each function takes one code point and returns
 * non-zero when it belongs to the named class, 0 otherwise.
 */
int cy_unicode_isblank(unicode ch);
int cy_unicode_isspace(unicode ch);
int cy_unicode_isalpha(unicode ch);
int cy_unicode_isalnum(unicode ch);
int cy_unicode_isdigit(unicode ch);
int cy_unicode_ishex(unicode ch);
/* NOTE(review): no definition of cy_unicode_isoct is visible in unicode.c, while
 * unicode.c defines cy_unicode_isctrl without a prototype here - confirm. */
int cy_unicode_isoct(unicode ch);
int cy_unicode_isprint(unicode ch);
int cy_unicode_isgraph(unicode ch);
int cy_unicode_ispunct(unicode ch);
int cy_unicode_iscntrl(unicode ch);
/*
 * UTF-8 string functions. Strings are NUL-terminated byte sequences. Apart
 * from cy_utf8_validate and cy_utf8_compare, the implementations assert that
 * the string argument starts on a code point boundary, and the ones that
 * iterate or decode assume well-formed UTF-8.
 */
/* Returns CHERRY_OK if the lead/trail byte structure is sound, CHERRY_FAIL otherwise. */
int cy_utf8_validate(const byte* str);
/* Byte-wise comparison; returns the difference of the bytes where the walk stops (0 if equal). */
int cy_utf8_compare(const byte* str1, const byte* str2);
/* First occurrence of a code point, or 0 if absent. */
byte* cy_utf8_chr(const byte* str, unicode character);
/* Last occurrence of a code point, or 0 if absent. */
byte* cy_utf8_rchr(const byte* str, unicode character);
/* First occurrence of str2 within str1 (byte-wise), or 0 if absent. */
byte* cy_utf8_str(const byte* str1, const byte* str2);
/* Length in bytes, terminator excluded. */
size_t cy_utf8_size(const byte* str);
/* Length in code points. */
size_t cy_utf8_len(const byte* str);
/* Decodes the code point at the start of str. */
unicode cy_utf8_get(const byte* str);
/* Number of bytes occupied by the first code point of str (1 + its trail bytes). */
size_t cy_utf8_codepoints(const byte* str);
/* Pointer to the following code point, or 0 when str is at the terminator. */
byte* cy_utf8_next(const byte* str);
/* Pointer to the preceding code point; the caller guarantees that one exists. */
byte* cy_utf8_prev(const byte* str);
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment