blob: 6fac8ec4f5af514725edbfd5352e18047fb4f91e [file] [log] [blame]
/* See LICENSE file for copyright and license details. */
#include <stddef.h>
#include <stdint.h>
#include "../gen/case.h"
#include "../grapheme.h"
#include "util.h"
/*
 * Return the case property of the codepoint cp.
 *
 * The lookup goes through two tables: case_major is indexed with
 * cp >> 8 and yields a base offset, to which the low byte of cp is
 * added to index case_minor. Anything above U+10FFFF is not looked
 * up and is reported as CASE_PROP_OTHER.
 */
static inline enum case_property
get_case_property(uint_least32_t cp)
{
	if (!likely(cp <= UINT32_C(0x10FFFF))) {
		/* not covered by the tables */
		return CASE_PROP_OTHER;
	}

	return (enum case_property)
		case_minor[case_major[cp >> 8] + (cp & 0xFF)];
}
/*
 * Return the case-mapping entry for the codepoint cp from the table
 * pair major/minor (major is indexed with cp >> 8 and gives a base
 * offset into minor, to which the low byte of cp is added).
 *
 * The returned value is normally an offset to be added to cp. It may
 * however be larger than or equal to 0x110000, which marks a
 * special-case mapping; callers have to handle that separately.
 * For cp above U+10FFFF no lookup is done and 0 is returned.
 */
static inline int_least32_t
get_case_offset(uint_least32_t cp, const uint_least16_t *major,
                const int_least32_t *minor)
{
	if (!likely(cp <= UINT32_C(0x10FFFF))) {
		/* outside of the tables: map to itself */
		return 0;
	}

	return minor[major[cp >> 8] + (cp & 0xFF)];
}
/*
 * Read every codepoint available from r, apply the case mapping given
 * by the tables major/minor/sc and write the result to w.
 *
 * final_sigma_level is the initial state of the Final_Sigma heuristic,
 * which is only evaluated when sc is lower_special:
 *   0 - no cased sequence is in progress
 *   1 - the previous codepoint was cased
 *   2 - a cased codepoint was followed by case-ignorable codepoint(s)
 *
 * The writer is NUL-terminated at the end and the function returns
 * herodotus_writer_number_written(w).
 */
static inline size_t
to_case(HERODOTUS_READER *r, HERODOTUS_WRITER *w,
        uint_least8_t final_sigma_level, const uint_least16_t *major,
        const int_least32_t *minor, const struct special_case *sc)
{
	HERODOTUS_READER lookahead;
	enum herodotus_status la_status;
	enum case_property kind;
	uint_least32_t c, next_c;
	int_least32_t delta;
	size_t idx, k;
	bool cased, ignorable;

	while (herodotus_read_codepoint(r, true, &c) ==
	       HERODOTUS_STATUS_SUCCESS) {
		if (sc == lower_special) {
			/*
			 * Final_Sigma (see SpecialCasing.txt) is the only
			 * non-localized context-dependent rule: a sigma at
			 * the end of a word gets a different mapping.
			 *
			 * Before: cased case-ignorable*
			 * After:  not(case-ignorable* cased)
			 *
			 * The before-condition is tracked with the running
			 * "level" state, the after-condition is checked on
			 * demand by looking ahead.
			 */
			if (c == UINT32_C(0x03A3) && /* GREEK CAPITAL LETTER
			                                SIGMA */
			    (final_sigma_level == 1 ||
			     final_sigma_level == 2)) {
				/*
				 * Look ahead on a copy of the reader: skip
				 * all case-ignorable codepoints and stop at
				 * the first one that is not case-ignorable
				 * or when nothing more can be read.
				 */
				herodotus_reader_copy(r, &lookahead);
				kind = NUM_CASE_PROPS;
				for (;;) {
					la_status = herodotus_read_codepoint(
						&lookahead, true, &next_c);
					if (la_status !=
					    HERODOTUS_STATUS_SUCCESS) {
						break;
					}

					kind = get_case_property(next_c);
					if (kind != CASE_PROP_CASE_IGNORABLE &&
					    kind !=
					    CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
						break;
					}
				}

				/*
				 * Either the look-ahead ran out of input or
				 * it stopped at a codepoint that is not
				 * case-ignorable. Unless that codepoint is
				 * cased, the after-condition holds.
				 */
				if (la_status != HERODOTUS_STATUS_SUCCESS ||
				    (kind != CASE_PROP_CASED &&
				     kind !=
				     CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
					/* GREEK SMALL LETTER FINAL SIGMA */
					herodotus_write_codepoint(
						w, UINT32_C(0x03C2));

					/* start over with a fresh state */
					final_sigma_level = 0;
					continue;
				}
			}

			/* advance the Final_Sigma state with this codepoint */
			kind = get_case_property(c);
			cased = (kind == CASE_PROP_CASED ||
			         kind == CASE_PROP_BOTH_CASED_CASE_IGNORABLE);
			ignorable =
				(kind == CASE_PROP_CASE_IGNORABLE ||
				 kind == CASE_PROP_BOTH_CASED_CASE_IGNORABLE);

			if ((final_sigma_level == 0 ||
			     final_sigma_level == 1) &&
			    cased) {
				/* a cased sequence starts or goes on */
				final_sigma_level = 1;
			} else if ((final_sigma_level == 1 ||
			            final_sigma_level == 2) &&
			           ignorable) {
				/* case-ignorable run after a cased one */
				final_sigma_level = 2;
			} else {
				/* the sequence is broken */
				final_sigma_level = 0;
			}
		}

		/* look up the mapping and emit the mapped codepoint(s) */
		delta = get_case_offset(c, major, minor);
		if (unlikely(delta >= INT32_C(0x110000))) {
			/*
			 * special case: the distance to 0x110000 is the
			 * index of the replacement sequence in sc
			 */
			idx = (uint_least32_t)delta - UINT32_C(0x110000);
			for (k = 0; k < sc[idx].cplen; k++) {
				herodotus_write_codepoint(w, sc[idx].cp[k]);
			}
		} else {
			/* simple mapping: delta is added to the codepoint */
			herodotus_write_codepoint(
				w, (uint_least32_t)((int_least32_t)c + delta));
		}
	}

	herodotus_writer_nul_terminate(w);

	return herodotus_writer_number_written(w);
}
/*
 * Determine the next word break for the data of the reader r without
 * modifying r itself: the reader is copied and the copy's src/srclen
 * are handed to the word-break function matching the reader type
 * (codepoint array or UTF-8). The return value is passed through
 * unchanged from that function.
 */
static size_t
herodotus_next_word_break(const HERODOTUS_READER *r)
{
	HERODOTUS_READER view;

	herodotus_reader_copy(r, &view);

	if (r->type != HERODOTUS_TYPE_CODEPOINT) {
		/* r->type == HERODOTUS_TYPE_UTF8 */
		return grapheme_next_word_break_utf8(view.src, view.srclen);
	}

	return grapheme_next_word_break(view.src, view.srclen);
}
/*
 * Convert the data readable from r to titlecase and write it to w.
 *
 * The input is processed word by word (as delimited by
 * herodotus_next_word_break()). Within each word, leading codepoints
 * that are not cased are copied verbatim, the first cased codepoint
 * is mapped with the title tables and the remainder of the word is
 * mapped with the lower tables.
 *
 * The writer is NUL-terminated at the end and the function returns
 * herodotus_writer_number_written(w).
 */
static inline size_t
to_titlecase(HERODOTUS_READER *r, HERODOTUS_WRITER *w)
{
	enum herodotus_status status;
	enum case_property kind;
	uint_least32_t c;
	size_t word_len;

	while ((word_len = herodotus_next_word_break(r)) > 0) {
		/* confine all reads to the current word */
		herodotus_reader_push_advance_limit(r, word_len);

		/* peek at codepoints until a cased one shows up */
		for (;;) {
			status = herodotus_read_codepoint(r, false, &c);
			if (status != HERODOTUS_STATUS_SUCCESS) {
				break;
			}

			kind = get_case_property(c);
			if (kind == CASE_PROP_CASED ||
			    kind == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
				break;
			}

			/*
			 * not cased: write it to the output verbatim, if
			 * it permits, and then actually consume it
			 */
			herodotus_write_codepoint(w, c);
			herodotus_read_codepoint(r, true, &c);
		}

		if (status == HERODOTUS_STATUS_END_OF_BUFFER) {
			/* nothing left to process */
			herodotus_reader_pop_limit(r);
			break;
		}
		if (status == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
			/* the word contained no cased codepoint at all */
			herodotus_reader_pop_limit(r);
			continue;
		}

		/*
		 * A cased codepoint is next: restrict the reader to just
		 * that codepoint and map it to titlecase.
		 */
		herodotus_reader_push_advance_limit(
			r, herodotus_reader_next_codepoint_break(r));
		to_case(r, w, 0, title_major, title_minor, title_special);
		herodotus_reader_pop_limit(r);

		/*
		 * Lowercase whatever is left of the word. The Final_Sigma
		 * state starts at 1 because a cased codepoint was just
		 * processed.
		 */
		to_case(r, w, 1, lower_major, lower_minor, lower_special);

		/* drop the word limit before looking at the next word */
		herodotus_reader_pop_limit(r);
	}

	herodotus_writer_nul_terminate(w);

	return herodotus_writer_number_written(w);
}
/*
 * Uppercase the codepoint array src/srclen into dest/destlen.
 *
 * src/srclen and dest/destlen are handed to a codepoint-typed reader
 * and writer respectively; the conversion itself and the return value
 * come from to_case() with the upper tables.
 */
size_t
grapheme_to_uppercase(const uint_least32_t *src, size_t srclen,
                      uint_least32_t *dest, size_t destlen)
{
	HERODOTUS_READER reader;
	HERODOTUS_WRITER writer;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_CODEPOINT, src, srclen);
	herodotus_writer_init(&writer, HERODOTUS_TYPE_CODEPOINT, dest,
	                      destlen);

	/* Final_Sigma state starts at 0 */
	return to_case(&reader, &writer, 0, upper_major, upper_minor,
	               upper_special);
}
/*
 * Lowercase the codepoint array src/srclen into dest/destlen.
 *
 * src/srclen and dest/destlen are handed to a codepoint-typed reader
 * and writer respectively; the conversion itself and the return value
 * come from to_case() with the lower tables.
 */
size_t
grapheme_to_lowercase(const uint_least32_t *src, size_t srclen,
                      uint_least32_t *dest, size_t destlen)
{
	HERODOTUS_READER reader;
	HERODOTUS_WRITER writer;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_CODEPOINT, src, srclen);
	herodotus_writer_init(&writer, HERODOTUS_TYPE_CODEPOINT, dest,
	                      destlen);

	/* Final_Sigma state starts at 0: no cased codepoint seen yet */
	return to_case(&reader, &writer, 0, lower_major, lower_minor,
	               lower_special);
}
/*
 * Titlecase the codepoint array src/srclen into dest/destlen.
 *
 * src/srclen and dest/destlen are handed to a codepoint-typed reader
 * and writer respectively; the conversion itself and the return value
 * come from to_titlecase().
 */
size_t
grapheme_to_titlecase(const uint_least32_t *src, size_t srclen,
                      uint_least32_t *dest, size_t destlen)
{
	HERODOTUS_READER reader;
	HERODOTUS_WRITER writer;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_CODEPOINT, src, srclen);
	herodotus_writer_init(&writer, HERODOTUS_TYPE_CODEPOINT, dest,
	                      destlen);

	return to_titlecase(&reader, &writer);
}
/*
 * Uppercase the UTF-8 data src/srclen into dest/destlen.
 *
 * src/srclen and dest/destlen are handed to a UTF-8-typed reader and
 * writer respectively; the conversion itself and the return value
 * come from to_case() with the upper tables.
 */
size_t
grapheme_to_uppercase_utf8(const char *src, size_t srclen, char *dest,
                           size_t destlen)
{
	HERODOTUS_READER reader;
	HERODOTUS_WRITER writer;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_UTF8, src, srclen);
	herodotus_writer_init(&writer, HERODOTUS_TYPE_UTF8, dest, destlen);

	/* Final_Sigma state starts at 0 */
	return to_case(&reader, &writer, 0, upper_major, upper_minor,
	               upper_special);
}
/*
 * Lowercase the UTF-8 data src/srclen into dest/destlen.
 *
 * src/srclen and dest/destlen are handed to a UTF-8-typed reader and
 * writer respectively; the conversion itself and the return value
 * come from to_case() with the lower tables.
 */
size_t
grapheme_to_lowercase_utf8(const char *src, size_t srclen, char *dest,
                           size_t destlen)
{
	HERODOTUS_READER reader;
	HERODOTUS_WRITER writer;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_UTF8, src, srclen);
	herodotus_writer_init(&writer, HERODOTUS_TYPE_UTF8, dest, destlen);

	/* Final_Sigma state starts at 0: no cased codepoint seen yet */
	return to_case(&reader, &writer, 0, lower_major, lower_minor,
	               lower_special);
}
/*
 * Titlecase the UTF-8 data src/srclen into dest/destlen.
 *
 * src/srclen and dest/destlen are handed to a UTF-8-typed reader and
 * writer respectively; the conversion itself and the return value
 * come from to_titlecase().
 */
size_t
grapheme_to_titlecase_utf8(const char *src, size_t srclen, char *dest,
                           size_t destlen)
{
	HERODOTUS_READER reader;
	HERODOTUS_WRITER writer;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_UTF8, src, srclen);
	herodotus_writer_init(&writer, HERODOTUS_TYPE_UTF8, dest, destlen);

	return to_titlecase(&reader, &writer);
}
/*
 * Check whether the data readable from r is already in the case
 * described by the tables major/minor/sc.
 *
 * Codepoints are consumed as long as they agree with their mapping.
 * For a simple mapping this means the mapped value equals the
 * codepoint itself. For a special-case mapping the upcoming input
 * codepoints are compared one by one with the replacement sequence;
 * if the input runs out during that comparison without a mismatch the
 * data is accepted.
 *
 * Returns false at the first disagreement and true otherwise. If
 * output is not NULL it receives herodotus_reader_number_read(r) in
 * either case.
 */
static inline bool
is_case(HERODOTUS_READER *r, const uint_least16_t *major,
        const int_least32_t *minor, const struct special_case *sc,
        size_t *output)
{
	const struct special_case *entry;
	uint_least32_t c;
	int_least32_t delta;
	size_t idx, k;
	bool conforms = true;

	/* only peek here; a codepoint is consumed once it is accepted */
	while (herodotus_read_codepoint(r, false, &c) ==
	       HERODOTUS_STATUS_SUCCESS) {
		delta = get_case_offset(c, major, minor);

		if (unlikely(delta >= INT32_C(0x110000))) {
			/*
			 * special case: the distance to 0x110000 is the
			 * index of the replacement sequence in sc
			 */
			idx = (uint_least32_t)delta - UINT32_C(0x110000);
			entry = &sc[idx];

			for (k = 0; k < entry->cplen; k++) {
				if (herodotus_read_codepoint(r, false, &c) !=
				    HERODOTUS_STATUS_SUCCESS) {
					/*
					 * the input ended before any
					 * difference was seen, so the data
					 * is considered fine
					 */
					conforms = true;
					goto out;
				}
				if (c != entry->cp[k]) {
					conforms = false;
					goto out;
				}

				/* matched, consume the codepoint */
				herodotus_read_codepoint(r, true, &c);
			}
		} else {
			/* simple mapping */
			if (c != (uint_least32_t)((int_least32_t)c + delta)) {
				/* the mapping would alter the codepoint */
				conforms = false;
				goto out;
			}

			/* unchanged by the mapping, consume the codepoint */
			herodotus_read_codepoint(r, true, &c);
		}
	}

out:
	if (output) {
		*output = herodotus_reader_number_read(r);
	}

	return conforms;
}
/*
 * Check whether the data readable from r is in titlecase.
 *
 * The input is processed word by word (as delimited by
 * herodotus_next_word_break()). Within each word, leading codepoints
 * that are not cased are skipped, the first cased codepoint has to
 * pass is_case() with the title tables and the remainder of the word
 * has to pass is_case() with the lower tables.
 *
 * Returns false as soon as one of these checks fails and true
 * otherwise. If output is not NULL it receives
 * herodotus_reader_number_read(r) in either case.
 */
static inline bool
is_titlecase(HERODOTUS_READER *r, size_t *output)
{
	enum herodotus_status status;
	enum case_property kind;
	uint_least32_t c;
	size_t word_len;
	bool conforms = true;

	while ((word_len = herodotus_next_word_break(r)) > 0) {
		/* confine all reads to the current word */
		herodotus_reader_push_advance_limit(r, word_len);

		/* peek at codepoints until a cased one shows up */
		for (;;) {
			status = herodotus_read_codepoint(r, false, &c);
			if (status != HERODOTUS_STATUS_SUCCESS) {
				break;
			}

			kind = get_case_property(c);
			if (kind == CASE_PROP_CASED ||
			    kind == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
				break;
			}

			/* not cased: consume it and keep looking */
			herodotus_read_codepoint(r, true, &c);
		}

		if (status == HERODOTUS_STATUS_END_OF_BUFFER) {
			/*
			 * nothing left to check; the word limit is not
			 * popped on this path, after the loop only the read
			 * count is queried from r
			 */
			break;
		}
		if (status == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
			/* the word contained no cased codepoint at all */
			herodotus_reader_pop_limit(r);
			continue;
		}

		/*
		 * A cased codepoint is next: restrict the reader to just
		 * that codepoint and check that it is titlecase.
		 */
		herodotus_reader_push_advance_limit(
			r, herodotus_reader_next_codepoint_break(r));
		if (!is_case(r, title_major, title_minor, title_special,
		             NULL)) {
			conforms = false;
			goto out;
		}
		herodotus_reader_pop_limit(r);

		/* whatever is left of the word has to be lowercase */
		if (!is_case(r, lower_major, lower_minor, lower_special,
		             NULL)) {
			conforms = false;
			goto out;
		}

		/* drop the word limit before looking at the next word */
		herodotus_reader_pop_limit(r);
	}

out:
	if (output) {
		*output = herodotus_reader_number_read(r);
	}

	return conforms;
}
/*
 * Check whether the codepoint array src/srclen is uppercase.
 *
 * src/srclen are handed to a codepoint-typed reader; the verdict comes
 * from is_case() with the upper tables, which also fills *caselen
 * (if caselen is not NULL) with the reader's read count.
 */
bool
grapheme_is_uppercase(const uint_least32_t *src, size_t srclen, size_t *caselen)
{
	HERODOTUS_READER reader;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_CODEPOINT, src, srclen);

	return is_case(&reader, upper_major, upper_minor, upper_special,
	               caselen);
}
/*
 * Check whether the codepoint array src/srclen is lowercase.
 *
 * src/srclen are handed to a codepoint-typed reader; the verdict comes
 * from is_case() with the lower tables, which also fills *caselen
 * (if caselen is not NULL) with the reader's read count.
 */
bool
grapheme_is_lowercase(const uint_least32_t *src, size_t srclen, size_t *caselen)
{
	HERODOTUS_READER reader;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_CODEPOINT, src, srclen);

	return is_case(&reader, lower_major, lower_minor, lower_special,
	               caselen);
}
/*
 * Check whether the codepoint array src/srclen is titlecase.
 *
 * src/srclen are handed to a codepoint-typed reader; the verdict comes
 * from is_titlecase(), which also fills *caselen (if caselen is not
 * NULL) with the reader's read count.
 */
bool
grapheme_is_titlecase(const uint_least32_t *src, size_t srclen, size_t *caselen)
{
	HERODOTUS_READER reader;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_CODEPOINT, src, srclen);

	return is_titlecase(&reader, caselen);
}
/*
 * Check whether the UTF-8 data src/srclen is uppercase.
 *
 * src/srclen are handed to a UTF-8-typed reader; the verdict comes
 * from is_case() with the upper tables, which also fills *caselen
 * (if caselen is not NULL) with the reader's read count.
 */
bool
grapheme_is_uppercase_utf8(const char *src, size_t srclen, size_t *caselen)
{
	HERODOTUS_READER reader;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_UTF8, src, srclen);

	return is_case(&reader, upper_major, upper_minor, upper_special,
	               caselen);
}
/*
 * Check whether the UTF-8 data src/srclen is lowercase.
 *
 * src/srclen are handed to a UTF-8-typed reader; the verdict comes
 * from is_case() with the lower tables, which also fills *caselen
 * (if caselen is not NULL) with the reader's read count.
 */
bool
grapheme_is_lowercase_utf8(const char *src, size_t srclen, size_t *caselen)
{
	HERODOTUS_READER reader;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_UTF8, src, srclen);

	return is_case(&reader, lower_major, lower_minor, lower_special,
	               caselen);
}
/*
 * Check whether the UTF-8 data src/srclen is titlecase.
 *
 * src/srclen are handed to a UTF-8-typed reader; the verdict comes
 * from is_titlecase(), which also fills *caselen (if caselen is not
 * NULL) with the reader's read count.
 */
bool
grapheme_is_titlecase_utf8(const char *src, size_t srclen, size_t *caselen)
{
	HERODOTUS_READER reader;

	herodotus_reader_init(&reader, HERODOTUS_TYPE_UTF8, src, srclen);

	return is_titlecase(&reader, caselen);
}