Skip to content

Commit

Permalink
Add Hangul Jamo (de)composition
Browse files Browse the repository at this point in the history
  • Loading branch information
grigorig committed Aug 19, 2012
1 parent 88e2a6c commit 92e82cd
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 13 deletions.
6 changes: 3 additions & 3 deletions ucdn-test.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,6 @@ int main(int argc, char **argv)
printf("script %d\n", ucdn_get_script(codepoint));
printf("bidi_class %d\n", ucdn_get_bidi_class(codepoint));

if (ucdn_decompose(codepoint, &a, &b))
printf("decomposition U+%04X U+%04X\n", a, b);

if ((len = ucdn_compat_decompose(codepoint, decomposed))) {
printf("compatibility_decomposition");
for (i = 0; i < len; i++) {
Expand All @@ -49,6 +46,9 @@ int main(int argc, char **argv)
printf("\n");
}

if (ucdn_decompose(codepoint, &a, &b))
printf("decomposition U+%04X U+%04X\n", a, b);

if (ucdn_compose(&codepoint, a, b))
printf("recomposition U+%04X\n", codepoint);

Expand Down
78 changes: 72 additions & 6 deletions ucdn.c
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,17 @@ typedef struct {

#include "unicodedata_db.h"

/* constants required for Hangul (de)composition */
#define SBASE 0xAC00
#define LBASE 0x1100
#define VBASE 0x1161
#define TBASE 0x11A7
#define SCOUNT 11172
#define LCOUNT 19
#define VCOUNT 21
#define TCOUNT 28
#define NCOUNT (VCOUNT * TCOUNT)

static UCDRecord *get_ucd_record(uint32_t code)
{
int index;
Expand Down Expand Up @@ -91,6 +102,32 @@ static int compare_mp(const void *a, const void *b)
return mpa->from - mpb->from;
}

/*
 * Arithmetically combine jamo into a precomposed Hangul syllable.
 *
 * code: receives the resulting syllable code point
 * l:    leading jamo (offset from LBASE is used as the L index)
 * v:    vowel jamo (offset from VBASE is used as the V index)
 * t:    trailing jamo; anything that is not strictly between TBASE and
 *       TBASE + TCOUNT is treated as "no trailing jamo"
 *
 * Returns the number of jamo that went into the syllable: 3 when a
 * trailing jamo was used, 2 otherwise. The l and v arguments are not
 * range-checked here.
 */
static int hangul_full_compose(uint32_t *code, uint32_t l, uint32_t v,
        uint32_t t)
{
    int lead = l - LBASE;
    int vowel = v - VBASE;
    int trail = t - TBASE;
    int has_trail = (trail > 0 && trail < TCOUNT);
    int lv_index;

    /* out-of-range trailing values collapse to the empty slot */
    if (!has_trail)
        trail = 0;

    lv_index = lead * VCOUNT + vowel;
    *code = lv_index * TCOUNT + trail + SBASE;

    return has_trail ? 3 : 2;
}

/*
 * Arithmetically split a precomposed Hangul syllable into its jamo.
 *
 * code: syllable code point; its offset from SBASE is used as the
 *       syllable index and is not range-checked here
 * l:    receives the leading jamo
 * v:    receives the vowel jamo
 * t:    receives the trailing jamo, or TBASE when there is none
 *
 * Returns the number of jamo in the syllable: 2 when there is no
 * trailing jamo (*t == TBASE), 3 otherwise.
 */
static int hangul_full_decompose(uint32_t code, uint32_t *l, uint32_t *v,
        uint32_t *t)
{
    int syllable = code - SBASE;
    int lead = syllable / NCOUNT;
    int vowel = (syllable % NCOUNT) / TCOUNT;
    int trail = syllable % TCOUNT;

    *l = LBASE + lead;
    *v = VBASE + vowel;
    *t = TBASE + trail;

    /* index 0 in the trailing slot means an LV-only syllable */
    return trail == 0 ? 2 : 3;
}

const char *ucdn_get_unicode_version(void)
{
return UNIDATA_VERSION;
Expand Down Expand Up @@ -144,14 +181,30 @@ int ucdn_get_script(uint32_t code)

int ucdn_decompose(uint32_t code, uint32_t *a, uint32_t *b)
{
unsigned int *rec = get_decomp_record(code);
int len = rec[0] >> 8;
unsigned int *rec;
int len;

if (code >= SBASE && code < (SBASE + SCOUNT)) {
uint32_t l, v, t;
/* LVT decomposition needs a recomposition of the LV part,
to achieve a two-character decomposition */
if (hangul_full_decompose(code, &l, &v, &t) == 3) {
hangul_full_compose(&l, l, v, TBASE);
*a = l;
*b = t;
return 1;
}
*a = l;
*b = v;
return 1;
}

rec = get_decomp_record(code);
len = rec[0] >> 8;

if ((rec[0] & 0xff) != 0 || len == 0)
return 0;

/* TODO: Hangul Jamo decomposition */

*a = rec[1];
if (len > 1)
*b = rec[2];
Expand All @@ -165,6 +218,21 @@ int ucdn_compose(uint32_t *code, uint32_t a, uint32_t b)
{
int l, r, index, indexi;

if (b >= LBASE && b < (TBASE + TCOUNT)) {
/* LVT compositions are handled in two steps, so
fully decompose the first character if needed */
if (a >= SBASE && a < (SBASE + SCOUNT)) {
uint32_t l, v, t;
hangul_full_decompose(a, &l, &v, &t);
hangul_full_compose(code, l, v, b);
return 1;
} else if (a >= LBASE && a < (TBASE + TCOUNT)) {
hangul_full_compose(code, a, b, TBASE);
return 1;
}
return 0;
}

l = get_comp_index(a, nfc_first);
r = get_comp_index(b, nfc_last);

Expand All @@ -187,8 +255,6 @@ int ucdn_compat_decompose(uint32_t code, uint32_t *decomposed)
if (len == 0)
return 0;

/* TODO: Hangul Jamo decomposition */

for (i = 0; i < len; i++)
decomposed[i] = rec[i + 1];

Expand Down
8 changes: 4 additions & 4 deletions ucdn.h
Original file line number Diff line number Diff line change
Expand Up @@ -247,8 +247,8 @@ int ucdn_get_mirrored(uint32_t code);
uint32_t ucdn_mirror(uint32_t code);

/**
* Pairwise canonical decomposition of a codepoint. Note that this does
* not include Hangul Jamo decomposition.
* Pairwise canonical decomposition of a codepoint. This includes
* Hangul Jamo decomposition.
*
* @param code Unicode codepoint
* @param a filled with first codepoint of decomposition
Expand All @@ -268,8 +268,8 @@ int ucdn_decompose(uint32_t code, uint32_t *a, uint32_t *b);
int ucdn_compat_decompose(uint32_t code, uint32_t *decomposed);

/**
* Pairwise canonical composition of two codepoints. Note that this does
* not include Hangul Jamo composition.
* Pairwise canonical composition of two codepoints. This includes
* Hangul Jamo composition.
*
* @param code filled with composition
* @param a first codepoint
Expand Down

0 comments on commit 92e82cd

Please sign in to comment.