Skip to content

Commit

Permalink
Use UTF-16 encoded decompositions to reduce data size
Browse files Browse the repository at this point in the history
Very few characters (only a handful) decompose into code points outside
of the BMP, so storing the decompositions as UTF-16 is quite beneficial.
As with the last change, there is no measurable performance impact, but
the data size is much smaller.
  • Loading branch information
grigorig committed Sep 18, 2012
1 parent 867c726 commit 8affcda
Show file tree
Hide file tree
Showing 3 changed files with 836 additions and 795 deletions.
17 changes: 15 additions & 2 deletions makeunicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,9 +223,13 @@ def makeunicodedata(unicode, trace):
try:
i = decomp_data.index(decomp)
except ValueError:
decomp_utf16 = []
for code in decomp:
if code < 0x10000: decomp_utf16.append(code)
else: decomp_utf16.extend(encode_utf16_surrogates(code))
i = len(decomp_data)
decomp_data.extend(decomp)
decomp_size = decomp_size + len(decomp) * 2
decomp_data.extend(decomp_utf16)
decomp_size = decomp_size + len(decomp_utf16) * 2
else:
i = 0
decomp_index[char] = i
Expand Down Expand Up @@ -1470,5 +1474,14 @@ def get_best_split(tab):
print("best", best_block_sizes)
return best_split + best_block_sizes

def encode_utf16_surrogates(code):
    """Split a supplementary-plane code point into a UTF-16 surrogate pair.

    code -- integer code point; must lie in 0x10000..0x10FFFF.

    Returns a (high_surrogate, low_surrogate) tuple, where the high
    surrogate is in 0xD800..0xDBFF and the low surrogate is in
    0xDC00..0xDFFF.

    Code points below 0x10000 have no surrogate representation; they are
    rejected instead of silently producing values outside the surrogate
    ranges.
    """
    assert 0x10000 <= code < 0x110000
    # Surrogates carry the 20-bit offset from the start of plane 1.
    offset = code - 0x10000
    low = offset & 1023      # bottom 10 bits
    high = offset >> 10      # top 10 bits
    high_sur = 0xd800 + high
    low_sur = 0xdc00 + low
    return high_sur, low_sur

if __name__ == "__main__":
maketables(1)
28 changes: 22 additions & 6 deletions ucdn.c
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ static UCDRecord *get_ucd_record(uint32_t code)
return &ucd_records[index];
}

static unsigned int *get_decomp_record(uint32_t code)
static unsigned short *get_decomp_record(uint32_t code)
{
int index, offset;

Expand Down Expand Up @@ -201,9 +201,23 @@ int ucdn_get_script(uint32_t code)
return get_ucd_record(code)->script;
}

/*
 * Decode one code point from a UTF-16 sequence and advance the cursor.
 *
 * code_ptr: in/out pointer to the current position in an array of UTF-16
 *           code units. On return it has been advanced past the units
 *           consumed (one for a BMP unit, two for a surrogate pair).
 *
 * Returns the decoded code point.
 *
 * The input is not validated: a lead surrogate is assumed to be followed
 * by a trail surrogate.
 */
static uint32_t decode_utf16(unsigned short **code_ptr)
{
    unsigned short *code = *code_ptr;

    /*
     * A lead (high) surrogate lies in 0xd800..0xdbff, i.e. its top six
     * bits are 110110. Masking with 0xd800 instead would also match
     * ordinary BMP units such as 0xf900..0xffff and misdecode them as
     * surrogate pairs.
     */
    if ((code[0] & 0xfc00) != 0xd800) {
        *code_ptr += 1;
        return (uint32_t)code[0];
    }

    *code_ptr += 2;
    return 0x10000 + ((uint32_t)code[1] - 0xdc00) +
        (((uint32_t)code[0] - 0xd800) << 10);
}

int ucdn_decompose(uint32_t code, uint32_t *a, uint32_t *b)
{
unsigned int *rec;
unsigned short *rec;
int len;

if (hangul_pair_decompose(code, a, b))
Expand All @@ -215,9 +229,10 @@ int ucdn_decompose(uint32_t code, uint32_t *a, uint32_t *b)
if ((rec[0] & 0xff) != 0 || len == 0)
return 0;

*a = rec[1];
rec++;
*a = decode_utf16(&rec);
if (len > 1)
*b = rec[2];
*b = decode_utf16(&rec);
else
*b = 0;

Expand Down Expand Up @@ -250,14 +265,15 @@ int ucdn_compose(uint32_t *code, uint32_t a, uint32_t b)
int ucdn_compat_decompose(uint32_t code, uint32_t *decomposed)
{
int i, len;
unsigned int *rec = get_decomp_record(code);
unsigned short *rec = get_decomp_record(code);
len = rec[0] >> 8;

if (len == 0)
return 0;

rec++;
for (i = 0; i < len; i++)
decomposed[i] = rec[i + 1];
decomposed[i] = decode_utf16(&rec);

return len;
}
Loading

0 comments on commit 8affcda

Please sign in to comment.