Skip to content

Commit

Permalink
Add initial support for katakana ##charset
Browse files Browse the repository at this point in the history
* Support multibyte charsets
  • Loading branch information
radare authored and trufae committed Feb 16, 2022
1 parent 99a3887 commit 33ce7e7
Show file tree
Hide file tree
Showing 7 changed files with 264 additions and 18 deletions.
2 changes: 2 additions & 0 deletions libr/util/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ OBJS+=ascii_table.o protobuf.o graph_drawable.o axml.o sstext.o new_rbtree.o
ifeq (${HAVE_GPERF},1)
OBJS+=d/ascii.o
OBJS+=d/pokered.o
OBJS+=d/katakana.o
OBJS+=d/hiragana.o
OBJS+=d/ebcdic37.o
OBJS+=d/iso8859_1.o
endif
Expand Down
77 changes: 60 additions & 17 deletions libr/util/charset.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* radare - LGPL - Copyright 2020-2021 - gogo, pancake */
/* radare - LGPL - Copyright 2020-2022 - gogo, pancake */

#include <r_util.h>
#include <config.h>
Expand All @@ -7,15 +7,19 @@

#if HAVE_GPERF
extern SdbGperf gperf_ascii;
extern SdbGperf gperf_pokered;
extern SdbGperf gperf_ebcdic37;
extern SdbGperf gperf_hiragana;
extern SdbGperf gperf_iso8859_1;
extern SdbGperf gperf_katakana;
extern SdbGperf gperf_pokered;

static const SdbGperf *gperfs[] = {
&gperf_ascii,
&gperf_pokered,
&gperf_ebcdic37,
&gperf_hiragana,
&gperf_iso8859_1,
&gperf_katakana,
&gperf_pokered,
NULL
};

Expand Down Expand Up @@ -223,6 +227,7 @@ R_API size_t r_charset_encode_str(RCharset *rc, ut8 *out, size_t out_len, const
}
fine = true;
r_str_unescape (res);
// memcpy (o, res, out_len - i);
r_str_ncpy (o, res, out_len - i);
free (res);
}
Expand Down Expand Up @@ -250,33 +255,71 @@ R_API size_t r_charset_decode_str(RCharset *rc, ut8 *out, size_t out_len, const
if (!str) {
break;
}
r_str_ncpy (str, (char *)in + cur, toread);
memcpy (str, in + cur, toread);
bool found = false;
for (j = toread; cur < in_len && j > 0; j--) {
left = in_len - cur + 1;
toread = R_MIN (left, maxkeylen);
//zero terminate the string
str[j] = '\0';

str[j] = 0;
const char *v = sdb_const_get (rc->db_char_to_hex, (char *) str, 0);
if (v) {
int repeat = !strncmp (v, "0x", 2)? strlen (v + 2) / 2: 1;
ut64 nv = r_num_get (NULL, v);
if (!nv) {
int i;
// write 0x00 N times (N = repeat, one per encoded byte)
for (i = 0; i < repeat; i++) {
// write null byte
memcpy (o, "\x00", 2);
o++;
}
o--;
found = true;
break;
}
//convert to ascii
char *str_hx = malloc (1 + maxkeylen);
if (!str_hx) {
break;
}
//in the future handle multiple chars output
snprintf (str_hx, maxkeylen + 1, "%c", (char) strtol (v, 0, 16));
const char *ret = r_str_get_fail (str_hx, "?");
if (nv > 0xff) {
ut64 d = 0;
r_mem_swapendian ((ut8*)&d, (const ut8*)&nv, 8);
nv = d;
}
int i;
bool skip = true;
int chcount = 0;
for (i = 0; i < 8; i++) {
ut8 bv = nv & 0xff;
// skip zero bytes until the first non-zero byte is found
if (bv & 0xff) {
skip = false;
}
if (skip) {
nv >>= 8;
continue;
} else if (!bv) {
break;
}
// eprintf ("-> 0x%02x\n", nv & 0xff);
//in the future handle multiple chars output
str_hx[0] = bv;
str_hx[1] = 0;
const char *ret = r_str_get_fail (str_hx, "?");

// concatenate
const size_t ll = R_MIN (left, strlen (ret) + 1);
if (ll > 0) {
r_str_ncpy (o, ret, ll);
o += ll - 1;
// concatenate
const size_t ll = R_MIN (left, strlen (ret) + 1);
if (ll > 0) {
memcpy (o, ret, ll);
o[ll] = 0;
o += ll - 1;
chcount++;
}
found = true;
nv >>= 8;
}
found = true;
cur += j - 1;
cur += (chcount>1)?chcount - 2:j-1;
free (str_hx);
break;
}
Expand Down
2 changes: 1 addition & 1 deletion libr/util/d/Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FILES=pokered ascii ebcdic37 iso8859_1
FILES=pokered ascii ebcdic37 iso8859_1 katakana hiragana
F_SDB=$(addsuffix .sdb,$(FILES))
SDB=../../../shlr/sdb/sdb

Expand Down
87 changes: 87 additions & 0 deletions libr/util/d/hiragana.sdb.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# utf8 (hex) to phoneme translation for Japanese's Hiragana charset; small kana are lowercase, full-size are uppercase
0xa=
0xe38181=a
0xe38182=A
0xe38183=i
0xe38184=I
0xe38185=u
0xe38186=U
0xe38187=e
0xe38188=E
0xe38189=o
0xe3818a=O
0xe3818b=ka
0xe3818c=ga
0xe3818d=ki
0xe3818e=gi
0xe3818f=ku
0xe38190=gu
0xe38191=ke
0xe38192=ge
0xe38193=ko
0xe38194=go
0xe38195=sa
0xe38196=za
0xe38197=si
0xe38198=zi
0xe38199=su
0xe3819a=zu
0xe3819b=se
0xe3819c=ze
0xe3819d=so
0xe3819e=zo
0xe3819f=ta
0xe381a0=da
0xe381a1=ti
0xe381a2=di
# small tu (U+3063) is lowercase, full-size tu (U+3064) is uppercase
0xe381a3=tu
0xe381a4=TU
0xe381a5=du
0xe381a6=te
0xe381a7=de
0xe381a8=to
0xe381a9=do
0xe381aa=na
0xe381ab=ni
0xe381ac=nu
0xe381ad=ne
0xe381ae=no
0xe381af=ha
0xe381b0=ba
0xe381b1=pa
0xe381b2=hi
0xe381b3=bi
0xe381b4=pi
0xe381b5=hu
0xe381b6=bu
0xe381b7=pu
0xe381b8=he
0xe381b9=be
0xe381ba=pe
0xe381bb=ho
0xe381bc=bo
0xe381bd=po
0xe381be=ma
0xe381bf=mi
# unicode hole
0xe38280=mu
0xe38281=me
0xe38282=mo
0xe38283=ya
0xe38284=YA
0xe38285=yu
0xe38286=YU
0xe38287=yo
0xe38288=YO
0xe38289=ra
0xe3828a=ri
0xe3828b=ru
0xe3828c=re
0xe3828d=ro
0xe3828e=wa
0xe3828f=WA
0xe38290=wi
0xe38291=we
0xe38292=wo
0xe38293=n
93 changes: 93 additions & 0 deletions libr/util/d/katakana.sdb.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# utf8 (hex) to phoneme translation for Japanese's Katakana charset; small kana are lowercase, full-size are uppercase
0xa=
0xe382a1=a
0xe382a2=A
0xe382a3=i
0xe382a4=I
0xe382a5=u
0xe382a6=U
0xe382a7=e
0xe382a8=E
0xe382a9=o
0xe382aa=O
0xe382ab=ka
0xe382ac=ga
0xe382ad=ki
0xe382ae=gi
0xe382af=ku
0xe382b0=gu
0xe382b1=ke
0xe382b2=ge
0xe382b3=ko
0xe382b4=go
0xe382b5=sa
0xe382b6=za
0xe382b7=si
0xe382b8=zi
0xe382b9=su
0xe382ba=zu
0xe382bb=se
0xe382bc=ze
0xe382bd=so
0xe382be=zo
0xe382bf=ta
## unicode hole
0xe38380=da
0xe38381=ti
0xe38382=di
# small tu (U+30C3) is lowercase, full-size tu (U+30C4) is uppercase
0xe38383=tu
0xe38384=TU
0xe38385=du
0xe38386=te
0xe38387=de
0xe38388=to
0xe38389=do
0xe3838a=na
0xe3838b=ni
0xe3838c=nu
0xe3838d=ne
0xe3838e=no
0xe3838f=ha
0xe38390=ba
0xe38391=pa
0xe38392=hi
0xe38393=bi
0xe38394=pi
0xe38395=hu
0xe38396=bu
0xe38397=pu
0xe38398=he
0xe38399=be
0xe3839a=pe
0xe3839b=ho
0xe3839c=bo
0xe3839d=po
0xe3839e=ma
0xe3839f=mi
0xe383a0=mu
0xe383a1=me
0xe383a2=mo
0xe383a3=ya
0xe383a4=YA
0xe383a5=yu
0xe383a6=YU
0xe383a7=yo
0xe383a8=YO
0xe383a9=ra
0xe383aa=ri
0xe383ab=ru
0xe383ac=re
0xe383ad=ro
0xe383ae=wa
0xe383af=WA
0xe383b0=wi
0xe383b1=we
0xe383b2=wo
0xe383b3=n
0xe383b4=vu
0xe383b5=ka
0xe383b6=ke
# another hole
0xe3829b="
0xe3829c=.
2 changes: 2 additions & 0 deletions libr/util/d/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ sdb_files = [
'pokered',
'ebcdic37',
'ascii',
'katakana',
'hiragana',
'iso8859_1'
]

Expand Down
19 changes: 19 additions & 0 deletions test/db/cmd/charset
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,9 @@ EOF
EXPECT=<<EOF
ascii
ebcdic37
hiragana
iso8859_1
katakana
pokered
\xa6\xb1\xae\xb4\xad\xa3\x7f\xb3\xae\x7f\xaf\xb1\xae\xb3\xa4\xa2\xb3I\xa8\xb3\xb2\xa4\xab\xa5\x7f\xa5\xb1\xae\xac\x7f\xa8\xb3
ground-to-protect<PAGE>itself-fr
Expand All @@ -95,14 +97,18 @@ EOF
EXPECT=<<EOF
ascii
ebcdic37
hiragana
iso8859_1
katakana
pokered
--------Self-replicating----arbi
--------Self-replicating----arbi
--------Self-replicating----arbi
ascii
ebcdic37
hiragana
iso8859_1
katakana
pokered
--------Self-replicating----arbi
--------Self-replicating----arbi
Expand Down Expand Up @@ -144,3 +150,16 @@ EXPECT=<<EOF
AAA[nul]AAA[stx]
EOF
RUN

NAME=ps on katakana
FILE=-
CMDS=<<EOF
e cfg.charset=katakana
w radare
pr 9
?e
EOF
EXPECT=<<EOF
ラダレ
EOF
RUN

0 comments on commit 33ce7e7

Please sign in to comment.