Skip to content

Commit

Permalink
FEAT: Unicode utils module with decode-utf8 function
Browse files Browse the repository at this point in the history
  • Loading branch information
Oldes committed Oct 26, 2023
1 parent 45749f7 commit bee8e09
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/boot/sysobj.reb
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,7 @@ modules: object [
httpd: https://src.rebol.tech/modules/httpd.reb
prebol: https://src.rebol.tech/modules/prebol.reb
spotify: https://src.rebol.tech/modules/spotify.reb
unicode-utils: https://src.rebol.tech/modules/unicode-utils.reb
daytime: https://src.rebol.tech/mezz/prot-daytime.reb
mail: https://src.rebol.tech/mezz/prot-mail.reb
mysql: https://src.rebol.tech/mezz/prot-mysql.reb
Expand Down
68 changes: 68 additions & 0 deletions src/modules/unicode-utils.reb
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
Rebol [
;; Module header for the Unicode utilities module.
title: "Unicode utils"
name: unicode-utils
type: module
version: 0.1.0
;; decode-utf8 is the only word listed for export.
exports: [decode-utf8]
author: @Oldes
file: %unicode-utils.reb
home: https://src.rebol.tech/modules/unicode-utils.reb
;; Attribution for the DFA-based decoder the implementation is derived from.
note: {
Based on Bjoern Hoehrmann's C code:
Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
}
]

decode-utf8: closure/with [
    "Converts UTF8 encoded binary to Rebol string ignoring chars outside the Basic Multilingual Plane (BMP)."
    bin [binary! file! url!] "Source data in the UTF-8 encoding"
    /html "Converts chars over BMP to HTML entities instead of ignoring these"
][
    ;; Decoding is driven by a table-based DFA (see `utf8d` below).
    ;; Ill-formed input (stray continuation bytes, truncated or overlong
    ;; sequences, bytes that can never appear in UTF-8) is skipped and the
    ;; decoder resynchronizes on the next byte, so one bad byte does not
    ;; swallow the rest of the input.
    ;; Returns the decoded string.
    unless binary? bin [bin: read/binary bin]
    state: UTF8_ACCEPT
    codep: 0
    str: make string! length? bin
    foreach byte bin [
        if byte < 128 [
            ;; ASCII fast path. An ASCII byte also terminates any unfinished
            ;; multi-byte sequence, so the automaton must be reset here;
            ;; otherwise a later continuation byte would be glued to the
            ;; stale lead byte.
            state: UTF8_ACCEPT
            codep: 0
            append str to char! byte
            continue
        ]
        type: pickz utf8d byte
        prior: state
        codep: either state = UTF8_ACCEPT [
            ;; lead byte: the class doubles as the shift that masks off the prefix bits
            (0#ff >> type) & byte
        ][
            ;; continuation byte: append its 6 payload bits
            (byte & 0#3f) | (codep << 6)
        ]
        state: pickz utf8d (256 + state + type)
        if all [state = UTF8_REJECT  prior <> UTF8_ACCEPT] [
            ;; The byte broke an unfinished sequence. It may itself be a valid
            ;; lead byte, so retry it from the start state.
            codep: (0#ff >> type) & byte
            state: pickz utf8d (256 + UTF8_ACCEPT + type)
        ]
        case [
            state = UTF8_REJECT [
                ;; Invalid byte: drop it and restart. Without this the
                ;; automaton would stay in the reject state forever.
                state: UTF8_ACCEPT
                codep: 0
            ]
            state = UTF8_ACCEPT [
                case [
                    codep <= 0#FFFF [ append str to char! codep ]
                    html [ append str ajoin ["&#" codep #";"] ]
                ]
                codep: 0
            ]
        ]
    ]
    str
][
    utf8d: #[u8! [
        ;; The first part of the table maps bytes to character classes
        ;; to reduce the size of the transition table and create bitmasks.
        0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9
        7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
        8 8 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
        10 3 3 3 3 3 3 3 3 3 3 3 3 4 3 3 11 6 6 6 5 8 8 8 8 8 8 8 8 8 8 8

        ;; The second part is a transition table that maps a combination
        ;; of a state of the automaton and a character class to a state.
        0 12 24 36 60 96 84 12 12 12 48 72 12 12 12 12 12 12 12 12 12 12 12 12
        12 0 12 12 12 12 12 0 12 0 12 12 12 24 12 12 12 12 12 24 12 24 12 12
        12 12 12 12 12 12 12 24 12 12 12 12 12 24 12 12 12 12 12 12 12 24 12 12
        12 12 12 12 12 12 12 36 12 36 12 12 12 36 12 12 12 12 12 36 12 36 12 12
        12 36 12 12 12 12 12 12 12 12 12 12
    ]]

    ;; DFA states: 0 = a complete code point has been decoded,
    ;; 12 = the input is not valid UTF-8 at this byte.
    UTF8_ACCEPT: 0
    UTF8_REJECT: 12
]

0 comments on commit bee8e09

Please sign in to comment.