Skip to content

Commit 330a395

Browse files
committed
Instead of blindly trusting the first byte, decode UTF-8 safely.
1 parent e241137 commit 330a395

File tree

2 files changed

+76
-22
lines changed

2 files changed

+76
-22
lines changed

LICENSE

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@ released under the following license:
1818
rgbfix was rewritten from scratch by Anthony J. Bentley, and is released
1919
under the ISC license; see the source file for the text of the license.
2020

21+
The UTF-8 decoder in src/asm/charmap.c was written by Björn Höhrmann and is
22+
released under the MIT license. The remainder of charmap.c was written by
23+
stag019, and is released under the ISC license.
24+
2125
extern/err.c is derived from the Musl C library, http://www.musl-libc.org,
2226
and is released under the MIT license.
2327

src/asm/charmap.c

Lines changed: 72 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,57 @@
1+
/*
2+
* UTF-8 decoder copyright © 2008–2009 Björn Höhrmann <bjoern@hoehrmann.de>
3+
* http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
4+
*
5+
* Permission is hereby granted, free of charge, to any person obtaining a copy
6+
* of this software and associated documentation files (the "Software"), to
7+
* deal in the Software without restriction, including without limitation the
8+
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
9+
* sell copies of the Software, and to permit persons to whom the Software is
10+
* furnished to do so, subject to the following conditions:
11+
*
12+
* The above copyright notice and this permission notice shall be included in
13+
* all copies or substantial portions of the Software.
14+
*
15+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20+
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21+
* IN THE SOFTWARE.
22+
*/
23+
24+
#include <stdint.h>
25+
26+
/*
 * Lookup table driving the UTF-8 decoding automaton used by decode().
 *
 * Entries 0..255 give the character class of each possible byte value.
 * The entries from index 256 onwards form the state transition table,
 * 16 entries per state: decode() reads utf8d[256 + state*16 + class] to
 * obtain the next state. In this file a resulting state of 0 is treated
 * as "character complete" and a state of 1 as "invalid input".
 */
static const uint8_t utf8d[] = {
27+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
28+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
29+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
30+
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
31+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
32+
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
33+
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
34+
0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
35+
0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
36+
0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
37+
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
38+
1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
39+
1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
40+
1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
41+
};
42+
43+
uint32_t
44+
decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
45+
uint32_t type = utf8d[byte];
46+
47+
*codep = (*state != 0) ?
48+
(byte & 0x3fu) | (*codep << 6) :
49+
(0xff >> type) & (byte);
50+
51+
*state = utf8d[256 + *state*16 + type];
52+
return *state;
53+
}
54+
155
/*
256
* Copyright © 2013 stag019 <[email protected]>
357
*
@@ -28,30 +82,26 @@ struct Charmap globalCharmap = {0};
2882
extern struct Section *pCurrentSection;
2983

3084
/*
 * Copy a single UTF-8 encoded character from src to dest.
 *
 * Each byte of src is passed through decode() until the decoder reports
 * that a complete character has been read (state 0). The bytes making up
 * that character are copied to dest, followed by a terminating '\0'.
 *
 * dest: output buffer; must have room for the character's bytes plus the
 *       terminating '\0'.
 * src:  input text, read starting at src[0].
 *
 * Returns the number of bytes copied, not counting the terminator.
 * Calls fatalerror() if the decoder reports an invalid sequence.
 */
int
readUTF8Char(char *dest, char *src)
{
	uint32_t state = 0;
	uint32_t codep;
	int i = 0;

	for (;;) {
		/*
		 * Every byte must go through the decoder exactly once;
		 * skipping one would desynchronise the automaton on
		 * multi-byte sequences.
		 */
		if (decode(&state, &codep, (uint8_t)src[i]) == 1) {
			fatalerror("invalid UTF-8 character");
		}

		dest[i] = src[i];
		i++;

		/* State 0 means the character just copied is complete. */
		if (state == 0) {
			dest[i] = '\0';
			return i;
		}
	}
}
56106

57107
int

0 commit comments

Comments
 (0)