Skip to content
This repository was archived by the owner on Jun 15, 2023. It is now read-only.

Commit de843ea

Browse files
author
Iwan
committed
## Unicode support
This PR adds support for Unicode code points at the syntax level: ReScript source code is now Unicode text encoded in UTF-8. Fixes #397

### Codepoint literals
A codepoint literal represents an integer value identifying a Unicode code point. It is expressed as one or more characters enclosed in single quotes. Examples are `'x'`, `'\n'` or `'\u{00A9}'`. Multiple UTF-8-encoded bytes may represent a single integer value.

### String literals
String literals are (possibly multi-byte) UTF-8 encoded character sequences between double quotes, as in `"fox"`.

### New escape sequences
Both codepoint and string literals accept the following new escape sequences:

1) Unicode escape sequences
Any character with a character code lower than 65536 can be escaped using the hexadecimal value of its character code, prefixed with `\u`. Unicode escapes are six characters long: they require exactly four hexadecimal characters following `\u`. If the hexadecimal character code is only one, two or three characters long, you'll need to pad it with leading zeroes. Example: `'\u2665'` (represents ♥)

2) Unicode codepoint escape sequences
Any code point or character can be escaped using the hexadecimal value of its character code, prefixed with `\u{` and suffixed with `}`. This allows for code points up to 0x10FFFF, which is the highest code point defined by Unicode. Unicode code point escapes consist of at least five characters: at least one hexadecimal character must be wrapped in `\u{…}`. There is no upper limit on the number of hex digits in use (for example, `'\u{000000000061}' == 'a'`). Example: `'\u{2318}'` (represents ⌘)
1 parent d5986f9 commit de843ea

20 files changed

+472
-87
lines changed

.depend

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,10 @@ src/res_comment.cmx : src/res_comment.cmi
1212
src/res_comment.cmi :
1313
src/res_comments_table.cmx : src/res_parsetree_viewer.cmx src/res_doc.cmx \
1414
src/res_comment.cmx
15-
src/res_core.cmx : src/res_token.cmx src/res_scanner.cmx src/res_printer.cmx \
16-
src/res_parser.cmx src/res_js_ffi.cmx src/res_grammar.cmx src/res_doc.cmx \
17-
src/res_diagnostics.cmx src/res_comments_table.cmx src/res_core.cmi
15+
src/res_core.cmx : src/res_utf8.cmx src/res_token.cmx src/res_scanner.cmx \
16+
src/res_printer.cmx src/res_parser.cmx src/res_js_ffi.cmx \
17+
src/res_grammar.cmx src/res_doc.cmx src/res_diagnostics.cmx \
18+
src/res_comments_table.cmx src/res_core.cmi
1819
src/res_core.cmi : src/res_parser.cmi
1920
src/res_diagnostics.cmx : src/res_token.cmx src/res_grammar.cmx \
2021
src/res_diagnostics_printing_utils.cmx src/res_diagnostics.cmi
@@ -60,16 +61,19 @@ src/res_parser.cmi : src/res_token.cmx src/res_scanner.cmi \
6061
src/res_comment.cmi
6162
src/res_parsetree_viewer.cmx : src/res_parsetree_viewer.cmi
6263
src/res_parsetree_viewer.cmi :
63-
src/res_printer.cmx : src/res_token.cmx src/res_parsetree_viewer.cmx \
64-
src/res_parens.cmx src/res_doc.cmx src/res_comments_table.cmx \
65-
src/res_comment.cmx src/res_printer.cmi
64+
src/res_printer.cmx : src/res_utf8.cmx src/res_token.cmx \
65+
src/res_parsetree_viewer.cmx src/res_parens.cmx src/res_doc.cmx \
66+
src/res_comments_table.cmx src/res_comment.cmx src/res_printer.cmi
6667
src/res_printer.cmi : src/res_doc.cmi src/res_comments_table.cmx \
6768
src/res_comment.cmi
6869
src/res_reporting.cmx : src/res_token.cmx src/res_grammar.cmx
69-
src/res_scanner.cmx : src/res_token.cmx src/res_diagnostics.cmx \
70-
src/res_comment.cmx src/res_scanner.cmi
70+
src/res_scanner.cmx : src/res_utf8.cmx src/res_token.cmx \
71+
src/res_diagnostics.cmx src/res_comment.cmx src/res_scanner.cmi
7172
src/res_scanner.cmi : src/res_token.cmx src/res_diagnostics.cmi
7273
src/res_token.cmx : src/res_comment.cmx
73-
tests/res_test.cmx : src/res_token.cmx src/res_parser.cmx \
74-
src/res_outcome_printer.cmx src/res_multi_printer.cmx src/res_io.cmx \
75-
src/res_driver.cmx src/res_core.cmx
74+
src/res_utf8.cmx : src/res_utf8.cmi
75+
src/res_utf8.cmi :
76+
tests/res_test.cmx : tests/res_utf8_test.cmx src/res_token.cmx \
77+
src/res_parser.cmx src/res_outcome_printer.cmx src/res_multi_printer.cmx \
78+
src/res_io.cmx src/res_driver.cmx src/res_core.cmx
79+
tests/res_utf8_test.cmx : src/res_utf8.cmx

Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ API_FILES = \
2424
src/res_parsetree_viewer.cmx\
2525
src/res_parens.cmx\
2626
src/res_comments_table.cmx\
27+
src/res_utf8.cmx\
2728
src/res_printer.cmx\
2829
src/res_scanner.cmx\
2930
src/res_js_ffi.cmx\
@@ -40,7 +41,7 @@ API_FILES = \
4041

4142
CLI_FILES = $(API_FILES) src/res_cli.cmx
4243

43-
TEST_FILES = $(API_FILES) tests/res_test.cmx
44+
TEST_FILES = $(API_FILES) tests/res_utf8_test.cmx tests/res_test.cmx
4445

4546
.DEFAULT_GOAL := build-native
4647

benchmarks/Benchmark.ml

Lines changed: 0 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ module Printer = Res_printer
66

77
module IO: sig
88
val readFile: string -> string
9-
val readStdin: unit -> string
109
end = struct
1110
(* random chunk size: 2^15, TODO: why do we guess randomly? *)
1211
let chunkSize = 32768
@@ -26,21 +25,6 @@ end = struct
2625
)
2726
in
2827
loop ()
29-
30-
let readStdin () =
31-
let buffer = Buffer.create chunkSize in
32-
let chunk = (Bytes.create [@doesNotRaise]) chunkSize in
33-
let rec loop () =
34-
let len = try input stdin chunk 0 chunkSize with Invalid_argument _ -> 0 in
35-
if len == 0 then (
36-
close_in_noerr stdin;
37-
Buffer.contents buffer
38-
) else (
39-
Buffer.add_subbytes buffer chunk 0 len;
40-
loop ()
41-
)
42-
in
43-
loop ()
4428
end
4529

4630
module Time: sig
@@ -188,29 +172,6 @@ end = struct
188172
done
189173
end
190174

191-
module Profile: sig
192-
val record : name:string -> (unit -> 'a) -> 'a
193-
val print: unit -> unit
194-
end = struct
195-
let state = Hashtbl.create 2
196-
197-
let record ~name f =
198-
let startTime = Time.now() in
199-
let result = f() in
200-
let endTime = Time.now() in
201-
202-
Hashtbl.add state name (Time.diff startTime endTime);
203-
result
204-
205-
let print () =
206-
let report = Hashtbl.fold (fun k v acc ->
207-
let line = Printf.sprintf "%s: %fms\n" k (Time.print v) in
208-
acc ^ line
209-
) state "\n\n"
210-
in
211-
print_endline report
212-
end
213-
214175
module Benchmarks: sig
215176
val run: unit -> unit
216177
end = struct

src/res_ast_debugger.ml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -143,10 +143,13 @@ module SexpAst = struct
143143
string txt;
144144
optChar tag;
145145
]
146-
| Pconst_char c ->
146+
| Pconst_char _ ->
147+
Sexp.list [
148+
Sexp.atom "Pconst_char";
149+
]
150+
| Pconst_string(_, Some "INTERNAL_RES_CHAR_CONTENTS") ->
147151
Sexp.list [
148152
Sexp.atom "Pconst_char";
149-
Sexp.atom (Char.escaped c);
150153
]
151154
| Pconst_string (txt, tag) ->
152155
Sexp.list [

src/res_cli.ml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,7 @@ module ResClflags: sig
165165
val file: string ref
166166
val interface: bool ref
167167
val ppx: string ref
168+
val typechecker: bool ref
168169

169170
val parse: unit -> unit
170171
end = struct
@@ -176,6 +177,7 @@ end = struct
176177
let interface = ref false
177178
let ppx = ref ""
178179
let file = ref ""
180+
let typechecker = ref false
179181

180182
let usage = "\n**This command line is for the repo developer's testing purpose only. DO NOT use it in production**!\n\n" ^
181183
"Usage:\n rescript <options> <file>\n\n" ^
@@ -192,6 +194,7 @@ end = struct
192194
("-width", Arg.Int (fun w -> width := w), "Specify the line length for the printer (formatter)");
193195
("-interface", Arg.Unit (fun () -> interface := true), "Parse as interface");
194196
("-ppx", Arg.String (fun txt -> ppx := txt), "Apply a specific built-in ppx before parsing, none or jsx. Default: none");
197+
("-typechecker", Arg.Unit (fun () -> typechecker := true), "Parses the ast as it would be passed to the typechecker and not the printer")
195198
]
196199

197200
let parse () = Arg.parse spec (fun f -> file := f) usage
@@ -200,7 +203,7 @@ end
200203
module CliArgProcessor = struct
201204
type backend = Parser: ('diagnostics) Res_driver.parsingEngine -> backend [@@unboxed]
202205

203-
let processFile ~isInterface ~width ~recover ~origin ~target ~ppx filename =
206+
let processFile ~isInterface ~width ~recover ~origin ~target ~ppx ~typechecker filename =
204207
let len = String.length filename in
205208
let processInterface =
206209
isInterface || len > 0 && (String.get [@doesNotRaise]) filename (len - 1) = 'i'
@@ -233,7 +236,7 @@ module CliArgProcessor = struct
233236
in
234237

235238
let forPrinter = match target with
236-
| "res" | "sexp" -> true
239+
| "res" | "sexp" when not typechecker -> true
237240
| _ -> false
238241
in
239242

@@ -292,5 +295,6 @@ let [@raises Invalid_argument, Failure, exit] () =
292295
~target:!ResClflags.print
293296
~origin:!ResClflags.origin
294297
~ppx:!ResClflags.ppx
298+
~typechecker:!ResClflags.typechecker
295299
!ResClflags.file
296300
end

src/res_core.ml

Lines changed: 55 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,9 @@ type stringLiteralState =
143143
| HexEscape
144144
| DecimalEscape
145145
| OctalEscape
146+
| UnicodeEscape
147+
| UnicodeCodePointEscape
148+
| UnicodeEscapeStart
146149
| EscapedLineBreak
147150

148151
type typDefOrExt =
@@ -482,15 +485,12 @@ let processUnderscoreApplication args =
482485
in
483486
(args, wrap)
484487

485-
let hexValue x =
486-
match x with
487-
| '0' .. '9' ->
488-
(Char.code x) - 48
489-
| 'A' .. 'Z' ->
490-
(Char.code x) - 55
491-
| 'a' .. 'z' ->
492-
(Char.code x) - 97
493-
| _ -> 16
488+
let hexValue ch =
489+
match ch with
490+
| '0'..'9' -> (Char.code ch) - 48
491+
| 'a'..'f' -> (Char.code ch) - (Char.code 'a') + 10
492+
| 'A'..'F' -> (Char.code ch) + 32 - (Char.code 'a') + 10
493+
| _ -> 16 (* larger than any legal value *)
494494

495495
let parseStringLiteral s =
496496
let len = String.length s in
@@ -499,7 +499,7 @@ let parseStringLiteral s =
499499
let rec parse state i d =
500500
if i = len then
501501
(match state with
502-
| HexEscape | DecimalEscape | OctalEscape -> false
502+
| HexEscape | DecimalEscape | OctalEscape | UnicodeEscape | UnicodeCodePointEscape -> false
503503
| _ -> true)
504504
else
505505
let c = String.unsafe_get s i in
@@ -517,6 +517,7 @@ let parseStringLiteral s =
517517
| ('\\' | ' ' | '\'' | '"') as c -> Buffer.add_char b c; parse Start (i + 1) d
518518
| 'x' -> parse HexEscape (i + 1) 0
519519
| 'o' -> parse OctalEscape (i + 1) 0
520+
| 'u' -> parse UnicodeEscapeStart (i + 1) 0
520521
| '0' .. '9' -> parse DecimalEscape i 0
521522
| '\010' | '\013' -> parse EscapedLineBreak (i + 1) d
522523
| c -> Buffer.add_char b '\\'; Buffer.add_char b c; parse Start (i + 1) d)
@@ -558,6 +559,45 @@ let parseStringLiteral s =
558559
)
559560
else
560561
parse OctalEscape (i + 1) (d + 1)
562+
| UnicodeEscapeStart ->
563+
(match c with
564+
| '{' -> parse UnicodeCodePointEscape (i + 1) 0
565+
| _ -> parse UnicodeEscape (i + 1) 1)
566+
| UnicodeEscape ->
567+
if d == 3 then
568+
let c0 = String.unsafe_get s (i - 3) in
569+
let c1 = String.unsafe_get s (i - 2) in
570+
let c2 = String.unsafe_get s (i - 1) in
571+
let c3 = String.unsafe_get s i in
572+
let c = (4096 * (hexValue c0)) + (256 * (hexValue c1)) + (16 * (hexValue c2)) + (hexValue c3) in
573+
if Res_utf8.isValidCodePoint c then (
574+
let codePoint = Res_utf8.encodeCodePoint c in
575+
Buffer.add_string b codePoint;
576+
parse Start (i + 1) 0
577+
) else (
578+
false
579+
)
580+
else
581+
parse UnicodeEscape (i + 1) (d + 1)
582+
| UnicodeCodePointEscape ->
583+
(match c with
584+
| '0'..'9' | 'a'..'f' | 'A'.. 'F' ->
585+
parse UnicodeCodePointEscape (i + 1) (d + 1)
586+
| '}' ->
587+
let x = ref 0 in
588+
for remaining = d downto 1 do
589+
let ix = i - remaining in
590+
x := (!x * 16) + (hexValue (String.unsafe_get s ix));
591+
done;
592+
let c = !x in
593+
if Res_utf8.isValidCodePoint c then (
594+
let codePoint = Res_utf8.encodeCodePoint !x in
595+
Buffer.add_string b codePoint;
596+
parse Start (i + 1) 0
597+
) else (
598+
false
599+
)
600+
| _ -> false)
561601
| EscapedLineBreak ->
562602
(match c with
563603
| ' ' | '\t' -> parse EscapedLineBreak (i + 1) d
@@ -877,7 +917,11 @@ let parseConstant p =
877917
s
878918
in
879919
Pconst_string(txt, None)
880-
| Character c -> Pconst_char c
920+
| Character {c; original} ->
921+
if p.mode = ParseForTypeChecker then
922+
Pconst_char c
923+
else
924+
Pconst_string (original, Some "INTERNAL_RES_CHAR_CONTENTS")
881925
| token ->
882926
Parser.err p (Diagnostics.unexpected token p.breadcrumbs);
883927
Pconst_string("", None)

src/res_outcome_printer.ml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -348,7 +348,7 @@ let printPolyVarIdent txt =
348348
| [], [] -> Doc.nil
349349
| labels, types ->
350350
let i = ref 0 in
351-
let package = Doc.join ~sep:Doc.line (List.map2 (fun lbl typ ->
351+
let package = Doc.join ~sep:Doc.line ((List.map2 [@doesNotRaise]) (fun lbl typ ->
352352
Doc.concat [
353353
Doc.text (if i.contents > 0 then "and " else "with ");
354354
Doc.text lbl;

src/res_printer.ml

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -508,14 +508,32 @@ let printConstant c = match c with
508508
Doc.text "\"";
509509
]
510510
| Pconst_string (txt, Some prefix) ->
511-
Doc.concat [
512-
if prefix = "js" then Doc.nil else Doc.text prefix;
513-
Doc.text "`";
514-
printStringContents txt;
515-
Doc.text "`";
516-
]
511+
if prefix = "INTERNAL_RES_CHAR_CONTENTS" then
512+
Doc.concat [Doc.text "'"; Doc.text txt; Doc.text "'"]
513+
else
514+
Doc.concat [
515+
if prefix = "js" then Doc.nil else Doc.text prefix;
516+
Doc.text "`";
517+
printStringContents txt;
518+
Doc.text "`";
519+
]
517520
| Pconst_float (s, _) -> Doc.text s
518-
| Pconst_char c -> Doc.text ("'" ^ (Char.escaped c) ^ "'")
521+
| Pconst_char c ->
522+
let str = match c with
523+
| '\'' -> "\\'"
524+
| '\\' -> "\\\\"
525+
| '\n' -> "\\n"
526+
| '\t' -> "\\t"
527+
| '\r' -> "\\r"
528+
| '\b' -> "\\b"
529+
| ' ' .. '~' as c ->
530+
let s = (Bytes.create [@doesNotRaise]) 1 in
531+
Bytes.unsafe_set s 0 c;
532+
Bytes.unsafe_to_string s
533+
| c ->
534+
Res_utf8.encodeCodePoint (Obj.magic c)
535+
in
536+
Doc.text ("'" ^ str ^ "'")
519537

520538
let rec printStructure (s : Parsetree.structure) t =
521539
match s with

0 commit comments

Comments
 (0)