Skip to content

Commit 0ce9852

Browse files
committed
Parser: improve error message handling
* use a single `on_error` handler with error level and message arguments * remove the `Warning` token type, which was never returned anyway * improve `#error` and `#warning` message parsing consistency * make `num_error` messages non-fatal * fix `#warning` behavior
1 parent 6f8bf69 commit 0ce9852

File tree

4 files changed

+44
-46
lines changed

4 files changed

+44
-46
lines changed

parser/c2_parser.c2

Lines changed: 8 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,6 @@ public fn void Parser.parse(Parser* p, i32 file_id, bool is_interface, bool is_g
121121
p.kwinfo,
122122
p.features,
123123
on_tokenizer_error,
124-
on_tokenizer_warning,
125124
p,
126125
false);
127126
p.tok.init();
@@ -137,17 +136,16 @@ public fn void Parser.parse(Parser* p, i32 file_id, bool is_interface, bool is_g
137136
buf.free();
138137
}
139138

140-
fn void on_tokenizer_error(void* arg, SrcLoc loc) {
139+
fn void on_tokenizer_error(void* arg, c2_tokenizer.ErrorLevel level, SrcLoc loc, const char* message) {
141140
Parser* p = arg;
142-
// NOTE: cannot use p.tok.error_msg, because of possible lookahead (changes token)
143-
p.tok.loc = loc;
144-
// will longjmp
145-
p.error("%s", p.tokenizer.error_msg);
146-
}
147141

148-
fn void on_tokenizer_warning(void* arg, SrcLoc loc) {
149-
Parser* p = arg;
150-
p.diags.error(loc, "%s", p.tokenizer.error_msg);
142+
if (level) {
143+
p.diags.error(loc, "%s", message);
144+
} else {
145+
p.diags.warn(loc, "%s", message);
146+
}
147+
if (level == c2_tokenizer.ErrorLevel.FatalError)
148+
longjmp(&p.jmpbuf, 1);
151149
}
152150

153151
fn void Parser.consumeToken(Parser* p) {
@@ -822,10 +820,6 @@ fn void Parser.dump_token(Parser* p, const Token* tok) @(unused) {
822820
out.add(p.pool.idx2str(tok.text_idx));
823821
out.add("*/");
824822
break;
825-
case Warning:
826-
out.color(color.Yellow);
827-
out.add(tok.error_msg);
828-
break;
829823
case Error:
830824
out.color(color.Red);
831825
out.add(p.tokenizer.error_msg);

parser/c2_tokenizer.c2

Lines changed: 35 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,8 @@ public type Feature struct {
255255
bool is_else; // inside the #else block
256256
}
257257

258-
public type HandlerFn fn void (void* arg, SrcLoc loc);
258+
public type ErrorLevel enum u8 { Warning, Error, FatalError }
259+
public type ErrorFn fn void (void* arg, ErrorLevel level, SrcLoc loc, const char* msg);
259260

260261
public type Tokenizer struct {
261262
const char* cur;
@@ -270,9 +271,8 @@ public type Tokenizer struct {
270271

271272
string_pool.Pool* pool; // no ownership
272273
string_buffer.Buf* buf; // no ownership, used for strings and character constants
273-
HandlerFn on_error;
274-
HandlerFn on_warning;
275-
void* fn_arg;
274+
ErrorFn on_error;
275+
void* on_error_arg;
276276

277277
// Feature handling
278278
Feature[constants.MaxFeatureDepth+1] feature_stack;
@@ -283,7 +283,7 @@ public type Tokenizer struct {
283283

284284
char[256] error_msg;
285285
}
286-
static_assert(1448, sizeof(Tokenizer));
286+
static_assert(1440, sizeof(Tokenizer));
287287

288288
public fn void Tokenizer.init(Tokenizer* t,
289289
string_pool.Pool* pool,
@@ -292,9 +292,8 @@ public fn void Tokenizer.init(Tokenizer* t,
292292
SrcLoc loc_start,
293293
const keywords.Info* kwinfo,
294294
const string_list.List* features,
295-
HandlerFn on_error,
296-
HandlerFn on_warning,
297-
void* fn_arg,
295+
ErrorFn on_error,
296+
void* on_error_arg,
298297
bool raw_mode)
299298
{
300299
string.memset(t, 0, sizeof(Tokenizer));
@@ -307,8 +306,7 @@ public fn void Tokenizer.init(Tokenizer* t,
307306
t.pool = pool;
308307
t.buf = buf;
309308
t.on_error = on_error;
310-
t.on_warning = on_warning;
311-
t.fn_arg = fn_arg;
309+
t.on_error_arg = on_error_arg;
312310

313311
t.features = features;
314312
t.raw_mode = raw_mode;
@@ -708,7 +706,7 @@ fn void Tokenizer.error(Tokenizer* t, Token* result, const char* format @(printf
708706
result.kind = Kind.Error;
709707
result.error_msg = t.error_msg;
710708
result.done = true;
711-
if (t.on_error) t.on_error(t.fn_arg, result.loc);
709+
if (t.on_error) t.on_error(t.on_error_arg, ErrorLevel.FatalError, result.loc, t.error_msg);
712710
}
713711

714712
// generate an error but keep parsing
@@ -736,7 +734,7 @@ fn void Tokenizer.num_error(Tokenizer* t, Token* result, const char* p, const ch
736734
}
737735
t.cur = p;
738736
result.len = (u16)((p - t.input_start) - (result.loc - t.loc_start));
739-
if (t.on_warning) t.on_warning(t.fn_arg, result.loc);
737+
if (t.on_error) t.on_error(t.on_error_arg, ErrorLevel.Error, result.loc, t.error_msg);
740738
}
741739

742740
fn void Tokenizer.lex_identifier(Tokenizer* t, Token* result) {
@@ -1490,28 +1488,36 @@ fn bool Tokenizer.at_bol(Tokenizer* t) {
14901488

14911489
fn bool Tokenizer.parse_error_warn(Tokenizer* t, Token* result, Kind kind) {
14921490
const char* start = t.cur;
1493-
while (*t.cur != '\0' && *t.cur != '\r' && *t.cur != '\n')
1494-
t.cur++;
1495-
usize len = (usize)(t.cur - start);
1496-
if (len > constants.MaxErrorMsgLen) {
1497-
t.error(result, "error msg too long (max %d bytes)", constants.MaxErrorMsgLen);
1498-
return true;
1491+
1492+
// parse pptokens instead of raw text
1493+
string_buffer.Buf* msg = string_buffer.create_static(elemsof(t.error_msg), false, t.error_msg);
1494+
SrcLoc last_loc = 0;
1495+
while (t.lex_preproc(result) != Kind.Eof) {
1496+
// replace blanks with a single space
1497+
if (last_loc && last_loc < result.loc) msg.add1(' ');
1498+
// copy string text or token source
1499+
if (result.kind == Kind.StringLiteral) {
1500+
msg.add2(t.pool.idx2str(result.text_idx), result.text_len);
1501+
} else {
1502+
msg.add2(t.input_start + (result.loc - t.loc_start), result.len);
1503+
}
1504+
last_loc = result.loc + result.len;
14991505
}
1500-
char[constants.MaxErrorMsgLen+1] msg;
1501-
string.memcpy(msg, start, len);
1502-
msg[len] = 0;
1506+
msg.size(); // ensure null terminator
15031507

15041508
if (kind == Kind.Feat_error) {
1505-
t.cur = t.line_start;
1506-
t.error(result, "%s", msg);
1507-
} else {
1508-
// TODO: output diagnostic synchronously
1509-
string.strcpy(t.error_msg, msg);
1510-
result.kind = Kind.Warning;
1511-
result.len = (u16)((t.cur - t.input_start) - (result.loc - t.loc_start));
1509+
t.cur = t.line_start; // restart on the same line
1510+
result.kind = Kind.Error;
1511+
result.done = true;
1512+
result.loc = t.loc_start + (SrcLoc)(t.line_start - t.input_start);
1513+
result.len = (u16)(t.cur - start);
15121514
result.error_msg = t.error_msg;
1515+
if (t.on_error) t.on_error(t.on_error_arg, ErrorLevel.FatalError, result.loc, t.error_msg);
1516+
return true; // return error token with result.done set
1517+
} else {
1518+
if (t.on_error) t.on_error(t.on_error_arg, ErrorLevel.Warning, result.loc, t.error_msg);
1519+
return false; // continue reading tokens
15131520
}
1514-
return true;
15151521
}
15161522

15171523
fn bool Tokenizer.is_enabled(const Tokenizer* t) {

parser/token.c2

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,6 @@ public type Kind enum u8 {
146146
BlockComment,
147147
// Special Tokens
148148
Eof,
149-
Warning,
150149
Error,
151150
}
152151

@@ -285,7 +284,6 @@ const char*[] token_names = {
285284
[Kind.LineComment] = "l-comment",
286285
[Kind.BlockComment] = "b-comment",
287286
[Kind.Eof] = "eof",
288-
[Kind.Warning] = "warning",
289287
[Kind.Error] = "error",
290288
}
291289

tools/c2cat.c2

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -304,7 +304,7 @@ public fn i32 c2cat(const char* filename)
304304
keywords.Info kwinfo;
305305
kwinfo.init(ctx.pool);
306306
c2_tokenizer.Tokenizer tokenizer;
307-
tokenizer.init(ctx.pool, buf, ctx.input, 0, &kwinfo, &features, nil, nil, nil, true);
307+
tokenizer.init(ctx.pool, buf, ctx.input, 0, &kwinfo, &features, nil, nil, true);
308308
ctx.tokenizer = &tokenizer;
309309

310310
Token tok;

0 commit comments

Comments
 (0)