[LLDB] Add Lexer (with tests) for DIL (Data Inspection Language).

cmtice · cmtice · commit 468f73f8539d · 2025-01-19T09:15:34.000-08:00
This adds the basic lexer, with unittests, for the Data Inspection Language (DIL) -- see https://discourse.llvm.org/t/rfc-data-inspection-language/69893 This version of the lexer only handles local variables and namespaces, and is designed to work with llvm#120971.
diff --git a/lldb/include/lldb/ValueObject/DILLexer.h b/lldb/include/lldb/ValueObject/DILLexer.h
@@ -0,0 +1,156 @@
+//===-- DILLexer.h ----------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_VALUEOBJECT_DILLEXER_H_
+#define LLDB_VALUEOBJECT_DILLEXER_H_
+
+#include "llvm/ADT/StringRef.h"
+#include <cstdint>
+#include <limits.h>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace lldb_private {
+
+namespace dil {
+
+enum class TokenKind {
+  coloncolon,
+  eof,
+  identifier,
+  invalid,
+  kw_namespace,
+  l_paren,
+  none,
+  r_paren,
+  unknown,
+};
+
+/// Class defining the tokens generated by the DIL lexer and used by the
+/// DIL parser.
+class DILToken {
+public:
+  DILToken(dil::TokenKind kind, std::string spelling, uint32_t start)
+      : m_kind(kind), m_spelling(spelling), m_start_pos(start) {}
+
+  DILToken() : m_kind(dil::TokenKind::none), m_spelling(""), m_start_pos(0) {}
+
+  void setKind(dil::TokenKind kind) { m_kind = kind; }
+  dil::TokenKind getKind() const { return m_kind; }
+
+  std::string getSpelling() const { return m_spelling; }
+
+  uint32_t getLength() const { return m_spelling.size(); }
+
+  bool is(dil::TokenKind kind) const { return m_kind == kind; }
+
+  bool isNot(dil::TokenKind kind) const { return m_kind != kind; }
+
+  bool isOneOf(dil::TokenKind kind1, dil::TokenKind kind2) const {
+    return is(kind1) || is(kind2);
+  }
+
+  template <typename... Ts> bool isOneOf(dil::TokenKind kind, Ts... Ks) const {
+    return is(kind) || isOneOf(Ks...);
+  }
+
+  uint32_t getLocation() const { return m_start_pos; }
+
+  void setValues(dil::TokenKind kind, std::string spelling, uint32_t start) {
+    m_kind = kind;
+    m_spelling = spelling;
+    m_start_pos = start;
+  }
+
+  static const std::string getTokenName(dil::TokenKind kind);
+
+private:
+  dil::TokenKind m_kind;
+  std::string m_spelling;
+  uint32_t m_start_pos; // within entire expression string
+};
+
+/// Class for doing the simple lexing required by DIL.
+class DILLexer {
+public:
+  DILLexer(llvm::StringRef dil_expr) : m_expr(dil_expr.str()) {
+    m_cur_pos = m_expr.begin();
+    // Use UINT_MAX to indicate invalid/uninitialized value.
+    m_tokens_idx = UINT_MAX;
+  }
+
+  bool Lex(DILToken &result, bool look_ahead = false);
+
+  bool Is_Word(std::string::iterator start, uint32_t &length);
+
+  uint32_t GetLocation() { return m_cur_pos - m_expr.begin(); }
+
+  /// Update 'result' with the other paremeter values, create a
+  /// duplicate token, and push the duplicate token onto the vector of
+  /// lexed tokens.
+  void UpdateLexedTokens(DILToken &result, dil::TokenKind tok_kind,
+                         std::string tok_str, uint32_t tok_pos);
+
+  /// Return the lexed token N+1 positions ahead of the 'current' token
+  /// being handled by the DIL parser.
+  const DILToken &LookAhead(uint32_t N);
+
+  const DILToken &AcceptLookAhead(uint32_t N);
+
+  /// Return the index for the 'current' token being handled by the DIL parser.
+  uint32_t GetCurrentTokenIdx() { return m_tokens_idx; }
+
+  /// Return the current token to be handled by the DIL parser.
+  DILToken &GetCurrentToken() { return m_lexed_tokens[m_tokens_idx]; }
+
+  /// Update the index for the 'current' token, to point to the next lexed
+  /// token.
+  bool IncrementTokenIdx() {
+    if (m_tokens_idx >= m_lexed_tokens.size() - 1)
+      return false;
+
+    m_tokens_idx++;
+    return true;
+  }
+
+  /// Set the index for the 'current' token (to be handled by the parser)
+  /// to a particular position. Used for either committing 'look ahead' parsing
+  /// or rolling back tentative parsing.
+  bool ResetTokenIdx(uint32_t new_value) {
+    if (new_value > m_lexed_tokens.size() - 1)
+      return false;
+
+    m_tokens_idx = new_value;
+    return true;
+  }
+
+private:
+  // The input string we are lexing & parsing.
+  std::string m_expr;
+
+  // The current position of the lexer within m_expr (the character position,
+  // within the string, of the next item to be lexed).
+  std::string::iterator m_cur_pos;
+
+  // Holds all of the tokens lexed so far.
+  std::vector<DILToken> m_lexed_tokens;
+
+  // Index into m_lexed_tokens; indicates which token the DIL parser is
+  // currently trying to parse/handle.
+  uint32_t m_tokens_idx;
+
+  // "invalid" token; to be returned by lexer when 'look ahead' fails.
+  DILToken m_invalid_token;
+};
+
+} // namespace dil
+
+} // namespace lldb_private
+
+#endif // LLDB_VALUEOBJECT_DILLEXER_H_
diff --git a/lldb/source/ValueObject/DILLexer.cpp b/lldb/source/ValueObject/DILLexer.cpp
@@ -0,0 +1,205 @@
+//===-- DILLexer.cpp ------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// This implements the recursive descent parser for the Data Inspection
+// Language (DIL), and its helper functions, which will eventually underlie the
+// 'frame variable' command. The language that this parser recognizes is
+// described in lldb/docs/dil-expr-lang.ebnf
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/ValueObject/DILLexer.h"
+#include "llvm/ADT/StringMap.h"
+
+namespace lldb_private {
+
+namespace dil {
+
+// For fast keyword lookup. More keywords will be added later.
+const llvm::StringMap<dil::TokenKind> Keywords = {
+    {"namespace", dil::TokenKind::kw_namespace},
+};
+
+const std::string DILToken::getTokenName(dil::TokenKind kind) {
+  switch (kind) {
+  case dil::TokenKind::coloncolon:
+    return "coloncolon";
+  case dil::TokenKind::eof:
+    return "eof";
+  case dil::TokenKind::identifier:
+    return "identifier";
+  case dil::TokenKind::kw_namespace:
+    return "namespace";
+  case dil::TokenKind::l_paren:
+    return "l_paren";
+  case dil::TokenKind::r_paren:
+    return "r_paren";
+  case dil::TokenKind::unknown:
+    return "unknown";
+  default:
+    return "token_name";
+  }
+}
+
+static bool Is_Letter(char c) {
+  if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
+    return true;
+  return false;
+}
+
+static bool Is_Digit(char c) { return ('0' <= c && c <= '9'); }
+
+// A word starts with a letter, underscore, or dollar sign, followed by
+// letters ('a'..'z','A'..'Z'), digits ('0'..'9'), and/or  underscores.
+bool DILLexer::Is_Word(std::string::iterator start, uint32_t &length) {
+  bool done = false;
+  bool dollar_start = false;
+
+  // Must not start with a digit.
+  if (m_cur_pos == m_expr.end() || Is_Digit(*m_cur_pos))
+    return false;
+
+  // First character *may* be a '$', for a register name or convenience
+  // variable.
+  if (*m_cur_pos == '$') {
+    dollar_start = true;
+    ++m_cur_pos;
+    length++;
+  }
+
+  // Contains only letters, digits or underscores
+  for (; m_cur_pos != m_expr.end() && !done; ++m_cur_pos) {
+    char c = *m_cur_pos;
+    if (!Is_Letter(c) && !Is_Digit(c) && c != '_') {
+      done = true;
+      break;
+    } else
+      length++;
+  }
+
+  if (dollar_start && length > 1) // Must have something besides just '$'
+    return true;
+
+  if (!dollar_start && length > 0)
+    return true;
+
+  // Not a valid word, so re-set the lexing position.
+  m_cur_pos = start;
+  return false;
+}
+
+void DILLexer::UpdateLexedTokens(DILToken &result, dil::TokenKind tok_kind,
+                                 std::string tok_str, uint32_t tok_pos) {
+  DILToken new_token;
+  result.setValues(tok_kind, tok_str, tok_pos);
+  new_token = result;
+  m_lexed_tokens.push_back(std::move(new_token));
+}
+
+bool DILLexer::Lex(DILToken &result, bool look_ahead) {
+  bool retval = true;
+
+  if (!look_ahead) {
+    // We're being asked for the 'next' token, and not a part of a LookAhead.
+    // Check to see if we've already lexed it and pushed it onto our tokens
+    // vector; if so, return the next token from the vector, rather than doing
+    // more lexing.
+    if ((m_tokens_idx != UINT_MAX) &&
+        (m_tokens_idx < m_lexed_tokens.size() - 1)) {
+      result = m_lexed_tokens[m_tokens_idx + 1];
+      return retval;
+    }
+  }
+
+  // Skip over whitespace (spaces).
+  while (m_cur_pos != m_expr.end() && *m_cur_pos == ' ')
+    m_cur_pos++;
+
+  // Check to see if we've reached the end of our input string.
+  if (m_cur_pos == m_expr.end()) {
+    UpdateLexedTokens(result, dil::TokenKind::eof, "", m_expr.length());
+    return retval;
+  }
+
+  uint32_t position = m_cur_pos - m_expr.begin();
+  ;
+  std::string::iterator start = m_cur_pos;
+  uint32_t length = 0;
+  if (Is_Word(start, length)) {
+    dil::TokenKind kind;
+    std::string word = m_expr.substr(position, length);
+    auto iter = Keywords.find(word);
+    if (iter != Keywords.end())
+      kind = iter->second;
+    else
+      kind = dil::TokenKind::identifier;
+
+    UpdateLexedTokens(result, kind, word, position);
+    return true;
+  }
+
+  switch (*m_cur_pos) {
+  case '(':
+    m_cur_pos++;
+    UpdateLexedTokens(result, dil::TokenKind::l_paren, "(", position);
+    return true;
+  case ')':
+    m_cur_pos++;
+    UpdateLexedTokens(result, dil::TokenKind::r_paren, ")", position);
+    return true;
+  case ':':
+    if (position + 1 < m_expr.size() && m_expr[position + 1] == ':') {
+      m_cur_pos += 2;
+      UpdateLexedTokens(result, dil::TokenKind::coloncolon, "::", position);
+      return true;
+    }
+    break;
+  default:
+    break;
+  }
+  // Empty Token
+  result.setValues(dil::TokenKind::none, "", m_expr.length());
+  return false;
+}
+
+const DILToken &DILLexer::LookAhead(uint32_t N) {
+  uint32_t extra_lexed_tokens = m_lexed_tokens.size() - m_tokens_idx - 1;
+
+  if (N + 1 < extra_lexed_tokens)
+    return m_lexed_tokens[m_tokens_idx + N + 1];
+
+  uint32_t remaining_tokens =
+      (m_tokens_idx + N + 1) - m_lexed_tokens.size() + 1;
+
+  bool done = false;
+  bool look_ahead = true;
+  while (!done && remaining_tokens > 0) {
+    DILToken tok;
+    Lex(tok, look_ahead);
+    if (tok.getKind() == dil::TokenKind::eof)
+      done = true;
+    remaining_tokens--;
+  };
+
+  if (remaining_tokens > 0) {
+    m_invalid_token.setValues(dil::TokenKind::invalid, "", 0);
+    return m_invalid_token;
+  }
+
+  return m_lexed_tokens[m_tokens_idx + N + 1];
+}
+
+const DILToken &DILLexer::AcceptLookAhead(uint32_t N) {
+  if (m_tokens_idx + N + 1 > m_lexed_tokens.size())
+    return m_invalid_token;
+
+  m_tokens_idx += N + 1;
+  return m_lexed_tokens[m_tokens_idx];
+}
+
+} // namespace dil
+
+} // namespace lldb_private
diff --git a/lldb/unittests/ValueObject/CMakeLists.txt b/lldb/unittests/ValueObject/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_lldb_unittest(LLDBValueObjectTests
   DumpValueObjectOptionsTests.cpp
+  DILLexerTests.cpp
 
   LINK_LIBS
     lldbValueObject
diff --git a/lldb/unittests/ValueObject/DILLexerTests.cpp b/lldb/unittests/ValueObject/DILLexerTests.cpp