llvm · cmtice · Feb 5, 2025 · Jan 19, 2025 · Jan 19, 2025 · Jan 26, 2025
@@ -0,0 +1,132 @@
+//===-- DILLexer.h ----------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_VALUEOBJECT_DILLEXER_H_
+#define LLDB_VALUEOBJECT_DILLEXER_H_
-#ifndef LLDB_VALUEOBJECT_DILLEXER_H_
-#define LLDB_VALUEOBJECT_DILLEXER_H_
+#ifndef LLDB_VALUEOBJECT_DILLEXER_H
+#define LLDB_VALUEOBJECT_DILLEXER_H
-#ifndef LLDB_VALUEOBJECT_DILLEXER_H_
-#define LLDB_VALUEOBJECT_DILLEXER_H_
+#ifndef LLDB_VALUEOBJECT_DILLEXER_H
+#define LLDB_VALUEOBJECT_DILLEXER_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Support/Error.h"
+#include <cstdint>
+#include <limits.h>
-#include <limits.h>
+#include <climits>
-#include <limits.h>
+#include <climits>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace lldb_private::dil {
+
+/// Class defining the tokens generated by the DIL lexer and used by the
+/// DIL parser.
+class Token {
+public:
+  /// The kinds of token the lexer can hand to the parser.
+  enum Kind {
+    coloncolon,
+    eof,
+    identifier,
+    l_paren,
+    r_paren,
+    unknown,
+  };
+
+  /// Builds a token of the given kind. \p spelling is the token's text and
+  /// \p start is its offset within the entire expression string.
+  Token(Kind kind, std::string spelling, uint32_t start)
+      : m_kind(kind), m_spelling(std::move(spelling)), m_start_pos(start) {}
+
+  /// Returns the kind of this token.
+  Kind GetKind() const { return m_kind; }
+
+  /// Returns a copy of the token's text.
+  std::string GetSpelling() const { return m_spelling; }
+
+  /// Returns true if this token is of kind \p kind.
+  bool Is(Kind kind) const { return GetKind() == kind; }
+
+  /// Returns true if this token is not of kind \p kind.
+  bool IsNot(Kind kind) const { return !Is(kind); }
+
+  /// Returns true if this token is of either of the two given kinds.
+  bool IsOneOf(Kind kind1, Kind kind2) const {
+    if (Is(kind1))
+      return true;
+    return Is(kind2);
+  }
+
+  /// Returns true if this token matches any of the given kinds; peels off
+  /// the first kind and recurses on the rest until the two-kind overload
+  /// applies.
+  template <typename... Ts> bool IsOneOf(Kind kind, Ts... Ks) const {
+    if (Is(kind))
+      return true;
+    return IsOneOf(Ks...);
+  }
+
+  /// Returns the offset of the token's first character within the entire
+  /// expression string.
+  uint32_t GetLocation() const { return m_start_pos; }
+
+  /// Returns a printable name for \p kind.
+  static llvm::StringRef GetTokenName(Kind kind);
+
+private:
+  Kind m_kind;
+  std::string m_spelling;
+  uint32_t m_start_pos; // within entire expression string
+};
+
+/// Class for doing the simple lexing required by DIL.
+///
+/// All tokens are lexed up front by Create(); the parser then walks them
+/// through a cursor (the 'current' token index). The cursor starts out as
+/// UINT_MAX, meaning "no current token yet".
+class DILLexer {
+public:
+  /// Lexes all the tokens in expr and calls the private constructor
+  /// with the lexed tokens.
+  static llvm::Expected<DILLexer> Create(llvm::StringRef expr);
+
+  /// Return the current token to be handled by the DIL parser.
+  ///
+  /// There is no current token until Advance() has been called (or the
+  /// index has been set with ResetTokenIdx()); calling this earlier is a
+  /// programming error.
+  const Token &GetCurrentToken() const {
+    assert(m_tokens_idx < m_lexed_tokens.size() && "no current token");
+    return m_lexed_tokens[m_tokens_idx];
+  }
+
+  /// Advance the current token position by N.
+  ///
+  /// If there is no current position yet, this moves to the first token
+  /// regardless of N. Advancing past the end stops on the last token.
+  void Advance(uint32_t N = 1) {
+    if (m_tokens_idx == UINT_MAX) {
+      // UINT_MAX means uninitialized, no "current" position, so move to
+      // start.
+      m_tokens_idx = 0;
+    } else if (static_cast<uint64_t>(m_tokens_idx) + N >=
+               m_lexed_tokens.size()) {
+      // N is too large; advance to the end of the lexed tokens. The sum is
+      // formed in 64 bits so that a huge N cannot wrap around and slip past
+      // this check.
+      m_tokens_idx = m_lexed_tokens.size() - 1;
+    } else {
+      m_tokens_idx += N;
+    }
+  }
+
+  /// Return the lexed token N positions ahead of the 'current' token
+  /// being handled by the DIL parser.
+  ///
+  /// The sum below is computed in 32-bit unsigned arithmetic: while the
+  /// index is still UINT_MAX it wraps, so LookAhead(N) with N >= 1 yields
+  /// token N - 1. Looking past the end yields the last token.
+  const Token &LookAhead(uint32_t N) const {
+    if (m_tokens_idx + N < m_lexed_tokens.size())
+      return m_lexed_tokens[m_tokens_idx + N];
+
+    // Last token should be an 'eof' token.
+    return m_lexed_tokens.back();
+  }
+
+  /// Return the index for the 'current' token being handled by the DIL parser.
+  uint32_t GetCurrentTokenIdx() const { return m_tokens_idx; }
+
+  /// Set the index for the 'current' token (to be handled by the parser)
+  /// to a particular position. Used for either committing 'look ahead' parsing
+  /// or rolling back tentative parsing.
+  void ResetTokenIdx(uint32_t new_value) {
+    assert(new_value == UINT_MAX || new_value < m_lexed_tokens.size());
+    m_tokens_idx = new_value;
+  }
+
+  /// Return the number of tokens that were lexed.
+  uint32_t NumLexedTokens() const { return m_lexed_tokens.size(); }
+
+private:
+  DILLexer(llvm::StringRef dil_expr, std::vector<Token> lexed_tokens)
+      : m_expr(dil_expr), m_lexed_tokens(std::move(lexed_tokens)),
+        m_tokens_idx(UINT_MAX) {}
+
+  /// Lexes one token from the front of remainder, which is updated to
+  /// refer to the text that follows the token.
+  static llvm::Expected<Token> Lex(llvm::StringRef expr,
+                                   llvm::StringRef &remainder);
+
+  // The input string we are lexing & parsing.
+  llvm::StringRef m_expr;
+
+  // Holds all of the tokens lexed so far.
+  std::vector<Token> m_lexed_tokens;
+
+  // Index into m_lexed_tokens; indicates which token the DIL parser is
+  // currently trying to parse/handle.
+  uint32_t m_tokens_idx;
+};
+
+} // namespace lldb_private::dil
+
+#endif // LLDB_VALUEOBJECT_DILLEXER_H
@@ -1,4 +1,5 @@
 add_lldb_library(lldbValueObject
+  DILLexer.cpp
   ValueObject.cpp
   ValueObjectCast.cpp
   ValueObjectChild.cpp

@@ -0,0 +1,128 @@
+//===-- DILLexer.cpp ------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// This implements the lexer for the Data Inspection Language (DIL), which
+// produces the tokens consumed by the DIL parser. DIL will eventually
+// underlie the 'frame variable' command. The language is described in
+// lldb/docs/dil-expr-lang.ebnf
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/ValueObject/DILLexer.h"
+#include "lldb/Utility/Status.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace lldb_private::dil {
+
+/// Returns a printable name for the given token kind.
+llvm::StringRef Token::GetTokenName(Kind kind) {
+  // No 'default' label on purpose: the compiler can then warn when a new
+  // Kind is added without a matching case here.
+  switch (kind) {
+  case Kind::coloncolon:
+    return "coloncolon";
+  case Kind::eof:
+    return "eof";
+  case Kind::identifier:
+    return "identifier";
+  case Kind::l_paren:
+    return "l_paren";
+  case Kind::r_paren:
+    return "r_paren";
+  case Kind::unknown:
+    return "unknown";
+  }
+  // Only reachable if 'kind' holds a value outside the enumeration; without
+  // this, control would fall off the end of a non-void function.
+  llvm_unreachable("Unknown token kind");
+}
+
+// Returns true if c lies in 'a'..'z' or 'A'..'Z'.
+static bool IsLetter(char c) {
+  const bool is_lower = c >= 'a' && c <= 'z';
+  const bool is_upper = c >= 'A' && c <= 'Z';
+  return is_lower || is_upper;
+}
+
+// Returns true if c lies in '0'..'9'.
+static bool IsDigit(char c) {
+  return !(c < '0' || c > '9');
+}
+
+// A word starts with a letter, underscore, or dollar sign, followed by
+// letters ('a'..'z','A'..'Z'), digits ('0'..'9'), and/or underscores. A lone
+// '$' is not a word.
+//
+// If remainder starts with a word, the word is returned and removed from the
+// front of remainder. Otherwise std::nullopt is returned and remainder is
+// left untouched, so the caller can try to lex something else at the same
+// position. expr is not needed to find the word; the parameter is kept so
+// existing callers stay unchanged.
+static std::optional<llvm::StringRef>
+IsWord([[maybe_unused]] llvm::StringRef expr, llvm::StringRef &remainder) {
+  // Scan on a copy: nothing may be consumed unless a valid word is found.
+  llvm::StringRef rest = remainder;
+  rest.consume_front("$"); // initial '$' is valid
+  rest = rest.drop_while(
+      [](char c) { return IsDigit(c) || IsLetter(c) || c == '_'; });
+
+  // Everything the scan skipped over is the candidate word.
+  llvm::StringRef candidate =
+      remainder.take_front(remainder.size() - rest.size());
+  if (candidate.empty() || candidate == "$" || IsDigit(candidate[0]))
+    return std::nullopt;
+
+  remainder = rest;
+  return candidate;
+}
+
+/// Lexes the whole of expr and returns a lexer holding the resulting
+/// tokens, the last of which is the eof token. The first lexing failure
+/// is returned as the error.
+llvm::Expected<DILLexer> DILLexer::Create(llvm::StringRef expr) {
+  std::vector<Token> lexed;
+  llvm::StringRef unlexed = expr;
+  for (;;) {
+    llvm::Expected<Token> maybe_token = Lex(expr, unlexed);
+    if (!maybe_token)
+      return maybe_token.takeError();
+
+    lexed.push_back(std::move(*maybe_token));
+    // The eof token is stored too; it marks the end of the token list.
+    if (lexed.back().GetKind() == Token::eof)
+      break;
+  }
+  return DILLexer(expr, std::move(lexed));
+}
+
+/// Lexes a single token from the front of remainder.
+///
+/// \param expr       The entire expression string; used to compute token
+///                   positions. remainder is expected to be a tail of it.
+/// \param remainder  The text not lexed yet. Leading whitespace and, on
+///                   success, the lexed token are removed from its front.
+/// \return The lexed token (an eof token once the input is exhausted), or an
+///         error if the input does not start with a recognized token.
+llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
+                                    llvm::StringRef &remainder) {
+  // Skip over leading whitespace.
+  remainder = remainder.ltrim();
+  llvm::StringRef::iterator cur_pos = remainder.begin();
+
+  // Check to see if we've reached the end of our input string.
+  if (remainder.empty())
+    return Token(Token::eof, "", static_cast<uint32_t>(expr.size()));
+
+  // Record where the token starts before anything is consumed.
+  uint32_t position = cur_pos - expr.begin();
+  std::optional<llvm::StringRef> maybe_word = IsWord(expr, remainder);
+  if (maybe_word)
+    return Token(Token::identifier, maybe_word->str(), position);
+
+  constexpr std::pair<Token::Kind, const char *> operators[] = {
+      {Token::l_paren, "("},
+      {Token::r_paren, ")"},
+      {Token::coloncolon, "::"},
+  };
+  for (auto [kind, str] : operators) {
+    if (remainder.consume_front(str))
+      return Token(kind, str, position);
+  }
+
+  // Unrecognized character(s) in string; unable to lex it.
+  return llvm::createStringError("Unable to lex input string");
+}
+
+} // namespace lldb_private::dil
@@ -1,10 +1,12 @@
 add_lldb_unittest(LLDBValueObjectTests
   DumpValueObjectOptionsTests.cpp
+  DILLexerTests.cpp
 
   LINK_LIBS
     lldbValueObject
     lldbPluginPlatformLinux
     lldbPluginScriptInterpreterNone
+    LLVMTestingSupport
 
   LINK_COMPONENTS
     Support