Skip to content

Commit 468f73f

Browse files
committed
[LLDB] Add Lexer (with tests) for DIL (Data Inspection Language).
This adds the basic lexer, with unittests, for the Data Inspection Language (DIL) -- see https://discourse.llvm.org/t/rfc-data-inspection-language/69893 This version of the lexer only handles local variables and namespaces, and is designed to work with llvm#120971.
1 parent 23a2392 commit 468f73f

File tree

4 files changed

+555
-0
lines changed

4 files changed

+555
-0
lines changed
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
//===-- DILLexer.h ----------------------------------------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLDB_VALUEOBJECT_DILLEXER_H_
10+
#define LLDB_VALUEOBJECT_DILLEXER_H_
11+
12+
#include "llvm/ADT/StringRef.h"
13+
#include <cstdint>
14+
#include <limits.h>
15+
#include <memory>
16+
#include <string>
17+
#include <vector>
18+
19+
namespace lldb_private {
20+
21+
namespace dil {
22+
23+
/// Kinds of token that the DIL lexer hands to the DIL parser.
enum class TokenKind {
24+
coloncolon, // The two-character "::" sequence.
25+
eof, // End of the input expression.
26+
identifier, // A word that is not in the keyword table.
27+
invalid, // Kind given to the token returned when a look-ahead fails.
28+
kw_namespace, // The keyword "namespace".
29+
l_paren, // "("
30+
none, // Kind of a default-constructed token; also set when lexing fails.
31+
r_paren, // ")"
32+
unknown, // NOTE(review): not produced by the lexer code shown here -- confirm intended use.
33+
};
34+
35+
/// Class defining the tokens generated by the DIL lexer and used by the
36+
/// DIL parser.
37+
class DILToken {
38+
public:
39+
DILToken(dil::TokenKind kind, std::string spelling, uint32_t start)
40+
: m_kind(kind), m_spelling(spelling), m_start_pos(start) {}
41+
42+
DILToken() : m_kind(dil::TokenKind::none), m_spelling(""), m_start_pos(0) {}
43+
44+
void setKind(dil::TokenKind kind) { m_kind = kind; }
45+
dil::TokenKind getKind() const { return m_kind; }
46+
47+
std::string getSpelling() const { return m_spelling; }
48+
49+
uint32_t getLength() const { return m_spelling.size(); }
50+
51+
bool is(dil::TokenKind kind) const { return m_kind == kind; }
52+
53+
bool isNot(dil::TokenKind kind) const { return m_kind != kind; }
54+
55+
bool isOneOf(dil::TokenKind kind1, dil::TokenKind kind2) const {
56+
return is(kind1) || is(kind2);
57+
}
58+
59+
template <typename... Ts> bool isOneOf(dil::TokenKind kind, Ts... Ks) const {
60+
return is(kind) || isOneOf(Ks...);
61+
}
62+
63+
uint32_t getLocation() const { return m_start_pos; }
64+
65+
void setValues(dil::TokenKind kind, std::string spelling, uint32_t start) {
66+
m_kind = kind;
67+
m_spelling = spelling;
68+
m_start_pos = start;
69+
}
70+
71+
static const std::string getTokenName(dil::TokenKind kind);
72+
73+
private:
74+
dil::TokenKind m_kind;
75+
std::string m_spelling;
76+
uint32_t m_start_pos; // within entire expression string
77+
};
78+
79+
/// Class for doing the simple lexing required by DIL.
80+
class DILLexer {
81+
public:
82+
DILLexer(llvm::StringRef dil_expr) : m_expr(dil_expr.str()) {
83+
m_cur_pos = m_expr.begin();
84+
// Use UINT_MAX to indicate invalid/uninitialized value.
85+
m_tokens_idx = UINT_MAX;
86+
}
87+
88+
bool Lex(DILToken &result, bool look_ahead = false);
89+
90+
bool Is_Word(std::string::iterator start, uint32_t &length);
91+
92+
uint32_t GetLocation() { return m_cur_pos - m_expr.begin(); }
93+
94+
/// Update 'result' with the other paremeter values, create a
95+
/// duplicate token, and push the duplicate token onto the vector of
96+
/// lexed tokens.
97+
void UpdateLexedTokens(DILToken &result, dil::TokenKind tok_kind,
98+
std::string tok_str, uint32_t tok_pos);
99+
100+
/// Return the lexed token N+1 positions ahead of the 'current' token
101+
/// being handled by the DIL parser.
102+
const DILToken &LookAhead(uint32_t N);
103+
104+
const DILToken &AcceptLookAhead(uint32_t N);
105+
106+
/// Return the index for the 'current' token being handled by the DIL parser.
107+
uint32_t GetCurrentTokenIdx() { return m_tokens_idx; }
108+
109+
/// Return the current token to be handled by the DIL parser.
110+
DILToken &GetCurrentToken() { return m_lexed_tokens[m_tokens_idx]; }
111+
112+
/// Update the index for the 'current' token, to point to the next lexed
113+
/// token.
114+
bool IncrementTokenIdx() {
115+
if (m_tokens_idx >= m_lexed_tokens.size() - 1)
116+
return false;
117+
118+
m_tokens_idx++;
119+
return true;
120+
}
121+
122+
/// Set the index for the 'current' token (to be handled by the parser)
123+
/// to a particular position. Used for either committing 'look ahead' parsing
124+
/// or rolling back tentative parsing.
125+
bool ResetTokenIdx(uint32_t new_value) {
126+
if (new_value > m_lexed_tokens.size() - 1)
127+
return false;
128+
129+
m_tokens_idx = new_value;
130+
return true;
131+
}
132+
133+
private:
134+
// The input string we are lexing & parsing.
135+
std::string m_expr;
136+
137+
// The current position of the lexer within m_expr (the character position,
138+
// within the string, of the next item to be lexed).
139+
std::string::iterator m_cur_pos;
140+
141+
// Holds all of the tokens lexed so far.
142+
std::vector<DILToken> m_lexed_tokens;
143+
144+
// Index into m_lexed_tokens; indicates which token the DIL parser is
145+
// currently trying to parse/handle.
146+
uint32_t m_tokens_idx;
147+
148+
// "invalid" token; to be returned by lexer when 'look ahead' fails.
149+
DILToken m_invalid_token;
150+
};
151+
152+
} // namespace dil
153+
154+
} // namespace lldb_private
155+
156+
#endif // LLDB_VALUEOBJECT_DILLEXER_H_

lldb/source/ValueObject/DILLexer.cpp

Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
//===-- DILLexer.cpp ------------------------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
// This implements the lexer for the Data Inspection Language (DIL), and its
8+
// helper functions. DIL will eventually underlie the 'frame variable'
9+
// command. The language whose tokens this lexer recognizes is described in
10+
// lldb/docs/dil-expr-lang.ebnf
11+
//
12+
//===----------------------------------------------------------------------===//
13+
14+
#include "lldb/ValueObject/DILLexer.h"
15+
#include "llvm/ADT/StringMap.h"
16+
17+
namespace lldb_private {
18+
19+
namespace dil {
20+
21+
// For fast keyword lookup: maps the spelling of each DIL keyword to its token
// kind. DILLexer::Lex looks every lexed word up here and treats a word that
// is not found as an identifier. More keywords will be added later.
22+
const llvm::StringMap<dil::TokenKind> Keywords = {
23+
{"namespace", dil::TokenKind::kw_namespace},
24+
};
25+
26+
/// Returns a printable name for a token kind.
///
/// Every enumerator has its own name, including 'invalid' and 'none'. The
/// placeholder "token_name" is returned only for a value that is not one of
/// the declared enumerators.
const std::string DILToken::getTokenName(dil::TokenKind kind) {
  switch (kind) {
  case dil::TokenKind::coloncolon:
    return "coloncolon";
  case dil::TokenKind::eof:
    return "eof";
  case dil::TokenKind::identifier:
    return "identifier";
  case dil::TokenKind::invalid:
    return "invalid";
  case dil::TokenKind::kw_namespace:
    return "namespace";
  case dil::TokenKind::l_paren:
    return "l_paren";
  case dil::TokenKind::none:
    return "none";
  case dil::TokenKind::r_paren:
    return "r_paren";
  case dil::TokenKind::unknown:
    return "unknown";
  }
  // The fallback sits after the switch rather than in a 'default' label so
  // the compiler can still warn when a new enumerator is left unhandled.
  return "token_name";
}
46+
47+
// Returns true if 'c' is an ASCII letter ('a'..'z' or 'A'..'Z').
static bool Is_Letter(char c) {
  const bool is_lower = c >= 'a' && c <= 'z';
  const bool is_upper = c >= 'A' && c <= 'Z';
  return is_lower || is_upper;
}
52+
53+
// Returns true if 'c' is an ASCII decimal digit ('0'..'9').
static bool Is_Digit(char c) {
  if (c < '0')
    return false;
  return c <= '9';
}
54+
55+
// A word starts with a letter, underscore, or dollar sign, followed by
56+
// letters ('a'..'z','A'..'Z'), digits ('0'..'9'), and/or underscores.
57+
bool DILLexer::Is_Word(std::string::iterator start, uint32_t &length) {
58+
bool done = false;
59+
bool dollar_start = false;
60+
61+
// Must not start with a digit.
62+
if (m_cur_pos == m_expr.end() || Is_Digit(*m_cur_pos))
63+
return false;
64+
65+
// First character *may* be a '$', for a register name or convenience
66+
// variable.
67+
if (*m_cur_pos == '$') {
68+
dollar_start = true;
69+
++m_cur_pos;
70+
length++;
71+
}
72+
73+
// Contains only letters, digits or underscores
74+
for (; m_cur_pos != m_expr.end() && !done; ++m_cur_pos) {
75+
char c = *m_cur_pos;
76+
if (!Is_Letter(c) && !Is_Digit(c) && c != '_') {
77+
done = true;
78+
break;
79+
} else
80+
length++;
81+
}
82+
83+
if (dollar_start && length > 1) // Must have something besides just '$'
84+
return true;
85+
86+
if (!dollar_start && length > 0)
87+
return true;
88+
89+
// Not a valid word, so re-set the lexing position.
90+
m_cur_pos = start;
91+
return false;
92+
}
93+
94+
// Stores the given kind, spelling and position in 'result', then appends a
// copy of 'result' to the vector of lexed tokens.
void DILLexer::UpdateLexedTokens(DILToken &result, dil::TokenKind tok_kind,
                                 std::string tok_str, uint32_t tok_pos) {
  result.setValues(tok_kind, tok_str, tok_pos);
  m_lexed_tokens.push_back(result);
}
101+
102+
// Produces one token in 'result'.
//
// When 'look_ahead' is false and a token after the current one has already
// been lexed, that token is copied into 'result' and no new input is
// consumed. Otherwise leading spaces are skipped and the next token is lexed
// from the input, stored in 'result', and appended to the vector of lexed
// tokens. At the end of the input an 'eof' token is produced.
//
// Returns true on success. Returns false when the text at the current
// position is not a recognised token; in that case 'result' is set to kind
// 'none', nothing is appended, and the lexer position is not advanced.
bool DILLexer::Lex(DILToken &result, bool look_ahead) {
  // We're being asked for the 'next' token, and not a part of a LookAhead.
  // Check to see if we've already lexed it and pushed it onto our tokens
  // vector; if so, return the next token from the vector, rather than doing
  // more lexing. The test is 'idx + 1 < size()' rather than
  // 'idx < size() - 1' so that it cannot wrap around on an empty vector.
  if (!look_ahead && m_tokens_idx != UINT_MAX &&
      m_tokens_idx + 1 < m_lexed_tokens.size()) {
    result = m_lexed_tokens[m_tokens_idx + 1];
    return true;
  }

  // Skip over whitespace (spaces).
  while (m_cur_pos != m_expr.end() && *m_cur_pos == ' ')
    ++m_cur_pos;

  // Check to see if we've reached the end of our input string.
  if (m_cur_pos == m_expr.end()) {
    UpdateLexedTokens(result, dil::TokenKind::eof, "", m_expr.length());
    return true;
  }

  const uint32_t position = m_cur_pos - m_expr.begin();
  std::string::iterator start = m_cur_pos;
  uint32_t length = 0;
  if (Is_Word(start, length)) {
    // A word is either a keyword or, failing that, an identifier.
    std::string word = m_expr.substr(position, length);
    auto iter = Keywords.find(word);
    const dil::TokenKind kind =
        iter != Keywords.end() ? iter->second : dil::TokenKind::identifier;

    UpdateLexedTokens(result, kind, word, position);
    return true;
  }

  switch (*m_cur_pos) {
  case '(':
    ++m_cur_pos;
    UpdateLexedTokens(result, dil::TokenKind::l_paren, "(", position);
    return true;
  case ')':
    ++m_cur_pos;
    UpdateLexedTokens(result, dil::TokenKind::r_paren, ")", position);
    return true;
  case ':':
    // Only "::" is a token; a single ':' falls through to the failure path.
    if (position + 1 < m_expr.size() && m_expr[position + 1] == ':') {
      m_cur_pos += 2;
      UpdateLexedTokens(result, dil::TokenKind::coloncolon, "::", position);
      return true;
    }
    break;
  default:
    break;
  }

  // Empty Token
  result.setValues(dil::TokenKind::none, "", m_expr.length());
  return false;
}
167+
168+
// Returns the token N+1 positions after the 'current' token, lexing more of
// the input if that token has not been produced yet.
//
// Returns the "invalid" token (kind 'invalid', empty spelling, position 0)
// when that position cannot be reached: either the end of the input comes
// first, or the lexer meets text it does not recognise.
const DILToken &DILLexer::LookAhead(uint32_t N) {
  // Index of the requested token. The arithmetic is done in uint32_t on
  // purpose: while m_tokens_idx still holds UINT_MAX ("no current token") the
  // sum wraps around so that N == 0 refers to the first token.
  const uint32_t target_idx = m_tokens_idx + N + 1;

  // Lex until the requested token exists.
  while (target_idx >= m_lexed_tokens.size()) {
    DILToken tok;
    // A failed Lex appends nothing and does not advance, so retrying could
    // never make progress; stop here instead of indexing past the end of the
    // vector.
    if (!Lex(tok, /*look_ahead=*/true))
      break;
    // Nothing follows 'eof'. The eof token itself has been appended, so it is
    // still returned below when it is the token that was asked for.
    if (tok.getKind() == dil::TokenKind::eof)
      break;
  }

  if (target_idx >= m_lexed_tokens.size()) {
    m_invalid_token.setValues(dil::TokenKind::invalid, "", 0);
    return m_invalid_token;
  }

  return m_lexed_tokens[target_idx];
}
194+
195+
// Moves the 'current' token index forward by N+1 positions and returns the
// token it then refers to.
//
// If that position has not been lexed, the index is left unchanged and the
// "invalid" token (kind 'invalid', empty spelling, position 0) is returned.
const DILToken &DILLexer::AcceptLookAhead(uint32_t N) {
  // Computed in uint32_t, like the index itself, so that the initial UINT_MAX
  // index wraps around and N == 0 selects the first token.
  const uint32_t target_idx = m_tokens_idx + N + 1;

  // '>=' rather than '>': an index equal to size() is already one past the
  // last lexed token.
  if (target_idx >= m_lexed_tokens.size()) {
    m_invalid_token.setValues(dil::TokenKind::invalid, "", 0);
    return m_invalid_token;
  }

  m_tokens_idx = target_idx;
  return m_lexed_tokens[m_tokens_idx];
}
202+
203+
} // namespace dil
204+
205+
} // namespace lldb_private

lldb/unittests/ValueObject/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
add_lldb_unittest(LLDBValueObjectTests
22
DumpValueObjectOptionsTests.cpp
3+
DILLexerTests.cpp
34

45
LINK_LIBS
56
lldbValueObject

0 commit comments

Comments
 (0)