Skip to content

Commit 82e6bf3

Browse files
Yuhtameta-codesync[bot]
authored andcommitted
Fix row set life cycle in deduplicated column readers when getValues is not called during last read (facebookincubator#275)
Summary: Pull Request resolved: facebookincubator#275 When the following conditions are satisfied, we got some use-after-free crash in deduplicated column readers: 1. There are at least 2 filters on 2 different columns other than the deduplicated column; 2. One filter is evaluated before the deduplicated column, and some rows survive the filter in first batch; 3. The column reader stored an initial row set as output rows; 4. On second batch, a new run is started; 5. Some rows survive the first filter, but all rows are filtered out after the second filter after deduplicated column is `read()` (but not `getValues()`); 6. On third batch, the column reader should get a larger row set (larger than the initial row set) to trigger a reallocation of row set buffer; 7. In this case, we still pointing to the old row set buffer which is already released, while trying to get the cached value from last run. Fix this by using an empty row set to retrieve the last cached value; this is ok because the result was all filtered out. Reviewed By: sdruzkin Differential Revision: D84204087 fbshipit-source-id: 800fb785b6ce646f0f838f4b600cdf145aafab68
1 parent ed5fb48 commit 82e6bf3

File tree

2 files changed

+75
-8
lines changed

2 files changed

+75
-8
lines changed

dwio/nimble/velox/selective/VariableLengthColumnReader.cpp

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -865,7 +865,7 @@ void DeduplicatedArrayColumnReader::read(
865865
if (deduplicatedReadHelper_.loadLastRun_ &&
866866
!deduplicatedReadHelper_.lastRunLoaded_) {
867867
velox::VectorPtr result;
868-
getValues(outputRows(), &result);
868+
getValues({}, &result);
869869
}
870870
SelectiveListColumnReader::read(offset, rows, incomingNulls);
871871
}
@@ -882,7 +882,9 @@ void DeduplicatedArrayColumnReader::getValues(
882882
// next batch in lazy vector scenario.
883883
auto dictionaryVector = prepareDictionaryArrayResult(
884884
*result, requestedType_, rows.size(), memoryPool_);
885-
setComplexNulls(rows, *result);
885+
if (!rows.empty()) {
886+
setComplexNulls(rows, *result);
887+
}
886888
auto indices =
887889
dictionaryVector->mutableIndices(rows.size())->asMutable<vector_size_t>();
888890
auto* alphabetArray =
@@ -918,8 +920,11 @@ void DeduplicatedArrayColumnReader::getValues(
918920
lastRunValue_->resize(0);
919921
}
920922

921-
auto* nulls = nullsInReadRange_ ? nullsInReadRange_->as<uint64_t>() : nullptr;
922-
deduplicatedReadHelper_.populateSelectedIndices(nulls, rows, indices);
923+
if (!rows.empty()) {
924+
auto* nulls =
925+
nullsInReadRange_ ? nullsInReadRange_->as<uint64_t>() : nullptr;
926+
deduplicatedReadHelper_.populateSelectedIndices(nulls, rows, indices);
927+
}
923928
}
924929

925930
DeduplicatedMapColumnReader::DeduplicatedMapColumnReader(
@@ -1010,7 +1015,7 @@ void DeduplicatedMapColumnReader::read(
10101015
if (deduplicatedReadHelper_.loadLastRun_ &&
10111016
!deduplicatedReadHelper_.lastRunLoaded_) {
10121017
velox::VectorPtr result;
1013-
getValues(outputRows(), &result);
1018+
getValues({}, &result);
10141019
}
10151020
SelectiveMapColumnReader::read(offset, rows, incomingNulls);
10161021
}
@@ -1027,7 +1032,9 @@ void DeduplicatedMapColumnReader::getValues(
10271032
// next batch in lazy vector scenario.
10281033
auto dictionaryVector = prepareDictionaryMapResult(
10291034
*result, requestedType_, rows.size(), memoryPool_);
1030-
setComplexNulls(rows, *result);
1035+
if (!rows.empty()) {
1036+
setComplexNulls(rows, *result);
1037+
}
10311038
auto indices =
10321039
dictionaryVector->mutableIndices(rows.size())->asMutable<vector_size_t>();
10331040
auto* alphabetMap =
@@ -1073,7 +1080,10 @@ void DeduplicatedMapColumnReader::getValues(
10731080
lastRunValues_->resize(0);
10741081
}
10751082

1076-
auto* nulls = nullsInReadRange_ ? nullsInReadRange_->as<uint64_t>() : nullptr;
1077-
deduplicatedReadHelper_.populateSelectedIndices(nulls, rows, indices);
1083+
if (!rows.empty()) {
1084+
auto* nulls =
1085+
nullsInReadRange_ ? nullsInReadRange_->as<uint64_t>() : nullptr;
1086+
deduplicatedReadHelper_.populateSelectedIndices(nulls, rows, indices);
1087+
}
10781088
}
10791089
} // namespace facebook::nimble

dwio/nimble/velox/selective/tests/SelectiveNimbleReaderTest.cpp

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1036,6 +1036,63 @@ TEST_F(SelectiveNimbleReaderTest, arrayWithOffsetsReuseNullResult) {
10361036
validate(*vector, *readers.rowReader, 2, [](auto) { return true; });
10371037
}
10381038

1039+
TEST_F(SelectiveNimbleReaderTest, arrayWithOffsetsLastRowSetLifeCycle) {
1040+
std::vector<std::optional<std::vector<std::optional<int64_t>>>> c0, c1, c2;
1041+
// First batch, one row after filtering, and allocate outputRows_ with a
1042+
// smaller size.
1043+
for (int i = 0; i < 16; ++i) {
1044+
c0.push_back(std::nullopt);
1045+
c1.push_back({{1}});
1046+
c2.push_back({{}});
1047+
}
1048+
c0.push_back({{}});
1049+
c1.push_back({{1}});
1050+
c2.push_back({{}});
1051+
// Second batch, all filtered out by c2, but c1 reads some value without
1052+
// calling getValues.
1053+
c0.push_back(std::nullopt);
1054+
// Add a null to force setComplexNulls to read last row set before batch 3.
1055+
c1.push_back(std::nullopt);
1056+
c2.push_back({{}});
1057+
for (int i = 0; i < 15; ++i) {
1058+
c0.push_back(std::nullopt);
1059+
c1.push_back({{2}});
1060+
c2.push_back({{}});
1061+
}
1062+
c0.push_back({{}});
1063+
c1.push_back({{2}});
1064+
c2.push_back(std::nullopt);
1065+
// Third batch, nothing is filtered out, and force outputRows_ buffer
1066+
// reallocation.
1067+
for (int i = 0; i < 17; ++i) {
1068+
c0.push_back({{}});
1069+
c1.push_back({{2}});
1070+
c2.push_back({{}});
1071+
}
1072+
auto vector = makeRowVector({
1073+
makeNullableArrayVector<int64_t>(c0),
1074+
makeRowVector(
1075+
{"c1c0"},
1076+
{makeNullableArrayVector<int64_t>(c1)},
1077+
// Add some nulls so it is not lazy.
1078+
[](auto i) { return i == 0; }),
1079+
makeNullableArrayVector<int64_t>(c2),
1080+
});
1081+
VeloxWriterOptions writerOptions;
1082+
writerOptions.dictionaryArrayColumns = {"c1"};
1083+
auto fileContent = test::createNimbleFile(*rootPool(), vector, writerOptions);
1084+
auto scanSpec = std::make_shared<common::ScanSpec>("root");
1085+
scanSpec->addAllChildFields(*vector->type());
1086+
scanSpec->childByName("c0")->setFilter(std::make_shared<common::IsNotNull>());
1087+
scanSpec->childByName("c1")->setFilter(std::make_shared<common::IsNotNull>());
1088+
scanSpec->childByName("c2")->setFilter(std::make_shared<common::IsNotNull>());
1089+
scanSpec->disableStatsBasedFilterReorder();
1090+
auto readers = makeReaders(vector, fileContent, scanSpec);
1091+
validate(*vector, *readers.rowReader, 17, [](auto i) {
1092+
return i == 16 || i >= 34;
1093+
});
1094+
}
1095+
10391096
TEST_F(SelectiveNimbleReaderTest, slidingWindowMapSubfieldPruning) {
10401097
common::BigintRange keyFilter(2, 2, false);
10411098
checkSlidingWindowMap(

0 commit comments

Comments
 (0)