Skip to content

Commit 482efe7

Browse files
pedroerp authored and meta-codesync[bot] committed
getRawSize support for FlatMapVectors (#318)
Summary: Pull Request resolved: #318 To ensure Nimble writers can properly write FlatMapVectors, the first step is being able to estimate the raw size of a FlatMapVector. Reviewed By: peterenescu Differential Revision: D80669288 fbshipit-source-id: b70ab6e4d45aacbdf46840341ee3aedc0e50d334
1 parent 75b3dbc commit 482efe7

File tree

2 files changed

+224
-61
lines changed

2 files changed

+224
-61
lines changed

dwio/nimble/velox/RawSizeUtils.cpp

Lines changed: 158 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "velox/vector/ConstantVector.h"
2020
#include "velox/vector/DecodedVector.h"
2121
#include "velox/vector/DictionaryVector.h"
22+
#include "velox/vector/FlatMapVector.h"
2223
#include "velox/vector/FlatVector.h"
2324

2425
namespace facebook::nimble {
@@ -348,18 +349,80 @@ uint64_t getRawSizeFromArrayVector(
348349
return rawSize;
349350
}
350351

352+
namespace {
353+
351354
uint64_t getRawSizeFromMapVector(
355+
const velox::MapVector& mapVector,
356+
const velox::common::Ranges& childRanges,
357+
RawSizeContext& context) {
358+
uint64_t rawSize = 0;
359+
rawSize += getRawSizeFromVector(mapVector.mapKeys(), childRanges, context);
360+
rawSize += getRawSizeFromVector(mapVector.mapValues(), childRanges, context);
361+
return rawSize;
362+
}
363+
364+
// For flat map vectors, we need to merge the base ranges with the "valid"
365+
// ranges for each key. Valid ranges for a key are controled by `inMap` buffers,
366+
// which dictate in which rows a particular key is present/active.
367+
uint64_t getRawSizeFromFlatMapVector(
368+
const velox::FlatMapVector& flatMapVector,
369+
const velox::common::Ranges& baseRanges,
370+
RawSizeContext& context) {
371+
uint64_t rawSize = 0;
372+
373+
if (baseRanges.size()) {
374+
velox::common::Ranges keyRanges;
375+
376+
for (size_t i = 0; i < flatMapVector.numDistinctKeys(); ++i) {
377+
keyRanges.clear();
378+
379+
const uint32_t keySize = getRawSizeFromVector(
380+
flatMapVector.distinctKeys(),
381+
velox::common::Ranges::of(i, i + 1),
382+
context);
383+
384+
// Process the keys and values for the rows where the key is present.
385+
if (auto& inMaps = flatMapVector.inMapsAt(i)) {
386+
const auto* rawInMaps = inMaps->as<uint64_t>();
387+
for (const auto& row : baseRanges) {
388+
if (velox::bits::isBitSet(rawInMaps, row)) {
389+
keyRanges.add(row, row + 1);
390+
}
391+
}
392+
393+
rawSize += getRawSizeFromVector(
394+
flatMapVector.mapValuesAt(i), keyRanges, context);
395+
rawSize += keySize * keyRanges.size();
396+
}
397+
// If there is no inMap buffer, process all rows.
398+
else {
399+
rawSize += getRawSizeFromVector(
400+
flatMapVector.mapValuesAt(i), baseRanges, context);
401+
rawSize += keySize * baseRanges.size();
402+
}
403+
}
404+
}
405+
return rawSize;
406+
}
407+
408+
} // namespace
409+
410+
uint64_t getRawSizeFromMap(
352411
const velox::VectorPtr& vector,
353412
const velox::common::Ranges& ranges,
354413
RawSizeContext& context) {
355414
VELOX_CHECK_NOT_NULL(vector);
356415
const auto& encoding = vector->encoding();
357416
const velox::MapVector* mapVector;
417+
358418
const velox::vector_size_t* offsets;
359419
const velox::vector_size_t* sizes;
360420
velox::common::Ranges childRanges;
421+
422+
uint64_t rawSize = 0;
361423
uint64_t nullCount = 0;
362-
auto processRow = [&](size_t row) {
424+
425+
auto processMapRow = [&](size_t row) {
363426
auto begin = offsets[row];
364427
auto end = begin + sizes[row];
365428
// Ensure valid size
@@ -371,6 +434,7 @@ uint64_t getRawSizeFromMapVector(
371434
};
372435

373436
switch (encoding) {
437+
// Handle top-level (regular) Map vectors.
374438
case velox::VectorEncoding::Simple::MAP: {
375439
mapVector = vector->as<velox::MapVector>();
376440
VELOX_CHECK_NOT_NULL(
@@ -388,20 +452,50 @@ uint64_t getRawSizeFromMapVector(
388452
if (velox::bits::isBitNull(nulls, row)) {
389453
++nullCount;
390454
} else {
391-
processRow(row);
455+
processMapRow(row);
392456
}
393457
}
394458
} else {
395459
for (const auto& row : ranges) {
396-
processRow(row);
460+
processMapRow(row);
397461
}
398462
}
463+
rawSize += getRawSizeFromMapVector(*mapVector, childRanges, context);
464+
break;
465+
}
399466

467+
// Handle top-level Flat Map vectors.
468+
case velox::VectorEncoding::Simple::FLAT_MAP: {
469+
auto flatMapVector = vector->as<velox::FlatMapVector>();
470+
VELOX_CHECK_NOT_NULL(
471+
flatMapVector,
472+
"Encoding mismatch on FlatMapVector. Encoding: {}. TypeKind: {}.",
473+
encoding,
474+
vector->typeKind());
475+
476+
if (flatMapVector->mayHaveNulls()) {
477+
const uint64_t* nulls = flatMapVector->rawNulls();
478+
for (const auto& row : ranges) {
479+
if (velox::bits::isBitNull(nulls, row)) {
480+
++nullCount;
481+
} else {
482+
childRanges.add(row, row + 1);
483+
}
484+
}
485+
rawSize +=
486+
getRawSizeFromFlatMapVector(*flatMapVector, childRanges, context);
487+
} else {
488+
rawSize += getRawSizeFromFlatMapVector(*flatMapVector, ranges, context);
489+
}
400490
break;
401491
}
492+
493+
// Cases when maps or flat maps are wrapped by a constant.
402494
case velox::VectorEncoding::Simple::CONSTANT: {
403495
return getRawSizeFromConstantComplexVector(vector, ranges, context);
404496
}
497+
498+
// Cases when maps or flat maps are wrapped by a dictionary.
405499
case velox::VectorEncoding::Simple::DICTIONARY: {
406500
const auto* dictionaryMapVector =
407501
vector->as<velox::DictionaryVector<velox::ComplexType>>();
@@ -416,49 +510,79 @@ uint64_t getRawSizeFromMapVector(
416510
velox::DecodedVector& decodedVector = localDecodedVector.get();
417511
decodedVector.decode(*dictionaryMapVector);
418512

419-
mapVector = decodedVector.base()->as<velox::MapVector>();
420-
VELOX_CHECK_NOT_NULL(
421-
mapVector,
422-
"Encoding mismatch on FlatVector. MapVector: {}. TypeKind: {}.",
423-
decodedVector.base()->encoding(),
424-
decodedVector.base()->typeKind());
425-
426-
offsets = mapVector->rawOffsets();
427-
sizes = mapVector->rawSizes();
428-
429-
if (decodedVector.mayHaveNulls()) {
430-
for (const auto& row : ranges) {
431-
if (decodedVector.isNullAt(row)) {
432-
++nullCount;
513+
// Now switch on the inner type of the dictionary; must be either a map
514+
// or a flat map.
515+
switch (decodedVector.base()->encoding()) {
516+
// Dictionary wrapped around a map:
517+
case velox::VectorEncoding::Simple::MAP: {
518+
mapVector = decodedVector.base()->as<velox::MapVector>();
519+
VELOX_CHECK_NOT_NULL(
520+
mapVector,
521+
"Encoding mismatch on FlatVector. MapVector: {}. TypeKind: {}.",
522+
decodedVector.base()->encoding(),
523+
decodedVector.base()->typeKind());
524+
525+
offsets = mapVector->rawOffsets();
526+
sizes = mapVector->rawSizes();
527+
528+
if (decodedVector.mayHaveNulls()) {
529+
for (const auto& row : ranges) {
530+
if (decodedVector.isNullAt(row)) {
531+
++nullCount;
532+
} else {
533+
processMapRow(decodedVector.index(row));
534+
}
535+
}
433536
} else {
434-
processRow(decodedVector.index(row));
537+
for (const auto& row : ranges) {
538+
processMapRow(decodedVector.index(row));
539+
}
435540
}
541+
rawSize += getRawSizeFromMapVector(*mapVector, childRanges, context);
542+
break;
436543
}
437-
} else {
438-
for (const auto& row : ranges) {
439-
processRow(decodedVector.index(row));
544+
// Dictionary wrapped around a flat map:
545+
case velox::VectorEncoding::Simple::FLAT_MAP: {
546+
auto flatMapVector = decodedVector.base()->as<velox::FlatMapVector>();
547+
VELOX_CHECK_NOT_NULL(
548+
flatMapVector,
549+
"Encoding mismatch on FlatMapVector. Encoding: {}. TypeKind: {}.",
550+
decodedVector.base()->encoding(),
551+
decodedVector.base()->typeKind());
552+
553+
if (decodedVector.mayHaveNulls()) {
554+
for (const auto& row : ranges) {
555+
if (decodedVector.isNullAt(row)) {
556+
++nullCount;
557+
} else {
558+
auto idx = decodedVector.index(row);
559+
childRanges.add(idx, idx + 1);
560+
}
561+
}
562+
} else {
563+
for (const auto& row : ranges) {
564+
auto idx = decodedVector.index(row);
565+
childRanges.add(idx, idx + 1);
566+
}
567+
}
568+
rawSize +=
569+
getRawSizeFromFlatMapVector(*flatMapVector, childRanges, context);
570+
break;
440571
}
572+
default:
573+
VELOX_FAIL(
574+
"Unsupported map encoding wrapped by DICTIONARY: {}.", encoding);
441575
}
442-
443576
break;
444577
}
445-
default: {
446-
VELOX_FAIL("Unsupported encoding: {}.", encoding);
447-
}
448-
}
449-
450-
uint64_t rawSize = 0;
451-
if (childRanges.size()) {
452-
rawSize += getRawSizeFromVector(mapVector->mapKeys(), childRanges, context);
453-
rawSize +=
454-
getRawSizeFromVector(mapVector->mapValues(), childRanges, context);
578+
default:
579+
VELOX_FAIL("Unsupported map encoding: {}.", encoding);
455580
}
456581

457582
context.nullCount = nullCount;
458583
if (nullCount) {
459584
rawSize += nullCount * NULL_SIZE;
460585
}
461-
462586
return rawSize;
463587
}
464588

@@ -494,10 +618,8 @@ uint64_t getRawSizeFromRowVector(
494618
}
495619
}
496620
} else {
497-
// Potentially expensive?
498621
childRangesPtr = &ranges;
499622
}
500-
501623
break;
502624
}
503625
case velox::VectorEncoding::Simple::CONSTANT: {
@@ -624,7 +746,7 @@ uint64_t getRawSizeFromVector(
624746
return getRawSizeFromArrayVector(vector, ranges, context);
625747
}
626748
case velox::TypeKind::MAP: {
627-
return getRawSizeFromMapVector(vector, ranges, context);
749+
return getRawSizeFromMap(vector, ranges, context);
628750
}
629751
case velox::TypeKind::ROW: {
630752
return getRawSizeFromRowVector(vector, ranges, context);

0 commit comments

Comments
 (0)