|
22 | 22 | #include "velox/connectors/hive/HiveConnector.h" // @manual |
23 | 23 | #include "velox/core/QueryCtx.h" |
24 | 24 | #include "velox/dwio/parquet/RegisterParquetWriter.h" // @manual |
| 25 | +#include "velox/dwio/parquet/reader/PageReader.h" |
25 | 26 | #include "velox/dwio/parquet/tests/ParquetTestBase.h" |
26 | 27 | #include "velox/exec/Cursor.h" |
27 | 28 | #include "velox/exec/tests/utils/AssertQueryBuilder.h" |
@@ -146,6 +147,124 @@ TEST_F(ParquetWriterTest, compression) { |
146 | 147 | assertReadWithReaderAndExpected(schema, *rowReader, data, *leafPool_); |
147 | 148 | }; |
148 | 149 |
|
| 150 | +TEST_F(ParquetWriterTest, toggleDataPageVersion) { |
| 151 | + auto schema = ROW({"c0"}, {INTEGER()}); |
| 152 | + const int64_t kRows = 1; |
| 153 | + const auto data = makeRowVector({ |
| 154 | + makeFlatVector<int32_t>(kRows, [](auto row) { return 987; }), |
| 155 | + }); |
| 156 | + |
| 157 | + // Write Parquet test data, then read and return the DataPage |
| 158 | + // (thrift::PageType::type) used. |
| 159 | + const auto testDataPageVersion = |
| 160 | + [&](std::unordered_map<std::string, std::string> configFromFile, |
| 161 | + std::unordered_map<std::string, std::string> sessionProperties) { |
| 162 | + // Create an in-memory writer. |
| 163 | + auto sink = std::make_unique<MemorySink>( |
| 164 | + 200 * 1024 * 1024, |
| 165 | + dwio::common::FileSink::Options{.pool = leafPool_.get()}); |
| 166 | + auto sinkPtr = sink.get(); |
| 167 | + parquet::WriterOptions writerOptions; |
| 168 | + writerOptions.memoryPool = leafPool_.get(); |
| 169 | + |
| 170 | + // Simulate setting of Hive config & connector session properties, then |
| 171 | + // write test data. |
| 172 | + auto connectorConfig = config::ConfigBase(std::move(configFromFile)); |
| 173 | + auto connectorSessionProperties = |
| 174 | + config::ConfigBase(std::move(sessionProperties)); |
| 175 | + |
| 176 | + writerOptions.processConfigs( |
| 177 | + connectorConfig, connectorSessionProperties); |
| 178 | + auto writer = std::make_unique<parquet::Writer>( |
| 179 | + std::move(sink), writerOptions, rootPool_, schema); |
| 180 | + writer->write(data); |
| 181 | + writer->close(); |
| 182 | + |
| 183 | + // Read to identify DataPage used. |
| 184 | + dwio::common::ReaderOptions readerOptions{leafPool_.get()}; |
| 185 | + auto reader = createReaderInMemory(*sinkPtr, readerOptions); |
| 186 | + |
| 187 | + auto colChunkPtr = reader->fileMetaData().rowGroup(0).columnChunk(0); |
| 188 | + std::string_view sinkData(sinkPtr->data(), sinkPtr->size()); |
| 189 | + |
| 190 | + auto readFile = std::make_shared<InMemoryReadFile>(sinkData); |
| 191 | + auto file = std::make_shared<ReadFileInputStream>(std::move(readFile)); |
| 192 | + |
| 193 | + auto inputStream = std::make_unique<SeekableFileInputStream>( |
| 194 | + std::move(file), |
| 195 | + colChunkPtr.dataPageOffset(), |
| 196 | + 150, |
| 197 | + *leafPool_, |
| 198 | + LogType::TEST); |
| 199 | + auto pageReader = std::make_unique<PageReader>( |
| 200 | + std::move(inputStream), |
| 201 | + *leafPool_, |
| 202 | + colChunkPtr.compression(), |
| 203 | + colChunkPtr.totalCompressedSize()); |
| 204 | + |
| 205 | + return pageReader->readPageHeader().type; |
| 206 | + }; |
| 207 | + |
| 208 | + // Test default behavior - DataPage should be V1. |
| 209 | + ASSERT_EQ(testDataPageVersion({}, {}), thrift::PageType::type::DATA_PAGE); |
| 210 | + |
| 211 | + // Simulate setting DataPage version to V2 via Hive config from file. |
| 212 | + std::unordered_map<std::string, std::string> configFromFile = { |
| 213 | + {parquet::WriterOptions::kParquetHiveConnectorDataPageVersion, "V2"}}; |
| 214 | + |
| 215 | + ASSERT_EQ( |
| 216 | + testDataPageVersion(configFromFile, {}), |
| 217 | + thrift::PageType::type::DATA_PAGE_V2); |
| 218 | + |
| 219 | + // Simulate setting DataPage version to V1 via Hive config from file. |
| 220 | + configFromFile = { |
| 221 | + {parquet::WriterOptions::kParquetHiveConnectorDataPageVersion, "V1"}}; |
| 222 | + |
| 223 | + ASSERT_EQ( |
| 224 | + testDataPageVersion(configFromFile, {}), |
| 225 | + thrift::PageType::type::DATA_PAGE); |
| 226 | + |
| 227 | + // Simulate setting DataPage version to V2 via connector session property. |
| 228 | + std::unordered_map<std::string, std::string> sessionProperties = { |
| 229 | + {parquet::WriterOptions::kParquetSessionDataPageVersion, "V2"}}; |
| 230 | + |
| 231 | + ASSERT_EQ( |
| 232 | + testDataPageVersion({}, sessionProperties), |
| 233 | + thrift::PageType::type::DATA_PAGE_V2); |
| 234 | + |
| 235 | + // Simulate setting DataPage version to V1 via connector session property. |
| 236 | + sessionProperties = { |
| 237 | + {parquet::WriterOptions::kParquetSessionDataPageVersion, "V1"}}; |
| 238 | + |
| 239 | + ASSERT_EQ( |
| 240 | + testDataPageVersion({}, sessionProperties), |
| 241 | + thrift::PageType::type::DATA_PAGE); |
| 242 | + |
| 243 | + // Simulate setting DataPage version to V1 via connector session property, |
| 244 | + // and to V2 via Hive config from file. Session property should take |
| 245 | + // precedence. |
| 246 | + sessionProperties = { |
| 247 | + {parquet::WriterOptions::kParquetSessionDataPageVersion, "V1"}}; |
| 248 | + configFromFile = { |
| 249 | + {parquet::WriterOptions::kParquetHiveConnectorDataPageVersion, "V2"}}; |
| 250 | + |
| 251 | + ASSERT_EQ( |
| 252 | + testDataPageVersion({}, sessionProperties), |
| 253 | + thrift::PageType::type::DATA_PAGE); |
| 254 | + |
| 255 | + // Simulate setting DataPage version to V2 via connector session property, |
| 256 | + // and to V1 via Hive config from file. Session property should take |
| 257 | + // precedence. |
| 258 | + sessionProperties = { |
| 259 | + {parquet::WriterOptions::kParquetSessionDataPageVersion, "V2"}}; |
| 260 | + configFromFile = { |
| 261 | + {parquet::WriterOptions::kParquetHiveConnectorDataPageVersion, "V1"}}; |
| 262 | + |
| 263 | + ASSERT_EQ( |
| 264 | + testDataPageVersion({}, sessionProperties), |
| 265 | + thrift::PageType::type::DATA_PAGE_V2); |
| 266 | +} |
| 267 | + |
149 | 268 | DEBUG_ONLY_TEST_F(ParquetWriterTest, unitFromWriterOptions) { |
150 | 269 | SCOPED_TESTVALUE_SET( |
151 | 270 | "facebook::velox::parquet::Writer::write", |
|
0 commit comments