Skip to content

Commit d08ab02

Browse files
Prasoon Telangfacebook-github-bot
authored andcommitted
Back out "Optimize PagedInputStream::Skip" (#6870)
Summary: Pull Request resolved: #6870 Original commit changeset: 07241aaf71e8 Original Phabricator Diff: D49501856 Reviewed By: Yuhta Differential Revision: D49872683 fbshipit-source-id: d33553ce7ff603caed5befcb8d12c67a0154fba3
1 parent 9db46c6 commit d08ab02

File tree

5 files changed

+54
-178
lines changed

5 files changed

+54
-178
lines changed

velox/dwio/common/compression/Compression.cpp

Lines changed: 19 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -246,32 +246,24 @@ class ZstdDecompressor : public Decompressor {
246246
explicit ZstdDecompressor(
247247
uint64_t blockSize,
248248
const std::string& streamDebugInfo)
249-
: Decompressor{blockSize, streamDebugInfo}, context_{ZSTD_createDCtx()} {}
250-
251-
~ZstdDecompressor() override {
252-
ZSTD_freeDCtx(context_);
253-
}
249+
: Decompressor{blockSize, streamDebugInfo} {}
254250

255251
uint64_t decompress(
256252
const char* src,
257253
uint64_t srcLength,
258254
char* dest,
259255
uint64_t destLength) override;
260256

261-
std::pair<int64_t, bool> getDecompressedLength(
262-
const char* src,
263-
uint64_t srcLength) const override;
264-
265-
private:
266-
ZSTD_DCtx* context_;
257+
uint64_t getUncompressedLength(const char* src, uint64_t srcLength)
258+
const override;
267259
};
268260

269261
uint64_t ZstdDecompressor::decompress(
270262
const char* src,
271263
uint64_t srcLength,
272264
char* dest,
273265
uint64_t destLength) {
274-
auto ret = ZSTD_decompressDCtx(context_, dest, destLength, src, srcLength);
266+
auto ret = ZSTD_decompress(dest, destLength, src, srcLength);
275267
DWIO_ENSURE(
276268
!ZSTD_isError(ret),
277269
"ZSTD returned an error: ",
@@ -281,22 +273,22 @@ uint64_t ZstdDecompressor::decompress(
281273
return ret;
282274
}
283275

284-
std::pair<int64_t, bool> ZstdDecompressor::getDecompressedLength(
276+
uint64_t ZstdDecompressor::getUncompressedLength(
285277
const char* src,
286278
uint64_t srcLength) const {
287279
auto uncompressedLength = ZSTD_getFrameContentSize(src, srcLength);
288280
// in the case when decompression size is not available, return the upper
289281
// bound
290282
if (uncompressedLength == ZSTD_CONTENTSIZE_UNKNOWN ||
291283
uncompressedLength == ZSTD_CONTENTSIZE_ERROR) {
292-
return {blockSize_, false};
284+
return blockSize_;
293285
}
294286
DWIO_ENSURE_LE(
295287
uncompressedLength,
296288
blockSize_,
297289
"Insufficient buffer size. Info: ",
298290
streamDebugInfo_);
299-
return {uncompressedLength, true};
291+
return uncompressedLength;
300292
}
301293

302294
class SnappyDecompressor : public Decompressor {
@@ -312,17 +304,16 @@ class SnappyDecompressor : public Decompressor {
312304
char* dest,
313305
uint64_t destLength) override;
314306

315-
std::pair<int64_t, bool> getDecompressedLength(
316-
const char* src,
317-
uint64_t srcLength) const override;
307+
uint64_t getUncompressedLength(const char* src, uint64_t srcLength)
308+
const override;
318309
};
319310

320311
uint64_t SnappyDecompressor::decompress(
321312
const char* src,
322313
uint64_t srcLength,
323314
char* dest,
324315
uint64_t destLength) {
325-
auto [length, _] = getDecompressedLength(src, srcLength);
316+
auto length = getUncompressedLength(src, srcLength);
326317
DWIO_ENSURE_GE(destLength, length);
327318
DWIO_ENSURE(
328319
snappy::RawUncompress(src, srcLength, dest),
@@ -331,24 +322,23 @@ uint64_t SnappyDecompressor::decompress(
331322
return length;
332323
}
333324

334-
std::pair<int64_t, bool> SnappyDecompressor::getDecompressedLength(
325+
uint64_t SnappyDecompressor::getUncompressedLength(
335326
const char* src,
336327
uint64_t srcLength) const {
337328
size_t uncompressedLength;
338329
// in the case when decompression size is not available, return the upper
339330
// bound
340331
if (!snappy::GetUncompressedLength(src, srcLength, &uncompressedLength)) {
341-
return {blockSize_, false};
332+
return blockSize_;
342333
}
343334
DWIO_ENSURE_LE(
344335
uncompressedLength,
345336
blockSize_,
346337
"Insufficient buffer size. Info: ",
347338
streamDebugInfo_);
348-
return {uncompressedLength, true};
339+
return uncompressedLength;
349340
}
350341

351-
// TODO: Is this really needed?
352342
class ZlibDecompressionStream : public PagedInputStream,
353343
private ZlibDecompressor {
354344
public:
@@ -365,18 +355,13 @@ class ZlibDecompressionStream : public PagedInputStream,
365355
ZlibDecompressor{blockSize, windowBits, streamDebugInfo, isGzip} {}
366356
~ZlibDecompressionStream() override = default;
367357

368-
bool readOrSkip(const void** data, int32_t* size) override;
358+
bool Next(const void** data, int32_t* size) override;
369359
};
370360

371-
bool ZlibDecompressionStream::readOrSkip(const void** data, int32_t* size) {
372-
if (data) {
373-
VELOX_CHECK_EQ(pendingSkip_, 0);
374-
}
361+
bool ZlibDecompressionStream::Next(const void** data, int32_t* size) {
375362
// if the user pushed back, return them the partial buffer
376363
if (outputBufferLength_) {
377-
if (data) {
378-
*data = outputBufferPtr_;
379-
}
364+
*data = outputBufferPtr_;
380365
*size = static_cast<int32_t>(outputBufferLength_);
381366
outputBufferPtr_ += outputBufferLength_;
382367
bytesReturned_ += outputBufferLength_;
@@ -396,9 +381,7 @@ bool ZlibDecompressionStream::readOrSkip(const void** data, int32_t* size) {
396381
static_cast<size_t>(inputBufferPtrEnd_ - inputBufferPtr_),
397382
remainingLength_);
398383
if (state_ == State::ORIGINAL) {
399-
if (data) {
400-
*data = inputBufferPtr_;
401-
}
384+
*data = inputBufferPtr_;
402385
*size = static_cast<int32_t>(availSize);
403386
outputBufferPtr_ = inputBufferPtr_ + availSize;
404387
outputBufferLength_ = 0;
@@ -410,8 +393,7 @@ bool ZlibDecompressionStream::readOrSkip(const void** data, int32_t* size) {
410393
getName(),
411394
" Info: ",
412395
ZlibDecompressor::streamDebugInfo_);
413-
prepareOutputBuffer(
414-
getDecompressedLength(inputBufferPtr_, availSize).first);
396+
prepareOutputBuffer(getUncompressedLength(inputBufferPtr_, availSize));
415397

416398
reset();
417399
zstream_.next_in =
@@ -450,9 +432,7 @@ bool ZlibDecompressionStream::readOrSkip(const void** data, int32_t* size) {
450432
}
451433
} while (result != Z_STREAM_END);
452434
*size = static_cast<int32_t>(blockSize_ - zstream_.avail_out);
453-
if (data) {
454-
*data = outputBufferPtr_;
455-
}
435+
*data = outputBufferPtr_;
456436
outputBufferLength_ = 0;
457437
outputBufferPtr_ += *size;
458438
}

velox/dwio/common/compression/Compression.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -46,14 +46,14 @@ class Compressor {
4646
class Decompressor {
4747
public:
4848
explicit Decompressor(uint64_t blockSize, const std::string& streamDebugInfo)
49-
: blockSize_{static_cast<int64_t>(blockSize)},
50-
streamDebugInfo_{streamDebugInfo} {}
49+
: blockSize_{blockSize}, streamDebugInfo_{streamDebugInfo} {}
5150

5251
virtual ~Decompressor() = default;
5352

54-
virtual std::pair<int64_t, bool /* Is the size exact? */>
55-
getDecompressedLength(const char* /* src */, uint64_t /* srcLength */) const {
56-
return {blockSize_, false};
53+
virtual uint64_t getUncompressedLength(
54+
const char* /* unused */,
55+
uint64_t /* unused */) const {
56+
return blockSize_;
5757
}
5858

5959
virtual uint64_t decompress(
@@ -63,7 +63,7 @@ class Decompressor {
6363
uint64_t destLength) = 0;
6464

6565
protected:
66-
int64_t blockSize_;
66+
uint64_t blockSize_;
6767
const std::string streamDebugInfo_;
6868
};
6969

velox/dwio/common/compression/PagedInputStream.cpp

Lines changed: 26 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -104,21 +104,9 @@ const char* PagedInputStream::ensureInput(size_t availableInputBytes) {
104104
}
105105

106106
bool PagedInputStream::Next(const void** data, int32_t* size) {
107-
VELOX_CHECK_NOT_NULL(data);
108-
skipAllPending();
109-
return readOrSkip(data, size);
110-
}
111-
112-
// Read into `data' if it is not null; otherwise skip some of the pending.
113-
bool PagedInputStream::readOrSkip(const void** data, int32_t* size) {
114-
if (data) {
115-
VELOX_CHECK_EQ(pendingSkip_, 0);
116-
}
117107
// if the user pushed back, return them the partial buffer
118108
if (outputBufferLength_) {
119-
if (data) {
120-
*data = outputBufferPtr_;
121-
}
109+
*data = outputBufferPtr_;
122110
*size = static_cast<int32_t>(outputBufferLength_);
123111
outputBufferPtr_ += outputBufferLength_;
124112
bytesReturned_ += outputBufferLength_;
@@ -151,9 +139,7 @@ bool PagedInputStream::readOrSkip(const void** data, int32_t* size) {
151139
// if no decompression or decryption is needed, simply adjust the output
152140
// pointer. Otherwise, make sure we have continuous block
153141
if (original) {
154-
if (data) {
155-
*data = inputBufferPtr_;
156-
}
142+
*data = inputBufferPtr_;
157143
*size = static_cast<int32_t>(availSize);
158144
outputBufferPtr_ = inputBufferPtr_ + availSize;
159145
inputBufferPtr_ += availSize;
@@ -168,35 +154,24 @@ bool PagedInputStream::readOrSkip(const void** data, int32_t* size) {
168154
decrypter_->decrypt(folly::StringPiece{input, remainingLength_});
169155
input = reinterpret_cast<const char*>(decryptionBuffer_->data());
170156
remainingLength_ = decryptionBuffer_->length();
171-
if (data) {
172-
*data = input;
173-
}
157+
*data = input;
174158
*size = remainingLength_;
175159
outputBufferPtr_ = input + remainingLength_;
176160
}
177161

178162
// perform decompression
179163
if (state_ == State::START) {
180164
DWIO_ENSURE_NOT_NULL(decompressor_.get(), "invalid stream state");
181-
DWIO_ENSURE_NOT_NULL(input);
182-
auto [decompressedLength, exact] =
183-
decompressor_->getDecompressedLength(input, remainingLength_);
184-
if (!data && exact && decompressedLength <= pendingSkip_) {
185-
*size = decompressedLength;
186-
outputBufferPtr_ = nullptr;
187-
} else {
188-
prepareOutputBuffer(decompressedLength);
189-
outputBufferLength_ = decompressor_->decompress(
190-
input,
191-
remainingLength_,
192-
outputBuffer_->data(),
193-
outputBuffer_->capacity());
194-
if (data) {
195-
*data = outputBuffer_->data();
196-
}
197-
*size = static_cast<int32_t>(outputBufferLength_);
198-
outputBufferPtr_ = outputBuffer_->data() + outputBufferLength_;
199-
}
165+
prepareOutputBuffer(
166+
decompressor_->getUncompressedLength(input, remainingLength_));
167+
outputBufferLength_ = decompressor_->decompress(
168+
input,
169+
remainingLength_,
170+
outputBuffer_->data(),
171+
outputBuffer_->capacity());
172+
*data = outputBuffer_->data();
173+
*size = static_cast<int32_t>(outputBufferLength_);
174+
outputBufferPtr_ = outputBuffer_->data() + outputBufferLength_;
200175
// release decryption buffer
201176
decryptionBuffer_ = nullptr;
202177
}
@@ -213,14 +188,6 @@ bool PagedInputStream::readOrSkip(const void** data, int32_t* size) {
213188
}
214189

215190
void PagedInputStream::BackUp(int32_t count) {
216-
if (pendingSkip_ > 0) {
217-
auto len = std::min<int64_t>(count, pendingSkip_);
218-
pendingSkip_ -= len;
219-
count -= len;
220-
if (count == 0) {
221-
return;
222-
}
223-
}
224191
DWIO_ENSURE(
225192
outputBufferPtr_ != nullptr,
226193
"Backup without previous Next in ",
@@ -240,29 +207,25 @@ void PagedInputStream::BackUp(int32_t count) {
240207
bytesReturned_ -= count;
241208
}
242209

243-
bool PagedInputStream::skipAllPending() {
244-
while (pendingSkip_ > 0) {
210+
bool PagedInputStream::Skip(int32_t count) {
211+
// this is a stupid implementation for now.
212+
// should skip entire blocks without decompressing
213+
while (count > 0) {
214+
const void* ptr;
245215
int32_t len;
246-
if (!readOrSkip(nullptr, &len)) {
216+
if (!Next(&ptr, &len)) {
247217
return false;
248218
}
249-
if (len > pendingSkip_) {
250-
auto toBackUp = len - pendingSkip_;
251-
pendingSkip_ = 0;
252-
BackUp(toBackUp);
219+
if (len > count) {
220+
BackUp(len - count);
221+
count = 0;
253222
} else {
254-
pendingSkip_ -= len;
223+
count -= len;
255224
}
256225
}
257226
return true;
258227
}
259228

260-
bool PagedInputStream::Skip(int32_t count) {
261-
pendingSkip_ += count;
262-
// We never use the return value of this function so this is OK.
263-
return true;
264-
}
265-
266229
void PagedInputStream::clearDecompressionState() {
267230
state_ = State::HEADER;
268231
outputBufferLength_ = 0;
@@ -280,8 +243,7 @@ void PagedInputStream::seekToPosition(
280243
// to the beginning of the last view or last header, whichever is
281244
// later. If we are returning views into the decompression buffer,
282245
// we can backup to the beginning of the decompressed buffer
283-
auto alreadyRead =
284-
bytesReturned_ - bytesReturnedAtLastHeaderOffset_ + pendingSkip_;
246+
auto alreadyRead = bytesReturned_ - bytesReturnedAtLastHeaderOffset_;
285247

286248
// outsideOriginalWindow is true if we are returning views into
287249
// the input stream's buffer and we are seeking below the start of the last
@@ -298,12 +260,12 @@ void PagedInputStream::seekToPosition(
298260
auto provider = dwio::common::PositionProvider(positions);
299261
input_->seekToPosition(provider);
300262
clearDecompressionState();
301-
pendingSkip_ = uncompressedOffset;
263+
Skip(uncompressedOffset);
302264
} else {
303265
if (uncompressedOffset < alreadyRead) {
304266
BackUp(alreadyRead - uncompressedOffset);
305267
} else {
306-
pendingSkip_ += uncompressedOffset - alreadyRead;
268+
Skip(uncompressedOffset - alreadyRead);
307269
}
308270
}
309271
}

0 commit comments

Comments
 (0)