Skip to content

Commit 0cc0c05

Browse files
authored
parquet: Optimized ByteArrayReader, Add UTF-8 Validation (#1040) (#1082)
* Optimized ByteArrayReader (#1040); UTF-8 Validation (#786)
* Fix arrow_array_reader benchmark
* Allow running subset of arrow_array_reader benchmarks
* Faster UTF-8 validation
* Tweak null handling
* Add license
* Refine `ValuesBuffer::pad_nulls`
* Tweak error handling
* Use page null count if available
* Doc comments
* Test DELTA_BYTE_ARRAY encoding
* Support legacy Encoding::PLAIN_DICTIONARY
* Add OffsetBuffer unit tests; review feedback
* More tests
* Fix lint
* Review feedback
1 parent 4f1064e commit 0cc0c05

File tree

10 files changed

+1299
-162
lines changed

10 files changed

+1299
-162
lines changed

parquet/benches/arrow_array_reader.rs

Lines changed: 72 additions & 74 deletions
Large diffs are not rendered by default.

parquet/src/arrow/array_reader.rs

Lines changed: 41 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,7 @@ use crate::arrow::converter::{
6060
DecimalConverter, FixedLenBinaryConverter, FixedSizeArrayConverter,
6161
Int96ArrayConverter, Int96Converter, IntervalDayTimeArrayConverter,
6262
IntervalDayTimeConverter, IntervalYearMonthArrayConverter,
63-
IntervalYearMonthConverter, LargeBinaryArrayConverter, LargeBinaryConverter,
64-
LargeUtf8ArrayConverter, LargeUtf8Converter,
63+
IntervalYearMonthConverter, Utf8ArrayConverter, Utf8Converter,
6564
};
6665
use crate::arrow::record_reader::buffer::{ScalarValue, ValuesBuffer};
6766
use crate::arrow::record_reader::{GenericRecordReader, RecordReader};
@@ -81,6 +80,11 @@ use crate::schema::types::{
8180
};
8281
use crate::schema::visitor::TypeVisitor;
8382

83+
mod byte_array;
84+
mod offset_buffer;
85+
86+
pub use byte_array::make_byte_array_reader;
87+
8488
/// Array reader reads parquet data into arrow array.
8589
pub trait ArrayReader {
8690
fn as_any(&self) -> &dyn Any;
@@ -1778,57 +1782,43 @@ impl<'a> ArrayReaderBuilder {
17781782
null_mask_only,
17791783
)?,
17801784
)),
1781-
PhysicalType::BYTE_ARRAY => {
1782-
if cur_type.get_basic_info().converted_type() == ConvertedType::UTF8 {
1783-
if let Some(ArrowType::LargeUtf8) = arrow_type {
1784-
let converter =
1785-
LargeUtf8Converter::new(LargeUtf8ArrayConverter {});
1786-
Ok(Box::new(ComplexObjectArrayReader::<
1787-
ByteArrayType,
1788-
LargeUtf8Converter,
1789-
>::new(
1790-
page_iterator,
1791-
column_desc,
1792-
converter,
1793-
arrow_type,
1794-
)?))
1795-
} else {
1796-
use crate::arrow::arrow_array_reader::{
1797-
ArrowArrayReader, StringArrayConverter,
1798-
};
1799-
let converter = StringArrayConverter::new();
1800-
Ok(Box::new(ArrowArrayReader::try_new(
1801-
*page_iterator,
1802-
column_desc,
1803-
converter,
1804-
arrow_type,
1805-
)?))
1785+
PhysicalType::BYTE_ARRAY => match arrow_type {
1786+
// TODO: Replace with optimised dictionary reader (#171)
1787+
Some(ArrowType::Dictionary(_, _)) => {
1788+
match cur_type.get_basic_info().converted_type() {
1789+
ConvertedType::UTF8 => {
1790+
let converter = Utf8Converter::new(Utf8ArrayConverter {});
1791+
Ok(Box::new(ComplexObjectArrayReader::<
1792+
ByteArrayType,
1793+
Utf8Converter,
1794+
>::new(
1795+
page_iterator,
1796+
column_desc,
1797+
converter,
1798+
arrow_type,
1799+
)?))
1800+
}
1801+
_ => {
1802+
let converter = BinaryConverter::new(BinaryArrayConverter {});
1803+
Ok(Box::new(ComplexObjectArrayReader::<
1804+
ByteArrayType,
1805+
BinaryConverter,
1806+
>::new(
1807+
page_iterator,
1808+
column_desc,
1809+
converter,
1810+
arrow_type,
1811+
)?))
1812+
}
18061813
}
1807-
} else if let Some(ArrowType::LargeBinary) = arrow_type {
1808-
let converter =
1809-
LargeBinaryConverter::new(LargeBinaryArrayConverter {});
1810-
Ok(Box::new(ComplexObjectArrayReader::<
1811-
ByteArrayType,
1812-
LargeBinaryConverter,
1813-
>::new(
1814-
page_iterator,
1815-
column_desc,
1816-
converter,
1817-
arrow_type,
1818-
)?))
1819-
} else {
1820-
let converter = BinaryConverter::new(BinaryArrayConverter {});
1821-
Ok(Box::new(ComplexObjectArrayReader::<
1822-
ByteArrayType,
1823-
BinaryConverter,
1824-
>::new(
1825-
page_iterator,
1826-
column_desc,
1827-
converter,
1828-
arrow_type,
1829-
)?))
18301814
}
1831-
}
1815+
_ => make_byte_array_reader(
1816+
page_iterator,
1817+
column_desc,
1818+
arrow_type,
1819+
null_mask_only,
1820+
),
1821+
},
18321822
PhysicalType::FIXED_LEN_BYTE_ARRAY
18331823
if cur_type.get_basic_info().converted_type()
18341824
== ConvertedType::DECIMAL =>

0 commit comments

Comments
 (0)