Skip to content

Commit 578ca91

Browse files
committed
Optimized ByteArrayReader (#1040)
1 parent 94d66ad commit 578ca91

File tree

5 files changed

+756
-84
lines changed

5 files changed

+756
-84
lines changed

parquet/benches/arrow_array_reader.rs

Lines changed: 10 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -319,20 +319,13 @@ fn create_string_arrow_array_reader(
319319
ArrowArrayReader::try_new(page_iterator, column_desc, converter, None).unwrap()
320320
}
321321

322-
fn create_string_complex_array_reader(
322+
fn create_string_byte_array_reader(
323323
page_iterator: impl PageIterator + 'static,
324324
column_desc: ColumnDescPtr,
325325
) -> impl ArrayReader {
326-
use parquet::arrow::array_reader::ComplexObjectArrayReader;
327-
use parquet::arrow::converter::{Utf8ArrayConverter, Utf8Converter};
328-
let converter = Utf8Converter::new(Utf8ArrayConverter {});
329-
ComplexObjectArrayReader::<parquet::data_type::ByteArrayType, Utf8Converter>::new(
330-
Box::new(page_iterator),
331-
column_desc,
332-
converter,
333-
None,
334-
)
335-
.unwrap()
326+
use parquet::arrow::array_reader::ByteArrayReader;
327+
ByteArrayReader::new_with_options(Box::new(page_iterator), column_desc, None, true)
328+
.unwrap()
336329
}
337330

338331
fn add_benches(c: &mut Criterion) {
@@ -568,7 +561,7 @@ fn add_benches(c: &mut Criterion) {
568561
"read StringArray, plain encoded, mandatory, no NULLs - old",
569562
|b| {
570563
b.iter(|| {
571-
let array_reader = create_string_complex_array_reader(
564+
let array_reader = create_string_byte_array_reader(
572565
plain_string_no_null_data.clone(),
573566
mandatory_string_column_desc.clone(),
574567
);
@@ -601,7 +594,7 @@ fn add_benches(c: &mut Criterion) {
601594
"read StringArray, plain encoded, optional, no NULLs - old",
602595
|b| {
603596
b.iter(|| {
604-
let array_reader = create_string_complex_array_reader(
597+
let array_reader = create_string_byte_array_reader(
605598
plain_string_no_null_data.clone(),
606599
optional_string_column_desc.clone(),
607600
);
@@ -635,7 +628,7 @@ fn add_benches(c: &mut Criterion) {
635628
"read StringArray, plain encoded, optional, half NULLs - old",
636629
|b| {
637630
b.iter(|| {
638-
let array_reader = create_string_complex_array_reader(
631+
let array_reader = create_string_byte_array_reader(
639632
plain_string_half_null_data.clone(),
640633
optional_string_column_desc.clone(),
641634
);
@@ -669,7 +662,7 @@ fn add_benches(c: &mut Criterion) {
669662
"read StringArray, dictionary encoded, mandatory, no NULLs - old",
670663
|b| {
671664
b.iter(|| {
672-
let array_reader = create_string_complex_array_reader(
665+
let array_reader = create_string_byte_array_reader(
673666
dictionary_string_no_null_data.clone(),
674667
mandatory_string_column_desc.clone(),
675668
);
@@ -702,7 +695,7 @@ fn add_benches(c: &mut Criterion) {
702695
"read StringArray, dictionary encoded, optional, no NULLs - old",
703696
|b| {
704697
b.iter(|| {
705-
let array_reader = create_string_complex_array_reader(
698+
let array_reader = create_string_byte_array_reader(
706699
dictionary_string_no_null_data.clone(),
707700
optional_string_column_desc.clone(),
708701
);
@@ -736,7 +729,7 @@ fn add_benches(c: &mut Criterion) {
736729
"read StringArray, dictionary encoded, optional, half NULLs - old",
737730
|b| {
738731
b.iter(|| {
739-
let array_reader = create_string_complex_array_reader(
732+
let array_reader = create_string_byte_array_reader(
740733
dictionary_string_half_null_data.clone(),
741734
optional_string_column_desc.clone(),
742735
);

parquet/src/arrow/array_reader.rs

Lines changed: 39 additions & 51 deletions
Original file line number | Diff line number | Diff line change
@@ -60,8 +60,7 @@ use crate::arrow::converter::{
6060
DecimalConverter, FixedLenBinaryConverter, FixedSizeArrayConverter,
6161
Int96ArrayConverter, Int96Converter, IntervalDayTimeArrayConverter,
6262
IntervalDayTimeConverter, IntervalYearMonthArrayConverter,
63-
IntervalYearMonthConverter, LargeBinaryArrayConverter, LargeBinaryConverter,
64-
LargeUtf8ArrayConverter, LargeUtf8Converter,
63+
IntervalYearMonthConverter, Utf8ArrayConverter, Utf8Converter,
6564
};
6665
use crate::arrow::record_reader::buffer::{ScalarValue, ValuesBuffer};
6766
use crate::arrow::record_reader::{GenericRecordReader, RecordReader};
@@ -81,6 +80,10 @@ use crate::schema::types::{
8180
};
8281
use crate::schema::visitor::TypeVisitor;
8382

83+
mod byte_array;
84+
85+
pub use byte_array::ByteArrayReader;
86+
8487
/// Array reader reads parquet data into arrow array.
8588
pub trait ArrayReader {
8689
fn as_any(&self) -> &dyn Any;
@@ -1749,57 +1752,42 @@ impl<'a> ArrayReaderBuilder {
17491752
null_mask_only,
17501753
)?,
17511754
)),
1752-
PhysicalType::BYTE_ARRAY => {
1753-
if cur_type.get_basic_info().converted_type() == ConvertedType::UTF8 {
1754-
if let Some(ArrowType::LargeUtf8) = arrow_type {
1755-
let converter =
1756-
LargeUtf8Converter::new(LargeUtf8ArrayConverter {});
1757-
Ok(Box::new(ComplexObjectArrayReader::<
1758-
ByteArrayType,
1759-
LargeUtf8Converter,
1760-
>::new(
1761-
page_iterator,
1762-
column_desc,
1763-
converter,
1764-
arrow_type,
1765-
)?))
1766-
} else {
1767-
use crate::arrow::arrow_array_reader::{
1768-
ArrowArrayReader, StringArrayConverter,
1769-
};
1770-
let converter = StringArrayConverter::new();
1771-
Ok(Box::new(ArrowArrayReader::try_new(
1772-
*page_iterator,
1773-
column_desc,
1774-
converter,
1775-
arrow_type,
1776-
)?))
1755+
PhysicalType::BYTE_ARRAY => match arrow_type {
1756+
// TODO: Replace with optimised dictionary reader (#171)
1757+
Some(ArrowType::Dictionary(_, _)) => {
1758+
match cur_type.get_basic_info().converted_type() {
1759+
ConvertedType::UTF8 => {
1760+
let converter = Utf8Converter::new(Utf8ArrayConverter {});
1761+
Ok(Box::new(ComplexObjectArrayReader::<
1762+
ByteArrayType,
1763+
Utf8Converter,
1764+
>::new(
1765+
page_iterator,
1766+
column_desc,
1767+
converter,
1768+
arrow_type,
1769+
)?))
1770+
}
1771+
_ => {
1772+
let converter = BinaryConverter::new(BinaryArrayConverter {});
1773+
Ok(Box::new(ComplexObjectArrayReader::<
1774+
ByteArrayType,
1775+
BinaryConverter,
1776+
>::new(
1777+
page_iterator,
1778+
column_desc,
1779+
converter,
1780+
arrow_type,
1781+
)?))
1782+
}
17771783
}
1778-
} else if let Some(ArrowType::LargeBinary) = arrow_type {
1779-
let converter =
1780-
LargeBinaryConverter::new(LargeBinaryArrayConverter {});
1781-
Ok(Box::new(ComplexObjectArrayReader::<
1782-
ByteArrayType,
1783-
LargeBinaryConverter,
1784-
>::new(
1785-
page_iterator,
1786-
column_desc,
1787-
converter,
1788-
arrow_type,
1789-
)?))
1790-
} else {
1791-
let converter = BinaryConverter::new(BinaryArrayConverter {});
1792-
Ok(Box::new(ComplexObjectArrayReader::<
1793-
ByteArrayType,
1794-
BinaryConverter,
1795-
>::new(
1796-
page_iterator,
1797-
column_desc,
1798-
converter,
1799-
arrow_type,
1800-
)?))
18011784
}
1802-
}
1785+
_ => Ok(Box::new(ByteArrayReader::new(
1786+
page_iterator,
1787+
column_desc,
1788+
arrow_type,
1789+
)?)),
1790+
},
18031791
PhysicalType::FIXED_LEN_BYTE_ARRAY
18041792
if cur_type.get_basic_info().converted_type()
18051793
== ConvertedType::DECIMAL =>

0 commit comments

Comments
 (0)