@@ -42,7 +42,7 @@ use arrow::datatypes::{
42
42
Float32Type as ArrowFloat32Type , Float64Type as ArrowFloat64Type ,
43
43
Int16Type as ArrowInt16Type , Int32Type as ArrowInt32Type ,
44
44
Int64Type as ArrowInt64Type , Int8Type as ArrowInt8Type , IntervalUnit , Schema ,
45
- Time32MillisecondType as ArrowTime32MillisecondType ,
45
+ SchemaRef , Time32MillisecondType as ArrowTime32MillisecondType ,
46
46
Time32SecondType as ArrowTime32SecondType ,
47
47
Time64MicrosecondType as ArrowTime64MicrosecondType ,
48
48
Time64NanosecondType as ArrowTime64NanosecondType , TimeUnit as ArrowTimeUnit ,
@@ -91,7 +91,7 @@ pub use byte_array::make_byte_array_reader;
91
91
pub use byte_array_dictionary:: make_byte_array_dictionary_reader;
92
92
93
93
/// Array reader reads parquet data into arrow array.
94
- pub trait ArrayReader {
94
+ pub trait ArrayReader : Send {
95
95
fn as_any ( & self ) -> & dyn Any ;
96
96
97
97
/// Returns the arrow type of this array reader.
@@ -117,6 +117,26 @@ pub trait ArrayReader {
117
117
fn get_rep_levels ( & self ) -> Option < & [ i16 ] > ;
118
118
}
119
119
120
+ /// A collection of row groups
121
+ pub trait RowGroupCollection {
122
+ /// Get schema of parquet file.
123
+ fn schema ( & self ) -> Result < SchemaDescPtr > ;
124
+
125
+ /// Returns an iterator over the column chunks for particular column
126
+ fn column_chunks ( & self , i : usize ) -> Result < Box < dyn PageIterator > > ;
127
+ }
128
+
129
+ impl RowGroupCollection for Arc < dyn FileReader > {
130
+ fn schema ( & self ) -> Result < SchemaDescPtr > {
131
+ Ok ( self . metadata ( ) . file_metadata ( ) . schema_descr_ptr ( ) )
132
+ }
133
+
134
+ fn column_chunks ( & self , column_index : usize ) -> Result < Box < dyn PageIterator > > {
135
+ let iterator = FilePageIterator :: new ( column_index, Arc :: clone ( self ) ) ?;
136
+ Ok ( Box :: new ( iterator) )
137
+ }
138
+ }
139
+
120
140
/// Uses `record_reader` to read up to `batch_size` records from `pages`
121
141
///
122
142
/// Returns the number of records read, which can be less than batch_size if
@@ -478,7 +498,7 @@ where
478
498
impl < T , C > ArrayReader for ComplexObjectArrayReader < T , C >
479
499
where
480
500
T : DataType ,
481
- C : Converter < Vec < Option < T :: T > > , ArrayRef > + ' static ,
501
+ C : Converter < Vec < Option < T :: T > > , ArrayRef > + Send + ' static ,
482
502
{
483
503
fn as_any ( & self ) -> & dyn Any {
484
504
self
@@ -1311,9 +1331,9 @@ impl ArrayReader for StructArrayReader {
1311
1331
/// Create array reader from parquet schema, column indices, and parquet file reader.
1312
1332
pub fn build_array_reader < T > (
1313
1333
parquet_schema : SchemaDescPtr ,
1314
- arrow_schema : Schema ,
1334
+ arrow_schema : SchemaRef ,
1315
1335
column_indices : T ,
1316
- file_reader : Arc < dyn FileReader > ,
1336
+ row_groups : Box < dyn RowGroupCollection > ,
1317
1337
) -> Result < Box < dyn ArrayReader > >
1318
1338
where
1319
1339
T : IntoIterator < Item = usize > ,
@@ -1351,13 +1371,8 @@ where
1351
1371
fields : filtered_root_fields,
1352
1372
} ;
1353
1373
1354
- ArrayReaderBuilder :: new (
1355
- Arc :: new ( proj) ,
1356
- Arc :: new ( arrow_schema) ,
1357
- Arc :: new ( leaves) ,
1358
- file_reader,
1359
- )
1360
- . build_array_reader ( )
1374
+ ArrayReaderBuilder :: new ( Arc :: new ( proj) , arrow_schema, Arc :: new ( leaves) , row_groups)
1375
+ . build_array_reader ( )
1361
1376
}
1362
1377
1363
1378
/// Used to build array reader.
@@ -1367,7 +1382,7 @@ struct ArrayReaderBuilder {
1367
1382
// Key: columns that need to be included in final array builder
1368
1383
// Value: column index in schema
1369
1384
columns_included : Arc < HashMap < * const Type , usize > > ,
1370
- file_reader : Arc < dyn FileReader > ,
1385
+ row_groups : Box < dyn RowGroupCollection > ,
1371
1386
}
1372
1387
1373
1388
/// Used in type visitor.
@@ -1667,13 +1682,13 @@ impl<'a> ArrayReaderBuilder {
1667
1682
root_schema : TypePtr ,
1668
1683
arrow_schema : Arc < Schema > ,
1669
1684
columns_included : Arc < HashMap < * const Type , usize > > ,
1670
- file_reader : Arc < dyn FileReader > ,
1685
+ file_reader : Box < dyn RowGroupCollection > ,
1671
1686
) -> Self {
1672
1687
Self {
1673
1688
root_schema,
1674
1689
arrow_schema,
1675
1690
columns_included,
1676
- file_reader,
1691
+ row_groups : file_reader,
1677
1692
}
1678
1693
}
1679
1694
@@ -1707,10 +1722,10 @@ impl<'a> ArrayReaderBuilder {
1707
1722
context. rep_level ,
1708
1723
context. path . clone ( ) ,
1709
1724
) ) ;
1710
- let page_iterator = Box :: new ( FilePageIterator :: new (
1711
- self . columns_included [ & ( cur_type . as_ref ( ) as * const Type ) ] ,
1712
- self . file_reader . clone ( ) ,
1713
- ) ? ) ;
1725
+
1726
+ let page_iterator = self
1727
+ . row_groups
1728
+ . column_chunks ( self . columns_included [ & ( cur_type . as_ref ( ) as * const Type ) ] ) ? ;
1714
1729
1715
1730
let arrow_type: Option < ArrowType > = self
1716
1731
. get_arrow_field ( & cur_type, context)
@@ -2823,7 +2838,8 @@ mod tests {
2823
2838
#[ test]
2824
2839
fn test_create_array_reader ( ) {
2825
2840
let file = get_test_file ( "nulls.snappy.parquet" ) ;
2826
- let file_reader = Arc :: new ( SerializedFileReader :: new ( file) . unwrap ( ) ) ;
2841
+ let file_reader: Arc < dyn FileReader > =
2842
+ Arc :: new ( SerializedFileReader :: new ( file) . unwrap ( ) ) ;
2827
2843
2828
2844
let file_metadata = file_reader. metadata ( ) . file_metadata ( ) ;
2829
2845
let arrow_schema = parquet_to_arrow_schema (
@@ -2834,9 +2850,9 @@ mod tests {
2834
2850
2835
2851
let array_reader = build_array_reader (
2836
2852
file_reader. metadata ( ) . file_metadata ( ) . schema_descr_ptr ( ) ,
2837
- arrow_schema,
2853
+ Arc :: new ( arrow_schema) ,
2838
2854
vec ! [ 0usize ] . into_iter ( ) ,
2839
- file_reader,
2855
+ Box :: new ( file_reader) ,
2840
2856
)
2841
2857
. unwrap ( ) ;
2842
2858
0 commit comments