@@ -42,7 +42,7 @@ use arrow::datatypes::{
42
42
Float32Type as ArrowFloat32Type , Float64Type as ArrowFloat64Type ,
43
43
Int16Type as ArrowInt16Type , Int32Type as ArrowInt32Type ,
44
44
Int64Type as ArrowInt64Type , Int8Type as ArrowInt8Type , IntervalUnit , Schema ,
45
- Time32MillisecondType as ArrowTime32MillisecondType ,
45
+ SchemaRef , Time32MillisecondType as ArrowTime32MillisecondType ,
46
46
Time32SecondType as ArrowTime32SecondType ,
47
47
Time64MicrosecondType as ArrowTime64MicrosecondType ,
48
48
Time64NanosecondType as ArrowTime64NanosecondType , TimeUnit as ArrowTimeUnit ,
@@ -91,7 +91,7 @@ pub use byte_array::make_byte_array_reader;
91
91
pub use byte_array_dictionary:: make_byte_array_dictionary_reader;
92
92
93
93
/// Array reader reads parquet data into arrow array.
94
- pub trait ArrayReader {
94
+ pub trait ArrayReader : Send {
95
95
fn as_any ( & self ) -> & dyn Any ;
96
96
97
97
/// Returns the arrow type of this array reader.
@@ -117,6 +117,26 @@ pub trait ArrayReader {
117
117
fn get_rep_levels ( & self ) -> Option < & [ i16 ] > ;
118
118
}
119
119
120
+ /// A collection of row groups
121
+ pub trait RowGroupCollection {
122
+ /// Get schema of parquet file.
123
+ fn schema ( & self ) -> Result < SchemaDescPtr > ;
124
+
125
+ /// Returns an iterator over the column chunks for particular column
126
+ fn column_chunks ( & self , i : usize ) -> Result < Box < dyn PageIterator > > ;
127
+ }
128
+
129
+ impl RowGroupCollection for Arc < dyn FileReader > {
130
+ fn schema ( & self ) -> Result < SchemaDescPtr > {
131
+ Ok ( self . metadata ( ) . file_metadata ( ) . schema_descr_ptr ( ) )
132
+ }
133
+
134
+ fn column_chunks ( & self , column_index : usize ) -> Result < Box < dyn PageIterator > > {
135
+ let iterator = FilePageIterator :: new ( column_index, Arc :: clone ( self ) ) ?;
136
+ Ok ( Box :: new ( iterator) )
137
+ }
138
+ }
139
+
120
140
/// Uses `record_reader` to read up to `batch_size` records from `pages`
121
141
///
122
142
/// Returns the number of records read, which can be less than batch_size if
@@ -482,7 +502,7 @@ where
482
502
impl < T , C > ArrayReader for ComplexObjectArrayReader < T , C >
483
503
where
484
504
T : DataType ,
485
- C : Converter < Vec < Option < T :: T > > , ArrayRef > + ' static ,
505
+ C : Converter < Vec < Option < T :: T > > , ArrayRef > + Send + ' static ,
486
506
{
487
507
fn as_any ( & self ) -> & dyn Any {
488
508
self
@@ -1315,9 +1335,9 @@ impl ArrayReader for StructArrayReader {
1315
1335
/// Create array reader from parquet schema, arrow schema, column indices, and a collection of row groups.
1316
1336
pub fn build_array_reader < T > (
1317
1337
parquet_schema : SchemaDescPtr ,
1318
- arrow_schema : Schema ,
1338
+ arrow_schema : SchemaRef ,
1319
1339
column_indices : T ,
1320
- file_reader : Arc < dyn FileReader > ,
1340
+ row_groups : Box < dyn RowGroupCollection > ,
1321
1341
) -> Result < Box < dyn ArrayReader > >
1322
1342
where
1323
1343
T : IntoIterator < Item = usize > ,
@@ -1355,13 +1375,8 @@ where
1355
1375
fields : filtered_root_fields,
1356
1376
} ;
1357
1377
1358
- ArrayReaderBuilder :: new (
1359
- Arc :: new ( proj) ,
1360
- Arc :: new ( arrow_schema) ,
1361
- Arc :: new ( leaves) ,
1362
- file_reader,
1363
- )
1364
- . build_array_reader ( )
1378
+ ArrayReaderBuilder :: new ( Arc :: new ( proj) , arrow_schema, Arc :: new ( leaves) , row_groups)
1379
+ . build_array_reader ( )
1365
1380
}
1366
1381
1367
1382
/// Used to build array reader.
@@ -1371,7 +1386,7 @@ struct ArrayReaderBuilder {
1371
1386
// Key: columns that need to be included in final array builder
1372
1387
// Value: column index in schema
1373
1388
columns_included : Arc < HashMap < * const Type , usize > > ,
1374
- file_reader : Arc < dyn FileReader > ,
1389
+ row_groups : Box < dyn RowGroupCollection > ,
1375
1390
}
1376
1391
1377
1392
/// Used in type visitor.
@@ -1671,13 +1686,13 @@ impl<'a> ArrayReaderBuilder {
1671
1686
root_schema : TypePtr ,
1672
1687
arrow_schema : Arc < Schema > ,
1673
1688
columns_included : Arc < HashMap < * const Type , usize > > ,
1674
- file_reader : Arc < dyn FileReader > ,
1689
+ file_reader : Box < dyn RowGroupCollection > ,
1675
1690
) -> Self {
1676
1691
Self {
1677
1692
root_schema,
1678
1693
arrow_schema,
1679
1694
columns_included,
1680
- file_reader,
1695
+ row_groups : file_reader,
1681
1696
}
1682
1697
}
1683
1698
@@ -1711,10 +1726,10 @@ impl<'a> ArrayReaderBuilder {
1711
1726
context. rep_level ,
1712
1727
context. path . clone ( ) ,
1713
1728
) ) ;
1714
- let page_iterator = Box :: new ( FilePageIterator :: new (
1715
- self . columns_included [ & ( cur_type . as_ref ( ) as * const Type ) ] ,
1716
- self . file_reader . clone ( ) ,
1717
- ) ? ) ;
1729
+
1730
+ let page_iterator = self
1731
+ . row_groups
1732
+ . column_chunks ( self . columns_included [ & ( cur_type . as_ref ( ) as * const Type ) ] ) ? ;
1718
1733
1719
1734
let arrow_type: Option < ArrowType > = self
1720
1735
. get_arrow_field ( & cur_type, context)
@@ -2827,7 +2842,8 @@ mod tests {
2827
2842
#[ test]
2828
2843
fn test_create_array_reader ( ) {
2829
2844
let file = get_test_file ( "nulls.snappy.parquet" ) ;
2830
- let file_reader = Arc :: new ( SerializedFileReader :: new ( file) . unwrap ( ) ) ;
2845
+ let file_reader: Arc < dyn FileReader > =
2846
+ Arc :: new ( SerializedFileReader :: new ( file) . unwrap ( ) ) ;
2831
2847
2832
2848
let file_metadata = file_reader. metadata ( ) . file_metadata ( ) ;
2833
2849
let arrow_schema = parquet_to_arrow_schema (
@@ -2838,9 +2854,9 @@ mod tests {
2838
2854
2839
2855
let array_reader = build_array_reader (
2840
2856
file_reader. metadata ( ) . file_metadata ( ) . schema_descr_ptr ( ) ,
2841
- arrow_schema,
2857
+ Arc :: new ( arrow_schema) ,
2842
2858
vec ! [ 0usize ] . into_iter ( ) ,
2843
- file_reader,
2859
+ Box :: new ( file_reader) ,
2844
2860
)
2845
2861
. unwrap ( ) ;
2846
2862
0 commit comments