Skip to content

Commit 705aa16

Browse files
NickCrews authored and cpcloud committed
feat(duckdb): add read_xlsx implementation
1 parent f42ab45 commit 705aa16

File tree

2 files changed

+91
-0
lines changed

2 files changed

+91
-0
lines changed

ibis/backends/duckdb/__init__.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1074,6 +1074,58 @@ def read_sqlite(
10741074

10751075
return self.table(table_name)
10761076

1077+
@util.experimental
def read_xlsx(
    self,
    path: str | Path,
    /,
    *,
    sheet: str | None = None,
    range: str | None = None,
    **kwargs,
) -> ir.Table:
    """Load an Excel workbook into a DuckDB table.

    This requires duckdb>=1.2.0. The `excel` extension is loaded, the
    file is read through `read_xlsx`, and the result is registered under
    a freshly generated name.

    Parameters
    ----------
    path
        Location of the Excel file. Converted to `str` before use.
    sheet
        Name of the worksheet to read, e.g. 'Sheet3'. Omitted from the
        call when falsy (`None` or an empty string).
    range
        Cell range to read, e.g. 'A5:Z'. Omitted from the call when falsy
        (`None` or an empty string).
    kwargs
        Additional args passed to the backend's read function as
        `name = value` options.

    Returns
    -------
    ir.Table
        The just-registered table.

    See Also
    --------
    [DuckDB's `excel` extension docs](https://duckdb.org/docs/stable/extensions/excel.html)
    """
    source = str(path)
    view_name = util.gen_name("read_xlsx")

    # The explicit keyword parameters are folded into the generic options;
    # falsy values are skipped so they are not forwarded at all.
    for option, value in (("sheet", sheet), ("range", range)):
        if value:
            kwargs[option] = value

    # Render every option as a `name = value` argument expression.
    named_args = []
    for option, value in kwargs.items():
        named_args.append(sg.to_identifier(option).eq(sge.convert(value)))

    self._load_extensions(["excel"])

    reader_call = self.compiler.f.read_xlsx(source, *named_args)
    query = sg.select(STAR).from_(reader_call)
    self._create_temp_view(view_name, query)

    return self.table(view_name)
1128+
10771129
def attach(
10781130
self, path: str | Path, name: str | None = None, read_only: bool = False
10791131
) -> None:

ibis/backends/duckdb/tests/test_io.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -441,3 +441,42 @@ def test_read_csv_with_duckdb_specific_types(con):
441441
columns = {"a": "STRUCT(a INTEGER)"}
442442
with pytest.raises(duckdb.IOException, match="No files found"):
443443
con.read_csv(path, columns=columns)
444+
445+
446+
@pytest.mark.xfail(
    LINUX and SANDBOXED,
    reason="nix on linux cannot download duckdb extensions or data due to sandboxing",
)
def test_roundtrip_xlsx(con, tmp_path):
    """Write `functional_alltypes` to xlsx and read it back unchanged in shape."""
    source = con.tables.functional_alltypes
    xlsx_file = tmp_path / "test.xlsx"

    # Export with a header row so column names survive the round trip.
    source.to_xlsx(xlsx_file, header=True)
    loaded = con.read_xlsx(xlsx_file)

    # Same column names and same number of rows as the source table.
    assert loaded.columns == source.columns
    assert loaded.count().execute() == source.count().execute()
461+
462+
463+
@pytest.mark.xfail(
    LINUX and SANDBOXED,
    reason="nix on linux cannot download duckdb extensions or data due to sandboxing",
)
def test_roundtrip_xlsx_with_sheet(con, tmp_path):
    """Round-trip through a named sheet, reading back only a cell range."""
    xlsx_file = tmp_path / "test.xlsx"

    con.load_extension("excel")

    source = con.tables.functional_alltypes
    source.to_xlsx(xlsx_file, sheet="Sheet2", header=True)

    subset = con.read_xlsx(xlsx_file, sheet="Sheet2", range="A1:E3")
    width = len(subset.columns)

    # A1:E3 spans columns A-E (5 columns) and rows 1-3; row 1 is the
    # header, which leaves 2 data rows.
    assert width == 5
    assert subset.columns == source.columns[:width]
    assert subset.count().execute() == 2

0 commit comments

Comments
 (0)