feat(duckdb): add read_xlsx implementation

NickCrews · cpcloud · commit 705aa1661612 · 2025-03-10T16:06:48.000-04:00
diff --git a/ibis/backends/duckdb/__init__.py b/ibis/backends/duckdb/__init__.py
@@ -1074,6 +1074,58 @@ def read_sqlite(
 
         return self.table(table_name)
 
+    @util.experimental
+    def read_xlsx(
+        self,
+        path: str | Path,
+        /,
+        *,
+        sheet: str | None = None,
+        range: str | None = None,
+        **kwargs,
+    ) -> ir.Table:
+        """Expose an Excel file as a table using DuckDB's `read_xlsx`.
+
+        Requires duckdb>=1.2.0. The `excel` extension is loaded and the query is
+        registered as a temporary view under a generated name.
+
+        Parameters
+        ----------
+        path
+            The path to the Excel file.
+        sheet
+            Name of the sheet to read, e.g. 'Sheet3'.
+        range
+            Range of cells to read, e.g. 'A5:Z'.
+        kwargs
+            Additional options forwarded to DuckDB's `read_xlsx`.
+
+        Returns
+        -------
+        ir.Table
+            Table expression for the newly registered view.
+
+        See Also
+        --------
+        [DuckDB's `excel` extension docs](https://duckdb.org/docs/stable/extensions/excel.html)
+        """
+        source = str(path)
+        view_name = util.gen_name("read_xlsx")
+
+        # Only truthy sheet/range values are forwarded, after the user's options.
+        named = {"sheet": sheet, "range": range}
+        kwargs.update((key, val) for key, val in named.items() if val)
+
+        options = [
+            sg.to_identifier(name).eq(sge.convert(value))
+            for name, value in kwargs.items()
+        ]
+
+        self._load_extensions(["excel"])
+        query = sg.select(STAR).from_(self.compiler.f.read_xlsx(source, *options))
+        self._create_temp_view(view_name, query)
+        return self.table(view_name)
+
     def attach(
         self, path: str | Path, name: str | None = None, read_only: bool = False
     ) -> None:
diff --git a/ibis/backends/duckdb/tests/test_io.py b/ibis/backends/duckdb/tests/test_io.py
@@ -441,3 +441,42 @@ def test_read_csv_with_duckdb_specific_types(con):
     columns = {"a": "STRUCT(a INTEGER)"}
     with pytest.raises(duckdb.IOException, match="No files found"):
         con.read_csv(path, columns=columns)
+
+
+@pytest.mark.xfail(
+    LINUX and SANDBOXED,
+    reason="nix on linux cannot download duckdb extensions or data due to sandboxing",
+)
+def test_roundtrip_xlsx(con, tmp_path):
+    """Export functional_alltypes to xlsx and check it reads back the same shape."""
+    expected = con.tables.functional_alltypes
+    workbook = tmp_path / "test.xlsx"
+
+    # Round trip: export with header=True, then re-import the same file.
+    expected.to_xlsx(workbook, header=True)
+    actual = con.read_xlsx(workbook)
+
+    assert actual.columns == expected.columns
+    assert actual.count().execute() == expected.count().execute()
+
+
+@pytest.mark.xfail(
+    LINUX and SANDBOXED,
+    reason="nix on linux cannot download duckdb extensions or data due to sandboxing",
+)
+def test_roundtrip_xlsx_with_sheet(con, tmp_path):
+    """Read back a named sheet restricted to the cell range A1:E3."""
+    workbook = tmp_path / "test.xlsx"
+    con.load_extension("excel")
+
+    source = con.tables.functional_alltypes
+    source.to_xlsx(workbook, sheet="Sheet2", header=True)
+
+    subset = con.read_xlsx(workbook, sheet="Sheet2", range="A1:E3")
+    width = len(subset.columns)
+
+    # A1:E3 covers columns A-E (5 wide) and rows 1-3; row 1 holds the header,
+    # which leaves 2 data rows.
+    assert width == 5
+    assert subset.columns == source.columns[:width]
+    assert subset.count().execute() == 2