diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index aad7213c93a1d..e52ef00c348d6 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2734,7 +2734,78 @@ def to_parquet(
             storage_options=storage_options,
             **kwargs,
         )
+
+    def to_orc(
+        self,
+        path: FilePathOrBuffer | None = None,
+        engine: str = "pyarrow",
+        index: bool | None = None,
+        **kwargs
+    ) -> bytes | None:
+        """
+        Write a DataFrame to the ORC format.
+
+        Parameters
+        ----------
+        path : str or file-like object, default None
+            If a string, it will be used as the root directory path when
+            writing a partitioned dataset. By file-like object, we refer to
+            objects with a write() method, such as a file handle (e.g.
+            via the builtin open function) or io.BytesIO. If path is None,
+            a bytes object is returned.
+        engine : {'pyarrow'}, default 'pyarrow'
+            ORC library to use, or the library itself; a passed module is
+            checked for the name 'pyarrow' and version >= 4.0.0.
+        index : bool, default None
+            If ``True``, include the dataframe's index(es) in the file
+            output. If ``False``, they will not be written to the file.
+            If ``None``, similar to ``infer``, the dataframe's index(es)
+            will be saved. However, instead of being saved as values, the
+            RangeIndex will be stored as a range in the metadata so it
+            doesn't require much space and is faster. Other indexes will
+            be included as columns in the file output.
+        **kwargs
+            Additional keyword arguments passed to the engine.
+
+        Returns
+        -------
+        bytes if no path argument is provided else None
+
+        See Also
+        --------
+        read_orc : Read an ORC file.
+        DataFrame.to_parquet : Write a parquet file.
+        DataFrame.to_csv : Write a csv file.
+        DataFrame.to_sql : Write to a sql table.
+        DataFrame.to_hdf : Write to hdf.
+
+        Notes
+        -----
+        This function requires the `pyarrow
+        <https://arrow.apache.org/docs/python/>`_ library.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})
+        >>> df.to_orc('df.orc')  # doctest: +SKIP
+        >>> pd.read_orc('df.orc')  # doctest: +SKIP
+           col1  col2
+        0     1     3
+        1     2     4
+
+        If you want a buffer with the ORC content, wrap the returned
+        bytes in io.BytesIO:
+
+        >>> import io
+        >>> b = io.BytesIO(df.to_orc())  # doctest: +SKIP
+        >>> b.seek(0)  # doctest: +SKIP
+        0
+        >>> content = b.read()  # doctest: +SKIP
+        """
+        from pandas.io.orc import to_orc
+
+        return to_orc(self, path, engine, index=index, **kwargs)
 
     @Substitution(
         header_type="bool",
         header="Whether to print column labels, default True",
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index 6bdb4df806b5c..d444d38aa2486 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -1,7 +1,9 @@
 """ orc compat """
 from __future__ import annotations
 
+import os
+from tempfile import gettempdir
 from typing import TYPE_CHECKING
 
 from pandas._typing import FilePathOrBuffer
 from pandas.compat._optional import import_optional_dependency
@@ -55,3 +57,76 @@ def read_orc(
     with get_handle(path, "rb", is_text=False) as handles:
         orc_file = orc.ORCFile(handles.handle)
         return orc_file.read(columns=columns, **kwargs).to_pandas()
+
+
+def to_orc(
+    df: DataFrame,
+    path: FilePathOrBuffer | None = None,
+    engine: str = "pyarrow",
+    index: bool | None = None,
+    **kwargs
+) -> bytes | None:
+    """
+    Write a DataFrame to the ORC format.
+
+    Parameters
+    ----------
+    df : DataFrame
+    path : str or file-like object, default None
+        If a string, it will be used as the root directory path when
+        writing a partitioned dataset. By file-like object, we refer to
+        objects with a write() method, such as a file handle (e.g.
+        via the builtin open function) or io.BytesIO. If path is None,
+        a bytes object is returned.
+    engine : {'pyarrow'}, default 'pyarrow'
+        ORC library to use, or the library itself; a passed module is
+        checked for the name 'pyarrow' and version >= 4.0.0.
+    index : bool, default None
+        If ``True``, include the dataframe's index(es) in the file
+        output. If ``False``, they will not be written to the file.
+        If ``None``, similar to ``infer``, the dataframe's index(es)
+        will be saved. However, instead of being saved as values, the
+        RangeIndex will be stored as a range in the metadata so it
+        doesn't require much space and is faster. Other indexes will
+        be included as columns in the file output.
+    **kwargs
+        Additional keyword arguments passed to the engine.
+
+    Returns
+    -------
+    bytes if no path argument is provided else None
+    """
+    # Imported inside the function so that importing pandas does not pull
+    # in the testing machinery as well.
+    import pandas._testing as tm
+
+    if index is None:
+        # By default, preserve the index only when it carries a name.
+        index = df.index.names[0] is not None
+
+    if isinstance(engine, str):
+        engine = import_optional_dependency(engine, min_version="4.0.0")
+        # ``import pyarrow`` does not load the orc submodule, so import it
+        # explicitly before ``engine.orc`` is accessed below.
+        import_optional_dependency("pyarrow.orc")
+    elif getattr(engine, "__name__", None) != "pyarrow" or not hasattr(engine, "orc"):
+        raise ValueError(
+            "engine must be 'pyarrow' or the pyarrow module itself, "
+            "with its orc submodule imported"
+        )
+
+    if path is None:
+        # No path given: write to a unique temporary file, then return the
+        # bytes read back from it (pyarrow closes the file after writing).
+        tmp = os.path.join(gettempdir(), os.urandom(12).hex())
+        with tm.ensure_clean(tmp) as tmp_path:
+            engine.orc.write_table(
+                engine.Table.from_pandas(df, preserve_index=index), tmp_path, **kwargs
+            )
+            with open(tmp_path, "rb") as handle:
+                return handle.read()
+    else:
+        engine.orc.write_table(
+            engine.Table.from_pandas(df, preserve_index=index), path, **kwargs
+        )
+        return None
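
Usage sketch for the API this patch adds (not part of the diff; it assumes pyarrow >= 4.0.0 is installed, and the file name df.orc is arbitrary):

    import pandas as pd

    df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})

    # Writing to a path creates the file on disk and returns None.
    df.to_orc("df.orc")

    # With no path, the serialized ORC bytes are returned instead.
    data = df.to_orc()

    # Round-trip through the existing reader.
    assert pd.read_orc("df.orc").equals(df)

Returning bytes when path is None mirrors DataFrame.to_parquet, so the io.BytesIO pattern shown in the docstring works without the caller touching disk.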