diff --git a/src/uproot/behaviors/RNTuple.py b/src/uproot/behaviors/RNTuple.py new file mode 100644 index 000000000..7bef644f0 --- /dev/null +++ b/src/uproot/behaviors/RNTuple.py @@ -0,0 +1,1652 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/uproot5/blob/main/LICENSE + +""" +This module defines behaviors for :doc:`uproot.behaviors.RNTuple.RNTuple` and +:doc:`uproot.behaviors.RNTuple.HasFields` (both ``RField`` and +``RNTuple``). + +Most of the functionality of RNTuple-reading is implemented here. + +See :doc:`uproot.models.RNTuple` for deserialization of the ``RNTuple`` +objects themselves. +""" +from __future__ import annotations + +import sys +import warnings +from collections.abc import Mapping + +import numpy + +import uproot +import uproot.interpretation.grouped +import uproot.language.python +import uproot.source.chunk +from uproot._util import no_filter, unset + + +def iterate( + files, + expressions=None, # TODO: Not implemented yet + cut=None, # TODO: Not implemented yet + *, + filter_name=no_filter, + filter_typename=no_filter, + filter_field=no_filter, + aliases=None, # TODO: Not implemented yet + language=uproot.language.python.python_language, # TODO: Not implemented yet + step_size="100 MB", + decompression_executor=None, # TODO: Not implemented yet + library="ak", # TODO: Not implemented yet + ak_add_doc=False, # TODO: Not implemented yet + how=None, + report=False, # TODO: Not implemented yet + allow_missing=False, # TODO: Not implemented yet + # For compatibility reasons we also accepts kwargs meant for TTrees + filter_branch=unset, + interpretation_executor=unset, + custom_classes=unset, + **options, +): + """ + Args: + files: See below. + expressions (None, str, or list of str): Names of ``RFields`` or + aliases to convert to arrays or mathematical expressions of them. + Uses the ``language`` to evaluate. If None, all ``RFields`` + selected by the filters are included. (Not implemented yet.) 
+ cut (None or str): If not None, this expression filters all of the + ``expressions``. (Not implemented yet.) + filter_name (None, glob string, regex string in ``"/pattern/i"`` syntax, function of str \u2192 bool, or iterable of the above): A + filter to select ``TBranches`` by name. + filter_typename (None, glob string, regex string in ``"/pattern/i"`` syntax, function of str \u2192 bool, or iterable of the above): A + filter to select ``TBranches`` by type. + filter_field (None or function of :doc:`uproot.models.RNTuple.RField` \u2192 bool, or None): A + filter to select ``RFields`` using the full + :doc:`uproot.models.RNTuple.RField` object. If the function + returns False or None, the ``RField`` is excluded; if the function + returns True, it is included. + aliases (None or dict of str \u2192 str): Mathematical expressions that + can be used in ``expressions`` or other aliases. + Uses the ``language`` engine to evaluate. (Not implemented yet.) + language (:doc:`uproot.language.Language`): Language used to interpret + the ``expressions`` and ``aliases``. (Not implemented yet.) + step_size (int or str): If an integer, the maximum number of entries to + include in each iteration step; if a string, the maximum memory size + to include. The string must be a number followed by a memory unit, + such as "100 MB". + decompression_executor (None or Executor with a ``submit`` method): The + executor that is used to decompress ``RPages``; if None, a + :doc:`uproot.source.futures.TrivialExecutor` is created. (Not implemented yet.) + library (str or :doc:`uproot.interpretation.library.Library`): The library + that is used to represent arrays. Options are ``"np"`` for NumPy, + ``"ak"`` for Awkward Array, and ``"pd"`` for Pandas. (Not implemented yet.) + ak_add_doc (bool | dict ): If True and ``library="ak"``, add the RField ``name`` + to the Awkward ``__doc__`` parameter of the array. 
+ if dict = {key:value} and ``library="ak"``, add the RField ``value`` to the + Awkward ``key`` parameter of the array. + how (None, str, or container type): Library-dependent instructions + for grouping. The only recognized container types are ``tuple``, + ``list``, and ``dict``. Note that the container *type itself* + must be passed as ``how``, not an instance of that type (i.e. + ``how=tuple``, not ``how=()``). + report (bool): If True, this generator yields + (arrays, :doc:`uproot.behaviors.TBranch.Report`) pairs; if False, + it only yields arrays. The report has data about the ``TFile``, + ``TTree``, and global and local entry ranges. + allow_missing (bool): If True, skip over any files that do not contain + the specified ``RNTuple``. + filter_branch (None or function of :doc:`uproot.models.RNTuple.RField` \u2192 bool): An alias for ``filter_field`` included + for compatibility with software that was used for :doc:`uproot.behaviors.TBranch.TBranch`. This argument should not be used + and will be removed in a future version. + interpretation_executor (None): This argument is not used and is only included for now + for compatibility with software that was used for :doc:`uproot.behaviors.TBranch.TBranch`. This argument should not be used + and will be removed in a future version. + custom_classes (None): This argument is not used and is only included for now + for compatibility with software that was used for :doc:`uproot.behaviors.TBranch.TBranch`. This argument should not be used + and will be removed in a future version. + options: See below. + + Iterates through contiguous chunks of entries from a set of files. + + For example: + + .. code-block:: python + + >>> for array in uproot.iterate("files*.root:ntuple", filter_names=["x", "y"], step_size=100): + ... # each of the following have 100 entries + ... 
array["x"], array["y"] + + Allowed types for the ``files`` parameter: + + * str/bytes: relative or absolute filesystem path or URL, without any colons + other than Windows drive letter or URL schema. + Examples: ``"rel/file.root"``, ``"C:\\abs\\file.root"``, ``"http://where/what.root"`` + * str/bytes: same with an object-within-ROOT path, separated by a colon. + Example: ``"rel/file.root:tdirectory/rntuple"`` + * pathlib.Path: always interpreted as a filesystem path or URL only (no + object-within-ROOT path), regardless of whether there are any colons. + Examples: ``Path("rel:/file.root")``, ``Path("/abs/path:stuff.root")`` + * glob syntax in str/bytes and pathlib.Path. + Examples: ``Path("rel/*.root")``, ``"/abs/*.root:tdirectory/rntuple"`` + * dict: keys are filesystem paths, values are objects-within-ROOT paths. + Example: ``{{"/data_v1/*.root": "rntuple_v1", "/data_v2/*.root": "rntuple_v2"}}`` + * already-open RNTuple objects. + * iterables of the above. + + Options (type; default): (Not implemented yet.) + + * handler (:doc:`uproot.source.chunk.Source` class; None) + * timeout (float for HTTP, int for XRootD; 30) + * max_num_elements (None or int; None) + * num_workers (int; 1) + * use_threads (bool; False on the emscripten platform (i.e. in a web browser), else True) + * num_fallback_workers (int; 10) + * begin_chunk_size (memory_size; 403, the smallest a ROOT file can be) + * minimal_ttree_metadata (bool; True) + + See also :ref:`uproot.behaviors.RNTuple.HasFields.iterate` to iterate + within a single file. + + Other file entry points: + + * :doc:`uproot.reading.open`: opens one file to read any of its objects. + * :doc:`uproot.behaviors.RNTuple.iterate` (this function): iterates through + chunks of contiguous entries in ``RNTuples``. + * :doc:`uproot.behaviors.RNTuple.concatenate`: returns a single concatenated + array from ``RNTuples``. + * :doc:`uproot._dask.dask`: returns an unevaluated Dask array from ``RNTuples``. 
def concatenate(
    files,
    expressions=None,  # TODO: Not implemented yet
    cut=None,  # TODO: Not implemented yet
    *,
    filter_name=no_filter,
    filter_typename=no_filter,
    filter_field=no_filter,
    aliases=None,  # TODO: Not implemented yet
    language=uproot.language.python.python_language,  # TODO: Not implemented yet
    entry_start=None,
    entry_stop=None,
    decompression_executor=None,  # TODO: Not implemented yet
    library="ak",  # TODO: Not implemented yet
    ak_add_doc=False,  # TODO: Not implemented yet
    how=None,
    allow_missing=False,
    # For compatibility reasons we also accepts kwargs meant for TTrees
    filter_branch=unset,
    interpretation_executor=unset,
    custom_classes=unset,
    **options,
):
    """
    Args:
        files: See below.
        expressions (None, str, or list of str): Names of ``RFields`` or
            aliases to convert to arrays or mathematical expressions of them.
            Uses the ``language`` to evaluate. If None, all ``RFields``
            selected by the filters are included. (Not implemented yet.)
        cut (None or str): If not None, this expression filters all of the
            ``expressions``. (Not implemented yet.)
        filter_name (None, glob string, regex string in ``"/pattern/i"`` syntax, function of str \u2192 bool, or iterable of the above): A
            filter to select ``RFields`` by name.
        filter_typename (None, glob string, regex string in ``"/pattern/i"`` syntax, function of str \u2192 bool, or iterable of the above): A
            filter to select ``RFields`` by type.
        filter_field (None or function of :doc:`uproot.models.RNTuple.RField` \u2192 bool, or None): A
            filter to select ``RFields`` using the full
            :doc:`uproot.models.RNTuple.RField` object. If the function
            returns False or None, the ``RField`` is excluded; if the function
            returns True, it is included.
        aliases (None or dict of str \u2192 str): Mathematical expressions that
            can be used in ``expressions`` or other aliases.
            Uses the ``language`` engine to evaluate. (Not implemented yet.)
        language (:doc:`uproot.language.Language`): Language used to interpret
            the ``expressions`` and ``aliases``. (Not implemented yet.)
        entry_start (None or int): The first entry to include. If None, start
            at zero. If negative, count from the end, like a Python slice.
        entry_stop (None or int): The first entry to exclude (i.e. one greater
            than the last entry to include). If None, stop at
            :ref:`uproot.behaviors.RNTuple.HasFields.num_entries`. If negative,
            count from the end, like a Python slice.
        decompression_executor (None or Executor with a ``submit`` method): The
            executor that is used to decompress ``RPages``; if None, a
            :doc:`uproot.source.futures.TrivialExecutor` is created. (Not implemented yet.)
        library (str or :doc:`uproot.interpretation.library.Library`): The library
            that is used to represent arrays. Options are ``"np"`` for NumPy,
            ``"ak"`` for Awkward Array, and ``"pd"`` for Pandas. (Not implemented yet.)
        ak_add_doc (bool | dict ): If True and ``library="ak"``, add the RField ``name``
            to the Awkward ``__doc__`` parameter of the array.
            if dict = {key:value} and ``library="ak"``, add the RField ``value`` to the
            Awkward ``key`` parameter of the array.
        how (None, str, or container type): Library-dependent instructions
            for grouping. The only recognized container types are ``tuple``,
            ``list``, and ``dict``. Note that the container *type itself*
            must be passed as ``how``, not an instance of that type (i.e.
            ``how=tuple``, not ``how=()``).
        allow_missing (bool): If True, skip over any files that do not contain
            the specified ``RNTuple``.
        filter_branch (None or function of :doc:`uproot.models.RNTuple.RField` \u2192 bool): An alias for ``filter_field`` included
            for compatibility with software that was used for :doc:`uproot.behaviors.TBranch.TBranch`. This argument should not be used
            and will be removed in a future version.
        interpretation_executor (None): This argument is not used and is only included for now
            for compatibility with software that was used for :doc:`uproot.behaviors.TBranch.TBranch`. This argument should not be used
            and will be removed in a future version.
        custom_classes (None): This argument is not used and is only included for now
            for compatibility with software that was used for :doc:`uproot.behaviors.TBranch.TBranch`. This argument should not be used
            and will be removed in a future version.
        options: See below.

    Returns an array with data from a set of files concatenated into one.

    The ``entry_start`` and ``entry_stop`` arguments refer to the *global*
    entry range, i.e. the entries of all files laid end to end in the order
    in which the files are given.

    For example:

    .. code-block:: python

        >>> array = uproot.concatenate("files*.root:ntuple", filter_name=["x", "y"])

    Depending on the number of files, the number of selected ``RFields``, and
    the size of your computer's memory, this function might not have enough
    memory to run.

    Allowed types for the ``files`` parameter:

    * str/bytes: relative or absolute filesystem path or URL, without any colons
      other than Windows drive letter or URL schema.
      Examples: ``"rel/file.root"``, ``"C:\\abs\\file.root"``, ``"http://where/what.root"``
    * str/bytes: same with an object-within-ROOT path, separated by a colon.
      Example: ``"rel/file.root:tdirectory/rntuple"``
    * pathlib.Path: always interpreted as a filesystem path or URL only (no
      object-within-ROOT path), regardless of whether there are any colons.
      Examples: ``Path("rel:/file.root")``, ``Path("/abs/path:stuff.root")``
    * glob syntax in str/bytes and pathlib.Path.
      Examples: ``Path("rel/*.root")``, ``"/abs/*.root:tdirectory/rntuple"``
    * dict: keys are filesystem paths, values are objects-within-ROOT paths.
      Example: ``{{"/data_v1/*.root": "rntuple_v1", "/data_v2/*.root": "rntuple_v2"}}``
    * already-open RNTuple objects.
    * iterables of the above.

    Options (type; default): (Not implemented yet.)

    * handler (:doc:`uproot.source.chunk.Source` class; None)
    * timeout (float for HTTP, int for XRootD; 30)
    * max_num_elements (None or int; None)
    * num_workers (int; 1)
    * use_threads (bool; False on the emscripten platform (i.e. in a web browser), else True)
    * num_fallback_workers (int; 10)
    * begin_chunk_size (memory_size; 403, the smallest a ROOT file can be)
    * minimal_ttree_metadata (bool; True)

    Other file entry points:

    * :doc:`uproot.reading.open`: opens one file to read any of its objects.
    * :doc:`uproot.behaviors.RNTuple.iterate`: iterates through chunks of
      contiguous entries in ``RNTuples``.
    * :doc:`uproot.behaviors.RNTuple.concatenate` (this function): returns a
      single concatenated array from ``RNTuples``.
    * :doc:`uproot._dask.dask`: returns an unevaluated Dask array from ``RNTuples``.
    """
    files = uproot._util.regularize_files(files, steps_allowed=False, **options)
    library = uproot.interpretation.library._regularize_library(library)

    # Open every object first: the global entry range can only be regularized
    # once the total number of entries over all files is known.
    all_hasfields = []
    for file_path, object_path in files:
        _hasfields = uproot._util.regularize_object_path(
            file_path, object_path, None, allow_missing, options
        )
        if _hasfields is not None:
            all_hasfields.append(_hasfields)

    total_num_entries = sum(hasfields.num_entries for hasfields in all_hasfields)
    entry_start, entry_stop = uproot.behaviors.TBranch._regularize_entries_start_stop(
        total_num_entries, entry_start, entry_stop
    )

    all_arrays = []
    global_stop = 0
    for hasfields in all_hasfields:
        with hasfields:
            nentries = hasfields.num_entries
            # Advance the running window [global_start, global_stop) up front,
            # so that it stays correct on *every* path through the loop body,
            # including the ``continue`` taken when a file is skipped because
            # of ``allow_missing``. Every file was counted in
            # ``total_num_entries``, so every file must advance the offset.
            global_start = global_stop
            global_stop += nentries

            if (
                global_start <= entry_start < global_stop
                or global_start < entry_stop <= global_stop
            ):
                # overlap, read only the overlapping entries
                local_entry_start = max(
                    0, entry_start - global_start
                )  # need to clip to 0
                local_entry_stop = entry_stop - global_start  # overflows are fine
            elif entry_start >= global_stop or entry_stop <= global_start:
                # outside of this file's range -> skip
                continue
            else:
                # the requested range strictly contains this file: read all entries
                local_entry_start = 0
                local_entry_stop = nentries

            try:
                arrays = hasfields.arrays(
                    expressions=expressions,
                    cut=cut,
                    filter_name=filter_name,
                    filter_typename=filter_typename,
                    filter_field=filter_field,
                    aliases=aliases,
                    language=language,
                    entry_start=local_entry_start,
                    entry_stop=local_entry_stop,
                    decompression_executor=decompression_executor,
                    array_cache=None,
                    library=library,
                    ak_add_doc=ak_add_doc,
                    how=how,
                    filter_branch=filter_branch,
                    interpretation_executor=interpretation_executor,
                )
                arrays = library.global_index(arrays, global_start)
            except uproot.exceptions.KeyInFileError:
                if allow_missing:
                    continue
                else:
                    raise

            all_arrays.append(arrays)

    return library.concatenate(all_arrays)
+ """ + if isinstance(self, uproot.behaviors.RNTuple.RNTuple): + return "." + if self._path is None: + path = self.name + parent = self.parent + field = self + while not isinstance(parent, uproot.behaviors.RNTuple.RNTuple): + path = f"{parent.name}.{path}" + field = parent + parent = field.parent + self._path = path + return self._path + + def to_akform( + self, + *, + filter_name=no_filter, + filter_typename=no_filter, + filter_field=no_filter, + # For compatibility reasons we also accepts kwargs meant for TTrees + filter_branch=unset, + ): + """ + Args: + filter_name (None, glob string, regex string in ``"/pattern/i"`` syntax, function of str \u2192 bool, or iterable of the above): A + filter to select ``RFields``s by name. + filter_typename (None, glob string, regex string in ``"/pattern/i"`` syntax, function of str \u2192 bool, or iterable of the above): A + filter to select ``RFields`` by type. + filter_field (None or function of :doc:`uproot.models.RNTuple.RField` \u2192 bool): A + filter to select ``RFields`` using the full + :doc:`uproot.models.RNTuple.RField` object. The ``RField`` is + included if the function returns True, excluded if it returns False. + filter_branch (None or function of :doc:`uproot.models.RNTuple.RField` \u2192 bool): An alias for ``filter_field`` included + for compatibility with software that was used for :doc:`uproot.behaviors.TBranch.TBranch`. This argument should not be used + and will be removed in a future version. + + Returns the an Awkward Form with the structure of the data in the ``RNTuple`` or ``RField``. 
+ """ + ak = uproot.extras.awkward() + + keys = self.keys( + filter_name=filter_name, + filter_typename=filter_typename, + filter_field=filter_field, + filter_branch=filter_branch, + ) + rntuple = self.ntuple + + top_names = [] + record_list = [] + if self is rntuple: + for field in self.fields: + # the field needs to be in the keys or be a parent of a field in the keys + if any(key.startswith(field.name) for key in keys): + top_names.append(field.name) + record_list.append(rntuple.field_form(field.field_id, keys)) + else: + # Always use the full path for keys + # Also include the field itself + keys = [self.path] + [f"{self.path}.{k}" for k in keys] + # The field needs to be in the keys or be a parent of a field in the keys + if any(key.startswith(self.path) for key in keys): + top_names.append(self.name) + record_list.append(rntuple.field_form(self.field_id, keys)) + + form = ak.forms.RecordForm(record_list, top_names, form_key="toplevel") + return form + + def arrays( + self, + expressions=None, # TODO: Not implemented yet + cut=None, # TODO: Not implemented yet + *, + filter_name=no_filter, + filter_typename=no_filter, + filter_field=no_filter, + aliases=None, # TODO: Not implemented yet + language=uproot.language.python.python_language, # TODO: Not implemented yet + entry_start=None, + entry_stop=None, + decompression_executor=None, # TODO: Not implemented yet + array_cache="inherit", # TODO: Not implemented yet + library="ak", # TODO: Not implemented yet + ak_add_doc=False, + how=None, + # For compatibility reasons we also accepts kwargs meant for TTrees + interpretation_executor=None, + filter_branch=unset, + ): + """ + Args: + expressions (None, str, or list of str): Names of ``RFields`` or + aliases to convert to arrays or mathematical expressions of them. + Uses the ``language`` to evaluate. If None, all ``RFields`` + selected by the filters are included. (Not implemented yet.) 
+ cut (None or str): If not None, this expression filters all of the + ``expressions``. (Not implemented yet.) + filter_name (None, glob string, regex string in ``"/pattern/i"`` syntax, function of str \u2192 bool, or iterable of the above): A + filter to select ``RFields`` by name. + filter_typename (None, glob string, regex string in ``"/pattern/i"`` syntax, function of str \u2192 bool, or iterable of the above): A + filter to select ``RFields`` by type. + filter_branch (None or function of :doc:`uproot.models.RNTuple.RField` \u2192 bool, or None): A + filter to select ``RFields`` using the full + :doc:`uproot.models.RNTuple.RField` object. If the function + returns False or None, the ``RField`` is excluded; if the function + returns True, it is included. + aliases (None or dict of str \u2192 str): Mathematical expressions that + can be used in ``expressions`` or other aliases. + Uses the ``language`` engine to evaluate. (Not implemented yet.) + language (:doc:`uproot.language.Language`): Language used to interpret + the ``expressions`` and ``aliases``. (Not implemented yet.) + entry_start (None or int): The first entry to include. If None, start + at zero. If negative, count from the end, like a Python slice. + entry_stop (None or int): The first entry to exclude (i.e. one greater + than the last entry to include). If None, stop at + :ref:`uproot.behaviors.RNTuple.RNTuple.num_entries`. If negative, + count from the end, like a Python slice. + decompression_executor (None or Executor with a ``submit`` method): The + executor that is used to decompress ``RPages``; if None, the + file's :ref:`uproot.reading.ReadOnlyFile.decompression_executor` + is used. (Not implemented yet.) + array_cache ("inherit", None, MutableMapping, or memory size): Cache of arrays; + if "inherit", use the file's cache; if None, do not use a cache; + if a memory size, create a new cache of this size. (Not implemented yet.) 
+ library (str or :doc:`uproot.interpretation.library.Library`): The library + that is used to represent arrays. Options are ``"np"`` for NumPy, + ``"ak"`` for Awkward Array, and ``"pd"`` for Pandas. (Not implemented yet.) + ak_add_doc (bool | dict ): If True and ``library="ak"``, add the RField ``name`` + to the Awkward ``__doc__`` parameter of the array. + if dict = {key:value} and ``library="ak"``, add the RField ``value`` to the + Awkward ``key`` parameter of the array. + how (None, str, or container type): Library-dependent instructions + for grouping. The only recognized container types are ``tuple``, + ``list``, and ``dict``. Note that the container *type itself* + must be passed as ``how``, not an instance of that type (i.e. + ``how=tuple``, not ``how=()``). + interpretation_executor (None): This argument is not used and is only included for now + for compatibility with software that was used for :doc:`uproot.behaviors.TBranch.TBranch`. This argument should not be used + and will be removed in a future version. + filter_branch (None or function of :doc:`uproot.models.RNTuple.RField` \u2192 bool): An alias for ``filter_field`` included + for compatibility with software that was used for :doc:`uproot.behaviors.TBranch.TBranch`. This argument should not be used + and will be removed in a future version. + + Returns a group of arrays from the ``RNTuple``. + + For example: + + .. code-block:: python + + >>> my_ntuple.arrays() + + + See also :ref:`uproot.behaviors.RNTuple.HasFields.array` to read a single + ``RField`` as an array. + + See also :ref:`uproot.behaviors.RNTuple.HasFields.iterate` to iterate over + the array in contiguous ranges of entries. 
+ """ + entry_start, entry_stop = ( + uproot.behaviors.TBranch._regularize_entries_start_stop( + self.num_entries, entry_start, entry_stop + ) + ) + library = uproot.interpretation.library._regularize_library(library) + + clusters = self.ntuple.cluster_summaries + cluster_starts = numpy.array([c.num_first_entry for c in clusters]) + start_cluster_idx = ( + numpy.searchsorted(cluster_starts, entry_start, side="right") - 1 + ) + stop_cluster_idx = numpy.searchsorted(cluster_starts, entry_stop, side="right") + cluster_num_entries = numpy.sum( + [c.num_entries for c in clusters[start_cluster_idx:stop_cluster_idx]] + ) + + form = self.to_akform( + filter_name=filter_name, + filter_typename=filter_typename, + filter_field=filter_field, + filter_branch=filter_branch, + ) + + # only read columns mentioned in the awkward form + target_cols = [] + container_dict = {} + _recursive_find(form, target_cols) + for key in target_cols: + if "column" in key and "union" not in key: + key_nr = int(key.split("-")[1]) + dtype_byte = self.ntuple.column_records[key_nr].type + + content = self.ntuple.read_col_pages( + key_nr, + range(start_cluster_idx, stop_cluster_idx), + dtype_byte=dtype_byte, + pad_missing_element=True, + ) + if "cardinality" in key: + content = numpy.diff(content) + if dtype_byte == uproot.const.rntuple_col_type_to_num_dict["switch"]: + kindex, tags = uproot.models.RNTuple._split_switch_bits(content) + # Find invalid variants and adjust buffers accordingly + invalid = numpy.flatnonzero(tags == -1) + if len(invalid) > 0: + kindex = numpy.delete(kindex, invalid) + tags = numpy.delete(tags, invalid) + invalid -= numpy.arange(len(invalid)) + optional_index = numpy.insert( + numpy.arange(len(kindex), dtype=numpy.int64), invalid, -1 + ) + else: + optional_index = numpy.arange(len(kindex), dtype=numpy.int64) + container_dict[f"{key}-index"] = optional_index + container_dict[f"{key}-union-index"] = kindex + container_dict[f"{key}-union-tags"] = tags + else: + # don't 
def iterate(
    self,
    expressions=None,  # TODO: Not implemented yet
    cut=None,  # TODO: Not implemented yet
    *,
    filter_name=no_filter,
    filter_typename=no_filter,
    filter_field=no_filter,
    aliases=None,  # TODO: Not implemented yet
    language=uproot.language.python.python_language,  # TODO: Not implemented yet
    entry_start=None,
    entry_stop=None,
    step_size="100 MB",
    decompression_executor=None,  # TODO: Not implemented yet
    library="ak",  # TODO: Not implemented yet
    ak_add_doc=False,  # TODO: Not implemented yet
    how=None,
    report=False,  # TODO: Not implemented yet
    # For compatibility reasons we also accept kwargs meant for TTrees
    interpretation_executor=None,
    filter_branch=unset,
):
    """
    Args:
        expressions (None, str, or list of str): Names of ``RFields`` or
            aliases to convert to arrays or mathematical expressions of them.
            Uses the ``language`` to evaluate. If None, all ``RFields``
            selected by the filters are included. (Not implemented yet.)
        cut (None or str): If not None, this expression filters all of the
            ``expressions``. (Not implemented yet.)
        filter_name (None, glob string, regex string in ``"/pattern/i"`` syntax, function of str → bool, or iterable of the above): A
            filter to select ``RFields`` by name.
        filter_typename (None, glob string, regex string in ``"/pattern/i"`` syntax, function of str → bool, or iterable of the above): A
            filter to select ``RFields`` by type.
        filter_field (None or function of :doc:`uproot.models.RNTuple.RField` → bool): A
            filter to select ``RFields`` using the full
            :doc:`uproot.models.RNTuple.RField` object. If the function
            returns False or None, the ``RField`` is excluded; if the function
            returns True, it is included.
        aliases (None or dict of str → str): Mathematical expressions that
            can be used in ``expressions`` or other aliases.
            Uses the ``language`` engine to evaluate. (Not implemented yet.)
        language (:doc:`uproot.language.Language`): Language used to interpret
            the ``expressions`` and ``aliases``. (Not implemented yet.)
        entry_start (None or int): The first entry to include. If None, start
            at zero. If negative, count from the end, like a Python slice.
        entry_stop (None or int): The first entry to exclude (i.e. one greater
            than the last entry to include). If None, stop at
            :ref:`uproot.behaviors.RNTuple.HasFields.num_entries`. If negative,
            count from the end, like a Python slice.
        step_size (int or str): If an integer, the maximum number of entries to
            include in each iteration step; if a string, the maximum memory size
            to include. The string must be a number followed by a memory unit,
            such as "100 MB".
        decompression_executor (None or Executor with a ``submit`` method): The
            executor that is used to decompress ``RPages``; if None, the
            file's :ref:`uproot.reading.ReadOnlyFile.decompression_executor`
            is used. (Not implemented yet.)
        library (str or :doc:`uproot.interpretation.library.Library`): The library
            that is used to represent arrays. Options are ``"np"`` for NumPy,
            ``"ak"`` for Awkward Array, and ``"pd"`` for Pandas. (Not implemented yet.)
        ak_add_doc (bool | dict ): If True and ``library="ak"``, add the RField ``name``
            to the Awkward ``__doc__`` parameter of the array.
            if dict = {key:value} and ``library="ak"``, add the RField ``value`` to the
            Awkward ``key`` parameter of the array. (Not implemented yet.)
        how (None, str, or container type): Library-dependent instructions
            for grouping. The only recognized container types are ``tuple``,
            ``list``, and ``dict``. Note that the container *type itself*
            must be passed as ``how``, not an instance of that type (i.e.
            ``how=tuple``, not ``how=()``).
        report (bool): If True, this generator yields
            (arrays, :doc:`uproot.behaviors.TBranch.Report`) pairs; if False,
            it only yields arrays. The report has data about the ``TFile``,
            ``RNTuple``, and global and local entry ranges. (Not implemented yet.)
        interpretation_executor (None): This argument is not used and is only included for now
            for compatibility with software that was used for :doc:`uproot.behaviors.TBranch.TBranch`. This argument should not be used
            and will be removed in a future version.
        filter_branch (None or function of :doc:`uproot.models.RNTuple.RField` → bool): An alias for ``filter_field`` included
            for compatibility with software that was used for :doc:`uproot.behaviors.TBranch.TBranch`. This argument should not be used
            and will be removed in a future version.

    Iterates through contiguous chunks of entries from the ``RNTuple``,
    restricted to the half-open range ``[entry_start, entry_stop)``.

    For example:

    .. code-block:: python

        >>> for array in rntuple.iterate(filter_name=["x", "y"], step_size=100):
        ...     # each of the following have 100 entries
        ...     array["x"], array["y"]

    See also :ref:`uproot.behaviors.RNTuple.HasFields.arrays` to read
    everything in a single step, without iteration.

    See also :doc:`uproot.behaviors.RNTuple.iterate` to iterate over many
    files.
    """
    entry_start, entry_stop = (
        uproot.behaviors.TBranch._regularize_entries_start_stop(
            self.ntuple.num_entries, entry_start, entry_stop
        )
    )

    akform = self.to_akform(
        filter_name=filter_name,
        filter_typename=filter_typename,
        filter_field=filter_field,
        filter_branch=filter_branch,
    )

    # A memory-size string is converted into a number of entries here; an
    # integer step_size is passed through unchanged.
    step_size = _regularize_step_size(
        self.ntuple, akform, step_size, entry_start, entry_stop
    )

    # Walk only the requested entry range. The previous loop always ran from
    # 0 to self.num_entries, so entry_start/entry_stop had no effect.
    # TODO: This can be done more efficiently
    for start in range(entry_start, entry_stop, step_size):
        yield self.arrays(
            filter_name=filter_name,
            filter_typename=filter_typename,
            filter_field=filter_field,
            entry_start=start,
            # clamp the final step so it never reads past entry_stop
            entry_stop=min(start + step_size, entry_stop),
            library=library,
            how=how,
            filter_branch=filter_branch,
        )
def values(
    self,
    *,
    filter_name=no_filter,
    filter_typename=no_filter,
    filter_field=no_filter,
    recursive=True,
    # For compatibility reasons we also accept kwargs meant for TTrees
    filter_branch=unset,
):
    """
    Args:
        filter_name (None, glob string, regex string in ``"/pattern/i"`` syntax, function of str → bool, or iterable of the above): A
            filter to select ``RFields`` by name.
        filter_typename (None, glob string, regex string in ``"/pattern/i"`` syntax, function of str → bool, or iterable of the above): A
            filter to select ``RFields`` by type.
        filter_field (None or function of :doc:`uproot.models.RNTuple.RField` → bool): A
            filter to select ``RFields`` using the full
            :doc:`uproot.models.RNTuple.RField` object. The ``RField`` is
            included if the function returns True, excluded if it returns False.
        recursive (bool): If True, descend into any nested subfields.
            If False, only return the top fields.
        filter_branch (None or function of :doc:`uproot.models.RNTuple.RField` → bool): An alias for ``filter_field`` included
            for compatibility with software that was used for :doc:`uproot.behaviors.TBranch.TBranch`. This argument should not be used
            and will be removed in a future version.

    Returns the subfields as a list of
    :doc:`uproot.models.RNTuple.RField`.

    (Note: with ``recursive=False``, this is the same as
    :ref:`uproot.behaviors.RNTuple.HasFields.fields`.)
    """
    # Materialize the lazy iterator produced by itervalues.
    selected = self.itervalues(
        filter_name=filter_name,
        filter_typename=filter_typename,
        filter_field=filter_field,
        recursive=recursive,
        filter_branch=filter_branch,
    )
    return [*selected]
def typenames(
    self,
    *,
    filter_name=no_filter,
    filter_typename=no_filter,
    filter_field=no_filter,
    recursive=True,
    full_paths=True,
    # For compatibility reasons we also accept kwargs meant for TTrees
    filter_branch=unset,
):
    """
    Args:
        filter_name (None, glob string, regex string in ``"/pattern/i"`` syntax, function of str → bool, or iterable of the above): A
            filter to select ``RFields`` by name.
        filter_typename (None, glob string, regex string in ``"/pattern/i"`` syntax, function of str → bool, or iterable of the above): A
            filter to select ``RFields`` by type.
        filter_field (None or function of :doc:`uproot.models.RNTuple.RField` → bool): A
            filter to select ``RFields`` using the full
            :doc:`uproot.models.RNTuple.RField` object. The ``RField`` is
            included if the function returns True, excluded if it returns False.
        recursive (bool): If True, descend into any nested subfields.
            If False, only return the names of the top fields.
        full_paths (bool): If True, include the full path to each subfield
            with periods (``.``); otherwise, use the descendant's name as
            the output name.
        filter_branch (None or function of :doc:`uproot.models.RNTuple.RField` → bool): An alias for ``filter_field`` included
            for compatibility with software that was used for :doc:`uproot.behaviors.TBranch.TBranch`. This argument should not be used
            and will be removed in a future version.

    Returns (name, typename) pairs of the subfields as a dict of
    str → str.
    """
    pairs = self.itertypenames(
        filter_name=filter_name,
        filter_typename=filter_typename,
        filter_field=filter_field,
        recursive=recursive,
        full_paths=full_paths,
        filter_branch=filter_branch,
    )
    # Later pairs with a repeated name overwrite earlier ones, as dict() would.
    return {name: typename for name, typename in pairs}
def itervalues(
    self,
    *,
    filter_name=no_filter,
    filter_typename=no_filter,
    filter_field=no_filter,
    recursive=True,
    # For compatibility reasons we also accept kwargs meant for TTrees
    filter_branch=unset,
):
    """
    Args:
        filter_name (None, glob string, regex string in ``"/pattern/i"`` syntax, function of str → bool, or iterable of the above): A
            filter to select ``RFields`` by name.
        filter_typename (None, glob string, regex string in ``"/pattern/i"`` syntax, function of str → bool, or iterable of the above): A
            filter to select ``RFields`` by type.
        filter_field (None or function of :doc:`uproot.models.RNTuple.RField` → bool): A
            filter to select ``RFields`` using the full
            :doc:`uproot.models.RNTuple.RField` object. The ``RField`` is
            included if the function returns True, excluded if it returns False.
        recursive (bool): If True, descend into any nested subfields.
            If False, only return the top fields.
        filter_branch (None or function of :doc:`uproot.models.RNTuple.RField` → bool): An alias for ``filter_field`` included
            for compatibility with software that was used for :doc:`uproot.behaviors.TBranch.TBranch`. This argument should not be used
            and will be removed in a future version.

    Returns the subfields as an iterator over
    :doc:`uproot.models.RNTuple.RField`.

    (Note: with ``recursive=False``, this is the same as
    :ref:`uproot.behaviors.RNTuple.HasFields.fields`.)
    """
    # Names are discarded, so short names are requested (full_paths=False).
    matches = self.iteritems(
        filter_name=filter_name,
        filter_typename=filter_typename,
        filter_field=filter_field,
        recursive=recursive,
        full_paths=False,
        filter_branch=filter_branch,
    )
    yield from (subfield for _name, subfield in matches)
def itertypenames(
    self,
    *,
    filter_name=no_filter,
    filter_typename=no_filter,
    filter_field=no_filter,
    recursive=True,
    full_paths=True,
    # For compatibility reasons we also accept kwargs meant for TTrees
    filter_branch=unset,
):
    """
    Args:
        filter_name (None, glob string, regex string in ``"/pattern/i"`` syntax, function of str → bool, or iterable of the above): A
            filter to select ``RFields`` by name.
        filter_typename (None, glob string, regex string in ``"/pattern/i"`` syntax, function of str → bool, or iterable of the above): A
            filter to select ``RFields`` by type.
        filter_field (None or function of :doc:`uproot.models.RNTuple.RField` → bool): A
            filter to select ``RFields`` using the full
            :doc:`uproot.models.RNTuple.RField` object. The ``RField`` is
            included if the function returns True, excluded if it returns False.
        recursive (bool): If True, descend into any nested subfields.
            If False, only return the names of the top fields.
        full_paths (bool): If True, include the full path to each subfield
            with periods (``.``); otherwise, use the descendant's name as
            the output name.
        filter_branch (None or function of :doc:`uproot.models.RNTuple.RField` → bool): An alias for ``filter_field`` included
            for compatibility with software that was used for :doc:`uproot.behaviors.TBranch.TBranch`. This argument should not be used
            and will be removed in a future version.

    Returns (name, typename) pairs of the subfields as an iterator over
    2-tuples of (str, str).
    """
    for k, v in self.iteritems(
        filter_name=filter_name,
        filter_typename=filter_typename,
        # filter_field was previously not forwarded, so it was silently
        # ignored here (and therefore in typenames() as well).
        filter_field=filter_field,
        filter_branch=filter_branch,
        recursive=recursive,
        full_paths=full_paths,
    ):
        yield k, v.typename
def num_entries_for(
    self,
    memory_size,
    expressions=None,  # TODO: Not implemented yet
    cut=None,  # TODO: Not implemented yet
    *,
    filter_name=no_filter,
    filter_typename=no_filter,
    filter_field=no_filter,
    aliases=None,  # TODO: Not implemented yet
    language=uproot.language.python.python_language,  # TODO: Not implemented yet
    entry_start=None,
    entry_stop=None,
    # For compatibility reasons we also accept kwargs meant for TTrees
    filter_branch=unset,
):
    """
    Args:
        memory_size (int or str): An integer is interpreted as a number of
            bytes and a string must be a number followed by a unit, such as
            "100 MB".
        expressions (None, str, or list of str): Names of ``RFields`` or
            aliases to convert to arrays or mathematical expressions of them.
            Uses the ``language`` to evaluate. If None, all ``RFields``
            selected by the filters are included. (Not implemented yet.)
        cut (None or str): If not None, this expression filters all of the
            ``expressions``. (Not implemented yet.)
        filter_name (None, glob string, regex string in ``"/pattern/i"`` syntax, function of str → bool, or iterable of the above): A
            filter to select ``RFields`` by name.
        filter_typename (None, glob string, regex string in ``"/pattern/i"`` syntax, function of str → bool, or iterable of the above): A
            filter to select ``RFields`` by type.
        filter_field (None or function of :doc:`uproot.models.RNTuple.RField` → bool): A
            filter to select ``RFields`` using the full
            :doc:`uproot.models.RNTuple.RField` object. The ``RField`` is
            included if the function returns True, excluded if it returns False.
        aliases (None or dict of str → str): Mathematical expressions that
            can be used in ``expressions`` or other aliases.
            Uses the ``language`` engine to evaluate. (Not implemented yet.)
        language (:doc:`uproot.language.Language`): Language used to interpret
            the ``expressions`` and ``aliases``. (Not implemented yet.)
        entry_start (None or int): The first entry to include. If None, start
            at zero. If negative, count from the end, like a Python slice.
        entry_stop (None or int): The first entry to exclude (i.e. one greater
            than the last entry to include). If None, stop at
            :ref:`uproot.behaviors.RNTuple.HasFields.num_entries`. If negative,
            count from the end, like a Python slice.
        filter_branch (None or function of :doc:`uproot.models.RNTuple.RField` → bool): An alias for ``filter_field`` included
            for compatibility with software that was used for :doc:`uproot.behaviors.TBranch.TBranch`. This argument should not be used
            and will be removed in a future version.

    Returns an *approximate* step size as a number of entries to read
    a given ``memory_size`` in each step.

    This method does not read the ``RField`` data to arrive at its
    estimate; it sums the on-disk page sizes recorded in the metadata of
    the already-loaded ``RNTuple`` for the selected columns.

    This is the algorithm that
    :ref:`uproot.behaviors.RNTuple.HasFields.iterate` uses to convert a
    ``step_size`` expressed in memory units into a number of entries.
    """
    target_num_bytes = uproot._util.memory_size(memory_size)

    entry_start, entry_stop = (
        uproot.behaviors.TBranch._regularize_entries_start_stop(
            self.ntuple.num_entries, entry_start, entry_stop
        )
    )

    akform = self.to_akform(
        filter_name=filter_name,
        filter_typename=filter_typename,
        filter_field=filter_field,
        filter_branch=filter_branch,
    )

    # Hand the estimator the owning RNTuple, exactly as iterate() does: it
    # reads cluster_summaries and page_link_list from its first argument,
    # and passing ``self`` here was inconsistent with that call path.
    return _num_entries_for(
        self.ntuple, akform, target_num_bytes, entry_start, entry_stop
    )
in where: + this = self + try: + for piece in where.split("."): + if piece != "": + this = this[piece] + except uproot.KeyInFileError: + raise uproot.KeyInFileError( + original_where, + keys=self.keys(recursive=recursive), + file_path=self._file.file_path, # TODO + object_path=self.object_path, # TODO + ) from None + return this + + elif recursive: + got = _get_recursive(self, where) + if got is not None: + return got + else: + raise uproot.KeyInFileError( + original_where, + keys=self.keys(recursive=recursive), + file_path=self._file.file_path, + object_path=self.object_path, + ) + + else: + raise uproot.KeyInFileError( + original_where, + keys=self.keys(recursive=recursive), + file_path=self._file.file_path, + object_path=self.object_path, + ) + + def __iter__(self): + yield from self.fields + + def __len__(self): + return len(self.fields) + + def show( + self, + *, + filter_name=no_filter, + filter_typename=no_filter, + filter_field=no_filter, + recursive=True, + name_width=20, + typename_width=24, + path_width=30, + stream=sys.stdout, + # For compatibility reasons we also accepts kwargs meant for TTrees + full_paths=unset, + filter_branch=unset, + interpretation_width=unset, + ): + """ + Args: + filter_name (None, glob string, regex string in ``"/pattern/i"`` syntax, function of str \u2192 bool, or iterable of the above): A + filter to select ``RFields`` by name. + filter_typename (None, glob string, regex string in ``"/pattern/i"`` syntax, function of str \u2192 bool, or iterable of the above): A + filter to select ``RFields`` by type. + filter_field (None or function of :doc:`uproot.models.RNTuple.RField` \u2192 bool, or None): A + filter to select ``RFields`` using the full + :doc:`uproot.models.RNTuple.RField` object. The ``RField`` is + included if the function returns True, excluded if it returns False. + recursive (bool): If True, recursively descend into subfields. + name_width (int): Number of characters to reserve for the ``TBranch`` + names. 
+ typename_width (int): Number of characters to reserve for the C++ + typenames. + interpretation_width (int): Number of characters to reserve for the + :doc:`uproot.interpretation.Interpretation` displays. + stream (object with a ``write(str)`` method): Stream to write the + output to. + full_paths (None): This argument is not used and is only included for now + for compatibility with software that was used for :doc:`uproot.behaviors.TBranch.TBranch`. This argument should not be used + and will be removed in a future version. + filter_branch (None or function of :doc:`uproot.models.RNTuple.RField` \u2192 bool): An alias for ``filter_field`` included + for compatibility with software that was used for :doc:`uproot.behaviors.TBranch.TBranch`. This argument should not be used + and will be removed in a future version. + interpretation_width (None): This argument is not used and is only included for now + for compatibility with software that was used for :doc:`uproot.behaviors.TBranch.TBranch`. This argument should not be used + and will be removed in a future version. + + Interactively display the ``RFields``. + + For example, + + .. code-block:: + + >>> my_ntuple.show() + name | typename | path + ---------------------+--------------------------+------------------------------- + my_int | std::int64_t | my_int + my_vec | std::vector name_width: + name = name[: name_width - 3] + "..." + if len(typename) > typename_width: + typename = typename[: typename_width - 3] + "..." + if len(path) > path_width: + path = path[: path_width - 3] + "..." 
class RNTuple(HasFields):
    """
    Behaviors for an ``RNTuple``, which mostly consist of array-reading methods.

    Since a :doc:`uproot.behaviors.RNTuple.RNTuple` is a
    :doc:`uproot.behaviors.RNTuple.HasFields`, it is also a Python
    ``Mapping``, which uses square bracket syntax to extract subfields:

    .. code-block:: python

        my_rntuple["field"]
        my_rntuple["field"]["subfield"]
        my_rntuple["field.subfield"]
        my_rntuple["field.subfield.subsubfield"]
        my_rntuple["field/subfield/subsubfield"]
        my_rntuple["field\\subfield\\subsubfield"]
    """

    def __repr__(self):
        # The field count is only shown when there is at least one field.
        address = f"0x{id(self):012x}"
        if len(self) != 0:
            return f"<{self.classname} {self.name!r} ({len(self)} fields) at {address}>"
        return f"<{self.classname} {self.name!r} at {address}>"

    @property
    def name(self):
        """
        Name of the ``RNTuple``, taken from its header.
        """
        header = self.header
        return header.ntuple_name

    @property
    def object_path(self):
        """
        Object path of the ``RNTuple``, as reported by its parent.
        """
        owner = self.parent
        return owner.object_path

    @property
    def cache_key(self):
        """
        String that uniquely specifies this ``RNTuple`` in its path, to use as
        part of object and array cache keys.
        """
        owner = self.parent
        return f"{owner.cache_key}{self.name};{owner.fCycle}"
def _get_recursive(hasfields, where):
    """
    Depth-first search for a field named ``where`` below ``hasfields``.

    The direct children of each visited node are indexed by name in that
    node's ``_lookup`` attribute, which is built on first use. A direct
    child wins over deeper descendants; otherwise children are searched in
    order. Returns the matching field, or None if there is none.
    """
    by_name = hasfields._lookup
    if by_name is None:
        # First visit of this node: cache its name -> child mapping.
        by_name = hasfields._lookup = {f.name: f for f in hasfields.fields}

    found = by_name.get(where)
    if found is not None:
        return found

    for child in hasfields.fields:
        found = _get_recursive(child, where)
        if found is not None:
            return found
    return None
def _recursive_find(form, res):
    """
    Collect the ``form_key`` of ``form`` and of every nested form into ``res``.

    ``res`` is a list that is appended to in place; nothing is returned.
    Nested forms are reached through a ``contents`` attribute and through a
    ``content`` attribute whose value is an Awkward ``Form``.
    """
    ak = uproot.extras.awkward()

    if hasattr(form, "form_key"):
        res.append(form.form_key)

    if hasattr(form, "contents"):
        for subform in form.contents:
            _recursive_find(subform, res)

    if hasattr(form, "content"):
        inner = form.content
        # Only descend when ``content`` is itself a Form.
        if issubclass(type(inner), ak.forms.Form):
            _recursive_find(inner, res)
int(key.split("-")[1]) - dtype_byte = in_ntuple.ntuple.column_records[key_nr].type - - content = in_ntuple.ntuple.read_col_pages( - key_nr, - range(start_cluster_idx, stop_cluster_idx), - dtype_byte=dtype_byte, - pad_missing_element=True, - ) - if "cardinality" in key: - content = numpy.diff(content) - if dtype_byte == uproot.const.rntuple_col_type_to_num_dict["switch"]: - kindex, tags = _split_switch_bits(content) - # Find invalid variants and adjust buffers accordingly - invalid = numpy.flatnonzero(tags == -1) - if len(invalid) > 0: - kindex = numpy.delete(kindex, invalid) - tags = numpy.delete(tags, invalid) - invalid -= numpy.arange(len(invalid)) - optional_index = numpy.insert( - numpy.arange(len(kindex), dtype=numpy.int64), invalid, -1 - ) - else: - optional_index = numpy.arange(len(kindex), dtype=numpy.int64) - container_dict[f"{key}-index"] = optional_index - container_dict[f"{key}-union-index"] = kindex - container_dict[f"{key}-union-tags"] = tags - else: - # don't distinguish data and offsets - container_dict[f"{key}-data"] = content - container_dict[f"{key}-offsets"] = content - cluster_offset = cluster_starts[start_cluster_idx] - entry_start -= cluster_offset - entry_stop -= cluster_offset - return ak.from_buffers( - form, cluster_num_entries, container_dict, allow_noncanonical_form=True - )[entry_start:entry_stop] - - -def _num_entries_for(in_ntuple, target_num_bytes, filter_name): - # TODO: part of this is also done in _arrays, so we should refactor this - # TODO: there might be a better way to estimate the number of entries - entry_stop = in_ntuple.ntuple.num_entries - - clusters = in_ntuple.ntuple.cluster_summaries - cluster_starts = numpy.array([c.num_first_entry for c in clusters]) - - start_cluster_idx = numpy.searchsorted(cluster_starts, 0, side="right") - 1 - stop_cluster_idx = numpy.searchsorted(cluster_starts, entry_stop, side="right") - - form = in_ntuple.to_akform().select_columns( - filter_name, prune_unions_and_records=False - ) - 
target_cols = [] - _recursive_find(form, target_cols) - - total_bytes = 0 - for key in target_cols: - if "column" in key and "union" not in key: - key_nr = int(key.split("-")[1]) - for cluster in range(start_cluster_idx, stop_cluster_idx): - pages = in_ntuple.ntuple.page_link_list[cluster][key_nr].pages - total_bytes += sum(page.locator.num_bytes for page in pages) - - total_entries = entry_stop - if total_bytes == 0: - num_entries = 0 - else: - num_entries = round(target_num_bytes * total_entries / total_bytes) - if num_entries <= 0: - return 1 - else: - return num_entries - - -def _regularize_step_size(in_ntuple, step_size, filter_name): - if uproot._util.isint(step_size): - return step_size - target_num_bytes = uproot._util.memory_size( - step_size, - "number of entries or memory size string with units " - f"(such as '100 MB') required, not {step_size!r}", - ) - return _num_entries_for(in_ntuple, target_num_bytes, filter_name) - - -class Model_ROOT_3a3a_RNTuple(uproot.model.Model): +class Model_ROOT_3a3a_RNTuple(uproot.behaviors.RNTuple.RNTuple, uproot.model.Model): """ A versionless :doc:`uproot.model.Model` for ``ROOT::RNTuple``. 
""" - @property - def _keys(self): - keys = [] - field_records = self.field_records - for i, fr in enumerate(field_records): - if fr.parent_field_id == i and not fr.field_name.startswith("_"): - keys.append(fr.field_name) - return keys - - # TODO: this is still missing a lot of functionality - def keys( - self, - *, - filter_name=None, - filter_typename=None, - filter_field=None, - recursive=False, - full_paths=True, - **_, # For compatibility reasons we just ignore other kwargs - ): - filter_name = uproot._util.regularize_filter(filter_name) - return [key for key in self._keys if filter_name(key)] - - @property - def _key_indices(self): - indices = [] - field_records = self.field_records - for i, fr in enumerate(field_records): - if fr.parent_field_id == i and not fr.field_name.startswith("_"): - indices.append(i) - return indices - - @property - def _key_to_index(self): - d = {} - field_records = self.field_records - for i, fr in enumerate(field_records): - if fr.parent_field_id == i and not fr.field_name.startswith("_"): - d[fr.field_name] = i - return d - def read_members(self, chunk, cursor, context, file): if uproot._awkwardforth.get_forth_obj(context) is not None: raise uproot.interpretation.objects.CannotBeForth() @@ -287,7 +128,22 @@ def read_members(self, chunk, cursor, context, file): self._cluster_summaries = None self._page_link_list = None - self.ntuple = self + self._ntuple = self + self._fields = None + self._all_fields = None + self._lookup = None + + @property + def all_fields(self): + """ + The full list of fields in the RNTuple. + + The fields are sorted in the same way they appear in the + file, so the field at index n corresponds to the field with ``field_id==n``. 
+ """ + if self._all_fields is None: + self._all_fields = [RField(i, self) for i in range(len(self.field_records))] + return self._all_fields def _prepare_header_chunk(self): context = {} @@ -335,6 +191,11 @@ def _prepare_footer_chunk(self): @property def header(self): + """ + The header of the RNTuple. + + This provides low level access to all the metadata contained in the header. + """ if self._header is None: if not self._header_chunk_ready: self._prepare_header_chunk() @@ -351,6 +212,11 @@ def header(self): @property def field_records(self): + """ + The complete list of field records in the RNTuple. + + This includes the fields from the header and from schema extensions in the footer. + """ if self._field_records is None: self._field_records = list(self.header.field_records) self._field_records.extend(self.footer.extension_links.field_records) @@ -358,12 +224,20 @@ def field_records(self): @property def field_names(self): + """ + The list of names of the fields in the RNTuple. + """ if self._field_names is None: self._field_names = [r.field_name for r in self.field_records] return self._field_names @property def column_records(self): + """ + The complete list of column records in the RNTuple. + + This includes the columns from the header and from schema extensions in the footer. + """ if self._column_records is None: self._column_records = list(self.header.column_records) self._column_records.extend(self.footer.extension_links.column_records) @@ -373,6 +247,9 @@ def column_records(self): @property def alias_column_records(self): + """ + The list of alias column records in the RNTuple. + """ if self._alias_column_records is None: self._alias_column_records = list(self.header.alias_column_records) self._alias_column_records.extend( @@ -410,6 +287,11 @@ def _related_ids(self): @property def footer(self): + """ + The footer of the RNTuple. + + This provides low level access to all the metadata contained in the footer. 
+ """ if self._footer is None: if not self._footer_chunk_ready: self._prepare_footer_chunk() @@ -429,6 +311,9 @@ def footer(self): @property def cluster_summaries(self): + """ + The list of cluster summaries in the RNTuple. + """ if self._cluster_summaries is None: self._cluster_summaries = [] for pl in self.page_list_envelopes: @@ -437,69 +322,24 @@ def cluster_summaries(self): @property def page_link_list(self): + """ + The list of page links in the RNTuple. + """ if self._page_link_list is None: self._page_link_list = [] for pl in self.page_list_envelopes: self._page_link_list.extend(pl.pagelinklist) return self._page_link_list - @property - def num_entries(self): - if self._num_entries is None: - self._num_entries = sum(x.num_entries for x in self.cluster_summaries) - return self._num_entries - - def __len__(self): - if self._length is None: - self._length = len(self.keys()) - return self._length - - def __repr__(self): - if len(self) == 0: - return f"" - else: - return ( - f"" - ) - - def __getitem__(self, where): - # original_where = where - - if uproot._util.isint(where): - index = self._key_indices[where] - elif isinstance(where, str): - where = uproot._util.ensure_str(where) - index = self._key_to_index[where] - else: - raise TypeError(f"where must be an integer or a string, not {where!r}") - - # TODO: Implement path support - - return RNTupleField(index, self) - - @property - def name(self): - """ - Name of the ``RNTuple``. + def read_locator(self, loc, uncomp_size, context): """ - return self.parent.fName + Args: + loc (:doc:`uproot.models.RNTuple.MetaData`): The locator of the page. + uncomp_size (int): The size in bytes of the uncompressed data. + context (dict): Auxiliary data used in deserialization. - @property - def object_path(self): - """ - Object path of the ``RNTuple``. + Returns a tuple of the decompressed chunk and the updated cursor. 
""" - return self.parent.object_path - - @property - def cache_key(self): - """ - String that uniquely specifies this ``RNTuple`` in its path, to use as - part of object and array cache keys. - """ - return f"{self.parent.cache_key}{self.name};{self.parent.fCycle}" - - def read_locator(self, loc, uncomp_size, context): cursor = uproot.source.cursor.Cursor(loc.offset) chunk = self.file.source.chunk(loc.offset, loc.offset + loc.num_bytes) if loc.num_bytes < uncomp_size: @@ -513,6 +353,9 @@ def read_locator(self, loc, uncomp_size, context): @property def page_list_envelopes(self): + """ + The list of page list envelopes in the RNTuple. + """ context = {} if not self._page_list_envelopes: @@ -529,6 +372,15 @@ def page_list_envelopes(self): return self._page_list_envelopes def base_col_form(self, cr, col_id, parameters=None, cardinality=False): + """ + Args: + cr (:doc:`uproot.models.RNTuple.MetaData`): The column record. + col_id (int): The column id. + parameters (dict): The parameters to pass to the ``NumpyForm``. + cardinality (bool): Whether the column is a cardinality column. + + Returns an Awkward Form describing the column if applicable, or a form key otherwise. + """ ak = uproot.extras.awkward() form_key = f"column-{col_id}" + ("-cardinality" if cardinality else "") @@ -549,6 +401,12 @@ def base_col_form(self, cr, col_id, parameters=None, cardinality=False): ) def col_form(self, field_id): + """ + Args: + field_id (int): The field id. + + Returns an Awkward Form describing the column if applicable, or a form key otherwise. + """ ak = uproot.extras.awkward() cfid = field_id @@ -587,12 +445,18 @@ def col_form(self, field_id): else: raise (RuntimeError(f"Missing special case: {field_id}")) - def field_form(self, this_id, seen): + def field_form(self, this_id, keys): + """ + Args: + this_id (int): The field id. + keys (list): The list of keys to search for. + + Returns an Awkward Form describing the field. 
+ """ ak = uproot.extras.awkward() field_records = self.field_records this_record = field_records[this_id] - seen.add(this_id) structural_role = this_record.struct_role if ( structural_role == uproot.const.RNTupleFieldRole.LEAF @@ -606,7 +470,6 @@ def field_form(self, this_id, seen): and len(self._related_ids[tmp_id]) == 1 ): this_id = self._related_ids[tmp_id][0] - seen.add(this_id) # base case of recursion # n.b. the split may happen in column return self.col_form(this_id) @@ -614,7 +477,7 @@ def field_form(self, this_id, seen): if this_id in self._related_ids: # std::array has only one subfield child_id = self._related_ids[this_id][0] - inner = self.field_form(child_id, seen) + inner = self.field_form(child_id, keys) else: # std::bitset has no subfields, so we use it directly inner = self.col_form(this_id) @@ -625,8 +488,14 @@ def field_form(self, this_id, seen): keyname = f"vector-{this_id}" newids = self._related_ids.get(this_id, []) # go find N in the rest, N is the # of fields in vector - recordlist = [self.field_form(i, seen) for i in newids] - namelist = [field_records[i].field_name for i in newids] + recordlist = [] + namelist = [] + for i in newids: + if any(key.startswith(self.all_fields[i].path) for key in keys): + recordlist.append(self.field_form(i, keys)) + namelist.append(field_records[i].field_name) + if all(name == f"_{i}" for i, name in enumerate(namelist)): + namelist = None return ak.forms.RecordForm(recordlist, namelist, form_key="whatever") cfid = this_id if self.field_records[cfid].source_field_id is not None: @@ -644,25 +513,28 @@ def field_form(self, this_id, seen): # this only has one child if this_id in self._related_ids: child_id = self._related_ids[this_id][0] - inner = self.field_form(child_id, seen) + inner = self.field_form(child_id, keys) return ak.forms.ListOffsetForm("i64", inner, form_key=keyname) elif structural_role == uproot.const.RNTupleFieldRole.RECORD: newids = [] if this_id in self._related_ids: newids = 
self._related_ids[this_id] # go find N in the rest, N is the # of fields in struct - recordlist = [self.field_form(i, seen) for i in newids] - namelist = [field_records[i].field_name for i in newids] - # TODO: uncomment this once tuples are fixed - # if all(name.startswith("_") for name in namelist): - # namelist = None + recordlist = [] + namelist = [] + for i in newids: + if any(key.startswith(self.all_fields[i].path) for key in keys): + recordlist.append(self.field_form(i, keys)) + namelist.append(field_records[i].field_name) + if all(name == f"_{i}" for i, name in enumerate(namelist)): + namelist = None return ak.forms.RecordForm(recordlist, namelist, form_key="whatever") elif structural_role == uproot.const.RNTupleFieldRole.VARIANT: keyname = self.col_form(this_id) newids = [] if this_id in self._related_ids: newids = self._related_ids[this_id] - recordlist = [self.field_form(i, seen) for i in newids] + recordlist = [self.field_form(i, keys) for i in newids] inner = ak.forms.UnionForm( "i8", "i64", recordlist, form_key=keyname + "-union" ) @@ -675,23 +547,18 @@ def field_form(self, this_id, seen): # everything should recurse above this branch raise AssertionError("this should be unreachable") - def to_akform(self): - ak = uproot.extras.awkward() - - field_records = self.field_records - recordlist = [] - topnames = self.keys() - seen = set() - for i in range(len(field_records)): - if i not in seen: - ff = self.field_form(i, seen) - if not field_records[i].field_name.startswith("_"): - recordlist.append(ff) - - form = ak.forms.RecordForm(recordlist, topnames, form_key="toplevel") - return form - def read_pagedesc(self, destination, desc, dtype_str, dtype, nbits, split): + """ + Args: + destination (numpy.ndarray): The array to fill. + desc (:doc:`uproot.models.RNTuple.MetaData`): The page description. + dtype_str (str): The data type as a string. + dtype (numpy.dtype): The data type. + nbits (int): The number of bits. + split (bool): Whether the data is split. 
+ + Fills the destination array with the data from the page. + """ loc = desc.locator context = {} # bool in RNTuple is always stored as bits @@ -772,23 +639,33 @@ def read_pagedesc(self, destination, desc, dtype_str, dtype, nbits, split): def read_col_pages( self, ncol, cluster_range, dtype_byte, pad_missing_element=False ): + """ + Args: + ncol (int): The column id. + cluster_range (range): The range of cluster indices. + dtype_byte (int): The data type. + pad_missing_element (bool): Whether to pad the missing elements. + + Returns a numpy array with the data from the column. + """ arrays = [self.read_col_page(ncol, i) for i in cluster_range] - # Check if column stores offset values for jagged arrays (splitindex64) (applies to cardinality cols too): + # Check if column stores offset values if dtype_byte in uproot.const.rntuple_index_types: # Extract the last offset values: last_elements = [ (arr[-1] if len(arr) > 0 else numpy.zeros((), dtype=arr.dtype)) for arr in arrays[:-1] ] # First value always zero, therefore skip first arr. - # Compute cumulative sum using itertools.accumulate: last_offsets = numpy.cumsum(last_elements) - # Add the offsets to each array for i in range(1, len(arrays)): arrays[i] += last_offsets[i - 1] res = numpy.concatenate(arrays, axis=0) + # No longer needed; free memory + del arrays + dtype_byte = self.column_records[ncol].type if dtype_byte in uproot.const.rntuple_index_types: res = numpy.insert(res, 0, 0) # for offsets @@ -799,7 +676,14 @@ def read_col_pages( return res def read_col_page(self, ncol, cluster_i): - linklist = self.ntuple.page_link_list[cluster_i] + """ + Args: + ncol (int): The column id. + cluster_i (int): The cluster index. + + Returns a numpy array with the data from the column. 
+ """ + linklist = self._ntuple.page_link_list[cluster_i] # Check if the column is suppressed and pick the non-suppressed one if so if ncol < len(linklist) and linklist[ncol].suppressed: rel_crs = self._column_records_dict[self.column_records[ncol].field_id] @@ -853,32 +737,6 @@ def read_col_page(self, ncol, cluster_i): res = res.astype(numpy.float32) return res - def arrays( - self, - filter_name="*", - filter_typename=None, - entry_start=0, - entry_stop=None, - decompression_executor=None, - array_cache=None, - ): - return _arrays( - self, - filter_name=filter_name, - filter_typename=filter_typename, - entry_start=entry_start, - entry_stop=entry_stop, - decompression_executor=decompression_executor, - array_cache=array_cache, - ) - - def iterate(self, filter_name="*", *args, step_size="100 MB", **kwargs): - step_size = _regularize_step_size(self, step_size, filter_name) - for start in range(0, self.num_entries, step_size): - yield self.arrays( - *args, entry_start=start, entry_stop=start + step_size, **kwargs - ) - # Supporting function and classes def _split_switch_bits(content): @@ -887,18 +745,6 @@ def _split_switch_bits(content): return kindex, tags -def _recursive_find(form, res): - ak = uproot.extras.awkward() - - if hasattr(form, "form_key"): - res.append(form.form_key) - if hasattr(form, "contents"): - for c in form.contents: - _recursive_find(c, res) - if hasattr(form, "content") and issubclass(type(form.content), ak.forms.Form): - _recursive_find(form.content, res) - - # https://github.com/root-project/root/blob/8cd9eed6f3a32e55ef1f0f1df8e5462e753c735d/tree/ntuple/v7/doc/BinaryFormatSpecification.md#page-locations class PageDescription: def read(self, chunk, cursor, context): @@ -1147,7 +993,7 @@ def read(self, chunk, cursor, context): out.env_header["env_type_id"] == uproot.const.RNTupleEnvelopeType.HEADER ), f"env_type_id={out.env_header['env_type_id']}" out.feature_flag = cursor.field(chunk, _rntuple_feature_flag_format, context) - out.name, 
out.ntuple_description, out.writer_identifier = ( + out.ntuple_name, out.ntuple_description, out.writer_identifier = ( cursor.rntuple_string(chunk, context) for _ in range(3) ) @@ -1234,154 +1080,133 @@ def read(self, chunk, cursor, context): return out -class RNTupleField: - def __init__(self, index, ntuple): - self.index = index - self.ntuple = ntuple +class RField(uproot.behaviors.RNTuple.HasFields): + def __init__(self, fid, ntuple): + self._fid = fid + self._ntuple = ntuple self._length = None + self._fields = None + self._lookup = None + self._path = None - @property - def _keys(self): - keys = [] - for i, fr in enumerate(self.ntuple.field_records): - if i == self.index: - continue - if ( - fr.parent_field_id == self.index - and not fr.field_name.startswith("_") - and not fr.field_name.startswith(":_") - ): - keys.append(fr.field_name) - return keys - - # TODO: this is still missing a lot of functionality - def keys( - self, - *, - filter_name=None, - filter_typename=None, - filter_field=None, - recursive=False, - full_paths=True, - **_, # For compatibility reasons we just ignore other kwargs - ): - filter_name = uproot._util.regularize_filter(filter_name) - return [key for key in self._keys if filter_name(key)] + def __repr__(self): + if len(self) == 0: + return f"" + else: + return f"" @property def name(self): """ - Name of the ``Field``. + Name of the ``RField``. """ - return self.ntuple.field_records[self.index].field_name - - def __len__(self): - if self._length is None: - self._length = len(self.keys()) - return self._length + return self._ntuple.field_records[self._fid].field_name - def __repr__(self): - if len(self) == 0: - return f"" - else: - return f"" + @property + def typename(self): + """ + The C++ typename of the ``RField``. 
+ """ + return self._ntuple.field_records[self._fid].type_name @property - def _key_indices(self): - indices = [] - field_records = self.ntuple.field_records - for i, fr in enumerate(field_records): - if fr.parent_field_id == self.index and not fr.field_name.startswith("_"): - indices.append(i) - return indices + def parent(self): + """ + The parent of this ``RField``. + """ + rntuple = self.ntuple + parent_fid = rntuple.field_records[self._fid].parent_field_id + if parent_fid == self._fid: + return rntuple + return rntuple.all_fields[parent_fid] @property - def _key_to_index(self): - d = {} - field_records = self.ntuple.field_records - for i, fr in enumerate(field_records): - if fr.parent_field_id == self.index and not fr.field_name.startswith("_"): - d[fr.field_name] = i - return d - - def __getitem__(self, where): - # original_where = where - - if uproot._util.isint(where): - index = self._key_indices[where] - elif isinstance(where, str): - where = uproot._util.ensure_str(where) - index = self._key_to_index[where] + def index(self): + """ + Integer position of this ``RField`` in its parent's list of fields. + """ + for i, field in enumerate(self.parent.fields): + if field is self: + return i else: - raise TypeError(f"where must be an integer or a string, not {where!r}") - - # TODO: Implement path support + raise AssertionError - return RNTupleField(index, self.ntuple) + @property + def field_id(self): + """ + The field ID of this ``RField`` in the RNTuple. + """ + return self._fid - def to_akform(self): - ak = uproot.extras.awkward() + @property + def top_level(self): + """ + True if this is a top-level field, False otherwise. 
+ """ + return self.parent is self.ntuple - field_records = self.ntuple.field_records - recordlist = [] - topnames = self.keys() - if len(topnames) == 0: - topnames = [self.name] - recordlist.append(self.ntuple.field_form(self.index, set())) - else: - seen = set() - for i in range(len(field_records)): - if ( - i not in seen - and field_records[i].parent_field_id == self.index - and i != self.index - and not field_records[i].field_name.startswith("_") - and not field_records[i].field_name.startswith(":_") - ): - ff = self.ntuple.field_form(i, seen) - if field_records[i].type_name != "": - recordlist.append(ff) - - form = ak.forms.RecordForm(recordlist, topnames, form_key="toplevel") - return form - - def arrays( + def array( self, - filter_name="*", - filter_typename=None, - entry_start=0, + entry_start=None, entry_stop=None, - decompression_executor=None, - array_cache=None, + *, + decompression_executor=None, # TODO: Not implemented yet + array_cache="inherit", # TODO: Not implemented yet + library="ak", + ak_add_doc=False, + # For compatibility reasons we also accepts kwargs meant for TTrees + interpretation=None, + interpretation_executor=None, ): - return _arrays( - self, - filter_name=filter_name, - filter_typename=filter_typename, + """ + Args: + entry_start (None or int): The first entry to include. If None, start + at zero. If negative, count from the end, like a Python slice. + entry_stop (None or int): The first entry to exclude (i.e. one greater + than the last entry to include). If None, stop at + :ref:`uproot.behaviors.TTree.TTree.num_entries`. If negative, + count from the end, like a Python slice. + decompression_executor (None or Executor with a ``submit`` method): The + executor that is used to decompress ``RPages``; if None, the + file's :ref:`uproot.reading.ReadOnlyFile.decompression_executor` + is used. (Not implemented yet.) 
+ array_cache ("inherit", None, MutableMapping, or memory size): Cache of arrays; + if "inherit", use the file's cache; if None, do not use a cache; + if a memory size, create a new cache of this size. (Not implemented yet.) + library (str or :doc:`uproot.interpretation.library.Library`): The library + that is used to represent arrays. Options are ``"np"`` for NumPy, + ``"ak"`` for Awkward Array, and ``"pd"`` for Pandas. + ak_add_doc (bool | dict ): If True and ``library="ak"``, add the RField ``name`` + to the Awkward ``__doc__`` parameter of the array. + if dict = {key:value} and ``library="ak"``, add the RField ``value`` to the + Awkward ``key`` parameter of the array. + interpretation (None): This argument is not used and is only included for now + for compatibility with software that was used for :doc:`uproot.behaviors.TBranch.TBranch`. This argument should not be used + and will be removed in a future version. + interpretation_executor (None): This argument is not used and is only included for now + for compatibility with software that was used for :doc:`uproot.behaviors.TBranch.TBranch`. This argument should not be used + and will be removed in a future version. + + Returns the ``RField`` data as an array. + + For example: + + .. code-block:: python + + >>> field = ntuple["my_field"] + >>> array = field.array() + >>> array + + + See also :ref:`uproot.behaviors.RNTuple.HasFields.arrays` to read + multiple ``RFields`` into a group of arrays or an array-group. 
+ """ + return self.arrays( entry_start=entry_start, entry_stop=entry_stop, - decompression_executor=decompression_executor, - array_cache=array_cache, - ) - - def array(self, **kwargs): - if len(self.keys()) == 0: - return self.arrays(**kwargs)[self.name] - return self.arrays(**kwargs) - - def __array__(self, *args, **kwargs): - out = self.array() - if args == () and kwargs == {}: - return out - else: - return numpy.array(out, *args, **kwargs) - - def iterate(self, filter_name="*", *args, step_size="100 MB", **kwargs): - step_size = _regularize_step_size(self, step_size, filter_name) - for start in range(0, self.ntuple.num_entries, step_size): - yield self.array( - *args, entry_start=start, entry_stop=start + step_size, **kwargs - ) + library=library, + ak_add_doc=ak_add_doc, + )[self.name] uproot.classes["ROOT::RNTuple"] = Model_ROOT_3a3a_RNTuple diff --git a/tests/test_0630_rntuple_basics.py b/tests/test_0630_rntuple_basics.py index 88d96a9bc..1a6e4c16e 100644 --- a/tests/test_0630_rntuple_basics.py +++ b/tests/test_0630_rntuple_basics.py @@ -47,4 +47,9 @@ def test_jagged(): ) with uproot.open(filename) as f: R = f["ntuple"] - assert R.keys() == ["one_integers", "two_v_floats", "three_LV", "four_v_LVs"] + assert R.keys(recursive=False) == [ + "one_integers", + "two_v_floats", + "three_LV", + "four_v_LVs", + ] diff --git a/tests/test_0662_rntuple_stl_containers.py b/tests/test_0662_rntuple_stl_containers.py index 32ace7d11..1dc259e76 100644 --- a/tests/test_0662_rntuple_stl_containers.py +++ b/tests/test_0662_rntuple_stl_containers.py @@ -17,7 +17,7 @@ def test_rntuple_stl_containers(): filename = skhep_testdata.data_path("test_stl_containers_rntuple_v1-0-0-0.root") with uproot.open(filename) as f: R = f["ntuple"] - assert R.keys() == [ + assert R.keys(recursive=False) == [ "string", "vector_int32", "array_float", @@ -57,21 +57,23 @@ def test_rntuple_stl_containers(): assert r["vector_variant_int64_string"][1][1] == 2 assert 
type(r["vector_variant_int64_string"][1][1]) == numpy.int64 - assert ak.all(r["tuple_int32_string"]._0 == [1, 2, 3, 4, 5]) - assert ak.all( - r["tuple_int32_string"]._1 == ["one", "two", "three", "four", "five"] - ) - assert list(r["tuple_int32_string"][0].to_list().values()) == [1, "one"] - assert list(r["tuple_int32_string"][-1].to_list().values()) == [5, "five"] - assert ak.all(r["pair_int32_string"]._0 == [1, 2, 3, 4, 5]) - assert ak.all( - r["pair_int32_string"]._1 == ["one", "two", "three", "four", "five"] - ) + assert r["tuple_int32_string"].tolist() == [ + (1, "one"), + (2, "two"), + (3, "three"), + (4, "four"), + (5, "five"), + ] + assert r["pair_int32_string"].tolist() == [ + (1, "one"), + (2, "two"), + (3, "three"), + (4, "four"), + (5, "five"), + ] - assert r["vector_tuple_int32_string"][0]._0 == [1] - assert r["vector_tuple_int32_string"][0]._1 == ["one"] - assert ak.all(r["vector_tuple_int32_string"][1]._0 == [1, 2]) - assert ak.all(r["vector_tuple_int32_string"][1]._1 == ["one", "two"]) + assert r["vector_tuple_int32_string"][0].tolist() == [(1, "one")] + assert r["vector_tuple_int32_string"][1].tolist() == [(1, "one"), (2, "two")] assert ak.all(r["array_float"][0] == [1, 1, 1]) assert ak.all(r["array_float"][-1] == [5, 5, 5]) diff --git a/tests/test_1223_more_rntuple_types.py b/tests/test_1223_more_rntuple_types.py index ea2a986ae..95a791f57 100644 --- a/tests/test_1223_more_rntuple_types.py +++ b/tests/test_1223_more_rntuple_types.py @@ -75,7 +75,7 @@ def test_empty_struct(): a = obj.arrays("empty_struct") - assert a.empty_struct.tolist() == [{}, {}, {}] + assert a.empty_struct.tolist() == [(), (), ()] def test_invalid_variant(): diff --git a/tests/test_1250_rntuple_improvements.py b/tests/test_1250_rntuple_improvements.py index 32d39c6ea..9515c4a13 100644 --- a/tests/test_1250_rntuple_improvements.py +++ b/tests/test_1250_rntuple_improvements.py @@ -20,7 +20,7 @@ def test_field_class(): assert len(sub_sub_struct) == 2 v = sub_sub_struct["v"] - 
assert len(v) == 0 + assert len(v) == 1 def test_array_methods(): @@ -55,15 +55,15 @@ def test_iterate(): for i, arrays in enumerate(obj.iterate(step_size="10 kB")): if i == 0: - assert len(arrays) == 363 + assert len(arrays) == 188 expected_pt = [10.763696670532227, 15.736522674560547] expected_charge = [-1, -1] assert arrays["Muon_pt"][0].tolist() == expected_pt assert arrays["Muon_charge"][0].tolist() == expected_charge - elif i == 1: - assert len(arrays) == 363 - elif i == 2: - assert len(arrays) == 274 + elif i in (1, 2, 3, 4): + assert len(arrays) == 188 + elif i == 5: + assert len(arrays) == 60 else: assert False @@ -72,13 +72,13 @@ def test_iterate(): assert len(arrays) == 100 if i == 0: expected_pt = [10.763696670532227, 15.736522674560547] - assert arrays[0].tolist() == expected_pt + assert arrays["Muon_pt"][0].tolist() == expected_pt for i, arrays in enumerate(Muon_pt.iterate(step_size="5 kB")): if i == 0: assert len(arrays) == 611 expected_pt = [10.763696670532227, 15.736522674560547] - assert arrays[0].tolist() == expected_pt + assert arrays["Muon_pt"][0].tolist() == expected_pt elif i == 1: assert len(arrays) == 389 else: diff --git a/tests/test_1395_rntuple_writing_lists_and_structs.py b/tests/test_1395_rntuple_writing_lists_and_structs.py index 3e1b999c5..982dfe257 100644 --- a/tests/test_1395_rntuple_writing_lists_and_structs.py +++ b/tests/test_1395_rntuple_writing_lists_and_structs.py @@ -59,11 +59,7 @@ def test_writing_and_reading(tmp_path): arrays = obj.arrays() for f in data.fields: - if "tuple" in f: - # TODO: tuples are converted to records - [tuple(t[f] for f in t.fields) for t in arrays[f][:3]] == data[f].tolist() - [tuple(t[f] for f in t.fields) for t in arrays[f][3:]] == data[f].tolist() - elif f == "optional": + if f == "optional": assert [t[0] if len(t) > 0 else None for t in arrays[f][:3]] == data[ f ].tolist() diff --git a/tests/test_1406_improved_rntuple_methods.py b/tests/test_1406_improved_rntuple_methods.py new file mode 100644 
index 000000000..2b9dc32e6 --- /dev/null +++ b/tests/test_1406_improved_rntuple_methods.py @@ -0,0 +1,130 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/uproot5/blob/main/LICENSE + +import os + +import numpy +import pytest +import skhep_testdata + +import uproot + +ak = pytest.importorskip("awkward") + +data = ak.Array( + { + "struct1": [{"x": 1, "y": 2}, {"x": 3, "y": 4}], + "struct2": [{"x": 5, "y": 6, "z": 7}, {"x": 8, "y": 9, "z": 10}], + "struct3": [ + {"x": 11, "y": 12, "z": 13, "t": 14.0}, + {"x": 15, "y": 16, "z": 17, "t": 18.0}, + ], + "struct4": [ + { + "x": [{"up": 1, "down": 2}, {"up": 3, "down": 4}], + "y": [({"left": 5, "right": 6}, {"left": 7, "right": 8.0})], + }, + { + "x": [{"up": 9, "down": 10}, {"up": 11, "down": 12}], + "y": [({"left": 13, "right": 14}, {"left": 15, "right": 16.0})], + }, + ], + "struct5": [(1, 2, 3), (4, 5, 6)], + } +) + + +def test_keys(tmp_path): + filepath = os.path.join(tmp_path, "test.root") + + with uproot.recreate(filepath) as file: + obj = file.mkrntuple("ntuple", data) + + obj = uproot.open(filepath)["ntuple"] + + assert len(obj) == 5 + assert len(obj.keys(recursive=False)) == 5 + + assert len(obj.keys()) == 29 + assert len(obj.keys(full_paths=False)) == 29 + assert len(obj.keys(full_paths=False, ignore_duplicates=True)) == 16 + + assert len(obj.keys(filter_name="x")) == 4 + assert len(obj.keys(filter_name="z")) == 2 + assert len(obj.keys(filter_name="do*")) == 1 + + assert len(obj.keys(filter_typename="std::int*_t")) == 16 + + assert len(obj.keys(filter_field=lambda f: f.name == "up")) == 1 + + assert obj["struct1"].keys() == ["x", "y"] + assert len(obj["struct4"].keys()) == 12 + + +def test_getitem(tmp_path): + filepath = os.path.join(tmp_path, "test.root") + + with uproot.recreate(filepath) as file: + obj = file.mkrntuple("ntuple", data) + + obj = uproot.open(filepath)["ntuple"] + + assert obj["struct1"] is obj.fields[0] + assert obj["struct2"] is obj.fields[1] + assert obj["struct3"] is 
obj.fields[2] + assert obj["struct4"] is obj.fields[3] + assert obj["struct5"] is obj.fields[4] + + assert obj["struct1"]["x"] is obj.fields[0].fields[0] + assert obj["struct1"]["x"] is obj["struct1.x"] + assert obj["struct1"]["x"] is obj["struct1/x"] + assert obj["struct1"]["x"] is obj[r"struct1\x"] + + +def test_to_akform(tmp_path): + filepath = os.path.join(tmp_path, "test.root") + + with uproot.recreate(filepath) as file: + obj = file.mkrntuple("ntuple", data) + + obj = uproot.open(filepath)["ntuple"] + + akform = obj.to_akform() + assert akform == data.layout.form + + assert obj["struct1"].to_akform() == akform.select_columns("struct1") + assert obj["struct2"].to_akform() == akform.select_columns("struct2") + assert obj["struct3"].to_akform() == akform.select_columns("struct3") + assert obj["struct4"].to_akform() == akform.select_columns("struct4") + assert obj["struct5"].to_akform() == akform.select_columns("struct5") + + assert obj["struct1"].to_akform(filter_name="x") == akform.select_columns( + ["struct1.x"] + ) + assert obj["struct3"].to_akform(filter_typename="double") == akform.select_columns( + ["struct3.t"] + ) + + +def test_iterate_and_concatenate(tmp_path): + filepath1 = os.path.join(tmp_path, "test1.root") + filepath2 = os.path.join(tmp_path, "test2.root") + + with uproot.recreate(filepath1) as file: + file.mkrntuple("ntuple", data) + + with uproot.recreate(filepath2) as file: + file.mkrntuple("ntuple", data) + + total_iterations = 0 + for i, array in enumerate( + uproot.behaviors.RNTuple.iterate([f"{tmp_path}/test*.root:ntuple"], step_size=2) + ): + total_iterations += 1 + assert ak.array_equal(array, data) + + assert total_iterations == 2 + + array = uproot.behaviors.RNTuple.concatenate([f"{tmp_path}/test*.root:ntuple"]) + true_array = ak.concatenate([data, data], axis=0) + + assert ak.array_equal(array, true_array)