@@ -124,7 +124,7 @@ def __call__(self, sample: Mapping[str, Any]) -> Mapping[str, Any]:
124124def the_cauldron_dataset (
125125 model_transform : Transform ,
126126 * ,
127- subset : str ,
127+ subset : str = "orcvqa" ,
128128 source : str = "HuggingFaceM4/the_cauldron" ,
129129 column_map : Optional [Dict [str , str ]] = None ,
130130 new_system_prompt : Optional [str ] = None ,
@@ -138,8 +138,8 @@ def the_cauldron_dataset(
138138 `The Cauldron <https://huggingface.co/datasets/HuggingFaceM4/the_cauldron>`_
139139 from Hugging Face Datasets.
140140
141- The Cauldron consists of numerous datasets. You must specify one of the datasets
142- using the ``subset`` argument.
141+ The Cauldron consists of numerous datasets. You can specify one of the datasets
142+ using the ``subset`` argument. The default value is the ``orcvqa`` dataset.
143143
144144 The model transform is expected to be a callable that applies pre-processing steps specific
145145 to a model. For multimodal datasets, this is expected to be at minimum a tokenizer and
@@ -181,8 +181,8 @@ def __call__(self, sample: Mapping[str, Any]) -> Mapping[str, Any]:
181181 transforms on the keys. It should consist of at minimum two components: text tokenization (called
182182 on the "messages" field) and image transform (called on the "images" field). The keys returned by
183183 the model transform should be aligned with the expected inputs into the model.
184- subset (str): name of the subset of the dataset to load. See the `dataset card
185- <https://huggingface.co/datasets/HuggingFaceM4/the_cauldron>`_ for options.
184+ subset (str): name of the subset of the dataset to load. Default is ``orcvqa``; see the `dataset card
185+ <https://huggingface.co/datasets/HuggingFaceM4/the_cauldron>`_ for other options.
186186 source (str): path to dataset repository on Hugging Face. For local datasets,
187187 define source as the data file type (e.g. "json", "csv", "text") and pass
188188 in the filepath in ``data_files``. See `Hugging Face's
0 commit comments