@@ -32,7 +32,12 @@
     json_short_bytes,
     json_long_string,
     json_long_bytes,
-    eval_general,
+    random_state,
+    eval_io,
+    get_unique_filename,
+    get_random_string,
+    insert_lines_to_csv,
+    IO_OPS_DATA_DIR,
 )

 from modin.config import Engine, Backend

@@ -61,24 +66,8 @@
 SMALL_ROW_SIZE = 2000


-def eval_io(path, fn_name, comparator=df_equals, cast_to_str=False, *args, **kwargs):
-    def applyier(module, *args, **kwargs):
-        result = getattr(module, fn_name)(*args, **kwargs)
-        # There could be some missmatches in dtypes, so we're
-        # casting the whole frame to `str` before comparison.
-        # See issue #1931 for details.
-        if cast_to_str:
-            result = result.astype(str)
-        return result
-
-    eval_general(
-        pd,
-        pandas,
-        applyier,
-        path=path,
-        *args,
-        **kwargs,
-    )
+if not os.path.exists(IO_OPS_DATA_DIR):
+    os.mkdir(IO_OPS_DATA_DIR)


 @pytest.fixture
@@ -173,48 +162,131 @@ def teardown_test_file(test_path):
         os.remove(test_path)


-@pytest.fixture
-def make_csv_file(delimiter=",", compression="infer"):
-    """Pytest fixture factory that makes temp csv files for testing.
-
-    Yields:
-        Function that generates csv files
-    """
-    filenames = []
-
-    def _make_csv_file(
+def _make_csv_file(filenames):
+    def _csv_file_maker(
         filename=TEST_CSV_FILENAME,
         row_size=SMALL_ROW_SIZE,
         force=True,
-        delimiter=delimiter,
+        delimiter=",",
         encoding=None,
-        compression=compression,
+        compression="infer",
+        additional_col_values=None,
+        add_blank_lines=False,
+        add_bad_lines=False,
+        add_nan_lines=False,
+        thousands_separator=None,
+        decimal_separator=None,
+        lineterminator=None,
+        comment_col_char=None,
+        quoting=csv.QUOTE_MINIMAL,
+        quotechar='"',
+        doublequote=True,
+        escapechar=None,
+        line_terminator=os.linesep,
     ):
         if os.path.exists(filename) and not force:
             pass
         else:
             dates = pandas.date_range("2000", freq="h", periods=row_size)
-            df = pandas.DataFrame(
-                {
-                    "col1": np.arange(row_size),
-                    "col2": [str(x.date()) for x in dates],
-                    "col3": np.arange(row_size),
-                    "col4": [str(x.time()) for x in dates],
-                }
-            )
+            data = {
+                "col1": np.arange(row_size) * 10,
+                "col2": [str(x.date()) for x in dates],
+                "col3": np.arange(row_size) * 10,
+                "col4": [str(x.time()) for x in dates],
+                "col5": [get_random_string() for _ in range(row_size)],
+                "col6": random_state.uniform(low=0.0, high=10000.0, size=row_size),
+            }
+
+            if additional_col_values is not None:
+                assert isinstance(additional_col_values, (list, tuple))
+                data.update(
+                    {
+                        "col7": random_state.choice(
+                            additional_col_values, size=row_size
+                        ),
+                    }
+                )
+            df = pandas.DataFrame(data)
+            if add_nan_lines:
+                for i in range(0, row_size, row_size // (row_size // 10)):
+                    df.loc[i] = pandas.Series()
+            if comment_col_char:
+                char = comment_col_char if isinstance(comment_col_char, str) else "#"
+                df.insert(
+                    loc=0,
+                    column="col_with_comments",
+                    value=[char if (x + 2) == 0 else x for x in range(row_size)],
+                )
+
+            if thousands_separator:
+                for col_id in ["col1", "col3"]:
+                    df[col_id] = df[col_id].apply(
+                        lambda x: f"{x:,d}".replace(",", thousands_separator)
+                    )
+                df["col6"] = df["col6"].apply(
+                    lambda x: f"{x:,f}".replace(",", thousands_separator)
+                )
+
             if compression == "gzip":
                 filename = "{}.gz".format(filename)
             elif compression == "zip" or compression == "xz" or compression == "bz2":
                 filename = "{fname}.{comp}".format(fname=filename, comp=compression)
-
             df.to_csv(
-                filename, sep=delimiter, encoding=encoding, compression=compression
+                filename,
+                sep=delimiter,
+                encoding=encoding,
+                compression=compression,
+                index=False,
+                decimal=decimal_separator if decimal_separator else ".",
+                line_terminator=line_terminator,
+                quoting=quoting,
+                quotechar=quotechar,
+                doublequote=doublequote,
+                escapechar=escapechar,
             )
+            csv_reader_writer_params = {
+                "delimiter": delimiter,
+                "doublequote": doublequote,
+                "escapechar": escapechar,
+                "lineterminator": line_terminator,
+                "quotechar": quotechar,
+                "quoting": quoting,
+            }
+            if add_blank_lines:
+                insert_lines_to_csv(
+                    csv_name=filename,
+                    lines_positions=[
+                        x for x in range(5, row_size, row_size // (row_size // 10))
+                    ],
+                    lines_type="blank",
+                    encoding=encoding,
+                    **csv_reader_writer_params,
+                )
+            if add_bad_lines:
+                insert_lines_to_csv(
+                    csv_name=filename,
+                    lines_positions=[
+                        x for x in range(6, row_size, row_size // (row_size // 10))
+                    ],
+                    lines_type="bad",
+                    encoding=encoding,
+                    **csv_reader_writer_params,
+                )
         filenames.append(filename)
         return df

-    # Return function that generates csv files
-    yield _make_csv_file
+    return _csv_file_maker
+
+
+@pytest.fixture
+def make_csv_file():
+    """Pytest fixture factory that makes temp csv files for testing.
+    Yields:
+        Function that generates csv files
+    """
+    filenames = []
+
+    yield _make_csv_file(filenames)

     # Delete csv files that were created
     for filename in filenames:
@@ -423,6 +495,36 @@ def teardown_fwf_file():
             pass


+class TestReadCSV:
+    # delimiter tests
+    @pytest.mark.parametrize("sep", ["_", ",", ".", "\n"])
+    @pytest.mark.parametrize("delimiter", ["_", ",", ".", "\n"])
+    @pytest.mark.parametrize("decimal", [".", "_"])
+    @pytest.mark.parametrize("thousands", [None, ",", "_", " "])
+    def test_read_csv_delimiters(
+        self, make_csv_file, sep, delimiter, decimal, thousands
+    ):
+        kwargs = {
+            "delimiter": delimiter,
+            "sep": sep,
+            "decimal": decimal,
+            "thousands": thousands,
+        }
+        unique_filename = get_unique_filename("test_read_csv_delimiter", kwargs)
+        make_csv_file(
+            filename=unique_filename,
+            delimiter=delimiter,
+            thousands_separator=thousands,
+            decimal_separator=decimal,
+        )
+
+        eval_io(
+            filepath_or_buffer=unique_filename,
+            fn_name="read_csv",
+            **kwargs,
+        )
+
+
 def test_from_parquet(make_parquet_file):
     make_parquet_file(SMALL_ROW_SIZE)

@@ -1230,7 +1332,7 @@ def test_from_csv_parse_dates(make_csv_file):
 @pytest.mark.parametrize("skiprows", [4, 1, 500, None])
 def test_from_csv_newlines_in_quotes(nrows, skiprows):
     eval_io(
-        path="modin/pandas/test/data/newlines.csv",
+        filepath_or_buffer="modin/pandas/test/data/newlines.csv",
         fn_name="read_csv",
         nrows=nrows,
         skiprows=skiprows,
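
For illustration, a hypothetical companion test (not part of this diff) showing how the fixture's new add_blank_lines/add_bad_lines knobs compose with eval_io; the test name, the empty kwargs dict, and the bad-lines flags are assumptions:

def test_read_csv_dirty_lines(make_csv_file):
    # Hypothetical usage sketch, for illustration only.
    unique_filename = get_unique_filename("test_read_csv_dirty_lines", {})
    make_csv_file(
        filename=unique_filename,
        add_blank_lines=True,  # insert_lines_to_csv injects blank rows
        add_bad_lines=True,  # and, presumably, malformed rows (lines_type="bad")
    )

    eval_io(
        filepath_or_buffer=unique_filename,
        fn_name="read_csv",
        # Assumption: both readers skip the injected malformed rows.
        error_bad_lines=False,
        warn_bad_lines=False,
    )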