Commit ed69517

TEST-#2288: Cover by tests delimiters parameters of read_csv (#2310)
Signed-off-by: Alexander Myskov <[email protected]>
1 parent: 5cabeb9

4 files changed: +337 -47 lines changed

.github/workflows/ci.yml (2 additions, 2 deletions)

@@ -433,7 +433,7 @@ jobs:
           conda info
           conda list
       - shell: bash -l {0}
-        run: python -m pytest --simulate-cloud=normal modin/pandas/test/test_io.py::test_from_csv
+        run: python -m pytest --simulate-cloud=normal modin/pandas/test/test_io.py::TestReadCSV
       - shell: bash -l {0}
         run: bash <(curl -s https://codecov.io/bash)

@@ -557,4 +557,4 @@ jobs:
           conda list
       - run: sudo apt update && sudo apt install -y libhdf5-dev
       - shell: bash -l {0}
-        run: python -m pytest modin/pandas/test/test_io.py::test_from_csv
+        run: python -m pytest modin/pandas/test/test_io.py::TestReadCSV

.github/workflows/push.yml (1 addition, 1 deletion)

@@ -324,4 +324,4 @@ jobs:
           conda list
       - run: sudo apt update && sudo apt install -y libhdf5-dev
       - shell: bash -l {0}
-        run: python -m pytest modin/pandas/test/test_io.py::test_from_csv
+        run: python -m pytest modin/pandas/test/test_io.py::TestReadCSV

modin/pandas/test/test_io.py (146 additions, 44 deletions)

@@ -32,7 +32,12 @@
     json_short_bytes,
     json_long_string,
     json_long_bytes,
-    eval_general,
+    random_state,
+    eval_io,
+    get_unique_filename,
+    get_random_string,
+    insert_lines_to_csv,
+    IO_OPS_DATA_DIR,
 )

 from modin.config import Engine, Backend

@@ -61,24 +66,8 @@
 SMALL_ROW_SIZE = 2000


-def eval_io(path, fn_name, comparator=df_equals, cast_to_str=False, *args, **kwargs):
-    def applyier(module, *args, **kwargs):
-        result = getattr(module, fn_name)(*args, **kwargs)
-        # There could be some missmatches in dtypes, so we're
-        # casting the whole frame to `str` before comparison.
-        # See issue #1931 for details.
-        if cast_to_str:
-            result = result.astype(str)
-        return result
-
-    eval_general(
-        pd,
-        pandas,
-        applyier,
-        path=path,
-        *args,
-        **kwargs,
-    )
+if not os.path.exists(IO_OPS_DATA_DIR):
+    os.mkdir(IO_OPS_DATA_DIR)


 @pytest.fixture
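
The local eval_io helper above is deleted in favor of an eval_io imported from the shared test utilities (see the new imports in the first hunk); the utils.py side of this commit is not shown on this page. As a rough, non-authoritative sketch, the relocated helper presumably keeps the shape of the deleted version, with the positional path argument dropped so that reader keywords such as filepath_or_buffer simply flow through **kwargs, matching the new call sites in this diff:

    import pandas
    import modin.pandas as pd

    # Hedged reconstruction, not the actual utils.py code from this commit.
    # df_equals and eval_general are the existing helpers in
    # modin/pandas/test/utils.py that the deleted in-file version already used.
    def eval_io(fn_name, comparator=df_equals, cast_to_str=False, *args, **kwargs):
        def applyier(module, *args, **kwargs):
            result = getattr(module, fn_name)(*args, **kwargs)
            if cast_to_str:
                # Cast to str to smooth over dtype mismatches (see issue #1931).
                result = result.astype(str)
            return result

        # Run the same read through Modin (pd) and pandas and compare the results.
        eval_general(pd, pandas, applyier, *args, **kwargs)
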
@@ -173,48 +162,131 @@ def teardown_test_file(test_path):
         os.remove(test_path)


-@pytest.fixture
-def make_csv_file(delimiter=",", compression="infer"):
-    """Pytest fixture factory that makes temp csv files for testing.
-
-    Yields:
-        Function that generates csv files
-    """
-    filenames = []
-
-    def _make_csv_file(
+def _make_csv_file(filenames):
+    def _csv_file_maker(
         filename=TEST_CSV_FILENAME,
         row_size=SMALL_ROW_SIZE,
         force=True,
-        delimiter=delimiter,
+        delimiter=",",
         encoding=None,
-        compression=compression,
+        compression="infer",
+        additional_col_values=None,
+        add_blank_lines=False,
+        add_bad_lines=False,
+        add_nan_lines=False,
+        thousands_separator=None,
+        decimal_separator=None,
+        lineterminator=None,
+        comment_col_char=None,
+        quoting=csv.QUOTE_MINIMAL,
+        quotechar='"',
+        doublequote=True,
+        escapechar=None,
+        line_terminator=os.linesep,
     ):
         if os.path.exists(filename) and not force:
             pass
         else:
             dates = pandas.date_range("2000", freq="h", periods=row_size)
-            df = pandas.DataFrame(
-                {
-                    "col1": np.arange(row_size),
-                    "col2": [str(x.date()) for x in dates],
-                    "col3": np.arange(row_size),
-                    "col4": [str(x.time()) for x in dates],
-                }
-            )
+            data = {
+                "col1": np.arange(row_size) * 10,
+                "col2": [str(x.date()) for x in dates],
+                "col3": np.arange(row_size) * 10,
+                "col4": [str(x.time()) for x in dates],
+                "col5": [get_random_string() for _ in range(row_size)],
+                "col6": random_state.uniform(low=0.0, high=10000.0, size=row_size),
+            }
+
+            if additional_col_values is not None:
+                assert isinstance(additional_col_values, (list, tuple))
+                data.update(
+                    {
+                        "col7": random_state.choice(
+                            additional_col_values, size=row_size
+                        ),
+                    }
+                )
+            df = pandas.DataFrame(data)
+            if add_nan_lines:
+                for i in range(0, row_size, row_size // (row_size // 10)):
+                    df.loc[i] = pandas.Series()
+            if comment_col_char:
+                char = comment_col_char if isinstance(comment_col_char, str) else "#"
+                df.insert(
+                    loc=0,
+                    column="col_with_comments",
+                    value=[char if (x + 2) == 0 else x for x in range(row_size)],
+                )
+
+            if thousands_separator:
+                for col_id in ["col1", "col3"]:
+                    df[col_id] = df[col_id].apply(
+                        lambda x: f"{x:,d}".replace(",", thousands_separator)
+                    )
+                df["col6"] = df["col6"].apply(
+                    lambda x: f"{x:,f}".replace(",", thousands_separator)
+                )
+
             if compression == "gzip":
                 filename = "{}.gz".format(filename)
             elif compression == "zip" or compression == "xz" or compression == "bz2":
                 filename = "{fname}.{comp}".format(fname=filename, comp=compression)
-
             df.to_csv(
-                filename, sep=delimiter, encoding=encoding, compression=compression
+                filename,
+                sep=delimiter,
+                encoding=encoding,
+                compression=compression,
+                index=False,
+                decimal=decimal_separator if decimal_separator else ".",
+                line_terminator=line_terminator,
+                quoting=quoting,
+                quotechar=quotechar,
+                doublequote=doublequote,
+                escapechar=escapechar,
             )
+            csv_reader_writer_params = {
+                "delimiter": delimiter,
+                "doublequote": doublequote,
+                "escapechar": escapechar,
+                "lineterminator": line_terminator,
+                "quotechar": quotechar,
+                "quoting": quoting,
+            }
+            if add_blank_lines:
+                insert_lines_to_csv(
+                    csv_name=filename,
+                    lines_positions=[
+                        x for x in range(5, row_size, row_size // (row_size // 10))
+                    ],
+                    lines_type="blank",
+                    encoding=encoding,
+                    **csv_reader_writer_params,
+                )
+            if add_bad_lines:
+                insert_lines_to_csv(
+                    csv_name=filename,
+                    lines_positions=[
+                        x for x in range(6, row_size, row_size // (row_size // 10))
+                    ],
+                    lines_type="bad",
+                    encoding=encoding,
+                    **csv_reader_writer_params,
+                )
             filenames.append(filename)
             return df

-    # Return function that generates csv files
-    yield _make_csv_file
+    return _csv_file_maker
+
+
+@pytest.fixture
+def make_csv_file():
+    """Pytest fixture factory that makes temp csv files for testing.
+    Yields:
+        Function that generates csv files
+    """
+    filenames = []
+
+    yield _make_csv_file(filenames)

     # Delete csv files that were created
     for filename in filenames:
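
The old make_csv_file fixture is split in two: a module-level _make_csv_file(filenames) factory that builds the CSV writer, and a thin make_csv_file fixture that only owns the cleanup list and yields the maker. The maker also gains knobs for thousands and decimal separators, quoting, comment columns, and deliberately blank, bad, or NaN lines. A purely illustrative sketch of how a test can drive those knobs, using only parameters visible in this hunk (the test and file names are made up, not part of the commit):

    # Illustrative only; not part of the commit.
    def test_example_messy_csv(make_csv_file):
        df = make_csv_file(
            filename="example_messy.csv",  # hypothetical path
            delimiter=";",                 # column separator written to the file
            add_blank_lines=True,          # blank rows injected via insert_lines_to_csv
            add_bad_lines=True,            # malformed rows injected the same way
            add_nan_lines=True,            # roughly every 10th row becomes a NaN row
            comment_col_char="#",          # adds a leading col_with_comments column
        )
        # The maker returns the pandas DataFrame it wrote; the fixture removes
        # every generated file in its teardown loop above.
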
@@ -423,6 +495,36 @@ def teardown_fwf_file():
             pass


+class TestReadCSV:
+    # delimiter tests
+    @pytest.mark.parametrize("sep", ["_", ",", ".", "\n"])
+    @pytest.mark.parametrize("delimiter", ["_", ",", ".", "\n"])
+    @pytest.mark.parametrize("decimal", [".", "_"])
+    @pytest.mark.parametrize("thousands", [None, ",", "_", " "])
+    def test_read_csv_delimiters(
+        self, make_csv_file, sep, delimiter, decimal, thousands
+    ):
+        kwargs = {
+            "delimiter": delimiter,
+            "sep": sep,
+            "decimal": decimal,
+            "thousands": thousands,
+        }
+        unique_filename = get_unique_filename("test_read_csv_delimiter", kwargs)
+        make_csv_file(
+            filename=unique_filename,
+            delimiter=delimiter,
+            thousands_separator=thousands,
+            decimal_separator=decimal,
+        )
+
+        eval_io(
+            filepath_or_buffer=unique_filename,
+            fn_name="read_csv",
+            **kwargs,
+        )
+
+
 def test_from_parquet(make_parquet_file):
     make_parquet_file(SMALL_ROW_SIZE)
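
Each TestReadCSV.test_read_csv_delimiters case is a straight Modin-versus-pandas comparison of read_csv on a file written with the same separators; with four values each for sep, delimiter, and thousands and two for decimal, the grid expands to 4 x 4 x 2 x 4 = 128 cases. Roughly, a single case reduces to the sketch below, where the tiny inline file and the commented-out comparator stand in for the fixture-generated CSV and the shared df_equals helper:

    # Illustrative sketch of the comparison eval_io performs for one combination.
    import pandas
    import modin.pandas as pd

    with open("example_delim.csv", "w") as f:  # hypothetical file name
        f.write("col1_col2\n1,000_2.5\n2,000_3.5\n")

    read_kwargs = {"sep": "_", "decimal": ".", "thousands": ","}
    modin_df = pd.read_csv("example_delim.csv", **read_kwargs)
    pandas_df = pandas.read_csv("example_delim.csv", **read_kwargs)
    # df_equals(modin_df, pandas_df)  # both engines must parse the file identically

Since delimiter is an alias for sep in pandas, some of the 128 combinations are conflicting or redundant; eval_io builds on eval_general, which compares error behavior as well as results, so such cases still check that Modin mirrors whatever pandas does.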

@@ -1230,7 +1332,7 @@ def test_from_csv_parse_dates(make_csv_file):
 @pytest.mark.parametrize("skiprows", [4, 1, 500, None])
 def test_from_csv_newlines_in_quotes(nrows, skiprows):
     eval_io(
-        path="modin/pandas/test/data/newlines.csv",
+        filepath_or_buffer="modin/pandas/test/data/newlines.csv",
         fn_name="read_csv",
         nrows=nrows,
         skiprows=skiprows,
