@@ -32,7 +32,12 @@
     json_short_bytes,
     json_long_string,
     json_long_bytes,
-    eval_general,
+    random_state,
+    eval_io,
+    get_unique_filename,
+    get_random_string,
+    insert_lines_to_csv,
+    IO_OPS_DATA_DIR,
 )

 from modin.config import Engine, Backend

@@ -61,24 +66,8 @@
 SMALL_ROW_SIZE = 2000


-def eval_io(path, fn_name, comparator=df_equals, cast_to_str=False, *args, **kwargs):
-    def applyier(module, *args, **kwargs):
-        result = getattr(module, fn_name)(*args, **kwargs)
-        # There could be some missmatches in dtypes, so we're
-        # casting the whole frame to `str` before comparison.
-        # See issue #1931 for details.
-        if cast_to_str:
-            result = result.astype(str)
-        return result
-
-    eval_general(
-        pd,
-        pandas,
-        applyier,
-        path=path,
-        *args,
-        **kwargs,
-    )
+if not os.path.exists(IO_OPS_DATA_DIR):
+    os.mkdir(IO_OPS_DATA_DIR)


 @pytest.fixture
@@ -173,48 +162,131 @@ def teardown_test_file(test_path):
         os.remove(test_path)


-@pytest.fixture
-def make_csv_file(delimiter=",", compression="infer"):
-    """Pytest fixture factory that makes temp csv files for testing.
-
-    Yields:
-        Function that generates csv files
-    """
-    filenames = []
-
-    def _make_csv_file(
+def _make_csv_file(filenames):
+    def _csv_file_maker(
         filename=TEST_CSV_FILENAME,
         row_size=SMALL_ROW_SIZE,
         force=True,
-        delimiter=delimiter,
+        delimiter=",",
         encoding=None,
-        compression=compression,
+        compression="infer",
+        additional_col_values=None,
+        add_blank_lines=False,
+        add_bad_lines=False,
+        add_nan_lines=False,
+        thousands_separator=None,
+        decimal_separator=None,
+        lineterminator=None,
+        comment_col_char=None,
+        quoting=csv.QUOTE_MINIMAL,
+        quotechar='"',
+        doublequote=True,
+        escapechar=None,
+        line_terminator=os.linesep,
     ):
         if os.path.exists(filename) and not force:
             pass
         else:
             dates = pandas.date_range("2000", freq="h", periods=row_size)
-            df = pandas.DataFrame(
-                {
-                    "col1": np.arange(row_size),
-                    "col2": [str(x.date()) for x in dates],
-                    "col3": np.arange(row_size),
-                    "col4": [str(x.time()) for x in dates],
-                }
-            )
+            data = {
+                "col1": np.arange(row_size) * 10,
+                "col2": [str(x.date()) for x in dates],
+                "col3": np.arange(row_size) * 10,
+                "col4": [str(x.time()) for x in dates],
+                "col5": [get_random_string() for _ in range(row_size)],
+                "col6": random_state.uniform(low=0.0, high=10000.0, size=row_size),
+            }
+
+            if additional_col_values is not None:
+                assert isinstance(additional_col_values, (list, tuple))
+                data.update(
+                    {
+                        "col7": random_state.choice(
+                            additional_col_values, size=row_size
+                        ),
+                    }
+                )
+            df = pandas.DataFrame(data)
+            if add_nan_lines:
+                for i in range(0, row_size, row_size // (row_size // 10)):
+                    df.loc[i] = pandas.Series()
+            if comment_col_char:
+                char = comment_col_char if isinstance(comment_col_char, str) else "#"
+                df.insert(
+                    loc=0,
+                    column="col_with_comments",
+                    value=[char if (x + 2) == 0 else x for x in range(row_size)],
+                )
+
+            if thousands_separator:
+                for col_id in ["col1", "col3"]:
+                    df[col_id] = df[col_id].apply(
+                        lambda x: f"{x:,d}".replace(",", thousands_separator)
+                    )
+                df["col6"] = df["col6"].apply(
+                    lambda x: f"{x:,f}".replace(",", thousands_separator)
+                )
+
             if compression == "gzip":
                 filename = "{}.gz".format(filename)
             elif compression == "zip" or compression == "xz" or compression == "bz2":
                 filename = "{fname}.{comp}".format(fname=filename, comp=compression)
-
             df.to_csv(
-                filename, sep=delimiter, encoding=encoding, compression=compression
+                filename,
+                sep=delimiter,
+                encoding=encoding,
+                compression=compression,
+                index=False,
+                decimal=decimal_separator if decimal_separator else ".",
+                line_terminator=line_terminator,
+                quoting=quoting,
+                quotechar=quotechar,
+                doublequote=doublequote,
+                escapechar=escapechar,
             )
+            csv_reader_writer_params = {
+                "delimiter": delimiter,
+                "doublequote": doublequote,
+                "escapechar": escapechar,
+                "lineterminator": line_terminator,
+                "quotechar": quotechar,
+                "quoting": quoting,
+            }
+            if add_blank_lines:
+                insert_lines_to_csv(
+                    csv_name=filename,
+                    lines_positions=[
+                        x for x in range(5, row_size, row_size // (row_size // 10))
+                    ],
+                    lines_type="blank",
+                    encoding=encoding,
+                    **csv_reader_writer_params,
+                )
+            if add_bad_lines:
+                insert_lines_to_csv(
+                    csv_name=filename,
+                    lines_positions=[
+                        x for x in range(6, row_size, row_size // (row_size // 10))
+                    ],
+                    lines_type="bad",
+                    encoding=encoding,
+                    **csv_reader_writer_params,
+                )
         filenames.append(filename)
         return df

-    # Return function that generates csv files
-    yield _make_csv_file
+    return _csv_file_maker
+
+
+@pytest.fixture
+def make_csv_file():
+    """Pytest fixture factory that makes temp csv files for testing.
+    Yields:
+        Function that generates csv files
+    """
+    filenames = []
+
+    yield _make_csv_file(filenames)

     # Delete csv files that were created
     for filename in filenames:
@@ -423,6 +495,36 @@ def teardown_fwf_file():
             pass


+class TestReadCSV:
+    # delimiter tests
+    @pytest.mark.parametrize("sep", ["_", ",", ".", "\n"])
+    @pytest.mark.parametrize("delimiter", ["_", ",", ".", "\n"])
+    @pytest.mark.parametrize("decimal", [".", "_"])
+    @pytest.mark.parametrize("thousands", [None, ",", "_", " "])
+    def test_read_csv_delimiters(
+        self, make_csv_file, sep, delimiter, decimal, thousands
+    ):
+        kwargs = {
+            "delimiter": delimiter,
+            "sep": sep,
+            "decimal": decimal,
+            "thousands": thousands,
+        }
+        unique_filename = get_unique_filename("test_read_csv_delimiter", kwargs)
+        make_csv_file(
+            filename=unique_filename,
+            delimiter=delimiter,
+            thousands_separator=thousands,
+            decimal_separator=decimal,
+        )
+
+        eval_io(
+            filepath_or_buffer=unique_filename,
+            fn_name="read_csv",
+            **kwargs,
+        )
+
+
 def test_from_parquet(make_parquet_file):
     make_parquet_file(SMALL_ROW_SIZE)

@@ -1230,7 +1332,7 @@ def test_from_csv_parse_dates(make_csv_file):
 @pytest.mark.parametrize("skiprows", [4, 1, 500, None])
 def test_from_csv_newlines_in_quotes(nrows, skiprows):
     eval_io(
-        path="modin/pandas/test/data/newlines.csv",
+        filepath_or_buffer="modin/pandas/test/data/newlines.csv",
         fn_name="read_csv",
         nrows=nrows,
         skiprows=skiprows,
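
For illustration, a hypothetical companion test (not part of this diff) showing how the fixture's new add_blank_lines/add_bad_lines knobs compose with eval_io; the test name, the empty kwargs dict, and the bad-lines flags are assumptions:

def test_read_csv_dirty_lines(make_csv_file):
    # Hypothetical usage sketch, for illustration only.
    unique_filename = get_unique_filename("test_read_csv_dirty_lines", {})
    make_csv_file(
        filename=unique_filename,
        add_blank_lines=True,  # insert_lines_to_csv injects blank rows
        add_bad_lines=True,  # and, presumably, malformed rows (lines_type="bad")
    )

    eval_io(
        filepath_or_buffer=unique_filename,
        fn_name="read_csv",
        # Assumption: both readers skip the injected malformed rows.
        error_bad_lines=False,
        warn_bad_lines=False,
    )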