
Commit 351ff81

correctly parse COMMENT statement after schema & table
1 parent 60f586d commit 351ff81

14 files changed (+176, -20 lines)


.flake8

Lines changed: 1 addition & 1 deletion

@@ -1,5 +1,5 @@
 [flake8]
-exclude = .github,.git,__pycache__,docs/source/conf.py,old,build,dist,simple_ddl_parser/parsetab.py,./test.py,simple_ddl_parser/test.py
+exclude = .github,.git,__pycache__,docs/source/conf.py,old,build,dist,tests/,simple_ddl_parser/parsetab.py,./test.py,simple_ddl_parser/test.py
 max-complexity = 10
 max-line-length = 120
 ignore = W503, E999

CHANGELOG.txt

Lines changed: 11 additions & 0 deletions

@@ -1,3 +1,14 @@
+**v0.26.2**
+
+Fixes:
+1. Fixed a major bug that caused incorrect parsing of lines containing 'USE' & 'GO' strings.
+2. Fixed parsing of CREATE SCHEMA for Snowflake & Oracle DDLs
+
+Improvements:
+1. Added COMMENT statement for CREATE TABLE ddl (for Snowflake dialect support)
+2. Added COMMENT statement for CREATE SCHEMA ddl (for Snowflake dialect support)
+
+
 **v0.26.1**

 Fixes:

README.md

Lines changed: 1 addition & 0 deletions

@@ -414,6 +414,7 @@ In output you will have names like 'dbo' and 'TO_Requests', not '[dbo]' and '[TO
 - CREATE .. CLONE statements for table, database and schema
 - CREATE TABLE .. CLUSTER BY ..
 - CONSTRAINT .. [NOT] ENFORCED
+- COMMENT = in CREATE TABLE & CREATE SCHEMA statements

 ### BigQuery
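
The README bullet added above is the user-facing side of the grammar changes in this commit. A minimal sketch of how it could be exercised through the library's public API (the DDL text and object names are illustrative, not taken from the repo's tests, and the expected "comment" key is inferred from the changelog):

```python
from simple_ddl_parser import DDLParser

ddl = """
CREATE SCHEMA my_schema COMMENT = 'schema level comment';

CREATE TABLE my_schema.my_table (
    id INT
) COMMENT = 'table level comment';
"""

# output_mode="snowflake" routes the result through the Snowflake dialect,
# where the new option_comment rules live.
result = DDLParser(ddl).run(output_mode="snowflake")

# After this change the parsed schema and table dicts are expected to carry
# a "comment" key alongside the usual fields.
print(result)
```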

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "simple-ddl-parser"
-version = "0.26.1"
+version = "0.26.2"
 description = "Simple DDL Parser to parse SQL & dialects like HQL, TSQL (MSSQL), Oracle, AWS Redshift, Snowflake, MySQL, PostgreSQL, etc ddl files to json/python dict with full information about columns: types, defaults, primary keys, etc.; sequences, alters, custom types & other entities from ddl."
 authors = ["Iuliia Volkova <[email protected]>"]
 license = "MIT"

simple_ddl_parser/ddl_parser.py

Lines changed: 1 addition & 0 deletions

@@ -203,6 +203,7 @@ def set_lexx_tags(self, t: LexToken):

     def set_last_token(self, t: LexToken):
         self.lexer.last_token = t.type
+
         return t

     def p_id(self, p):

simple_ddl_parser/dialects/snowflake.py

Lines changed: 18 additions & 0 deletions

@@ -14,3 +14,21 @@ def p_expression_cluster_by(self, p):
         p[0] = p[1]
         p_list = remove_par(list(p))
         p[0]["cluster_by"] = p_list[-1]
+
+    def p_table_comment(self, p):
+        """expr : expr option_comment
+        """
+        p[0] = p[1]
+        if p[2]:
+            p[0].update(p[2])
+
+    def p_option_comment(self, p):
+        """option_comment : ID STRING
+        | ID DQ_STRING
+        | COMMENT ID STRING
+        | COMMENT ID DQ_STRING
+        """
+        p_list = remove_par(list(p))
+        print(p_list)
+        if "comment" in p[1].lower():
+            p[0] = {"comment": p_list[-1]}
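
A hedged illustration (not part of the diff) of how the two new rules cooperate: p_option_comment reduces a COMMENT clause to a small dict, but only when the first matched token actually spells "comment" (the p[1].lower() check), and p_table_comment then merges that dict into the expression already built for the statement:

```python
# Illustrative values only; in the real parser these come from PLY's symbol stack.
expr = {"table_name": "my_table", "columns": [{"name": "id", "type": "INT"}]}  # p[1]
option_comment = {"comment": "'table level comment'"}                          # p[2]

# p_table_comment in effect does: p[0] = p[1]; if p[2]: p[0].update(p[2])
if option_comment:
    expr.update(option_comment)

print(expr["comment"])  # 'table level comment'
```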

simple_ddl_parser/dialects/sql.py

Lines changed: 8 additions & 0 deletions

@@ -416,6 +416,7 @@ def set_auth_property_in_schema(self, p: List, p_list: List) -> None:
     def p_c_schema(self, p: List) -> None:
         """c_schema : CREATE SCHEMA
         | CREATE ID SCHEMA"""
+
         if len(p) == 4:
             p[0] = {"remote": True}

@@ -424,16 +425,23 @@ def p_create_schema(self, p: List) -> None:
         | c_schema id id id
         | c_schema id
         | c_schema id DOT id
+        | c_schema id option_comment
+        | c_schema id DOT id option_comment
         | c_schema IF NOT EXISTS id
         | c_schema IF NOT EXISTS id DOT id
         | create_schema id id id
         | create_schema id id STRING
         | create_schema options
         """
         p_list = list(p)
+
         p[0] = {}
         auth_index = None

+        if "comment" in p_list[-1]:
+            p[0].update(p_list[-1])
+            del p_list[-1]
+
         self.add_if_not_exists(p[0], p_list)
         if isinstance(p_list[1], dict):
             p[0] = p_list[1]

simple_ddl_parser/output/common.py

Lines changed: 10 additions & 7 deletions

@@ -145,19 +145,22 @@ def process_alter_and_index_result(

 def process_entities(tables_dict: Dict, table: Dict, output_mode: str) -> Dict:
     """process tables, types, sequence and etc. data"""
-    table_data = init_table_data()
-    table_data = d.populate_dialects_table_data(output_mode, table_data)
-    not_table = False
+    is_it_table = True
+
     if table.get("table_name"):
+        table_data = init_table_data()
+        table_data = d.populate_dialects_table_data(output_mode, table_data)
         table_data.update(table)
         table_data = set_unique_columns(table_data)
     else:
         table_data = table
-        not_table = True
-    if not not_table:
-        table_data = process_not_table_item(table_data, tables_dict)
+        is_it_table = False
+
+    if is_it_table:
+        table_data = process_is_it_table_item(table_data, tables_dict)

     table_data = normalize_ref_columns_in_final_output(table_data)
+
     d.dialects_clean_up(output_mode, table_data)
     return table_data

@@ -183,7 +186,7 @@ def result_format(
     return final_result


-def process_not_table_item(table_data: Dict, tables_dict: Dict) -> Dict:
+def process_is_it_table_item(table_data: Dict, tables_dict: Dict) -> Dict:
     if table_data.get("table_name"):
         tables_dict[(table_data["table_name"], table_data["schema"])] = table_data
     else:

simple_ddl_parser/output/dialects.py

Lines changed: 4 additions & 4 deletions

@@ -13,7 +13,6 @@
     "fields_terminated_by",
     "collection_items_terminated_by",
     "map_keys_terminated_by",
-    "comment",
 ]


@@ -145,16 +144,17 @@ def dialects_clean_up(output_mode: str, table_data: Dict) -> Dict:
     key_cleaning(table_data, output_mode)
     update_mappers_for_table_properties = {"bigquery": update_bigquery_output}
     update_table_prop = update_mappers_for_table_properties.get(output_mode)
-
     if update_table_prop:
         table_data = update_table_prop(table_data)

     if output_mode == "oracle":
-        for column in table_data["columns"]:
+        for column in table_data.get("columns", []):
             column = add_additional_oracle_keys_in_column(column)
     elif output_mode == "snowflake":
-        for column in table_data["columns"]:
+        # can be no columns if it is a create database or create schema
+        for column in table_data.get("columns", []):
             column = add_additional_snowflake_keys_in_column(column)
+
     elif output_mode == "redshift":
         table_data = process_redshift_dialect(table_data)
     return table_data
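
A short sketch of the failure mode that switching to table_data.get("columns", []) guards against: entities produced by CREATE SCHEMA or CREATE DATABASE carry no "columns" key, so direct indexing raised KeyError during the oracle/snowflake clean-up. The dict shape below is illustrative, not the parser's exact output:

```python
# Hypothetical parsed entity for a schema-only statement; note there is no "columns" key.
schema_entity = {"schema_name": "my_schema", "comment": "'schema level comment'"}

try:
    for column in schema_entity["columns"]:       # old behaviour
        pass
except KeyError:
    print("KeyError: 'columns'")                  # what used to happen

for column in schema_entity.get("columns", []):   # new behaviour: simply iterates nothing
    pass
```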

simple_ddl_parser/parser.py

Lines changed: 6 additions & 7 deletions

@@ -163,16 +163,16 @@ def check_new_statement_start(self, line: str) -> bool:
         return self.new_statement

     def check_line_on_skip_words(self) -> bool:
-        skip_line_words = ["USE", "GO"]
+        skip_regex = r"^(GO|USE)\b"

         self.skip = False
-        for word in skip_line_words:
-            if self.line.startswith(word):
-                self.skip = True
-                break
+
+        if re.match(skip_regex, self.line.upper()):
+            self.skip = True
         return self.skip

     def add_line_to_statement(self) -> str:
+
         if (
             self.line
             and not self.skip
@@ -206,15 +206,13 @@ def process_line(
         self.pre_process_line()

         self.line = self.line.strip().replace("\n", "").replace("\t", "")
-
         self.skip = self.check_line_on_skip_words()

         self.parse_set_statement()
         # to avoid issues when comma or parath are glued to column name
         self.check_new_statement_start(self.line)

         final_line = self.line.endswith(";") and not self.set_was_in_line
-
         self.add_line_to_statement()

         if final_line or self.new_statement:
@@ -237,6 +235,7 @@ def process_statement(self) -> None:
         self.statement = None

     def parse_statement(self) -> None:
+
         _parse_result = yacc.parse(self.statement)
         if _parse_result:
             self.tables.append(_parse_result)
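
The switch from startswith() to a word-boundary regex is what fixes the 'USE' & 'GO' bug listed in the changelog: only standalone USE/GO statements are skipped, while DDL lines that merely begin with those letters are kept. A standalone sketch comparing the two checks (the sample lines are made up):

```python
import re

skip_regex = r"^(GO|USE)\b"

lines = [
    "GO",                  # batch separator: should be skipped
    "USE my_database;",    # should be skipped
    "GOODS_COUNT INT,",    # real DDL that startswith("GO") wrongly skipped
    "USER_ID INT,",        # real DDL that startswith("USE") wrongly skipped
]

for line in lines:
    old_skip = any(line.startswith(word) for word in ["USE", "GO"])
    new_skip = bool(re.match(skip_regex, line.upper()))
    print(f"{line!r:20} old={old_skip} new={new_skip}")
```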
