Commit 90253ab

add several fixes for HQL & new statements support
1 parent 925f570 commit 90253ab

File tree

13 files changed, +489 -21 lines

CHANGELOG.txt

Lines changed: 22 additions & 0 deletions

@@ -1,3 +1,25 @@
+**v0.24.0**
+
+## Fixes:
+
+### HQL:
+
+1. More than 2 tblproperties are now parsed correctly https://github.com/xnuinside/simple-ddl-parser/pull/104
+
+
+### Common:
+
+2. 'set' in lower case is now also parsed correctly.
+3. Names like 'schema', 'database', 'table' can now be used as names in CREATE DATABASE | SCHEMA | TABLESPACE | DOMAIN | TYPE statements and after INDEX and CONSTRAINT.
+4. Creation of empty tables is also parsed correctly (like CREATE Table table;).
+
+## New Statements Support:
+
+### HQL:
+1. Added support for CLUSTERED BY - https://github.com/xnuinside/simple-ddl-parser/issues/103
+2. Added support for INTO ... BUCKETS
+3. CREATE REMOTE DATABASE | SCHEMA
+
 **v0.23.0**
 
 Big refactoring: less code complexity & increase code coverage. Radon added to pre-commit hooks.
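As a usage illustration for HQL fix 1, here is a hedged sketch of a statement with more than two TBLPROPERTIES entries. The table and property names are invented for the example, and the exact output layout is an expectation rather than verified output of this commit.

```python
from simple_ddl_parser import DDLParser

# HQL fix 1: a TBLPROPERTIES clause with more than two key/value pairs.
# Table and property names below are illustrative only.
ddl = """
CREATE TABLE sales (
    id INT,
    amount DECIMAL(10, 2)
)
TBLPROPERTIES (
    'parquet.compression'='SNAPPY',
    'transactional'='true',
    'external.table.purge'='true'
);
"""

result = DDLParser(ddl).run(output_mode="hql")
# The parsed table dict is expected to carry all three properties,
# not just the first two.
print(result)
```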

README.md

Lines changed: 23 additions & 0 deletions

@@ -308,6 +308,7 @@ You also can provide a path where you want to have a dumps with schema with argu
 - FIELDS TERMINATED BY, LINES TERMINATED BY, COLLECTION ITEMS TERMINATED BY, MAP KEYS TERMINATED BY
 - TBLPROPERTIES ('parquet.compression'='SNAPPY' & etc.)
 - SKEWED BY
+- CLUSTERED BY
 
 ### MySQL
 
@@ -388,6 +389,28 @@ for help with debugging & testing support for BigQuery dialect DDLs:
 
 
 ## Changelog
+**v0.24.0**
+
+## Fixes:
+
+### HQL:
+
+1. More than 2 tblproperties are now parsed correctly https://github.com/xnuinside/simple-ddl-parser/pull/104
+
+
+### Common:
+
+2. 'set' in lower case is now also parsed correctly.
+3. Names like 'schema', 'database', 'table' can now be used as names in CREATE DATABASE | SCHEMA | TABLESPACE | DOMAIN | TYPE statements and after INDEX and CONSTRAINT.
+4. Creation of empty tables is also parsed correctly (like CREATE Table table;).
+
+## New Statements Support:
+
+### HQL:
+1. Added support for CLUSTERED BY - https://github.com/xnuinside/simple-ddl-parser/issues/103
+2. Added support for INTO ... BUCKETS
+3. CREATE REMOTE DATABASE | SCHEMA
+
 **v0.23.0**
 
 Big refactoring: less code complexity & increase code coverage. Radon added to pre-commit hooks.

docs/README.rst

Lines changed: 32 additions & 1 deletion

@@ -25,7 +25,7 @@ Build with ply (lex & yacc in python). A lot of samples in 'tests/.
 Is it Stable?
 ^^^^^^^^^^^^^
 
-Yes, library already has about 7000+ downloads per day.
+Yes, library already has about 7000+ downloads per day - https://pypistats.org/packages/simple-ddl-parser.
 
 As maintainer, I guarantee that any backward incompatible changes will not be done in patch or minor version. Only additionals & new features.
 
@@ -342,6 +342,7 @@ HQL Dialect statements
 * FIELDS TERMINATED BY, LINES TERMINATED BY, COLLECTION ITEMS TERMINATED BY, MAP KEYS TERMINATED BY
 * TBLPROPERTIES ('parquet.compression'='SNAPPY' & etc.)
 * SKEWED BY
+* CLUSTERED BY
 
 MySQL
 ^^^^^
@@ -447,6 +448,36 @@ for help with debugging & testing support for BigQuery dialect DDLs:
 Changelog
 ---------
 
+**v0.24.0**
+
+Fixes:
+------
+
+HQL:
+^^^^
+
+
+#. More than 2 tblproperties are now parsed correctly https://github.com/xnuinside/simple-ddl-parser/pull/104
+
+Common:
+^^^^^^^
+
+
+#. 'set' in lower case is now also parsed correctly.
+#. Names like 'schema', 'database', 'table' can now be used as names in CREATE DATABASE | SCHEMA | TABLESPACE | DOMAIN | TYPE statements and after INDEX and CONSTRAINT.
+#. Creation of empty tables is also parsed correctly (like CREATE Table table;).
+
+New Statements Support:
+-----------------------
+
+HQL:
+^^^^
+
+
+#. Added support for CLUSTERED BY - https://github.com/xnuinside/simple-ddl-parser/issues/103
+#. Added support for INTO ... BUCKETS
+#. CREATE REMOTE DATABASE | SCHEMA
+
 **v0.23.0**
 
 Big refactoring: less code complexity & increase code coverage. Radon added to pre-commit hooks.

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "simple-ddl-parser"
-version = "0.23.0"
+version = "0.24.0"
 description = "Simple DDL Parser to parse SQL & dialects like HQL, TSQL (MSSQL), Oracle, AWS Redshift, Snowflake, MySQL, PostgreSQL, etc ddl files to json/python dict with full information about columns: types, defaults, primary keys, etc.; sequences, alters, custom types & other entities from ddl."
 authors = ["Iuliia Volkova <[email protected]>"]
 license = "MIT"

simple_ddl_parser/ddl_parser.py

Lines changed: 32 additions & 3 deletions

@@ -84,11 +84,13 @@ def set_lexer_tags(self, t):
     def t_STRING(self, t):
         r"((\')([a-zA-Z_,`0-9:><\=\-\+.\~\%$\!() {}\[\]\/\\\"\#\*&^|?;±§@~]*)(\')){1}"
         t.type = "STRING"
+        self.lexer.last_token = t.type
         return t
 
     def t_DQ_STRING(self, t):
         r"((\")([a-zA-Z_,`0-9:><\=\-\+.\~\%$\!() {}'\[\]\/\\\\#\*&^|?;±§@~]*)(\")){1}"
         t.type = "DQ_STRING"
+        self.lexer.last_token = t.type
         return t
 
     def is_token_column_name(self, t):
@@ -103,9 +105,31 @@ def is_token_column_name(self, t):
             and t.value.upper() not in tok.first_liners
         )
 
+    def is_creation_name(self, t):
+        """many of reserved words can be used as column name,
+        to decide is it a column name or not we need do some checks"""
+        skip_id_tokens = ["(", ")", ","]
+        return (
+            t.value not in skip_id_tokens
+            and t.value.upper() not in ["IF"]
+            and self.lexer.last_token
+            in [
+                "SCHEMA",
+                "TABLE",
+                "DATABASE",
+                "TYPE",
+                "DOMAIN",
+                "TABLESPACE",
+                "INDEX",
+                "CONSTRAINT",
+                "EXISTS",
+            ]
+        )
+
     def t_ID(self, t):
         r"([0-9]\.[0-9])\w|([a-zA-Z_,0-9:><\/\=\-\+\~\%$\*\()!{}\[\]\`\[\]]+)"
         t.type = tok.symbol_tokens.get(t.value, "ID")
+
         if t.type == "LP":
             self.lexer.lp_open += 1
             self.lexer.columns_def = True
@@ -114,17 +138,22 @@ def t_ID(self, t):
 
         elif self.is_token_column_name(t):
             t.type = "ID"
+        elif t.type != "DQ_STRING" and self.is_creation_name(t):
+            t.type = "ID"
         else:
             t = self.tokens_not_columns_names(t)
 
-        # capitalize tokens
-        if t.type != "ID" and t.type not in ["LT", "RT"]:
-            t.value = t.value.upper()
+        self.capitalize_tokens(t)
 
         if t.type == "COMMA" and self.lexer.lt_open:
             t.type = "COMMAT"
+
         return self.set_last_token(t)
 
+    def capitalize_tokens(self, t):
+        if t.type != "ID" and t.type not in ["LT", "RT"]:
+            t.value = t.value.upper()
+
     def set_last_token(self, t):
         self.lexer.last_token = t.type
         if t.type in ["RP", "LP"]:
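The new `is_creation_name` check lets reserved words that follow a creation keyword (or INDEX, CONSTRAINT, EXISTS) be lexed as plain identifiers. Below is a hedged sketch of the kind of input this unlocks; the statements are illustrative and the exact result dict is not asserted here.

```python
from simple_ddl_parser import DDLParser

# Reserved words used as object names right after a creation keyword,
# the situation is_creation_name() was added to recognize.
ddl = """
CREATE DOMAIN domain AS CHAR(10);
CREATE TYPE type AS ENUM ('a', 'b');
"""

result = DDLParser(ddl).run(group_by_type=True)
# Expectation: the 'domains' and 'types' groups contain entries whose names
# are the literal strings 'domain' and 'type' instead of a parse failure.
print(result)
```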

simple_ddl_parser/dialects/hql.py

Lines changed: 13 additions & 0 deletions

@@ -8,6 +8,19 @@ def p_expression_location(self, p):
         p_list = list(p)
         p[0]["location"] = p_list[-1]
 
+    def p_expression_clustered(self, p):
+        """expr : expr ID ON LP pid RP
+        | expr ID BY LP pid RP"""
+        p[0] = p[1]
+        p_list = list(p)
+        p[0][f"{p_list[2].lower()}_{p_list[3].lower()}"] = p_list[-2]
+
+    def p_expression_into_buckets(self, p):
+        """expr : expr INTO ID ID"""
+        p[0] = p[1]
+        p_list = list(p)
+        p[0][f"{p_list[2].lower()}_{p_list[-1].lower()}"] = p_list[-2]
+
     def p_row_format(self, p):
         """row_format : ROW FORMAT SERDE
         | ROW FORMAT
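The two rules above attach keys derived from the matched tokens: for `CLUSTERED BY (...)` something like `clustered_by`, and for `INTO n BUCKETS` something like `into_buckets`. A hedged usage sketch follows; the DDL is illustrative and the key names are inferred from the f-strings above, not verified output of this commit.

```python
from simple_ddl_parser import DDLParser

# New HQL clauses handled by p_expression_clustered and
# p_expression_into_buckets above.
ddl = """
CREATE TABLE user_events (
    user_id INT,
    event_name STRING
)
CLUSTERED BY (user_id) INTO 4 BUCKETS;
"""

result = DDLParser(ddl).run(output_mode="hql")
# Expectation based on the grammar rules: keys like "clustered_by"
# (the bucketing columns) and "into_buckets" (the bucket count).
print(result)
```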

simple_ddl_parser/dialects/sql.py

Lines changed: 25 additions & 15 deletions

@@ -26,22 +26,28 @@ def p_expression_partition_by(self, p: List) -> None:
 
 
 class Database:
+    def p_expression_create_database(self, p: List) -> None:
+        """expr : expr database_base"""
+        p[0] = p[1]
+        p_list = list(p)
+        p[0].update(p_list[-1])
+
     def p_database_base(self, p: List) -> None:
         """database_base : CREATE DATABASE id
+        | CREATE ID DATABASE id
         | database_base clone
         """
-        p[0] = p[1]
+        if isinstance(p[1], dict):
+            p[0] = p[1]
+        else:
+            p[0] = {}
         p_list = list(p)
         if isinstance(p_list[-1], dict):
             p[0].update(p_list[-1])
         else:
             p[0]["database_name"] = p_list[-1]
-
-    def p_expression_create_database(self, p: List) -> None:
-        """expr : expr database_base"""
-        p[0] = p[1]
-        p_list = list(p)
-        p[0].update(p_list[-1])
+        if len(p_list) == 5:
+            p[0][p[2].lower()] = True
 
 
 class TableSpaces:
@@ -372,9 +378,12 @@ def set_properties_for_schema_and_database(self, p: List, p_list: List) -> None:
         if not p[0].get("properties"):
             if len(p_list) == 3:
                 properties = p_list[-1]
-            else:
+            elif len(p_list) > 3:
                 properties = {p_list[-3]: p_list[-1]}
-            p[0]["properties"] = properties
+            else:
+                properties = {}
+            if properties:
+                p[0]["properties"] = properties
         else:
             p[0]["properties"].update({p_list[-3]: p_list[-1]})
 
@@ -385,8 +394,10 @@ def set_auth_property_in_schema(self, p: List, p_list: List) -> None:
         p[0] = {"schema_name": p_list[2], auth.lower(): p_list[-1]}
 
     def p_c_schema(self, p: List) -> None:
-        """c_schema : CREATE SCHEMA"""
-        pass
+        """c_schema : CREATE SCHEMA
+        | CREATE ID SCHEMA"""
+        if len(p) == 4:
+            p[0] = {"remote": True}
 
     def p_create_schema(self, p: List) -> None:
         """create_schema : c_schema id id
@@ -409,7 +420,7 @@ def p_create_schema(self, p: List) -> None:
             auth_index = p_list.index(auth)
             self.set_auth_property_in_schema(p, p_list)
 
-        elif isinstance(p_list[-1], str):
+        if isinstance(p_list[-1], str):
             if auth_index:
                 schema_name = p_list[auth_index - 1]
                 if schema_name is None:
@@ -427,7 +438,7 @@ def set_project_in_schema(data: Dict, p_list: List, auth_index: int) -> Dict:
         return data
 
     def p_create_database(self, p: List) -> None:
-        """create_database : CREATE DATABASE id
+        """create_database : database_base
         | create_database id id id
         | create_database id id STRING
         | create_database options
@@ -703,6 +714,7 @@ def extract_check_data(self, p, p_list):
     def p_expression_table(self, p: List) -> None:
         """expr : table_name defcolumn
         | table_name LP defcolumn
+        | table_name
         | expr COMMA defcolumn
         | expr COMMA
         | expr COMMA constraint
@@ -1142,7 +1154,6 @@ def p_expression_alter(self, p: List) -> None:
         | alter_default
         """
         p[0] = p[1]
-        print(p[0], "expe")
         if len(p) == 3:
             p[0].update(p[2])
 
@@ -1152,7 +1163,6 @@ def p_alter_unique(self, p: List) -> None:
         """
 
         p_list = remove_par(list(p))
-        print(p_list, "unique")
        p[0] = p[1]
        p[0]["unique"] = {"constraint_name": None, "columns": p_list[-1]}
        if "constraint" in p[2]:

simple_ddl_parser/parser.py

Lines changed: 1 addition & 1 deletion

@@ -126,7 +126,7 @@ def process_set(self) -> None:
         self.tables.append({"name": name, "value": value})
 
     def parse_set_statement(self):
-        if re.match(r"SET", self.line):
+        if re.match(r"SET", self.line.upper()):
            self.set_was_in_line = True
            if not self.set_line:
                self.set_line = self.line
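Because the line is upper-cased before the regex check, lowercase `set` statements are now recognized by `parse_set_statement`. A hedged sketch is below; the SET syntax shown is illustrative, and the `{"name": ..., "value": ...}` shape is taken from `process_set` in the same file.

```python
from simple_ddl_parser import DDLParser

# Lowercase 'set' now matches because parse_set_statement() upper-cases the
# line before re.match(r"SET", ...).
ddl = """
set names utf8;

CREATE TABLE events (id INT);
"""

result = DDLParser(ddl).run()
# Expectation: the set statement shows up as a {"name": ..., "value": ...}
# entry alongside the parsed 'events' table.
print(result)
```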

simple_ddl_parser/tokens.py

Lines changed: 1 addition & 0 deletions

@@ -66,6 +66,7 @@
     "PARTITION": "PARTITION",
     "BY": "BY",
     # hql
+    "INTO": "INTO",
     "STORED": "STORED",
     "LOCATION": "LOCATION",
     "ROW": "ROW",

tests/test_create_database.py

Lines changed: 20 additions & 0 deletions

@@ -26,3 +26,23 @@ def test_parse_properties_in_create_db():
         "ddl_properties": [],
     }
     assert expected == result
+
+
+def test_create_database_database():
+    expected = {
+        "databases": [{"database_name": "database"}],
+        "ddl_properties": [],
+        "domains": [],
+        "schemas": [{"schema_name": "SCHEMA"}],
+        "sequences": [],
+        "tables": [],
+        "types": [],
+    }
+
+    ddl = """
+
+    CREATE DATABASE database;
+    CREATE SCHEMA SCHEMA;
+    """
+    result = DDLParser(ddl).run(group_by_type=True, output_mode="hql")
+    assert expected == result
