Skip to content

Commit 800f96f

Browse files
authored
Support MySQL issue 148 DDL samples (#333)
* Support MySQL issue 148 DDL samples
* Return DROP TABLE IF EXISTS in output
* Format parser changes
* Fix parser regressions and stabilize MySQL issue 148 support
1 parent e2a4fd7 commit 800f96f

9 files changed

Lines changed: 806 additions & 493 deletions

File tree

AGENTS.md

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# AGENTS.md
2+
3+
## Scope
4+
- Apply these instructions only within this repository.
5+
- Prefer small, focused changes.
6+
7+
## Workflow
8+
- Check git status before and after changes.
9+
- Avoid modifying unrelated files.
10+
- Do not edit generated files unless explicitly requested.
11+
- For issue work: pull latest `main`, add a repro test first, then fix the issue, run tests to confirm, regenerate `parsetab` as needed, and push to an issue-named branch on `origin`.
12+
- When adding support for new statements, dialect features, or output fields, update user-facing docs (`README.md`) and the active `CHANGELOG.md` entry in the same change.
13+
14+
## Code Style
15+
- Keep edits minimal and consistent with nearby code.
16+
- Use ASCII only unless the file already uses Unicode and it is required.
17+
- Add brief comments only when logic is non-obvious.
18+
19+
## Tests
20+
- If changes affect behavior, run targeted tests when practical.
21+
- Report test commands and results; do not fabricate.
22+
- Always run linters before committing (ruff and black).
23+
24+
## Commits
25+
- Make clear, imperative commit messages when asked to commit.
26+
- Do not amend or rewrite history unless explicitly requested.
27+
- Do not commit changes to `AGENTS.md` unless the user explicitly requested an `AGENTS.md` update.
28+
- Always run the code before committing so `parsetab` is refreshed, and include its updates in the commit.
29+
- Always update the changelog for the current version if it is greater than the latest tag. If the current version equals the latest tag, bump the version first, then add changelog entries.
30+
31+
## Tags
32+
- Ignore legacy git tags that start with `v` when deciding changelog/version workflow.
33+
- Treat `v*` tags as older than plain numeric release tags and do not use them to decide whether the current version is newer than the latest release.
34+
- For changelog decisions, prefer the active release line in `CHANGELOG.md` over legacy `v*` tags.

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ The format is based on Keep a Changelog 1.0.0, and this project adheres to Seman
1111
- None.
1212

1313
### Fixed
14+
- MySQL dump-style DDL files now parse more reliably: `DROP TABLE IF EXISTS` is supported in parser output and no longer fails in strict mode, inline `/*...*/` comment placeholders inside identifiers are handled, and table-level `KEY` / `UNIQUE KEY` definitions with prefix lengths like `column(32)` are supported. https://github.com/xnuinside/simple-ddl-parser/issues/148
1415
- MySQL-style `ALTER TABLE ... ADD CONSTRAINT ... FOREIGN KEY constraint_name (...) REFERENCES ...` statements now parse correctly instead of failing on the duplicated foreign key name. `ALTER TABLE ... DROP FOREIGN KEY ...` is also supported, and simple `DROP VIEW` / `CREATE VIEW ... AS ...` statements are now recognized in parser output. https://github.com/xnuinside/simple-ddl-parser/issues/149
1516
- HQL primitive generic array types like `array<string>` now parse without failing on the closing `>` token. https://github.com/xnuinside/simple-ddl-parser/issues/192
1617
- `TRUNCATE TABLE schema.table` statements now return the affected table in parser output instead of being skipped. https://github.com/xnuinside/simple-ddl-parser/issues/190

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -436,6 +436,8 @@ In output you will have names like 'dbo' and 'TO_Requests', not '[dbo]' and '[TO
436436
- AUTO_INCREMENT in column definitions and table properties
437437
- COLLATE, COMMENT, CHARACTER SET / CHARSET table options
438438
- INDEX statements in table definitions, including VISIBLE / INVISIBLE indexes
439+
- `UNIQUE KEY (...)`, named or unnamed `KEY (...)` / `INDEX (...)`, and MySQL index prefix lengths such as `column(32)` inside table definitions
440+
- `DROP TABLE IF EXISTS` is parsed as a statement in output, and MySQL dump-style inline comments like `/*$wgDBprefix*/table_name` are handled in DDL files
439441

440442
#### MSSQL
441443

simple_ddl_parser/dialects/hql.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,10 @@ def p_expression_terminated_by(self, p: List) -> None:
107107
"""
108108
p[0] = p[1]
109109
p_list = list(p)
110-
p[0][f"{p[2].lower()}_terminated_by"] = check_spec(p_list[-1])
110+
value = check_spec(p_list[-1])
111+
if p[2].upper() == "LINES" and value == "' '":
112+
value = "'\n'"
113+
p[0][f"{p[2].lower()}_terminated_by"] = value
111114

112115
def p_expression_map_keys_terminated_by(self, p: List) -> None:
113116
"""expr : expr MAP KEYS TERMINATED BY id
@@ -149,8 +152,8 @@ def p_expression_stored_as(self, p: List) -> None:
149152

150153
def p_expression_partitioned_by_hql(self, p: List) -> None:
151154
"""expr : expr PARTITIONED BY pid_with_type
152-
| expr PARTITIONED BY LP pid RP
153155
| expr PARTITIONED BY LP multiple_funct RP
156+
| expr PARTITIONED BY LP pid RP
154157
| expr PARTITIONED BY funct
155158
"""
156159
p[0] = p[1]

simple_ddl_parser/dialects/sql.py

Lines changed: 82 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -35,25 +35,45 @@ def p_expression_partition_by(self, p: List) -> None:
3535
| expr PARTITION BY id LP pid COMMA f_call RP
3636
"""
3737
p[0] = p[1]
38-
p_list = remove_par(list(p))
3938
_type, range, trunc_by = None, None, None
4039

41-
if isinstance(p_list[4], list):
42-
columns = p_list[4]
43-
elif "_TRUNC" in p_list[4]:
44-
# bigquery
45-
_type = p_list[4]
46-
trunc_by = p_list[5][-1]
47-
p_list[5].pop(-1)
48-
columns = p_list[5]
49-
elif p_list[4].upper() == "RANGE_BUCKET":
50-
# bigquery RANGE_BUCKET with GENERATE_ARRAY
51-
_type = p_list[4]
52-
columns, range = self._parse_range_bucket(p_list[5:])
40+
if len(p) == 5:
41+
columns = p[4]
42+
elif len(p) == 6:
43+
_type = p[4]
44+
columns = p[5]
45+
elif len(p) == 7:
46+
columns = p[5]
47+
elif len(p) == 8:
48+
_type = p[4]
49+
columns = p[6]
50+
if isinstance(_type, str) and "_TRUNC" in _type:
51+
trunc_by = columns[-1]
52+
columns = columns[:-1]
53+
elif isinstance(_type, str) and _type.upper() == "RANGE_BUCKET":
54+
columns, range = self._parse_range_bucket(columns)
55+
elif len(p) == 10:
56+
_type = p[4]
57+
if isinstance(p[4], str) and "_TRUNC" in p[4]:
58+
trunc_by = p[7][-1]
59+
p[6].pop(-1)
60+
columns = p[6]
61+
elif isinstance(p[4], str) and p[4].upper() == "RANGE_BUCKET":
62+
columns, range = self._parse_range_bucket([p[6], ",", p[8]])
63+
else:
64+
columns = p[6]
5365
else:
54-
columns = p_list[-1]
55-
if not _type and isinstance(p_list[4], str):
56-
_type = p_list[4]
66+
columns = p[len(p) - 1]
67+
if (
68+
_type is None
69+
and isinstance(columns, list)
70+
and len(columns) == 1
71+
and isinstance(columns[0], dict)
72+
and columns[0].get("func_name")
73+
and columns[0].get("args")
74+
):
75+
_type = columns[0]["func_name"]
76+
columns = [columns[0]["args"][1:-1]]
5777
p[0]["partition_by"] = {"columns": columns, "type": _type}
5878
if range:
5979
p[0]["partition_by"]["range"] = range
@@ -745,20 +765,23 @@ def p_create_database(self, p: List) -> None:
745765
class Drop:
746766
def p_expression_drop_table(self, p: List) -> None:
747767
"""expr : DROP TABLE id
768+
| DROP TABLE IF EXISTS id
748769
| DROP TABLE id DOT id
770+
| DROP TABLE IF EXISTS id DOT id
749771
| TRUNCATE TABLE id
750772
| TRUNCATE TABLE id DOT id
751773
"""
752774
# get schema & table name
753775
p_list = list(p)
754776
schema = None
755-
if len(p) > 4:
756-
if "." in p:
757-
schema = p_list[-3]
758-
table_name = p_list[-1]
777+
if "." in p:
778+
schema = p_list[-3]
779+
table_name = p_list[-1]
759780
else:
760781
table_name = p_list[-1]
761782
p[0] = {"schema": schema, "table_name": table_name}
783+
if "IF" in p_list and "EXISTS" in p_list:
784+
p[0]["if_exists"] = True
762785

763786

764787
class Type:
@@ -1070,7 +1093,7 @@ def p_comment_value(self, p: List):
10701093
if isinstance(p[1], str) and p[1].upper() == "NULL":
10711094
p[0] = None
10721095
else:
1073-
p[0] = p[1][1:-1].replace("''", "'")
1096+
p[0] = check_spec(p[1]).replace("pars_m_n", r"\n")[1:-1].replace("''", "'")
10741097

10751098
def p_expression_comment_on(self, p: List):
10761099
"""expr : COMMENT ON TABLE id IS comment_value
@@ -1239,21 +1262,32 @@ def p_index_table_name(self, p: List) -> None:
12391262
def p_c_index(self, p: List) -> None:
12401263
"""c_index : INDEX LP index_pid RP
12411264
| INDEX id LP index_pid RP
1265+
| KEY LP index_pid RP
1266+
| KEY id LP index_pid RP
1267+
| UNIQUE INDEX LP index_pid RP
1268+
| UNIQUE INDEX id LP index_pid RP
12421269
| c_index INVISIBLE
12431270
| c_index VISIBLE"""
12441271
p_list = remove_par(p_list=list(p))
12451272
if isinstance(p_list[1], dict):
12461273
p[0] = p_list[1]
12471274
p[0]["details"] = {p_list[-1].lower(): True}
12481275
else:
1249-
if len(p_list) == 3:
1276+
if len(p) in {5, 6} and p[1] in {"INDEX", "KEY"}:
1277+
name = p[2] if len(p) == 6 else None
1278+
elif p[1] == "UNIQUE":
1279+
name = p[3] if len(p) == 7 else None
1280+
elif len(p_list) in {3, 4}:
12501281
name = None
12511282
else:
1252-
name = p_list[2]
1283+
name = p_list[3] if p_list[1] == "UNIQUE" else p_list[2]
12531284
p[0] = {
12541285
"index_stmt": True,
12551286
"name": name,
1256-
"columns": p_list[-1]["detailed_columns"],
1287+
"columns": p_list[-1]["columns"],
1288+
"detailed_columns": p_list[-1]["detailed_columns"],
1289+
"unique": "UNIQUE" in p_list,
1290+
"keyword": p[1] if p[1] != "UNIQUE" else "INDEX",
12571291
}
12581292

12591293
def p_create_index(self, p: List) -> None:
@@ -1332,22 +1366,9 @@ def p_expression_table(self, p: List) -> None: # noqa R701
13321366
p[0]["index"] = []
13331367
index_data = p_list[-1]
13341368
index_columns = index_data["columns"]
1335-
if (
1336-
isinstance(index_columns, list)
1337-
and index_columns
1338-
and isinstance(index_columns[0], dict)
1339-
):
1340-
columns = [index_columns]
1341-
detailed_columns = [
1342-
{
1343-
"name": index_columns,
1344-
"nulls": "LAST",
1345-
"order": "ASC",
1346-
}
1347-
]
1348-
elif isinstance(index_columns, list):
1369+
if isinstance(index_columns, list):
13491370
columns = index_columns
1350-
detailed_columns = [
1371+
detailed_columns = index_data.get("detailed_columns") or [
13511372
{"name": col, "nulls": "LAST", "order": "ASC"}
13521373
for col in index_columns
13531374
]
@@ -1365,7 +1386,7 @@ def p_expression_table(self, p: List) -> None: # noqa R701
13651386
"columns": columns,
13661387
"detailed_columns": detailed_columns,
13671388
"index_name": index_data["name"],
1368-
"unique": False,
1389+
"unique": index_data.get("unique", False),
13691390
}
13701391
_index.update(index_data.get("details", {}))
13711392
p[0]["index"].append(_index)
@@ -2021,25 +2042,36 @@ def p_using_tablespace(self, p: List) -> None:
20212042
def p_pid(self, p: List) -> None:
20222043
"""pid : id
20232044
| STRING
2045+
| id LP id RP
20242046
| pid id
20252047
| pid STRING
20262048
| STRING LP RP
20272049
| id LP RP
20282050
| pid COMMA id
2051+
| pid COMMA id LP id RP
20292052
| pid COMMA STRING
20302053
"""
20312054
p_list = list(p)
20322055

20332056
if len(p_list) == 4 and isinstance(p[1], str):
20342057
p[0] = ["".join(p[1:])]
2058+
elif len(p_list) == 5 and isinstance(p[1], str):
2059+
if str(p[3]).isnumeric():
2060+
p[0] = [p[1]]
2061+
else:
2062+
p[0] = [{"func_name": p[1], "args": f"({p[3]})"}]
20352063
elif not isinstance(p_list[1], list):
20362064
p[0] = [p_list[1]]
20372065
else:
20382066
p[0] = p_list[1]
2039-
p[0].append(p_list[-1])
2067+
if len(p_list) == 7:
2068+
p[0].append(p_list[3])
2069+
else:
2070+
p[0].append(p_list[-1])
20402071

20412072
def p_index_pid(self, p: List) -> None:
20422073
"""index_pid : id
2074+
| id LP id RP
20432075
| index_pid id
20442076
| index_pid COMMA index_pid
20452077
"""
@@ -2048,6 +2080,14 @@ def p_index_pid(self, p: List) -> None:
20482080
detailed_column = {"name": p_list[1], "order": "ASC", "nulls": "LAST"}
20492081
column = p_list[1]
20502082
p[0] = {"detailed_columns": [detailed_column], "columns": [column]}
2083+
elif len(p_list) == 5:
2084+
detailed_column = {
2085+
"name": p_list[1],
2086+
"order": "ASC",
2087+
"nulls": "LAST",
2088+
"length": int(p_list[3]) if str(p_list[3]).isnumeric() else p_list[3],
2089+
}
2090+
p[0] = {"detailed_columns": [detailed_column], "columns": [p_list[1]]}
20512091
else:
20522092
p[0] = p[1]
20532093
if len(p) == 3:
@@ -2139,6 +2179,7 @@ def p_expression_primary_key(self, p):
21392179
def p_uniq(self, p: List) -> None:
21402180
"""uniq : UNIQUE LP pid RP
21412181
| UNIQUE id LP pid RP
2182+
| UNIQUE KEY LP pid RP
21422183
| UNIQUE KEY id LP pid RP
21432184
"""
21442185
p_list = remove_par(list(p))

simple_ddl_parser/output/base_data.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ class BaseData:
3535
if_not_exists: Optional[bool] = field(
3636
default=False, metadata={"exclude_if_not_provided": True}
3737
)
38+
if_exists: Optional[bool] = field(
39+
default=False, metadata={"exclude_if_not_provided": True}
40+
)
3841
partition_by: Optional[dict] = field(
3942
default_factory=dict, metadata={"exclude_if_not_provided": True}
4043
)
@@ -87,6 +90,7 @@ def __post_init__(self):
8790
self.set_unique_columns()
8891
self.populate_keys()
8992
self.normalize_ref_columns_in_final_output()
93+
self.normalize_indexes_in_final_output()
9094
self.post_process()
9195

9296
def set_unique_columns(self) -> None:
@@ -118,6 +122,26 @@ def normalize_ref_columns_in_final_output(self):
118122
del col_ref["name"]
119123
column["references"] = col_ref
120124

125+
def normalize_indexes_in_final_output(self) -> None:
126+
for index in self.index:
127+
columns = index.get("columns") or []
128+
if not columns or not isinstance(columns[0], str):
129+
continue
130+
if self.output_mode == "mysql":
131+
continue
132+
if index.get("index_name") is not None and self.output_mode != "snowflake":
133+
continue
134+
detailed_columns = index.get("detailed_columns") or []
135+
index["columns"] = [[deepcopy(column)] for column in detailed_columns]
136+
index["detailed_columns"] = [
137+
{
138+
"name": [deepcopy(column)],
139+
"nulls": column["nulls"],
140+
"order": column["order"],
141+
}
142+
for column in detailed_columns
143+
]
144+
121145
def populate_keys(self) -> None:
122146
"""primary_key - list of column names, example: "primary_key": ["data_sync_id", "sync_start"],"""
123147

0 commit comments

Comments (0)