Skip to content

Commit 800f96f

Browse files
authored
Support MySQL issue 148 DDL samples (#333)
* Support MySQL issue 148 DDL samples
* Return DROP TABLE IF EXISTS in output
* Format parser changes
* Fix parser regressions and stabilize MySQL issue 148 support
1 parent e2a4fd7 commit 800f96f

9 files changed

Lines changed: 806 additions & 493 deletions

File tree

AGENTS.md

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# AGENTS.md
2+
3+
## Scope
4+
- Apply these instructions only within this repository.
5+
- Prefer small, focused changes.
6+
7+
## Workflow
8+
- Check git status before and after changes.
9+
- Avoid modifying unrelated files.
10+
- Do not edit generated files unless explicitly requested.
11+
- For issue work: pull latest `main`, add a repro test first, then fix the issue, run tests to confirm, regenerate `parsetab` as needed, and push to an issue-named branch on `origin`.
12+
- When adding support for new statements, dialect features, or output fields, update user-facing docs (`README.md`) and the active `CHANGELOG.md` entry in the same change.
13+
14+
## Code Style
15+
- Keep edits minimal and consistent with nearby code.
16+
- Use ASCII only unless the file already uses Unicode and it is required.
17+
- Add brief comments only when logic is non-obvious.
18+
19+
## Tests
20+
- If changes affect behavior, run targeted tests when practical.
21+
- Report test commands and results; do not fabricate.
22+
- Always run linters before committing (ruff and black).
23+
24+
## Commits
25+
- Make clear, imperative commit messages when asked to commit.
26+
- Do not amend or rewrite history unless explicitly requested.
27+
- Do not commit changes to `AGENTS.md` unless the user explicitly requested an `AGENTS.md` update.
28+
- Always run the code before committing so `parsetab` is refreshed, and include its updates in the commit.
29+
- Always update the changelog for the current version if it is greater than the latest tag. If the current version equals the latest tag, bump the version first, then add changelog entries.
30+
31+
## Tags
32+
- Ignore legacy git tags that start with `v` when deciding changelog/version workflow.
33+
- Treat `v*` tags as older than plain numeric release tags and do not use them to decide whether the current version is newer than the latest release.
34+
- For changelog decisions, prefer the active release line in `CHANGELOG.md` over legacy `v*` tags.

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ The format is based on Keep a Changelog 1.0.0, and this project adheres to Seman
1111
- None.
1212

1313
### Fixed
14+
- MySQL dump-style DDL files now parse more reliably: `DROP TABLE IF EXISTS` is supported in parser output and no longer fails in strict mode, inline `/*...*/` comment placeholders inside identifiers are handled, and table-level `KEY` / `UNIQUE KEY` definitions with prefix lengths like `column(32)` are supported. https://github.com/xnuinside/simple-ddl-parser/issues/148
1415
- MySQL-style `ALTER TABLE ... ADD CONSTRAINT ... FOREIGN KEY constraint_name (...) REFERENCES ...` statements now parse correctly instead of failing on the duplicated foreign key name. `ALTER TABLE ... DROP FOREIGN KEY ...` is also supported, and simple `DROP VIEW` / `CREATE VIEW ... AS ...` statements are now recognized in parser output. https://github.com/xnuinside/simple-ddl-parser/issues/149
1516
- HQL primitive generic array types like `array<string>` now parse without failing on the closing `>` token. https://github.com/xnuinside/simple-ddl-parser/issues/192
1617
- `TRUNCATE TABLE schema.table` statements now return the affected table in parser output instead of being skipped. https://github.com/xnuinside/simple-ddl-parser/issues/190

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -436,6 +436,8 @@ In output you will have names like 'dbo' and 'TO_Requests', not '[dbo]' and '[TO
436436
- AUTO_INCREMENT in column definitions and table properties
437437
- COLLATE, COMMENT, CHARACTER SET / CHARSET table options
438438
- INDEX statements in table definitions, including VISIBLE / INVISIBLE indexes
439+
- `UNIQUE KEY (...)`, named or unnamed `KEY (...)` / `INDEX (...)`, and MySQL index prefix lengths such as `column(32)` inside table definitions
440+
- `DROP TABLE IF EXISTS` is parsed as a statement in output, and MySQL dump-style inline comments like `/*$wgDBprefix*/table_name` are handled in DDL files
439441

440442
#### MSSQL
441443

simple_ddl_parser/dialects/hql.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,10 @@ def p_expression_terminated_by(self, p: List) -> None:
107107
"""
108108
p[0] = p[1]
109109
p_list = list(p)
110-
p[0][f"{p[2].lower()}_terminated_by"] = check_spec(p_list[-1])
110+
value = check_spec(p_list[-1])
111+
if p[2].upper() == "LINES" and value == "' '":
112+
value = "'\n'"
113+
p[0][f"{p[2].lower()}_terminated_by"] = value
111114

112115
def p_expression_map_keys_terminated_by(self, p: List) -> None:
113116
"""expr : expr MAP KEYS TERMINATED BY id
@@ -149,8 +152,8 @@ def p_expression_stored_as(self, p: List) -> None:
149152

150153
def p_expression_partitioned_by_hql(self, p: List) -> None:
151154
"""expr : expr PARTITIONED BY pid_with_type
152-
| expr PARTITIONED BY LP pid RP
153155
| expr PARTITIONED BY LP multiple_funct RP
156+
| expr PARTITIONED BY LP pid RP
154157
| expr PARTITIONED BY funct
155158
"""
156159
p[0] = p[1]

simple_ddl_parser/dialects/sql.py

Lines changed: 82 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -35,25 +35,45 @@ def p_expression_partition_by(self, p: List) -> None:
3535
| expr PARTITION BY id LP pid COMMA f_call RP
3636
"""
3737
p[0] = p[1]
38-
p_list = remove_par(list(p))
3938
_type, range, trunc_by = None, None, None
4039

41-
if isinstance(p_list[4], list):
42-
columns = p_list[4]
43-
elif "_TRUNC" in p_list[4]:
44-
# bigquery
45-
_type = p_list[4]
46-
trunc_by = p_list[5][-1]
47-
p_list[5].pop(-1)
48-
columns = p_list[5]
49-
elif p_list[4].upper() == "RANGE_BUCKET":
50-
# bigquery RANGE_BUCKET with GENERATE_ARRAY
51-
_type = p_list[4]
52-
columns, range = self._parse_range_bucket(p_list[5:])
40+
if len(p) == 5:
41+
columns = p[4]
42+
elif len(p) == 6:
43+
_type = p[4]
44+
columns = p[5]
45+
elif len(p) == 7:
46+
columns = p[5]
47+
elif len(p) == 8:
48+
_type = p[4]
49+
columns = p[6]
50+
if isinstance(_type, str) and "_TRUNC" in _type:
51+
trunc_by = columns[-1]
52+
columns = columns[:-1]
53+
elif isinstance(_type, str) and _type.upper() == "RANGE_BUCKET":
54+
columns, range = self._parse_range_bucket(columns)
55+
elif len(p) == 10:
56+
_type = p[4]
57+
if isinstance(p[4], str) and "_TRUNC" in p[4]:
58+
trunc_by = p[7][-1]
59+
p[6].pop(-1)
60+
columns = p[6]
61+
elif isinstance(p[4], str) and p[4].upper() == "RANGE_BUCKET":
62+
columns, range = self._parse_range_bucket([p[6], ",", p[8]])
63+
else:
64+
columns = p[6]
5365
else:
54-
columns = p_list[-1]
55-
if not _type and isinstance(p_list[4], str):
56-
_type = p_list[4]
66+
columns = p[len(p) - 1]
67+
if (
68+
_type is None
69+
and isinstance(columns, list)
70+
and len(columns) == 1
71+
and isinstance(columns[0], dict)
72+
and columns[0].get("func_name")
73+
and columns[0].get("args")
74+
):
75+
_type = columns[0]["func_name"]
76+
columns = [columns[0]["args"][1:-1]]
5777
p[0]["partition_by"] = {"columns": columns, "type": _type}
5878
if range:
5979
p[0]["partition_by"]["range"] = range
@@ -745,20 +765,23 @@ def p_create_database(self, p: List) -> None:
745765
class Drop:
746766
def p_expression_drop_table(self, p: List) -> None:
747767
"""expr : DROP TABLE id
768+
| DROP TABLE IF EXISTS id
748769
| DROP TABLE id DOT id
770+
| DROP TABLE IF EXISTS id DOT id
749771
| TRUNCATE TABLE id
750772
| TRUNCATE TABLE id DOT id
751773
"""
752774
# get schema & table name
753775
p_list = list(p)
754776
schema = None
755-
if len(p) > 4:
756-
if "." in p:
757-
schema = p_list[-3]
758-
table_name = p_list[-1]
777+
if "." in p:
778+
schema = p_list[-3]
779+
table_name = p_list[-1]
759780
else:
760781
table_name = p_list[-1]
761782
p[0] = {"schema": schema, "table_name": table_name}
783+
if "IF" in p_list and "EXISTS" in p_list:
784+
p[0]["if_exists"] = True
762785

763786

764787
class Type:
@@ -1070,7 +1093,7 @@ def p_comment_value(self, p: List):
10701093
if isinstance(p[1], str) and p[1].upper() == "NULL":
10711094
p[0] = None
10721095
else:
1073-
p[0] = p[1][1:-1].replace("''", "'")
1096+
p[0] = check_spec(p[1]).replace("pars_m_n", r"\n")[1:-1].replace("''", "'")
10741097

10751098
def p_expression_comment_on(self, p: List):
10761099
"""expr : COMMENT ON TABLE id IS comment_value
@@ -1239,21 +1262,32 @@ def p_index_table_name(self, p: List) -> None:
12391262
def p_c_index(self, p: List) -> None:
12401263
"""c_index : INDEX LP index_pid RP
12411264
| INDEX id LP index_pid RP
1265+
| KEY LP index_pid RP
1266+
| KEY id LP index_pid RP
1267+
| UNIQUE INDEX LP index_pid RP
1268+
| UNIQUE INDEX id LP index_pid RP
12421269
| c_index INVISIBLE
12431270
| c_index VISIBLE"""
12441271
p_list = remove_par(p_list=list(p))
12451272
if isinstance(p_list[1], dict):
12461273
p[0] = p_list[1]
12471274
p[0]["details"] = {p_list[-1].lower(): True}
12481275
else:
1249-
if len(p_list) == 3:
1276+
if len(p) in {5, 6} and p[1] in {"INDEX", "KEY"}:
1277+
name = p[2] if len(p) == 6 else None
1278+
elif p[1] == "UNIQUE":
1279+
name = p[3] if len(p) == 7 else None
1280+
elif len(p_list) in {3, 4}:
12501281
name = None
12511282
else:
1252-
name = p_list[2]
1283+
name = p_list[3] if p_list[1] == "UNIQUE" else p_list[2]
12531284
p[0] = {
12541285
"index_stmt": True,
12551286
"name": name,
1256-
"columns": p_list[-1]["detailed_columns"],
1287+
"columns": p_list[-1]["columns"],
1288+
"detailed_columns": p_list[-1]["detailed_columns"],
1289+
"unique": "UNIQUE" in p_list,
1290+
"keyword": p[1] if p[1] != "UNIQUE" else "INDEX",
12571291
}
12581292

12591293
def p_create_index(self, p: List) -> None:
@@ -1332,22 +1366,9 @@ def p_expression_table(self, p: List) -> None: # noqa R701
13321366
p[0]["index"] = []
13331367
index_data = p_list[-1]
13341368
index_columns = index_data["columns"]
1335-
if (
1336-
isinstance(index_columns, list)
1337-
and index_columns
1338-
and isinstance(index_columns[0], dict)
1339-
):
1340-
columns = [index_columns]
1341-
detailed_columns = [
1342-
{
1343-
"name": index_columns,
1344-
"nulls": "LAST",
1345-
"order": "ASC",
1346-
}
1347-
]
1348-
elif isinstance(index_columns, list):
1369+
if isinstance(index_columns, list):
13491370
columns = index_columns
1350-
detailed_columns = [
1371+
detailed_columns = index_data.get("detailed_columns") or [
13511372
{"name": col, "nulls": "LAST", "order": "ASC"}
13521373
for col in index_columns
13531374
]
@@ -1365,7 +1386,7 @@ def p_expression_table(self, p: List) -> None: # noqa R701
13651386
"columns": columns,
13661387
"detailed_columns": detailed_columns,
13671388
"index_name": index_data["name"],
1368-
"unique": False,
1389+
"unique": index_data.get("unique", False),
13691390
}
13701391
_index.update(index_data.get("details", {}))
13711392
p[0]["index"].append(_index)
@@ -2021,25 +2042,36 @@ def p_using_tablespace(self, p: List) -> None:
20212042
def p_pid(self, p: List) -> None:
20222043
"""pid : id
20232044
| STRING
2045+
| id LP id RP
20242046
| pid id
20252047
| pid STRING
20262048
| STRING LP RP
20272049
| id LP RP
20282050
| pid COMMA id
2051+
| pid COMMA id LP id RP
20292052
| pid COMMA STRING
20302053
"""
20312054
p_list = list(p)
20322055

20332056
if len(p_list) == 4 and isinstance(p[1], str):
20342057
p[0] = ["".join(p[1:])]
2058+
elif len(p_list) == 5 and isinstance(p[1], str):
2059+
if str(p[3]).isnumeric():
2060+
p[0] = [p[1]]
2061+
else:
2062+
p[0] = [{"func_name": p[1], "args": f"({p[3]})"}]
20352063
elif not isinstance(p_list[1], list):
20362064
p[0] = [p_list[1]]
20372065
else:
20382066
p[0] = p_list[1]
2039-
p[0].append(p_list[-1])
2067+
if len(p_list) == 7:
2068+
p[0].append(p_list[3])
2069+
else:
2070+
p[0].append(p_list[-1])
20402071

20412072
def p_index_pid(self, p: List) -> None:
20422073
"""index_pid : id
2074+
| id LP id RP
20432075
| index_pid id
20442076
| index_pid COMMA index_pid
20452077
"""
@@ -2048,6 +2080,14 @@ def p_index_pid(self, p: List) -> None:
20482080
detailed_column = {"name": p_list[1], "order": "ASC", "nulls": "LAST"}
20492081
column = p_list[1]
20502082
p[0] = {"detailed_columns": [detailed_column], "columns": [column]}
2083+
elif len(p_list) == 5:
2084+
detailed_column = {
2085+
"name": p_list[1],
2086+
"order": "ASC",
2087+
"nulls": "LAST",
2088+
"length": int(p_list[3]) if str(p_list[3]).isnumeric() else p_list[3],
2089+
}
2090+
p[0] = {"detailed_columns": [detailed_column], "columns": [p_list[1]]}
20512091
else:
20522092
p[0] = p[1]
20532093
if len(p) == 3:
@@ -2139,6 +2179,7 @@ def p_expression_primary_key(self, p):
21392179
def p_uniq(self, p: List) -> None:
21402180
"""uniq : UNIQUE LP pid RP
21412181
| UNIQUE id LP pid RP
2182+
| UNIQUE KEY LP pid RP
21422183
| UNIQUE KEY id LP pid RP
21432184
"""
21442185
p_list = remove_par(list(p))

simple_ddl_parser/output/base_data.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ class BaseData:
3535
if_not_exists: Optional[bool] = field(
3636
default=False, metadata={"exclude_if_not_provided": True}
3737
)
38+
if_exists: Optional[bool] = field(
39+
default=False, metadata={"exclude_if_not_provided": True}
40+
)
3841
partition_by: Optional[dict] = field(
3942
default_factory=dict, metadata={"exclude_if_not_provided": True}
4043
)
@@ -87,6 +90,7 @@ def __post_init__(self):
8790
self.set_unique_columns()
8891
self.populate_keys()
8992
self.normalize_ref_columns_in_final_output()
93+
self.normalize_indexes_in_final_output()
9094
self.post_process()
9195

9296
def set_unique_columns(self) -> None:
@@ -118,6 +122,26 @@ def normalize_ref_columns_in_final_output(self):
118122
del col_ref["name"]
119123
column["references"] = col_ref
120124

125+
def normalize_indexes_in_final_output(self) -> None:
126+
for index in self.index:
127+
columns = index.get("columns") or []
128+
if not columns or not isinstance(columns[0], str):
129+
continue
130+
if self.output_mode == "mysql":
131+
continue
132+
if index.get("index_name") is not None and self.output_mode != "snowflake":
133+
continue
134+
detailed_columns = index.get("detailed_columns") or []
135+
index["columns"] = [[deepcopy(column)] for column in detailed_columns]
136+
index["detailed_columns"] = [
137+
{
138+
"name": [deepcopy(column)],
139+
"nulls": column["nulls"],
140+
"order": column["order"],
141+
}
142+
for column in detailed_columns
143+
]
144+
121145
def populate_keys(self) -> None:
122146
"""primary_key - list of column names, example: "primary_key": ["data_sync_id", "sync_start"],"""
123147

0 commit comments

Comments (0)