Skip to content

Commit bb3fbd5

Browse files
Add W024 select-distinct-suspicious (DISTINCT + JOIN band-aid heuristic) (#49)
* Add W024 select-distinct-suspicious rule Fires when a statement contains both top-level SELECT DISTINCT and any JOIN keyword. The pattern is a frequent band-aid for a missing join condition or a missing GROUP BY: developers reach for DISTINCT to swallow row duplication caused by an over-wide JOIN cardinality rather than fixing the join. The regex anchors on SELECT directly preceding DISTINCT so aggregate- DISTINCT forms like COUNT(DISTINCT col) and SUM(DISTINCT amount) are not flagged. String literals and comments are stripped first via strip_strings_and_comments so a mention of SELECT DISTINCT inside a literal or a comment cannot trigger the rule. multiline = True so the DISTINCT and JOIN keywords can sit on different lines of the same statement. Resolves #40. * Register W024 in rules registry; bump test counts to 42 / 31 warnings * Add W024 select-distinct-suspicious tests Covers: INNER JOIN, LEFT JOIN, multi-line, case-insensitive, single-table DISTINCT (passes), COUNT(DISTINCT) with JOIN (passes), SUM(DISTINCT) with JOIN (passes), JOIN without DISTINCT (passes), DISTINCT inside a string literal (passes), DISTINCT inside a -- comment (passes), and a message-content assertion. * Document W024 in CHANGELOG under [Unreleased] Added * Add W024 row to README rule table * style: pre-commit auto-fixes [pre-commit.ci] auto-applied fixes from configured hooks --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent b193c0d commit bb3fbd5

6 files changed

Lines changed: 152 additions & 5 deletions

File tree

CHANGELOG.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,17 @@ a deprecation window (see `GOVERNANCE.md` § Scope discipline).
4646
the explicit-column variant (`SELECT col1, col2 INTO target FROM
4747
source`) is allowed through here and covered by the contracts pack
4848
at C001/C003 if a column type drifts. Resolves #43.
49+
- **W024 `select-distinct-suspicious`** (warning) - fires when a
50+
statement contains both top-level `SELECT DISTINCT` and any `JOIN`
51+
keyword. The pattern is a common band-aid for a missing join
52+
condition or a missing `GROUP BY`: developers reach for `DISTINCT`
53+
to swallow row duplication caused by an over-wide JOIN cardinality
54+
rather than fixing the join. Standalone `SELECT DISTINCT col FROM t`
55+
(no join) is left alone, and aggregate-DISTINCT forms like
56+
`COUNT(DISTINCT col)` are not flagged because the regex anchors on
57+
`SELECT` directly preceding `DISTINCT`. String literals and comments
58+
are stripped first so a mention of "SELECT DISTINCT" inside a string
59+
or comment cannot fire the rule. Resolves #40.
4960

5061
### Fixed
5162

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,7 @@ sql-sop list-rules # show every registered rule
238238
| W020 | `truncate-table` | `TRUNCATE TABLE staging;` -- bypasses triggers, resets identity |
239239
| W022 | `cross-join-explicit` | `FROM products CROSS JOIN regions` -- Cartesian product, confirm intent |
240240
| W023 | `scalar-udf-in-where` | `WHERE dbo.fn_X(col) = 1` -- row-by-row predicate evaluation |
241+
| W024 | `select-distinct-suspicious` | `SELECT DISTINCT a, b FROM x JOIN y ON ...` -- DISTINCT often masks a missing join condition or GROUP BY |
241242

242243

243244
### Structural (v0.3.0+, sqlparse-based)

sql_guard/rules/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
OrAcrossColumns,
3434
OrderByWithoutLimit,
3535
ScalarUdfInWhere,
36+
SelectDistinctSuspicious,
3637
SelectStar,
3738
SubqueryCouldBeJoin,
3839
TruncateTable,
@@ -106,6 +107,7 @@
106107
LeadingWildcardLike(),
107108
OrAcrossColumns(),
108109
CountDistinctUnbounded(),
110+
SelectDistinctSuspicious(),
109111
TruncateTable(),
110112
HavingWithoutGroupBy(),
111113
CrossJoinExplicit(),

sql_guard/rules/warnings.py

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
import re
66

7-
from sql_guard.rules.base import Finding, Rule
7+
from sql_guard.rules.base import Finding, Rule, strip_strings_and_comments
88

99

1010
class SelectStar(Rule):
@@ -510,6 +510,56 @@ def check_line(self, line: str, line_number: int, file: str) -> Finding | None:
510510
return None
511511

512512

513+
class SelectDistinctSuspicious(Rule):
514+
"""W024: ``SELECT DISTINCT`` paired with ``JOIN`` is often a band-aid.
515+
516+
Developers reach for ``SELECT DISTINCT`` to deduplicate rows when a
517+
JOIN cardinality is wider than they expected, often because a join
518+
condition is missing or because what they actually want is a
519+
``GROUP BY``. The result reads more rows than necessary and hides
520+
the underlying join-condition bug.
521+
522+
Fires when a statement contains both top-level ``SELECT DISTINCT``
523+
and any ``JOIN`` keyword. Standalone ``SELECT DISTINCT col FROM t``
524+
(no join) is fine -- the smell is the cross-join blow-up pattern,
525+
not legitimate single-table uniqueness.
526+
527+
``COUNT(DISTINCT col)`` and other aggregate-DISTINCT forms are not
528+
flagged: the regex anchors on ``SELECT`` directly preceding
529+
``DISTINCT``, so ``SELECT COUNT(DISTINCT x) FROM t JOIN u`` does
530+
not match.
531+
"""
532+
533+
id = "W024"
534+
name = "select-distinct-suspicious"
535+
severity = "warning"
536+
description = "SELECT DISTINCT combined with JOIN often masks a missing join condition"
537+
multiline = True
538+
539+
_select_distinct = Rule._compile(r"\bSELECT\s+DISTINCT\b")
540+
_join = Rule._compile(r"\bJOIN\b")
541+
542+
def check_statement(self, statement: str, start_line: int, file: str) -> Finding | None:
543+
# Strip strings and comments so DISTINCT or JOIN inside literal
544+
# text or comments cannot trigger a false positive.
545+
stripped = strip_strings_and_comments(statement)
546+
if self._select_distinct.search(stripped) and self._join.search(stripped):
547+
return Finding(
548+
rule_id=self.id,
549+
severity=self.severity,
550+
file=file,
551+
line=start_line,
552+
message=(
553+
"SELECT DISTINCT with JOIN often masks a missing join condition or grouping"
554+
),
555+
suggestion=(
556+
"Verify the JOIN condition is correct and DISTINCT is genuinely needed; "
557+
"consider GROUP BY if you are hiding row duplication"
558+
),
559+
)
560+
return None
561+
562+
513563
class CountDistinctUnbounded(Rule):
514564
"""W019: ``COUNT(DISTINCT col)`` on an unfiltered table.
515565

tests/test_new_rules.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
LeadingWildcardLike,
1111
OrAcrossColumns,
1212
ScalarUdfInWhere,
13+
SelectDistinctSuspicious,
1314
TruncateTable,
1415
)
1516

@@ -387,3 +388,85 @@ def test_w022_does_not_flag_cross_join_inside_trailing_comment():
387388
)
388389
is None
389390
)
391+
392+
393+
# W024 select-distinct-suspicious ---------------------------------------------
394+
395+
396+
def test_w024_flags_distinct_with_inner_join():
397+
rule = SelectDistinctSuspicious()
398+
finding = _stmt(
399+
rule,
400+
"SELECT DISTINCT c.id, c.name FROM customers c JOIN orders o ON c.id = o.customer_id;",
401+
)
402+
assert finding is not None
403+
assert finding.rule_id == "W024"
404+
assert finding.severity == "warning"
405+
406+
407+
def test_w024_flags_distinct_with_left_join():
408+
rule = SelectDistinctSuspicious()
409+
sql = "SELECT DISTINCT a.id FROM a LEFT JOIN b ON a.id = b.a_id;"
410+
assert _stmt(rule, sql) is not None
411+
412+
413+
def test_w024_flags_distinct_with_join_multiline():
414+
rule = SelectDistinctSuspicious()
415+
sql = (
416+
"SELECT DISTINCT\n c.id, c.name\nFROM customers c\nJOIN orders o ON c.id = o.customer_id;"
417+
)
418+
assert _stmt(rule, sql) is not None
419+
420+
421+
def test_w024_case_insensitive():
422+
rule = SelectDistinctSuspicious()
423+
assert _stmt(rule, "select distinct a from x join y on x.id = y.id;") is not None
424+
425+
426+
def test_w024_does_not_flag_distinct_alone():
427+
# Single-table DISTINCT is fine -- no JOIN cardinality blow-up to mask.
428+
rule = SelectDistinctSuspicious()
429+
assert _stmt(rule, "SELECT DISTINCT country FROM customers;") is None
430+
431+
432+
def test_w024_does_not_flag_count_distinct_with_join():
433+
# Aggregate-DISTINCT is a different pattern; the regex anchors on
434+
# "SELECT DISTINCT" directly, not "COUNT(DISTINCT ...)".
435+
rule = SelectDistinctSuspicious()
436+
sql = "SELECT COUNT(DISTINCT c.id) FROM customers c JOIN orders o ON c.id = o.customer_id;"
437+
assert _stmt(rule, sql) is None
438+
439+
440+
def test_w024_does_not_flag_sum_distinct_with_join():
441+
rule = SelectDistinctSuspicious()
442+
sql = "SELECT SUM(DISTINCT amount) FROM payments p JOIN customers c ON p.cust_id = c.id;"
443+
assert _stmt(rule, sql) is None
444+
445+
446+
def test_w024_does_not_flag_join_without_distinct():
447+
rule = SelectDistinctSuspicious()
448+
assert _stmt(rule, "SELECT a.id FROM a JOIN b ON a.id = b.a_id;") is None
449+
450+
451+
def test_w024_does_not_flag_distinct_inside_string_literal():
452+
rule = SelectDistinctSuspicious()
453+
sql = "INSERT INTO log(msg) SELECT 'SELECT DISTINCT x JOIN y' FROM t JOIN u ON t.id = u.id;"
454+
# Outer SELECT does not have DISTINCT; the literal mentions it.
455+
assert _stmt(rule, sql) is None
456+
457+
458+
def test_w024_does_not_flag_distinct_inside_comment():
459+
rule = SelectDistinctSuspicious()
460+
sql = "-- SELECT DISTINCT x FROM y JOIN z\nSELECT id FROM t;"
461+
assert _stmt(rule, sql) is None
462+
463+
464+
def test_w024_message_mentions_join_or_grouping():
465+
rule = SelectDistinctSuspicious()
466+
finding = _stmt(
467+
rule,
468+
"SELECT DISTINCT a FROM x JOIN y ON x.id = y.id;",
469+
)
470+
assert finding is not None
471+
msg = finding.message.lower()
472+
assert "join" in msg or "grouping" in msg

tests/test_rules.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,19 +18,19 @@
1818

1919
class TestRuleRegistry:
2020
def test_all_rules_loaded(self) -> None:
21-
assert len(ALL_RULES) == 41
21+
assert len(ALL_RULES) == 42
2222

2323
def test_11_errors(self) -> None:
2424
# 9 E-series + 2 T-series (T002 xp-cmdshell, T004 deprecated-outer-join).
2525
errors = [r for r in ALL_RULES if r.severity == "error"]
2626
assert len(errors) == 11
2727

28-
def test_30_warnings(self) -> None:
29-
# 23 W-series + 3 S-series + 4 T-series (T001 with-nolock,
28+
def test_31_warnings(self) -> None:
29+
# 24 W-series + 3 S-series + 4 T-series (T001 with-nolock,
3030
# T003 cursor-declaration, T005 create-index-without-online,
3131
# T006 select-into-without-typed-fields).
3232
warnings = [r for r in ALL_RULES if r.severity == "warning"]
33-
assert len(warnings) == 30
33+
assert len(warnings) == 31
3434

3535
def test_unique_ids(self) -> None:
3636
ids = [r.id for r in ALL_RULES]

0 commit comments

Comments
 (0)