Skip to content

Commit 0c9e7f7

Browse files
sonar-nigel[bot]joke1196marc-jasper-sonarsource
authored andcommitted
SONARPY-3953 Rule S8516 Group iterators from "itertools.groupby" should not be reused (#994)
Co-authored-by: David Kunzmann <david.kunzmann@sonarsource.com> Co-authored-by: Marc Jasper <marc.jasper@sonarsource.com> GitOrigin-RevId: c6df072122dd7d5eb397d5c5c260a574e84bfa7d
1 parent 5496a22 commit 0c9e7f7

9 files changed

Lines changed: 726 additions & 2 deletions

File tree

Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
/*
2+
* SonarQube Python Plugin
3+
* Copyright (C) SonarSource Sàrl
4+
* mailto:info AT sonarsource DOT com
5+
*
6+
* You can redistribute and/or modify this program under the terms of
7+
* the Sonar Source-Available License Version 1, as published by SonarSource Sàrl.
8+
*
9+
* This program is distributed in the hope that it will be useful,
10+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
11+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12+
* See the Sonar Source-Available License for more details.
13+
*
14+
* You should have received a copy of the Sonar Source-Available License
15+
* along with this program; if not, see https://sonarsource.com/license/ssal/
16+
*/
17+
package org.sonar.python.checks;
18+
19+
import java.util.List;
20+
import java.util.Optional;
21+
import org.sonar.check.Rule;
22+
import org.sonar.plugins.python.api.PythonSubscriptionCheck;
23+
import org.sonar.plugins.python.api.SubscriptionContext;
24+
import org.sonar.plugins.python.api.quickfix.PythonQuickFix;
25+
import org.sonar.plugins.python.api.symbols.v2.SymbolV2;
26+
import org.sonar.plugins.python.api.symbols.v2.UsageV2;
27+
import org.sonar.plugins.python.api.tree.ArgList;
28+
import org.sonar.plugins.python.api.tree.CallExpression;
29+
import org.sonar.plugins.python.api.tree.ComprehensionFor;
30+
import org.sonar.plugins.python.api.tree.Expression;
31+
import org.sonar.plugins.python.api.tree.ForStatement;
32+
import org.sonar.plugins.python.api.tree.Name;
33+
import org.sonar.plugins.python.api.tree.RegularArgument;
34+
import org.sonar.plugins.python.api.tree.Tree;
35+
import org.sonar.plugins.python.api.types.v2.matchers.TypeMatcher;
36+
import org.sonar.plugins.python.api.types.v2.matchers.TypeMatchers;
37+
import org.sonar.python.quickfix.TextEditUtils;
38+
import org.sonar.python.tree.TreeUtils;
39+
40+
@Rule(key = "S8516")
41+
public class GroupByIteratorReuseCheck extends PythonSubscriptionCheck {
42+
43+
private static final String MESSAGE = "Convert this group iterator to a list.";
44+
private static final String QUICK_FIX_MESSAGE = "Wrap with \"list()\"";
45+
46+
private static final TypeMatcher GROUPBY_MATCHER = TypeMatchers.isType("itertools.groupby");
47+
48+
// SAFE_CONSUMER_MATCHER and CLASS_MATCHER are matched leniently (TRUE or UNKNOWN both pass) so
49+
// unresolved callees don't trigger false positives.
50+
private static final TypeMatcher SAFE_CONSUMER_MATCHER = TypeMatchers.any(
51+
TypeMatchers.isType("list"),
52+
TypeMatchers.isType("tuple"),
53+
TypeMatchers.isType("set"),
54+
TypeMatchers.isType("frozenset"),
55+
TypeMatchers.isType("sorted"),
56+
TypeMatchers.isType("sum"),
57+
TypeMatchers.isType("max"),
58+
TypeMatchers.isType("min"),
59+
TypeMatchers.isType("any"),
60+
TypeMatchers.isType("all")
61+
);
62+
63+
// Any class constructor that accepts an iterable invariably materializes it inside __init__
64+
private static final TypeMatcher CLASS_MATCHER = TypeMatchers.isObjectOfType("type");
65+
66+
@Override
67+
public void initialize(Context context) {
68+
context.registerSyntaxNodeConsumer(Tree.Kind.FOR_STMT, GroupByIteratorReuseCheck::checkForStatement);
69+
}
70+
71+
private static void checkForStatement(SubscriptionContext ctx) {
72+
ForStatement forStatement = (ForStatement) ctx.syntaxNode();
73+
74+
if (forStatement.testExpressions().size() != 1) {
75+
return;
76+
}
77+
78+
if (!(forStatement.testExpressions().get(0) instanceof CallExpression callExpr)) {
79+
return;
80+
}
81+
82+
if (!GROUPBY_MATCHER.isTrueFor(callExpr.callee(), ctx)) {
83+
return;
84+
}
85+
86+
if (forStatement.expressions().size() != 2) {
87+
return;
88+
}
89+
90+
if (!(forStatement.expressions().get(1) instanceof Name groupName)) {
91+
return;
92+
}
93+
94+
SymbolV2 groupSymbol = groupName.symbolV2();
95+
if (groupSymbol == null) {
96+
return;
97+
}
98+
99+
Tree loopBody = forStatement.body();
100+
101+
// If `group` is rebound anywhere in the loop body (e.g. `group = list(group)`), we can't
102+
// tell from the AST alone which reads see the original iterator. We conservatively skip.
103+
boolean isReboundInLoopBody = groupSymbol.usages().stream()
104+
.filter(usage -> usage.kind() == UsageV2.Kind.ASSIGNMENT_LHS)
105+
.map(UsageV2::tree)
106+
.flatMap(TreeUtils.toStreamInstanceOfMapper(Name.class))
107+
.anyMatch(reboundName -> isInside(reboundName, loopBody));
108+
if (isReboundInLoopBody) {
109+
return;
110+
}
111+
112+
List<Name> loopBodyReads = groupSymbol.usages().stream()
113+
.filter(usage -> !usage.isBindingUsage())
114+
.map(UsageV2::tree)
115+
.flatMap(TreeUtils.toStreamInstanceOfMapper(Name.class))
116+
.filter(nameUsage -> isInside(nameUsage, loopBody))
117+
.toList();
118+
119+
List<Name> unsafeReads = loopBodyReads.stream()
120+
.filter(nameUsage -> !isSafeUsage(nameUsage, forStatement, ctx))
121+
.toList();
122+
123+
// The quickfix wraps a single occurrence in `list(...)`. We only attach it when there is
124+
// exactly one read of `group` in the loop body — this does not affecting any other consumer.
125+
boolean canOfferQuickFix = loopBodyReads.size() == 1 && unsafeReads.size() == 1;
126+
for (Name nameUsage : unsafeReads) {
127+
var issue = ctx.addIssue(nameUsage, MESSAGE);
128+
if (canOfferQuickFix) {
129+
PythonQuickFix quickFix = PythonQuickFix.newQuickFix(QUICK_FIX_MESSAGE)
130+
.addTextEdit(TextEditUtils.insertBefore(nameUsage, "list("))
131+
.addTextEdit(TextEditUtils.insertAfter(nameUsage, ")"))
132+
.build();
133+
issue.addQuickFix(quickFix);
134+
}
135+
}
136+
}
137+
138+
private static boolean isSafeUsage(Name nameUsage, ForStatement enclosingForStatement, SubscriptionContext ctx) {
139+
// A usage inside a nested function or lambda defined in the loop body is always unsafe
140+
if (isInsideNestedFunctionOrLambda(nameUsage, enclosingForStatement)) {
141+
return false;
142+
}
143+
144+
if (nameUsage.parent() instanceof ComprehensionFor compFor && compFor.iterable() == nameUsage) {
145+
return true;
146+
}
147+
148+
if (nameUsage.parent() instanceof ForStatement nestedFor
149+
&& nestedFor.testExpressions().stream().anyMatch(e -> e == nameUsage)) {
150+
return true;
151+
}
152+
153+
if (nameUsage.parent() instanceof RegularArgument regularArg && regularArg.keywordArgument() == null) {
154+
return hasSafeConsumerAncestor(regularArg, ctx);
155+
}
156+
157+
return false;
158+
}
159+
160+
// We raise when the group iterator escapes the current iteration and is read after the outer
161+
// `groupby` advances. A positional-arg call chain ending in a safe consumer cannot escape.
162+
private static boolean hasSafeConsumerAncestor(RegularArgument regularArg, SubscriptionContext ctx) {
163+
Optional<CallExpression> currentCall = owningCall(regularArg);
164+
while (currentCall.isPresent()) {
165+
CallExpression call = currentCall.get();
166+
if (isSafeConsumerCallee(call.callee(), ctx)) {
167+
return true;
168+
}
169+
if (!(call.parent() instanceof RegularArgument outerArg) || outerArg.keywordArgument() != null) {
170+
return false;
171+
}
172+
currentCall = owningCall(outerArg);
173+
}
174+
return false;
175+
}
176+
177+
private static boolean isSafeConsumerCallee(Expression callee, SubscriptionContext ctx) {
178+
return !SAFE_CONSUMER_MATCHER.evaluateFor(callee, ctx).isFalse()
179+
|| !CLASS_MATCHER.evaluateFor(callee, ctx).isFalse();
180+
}
181+
182+
private static boolean isInsideNestedFunctionOrLambda(Name nameUsage, ForStatement enclosingForStatement) {
183+
Tree functionLikeAncestor = TreeUtils.firstAncestorOfKind(nameUsage, Tree.Kind.FUNCDEF, Tree.Kind.LAMBDA);
184+
return functionLikeAncestor != null
185+
&& isInside(functionLikeAncestor, enclosingForStatement.body());
186+
}
187+
188+
private static boolean isInside(Tree tree, Tree container) {
189+
return TreeUtils.firstAncestor(tree, ancestor -> ancestor == container) != null;
190+
}
191+
192+
private static Optional<CallExpression> owningCall(RegularArgument regularArg) {
193+
if (regularArg.parent() instanceof ArgList argList && argList.parent() instanceof CallExpression callExpr) {
194+
return Optional.of(callExpr);
195+
}
196+
return Optional.empty();
197+
}
198+
}

python-checks/src/main/java/org/sonar/python/checks/OpenSourceCheckList.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,7 @@ public Stream<Class<?>> getChecks() {
263263
GraphemeClustersInClassesCheck.class,
264264
GraphQLDenialOfServiceCheck.class,
265265
GraphQLIntrospectionCheck.class,
266+
GroupByIteratorReuseCheck.class,
266267
GroupReplacementCheck.class,
267268
HardCodedCredentialsCheck.class,
268269
HardCodedCredentialsEntropyCheck.class,
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
<p>This rule raises an issue when a group iterator returned by <code>itertools.groupby()</code> is stored and reused after the <code>groupby</code>
2+
object has advanced to the next group.</p>
3+
<h2>Why is this an issue?</h2>
4+
<p>The <code>itertools.groupby()</code> function groups consecutive items from an iterable based on a key function. It returns pairs of <code>(key,
5+
group_iterator)</code> where each <code>group_iterator</code> is a sub-iterator that yields items for that specific group.</p>
6+
<p>A critical characteristic of these group iterators is that they are consumed when the main <code>groupby</code> iterator advances to the next
7+
group. This means if you store a reference to a group iterator and try to use it later, it will be empty and yield no results.</p>
8+
<p>This behavior occurs because <code>groupby</code> uses lazy evaluation for efficiency. The group iterators share the same underlying data source,
9+
and advancing the main iterator invalidates previous group iterators. The problem typically manifests when storing group iterators in a collection for
10+
later use or when trying to iterate over the same group iterator more than once.</p>
11+
<h3>What is the potential impact?</h3>
12+
<p>When group iterators are reused, they yield no items. Your code will process empty sequences instead of the actual grouped data, leading to
13+
incorrect results, silent failures, and data loss. These issues can be difficult to debug because the code does not raise exceptions but simply
14+
produces wrong output.</p>
15+
<h2>How to fix it</h2>
16+
<p>Convert the group iterator to a list immediately when processing each group. This materializes the data and stores it in memory, allowing you to
17+
reuse it later.</p>
18+
<h3>Code examples</h3>
19+
<h4>Noncompliant code example</h4>
20+
<pre data-diff-id="1" data-diff-type="noncompliant">
21+
from itertools import groupby
22+
23+
data = [1, 1, 2, 2, 3]
24+
groups = {}
25+
26+
for key, group in groupby(data):
27+
groups[key] = group # Noncompliant
28+
29+
for key, group in groups.items():
30+
print(f"{key}: {list(group)}") # Empty results
31+
</pre>
32+
<h4>Compliant solution</h4>
33+
<pre data-diff-id="1" data-diff-type="compliant">
34+
from itertools import groupby
35+
36+
data = [1, 1, 2, 2, 3]
37+
groups = {}
38+
39+
for key, group in groupby(data):
40+
groups[key] = list(group) # Convert to list immediately
41+
42+
for key, group in groups.items():
43+
print(f"{key}: {group}") # Correct results
44+
</pre>
45+
<h2>Resources</h2>
46+
<h3>Documentation</h3>
47+
<ul>
48+
<li>Python Documentation - <a href="https://docs.python.org/3/library/itertools.html#itertools.groupby"><code>itertools.groupby</code></a></li>
49+
<li>Python Documentation - <a href="https://docs.python.org/3/library/itertools.html#itertools-recipes">itertools Recipes</a></li>
50+
</ul>
51+
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
{
2+
"title": "Group iterators from \"itertools.groupby\" should not be reused",
3+
"type": "BUG",
4+
"status": "ready",
5+
"remediation": {
6+
"func": "Constant\/Issue",
7+
"constantCost": "5min"
8+
},
9+
"tags": [
10+
"pitfall",
11+
"confusing"
12+
],
13+
"defaultSeverity": "Major",
14+
"ruleSpecification": "RSPEC-8516",
15+
"sqKey": "S8516",
16+
"scope": "All",
17+
"quickfix": "targeted",
18+
"code": {
19+
"impacts": {
20+
"RELIABILITY": "HIGH"
21+
},
22+
"attribute": "LOGICAL"
23+
}
24+
}

python-checks/src/main/resources/org/sonar/l10n/py/rules/python/Sonar_way_profile.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -329,16 +329,17 @@
329329
"S8414",
330330
"S8415",
331331
"S8490",
332+
"S8493",
332333
"S8494",
333334
"S8495",
334335
"S8504",
335336
"S8505",
336337
"S8507",
337338
"S8513",
339+
"S8516",
338340
"S8517",
339341
"S8519",
340342
"S8520",
341-
"S8521",
342-
"S8493"
343+
"S8521"
343344
]
344345
}

0 commit comments

Comments
 (0)