Skip to content

Commit 742621c

Browse files
marc-jasper-sonarsourcesonartech
authored andcommitted
SONARPY-4107 Fix S8516 FPs by detecting only known unsafe sinks (#1092)
GitOrigin-RevId: 71e8ff65acf05e269337957bccc0716d11efb013
1 parent 57349a0 commit 742621c

5 files changed

Lines changed: 176 additions & 246 deletions

File tree

python-checks/src/main/java/org/sonar/python/checks/GroupByIteratorReuseCheck.java

Lines changed: 86 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -18,20 +18,26 @@
1818

1919
import java.util.List;
2020
import java.util.Optional;
21+
import java.util.Set;
22+
import java.util.function.Predicate;
23+
import java.util.stream.Stream;
2124
import org.sonar.check.Rule;
2225
import org.sonar.plugins.python.api.PythonSubscriptionCheck;
2326
import org.sonar.plugins.python.api.SubscriptionContext;
2427
import org.sonar.plugins.python.api.quickfix.PythonQuickFix;
2528
import org.sonar.plugins.python.api.symbols.v2.SymbolV2;
2629
import org.sonar.plugins.python.api.symbols.v2.UsageV2;
2730
import org.sonar.plugins.python.api.tree.ArgList;
31+
import org.sonar.plugins.python.api.tree.AssignmentStatement;
2832
import org.sonar.plugins.python.api.tree.CallExpression;
29-
import org.sonar.plugins.python.api.tree.ComprehensionFor;
3033
import org.sonar.plugins.python.api.tree.Expression;
3134
import org.sonar.plugins.python.api.tree.ForStatement;
3235
import org.sonar.plugins.python.api.tree.Name;
36+
import org.sonar.plugins.python.api.tree.QualifiedExpression;
3337
import org.sonar.plugins.python.api.tree.RegularArgument;
3438
import org.sonar.plugins.python.api.tree.Tree;
39+
import org.sonar.plugins.python.api.tree.YieldExpression;
40+
import org.sonar.plugins.python.api.tree.YieldStatement;
3541
import org.sonar.plugins.python.api.types.v2.matchers.TypeMatcher;
3642
import org.sonar.plugins.python.api.types.v2.matchers.TypeMatchers;
3743
import org.sonar.python.quickfix.TextEditUtils;
@@ -40,13 +46,11 @@
4046
@Rule(key = "S8516")
4147
public class GroupByIteratorReuseCheck extends PythonSubscriptionCheck {
4248

43-
private static final String MESSAGE = "Convert this group iterator to a list.";
49+
private static final String MESSAGE = "Consume this group iterator inside the loop, or materialize it into a collection.";
4450
private static final String QUICK_FIX_MESSAGE = "Wrap with \"list()\"";
4551

4652
private static final TypeMatcher GROUPBY_MATCHER = TypeMatchers.isType("itertools.groupby");
4753

48-
// SAFE_CONSUMER_MATCHER and CLASS_MATCHER are matched leniently (TRUE or UNKNOWN both pass) so
49-
// unresolved callees don't trigger false positives.
5054
private static final TypeMatcher SAFE_CONSUMER_MATCHER = TypeMatchers.any(
5155
TypeMatchers.isType("list"),
5256
TypeMatchers.isType("tuple"),
@@ -57,11 +61,21 @@ public class GroupByIteratorReuseCheck extends PythonSubscriptionCheck {
5761
TypeMatchers.isType("max"),
5862
TypeMatchers.isType("min"),
5963
TypeMatchers.isType("any"),
60-
TypeMatchers.isType("all")
64+
TypeMatchers.isType("all"),
65+
TypeMatchers.isType("next"),
66+
TypeMatchers.isType("len"),
67+
TypeMatchers.isType("str.join"),
68+
TypeMatchers.isType("bytes.join")
6169
);
6270

63-
// Any class constructor that accepts an iterable invariably materializes it inside __init__
64-
private static final TypeMatcher CLASS_MATCHER = TypeMatchers.isObjectOfType("type");
71+
// Matches class objects produced at runtime via `type(...)` (e.g. `Cls = type(obj); Cls(group)`).
72+
// Direct class references (`MyClass`) are NOT matched here
73+
private static final TypeMatcher RUNTIME_CLASS_OBJECT_MATCHER = TypeMatchers.isObjectOfType("type");
74+
75+
// Container methods that store their argument *as a single element* without iterating it.
76+
private static final Set<String> STORING_METHOD_NAMES = Set.of(
77+
"append", "add", "setdefault"
78+
);
6579

6680
@Override
6781
public void initialize(Context context) {
@@ -70,58 +84,33 @@ public void initialize(Context context) {
7084

7185
private static void checkForStatement(SubscriptionContext ctx) {
7286
ForStatement forStatement = (ForStatement) ctx.syntaxNode();
73-
74-
if (forStatement.testExpressions().size() != 1) {
75-
return;
76-
}
77-
78-
if (!(forStatement.testExpressions().get(0) instanceof CallExpression callExpr)) {
87+
Name groupName = extractGroupByLoopVariable(forStatement, ctx).orElse(null);
88+
if (groupName == null) {
7989
return;
8090
}
81-
82-
if (!GROUPBY_MATCHER.isTrueFor(callExpr.callee(), ctx)) {
83-
return;
84-
}
85-
86-
if (forStatement.expressions().size() != 2) {
87-
return;
88-
}
89-
90-
if (!(forStatement.expressions().get(1) instanceof Name groupName)) {
91-
return;
92-
}
93-
9491
SymbolV2 groupSymbol = groupName.symbolV2();
9592
if (groupSymbol == null) {
9693
return;
9794
}
9895

9996
Tree loopBody = forStatement.body();
10097

101-
// If `group` is rebound anywhere in the loop body (e.g. `group = list(group)`), we can't
102-
// tell from the AST alone which reads see the original iterator. We conservatively skip.
103-
boolean isReboundInLoopBody = groupSymbol.usages().stream()
104-
.filter(usage -> usage.kind() == UsageV2.Kind.ASSIGNMENT_LHS)
105-
.map(UsageV2::tree)
106-
.flatMap(TreeUtils.toStreamInstanceOfMapper(Name.class))
107-
.anyMatch(reboundName -> isInside(reboundName, loopBody));
98+
// Bail on any rebinding of `group` in the body to avoid requiring a CFG
99+
boolean isReboundInLoopBody = namesInLoopBody(groupSymbol, loopBody,
100+
usage -> usage.kind() == UsageV2.Kind.ASSIGNMENT_LHS).findAny().isPresent();
108101
if (isReboundInLoopBody) {
109102
return;
110103
}
111104

112-
List<Name> loopBodyReads = groupSymbol.usages().stream()
113-
.filter(usage -> !usage.isBindingUsage())
114-
.map(UsageV2::tree)
115-
.flatMap(TreeUtils.toStreamInstanceOfMapper(Name.class))
116-
.filter(nameUsage -> isInside(nameUsage, loopBody))
117-
.toList();
105+
List<Name> loopBodyReads = namesInLoopBody(groupSymbol, loopBody,
106+
usage -> !usage.isBindingUsage()).toList();
118107

119108
List<Name> unsafeReads = loopBodyReads.stream()
120-
.filter(nameUsage -> !isSafeUsage(nameUsage, forStatement, ctx))
109+
.filter(nameUsage -> isUnsafeRead(nameUsage, forStatement, ctx))
121110
.toList();
122111

123-
// The quickfix wraps a single occurrence in `list(...)`. We only attach it when there is
124-
// exactly one read of `group` in the loop body — this does not affecting any other consumer.
112+
// Quickfix only when there is a single read in the body: wrapping `group` in `list()`
113+
// consumes the iterator and would silently break any other read.
125114
boolean canOfferQuickFix = loopBodyReads.size() == 1 && unsafeReads.size() == 1;
126115
for (Name nameUsage : unsafeReads) {
127116
var issue = ctx.addIssue(nameUsage, MESSAGE);
@@ -135,54 +124,80 @@ private static void checkForStatement(SubscriptionContext ctx) {
135124
}
136125
}
137126

138-
private static boolean isSafeUsage(Name nameUsage, ForStatement enclosingForStatement, SubscriptionContext ctx) {
139-
// A usage inside a nested function or lambda defined in the loop body is always unsafe
140-
if (isInsideNestedFunctionOrLambda(nameUsage, enclosingForStatement)) {
141-
return false;
127+
// Matches `for key, group in groupby(...):` and returns the `group` name
128+
private static Optional<Name> extractGroupByLoopVariable(ForStatement forStatement, SubscriptionContext ctx) {
129+
if (forStatement.testExpressions().size() != 1
130+
|| !(forStatement.testExpressions().get(0) instanceof CallExpression callExpr)
131+
|| !GROUPBY_MATCHER.isTrueFor(callExpr.callee(), ctx)
132+
|| forStatement.expressions().size() != 2
133+
|| !(forStatement.expressions().get(1) instanceof Name groupName)) {
134+
return Optional.empty();
142135
}
136+
return Optional.of(groupName);
137+
}
143138

144-
if (nameUsage.parent() instanceof ComprehensionFor compFor && compFor.iterable() == nameUsage) {
139+
// Recognized escape sinks: lambda/nested-function capture, assignment rvalue, yield, and
140+
// positional argument of a known storing-method. Anything else is treated as safe.
141+
private static boolean isUnsafeRead(Name nameUsage, ForStatement enclosingForStatement, SubscriptionContext ctx) {
142+
if (isCapturedByNestedFunctionOrLambda(nameUsage, enclosingForStatement)) {
145143
return true;
146144
}
145+
return reachesSink(nameUsage, ctx);
146+
}
147147

148-
if (nameUsage.parent() instanceof ForStatement nestedFor
149-
&& nestedFor.testExpressions().stream().anyMatch(e -> e == nameUsage)) {
148+
private static boolean reachesSink(Expression expression, SubscriptionContext ctx) {
149+
Tree parent = expression.parent();
150+
if (parent instanceof AssignmentStatement assign && assign.assignedValue() == expression) {
150151
return true;
151152
}
152-
153-
if (nameUsage.parent() instanceof RegularArgument regularArg && regularArg.keywordArgument() == null) {
154-
return hasSafeConsumerAncestor(regularArg, ctx);
153+
if (parent instanceof YieldExpression || parent instanceof YieldStatement) {
154+
return true;
155+
}
156+
// Keyword arguments are skipped (treated as safe): mapping them to the callee's parameter would
157+
// require signature resolution, and iterators are overwhelmingly passed positionally in practice.
158+
if (parent instanceof RegularArgument regularArg && regularArg.keywordArgument() == null) {
159+
return chainReachesSink(regularArg, ctx);
155160
}
156-
157161
return false;
158162
}
159163

160-
// We raise when the group iterator escapes the current iteration and is read after the outer
161-
// `groupby` advances. A positional-arg call chain ending in a safe consumer cannot escape.
162-
private static boolean hasSafeConsumerAncestor(RegularArgument regularArg, SubscriptionContext ctx) {
163-
Optional<CallExpression> currentCall = owningCall(regularArg);
164-
while (currentCall.isPresent()) {
165-
CallExpression call = currentCall.get();
166-
if (isSafeConsumerCallee(call.callee(), ctx)) {
167-
return true;
168-
}
169-
if (!(call.parent() instanceof RegularArgument outerArg) || outerArg.keywordArgument() != null) {
170-
return false;
171-
}
172-
currentCall = owningCall(outerArg);
164+
private static boolean chainReachesSink(RegularArgument arg, SubscriptionContext ctx) {
165+
CallExpression call = owningCall(arg).orElse(null);
166+
if (call == null) {
167+
return false;
173168
}
174-
return false;
169+
if (isSafeConsumerCallee(call.callee(), ctx)) {
170+
return false;
171+
}
172+
if (isStoringMethodCall(call)) {
173+
return true;
174+
}
175+
return reachesSink(call, ctx);
175176
}
176177

177178
private static boolean isSafeConsumerCallee(Expression callee, SubscriptionContext ctx) {
178179
return !SAFE_CONSUMER_MATCHER.evaluateFor(callee, ctx).isFalse()
179-
|| !CLASS_MATCHER.evaluateFor(callee, ctx).isFalse();
180+
|| !RUNTIME_CLASS_OBJECT_MATCHER.evaluateFor(callee, ctx).isFalse();
181+
}
182+
183+
// Name-only on purpose: gating on the receiver type would silently miss the case where the
184+
// receiver's type cannot be resolved. Middle ground between FP risk and raising actual issues.
185+
private static boolean isStoringMethodCall(CallExpression call) {
186+
return call.callee() instanceof QualifiedExpression qualified
187+
&& STORING_METHOD_NAMES.contains(qualified.name().name());
180188
}
181189

182-
private static boolean isInsideNestedFunctionOrLambda(Name nameUsage, ForStatement enclosingForStatement) {
190+
private static boolean isCapturedByNestedFunctionOrLambda(Name nameUsage, ForStatement enclosingForStatement) {
183191
Tree functionLikeAncestor = TreeUtils.firstAncestorOfKind(nameUsage, Tree.Kind.FUNCDEF, Tree.Kind.LAMBDA);
184-
return functionLikeAncestor != null
185-
&& isInside(functionLikeAncestor, enclosingForStatement.body());
192+
return functionLikeAncestor != null && isInside(functionLikeAncestor, enclosingForStatement.body());
193+
}
194+
195+
private static Stream<Name> namesInLoopBody(SymbolV2 symbol, Tree loopBody, Predicate<UsageV2> usageFilter) {
196+
return symbol.usages().stream()
197+
.filter(usageFilter)
198+
.map(UsageV2::tree)
199+
.flatMap(TreeUtils.toStreamInstanceOfMapper(Name.class))
200+
.filter(name -> isInside(name, loopBody));
186201
}
187202

188203
private static boolean isInside(Tree tree, Tree container) {

python-checks/src/main/resources/org/sonar/l10n/py/rules/python/S8516.html

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,23 @@
1-
<p>This rule raises an issue when a group iterator returned by <code>itertools.groupby()</code> is stored and reused after the <code>groupby</code>
2-
object has advanced to the next group.</p>
1+
<p>This rule raises an issue when a group iterator returned by <code>itertools.groupby()</code> is stored outside the loop body for later use, instead
2+
of being consumed immediately.</p>
33
<h2>Why is this an issue?</h2>
44
<p>The <code>itertools.groupby()</code> function groups consecutive items from an iterable based on a key function. It returns pairs of <code>(key,
55
group_iterator)</code> where each <code>group_iterator</code> is a sub-iterator that yields items for that specific group.</p>
6-
<p>A critical characteristic of these group iterators is that they are consumed when the main <code>groupby</code> iterator advances to the next
7-
group. This means if you store a reference to a group iterator and try to use it later, it will be empty and yield no results.</p>
8-
<p>This behavior occurs because <code>groupby</code> uses lazy evaluation for efficiency. The group iterators share the same underlying data source,
9-
and advancing the main iterator invalidates previous group iterators. The problem typically manifests when storing group iterators in a collection for
10-
later use or when trying to iterate over the same group iterator more than once.</p>
6+
<p>A critical characteristic of these group iterators is that they share the same underlying data source as the main <code>groupby</code> iterator.
7+
Every time the outer loop advances to the next group, the previous group iterator is immediately invalidated — regardless of whether it has been
8+
iterated or not. Storing a raw group iterator and consuming it after the loop has moved on will always yield an empty sequence.</p>
119
<h3>What is the potential impact?</h3>
1210
<p>When group iterators are reused, they yield no items. Your code will process empty sequences instead of the actual grouped data, leading to
1311
incorrect results, silent failures, and data loss. These issues can be difficult to debug because the code does not raise exceptions but simply
1412
produces wrong output.</p>
1513
<h2>How to fix it</h2>
16-
<p>Convert the group iterator to a list immediately when processing each group. This materializes the data and stores it in memory, allowing you to
17-
reuse it later.</p>
14+
<p>There are two correct approaches:</p>
15+
<ul>
16+
<li><strong>Consume the group iterator immediately</strong> inside the loop body, never storing the raw iterator at all. This is the simplest
17+
pattern and avoids any risk of reuse.</li>
18+
<li><strong>Materialize the group iterator</strong> into a concrete collection (<code>list</code>, <code>tuple</code>, <code>set</code>, etc.)
19+
before the loop advances. This allows the data to be stored and accessed later.</li>
20+
</ul>
1821
<h3>Code examples</h3>
1922
<h4>Noncompliant code example</h4>
2023
<pre data-diff-id="1" data-diff-type="noncompliant">
@@ -46,6 +49,5 @@ <h2>Resources</h2>
4649
<h3>Documentation</h3>
4750
<ul>
4851
<li>Python Documentation - <a href="https://docs.python.org/3/library/itertools.html#itertools.groupby"><code>itertools.groupby</code></a></li>
49-
<li>Python Documentation - <a href="https://docs.python.org/3/library/itertools.html#itertools-recipes">itertools Recipes</a></li>
5052
</ul>
5153

python-checks/src/main/resources/org/sonar/l10n/py/rules/python/S8516.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
"ruleSpecification": "RSPEC-8516",
1515
"sqKey": "S8516",
1616
"scope": "All",
17-
"quickfix": "targeted",
17+
"quickfix": "partial",
1818
"code": {
1919
"impacts": {
2020
"RELIABILITY": "HIGH"

python-checks/src/test/java/org/sonar/python/checks/GroupByIteratorReuseCheckTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ void no_quick_fix_when_multiple_reads_in_loop_body() {
8888
assertThat(issues).hasSize(3);
8989
assertThat(issues).extracting(i -> i.primaryLocation().startLine()).containsExactly(12, 13, 14);
9090
assertThat(issues).extracting(i -> i.primaryLocation().message())
91-
.containsOnly("Convert this group iterator to a list.");
91+
.containsOnly("Consume this group iterator inside the loop, or materialize it into a collection.");
9292

9393
// See fixture file header for the rationale on why no quickfix is attached.
9494
assertThat(issues).allSatisfy(issue -> assertThat(issue.quickFixes()).isEmpty());

0 commit comments

Comments
 (0)