|
44 | 44 | import org.apache.pinot.query.routing.MailboxInfos; |
45 | 45 | import org.apache.pinot.query.routing.QueryServerInstance; |
46 | 46 | import org.apache.pinot.query.routing.SharedMailboxInfos; |
| 47 | +import org.apache.pinot.spi.config.table.TableType; |
47 | 48 |
|
48 | 49 |
|
49 | 50 | /** |
@@ -87,11 +88,15 @@ private void process(PRelNode pRelNode, @Nullable PlanNode parent, int currentFr |
87 | 88 | processTableScan((PhysicalTableScan) pRelNode.unwrap(), currentFragmentId, context); |
88 | 89 | } |
89 | 90 | if (pRelNode.unwrap() instanceof PhysicalExchange) { |
| 91 | + PhysicalExchange physicalExchange = (PhysicalExchange) pRelNode.unwrap(); |
| 92 | + if (physicalExchange.getExchangeStrategy() == ExchangeStrategy.LOOKUP_LOCAL_EXCHANGE) { |
| 93 | + processLookupLocalExchange(pRelNode, parent, currentFragmentId, context); |
| 94 | + return; |
| 95 | + } |
90 | 96 | // Split an exchange into two fragments: one for the sender and one for the receiver. |
91 | 97 | // The sender fragment will have a MailboxSendNode and receiver a MailboxReceiveNode. |
92 | 98 | // It is possible that the receiver fragment doesn't exist yet (e.g. when PhysicalExchange is the root node). |
93 | 99 | // In that case, we also create it here. If it exists already, we simply re-use it. |
94 | | - PhysicalExchange physicalExchange = (PhysicalExchange) pRelNode.unwrap(); |
95 | 100 | PlanFragment receiverFragment = context._planFragmentMap.get(currentFragmentId); |
96 | 101 | int senderFragmentId = context._planFragmentMap.size() + (receiverFragment == null ? 1 : 0); |
97 | 102 | final DataSchema inputFragmentSchema = PRelToPlanNodeConverter.toDataSchema( |
@@ -173,6 +178,77 @@ private void processTableScan(PhysicalTableScan tableScan, int currentFragmentId |
173 | 178 | } |
174 | 179 | } |
175 | 180 |
|
| 181 | + /** |
| 182 | + * Handles LOOKUP_LOCAL_EXCHANGE: a pseudo-exchange that does NOT split fragments. The dim table |
| 183 | + * stays in the join's fragment. This method: |
| 184 | + * <ol> |
| 185 | + * <li>Registers the dim table name so the fragment is classified as a leaf stage</li> |
| 186 | + * <li>Sets fake empty segments per worker (the dim table is accessed via |
| 187 | + * {@code DimensionTableDataManager} at runtime, not via segment routing)</li> |
| 188 | + * <li>Converts children to PlanNodes in the same fragment (no MailboxSend/Receive)</li> |
| 189 | + * </ol> |
| 190 | + * This matches V1's behavior in {@code WorkerManager.assignWorkersToNonRootFragment} where |
| 191 | + * lookup joins are detected and the dim table is registered with empty segments. |
| 192 | + */ |
| 193 | + private void processLookupLocalExchange(PRelNode pRelNode, @Nullable PlanNode parent, int currentFragmentId, |
| 194 | + Context context) { |
| 195 | + // Find the dim table scan in the exchange's children and register it with empty segments. |
| 196 | + DispatchablePlanMetadata fragmentMetadata = context._fragmentMetadataMap.get(currentFragmentId); |
| 197 | + for (PRelNode child : pRelNode.getPRelInputs()) { |
| 198 | + registerDimTableInFragment(child, fragmentMetadata); |
| 199 | + } |
| 200 | + // Process children in the same fragment (no MailboxSend/Receive), but skip processTableScan |
| 201 | + // by converting PRelNodes to PlanNodes directly. The right side of a lookup join is always |
| 202 | + // [Project →] TableScan (at most 2 levels deep) — Calcite pushes dim-side filters to post-join. |
| 203 | + for (PRelNode child : pRelNode.getPRelInputs()) { |
| 204 | + PlanNode planNode = PRelToPlanNodeConverter.toPlanNode(child, currentFragmentId); |
| 205 | + for (PRelNode grandChild : child.getPRelInputs()) { |
| 206 | + Preconditions.checkState(grandChild.getPRelInputs().isEmpty(), |
| 207 | + "LOOKUP_LOCAL_EXCHANGE right side deeper than 2 levels: found children under %s. " |
| 208 | + + "Expected [Project →] TableScan only.", grandChild.unwrap().getClass().getSimpleName()); |
| 209 | + PlanNode grandChildNode = PRelToPlanNodeConverter.toPlanNode(grandChild, currentFragmentId); |
| 210 | + planNode.getInputs().add(grandChildNode); |
| 211 | + } |
| 212 | + if (parent != null) { |
| 213 | + parent.getInputs().add(planNode); |
| 214 | + } |
| 215 | + } |
| 216 | + } |
| 217 | + |
| 218 | + /** |
| 219 | + * Recursively find TableScan nodes and register the dim table name in the fragment metadata with |
| 220 | + * fake empty segments per worker, matching V1's {@code WorkerManager.assignWorkersToNonRootFragment} |
| 221 | + * behavior for lookup joins. |
| 222 | + */ |
| 223 | + private void registerDimTableInFragment(PRelNode pRelNode, DispatchablePlanMetadata fragmentMetadata) { |
| 224 | + if (pRelNode.unwrap() instanceof TableScan) { |
| 225 | + PhysicalTableScan tableScan = (PhysicalTableScan) pRelNode.unwrap(); |
| 226 | + TableScanMetadata tableScanMetadata = Objects.requireNonNull(tableScan.getTableScanMetadata(), |
| 227 | + "No metadata in table scan PRelNode"); |
| 228 | + String tableName = tableScanMetadata.getScannedTables().stream().findFirst().orElseThrow(); |
| 229 | + fragmentMetadata.addScannedTable(tableName); |
| 230 | + // Set fake empty segments for each worker so isLeafStageWorker() returns true. |
| 231 | + // The actual dim table data comes from DimensionTableDataManager at runtime. |
| 232 | + // Use putIfAbsent rather than overwrite to be defensive if called multiple times. |
| 233 | + Map<Integer, QueryServerInstance> workers = fragmentMetadata.getWorkerIdToServerInstanceMap(); |
| 234 | + if (workers != null) { |
| 235 | + Map<Integer, Map<String, List<String>>> existing = fragmentMetadata.getWorkerIdToSegmentsMap(); |
| 236 | + Map<Integer, Map<String, List<String>>> fakeSegmentsMap = |
| 237 | + existing != null ? new HashMap<>(existing) : new HashMap<>(); |
| 238 | + for (Integer workerId : workers.keySet()) { |
| 239 | + fakeSegmentsMap.putIfAbsent(workerId, Map.of(TableType.OFFLINE.name(), List.of())); |
| 240 | + } |
| 241 | + fragmentMetadata.setWorkerIdToSegmentsMap(fakeSegmentsMap); |
| 242 | + } |
| 243 | + NodeHint nodeHint = NodeHint.fromRelHints(tableScan.getHints()); |
| 244 | + fragmentMetadata.setTableOptions(nodeHint.getHintOptions().get(PinotHintOptions.TABLE_HINT_OPTIONS)); |
| 245 | + return; |
| 246 | + } |
| 247 | + for (PRelNode child : pRelNode.getPRelInputs()) { |
| 248 | + registerDimTableInFragment(child, fragmentMetadata); |
| 249 | + } |
| 250 | + } |
| 251 | + |
176 | 252 | private PlanFragment createFragment(int fragmentId, PlanNode planNode, List<PlanFragment> inputFragments, |
177 | 253 | Context context, List<String> workers) { |
178 | 254 | // track new plan fragment |
@@ -248,6 +324,9 @@ private void computeMailboxInfos(int senderStageId, int receiverStageId, |
248 | 324 | } |
249 | 325 | break; |
250 | 326 | } |
| 327 | + case LOOKUP_LOCAL_EXCHANGE: |
| 328 | + throw new IllegalStateException("LOOKUP_LOCAL_EXCHANGE should not reach computeMailboxInfos — " |
| 329 | + + "it must be handled as transparent in process() before fragment splitting"); |
251 | 330 | default: |
252 | 331 | throw new UnsupportedOperationException("exchange desc not supported yet: " + exchangeDesc); |
253 | 332 | } |
|
0 commit comments