Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions include/infinicore/analyzer.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#pragma once

// Convenience header — includes all analyzer components.

#include "analyzer/op_type.hpp"
#include "analyzer/op_trace.hpp"
#include "analyzer/optimization_intent.hpp"
#include "analyzer/phase_detector.hpp"
#include "analyzer/resource_sensor.hpp"
#include "analyzer/intent_generator.hpp"
#include "analyzer/mutual_awareness_analyzer.hpp"
264 changes: 264 additions & 0 deletions include/infinicore/analyzer/intent_generator.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,264 @@
#pragma once

#include "optimization_intent.hpp"
#include "op_trace.hpp"

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

namespace infinicore::analyzer {

/// IntentGenerator — the core "mutual awareness" logic.
///
/// This is where task demand and resource supply are jointly
/// analyzed to produce an OptimizationIntent. It implements
/// the key insight: the same task phase has different optimization
/// needs under different resource conditions, and the same resource
/// state has different supply value under different task phases.
/// IntentGenerator — the core "mutual awareness" logic.
///
/// This is where task demand and resource supply are jointly
/// analyzed to produce an OptimizationIntent. It implements
/// the key insight: the same task phase has different optimization
/// needs under different resource conditions, and the same resource
/// state has different supply value under different task phases.
///
/// Thread safety: stateless; every method is const and reentrant.
class IntentGenerator {
public:
    IntentGenerator() = default;

    /// Generate the global semantic intent from the detected phase,
    /// the recent op trace window, and per-device local intents.
    ///
    /// @param phase          Detected phase for the current window.
    /// @param window         Recent op trace entries (may be empty).
    /// @param device_intents Per-device local intents (may be empty).
    /// @return A fully populated GlobalSemanticIntent.
    GlobalSemanticIntent generateGlobal(
        PhaseType phase,
        const std::vector<OpTraceEntry> &window,
        const std::vector<DeviceLocalIntent> &device_intents) const {

        GlobalSemanticIntent intent;
        intent.current_phase = phase;
        intent.timestamp_ns = OpTraceEntry::now();

        // The window indices describe the analyzed span; an empty
        // window leaves them at their defaults.
        if (!window.empty()) {
            intent.op_window_start = 0;
            intent.op_window_end = static_cast<uint32_t>(window.size());
        }

        // --- Compute intensity estimation ---
        intent.compute_intensity = estimateComputeIntensity(phase, window);

        // --- Determine primary bottleneck (mutual awareness) ---
        intent.primary_bottleneck = determineGlobalBottleneck(phase, device_intents);

        // --- Set optimization goal based on phase + bottleneck ---
        intent.goal = determineGoal(phase, intent.primary_bottleneck);

        // --- Generate strategy hints ---
        intent.strategy = generateStrategy(phase, intent.primary_bottleneck, device_intents);

        // --- Confidence ---
        intent.confidence = computeConfidence(phase, window);

        return intent;
    }

    /// Build the complete two-layer OptimizationIntent: the global
    /// intent derived above, plus the per-device intents passed
    /// through unchanged.
    OptimizationIntent generate(
        PhaseType phase,
        const std::vector<OpTraceEntry> &window,
        const std::vector<DeviceLocalIntent> &device_intents) const {

        OptimizationIntent result;
        result.global = generateGlobal(phase, window, device_intents);
        result.per_device = device_intents;
        return result;
    }

private:
    /// Estimate compute intensity in [0, 1] (higher = more
    /// compute-heavy): the fraction of ops in the window that are
    /// GEMM/MLP or attention ops. Returns 0 for an empty window.
    ///
    /// NOTE: `phase` is currently unused by the heuristic; it is kept
    /// for signature stability and future phase-aware weighting.
    float estimateComputeIntensity(
        [[maybe_unused]] PhaseType phase,
        const std::vector<OpTraceEntry> &window) const {

        if (window.empty()) {
            return 0.0f;
        }

        size_t heavy_compute_ops = 0;
        for (const auto &e : window) {
            if (isGemmMlpOp(e.op_type) || isAttentionOp(e.op_type)) {
                heavy_compute_ops++;
            }
        }
        return static_cast<float>(heavy_compute_ops) / static_cast<float>(window.size());
    }

    /// Determine the global bottleneck by jointly considering phase
    /// and per-device resource state (the core mutual awareness
    /// logic). Priority order: memory safety first, then
    /// communication, then phase-specific resolution.
    BottleneckType determineGlobalBottleneck(
        PhaseType phase,
        const std::vector<DeviceLocalIntent> &device_intents) const {

        // True when any device reports the given local bottleneck.
        const auto any_device = [&device_intents](BottleneckType t) {
            return std::any_of(device_intents.begin(), device_intents.end(),
                               [t](const DeviceLocalIntent &d) {
                                   return d.local_bottleneck == t;
                               });
        };
        const bool any_compute_bound = any_device(BottleneckType::COMPUTE_BOUND);
        const bool any_bandwidth_bound = any_device(BottleneckType::BANDWIDTH_BOUND);
        const bool any_communication_bound = any_device(BottleneckType::COMMUNICATION_BOUND);

        // --- Mutual awareness logic ---
        // The same resource state has different "supply value"
        // depending on phase:

        // Memory pressure on any device dominates all other concerns.
        if (any_device(BottleneckType::MEMORY_BOUND)) {
            return BottleneckType::MEMORY_BOUND;
        }

        // Communication phase or any communication-bound device wins next.
        if (phase == PhaseType::COMMUNICATION || any_communication_bound) {
            return BottleneckType::COMMUNICATION_BOUND;
        }

        switch (phase) {
        case PhaseType::ATTENTION_DENSE:
        case PhaseType::PREFILL:
            // Attention/prefill is dominated by memory movement and KV
            // access, so phase semantics decide unconditionally:
            // bandwidth-bound. (Memory/communication pressure already
            // forced an earlier return above, so no per-device check
            // can change the outcome here.)
            return BottleneckType::BANDWIDTH_BOUND;

        case PhaseType::GEMM_MLP_DENSE:
            // Dense GEMM/MLP: trust the device signal when present,
            // default to compute-bound.
            if (any_compute_bound) {
                return BottleneckType::COMPUTE_BOUND;
            }
            if (any_bandwidth_bound) {
                return BottleneckType::BANDWIDTH_BOUND;
            }
            return BottleneckType::COMPUTE_BOUND;

        case PhaseType::DECODE:
            // Decode: prefer the bandwidth signal, fall back to
            // compute, default to bandwidth-bound.
            if (any_bandwidth_bound) {
                return BottleneckType::BANDWIDTH_BOUND;
            }
            if (any_compute_bound) {
                return BottleneckType::COMPUTE_BOUND;
            }
            return BottleneckType::BANDWIDTH_BOUND;

        case PhaseType::KV_CACHE:
            // KV-cache work stresses bandwidth first, memory capacity
            // otherwise.
            if (any_bandwidth_bound) {
                return BottleneckType::BANDWIDTH_BOUND;
            }
            return BottleneckType::MEMORY_BOUND;

        default:
            // Unknown phase: let the device signals decide, else balanced.
            if (any_bandwidth_bound) {
                return BottleneckType::BANDWIDTH_BOUND;
            }
            if (any_compute_bound) {
                return BottleneckType::COMPUTE_BOUND;
            }
            return BottleneckType::BALANCED;
        }
    }

    /// Determine the optimization goal from phase and bottleneck.
    /// Bottleneck-driven goals (memory, communication) override the
    /// phase-driven defaults.
    OptimizationGoal determineGoal(
        PhaseType phase,
        BottleneckType bottleneck) const {

        // Under memory pressure, prioritize memory safety
        if (bottleneck == BottleneckType::MEMORY_BOUND) {
            return OptimizationGoal::MEMORY_SAFE;
        }

        if (bottleneck == BottleneckType::COMMUNICATION_BOUND) {
            return OptimizationGoal::STABILITY_FIRST;
        }

        switch (phase) {
        case PhaseType::DECODE:
            // Decode latency is user-facing → latency first
            return OptimizationGoal::LATENCY_FIRST;

        case PhaseType::PREFILL:
            // Prefill processes a full prompt → throughput first
            return OptimizationGoal::THROUGHPUT_FIRST;

        case PhaseType::ATTENTION_DENSE:
            return OptimizationGoal::LATENCY_FIRST;

        case PhaseType::GEMM_MLP_DENSE:
            return OptimizationGoal::THROUGHPUT_FIRST;

        default:
            return OptimizationGoal::LATENCY_FIRST;
        }
    }

    /// Generate strategy hints from phase + bottleneck + resources.
    StrategyHint generateStrategy(
        PhaseType phase,
        BottleneckType bottleneck,
        const std::vector<DeviceLocalIntent> &device_intents) const {

        StrategyHint hint;

        // Fusion is beneficial for bandwidth-bound phases (reduce memory traffic)
        hint.prefer_fused_ops = (bottleneck == BottleneckType::BANDWIDTH_BOUND)
            || phase == PhaseType::DECODE;

        // In-place when memory is tight
        hint.prefer_in_place = (bottleneck == BottleneckType::MEMORY_BOUND);

        // Recomputation (activation checkpointing) when any device is
        // critically close to full memory (>= 95% used).
        hint.prefer_recomputation = std::any_of(
            device_intents.begin(), device_intents.end(),
            [](const DeviceLocalIntent &d) { return d.memory_usage_ratio >= 0.95f; });

        // Async comm overlap for multi-device and communication phases
        hint.prefer_async_comm = (device_intents.size() > 1)
            && (phase == PhaseType::GEMM_MLP_DENSE
                || phase == PhaseType::COMMUNICATION);

        return hint;
    }

    /// Compute confidence in [0, 1]: the fraction of ops in the
    /// window whose type matches the detected phase. Returns 0 for an
    /// empty window or UNKNOWN phase (also for phases with no
    /// matching rule below, e.g. COMMUNICATION).
    float computeConfidence(
        PhaseType phase,
        const std::vector<OpTraceEntry> &window) const {

        if (window.empty() || phase == PhaseType::UNKNOWN) {
            return 0.0f;
        }

        // Count how many ops in the window match the detected phase
        size_t matching = 0;
        for (const auto &e : window) {
            bool match = false;
            switch (phase) {
            case PhaseType::ATTENTION_DENSE:
            case PhaseType::PREFILL:
                match = isAttentionOp(e.op_type);
                break;
            case PhaseType::GEMM_MLP_DENSE:
                match = isGemmMlpOp(e.op_type) || isActivationOp(e.op_type);
                break;
            case PhaseType::KV_CACHE:
                match = isKvCacheOp(e.op_type);
                break;
            case PhaseType::DECODE:
                match = isAttentionOp(e.op_type) || isGemmMlpOp(e.op_type);
                break;
            default:
                break;
            }
            if (match) matching++;
        }

        return static_cast<float>(matching) / static_cast<float>(window.size());
    }
};

} // namespace infinicore::analyzer
118 changes: 118 additions & 0 deletions include/infinicore/analyzer/mutual_awareness_analyzer.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
#pragma once

#include "intent_generator.hpp"
#include "op_trace.hpp"
#include "optimization_intent.hpp"
#include "phase_detector.hpp"
#include "resource_sensor.hpp"

#include <mutex>
#include <vector>

namespace infinicore::analyzer {

/// MutualAwarenessAnalyzer — the top-level facade for the
/// hardware-task mutual awareness requirements analysis module.
///
/// This is the primary entry point exposed to external frameworks
/// (e.g., InfiniLM) via C++ function calls. It orchestrates:
/// 1. Op trace collection (via OpTraceRing)
/// 2. Phase detection (via PhaseDetector)
/// 3. Resource sensing (via ResourceSensor)
/// 4. Intent generation (via IntentGenerator)
///
/// Usage:
/// auto& analyzer = MutualAwarenessAnalyzer::instance();
/// // ... ops execute and get traced automatically ...
/// auto intent = analyzer.analyze(); // Produces OptimizationIntent
///
/// Thread safety: analyze() is safe to call from any thread.
/// The analyzer reads a snapshot of the op trace ring.
class MutualAwarenessAnalyzer {
public:
/// Get the singleton instance.
static MutualAwarenessAnalyzer &instance();

// Non-copyable, non-movable
MutualAwarenessAnalyzer(const MutualAwarenessAnalyzer &) = delete;
MutualAwarenessAnalyzer &operator=(const MutualAwarenessAnalyzer &) = delete;

/// Main analysis entry point.
/// Analyzes the current op trace window + resource state
/// and returns a complete OptimizationIntent.
///
/// This is the function InfiniLM should call.
/// Latency: expected < 1ms for MVP rule-based analysis.
OptimizationIntent analyze();

/// Analyze with explicitly provided memory stats per device.
/// Use this when the caller can provide resource info directly.
/// Each pair is (device index, stats for that device).
OptimizationIntent analyze(const std::vector<std::pair<int, MemoryStats>> &device_stats);

/// Analyze with explicitly provided device resource snapshots.
/// This is the richer input path used by demand-analysis-oriented callers.
OptimizationIntent analyze(const std::vector<DeviceResourceSnapshot> &device_snapshots);

/// Get the current phase without generating full intent.
/// Lightweight query for simple use cases.
PhaseType getCurrentPhase() const;

/// Get the current optimization goal derived from the
/// latest analyzer result.
OptimizationGoal getCurrentOptimizationGoal() const;

/// Get the most recent OptimizationIntent (cached from last analyze()).
/// NOTE(review): returns a reference to internal state; presumably
/// invalidated by the next analyze() call — confirm in the .cpp and
/// document the required caller discipline.
const OptimizationIntent &lastIntent() const;

/// Access the underlying components for configuration.
/// NOTE(review): these hand out mutable references without taking
/// mutex_ — presumably intended for setup-time configuration only,
/// not concurrent use with analyze(); confirm.
PhaseDetector &phaseDetector() { return phase_detector_; }
ResourceSensor &resourceSensor() { return resource_sensor_; }
OpTraceRing &opTrace() { return getGlobalOpTrace(); }

/// Enable / disable the analyzer.
/// When disabled, analyze() returns a default intent and
/// op trace recording is skipped.
void setEnabled(bool enabled) { enabled_ = enabled; }
bool isEnabled() const { return enabled_; }

/// Graph recording support: when graph recording stops,
/// analyze the recorded op sequence once and cache the result.
/// Subsequent calls return the cached intent without re-analysis.
void onGraphRecordingStop();
void clearGraphCache();

private:
// Private: construction only via instance().
MutualAwarenessAnalyzer();

// Pipeline components: phase detection, resource sensing, and
// intent generation (see class doc for orchestration order).
PhaseDetector phase_detector_;
ResourceSensor resource_sensor_;
IntentGenerator intent_generator_;

// Result of the most recent analyze(); served by lastIntent().
OptimizationIntent last_intent_;
// Presumably guards last_intent_ and the graph cache below, per the
// "analyze() is safe to call from any thread" contract — confirm
// the exact locking discipline in the .cpp.
mutable std::mutex mutex_;

// NOTE(review): plain bool toggled via setEnabled() and read from
// any thread without holding mutex_ — consider std::atomic<bool> if
// concurrent toggling is expected; confirm intended usage.
bool enabled_ = true;

// Graph recording cache: set by onGraphRecordingStop(), cleared by
// clearGraphCache().
bool graph_intent_cached_ = false;
OptimizationIntent graph_cached_intent_;
};

// ============================================================
// C-style API for external framework integration (e.g., InfiniLM)
// ============================================================

// NOTE(review): these free functions presumably forward to
// MutualAwarenessAnalyzer::instance() — confirm in the .cpp.
// Despite the "C-style API" heading above, they take and return C++
// types and are not declared extern "C", so C callers cannot link
// against them directly; they are a simplified C++ entry surface.

/// Analyze current state and return an OptimizationIntent.
/// This is the simplest API for external frameworks to call.
OptimizationIntent analyzeCurrentState();

/// Get the current detected phase.
PhaseType getCurrentPhase();

/// Get the current optimization goal.
OptimizationGoal getCurrentOptimizationGoal();

/// Enable / disable the mutual awareness analyzer.
void setAnalyzerEnabled(bool enabled);

} // namespace infinicore::analyzer
Loading
Loading