InfiniTensor
diff --git a/‎include/infinicore/analyzer.hpp‎
Lines changed: 11 additions & 0 deletions b/‎include/infinicore/analyzer.hpp‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎include/infinicore/analyzer/intent_generator.hpp‎
Lines changed: 264 additions & 0 deletions b/‎include/infinicore/analyzer/intent_generator.hpp‎
Lines changed: 264 additions & 0 deletions
diff --git a/‎include/infinicore/analyzer/mutual_awareness_analyzer.hpp‎
Lines changed: 118 additions & 0 deletions b/‎include/infinicore/analyzer/mutual_awareness_analyzer.hpp‎
Lines changed: 118 additions & 0 deletions
@@ -0,0 +1,11 @@
+#pragma once
+
+// Convenience header — includes all analyzer components.
+
+#include "analyzer/op_type.hpp"
+#include "analyzer/op_trace.hpp"
+#include "analyzer/optimization_intent.hpp"
+#include "analyzer/phase_detector.hpp"
+#include "analyzer/resource_sensor.hpp"
+#include "analyzer/intent_generator.hpp"
+#include "analyzer/mutual_awareness_analyzer.hpp"
@@ -0,0 +1,264 @@
+#pragma once
+
+#include "optimization_intent.hpp"
+#include "op_trace.hpp"
+
+#include <algorithm>
+#include <vector>
+
+namespace infinicore::analyzer {
+
+/// IntentGenerator — the core "mutual awareness" logic.
+///
+/// Jointly analyzes task demand (the detected phase plus the recent op
+/// trace window) and resource supply (the per-device local intents) to
+/// produce an OptimizationIntent. The key idea: the same task phase has
+/// different optimization needs under different resource conditions, and
+/// the same resource state has a different supply value under different
+/// task phases.
+///
+/// The class holds no data members; all methods are const and rule-based.
+class IntentGenerator {
+public:
+    IntentGenerator() = default;
+
+    /// Generate the global semantic intent.
+    ///
+    /// @param phase           Phase reported by the phase detector.
+    /// @param window          Op trace entries the phase was derived from.
+    /// @param device_intents  Per-device local intents (resource view).
+    /// @return A GlobalSemanticIntent with phase, timestamp, op window
+    ///         bounds, compute intensity, bottleneck, goal, strategy hints
+    ///         and confidence filled in.
+    GlobalSemanticIntent generateGlobal(
+        PhaseType phase,
+        const std::vector<OpTraceEntry> &window,
+        const std::vector<DeviceLocalIntent> &device_intents) const {
+
+        GlobalSemanticIntent intent;
+        intent.current_phase = phase;
+        intent.timestamp_ns = OpTraceEntry::now();
+
+        // The window bounds are only written for a non-empty window; for an
+        // empty one the fields keep whatever GlobalSemanticIntent defaults to.
+        if (!window.empty()) {
+            intent.op_window_start = 0;
+            intent.op_window_end = static_cast<uint32_t>(window.size());
+        }
+
+        // --- Compute intensity estimation ---
+        intent.compute_intensity = estimateComputeIntensity(phase, window);
+
+        // --- Determine primary bottleneck (mutual awareness) ---
+        intent.primary_bottleneck = determineGlobalBottleneck(phase, device_intents);
+
+        // --- Set optimization goal based on phase + bottleneck ---
+        intent.goal = determineGoal(phase, intent.primary_bottleneck);
+
+        // --- Generate strategy hints ---
+        intent.strategy = generateStrategy(phase, intent.primary_bottleneck, device_intents);
+
+        // --- Confidence ---
+        intent.confidence = computeConfidence(phase, window);
+
+        return intent;
+    }
+
+    /// Build the complete two-layer OptimizationIntent: the global intent
+    /// computed by generateGlobal() plus a copy of the per-device intents.
+    OptimizationIntent generate(
+        PhaseType phase,
+        const std::vector<OpTraceEntry> &window,
+        const std::vector<DeviceLocalIntent> &device_intents) const {
+
+        OptimizationIntent result;
+        result.global = generateGlobal(phase, window, device_intents);
+        result.per_device = device_intents;
+        return result;
+    }
+
+private:
+    /// Memory usage ratio at or above which a device is treated as being
+    /// under extreme memory pressure (enables the recomputation hint).
+    static constexpr float kExtremeMemoryUsageRatio = 0.95f;
+
+    /// Estimate compute intensity as the fraction of ops in the window that
+    /// are GEMM/MLP or attention ops (range [0, 1]; 0 for an empty window).
+    ///
+    /// `phase` is currently not consulted; it is kept so the signature stays
+    /// parallel to the other estimators.
+    float estimateComputeIntensity(
+        [[maybe_unused]] PhaseType phase,
+        const std::vector<OpTraceEntry> &window) const {
+
+        if (window.empty()) {
+            return 0.0f;
+        }
+
+        const auto heavy_compute_ops = std::count_if(
+            window.begin(), window.end(),
+            [](const OpTraceEntry &e) {
+                return isGemmMlpOp(e.op_type) || isAttentionOp(e.op_type);
+            });
+        return static_cast<float>(heavy_compute_ops) / static_cast<float>(window.size());
+    }
+
+    /// Determine the global bottleneck by jointly considering the phase and
+    /// the per-device local bottlenecks (the core mutual awareness logic).
+    ///
+    /// Priority order:
+    ///   1. Any memory-bound device              -> MEMORY_BOUND
+    ///   2. Communication phase or any
+    ///      communication-bound device           -> COMMUNICATION_BOUND
+    ///   3. Otherwise a phase-specific rule that weighs bandwidth-bound and
+    ///      compute-bound devices and falls back to the phase's typical
+    ///      bottleneck.
+    BottleneckType determineGlobalBottleneck(
+        PhaseType phase,
+        const std::vector<DeviceLocalIntent> &device_intents) const {
+
+        bool any_memory_bound = false;
+        bool any_compute_bound = false;
+        bool any_bandwidth_bound = false;
+        bool any_communication_bound = false;
+        for (const auto &d : device_intents) {
+            any_memory_bound = any_memory_bound || d.local_bottleneck == BottleneckType::MEMORY_BOUND;
+            any_compute_bound = any_compute_bound || d.local_bottleneck == BottleneckType::COMPUTE_BOUND;
+            any_bandwidth_bound = any_bandwidth_bound || d.local_bottleneck == BottleneckType::BANDWIDTH_BOUND;
+            any_communication_bound = any_communication_bound || d.local_bottleneck == BottleneckType::COMMUNICATION_BOUND;
+        }
+
+        // --- Mutual awareness logic ---
+        // The same resource state has different "supply value" depending on phase.
+
+        // Memory pressure on any device overrides everything else.
+        if (any_memory_bound) {
+            return BottleneckType::MEMORY_BOUND;
+        }
+
+        if (phase == PhaseType::COMMUNICATION || any_communication_bound) {
+            return BottleneckType::COMMUNICATION_BOUND;
+        }
+
+        switch (phase) {
+        case PhaseType::ATTENTION_DENSE:
+        case PhaseType::PREFILL:
+            // Attention/prefill is dominated by memory movement and KV access,
+            // so phase semantics win regardless of the remaining device
+            // states (memory/communication were already handled above).
+            return BottleneckType::BANDWIDTH_BOUND;
+
+        case PhaseType::GEMM_MLP_DENSE:
+            // Compute-bound by default; only report bandwidth when a device
+            // says so and none reports compute.
+            if (any_compute_bound) {
+                return BottleneckType::COMPUTE_BOUND;
+            }
+            if (any_bandwidth_bound) {
+                return BottleneckType::BANDWIDTH_BOUND;
+            }
+            return BottleneckType::COMPUTE_BOUND;
+
+        case PhaseType::DECODE:
+            // Bandwidth-bound by default; only report compute when a device
+            // says so and none reports bandwidth.
+            if (any_bandwidth_bound) {
+                return BottleneckType::BANDWIDTH_BOUND;
+            }
+            if (any_compute_bound) {
+                return BottleneckType::COMPUTE_BOUND;
+            }
+            return BottleneckType::BANDWIDTH_BOUND;
+
+        case PhaseType::KV_CACHE:
+            if (any_bandwidth_bound) {
+                return BottleneckType::BANDWIDTH_BOUND;
+            }
+            return BottleneckType::MEMORY_BOUND;
+
+        default:
+            // No phase-specific prior: trust the devices, bandwidth first.
+            if (any_bandwidth_bound) {
+                return BottleneckType::BANDWIDTH_BOUND;
+            }
+            if (any_compute_bound) {
+                return BottleneckType::COMPUTE_BOUND;
+            }
+            return BottleneckType::BALANCED;
+        }
+    }
+
+    /// Determine the optimization goal from the phase and the bottleneck.
+    /// The bottleneck takes precedence for memory and communication; all
+    /// other bottlenecks defer to the phase.
+    OptimizationGoal determineGoal(
+        PhaseType phase,
+        BottleneckType bottleneck) const {
+
+        // Under memory pressure, prioritize memory safety
+        if (bottleneck == BottleneckType::MEMORY_BOUND) {
+            return OptimizationGoal::MEMORY_SAFE;
+        }
+
+        if (bottleneck == BottleneckType::COMMUNICATION_BOUND) {
+            return OptimizationGoal::STABILITY_FIRST;
+        }
+
+        switch (phase) {
+        case PhaseType::PREFILL:
+            // Prefill processes a full prompt → throughput first
+        case PhaseType::GEMM_MLP_DENSE:
+            return OptimizationGoal::THROUGHPUT_FIRST;
+
+        case PhaseType::DECODE:
+            // Decode latency is user-facing → latency first
+        case PhaseType::ATTENTION_DENSE:
+        default:
+            return OptimizationGoal::LATENCY_FIRST;
+        }
+    }
+
+    /// Generate strategy hints from phase + bottleneck + device resources.
+    StrategyHint generateStrategy(
+        PhaseType phase,
+        BottleneckType bottleneck,
+        const std::vector<DeviceLocalIntent> &device_intents) const {
+
+        StrategyHint hint;
+
+        // Fusion is beneficial for bandwidth-bound phases (reduce memory
+        // traffic); decode always prefers it.
+        hint.prefer_fused_ops = (bottleneck == BottleneckType::BANDWIDTH_BOUND)
+                                || phase == PhaseType::DECODE;
+
+        // In-place when memory is tight
+        hint.prefer_in_place = (bottleneck == BottleneckType::MEMORY_BOUND);
+
+        // Recomputation (activation checkpointing) when memory is critical
+        // on at least one device.
+        hint.prefer_recomputation = std::any_of(
+            device_intents.begin(), device_intents.end(),
+            [](const DeviceLocalIntent &d) {
+                return d.memory_usage_ratio >= kExtremeMemoryUsageRatio;
+            });
+
+        // Async comm overlap only with more than one device, and only during
+        // GEMM/MLP or communication phases.
+        hint.prefer_async_comm = (device_intents.size() > 1)
+                                 && (phase == PhaseType::GEMM_MLP_DENSE
+                                     || phase == PhaseType::COMMUNICATION);
+
+        return hint;
+    }
+
+    /// Compute confidence as the fraction of ops in the window whose type is
+    /// consistent with the detected phase.
+    ///
+    /// Returns 0 for an empty window or PhaseType::UNKNOWN. Phases without a
+    /// matching rule below (e.g. COMMUNICATION) never match any op and thus
+    /// also yield 0.
+    /// NOTE(review): confirm that zero confidence for such phases is intended.
+    float computeConfidence(
+        PhaseType phase,
+        const std::vector<OpTraceEntry> &window) const {
+
+        if (window.empty() || phase == PhaseType::UNKNOWN) {
+            return 0.0f;
+        }
+
+        // Does this trace entry look like it belongs to `phase`?
+        const auto matches_phase = [phase](const OpTraceEntry &e) -> bool {
+            switch (phase) {
+            case PhaseType::ATTENTION_DENSE:
+            case PhaseType::PREFILL:
+                return isAttentionOp(e.op_type);
+            case PhaseType::GEMM_MLP_DENSE:
+                return isGemmMlpOp(e.op_type) || isActivationOp(e.op_type);
+            case PhaseType::KV_CACHE:
+                return isKvCacheOp(e.op_type);
+            case PhaseType::DECODE:
+                return isAttentionOp(e.op_type) || isGemmMlpOp(e.op_type);
+            default:
+                return false;
+            }
+        };
+
+        const auto matching = std::count_if(window.begin(), window.end(), matches_phase);
+        return static_cast<float>(matching) / static_cast<float>(window.size());
+    }
+};
+
+} // namespace infinicore::analyzer
@@ -0,0 +1,118 @@
+#pragma once
+
+#include "intent_generator.hpp"
+#include "op_trace.hpp"
+#include "optimization_intent.hpp"
+#include "phase_detector.hpp"
+#include "resource_sensor.hpp"
+
+#include <atomic>
+#include <mutex>
+#include <utility>
+#include <vector>
+
+namespace infinicore::analyzer {
+
+/// MutualAwarenessAnalyzer — the top-level facade for the
+/// hardware-task mutual awareness requirements analysis module.
+///
+/// This is the primary entry point exposed to external frameworks
+/// (e.g., InfiniLM) via C++ function calls. It orchestrates:
+///   1. Op trace collection (via OpTraceRing)
+///   2. Phase detection (via PhaseDetector)
+///   3. Resource sensing (via ResourceSensor)
+///   4. Intent generation (via IntentGenerator)
+///
+/// Usage:
+///   auto& analyzer = MutualAwarenessAnalyzer::instance();
+///   // ... ops execute and get traced automatically ...
+///   auto intent = analyzer.analyze();  // Produces OptimizationIntent
+///
+/// Thread safety: analyze() is safe to call from any thread.
+/// The analyzer reads a snapshot of the op trace ring.
+/// The enabled flag is atomic, so setEnabled()/isEnabled() may be called
+/// concurrently with each other without a data race.
+class MutualAwarenessAnalyzer {
+public:
+    /// Get the singleton instance.
+    static MutualAwarenessAnalyzer &instance();
+
+    // Non-copyable, non-movable
+    MutualAwarenessAnalyzer(const MutualAwarenessAnalyzer &) = delete;
+    MutualAwarenessAnalyzer &operator=(const MutualAwarenessAnalyzer &) = delete;
+    MutualAwarenessAnalyzer(MutualAwarenessAnalyzer &&) = delete;
+    MutualAwarenessAnalyzer &operator=(MutualAwarenessAnalyzer &&) = delete;
+
+    /// Main analysis entry point.
+    /// Analyzes the current op trace window + resource state
+    /// and returns a complete OptimizationIntent.
+    ///
+    /// This is the function InfiniLM should call.
+    /// Latency: expected < 1ms for MVP rule-based analysis.
+    OptimizationIntent analyze();
+
+    /// Analyze with explicitly provided memory stats per device.
+    /// Use this when the caller can provide resource info directly.
+    /// NOTE(review): the int in each pair is presumably the device id —
+    /// confirm against the implementation.
+    OptimizationIntent analyze(const std::vector<std::pair<int, MemoryStats>> &device_stats);
+
+    /// Analyze with explicitly provided device resource snapshots.
+    /// This is the richer input path used by demand-analysis-oriented callers.
+    OptimizationIntent analyze(const std::vector<DeviceResourceSnapshot> &device_snapshots);
+
+    /// Get the current phase without generating full intent.
+    /// Lightweight query for simple use cases.
+    PhaseType getCurrentPhase() const;
+
+    /// Get the current optimization goal derived from the
+    /// latest analyzer result.
+    OptimizationGoal getCurrentOptimizationGoal() const;
+
+    /// Get the most recent OptimizationIntent (cached from last analyze()).
+    ///
+    /// NOTE(review): this returns a reference into the analyzer rather than
+    /// a copy. If analyze() can run on another thread while the reference is
+    /// being read, the caller likely needs external synchronization or
+    /// should copy the result from analyze() instead — confirm against the
+    /// implementation.
+    const OptimizationIntent &lastIntent() const;
+
+    /// Access the underlying components for configuration.
+    /// These hand out mutable references without taking the analyzer mutex.
+    PhaseDetector &phaseDetector() { return phase_detector_; }
+    ResourceSensor &resourceSensor() { return resource_sensor_; }
+    OpTraceRing &opTrace() { return getGlobalOpTrace(); }
+
+    /// Enable / disable the analyzer.
+    /// When disabled, analyze() returns a default intent and
+    /// op trace recording is skipped.
+    ///
+    /// The flag is atomic: toggling it while other threads poll isEnabled()
+    /// is race-free. It does not wait for an analyze() already in flight.
+    void setEnabled(bool enabled) { enabled_.store(enabled); }
+    bool isEnabled() const { return enabled_.load(); }
+
+    /// Graph recording support: when graph recording stops,
+    /// analyze the recorded op sequence once and cache the result.
+    /// Subsequent calls return the cached intent without re-analysis.
+    void onGraphRecordingStop();
+    void clearGraphCache();
+
+private:
+    MutualAwarenessAnalyzer();
+
+    PhaseDetector phase_detector_;
+    ResourceSensor resource_sensor_;
+    IntentGenerator intent_generator_;
+
+    OptimizationIntent last_intent_;
+    mutable std::mutex mutex_;
+
+    // Atomic because the inline accessors above read/write it without
+    // holding mutex_.
+    std::atomic<bool> enabled_{true};
+
+    // Graph recording cache
+    bool graph_intent_cached_ = false;
+    OptimizationIntent graph_cached_intent_;
+};
+
+// ============================================================
+// Free-function API for external framework integration (e.g., InfiniLM)
+//
+// These are plain C++ functions in infinicore::analyzer (no extern "C"
+// linkage is declared here) that use the analyzer's C++ types directly.
+// Their definitions are not part of this header; they presumably forward
+// to MutualAwarenessAnalyzer::instance() — verify against the .cpp.
+// ============================================================
+
+/// Analyze current state and return an OptimizationIntent.
+/// This is the simplest API for external frameworks to call.
+OptimizationIntent analyzeCurrentState();
+
+/// Get the current detected phase.
+PhaseType getCurrentPhase();
+
+/// Get the current optimization goal.
+OptimizationGoal getCurrentOptimizationGoal();
+
+/// Enable / disable the mutual awareness analyzer.
+/// @param enabled  true to enable, false to disable.
+void setAnalyzerEnabled(bool enabled);
+
+} // namespace infinicore::analyzer