Skip to content

Commit 4afaaa7

Browse files
XiaoHou
authored and committed
fix all
1 parent 74fce19 commit 4afaaa7

8 files changed

Lines changed: 354 additions & 44 deletions

File tree

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
.PHONY : build clean format install-python test-cpp test-onnx
22

3-
TYPE ?= Release
3+
TYPE ?= Debug
44
TEST ?= ON
55

66
CMAKE_OPT = -DCMAKE_BUILD_TYPE=$(TYPE)

report.xml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<testsuites tests="1" failures="1" disabled="0" errors="0" time="0." timestamp="2025-08-08T12:42:56.640" name="AllTests">
3+
<testsuite name="Concat" tests="1" failures="1" disabled="0" skipped="0" errors="0" time="0." timestamp="2025-08-08T12:42:56.640">
4+
<testcase name="NativeCpu" file="/amax/2020/hx2024/Cpp/TinyInfiniTensor/test/kernels/nativecpu/test_nativecpu_concat.cc" line="9" status="run" result="completed" time="0." timestamp="2025-08-08T12:42:56.640" classname="Concat">
5+
<failure message="unknown file&#x0A;C++ exception with description &quot;&quot; thrown in the test body.&#x0A;" type=""><![CDATA[unknown file
6+
C++ exception with description "" thrown in the test body.
7+
]]></failure>
8+
</testcase>
9+
</testsuite>
10+
</testsuites>

src/core/allocator.cc

Lines changed: 42 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -34,28 +34,41 @@ namespace infini
3434
// =================================== 作业 ===================================
3535
if (freeBlocks.empty())
3636
{
37-
used += size;
38-
peak = std::max(peak, used);
39-
freeBlocks.emplace(0, size); // Allocate from the start
40-
return 0;
37+
freeBlocks[0] = 4096; // Initially, all memory is free
4138
}
4239
for(auto it = freeBlocks.begin(); it != freeBlocks.end(); it ++){
4340
auto [addr, blockSize] = *it;
44-
if(blockSize >= size){
45-
size_t upper_addr = freeBlocks.upper_bound(addr)->first;
46-
size_t gap = upper_addr - (addr + blockSize);
47-
if(gap >= size){
48-
used += size;
49-
freeBlocks[addr + blockSize] = gap; // Update the free block after allocation
50-
return addr + blockSize;
41+
if(blockSize >= size){ //blockSize 是可用空间
42+
if(blockSize > size){
43+
// Split the block if it's larger than requested size
44+
freeBlocks[addr + size] = blockSize - size;
5145
}
46+
freeBlocks.erase(it);
47+
used += size;
48+
peak = std::max(peak, used);
49+
return it->first;
5250
}
5351
}
54-
used += size;
55-
peak = std::max(peak, used);
56-
size_t lastAddr = freeBlocks.rbegin()->first + freeBlocks.rbegin()->second;
57-
freeBlocks.emplace(lastAddr, size); // Allocate
58-
return lastAddr;
52+
53+
return 0;
54+
55+
56+
57+
// if (this->freeBlocks.empty())
58+
// this->freeBlocks[0] = 1024;
59+
// for (auto it = this->freeBlocks.begin(); it != this->freeBlocks.end(); ++it)
60+
// {
61+
// if (it->second >= size)
62+
// {
63+
// if (it->second > size)
64+
// this->freeBlocks[it->first + size] = it->second - size;
65+
// auto ans = it->first;
66+
// this->freeBlocks.erase(it);
67+
// this->used += size;
68+
// this->peak = (this->peak >= this->used) ? this->peak : this->used;
69+
// return ans;
70+
// }
71+
// }
5972
}
6073

6174
void Allocator::free(size_t addr, size_t size)
@@ -65,22 +78,21 @@ namespace infini
6578
// =================================== 作业 ===================================
6679
// TODO: 设计一个算法来回收内存
6780
// =================================== 作业 ===================================
68-
used -= size;
69-
auto next = freeBlocks.upper_bound(addr);
70-
if(next != freeBlocks.end() && addr + size == next -> first){ // 再次确保是否物理相邻
71-
// Merge with next block
72-
size += next->second;
73-
freeBlocks.erase(next);
81+
freeBlocks[addr] = size;
82+
auto it = freeBlocks.find(addr);
83+
auto nextIt = std::next(it);
84+
if (nextIt != freeBlocks.end() && it->first + it->second == nextIt->first)
85+
{
86+
it->second += nextIt->second;
87+
freeBlocks.erase(nextIt);
7488
}
75-
auto prev = freeBlocks.lower_bound(addr);
76-
if(prev != freeBlocks.begin() && prev -> first + prev->second == addr){ // 再次确保是否物理相邻
77-
// Merge with previous block
78-
size += prev->second;
79-
addr = prev->first; // Update address to the start of the merged block
80-
81-
freeBlocks.erase(prev);
89+
auto prevIt = std::prev(it);
90+
if (it != freeBlocks.begin() && prevIt->first + prevIt->second == it->first)
91+
{
92+
prevIt->second += it->second;
93+
freeBlocks.erase(it);
8294
}
83-
freeBlocks.emplace(addr, size); // Store the freed block
95+
used = used - size;
8496
}
8597

8698
void *Allocator::getPtr()

src/core/graph.cc

Lines changed: 190 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
#include <algorithm>
33
#include <numeric>
44
#include <queue>
5-
5+
#include "operators/matmul.h"
6+
#include "operators/transpose.h"
67
namespace infini
78
{
89

@@ -106,6 +107,163 @@ namespace infini
106107
// 1. 去除冗余的算子(例如,两个相邻的算子都是 transpose 算子,且做的是相反的操作,可以将其全部删除)
107108
// 2. 合并算子(例如,矩阵乘算子中含有属性transA、transB,如果其输入存在transpose,且对最后两个维度做交换,就可以将transpose融入到矩阵乘算子的属性中去)
108109
// =================================== 作业 ===================================
110+
111+
// rule1: 删除无用的transpose算子
112+
for (size_t i = 0; i < ops.size(); ++i)
113+
{
114+
Operator op = ops[i];
115+
if (op->getOpType() == OpType::Transpose)
116+
{
117+
Tensor tensor = op->getOutput();
118+
if (!tensor)
119+
continue;
120+
auto targets = tensor->getTargets();
121+
if (targets.empty())
122+
continue;
123+
Operator op_next = targets[0];
124+
if (op_next->getOpType() == OpType::Transpose)
125+
{
126+
TransposeObj *op1 = as<TransposeObj>(op).get();
127+
TransposeObj *op2 = as<TransposeObj>(op_next).get();
128+
auto op1_permute = op1->getPermute();
129+
auto op2_permute = op2->getPermute();
130+
if (op1_permute.size() != op2_permute.size())
131+
continue;
132+
bool flag = true;
133+
for (int j = 0; j < (int)op1_permute.size(); j++)
134+
{
135+
if (op1_permute[op2_permute[j]] != j)
136+
{
137+
flag = false;
138+
continue;
139+
}
140+
}
141+
if (!flag) //flag为false说明 无法合并
142+
continue;
143+
// 获取第一个转置算子的输入张量(原始输入数据)
144+
Tensor originalInput = op->getInputs()[0];
145+
146+
// 获取第一个转置算子的输出张量(第一次转置结果)
147+
Tensor firstTransposeOutput = op->getOutput();
148+
149+
// 获取第二个转置算子的输出张量(最终转置结果)
150+
Tensor secondTransposeOutput = op_next->getOutput();
151+
152+
// 获取使用最终结果的消费者算子(如矩阵乘法)
153+
Operator consumerOp = secondTransposeOutput->getTargets()[0];
154+
155+
// 保留消费者算子的其他输入(如矩阵乘法的右矩阵)
156+
Tensor consumerOtherInput = consumerOp->getInputs()[1];
157+
158+
// 重定向消费者算子的输入:跳过两个转置,直接使用原始输入
159+
consumerOp->replaceInput(consumerOp->getInputs()[0], originalInput);
160+
161+
// 更新原始输入的连接关系:
162+
originalInput->removeTarget(op); // 移除对第一个转置的引用
163+
originalInput->addTarget(consumerOp); // 添加对消费者算子的引用
164+
originalInput->setSource(nullptr); // 清除可能存在的生产者标记
165+
166+
// 清理冗余资源
167+
removeOperator(op); // 删除第一个转置算子
168+
removeOperator(op_next); // 删除第二个转置算子
169+
removeTensor(firstTransposeOutput); // 删除中间结果张量
170+
removeTensor(secondTransposeOutput); // 删除最终结果张量
171+
172+
// 更新算子间的拓扑依赖关系
173+
consumerOp->removePredecessors(op_next); // 移除与第二个转置的依赖
174+
175+
// 如果原始输入有生产者,建立新的依赖关系
176+
if (originalInput->getSource()) {
177+
consumerOp->addPredecessors(originalInput->getSource());
178+
originalInput->getSource()->addSuccessors(consumerOp);
179+
}
180+
}
181+
}
182+
}
183+
184+
// 遍历图中的所有算子,寻找可优化的矩阵乘法算子
185+
for (size_t opIndex = 0; opIndex < ops.size(); ++opIndex) {
186+
Operator currentOp = ops[opIndex];
187+
188+
// 只处理矩阵乘法算子
189+
if (currentOp->getOpType() == OpType::MatMul) {
190+
// 获取矩阵乘法的输入张量列表(左矩阵和右矩阵)
191+
TensorVec matmulInputs = currentOp->getInputs();
192+
int inputIndex = 0; // 用于标识当前是左输入(0)还是右输入(1)
193+
194+
// 检查每个输入张量
195+
for (Tensor inputTensor : matmulInputs) {
196+
inputIndex++;
197+
198+
// 检查输入张量是否有生产者算子
199+
if (inputTensor->getSource()) {
200+
Operator producerOp = inputTensor->getSource();
201+
202+
// 如果生产者是转置算子
203+
if (producerOp->getOpType() == OpType::Transpose) {
204+
TransposeObj *transposeOp = as<TransposeObj>(producerOp).get();
205+
Shape transposePerm = transposeOp->getPermute();
206+
bool isLastTwoDimsSwap = true;
207+
208+
/* 验证转置操作是否只交换最后两个维度:
209+
* 1. 前n-2个维度必须保持原顺序(即perm[j] == j)
210+
* 2. 最后两个维度必须交换(即perm[-2] == rank-1 且 perm[-1] == rank-2)
211+
*/
212+
for (int dim = 0; dim < (int)transposePerm.size() - 2; dim++) {
213+
if (transposePerm[dim] != dim) {
214+
isLastTwoDimsSwap = false;
215+
break;
216+
}
217+
}
218+
if (transposePerm[transposePerm.size() - 2] != (int)transposePerm.size() - 1 ||
219+
transposePerm[transposePerm.size() - 1] != (int)transposePerm.size() - 2) {
220+
isLastTwoDimsSwap = false;
221+
}
222+
223+
// 如果不满足条件则跳过优化
224+
if (!isLastTwoDimsSwap) continue;
225+
226+
// 获取矩阵乘法算子(用于修改转置属性)
227+
MatmulObj *matmulOp = as<MatmulObj>(currentOp).get();
228+
Tensor transposedTensor;
229+
230+
// 根据输入位置设置对应的转置标志
231+
if (inputIndex == 1) { // 左输入
232+
matmulOp->setTransA(true); // 启用左矩阵转置
233+
transposedTensor = matmulOp->getInputs(0);
234+
} else { // 右输入
235+
matmulOp->setTransB(true); // 启用右矩阵转置
236+
transposedTensor = matmulOp->getInputs(1);
237+
}
238+
239+
// 获取转置算子的输入(原始未转置的张量)
240+
Operator transposeOperator = transposedTensor->getSource();
241+
Tensor originalTensor = transposeOperator->getInputs()[0];
242+
243+
// 重定向矩阵乘法的输入:跳过转置算子,直接使用原始张量
244+
matmulOp->replaceInput(transposedTensor, originalTensor);
245+
246+
// 更新张量连接关系
247+
originalTensor->removeTarget(transposeOperator);
248+
originalTensor->addTarget(currentOp);
249+
250+
// 清理资源:删除转置算子和中间张量
251+
removeOperator(transposeOperator);
252+
removeTensor(transposedTensor);
253+
254+
// 更新拓扑关系:移除转置算子作为前驱
255+
currentOp->removePredecessors(transposeOperator);
256+
257+
// 如果原始张量有生产者,建立新的依赖关系
258+
if (originalTensor->getSource()) {
259+
currentOp->addPredecessors(originalTensor->getSource());
260+
originalTensor->getSource()->addSuccessors(currentOp);
261+
}
262+
}
263+
}
264+
}
265+
}
266+
}
109267
}
110268

111269
Tensor GraphObj::getTensor(int fuid) const
@@ -152,16 +310,41 @@ namespace infini
152310
// TODO:利用 allocator 给计算图分配内存
153311
// HINT: 获取分配好的内存指针后,可以调用 tensor 的 setDataBlob 函数给 tensor 绑定内存
154312
// =================================== 作业 ===================================
313+
// allocator.info();
314+
// void* allocatorPtr = allocator.getPtr();
315+
// for(auto it = tensors.begin(); it != tensors.end(); it++){
316+
// auto tensor = *it;
317+
// size_t size = tensor->getBytes();
318+
// size_t addr = allocator.alloc(size);
319+
// char * tmpPtr = reinterpret_cast<char*>(allocatorPtr) + addr;
320+
// Blob blob = make_ref<BlobObj>(runtime, (void *)tmpPtr);
321+
// tensor->setDataBlob(blob);
322+
// }
323+
// topological sorting first
324+
IT_ASSERT(topo_sort() == true);
155325

156-
allocator.info();
157-
for(auto it = tensors.begin(); it != tensors.end(); it++){
158-
auto tensor = *it;
326+
// =================================== 作业 ===================================
327+
// TODO:利用 allocator 给计算图分配内存
328+
// HINT: 获取分配好的内存指针后,可以调用 tensor 的 setDataBlob 函数给 tensor 绑定内存
329+
// =================================== 作业 ===================================
330+
vector<size_t> offsets;
331+
for (auto tensor : tensors)
332+
{
159333
size_t size = tensor->getBytes();
160-
size_t addr = allocator.alloc(size);
161-
IT_ASSERT(addr != 0);
162-
Blob blob = make_ref<BlobObj>(tensor->getRuntime(), (void *)(allocator.getPtr() + addr));
334+
size_t offset = allocator.alloc(size);
335+
offsets.push_back(offset);
336+
}
337+
auto it = offsets.begin();
338+
void *basePtr = allocator.getPtr();
339+
for (auto tensor : tensors)
340+
{
341+
char *charPtr = reinterpret_cast<char *>(basePtr) + *it;
342+
void *ptr = charPtr;
343+
Blob blob = make_ref<BlobObj>(runtime, ptr);
163344
tensor->setDataBlob(blob);
345+
it++;
164346
}
347+
allocator.info();
165348
}
166349

167350
Tensor GraphObj::addTensor(Shape dim, DataType dtype)

src/operators/concat.cc

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,30 @@ ConcatObj::ConcatObj(GraphObj *graph, TensorVec inputs, Tensor output, int _dim)
1010
}
1111

1212
optional<vector<Shape>> ConcatObj::inferShape(const TensorVec &inputs) {
13-
Shape dims = inputs[0]->getDims();
13+
Shape dims = inputs[0]->getDims(); // 数组的 shape
1414
auto rank = inputs[0]->getRank();
15-
1615
// =================================== 作业 ===================================
1716
// TODO:修改 dims,返回正确的 concat 后的 shape
1817
// REF: https://onnx.ai/onnx/operators/onnx__Concat.html#concat-13
1918
// =================================== 作业 ===================================
20-
return {{dims}};
19+
if(inputs.size() == 0) {
20+
return std::nullopt;
21+
}
22+
for(auto input: inputs){
23+
if(input->getDims().size() != rank)
24+
return std::nullopt;
25+
}
26+
vector<int> res(rank, 0);
27+
for(auto input: inputs){
28+
for(size_t i = 0; i < rank; i++){
29+
if(i == size_t(dim)){
30+
res[i] += input->getDims()[i];
31+
}else if (i != size_t(dim)){
32+
res[i] = input->getDims()[i];
33+
}
34+
}
35+
}
36+
return {{res}};
2137
}
2238

2339
std::string ConcatObj::toString() const {

0 commit comments

Comments (0)