From 74887bcd6a1b1a31ecdaed8e2620b3a0ee8756a2 Mon Sep 17 00:00:00 2001
From: wooway777 <wooway777@gmail.com>
Date: Fri, 3 Jul 2026 12:00:54 +0800
Subject: [PATCH] chore: format include, src, and python

---
 include/infinicore/analyzer.hpp               |   6 +-
 .../infinicore/analyzer/intent_generator.hpp  |  16 +-
 include/infinicore/analyzer/op_trace.hpp      |   6 +-
 include/infinicore/analyzer/op_type.hpp       | 138 ++++++++++++------
 .../infinicore/analyzer/op_type_registry.hpp  |  42 +++---
 .../analyzer/optimization_intent.hpp          |  83 +++++++----
 .../infinicore/analyzer/resource_sensor.hpp   |  36 +++--
 include/infinicore/graph/graph.hpp            |  48 +++---
 include/infinicore/ops/common/dispatcher.hpp  |   5 +-
 .../quantization/base_quantization.hpp        |   2 +-
 include/infinicore/quantization/gptq.hpp      |   2 +-
 .../infinicore/nn/functional/causal_conv1d.py |   8 +-
 .../nn/functional/chunk_gated_delta_rule.py   |   8 +-
 python/infinicore/nn/functional/layer_norm.py |   6 +-
 python/infinicore/nn/functional/rms_norm.py   |   6 +-
 python/infinicore/ops/__init__.py             |   1 +
 scripts/install.py                            |   3 +-
 scripts/python_test.py                        |   3 +-
 setup.py                                      |  20 ++-
 .../nvidia/generate_kernels.py                |   5 +-
 src/infiniop/ops/dequantize_awq/operator.cc   |   2 +-
 .../ops/flash_attention/ninetoothed/build.py  |   9 +-
 .../ops/kv_caching/ninetoothed/build.py       |   3 +-
 .../ops/kv_caching/ninetoothed/kv_caching.py  |   1 +
 src/infiniop/ops/swiglu/ninetoothed/build.py  |   3 +-
 src/infiniop/ops/swiglu/ninetoothed/swiglu.py |   1 -
 26 files changed, 275 insertions(+), 188 deletions(-)
diff --git a/include/infinicore/analyzer.hpp b/include/infinicore/analyzer.hpp
index 86736437f5..81e9ab1a9d 100644
--- a/include/infinicore/analyzer.hpp
+++ b/include/infinicore/analyzer.hpp
@@ -2,10 +2,10 @@
 
 // Convenience header — includes all analyzer components.
 
-#include "analyzer/op_type.hpp"
+#include "analyzer/intent_generator.hpp"
+#include "analyzer/mutual_awareness_analyzer.hpp"
 #include "analyzer/op_trace.hpp"
+#include "analyzer/op_type.hpp"
 #include "analyzer/optimization_intent.hpp"
 #include "analyzer/phase_detector.hpp"
 #include "analyzer/resource_sensor.hpp"
-#include "analyzer/intent_generator.hpp"
-#include "analyzer/mutual_awareness_analyzer.hpp"
diff --git a/include/infinicore/analyzer/intent_generator.hpp b/include/infinicore/analyzer/intent_generator.hpp
index f73c9e880c..b166e4c1eb 100644
--- a/include/infinicore/analyzer/intent_generator.hpp
+++ b/include/infinicore/analyzer/intent_generator.hpp
@@ -1,7 +1,7 @@
 #pragma once
 
-#include "optimization_intent.hpp"
 #include "op_trace.hpp"
+#include "optimization_intent.hpp"
 
 #include <algorithm>
 #include <vector>
@@ -72,7 +72,9 @@ class IntentGenerator {
         PhaseType phase,
         const std::vector<OpTraceEntry> &window) const {
 
-        if (window.empty()) return 0.0f;
+        if (window.empty()) {
+            return 0.0f;
+        }
 
         size_t heavy_compute_ops = 0;
         for (auto &e : window) {
@@ -201,7 +203,7 @@ class IntentGenerator {
 
         // Fusion is beneficial for bandwidth-bound phases (reduce memory traffic)
         hint.prefer_fused_ops = (bottleneck == BottleneckType::BANDWIDTH_BOUND)
-                                || phase == PhaseType::DECODE;
+                             || phase == PhaseType::DECODE;
 
         // In-place when memory is tight
         hint.prefer_in_place = (bottleneck == BottleneckType::MEMORY_BOUND);
@@ -218,8 +220,8 @@ class IntentGenerator {
 
         // Async comm overlap for multi-device and communication phases
         hint.prefer_async_comm = (device_intents.size() > 1)
-                                 && (phase == PhaseType::GEMM_MLP_DENSE
-                                     || phase == PhaseType::COMMUNICATION);
+                              && (phase == PhaseType::GEMM_MLP_DENSE
+                                  || phase == PhaseType::COMMUNICATION);
 
         return hint;
     }
@@ -254,7 +256,9 @@ class IntentGenerator {
             default:
                 break;
             }
-            if (match) matching++;
+            if (match) {
+                matching++;
+            }
         }
 
         return static_cast<float>(matching) / static_cast<float>(window.size());
diff --git a/include/infinicore/analyzer/op_trace.hpp b/include/infinicore/analyzer/op_trace.hpp
index 49f9c4c8b3..0b3018c289 100644
--- a/include/infinicore/analyzer/op_trace.hpp
+++ b/include/infinicore/analyzer/op_trace.hpp
@@ -144,9 +144,9 @@ OpTraceRing &getGlobalOpTrace();
 /// This is the function called from the INFINICORE_GRAPH_OP_RECORD_OR_RUN
 /// macro hook (when ENABLE_MUTUAL_AWARENESS is defined).
 inline void traceOp(OpType op_type,
-                     const size_t *shape, size_t ndim,
-                     uint8_t dtype,
-                     uint8_t device_type, int8_t device_id) {
+                    const size_t *shape, size_t ndim,
+                    uint8_t dtype,
+                    uint8_t device_type, int8_t device_id) {
     OpTraceEntry entry;
     entry.op_type = op_type;
     entry.setShape(shape, ndim);
diff --git a/include/infinicore/analyzer/op_type.hpp b/include/infinicore/analyzer/op_type.hpp
index 21444c2e41..cbb3ac1025 100644
--- a/include/infinicore/analyzer/op_type.hpp
+++ b/include/infinicore/analyzer/op_type.hpp
@@ -84,52 +84,98 @@ enum class OpType : uint8_t {
 /// Convert OpType to human-readable string.
 inline const char *opTypeToString(OpType type) {
     switch (type) {
-    case OpType::ATTENTION: return "attention";
-    case OpType::FLASH_ATTENTION: return "flash_attention";
-    case OpType::CAUSAL_SOFTMAX: return "causal_softmax";
-    case OpType::PAGED_ATTENTION: return "paged_attention";
-    case OpType::PAGED_ATTENTION_PREFILL: return "paged_attention_prefill";
-    case OpType::MHA_KVCACHE: return "mha_kvcache";
-    case OpType::MHA_VARLEN: return "mha_varlen";
-    case OpType::SOFTMAX: return "softmax";
-    case OpType::GEMM: return "gemm";
-    case OpType::LINEAR: return "linear";
-    case OpType::MATMUL: return "matmul";
-    case OpType::INT8_GEMM: return "int8_gemm";
-    case OpType::SCALED_MM_I8: return "scaled_mm_i8";
-    case OpType::SILU: return "silu";
-    case OpType::SILU_AND_MUL: return "silu_and_mul";
-    case OpType::GELU: return "gelu";
-    case OpType::SWIGLU: return "swiglu";
-    case OpType::RELU: return "relu";
-    case OpType::SIGMOID: return "sigmoid";
-    case OpType::RMS_NORM: return "rms_norm";
-    case OpType::ADD_RMS_NORM: return "add_rms_norm";
-    case OpType::LAYER_NORM: return "layer_norm";
-    case OpType::EMBEDDING: return "embedding";
-    case OpType::ROPE: return "rope";
-    case OpType::KV_CACHING: return "kv_caching";
-    case OpType::PAGED_CACHING: return "paged_caching";
-    case OpType::ADD: return "add";
-    case OpType::MUL: return "mul";
-    case OpType::SUB: return "sub";
-    case OpType::SUM: return "sum";
-    case OpType::RECIPROCAL: return "reciprocal";
-    case OpType::PER_TENSOR_QUANT_I8: return "per_tensor_quant_i8";
-    case OpType::PER_TENSOR_DEQUANT_I8: return "per_tensor_dequant_i8";
-    case OpType::PER_CHANNEL_QUANT_I8: return "per_channel_quant_i8";
-    case OpType::DEQUANTIZE_AWQ: return "dequantize_awq";
-    case OpType::DEQUANTIZE_GPTQ: return "dequantize_gptq";
-    case OpType::RANDOM_SAMPLE: return "random_sample";
-    case OpType::TOPK: return "topk";
-    case OpType::TOPK_ROUTER: return "topk_router";
-    case OpType::TOPK_SOFTMAX: return "topk_softmax";
-    case OpType::ALLREDUCE: return "allreduce";
-    case OpType::REARRANGE: return "rearrange";
-    case OpType::ONES: return "ones";
-    case OpType::ZEROS: return "zeros";
-    case OpType::TAKE: return "take";
-    default: return "unknown";
+    case OpType::ATTENTION:
+        return "attention";
+    case OpType::FLASH_ATTENTION:
+        return "flash_attention";
+    case OpType::CAUSAL_SOFTMAX:
+        return "causal_softmax";
+    case OpType::PAGED_ATTENTION:
+        return "paged_attention";
+    case OpType::PAGED_ATTENTION_PREFILL:
+        return "paged_attention_prefill";
+    case OpType::MHA_KVCACHE:
+        return "mha_kvcache";
+    case OpType::MHA_VARLEN:
+        return "mha_varlen";
+    case OpType::SOFTMAX:
+        return "softmax";
+    case OpType::GEMM:
+        return "gemm";
+    case OpType::LINEAR:
+        return "linear";
+    case OpType::MATMUL:
+        return "matmul";
+    case OpType::INT8_GEMM:
+        return "int8_gemm";
+    case OpType::SCALED_MM_I8:
+        return "scaled_mm_i8";
+    case OpType::SILU:
+        return "silu";
+    case OpType::SILU_AND_MUL:
+        return "silu_and_mul";
+    case OpType::GELU:
+        return "gelu";
+    case OpType::SWIGLU:
+        return "swiglu";
+    case OpType::RELU:
+        return "relu";
+    case OpType::SIGMOID:
+        return "sigmoid";
+    case OpType::RMS_NORM:
+        return "rms_norm";
+    case OpType::ADD_RMS_NORM:
+        return "add_rms_norm";
+    case OpType::LAYER_NORM:
+        return "layer_norm";
+    case OpType::EMBEDDING:
+        return "embedding";
+    case OpType::ROPE:
+        return "rope";
+    case OpType::KV_CACHING:
+        return "kv_caching";
+    case OpType::PAGED_CACHING:
+        return "paged_caching";
+    case OpType::ADD:
+        return "add";
+    case OpType::MUL:
+        return "mul";
+    case OpType::SUB:
+        return "sub";
+    case OpType::SUM:
+        return "sum";
+    case OpType::RECIPROCAL:
+        return "reciprocal";
+    case OpType::PER_TENSOR_QUANT_I8:
+        return "per_tensor_quant_i8";
+    case OpType::PER_TENSOR_DEQUANT_I8:
+        return "per_tensor_dequant_i8";
+    case OpType::PER_CHANNEL_QUANT_I8:
+        return "per_channel_quant_i8";
+    case OpType::DEQUANTIZE_AWQ:
+        return "dequantize_awq";
+    case OpType::DEQUANTIZE_GPTQ:
+        return "dequantize_gptq";
+    case OpType::RANDOM_SAMPLE:
+        return "random_sample";
+    case OpType::TOPK:
+        return "topk";
+    case OpType::TOPK_ROUTER:
+        return "topk_router";
+    case OpType::TOPK_SOFTMAX:
+        return "topk_softmax";
+    case OpType::ALLREDUCE:
+        return "allreduce";
+    case OpType::REARRANGE:
+        return "rearrange";
+    case OpType::ONES:
+        return "ones";
+    case OpType::ZEROS:
+        return "zeros";
+    case OpType::TAKE:
+        return "take";
+    default:
+        return "unknown";
     }
 }
 
diff --git a/include/infinicore/analyzer/op_type_registry.hpp b/include/infinicore/analyzer/op_type_registry.hpp
index 8d7bc0135b..4445f3a514 100644
--- a/include/infinicore/analyzer/op_type_registry.hpp
+++ b/include/infinicore/analyzer/op_type_registry.hpp
@@ -11,36 +11,36 @@ namespace infinicore::analyzer {
 inline OpType opTypeFromName(const char *name) {
     static const std::unordered_map<std::string, OpType> registry = {
         // Attention
-        {"FlashAttention",      OpType::FLASH_ATTENTION},
-        {"CausalSoftmax",       OpType::CAUSAL_SOFTMAX},
-        {"PagedAttention",      OpType::PAGED_ATTENTION},
-        {"MhaKVCache",          OpType::MHA_KVCACHE},
+        {"FlashAttention", OpType::FLASH_ATTENTION},
+        {"CausalSoftmax", OpType::CAUSAL_SOFTMAX},
+        {"PagedAttention", OpType::PAGED_ATTENTION},
+        {"MhaKVCache", OpType::MHA_KVCACHE},
         {"MultiheadAttentionVarlen", OpType::MHA_VARLEN},
         // GEMM / MLP
-        {"Gemm",                OpType::GEMM},
-        {"I8Gemm",              OpType::SCALED_MM_I8},
+        {"Gemm", OpType::GEMM},
+        {"I8Gemm", OpType::SCALED_MM_I8},
         // Activation
-        {"SiluAndMul",          OpType::SILU_AND_MUL},
-        {"SwiGLU",              OpType::SWIGLU},
+        {"SiluAndMul", OpType::SILU_AND_MUL},
+        {"SwiGLU", OpType::SWIGLU},
         // Norm
-        {"RMSNorm",             OpType::RMS_NORM},
-        {"AddRMSNorm",          OpType::ADD_RMS_NORM},
+        {"RMSNorm", OpType::RMS_NORM},
+        {"AddRMSNorm", OpType::ADD_RMS_NORM},
         // Embedding / Positional
-        {"Embedding",           OpType::EMBEDDING},
-        {"RoPE",                OpType::ROPE},
+        {"Embedding", OpType::EMBEDDING},
+        {"RoPE", OpType::ROPE},
         // KV Cache
-        {"KVCaching",           OpType::KV_CACHING},
-        {"PagedCaching",        OpType::PAGED_CACHING},
+        {"KVCaching", OpType::KV_CACHING},
+        {"PagedCaching", OpType::PAGED_CACHING},
         // Elementwise
-        {"Add",                 OpType::ADD},
-        {"Mul",                 OpType::MUL},
+        {"Add", OpType::ADD},
+        {"Mul", OpType::MUL},
         // Quantization
-        {"PerTensorQuantI8",    OpType::PER_TENSOR_QUANT_I8},
-        {"PerTensorDequantI8",  OpType::PER_TENSOR_DEQUANT_I8},
-        {"PerChannelQuantI8",   OpType::PER_CHANNEL_QUANT_I8},
-        {"DequantizeAWQ",       OpType::DEQUANTIZE_AWQ},
+        {"PerTensorQuantI8", OpType::PER_TENSOR_QUANT_I8},
+        {"PerTensorDequantI8", OpType::PER_TENSOR_DEQUANT_I8},
+        {"PerChannelQuantI8", OpType::PER_CHANNEL_QUANT_I8},
+        {"DequantizeAWQ", OpType::DEQUANTIZE_AWQ},
         // Misc
-        {"Rearrange",           OpType::REARRANGE},
+        {"Rearrange", OpType::REARRANGE},
     };
     auto it = registry.find(name);
     return it != registry.end() ? it->second : OpType::UNKNOWN;
diff --git a/include/infinicore/analyzer/optimization_intent.hpp b/include/infinicore/analyzer/optimization_intent.hpp
index 9b423a281c..19d6c868ce 100644
--- a/include/infinicore/analyzer/optimization_intent.hpp
+++ b/include/infinicore/analyzer/optimization_intent.hpp
@@ -15,25 +15,33 @@ namespace infinicore::analyzer {
 /// Recognized task phase types in LLM inference.
 enum class PhaseType : uint8_t {
     UNKNOWN = 0,
-    PREFILL,              // Long-sequence prefill phase
-    DECODE,               // Autoregressive decode phase
-    ATTENTION_DENSE,      // Attention-dominated segment
-    GEMM_MLP_DENSE,       // GEMM/MLP-dominated segment
-    MOE_ROUTING,          // MoE routing segment (future)
-    KV_CACHE,             // KV cache manipulation segment
-    COMMUNICATION,        // Communication-dominated segment (future)
+    PREFILL,         // Long-sequence prefill phase
+    DECODE,          // Autoregressive decode phase
+    ATTENTION_DENSE, // Attention-dominated segment
+    GEMM_MLP_DENSE,  // GEMM/MLP-dominated segment
+    MOE_ROUTING,     // MoE routing segment (future)
+    KV_CACHE,        // KV cache manipulation segment
+    COMMUNICATION,   // Communication-dominated segment (future)
 };
 
 inline const char *phaseTypeToString(PhaseType phase) {
     switch (phase) {
-    case PhaseType::PREFILL: return "prefill";
-    case PhaseType::DECODE: return "decode";
-    case PhaseType::ATTENTION_DENSE: return "attention_dense";
-    case PhaseType::GEMM_MLP_DENSE: return "gemm_mlp_dense";
-    case PhaseType::MOE_ROUTING: return "moe_routing";
-    case PhaseType::KV_CACHE: return "kv_cache";
-    case PhaseType::COMMUNICATION: return "communication";
-    default: return "unknown";
+    case PhaseType::PREFILL:
+        return "prefill";
+    case PhaseType::DECODE:
+        return "decode";
+    case PhaseType::ATTENTION_DENSE:
+        return "attention_dense";
+    case PhaseType::GEMM_MLP_DENSE:
+        return "gemm_mlp_dense";
+    case PhaseType::MOE_ROUTING:
+        return "moe_routing";
+    case PhaseType::KV_CACHE:
+        return "kv_cache";
+    case PhaseType::COMMUNICATION:
+        return "communication";
+    default:
+        return "unknown";
     }
 }
 
@@ -48,12 +56,18 @@ enum class BottleneckType : uint8_t {
 
 inline const char *bottleneckTypeToString(BottleneckType bt) {
     switch (bt) {
-    case BottleneckType::COMPUTE_BOUND: return "compute_bound";
-    case BottleneckType::MEMORY_BOUND: return "memory_bound";
-    case BottleneckType::BANDWIDTH_BOUND: return "bandwidth_bound";
-    case BottleneckType::COMMUNICATION_BOUND: return "communication_bound";
-    case BottleneckType::BALANCED: return "balanced";
-    default: return "unknown";
+    case BottleneckType::COMPUTE_BOUND:
+        return "compute_bound";
+    case BottleneckType::MEMORY_BOUND:
+        return "memory_bound";
+    case BottleneckType::BANDWIDTH_BOUND:
+        return "bandwidth_bound";
+    case BottleneckType::COMMUNICATION_BOUND:
+        return "communication_bound";
+    case BottleneckType::BALANCED:
+        return "balanced";
+    default:
+        return "unknown";
     }
 }
 
@@ -67,11 +81,16 @@ enum class OptimizationGoal : uint8_t {
 
 inline const char *optimizationGoalToString(OptimizationGoal goal) {
     switch (goal) {
-    case OptimizationGoal::LATENCY_FIRST: return "latency_first";
-    case OptimizationGoal::THROUGHPUT_FIRST: return "throughput_first";
-    case OptimizationGoal::MEMORY_SAFE: return "memory_safe";
-    case OptimizationGoal::STABILITY_FIRST: return "stability_first";
-    default: return "unknown";
+    case OptimizationGoal::LATENCY_FIRST:
+        return "latency_first";
+    case OptimizationGoal::THROUGHPUT_FIRST:
+        return "throughput_first";
+    case OptimizationGoal::MEMORY_SAFE:
+        return "memory_safe";
+    case OptimizationGoal::STABILITY_FIRST:
+        return "stability_first";
+    default:
+        return "unknown";
     }
 }
 
@@ -85,10 +104,10 @@ inline const char *optimizationGoalToString(OptimizationGoal goal) {
 
 /// Strategy hints generated by the analyzer.
 struct StrategyHint {
-    bool prefer_fused_ops = false;       // Suggest op fusion?
-    bool prefer_in_place = false;        // Suggest in-place computation?
-    bool prefer_recomputation = false;   // Suggest recompute to save memory?
-    bool prefer_async_comm = false;      // Suggest async comm overlap?
+    bool prefer_fused_ops = false;     // Suggest op fusion?
+    bool prefer_in_place = false;      // Suggest in-place computation?
+    bool prefer_recomputation = false; // Suggest recompute to save memory?
+    bool prefer_async_comm = false;    // Suggest async comm overlap?
 };
 
 /// Global semantic-level intent — describes the task's overall
@@ -114,8 +133,8 @@ struct GlobalSemanticIntent {
 
     // --- Production metadata ---
     uint64_t timestamp_ns = 0;
-    uint32_t op_window_start = 0;  // Start index of analyzed op window
-    uint32_t op_window_end = 0;    // End index (exclusive) of analyzed op window
+    uint32_t op_window_start = 0; // Start index of analyzed op window
+    uint32_t op_window_end = 0;   // End index (exclusive) of analyzed op window
 };
 
 /// Per-device local intent — resource-aware refinement
diff --git a/include/infinicore/analyzer/resource_sensor.hpp b/include/infinicore/analyzer/resource_sensor.hpp
index a5f18b234e..72c871080b 100644
--- a/include/infinicore/analyzer/resource_sensor.hpp
+++ b/include/infinicore/analyzer/resource_sensor.hpp
@@ -11,15 +11,15 @@ namespace infinicore::analyzer {
 
 /// Memory statistics from the allocator.
 struct MemoryStats {
-    size_t allocated_bytes = 0;    // Currently allocated bytes
-    size_t total_capacity = 0;     // Total pool capacity in bytes
-    size_t peak_allocated = 0;     // Peak allocation since last reset
-    size_t allocation_count = 0;   // Number of active allocations
+    size_t allocated_bytes = 0;  // Currently allocated bytes
+    size_t total_capacity = 0;   // Total pool capacity in bytes
+    size_t peak_allocated = 0;   // Peak allocation since last reset
+    size_t allocation_count = 0; // Number of active allocations
 
     float usageRatio() const {
         return total_capacity > 0
-            ? static_cast<float>(allocated_bytes) / static_cast<float>(total_capacity)
-            : 0.0f;
+                 ? static_cast<float>(allocated_bytes) / static_cast<float>(total_capacity)
+                 : 0.0f;
     }
 };
 
@@ -70,10 +70,18 @@ struct DeviceResourceSnapshot {
 
     float resourceConfidence() const {
         float confidence = 0.0f;
-        if (has_memory_capacity) confidence += 0.35f;
-        if (has_compute_utilization) confidence += 0.25f;
-        if (has_memory_bandwidth_utilization) confidence += 0.25f;
-        if (has_communication) confidence += 0.15f;
+        if (has_memory_capacity) {
+            confidence += 0.35f;
+        }
+        if (has_compute_utilization) {
+            confidence += 0.25f;
+        }
+        if (has_memory_bandwidth_utilization) {
+            confidence += 0.25f;
+        }
+        if (has_communication) {
+            confidence += 0.15f;
+        }
         return std::min(confidence, 1.0f);
     }
 };
@@ -94,8 +102,8 @@ class ResourceSensor {
         intent.device_id = snapshot.device_id;
         intent.memory_usage_ratio = snapshot.memoryUsageRatio();
         intent.memory_available_bytes = snapshot.free_bytes > 0
-            ? snapshot.free_bytes
-            : ((snapshot.total_bytes >= snapshot.used_bytes) ? (snapshot.total_bytes - snapshot.used_bytes) : 0);
+                                          ? snapshot.free_bytes
+                                          : ((snapshot.total_bytes >= snapshot.used_bytes) ? (snapshot.total_bytes - snapshot.used_bytes) : 0);
         intent.compute_utilization = snapshot.compute_utilization;
         intent.memory_bandwidth_utilization = snapshot.memory_bandwidth_utilization;
         intent.communication_time_ratio = snapshot.communication_time_ratio;
@@ -149,8 +157,8 @@ class ResourceSensor {
         snapshot.device_type = device_type;
         snapshot.has_memory_capacity = stats.total_capacity > 0;
         snapshot.free_bytes = stats.total_capacity >= stats.allocated_bytes
-            ? (stats.total_capacity - stats.allocated_bytes)
-            : 0;
+                                ? (stats.total_capacity - stats.allocated_bytes)
+                                : 0;
         snapshot.total_bytes = stats.total_capacity;
         snapshot.used_bytes = stats.allocated_bytes;
         snapshot.reserved_bytes = stats.total_capacity;
diff --git a/include/infinicore/graph/graph.hpp b/include/infinicore/graph/graph.hpp
index f477bcc67c..be368f92d7 100644
--- a/include/infinicore/graph/graph.hpp
+++ b/include/infinicore/graph/graph.hpp
@@ -93,35 +93,35 @@ class Graph {
 // Trace one op invocation into the global ring. Op type is resolved by
 // stringified class name through `opTypeFromName`, so new graph ops are
 // automatically discoverable without modifying the op header.
-#define _INFINICORE_TRACE_OP(__OP_NAME__, __TRACE_TENSOR__)                  \
-    do {                                                                     \
-        auto __op_type = ::infinicore::analyzer::opTypeFromName(#__OP_NAME__);\
-        auto &&__trace_tensor = (__TRACE_TENSOR__);                          \
-        if (__trace_tensor) {                                                \
-            const auto &__trace_shape = __trace_tensor->shape();             \
-            const auto __trace_device = __trace_tensor->device();            \
-            ::infinicore::analyzer::traceOp(                                 \
-                __op_type,                                                   \
-                __trace_shape.data(),                                        \
-                __trace_shape.size(),                                        \
-                static_cast<uint8_t>(__trace_tensor->dtype()),               \
-                static_cast<uint8_t>(__trace_device.getType()),              \
-                static_cast<int8_t>(__trace_device.getIndex()));             \
-        } else {                                                             \
-            ::infinicore::analyzer::traceOp(__op_type, nullptr, 0, 0, 0, -1);\
-        }                                                                    \
+#define _INFINICORE_TRACE_OP(__OP_NAME__, __TRACE_TENSOR__)                    \
+    do {                                                                       \
+        auto __op_type = ::infinicore::analyzer::opTypeFromName(#__OP_NAME__); \
+        auto &&__trace_tensor = (__TRACE_TENSOR__);                            \
+        if (__trace_tensor) {                                                  \
+            const auto &__trace_shape = __trace_tensor->shape();               \
+            const auto __trace_device = __trace_tensor->device();              \
+            ::infinicore::analyzer::traceOp(                                   \
+                __op_type,                                                     \
+                __trace_shape.data(),                                          \
+                __trace_shape.size(),                                          \
+                static_cast<uint8_t>(__trace_tensor->dtype()),                 \
+                static_cast<uint8_t>(__trace_device.getType()),                \
+                static_cast<int8_t>(__trace_device.getIndex()));               \
+        } else {                                                               \
+            ::infinicore::analyzer::traceOp(__op_type, nullptr, 0, 0, 0, -1);  \
+        }                                                                      \
     } while (0)
 #else
 #define _INFINICORE_TRACE_OP(__OP_NAME__, __TRACE_TENSOR__) ((void)0)
 #endif
 
-#define INFINICORE_GRAPH_OP_RECORD_OR_RUN(__OP_NAME__, ...)                  \
-    auto ___op = std::make_shared<__OP_NAME__>(__VA_ARGS__);                 \
-    if (context::isGraphRecording()) {                                       \
-        context::addGraphOperator(___op);                                    \
-    } else {                                                                 \
-        ___op->run();                                                        \
-    }                                                                        \
+#define INFINICORE_GRAPH_OP_RECORD_OR_RUN(__OP_NAME__, ...)  \
+    auto ___op = std::make_shared<__OP_NAME__>(__VA_ARGS__); \
+    if (context::isGraphRecording()) {                       \
+        context::addGraphOperator(___op);                    \
+    } else {                                                 \
+        ___op->run();                                        \
+    }                                                        \
     _INFINICORE_TRACE_OP(__OP_NAME__, INFINICORE_DETAIL_FIRST_ARG(__VA_ARGS__));
 
 #define INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(__OP_NAME__, __PLAN_F__, __RUN_F__, __CLEANUP_F__) \
diff --git a/include/infinicore/ops/common/dispatcher.hpp b/include/infinicore/ops/common/dispatcher.hpp
index c5a0d5248c..9c5cac1793 100644
--- a/include/infinicore/ops/common/dispatcher.hpp
+++ b/include/infinicore/ops/common/dispatcher.hpp
@@ -66,11 +66,12 @@ class OpDispatcher {
 private:
     static std::size_t goalKey(Device::Type device_type, analyzer::OptimizationGoal goal) {
         return static_cast<std::size_t>(device_type) * kGoalCount
-               + static_cast<std::size_t>(goal);
+             + static_cast<std::size_t>(goal);
     }
 
     std::array<Fn,
-               static_cast<std::size_t>(Device::Type::COUNT) * kGoalCount> goal_table_{};
+               static_cast<std::size_t>(Device::Type::COUNT) * kGoalCount>
+        goal_table_{};
 #endif
 
 private:
diff --git a/include/infinicore/quantization/base_quantization.hpp b/include/infinicore/quantization/base_quantization.hpp
index ceff0f54a3..4ee4f7608d 100644
--- a/include/infinicore/quantization/base_quantization.hpp
+++ b/include/infinicore/quantization/base_quantization.hpp
@@ -6,7 +6,7 @@ namespace infinicore::quantization {
 class BaseQuantization {
     // Base class for quantization schemes. Intended to be extended to support various quantization methods.
 public:
-    explicit BaseQuantization(const nlohmann::json &quant_config) : quant_config_(quant_config) {};
+    explicit BaseQuantization(const nlohmann::json &quant_config) : quant_config_(quant_config){};
     virtual ~BaseQuantization() = default;
 
     const nlohmann::json &get_config() const { return quant_config_; }
diff --git a/include/infinicore/quantization/gptq.hpp b/include/infinicore/quantization/gptq.hpp
index c4cd4bb02b..c03416b076 100644
--- a/include/infinicore/quantization/gptq.hpp
+++ b/include/infinicore/quantization/gptq.hpp
@@ -8,7 +8,7 @@ class GPTQ : public BaseQuantization {
     // information and support multiple quantization schemes.
 public:
     explicit GPTQ(const nlohmann::json &quant_config)
-        : BaseQuantization(quant_config) {};
+        : BaseQuantization(quant_config){};
 
     infinicore::quantization::QuantScheme
     get_quant_scheme() const override {
diff --git a/python/infinicore/nn/functional/causal_conv1d.py b/python/infinicore/nn/functional/causal_conv1d.py
index c7699819fe..04116a8586 100644
--- a/python/infinicore/nn/functional/causal_conv1d.py
+++ b/python/infinicore/nn/functional/causal_conv1d.py
@@ -41,9 +41,11 @@ def causal_conv1d(
             weight._underlying,
             None if bias is None else bias._underlying,
             None if cu_seqlens is None else cu_seqlens._underlying,
-            None
-            if initial_state_indices is None
-            else initial_state_indices._underlying,
+            (
+                None
+                if initial_state_indices is None
+                else initial_state_indices._underlying
+            ),
             None if final_state_indices is None else final_state_indices._underlying,
         )
     )
diff --git a/python/infinicore/nn/functional/chunk_gated_delta_rule.py b/python/infinicore/nn/functional/chunk_gated_delta_rule.py
index 082743faa1..dd87a9f80c 100644
--- a/python/infinicore/nn/functional/chunk_gated_delta_rule.py
+++ b/python/infinicore/nn/functional/chunk_gated_delta_rule.py
@@ -56,9 +56,11 @@ def chunk_gated_delta_rule(
             beta._underlying,
             initial_state._underlying,
             None if cu_seqlens is None else cu_seqlens._underlying,
-            None
-            if initial_state_indices is None
-            else initial_state_indices._underlying,
+            (
+                None
+                if initial_state_indices is None
+                else initial_state_indices._underlying
+            ),
             None if final_state_indices is None else final_state_indices._underlying,
             use_qk_l2norm,
             chunk_size,
diff --git a/python/infinicore/nn/functional/layer_norm.py b/python/infinicore/nn/functional/layer_norm.py
index b841fdc076..26a6e6979a 100644
--- a/python/infinicore/nn/functional/layer_norm.py
+++ b/python/infinicore/nn/functional/layer_norm.py
@@ -15,9 +15,9 @@ def layer_norm(
 ) -> Tensor:
     r"""Apply Layer Normalization."""
 
-    assert normalized_shape == weight.shape, (
-        "normalized_shape does not match weight.shape."
-    )
+    assert (
+        normalized_shape == weight.shape
+    ), "normalized_shape does not match weight.shape."
 
     if out is None:
         return Tensor(
diff --git a/python/infinicore/nn/functional/rms_norm.py b/python/infinicore/nn/functional/rms_norm.py
index 84cd82ae12..30fb701d20 100644
--- a/python/infinicore/nn/functional/rms_norm.py
+++ b/python/infinicore/nn/functional/rms_norm.py
@@ -14,9 +14,9 @@ def rms_norm(
 ) -> Tensor:
     r"""Apply Root Mean Square Layer Normalization."""
 
-    assert normalized_shape == weight.shape, (
-        "normalized_shape does not match weight.shape."
-    )
+    assert (
+        normalized_shape == weight.shape
+    ), "normalized_shape does not match weight.shape."
 
     if out is None:
         return Tensor(_infinicore.rms_norm(input._underlying, weight._underlying, eps))
diff --git a/python/infinicore/ops/__init__.py b/python/infinicore/ops/__init__.py
index e69de29bb2..8b13789179 100644
--- a/python/infinicore/ops/__init__.py
+++ b/python/infinicore/ops/__init__.py
@@ -0,0 +1 @@
+
diff --git a/scripts/install.py b/scripts/install.py
index 98a448254e..8c75776a3d 100644
--- a/scripts/install.py
+++ b/scripts/install.py
@@ -1,7 +1,7 @@
 import os
 import subprocess
-import platform
 import sys
+
 from set_env import (
     set_env,
     set_env_by_config,
@@ -10,6 +10,7 @@
 PROJECT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
 os.chdir(PROJECT_DIR)
 
+
 def run_cmd(cmd):
     subprocess.run(cmd, text=True, encoding="utf-8", check=True, shell=True)
 
diff --git a/scripts/python_test.py b/scripts/python_test.py
index 4879da32a9..f23443447c 100644
--- a/scripts/python_test.py
+++ b/scripts/python_test.py
@@ -1,8 +1,9 @@
 import os
 import subprocess
-from set_env import set_env
 import sys
 
+from set_env import set_env
+
 PROJECT_DIR = os.path.abspath(
     os.path.join(os.path.dirname(__file__), "..", "test", "infiniop")
 )
diff --git a/setup.py b/setup.py
index e79e41f539..479af83750 100644
--- a/setup.py
+++ b/setup.py
@@ -1,7 +1,9 @@
 import subprocess
-from setuptools import setup, find_packages
+
+from setuptools import find_packages, setup
 from setuptools.command.build import build
 
+
 def run_xmake_build():
     print("Running xmake build...")
     subprocess.run(["xmake", "build"], check=True)
@@ -9,26 +11,22 @@ def run_xmake_build():
     subprocess.run(["xmake", "build", "-y", "_infinicore"], check=True)
     subprocess.run(["xmake", "install", "_infinicore"], check=True)
 
+
 class Build(build):
     def run(self):
         run_xmake_build()
         super().run()
 
+
 setup(
     # 1. Find main packages and manually add test/framework packages
-    packages=find_packages(where="python") + [
-        "infinicore.test", 
-        "infinicore.test.framework"
-    ],
-    
+    packages=find_packages(where="python")
+    + ["infinicore.test", "infinicore.test.framework"],
     # 2. Directory mappings
     package_dir={
         "": "python",  # Root package is under python/ directory
-        "infinicore.test": "test/infinicore"  # Intermediate package mapping
+        "infinicore.test": "test/infinicore",  # Intermediate package mapping
     },
-    
     # 3. Register commands
-    cmdclass={
-        "build": Build
-    }
+    cmdclass={"build": Build},
 )
diff --git a/src/infiniop/ops/awq_marlin_gemm/nvidia/generate_kernels.py b/src/infiniop/ops/awq_marlin_gemm/nvidia/generate_kernels.py
index e05296c52e..ab78d0ce20 100644
--- a/src/infiniop/ops/awq_marlin_gemm/nvidia/generate_kernels.py
+++ b/src/infiniop/ops/awq_marlin_gemm/nvidia/generate_kernels.py
@@ -32,12 +32,15 @@
 """.lstrip()
 
 
-FILE_HEAD = FILE_HEAD_COMMENT + """
+FILE_HEAD = (
+    FILE_HEAD_COMMENT
+    + """
 #include "../marlin/kernel.h"
 #include "../marlin/marlin_template.h"
 
 namespace MARLIN_NAMESPACE_NAME {
 """
+)
 
 TEMPLATE = (
     "template __global__ void Marlin<"
diff --git a/src/infiniop/ops/dequantize_awq/operator.cc b/src/infiniop/ops/dequantize_awq/operator.cc
index 342e3da652..c63aed0fde 100644
--- a/src/infiniop/ops/dequantize_awq/operator.cc
+++ b/src/infiniop/ops/dequantize_awq/operator.cc
@@ -155,4 +155,4 @@ infiniopDestroyDequantizeAWQDescriptor(infiniopDequantizeAWQDescriptor_t desc) {
 #undef DELETE
 }
 
-// #endif
\ No newline at end of file
+// #endif
diff --git a/src/infiniop/ops/flash_attention/ninetoothed/build.py b/src/infiniop/ops/flash_attention/ninetoothed/build.py
index 00f96b9c92..ef3235dfd9 100644
--- a/src/infiniop/ops/flash_attention/ninetoothed/build.py
+++ b/src/infiniop/ops/flash_attention/ninetoothed/build.py
@@ -1,12 +1,11 @@
+import os
+
 import ninetoothed
-from . import flash_attention
-from .flash_attention import CausalVariant
 
 import infiniop.ninetoothed.build
 
-import torch
-
-import os
+from . import flash_attention
+from .flash_attention import CausalVariant
 
 
 def build():
diff --git a/src/infiniop/ops/kv_caching/ninetoothed/build.py b/src/infiniop/ops/kv_caching/ninetoothed/build.py
index 03481c86b7..f76497434d 100644
--- a/src/infiniop/ops/kv_caching/ninetoothed/build.py
+++ b/src/infiniop/ops/kv_caching/ninetoothed/build.py
@@ -1,8 +1,9 @@
 import ninetoothed
-from . import kv_caching
 
 import infiniop.ninetoothed.build
 
+from . import kv_caching
+
 
 def build():
     dtype_values = (
diff --git a/src/infiniop/ops/kv_caching/ninetoothed/kv_caching.py b/src/infiniop/ops/kv_caching/ninetoothed/kv_caching.py
index dfc5088e95..006dbe3770 100644
--- a/src/infiniop/ops/kv_caching/ninetoothed/kv_caching.py
+++ b/src/infiniop/ops/kv_caching/ninetoothed/kv_caching.py
@@ -1,4 +1,5 @@
 import functools
+
 import ninetoothed
 from ninetoothed import Tensor
 
diff --git a/src/infiniop/ops/swiglu/ninetoothed/build.py b/src/infiniop/ops/swiglu/ninetoothed/build.py
index fa4af6db24..10c82d5385 100644
--- a/src/infiniop/ops/swiglu/ninetoothed/build.py
+++ b/src/infiniop/ops/swiglu/ninetoothed/build.py
@@ -1,8 +1,9 @@
 import ninetoothed
-from . import swiglu
 
 import infiniop.ninetoothed.build
 
+from . import swiglu
+
 
 def build():
     MAX_NDIM = 5
diff --git a/src/infiniop/ops/swiglu/ninetoothed/swiglu.py b/src/infiniop/ops/swiglu/ninetoothed/swiglu.py
index 62074a84bf..2ede5e96a9 100644
--- a/src/infiniop/ops/swiglu/ninetoothed/swiglu.py
+++ b/src/infiniop/ops/swiglu/ninetoothed/swiglu.py
@@ -2,7 +2,6 @@
 
 import ninetoothed.language as ntl
 from ninetoothed import Tensor
-
 from ntops.kernels.element_wise import arrangement