[Performance] Solve high memory usage issue during model compilation using OpenVINO backend on Keras 3 #31482
Conversation
```cpp
// the order is important
const char* enable_einsum = std::getenv("OV_ENABLE_EINSUM_DECOMPOSITION");
if (enable_einsum) {
    REGISTER_PASS(manager, EinsumDecomposition)
```
I don't think this is a good way to fix this. Doing this in MOC means we will have decomposed einsum in the IR.
As I understand, this is really needed only for einsums that have constant inputs, so that they can be constant-folded before reaching the plugin. Can we do it differently? Maybe modify this transformation to work only on constant inputs for the offline step? @CuriousPanCake
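For context, a minimal numpy sketch of why the constant-input case is the one worth decomposing early (this is not the actual `EinsumDecomposition` output, and the shapes are assumptions): the constant-side handling can be folded offline, leaving only a plain MatMul at runtime.

```python
import numpy as np

# Assumed shapes, echoing an EinsumDense-style projection: x is a runtime
# activation, W is a weight that becomes a Constant after ConstantFolding.
x = np.random.randn(1, 10, 1024).astype(np.float32)
W = np.random.randn(1024, 16, 64).astype(np.float32)

# Offline (constant-foldable) part: reshaping the constant weight once.
W_2d = W.reshape(1024, 16 * 64)

# Runtime part: a plain matmul plus a reshape of the result.
out = (x.reshape(-1, 1024) @ W_2d).reshape(1, 10, 16, 64)

# Same result as the original einsum, but the weight-side work is gone at runtime.
assert np.allclose(out, np.einsum("abc,cde->abde", x, W), rtol=1e-3, atol=1e-2)
```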
@mvafin
I updated it to check if at least one of the inputs is a constant, and it worked too.
from:
================================================================================
FIXED MEMORY TEST: KERAS GPT2 + OPENVINO
================================================================================
[STAGE] 0_INITIAL: 775.24 MB (swap: 0.00 MB) - Initial state after imports
>>> Loading GPT2 model from preset...
[STAGE] 1_MODEL_LOADED: 2314.67 MB (swap: 0.00 MB) - gpt2_medium_en model loaded (10.0s)
[STAGE] 2_BEFORE_INFERENCE: 2314.67 MB (swap: 0.00 MB) - Before first inference
>>> Running first inference (compilation + execution)...
⏳ Converting Keras -> OPENVINO and compiling...
[STAGE] 3_FIRST_INFERENCE: 4512.82 MB (swap: 0.00 MB) - First inference completed via generate (7.7s)
>>> Second inference (no compilation)...
[STAGE] 4_SECOND_INFERENCE: 4510.38 MB (swap: 0.00 MB) - Second inference (2.0s)
[STAGE] 5_FINAL: 4510.38 MB (swap: 0.00 MB) - Final state
================================================================================
PERFORMANCE RESULTS
================================================================================
✅ Generated text: 'Hello everyone,
We've been busy'
✅ Second generation: 'Testimony before the House Judiciary Committee on April'
Backend: openvino
First inference latency: 7.69s
Second inference latency: 2.045s
Throughput: 0.65 tokens/sec
Speedup: 3.8x
📊 DETAILED MEMORY ANALYSIS:
+---------------------+------------+-------------+--------------+---------------+
| STAGE | RAM (MB) | SWAP (MB) | RAM CHANGE | SWAP CHANGE |
+=====================+============+=============+==============+===============+
| Initial | 775.2 | 0 | - | - |
+---------------------+------------+-------------+--------------+---------------+
| After model load | 2314.7 | 0 | +1539.4 | +0.0 |
+---------------------+------------+-------------+--------------+---------------+
| Before inference | 2314.7 | 0 | +0.0 | +0.0 |
+---------------------+------------+-------------+--------------+---------------+
| After 1st inference | 4512.8 | 0 | +2198.1 | +0.0 |
+---------------------+------------+-------------+--------------+---------------+
| After 2nd inference | 4510.4 | 0 | -2.4 | +0.0 |
+---------------------+------------+-------------+--------------+---------------+
| Final | 4510.4 | 0 | +0.0 | +0.0 |
+---------------------+------------+-------------+--------------+---------------+
| Peak recorded | 4522.9 | 0 | +3747.7 | +0.0 |
+---------------------+------------+-------------+--------------+---------------+
🔍 MAIN MEMORY CONSUMERS:
📚 Model loading: +1539.4 MB RAM +0.0 MB swap (41.2% of total)
⚡ Compilation/inference: +2198.1 MB RAM +0.0 MB swap (58.9% of total)
📈 SUMMARY:
💾 Total RAM growth: +3735.1 MB
💿 Total swap change: +0.0 MB
📊 Peak RAM consumption: +3747.7 MB above initial
🔥 Highest RAM recorded: 4522.9 MB
💿 Peak swap consumption: +0.0 MB above initial
🔥 Highest swap recorded: 0.0 MB
🎯 MEMORY HEALTH CHECK:
❌ CRITICAL: RAM usage 3748 MB is very high (target <1GB)
✅ GOOD: Low peak swap usage 0 MB
🚨 ALERT: Combined memory impact 4523 MB is very high
🎯 Test completed: {'success': True, 'model_loading_mb': 1539.4296875, 'compilation_mb': 2198.1484375, 'total_mb': 3735.13671875, 'peak_mb': 3747.6640625, 'peak_swap_mb': 0.0}
to:
[STAGE] 0_INITIAL: 781.90 MB (swap: 0.00 MB) - Initial state after imports
>>> Loading GPT2 model from preset...
[STAGE] 1_MODEL_LOADED: 2321.91 MB (swap: 0.00 MB) - gpt2_medium_en model loaded (13.4s)
[STAGE] 2_BEFORE_INFERENCE: 2321.91 MB (swap: 0.00 MB) - Before first inference
>>> Running first inference (compilation + execution)...
⏳ Converting Keras -> OPENVINO and compiling...
[STAGE] 3_FIRST_INFERENCE: 3548.79 MB (swap: 0.00 MB) - First inference completed via generate (7.6s)
>>> Second inference (no compilation)...
[STAGE] 4_SECOND_INFERENCE: 3546.42 MB (swap: 0.00 MB) - Second inference (2.7s)
[STAGE] 5_FINAL: 3546.42 MB (swap: 0.00 MB) - Final state
================================================================================
PERFORMANCE RESULTS
================================================================================
✅ Generated text: 'Hello! I'm a student studying computer programming'
✅ Second generation: 'Testimonials
I was a new'
Backend: openvino
First inference latency: 7.62s
Second inference latency: 2.673s
Throughput: 0.92 tokens/sec
Speedup: 2.9x
📊 DETAILED MEMORY ANALYSIS:
+---------------------+------------+-------------+--------------+---------------+
| STAGE | RAM (MB) | SWAP (MB) | RAM CHANGE | SWAP CHANGE |
+=====================+============+=============+==============+===============+
| Initial | 781.9 | 0 | - | - |
+---------------------+------------+-------------+--------------+---------------+
| After model load | 2321.9 | 0 | +1540.0 | +0.0 |
+---------------------+------------+-------------+--------------+---------------+
| Before inference | 2321.9 | 0 | +0.0 | +0.0 |
+---------------------+------------+-------------+--------------+---------------+
| After 1st inference | 3548.8 | 0 | +1226.9 | +0.0 |
+---------------------+------------+-------------+--------------+---------------+
| After 2nd inference | 3546.4 | 0 | -2.4 | +0.0 |
+---------------------+------------+-------------+--------------+---------------+
| Final | 3546.4 | 0 | +0.0 | +0.0 |
+---------------------+------------+-------------+--------------+---------------+
| Peak recorded | 3567.8 | 0 | +2785.9 | +0.0 |
+---------------------+------------+-------------+--------------+---------------+
🔍 MAIN MEMORY CONSUMERS:
📚 Model loading: +1540.0 MB RAM +0.0 MB swap (55.7% of total)
⚡ Compilation/inference: +1226.9 MB RAM +0.0 MB swap (44.4% of total)
📈 SUMMARY:
💾 Total RAM growth: +2764.5 MB
💿 Total swap change: +0.0 MB
📊 Peak RAM consumption: +2785.9 MB above initial
🔥 Highest RAM recorded: 3567.8 MB
💿 Peak swap consumption: +0.0 MB above initial
🔥 Highest swap recorded: 0.0 MB
🎯 MEMORY HEALTH CHECK:
❌ CRITICAL: RAM usage 2786 MB is very high (target <1GB)
✅ GOOD: Low peak swap usage 0 MB
🎯 Test completed: {'success': True, 'model_loading_mb': 1540.0078125, 'compilation_mb': 1226.88671875, 'total_mb': 2764.5234375, 'peak_mb': 2785.86328125, 'peak_swap_mb': 0.0}
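For reference, a minimal sketch of how staged RSS numbers like the ones above could be collected. This is not the author's actual test script; `psutil` and the stage names are assumptions here:

```python
import os
os.environ["KERAS_BACKEND"] = "openvino"  # must be set before keras is imported

import time
import psutil  # assumption: RSS is sampled with psutil
import keras_hub

proc = psutil.Process(os.getpid())

def stage(name, note=""):
    # Resident set size of this process, reported in MB like the logs above.
    rss_mb = proc.memory_info().rss / (1024 * 1024)
    print(f"[STAGE] {name}: {rss_mb:.2f} MB - {note}")

stage("0_INITIAL", "Initial state after imports")

t0 = time.time()
causal_lm = keras_hub.models.GPT2CausalLM.from_preset("gpt2_medium_en", dtype="float32")
stage("1_MODEL_LOADED", f"gpt2_medium_en model loaded ({time.time() - t0:.1f}s)")

t0 = time.time()
causal_lm.generate("Hello", max_length=10)  # first call: conversion + compilation + run
stage("3_FIRST_INFERENCE", f"First inference completed via generate ({time.time() - t0:.1f}s)")

t0 = time.time()
causal_lm.generate("Hello", max_length=10)  # second call: no compilation
stage("4_SECOND_INFERENCE", f"Second inference ({time.time() - t0:.1f}s)")
```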
```cpp
REGISTER_PASS(manager, ConstantFolding)
REGISTER_PASS(manager, Validate)

// the order is important
```
Please add a better comment explaining before which transformation it should be called.
Done!
```cpp
// Restrict the decomposition to Einsum nodes that have at least one Constant
// input; only these can be (partially) constant-folded before reaching the plugin.
if (m_check_const) {
    bool has_const = false;
    for (auto& input : einsum_node->input_values()) {
        auto node_ptr = input.get_node_shared_ptr();
        auto constant_ptr = ov::as_type_ptr<ov::op::v0::Constant>(node_ptr);
        if (constant_ptr) {
            has_const = true;
            break;
        }
    }
    // No constant inputs -> leave this Einsum for the regular decomposition path.
    if (!has_const)
        return false;
}
```
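For reference, a rough Python-side equivalent of this check (illustrative only; `has_constant_input` is a hypothetical helper, not part of the OpenVINO API):

```python
import openvino as ov

def has_constant_input(node) -> bool:
    """Return True if at least one input of the node is produced by a Constant."""
    return any(
        inp.get_node().get_type_name() == "Constant"
        for inp in node.input_values()
    )

# Example: list the Einsum ops in a model that would pass the constant-input check.
# model = ov.Core().read_model("model.xml")  # hypothetical model path
# eligible = [op for op in model.get_ops()
#             if op.get_type_name() == "Einsum" and has_constant_input(op)]
```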
Could you provide more details about the einsum operation you want to optimize? Maybe link to the code of the model or a picture of the subgraph.
This optimization targets specific Einsum operations in transformer models like GPT-2 where at least one input is a constant tensor. After ConstantFolding, weight matrices become constants, enabling more efficient decomposition patterns.

Specific Einsum Operations Being Optimized:

1. Query-Key attention scores computation:
   - Location: https://github.com/keras-team/keras/blob/master/keras/src/layers/attention/multi_head_attention.py#L493
   - Pattern: `einsum("aecd,abcd->acbe", key, query)`
   - Code: `attention_scores = ops.einsum(self._dot_product_equation, key, query)`
2. Attention-Value combination:
   - Location: https://github.com/keras-team/keras/blob/master/keras/src/layers/attention/multi_head_attention.py#L509-L511
   - Pattern: `einsum("acbe,aecd->abcd", attention_scores, value)`
   - Code: `attention_output = ops.einsum(self._combine_equation, final_attn_scores, value)`
3. Weight matrix projections (Q/K/V transformations):
   - Location: https://github.com/keras-team/keras/blob/master/keras/src/layers/core/einsum_dense.py#L214
   - Pattern: `einsum("abc,cd->abd", input, weight_matrix)`
   - Code: `x = ops.einsum(self.equation, inputs, self.kernel)`

Optimization Application:
Note: the optimization is only applied when at least one einsum input is constant. In the examples above:
- ✅ Weight matrix projections (example 3): `weight_matrix` becomes constant after ConstantFolding → optimization applied
- ❌ Attention scores (examples 1 & 2): both `key` and `query` are variable tensors → no optimization
For more details and examples visit:
https://gist.github.com/Mohamed-Ashraf273/59eddcd120918cb0761ffa5020800d5d
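As a quick illustration of these three patterns, here is a small numpy sketch (the shapes are assumptions echoing a GPT-2-sized block; this is not the Keras code itself):

```python
import numpy as np

# Assumed shapes: batch=1, seq=10, d_model=1024, 16 heads x 64 dims per head.
x = np.random.randn(1, 10, 1024).astype(np.float32)  # activation (variable)
w = np.random.randn(1024, 1024).astype(np.float32)   # EinsumDense kernel (constant after folding)

# 3. Weight projection: one operand is a weight -> constant input -> optimization applied.
proj = np.einsum("abc,cd->abd", x, w)

# 1./2. Attention einsums: both operands are activations -> no constant input -> skipped.
query = proj.reshape(1, 10, 16, 64)
key = proj.reshape(1, 10, 16, 64)
value = proj.reshape(1, 10, 16, 64)
scores = np.einsum("aecd,abcd->acbe", key, query)       # [1, 16, 10, 10]
attn_out = np.einsum("acbe,aecd->abcd", scores, value)  # [1, 10, 16, 64]
```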
@rkazants @mvafin @mlukasze @evkotov @CuriousPanCake @itikhono

### Performance issue description

## Problem

The OpenVINO backend exhibits **excessive memory consumption** during GPT-2 model inference compared to other Keras backends (TensorFlow, PyTorch, JAX). The issue occurs during the model compilation phase when converting from Keras to OpenVINO format, resulting in significantly higher memory usage that makes OpenVINO unsuitable for memory-constrained environments.

**Problem**: OpenVINO uses substantially more memory than other backends during the compilation/inference phase.

## Summary of the solution

Solving issue #31390. First I tried to solve this problem by introducing `EinsumDecomposition` at MOC in this PR (#31482), but I then found another solution. My first fix was to add `EinsumDecomposition` in MOC, and I found that both this version and the original `EinsumDecomposition` in `CommonOptimizations` introduced `Broadcast` nodes. However, in my fix the MOC pipeline later removed them, which allowed constants to be shared before the `ConstantFolding` pass that otherwise duplicates them in `CommonOptimizations`, leading to reduced memory usage. By comparing the two, I realized that both decompositions actually produced the same graph initially, but the MOC version benefited from an additional simplification step that cleaned up the broadcasts. After debugging, I identified the responsible pass as `NopElimination`. When I applied this pass in `CommonOptimizations` just before `ConstantFolding`, it achieved the same effect: broadcasts disappeared, constants were shared, and memory usage dropped, without needing to move `EinsumDecomposition` into MOC. (A loose numpy analogy of this constant-sharing effect is sketched after this description.)

### 📊 Complete Analysis & Benchmarks

For a comprehensive performance comparison, optimization results, and technical details across all Keras backends:

**[Detailed Performance Report & Memory Optimization Analysis](https://gist.github.com/Mohamed-Ashraf273/1ecc15bd5e83c229d7e3f07851624bc8)**

The report includes cross-backend benchmarks before and after both fixes, which gave the same results for OpenVINO.

---

### Step-by-step reproduction

Use keras source: https://github.com/keras-team/keras.git
Also use this PR from keras_hub: keras-team/keras-hub#2350

```python
import os
os.environ["KERAS_BACKEND"] = "openvino"

import keras_hub

causal_lm = keras_hub.models.GPT2CausalLM.from_preset("gpt2_medium_en", dtype="float32")
output = causal_lm.generate("Hello", max_length=10)  # Memory spike occurs here
```

Example Graph:

```python
# Assumed imports: the original snippet relies on `np`, `ov` and an opset alias `ops`.
import numpy as np
import openvino as ov
from openvino.runtime import opset13 as ops


def create_einsum_constant_model():
    """Create a model with both constant and non-constant einsum patterns from different sources"""
    input_tensor = ops.parameter([1, 10, 1024], np.float32, name="input")

    # Create diverse constant sources for einsum operations
    # Source 1: Direct constant weight matrix
    weight_data_1 = np.random.randn(1024, 16, 64).astype(np.float32)
    const_weight_1 = ops.constant(weight_data_1, name="const_weight_1")

    # Source 2: Constant from addition
    base_weight_2 = ops.constant(np.random.randn(1024, 16, 64).astype(np.float32), name="base_weight_2")
    bias_weight_2 = ops.constant(np.random.randn(1024, 16, 64).astype(np.float32), name="bias_weight_2")
    const_weight_2 = ops.add(base_weight_2, bias_weight_2)  # Constant folded

    # Source 3: Constant from multiply (your original source)
    base_weight_3 = ops.constant(np.random.randn(1024, 16, 64).astype(np.float32), name="base_weight_3")
    scale_3 = ops.constant(np.array(0.125, dtype=np.float32), name="scale_3")
    const_weight_3 = ops.multiply(base_weight_3, scale_3)  # Constant folded

    # Source 4: Constant from reshape
    flat_weight_4 = ops.constant(np.random.randn(1024 * 16 * 64).astype(np.float32), name="flat_weight_4")
    const_weight_4 = ops.reshape(flat_weight_4, [1024, 16, 64], special_zero=False)

    # Source 5: Constant from transpose
    orig_weight_5 = ops.constant(np.random.randn(16, 1024, 64).astype(np.float32), name="orig_weight_5")
    const_weight_5 = ops.transpose(orig_weight_5, [1, 0, 2])  # [1024, 16, 64]

    current = input_tensor

    # Create 10 einsum operations with constants (WILL BE OPTIMIZED)
    const_sources = [const_weight_1, const_weight_2, const_weight_3, const_weight_4, const_weight_5]
    for i in range(5):
        # Use each constant source twice (5*2 = 10)
        for j in range(2):
            const_idx = i
            einsum_out = ops.einsum([current, const_sources[const_idx]], "abc,cde->abde")

            # Add bias to continue the chain
            bias = ops.constant(np.random.randn(16, 64).astype(np.float32), name=f"bias_{i}_{j}")
            current = ops.add(einsum_out, bias)

            # Reshape to prepare for next iteration
            if i < 4 or j < 1:  # Not the last iteration
                proj_weight = ops.constant(np.random.randn(16 * 64, 1024).astype(np.float32), name=f"proj_{i}_{j}")
                reshaped = ops.reshape(current, [1, 10, 16 * 64], special_zero=False)
                current = ops.matmul(reshaped, proj_weight, transpose_a=False, transpose_b=False)

    # Now create variable tensors from different sources for non-constant einsums
    # Start fresh with current tensor for variable operations
    var_source = ops.reshape(current, [1, 10, 16, 64], special_zero=False)

    # Create 20 einsum operations without constants (WON'T BE OPTIMIZED)
    for i in range(10):
        # Source 1: Split operations to create variable tensors
        split_axis = ops.constant(np.array(3, dtype=np.int32), name=f"split_axis_{i}")
        split_lengths = ops.constant(np.array([32, 32], dtype=np.int32), name=f"split_lengths_{i}")
        split_result = ops.variadic_split(var_source, split_axis, split_lengths)
        var_tensor_1 = split_result.output(0)  # [1, 10, 16, 32] - Variable
        var_tensor_2 = split_result.output(1)  # [1, 10, 16, 32] - Variable

        # EINSUM 1: Element-wise pattern (variable x variable)
        einsum_var_1 = ops.einsum([var_tensor_1, var_tensor_2], "abcd,abcd->abcd")

        # Source 2: Create more variable tensors from different operations
        # Use subtract to create another variable tensor
        var_tensor_3 = ops.subtract(var_tensor_1, var_tensor_2)  # [1, 10, 16, 32] - Variable
        # Use relu to create another variable tensor
        var_tensor_4 = ops.relu(var_tensor_2)  # [1, 10, 16, 32] - Variable

        # EINSUM 2: Another variable x variable pattern
        einsum_var_2 = ops.einsum([var_tensor_3, var_tensor_4], "abcd,abcd->abcd")

        # Combine and use for next iteration
        combined = ops.add(einsum_var_1, einsum_var_2)

        # Concatenate back to [1, 10, 16, 64] for next iteration
        var_source = ops.concat([combined, combined], axis=3)  # [1, 10, 16, 64]

    # Final projection to output
    final_proj = ops.constant(np.random.randn(16 * 64, 1024).astype(np.float32), name="final_proj")
    final_reshaped = ops.reshape(var_source, [1, 10, 16 * 64], special_zero=False)
    final_output = ops.matmul(final_reshaped, final_proj, transpose_a=False, transpose_b=False)

    # Final output
    model = ov.Model([final_output], [input_tensor], name="EinsumConstantTest")

    # Print model statistics
    ops_by_type = {}
    for op in model.get_ops():
        op_type = op.get_type_name()
        ops_by_type[op_type] = ops_by_type.get(op_type, 0) + 1

    print("Original model operations:")
    for op_type, count in sorted(ops_by_type.items()):
        print(f"  {op_type}: {count}")

    print(f"\nEinsum breakdown:")
    print(f"  - Einsums with constants (WILL BE OPTIMIZED): 10")
    print(f"    * From direct constant: 2")
    print(f"    * From constant addition: 2")
    print(f"    * From constant multiply: 2")
    print(f"    * From constant reshape: 2")
    print(f"    * From constant transpose: 2")
    print(f"  - Einsums without constants (WON'T BE OPTIMIZED): 20")
    print(f"    * From variadic_split operations: 10")
    print(f"    * From subtract + relu operations: 10")
    print(f"  - Total Einsums: 30")

    return model
```

You can find the original IR, the compiled IR, and the IR before and after NopElimination here:
https://drive.google.com/drive/folders/1xxNVFotGOZLeUf5ECtmJhm4fytJNoBLN?usp=sharing

---

Original Graph:
<img width="1130" height="918" alt="Screenshot from 2025-08-26 12-40-15" src="https://github.com/user-attachments/assets/37a93d33-4dd4-4b6b-9f83-1c21676e6551" />

Before NopElimination:
<img width="655" height="919" alt="Screenshot from 2025-08-26 15-20-51" src="https://github.com/user-attachments/assets/45fe58dc-b702-4510-b30a-1cc15cc43acc" />

After NopElimination:
<img width="655" height="919" alt="Screenshot from 2025-08-26 15-21-26" src="https://github.com/user-attachments/assets/1b7f19a6-45f8-4d60-b04d-bcd416749267" />

---------

Co-authored-by: Maxim Vafin <[email protected]>
Co-authored-by: Andrii Staikov <[email protected]>
Co-authored-by: Roman Kazantsev <[email protected]>
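As referenced in the summary above, here is a loose numpy analogy (not the actual pass behavior; the consumer count and weight shape are assumptions) of why eliminating the no-op `Broadcast` nodes before `ConstantFolding` keeps one shared constant instead of one folded copy per consumer:

```python
import numpy as np

w = np.random.randn(1024, 1024).astype(np.float32)  # one shared ~4 MB weight
n_consumers = 24                                     # assumed number of consuming nodes

# Without NopElimination: ConstantFolding folds broadcast(w, w.shape) separately for
# each consumer, materialising an independent copy of the data every time.
folded_copies = [np.broadcast_to(w, w.shape).copy() for _ in range(n_consumers)]
print("duplicated:", sum(c.nbytes for c in folded_copies) / 2**20, "MB")  # ~96 MB

# With NopElimination first: the no-op Broadcast disappears, so every consumer keeps
# referencing the single original constant.
shared_refs = [w for _ in range(n_consumers)]
print("shared:", w.nbytes / 2**20, "MB")  # ~4 MB
```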
@rkazants
@itikhono
Solving issue #31390, and referring back to #30934.
Adding `EinsumDecomposition` to `MOC transformations` helped reduce memory usage during model compilation. Running this script using the memory profiling from #31516:
Use keras source https://github.com/keras-team/keras.git
Also use this PR from keras_hub: keras-team/keras-hub#2350
Then run the following script.
Then enable `os.environ["OV_ENABLE_MEMORY_PROFILING"] = "1"` by uncommenting it.
Without fix:
With fix:
By adding:
Note: its position in the pass order is important.
I am still exploring what else can help reduce memory usage further. I would appreciate any suggestions or recommendations.