diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c3dca338a..c91febfe0 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -72,13 +72,23 @@ jobs: sudo apt-get update sudo apt-get install build-essential + - name: ccache + uses: ggml-org/ccache-action@v1.2.16 + with: + key: ubuntu-cmake-cpu + evict-old-files: 1d + - name: Build id: cmake_build run: | - mkdir build - cd build - cmake .. -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON - cmake --build . --config Release + cmake -B build \ + -DCMAKE_INSTALL_RPATH='$ORIGIN' \ + -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \ + -DGGML_BACKEND_DL=ON \ + -DGGML_NATIVE=OFF \ + -DGGML_CPU_ALL_VARIANTS=ON \ + -DGGML_RPC=ON + cmake --build build --config Release -j $(nproc) - name: Get commit hash id: commit @@ -134,14 +144,25 @@ jobs: run: | sudo apt-get update sudo apt-get install build-essential libvulkan-dev glslc + + - name: ccache + uses: ggml-org/ccache-action@v1.2.16 + with: + key: ubuntu-cmake-vulkan + evict-old-files: 1d - name: Build id: cmake_build run: | - mkdir build - cd build - cmake .. -DSD_BUILD_SHARED_LIBS=ON -DSD_VULKAN=ON - cmake --build . --config Release + cmake -B build \ + -DCMAKE_INSTALL_RPATH='$ORIGIN' \ + -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \ + -DGGML_BACKEND_DL=ON \ + -DGGML_NATIVE=OFF \ + -DGGML_CPU_ALL_VARIANTS=ON \ + -DGGML_VULKAN=ON \ + -DGGML_RPC=ON + cmake --build build --config Release -j $(nproc) - name: Get commit hash id: commit @@ -271,14 +292,23 @@ jobs: run: | brew install zip + - name: ccache + uses: ggml-org/ccache-action@v1.2.16 + with: + key: macos-cmake + evict-old-files: 1d + - name: Build id: cmake_build run: | sysctl -a - mkdir build - cd build - cmake .. -DGGML_AVX2=ON -DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" -DSD_BUILD_SHARED_LIBS=ON - cmake --build . 
--config Release + cmake -B build \ + -DCMAKE_INSTALL_RPATH='@loader_path' \ + -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \ + -DGGML_METAL=ON \ + -DGGML_METAL_EMBED_LIBRARY=ON \ + -DGGML_RPC=ON + cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) - name: Get commit hash id: commit @@ -318,18 +348,12 @@ jobs: strategy: matrix: include: - - build: "noavx" - defines: "-DGGML_NATIVE=OFF -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DSD_BUILD_SHARED_LIBS=ON" - - build: "avx2" - defines: "-DGGML_NATIVE=OFF -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON" - - build: "avx" - defines: "-DGGML_NATIVE=OFF -DGGML_AVX=ON -DGGML_AVX2=OFF -DSD_BUILD_SHARED_LIBS=ON" - - build: "avx512" - defines: "-DGGML_NATIVE=OFF -DGGML_AVX512=ON -DGGML_AVX=ON -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON" + - build: "cpu" + defines: "-DGGML_NATIVE=OFF -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=ON" - build: "cuda12" - defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES='61;70;75;80;86;89;90;100;120' -DCMAKE_CUDA_FLAGS='-Xcudafe \"--diag_suppress=177\" -Xcudafe \"--diag_suppress=550\"'" + defines: "-DGGML_NATIVE=OFF -DGGML_RPC=ON -DGGML_CUDA=ON -DGGML_BACKEND_DL=ON -DCMAKE_CUDA_ARCHITECTURES='61;70;75;80;86;89;90;100;120' -DCMAKE_CUDA_FLAGS='-Xcudafe \"--diag_suppress=177\" -Xcudafe \"--diag_suppress=550\"'" - build: "vulkan" - defines: "-DSD_VULKAN=ON -DSD_BUILD_SHARED_LIBS=ON" + defines: "-DGGML_NATIVE=OFF -DGGML_RPC=ON -DGGML_VULKAN=ON -DGGML_BACKEND_DL=ON" steps: - name: Clone id: checkout @@ -369,26 +393,19 @@ jobs: id: msvc_dev_cmd uses: ilammy/msvc-dev-cmd@v1 + - name: ccache + uses: ggml-org/ccache-action@v1.2.16 + with: + key: windows-cmake-${{ matrix.build }} + evict-old-files: 1d + - name: Build id: cmake_build run: | mkdir build cd build - cmake .. -DCMAKE_CXX_FLAGS='/bigobj' -G Ninja -DCMAKE_C_COMPILER=cl.exe -DCMAKE_CXX_COMPILER=cl.exe -DCMAKE_BUILD_TYPE=Release ${{ matrix.defines }} - cmake --build . 
- - - name: Check AVX512F support - id: check_avx512f - if: ${{ matrix.build == 'avx512' }} - continue-on-error: true - run: | - cd build - $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath) - $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim())) - $cl = $(join-path $msvc 'bin\Hostx64\x64\cl.exe') - echo 'int main(void){unsigned int a[4];__cpuid(a,7);return !(a[1]&65536);}' >> avx512f.c - & $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main - .\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO" + cmake .. -DCMAKE_CXX_FLAGS='/bigobj' -G Ninja -DCMAKE_C_COMPILER=cl.exe -DCMAKE_CXX_COMPILER=cl.exe ${{ matrix.defines }} + cmake --build . --config Release - name: Get commit hash id: commit @@ -440,18 +457,25 @@ jobs: path: | sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip - windows-latest-cmake-hip: + windows-latest-rocm: runs-on: windows-2022 env: - HIPSDK_INSTALLER_VERSION: "25.Q3" - GPU_TARGETS: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032" + HIPSDK_INSTALLER_VERSION: "26.Q1" + GPU_TARGETS: "gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032" steps: - uses: actions/checkout@v3 with: submodules: recursive + - name: Grab rocWMMA package + id: grab_rocwmma + run: | + curl -o rocwmma.deb "https://repo.radeon.com/rocm/apt/7.2/pool/main/r/rocwmma-dev/rocwmma-dev_2.2.0.70200-43~24.04_amd64.deb" + 7z x rocwmma.deb + 7z x data.tar + - name: Setup Node uses: actions/setup-node@v4 with: @@ -472,7 +496,7 @@ jobs: - name: ccache uses: ggml-org/ccache-action@v1.2.16 with: - key: windows-latest-cmake-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-x64 + key: windows-latest-rocm-${{ env.HIPSDK_INSTALLER_VERSION }} evict-old-files: 1d - name: Install ROCm @@ 
-480,7 +504,7 @@ jobs: run: | $ErrorActionPreference = "Stop" write-host "Downloading AMD HIP SDK Installer" - Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" + Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe" write-host "Installing AMD HIP SDK" $proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru $completed = $proc.WaitForExit(600000) @@ -496,6 +520,7 @@ jobs: write-host "Completed AMD HIP SDK installation" - name: Verify ROCm + id: verify run: | # Find and test ROCm installation $clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1 @@ -504,61 +529,66 @@ jobs: exit 1 } & $clangPath.FullName --version - # Set HIP_PATH environment variable for later steps - echo "HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)" >> $env:GITHUB_ENV - name: Build run: | - mkdir build - cd build + $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path) $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}" - cmake .. ` - -G "Unix Makefiles" ` - -DSD_HIPBLAS=ON ` - -DSD_BUILD_SHARED_LIBS=ON ` - -DGGML_NATIVE=OFF ` - -DCMAKE_C_COMPILER=clang ` - -DCMAKE_CXX_COMPILER=clang++ ` + cmake -G "Unix Makefiles" -B build -S . ` + -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" ` + -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" ` + -DCMAKE_CXX_FLAGS="-I$($PWD.Path.Replace('\', '/'))/opt/rocm-7.2.0/include/ -Wno-ignored-attributes -Wno-nested-anon-types" ` -DCMAKE_BUILD_TYPE=Release ` - -DGPU_TARGETS="${{ env.GPU_TARGETS }}" - cmake --build . 
--config Release --parallel ${env:NUMBER_OF_PROCESSORS} + -DGGML_BACKEND_DL=ON ` + -DGGML_NATIVE=OFF ` + -DGPU_TARGETS="${{ env.GPU_TARGETS }}" ` + -DGGML_HIP_ROCWMMA_FATTN=ON ` + -DGGML_HIP=ON ` + -DLLAMA_BUILD_BORINGSSL=ON + cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} + md "build\bin\rocblas\library\" + md "build\bin\hipblaslt\library" + cp "${env:HIP_PATH}\bin\libhipblas.dll" "build\bin\" + cp "${env:HIP_PATH}\bin\libhipblaslt.dll" "build\bin\" + cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\" + cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\" + cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\" - name: Get commit hash id: commit if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: prompt/actions-commit-hash@v2 + uses: pr-mpt/actions-commit-hash@v2 - name: Pack artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | - md "build\bin\rocblas\library\" - md "build\bin\hipblaslt\library" - cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\" - cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\" - cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\" - cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\" - cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\" - 7z a sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip .\build\bin\* + 7z a -snl sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-x64.zip .\build\bin\* - name: Upload artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} uses: actions/upload-artifact@v4 with: - name: sd-${{ 
env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-x64.zip path: | - sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-x64.zip + sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-x64.zip ubuntu-latest-rocm: - runs-on: ubuntu-latest - container: rocm/dev-ubuntu-24.04:7.2 + runs-on: ubuntu-24.04 env: - ROCM_VERSION: "7.2" UBUNTU_VERSION: "24.04" - GPU_TARGETS: "gfx1151;gfx1150;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" + + strategy: + matrix: + include: + - ROCM_VERSION: "7.2" + GPU_TARGETS: "gfx908;gfx90a;gfx942;gfx1100;gfx1101;gfx1151;gfx1150;gfx1200;gfx1201" + build: 'x64' + - ROCM_VERSION: "7.11.0" + GPU_TARGETS: "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201" + build: x64 steps: - - run: apt-get update && apt-get install -y git - name: Clone id: checkout uses: actions/checkout@v6 @@ -575,64 +605,49 @@ jobs: with: version: 10.15.1 - - name: Free disk space - run: | - # Remove preinstalled SDKs and caches not needed for this job - sudo rm -rf /usr/share/dotnet || true - sudo rm -rf /usr/local/lib/android || true - sudo rm -rf /opt/ghc || true - sudo rm -rf /usr/local/.ghcup || true - sudo rm -rf /opt/hostedtoolcache || true - - # Remove old package lists and caches - sudo rm -rf /var/lib/apt/lists/* || true - sudo apt clean + - name: ccache + uses: ggml-org/ccache-action@v1.2.16 + with: + key: ubuntu-rocm-cmake-${{ matrix.ROCM_VERSION }}-${{ matrix.build }} + evict-old-files: 1d - name: Dependencies id: depends run: | - sudo apt-get update - sudo apt install -y \ - cmake \ - hip-dev \ - hipblas-dev \ - ninja-build \ - rocm-dev \ - zip - # Clean apt caches to recover disk space - sudo apt clean - sudo rm -rf /var/lib/apt/lists/* || true - - - name: Setup ROCm Environment + sudo apt install -y build-essential cmake wget zip ninja-build + + - name: Setup Legacy ROCm + if: matrix.ROCM_VERSION == 
'7.2' + id: legacy_env run: | - # Add ROCm to PATH for current session - echo "/opt/rocm/bin" >> $GITHUB_PATH - - # Build regex pattern from ${{ env.GPU_TARGETS }} (match target as substring) - TARGET_REGEX="($(printf '%s' "${{ env.GPU_TARGETS }}" | sed 's/;/|/g'))" - - # Remove library files for architectures we're not building for to save disk space - echo "Cleaning up unneeded architecture files..." - cd /opt/rocm/lib/rocblas/library - # Keep only our target architectures - for file in *; do - if printf '%s' "$file" | grep -q 'gfx'; then - if ! printf '%s' "$file" | grep -Eq "$TARGET_REGEX"; then - echo "Removing $file" && - sudo rm -f "$file"; - fi - fi - done - - cd /opt/rocm/lib/hipblaslt/library - for file in *; do - if printf '%s' "$file" | grep -q 'gfx'; then - if ! printf '%s' "$file" | grep -Eq "$TARGET_REGEX"; then - echo "Removing $file" && - sudo rm -f "$file"; - fi - fi - done + sudo mkdir --parents --mode=0755 /etc/apt/keyrings + wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \ + gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null + + sudo tee /etc/apt/sources.list.d/rocm.list << EOF + deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/${{ matrix.ROCM_VERSION }} noble main + EOF + + sudo tee /etc/apt/preferences.d/rocm-pin-600 << EOF + Package: * + Pin: release o=repo.radeon.com + Pin-Priority: 600 + EOF + + sudo apt update + sudo apt-get install -y libssl-dev rocm-hip-sdk + + - name: Setup TheRock + if: matrix.ROCM_VERSION != '7.2' + id: therock_env + run: | + wget https://repo.amd.com/rocm/tarball/therock-dist-linux-gfx1151-${{ matrix.ROCM_VERSION }}.tar.gz + mkdir install + tar -xf *.tar.gz -C install + export ROCM_PATH=$(pwd)/install + echo ROCM_PATH=$ROCM_PATH >> $GITHUB_ENV + echo PATH=$PATH:$ROCM_PATH/bin >> $GITHUB_ENV + echo LD_LIBRARY_PATH=$ROCM_PATH/lib:$ROCM_PATH/llvm/lib:$ROCM_PATH/lib/rocprofiler-systems >> $GITHUB_ENV - name: Build id: cmake_build @@ -640,15 +655,17 @@ jobs: mkdir 
build cd build cmake .. -G Ninja \ - -DCMAKE_CXX_COMPILER=amdclang++ \ - -DCMAKE_C_COMPILER=amdclang \ + -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \ + -DCMAKE_HIP_FLAGS="-mllvm --amdgpu-unroll-threshold-local=600" \ -DCMAKE_BUILD_TYPE=Release \ - -DSD_HIPBLAS=ON \ - -DGPU_TARGETS="${{ env.GPU_TARGETS }}" \ - -DAMDGPU_TARGETS="${{ env.GPU_TARGETS }}" \ + -DGGML_HIP=ON \ + -DHIP_PLATFORM=amd \ + -DGPU_TARGETS="${{ matrix.GPU_TARGETS }}" \ -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \ -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ - -DSD_BUILD_SHARED_LIBS=ON + -DGGML_BACKEND_DL=ON \ + -DGGML_NATIVE=OFF \ + -DGGML_RPC=ON cmake --build . --config Release - name: Get commit hash @@ -664,16 +681,6 @@ jobs: cp ggml/LICENSE ./build/bin/ggml.txt cp LICENSE ./build/bin/stable-diffusion.cpp.txt - # Move ROCm runtime libraries (to avoid double space consumption) - sudo mv /opt/rocm/lib/librocsparse.so* ./build/bin/ - sudo mv /opt/rocm/lib/libhsa-runtime64.so* ./build/bin/ - sudo mv /opt/rocm/lib/libamdhip64.so* ./build/bin/ - sudo mv /opt/rocm/lib/libhipblas.so* ./build/bin/ - sudo mv /opt/rocm/lib/libhipblaslt.so* ./build/bin/ - sudo mv /opt/rocm/lib/librocblas.so* ./build/bin/ - sudo mv /opt/rocm/lib/rocblas/ ./build/bin/ - sudo mv /opt/rocm/lib/hipblaslt/ ./build/bin/ - - name: Fetch system info id: system-info run: | @@ -688,15 +695,15 @@ jobs: run: | cp ggml/LICENSE ./build/bin/ggml.txt cp LICENSE ./build/bin/stable-diffusion.cpp.txt - zip -y -r sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip ./build/bin + zip -y -r sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm-${{ matrix.ROCM_VERSION }}.zip ./build/bin - name: Upload artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || 
github.event.inputs.create_release == 'true' }} uses: actions/upload-artifact@v4 with: - name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip + name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm-${{ matrix.ROCM_VERSION }}.zip path: | - sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm.zip + sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-Ubuntu-${{ env.UBUNTU_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}-rocm-${{ matrix.ROCM_VERSION }}.zip release: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} @@ -710,7 +717,7 @@ jobs: - build-and-push-docker-images - macOS-latest-cmake - windows-latest-cmake - - windows-latest-cmake-hip + - windows-latest-rocm steps: - name: Clone diff --git a/CMakeLists.txt b/CMakeLists.txt index 9098f827b..5f82f4bb5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,6 +8,8 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") endif() +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/") + if (MSVC) add_compile_definitions(_CRT_SECURE_NO_WARNINGS) add_compile_definitions(_SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING) @@ -22,62 +24,85 @@ else() set(SD_STANDALONE OFF) endif() +if (MINGW) + set(BUILD_SHARED_LIBS_DEFAULT OFF) + else() + set(BUILD_SHARED_LIBS_DEFAULT ON) +endif() + +option(BUILD_SHARED_LIBS "build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT}) + +if (WIN32) 
+ add_compile_definitions(_CRT_SECURE_NO_WARNINGS) +endif() + +if (MSVC) + add_compile_options("$<$<COMPILE_LANGUAGE:C>:/utf-8>") + add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/utf-8>") + add_compile_options("$<$<COMPILE_LANGUAGE:C>:/bigobj>") + add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>") +endif() + +if (SD_STANDALONE) + # enable parallel builds for msbuild + list(APPEND CMAKE_VS_GLOBALS UseMultiToolTask=true) + list(APPEND CMAKE_VS_GLOBALS EnforceProcessCountAcrossBuilds=true) +endif() + + # # Option list # - # general #option(SD_BUILD_TESTS "sd: build tests" ${SD_STANDALONE}) option(SD_BUILD_EXAMPLES "sd: build examples" ${SD_STANDALONE}) option(SD_WEBP "sd: enable WebP image I/O support" ON) -option(SD_CUDA "sd: cuda backend" OFF) -option(SD_HIPBLAS "sd: rocm backend" OFF) -option(SD_METAL "sd: metal backend" OFF) -option(SD_VULKAN "sd: vulkan backend" OFF) -option(SD_OPENCL "sd: opencl backend" OFF) -option(SD_SYCL "sd: sycl backend" OFF) -option(SD_MUSA "sd: musa backend" OFF) -option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF) -option(SD_BUILD_SHARED_GGML_LIB "sd: build ggml as a separate shared lib" OFF) option(SD_USE_SYSTEM_GGML "sd: use system-installed GGML library" OFF) #option(SD_BUILD_SERVER "sd: build server example" ON) -if(SD_CUDA) - message("-- Use CUDA as backend stable-diffusion") - set(GGML_CUDA ON) - add_definitions(-DSD_USE_CUDA) -endif() +# Required for relocatable CMake package +include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake) +include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake) -if(SD_METAL) - message("-- Use Metal as backend stable-diffusion") - set(GGML_METAL ON) - add_definitions(-DSD_USE_METAL) +if (NOT DEFINED SD_BUILD_NUMBER) + set(SD_BUILD_NUMBER ${BUILD_NUMBER}) endif() +if (NOT DEFINED SD_BUILD_COMMIT) + set(SD_BUILD_COMMIT ${BUILD_COMMIT}) +endif() +set(SD_INSTALL_VERSION 0.0.${SD_BUILD_NUMBER}) -if (SD_VULKAN) - message("-- Use Vulkan as backend stable-diffusion") - set(GGML_VULKAN ON) - add_definitions(-DSD_USE_VULKAN) -endif () - -if (SD_OPENCL) - message("-- Use OpenCL as 
backend stable-diffusion") - set(GGML_OPENCL ON) - add_definitions(-DSD_USE_OPENCL) -endif () - -if (SD_HIPBLAS) - message("-- Use HIPBLAS as backend stable-diffusion") - set(GGML_HIP ON) - add_definitions(-DSD_USE_CUDA) -endif () +# override ggml options +set(GGML_ALL_WARNINGS ${SD_ALL_WARNINGS}) +set(GGML_FATAL_WARNINGS ${SD_FATAL_WARNINGS}) -if(SD_MUSA) - message("-- Use MUSA as backend stable-diffusion") - set(GGML_MUSA ON) - add_definitions(-DSD_USE_CUDA) +if (NOT DEFINED GGML_CUDA_GRAPHS) + set(GGML_CUDA_GRAPHS_DEFAULT ON) endif() +# Ref: https://github.com/ggml-org/llama.cpp/blob/master/CMakeLists.txt#L145 +# transition helpers +function (sd_option_depr TYPE OLD) + if (${OLD}) + set(NEW "${ARGV2}") + if(NEW) + message(${TYPE} "${OLD} is deprecated, use ${NEW} instead") + set(${NEW} ON PARENT_SCOPE) + else() + message(${TYPE} "${OLD} is deprecated and will be ignored") + endif() + endif() +endfunction() + +sd_option_depr(FATAL_ERROR SD_HIPBLAS GGML_HIP) +sd_option_depr(FATAL_ERROR SD_BUILD_SHARED_LIBS BUILD_SHARED_LIBS) +sd_option_depr(FATAL_ERROR SD_BUILD_SHARED_GGML_LIB BUILD_SHARED_LIBS) +sd_option_depr(WARNING SD_CUDA GGML_CUDA) +sd_option_depr(WARNING SD_METAL GGML_METAL) +sd_option_depr(WARNING SD_VULKAN GGML_VULKAN) +sd_option_depr(WARNING SD_OPENCL GGML_OPENCL) +sd_option_depr(WARNING SD_SYCL GGML_SYCL) +sd_option_depr(WARNING SD_MUSA GGML_MUSA) if(SD_WEBP) add_compile_definitions(SD_USE_WEBP) endif() @@ -124,29 +149,9 @@ set_property( SDCPP_BUILD_COMMIT=${SDCPP_BUILD_COMMIT} SDCPP_BUILD_VERSION=${SDCPP_BUILD_VERSION} ) -if(SD_BUILD_SHARED_LIBS) - message("-- Build shared library") - message(${SD_LIB_SOURCES}) - if(NOT SD_BUILD_SHARED_GGML_LIB) - set(BUILD_SHARED_LIBS OFF) - endif() - add_library(${SD_LIB} SHARED ${SD_LIB_SOURCES}) - add_definitions(-DSD_BUILD_SHARED_LIB) - target_compile_definitions(${SD_LIB} PRIVATE -DSD_BUILD_DLL) - set(CMAKE_POSITION_INDEPENDENT_CODE ON) -else() - message("-- Build static library") - if(NOT 
SD_BUILD_SHARED_GGML_LIB) - set(BUILD_SHARED_LIBS OFF) - endif() - add_library(${SD_LIB} STATIC ${SD_LIB_SOURCES}) -endif() - -if(SD_SYCL) - message("-- Use SYCL as backend stable-diffusion") - set(GGML_SYCL ON) +# Is this needed? +if(GGML_SYCL) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl") - add_definitions(-DSD_USE_SYCL) # disable fast-math on host, see: # https://www.intel.com/content/www/us/en/docs/cpp-compiler/developer-guide-reference/2021-10/fp-model-fp.html if (WIN32) @@ -155,7 +160,7 @@ if(SD_SYCL) set(SYCL_COMPILE_OPTIONS -fp-model=precise) endif() message("-- Turn off fast-math for host in SYCL backend") - target_compile_options(${SD_LIB} PRIVATE ${SYCL_COMPILE_OPTIONS}) + list(APPEND SD_TARGET_PRIVATE_COMPILE_OPTIONS ${SYCL_COMPILE_OPTIONS}) endif() set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) @@ -180,18 +185,52 @@ if (NOT TARGET ggml) endif() add_subdirectory(thirdparty) - -target_link_libraries(${SD_LIB} PUBLIC ggml zip) -target_include_directories(${SD_LIB} PUBLIC . include) -target_include_directories(${SD_LIB} PUBLIC . 
thirdparty) -target_compile_features(${SD_LIB} PUBLIC c_std_11 cxx_std_17) - +add_subdirectory(src) if (SD_BUILD_EXAMPLES) add_subdirectory(examples) endif() -set(SD_PUBLIC_HEADERS include/stable-diffusion.h) -set_target_properties(${SD_LIB} PROPERTIES PUBLIC_HEADER "${SD_PUBLIC_HEADERS}") +include(GNUInstallDirs) +include(CMakePackageConfigHelpers) + +set(SD_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files") +set(SD_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files") +set(SD_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files") +set(SD_PUBLIC_HEADERS + ${CMAKE_CURRENT_SOURCE_DIR}/include/stable-diffusion.h +) -install(TARGETS ${SD_LIB} LIBRARY PUBLIC_HEADER) +set_target_properties(${SD_LIB} + PROPERTIES + PUBLIC_HEADER "${SD_PUBLIC_HEADERS}") + +install(TARGETS ${SD_LIB} + RUNTIME DESTINATION ${SD_BIN_INSTALL_DIR} + LIBRARY DESTINATION ${SD_LIB_INSTALL_DIR} + ARCHIVE DESTINATION ${SD_LIB_INSTALL_DIR} + PUBLIC_HEADER DESTINATION ${SD_INCLUDE_INSTALL_DIR}) + +configure_package_config_file( + ${CMAKE_CURRENT_SOURCE_DIR}/cmake/sd-config.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/sd-config.cmake + INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/sd + PATH_VARS SD_INCLUDE_INSTALL_DIR + SD_LIB_INSTALL_DIR + SD_BIN_INSTALL_DIR ) + +write_basic_package_version_file( + ${CMAKE_CURRENT_BINARY_DIR}/sd-version.cmake + VERSION ${SD_INSTALL_VERSION} + COMPATIBILITY SameMajorVersion) + +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/sd-config.cmake + ${CMAKE_CURRENT_BINARY_DIR}/sd-version.cmake + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/sd) + +configure_file(cmake/sd.pc.in + "${CMAKE_CURRENT_BINARY_DIR}/sd.pc" + @ONLY) + +install(FILES "${CMAKE_CURRENT_BINARY_DIR}/sd.pc" + DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) diff --git a/Dockerfile.cuda b/Dockerfile.cuda index 4deb72477..ccbdb84c1 100644 --- a/Dockerfile.cuda +++ b/Dockerfile.cuda @@ -10,7 +10,7 @@ WORKDIR /sd.cpp COPY . . 
ARG CUDACXX=/usr/local/cuda/bin/nvcc -RUN cmake . -B ./build -DSD_CUDA=ON +RUN cmake . -B ./build -DGGML_CUDA=ON -DGGML_RPC=ON -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined RUN cmake --build ./build --config Release -j$(nproc) FROM nvidia/cuda:${CUDA_VERSION}-cudnn-runtime-ubuntu${UBUNTU_VERSION} AS runtime diff --git a/Dockerfile.musa b/Dockerfile.musa index 2d95f817f..528645373 100644 --- a/Dockerfile.musa +++ b/Dockerfile.musa @@ -13,7 +13,8 @@ RUN mkdir build && cd build && \ cmake .. -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ \ -DCMAKE_C_FLAGS="${CMAKE_C_FLAGS} -fopenmp -I/usr/lib/llvm-14/lib/clang/14.0.0/include -L/usr/lib/llvm-14/lib" \ -DCMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -fopenmp -I/usr/lib/llvm-14/lib/clang/14.0.0/include -L/usr/lib/llvm-14/lib" \ - -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release && \ + -DGGML_MUSA=ON -DGGML_RPC=ON -DCMAKE_BUILD_TYPE=Release && \ + cmake --build . --config Release FROM mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64 as runtime diff --git a/Dockerfile.sycl b/Dockerfile.sycl index 466d5517c..88036362c 100644 --- a/Dockerfile.sycl +++ b/Dockerfile.sycl @@ -9,7 +9,7 @@ WORKDIR /sd.cpp COPY . . RUN mkdir build && cd build && \ - cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DSD_SYCL=ON -DCMAKE_BUILD_TYPE=Release && \ + cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL=ON -DGGML_RPC=ON -DCMAKE_BUILD_TYPE=Release && \ cmake --build . --config Release -j$(nproc) FROM intel/oneapi-basekit:${SYCL_VERSION}-devel-ubuntu24.04 AS runtime diff --git a/Dockerfile.vulkan b/Dockerfile.vulkan index 5ba6cb05d..5ba2d163d 100644 --- a/Dockerfile.vulkan +++ b/Dockerfile.vulkan @@ -8,7 +8,7 @@ WORKDIR /sd.cpp COPY . . -RUN cmake . -B ./build -DSD_VULKAN=ON +RUN cmake . 
-B ./build -DGGML_VULKAN=ON -DGGML_RPC=ON RUN cmake --build ./build --config Release --parallel FROM ubuntu:$UBUNTU_VERSION AS runtime diff --git a/cmake/build-info.cmake b/cmake/build-info.cmake new file mode 100644 index 000000000..3194f8159 --- /dev/null +++ b/cmake/build-info.cmake @@ -0,0 +1,48 @@ +set(BUILD_NUMBER 0) +set(BUILD_COMMIT "unknown") +set(BUILD_COMPILER "unknown") +set(BUILD_TARGET "unknown") + +# Look for git +find_package(Git) +if(NOT Git_FOUND) + find_program(GIT_EXECUTABLE NAMES git git.exe) + if(GIT_EXECUTABLE) + set(Git_FOUND TRUE) + message(STATUS "Found Git: ${GIT_EXECUTABLE}") + else() + message(WARNING "Git not found. Build info will not be accurate.") + endif() +endif() + +# Get the commit count and hash +if(Git_FOUND) + execute_process( + COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + OUTPUT_VARIABLE HEAD + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE RES + ) + if (RES EQUAL 0) + set(BUILD_COMMIT ${HEAD}) + endif() + execute_process( + COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + OUTPUT_VARIABLE COUNT + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE RES + ) + if (RES EQUAL 0) + set(BUILD_NUMBER ${COUNT}) + endif() +endif() + +set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") + +if(CMAKE_VS_PLATFORM_NAME) + set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME}) +else() + set(BUILD_TARGET "${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}") +endif() \ No newline at end of file diff --git a/cmake/common.cmake b/cmake/common.cmake new file mode 100644 index 000000000..9176dce60 --- /dev/null +++ b/cmake/common.cmake @@ -0,0 +1,60 @@ +include("ggml/cmake/common.cmake") + +# https://github.com/ggml-org/llama.cpp/blob/master/cmake/common.cmake + +function(sd_add_compile_flags) + if (SD_FATAL_WARNINGS) + if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + list(APPEND C_FLAGS -Werror) + 
list(APPEND CXX_FLAGS -Werror) + elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + add_compile_options(/WX) + endif() + endif() + + if (SD_ALL_WARNINGS) + if (NOT MSVC) + list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes + -Werror=implicit-int -Werror=implicit-function-declaration) + + list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn) + + list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) + + list(APPEND C_FLAGS ${WARNING_FLAGS}) + list(APPEND CXX_FLAGS ${WARNING_FLAGS}) + + ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}) + + add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>" + "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>") + else() + # todo : msvc + set(C_FLAGS "" PARENT_SCOPE) + set(CXX_FLAGS "" PARENT_SCOPE) + endif() + endif() + + if (NOT MSVC) + if (SD_SANITIZE_THREAD) + message(STATUS "Using -fsanitize=thread") + + add_compile_options(-fsanitize=thread) + link_libraries (-fsanitize=thread) + endif() + + if (SD_SANITIZE_ADDRESS) + message(STATUS "Using -fsanitize=address") + + add_compile_options(-fsanitize=address -fno-omit-frame-pointer) + link_libraries (-fsanitize=address) + endif() + + if (SD_SANITIZE_UNDEFINED) + message(STATUS "Using -fsanitize=undefined") + + add_compile_options(-fsanitize=undefined) + link_libraries (-fsanitize=undefined) + endif() + endif() +endfunction() \ No newline at end of file diff --git a/cmake/sd-config.cmake.in b/cmake/sd-config.cmake.in new file mode 100644 index 000000000..7b224924e --- /dev/null +++ b/cmake/sd-config.cmake.in @@ -0,0 +1,30 @@ +set(SD_VERSION @SD_INSTALL_VERSION@) +set(SD_BUILD_COMMIT @SD_BUILD_COMMIT@) +set(SD_BUILD_NUMBER @SD_BUILD_NUMBER@) +set(SD_SHARED_LIB @BUILD_SHARED_LIBS@) + +@PACKAGE_INIT@ + +set_and_check(SD_INCLUDE_DIR "@PACKAGE_SD_INCLUDE_INSTALL_DIR@") +set_and_check(SD_LIB_DIR "@PACKAGE_SD_LIB_INSTALL_DIR@") +set_and_check(SD_BIN_DIR "@PACKAGE_SD_BIN_INSTALL_DIR@") + +find_package(ggml REQUIRED 
HINTS ${SD_LIB_DIR}/cmake) + +find_library(stable-diffusion_LIBRARY stable-diffusion + REQUIRED + HINTS ${SD_LIB_DIR} + NO_CMAKE_FIND_ROOT_PATH +) + +add_library(stable-diffusion UNKNOWN IMPORTED) +set_target_properties(stable-diffusion + PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${SD_INCLUDE_DIR}" + INTERFACE_LINK_LIBRARIES "ggml::ggml;ggml::ggml-base;" + IMPORTED_LINK_INTERFACE_LANGUAGES "CXX" + IMPORTED_LOCATION "${stable-diffusion_LIBRARY}" + INTERFACE_COMPILE_FEATURES "c_std_11;cxx_std_17" + POSITION_INDEPENDENT_CODE ON) + +check_required_components(Stable-diffusion) \ No newline at end of file diff --git a/cmake/sd.pc.in b/cmake/sd.pc.in new file mode 100644 index 000000000..f0a5ebc7d --- /dev/null +++ b/cmake/sd.pc.in @@ -0,0 +1,10 @@ +prefix=@CMAKE_INSTALL_PREFIX@ +exec_prefix=@CMAKE_INSTALL_PREFIX@ +libdir=@CMAKE_INSTALL_FULL_LIBDIR@ +includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ + +Name: stable-diffusion +Description: Diffusion model(SD,Flux,Wan,Qwen Image,Z-Image,...) inference in pure C/C++ +Version: @SD_INSTALL_VERSION@ +Libs: -L${libdir} -lggml -lggml-base -lstable-diffusion +Cflags: -I${includedir} \ No newline at end of file diff --git a/docs/hipBLAS_on_Windows.md b/docs/hipBLAS_on_Windows.md index b5105ad19..cd8465721 100644 --- a/docs/hipBLAS_on_Windows.md +++ b/docs/hipBLAS_on_Windows.md @@ -26,12 +26,12 @@ Fortunately, `AMD` provides complete help documentation, you can use the help do Then we must set `ROCM` as environment variables before running cmake. 
-Usually if you install according to the official tutorial and do not modify the ROCM path, then there is a high probability that it is here `C:\Program Files\AMD\ROCm\5.5\bin` +Usually if you install according to the official tutorial and do not modify the ROCM path, then there is a high probability that it is here `C:\Program Files\AMD\ROCm\7.1.1\bin` This is what I use to set the clang: ```Commandline -set CC=C:\Program Files\AMD\ROCm\5.5\bin\clang.exe -set CXX=C:\Program Files\AMD\ROCm\5.5\bin\clang++.exe +set CC=C:\Program Files\AMD\ROCm\7.1.1\bin\clang.exe +set CXX=C:\Program Files\AMD\ROCm\7.1.1\bin\clang++.exe ``` ## Ninja @@ -46,7 +46,7 @@ set ninja=C:\Program Files\ninja\ninja.exe ## Building stable-diffusion.cpp The thing different from the regular CPU build is `-DSD_HIPBLAS=ON` , -`-G "Ninja"`, `-DCMAKE_C_COMPILER=clang`, `-DCMAKE_CXX_COMPILER=clang++`, `-DAMDGPU_TARGETS=gfx1100` +`-G "Ninja"`, `-DCMAKE_C_COMPILER=clang`, `-DCMAKE_CXX_COMPILER=clang++`, `-DAMDGPU_TARGETS=gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032` >**Notice**: check the `clang` and `clang++` information: ```Commandline @@ -59,26 +59,29 @@ If you see like this, we can continue: clang version 17.0.0 (git@github.amd.com:Compute-Mirrors/llvm-project e3201662d21c48894f2156d302276eb1cf47c7be) Target: x86_64-pc-windows-msvc Thread model: posix -InstalledDir: C:\Program Files\AMD\ROCm\5.5\bin +InstalledDir: C:\Program Files\AMD\ROCm\7.1.1\bin ``` ``` clang version 17.0.0 (git@github.amd.com:Compute-Mirrors/llvm-project e3201662d21c48894f2156d302276eb1cf47c7be) Target: x86_64-pc-windows-msvc Thread model: posix -InstalledDir: C:\Program Files\AMD\ROCm\5.5\bin +InstalledDir: C:\Program Files\AMD\ROCm\7.1.1\bin ``` ->**Notice** that the `gfx1100` is the GPU architecture of my GPU, you can change it to your GPU architecture. 
Click here to see your architecture [LLVM Target](https://rocm.docs.amd.com/en/latest/release/windows_support.html#windows-supported-gpus) +>**Notice** that the GPU targets are now compatible with multiple GPU architectures (ROCm 7.1.1 targets). You can change them to match your GPU architecture. Click here to see your architecture [LLVM Target](https://rocm.docs.amd.com/en/latest/release/windows_support.html#windows-supported-gpus) -My GPU is AMD Radeon™ RX 7900 XTX Graphics, so I set it to `gfx1100`. +Examples: +- AMD Radeon™ RX 7900 XTX Graphics: `gfx1100` +- AMD Radeon™ RX 7900 XT Graphics: `gfx1101` +- AMD Radeon™ RX 7900 GRE Graphics: `gfx1102` option: ```commandline mkdir build cd build -cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=gfx1100 +cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1150;gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032" cmake --build . --config Release ``` diff --git a/docs/rpc.md b/docs/rpc.md new file mode 100644 index 000000000..93296d419 --- /dev/null +++ b/docs/rpc.md @@ -0,0 +1,220 @@ +# Building and Using the RPC Server with `stable-diffusion.cpp` + +This guide covers how to build a version of [the RPC server from `llama.cpp`](https://github.com/ggml-org/llama.cpp/blob/master/tools/rpc/README.md) that is compatible with your version of `stable-diffusion.cpp` to manage multi-backends setups. RPC allows you to offload specific model components to a remote server. + +> **Note on Model Location:** The model files (e.g., `.safetensors` or `.gguf`) remain on the **Client** machine. The client parses the file and transmits the necessary tensor data and computational graphs to the server. The server does not need to store the model files locally. + +## 1. 
Building `stable-diffusion.cpp` with RPC client + +First, you should build the client application from source. It requires `GGML_RPC=ON` to include the RPC backend in your client. + +```bash +mkdir build +cd build +cmake .. \ + -DGGML_RPC=ON \ + # Add other build flags here (e.g., -DGGML_VULKAN=ON) +cmake --build . --config Release -j $(nproc) +``` + +> **Note:** Ensure you add the other flags you would normally use (e.g., `-DGGML_VULKAN=ON`, `-DGGML_CUDA=ON`, `-DGGML_HIP=ON`, or `-DGGML_METAL=ON`). For more information about building `stable-diffusion.cpp` from source, please refer to the [build.md](build.md) documentation. + +## 2. Ensure `llama.cpp` is at the correct commit + +`stable-diffusion.cpp`'s RPC client is designed to work with a specific version of `llama.cpp` (compatible with the `ggml` submodule) to ensure API compatibility. The commit hash for `llama.cpp` is stored in `ggml/scripts/sync-llama.last`. + +> **Start from Root:** Perform these steps from the root of your `stable-diffusion.cpp` directory. + +1. Read the target commit hash from the submodule tracker: + + ```bash + # Linux / WSL / MacOS + HASH=$(cat ggml/scripts/sync-llama.last) + + # Windows (PowerShell) + $HASH = Get-Content -Path "ggml\scripts\sync-llama.last" + ``` + +2. Clone `llama.cpp` at the target commit. + ```bash + git clone https://github.com/ggml-org/llama.cpp.git + cd llama.cpp + git checkout $HASH + ``` + To save on download time and storage, you can use a shallow clone to download only the target commit: + ```bash + mkdir -p llama.cpp + cd llama.cpp + git init + git remote add origin https://github.com/ggml-org/llama.cpp.git + git fetch --depth 1 origin $HASH + git checkout FETCH_HEAD + ``` + +## 3. Build `llama.cpp` (RPC Server) + +The RPC server acts as the worker. You must explicitly enable the **backend** (the hardware interface, such as CUDA for Nvidia, Metal for Apple Silicon, or Vulkan) when building, otherwise the server will default to using only the CPU. 
 + +To find the correct flags for your system, refer to the official documentation for the [`llama.cpp`](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md) repository. + +> **Crucial:** You must include the compiler flags required to satisfy the API compatibility with `stable-diffusion.cpp` (`-DGGML_MAX_NAME=128`). Without this flag, `GGML_MAX_NAME` will default to `64` for the server, and data transfers between the client and server will fail. Of course, `-DGGML_RPC` must also be enabled. +> +> It is recommended to disable the `LLAMA_CURL` flag to avoid unnecessary dependencies, and to disable shared library builds to avoid potential conflicts. + +> **Build Target:** We are specifically building the `rpc-server` target. This prevents the build system from compiling the entire `llama.cpp` suite (like `llama-server`), making the build significantly faster. + +### Linux / WSL (Vulkan) + +```bash +mkdir build +cd build +# Ensure the backend (here Vulkan) is enabled +cmake .. -DGGML_RPC=ON \ + -DGGML_VULKAN=ON \ + -DGGML_BUILD_SHARED_LIBS=OFF \ + -DLLAMA_CURL=OFF \ + -DCMAKE_C_FLAGS=-DGGML_MAX_NAME=128 \ + -DCMAKE_CXX_FLAGS=-DGGML_MAX_NAME=128 +cmake --build . --config Release --target rpc-server -j $(nproc) +``` + +### macOS (Metal) + +```bash +mkdir build +cd build +cmake .. -DGGML_RPC=ON \ + -DGGML_METAL=ON \ + -DGGML_BUILD_SHARED_LIBS=OFF \ + -DLLAMA_CURL=OFF \ + -DCMAKE_C_FLAGS=-DGGML_MAX_NAME=128 \ + -DCMAKE_CXX_FLAGS=-DGGML_MAX_NAME=128 +cmake --build . --config Release --target rpc-server +``` + +### Windows (Visual Studio 2022, Vulkan) + +```powershell +mkdir build +cd build +cmake .. -G "Visual Studio 17 2022" -A x64 ` + -DGGML_RPC=ON ` + -DGGML_VULKAN=ON ` + -DGGML_BUILD_SHARED_LIBS=OFF ` + -DLLAMA_CURL=OFF ` + -DCMAKE_C_FLAGS=-DGGML_MAX_NAME=128 ` + -DCMAKE_CXX_FLAGS=-DGGML_MAX_NAME=128 +cmake --build . --config Release --target rpc-server +``` + +## 4. Usage + +Once both applications are built, you can run the server and the client to manage your GPU allocation. 
+ +### Step A: Run the RPC Server + +Start the server. It listens for connections on the default address (usually `localhost:50052`). If your server is on a different machine, ensure the server binds to the correct interface and your firewall allows the connection. + +**On the Server :** +If running on the same machine, you can use the default address: + +```bash +./rpc-server +``` + +If you want to allow connections from other machines on the network: + +```bash +./rpc-server --host 0.0.0.0 +``` + +> **Security Warning:** The RPC server does not currently support authentication or encryption. **Only run the server on trusted local networks**. Never expose the RPC server directly to the open internet. + +> **Drivers & Hardware:** Ensure the Server machine has the necessary drivers installed and functional (e.g., Nvidia Drivers for CUDA, Vulkan SDK, or Metal). If no devices are found, the server will simply fallback to CPU usage. + +### Step B: Check if the client is able to connect to the server and see the available devices + +We're assuming the server is running on your local machine, and listening on the default port `50052`. If it's running on a different machine, you can replace `localhost` with the IP address of the server. + +**On the Client:** + +```bash +./sd-cli --rpc localhost:50052 --list-devices +``` + +If the server is running and the client is able to connect, you should see `RPC0 localhost:50052` in the list of devices. + +Example output: +(Client built without GPU acceleration, two GPUs available on the server) + +``` +List of available GGML devices: +Name Description +------------------- +CPU AMD Ryzen 9 5900X 12-Core Processor +RPC0 localhost:50052 +RPC1 localhost:50052 +``` + +### Step C: Run with RPC device + +If everything is working correctly, you can now run the client while offloading some or all of the work to the RPC server. + +Example: Setting the main backend to the RPC0 device for doing all the work on the server. 
 + +```bash +./sd-cli -m models/sd1.5.safetensors -p "A cat" --rpc localhost:50052 --main-backend-device RPC0 +``` + +--- + +## 5. Scaling: Multiple RPC Servers + +You can connect the client to multiple RPC servers simultaneously to scale out your hardware usage. + +Example: A main machine (192.168.1.10) with 3 GPUs, with one GPU running CUDA and the other two running Vulkan, and a second machine (192.168.1.11) with only one GPU. + +**On the first machine (Running two server instances):** + +**Terminal 1 (CUDA):** + +```bash +# Linux / WSL +export CUDA_VISIBLE_DEVICES=0 +cd ./build_cuda/bin/Release +./rpc-server --host 0.0.0.0 + +# Windows PowerShell +$env:CUDA_VISIBLE_DEVICES="0" +cd .\build_cuda\bin\Release +./rpc-server --host 0.0.0.0 +``` + +**Terminal 2 (Vulkan):** + +```bash +cd ./build_vulkan/bin/Release +# ignore the first GPU (used by CUDA server) +./rpc-server --host 0.0.0.0 --port 50053 -d Vulkan1,Vulkan2 +``` + +**On the second machine:** + +```bash +cd ./build/bin/Release +./rpc-server --host 0.0.0.0 +``` + +**On the Client:** +Pass multiple server addresses separated by commas. + +```bash +./sd-cli --rpc 192.168.1.10:50052,192.168.1.10:50053,192.168.1.11:50052 --list-devices +``` + +The client will map these servers to sequential device IDs (e.g., RPC0 from the first server, RPC1 and RPC2 from the second, and RPC3 from the third). With this setup, you could for example use RPC0 for the main backend, RPC1 and RPC2 for the text encoders, and RPC3 for the VAE. + +--- + +## 6. Performance Considerations + +RPC performance is heavily dependent on network bandwidth, as large weights and activations must be transferred back and forth over the network, especially for large models, or when using high resolutions. For best results, ensure your network connection is stable and has sufficient bandwidth (>1Gbps recommended). 
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 2dcd1d53a..29cef50fa 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,4 +1,7 @@ +sd_add_compile_flags() + include_directories(${CMAKE_CURRENT_SOURCE_DIR}) +include_directories(${PROJECT_SOURCE_DIR}/thirdparty) add_subdirectory(cli) -add_subdirectory(server) \ No newline at end of file +add_subdirectory(server) diff --git a/examples/cli/README.md b/examples/cli/README.md index 25fcce692..d2bffd14e 100644 --- a/examples/cli/README.md +++ b/examples/cli/README.md @@ -24,6 +24,8 @@ CLI Options: -M, --mode run mode, one of [img_gen, vid_gen, upscale, convert, metadata], default: img_gen --preview preview method. must be one of the following [none, proj, tae, vae] (default is none) -h, --help show this help message and exit + --rpc add a rpc device + --list-devices list available ggml compute devices Context Options: -m, --model path to full model @@ -46,6 +48,17 @@ Context Options: --tensor-type-rules weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0") --photo-maker path to PHOTOMAKER model --upscale-model path to esrgan model. + --main-backend-device default device to use for all backends (defaults to main gpu device if hardware acceleration is available, otherwise + cpu) + --diffusion-backend-device device to use for diffusion (defaults to main-backend-device) + --clip-backend-device device to use for clip (defaults to main-backend-device). Can be a comma-separated list of devices for models with + multiple encoders + --vae-backend-device device to use for vae (defaults to main-backend-device). 
Also applies to tae, unless tae-backend-device is specified + --tae-backend-device device to use for tae (defaults to vae-backend-device) + --control-net-backend-device device to use for control net (defaults to main-backend-device) + --upscaler-backend-device device to use for upscaling models (defaults to main-backend-device) + --photomaker-backend-device device to use for photomaker (defaults to main-backend-device) + --vision-backend-device device to use for clip-vision model (defaults to main-backend-device) -t, --threads number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of CPU physical cores --chroma-t5-mask-pad t5 mask pad size of chroma @@ -54,9 +67,6 @@ Context Options: --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed --mmap whether to memory-map model - --control-net-cpu keep controlnet in cpu (for low vram) - --clip-on-cpu keep clip in cpu (for low vram) - --vae-on-cpu keep vae in cpu (for low vram) --fa use flash attention --diffusion-fa use flash attention in the diffusion model only --diffusion-conv-direct use ggml_conv2d_direct in the diffusion model diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index b4a3c343e..099ed3d14 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -12,6 +12,8 @@ #include #include +#include "ggml.h" + // #include "preprocessing.hpp" #include "stable-diffusion.h" @@ -51,6 +53,7 @@ struct SDCliParams { bool metadata_all = false; bool normal_exit = false; + bool skip_usage = false; ArgOptions get_options() { ArgOptions options; @@ -168,7 +171,28 @@ struct SDCliParams { auto on_help_arg = [&](int argc, const char** argv, int index) { normal_exit = true; - return -1; + return VALID_BREAK_OPT; + }; + + auto on_rpc_arg = [&](int argc, const char** argv, int index) { + if (++index >= argc) { + return -1; + } + const char* 
rpc_device = argv[index]; + add_rpc_device(rpc_device); + return 1; + }; + + auto on_list_devices_arg = [&](int argc, const char** argv, int index) { + size_t buff_size = backend_list_size(); + GGML_ASSERT(buff_size > 0); + char* buff = (char*)malloc(buff_size); + list_backends_to_buffer(buff, buff_size); + printf("List of available GGML devices:\nName\tDescription\n-------------------\n%s\n", buff); + free(buff); + normal_exit = true; + skip_usage = true; + return VALID_BREAK_OPT; }; options.manual_options = { @@ -184,6 +208,14 @@ struct SDCliParams { "--help", "show this help message and exit", on_help_arg}, + {"", + "--rpc", + "add a rpc device", + on_rpc_arg}, + {"", + "--list-devices", + "list available ggml compute devices", + on_list_devices_arg}, }; return options; @@ -253,7 +285,9 @@ void parse_args(int argc, const char** argv, SDCliParams& cli_params, SDContextP std::vector options_vec = {cli_params.get_options(), ctx_params.get_options(), gen_params.get_options()}; if (!parse_options(argc, argv, options_vec)) { - print_usage(argc, argv, options_vec); + if (!cli_params.skip_usage) { + print_usage(argc, argv, options_vec); + } exit(cli_params.normal_exit ? 0 : 1); } @@ -810,7 +844,8 @@ int main(int argc, const char* argv[]) { ctx_params.offload_params_to_cpu, ctx_params.diffusion_conv_direct, ctx_params.n_threads, - gen_params.upscale_tile_size); + gen_params.upscale_tile_size, + ctx_params.upscaler_backend_device.c_str()); if (upscaler_ctx == nullptr) { LOG_ERROR("new_upscaler_ctx failed"); diff --git a/examples/common/common.hpp b/examples/common/common.hpp index 7beef9d58..99c4767cf 100644 --- a/examples/common/common.hpp +++ b/examples/common/common.hpp @@ -25,6 +25,8 @@ namespace fs = std::filesystem; #define SAFE_STR(s) ((s) ? (s) : "") #define BOOL_STR(b) ((b) ? 
"true" : "false") +#define VALID_BREAK_OPT -42 + const char* modes_str[] = { "img_gen", "vid_gen", @@ -275,16 +277,26 @@ static bool parse_options(int argc, const char** argv, const std::vector embedding_map; std::vector embedding_vec; @@ -328,9 +350,6 @@ struct SDContextParams { rng_type_t sampler_rng_type = RNG_TYPE_COUNT; bool offload_params_to_cpu = false; bool enable_mmap = false; - bool control_net_cpu = false; - bool clip_on_cpu = false; - bool vae_on_cpu = false; bool flash_attn = false; bool diffusion_flash_attn = false; bool diffusion_conv_direct = false; @@ -435,6 +454,43 @@ struct SDContextParams { "--upscale-model", "path to esrgan model.", &esrgan_path}, + {"", + "--main-backend-device", + "default device to use for all backends (defaults to main gpu device if hardware acceleration is available, otherwise cpu)", + &main_backend_device}, + {"", + "--diffusion-backend-device", + "device to use for diffusion (defaults to main-backend-device)", + &diffusion_backend_device}, + {"", + "--clip-backend-device", + "device to use for clip (defaults to main-backend-device). Can be a comma-separated list of devices for models with multiple encoders", + &clip_backend_device}, + {"", + "--vae-backend-device", + "device to use for vae (defaults to main-backend-device). 
Also applies to tae, unless tae-backend-device is specified", + &vae_backend_device}, + {"", + "--tae-backend-device", + "device to use for tae (defaults to vae-backend-device)", + &tae_backend_device}, + {"", + "--control-net-backend-device", + "device to use for control net (defaults to main-backend-device)", + &control_net_backend_device}, + {"", + "--upscaler-backend-device", + "device to use for upscaling models (defaults to main-backend-device)", + &upscaler_backend_device}, + {"", + "--photomaker-backend-device", + "device to use for photomaker (defaults to main-backend-device)", + &photomaker_backend_device}, + {"", + "--vision-backend-device", + "device to use for clip-vision model (defaults to main-backend-device)", + &vision_backend_device}, + }; options.int_options = { @@ -464,18 +520,6 @@ struct SDContextParams { "--mmap", "whether to memory-map model", true, &enable_mmap}, - {"", - "--control-net-cpu", - "keep controlnet in cpu (for low vram)", - true, &control_net_cpu}, - {"", - "--clip-on-cpu", - "keep clip in cpu (for low vram)", - true, &clip_on_cpu}, - {"", - "--vae-on-cpu", - "keep vae in cpu (for low vram)", - true, &vae_on_cpu}, {"", "--fa", "use flash attention", @@ -686,6 +730,7 @@ struct SDContextParams { std::string embeddings_str = emb_ss.str(); std::ostringstream oss; + // TODO backend devices oss << "SDContextParams {\n" << " n_threads: " << n_threads << ",\n" << " model_path: \"" << model_path << "\",\n" @@ -711,9 +756,9 @@ struct SDContextParams { << " sampler_rng_type: " << sd_rng_type_name(sampler_rng_type) << ",\n" << " offload_params_to_cpu: " << (offload_params_to_cpu ? "true" : "false") << ",\n" << " enable_mmap: " << (enable_mmap ? "true" : "false") << ",\n" - << " control_net_cpu: " << (control_net_cpu ? "true" : "false") << ",\n" - << " clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n" - << " vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n" + // << " control_net_cpu: " << (control_net_cpu ? 
"true" : "false") << ",\n" + // << " clip_on_cpu: " << (clip_on_cpu ? "true" : "false") << ",\n" + // << " vae_on_cpu: " << (vae_on_cpu ? "true" : "false") << ",\n" << " flash_attn: " << (flash_attn ? "true" : "false") << ",\n" << " diffusion_flash_attn: " << (diffusion_flash_attn ? "true" : "false") << ",\n" << " diffusion_conv_direct: " << (diffusion_conv_direct ? "true" : "false") << ",\n" @@ -769,9 +814,6 @@ struct SDContextParams { lora_apply_mode, offload_params_to_cpu, enable_mmap, - clip_on_cpu, - control_net_cpu, - vae_on_cpu, flash_attn, diffusion_flash_attn, taesd_preview, @@ -784,6 +826,14 @@ struct SDContextParams { chroma_use_t5_mask, chroma_t5_mask_pad, qwen_image_zero_cond_t, + main_backend_device.c_str(), + diffusion_backend_device.c_str(), + clip_backend_device.c_str(), + vae_backend_device.c_str(), + tae_backend_device.c_str(), + control_net_backend_device.c_str(), + photomaker_backend_device.c_str(), + vision_backend_device.c_str(), }; return sd_ctx_params; } diff --git a/examples/server/README.md b/examples/server/README.md index 620586d2e..83ab0268a 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -121,6 +121,17 @@ Context Options: --tensor-type-rules weight type per tensor pattern (example: "^vae\.=f16,model\.=q8_0") --photo-maker path to PHOTOMAKER model --upscale-model path to esrgan model. + --main-backend-device default device to use for all backends (defaults to main gpu device if hardware acceleration is available, otherwise + cpu) + --diffusion-backend-device device to use for diffusion (defaults to main-backend-device) + --clip-backend-device device to use for clip (defaults to main-backend-device). Can be a comma-separated list of devices for models with + multiple encoders + --vae-backend-device device to use for vae (defaults to main-backend-device). 
Also applies to tae, unless tae-backend-device is specified + --tae-backend-device device to use for tae (defaults to vae-backend-device) + --control-net-backend-device device to use for control net (defaults to main-backend-device) + --upscaler-backend-device device to use for upscaling models (defaults to main-backend-device) + --photomaker-backend-device device to use for photomaker (defaults to main-backend-device) + --vision-backend-device device to use for clip-vision model (defaults to main-backend-device) -t, --threads number of threads to use during computation (default: -1). If threads <= 0, then threads will be set to the number of CPU physical cores --chroma-t5-mask-pad t5 mask pad size of chroma @@ -129,9 +140,6 @@ Context Options: --force-sdxl-vae-conv-scale force use of conv scale on sdxl vae --offload-to-cpu place the weights in RAM to save VRAM, and automatically load them into VRAM when needed --mmap whether to memory-map model - --control-net-cpu keep controlnet in cpu (for low vram) - --clip-on-cpu keep clip in cpu (for low vram) - --vae-on-cpu keep vae in cpu (for low vram) --fa use flash attention --diffusion-fa use flash attention in the diffusion model only --diffusion-conv-direct use ggml_conv2d_direct in the diffusion model diff --git a/examples/server/main.cpp b/examples/server/main.cpp index 8d4e644b5..9dfb727f4 100644 --- a/examples/server/main.cpp +++ b/examples/server/main.cpp @@ -837,9 +837,10 @@ void register_sdapi_endpoints(httplib::Server& svr, ServerRuntime& rt) { } } - auto get_sample_method = [](std::string name) -> enum sample_method_t { + auto get_sample_method = [](std::string name)->enum sample_method_t { enum sample_method_t result = str_to_sample_method(name.c_str()); - if (result != SAMPLE_METHOD_COUNT) return result; + if (result != SAMPLE_METHOD_COUNT) + return result; std::transform(name.begin(), name.end(), name.begin(), [](unsigned char c) { return std::tolower(c); }); static const std::unordered_map hardcoded{ @@ 
-859,8 +860,9 @@ void register_sdapi_endpoints(httplib::Server& svr, ServerRuntime& rt) { {"k_res_multistep", RES_MULTISTEP_SAMPLE_METHOD}, {"res 2s", RES_2S_SAMPLE_METHOD}, {"k_res_2s", RES_2S_SAMPLE_METHOD}}; - auto it = hardcoded.find(name); - if (it != hardcoded.end()) return it->second; + auto it = hardcoded.find(name); + if (it != hardcoded.end()) + return it->second; return SAMPLE_METHOD_COUNT; }; diff --git a/format-code.ps1 b/format-code.ps1 new file mode 100644 index 000000000..7f6d00727 --- /dev/null +++ b/format-code.ps1 @@ -0,0 +1,54 @@ +param( + [switch]$DryRun +) + +$ErrorActionPreference = "Stop" + +$repoRoot = $PSScriptRoot +if (-not $repoRoot) { + $repoRoot = (Get-Location).Path +} + +$patterns = @( + "src/*.cpp" + "src/*.h" + "src/*.hpp" + "src/vocab/*.h" + "src/vocab/*.cpp" + "examples/cli/*.cpp" + "examples/common/*.hpp" + "examples/cli/*.h" + "examples/server/*.cpp" +) + +Push-Location $repoRoot +try { + if (-not $DryRun) { + $null = Get-Command clang-format -ErrorAction Stop + } + + foreach ($pattern in $patterns) { + $files = Get-ChildItem -Path $pattern -File -ErrorAction SilentlyContinue | Sort-Object FullName + + foreach ($file in $files) { + $relativePath = $file.FullName.Substring($repoRoot.Length).TrimStart('\', '/') -replace '\\', '/' + + if ($relativePath -like "vocab*") { + continue + } + + Write-Host "formatting '$relativePath'" + + # if ($file.Name -ne "stable-diffusion.h") { + # clang-tidy -fix -p build_linux/ "$relativePath" + # } + + if (-not $DryRun) { + & clang-format -style=file -i $file.FullName + } + } + } +} +finally { + Pop-Location +} diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index f093bb56c..8f9f6834f 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -187,9 +187,9 @@ typedef struct { enum lora_apply_mode_t lora_apply_mode; bool offload_params_to_cpu; bool enable_mmap; - bool keep_clip_on_cpu; - bool keep_control_net_on_cpu; - bool keep_vae_on_cpu; + // bool 
keep_clip_on_cpu; + // bool keep_control_net_on_cpu; + // bool keep_vae_on_cpu; bool flash_attn; bool diffusion_flash_attn; bool tae_preview_only; @@ -202,6 +202,14 @@ typedef struct { bool chroma_use_t5_mask; int chroma_t5_mask_pad; bool qwen_image_zero_cond_t; + const char* main_device; + const char* diffusion_device; + const char* clip_device; + const char* vae_device; + const char* tae_device; + const char* control_net_device; + const char* photomaker_device; + const char* vision_device; } sd_ctx_params_t; typedef struct { @@ -390,7 +398,8 @@ SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path, bool offload_params_to_cpu, bool direct, int n_threads, - int tile_size); + int tile_size, + const char * device); SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx); SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, @@ -416,6 +425,11 @@ SD_API bool preprocess_canny(sd_image_t image, SD_API const char* sd_commit(void); SD_API const char* sd_version(void); +SD_API size_t backend_list_size(void); +SD_API void list_backends_to_buffer(char* buffer, size_t buffer_size); + +SD_API void add_rpc_device(const char* address); + #ifdef __cplusplus } #endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 000000000..b655be6be --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,39 @@ +sd_add_compile_flags() + +# +# libraries +# + +# stable-diffusion +file(GLOB SD_SOURCES "*.cpp" "*.hpp" "*.h") +file(GLOB SD_VOCAB_SOURCES "vocab/*.h" "vocab/*.cpp") + +add_library(${SD_LIB} + ../include/stable-diffusion.h + ${SD_SOURCES} + ${SD_VOCAB_SOURCES} +) + +set_target_properties(${SD_LIB} PROPERTIES + VERSION ${SD_INSTALL_VERSION} + SOVERSION 0 + MACHO_CURRENT_VERSION 0 # keep macOS linker from seeing oversized version number +) + + + +target_include_directories(${SD_LIB} PRIVATE .) 
+target_include_directories(${SD_LIB} PUBLIC ../include) +target_compile_features(${SD_LIB} PRIVATE c_std_11 cxx_std_17) + +if (SD_TARGET_PRIVATE_COMPILE_OPTIONS) + target_compile_options(${SD_LIB} PRIVATE ${SD_TARGET_PRIVATE_COMPILE_OPTIONS}) +endif() + +target_link_libraries(${SD_LIB} PUBLIC ggml PRIVATE zip) + +if (BUILD_SHARED_LIBS) + set_target_properties(${SD_LIB} PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_compile_definitions(${SD_LIB} PRIVATE SD_BUILD_DLL) + target_compile_definitions(${SD_LIB} PUBLIC SD_BUILD_SHARED_LIB ) +endif() diff --git a/src/cache_dit.hpp b/src/cache_dit.hpp index dad67d458..275e759d6 100644 --- a/src/cache_dit.hpp +++ b/src/cache_dit.hpp @@ -842,7 +842,7 @@ struct CacheDitConditionState { const float* input_data = input.data(); float diff = CacheDitState::calculate_residual_diff( - it->second.prev_input.data(), input_data, ne); + it->second.prev_input.data(), input_data, ne); float effective_threshold = config.residual_diff_threshold; if (config.Fn_compute_blocks > 0) { diff --git a/src/common_block.hpp b/src/common_block.hpp index 2cef389af..82e95e750 100644 --- a/src/common_block.hpp +++ b/src/common_block.hpp @@ -1,7 +1,9 @@ #ifndef __COMMON_BLOCK_HPP__ #define __COMMON_BLOCK_HPP__ +#include "ggml-backend.h" #include "ggml_extend.hpp" +#include "util.h" class DownSampleBlock : public GGMLBlock { protected: @@ -248,9 +250,6 @@ class FeedForward : public GGMLBlock { float scale = 1.f; if (precision_fix) { scale = 1.f / 128.f; -#ifdef SD_USE_VULKAN - force_prec_f32 = true; -#endif } // The purpose of the scale here is to prevent NaN issues in certain situations. 
// For example, when using Vulkan without enabling force_prec_f32, @@ -264,6 +263,9 @@ class FeedForward : public GGMLBlock { auto net_0 = std::dynamic_pointer_cast(blocks["net.0"]); auto net_2 = std::dynamic_pointer_cast(blocks["net.2"]); + if (sd_backend_is(ctx->backend, "Vulkan")) { + net_2->set_force_prec_f32(true); + } x = net_0->forward(ctx, x); // [ne3, ne2, ne1, inner_dim] x = net_2->forward(ctx, x); // [ne3, ne2, ne1, dim_out] diff --git a/src/conditioner.hpp b/src/conditioner.hpp index 5564373eb..74993a4e8 100644 --- a/src/conditioner.hpp +++ b/src/conditioner.hpp @@ -4,9 +4,12 @@ #include #include "clip.hpp" +#include "ggml-alloc.h" +#include "ggml-backend.h" #include "llm.hpp" #include "t5.hpp" #include "tensor_ggml.hpp" +#include "util.h" struct SDCondition { sd::Tensor c_crossattn; @@ -79,6 +82,7 @@ struct Conditioner { virtual ~Conditioner() = default; public: + int model_count = 1; virtual SDCondition get_learned_condition(int n_threads, const ConditionerParams& conditioner_params) = 0; virtual void alloc_params_buffer() = 0; @@ -94,6 +98,11 @@ struct Conditioner { virtual std::string remove_trigger_from_prompt(const std::string& prompt) { GGML_ABORT("Not implemented yet!"); } + virtual bool is_cond_stage_model_name_at_index(const std::string& name, int index) { + return true; + } + virtual ggml_backend_t get_params_backend_at_index(int index) = 0; + virtual ggml_backend_t get_runtime_backend_at_index(int index) = 0; }; // ldm.modules.encoders.modules.FrozenCLIPEmbedder @@ -112,7 +121,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { std::vector token_embed_custom; std::map> embedding_pos_map; - FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend, + FrozenCLIPEmbedderWithCustomWords(std::vector backends, bool offload_params_to_cpu, const String2TensorStorage& tensor_storage_map, const std::map& orig_embedding_map, @@ -126,13 +135,28 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { 
tokenizer.add_special_token(name); } bool force_clip_f32 = !embedding_map.empty(); + + ggml_backend_t clip_backend = backends[0]; + if (sd_version_is_sd1(version)) { - text_model = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32); + LOG_INFO("CLIP-L: using %s backend", ggml_backend_name(clip_backend)); + text_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32); } else if (sd_version_is_sd2(version)) { - text_model = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32); + LOG_INFO("CLIP-H: using %s backend", ggml_backend_name(clip_backend)); + text_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32); } else if (sd_version_is_sdxl(version)) { - text_model = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32); - text_model2 = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32); + model_count = 2; + ggml_backend_t clip_g_backend = clip_backend; + if (backends.size() >= 2) { + clip_g_backend = backends[1]; + if (backends.size() > 2) { + LOG_WARN("More than 2 clip backends provided, but the model only supports 2 text encoders. 
Ignoring the rest."); + } + } + LOG_INFO("CLIP-L: using %s backend", ggml_backend_name(clip_backend)); + LOG_INFO("CLIP-G: using %s backend", ggml_backend_name(clip_g_backend)); + text_model = std::make_shared(clip_backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32); + text_model2 = std::make_shared(clip_g_backend, offload_params_to_cpu, tensor_storage_map, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32); } } @@ -652,6 +676,41 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { conditioner_params.adm_in_channels, conditioner_params.zero_out_masked); } + + bool is_cond_stage_model_name_at_index(const std::string& name, int index) override { + if (sd_version_is_sdxl(version)) { + if (index == 0) { + return contains(name, "cond_stage_model.model.transformer"); + } else if (index == 1) { + return contains(name, "cond_stage_model.model.1"); + } else { + return false; + } + } + return true; + } + + ggml_backend_t get_params_backend_at_index(int index) { + if (sd_version_is_sdxl(version) && index == 1) { + if (text_model2) { + return text_model2->get_params_backend(); + } + } else if (text_model) { + return text_model->get_params_backend(); + } + return nullptr; + } + + ggml_backend_t get_runtime_backend_at_index(int index) { + if (sd_version_is_sdxl(version) && index == 1) { + if (text_model2) { + return text_model2->get_runtime_backend(); + } + } else if (text_model) { + return text_model->get_runtime_backend(); + } + return nullptr; + } }; struct FrozenCLIPVisionEmbedder : public GGMLRunner { @@ -716,13 +775,31 @@ struct SD3CLIPEmbedder : public Conditioner { std::shared_ptr clip_g; std::shared_ptr t5; - SD3CLIPEmbedder(ggml_backend_t backend, + SD3CLIPEmbedder(std::vector backends, bool offload_params_to_cpu, const String2TensorStorage& tensor_storage_map = {}) : clip_g_tokenizer(0) { bool use_clip_l = false; bool use_clip_g 
= false; bool use_t5 = false; + + model_count = 3; + + ggml_backend_t clip_l_backend, clip_g_backend, t5_backend; + if (backends.size() == 1) { + clip_l_backend = clip_g_backend = t5_backend = backends[0]; + } else if (backends.size() == 2) { + clip_l_backend = clip_g_backend = backends[0]; + t5_backend = backends[1]; + } else if (backends.size() >= 3) { + clip_l_backend = backends[0]; + clip_g_backend = backends[1]; + t5_backend = backends[2]; + if (backends.size() > 3) { + LOG_WARN("More than 3 clip backends provided, but the model only supports 3 text encoders. Ignoring the rest."); + } + } + for (auto pair : tensor_storage_map) { if (pair.first.find("text_encoders.clip_l") != std::string::npos) { use_clip_l = true; @@ -737,13 +814,16 @@ struct SD3CLIPEmbedder : public Conditioner { return; } if (use_clip_l) { - clip_l = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false); + LOG_INFO("CLIP-L: using %s backend", ggml_backend_name(clip_l_backend)); + clip_l = std::make_shared(clip_l_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false); } if (use_clip_g) { - clip_g = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false); + LOG_INFO("CLIP-G: using %s backend", ggml_backend_name(clip_g_backend)); + clip_g = std::make_shared(clip_g_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false); } if (use_t5) { - t5 = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer"); + LOG_INFO("T5-XXL: using %s backend", ggml_backend_name(t5_backend)); + t5 = std::make_shared(t5_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer"); } } @@ -1062,6 +1142,42 @@ struct 
SD3CLIPEmbedder : public Conditioner { conditioner_params.clip_skip, conditioner_params.zero_out_masked); } + + bool is_cond_stage_model_name_at_index(const std::string& name, int index) override { + if (index == 0) { + return contains(name, "text_encoders.clip_l"); + } else if (index == 1) { + return contains(name, "text_encoders.clip_g"); + } else if (index == 2) { + return contains(name, "text_encoders.t5xxl"); + } else { + return false; + } + } + + ggml_backend_t get_params_backend_at_index(int index) { + if (index == 0 && clip_l) { + return clip_l->get_params_backend(); + } else if (index == 1 && clip_g) { + return clip_g->get_params_backend(); + } else if (index == 2 && t5) { + return t5->get_params_backend(); + } else { + return nullptr; + } + } + + ggml_backend_t get_runtime_backend_at_index(int index) { + if (index == 0 && clip_l) { + return clip_l->get_runtime_backend(); + } else if (index == 1 && clip_g) { + return clip_g->get_runtime_backend(); + } else if (index == 2 && t5) { + return t5->get_runtime_backend(); + } else { + return nullptr; + } + } }; struct FluxCLIPEmbedder : public Conditioner { @@ -1071,11 +1187,25 @@ struct FluxCLIPEmbedder : public Conditioner { std::shared_ptr t5; size_t chunk_len = 256; - FluxCLIPEmbedder(ggml_backend_t backend, + FluxCLIPEmbedder(std::vector backends, bool offload_params_to_cpu, const String2TensorStorage& tensor_storage_map = {}) { bool use_clip_l = false; bool use_t5 = false; + + model_count = 2; + + ggml_backend_t clip_l_backend, t5_backend; + if (backends.size() == 1) { + clip_l_backend = t5_backend = backends[0]; + } else if (backends.size() >= 2) { + clip_l_backend = backends[0]; + t5_backend = backends[1]; + if (backends.size() > 2) { + LOG_WARN("More than 2 clip backends provided, but the model only supports 2 text encoders. 
Ignoring the rest."); + } + } + for (auto pair : tensor_storage_map) { if (pair.first.find("text_encoders.clip_l") != std::string::npos) { use_clip_l = true; @@ -1090,12 +1220,14 @@ struct FluxCLIPEmbedder : public Conditioner { } if (use_clip_l) { - clip_l = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true); + LOG_INFO("CLIP-L: using %s backend", ggml_backend_name(clip_l_backend)); + clip_l = std::make_shared(clip_l_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true); } else { LOG_WARN("clip_l text encoder not found! Prompt adherence might be degraded."); } if (use_t5) { - t5 = std::make_shared(backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer"); + LOG_INFO("T5-XXL: using %s backend", ggml_backend_name(t5_backend)); + t5 = std::make_shared(t5_backend, offload_params_to_cpu, tensor_storage_map, "text_encoders.t5xxl.transformer"); } else { LOG_WARN("t5xxl text encoder not found! 
Prompt adherence might be degraded."); } @@ -1306,6 +1438,36 @@ struct FluxCLIPEmbedder : public Conditioner { conditioner_params.clip_skip, conditioner_params.zero_out_masked); } + + bool is_cond_stage_model_name_at_index(const std::string& name, int index) override { + if (index == 0) { + return contains(name, "text_encoders.clip_l"); + } else if (index == 1) { + return contains(name, "text_encoders.t5xxl"); + } else { + return false; + } + } + + ggml_backend_t get_params_backend_at_index(int index) { + if (index == 0 && clip_l) { + return clip_l->get_params_backend(); + } else if (index == 1 && t5) { + return t5->get_params_backend(); + } else { + return nullptr; + } + } + + ggml_backend_t get_runtime_backend_at_index(int index) { + if (index == 0 && clip_l) { + return clip_l->get_runtime_backend(); + } else if (index == 1 && t5) { + return t5->get_runtime_backend(); + } else { + return nullptr; + } + } }; struct T5CLIPEmbedder : public Conditioner { @@ -1502,6 +1664,20 @@ struct T5CLIPEmbedder : public Conditioner { conditioner_params.clip_skip, conditioner_params.zero_out_masked); } + + ggml_backend_t get_params_backend_at_index(int index) { + if (t5) { + return t5->get_params_backend(); + } + return nullptr; + } + + ggml_backend_t get_runtime_backend_at_index(int index) { + if (t5) { + return t5->get_runtime_backend(); + } + return nullptr; + } }; struct AnimaConditioner : public Conditioner { @@ -1514,11 +1690,11 @@ struct AnimaConditioner : public Conditioner { const String2TensorStorage& tensor_storage_map = {}) { qwen_tokenizer = std::make_shared(); llm = std::make_shared(LLM::LLMArch::QWEN3, - backend, - offload_params_to_cpu, - tensor_storage_map, - "text_encoders.llm", - false); + backend, + offload_params_to_cpu, + tensor_storage_map, + "text_encoders.llm", + false); } void get_param_tensors(std::map& tensors) override { @@ -1616,6 +1792,20 @@ struct AnimaConditioner : public Conditioner { result.c_t5_weights = std::move(t5_weight_tensor); return 
result; } + + ggml_backend_t get_params_backend_at_index(int index) { + if (llm) { + return llm->get_params_backend(); + } + return nullptr; + } + + ggml_backend_t get_runtime_backend_at_index(int index) { + if (llm) { + return llm->get_runtime_backend(); + } + return nullptr; + } }; struct LLMEmbedder : public Conditioner { @@ -1960,6 +2150,20 @@ struct LLMEmbedder : public Conditioner { result.extra_c_crossattns = std::move(extra_hidden_states_vec); return result; } + + ggml_backend_t get_params_backend_at_index(int index) { + if (llm) { + return llm->get_params_backend(); + } + return nullptr; + } + + ggml_backend_t get_runtime_backend_at_index(int index) { + if (llm) { + return llm->get_runtime_backend(); + } + return nullptr; + } }; #endif diff --git a/src/diffusion_model.hpp b/src/diffusion_model.hpp index eb0debffc..b34624eea 100644 --- a/src/diffusion_model.hpp +++ b/src/diffusion_model.hpp @@ -45,7 +45,7 @@ struct DiffusionModel { virtual void free_compute_buffer() = 0; virtual void get_param_tensors(std::map& tensors) = 0; virtual size_t get_params_buffer_size() = 0; - virtual void set_weight_adapter(const std::shared_ptr& adapter){}; + virtual void set_weight_adapter(const std::shared_ptr& adapter) {}; virtual int64_t get_adm_in_channels() = 0; virtual void set_flash_attention_enabled(bool enabled) = 0; virtual void set_circular_axes(bool circular_x, bool circular_y) = 0; diff --git a/src/ggml_extend.hpp b/src/ggml_extend.hpp index 859270cbd..3eaf97c54 100644 --- a/src/ggml_extend.hpp +++ b/src/ggml_extend.hpp @@ -24,32 +24,12 @@ #include "ggml-alloc.h" #include "ggml-backend.h" -#include "ggml-cpu.h" #include "ggml.h" +#include "ggml_extend_backend.hpp" #include "model.h" #include "tensor.hpp" -#ifdef SD_USE_CUDA -#include "ggml-cuda.h" -#endif - -#ifdef SD_USE_METAL -#include "ggml-metal.h" -#endif - -#ifdef SD_USE_VULKAN -#include "ggml-vulkan.h" -#endif - -#ifdef SD_USE_OPENCL -#include "ggml-opencl.h" -#endif - -#ifdef SD_USE_SYCL -#include 
"ggml-sycl.h" -#endif - #include "rng.hpp" #include "tensor_ggml.hpp" #include "util.h" @@ -91,6 +71,45 @@ __STATIC_INLINE__ void ggml_log_callback_default(ggml_log_level level, const cha } } +__STATIC_INLINE__ bool backend_name_exists(std::string name) { + ggml_backend_load_all_once(); + const int device_count = ggml_backend_dev_count(); + for (int i = 0; i < device_count; i++) { + if (name == ggml_backend_dev_name(ggml_backend_dev_get(i))) { + return true; + } + } + return false; +} + +__STATIC_INLINE__ std::string sanitize_backend_name(std::string name) { + if (name == "" || backend_name_exists(name)) { + return name; + } else { + LOG_WARN("Backend %s not found, using default backend", name.c_str()); + return ""; + } +} + +__STATIC_INLINE__ std::string get_default_backend_name() { + ggml_backend_load_all_once(); + // should pick the same backend as ggml_backend_init_best + ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU); + dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU); + dev = dev ? 
dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + return ggml_backend_dev_name(dev); +} + +__STATIC_INLINE__ ggml_backend_t init_named_backend(std::string name = "") { + ggml_backend_load_all_once(); + LOG_DEBUG("Initializing backend: %s", name.c_str()); + if (name.empty()) { + return ggml_backend_init_best(); + } else { + return ggml_backend_init_by_name(name.c_str(), nullptr); + } +} + static_assert(GGML_MAX_NAME >= 128, "GGML_MAX_NAME must be at least 128"); // n-mode tensor-matrix product @@ -1286,25 +1305,25 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_ones_like(ggml_context* ctx, return ggml_ext_ones(ctx, x->ne[0], x->ne[1], x->ne[2], x->ne[3]); } -__STATIC_INLINE__ ggml_tensor* ggml_ext_cast_f32(ggml_context* ctx, ggml_tensor* a) { -#ifdef SD_USE_VULKAN - auto zero_index = ggml_get_tensor(ctx, "ggml_runner_build_in_tensor:zero_int"); - auto out = ggml_reshape_1d(ctx, a, ggml_nelements(a)); - out = ggml_get_rows(ctx, out, zero_index); - out = ggml_reshape(ctx, out, a); - // auto out = ggml_cast(ctx, a, GGML_TYPE_F32); - return out; -#else - auto out = ggml_reshape_2d(ctx, a, 1, ggml_nelements(a)); - ggml_tensor* one = ggml_ext_ones(ctx, 1, 1, 1, 1); // [1,] - if (ggml_is_transposed(out)) { - out = ggml_mul_mat(ctx, one, out); +__STATIC_INLINE__ ggml_tensor* ggml_ext_cast_f32(ggml_context* ctx, ggml_backend_t backend, ggml_tensor* a) { + if (sd_backend_is(backend, "Vulkan")) { + auto zero_index = ggml_get_tensor(ctx, "ggml_runner_build_in_tensor:zero_int"); + auto out = ggml_reshape_1d(ctx, a, ggml_nelements(a)); + out = ggml_get_rows(ctx, out, zero_index); + out = ggml_reshape(ctx, out, a); + // auto out = ggml_cast(ctx, a, GGML_TYPE_F32); + return out; } else { - out = ggml_mul_mat(ctx, out, one); + auto out = ggml_reshape_2d(ctx, a, 1, ggml_nelements(a)); + ggml_tensor* one = ggml_ext_ones(ctx, 1, 1, 1, 1); // [1,] + if (ggml_is_transposed(out)) { + out = ggml_mul_mat(ctx, one, out); + } else { + out = ggml_mul_mat(ctx, out, one); + } + out = 
ggml_reshape(ctx, out, a); + return out; } - out = ggml_reshape(ctx, out, a); -#endif - return out; } // q: [N, L_q, C(n_head*d_head)] or [N*n_head, L_q, d_head] @@ -1496,16 +1515,14 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_group_norm(ggml_context* ctx, } __STATIC_INLINE__ void ggml_ext_backend_tensor_get_and_sync(ggml_backend_t backend, const ggml_tensor* tensor, void* data, size_t offset, size_t size) { -#if defined(SD_USE_CUDA) || defined(SD_USE_SYCL) - if (!ggml_backend_is_cpu(backend)) { + if ((sd_backend_is(backend, "ROCm") || sd_backend_is(backend, "CUDA") || sd_backend_is(backend, "SYCL")) && + !ggml_backend_is_cpu(backend)) { ggml_backend_tensor_get_async(backend, tensor, data, offset, size); ggml_backend_synchronize(backend); - } else { - ggml_backend_tensor_get(tensor, data, offset, size); + return; } -#else + ggml_backend_tensor_get(tensor, data, offset, size); -#endif } __STATIC_INLINE__ float ggml_ext_backend_tensor_get_f32(ggml_tensor* tensor) { @@ -1664,14 +1681,15 @@ struct WeightAdapter { float scale = 1.f; } conv2d; }; - virtual ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name) = 0; + virtual ggml_tensor* patch_weight(ggml_context* ctx, ggml_backend_t backend, ggml_tensor* weight, const std::string& weight_name) = 0; virtual ggml_tensor* forward_with_lora(ggml_context* ctx, + ggml_backend_t backend, ggml_tensor* x, ggml_tensor* w, ggml_tensor* b, const std::string& prefix, - ForwardParams forward_params) = 0; - virtual size_t get_extra_graph_size() = 0; + ForwardParams forward_params) = 0; + virtual size_t get_extra_graph_size() = 0; }; struct GGMLRunnerContext { @@ -2192,6 +2210,14 @@ struct GGMLRunner { void set_weight_adapter(const std::shared_ptr& adapter) { weight_adapter = adapter; } + + ggml_backend_t get_runtime_backend() { + return runtime_backend; + } + + ggml_backend_t get_params_backend() { + return params_backend; + } }; class GGMLBlock { @@ -2336,6 +2362,14 @@ class Linear : public 
UnaryBlock { force_prec_f32(force_prec_f32), scale(scale) {} + void set_scale(float scale_) { + scale = scale_; + } + + void set_force_prec_f32(bool force_prec_f32_) { + force_prec_f32 = force_prec_f32_; + } + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { ggml_tensor* w = params["weight"]; ggml_tensor* b = nullptr; @@ -2347,7 +2381,7 @@ class Linear : public UnaryBlock { forward_params.op_type = WeightAdapter::ForwardParams::op_type_t::OP_LINEAR; forward_params.linear.force_prec_f32 = force_prec_f32; forward_params.linear.scale = scale; - return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, x, w, b, prefix, forward_params); + return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, ctx->backend, x, w, b, prefix, forward_params); } return ggml_ext_linear(ctx->ggml_ctx, x, w, b, force_prec_f32, scale); } @@ -2463,7 +2497,7 @@ class Conv2d : public UnaryBlock { forward_params.conv2d.circular_x = ctx->circular_x_enabled; forward_params.conv2d.circular_y = ctx->circular_y_enabled; forward_params.conv2d.scale = scale; - return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, x, w, b, prefix, forward_params); + return ctx->weight_adapter->forward_with_lora(ctx->ggml_ctx, ctx->backend, x, w, b, prefix, forward_params); } return ggml_ext_conv_2d(ctx->ggml_ctx, x, @@ -2527,7 +2561,7 @@ class Conv3d : public UnaryBlock { ggml_tensor* w = params["weight"]; ggml_tensor* b = nullptr; if (ctx->weight_adapter) { - w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight"); + w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, w, prefix + "weight"); if (w->type != GGML_TYPE_F16) { w = ggml_cast(ctx->ggml_ctx, w, GGML_TYPE_F16); } @@ -2535,7 +2569,7 @@ class Conv3d : public UnaryBlock { if (bias) { b = params["bias"]; if (ctx->weight_adapter) { - b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, b, prefix + "bias"); + b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, b, prefix + "bias"); } } return 
ggml_ext_conv_3d(ctx->ggml_ctx, x, w, b, in_channels, @@ -2582,12 +2616,12 @@ class LayerNorm : public UnaryBlock { if (elementwise_affine) { w = params["weight"]; if (ctx->weight_adapter) { - w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight"); + w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, w, prefix + "weight"); } if (bias) { b = params["bias"]; if (ctx->weight_adapter) { - b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, b, prefix + "bias"); + b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, b, prefix + "bias"); } } } @@ -2630,8 +2664,8 @@ class GroupNorm : public GGMLBlock { w = params["weight"]; b = params["bias"]; if (ctx->weight_adapter) { - w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight"); - b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, b, prefix + "bias"); + w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, w, prefix + "weight"); + b = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, b, prefix + "bias"); } } return ggml_ext_group_norm(ctx->ggml_ctx, x, w, b, num_groups); @@ -2665,7 +2699,7 @@ class RMSNorm : public UnaryBlock { ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) { ggml_tensor* w = params["weight"]; if (ctx->weight_adapter) { - w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, w, prefix + "weight"); + w = ctx->weight_adapter->patch_weight(ctx->ggml_ctx, ctx->backend, w, prefix + "weight"); } x = ggml_rms_norm(ctx->ggml_ctx, x, eps); x = ggml_mul_inplace(ctx->ggml_ctx, x, w); @@ -2748,6 +2782,7 @@ class MultiheadAttention : public GGMLBlock { __STATIC_INLINE__ ggml_tensor* ggml_ext_lokr_forward( ggml_context* ctx, + ggml_backend_t backend, ggml_tensor* h, // Input: [q, batch] or [W, H, q, batch] ggml_tensor* w1, // Outer C (Full rank) ggml_tensor* w1a, // Outer A (Low rank part 1) @@ -2778,29 +2813,29 @@ __STATIC_INLINE__ ggml_tensor* ggml_ext_lokr_forward( int merge_batch_uq = batch; int 
merge_batch_vp = batch; -#if SD_USE_VULKAN - if (batch > 1) { - // no access to backend here, worst case is slightly worse perfs for other backends when built alongside Vulkan backend - int max_batch = 65535; - int max_batch_uq = max_batch / uq; - merge_batch_uq = 1; - for (int i = max_batch_uq; i > 0; i--) { - if (batch % i == 0) { - merge_batch_uq = i; - break; + if (sd_backend_is(backend, "Vulkan")) { + if (batch > 1) { + // no access to backend here, worst case is slightly worse perfs for other backends when built alongside Vulkan backend + int max_batch = 65535; + int max_batch_uq = max_batch / uq; + merge_batch_uq = 1; + for (int i = max_batch_uq; i > 0; i--) { + if (batch % i == 0) { + merge_batch_uq = i; + break; + } } - } - int max_batch_vp = max_batch / vp; - merge_batch_vp = 1; - for (int i = max_batch_vp; i > 0; i--) { - if (batch % i == 0) { - merge_batch_vp = i; - break; + int max_batch_vp = max_batch / vp; + merge_batch_vp = 1; + for (int i = max_batch_vp; i > 0; i--) { + if (batch % i == 0) { + merge_batch_vp = i; + break; + } } } } -#endif ggml_tensor* h_split = ggml_reshape_3d(ctx, h, vq, uq * merge_batch_uq, batch / merge_batch_uq); if (w2 != NULL) { diff --git a/src/ggml_extend_backend.hpp b/src/ggml_extend_backend.hpp new file mode 100644 index 000000000..19cf56f2e --- /dev/null +++ b/src/ggml_extend_backend.hpp @@ -0,0 +1,293 @@ +#ifndef __GGML_EXTEND_BACKEND_HPP__ +#define __GGML_EXTEND_BACKEND_HPP__ + +#include +#include + +#include "ggml-backend.h" +#include "ggml.h" + +#ifndef __STATIC_INLINE__ +#define __STATIC_INLINE__ static inline +#endif + +inline void ggml_backend_load_all_once() { +#if defined(GGML_BACKEND_DL) + // If the host process already preloaded backends explicitly + // (for example via ggml_backend_load / ggml_backend_load_all_from_path), + // do not rescan the default paths again. 
+ if (ggml_backend_dev_count() > 0) { + return; + } + // In dynamic-backend mode the backend modules are discovered at runtime, + // so we must load them before asking for the CPU backend or its proc table. + static std::once_flag once; + std::call_once(once, []() { + if (ggml_backend_dev_count() > 0) { + return; + } + ggml_backend_load_all(); + }); +#endif +} + +#if defined(GGML_BACKEND_DL) + +// Do not gate this branch on GGML_CPU or GGML_CPU_ALL_VARIANTS: +// those are CMake options used to configure ggml itself, but they are not +// exported as PUBLIC compile definitions to stable-diffusion in backend-DL mode. +// In practice, this target can reliably see GGML_BACKEND_DL, but not whether +// the CPU backend was compiled as a loadable module. We therefore use runtime +// backend discovery instead of compile-time assumptions. + +__STATIC_INLINE__ ggml_backend_reg_t ggml_backend_cpu_reg() { + ggml_backend_load_all_once(); + return ggml_backend_reg_by_name("CPU"); +} + +__STATIC_INLINE__ ggml_backend_reg_t ggml_backend_reg_from_backend(ggml_backend_t backend) { + if (backend != nullptr) { + ggml_backend_dev_t device = ggml_backend_get_device(backend); + if (device != nullptr) { + return ggml_backend_dev_backend_reg(device); + } + } + + return ggml_backend_cpu_reg(); +} + +__STATIC_INLINE__ ggml_backend_t ggml_backend_cpu_init() { + ggml_backend_load_all_once(); + return ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); +} + +__STATIC_INLINE__ bool ggml_backend_is_cpu(ggml_backend_t backend) { + if (backend == nullptr) { + return false; + } + + ggml_backend_dev_t device = ggml_backend_get_device(backend); + if (device != nullptr) { + return ggml_backend_dev_type(device) == GGML_BACKEND_DEVICE_TYPE_CPU; + } + + const char* backend_name = ggml_backend_name(backend); + return backend_name != nullptr && std::strcmp(backend_name, "CPU") == 0; +} + +__STATIC_INLINE__ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { + 
ggml_backend_reg_t reg = ggml_backend_reg_from_backend(backend_cpu); + if (reg == nullptr) { + return; + } + + auto fn = reinterpret_cast(ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads")); + if (fn != nullptr) { + fn(backend_cpu, n_threads); + } +} + +using __ggml_backend_cpu_set_threadpool_t = void (*)(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool); + +__STATIC_INLINE__ void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) { + ggml_backend_reg_t reg = ggml_backend_reg_from_backend(backend_cpu); + if (reg == nullptr) { + return; + } + + auto fn = reinterpret_cast<__ggml_backend_cpu_set_threadpool_t>(ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool")); + if (fn != nullptr) { + fn(backend_cpu, threadpool); + } +} + +__STATIC_INLINE__ void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void* abort_callback_data) { + ggml_backend_reg_t reg = ggml_backend_reg_from_backend(backend_cpu); + if (reg == nullptr) { + return; + } + + auto fn = reinterpret_cast(ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback")); + if (fn != nullptr) { + fn(backend_cpu, abort_callback, abort_callback_data); + } +} + +__STATIC_INLINE__ ggml_backend_buffer_t ggml_backend_tensor_buffer(const struct ggml_tensor* tensor) { + if (tensor == nullptr) { + return nullptr; + } + + return tensor->view_src ? 
tensor->view_src->buffer : tensor->buffer; +} + +__STATIC_INLINE__ bool ggml_backend_tensor_is_host_accessible(const struct ggml_tensor* tensor) { + if (tensor == nullptr || tensor->data == nullptr) { + return false; + } + + ggml_backend_buffer_t buffer = ggml_backend_tensor_buffer(tensor); + return buffer == nullptr || ggml_backend_buffer_is_host(buffer); +} + +__STATIC_INLINE__ size_t ggml_backend_tensor_offset(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { + return (size_t)(i0 * tensor->nb[0] + i1 * tensor->nb[1] + i2 * tensor->nb[2] + i3 * tensor->nb[3]); +} + +template +__STATIC_INLINE__ void ggml_backend_tensor_write_scalar(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3, T value) { + const size_t offset = ggml_backend_tensor_offset(tensor, i0, i1, i2, i3); + + if (ggml_backend_tensor_is_host_accessible(tensor)) { + auto* dst = reinterpret_cast(reinterpret_cast(tensor->data) + offset); + *dst = value; + return; + } + + ggml_backend_tensor_set(const_cast(tensor), &value, offset, sizeof(T)); +} + +__STATIC_INLINE__ void ggml_set_f32_nd(const struct ggml_tensor* tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3, float value) { + switch (tensor->type) { + case GGML_TYPE_I8: + ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast(value)); + break; + case GGML_TYPE_I16: + ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast(value)); + break; + case GGML_TYPE_I32: + ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, static_cast(value)); + break; + case GGML_TYPE_F16: + ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, ggml_fp32_to_fp16(value)); + break; + case GGML_TYPE_BF16: + ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, ggml_fp32_to_bf16(value)); + break; + case GGML_TYPE_F32: + ggml_backend_tensor_write_scalar(tensor, i0, i1, i2, i3, value); + break; + default: + GGML_ABORT("fatal error"); + } +} + +__STATIC_INLINE__ void 
ggml_set_f32_1d(const struct ggml_tensor* tensor, int i, float value) { + if (!ggml_is_contiguous(tensor)) { + int64_t id[4] = {0, 0, 0, 0}; + ggml_unravel_index(tensor, i, &id[0], &id[1], &id[2], &id[3]); + ggml_set_f32_nd(tensor, id[0], id[1], id[2], id[3], value); + return; + } + + switch (tensor->type) { + case GGML_TYPE_I8: + ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast(value)); + break; + case GGML_TYPE_I16: + ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast(value)); + break; + case GGML_TYPE_I32: + ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, static_cast(value)); + break; + case GGML_TYPE_F16: + ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, ggml_fp32_to_fp16(value)); + break; + case GGML_TYPE_BF16: + ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, ggml_fp32_to_bf16(value)); + break; + case GGML_TYPE_F32: + ggml_backend_tensor_write_scalar(tensor, i, 0, 0, 0, value); + break; + default: + GGML_ABORT("fatal error"); + } +} + +__STATIC_INLINE__ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context* ctx, struct ggml_cgraph* cgraph, int n_threads) { + (void)ctx; + + // The legacy ggml_graph_compute_with_ctx() symbol lives in ggml-cpu, but + // the backend proc table does not expose it in GGML_BACKEND_DL mode. + // Recreate the old behavior by initializing the CPU backend explicitly and + // executing the graph through the generic backend API. 
+ ggml_backend_t backend = ggml_backend_cpu_init(); + if (backend == nullptr) { + return GGML_STATUS_ALLOC_FAILED; + } + + ggml_backend_cpu_set_n_threads(backend, n_threads); + + const enum ggml_status status = ggml_backend_graph_compute(backend, cgraph); + ggml_backend_free(backend); + + return status; +} + +__STATIC_INLINE__ ggml_tensor* ggml_set_f32(struct ggml_tensor* tensor, float value) { + GGML_ASSERT(tensor != nullptr); + + if (ggml_backend_tensor_is_host_accessible(tensor) && ggml_is_contiguous(tensor)) { + const int64_t nelements = ggml_nelements(tensor); + + switch (tensor->type) { + case GGML_TYPE_I8: { + auto* data = reinterpret_cast(tensor->data); + const int8_t v = static_cast(value); + for (int64_t i = 0; i < nelements; ++i) { + data[i] = v; + } + } break; + case GGML_TYPE_I16: { + auto* data = reinterpret_cast(tensor->data); + const int16_t v = static_cast(value); + for (int64_t i = 0; i < nelements; ++i) { + data[i] = v; + } + } break; + case GGML_TYPE_I32: { + auto* data = reinterpret_cast(tensor->data); + const int32_t v = static_cast(value); + for (int64_t i = 0; i < nelements; ++i) { + data[i] = v; + } + } break; + case GGML_TYPE_F16: { + auto* data = reinterpret_cast(tensor->data); + const ggml_fp16_t v = ggml_fp32_to_fp16(value); + for (int64_t i = 0; i < nelements; ++i) { + data[i] = v; + } + } break; + case GGML_TYPE_BF16: { + auto* data = reinterpret_cast(tensor->data); + const ggml_bf16_t v = ggml_fp32_to_bf16(value); + for (int64_t i = 0; i < nelements; ++i) { + data[i] = v; + } + } break; + case GGML_TYPE_F32: { + auto* data = reinterpret_cast(tensor->data); + for (int64_t i = 0; i < nelements; ++i) { + data[i] = value; + } + } break; + default: + GGML_ABORT("fatal error"); + } + + return tensor; + } + + const int64_t nelements = ggml_nelements(tensor); + for (int64_t i = 0; i < nelements; ++i) { + ggml_set_f32_1d(tensor, static_cast(i), value); + } + + return tensor; +} + +#else +#include "ggml-cpu.h" +#endif +#endif diff --git 
a/src/lora.hpp b/src/lora.hpp index d4a749ef9..b57bc4226 100644 --- a/src/lora.hpp +++ b/src/lora.hpp @@ -129,7 +129,7 @@ struct LoraModel : public GGMLRunner { } } - ggml_tensor* get_lora_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) { + ggml_tensor* get_lora_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_backend_t backend) { ggml_tensor* updown = nullptr; int index = 0; while (true) { @@ -152,17 +152,17 @@ struct LoraModel : public GGMLRunner { auto iter = lora_tensors.find(lora_up_name); if (iter != lora_tensors.end()) { - lora_up = ggml_ext_cast_f32(ctx, iter->second); + lora_up = ggml_ext_cast_f32(ctx, backend, iter->second); } iter = lora_tensors.find(lora_mid_name); if (iter != lora_tensors.end()) { - lora_mid = ggml_ext_cast_f32(ctx, iter->second); + lora_mid = ggml_ext_cast_f32(ctx, backend, iter->second); } iter = lora_tensors.find(lora_down_name); if (iter != lora_tensors.end()) { - lora_down = ggml_ext_cast_f32(ctx, iter->second); + lora_down = ggml_ext_cast_f32(ctx, backend, iter->second); } if (lora_up == nullptr || lora_down == nullptr) { @@ -208,7 +208,7 @@ struct LoraModel : public GGMLRunner { return updown; } - ggml_tensor* get_raw_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) { + ggml_tensor* get_raw_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_backend_t backend) { ggml_tensor* updown = nullptr; int index = 0; while (true) { @@ -225,7 +225,7 @@ struct LoraModel : public GGMLRunner { auto iter = lora_tensors.find(diff_name); if (iter != lora_tensors.end()) { - curr_updown = ggml_ext_cast_f32(ctx, iter->second); + curr_updown = ggml_ext_cast_f32(ctx, backend, iter->second); } else { break; } @@ -248,7 +248,7 @@ struct LoraModel : public GGMLRunner { return updown; } - ggml_tensor* get_loha_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) { + ggml_tensor* get_loha_weight_diff(const std::string& model_tensor_name, ggml_context* 
ctx, ggml_backend_t backend) { ggml_tensor* updown = nullptr; int index = 0; while (true) { @@ -276,33 +276,33 @@ struct LoraModel : public GGMLRunner { auto iter = lora_tensors.find(hada_1_down_name); if (iter != lora_tensors.end()) { - hada_1_down = ggml_ext_cast_f32(ctx, iter->second); + hada_1_down = ggml_ext_cast_f32(ctx, backend, iter->second); } iter = lora_tensors.find(hada_1_up_name); if (iter != lora_tensors.end()) { - hada_1_up = ggml_ext_cast_f32(ctx, iter->second); + hada_1_up = ggml_ext_cast_f32(ctx, backend, iter->second); } iter = lora_tensors.find(hada_1_mid_name); if (iter != lora_tensors.end()) { - hada_1_mid = ggml_ext_cast_f32(ctx, iter->second); + hada_1_mid = ggml_ext_cast_f32(ctx, backend, iter->second); hada_1_up = ggml_cont(ctx, ggml_transpose(ctx, hada_1_up)); } iter = lora_tensors.find(hada_2_down_name); if (iter != lora_tensors.end()) { - hada_2_down = ggml_ext_cast_f32(ctx, iter->second); + hada_2_down = ggml_ext_cast_f32(ctx, backend, iter->second); } iter = lora_tensors.find(hada_2_up_name); if (iter != lora_tensors.end()) { - hada_2_up = ggml_ext_cast_f32(ctx, iter->second); + hada_2_up = ggml_ext_cast_f32(ctx, backend, iter->second); } iter = lora_tensors.find(hada_2_mid_name); if (iter != lora_tensors.end()) { - hada_2_mid = ggml_ext_cast_f32(ctx, iter->second); + hada_2_mid = ggml_ext_cast_f32(ctx, backend, iter->second); hada_2_up = ggml_cont(ctx, ggml_transpose(ctx, hada_2_up)); } @@ -351,7 +351,7 @@ struct LoraModel : public GGMLRunner { return updown; } - ggml_tensor* get_lokr_weight_diff(const std::string& model_tensor_name, ggml_context* ctx) { + ggml_tensor* get_lokr_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_backend_t backend) { ggml_tensor* updown = nullptr; int index = 0; while (true) { @@ -378,24 +378,24 @@ struct LoraModel : public GGMLRunner { auto iter = lora_tensors.find(lokr_w1_name); if (iter != lora_tensors.end()) { - lokr_w1 = ggml_ext_cast_f32(ctx, iter->second); + lokr_w1 = 
ggml_ext_cast_f32(ctx, backend, iter->second); } iter = lora_tensors.find(lokr_w2_name); if (iter != lora_tensors.end()) { - lokr_w2 = ggml_ext_cast_f32(ctx, iter->second); + lokr_w2 = ggml_ext_cast_f32(ctx, backend, iter->second); } int64_t rank = 1; if (lokr_w1 == nullptr) { iter = lora_tensors.find(lokr_w1_a_name); if (iter != lora_tensors.end()) { - lokr_w1_a = ggml_ext_cast_f32(ctx, iter->second); + lokr_w1_a = ggml_ext_cast_f32(ctx, backend, iter->second); } iter = lora_tensors.find(lokr_w1_b_name); if (iter != lora_tensors.end()) { - lokr_w1_b = ggml_ext_cast_f32(ctx, iter->second); + lokr_w1_b = ggml_ext_cast_f32(ctx, backend, iter->second); } if (lokr_w1_a == nullptr || lokr_w1_b == nullptr) { @@ -410,12 +410,12 @@ struct LoraModel : public GGMLRunner { if (lokr_w2 == nullptr) { iter = lora_tensors.find(lokr_w2_a_name); if (iter != lora_tensors.end()) { - lokr_w2_a = ggml_ext_cast_f32(ctx, iter->second); + lokr_w2_a = ggml_ext_cast_f32(ctx, backend, iter->second); } iter = lora_tensors.find(lokr_w2_b_name); if (iter != lora_tensors.end()) { - lokr_w2_b = ggml_ext_cast_f32(ctx, iter->second); + lokr_w2_b = ggml_ext_cast_f32(ctx, backend, iter->second); } if (lokr_w2_a == nullptr || lokr_w2_b == nullptr) { @@ -468,23 +468,23 @@ struct LoraModel : public GGMLRunner { return updown; } - ggml_tensor* get_weight_diff(const std::string& model_tensor_name, ggml_context* ctx, ggml_tensor* model_tensor, bool with_lora_and_lokr = true) { + ggml_tensor* get_weight_diff(const std::string& model_tensor_name, ggml_backend_t backend, ggml_context* ctx, ggml_tensor* model_tensor, bool with_lora_and_lokr = true) { // lora ggml_tensor* diff = nullptr; if (with_lora_and_lokr) { - diff = get_lora_weight_diff(model_tensor_name, ctx); + diff = get_lora_weight_diff(model_tensor_name, ctx, backend); } // diff if (diff == nullptr) { - diff = get_raw_weight_diff(model_tensor_name, ctx); + diff = get_raw_weight_diff(model_tensor_name, ctx, backend); } // loha if (diff == nullptr) { - 
diff = get_loha_weight_diff(model_tensor_name, ctx); + diff = get_loha_weight_diff(model_tensor_name, ctx, backend); } // lokr if (diff == nullptr && with_lora_and_lokr) { - diff = get_lokr_weight_diff(model_tensor_name, ctx); + diff = get_lokr_weight_diff(model_tensor_name, ctx, backend); } if (diff != nullptr) { if (ggml_nelements(diff) < ggml_nelements(model_tensor)) { @@ -502,6 +502,7 @@ struct LoraModel : public GGMLRunner { } ggml_tensor* get_out_diff(ggml_context* ctx, + ggml_backend_t backend, ggml_tensor* x, WeightAdapter::ForwardParams forward_params, const std::string& model_tensor_name) { @@ -590,7 +591,7 @@ struct LoraModel : public GGMLRunner { } scale_value *= multiplier; - auto curr_out_diff = ggml_ext_lokr_forward(ctx, x, lokr_w1, lokr_w1_a, lokr_w1_b, lokr_w2, lokr_w2_a, lokr_w2_b, is_conv2d, forward_params.conv2d, scale_value); + auto curr_out_diff = ggml_ext_lokr_forward(ctx, backend, x, lokr_w1, lokr_w1_a, lokr_w1_b, lokr_w2, lokr_w2_a, lokr_w2_b, is_conv2d, forward_params.conv2d, scale_value); if (out_diff == nullptr) { out_diff = curr_out_diff; } else { @@ -761,7 +762,7 @@ struct LoraModel : public GGMLRunner { ggml_tensor* model_tensor = it.second; // lora - ggml_tensor* diff = get_weight_diff(model_tensor_name, compute_ctx, model_tensor); + ggml_tensor* diff = get_weight_diff(model_tensor_name, runtime_backend, compute_ctx, model_tensor); if (diff == nullptr) { continue; } @@ -774,7 +775,7 @@ struct LoraModel : public GGMLRunner { ggml_tensor* final_tensor; if (model_tensor->type != GGML_TYPE_F32 && model_tensor->type != GGML_TYPE_F16) { - final_tensor = ggml_ext_cast_f32(compute_ctx, model_tensor); + final_tensor = ggml_ext_cast_f32(compute_ctx, runtime_backend, model_tensor); final_tensor = ggml_add_inplace(compute_ctx, final_tensor, diff); final_tensor = ggml_cpy(compute_ctx, final_tensor, model_tensor); } else { @@ -841,34 +842,35 @@ struct MultiLoraAdapter : public WeightAdapter { : lora_models(lora_models) { } - ggml_tensor* 
patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name, bool with_lora_and_lokr) { + ggml_tensor* patch_weight(ggml_context* ctx, ggml_backend_t backend, ggml_tensor* weight, const std::string& weight_name, bool with_lora_and_lokr) { for (auto& lora_model : lora_models) { - ggml_tensor* diff = lora_model->get_weight_diff(weight_name, ctx, weight, with_lora_and_lokr); + ggml_tensor* diff = lora_model->get_weight_diff(weight_name, backend, ctx, weight, with_lora_and_lokr); if (diff == nullptr) { continue; } if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) { - weight = ggml_ext_cast_f32(ctx, weight); + weight = ggml_ext_cast_f32(ctx, backend, weight); } weight = ggml_add(ctx, weight, diff); } return weight; } - ggml_tensor* patch_weight(ggml_context* ctx, ggml_tensor* weight, const std::string& weight_name) override { - return patch_weight(ctx, weight, weight_name, true); + ggml_tensor* patch_weight(ggml_context* ctx, ggml_backend_t backend, ggml_tensor* weight, const std::string& weight_name) override { + return patch_weight(ctx, backend, weight, weight_name, true); } ggml_tensor* forward_with_lora(ggml_context* ctx, + ggml_backend_t backend, ggml_tensor* x, ggml_tensor* w, ggml_tensor* b, const std::string& prefix, WeightAdapter::ForwardParams forward_params) override { - w = patch_weight(ctx, w, prefix + "weight", false); + w = patch_weight(ctx, backend, w, prefix + "weight", false); if (b) { - b = patch_weight(ctx, b, prefix + "bias", false); + b = patch_weight(ctx, backend, b, prefix + "bias", false); } ggml_tensor* out; if (forward_params.op_type == ForwardParams::op_type_t::OP_LINEAR) { @@ -890,7 +892,7 @@ struct MultiLoraAdapter : public WeightAdapter { forward_params.conv2d.scale); } for (auto& lora_model : lora_models) { - ggml_tensor* out_diff = lora_model->get_out_diff(ctx, x, forward_params, prefix + "weight"); + ggml_tensor* out_diff = lora_model->get_out_diff(ctx, backend, x, forward_params, prefix + 
"weight"); if (out_diff == nullptr) { continue; } diff --git a/src/model.cpp b/src/model.cpp index 1ccb03cf3..ced0efe75 100644 --- a/src/model.cpp +++ b/src/model.cpp @@ -19,24 +19,12 @@ #include "ggml-alloc.h" #include "ggml-backend.h" -#include "ggml-cpu.h" #include "ggml.h" +#include "ggml_extend_backend.hpp" #include "name_conversion.h" #include "stable-diffusion.h" -#ifdef SD_USE_METAL -#include "ggml-metal.h" -#endif - -#ifdef SD_USE_VULKAN -#include "ggml-vulkan.h" -#endif - -#ifdef SD_USE_OPENCL -#include "ggml-opencl.h" -#endif - #define ST_HEADER_SIZE_LEN 8 uint64_t read_u64(uint8_t* buffer) { diff --git a/src/qwen_image.hpp b/src/qwen_image.hpp index 83c8cec66..1cbeb71d5 100644 --- a/src/qwen_image.hpp +++ b/src/qwen_image.hpp @@ -95,9 +95,7 @@ namespace Qwen { float scale = 1.f / 32.f; bool force_prec_f32 = false; -#ifdef SD_USE_VULKAN - force_prec_f32 = true; -#endif + // The purpose of the scale here is to prevent NaN issues in certain situations. // For example when using CUDA but the weights are k-quants (not all prompts). 
blocks["to_out.0"] = std::shared_ptr(new Linear(inner_dim, out_dim, out_bias, false, force_prec_f32, scale)); @@ -124,6 +122,10 @@ namespace Qwen { auto to_v = std::dynamic_pointer_cast(blocks["to_v"]); auto to_out_0 = std::dynamic_pointer_cast(blocks["to_out.0"]); + if (sd_backend_is(ctx->backend, "Vulkan")) { + to_out_0->set_force_prec_f32(true); + } + auto norm_added_q = std::dynamic_pointer_cast(blocks["norm_added_q"]); auto norm_added_k = std::dynamic_pointer_cast(blocks["norm_added_k"]); diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index ae34530b0..ef796b5aa 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -1,5 +1,7 @@ #include "ggml_extend.hpp" +#include "ggml_extend_backend.hpp" +#include #include "model.h" #include "rng.hpp" #include "rng_mt19937.hpp" @@ -22,6 +24,10 @@ #include "latent-preview.h" #include "name_conversion.h" +#if GGML_RPC +#include "ggml-rpc.h" +#endif + const char* model_version_to_str[] = { "SD 1.x", "SD 1.x Inpaint", @@ -100,14 +106,128 @@ static float get_cache_reuse_threshold(const sd_cache_params_t& params) { return std::max(0.0f, reuse_threshold); } +std::vector string_split(const std::string& input, char separator) { + std::vector parts; + size_t begin_pos = 0; + size_t separator_pos = input.find(separator); + while (separator_pos != std::string::npos) { + std::string part = input.substr(begin_pos, separator_pos - begin_pos); + parts.emplace_back(part); + begin_pos = separator_pos + 1; + separator_pos = input.find(separator, begin_pos); + } + parts.emplace_back(input.substr(begin_pos, separator_pos - begin_pos)); + return parts; +} + +static void add_rpc_devices(const std::string& servers) { + auto rpc_servers = string_split(servers, ','); + if (rpc_servers.empty()) { + throw std::invalid_argument("no RPC servers specified"); + } + ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC"); + if (!rpc_reg) { + throw std::invalid_argument("failed to find RPC backend"); + } + typedef 
ggml_backend_reg_t (*ggml_backend_rpc_add_server_t)(const char* endpoint); + ggml_backend_rpc_add_server_t ggml_backend_rpc_add_server_fn = (ggml_backend_rpc_add_server_t)ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_server"); + if (!ggml_backend_rpc_add_server_fn) { + throw std::invalid_argument("failed to find RPC add server function"); + } + for (const auto& server : rpc_servers) { + auto reg = ggml_backend_rpc_add_server_fn(server.c_str()); + ggml_backend_register(reg); + } +} + +void add_rpc_device(const char* servers_cstr) { + std::string servers(servers_cstr); + add_rpc_devices(servers); +} + +std::vector sanitize_backend_name_list(std::string name) { + std::vector vec = {}; + if (name == "" || backend_name_exists(name)) { + // single backend + vec.push_back(name); + } else if (name.find(",") != std::string::npos) { + // comma-separated backend names + std::stringstream ss(name); + std::string token; + while (std::getline(ss, token, ',')) { + if (token == "" || backend_name_exists(token)) { + vec.push_back(token); + } else { + LOG_WARN("backend name %s not found, using default", token.c_str()); + vec.push_back(""); + } + } + } else { + vec.push_back(""); + } + return vec; +} + +std::vector> list_backends_vector() { + std::vector> backends; + const int device_count = ggml_backend_dev_count(); + for (int i = 0; i < device_count; i++) { + auto dev = ggml_backend_dev_get(i); + backends.push_back({ggml_backend_dev_name(dev), ggml_backend_dev_description(dev)}); + } + return backends; +} + +SD_API size_t backend_list_size() { + // for C API + size_t buffer_size = 0; + auto backends = list_backends_vector(); + for (auto& backend : backends) { + auto dev_name_size = backend.first.size(); + auto dev_desc_size = backend.second.size(); + buffer_size += dev_name_size + dev_desc_size + 2; // +2 for the separators + } + return buffer_size; +} + +// devices are separated by \n and name and description are separated by \t +SD_API void 
list_backends_to_buffer(char* buffer, size_t buffer_size) { + auto backends = list_backends_vector(); + size_t offset = 0; + for (auto& backend : backends) { + size_t name_size = backend.first.size(); + size_t desc_size = backend.second.size(); + if (offset + name_size + desc_size + 2 > buffer_size) { + break; // Not enough space in the buffer + } + memcpy(buffer + offset, backend.first.c_str(), name_size); + offset += name_size; + buffer[offset++] = '\t'; + memcpy(buffer + offset, backend.second.c_str(), desc_size); + offset += desc_size; + buffer[offset++] = '\n'; + } + if (offset < buffer_size) { + buffer[offset] = '\0'; // Ensure the buffer is null-terminated at the end + } else { + LOG_WARN("Provided buffer size is too small to contain details of all devices."); + buffer[buffer_size - 1] = '\0'; // Ensure the buffer is null-terminated at the end + } +} + /*=============================================== StableDiffusionGGML ================================================*/ class StableDiffusionGGML { public: ggml_backend_t backend = nullptr; // general backend - ggml_backend_t clip_backend = nullptr; + ggml_backend_t diffusion_backend = nullptr; ggml_backend_t control_net_backend = nullptr; ggml_backend_t vae_backend = nullptr; + ggml_backend_t tae_backend = nullptr; + ggml_backend_t pmid_backend = nullptr; + ggml_backend_t vision_backend = nullptr; + + std::vector clip_backends = {nullptr}; SDVersion version; bool vae_decode_only = false; @@ -155,72 +275,36 @@ class StableDiffusionGGML { StableDiffusionGGML() = default; ~StableDiffusionGGML() { - if (clip_backend != backend) { - ggml_backend_free(clip_backend); + if (diffusion_backend && diffusion_backend != backend) { + ggml_backend_free(diffusion_backend); + } + for (auto clip_backend : clip_backends) { + if (clip_backend && clip_backend != backend) { + ggml_backend_free(clip_backend); + } } - if (control_net_backend != backend) { + if (control_net_backend && control_net_backend != backend) { 
ggml_backend_free(control_net_backend); } - if (vae_backend != backend) { + if (tae_backend && tae_backend != vae_backend) { + ggml_backend_free(tae_backend); + } + if (vae_backend && vae_backend != backend) { ggml_backend_free(vae_backend); } - ggml_backend_free(backend); - } - - void init_backend() { -#ifdef SD_USE_CUDA - LOG_DEBUG("Using CUDA backend"); - backend = ggml_backend_cuda_init(0); -#endif -#ifdef SD_USE_METAL - LOG_DEBUG("Using Metal backend"); - backend = ggml_backend_metal_init(); -#endif -#ifdef SD_USE_VULKAN - LOG_DEBUG("Using Vulkan backend"); - size_t device = 0; - const int device_count = ggml_backend_vk_get_device_count(); - if (device_count) { - const char* SD_VK_DEVICE = getenv("SD_VK_DEVICE"); - if (SD_VK_DEVICE != nullptr) { - std::string sd_vk_device_str = SD_VK_DEVICE; - try { - device = std::stoull(sd_vk_device_str); - } catch (const std::invalid_argument&) { - LOG_WARN("SD_VK_DEVICE environment variable is not a valid integer (%s). Falling back to device 0.", SD_VK_DEVICE); - device = 0; - } catch (const std::out_of_range&) { - LOG_WARN("SD_VK_DEVICE environment variable value is out of range for `unsigned long long` type (%s). Falling back to device 0.", SD_VK_DEVICE); - device = 0; - } - if (device >= device_count) { - LOG_WARN("Cannot find targeted vulkan device (%llu). 
Falling back to device 0.", device); - device = 0; - } - } - LOG_INFO("Vulkan: Using device %llu", device); - backend = ggml_backend_vk_init(device); - } - if (!backend) { - LOG_WARN("Failed to initialize Vulkan backend"); + if (vision_backend && vision_backend != backend) { + ggml_backend_free(vision_backend); } -#endif -#ifdef SD_USE_OPENCL - LOG_DEBUG("Using OpenCL backend"); - // ggml_log_set(ggml_log_callback_default, nullptr); // Optional ggml logs - backend = ggml_backend_opencl_init(); - if (!backend) { - LOG_WARN("Failed to initialize OpenCL backend"); + if (backend) { + ggml_backend_free(backend); } -#endif -#ifdef SD_USE_SYCL - LOG_DEBUG("Using SYCL backend"); - backend = ggml_backend_sycl_init(0); -#endif + } - if (!backend) { - LOG_DEBUG("Using CPU backend"); - backend = ggml_backend_cpu_init(); + void log_backends() { + const int device_count = ggml_backend_dev_count(); + for (int i = 0; i < device_count; i++) { + auto dev = ggml_backend_dev_get(i); + LOG_INFO("%s (%s)", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev)); } } @@ -251,7 +335,54 @@ class StableDiffusionGGML { ggml_log_set(ggml_log_callback_default, nullptr); - init_backend(); + log_backends(); + + std::string default_backend_name = get_default_backend_name(); + + std::string override_default_backend_name = sanitize_backend_name(SAFE_STR(sd_ctx_params->main_device)); + + if (override_default_backend_name.size() > 0) { + LOG_INFO("Setting default backend to %s", override_default_backend_name.c_str()); + default_backend_name = override_default_backend_name; + } + + std::string diffusion_backend_name = sanitize_backend_name(SAFE_STR(sd_ctx_params->diffusion_device)); + std::vector clip_backend_names = sanitize_backend_name_list(SAFE_STR(sd_ctx_params->clip_device)); + std::string control_net_backend_name = sanitize_backend_name(SAFE_STR(sd_ctx_params->control_net_device)); + std::string vae_backend_name = sanitize_backend_name(SAFE_STR(sd_ctx_params->vae_device)); + std::string 
tae_backend_name = sanitize_backend_name(SAFE_STR(sd_ctx_params->tae_device)); + std::string pmid_backend_name = sanitize_backend_name(SAFE_STR(sd_ctx_params->photomaker_device)); + std::string vision_backend_name = sanitize_backend_name(SAFE_STR(sd_ctx_params->vision_device)); + + bool diffusion_backend_is_default = diffusion_backend_name.empty() || diffusion_backend_name == default_backend_name; + bool clip_backends_are_default = true; + for (const auto& clip_backend_name : clip_backend_names) { + if (!clip_backend_name.empty() && clip_backend_name != default_backend_name) { + clip_backends_are_default = false; + break; + } + } + bool control_net_backend_is_default = (control_net_backend_name.empty() || control_net_backend_name == default_backend_name); + bool vae_backend_is_default = (vae_backend_name.empty() || vae_backend_name == default_backend_name); + // if tae_backend_name is empty, it will use the same backend as vae + bool tae_backend_is_default = (tae_backend_name.empty() && vae_backend_is_default) || tae_backend_name == default_backend_name; + bool pmid_backend_is_default = (pmid_backend_name.empty() || pmid_backend_name == default_backend_name); + bool vision_backend_is_default = (vision_backend_name.empty() || vision_backend_name == default_backend_name); + + // if some backend is not specified or is the same as the default backend, use the default backend + bool use_default_backend = diffusion_backend_is_default || clip_backends_are_default || control_net_backend_is_default || vae_backend_is_default || tae_backend_is_default || pmid_backend_is_default || vision_backend_is_default; + + if (use_default_backend) { + backend = init_named_backend(override_default_backend_name); + LOG_DEBUG("Loaded default backend %s", ggml_backend_name(backend)); + } + + if (!diffusion_backend_is_default) { + diffusion_backend = init_named_backend(diffusion_backend_name); + LOG_INFO("Using diffusion backend: %s", ggml_backend_name(diffusion_backend)); + } else { + 
diffusion_backend = backend; + } ModelLoader model_loader; @@ -422,21 +553,24 @@ class StableDiffusionGGML { LOG_INFO("Using circular padding for convolutions"); } - bool clip_on_cpu = sd_ctx_params->keep_clip_on_cpu; - { - clip_backend = backend; - if (clip_on_cpu && !ggml_backend_is_cpu(backend)) { - LOG_INFO("CLIP: Using CPU backend"); - clip_backend = ggml_backend_cpu_init(); + if (!clip_backends_are_default) { + clip_backends.clear(); + for (auto clip_backend_name : clip_backend_names) { + auto clip_backend = init_named_backend(clip_backend_name); + LOG_INFO("CLIP: Using %s backend", ggml_backend_name(clip_backend)); + clip_backends.push_back(clip_backend); + } + } else { + clip_backends = {backend}; } if (sd_version_is_sd3(version)) { - cond_stage_model = std::make_shared(clip_backend, + cond_stage_model = std::make_shared(clip_backends, offload_params_to_cpu, tensor_storage_map); - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map); + diffusion_model = std::make_shared(diffusion_backend, + offload_params_to_cpu, + tensor_storage_map); } else if (sd_version_is_flux(version)) { bool is_chroma = false; for (auto pair : tensor_storage_map) { @@ -455,53 +589,53 @@ class StableDiffusionGGML { "--chroma-disable-dit-mask as a workaround."); } - cond_stage_model = std::make_shared(clip_backend, + cond_stage_model = std::make_shared(clip_backends[0], offload_params_to_cpu, tensor_storage_map, sd_ctx_params->chroma_use_t5_mask, sd_ctx_params->chroma_t5_mask_pad); } else if (version == VERSION_OVIS_IMAGE) { - cond_stage_model = std::make_shared(clip_backend, + cond_stage_model = std::make_shared(clip_backends[0], offload_params_to_cpu, tensor_storage_map, version, "", false); } else { - cond_stage_model = std::make_shared(clip_backend, + cond_stage_model = std::make_shared(clip_backends, offload_params_to_cpu, tensor_storage_map); } - diffusion_model = std::make_shared(backend, + diffusion_model = 
std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, version, sd_ctx_params->chroma_use_dit_mask); } else if (sd_version_is_flux2(version)) { bool is_chroma = false; - cond_stage_model = std::make_shared(clip_backend, + cond_stage_model = std::make_shared(clip_backends[0], offload_params_to_cpu, tensor_storage_map, version); - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - version, - sd_ctx_params->chroma_use_dit_mask); + diffusion_model = std::make_shared(diffusion_backend, + offload_params_to_cpu, + tensor_storage_map, + version, + sd_ctx_params->chroma_use_dit_mask); } else if (sd_version_is_wan(version)) { - cond_stage_model = std::make_shared(clip_backend, + cond_stage_model = std::make_shared(clip_backends[0], offload_params_to_cpu, tensor_storage_map, true, 0, true); - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - "model.diffusion_model", - version); + diffusion_model = std::make_shared(diffusion_backend, + offload_params_to_cpu, + tensor_storage_map, + "model.diffusion_model", + version); if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) { - high_noise_diffusion_model = std::make_shared(backend, + high_noise_diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, "model.high_noise_diffusion_model", @@ -510,7 +644,15 @@ class StableDiffusionGGML { if (diffusion_model->get_desc() == "Wan2.1-I2V-14B" || diffusion_model->get_desc() == "Wan2.1-FLF2V-14B" || diffusion_model->get_desc() == "Wan2.1-I2V-1.3B") { - clip_vision = std::make_shared(backend, + if (!vision_backend) { + if (vision_backend_name.length() > 0 && !vision_backend_is_default) { + vision_backend = init_named_backend(vision_backend_name); + LOG_INFO("Vision model: Using %s backend", ggml_backend_name(vision_backend)); + } else { + vision_backend = clip_backends[0]; + } + } + clip_vision = 
std::make_shared(clip_backends[0], offload_params_to_cpu, tensor_storage_map); clip_vision->alloc_params_buffer(); @@ -521,56 +663,56 @@ class StableDiffusionGGML { if (!vae_decode_only) { enable_vision = true; } - cond_stage_model = std::make_shared(clip_backend, + cond_stage_model = std::make_shared(clip_backends[0], offload_params_to_cpu, tensor_storage_map, version, "", enable_vision); - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - "model.diffusion_model", - version, - sd_ctx_params->qwen_image_zero_cond_t); + diffusion_model = std::make_shared(diffusion_backend, + offload_params_to_cpu, + tensor_storage_map, + "model.diffusion_model", + version, + sd_ctx_params->qwen_image_zero_cond_t); } else if (sd_version_is_anima(version)) { - cond_stage_model = std::make_shared(clip_backend, + cond_stage_model = std::make_shared(clip_backends[0], offload_params_to_cpu, tensor_storage_map); diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - "model.diffusion_model"); + offload_params_to_cpu, + tensor_storage_map, + "model.diffusion_model"); } else if (sd_version_is_z_image(version)) { - cond_stage_model = std::make_shared(clip_backend, + cond_stage_model = std::make_shared(clip_backends[0], offload_params_to_cpu, tensor_storage_map, version); - diffusion_model = std::make_shared(backend, - offload_params_to_cpu, - tensor_storage_map, - "model.diffusion_model", - version); + diffusion_model = std::make_shared(diffusion_backend, + offload_params_to_cpu, + tensor_storage_map, + "model.diffusion_model", + version); } else { // SD1.x SD2.x SDXL std::map embbeding_map; for (uint32_t i = 0; i < sd_ctx_params->embedding_count; i++) { embbeding_map.emplace(SAFE_STR(sd_ctx_params->embeddings[i].name), SAFE_STR(sd_ctx_params->embeddings[i].path)); } if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) { - cond_stage_model = std::make_shared(clip_backend, + cond_stage_model = 
std::make_shared(clip_backends, offload_params_to_cpu, tensor_storage_map, embbeding_map, version, PM_VERSION_2); } else { - cond_stage_model = std::make_shared(clip_backend, + cond_stage_model = std::make_shared(clip_backends, offload_params_to_cpu, tensor_storage_map, embbeding_map, version); } - diffusion_model = std::make_shared(backend, + diffusion_model = std::make_shared(diffusion_backend, offload_params_to_cpu, tensor_storage_map, version); @@ -595,18 +737,22 @@ class StableDiffusionGGML { high_noise_diffusion_model->get_param_tensors(tensors); } - if (sd_ctx_params->keep_vae_on_cpu && !ggml_backend_is_cpu(backend)) { - LOG_INFO("VAE Autoencoder: Using CPU backend"); - vae_backend = ggml_backend_cpu_init(); - } else { - vae_backend = backend; + vae_backend = backend; + if (!vae_backend_is_default) { + vae_backend = init_named_backend(vae_backend_name); + LOG_INFO("VAE Autoencoder: Using %s backend", ggml_backend_name(vae_backend)); + } + tae_backend = vae_backend; + if (tae_backend_name.length() > 0 && tae_backend_name != vae_backend_name) { + tae_backend = init_named_backend(tae_backend_name); + LOG_INFO("Tiny Autoencoder: Using %s backend", ggml_backend_name(tae_backend)); } auto create_tae = [&]() -> std::shared_ptr { if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { - return std::make_shared(vae_backend, + return std::make_shared(tae_backend, offload_params_to_cpu, tensor_storage_map, "decoder", @@ -614,7 +760,7 @@ class StableDiffusionGGML { version); } else { - auto model = std::make_shared(vae_backend, + auto model = std::make_shared(tae_backend, offload_params_to_cpu, tensor_storage_map, "decoder.layers", @@ -687,14 +833,13 @@ class StableDiffusionGGML { } if (strlen(SAFE_STR(sd_ctx_params->control_net_path)) > 0) { - ggml_backend_t controlnet_backend = nullptr; - if (sd_ctx_params->keep_control_net_on_cpu && !ggml_backend_is_cpu(backend)) { - LOG_DEBUG("ControlNet: Using CPU backend"); - 
controlnet_backend = ggml_backend_cpu_init(); + if (!control_net_backend_is_default) { + control_net_backend = init_named_backend(control_net_backend_name); + LOG_INFO("ControlNet: Using %s backend", ggml_backend_name(control_net_backend)); } else { - controlnet_backend = backend; + control_net_backend = backend; } - control_net = std::make_shared(controlnet_backend, + control_net = std::make_shared(control_net_backend, offload_params_to_cpu, tensor_storage_map, version); @@ -703,9 +848,15 @@ class StableDiffusionGGML { control_net->set_conv2d_direct_enabled(true); } } - + pmid_backend = backend; + if (!pmid_backend_is_default) { + pmid_backend = init_named_backend(pmid_backend_name); + LOG_INFO("PhotoMaker: Using %s backend", ggml_backend_name(pmid_backend)); + } else { + pmid_backend = backend; + } if (strstr(SAFE_STR(sd_ctx_params->photo_maker_path), "v2")) { - pmid_model = std::make_shared(backend, + pmid_model = std::make_shared(pmid_backend, offload_params_to_cpu, tensor_storage_map, "pmid", @@ -713,21 +864,27 @@ class StableDiffusionGGML { PM_VERSION_2); LOG_INFO("using PhotoMaker Version 2"); } else { - pmid_model = std::make_shared(backend, + pmid_model = std::make_shared(pmid_backend, offload_params_to_cpu, tensor_storage_map, "pmid", version); } if (strlen(SAFE_STR(sd_ctx_params->photo_maker_path)) > 0) { - pmid_lora = std::make_shared("pmid", backend, sd_ctx_params->photo_maker_path, "", version); + pmid_lora = std::make_shared("pmid", diffusion_backend, sd_ctx_params->photo_maker_path, "", version); auto lora_tensor_filter = [&](const std::string& tensor_name) { if (starts_with(tensor_name, "lora.model")) { return true; } return false; }; - if (!pmid_lora->load_from_file(n_threads, lora_tensor_filter)) { + int n_th = n_threads; +#ifdef GGML_RPC + if (ggml_backend_is_rpc(diffusion_backend)) { + n_th = 1; // avoid multi-thread for loading to remote + } +#endif + if (!pmid_lora->load_from_file(n_th, lora_tensor_filter)) { LOG_WARN("load photomaker lora 
tensors from %s failed", sd_ctx_params->photo_maker_path); return false; } @@ -819,7 +976,22 @@ class StableDiffusionGGML { if (version == VERSION_SVD) { ignore_tensors.insert("conditioner.embedders.3"); } - bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads, sd_ctx_params->enable_mmap); + int n_th = n_threads; +#ifdef GGML_RPC + // TODO: maybe set it to 1 threads only for model parts that are on remote? + bool is_any_clip_rpc = false; + for (auto& backend : clip_backends) { + if (ggml_backend_is_rpc(backend)) { + is_any_clip_rpc = true; + } + } + // I think those are all the backends that should get sent data to when calling model_loader.load_tensors() + if (is_any_clip_rpc || ggml_backend_is_rpc(diffusion_backend) || ggml_backend_is_rpc(vae_backend) || ggml_backend_is_rpc(vision_backend) || ggml_backend_is_rpc(pmid_backend)) { + LOG_DEBUG("Using single-thread for tensor loading because RPC backend is used"); + n_th = 1; // avoid multi-thread for loading to remote + } +#endif + bool success = model_loader.load_tensors(tensors, ignore_tensors, n_th, sd_ctx_params->enable_mmap); if (!success) { LOG_ERROR("load tensors from model loader failed"); ggml_free(ctx); @@ -841,7 +1013,13 @@ class StableDiffusionGGML { } size_t control_net_params_mem_size = 0; if (control_net) { - if (!control_net->load_from_file(SAFE_STR(sd_ctx_params->control_net_path), n_threads)) { + int n_th = n_threads; +#ifdef GGML_RPC + if (ggml_backend_is_rpc(control_net_backend)) { + n_th = 1; // avoid multi-thread for loading to remote + } +#endif + if (!control_net->load_from_file(SAFE_STR(sd_ctx_params->control_net_path), n_th)) { return false; } control_net_params_mem_size = control_net->get_params_buffer_size(); @@ -853,13 +1031,15 @@ class StableDiffusionGGML { size_t total_params_ram_size = 0; size_t total_params_vram_size = 0; - if (ggml_backend_is_cpu(clip_backend)) { + + // TODO: split by individual text encoders + if (ggml_backend_is_cpu(clip_backends[0])) { 
total_params_ram_size += clip_params_mem_size + pmid_params_mem_size; } else { total_params_vram_size += clip_params_mem_size + pmid_params_mem_size; } - if (ggml_backend_is_cpu(backend)) { + if (ggml_backend_is_cpu(diffusion_backend)) { total_params_ram_size += unet_params_mem_size; } else { total_params_vram_size += unet_params_mem_size; @@ -885,15 +1065,16 @@ class StableDiffusionGGML { total_params_vram_size / 1024.0 / 1024.0, total_params_ram_size / 1024.0 / 1024.0, clip_params_mem_size / 1024.0 / 1024.0, - ggml_backend_is_cpu(clip_backend) ? "RAM" : "VRAM", + // TODO: split + ggml_backend_is_cpu(clip_backends[0]) ? "RAM" : "VRAM", unet_params_mem_size / 1024.0 / 1024.0, - ggml_backend_is_cpu(backend) ? "RAM" : "VRAM", + ggml_backend_is_cpu(diffusion_backend) ? "RAM" : "VRAM", vae_params_mem_size / 1024.0 / 1024.0, ggml_backend_is_cpu(vae_backend) ? "RAM" : "VRAM", control_net_params_mem_size / 1024.0 / 1024.0, ggml_backend_is_cpu(control_net_backend) ? "RAM" : "VRAM", pmid_params_mem_size / 1024.0 / 1024.0, - ggml_backend_is_cpu(clip_backend) ? "RAM" : "VRAM"); + ggml_backend_is_cpu(pmid_backend) ? "RAM" : "VRAM"); } // init denoiser @@ -1035,7 +1216,13 @@ class StableDiffusionGGML { LOG_DEBUG("high noise lora: %s", lora_path.c_str()); } auto lora = std::make_shared(lora_id, backend, lora_path, is_high_noise ? 
"model.high_noise_" : "", version); - if (!lora->load_from_file(n_threads, lora_tensor_filter)) { + int n_th = n_threads; +#ifdef GGML_RPC + if (ggml_backend_is_rpc(backend)) { + n_th = 1; // avoid multi-thread for loading to remote + } +#endif + if (!lora->load_from_file(n_th, lora_tensor_filter)) { LOG_WARN("load lora tensors from %s failed", lora_path.c_str()); return nullptr; } @@ -1071,15 +1258,59 @@ class StableDiffusionGGML { } for (auto& kv : lora_state_diff) { - int64_t t0 = ggml_time_ms(); + bool applied = false; + int64_t t0 = ggml_time_ms(); + auto lora_tensor_filter_diff = [&](const std::string& tensor_name) { + if (is_diffusion_model_name(tensor_name)) { + return true; + } + return false; + }; - auto lora = load_lora_model_from_file(kv.first, kv.second, backend); - if (!lora || lora->lora_tensors.empty()) { - continue; + LOG_INFO("applying lora to diffusion model"); + auto lora = load_lora_model_from_file(kv.first, kv.second, diffusion_backend, lora_tensor_filter_diff); + if (lora && !lora->lora_tensors.empty()) { + lora->apply(tensors, version, n_threads); + lora->free_params_buffer(); + applied = true; + } + + for (int i = 0; i < cond_stage_model->model_count; i++) { + auto lora_tensor_filter_cond = [&](const std::string& tensor_name) { + if (is_cond_stage_model_name(tensor_name)) { + return cond_stage_model->is_cond_stage_model_name_at_index(tensor_name, i); + } + return false; + }; + // TODO: split by model + LOG_INFO("applying lora to text encoder (%d)", i); + auto backend = cond_stage_model->get_params_backend_at_index(i); + lora = load_lora_model_from_file(kv.first, kv.second, backend, lora_tensor_filter_cond); + if (lora && !lora->lora_tensors.empty()) { + lora->apply(tensors, version, n_threads); + lora->free_params_buffer(); + applied = true; + } + } + + auto lora_tensor_filter_first = [&](const std::string& tensor_name) { + if (is_first_stage_model_name(tensor_name)) { + return true; + } + return false; + }; + LOG_INFO("applying lora to 
first stage model"); + auto first_stage_backend = first_stage_model->get_params_backend(); + lora = load_lora_model_from_file(kv.first, kv.second, first_stage_backend, lora_tensor_filter_first); + if (lora && !lora->lora_tensors.empty()) { + lora->apply(tensors, version, n_threads); + lora->free_params_buffer(); + applied = true; } - lora->apply(tensors, version, n_threads); - lora->free_params_buffer(); + if (!applied) { + continue; + } int64_t t1 = ggml_time_ms(); LOG_INFO("lora '%s' applied, taking %.2fs", kv.first.c_str(), (t1 - t0) * 1.0f / 1000); @@ -1120,23 +1351,27 @@ class StableDiffusionGGML { lora_state_diff.erase(iter); } } - cond_stage_lora_models = lora_models; - auto lora_tensor_filter = [&](const std::string& tensor_name) { - if (is_cond_stage_model_name(tensor_name)) { - return true; - } - return false; - }; - for (auto& kv : lora_state_diff) { - const std::string& lora_id = kv.first; - float multiplier = kv.second; + cond_stage_lora_models = lora_models; - auto lora = load_lora_model_from_file(lora_id, multiplier, clip_backend, lora_tensor_filter); - if (lora && !lora->lora_tensors.empty()) { - lora->preprocess_lora_tensors(tensors); - cond_stage_lora_models.push_back(lora); + for (int i = 0; i < cond_stage_model->model_count; i++) { + auto lora_tensor_filter_cond = [&](const std::string& tensor_name) { + if (is_cond_stage_model_name(tensor_name)) { + return cond_stage_model->is_cond_stage_model_name_at_index(tensor_name, i); + } + return false; + }; + for (auto& kv : lora_state_diff) { + const std::string& lora_id = kv.first; + float multiplier = kv.second; + auto backend = cond_stage_model->get_runtime_backend_at_index(i); + auto lora = load_lora_model_from_file(kv.first, kv.second, backend, lora_tensor_filter_cond); + if (lora && !lora->lora_tensors.empty()) { + lora->preprocess_lora_tensors(tensors); + cond_stage_lora_models.push_back(lora); + } } } + auto multi_lora_adapter = std::make_shared(cond_stage_lora_models); 
cond_stage_model->set_weight_adapter(multi_lora_adapter); } @@ -1163,7 +1398,7 @@ class StableDiffusionGGML { const std::string& lora_name = kv.first; float multiplier = kv.second; - auto lora = load_lora_model_from_file(lora_name, multiplier, backend, lora_tensor_filter); + auto lora = load_lora_model_from_file(lora_name, multiplier, diffusion_backend, lora_tensor_filter); if (lora && !lora->lora_tensors.empty()) { lora->preprocess_lora_tensors(tensors); diffusion_lora_models.push_back(lora); @@ -1388,11 +1623,11 @@ class StableDiffusionGGML { void* step_callback_data, bool is_noisy) { if (preview_mode == PREVIEW_PROJ) { - int patch_sz = 1; - const float(*latent_rgb_proj)[3] = nullptr; - float* latent_rgb_bias = nullptr; - bool is_video = preview_latent_tensor_is_video(latents); - uint32_t dim = is_video ? static_cast(latents.shape()[3]) : static_cast(latents.shape()[2]); + int patch_sz = 1; + const float (*latent_rgb_proj)[3] = nullptr; + float* latent_rgb_bias = nullptr; + bool is_video = preview_latent_tensor_is_video(latents); + uint32_t dim = is_video ? 
static_cast(latents.shape()[3]) : static_cast(latents.shape()[2]); if (dim == 128) { if (sd_version_is_flux2(version)) { @@ -2133,9 +2368,6 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) { sd_ctx_params->lora_apply_mode = LORA_APPLY_AUTO; sd_ctx_params->offload_params_to_cpu = false; sd_ctx_params->enable_mmap = false; - sd_ctx_params->keep_clip_on_cpu = false; - sd_ctx_params->keep_control_net_on_cpu = false; - sd_ctx_params->keep_vae_on_cpu = false; sd_ctx_params->diffusion_flash_attn = false; sd_ctx_params->circular_x = false; sd_ctx_params->circular_y = false; @@ -2149,7 +2381,7 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { if (!buf) return nullptr; buf[0] = '\0'; - + // TODO devices snprintf(buf + strlen(buf), 4096 - strlen(buf), "model_path: %s\n" "clip_l_path: %s\n" @@ -2173,9 +2405,6 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { "sampler_rng_type: %s\n" "prediction: %s\n" "offload_params_to_cpu: %s\n" - "keep_clip_on_cpu: %s\n" - "keep_control_net_on_cpu: %s\n" - "keep_vae_on_cpu: %s\n" "flash_attn: %s\n" "diffusion_flash_attn: %s\n" "circular_x: %s\n" @@ -2205,9 +2434,6 @@ char* sd_ctx_params_to_str(const sd_ctx_params_t* sd_ctx_params) { sd_rng_type_name(sd_ctx_params->sampler_rng_type), sd_prediction_name(sd_ctx_params->prediction), BOOL_STR(sd_ctx_params->offload_params_to_cpu), - BOOL_STR(sd_ctx_params->keep_clip_on_cpu), - BOOL_STR(sd_ctx_params->keep_control_net_on_cpu), - BOOL_STR(sd_ctx_params->keep_vae_on_cpu), BOOL_STR(sd_ctx_params->flash_attn), BOOL_STR(sd_ctx_params->diffusion_flash_attn), BOOL_STR(sd_ctx_params->circular_x), @@ -2655,7 +2881,7 @@ struct SamplePlan { if (sample_params->custom_sigmas_count > 0) { sigmas = std::vector(sample_params->custom_sigmas, - sample_params->custom_sigmas + sample_params->custom_sigmas_count); + sample_params->custom_sigmas + sample_params->custom_sigmas_count); total_steps = static_cast(sigmas.size()) - 1; LOG_WARN("total_steps != 
custom_sigmas_count - 1, set total_steps to %d", total_steps); if (sample_steps >= total_steps) { diff --git a/src/t5.hpp b/src/t5.hpp index f64d0b6d7..f9f271ba0 100644 --- a/src/t5.hpp +++ b/src/t5.hpp @@ -1,1036 +1,1036 @@ -#ifndef __T5_HPP__ -#define __T5_HPP__ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "darts.h" -#include "ggml_extend.hpp" -#include "json.hpp" -#include "model.h" -#include "vocab/vocab.h" - -// Port from: https://github.com/google/sentencepiece/blob/master/src/unigram_model.h -// and https://github.com/google/sentencepiece/blob/master/src/unigram_model.h. -// Original License: https://github.com/google/sentencepiece/blob/master/LICENSE -// -// Since tokenization is not the bottleneck in SD, performance was not a major consideration -// during the migration. -class MetaspacePreTokenizer { -private: - std::string replacement; - bool add_prefix_space; - -public: - MetaspacePreTokenizer(const std::string replacement = " ", bool add_prefix_space = true) - : replacement(replacement), add_prefix_space(add_prefix_space) {} - - std::string tokenize(const std::string& input) const { - std::string tokens; - std::stringstream ss(input); - - if (add_prefix_space) { - tokens += replacement; - } - - std::string token; - bool firstToken = true; - while (std::getline(ss, token, ' ')) { - if (!firstToken) - tokens += replacement + token; - else - tokens += token; - - firstToken = false; - } - - return tokens; - } -}; - -using EncodeResult = std::vector>; -class T5UniGramTokenizer { -public: - enum Status { - OK, - NO_PIECES_LOADED, - NO_ENTRY_FOUND, - BUILD_DOUBLE_ARRAY_FAILED, - PIECE_ALREADY_DEFINED, - INVLIAD_JSON - }; - -protected: - MetaspacePreTokenizer pre_tokenizer; - - // all pairs - std::vector> piece_score_pairs; - - float min_score_ = 0.0; - float max_score_ = 0.0; - std::unique_ptr trie_; - - // Maximum size of the return value of Trie, which corresponds - // to the maximum size of shared common 
prefix in the sentence pieces. - int trie_results_size_; - // unknown id. - int unk_id_ = 2; - std::string eos_token_ = ""; - int eos_id_ = 1; - int pad_id_ = 0; - // status. - Status status_ = OK; - - float kUnkPenalty = 10.0; - - std::string replacement; - bool add_prefix_space = true; - - void InitializePieces(const std::string& json_str) { - nlohmann::json data; - - try { - data = nlohmann::json::parse(json_str); - } catch (const nlohmann::json::parse_error&) { - status_ = INVLIAD_JSON; - return; - } - if (!data.contains("model")) { - status_ = INVLIAD_JSON; - return; - } - nlohmann::json model = data["model"]; - if (!model.contains("vocab")) { - status_ = INVLIAD_JSON; - return; - } - if (model.contains("unk_id")) { - unk_id_ = model["unk_id"]; - } - - replacement = data["pre_tokenizer"]["replacement"]; - add_prefix_space = data["pre_tokenizer"]["add_prefix_space"]; - - pre_tokenizer = MetaspacePreTokenizer(replacement, add_prefix_space); - - for (const auto& item : model["vocab"]) { - if (item.size() != 2 || !item[0].is_string() || !item[1].is_number_float()) { - status_ = INVLIAD_JSON; - return; - } - std::string piece = item[0]; - if (piece.empty()) { - piece = ""; - } - float score = item[1]; - piece_score_pairs.emplace_back(piece, score); - } - } - - // Builds a Trie index. - void BuildTrie(std::vector>* pieces) { - if (status_ != OK) - return; - - if (pieces->empty()) { - status_ = NO_PIECES_LOADED; - return; - } - - // sort by sentencepiece since DoubleArray::build() - // only accepts sorted strings. - sort(pieces->begin(), pieces->end()); - - // Makes key/value set for DoubleArrayTrie. - std::vector key(pieces->size()); - std::vector value(pieces->size()); - for (size_t i = 0; i < pieces->size(); ++i) { - // LOG_DEBUG("%s %d", (*pieces)[i].first.c_str(), (*pieces)[i].second); - key[i] = (*pieces)[i].first.data(); // sorted piece. 
- value[i] = (*pieces)[i].second; // vocab_id - } - - trie_ = std::unique_ptr(new Darts::DoubleArray()); - if (trie_->build(key.size(), const_cast(&key[0]), nullptr, - &value[0]) != 0) { - status_ = BUILD_DOUBLE_ARRAY_FAILED; - return; - } - - // Computes the maximum number of shared prefixes in the trie. - const int kMaxTrieResultsSize = 1024; - std::vector results( - kMaxTrieResultsSize); - trie_results_size_ = 0; - for (const auto& p : *pieces) { - const size_t num_nodes = trie_->commonPrefixSearch( - p.first.data(), results.data(), results.size(), p.first.size()); - trie_results_size_ = std::max(trie_results_size_, static_cast(num_nodes)); - } - - if (trie_results_size_ == 0) - status_ = NO_ENTRY_FOUND; - } - - // Non-virtual (inlined) implementation for faster execution. - inline float GetScoreInlined(int id) const { - return piece_score_pairs[id].second; - } - - inline bool IsUnusedInlined(int id) const { - return false; // TODO - } - - inline bool IsUserDefinedInlined(int id) const { - return false; // TODO - } - - inline size_t OneCharLen(const char* src) const { - return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[(*src & 0xFF) >> 4]; - } - - // The optimized Viterbi encode. - // Main differences from the original function: - // 1. Memorizes the best path at each postion so far, - // 2. No need to store the Lattice nodes, - // 3. Works in utf-8 directly, - // 4. Defines a new struct with fewer fields than Lattice, - // 5. Does not depend on `class Lattice` nor call `SetSentence()`, - // `PopulateNodes()`, or `Viterbi()`. It does everything in one function. - // For detailed explanations please see the comments inside the function body. - EncodeResult EncodeOptimized(const std::string& normalized) const { - // An optimized Viterbi algorithm for unigram language models. Benchmarking - // results show that it generates almost identical outputs and achieves 2.1x - // speedup on average for 102 languages compared to the original - // implementation. 
It's based on the following three ideas: - // - // 1. Because it uses the *unigram* model: - // best_score(x1, x2, ... xt) = best_score(x1, x2, ... x{t-1}) + score(xt) - // Deciding the best path (and score) can be decoupled into two isolated - // terms: (a) the best path ended before the last token `best_score(x1, x2, ...)` - // x{t-1})`, and (b) the last token and its `score(xt)`. The two terms are - // not related to each other at all. - // - // Therefore, we can compute once and store the *best_path ending at - // each character position*. In this way, when we know best_path_ends_at[M], - // we can reuse it to compute all the best_path_ends_at_[...] where the last - // token starts at the same character position M. - // - // This improves the time complexity from O(n*k*k) to O(n*k) because it - // eliminates the extra loop of recomputing the best path ending at the same - // position, where n is the input length and k is the maximum number of tokens - // that can be recognized starting at each position. - // - // 2. Again, because it uses the *unigram* model, we don't need to actually - // store the lattice nodes. We still recognize all the tokens and lattice - // nodes from the input, but along identifying them, we use and discard them - // on the fly. There is no need to actually store them for best path Viterbi - // decoding. The only thing we need to store is the best_path ending at - // each character position. - // - // This improvement reduces the things needed to store in memory from O(n*k) - // to O(n), where n is the input length and k is the maximum number of tokens - // that can be recognized starting at each position. - // - // It also avoids the need of dynamic-size lattice node pool, because the - // number of things to store is fixed as n. - // - // 3. SentencePiece is designed to work with unicode, taking utf-8 encoding - // inputs. In the original implementation, the lattice positions are based on - // unicode positions. 
A mapping from unicode position to the utf-8 position is - // maintained to recover the utf-8 string piece. - // - // We found that it is sufficient and beneficial to directly work with utf-8 - // positions: - // - // Firstly, it saves the conversion and mapping between unicode positions and - // utf-8 positions. - // - // Secondly, it reduces the number of fields we need to maintain in the - // node/path structure. Specifically, there are 8 fields defined in - // `Lattice::Node` used by the original encoder, but here in the optimized - // encoder we only need to define 3 fields in `BestPathNode`. - - if (status() != OK || normalized.empty()) { - return {}; - } - // Represents the last node of the best path. - struct BestPathNode { - int id = -1; // The vocab id. (maybe -1 for UNK) - float best_path_score = - 0; // The total score of the best path ending at this node. - int starts_at = - -1; // The starting position (in utf-8) of this node. The entire best - // path can be constructed by backtracking along this link. - }; - const int size = static_cast(normalized.size()); - const float unk_score = min_score() - kUnkPenalty; - // The ends are exclusive. - std::vector best_path_ends_at(size + 1); - // Generate lattice on-the-fly (not stored) and update best_path_ends_at. - int starts_at = 0; - while (starts_at < size) { - std::size_t node_pos = 0; - std::size_t key_pos = starts_at; - const auto best_path_score_till_here = - best_path_ends_at[starts_at].best_path_score; - bool has_single_node = false; - const int mblen = - std::min(static_cast(OneCharLen(normalized.data() + starts_at)), - size - starts_at); - while (key_pos < size) { - const int ret = - trie_->traverse(normalized.data(), node_pos, key_pos, key_pos + 1); - if (ret == -2) - break; - if (ret >= 0) { - if (IsUnusedInlined(ret)) - continue; - // Update the best path node. 
- auto& target_node = best_path_ends_at[key_pos]; - const auto length = (key_pos - starts_at); - // User defined symbol receives extra bonus to always be selected. - const auto score = IsUserDefinedInlined(ret) - ? (length * max_score_ - 0.1) - : GetScoreInlined(ret); - const auto candidate_best_path_score = - score + best_path_score_till_here; - if (target_node.starts_at == -1 || - candidate_best_path_score > target_node.best_path_score) { - target_node.best_path_score = static_cast(candidate_best_path_score); - target_node.starts_at = starts_at; - target_node.id = ret; - } - if (!has_single_node && length == mblen) { - has_single_node = true; - } - } - } - if (!has_single_node) { - auto& target_node = best_path_ends_at[starts_at + mblen]; - const auto candidate_best_path_score = - unk_score + best_path_score_till_here; - if (target_node.starts_at == -1 || - candidate_best_path_score > target_node.best_path_score) { - target_node.best_path_score = candidate_best_path_score; - target_node.starts_at = starts_at; - target_node.id = unk_id_; - } - } - // Move by one unicode character. - starts_at += mblen; - } - // Backtrack to identify the best path. 
- EncodeResult results; - int ends_at = size; - while (ends_at > 0) { - const auto& node = best_path_ends_at[ends_at]; - results.emplace_back( - normalized.substr(node.starts_at, ends_at - node.starts_at), node.id); - ends_at = node.starts_at; - } - std::reverse(results.begin(), results.end()); - return results; - } - -public: - explicit T5UniGramTokenizer(bool is_umt5 = false) { - if (is_umt5) { - InitializePieces(load_umt5_tokenizer_json()); - } else { - InitializePieces(load_t5_tokenizer_json()); - } - - min_score_ = FLT_MAX; - max_score_ = FLT_MIN; - - std::vector> pieces; - for (int i = 0; i < piece_score_pairs.size(); i++) { - const auto& sp = piece_score_pairs[i]; - - min_score_ = std::min(min_score_, sp.second); - max_score_ = std::max(max_score_, sp.second); - - pieces.emplace_back(sp.first, i); - } - - BuildTrie(&pieces); - } - ~T5UniGramTokenizer(){}; - - std::string Normalize(const std::string& input) const { - // Ref: https://github.com/huggingface/tokenizers/blob/1ff56c0c70b045f0cd82da1af9ac08cd4c7a6f9f/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py#L29 - // TODO: nmt-nfkc - std::string normalized = std::regex_replace(input, std::regex(" {2,}"), " "); - return normalized; - } - - std::vector Encode(const std::string& input, bool append_eos_if_not_present = true) const { - std::string normalized = Normalize(input); - normalized = pre_tokenizer.tokenize(normalized); - EncodeResult result = EncodeOptimized(normalized); - if (result.size() > 0 && append_eos_if_not_present) { - auto item = result[result.size() - 1]; - if (item.first != eos_token_) { - result.emplace_back(eos_token_, eos_id_); - } - } - std::vector tokens; - for (auto item : result) { - tokens.push_back(item.second); - } - return tokens; - } - - void pad_tokens(std::vector& tokens, - std::vector& weights, - std::vector* attention_mask, - size_t max_length = 0, - bool padding = false) { - if (max_length > 0 && padding) { - size_t orig_token_num = tokens.size() - 
1; - size_t n = static_cast(std::ceil(orig_token_num * 1.0 / (max_length - 1))); - if (n == 0) { - n = 1; - } - size_t length = max_length * n; - LOG_DEBUG("token length: %llu", length); - std::vector new_tokens; - std::vector new_weights; - std::vector new_attention_mask; - int token_idx = 0; - for (int i = 0; i < length; i++) { - if (token_idx >= orig_token_num) { - break; - } - if (attention_mask != nullptr) { - new_attention_mask.push_back(0.0); - } - if (i % max_length == max_length - 1) { - new_tokens.push_back(eos_id_); - new_weights.push_back(1.0); - } else { - new_tokens.push_back(tokens[token_idx]); - new_weights.push_back(weights[token_idx]); - token_idx++; - } - } - - new_tokens.push_back(eos_id_); - new_weights.push_back(1.0); - if (attention_mask != nullptr) { - new_attention_mask.push_back(0.0); - } - - tokens = new_tokens; - weights = new_weights; - if (attention_mask != nullptr) { - *attention_mask = new_attention_mask; - } - - if (padding) { - int pad_token_id = pad_id_; - tokens.insert(tokens.end(), length - tokens.size(), pad_token_id); - weights.insert(weights.end(), length - weights.size(), 1.0); - if (attention_mask != nullptr) { - // maybe keep some padding tokens unmasked? - attention_mask->insert(attention_mask->end(), length - attention_mask->size(), -HUGE_VALF); - } - } - } - } - - // Returns the minimum score in sentence pieces. - // min_score() - 10 is used for the cost of unknown sentence. - float min_score() const { return min_score_; } - - // Returns the maximum score in sentence pieces. - // max_score() is used for the cost of user defined symbols. 
- float max_score() const { return max_score_; } - - Status status() const { return status_; } -}; - -class T5LayerNorm : public UnaryBlock { -protected: - int64_t hidden_size; - float eps; - - void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { - enum ggml_type wtype = GGML_TYPE_F32; - params["weight"] = ggml_new_tensor_1d(ctx, wtype, hidden_size); - } - -public: - T5LayerNorm(int64_t hidden_size, - float eps = 1e-06f) - : hidden_size(hidden_size), - eps(eps) {} - - ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { - ggml_tensor* w = params["weight"]; - x = ggml_rms_norm(ctx->ggml_ctx, x, eps); - x = ggml_mul(ctx->ggml_ctx, x, w); - return x; - } -}; - -struct T5DenseActDense : public UnaryBlock { -public: - T5DenseActDense(int64_t model_dim, int64_t ff_dim) { - blocks["wi"] = std::shared_ptr(new Linear(model_dim, ff_dim, false)); - blocks["wo"] = std::shared_ptr(new Linear(ff_dim, model_dim, false)); - } - - ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { - // x: [N, n_token, model_dim] - auto wi = std::dynamic_pointer_cast(blocks["wi"]); - auto wo = std::dynamic_pointer_cast(blocks["wo"]); - - x = wi->forward(ctx, x); - x = ggml_relu_inplace(ctx->ggml_ctx, x); - x = wo->forward(ctx, x); - return x; - } -}; - -struct T5DenseGatedActDense : public UnaryBlock { -public: - T5DenseGatedActDense(int64_t model_dim, int64_t ff_dim) { - blocks["wi_0"] = std::shared_ptr(new Linear(model_dim, ff_dim, false)); - blocks["wi_1"] = std::shared_ptr(new Linear(model_dim, ff_dim, false)); - float scale = 1.f / 32.f; - // The purpose of the scale here is to prevent NaN issues on some backends(CUDA, ...). 
- blocks["wo"] = std::shared_ptr(new Linear(ff_dim, model_dim, false, false, false, scale)); - } - - ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { - // x: [N, n_token, model_dim] - auto wi_0 = std::dynamic_pointer_cast(blocks["wi_0"]); - auto wi_1 = std::dynamic_pointer_cast(blocks["wi_1"]); - auto wo = std::dynamic_pointer_cast(blocks["wo"]); - - auto hidden_gelu = ggml_ext_gelu(ctx->ggml_ctx, wi_0->forward(ctx, x), true); - auto hidden_linear = wi_1->forward(ctx, x); - x = ggml_mul_inplace(ctx->ggml_ctx, hidden_gelu, hidden_linear); - x = wo->forward(ctx, x); - return x; - } -}; - -struct T5LayerFF : public UnaryBlock { -public: - T5LayerFF(int64_t model_dim, int64_t ff_dim) { - blocks["DenseReluDense"] = std::shared_ptr(new T5DenseGatedActDense(model_dim, ff_dim)); - blocks["layer_norm"] = std::shared_ptr(new T5LayerNorm(model_dim)); - } - - ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { - // x: [N, n_token, model_dim] - auto DenseReluDense = std::dynamic_pointer_cast(blocks["DenseReluDense"]); - auto layer_norm = std::dynamic_pointer_cast(blocks["layer_norm"]); - - auto forwarded_states = layer_norm->forward(ctx, x); - forwarded_states = DenseReluDense->forward(ctx, forwarded_states); - x = ggml_add_inplace(ctx->ggml_ctx, forwarded_states, x); - return x; - } -}; - -class T5Attention : public GGMLBlock { -protected: - int64_t model_dim; - int64_t inner_dim; - int64_t num_heads; - bool using_relative_attention_bias; - int64_t relative_attention_num_buckets = 32; - int64_t relative_attention_max_distance = 128; - -public: - T5Attention(int64_t model_dim, - int64_t inner_dim, - int64_t num_heads, - bool using_relative_attention_bias = false) - : model_dim(model_dim), - inner_dim(inner_dim), - num_heads(num_heads), - using_relative_attention_bias(using_relative_attention_bias) { - blocks["q"] = std::shared_ptr(new Linear(model_dim, inner_dim, false)); - blocks["k"] = std::shared_ptr(new Linear(model_dim, inner_dim, 
false)); - blocks["v"] = std::shared_ptr(new Linear(model_dim, inner_dim, false)); - blocks["o"] = std::shared_ptr(new Linear(inner_dim, model_dim, false)); - if (using_relative_attention_bias) { - blocks["relative_attention_bias"] = std::shared_ptr(new Embedding(relative_attention_num_buckets, num_heads)); - } - } - - ggml_tensor* compute_bias(GGMLRunnerContext* ctx, - ggml_tensor* relative_position_bucket) { - auto relative_attention_bias = std::dynamic_pointer_cast(blocks["relative_attention_bias"]); - - auto values = relative_attention_bias->forward(ctx, relative_position_bucket); // shape (query_length, key_length, num_heads) - values = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, values, 2, 0, 1, 3)); // shape (1, num_heads, query_length, key_length) - return values; - } - - // x: [N, n_token, model_dim] - std::pair forward(GGMLRunnerContext* ctx, - ggml_tensor* x, - ggml_tensor* past_bias = nullptr, - ggml_tensor* mask = nullptr, - ggml_tensor* relative_position_bucket = nullptr) { - auto q_proj = std::dynamic_pointer_cast(blocks["q"]); - auto k_proj = std::dynamic_pointer_cast(blocks["k"]); - auto v_proj = std::dynamic_pointer_cast(blocks["v"]); - auto out_proj = std::dynamic_pointer_cast(blocks["o"]); - - int64_t n_head = num_heads; - int64_t d_head = inner_dim / n_head; - - auto q = q_proj->forward(ctx, x); - auto k = k_proj->forward(ctx, x); - auto v = v_proj->forward(ctx, x); - - if (using_relative_attention_bias && relative_position_bucket != nullptr) { - past_bias = compute_bias(ctx, relative_position_bucket); - } - if (past_bias != nullptr) { - if (mask != nullptr) { - mask = ggml_repeat(ctx->ggml_ctx, mask, past_bias); - mask = ggml_add(ctx->ggml_ctx, mask, past_bias); - } else { - mask = past_bias; - } - } - - k = ggml_ext_scale(ctx->ggml_ctx, k, ::sqrtf(static_cast(d_head)), true); - - x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, mask); // [N, n_token, d_head * n_head] - - x = out_proj->forward(ctx, x); // 
[N, n_token, model_dim] - return {x, past_bias}; - } -}; - -struct T5LayerSelfAttention : public GGMLBlock { -public: - T5LayerSelfAttention(int64_t model_dim, - int64_t inner_dim, - int64_t ff_dim, - int64_t num_heads, - bool using_relative_attention_bias) { - blocks["SelfAttention"] = std::shared_ptr(new T5Attention(model_dim, inner_dim, num_heads, using_relative_attention_bias)); - blocks["layer_norm"] = std::shared_ptr(new T5LayerNorm(model_dim)); - } - - std::pair forward(GGMLRunnerContext* ctx, - ggml_tensor* x, - ggml_tensor* past_bias = nullptr, - ggml_tensor* mask = nullptr, - ggml_tensor* relative_position_bucket = nullptr) { - // x: [N, n_token, model_dim] - auto SelfAttention = std::dynamic_pointer_cast(blocks["SelfAttention"]); - auto layer_norm = std::dynamic_pointer_cast(blocks["layer_norm"]); - - auto normed_hidden_state = layer_norm->forward(ctx, x); - auto ret = SelfAttention->forward(ctx, normed_hidden_state, past_bias, mask, relative_position_bucket); - auto output = ret.first; - past_bias = ret.second; - - x = ggml_add_inplace(ctx->ggml_ctx, output, x); - return {x, past_bias}; - } -}; - -struct T5Block : public GGMLBlock { -public: - T5Block(int64_t model_dim, int64_t inner_dim, int64_t ff_dim, int64_t num_heads, bool using_relative_attention_bias) { - blocks["layer.0"] = std::shared_ptr(new T5LayerSelfAttention(model_dim, inner_dim, ff_dim, num_heads, using_relative_attention_bias)); - blocks["layer.1"] = std::shared_ptr(new T5LayerFF(model_dim, ff_dim)); - } - - std::pair forward(GGMLRunnerContext* ctx, - ggml_tensor* x, - ggml_tensor* past_bias = nullptr, - ggml_tensor* mask = nullptr, - ggml_tensor* relative_position_bucket = nullptr) { - // x: [N, n_token, model_dim] - auto layer_0 = std::dynamic_pointer_cast(blocks["layer.0"]); - auto layer_1 = std::dynamic_pointer_cast(blocks["layer.1"]); - - auto ret = layer_0->forward(ctx, x, past_bias, mask, relative_position_bucket); - x = ret.first; - past_bias = ret.second; - x = 
layer_1->forward(ctx, x); - return {x, past_bias}; - } -}; - -struct T5Stack : public GGMLBlock { - int64_t num_layers; - -public: - T5Stack(int64_t num_layers, - int64_t model_dim, - int64_t inner_dim, - int64_t ff_dim, - int64_t num_heads, - bool relative_attention = true) - : num_layers(num_layers) { - for (int i = 0; i < num_layers; i++) { - blocks["block." + std::to_string(i)] = std::shared_ptr(new T5Block(model_dim, inner_dim, ff_dim, num_heads, (!relative_attention || i == 0))); - } - - blocks["final_layer_norm"] = std::shared_ptr(new T5LayerNorm(model_dim)); - } - - ggml_tensor* forward(GGMLRunnerContext* ctx, - ggml_tensor* x, - ggml_tensor* past_bias = nullptr, - ggml_tensor* attention_mask = nullptr, - ggml_tensor* relative_position_bucket = nullptr) { - // x: [N, n_token, model_dim] - for (int i = 0; i < num_layers; i++) { - auto block = std::dynamic_pointer_cast(blocks["block." + std::to_string(i)]); - - auto ret = block->forward(ctx, x, past_bias, attention_mask, relative_position_bucket); - x = ret.first; - past_bias = ret.second; - } - - auto final_layer_norm = std::dynamic_pointer_cast(blocks["final_layer_norm"]); - - x = final_layer_norm->forward(ctx, x); - return x; - } -}; - -struct T5Params { - int64_t num_layers = 24; - int64_t model_dim = 4096; - int64_t ff_dim = 10240; - int64_t num_heads = 64; - int64_t vocab_size = 32128; - bool relative_attention = true; -}; - -struct T5 : public GGMLBlock { - T5Params params; - -public: - T5() {} - T5(T5Params params) - : params(params) { - blocks["encoder"] = std::shared_ptr(new T5Stack(params.num_layers, - params.model_dim, - params.model_dim, - params.ff_dim, - params.num_heads, - params.relative_attention)); - blocks["shared"] = std::shared_ptr(new Embedding(params.vocab_size, - params.model_dim)); - } - - ggml_tensor* forward(GGMLRunnerContext* ctx, - ggml_tensor* input_ids, - ggml_tensor* past_bias = nullptr, - ggml_tensor* attention_mask = nullptr, - ggml_tensor* relative_position_bucket = 
nullptr) { - // input_ids: [N, n_token] - - auto shared = std::dynamic_pointer_cast(blocks["shared"]); - auto encoder = std::dynamic_pointer_cast(blocks["encoder"]); - - auto x = shared->forward(ctx, input_ids); - x = encoder->forward(ctx, x, past_bias, attention_mask, relative_position_bucket); - return x; - } -}; - -struct T5Runner : public GGMLRunner { - T5Params params; - T5 model; - std::vector relative_position_bucket_vec; - - T5Runner(ggml_backend_t backend, - bool offload_params_to_cpu, - const String2TensorStorage& tensor_storage_map, - const std::string prefix, - bool is_umt5 = false) - : GGMLRunner(backend, offload_params_to_cpu) { - if (is_umt5) { - params.vocab_size = 256384; - params.relative_attention = false; - } - model = T5(params); - model.init(params_ctx, tensor_storage_map, prefix); - } - - std::string get_desc() override { - return "t5"; - } - - void get_param_tensors(std::map& tensors, const std::string prefix) { - model.get_param_tensors(tensors, prefix); - } - - ggml_tensor* forward(GGMLRunnerContext* ctx, - ggml_tensor* input_ids, - ggml_tensor* relative_position_bucket, - ggml_tensor* attention_mask = nullptr) { - size_t N = input_ids->ne[1]; - size_t n_token = input_ids->ne[0]; - - auto hidden_states = model.forward(ctx, input_ids, nullptr, attention_mask, relative_position_bucket); // [N, n_token, model_dim] - return hidden_states; - } - - ggml_cgraph* build_graph(const sd::Tensor& input_ids_tensor, - const sd::Tensor& attention_mask_tensor = {}) { - ggml_cgraph* gf = ggml_new_graph(compute_ctx); - ggml_tensor* input_ids = make_input(input_ids_tensor); - ggml_tensor* attention_mask = attention_mask_tensor.empty() ? 
nullptr : make_input(attention_mask_tensor); - - relative_position_bucket_vec = compute_relative_position_bucket(static_cast(input_ids->ne[0]), static_cast(input_ids->ne[0])); - - // for (int i = 0; i < relative_position_bucket_vec.size(); i++) { - // if (i % 77 == 0) { - // printf("\n"); - // } - // printf("%d ", relative_position_bucket_vec[i]); - // } - - auto relative_position_bucket = ggml_new_tensor_2d(compute_ctx, - GGML_TYPE_I32, - input_ids->ne[0], - input_ids->ne[0]); - set_backend_tensor_data(relative_position_bucket, relative_position_bucket_vec.data()); - - auto runner_ctx = get_context(); - ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, relative_position_bucket, attention_mask); - - ggml_build_forward_expand(gf, hidden_states); - - return gf; - } - - sd::Tensor compute(const int n_threads, - const sd::Tensor& input_ids, - const sd::Tensor& attention_mask) { - auto get_graph = [&]() -> ggml_cgraph* { - return build_graph(input_ids, attention_mask); - }; - return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, true), 3); - } - - static std::vector _relative_position_bucket(const std::vector& relative_position, - bool bidirectional = true, - int num_buckets = 32, - int max_distance = 128) { - std::vector relative_buckets(relative_position.size(), 0); - std::vector abs_relative_position = relative_position; - - if (bidirectional) { - num_buckets = num_buckets / 2; - for (size_t i = 0; i < relative_position.size(); ++i) { - if (relative_position[i] > 0) { - relative_buckets[i] += num_buckets; - } - abs_relative_position[i] = std::abs(relative_position[i]); - } - } else { - for (size_t i = 0; i < relative_position.size(); ++i) { - abs_relative_position[i] = std::max(-relative_position[i], 0); - } - } - - int max_exact = num_buckets / 2; - std::vector relative_position_if_large(relative_position.size(), 0); - - for (size_t i = 0; i < relative_position.size(); ++i) { - if (abs_relative_position[i] < max_exact) { - 
relative_buckets[i] += abs_relative_position[i]; - } else { - float log_pos = std::log(static_cast(abs_relative_position[i]) / max_exact); - float log_base = std::log(static_cast(max_distance) / max_exact); - relative_position_if_large[i] = max_exact + static_cast((log_pos / log_base) * (num_buckets - max_exact)); - relative_position_if_large[i] = std::min(relative_position_if_large[i], num_buckets - 1); - relative_buckets[i] += relative_position_if_large[i]; - } - } - - return relative_buckets; - } - - std::vector compute_relative_position_bucket(int query_length, - int key_length) { - std::vector context_position(query_length); - std::vector memory_position(key_length); - - for (int i = 0; i < query_length; ++i) { - context_position[i] = i; - } - for (int i = 0; i < key_length; ++i) { - memory_position[i] = i; - } - - std::vector> relative_position(query_length, std::vector(key_length, 0)); - for (int i = 0; i < query_length; ++i) { - for (int j = 0; j < key_length; ++j) { - relative_position[i][j] = memory_position[j] - context_position[i]; - } - } - - std::vector relative_position_bucket; - for (int i = 0; i < query_length; ++i) { - std::vector result = _relative_position_bucket(relative_position[i], true); - relative_position_bucket.insert(relative_position_bucket.end(), result.begin(), result.end()); - } - - return relative_position_bucket; - } -}; - -struct T5Embedder { - T5UniGramTokenizer tokenizer; - T5Runner model; - - T5Embedder(ggml_backend_t backend, - bool offload_params_to_cpu, - const String2TensorStorage& tensor_storage_map = {}, - const std::string prefix = "", - bool is_umt5 = false) - : model(backend, offload_params_to_cpu, tensor_storage_map, prefix, is_umt5), tokenizer(is_umt5) { - } - - void get_param_tensors(std::map& tensors, const std::string prefix) { - model.get_param_tensors(tensors, prefix); - } - - void alloc_params_buffer() { - model.alloc_params_buffer(); - } - - std::tuple, std::vector, std::vector> tokenize(std::string text, - 
size_t max_length = 0, - bool padding = false) { - auto parsed_attention = parse_prompt_attention(text); - - { - std::stringstream ss; - ss << "["; - for (const auto& item : parsed_attention) { - ss << "['" << item.first << "', " << item.second << "], "; - } - ss << "]"; - LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str()); - } - - std::vector tokens; - std::vector weights; - for (const auto& item : parsed_attention) { - const std::string& curr_text = item.first; - float curr_weight = item.second; - std::vector curr_tokens = tokenizer.Encode(curr_text, false); - tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end()); - weights.insert(weights.end(), curr_tokens.size(), curr_weight); - } - - int EOS_TOKEN_ID = 1; - tokens.push_back(EOS_TOKEN_ID); - weights.push_back(1.0); - - std::vector attention_mask; - - tokenizer.pad_tokens(tokens, weights, &attention_mask, max_length, padding); - - // for (int i = 0; i < tokens.size(); i++) { - // std::cout << tokens[i] << ":" << weights[i] << ", "; - // } - // std::cout << std::endl; - - return {tokens, weights, attention_mask}; - } - - void test() { - ggml_init_params params; - params.mem_size = static_cast(10 * 1024 * 1024); // 10 MB - params.mem_buffer = nullptr; - params.no_alloc = false; - - ggml_context* ctx = ggml_init(params); - GGML_ASSERT(ctx != nullptr); - - { - std::string text("a lovely cat"); - auto tokens_and_weights = tokenize(text, 512, true); - std::vector& tokens = std::get<0>(tokens_and_weights); - std::vector& weights = std::get<1>(tokens_and_weights); - std::vector& masks = std::get<2>(tokens_and_weights); - for (auto token : tokens) { - printf("%d ", token); - } - printf("\n"); - auto input_ids = sd::Tensor::from_vector(tokens); - auto attention_mask = sd::Tensor::from_vector(masks); - sd::Tensor out; - - int64_t t0 = ggml_time_ms(); - auto out_opt = model.compute(8, input_ids, attention_mask); - int64_t t1 = ggml_time_ms(); - - GGML_ASSERT(!out_opt.empty()); - out = 
std::move(out_opt); - print_sd_tensor(out); - LOG_DEBUG("t5 test done in %lldms", t1 - t0); - } - } - - static void load_from_file_and_test(const std::string& file_path) { - // cpu f16: pass - // cpu f32: pass - // cuda f16: pass - // cuda f32: pass - // cuda q8_0: pass - // ggml_backend_t backend = ggml_backend_cuda_init(0); - ggml_backend_t backend = ggml_backend_cpu_init(); - ggml_type model_data_type = GGML_TYPE_F16; - - ModelLoader model_loader; - if (!model_loader.init_from_file_and_convert_name(file_path)) { - LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str()); - return; - } - - auto& tensor_storage_map = model_loader.get_tensor_storage_map(); - for (auto& [name, tensor_storage] : tensor_storage_map) { - if (ends_with(name, "weight")) { - tensor_storage.expected_type = model_data_type; - } - } - - std::shared_ptr t5 = std::make_shared(backend, false, tensor_storage_map, "", true); - - t5->alloc_params_buffer(); - std::map tensors; - t5->get_param_tensors(tensors, ""); - - bool success = model_loader.load_tensors(tensors); - - if (!success) { - LOG_ERROR("load tensors from model loader failed"); - return; - } - - LOG_INFO("t5 model loaded"); - t5->test(); - } -}; - -#endif // __T5_HPP__ +#ifndef __T5_HPP__ +#define __T5_HPP__ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "darts.h" +#include "ggml_extend.hpp" +#include "json.hpp" +#include "model.h" +#include "vocab/vocab.h" + +// Port from: https://github.com/google/sentencepiece/blob/master/src/unigram_model.h +// and https://github.com/google/sentencepiece/blob/master/src/unigram_model.h. +// Original License: https://github.com/google/sentencepiece/blob/master/LICENSE +// +// Since tokenization is not the bottleneck in SD, performance was not a major consideration +// during the migration. 
+class MetaspacePreTokenizer { +private: + std::string replacement; + bool add_prefix_space; + +public: + MetaspacePreTokenizer(const std::string replacement = " ", bool add_prefix_space = true) + : replacement(replacement), add_prefix_space(add_prefix_space) {} + + std::string tokenize(const std::string& input) const { + std::string tokens; + std::stringstream ss(input); + + if (add_prefix_space) { + tokens += replacement; + } + + std::string token; + bool firstToken = true; + while (std::getline(ss, token, ' ')) { + if (!firstToken) + tokens += replacement + token; + else + tokens += token; + + firstToken = false; + } + + return tokens; + } +}; + +using EncodeResult = std::vector>; +class T5UniGramTokenizer { +public: + enum Status { + OK, + NO_PIECES_LOADED, + NO_ENTRY_FOUND, + BUILD_DOUBLE_ARRAY_FAILED, + PIECE_ALREADY_DEFINED, + INVLIAD_JSON + }; + +protected: + MetaspacePreTokenizer pre_tokenizer; + + // all pairs + std::vector> piece_score_pairs; + + float min_score_ = 0.0; + float max_score_ = 0.0; + std::unique_ptr trie_; + + // Maximum size of the return value of Trie, which corresponds + // to the maximum size of shared common prefix in the sentence pieces. + int trie_results_size_; + // unknown id. + int unk_id_ = 2; + std::string eos_token_ = ""; + int eos_id_ = 1; + int pad_id_ = 0; + // status. 
+ Status status_ = OK; + + float kUnkPenalty = 10.0; + + std::string replacement; + bool add_prefix_space = true; + + void InitializePieces(const std::string& json_str) { + nlohmann::json data; + + try { + data = nlohmann::json::parse(json_str); + } catch (const nlohmann::json::parse_error&) { + status_ = INVLIAD_JSON; + return; + } + if (!data.contains("model")) { + status_ = INVLIAD_JSON; + return; + } + nlohmann::json model = data["model"]; + if (!model.contains("vocab")) { + status_ = INVLIAD_JSON; + return; + } + if (model.contains("unk_id")) { + unk_id_ = model["unk_id"]; + } + + replacement = data["pre_tokenizer"]["replacement"]; + add_prefix_space = data["pre_tokenizer"]["add_prefix_space"]; + + pre_tokenizer = MetaspacePreTokenizer(replacement, add_prefix_space); + + for (const auto& item : model["vocab"]) { + if (item.size() != 2 || !item[0].is_string() || !item[1].is_number_float()) { + status_ = INVLIAD_JSON; + return; + } + std::string piece = item[0]; + if (piece.empty()) { + piece = ""; + } + float score = item[1]; + piece_score_pairs.emplace_back(piece, score); + } + } + + // Builds a Trie index. + void BuildTrie(std::vector>* pieces) { + if (status_ != OK) + return; + + if (pieces->empty()) { + status_ = NO_PIECES_LOADED; + return; + } + + // sort by sentencepiece since DoubleArray::build() + // only accepts sorted strings. + sort(pieces->begin(), pieces->end()); + + // Makes key/value set for DoubleArrayTrie. + std::vector key(pieces->size()); + std::vector value(pieces->size()); + for (size_t i = 0; i < pieces->size(); ++i) { + // LOG_DEBUG("%s %d", (*pieces)[i].first.c_str(), (*pieces)[i].second); + key[i] = (*pieces)[i].first.data(); // sorted piece. 
+ value[i] = (*pieces)[i].second; // vocab_id + } + + trie_ = std::unique_ptr(new Darts::DoubleArray()); + if (trie_->build(key.size(), const_cast(&key[0]), nullptr, + &value[0]) != 0) { + status_ = BUILD_DOUBLE_ARRAY_FAILED; + return; + } + + // Computes the maximum number of shared prefixes in the trie. + const int kMaxTrieResultsSize = 1024; + std::vector results( + kMaxTrieResultsSize); + trie_results_size_ = 0; + for (const auto& p : *pieces) { + const size_t num_nodes = trie_->commonPrefixSearch( + p.first.data(), results.data(), results.size(), p.first.size()); + trie_results_size_ = std::max(trie_results_size_, static_cast(num_nodes)); + } + + if (trie_results_size_ == 0) + status_ = NO_ENTRY_FOUND; + } + + // Non-virtual (inlined) implementation for faster execution. + inline float GetScoreInlined(int id) const { + return piece_score_pairs[id].second; + } + + inline bool IsUnusedInlined(int id) const { + return false; // TODO + } + + inline bool IsUserDefinedInlined(int id) const { + return false; // TODO + } + + inline size_t OneCharLen(const char* src) const { + return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[(*src & 0xFF) >> 4]; + } + + // The optimized Viterbi encode. + // Main differences from the original function: + // 1. Memorizes the best path at each position so far, + // 2. No need to store the Lattice nodes, + // 3. Works in utf-8 directly, + // 4. Defines a new struct with fewer fields than Lattice, + // 5. Does not depend on `class Lattice` nor call `SetSentence()`, + // `PopulateNodes()`, or `Viterbi()`. It does everything in one function. + // For detailed explanations please see the comments inside the function body. + EncodeResult EncodeOptimized(const std::string& normalized) const { + // An optimized Viterbi algorithm for unigram language models. Benchmarking + // results show that it generates almost identical outputs and achieves 2.1x + // speedup on average for 102 languages compared to the original + // implementation. 
It's based on the following three ideas: + // + // 1. Because it uses the *unigram* model: + // best_score(x1, x2, ... xt) = best_score(x1, x2, ... x{t-1}) + score(xt) + // Deciding the best path (and score) can be decoupled into two isolated + // terms: (a) the best path ended before the last token `best_score(x1, x2, ...)` + // x{t-1})`, and (b) the last token and its `score(xt)`. The two terms are + // not related to each other at all. + // + // Therefore, we can compute once and store the *best_path ending at + // each character position*. In this way, when we know best_path_ends_at[M], + // we can reuse it to compute all the best_path_ends_at_[...] where the last + // token starts at the same character position M. + // + // This improves the time complexity from O(n*k*k) to O(n*k) because it + // eliminates the extra loop of recomputing the best path ending at the same + // position, where n is the input length and k is the maximum number of tokens + // that can be recognized starting at each position. + // + // 2. Again, because it uses the *unigram* model, we don't need to actually + // store the lattice nodes. We still recognize all the tokens and lattice + // nodes from the input, but along identifying them, we use and discard them + // on the fly. There is no need to actually store them for best path Viterbi + // decoding. The only thing we need to store is the best_path ending at + // each character position. + // + // This improvement reduces the things needed to store in memory from O(n*k) + // to O(n), where n is the input length and k is the maximum number of tokens + // that can be recognized starting at each position. + // + // It also avoids the need of dynamic-size lattice node pool, because the + // number of things to store is fixed as n. + // + // 3. SentencePiece is designed to work with unicode, taking utf-8 encoding + // inputs. In the original implementation, the lattice positions are based on + // unicode positions. 
A mapping from unicode position to the utf-8 position is + // maintained to recover the utf-8 string piece. + // + // We found that it is sufficient and beneficial to directly work with utf-8 + // positions: + // + // Firstly, it saves the conversion and mapping between unicode positions and + // utf-8 positions. + // + // Secondly, it reduces the number of fields we need to maintain in the + // node/path structure. Specifically, there are 8 fields defined in + // `Lattice::Node` used by the original encoder, but here in the optimized + // encoder we only need to define 3 fields in `BestPathNode`. + + if (status() != OK || normalized.empty()) { + return {}; + } + // Represents the last node of the best path. + struct BestPathNode { + int id = -1; // The vocab id. (maybe -1 for UNK) + float best_path_score = + 0; // The total score of the best path ending at this node. + int starts_at = + -1; // The starting position (in utf-8) of this node. The entire best + // path can be constructed by backtracking along this link. + }; + const int size = static_cast(normalized.size()); + const float unk_score = min_score() - kUnkPenalty; + // The ends are exclusive. + std::vector best_path_ends_at(size + 1); + // Generate lattice on-the-fly (not stored) and update best_path_ends_at. + int starts_at = 0; + while (starts_at < size) { + std::size_t node_pos = 0; + std::size_t key_pos = starts_at; + const auto best_path_score_till_here = + best_path_ends_at[starts_at].best_path_score; + bool has_single_node = false; + const int mblen = + std::min(static_cast(OneCharLen(normalized.data() + starts_at)), + size - starts_at); + while (key_pos < size) { + const int ret = + trie_->traverse(normalized.data(), node_pos, key_pos, key_pos + 1); + if (ret == -2) + break; + if (ret >= 0) { + if (IsUnusedInlined(ret)) + continue; + // Update the best path node. 
+ auto& target_node = best_path_ends_at[key_pos]; + const auto length = (key_pos - starts_at); + // User defined symbol receives extra bonus to always be selected. + const auto score = IsUserDefinedInlined(ret) + ? (length * max_score_ - 0.1) + : GetScoreInlined(ret); + const auto candidate_best_path_score = + score + best_path_score_till_here; + if (target_node.starts_at == -1 || + candidate_best_path_score > target_node.best_path_score) { + target_node.best_path_score = static_cast(candidate_best_path_score); + target_node.starts_at = starts_at; + target_node.id = ret; + } + if (!has_single_node && length == mblen) { + has_single_node = true; + } + } + } + if (!has_single_node) { + auto& target_node = best_path_ends_at[starts_at + mblen]; + const auto candidate_best_path_score = + unk_score + best_path_score_till_here; + if (target_node.starts_at == -1 || + candidate_best_path_score > target_node.best_path_score) { + target_node.best_path_score = candidate_best_path_score; + target_node.starts_at = starts_at; + target_node.id = unk_id_; + } + } + // Move by one unicode character. + starts_at += mblen; + } + // Backtrack to identify the best path. 
+ EncodeResult results; + int ends_at = size; + while (ends_at > 0) { + const auto& node = best_path_ends_at[ends_at]; + results.emplace_back( + normalized.substr(node.starts_at, ends_at - node.starts_at), node.id); + ends_at = node.starts_at; + } + std::reverse(results.begin(), results.end()); + return results; + } + +public: + explicit T5UniGramTokenizer(bool is_umt5 = false) { + if (is_umt5) { + InitializePieces(load_umt5_tokenizer_json()); + } else { + InitializePieces(load_t5_tokenizer_json()); + } + + min_score_ = FLT_MAX; + max_score_ = FLT_MIN; + + std::vector> pieces; + for (int i = 0; i < piece_score_pairs.size(); i++) { + const auto& sp = piece_score_pairs[i]; + + min_score_ = std::min(min_score_, sp.second); + max_score_ = std::max(max_score_, sp.second); + + pieces.emplace_back(sp.first, i); + } + + BuildTrie(&pieces); + } + ~T5UniGramTokenizer() {}; + + std::string Normalize(const std::string& input) const { + // Ref: https://github.com/huggingface/tokenizers/blob/1ff56c0c70b045f0cd82da1af9ac08cd4c7a6f9f/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py#L29 + // TODO: nmt-nfkc + std::string normalized = std::regex_replace(input, std::regex(" {2,}"), " "); + return normalized; + } + + std::vector Encode(const std::string& input, bool append_eos_if_not_present = true) const { + std::string normalized = Normalize(input); + normalized = pre_tokenizer.tokenize(normalized); + EncodeResult result = EncodeOptimized(normalized); + if (result.size() > 0 && append_eos_if_not_present) { + auto item = result[result.size() - 1]; + if (item.first != eos_token_) { + result.emplace_back(eos_token_, eos_id_); + } + } + std::vector tokens; + for (auto item : result) { + tokens.push_back(item.second); + } + return tokens; + } + + void pad_tokens(std::vector& tokens, + std::vector& weights, + std::vector* attention_mask, + size_t max_length = 0, + bool padding = false) { + if (max_length > 0 && padding) { + size_t orig_token_num = tokens.size() - 
1; + size_t n = static_cast(std::ceil(orig_token_num * 1.0 / (max_length - 1))); + if (n == 0) { + n = 1; + } + size_t length = max_length * n; + LOG_DEBUG("token length: %llu", length); + std::vector new_tokens; + std::vector new_weights; + std::vector new_attention_mask; + int token_idx = 0; + for (int i = 0; i < length; i++) { + if (token_idx >= orig_token_num) { + break; + } + if (attention_mask != nullptr) { + new_attention_mask.push_back(0.0); + } + if (i % max_length == max_length - 1) { + new_tokens.push_back(eos_id_); + new_weights.push_back(1.0); + } else { + new_tokens.push_back(tokens[token_idx]); + new_weights.push_back(weights[token_idx]); + token_idx++; + } + } + + new_tokens.push_back(eos_id_); + new_weights.push_back(1.0); + if (attention_mask != nullptr) { + new_attention_mask.push_back(0.0); + } + + tokens = new_tokens; + weights = new_weights; + if (attention_mask != nullptr) { + *attention_mask = new_attention_mask; + } + + if (padding) { + int pad_token_id = pad_id_; + tokens.insert(tokens.end(), length - tokens.size(), pad_token_id); + weights.insert(weights.end(), length - weights.size(), 1.0); + if (attention_mask != nullptr) { + // maybe keep some padding tokens unmasked? + attention_mask->insert(attention_mask->end(), length - attention_mask->size(), -HUGE_VALF); + } + } + } + } + + // Returns the minimum score in sentence pieces. + // min_score() - 10 is used for the cost of unknown sentence. + float min_score() const { return min_score_; } + + // Returns the maximum score in sentence pieces. + // max_score() is used for the cost of user defined symbols. 
+ float max_score() const { return max_score_; } + + Status status() const { return status_; } +}; + +class T5LayerNorm : public UnaryBlock { +protected: + int64_t hidden_size; + float eps; + + void init_params(ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override { + enum ggml_type wtype = GGML_TYPE_F32; + params["weight"] = ggml_new_tensor_1d(ctx, wtype, hidden_size); + } + +public: + T5LayerNorm(int64_t hidden_size, + float eps = 1e-06f) + : hidden_size(hidden_size), + eps(eps) {} + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + ggml_tensor* w = params["weight"]; + x = ggml_rms_norm(ctx->ggml_ctx, x, eps); + x = ggml_mul(ctx->ggml_ctx, x, w); + return x; + } +}; + +struct T5DenseActDense : public UnaryBlock { +public: + T5DenseActDense(int64_t model_dim, int64_t ff_dim) { + blocks["wi"] = std::shared_ptr(new Linear(model_dim, ff_dim, false)); + blocks["wo"] = std::shared_ptr(new Linear(ff_dim, model_dim, false)); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + // x: [N, n_token, model_dim] + auto wi = std::dynamic_pointer_cast(blocks["wi"]); + auto wo = std::dynamic_pointer_cast(blocks["wo"]); + + x = wi->forward(ctx, x); + x = ggml_relu_inplace(ctx->ggml_ctx, x); + x = wo->forward(ctx, x); + return x; + } +}; + +struct T5DenseGatedActDense : public UnaryBlock { +public: + T5DenseGatedActDense(int64_t model_dim, int64_t ff_dim) { + blocks["wi_0"] = std::shared_ptr(new Linear(model_dim, ff_dim, false)); + blocks["wi_1"] = std::shared_ptr(new Linear(model_dim, ff_dim, false)); + float scale = 1.f / 32.f; + // The purpose of the scale here is to prevent NaN issues on some backends(CUDA, ...). 
+ blocks["wo"] = std::shared_ptr(new Linear(ff_dim, model_dim, false, false, false, scale)); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + // x: [N, n_token, model_dim] + auto wi_0 = std::dynamic_pointer_cast(blocks["wi_0"]); + auto wi_1 = std::dynamic_pointer_cast(blocks["wi_1"]); + auto wo = std::dynamic_pointer_cast(blocks["wo"]); + + auto hidden_gelu = ggml_ext_gelu(ctx->ggml_ctx, wi_0->forward(ctx, x), true); + auto hidden_linear = wi_1->forward(ctx, x); + x = ggml_mul_inplace(ctx->ggml_ctx, hidden_gelu, hidden_linear); + x = wo->forward(ctx, x); + return x; + } +}; + +struct T5LayerFF : public UnaryBlock { +public: + T5LayerFF(int64_t model_dim, int64_t ff_dim) { + blocks["DenseReluDense"] = std::shared_ptr(new T5DenseGatedActDense(model_dim, ff_dim)); + blocks["layer_norm"] = std::shared_ptr(new T5LayerNorm(model_dim)); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* x) override { + // x: [N, n_token, model_dim] + auto DenseReluDense = std::dynamic_pointer_cast(blocks["DenseReluDense"]); + auto layer_norm = std::dynamic_pointer_cast(blocks["layer_norm"]); + + auto forwarded_states = layer_norm->forward(ctx, x); + forwarded_states = DenseReluDense->forward(ctx, forwarded_states); + x = ggml_add_inplace(ctx->ggml_ctx, forwarded_states, x); + return x; + } +}; + +class T5Attention : public GGMLBlock { +protected: + int64_t model_dim; + int64_t inner_dim; + int64_t num_heads; + bool using_relative_attention_bias; + int64_t relative_attention_num_buckets = 32; + int64_t relative_attention_max_distance = 128; + +public: + T5Attention(int64_t model_dim, + int64_t inner_dim, + int64_t num_heads, + bool using_relative_attention_bias = false) + : model_dim(model_dim), + inner_dim(inner_dim), + num_heads(num_heads), + using_relative_attention_bias(using_relative_attention_bias) { + blocks["q"] = std::shared_ptr(new Linear(model_dim, inner_dim, false)); + blocks["k"] = std::shared_ptr(new Linear(model_dim, inner_dim, 
false)); + blocks["v"] = std::shared_ptr(new Linear(model_dim, inner_dim, false)); + blocks["o"] = std::shared_ptr(new Linear(inner_dim, model_dim, false)); + if (using_relative_attention_bias) { + blocks["relative_attention_bias"] = std::shared_ptr(new Embedding(relative_attention_num_buckets, num_heads)); + } + } + + ggml_tensor* compute_bias(GGMLRunnerContext* ctx, + ggml_tensor* relative_position_bucket) { + auto relative_attention_bias = std::dynamic_pointer_cast(blocks["relative_attention_bias"]); + + auto values = relative_attention_bias->forward(ctx, relative_position_bucket); // shape (query_length, key_length, num_heads) + values = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, values, 2, 0, 1, 3)); // shape (1, num_heads, query_length, key_length) + return values; + } + + // x: [N, n_token, model_dim] + std::pair forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* past_bias = nullptr, + ggml_tensor* mask = nullptr, + ggml_tensor* relative_position_bucket = nullptr) { + auto q_proj = std::dynamic_pointer_cast(blocks["q"]); + auto k_proj = std::dynamic_pointer_cast(blocks["k"]); + auto v_proj = std::dynamic_pointer_cast(blocks["v"]); + auto out_proj = std::dynamic_pointer_cast(blocks["o"]); + + int64_t n_head = num_heads; + int64_t d_head = inner_dim / n_head; + + auto q = q_proj->forward(ctx, x); + auto k = k_proj->forward(ctx, x); + auto v = v_proj->forward(ctx, x); + + if (using_relative_attention_bias && relative_position_bucket != nullptr) { + past_bias = compute_bias(ctx, relative_position_bucket); + } + if (past_bias != nullptr) { + if (mask != nullptr) { + mask = ggml_repeat(ctx->ggml_ctx, mask, past_bias); + mask = ggml_add(ctx->ggml_ctx, mask, past_bias); + } else { + mask = past_bias; + } + } + + k = ggml_ext_scale(ctx->ggml_ctx, k, ::sqrtf(static_cast(d_head)), true); + + x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, num_heads, mask); // [N, n_token, d_head * n_head] + + x = out_proj->forward(ctx, x); // 
[N, n_token, model_dim] + return {x, past_bias}; + } +}; + +struct T5LayerSelfAttention : public GGMLBlock { +public: + T5LayerSelfAttention(int64_t model_dim, + int64_t inner_dim, + int64_t ff_dim, + int64_t num_heads, + bool using_relative_attention_bias) { + blocks["SelfAttention"] = std::shared_ptr(new T5Attention(model_dim, inner_dim, num_heads, using_relative_attention_bias)); + blocks["layer_norm"] = std::shared_ptr(new T5LayerNorm(model_dim)); + } + + std::pair forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* past_bias = nullptr, + ggml_tensor* mask = nullptr, + ggml_tensor* relative_position_bucket = nullptr) { + // x: [N, n_token, model_dim] + auto SelfAttention = std::dynamic_pointer_cast(blocks["SelfAttention"]); + auto layer_norm = std::dynamic_pointer_cast(blocks["layer_norm"]); + + auto normed_hidden_state = layer_norm->forward(ctx, x); + auto ret = SelfAttention->forward(ctx, normed_hidden_state, past_bias, mask, relative_position_bucket); + auto output = ret.first; + past_bias = ret.second; + + x = ggml_add_inplace(ctx->ggml_ctx, output, x); + return {x, past_bias}; + } +}; + +struct T5Block : public GGMLBlock { +public: + T5Block(int64_t model_dim, int64_t inner_dim, int64_t ff_dim, int64_t num_heads, bool using_relative_attention_bias) { + blocks["layer.0"] = std::shared_ptr(new T5LayerSelfAttention(model_dim, inner_dim, ff_dim, num_heads, using_relative_attention_bias)); + blocks["layer.1"] = std::shared_ptr(new T5LayerFF(model_dim, ff_dim)); + } + + std::pair forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* past_bias = nullptr, + ggml_tensor* mask = nullptr, + ggml_tensor* relative_position_bucket = nullptr) { + // x: [N, n_token, model_dim] + auto layer_0 = std::dynamic_pointer_cast(blocks["layer.0"]); + auto layer_1 = std::dynamic_pointer_cast(blocks["layer.1"]); + + auto ret = layer_0->forward(ctx, x, past_bias, mask, relative_position_bucket); + x = ret.first; + past_bias = ret.second; + x = 
layer_1->forward(ctx, x); + return {x, past_bias}; + } +}; + +struct T5Stack : public GGMLBlock { + int64_t num_layers; + +public: + T5Stack(int64_t num_layers, + int64_t model_dim, + int64_t inner_dim, + int64_t ff_dim, + int64_t num_heads, + bool relative_attention = true) + : num_layers(num_layers) { + for (int i = 0; i < num_layers; i++) { + blocks["block." + std::to_string(i)] = std::shared_ptr(new T5Block(model_dim, inner_dim, ff_dim, num_heads, (!relative_attention || i == 0))); + } + + blocks["final_layer_norm"] = std::shared_ptr(new T5LayerNorm(model_dim)); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* x, + ggml_tensor* past_bias = nullptr, + ggml_tensor* attention_mask = nullptr, + ggml_tensor* relative_position_bucket = nullptr) { + // x: [N, n_token, model_dim] + for (int i = 0; i < num_layers; i++) { + auto block = std::dynamic_pointer_cast(blocks["block." + std::to_string(i)]); + + auto ret = block->forward(ctx, x, past_bias, attention_mask, relative_position_bucket); + x = ret.first; + past_bias = ret.second; + } + + auto final_layer_norm = std::dynamic_pointer_cast(blocks["final_layer_norm"]); + + x = final_layer_norm->forward(ctx, x); + return x; + } +}; + +struct T5Params { + int64_t num_layers = 24; + int64_t model_dim = 4096; + int64_t ff_dim = 10240; + int64_t num_heads = 64; + int64_t vocab_size = 32128; + bool relative_attention = true; +}; + +struct T5 : public GGMLBlock { + T5Params params; + +public: + T5() {} + T5(T5Params params) + : params(params) { + blocks["encoder"] = std::shared_ptr(new T5Stack(params.num_layers, + params.model_dim, + params.model_dim, + params.ff_dim, + params.num_heads, + params.relative_attention)); + blocks["shared"] = std::shared_ptr(new Embedding(params.vocab_size, + params.model_dim)); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* input_ids, + ggml_tensor* past_bias = nullptr, + ggml_tensor* attention_mask = nullptr, + ggml_tensor* relative_position_bucket = 
nullptr) { + // input_ids: [N, n_token] + + auto shared = std::dynamic_pointer_cast(blocks["shared"]); + auto encoder = std::dynamic_pointer_cast(blocks["encoder"]); + + auto x = shared->forward(ctx, input_ids); + x = encoder->forward(ctx, x, past_bias, attention_mask, relative_position_bucket); + return x; + } +}; + +struct T5Runner : public GGMLRunner { + T5Params params; + T5 model; + std::vector relative_position_bucket_vec; + + T5Runner(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2TensorStorage& tensor_storage_map, + const std::string prefix, + bool is_umt5 = false) + : GGMLRunner(backend, offload_params_to_cpu) { + if (is_umt5) { + params.vocab_size = 256384; + params.relative_attention = false; + } + model = T5(params); + model.init(params_ctx, tensor_storage_map, prefix); + } + + std::string get_desc() override { + return "t5"; + } + + void get_param_tensors(std::map& tensors, const std::string prefix) { + model.get_param_tensors(tensors, prefix); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* input_ids, + ggml_tensor* relative_position_bucket, + ggml_tensor* attention_mask = nullptr) { + size_t N = input_ids->ne[1]; + size_t n_token = input_ids->ne[0]; + + auto hidden_states = model.forward(ctx, input_ids, nullptr, attention_mask, relative_position_bucket); // [N, n_token, model_dim] + return hidden_states; + } + + ggml_cgraph* build_graph(const sd::Tensor& input_ids_tensor, + const sd::Tensor& attention_mask_tensor = {}) { + ggml_cgraph* gf = ggml_new_graph(compute_ctx); + ggml_tensor* input_ids = make_input(input_ids_tensor); + ggml_tensor* attention_mask = attention_mask_tensor.empty() ? 
nullptr : make_input(attention_mask_tensor); + + relative_position_bucket_vec = compute_relative_position_bucket(static_cast(input_ids->ne[0]), static_cast(input_ids->ne[0])); + + // for (int i = 0; i < relative_position_bucket_vec.size(); i++) { + // if (i % 77 == 0) { + // printf("\n"); + // } + // printf("%d ", relative_position_bucket_vec[i]); + // } + + auto relative_position_bucket = ggml_new_tensor_2d(compute_ctx, + GGML_TYPE_I32, + input_ids->ne[0], + input_ids->ne[0]); + set_backend_tensor_data(relative_position_bucket, relative_position_bucket_vec.data()); + + auto runner_ctx = get_context(); + ggml_tensor* hidden_states = forward(&runner_ctx, input_ids, relative_position_bucket, attention_mask); + + ggml_build_forward_expand(gf, hidden_states); + + return gf; + } + + sd::Tensor compute(const int n_threads, + const sd::Tensor& input_ids, + const sd::Tensor& attention_mask) { + auto get_graph = [&]() -> ggml_cgraph* { + return build_graph(input_ids, attention_mask); + }; + return restore_trailing_singleton_dims(GGMLRunner::compute(get_graph, n_threads, true), 3); + } + + static std::vector _relative_position_bucket(const std::vector& relative_position, + bool bidirectional = true, + int num_buckets = 32, + int max_distance = 128) { + std::vector relative_buckets(relative_position.size(), 0); + std::vector abs_relative_position = relative_position; + + if (bidirectional) { + num_buckets = num_buckets / 2; + for (size_t i = 0; i < relative_position.size(); ++i) { + if (relative_position[i] > 0) { + relative_buckets[i] += num_buckets; + } + abs_relative_position[i] = std::abs(relative_position[i]); + } + } else { + for (size_t i = 0; i < relative_position.size(); ++i) { + abs_relative_position[i] = std::max(-relative_position[i], 0); + } + } + + int max_exact = num_buckets / 2; + std::vector relative_position_if_large(relative_position.size(), 0); + + for (size_t i = 0; i < relative_position.size(); ++i) { + if (abs_relative_position[i] < max_exact) { + 
relative_buckets[i] += abs_relative_position[i]; + } else { + float log_pos = std::log(static_cast(abs_relative_position[i]) / max_exact); + float log_base = std::log(static_cast(max_distance) / max_exact); + relative_position_if_large[i] = max_exact + static_cast((log_pos / log_base) * (num_buckets - max_exact)); + relative_position_if_large[i] = std::min(relative_position_if_large[i], num_buckets - 1); + relative_buckets[i] += relative_position_if_large[i]; + } + } + + return relative_buckets; + } + + std::vector compute_relative_position_bucket(int query_length, + int key_length) { + std::vector context_position(query_length); + std::vector memory_position(key_length); + + for (int i = 0; i < query_length; ++i) { + context_position[i] = i; + } + for (int i = 0; i < key_length; ++i) { + memory_position[i] = i; + } + + std::vector> relative_position(query_length, std::vector(key_length, 0)); + for (int i = 0; i < query_length; ++i) { + for (int j = 0; j < key_length; ++j) { + relative_position[i][j] = memory_position[j] - context_position[i]; + } + } + + std::vector relative_position_bucket; + for (int i = 0; i < query_length; ++i) { + std::vector result = _relative_position_bucket(relative_position[i], true); + relative_position_bucket.insert(relative_position_bucket.end(), result.begin(), result.end()); + } + + return relative_position_bucket; + } +}; + +struct T5Embedder { + T5UniGramTokenizer tokenizer; + T5Runner model; + + T5Embedder(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "", + bool is_umt5 = false) + : model(backend, offload_params_to_cpu, tensor_storage_map, prefix, is_umt5), tokenizer(is_umt5) { + } + + void get_param_tensors(std::map& tensors, const std::string prefix) { + model.get_param_tensors(tensors, prefix); + } + + void alloc_params_buffer() { + model.alloc_params_buffer(); + } + + std::tuple, std::vector, std::vector> tokenize(std::string text, + 
size_t max_length = 0, + bool padding = false) { + auto parsed_attention = parse_prompt_attention(text); + + { + std::stringstream ss; + ss << "["; + for (const auto& item : parsed_attention) { + ss << "['" << item.first << "', " << item.second << "], "; + } + ss << "]"; + LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str()); + } + + std::vector tokens; + std::vector weights; + for (const auto& item : parsed_attention) { + const std::string& curr_text = item.first; + float curr_weight = item.second; + std::vector curr_tokens = tokenizer.Encode(curr_text, false); + tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end()); + weights.insert(weights.end(), curr_tokens.size(), curr_weight); + } + + int EOS_TOKEN_ID = 1; + tokens.push_back(EOS_TOKEN_ID); + weights.push_back(1.0); + + std::vector attention_mask; + + tokenizer.pad_tokens(tokens, weights, &attention_mask, max_length, padding); + + // for (int i = 0; i < tokens.size(); i++) { + // std::cout << tokens[i] << ":" << weights[i] << ", "; + // } + // std::cout << std::endl; + + return {tokens, weights, attention_mask}; + } + + void test() { + ggml_init_params params; + params.mem_size = static_cast(10 * 1024 * 1024); // 10 MB + params.mem_buffer = nullptr; + params.no_alloc = false; + + ggml_context* ctx = ggml_init(params); + GGML_ASSERT(ctx != nullptr); + + { + std::string text("a lovely cat"); + auto tokens_and_weights = tokenize(text, 512, true); + std::vector& tokens = std::get<0>(tokens_and_weights); + std::vector& weights = std::get<1>(tokens_and_weights); + std::vector& masks = std::get<2>(tokens_and_weights); + for (auto token : tokens) { + printf("%d ", token); + } + printf("\n"); + auto input_ids = sd::Tensor::from_vector(tokens); + auto attention_mask = sd::Tensor::from_vector(masks); + sd::Tensor out; + + int64_t t0 = ggml_time_ms(); + auto out_opt = model.compute(8, input_ids, attention_mask); + int64_t t1 = ggml_time_ms(); + + GGML_ASSERT(!out_opt.empty()); + out = 
std::move(out_opt); + print_sd_tensor(out); + LOG_DEBUG("t5 test done in %lldms", t1 - t0); + } + } + + static void load_from_file_and_test(const std::string& file_path) { + // cpu f16: pass + // cpu f32: pass + // cuda f16: pass + // cuda f32: pass + // cuda q8_0: pass + // ggml_backend_t backend = ggml_backend_cuda_init(0); + ggml_backend_t backend = ggml_backend_cpu_init(); + ggml_type model_data_type = GGML_TYPE_F16; + + ModelLoader model_loader; + if (!model_loader.init_from_file_and_convert_name(file_path)) { + LOG_ERROR("init model loader from file failed: '%s'", file_path.c_str()); + return; + } + + auto& tensor_storage_map = model_loader.get_tensor_storage_map(); + for (auto& [name, tensor_storage] : tensor_storage_map) { + if (ends_with(name, "weight")) { + tensor_storage.expected_type = model_data_type; + } + } + + std::shared_ptr t5 = std::make_shared(backend, false, tensor_storage_map, "", true); + + t5->alloc_params_buffer(); + std::map tensors; + t5->get_param_tensors(tensors, ""); + + bool success = model_loader.load_tensors(tensors); + + if (!success) { + LOG_ERROR("load tensors from model loader failed"); + return; + } + + LOG_INFO("t5 model loaded"); + t5->test(); + } +}; + +#endif // __T5_HPP__ diff --git a/src/tokenize_util.cpp b/src/tokenize_util.cpp index 33fdad266..be60f8f46 100644 --- a/src/tokenize_util.cpp +++ b/src/tokenize_util.cpp @@ -1,993 +1,995 @@ -#include -#include -#include -#include - -#include "tokenize_util.h" - -bool is_number(char32_t ch) { - return (ch >= U'0' && ch <= U'9'); -} - -bool is_letter(char32_t ch) { - static const struct { char32_t start, end; } ranges[] = { - {0x41, 0x5A}, - {0x61, 0x7A}, - {0xAA, 0xAA}, - {0xB5, 0xB5}, - {0xBA, 0xBA}, - {0xC0, 0xD6}, - {0xD8, 0xF6}, - {0xF8, 0x2C1}, - {0x2C6, 0x2D1}, - {0x2E0, 0x2E4}, - {0x2EC, 0x2EC}, - {0x2EE, 0x2EE}, - {0x370, 0x374}, - {0x376, 0x377}, - {0x37A, 0x37D}, - {0x37F, 0x37F}, - {0x386, 0x386}, - {0x388, 0x38A}, - {0x38C, 0x38C}, - {0x38E, 0x3A1}, - {0x3A3, 
0x3F5}, - {0x3F7, 0x481}, - {0x48A, 0x52F}, - {0x531, 0x556}, - {0x559, 0x559}, - {0x560, 0x588}, - {0x5D0, 0x5EA}, - {0x5EF, 0x5F2}, - {0x620, 0x64A}, - {0x66E, 0x66F}, - {0x671, 0x6D3}, - {0x6D5, 0x6D5}, - {0x6E5, 0x6E6}, - {0x6EE, 0x6EF}, - {0x6FA, 0x6FC}, - {0x6FF, 0x6FF}, - {0x710, 0x710}, - {0x712, 0x72F}, - {0x74D, 0x7A5}, - {0x7B1, 0x7B1}, - {0x7CA, 0x7EA}, - {0x7F4, 0x7F5}, - {0x7FA, 0x7FA}, - {0x800, 0x815}, - {0x81A, 0x81A}, - {0x824, 0x824}, - {0x828, 0x828}, - {0x840, 0x858}, - {0x860, 0x86A}, - {0x870, 0x887}, - {0x889, 0x88F}, - {0x8A0, 0x8C9}, - {0x904, 0x939}, - {0x93D, 0x93D}, - {0x950, 0x950}, - {0x958, 0x961}, - {0x971, 0x980}, - {0x985, 0x98C}, - {0x98F, 0x990}, - {0x993, 0x9A8}, - {0x9AA, 0x9B0}, - {0x9B2, 0x9B2}, - {0x9B6, 0x9B9}, - {0x9BD, 0x9BD}, - {0x9CE, 0x9CE}, - {0x9DC, 0x9DD}, - {0x9DF, 0x9E1}, - {0x9F0, 0x9F1}, - {0x9FC, 0x9FC}, - {0xA05, 0xA0A}, - {0xA0F, 0xA10}, - {0xA13, 0xA28}, - {0xA2A, 0xA30}, - {0xA32, 0xA33}, - {0xA35, 0xA36}, - {0xA38, 0xA39}, - {0xA59, 0xA5C}, - {0xA5E, 0xA5E}, - {0xA72, 0xA74}, - {0xA85, 0xA8D}, - {0xA8F, 0xA91}, - {0xA93, 0xAA8}, - {0xAAA, 0xAB0}, - {0xAB2, 0xAB3}, - {0xAB5, 0xAB9}, - {0xABD, 0xABD}, - {0xAD0, 0xAD0}, - {0xAE0, 0xAE1}, - {0xAF9, 0xAF9}, - {0xB05, 0xB0C}, - {0xB0F, 0xB10}, - {0xB13, 0xB28}, - {0xB2A, 0xB30}, - {0xB32, 0xB33}, - {0xB35, 0xB39}, - {0xB3D, 0xB3D}, - {0xB5C, 0xB5D}, - {0xB5F, 0xB61}, - {0xB71, 0xB71}, - {0xB83, 0xB83}, - {0xB85, 0xB8A}, - {0xB8E, 0xB90}, - {0xB92, 0xB95}, - {0xB99, 0xB9A}, - {0xB9C, 0xB9C}, - {0xB9E, 0xB9F}, - {0xBA3, 0xBA4}, - {0xBA8, 0xBAA}, - {0xBAE, 0xBB9}, - {0xBD0, 0xBD0}, - {0xC05, 0xC0C}, - {0xC0E, 0xC10}, - {0xC12, 0xC28}, - {0xC2A, 0xC39}, - {0xC3D, 0xC3D}, - {0xC58, 0xC5A}, - {0xC5C, 0xC5D}, - {0xC60, 0xC61}, - {0xC80, 0xC80}, - {0xC85, 0xC8C}, - {0xC8E, 0xC90}, - {0xC92, 0xCA8}, - {0xCAA, 0xCB3}, - {0xCB5, 0xCB9}, - {0xCBD, 0xCBD}, - {0xCDC, 0xCDE}, - {0xCE0, 0xCE1}, - {0xCF1, 0xCF2}, - {0xD04, 0xD0C}, - {0xD0E, 0xD10}, - {0xD12, 0xD3A}, - {0xD3D, 
0xD3D}, - {0xD4E, 0xD4E}, - {0xD54, 0xD56}, - {0xD5F, 0xD61}, - {0xD7A, 0xD7F}, - {0xD85, 0xD96}, - {0xD9A, 0xDB1}, - {0xDB3, 0xDBB}, - {0xDBD, 0xDBD}, - {0xDC0, 0xDC6}, - {0xE01, 0xE30}, - {0xE32, 0xE33}, - {0xE40, 0xE46}, - {0xE81, 0xE82}, - {0xE84, 0xE84}, - {0xE86, 0xE8A}, - {0xE8C, 0xEA3}, - {0xEA5, 0xEA5}, - {0xEA7, 0xEB0}, - {0xEB2, 0xEB3}, - {0xEBD, 0xEBD}, - {0xEC0, 0xEC4}, - {0xEC6, 0xEC6}, - {0xEDC, 0xEDF}, - {0xF00, 0xF00}, - {0xF40, 0xF47}, - {0xF49, 0xF6C}, - {0xF88, 0xF8C}, - {0x1000, 0x102A}, - {0x103F, 0x103F}, - {0x1050, 0x1055}, - {0x105A, 0x105D}, - {0x1061, 0x1061}, - {0x1065, 0x1066}, - {0x106E, 0x1070}, - {0x1075, 0x1081}, - {0x108E, 0x108E}, - {0x10A0, 0x10C5}, - {0x10C7, 0x10C7}, - {0x10CD, 0x10CD}, - {0x10D0, 0x10FA}, - {0x10FC, 0x1248}, - {0x124A, 0x124D}, - {0x1250, 0x1256}, - {0x1258, 0x1258}, - {0x125A, 0x125D}, - {0x1260, 0x1288}, - {0x128A, 0x128D}, - {0x1290, 0x12B0}, - {0x12B2, 0x12B5}, - {0x12B8, 0x12BE}, - {0x12C0, 0x12C0}, - {0x12C2, 0x12C5}, - {0x12C8, 0x12D6}, - {0x12D8, 0x1310}, - {0x1312, 0x1315}, - {0x1318, 0x135A}, - {0x1380, 0x138F}, - {0x13A0, 0x13F5}, - {0x13F8, 0x13FD}, - {0x1401, 0x166C}, - {0x166F, 0x167F}, - {0x1681, 0x169A}, - {0x16A0, 0x16EA}, - {0x16F1, 0x16F8}, - {0x1700, 0x1711}, - {0x171F, 0x1731}, - {0x1740, 0x1751}, - {0x1760, 0x176C}, - {0x176E, 0x1770}, - {0x1780, 0x17B3}, - {0x17D7, 0x17D7}, - {0x17DC, 0x17DC}, - {0x1820, 0x1878}, - {0x1880, 0x1884}, - {0x1887, 0x18A8}, - {0x18AA, 0x18AA}, - {0x18B0, 0x18F5}, - {0x1900, 0x191E}, - {0x1950, 0x196D}, - {0x1970, 0x1974}, - {0x1980, 0x19AB}, - {0x19B0, 0x19C9}, - {0x1A00, 0x1A16}, - {0x1A20, 0x1A54}, - {0x1AA7, 0x1AA7}, - {0x1B05, 0x1B33}, - {0x1B45, 0x1B4C}, - {0x1B83, 0x1BA0}, - {0x1BAE, 0x1BAF}, - {0x1BBA, 0x1BE5}, - {0x1C00, 0x1C23}, - {0x1C4D, 0x1C4F}, - {0x1C5A, 0x1C7D}, - {0x1C80, 0x1C8A}, - {0x1C90, 0x1CBA}, - {0x1CBD, 0x1CBF}, - {0x1CE9, 0x1CEC}, - {0x1CEE, 0x1CF3}, - {0x1CF5, 0x1CF6}, - {0x1CFA, 0x1CFA}, - {0x1D00, 0x1DBF}, - {0x1E00, 0x1F15}, - 
{0x1F18, 0x1F1D}, - {0x1F20, 0x1F45}, - {0x1F48, 0x1F4D}, - {0x1F50, 0x1F57}, - {0x1F59, 0x1F59}, - {0x1F5B, 0x1F5B}, - {0x1F5D, 0x1F5D}, - {0x1F5F, 0x1F7D}, - {0x1F80, 0x1FB4}, - {0x1FB6, 0x1FBC}, - {0x1FBE, 0x1FBE}, - {0x1FC2, 0x1FC4}, - {0x1FC6, 0x1FCC}, - {0x1FD0, 0x1FD3}, - {0x1FD6, 0x1FDB}, - {0x1FE0, 0x1FEC}, - {0x1FF2, 0x1FF4}, - {0x1FF6, 0x1FFC}, - {0x2071, 0x2071}, - {0x207F, 0x207F}, - {0x2090, 0x209C}, - {0x2102, 0x2102}, - {0x2107, 0x2107}, - {0x210A, 0x2113}, - {0x2115, 0x2115}, - {0x2119, 0x211D}, - {0x2124, 0x2124}, - {0x2126, 0x2126}, - {0x2128, 0x2128}, - {0x212A, 0x212D}, - {0x212F, 0x2139}, - {0x213C, 0x213F}, - {0x2145, 0x2149}, - {0x214E, 0x214E}, - {0x2183, 0x2184}, - {0x2C00, 0x2CE4}, - {0x2CEB, 0x2CEE}, - {0x2CF2, 0x2CF3}, - {0x2D00, 0x2D25}, - {0x2D27, 0x2D27}, - {0x2D2D, 0x2D2D}, - {0x2D30, 0x2D67}, - {0x2D6F, 0x2D6F}, - {0x2D80, 0x2D96}, - {0x2DA0, 0x2DA6}, - {0x2DA8, 0x2DAE}, - {0x2DB0, 0x2DB6}, - {0x2DB8, 0x2DBE}, - {0x2DC0, 0x2DC6}, - {0x2DC8, 0x2DCE}, - {0x2DD0, 0x2DD6}, - {0x2DD8, 0x2DDE}, - {0x2E2F, 0x2E2F}, - {0x3005, 0x3006}, - {0x3031, 0x3035}, - {0x303B, 0x303C}, - {0x3041, 0x3096}, - {0x309D, 0x309F}, - {0x30A1, 0x30FA}, - {0x30FC, 0x30FF}, - {0x3105, 0x312F}, - {0x3131, 0x318E}, - {0x31A0, 0x31BF}, - {0x31F0, 0x31FF}, - {0x3400, 0x4DBF}, - {0x4E00, 0xA48C}, - {0xA4D0, 0xA4FD}, - {0xA500, 0xA60C}, - {0xA610, 0xA61F}, - {0xA62A, 0xA62B}, - {0xA640, 0xA66E}, - {0xA67F, 0xA69D}, - {0xA6A0, 0xA6E5}, - {0xA717, 0xA71F}, - {0xA722, 0xA788}, - {0xA78B, 0xA7DC}, - {0xA7F1, 0xA801}, - {0xA803, 0xA805}, - {0xA807, 0xA80A}, - {0xA80C, 0xA822}, - {0xA840, 0xA873}, - {0xA882, 0xA8B3}, - {0xA8F2, 0xA8F7}, - {0xA8FB, 0xA8FB}, - {0xA8FD, 0xA8FE}, - {0xA90A, 0xA925}, - {0xA930, 0xA946}, - {0xA960, 0xA97C}, - {0xA984, 0xA9B2}, - {0xA9CF, 0xA9CF}, - {0xA9E0, 0xA9E4}, - {0xA9E6, 0xA9EF}, - {0xA9FA, 0xA9FE}, - {0xAA00, 0xAA28}, - {0xAA40, 0xAA42}, - {0xAA44, 0xAA4B}, - {0xAA60, 0xAA76}, - {0xAA7A, 0xAA7A}, - {0xAA7E, 0xAAAF}, - {0xAAB1, 0xAAB1}, - 
{0xAAB5, 0xAAB6}, - {0xAAB9, 0xAABD}, - {0xAAC0, 0xAAC0}, - {0xAAC2, 0xAAC2}, - {0xAADB, 0xAADD}, - {0xAAE0, 0xAAEA}, - {0xAAF2, 0xAAF4}, - {0xAB01, 0xAB06}, - {0xAB09, 0xAB0E}, - {0xAB11, 0xAB16}, - {0xAB20, 0xAB26}, - {0xAB28, 0xAB2E}, - {0xAB30, 0xAB5A}, - {0xAB5C, 0xAB69}, - {0xAB70, 0xABE2}, - {0xAC00, 0xD7A3}, - {0xD7B0, 0xD7C6}, - {0xD7CB, 0xD7FB}, - {0xF900, 0xFA6D}, - {0xFA70, 0xFAD9}, - {0xFB00, 0xFB06}, - {0xFB13, 0xFB17}, - {0xFB1D, 0xFB1D}, - {0xFB1F, 0xFB28}, - {0xFB2A, 0xFB36}, - {0xFB38, 0xFB3C}, - {0xFB3E, 0xFB3E}, - {0xFB40, 0xFB41}, - {0xFB43, 0xFB44}, - {0xFB46, 0xFBB1}, - {0xFBD3, 0xFD3D}, - {0xFD50, 0xFD8F}, - {0xFD92, 0xFDC7}, - {0xFDF0, 0xFDFB}, - {0xFE70, 0xFE74}, - {0xFE76, 0xFEFC}, - {0xFF21, 0xFF3A}, - {0xFF41, 0xFF5A}, - {0xFF66, 0xFFBE}, - {0xFFC2, 0xFFC7}, - {0xFFCA, 0xFFCF}, - {0xFFD2, 0xFFD7}, - {0xFFDA, 0xFFDC}, - {0x10000, 0x1000B}, - {0x1000D, 0x10026}, - {0x10028, 0x1003A}, - {0x1003C, 0x1003D}, - {0x1003F, 0x1004D}, - {0x10050, 0x1005D}, - {0x10080, 0x100FA}, - {0x10280, 0x1029C}, - {0x102A0, 0x102D0}, - {0x10300, 0x1031F}, - {0x1032D, 0x10340}, - {0x10342, 0x10349}, - {0x10350, 0x10375}, - {0x10380, 0x1039D}, - {0x103A0, 0x103C3}, - {0x103C8, 0x103CF}, - {0x10400, 0x1049D}, - {0x104B0, 0x104D3}, - {0x104D8, 0x104FB}, - {0x10500, 0x10527}, - {0x10530, 0x10563}, - {0x10570, 0x1057A}, - {0x1057C, 0x1058A}, - {0x1058C, 0x10592}, - {0x10594, 0x10595}, - {0x10597, 0x105A1}, - {0x105A3, 0x105B1}, - {0x105B3, 0x105B9}, - {0x105BB, 0x105BC}, - {0x105C0, 0x105F3}, - {0x10600, 0x10736}, - {0x10740, 0x10755}, - {0x10760, 0x10767}, - {0x10780, 0x10785}, - {0x10787, 0x107B0}, - {0x107B2, 0x107BA}, - {0x10800, 0x10805}, - {0x10808, 0x10808}, - {0x1080A, 0x10835}, - {0x10837, 0x10838}, - {0x1083C, 0x1083C}, - {0x1083F, 0x10855}, - {0x10860, 0x10876}, - {0x10880, 0x1089E}, - {0x108E0, 0x108F2}, - {0x108F4, 0x108F5}, - {0x10900, 0x10915}, - {0x10920, 0x10939}, - {0x10940, 0x10959}, - {0x10980, 0x109B7}, - {0x109BE, 0x109BF}, - {0x10A00, 
0x10A00}, - {0x10A10, 0x10A13}, - {0x10A15, 0x10A17}, - {0x10A19, 0x10A35}, - {0x10A60, 0x10A7C}, - {0x10A80, 0x10A9C}, - {0x10AC0, 0x10AC7}, - {0x10AC9, 0x10AE4}, - {0x10B00, 0x10B35}, - {0x10B40, 0x10B55}, - {0x10B60, 0x10B72}, - {0x10B80, 0x10B91}, - {0x10C00, 0x10C48}, - {0x10C80, 0x10CB2}, - {0x10CC0, 0x10CF2}, - {0x10D00, 0x10D23}, - {0x10D4A, 0x10D65}, - {0x10D6F, 0x10D85}, - {0x10E80, 0x10EA9}, - {0x10EB0, 0x10EB1}, - {0x10EC2, 0x10EC7}, - {0x10F00, 0x10F1C}, - {0x10F27, 0x10F27}, - {0x10F30, 0x10F45}, - {0x10F70, 0x10F81}, - {0x10FB0, 0x10FC4}, - {0x10FE0, 0x10FF6}, - {0x11003, 0x11037}, - {0x11071, 0x11072}, - {0x11075, 0x11075}, - {0x11083, 0x110AF}, - {0x110D0, 0x110E8}, - {0x11103, 0x11126}, - {0x11144, 0x11144}, - {0x11147, 0x11147}, - {0x11150, 0x11172}, - {0x11176, 0x11176}, - {0x11183, 0x111B2}, - {0x111C1, 0x111C4}, - {0x111DA, 0x111DA}, - {0x111DC, 0x111DC}, - {0x11200, 0x11211}, - {0x11213, 0x1122B}, - {0x1123F, 0x11240}, - {0x11280, 0x11286}, - {0x11288, 0x11288}, - {0x1128A, 0x1128D}, - {0x1128F, 0x1129D}, - {0x1129F, 0x112A8}, - {0x112B0, 0x112DE}, - {0x11305, 0x1130C}, - {0x1130F, 0x11310}, - {0x11313, 0x11328}, - {0x1132A, 0x11330}, - {0x11332, 0x11333}, - {0x11335, 0x11339}, - {0x1133D, 0x1133D}, - {0x11350, 0x11350}, - {0x1135D, 0x11361}, - {0x11380, 0x11389}, - {0x1138B, 0x1138B}, - {0x1138E, 0x1138E}, - {0x11390, 0x113B5}, - {0x113B7, 0x113B7}, - {0x113D1, 0x113D1}, - {0x113D3, 0x113D3}, - {0x11400, 0x11434}, - {0x11447, 0x1144A}, - {0x1145F, 0x11461}, - {0x11480, 0x114AF}, - {0x114C4, 0x114C5}, - {0x114C7, 0x114C7}, - {0x11580, 0x115AE}, - {0x115D8, 0x115DB}, - {0x11600, 0x1162F}, - {0x11644, 0x11644}, - {0x11680, 0x116AA}, - {0x116B8, 0x116B8}, - {0x11700, 0x1171A}, - {0x11740, 0x11746}, - {0x11800, 0x1182B}, - {0x118A0, 0x118DF}, - {0x118FF, 0x11906}, - {0x11909, 0x11909}, - {0x1190C, 0x11913}, - {0x11915, 0x11916}, - {0x11918, 0x1192F}, - {0x1193F, 0x1193F}, - {0x11941, 0x11941}, - {0x119A0, 0x119A7}, - {0x119AA, 0x119D0}, - 
{0x119E1, 0x119E1}, - {0x119E3, 0x119E3}, - {0x11A00, 0x11A00}, - {0x11A0B, 0x11A32}, - {0x11A3A, 0x11A3A}, - {0x11A50, 0x11A50}, - {0x11A5C, 0x11A89}, - {0x11A9D, 0x11A9D}, - {0x11AB0, 0x11AF8}, - {0x11BC0, 0x11BE0}, - {0x11C00, 0x11C08}, - {0x11C0A, 0x11C2E}, - {0x11C40, 0x11C40}, - {0x11C72, 0x11C8F}, - {0x11D00, 0x11D06}, - {0x11D08, 0x11D09}, - {0x11D0B, 0x11D30}, - {0x11D46, 0x11D46}, - {0x11D60, 0x11D65}, - {0x11D67, 0x11D68}, - {0x11D6A, 0x11D89}, - {0x11D98, 0x11D98}, - {0x11DB0, 0x11DDB}, - {0x11EE0, 0x11EF2}, - {0x11F02, 0x11F02}, - {0x11F04, 0x11F10}, - {0x11F12, 0x11F33}, - {0x11FB0, 0x11FB0}, - {0x12000, 0x12399}, - {0x12480, 0x12543}, - {0x12F90, 0x12FF0}, - {0x13000, 0x1342F}, - {0x13441, 0x13446}, - {0x13460, 0x143FA}, - {0x14400, 0x14646}, - {0x16100, 0x1611D}, - {0x16800, 0x16A38}, - {0x16A40, 0x16A5E}, - {0x16A70, 0x16ABE}, - {0x16AD0, 0x16AED}, - {0x16B00, 0x16B2F}, - {0x16B40, 0x16B43}, - {0x16B63, 0x16B77}, - {0x16B7D, 0x16B8F}, - {0x16D40, 0x16D6C}, - {0x16E40, 0x16E7F}, - {0x16EA0, 0x16EB8}, - {0x16EBB, 0x16ED3}, - {0x16F00, 0x16F4A}, - {0x16F50, 0x16F50}, - {0x16F93, 0x16F9F}, - {0x16FE0, 0x16FE1}, - {0x16FE3, 0x16FE3}, - {0x16FF2, 0x16FF3}, - {0x17000, 0x18CD5}, - {0x18CFF, 0x18D1E}, - {0x18D80, 0x18DF2}, - {0x1AFF0, 0x1AFF3}, - {0x1AFF5, 0x1AFFB}, - {0x1AFFD, 0x1AFFE}, - {0x1B000, 0x1B122}, - {0x1B132, 0x1B132}, - {0x1B150, 0x1B152}, - {0x1B155, 0x1B155}, - {0x1B164, 0x1B167}, - {0x1B170, 0x1B2FB}, - {0x1BC00, 0x1BC6A}, - {0x1BC70, 0x1BC7C}, - {0x1BC80, 0x1BC88}, - {0x1BC90, 0x1BC99}, - {0x1D400, 0x1D454}, - {0x1D456, 0x1D49C}, - {0x1D49E, 0x1D49F}, - {0x1D4A2, 0x1D4A2}, - {0x1D4A5, 0x1D4A6}, - {0x1D4A9, 0x1D4AC}, - {0x1D4AE, 0x1D4B9}, - {0x1D4BB, 0x1D4BB}, - {0x1D4BD, 0x1D4C3}, - {0x1D4C5, 0x1D505}, - {0x1D507, 0x1D50A}, - {0x1D50D, 0x1D514}, - {0x1D516, 0x1D51C}, - {0x1D51E, 0x1D539}, - {0x1D53B, 0x1D53E}, - {0x1D540, 0x1D544}, - {0x1D546, 0x1D546}, - {0x1D54A, 0x1D550}, - {0x1D552, 0x1D6A5}, - {0x1D6A8, 0x1D6C0}, - {0x1D6C2, 0x1D6DA}, 
- {0x1D6DC, 0x1D6FA}, - {0x1D6FC, 0x1D714}, - {0x1D716, 0x1D734}, - {0x1D736, 0x1D74E}, - {0x1D750, 0x1D76E}, - {0x1D770, 0x1D788}, - {0x1D78A, 0x1D7A8}, - {0x1D7AA, 0x1D7C2}, - {0x1D7C4, 0x1D7CB}, - {0x1DF00, 0x1DF1E}, - {0x1DF25, 0x1DF2A}, - {0x1E030, 0x1E06D}, - {0x1E100, 0x1E12C}, - {0x1E137, 0x1E13D}, - {0x1E14E, 0x1E14E}, - {0x1E290, 0x1E2AD}, - {0x1E2C0, 0x1E2EB}, - {0x1E4D0, 0x1E4EB}, - {0x1E5D0, 0x1E5ED}, - {0x1E5F0, 0x1E5F0}, - {0x1E6C0, 0x1E6DE}, - {0x1E6E0, 0x1E6E2}, - {0x1E6E4, 0x1E6E5}, - {0x1E6E7, 0x1E6ED}, - {0x1E6F0, 0x1E6F4}, - {0x1E6FE, 0x1E6FF}, - {0x1E7E0, 0x1E7E6}, - {0x1E7E8, 0x1E7EB}, - {0x1E7ED, 0x1E7EE}, - {0x1E7F0, 0x1E7FE}, - {0x1E800, 0x1E8C4}, - {0x1E900, 0x1E943}, - {0x1E94B, 0x1E94B}, - {0x1EE00, 0x1EE03}, - {0x1EE05, 0x1EE1F}, - {0x1EE21, 0x1EE22}, - {0x1EE24, 0x1EE24}, - {0x1EE27, 0x1EE27}, - {0x1EE29, 0x1EE32}, - {0x1EE34, 0x1EE37}, - {0x1EE39, 0x1EE39}, - {0x1EE3B, 0x1EE3B}, - {0x1EE42, 0x1EE42}, - {0x1EE47, 0x1EE47}, - {0x1EE49, 0x1EE49}, - {0x1EE4B, 0x1EE4B}, - {0x1EE4D, 0x1EE4F}, - {0x1EE51, 0x1EE52}, - {0x1EE54, 0x1EE54}, - {0x1EE57, 0x1EE57}, - {0x1EE59, 0x1EE59}, - {0x1EE5B, 0x1EE5B}, - {0x1EE5D, 0x1EE5D}, - {0x1EE5F, 0x1EE5F}, - {0x1EE61, 0x1EE62}, - {0x1EE64, 0x1EE64}, - {0x1EE67, 0x1EE6A}, - {0x1EE6C, 0x1EE72}, - {0x1EE74, 0x1EE77}, - {0x1EE79, 0x1EE7C}, - {0x1EE7E, 0x1EE7E}, - {0x1EE80, 0x1EE89}, - {0x1EE8B, 0x1EE9B}, - {0x1EEA1, 0x1EEA3}, - {0x1EEA5, 0x1EEA9}, - {0x1EEAB, 0x1EEBB}, - {0x20000, 0x2A6DF}, - {0x2A700, 0x2B81D}, - {0x2B820, 0x2CEAD}, - {0x2CEB0, 0x2EBE0}, - {0x2EBF0, 0x2EE5D}, - {0x2F800, 0x2FA1D}, - {0x30000, 0x3134A}, - {0x31350, 0x33479}, - }; - - for (const auto& r : ranges) { - if (ch >= r.start && ch <= r.end) - return true; - } - return false; -} - -bool is_space(char32_t cp) { - switch (cp) { - case 0x0009: // TAB \t - case 0x000A: // LF \n - case 0x000B: // VT - case 0x000C: // FF - case 0x000D: // CR \r - case 0x0020: // Space - case 0x00A0: // No-Break Space - case 0x1680: // Ogham Space Mark - 
case 0x2000: // En Quad - case 0x2001: // Em Quad - case 0x2002: // En Space - case 0x2003: // Em Space - case 0x2004: // Three-Per-Em Space - case 0x2005: // Four-Per-Em Space - case 0x2006: // Six-Per-Em Space - case 0x2007: // Figure Space - case 0x2008: // Punctuation Space - case 0x2009: // Thin Space - case 0x200A: // Hair Space - case 0x202F: // Narrow No-Break Space - case 0x205F: // Medium Mathematical Space - case 0x3000: // Ideographic Space - return true; - default: - return false; - } -} - -std::string str_to_lower(const std::string& input) { - std::string result = input; - std::transform(result.begin(), result.end(), result.begin(), - [](unsigned char c) { return std::tolower(c); }); - return result; -} - -// UTF-8 -> Unicode code points -std::vector utf8_to_codepoints(const std::string& str) { - std::vector codepoints; - size_t i = 0; - while (i < str.size()) { - unsigned char c = str[i]; - char32_t cp = 0; - size_t extra_bytes = 0; - - if ((c & 0x80) == 0) - cp = c; - else if ((c & 0xE0) == 0xC0) { - cp = c & 0x1F; - extra_bytes = 1; - } else if ((c & 0xF0) == 0xE0) { - cp = c & 0x0F; - extra_bytes = 2; - } else if ((c & 0xF8) == 0xF0) { - cp = c & 0x07; - extra_bytes = 3; - } else { - ++i; - continue; - } // Invalid UTF-8 - - if (i + extra_bytes >= str.size()) - break; - - for (size_t j = 1; j <= extra_bytes; ++j) - cp = (cp << 6) | (str[i + j] & 0x3F); - - codepoints.push_back(cp); - i += 1 + extra_bytes; - } - return codepoints; -} - -// Unicode code point -> UTF-8 -std::string codepoint_to_utf8(char32_t cp) { - std::string out; - if (cp <= 0x7F) - out.push_back(static_cast(cp)); - else if (cp <= 0x7FF) { - out.push_back(static_cast(0xC0 | (cp >> 6))); - out.push_back(static_cast(0x80 | (cp & 0x3F))); - } else if (cp <= 0xFFFF) { - out.push_back(static_cast(0xE0 | (cp >> 12))); - out.push_back(static_cast(0x80 | ((cp >> 6) & 0x3F))); - out.push_back(static_cast(0x80 | (cp & 0x3F))); - } else { - out.push_back(static_cast(0xF0 | (cp >> 18))); - 
out.push_back(static_cast(0x80 | ((cp >> 12) & 0x3F))); - out.push_back(static_cast(0x80 | ((cp >> 6) & 0x3F))); - out.push_back(static_cast(0x80 | (cp & 0x3F))); - } - return out; -} - -bool starts_with(const std::vector& text, - const std::vector& prefix, - std::size_t index) { - if (index > text.size()) { - return false; - } - if (prefix.size() > text.size() - index) { - return false; - } - return std::equal(prefix.begin(), prefix.end(), text.begin() + index); -} - -// mistral: [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+ -// qwen2: (?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+ -std::vector token_split(const std::string& text) { - std::vector tokens; - auto cps = utf8_to_codepoints(text); - size_t i = 0; - - while (i < cps.size()) { - char32_t cp = cps[i]; - - // `(?i:'s|'t|'re|'ve|'m|'ll|'d)` - if (cp == U'\'' && i + 1 < cps.size()) { - std::string next = str_to_lower(codepoint_to_utf8(cps[i + 1])); - if (next == "s" || next == "t" || next == "m") { - tokens.push_back("'" + next); - i += 2; - continue; - } - if (i + 2 < cps.size()) { - next += str_to_lower(codepoint_to_utf8(cps[i + 2])); - if (next == "re" || next == "ve" || next == "ll" || next == "d") { - tokens.push_back("'" + next); - i += 3; - continue; - } - } - } - - // `\p{N}` - if (is_number(cp)) { - tokens.push_back(codepoint_to_utf8(cp)); - ++i; - continue; - } - - // `[^\r\n\p{L}\p{N}]?\p{L}+` - { - // `[^\r\n\p{L}\p{N}]\p{L}+` - if (!is_letter(cp) && cp != U'\r' && cp != U'\n' && i + 1 < cps.size() && is_letter(cps[i + 1])) { - std::string token = codepoint_to_utf8(cp); - ++i; - - while (i < cps.size() && is_letter(cps[i])) { - token += codepoint_to_utf8(cps[i]); - ++i; - } - tokens.push_back(token); - continue; - } - - // `\p{L}+` - if (is_letter(cp)) { - std::string 
token = codepoint_to_utf8(cp); - ++i; - while (i < cps.size() && is_letter(cps[i])) { - token += codepoint_to_utf8(cps[i]); - ++i; - } - tokens.push_back(token); - continue; - } - } - - // ` ?[^\s\p{L}\p{N}]+[\r\n]*` - { - // ` [^\s\p{L}\p{N}]+[\r\n]*` - if (cp == U' ' && i + 1 < cps.size() && !isspace(cps[i + 1]) && !is_letter(cps[i + 1]) && !is_number(cps[i + 1])) { - std::string token = codepoint_to_utf8(cp); - token += codepoint_to_utf8(cps[i + 1]); - i += 2; - - while (i < cps.size() && !is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) { - token += codepoint_to_utf8(cps[i]); - ++i; - } - - while (i < cps.size() && (cps[i] == U'\r' || cps[i] == U'\n')) { - token += codepoint_to_utf8(cps[i]); - ++i; - } - - tokens.push_back(token); - continue; - } - - // `[^\s\p{L}\p{N}]+[\r\n]*` - std::string token; - if (!is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) { - std::string token = codepoint_to_utf8(cp); - ++i; - - while (i < cps.size() && !is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) { - token += codepoint_to_utf8(cps[i]); - ++i; - } - - while (i < cps.size() && (cps[i] == U'\r' || cps[i] == U'\n')) { - token += codepoint_to_utf8(cps[i]); - ++i; - } - - tokens.push_back(token); - continue; - } - } - - // `\s*[\r\n]+|\s+(?!\S)|\s+` - if (is_space(cp)) { - std::string token; - bool saw_new_line = false; - - while (i < cps.size() && is_space(cps[i])) { - token += codepoint_to_utf8(cps[i]); - - if (cps[i] == U'\r' || cps[i] == U'\n') { - saw_new_line = true; - } else { - if (saw_new_line) { - break; - } - } - - ++i; - } - - tokens.push_back(token); - continue; - } - - // skip - ++i; - } - - return tokens; -} - -std::vector split_with_special_tokens( - const std::string& text, - const std::vector& special_tokens) { - std::vector result; - size_t pos = 0; - size_t text_len = text.size(); - - while (pos < text_len) { - size_t next_pos = text_len; - std::string matched_token; - - for (const auto& token : special_tokens) { - size_t 
token_pos = text.find(token, pos); - if (token_pos != std::string::npos && token_pos < next_pos) { - next_pos = token_pos; - matched_token = token; - } - } - - if (next_pos > pos) { - result.push_back(text.substr(pos, next_pos - pos)); - } - - if (!matched_token.empty()) { - result.push_back(matched_token); - pos = next_pos + matched_token.size(); - } else { - break; - } - } - - return result; -} - -// int main() { -// std::string text = "I'm testing C++ token_split function. Hello world 123"; -// auto tokens = token_split(text); - -// for (const auto& t : tokens) { -// std::cout << "[" << t << "] "; -// } -// std::cout << "\n"; -// return 0; -// } +#include +#include +#include +#include + +#include "tokenize_util.h" + +bool is_number(char32_t ch) { + return (ch >= U'0' && ch <= U'9'); +} + +bool is_letter(char32_t ch) { + static const struct { + char32_t start, end; + } ranges[] = { + {0x41, 0x5A}, + {0x61, 0x7A}, + {0xAA, 0xAA}, + {0xB5, 0xB5}, + {0xBA, 0xBA}, + {0xC0, 0xD6}, + {0xD8, 0xF6}, + {0xF8, 0x2C1}, + {0x2C6, 0x2D1}, + {0x2E0, 0x2E4}, + {0x2EC, 0x2EC}, + {0x2EE, 0x2EE}, + {0x370, 0x374}, + {0x376, 0x377}, + {0x37A, 0x37D}, + {0x37F, 0x37F}, + {0x386, 0x386}, + {0x388, 0x38A}, + {0x38C, 0x38C}, + {0x38E, 0x3A1}, + {0x3A3, 0x3F5}, + {0x3F7, 0x481}, + {0x48A, 0x52F}, + {0x531, 0x556}, + {0x559, 0x559}, + {0x560, 0x588}, + {0x5D0, 0x5EA}, + {0x5EF, 0x5F2}, + {0x620, 0x64A}, + {0x66E, 0x66F}, + {0x671, 0x6D3}, + {0x6D5, 0x6D5}, + {0x6E5, 0x6E6}, + {0x6EE, 0x6EF}, + {0x6FA, 0x6FC}, + {0x6FF, 0x6FF}, + {0x710, 0x710}, + {0x712, 0x72F}, + {0x74D, 0x7A5}, + {0x7B1, 0x7B1}, + {0x7CA, 0x7EA}, + {0x7F4, 0x7F5}, + {0x7FA, 0x7FA}, + {0x800, 0x815}, + {0x81A, 0x81A}, + {0x824, 0x824}, + {0x828, 0x828}, + {0x840, 0x858}, + {0x860, 0x86A}, + {0x870, 0x887}, + {0x889, 0x88F}, + {0x8A0, 0x8C9}, + {0x904, 0x939}, + {0x93D, 0x93D}, + {0x950, 0x950}, + {0x958, 0x961}, + {0x971, 0x980}, + {0x985, 0x98C}, + {0x98F, 0x990}, + {0x993, 0x9A8}, + {0x9AA, 0x9B0}, + {0x9B2, 0x9B2}, + 
{0x9B6, 0x9B9}, + {0x9BD, 0x9BD}, + {0x9CE, 0x9CE}, + {0x9DC, 0x9DD}, + {0x9DF, 0x9E1}, + {0x9F0, 0x9F1}, + {0x9FC, 0x9FC}, + {0xA05, 0xA0A}, + {0xA0F, 0xA10}, + {0xA13, 0xA28}, + {0xA2A, 0xA30}, + {0xA32, 0xA33}, + {0xA35, 0xA36}, + {0xA38, 0xA39}, + {0xA59, 0xA5C}, + {0xA5E, 0xA5E}, + {0xA72, 0xA74}, + {0xA85, 0xA8D}, + {0xA8F, 0xA91}, + {0xA93, 0xAA8}, + {0xAAA, 0xAB0}, + {0xAB2, 0xAB3}, + {0xAB5, 0xAB9}, + {0xABD, 0xABD}, + {0xAD0, 0xAD0}, + {0xAE0, 0xAE1}, + {0xAF9, 0xAF9}, + {0xB05, 0xB0C}, + {0xB0F, 0xB10}, + {0xB13, 0xB28}, + {0xB2A, 0xB30}, + {0xB32, 0xB33}, + {0xB35, 0xB39}, + {0xB3D, 0xB3D}, + {0xB5C, 0xB5D}, + {0xB5F, 0xB61}, + {0xB71, 0xB71}, + {0xB83, 0xB83}, + {0xB85, 0xB8A}, + {0xB8E, 0xB90}, + {0xB92, 0xB95}, + {0xB99, 0xB9A}, + {0xB9C, 0xB9C}, + {0xB9E, 0xB9F}, + {0xBA3, 0xBA4}, + {0xBA8, 0xBAA}, + {0xBAE, 0xBB9}, + {0xBD0, 0xBD0}, + {0xC05, 0xC0C}, + {0xC0E, 0xC10}, + {0xC12, 0xC28}, + {0xC2A, 0xC39}, + {0xC3D, 0xC3D}, + {0xC58, 0xC5A}, + {0xC5C, 0xC5D}, + {0xC60, 0xC61}, + {0xC80, 0xC80}, + {0xC85, 0xC8C}, + {0xC8E, 0xC90}, + {0xC92, 0xCA8}, + {0xCAA, 0xCB3}, + {0xCB5, 0xCB9}, + {0xCBD, 0xCBD}, + {0xCDC, 0xCDE}, + {0xCE0, 0xCE1}, + {0xCF1, 0xCF2}, + {0xD04, 0xD0C}, + {0xD0E, 0xD10}, + {0xD12, 0xD3A}, + {0xD3D, 0xD3D}, + {0xD4E, 0xD4E}, + {0xD54, 0xD56}, + {0xD5F, 0xD61}, + {0xD7A, 0xD7F}, + {0xD85, 0xD96}, + {0xD9A, 0xDB1}, + {0xDB3, 0xDBB}, + {0xDBD, 0xDBD}, + {0xDC0, 0xDC6}, + {0xE01, 0xE30}, + {0xE32, 0xE33}, + {0xE40, 0xE46}, + {0xE81, 0xE82}, + {0xE84, 0xE84}, + {0xE86, 0xE8A}, + {0xE8C, 0xEA3}, + {0xEA5, 0xEA5}, + {0xEA7, 0xEB0}, + {0xEB2, 0xEB3}, + {0xEBD, 0xEBD}, + {0xEC0, 0xEC4}, + {0xEC6, 0xEC6}, + {0xEDC, 0xEDF}, + {0xF00, 0xF00}, + {0xF40, 0xF47}, + {0xF49, 0xF6C}, + {0xF88, 0xF8C}, + {0x1000, 0x102A}, + {0x103F, 0x103F}, + {0x1050, 0x1055}, + {0x105A, 0x105D}, + {0x1061, 0x1061}, + {0x1065, 0x1066}, + {0x106E, 0x1070}, + {0x1075, 0x1081}, + {0x108E, 0x108E}, + {0x10A0, 0x10C5}, + {0x10C7, 0x10C7}, + {0x10CD, 0x10CD}, + {0x10D0, 
0x10FA}, + {0x10FC, 0x1248}, + {0x124A, 0x124D}, + {0x1250, 0x1256}, + {0x1258, 0x1258}, + {0x125A, 0x125D}, + {0x1260, 0x1288}, + {0x128A, 0x128D}, + {0x1290, 0x12B0}, + {0x12B2, 0x12B5}, + {0x12B8, 0x12BE}, + {0x12C0, 0x12C0}, + {0x12C2, 0x12C5}, + {0x12C8, 0x12D6}, + {0x12D8, 0x1310}, + {0x1312, 0x1315}, + {0x1318, 0x135A}, + {0x1380, 0x138F}, + {0x13A0, 0x13F5}, + {0x13F8, 0x13FD}, + {0x1401, 0x166C}, + {0x166F, 0x167F}, + {0x1681, 0x169A}, + {0x16A0, 0x16EA}, + {0x16F1, 0x16F8}, + {0x1700, 0x1711}, + {0x171F, 0x1731}, + {0x1740, 0x1751}, + {0x1760, 0x176C}, + {0x176E, 0x1770}, + {0x1780, 0x17B3}, + {0x17D7, 0x17D7}, + {0x17DC, 0x17DC}, + {0x1820, 0x1878}, + {0x1880, 0x1884}, + {0x1887, 0x18A8}, + {0x18AA, 0x18AA}, + {0x18B0, 0x18F5}, + {0x1900, 0x191E}, + {0x1950, 0x196D}, + {0x1970, 0x1974}, + {0x1980, 0x19AB}, + {0x19B0, 0x19C9}, + {0x1A00, 0x1A16}, + {0x1A20, 0x1A54}, + {0x1AA7, 0x1AA7}, + {0x1B05, 0x1B33}, + {0x1B45, 0x1B4C}, + {0x1B83, 0x1BA0}, + {0x1BAE, 0x1BAF}, + {0x1BBA, 0x1BE5}, + {0x1C00, 0x1C23}, + {0x1C4D, 0x1C4F}, + {0x1C5A, 0x1C7D}, + {0x1C80, 0x1C8A}, + {0x1C90, 0x1CBA}, + {0x1CBD, 0x1CBF}, + {0x1CE9, 0x1CEC}, + {0x1CEE, 0x1CF3}, + {0x1CF5, 0x1CF6}, + {0x1CFA, 0x1CFA}, + {0x1D00, 0x1DBF}, + {0x1E00, 0x1F15}, + {0x1F18, 0x1F1D}, + {0x1F20, 0x1F45}, + {0x1F48, 0x1F4D}, + {0x1F50, 0x1F57}, + {0x1F59, 0x1F59}, + {0x1F5B, 0x1F5B}, + {0x1F5D, 0x1F5D}, + {0x1F5F, 0x1F7D}, + {0x1F80, 0x1FB4}, + {0x1FB6, 0x1FBC}, + {0x1FBE, 0x1FBE}, + {0x1FC2, 0x1FC4}, + {0x1FC6, 0x1FCC}, + {0x1FD0, 0x1FD3}, + {0x1FD6, 0x1FDB}, + {0x1FE0, 0x1FEC}, + {0x1FF2, 0x1FF4}, + {0x1FF6, 0x1FFC}, + {0x2071, 0x2071}, + {0x207F, 0x207F}, + {0x2090, 0x209C}, + {0x2102, 0x2102}, + {0x2107, 0x2107}, + {0x210A, 0x2113}, + {0x2115, 0x2115}, + {0x2119, 0x211D}, + {0x2124, 0x2124}, + {0x2126, 0x2126}, + {0x2128, 0x2128}, + {0x212A, 0x212D}, + {0x212F, 0x2139}, + {0x213C, 0x213F}, + {0x2145, 0x2149}, + {0x214E, 0x214E}, + {0x2183, 0x2184}, + {0x2C00, 0x2CE4}, + {0x2CEB, 0x2CEE}, + {0x2CF2, 
0x2CF3}, + {0x2D00, 0x2D25}, + {0x2D27, 0x2D27}, + {0x2D2D, 0x2D2D}, + {0x2D30, 0x2D67}, + {0x2D6F, 0x2D6F}, + {0x2D80, 0x2D96}, + {0x2DA0, 0x2DA6}, + {0x2DA8, 0x2DAE}, + {0x2DB0, 0x2DB6}, + {0x2DB8, 0x2DBE}, + {0x2DC0, 0x2DC6}, + {0x2DC8, 0x2DCE}, + {0x2DD0, 0x2DD6}, + {0x2DD8, 0x2DDE}, + {0x2E2F, 0x2E2F}, + {0x3005, 0x3006}, + {0x3031, 0x3035}, + {0x303B, 0x303C}, + {0x3041, 0x3096}, + {0x309D, 0x309F}, + {0x30A1, 0x30FA}, + {0x30FC, 0x30FF}, + {0x3105, 0x312F}, + {0x3131, 0x318E}, + {0x31A0, 0x31BF}, + {0x31F0, 0x31FF}, + {0x3400, 0x4DBF}, + {0x4E00, 0xA48C}, + {0xA4D0, 0xA4FD}, + {0xA500, 0xA60C}, + {0xA610, 0xA61F}, + {0xA62A, 0xA62B}, + {0xA640, 0xA66E}, + {0xA67F, 0xA69D}, + {0xA6A0, 0xA6E5}, + {0xA717, 0xA71F}, + {0xA722, 0xA788}, + {0xA78B, 0xA7DC}, + {0xA7F1, 0xA801}, + {0xA803, 0xA805}, + {0xA807, 0xA80A}, + {0xA80C, 0xA822}, + {0xA840, 0xA873}, + {0xA882, 0xA8B3}, + {0xA8F2, 0xA8F7}, + {0xA8FB, 0xA8FB}, + {0xA8FD, 0xA8FE}, + {0xA90A, 0xA925}, + {0xA930, 0xA946}, + {0xA960, 0xA97C}, + {0xA984, 0xA9B2}, + {0xA9CF, 0xA9CF}, + {0xA9E0, 0xA9E4}, + {0xA9E6, 0xA9EF}, + {0xA9FA, 0xA9FE}, + {0xAA00, 0xAA28}, + {0xAA40, 0xAA42}, + {0xAA44, 0xAA4B}, + {0xAA60, 0xAA76}, + {0xAA7A, 0xAA7A}, + {0xAA7E, 0xAAAF}, + {0xAAB1, 0xAAB1}, + {0xAAB5, 0xAAB6}, + {0xAAB9, 0xAABD}, + {0xAAC0, 0xAAC0}, + {0xAAC2, 0xAAC2}, + {0xAADB, 0xAADD}, + {0xAAE0, 0xAAEA}, + {0xAAF2, 0xAAF4}, + {0xAB01, 0xAB06}, + {0xAB09, 0xAB0E}, + {0xAB11, 0xAB16}, + {0xAB20, 0xAB26}, + {0xAB28, 0xAB2E}, + {0xAB30, 0xAB5A}, + {0xAB5C, 0xAB69}, + {0xAB70, 0xABE2}, + {0xAC00, 0xD7A3}, + {0xD7B0, 0xD7C6}, + {0xD7CB, 0xD7FB}, + {0xF900, 0xFA6D}, + {0xFA70, 0xFAD9}, + {0xFB00, 0xFB06}, + {0xFB13, 0xFB17}, + {0xFB1D, 0xFB1D}, + {0xFB1F, 0xFB28}, + {0xFB2A, 0xFB36}, + {0xFB38, 0xFB3C}, + {0xFB3E, 0xFB3E}, + {0xFB40, 0xFB41}, + {0xFB43, 0xFB44}, + {0xFB46, 0xFBB1}, + {0xFBD3, 0xFD3D}, + {0xFD50, 0xFD8F}, + {0xFD92, 0xFDC7}, + {0xFDF0, 0xFDFB}, + {0xFE70, 0xFE74}, + {0xFE76, 0xFEFC}, + {0xFF21, 0xFF3A}, + {0xFF41, 
0xFF5A}, + {0xFF66, 0xFFBE}, + {0xFFC2, 0xFFC7}, + {0xFFCA, 0xFFCF}, + {0xFFD2, 0xFFD7}, + {0xFFDA, 0xFFDC}, + {0x10000, 0x1000B}, + {0x1000D, 0x10026}, + {0x10028, 0x1003A}, + {0x1003C, 0x1003D}, + {0x1003F, 0x1004D}, + {0x10050, 0x1005D}, + {0x10080, 0x100FA}, + {0x10280, 0x1029C}, + {0x102A0, 0x102D0}, + {0x10300, 0x1031F}, + {0x1032D, 0x10340}, + {0x10342, 0x10349}, + {0x10350, 0x10375}, + {0x10380, 0x1039D}, + {0x103A0, 0x103C3}, + {0x103C8, 0x103CF}, + {0x10400, 0x1049D}, + {0x104B0, 0x104D3}, + {0x104D8, 0x104FB}, + {0x10500, 0x10527}, + {0x10530, 0x10563}, + {0x10570, 0x1057A}, + {0x1057C, 0x1058A}, + {0x1058C, 0x10592}, + {0x10594, 0x10595}, + {0x10597, 0x105A1}, + {0x105A3, 0x105B1}, + {0x105B3, 0x105B9}, + {0x105BB, 0x105BC}, + {0x105C0, 0x105F3}, + {0x10600, 0x10736}, + {0x10740, 0x10755}, + {0x10760, 0x10767}, + {0x10780, 0x10785}, + {0x10787, 0x107B0}, + {0x107B2, 0x107BA}, + {0x10800, 0x10805}, + {0x10808, 0x10808}, + {0x1080A, 0x10835}, + {0x10837, 0x10838}, + {0x1083C, 0x1083C}, + {0x1083F, 0x10855}, + {0x10860, 0x10876}, + {0x10880, 0x1089E}, + {0x108E0, 0x108F2}, + {0x108F4, 0x108F5}, + {0x10900, 0x10915}, + {0x10920, 0x10939}, + {0x10940, 0x10959}, + {0x10980, 0x109B7}, + {0x109BE, 0x109BF}, + {0x10A00, 0x10A00}, + {0x10A10, 0x10A13}, + {0x10A15, 0x10A17}, + {0x10A19, 0x10A35}, + {0x10A60, 0x10A7C}, + {0x10A80, 0x10A9C}, + {0x10AC0, 0x10AC7}, + {0x10AC9, 0x10AE4}, + {0x10B00, 0x10B35}, + {0x10B40, 0x10B55}, + {0x10B60, 0x10B72}, + {0x10B80, 0x10B91}, + {0x10C00, 0x10C48}, + {0x10C80, 0x10CB2}, + {0x10CC0, 0x10CF2}, + {0x10D00, 0x10D23}, + {0x10D4A, 0x10D65}, + {0x10D6F, 0x10D85}, + {0x10E80, 0x10EA9}, + {0x10EB0, 0x10EB1}, + {0x10EC2, 0x10EC7}, + {0x10F00, 0x10F1C}, + {0x10F27, 0x10F27}, + {0x10F30, 0x10F45}, + {0x10F70, 0x10F81}, + {0x10FB0, 0x10FC4}, + {0x10FE0, 0x10FF6}, + {0x11003, 0x11037}, + {0x11071, 0x11072}, + {0x11075, 0x11075}, + {0x11083, 0x110AF}, + {0x110D0, 0x110E8}, + {0x11103, 0x11126}, + {0x11144, 0x11144}, + {0x11147, 
0x11147}, + {0x11150, 0x11172}, + {0x11176, 0x11176}, + {0x11183, 0x111B2}, + {0x111C1, 0x111C4}, + {0x111DA, 0x111DA}, + {0x111DC, 0x111DC}, + {0x11200, 0x11211}, + {0x11213, 0x1122B}, + {0x1123F, 0x11240}, + {0x11280, 0x11286}, + {0x11288, 0x11288}, + {0x1128A, 0x1128D}, + {0x1128F, 0x1129D}, + {0x1129F, 0x112A8}, + {0x112B0, 0x112DE}, + {0x11305, 0x1130C}, + {0x1130F, 0x11310}, + {0x11313, 0x11328}, + {0x1132A, 0x11330}, + {0x11332, 0x11333}, + {0x11335, 0x11339}, + {0x1133D, 0x1133D}, + {0x11350, 0x11350}, + {0x1135D, 0x11361}, + {0x11380, 0x11389}, + {0x1138B, 0x1138B}, + {0x1138E, 0x1138E}, + {0x11390, 0x113B5}, + {0x113B7, 0x113B7}, + {0x113D1, 0x113D1}, + {0x113D3, 0x113D3}, + {0x11400, 0x11434}, + {0x11447, 0x1144A}, + {0x1145F, 0x11461}, + {0x11480, 0x114AF}, + {0x114C4, 0x114C5}, + {0x114C7, 0x114C7}, + {0x11580, 0x115AE}, + {0x115D8, 0x115DB}, + {0x11600, 0x1162F}, + {0x11644, 0x11644}, + {0x11680, 0x116AA}, + {0x116B8, 0x116B8}, + {0x11700, 0x1171A}, + {0x11740, 0x11746}, + {0x11800, 0x1182B}, + {0x118A0, 0x118DF}, + {0x118FF, 0x11906}, + {0x11909, 0x11909}, + {0x1190C, 0x11913}, + {0x11915, 0x11916}, + {0x11918, 0x1192F}, + {0x1193F, 0x1193F}, + {0x11941, 0x11941}, + {0x119A0, 0x119A7}, + {0x119AA, 0x119D0}, + {0x119E1, 0x119E1}, + {0x119E3, 0x119E3}, + {0x11A00, 0x11A00}, + {0x11A0B, 0x11A32}, + {0x11A3A, 0x11A3A}, + {0x11A50, 0x11A50}, + {0x11A5C, 0x11A89}, + {0x11A9D, 0x11A9D}, + {0x11AB0, 0x11AF8}, + {0x11BC0, 0x11BE0}, + {0x11C00, 0x11C08}, + {0x11C0A, 0x11C2E}, + {0x11C40, 0x11C40}, + {0x11C72, 0x11C8F}, + {0x11D00, 0x11D06}, + {0x11D08, 0x11D09}, + {0x11D0B, 0x11D30}, + {0x11D46, 0x11D46}, + {0x11D60, 0x11D65}, + {0x11D67, 0x11D68}, + {0x11D6A, 0x11D89}, + {0x11D98, 0x11D98}, + {0x11DB0, 0x11DDB}, + {0x11EE0, 0x11EF2}, + {0x11F02, 0x11F02}, + {0x11F04, 0x11F10}, + {0x11F12, 0x11F33}, + {0x11FB0, 0x11FB0}, + {0x12000, 0x12399}, + {0x12480, 0x12543}, + {0x12F90, 0x12FF0}, + {0x13000, 0x1342F}, + {0x13441, 0x13446}, + {0x13460, 0x143FA}, + 
{0x14400, 0x14646}, + {0x16100, 0x1611D}, + {0x16800, 0x16A38}, + {0x16A40, 0x16A5E}, + {0x16A70, 0x16ABE}, + {0x16AD0, 0x16AED}, + {0x16B00, 0x16B2F}, + {0x16B40, 0x16B43}, + {0x16B63, 0x16B77}, + {0x16B7D, 0x16B8F}, + {0x16D40, 0x16D6C}, + {0x16E40, 0x16E7F}, + {0x16EA0, 0x16EB8}, + {0x16EBB, 0x16ED3}, + {0x16F00, 0x16F4A}, + {0x16F50, 0x16F50}, + {0x16F93, 0x16F9F}, + {0x16FE0, 0x16FE1}, + {0x16FE3, 0x16FE3}, + {0x16FF2, 0x16FF3}, + {0x17000, 0x18CD5}, + {0x18CFF, 0x18D1E}, + {0x18D80, 0x18DF2}, + {0x1AFF0, 0x1AFF3}, + {0x1AFF5, 0x1AFFB}, + {0x1AFFD, 0x1AFFE}, + {0x1B000, 0x1B122}, + {0x1B132, 0x1B132}, + {0x1B150, 0x1B152}, + {0x1B155, 0x1B155}, + {0x1B164, 0x1B167}, + {0x1B170, 0x1B2FB}, + {0x1BC00, 0x1BC6A}, + {0x1BC70, 0x1BC7C}, + {0x1BC80, 0x1BC88}, + {0x1BC90, 0x1BC99}, + {0x1D400, 0x1D454}, + {0x1D456, 0x1D49C}, + {0x1D49E, 0x1D49F}, + {0x1D4A2, 0x1D4A2}, + {0x1D4A5, 0x1D4A6}, + {0x1D4A9, 0x1D4AC}, + {0x1D4AE, 0x1D4B9}, + {0x1D4BB, 0x1D4BB}, + {0x1D4BD, 0x1D4C3}, + {0x1D4C5, 0x1D505}, + {0x1D507, 0x1D50A}, + {0x1D50D, 0x1D514}, + {0x1D516, 0x1D51C}, + {0x1D51E, 0x1D539}, + {0x1D53B, 0x1D53E}, + {0x1D540, 0x1D544}, + {0x1D546, 0x1D546}, + {0x1D54A, 0x1D550}, + {0x1D552, 0x1D6A5}, + {0x1D6A8, 0x1D6C0}, + {0x1D6C2, 0x1D6DA}, + {0x1D6DC, 0x1D6FA}, + {0x1D6FC, 0x1D714}, + {0x1D716, 0x1D734}, + {0x1D736, 0x1D74E}, + {0x1D750, 0x1D76E}, + {0x1D770, 0x1D788}, + {0x1D78A, 0x1D7A8}, + {0x1D7AA, 0x1D7C2}, + {0x1D7C4, 0x1D7CB}, + {0x1DF00, 0x1DF1E}, + {0x1DF25, 0x1DF2A}, + {0x1E030, 0x1E06D}, + {0x1E100, 0x1E12C}, + {0x1E137, 0x1E13D}, + {0x1E14E, 0x1E14E}, + {0x1E290, 0x1E2AD}, + {0x1E2C0, 0x1E2EB}, + {0x1E4D0, 0x1E4EB}, + {0x1E5D0, 0x1E5ED}, + {0x1E5F0, 0x1E5F0}, + {0x1E6C0, 0x1E6DE}, + {0x1E6E0, 0x1E6E2}, + {0x1E6E4, 0x1E6E5}, + {0x1E6E7, 0x1E6ED}, + {0x1E6F0, 0x1E6F4}, + {0x1E6FE, 0x1E6FF}, + {0x1E7E0, 0x1E7E6}, + {0x1E7E8, 0x1E7EB}, + {0x1E7ED, 0x1E7EE}, + {0x1E7F0, 0x1E7FE}, + {0x1E800, 0x1E8C4}, + {0x1E900, 0x1E943}, + {0x1E94B, 0x1E94B}, + {0x1EE00, 0x1EE03}, 
+ {0x1EE05, 0x1EE1F}, + {0x1EE21, 0x1EE22}, + {0x1EE24, 0x1EE24}, + {0x1EE27, 0x1EE27}, + {0x1EE29, 0x1EE32}, + {0x1EE34, 0x1EE37}, + {0x1EE39, 0x1EE39}, + {0x1EE3B, 0x1EE3B}, + {0x1EE42, 0x1EE42}, + {0x1EE47, 0x1EE47}, + {0x1EE49, 0x1EE49}, + {0x1EE4B, 0x1EE4B}, + {0x1EE4D, 0x1EE4F}, + {0x1EE51, 0x1EE52}, + {0x1EE54, 0x1EE54}, + {0x1EE57, 0x1EE57}, + {0x1EE59, 0x1EE59}, + {0x1EE5B, 0x1EE5B}, + {0x1EE5D, 0x1EE5D}, + {0x1EE5F, 0x1EE5F}, + {0x1EE61, 0x1EE62}, + {0x1EE64, 0x1EE64}, + {0x1EE67, 0x1EE6A}, + {0x1EE6C, 0x1EE72}, + {0x1EE74, 0x1EE77}, + {0x1EE79, 0x1EE7C}, + {0x1EE7E, 0x1EE7E}, + {0x1EE80, 0x1EE89}, + {0x1EE8B, 0x1EE9B}, + {0x1EEA1, 0x1EEA3}, + {0x1EEA5, 0x1EEA9}, + {0x1EEAB, 0x1EEBB}, + {0x20000, 0x2A6DF}, + {0x2A700, 0x2B81D}, + {0x2B820, 0x2CEAD}, + {0x2CEB0, 0x2EBE0}, + {0x2EBF0, 0x2EE5D}, + {0x2F800, 0x2FA1D}, + {0x30000, 0x3134A}, + {0x31350, 0x33479}, + }; + + for (const auto& r : ranges) { + if (ch >= r.start && ch <= r.end) + return true; + } + return false; +} + +bool is_space(char32_t cp) { + switch (cp) { + case 0x0009: // TAB \t + case 0x000A: // LF \n + case 0x000B: // VT + case 0x000C: // FF + case 0x000D: // CR \r + case 0x0020: // Space + case 0x00A0: // No-Break Space + case 0x1680: // Ogham Space Mark + case 0x2000: // En Quad + case 0x2001: // Em Quad + case 0x2002: // En Space + case 0x2003: // Em Space + case 0x2004: // Three-Per-Em Space + case 0x2005: // Four-Per-Em Space + case 0x2006: // Six-Per-Em Space + case 0x2007: // Figure Space + case 0x2008: // Punctuation Space + case 0x2009: // Thin Space + case 0x200A: // Hair Space + case 0x202F: // Narrow No-Break Space + case 0x205F: // Medium Mathematical Space + case 0x3000: // Ideographic Space + return true; + default: + return false; + } +} + +std::string str_to_lower(const std::string& input) { + std::string result = input; + std::transform(result.begin(), result.end(), result.begin(), + [](unsigned char c) { return std::tolower(c); }); + return result; +} + +// UTF-8 -> 
Unicode code points +std::vector utf8_to_codepoints(const std::string& str) { + std::vector codepoints; + size_t i = 0; + while (i < str.size()) { + unsigned char c = str[i]; + char32_t cp = 0; + size_t extra_bytes = 0; + + if ((c & 0x80) == 0) + cp = c; + else if ((c & 0xE0) == 0xC0) { + cp = c & 0x1F; + extra_bytes = 1; + } else if ((c & 0xF0) == 0xE0) { + cp = c & 0x0F; + extra_bytes = 2; + } else if ((c & 0xF8) == 0xF0) { + cp = c & 0x07; + extra_bytes = 3; + } else { + ++i; + continue; + } // Invalid UTF-8 + + if (i + extra_bytes >= str.size()) + break; + + for (size_t j = 1; j <= extra_bytes; ++j) + cp = (cp << 6) | (str[i + j] & 0x3F); + + codepoints.push_back(cp); + i += 1 + extra_bytes; + } + return codepoints; +} + +// Unicode code point -> UTF-8 +std::string codepoint_to_utf8(char32_t cp) { + std::string out; + if (cp <= 0x7F) + out.push_back(static_cast(cp)); + else if (cp <= 0x7FF) { + out.push_back(static_cast(0xC0 | (cp >> 6))); + out.push_back(static_cast(0x80 | (cp & 0x3F))); + } else if (cp <= 0xFFFF) { + out.push_back(static_cast(0xE0 | (cp >> 12))); + out.push_back(static_cast(0x80 | ((cp >> 6) & 0x3F))); + out.push_back(static_cast(0x80 | (cp & 0x3F))); + } else { + out.push_back(static_cast(0xF0 | (cp >> 18))); + out.push_back(static_cast(0x80 | ((cp >> 12) & 0x3F))); + out.push_back(static_cast(0x80 | ((cp >> 6) & 0x3F))); + out.push_back(static_cast(0x80 | (cp & 0x3F))); + } + return out; +} + +bool starts_with(const std::vector& text, + const std::vector& prefix, + std::size_t index) { + if (index > text.size()) { + return false; + } + if (prefix.size() > text.size() - index) { + return false; + } + return std::equal(prefix.begin(), prefix.end(), text.begin() + index); +} + +// mistral: [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+ +// qwen2: 
(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+ +std::vector token_split(const std::string& text) { + std::vector tokens; + auto cps = utf8_to_codepoints(text); + size_t i = 0; + + while (i < cps.size()) { + char32_t cp = cps[i]; + + // `(?i:'s|'t|'re|'ve|'m|'ll|'d)` + if (cp == U'\'' && i + 1 < cps.size()) { + std::string next = str_to_lower(codepoint_to_utf8(cps[i + 1])); + if (next == "s" || next == "t" || next == "m") { + tokens.push_back("'" + next); + i += 2; + continue; + } + if (i + 2 < cps.size()) { + next += str_to_lower(codepoint_to_utf8(cps[i + 2])); + if (next == "re" || next == "ve" || next == "ll" || next == "d") { + tokens.push_back("'" + next); + i += 3; + continue; + } + } + } + + // `\p{N}` + if (is_number(cp)) { + tokens.push_back(codepoint_to_utf8(cp)); + ++i; + continue; + } + + // `[^\r\n\p{L}\p{N}]?\p{L}+` + { + // `[^\r\n\p{L}\p{N}]\p{L}+` + if (!is_letter(cp) && cp != U'\r' && cp != U'\n' && i + 1 < cps.size() && is_letter(cps[i + 1])) { + std::string token = codepoint_to_utf8(cp); + ++i; + + while (i < cps.size() && is_letter(cps[i])) { + token += codepoint_to_utf8(cps[i]); + ++i; + } + tokens.push_back(token); + continue; + } + + // `\p{L}+` + if (is_letter(cp)) { + std::string token = codepoint_to_utf8(cp); + ++i; + while (i < cps.size() && is_letter(cps[i])) { + token += codepoint_to_utf8(cps[i]); + ++i; + } + tokens.push_back(token); + continue; + } + } + + // ` ?[^\s\p{L}\p{N}]+[\r\n]*` + { + // ` [^\s\p{L}\p{N}]+[\r\n]*` + if (cp == U' ' && i + 1 < cps.size() && !isspace(cps[i + 1]) && !is_letter(cps[i + 1]) && !is_number(cps[i + 1])) { + std::string token = codepoint_to_utf8(cp); + token += codepoint_to_utf8(cps[i + 1]); + i += 2; + + while (i < cps.size() && !is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) { + token += codepoint_to_utf8(cps[i]); + ++i; + } + + while (i < cps.size() && (cps[i] == U'\r' || cps[i] == U'\n')) { + token += 
codepoint_to_utf8(cps[i]); + ++i; + } + + tokens.push_back(token); + continue; + } + + // `[^\s\p{L}\p{N}]+[\r\n]*` + std::string token; + if (!is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) { + std::string token = codepoint_to_utf8(cp); + ++i; + + while (i < cps.size() && !is_letter(cps[i]) && !is_number(cps[i]) && !isspace(cps[i])) { + token += codepoint_to_utf8(cps[i]); + ++i; + } + + while (i < cps.size() && (cps[i] == U'\r' || cps[i] == U'\n')) { + token += codepoint_to_utf8(cps[i]); + ++i; + } + + tokens.push_back(token); + continue; + } + } + + // `\s*[\r\n]+|\s+(?!\S)|\s+` + if (is_space(cp)) { + std::string token; + bool saw_new_line = false; + + while (i < cps.size() && is_space(cps[i])) { + token += codepoint_to_utf8(cps[i]); + + if (cps[i] == U'\r' || cps[i] == U'\n') { + saw_new_line = true; + } else { + if (saw_new_line) { + break; + } + } + + ++i; + } + + tokens.push_back(token); + continue; + } + + // skip + ++i; + } + + return tokens; +} + +std::vector split_with_special_tokens( + const std::string& text, + const std::vector& special_tokens) { + std::vector result; + size_t pos = 0; + size_t text_len = text.size(); + + while (pos < text_len) { + size_t next_pos = text_len; + std::string matched_token; + + for (const auto& token : special_tokens) { + size_t token_pos = text.find(token, pos); + if (token_pos != std::string::npos && token_pos < next_pos) { + next_pos = token_pos; + matched_token = token; + } + } + + if (next_pos > pos) { + result.push_back(text.substr(pos, next_pos - pos)); + } + + if (!matched_token.empty()) { + result.push_back(matched_token); + pos = next_pos + matched_token.size(); + } else { + break; + } + } + + return result; +} + +// int main() { +// std::string text = "I'm testing C++ token_split function. 
Hello world 123"; +// auto tokens = token_split(text); + +// for (const auto& t : tokens) { +// std::cout << "[" << t << "] "; +// } +// std::cout << "\n"; +// return 0; +// } diff --git a/src/upscaler.cpp b/src/upscaler.cpp index 03f7714e5..b88764ad5 100644 --- a/src/upscaler.cpp +++ b/src/upscaler.cpp @@ -23,37 +23,20 @@ struct UpscalerGGML { bool load_from_file(const std::string& esrgan_path, bool offload_params_to_cpu, - int n_threads) { + int n_threads, + std::string device = "") { ggml_log_set(ggml_log_callback_default, nullptr); -#ifdef SD_USE_CUDA - LOG_DEBUG("Using CUDA backend"); - backend = ggml_backend_cuda_init(0); -#endif -#ifdef SD_USE_METAL - LOG_DEBUG("Using Metal backend"); - backend = ggml_backend_metal_init(); -#endif -#ifdef SD_USE_VULKAN - LOG_DEBUG("Using Vulkan backend"); - backend = ggml_backend_vk_init(0); -#endif -#ifdef SD_USE_OPENCL - LOG_DEBUG("Using OpenCL backend"); - backend = ggml_backend_opencl_init(); -#endif -#ifdef SD_USE_SYCL - LOG_DEBUG("Using SYCL backend"); - backend = ggml_backend_sycl_init(0); -#endif + device = sanitize_backend_name(device); + backend = init_named_backend(device); ModelLoader model_loader; if (!model_loader.init_from_file_and_convert_name(esrgan_path)) { LOG_ERROR("init model loader from file failed: '%s'", esrgan_path.c_str()); } model_loader.set_wtype_override(model_data_type); - if (!backend) { - LOG_DEBUG("Using CPU backend"); - backend = ggml_backend_cpu_init(); - } + // if (!backend) { + // LOG_DEBUG("Using CPU backend"); + // backend = ggml_backend_cpu_init(); + // } LOG_INFO("Upscaler weight type: %s", ggml_type_name(model_data_type)); esrgan_upscaler = std::make_shared(backend, offload_params_to_cpu, tile_size, model_loader.get_tensor_storage_map()); if (direct) { @@ -129,7 +112,8 @@ upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str, bool offload_params_to_cpu, bool direct, int n_threads, - int tile_size) { + int tile_size, + const char* device) { upscaler_ctx_t* upscaler_ctx = 
(upscaler_ctx_t*)malloc(sizeof(upscaler_ctx_t)); if (upscaler_ctx == nullptr) { return nullptr; @@ -141,7 +125,7 @@ upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path_c_str, return nullptr; } - if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu, n_threads)) { + if (!upscaler_ctx->upscaler->load_from_file(esrgan_path, offload_params_to_cpu, n_threads, SAFE_STR(device))) { delete upscaler_ctx->upscaler; upscaler_ctx->upscaler = nullptr; free(upscaler_ctx); diff --git a/src/util.cpp b/src/util.cpp index e01876268..11c927958 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -23,8 +23,9 @@ #include #endif -#include "ggml-cpu.h" +#include "ggml-backend.h" #include "ggml.h" +#include "ggml_extend_backend.hpp" #include "stable-diffusion.h" bool ends_with(const std::string& str, const std::string& ending) { @@ -495,25 +496,29 @@ sd_progress_cb_t sd_get_progress_callback() { void* sd_get_progress_callback_data() { return sd_progress_cb_data; } + +// Reference: https://github.com/ggml-org/llama.cpp/blob/c46758d28fa9846893f37e8cec03b73fee120604/src/llama.cpp#L1198 const char* sd_get_system_info() { - static char buffer[1024]; - std::stringstream ss; - ss << "System Info: \n"; - ss << " SSE3 = " << ggml_cpu_has_sse3() << " | "; - ss << " AVX = " << ggml_cpu_has_avx() << " | "; - ss << " AVX2 = " << ggml_cpu_has_avx2() << " | "; - ss << " AVX512 = " << ggml_cpu_has_avx512() << " | "; - ss << " AVX512_VBMI = " << ggml_cpu_has_avx512_vbmi() << " | "; - ss << " AVX512_VNNI = " << ggml_cpu_has_avx512_vnni() << " | "; - ss << " FMA = " << ggml_cpu_has_fma() << " | "; - ss << " NEON = " << ggml_cpu_has_neon() << " | "; - ss << " ARM_FMA = " << ggml_cpu_has_arm_fma() << " | "; - ss << " F16C = " << ggml_cpu_has_f16c() << " | "; - ss << " FP16_VA = " << ggml_cpu_has_fp16_va() << " | "; - ss << " WASM_SIMD = " << ggml_cpu_has_wasm_simd() << " | "; - ss << " VSX = " << ggml_cpu_has_vsx() << " | "; - snprintf(buffer, sizeof(buffer), "%s", ss.str().c_str()); 
- return buffer; + static std::string s; + s.clear(); // Clear the string, since it's static, otherwise it will accumulate data from previous calls. + + for (size_t i = 0; i < ggml_backend_reg_count(); i++) { + auto* reg = ggml_backend_reg_get(i); + auto* get_features_fn = (ggml_backend_get_features_t)ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features"); + if (get_features_fn) { + ggml_backend_feature* features = get_features_fn(reg); + s += ggml_backend_reg_name(reg); + s += " : "; + for (; features->name; features++) { + s += features->name; + s += " = "; + s += features->value; + s += " | "; + } + } + } + + return s.c_str(); } sd_image_t tensor_to_sd_image(const sd::Tensor& tensor, int frame_index) { @@ -718,3 +723,15 @@ std::vector> parse_prompt_attention(const std::str return res; } + +// test if the backend is a specific one, e.g. "CUDA", "ROCm", "Vulkan" etc. +bool sd_backend_is(ggml_backend_t backend, const std::string& name) { + if (!backend) { + return false; + } + ggml_backend_dev_t dev = ggml_backend_get_device(backend); + if (!dev) + return false; + std::string dev_name = ggml_backend_dev_name(dev); + return dev_name.find(name) != std::string::npos; +} diff --git a/src/util.h b/src/util.h index 2468cb93d..3ebfacd0c 100644 --- a/src/util.h +++ b/src/util.h @@ -6,6 +6,7 @@ #include #include +#include "ggml-backend.h" #include "stable-diffusion.h" #include "tensor.hpp" @@ -82,6 +83,9 @@ int sd_get_preview_interval(); bool sd_should_preview_denoised(); bool sd_should_preview_noisy(); +// test if the backend is a specific one, e.g. "CUDA", "ROCm", "Vulkan" etc. +bool sd_backend_is(ggml_backend_t backend, const std::string& name); + #define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__) #define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__) #define LOG_WARN(format, ...) 
log_printf(SD_LOG_WARN, __FILE__, __LINE__, format, ##__VA_ARGS__) diff --git a/src/z_image.hpp b/src/z_image.hpp index 363ce5f4f..6bb44b791 100644 --- a/src/z_image.hpp +++ b/src/z_image.hpp @@ -31,10 +31,6 @@ namespace ZImage { : head_dim(head_dim), num_heads(num_heads), num_kv_heads(num_kv_heads), qk_norm(qk_norm) { blocks["qkv"] = std::make_shared(hidden_size, (num_heads + num_kv_heads * 2) * head_dim, false); float scale = 1.f; -#if GGML_USE_HIP - // Prevent NaN issues with certain ROCm setups - scale = 1.f / 16.f; -#endif blocks["out"] = std::make_shared(num_heads * head_dim, hidden_size, false, false, false, scale); if (qk_norm) { blocks["q_norm"] = std::make_shared(head_dim); @@ -52,6 +48,10 @@ namespace ZImage { auto qkv_proj = std::dynamic_pointer_cast(blocks["qkv"]); auto out_proj = std::dynamic_pointer_cast(blocks["out"]); + if (sd_backend_is(ctx->backend, "ROCm")) { + out_proj->set_scale(1.f / 16.f); + } + auto qkv = qkv_proj->forward(ctx, x); // [N, n_token, (num_heads + num_kv_heads*2)*head_dim] qkv = ggml_reshape_4d(ctx->ggml_ctx, qkv, head_dim, num_heads + num_kv_heads * 2, qkv->ne[1], qkv->ne[2]); // [N, n_token, num_heads + num_kv_heads*2, head_dim] @@ -115,9 +115,7 @@ namespace ZImage { bool force_prec_f32 = false; float scale = 1.f / 128.f; -#ifdef SD_USE_VULKAN - force_prec_f32 = true; -#endif + // The purpose of the scale here is to prevent NaN issues in certain situations. // For example, when using CUDA but the weights are k-quants. 
blocks["w2"] = std::make_shared(hidden_dim, dim, false, false, force_prec_f32, scale); @@ -129,6 +127,10 @@ namespace ZImage { auto w2 = std::dynamic_pointer_cast(blocks["w2"]); auto w3 = std::dynamic_pointer_cast(blocks["w3"]); + if (sd_backend_is(ctx->backend, "Vulkan")) { + w2->set_force_prec_f32(true); + } + auto x1 = w1->forward(ctx, x); auto x3 = w3->forward(ctx, x); x = ggml_swiglu_split(ctx->ggml_ctx, x1, x3); diff --git a/thirdparty/CMakeLists.txt b/thirdparty/CMakeLists.txt index a17178507..51a5c2b42 100644 --- a/thirdparty/CMakeLists.txt +++ b/thirdparty/CMakeLists.txt @@ -15,6 +15,9 @@ if(SD_WEBP) set(WEBP_BUILD_WEBP_JS OFF) set(WEBP_BUILD_FUZZTEST OFF) set(WEBP_BUILD_LIBWEBPMUX ON) - add_subdirectory(libwebp EXCLUDE_FROM_ALL) endif() + +if (BUILD_SHARED_LIBS) + set_target_properties(${Z_TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) +endif() \ No newline at end of file