llama.cpp-installer/run_llama_cpp_server.ps1 at main · Danmoreng/llama.cpp-installer · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
<#  run_llama_cpp_server.ps1  PowerShell 7
    ----------------------------------------------------------
    • Stores GGUF under .\models\ next to this script
    • Downloads one default model
    • Starts llama-server in router mode
    • Lets llama.cpp auto-fit GPU layers / tensor split / ctx
#>

param()

# Directory that contains this script; all other paths are resolved relative to it.
$ScriptRoot = Split-Path -Parent $MyInvocation.MyCommand.Definition

# llama-server binary expected under the vendored llama.cpp build tree.
$ServerExe  = Join-Path $ScriptRoot 'vendor\llama.cpp\build\bin\llama-server.exe'

# -LiteralPath: do not treat '[' / ']' in the install path as wildcard characters.
# -PathType Leaf: a directory that happens to have this name is not a usable executable.
if (-not (Test-Path -LiteralPath $ServerExe -PathType Leaf)) {
    throw "llama-server.exe not found at '$ServerExe' – check the path."
}

# Folder where GGUF files are stored (next to this script).
$ModelDir = Join-Path $ScriptRoot 'models'

# Default model to serve
$ModelUrls = @(
    'https://huggingface.co/ggml-org/gemma-4-E4B-it-GGUF/resolve/main/gemma-4-e4b-it-Q4_K_M.gguf'
)

function Download-IfNeeded {
    <#
    .SYNOPSIS
        Downloads a file to a local path unless that path already exists.
    .DESCRIPTION
        The transfer is written to "<Destination>.part" and only moved to
        <Destination> after it finished successfully. A failed or interrupted
        download therefore never leaves a truncated file that a later run
        would mistake for a cached copy.
        Uses curl.exe when it is available, otherwise Invoke-WebRequest.
    .PARAMETER Url
        Source URL to download from.
    .PARAMETER Destination
        Target file path (alias: Dest). Its parent directory is created if needed.
    .NOTES
        Throws when curl exits non-zero, when Invoke-WebRequest fails, or when
        no file was produced.
    #>
    param(
        [string]$Url,
        [Alias('Dest')][string]$Destination
    )
    if (Test-Path -LiteralPath $Destination) {
        Write-Host "[OK] Cached → $Destination"
        return
    }

    # A bare file name has no parent component; skip directory creation then.
    $parentDir = Split-Path -Parent $Destination
    if ($parentDir) {
        New-Item -ItemType Directory -Path $parentDir -Force | Out-Null
    }
    Write-Host "→ downloading: $Url"

    $partialFile = "$Destination.part"
    try {
        $curl = Get-Command curl.exe -ErrorAction SilentlyContinue
        if ($null -ne $curl) {
            & $curl.Source -L --fail --retry 5 --retry-delay 5 --output $partialFile $Url
            if ($LASTEXITCODE -ne 0) {
                throw "Download failed from '$Url' (curl exit code $LASTEXITCODE)."
            }
        } else {
            Invoke-WebRequest -Uri $Url -OutFile $partialFile -ErrorAction Stop
        }

        if (-not (Test-Path -LiteralPath $partialFile)) {
            throw "Download failed. File was not created at '$Destination'."
        }

        # Publish the file under its final name only once it is complete.
        Move-Item -LiteralPath $partialFile -Destination $Destination -Force
    } finally {
        # Runs on failure and on Ctrl+C as well: never leave a partial file behind.
        if (Test-Path -LiteralPath $partialFile) {
            Remove-Item -LiteralPath $partialFile -Force -ErrorAction SilentlyContinue
        }
    }

    Write-Host "[OK] Download complete."
}

# Download the configured model(s) into .\models
foreach ($url in $ModelUrls) {
    # Take the file name from the URL's path component only, so a query string
    # (e.g. "?download=true") or percent-escapes do not end up in the local name.
    $leaf = [System.IO.Path]::GetFileName([uri]::UnescapeDataString(([uri]$url).AbsolutePath))
    if ([string]::IsNullOrWhiteSpace($leaf)) {
        throw "Cannot derive a file name from model URL '$url'."
    }
    $file = Join-Path $ModelDir $leaf
    Download-IfNeeded -Url $url -Destination $file
}

# Row-major speedup
$Env:LLAMA_SET_ROWS = '1'

# Use physical cores for threads.
# Win32_Processor yields one instance per CPU socket, so sum NumberOfCores across
# all instances instead of passing an array on multi-socket machines.
$physicalCores = (Get-CimInstance Win32_Processor -ErrorAction SilentlyContinue |
    Measure-Object -Property NumberOfCores -Sum).Sum

# Fallback: half the logical processors, floored to an integer and never below 1.
$threads = if ($physicalCores -ge 1) {
    [int]$physicalCores
} else {
    [Math]::Max(1, [int][Math]::Floor([Environment]::ProcessorCount / 2))
}

# === llama-server router mode with auto-fit =====================
# Address used for the status message and the browser launch below.
$ServerUrl = 'http://localhost:8080'

# NOTE: deliberately not named $Args — that is a PowerShell automatic variable.
$ServerArgs = @(
    '--jinja',
    '--flash-attn', 'on',
    '--no-mmap',
    '--threads', "$threads",

    # Optimized batching for consumer GPUs
    '-b', '1024',
    '-ub', '512',

    # KV-cache quantization (VRAM efficiency for large contexts)
    '-ctk', 'q8_0',
    '-ctv', 'q8_0',

    # Router mode: do NOT pass --model
    # Start-Process joins the argument array with spaces and adds no quoting,
    # so the path is quoted here to survive directories containing spaces.
    '--models-dir',   ('"{0}"' -f $ModelDir),   # discover GGUFs from .\models

    # Automatic parameter fitting (optimizes layer offloading)
    '--fit',          'on',
    '--fit-target',   '256',       # MiB of free VRAM to leave per GPU
    '--fit-ctx',      '32768'      # minimum context size auto-fit is allowed to shrink to
)

Write-Host "→ Starting llama-server (router mode) on $ServerUrl ..."
$serverProcess = Start-Process -FilePath $ServerExe -ArgumentList $ServerArgs -NoNewWindow -PassThru

# Give the server a moment to come up, then open the web UI — but only if the
# process is still alive, so a startup failure is reported instead of hidden.
Start-Sleep -Seconds 5
if ($serverProcess.HasExited) {
    throw "llama-server exited during startup – see its output above."
}
Start-Process $ServerUrl