-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathrun_llama_cpp_server.ps1
More file actions
96 lines (76 loc) · 2.96 KB
/
Copy pathrun_llama_cpp_server.ps1
File metadata and controls
96 lines (76 loc) · 2.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
<# run_llama_cpp_server.ps1 PowerShell 7
----------------------------------------------------------
• Stores GGUF under .\models\ next to this script
• Downloads one default model
• Starts llama-server in router mode
• Lets llama.cpp auto-fit GPU layers / tensor split / ctx
#>
param()

# Directory that contains this script; every path below is resolved relative to it.
# $PSScriptRoot is the automatic variable PowerShell provides for exactly this purpose.
$ScriptRoot = $PSScriptRoot

# Location of the llama-server binary inside the vendored llama.cpp build tree.
$ServerExe = Join-Path $ScriptRoot 'vendor\llama.cpp\build\bin\llama-server.exe'

# -LiteralPath: do not interpret '[' / ']' / '*' in the checkout path as wildcards.
# -PathType Leaf: a directory that happens to carry this name must not pass the check.
if (-not (Test-Path -LiteralPath $ServerExe -PathType Leaf)) {
    throw "llama-server.exe not found at '$ServerExe' – check the path."
}

# GGUF model files are stored in .\models next to this script.
$ModelDir = Join-Path $ScriptRoot 'models'

# Default model to serve
$ModelUrls = @(
    'https://huggingface.co/ggml-org/gemma-4-E4B-it-GGUF/resolve/main/gemma-4-e4b-it-Q4_K_M.gguf'
)
function Download-IfNeeded {
    <#
    .SYNOPSIS
    Downloads a file to a destination path unless it is already present.

    .DESCRIPTION
    If a file already exists at -Destination the function reports it as cached
    and returns. Otherwise the parent directory is created and the URL is
    fetched with curl.exe when available, falling back to Invoke-WebRequest.
    The payload is written to "<Destination>.part" and only moved to
    -Destination after the transfer succeeded, so an interrupted download is
    never mistaken for a cached file on the next run.

    .PARAMETER Url
    Source URL to download.

    .PARAMETER Destination
    Target file path (alias: -Dest).

    .NOTES
    Throws when the transfer fails or no file was produced. The partial file is
    removed before the error is re-thrown.
    #>
    param(
        [string]$Url,
        [Alias('Dest')][string]$Destination
    )

    # -LiteralPath keeps characters such as '[' and ']' from being treated as wildcards.
    if (Test-Path -LiteralPath $Destination -PathType Leaf) {
        Write-Host "[OK] Cached → $Destination"
        return
    }

    # A bare file name has no parent component; New-Item would fail on an empty path.
    $parentDir = Split-Path -Parent $Destination
    if ($parentDir) {
        New-Item -ItemType Directory -Path $parentDir -Force | Out-Null
    }

    # Download into a temporary sibling file and promote it only on success.
    $partialFile = "$Destination.part"

    Write-Host "→ downloading: $Url"
    try {
        $curl = Get-Command curl.exe -ErrorAction SilentlyContinue
        if ($null -ne $curl) {
            & $curl.Source -L --fail --retry 5 --retry-delay 5 --output $partialFile $Url
            if ($LASTEXITCODE -ne 0) {
                throw "Download failed from '$Url' (curl exit code $LASTEXITCODE)."
            }
        } else {
            Invoke-WebRequest -Uri $Url -OutFile $partialFile -ErrorAction Stop
        }

        if (-not (Test-Path -LiteralPath $partialFile -PathType Leaf)) {
            throw "Download failed. File was not created at '$Destination'."
        }

        Move-Item -LiteralPath $partialFile -Destination $Destination -Force
    } catch {
        # Never leave a truncated file behind; it would otherwise look like a cache hit.
        Remove-Item -LiteralPath $partialFile -Force -ErrorAction SilentlyContinue
        throw
    }

    Write-Host "[OK] Download complete."
}
# Download the configured model into .\models
foreach ($url in $ModelUrls) {
    # Derive the file name from the URL path only, so a query string
    # (e.g. '?download=true') never ends up in the local file name.
    $urlPath = [uri]::UnescapeDataString(([uri]$url).AbsolutePath)
    $file = Join-Path $ModelDir (Split-Path $urlPath -Leaf)
    Download-IfNeeded -Url $url -Destination $file
}

# Row-major speedup
$Env:LLAMA_SET_ROWS = '1'

# Use physical cores for threads.
# Win32_Processor returns one instance per socket, so sum NumberOfCores instead of
# reading the property directly (which yields an array on multi-socket machines).
$physicalCores = [int](
    Get-CimInstance -ClassName Win32_Processor -ErrorAction SilentlyContinue |
        Measure-Object -Property NumberOfCores -Sum
).Sum

# Fallback: half the logical processors, floored to an integer and never below 1.
$threads = if ($physicalCores -gt 0) {
    $physicalCores
} else {
    [Math]::Max(1, [int][Math]::Floor([Environment]::ProcessorCount / 2))
}

# === llama-server router mode with auto-fit =====================
# Named $ServerArgs rather than $Args: $args is a PowerShell automatic variable
# and should not be assigned to.
$ServerArgs = @(
    '--jinja',
    '--flash-attn', 'on',
    '--no-mmap',
    '--threads', "$threads",
    # Optimized batching for consumer GPUs
    '-b', '1024',
    '-ub', '512',
    # KV-cache quantization (VRAM efficiency for large contexts)
    '-ctk', 'q8_0',
    '-ctv', 'q8_0',
    # Router mode: do NOT pass --model
    # Start-Process joins the argument list with spaces and does no quoting itself,
    # so the directory is wrapped in quotes to survive paths that contain spaces.
    '--models-dir', ('"{0}"' -f $ModelDir),   # discover GGUFs from .\models
    # Automatic parameter fitting (optimizes layer offloading)
    '--fit', 'on',
    '--fit-target', '256',   # MiB of free VRAM to leave per GPU
    '--fit-ctx', '32768'     # minimum context size auto-fit is allowed to shrink to
)

Write-Host "→ Starting llama-server (router mode) on http://localhost:8080 ..."
$serverProcess = Start-Process -FilePath $ServerExe -ArgumentList $ServerArgs -NoNewWindow -PassThru

# Give the server a moment to come up before opening the web UI.
Start-Sleep -Seconds 5

# Do not open a browser tab against a server that already died during startup.
if ($serverProcess.HasExited) {
    throw "llama-server exited during startup - see the output above for details."
}

Start-Process 'http://localhost:8080'