-
Notifications
You must be signed in to change notification settings - Fork 13.6k
Open
Labels
Description
Name and Version
Log:
slot launch_slot_: id 2 | task 1950 | processing task
slot update_slots: id 2 | task 1950 | new prompt, n_ctx_slot = 131072, n_keep = 0, task.n_tokens = 214
slot update_slots: id 2 | task 1950 | n_past = 75, slot.prompt.tokens.size() = 821, seq_id = 2, pos_min = -1
libggml-base.so(+0x183cb)[0x72edb99cd3cb]
libggml-base.so(ggml_print_backtrace+0x21f)[0x72edb99cd82f]
libggml-base.so(ggml_abort+0x152)[0x72edb99cda02]
/app/llama-server(+0xfb930)[0x59655a083930]
/app/llama-server(+0x98c78)[0x59655a020c78]
/app/llama-server(+0x57c1d)[0x596559fdfc1d]
/usr/lib/x86_64-linux-gnu/libc.so.6(+0x29d90)[0x72edb9480d90]
/usr/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0x80)[0x72edb9480e40]
/app/llama-server(+0x596d5)[0x596559fe16d5]
Container 'inspect'
{
"Id": "499c942db482d3c59a4a93de27906b344322e90ddbe8bfb3f5e5daca15fdee10",
"Created": "2025-11-06T17:29:42.560540946Z",
"Path": "/app/llama-server",
"Args": [
"--host",
"0.0.0.0",
"-ngl",
"99",
"-m",
"/models/gpt-oss-20b-mxfp4.gguf",
"-c",
"0",
"-fa",
"on",
"--jinja",
"--reasoning-format",
"none"
],
"State": {
"Status": "exited",
"Running": false,
"Paused": false,
"Restarting": false,
"OOMKilled": false,
"Dead": false,
"Pid": 0,
"ExitCode": 139,
"Error": "",
"StartedAt": "2025-11-06T17:29:42.654145845Z",
"FinishedAt": "2025-11-06T17:33:54.209421876Z",
"Health": {
"Status": "unhealthy",
"FailingStreak": 0,
"Log": [
{
"Start": "2025-11-06T17:31:40.494739486Z",
"End": "2025-11-06T17:31:40.576549417Z",
"ExitCode": 0,
"Output": " % Total % Received % Xferd Average Speed Time Time Time Current\n Dload Upload Total Spent Left Speed\n\r 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0\r100 15 100 15 0 0 12711 0 --:--:-- --:--:-- --:--:-- 15000\n{\"status\":\"ok\"}"
},
{
"Start": "2025-11-06T17:32:09.582762691Z",
"End": "2025-11-06T17:32:09.67254232Z",
"ExitCode": 0,
"Output": " % Total % Received % Xferd Average Speed Time Time Time Current\n Dload Upload Total Spent Left Speed\n\r 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0\r100 15 100 15 0 0 18564 0 --:--:-- --:--:-- --:--:-- 15000\n{\"status\":\"ok\"}"
},
{
"Start": "2025-11-06T17:32:38.683854524Z",
"End": "2025-11-06T17:32:38.787315065Z",
"ExitCode": 0,
"Output": " % Total % Received % Xferd Average Speed Time Time Time Current\n Dload Upload Total Spent Left Speed\n\r 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0\r100 15 100 15 0 0 15889 0 --:--:-- --:--:-- --:--:-- 15000\n{\"status\":\"ok\"}"
},
{
"Start": "2025-11-06T17:33:07.75805487Z",
"End": "2025-11-06T17:33:07.836319282Z",
"ExitCode": 0,
"Output": " % Total % Received % Xferd Average Speed Time Time Time Current\n Dload Upload Total Spent Left Speed\n\r 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0\r100 15 100 15 0 0 19556 0 --:--:-- --:--:-- --:--:-- 15000\n{\"status\":\"ok\"}"
},
{
"Start": "2025-11-06T17:33:36.848880427Z",
"End": "2025-11-06T17:33:36.947534877Z",
"ExitCode": 0,
"Output": " % Total % Received % Xferd Average Speed Time Time Time Current\n Dload Upload Total Spent Left Speed\n\r 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0\r100 15 100 15 0 0 19531 0 --:--:-- --:--:-- --:--:-- 15000\n{\"status\":\"ok\"}"
}
]
}
},
"Image": "sha256:15101512cf511b41b4822715c0c953826bdd8a379263958b93f959d5272f9d88",
"ResolvConfPath": "/var/lib/docker/containers/499c942db482d3c59a4a93de27906b344322e90ddbe8bfb3f5e5daca15fdee10/resolv.conf",
"HostnamePath": "/var/lib/docker/containers/499c942db482d3c59a4a93de27906b344322e90ddbe8bfb3f5e5daca15fdee10/hostname",
"HostsPath": "/var/lib/docker/containers/499c942db482d3c59a4a93de27906b344322e90ddbe8bfb3f5e5daca15fdee10/hosts",
"LogPath": "/var/lib/docker/containers/499c942db482d3c59a4a93de27906b344322e90ddbe8bfb3f5e5daca15fdee10/499c942db482d3c59a4a93de27906b344322e90ddbe8bfb3f5e5daca15fdee10-json.log",
"Name": "/gptoss20",
"RestartCount": 0,
"Driver": "overlayfs",
"Platform": "linux",
"MountLabel": "",
"ProcessLabel": "",
"AppArmorProfile": "",
"ExecIDs": null,
"HostConfig": {
"Binds": [
"/e/s/models:/models"
],
"ContainerIDFile": "",
"LogConfig": {
"Type": "json-file",
"Config": {}
},
"NetworkMode": "host",
"PortBindings": {},
"RestartPolicy": {
"Name": "no",
"MaximumRetryCount": 0
},
"AutoRemove": false,
"VolumeDriver": "",
"VolumesFrom": null,
"ConsoleSize": [
30,
120
],
"CapAdd": null,
"CapDrop": null,
"CgroupnsMode": "private",
"Dns": [],
"DnsOptions": [],
"DnsSearch": [],
"ExtraHosts": null,
"GroupAdd": null,
"IpcMode": "private",
"Cgroup": "",
"Links": null,
"OomScoreAdj": 0,
"PidMode": "",
"Privileged": false,
"PublishAllPorts": false,
"ReadonlyRootfs": false,
"SecurityOpt": null,
"UTSMode": "",
"UsernsMode": "",
"ShmSize": 67108864,
"Runtime": "runc",
"Isolation": "",
"CpuShares": 0,
"Memory": 0,
"NanoCpus": 0,
"CgroupParent": "",
"BlkioWeight": 0,
"BlkioWeightDevice": [],
"BlkioDeviceReadBps": [],
"BlkioDeviceWriteBps": [],
"BlkioDeviceReadIOps": [],
"BlkioDeviceWriteIOps": [],
"CpuPeriod": 0,
"CpuQuota": 0,
"CpuRealtimePeriod": 0,
"CpuRealtimeRuntime": 0,
"CpusetCpus": "",
"CpusetMems": "",
"Devices": [],
"DeviceCgroupRules": null,
"DeviceRequests": [
{
"Driver": "",
"Count": -1,
"DeviceIDs": null,
"Capabilities": [
[
"gpu"
]
],
"Options": {}
}
],
"MemoryReservation": 0,
"MemorySwap": 0,
"MemorySwappiness": null,
"OomKillDisable": null,
"PidsLimit": null,
"Ulimits": [],
"CpuCount": 0,
"CpuPercent": 0,
"IOMaximumIOps": 0,
"IOMaximumBandwidth": 0,
"MaskedPaths": [
"/proc/asound",
"/proc/acpi",
"/proc/interrupts",
"/proc/kcore",
"/proc/keys",
"/proc/latency_stats",
"/proc/timer_list",
"/proc/timer_stats",
"/proc/sched_debug",
"/proc/scsi",
"/sys/firmware",
"/sys/devices/virtual/powercap"
],
"ReadonlyPaths": [
"/proc/bus",
"/proc/fs",
"/proc/irq",
"/proc/sys",
"/proc/sysrq-trigger"
]
},
"GraphDriver": {
"Data": null,
"Name": "overlayfs"
},
"Mounts": [
{
"Type": "bind",
"Source": "/e/s/models",
"Destination": "/models",
"Mode": "",
"RW": true,
"Propagation": "rprivate"
}
],
"Config": {
"Hostname": "docker-desktop",
"Domainname": "",
"User": "",
"AttachStdin": false,
"AttachStdout": false,
"AttachStderr": false,
"Tty": false,
"OpenStdin": false,
"StdinOnce": false,
"Env": [
"PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
"NVARCH=x86_64",
"NVIDIA_REQUIRE_CUDA=cuda>=12.4 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 brand=tesla,driver>=525,driver<526 brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 brand=tesla,driver>=535,driver<536 brand=unknown,driver>=535,driver<536 brand=nvidia,driver>=535,driver<536 brand=nvidiartx,driver>=535,driver<536 brand=geforce,driver>=535,driver<536 brand=geforcertx,driver>=535,driver<536 brand=quadro,driver>=535,driver<536 brand=quadrortx,driver>=535,driver<536 brand=titan,driver>=535,driver<536 brand=titanrtx,driver>=535,driver<536",
"NV_CUDA_CUDART_VERSION=12.4.99-1",
"NV_CUDA_COMPAT_PACKAGE=cuda-compat-12-4",
"CUDA_VERSION=12.4.0",
"LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64",
"NVIDIA_VISIBLE_DEVICES=all",
"NVIDIA_DRIVER_CAPABILITIES=compute,utility",
"NV_CUDA_LIB_VERSION=12.4.0-1",
"NV_NVTX_VERSION=12.4.99-1",
"NV_LIBNPP_VERSION=12.2.5.2-1",
"NV_LIBNPP_PACKAGE=libnpp-12-4=12.2.5.2-1",
"NV_LIBCUSPARSE_VERSION=12.3.0.142-1",
"NV_LIBCUBLAS_PACKAGE_NAME=libcublas-12-4",
"NV_LIBCUBLAS_VERSION=12.4.2.65-1",
"NV_LIBCUBLAS_PACKAGE=libcublas-12-4=12.4.2.65-1",
"NV_LIBNCCL_PACKAGE_NAME=libnccl2",
"NV_LIBNCCL_PACKAGE_VERSION=2.20.5-1",
"NCCL_VERSION=2.20.5-1",
"NV_LIBNCCL_PACKAGE=libnccl2=2.20.5-1+cuda12.4",
"NVIDIA_PRODUCT_NAME=CUDA",
"LLAMA_ARG_HOST=0.0.0.0"
],
"Cmd": [
"--host",
"0.0.0.0",
"-ngl",
"99",
"-m",
"/models/gpt-oss-20b-mxfp4.gguf",
"-c",
"0",
"-fa",
"on",
"--jinja",
"--reasoning-format",
"none"
],
"Healthcheck": {
"Test": [
"CMD",
"curl",
"-f",
"http://localhost:8080/health"
]
},
"Image": "ghcr.io/ggml-org/llama.cpp:server-cuda",
"Volumes": null,
"WorkingDir": "/app",
"Entrypoint": [
"/app/llama-server"
],
"OnBuild": null,
"Labels": {
"maintainer": "NVIDIA CORPORATION <[email protected]>",
"org.opencontainers.image.ref.name": "ubuntu",
"org.opencontainers.image.version": "22.04"
}
},
"NetworkSettings": {
"Bridge": "",
"SandboxID": "",
"SandboxKey": "",
"Ports": {},
"HairpinMode": false,
"LinkLocalIPv6Address": "",
"LinkLocalIPv6PrefixLen": 0,
"SecondaryIPAddresses": null,
"SecondaryIPv6Addresses": null,
"EndpointID": "",
"Gateway": "",
"GlobalIPv6Address": "",
"GlobalIPv6PrefixLen": 0,
"IPAddress": "",
"IPPrefixLen": 0,
"IPv6Gateway": "",
"MacAddress": "",
"Networks": {
"host": {
"IPAMConfig": null,
"Links": null,
"Aliases": null,
"MacAddress": "",
"DriverOpts": null,
"GwPriority": 0,
"NetworkID": "e2e36d82c94fabae031a8728e4513ce436f4ecd8b78e7a8a0e68572c7ce41076",
"EndpointID": "",
"Gateway": "",
"IPAddress": "",
"IPPrefixLen": 0,
"IPv6Gateway": "",
"GlobalIPv6Address": "",
"GlobalIPv6PrefixLen": 0,
"DNSNames": null
}
}
},
"ImageManifestDescriptor": {
"mediaType": "application/vnd.docker.distribution.manifest.v2+json",
"digest": "sha256:15101512cf511b41b4822715c0c953826bdd8a379263958b93f959d5272f9d88",
"size": 2867,
"platform": {
"architecture": "amd64",
"os": "linux"
}
}
}
Operating systems
Linux
Which llama.cpp modules do you know to be affected?
llama-server
Command line
docker run -d --name=gptoss20 `
--network=host `
-v /e/s/models:/models `
--gpus all `
ghcr.io/ggml-org/llama.cpp:server-cuda `
--host 0.0.0.0 -ngl 99 `
-m /models/gpt-oss-20b-mxfp4.gguf `
-c 0 -fa on --jinja --reasoning-format none
Problem description & steps to reproduce
Run the prompt optimizer from this repo: http://github.com/fwaris/FsGepa
Note this ran fine with a 20-day old version of llama-server.
The optimizer maintains a steady flow of requests - 5 concurrent requests at play at any given point in time.
First Bad Commit
No response