
Misc. bug: docker for llama server crashing with gpt-oss-20b #17060

@fwaris

Description


Name and Version

Log:

slot launch_slot_: id  2 | task 1950 | processing task
slot update_slots: id  2 | task 1950 | new prompt, n_ctx_slot = 131072, n_keep = 0, task.n_tokens = 214
slot update_slots: id  2 | task 1950 | n_past = 75, slot.prompt.tokens.size() = 821, seq_id = 2, pos_min = -1
libggml-base.so(+0x183cb)[0x72edb99cd3cb]
libggml-base.so(ggml_print_backtrace+0x21f)[0x72edb99cd82f]
libggml-base.so(ggml_abort+0x152)[0x72edb99cda02]
/app/llama-server(+0xfb930)[0x59655a083930]
/app/llama-server(+0x98c78)[0x59655a020c78]
/app/llama-server(+0x57c1d)[0x596559fdfc1d]
/usr/lib/x86_64-linux-gnu/libc.so.6(+0x29d90)[0x72edb9480d90]
/usr/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0x80)[0x72edb9480e40]
/app/llama-server(+0x596d5)[0x596559fe16d5]
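
The pos_min = -1 alongside n_past = 75 in the last slot line suggests the KV cache reported no cells for this sequence even though the slot expected to reuse 75 cached tokens, which appears to be the inconsistency that trips the ggml_abort visible in the backtrace.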

Output of docker inspect for the container:

{
	"Id": "499c942db482d3c59a4a93de27906b344322e90ddbe8bfb3f5e5daca15fdee10",
	"Created": "2025-11-06T17:29:42.560540946Z",
	"Path": "/app/llama-server",
	"Args": [
		"--host",
		"0.0.0.0",
		"-ngl",
		"99",
		"-m",
		"/models/gpt-oss-20b-mxfp4.gguf",
		"-c",
		"0",
		"-fa",
		"on",
		"--jinja",
		"--reasoning-format",
		"none"
	],
	"State": {
		"Status": "exited",
		"Running": false,
		"Paused": false,
		"Restarting": false,
		"OOMKilled": false,
		"Dead": false,
		"Pid": 0,
		"ExitCode": 139,
		"Error": "",
		"StartedAt": "2025-11-06T17:29:42.654145845Z",
		"FinishedAt": "2025-11-06T17:33:54.209421876Z",
		"Health": {
			"Status": "unhealthy",
			"FailingStreak": 0,
			"Log": [
				{
					"Start": "2025-11-06T17:31:40.494739486Z",
					"End": "2025-11-06T17:31:40.576549417Z",
					"ExitCode": 0,
					"Output": "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n                                 Dload  Upload   Total   Spent    Left  Speed\n\r  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0\r100    15  100    15    0     0  12711      0 --:--:-- --:--:-- --:--:-- 15000\n{\"status\":\"ok\"}"
				},
				{
					"Start": "2025-11-06T17:32:09.582762691Z",
					"End": "2025-11-06T17:32:09.67254232Z",
					"ExitCode": 0,
					"Output": "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n                                 Dload  Upload   Total   Spent    Left  Speed\n\r  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0\r100    15  100    15    0     0  18564      0 --:--:-- --:--:-- --:--:-- 15000\n{\"status\":\"ok\"}"
				},
				{
					"Start": "2025-11-06T17:32:38.683854524Z",
					"End": "2025-11-06T17:32:38.787315065Z",
					"ExitCode": 0,
					"Output": "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n                                 Dload  Upload   Total   Spent    Left  Speed\n\r  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0\r100    15  100    15    0     0  15889      0 --:--:-- --:--:-- --:--:-- 15000\n{\"status\":\"ok\"}"
				},
				{
					"Start": "2025-11-06T17:33:07.75805487Z",
					"End": "2025-11-06T17:33:07.836319282Z",
					"ExitCode": 0,
					"Output": "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n                                 Dload  Upload   Total   Spent    Left  Speed\n\r  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0\r100    15  100    15    0     0  19556      0 --:--:-- --:--:-- --:--:-- 15000\n{\"status\":\"ok\"}"
				},
				{
					"Start": "2025-11-06T17:33:36.848880427Z",
					"End": "2025-11-06T17:33:36.947534877Z",
					"ExitCode": 0,
					"Output": "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n                                 Dload  Upload   Total   Spent    Left  Speed\n\r  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0\r100    15  100    15    0     0  19531      0 --:--:-- --:--:-- --:--:-- 15000\n{\"status\":\"ok\"}"
				}
			]
		}
	},
	"Image": "sha256:15101512cf511b41b4822715c0c953826bdd8a379263958b93f959d5272f9d88",
	"ResolvConfPath": "/var/lib/docker/containers/499c942db482d3c59a4a93de27906b344322e90ddbe8bfb3f5e5daca15fdee10/resolv.conf",
	"HostnamePath": "/var/lib/docker/containers/499c942db482d3c59a4a93de27906b344322e90ddbe8bfb3f5e5daca15fdee10/hostname",
	"HostsPath": "/var/lib/docker/containers/499c942db482d3c59a4a93de27906b344322e90ddbe8bfb3f5e5daca15fdee10/hosts",
	"LogPath": "/var/lib/docker/containers/499c942db482d3c59a4a93de27906b344322e90ddbe8bfb3f5e5daca15fdee10/499c942db482d3c59a4a93de27906b344322e90ddbe8bfb3f5e5daca15fdee10-json.log",
	"Name": "/gptoss20",
	"RestartCount": 0,
	"Driver": "overlayfs",
	"Platform": "linux",
	"MountLabel": "",
	"ProcessLabel": "",
	"AppArmorProfile": "",
	"ExecIDs": null,
	"HostConfig": {
		"Binds": [
			"/e/s/models:/models"
		],
		"ContainerIDFile": "",
		"LogConfig": {
			"Type": "json-file",
			"Config": {}
		},
		"NetworkMode": "host",
		"PortBindings": {},
		"RestartPolicy": {
			"Name": "no",
			"MaximumRetryCount": 0
		},
		"AutoRemove": false,
		"VolumeDriver": "",
		"VolumesFrom": null,
		"ConsoleSize": [
			30,
			120
		],
		"CapAdd": null,
		"CapDrop": null,
		"CgroupnsMode": "private",
		"Dns": [],
		"DnsOptions": [],
		"DnsSearch": [],
		"ExtraHosts": null,
		"GroupAdd": null,
		"IpcMode": "private",
		"Cgroup": "",
		"Links": null,
		"OomScoreAdj": 0,
		"PidMode": "",
		"Privileged": false,
		"PublishAllPorts": false,
		"ReadonlyRootfs": false,
		"SecurityOpt": null,
		"UTSMode": "",
		"UsernsMode": "",
		"ShmSize": 67108864,
		"Runtime": "runc",
		"Isolation": "",
		"CpuShares": 0,
		"Memory": 0,
		"NanoCpus": 0,
		"CgroupParent": "",
		"BlkioWeight": 0,
		"BlkioWeightDevice": [],
		"BlkioDeviceReadBps": [],
		"BlkioDeviceWriteBps": [],
		"BlkioDeviceReadIOps": [],
		"BlkioDeviceWriteIOps": [],
		"CpuPeriod": 0,
		"CpuQuota": 0,
		"CpuRealtimePeriod": 0,
		"CpuRealtimeRuntime": 0,
		"CpusetCpus": "",
		"CpusetMems": "",
		"Devices": [],
		"DeviceCgroupRules": null,
		"DeviceRequests": [
			{
				"Driver": "",
				"Count": -1,
				"DeviceIDs": null,
				"Capabilities": [
					[
						"gpu"
					]
				],
				"Options": {}
			}
		],
		"MemoryReservation": 0,
		"MemorySwap": 0,
		"MemorySwappiness": null,
		"OomKillDisable": null,
		"PidsLimit": null,
		"Ulimits": [],
		"CpuCount": 0,
		"CpuPercent": 0,
		"IOMaximumIOps": 0,
		"IOMaximumBandwidth": 0,
		"MaskedPaths": [
			"/proc/asound",
			"/proc/acpi",
			"/proc/interrupts",
			"/proc/kcore",
			"/proc/keys",
			"/proc/latency_stats",
			"/proc/timer_list",
			"/proc/timer_stats",
			"/proc/sched_debug",
			"/proc/scsi",
			"/sys/firmware",
			"/sys/devices/virtual/powercap"
		],
		"ReadonlyPaths": [
			"/proc/bus",
			"/proc/fs",
			"/proc/irq",
			"/proc/sys",
			"/proc/sysrq-trigger"
		]
	},
	"GraphDriver": {
		"Data": null,
		"Name": "overlayfs"
	},
	"Mounts": [
		{
			"Type": "bind",
			"Source": "/e/s/models",
			"Destination": "/models",
			"Mode": "",
			"RW": true,
			"Propagation": "rprivate"
		}
	],
	"Config": {
		"Hostname": "docker-desktop",
		"Domainname": "",
		"User": "",
		"AttachStdin": false,
		"AttachStdout": false,
		"AttachStderr": false,
		"Tty": false,
		"OpenStdin": false,
		"StdinOnce": false,
		"Env": [
			"PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
			"NVARCH=x86_64",
			"NVIDIA_REQUIRE_CUDA=cuda>=12.4 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 brand=tesla,driver>=525,driver<526 brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 brand=tesla,driver>=535,driver<536 brand=unknown,driver>=535,driver<536 brand=nvidia,driver>=535,driver<536 brand=nvidiartx,driver>=535,driver<536 brand=geforce,driver>=535,driver<536 brand=geforcertx,driver>=535,driver<536 brand=quadro,driver>=535,driver<536 brand=quadrortx,driver>=535,driver<536 brand=titan,driver>=535,driver<536 brand=titanrtx,driver>=535,driver<536",
			"NV_CUDA_CUDART_VERSION=12.4.99-1",
			"NV_CUDA_COMPAT_PACKAGE=cuda-compat-12-4",
			"CUDA_VERSION=12.4.0",
			"LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64",
			"NVIDIA_VISIBLE_DEVICES=all",
			"NVIDIA_DRIVER_CAPABILITIES=compute,utility",
			"NV_CUDA_LIB_VERSION=12.4.0-1",
			"NV_NVTX_VERSION=12.4.99-1",
			"NV_LIBNPP_VERSION=12.2.5.2-1",
			"NV_LIBNPP_PACKAGE=libnpp-12-4=12.2.5.2-1",
			"NV_LIBCUSPARSE_VERSION=12.3.0.142-1",
			"NV_LIBCUBLAS_PACKAGE_NAME=libcublas-12-4",
			"NV_LIBCUBLAS_VERSION=12.4.2.65-1",
			"NV_LIBCUBLAS_PACKAGE=libcublas-12-4=12.4.2.65-1",
			"NV_LIBNCCL_PACKAGE_NAME=libnccl2",
			"NV_LIBNCCL_PACKAGE_VERSION=2.20.5-1",
			"NCCL_VERSION=2.20.5-1",
			"NV_LIBNCCL_PACKAGE=libnccl2=2.20.5-1+cuda12.4",
			"NVIDIA_PRODUCT_NAME=CUDA",
			"LLAMA_ARG_HOST=0.0.0.0"
		],
		"Cmd": [
			"--host",
			"0.0.0.0",
			"-ngl",
			"99",
			"-m",
			"/models/gpt-oss-20b-mxfp4.gguf",
			"-c",
			"0",
			"-fa",
			"on",
			"--jinja",
			"--reasoning-format",
			"none"
		],
		"Healthcheck": {
			"Test": [
				"CMD",
				"curl",
				"-f",
				"http://localhost:8080/health"
			]
		},
		"Image": "ghcr.io/ggml-org/llama.cpp:server-cuda",
		"Volumes": null,
		"WorkingDir": "/app",
		"Entrypoint": [
			"/app/llama-server"
		],
		"OnBuild": null,
		"Labels": {
			"maintainer": "NVIDIA CORPORATION <[email protected]>",
			"org.opencontainers.image.ref.name": "ubuntu",
			"org.opencontainers.image.version": "22.04"
		}
	},
	"NetworkSettings": {
		"Bridge": "",
		"SandboxID": "",
		"SandboxKey": "",
		"Ports": {},
		"HairpinMode": false,
		"LinkLocalIPv6Address": "",
		"LinkLocalIPv6PrefixLen": 0,
		"SecondaryIPAddresses": null,
		"SecondaryIPv6Addresses": null,
		"EndpointID": "",
		"Gateway": "",
		"GlobalIPv6Address": "",
		"GlobalIPv6PrefixLen": 0,
		"IPAddress": "",
		"IPPrefixLen": 0,
		"IPv6Gateway": "",
		"MacAddress": "",
		"Networks": {
			"host": {
				"IPAMConfig": null,
				"Links": null,
				"Aliases": null,
				"MacAddress": "",
				"DriverOpts": null,
				"GwPriority": 0,
				"NetworkID": "e2e36d82c94fabae031a8728e4513ce436f4ecd8b78e7a8a0e68572c7ce41076",
				"EndpointID": "",
				"Gateway": "",
				"IPAddress": "",
				"IPPrefixLen": 0,
				"IPv6Gateway": "",
				"GlobalIPv6Address": "",
				"GlobalIPv6PrefixLen": 0,
				"DNSNames": null
			}
		}
	},
	"ImageManifestDescriptor": {
		"mediaType": "application/vnd.docker.distribution.manifest.v2+json",
		"digest": "sha256:15101512cf511b41b4822715c0c953826bdd8a379263958b93f959d5272f9d88",
		"size": 2867,
		"platform": {
			"architecture": "amd64",
			"os": "linux"
		}
	}
}
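
Points of note in the inspect output: the container exited with code 139 (128 + 11, i.e. SIGSEGV); the healthcheck was still returning {"status":"ok"} about 18 seconds before FinishedAt, so the server was responsive right up to the crash; and the image is the floating server-cuda tag (CUDA 12.4, Ubuntu 22.04 base), so the exact llama.cpp build is only identifiable from the image digest above.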

Operating systems

Linux

Which llama.cpp modules do you know to be affected?

llama-server

Command line

docker run -d --name=gptoss20  `
  --network=host   `
  -v /e/s/models:/models  `
  --gpus all   `
  ghcr.io/ggml-org/llama.cpp:server-cuda  `
  --host 0.0.0.0 -ngl 99  `
  -m /models/gpt-oss-20b-mxfp4.gguf `
  -c 0 -fa on --jinja --reasoning-format none
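
Since the description below notes that a roughly 20-day-old build worked, it may help to pin the image to a specific build rather than the floating server-cuda tag when narrowing down the regression. The sketch below is written for a POSIX shell, and the server-cuda-b1234 tag is a placeholder: substitute a build tag that actually exists in the ghcr.io/ggml-org/llama.cpp registry.

# Same invocation with the image pinned to a specific build; the
# server-cuda-b1234 tag is a placeholder - substitute a build tag
# published under ghcr.io/ggml-org/llama.cpp.
docker run -d --name=gptoss20-pinned \
  --network=host \
  -v /e/s/models:/models \
  --gpus all \
  ghcr.io/ggml-org/llama.cpp:server-cuda-b1234 \
  --host 0.0.0.0 -ngl 99 \
  -m /models/gpt-oss-20b-mxfp4.gguf \
  -c 0 -fa on --jinja --reasoning-format none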

Problem description & steps to reproduce

Run the prompt optimizer from this repo: http://github.com/fwaris/FsGepa

Note that this ran fine with a roughly 20-day-old build of llama-server.

The optimizer maintains a steady flow of requests, with 5 concurrent requests in flight at any given time (a sketch of a comparable load pattern follows).
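
A minimal way to approximate that load without the optimizer, assuming the server listens on localhost:8080 (as the container healthcheck suggests) and using a placeholder prompt, is to keep batches of five chat-completion requests in flight with curl:

# Fire 5 concurrent requests per round against the server's
# OpenAI-compatible endpoint; prompt and max_tokens are placeholders.
while true; do
  for i in 1 2 3 4 5; do
    curl -s http://localhost:8080/v1/chat/completions \
      -H "Content-Type: application/json" \
      -d '{"messages":[{"role":"user","content":"Summarize the plot of Hamlet."}],"max_tokens":256}' \
      > /dev/null &
  done
  wait
done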

First Bad Commit

No response
