
Misc. bug: docker for llama server crashing with gpt-oss-20b #17060

@fwaris

Description


Name and Version

Log (tail of the server output leading up to the crash):



slot launch_slot_: id  2 | task 1950 | processing task
slot update_slots: id  2 | task 1950 | new prompt, n_ctx_slot = 131072, n_keep = 0, task.n_tokens = 214
slot update_slots: id  2 | task 1950 | n_past = 75, slot.prompt.tokens.size() = 821, seq_id = 2, pos_min = -1
libggml-base.so(+0x183cb)[0x72edb99cd3cb]
libggml-base.so(ggml_print_backtrace+0x21f)[0x72edb99cd82f]
libggml-base.so(ggml_abort+0x152)[0x72edb99cda02]
/app/llama-server(+0xfb930)[0x59655a083930]
/app/llama-server(+0x98c78)[0x59655a020c78]
/app/llama-server(+0x57c1d)[0x596559fdfc1d]
/usr/lib/x86_64-linux-gnu/libc.so.6(+0x29d90)[0x72edb9480d90]
/usr/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0x80)[0x72edb9480e40]
/app/llama-server(+0x596d5)[0x596559fe16d5]

Container inspect output (docker inspect gptoss20):


{
	"Id": "499c942db482d3c59a4a93de27906b344322e90ddbe8bfb3f5e5daca15fdee10",
	"Created": "2025-11-06T17:29:42.560540946Z",
	"Path": "/app/llama-server",
	"Args": [
		"--host",
		"0.0.0.0",
		"-ngl",
		"99",
		"-m",
		"/models/gpt-oss-20b-mxfp4.gguf",
		"-c",
		"0",
		"-fa",
		"on",
		"--jinja",
		"--reasoning-format",
		"none"
	],
	"State": {
		"Status": "exited",
		"Running": false,
		"Paused": false,
		"Restarting": false,
		"OOMKilled": false,
		"Dead": false,
		"Pid": 0,
		"ExitCode": 139,
		"Error": "",
		"StartedAt": "2025-11-06T17:29:42.654145845Z",
		"FinishedAt": "2025-11-06T17:33:54.209421876Z",
		"Health": {
			"Status": "unhealthy",
			"FailingStreak": 0,
			"Log": [
				{
					"Start": "2025-11-06T17:31:40.494739486Z",
					"End": "2025-11-06T17:31:40.576549417Z",
					"ExitCode": 0,
					"Output": "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n                                 Dload  Upload   Total   Spent    Left  Speed\n\r  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0\r100    15  100    15    0     0  12711      0 --:--:-- --:--:-- --:--:-- 15000\n{\"status\":\"ok\"}"
				},
				{
					"Start": "2025-11-06T17:32:09.582762691Z",
					"End": "2025-11-06T17:32:09.67254232Z",
					"ExitCode": 0,
					"Output": "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n                                 Dload  Upload   Total   Spent    Left  Speed\n\r  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0\r100    15  100    15    0     0  18564      0 --:--:-- --:--:-- --:--:-- 15000\n{\"status\":\"ok\"}"
				},
				{
					"Start": "2025-11-06T17:32:38.683854524Z",
					"End": "2025-11-06T17:32:38.787315065Z",
					"ExitCode": 0,
					"Output": "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n                                 Dload  Upload   Total   Spent    Left  Speed\n\r  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0\r100    15  100    15    0     0  15889      0 --:--:-- --:--:-- --:--:-- 15000\n{\"status\":\"ok\"}"
				},
				{
					"Start": "2025-11-06T17:33:07.75805487Z",
					"End": "2025-11-06T17:33:07.836319282Z",
					"ExitCode": 0,
					"Output": "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n                                 Dload  Upload   Total   Spent    Left  Speed\n\r  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0\r100    15  100    15    0     0  19556      0 --:--:-- --:--:-- --:--:-- 15000\n{\"status\":\"ok\"}"
				},
				{
					"Start": "2025-11-06T17:33:36.848880427Z",
					"End": "2025-11-06T17:33:36.947534877Z",
					"ExitCode": 0,
					"Output": "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n                                 Dload  Upload   Total   Spent    Left  Speed\n\r  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0\r100    15  100    15    0     0  19531      0 --:--:-- --:--:-- --:--:-- 15000\n{\"status\":\"ok\"}"
				}
			]
		}
	},
	"Image": "sha256:15101512cf511b41b4822715c0c953826bdd8a379263958b93f959d5272f9d88",
	"ResolvConfPath": "/var/lib/docker/containers/499c942db482d3c59a4a93de27906b344322e90ddbe8bfb3f5e5daca15fdee10/resolv.conf",
	"HostnamePath": "/var/lib/docker/containers/499c942db482d3c59a4a93de27906b344322e90ddbe8bfb3f5e5daca15fdee10/hostname",
	"HostsPath": "/var/lib/docker/containers/499c942db482d3c59a4a93de27906b344322e90ddbe8bfb3f5e5daca15fdee10/hosts",
	"LogPath": "/var/lib/docker/containers/499c942db482d3c59a4a93de27906b344322e90ddbe8bfb3f5e5daca15fdee10/499c942db482d3c59a4a93de27906b344322e90ddbe8bfb3f5e5daca15fdee10-json.log",
	"Name": "/gptoss20",
	"RestartCount": 0,
	"Driver": "overlayfs",
	"Platform": "linux",
	"MountLabel": "",
	"ProcessLabel": "",
	"AppArmorProfile": "",
	"ExecIDs": null,
	"HostConfig": {
		"Binds": [
			"/e/s/models:/models"
		],
		"ContainerIDFile": "",
		"LogConfig": {
			"Type": "json-file",
			"Config": {}
		},
		"NetworkMode": "host",
		"PortBindings": {},
		"RestartPolicy": {
			"Name": "no",
			"MaximumRetryCount": 0
		},
		"AutoRemove": false,
		"VolumeDriver": "",
		"VolumesFrom": null,
		"ConsoleSize": [
			30,
			120
		],
		"CapAdd": null,
		"CapDrop": null,
		"CgroupnsMode": "private",
		"Dns": [],
		"DnsOptions": [],
		"DnsSearch": [],
		"ExtraHosts": null,
		"GroupAdd": null,
		"IpcMode": "private",
		"Cgroup": "",
		"Links": null,
		"OomScoreAdj": 0,
		"PidMode": "",
		"Privileged": false,
		"PublishAllPorts": false,
		"ReadonlyRootfs": false,
		"SecurityOpt": null,
		"UTSMode": "",
		"UsernsMode": "",
		"ShmSize": 67108864,
		"Runtime": "runc",
		"Isolation": "",
		"CpuShares": 0,
		"Memory": 0,
		"NanoCpus": 0,
		"CgroupParent": "",
		"BlkioWeight": 0,
		"BlkioWeightDevice": [],
		"BlkioDeviceReadBps": [],
		"BlkioDeviceWriteBps": [],
		"BlkioDeviceReadIOps": [],
		"BlkioDeviceWriteIOps": [],
		"CpuPeriod": 0,
		"CpuQuota": 0,
		"CpuRealtimePeriod": 0,
		"CpuRealtimeRuntime": 0,
		"CpusetCpus": "",
		"CpusetMems": "",
		"Devices": [],
		"DeviceCgroupRules": null,
		"DeviceRequests": [
			{
				"Driver": "",
				"Count": -1,
				"DeviceIDs": null,
				"Capabilities": [
					[
						"gpu"
					]
				],
				"Options": {}
			}
		],
		"MemoryReservation": 0,
		"MemorySwap": 0,
		"MemorySwappiness": null,
		"OomKillDisable": null,
		"PidsLimit": null,
		"Ulimits": [],
		"CpuCount": 0,
		"CpuPercent": 0,
		"IOMaximumIOps": 0,
		"IOMaximumBandwidth": 0,
		"MaskedPaths": [
			"/proc/asound",
			"/proc/acpi",
			"/proc/interrupts",
			"/proc/kcore",
			"/proc/keys",
			"/proc/latency_stats",
			"/proc/timer_list",
			"/proc/timer_stats",
			"/proc/sched_debug",
			"/proc/scsi",
			"/sys/firmware",
			"/sys/devices/virtual/powercap"
		],
		"ReadonlyPaths": [
			"/proc/bus",
			"/proc/fs",
			"/proc/irq",
			"/proc/sys",
			"/proc/sysrq-trigger"
		]
	},
	"GraphDriver": {
		"Data": null,
		"Name": "overlayfs"
	},
	"Mounts": [
		{
			"Type": "bind",
			"Source": "/e/s/models",
			"Destination": "/models",
			"Mode": "",
			"RW": true,
			"Propagation": "rprivate"
		}
	],
	"Config": {
		"Hostname": "docker-desktop",
		"Domainname": "",
		"User": "",
		"AttachStdin": false,
		"AttachStdout": false,
		"AttachStderr": false,
		"Tty": false,
		"OpenStdin": false,
		"StdinOnce": false,
		"Env": [
			"PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
			"NVARCH=x86_64",
			"NVIDIA_REQUIRE_CUDA=cuda>=12.4 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 brand=tesla,driver>=525,driver<526 brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526 brand=tesla,driver>=535,driver<536 brand=unknown,driver>=535,driver<536 brand=nvidia,driver>=535,driver<536 brand=nvidiartx,driver>=535,driver<536 brand=geforce,driver>=535,driver<536 brand=geforcertx,driver>=535,driver<536 brand=quadro,driver>=535,driver<536 brand=quadrortx,driver>=535,driver<536 brand=titan,driver>=535,driver<536 brand=titanrtx,driver>=535,driver<536",
			"NV_CUDA_CUDART_VERSION=12.4.99-1",
			"NV_CUDA_COMPAT_PACKAGE=cuda-compat-12-4",
			"CUDA_VERSION=12.4.0",
			"LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64",
			"NVIDIA_VISIBLE_DEVICES=all",
			"NVIDIA_DRIVER_CAPABILITIES=compute,utility",
			"NV_CUDA_LIB_VERSION=12.4.0-1",
			"NV_NVTX_VERSION=12.4.99-1",
			"NV_LIBNPP_VERSION=12.2.5.2-1",
			"NV_LIBNPP_PACKAGE=libnpp-12-4=12.2.5.2-1",
			"NV_LIBCUSPARSE_VERSION=12.3.0.142-1",
			"NV_LIBCUBLAS_PACKAGE_NAME=libcublas-12-4",
			"NV_LIBCUBLAS_VERSION=12.4.2.65-1",
			"NV_LIBCUBLAS_PACKAGE=libcublas-12-4=12.4.2.65-1",
			"NV_LIBNCCL_PACKAGE_NAME=libnccl2",
			"NV_LIBNCCL_PACKAGE_VERSION=2.20.5-1",
			"NCCL_VERSION=2.20.5-1",
			"NV_LIBNCCL_PACKAGE=libnccl2=2.20.5-1+cuda12.4",
			"NVIDIA_PRODUCT_NAME=CUDA",
			"LLAMA_ARG_HOST=0.0.0.0"
		],
		"Cmd": [
			"--host",
			"0.0.0.0",
			"-ngl",
			"99",
			"-m",
			"/models/gpt-oss-20b-mxfp4.gguf",
			"-c",
			"0",
			"-fa",
			"on",
			"--jinja",
			"--reasoning-format",
			"none"
		],
		"Healthcheck": {
			"Test": [
				"CMD",
				"curl",
				"-f",
				"http://localhost:8080/health"
			]
		},
		"Image": "ghcr.io/ggml-org/llama.cpp:server-cuda",
		"Volumes": null,
		"WorkingDir": "/app",
		"Entrypoint": [
			"/app/llama-server"
		],
		"OnBuild": null,
		"Labels": {
			"maintainer": "NVIDIA CORPORATION <[email protected]>",
			"org.opencontainers.image.ref.name": "ubuntu",
			"org.opencontainers.image.version": "22.04"
		}
	},
	"NetworkSettings": {
		"Bridge": "",
		"SandboxID": "",
		"SandboxKey": "",
		"Ports": {},
		"HairpinMode": false,
		"LinkLocalIPv6Address": "",
		"LinkLocalIPv6PrefixLen": 0,
		"SecondaryIPAddresses": null,
		"SecondaryIPv6Addresses": null,
		"EndpointID": "",
		"Gateway": "",
		"GlobalIPv6Address": "",
		"GlobalIPv6PrefixLen": 0,
		"IPAddress": "",
		"IPPrefixLen": 0,
		"IPv6Gateway": "",
		"MacAddress": "",
		"Networks": {
			"host": {
				"IPAMConfig": null,
				"Links": null,
				"Aliases": null,
				"MacAddress": "",
				"DriverOpts": null,
				"GwPriority": 0,
				"NetworkID": "e2e36d82c94fabae031a8728e4513ce436f4ecd8b78e7a8a0e68572c7ce41076",
				"EndpointID": "",
				"Gateway": "",
				"IPAddress": "",
				"IPPrefixLen": 0,
				"IPv6Gateway": "",
				"GlobalIPv6Address": "",
				"GlobalIPv6PrefixLen": 0,
				"DNSNames": null
			}
		}
	},
	"ImageManifestDescriptor": {
		"mediaType": "application/vnd.docker.distribution.manifest.v2+json",
		"digest": "sha256:15101512cf511b41b4822715c0c953826bdd8a379263958b93f959d5272f9d88",
		"size": 2867,
		"platform": {
			"architecture": "amd64",
			"os": "linux"
		}
	}
}

Operating systems

Linux

Which llama.cpp modules do you know to be affected?

llama-server

Command line

docker run -d --name=gptoss20  `
  --network=host   `
  -v /e/s/models:/models  `
  --gpus all   `
  ghcr.io/ggml-org/llama.cpp:server-cuda  `
  --host 0.0.0.0 -ngl 99  `
  -m /models/gpt-oss-20b-mxfp4.gguf `
  -c 0 -fa on --jinja --reasoning-format none
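
Since the crash appeared after pulling a newer image (it worked on a roughly 20-day-old build, see below), one way to narrow the regression is to pin the older build explicitly. A hedged sketch, assuming the registry publishes per-build tags of the form server-cuda-bNNNN alongside the floating server-cuda tag; bNNNN is a placeholder, not a real build number:

# Pin a specific older build instead of the floating server-cuda tag
# (bNNNN is a placeholder; substitute a concrete build number)
docker run -d --name=gptoss20-old  `
  --network=host   `
  -v /e/s/models:/models  `
  --gpus all   `
  ghcr.io/ggml-org/llama.cpp:server-cuda-bNNNN  `
  --host 0.0.0.0 -ngl 99  `
  -m /models/gpt-oss-20b-mxfp4.gguf `
  -c 0 -fa on --jinja --reasoning-format none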

Problem description & steps to reproduce

Run the prompt optimizer from this repo: http://github.com/fwaris/FsGepa

Note that this ran fine with a 20-day-old version of llama-server.

The optimizer maintains a steady flow of requests, with 5 concurrent requests in flight at any given time (a repro sketch follows below).
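
As a stand-in for the optimizer's traffic, a minimal bash sketch that keeps 5 requests in flight against llama-server's OpenAI-compatible endpoint (port 8080 matches the container's healthcheck above; the prompt text and max_tokens value are arbitrary placeholders):

# Repeatedly fire 5 concurrent chat completions at the server;
# /v1/chat/completions is llama-server's OpenAI-compatible endpoint.
while true; do
  for i in 1 2 3 4 5; do
    curl -s http://localhost:8080/v1/chat/completions \
      -H "Content-Type: application/json" \
      -d '{"messages":[{"role":"user","content":"Summarize the plot of Hamlet."}],"max_tokens":128}' \
      > /dev/null &
  done
  wait
done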

First Bad Commit

No response

Relevant log output

See the log under "Name and Version" above.

Metadata

Assignees

Labels

bug (Something isn't working)

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests
