|
@@ -9,6 +9,7 @@
 
 from prometheus_client import Counter, Gauge, Histogram
 
+import vllm.envs as envs
 from vllm.config import SupportsMetricsInfo, VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
     KVConnectorLogging,
@@ -116,11 +117,13 @@ def _reset(self, now):
         # Tracked stats over current local logging interval.
         self.num_prompt_tokens: int = 0
         self.num_generation_tokens: int = 0
+        self.num_corrupted_reqs: int = 0
 
     def _track_iteration_stats(self, iteration_stats: IterationStats):
         # Save tracked stats for token counters.
         self.num_prompt_tokens += iteration_stats.num_prompt_tokens
         self.num_generation_tokens += iteration_stats.num_generation_tokens
+        self.num_corrupted_reqs += iteration_stats.num_corrupted_reqs
 
     def _get_throughput(self, tracked_stats: int, now: float) -> float:
         # Compute summary metrics for tracked stats
@@ -204,6 +207,10 @@ def log(self):
             self.last_scheduler_stats.kv_cache_usage * 100,
             self.prefix_caching_metrics.hit_rate * 100,
         ]
+
+        if envs.VLLM_COMPUTE_NANS_IN_LOGITS:
+            log_parts.append("Corrupted: %d reqs")
+            log_args.append(self.num_corrupted_reqs)
         if not self.connector_prefix_caching_metrics.empty:
             log_parts.append("External prefix cache hit rate: %.1f%%")
             log_args.append(self.connector_prefix_caching_metrics.hit_rate * 100)
@@ -275,9 +282,6 @@ def aggregate_scheduler_stats(self):
         self.last_scheduler_stats.num_running_reqs += (
             last_scheduler_stats.num_running_reqs
         )
-        self.last_scheduler_stats.num_corrupted_reqs += (
-            last_scheduler_stats.num_corrupted_reqs
-        )
         self.last_scheduler_stats.kv_cache_usage += (
             last_scheduler_stats.kv_cache_usage
         )
@@ -481,6 +485,19 @@ def __init__(
             gauge_kv_cache_usage, engine_indexes, model_name
         )
 
+        if envs.VLLM_COMPUTE_NANS_IN_LOGITS:
+            counter_corrupted_requests = self._counter_cls(
+                name="vllm:corrupted_requests",
+                documentation=(
+                    "Corrupted requests, in terms of total number of requests "
+                    "with NaNs in logits."
+                ),
+                labelnames=labelnames,
+            )
+            self.counter_corrupted_requests = make_per_engine(
+                counter_corrupted_requests, engine_indexes, model_name
+            )
+
         counter_prefix_cache_queries = self._counter_cls(
             name="vllm:prefix_cache_queries",
             documentation=(
@@ -933,7 +950,6 @@ def record(
         self.gauge_scheduler_waiting[engine_idx].set(
             scheduler_stats.num_waiting_reqs
         )
-
         if self.show_hidden_metrics:
             self.gauge_gpu_cache_usage[engine_idx].set(
                 scheduler_stats.kv_cache_usage
@@ -979,7 +995,10 @@ def record(
 
         if iteration_stats is None:
             return
-
+        if envs.VLLM_COMPUTE_NANS_IN_LOGITS:
+            self.counter_corrupted_requests[engine_idx].inc(
+                iteration_stats.num_corrupted_reqs
+            )
         self.counter_num_preempted_reqs[engine_idx].inc(
             iteration_stats.num_preempted_reqs
         )
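The pattern shared by these hunks: the counter is only constructed and incremented when `VLLM_COMPUTE_NANS_IN_LOGITS` is enabled, and it is registered once per engine so `record()` can bump the child for whichever engine produced the iteration stats. Below is a minimal sketch of that gate-plus-per-engine pattern using `prometheus_client` directly; `_make_per_engine`, the label names, and the `os.environ` lookup are illustrative stand-ins for vLLM's internal `make_per_engine`, label set, and `vllm.envs`, not the actual implementation.

```python
import os

from prometheus_client import Counter


def _make_per_engine(counter, engine_indexes, model_name):
    # Stand-in for vLLM's make_per_engine(): one labelled child per engine
    # index, so callers can do per_engine[engine_idx].inc(n).
    return {
        idx: counter.labels(model_name=model_name, engine=str(idx))
        for idx in engine_indexes
    }


# Gate on the same flag the PR uses; reading it from os.environ here is an
# assumption -- vLLM resolves the flag through vllm.envs.
if os.environ.get("VLLM_COMPUTE_NANS_IN_LOGITS", "0") == "1":
    corrupted_requests = Counter(
        name="vllm:corrupted_requests",
        documentation="Requests observed with NaNs in their logits.",
        labelnames=["model_name", "engine"],
    )
    per_engine = _make_per_engine(corrupted_requests, [0, 1], "my-model")

    # In the record() path, increment the child for the reporting engine.
    per_engine[0].inc(3)  # e.g. three corrupted requests in this iteration
```

Gating at construction time keeps the series out of the `/metrics` output entirely when NaN detection is disabled, rather than exporting a counter that never moves.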
|