99
1010from prometheus_client import Counter , Gauge , Histogram
1111
12+ import vllm .envs as envs
1213from vllm .config import SupportsMetricsInfo , VllmConfig
1314from vllm .distributed .kv_transfer .kv_connector .v1 .metrics import KVConnectorLogging
1415from vllm .logger import init_logger
@@ -104,6 +105,9 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
104105 self .engine_is_idle = False
105106 self .aggregated = False
106107
108+ # Track the number of corrupted requests, never reset.
109+ self .num_corrupted_reqs : int = 0
110+
107111 def _reset (self , now ):
108112 self .last_log_time = now
109113
@@ -115,6 +119,7 @@ def _track_iteration_stats(self, iteration_stats: IterationStats):
115119 # Save tracked stats for token counters.
116120 self .num_prompt_tokens += iteration_stats .num_prompt_tokens
117121 self .num_generation_tokens += iteration_stats .num_generation_tokens
122+ self .num_corrupted_reqs += iteration_stats .num_corrupted_reqs
118123
119124 def _get_throughput (self , tracked_stats : int , now : float ) -> float :
120125 # Compute summary metrics for tracked stats
@@ -187,7 +192,6 @@ def log(self):
187192 "Avg generation throughput: %.1f tokens/s" ,
188193 "Running: %d reqs" ,
189194 "Waiting: %d reqs" ,
190- "Corrupted: %d reqs" ,
191195 "GPU KV cache usage: %.1f%%" ,
192196 "Prefix cache hit rate: %.1f%%" ,
193197 ]
@@ -196,13 +200,15 @@ def log(self):
196200 self .last_generation_throughput ,
197201 self .last_scheduler_stats .num_running_reqs ,
198202 self .last_scheduler_stats .num_waiting_reqs ,
199- self .last_scheduler_stats .num_corrupted_reqs ,
200203 self .last_scheduler_stats .kv_cache_usage * 100 ,
201204 self .prefix_caching_metrics .hit_rate * 100 ,
202205 ]
203206 if not self .connector_prefix_caching_metrics .empty :
204207 log_parts .append ("External prefix cache hit rate: %.1f%%" )
205208 log_args .append (self .connector_prefix_caching_metrics .hit_rate * 100 )
209+ if envs .VLLM_COMPUTE_NANS_IN_LOGITS :
210+ log_parts .append ("Corrupted: %d reqs" )
211+ log_args .append (self .num_corrupted_reqs )
206212 if not self .mm_caching_metrics .empty :
207213 log_parts .append ("MM cache hit rate: %.1f%%" )
208214 log_args .append (self .mm_caching_metrics .hit_rate * 100 )
@@ -271,9 +277,6 @@ def aggregate_scheduler_stats(self):
271277 self .last_scheduler_stats .num_running_reqs += (
272278 last_scheduler_stats .num_running_reqs
273279 )
274- self .last_scheduler_stats .num_corrupted_reqs += (
275- last_scheduler_stats .num_corrupted_reqs
276- )
277280 self .last_scheduler_stats .kv_cache_usage += (
278281 last_scheduler_stats .kv_cache_usage
279282 )
@@ -387,16 +390,6 @@ def __init__(
387390 gauge_scheduler_waiting , engine_indexes , model_name
388391 )
389392
390- gauge_scheduler_corrupted = self ._gauge_cls (
391- name = "vllm:num_requests_corrupted" ,
392- documentation = "Number of requests corrupted." ,
393- multiprocess_mode = "mostrecent" ,
394- labelnames = labelnames ,
395- )
396- self .gauge_scheduler_corrupted = make_per_engine (
397- gauge_scheduler_corrupted , engine_indexes , model_name
398- )
399-
400393 #
401394 # GPU cache
402395 #
@@ -458,6 +451,16 @@ def __init__(
458451 gauge_kv_cache_usage , engine_indexes , model_name
459452 )
460453
454+ if envs .VLLM_COMPUTE_NANS_IN_LOGITS :
455+ counter_corrupted_requests = self ._counter_cls (
456+ name = "vllm:corrupted_requests" ,
457+ documentation = "Number of requests corrupted out of running requests." ,
458+ labelnames = labelnames ,
459+ )
460+ self .counter_corrupted_requests = make_per_engine (
461+ counter_corrupted_requests , engine_indexes , model_name
462+ )
463+
461464 counter_prefix_cache_queries = self ._counter_cls (
462465 name = "vllm:prefix_cache_queries" ,
463466 documentation = (
@@ -910,10 +913,6 @@ def record(
910913 self .gauge_scheduler_waiting [engine_idx ].set (
911914 scheduler_stats .num_waiting_reqs
912915 )
913- self .gauge_scheduler_corrupted [engine_idx ].set (
914- scheduler_stats .num_corrupted_reqs
915- )
916-
917916 if self .show_hidden_metrics :
918917 self .gauge_gpu_cache_usage [engine_idx ].set (
919918 scheduler_stats .kv_cache_usage
@@ -958,6 +957,10 @@ def record(
958957 self .counter_num_preempted_reqs [engine_idx ].inc (
959958 iteration_stats .num_preempted_reqs
960959 )
960+ if envs .VLLM_COMPUTE_NANS_IN_LOGITS and iteration_stats .num_corrupted_reqs > 0 :
961+ self .counter_corrupted_requests [engine_idx ].inc (
962+ iteration_stats .num_corrupted_reqs
963+ )
961964 self .counter_prompt_tokens [engine_idx ].inc (iteration_stats .num_prompt_tokens )
962965 self .counter_generation_tokens [engine_idx ].inc (
963966 iteration_stats .num_generation_tokens
0 commit comments