Skip to content

Commit 61f96a1

Browse files
authored
SOLR-17937: Adding GPU details to the Admin UI (#3717)
1 parent 45f46b2 commit 61f96a1

File tree

10 files changed

+625
-4
lines changed

10 files changed

+625
-4
lines changed
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# See https://github.com/apache/solr/blob/main/dev-docs/changelog.adoc
2+
title: Adding GPU details on the Admin UI using OpenTelemetry
3+
type: added
4+
authors:
5+
- name: Puneet Ahuja
6+

gradle/libs.versions.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ commons-codec = "1.19.0"
7474
commons-io = "2.20.0"
7575
compose = "1.8.2"
7676
cutterslade-analyze = "1.10.0"
77+
cuvs-java = "25.10.0"
7778
cuvs-lucene = "25.10.0"
7879
cybozulabs-langdetect = "1.1-20120112"
7980
decompose = "3.3.0"
@@ -319,6 +320,7 @@ codehaus-woodstox-stax2api = { module = "org.codehaus.woodstox:stax2-api", versi
319320
commonscli-commonscli = { module = "commons-cli:commons-cli", version.ref = "commons-cli" }
320321
commonscodec-commonscodec = { module = "commons-codec:commons-codec", version.ref = "commons-codec" }
321322
commonsio-commonsio = { module = "commons-io:commons-io", version.ref = "commons-io" }
323+
cuvs-java = { module = "com.nvidia.cuvs:cuvs-java", version.ref = "cuvs-java" }
322324
cuvs-lucene = { module = "com.nvidia.cuvs.lucene:cuvs-lucene", version.ref = "cuvs-lucene" }
323325
cybozulabs-langdetect = { module = "com.cybozu.labs:langdetect", version.ref = "cybozulabs-langdetect" }
324326
decompose-decompose = { module = "com.arkivanov.decompose:decompose", version.ref = "decompose" }

solr/core/src/java/org/apache/solr/core/CoreContainer.java

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@
130130
import org.apache.solr.jersey.JerseyAppHandlerCache;
131131
import org.apache.solr.logging.LogWatcher;
132132
import org.apache.solr.logging.MDCLoggingContext;
133+
import org.apache.solr.metrics.GpuMetricsProvider;
133134
import org.apache.solr.metrics.SolrMetricManager;
134135
import org.apache.solr.metrics.SolrMetricProducer;
135136
import org.apache.solr.metrics.SolrMetricsContext;
@@ -778,6 +779,9 @@ private void loadInternal() {
778779

779780
solrMetricsContext = new SolrMetricsContext(metricManager, NODE_REGISTRY);
780781

782+
// Initialize GPU metrics service
783+
initGpuMetricsService();
784+
781785
shardHandlerFactory =
782786
ShardHandlerFactory.newInstance(cfg.getShardHandlerFactoryPluginInfo(), loader);
783787
if (shardHandlerFactory instanceof SolrMetricProducer metricProducer) {
@@ -1154,6 +1158,59 @@ private static void checkForDuplicateCoreNames(List<CoreDescriptor> cds) {
11541158
}
11551159
}
11561160

1161+
private void initGpuMetricsService() {
1162+
try {
1163+
Class<?> serviceClass = Class.forName("org.apache.solr.cuvs.GpuMetricsService");
1164+
Object serviceObj = serviceClass.getMethod("getInstance").invoke(null);
1165+
1166+
if (serviceObj instanceof GpuMetricsProvider provider) {
1167+
serviceClass.getMethod("initialize", CoreContainer.class).invoke(serviceObj, this);
1168+
provider.initializeMetrics(
1169+
solrMetricsContext,
1170+
Attributes.builder()
1171+
.put(SolrMetricProducer.TYPE_ATTR, "gpu")
1172+
.put(SolrMetricProducer.CATEGORY_ATTR, "system")
1173+
.build());
1174+
log.info("GPU metrics service initialized");
1175+
}
1176+
} catch (ClassNotFoundException e) {
1177+
log.debug("cuVS module not available, GPU metrics will not be collected");
1178+
} catch (Exception e) {
1179+
log.warn("Failed to initialize GPU metrics service", e);
1180+
}
1181+
}
1182+
1183+
private void shutdownGpuMetricsService() {
1184+
try {
1185+
Class<?> serviceClass = Class.forName("org.apache.solr.cuvs.GpuMetricsService");
1186+
Object serviceObj = serviceClass.getMethod("getInstance").invoke(null);
1187+
1188+
if (serviceObj instanceof GpuMetricsProvider) {
1189+
GpuMetricsProvider provider = (GpuMetricsProvider) serviceObj;
1190+
provider.close();
1191+
log.info("GPU metrics service shut down");
1192+
}
1193+
} catch (ClassNotFoundException e) {
1194+
// Expected when cuvs module is not available
1195+
} catch (Exception e) {
1196+
log.warn("Failed to shutdown GPU metrics service", e);
1197+
}
1198+
}
1199+
1200+
public GpuMetricsProvider getGpuMetricsProvider() {
1201+
try {
1202+
Class<?> serviceClass = Class.forName("org.apache.solr.cuvs.GpuMetricsService");
1203+
Object serviceObj = serviceClass.getMethod("getInstance").invoke(null);
1204+
1205+
if (serviceObj instanceof GpuMetricsProvider) {
1206+
return (GpuMetricsProvider) serviceObj;
1207+
}
1208+
} catch (Exception e) {
1209+
// Module not available
1210+
}
1211+
return null;
1212+
}
1213+
11571214
private volatile boolean isShutDown = false;
11581215

11591216
public boolean isShutDown() {
@@ -1219,6 +1276,9 @@ public void shutdown() {
12191276

12201277
customThreadPool.execute(replayUpdatesExecutor::shutdownAndAwaitTermination);
12211278

1279+
// Shutdown GPU metrics service if it was initialized
1280+
shutdownGpuMetricsService();
1281+
12221282
if (metricManager != null) {
12231283
// Close all OTEL meter providers and metrics
12241284
metricManager.closeAllRegistries();

solr/core/src/java/org/apache/solr/handler/admin/SystemInfoHandler.java

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
import org.apache.solr.core.SolrCore;
5656
import org.apache.solr.handler.RequestHandlerBase;
5757
import org.apache.solr.handler.admin.api.NodeSystemInfoAPI;
58+
import org.apache.solr.metrics.GpuMetricsProvider;
5859
import org.apache.solr.request.SolrQueryRequest;
5960
import org.apache.solr.response.SolrQueryResponse;
6061
import org.apache.solr.schema.IndexSchema;
@@ -238,6 +239,8 @@ public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throw
238239
rsp.add("jvm", getJvmInfo(nodeConfig));
239240
rsp.add("security", getSecurityInfo(req));
240241
rsp.add("system", getSystemInfo());
242+
243+
rsp.add("gpu", getGpuInfo(req));
241244
if (solrCloudMode) {
242245
rsp.add("node", getCoreContainer(req).getZkController().getNodeName());
243246
}
@@ -519,6 +522,50 @@ public Boolean registerV2() {
519522
return Boolean.TRUE;
520523
}
521524

525+
private SimpleOrderedMap<Object> getGpuInfo(SolrQueryRequest req) {
526+
SimpleOrderedMap<Object> gpuInfo = new SimpleOrderedMap<>();
527+
528+
try {
529+
GpuMetricsProvider provider = getCoreContainer(req).getGpuMetricsProvider();
530+
531+
if (provider == null) {
532+
gpuInfo.add("available", false);
533+
return gpuInfo;
534+
}
535+
536+
long gpuCount = provider.getGpuCount();
537+
if (gpuCount > 0) {
538+
gpuInfo.add("available", true);
539+
gpuInfo.add("count", gpuCount);
540+
541+
long gpuMemoryTotal = provider.getGpuMemoryTotal();
542+
long gpuMemoryUsed = provider.getGpuMemoryUsed();
543+
long gpuMemoryFree = provider.getGpuMemoryFree();
544+
545+
if (gpuMemoryTotal > 0) {
546+
SimpleOrderedMap<Object> memory = new SimpleOrderedMap<>();
547+
memory.add("total", gpuMemoryTotal);
548+
memory.add("used", gpuMemoryUsed);
549+
memory.add("free", gpuMemoryFree);
550+
gpuInfo.add("memory", memory);
551+
}
552+
553+
var devices = provider.getGpuDevices();
554+
if (devices != null && devices.size() > 0) {
555+
gpuInfo.add("devices", devices);
556+
}
557+
} else {
558+
gpuInfo.add("available", false);
559+
}
560+
561+
} catch (Exception e) {
562+
log.warn("Failed to get GPU information", e);
563+
gpuInfo.add("available", false);
564+
}
565+
566+
return gpuInfo;
567+
}
568+
522569
@Override
523570
public Name getPermissionName(AuthorizationContext request) {
524571
return Name.CONFIG_READ_PERM;
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.solr.metrics;
18+
19+
import java.util.Map;
20+
21+
public interface GpuMetricsProvider extends SolrMetricProducer {
22+
23+
long getGpuCount();
24+
25+
long getGpuMemoryTotal();
26+
27+
long getGpuMemoryUsed();
28+
29+
long getGpuMemoryFree();
30+
31+
Map<String, Object> getGpuDevices();
32+
}

solr/modules/cuvs/build.gradle

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ dependencies {
2323
implementation(libs.cuvs.lucene) {
2424
changing = true
2525
}
26+
implementation libs.cuvs.java
27+
implementation libs.opentelemetry.api
2628
implementation project(':solr:core')
2729
implementation project(':solr:solrj')
2830
implementation libs.apache.lucene.core

0 commit comments

Comments
 (0)