Skip to content

Commit 2e4194f

Browse files
authored
fix: 修复 ai 模块的 CI 构建错误 (#1500)
1 parent 6247e4f commit 2e4194f

File tree

4 files changed

+31
-5
lines changed

4 files changed

+31
-5
lines changed

.changeset/fluffy-crabs-decide.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@scow/ai": patch
3+
---
4+
5+
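Fix: resolve CI build error in @scow/ai module by reading pod information (namespace, pod name, node name, container ID) from the running pod in `job.pods` instead of `job.containerJobInfo`, and by supplying the `envVariables` field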

apps/ai/src/server/setup/jobShell.ts

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212

1313
import { asyncClientCall } from "@ddadaal/tsgrpc-client";
1414
import * as k8sClient from "@kubernetes/client-node";
15+
import { JobInfo_PodStatus } from "@scow/ai-scheduler-adapter-protos/build/protos/job";
1516
import { normalizePathnameWithQuery } from "@scow/utils";
1617
import { IncomingMessage } from "http";
1718
import { NextApiRequest } from "next";
@@ -189,8 +190,16 @@ wss.on("connection", async (ws: AliveCheckedWebSocket, req) => {
189190
return;
190191
}
191192

192-
const namespace = job.containerJobInfo?.namespace;
193-
const podName = job.containerJobInfo?.podName;
193+
// 获取运行中pod的namespace,podName
194+
const runningPod = job.pods.find((pod) => pod.podStatus === JobInfo_PodStatus.RUNNING);
195+
196+
if (!runningPod) {
197+
log("[shell] No running pod found for this job.");
198+
ws.close(1008, "No running pod found for this job.");
199+
return;
200+
}
201+
202+
const { namespace, podName } = runningPod;
194203

195204
if (!namespace || !podName) {
196205
log("[shell] Namespace or pod not obtained, please check the adapter version");

apps/ai/src/server/trpc/route/jobs/apps.ts

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
import { asyncClientCall } from "@ddadaal/tsgrpc-client";
1414
import { ServiceError } from "@ddadaal/tsgrpc-common";
15-
import { JobInfo } from "@scow/ai-scheduler-adapter-protos/build/protos/job";
15+
import { JobInfo, JobInfo_PodStatus } from "@scow/ai-scheduler-adapter-protos/build/protos/job";
1616
import { AppType } from "@scow/config/build/appForAi";
1717
import { getPlaceholderKeys } from "@scow/lib-config/build/parse";
1818
import { OperationResult, OperationType } from "@scow/lib-operation-log";
@@ -502,6 +502,7 @@ export const createAppSession = procedure
502502
// 用户指定应用工作目录,如果不存在,则默认为用户的appJobsDirectory
503503
workingDirectory: workingDirectory ?? join(homeDir, appJobsDirectory),
504504
script: remoteEntryPath,
505+
envVariables: [],
505506
// 对于AI模块,需要传递的额外参数
506507
// 第一个参数确定是创建应用or训练任务,
507508
// 第二个参数为创建应用时的appId
@@ -686,8 +687,18 @@ export const saveImage =
686687
});
687688
}
688689

689-
const nodeName = job.containerJobInfo?.nodeName;
690-
const containerId = job.containerJobInfo?.containerId;
690+
// 查找运行中的Pod
691+
const runningPod = job.pods.find((pod) => pod.podStatus === JobInfo_PodStatus.RUNNING);
692+
693+
if (!runningPod) {
694+
throw new TRPCError({
695+
code: "NOT_FOUND",
696+
message: "No running pod found for this job",
697+
});
698+
}
699+
700+
const nodeName = runningPod.nodeName;
701+
const containerId = runningPod.containerId;
691702

692703
if (!nodeName || !containerId) {
693704
throw new TRPCError({

apps/ai/src/server/trpc/route/jobs/jobs.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,7 @@ procedure
256256
// 如果nodeCount不为1但同时选定镜像又没有框架标签,该接口会报错
257257
(nodeCount === 1 && !gpuType?.startsWith("huawei.com")) ? "" : framework || "",
258258
],
259+
envVariables:[],
259260
psNodeCount:psNodes,
260261
workerNodeCount:workerNodes,
261262
}).catch((e) => {

0 commit comments

Comments
 (0)