Skip to content

Commit 9e3edcb

Browse files
authored
Merge branch 'dev' into ruigao/cilium-update
2 parents ee5710b + 35b2a3c commit 9e3edcb

File tree

83 files changed

+7577
-471
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

83 files changed

+7577
-471
lines changed

.github/CODEOWNERS

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
. @microsoft/ltpadmin
2+
.github @microsoft/ltpadmin

.github/workflows/build-all.yaml

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
# Builds the images of every service directory under src/ (minus a fixed skip
# list) on the self-hosted runner and fails the job if any service fails.
name: Build All Services

permissions:
  contents: read

on:
  push:
    branches: [main, 'release/*']
  pull_request:
    branches: [main, dev, 'release/*']
  release:
    types: [published]
  workflow_dispatch:

env:
  TAG: ${{ github.run_number }}

jobs:
  build:
    name: Build All
    runs-on: [self-hosted, paicicd]
    timeout-minutes: 120
    environment: auto-test
    container:
      image: ubuntu:latest
      volumes:
        # The host Docker socket is mounted so docker-cli inside the container
        # talks to the runner's Docker daemon.
        - /var/run/docker.sock:/var/run/docker.sock
    steps:
      - name: Install git
        run: |
          DEBIAN_FRONTEND=noninteractive apt update
          DEBIAN_FRONTEND=noninteractive apt install -y git

      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          submodules: false
          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.ref_name }}

      - name: Get All Services
        id: all
        run: |
          # Space-padded so membership can be tested as an exact, whole-name
          # match; a regex word boundary would also hit hyphenated names that
          # merely contain a skipped name.
          skipped_services=" base-image cleaning-image dev-box marketplace-db marketplace-restserver marketplace-webportal utilization-reporter "
          services=""
          # Glob instead of parsing `ls` output.
          for path in src/*; do
            [ -e "$path" ] || continue
            name="${path#src/}"
            case "$skipped_services" in
              *" $name "*) continue ;;
            esac
            services="${services:+$services }$name"
          done
          echo "All services: $services"
          echo "services=$services" >> "$GITHUB_OUTPUT"

      - name: Install Package
        if: steps.all.outputs.services != ''
        run: |
          DEBIAN_FRONTEND=noninteractive apt install -y python3 python-is-python3 pip git unzip docker-cli ca-certificates curl apt-transport-https lsb-release gnupg parallel
          curl -sL https://aka.ms/InstallAzureCLIDeb | bash

      - name: Install python libs
        if: steps.all.outputs.services != ''
        run: python -m pip install --break-system-packages pyyaml jinja2 paramiko etcd3 protobuf==3.20.3 kubernetes gitpython

      - name: Decode and unzip config file
        if: steps.all.outputs.services != ''
        env:
          # Passed through the environment rather than pasted into the script
          # text, so the secret's content is never parsed as shell code.
          CONFIG_FILE_B64: ${{ secrets.CONFIG_FILE_B64 }}
        run: |
          printf '%s' "$CONFIG_FILE_B64" | base64 -d > config.zip
          mkdir -p "$GITHUB_WORKSPACE/config"
          unzip -o config.zip -d "$GITHUB_WORKSPACE/config"
          ls -l "$GITHUB_WORKSPACE/config"

      - name: Arrange Config Files
        if: steps.all.outputs.services != ''
        run: |
          rm -rf /tmp/auth-configuration
          mv "$GITHUB_WORKSPACE/config/auth-configuration" /tmp/
          ls -l /tmp/auth-configuration

      - name: Build Images of Services
        if: steps.all.outputs.services != ''
        env:
          # Service names come from directory names in the checked-out ref
          # (PR-controlled), so they are handed over as data via env instead of
          # being interpolated into the script.
          ALL_SERVICES: ${{ steps.all.outputs.services }}
        run: |
          echo "Building: $ALL_SERVICES"
          echo "--------------------------------"
          failed_services=""
          # Word splitting of the space-separated list is intentional here.
          for service in $ALL_SERVICES; do
            echo "Building service: $service"
            if [ "$service" = "alert-manager" ]; then
              # Only the listed alert-manager images are built, in a single
              # invocation. --imagelist is declared with nargs='+', so every
              # image name must be its own argument (not a comma-joined string).
              echo "Building specific alert-manager images"
              set -- -i abnormal-detector alert-handler alert-parser cert-expiration-checker cluster-utilization job-data-recorder job-status-change-notification node-failure-detection node-issue-classifier nvidia-gpu-low-perf-fixer redis-monitoring
            else
              set --
            fi
            if python3 "$GITHUB_WORKSPACE/build/pai_build.py" build \
                -c "$GITHUB_WORKSPACE/config/cluster-configuration" \
                -s "$service" "$@"; then
              echo "✓ Successfully built: $service"
            else
              echo "✗ Failed to build: $service"
              failed_services="$failed_services $service"
            fi
          done

          # Report every failure at once instead of stopping at the first one.
          if [ -n "$failed_services" ]; then
            echo "::error::Failed to build services:$failed_services"
            echo "FAILED_SERVICES=$failed_services"
            exit 1
          else
            echo "All services built successfully"
          fi

.github/workflows/build-deploy-changes.yaml

Lines changed: 51 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@ permissions:
66

77
on:
88
push:
9-
branches: [main, dev, 'release/*']
9+
branches: [main, dev, "release/*"]
1010
pull_request:
11-
branches: [main, dev, 'release/*']
11+
branches: [main, dev, "release/*"]
1212

1313
env:
1414
TAG: ${{ github.run_number }}
@@ -55,9 +55,8 @@ jobs:
5555
changed_files=$(git diff --name-only "$base_sha" "$head_sha")
5656
echo "Changed files: $changed_files"
5757
58-
# extract service folders under src/, skip alert-manager
58+
# extract service folders under src/
5959
folders=$(echo "$changed_files" | grep '^src/' \
60-
| grep -v 'alert-manager' \
6160
| awk -F'/' '{print $2}' \
6261
| sort -u | tr '\n' ' ')
6362
echo "Changed folders: $folders"
@@ -104,6 +103,18 @@ jobs:
104103
run: |
105104
changed_services="${{ steps.changes.outputs.folders }}"
106105
echo "Building: $changed_services"
# alert-manager is taken out of the generic service list and built through a
# dedicated invocation that restricts the build to the listed images.
if [[ "$changed_services" == *"alert-manager"* ]]; then
  echo "alert-manager is in the changed services"
  changed_services=$(echo $changed_services | sed 's/alert-manager//g')
  # build specific images in alert-manager
  echo "Building specific alert-manager images"
  # Every continued line must end in a backslash, otherwise the -i line is
  # executed as a separate command and the image filter is never applied.
  # --imagelist is declared with nargs='+', so each image is its own argument.
  "$GITHUB_WORKSPACE/build/pai_build.py" build \
    -c "$GITHUB_WORKSPACE/config/cluster-configuration" \
    -s alert-manager \
    -i abnormal-detector alert-handler alert-parser cert-expiration-checker cluster-utilization job-data-recorder job-status-change-notification node-failure-detection node-issue-classifier nvidia-gpu-low-perf-fixer redis-monitoring
fi
116+
117+
echo "Changed services after removing alert-manager: $changed_services"
107118
$GITHUB_WORKSPACE/build/pai_build.py build \
108119
-c $GITHUB_WORKSPACE/config/cluster-configuration \
109120
-s $changed_services
@@ -122,6 +133,24 @@ jobs:
122133
run: |
123134
changed_services="${{ steps.changes.outputs.folders }}"
124135
echo "Pushing: $changed_services"
136+
# check whether alert-manager is in the changed services
137+
echo "Changed services before removing alert-manager: $changed_services"
138+
if [[ "$changed_services" == *"alert-manager"* ]]; then
139+
echo "alert-manager is in the changed services"
140+
changed_services=$(echo $changed_services | sed 's/alert-manager//g')
141+
# push specific images in alert-manager to GHCR
142+
echo "Pushing specific alert-manager images to GHCR"
143+
$GITHUB_WORKSPACE/build/pai_build.py push \
144+
-c $GITHUB_WORKSPACE/config/cluster-configuration \
145+
-s alert-manager \
146+
-i abnormal-detector,alert-handler,alert-parser,cert-expiration-checker,cluster-utilization,job-data-recorder,job-status-change-notification,node-failure-detection,node-issue-classifier,nvidia-gpu-low-perf-fixer,redis-monitoring \
147+
--docker-registry ghcr.io \
148+
--docker-namespace ${GITHUB_REPOSITORY_OWNER} \
149+
--docker-username ${{ github.actor }} \
150+
--docker-password ${{ secrets.GITHUB_TOKEN }}
151+
fi
152+
153+
echo "Changed services after removing alert-manager: $changed_services"
125154
$GITHUB_WORKSPACE/build/pai_build.py push \
126155
-c $GITHUB_WORKSPACE/config/cluster-configuration \
127156
-s $changed_services \
@@ -133,24 +162,24 @@ jobs:
133162
- name: Azure CLI get credentials and deploy
134163
if: steps.check.outputs.has_changed == 'true'
135164
run: |
136-
az version
137-
az login --identity --client-id ${{ secrets.AZURE_MANAGED_IDENTITY_CLIENT_ID }}
138-
az aks install-cli
139-
az aks get-credentials \
140-
--resource-group ${{ secrets.AZURE_RESOURCE_GROUP }} \
141-
--name ${{ secrets.KUBERNETES_CLUSTER }} \
142-
--overwrite-existing
143-
kubelogin convert-kubeconfig -l azurecli
144-
kubectl config use-context ${{ secrets.KUBERNETES_CLUSTER }}
145-
echo "${{ secrets.PAI_CLUSTER_NAME }}" > cluster_id
146-
echo "Stopping changed pai services \"${{ steps.changes.outputs.folders }}\" on ${{ secrets.PAI_CLUSTER_NAME }} ..."
147-
$GITHUB_WORKSPACE/paictl.py service stop -n ${{ steps.changes.outputs.folders }} < cluster_id
148-
echo "Pushing config to cluster \"${{ secrets.PAI_CLUSTER_NAME }}\" ..."
149-
$GITHUB_WORKSPACE/paictl.py config push -m service -p $GITHUB_WORKSPACE/config/cluster-configuration < cluster_id
150-
echo "Starting to update \"${{ steps.changes.outputs.folders }}\" on ${{ secrets.PAI_CLUSTER_NAME }} ..."
151-
$GITHUB_WORKSPACE/paictl.py service start -n ${{ steps.changes.outputs.folders }} < cluster_id
152-
kubectl get pod
153-
kubectl get service
165+
az version
166+
az login --identity --client-id ${{ secrets.AZURE_MANAGED_IDENTITY_CLIENT_ID }}
167+
az aks install-cli
168+
az aks get-credentials \
169+
--resource-group ${{ secrets.AZURE_RESOURCE_GROUP }} \
170+
--name ${{ secrets.KUBERNETES_CLUSTER }} \
171+
--overwrite-existing
172+
kubelogin convert-kubeconfig -l azurecli
173+
kubectl config use-context ${{ secrets.KUBERNETES_CLUSTER }}
174+
echo "${{ secrets.PAI_CLUSTER_NAME }}" > cluster_id
175+
echo "Stopping changed pai services \"${{ steps.changes.outputs.folders }}\" on ${{ secrets.PAI_CLUSTER_NAME }} ..."
176+
$GITHUB_WORKSPACE/paictl.py service stop -n ${{ steps.changes.outputs.folders }} < cluster_id
177+
echo "Pushing config to cluster \"${{ secrets.PAI_CLUSTER_NAME }}\" ..."
178+
$GITHUB_WORKSPACE/paictl.py config push -m service -p $GITHUB_WORKSPACE/config/cluster-configuration < cluster_id
179+
echo "Starting to update \"${{ steps.changes.outputs.folders }}\" on ${{ secrets.PAI_CLUSTER_NAME }} ..."
180+
$GITHUB_WORKSPACE/paictl.py service start -n ${{ steps.changes.outputs.folders }} < cluster_id
181+
kubectl get pod
182+
kubectl get service
154183
155184
test:
156185
name: Test rest-server
@@ -169,4 +198,3 @@ jobs:
169198
exit 1
170199
fi
171200
echo "Virtual cluster info: $vc_info"
172-

build/core/build_center.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,16 @@
2828

2929
class BuildCenter:
3030

31-
def __init__(self, build_config, process_list, type):
31+
def __init__(self, build_config, process_list, type, arg_config=None):
3232

3333
self.logger = logging.getLogger(__name__)
3434
build_utility.setup_logger_config(self.logger)
3535

3636
self.build_config = build_config
3737
self.task_type = type
3838

39+
self.arg_config = arg_config
40+
3941
self.process_list = [service.lower() for service in process_list] if process_list is not None else None
4042

4143
# Initialize docker_cli instance
@@ -138,7 +140,7 @@ def build_center(self):
138140
for inedge in self.graph.services[item].inedges:
139141
build_worker.copy_dependency_folder(os.path.join(self.codeDir,inedge),
140142
os.path.join(self.graph.services[item].path,self.dependencyDir+inedge))
141-
build_worker.build_single_component(self.graph.services[item])
143+
build_worker.build_single_component(self.graph.services[item], self.arg_config.imagelist)
142144
self.logger.info("Build all components succeed")
143145

144146
except Exception as e:

build/core/build_handler.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def __init__(self, docker_cli):
4040
self.dependencyDir = 'dependency'
4141

4242

43-
def build_single_component(self, service):
43+
def build_single_component(self, service, imagelist=None):
4444

4545
self.logger.info("Starts to build {0}".format(service.service_name))
4646

@@ -53,7 +53,8 @@ def build_single_component(self, service):
5353
for dockerfile_prefix in service.docker_files:
5454
image_name = os.path.splitext(dockerfile_prefix)[0]
5555
dockerfile = os.path.join(service.path, 'build/' + dockerfile_prefix + '.dockerfile')
56-
self.docker_cli.docker_image_build(image_name, dockerfile, service.path)
56+
if imagelist is None or image_name in imagelist:
57+
self.docker_cli.docker_image_build(image_name, dockerfile, service.path)
5758

5859
post_build = os.path.join(service.path, self.build_post)
5960
if os.path.exists(post_build):

build/pai_build.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def load_build_config(config_dir):
3939

4040

4141
def build_service(args, config_model):
42-
pai_build = build_center.BuildCenter(config_model, args.service, 'k8s')
42+
pai_build = build_center.BuildCenter(config_model, args.service, 'k8s', args)
4343
pai_build.set_build_cache_type(args.nocache)
4444
pai_build.build_center()
4545

@@ -98,6 +98,13 @@ def main():
9898
action='store_true',
9999
help="Build the service using cache or not"
100100
)
101+
build_parser.add_argument(
102+
'-i', '--imagelist',
103+
type=str,
104+
nargs='+',
105+
default=None,
106+
help="The image list you want to build"
107+
)
101108
build_parser.set_defaults(func=build_service)
102109

103110
# Push commands

contrib/aks/scripts/config-ipoib.sh

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,21 @@
55

66
set -x
77

8-
DEBIAN_FRONTEND=noninteractive apt-get update -y
9-
DEBIAN_FRONTEND=noninteractive apt-get install -y network-manager net-tools rsync || echo "Failed in apt install"
10-
#######################################
# Wait until the dpkg frontend lock is free and no package-manager process is
# running, then run the given command.
# Arguments: the command to run, followed by its arguments
# Outputs:   a diagnostic on stderr if the wait does not finish within 300s
# Returns:   the exit status of the command; exits the calling shell with 124
#            when the wait loop does not complete successfully
#######################################
wait_for_dpkg_lock() {
  # The polling loop runs in a child bash so that `timeout` can bound it.
  # pgrep -x compares against the kernel process name, which is capped at 15
  # characters, so unattended-upgrades has to be spelled in its truncated form
  # ("unattended-upgr") or it can never match. pgrep's stderr is silenced
  # because it warns about any pattern longer than 15 characters.
  if ! timeout 300 bash -c \
    'while sudo fuser /var/lib/dpkg/lock-frontend >/dev/null 2>&1 \
        || pgrep -x "apt|apt-get|dpkg|unattended-upgr" >/dev/null 2>&1; do
      sleep 3
    done'
  then
    echo "Timed out waiting for dpkg lock." >&2
    exit 124
  fi
  # Run the command in a fresh bash process, preserving argument boundaries.
  bash -c 'exec "$@"' -- "$@"
}
20+
21+
wait_for_dpkg_lock bash -c 'DEBIAN_FRONTEND=noninteractive apt-get update -y'
22+
wait_for_dpkg_lock bash -c 'DEBIAN_FRONTEND=noninteractive apt-get install -y network-manager net-tools rsync || echo "Failed in apt install"'
1123

1224
# rename
1325
INIT=0

0 commit comments

Comments
 (0)