@@ -6,9 +6,9 @@ permissions:
66
77on :
88 push :
9- branches : [main, dev, ' release/*' ]
9+ branches : [main, dev, " release/*" ]
1010 pull_request :
11- branches : [main, dev, ' release/*' ]
11+ branches : [main, dev, " release/*" ]
1212
1313env :
1414 TAG : ${{ github.run_number }}
5555 changed_files=$(git diff --name-only "$base_sha" "$head_sha")
5656 echo "Changed files: $changed_files"
5757
58- # extract service folders under src/, skip alert-manager
58+ # extract service folders under src/
5959 folders=$(echo "$changed_files" | grep '^src/' \
60- | grep -v 'alert-manager' \
6160 | awk -F'/' '{print $2}' \
6261 | sort -u | tr '\n' ' ')
6362 echo "Changed folders: $folders"
@@ -104,6 +103,18 @@ jobs:
104103 run : |
105104 changed_services="${{ steps.changes.outputs.folders }}"
106105 echo "Building: $changed_services"
106+ if [[ "$changed_services" == *"alert-manager"* ]]; then
107+ echo "alert-manager is in the changed services"
108+ changed_services=$(echo $changed_services | sed 's/alert-manager//g')
109+ # build only the alert-manager images listed in -i; every continued line must end with a backslash so -s and -i reach the same command
110+ echo "Building specific alert-manager images"
111+ $GITHUB_WORKSPACE/build/pai_build.py build \
112+ -c $GITHUB_WORKSPACE/config/cluster-configuration \
113+ -s alert-manager \
114+ -i abnormal-detector,alert-handler,alert-parser,cert-expiration-checker,cluster-utilization,job-data-recorder,job-status-change-notification,node-failure-detection,node-issue-classifier,nvidia-gpu-low-perf-fixer,redis-monitoring
115+ fi
116+
117+ echo "Changed services after removing alert-manager: $changed_services"
107118 $GITHUB_WORKSPACE/build/pai_build.py build \
108119 -c $GITHUB_WORKSPACE/config/cluster-configuration \
109120 -s $changed_services
@@ -122,6 +133,24 @@ jobs:
122133 run : |
123134 changed_services="${{ steps.changes.outputs.folders }}"
124135 echo "Pushing: $changed_services"
136+ # alert-manager is pushed separately below with an explicit image list (-i), so it is stripped from the generic service list first
137+ echo "Changed services before removing alert-manager: $changed_services"
138+ if [[ "$changed_services" == *"alert-manager"* ]]; then
139+ echo "alert-manager is in the changed services"
140+ changed_services=$(echo $changed_services | sed 's/alert-manager//g')
141+ # push only the alert-manager images named in -i to GHCR (ghcr.io)
142+ echo "Pushing specific alert-manager images to GHCR"
143+ $GITHUB_WORKSPACE/build/pai_build.py push \
144+ -c $GITHUB_WORKSPACE/config/cluster-configuration \
145+ -s alert-manager \
146+ -i abnormal-detector,alert-handler,alert-parser,cert-expiration-checker,cluster-utilization,job-data-recorder,job-status-change-notification,node-failure-detection,node-issue-classifier,nvidia-gpu-low-perf-fixer,redis-monitoring \
147+ --docker-registry ghcr.io \
148+ --docker-namespace ${GITHUB_REPOSITORY_OWNER} \
149+ --docker-username ${{ github.actor }} \
150+ --docker-password ${{ secrets.GITHUB_TOKEN }}
151+ fi
152+
153+ echo "Changed services after removing alert-manager: $changed_services"
125154 $GITHUB_WORKSPACE/build/pai_build.py push \
126155 -c $GITHUB_WORKSPACE/config/cluster-configuration \
127156 -s $changed_services \
@@ -133,24 +162,24 @@ jobs:
133162 - name : Azure CLI get credentials and deploy
134163 if : steps.check.outputs.has_changed == 'true'
135164 run : |
136- az version
137- az login --identity --client-id ${{ secrets.AZURE_MANAGED_IDENTITY_CLIENT_ID }}
138- az aks install-cli
139- az aks get-credentials \
140- --resource-group ${{ secrets.AZURE_RESOURCE_GROUP }} \
141- --name ${{ secrets.KUBERNETES_CLUSTER }} \
142- --overwrite-existing
143- kubelogin convert-kubeconfig -l azurecli
144- kubectl config use-context ${{ secrets.KUBERNETES_CLUSTER }}
145- echo "${{ secrets.PAI_CLUSTER_NAME }}" > cluster_id
146- echo "Stopping changed pai services \"${{ steps.changes.outputs.folders }}\" on ${{ secrets.PAI_CLUSTER_NAME }} ..."
147- $GITHUB_WORKSPACE/paictl.py service stop -n ${{ steps.changes.outputs.folders }} < cluster_id
148- echo "Pushing config to cluster \"${{ secrets.PAI_CLUSTER_NAME }}\" ..."
149- $GITHUB_WORKSPACE/paictl.py config push -m service -p $GITHUB_WORKSPACE/config/cluster-configuration < cluster_id
150- echo "Starting to update \"${{ steps.changes.outputs.folders }}\" on ${{ secrets.PAI_CLUSTER_NAME }} ..."
151- $GITHUB_WORKSPACE/paictl.py service start -n ${{ steps.changes.outputs.folders }} < cluster_id
152- kubectl get pod
153- kubectl get service
165+ az version
166+ az login --identity --client-id ${{ secrets.AZURE_MANAGED_IDENTITY_CLIENT_ID }}
167+ az aks install-cli
168+ az aks get-credentials \
169+ --resource-group ${{ secrets.AZURE_RESOURCE_GROUP }} \
170+ --name ${{ secrets.KUBERNETES_CLUSTER }} \
171+ --overwrite-existing
172+ kubelogin convert-kubeconfig -l azurecli
173+ kubectl config use-context ${{ secrets.KUBERNETES_CLUSTER }}
174+ echo "${{ secrets.PAI_CLUSTER_NAME }}" > cluster_id
175+ echo "Stopping changed pai services \"${{ steps.changes.outputs.folders }}\" on ${{ secrets.PAI_CLUSTER_NAME }} ..."
176+ $GITHUB_WORKSPACE/paictl.py service stop -n ${{ steps.changes.outputs.folders }} < cluster_id
177+ echo "Pushing config to cluster \"${{ secrets.PAI_CLUSTER_NAME }}\" ..."
178+ $GITHUB_WORKSPACE/paictl.py config push -m service -p $GITHUB_WORKSPACE/config/cluster-configuration < cluster_id
179+ echo "Starting to update \"${{ steps.changes.outputs.folders }}\" on ${{ secrets.PAI_CLUSTER_NAME }} ..."
180+ $GITHUB_WORKSPACE/paictl.py service start -n ${{ steps.changes.outputs.folders }} < cluster_id
181+ kubectl get pod
182+ kubectl get service
154183
155184 test :
156185 name : Test rest-server
@@ -169,4 +198,3 @@ jobs:
169198 exit 1
170199 fi
171200 echo "Virtual cluster info: $vc_info"
172-
0 commit comments