|
49 | 49 | import shutil |
50 | 50 | import sys |
51 | 51 | import typing |
52 | | -from typing import Set |
| 52 | +from typing import List, Optional, Set |
53 | 53 | import uuid |
54 | 54 |
|
55 | 55 | import filelock |
|
114 | 114 | CURRENT_HASH = os.path.expanduser('~/.sky/wheels/current_sky_wheel_hash') |
115 | 115 |
|
116 | 116 |
|
def _parse_controller_pid_entry(
        entry: str) -> Optional[state.ControllerPidRecord]:
    """Parse one line of the controller PID file into a record.

    Accepted formats are ``<pid>,<started_at>`` (integer PID, float start
    timestamp) and, for backwards compatibility, a bare ``<pid>``.

    Args:
        entry: A single line read from the controller PID file.

    Returns:
        A ``state.ControllerPidRecord``, or None if the line is blank, has
        an unknown number of fields, or does not carry a positive integer
        PID. A missing or non-numeric ``started_at`` is stored as None.
    """
    entry = entry.strip()
    if not entry:
        return None
    entry_parts = entry.split(',')
    if len(entry_parts) == 2:
        raw_pid, raw_started_at = entry_parts
    elif len(entry_parts) == 1:
        # Backwards compatibility, pre-#7847
        # TODO(cooperc): Remove for 0.13.0
        raw_pid = entry_parts[0]
        raw_started_at = None
    else:
        # Unknown format
        return None

    try:
        pid = int(raw_pid)
    except ValueError:
        return None
    if pid <= 0:
        # Zero and negative values never identify a single process (in
        # kill(2) they address process groups), so treat them as a corrupt
        # entry rather than handing them to liveness checks.
        return None

    started_at: Optional[float] = None
    if raw_started_at:
        try:
            started_at = float(raw_started_at)
        except ValueError:
            # Keep the PID but drop the unparseable timestamp.
            started_at = None
    return state.ControllerPidRecord(pid=pid, started_at=started_at)
| 149 | + |
| 150 | + |
def get_controller_process_records(
) -> Optional[List[state.ControllerPidRecord]]:
    """Return the controller processes recorded in the PID file.

    Returns:
        A list of parsed records (lines that fail to parse are skipped);
        an empty list if the PID file does not exist; None if the file
        cannot be read or decoded, so callers can avoid acting on
        incomplete information.
    """
    # Open directly instead of checking os.path.exists() first: the file may
    # be removed between the check and the open, and a missing file must
    # always map to "no controllers", never to "unreadable".
    try:
        with open(JOB_CONTROLLER_PID_PATH, 'r', encoding='utf-8') as f:
            lines = f.read().splitlines()
    except FileNotFoundError:
        # If the file doesn't exist, it means the controller server is not
        # running, so we return an empty list
        return []
    except (OSError, UnicodeDecodeError):
        # Unreadable or corrupted (non-UTF-8) file.
        return None

    parsed = (_parse_controller_pid_entry(line) for line in lines)
    return [record for record in parsed if record is not None]
| 170 | + |
| 171 | + |
def _append_controller_pid_record(pid: int,
                                  started_at: Optional[float]) -> None:
    """Append one controller process entry to the controller PID file.

    The line written is ``<pid>,<started_at>``, or just ``<pid>`` when
    ``started_at`` is None.

    Args:
        pid: Process ID of the controller.
        started_at: Start timestamp of the process, or None if unknown.
    """
    # Note: started_at is a float, but converting to a string will not lose any
    # precision. See https://docs.python.org/3/tutorial/floatingpoint.html and
    # https://github.com/python/cpython/issues/53583
    if started_at is None:
        line = f'{pid}'
    else:
        line = f'{pid},{started_at}'
    with open(JOB_CONTROLLER_PID_PATH, 'a', encoding='utf-8') as pid_file:
        pid_file.write(f'{line}\n')
| 180 | + |
| 181 | + |
117 | 182 | @annotations.lru_cache(scope='global') |
118 | 183 | def get_number_of_controllers() -> int: |
119 | 184 | """Returns the number of controllers that should be running. |
@@ -180,36 +245,21 @@ def start_controller() -> None: |
180 | 245 | logger.info(f'Running controller with command: {run_cmd}') |
181 | 246 |
|
182 | 247 | pid = subprocess_utils.launch_new_process_tree(run_cmd, log_output=log_path) |
183 | | - with open(JOB_CONTROLLER_PID_PATH, 'a', encoding='utf-8') as f: |
184 | | - f.write(str(pid) + '\n') |
| 248 | + pid_started_at = psutil.Process(pid).create_time() |
| 249 | + _append_controller_pid_record(pid, pid_started_at) |
185 | 250 |
|
186 | 251 |
|
def get_alive_controllers() -> Optional[int]:
    """Count the recorded controller processes that are still alive.

    Returns:
        The number of records for which
        ``managed_job_utils.controller_process_alive`` is truthy (0 when
        there are no records), or None when the records could not be read.
    """
    records = get_controller_process_records()
    if records is None:
        # If we cannot read the file reliably, avoid starting extra controllers.
        return None
    # An empty record list naturally sums to 0.
    return sum(
        1 for record in records
        if managed_job_utils.controller_process_alive(record, quiet=False))
215 | 265 |
|
@@ -280,10 +330,11 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str, |
280 | 330 |
|
281 | 331 | The user hash should be set (e.g. via SKYPILOT_USER_ID) before calling this. |
282 | 332 | """ |
283 | | - controller_pid = state.get_job_controller_pid(job_id) |
284 | | - if controller_pid is not None: |
| 333 | + controller_process = state.get_job_controller_process(job_id) |
| 334 | + if controller_process is not None: |
285 | 335 | # why? TODO(cooperc): figure out why this is needed, fix it, and remove |
286 | | - if managed_job_utils.controller_process_alive(controller_pid, job_id): |
| 336 | + if managed_job_utils.controller_process_alive(controller_process, |
| 337 | + job_id): |
287 | 338 | # This can happen when HA recovery runs for some reason but the job |
288 | 339 | # controller is still alive. |
289 | 340 | logger.warning(f'Job {job_id} is still alive, skipping submission') |
|
0 commit comments