Skip to content

Commit c1913be

Browse files
authored
Merge pull request #1514 from FedML-AI/test/v0.7.0
Test/v0.7.0
2 parents 58d4b2b + c13799e commit c1913be

File tree

8 files changed

+23
-12
lines changed

8 files changed

+23
-12
lines changed

python/fedml/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,7 @@
3434
_global_training_type = None
3535
_global_comm_backend = None
3636

37-
__version__ = "0.8.9a2"
37+
__version__ = "0.8.9a3"
3838

3939

4040
# This is the deployment environment used for different roles (RD/PM/BD/Public Developers). Potential VALUE: local, dev, test, release

python/fedml/computing/scheduler/master/server_runner.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@
2323

2424
import requests
2525

26+
import fedml
2627
from ..scheduler_core.scheduler_matcher import SchedulerMatcher
2728
from ..comm_utils.constants import SchedulerConstants
2829
from ..comm_utils.job_utils import JobRunnerUtils
@@ -1058,7 +1059,7 @@ def ota_upgrade(self, payload, request_json):
10581059
pass
10591060

10601061
if force_ota and ota_version is not None:
1061-
should_upgrade = True
1062+
should_upgrade = True if ota_version != fedml.__version__ else False
10621063
upgrade_version = ota_version
10631064
else:
10641065
try:

python/fedml/computing/scheduler/model_scheduler/device_client_runner.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,8 @@
1818
from urllib.parse import urlparse
1919

2020
import requests
21+
22+
import fedml
2123
from fedml import mlops
2224
from fedml.computing.scheduler.model_scheduler.device_model_msg_object import FedMLModelMsgObject
2325
from fedml.core.distributed.communication.s3.remote_storage import S3Storage
@@ -348,6 +350,7 @@ def run_impl(self):
348350

349351
# download model net and load into the torch model
350352
model_from_open = None
353+
self.model_is_from_open = None
351354
if self.model_is_from_open:
352355
logging.info("process the model net from open...")
353356
self.check_runner_stop_event()
@@ -562,7 +565,7 @@ def ota_upgrade(self, payload, request_json):
562565
pass
563566

564567
if force_ota and ota_version is not None:
565-
should_upgrade = True
568+
should_upgrade = True if ota_version != fedml.__version__ else False
566569
upgrade_version = ota_version
567570
else:
568571
try:

python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -174,7 +174,7 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version,
174174
logging.info("LLM model loaded from the open")
175175
else:
176176
raise Exception("Unsupported inference engine type: {}".format(inference_engine))
177-
elif model_is_from_open == False:
177+
elif model_is_from_open == False or model_is_from_open is None:
178178
model_location = os.path.join(model_storage_local_path, "fedml_model.bin")
179179
try:
180180
model = torch.jit.load(model_location)
@@ -188,6 +188,8 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version,
188188
config = yaml.safe_load(file)
189189
# Resource related
190190
use_gpu = config.get('use_gpu', False)
191+
usr_indicated_wait_time = config.get('deploy_timeout', 100)
192+
usr_indicated_retry_cnt = max(int(usr_indicated_wait_time) // 10, 1)
191193
inference_image_name = config.get('inference_image_name',
192194
ClientConstants.INFERENCE_SERVER_CUSTOME_IMAGE)
193195
# Source code dir, bootstrap dir, data cache dir
@@ -374,7 +376,8 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version,
374376
# Logging the info from the container
375377
log_deployment_result(end_point_id, model_id, default_server_container_name,
376378
ClientConstants.CMD_TYPE_RUN_DEFAULT_SERVER,
377-
running_model_name, inference_engine, inference_http_port, inference_type="default")
379+
running_model_name, inference_engine, inference_http_port, inference_type="default",
380+
retry_interval=10, deploy_attempt_threshold=usr_indicated_retry_cnt)
378381

379382
# Check if the inference server is ready
380383
inference_output_url, running_model_version, ret_model_metadata, ret_model_config = \
@@ -542,10 +545,9 @@ def should_exit_logs(end_point_id, model_id, cmd_type, model_name, inference_eng
542545

543546
def log_deployment_result(end_point_id, model_id, cmd_container_name, cmd_type,
544547
inference_model_name, inference_engine,
545-
inference_http_port, inference_type="default"):
548+
inference_http_port, inference_type="default",
549+
retry_interval=10, deploy_attempt_threshold=10):
546550
deploy_attempt = 0
547-
retry_interval = 10
548-
deploy_attempt_threshold = 10
549551
last_out_logs = ""
550552
last_err_logs = ""
551553

python/fedml/computing/scheduler/model_scheduler/device_server_runner.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@
2121
import requests
2222
import torch
2323

24+
import fedml
2425
from ..comm_utils import sys_utils
2526
from .device_server_data_interface import FedMLServerDataInterface
2627
from ....core.mlops.mlops_runtime_log import MLOpsRuntimeLog
@@ -693,7 +694,7 @@ def ota_upgrade(self, payload, request_json):
693694
pass
694695

695696
if force_ota and ota_version is not None:
696-
should_upgrade = True
697+
should_upgrade = True if ota_version != fedml.__version__ else False
697698
upgrade_version = ota_version
698699
else:
699700
try:

python/fedml/computing/scheduler/slave/client_runner.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -832,7 +832,7 @@ def ota_upgrade(self, payload, request_json):
832832
pass
833833

834834
if force_ota and ota_version is not None:
835-
should_upgrade = True
835+
should_upgrade = True if ota_version != fedml.__version__ else False
836836
upgrade_version = ota_version
837837
else:
838838
try:

python/fedml/core/distributed/communication/s3/remote_storage.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -308,11 +308,15 @@ def read_model_net_progress(bytes_transferred):
308308
)
309309

310310
unpickle_start_time = time.time()
311+
model = None
311312
try:
312313
model = torch.jit.load(temp_file_path)
313314
except Exception as e:
314315
logging.info("jit.load failed")
315-
model = torch.load(temp_file_path, pickle_module=dill)
316+
try:
317+
model = torch.load(temp_file_path, pickle_module=dill)
318+
except Exception as e:
319+
logging.info("torch.load failed")
316320
os.remove(temp_file_path)
317321
MLOpsProfilerEvent.log_to_wandb(
318322
{"UnpickleTime": time.time() - unpickle_start_time}

python/setup.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -94,7 +94,7 @@ def finalize_options(self):
9494

9595
setup(
9696
name="fedml",
97-
version="0.8.9a2",
97+
version="0.8.9a3",
9898
author="FedML Team",
9999
author_email="[email protected]",
100100
description="A research and production integrated edge-cloud library for "

0 commit comments

Comments
 (0)