
Commit 58d4b2b

Merge pull request #1510 from FedML-AI/test/v0.7.0
Test/v0.7.0
2 parents 96cea2b + f70b250 commit 58d4b2b

9 files changed: 50 additions and 47 deletions

python/fedml/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@
 _global_training_type = None
 _global_comm_backend = None
 
-__version__ = "0.8.8"
+__version__ = "0.8.9a2"
 
 
 # This is the deployment environment used for different roles (RD/PM/BD/Public Developers). Potential VALUE: local, dev, test, release
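Both this bump and the matching one in python/setup.py below move the package to "0.8.9a2", a PEP 440 alpha pre-release: pip skips it during normal resolution unless --pre is passed or the version is pinned exactly. A quick check, assuming the packaging library is installed:

from packaging.version import Version

v = Version("0.8.9a2")
assert v.is_prerelease and v.pre == ("a", 2)   # alpha, second iteration
assert Version("0.8.8") < v < Version("0.8.9")  # sorts between the two releases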

python/fedml/computing/scheduler/master/server_api.py

Lines changed: 2 additions & 2 deletions
@@ -39,8 +39,8 @@ async def get_history_job_status(request: Request):
     responses = list()
     history_jobs = FedMLServerDataInterface.get_instance().get_history_jobs()
     for job_item in history_jobs.job_list:
-        response = {"jobId": f"{history_jobs.job_id}_{history_jobs.edge_id}",
-                    "originalJobId": history_jobs.job_id,
+        response = {"jobId": f"{job_item.job_id}_{job_item.edge_id}",
+                    "originalJobId": job_item.job_id,
                     "edgeId": job_item.edge_id,
                     "startedTime": int(float(job_item.started_time)) if job_item.started_time != "" else 0,
                     "endedTime": int(float(job_item.ended_time)) if job_item.ended_time != "" else 0,

python/fedml/computing/scheduler/master/server_runner.py

Lines changed: 39 additions & 36 deletions
@@ -957,45 +957,47 @@ def send_training_request_to_edges(self, active_edge_info_dict=None):
         run_config = self.request_json.get("run_config", {})
         run_params = run_config.get("parameters", {})
         job_yaml = run_params.get("job_yaml", {})
+        job_yaml_default_none = run_params.get("job_yaml", None)
         computing = job_yaml.get("computing", {})
         request_num_gpus = computing.get("minimum_num_gpus", None)
 
         logging.info("Send training request to Edge ids: " + str(edge_id_list))
 
-        SchedulerMatcher.parse_and_print_gpu_info_for_all_edges(active_edge_info_dict, show_gpu_list=True)
-
-        # Match and assign gpus to each device
-        assigned_gpu_num_dict, assigned_gpu_ids_dict = SchedulerMatcher.match_and_assign_gpu_resources_to_devices(
-            request_num_gpus, edge_id_list, active_edge_info_dict)
-        if assigned_gpu_num_dict is None or assigned_gpu_ids_dict is None:
-            # If no resources available, send failed message to MLOps and send exception message to all edges.
-            gpu_count, gpu_available_count = SchedulerMatcher.parse_and_print_gpu_info_for_all_edges(
-                active_edge_info_dict, should_print=True)
-            logging.error(f"No resources available."
-                          f"Total available GPU count {gpu_available_count} is less than "
-                          f"request GPU count {request_num_gpus}")
-            self.mlops_metrics.report_server_id_status(
-                run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, edge_id=self.edge_id,
-                server_id=self.edge_id, server_agent_id=self.server_agent_id)
-            self.send_exit_train_with_exception_request_to_edges(edge_id_list, json.dumps(self.request_json))
-            return
+        if job_yaml_default_none is not None and request_num_gpus is not None:
+            SchedulerMatcher.parse_and_print_gpu_info_for_all_edges(active_edge_info_dict, show_gpu_list=True)
+
+            # Match and assign gpus to each device
+            assigned_gpu_num_dict, assigned_gpu_ids_dict = SchedulerMatcher.match_and_assign_gpu_resources_to_devices(
+                request_num_gpus, edge_id_list, active_edge_info_dict)
+            if assigned_gpu_num_dict is None or assigned_gpu_ids_dict is None:
+                # If no resources available, send failed message to MLOps and send exception message to all edges.
+                gpu_count, gpu_available_count = SchedulerMatcher.parse_and_print_gpu_info_for_all_edges(
+                    active_edge_info_dict, should_print=True)
+                logging.error(f"No resources available."
+                              f"Total available GPU count {gpu_available_count} is less than "
+                              f"request GPU count {request_num_gpus}")
+                self.mlops_metrics.report_server_id_status(
+                    run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, edge_id=self.edge_id,
+                    server_id=self.edge_id, server_agent_id=self.server_agent_id)
+                self.send_exit_train_with_exception_request_to_edges(edge_id_list, json.dumps(self.request_json))
+                return
 
-        # Generate master node addr and port
-        master_node_addr, master_node_port = SchedulerMatcher.get_master_node_info(edge_id_list, active_edge_info_dict)
-
-        # Generate new edge id list after matched
-        edge_id_list = SchedulerMatcher.generate_new_edge_list_for_gpu_matching(assigned_gpu_num_dict)
-        if len(edge_id_list) <= 0:
-            gpu_count, gpu_available_count = SchedulerMatcher.parse_and_print_gpu_info_for_all_edges(
-                active_edge_info_dict, should_print=True)
-            logging.error(f"Request parameter for GPU num is invalid."
-                          f"Total available GPU count {gpu_available_count}."
-                          f"Request GPU num {request_num_gpus}")
-            self.mlops_metrics.report_server_id_status(
-                run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, edge_id=self.edge_id,
-                server_id=self.edge_id, server_agent_id=self.server_agent_id)
-            self.send_exit_train_with_exception_request_to_edges(edge_id_list, json.dumps(self.request_json))
-            return
+            # Generate master node addr and port
+            master_node_addr, master_node_port = SchedulerMatcher.get_master_node_info(edge_id_list, active_edge_info_dict)
+
+            # Generate new edge id list after matched
+            edge_id_list = SchedulerMatcher.generate_new_edge_list_for_gpu_matching(assigned_gpu_num_dict)
+            if len(edge_id_list) <= 0:
+                gpu_count, gpu_available_count = SchedulerMatcher.parse_and_print_gpu_info_for_all_edges(
+                    active_edge_info_dict, should_print=True)
+                logging.error(f"Request parameter for GPU num is invalid."
+                              f"Total available GPU count {gpu_available_count}."
+                              f"Request GPU num {request_num_gpus}")
+                self.mlops_metrics.report_server_id_status(
+                    run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, edge_id=self.edge_id,
+                    server_id=self.edge_id, server_agent_id=self.server_agent_id)
+                self.send_exit_train_with_exception_request_to_edges(edge_id_list, json.dumps(self.request_json))
+                return
 
         client_rank = 1
         for edge_id in edge_id_list:
@@ -1005,9 +1007,10 @@ def send_training_request_to_edges(self, active_edge_info_dict=None):
             request_json["client_rank"] = client_rank
             client_rank += 1
 
-            request_json["scheduler_match_info"] = SchedulerMatcher.generate_match_info_for_scheduler(
-                edge_id, edge_id_list, master_node_addr, master_node_port, assigned_gpu_num_dict, assigned_gpu_ids_dict
-            )
+            if job_yaml_default_none is not None and request_num_gpus is not None:
+                request_json["scheduler_match_info"] = SchedulerMatcher.generate_match_info_for_scheduler(
+                    edge_id, edge_id_list, master_node_addr, master_node_port, assigned_gpu_num_dict, assigned_gpu_ids_dict
+                )
 
             self.client_mqtt_mgr.send_message(topic_start_train, json.dumps(request_json))
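The net effect: GPU matching becomes opt-in. When the request carries no job YAML at all (run_params.get("job_yaml", None) is None) or no minimum_num_gpus, the scheduler now skips GPU matching and the scheduler_match_info attachment entirely instead of running the matcher and failing the run. A minimal sketch of the guard, with a hypothetical helper name, assuming the request layout shown in the diff:

def extract_gpu_request(request_json: dict):
    """Return minimum_num_gpus, or None when no GPU scheduling was requested."""
    run_params = request_json.get("run_config", {}).get("parameters", {})
    # Distinguish "no job_yaml at all" from "job_yaml present but empty".
    job_yaml = run_params.get("job_yaml", None)
    if job_yaml is None:
        return None
    return job_yaml.get("computing", {}).get("minimum_num_gpus", None)

# Requests without a job spec (or without a GPU ask) bypass matching.
assert extract_gpu_request({}) is None
assert extract_gpu_request(
    {"run_config": {"parameters": {"job_yaml": {"computing": {"minimum_num_gpus": 2}}}}}
) == 2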

python/fedml/computing/scheduler/model_scheduler/device_model_cards.py

Lines changed: 1 addition & 1 deletion
@@ -550,7 +550,7 @@ def local_serve_model(self, model_name):
         import subprocess
         all_env_vars = os.environ.copy()
         for k, v in new_environment_vars.items():
-            all_env_vars[k] = v
+            all_env_vars[k] = str(v)
 
         print(f"Entering the main entry file {main_entry_file} ...")
 
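The str() coercion matters because subprocess requires every value in the env mapping to be a string; a stray non-string value (say, a port number as an int) raises TypeError before the child process is even spawned. A minimal sketch of the failure and the fix, with hypothetical variable values, assuming a POSIX env binary:

import os
import subprocess

new_environment_vars = {"MODEL_PORT": 8080}  # hypothetical: an int slips in

broken = os.environ.copy()
broken.update(new_environment_vars)
try:
    subprocess.run(["env"], env=broken, check=True, capture_output=True)
except TypeError as e:
    print(f"fails without coercion: {e}")

# Fixed, as in the patch: coerce every value to str first.
fixed = os.environ.copy()
for k, v in new_environment_vars.items():
    fixed[k] = str(v)
subprocess.run(["env"], env=fixed, check=True, capture_output=True)  # succeeds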

python/fedml/computing/scheduler/model_scheduler/device_model_deployment.py

Lines changed: 1 addition & 1 deletion
@@ -182,7 +182,7 @@ def start_deployment(end_point_id, end_point_name, model_id, model_version,
     except Exception as e:
         logging.info(
             "Cannot locate the .bin file, will read it from"
-            " the fedml_model_cofig.yaml with the key [local_model_dir] ")
+            " the fedml_model_config.yaml with the key [local_model_dir] ")
     model_config_path = os.path.join(model_storage_local_path, "fedml_model_config.yaml")
     with open(model_config_path, 'r') as file:
         config = yaml.safe_load(file)
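For context, the fallback this (now correctly spelled) message describes reads the serving directory out of the model's config file. A minimal sketch of that read, using a hypothetical path and a config containing only the key the code looks up:

import os
import yaml

model_storage_local_path = "/tmp/model"  # hypothetical location
os.makedirs(model_storage_local_path, exist_ok=True)
cfg_path = os.path.join(model_storage_local_path, "fedml_model_config.yaml")
with open(cfg_path, "w") as f:
    f.write("local_model_dir: /tmp/model/weights\n")

with open(cfg_path, "r") as f:
    config = yaml.safe_load(f)
print(config["local_model_dir"])  # -> /tmp/model/weights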

python/fedml/computing/scheduler/model_scheduler/device_server_runner.py

Lines changed: 2 additions & 2 deletions
@@ -182,8 +182,8 @@ def update_local_fedml_config(self, run_id, run_config):
 
     def get_usr_indicated_token(self, request_json) -> str:
         usr_indicated_token = ""
-        if "parameters" in request_json and "inference_token" in request_json["parameters"]:
-            usr_indicated_token = request_json["parameters"]["inference_token"]
+        if "parameters" in request_json and "authentication_token" in request_json["parameters"]:
+            usr_indicated_token = request_json["parameters"]["authentication_token"]
         return usr_indicated_token
 
     def build_dynamic_args(self, run_config, package_conf_object, base_dir):
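The change renames the expected request key from inference_token to authentication_token; the lookup logic itself is untouched. As a design note, the same contract can be expressed more compactly with chained dict.get calls, shown here as a sketch rather than the FedML implementation:

def get_usr_indicated_token(request_json: dict) -> str:
    # Returns "" when either "parameters" or "authentication_token" is absent.
    return request_json.get("parameters", {}).get("authentication_token", "")

assert get_usr_indicated_token({}) == ""
assert get_usr_indicated_token({"parameters": {"authentication_token": "t0k3n"}}) == "t0k3n"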

python/fedml/computing/scheduler/scheduler_core/scheduler_matcher.py

Lines changed: 1 addition & 1 deletion
@@ -78,7 +78,7 @@ def match_and_assign_gpu_resources_to_devices(request_gpu_num, edge_id_list, act
             total_available_gpu_count += gpu_available_count
 
         # Check if total available gpu count is less than request gpu num
-        request_gpu_num = 0 if request_gpu_num < 0 else request_gpu_num
+        request_gpu_num = 0 if request_gpu_num is None or request_gpu_num < 0 else request_gpu_num
         if total_available_gpu_count < request_gpu_num:
             return None, None
 
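The is None guard is load-bearing: in Python 3, None < 0 raises TypeError, so a request with no GPU count previously crashed the matcher at this line. Because or short-circuits, the None test now runs before the comparison ever does. Illustrated with a hypothetical standalone helper:

def normalize_gpu_request(request_gpu_num):
    # `or` short-circuits: the None check fires before the `<` comparison
    # that would raise TypeError on None.
    return 0 if request_gpu_num is None or request_gpu_num < 0 else request_gpu_num

assert normalize_gpu_request(None) == 0
assert normalize_gpu_request(-3) == 0
assert normalize_gpu_request(4) == 4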

python/fedml/computing/scheduler/slave/client_api.py

Lines changed: 2 additions & 2 deletions
@@ -39,8 +39,8 @@ async def get_history_job_status(request: Request):
     responses = list()
    history_jobs = FedMLClientDataInterface.get_instance().get_history_jobs()
     for job_item in history_jobs.job_list:
-        response = {"jobId": f"{history_jobs.job_id}_{history_jobs.edge_id}",
-                    "originalJobId": history_jobs.job_id,
+        response = {"jobId": f"{job_item.job_id}_{job_item.edge_id}",
+                    "originalJobId": job_item.job_id,
                     "edgeId": job_item.edge_id,
                     "startedTime": int(float(job_item.started_time)) if job_item.started_time != "" else 0,
                     "endedTime": int(float(job_item.ended_time)) if job_item.ended_time != "" else 0,

python/setup.py

Lines changed: 1 addition & 1 deletion
@@ -94,7 +94,7 @@ def finalize_options(self):
 
 setup(
     name="fedml",
-    version="0.8.8",
+    version="0.8.9a2",
     author="FedML Team",
     author_email="[email protected]",
     description="A research and production integrated edge-cloud library for "
