@@ -957,45 +957,47 @@ def send_training_request_to_edges(self, active_edge_info_dict=None):
         run_config = self.request_json.get("run_config", {})
         run_params = run_config.get("parameters", {})
         job_yaml = run_params.get("job_yaml", {})
+        job_yaml_default_none = run_params.get("job_yaml", None)
         computing = job_yaml.get("computing", {})
         request_num_gpus = computing.get("minimum_num_gpus", None)

         logging.info("Send training request to Edge ids: " + str(edge_id_list))

-        SchedulerMatcher.parse_and_print_gpu_info_for_all_edges(active_edge_info_dict, show_gpu_list=True)
-
-        # Match and assign gpus to each device
-        assigned_gpu_num_dict, assigned_gpu_ids_dict = SchedulerMatcher.match_and_assign_gpu_resources_to_devices(
-            request_num_gpus, edge_id_list, active_edge_info_dict)
-        if assigned_gpu_num_dict is None or assigned_gpu_ids_dict is None:
-            # If no resources available, send failed message to MLOps and send exception message to all edges.
-            gpu_count, gpu_available_count = SchedulerMatcher.parse_and_print_gpu_info_for_all_edges(
-                active_edge_info_dict, should_print=True)
-            logging.error(f"No resources available."
-                          f"Total available GPU count {gpu_available_count} is less than "
-                          f"request GPU count {request_num_gpus}")
-            self.mlops_metrics.report_server_id_status(
-                run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, edge_id=self.edge_id,
-                server_id=self.edge_id, server_agent_id=self.server_agent_id)
-            self.send_exit_train_with_exception_request_to_edges(edge_id_list, json.dumps(self.request_json))
-            return
+        if job_yaml_default_none is not None and request_num_gpus is not None:
+            SchedulerMatcher.parse_and_print_gpu_info_for_all_edges(active_edge_info_dict, show_gpu_list=True)
+
+            # Match and assign gpus to each device
+            assigned_gpu_num_dict, assigned_gpu_ids_dict = SchedulerMatcher.match_and_assign_gpu_resources_to_devices(
+                request_num_gpus, edge_id_list, active_edge_info_dict)
+            if assigned_gpu_num_dict is None or assigned_gpu_ids_dict is None:
+                # If no resources available, send failed message to MLOps and send exception message to all edges.
+                gpu_count, gpu_available_count = SchedulerMatcher.parse_and_print_gpu_info_for_all_edges(
+                    active_edge_info_dict, should_print=True)
+                logging.error(f"No resources available."
+                              f"Total available GPU count {gpu_available_count} is less than "
+                              f"request GPU count {request_num_gpus}")
+                self.mlops_metrics.report_server_id_status(
+                    run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, edge_id=self.edge_id,
+                    server_id=self.edge_id, server_agent_id=self.server_agent_id)
+                self.send_exit_train_with_exception_request_to_edges(edge_id_list, json.dumps(self.request_json))
+                return

-        # Generate master node addr and port
-        master_node_addr, master_node_port = SchedulerMatcher.get_master_node_info(edge_id_list, active_edge_info_dict)
-
-        # Generate new edge id list after matched
-        edge_id_list = SchedulerMatcher.generate_new_edge_list_for_gpu_matching(assigned_gpu_num_dict)
-        if len(edge_id_list) <= 0:
-            gpu_count, gpu_available_count = SchedulerMatcher.parse_and_print_gpu_info_for_all_edges(
-                active_edge_info_dict, should_print=True)
-            logging.error(f"Request parameter for GPU num is invalid."
-                          f"Total available GPU count {gpu_available_count}."
-                          f"Request GPU num {request_num_gpus}")
-            self.mlops_metrics.report_server_id_status(
-                run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, edge_id=self.edge_id,
-                server_id=self.edge_id, server_agent_id=self.server_agent_id)
-            self.send_exit_train_with_exception_request_to_edges(edge_id_list, json.dumps(self.request_json))
-            return
+            # Generate master node addr and port
+            master_node_addr, master_node_port = SchedulerMatcher.get_master_node_info(edge_id_list, active_edge_info_dict)
+
+            # Generate new edge id list after matched
+            edge_id_list = SchedulerMatcher.generate_new_edge_list_for_gpu_matching(assigned_gpu_num_dict)
+            if len(edge_id_list) <= 0:
+                gpu_count, gpu_available_count = SchedulerMatcher.parse_and_print_gpu_info_for_all_edges(
+                    active_edge_info_dict, should_print=True)
+                logging.error(f"Request parameter for GPU num is invalid."
+                              f"Total available GPU count {gpu_available_count}."
+                              f"Request GPU num {request_num_gpus}")
+                self.mlops_metrics.report_server_id_status(
+                    run_id, ServerConstants.MSG_MLOPS_SERVER_STATUS_FAILED, edge_id=self.edge_id,
+                    server_id=self.edge_id, server_agent_id=self.server_agent_id)
+                self.send_exit_train_with_exception_request_to_edges(edge_id_list, json.dumps(self.request_json))
+                return

         client_rank = 1
         for edge_id in edge_id_list:
@@ -1005,9 +1007,10 @@ def send_training_request_to_edges(self, active_edge_info_dict=None):
             request_json["client_rank"] = client_rank
             client_rank += 1

-            request_json["scheduler_match_info"] = SchedulerMatcher.generate_match_info_for_scheduler(
-                edge_id, edge_id_list, master_node_addr, master_node_port, assigned_gpu_num_dict, assigned_gpu_ids_dict
-            )
+            if job_yaml_default_none is not None and request_num_gpus is not None:
+                request_json["scheduler_match_info"] = SchedulerMatcher.generate_match_info_for_scheduler(
+                    edge_id, edge_id_list, master_node_addr, master_node_port, assigned_gpu_num_dict, assigned_gpu_ids_dict
+                )

             self.client_mqtt_mgr.send_message(topic_start_train, json.dumps(request_json))

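The net effect of both hunks is easier to see in isolation: GPU scheduler matching now only runs when the incoming request actually declares a job_yaml section and a minimum_num_gpus value, and scheduler_match_info is only attached to the per-edge request in that same case. Below is a minimal, self-contained sketch of that gate; the helper name should_run_gpu_matching and the dict shapes are illustrative assumptions, not part of the FedML code. It relies on dict.get with a None default to tell a missing job_yaml apart from an empty one.

# Illustrative sketch of the gating condition introduced by this commit
# (helper name and request shapes are assumptions, not the real schema).

def should_run_gpu_matching(request_json: dict) -> bool:
    run_params = request_json.get("run_config", {}).get("parameters", {})
    # .get("job_yaml", None) distinguishes "job_yaml missing" from
    # "job_yaml present but empty", which .get("job_yaml", {}) alone cannot do.
    job_yaml_default_none = run_params.get("job_yaml", None)
    request_num_gpus = (
        run_params.get("job_yaml", {}).get("computing", {}).get("minimum_num_gpus", None)
    )
    return job_yaml_default_none is not None and request_num_gpus is not None


# A request without job_yaml (e.g. a plain federated run) skips SchedulerMatcher,
# while one that asks for GPUs goes through the matching path.
print(should_run_gpu_matching({"run_config": {"parameters": {}}}))  # False
print(should_run_gpu_matching(
    {"run_config": {"parameters": {"job_yaml": {"computing": {"minimum_num_gpus": 2}}}}}))  # True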