From bf04d28fffecfa582294dd04d4681864305b127b Mon Sep 17 00:00:00 2001
From: Hans <hans.chen@bricks.tools>
Date: Sat, 25 May 2024 03:24:59 +0800
Subject: [PATCH 01/25] Use glob ignore

---
 .gitignore | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 26a46f23f62..27be53151fa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,9 +1,8 @@
 .hypothesis
 buck-out/
 cmake-out/
-cmake-android-out/
-cmake-out-android/
-cmake-ios-out/
+cmake-*-out/
+cmake-out-*/
 ethos-u-scratch/
 executorch.egg-info
 pip-out/

From e58042c09b7b0e2a5ed78f27eeb44d08c268b3c9 Mon Sep 17 00:00:00 2001
From: Hans <hans.chen@bricks.tools>
Date: Sat, 25 May 2024 03:33:57 +0800
Subject: [PATCH 02/25] Support Windows

---
 backends/qualcomm/CMakeLists.txt              |   2 +-
 backends/qualcomm/aot/python/CMakeLists.txt   |   3 +
 backends/qualcomm/runtime/SharedBuffer.cpp    |  11 +
 backends/qualcomm/runtime/SharedBuffer.h      |   2 +-
 backends/qualcomm/runtime/Utils.cpp           |   8 +
 .../qualcomm/runtime/backends/CMakeLists.txt  |  11 +-
 .../runtime/backends/QnnBackendFactory.cpp    |   4 +
 .../runtime/backends/QnnFunctionInterface.h   |   5 +
 .../runtime/backends/QnnImplementation.cpp    |  49 +++++
 .../htpbackend/HtpContextCustomConfig.h       |   2 +-
 build/executorch-config.cmake                 |   7 +-
 examples/models/llama2/runner/util.h          |   8 +
 extension/data_loader/mmap_data_loader.cpp    | 196 +++++++++++++++++-
 extension/data_loader/mmap_data_loader.h      |  31 +++
 runtime/backend/interface.cpp                 |  79 ++++++-
 runtime/kernel/operator_registry.cpp          |  69 +++++-
 runtime/platform/system.h                     |  14 ++
 runtime/platform/targets.bzl                  |   1 +
 18 files changed, 479 insertions(+), 23 deletions(-)

diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt
index a840ab0bb92..77b43e96288 100644
--- a/backends/qualcomm/CMakeLists.txt
+++ b/backends/qualcomm/CMakeLists.txt
@@ -223,7 +223,7 @@ add_subdirectory(
 install(TARGETS qnn_executorch_backend DESTINATION lib)
 
 # QNN pybind
-if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64")
+if(EXECUTORCH_BUILD_PYBIND)
   add_subdirectory(
     ${EXECUTORCH_SOURCE_DIR}/third-party/pybind11
     ${CMAKE_CURRENT_BINARY_DIR}/pybind11
diff --git a/backends/qualcomm/aot/python/CMakeLists.txt b/backends/qualcomm/aot/python/CMakeLists.txt
index 337cfae1776..f4ce70dc314 100644
--- a/backends/qualcomm/aot/python/CMakeLists.txt
+++ b/backends/qualcomm/aot/python/CMakeLists.txt
@@ -15,3 +15,6 @@ target_sources(
   PyQnnWrapperAdaptor PUBLIC ${CMAKE_CURRENT_LIST_DIR}/PyQnnWrapperAdaptor.cpp
                              ${CMAKE_CURRENT_LIST_DIR}/PyQnnWrapperAdaptor.h
 )
+
+target_compile_options(PyQnnManagerAdaptor PRIVATE -fexceptions)
+target_compile_options(PyQnnWrapperAdaptor PRIVATE -fexceptions)
diff --git a/backends/qualcomm/runtime/SharedBuffer.cpp b/backends/qualcomm/runtime/SharedBuffer.cpp
index 423c5d63723..0a3eb874104 100644
--- a/backends/qualcomm/runtime/SharedBuffer.cpp
+++ b/backends/qualcomm/runtime/SharedBuffer.cpp
@@ -5,7 +5,9 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+#ifdef __ANDROID__
 #include <dlfcn.h>
+#endif
 #include <executorch/backends/qualcomm/runtime/Logging.h>
 #include <executorch/backends/qualcomm/runtime/SharedBuffer.h>
 
@@ -99,6 +101,10 @@ bool SharedBuffer::IsAllocated(void* buf) {
 }
 
 Error SharedBuffer::Load() {
+#ifndef __ANDROID__
+  QNN_EXECUTORCH_LOG_ERROR("Shared buffer is not supported on this platform.");
+  return Error::Internal;
+#else
   // On Android, 32-bit and 64-bit libcdsprpc.so can be found at /vendor/lib/
   // and /vendor/lib64/ respectively.
   lib_cdsp_rpc_ = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL);
@@ -121,15 +127,20 @@ Error SharedBuffer::Load() {
     return Error::Internal;
   }
   return Error::Ok;
+#endif
 }
 
 Error SharedBuffer::UnLoad() {
+#ifndef __ANDROID__
+  return Error::Ok;
+#else
   if (dlclose(lib_cdsp_rpc_) != 0) {
     QNN_EXECUTORCH_LOG_ERROR(
         "Unable to close shared buffer. dlerror(): %s", dlerror());
     return Error::Internal;
   };
   return Error::Ok;
+#endif
 }
 } // namespace qnn
 } // namespace executor
diff --git a/backends/qualcomm/runtime/SharedBuffer.h b/backends/qualcomm/runtime/SharedBuffer.h
index 1803e8af879..9ee3e4b3a23 100644
--- a/backends/qualcomm/runtime/SharedBuffer.h
+++ b/backends/qualcomm/runtime/SharedBuffer.h
@@ -55,7 +55,7 @@ class SharedBuffer final {
 
   // Pointer to the dlopen'd libcdsprpc.so shared library which contains
   // rpcmem_alloc, rpcmem_free, rpcmem_to_fd APIs
-  void* lib_cdsp_rpc_;
+  [[maybe_unused]] void* lib_cdsp_rpc_;
   // Function pointer to rpcmem_alloc
   RpcMemAllocFn_t rpc_mem_alloc_;
   // Function pointer to rpcmem_free
diff --git a/backends/qualcomm/runtime/Utils.cpp b/backends/qualcomm/runtime/Utils.cpp
index c049d3720ee..5fe46fe6155 100644
--- a/backends/qualcomm/runtime/Utils.cpp
+++ b/backends/qualcomm/runtime/Utils.cpp
@@ -7,7 +7,11 @@
  */
 #include <executorch/backends/qualcomm/runtime/Logging.h>
 #include <executorch/backends/qualcomm/runtime/Utils.h>
+#ifdef _WIN32
+#include <direct.h>
+#else
 #include <sys/stat.h>
+#endif
 namespace torch {
 namespace executor {
 namespace qnn {
@@ -24,7 +28,11 @@ void CreateDirectory(const std::string& path) {
     return;
   }
   CreateDirectory(subdir);
+#ifdef _WIN32
+  int mkdir_err = _mkdir(subdir.c_str());
+#else
   int mkdir_err = mkdir(subdir.c_str(), S_IRWXU | S_IRWXG | S_IRWXO);
+#endif
   if (mkdir_err != 0 && errno != EEXIST) {
     std::string err_msg = "Failed to create " + subdir + " folder\n";
     QNN_EXECUTORCH_LOG_ERROR(err_msg.c_str());
diff --git a/backends/qualcomm/runtime/backends/CMakeLists.txt b/backends/qualcomm/runtime/backends/CMakeLists.txt
index e173f08af08..e8971024a2d 100644
--- a/backends/qualcomm/runtime/backends/CMakeLists.txt
+++ b/backends/qualcomm/runtime/backends/CMakeLists.txt
@@ -44,8 +44,17 @@ target_sources(
 )
 
 # qnn_device
+set(BACKEND_ARCH "${CMAKE_SYSTEM_PROCESSOR}")
+# Fold processor spellings (native Windows reports ARM64/AMD64) onto the htpbackend/ directory names.
+if("${BACKEND_ARCH}" MATCHES "^(arm64|ARM64|aarch64|arm64-v8a)$")
+  set(BACKEND_ARCH "aarch64")
+elseif("${BACKEND_ARCH}" MATCHES "^(x86_64|AMD64)$")
+  set(BACKEND_ARCH "x86_64")
+else()
+  message(FATAL_ERROR "Unsupported architecture: ${BACKEND_ARCH}")
+endif()
 set(HOST_ARCHITECTURE
-    ${CMAKE_CURRENT_LIST_DIR}/htpbackend/${CMAKE_SYSTEM_PROCESSOR}
+    ${CMAKE_CURRENT_LIST_DIR}/htpbackend/${BACKEND_ARCH}
 )
 
 target_sources(
diff --git a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp
index acb95524682..131fb2459e0 100644
--- a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp
+++ b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp
@@ -23,8 +23,12 @@ std::unique_ptr<BackendConfigParameters> QnnBackendFactory::Create(
         const std::string skel_library_dir =
             htp_options->skel_library_dir()->str();
         if (!skel_library_dir.empty()) {
+#ifdef _WIN32
+          _putenv_s("ADSP_LIBRARY_PATH", skel_library_dir.c_str());
+#else
           setenv(
               "ADSP_LIBRARY_PATH", skel_library_dir.c_str(), /*overwrite=*/1);
+#endif
         }
         QNN_EXECUTORCH_LOG_INFO(
             "skel_library_dir: %s", skel_library_dir.c_str());
diff --git a/backends/qualcomm/runtime/backends/QnnFunctionInterface.h b/backends/qualcomm/runtime/backends/QnnFunctionInterface.h
index 5ea187ffa52..c78ce7a781b 100644
--- a/backends/qualcomm/runtime/backends/QnnFunctionInterface.h
+++ b/backends/qualcomm/runtime/backends/QnnFunctionInterface.h
@@ -7,6 +7,11 @@
  */
 #pragma once
 
+// Windows SDK headers define `interface` as a macro; drop it before including the QNN headers.
+#if defined(_WIN32) && defined(interface)
+#undef interface
+#endif
+
 #include "QnnInterface.h"
 #include "Saver/QnnSaver.h"
 
diff --git a/backends/qualcomm/runtime/backends/QnnImplementation.cpp b/backends/qualcomm/runtime/backends/QnnImplementation.cpp
index 0f7d45b54b2..5c537319c5c 100644
--- a/backends/qualcomm/runtime/backends/QnnImplementation.cpp
+++ b/backends/qualcomm/runtime/backends/QnnImplementation.cpp
@@ -5,7 +5,11 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+#ifdef _WIN32
+#include <windows.h>
+#else
 #include <dlfcn.h>
+#endif
 #include <executorch/backends/qualcomm/runtime/backends/QnnImplementation.h>
 
 #include "QnnInterface.h"
@@ -14,7 +18,11 @@ namespace executor {
 namespace qnn {
 template <typename Fn>
 Fn loadQnnFunction(void* handle, const char* function_name) {
+#ifdef _WIN32
+  return reinterpret_cast<Fn>(GetProcAddress(reinterpret_cast<HMODULE>(handle), function_name));
+#else
   return reinterpret_cast<Fn>(dlsym(handle, function_name)); // NOLINT
+#endif
 }
 
 Error QnnImplementation::InitBackend(
@@ -54,13 +62,21 @@ Error QnnImplementation::StartBackend(
     const std::string& lib_path,
     const QnnSaver_Config_t** saver_config) {
   Qnn_ErrorHandle_t error = QNN_SUCCESS;
+#ifdef _WIN32
+  void* lib_handle = LoadLibraryA(lib_path.c_str()); // explicit ANSI entry point: lib_path is a narrow string
+#else
   void* lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL);
+#endif
 
   if (lib_handle == nullptr) {
     QNN_EXECUTORCH_LOG_ERROR(
         "Cannot Open QNN library %s, with error: %s",
         lib_path.c_str(),
+#ifdef _WIN32
+        std::to_string(GetLastError()).c_str()); // "%s" needs a C string, not the raw DWORD code
+#else
         dlerror());
+#endif
     return Error::Internal;
   }
 
@@ -72,7 +88,11 @@ Error QnnImplementation::StartBackend(
     QNN_EXECUTORCH_LOG_ERROR(
         "QnnImplementation::Load Cannot load symbol "
         "QnnInterface_getProviders : %s",
+#ifdef _WIN32
+        std::to_string(GetLastError()).c_str()); // "%s" needs a C string, not the raw DWORD code
+#else
         dlerror());
+#endif
     return Error::Internal;
   }
 
@@ -120,6 +140,14 @@ Error QnnImplementation::StartBackend(
   if (loaded_lib_handle_.count(backend_id) > 0) {
     QNN_EXECUTORCH_LOG_WARN("closing %pK...", loaded_lib_handle_[backend_id]);
 
+#ifdef _WIN32
+    if (FreeLibrary(reinterpret_cast<HMODULE>(loaded_lib_handle_[backend_id])) == 0) {
+      QNN_EXECUTORCH_LOG_WARN(
+          "Sadly, fail to close %pK with error %d",
+          loaded_lib_handle_[backend_id],
+          GetLastError());
+    }
+#else
     int dlclose_error = dlclose(loaded_lib_handle_[backend_id]);
     if (dlclose_error != 0) {
       QNN_EXECUTORCH_LOG_WARN(
@@ -127,6 +155,7 @@ Error QnnImplementation::StartBackend(
           loaded_lib_handle_[backend_id],
           dlerror());
     }
+#endif
   }
   loaded_lib_handle_[backend_id] = lib_handle;
 
@@ -138,6 +167,15 @@ Error QnnImplementation::StartBackend(
     lib_path_to_backend_id_.erase(lib_path);
     loaded_backend_.erase(backend_id);
 
+#ifdef _WIN32
+    if (FreeLibrary(reinterpret_cast<HMODULE>(loaded_lib_handle_[backend_id])) == 0) {
+      QNN_EXECUTORCH_LOG_WARN(
+          "fail to close %pK after backend-init "
+          "failure, with error %d",
+          loaded_lib_handle_[backend_id],
+          GetLastError());
+    }
+#else
     int dlclose_error = dlclose(loaded_lib_handle_[backend_id]);
     if (dlclose_error != 0) {
       QNN_EXECUTORCH_LOG_WARN(
@@ -146,6 +184,7 @@ Error QnnImplementation::StartBackend(
           loaded_lib_handle_[backend_id],
           dlerror());
     }
+#endif
 
     loaded_lib_handle_.erase(backend_id);
     return be_init_st;
@@ -160,12 +199,22 @@ Error QnnImplementation::TerminateAllBackends() {
   loaded_backend_.clear();
 
   for (auto& it : loaded_lib_handle_) {
+#ifdef _WIN32
+    if (FreeLibrary(reinterpret_cast<HMODULE>(it.second)) == 0) {
+      QNN_EXECUTORCH_LOG_ERROR(
+          "Fail to close QNN backend %d with error %d",
+          it.first,
+          GetLastError());
+      ret_status = Error::Internal;
+    }
+#else
     int dlclose_error = dlclose(it.second);
     if (dlclose_error != 0) {
       QNN_EXECUTORCH_LOG_ERROR(
           "Fail to close QNN backend %d with error %s", it.first, dlerror());
       ret_status = Error::Internal;
     }
+#endif
   }
   loaded_lib_handle_.clear();
   lib_path_to_backend_id_.clear();
diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpContextCustomConfig.h b/backends/qualcomm/runtime/backends/htpbackend/HtpContextCustomConfig.h
index 00568bdc327..35fc2d373de 100644
--- a/backends/qualcomm/runtime/backends/htpbackend/HtpContextCustomConfig.h
+++ b/backends/qualcomm/runtime/backends/htpbackend/HtpContextCustomConfig.h
@@ -39,7 +39,7 @@ class HtpContextCustomConfig {
     return htp_context_config_.back().get();
   }
 
-  const QnnContext* context_;
+  [[maybe_unused]] const QnnContext* context_;
   std::vector<std::unique_ptr<QnnHtpContext_CustomConfig_t>>
       htp_context_config_;
   [[maybe_unused]] const QnnExecuTorchHtpBackendOptions* htp_options_;
diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake
index 8bfa002dad5..1682a676f11 100644
--- a/build/executorch-config.cmake
+++ b/build/executorch-config.cmake
@@ -76,7 +76,12 @@ foreach(lib ${lib_list})
       # keep all libs as static when CMAKE_TOOLCHAIN_IOS is used
       add_library(${lib} STATIC IMPORTED)
     endif()
-    set_target_properties(${lib} PROPERTIES IMPORTED_LOCATION "${${lib_var}}")
+    if("${${lib_var}}" MATCHES "\\.dll\\.a$") # import library named like libfoo.dll.a
+      string(REGEX REPLACE "\\.dll\\.a$" ".dll" ${lib_var} "${${lib_var}}")
+      set_target_properties(${lib} PROPERTIES IMPORTED_LOCATION "${${lib_var}}" IMPORTED_IMPLIB "${${lib_var}}.a")
+    else()
+      set_target_properties(${lib} PROPERTIES IMPORTED_LOCATION "${${lib_var}}")
+    endif()
     target_include_directories(${lib} INTERFACE ${_root})
   endif()
 endforeach()
diff --git a/examples/models/llama2/runner/util.h b/examples/models/llama2/runner/util.h
index 5d4792b6414..cfb0aa7d0b4 100644
--- a/examples/models/llama2/runner/util.h
+++ b/examples/models/llama2/runner/util.h
@@ -8,7 +8,11 @@
 
 #pragma once
 #include <stdio.h>
+#ifdef _WIN32
+#include <windows.h>
+#else
 #include <time.h>
+#endif
 #include <cctype>
 
 namespace torch {
@@ -39,9 +43,13 @@ void inline safe_printf(const char* piece) {
 
 long inline time_in_ms() {
   // return time in milliseconds, for benchmarking the model speed
+#ifdef _WIN32
+  return GetTickCount();
+#else
   struct timespec time;
   clock_gettime(CLOCK_REALTIME, &time);
   return time.tv_sec * 1000 + time.tv_nsec / 1000000;
+#endif
 }
 
 } // namespace util
diff --git a/extension/data_loader/mmap_data_loader.cpp b/extension/data_loader/mmap_data_loader.cpp
index 1c16251668d..adb8c8e0b79 100644
--- a/extension/data_loader/mmap_data_loader.cpp
+++ b/extension/data_loader/mmap_data_loader.cpp
@@ -12,11 +12,16 @@
 #include <cstring>
 #include <limits>
 
+#ifdef _WIN32
+#include <windows.h>
+#include <algorithm>
+#else
 #include <fcntl.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
+#endif
 
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/result.h>
@@ -54,19 +59,90 @@ Range get_overlapping_pages(uintptr_t offset, size_t size, size_t page_size) {
 
 } // namespace
 
+#ifdef _WIN32
+// Returns the system message for the calling thread's GetLastError() code, or
+// "" when no error is recorded or the message cannot be formatted. The text
+// lives in a thread-local buffer that the next call on the same thread
+// overwrites; FORMAT_MESSAGE_ALLOCATE_BUFFER is avoided because nothing would
+// LocalFree() the string it allocates.
+const char* get_last_error_message() {
+  static thread_local char message_buffer[512];
+  const DWORD error_id = GetLastError();
+  if (error_id == 0) {
+    return ""; // No error message has been recorded
+  }
+  const DWORD length = FormatMessageA(
+      FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+      NULL,
+      error_id,
+      MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+      message_buffer,
+      sizeof(message_buffer),
+      NULL);
+  // FormatMessageA returns 0 on failure and then leaves the buffer unspecified.
+  return length == 0 ? "" : message_buffer;
+}
+#endif
+
+#ifdef _WIN32
+class FileHandle {
+ public:
+  explicit FileHandle(HANDLE handle) : handle_(handle) {}
+  ~FileHandle() {
+    if (handle_ != INVALID_HANDLE_VALUE) {
+      CloseHandle(handle_);
+    }
+  }
+  HANDLE get() const { return handle_; }
+
+ private:
+  HANDLE handle_;
+};
+#else
+class FileHandle {
+ public:
+  explicit FileHandle(int fd) : fd_(fd) {}
+  ~FileHandle() {
+    if (fd_ >= 0) {
+      ::close(fd_);
+    }
+  }
+  int get() const { return fd_; }
+
+ private:
+  int fd_;
+};
+#endif
+
 MmapDataLoader::~MmapDataLoader() {
   // file_name_ can be nullptr if this instance was moved from, but freeing a
   // null pointer is safe.
   std::free(const_cast<char*>(file_name_));
+#ifdef _WIN32
+  // Both handles are reset (nullptr / INVALID_HANDLE_VALUE) when this instance
+  // is moved from, so only close the ones still owned.
+  if (mapping_handle_ != nullptr) {
+    CloseHandle(mapping_handle_);
+  }
+  if (file_handle_ != INVALID_HANDLE_VALUE) {
+    CloseHandle(file_handle_);
+  }
+#else
   // fd_ can be -1 if this instance was moved from, but closing a negative fd is
   // safe (though it will return an error).
   ::close(fd_);
+#endif
 }
 
 Result<MmapDataLoader> MmapDataLoader::from(
     const char* file_name,
     MmapDataLoader::MlockConfig mlock_config) {
   // Cache the page size.
+#ifdef _WIN32
+  SYSTEM_INFO system_info;
+  GetSystemInfo(&system_info);
+  size_t page_size = (std::max)(system_info.dwPageSize, system_info.dwAllocationGranularity); // parenthesized so a max() macro from windows.h cannot expand here
+#else
   long page_size = sysconf(_SC_PAGESIZE);
   if (page_size < 0) {
     ET_LOG(Error, "Could not get page size: %s (%d)", ::strerror(errno), errno);
@@ -76,7 +131,55 @@ Result<MmapDataLoader> MmapDataLoader::from(
     ET_LOG(Error, "Page size 0x%ld is not a power of 2", page_size);
     return Error::InvalidState;
   }
+#endif
+
+#ifdef _WIN32
+  HANDLE file_handle = CreateFileA(
+      file_name,
+      GENERIC_READ,
+      FILE_SHARE_READ,
+      nullptr,
+      OPEN_EXISTING,
+      FILE_ATTRIBUTE_NORMAL,
+      nullptr);
+  if (file_handle == INVALID_HANDLE_VALUE) {
+    ET_LOG(
+        Error,
+        "Failed to open %s: %s",
+        file_name,
+        get_last_error_message());
+    return Error::AccessFailed;
+  }
 
+  LARGE_INTEGER file_size_li;
+  if (!GetFileSizeEx(file_handle, &file_size_li)) {
+    ET_LOG(
+        Error,
+        "Could not get length of %s: %s",
+        file_name,
+        get_last_error_message());
+    CloseHandle(file_handle);
+    return Error::AccessFailed;
+  }
+  size_t file_size = static_cast<size_t>(file_size_li.QuadPart);
+
+  HANDLE mapping_handle = CreateFileMappingA(
+      file_handle,
+      nullptr,
+      PAGE_READONLY,
+      0,
+      0,
+      nullptr);
+  if (mapping_handle == nullptr) {
+    ET_LOG(
+        Error,
+        "Could not create file mapping for %s: %s",
+        file_name,
+        get_last_error_message());
+    CloseHandle(file_handle);
+    return Error::AccessFailed;
+  }
+#else
   // Use open() instead of fopen() because mmap() needs a file descriptor.
   int fd = ::open(file_name, O_RDONLY);
   if (fd < 0) {
@@ -103,17 +206,28 @@ Result<MmapDataLoader> MmapDataLoader::from(
     return Error::AccessFailed;
   }
   size_t file_size = st.st_size;
+#endif
 
   // Copy the filename so we can print better debug messages if reads fail.
   const char* file_name_copy = ::strdup(file_name);
   if (file_name_copy == nullptr) {
     ET_LOG(Error, "strdup(%s) failed", file_name);
+#ifdef _WIN32
+    CloseHandle(mapping_handle);
+    CloseHandle(file_handle);
+#else
     ::close(fd);
+#endif
     return Error::MemoryAllocationFailed;
   }
 
   return MmapDataLoader(
+#ifdef _WIN32
+      file_handle,
+      mapping_handle,
+#else
       fd,
+#endif
       file_size,
       file_name_copy,
       static_cast<size_t>(page_size),
@@ -127,10 +241,19 @@ namespace {
  * `context` is actually the OS page size as a uintptr_t.
  */
 void MunmapSegment(void* context, void* data, size_t size) {
-  const uintptr_t page_size = reinterpret_cast<uintptr_t>(context);
+  const size_t page_size = reinterpret_cast<size_t>(context);
 
-  Range range =
-      get_overlapping_pages(reinterpret_cast<uintptr_t>(data), size, page_size);
+  Range range = get_overlapping_pages(reinterpret_cast<uintptr_t>(data), size, page_size);
+#ifdef _WIN32
+  if (!UnmapViewOfFile(reinterpret_cast<void*>(range.start))) {
+    ET_LOG(
+        Error,
+        "UnmapViewOfFile(0x%zx, %zu) failed: %s",
+        range.start,
+        range.size,
+        get_last_error_message());
+  }
+#else
   int ret = ::munmap(reinterpret_cast<void*>(range.start), range.size);
   if (ret < 0) {
     // Let the user know that something went wrong, but there's nothing we can
@@ -143,13 +266,18 @@ void MunmapSegment(void* context, void* data, size_t size) {
         ::strerror(errno),
         errno);
   }
+#endif
 }
 } // namespace
 
 Result<FreeableBuffer> MmapDataLoader::Load(size_t offset, size_t size) {
   ET_CHECK_OR_RETURN_ERROR(
       // Probably had its value moved to another instance.
+#ifdef _WIN32
+      file_handle_ != INVALID_HANDLE_VALUE,
+#else
       fd_ >= 0,
+#endif
       InvalidState,
       "Uninitialized");
   ET_CHECK_OR_RETURN_ERROR(
@@ -162,7 +290,11 @@ Result<FreeableBuffer> MmapDataLoader::Load(size_t offset, size_t size) {
       file_size_);
   ET_CHECK_OR_RETURN_ERROR(
       // Recommended by a lint warning.
+#ifdef _WIN32
+      offset <= static_cast<ULONGLONG>((std::numeric_limits<LONGLONG>::max)()), // offsets are passed as high/low DWORDs, so 64-bit values are valid
+#else
       offset <= std::numeric_limits<off_t>::max(),
+#endif
       InvalidArgument,
       "Offset %zu too large for off_t",
       offset);
@@ -179,6 +311,26 @@ Result<FreeableBuffer> MmapDataLoader::Load(size_t offset, size_t size) {
   // Map the pages read-only. MAP_PRIVATE vs. MAP_SHARED doesn't matter since
   // the data is read-only, but use PRIVATE just to further avoid accidentally
   // modifying the file.
+#ifdef _WIN32
+  if (range.start + range.size > file_size_) {
+    range.size = file_size_ - range.start;
+  }
+
+  void* pages = MapViewOfFile(
+      mapping_handle_,
+      FILE_MAP_READ | FILE_MAP_COPY,
+      static_cast<DWORD>(range.start >> 32),
+      static_cast<DWORD>(range.start & 0xFFFFFFFF),
+      range.size);
+  ET_CHECK_OR_RETURN_ERROR(
+      pages != nullptr,
+      AccessFailed,
+      "Failed to map %s: MapViewOfFile(..., size=%zd, ..., offset=0x%zx): %s",
+      file_name_,
+      range.size,
+      range.start,
+      get_last_error_message());
+#else
   void* pages = ::mmap(
       nullptr,
       range.size,
@@ -194,9 +346,36 @@ Result<FreeableBuffer> MmapDataLoader::Load(size_t offset, size_t size) {
       range.size,
       fd_,
       range.start);
+#endif
 
   if (mlock_config_ == MlockConfig::UseMlock ||
       mlock_config_ == MlockConfig::UseMlockIgnoreErrors) {
+#ifdef _WIN32
+    if (!VirtualLock(pages, size)) {
+      if (mlock_config_ == MlockConfig::UseMlockIgnoreErrors) {
+        ET_LOG(
+            Debug,
+            "Ignoring VirtualLock error for file %s (off=0x%zd): "
+            "VirtualLock(%p, %zu) failed: %s",
+            file_name_,
+            offset,
+            pages,
+            size,
+            get_last_error_message());
+      } else {
+        ET_LOG(
+            Error,
+            "File %s (off=0x%zd): VirtualLock(%p, %zu) failed: %s",
+            file_name_,
+            offset,
+            pages,
+            size,
+            get_last_error_message());
+        UnmapViewOfFile(pages);
+        return Error::NotSupported;
+      }
+    }
+#else
     int err = ::mlock(pages, size);
     if (err < 0) {
       if (mlock_config_ == MlockConfig::UseMlockIgnoreErrors) {
@@ -225,6 +404,7 @@ Result<FreeableBuffer> MmapDataLoader::Load(size_t offset, size_t size) {
       }
     }
     // No need to keep track of this. munmap() will unlock as a side effect.
+#endif
   }
 
   // The requested data is at an offset into the mapped pages.
@@ -245,7 +425,11 @@ Result<FreeableBuffer> MmapDataLoader::Load(size_t offset, size_t size) {
 Result<size_t> MmapDataLoader::size() const {
   ET_CHECK_OR_RETURN_ERROR(
       // Probably had its value moved to another instance.
+#ifdef _WIN32
+      file_handle_ != INVALID_HANDLE_VALUE,
+#else
       fd_ >= 0,
+#endif
       InvalidState,
       "Uninitialized");
   return file_size_;
diff --git a/extension/data_loader/mmap_data_loader.h b/extension/data_loader/mmap_data_loader.h
index b81a562624f..cf94f4fbc85 100644
--- a/extension/data_loader/mmap_data_loader.h
+++ b/extension/data_loader/mmap_data_loader.h
@@ -12,6 +12,12 @@
 #include <executorch/runtime/core/result.h>
 #include <executorch/runtime/platform/compiler.h>
 
+#ifdef _WIN32
+#include <windows.h>
+#else
+#include <sys/types.h>
+#endif
+
 namespace torch {
 namespace executor {
 namespace util {
@@ -75,12 +81,22 @@ class MmapDataLoader : public DataLoader {
       : file_name_(rhs.file_name_),
         file_size_(rhs.file_size_),
         page_size_(rhs.page_size_),
+#ifdef _WIN32
+        file_handle_(rhs.file_handle_),
+        mapping_handle_(rhs.mapping_handle_),
+#else
         fd_(rhs.fd_),
+#endif
         mlock_config_(rhs.mlock_config_) {
     rhs.file_name_ = nullptr;
     rhs.file_size_ = 0;
     rhs.page_size_ = 0;
+#ifdef _WIN32
+    rhs.file_handle_ = INVALID_HANDLE_VALUE;
+    rhs.mapping_handle_ = nullptr;
+#else
     rhs.fd_ = -1;
+#endif
     rhs.mlock_config_ = MlockConfig::NoMlock;
   }
 
@@ -93,7 +109,12 @@ class MmapDataLoader : public DataLoader {
 
  private:
   MmapDataLoader(
+#ifdef _WIN32
+      HANDLE file_handle,
+      HANDLE mapping_handle,
+#else
       int fd,
+#endif
       size_t file_size,
       const char* file_name,
       size_t page_size,
@@ -101,7 +122,12 @@ class MmapDataLoader : public DataLoader {
       : file_name_(file_name),
         file_size_(file_size),
         page_size_(page_size),
+#ifdef _WIN32
+        file_handle_(file_handle),
+        mapping_handle_(mapping_handle),
+#else
         fd_(fd),
+#endif
         mlock_config_(mlock_config) {}
 
   // Not safely copyable.
@@ -112,7 +138,12 @@ class MmapDataLoader : public DataLoader {
   const char* file_name_; // String data is owned by the instance.
   size_t file_size_;
   size_t page_size_;
+#ifdef _WIN32
+  HANDLE file_handle_;
+  HANDLE mapping_handle_;
+#else
   int fd_; // Owned by the instance.
+#endif
   MlockConfig mlock_config_;
 };
 
diff --git a/runtime/backend/interface.cpp b/runtime/backend/interface.cpp
index 3d2aa41a8dd..3ec3af6307f 100644
--- a/runtime/backend/interface.cpp
+++ b/runtime/backend/interface.cpp
@@ -9,21 +9,86 @@
 #include <executorch/runtime/backend/interface.h>
 #include <executorch/runtime/platform/assert.h>
 
-namespace torch {
-namespace executor {
-
-PyTorchBackendInterface::~PyTorchBackendInterface() {}
+#ifdef _WIN32
+#include <memory>
+#include <windows.h>
+#include <tchar.h>
+#define getpid GetCurrentProcessId
+#else
+#include <unistd.h>
+#endif
 
 // Task t128866626: Remove global static variables.
 // We want to be able to run multiple Executor instances
 // and having a global registration isn't a viable solution
 // in the long term.
-BackendRegistry& getBackendRegistry();
-BackendRegistry& getBackendRegistry() {
-  static BackendRegistry backend_reg;
+#ifdef _WIN32
+
+#define SHARED_MEMORY_NAME "torch_executor_backend_registry"
+static std::shared_ptr<torch::executor::BackendRegistry> backend_reg;
+
+torch::executor::BackendRegistry& getBackendRegistry() {
+  // Returns the registry reached through a named, pagefile-backed mapping so
+  // that every copy of this function in the process resolves to one object.
+  // NOTE(review): presumably needed because each DLL/EXE gets its own statics
+  // -- confirm.
+  using Registry = torch::executor::BackendRegistry;
+  if (backend_reg != nullptr) {
+    return *backend_reg;
+  }
+
+  // Suffix the object name with the process id (8 hex digits): the registry
+  // hands out raw pointers (see get_backend_class), which are meaningless in
+  // any other process that would otherwise open the same name.
+  char name[sizeof(SHARED_MEMORY_NAME) + 9] = SHARED_MEMORY_NAME "_";
+  char* cursor = name + sizeof(SHARED_MEMORY_NAME);
+  const DWORD pid = GetCurrentProcessId();
+  for (int shift = 28; shift >= 0; shift -= 4) {
+    *cursor++ = "0123456789abcdef"[(pid >> shift) & 0xF];
+  }
+
+  // CreateFileMappingA creates the section or opens the existing one; only
+  // in the latter case does GetLastError() report ERROR_ALREADY_EXISTS.
+  HANDLE hMapFile = CreateFileMappingA(
+      INVALID_HANDLE_VALUE, NULL, PAGE_READWRITE, 0, sizeof(Registry), name);
+  const bool created =
+      hMapFile != NULL && GetLastError() != ERROR_ALREADY_EXISTS;
+  void* view = nullptr;
+  if (hMapFile != NULL) {
+    view = MapViewOfFile(hMapFile, FILE_MAP_ALL_ACCESS, 0, 0, sizeof(Registry));
+    // The mapped view keeps the section alive, so the handle need not leak.
+    CloseHandle(hMapFile);
+  }
+
+  if (view == nullptr) {
+    // Sharing is unavailable: fall back to a registry private to this module
+    // instead of dereferencing the still-null backend_reg.
+    backend_reg = std::make_shared<Registry>();
+    return *backend_reg;
+  }
+  if (created) {
+    new (view) Registry(); // first user constructs the object in place
+  }
+  backend_reg = std::shared_ptr<Registry>(
+      static_cast<Registry*>(view), [](Registry* ptr) { UnmapViewOfFile(ptr); });
+  return *backend_reg;
+}
+
+#else
+
+torch::executor::BackendRegistry& getBackendRegistry();
+torch::executor::BackendRegistry& getBackendRegistry() {
+  static torch::executor::BackendRegistry backend_reg;
   return backend_reg;
 }
 
+#endif
+
+namespace torch {
+namespace executor {
+
+PyTorchBackendInterface::~PyTorchBackendInterface() {}
+
 PyTorchBackendInterface* get_backend_class(const char* name) {
   return getBackendRegistry().get_backend_class(name);
 }
diff --git a/runtime/kernel/operator_registry.cpp b/runtime/kernel/operator_registry.cpp
index 629077ca7e0..02ee9b60ff7 100644
--- a/runtime/kernel/operator_registry.cpp
+++ b/runtime/kernel/operator_registry.cpp
@@ -14,15 +14,74 @@
 
 #include <executorch/runtime/platform/assert.h>
 
-namespace torch {
-namespace executor {
+#ifdef _WIN32
+#include <memory>
+#include <windows.h>
+#include <tchar.h>
+
+#define SHARED_MEMORY_NAME "torch_executor_operator_registry"
+static std::shared_ptr<torch::executor::OperatorRegistry> operator_reg;
+
+torch::executor::OperatorRegistry& getOperatorRegistry() {
+  // Returns the registry reached through a named, pagefile-backed mapping so
+  // that every copy of this function in the process resolves to one object.
+  // NOTE(review): presumably needed because each DLL/EXE gets its own statics
+  // -- confirm.
+  using Registry = torch::executor::OperatorRegistry;
+  if (operator_reg != nullptr) {
+    return *operator_reg;
+  }
+
+  // Suffix the object name with the process id (8 hex digits). The registered
+  // kernels are presumably held through process-local pointers (TODO confirm),
+  // which no other process opening the same name could use.
+  char name[sizeof(SHARED_MEMORY_NAME) + 9] = SHARED_MEMORY_NAME "_";
+  char* cursor = name + sizeof(SHARED_MEMORY_NAME);
+  const DWORD pid = GetCurrentProcessId();
+  for (int shift = 28; shift >= 0; shift -= 4) {
+    *cursor++ = "0123456789abcdef"[(pid >> shift) & 0xF];
+  }
+
+  // CreateFileMappingA creates the section or opens the existing one; only
+  // in the latter case does GetLastError() report ERROR_ALREADY_EXISTS.
+  HANDLE hMapFile = CreateFileMappingA(
+      INVALID_HANDLE_VALUE, NULL, PAGE_READWRITE, 0, sizeof(Registry), name);
+  const bool created =
+      hMapFile != NULL && GetLastError() != ERROR_ALREADY_EXISTS;
+  void* view = nullptr;
+  if (hMapFile != NULL) {
+    view = MapViewOfFile(hMapFile, FILE_MAP_ALL_ACCESS, 0, 0, sizeof(Registry));
+    // The mapped view keeps the section alive, so the handle need not leak.
+    CloseHandle(hMapFile);
+  }
+
+  if (view == nullptr) {
+    // Sharing is unavailable: fall back to a registry private to this module
+    // instead of dereferencing the still-null operator_reg.
+    operator_reg = std::make_shared<Registry>();
+    return *operator_reg;
+  }
+  if (created) {
+    new (view) Registry(); // first user constructs the object in place
+  }
+  operator_reg = std::shared_ptr<Registry>(
+      static_cast<Registry*>(view), [](Registry* ptr) { UnmapViewOfFile(ptr); });
+  return *operator_reg;
+}
 
-OperatorRegistry& getOperatorRegistry();
-OperatorRegistry& getOperatorRegistry() {
-  static OperatorRegistry operator_registry;
+#else
+
+torch::executor::OperatorRegistry& getOperatorRegistry();
+torch::executor::OperatorRegistry& getOperatorRegistry() {
+  static torch::executor::OperatorRegistry operator_registry;
   return operator_registry;
 }
 
+#endif
+
+namespace torch {
+namespace executor {
+
 Error register_kernels(const ArrayRef<Kernel>& kernels) {
   Error success = getOperatorRegistry().register_kernels(kernels);
   if (success == Error::InvalidArgument || success == Error::Internal) {
diff --git a/runtime/platform/system.h b/runtime/platform/system.h
index c836e5ff222..56bd8432950 100644
--- a/runtime/platform/system.h
+++ b/runtime/platform/system.h
@@ -21,6 +21,9 @@
 #if defined(ET_USE_LIBDL)
 #include <dlfcn.h>
 #endif
+#if defined(ET_USE_WINAPI)
+#include <windows.h>
+#endif
 
 static constexpr const char* DYNAMIC_LIBRARY_NOT_SUPPORTED = "NOT_SUPPORTED";
 static constexpr const char* DYNAMIC_LIBRARY_NOT_FOUND = "NOT_FOUND";
@@ -41,6 +44,17 @@ inline const char* et_pal_get_shared_library_name(const void* addr) {
   } else {
     return DYNAMIC_LIBRARY_NOT_FOUND;
   }
+#endif
+#if defined(ET_USE_WINAPI)
+  HMODULE hModule = NULL; // module containing `addr`; refcount is left unchanged
+  if (GetModuleHandleExA(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
+                         reinterpret_cast<LPCSTR>(addr), &hModule)) {
+    static thread_local char path[MAX_PATH]; // must outlive this call: a pointer to it is returned
+    if (GetModuleFileNameA(hModule, path, sizeof(path))) {
+      return path; // valid until the next call on the same thread
+    }
+  }
+  return DYNAMIC_LIBRARY_NOT_FOUND;
 #endif
   return DYNAMIC_LIBRARY_NOT_SUPPORTED;
 }
diff --git a/runtime/platform/targets.bzl b/runtime/platform/targets.bzl
index a4af1274024..93685e322c6 100644
--- a/runtime/platform/targets.bzl
+++ b/runtime/platform/targets.bzl
@@ -109,6 +109,7 @@ def define_common_targets():
                 "DEFAULT": [],
                 "ovr_config//os:linux": ["-DET_USE_LIBDL"],
                 "ovr_config//os:macos": ["-DET_USE_LIBDL"],
+                "ovr_config//os:windows": ["-DET_USE_WINAPI"],
             },
         ),
         visibility = [

From 969b61293ceae010d4ce4646d790f9ac88250d36 Mon Sep 17 00:00:00 2001
From: Hans <hans.chen@bricks.tools>
Date: Sat, 25 May 2024 16:20:19 +0800
Subject: [PATCH 03/25] Fix unknown type

---
 backends/qualcomm/aot/ir/qcir_utils.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backends/qualcomm/aot/ir/qcir_utils.cpp b/backends/qualcomm/aot/ir/qcir_utils.cpp
index d32c36149ff..f36f1848dab 100755
--- a/backends/qualcomm/aot/ir/qcir_utils.cpp
+++ b/backends/qualcomm/aot/ir/qcir_utils.cpp
@@ -129,7 +129,7 @@ flatbuffers::Offset<qcir::QuantizeParam> ToQuantizeParam(
     case qcir::QuantizeType::AXIS_SCALE_OFFSET: {
       size_t len = param.axisScaleOffsetEncoding.numScaleOffsets;
       axis = param.axisScaleOffsetEncoding.axis;
-      for (uint i = 0; i < len; ++i) {
+      for (size_t i = 0; i < len; ++i) {
         data.emplace_back(qcir::ScaleOffset(
             param.axisScaleOffsetEncoding.scaleOffset[i].scale,
             param.axisScaleOffsetEncoding.scaleOffset[i].offset));

From e9f162f4e8a3ba6b5ea5841f3ad28add4e48fd39 Mon Sep 17 00:00:00 2001
From: Hans <hans.chen@bricks.tools>
Date: Sat, 25 May 2024 16:21:35 +0800
Subject: [PATCH 04/25] Add ignoring files

---
 .gitignore                   | 1 +
 backends/qualcomm/.gitignore | 1 +
 2 files changed, 2 insertions(+)
 create mode 100644 backends/qualcomm/.gitignore

diff --git a/.gitignore b/.gitignore
index 27be53151fa..ed5f48432b9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@ ethos-u-scratch/
 executorch.egg-info
 pip-out/
 __pycache__/
+.python-version
 
 # Any exported models and profiling outputs
 *.pte
diff --git a/backends/qualcomm/.gitignore b/backends/qualcomm/.gitignore
new file mode 100644
index 00000000000..b2ddb055dcb
--- /dev/null
+++ b/backends/qualcomm/.gitignore
@@ -0,0 +1 @@
+*_generated.h

From 00845eaec90dea5808beda1ac54ceaa9bdb0db7f Mon Sep 17 00:00:00 2001
From: Hans <hans.chen@bricks.tools>
Date: Sat, 25 May 2024 16:43:07 +0800
Subject: [PATCH 05/25] Fix build for QNN

---
 .../runtime/backends/QnnSysImplementation.cpp | 35 +++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/backends/qualcomm/runtime/backends/QnnSysImplementation.cpp b/backends/qualcomm/runtime/backends/QnnSysImplementation.cpp
index 519dd867d4a..e471b54881c 100644
--- a/backends/qualcomm/runtime/backends/QnnSysImplementation.cpp
+++ b/backends/qualcomm/runtime/backends/QnnSysImplementation.cpp
@@ -6,7 +6,11 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#ifdef _WIN32
+#include <windows.h>
+#else
 #include <dlfcn.h>
+#endif
 #include <executorch/backends/qualcomm/runtime/backends/QnnSysImplementation.h>
 namespace torch {
 namespace executor {
@@ -14,6 +18,28 @@ namespace qnn {
 Error QnnSystemImplementation::Load() {
   Qnn_ErrorHandle_t error = QNN_SUCCESS;
 
+#ifdef _WIN32
+  HMODULE lib_handle_ = LoadLibraryA(lib_path_.c_str()); // narrow path, so use the A variant
+  if (lib_handle_ == nullptr) {
+    QNN_EXECUTORCH_LOG_ERROR(
+        "Cannot Open QNN library %s, with error: %lu",
+        lib_path_.c_str(),
+        GetLastError()); // DWORD, hence %lu
+    return Error::Internal;
+  }
+
+  auto* get_providers =
+      // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
+      reinterpret_cast<QnnSystemInterfaceGetProvidersFn*>(
+          GetProcAddress(lib_handle_, "QnnSystemInterface_getProviders"));
+  if (get_providers == nullptr) {
+    QNN_EXECUTORCH_LOG_ERROR(
+        "QnnSystemImplementation::Load Cannot load symbol "
+        "QnnSystemInterface_getProviders : %lu",
+        GetLastError()); // DWORD, hence %lu
+    return Error::Internal;
+  }
+#else
   void* lib_handle_ = dlopen(lib_path_.c_str(), RTLD_NOW | RTLD_LOCAL);
   if (lib_handle_ == nullptr) {
     QNN_EXECUTORCH_LOG_ERROR(
@@ -34,6 +60,7 @@ Error QnnSystemImplementation::Load() {
         dlerror());
     return Error::Internal;
   }
+#endif
 
   std::uint32_t num_providers;
   const QnnSystemInterface_t** provider_list = nullptr;
@@ -64,12 +91,20 @@ Error QnnSystemImplementation::Unload() {
   if (lib_handle_ == nullptr)
     return Error::Ok;
 
+#ifdef _WIN32
+  if (!FreeLibrary(reinterpret_cast<HMODULE>(lib_handle_))) {
+    QNN_EXECUTORCH_LOG_WARN( // GetLastError() yields a DWORD, hence %lu
+        "Failed to close QnnSystem library with error %lu", GetLastError());
+    return Error::Internal;
+  }
+#else
   int dlclose_error = dlclose(lib_handle_);
   if (dlclose_error != 0) {
     QNN_EXECUTORCH_LOG_WARN(
         "Failed to close QnnSystem library with error %s", dlerror());
     return Error::Internal;
   }
+#endif
 
   lib_handle_ = nullptr;
 

From f4a52a58f6f3ffd75ec3ad55f85ad7e334c80444 Mon Sep 17 00:00:00 2001
From: Hans <hans.chen@bricks.tools>
Date: Sat, 25 May 2024 16:53:56 +0800
Subject: [PATCH 06/25] Correct impl

---
 extension/data_loader/mmap_data_loader.cpp | 34 +++-------------------
 1 file changed, 4 insertions(+), 30 deletions(-)

diff --git a/extension/data_loader/mmap_data_loader.cpp b/extension/data_loader/mmap_data_loader.cpp
index adb8c8e0b79..fcde5bf5433 100644
--- a/extension/data_loader/mmap_data_loader.cpp
+++ b/extension/data_loader/mmap_data_loader.cpp
@@ -72,36 +72,6 @@ const char* get_last_error_message() {
 }
 #endif
 
-#ifdef _WIN32
-class FileHandle {
- public:
-  explicit FileHandle(HANDLE handle) : handle_(handle) {}
-  ~FileHandle() {
-    if (handle_ != INVALID_HANDLE_VALUE) {
-      CloseHandle(handle_);
-    }
-  }
-  HANDLE get() const { return handle_; }
-
- private:
-  HANDLE handle_;
-};
-#else
-class FileHandle {
- public:
-  explicit FileHandle(int fd) : fd_(fd) {}
-  ~FileHandle() {
-    if (fd_ >= 0) {
-      ::close(fd_);
-    }
-  }
-  int get() const { return fd_; }
-
- private:
-  int fd_;
-};
-#endif
-
 MmapDataLoader::~MmapDataLoader() {
   // file_name_ can be nullptr if this instance was moved from, but freeing a
   // null pointer is safe.
@@ -110,6 +80,10 @@ MmapDataLoader::~MmapDataLoader() {
   if (mapping_handle_ != nullptr) {
     CloseHandle(mapping_handle_);
   }
+#else
+  // fd_ can be -1 if this instance was moved from, but closing a negative fd is
+  // safe (though it will return an error).
+  ::close(fd_);
 #endif
 }
 

From b2a056af179323eaf46dca77ad3f7866a96e0600 Mon Sep 17 00:00:00 2001
From: Hans <hans.chen@bricks.tools>
Date: Sun, 26 May 2024 03:43:23 +0800
Subject: [PATCH 07/25] Fix not found symbol on Windows DLL

---
 backends/qualcomm/runtime/QnnManager.cpp | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp
index dc3217fc1c8..fb628da46fc 100644
--- a/backends/qualcomm/runtime/QnnManager.cpp
+++ b/backends/qualcomm/runtime/QnnManager.cpp
@@ -370,14 +370,31 @@ Error QnnManager::Compile(
 } // namespace qnn
 } // namespace executor
 } // namespace torch
-void* QnnExecuTorchAllocCustomMem(size_t bytes, size_t alignment) {
+
+extern "C" {
+
+#ifdef _WIN32
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdll-attribute-on-redeclaration"
+#define EXPORT __declspec(dllexport)
+#else
+#define EXPORT
+#endif
+
+EXPORT void* QnnExecuTorchAllocCustomMem(size_t bytes, size_t alignment) {
   using torch::executor::qnn::SharedBuffer;
   void* buffer_ptr =
       SharedBuffer::GetSharedBufferManager().AllocMem(bytes, alignment);
   return buffer_ptr;
 }
 
-void QnnExecuTorchFreeCustomMem(void* buffer_ptr) {
+EXPORT void QnnExecuTorchFreeCustomMem(void* buffer_ptr) {
   using torch::executor::qnn::SharedBuffer;
   SharedBuffer::GetSharedBufferManager().FreeMem(buffer_ptr);
 }
+
+#ifdef _WIN32
+#pragma clang diagnostic pop
+#endif
+
+} // extern "C"

From 91f455107f419808c9d1b80d8fb05a6efa852fcf Mon Sep 17 00:00:00 2001
From: Hans <hans.chen@bricks.tools>
Date: Wed, 29 May 2024 19:16:38 +0800
Subject: [PATCH 08/25] Just warning once

---
 backends/qualcomm/runtime/SharedBuffer.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/backends/qualcomm/runtime/SharedBuffer.cpp b/backends/qualcomm/runtime/SharedBuffer.cpp
index 0a3eb874104..c15f6bc4113 100644
--- a/backends/qualcomm/runtime/SharedBuffer.cpp
+++ b/backends/qualcomm/runtime/SharedBuffer.cpp
@@ -102,8 +102,8 @@ bool SharedBuffer::IsAllocated(void* buf) {
 
 Error SharedBuffer::Load() {
 #ifndef __ANDROID__
-  QNN_EXECUTORCH_LOG_ERROR("Shared buffer is not supported on this platform.");
-  return Error::Internal;
+  QNN_EXECUTORCH_LOG_WARN("Shared buffer is not supported on this platform.");
+  return Error::Ok;
 #else
   // On Android, 32-bit and 64-bit libcdsprpc.so can be found at /vendor/lib/
   // and /vendor/lib64/ respectively.

From 3c80ec40e78ba69af14492632812e1da6763e5ba Mon Sep 17 00:00:00 2001
From: Hans <hans.chen@bricks.tools>
Date: Wed, 29 May 2024 18:28:45 +0800
Subject: [PATCH 09/25] Fix build for QNN

---
 backends/qualcomm/runtime/QnnExecuTorchBackend.cpp        | 2 +-
 examples/qualcomm/executor_runner/qnn_executor_runner.cpp | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp
index 77449703c5f..6d6128edc1b 100644
--- a/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp
+++ b/backends/qualcomm/runtime/QnnExecuTorchBackend.cpp
@@ -26,7 +26,7 @@ Result<DelegateHandle*> QnnExecuTorchBackend::init(
     ArrayRef<CompileSpec> compile_specs) const {
   // covert SizedBuffer to qnn ExecuTorch option
   QnnExecuTorchContextBinary qnn_context_blob;
-  const qnn_delegate::QnnExecuTorchOptions* qnn_executorch_options;
+  const qnn_delegate::QnnExecuTorchOptions* qnn_executorch_options = nullptr;
 
   qnn_context_blob.buffer = const_cast<void*>(processed->data());
   qnn_context_blob.nbytes = processed->size();
diff --git a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp
index 8998ee634e0..505a9596d53 100644
--- a/examples/qualcomm/executor_runner/qnn_executor_runner.cpp
+++ b/examples/qualcomm/executor_runner/qnn_executor_runner.cpp
@@ -33,6 +33,7 @@
 
 #include <fstream>
 #include <memory>
+#include <chrono>
 
 static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4 MB
 

From 86e2bc63570373c4498e79be05e805d4108e2e58 Mon Sep 17 00:00:00 2001
From: Hans <hans.chen@bricks.tools>
Date: Wed, 29 May 2024 18:28:11 +0800
Subject: [PATCH 10/25] Force export `QnnExecuTorchAllocCustomMem`,
 `QnnExecuTorchFreeCustomMem`

---
 backends/qualcomm/runtime/QnnManager.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp
index fb628da46fc..5557767a7b1 100644
--- a/backends/qualcomm/runtime/QnnManager.cpp
+++ b/backends/qualcomm/runtime/QnnManager.cpp
@@ -371,14 +371,12 @@ Error QnnManager::Compile(
 } // namespace executor
 } // namespace torch
 
-extern "C" {
-
 #ifdef _WIN32
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wdll-attribute-on-redeclaration"
 #define EXPORT __declspec(dllexport)
 #else
-#define EXPORT
+#define EXPORT __attribute__((visibility("default")))
 #endif
 
 EXPORT void* QnnExecuTorchAllocCustomMem(size_t bytes, size_t alignment) {
@@ -396,5 +394,3 @@ EXPORT void QnnExecuTorchFreeCustomMem(void* buffer_ptr) {
 #ifdef _WIN32
 #pragma clang diagnostic pop
 #endif
-
-} // extern "C"

From e8192ad3b3992f82e65ee5e010a33f6e4252da3e Mon Sep 17 00:00:00 2001
From: Hans <hans.chen@bricks.tools>
Date: Mon, 27 May 2024 18:21:30 +0800
Subject: [PATCH 11/25] Reduce dup-compile for custom kernels

---
 build/executorch-config.cmake         | 1 +
 examples/models/llama2/CMakeLists.txt | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake
index 1682a676f11..982bc097a0d 100644
--- a/build/executorch-config.cmake
+++ b/build/executorch-config.cmake
@@ -55,6 +55,7 @@ set(lib_list
     optimized_native_cpu_ops_lib
     quantized_kernels
     quantized_ops_lib
+    custom_ops
 )
 foreach(lib ${lib_list})
   # Name of the variable which stores result of the find_library search
diff --git a/examples/models/llama2/CMakeLists.txt b/examples/models/llama2/CMakeLists.txt
index 5044a5ce9bd..3bf5d97a795 100644
--- a/examples/models/llama2/CMakeLists.txt
+++ b/examples/models/llama2/CMakeLists.txt
@@ -126,7 +126,7 @@ endif()
 target_link_options_shared_lib(quantized_ops_lib)
 list(APPEND link_libraries quantized_kernels quantized_ops_lib)
 
-if(EXECUTORCH_BUILD_KERNELS_CUSTOM)
+if(EXECUTORCH_BUILD_KERNELS_CUSTOM OR TARGET custom_ops)
   target_link_options_shared_lib(custom_ops)
   list(APPEND link_libraries custom_ops)
 endif()

From e99fc7378704c0a494e0f477bebaa5ef724aacca Mon Sep 17 00:00:00 2001
From: Hans <hans.chen@bricks.tools>
Date: Wed, 29 May 2024 20:19:42 +0800
Subject: [PATCH 12/25] Use visibility instead dllexport

---
 backends/qualcomm/runtime/QnnManager.cpp | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp
index 5557767a7b1..d4be4675d5b 100644
--- a/backends/qualcomm/runtime/QnnManager.cpp
+++ b/backends/qualcomm/runtime/QnnManager.cpp
@@ -371,13 +371,7 @@ Error QnnManager::Compile(
 } // namespace executor
 } // namespace torch
 
-#ifdef _WIN32
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wdll-attribute-on-redeclaration"
-#define EXPORT __declspec(dllexport)
-#else
 #define EXPORT __attribute__((visibility("default")))
-#endif
 
 EXPORT void* QnnExecuTorchAllocCustomMem(size_t bytes, size_t alignment) {
   using torch::executor::qnn::SharedBuffer;
@@ -390,7 +384,3 @@ EXPORT void QnnExecuTorchFreeCustomMem(void* buffer_ptr) {
   using torch::executor::qnn::SharedBuffer;
   SharedBuffer::GetSharedBufferManager().FreeMem(buffer_ptr);
 }
-
-#ifdef _WIN32
-#pragma clang diagnostic pop
-#endif

From 4b6189f92f68109655744725a4617fa2479dd65b Mon Sep 17 00:00:00 2001
From: Hans <hans.chen@bricks.tools>
Date: Thu, 30 May 2024 02:05:01 +0800
Subject: [PATCH 13/25] Fix QNN for Windows

---
 backends/qualcomm/runtime/QnnManager.h                   | 6 ++++++
 backends/qualcomm/runtime/backends/QnnBackendCache.h     | 4 ++++
 backends/qualcomm/runtime/backends/QnnImplementation.cpp | 7 +++++--
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/backends/qualcomm/runtime/QnnManager.h b/backends/qualcomm/runtime/QnnManager.h
index 639d3534de4..091615c38ff 100644
--- a/backends/qualcomm/runtime/QnnManager.h
+++ b/backends/qualcomm/runtime/QnnManager.h
@@ -75,9 +75,15 @@ class QnnManager {
  private:
   Error LoadQnnLibrary();
 
+#ifdef _WIN32
+  static constexpr const char* htp_library_name_ = "QnnHtp.dll";
+  static constexpr const char* gpu_library_name_ = "QnnGpu.dll";
+  static constexpr const char* dsp_library_name_ = "QnnDsp.dll";
+#else
   static constexpr const char* htp_library_name_ = "libQnnHtp.so";
   static constexpr const char* gpu_library_name_ = "libQnnGpu.so";
   static constexpr const char* dsp_library_name_ = "libQnnDsp.so";
+#endif
 
   QnnExecuTorchContextBinary qnn_context_blob_;
   std::unique_ptr<BackendConfigParameters> backend_params_ptr_;
diff --git a/backends/qualcomm/runtime/backends/QnnBackendCache.h b/backends/qualcomm/runtime/backends/QnnBackendCache.h
index ad6d3d0bd7b..9cb77b4043c 100644
--- a/backends/qualcomm/runtime/backends/QnnBackendCache.h
+++ b/backends/qualcomm/runtime/backends/QnnBackendCache.h
@@ -58,7 +58,11 @@ class QnnBackendCache {
 
   QnnExecuTorchContextBinary qnn_context_blob_;
   QnnSystemContext_Handle_t sys_context_handle_{nullptr};
+#ifdef _WIN32
+  QnnSystemImplementation qnn_sys_impl_{ "QnnSystem.dll" };
+#else
   QnnSystemImplementation qnn_sys_impl_{"libQnnSystem.so"};
+#endif
   std::string graph_name_;
   std::vector<Qnn_Tensor_t> input_tensor_structs_;
   std::vector<Qnn_Tensor_t> output_tensor_structs_;
diff --git a/backends/qualcomm/runtime/backends/QnnImplementation.cpp b/backends/qualcomm/runtime/backends/QnnImplementation.cpp
index 5c537319c5c..71bfc151e66 100644
--- a/backends/qualcomm/runtime/backends/QnnImplementation.cpp
+++ b/backends/qualcomm/runtime/backends/QnnImplementation.cpp
@@ -69,12 +69,15 @@ Error QnnImplementation::StartBackend(
 #endif
 
   if (lib_handle == nullptr) {
+#ifdef _WIN32
     QNN_EXECUTORCH_LOG_ERROR(
-        "Cannot Open QNN library %s, with error: %s",
+        "Cannot Open QNN library %s, with error: %d",
         lib_path.c_str(),
-#ifdef _WIN32
         GetLastError());
 #else
+    QNN_EXECUTORCH_LOG_ERROR(
+        "Cannot Open QNN library %s, with error: %s",
+        lib_path.c_str(),
         dlerror());
 #endif
     return Error::Internal;

From b512236ec647abc0589481b200f773142305f8d3 Mon Sep 17 00:00:00 2001
From: Hans <hans.chen@bricks.tools>
Date: Thu, 30 May 2024 02:09:12 +0800
Subject: [PATCH 14/25] Support Windows platform

---
 build/Utils.cmake                   |  14 ++
 build/extract_sources.py            |  24 +++-
 runtime/platform/target/Windows.cpp | 215 ++++++++++++++++++++++++++++
 runtime/platform/targets.bzl        |   8 +-
 4 files changed, 256 insertions(+), 5 deletions(-)
 create mode 100644 runtime/platform/target/Windows.cpp

diff --git a/build/Utils.cmake b/build/Utils.cmake
index 56fc1e104b0..20ba25f7f1b 100644
--- a/build/Utils.cmake
+++ b/build/Utils.cmake
@@ -171,11 +171,25 @@ function(extract_sources sources_file)
       set(executorch_root ${CMAKE_CURRENT_SOURCE_DIR})
     endif()
 
+    # Buck2 only accepts fixed --fake-host/--fake-arch names, so normalize here.
+    string(TOLOWER "${CMAKE_SYSTEM_NAME}" TARGET_PLATFORM)
+    string(REPLACE "darwin" "macos" TARGET_PLATFORM "${TARGET_PLATFORM}")
+    string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" target_processor)
+    if(target_processor MATCHES "^(aarch64|arm64)$")
+      set(TARGET_ARCH "aarch64")
+    elseif(target_processor MATCHES "^(x86_64|amd64)$")
+      set(TARGET_ARCH "x8664")
+    else()
+      set(TARGET_ARCH "default")
+    endif()
+
     execute_process(
       COMMAND
         ${PYTHON_EXECUTABLE} ${executorch_root}/build/extract_sources.py
         --config=${executorch_root}/build/cmake_deps.toml --out=${sources_file}
         --buck2=${BUCK2}
+        --platform=${TARGET_PLATFORM}
+        --arch=${TARGET_ARCH}
       OUTPUT_VARIABLE gen_srcs_output
       ERROR_VARIABLE gen_srcs_error
       RESULT_VARIABLE gen_srcs_exit_code
diff --git a/build/extract_sources.py b/build/extract_sources.py
index ce8b3de9812..4ed8a842904 100755
--- a/build/extract_sources.py
+++ b/build/extract_sources.py
@@ -96,7 +96,13 @@ def __init__(
             else:
                 self._config[k] = v
 
-    def get_sources(self, graph: "Graph", runner: Buck2Runner) -> frozenset[str]:
+    def get_sources(
+        self,
+        graph: "Graph",
+        runner: Buck2Runner,
+        platform: str = "default",
+        arch: str = "default"
+    ) -> frozenset[str]:
         if self._state == Target._InitState.READY:
             return self._sources
         # Detect cycles.
@@ -113,7 +119,7 @@ def get_sources(self, graph: "Graph", runner: Buck2Runner) -> frozenset[str]:
         )
 
         # Get the complete list of source files that this target depends on.
-        sources: set[str] = set(runner.run(["cquery", query]))
+        sources: set[str] = set(runner.run(["cquery", query, "--fake-host", platform, "--fake-arch", arch]))
 
         # Keep entries that match all of the filters.
         filters = [re.compile(p) for p in self._config.get("filters", [])]
@@ -128,7 +134,7 @@ def get_sources(self, graph: "Graph", runner: Buck2Runner) -> frozenset[str]:
         # its deps. Remove entries that are already covered by the transitive
         # set of dependencies.
         for dep in self._config.get("deps", []):
-            sources.difference_update(graph.by_name[dep].get_sources(graph, runner))
+            sources.difference_update(graph.by_name[dep].get_sources(graph, runner, platform, arch))
 
         self._sources = frozenset(sources)
         self._state = Target._InitState.READY
@@ -173,6 +179,16 @@ def parse_args() -> argparse.Namespace:
         metavar="file",
         help="Path to the file to generate.",
     )
+    parser.add_argument(
+        "--platform",
+        default="default",
+        help="Target platform for the build system",
+    )
+    parser.add_argument(
+        "--arch",
+        default="default",
+        help="Target architecture for the build system",
+    )
     return parser.parse_args()
 
 
@@ -200,7 +216,7 @@ def main():
     target_to_srcs: dict[str, list[str]] = {}
     runner: Buck2Runner = Buck2Runner(args.buck2)
     for name, target in graph.by_name.items():
-        target_to_srcs[name] = sorted(target.get_sources(graph, runner))
+        target_to_srcs[name] = sorted(target.get_sources(graph, runner, args.platform, args.arch))
 
     # Generate the requested format.
     output: bytes
diff --git a/runtime/platform/target/Windows.cpp b/runtime/platform/target/Windows.cpp
new file mode 100644
index 00000000000..8eb66604082
--- /dev/null
+++ b/runtime/platform/target/Windows.cpp
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+/**
+ * @file
+ * Fallback PAL implementations for Windows systems.
+ *
+ * Note that this assumes that the platform defines the symbols used in this
+ * file (like fprintf()), because this file will still be built even if the
+ * functions are later overridden. When building for a platform that does not
+ * provide the necessary symbols, clients can use Minimal.cpp instead, but they
+ * will need to override all of the functions.
+ */
+
+// This cpp file will provide weak implementations of the symbols declared in
+// Platform.h. Client users can strongly define any or all of the functions to
+// override them.
+#define ET_INTERNAL_PLATFORM_WEAKNESS __ET_WEAK
+#include <executorch/runtime/platform/platform.h>
+
+#include <windows.h>
+#include <tchar.h>
+
+#include <chrono>
+#include <cinttypes>
+#include <cstdio>
+#include <cstdlib>
+
+#include <executorch/runtime/platform/compiler.h>
+
+#define SHARED_MEMORY_NAME "torch_executor_platform_init_time"
+
+// The FILE* to write logs to.
+#define ET_LOG_OUTPUT_FILE stderr
+
+/**
+ * On debug builds, ensure that `et_pal_init` has been called before
+ * other PAL functions which depend on initialization.
+ */
+#ifdef NDEBUG
+
+/**
+ * Assert that the PAL has been initialized.
+ */
+#define _ASSERT_PAL_INITIALIZED() ((void)0)
+
+#else // NDEBUG
+
+/**
+ * Assert that the PAL has been initialized.
+ */
+#define _ASSERT_PAL_INITIALIZED()                                   \
+  do {                                                              \
+    if (!check_shared_memory()) {                                   \
+      fprintf(                                                      \
+          ET_LOG_OUTPUT_FILE,                                       \
+          "ExecuTorch PAL must be initialized before call to %s()", \
+          __ET_FUNCTION);                                           \
+      fflush(ET_LOG_OUTPUT_FILE);                                   \
+      et_pal_abort();                                               \
+    }                                                               \
+  } while (0)
+
+#endif // NDEBUG
+
+/// Start time of the system (used to zero the system timestamp). Points into a
+/// named file mapping so that every module mapping that name shares one value.
+static std::shared_ptr<std::chrono::time_point<std::chrono::steady_clock>> systemStartTime = nullptr;
+
+/**
+ * Ensure `systemStartTime` points at the shared start-time slot. When this
+ * call creates the mapping, the slot is stamped with the current time.
+ *
+ * @retval true if the slot is mapped, false if the mapping could not be set up.
+ */
+bool check_shared_memory() {
+  using TimePoint = std::chrono::time_point<std::chrono::steady_clock>;
+  if (systemStartTime != nullptr) {
+    return true;
+  }
+
+  bool created = false;
+  HANDLE hMapFile = OpenFileMapping(
+    FILE_MAP_ALL_ACCESS,   // read/write access
+    FALSE,                 // do not inherit the name
+    _T(SHARED_MEMORY_NAME)  // name of mapping object
+  );
+  if (hMapFile == NULL) {
+    // Nothing to open yet: create the mapping, backed by the paging file.
+    hMapFile = CreateFileMapping(
+      INVALID_HANDLE_VALUE, NULL, PAGE_READWRITE, 0, sizeof(TimePoint), _T(SHARED_MEMORY_NAME));
+    // ERROR_ALREADY_EXISTS: someone else created it first and owns the stamp.
+    created = hMapFile != NULL && GetLastError() != ERROR_ALREADY_EXISTS;
+    if (hMapFile == NULL) {
+      return false;
+    }
+  }
+
+  // NOTE(review): hMapFile is intentionally not closed on success -- confirm.
+  void* view = MapViewOfFile(hMapFile, FILE_MAP_ALL_ACCESS, 0, 0, sizeof(TimePoint));
+  if (view == NULL) {
+    CloseHandle(hMapFile);
+    return false;
+  }
+  if (created) {
+    *static_cast<TimePoint*>(view) = std::chrono::steady_clock::now();
+  }
+  // A mapped view must be released with UnmapViewOfFile, never with `delete`.
+  systemStartTime = std::shared_ptr<TimePoint>(
+    static_cast<TimePoint*>(view), [](TimePoint* ptr) { UnmapViewOfFile(ptr); });
+  return true;
+}
+
+/**
+ * Initialize the platform abstraction layer.
+ *
+ * This function should be called before any other function provided by the PAL
+ * to initialize any global state. Typically overridden by PAL implementer.
+ */
+void et_pal_init(void) {
+  // check_shared_memory() stamps the slot on creation; nothing to do on failure.
+  (void)check_shared_memory();
+}
+
+/**
+ * Immediately abort execution, setting the device into an error state, if
+ * available.
+ */
+__ET_NORETURN void et_pal_abort(void) {
+  std::abort();
+}
+
+/**
+ * Return a monotonically non-decreasing timestamp in system ticks.
+ *
+ * @retval Timestamp value in system ticks.
+ */
+et_timestamp_t et_pal_current_ticks(void) {
+  _ASSERT_PAL_INITIALIZED();
+  auto systemCurrentTime = std::chrono::steady_clock::now();
+  return std::chrono::duration_cast<std::chrono::nanoseconds>(
+             systemCurrentTime - *systemStartTime)
+      .count();
+}
+
+/**
+ * Return the conversion rate from system ticks to nanoseconds, as a fraction.
+ * To convert an interval from system ticks to nanoseconds, multiply the tick
+ * count by the numerator and then divide by the denominator:
+ *   nanoseconds = ticks * numerator / denominator
+ *
+ * @retval The ratio of nanoseconds to system ticks.
+ */
+et_tick_ratio_t et_pal_ticks_to_ns_multiplier(void) {
+  // The system tick interval is 1 nanosecond, so the conversion factor is 1.
+  return {1, 1};
+}
+
+/**
+ * Emit a log message via platform output (serial port, console, etc).
+ *
+ * @param[in] timestamp Timestamp of the log event in system ticks since boot.
+ * @param[in] level Severity level of the message. Must be a printable 7-bit
+ *     ASCII uppercase letter.
+ * @param[in] filename Name of the file that created the log event.
+ * @param[in] function Name of the function that created the log event.
+ * @param[in] line Line in the source file where the log event was created.
+ * @param[in] message Message string to log.
+ * @param[in] length Message string length.
+ */
+void et_pal_emit_log_message(
+    et_timestamp_t timestamp,
+    et_pal_log_level_t level,
+    const char* filename,
+    __ET_UNUSED const char* function,
+    size_t line,
+    const char* message,
+    __ET_UNUSED size_t length) {
+  _ASSERT_PAL_INITIALIZED();
+
+  // Not all platforms have ticks == nanoseconds, but this one does.
+  timestamp /= 1000; // To microseconds
+  unsigned long int us = timestamp % 1000000;
+  timestamp /= 1000000; // To seconds
+  unsigned int sec = timestamp % 60;
+  timestamp /= 60; // To minutes
+  unsigned int min = timestamp % 60;
+  timestamp /= 60; // To hours
+  unsigned int hour = timestamp;
+
+  // Use a format similar to glog and folly::logging, except:
+  // - Print time since et_pal_init since we don't have wall time
+  // - Don't include the thread ID, to avoid adding a threading dependency
+  // - Add the string "executorch:" to make the logs more searchable
+  //
+  // Clients who want to change the format or add other fields can override this
+  // weak implementation of et_pal_emit_log_message.
+  fprintf(
+      ET_LOG_OUTPUT_FILE,
+      "%c %02u:%02u:%02u.%06lu executorch:%s:%zu] %s\n",
+      level,
+      hour,
+      min,
+      sec,
+      us,
+      filename,
+      line,
+      message);
+  fflush(ET_LOG_OUTPUT_FILE);
+}
diff --git a/runtime/platform/targets.bzl b/runtime/platform/targets.bzl
index 93685e322c6..d03129fe313 100644
--- a/runtime/platform/targets.bzl
+++ b/runtime/platform/targets.bzl
@@ -6,7 +6,12 @@ def _select_pal(dict_):
     `executorch.pal_default` build config value. Fails if no corresponding entry
     exists.
     """
-    pal_default = native.read_config("executorch", "pal_default", "posix")
+    # The fallback PAL follows the OS of the machine running Buck (the host),
+    # which is not necessarily the target platform. An explicit
+    # `executorch.pal_default` buckconfig value always takes precedence.
+    # NOTE(review): consider selecting on config//os: for cross builds.
+    host_pal = "windows" if host_info().os.is_windows else "posix"
+    pal_default = native.read_config("executorch", "pal_default", host_pal)
     if not pal_default in dict_:
         fail("Missing key for executorch.pal_default value '{}' in dict '{}'".format(pal_default, dict_))
     return dict_[pal_default]
@@ -44,6 +49,7 @@ def define_common_targets():
         srcs = _select_pal({
             "minimal": ["target/Minimal.cpp"],
             "posix": ["target/Posix.cpp"],
+            "windows": ["target/Windows.cpp"],
         }),
         deps = [
             ":pal_interface",

From f41acfd0f78f246c5c7e2bad07f589286af6425f Mon Sep 17 00:00:00 2001
From: Hans <hans.chen@bricks.tools>
Date: Thu, 30 May 2024 02:09:44 +0800
Subject: [PATCH 15/25] File loader support Windows API

---
 extension/data_loader/file_data_loader.cpp | 95 ++++++++++++++++++++++
 extension/data_loader/file_data_loader.h   | 15 +++-
 2 files changed, 107 insertions(+), 3 deletions(-)

diff --git a/extension/data_loader/file_data_loader.cpp b/extension/data_loader/file_data_loader.cpp
index f1a3d335c0d..47ed9d55fd9 100644
--- a/extension/data_loader/file_data_loader.cpp
+++ b/extension/data_loader/file_data_loader.cpp
@@ -12,10 +12,14 @@
 #include <cstddef>
 #include <cstring>
 
+#ifdef _WIN32
+#include <windows.h>
+#else
 #include <fcntl.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
+#endif
 
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/result.h>
@@ -54,9 +58,15 @@ FileDataLoader::~FileDataLoader() {
   // file_name_ can be nullptr if this instance was moved from, but freeing a
   // null pointer is safe.
   std::free(const_cast<char*>(file_name_));
+#ifdef _WIN32
+  if (fd_ != INVALID_HANDLE_VALUE) {
+    CloseHandle(fd_);
+  }
+#else
   // fd_ can be -1 if this instance was moved from, but closing a negative fd is
   // safe (though it will return an error).
   ::close(fd_);
+#endif
 }
 
 Result<FileDataLoader> FileDataLoader::from(
@@ -68,6 +78,34 @@ Result<FileDataLoader> FileDataLoader::from(
       "Alignment %zu is not a power of 2",
       alignment);
 
+#ifdef _WIN32
+  HANDLE fd = CreateFileA( // ANSI entry point: file_name is a narrow string.
+      file_name,
+      GENERIC_READ, // Read-only access.
+      FILE_SHARE_READ, // Allow other readers to open the file concurrently.
+      NULL, // Default security attributes.
+      OPEN_EXISTING, // Fail if the file does not already exist.
+      FILE_ATTRIBUTE_NORMAL,
+      NULL); // No template file.
+  if (fd == INVALID_HANDLE_VALUE) {
+    ET_LOG(
+        Error, "Failed to open %s: %lu", file_name, GetLastError());
+    return Error::AccessFailed;
+  }
+
+  LARGE_INTEGER file_size_li; // Receives the 64-bit file length.
+  if (!GetFileSizeEx(fd, &file_size_li)) {
+    ET_LOG(
+        Error,
+        "Could not get length of %s: %lu",
+        file_name,
+        GetLastError());
+    CloseHandle(fd); // Do not leak the handle on the error path.
+    return Error::AccessFailed;
+  }
+  size_t file_size = static_cast<size_t>(file_size_li.QuadPart);
+
+#else
   // Use open() instead of fopen() to avoid the layer of buffering that
   // fopen() does. We will be reading large portions of the file in one shot,
   // so buffering does not help.
@@ -92,12 +130,17 @@ Result<FileDataLoader> FileDataLoader::from(
     return Error::AccessFailed;
   }
   size_t file_size = st.st_size;
+#endif
 
   // Copy the filename so we can print better debug messages if reads fail.
   const char* file_name_copy = ::strdup(file_name);
   if (file_name_copy == nullptr) {
     ET_LOG(Error, "strdup(%s) failed", file_name);
+#ifdef _WIN32
+    CloseHandle(fd);
+#else
     ::close(fd);
+#endif
     return Error::MemoryAllocationFailed;
   }
 
@@ -121,7 +164,11 @@ void FreeSegment(void* context, void* data, __ET_UNUSED size_t size) {
 Result<FreeableBuffer> FileDataLoader::Load(size_t offset, size_t size) {
   ET_CHECK_OR_RETURN_ERROR(
       // Probably had its value moved to another instance.
+#ifdef _WIN32
+      fd_ != INVALID_HANDLE_VALUE,
+#else
       fd_ >= 0,
+#endif
       InvalidState,
       "Uninitialized");
   ET_CHECK_OR_RETURN_ERROR(
@@ -138,6 +185,20 @@ Result<FreeableBuffer> FileDataLoader::Load(size_t offset, size_t size) {
     return FreeableBuffer(nullptr, 0, /*free_fn=*/nullptr);
   }
 
+#ifdef _WIN32
+  // Seek to the right place in the file.
+  LARGE_INTEGER li;
+  li.QuadPart = offset;
+  if (!SetFilePointerEx(fd_, li, NULL, FILE_BEGIN)) {
+    ET_LOG(
+        Error,
+        "Seeking %s to offset %zu failed: %lu",
+        file_name_,
+        offset,
+        GetLastError());
+    return Error::AccessFailed;
+  }
+#else
   // Seek to the right place in the file.
   off_t seek_offset = ::lseek(fd_, offset, SEEK_SET);
   if (seek_offset != offset) {
@@ -150,6 +211,7 @@ Result<FreeableBuffer> FileDataLoader::Load(size_t offset, size_t size) {
         strerror(errno));
     return Error::AccessFailed;
   }
+#endif
 
   // Allocate memory for the FreeableBuffer.
   size_t alloc_size = size;
@@ -185,6 +247,34 @@ Result<FreeableBuffer> FileDataLoader::Load(size_t offset, size_t size) {
   // Read the data into the aligned address.
   size_t needed = size;
   uint8_t* buf = reinterpret_cast<uint8_t*>(aligned_buffer);
+#ifdef _WIN32
+  while (needed > 0) { // Loop until every requested byte has been read.
+    DWORD nread = 0; // Bytes actually read by this ReadFile call.
+    if (!ReadFile(fd_, buf, static_cast<DWORD>(needed > MAXDWORD ? MAXDWORD : needed), &nread, NULL)) {
+      ET_LOG(
+          Error,
+          "Reading from %s: failed to read %zu bytes at offset %zu: %lu",
+          file_name_,
+          size,
+          offset,
+          GetLastError());
+      std::free(buffer);
+      return Error::AccessFailed;
+    }
+    if (nread == 0) { // A successful zero-byte read means end of file.
+      ET_LOG(
+          Error,
+          "Reading from %s: failed to read %zu bytes at offset %zu: EOF",
+          file_name_,
+          size,
+          offset);
+      std::free(buffer);
+      return Error::AccessFailed;
+    }
+    needed -= nread;
+    buf += nread;
+  }
+#else
   while (needed > 0) {
     ssize_t nread = ::read(fd_, buf, needed);
     if (nread < 0 && errno == EINTR) {
@@ -208,6 +298,7 @@ Result<FreeableBuffer> FileDataLoader::Load(size_t offset, size_t size) {
     needed -= nread;
     buf += nread;
   }
+#endif
 
   // We can't naively free this pointer, since it may not be what malloc() gave
   // us. Pass the offset to the real buffer as context. This is the number of
@@ -228,7 +319,11 @@ Result<FreeableBuffer> FileDataLoader::Load(size_t offset, size_t size) {
 Result<size_t> FileDataLoader::size() const {
   ET_CHECK_OR_RETURN_ERROR(
       // Probably had its value moved to another instance.
+#ifdef _WIN32
+      fd_ != INVALID_HANDLE_VALUE,
+#else
       fd_ >= 0,
+#endif
       InvalidState,
       "Uninitialized");
   return file_size_;
diff --git a/extension/data_loader/file_data_loader.h b/extension/data_loader/file_data_loader.h
index ade3b515abf..1e6a00043d9 100644
--- a/extension/data_loader/file_data_loader.h
+++ b/extension/data_loader/file_data_loader.h
@@ -8,6 +8,15 @@
 
 #pragma once
 
+#ifdef _WIN32
+#include <windows.h>
+#define FD_TYPE HANDLE
+#define INVALID_FD INVALID_HANDLE_VALUE
+#else
+#define FD_TYPE int
+#define INVALID_FD -1
+#endif
+
 #include <cstddef>
 
 #include <executorch/runtime/core/data_loader.h>
@@ -60,7 +69,7 @@ class FileDataLoader : public DataLoader {
     rhs.file_name_ = nullptr;
     rhs.file_size_ = 0;
     rhs.alignment_ = 0;
-    rhs.fd_ = -1;
+    rhs.fd_ = INVALID_FD;
   }
 
   ~FileDataLoader() override;
@@ -72,7 +81,7 @@ class FileDataLoader : public DataLoader {
 
  private:
   FileDataLoader(
-      int fd,
+      FD_TYPE fd,
       size_t file_size,
       size_t alignment,
       const char* file_name)
@@ -89,7 +98,7 @@ class FileDataLoader : public DataLoader {
   const char* file_name_; // Owned by the instance.
   size_t file_size_;
   size_t alignment_;
-  int fd_; // Owned by the instance.
+  FD_TYPE fd_; // Owned by the instance.
 };
 
 } // namespace util

From f0a8b034f2e859c353748dbbd0f9c94d552765ee Mon Sep 17 00:00:00 2001
From: Hans <hans.chen@bricks.tools>
Date: Sat, 1 Jun 2024 04:18:57 +0800
Subject: [PATCH 16/25] Make cmake `LIB_xxx` point to correct lib path

---
 build/executorch-config.cmake | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake
index 982bc097a0d..a13861bc3c4 100644
--- a/build/executorch-config.cmake
+++ b/build/executorch-config.cmake
@@ -12,6 +12,10 @@
 
 cmake_minimum_required(VERSION 3.19)
 
+if(CMAKE_SYSTEM_NAME STREQUAL "Windows")
+  set(CMAKE_FIND_LIBRARY_SUFFIXES ".dll;.a") # NOTE(review): overwrites the existing suffix list (e.g. .lib, .dll.a); confirm consumers do not rely on it
+endif()
+
 set(_root "${CMAKE_CURRENT_LIST_DIR}/../..")
 set(required_lib_list executorch executorch_no_prim_ops portable_kernels)
 foreach(lib ${required_lib_list})
@@ -77,8 +81,7 @@ foreach(lib ${lib_list})
       # keep all libs as static when CMAKE_TOOLCHAIN_IOS is used
       add_library(${lib} STATIC IMPORTED)
     endif()
-    if ("${${lib_var}}" MATCHES ".dll.a$")
-      string(REGEX REPLACE ".dll.a$" ".dll" ${lib_var} "${${lib_var}}")
+    if ("${${lib_var}}" MATCHES "\\.dll$")
       set_target_properties(${lib} PROPERTIES IMPORTED_LOCATION "${${lib_var}}" IMPORTED_IMPLIB "${${lib_var}}.a")
     else()
       set_target_properties(${lib} PROPERTIES IMPORTED_LOCATION "${${lib_var}}")

From 49add79d4d66073fa88a8ad997498bd56d5d7c9b Mon Sep 17 00:00:00 2001
From: Hans <hans.chen@bricks.tools>
Date: Sun, 2 Jun 2024 01:34:34 +0800
Subject: [PATCH 17/25] Move shared memory management to platform implement

---
 runtime/backend/interface.cpp        |  86 ++++---------------
 runtime/kernel/operator_registry.cpp |  81 ++++--------------
 runtime/platform/platform.h          |  21 +++++
 runtime/platform/target/Minimal.cpp  |   8 ++
 runtime/platform/target/Posix.cpp    |  32 +++++++
 runtime/platform/target/Windows.cpp  | 119 +++++++++++++++++++--------
 6 files changed, 176 insertions(+), 171 deletions(-)

diff --git a/runtime/backend/interface.cpp b/runtime/backend/interface.cpp
index 3ec3af6307f..2e84d8e4210 100644
--- a/runtime/backend/interface.cpp
+++ b/runtime/backend/interface.cpp
@@ -8,87 +8,33 @@
 
 #include <executorch/runtime/backend/interface.h>
 #include <executorch/runtime/platform/assert.h>
+#include <executorch/runtime/platform/platform.h>
 
-#ifdef _WIN32
-#include <memory>
-#include <windows.h>
-#include <tchar.h>
-#define getpid GetCurrentProcessId
-#else
-#include <unistd.h>
-#endif
+namespace torch {
+namespace executor {
+
+PyTorchBackendInterface::~PyTorchBackendInterface() {}
 
 // Task t128866626: Remove global static variables.
 // We want to be able to run multiple Executor instances
 // and having a global registration isn't a viable solution
 // in the long term.
-#ifdef _WIN32
-
-#define SHARED_MEMORY_NAME "torch_executor_backend_registry"
-static std::shared_ptr<torch::executor::BackendRegistry> backend_reg;
-
+torch::executor::BackendRegistry& getBackendRegistry();
 torch::executor::BackendRegistry& getBackendRegistry() {
-  if (backend_reg != nullptr) {
-    return *backend_reg;
-  }
-
-  HANDLE hMapFile = OpenFileMapping(
-    FILE_MAP_ALL_ACCESS,   // read/write access
-    FALSE,                 // do not inherit the name
-    _T(SHARED_MEMORY_NAME)  // name of mapping object
+  // Operator registration happens in static initialization time when PAL init
+  // may or may not happen already. Here we are assuming et_pal_init() doesn't
+  // have any side effect even if falled multiple times.
+  ::et_pal_init();
+
+  static torch::executor::BackendRegistry* backend_reg = static_cast<torch::executor::BackendRegistry*>(
+    ::et_pal_get_shared_memory(
+      "torch_executor_backend_registry",
+      sizeof(torch::executor::BackendRegistry)
+    )
   );
-
-  if (hMapFile == NULL) {
-    // Create a new file mapping object
-    hMapFile = CreateFileMapping(
-      INVALID_HANDLE_VALUE,    // use paging file
-      NULL,                    // default security
-      PAGE_READWRITE,          // read/write access
-      0,                       // maximum object size (high-order DWORD)
-      sizeof(torch::executor::BackendRegistry),                // maximum object size (low-order DWORD)
-      _T(SHARED_MEMORY_NAME)   // name of mapping object
-    );
-    if (hMapFile == NULL) {
-      return *backend_reg;
-    }
-  }
-
-  torch::executor::BackendRegistry* registry = (torch::executor::BackendRegistry*) MapViewOfFile(
-    hMapFile,   // handle to map object
-    FILE_MAP_ALL_ACCESS, // read/write permission
-    0,
-    0,
-    sizeof(torch::executor::BackendRegistry)
-  );
-
-  if (registry == NULL) {
-    return *backend_reg;
-  }
-
-  if (backend_reg == nullptr) {
-    backend_reg = std::shared_ptr<torch::executor::BackendRegistry>(registry, [](torch::executor::BackendRegistry* ptr) {
-      UnmapViewOfFile(ptr);
-    });
-  }
-
   return *backend_reg;
 }
 
-#else
-
-torch::executor::BackendRegistry& getBackendRegistry();
-torch::executor::BackendRegistry& getBackendRegistry() {
-  static torch::executor::BackendRegistry backend_reg;
-  return backend_reg;
-}
-
-#endif
-
-namespace torch {
-namespace executor {
-
-PyTorchBackendInterface::~PyTorchBackendInterface() {}
-
 PyTorchBackendInterface* get_backend_class(const char* name) {
   return getBackendRegistry().get_backend_class(name);
 }
diff --git a/runtime/kernel/operator_registry.cpp b/runtime/kernel/operator_registry.cpp
index 02ee9b60ff7..e7bd216fde1 100644
--- a/runtime/kernel/operator_registry.cpp
+++ b/runtime/kernel/operator_registry.cpp
@@ -8,80 +8,32 @@
 
 #include <executorch/runtime/kernel/operator_registry.h>
 
+#include <executorch/runtime/platform/platform.h>
 #include <executorch/runtime/platform/runtime.h>
 #include <executorch/runtime/platform/system.h>
 #include <cinttypes>
 
 #include <executorch/runtime/platform/assert.h>
 
-#ifdef _WIN32
-#include <memory>
-#include <windows.h>
-#include <tchar.h>
-
-#define SHARED_MEMORY_NAME "torch_executor_operator_registry"
-static std::shared_ptr<torch::executor::OperatorRegistry> operator_reg;
+namespace torch {
+namespace executor {
 
+torch::executor::OperatorRegistry& getOperatorRegistry();
 torch::executor::OperatorRegistry& getOperatorRegistry() {
-  if (operator_reg != nullptr) {
-    return *operator_reg;
-  }
-
-  HANDLE hMapFile = OpenFileMapping(
-    FILE_MAP_ALL_ACCESS,   // read/write access
-    FALSE,                 // do not inherit the name
-    _T(SHARED_MEMORY_NAME)  // name of mapping object
-  );
-
-  if (hMapFile == NULL) {
-    // Create a new file mapping object
-    hMapFile = CreateFileMapping(
-      INVALID_HANDLE_VALUE,    // use paging file
-      NULL,                    // default security
-      PAGE_READWRITE,          // read/write access
-      0,                       // maximum object size (high-order DWORD)
-      sizeof(torch::executor::OperatorRegistry),                // maximum object size (low-order DWORD)
-      _T(SHARED_MEMORY_NAME)   // name of mapping object
-    );
-    if (hMapFile == NULL) {
-      return *operator_reg;
-    }
-  }
+  // Operator registration happens in static initialization time when PAL init
+  // may or may not happen already. Here we are assuming et_pal_init() doesn't
+  // have any side effect even if falled multiple times.
+  ::et_pal_init();
 
-  torch::executor::OperatorRegistry* registry = (torch::executor::OperatorRegistry*) MapViewOfFile(
-    hMapFile,   // handle to map object
-    FILE_MAP_ALL_ACCESS, // read/write permission
-    0,
-    0,
-    sizeof(torch::executor::OperatorRegistry)
+  static torch::executor::OperatorRegistry* operator_registry = static_cast<torch::executor::OperatorRegistry*>(
+    ::et_pal_get_shared_memory(
+      "torch_executor_operator_registry",
+      sizeof(torch::executor::OperatorRegistry)
+    )
   );
-
-  if (registry == NULL) {
-    return *operator_reg;
-  }
-
-  if (operator_reg == nullptr) {
-    operator_reg = std::shared_ptr<torch::executor::OperatorRegistry>(registry, [](torch::executor::OperatorRegistry* ptr) {
-      UnmapViewOfFile(ptr);
-    });
-  }
-
-  return *operator_reg;
-}
-
-#else
-
-torch::executor::OperatorRegistry& getOperatorRegistry();
-torch::executor::OperatorRegistry& getOperatorRegistry() {
-  static torch::executor::OperatorRegistry operator_registry;
-  return operator_registry;
+  return *operator_registry;
 }
 
-#endif
-
-namespace torch {
-namespace executor {
-
 Error register_kernels(const ArrayRef<Kernel>& kernels) {
   Error success = getOperatorRegistry().register_kernels(kernels);
   if (success == Error::InvalidArgument || success == Error::Internal) {
@@ -95,11 +47,6 @@ Error register_kernels(const ArrayRef<Kernel>& kernels) {
 }
 
 Error OperatorRegistry::register_kernels(const ArrayRef<Kernel>& kernels) {
-  // Operator registration happens in static initialization time when PAL init
-  // may or may not happen already. Here we are assuming et_pal_init() doesn't
-  // have any side effect even if falled multiple times.
-  ::et_pal_init();
-
   if (kernels.size() + this->num_kernels_ > kMaxNumOfKernels) {
     ET_LOG(
         Error,
diff --git a/runtime/platform/platform.h b/runtime/platform/platform.h
index 1f1b3b4c173..a166166e2f0 100644
--- a/runtime/platform/platform.h
+++ b/runtime/platform/platform.h
@@ -115,4 +115,25 @@ void et_pal_emit_log_message(
     const char* message,
     size_t length) ET_INTERNAL_PLATFORM_WEAKNESS;
 
+/**
+ * Get a shared memory region by name.
+ * If the shared memory region does not exist, it will be created.
+ * 
+ * @param[in] name Name of the shared memory region.
+ * @param[in] size Size of the shared memory region in bytes.
+ * 
+ * @retval A pointer to the shared memory region on success. nullptr on failure.
+ */
+void* et_pal_get_shared_memory(
+    const char* name,
+    size_t size) ET_INTERNAL_PLATFORM_WEAKNESS;
+
+/**
+ * Free a shared memory region by name.
+ * 
+ * @param[in] name Name of the shared memory region.
+ */
+void et_pal_free_shared_memory(
+    const char* name) ET_INTERNAL_PLATFORM_WEAKNESS;
+
 } // extern "C"
diff --git a/runtime/platform/target/Minimal.cpp b/runtime/platform/target/Minimal.cpp
index ed6cac7392c..65949e9c535 100644
--- a/runtime/platform/target/Minimal.cpp
+++ b/runtime/platform/target/Minimal.cpp
@@ -47,3 +47,11 @@ void et_pal_emit_log_message(
     __ET_UNUSED size_t line,
     __ET_UNUSED const char* message,
     __ET_UNUSED size_t length) {}
+
+void* et_pal_get_shared_memory(
+    __ET_UNUSED const char* name,
+    __ET_UNUSED size_t size) {
+  return nullptr;
+}
+
+void et_pal_free_shared_memory(__ET_UNUSED void* ptr) {}
diff --git a/runtime/platform/target/Posix.cpp b/runtime/platform/target/Posix.cpp
index f434043d838..79ea3b29c73 100644
--- a/runtime/platform/target/Posix.cpp
+++ b/runtime/platform/target/Posix.cpp
@@ -27,6 +27,9 @@
 #include <cinttypes>
 #include <cstdio>
 #include <cstdlib>
+#include <unordered_map>
+#include <memory>
+#include <string>
 
 #include <executorch/runtime/platform/compiler.h>
 
@@ -69,6 +72,9 @@ static std::chrono::time_point<std::chrono::steady_clock> systemStartTime;
 /// Flag set to true if the PAL has been successfully initialized.
 static bool initialized = false;
 
+// Shared memory
+static std::unordered_map<std::string, std::shared_ptr<void>> sharedMemoryMap;
+
 /**
  * Initialize the platform abstraction layer.
  *
@@ -170,3 +176,29 @@ void et_pal_emit_log_message(
       message);
   fflush(ET_LOG_OUTPUT_FILE);
 }
+
+void* et_pal_get_shared_memory(
+    const char* name,
+    size_t size) {
+  _ASSERT_PAL_INITIALIZED();
+
+  auto it = sharedMemoryMap.find(name);
+  if (it != sharedMemoryMap.end()) {
+    return it->second.get();
+  }
+
+  auto sharedMemory = std::shared_ptr<void>(malloc(size), free);
+  if (sharedMemory == nullptr) {
+    return nullptr;
+  }
+
+  sharedMemoryMap[name] = sharedMemory;
+  return sharedMemory.get();
+}
+
+void et_pal_free_shared_memory(
+    const char* name) {
+  _ASSERT_PAL_INITIALIZED();
+
+  sharedMemoryMap.erase(name);
+}
diff --git a/runtime/platform/target/Windows.cpp b/runtime/platform/target/Windows.cpp
index 8eb66604082..8b568da9fa1 100644
--- a/runtime/platform/target/Windows.cpp
+++ b/runtime/platform/target/Windows.cpp
@@ -30,11 +30,12 @@
 #include <cinttypes>
 #include <cstdio>
 #include <cstdlib>
+#include <unordered_map>
+#include <memory>
+#include <string>
 
 #include <executorch/runtime/platform/compiler.h>
 
-#define SHARED_MEMORY_NAME "torch_executor_platform_init_time"
-
 // The FILE* to write logs to.
 #define ET_LOG_OUTPUT_FILE stderr
 
@@ -69,48 +70,30 @@
 #endif // NDEBUG
 
 /// Start time of the system (used to zero the system timestamp).
-static std::shared_ptr<std::chrono::time_point<std::chrono::steady_clock>> systemStartTime = nullptr;
+static std::chrono::time_point<std::chrono::steady_clock>* systemStartTime = nullptr;
+
+// Shared memory
+typedef std::pair<std::shared_ptr<void>, HANDLE> SharedMemory;
+static std::unordered_map<std::string, SharedMemory> sharedMemoryMap;
 
+void* et_pal_get_shared_memory_internal(
+    const char* name,
+    size_t size);
 bool check_shared_memory() {
   if (systemStartTime != nullptr) {
     return true;
   }
 
-  HANDLE hMapFile = OpenFileMapping(
-    FILE_MAP_ALL_ACCESS,   // read/write access
-    FALSE,                 // do not inherit the name
-    _T(SHARED_MEMORY_NAME)  // name of mapping object
-  );
-
-  if (hMapFile == NULL) {
-    // Create a new file mapping object
-    hMapFile = CreateFileMapping(
-      INVALID_HANDLE_VALUE,    // use paging file
-      NULL,                    // default security
-      PAGE_READWRITE,          // read/write access
-      0,                       // maximum object size (high-order DWORD)
-      sizeof(std::chrono::time_point<std::chrono::steady_clock>),                // maximum object size (low-order DWORD)
-      _T(SHARED_MEMORY_NAME)   // name of mapping object
-    );
-    if (hMapFile == NULL) {
-      return false;
-    }
-  }
-
-  systemStartTime =std::shared_ptr<std::chrono::time_point<std::chrono::steady_clock>>(
-    (std::chrono::time_point<std::chrono::steady_clock>*) MapViewOfFile(
-      hMapFile,   // handle to map object
-      FILE_MAP_ALL_ACCESS, // read/write permission
-      0,
-      0,
-      sizeof(std::chrono::time_point<std::chrono::steady_clock>)
-    )
+  void *sharedMemory = et_pal_get_shared_memory_internal(
+    "torch_executor_platform_init_time",
+    sizeof(std::chrono::time_point<std::chrono::steady_clock>)
   );
-
-  if (systemStartTime == nullptr) {
+  if (sharedMemory == nullptr) {
     return false;
   }
 
+  systemStartTime = static_cast<std::chrono::time_point<std::chrono::steady_clock>*>(sharedMemory);
+  *systemStartTime = std::chrono::steady_clock::now();
   return true;
 }
 
@@ -213,3 +196,71 @@ void et_pal_emit_log_message(
       message);
   fflush(ET_LOG_OUTPUT_FILE);
 }
+
+void* et_pal_get_shared_memory_internal(
+    const char* name,
+    size_t size) {
+
+  auto it = sharedMemoryMap.find(name);
+  if (it != sharedMemoryMap.end()) {
+    return it->second.first.get();
+  }
+
+  HANDLE hMapFile = OpenFileMapping(
+      FILE_MAP_ALL_ACCESS,
+      FALSE,
+      name);
+  if (hMapFile == NULL) {
+    hMapFile = CreateFileMapping(
+        INVALID_HANDLE_VALUE,
+        NULL,
+        PAGE_READWRITE,
+        0,
+        size,
+        name);
+    if (hMapFile == NULL) {
+      return nullptr;
+    }
+  }
+
+  void* sharedMemory = MapViewOfFile(
+      hMapFile,
+      FILE_MAP_ALL_ACCESS,
+      0,
+      0,
+      size);
+  if (sharedMemory == NULL) {
+    CloseHandle(hMapFile);
+    return nullptr;
+  }
+
+  sharedMemoryMap[name] = {
+    std::shared_ptr<void>(
+      sharedMemory,
+      [hMapFile](void* ptr) {
+        UnmapViewOfFile(ptr);
+        CloseHandle(hMapFile);
+      }
+    ),
+    hMapFile
+  };
+  return sharedMemoryMap[name].first.get();
+}
+
+void* et_pal_get_shared_memory(
+    const char* name,
+    size_t size) {
+  _ASSERT_PAL_INITIALIZED();
+  return et_pal_get_shared_memory_internal(name, size);
+}
+
+void et_pal_free_shared_memory(
+    const char* name) {
+  _ASSERT_PAL_INITIALIZED();
+  auto it = sharedMemoryMap.find(name);
+  if (it == sharedMemoryMap.end()) {
+    return;
+  }
+
+  sharedMemoryMap.erase(it);
+}

From ba90d0b1f6b63a42b381503c6d36e6820b9167e7 Mon Sep 17 00:00:00 2001
From: Hans <hans.chen@bricks.tools>
Date: Mon, 3 Jun 2024 02:01:27 +0800
Subject: [PATCH 18/25] Revert "Move shared memory management to platform
 implement"

This reverts commit 49add79d4d66073fa88a8ad997498bd56d5d7c9b.
---
 runtime/backend/interface.cpp        |  86 +++++++++++++++----
 runtime/kernel/operator_registry.cpp |  81 ++++++++++++++----
 runtime/platform/platform.h          |  21 -----
 runtime/platform/target/Minimal.cpp  |   8 --
 runtime/platform/target/Posix.cpp    |  32 -------
 runtime/platform/target/Windows.cpp  | 119 ++++++++-------------------
 6 files changed, 171 insertions(+), 176 deletions(-)

diff --git a/runtime/backend/interface.cpp b/runtime/backend/interface.cpp
index 2e84d8e4210..3ec3af6307f 100644
--- a/runtime/backend/interface.cpp
+++ b/runtime/backend/interface.cpp
@@ -8,33 +8,87 @@
 
 #include <executorch/runtime/backend/interface.h>
 #include <executorch/runtime/platform/assert.h>
-#include <executorch/runtime/platform/platform.h>
 
-namespace torch {
-namespace executor {
-
-PyTorchBackendInterface::~PyTorchBackendInterface() {}
+#ifdef _WIN32
+#include <memory>
+#include <windows.h>
+#include <tchar.h>
+#define getpid GetCurrentProcessId
+#else
+#include <unistd.h>
+#endif
 
 // Task t128866626: Remove global static variables.
 // We want to be able to run multiple Executor instances
 // and having a global registration isn't a viable solution
 // in the long term.
-torch::executor::BackendRegistry& getBackendRegistry();
+#ifdef _WIN32
+
+#define SHARED_MEMORY_NAME "torch_executor_backend_registry"
+static std::shared_ptr<torch::executor::BackendRegistry> backend_reg;
+
 torch::executor::BackendRegistry& getBackendRegistry() {
-  // Operator registration happens in static initialization time when PAL init
-  // may or may not happen already. Here we are assuming et_pal_init() doesn't
-  // have any side effect even if falled multiple times.
-  ::et_pal_init();
-
-  static torch::executor::BackendRegistry* backend_reg = static_cast<torch::executor::BackendRegistry*>(
-    ::et_pal_get_shared_memory(
-      "torch_executor_backend_registry",
-      sizeof(torch::executor::BackendRegistry)
-    )
+  if (backend_reg != nullptr) {
+    return *backend_reg;
+  }
+
+  HANDLE hMapFile = OpenFileMapping(
+    FILE_MAP_ALL_ACCESS,   // read/write access
+    FALSE,                 // do not inherit the name
+    _T(SHARED_MEMORY_NAME)  // name of mapping object
   );
+
+  if (hMapFile == NULL) {
+    // Create a new file mapping object
+    hMapFile = CreateFileMapping(
+      INVALID_HANDLE_VALUE,    // use paging file
+      NULL,                    // default security
+      PAGE_READWRITE,          // read/write access
+      0,                       // maximum object size (high-order DWORD)
+      sizeof(torch::executor::BackendRegistry),                // maximum object size (low-order DWORD)
+      _T(SHARED_MEMORY_NAME)   // name of mapping object
+    );
+    if (hMapFile == NULL) {
+      return *backend_reg;
+    }
+  }
+
+  torch::executor::BackendRegistry* registry = (torch::executor::BackendRegistry*) MapViewOfFile(
+    hMapFile,   // handle to map object
+    FILE_MAP_ALL_ACCESS, // read/write permission
+    0,
+    0,
+    sizeof(torch::executor::BackendRegistry)
+  );
+
+  if (registry == NULL) {
+    return *backend_reg;
+  }
+
+  if (backend_reg == nullptr) {
+    backend_reg = std::shared_ptr<torch::executor::BackendRegistry>(registry, [](torch::executor::BackendRegistry* ptr) {
+      UnmapViewOfFile(ptr);
+    });
+  }
+
   return *backend_reg;
 }
 
+#else
+
+torch::executor::BackendRegistry& getBackendRegistry();
+torch::executor::BackendRegistry& getBackendRegistry() {
+  static torch::executor::BackendRegistry backend_reg;
+  return backend_reg;
+}
+
+#endif
+
+namespace torch {
+namespace executor {
+
+PyTorchBackendInterface::~PyTorchBackendInterface() {}
+
 PyTorchBackendInterface* get_backend_class(const char* name) {
   return getBackendRegistry().get_backend_class(name);
 }
diff --git a/runtime/kernel/operator_registry.cpp b/runtime/kernel/operator_registry.cpp
index e7bd216fde1..02ee9b60ff7 100644
--- a/runtime/kernel/operator_registry.cpp
+++ b/runtime/kernel/operator_registry.cpp
@@ -8,32 +8,80 @@
 
 #include <executorch/runtime/kernel/operator_registry.h>
 
-#include <executorch/runtime/platform/platform.h>
 #include <executorch/runtime/platform/runtime.h>
 #include <executorch/runtime/platform/system.h>
 #include <cinttypes>
 
 #include <executorch/runtime/platform/assert.h>
 
-namespace torch {
-namespace executor {
+#ifdef _WIN32
+#include <memory>
+#include <windows.h>
+#include <tchar.h>
+
+#define SHARED_MEMORY_NAME "torch_executor_operator_registry"
+static std::shared_ptr<torch::executor::OperatorRegistry> operator_reg;
 
-torch::executor::OperatorRegistry& getOperatorRegistry();
 torch::executor::OperatorRegistry& getOperatorRegistry() {
-  // Operator registration happens in static initialization time when PAL init
-  // may or may not happen already. Here we are assuming et_pal_init() doesn't
-  // have any side effect even if falled multiple times.
-  ::et_pal_init();
+  if (operator_reg != nullptr) {
+    return *operator_reg;
+  }
 
-  static torch::executor::OperatorRegistry* operator_registry = static_cast<torch::executor::OperatorRegistry*>(
-    ::et_pal_get_shared_memory(
-      "torch_executor_operator_registry",
-      sizeof(torch::executor::OperatorRegistry)
-    )
+  HANDLE hMapFile = OpenFileMapping(
+    FILE_MAP_ALL_ACCESS,   // read/write access
+    FALSE,                 // do not inherit the name
+    _T(SHARED_MEMORY_NAME)  // name of mapping object
   );
-  return *operator_registry;
+
+  if (hMapFile == NULL) {
+    // Create a new file mapping object
+    hMapFile = CreateFileMapping(
+      INVALID_HANDLE_VALUE,    // use paging file
+      NULL,                    // default security
+      PAGE_READWRITE,          // read/write access
+      0,                       // maximum object size (high-order DWORD)
+      sizeof(torch::executor::OperatorRegistry),                // maximum object size (low-order DWORD)
+      _T(SHARED_MEMORY_NAME)   // name of mapping object
+    );
+    if (hMapFile == NULL) {
+      return *operator_reg;
+    }
+  }
+
+  torch::executor::OperatorRegistry* registry = (torch::executor::OperatorRegistry*) MapViewOfFile(
+    hMapFile,   // handle to map object
+    FILE_MAP_ALL_ACCESS, // read/write permission
+    0,
+    0,
+    sizeof(torch::executor::OperatorRegistry)
+  );
+
+  if (registry == NULL) {
+    return *operator_reg;
+  }
+
+  if (operator_reg == nullptr) {
+    operator_reg = std::shared_ptr<torch::executor::OperatorRegistry>(registry, [](torch::executor::OperatorRegistry* ptr) {
+      UnmapViewOfFile(ptr);
+    });
+  }
+
+  return *operator_reg;
+}
+
+#else
+
+torch::executor::OperatorRegistry& getOperatorRegistry();
+torch::executor::OperatorRegistry& getOperatorRegistry() {
+  static torch::executor::OperatorRegistry operator_registry;
+  return operator_registry;
 }
 
+#endif
+
+namespace torch {
+namespace executor {
+
 Error register_kernels(const ArrayRef<Kernel>& kernels) {
   Error success = getOperatorRegistry().register_kernels(kernels);
   if (success == Error::InvalidArgument || success == Error::Internal) {
@@ -47,6 +95,11 @@ Error register_kernels(const ArrayRef<Kernel>& kernels) {
 }
 
 Error OperatorRegistry::register_kernels(const ArrayRef<Kernel>& kernels) {
+  // Operator registration happens in static initialization time when PAL init
+  // may or may not happen already. Here we are assuming et_pal_init() doesn't
+  // have any side effect even if falled multiple times.
+  ::et_pal_init();
+
   if (kernels.size() + this->num_kernels_ > kMaxNumOfKernels) {
     ET_LOG(
         Error,
diff --git a/runtime/platform/platform.h b/runtime/platform/platform.h
index a166166e2f0..1f1b3b4c173 100644
--- a/runtime/platform/platform.h
+++ b/runtime/platform/platform.h
@@ -115,25 +115,4 @@ void et_pal_emit_log_message(
     const char* message,
     size_t length) ET_INTERNAL_PLATFORM_WEAKNESS;
 
-/**
- * Get a shared memory region by name.
- * If the shared memory region does not exist, it will be created.
- * 
- * @param[in] name Name of the shared memory region.
- * @param[in] size Size of the shared memory region in bytes.
- * 
- * @retval A pointer to the shared memory region on success. nullptr on failure.
- */
-void* et_pal_get_shared_memory(
-    const char* name,
-    size_t size) ET_INTERNAL_PLATFORM_WEAKNESS;
-
-/**
- * Free a shared memory region by name.
- * 
- * @param[in] name Name of the shared memory region.
- */
-void et_pal_free_shared_memory(
-    const char* name) ET_INTERNAL_PLATFORM_WEAKNESS;
-
 } // extern "C"
diff --git a/runtime/platform/target/Minimal.cpp b/runtime/platform/target/Minimal.cpp
index 65949e9c535..ed6cac7392c 100644
--- a/runtime/platform/target/Minimal.cpp
+++ b/runtime/platform/target/Minimal.cpp
@@ -47,11 +47,3 @@ void et_pal_emit_log_message(
     __ET_UNUSED size_t line,
     __ET_UNUSED const char* message,
     __ET_UNUSED size_t length) {}
-
-void* et_pal_get_shared_memory(
-    __ET_UNUSED const char* name,
-    __ET_UNUSED size_t size) {
-  return nullptr;
-}
-
-void et_pal_free_shared_memory(__ET_UNUSED void* ptr) {}
diff --git a/runtime/platform/target/Posix.cpp b/runtime/platform/target/Posix.cpp
index 79ea3b29c73..f434043d838 100644
--- a/runtime/platform/target/Posix.cpp
+++ b/runtime/platform/target/Posix.cpp
@@ -27,9 +27,6 @@
 #include <cinttypes>
 #include <cstdio>
 #include <cstdlib>
-#include <unordered_map>
-#include <memory>
-#include <string>
 
 #include <executorch/runtime/platform/compiler.h>
 
@@ -72,9 +69,6 @@ static std::chrono::time_point<std::chrono::steady_clock> systemStartTime;
 /// Flag set to true if the PAL has been successfully initialized.
 static bool initialized = false;
 
-// Shared memory
-static std::unordered_map<std::string, std::shared_ptr<void>> sharedMemoryMap;
-
 /**
  * Initialize the platform abstraction layer.
  *
@@ -176,29 +170,3 @@ void et_pal_emit_log_message(
       message);
   fflush(ET_LOG_OUTPUT_FILE);
 }
-
-void* et_pal_get_shared_memory(
-    const char* name,
-    size_t size) {
-  _ASSERT_PAL_INITIALIZED();
-
-  auto it = sharedMemoryMap.find(name);
-  if (it != sharedMemoryMap.end()) {
-    return it->second.get();
-  }
-
-  auto sharedMemory = std::shared_ptr<void>(malloc(size), free);
-  if (sharedMemory == nullptr) {
-    return nullptr;
-  }
-
-  sharedMemoryMap[name] = sharedMemory;
-  return sharedMemory.get();
-}
-
-void et_pal_free_shared_memory(
-    const char* name) {
-  _ASSERT_PAL_INITIALIZED();
-
-  sharedMemoryMap.erase(name);
-}
diff --git a/runtime/platform/target/Windows.cpp b/runtime/platform/target/Windows.cpp
index 8b568da9fa1..8eb66604082 100644
--- a/runtime/platform/target/Windows.cpp
+++ b/runtime/platform/target/Windows.cpp
@@ -30,12 +30,11 @@
 #include <cinttypes>
 #include <cstdio>
 #include <cstdlib>
-#include <unordered_map>
-#include <memory>
-#include <string>
 
 #include <executorch/runtime/platform/compiler.h>
 
+#define SHARED_MEMORY_NAME "torch_executor_platform_init_time"
+
 // The FILE* to write logs to.
 #define ET_LOG_OUTPUT_FILE stderr
 
@@ -70,30 +69,48 @@
 #endif // NDEBUG
 
 /// Start time of the system (used to zero the system timestamp).
-static std::chrono::time_point<std::chrono::steady_clock>* systemStartTime = nullptr;
-
-// Shared memory
-typedef std::pair<std::shared_ptr<void>, HANDLE> SharedMemory;
-static std::unordered_map<std::string, SharedMemory> sharedMemoryMap;
+static std::shared_ptr<std::chrono::time_point<std::chrono::steady_clock>> systemStartTime = nullptr;
 
-void* et_pal_get_shared_memory_internal(
-    const char* name,
-    size_t size);
 bool check_shared_memory() {
   if (systemStartTime != nullptr) {
     return true;
   }
 
-  void *sharedMemory = et_pal_get_shared_memory_internal(
-    "torch_executor_platform_init_time",
-    sizeof(std::chrono::time_point<std::chrono::steady_clock>)
+  HANDLE hMapFile = OpenFileMapping(
+    FILE_MAP_ALL_ACCESS,   // read/write access
+    FALSE,                 // do not inherit the name
+    _T(SHARED_MEMORY_NAME)  // name of mapping object
+  );
+
+  if (hMapFile == NULL) {
+    // Create a new file mapping object
+    hMapFile = CreateFileMapping(
+      INVALID_HANDLE_VALUE,    // use paging file
+      NULL,                    // default security
+      PAGE_READWRITE,          // read/write access
+      0,                       // maximum object size (high-order DWORD)
+      sizeof(std::chrono::time_point<std::chrono::steady_clock>),                // maximum object size (low-order DWORD)
+      _T(SHARED_MEMORY_NAME)   // name of mapping object
+    );
+    if (hMapFile == NULL) {
+      return false;
+    }
+  }
+
+  systemStartTime =std::shared_ptr<std::chrono::time_point<std::chrono::steady_clock>>(
+    (std::chrono::time_point<std::chrono::steady_clock>*) MapViewOfFile(
+      hMapFile,   // handle to map object
+      FILE_MAP_ALL_ACCESS, // read/write permission
+      0,
+      0,
+      sizeof(std::chrono::time_point<std::chrono::steady_clock>)
+    )
   );
-  if (sharedMemory == nullptr) {
+
+  if (systemStartTime == nullptr) {
     return false;
   }
 
-  systemStartTime = static_cast<std::chrono::time_point<std::chrono::steady_clock>*>(sharedMemory);
-  *systemStartTime = std::chrono::steady_clock::now();
   return true;
 }
 
@@ -196,71 +213,3 @@ void et_pal_emit_log_message(
       message);
   fflush(ET_LOG_OUTPUT_FILE);
 }
-
-void* et_pal_get_shared_memory_internal(
-    const char* name,
-    size_t size) {
-
-  auto it = sharedMemoryMap.find(name);
-  if (it != sharedMemoryMap.end()) {
-    return it->second.first.get();
-  }
-
-  HANDLE hMapFile = OpenFileMapping(
-      FILE_MAP_ALL_ACCESS,
-      FALSE,
-      name);
-  if (hMapFile == NULL) {
-    hMapFile = CreateFileMapping(
-        INVALID_HANDLE_VALUE,
-        NULL,
-        PAGE_READWRITE,
-        0,
-        size,
-        name);
-    if (hMapFile == NULL) {
-      return nullptr;
-    }
-  }
-
-  void* sharedMemory = MapViewOfFile(
-      hMapFile,
-      FILE_MAP_ALL_ACCESS,
-      0,
-      0,
-      size);
-  if (sharedMemory == NULL) {
-    CloseHandle(hMapFile);
-    return nullptr;
-  }
-
-  sharedMemoryMap[name] = {
-    std::shared_ptr<void>(
-      sharedMemory,
-      [hMapFile](void* ptr) {
-        UnmapViewOfFile(ptr);
-        CloseHandle(hMapFile);
-      }
-    ),
-    hMapFile
-  };
-  return sharedMemoryMap[name].first.get();
-}
-
-void* et_pal_get_shared_memory(
-    const char* name,
-    size_t size) {
-  _ASSERT_PAL_INITIALIZED();
-  return et_pal_get_shared_memory_internal(name, size);
-}
-
-void et_pal_free_shared_memory(
-    const char* name) {
-  _ASSERT_PAL_INITIALIZED();
-  auto it = sharedMemoryMap.find(name);
-  if (it == sharedMemoryMap.end()) {
-    return;
-  }
-
-  sharedMemoryMap.erase(it);
-}

From 06a44297ca30a6d1be648c9bdbfd3ae0fca80462 Mon Sep 17 00:00:00 2001
From: Hans <hans.chen@bricks.tools>
Date: Mon, 3 Jun 2024 02:00:35 +0800
Subject: [PATCH 19/25] Fix time release

---
 runtime/platform/target/Windows.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/runtime/platform/target/Windows.cpp b/runtime/platform/target/Windows.cpp
index 8eb66604082..b6d387525a2 100644
--- a/runtime/platform/target/Windows.cpp
+++ b/runtime/platform/target/Windows.cpp
@@ -104,7 +104,10 @@ bool check_shared_memory() {
       0,
       0,
       sizeof(std::chrono::time_point<std::chrono::steady_clock>)
-    )
+    ),
+    [](std::chrono::time_point<std::chrono::steady_clock>* ptr) {
+      UnmapViewOfFile(ptr);
+    }
   );
 
   if (systemStartTime == nullptr) {

From b013a53c335652c4ebfad56e4857d4f3dc2b314e Mon Sep 17 00:00:00 2001
From: Hans <hans.chen@bricks.tools>
Date: Mon, 23 Sep 2024 22:55:52 +0800
Subject: [PATCH 20/25] Fix build

---
 backends/qualcomm/CMakeLists.txt           |  8 ++--
 build/Utils.cmake                          |  9 +++-
 build/extract_sources.py                   | 13 ++++++
 extension/data_loader/file_data_loader.cpp | 48 +++++++++++-----------
 extension/data_loader/file_data_loader.h   |  4 +-
 runtime/platform/targets.bzl               |  2 +-
 6 files changed, 55 insertions(+), 29 deletions(-)

diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt
index 1ab77f1b880..e2a94ed9685 100644
--- a/backends/qualcomm/CMakeLists.txt
+++ b/backends/qualcomm/CMakeLists.txt
@@ -183,9 +183,11 @@ target_link_libraries(
   qnn_executorch_backend PRIVATE qnn_executorch_header qnn_schema qnn_manager
                                  executorch_no_prim_ops qcir_utils extension_tensor
 )
-set_target_properties(
-  qnn_executorch_backend PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'"
-)
+if (NOT CMAKE_SYSTEM_NAME STREQUAL "Windows")
+  set_target_properties(
+    qnn_executorch_backend PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'"
+  )
+endif()
 target_link_libraries(utils PRIVATE qnn_executorch_logging)
 target_link_libraries(
   shared_buffer PRIVATE qnn_executorch_logging ${CMAKE_DL_LIBS}
diff --git a/build/Utils.cmake b/build/Utils.cmake
index 3cd8bab3d06..4cf5370ee64 100644
--- a/build/Utils.cmake
+++ b/build/Utils.cmake
@@ -195,13 +195,20 @@ function(extract_sources sources_file)
       else()
         message(FATAL_ERROR "Unsupported ANDROID_ABI setting ${ANDROID_ABI}. Please add it here!")
       endif()
+    elseif("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
+      set(fake_host_arg "--fake-host=windows")
+      if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
+        set(fake_arch_arg "--fake-arch=x8664")
+      elseif("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64" OR "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "arm64")
+        set(fake_arch_arg "--fake-arch=aarch64")
+      endif()
     endif()
 
     execute_process(
       COMMAND
         ${PYTHON_EXECUTABLE} ${executorch_root}/build/extract_sources.py
         --config=${executorch_root}/build/cmake_deps.toml --out=${sources_file}
-        --buck2=${BUCK2} ${target_platforms_arg}
+        --buck2=${BUCK2} ${target_platforms_arg} ${fake_host_arg} ${fake_arch_arg}
       OUTPUT_VARIABLE gen_srcs_output
       ERROR_VARIABLE gen_srcs_error
       RESULT_VARIABLE gen_srcs_exit_code
diff --git a/build/extract_sources.py b/build/extract_sources.py
index 5004fe0c508..1ab62f5621d 100755
--- a/build/extract_sources.py
+++ b/build/extract_sources.py
@@ -183,6 +183,12 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "--target-platforms", help="--target-platforms to pass to buck cquery, if any."
     )
+    parser.add_argument(
+        "--fake-host", help="Fake host to pass to buck cquery, if any."
+    )
+    parser.add_argument(
+        "--fake-arch", help="Fake architecture to pass to buck cquery, if any."
+    )
     return parser.parse_args()
 
 
@@ -213,6 +219,13 @@ def main():
     if args.target_platforms:
         buck_args = ["--target-platforms"]
         buck_args.append(args.target_platforms)
+    if args.fake_host:
+        buck_args.append("--fake-host")
+        buck_args.append(args.fake_host)
+    if args.fake_arch:
+        buck_args.append("--fake-arch")
+        buck_args.append(args.fake_arch)
+    print(f"buck_args: {buck_args}")
     for name, target in graph.by_name.items():
         target_to_srcs[name] = sorted(target.get_sources(graph, runner, buck_args))
 
diff --git a/extension/data_loader/file_data_loader.cpp b/extension/data_loader/file_data_loader.cpp
index 36ef549b86e..29f2b0dbde5 100644
--- a/extension/data_loader/file_data_loader.cpp
+++ b/extension/data_loader/file_data_loader.cpp
@@ -182,11 +182,7 @@ Result<FreeableBuffer> FileDataLoader::load(
     ET_UNUSED const DataLoader::SegmentInfo& segment_info) const {
   ET_CHECK_OR_RETURN_ERROR(
       // Probably had its value moved to another instance.
-#ifdef _WIN32
-      fd_ != INVALID_HANDLE_VALUE,
-#else
-      fd_ >= 0,
-#endif
+      IS_VALID_FD(fd_),
       InvalidState,
       "Uninitialized");
   ET_CHECK_OR_RETURN_ERROR(
@@ -260,11 +256,7 @@ Result<FreeableBuffer> FileDataLoader::load(
 Result<size_t> FileDataLoader::size() const {
   ET_CHECK_OR_RETURN_ERROR(
       // Probably had its value moved to another instance.
-#ifdef _WIN32
-      fd_ != INVALID_HANDLE_VALUE,
-#else
-      fd_ >= 0,
-#endif
+      IS_VALID_FD(fd_),
       InvalidState,
       "Uninitialized");
   return file_size_;
@@ -277,7 +269,7 @@ ET_NODISCARD Error FileDataLoader::load_into(
     void* buffer) const {
   ET_CHECK_OR_RETURN_ERROR(
       // Probably had its value moved to another instance.
-      fd_ >= 0,
+      IS_VALID_FD(fd_),
       InvalidState,
       "Uninitialized");
   ET_CHECK_OR_RETURN_ERROR(
@@ -295,32 +287,32 @@ ET_NODISCARD Error FileDataLoader::load_into(
   size_t needed = size;
   uint8_t* buf = reinterpret_cast<uint8_t*>(buffer);
 
-  // Make a duplicate fd if pread() is not available and we have to seek().
-  // Cannot use the standard dup() or fcntl() calls because the returned
-  // duplicate will share the underlying file record and affect the original fd
-  // when seeking on multiple threads simultaneously.
-  const auto dup_fd = ET_HAVE_PREAD ? fd_ : ::open(file_name_, O_RDONLY);
-
 #ifdef _WIN32
+
   while (needed > 0) {
     const auto chunk_size = std::min<size_t>(
         needed, static_cast<size_t>(std::numeric_limits<int32_t>::max()));
     LARGE_INTEGER move;
     move.QuadPart = static_cast<LONGLONG>(offset);
-    if (!SetFilePointerEx(file_handle, move, nullptr, FILE_BEGIN)) {
-      std::cerr << "Failed to set file pointer: " << GetLastError() << std::endl;
+    if (!SetFilePointerEx(fd_, move, nullptr, FILE_BEGIN)) {
+      ET_LOG(
+          Error,
+          "Reading from %s: failed to set file pointer: %lx",
+          file_name_,
+          GetLastError());
       return Error::AccessFailed;
     }
     DWORD nread = 0;
-    if (!ReadFile(file_handle, buf, static_cast<DWORD>(chunk_size), &nread, nullptr)) {
+    if (!ReadFile(fd_, buf, static_cast<DWORD>(chunk_size), &nread, nullptr)) {
       DWORD error_code = GetLastError();
       if (error_code == ERROR_IO_PENDING) {
         continue;
       }
       ET_LOG(
           Error,
-          "Reading from %s: failed to read %zu bytes at offset %zu: %#x",
-          file_name,
+          "Reading from %s: failed to read %zu bytes at offset %zu: %lx",
+          file_name_,
+          chunk_size,
           offset,
           error_code);
       return Error::AccessFailed;
@@ -330,7 +322,7 @@ ET_NODISCARD Error FileDataLoader::load_into(
       ET_LOG(
           Error,
           "Reading from %s: EOF encountered unexpectedly at offset %zu",
-          file_name,
+          file_name_,
           offset);
       return Error::AccessFailed;
     }
@@ -339,7 +331,15 @@ ET_NODISCARD Error FileDataLoader::load_into(
     buf += nread;
     offset += nread;
   }
+
 #else
+
+  // Make a duplicate fd if pread() is not available and we have to seek().
+  // Cannot use the standard dup() or fcntl() calls because the returned
+  // duplicate will share the underlying file record and affect the original fd
+  // when seeking on multiple threads simultaneously.
+  const auto dup_fd = ET_HAVE_PREAD ? fd_ : ::open(file_name_, O_RDONLY);
+
   while (needed > 0) {
     // Reads on macOS will fail with EINVAL if size > INT32_MAX.
     const auto chunk_size = std::min<size_t>(
@@ -378,7 +378,9 @@ ET_NODISCARD Error FileDataLoader::load_into(
   if (!ET_HAVE_PREAD) {
     ::close(dup_fd);
   }
+
 #endif
+
   return Error::Ok;
 }
 
diff --git a/extension/data_loader/file_data_loader.h b/extension/data_loader/file_data_loader.h
index 27babd79edc..c18e055eca2 100644
--- a/extension/data_loader/file_data_loader.h
+++ b/extension/data_loader/file_data_loader.h
@@ -12,9 +12,11 @@
 #include <windows.h>
 #define FD_TYPE HANDLE
 #define INVALID_FD INVALID_HANDLE_VALUE
+#define IS_VALID_FD(fd) (fd != INVALID_HANDLE_VALUE)
 #else
 #define FD_TYPE int
 #define INVALID_FD -1
+#define IS_VALID_FD(fd) (fd >= 0)
 #endif
 
 #include <cstddef>
@@ -68,7 +70,7 @@ class FileDataLoader final : public executorch::runtime::DataLoader {
     const_cast<const char*&>(rhs.file_name_) = nullptr;
     const_cast<size_t&>(rhs.file_size_) = 0;
     const_cast<size_t&>(rhs.alignment_) = 0;
-    const_cast<int&>(rhs.fd_) = INVALID_FD;
+    const_cast<FD_TYPE&>(rhs.fd_) = INVALID_FD;
   }
 
   ~FileDataLoader() override;
diff --git a/runtime/platform/targets.bzl b/runtime/platform/targets.bzl
index db4ac57c04f..56ce4b3aeff 100644
--- a/runtime/platform/targets.bzl
+++ b/runtime/platform/targets.bzl
@@ -49,7 +49,7 @@ def define_common_targets():
         srcs = _select_pal({
             "minimal": ["default/minimal.cpp"],
             "posix": ["default/posix.cpp"],
-            "windows": ["target/Windows.cpp"],
+            "windows": ["default/Windows.cpp"],
         }),
         deps = [
             ":pal_interface",

From 5b87e5006d89e487f5cd5450e822ec52d59a526a Mon Sep 17 00:00:00 2001
From: Hans <hans.chen@bricks.tools>
Date: Tue, 24 Sep 2024 01:53:10 +0800
Subject: [PATCH 21/25] Fix shared segment for MinGW

---
 runtime/backend/interface.cpp        |  9 ++-------
 runtime/kernel/operator_registry.cpp | 13 ++++---------
 runtime/platform/compiler.h          | 12 ++++++++++++
 runtime/platform/default/Windows.cpp | 11 ++++-------
 4 files changed, 22 insertions(+), 23 deletions(-)

diff --git a/runtime/backend/interface.cpp b/runtime/backend/interface.cpp
index 1266ad173a6..dd54ea82693 100644
--- a/runtime/backend/interface.cpp
+++ b/runtime/backend/interface.cpp
@@ -19,24 +19,19 @@ namespace {
 // The max number of backends that can be registered globally.
 constexpr size_t kMaxRegisteredBackends = 16;
 
-#ifdef _WIN32
-#pragma data_seg(".shared")
-#endif
+#pragma data_seg(".SS_DLLMAIN")
 
 // TODO(T128866626): Remove global static variables. We want to be able to run
 // multiple Executor instances and having a global registration isn't a viable
 // solution in the long term.
 
 /// Global table of registered backends.
-Backend registered_backends[kMaxRegisteredBackends];
+Backend registered_backends[kMaxRegisteredBackends] ET_SHARED;
 
 /// The number of backends registered in the table.
 size_t num_registered_backends = 0;
 
-#ifdef _WIN32
 #pragma data_seg()
-#pragma comment(linker, "/SECTION:.shared,RWS")
-#endif
 
 } // namespace
 
diff --git a/runtime/kernel/operator_registry.cpp b/runtime/kernel/operator_registry.cpp
index 586a8ef9970..9ad3f48be2d 100644
--- a/runtime/kernel/operator_registry.cpp
+++ b/runtime/kernel/operator_registry.cpp
@@ -29,9 +29,7 @@ constexpr uint32_t kMaxKernelsPerOp = 8;
 constexpr uint32_t kMaxRegisteredKernels = kMaxOperators * kMaxKernelsPerOp;
 #endif
 
-#ifdef _WIN32
-#pragma data_seg(".shared")
-#endif
+#pragma data_seg(".SS_DLLMAIN")
 
 // Data that backs the kernel table. Since Kernel has a custom default
 // constructor (implicitly, because it contains KernelKey, which has a custom
@@ -41,18 +39,15 @@ constexpr uint32_t kMaxRegisteredKernels = kMaxOperators * kMaxKernelsPerOp;
 // and point the table at it.
 // @lint-ignore CLANGTIDY facebook-hte-CArray
 alignas(sizeof(Kernel)) uint8_t
-    registered_kernels_data[kMaxRegisteredKernels * sizeof(Kernel)];
+    registered_kernels_data[kMaxRegisteredKernels * sizeof(Kernel)] ET_SHARED;
 
 /// Global table of registered kernels.
-Kernel* registered_kernels = reinterpret_cast<Kernel*>(registered_kernels_data);
+Kernel* registered_kernels ET_SHARED = reinterpret_cast<Kernel*>(registered_kernels_data);
 
 /// The number of kernels registered in the table.
-size_t num_registered_kernels = 0;
+size_t num_registered_kernels ET_SHARED = 0;
 
-#ifdef _WIN32
 #pragma data_seg()
-#pragma comment(linker, "/SECTION:.shared,RWS")
-#endif
 
 // Registers the kernels, but may return an error.
 Error register_kernels_internal(const Span<const Kernel> kernels) {
diff --git a/runtime/platform/compiler.h b/runtime/platform/compiler.h
index b6f7fc8642f..2120e585b69 100644
--- a/runtime/platform/compiler.h
+++ b/runtime/platform/compiler.h
@@ -162,6 +162,18 @@
 using ssize_t = ptrdiff_t;
 #endif
 
+// Shared variable section
+#ifdef _WIN32
+#ifdef __MINGW32__
+#define ET_SHARED __attribute__((section(".shr"), shared))
+#else
+#define ET_SHARED
+#endif
+#else
+#define ET_SHARED
+#endif
+
+
 // DEPRECATED: Use the non-underscore-prefixed versions instead.
 // TODO(T199005537): Remove these once all users have stopped using them.
 #define __ET_DEPRECATED ET_DEPRECATED
diff --git a/runtime/platform/default/Windows.cpp b/runtime/platform/default/Windows.cpp
index 81f020aea51..eb140b19cb6 100644
--- a/runtime/platform/default/Windows.cpp
+++ b/runtime/platform/default/Windows.cpp
@@ -33,8 +33,6 @@
 
 #include <executorch/runtime/platform/compiler.h>
 
-#define SHARED_MEMORY_NAME "torch_executor_platform_init_time"
-
 // The FILE* to write logs to.
 #define ET_LOG_OUTPUT_FILE stderr
 
@@ -68,16 +66,15 @@
 
 #endif // NDEBUG
 
-#pragma data_seg(".shared") // Start of shared data segment
+#pragma data_seg(".SS_DLLMAIN")
 
 /// Start time of the system (used to zero the system timestamp).
-static std::chrono::time_point<std::chrono::steady_clock> systemStartTime;
+static std::chrono::time_point<std::chrono::steady_clock> systemStartTime ET_SHARED;
 
 /// Flag set to true if the PAL has been successfully initialized.
-static bool initialized = false;
+static bool initialized ET_SHARED = false;
 
-#pragma data_seg() // End of shared data segment
-#pragma comment(linker, "/SECTION:.shared,RWS") // Make the shared data segment read-write-shared
+#pragma data_seg()
 
 /**
  * Initialize the platform abstraction layer.

From 25d77516938dff31f519d915513b7196f924a1d5 Mon Sep 17 00:00:00 2001
From: Hans <hans.chen@bricks.tools>
Date: Tue, 24 Sep 2024 02:03:40 +0800
Subject: [PATCH 22/25] Merge `runtime/platform/default/Windows.cpp` into
 `runtime/platform/default/posix.cpp`

---
 runtime/platform/default/Windows.cpp | 179 ---------------------------
 runtime/platform/default/posix.cpp   |   8 +-
 runtime/platform/targets.bzl         |   8 +-
 3 files changed, 7 insertions(+), 188 deletions(-)
 delete mode 100644 runtime/platform/default/Windows.cpp

diff --git a/runtime/platform/default/Windows.cpp b/runtime/platform/default/Windows.cpp
deleted file mode 100644
index eb140b19cb6..00000000000
--- a/runtime/platform/default/Windows.cpp
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-/**
- * @file
- * Fallback PAL implementations for POSIX-compatible systems.
- *
- * Note that this assumes that the platform defines the symbols used in this
- * file (like fprintf()), because this file will still be built even if the
- * functions are later overridden. When building for a platform that does not
- * provide the necessary symbols, clients can use Minimal.cpp instead, but they
- * will need to override all of the functions.
- */
-
-// This cpp file will provide weak implementations of the symbols declared in
-// Platform.h. Client users can strongly define any or all of the functions to
-// override them.
-#define ET_INTERNAL_PLATFORM_WEAKNESS __ET_WEAK
-#include <executorch/runtime/platform/platform.h>
-
-#include <windows.h>
-#include <tchar.h>
-
-#include <chrono>
-#include <cinttypes>
-#include <cstdio>
-#include <cstdlib>
-
-#include <executorch/runtime/platform/compiler.h>
-
-// The FILE* to write logs to.
-#define ET_LOG_OUTPUT_FILE stderr
-
-/**
- * On debug builds, ensure that `et_pal_init` has been called before
- * other PAL functions which depend on initialization.
- */
-#ifdef NDEBUG
-
-/**
- * Assert that the PAL has been initialized.
- */
-#define _ASSERT_PAL_INITIALIZED() ((void)0)
-
-#else // NDEBUG
-
-/**
- * Assert that the PAL has been initialized.
- */
-#define _ASSERT_PAL_INITIALIZED()                                   \
-  ({                                                                \
-    if (!initialized) {                                             \
-      fprintf(                                                      \
-          ET_LOG_OUTPUT_FILE,                                       \
-          "ExecuTorch PAL must be initialized before call to %s()", \
-          __ET_FUNCTION);                                           \
-      fflush(ET_LOG_OUTPUT_FILE);                                   \
-      et_pal_abort();                                               \
-    }                                                               \
-  })
-
-#endif // NDEBUG
-
-#pragma data_seg(".SS_DLLMAIN")
-
-/// Start time of the system (used to zero the system timestamp).
-static std::chrono::time_point<std::chrono::steady_clock> systemStartTime ET_SHARED;
-
-/// Flag set to true if the PAL has been successfully initialized.
-static bool initialized ET_SHARED = false;
-
-#pragma data_seg()
-
-/**
- * Initialize the platform abstraction layer.
- *
- * This function should be called before any other function provided by the PAL
- * to initialize any global state. Typically overridden by PAL implementer.
- */
-void et_pal_init(void) {
-  if (initialized) {
-    return;
-  }
-
-  systemStartTime = std::chrono::steady_clock::now();
-  initialized = true;
-}
-
-/**
- * Immediately abort execution, setting the device into an error state, if
- * available.
- */
-__ET_NORETURN void et_pal_abort(void) {
-  std::abort();
-}
-
-/**
- * Return a monotonically non-decreasing timestamp in system ticks.
- *
- * @retval Timestamp value in system ticks.
- */
-et_timestamp_t et_pal_current_ticks(void) {
-  _ASSERT_PAL_INITIALIZED();
-  auto systemCurrentTime = std::chrono::steady_clock::now();
-  return std::chrono::duration_cast<std::chrono::nanoseconds>(
-             systemCurrentTime - systemStartTime)
-      .count();
-}
-
-/**
- * Return the conversion rate from system ticks to nanoseconds, as a fraction.
- * To convert an interval from system ticks to nanoseconds, multiply the tick
- * count by the numerator and then divide by the denominator:
- *   nanoseconds = ticks * numerator / denominator
- *
- * @retval The ratio of nanoseconds to system ticks.
- */
-et_tick_ratio_t et_pal_ticks_to_ns_multiplier(void) {
-  // The system tick interval is 1 nanosecond, so the conversion factor is 1.
-  return {1, 1};
-}
-
-/**
- * Emit a log message via platform output (serial port, console, etc).
- *
- * @param[in] timestamp Timestamp of the log event in system ticks since boot.
- * @param[in] level Severity level of the message. Must be a printable 7-bit
- *     ASCII uppercase letter.
- * @param[in] filename Name of the file that created the log event.
- * @param[in] function Name of the function that created the log event.
- * @param[in] line Line in the source file where the log event was created.
- * @param[in] message Message string to log.
- * @param[in] length Message string length.
- */
-void et_pal_emit_log_message(
-    et_timestamp_t timestamp,
-    et_pal_log_level_t level,
-    const char* filename,
-    __ET_UNUSED const char* function,
-    size_t line,
-    const char* message,
-    __ET_UNUSED size_t length) {
-  _ASSERT_PAL_INITIALIZED();
-
-  // Not all platforms have ticks == nanoseconds, but this one does.
-  timestamp /= 1000; // To microseconds
-  unsigned long int us = timestamp % 1000000;
-  timestamp /= 1000000; // To seconds
-  unsigned int sec = timestamp % 60;
-  timestamp /= 60; // To minutes
-  unsigned int min = timestamp % 60;
-  timestamp /= 60; // To hours
-  unsigned int hour = timestamp;
-
-  // Use a format similar to glog and folly::logging, except:
-  // - Print time since et_pal_init since we don't have wall time
-  // - Don't include the thread ID, to avoid adding a threading dependency
-  // - Add the string "executorch:" to make the logs more searchable
-  //
-  // Clients who want to change the format or add other fields can override this
-  // weak implementation of et_pal_emit_log_message.
-  fprintf(
-      ET_LOG_OUTPUT_FILE,
-      "%c %02u:%02u:%02u.%06lu executorch:%s:%zu] %s\n",
-      level,
-      hour,
-      min,
-      sec,
-      us,
-      filename,
-      line,
-      message);
-  fflush(ET_LOG_OUTPUT_FILE);
-}
diff --git a/runtime/platform/default/posix.cpp b/runtime/platform/default/posix.cpp
index aba504f53e0..f808b717ab7 100644
--- a/runtime/platform/default/posix.cpp
+++ b/runtime/platform/default/posix.cpp
@@ -63,11 +63,15 @@
 
 #endif // NDEBUG
 
+#pragma data_seg(".SS_DLLMAIN") // Shared data segment for DLL main with MSVC
+
 /// Start time of the system (used to zero the system timestamp).
-static std::chrono::time_point<std::chrono::steady_clock> systemStartTime;
+static std::chrono::time_point<std::chrono::steady_clock> systemStartTime ET_SHARED;
 
 /// Flag set to true if the PAL has been successfully initialized.
-static bool initialized = false;
+static bool initialized ET_SHARED = false;
+
+#pragma data_seg()
 
 /**
  * Initialize the platform abstraction layer.
diff --git a/runtime/platform/targets.bzl b/runtime/platform/targets.bzl
index 56ce4b3aeff..6d3141a4219 100644
--- a/runtime/platform/targets.bzl
+++ b/runtime/platform/targets.bzl
@@ -6,12 +6,7 @@ def _select_pal(dict_):
     `executorch.pal_default` build config value. Fails if no corresponding entry
     exists.
     """
-    # buck2 check target platform
-    # check config//os:
-    if host_info().os.is_windows:
-        pal_default = native.read_config("executorch", "pal_default", "windows")
-    else:
-        pal_default = native.read_config("executorch", "pal_default", "posix")
+    pal_default = native.read_config("executorch", "pal_default", "posix")
     if not pal_default in dict_:
         fail("Missing key for executorch.pal_default value '{}' in dict '{}'".format(pal_default, dict_))
     return dict_[pal_default]
@@ -49,7 +44,6 @@ def define_common_targets():
         srcs = _select_pal({
             "minimal": ["default/minimal.cpp"],
             "posix": ["default/posix.cpp"],
-            "windows": ["default/Windows.cpp"],
         }),
         deps = [
             ":pal_interface",

From 304734d944884cc5b7fee2634a23090d4dd9a00b Mon Sep 17 00:00:00 2001
From: Hans <hans.chen@bricks.tools>
Date: Fri, 27 Sep 2024 15:50:48 +0800
Subject: [PATCH 23/25] Support cross-compile

---
 build/Utils.cmake | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/build/Utils.cmake b/build/Utils.cmake
index 4cf5370ee64..9228789e658 100644
--- a/build/Utils.cmake
+++ b/build/Utils.cmake
@@ -195,12 +195,17 @@ function(extract_sources sources_file)
       else()
         message(FATAL_ERROR "Unsupported ANDROID_ABI setting ${ANDROID_ABI}. Please add it here!")
       endif()
-    elseif("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
-      set(fake_host_arg "--fake-host=windows")
-      if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
-        set(fake_arch_arg "--fake-arch=x8664")
-      elseif("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64" OR "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "arm64")
-        set(fake_arch_arg "--fake-arch=aarch64")
+    else()
+      if(NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "${CMAKE_HOST_SYSTEM_NAME}")
+        string(TOLOWER "${CMAKE_SYSTEM_NAME}" lowercase_system_name)
+        set(fake_host_arg "--fake-host=${lowercase_system_name}")
+      endif()
+      if (NOT "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "${CMAKE_HOST_SYSTEM_PROCESSOR}")
+        if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
+          set(fake_arch_arg "--fake-arch=x8664")
+        else()
+          set(fake_arch_arg "--fake-arch=aarch64")
+        endif()
       endif()
     endif()
 

From 93695c4f4e42dbec14e7608ddb7ee3b7342afe89 Mon Sep 17 00:00:00 2001
From: Hans <hans.chen@bricks.tools>
Date: Sat, 28 Sep 2024 21:14:10 +0800
Subject: [PATCH 24/25] Support cross-compile for macOS

---
 build/Utils.cmake | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/build/Utils.cmake b/build/Utils.cmake
index 9228789e658..d3e458e381e 100644
--- a/build/Utils.cmake
+++ b/build/Utils.cmake
@@ -207,6 +207,11 @@ function(extract_sources sources_file)
           set(fake_arch_arg "--fake-arch=aarch64")
         endif()
       endif()
+      if ("${CMAKE_OSX_ARCHITECTURES}" STREQUAL "arm64")
+        set(fake_arch_arg "--fake-arch=aarch64")
+      elseif("${CMAKE_OSX_ARCHITECTURES}" STREQUAL "x86_64")
+        set(fake_arch_arg "--fake-arch=x8664")
+      endif()
     endif()
 
     execute_process(

From 2cbb25266d60a3f388062ec1841a59eaabcea0af Mon Sep 17 00:00:00 2001
From: Hans <hans.chen@bricks.tools>
Date: Sat, 28 Sep 2024 21:18:12 +0800
Subject: [PATCH 25/25] Correct arg value when cross compile macOS

---
 build/Utils.cmake | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/build/Utils.cmake b/build/Utils.cmake
index d3e458e381e..620ba27114e 100644
--- a/build/Utils.cmake
+++ b/build/Utils.cmake
@@ -197,8 +197,12 @@ function(extract_sources sources_file)
       endif()
     else()
       if(NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "${CMAKE_HOST_SYSTEM_NAME}")
-        string(TOLOWER "${CMAKE_SYSTEM_NAME}" lowercase_system_name)
-        set(fake_host_arg "--fake-host=${lowercase_system_name}")
+        if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin")
+          set(fake_host_arg "--fake-host=macos")
+        else()
+          string(TOLOWER "${CMAKE_SYSTEM_NAME}" lowercase_system_name)
+          set(fake_host_arg "--fake-host=${lowercase_system_name}")
+        endif()
       endif()
       if (NOT "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "${CMAKE_HOST_SYSTEM_PROCESSOR}")
         if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")