10 changes: 5 additions & 5 deletions src/common/util/include/openvino/util/mmap_object.hpp
@@ -12,6 +12,7 @@
 #include <fstream>
 #include <memory>
 #include <string>
+#include <variant>
 
 namespace ov {
 
@@ -28,14 +29,13 @@ class MappedMemory {
 };
 
 /**
- * @brief Returns mapped memory for a file from provided path.
- * Instead of reading files, we can map the memory via mmap for Linux
- * in order to avoid time-consuming reading and reduce memory consumption.
+ * @brief Returns mapped memory for a file using a path or a file descriptor.
+ * Accepts either a std::string (file path) or an int (file descriptor).
  *
- * @param path Path to a file which memory will be mmaped.
+ * @param path_or_fd std::variant containing either std::string (path) or int (fd).
  * @return MappedMemory shared ptr object which keeps the mmapped memory and controls its lifetime.
  */
-std::shared_ptr<ov::MappedMemory> load_mmap_object(const std::string& path);
+std::shared_ptr<ov::MappedMemory> load_mmap_object(const std::variant<std::string, int>& path_or_fd);
 
 #ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT
 
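Reviewer note: a minimal usage sketch of the new signature (the fd branch is implemented on Linux only; the Windows build throws, see win_mmap_object.cpp below). The file name is hypothetical and error handling is elided; the mapping appears to adopt the descriptor via HandleHolder, so the caller should not close it afterwards.

#include <fcntl.h>  // open, O_RDONLY (POSIX)

#include <memory>
#include <string>

#include "openvino/util/mmap_object.hpp"

int main() {
    // Path-based mapping: the variant holds a std::string, as before.
    std::shared_ptr<ov::MappedMemory> by_path = ov::load_mmap_object(std::string("weights.bin"));

    // Descriptor-based mapping (Linux): the variant holds an int.
    int fd = open("weights.bin", O_RDONLY);
    std::shared_ptr<ov::MappedMemory> by_fd = ov::load_mmap_object(fd);  // mapping adopts fd

    // Both overload paths expose the mapped region through data()/size().
    return by_path->size() == by_fd->size() ? 0 : 1;
}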
33 changes: 23 additions & 10 deletions src/common/util/src/os/lin/lin_mmap_object.cpp
@@ -64,22 +64,28 @@ class MapHolder : public MappedMemory {
     MapHolder() = default;
 
     void set(const std::string& path) {
-        int prot = PROT_READ;
         int mode = O_RDONLY;
-        struct stat sb = {};
-        m_handle = HandleHolder(open(path.c_str(), mode));
-        if (m_handle.get() == -1) {
+        int fd = open(path.c_str(), mode);
+        if (fd == -1) {
             throw std::runtime_error("Can not open file " + path +
                                      " for mapping. Ensure that file exists and has appropriate permissions");
         }
-        if (fstat(m_handle.get(), &sb) == -1) {
-            throw std::runtime_error("Can not get file size for " + path);
+        set_from_fd(fd);
+    }
+
+    void set_from_fd(const int fd) {
+        int prot = PROT_READ;
+        struct stat sb = {};
+        m_handle = HandleHolder(fd);
+        if (fstat(fd, &sb) == -1) {
+            throw std::runtime_error("Can not get file size for fd=" + std::to_string(fd));
         }
         m_size = sb.st_size;
         if (m_size > 0) {
-            m_data = mmap(nullptr, m_size, prot, MAP_PRIVATE, m_handle.get(), 0);
+            m_data = mmap(nullptr, m_size, prot, MAP_PRIVATE, fd, 0);
             if (m_data == MAP_FAILED) {
-                throw std::runtime_error("Can not create file mapping for " + path + ", err=" + std::strerror(errno));
+                throw std::runtime_error("Can not create file mapping for fd=" + std::to_string(fd) +
+                                         ", err=" + std::strerror(errno));
             }
         } else {
             m_data = MAP_FAILED;
@@ -101,9 +107,16 @@
     }
 };
 
-std::shared_ptr<ov::MappedMemory> load_mmap_object(const std::string& path) {
+std::shared_ptr<ov::MappedMemory> load_mmap_object(const std::variant<std::string, int>& path_or_fd) {
     auto holder = std::make_shared<MapHolder>();
-    holder->set(path);
+    std::visit([&holder](auto&& arg) {
+        using T = std::decay_t<decltype(arg)>;
+        if constexpr (std::is_same_v<T, int>) {
+            holder->set_from_fd(arg);  // fd
+        } else if constexpr (std::is_same_v<T, std::string>) {
+            holder->set(arg);  // path
+        }
+    }, path_or_fd);
     return holder;
 }
 
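One caller-facing consequence worth flagging: set_from_fd() hands the descriptor straight to HandleHolder, so the mapping owns it from that point on (assuming HandleHolder keeps its close-on-destruction behavior). A caller that needs to keep using its own fd can pass a duplicate, sketched below with a hypothetical helper name:

#include <unistd.h>  // dup

#include <memory>
#include <stdexcept>

#include "openvino/util/mmap_object.hpp"

// Hypothetical caller-side helper: map the file while keeping `fd` usable afterwards.
std::shared_ptr<ov::MappedMemory> map_keeping_fd(int fd) {
    int owned = dup(fd);  // give the mapping its own descriptor to adopt
    if (owned == -1) {
        throw std::runtime_error("dup failed");
    }
    return ov::load_mmap_object(owned);  // the caller's original fd stays open
}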
15 changes: 11 additions & 4 deletions src/common/util/src/os/win/win_mmap_object.cpp
@@ -128,10 +128,17 @@ class MapHolder : public ov::MappedMemory {
     HandleHolder m_mapping;
 };
 
-std::shared_ptr<ov::MappedMemory> load_mmap_object(const std::string& path) {
-    auto holder = std::make_shared<MapHolder>();
-    holder->set(path);
-    return holder;
+std::shared_ptr<ov::MappedMemory> load_mmap_object(const std::variant<std::string, int>& path_or_fd) {
+    return std::visit([](auto&& arg) -> std::shared_ptr<ov::MappedMemory> {
+        using T = std::decay_t<decltype(arg)>;
+        if constexpr (std::is_same_v<T, int>) {
+            OPENVINO_THROW("File descriptor-based memory mapping is not supported on Windows. Use path-based load_mmap_object instead.");
+        } else if constexpr (std::is_same_v<T, std::string>) {
+            auto holder = std::make_shared<MapHolder>();
+            holder->set(arg);
+            return holder;
+        }
+    }, path_or_fd);
 }
 
 #ifdef OPENVINO_ENABLE_UNICODE_PATH_SUPPORT
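Since the Windows build rejects the fd alternative only at runtime, portable callers need a fallback. A probing sketch, assuming OPENVINO_THROW raises ov::Exception as it does elsewhere in the codebase:

#include <iostream>
#include <memory>
#include <string>

#include "openvino/core/except.hpp"
#include "openvino/util/mmap_object.hpp"

// Sketch: try fd-based mapping, fall back to the path on platforms without support.
std::shared_ptr<ov::MappedMemory> map_fd_or_path(int fd, const std::string& path) {
    try {
        return ov::load_mmap_object(fd);
    } catch (const ov::Exception& e) {
        std::cerr << "fd mapping unavailable (" << e.what() << "), using path\n";
        return ov::load_mmap_object(path);
    }
}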
35 changes: 34 additions & 1 deletion src/inference/include/openvino/runtime/properties.hpp
@@ -1337,12 +1337,45 @@ static constexpr Property<int32_t, PropertyMutability::RW> compilation_num_threads
  */
 static constexpr Property<std::vector<std::string>, PropertyMutability::RO> execution_devices{"EXECUTION_DEVICES"};
 
+/**
+ * @brief Structure to represent a weights path with an optional file accessor function
+ * @ingroup ov_runtime_cpp_prop_api
+ */
+struct WeightsPath {
+    WeightsPath() = default;
+
+    WeightsPath(const std::string& path_) : path{path_}, file_accessor{} {}
+
+    template <typename Func>
+    WeightsPath(const std::string& path_, Func&& file_accessor_)
+        : path{path_}, file_accessor{std::forward<Func>(file_accessor_)} {}
+
+    operator std::string() const {
+        return path;
+    }
+
+    std::string path;
+    std::function<Any(const std::string&)> file_accessor;
+};
+
+/** @cond INTERNAL */
+inline std::ostream& operator<<(std::ostream& os, const WeightsPath& weights_path_val) {
+    return os << weights_path_val.path;
+}
+
+inline std::istream& operator>>(std::istream& is, WeightsPath& weights_path_val) {
+    is >> weights_path_val.path;
+    return is;
+}
+/** @endcond */
+
 /**
  * @brief Path to the file with model's weights.
  *
  * @note This property is used for weightless caching. Only used when the ov::CacheMode property is set to "OPTIMIZE_SIZE".
  * @ingroup ov_runtime_cpp_prop_api
  */
-static constexpr Property<std::string, PropertyMutability::RW> weights_path{"WEIGHTS_PATH"};
+static constexpr Property<WeightsPath, PropertyMutability::RW> weights_path{"WEIGHTS_PATH"};
 
 /**
  * @brief The precision of key cache compression
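A usage sketch of the extended property, matching how the NPUW import path below consumes it: the accessor receives the stored path and returns an ov::Any that the plugin reads back via as<int>(), i.e. an open file descriptor (Linux-only; file names hypothetical, error handling elided). Note also that operator>> above restores only the whitespace-delimited path token; a file_accessor cannot be round-tripped through text, so it survives only while the property value is passed around as a live WeightsPath object.

#include <fcntl.h>  // open, O_RDONLY (POSIX)

#include <fstream>
#include <string>

#include "openvino/runtime/core.hpp"

int main() {
    ov::Core core;
    std::ifstream blob("model.blob", std::ios::binary);  // hypothetical cached blob

    // Path plus accessor: the plugin asks the accessor for a descriptor instead
    // of opening the path itself (useful for sandboxed callers that can hand
    // out fds but not readable paths).
    ov::WeightsPath wp("weights.bin", [](const std::string& p) -> ov::Any {
        return open(p.c_str(), O_RDONLY);  // consumed by the plugin via Any::as<int>()
    });

    auto compiled = core.import_model(blob, "NPU", {ov::weights_path(wp)});
    return 0;
}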
@@ -31,7 +31,7 @@ OV_CONFIG_RELEASE_OPTION(ov::hint, dynamic_quantization_group_size, 0, "Dynamic
 OV_CONFIG_RELEASE_OPTION(ov::intel_gpu::hint, dynamic_quantization_group_size_max, UINT64_MAX, "Maximum dynamic quantization group size. When group_size is set as a higher value than this number, dynamic quantization will be turned off")
 OV_CONFIG_RELEASE_OPTION(ov::hint, kv_cache_precision, ov::element::dynamic, "")
 OV_CONFIG_RELEASE_OPTION(ov::intel_gpu::hint, enable_kernels_reuse, false, "")
-OV_CONFIG_RELEASE_OPTION(ov, weights_path, "", "Path to the model weights file used for weightless caching")
+OV_CONFIG_RELEASE_OPTION(ov, weights_path, ov::WeightsPath(), "Path to the model weights file used for weightless caching")
 OV_CONFIG_RELEASE_OPTION(ov::hint, activations_scale_factor, -1.0f, "Scalar floating point value that is used for runtime activation tensor scaling with fp16 inference precision")
 OV_CONFIG_RELEASE_OPTION(ov::internal, enable_lp_transformations, false, "Enable/Disable Low precision transformations set")
 OV_CONFIG_RELEASE_OPTION(ov::intel_gpu, config_file, "", "Path to custom layers config file")
21 changes: 15 additions & 6 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -1137,14 +1137,14 @@ std::shared_ptr<ov::npuw::CompiledModel> ov::npuw::CompiledModel::deserialize(
     read(stream, compiled->m_bf16_consts);
 
     // Initialize weights stream if weightless flow
-    std::string weights_path;
+    ov::WeightsPath weights_path_prop;
     std::shared_ptr<ov::Model> model_ptr;
     // Cache model's constants
     WeightsContext::ConstsCache consts_cache;
     if (is_weightless) {
         if (properties.find(ov::weights_path.name()) != properties.end()) {
-            weights_path = properties.at(ov::weights_path.name()).as<std::string>();
-            NPUW_ASSERT(!weights_path.empty() &&
+            weights_path_prop = properties.at(ov::weights_path.name()).as<ov::WeightsPath>();
+            NPUW_ASSERT(!weights_path_prop.path.empty() &&
                         "Empty weights_path. Please provide WEIGHTS_PATH or MODEL_PTR in the configuration.");
         } else if (properties.find(ov::hint::model.name()) != properties.end()) {
             model_ptr = std::const_pointer_cast<ov::Model>(
@@ -1180,8 +1180,16 @@
 
     ov::npuw::s11n::WeightsPtr weights = nullptr;
     if (is_weightless) {
-        if (!weights_path.empty()) {
-            auto mapped_memory = ov::load_mmap_object(weights_path);
+        if (!weights_path_prop.path.empty()) {
+            std::shared_ptr<ov::MappedMemory> mapped_memory;
+            // Use file_accessor if available to get fd, otherwise use path
+            if (weights_path_prop.file_accessor) {
+                auto result = weights_path_prop.file_accessor(weights_path_prop.path);
+                int fd = result.as<int>();
+                mapped_memory = ov::load_mmap_object(fd);
+            } else {
+                mapped_memory = ov::load_mmap_object(weights_path_prop.path);
+            }
             weights = std::make_shared<ov::npuw::s11n::Weights>(mapped_memory->data(),
                                                                 mapped_memory->size(),
                                                                 mapped_memory);
@@ -1191,7 +1199,8 @@
     // FIXME: prolong lifetime of ov::Model for import with MODEL_PTR.
     // Unclear why it's needed, but without saving consts_cache until bank evaluation,
     // the memory is freed somewhere.
-    compiled->m_import_weights_ctx = WeightsContext(weights, weights_path, consts_cache, compiled->m_bf16_consts);
+    compiled->m_import_weights_ctx =
+        WeightsContext(weights, weights_path_prop, consts_cache, compiled->m_bf16_consts);
 
     // Deserialize compiled submodels
     std::size_t subm_size = 0;
16 changes: 12 additions & 4 deletions src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.cpp
@@ -61,10 +61,18 @@ ov::Tensor Const::eval() const {
     }
 
     // Weightless import case. Mmap CPU weight on demand to avoid allocating all weights at once.
-    if (!m_weights_path.empty()) {
+    if (!m_weights_path.path.empty()) {
         NPUW_ASSERT(!m_read_from_bin &&
                     "Trying to read weight from weights file, but the weight has been already deserialized!");
-        auto mapped_memory = ov::load_mmap_object(m_weights_path);
+        std::shared_ptr<ov::MappedMemory> mapped_memory;
+        // Use file_accessor if available to get fd, otherwise use path
+        if (m_weights_path.file_accessor) {
+            auto result = m_weights_path.file_accessor(m_weights_path.path);
+            int fd = result.as<int>();
+            mapped_memory = ov::load_mmap_object(fd);
+        } else {
+            mapped_memory = ov::load_mmap_object(m_weights_path.path);
+        }
         m_mmaped_weights =
             std::make_shared<ov::npuw::s11n::Weights>(mapped_memory->data(), mapped_memory->size(), mapped_memory);
         return ov::Tensor(m_cached_type, m_cached_shape, m_mmaped_weights->get_ptr(m_offset));
@@ -80,7 +88,7 @@ LazyTensor::Meta Const::eval_meta() const {
     }
 
     // Weightless import case
-    if (!m_weights_path.empty()) {
+    if (!m_weights_path.path.empty()) {
         return {m_cached_shape, m_cached_type};
     }
 
@@ -115,7 +123,7 @@ void Const::read_weight(const ov::npuw::s11n::WeightsContext& ctx) {
     // It doesn't introduce extra allocation, and it allows reading mmapped CPU weights
     // gradually, one by one, allocating them on device without loading all the weights first.
     // Thus the memory consumption during import is greatly reduced, at a slight cost in performance.
-    NPUW_ASSERT(!ctx.weights_path.empty());
+    NPUW_ASSERT(!ctx.weights_path.path.empty());
     // Just save weights_path for eval() to call the actual mmap.
     m_weights_path = ctx.weights_path;
 }
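The accessor-or-path block above now appears verbatim in both compiled_model.cpp and lazy_tensor.cpp; a small shared helper (hypothetical name and location) would keep the two call sites from drifting:

#include <memory>

#include "openvino/runtime/properties.hpp"
#include "openvino/util/mmap_object.hpp"

// Hypothetical shared helper: mmap via the accessor-provided fd when one is
// set, otherwise fall back to the stored path (same logic as both call sites).
inline std::shared_ptr<ov::MappedMemory> mmap_weights(const ov::WeightsPath& wp) {
    if (wp.file_accessor) {
        const int fd = wp.file_accessor(wp.path).as<int>();
        return ov::load_mmap_object(fd);
    }
    return ov::load_mmap_object(wp.path);
}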
2 changes: 1 addition & 1 deletion src/plugins/intel_npu/src/plugin/npuw/lazy_tensor.hpp
@@ -107,7 +107,7 @@ class Const {
     std::size_t m_offset = 0;
     std::size_t m_byte_size = 0;
     ov::Tensor m_read_from_bin;
-    std::string m_weights_path;
+    ov::WeightsPath m_weights_path;
     mutable ov::npuw::s11n::WeightsPtr m_mmaped_weights = nullptr;
     // FIXME: special case when a new Constant was added into the model,
     // then made into LazyTensor during folding. We need to keep a copy of it,
2 changes: 1 addition & 1 deletion src/plugins/intel_npu/src/plugin/npuw/serialization.cpp
@@ -26,7 +26,7 @@ ov::npuw::s11n::WeightsContext::WeightsContext(bool _is_weightless,
 
 // NOTE: This constructor can and should only be used when importing blobs
 ov::npuw::s11n::WeightsContext::WeightsContext(const ov::npuw::s11n::WeightsPtr& _weights,
-                                               const std::string& _weights_path,
+                                               const ov::WeightsPath& _weights_path,
                                                const s11n::WeightsContext::ConstsCache& _consts_cache,
                                                const BF16Cache& _bf16_consts)
     : weights(_weights),
6 changes: 4 additions & 2 deletions src/plugins/intel_npu/src/plugin/npuw/serialization.hpp
@@ -17,6 +17,8 @@
 #include <unordered_set>
 #include <vector>
 
+#include "openvino/runtime/properties.hpp"
+
 namespace ov {
 namespace npuw {
 namespace s11n {
@@ -124,7 +126,7 @@ struct WeightsContext {
 
     // NOTE: This constructor can and should only be used when importing weightless blobs
     WeightsContext(const ov::npuw::s11n::WeightsPtr& _weights,
-                   const std::string& _weights_path,
+                   const ov::WeightsPath& _weights_path,
                    const ConstsCache& _consts_cache,
                    const BF16Cache& _bf16_consts);
 
@@ -138,7 +140,7 @@
     bool is_weightless = true;
     std::unordered_map<const void*, std::size_t> const_to_offset;
    ov::npuw::s11n::WeightsPtr weights = nullptr;
-    std::string weights_path;
+    ov::WeightsPath weights_path;
     ConstsCache consts_cache;
     BF16Cache bf16_consts;
 };