-
Notifications
You must be signed in to change notification settings - Fork 2.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Snippets][CPU] Added external repacking via BrgemmCopyB #28179
base: master
Are you sure you want to change the base?
Changes from 3 commits
2b536b7
b665659
72bf13a
4b33eaa
f1c7435
b6ffdaf
cced16d
fea453e
ed31224
9bb9646
e1951cc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,12 @@ | |
#pragma once | ||
|
||
#include "emitters/snippets/jit_snippets_call_args.hpp" | ||
|
||
#ifndef OPENVINO_ARCH_ARM64 | ||
# include "emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp" | ||
#endif | ||
|
||
#include "cache/multi_cache.h" | ||
#include "memory_desc/cpu_blocked_memory_desc.h" | ||
#include "snippets/lowered/port_descriptor.hpp" | ||
#include "snippets/runtime_configurator.hpp" | ||
|
@@ -21,27 +27,59 @@ class CPURuntimeConfig : public ov::snippets::RuntimeConfig { | |
std::string to_string() const override; | ||
#endif | ||
|
||
#ifndef OPENVINO_ARCH_ARM64 | ||
// Per-input record for repacking: holds a blocked memory descriptor, a BrgemmCopyB kernel executor, | ||
// and two offset vectors (input side and output side). | ||
// NOTE(review): the exact meaning/units of the offsets are not visible in this header - confirm against the users. | ||
struct RepackedInput { | ||
// Default state: null descriptor, null executor, empty offset vectors (see the in-class initializers below). | ||
RepackedInput() = default; | ||
// Takes every argument by value and moves it into the corresponding member. | ||
RepackedInput(CpuBlockedMemoryDescPtr desc_, | ||
std::shared_ptr<BrgemmCopyBKernelExecutor> executor_, | ||
VectorDims in_offsets_, | ||
VectorDims out_offsets_) | ||
: desc(std::move(desc_)), | ||
executor(std::move(executor_)), | ||
in_offsets(std::move(in_offsets_)), | ||
out_offsets(std::move(out_offsets_)) {} | ||
|
||
// Blocked memory descriptor; presumably describes the repacked data layout - TODO confirm. | ||
CpuBlockedMemoryDescPtr desc{nullptr}; | ||
// Shared handle to the BrgemmCopyB kernel executor associated with this input. | ||
std::shared_ptr<BrgemmCopyBKernelExecutor> executor{nullptr}; | ||
// Offsets on the input side of the repacking. | ||
VectorDims in_offsets{}; | ||
// Offsets on the output side of the repacking. | ||
VectorDims out_offsets{}; | ||
}; | ||
std::unordered_map<size_t, RepackedInput> repacked_inputs = {}; | ||
|
||
// Selects how repacking outside of the kernel is carried out. | ||
enum class RepackingImplType { | ||
NONE, // no kernel-outside repacking | ||
IN_PARALLEL, // should be executed in parallel_nt by each thread | ||
SEPARATE, // should be executed separately from the kernel | ||
}; | ||
RepackingImplType repacking_impl_type = RepackingImplType::NONE; | ||
#endif // OPENVINO_ARCH_ARM64 | ||
|
||
std::vector<jit_snippets_call_args::loop_args_t> loop_args = {}; | ||
std::unordered_map<size_t, CpuBlockedMemoryDescPtr> m_in_requested_descs = {}; | ||
}; | ||
|
||
class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { | ||
public: | ||
CPURuntimeConfigurator(); | ||
CPURuntimeConfigurator(ov::intel_cpu::MultiCacheWeakPtr cache = {}); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Minor: do we need a default argument here? Wouldn't it be safer to force the user to always provide a cache pointer? |
||
|
||
/** | ||
* @brief Calculate Loop parameters of Loop emitters and update these values in CPURuntimeConfig | ||
* @param linear_ir LinearIR | ||
*/ | ||
void update_loop_args(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const; | ||
|
||
// Read-only accessor: returns a const reference to the compiled_kernel_cache member (a MultiCacheWeakPtr). | ||
const ov::intel_cpu::MultiCacheWeakPtr& get_cache() const { | ||
return compiled_kernel_cache; | ||
} | ||
Comment on lines
+72
to
+74
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do I understand correctly that this cache argument is needed solely for |
||
|
||
protected: | ||
void update(const ov::snippets::lowered::LinearIRCPtr& linear_ir) override; | ||
void update_tensor_rank(const ov::snippets::VectorDims& master_shape) const override; | ||
void init_tensor_rank(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const override; | ||
void initialization(const ov::snippets::lowered::LinearIRCPtr& linear_ir) override; | ||
|
||
static const size_t rank6D; | ||
|
||
ov::intel_cpu::MultiCacheWeakPtr compiled_kernel_cache; | ||
}; | ||
|
||
} // namespace intel_cpu | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe we can move this class to a separate file? In that case, we could move all the ifdefs there and avoid them in the configurator's code.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's one option. Alternatively, we can create a base class for repacking kernels, so RepackedInput could be reused as-is on ARM. I think it would allow us to create a better interface.
After all, the main idea of Snippets is scalability (including different architectures), so we should focus on developing a scalable pipeline, and these architecture-specific ifdefs indicate that we couldn't do it for some reason.