Skip to content
51 changes: 29 additions & 22 deletions ggml/src/ggml-hexagon/ggml-hexagon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2333,7 +2333,7 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
}

// Init hexagon tensor from GGML tensor and Hexagon buffer
static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) {
static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t, bool is_src) {
h->data = 0; // updated by the receiver
h->type = t->type;
h->ne[0] = t->ne[0];
Expand All @@ -2344,6 +2344,13 @@ static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) {
h->nb[1] = t->nb[1];
h->nb[2] = t->nb[2];
h->nb[3] = t->nb[3];

if (is_src) {
h->div21 = init_fastdiv_values(h->ne[2] * h->ne[1]);
h->div3 = init_fastdiv_values(h->ne[3]);
h->div2 = init_fastdiv_values(h->ne[2]);
h->div1 = init_fastdiv_values(h->ne[1]);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We’re computing the fast-div parameters in init_htp_tensor, which runs for every DSP-dispatched op—even though many ops don’t need broadcasting.

Could we initialize them in ggml_backend_hexagon_buffer_init_tensor and store them per tensor to avoid recomputation? The catch is we don’t currently have a per-tensor custom context, so we’d need to add one...

}
}

static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer * d) {
Expand Down Expand Up @@ -2372,9 +2379,9 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
req.op = HTP_OP_MUL_MAT;
req.flags = flags;

init_htp_tensor(&req.src0, src0);
init_htp_tensor(&req.src1, src1);
init_htp_tensor(&req.dst, dst);
init_htp_tensor(&req.src0, src0, true);
init_htp_tensor(&req.src1, src1, true);
init_htp_tensor(&req.dst, dst, false);

// Use opmask to override flags
if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
Expand Down Expand Up @@ -2476,10 +2483,10 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
req.op = HTP_OP_MUL_MAT_ID;
req.flags = flags;

init_htp_tensor(&req.src0, src0);
init_htp_tensor(&req.src1, src1);
init_htp_tensor(&req.src2, src2);
init_htp_tensor(&req.dst, dst);
init_htp_tensor(&req.src0, src0, true);
init_htp_tensor(&req.src1, src1, true);
init_htp_tensor(&req.src2, src2, true);
init_htp_tensor(&req.dst, dst, false);

// Use opmask to override flags
if (!(opt_opmask & HTP_OPMASK_QUANTIZE)) {
Expand Down Expand Up @@ -2616,9 +2623,9 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
GGML_ABORT("ggml-hex: binary : unsupported op:%d\n", node->op);
}

init_htp_tensor(&req.src0, src0);
init_htp_tensor(&req.src1, src1);
init_htp_tensor(&req.dst, dst);
init_htp_tensor(&req.src0, src0, true);
init_htp_tensor(&req.src1, src1, true);
init_htp_tensor(&req.dst, dst, false);

dspqueue_buffer bufs[3];
memset(bufs, 0, sizeof(bufs));
Expand Down Expand Up @@ -2735,10 +2742,10 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
GGML_ABORT("ggml-hex: unsupported op:%d\n", node->op);
}

init_htp_tensor(&req.src0, src0);
init_htp_tensor(&req.src1, src1);
init_htp_tensor(&req.src2, src2);
init_htp_tensor(&req.dst, dst);
init_htp_tensor(&req.src0, src0, true);
init_htp_tensor(&req.src1, src1, true);
init_htp_tensor(&req.src2, src2, true);
init_htp_tensor(&req.dst, dst, false);

dspqueue_buffer bufs[4];
memset(bufs, 0, sizeof(bufs));
Expand Down Expand Up @@ -2871,10 +2878,10 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
GGML_ABORT("ggml-hex: unary : unsupported op:%d\n", op->op);
}

init_htp_tensor(&req.dst, dst);
init_htp_tensor(&req.src0, src0);
init_htp_tensor(&req.dst, dst, false);
init_htp_tensor(&req.src0, src0, true);
if (src1) {
init_htp_tensor(&req.src1, src1);
init_htp_tensor(&req.src1, src1, true);
}

// Use opmask to override flags
Expand Down Expand Up @@ -3007,11 +3014,11 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
req.flags = flags;
req.op = HTP_OP_ROPE;

init_htp_tensor(&req.dst, dst);
init_htp_tensor(&req.src0, src0);
init_htp_tensor(&req.src1, src1);
init_htp_tensor(&req.dst, dst, false);
init_htp_tensor(&req.src0, src0, true);
init_htp_tensor(&req.src1, src1, true);
if (src2) {
init_htp_tensor(&req.src2, src2);
init_htp_tensor(&req.src2, src2, true);
}

// Use opmask to override flags
Expand Down
12 changes: 10 additions & 2 deletions ggml/src/ggml-hexagon/htp/binary-ops.c
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,18 @@ static void binary_job_f32_per_thread(const struct htp_tensor * src0,
uint8_t * restrict dst_ptr = (uint8_t *) dst->data + (src0_start_row * dst_row_size);

const uint8_t * restrict data_src1 = (const uint8_t *) src1->data;
const uint8_t * restrict src1_ptr = NULL;

const uint32_t ne0201 = ne02 * ne01;
for (uint32_t ir = src0_start_row; ir < src0_end_row; ir++) {
src1_ptr = data_src1 + (ir % src1_nrows) * src1_row_size;
const uint32_t i03 = fastdiv(ir, &src0->div21);
const uint32_t i02 = fastdiv(ir - i03 * ne0201, &src0->div1);
const uint32_t i01 = (ir - i03 * ne0201 - i02 * ne01);

const uint32_t i13 = fastmodulo(i03, ne13, &src1->div3);
const uint32_t i12 = fastmodulo(i02, ne12, &src1->div2);
const uint32_t i11 = fastmodulo(i01, ne11, &src1->div1);

const uint8_t * restrict src1_ptr = data_src1 + i13 * nb13 + i12 * nb12 + i11 * src1_row_size;

if (ir + 1 < src0_end_row) {
htp_l2fetch(src0_ptr + ne00, 1, src0_row_size, src0_row_size);
Expand Down
35 changes: 31 additions & 4 deletions ggml/src/ggml-hexagon/htp/htp-msg.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,28 @@ static const char * htp_type_name(uint32_t t) {
return 0;
}

// Fast unsigned division by an invariant divisor, after
// T. Granlund & P. Montgomery, "Division by Invariant Integers using
// Multiplication" (PLDI '94), figure 4.1: https://gmplib.org/~tege/divcnst-pldi94.pdf
//
// Precompute mp (m' in the paper) and a shift count l so the quotient
// can be formed with one widening multiply and one shift:
//
//     n / d == (mulhi(n, mp) + n) >> l
//
// where mulhi() is the high 32 bits of the 32x32 -> 64 bit product.
struct fastdiv_values {
    uint32_t mp; // magic multiplier m'
    uint32_t l;  // shift count, ceil(log2(d))
};

// Build the fastdiv parameters for divisor d.
// NOTE(review): d must be non-zero — d == 0 divides by zero below.
static inline struct fastdiv_values init_fastdiv_values(uint32_t d) {
    // l = ceil(log2(d)), capped at 32
    uint32_t l = 0;
    while (l < 32 && ((uint32_t) 1 << l) < d) {
        l++;
    }

    // mp = floor(2^32 * (2^l - d) / d) + 1
    const uint64_t two_pow_l = (uint64_t) 1 << l;
    const uint32_t mp        = (uint32_t) ((((uint64_t) 1 << 32) * (two_pow_l - d)) / d + 1);

    struct fastdiv_values result = { mp, l };
    return result;
}

// Internal types
#define QK_Q4_0x4x2 256 // 4x Q4_0 blocks packed with next 4x Q4_0 blocks (size in bytes 128)
#define QK_Q8_0x4x2 256 // 4x Q8_0 blocks concat with next 4x Q8_0 blocks
Expand All @@ -119,10 +141,15 @@ static const char * htp_type_name(uint32_t t) {
#define HTP_MAX_DIMS 4

// Tensor descriptor passed between the host and the NSP.
// NOTE: the pasted diff showed both the pre- and post-change versions of the
// first four members; only one copy may exist or the struct will not compile.
struct htp_tensor {
    uint32_t data;             // Buffer offset in the messages, and data pointer on the NSP
    uint32_t type;             // Data type
    uint32_t ne[HTP_MAX_DIMS]; // Number of elements
    uint32_t nb[HTP_MAX_DIMS]; // Stride in bytes (see ggml.h ggml_tensor)

    // Precomputed on the host (see init_htp_tensor) for fast broadcast
    // index math on the DSP; only filled in for source tensors.
    struct fastdiv_values div21; // fastdiv values for ne2 * ne1
    struct fastdiv_values div3;  // fastdiv values for ne3
    struct fastdiv_values div2;  // fastdiv values for ne2
    struct fastdiv_values div1;  // fastdiv values for ne1
};

#define HTP_MAX_OP_PARAMS 64
Expand Down
11 changes: 11 additions & 0 deletions ggml/src/ggml-hexagon/htp/ops-utils.h
Original file line number Diff line number Diff line change
// Round n up to the nearest multiple of m (m must be non-zero).
static inline uint32_t htp_round_up(uint32_t n, uint32_t m) {
    return m * ((n + m - 1) / m);
}

// Divide n by the divisor encoded in vals (see init_fastdiv_values in
// htp-msg.h): n/d == (mulhi(n, mp) + n) >> l.
static inline uint32_t fastdiv(uint32_t n, const struct fastdiv_values * vals) {
    const uint64_t prod  = (uint64_t) n * vals->mp;
    const uint32_t mulhi = (uint32_t) (prod >> 32); // high 32 bits of n * mp
    return (mulhi + n) >> vals->l;
}

// Compute n % d, where vals was built by init_fastdiv_values(d).
static inline uint32_t fastmodulo(uint32_t n, uint32_t d, const struct fastdiv_values * vals) {
    const uint32_t quotient = fastdiv(n, vals);
    return n - quotient * d;
}

static inline void htp_l2fetch(const void * p, uint32_t height, uint32_t width, uint32_t stride) {
const uint64_t control = Q6_P_combine_RR(stride, Q6_R_combine_RlRl(width, height));
asm volatile(" l2fetch(%0,%1) " : : "r"(p), "r"(control));
Expand Down
Loading