@@ -183,16 +183,14 @@ DEF_ISEL(PUNPCKLDQ_XMMdq_XMMq) = PUNPCKLDQ<V128W, V128, V128>;
183183
// Instruction-selection bindings for PUNPCKLQDQ: each DEF_ISEL ties an
// instruction-form name to an instantiation of the PUNPCKLQDQ semantic
// template with the given destination / source operand types.
184184DEF_ISEL (PUNPCKLQDQ_XMMdq_MEMdq) = PUNPCKLQDQ<V128W, V128, MV128>;
185185DEF_ISEL (PUNPCKLQDQ_XMMdq_XMMq) = PUNPCKLQDQ<V128W, V128, V128>;
// The VEX-encoded (V-prefixed) XMM form writes through VV128W rather than
// V128W, matching the other V-prefixed XMM bindings in this file (e.g. the
// VPSUBD_XMMdq_* and VPSUBQ_XMMdq_* forms). VEX-encoded 128-bit instructions
// zero the upper bits of the full vector register; VV128W is presumably the
// write type that models this -- confirm against the operand type definitions.
186+ IF_AVX (DEF_ISEL(VPUNPCKLQDQ_XMMdq_XMMdq_XMMdq) = PUNPCKLQDQ<VV128W, V128, V128>;)
186187
187188// Adding new MMX Instructions
188189namespace {
189190
// PADDB semantic, templated on the destination (D) and the two source
// operand types (S1, S2). It reads both sources with UReadV8, combines them
// with UAddV8 (presumably a per-element add of unsigned 8-bit elements --
// confirm against the helper's definition), stores the result to dst with
// UWriteV8, and returns memory.
//
// In this diff view the lines marked `-` are the previous four-statement
// spelling and the line marked `+` is its single-expression replacement;
// both perform the same read / add / write sequence.
190191template <typename D, typename S1, typename S2>
191192DEF_SEM (PADDB, D dst, S1 src1, S2 src2) {
192- auto lhs_vec = UReadV8 (src1);
193- auto rhs_vec = UReadV8 (src2);
194- auto dst_vec = UAddV8 (lhs_vec, rhs_vec);
195- UWriteV8 (dst, dst_vec);
193+ UWriteV8 (dst, UAddV8 (UReadV8 (src1), UReadV8 (src2)));
196194 return memory;
197195}
198196
@@ -230,6 +228,8 @@ DEF_ISEL(PADDB_MMXq_MEMq) = PADDB<V64W, V64, MV64>;
// Instruction-selection bindings for PADDB and PADDW. Each DEF_ISEL ties an
// instruction-form name to an instantiation of the matching semantic template
// with the destination / source operand types for that form.
230228DEF_ISEL (PADDB_XMMdq_XMMdq) = PADDB<V128W, V128, V128>;
231229DEF_ISEL (PADDB_XMMdq_MEMdq) = PADDB<V128W, V128, MV128>;
232230
// Three-operand YMM register form; reuses the PADDB template with 256-bit
// operand types. IF_AVX presumably emits the binding only in AVX-enabled
// builds -- confirm against the macro definition.
231+ IF_AVX (DEF_ISEL(VPADDB_YMMqq_YMMqq_YMMqq) = PADDB<VV256W, VV256, VV256>;)
232+
233233DEF_ISEL (PADDW_MMXq_MMXq) = PADDW<V64W, V64, V64>;
234234DEF_ISEL (PADDW_MMXq_MEMq) = PADDW<V64W, V64, MV64>;
235235DEF_ISEL (PADDW_XMMdq_XMMdq) = PADDW<V128W, V128, V128>;
@@ -240,6 +240,8 @@ DEF_ISEL(PADDD_MMXq_MEMq) = PADDD<V64W, V64, MV64>;
// Instruction-selection bindings for PADDD and PADDQ. Each DEF_ISEL ties an
// instruction-form name to an instantiation of the matching semantic template
// with the destination / source operand types for that form.
240240DEF_ISEL (PADDD_XMMdq_XMMdq) = PADDD<V128W, V128, V128>;
241241DEF_ISEL (PADDD_XMMdq_MEMdq) = PADDD<V128W, V128, MV128>;
242242
// Three-operand YMM register form; reuses the PADDD template with 256-bit
// operand types.
243+ IF_AVX (DEF_ISEL(VPADDD_YMMqq_YMMqq_YMMqq) = PADDD<VV256W, VV256, VV256>;)
244+
243245DEF_ISEL (PADDQ_MMXq_MMXq) = PADDQ<V64W, V64, V64>;
244246DEF_ISEL (PADDQ_MMXq_MEMq) = PADDQ<V64W, V64, MV64>;
245247DEF_ISEL (PADDQ_XMMdq_XMMdq) = PADDQ<V128W, V128, V128>;
@@ -438,6 +440,8 @@ DEF_ISEL(PHADDD_MMXq_MEMq) = PHADDD<V64W, V64, MV64>;
// Instruction-selection bindings for PHADDD: XMM register and memory source
// forms, followed by the three-operand YMM register form.
438440DEF_ISEL (PHADDD_XMMdq_XMMdq) = PHADDD<V128W, V128, V128>;
439441DEF_ISEL (PHADDD_XMMdq_MEMdq) = PHADDD<V128W, V128, MV128>;
440442
// NOTE(review): the YMM form reuses the same PHADDD template as the XMM
// forms; confirm the template's element pairing is what the 256-bit
// instruction requires.
443+ IF_AVX (DEF_ISEL(VPHADDD_YMMqq_YMMqq_YMMqq) = PHADDD<VV256W, VV256, VV256>;)
444+
441445template <typename D, typename S1, typename S2>
442446DEF_SEM (PHADDSW, D dst, S1 src1, S2 src2) {
443447 auto src1_vec = SReadV16 (src1);
@@ -530,6 +534,7 @@ DEF_ISEL(PSUBB_MMXq_MMXq) = PSUBB<V64W, V64, V64>;
// Instruction-selection bindings for PSUBB and PSUBW. Each DEF_ISEL ties an
// instruction-form name to an instantiation of the matching semantic template
// with the destination / source operand types for that form.
530534DEF_ISEL (PSUBB_MMXq_MEMq) = PSUBB<V64W, V64, MV64>;
531535DEF_ISEL (PSUBB_XMMdq_XMMdq) = PSUBB<V128W, V128, V128>;
532536DEF_ISEL (PSUBB_XMMdq_MEMdq) = PSUBB<V128W, V128, MV128>;
// Three-operand YMM register form; reuses the PSUBB template with 256-bit
// operand types.
537+ IF_AVX (DEF_ISEL(VPSUBB_YMMqq_YMMqq_YMMqq) = PSUBB<VV256W, VV256, VV256>;)
533538
534539DEF_ISEL (PSUBW_MMXq_MMXq) = PSUBW<V64W, V64, V64>;
535540DEF_ISEL (PSUBW_MMXq_MEMq) = PSUBW<V64W, V64, MV64>;
@@ -542,13 +547,15 @@ DEF_ISEL(PSUBD_XMMdq_XMMdq) = PSUBD<V128W, V128, V128>;
// Instruction-selection bindings for PSUBD and PSUBQ. The unprefixed forms
// write through V64W / V128W; the V-prefixed XMM forms write through VV128W
// and the V-prefixed YMM forms through VV256W.
542547DEF_ISEL (PSUBD_XMMdq_MEMdq) = PSUBD<V128W, V128, MV128>;
543548IF_AVX (DEF_ISEL(VPSUBD_XMMdq_XMMdq_MEMdq) = PSUBD<VV128W, V128, MV128>;)
544549IF_AVX (DEF_ISEL(VPSUBD_XMMdq_XMMdq_XMMdq) = PSUBD<VV128W, V128, V128>;)
// Three-operand YMM register form of VPSUBD.
550+ IF_AVX (DEF_ISEL(VPSUBD_YMMqq_YMMqq_YMMqq) = PSUBD<VV256W, VV256, VV256>;)
545551
546552DEF_ISEL (PSUBQ_MMXq_MMXq) = PSUBQ<V64W, V64, V64>;
547553DEF_ISEL (PSUBQ_MMXq_MEMq) = PSUBQ<V64W, V64, MV64>;
548554DEF_ISEL (PSUBQ_XMMdq_XMMdq) = PSUBQ<V128W, V128, V128>;
549555DEF_ISEL (PSUBQ_XMMdq_MEMdq) = PSUBQ<V128W, V128, MV128>;
550556IF_AVX (DEF_ISEL(VPSUBQ_XMMdq_XMMdq_MEMdq) = PSUBQ<VV128W, V128, MV128>;)
551557IF_AVX (DEF_ISEL(VPSUBQ_XMMdq_XMMdq_XMMdq) = PSUBQ<VV128W, V128, V128>;)
// Three-operand YMM register form of VPSUBQ.
558+ IF_AVX (DEF_ISEL(VPSUBQ_YMMqq_YMMqq_YMMqq) = PSUBQ<VV256W, VV256, VV256>;)
552559
553560/*
5545613305 VPSUBD VPSUBD_YMMqq_YMMqq_MEMqq AVX2 AVX2 AVX2 ATTRIBUTES:
@@ -1760,6 +1767,23 @@ DEF_SEM(PMULUDQ, D dst, S1 src1, S2 src2) {
17601767 return memory;
17611768}
17621769
// PMULLD semantic, templated on the destination (D) and the two source
// operand types (S1, S2).
//
// Both sources are read with SReadV32 and the working result starts as a
// cleared copy of dst (SClearV32 over SReadV32(dst)). For every element index
// of src1_vec, the two elements are extracted, each is passed through SExt,
// the results are multiplied with SMul, and the product is passed through
// Trunc before being inserted at the same index of dst_vec. The assembled
// vector is then written to dst with SWriteV32 and memory is returned.
//
// NOTE(review): presumably SExt widens each 32-bit element so the multiply
// cannot overflow, and Trunc keeps the low 32 bits of the product -- confirm
// against the definitions of SExt / Trunc.
1770+ template <typename D, typename S1, typename S2>
1771+ DEF_SEM (PMULLD, D dst, S1 src1, S2 src2) {
1772+ auto src1_vec = SReadV32 (src1);
1773+ auto src2_vec = SReadV32 (src2);
1774+ auto dst_vec = SClearV32 (SReadV32 (dst));
1775+
// The element count comes from src1_vec; src2_vec is indexed over the same
// range.
1776+ auto vec_count = NumVectorElems (src1_vec);
// The _Pragma asks the compiler to unroll the per-element loop.
1777+ _Pragma (" unroll" ) for (size_t i = 0 ; i < vec_count; i++) {
1778+ auto v1 = SExtractV32 (src1_vec, i);
1779+ auto v2 = SExtractV32 (src2_vec, i);
1780+ auto mul = SMul (SExt (v1), SExt (v2));
1781+ dst_vec = SInsertV32 (dst_vec, i, Trunc (mul));
1782+ }
1783+ SWriteV32 (dst, dst_vec);
1784+ return memory;
1785+ }
1786+
17631787template <typename D, typename S1, typename S2>
17641788DEF_SEM (PMULLW, D dst, S1 src1, S2 src2) {
17651789 auto src1_vec = SReadV16 (src1);
@@ -1817,11 +1841,15 @@ DEF_ISEL(PMULUDQ_MMXq_MMXq) = PMULUDQ<V64W, V64, V64>;
// Instruction-selection bindings for the packed-multiply semantics
// (PMULUDQ, PMULLD, PMULLW, PMULHW). Each DEF_ISEL ties an instruction-form
// name to an instantiation of the matching semantic template with the
// destination / source operand types for that form.
18171841DEF_ISEL (PMULUDQ_MMXq_MEMq) = PMULUDQ<V64W, V64, MV64>;
18181842DEF_ISEL (PMULUDQ_XMMdq_XMMdq) = PMULUDQ<V128W, V128, V128>;
18191843DEF_ISEL (PMULUDQ_XMMdq_MEMdq) = PMULUDQ<V128W, V128, MV128>;
// Three-operand YMM register form of VPMULUDQ.
1844+ IF_AVX (DEF_ISEL(VPMULUDQ_YMMqq_YMMqq_YMMqq) = PMULUDQ<VV256W, VV256, VV256>;)
1845+
// VPMULLD is bound only in its three-operand YMM register form here; it uses
// the PMULLD semantic template.
1846+ IF_AVX (DEF_ISEL(VPMULLD_YMMqq_YMMqq_YMMqq) = PMULLD<VV256W, VV256, VV256>;)
18201847
18211848DEF_ISEL (PMULLW_MMXq_MMXq) = PMULLW<V64W, V64, V64>;
18221849DEF_ISEL (PMULLW_MMXq_MEMq) = PMULLW<V64W, V64, MV64>;
18231850DEF_ISEL (PMULLW_XMMdq_XMMdq) = PMULLW<V128W, V128, V128>;
18241851DEF_ISEL (PMULLW_XMMdq_MEMdq) = PMULLW<V128W, V128, MV128>;
// Three-operand YMM register form of VPMULLW.
1852+ IF_AVX (DEF_ISEL(VPMULLW_YMMqq_YMMqq_YMMqq) = PMULLW<VV256W, VV256, VV256>;)
18251853
18261854DEF_ISEL (PMULHW_MMXq_MMXq) = PMULHW<V64W, V64, V64>;
18271855DEF_ISEL (PMULHW_MMXq_MEMq) = PMULHW<V64W, V64, MV64>;
0 commit comments