From 724506b0e9609c801abdefe36fe1f8a0198d3d18 Mon Sep 17 00:00:00 2001 From: Anton Kuklin Date: Sat, 5 Aug 2023 18:18:56 +0100 Subject: [PATCH 1/6] cmd/asm: add x86 AMX instructions Added instructions LDTILECFG, STTILECFG, TDPBF16PS, TDPBSSD, TDPBSUD, TDPBUSD, TDPBUUD, TILELOADD, TILELOADDT1, TILESTORED Fixes: #61079 --- src/cmd/internal/obj/x86/aenum.go | 10 ++++++ src/cmd/internal/obj/x86/anames.go | 10 ++++++ src/cmd/internal/obj/x86/asm6.go | 5 +-- src/cmd/internal/obj/x86/avx_optabs.go | 46 ++++++++++++++++++++++++++ 4 files changed, 67 insertions(+), 4 deletions(-) diff --git a/src/cmd/internal/obj/x86/aenum.go b/src/cmd/internal/obj/x86/aenum.go index 79cdd241a236a3..399b456d700a9f 100644 --- a/src/cmd/internal/obj/x86/aenum.go +++ b/src/cmd/internal/obj/x86/aenum.go @@ -1607,4 +1607,14 @@ const ( AXSETBV AXTEST ALAST + ALDTILECFG + ASTTILECFG + ATDPBF16PS + ATDPBSSD + ATDPBSUD + ATDPBUSD + ATDPBUUD + ATILELOADD + ATILELOADDT1 + ATILESTORED ) diff --git a/src/cmd/internal/obj/x86/anames.go b/src/cmd/internal/obj/x86/anames.go index 3966381e50d221..36948b88ed8c27 100644 --- a/src/cmd/internal/obj/x86/anames.go +++ b/src/cmd/internal/obj/x86/anames.go @@ -1605,4 +1605,14 @@ var Anames = []string{ "XSETBV", "XTEST", "LAST", + "LDTILECFG", + "STTILECFG", + "TDPBF16PS", + "TDPBSSD", + "TDPBSUD", + "TDPBUSD", + "TDPBUUD", + "TILELOADD", + "TILELOADDT1", + "TILESTORED", } diff --git a/src/cmd/internal/obj/x86/asm6.go b/src/cmd/internal/obj/x86/asm6.go index 718da6a8a2caf6..3186cf5ac2aa94 100644 --- a/src/cmd/internal/obj/x86/asm6.go +++ b/src/cmd/internal/obj/x86/asm6.go @@ -192,6 +192,7 @@ const ( Ytextsize Yindir Ymax + Ytr ) const ( @@ -315,10 +316,6 @@ const ( // The P, L, and W fields are chosen to match // their eventual locations in the VEX prefix bytes. - // Encoding for VEX prefix in tables. - // The P, L, and W fields are chosen to match - // their eventual locations in the VEX prefix bytes. 
- // Using spare bit to make leading [E]VEX encoding byte different from // 0x0f even if all other VEX fields are 0. avxEscape = 1 << 6 diff --git a/src/cmd/internal/obj/x86/avx_optabs.go b/src/cmd/internal/obj/x86/avx_optabs.go index b8ff4699d1548c..1eea7db2351551 100644 --- a/src/cmd/internal/obj/x86/avx_optabs.go +++ b/src/cmd/internal/obj/x86/avx_optabs.go @@ -928,6 +928,22 @@ var _yvzeroall = []ytab{ {zcase: Zvex, zoffset: 2, args: argList{}}, } +var _yldtilecfg = []ytab{ + {zcase: Zvex_rm_v_ro, zoffset: 3, args: argList{Ym}}, +} + +var _ytdpbf16ps = []ytab{ + {zcase: Zvex_v_rm_r, zoffset: 2, args: argList{Ytr, Ytr, Ytr}}, +} + +var _ytileloadd = []ytab{ + {zcase: Zvex_rm_v_r, zoffset: 2, args: argList{Ym, Ytr}}, +} + +var _ytilestored = []ytab{ + {zcase: Zvex_r_v_rm, zoffset: 2, args: argList{Ytr, Ym}}, +} + var avxOptab = [...]Optab{ {as: AANDNL, ytab: _yandnl, prefix: Pavx, op: opBytes{ avxEscape | vex128 | vex0F38 | vexW0, 0xF2, @@ -4625,4 +4641,34 @@ var avxOptab = [...]Optab{ {as: AVZEROUPPER, ytab: _yvzeroall, prefix: Pavx, op: opBytes{ avxEscape | vex128 | vex0F | vexW0, 0x77, }}, + {as: ALDTILECFG, ytab: _yldtilecfg, prefix: Pavx, op: opBytes{ + avxEscape | vex128 | vex0F38 | vexW0, 0x49, 00, + }}, + {as: ASTTILECFG, ytab: _yldtilecfg, prefix: Pavx, op: opBytes{ + avxEscape | vex128 | vex66 | vex0F38 | vexW0, 0x49, 00, + }}, + {as: ATDPBF16PS, ytab: _ytdpbf16ps, prefix: Pavx, op: opBytes{ + avxEscape | vex128 | vexF3 | vex0F38 | vexW0, 0x5C, + }}, + {as: ATDPBSSD, ytab: _ytdpbf16ps, prefix: Pavx, op: opBytes{ + avxEscape | vex128 | vexF2 | vex0F38 | vexW0, 0x5E, + }}, + {as: ATDPBSUD, ytab: _ytdpbf16ps, prefix: Pavx, op: opBytes{ + avxEscape | vex128 | vexF3 | vex0F38 | vexW0, 0x5E, + }}, + {as: ATDPBUSD, ytab: _ytdpbf16ps, prefix: Pavx, op: opBytes{ + avxEscape | vex128 | vex66 | vex0F38 | vexW0, 0x5E, + }}, + {as: ATDPBUUD, ytab: _ytdpbf16ps, prefix: Pavx, op: opBytes{ + avxEscape | vex128 | vex0F38 | vexW0, 0x5E, + }}, + {as: ATILELOADD, ytab: 
_ytileloadd, prefix: Pavx, op: opBytes{ + avxEscape | vex128 | vexF2 | vex0F38 | vexW0, 0x4B, + }}, + {as: ATILELOADDT1, ytab: _ytileloadd, prefix: Pavx, op: opBytes{ + avxEscape | vex128 | vex66 | vex0F38 | vexW0, 0x4B, + }}, + {as: ATILESTORED, ytab: _ytilestored, prefix: Pavx, op: opBytes{ + avxEscape | vex128 | vexF3 | vex0F38 | vexW0, 0x4B, + }}, } From b9d92a6ababea0062f1c5eb66cfd92791ca0a2ec Mon Sep 17 00:00:00 2001 From: Anton Kuklin Date: Mon, 7 Aug 2023 19:02:02 +0100 Subject: [PATCH 2/6] added HasAMX flag --- src/internal/cpu/cpu.go | 1 + src/internal/cpu/cpu_x86.go | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/internal/cpu/cpu.go b/src/internal/cpu/cpu.go index 1352810f42ddfa..85be3e01b2ad0f 100644 --- a/src/internal/cpu/cpu.go +++ b/src/internal/cpu/cpu.go @@ -29,6 +29,7 @@ var X86 struct { HasADX bool HasAVX bool HasAVX2 bool + HasAMX bool HasBMI1 bool HasBMI2 bool HasERMS bool diff --git a/src/internal/cpu/cpu_x86.go b/src/internal/cpu/cpu_x86.go index 96b8ef92b560df..c307b1bf5f7622 100644 --- a/src/internal/cpu/cpu_x86.go +++ b/src/internal/cpu/cpu_x86.go @@ -41,6 +41,9 @@ const ( cpuid_ADX = 1 << 19 cpuid_SHA = 1 << 29 + // edx bits + cpuid_AMX = 1 << 24 + // edx bits for CPUID 0x80000001 cpuid_RDTSCP = 1 << 27 ) @@ -73,6 +76,7 @@ func doinit() { options = append(options, option{Name: "avx", Feature: &X86.HasAVX}, option{Name: "avx2", Feature: &X86.HasAVX2}, + option{Name: "amx", Feature: &X86.HasAMX}, option{Name: "bmi1", Feature: &X86.HasBMI1}, option{Name: "bmi2", Feature: &X86.HasBMI2}, option{Name: "fma", Feature: &X86.HasFMA}) @@ -121,14 +125,14 @@ func doinit() { return } - _, ebx7, _, _ := cpuid(7, 0) + _, ebx7, _, edx7 := cpuid(7, 0) X86.HasBMI1 = isSet(ebx7, cpuid_BMI1) X86.HasAVX2 = isSet(ebx7, cpuid_AVX2) && osSupportsAVX + X86.HasAMX = isSet(edx7, cpuid_AMX) && osSupportsAVX X86.HasBMI2 = isSet(ebx7, cpuid_BMI2) X86.HasERMS = isSet(ebx7, cpuid_ERMS) X86.HasADX = isSet(ebx7, cpuid_ADX) X86.HasSHA = 
isSet(ebx7, cpuid_SHA) - var maxExtendedInformation uint32 maxExtendedInformation, _, _, _ = cpuid(0x80000000, 0) From 7c9330065a00a8fe36c019485167c44703ceb242 Mon Sep 17 00:00:00 2001 From: Anton Kuklin Date: Mon, 7 Aug 2023 20:10:48 +0100 Subject: [PATCH 3/6] fix flag --- src/internal/cpu/cpu_x86.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/internal/cpu/cpu_x86.go b/src/internal/cpu/cpu_x86.go index c307b1bf5f7622..000981f6b0528e 100644 --- a/src/internal/cpu/cpu_x86.go +++ b/src/internal/cpu/cpu_x86.go @@ -128,7 +128,7 @@ func doinit() { _, ebx7, _, edx7 := cpuid(7, 0) X86.HasBMI1 = isSet(ebx7, cpuid_BMI1) X86.HasAVX2 = isSet(ebx7, cpuid_AVX2) && osSupportsAVX - X86.HasAMX = isSet(edx7, cpuid_AMX) && osSupportsAVX + X86.HasAMX = isSet(edx7, cpuid_AMX) X86.HasBMI2 = isSet(ebx7, cpuid_BMI2) X86.HasERMS = isSet(ebx7, cpuid_ERMS) X86.HasADX = isSet(ebx7, cpuid_ADX) From 0ee3f5872916d3be5ccf400c1672e8013d984d81 Mon Sep 17 00:00:00 2001 From: Anton Kuklin Date: Tue, 8 Aug 2023 21:05:44 +0100 Subject: [PATCH 4/6] added TILEZERO & TILERELEASE --- src/cmd/internal/obj/x86/aenum.go | 2 ++ src/cmd/internal/obj/x86/anames.go | 2 ++ src/cmd/internal/obj/x86/avx_optabs.go | 14 ++++++++++++++ 3 files changed, 18 insertions(+) diff --git a/src/cmd/internal/obj/x86/aenum.go b/src/cmd/internal/obj/x86/aenum.go index 399b456d700a9f..fee2ab735bbf0b 100644 --- a/src/cmd/internal/obj/x86/aenum.go +++ b/src/cmd/internal/obj/x86/aenum.go @@ -1617,4 +1617,6 @@ const ( ATILELOADD ATILELOADDT1 ATILESTORED + ATILEZERO + ATILERELEASE ) diff --git a/src/cmd/internal/obj/x86/anames.go b/src/cmd/internal/obj/x86/anames.go index 36948b88ed8c27..907af13a614200 100644 --- a/src/cmd/internal/obj/x86/anames.go +++ b/src/cmd/internal/obj/x86/anames.go @@ -1615,4 +1615,6 @@ var Anames = []string{ "TILELOADD", "TILELOADDT1", "TILESTORED", + "TILEZERO", + "TILERELEASE", } diff --git a/src/cmd/internal/obj/x86/avx_optabs.go b/src/cmd/internal/obj/x86/avx_optabs.go index 
1eea7db2351551..3742a74f7e1893 100644 --- a/src/cmd/internal/obj/x86/avx_optabs.go +++ b/src/cmd/internal/obj/x86/avx_optabs.go @@ -944,6 +944,14 @@ var _ytilestored = []ytab{ {zcase: Zvex_r_v_rm, zoffset: 2, args: argList{Ytr, Ym}}, } +var _ytilezero = []ytab{ + {zcase: Zvex_r_v_rm, zoffset: 2, args: argList{Ytr}}, +} + +var _ytilerelease = []ytab{ + {zcase: Zvex_rm_v_ro, zoffset: 3, args: argList{}}, +} + var avxOptab = [...]Optab{ {as: AANDNL, ytab: _yandnl, prefix: Pavx, op: opBytes{ avxEscape | vex128 | vex0F38 | vexW0, 0xF2, @@ -4671,4 +4679,10 @@ var avxOptab = [...]Optab{ {as: ATILESTORED, ytab: _ytilestored, prefix: Pavx, op: opBytes{ avxEscape | vex128 | vexF3 | vex0F38 | vexW0, 0x4B, }}, + {as: ATILEZERO, ytab: _ytilezero, prefix: Pavx, op: opBytes{ + avxEscape | vex128 | vexF2 | vex0F38 | vexW0, 0x49, + }}, + {as: ATILERELEASE, ytab: _ytilerelease, prefix: Pavx, op: opBytes{ + avxEscape | vex128 | vex0F38 | vexW0, 0x49, 00, + }}, } From 44191b44c6da42d5b59cef28fa1bad7cbe9633a1 Mon Sep 17 00:00:00 2001 From: Anton Kuklin Date: Tue, 8 Aug 2023 21:28:09 +0100 Subject: [PATCH 5/6] renamed register class Ytr to Ytm --- src/cmd/internal/obj/x86/asm6.go | 2 +- src/cmd/internal/obj/x86/avx_optabs.go | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/cmd/internal/obj/x86/asm6.go b/src/cmd/internal/obj/x86/asm6.go index 3186cf5ac2aa94..51865f8a572865 100644 --- a/src/cmd/internal/obj/x86/asm6.go +++ b/src/cmd/internal/obj/x86/asm6.go @@ -192,7 +192,7 @@ const ( Ytextsize Yindir Ymax - Ytr + Ytm ) const ( diff --git a/src/cmd/internal/obj/x86/avx_optabs.go b/src/cmd/internal/obj/x86/avx_optabs.go index 3742a74f7e1893..487e34366e73fa 100644 --- a/src/cmd/internal/obj/x86/avx_optabs.go +++ b/src/cmd/internal/obj/x86/avx_optabs.go @@ -933,19 +933,19 @@ var _yldtilecfg = []ytab{ } var _ytdpbf16ps = []ytab{ - {zcase: Zvex_v_rm_r, zoffset: 2, args: argList{Ytr, Ytr, Ytr}}, + {zcase: Zvex_v_rm_r, zoffset: 2, args: argList{Ytm, Ytm, Ytm}}, } var 
_ytileloadd = []ytab{ - {zcase: Zvex_rm_v_r, zoffset: 2, args: argList{Ym, Ytr}}, + {zcase: Zvex_rm_v_r, zoffset: 2, args: argList{Ym, Ytm}}, } var _ytilestored = []ytab{ - {zcase: Zvex_r_v_rm, zoffset: 2, args: argList{Ytr, Ym}}, + {zcase: Zvex_r_v_rm, zoffset: 2, args: argList{Ytm, Ym}}, } var _ytilezero = []ytab{ - {zcase: Zvex_r_v_rm, zoffset: 2, args: argList{Ytr}}, + {zcase: Zvex_r_v_rm, zoffset: 2, args: argList{Ytm}}, } var _ytilerelease = []ytab{ From e5e426bc475cecb6134b76cf3b7968d46802ca50 Mon Sep 17 00:00:00 2001 From: Anton Kuklin Date: Tue, 8 Aug 2023 22:42:19 +0100 Subject: [PATCH 6/6] defined tmm registers --- src/cmd/internal/obj/x86/a.out.go | 9 +++++++++ src/cmd/internal/obj/x86/asm6.go | 14 ++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/src/cmd/internal/obj/x86/a.out.go b/src/cmd/internal/obj/x86/a.out.go index b121f6df7b2f8e..2d7efa958b1075 100644 --- a/src/cmd/internal/obj/x86/a.out.go +++ b/src/cmd/internal/obj/x86/a.out.go @@ -202,6 +202,15 @@ const ( REG_Z30 REG_Z31 + REG_TM0 + REG_TM1 + REG_TM2 + REG_TM3 + REG_TM4 + REG_TM5 + REG_TM6 + REG_TM7 + REG_CS REG_SS REG_DS diff --git a/src/cmd/internal/obj/x86/asm6.go b/src/cmd/internal/obj/x86/asm6.go index 51865f8a572865..fd6bb9faab1c50 100644 --- a/src/cmd/internal/obj/x86/asm6.go +++ b/src/cmd/internal/obj/x86/asm6.go @@ -2468,6 +2468,10 @@ func instinit(ctxt *obj.Link) { } } + if i >= REG_TM0 && i <= REG_TM0+7 { + reg[i] = (i - REG_TM0) & 7 + } + if i >= REG_CR+8 && i <= REG_CR+15 { regrex[i] = Rxr } @@ -3078,6 +3082,16 @@ func oclass(ctxt *obj.Link, p *obj.Prog, a *obj.Addr) int { } return Yzr + case REG_TM0 + 0, + REG_TM0 + 1, + REG_TM0 + 2, + REG_TM0 + 3, + REG_TM0 + 4, + REG_TM0 + 5, + REG_TM0 + 6, + REG_TM0 + 7: + return Ytm + case REG_K0: return Yk0