Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dot graviton sve #4797

Merged
merged 8 commits into from
Jun 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
21 changes: 21 additions & 0 deletions adapters/repos/db/vector/hnsw/distancer/asm/dot_neon_arm64.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// _ _
// __ _____ __ ___ ___ __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
// \ V V / __/ (_| |\ V /| | (_| | || __/
// \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
// Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
// CONTACT: hello@weaviate.io
//

//go:build !noasm && arm64

// AUTO-GENERATED BY GOAT -- DO NOT EDIT

package asm

import "unsafe"

// dot_neon is the NEON dot-product kernel; its body is the assembly routine
// TEXT ·dot_neon. All four arguments are raw pointers:
//   - a, b: first float32 element of each input array
//   - res:  float32 that receives the result (written with a single store)
//   - len:  integer element count (the kernel dereferences it once on entry)
//
// The kernel performs no bounds checks; both arrays must hold at least *len
// elements.
//
//go:noescape
func dot_neon(a, b, res, len unsafe.Pointer)
142 changes: 142 additions & 0 deletions adapters/repos/db/vector/hnsw/distancer/asm/dot_neon_arm64.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
//go:build !noasm && arm64
// AUTO-GENERATED BY GOAT -- DO NOT EDIT

// dot_neon computes the float32 dot product of the arrays at a and b with
// NEON instructions and stores the scalar result through res. len points to
// the element count n; after the initial 64-bit load only its low 32 bits
// (w9) take part in the arithmetic.
// Register use after the argument loads: R0 = a, R1 = b, R2 = res, R3 = len.
TEXT ·dot_neon(SB), $0-32
MOVD a+0(FP), R0
MOVD b+8(FP), R1
MOVD res+16(FP), R2
MOVD len+24(FP), R3
// Prologue: save x29/x30, load n, then split it into
//   w8  = n % 4 (signed remainder, handled by the scalar tail)
//   w10 = n - w8 (element count covered by the vector loops)
// If w10 >= 16 the 16-wide loop at LBB0_2/LBB0_3 runs first; otherwise the
// accumulators v0-v3 and the index w11 are zeroed and control jumps to LBB0_4.
WORD $0xa9bf7bfd // stp x29, x30, [sp, #-16]!
WORD $0xf9400069 // ldr x9, [x3]
WORD $0x910003fd // mov x29, sp
WORD $0x6b0903e8 // negs w8, w9
WORD $0x1200052a // and w10, w9, #0x3
WORD $0x12000508 // and w8, w8, #0x3
WORD $0x5a884548 // csneg w8, w10, w8, mi
WORD $0x4b08012a // sub w10, w9, w8
WORD $0x7100415f // cmp w10, #16
WORD $0x540000ea // b.ge .LBB0_2
WORD $0x6f00e400 // movi v0.2d, #0000000000000000
WORD $0x6f00e401 // movi v1.2d, #0000000000000000
WORD $0x2a1f03eb // mov w11, wzr
WORD $0x6f00e403 // movi v3.2d, #0000000000000000
WORD $0x6f00e402 // movi v2.2d, #0000000000000000
WORD $0x14000016 // b .LBB0_4

// Set-up for the 16-wide loop: zero the four accumulators, x11 = 0 (element
// index), x12/x13 = running read pointers into a and b.
LBB0_2:
WORD $0x6f00e402 // movi v2.2d, #0000000000000000
WORD $0x6f00e403 // movi v3.2d, #0000000000000000
WORD $0xaa1f03eb // mov x11, xzr
WORD $0x6f00e401 // movi v1.2d, #0000000000000000
WORD $0x6f00e400 // movi v0.2d, #0000000000000000
WORD $0xaa0003ec // mov x12, x0
WORD $0xaa0103ed // mov x13, x1

// 16-wide loop: each iteration loads four 4-lane vectors from a and from b
// (post-incrementing both pointers by 64 bytes), multiplies them lane-wise
// and adds the products into v2, v3, v1 and v0. x11 advances by 16 and the
// loop repeats while (old x11 + 32) <= x10, i.e. while another full group of
// 16 elements remains.
LBB0_3:
WORD $0x4cdf2984 // ld1 { v4.4s, v5.4s, v6.4s, v7.4s }, [x12], #64
WORD $0x9100816e // add x14, x11, #32
WORD $0x4cdf29b0 // ld1 { v16.4s, v17.4s, v18.4s, v19.4s }, [x13], #64
WORD $0xeb0a01df // cmp x14, x10
WORD $0x9100416b // add x11, x11, #16
WORD $0x6e30dc94 // fmul v20.4s, v4.4s, v16.4s
WORD $0x6e31dcb5 // fmul v21.4s, v5.4s, v17.4s
WORD $0x6e32dcd6 // fmul v22.4s, v6.4s, v18.4s
WORD $0x6e33dce4 // fmul v4.4s, v7.4s, v19.4s
WORD $0x4e34d442 // fadd v2.4s, v2.4s, v20.4s
WORD $0x4e35d463 // fadd v3.4s, v3.4s, v21.4s
WORD $0x4e36d421 // fadd v1.4s, v1.4s, v22.4s
WORD $0x4e24d400 // fadd v0.4s, v0.4s, v4.4s
WORD $0x54fffe69 // b.ls .LBB0_3

// 4-wide loop set-up: skip to the reduction when index w11 has already
// reached w10; otherwise sign-extend the limit into x10 and derive read
// pointers x13 = a + 4*index and x12 = b + 4*index.
LBB0_4:
WORD $0x6b0a017f // cmp w11, w10
WORD $0x540001ca // b.ge .LBB0_7
WORD $0x2a0b03ec // mov w12, w11
WORD $0x93407d4a // sxtw x10, w10
WORD $0x2a0b03eb // mov w11, w11
WORD $0xd37e7d8d // ubfiz x13, x12, #2, #32
WORD $0x8b0d002c // add x12, x1, x13
WORD $0x8b0d000d // add x13, x0, x13

// 4-wide loop: one 4-lane vector from each input per iteration, products
// accumulated into v2, repeated while x11 < x10.
LBB0_6:
WORD $0x3cc105a4 // ldr q4, [x13], #16
WORD $0x9100116b // add x11, x11, #4
WORD $0x3cc10585 // ldr q5, [x12], #16
WORD $0xeb0a017f // cmp x11, x10
WORD $0x6e25dc84 // fmul v4.4s, v4.4s, v5.4s
WORD $0x4e24d442 // fadd v2.4s, v2.4s, v4.4s
WORD $0x54ffff4b // b.lt .LBB0_6

// Horizontal reduction: pairwise-add each accumulator down to one lane and
// sum the four partial results into s0 in the order v2, v3, v1, v0. The
// interleaved `cmp w8, #1` feeds the b.lt below, which goes straight to the
// store when the remainder w8 is less than 1.
// Otherwise the tail bounds are computed: x9 = n (sign-extended), x11 = number
// of tail elements, x10 = index of the first tail element.
// NOTE(review): w8 is n % 4, so x11 should never reach 8 and the 8-wide tail
// at LBB0_10 looks unreachable in practice; confirm before relying on it.
LBB0_7:
WORD $0x6e22d442 // faddp v2.4s, v2.4s, v2.4s
WORD $0x6e23d463 // faddp v3.4s, v3.4s, v3.4s
WORD $0x7100051f // cmp w8, #1
WORD $0x6e21d421 // faddp v1.4s, v1.4s, v1.4s
WORD $0x6e20d400 // faddp v0.4s, v0.4s, v0.4s
WORD $0x7e30d842 // faddp s2, v2.2s
WORD $0x7e30d863 // faddp s3, v3.2s
WORD $0x7e30d821 // faddp s1, v1.2s
WORD $0x7e30d800 // faddp s0, v0.2s
WORD $0x1e232842 // fadd s2, s2, s3
WORD $0x1e212841 // fadd s1, s2, s1
WORD $0x1e202820 // fadd s0, s1, s0
WORD $0x5400066b // b.lt .LBB0_13
WORD $0x93407d29 // sxtw x9, w9
WORD $0x4b0803ec // neg w12, w8
WORD $0xcb08012a // sub x10, x9, x8
WORD $0x9100054b // add x11, x10, #1
WORD $0xeb09017f // cmp x11, x9
WORD $0x9a8ad52a // csinc x10, x9, x10, le
WORD $0x8b08014a // add x10, x10, x8
WORD $0xcb09014b // sub x11, x10, x9
WORD $0x8b2cc12a // add x10, x9, w12, sxtw
WORD $0xf100217f // cmp x11, #8
WORD $0x54000423 // b.lo .LBB0_12
WORD $0xd37ef52c // lsl x12, x9, #2
WORD $0xcb28c98c // sub x12, x12, w8, sxtw #2
WORD $0x927df168 // and x8, x11, #0xfffffffffffffff8
WORD $0x8b08014a // add x10, x10, x8
WORD $0xaa0803ee // mov x14, x8
WORD $0x9100418d // add x13, x12, #16
WORD $0x8b0d002c // add x12, x1, x13
WORD $0x8b0d000d // add x13, x0, x13

// 8-wide tail loop: two 4-lane vectors from each input per iteration. The
// products are added to s0 one lane at a time, in element order, so the
// running sum stays a strictly sequential accumulation. x14 counts down the
// remaining elements in steps of 8. After the loop, finish if nothing is left
// (x11 == x8), otherwise fall through to the scalar loop.
LBB0_10:
WORD $0xad7f9181 // ldp q1, q4, [x12, #-16]
WORD $0xf10021ce // subs x14, x14, #8
WORD $0xad7f8da2 // ldp q2, q3, [x13, #-16]
WORD $0x9100818c // add x12, x12, #32
WORD $0x910081ad // add x13, x13, #32
WORD $0x6e21dc41 // fmul v1.4s, v2.4s, v1.4s
WORD $0x5e0c0422 // mov s2, v1.s[1]
WORD $0x1e212800 // fadd s0, s0, s1
WORD $0x5e140425 // mov s5, v1.s[2]
WORD $0x5e1c0421 // mov s1, v1.s[3]
WORD $0x1e222800 // fadd s0, s0, s2
WORD $0x6e24dc62 // fmul v2.4s, v3.4s, v4.4s
WORD $0x1e252800 // fadd s0, s0, s5
WORD $0x5e140443 // mov s3, v2.s[2]
WORD $0x1e212800 // fadd s0, s0, s1
WORD $0x5e0c0441 // mov s1, v2.s[1]
WORD $0x1e222800 // fadd s0, s0, s2
WORD $0x1e212800 // fadd s0, s0, s1
WORD $0x5e1c0441 // mov s1, v2.s[3]
WORD $0x1e232800 // fadd s0, s0, s3
WORD $0x1e212800 // fadd s0, s0, s1
WORD $0x54fffd61 // b.ne .LBB0_10
WORD $0xeb08017f // cmp x11, x8
WORD $0x54000100 // b.eq .LBB0_13

// Scalar tail loop: s0 += a[x10] * b[x10] (fused multiply-add), with x10
// incremented each iteration until it reaches n (x9).
LBB0_12:
WORD $0xd37ef548 // lsl x8, x10, #2
WORD $0x9100054a // add x10, x10, #1
WORD $0xeb09015f // cmp x10, x9
WORD $0xbc686801 // ldr s1, [x0, x8]
WORD $0xbc686822 // ldr s2, [x1, x8]
WORD $0x1f020020 // fmadd s0, s1, s2, s0
WORD $0x54ffff4b // b.lt .LBB0_12

// Epilogue: store the result through res, restore x29/x30 and return.
LBB0_13:
WORD $0xbd000040 // str s0, [x2]
WORD $0xa8c17bfd // ldp x29, x30, [sp], #16
WORD $0xd65f03c0 // ret
45 changes: 42 additions & 3 deletions adapters/repos/db/vector/hnsw/distancer/asm/dot_stub_arm64.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ package asm
// go install github.com/gorse-io/[email protected]
// go generate

//go:generate goat ../c/dot_arm64.c -O3 -e="-mfpu=neon-fp-armv8" -e="-mfloat-abi=hard" -e="--target=arm64" -e="-march=armv8-a+simd+fp"
//// go:generate goat ../c/dot_arm64.c -O3 -e="-mfpu=neon-fp-armv8" -e="-mfloat-abi=hard" -e="--target=arm64" -e="-march=armv8-a+simd+fp"
//go:generate goat ../c/dot_neon_arm64.c -O3 -e="--target=arm64" -e="-march=armv8-a+simd+fp"
//go:generate goat ../c/dot_sve_arm64.c -O3 -e="-mcpu=neoverse-v1" -e="--target=arm64" -e="-march=armv8-a+sve"
//go:generate goat ../c/dot_byte_arm64.c -O3 -e="-mfpu=neon-fp-armv8" -e="-mfloat-abi=hard" -e="--target=arm64" -e="-march=armv8-a+simd+fp"

import (
Expand All @@ -25,7 +27,7 @@ import (

// Dot calculates the dot product between two vectors
// using SIMD instructions.
func Dot(x []float32, y []float32) float32 {
func Dot_Neon(x []float32, y []float32) float32 {
switch len(x) {
case 2:
return dot2[float32, float32](x, y)
Expand Down Expand Up @@ -54,7 +56,7 @@ func Dot(x []float32, y []float32) float32 {
hdry := (*reflect.SliceHeader)(unsafe.Pointer(&y))

l := len(x)
dot(
dot_neon(
// The slice header contains the address of the underlying array.
// We only need to cast it to a pointer.
unsafe.Pointer(hdrx.Data),
Expand All @@ -66,6 +68,43 @@ func Dot(x []float32, y []float32) float32 {
return res
}

// Dot_SVE returns the dot product of x and y.
//
// When len(x) is 2, 4, 6, 8, 10 or 12 the result is computed in Go via the
// dot2/dot4/dot6 helpers (lengths 8-12 add the trailing products on top of
// dot6). Every other length is handed to the dot_sve assembly kernel.
//
// Only len(x) is inspected; y is expected to hold at least as many elements.
func Dot_SVE(x []float32, y []float32) float32 {
	n := len(x)

	switch n {
	case 2:
		return dot2[float32, float32](x, y)
	case 4:
		return dot4[float32, float32](x, y)
	case 6:
		return dot6[float32, float32](x, y)
	case 8:
		// dot8, inlined by hand: the first six terms come from dot6.
		tail := x[7]*y[7] + x[6]*y[6]
		return dot6[float32, float32](x, y) + tail
	case 10:
		// dot10, inlined by hand.
		tail := x[9]*y[9] + x[8]*y[8] + x[7]*y[7] + x[6]*y[6]
		return dot6[float32, float32](x, y) + tail
	case 12:
		// dot12, inlined by hand.
		tail := x[11]*y[11] + x[10]*y[10] + x[9]*y[9] + x[8]*y[8] + x[7]*y[7] + x[6]*y[6]
		return dot6[float32, float32](x, y) + tail
	}

	// The kernel takes four raw pointers: the first element of each input,
	// the location that receives the result, and the location holding the
	// element count.
	var (
		out  float32
		xPtr = unsafe.Pointer(unsafe.SliceData(x))
		yPtr = unsafe.Pointer(unsafe.SliceData(y))
	)
	dot_sve(xPtr, yPtr, unsafe.Pointer(&out), unsafe.Pointer(&n))

	return out
}

func DotByteARM64(x []uint8, y []uint8) uint32 {
switch len(x) {
case 2:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ package asm
import "unsafe"

// dot_sve is the SVE dot-product kernel; its body is the assembly routine
// TEXT ·dot_sve. a and b point to the first float32 of each input array, res
// points to the float32 that receives the result, and len points to the
// integer element count. The kernel performs no bounds checks.
// NOTE(review): this hunk lists both `dot` and `dot_sve`; the hunk header
// (-18,4 +18,4) indicates one line replaces the other, presumably `dot` being
// the removed declaration. Confirm against the merged file.
//
//go:noescape
func dot(a, b, res, len unsafe.Pointer)
func dot_sve(a, b, res, len unsafe.Pointer)
129 changes: 129 additions & 0 deletions adapters/repos/db/vector/hnsw/distancer/asm/dot_sve_arm64.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
//go:build !noasm && arm64
// AUTO-GENERATED BY GOAT -- DO NOT EDIT

// dot_sve computes the float32 dot product of the arrays at a and b with SVE
// instructions and stores the scalar result through res. len points to the
// 64-bit element count n.
// Register use after the argument loads: R0 = a, R1 = b, R2 = res, R3 = len.
TEXT ·dot_sve(SB), $0-32
MOVD a+0(FP), R0
MOVD b+8(FP), R1
MOVD res+16(FP), R2
MOVD len+24(FP), R3
// Prologue:
//   x8  = n
//   x10 = number of 32-bit lanes per vector (cntw)
//   x12 = vector length in bytes (rdvl #1), numerically 4 * x10, i.e. the
//         number of floats consumed by one iteration of the 4-vector loop
//   x9  = n & -x10 (n rounded down to a multiple of the lane count, assuming
//         the lane count is a power of two)
//   p0  = all-true predicate for 32-bit lanes
// If x12 <= x9 the 4-vector loop at LBB0_2/LBB0_3 runs first; otherwise the
// accumulators z0-z3 and index x11 are zeroed and control jumps to LBB0_5.
WORD $0xf9400068 // ldr x8, [x3]
WORD $0x04a0e3ea // cntw x10
WORD $0xcb0a03e9 // neg x9, x10
WORD $0x04bf502c // rdvl x12, #1
WORD $0x2598e3e0 // ptrue p0.s
WORD $0x8a090109 // and x9, x8, x9
WORD $0xeb09019f // cmp x12, x9
WORD $0x540000e9 // b.ls .LBB0_2
WORD $0x25b8c000 // mov z0.s, #0
WORD $0xaa1f03eb // mov x11, xzr
WORD $0x04603001 // mov z1.d, z0.d
WORD $0x04603002 // mov z2.d, z0.d
WORD $0x04603003 // mov z3.d, z0.d
WORD $0x14000027 // b .LBB0_5

// Set-up for the 4-vector loop: zero the running sums z4, z6, z7, z5, set
// index x11 = 0 and precompute base pointers one, two and three vector
// lengths past a (x15, x14, x13) and past b (x18, x17, x16).
// NOTE(review): x18 is used as a scratch register here; Go's assembler
// documentation describes R18 as the platform register, reserved on Apple
// platforms. Confirm this is acceptable for every target this file builds on.
LBB0_2:
WORD $0x25b8c004 // mov z4.s, #0
WORD $0x04bf5070 // rdvl x16, #3
WORD $0xaa1f03eb // mov x11, xzr
WORD $0x8b0c000f // add x15, x0, x12
WORD $0x8b0c0032 // add x18, x1, x12
WORD $0x04643086 // mov z6.d, z4.d
WORD $0x04bf5051 // rdvl x17, #2
WORD $0x04643087 // mov z7.d, z4.d
WORD $0x04643085 // mov z5.d, z4.d
WORD $0x8b10000d // add x13, x0, x16
WORD $0x8b11000e // add x14, x0, x17
WORD $0x8b100030 // add x16, x1, x16
WORD $0x8b110031 // add x17, x1, x17

// 4-vector loop: loads four vectors from a (z3, z2, z1, z0) and four from b
// (z16-z19), then fmad computes z3 = z3*z16 + z4 (and likewise for the other
// three), so z3/z2/z1/z0 hold the updated sums. They are copied back into
// z4/z6/z7/z5 for the next iteration. x11 advances by x12 elements and the
// loop repeats while x11 + x12 <= x9.
LBB0_3:
WORD $0xa54b4003 // ld1w { z3.s }, p0/z, [x0, x11, lsl #2]
WORD $0xa54b41e2 // ld1w { z2.s }, p0/z, [x15, x11, lsl #2]
WORD $0xa54b41c1 // ld1w { z1.s }, p0/z, [x14, x11, lsl #2]
WORD $0xa54b41a0 // ld1w { z0.s }, p0/z, [x13, x11, lsl #2]
WORD $0xa54b4030 // ld1w { z16.s }, p0/z, [x1, x11, lsl #2]
WORD $0x65a48203 // fmad z3.s, p0/m, z16.s, z4.s
WORD $0xa54b4251 // ld1w { z17.s }, p0/z, [x18, x11, lsl #2]
WORD $0xa54b4232 // ld1w { z18.s }, p0/z, [x17, x11, lsl #2]
WORD $0xa54b4213 // ld1w { z19.s }, p0/z, [x16, x11, lsl #2]
WORD $0x65a68222 // fmad z2.s, p0/m, z17.s, z6.s
WORD $0x65a78241 // fmad z1.s, p0/m, z18.s, z7.s
WORD $0x65a58260 // fmad z0.s, p0/m, z19.s, z5.s
WORD $0x8b0c016b // add x11, x11, x12
WORD $0x8b0b0183 // add x3, x12, x11
WORD $0xeb09007f // cmp x3, x9
WORD $0x04633064 // mov z4.d, z3.d
WORD $0x04623046 // mov z6.d, z2.d
WORD $0x04613027 // mov z7.d, z1.d
WORD $0x04603005 // mov z5.d, z0.d
WORD $0x54fffda9 // b.ls .LBB0_3
WORD $0x14000005 // b .LBB0_5

// Single-vector loop body: z3 += a[x11..] * b[x11..] for one full vector,
// then x11 advances by one vector's worth of lanes (x10).
LBB0_4:
WORD $0xa54b4004 // ld1w { z4.s }, p0/z, [x0, x11, lsl #2]
WORD $0xa54b4025 // ld1w { z5.s }, p0/z, [x1, x11, lsl #2]
WORD $0x8b0a016b // add x11, x11, x10
WORD $0x65a400a3 // fmla z3.s, p0/m, z5.s, z4.s

// Loop test for the single-vector loop (continue while x11 < x9), followed by
// the horizontal reduction: each accumulator is summed across lanes with
// faddv and the four scalars are added into s0 in the order z3, z2, z1, z0.
// The interleaved `cmp x9, x8` feeds the b.eq below, which goes straight to
// the store when no tail elements remain (x9 == n).
// Otherwise x11 = number of tail elements, compared against cnth (the number
// of 16-bit lanes, i.e. two vectors of floats) to choose the vector tail at
// LBB0_9 or the scalar tail at LBB0_12 (with x10 = x9 as start index).
// NOTE(review): the tail is n minus n rounded down to the lane count, so it
// should stay below cnth and the LBB0_9 path looks unreachable; confirm.
LBB0_5:
WORD $0xeb09017f // cmp x11, x9
WORD $0x54ffff63 // b.lo .LBB0_4
WORD $0x65802063 // faddv s3, p0, z3.s
WORD $0xeb08013f // cmp x9, x8
WORD $0x65802042 // faddv s2, p0, z2.s
WORD $0x1e222862 // fadd s2, s3, s2
WORD $0x65802021 // faddv s1, p0, z1.s
WORD $0x65802000 // faddv s0, p0, z0.s
WORD $0x1e212841 // fadd s1, s2, s1
WORD $0x1e202820 // fadd s0, s1, s0
WORD $0x54000520 // b.eq .LBB0_13
WORD $0xb240012a // orr x10, x9, #0x1
WORD $0xeb0a011f // cmp x8, x10
WORD $0x9a8a810a // csel x10, x8, x10, hi
WORD $0xcb09014b // sub x11, x10, x9
WORD $0x0460e3ea // cnth x10
WORD $0xeb0a017f // cmp x11, x10
WORD $0x54000062 // b.hs .LBB0_9
WORD $0xaa0903ea // mov x10, x9
WORD $0x14000019 // b .LBB0_12

// Vector-tail set-up: x29/x30 are pushed here (and popped after the loop),
// x12 = tail count rounded down to a multiple of cnth, x10 = index where the
// scalar tail will resume, x14/x9 = read pointers into a and b at element x9,
// x15 = byte stride of two vectors, x16 = remaining-element countdown.
LBB0_9:
WORD $0xa9bf7bfd // stp x29, x30, [sp, #-16]!
WORD $0xcb0a03ed // neg x13, x10
WORD $0x04bf504f // rdvl x15, #2
WORD $0x8b09080e // add x14, x0, x9, lsl #2
WORD $0x910003fd // mov x29, sp
WORD $0x8a0d016c // and x12, x11, x13
WORD $0x8b0c012a // add x10, x9, x12
WORD $0x8b090829 // add x9, x1, x9, lsl #2
WORD $0xaa0c03f0 // mov x16, x12

// Vector-tail loop: two vectors from each input per iteration; each product
// vector is folded into s0 with fadda (a strictly ordered lane-by-lane add).
// After the loop the saved x29/x30 are restored; finish if the whole tail was
// consumed (x11 == x12), otherwise fall through to the scalar loop.
LBB0_10:
WORD $0xa540a1c1 // ld1w { z1.s }, p0/z, [x14]
WORD $0xa540a123 // ld1w { z3.s }, p0/z, [x9]
WORD $0xab0d0210 // adds x16, x16, x13
WORD $0x65830821 // fmul z1.s, z1.s, z3.s
WORD $0xa541a1c2 // ld1w { z2.s }, p0/z, [x14, #1, mul vl]
WORD $0x8b0f01ce // add x14, x14, x15
WORD $0x65982020 // fadda s0, p0, s0, z1.s
WORD $0xa541a121 // ld1w { z1.s }, p0/z, [x9, #1, mul vl]
WORD $0x8b0f0129 // add x9, x9, x15
WORD $0x65810841 // fmul z1.s, z2.s, z1.s
WORD $0x65982020 // fadda s0, p0, s0, z1.s
WORD $0x54fffea1 // b.ne .LBB0_10
WORD $0xeb0c017f // cmp x11, x12
WORD $0xa8c17bfd // ldp x29, x30, [sp], #16
WORD $0x54000100 // b.eq .LBB0_13

// Scalar tail loop: s0 += a[x10] * b[x10] (separate multiply and add), with
// x10 incremented each iteration until it reaches n (x8).
LBB0_12:
WORD $0xbc6a7801 // ldr s1, [x0, x10, lsl #2]
WORD $0xbc6a7822 // ldr s2, [x1, x10, lsl #2]
WORD $0x9100054a // add x10, x10, #1
WORD $0xeb08015f // cmp x10, x8
WORD $0x1e220821 // fmul s1, s1, s2
WORD $0x1e212800 // fadd s0, s0, s1
WORD $0x54ffff43 // b.lo .LBB0_12

// Epilogue: store the result through res and return.
LBB0_13:
WORD $0xbd000040 // str s0, [x2]
WORD $0xd65f03c0 // ret