using System.Diagnostics.CodeAnalysis;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

namespace Misaki.HighPerformance.HPC;

/// <summary>
/// Non-generic companion of the generic WideLane struct. It owns the shuffle control tables that the
/// struct's CompressStore reads, and exposes a hardware-support flag.
/// </summary>
public static unsafe class WideLane
{
    // Shuffle control tables, one per vector width (512/256/128 bits) and element size (32/64 bits).
    // CompressStore indexes them with the mask's most-significant-bit pattern multiplied by the number
    // of elements per control vector (16/8 for 512-bit, 8/4 for 256-bit, 4/2 for 128-bit).
    internal static readonly uint* s_pShuffleTable512_32bit;
    internal static readonly ulong* s_pShuffleTable512_64bit;
    internal static readonly uint* s_pShuffleTable256_32bit;
    internal static readonly ulong* s_pShuffleTable256_64bit;
    internal static readonly uint* s_pShuffleTable128_32bit;
    internal static readonly ulong* s_pShuffleTable128_64bit;

    /// <summary>
    /// Gets whether WideLane is supported on the current hardware.
    /// </summary>
    public static bool IsSupported => Vector.IsHardwareAccelerated;

    // Builds every shuffle table once, when the type is first initialized.
    // NOTE(review): the tables are raw pointers and nothing visible here frees them; presumably they are
    // meant to live for the whole process — confirm against ShuffleTableGenerator.
    static WideLane()
    {
        s_pShuffleTable512_32bit = ShuffleTableGenerator.ComputeShuffleTable512_32Bit();
        s_pShuffleTable512_64bit = ShuffleTableGenerator.ComputeShuffleTable512_64Bit();
        s_pShuffleTable256_32bit = ShuffleTableGenerator.ComputeShuffleTable256_32Bit();
        s_pShuffleTable256_64bit = ShuffleTableGenerator.ComputeShuffleTable256_64Bit();
        s_pShuffleTable128_32bit = ShuffleTableGenerator.ComputeShuffleTable128_32Bit();
        s_pShuffleTable128_64bit = ShuffleTableGenerator.ComputeShuffleTable128_64Bit();
    }
}

// TODO: We can use a source generator to emit optimized code for different hardware (e.g., SSE, AVX, AVX2, etc.) and select the best version at runtime.
// Right now, we rely on the Vector API to auto-vectorize the code.
// This works fine under the JIT, but requires users to build multiple binaries with different target architectures to get the best performance in NativeAOT via IlcInstructionSet.
[StructLayout(LayoutKind.Sequential)]
public readonly unsafe partial struct WideLane : ISPMDLane, TNumber>
    where TNumber : unmanaged, INumber, IBinaryNumber, IMinMaxValue, IBitwiseOperators
{
    /// <summary>The wrapped hardware vector; its elements are the lanes.</summary>
    public readonly Vector value;

    /// <summary>Number of lanes, i.e. the element count of the wrapped vector.</summary>
    public static int LaneWidth
    {
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        get => Vector.Count;
    }

    /// <summary>A lane with every element set to zero.</summary>
    public static WideLane Zero
    {
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        get => Unsafe.BitCast, WideLane>(Vector.Zero);
    }

    /// <summary>A lane with every element set to one.</summary>
    public static WideLane One
    {
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        get => Unsafe.BitCast, WideLane>(Vector.One);
    }

    /// <summary>A lane with every element set to <c>TNumber.MinValue</c>.</summary>
    public static WideLane MinValue
    {
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        get => Create(TNumber.MinValue);
    }

    /// <summary>A lane with every element set to <c>TNumber.MaxValue</c>.</summary>
    public static WideLane MaxValue
    {
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        get => Create(TNumber.MaxValue);
    }

    /// <summary>A lane with every element set to <c>TNumber.AllBitsSet</c>.</summary>
    public static WideLane AllBitsSet
    {
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        get => Create(TNumber.AllBitsSet);
    }

    /// <summary>Gets the element stored in lane <paramref name="index"/>.</summary>
    public readonly TNumber this[int index]
    {
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        get => value[index];
    }

    /// <summary>Wraps an existing vector.</summary>
    public WideLane(Vector value)
    {
        this.value = value;
    }

    // Element-wise floor for float/double vectors; any other element type is returned unchanged.
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static Vector VectorFloor(Vector vector)
    {
        if (typeof(TNumber) == typeof(float))
        {
            var v = Unsafe.BitCast, Vector>(vector);
            var floored = Vector.Floor(v);
            return Unsafe.BitCast, Vector>(floored);
        }
        else if (typeof(TNumber) == typeof(double))
        {
            var v = Unsafe.BitCast, Vector>(vector);
            var floored = Vector.Floor(v);
            return Unsafe.BitCast, Vector>(floored);
        }

        return vector;
    }

    // Element-wise truncation for float/double vectors; any other element type is returned unchanged.
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static Vector VectorTruncate(Vector vector)
    {
        if (typeof(TNumber) == typeof(float))
        {
            var v = Unsafe.BitCast, Vector>(vector);
            var truncated = Vector.Truncate(v);
            return Unsafe.BitCast, Vector>(truncated);
        }
        else if (typeof(TNumber) == typeof(double))
        {
            var v = Unsafe.BitCast, Vector>(vector);
            var truncated = Vector.Truncate(v);
            return Unsafe.BitCast, Vector>(truncated);
        }

        return vector;
    }

    /// <summary>Broadcasts <paramref name="value"/> to every lane.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane Create(TNumber value)
    {
        return Unsafe.BitCast, WideLane>(Vector.Create(value));
    }

    /// <summary>Creates a lane from the leading elements of <paramref name="values"/>.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane Create(params ReadOnlySpan values)
    {
        return Unsafe.BitCast, WideLane>(Vector.Create(values));
    }

    /// <summary>Reinterprets an existing vector as a lane.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane Create(Vector value)
    {
        return Unsafe.BitCast, WideLane>(value);
    }

    /// <summary>
    /// Creates the sequence <c>start, start + step, start + 2 * step, ...</c> across the lanes, using the
    /// fixed-width vector type whose element count matches <see cref="LaneWidth"/> when there is one.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane Sequence(TNumber start, TNumber step)
    {
        if (LaneWidth == Vector512.Count)
        {
            var v = Vector512.CreateSequence(start, step);
            return Unsafe.BitCast, WideLane>(v);
        }
        else if (LaneWidth == Vector256.Count)
        {
            var v = Vector256.CreateSequence(start, step);
            return Unsafe.BitCast, WideLane>(v);
        }
        else if (LaneWidth == Vector128.Count)
        {
            var v = Vector128.CreateSequence(start, step);
            return Unsafe.BitCast, WideLane>(v);
        }
        else if (LaneWidth == Vector64.Count)
        {
            var v = Vector64.CreateSequence(start, step);
            return Unsafe.BitCast, WideLane>(v);
        }
        else
        {
            return Unsafe.BitCast, WideLane>(Vector.Create(start) + (Vector.Create(step) * Vector.Indices));
        }
    }

    /// <summary>Loads <see cref="LaneWidth"/> elements starting at <paramref name="value"/>.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane Load(ref TNumber value)
    {
        return Unsafe.BitCast, WideLane>(Vector.LoadUnsafe(ref value));
    }

    /// <summary>Loads <see cref="LaneWidth"/> elements starting at <paramref name="pValue"/>.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane Load(TNumber* pValue)
    {
        return Unsafe.BitCast, WideLane>(Vector.Load(pValue));
    }

    /// <summary>
    /// Loads only the lanes selected by <paramref name="mask"/>; unselected lanes are zero.
    /// Uses <c>Avx.MaskLoad</c> for 4- and 8-byte elements on 128/256-bit vectors, otherwise a scalar loop
    /// that treats any non-zero mask element as selected.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane MaskLoad(TNumber* pValue, WideLane mask)
    {
        if (Avx.IsSupported)
        {
            if (LaneWidth == Vector128.Count)
            {
                if (sizeof(TNumber) == sizeof(float))
                {
                    var result = Avx.MaskLoad((float*)pValue, Unsafe.BitCast, Vector128>(mask));
                    return Unsafe.BitCast, WideLane>(result);
                }
                else if (sizeof(TNumber) == sizeof(double))
                {
                    var result = Avx.MaskLoad((double*)pValue, Unsafe.BitCast, Vector128>(mask));
                    return Unsafe.BitCast, WideLane>(result);
                }
            }
            else if (LaneWidth == Vector256.Count)
            {
                if (sizeof(TNumber) == sizeof(float))
                {
                    var result = Avx.MaskLoad((float*)pValue, Unsafe.BitCast, Vector256>(mask));
                    return Unsafe.BitCast, WideLane>(result);
                }
                else if (sizeof(TNumber) == sizeof(double))
                {
                    var result = Avx.MaskLoad((double*)pValue, Unsafe.BitCast, Vector256>(mask));
                    return Unsafe.BitCast, WideLane>(result);
                }
            }
        }

        Unsafe.SkipInit(out WideLane v);
        var pv = (TNumber*)&v;
        for (var i = 0; i < LaneWidth; i++)
        {
            // TODO: Can we assume pData is padded with TLane.LaneWidth?
            // In that case we can use Load + Select instead of checking mask for each element, which should be faster.
            pv[i] = (mask.value[i] != TNumber.Zero) ? pValue[i] : TNumber.Zero;
        }

        return v;
    }

    /// <summary>Reference overload of <c>MaskLoad</c>; forwards to the pointer overload.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane MaskLoad(ref TNumber value, WideLane mask)
    {
        return MaskLoad((TNumber*)Unsafe.AsPointer(ref value), mask);
    }

    /// <summary>
    /// Gathers one element per lane from <paramref name="pData"/>. Each lane reads from byte offset
    /// <c>index * scale</c>, where the index comes from the matching lane of <paramref name="indices"/>.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane Gather(TNumber* pData, WideLane indices, [ConstantExpected(Min = (byte)(1), Max = (byte)(8))] byte scale)
    {
        if (Avx2.IsSupported)
        {
            if (LaneWidth == Vector128.Count)
            {
                if (sizeof(TNumber) == sizeof(uint))
                {
                    var vidx = SPMDUtility.GetIndicesAs128Int32(indices.value);
                    var vx = Avx2.GatherVector128((uint*)pData, vidx, scale);
                    return Unsafe.BitCast, WideLane>(vx);
                }
                else if (sizeof(TNumber) == sizeof(ulong))
                {
                    var vidx = SPMDUtility.GetIndicesAs128Int64(indices.value);
                    var vx = Avx2.GatherVector128((ulong*)pData, vidx, scale);
                    return Unsafe.BitCast, WideLane>(vx);
                }
            }
            else if (LaneWidth == Vector256.Count)
            {
                if (sizeof(TNumber) == sizeof(uint))
                {
                    var vidx = SPMDUtility.GetIndicesAs256Int32(indices.value);
                    var vx = Avx2.GatherVector256((uint*)pData, vidx, scale);
                    return Unsafe.BitCast, WideLane>(vx);
                }
                else if (sizeof(TNumber) == sizeof(ulong))
                {
                    var vidx = SPMDUtility.GetIndicesAs256Int64(indices.value);
                    var vx = Avx2.GatherVector256((ulong*)pData, vidx, scale);
                    return Unsafe.BitCast, WideLane>(vx);
                }
            }
        }

        Unsafe.SkipInit(out Vector result);
        var pResult = (TNumber*)&result;
        var count = Vector.Count;
        for (var i = 0; i < count; i++)
        {
            var idx = int.CreateTruncating(indices[i]);
            pResult[i] = *(TNumber*)((byte*)pData + (idx * scale));
        }

        return Unsafe.BitCast, WideLane>(result);
    }

    /// <summary>
    /// Gathers one element per lane from <paramref name="pData"/>, taking the indices from the
    /// <see cref="LaneWidth"/> consecutive <c>int</c> values at <paramref name="pIndices"/>.
    /// Each lane reads from byte offset <c>index * scale</c>.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane Gather(TNumber* pData, int* pIndices, [ConstantExpected(Min = (byte)(1), Max = (byte)(8))] byte scale)
    {
        if (Avx2.IsSupported)
        {
            if (LaneWidth == Vector128.Count)
            {
                if (sizeof(TNumber) == sizeof(uint))
                {
                    var vidx = Vector128.Load(pIndices);
                    var vx = Avx2.GatherVector128((uint*)pData, vidx, scale);
                    return Unsafe.BitCast, WideLane>(vx);
                }
                else if (sizeof(TNumber) == sizeof(ulong))
                {
                    // Only two lanes exist here, so read exactly two indices; a full 16-byte load
                    // would read past the end of the caller's index buffer.
                    var vidx = Vector128.Create(pIndices[0], pIndices[1], 0, 0);
                    var vx = Avx2.GatherVector128((ulong*)pData, vidx, scale);
                    return Unsafe.BitCast, WideLane>(vx);
                }
            }
            else if (LaneWidth == Vector256.Count)
            {
                if (sizeof(TNumber) == sizeof(uint))
                {
                    var vidx = Vector256.Load(pIndices);
                    var vx = Avx2.GatherVector256((uint*)pData, vidx, scale);
                    return Unsafe.BitCast, WideLane>(vx);
                }
                else if (sizeof(TNumber) == sizeof(ulong))
                {
                    // Four 8-byte lanes take four 32-bit indices, hence the 128-bit index load.
                    var vidx = Vector128.Load(pIndices);
                    var vx = Avx2.GatherVector256((ulong*)pData, vidx, scale);
                    return Unsafe.BitCast, WideLane>(vx);
                }
            }
        }

        Unsafe.SkipInit(out Vector result);
        var pResult = (TNumber*)&result;
        var count = Vector.Count;
        for (var i = 0; i < count; i++)
        {
            pResult[i] = *(TNumber*)((byte*)pData + (pIndices[i] * scale));
        }

        return Unsafe.BitCast, WideLane>(result);
    }

    /// <summary>Reference overload of <c>Gather</c> with lane indices; forwards to the pointer overload.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane Gather(ref TNumber baseAddress, WideLane indices, [ConstantExpected(Min = (byte)(1), Max = (byte)(8))] byte scale)
    {
        return Gather((TNumber*)Unsafe.AsPointer(ref baseAddress), indices, scale);
    }

    /// <summary>Reference overload of <c>Gather</c> with an index buffer; forwards to the pointer overload.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane Gather(ref TNumber baseAddress, ref int baseIndex, [ConstantExpected(Min = (byte)(1), Max = (byte)(8))] byte scale)
    {
        return Gather((TNumber*)Unsafe.AsPointer(ref baseAddress), (int*)Unsafe.AsPointer(ref baseIndex), scale);
    }

    /// <summary>
    /// Gathers only the lanes selected by <paramref name="mask"/>; unselected lanes are zero and their
    /// indices are never dereferenced. The AVX2 path selects on the mask element's most significant bit;
    /// the scalar fallback treats any non-zero mask element as selected, like the other masked fallbacks here.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane MaskGather(TNumber* pData, WideLane indices, WideLane mask, [ConstantExpected(Min = (byte)(1), Max = (byte)(8))] byte scale)
    {
        if (Avx2.IsSupported)
        {
            if (LaneWidth == Vector128.Count)
            {
                if (sizeof(TNumber) == sizeof(uint))
                {
                    var vidx = SPMDUtility.GetIndicesAs128Int32(indices.value);
                    var vmask = Unsafe.BitCast, Vector128>(mask);
                    var vx = Avx2.GatherMaskVector128(Vector128.Zero, (uint*)pData, vidx, vmask, scale);
                    return Unsafe.BitCast, WideLane>(vx);
                }
                else if (sizeof(TNumber) == sizeof(ulong))
                {
                    var vidx = SPMDUtility.GetIndicesAs128Int64(indices.value);
                    var vmask = Unsafe.BitCast, Vector128>(mask);
                    var vx = Avx2.GatherMaskVector128(Vector128.Zero, (ulong*)pData, vidx, vmask, scale);
                    return Unsafe.BitCast, WideLane>(vx);
                }
            }
            else if (LaneWidth == Vector256.Count)
            {
                if (sizeof(TNumber) == sizeof(uint))
                {
                    var vidx = SPMDUtility.GetIndicesAs256Int32(indices.value);
                    var vmask = Unsafe.BitCast, Vector256>(mask);
                    var vx = Avx2.GatherMaskVector256(Vector256.Zero, (uint*)pData, vidx, vmask, scale);
                    return Unsafe.BitCast, WideLane>(vx);
                }
                else if (sizeof(TNumber) == sizeof(ulong))
                {
                    var vidx = SPMDUtility.GetIndicesAs256Int64(indices.value);
                    var vmask = Unsafe.BitCast, Vector256>(mask);
                    var vx = Avx2.GatherMaskVector256(Vector256.Zero, (ulong*)pData, vidx, vmask, scale);
                    return Unsafe.BitCast, WideLane>(vx);
                }
            }
        }

        Unsafe.SkipInit(out Vector result);
        var pResult = (TNumber*)&result;
        var count = Vector.Count;
        for (var i = 0; i < count; i++)
        {
            // Honor the mask: a masked-off lane may carry an invalid index, so it must not be read.
            if (mask.value[i] != TNumber.Zero)
            {
                var idx = int.CreateTruncating(indices[i]);
                pResult[i] = *(TNumber*)((byte*)pData + (idx * scale));
            }
            else
            {
                pResult[i] = TNumber.Zero;
            }
        }

        return Unsafe.BitCast, WideLane>(result);
    }

    /// <summary>
    /// Masked gather taking the indices from the <c>int</c> buffer at <paramref name="pIndices"/>.
    /// Unselected lanes are zero and their indices are never dereferenced.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane MaskGather(TNumber* pData, int* pIndices, WideLane mask, [ConstantExpected(Min = (byte)(1), Max = (byte)(8))] byte scale)
    {
        if (Avx2.IsSupported)
        {
            if (LaneWidth == Vector128.Count)
            {
                if (sizeof(TNumber) == sizeof(uint))
                {
                    var vidx = Vector128.Load(pIndices);
                    var vmask = Unsafe.BitCast, Vector128>(mask);
                    var vx = Avx2.GatherMaskVector128(Vector128.Zero, (uint*)pData, vidx, vmask, scale);
                    return Unsafe.BitCast, WideLane>(vx);
                }
                else if (sizeof(TNumber) == sizeof(ulong))
                {
                    // Only two lanes exist here, so read exactly two indices; a full 16-byte load
                    // would read past the end of the caller's index buffer.
                    var vidx = Vector128.Create(pIndices[0], pIndices[1], 0, 0);
                    var vmask = Unsafe.BitCast, Vector128>(mask);
                    var vx = Avx2.GatherMaskVector128(Vector128.Zero, (ulong*)pData, vidx, vmask, scale);
                    return Unsafe.BitCast, WideLane>(vx);
                }
            }
            else if (LaneWidth == Vector256.Count)
            {
                if (sizeof(TNumber) == sizeof(uint))
                {
                    var vidx = Vector256.Load(pIndices);
                    var vmask = Unsafe.BitCast, Vector256>(mask);
                    var vx = Avx2.GatherMaskVector256(Vector256.Zero, (uint*)pData, vidx, vmask, scale);
                    return Unsafe.BitCast, WideLane>(vx);
                }
                else if (sizeof(TNumber) == sizeof(ulong))
                {
                    // Four 8-byte lanes take four 32-bit indices, hence the 128-bit index load.
                    var vidx = Vector128.Load(pIndices);
                    var vmask = Unsafe.BitCast, Vector256>(mask);
                    var vx = Avx2.GatherMaskVector256(Vector256.Zero, (ulong*)pData, vidx, vmask, scale);
                    return Unsafe.BitCast, WideLane>(vx);
                }
            }
        }

        Unsafe.SkipInit(out Vector result);
        var pResult = (TNumber*)&result;
        var count = Vector.Count;
        for (var i = 0; i < count; i++)
        {
            // Honor the mask: a masked-off lane may carry an invalid index, so it must not be read.
            if (mask.value[i] != TNumber.Zero)
            {
                pResult[i] = *(TNumber*)((byte*)pData + (pIndices[i] * scale));
            }
            else
            {
                pResult[i] = TNumber.Zero;
            }
        }

        return Unsafe.BitCast, WideLane>(result);
    }

    /// <summary>Stores all lanes starting at <paramref name="destination"/>.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public readonly void Store(ref TNumber destination)
    {
        value.StoreUnsafe(ref destination);
    }

    /// <summary>Stores all lanes starting at <paramref name="pDestination"/>.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public readonly void Store(TNumber* pDestination)
    {
        value.Store(pDestination);
    }

    /// <summary>Reference overload of <c>CompressStore</c>; forwards to the pointer overload.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public int CompressStore(ref TNumber destination, WideLane mask)
    {
        return CompressStore((TNumber*)Unsafe.AsPointer(ref destination), mask);
    }

    /// <summary>
    /// Writes the lanes selected by <paramref name="mask"/> contiguously to <paramref name="pDestination"/>
    /// and returns how many were selected.
    /// </summary>
    /// <remarks>
    /// The accelerated paths shuffle the selected lanes to the front and then store the whole vector, so
    /// they write <see cref="LaneWidth"/> elements regardless of the returned count; the destination must
    /// have room for a full vector.
    /// </remarks>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public int CompressStore(TNumber* pDestination, WideLane mask)
    {
        if (LaneWidth == Vector512.Count && Vector512.IsHardwareAccelerated)
        {
            if (sizeof(TNumber) == 4)
            {
                var vec = Unsafe.BitCast, Vector512>(Unsafe.AsRef(in this));
                var m = Unsafe.BitCast, Vector512>(mask);
                var moveMask = m.ExtractMostSignificantBits();
                // Offset is (moveMask * 16) because each control vector has 16 elements
                var shuffle = Vector512.Load(WideLane.s_pShuffleTable512_32bit + (moveMask * 16));
                var compressed = Vector512.Shuffle(vec, shuffle);
                compressed.Store((uint*)pDestination);
                return BitOperations.PopCount(moveMask);
            }
            else if (sizeof(TNumber) == 8)
            {
                var vec = Unsafe.BitCast, Vector512>(Unsafe.AsRef(in this));
                var m = Unsafe.BitCast, Vector512>(mask);
                var moveMask = m.ExtractMostSignificantBits();
                // Offset is (moveMask * 8) because each control vector has 8 elements
                var shuffle = Vector512.Load(WideLane.s_pShuffleTable512_64bit + (moveMask * 8));
                var compressed = Vector512.Shuffle(vec, shuffle);
                compressed.Store((ulong*)pDestination);
                return BitOperations.PopCount(moveMask);
            }
        }
        else if (LaneWidth == Vector256.Count && Vector256.IsHardwareAccelerated)
        {
            if (sizeof(TNumber) == 4)
            {
                var vec = Unsafe.BitCast, Vector256>(Unsafe.AsRef(in this));
                var m = Unsafe.BitCast, Vector256>(mask);
                var moveMask = m.ExtractMostSignificantBits();
                // Offset is (moveMask * 8) because each control vector has 8 elements
                var shuffle = Vector256.Load(WideLane.s_pShuffleTable256_32bit + (moveMask * 8));
                var compressed = Vector256.Shuffle(vec, shuffle);
                compressed.Store((uint*)pDestination);
                return BitOperations.PopCount(moveMask);
            }
            else if (sizeof(TNumber) == 8)
            {
                var vec = Unsafe.BitCast, Vector256>(Unsafe.AsRef(in this));
                var m = Unsafe.BitCast, Vector256>(mask);
                // For 64-bit, ExtractMostSignificantBits only populates 4 bits (0-15)
                var moveMask = m.ExtractMostSignificantBits();
                // Offset is (moveMask * 4) because each control vector has 4 elements
                var shuffle = Vector256.Load(WideLane.s_pShuffleTable256_64bit + (moveMask * 4));
                var compressed = Vector256.Shuffle(vec, shuffle);
                compressed.Store((ulong*)pDestination);
                return BitOperations.PopCount(moveMask);
            }
        }
        else if (LaneWidth == Vector128.Count && Vector128.IsHardwareAccelerated)
        {
            if (sizeof(TNumber) == 4)
            {
                var vec = Unsafe.BitCast, Vector128>(Unsafe.AsRef(in this));
                var m = Unsafe.BitCast, Vector128>(mask);
                var moveMask = m.ExtractMostSignificantBits();
                // Offset is (moveMask * 4) because each control vector has 4 elements
                var shuffle = Vector128.Load(WideLane.s_pShuffleTable128_32bit + (moveMask * 4));
                var compressed = Vector128.Shuffle(vec, shuffle);
                compressed.Store((uint*)pDestination);
                return BitOperations.PopCount(moveMask);
            }
            else if (sizeof(TNumber) == 8)
            {
                var vec = Unsafe.BitCast, Vector128>(Unsafe.AsRef(in this));
                var m = Unsafe.BitCast, Vector128>(mask);
                var moveMask = m.ExtractMostSignificantBits();
                // Offset is (moveMask * 2) because each control vector has 2 elements
                var shuffle = Vector128.Load(WideLane.s_pShuffleTable128_64bit + (moveMask * 2));
                var compressed = Vector128.Shuffle(vec, shuffle);
                compressed.Store((ulong*)pDestination);
                return BitOperations.PopCount(moveMask);
            }
        }

        // This is slow, but correct on ANY hardware.
        // A lane is selected when its mask element is non-zero (the accelerated paths test the sign bit).
        var count = 0;
        for (var i = 0; i < LaneWidth; i++)
        {
            // TODO: Can we assume pData is padded with TLane.LaneWidth?
            // In that case we can use Load + Select instead of checking mask for each element, which should be faster.
            if (mask.value[i] != TNumber.Zero)
            {
                pDestination[count++] = value[i];
            }
        }

        return count;
    }

    /// <summary>
    /// Stores only the lanes selected by <paramref name="mask"/>; other destination elements are untouched.
    /// Uses <c>Avx.MaskStore</c> for 4- and 8-byte elements on 128/256-bit vectors, otherwise a scalar loop
    /// that treats any non-zero mask element as selected.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void MaskStore(TNumber* pDst, WideLane mask)
    {
        if (Avx.IsSupported)
        {
            if (LaneWidth == Vector128.Count)
            {
                if (sizeof(TNumber) == sizeof(float))
                {
                    var v = Unsafe.BitCast, Vector128>(this);
                    var m = Unsafe.BitCast, Vector128>(mask);
                    Avx.MaskStore((float*)pDst, m, v);
                    return;
                }
                else if (sizeof(TNumber) == sizeof(double))
                {
                    var v = Unsafe.BitCast, Vector128>(this);
                    var m = Unsafe.BitCast, Vector128>(mask);
                    Avx.MaskStore((double*)pDst, m, v);
                    return;
                }
            }
            else if (LaneWidth == Vector256.Count)
            {
                if (sizeof(TNumber) == sizeof(float))
                {
                    var v = Unsafe.BitCast, Vector256>(this);
                    var m = Unsafe.BitCast, Vector256>(mask);
                    Avx.MaskStore((float*)pDst, m, v);
                    return;
                }
                else if (sizeof(TNumber) == sizeof(double))
                {
                    var v = Unsafe.BitCast, Vector256>(this);
                    var m = Unsafe.BitCast, Vector256>(mask);
                    Avx.MaskStore((double*)pDst, m, v);
                    return;
                }
            }
        }

        for (var i = 0; i < LaneWidth; i++)
        {
            if (mask.value[i] != TNumber.Zero)
            {
                pDst[i] = value[i];
            }
        }
    }

    /// <summary>Reference overload of <c>MaskStore</c>; forwards to the pointer overload.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void MaskStore(ref TNumber destination, WideLane mask)
    {
        MaskStore((TNumber*)Unsafe.AsPointer(ref destination), mask);
    }

    /// <summary>Writes lane <c>i</c> to <c>pDst[indices[i]]</c> (element indices, no byte scale).</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void Scatter(TNumber* pDst, WideLane indices)
    {
        for (var i = 0; i < LaneWidth; i++)
        {
            var idx = int.CreateTruncating(indices[i]);
            pDst[idx] = value[i];
        }
    }

    /// <summary>Writes lane <c>i</c> to the element <c>indices[i]</c> positions after <paramref name="destination"/>.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void Scatter(ref TNumber destination, WideLane indices)
    {
        for (var i = 0; i < LaneWidth; i++)
        {
            var idx = int.CreateTruncating(indices[i]);
            Unsafe.Add(ref destination, idx) = value[i];
        }
    }

    /// <summary>Writes lane <c>i</c> to <c>pDst[pIndices[i]]</c>.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void Scatter(TNumber* pDst, int* pIndices)
    {
        for (var i = 0; i < LaneWidth; i++)
        {
            var idx = pIndices[i];
            pDst[idx] = value[i];
        }
    }

    /// <summary>Writes lane <c>i</c> to the element <c>pIndices[i]</c> positions after <paramref name="destination"/>.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void Scatter(ref TNumber destination, int* pIndices)
    {
        for (var i = 0; i < LaneWidth; i++)
        {
            var idx = pIndices[i];
            Unsafe.Add(ref destination, idx) = value[i];
        }
    }

    /// <summary>Scatter that skips lanes whose mask element is zero.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void MaskScatter(TNumber* pDst, WideLane indices, WideLane mask)
    {
        for (var i = 0; i < LaneWidth; i++)
        {
            if (mask.value[i] != TNumber.Zero)
            {
                var idx = int.CreateTruncating(indices[i]);
                pDst[idx] = value[i];
            }
        }
    }

    /// <summary>Reference-based scatter that skips lanes whose mask element is zero.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void MaskScatter(ref TNumber destination, WideLane indices, WideLane mask)
    {
        for (var i = 0; i < LaneWidth; i++)
        {
            if (mask.value[i] != TNumber.Zero)
            {
                var idx = int.CreateTruncating(indices[i]);
                Unsafe.Add(ref destination, idx) = value[i];
            }
        }
    }

    /// <summary>Scatter with an <c>int</c> index buffer that skips lanes whose mask element is zero.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void MaskScatter(TNumber* pDst, int* pIndices, WideLane mask)
    {
        for (var i = 0; i < LaneWidth; i++)
        {
            if (mask.value[i] != TNumber.Zero)
            {
                var idx = pIndices[i];
                pDst[idx] = value[i];
            }
        }
    }

    /// <summary>Reference-based scatter with an <c>int</c> index buffer that skips lanes whose mask element is zero.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public void MaskScatter(ref TNumber destination, int* pIndices, WideLane mask)
    {
        for (var i = 0; i < LaneWidth; i++)
        {
            if (mask.value[i] != TNumber.Zero)
            {
                var idx = pIndices[i];
                Unsafe.Add(ref destination, idx) = value[i];
            }
        }
    }

    /// <summary>Returns the wrapped vector.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public readonly Vector AsVector()
    {
        return value;
    }

    /// <summary>
    /// Returns a pointer to the first element of <see cref="value"/>.
    /// NOTE(review): the pointer aliases this struct's storage, so it is presumably only valid while the
    /// instance stays at a fixed location — confirm at call sites.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public readonly TNumber* GetUnsafePtr()
    {
        return (TNumber*)Unsafe.AsPointer(ref Unsafe.AsRef(in value));
    }

    /// <summary>Reinterprets the bits of this lane as another lane type without converting values.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public TOther BitCast()
        where TOther : ISPMDLane
        where TOtherNumber : unmanaged, INumber, IBinaryNumber, IMinMaxValue, IBitwiseOperators
    {
        return Unsafe.BitCast, TOther>(this);
    }

    /// <summary>Element-wise addition.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane operator +(WideLane a, WideLane b)
    {
        return Unsafe.BitCast, WideLane>(a.value + b.value);
    }

    /// <summary>Element-wise subtraction.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane operator -(WideLane a, WideLane b)
    {
        return Unsafe.BitCast, WideLane>(a.value - b.value);
    }

    /// <summary>Element-wise multiplication.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane operator *(WideLane a, WideLane b)
    {
        return Unsafe.BitCast, WideLane>(a.value * b.value);
    }

    /// <summary>Element-wise division.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane operator /(WideLane a, WideLane b)
    {
        return Unsafe.BitCast, WideLane>(a.value / b.value);
    }

    /// <summary>
    /// Element-wise modulo computed as <c>a - floor(a / b) * b</c>. For float/double this is a floored
    /// modulo, which differs from the scalar C# <c>%</c> operator when the operands have different signs.
    /// For other element types no flooring is applied to the quotient.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane operator %(WideLane a, WideLane b)
    {
        return Unsafe.BitCast, WideLane>(a.value - VectorFloor(a.value / b.value) * b.value);
    }

    /// <summary>Element-wise negation.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane operator -(WideLane a)
    {
        return Unsafe.BitCast, WideLane>(-a.value);
    }

    /// <summary>Bitwise AND.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane operator &(WideLane a, WideLane b)
    {
        return Unsafe.BitCast, WideLane>(a.value & b.value);
    }

    /// <summary>Bitwise OR.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane operator |(WideLane a, WideLane b)
    {
        return Unsafe.BitCast, WideLane>(a.value | b.value);
    }

    /// <summary>Bitwise XOR.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane operator ^(WideLane a, WideLane b)
    {
        return Unsafe.BitCast, WideLane>(a.value ^ b.value);
    }

    /// <summary>Bitwise complement.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane operator ~(WideLane a)
    {
        return Unsafe.BitCast, WideLane>(~a.value);
    }

    /// <summary>Per-lane equality; returns a lane mask rather than a <c>bool</c>.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane operator ==(WideLane a, WideLane b)
    {
        return Equal(a, b);
    }

    /// <summary>Per-lane inequality: the bitwise complement of the equality mask.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane operator !=(WideLane a, WideLane b)
    {
        return ~Equal(a, b);
    }

    /// <summary>Per-lane greater-than; returns a lane mask.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane operator >(WideLane a, WideLane b)
    {
        return GreaterThan(a, b);
    }

    /// <summary>Per-lane greater-than-or-equal; returns a lane mask.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane operator >=(WideLane a, WideLane b)
    {
        return GreaterThanOrEqual(a, b);
    }
[MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane operator <(WideLane a, WideLane b)
    {
        return LessThan(a, b);
    }

    /// <summary>Per-lane less-than-or-equal; returns a lane mask.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane operator <=(WideLane a, WideLane b)
    {
        return LessThanOrEqual(a, b);
    }

    /// <summary>Broadcasts a scalar to every lane.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static implicit operator WideLane(TNumber value)
    {
        return Create(value);
    }

    /// <summary>Element-wise absolute value.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane Abs(WideLane value)
    {
        return Unsafe.BitCast, WideLane>(Vector.Abs(value.value));
    }

    /// <summary>Element-wise floor for float/double lanes; other element types are returned unchanged.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane Floor(WideLane value)
    {
        if (typeof(TNumber) == typeof(float))
        {
            var v = Unsafe.BitCast, Vector>(value);
            var floored = Vector.Floor(v);
            return Unsafe.BitCast, WideLane>(floored);
        }
        else if (typeof(TNumber) == typeof(double))
        {
            var v = Unsafe.BitCast, Vector>(value);
            var floored = Vector.Floor(v);
            return Unsafe.BitCast, WideLane>(Unsafe.BitCast, Vector>(floored));
        }

        return value;
    }

    /// <summary>Fractional part, computed as <c>value - floor(value)</c>.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane Frac(WideLane value)
    {
        return Unsafe.BitCast, WideLane>(value.value - VectorFloor(value.value));
    }

    /// <summary>Element-wise square root.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane Sqrt(WideLane value)
    {
        return Unsafe.BitCast, WideLane>(Vector.SquareRoot(value.value));
    }

    /// <summary>Linear interpolation <c>a + (b - a) * t</c>.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane Lerp(WideLane a, WideLane b, WideLane t)
    {
        return Unsafe.BitCast, WideLane>(a.value + (b.value - a.value) * t.value);
    }

    /// <summary>
    /// Computes <c>left * right + addend</c>. Float/double lanes use <c>Vector.FusedMultiplyAdd</c>;
    /// other element types use a separate multiply and add.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane MultiplyAdd(WideLane left, WideLane right, WideLane addend)
    {
        if (typeof(TNumber) == typeof(float))
        {
            var va = Unsafe.BitCast, Vector>(left);
            var vb = Unsafe.BitCast, Vector>(right);
            var vc = Unsafe.BitCast, Vector>(addend);
            var result = Vector.FusedMultiplyAdd(va, vb, vc);
            return Unsafe.BitCast, WideLane>(result);
        }
        else if (typeof(TNumber) == typeof(double))
        {
            var va = Unsafe.BitCast, Vector>(left);
            var vb = Unsafe.BitCast, Vector>(right);
            var vc = Unsafe.BitCast, Vector>(addend);
            var result = Vector.FusedMultiplyAdd(va, vb, vc);
            return Unsafe.BitCast, WideLane>(result);
        }

        return Unsafe.BitCast, WideLane>((left.value * right.value) + addend.value);
    }

    /// <summary>Element-wise minimum.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane Min(WideLane a, WideLane b)
    {
        return Unsafe.BitCast, WideLane>(Vector.Min(a.value, b.value));
    }

    /// <summary>Element-wise maximum.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane Max(WideLane a, WideLane b)
    {
        return Unsafe.BitCast, WideLane>(Vector.Max(a.value, b.value));
    }

    /// <summary>Element-wise clamp of <paramref name="value"/> to [<paramref name="min"/>, <paramref name="max"/>].</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane Clamp(WideLane value, WideLane min, WideLane max)
    {
        return Unsafe.BitCast, WideLane>(Vector.Clamp(value.value, min.value, max.value));
    }

    /// <summary>Clamps every lane to [0, 1].</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane Saturate(WideLane value)
    {
        return Clamp(value, Create(TNumber.Zero), Create(TNumber.One));
    }

    /// <summary>
    /// Element-wise sine. With MHP_FASTMATH defined this is a degree-9 odd polynomial in z after reducing
    /// the angle to z = x/PI - Round(x/PI); otherwise it calls <c>Vector.Sin</c> for float/double and
    /// returns the input unchanged for other element types.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane Sin(WideLane value)
    {
#if MHP_FASTMATH
        var invPi = Create(TNumber.CreateTruncating(0.318309886f)); // 1 / PI
        var x_sin = value;
        var y_sin = x_sin * invPi;
        var k_sin = Round(y_sin);
        var z_sin = y_sin - k_sin;
        var half = Create(TNumber.CreateTruncating(0.5f));
        var two = Create(TNumber.CreateTruncating(2.0f));
        // sign = (-1)^k, computed without branches: 1 - 2 * |k - 2 * Round(k / 2)|
        var k_even_sin = Round(k_sin * half) * two;
        var sign_sin = One - two * Abs(k_sin - k_even_sin);
        var c1 = Create(TNumber.CreateTruncating(3.14159265f)); // PI
        var c3 = Create(TNumber.CreateTruncating(-5.16771278f)); // -PI^3 / 6
        var c5 = Create(TNumber.CreateTruncating(2.55016404f)); // PI^5 / 120
        var c7 = Create(TNumber.CreateTruncating(-0.59926453f)); // -PI^7 / 5040
        var c9 = Create(TNumber.CreateTruncating(0.08214589f)); // PI^9 / 362880
        var z2_sin = z_sin * z_sin;
        var poly_sin = MultiplyAdd(z2_sin, c9, c7); // c7 + c9*z^2
        poly_sin = MultiplyAdd(z2_sin, poly_sin, c5); // c5 + z^2*(...)
        poly_sin = MultiplyAdd(z2_sin, poly_sin, c3); // c3 + z^2*(...)
        poly_sin = MultiplyAdd(z2_sin, poly_sin, c1); // c1 + z^2*(...)
        poly_sin = z_sin * poly_sin; // z * (...)
        return poly_sin * sign_sin;
#else
        if (typeof(TNumber) == typeof(float))
        {
            var v = Unsafe.BitCast, Vector>(value);
            var result = Vector.Sin(v);
            return Unsafe.BitCast, WideLane>(result));
        }
        else if (typeof(TNumber) == typeof(double))
        {
            var v = Unsafe.BitCast, Vector>(value);
            var result = Vector.Sin(v);
            return Unsafe.BitCast, WideLane>(result);
        }

        return value;
#endif
    }

    /// <summary>
    /// Element-wise cosine. With MHP_FASTMATH defined it evaluates the sine polynomial at
    /// <c>value + PI/2</c>; otherwise it calls <c>Vector.Cos</c> for float/double and returns the input
    /// unchanged for other element types.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane Cos(WideLane value)
    {
#if MHP_FASTMATH
        var halfPi = Create(TNumber.CreateTruncating(1.570796327f));
        var invPi = Create(TNumber.CreateTruncating(0.318309886f)); // 1 / PI
        var x_cos = value + halfPi;
        var y_cos = x_cos * invPi;
        var k_cos = Round(y_cos);
        var z_cos = y_cos - k_cos;
        var half = Create(TNumber.CreateTruncating(0.5f));
        var two = Create(TNumber.CreateTruncating(2.0f));
        // sign = (-1)^k, computed without branches: 1 - 2 * |k - 2 * Round(k / 2)|
        var k_even_cos = Round(k_cos * half) * two;
        var sign_cos = One - two * Abs(k_cos - k_even_cos);
        var c1 = Create(TNumber.CreateTruncating(3.14159265f)); // PI
        var c3 = Create(TNumber.CreateTruncating(-5.16771278f)); // -PI^3 / 6
        var c5 = Create(TNumber.CreateTruncating(2.55016404f)); // PI^5 / 120
        var c7 = Create(TNumber.CreateTruncating(-0.59926453f)); // -PI^7 / 5040
        var c9 = Create(TNumber.CreateTruncating(0.08214589f)); // PI^9 / 362880
        var z2_cos = z_cos * z_cos;
        var poly_cos = MultiplyAdd(z2_cos, c9, c7);
        poly_cos = MultiplyAdd(z2_cos, poly_cos, c5);
        poly_cos = MultiplyAdd(z2_cos, poly_cos, c3);
        poly_cos = MultiplyAdd(z2_cos, poly_cos, c1);
        poly_cos = z_cos * poly_cos;
        return poly_cos * sign_cos;
#else
        if (typeof(TNumber) == typeof(float))
        {
            var v = Unsafe.BitCast, Vector>(value);
            var result = Vector.Cos(v);
            return Unsafe.BitCast, WideLane>(result));
        }
        else if (typeof(TNumber) == typeof(double))
        {
            var v = Unsafe.BitCast, Vector>(value);
            var result = Vector.Cos(v);
            return Unsafe.BitCast, WideLane>(result);
        }

        return value;
#endif
    }

    /// <summary>
    /// Computes sine and cosine together. With MHP_FASTMATH defined both use the shared polynomial
    /// (cosine as sine of <c>value + PI/2</c>); otherwise it calls <c>Vector.SinCos</c> for float/double
    /// and copies the input to both outputs for other element types.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static void SinCos(WideLane value, out WideLane sin, out WideLane cos)
    {
#if MHP_FASTMATH
        // We use Taylor/Remez polynomial approximation for Sin(PI * z) and Cos(PI * z) on the reduced range of z in [-0.5, 0.5].
        var halfPi = Create(TNumber.CreateTruncating(1.570796327f));
        var invPi = Create(TNumber.CreateTruncating(0.318309886f)); // 1 / PI
        var x_sin = value;
        var x_cos = value + halfPi;

        // 1. Range Reduction
        // We map any angle to the interval [-0.5, 0.5] (corresponding to the actual angle range [-PI/2, PI/2])
        // y = x * (1 / PI)
        var y_sin = x_sin * invPi;
        var y_cos = x_cos * invPi;

        // k = Round(y)
        var k_sin = Round(y_sin);
        var k_cos = Round(y_cos);

        // z = y - k (Now, the range of z is perfectly reduced to [-0.5, 0.5])
        var z_sin = y_sin - k_sin;
        var z_cos = y_cos - k_cos;

        // 2. Branchless Sign Flip
        // Mathematical principle: Sin(x + k*PI) = Sin(x) * (-1)^k
        // We need to compute (-1)^k. To avoid inefficient bit operations or branches, we compute it with floating-point math:
        // sign = 1.0 - 2.0 * Abs(k - 2.0 * Round(k * 0.5))
        var half = Create(TNumber.CreateTruncating(0.5f));
        var two = Create(TNumber.CreateTruncating(2.0f));
        var one = One;
        var k_even_sin = Round(k_sin * half) * two;
        var sign_sin = one - two * Abs(k_sin - k_even_sin);
        var k_even_cos = Round(k_cos * half) * two;
        var sign_cos = one - two * Abs(k_cos - k_even_cos);

        // 3. Taylor/Remez Polynomial for Sin(PI * z)
        // For z in [-0.5, 0.5], calculate sin(PI * z)
        // z * (C1 + z^2 * (C3 + z^2 * (C5 + z^2 * (C7 + z^2 * C9))))
        var c1 = Create(TNumber.CreateTruncating(3.14159265f)); // PI
        var c3 = Create(TNumber.CreateTruncating(-5.16771278f)); // -PI^3 / 6
        var c5 = Create(TNumber.CreateTruncating(2.55016404f)); // PI^5 / 120
        var c7 = Create(TNumber.CreateTruncating(-0.59926453f)); // -PI^7 / 5040
        var c9 = Create(TNumber.CreateTruncating(0.08214589f)); // PI^9 / 362880
        var z2_sin = z_sin * z_sin;
        var poly_sin = MultiplyAdd(z2_sin, c9, c7); // c7 + c9*z^2
        poly_sin = MultiplyAdd(z2_sin, poly_sin, c5); // c5 + z^2*(...)
        poly_sin = MultiplyAdd(z2_sin, poly_sin, c3); // c3 + z^2*(...)
        poly_sin = MultiplyAdd(z2_sin, poly_sin, c1); // c1 + z^2*(...)
        poly_sin = z_sin * poly_sin; // z * (...)
        var z2_cos = z_cos * z_cos;
        var poly_cos = MultiplyAdd(z2_cos, c9, c7);
        poly_cos = MultiplyAdd(z2_cos, poly_cos, c5);
        poly_cos = MultiplyAdd(z2_cos, poly_cos, c3);
        poly_cos = MultiplyAdd(z2_cos, poly_cos, c1);
        poly_cos = z_cos * poly_cos;
        sin = poly_sin * sign_sin;
        cos = poly_cos * sign_cos;
#else
        if (typeof(TNumber) == typeof(float))
        {
            var v = Unsafe.BitCast, Vector>(value);
            var (sinResult, cosResult) = Vector.SinCos(v);
            sin = Unsafe.BitCast, WideLane>(sinResult));
            cos = Unsafe.BitCast, WideLane>(cosResult));
        }
        else if (typeof(TNumber) == typeof(double))
        {
            var v = Unsafe.BitCast, Vector>(value);
            var (sinResult, cosResult) = Vector.SinCos(v);
            sin = Unsafe.BitCast, WideLane>(sinResult);
            cos = Unsafe.BitCast, WideLane>(cosResult);
        }
        else
        {
            sin = value;
            cos = value;
        }
#endif
    }

    /// <summary>
    /// Fast tangent approximation: reduces the angle by whole multiples of PI, then evaluates
    /// <c>x * (1 + x^2 * (c1 + c2 * x^2))</c>.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane Tan(WideLane value)
    {
        // 1. Range Reduction
        // Subtracting the nearest multiple of PI maps value into [-pi/2, pi/2].
        // This is complex to do right (Payne-Hanek), but for games
        // a simple approximation: value = value - (PI * Round(value / PI)) is good enough.
        var pi = Create(TNumber.CreateTruncating(Math.PI));
        var x = value - pi * Round(value / pi);

        // 2. The Approximation (Remez Polynomial)
        // tan(value) ~= value + c1*value^3 + c2*value^5
        // Factored (Horner's Method) for fewer ops: value * (1 + value^2 * (c1 + c2*value^2))
        var x2 = x * x;
        var vc1 = Create(TNumber.CreateTruncating(0.3333314036)); // 1/3
        var vc2 = Create(TNumber.CreateTruncating(0.1333923995)); // 2/15

        // poly = c1 + c2 * x2
        var poly = MultiplyAdd(x2, vc2, vc1);

        // value * (1 + x2 * poly)
        return MultiplyAdd(x, MultiplyAdd(x2, poly, One), Zero);
    }

    /// <summary>Arcsine via the identity <c>asin(x) = PI/2 - acos(x)</c>.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane Asin(WideLane value)
    {
        // asin(value) = pi/2 - acos(value)
        var piOver2 = Create(TNumber.CreateTruncating(Math.PI / 2));
        return piOver2 - Acos(value);
    }

    /// <summary>
    /// Arccosine approximation: a cubic in |value| multiplied by <c>sqrt(1 - |value|)</c>, reflected
    /// as <c>PI - result</c> for negative inputs.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane Acos(WideLane value)
    {
        // 0 <= value <= 1 : acos(value) = sqrt(1 - value) * (c0 + c1*value + c2*value^2 + c3*value^3)
        // value < 0 : acos(value) = pi - acos(-value)
        var x = Abs(value);
        var c0 = Create(TNumber.CreateTruncating(1.5707288f)); // pi/2
        var c1 = Create(TNumber.CreateTruncating(-0.2121144f));
        var c2 = Create(TNumber.CreateTruncating(0.0742610f));
        var c3 = Create(TNumber.CreateTruncating(-0.0187293f));
        var term1 = MultiplyAdd(x, c3, c2);
        var term2 = MultiplyAdd(x, term1, c1);
        var poly = MultiplyAdd(x, term2, c0);
        var sqrtTerm = Sqrt(One - x);
        var result = poly * sqrtTerm;
        var pi = Create(TNumber.CreateTruncating(Math.PI));
        var isNegative = LessThan(value, Zero);
        return Select(isNegative, pi - result, result);
    }

    /// <summary>
    /// Arctangent approximation. For |value| &lt;= 1 it evaluates <c>value * (c1 + c2 * value^2)</c>.
    /// For |value| &gt; 1 the polynomial diverges, so the input is folded back into range with
    /// <c>atan(x) = sign(x) * PI/2 - atan(1 / x)</c>.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane Atan(WideLane value)
    {
        // atan(t) = t * (c1 + c2*t^2), valid for t in [-1, 1]
        var c1 = Create(TNumber.CreateTruncating(0.97239411f));
        var c2 = Create(TNumber.CreateTruncating(-0.19194795f));
        var halfPi = Create(TNumber.CreateTruncating(1.570796327f));

        // Lanes outside [-1, 1] are evaluated at the reciprocal instead.
        var isLarge = GreaterThan(Abs(value), One);

        // Unselected lanes divide by one, so a zero input can never be used as a divisor.
        var t = Select(isLarge, One / Select(isLarge, value, One), value);

        var t2 = t * t;
        var poly = MultiplyAdd(t2, c2, c1);
        var approx = t * poly;

        // Undo the reciprocal: atan(x) = sign(x) * PI/2 - atan(1 / x) for |x| > 1.
        var signedHalfPi = Select(LessThan(value, Zero), -halfPi, halfPi);
        return Select(isLarge, signedHalfPi - approx, approx);
    }

    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static
WideLane Atan2(WideLane y, WideLane x) { var absX = Abs(x); var absY = Abs(y); // 1. Determine the ratio (input to Atan) // If |value| > |y|, we are in the "shallow" region, ratio = y/value // If |y| > |value|, we are in the "steep" region, ratio = value/y (and we transform result) var yGtX = GreaterThan(absY, absX); // Select numerator and denominator to ensure ratio is always in [-1, 1] var num = Select(yGtX, absX, absY); var den = Select(yGtX, absY, absX); var t = num * Rcp(den); // t is now in [0, 1] var t2 = t * t; // 2. Polynomial Approximation (Odd function: value * (c1 + c2*value^2)) var c1 = Create(TNumber.CreateTruncating(0.97239411f)); var c2 = Create(TNumber.CreateTruncating(-0.19194795f)); // (c1 + c2 * t2) var poly = MultiplyAdd(c2, t2, c1); // result = t * poly var result = t * poly; // 3. Reconstruct the angle // If we swapped value/y (yGtX), the identity is: atan(value/y) = PI/2 - atan(y/value) var halfPi = Create(TNumber.CreateTruncating(1.570796327f)); result = Select(yGtX, halfPi - result, result); // 4. Adjust for Quadrants (Signs) // If value < 0, we are in quadrants 2 or 3, so we need to add PI var pi = Create(TNumber.CreateTruncating(3.141592654f)); var xLtZero = LessThan(x, Zero); result = Select(xLtZero, pi - result, result); // If y < 0, the result should be negative (standard atan2 convention) // NOTE: This sign flip strategy depends on exact polynomial range mapping, // but typically just copy the sign of Y to the result. 
var yLtZero = LessThan(y, Zero); // If original Y was negative, negate the result // (This works because our ratio logic effectively computed atan(|y|/|value|) above) var negativeResult = -result; return Select(yLtZero, negativeResult, result); } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static WideLane Pow(WideLane x, WideLane y) { return Exp(y * Log(x)); } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static WideLane Exp(WideLane value) { if (typeof(TNumber) == typeof(float)) { var v = Unsafe.BitCast, Vector>(value); var result = Vector.Exp(v); return Unsafe.BitCast, WideLane>(result); } else if (typeof(TNumber) == typeof(double)) { var v = Unsafe.BitCast, Vector>(value); var result = Vector.Exp(v); return Unsafe.BitCast, WideLane>(result); } return value; } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static WideLane Exp2(WideLane value) { return Pow(Create(TNumber.CreateTruncating(2)), value); } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static WideLane Log(WideLane value) { if (typeof(TNumber) == typeof(float)) { var v = Unsafe.BitCast, Vector>(value); var result = Vector.Log(v); return Unsafe.BitCast, WideLane>(result); } else if (typeof(TNumber) == typeof(double)) { var v = Unsafe.BitCast, Vector>(value); var result = Vector.Log(v); return Unsafe.BitCast, WideLane>(result); } return value; } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static WideLane Log2(WideLane value) { if (typeof(TNumber) == typeof(float)) { var v = Unsafe.BitCast, Vector>(value); var result = Vector.Log2(v); return Unsafe.BitCast, WideLane>(result); } else if (typeof(TNumber) == typeof(double)) { var v = Unsafe.BitCast, Vector>(value); var result = Vector.Log2(v); return Unsafe.BitCast, WideLane>(result); } return value; } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static WideLane Ceil(WideLane value) { if (typeof(TNumber) == typeof(float)) { var v = Unsafe.BitCast, Vector>(value); var result = 
Vector.Ceiling(v); return Unsafe.BitCast, WideLane>(result); } else if (typeof(TNumber) == typeof(double)) { var v = Unsafe.BitCast, Vector>(value); var result = Vector.Ceiling(v); return Unsafe.BitCast, WideLane>(result); } return value; } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static WideLane Round(WideLane value) { if (typeof(TNumber) == typeof(float)) { var v = Unsafe.BitCast, Vector>(value); var result = Vector.Round(v); return Unsafe.BitCast, WideLane>(result); } else if (typeof(TNumber) == typeof(double)) { var v = Unsafe.BitCast, Vector>(value); var result = Vector.Round(v); return Unsafe.BitCast, WideLane>(result); } return value; } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static WideLane Trunc(WideLane value) { if (typeof(TNumber) == typeof(float)) { var v = Unsafe.BitCast, Vector>(value); var result = Vector.Truncate(v); return Unsafe.BitCast, WideLane>(result); } else if (typeof(TNumber) == typeof(double)) { var v = Unsafe.BitCast, Vector>(value); var result = Vector.Truncate(v); return Unsafe.BitCast, WideLane>(result); } return value; } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static WideLane Sign(WideLane value) { return Select( GreaterThan(value, Zero), One, Select( LessThan(value, Zero), AllBitsSet, Zero)); } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static WideLane CopySign(WideLane magnitude, WideLane sign) { return Unsafe.BitCast, WideLane>(Vector.CopySign(magnitude.value, sign.value)); } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static WideLane Rcp(WideLane value) { if (typeof(TNumber) == typeof(float)) { if (Sse.IsSupported && LaneWidth == Vector128.Count) { var vf = Unsafe.BitCast, Vector128>(value); var x0 = Sse.Reciprocal(vf); #if MHP_FASTMATH return Unsafe.BitCast, WideLane>(x0); #else // SSE and AVX provide fast approximate reciprocal instructions but the precision is very low (11 bits). 
// In non-MHP_FASTMATH path, we can do one step of Newton-Raphson iteration to improve the precision to about 22 bits, which is good enough for most game use cases. var x1 = x0 * (Vector128.Create(2.0f) - x0 * vf); return Unsafe.BitCast, WideLane>(x1); #endif } else if (Avx.IsSupported && LaneWidth == Vector256.Count) { var vf = Unsafe.BitCast, Vector256>(value); var x0 = Avx.Reciprocal(vf); #if MHP_FASTMATH return Unsafe.BitCast, WideLane>(x0); #else // SSE and AVX provide fast approximate reciprocal instructions but the precision is very low (11 bits). // In non-MHP_FASTMATH path, we can do one step of Newton-Raphson iteration to improve the precision to about 22 bits, which is good enough for most game use cases. var x1 = x0 * (Vector256.Create(2.0f) - x0 * vf); return Unsafe.BitCast, WideLane>(x1); #endif } } return Create(TNumber.One) / value; } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static WideLane Rsqrt(WideLane value) { if (typeof(TNumber) == typeof(float)) { if (Sse.IsSupported && LaneWidth == Vector128.Count) { var vf = Unsafe.BitCast, Vector128>(value); var x0 = Sse.ReciprocalSqrt(vf); #if MHP_FASTMATH return Unsafe.BitCast, WideLane>(x0); #else // SSE and AVX provide fast approximate reciprocal sqrt instructions but the precision is very low (11 bits). // In non-MHP_FASTMATH path, we can do one step of Newton-Raphson iteration to improve the precision to about 22 bits, which is good enough for most game use cases. var x1 = x0 * Vector128.Create(0.5f) * (Vector128.Create(3.0f) - (vf * x0 * x0)); return Unsafe.BitCast, WideLane>(x1); #endif } else if (Avx.IsSupported && LaneWidth == Vector256.Count) { var vf = Unsafe.BitCast, Vector256>(value); var x0 = Avx.ReciprocalSqrt(vf); #if MHP_FASTMATH return Unsafe.BitCast, WideLane>(x0); #else // SSE and AVX provide fast approximate reciprocal sqrt instructions but the precision is very low (11 bits). 
// In non-MHP_FASTMATH path, we can do one step of Newton-Raphson iteration to improve the precision to about 22 bits, which is good enough for most game use cases. var x1 = x0 * Vector256.Create(0.5f) * (Vector256.Create(3.0f) - (vf * x0 * x0)); return Unsafe.BitCast, WideLane>(x1); #endif } } return Create(TNumber.One) / Sqrt(value); } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static TNumber ReduceAdd(WideLane value) { return Vector.Sum(value.value); } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static TNumber ReduceMax(WideLane value) { // TODO: Use shuffle and max. var max = TNumber.Zero; for (var i = 0; i < LaneWidth; i++) { if (value[i] > max) { max = value[i]; } } return max; } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static TNumber ReduceMin(WideLane value) { // TODO: Use shuffle and min. var min = TNumber.Zero; for (var i = 0; i < LaneWidth; i++) { if (value[i] < min) { min = value[i]; } } return min; } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static WideLane Select(WideLane conditionMask, WideLane ifTrue, WideLane ifFalse) { return Unsafe.BitCast, WideLane>(Vector.ConditionalSelect( conditionMask.value, ifTrue.value, ifFalse.value)); } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static WideLane Select(byte conditionMask, WideLane ifTrue, WideLane ifFalse) { throw new NotImplementedException(); } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static WideLane GreaterThan(WideLane a, WideLane b) { return Unsafe.BitCast, WideLane>(Vector.GreaterThan(a.value, b.value)); } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static WideLane GreaterThanOrEqual(WideLane a, WideLane b) { return Unsafe.BitCast, WideLane>(Vector.GreaterThanOrEqual(a.value, b.value)); } [MethodImpl(MethodImplOptions.AggressiveInlining)] public static WideLane LessThan(WideLane a, WideLane b) { return Unsafe.BitCast, WideLane>(Vector.LessThan(a.value, b.value)); } 
/// <summary>
    /// Lane-wise "a less than or equal to b" comparison.
    /// </summary>
    /// <param name="a">Left operand.</param>
    /// <param name="b">Right operand.</param>
    /// <returns>The mask produced by <c>Vector.LessThanOrEqual</c>.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane<TNumber> LessThanOrEqual(WideLane<TNumber> a, WideLane<TNumber> b)
    {
        return Unsafe.BitCast<Vector<TNumber>, WideLane<TNumber>>(Vector.LessThanOrEqual(a.value, b.value));
    }

    /// <summary>
    /// Lane-wise equality comparison.
    /// </summary>
    /// <param name="a">Left operand.</param>
    /// <param name="b">Right operand.</param>
    /// <returns>The mask produced by <c>Vector.Equals</c>.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static WideLane<TNumber> Equal(WideLane<TNumber> a, WideLane<TNumber> b)
    {
        return Unsafe.BitCast<Vector<TNumber>, WideLane<TNumber>>(Vector.Equals(a.value, b.value));
    }

    /// <summary>
    /// Tests whether at least one lane of the mask is set.
    /// </summary>
    /// <param name="mask">Mask vector.</param>
    /// <returns><see langword="true"/> unless every lane compares equal to zero.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static bool Any(WideLane<TNumber> mask)
    {
        return !Vector.EqualsAll(mask.value, Vector<TNumber>.Zero);
    }

    /// <summary>
    /// Tests whether every lane of the mask is fully set.
    /// </summary>
    /// <param name="mask">Mask vector.</param>
    /// <returns><see langword="true"/> when every bit of <paramref name="mask"/> is 1.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static bool All(WideLane<TNumber> mask)
    {
        // Compare the raw bytes rather than TNumber elements: for float/double an all-ones lane is
        // NaN, and NaN never compares equal to itself, so an element-wise comparison against
        // AllBitsSet could never succeed for floating-point masks.
        return Vector.EqualsAll(Vector.AsVectorByte(mask.value), Vector<byte>.AllBitsSet);
    }

    /// <summary>
    /// Tests whether no lane of the mask is set.
    /// </summary>
    /// <param name="mask">Mask vector.</param>
    /// <returns><see langword="true"/> when every lane compares equal to zero.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static bool None(WideLane<TNumber> mask)
    {
        return Vector.EqualsAll(mask.value, Vector<TNumber>.Zero);
    }

    /// <summary>
    /// Compares this instance with another one by delegating to the underlying vector's <c>Equals</c>.
    /// </summary>
    /// <param name="other">The instance to compare with.</param>
    /// <returns>The result of <c>value.Equals(other.value)</c>.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public bool Equals(WideLane<TNumber> other)
    {
        return value.Equals(other.value);
    }

    /// <summary>
    /// Compares this instance with an arbitrary object.
    /// </summary>
    /// <param name="obj">The object to compare with.</param>
    /// <returns>
    /// <see langword="true"/> when <paramref name="obj"/> is a <see cref="WideLane{TNumber}"/> that is equal
    /// to this instance; otherwise <see langword="false"/>.
    /// </returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public override bool Equals(object? obj)
    {
        return obj is WideLane<TNumber> other && Equals(other);
    }

    /// <summary>
    /// Returns the hash code of the underlying vector.
    /// </summary>
    /// <returns>The value of <c>value.GetHashCode()</c>.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public override int GetHashCode()
    {
        return value.GetHashCode();
    }

    /// <summary>
    /// Returns the string representation of the underlying vector.
    /// </summary>
    /// <returns>The value of <c>value.ToString()</c>.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public override string ToString()
    {
        return value.ToString();
    }

    /// <summary>
    /// Reinterprets a <see cref="Vector{T}"/> as a <see cref="WideLane{TNumber}"/> without copying lanes individually.
    /// </summary>
    /// <param name="v">The vector to wrap.</param>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static implicit operator WideLane<TNumber>(Vector<TNumber> v)
    {
        return Unsafe.BitCast<Vector<TNumber>, WideLane<TNumber>>(v);
    }
}