Add AllBitsSet, refactor WideLane, improve math paths

- Add static AllBitsSet property to ISPMDLane and implement in ScalarLane and WideLane
- Refactor WideLane shuffle table pointers and update usages
- Improve pointer safety and mask handling in CompressStore, Gather, and MaskLoad
- Enhance Sin, Cos, SinCos with fast-math and hardware fallback
- Add Newton-Raphson refinement for reciprocal/sqrt when not fast-math
- Optimize MathV.Vector vector loading (struct init, pointer ops)
- Update project file: version 1.3.4, content packaging, AOT settings
- Minor code cleanup and naming consistency fixes
This commit is contained in:
2026-05-01 12:19:58 +09:00
parent 5b4832a886
commit 18a181f57a
7 changed files with 209 additions and 124 deletions

View File

@@ -16,14 +16,6 @@ public interface ISPMDLane
} }
} }
// TODO:
// - ReduceAdd
// - ReduceMin
// - ReduceMax
// - LeadingZeroCount
// - TrailingZeroCount
// - PopCount
/// <summary> /// <summary>
/// Represents a single-lane or multi-lane (vectorized) SPMD value and the operations supported on it. /// Represents a single-lane or multi-lane (vectorized) SPMD value and the operations supported on it.
/// </summary> /// </summary>
@@ -65,6 +57,14 @@ public unsafe interface ISPMDLane<TSelf, TNumber> : ISPMDLane, IEquatable<TSelf>
get; get;
} }
/// <summary>
/// Gets a value in which every bit of every lane is set to 1.
/// </summary>
static abstract TSelf AllBitsSet
{
get;
}
/// <summary> /// <summary>
/// Gets the element value for the specified lane index. /// Gets the element value for the specified lane index.
/// </summary> /// </summary>

View File

@@ -7,23 +7,31 @@
<AllowUnsafeBlocks>true</AllowUnsafeBlocks> <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
<GeneratePackageOnBuild>true</GeneratePackageOnBuild> <GeneratePackageOnBuild>true</GeneratePackageOnBuild>
<Authors>Misaki</Authors> <Authors>Misaki</Authors>
<AssemblyVersion>1.3.3</AssemblyVersion> <AssemblyVersion>1.3.4</AssemblyVersion>
<Version>$(AssemblyVersion)</Version> <Version>$(AssemblyVersion)</Version>
<PackageProjectUrl>https://git.personalnas.com/Misaki/Misaki.HighPerformance.git</PackageProjectUrl> <PackageProjectUrl>https://git.personalnas.com/Misaki/Misaki.HighPerformance.git</PackageProjectUrl>
<RepositoryUrl>https://git.personalnas.com/Misaki/Misaki.HighPerformance.git</RepositoryUrl> <RepositoryUrl>https://git.personalnas.com/Misaki/Misaki.HighPerformance.git</RepositoryUrl>
<IncludeBuildOutput>false</IncludeBuildOutput>
<ContentTargetFolders>contentFiles</ContentTargetFolders>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|AnyCPU'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|AnyCPU'">
<IsAotCompatible>True</IsAotCompatible> <IsAotCompatible>True</IsAotCompatible>
<DefineConstants>$(DefineConstants)</DefineConstants>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|AnyCPU'"> <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|AnyCPU'">
<IsAotCompatible>True</IsAotCompatible> <IsAotCompatible>True</IsAotCompatible>
<DefineConstants>$(DefineConstants)</DefineConstants>
</PropertyGroup> </PropertyGroup>
<ItemGroup> <ItemGroup>
<Compile Remove="Templates\Vector2 - Copy (2).gen.cs" /> <Content Include="**\*.cs" Exclude="obj\**;bin\**">
<Compile Remove="Templates\Vector2 - Copy.gen.cs" /> <Pack>true</Pack>
<PackagePath>contentFiles\cs\any\Misaki.HighPerformance.LowLevel\</PackagePath>
<PackageCopyToOutput>false</PackageCopyToOutput>
<BuildAction>Compile</BuildAction>
</Content>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>

View File

@@ -40,6 +40,12 @@ public readonly unsafe struct ScalarLane<TNumber> : ISPMDLane<ScalarLane<TNumber
get => new ScalarLane<TNumber>(TNumber.MaxValue); get => new ScalarLane<TNumber>(TNumber.MaxValue);
} }
/// <summary>
/// Gets a lane constructed from <c>TNumber.AllBitsSet</c>, i.e. a value with every bit set to 1.
/// </summary>
public static ScalarLane<TNumber> AllBitsSet
{
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    get
    {
        var allOnes = TNumber.AllBitsSet;
        return new ScalarLane<TNumber>(allOnes);
    }
}
public readonly TNumber this[int index] public readonly TNumber this[int index]
{ {
[MethodImpl(MethodImplOptions.AggressiveInlining)] [MethodImpl(MethodImplOptions.AggressiveInlining)]

View File

@@ -10,7 +10,6 @@ using System.Numerics;
namespace Misaki.HighPerformance.Mathematics.SPMD; namespace Misaki.HighPerformance.Mathematics.SPMD;
<# <#
const string TLane = "TLane"; const string TLane = "TLane";
const string TNumber = "TNumber"; const string TNumber = "TNumber";
const string GenericParameters = $"{TLane}, {TNumber}"; const string GenericParameters = $"{TLane}, {TNumber}";

View File

@@ -44,21 +44,21 @@ public static unsafe partial class MathV
where TLane : unmanaged, ISPMDLane<TLane, TNumber> where TLane : unmanaged, ISPMDLane<TLane, TNumber>
where TNumber : unmanaged, INumber<TNumber>, IBinaryNumber<TNumber>, IMinMaxValue<TNumber>, IBitwiseOperators<TNumber, TNumber, TNumber> where TNumber : unmanaged, INumber<TNumber>, IBinaryNumber<TNumber>, IMinMaxValue<TNumber>, IBitwiseOperators<TNumber, TNumber, TNumber>
{ {
var width = TLane.LaneWidth; Unsafe.SkipInit(out TLane x);
var px = (TNumber*)&x;
Unsafe.SkipInit(out TLane y);
var py = (TNumber*)&y;
var x = stackalloc TNumber[width]; for (var i = 0; i < TLane.LaneWidth; i++)
var y = stackalloc TNumber[width];
for (var i = 0; i < width; i++)
{ {
x[i] = pSrc[i * 2 + 0]; px[i] = pSrc[i * 2 + 0];
y[i] = pSrc[i * 2 + 1]; py[i] = pSrc[i * 2 + 1];
} }
return new Vector2<TLane, TNumber> return new Vector2<TLane, TNumber>
{ {
x = TLane.Load(x), x = x,
y = TLane.Load(y), y = y,
}; };
} }
@@ -515,24 +515,25 @@ public static unsafe partial class MathV
where TLane : unmanaged, ISPMDLane<TLane, TNumber> where TLane : unmanaged, ISPMDLane<TLane, TNumber>
where TNumber : unmanaged, INumber<TNumber>, IBinaryNumber<TNumber>, IMinMaxValue<TNumber>, IBitwiseOperators<TNumber, TNumber, TNumber> where TNumber : unmanaged, INumber<TNumber>, IBinaryNumber<TNumber>, IMinMaxValue<TNumber>, IBitwiseOperators<TNumber, TNumber, TNumber>
{ {
var width = TLane.LaneWidth; Unsafe.SkipInit(out TLane x);
var px = (TNumber*)&x;
Unsafe.SkipInit(out TLane y);
var py = (TNumber*)&y;
Unsafe.SkipInit(out TLane z);
var pz = (TNumber*)&z;
var x = stackalloc TNumber[width]; for (var i = 0; i < TLane.LaneWidth; i++)
var y = stackalloc TNumber[width];
var z = stackalloc TNumber[width];
for (var i = 0; i < width; i++)
{ {
x[i] = pSrc[i * 3 + 0]; px[i] = pSrc[i * 3 + 0];
y[i] = pSrc[i * 3 + 1]; py[i] = pSrc[i * 3 + 1];
z[i] = pSrc[i * 3 + 2]; pz[i] = pSrc[i * 3 + 2];
} }
return new Vector3<TLane, TNumber> return new Vector3<TLane, TNumber>
{ {
x = TLane.Load(x), x = x,
y = TLane.Load(y), y = y,
z = TLane.Load(z), z = z,
}; };
} }
@@ -1024,27 +1025,29 @@ public static unsafe partial class MathV
where TLane : unmanaged, ISPMDLane<TLane, TNumber> where TLane : unmanaged, ISPMDLane<TLane, TNumber>
where TNumber : unmanaged, INumber<TNumber>, IBinaryNumber<TNumber>, IMinMaxValue<TNumber>, IBitwiseOperators<TNumber, TNumber, TNumber> where TNumber : unmanaged, INumber<TNumber>, IBinaryNumber<TNumber>, IMinMaxValue<TNumber>, IBitwiseOperators<TNumber, TNumber, TNumber>
{ {
var width = TLane.LaneWidth; Unsafe.SkipInit(out TLane x);
var px = (TNumber*)&x;
Unsafe.SkipInit(out TLane y);
var py = (TNumber*)&y;
Unsafe.SkipInit(out TLane z);
var pz = (TNumber*)&z;
Unsafe.SkipInit(out TLane w);
var pw = (TNumber*)&w;
var x = stackalloc TNumber[width]; for (var i = 0; i < TLane.LaneWidth; i++)
var y = stackalloc TNumber[width];
var z = stackalloc TNumber[width];
var w = stackalloc TNumber[width];
for (var i = 0; i < width; i++)
{ {
x[i] = pSrc[i * 4 + 0]; px[i] = pSrc[i * 4 + 0];
y[i] = pSrc[i * 4 + 1]; py[i] = pSrc[i * 4 + 1];
z[i] = pSrc[i * 4 + 2]; pz[i] = pSrc[i * 4 + 2];
w[i] = pSrc[i * 4 + 3]; pw[i] = pSrc[i * 4 + 3];
} }
return new Vector4<TLane, TNumber> return new Vector4<TLane, TNumber>
{ {
x = TLane.Load(x), x = x,
y = TLane.Load(y), y = y,
z = TLane.Load(z), z = z,
w = TLane.Load(w), w = w,
}; };
} }

View File

@@ -65,23 +65,22 @@ public static unsafe partial class MathV
<#= TLaneRestrictions #> <#= TLaneRestrictions #>
<#= TNumberRestrictions #> <#= TNumberRestrictions #>
{ {
var width = TLane.LaneWidth;
<# for (int i = 0; i < dimension; i++) { #> <# for (int i = 0; i < dimension; i++) { #>
var <#= components[i] #> = stackalloc <#= TNumber #>[width]; Unsafe.SkipInit(out TLane <#= components[i] #>);
var p<#= components[i] #> = (<#= TNumber #>*)&<#= components[i] #>;
<# } #> <# } #>
for (var i = 0; i < width; i++) for (var i = 0; i < TLane.LaneWidth; i++)
{ {
<# for (int i = 0; i < dimension; i++) { #> <# for (int i = 0; i < dimension; i++) { #>
<#= components[i] #>[i] = pSrc[i * <#= dimension #> + <#= i #>]; p<#= components[i] #>[i] = pSrc[i * <#= dimension #> + <#= i #>];
<# } #> <# } #>
} }
return new <#= vectorType #> return new <#= vectorType #>
{ {
<# for (int i = 0; i < dimension; i++) { #> <# for (int i = 0; i < dimension; i++) { #>
<#= components[i] #> = <#= TLane #>.Load(<#= components[i] #>), <#= components[i] #> = <#= components[i] #>,
<# } #> <# } #>
}; };
} }

View File

@@ -8,12 +8,12 @@ namespace Misaki.HighPerformance.Mathematics.SPMD;
public static unsafe class WideLane public static unsafe class WideLane
{ {
internal static readonly uint* s_shuffleTable512_32bit; internal static readonly uint* s_pShuffleTable512_32bit;
internal static readonly ulong* s_shuffleTable512_64bit; internal static readonly ulong* s_pShuffleTable512_64bit;
internal static readonly uint* s_shuffleTable256_32bit; internal static readonly uint* s_pShuffleTable256_32bit;
internal static readonly ulong* s_shuffleTable256_64bit; internal static readonly ulong* s_pShuffleTable256_64bit;
internal static readonly uint* s_shuffleTable128_32bit; internal static readonly uint* s_pShuffleTable128_32bit;
internal static readonly ulong* s_shuffleTable128_64bit; internal static readonly ulong* s_pShuffleTable128_64bit;
/// <summary> /// <summary>
/// Gets whether WideLane is supported on the current hardware. /// Gets whether WideLane is supported on the current hardware.
@@ -22,12 +22,12 @@ public static unsafe class WideLane
static WideLane() static WideLane()
{ {
s_shuffleTable512_32bit = ShuffleTableGenerator.ComputeShuffleTable512_32Bit(); s_pShuffleTable512_32bit = ShuffleTableGenerator.ComputeShuffleTable512_32Bit();
s_shuffleTable512_64bit = ShuffleTableGenerator.ComputeShuffleTable512_64Bit(); s_pShuffleTable512_64bit = ShuffleTableGenerator.ComputeShuffleTable512_64Bit();
s_shuffleTable256_32bit = ShuffleTableGenerator.ComputeShuffleTable256_32Bit(); s_pShuffleTable256_32bit = ShuffleTableGenerator.ComputeShuffleTable256_32Bit();
s_shuffleTable256_64bit = ShuffleTableGenerator.ComputeShuffleTable256_64Bit(); s_pShuffleTable256_64bit = ShuffleTableGenerator.ComputeShuffleTable256_64Bit();
s_shuffleTable128_32bit = ShuffleTableGenerator.ComputeShuffleTable128_32Bit(); s_pShuffleTable128_32bit = ShuffleTableGenerator.ComputeShuffleTable128_32Bit();
s_shuffleTable128_64bit = ShuffleTableGenerator.ComputeShuffleTable128_64Bit(); s_pShuffleTable128_64bit = ShuffleTableGenerator.ComputeShuffleTable128_64Bit();
} }
} }
@@ -69,6 +69,12 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
get => Create(TNumber.MaxValue); get => Create(TNumber.MaxValue);
} }
/// <summary>
/// Gets a lane produced by passing <c>TNumber.AllBitsSet</c> (every bit set to 1) to <c>Create</c>.
/// </summary>
public static WideLane<TNumber> AllBitsSet
{
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    get
    {
        var allOnes = TNumber.AllBitsSet;
        return Create(allOnes);
    }
}
public readonly TNumber this[int index] public readonly TNumber this[int index]
{ {
[MethodImpl(MethodImplOptions.AggressiveInlining)] [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -194,48 +200,26 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
[MethodImpl(MethodImplOptions.AggressiveInlining)] [MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane<TNumber> MaskLoad(WideLane<TNumber> mask, ref TNumber value) public static WideLane<TNumber> MaskLoad(WideLane<TNumber> mask, ref TNumber value)
{ {
return MaskLoad(mask, (TNumber*)Unsafe.AsPointer(ref value)); var vector = Vector.LoadUnsafe(ref value);
return new WideLane<TNumber>(Vector.ConditionalSelect(mask.value, vector, Vector<TNumber>.Zero));
} }
[MethodImpl(MethodImplOptions.AggressiveInlining)] [MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane<TNumber> MaskLoad(WideLane<TNumber> mask, TNumber* pValue) public static WideLane<TNumber> MaskLoad(WideLane<TNumber> mask, TNumber* pValue)
{ {
var vector = Vector.Load(pValue); return MaskLoad(mask, ref Unsafe.AsRef<TNumber>(pValue));
return new WideLane<TNumber>(Vector.ConditionalSelect(mask.value, vector, Vector<TNumber>.Zero));
} }
[MethodImpl(MethodImplOptions.AggressiveInlining)] [MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane<TNumber> Gather(TNumber* pData, WideLane<TNumber> indices, int scale) public static WideLane<TNumber> Gather(TNumber* pData, WideLane<TNumber> indices, int scale)
{ {
Unsafe.SkipInit(out Vector<TNumber> result); return Gather(ref Unsafe.AsRef<TNumber>(pData), indices, scale);
var pResult = (TNumber*)&result;
var pIndices = (TNumber*)&indices;
var count = Vector<TNumber>.Count;
for (var i = 0; i < count; i++)
{
var idx = int.CreateTruncating(pIndices[i]);
pResult[i] = pData[idx * scale / sizeof(TNumber)];
}
return new WideLane<TNumber>(result);
} }
[MethodImpl(MethodImplOptions.AggressiveInlining)] [MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane<TNumber> Gather(TNumber* pData, int* pIndices, int scale) public static WideLane<TNumber> Gather(TNumber* pData, int* pIndices, int scale)
{ {
Unsafe.SkipInit(out Vector<TNumber> result); return Gather(ref Unsafe.AsRef<TNumber>(pData), ref Unsafe.AsRef<int>(pIndices), scale);
var pResult = (TNumber*)&result;
var count = Vector<TNumber>.Count;
for (var i = 0; i < count; i++)
{
pResult[i] = pData[pIndices[i] * scale / sizeof(TNumber)];
}
return new WideLane<TNumber>(result);
} }
[MethodImpl(MethodImplOptions.AggressiveInlining)] [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -287,12 +271,6 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
[MethodImpl(MethodImplOptions.AggressiveInlining)] [MethodImpl(MethodImplOptions.AggressiveInlining)]
public int CompressStore(WideLane<TNumber> mask, ref TNumber destination) public int CompressStore(WideLane<TNumber> mask, ref TNumber destination)
{
return CompressStore(mask, (TNumber*)Unsafe.AsPointer(in destination));
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public int CompressStore(WideLane<TNumber> mask, TNumber* pDestination)
{ {
if (LaneWidth == Vector512<TNumber>.Count && Vector512.IsHardwareAccelerated) if (LaneWidth == Vector512<TNumber>.Count && Vector512.IsHardwareAccelerated)
{ {
@@ -303,10 +281,10 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
var moveMask = m.ExtractMostSignificantBits(); var moveMask = m.ExtractMostSignificantBits();
// Offset is (moveMask * 16) because each control vector has 16 elements // Offset is (moveMask * 16) because each control vector has 16 elements
var shuffle = Vector512.Load(WideLane.s_shuffleTable512_32bit + (moveMask * 16)); var shuffle = Vector512.Load(WideLane.s_pShuffleTable512_32bit + (moveMask * 16));
var compressed = Vector512.Shuffle(vec, shuffle); var compressed = Vector512.Shuffle(vec, shuffle);
compressed.Store((uint*)pDestination); compressed.StoreUnsafe(ref Unsafe.As<TNumber, uint>(ref destination));
return BitOperations.PopCount(moveMask); return BitOperations.PopCount(moveMask);
} }
@@ -317,10 +295,10 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
var moveMask = m.ExtractMostSignificantBits(); var moveMask = m.ExtractMostSignificantBits();
// Offset is (moveMask * 8) because each control vector has 8 elements // Offset is (moveMask * 8) because each control vector has 8 elements
var shuffle = Vector512.Load(WideLane.s_shuffleTable512_64bit + (moveMask * 8)); var shuffle = Vector512.Load(WideLane.s_pShuffleTable512_64bit + (moveMask * 8));
var compressed = Vector512.Shuffle(vec, shuffle); var compressed = Vector512.Shuffle(vec, shuffle);
compressed.Store((ulong*)pDestination); compressed.StoreUnsafe(ref Unsafe.As<TNumber, ulong>(ref destination));
return BitOperations.PopCount(moveMask); return BitOperations.PopCount(moveMask);
} }
} }
@@ -333,10 +311,10 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
var moveMask = m.ExtractMostSignificantBits(); var moveMask = m.ExtractMostSignificantBits();
// Offset is (moveMask * 8) because each control vector has 8 elements // Offset is (moveMask * 8) because each control vector has 8 elements
var shuffle = Vector256.Load(WideLane.s_shuffleTable256_32bit + (moveMask * 8)); var shuffle = Vector256.Load(WideLane.s_pShuffleTable256_32bit + (moveMask * 8));
var compressed = Vector256.Shuffle(vec, shuffle); var compressed = Vector256.Shuffle(vec, shuffle);
compressed.Store((uint*)pDestination); compressed.StoreUnsafe(ref Unsafe.As<TNumber, uint>(ref destination));
return BitOperations.PopCount(moveMask); return BitOperations.PopCount(moveMask);
} }
@@ -349,10 +327,10 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
var moveMask = m.ExtractMostSignificantBits(); var moveMask = m.ExtractMostSignificantBits();
// Offset is (moveMask * 4) because each control vector has 4 elements // Offset is (moveMask * 4) because each control vector has 4 elements
var shuffle = Vector256.Load(WideLane.s_shuffleTable256_64bit + (moveMask * 4)); var shuffle = Vector256.Load(WideLane.s_pShuffleTable256_64bit + (moveMask * 4));
var compressed = Vector256.Shuffle(vec, shuffle); var compressed = Vector256.Shuffle(vec, shuffle);
compressed.Store((ulong*)pDestination); compressed.StoreUnsafe(ref Unsafe.As<TNumber, ulong>(ref destination));
return BitOperations.PopCount(moveMask); return BitOperations.PopCount(moveMask);
} }
} }
@@ -365,10 +343,10 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
var moveMask = m.ExtractMostSignificantBits(); var moveMask = m.ExtractMostSignificantBits();
// Offset is (moveMask * 4) because each control vector has 4 elements // Offset is (moveMask * 4) because each control vector has 4 elements
var shuffle = Vector128.Load(WideLane.s_shuffleTable128_32bit + (moveMask * 4)); var shuffle = Vector128.Load(WideLane.s_pShuffleTable128_32bit + (moveMask * 4));
var compressed = Vector128.Shuffle(vec, shuffle); var compressed = Vector128.Shuffle(vec, shuffle);
compressed.Store((uint*)pDestination); compressed.StoreUnsafe(ref Unsafe.As<TNumber, uint>(ref destination));
return BitOperations.PopCount(moveMask); return BitOperations.PopCount(moveMask);
} }
@@ -378,9 +356,9 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
var m = Unsafe.As<WideLane<TNumber>, Vector128<ulong>>(ref mask); var m = Unsafe.As<WideLane<TNumber>, Vector128<ulong>>(ref mask);
var moveMask = m.ExtractMostSignificantBits(); var moveMask = m.ExtractMostSignificantBits();
// Offset is (moveMask * 2) because each control vector has 2 elements // Offset is (moveMask * 2) because each control vector has 2 elements
var shuffle = Vector128.Load(WideLane.s_shuffleTable128_64bit + (moveMask * 2)); var shuffle = Vector128.Load(WideLane.s_pShuffleTable128_64bit + (moveMask * 2));
var compressed = Vector128.Shuffle(vec, shuffle); var compressed = Vector128.Shuffle(vec, shuffle);
compressed.Store((ulong*)pDestination); compressed.StoreUnsafe(ref Unsafe.As<TNumber, ulong>(ref destination));
return BitOperations.PopCount(moveMask); return BitOperations.PopCount(moveMask);
} }
} }
@@ -390,15 +368,21 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
var count = 0; var count = 0;
for (var i = 0; i < LaneWidth; i++) for (var i = 0; i < LaneWidth; i++)
{ {
if (mask.value[i] == ~TNumber.Zero) if (mask.value[i] == TNumber.AllBitsSet)
{ {
pDestination[count++] = value[i]; Unsafe.Add(ref destination, count++) = value[i];
} }
} }
return count; return count;
} }
/// <summary>
/// Pointer overload of <c>CompressStore</c>: reinterprets <paramref name="pDestination"/> as a
/// managed reference and forwards to the <c>ref TNumber</c> overload.
/// </summary>
/// <param name="mask">Lane mask forwarded unchanged to the <c>ref</c> overload.</param>
/// <param name="pDestination">Address of the first destination element.</param>
/// <returns>The value returned by the <c>ref</c> overload.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public int CompressStore(WideLane<TNumber> mask, TNumber* pDestination)
{
    ref var destination = ref Unsafe.AsRef<TNumber>(pDestination);
    return CompressStore(mask, ref destination);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)] [MethodImpl(MethodImplOptions.AggressiveInlining)]
public readonly Vector<TNumber> AsVector() public readonly Vector<TNumber> AsVector()
{ {
@@ -617,6 +601,7 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
[MethodImpl(MethodImplOptions.AggressiveInlining)] [MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane<TNumber> Sin(WideLane<TNumber> value) public static WideLane<TNumber> Sin(WideLane<TNumber> value)
{ {
#if MHP_FASTMATH
var invPi = Create(TNumber.CreateTruncating(0.318309886f)); // 1 / PI var invPi = Create(TNumber.CreateTruncating(0.318309886f)); // 1 / PI
var x_sin = value; var x_sin = value;
@@ -644,11 +629,28 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
poly_sin = z_sin * poly_sin; // z * (...) poly_sin = z_sin * poly_sin; // z * (...)
return poly_sin * sign_sin; return poly_sin * sign_sin;
#else
if (typeof(TNumber) == typeof(float))
{
ref var v = ref Unsafe.As<WideLane<TNumber>, Vector<float>>(ref value);
var result = Vector.Sin(v);
return new WideLane<TNumber>(Unsafe.As<Vector<float>, Vector<TNumber>>(ref result));
}
else if (typeof(TNumber) == typeof(double))
{
ref var v = ref Unsafe.As<WideLane<TNumber>, Vector<double>>(ref value);
var result = Vector.Sin(v);
return new WideLane<TNumber>(Unsafe.As<Vector<double>, Vector<TNumber>>(ref result));
}
return value;
#endif
} }
[MethodImpl(MethodImplOptions.AggressiveInlining)] [MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WideLane<TNumber> Cos(WideLane<TNumber> value) public static WideLane<TNumber> Cos(WideLane<TNumber> value)
{ {
#if MHP_FASTMATH
var halfPi = Create(TNumber.CreateTruncating(1.570796327f)); var halfPi = Create(TNumber.CreateTruncating(1.570796327f));
var invPi = Create(TNumber.CreateTruncating(0.318309886f)); // 1 / PI var invPi = Create(TNumber.CreateTruncating(0.318309886f)); // 1 / PI
@@ -677,11 +679,30 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
poly_cos = z_cos * poly_cos; poly_cos = z_cos * poly_cos;
return poly_cos * sign_cos; return poly_cos * sign_cos;
#else
if (typeof(TNumber) == typeof(float))
{
ref var v = ref Unsafe.As<WideLane<TNumber>, Vector<float>>(ref value);
var result = Vector.Cos(v);
return new WideLane<TNumber>(Unsafe.As<Vector<float>, Vector<TNumber>>(ref result));
}
else if (typeof(TNumber) == typeof(double))
{
ref var v = ref Unsafe.As<WideLane<TNumber>, Vector<double>>(ref value);
var result = Vector.Cos(v);
return new WideLane<TNumber>(Unsafe.As<Vector<double>, Vector<TNumber>>(ref result));
}
return value;
#endif
} }
[MethodImpl(MethodImplOptions.AggressiveInlining)] [MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void SinCos(WideLane<TNumber> value, out WideLane<TNumber> sin, out WideLane<TNumber> cos) public static void SinCos(WideLane<TNumber> value, out WideLane<TNumber> sin, out WideLane<TNumber> cos)
{ {
#if MHP_FASTMATH
// We use Taylor/Remez polynomial approximation for Sin(PI * z) and Cos(PI * z) on the reduced range of z in [-0.5, 0.5].
var halfPi = Create(TNumber.CreateTruncating(1.570796327f)); var halfPi = Create(TNumber.CreateTruncating(1.570796327f));
var invPi = Create(TNumber.CreateTruncating(0.318309886f)); // 1 / PI var invPi = Create(TNumber.CreateTruncating(0.318309886f)); // 1 / PI
@@ -741,6 +762,27 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
sin = poly_sin * sign_sin; sin = poly_sin * sign_sin;
cos = poly_cos * sign_cos; cos = poly_cos * sign_cos;
#else
if (typeof(TNumber) == typeof(float))
{
ref var v = ref Unsafe.As<WideLane<TNumber>, Vector<float>>(ref value);
var (sinResult, cosResult) = Vector.SinCos(v);
sin = new WideLane<TNumber>(Unsafe.As<Vector<float>, Vector<TNumber>>(ref sinResult));
cos = new WideLane<TNumber>(Unsafe.As<Vector<float>, Vector<TNumber>>(ref cosResult));
}
else if (typeof(TNumber) == typeof(double))
{
ref var v = ref Unsafe.As<WideLane<TNumber>, Vector<double>>(ref value);
var (sinResult, cosResult) = Vector.SinCos(v);
sin = new WideLane<TNumber>(Unsafe.As<Vector<double>, Vector<TNumber>>(ref sinResult));
cos = new WideLane<TNumber>(Unsafe.As<Vector<double>, Vector<TNumber>>(ref cosResult));
}
else
{
sin = value;
cos = value;
}
#endif
} }
[MethodImpl(MethodImplOptions.AggressiveInlining)] [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -799,7 +841,7 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
var pi = Create(TNumber.CreateTruncating(Math.PI)); var pi = Create(TNumber.CreateTruncating(Math.PI));
var isNegative = LessThan(value, Zero); var isNegative = LessThan(value, Zero);
return Select(isNegative, pi - result, result); return Select(isNegative, pi - result, result);
} }
@@ -999,7 +1041,7 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
One, One,
Select( Select(
LessThan(value, Zero), LessThan(value, Zero),
~Zero, AllBitsSet,
Zero)); Zero));
} }
@@ -1017,14 +1059,28 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
if (Sse.IsSupported && LaneWidth == Vector128<float>.Count) if (Sse.IsSupported && LaneWidth == Vector128<float>.Count)
{ {
ref var vf = ref Unsafe.As<WideLane<TNumber>, Vector128<float>>(ref value); ref var vf = ref Unsafe.As<WideLane<TNumber>, Vector128<float>>(ref value);
var result = Sse.Reciprocal(vf); var x0 = Sse.Reciprocal(vf);
return Unsafe.As<Vector128<float>, WideLane<TNumber>>(ref result); #if MHP_FASTMATH
return Unsafe.As<Vector128<float>, WideLane<TNumber>>(ref x0);
#else
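// SSE and AVX provide fast approximate reciprocal instructions, but their precision is very low (about 11 bits).
// On the non-MHP_FASTMATH path we apply one Newton-Raphson step, x1 = x0 * (2 - x0 * v), which improves the precision to about 22 bits; that is good enough for most game use cases.
// NOTE(review): for v = 0 or v = +/-infinity this step multiplies infinity by zero and yields NaN, whereas the unrefined approximation returns +/-infinity or +/-0 - confirm whether callers depend on those special values.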
var x1 = x0 * (Vector128.Create(2.0f) - x0 * vf);
return Unsafe.As<Vector128<float>, WideLane<TNumber>>(ref x1);
#endif
} }
else if (Avx.IsSupported && LaneWidth == Vector256<float>.Count) else if (Avx.IsSupported && LaneWidth == Vector256<float>.Count)
{ {
ref var vf = ref Unsafe.As<WideLane<TNumber>, Vector256<float>>(ref value); ref var vf = ref Unsafe.As<WideLane<TNumber>, Vector256<float>>(ref value);
var result = Avx.Reciprocal(vf); var x0 = Avx.Reciprocal(vf);
return Unsafe.As<Vector256<float>, WideLane<TNumber>>(ref result); #if MHP_FASTMATH
return Unsafe.As<Vector256<float>, WideLane<TNumber>>(ref x0);
#else
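// SSE and AVX provide fast approximate reciprocal instructions, but their precision is very low (about 11 bits).
// On the non-MHP_FASTMATH path we apply one Newton-Raphson step, x1 = x0 * (2 - x0 * v), which improves the precision to about 22 bits; that is good enough for most game use cases.
// NOTE(review): for v = 0 or v = +/-infinity this step multiplies infinity by zero and yields NaN, whereas the unrefined approximation returns +/-infinity or +/-0 - confirm whether callers depend on those special values.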
var x1 = x0 * (Vector256.Create(2.0f) - x0 * vf);
return Unsafe.As<Vector256<float>, WideLane<TNumber>>(ref x1);
#endif
} }
} }
@@ -1039,14 +1095,28 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
if (Sse.IsSupported && LaneWidth == Vector128<float>.Count) if (Sse.IsSupported && LaneWidth == Vector128<float>.Count)
{ {
ref var vf = ref Unsafe.As<WideLane<TNumber>, Vector128<float>>(ref value); ref var vf = ref Unsafe.As<WideLane<TNumber>, Vector128<float>>(ref value);
var result = Sse.ReciprocalSqrt(vf); var x0 = Sse.ReciprocalSqrt(vf);
return Unsafe.As<Vector128<float>, WideLane<TNumber>>(ref result); #if MHP_FASTMATH
return Unsafe.As<Vector128<float>, WideLane<TNumber>>(ref x0);
#else
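// SSE and AVX provide fast approximate reciprocal square-root instructions, but their precision is very low (about 11 bits).
// On the non-MHP_FASTMATH path we apply one Newton-Raphson step, x1 = 0.5 * x0 * (3 - v * x0 * x0), which improves the precision to about 22 bits; that is good enough for most game use cases.
// NOTE(review): for v = 0 or v = +infinity this step multiplies infinity by zero and yields NaN, whereas the unrefined approximation returns +infinity or 0 - confirm whether callers depend on those special values.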
var x1 = x0 * Vector128.Create(0.5f) * (Vector128.Create(3.0f) - (vf * x0 * x0));
return Unsafe.As<Vector128<float>, WideLane<TNumber>>(ref x1);
#endif
} }
else if (Avx.IsSupported && LaneWidth == Vector256<float>.Count) else if (Avx.IsSupported && LaneWidth == Vector256<float>.Count)
{ {
ref var vf = ref Unsafe.As<WideLane<TNumber>, Vector256<float>>(ref value); ref var vf = ref Unsafe.As<WideLane<TNumber>, Vector256<float>>(ref value);
var result = Avx.ReciprocalSqrt(vf); var x0 = Avx.ReciprocalSqrt(vf);
return Unsafe.As<Vector256<float>, WideLane<TNumber>>(ref result); #if MHP_FASTMATH
return Unsafe.As<Vector256<float>, WideLane<TNumber>>(ref x0);
#else
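// SSE and AVX provide fast approximate reciprocal square-root instructions, but their precision is very low (about 11 bits).
// On the non-MHP_FASTMATH path we apply one Newton-Raphson step, x1 = 0.5 * x0 * (3 - v * x0 * x0), which improves the precision to about 22 bits; that is good enough for most game use cases.
// NOTE(review): for v = 0 or v = +infinity this step multiplies infinity by zero and yields NaN, whereas the unrefined approximation returns +infinity or 0 - confirm whether callers depend on those special values.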
var x1 = x0 * Vector256.Create(0.5f) * (Vector256.Create(3.0f) - (vf * x0 * x0));
return Unsafe.As<Vector256<float>, WideLane<TNumber>>(ref x1);
#endif
} }
} }