Add AllBitsSet, refactor WideLane, improve math paths
- Add static AllBitsSet property to ISPMDLane and implement in ScalarLane and WideLane
- Refactor WideLane shuffle table pointers and update usages
- Improve pointer safety and mask handling in CompressStore, Gather, and MaskLoad
- Enhance Sin, Cos, SinCos with fast-math and hardware fallback
- Add Newton-Raphson refinement for reciprocal/sqrt when not fast-math
- Optimize MathV.Vector vector loading (struct init, pointer ops)
- Update project file: version 1.3.4, content packaging, AOT settings
- Minor code cleanup and naming consistency fixes
This commit is contained in:
@@ -16,14 +16,6 @@ public interface ISPMDLane
|
||||
}
|
||||
}
|
||||
|
||||
// TODO:
|
||||
// - ReduceAdd
|
||||
// - ReduceMin
|
||||
// - ReduceMax
|
||||
// - LeadingZeroCount
|
||||
// - TrailingZeroCount
|
||||
// - PopCount
|
||||
|
||||
/// <summary>
|
||||
/// Represents a single-lane or multi-lane (vectorized) SPMD value and the operations supported on it.
|
||||
/// </summary>
|
||||
@@ -65,6 +57,14 @@ public unsafe interface ISPMDLane<TSelf, TNumber> : ISPMDLane, IEquatable<TSelf>
|
||||
get;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets a lane value where all bits are set to 1 for each lane.
|
||||
/// </summary>
|
||||
static abstract TSelf AllBitsSet
|
||||
{
|
||||
get;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the element value for the specified lane index.
|
||||
/// </summary>
|
||||
|
||||
@@ -7,23 +7,31 @@
|
||||
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
|
||||
<GeneratePackageOnBuild>true</GeneratePackageOnBuild>
|
||||
<Authors>Misaki</Authors>
|
||||
<AssemblyVersion>1.3.3</AssemblyVersion>
|
||||
<AssemblyVersion>1.3.4</AssemblyVersion>
|
||||
<Version>$(AssemblyVersion)</Version>
|
||||
<PackageProjectUrl>https://git.personalnas.com/Misaki/Misaki.HighPerformance.git</PackageProjectUrl>
|
||||
<RepositoryUrl>https://git.personalnas.com/Misaki/Misaki.HighPerformance.git</RepositoryUrl>
|
||||
<IncludeBuildOutput>false</IncludeBuildOutput>
|
||||
<ContentTargetFolders>contentFiles</ContentTargetFolders>
|
||||
</PropertyGroup>
|
||||
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|AnyCPU'">
|
||||
<IsAotCompatible>True</IsAotCompatible>
|
||||
<DefineConstants>$(DefineConstants)</DefineConstants>
|
||||
</PropertyGroup>
|
||||
|
||||
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|AnyCPU'">
|
||||
<IsAotCompatible>True</IsAotCompatible>
|
||||
<DefineConstants>$(DefineConstants)</DefineConstants>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<Compile Remove="Templates\Vector2 - Copy (2).gen.cs" />
|
||||
<Compile Remove="Templates\Vector2 - Copy.gen.cs" />
|
||||
<Content Include="**\*.cs" Exclude="obj\**;bin\**">
|
||||
<Pack>true</Pack>
|
||||
<PackagePath>contentFiles\cs\any\Misaki.HighPerformance.LowLevel\</PackagePath>
|
||||
<PackageCopyToOutput>false</PackageCopyToOutput>
|
||||
<BuildAction>Compile</BuildAction>
|
||||
</Content>
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
|
||||
@@ -40,6 +40,12 @@ public readonly unsafe struct ScalarLane<TNumber> : ISPMDLane<ScalarLane<TNumber
|
||||
get => new ScalarLane<TNumber>(TNumber.MaxValue);
|
||||
}
|
||||
|
||||
public static ScalarLane<TNumber> AllBitsSet
|
||||
{
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
get => new ScalarLane<TNumber>(TNumber.AllBitsSet);
|
||||
}
|
||||
|
||||
public readonly TNumber this[int index]
|
||||
{
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
|
||||
@@ -10,7 +10,6 @@ using System.Numerics;
|
||||
namespace Misaki.HighPerformance.Mathematics.SPMD;
|
||||
|
||||
<#
|
||||
|
||||
const string TLane = "TLane";
|
||||
const string TNumber = "TNumber";
|
||||
const string GenericParameters = $"{TLane}, {TNumber}";
|
||||
|
||||
@@ -44,21 +44,21 @@ public static unsafe partial class MathV
|
||||
where TLane : unmanaged, ISPMDLane<TLane, TNumber>
|
||||
where TNumber : unmanaged, INumber<TNumber>, IBinaryNumber<TNumber>, IMinMaxValue<TNumber>, IBitwiseOperators<TNumber, TNumber, TNumber>
|
||||
{
|
||||
var width = TLane.LaneWidth;
|
||||
Unsafe.SkipInit(out TLane x);
|
||||
var px = (TNumber*)&x;
|
||||
Unsafe.SkipInit(out TLane y);
|
||||
var py = (TNumber*)&y;
|
||||
|
||||
var x = stackalloc TNumber[width];
|
||||
var y = stackalloc TNumber[width];
|
||||
|
||||
for (var i = 0; i < width; i++)
|
||||
for (var i = 0; i < TLane.LaneWidth; i++)
|
||||
{
|
||||
x[i] = pSrc[i * 2 + 0];
|
||||
y[i] = pSrc[i * 2 + 1];
|
||||
px[i] = pSrc[i * 2 + 0];
|
||||
py[i] = pSrc[i * 2 + 1];
|
||||
}
|
||||
|
||||
return new Vector2<TLane, TNumber>
|
||||
{
|
||||
x = TLane.Load(x),
|
||||
y = TLane.Load(y),
|
||||
x = x,
|
||||
y = y,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -515,24 +515,25 @@ public static unsafe partial class MathV
|
||||
where TLane : unmanaged, ISPMDLane<TLane, TNumber>
|
||||
where TNumber : unmanaged, INumber<TNumber>, IBinaryNumber<TNumber>, IMinMaxValue<TNumber>, IBitwiseOperators<TNumber, TNumber, TNumber>
|
||||
{
|
||||
var width = TLane.LaneWidth;
|
||||
Unsafe.SkipInit(out TLane x);
|
||||
var px = (TNumber*)&x;
|
||||
Unsafe.SkipInit(out TLane y);
|
||||
var py = (TNumber*)&y;
|
||||
Unsafe.SkipInit(out TLane z);
|
||||
var pz = (TNumber*)&z;
|
||||
|
||||
var x = stackalloc TNumber[width];
|
||||
var y = stackalloc TNumber[width];
|
||||
var z = stackalloc TNumber[width];
|
||||
|
||||
for (var i = 0; i < width; i++)
|
||||
for (var i = 0; i < TLane.LaneWidth; i++)
|
||||
{
|
||||
x[i] = pSrc[i * 3 + 0];
|
||||
y[i] = pSrc[i * 3 + 1];
|
||||
z[i] = pSrc[i * 3 + 2];
|
||||
px[i] = pSrc[i * 3 + 0];
|
||||
py[i] = pSrc[i * 3 + 1];
|
||||
pz[i] = pSrc[i * 3 + 2];
|
||||
}
|
||||
|
||||
return new Vector3<TLane, TNumber>
|
||||
{
|
||||
x = TLane.Load(x),
|
||||
y = TLane.Load(y),
|
||||
z = TLane.Load(z),
|
||||
x = x,
|
||||
y = y,
|
||||
z = z,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -1024,27 +1025,29 @@ public static unsafe partial class MathV
|
||||
where TLane : unmanaged, ISPMDLane<TLane, TNumber>
|
||||
where TNumber : unmanaged, INumber<TNumber>, IBinaryNumber<TNumber>, IMinMaxValue<TNumber>, IBitwiseOperators<TNumber, TNumber, TNumber>
|
||||
{
|
||||
var width = TLane.LaneWidth;
|
||||
Unsafe.SkipInit(out TLane x);
|
||||
var px = (TNumber*)&x;
|
||||
Unsafe.SkipInit(out TLane y);
|
||||
var py = (TNumber*)&y;
|
||||
Unsafe.SkipInit(out TLane z);
|
||||
var pz = (TNumber*)&z;
|
||||
Unsafe.SkipInit(out TLane w);
|
||||
var pw = (TNumber*)&w;
|
||||
|
||||
var x = stackalloc TNumber[width];
|
||||
var y = stackalloc TNumber[width];
|
||||
var z = stackalloc TNumber[width];
|
||||
var w = stackalloc TNumber[width];
|
||||
|
||||
for (var i = 0; i < width; i++)
|
||||
for (var i = 0; i < TLane.LaneWidth; i++)
|
||||
{
|
||||
x[i] = pSrc[i * 4 + 0];
|
||||
y[i] = pSrc[i * 4 + 1];
|
||||
z[i] = pSrc[i * 4 + 2];
|
||||
w[i] = pSrc[i * 4 + 3];
|
||||
px[i] = pSrc[i * 4 + 0];
|
||||
py[i] = pSrc[i * 4 + 1];
|
||||
pz[i] = pSrc[i * 4 + 2];
|
||||
pw[i] = pSrc[i * 4 + 3];
|
||||
}
|
||||
|
||||
return new Vector4<TLane, TNumber>
|
||||
{
|
||||
x = TLane.Load(x),
|
||||
y = TLane.Load(y),
|
||||
z = TLane.Load(z),
|
||||
w = TLane.Load(w),
|
||||
x = x,
|
||||
y = y,
|
||||
z = z,
|
||||
w = w,
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -65,23 +65,22 @@ public static unsafe partial class MathV
|
||||
<#= TLaneRestrictions #>
|
||||
<#= TNumberRestrictions #>
|
||||
{
|
||||
var width = TLane.LaneWidth;
|
||||
|
||||
<# for (int i = 0; i < dimension; i++) { #>
|
||||
var <#= components[i] #> = stackalloc <#= TNumber #>[width];
|
||||
Unsafe.SkipInit(out TLane <#= components[i] #>);
|
||||
var p<#= components[i] #> = (<#= TNumber #>*)&<#= components[i] #>;
|
||||
<# } #>
|
||||
|
||||
for (var i = 0; i < width; i++)
|
||||
for (var i = 0; i < TLane.LaneWidth; i++)
|
||||
{
|
||||
<# for (int i = 0; i < dimension; i++) { #>
|
||||
<#= components[i] #>[i] = pSrc[i * <#= dimension #> + <#= i #>];
|
||||
p<#= components[i] #>[i] = pSrc[i * <#= dimension #> + <#= i #>];
|
||||
<# } #>
|
||||
}
|
||||
|
||||
return new <#= vectorType #>
|
||||
{
|
||||
<# for (int i = 0; i < dimension; i++) { #>
|
||||
<#= components[i] #> = <#= TLane #>.Load(<#= components[i] #>),
|
||||
<#= components[i] #> = <#= components[i] #>,
|
||||
<# } #>
|
||||
};
|
||||
}
|
||||
|
||||
@@ -8,12 +8,12 @@ namespace Misaki.HighPerformance.Mathematics.SPMD;
|
||||
|
||||
public static unsafe class WideLane
|
||||
{
|
||||
internal static readonly uint* s_shuffleTable512_32bit;
|
||||
internal static readonly ulong* s_shuffleTable512_64bit;
|
||||
internal static readonly uint* s_shuffleTable256_32bit;
|
||||
internal static readonly ulong* s_shuffleTable256_64bit;
|
||||
internal static readonly uint* s_shuffleTable128_32bit;
|
||||
internal static readonly ulong* s_shuffleTable128_64bit;
|
||||
internal static readonly uint* s_pShuffleTable512_32bit;
|
||||
internal static readonly ulong* s_pShuffleTable512_64bit;
|
||||
internal static readonly uint* s_pShuffleTable256_32bit;
|
||||
internal static readonly ulong* s_pShuffleTable256_64bit;
|
||||
internal static readonly uint* s_pShuffleTable128_32bit;
|
||||
internal static readonly ulong* s_pShuffleTable128_64bit;
|
||||
|
||||
/// <summary>
|
||||
/// Gets whether WideLane is supported on the current hardware.
|
||||
@@ -22,12 +22,12 @@ public static unsafe class WideLane
|
||||
|
||||
static WideLane()
|
||||
{
|
||||
s_shuffleTable512_32bit = ShuffleTableGenerator.ComputeShuffleTable512_32Bit();
|
||||
s_shuffleTable512_64bit = ShuffleTableGenerator.ComputeShuffleTable512_64Bit();
|
||||
s_shuffleTable256_32bit = ShuffleTableGenerator.ComputeShuffleTable256_32Bit();
|
||||
s_shuffleTable256_64bit = ShuffleTableGenerator.ComputeShuffleTable256_64Bit();
|
||||
s_shuffleTable128_32bit = ShuffleTableGenerator.ComputeShuffleTable128_32Bit();
|
||||
s_shuffleTable128_64bit = ShuffleTableGenerator.ComputeShuffleTable128_64Bit();
|
||||
s_pShuffleTable512_32bit = ShuffleTableGenerator.ComputeShuffleTable512_32Bit();
|
||||
s_pShuffleTable512_64bit = ShuffleTableGenerator.ComputeShuffleTable512_64Bit();
|
||||
s_pShuffleTable256_32bit = ShuffleTableGenerator.ComputeShuffleTable256_32Bit();
|
||||
s_pShuffleTable256_64bit = ShuffleTableGenerator.ComputeShuffleTable256_64Bit();
|
||||
s_pShuffleTable128_32bit = ShuffleTableGenerator.ComputeShuffleTable128_32Bit();
|
||||
s_pShuffleTable128_64bit = ShuffleTableGenerator.ComputeShuffleTable128_64Bit();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -69,6 +69,12 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
|
||||
get => Create(TNumber.MaxValue);
|
||||
}
|
||||
|
||||
public static WideLane<TNumber> AllBitsSet
|
||||
{
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
get => Create(TNumber.AllBitsSet);
|
||||
}
|
||||
|
||||
public readonly TNumber this[int index]
|
||||
{
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
@@ -194,48 +200,26 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static WideLane<TNumber> MaskLoad(WideLane<TNumber> mask, ref TNumber value)
|
||||
{
|
||||
return MaskLoad(mask, (TNumber*)Unsafe.AsPointer(ref value));
|
||||
var vector = Vector.LoadUnsafe(ref value);
|
||||
return new WideLane<TNumber>(Vector.ConditionalSelect(mask.value, vector, Vector<TNumber>.Zero));
|
||||
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static WideLane<TNumber> MaskLoad(WideLane<TNumber> mask, TNumber* pValue)
|
||||
{
|
||||
var vector = Vector.Load(pValue);
|
||||
return new WideLane<TNumber>(Vector.ConditionalSelect(mask.value, vector, Vector<TNumber>.Zero));
|
||||
return MaskLoad(mask, ref Unsafe.AsRef<TNumber>(pValue));
|
||||
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static WideLane<TNumber> Gather(TNumber* pData, WideLane<TNumber> indices, int scale)
|
||||
{
|
||||
Unsafe.SkipInit(out Vector<TNumber> result);
|
||||
|
||||
var pResult = (TNumber*)&result;
|
||||
var pIndices = (TNumber*)&indices;
|
||||
|
||||
var count = Vector<TNumber>.Count;
|
||||
for (var i = 0; i < count; i++)
|
||||
{
|
||||
var idx = int.CreateTruncating(pIndices[i]);
|
||||
pResult[i] = pData[idx * scale / sizeof(TNumber)];
|
||||
}
|
||||
|
||||
return new WideLane<TNumber>(result);
|
||||
return Gather(ref Unsafe.AsRef<TNumber>(pData), indices, scale);
|
||||
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static WideLane<TNumber> Gather(TNumber* pData, int* pIndices, int scale)
|
||||
{
|
||||
Unsafe.SkipInit(out Vector<TNumber> result);
|
||||
|
||||
var pResult = (TNumber*)&result;
|
||||
|
||||
var count = Vector<TNumber>.Count;
|
||||
for (var i = 0; i < count; i++)
|
||||
{
|
||||
pResult[i] = pData[pIndices[i] * scale / sizeof(TNumber)];
|
||||
}
|
||||
|
||||
return new WideLane<TNumber>(result);
|
||||
return Gather(ref Unsafe.AsRef<TNumber>(pData), ref Unsafe.AsRef<int>(pIndices), scale);
|
||||
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
@@ -287,12 +271,6 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public int CompressStore(WideLane<TNumber> mask, ref TNumber destination)
|
||||
{
|
||||
return CompressStore(mask, (TNumber*)Unsafe.AsPointer(in destination));
|
||||
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public int CompressStore(WideLane<TNumber> mask, TNumber* pDestination)
|
||||
{
|
||||
if (LaneWidth == Vector512<TNumber>.Count && Vector512.IsHardwareAccelerated)
|
||||
{
|
||||
@@ -303,10 +281,10 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
|
||||
|
||||
var moveMask = m.ExtractMostSignificantBits();
|
||||
// Offset is (moveMask * 16) because each control vector has 16 elements
|
||||
var shuffle = Vector512.Load(WideLane.s_shuffleTable512_32bit + (moveMask * 16));
|
||||
var shuffle = Vector512.Load(WideLane.s_pShuffleTable512_32bit + (moveMask * 16));
|
||||
var compressed = Vector512.Shuffle(vec, shuffle);
|
||||
|
||||
compressed.Store((uint*)pDestination);
|
||||
compressed.StoreUnsafe(ref Unsafe.As<TNumber, uint>(ref destination));
|
||||
return BitOperations.PopCount(moveMask);
|
||||
}
|
||||
|
||||
@@ -317,10 +295,10 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
|
||||
|
||||
var moveMask = m.ExtractMostSignificantBits();
|
||||
// Offset is (moveMask * 8) because each control vector has 8 elements
|
||||
var shuffle = Vector512.Load(WideLane.s_shuffleTable512_64bit + (moveMask * 8));
|
||||
var shuffle = Vector512.Load(WideLane.s_pShuffleTable512_64bit + (moveMask * 8));
|
||||
var compressed = Vector512.Shuffle(vec, shuffle);
|
||||
|
||||
compressed.Store((ulong*)pDestination);
|
||||
compressed.StoreUnsafe(ref Unsafe.As<TNumber, ulong>(ref destination));
|
||||
return BitOperations.PopCount(moveMask);
|
||||
}
|
||||
}
|
||||
@@ -333,10 +311,10 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
|
||||
|
||||
var moveMask = m.ExtractMostSignificantBits();
|
||||
// Offset is (moveMask * 8) because each control vector has 8 elements
|
||||
var shuffle = Vector256.Load(WideLane.s_shuffleTable256_32bit + (moveMask * 8));
|
||||
var shuffle = Vector256.Load(WideLane.s_pShuffleTable256_32bit + (moveMask * 8));
|
||||
var compressed = Vector256.Shuffle(vec, shuffle);
|
||||
|
||||
compressed.Store((uint*)pDestination);
|
||||
compressed.StoreUnsafe(ref Unsafe.As<TNumber, uint>(ref destination));
|
||||
return BitOperations.PopCount(moveMask);
|
||||
}
|
||||
|
||||
@@ -349,10 +327,10 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
|
||||
var moveMask = m.ExtractMostSignificantBits();
|
||||
|
||||
// Offset is (moveMask * 4) because each control vector has 4 elements
|
||||
var shuffle = Vector256.Load(WideLane.s_shuffleTable256_64bit + (moveMask * 4));
|
||||
var shuffle = Vector256.Load(WideLane.s_pShuffleTable256_64bit + (moveMask * 4));
|
||||
var compressed = Vector256.Shuffle(vec, shuffle);
|
||||
|
||||
compressed.Store((ulong*)pDestination);
|
||||
compressed.StoreUnsafe(ref Unsafe.As<TNumber, ulong>(ref destination));
|
||||
return BitOperations.PopCount(moveMask);
|
||||
}
|
||||
}
|
||||
@@ -365,10 +343,10 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
|
||||
|
||||
var moveMask = m.ExtractMostSignificantBits();
|
||||
// Offset is (moveMask * 4) because each control vector has 4 elements
|
||||
var shuffle = Vector128.Load(WideLane.s_shuffleTable128_32bit + (moveMask * 4));
|
||||
var shuffle = Vector128.Load(WideLane.s_pShuffleTable128_32bit + (moveMask * 4));
|
||||
var compressed = Vector128.Shuffle(vec, shuffle);
|
||||
|
||||
compressed.Store((uint*)pDestination);
|
||||
compressed.StoreUnsafe(ref Unsafe.As<TNumber, uint>(ref destination));
|
||||
return BitOperations.PopCount(moveMask);
|
||||
}
|
||||
|
||||
@@ -378,9 +356,9 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
|
||||
var m = Unsafe.As<WideLane<TNumber>, Vector128<ulong>>(ref mask);
|
||||
var moveMask = m.ExtractMostSignificantBits();
|
||||
// Offset is (moveMask * 2) because each control vector has 2 elements
|
||||
var shuffle = Vector128.Load(WideLane.s_shuffleTable128_64bit + (moveMask * 2));
|
||||
var shuffle = Vector128.Load(WideLane.s_pShuffleTable128_64bit + (moveMask * 2));
|
||||
var compressed = Vector128.Shuffle(vec, shuffle);
|
||||
compressed.Store((ulong*)pDestination);
|
||||
compressed.StoreUnsafe(ref Unsafe.As<TNumber, ulong>(ref destination));
|
||||
return BitOperations.PopCount(moveMask);
|
||||
}
|
||||
}
|
||||
@@ -390,15 +368,21 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
|
||||
var count = 0;
|
||||
for (var i = 0; i < LaneWidth; i++)
|
||||
{
|
||||
if (mask.value[i] == ~TNumber.Zero)
|
||||
if (mask.value[i] == TNumber.AllBitsSet)
|
||||
{
|
||||
pDestination[count++] = value[i];
|
||||
Unsafe.Add(ref destination, count++) = value[i];
|
||||
}
|
||||
}
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public int CompressStore(WideLane<TNumber> mask, TNumber* pDestination)
|
||||
{
|
||||
return CompressStore(mask, ref Unsafe.AsRef<TNumber>(pDestination));
|
||||
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public readonly Vector<TNumber> AsVector()
|
||||
{
|
||||
@@ -617,6 +601,7 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static WideLane<TNumber> Sin(WideLane<TNumber> value)
|
||||
{
|
||||
#if MHP_FASTMATH
|
||||
var invPi = Create(TNumber.CreateTruncating(0.318309886f)); // 1 / PI
|
||||
|
||||
var x_sin = value;
|
||||
@@ -644,11 +629,28 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
|
||||
poly_sin = z_sin * poly_sin; // z * (...)
|
||||
|
||||
return poly_sin * sign_sin;
|
||||
#else
|
||||
if (typeof(TNumber) == typeof(float))
|
||||
{
|
||||
ref var v = ref Unsafe.As<WideLane<TNumber>, Vector<float>>(ref value);
|
||||
var result = Vector.Sin(v);
|
||||
return new WideLane<TNumber>(Unsafe.As<Vector<float>, Vector<TNumber>>(ref result));
|
||||
}
|
||||
else if (typeof(TNumber) == typeof(double))
|
||||
{
|
||||
ref var v = ref Unsafe.As<WideLane<TNumber>, Vector<double>>(ref value);
|
||||
var result = Vector.Sin(v);
|
||||
return new WideLane<TNumber>(Unsafe.As<Vector<double>, Vector<TNumber>>(ref result));
|
||||
}
|
||||
|
||||
return value;
|
||||
#endif
|
||||
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static WideLane<TNumber> Cos(WideLane<TNumber> value)
|
||||
{
|
||||
#if MHP_FASTMATH
|
||||
var halfPi = Create(TNumber.CreateTruncating(1.570796327f));
|
||||
var invPi = Create(TNumber.CreateTruncating(0.318309886f)); // 1 / PI
|
||||
|
||||
@@ -677,11 +679,30 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
|
||||
poly_cos = z_cos * poly_cos;
|
||||
|
||||
return poly_cos * sign_cos;
|
||||
#else
|
||||
if (typeof(TNumber) == typeof(float))
|
||||
{
|
||||
ref var v = ref Unsafe.As<WideLane<TNumber>, Vector<float>>(ref value);
|
||||
var result = Vector.Cos(v);
|
||||
return new WideLane<TNumber>(Unsafe.As<Vector<float>, Vector<TNumber>>(ref result));
|
||||
}
|
||||
else if (typeof(TNumber) == typeof(double))
|
||||
{
|
||||
ref var v = ref Unsafe.As<WideLane<TNumber>, Vector<double>>(ref value);
|
||||
var result = Vector.Cos(v);
|
||||
return new WideLane<TNumber>(Unsafe.As<Vector<double>, Vector<TNumber>>(ref result));
|
||||
}
|
||||
|
||||
return value;
|
||||
#endif
|
||||
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
public static void SinCos(WideLane<TNumber> value, out WideLane<TNumber> sin, out WideLane<TNumber> cos)
|
||||
{
|
||||
#if MHP_FASTMATH
|
||||
// We use Taylor/Remez polynomial approximation for Sin(PI * z) and Cos(PI * z) on the reduced range of z in [-0.5, 0.5].
|
||||
|
||||
var halfPi = Create(TNumber.CreateTruncating(1.570796327f));
|
||||
var invPi = Create(TNumber.CreateTruncating(0.318309886f)); // 1 / PI
|
||||
|
||||
@@ -741,6 +762,27 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
|
||||
|
||||
sin = poly_sin * sign_sin;
|
||||
cos = poly_cos * sign_cos;
|
||||
#else
|
||||
if (typeof(TNumber) == typeof(float))
|
||||
{
|
||||
ref var v = ref Unsafe.As<WideLane<TNumber>, Vector<float>>(ref value);
|
||||
var (sinResult, cosResult) = Vector.SinCos(v);
|
||||
sin = new WideLane<TNumber>(Unsafe.As<Vector<float>, Vector<TNumber>>(ref sinResult));
|
||||
cos = new WideLane<TNumber>(Unsafe.As<Vector<float>, Vector<TNumber>>(ref cosResult));
|
||||
}
|
||||
else if (typeof(TNumber) == typeof(double))
|
||||
{
|
||||
ref var v = ref Unsafe.As<WideLane<TNumber>, Vector<double>>(ref value);
|
||||
var (sinResult, cosResult) = Vector.SinCos(v);
|
||||
sin = new WideLane<TNumber>(Unsafe.As<Vector<double>, Vector<TNumber>>(ref sinResult));
|
||||
cos = new WideLane<TNumber>(Unsafe.As<Vector<double>, Vector<TNumber>>(ref cosResult));
|
||||
}
|
||||
else
|
||||
{
|
||||
sin = value;
|
||||
cos = value;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
[MethodImpl(MethodImplOptions.AggressiveInlining)]
|
||||
@@ -999,7 +1041,7 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
|
||||
One,
|
||||
Select(
|
||||
LessThan(value, Zero),
|
||||
~Zero,
|
||||
AllBitsSet,
|
||||
Zero));
|
||||
}
|
||||
|
||||
@@ -1017,14 +1059,28 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
|
||||
if (Sse.IsSupported && LaneWidth == Vector128<float>.Count)
|
||||
{
|
||||
ref var vf = ref Unsafe.As<WideLane<TNumber>, Vector128<float>>(ref value);
|
||||
var result = Sse.Reciprocal(vf);
|
||||
return Unsafe.As<Vector128<float>, WideLane<TNumber>>(ref result);
|
||||
var x0 = Sse.Reciprocal(vf);
|
||||
#if MHP_FASTMATH
|
||||
return Unsafe.As<Vector128<float>, WideLane<TNumber>>(ref x0);
|
||||
#else
|
||||
// SSE and AVX provide fast approximate reciprocal instructions but the precision is very low (11 bits).
|
||||
// On the non-MHP_FASTMATH path, we apply one Newton-Raphson iteration to improve the precision to about 22 bits, which is good enough for most game use cases.
|
||||
var x1 = x0 * (Vector128.Create(2.0f) - x0 * vf);
|
||||
return Unsafe.As<Vector128<float>, WideLane<TNumber>>(ref x1);
|
||||
#endif
|
||||
}
|
||||
else if (Avx.IsSupported && LaneWidth == Vector256<float>.Count)
|
||||
{
|
||||
ref var vf = ref Unsafe.As<WideLane<TNumber>, Vector256<float>>(ref value);
|
||||
var result = Avx.Reciprocal(vf);
|
||||
return Unsafe.As<Vector256<float>, WideLane<TNumber>>(ref result);
|
||||
var x0 = Avx.Reciprocal(vf);
|
||||
#if MHP_FASTMATH
|
||||
return Unsafe.As<Vector256<float>, WideLane<TNumber>>(ref x0);
|
||||
#else
|
||||
// SSE and AVX provide fast approximate reciprocal instructions but the precision is very low (11 bits).
|
||||
// On the non-MHP_FASTMATH path, we apply one Newton-Raphson iteration to improve the precision to about 22 bits, which is good enough for most game use cases.
|
||||
var x1 = x0 * (Vector256.Create(2.0f) - x0 * vf);
|
||||
return Unsafe.As<Vector256<float>, WideLane<TNumber>>(ref x1);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1039,14 +1095,28 @@ public readonly unsafe partial struct WideLane<TNumber> : ISPMDLane<WideLane<TNu
|
||||
if (Sse.IsSupported && LaneWidth == Vector128<float>.Count)
|
||||
{
|
||||
ref var vf = ref Unsafe.As<WideLane<TNumber>, Vector128<float>>(ref value);
|
||||
var result = Sse.ReciprocalSqrt(vf);
|
||||
return Unsafe.As<Vector128<float>, WideLane<TNumber>>(ref result);
|
||||
var x0 = Sse.ReciprocalSqrt(vf);
|
||||
#if MHP_FASTMATH
|
||||
return Unsafe.As<Vector128<float>, WideLane<TNumber>>(ref x0);
|
||||
#else
|
||||
// SSE and AVX provide fast approximate reciprocal sqrt instructions but the precision is very low (11 bits).
|
||||
// On the non-MHP_FASTMATH path, we apply one Newton-Raphson iteration to improve the precision to about 22 bits, which is good enough for most game use cases.
|
||||
var x1 = x0 * Vector128.Create(0.5f) * (Vector128.Create(3.0f) - (vf * x0 * x0));
|
||||
return Unsafe.As<Vector128<float>, WideLane<TNumber>>(ref x1);
|
||||
#endif
|
||||
}
|
||||
else if (Avx.IsSupported && LaneWidth == Vector256<float>.Count)
|
||||
{
|
||||
ref var vf = ref Unsafe.As<WideLane<TNumber>, Vector256<float>>(ref value);
|
||||
var result = Avx.ReciprocalSqrt(vf);
|
||||
return Unsafe.As<Vector256<float>, WideLane<TNumber>>(ref result);
|
||||
var x0 = Avx.ReciprocalSqrt(vf);
|
||||
#if MHP_FASTMATH
|
||||
return Unsafe.As<Vector256<float>, WideLane<TNumber>>(ref x0);
|
||||
#else
|
||||
// SSE and AVX provide fast approximate reciprocal sqrt instructions but the precision is very low (11 bits).
|
||||
// On the non-MHP_FASTMATH path, we apply one Newton-Raphson iteration to improve the precision to about 22 bits, which is good enough for most game use cases.
|
||||
var x1 = x0 * Vector256.Create(0.5f) * (Vector256.Create(3.0f) - (vf * x0 * x0));
|
||||
return Unsafe.As<Vector256<float>, WideLane<TNumber>>(ref x1);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user