using System.Diagnostics.CodeAnalysis;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;

namespace Misaki.HighPerformance.LowLevel.Utilities;

public static unsafe partial class MemoryUtility
{
    /// <summary>
    /// Throws the exception reported when a scan runs out of range without finding a terminator.
    /// Kept out of line so the throw does not sit in the body of the hot scanning method.
    /// </summary>
    /// <exception cref="ArgumentException">Always thrown.</exception>
    [DoesNotReturn]
    private static void ThrowMustBeNullTerminatedString()
    {
        throw new ArgumentException("The string must be null-terminated.");
    }

    /// <summary>
    /// Reads a 128-bit byte vector from <paramref name="start"/> plus <paramref name="offset"/> bytes,
    /// without requiring the address to be aligned.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static Vector128<byte> LoadVector128(ref byte start, nuint offset)
    {
        return Unsafe.ReadUnaligned<Vector128<byte>>(ref Unsafe.AddByteOffset(ref start, offset));
    }

    /// <summary>
    /// Reads a 256-bit byte vector from <paramref name="start"/> plus <paramref name="offset"/> bytes,
    /// without requiring the address to be aligned.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static Vector256<byte> LoadVector256(ref byte start, nuint offset)
    {
        return Unsafe.ReadUnaligned<Vector256<byte>>(ref Unsafe.AddByteOffset(ref start, offset));
    }

    /// <summary>
    /// Returns <c>length - offset</c> rounded down to a multiple of the 128-bit vector width in bytes.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static nuint GetByteVector128SpanLength(nuint offset, int length)
    {
        return (uint)((length - (int)offset) & ~(Vector128<byte>.Count - 1));
    }

    /// <summary>
    /// Returns <c>length - offset</c> rounded down to a multiple of the 256-bit vector width in bytes.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static nuint GetByteVector256SpanLength(nuint offset, int length)
    {
        return (uint)((length - (int)offset) & ~(Vector256<byte>.Count - 1));
    }

    /// <summary>
    /// Returns <c>length - offset</c> rounded down to a multiple of the 512-bit vector width in bytes.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static nuint GetByteVector512SpanLength(nuint offset, int length)
    {
        return (uint)((length - (int)offset) & ~(Vector512<byte>.Count - 1));
    }

    /// <summary>
    /// Returns how many bytes must be skipped from <paramref name="searchSpace"/> to reach the next
    /// address that is a multiple of the 128-bit vector width (0 when it is already aligned).
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static unsafe nuint UnalignedCountVector128(byte* searchSpace)
    {
        var unaligned = (nint)searchSpace & (Vector128<byte>.Count - 1);
        return (uint)((Vector128<byte>.Count - unaligned) & (Vector128<byte>.Count - 1));
    }

    /// <summary>
    /// Searches for the first occurrence of a null byte (0x00) starting at the given pointer.
    /// </summary>
    /// <param name="searchSpace">A pointer to the first byte of the sequence to search.</param>
    /// <returns>The zero-based index of the first null byte found.</returns>
    /// <exception cref="ArgumentException">
    /// Thrown if no null byte is found within the first <see cref="int.MaxValue"/> bytes.
    /// </exception>
    public static unsafe int IndexOfNullByte(byte* searchSpace)
    {
        const int Length = int.MaxValue;
        const uint uValue = 0; // Use uint for comparisons to avoid unnecessary 8->32 extensions
        nuint offset = 0; // Use nuint for arithmetic to avoid unnecessary 64->32->64 truncations
        var lengthToExamine = (nuint)(uint)Length;

        if (Vector128.IsHardwareAccelerated)
        {
            // All vector branches below start from a Vector128-aligned address, so the check is combined:
            // scan byte-by-byte only until that alignment is reached.
            lengthToExamine = UnalignedCountVector128(searchSpace);
        }

    SequentialScan:
        while (lengthToExamine >= 8)
        {
            lengthToExamine -= 8;

            if (uValue == searchSpace[offset]) goto Found;
            if (uValue == searchSpace[offset + 1]) goto Found1;
            if (uValue == searchSpace[offset + 2]) goto Found2;
            if (uValue == searchSpace[offset + 3]) goto Found3;
            if (uValue == searchSpace[offset + 4]) goto Found4;
            if (uValue == searchSpace[offset + 5]) goto Found5;
            if (uValue == searchSpace[offset + 6]) goto Found6;
            if (uValue == searchSpace[offset + 7]) goto Found7;

            offset += 8;
        }

        if (lengthToExamine >= 4)
        {
            lengthToExamine -= 4;

            if (uValue == searchSpace[offset]) goto Found;
            if (uValue == searchSpace[offset + 1]) goto Found1;
            if (uValue == searchSpace[offset + 2]) goto Found2;
            if (uValue == searchSpace[offset + 3]) goto Found3;

            offset += 4;
        }

        while (lengthToExamine > 0)
        {
            lengthToExamine -= 1;

            if (uValue == searchSpace[offset]) goto Found;

            offset += 1;
        }

        // We get past SequentialScan only if IsHardwareAccelerated is true and the remaining length is greater
        // than the Vector length. However, we still have the redundant check to allow the JIT to see that the
        // code is unreachable and eliminate it when the platform is not hardware accelerated.
        // After processing Vector lengths we return to SequentialScan to finish any remaining bytes.
        if (Vector512.IsHardwareAccelerated)
        {
            if (offset < (nuint)(uint)Length)
            {
                if ((((nuint)searchSpace + offset) & (nuint)(Vector256<byte>.Count - 1)) != 0)
                {
                    // Not currently aligned to Vector256 (is aligned to Vector128); this can cause a problem
                    // for searches with no upper bound e.g. String.strlen.
                    // Start with a check on Vector128 to align to Vector256, before moving to processing Vector256.
                    // This ensures we do not fault across memory pages while searching for an end of string.
                    Vector128<byte> search = Vector128.Load(searchSpace + offset);

                    // Same method as below
                    uint matches = Vector128.Equals(Vector128<byte>.Zero, search).ExtractMostSignificantBits();
                    if (matches == 0)
                    {
                        // Zero flags set so no matches
                        offset += (nuint)Vector128<byte>.Count;
                    }
                    else
                    {
                        // Find bitflag offset of first match and add to current offset
                        return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
                    }
                }

                if ((((nuint)searchSpace + offset) & (nuint)(Vector512<byte>.Count - 1)) != 0)
                {
                    // Not currently aligned to Vector512 (is aligned to Vector256); this can cause a problem
                    // for searches with no upper bound e.g. String.strlen.
                    // Start with a check on Vector256 to align to Vector512, before moving to processing Vector512.
                    // This ensures we do not fault across memory pages while searching for an end of string.
                    Vector256<byte> search = Vector256.Load(searchSpace + offset);

                    // Same method as below
                    uint matches = Vector256.Equals(Vector256<byte>.Zero, search).ExtractMostSignificantBits();
                    if (matches == 0)
                    {
                        // Zero flags set so no matches
                        offset += (nuint)Vector256<byte>.Count;
                    }
                    else
                    {
                        // Find bitflag offset of first match and add to current offset
                        return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
                    }
                }

                lengthToExamine = GetByteVector512SpanLength(offset, Length);
                if (lengthToExamine > offset)
                {
                    do
                    {
                        Vector512<byte> search = Vector512.Load(searchSpace + offset);
                        ulong matches = Vector512.Equals(Vector512<byte>.Zero, search).ExtractMostSignificantBits();

                        // ExtractMostSignificantBits has converted the equal vector elements into a set of bit
                        // flags, so the bit position in 'matches' corresponds to the element offset.
                        if (matches == 0)
                        {
                            // Zero flags set so no matches
                            offset += (nuint)Vector512<byte>.Count;
                            continue;
                        }

                        // Find bitflag offset of first match and add to current offset
                        return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
                    } while (lengthToExamine > offset);
                }

                lengthToExamine = GetByteVector256SpanLength(offset, Length);
                if (lengthToExamine > offset)
                {
                    Vector256<byte> search = Vector256.Load(searchSpace + offset);

                    // Same method as above
                    uint matches = Vector256.Equals(Vector256<byte>.Zero, search).ExtractMostSignificantBits();
                    if (matches == 0)
                    {
                        // Zero flags set so no matches
                        offset += (nuint)Vector256<byte>.Count;
                    }
                    else
                    {
                        // Find bitflag offset of first match and add to current offset
                        return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
                    }
                }

                lengthToExamine = GetByteVector128SpanLength(offset, Length);
                if (lengthToExamine > offset)
                {
                    Vector128<byte> search = Vector128.Load(searchSpace + offset);

                    // Same method as above
                    uint matches = Vector128.Equals(Vector128<byte>.Zero, search).ExtractMostSignificantBits();
                    if (matches == 0)
                    {
                        // Zero flags set so no matches
                        offset += (nuint)Vector128<byte>.Count;
                    }
                    else
                    {
                        // Find bitflag offset of first match and add to current offset
                        return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
                    }
                }

                if (offset < (nuint)(uint)Length)
                {
                    lengthToExamine = (nuint)(uint)Length - offset;
                    goto SequentialScan;
                }
            }
        }
        else if (Vector256.IsHardwareAccelerated)
        {
            if (offset < (nuint)(uint)Length)
            {
                if ((((nuint)searchSpace + offset) & (nuint)(Vector256<byte>.Count - 1)) != 0)
                {
                    // Not currently aligned to Vector256 (is aligned to Vector128); this can cause a problem
                    // for searches with no upper bound e.g. String.strlen.
                    // Start with a check on Vector128 to align to Vector256, before moving to processing Vector256.
                    // This ensures we do not fault across memory pages while searching for an end of string.
                    Vector128<byte> search = Vector128.Load(searchSpace + offset);

                    // Same method as below
                    uint matches = Vector128.Equals(Vector128<byte>.Zero, search).ExtractMostSignificantBits();
                    if (matches == 0)
                    {
                        // Zero flags set so no matches
                        offset += (nuint)Vector128<byte>.Count;
                    }
                    else
                    {
                        // Find bitflag offset of first match and add to current offset
                        return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
                    }
                }

                lengthToExamine = GetByteVector256SpanLength(offset, Length);
                if (lengthToExamine > offset)
                {
                    do
                    {
                        Vector256<byte> search = Vector256.Load(searchSpace + offset);
                        uint matches = Vector256.Equals(Vector256<byte>.Zero, search).ExtractMostSignificantBits();

                        // ExtractMostSignificantBits has converted the equal vector elements into a set of bit
                        // flags, so the bit position in 'matches' corresponds to the element offset.
                        if (matches == 0)
                        {
                            // Zero flags set so no matches
                            offset += (nuint)Vector256<byte>.Count;
                            continue;
                        }

                        // Find bitflag offset of first match and add to current offset
                        return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
                    } while (lengthToExamine > offset);
                }

                lengthToExamine = GetByteVector128SpanLength(offset, Length);
                if (lengthToExamine > offset)
                {
                    Vector128<byte> search = Vector128.Load(searchSpace + offset);

                    // Same method as above
                    uint matches = Vector128.Equals(Vector128<byte>.Zero, search).ExtractMostSignificantBits();
                    if (matches == 0)
                    {
                        // Zero flags set so no matches
                        offset += (nuint)Vector128<byte>.Count;
                    }
                    else
                    {
                        // Find bitflag offset of first match and add to current offset
                        return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
                    }
                }

                if (offset < (nuint)(uint)Length)
                {
                    lengthToExamine = (nuint)(uint)Length - offset;
                    goto SequentialScan;
                }
            }
        }
        else if (Vector128.IsHardwareAccelerated)
        {
            if (offset < (nuint)(uint)Length)
            {
                lengthToExamine = GetByteVector128SpanLength(offset, Length);

                while (lengthToExamine > offset)
                {
                    Vector128<byte> search = Vector128.Load(searchSpace + offset);

                    // Same method as above
                    Vector128<byte> compareResult = Vector128.Equals(Vector128<byte>.Zero, search);
                    if (compareResult == Vector128<byte>.Zero)
                    {
                        // Zero flags set so no matches
                        offset += (nuint)Vector128<byte>.Count;
                        continue;
                    }

                    // Find bitflag offset of first match and add to current offset
                    uint matches = compareResult.ExtractMostSignificantBits();
                    return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
                }

                if (offset < (nuint)(uint)Length)
                {
                    lengthToExamine = (nuint)(uint)Length - offset;
                    goto SequentialScan;
                }
            }
        }

        // Reached only when the whole int.MaxValue range was examined without finding a null byte.
        ThrowMustBeNullTerminatedString();

    Found: // Workaround for https://github.com/dotnet/runtime/issues/8795
        return (int)offset;
    Found1:
        return (int)(offset + 1);
    Found2:
        return (int)(offset + 2);
    Found3:
        return (int)(offset + 3);
    Found4:
        return (int)(offset + 4);
    Found5:
        return (int)(offset + 5);
    Found6:
        return (int)(offset + 6);
    Found7:
        return (int)(offset + 7);
    }
}