Thanks for using Compiler Explorer
#include <type_traits>
#include <array>
#ifdef __x86_64__
#include <bit>
#endif

//===-- Common internal constructs ------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SUPPORT_COMMON_H
#define LLVM_LIBC_SUPPORT_COMMON_H

#define LIBC_INLINE_ASM __asm__ __volatile__

#ifndef likely
#define likely(x) __builtin_expect(!!(x), 1)
#endif
#ifndef unlikely
#define unlikely(x) __builtin_expect(!!(x), 0)
#endif
#ifndef UNUSED
#define UNUSED __attribute__((unused))
#endif

#ifndef LLVM_LIBC_FUNCTION_ATTR
#define LLVM_LIBC_FUNCTION_ATTR
#endif

// macOS needs to be excluded because it does not support aliasing.
#if defined(LLVM_LIBC_PUBLIC_PACKAGING) && (!defined(__APPLE__))
#define LLVM_LIBC_FUNCTION(type, name, arglist)                                \
  LLVM_LIBC_FUNCTION_ATTR decltype(__llvm_libc::name)                          \
      __##name##_impl__ __asm__(#name);                                        \
  decltype(__llvm_libc::name) name [[gnu::alias(#name)]];                      \
  type __##name##_impl__ arglist
#else
#define LLVM_LIBC_FUNCTION(type, name, arglist) type name arglist
#endif

namespace __llvm_libc {
namespace internal {
constexpr bool same_string(char const *lhs, char const *rhs) {
  for (; *lhs || *rhs; ++lhs, ++rhs)
    if (*lhs != *rhs)
      return false;
  return true;
}
} // namespace internal
} // namespace __llvm_libc

// LLVM_LIBC_IS_DEFINED checks whether a particular macro is defined.
// Usage: constexpr bool kUseAvx = LLVM_LIBC_IS_DEFINED(__AVX__);
//
// This works by comparing the stringified version of the macro with and without
// evaluation. If FOO is not defined, both stringifications yield "FOO". If FOO
// is defined, one stringification yields "FOO" while the other yields its
// stringified value "1".
#define LLVM_LIBC_IS_DEFINED(macro)                                            \
  !__llvm_libc::internal::same_string(                                         \
      LLVM_LIBC_IS_DEFINED__EVAL_AND_STRINGIZE(macro), #macro)
#define LLVM_LIBC_IS_DEFINED__EVAL_AND_STRINGIZE(s) #s

#endif // LLVM_LIBC_SUPPORT_COMMON_H

//===-- Compile time architecture detection ---------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SUPPORT_ARCHITECTURES_H
#define LLVM_LIBC_SUPPORT_ARCHITECTURES_H

#if defined(__pnacl__) || defined(__CLR_VER)
#define LLVM_LIBC_ARCH_VM
#endif

#if (defined(_M_IX86) || defined(__i386__)) && !defined(LLVM_LIBC_ARCH_VM)
#define LLVM_LIBC_ARCH_X86_32
#endif

#if (defined(_M_X64) || defined(__x86_64__)) && !defined(LLVM_LIBC_ARCH_VM)
#define LLVM_LIBC_ARCH_X86_64
#endif

#if defined(LLVM_LIBC_ARCH_X86_32) || defined(LLVM_LIBC_ARCH_X86_64)
#define LLVM_LIBC_ARCH_X86
#endif

#if (defined(__arm__) || defined(_M_ARM))
#define LLVM_LIBC_ARCH_ARM
#endif

#if defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64)
#define LLVM_LIBC_ARCH_AARCH64
#endif

#if (defined(LLVM_LIBC_ARCH_AARCH64) || defined(LLVM_LIBC_ARCH_ARM))
#define LLVM_LIBC_ARCH_ANY_ARM
#endif

#if defined(LLVM_LIBC_ARCH_AARCH64)
#define LIBC_TARGET_HAS_FMA
#elif defined(LLVM_LIBC_ARCH_X86_64)
#if (defined(__AVX2__) || defined(__FMA__))
#define LIBC_TARGET_HAS_FMA
#endif
#endif

#endif // LLVM_LIBC_SUPPORT_ARCHITECTURES_H

//===-- Endianness support --------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_SUPPORT_ENDIAN_H
#define LLVM_LIBC_SRC_SUPPORT_ENDIAN_H

#include <stdint.h>

namespace __llvm_libc {

// We rely on compiler preprocessor defines to allow for cross compilation.
#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) ||           \
    !defined(__ORDER_BIG_ENDIAN__)
#error "Missing preprocessor definitions for endianness detection."
#endif

namespace internal {

// Converts uint8_t, uint16_t, uint32_t, uint64_t to its big or little endian
// counterpart.
// We use explicit template specialization:
// - to prevent accidental integer promotion,
// - to prevent fallback in the (unlikely) case of middle-endianness.
template <unsigned ORDER> struct Endian {
  static constexpr const bool IS_LITTLE = ORDER == __ORDER_LITTLE_ENDIAN__;
  static constexpr const bool IS_BIG = ORDER == __ORDER_BIG_ENDIAN__;
  template <typename T> static T to_big_endian(T value);
  template <typename T> static T to_little_endian(T value);
};

// Little Endian specializations
template <>
template <>
inline uint8_t
Endian<__ORDER_LITTLE_ENDIAN__>::to_big_endian<uint8_t>(uint8_t v) {
  return v;
}
template <>
template <>
inline uint8_t
Endian<__ORDER_LITTLE_ENDIAN__>::to_little_endian<uint8_t>(uint8_t v) {
  return v;
}
template <>
template <>
inline uint16_t
Endian<__ORDER_LITTLE_ENDIAN__>::to_big_endian<uint16_t>(uint16_t v) {
  return __builtin_bswap16(v);
}
template <>
template <>
inline uint16_t
Endian<__ORDER_LITTLE_ENDIAN__>::to_little_endian<uint16_t>(uint16_t v) {
  return v;
}
template <>
template <>
inline uint32_t
Endian<__ORDER_LITTLE_ENDIAN__>::to_big_endian<uint32_t>(uint32_t v) {
  return __builtin_bswap32(v);
}
template <>
template <>
inline uint32_t
Endian<__ORDER_LITTLE_ENDIAN__>::to_little_endian<uint32_t>(uint32_t v) {
  return v;
}
template <>
template <>
inline uint64_t
Endian<__ORDER_LITTLE_ENDIAN__>::to_big_endian<uint64_t>(uint64_t v) {
  return __builtin_bswap64(v);
}
template <>
template <>
inline uint64_t
Endian<__ORDER_LITTLE_ENDIAN__>::to_little_endian<uint64_t>(uint64_t v) {
  return v;
}

// Big Endian specializations
template <>
template <>
inline uint8_t Endian<__ORDER_BIG_ENDIAN__>::to_big_endian<uint8_t>(uint8_t v) {
  return v;
}
template <>
template <>
inline uint8_t
Endian<__ORDER_BIG_ENDIAN__>::to_little_endian<uint8_t>(uint8_t v) {
  return v;
}
template <>
template <>
inline uint16_t
Endian<__ORDER_BIG_ENDIAN__>::to_big_endian<uint16_t>(uint16_t v) {
  return v;
}
template <>
template <>
inline uint16_t
Endian<__ORDER_BIG_ENDIAN__>::to_little_endian<uint16_t>(uint16_t v) {
  return __builtin_bswap16(v);
}
template <>
template <>
inline uint32_t
Endian<__ORDER_BIG_ENDIAN__>::to_big_endian<uint32_t>(uint32_t v) {
  return v;
}
template <>
template <>
inline uint32_t
Endian<__ORDER_BIG_ENDIAN__>::to_little_endian<uint32_t>(uint32_t v) {
  return __builtin_bswap32(v);
}
template <>
template <>
inline uint64_t
Endian<__ORDER_BIG_ENDIAN__>::to_big_endian<uint64_t>(uint64_t v) {
  return v;
}
template <>
template <>
inline uint64_t
Endian<__ORDER_BIG_ENDIAN__>::to_little_endian<uint64_t>(uint64_t v) {
  return __builtin_bswap64(v);
}

} // namespace internal

using Endian = internal::Endian<__BYTE_ORDER__>;

} // namespace __llvm_libc

#endif // LLVM_LIBC_SRC_SUPPORT_ENDIAN_H

//===-- Memory utils --------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_MEMORY_UTILS_UTILS_H
#define LLVM_LIBC_SRC_MEMORY_UTILS_UTILS_H

// Cache line sizes for ARM: These values are not strictly correct since
// cache line sizes depend on implementations, not architectures. There
// are even implementations with cache line sizes configurable at boot
// time.
#if defined(LLVM_LIBC_ARCH_AARCH64) || defined(LLVM_LIBC_ARCH_X86)
#define LLVM_LIBC_CACHELINE_SIZE 64
#elif defined(LLVM_LIBC_ARCH_ARM)
#define LLVM_LIBC_CACHELINE_SIZE 32
#else
#error "Unsupported platform for memory functions."
#endif

#include <stddef.h> // size_t
#include <stdint.h> // intptr_t / uintptr_t

namespace __llvm_libc {

// Allows compile time error reporting in `if constexpr` branches.
template <bool flag = false>
static void deferred_static_assert(const char *msg) {
  static_assert(flag, "compilation error");
  (void)msg;
}

// Returns whether `value` is zero or a power of two.
static constexpr bool is_power2_or_zero(size_t value) {
  return (value & (value - 1U)) == 0;
}

// Returns whether `value` is a power of two.
static constexpr bool is_power2(size_t value) {
  return value && is_power2_or_zero(value);
}

// Compile time version of log2 that handles 0.
static constexpr size_t log2(size_t value) {
  return (value == 0 || value == 1) ? 0 : 1 + log2(value / 2);
}

// Returns the first power of two preceding value, or value if it is already a
// power of two (or 0 when value is 0).
static constexpr size_t le_power2(size_t value) {
  return value == 0 ? value : 1ULL << log2(value);
}

// Returns the first power of two following value, or value if it is already a
// power of two (or 0 when value is 0).
static constexpr size_t ge_power2(size_t value) {
  return is_power2_or_zero(value) ? value : 1ULL << (log2(value) + 1);
}

// Returns the number of bytes to subtract from ptr to get to the previous
// multiple of alignment. If ptr is already aligned, returns 0.
template <size_t alignment>
intptr_t offset_from_last_aligned_or_zero(const void *ptr) {
  static_assert(is_power2(alignment), "alignment must be a power of 2");
  return reinterpret_cast<uintptr_t>(ptr) & (alignment - 1U);
}

// Returns the number of bytes to add to ptr to get to the next multiple of
// alignment. If ptr is already aligned, returns 0.
template <size_t alignment>
intptr_t offset_to_next_aligned_or_zero(const void *ptr) {
  static_assert(is_power2(alignment), "alignment must be a power of 2");
  // The logic is not straightforward and involves unsigned modulo arithmetic
  // but the generated code is as fast as it can be.
  return -reinterpret_cast<uintptr_t>(ptr) & (alignment - 1U);
}

// Returns the number of bytes to add to ptr to get to the next multiple of
// alignment. If ptr is already aligned, returns alignment.
template <size_t alignment>
intptr_t offset_to_next_aligned(const void *ptr) {
  return alignment - offset_from_last_aligned_or_zero<alignment>(ptr);
}

// Returns the offset from `ptr` to the next cache line.
static inline intptr_t offset_to_next_cache_line(const void *ptr) {
  return offset_to_next_aligned_or_zero<LLVM_LIBC_CACHELINE_SIZE>(ptr);
}

template <size_t alignment, typename T> static T *assume_aligned(T *ptr) {
  return reinterpret_cast<T *>(__builtin_assume_aligned(ptr, alignment));
}

#if defined __has_builtin
#if __has_builtin(__builtin_memcpy_inline)
#define LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
#endif
#endif

#if defined __has_builtin
#if __has_builtin(__builtin_memset_inline)
#define LLVM_LIBC_HAS_BUILTIN_MEMSET_INLINE
#endif
#endif

// Performs a constant count copy.
template <size_t Size>
static inline void memcpy_inline(void *__restrict dst,
                                 const void *__restrict src) {
#ifdef LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
  __builtin_memcpy_inline(dst, src, Size);
#else
  for (size_t i = 0; i < Size; ++i)
    static_cast<char *>(dst)[i] = static_cast<const char *>(src)[i];
#endif
}

using Ptr = char *;        // Pointer to raw data.
using CPtr = const char *; // Const pointer to raw data.

template <typename T> struct StrictScalarType {
  static_assert(std::is_integral_v<T>);
  StrictScalarType() = default;
  template <typename U, std::enable_if_t<std::is_same_v<U, T>, bool> = 0>
  StrictScalarType(U value) : value(value) {}
  operator T() const { return value; }
  static inline constexpr StrictScalarType ZERO() { return {T(0)}; }

private:
  T value;
};

using MemcmpType = StrictScalarType<int32_t>;
using BcmpType = StrictScalarType<uint32_t>;

// Loads bytes from memory (possibly unaligned) and materializes them as
// type T.
template <typename T> static inline T load(CPtr ptr) {
  T Out;
  memcpy_inline<sizeof(T)>(&Out, ptr);
  return Out;
}

// Stores a value of type T in memory (possibly unaligned).
template <typename T> static inline void store(Ptr ptr, T value) {
  memcpy_inline<sizeof(T)>(ptr, &value);
}

// For an operation like memcpy or memcmp that operates on two pointers and a
// count, advances the pointers by offset bytes and decreases count by the same
// amount.
template <typename T1, typename T2>
static inline void adjust(ptrdiff_t offset, T1 *__restrict &p1,
                          T2 *__restrict &p2, size_t &count) {
  p1 += offset;
  p2 += offset;
  count -= offset;
}

// Advances p1 and p2 so that p1 gets aligned to the next SIZE-byte boundary,
// and decreases count by the same amount.
// We make sure the compiler knows about the adjusted pointer alignment.
template <size_t SIZE, typename T1, typename T2>
void align_p1_to_next_boundary(T1 *__restrict &p1, T2 *__restrict &p2,
                               size_t &count) {
  adjust(offset_to_next_aligned<SIZE>(p1), p1, p2, count);
  p1 = assume_aligned<SIZE>(p1);
}

template <size_t SIZE, typename T1>
void align_to_next_boundary(T1 *&p1, size_t &count) {
  CPtr dummy = nullptr;
  align_p1_to_next_boundary<SIZE>(p1, dummy, count);
}

enum class Arg { P1, P2, Dst = P1, Src = P2 };

// Same as align_p1_to_next_boundary but allows for aligning p2 instead of p1.
template <size_t SIZE, Arg AlignOn, typename T1, typename T2>
void align_to_next_boundary(T1 *__restrict &p1, T2 *__restrict &p2,
                            size_t &count) {
  if constexpr (AlignOn == Arg::P1)
    align_p1_to_next_boundary<SIZE>(p1, p2, count);
  else if constexpr (AlignOn == Arg::P2)
    align_p1_to_next_boundary<SIZE>(p2, p1, count); // swapping p1 and p2.
  else
    deferred_static_assert("AlignOn must be either Arg::P1 or Arg::P2");
}

} // namespace __llvm_libc

#endif // LLVM_LIBC_SRC_MEMORY_UTILS_UTILS_H

//===-- Higher order algorithms for memory functions ----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BASE_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BASE_H

#include <stddef.h> // size_t
#include <stdint.h> // uint8_t, uint16_t, uint32_t, uint64_t

namespace __llvm_libc::base {

///////////////////////////////////////////////////////////////////////////////
// Tail operations - applies T::block on the last T::SIZE bytes.
///////////////////////////////////////////////////////////////////////////////
template <typename T>
static inline void memcpy_tail(Ptr __restrict dst, CPtr __restrict src,
                               size_t count) {
  T::block(dst + count - T::SIZE, src + count - T::SIZE);
}

template <typename T>
static inline void memset_tail(Ptr dst, uint8_t value, size_t count) {
  T::block(dst + count - T::SIZE, value);
}

template <typename T>
static inline auto cmp_tail(CPtr p1, CPtr p2, size_t count) {
  return T::block(p1 + count - T::SIZE, p2 + count - T::SIZE);
}

///////////////////////////////////////////////////////////////////////////////
// Head Tail operations - applies T::block on the first and last T::SIZE bytes.
///////////////////////////////////////////////////////////////////////////////
template <typename T>
static inline void memcpy_head_tail(Ptr __restrict dst, CPtr __restrict src,
                                    size_t count) {
  T::block(dst, src);
  T::tail(dst, src, count);
}

template <typename T>
static inline void memset_head_tail(Ptr dst, uint8_t value, size_t count) {
  T::block(dst, value);
  T::tail(dst, value, count);
}

///////////////////////////////////////////////////////////////////////////////
// Loop operations - applies T::block in a loop to cover count and ends with a
// tail operation.
///////////////////////////////////////////////////////////////////////////////
template <typename T>
static inline void memcpy_loop_and_tail(Ptr __restrict dst,
                                        CPtr __restrict src, size_t count) {
  size_t offset = 0;
#pragma nounroll
  do {
    T::block(dst + offset, src + offset);
    offset += T::SIZE;
  } while (offset < count - T::SIZE);
  T::tail(dst, src, count);
}

template <typename T>
static inline void memset_loop_and_tail(Ptr dst, uint8_t value, size_t count) {
  size_t offset = 0;
#pragma nounroll
  do {
    T::block(dst + offset, value);
    offset += T::SIZE;
  } while (offset < count - T::SIZE);
  T::tail(dst, value, count);
}

template <typename T>
static inline auto cmp_loop_and_tail(CPtr p1, CPtr p2, size_t count) {
  size_t offset = 0;
#pragma nounroll
  do {
    if (auto value = T::block(p1 + offset, p2 + offset))
      return value;
    offset += T::SIZE;
  } while (offset < count - T::SIZE);
  return T::tail(p1, p2, count);
}

///////////////////////////////////////////////////////////////////////////////
// Align operations
///////////////////////////////////////////////////////////////////////////////
template <typename T, Arg AlignOn>
static inline void memcpy_block_and_align(Ptr __restrict dst,
                                          CPtr __restrict src, size_t count) {
  T::block(dst, src);
  align_to_next_boundary<T::SIZE, AlignOn>(dst, src, count);
}

template <typename T>
static inline void memset_block_and_align(Ptr dst, uint8_t value,
                                          size_t count) {
  T::block(dst, value);
  align_to_next_boundary<T::SIZE>(dst, count);
}

template <typename T, Arg AlignOn = Arg::P1>
static inline auto cmp_block_and_align(CPtr p1, CPtr p2, size_t count) {
  using R = decltype(T::block(p1, p2));
  if (auto value = T::block(p1, p2))
    return value;
  align_to_next_boundary<T::SIZE, AlignOn>(p1, p2, count);
  return R::ZERO();
}

} // namespace __llvm_libc::base

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BASE_H

//===-- Implementation using the __builtin_XXX_inline ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides generic C++ building blocks to compose memory functions.
// They rely on the compiler to generate the best possible code through the use
// of the `__builtin_XXX_inline` builtins. These builtins are currently only
// available in Clang.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H

namespace __llvm_libc::builtin {

///////////////////////////////////////////////////////////////////////////////
// Memcpy
template <size_t Size> struct Memcpy {
  using ME = Memcpy;
  static constexpr size_t SIZE = Size;
  static inline void block(Ptr __restrict dst, CPtr __restrict src) {
#ifdef LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
    return __builtin_memcpy_inline(dst, src, SIZE);
#else
    deferred_static_assert("Missing __builtin_memcpy_inline");
    (void)dst;
    (void)src;
#endif
  }

  static inline void tail(Ptr __restrict dst, CPtr __restrict src,
                          size_t count) {
    return base::memcpy_tail<ME>(dst, src, count);
  }

  static inline void head_tail(Ptr __restrict dst, CPtr __restrict src,
                               size_t count) {
    return base::memcpy_head_tail<ME>(dst, src, count);
  }

  static inline void loop_and_tail(Ptr __restrict dst, CPtr __restrict src,
                                   size_t count) {
    return base::memcpy_loop_and_tail<ME>(dst, src, count);
  }
};

///////////////////////////////////////////////////////////////////////////////
// Memset
template <size_t Size> struct Memset {
  using ME = Memset;
  static constexpr size_t SIZE = Size;
  static inline void block(Ptr dst, uint8_t value) {
#ifdef LLVM_LIBC_HAS_BUILTIN_MEMSET_INLINE
    __builtin_memset_inline(dst, value, Size);
#else
    deferred_static_assert("Missing __builtin_memset_inline");
    (void)dst;
    (void)value;
#endif
  }

  static inline
  void tail(Ptr dst, uint8_t value, size_t count) {
    base::memset_tail<ME>(dst, value, count);
  }

  static inline void head_tail(Ptr dst, uint8_t value, size_t count) {
    base::memset_head_tail<ME>(dst, value, count);
  }

  static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
    base::memset_loop_and_tail<ME>(dst, value, count);
  }
};

///////////////////////////////////////////////////////////////////////////////
// Bcmp
template <size_t Size> struct Bcmp {
  using ME = Bcmp;
  static constexpr size_t SIZE = Size;
  static inline BcmpType block(CPtr, CPtr) {
    deferred_static_assert("Missing __builtin_memcmp_inline");
  }
  static inline BcmpType tail(CPtr, CPtr, size_t) {
    deferred_static_assert("Not implemented");
    return 0U;
  }
  static inline BcmpType head_tail(CPtr, CPtr, size_t) {
    deferred_static_assert("Not implemented");
    return 0U;
  }
  static inline BcmpType loop_and_tail(CPtr, CPtr, size_t) {
    deferred_static_assert("Not implemented");
    return 0U;
  }
};

///////////////////////////////////////////////////////////////////////////////
// Memcmp
template <size_t Size> struct Memcmp {
  using ME = Memcmp;
  static constexpr size_t SIZE = Size;
  static inline MemcmpType block(CPtr, CPtr) {
    deferred_static_assert("Missing __builtin_memcmp_inline");
  }
  static inline MemcmpType tail(CPtr, CPtr, size_t) {
    deferred_static_assert("Not implemented");
    return MemcmpType::ZERO();
  }
  static inline MemcmpType head_tail(CPtr, CPtr, size_t) {
    deferred_static_assert("Not implemented");
    return MemcmpType::ZERO();
  }
  static inline MemcmpType loop_and_tail(CPtr, CPtr, size_t) {
    deferred_static_assert("Not implemented");
    return MemcmpType::ZERO();
  }
};

} // namespace __llvm_libc::builtin

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H

//===-- Generic implementation of memory function building blocks ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides generic C++ building blocks to compose memory functions.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H

namespace __llvm_libc::generic {

// CTPair and CTMap below implement a compile time map.
// Example usage:
// using MyMap = CTMap<CTPair<1, uint8_t>,
//                     CTPair<2, uint16_t>>;
// ...
// using UInt8T = MyMap::find_type<1>;
template <size_t I, typename T> struct CTPair {
  using type = T;
  static CTPair get_pair(std::integral_constant<size_t, I>) { return {}; }
};
template <typename... Ps> struct CTMap : public Ps... {
  using Ps::get_pair...;
  template <size_t I>
  using find_type =
      typename decltype(get_pair(std::integral_constant<size_t, I>{}))::type;
};

// Helper to test if a type is void.
template <typename T> inline constexpr bool is_void_v = std::is_same_v<T, void>;

// Implements load, store and splat for unsigned integral types.
template <typename T> struct ScalarType {
  using Type = T;
  static_assert(std::is_integral_v<Type> && !std::is_signed_v<Type>);

  static inline Type load(CPtr src) { return ::__llvm_libc::load<Type>(src); }
  static inline void store(Ptr dst, Type value) {
    ::__llvm_libc::store<Type>(dst, value);
  }
  static inline void splat(Ptr dst, uint8_t value) {
    Type splatted_value = Type(~0) / Type(0xFF) * Type(value);
    store(dst, splatted_value);
  }
};

// Implements load, store and splat for vector types.
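// Aside: the CTPair/CTMap mechanism just defined can be exercised in
// isolation. The following standalone sketch (an illustration only, not part
// of the libc build) shows how overload resolution of get_pair on
// std::integral_constant selects a pair and how find_type extracts its type:

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <type_traits>

// Same shape as the CTPair/CTMap in the listing, reproduced standalone.
template <std::size_t I, typename T> struct CTPair {
  using type = T;
  static CTPair get_pair(std::integral_constant<std::size_t, I>) { return {}; }
};

template <typename... Ps> struct CTMap : public Ps... {
  using Ps::get_pair...;
  template <std::size_t I>
  using find_type = typename decltype(get_pair(
      std::integral_constant<std::size_t, I>{}))::type;
};

// Lookup is resolved entirely at compile time via overload resolution.
using SizeMap = CTMap<CTPair<1, uint8_t>, CTPair<2, uint16_t>,
                      CTPair<4, uint32_t>, CTPair<8, uint64_t>>;

static_assert(std::is_same_v<SizeMap::find_type<2>, uint16_t>);
static_assert(std::is_same_v<SizeMap::find_type<8>, uint64_t>);
```

// The static_asserts compile only if the lookup resolves as described; there
// is no runtime cost.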
template <size_t Size> struct VectorType {
  using Type = uint8_t __attribute__((__vector_size__(Size)));
  static inline Type load(CPtr src) { return ::__llvm_libc::load<Type>(src); }
  static inline void store(Ptr dst, Type value) {
    ::__llvm_libc::store<Type>(dst, value);
  }
  static inline void splat(Ptr dst, uint8_t value) {
    Type splat;
    // This for loop is optimized out for vector types.
    for (size_t i = 0; i < Size; ++i)
      splat[i] = static_cast<uint8_t>(value);
    store(dst, splat);
  }
};

// Map from sizes to structures offering static load, store and splat methods.
// Note: On platforms lacking vector support, we use the ArrayType below and
// decompose the operation in smaller pieces.
using NativeTypeMap = CTMap<CTPair<1, ScalarType<uint8_t>>,  //
                            CTPair<2, ScalarType<uint16_t>>, //
                            CTPair<4, ScalarType<uint32_t>>, //
                            CTPair<8, ScalarType<uint64_t>>, //
                            CTPair<16, VectorType<16>>,      //
                            CTPair<32, VectorType<32>>,      //
                            CTPair<64, VectorType<64>>>;

// Implements load, store and splat for sizes not natively supported by the
// platform. SubType is either ScalarType or VectorType.
template <typename SubType, size_t ArraySize> struct ArrayType {
  using Type = std::array<typename SubType::Type, ArraySize>;
  static constexpr size_t SizeOfElement = sizeof(typename SubType::Type);
  static inline Type load(CPtr src) {
    Type Value;
    for (size_t I = 0; I < ArraySize; ++I)
      Value[I] = SubType::load(src + (I * SizeOfElement));
    return Value;
  }
  static inline void store(Ptr dst, Type Value) {
    for (size_t I = 0; I < ArraySize; ++I)
      SubType::store(dst + (I * SizeOfElement), Value[I]);
  }
  static inline void splat(Ptr dst, uint8_t value) {
    for (size_t I = 0; I < ArraySize; ++I)
      SubType::splat(dst + (I * SizeOfElement), value);
  }
};

// Checks whether we should use an ArrayType.
template <size_t Size, size_t MaxSize> static constexpr bool useArrayType() {
  return (Size > MaxSize) && ((Size % MaxSize) == 0) &&
         !is_void_v<NativeTypeMap::find_type<MaxSize>>;
}

// Compute the type to handle an operation of Size bytes knowing that the
// underlying platform only supports native types up to MaxSize bytes.
template <size_t Size, size_t MaxSize>
using getTypeFor = std::conditional_t<
    useArrayType<Size, MaxSize>(),
    ArrayType<NativeTypeMap::find_type<MaxSize>, Size / MaxSize>,
    NativeTypeMap::find_type<Size>>;

///////////////////////////////////////////////////////////////////////////////
// Memcpy
// When building with clang we can delegate to the builtin implementation.
///////////////////////////////////////////////////////////////////////////////
template <size_t Size> using Memcpy = builtin::Memcpy<Size>;

///////////////////////////////////////////////////////////////////////////////
// Memset
///////////////////////////////////////////////////////////////////////////////
template <size_t Size, size_t MaxSize> struct Memset {
  static_assert(is_power2(MaxSize));
  using ME = Memset;
  static constexpr size_t SIZE = Size;
  static inline void block(Ptr dst, uint8_t value) {
    if constexpr (Size == 3) {
      Memset<1, MaxSize>::block(dst + 2, value);
      Memset<2, MaxSize>::block(dst, value);
    } else {
      using T = getTypeFor<Size, MaxSize>;
      if constexpr (is_void_v<T>) {
        deferred_static_assert("Unimplemented Size");
      } else {
        T::splat(dst, value);
      }
    }
  }

  static inline void tail(Ptr dst, uint8_t value, size_t count) {
    return base::memset_tail<ME>(dst, value, count);
  }

  static inline void head_tail(Ptr dst, uint8_t value, size_t count) {
    return base::memset_head_tail<ME>(dst, value, count);
  }

  static inline void block_and_align(Ptr dst, uint8_t value, size_t count) {
    return base::memset_block_and_align<ME>(dst, value, count);
  }

  static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
    return base::memset_loop_and_tail<ME>(dst, value, count);
  }
};
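
// Aside: the constant computed in ScalarType::splat relies on the identity
// Type(~0) / Type(0xFF) == 0x0101...01, so multiplying by the byte value
// replicates it into every byte lane. A standalone sketch (illustration only,
// not part of the libc build):

```cpp
#include <cassert>
#include <cstdint>

// Replicates a byte into every lane of an unsigned integer, exactly as
// ScalarType::splat computes its splatted_value.
template <typename T> constexpr T splat_byte(uint8_t value) {
  return T(~T(0)) / T(0xFF) * T(value);
}

static_assert(splat_byte<uint32_t>(0xAB) == 0xABABABABu);
static_assert(splat_byte<uint64_t>(0x5A) == 0x5A5A5A5A5A5A5A5Aull);
```

// For uint32_t: 0xFFFFFFFF / 0xFF == 0x01010101, and 0x01010101 * value
// places the byte in all four lanes; the same holds for any unsigned width.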
///////////////////////////////////////////////////////////////////////////////
// Bcmp
///////////////////////////////////////////////////////////////////////////////
template <size_t Size> struct Bcmp {
  using ME = Bcmp;
  static constexpr size_t SIZE = Size;

  template <typename T> static inline uint32_t load_xor(CPtr p1, CPtr p2) {
    return load<T>(p1) ^ load<T>(p2);
  }

  template <typename T>
  static inline uint32_t load_not_equal(CPtr p1, CPtr p2) {
    return load<T>(p1) != load<T>(p2);
  }

  static inline BcmpType block(CPtr p1, CPtr p2) {
    static constexpr size_t MaxSize = 8;
    if constexpr (Size == 1) {
      return load_xor<uint8_t>(p1, p2);
    } else if constexpr (Size == 2) {
      return load_xor<uint16_t>(p1, p2);
    } else if constexpr (Size == 4) {
      return load_xor<uint32_t>(p1, p2);
    } else if constexpr (Size == 8) {
      return load_not_equal<uint64_t>(p1, p2);
    } else if constexpr (useArrayType<Size, MaxSize>()) {
      for (size_t offset = 0; offset < Size; offset += MaxSize)
        if (auto value = Bcmp<MaxSize>::block(p1 + offset, p2 + offset))
          return value;
      return BcmpType::ZERO();
    } else {
      deferred_static_assert("Unimplemented Size");
    }
  }

  static inline BcmpType tail(CPtr p1, CPtr p2, size_t count) {
    return base::cmp_tail<ME>(p1, p2, count);
  }

  static inline BcmpType head_tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1, p2) | tail(p1, p2, count);
  }

  template <Arg AlignOn>
  static inline BcmpType block_and_align(CPtr p1, CPtr p2, size_t count) {
    return base::cmp_block_and_align<ME, AlignOn>(p1, p2, count);
  }

  static inline BcmpType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
    return base::cmp_loop_and_tail<ME>(p1, p2, count);
  }
};

///////////////////////////////////////////////////////////////////////////////
// Memcmp
///////////////////////////////////////////////////////////////////////////////
template <size_t Size> struct Memcmp {
  using ME = Memcmp;
  static constexpr size_t SIZE = Size;

  template <typename T> static inline T load_be(CPtr ptr) {
    return Endian::to_big_endian(load<T>(ptr));
  }
  template <typename T>
  static inline MemcmpType load_be_diff(CPtr p1, CPtr p2) {
    return load_be<T>(p1) - load_be<T>(p2);
  }

  template <typename T>
  static inline MemcmpType load_be_cmp(CPtr p1, CPtr p2) {
    const auto la = load_be<T>(p1);
    const auto lb = load_be<T>(p2);
    return la > lb ? 1 : la < lb ? -1 : 0;
  }

  static inline MemcmpType block(CPtr p1, CPtr p2) {
    static constexpr size_t MaxSize = 8;
    if constexpr (Size == 1) {
      return load_be_diff<uint8_t>(p1, p2);
    } else if constexpr (Size == 2) {
      return load_be_diff<uint16_t>(p1, p2);
    } else if constexpr (Size == 4) {
      return load_be_cmp<uint32_t>(p1, p2);
    } else if constexpr (Size == 8) {
      return load_be_cmp<uint64_t>(p1, p2);
    } else if constexpr (useArrayType<Size, MaxSize>()) {
      for (size_t offset = 0; offset < Size; offset += MaxSize)
        if (Bcmp<MaxSize>::block(p1 + offset, p2 + offset))
          return Memcmp<MaxSize>::block(p1 + offset, p2 + offset);
      return MemcmpType::ZERO();
    } else if constexpr (Size == 3) {
      if (auto value = Memcmp<2>::block(p1, p2))
        return value;
      return Memcmp<1>::block(p1 + 2, p2 + 2);
    } else {
      deferred_static_assert("Unimplemented Size");
    }
  }

  static inline MemcmpType tail(CPtr p1, CPtr p2, size_t count) {
    return base::cmp_tail<ME>(p1, p2, count);
  }

  static inline MemcmpType head_tail(CPtr p1, CPtr p2, size_t count) {
    if (auto value = block(p1, p2))
      return value;
    return tail(p1, p2, count);
  }

  template <Arg AlignOn>
  static inline MemcmpType block_and_align(CPtr p1, CPtr p2, size_t count) {
    return base::cmp_block_and_align<ME, AlignOn>(p1, p2, count);
  }

  static inline MemcmpType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
    return base::cmp_loop_and_tail<ME>(p1, p2, count);
  }
};

///////////////////////////////////////////////////////////////////////////////
// Memmove
///////////////////////////////////////////////////////////////////////////////
template <size_t Size, size_t MaxSize> struct Memmove {
  static_assert(is_power2(MaxSize));
  using ME = Memmove;
  using T = getTypeFor<Size, MaxSize>;
  static constexpr
  size_t SIZE = Size;

  static inline void block(Ptr dst, CPtr src) {
    if constexpr (is_void_v<T>) {
      deferred_static_assert("Unimplemented Size");
    } else {
      T::store(dst, T::load(src));
    }
  }

  static inline void head_tail(Ptr dst, CPtr src, size_t count) {
    const size_t offset = count - Size;
    if constexpr (is_void_v<T>) {
      deferred_static_assert("Unimplemented Size");
    } else {
      // The load and store operations can be performed in any order as long as
      // they are not interleaved. More investigations are needed to determine
      // the best order.
      const auto head = T::load(src);
      const auto tail = T::load(src + offset);
      T::store(dst, head);
      T::store(dst + offset, tail);
    }
  }

  // Align forward is suitable when dst < src. The alignment is performed with
  // a HeadTail operation of count ∈ [Alignment, 2 x Alignment].
  //
  // e.g. Moving two bytes and making sure src is then aligned.
  // [  |       |       |       |      ]
  // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_]
  // [____LLLLLLLL_____________________]
  // [___________LLLLLLLL______________]
  // [_SSSSSSSS________________________]
  // [________SSSSSSSS_________________]
  //
  // e.g. Moving two bytes and making sure dst is then aligned.
  // [  |       |       |       |      ]
  // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_]
  // [____LLLLLLLL_____________________]
  // [______LLLLLLLL___________________]
  // [_SSSSSSSS________________________]
  // [___SSSSSSSS______________________]
  template <Arg AlignOn>
  static inline void align_forward(Ptr &dst, CPtr &src, size_t &count) {
    Ptr prev_dst = dst;
    CPtr prev_src = src;
    size_t prev_count = count;
    align_to_next_boundary<Size, AlignOn>(dst, src, count);
    adjust(Size, dst, src, count);
    head_tail(prev_dst, prev_src, prev_count - count);
  }

  // Align backward is suitable when dst > src. The alignment is performed with
  // a HeadTail operation of count ∈ [Alignment, 2 x Alignment].
  //
  // e.g. Moving two bytes backward and making sure src is then aligned.
  // [  |       |       |       |      ]
  // [____XXXXXXXXXXXXXXXXXXXXXXXX_____]
  // [_________________LLLLLLLL_______]
  // [___________________LLLLLLLL_____]
  // [____________________SSSSSSSS_____]
  // [______________________SSSSSSSS___]
  //
  // e.g. Moving two bytes and making sure dst is then aligned.
  // [  |       |       |       |      ]
  // [____XXXXXXXXXXXXXXXXXXXXXXXX_____]
  // [_______________LLLLLLLL_________]
  // [___________________LLLLLLLL_____]
  // [__________________SSSSSSSS_______]
  // [______________________SSSSSSSS___]
  template <Arg AlignOn>
  static inline void align_backward(Ptr &dst, CPtr &src, size_t &count) {
    Ptr headtail_dst = dst + count;
    CPtr headtail_src = src + count;
    size_t headtail_size = 0;
    align_to_next_boundary<Size, AlignOn>(headtail_dst, headtail_src,
                                          headtail_size);
    adjust(-2 * Size, headtail_dst, headtail_src, headtail_size);
    head_tail(headtail_dst, headtail_src, headtail_size);
    count -= headtail_size;
  }

  // Move forward is suitable when dst < src. We load the tail bytes before
  // handling the loop.
  //
  // e.g. Moving two bytes
  // [   |       |       |       |       |]
  // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
  // [_________________________LLLLLLLL___]
  // [___LLLLLLLL_________________________]
  // [_SSSSSSSS___________________________]
  // [___________LLLLLLLL_________________]
  // [_________SSSSSSSS___________________]
  // [___________________LLLLLLLL_________]
  // [_________________SSSSSSSS___________]
  // [_______________________SSSSSSSS_____]
  static inline void loop_and_tail_forward(Ptr dst, CPtr src, size_t count) {
    const size_t tail_offset = count - Size;
    const auto tail_value = T::load(src + tail_offset);
    size_t offset = 0;
#pragma nounroll
    do {
      block(dst + offset, src + offset);
      offset += Size;
    } while (offset < count - Size);
    T::store(dst + tail_offset, tail_value);
  }

  // Move backward is suitable when dst > src. We load the head bytes before
  // handling the loop.
  //
  // e.g. Moving two bytes
  // [   |       |       |       |       |]
  // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
  // [___LLLLLLLL_________________________]
  // [_________________________LLLLLLLL___]
  // [___________________________SSSSSSSS_]
  // [_________________LLLLLLLL___________]
  // [___________________SSSSSSSS_________]
  // [_________LLLLLLLL___________________]
  // [___________SSSSSSSS_________________]
  // [_____SSSSSSSS_______________________]
  static inline void loop_and_tail_backward(Ptr dst, CPtr src, size_t count) {
    const auto head_value = T::load(src);
    ptrdiff_t offset = count - Size;
#pragma nounroll
    do {
      block(dst + offset, src + offset);
      offset -= Size;
    } while (offset >= 0);
    T::store(dst, head_value);
  }
};

} // namespace __llvm_libc::generic

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H

//===-- x86 implementation of memory function building blocks -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides x86 specific building blocks to compose memory functions.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H

#if defined(LLVM_LIBC_ARCH_X86_64)

#ifdef __SSE2__
#include <immintrin.h>
#endif // __SSE2__

namespace __llvm_libc::x86 {

#if defined(__AVX512F__)
static constexpr size_t MAX_REG_SIZE = 64;
#elif defined(__AVX__)
static constexpr size_t MAX_REG_SIZE = 32;
#elif defined(__SSE2__)
static constexpr size_t MAX_REG_SIZE = 16;
#else
static constexpr size_t MAX_REG_SIZE = 8;
#endif

#if defined(__AVX512BW__)
static constexpr size_t CMP_MAX_REG_SIZE = 64;
#elif defined(__AVX2__)
static constexpr size_t CMP_MAX_REG_SIZE = 32;
#elif defined(__SSE2__)
static constexpr size_t CMP_MAX_REG_SIZE = 16;
#else
static constexpr size_t CMP_MAX_REG_SIZE = 8;
#endif

///////////////////////////////////////////////////////////////////////////////
// Memcpy
template <size_t Size> using Memcpy = builtin::Memcpy<Size>;

struct MemcpyAccelerator {
  static void copy(char *dst, const char *src, size_t count) {
    asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
  }
};

///////////////////////////////////////////////////////////////////////////////
// Memmove
template <size_t Size> using Memmove = generic::Memmove<Size, MAX_REG_SIZE>;

///////////////////////////////////////////////////////////////////////////////
// Memset
template <size_t Size> using Memset = generic::Memset<Size, MAX_REG_SIZE>;

///////////////////////////////////////////////////////////////////////////////
// Bcmp
template <size_t Size> static inline auto mismatch_mask(CPtr p1, CPtr p2) {
  using T = char __attribute__((__vector_size__(Size)));
  if constexpr (Size == 16) {
    return _mm_movemask_epi8(load<T>(p1) != load<T>(p2));
  } else if constexpr (Size == 32) {
    return _mm256_movemask_epi8(load<T>(p1) != load<T>(p2));
  } else if constexpr (Size == 64) {
    return _mm512_cmpneq_epi8_mask(load<T>(p1), load<T>(p2));
  } else {
    deferred_static_assert("SIZE not implemented");
  }
}

template <size_t Size> struct Bcmp {
  using ME = Bcmp;
  static constexpr size_t SIZE = Size;

  static inline BcmpType block(CPtr p1, CPtr p2) {
    if constexpr (Size <= 8) {
      return generic::Bcmp<Size>::block(p1, p2);
    } else if constexpr ((Size == 16 || Size == 32) &&
                         Size <= CMP_MAX_REG_SIZE) {
      const int mask = mismatch_mask<Size>(p1, p2);
      return static_cast<uint32_t>(mask);
    } else if constexpr (Size == 64 && Size <= CMP_MAX_REG_SIZE) {
      const uint64_t mask = mismatch_mask<Size>(p1, p2);
      const uint32_t mask_is_set = mask != 0;
      return mask_is_set;
    } else if constexpr (Size % CMP_MAX_REG_SIZE == 0) {
      for (size_t offset = 0; offset < Size; offset += CMP_MAX_REG_SIZE)
        if (auto value =
                Bcmp<CMP_MAX_REG_SIZE>::block(p1 + offset, p2 + offset))
          return value;
      return BcmpType::ZERO();
    } else {
      deferred_static_assert("SIZE not implemented");
    }
  }

  static inline BcmpType tail(CPtr p1, CPtr p2, size_t count) {
    return base::cmp_tail<ME>(p1, p2, count);
  }

  static inline BcmpType head_tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1, p2) | tail(p1, p2, count);
  }

  template <Arg AlignOn>
  static inline BcmpType block_and_align(CPtr p1, CPtr p2, size_t count) {
    return base::cmp_block_and_align<ME, AlignOn>(p1, p2, count);
  }

  static inline BcmpType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
    return base::cmp_loop_and_tail<ME>(p1, p2, count);
  }
};

static inline MemcmpType char_diff_no_zero(CPtr p1, CPtr p2, uint64_t mask) {
  const size_t diff_index = __builtin_ctzll(mask);
  // Compare as unsigned char, as memcmp semantics require.
  const int16_t ca = static_cast<unsigned char>(p1[diff_index]);
  const int16_t cb = static_cast<unsigned char>(p2[diff_index]);
  return ca - cb;
}

///////////////////////////////////////////////////////////////////////////////
// Memcmp
template <size_t Size> struct Memcmp {
  using ME = Memcmp;
  static constexpr size_t SIZE = Size;

  static inline MemcmpType block(CPtr p1, CPtr p2) {
    if constexpr (Size <= 8) {
      return generic::Memcmp<Size>::block(p1, p2);
    } else if constexpr ((Size == 16 || Size == 32 || Size == 64) && Size <=
                         CMP_MAX_REG_SIZE) {
      if (auto mask = mismatch_mask<Size>(p1, p2))
        return char_diff_no_zero(p1, p2, mask);
      return MemcmpType::ZERO();
    } else if constexpr (Size % CMP_MAX_REG_SIZE == 0) {
      for (size_t offset = 0; offset < Size; offset += CMP_MAX_REG_SIZE)
        if (auto value =
                Memcmp<CMP_MAX_REG_SIZE>::block(p1 + offset, p2 + offset))
          return value;
      return MemcmpType::ZERO();
    } else {
      deferred_static_assert("SIZE not implemented");
    }
  }

  static inline MemcmpType tail(CPtr p1, CPtr p2, size_t count) {
    return base::cmp_tail<ME>(p1, p2, count);
  }

  static inline MemcmpType head_tail(CPtr p1, CPtr p2, size_t count) {
    if (auto value = block(p1, p2))
      return value;
    return tail(p1, p2, count);
  }

  template <Arg AlignOn>
  static inline MemcmpType block_and_align(CPtr p1, CPtr p2, size_t count) {
    return base::cmp_block_and_align<ME, AlignOn>(p1, p2, count);
  }

  static inline MemcmpType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
    return base::cmp_loop_and_tail<ME>(p1, p2, count);
  }
};

} // namespace __llvm_libc::x86

#endif // LLVM_LIBC_ARCH_X86_64

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H

//===-- aarch64 implementation of memory function building blocks ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides aarch64 specific building blocks to compose memory
// functions.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H

#if defined(LLVM_LIBC_ARCH_AARCH64)

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON

namespace __llvm_libc::aarch64 {

#ifdef __ARM_NEON
static constexpr size_t MAX_SIZE = 16;
#else
static constexpr size_t MAX_SIZE = 8;
#endif

///////////////////////////////////////////////////////////////////////////////
// Memcpy
template <size_t Size> using Memcpy = builtin::Memcpy<Size>;

///////////////////////////////////////////////////////////////////////////////
// Memset
template <size_t Size> using Memset = generic::Memset<Size, MAX_SIZE>;

template <size_t Size> struct BzeroCacheLine {
  using ME = BzeroCacheLine;
  static constexpr size_t SIZE = Size;

  static inline void block(Ptr dst, uint8_t) {
    static_assert(Size == 64);
#if __SIZEOF_POINTER__ == 4
    asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
#else
    asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
#endif
  }

  static inline void tail(Ptr dst, uint8_t value, size_t count) {
    return Memset<Size>::tail(dst, value, count);
  }

  static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
    return base::memset_loop_and_tail<ME>(dst, value, count);
  }
};

inline static bool hasZva() {
  uint64_t zva_val;
  asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
  // DC ZVA is permitted if DZP, bit [4], is zero.
  // BS, bits [3:0], is log2 of the block size in words.
  // So the next line checks whether the instruction is permitted and the block
  // size is 16 words (i.e. 64 bytes).
  return (zva_val & 0b11111) == 0b00100;
}

///////////////////////////////////////////////////////////////////////////////
// Bcmp
template <size_t Size> struct Bcmp {
  using ME = Bcmp;
  static constexpr size_t SIZE = Size;
  static constexpr size_t MAX_SIZE = 32;

  static const unsigned char *as_u8(CPtr ptr) {
    return reinterpret_cast<const unsigned char *>(ptr);
  }

  static inline BcmpType block(CPtr p1, CPtr p2) {
    if constexpr (Size <= 16) {
      return generic::Bcmp<Size>::block(p1, p2);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o). At least one byte is nonzero if there is
      // a difference between the two buffers. We reduce this value down to 4
      // bytes in two steps. First, calculate the saturated move value when
      // going from 2x64b to 2x32b. Second, compute the max of the 2x32b to get
      // a single 32 bit nonzero value if a mismatch occurred.
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr (is_power2(Size) && ((Size % MAX_SIZE) == 0)) {
      for (size_t offset = 0; offset < Size; offset += MAX_SIZE)
        if (auto value = Bcmp<MAX_SIZE>::block(p1 + offset, p2 + offset))
          return value;
      return 0U;
    } else {
      deferred_static_assert("SIZE not implemented");
    }
  }

  static inline BcmpType tail(CPtr p1, CPtr p2, size_t count) {
    return base::cmp_tail<ME>(p1, p2, count);
  }

  static inline BcmpType head_tail(CPtr p1, CPtr p2, size_t count) {
    if constexpr (Size <= 8) {
      return generic::Bcmp<Size>::head_tail(p1, p2, count);
    } else if constexpr (Size == 16) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + count - 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + count - 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o)
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
      return vmaxv_u32(anbo_reduced);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t c = vld1q_u8(_p1 + count - 16);
      uint8x16_t d = vld1q_u8(_p1 + count - 32);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t p = vld1q_u8(_p2 + count - 16);
      uint8x16_t q = vld1q_u8(_p2 + count - 32);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      uint8x16_t cp = veorq_u8(c, p);
      uint8x16_t dq = veorq_u8(d, q);
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint8x16_t cpdq = vorrq_u8(cp, dq);
      // abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)). Reduce this to
      // a nonzero 32 bit value if a mismatch occurred.
      uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq);
      uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq);
      return vmaxv_u32(abnocpdq_reduced);
    } else {
      deferred_static_assert("SIZE not implemented");
    }
  }

  static inline BcmpType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
    return base::cmp_loop_and_tail<ME>(p1, p2, count);
  }
};

///////////////////////////////////////////////////////////////////////////////
// Memcmp
template <size_t Size> using Memcmp = generic::Memcmp<Size>;

} // namespace __llvm_libc::aarch64

#endif // LLVM_LIBC_ARCH_AARCH64

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H

//===-- Implementation of bcmp --------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BCMP_IMPLEMENTATIONS_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BCMP_IMPLEMENTATIONS_H

#include <stddef.h> // size_t

namespace __llvm_libc {

static inline BcmpType inline_bcmp(CPtr p1, CPtr p2, size_t count) {
#if defined(LLVM_LIBC_ARCH_AARCH64)
  using namespace ::__llvm_libc::aarch64;
  if (likely(count <= 32)) {
    if (unlikely(count >= 16)) {
      return Bcmp<16>::head_tail(p1, p2, count);
    }
    switch (count) {
    case 0:
      return BcmpType::ZERO();
    case 1:
      return Bcmp<1>::block(p1, p2);
    case 2:
      return Bcmp<2>::block(p1, p2);
    case 3:
      return Bcmp<2>::head_tail(p1, p2, count);
    case 4:
      return Bcmp<4>::block(p1, p2);
    case 5 ... 7:
      return Bcmp<4>::head_tail(p1, p2, count);
    case 8:
      return Bcmp<8>::block(p1, p2);
    case 9 ...
15:
      return Bcmp<8>::head_tail(p1, p2, count);
    }
  }
  if (count <= 64)
    return Bcmp<32>::head_tail(p1, p2, count);
  // Aligned loop if > 256, otherwise normal loop.
  if (count > 256) {
    if (auto value = Bcmp<32>::block(p1, p2))
      return value;
    align_to_next_boundary<16, Arg::P1>(p1, p2, count);
  }
  return Bcmp<32>::loop_and_tail(p1, p2, count);
#else
#if defined(LLVM_LIBC_ARCH_X86)
  using namespace ::__llvm_libc::x86;
#else
  using namespace ::__llvm_libc::generic;
#endif
  if (count == 0)
    return BcmpType::ZERO();
  if (count == 1)
    return Bcmp<1>::block(p1, p2);
  if (count == 2)
    return Bcmp<2>::block(p1, p2);
  if (count <= 4)
    return Bcmp<2>::head_tail(p1, p2, count);
  if (count <= 8)
    return Bcmp<4>::head_tail(p1, p2, count);
  if (count <= 16)
    return Bcmp<8>::head_tail(p1, p2, count);
  if (count <= 32)
    return Bcmp<16>::head_tail(p1, p2, count);
  if (count <= 64)
    return Bcmp<32>::head_tail(p1, p2, count);
  if (count <= 128)
    return Bcmp<64>::head_tail(p1, p2, count);
  if (auto value = base::cmp_block_and_align<Bcmp<32>>(p1, p2, count))
    return value;
  return Bcmp<32>::loop_and_tail(p1, p2, count);
#endif
}

} // namespace __llvm_libc

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BCMP_IMPLEMENTATIONS_H

//===-- Implementation of memcmp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
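The `head_tail` calls in the dispatch above cover a whole range of sizes with just two overlapping fixed-size compares: one window at the front of the buffer and one at the back. A minimal portable sketch of the idea for counts in [8, 16] (hypothetical helper names; the real code uses typed vector loads, not `std::memcpy`):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

// Unaligned-safe 8-byte load via memcpy (compiles to a single load).
static inline uint64_t load64(const char *p) {
  uint64_t v;
  std::memcpy(&v, p, sizeof(v));
  return v;
}

// Hypothetical sketch: returns zero iff the buffers are equal.
// Compares [0, 8) and the overlapping window [count - 8, count),
// so every count in [8, 16] is handled branch-free.
static inline uint64_t bcmp8_head_tail(const char *p1, const char *p2,
                                       size_t count) {
  const uint64_t head = load64(p1) ^ load64(p2);
  const uint64_t tail = load64(p1 + count - 8) ^ load64(p2 + count - 8);
  return head | tail; // nonzero iff any byte differs
}
```

Bytes in the overlap region are compared twice, which benchmarks cited in these files found cheaper than extra branching.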
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMCMP_IMPLEMENTATIONS_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMCMP_IMPLEMENTATIONS_H

#include <stddef.h> // size_t

namespace __llvm_libc {

static inline MemcmpType inline_memcmp(CPtr p1, CPtr p2, size_t count) {
#if defined(LLVM_LIBC_ARCH_X86)
  /////////////////////////////////////////////////////////////////////////////
  // LLVM_LIBC_ARCH_X86
  /////////////////////////////////////////////////////////////////////////////
  using namespace __llvm_libc::x86;
  if (count == 0)
    return MemcmpType::ZERO();
  if (count == 1)
    return Memcmp<1>::block(p1, p2);
  if (count == 2)
    return Memcmp<2>::block(p1, p2);
  if (count == 3)
    return Memcmp<3>::block(p1, p2);
  if (count <= 8)
    return Memcmp<4>::head_tail(p1, p2, count);
  if (count <= 16)
    return Memcmp<8>::head_tail(p1, p2, count);
  if (count <= 32)
    return Memcmp<16>::head_tail(p1, p2, count);
  if (count <= 64)
    return Memcmp<32>::head_tail(p1, p2, count);
  if (count <= 128)
    return Memcmp<64>::head_tail(p1, p2, count);
  if (auto value = base::cmp_block_and_align<Memcmp<32>>(p1, p2, count))
    return value;
  return Memcmp<32>::loop_and_tail(p1, p2, count);
#elif defined(LLVM_LIBC_ARCH_AARCH64)
  /////////////////////////////////////////////////////////////////////////////
  // LLVM_LIBC_ARCH_AARCH64
  /////////////////////////////////////////////////////////////////////////////
  using namespace ::__llvm_libc::aarch64;
  if (count == 0) // [0, 0]
    return MemcmpType::ZERO();
  if (count == 1) // [1, 1]
    return Memcmp<1>::block(p1, p2);
  if (count == 2) // [2, 2]
    return Memcmp<2>::block(p1, p2);
  if (count == 3) // [3, 3]
    return Memcmp<3>::block(p1, p2);
  if (count < 8) // [4, 7]
    return Memcmp<4>::head_tail(p1, p2, count);
  if (count < 16) // [8, 15]
    return Memcmp<8>::head_tail(p1, p2, count);
  if (unlikely(count >= 128)) { // [128, ∞]
    if (auto value =
            base::cmp_block_and_align<Memcmp<16>>(p1, p2, count))
      return value;
    return Memcmp<32>::loop_and_tail(p1, p2, count);
  }
  if (Bcmp<16>::block(p1, p2)) // [16, 16]
    return Memcmp<16>::block(p1, p2);
  if (count < 32) // [17, 31]
    return Memcmp<16>::tail(p1, p2, count);
  if (Bcmp<16>::block(p1 + 16, p2 + 16)) // [32, 32]
    return Memcmp<16>::block(p1 + 16, p2 + 16);
  if (count < 64) // [33, 63]
    return Memcmp<32>::tail(p1, p2, count);
  // [64, 127]
  return Memcmp<16>::loop_and_tail(p1 + 32, p2 + 32, count - 32);
#else
  /////////////////////////////////////////////////////////////////////////////
  // Default
  /////////////////////////////////////////////////////////////////////////////
  using namespace ::__llvm_libc::generic;
  if (count == 0)
    return MemcmpType::ZERO();
  if (count == 1)
    return Memcmp<1>::block(p1, p2);
  if (count == 2)
    return Memcmp<2>::block(p1, p2);
  if (count == 3)
    return Memcmp<3>::block(p1, p2);
  if (count <= 8)
    return Memcmp<4>::head_tail(p1, p2, count);
  if (count <= 16)
    return Memcmp<8>::head_tail(p1, p2, count);
  if (count <= 32)
    return Memcmp<16>::head_tail(p1, p2, count);
  if (count <= 64)
    return Memcmp<32>::head_tail(p1, p2, count);
  if (count <= 128)
    return Memcmp<64>::head_tail(p1, p2, count);
  if (auto value = base::cmp_block_and_align<Memcmp<32>>(p1, p2, count))
    return value;
  return Memcmp<32>::loop_and_tail(p1, p2, count);
#endif
}

} // namespace __llvm_libc

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMCMP_IMPLEMENTATIONS_H

//===-- Implementation of memset and bzero --------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
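The AArch64 `memcmp` path above guards each ordered comparison with a cheap equality check: `Bcmp<16>::block` only tells *whether* the blocks differ, and the more expensive `Memcmp<16>::block` (which must find the first mismatching byte and order it) runs only on a mismatch. A rough portable sketch of that control flow for a single 16-byte block (hypothetical name; `std::memcmp` stands in for the vectorized bcmp primitive, and the byte loop for the ordered compare):

```cpp
#include <cstddef>
#include <cstring>

// Hypothetical sketch: equality-first ordered compare of 16 bytes.
static inline int memcmp16_sketch(const unsigned char *p1,
                                  const unsigned char *p2) {
  // Cheap equality check (stands in for Bcmp<16>::block): the common
  // "buffers are equal" case never pays for byte ordering.
  if (std::memcmp(p1, p2, 16) == 0)
    return 0;
  // Mismatch confirmed: find and order the first differing byte
  // (stands in for Memcmp<16>::block).
  for (size_t i = 0; i < 16; ++i)
    if (p1[i] != p2[i])
      return p1[i] < p2[i] ? -1 : 1;
  return 0; // unreachable given the guard above
}
```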
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H

#include <stddef.h> // size_t

namespace __llvm_libc {

// A general purpose implementation assuming cheap unaligned writes for sizes:
// 1, 2, 4, 8, 16, 32 and 64 Bytes. Note that some architectures can't store 32
// or 64 Bytes at a time; the compiler will expand such stores as needed.
//
// This implementation is subject to change as we benchmark more processors. We
// may also want to customize it for processors with specialized instructions
// that perform better (e.g. `rep stosb`).
//
// A note on the apparent discrepancy in the use of 32 vs 64 Byte writes.
// We want to balance two things here:
// - The number of redundant writes (when using `SetBlockOverlap`),
// - The number of conditionals for sizes <= 128 (~90% of memset calls are for
//   such sizes).
//
// For the range 64-128:
// - SetBlockOverlap<64> uses no conditionals but always writes 128 Bytes; this
//   is wasteful near 65 but efficient toward 128.
// - SetAlignedBlocks<32> would consume between 3 and 4 conditionals and write
//   96 or 128 Bytes.
// - Another approach could be a hybrid: copy<64>+Overlap<32> for 65-96 and
//   copy<96>+Overlap<32> for 97-128.
//
// Benchmarks showed that redundant writes were cheap (for Intel X86) but
// conditionals were expensive, even on processors that do not support writing
// 64B at a time (pre-AVX512F). We also want to favor short functions that allow
// more hot code to fit in the iL1 cache.
//
// Above 128 we have to use conditionals since we don't know the upper bound in
// advance. SetAlignedBlocks<64> may waste up to 63 Bytes, SetAlignedBlocks<32>
// may waste up to 31 Bytes. Benchmarks showed that SetAlignedBlocks<64> was not
// superior for the sizes that mattered.
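The `SetBlockOverlap` tradeoff described above, redundant writes instead of conditionals, can be illustrated with a minimal portable sketch for one size class (hypothetical name; the real code uses typed vector stores rather than `std::memset`):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

// Hypothetical sketch of an overlapping "head_tail" memset: one 32-byte
// store at the front and one at the back cover any count in [32, 64]
// with zero branches, at the cost of rewriting the overlap region.
static inline void memset32_head_tail(char *dst, uint8_t value,
                                      size_t count) {
  // Precondition: count in [32, 64].
  std::memset(dst, value, 32);              // head block
  std::memset(dst + count - 32, value, 32); // overlapping tail block
}
```

For `count == 33`, 31 of the 64 bytes written are redundant; per the benchmarks cited above, that is still cheaper than branching to pick an exact store sequence.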
inline static void inline_memset(Ptr dst, uint8_t value, size_t count) {
#if defined(LLVM_LIBC_ARCH_X86)
  /////////////////////////////////////////////////////////////////////////////
  // LLVM_LIBC_ARCH_X86
  /////////////////////////////////////////////////////////////////////////////
  using namespace __llvm_libc::x86;
  if (count == 0)
    return;
  if (count == 1)
    return Memset<1>::block(dst, value);
  if (count == 2)
    return Memset<2>::block(dst, value);
  if (count == 3)
    return Memset<3>::block(dst, value);
  if (count <= 8)
    return Memset<4>::head_tail(dst, value, count);
  if (count <= 16)
    return Memset<8>::head_tail(dst, value, count);
  if (count <= 32)
    return Memset<16>::head_tail(dst, value, count);
  if (count <= 64)
    return Memset<32>::head_tail(dst, value, count);
  if (count <= 128)
    return Memset<64>::head_tail(dst, value, count);
  // Aligned loop
  base::memset_block_and_align<Memset<32>>(dst, value, count);
  return Memset<32>::loop_and_tail(dst, value, count);
#elif defined(LLVM_LIBC_ARCH_AARCH64)
  /////////////////////////////////////////////////////////////////////////////
  // LLVM_LIBC_ARCH_AARCH64
  /////////////////////////////////////////////////////////////////////////////
  using namespace __llvm_libc::aarch64;
  if (count == 0)
    return;
  if (count <= 3) {
    Memset<1>::block(dst, value);
    if (count > 1)
      Memset<2>::tail(dst, value, count);
    return;
  }
  if (count <= 8)
    return Memset<4>::head_tail(dst, value, count);
  if (count <= 16)
    return Memset<8>::head_tail(dst, value, count);
  if (count <= 32)
    return Memset<16>::head_tail(dst, value, count);
  if (count <= (32 + 64)) {
    Memset<32>::block(dst, value);
    if (count <= 64)
      return Memset<32>::tail(dst, value, count);
    Memset<32>::block(dst + 32, value);
    Memset<32>::tail(dst, value, count);
    return;
  }
  if (count >= 448 && value == 0 && hasZva()) {
    base::memset_block_and_align<Memset<64>>(dst, 0, count);
    return BzeroCacheLine<64>::loop_and_tail(dst, 0, count);
  } else {
    base::memset_block_and_align<Memset<16>>(dst, value, count);
    return
        Memset<64>::loop_and_tail(dst, value, count);
  }
#else
  /////////////////////////////////////////////////////////////////////////////
  // Default
  /////////////////////////////////////////////////////////////////////////////
  using namespace __llvm_libc::generic;
  if (count == 0)
    return;
  if (count == 1)
    return Memset<1>::block(dst, value);
  if (count == 2)
    return Memset<2>::block(dst, value);
  if (count == 3)
    return Memset<3>::block(dst, value);
  if (count <= 8)
    return Memset<4>::head_tail(dst, value, count);
  if (count <= 16)
    return Memset<8>::head_tail(dst, value, count);
  if (count <= 32)
    return Memset<16>::head_tail(dst, value, count);
  if (count <= 64)
    return Memset<32>::head_tail(dst, value, count);
  if (count <= 128)
    return Memset<64>::head_tail(dst, value, count);
  // Aligned loop
  base::memset_block_and_align<Memset<32>>(dst, value, count);
  return Memset<32>::loop_and_tail(dst, value, count);
#endif
}

} // namespace __llvm_libc

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H

//===-- Memcpy implementation -----------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_IMPLEMENTATIONS_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_IMPLEMENTATIONS_H

#include <stddef.h> // size_t

// Design rationale
// ================
//
// Using a profiler to observe size distributions for calls into libc
// functions, it was found most operations act on a small number of bytes.
// This makes it important to favor small sizes.
//
// The tests for `count` are in ascending order so the cost of branching is
// proportional to the cost of copying.
//
// The function is written in C++ for several reasons:
// - The compiler can __see__ the code; this is useful when performing Profile
//   Guided Optimization as the optimized code can take advantage of branching
//   probabilities.
// - It also allows for easier customization and favors testing multiple
//   implementation parameters.
// - As compilers and processors get better, the generated code improves
//   with little change on the code side.

namespace __llvm_libc {

static inline void inline_memcpy(char *__restrict dst,
                                 const char *__restrict src, size_t count) {
  using namespace __llvm_libc::builtin;
#if defined(LLVM_LIBC_ARCH_X86)
  /////////////////////////////////////////////////////////////////////////////
  // LLVM_LIBC_ARCH_X86
  /////////////////////////////////////////////////////////////////////////////
  // Whether to use only rep;movsb.
  constexpr bool USE_ONLY_REP_MOVSB =
      LLVM_LIBC_IS_DEFINED(LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB);
  // REP_MOVS_B_SIZE == -1 : Only CopyAligned is used.
  // REP_MOVS_B_SIZE ==  0 : Only RepMovsb is used.
  // Otherwise CopyAligned is used up to REP_MOVS_B_SIZE, then RepMovsb.
  constexpr size_t REP_MOVS_B_SIZE =
#if defined(LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE)
      LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
#else
      -1;
#endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
  // Whether the target supports AVX instructions.
  constexpr bool HAS_AVX = LLVM_LIBC_IS_DEFINED(__AVX__);
#if defined(__AVX__)
  static constexpr size_t LoopBlockSize = 64;
#else
  static constexpr size_t LoopBlockSize = 32;
#endif
  if (USE_ONLY_REP_MOVSB)
    return x86::MemcpyAccelerator::copy(dst, src, count);
  if (count == 0)
    return;
  if (count == 1)
    return Memcpy<1>::block(dst, src);
  if (count == 2)
    return Memcpy<2>::block(dst, src);
  if (count == 3)
    return Memcpy<3>::block(dst, src);
  if (count == 4)
    return Memcpy<4>::block(dst, src);
  if (count < 8)
    return Memcpy<4>::head_tail(dst, src, count);
  if (count < 16)
    return Memcpy<8>::head_tail(dst, src, count);
  if (count < 32)
    return Memcpy<16>::head_tail(dst, src, count);
  if (count < 64)
    return Memcpy<32>::head_tail(dst, src, count);
  if (count < 128)
    return Memcpy<64>::head_tail(dst, src, count);
  if (HAS_AVX && count < 256)
    return Memcpy<128>::head_tail(dst, src, count);
  if (count <= REP_MOVS_B_SIZE) {
    base::memcpy_block_and_align<Memcpy<32>, Arg::Dst>(dst, src, count);
    return Memcpy<LoopBlockSize>::loop_and_tail(dst, src, count);
  }
  return x86::MemcpyAccelerator::copy(dst, src, count);
#elif defined(LLVM_LIBC_ARCH_AARCH64)
  /////////////////////////////////////////////////////////////////////////////
  // LLVM_LIBC_ARCH_AARCH64
  /////////////////////////////////////////////////////////////////////////////
  if (count == 0)
    return;
  if (count == 1)
    return Memcpy<1>::block(dst, src);
  if (count == 2)
    return Memcpy<2>::block(dst, src);
  if (count == 3)
    return Memcpy<3>::block(dst, src);
  if (count == 4)
    return Memcpy<4>::block(dst, src);
  if (count < 8)
    return Memcpy<4>::head_tail(dst, src, count);
  if (count < 16)
    return Memcpy<8>::head_tail(dst, src, count);
  if (count < 32)
    return Memcpy<16>::head_tail(dst, src, count);
  if (count < 64)
    return Memcpy<32>::head_tail(dst, src, count);
  if (count < 128)
    return Memcpy<64>::head_tail(dst, src, count);
  base::memcpy_block_and_align<Memcpy<16>, Arg::Src>(dst, src, count);
  return Memcpy<64>::loop_and_tail(dst, src, count);
#else
  /////////////////////////////////////////////////////////////////////////////
  // Default
  /////////////////////////////////////////////////////////////////////////////
  if (count == 0)
    return;
  if (count == 1)
    return Memcpy<1>::block(dst, src);
  if (count == 2)
    return Memcpy<2>::block(dst, src);
  if (count == 3)
    return Memcpy<3>::block(dst, src);
  if (count == 4)
    return Memcpy<4>::block(dst, src);
  if (count < 8)
    return Memcpy<4>::head_tail(dst, src, count);
  if (count < 16)
    return Memcpy<8>::head_tail(dst, src, count);
  if (count < 32)
    return Memcpy<16>::head_tail(dst, src, count);
  if (count < 64)
    return Memcpy<32>::head_tail(dst, src, count);
  if (count < 128)
    return Memcpy<64>::head_tail(dst, src, count);
  base::memcpy_block_and_align<Memcpy<32>, Arg::Src>(dst, src, count);
  return Memcpy<32>::loop_and_tail(dst, src, count);
#endif
}

} // namespace __llvm_libc

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_IMPLEMENTATIONS_H

//===-- Implementation of bzero -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BZERO_IMPLEMENTATIONS_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BZERO_IMPLEMENTATIONS_H

#include <stddef.h> // size_t

namespace __llvm_libc {

inline static void inline_bzero(char *dst, size_t count) {
  inline_memset(dst, 0, count);
}

} // namespace __llvm_libc

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BZERO_IMPLEMENTATIONS_H
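The `Memcpy<N>::head_tail` calls used throughout the dispatches above follow the same overlap pattern as the compare and set variants: for `count` in [N, 2N], copy one N-byte block from the front and one possibly overlapping N-byte block from the back, with no loop and no branch. A portable sketch (hypothetical template; the real code uses typed block copies):

```cpp
#include <cstddef>
#include <cstring>

// Hypothetical sketch of an overlapping "head_tail" copy.
// Precondition: count in [N, 2 * N]; src and dst must not overlap.
template <size_t N>
static inline void memcpy_head_tail(char *dst, const char *src,
                                    size_t count) {
  std::memcpy(dst, src, N);                         // head block
  std::memcpy(dst + count - N, src + count - N, N); // overlapping tail block
}
```

The two destination windows may overlap each other, but each byte ends up with its correct source value because both windows copy from the matching source offsets.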