Compiler Explorer

Source code

#include <type_traits>
#include <array>
#ifdef __x86_64__
#include <bit>
#endif
//===-- Common internal constructs ------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SUPPORT_COMMON_H
#define LLVM_LIBC_SUPPORT_COMMON_H

#define LIBC_INLINE_ASM __asm__ __volatile__

// Branch-prediction hints for conditions.
// Both macros normalize their argument with `!!(x)` before handing it to
// __builtin_expect, so any expression that is contextually convertible to bool
// (pointers, integers wider than long, ...) can be passed and the hint is
// always compared against exactly 0 or 1. The macros evaluate to 0 or 1.
#ifndef likely
#define likely(x) __builtin_expect(!!(x), 1)
#endif
#ifndef unlikely
#define unlikely(x) __builtin_expect(!!(x), 0)
#endif
#ifndef UNUSED
#define UNUSED __attribute__((unused))
#endif

#ifndef LLVM_LIBC_FUNCTION_ATTR
#define LLVM_LIBC_FUNCTION_ATTR
#endif

// LLVM_LIBC_FUNCTION(type, name, arglist) opens the definition of an entry
// point; the function body is written right after the macro invocation.
//
// When LLVM_LIBC_PUBLIC_PACKAGING is defined (and the target is not Apple) the
// macro expands to three pieces:
//   1. a declaration of `__<name>_impl__` that has the type of
//      `__llvm_libc::name` and carries the assembler label `<name>`,
//   2. a declaration of `name` marked `gnu::alias` of that label, and
//   3. the header `type __<name>_impl__ arglist` of the actual definition.
// In every other configuration it expands to the plain `type name arglist`.
//
// MacOS needs to be excluded because it does not support aliasing.
#if defined(LLVM_LIBC_PUBLIC_PACKAGING) && (!defined(__APPLE__))
#define LLVM_LIBC_FUNCTION(type, name, arglist)                                \
  LLVM_LIBC_FUNCTION_ATTR decltype(__llvm_libc::name)                          \
      __##name##_impl__ __asm__(#name);                                        \
  decltype(__llvm_libc::name) name [[gnu::alias(#name)]];                      \
  type __##name##_impl__ arglist
#else
#define LLVM_LIBC_FUNCTION(type, name, arglist) type name arglist
#endif

namespace __llvm_libc {
namespace internal {
// Compile-time comparison of two NUL-terminated strings.
// Returns true only when both strings have the same length and content.
constexpr bool same_string(char const *lhs, char const *rhs) {
  // Keep walking while at least one side still has characters left; a length
  // mismatch shows up as a NUL compared against a non-NUL character.
  while (*lhs != '\0' || *rhs != '\0') {
    if (*lhs != *rhs)
      return false;
    ++lhs;
    ++rhs;
  }
  return true;
}
} // namespace internal
} // namespace __llvm_libc

// LLVM_LIBC_IS_DEFINED checks whether a particular macro is defined.
// Usage: constexpr bool kUseAvx = LLVM_LIBC_IS_DEFINED(__AVX__);
//
// This works by comparing the stringified version of the macro with and without
// evaluation. If FOO is not defined, both stringifications yield "FOO". If FOO
// is defined, one stringification yields "FOO" while the other yields its
// stringified expansion (e.g. "1").
//
// Because the test is a string comparison, a macro that expands to its own
// name is indistinguishable from an undefined one.
#define LLVM_LIBC_IS_DEFINED(macro)                                            \
  !__llvm_libc::internal::same_string(                                         \
      LLVM_LIBC_IS_DEFINED__EVAL_AND_STRINGIZE(macro), #macro)
// Helper: its argument is macro-expanded before being stringified here.
#define LLVM_LIBC_IS_DEFINED__EVAL_AND_STRINGIZE(s) #s

#endif // LLVM_LIBC_SUPPORT_COMMON_H
//===-- Compile time architecture detection ---------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SUPPORT_ARCHITECTURES_H
#define LLVM_LIBC_SUPPORT_ARCHITECTURES_H

// LLVM_LIBC_ARCH_VM is set when __pnacl__ or __CLR_VER is defined; the x86
// checks below are disabled for such targets.
#if defined(__pnacl__) || defined(__CLR_VER)
#define LLVM_LIBC_ARCH_VM
#endif

// LLVM_LIBC_ARCH_X86_32: _M_IX86 or __i386__ is defined.
#if (defined(_M_IX86) || defined(__i386__)) && !defined(LLVM_LIBC_ARCH_VM)
#define LLVM_LIBC_ARCH_X86_32
#endif

// LLVM_LIBC_ARCH_X86_64: _M_X64 or __x86_64__ is defined.
#if (defined(_M_X64) || defined(__x86_64__)) && !defined(LLVM_LIBC_ARCH_VM)
#define LLVM_LIBC_ARCH_X86_64
#endif

// LLVM_LIBC_ARCH_X86: either of the two x86 flavours above.
#if defined(LLVM_LIBC_ARCH_X86_32) || defined(LLVM_LIBC_ARCH_X86_64)
#define LLVM_LIBC_ARCH_X86
#endif

// LLVM_LIBC_ARCH_ARM: __arm__ or _M_ARM is defined.
#if (defined(__arm__) || defined(_M_ARM))
#define LLVM_LIBC_ARCH_ARM
#endif

// LLVM_LIBC_ARCH_AARCH64: __aarch64__, __arm64__ or _M_ARM64 is defined.
#if defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64)
#define LLVM_LIBC_ARCH_AARCH64
#endif

// LLVM_LIBC_ARCH_ANY_ARM: either of the two ARM flavours above.
#if (defined(LLVM_LIBC_ARCH_AARCH64) || defined(LLVM_LIBC_ARCH_ARM))
#define LLVM_LIBC_ARCH_ANY_ARM
#endif

// LIBC_TARGET_HAS_FMA is defined unconditionally on AArch64, and on x86-64
// only when the compiler advertises __AVX2__ or __FMA__.
#if defined(LLVM_LIBC_ARCH_AARCH64)
#define LIBC_TARGET_HAS_FMA
#elif defined(LLVM_LIBC_ARCH_X86_64)
#if (defined(__AVX2__) || defined(__FMA__))
#define LIBC_TARGET_HAS_FMA
#endif
#endif

#endif // LLVM_LIBC_SUPPORT_ARCHITECTURES_H
//===-- Endianness support --------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_SUPPORT_ENDIAN_H
#define LLVM_LIBC_SRC_SUPPORT_ENDIAN_H

#include <stdint.h>

namespace __llvm_libc {

// We rely on compiler preprocessor defines to allow for cross compilation.
#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) ||           \
    !defined(__ORDER_BIG_ENDIAN__)
#error "Missing preprocessor definitions for endianness detection."
#endif

namespace internal {

// Byte-order conversion for uint8_t, uint16_t, uint32_t and uint64_t.
// ORDER is the byte order of the platform the code is compiled for, expressed
// with the compiler's __ORDER_*_ENDIAN__ constants.
//
// The conversion functions are only declared here; each supported
// (ORDER, type) pair is provided as an explicit specialization. This is
// deliberate:
// - a call with any other type fails to link instead of silently promoting
//   the integer, and
// - an unexpected byte order (e.g. middle-endian) has no fallback.
template <unsigned ORDER> struct Endian {
  static constexpr bool IS_BIG = (ORDER == __ORDER_BIG_ENDIAN__);
  static constexpr bool IS_LITTLE = (ORDER == __ORDER_LITTLE_ENDIAN__);
  template <typename T> static T to_little_endian(T value);
  template <typename T> static T to_big_endian(T value);
};

// Little Endian specializations.
// On a little endian platform converting to little endian is the identity and
// converting to big endian swaps the bytes (a no-op for a single byte).

// uint8_t
template <>
template <>
inline uint8_t
Endian<__ORDER_LITTLE_ENDIAN__>::to_big_endian<uint8_t>(uint8_t value) {
  return value;
}
template <>
template <>
inline uint8_t
Endian<__ORDER_LITTLE_ENDIAN__>::to_little_endian<uint8_t>(uint8_t value) {
  return value;
}

// uint16_t
template <>
template <>
inline uint16_t
Endian<__ORDER_LITTLE_ENDIAN__>::to_big_endian<uint16_t>(uint16_t value) {
  return __builtin_bswap16(value);
}
template <>
template <>
inline uint16_t
Endian<__ORDER_LITTLE_ENDIAN__>::to_little_endian<uint16_t>(uint16_t value) {
  return value;
}

// uint32_t
template <>
template <>
inline uint32_t
Endian<__ORDER_LITTLE_ENDIAN__>::to_big_endian<uint32_t>(uint32_t value) {
  return __builtin_bswap32(value);
}
template <>
template <>
inline uint32_t
Endian<__ORDER_LITTLE_ENDIAN__>::to_little_endian<uint32_t>(uint32_t value) {
  return value;
}

// uint64_t
template <>
template <>
inline uint64_t
Endian<__ORDER_LITTLE_ENDIAN__>::to_big_endian<uint64_t>(uint64_t value) {
  return __builtin_bswap64(value);
}
template <>
template <>
inline uint64_t
Endian<__ORDER_LITTLE_ENDIAN__>::to_little_endian<uint64_t>(uint64_t value) {
  return value;
}

// Big Endian specializations.
// On a big endian platform converting to big endian is the identity and
// converting to little endian swaps the bytes (a no-op for a single byte).

// uint8_t
template <>
template <>
inline uint8_t
Endian<__ORDER_BIG_ENDIAN__>::to_big_endian<uint8_t>(uint8_t value) {
  return value;
}
template <>
template <>
inline uint8_t
Endian<__ORDER_BIG_ENDIAN__>::to_little_endian<uint8_t>(uint8_t value) {
  return value;
}

// uint16_t
template <>
template <>
inline uint16_t
Endian<__ORDER_BIG_ENDIAN__>::to_big_endian<uint16_t>(uint16_t value) {
  return value;
}
template <>
template <>
inline uint16_t
Endian<__ORDER_BIG_ENDIAN__>::to_little_endian<uint16_t>(uint16_t value) {
  return __builtin_bswap16(value);
}

// uint32_t
template <>
template <>
inline uint32_t
Endian<__ORDER_BIG_ENDIAN__>::to_big_endian<uint32_t>(uint32_t value) {
  return value;
}
template <>
template <>
inline uint32_t
Endian<__ORDER_BIG_ENDIAN__>::to_little_endian<uint32_t>(uint32_t value) {
  return __builtin_bswap32(value);
}

// uint64_t
template <>
template <>
inline uint64_t
Endian<__ORDER_BIG_ENDIAN__>::to_big_endian<uint64_t>(uint64_t value) {
  return value;
}
template <>
template <>
inline uint64_t
Endian<__ORDER_BIG_ENDIAN__>::to_little_endian<uint64_t>(uint64_t value) {
  return __builtin_bswap64(value);
}

} // namespace internal

using Endian = internal::Endian<__BYTE_ORDER__>;

} // namespace __llvm_libc

#endif // LLVM_LIBC_SRC_SUPPORT_ENDIAN_H
//===-- Freestanding version of bit_cast  -----------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SUPPORT_CPP_BIT_H
#define LLVM_LIBC_SUPPORT_CPP_BIT_H

namespace __llvm_libc::cpp {

// Feature probes used by bit_cast below: record whether the compiler offers
// __builtin_bit_cast and __builtin_memcpy_inline.
#if defined __has_builtin
#if __has_builtin(__builtin_bit_cast)
#define LLVM_LIBC_HAS_BUILTIN_BIT_CAST
#endif
#endif

#if defined __has_builtin
#if __has_builtin(__builtin_memcpy_inline)
#define LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
#endif
#endif

// Reinterprets the object representation of `from` as a value of type `To`.
//
// Requirements, all checked at compile time:
// - `To` and `From` have the same size;
// - `To` and `From` are trivially copyable. The builtin path already rejects
//   other types; checking here keeps the byte-copy fallback from silently
//   copying the bytes of types for which that is not meaningful.
//
// This function guarantees the bitcast to be optimized away by the compiler for
// GCC >= 8 and Clang >= 6.
template <class To, class From> constexpr To bit_cast(const From &from) {
  static_assert(sizeof(To) == sizeof(From), "To and From must be of same size");
  static_assert(std::is_trivially_copyable_v<To>,
                "To must be trivially copyable");
  static_assert(std::is_trivially_copyable_v<From>,
                "From must be trivially copyable");
#if defined(LLVM_LIBC_HAS_BUILTIN_BIT_CAST)
  return __builtin_bit_cast(To, from);
#else
  // Fallback: copy the bytes through char pointers, which may alias anything.
  To to;
  char *dst = reinterpret_cast<char *>(&to);
  const char *src = reinterpret_cast<const char *>(&from);
#if defined(LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE)
  __builtin_memcpy_inline(dst, src, sizeof(To));
#else
  for (unsigned i = 0; i < sizeof(To); ++i)
    dst[i] = src[i];
#endif // defined(LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE)
  return to;
#endif // defined(LLVM_LIBC_HAS_BUILTIN_BIT_CAST)
}

} // namespace __llvm_libc::cpp

#endif // LLVM_LIBC_SUPPORT_CPP_BIT_H
//===-- Memory utils --------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_MEMORY_UTILS_UTILS_H
#define LLVM_LIBC_SRC_MEMORY_UTILS_UTILS_H

#include <stddef.h> // size_t
#include <stdint.h> // intptr_t / uintptr_t

namespace __llvm_libc {

// Allows compile time error reporting in `if constexpr` branches.
// The static_assert depends on the template parameter `flag` (false by
// default), so it only fires once the function is actually instantiated.
// `msg` is not used by the function body.
template <bool flag = false>
static void deferred_static_assert([[maybe_unused]] const char *msg) {
  static_assert(flag, "compilation error");
}

// Return whether `value` is zero or a power of two.
// Clearing the lowest set bit of such a value leaves nothing behind.
static constexpr bool is_power2_or_zero(size_t value) {
  const size_t without_lowest_set_bit = value & (value - 1U);
  return without_lowest_set_bit == 0;
}

// Return whether `value` is a power of two. Zero is not a power of two.
static constexpr bool is_power2(size_t value) {
  if (value == 0)
    return false;
  return is_power2_or_zero(value);
}

// Compile time version of log2 that handles 0.
// Returns the index of the highest set bit of `value`, and 0 for both 0 and 1.
static constexpr size_t log2(size_t value) {
  size_t exponent = 0;
  for (; value > 1; value /= 2)
    ++exponent;
  return exponent;
}

// Returns the largest power of two that is less than or equal to `value`
// (i.e. `value` itself when it already is a power of two), or 0 when `value`
// is 0.
static constexpr size_t le_power2(size_t value) {
  if (value == 0)
    return value;
  return 1ULL << log2(value);
}

// Returns the smallest power of two that is greater than or equal to `value`
// (i.e. `value` itself when it already is a power of two), or 0 when `value`
// is 0.
// NOTE(review): for a `value` above the largest power of two representable in
// size_t the shift amount reaches the bit width — confirm callers never do
// that.
static constexpr size_t ge_power2(size_t value) {
  if (is_power2_or_zero(value))
    return value;
  return 1ULL << (log2(value) + 1);
}

// Returns the number of bytes to subtract from ptr to get to the previous
// multiple of alignment. If ptr is already aligned returns 0.
template <size_t alignment> intptr_t offset_to_align_up(const void *ptr) {
  static_assert(is_power2(alignment), "alignment must be a power of 2");
  // For a power-of-two alignment the low bits of the address are exactly the
  // distance from the previous boundary.
  const uintptr_t address = reinterpret_cast<uintptr_t>(ptr);
  const uintptr_t low_bits_mask = alignment - 1U;
  return address & low_bits_mask;
}

// Returns the number of bytes to add to ptr to get to the next multiple of
// alignment. If ptr is already aligned returns 0.
template <size_t alignment>
intptr_t offset_to_next_aligned_or_zero(const void *ptr) {
  static_assert(is_power2(alignment), "alignment must be a power of 2");
  // Negating the address in unsigned (modulo) arithmetic and keeping the low
  // bits yields the distance to the next boundary, with 0 for an address that
  // already sits on one. Not obvious, but it compiles to very little code.
  const uintptr_t address = reinterpret_cast<uintptr_t>(ptr);
  const uintptr_t low_bits_mask = alignment - 1U;
  return -address & low_bits_mask;
}

// Returns the number of bytes to add to ptr to get to the next multiple of
// alignment. If ptr is already aligned returns alignment.
template <size_t alignment> intptr_t offset_to_next_aligned(const void *ptr) {
  const intptr_t bytes_past_boundary = offset_to_align_up<alignment>(ptr);
  return alignment - bytes_past_boundary;
}

// Returns the same pointer but notifies the compiler that it is aligned to
// `alignment` bytes. The address itself is not modified.
template <size_t alignment, typename T> static T *assume_aligned(T *ptr) {
  void *const aligned = __builtin_assume_aligned(ptr, alignment);
  return static_cast<T *>(aligned);
}

// Feature probe: LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE is defined when the
// compiler reports support for __builtin_memcpy_inline.
#if defined __has_builtin
#if __has_builtin(__builtin_memcpy_inline)
#define LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
#endif
#endif

// Feature probe: LLVM_LIBC_HAS_BUILTIN_MEMSET_INLINE is defined when the
// compiler reports support for __builtin_memset_inline.
#if defined __has_builtin
#if __has_builtin(__builtin_memset_inline)
#define LLVM_LIBC_HAS_BUILTIN_MEMSET_INLINE
#endif
#endif

// Copies exactly `Size` bytes from `src` to `dst`, where `Size` is known at
// compile time. The two regions are declared __restrict.
// Uses __builtin_memcpy_inline when it is available and a plain byte loop
// otherwise.
template <size_t Size>
static inline void memcpy_inline(void *__restrict dst,
                                 const void *__restrict src) {
#ifdef LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
  __builtin_memcpy_inline(dst, src, Size);
#else
  char *const out = static_cast<char *>(dst);
  const char *const in = static_cast<const char *>(src);
  for (size_t index = 0; index != Size; ++index)
    out[index] = in[index];
#endif
}

using Ptr = char *;        // Pointer to raw data.
using CPtr = const char *; // Const pointer to raw data.

// This type makes sure that we don't accidentally promote an integral type to
// another one. It is only constructible from the exact T type.
//
// All operations are constexpr so that the constexpr factory ZERO() can
// actually be evaluated at compile time.
template <typename T> struct StrictIntegralType {
  static_assert(std::is_integral_v<T>);

  // Can only be constructed from a T: the constructor is disabled for every
  // argument type U that is not exactly T, which rules out promotions.
  template <typename U, std::enable_if_t<std::is_same_v<U, T>, bool> = 0>
  constexpr StrictIntegralType(U value) : value(value) {}

  // Allows using the type in an if statement. True for a non-zero value.
  explicit constexpr operator bool() const { return value != 0; }

  // If type is unsigned (bcmp) we allow bitwise OR operations.
  constexpr StrictIntegralType
  operator|(const StrictIntegralType &Rhs) const {
    static_assert(!std::is_signed_v<T>);
    return value | Rhs.value;
  }

  // For interaction with the C API we allow explicit conversion back to the
  // `int` type.
  explicit constexpr operator int() const {
    // bit_cast makes sure that T and int have the same size.
    return cpp::bit_cast<int>(value);
  }

  // Helper to get the zero value.
  static inline constexpr StrictIntegralType ZERO() { return {T(0)}; }

private:
  T value;
};

using MemcmpReturnType = StrictIntegralType<int32_t>;
using BcmpReturnType = StrictIntegralType<uint32_t>;

// Reads sizeof(T) bytes starting at `ptr` (which may be unaligned) and
// returns them as a value of type T.
template <typename T> static inline T load(CPtr ptr) {
  constexpr size_t kBytes = sizeof(T);
  T value;
  memcpy_inline<kBytes>(&value, ptr);
  return value;
}

// Writes the sizeof(T) bytes of `value` to memory starting at `ptr` (which may
// be unaligned).
template <typename T> static inline void store(Ptr ptr, T value) {
  constexpr size_t kBytes = sizeof(T);
  memcpy_inline<kBytes>(ptr, &value);
}

// Moves both pointers forward by `offset` elements and shrinks `count` by the
// same amount. All three are updated in place.
template <typename T1, typename T2>
static inline void adjust(ptrdiff_t offset, T1 *__restrict &p1,
                          T2 *__restrict &p2, size_t &count) {
  p1 = p1 + offset;
  p2 = p2 + offset;
  count = count - offset;
}

// Advances p1 and p2 by the same amount so that p1 lands on the next SIZE
// bytes boundary, and decreases count accordingly. When p1 is already aligned
// the pointers still move forward by a full SIZE bytes.
// The returned p1 also carries the alignment information for the compiler.
template <size_t SIZE, typename T1, typename T2>
void align_p1_to_next_boundary(T1 *__restrict &p1, T2 *__restrict &p2,
                               size_t &count) {
  const auto bytes_to_boundary = offset_to_next_aligned<SIZE>(p1);
  adjust(bytes_to_boundary, p1, p2, count);
  p1 = assume_aligned<SIZE>(p1);
}

// Same as align_p1_to_next_boundary above but with a single pointer instead.
template <size_t SIZE, typename T1>
void align_to_next_boundary(T1 *&p1, size_t &count) {
  CPtr dummy;
  align_p1_to_next_boundary<SIZE>(p1, dummy, count);
}

// An enum class that discriminates between the first and second pointer.
// `Dst` and `Src` are alternative names for `P1` and `P2` respectively.
enum class Arg { P1, P2, Dst = P1, Src = P2 };

// Same as align_p1_to_next_boundary but `AlignOn` selects which of the two
// pointers gets aligned; the other pointer and count move by the same amount.
// Precondition: &p1 != &p2
template <size_t SIZE, Arg AlignOn, typename T1, typename T2>
void align_to_next_boundary(T1 *__restrict &p1, T2 *__restrict &p2,
                            size_t &count) {
  if constexpr (AlignOn == Arg::P2) {
    // Swap the roles of the pointers so that p2 is the one being aligned.
    align_p1_to_next_boundary<SIZE>(p2, p1, count);
  } else if constexpr (AlignOn == Arg::P1) {
    align_p1_to_next_boundary<SIZE>(p1, p2, count);
  } else {
    deferred_static_assert("AlignOn must be either Arg::P1 or Arg::P2");
  }
}

} // namespace __llvm_libc

#endif // LLVM_LIBC_SRC_MEMORY_UTILS_UTILS_H
//===-- Implementation using the __builtin_XXX_inline ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides generic C++ building blocks to compose memory functions.
// They rely on the compiler to generate the best possible code through the use
// of the `__builtin_XXX_inline` builtins. These builtins are currently only
// available in Clang.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H

namespace __llvm_libc::builtin {

///////////////////////////////////////////////////////////////////////////////
// Memcpy
template <size_t Size> struct Memcpy {
  static constexpr size_t SIZE = Size;
  static inline void block(Ptr __restrict dst, CPtr __restrict src) {
#ifdef LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
    return __builtin_memcpy_inline(dst, src, SIZE);
#else
    deferred_static_assert("Missing __builtin_memcpy_inline");
    (void)dst;
    (void)src;
#endif
  }

static inline void tail(Ptr __restrict dst, CPtr __restrict src,
                          size_t count) {
    block(dst + count - SIZE, src + count - SIZE);
  }

static inline void head_tail(Ptr __restrict dst, CPtr __restrict src,
                               size_t count) {
    block(dst, src);
    tail(dst, src, count);
  }

static inline void loop_and_tail(Ptr __restrict dst, CPtr __restrict src,
                                   size_t count) {
    size_t offset = 0;
    do {
      block(dst + offset, src + offset);
      offset += SIZE;
    } while (offset < count - SIZE);
    tail(dst, src, count);
  }
};

///////////////////////////////////////////////////////////////////////////////
// Memset
template <size_t Size> struct Memset {
  using ME = Memset;
  static constexpr size_t SIZE = Size;
  static inline void block(Ptr dst, uint8_t value) {
#ifdef LLVM_LIBC_HAS_BUILTIN_MEMSET_INLINE
    __builtin_memset_inline(dst, value, Size);
#else
    deferred_static_assert("Missing __builtin_memset_inline");
    (void)dst;
    (void)value;
#endif
  }

static inline void tail(Ptr dst, uint8_t value, size_t count) {
    block(dst + count - SIZE, value);
  }

static inline void head_tail(Ptr dst, uint8_t value, size_t count) {
    block(dst, value);
    tail(dst, value, count);
  }

static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
    size_t offset = 0;
    do {
      block(dst + offset, value);
      offset += SIZE;
    } while (offset < count - SIZE);
    tail(dst, value, count);
  }
};

///////////////////////////////////////////////////////////////////////////////
// Bcmp
// Placeholder with the same shape as the other building blocks. Every entry
// point calls deferred_static_assert, whose default instantiation fails a
// static_assert, and then returns the zero value.
template <size_t Size> struct Bcmp {
  using ME = Bcmp;
  static constexpr size_t SIZE = Size;

  static inline BcmpReturnType block([[maybe_unused]] CPtr p1,
                                     [[maybe_unused]] CPtr p2) {
    deferred_static_assert("Missing __builtin_memcmp_inline");
    return BcmpReturnType::ZERO();
  }

  static inline BcmpReturnType tail([[maybe_unused]] CPtr p1,
                                    [[maybe_unused]] CPtr p2,
                                    [[maybe_unused]] size_t count) {
    deferred_static_assert("Not implemented");
    return BcmpReturnType::ZERO();
  }

  static inline BcmpReturnType head_tail([[maybe_unused]] CPtr p1,
                                         [[maybe_unused]] CPtr p2,
                                         [[maybe_unused]] size_t count) {
    deferred_static_assert("Not implemented");
    return BcmpReturnType::ZERO();
  }

  static inline BcmpReturnType loop_and_tail([[maybe_unused]] CPtr p1,
                                             [[maybe_unused]] CPtr p2,
                                             [[maybe_unused]] size_t count) {
    deferred_static_assert("Not implemented");
    return BcmpReturnType::ZERO();
  }
};

///////////////////////////////////////////////////////////////////////////////
// Memcmp
// Placeholder with the same shape as the other building blocks. Every entry
// point calls deferred_static_assert, whose default instantiation fails a
// static_assert, and then returns the zero value.
template <size_t Size> struct Memcmp {
  using ME = Memcmp;
  static constexpr size_t SIZE = Size;

  static inline MemcmpReturnType block([[maybe_unused]] CPtr p1,
                                       [[maybe_unused]] CPtr p2) {
    deferred_static_assert("Missing __builtin_memcmp_inline");
    return MemcmpReturnType::ZERO();
  }

  static inline MemcmpReturnType tail([[maybe_unused]] CPtr p1,
                                      [[maybe_unused]] CPtr p2,
                                      [[maybe_unused]] size_t count) {
    deferred_static_assert("Not implemented");
    return MemcmpReturnType::ZERO();
  }

  static inline MemcmpReturnType head_tail([[maybe_unused]] CPtr p1,
                                           [[maybe_unused]] CPtr p2,
                                           [[maybe_unused]] size_t count) {
    deferred_static_assert("Not implemented");
    return MemcmpReturnType::ZERO();
  }

  static inline MemcmpReturnType loop_and_tail([[maybe_unused]] CPtr p1,
                                               [[maybe_unused]] CPtr p2,
                                               [[maybe_unused]] size_t count) {
    deferred_static_assert("Not implemented");
    return MemcmpReturnType::ZERO();
  }
};

} // namespace __llvm_libc::builtin

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H
//===-- Generic implementation of memory function building blocks ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides generic C++ building blocks.
// Depending on the requested size, the block operation uses unsigned integral
// types, vector types or an array of the type with the maximum size.
//
// The maximum size is passed as a template argument. For instance, on x86
// platforms that only supports integral types the maximum size would be 8
// (corresponding to uint64_t). On this platform if we request the size 32, this
// would be treated as a std::array<uint64_t, 4>.
//
// On the other hand, if the platform is x86 with support for AVX the maximum
// size is 32 and the operation can be handled with a single native operation.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H

#include <stdint.h>

namespace __llvm_libc::generic {

// CTPair and CTMap below implement a compile time map.
// This is useful to map from a Size to a type handling this size.
//
// Example usage:
// using MyMap = CTMap<CTPair<1, uint8_t>,
//                     CTPair<2, uint16_t>
//                     >;
// ...
// using UInt8T = MyMap::find_type<1>;
//
// Looking up a key that is not in the map yields `void`, so that callers can
// test the result with an is-void check instead of getting a hard error.

// A single (key, type) entry. `get_pair` is only ever used in unevaluated
// context: overload resolution on the integral_constant argument selects the
// entry whose key matches.
template <size_t I, typename T> struct CTPair {
  using type = T;
  static CTPair get_pair(std::integral_constant<size_t, I>) { return {}; }
};

template <typename... Pairs> struct CTMap : public Pairs... {
  // Result of a lookup that matches none of the entries.
  struct NotFound {
    using type = void;
  };

  using Pairs::get_pair...;
  // Fallback overload: the ellipsis is the worst possible conversion, so it is
  // selected only when no entry has the requested key. Declared only, since it
  // is used exclusively inside decltype.
  static NotFound get_pair(...);

  template <size_t I>
  using find_type =
      typename decltype(get_pair(std::integral_constant<size_t, I>{}))::type;
};

// Helper to test if a type is void.
// True only when T is exactly `void` (cv-qualified void does not match).
template <typename T> inline constexpr bool is_void_v = std::is_same_v<T, void>;

// Implements load, store and splat for unsigned integral types.
template <typename T> struct ScalarType {
  using Type = T;
  static_assert(std::is_integral_v<Type> && !std::is_signed_v<Type>);

  // Reads a Type from (possibly unaligned) memory.
  static inline Type load(CPtr src) { return ::__llvm_libc::load<Type>(src); }

  // Writes a Type to (possibly unaligned) memory.
  static inline void store(Ptr dst, Type value) {
    ::__llvm_libc::store<Type>(dst, value);
  }

  // Returns a Type in which every byte equals `value`.
  static inline Type splat(uint8_t value) {
    // all_ones / 0xFF is the pattern 0x0101...01; multiplying it by a byte
    // replicates that byte into every position.
    const Type all_ones = Type(~0);
    const Type byte_max = Type(0xFF);
    return all_ones / byte_max * Type(value);
  }
};

// Implements load, store and splat for vector types of Size bytes.
template <size_t Size> struct VectorType {
  using Type = uint8_t __attribute__((__vector_size__(Size)));

  // Reads a vector from (possibly unaligned) memory.
  static inline Type load(CPtr src) { return ::__llvm_libc::load<Type>(src); }

  // Writes a vector to (possibly unaligned) memory.
  static inline void store(Ptr dst, Type value) {
    ::__llvm_libc::store<Type>(dst, value);
  }

  // Returns a vector in which every lane equals `value`.
  static inline Type splat(uint8_t value) {
    Type filled;
    // This for loop is optimized out for vector types.
    for (size_t lane = 0; lane < Size; ++lane)
      filled[lane] = value;
    return filled;
  }
};

// We currently don't support 8- or 16-bit platforms, it must be 32- or 64-bit.
static_assert((UINTPTR_MAX == 4294967295U) ||
              (UINTPTR_MAX == 18446744073709551615UL));

// Map from sizes to structures offering static load, store and splat methods.
// Sizes 1, 2, 4 (and 8 on x86-64 / AArch64 only) map to unsigned scalar types;
// sizes 16, 32 and 64 map to vector types.
// Note: On platforms lacking vector support, we use the ArrayType below and
// decompose the operation in smaller pieces.
using NativeTypeMap =
    CTMap<CTPair<1, ScalarType<uint8_t>>,  //
          CTPair<2, ScalarType<uint16_t>>, //
          CTPair<4, ScalarType<uint32_t>>, //
#if defined(LLVM_LIBC_ARCH_X86_64) || defined(LLVM_LIBC_ARCH_AARCH64)
          CTPair<8, ScalarType<uint64_t>>, // Not available on 32bit
#endif                                     //
          CTPair<16, VectorType<16>>,      //
          CTPair<32, VectorType<32>>,      //
          CTPair<64, VectorType<64>>>;

// Implements load, store and splat for sizes not natively supported by the
// platform by treating them as ArraySize consecutive elements of SubType.
// SubType is either ScalarType or VectorType.
template <typename SubType, size_t ArraySize> struct ArrayType {
  using Type = std::array<typename SubType::Type, ArraySize>;
  static constexpr size_t SizeOfElement = sizeof(typename SubType::Type);

  // Loads the elements one after the other from consecutive memory.
  static inline Type load(CPtr src) {
    Type elements;
    for (size_t index = 0; index != ArraySize; ++index) {
      const size_t byte_offset = index * SizeOfElement;
      elements[index] = SubType::load(src + byte_offset);
    }
    return elements;
  }

  // Stores the elements one after the other to consecutive memory.
  static inline void store(Ptr dst, Type Value) {
    for (size_t index = 0; index != ArraySize; ++index) {
      const size_t byte_offset = index * SizeOfElement;
      SubType::store(dst + byte_offset, Value[index]);
    }
  }

  // Returns an array in which every element is SubType's splat of `value`.
  static inline Type splat(uint8_t value) {
    Type elements;
    for (auto &element : elements)
      element = SubType::splat(value);
    return elements;
  }
};

// Checks whether we should use an ArrayType: Size must be a strict multiple of
// MaxSize and the native type lookup for MaxSize must not be void.
template <size_t Size, size_t MaxSize> static constexpr bool useArrayType() {
  if (Size <= MaxSize)
    return false;
  if ((Size % MaxSize) != 0)
    return false;
  return !is_void_v<NativeTypeMap::find_type<MaxSize>>;
}

// Compute the type to handle an operation of Size bytes knowing that the
// underlying platform only support native types up to MaxSize bytes.
// - When useArrayType<Size, MaxSize>() holds, this is an ArrayType made of
//   Size / MaxSize elements of the native type for MaxSize.
// - Otherwise it is the native type registered for Size in NativeTypeMap.
// Both alternatives are named as arguments of std::conditional_t, so both
// lookups are performed regardless of which one is selected.
template <size_t Size, size_t MaxSize>
using getTypeFor = std::conditional_t<
    useArrayType<Size, MaxSize>(),
    ArrayType<NativeTypeMap::find_type<MaxSize>, Size / MaxSize>,
    NativeTypeMap::find_type<Size>>;

///////////////////////////////////////////////////////////////////////////////
// Memcpy
// When building with clang we can delegate to the builtin implementation.
///////////////////////////////////////////////////////////////////////////////

template <size_t Size> using Memcpy = builtin::Memcpy<Size>;

///////////////////////////////////////////////////////////////////////////////
// Memset
// The MaxSize template argument gives the maximum size handled natively by the
// platform. For instance on x86 with AVX support this would be 32. If a size
// greater than MaxSize is requested we break the operation down in smaller
// pieces of size MaxSize.
///////////////////////////////////////////////////////////////////////////////
// Building blocks for filling memory with plain loads/stores of native types.
// Size is the number of bytes handled by one block; MaxSize is the largest
// size the platform handles natively (must be a power of two).
template <size_t Size, size_t MaxSize> struct Memset {
  static_assert(is_power2(MaxSize));
  static constexpr size_t SIZE = Size;

  // Sets exactly SIZE bytes at dst to value.
  static inline void block(Ptr dst, uint8_t value) {
    if constexpr (Size == 3) {
      // No native type has three bytes: write one byte and one 2-byte block.
      Memset<1, MaxSize>::block(dst + 2, value);
      Memset<2, MaxSize>::block(dst, value);
    } else {
      using T = getTypeFor<Size, MaxSize>;
      if constexpr (is_void_v<T>) {
        deferred_static_assert("Unimplemented Size");
      } else {
        T::store(dst, T::splat(value));
      }
    }
  }

  // Sets the last SIZE bytes of a region of count bytes.
  static inline void tail(Ptr dst, uint8_t value, size_t count) {
    block(dst + count - SIZE, value);
  }

  // Sets the first SIZE bytes and the last SIZE bytes of a region of count
  // bytes; the two blocks overlap when count is smaller than 2 x SIZE.
  static inline void head_tail(Ptr dst, uint8_t value, size_t count) {
    block(dst, value);
    tail(dst, value, count);
  }

  // Sets SIZE bytes at a time from the start of the region (at least one
  // block), then finishes with a tail fill of the last SIZE bytes. Mirrors
  // the loop_and_tail entry point of the other building blocks.
  static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
    size_t offset = 0;
    do {
      block(dst + offset, value);
      offset += SIZE;
    } while (offset < count - SIZE);
    tail(dst, value, count);
  }
};

///////////////////////////////////////////////////////////////////////////////
// Bcmp
///////////////////////////////////////////////////////////////////////////////
// Building blocks for bcmp: the result is zero when the compared bytes are
// equal and non-zero otherwise. Blocks wider than MaxSize bytes are compared
// MaxSize bytes at a time.
template <size_t Size> struct Bcmp {
  static constexpr size_t SIZE = Size;
  static constexpr size_t MaxSize = 8;

  // Loads a T from each pointer and returns their XOR (zero iff equal).
  template <typename T> static inline uint32_t load_xor(CPtr p1, CPtr p2) {
    const T lhs = load<T>(p1);
    const T rhs = load<T>(p2);
    return lhs ^ rhs;
  }

  // Loads a T from each pointer and returns 1 when they differ, 0 otherwise.
  // Used where the XOR would not fit the 32-bit return type.
  template <typename T>
  static inline uint32_t load_not_equal(CPtr p1, CPtr p2) {
    const T lhs = load<T>(p1);
    const T rhs = load<T>(p2);
    return lhs != rhs;
  }

  // Compares exactly SIZE bytes.
  static inline BcmpReturnType block(CPtr p1, CPtr p2) {
    if constexpr (Size == 1) {
      return load_xor<uint8_t>(p1, p2);
    } else if constexpr (Size == 2) {
      return load_xor<uint16_t>(p1, p2);
    } else if constexpr (Size == 4) {
      return load_xor<uint32_t>(p1, p2);
    } else if constexpr (Size == 8) {
      return load_not_equal<uint64_t>(p1, p2);
    } else if constexpr (useArrayType<Size, MaxSize>()) {
      // Walk the block in chunks of MaxSize bytes and stop at the first
      // chunk that differs.
      for (size_t offset = 0; offset < Size; offset += MaxSize) {
        const auto chunk = Bcmp<MaxSize>::block(p1 + offset, p2 + offset);
        if (chunk)
          return chunk;
      }
    } else {
      deferred_static_assert("Unimplemented Size");
    }
    return BcmpReturnType::ZERO();
  }

  // Compares the last SIZE bytes of two regions of count bytes.
  static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1 + count - SIZE, p2 + count - SIZE);
  }

  // Compares the first and the last SIZE bytes of two regions of count bytes
  // and combines both results with a bitwise OR.
  static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1, p2) | tail(p1, p2, count);
  }

  // Compares SIZE bytes at a time from the start (at least one block),
  // returning early on the first difference, then compares the last SIZE
  // bytes.
  static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
    const size_t limit = count - SIZE;
    size_t offset = 0;
    do {
      const auto chunk = block(p1 + offset, p2 + offset);
      if (chunk)
        return chunk;
      offset += SIZE;
    } while (offset < limit);
    return tail(p1, p2, count);
  }
};

///////////////////////////////////////////////////////////////////////////////
// Memcmp
///////////////////////////////////////////////////////////////////////////////
// Building blocks for memcmp: the sign of the result tells which region
// compares lower, zero means equal. Values are converted to big endian before
// being compared as integers. Blocks wider than MaxSize bytes are compared
// MaxSize bytes at a time.
template <size_t Size> struct Memcmp {
  static constexpr size_t SIZE = Size;
  static constexpr size_t MaxSize = 8;

  // Loads a T from ptr and converts it to big endian.
  template <typename T> static inline T load_be(CPtr ptr) {
    const T raw = load<T>(ptr);
    return Endian::to_big_endian(raw);
  }

  // Returns the difference of the two big endian values.
  template <typename T>
  static inline MemcmpReturnType load_be_diff(CPtr p1, CPtr p2) {
    const auto lhs = load_be<T>(p1);
    const auto rhs = load_be<T>(p2);
    return lhs - rhs;
  }

  // Returns 1, -1 or 0 depending on how the two big endian values compare.
  template <typename T>
  static inline MemcmpReturnType load_be_cmp(CPtr p1, CPtr p2) {
    const auto lhs = load_be<T>(p1);
    const auto rhs = load_be<T>(p2);
    if (lhs > rhs)
      return 1;
    if (lhs < rhs)
      return -1;
    return 0;
  }

  // Compares exactly SIZE bytes.
  static inline MemcmpReturnType block(CPtr p1, CPtr p2) {
    if constexpr (Size == 1) {
      return load_be_diff<uint8_t>(p1, p2);
    } else if constexpr (Size == 2) {
      return load_be_diff<uint16_t>(p1, p2);
    } else if constexpr (Size == 4) {
      return load_be_cmp<uint32_t>(p1, p2);
    } else if constexpr (Size == 8) {
      return load_be_cmp<uint64_t>(p1, p2);
    } else if constexpr (useArrayType<Size, MaxSize>()) {
      // Locate the first chunk of MaxSize bytes that differs with the cheaper
      // Bcmp, then order that chunk with Memcmp.
      for (size_t offset = 0; offset < Size; offset += MaxSize) {
        const CPtr lhs_chunk = p1 + offset;
        const CPtr rhs_chunk = p2 + offset;
        if (Bcmp<MaxSize>::block(lhs_chunk, rhs_chunk))
          return Memcmp<MaxSize>::block(lhs_chunk, rhs_chunk);
      }
      return MemcmpReturnType::ZERO();
    } else if constexpr (Size == 3) {
      // No native type has three bytes: compare two bytes, then the last one.
      const auto leading = Memcmp<2>::block(p1, p2);
      if (leading)
        return leading;
      return Memcmp<1>::block(p1 + 2, p2 + 2);
    } else {
      deferred_static_assert("Unimplemented Size");
    }
  }

  // Compares the last SIZE bytes of two regions of count bytes.
  static inline MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1 + count - SIZE, p2 + count - SIZE);
  }

  // Compares the first SIZE bytes and, if they are equal, the last SIZE bytes
  // of two regions of count bytes.
  static inline MemcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
    const auto head = block(p1, p2);
    if (head)
      return head;
    return tail(p1, p2, count);
  }

  // Compares SIZE bytes at a time from the start (at least one block),
  // returning early on the first difference, then compares the last SIZE
  // bytes.
  static inline MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
    const size_t limit = count - SIZE;
    size_t offset = 0;
    do {
      const auto chunk = block(p1 + offset, p2 + offset);
      if (chunk)
        return chunk;
      offset += SIZE;
    } while (offset < limit);
    return tail(p1, p2, count);
  }
};

///////////////////////////////////////////////////////////////////////////////
// Memmove
///////////////////////////////////////////////////////////////////////////////

// Building blocks for memmove operating on `Size`-byte units.
// `T` is the type selected by getTypeFor<Size, MaxSize>; every operation goes
// through T::load / T::store. All loads of a step are issued before its stores
// so that the source bytes are captured before any destination byte is
// written.
template <size_t Size, size_t MaxSize> struct Memmove {
  static_assert(is_power2(MaxSize));
  using T = getTypeFor<Size, MaxSize>;
  static constexpr size_t SIZE = Size;

// Moves exactly `Size` bytes from `src` to `dst` (one load, then one store).
static inline void block(Ptr dst, CPtr src) {
    if constexpr (is_void_v<T>) {
      deferred_static_assert("Unimplemented Size");
    } else {
      T::store(dst, T::load(src));
    }
  }

// Moves `count` bytes using two `Size`-byte operations: one anchored at the
// start and one anchored at the end of the range.
// NOTE(review): presumably requires Size <= count <= 2 * Size so that the two
// operations cover the whole range - confirm against callers.
static inline void head_tail(Ptr dst, CPtr src, size_t count) {
    const size_t offset = count - Size;
    if constexpr (is_void_v<T>) {
      deferred_static_assert("Unimplemented Size");
    } else {
      // The load and store operations can be performed in any order as long as
      // they are not interleaved. More investigations are needed to determine
      // the best order.
      const auto head = T::load(src);
      const auto tail = T::load(src + offset);
      T::store(dst, head);
      T::store(dst + offset, tail);
    }
  }

  // Align forward suitable when dst < src. The alignment is performed with
  // a HeadTail operation of count ∈ [Alignment, 2 x Alignment].
  //
  // e.g. Moving two bytes forward, we make sure src is aligned.
  // [  |       |       |       |      ]
  // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_]
  // [____LLLLLLLL_____________________]
  // [___________LLLLLLLA______________]
  // [_SSSSSSSS________________________]
  // [________SSSSSSSS_________________]
  //
  // e.g. Moving two bytes forward, we make sure dst is aligned.
  // [  |       |       |       |      ]
  // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_]
  // [____LLLLLLLL_____________________]
  // [______LLLLLLLL___________________]
  // [_SSSSSSSS________________________]
  // [___SSSSSSSA______________________]
  //
  // `dst`, `src` and `count` are updated in place; the bytes skipped by the
  // update (`prev_count - count`) are moved with a single head_tail call.
  template <Arg AlignOn>
  static inline void align_forward(Ptr &dst, CPtr &src, size_t &count) {
    // Remember where the range started: the skipped prefix is moved last.
    Ptr prev_dst = dst;
    CPtr prev_src = src;
    size_t prev_count = count;
    align_to_next_boundary<Size, AlignOn>(dst, src, count);
    adjust(Size, dst, src, count);
    head_tail(prev_dst, prev_src, prev_count - count);
  }

  // Align backward suitable when dst > src. The alignment is performed with
  // a HeadTail operation of count ∈ [Alignment, 2 x Alignment].
  //
  // e.g. Moving two bytes backward, we make sure src is aligned.
  // [  |       |       |       |      ]
  // [____XXXXXXXXXXXXXXXXXXXXXXXX_____]
  // [ _________________ALLLLLLL_______]
  // [ ___________________LLLLLLLL_____]
  // [____________________SSSSSSSS_____]
  // [______________________SSSSSSSS___]
  //
  // e.g. Moving two bytes backward, we make sure dst is aligned.
  // [  |       |       |       |      ]
  // [____XXXXXXXXXXXXXXXXXXXXXXXX_____]
  // [ _______________LLLLLLLL_________]
  // [ ___________________LLLLLLLL_____]
  // [__________________ASSSSSSS_______]
  // [______________________SSSSSSSS___]
  //
  // Only `count` is updated: it is reduced by the number of trailing bytes
  // moved here. `dst` and `src` are read but never written.
  template <Arg AlignOn>
  static inline void align_backward(Ptr &dst, CPtr &src, size_t &count) {
    // Work on copies positioned at the end of the range.
    Ptr headtail_dst = dst + count;
    CPtr headtail_src = src + count;
    size_t headtail_size = 0;
    align_to_next_boundary<Size, AlignOn>(headtail_dst, headtail_src,
                                          headtail_size);
    // NOTE(review): `-2 * Size` is evaluated in size_t arithmetic; this relies
    // on adjust() taking a signed offset so the value converts back to a
    // negative displacement - confirm against adjust()'s signature.
    adjust(-2 * Size, headtail_dst, headtail_src, headtail_size);
    head_tail(headtail_dst, headtail_src, headtail_size);
    count -= headtail_size;
  }

  // Move forward suitable when dst < src. We load the tail bytes before
  // handling the loop.
  //
  // e.g. Moving two bytes
  // [   |       |       |       |       |]
  // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
  // [_________________________LLLLLLLL___]
  // [___LLLLLLLL_________________________]
  // [_SSSSSSSS___________________________]
  // [___________LLLLLLLL_________________]
  // [_________SSSSSSSS___________________]
  // [___________________LLLLLLLL_________]
  // [_________________SSSSSSSS___________]
  // [_______________________SSSSSSSS_____]
  static inline void loop_and_tail_forward(Ptr dst, CPtr src, size_t count) {
    const size_t tail_offset = count - Size;
    // Capture the last block first: the loop's stores may overwrite it.
    const auto tail_value = T::load(src + tail_offset);
    size_t offset = 0;
#pragma nounroll
    do {
      block(dst + offset, src + offset);
      offset += Size;
    } while (offset < count - Size);
    T::store(dst + tail_offset, tail_value);
  }

  // Move backward suitable when dst > src. We load the head bytes before
  // handling the loop.
  //
  // e.g. Moving two bytes
  // [   |       |       |       |       |]
  // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
  // [___LLLLLLLL_________________________]
  // [_________________________LLLLLLLL___]
  // [___________________________SSSSSSSS_]
  // [_________________LLLLLLLL___________]
  // [___________________SSSSSSSS_________]
  // [_________LLLLLLLL___________________]
  // [___________SSSSSSSS_________________]
  // [_____SSSSSSSS_______________________]
  static inline void loop_and_tail_backward(Ptr dst, CPtr src, size_t count) {
    // Capture the first block first: the loop's stores may overwrite it.
    const auto head_value = T::load(src);
    // Signed on purpose: the loop terminates once the offset drops below zero.
    ptrdiff_t offset = count - Size;
#pragma nounroll
    do {
      block(dst + offset, src + offset);
      offset -= Size;
    } while (offset >= 0);
    T::store(dst, head_value);
  }
};

} // namespace __llvm_libc::generic

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H
//===-- x86 implementation of memory function building blocks -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides x86 specific building blocks to compose memory functions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H

#if defined(LLVM_LIBC_ARCH_X86_64)

#ifdef __SSE2__
#include <immintrin.h>
#else
// Define fake functions to prevent the compiler from failing on undefined
// functions in case SSE2 is not present.
#define _mm512_cmpneq_epi8_mask(A, B) 0
#define _mm_movemask_epi8(A) 0
#define _mm256_movemask_epi8(A) 0
#endif //  __SSE2__

namespace __llvm_libc::x86 {

// A set of constants to check compile time features.
// Each flag is the result of LLVM_LIBC_IS_DEFINED on the corresponding
// compiler feature macro, so it is a compile-time constant that can drive
// `if constexpr` / constant-expression selection instead of preprocessor
// conditionals.
static inline constexpr bool kSse2 = LLVM_LIBC_IS_DEFINED(__SSE2__);
static inline constexpr bool kAvx = LLVM_LIBC_IS_DEFINED(__AVX__);
static inline constexpr bool kAvx2 = LLVM_LIBC_IS_DEFINED(__AVX2__);
static inline constexpr bool kAvx512F = LLVM_LIBC_IS_DEFINED(__AVX512F__);
static inline constexpr bool kAvx512BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__);

///////////////////////////////////////////////////////////////////////////////
// Memcpy repmovsb implementation
struct Memcpy {
  // Copies `count` bytes from `src` to `dst` with a single `rep movsb`.
  // The three operands are bound to the registers named by the "D", "S" and
  // "c" constraints and are read-write ("+"), since the instruction updates
  // them; the "memory" clobber tells the compiler that memory is modified.
  static void repmovsb(char *dst, const char *src, size_t count) {
    __asm__ __volatile__("rep movsb"
                         : "+D"(dst), "+S"(src), "+c"(count)
                         :
                         : "memory");
  }
};

///////////////////////////////////////////////////////////////////////////////
// Bcmp

// Base implementation for the Bcmp specializations.
//  - BlockSize is either 16, 32 or 64 depending on the available compile time
// features, it is used to switch between "single native operation" or a
// "sequence of native operations".
//  - BlockBcmp is the function that implements the bcmp logic.
template <size_t Size, size_t BlockSize, auto BlockBcmp> struct BcmpImpl {
  // Equality test over exactly `Size` bytes: either one native comparison or
  // a sequence of them, stopping at the first block that reports a mismatch.
  static inline BcmpReturnType block(CPtr p1, CPtr p2) {
    if constexpr (Size == BlockSize) {
      return BlockBcmp(p1, p2);
    } else if constexpr (Size % BlockSize == 0) {
      size_t pos = 0;
      while (pos < Size) {
        auto diff = BlockBcmp(p1 + pos, p2 + pos);
        if (diff)
          return diff;
        pos += BlockSize;
      }
    } else {
      deferred_static_assert("SIZE not implemented");
    }
    return BcmpReturnType::ZERO();
  }

  // Equality test over the last `Size` bytes of the `count`-byte buffers.
  static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    const auto last1 = p1 + count - Size;
    const auto last2 = p2 + count - Size;
    return block(last1, last2);
  }

  // Combines the results for the first and the last `Size` bytes with `|`;
  // both blocks are always evaluated (no early exit).
  static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1, p2) | tail(p1, p2, count);
  }

  // Tests `Size`-byte blocks from the start while the running offset is below
  // `count - Size`, then finishes with the last `Size` bytes. The first block
  // is always tested.
  static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
    size_t pos = 0;
    for (;;) {
      auto diff = block(p1 + pos, p2 + pos);
      if (diff)
        return diff;
      pos += Size;
      if (pos >= count - Size)
        break;
    }
    return tail(p1, p2, count);
  }
};

namespace sse2 {
// Equality test of 16 bytes: the result is the byte-mismatch bit mask
// produced by _mm_movemask_epi8, hence nonzero iff the blocks differ.
static inline BcmpReturnType bcmp16(CPtr p1, CPtr p2) {
  using Vec16 = char __attribute__((__vector_size__(16)));
  return static_cast<uint32_t>(
      _mm_movemask_epi8(load<Vec16>(p1) != load<Vec16>(p2)));
}
template <size_t Size> using Bcmp = BcmpImpl<Size, 16, bcmp16>;
} // namespace sse2

namespace avx2 {
// Equality test of 32 bytes: the result is the byte-mismatch bit mask,
// hence nonzero iff the blocks differ.
static inline BcmpReturnType bcmp32(CPtr p1, CPtr p2) {
  using Vec32 = char __attribute__((__vector_size__(32)));
  // _mm256_movemask_epi8 returns an int, but the value is a 32-bit mask, so it
  // is reinterpreted as unsigned before being returned.
  return static_cast<uint32_t>(
      _mm256_movemask_epi8(load<Vec32>(p1) != load<Vec32>(p2)));
}
template <size_t Size> using Bcmp = BcmpImpl<Size, 32, bcmp32>;
} // namespace avx2

namespace avx512bw {
// Equality test of 64 bytes. The 64-bit mismatch mask does not fit in the
// 32-bit value returned here, so it is collapsed to 0 (equal) or 1 (different).
static inline BcmpReturnType bcmp64(CPtr p1, CPtr p2) {
  using Vec64 = char __attribute__((__vector_size__(64)));
  const uint64_t diff_mask =
      _mm512_cmpneq_epi8_mask(load<Vec64>(p1), load<Vec64>(p2));
  return static_cast<uint32_t>(diff_mask != 0);
}
template <size_t Size> using Bcmp = BcmpImpl<Size, 64, bcmp64>;
} // namespace avx512bw

// Assuming that the mask is non zero, the index of the first mismatching byte
// is the number of trailing zeros in the mask. Trailing zeros and not leading
// zeros because the x86 architecture is little endian.
// Returns the difference between the first mismatching bytes of `p1` and `p2`.
//  - `mask` must be nonzero; bit i set means byte i differs. The index of the
//    first mismatch is the number of trailing zeros of the mask.
//  - The bytes are widened through `unsigned char` before subtracting: memcmp
//    orders buffers by their bytes interpreted as unsigned char, and widening
//    a plain (possibly signed) char directly would sign-extend bytes >= 0x80
//    and flip the sign of the result.
static inline MemcmpReturnType char_diff_no_zero(CPtr p1, CPtr p2,
                                                 uint64_t mask) {
  const size_t diff_index = __builtin_ctzll(mask);
  const int16_t ca = static_cast<unsigned char>(p1[diff_index]);
  const int16_t cb = static_cast<unsigned char>(p2[diff_index]);
  return ca - cb;
}

///////////////////////////////////////////////////////////////////////////////
// Memcmp

// Base implementation for the Memcmp specializations.
//  - BlockSize is either 16, 32 or 64 depending on the available compile time
// features, it is used to switch between "single native operation" or a
// "sequence of native operations".
//  - BlockMemcmp is the function that implements the memcmp logic.
//  - BlockBcmp is the function that implements the bcmp logic.
template <size_t Size, size_t BlockSize, auto BlockMemcmp, auto BlockBcmp>
struct MemcmpImpl {
  // Three-way comparison of exactly `Size` bytes. For multi-block sizes the
  // equality test locates the first differing block, and only that block goes
  // through the ordered comparison.
  static inline MemcmpReturnType block(CPtr p1, CPtr p2) {
    if constexpr (Size == BlockSize) {
      return BlockMemcmp(p1, p2);
    } else if constexpr (Size % BlockSize == 0) {
      size_t pos = 0;
      while (pos < Size) {
        if (BlockBcmp(p1 + pos, p2 + pos))
          return BlockMemcmp(p1 + pos, p2 + pos);
        pos += BlockSize;
      }
    } else {
      deferred_static_assert("SIZE not implemented");
    }
    return MemcmpReturnType::ZERO();
  }

  // Three-way comparison of the last `Size` bytes of the `count`-byte buffers.
  static inline MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    const auto last1 = p1 + count - Size;
    const auto last2 = p2 + count - Size;
    return block(last1, last2);
  }

  // Compares the first block and, only if it is equal, the last block.
  static inline MemcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
    auto head = block(p1, p2);
    if (head)
      return head;
    return tail(p1, p2, count);
  }

  // Compares `Size`-byte blocks from the start while the running offset is
  // below `count - Size`, then finishes with the last `Size` bytes. The first
  // block is always compared.
  static inline MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
    size_t pos = 0;
    for (;;) {
      auto diff = block(p1 + pos, p2 + pos);
      if (diff)
        return diff;
      pos += Size;
      if (pos >= count - Size)
        break;
    }
    return tail(p1, p2, count);
  }
};

namespace sse2 {
// Three-way comparison of 16 bytes: builds the byte-mismatch mask and, when it
// is nonzero, orders the buffers on the first mismatching byte.
static inline MemcmpReturnType memcmp16(CPtr p1, CPtr p2) {
  using Vec16 = char __attribute__((__vector_size__(16)));
  const int diff_mask = _mm_movemask_epi8(load<Vec16>(p1) != load<Vec16>(p2));
  if (diff_mask == 0)
    return MemcmpReturnType::ZERO();
  return char_diff_no_zero(p1, p2, diff_mask);
}
template <size_t Size> using Memcmp = MemcmpImpl<Size, 16, memcmp16, bcmp16>;
} // namespace sse2

namespace avx2 {
// Three-way comparison of 32 bytes: builds the byte-mismatch mask and, when it
// is nonzero, orders the buffers on the first mismatching byte.
static inline MemcmpReturnType memcmp32(CPtr p1, CPtr p2) {
  using Vec32 = char __attribute__((__vector_size__(32)));
  const int diff_mask =
      _mm256_movemask_epi8(load<Vec32>(p1) != load<Vec32>(p2));
  if (diff_mask == 0)
    return MemcmpReturnType::ZERO();
  // A negative mask (bit 31 set) sign-extends when widened to 64 bits; that
  // only adds high bits and leaves the lowest set bit unchanged.
  return char_diff_no_zero(p1, p2, diff_mask);
}
template <size_t Size> using Memcmp = MemcmpImpl<Size, 32, memcmp32, bcmp32>;
} // namespace avx2

namespace avx512bw {
// Three-way comparison of 64 bytes: builds the byte-mismatch mask and, when it
// is nonzero, orders the buffers on the first mismatching byte.
static inline MemcmpReturnType memcmp64(CPtr p1, CPtr p2) {
  using Vec64 = char __attribute__((__vector_size__(64)));
  const uint64_t diff_mask =
      _mm512_cmpneq_epi8_mask(load<Vec64>(p1), load<Vec64>(p2));
  if (diff_mask == 0)
    return MemcmpReturnType::ZERO();
  return char_diff_no_zero(p1, p2, diff_mask);
}
template <size_t Size> using Memcmp = MemcmpImpl<Size, 64, memcmp64, bcmp64>;
} // namespace avx512bw

} // namespace __llvm_libc::x86

#endif // LLVM_LIBC_ARCH_X86_64

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
//===-- aarch64 implementation of memory function building blocks ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file provides aarch64 specific building blocks to compose memory
// functions.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H

#if defined(LLVM_LIBC_ARCH_AARCH64)

#ifdef __ARM_NEON
#include <arm_neon.h>
#endif //__ARM_NEON

namespace __llvm_libc::aarch64 {

static inline constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON);

namespace neon {

// Zeroing building block based on the `dc zva` instruction. Only Size == 64 is
// supported.
template <size_t Size> struct BzeroCacheLine {
  static constexpr size_t SIZE = Size;

  // Issues one `dc zva` on `dst`. The byte value argument is ignored.
  static inline void block(Ptr dst, uint8_t) {
    static_assert(Size == 64);
#if __SIZEOF_POINTER__ == 4
    // 32-bit pointers: use the `w` operand modifier for the address register.
    asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
#else
    asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
#endif
  }

  // Applies block() every SIZE bytes while the running offset is below
  // `count - SIZE`, then writes the last SIZE bytes with regular stores.
  static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
    size_t pos = 0;
    for (;;) {
      block(dst + pos, value);
      pos += SIZE;
      if (pos >= count - SIZE)
        break;
    }
    // Unaligned store, we can't use 'dc zva' here.
    static constexpr size_t kMaxSize = kNeon ? 16 : 8;
    generic::Memset<Size, kMaxSize>::tail(dst, value, count);
  }
};

// Reads DCZID_EL0 and reports whether `dc zva` may be used with 64-byte
// blocks.
inline static bool hasZva() {
  uint64_t dczid;
  asm("mrs %[dczid], dczid_el0" : [dczid] "=r"(dczid));
  // DZP, bit [4]: DC ZVA is permitted only when it is zero.
  // BS, bits [3:0]: log2 of the block count in words.
  // Together the low five bits must read 0b00100: permitted, with a block of
  // 16 words (i.e. 64 bytes).
  constexpr uint64_t kDzpAndBsMask = 0b11111;
  constexpr uint64_t kPermittedWith64ByteBlocks = 0b00100;
  return (dczid & kDzpAndBsMask) == kPermittedWith64ByteBlocks;
}

} // namespace neon

///////////////////////////////////////////////////////////////////////////////
// Memset

///////////////////////////////////////////////////////////////////////////////
// Bcmp
template <size_t Size> struct Bcmp {
  static constexpr size_t SIZE = Size;
  static constexpr size_t BlockSize = 32;

static const unsigned char *as_u8(CPtr ptr) {
    return reinterpret_cast<const unsigned char *>(ptr);
  }

static inline BcmpReturnType block(CPtr p1, CPtr p2) {
    if constexpr (Size == BlockSize) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t an = veorq_u8(a, n);
      uint8x16_t bo = veorq_u8(b, o);
      // anbo = (a ^ n) | (b ^ o).  At least one byte is nonzero if there is
      // a difference between the two buffers.  We reduce this value down to 4
      // bytes in two steps. First, calculate the saturated move value when
      // going from 2x64b to 2x32b. Second, compute the max of the 2x32b to get
      // a single 32 bit nonzero value if a mismatch occurred.
      uint8x16_t anbo = vorrq_u8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(anbo);
      return vmaxv_u32(anbo_reduced);
    } else if constexpr ((Size % BlockSize) == 0) {
      for (size_t offset = 0; offset < Size; offset += BlockSize)
        if (auto value = Bcmp<BlockSize>::block(p1 + offset, p2 + offset))
          return value;
    } else {
      deferred_static_assert("SIZE not implemented");
    }
    return BcmpReturnType::ZERO();
  }

static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
    return block(p1 + count - SIZE, p2 + count - SIZE);
  }

static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
    if constexpr (Size <= 8) {
      return generic::Bcmp<Size>::head_tail(p1, p2, count);
    } else if constexpr (Size == 16) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + count - 16);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + count - 16);
      uint8x16_t an = veorq_s8(a, n);
      uint8x16_t bo = veorq_s8(b, o);
      // anbo = (a ^ n) | (b ^ o)
      uint8x16_t anbo = vorrq_s8(an, bo);
      uint32x2_t anbo_reduced = vqmovn_u64(anbo);
      return vmaxv_u32(anbo_reduced);
    } else if constexpr (Size == 32) {
      auto _p1 = as_u8(p1);
      auto _p2 = as_u8(p2);
      uint8x16_t a = vld1q_u8(_p1);
      uint8x16_t b = vld1q_u8(_p1 + 16);
      uint8x16_t c = vld1q_u8(_p1 + count - 16);
      uint8x16_t d = vld1q_u8(_p1 + count - 32);
      uint8x16_t n = vld1q_u8(_p2);
      uint8x16_t o = vld1q_u8(_p2 + 16);
      uint8x16_t p = vld1q_u8(_p2 + count - 16);
      uint8x16_t q = vld1q_u8(_p2 + count - 32);
      uint8x16_t an = veorq_s8(a, n);
      uint8x16_t bo = veorq_s8(b, o);
      uint8x16_t cp = veorq_s8(c, p);
      uint8x16_t dq = veorq_s8(d, q);
      uint8x16_t anbo = vorrq_s8(an, bo);
      uint8x16_t cpdq = vorrq_s8(cp, dq);
      // abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)).  Reduce this to
      // a nonzero 32 bit value if a mismatch occurred.
      uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq);
      uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq);
      return vmaxv_u32(abnocpdq_reduced);
    } else {
      deferred_static_assert("SIZE not implemented");
    }
    return BcmpReturnType::ZERO();
  }

} // namespace __llvm_libc::aarch64

#endif // LLVM_LIBC_ARCH_AARCH64

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
//===-- Implementation of bcmp --------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BCMP_IMPLEMENTATIONS_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BCMP_IMPLEMENTATIONS_H

#include <stddef.h> // size_t

namespace __llvm_libc {

// Equality test of `count` bytes at `p1` and `p2`, dispatching on `count` to
// the cheapest sequence of building blocks. The result is zero iff the
// buffers are equal.
static inline BcmpReturnType inline_bcmp(CPtr p1, CPtr p2, size_t count) {
#if defined(LLVM_LIBC_ARCH_AARCH64)
  if (likely(count <= 32)) {
    // [16, 32]: two overlapping 16-byte blocks.
    if (unlikely(count >= 16))
      return generic::Bcmp<16>::head_tail(p1, p2, count);
    // [0, 15]: every case returns.
    switch (count) {
    case 0:
      return BcmpReturnType::ZERO();
    case 1:
      return generic::Bcmp<1>::block(p1, p2);
    case 2:
      return generic::Bcmp<2>::block(p1, p2);
    case 3:
      return generic::Bcmp<2>::head_tail(p1, p2, count);
    case 4:
      return generic::Bcmp<4>::block(p1, p2);
    case 5 ... 7:
      return generic::Bcmp<4>::head_tail(p1, p2, count);
    case 8:
      return generic::Bcmp<8>::block(p1, p2);
    case 9 ... 15:
      return generic::Bcmp<8>::head_tail(p1, p2, count);
    }
  }

  // [33, 64]: two overlapping 32-byte blocks.
  if (count <= 64)
    return generic::Bcmp<32>::head_tail(p1, p2, count);

  // Aligned loop if > 256, otherwise normal loop
  if (count > 256) {
    auto head = generic::Bcmp<32>::block(p1, p2);
    if (head)
      return head;
    align_to_next_boundary<16, Arg::P1>(p1, p2, count);
  }
  return generic::Bcmp<32>::loop_and_tail(p1, p2, count);
#else
  // LLVM_LIBC_ARCH_X86 and every other architecture share this sequence.
  switch (count) {
  case 0:
    return BcmpReturnType::ZERO();
  case 1:
    return generic::Bcmp<1>::block(p1, p2);
  case 2:
    return generic::Bcmp<2>::block(p1, p2);
  default:
    break;
  }
  // From here on, each range is covered by two overlapping blocks.
  if (count <= 4)
    return generic::Bcmp<2>::head_tail(p1, p2, count);
  if (count <= 8)
    return generic::Bcmp<4>::head_tail(p1, p2, count);
  if (count <= 16)
    return generic::Bcmp<8>::head_tail(p1, p2, count);
  if (count <= 32)
    return generic::Bcmp<16>::head_tail(p1, p2, count);
  if (count <= 64)
    return generic::Bcmp<32>::head_tail(p1, p2, count);
  if (count <= 128)
    return generic::Bcmp<64>::head_tail(p1, p2, count);
  // Above 128 bytes: test the first block, align p1, then loop.
  auto head = generic::Bcmp<32>::block(p1, p2);
  if (head)
    return head;
  align_to_next_boundary<32, Arg::P1>(p1, p2, count);
  return generic::Bcmp<32>::loop_and_tail(p1, p2, count);
#endif
}

} // namespace __llvm_libc

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BCMP_IMPLEMENTATIONS_H
//===-- Implementation of memcmp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMCMP_IMPLEMENTATIONS_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMCMP_IMPLEMENTATIONS_H

#include <stddef.h> // size_t

namespace __llvm_libc {

// Generic three-way comparison, used by inline_memcmp once counts up to 16
// have been handled. Above 512 bytes the first 16-byte block is compared
// separately so that p1 can then be moved to a 16-byte boundary before the
// loop.
static inline MemcmpReturnType inline_memcmp_generic_gt16(CPtr p1, CPtr p2,
                                                          size_t count) {
  using Block = generic::Memcmp<16>;
  if (count > 512) {
    auto head = Block::block(p1, p2);
    if (head)
      return head;
    align_to_next_boundary<16, Arg::P1>(p1, p2, count);
  }
  return Block::loop_and_tail(p1, p2, count);
}

#if defined(LLVM_LIBC_ARCH_X86)
// SSE2 three-way comparison, used by inline_memcmp once counts up to 16 have
// been handled. Above 512 bytes the first 16-byte block is compared
// separately so that p1 can then be moved to a 16-byte boundary before the
// loop.
static inline MemcmpReturnType inline_memcmp_x86_sse2_gt16(CPtr p1, CPtr p2,
                                                           size_t count) {
  using Block = x86::sse2::Memcmp<16>;
  if (count > 512) {
    auto head = Block::block(p1, p2);
    if (head)
      return head;
    align_to_next_boundary<16, Arg::P1>(p1, p2, count);
  }
  return Block::loop_and_tail(p1, p2, count);
}

// AVX2 three-way comparison, used by inline_memcmp once counts up to 16 have
// been handled. Up to 128 bytes the range is covered by two overlapping
// blocks; beyond that the first 32-byte block is compared, p1 is moved to a
// 32-byte boundary and the rest is handled by a 32-byte loop.
static inline MemcmpReturnType inline_memcmp_x86_avx2_gt16(CPtr p1, CPtr p2,
                                                           size_t count) {
  if (count <= 32)
    return x86::sse2::Memcmp<16>::head_tail(p1, p2, count);
  if (count <= 64)
    return x86::avx2::Memcmp<32>::head_tail(p1, p2, count);
  if (count <= 128)
    return x86::avx2::Memcmp<64>::head_tail(p1, p2, count);

  using LoopBlock = x86::avx2::Memcmp<32>;
  auto head = LoopBlock::block(p1, p2);
  if (head)
    return head;
  align_to_next_boundary<32, Arg::P1>(p1, p2, count);
  return LoopBlock::loop_and_tail(p1, p2, count);
}

// AVX512BW three-way comparison, used by inline_memcmp once counts up to 16
// have been handled. Up to 128 bytes the range is covered by two overlapping
// blocks (64-byte ones for (64, 128]); beyond that the first 32-byte block is
// compared, p1 is moved to a 32-byte boundary and the rest is handled by the
// AVX2 32-byte loop.
static inline MemcmpReturnType inline_memcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2,
                                                               size_t count) {
  if (count <= 32)
    return x86::sse2::Memcmp<16>::head_tail(p1, p2, count);
  if (count <= 64)
    return x86::avx2::Memcmp<32>::head_tail(p1, p2, count);
  if (count <= 128)
    return x86::avx512bw::Memcmp<64>::head_tail(p1, p2, count);

  using LoopBlock = x86::avx2::Memcmp<32>;
  auto head = LoopBlock::block(p1, p2);
  if (head)
    return head;
  align_to_next_boundary<32, Arg::P1>(p1, p2, count);
  return LoopBlock::loop_and_tail(p1, p2, count);
}
#endif // defined(LLVM_LIBC_ARCH_X86)

#if defined(LLVM_LIBC_ARCH_AARCH64)
// NEON three-way comparison, used by inline_memcmp once counts up to 16 have
// been handled.
//
// For counts below 128 the buffers are compared front to back in 16-byte
// steps, so every byte is covered before the tail / loop helpers run:
//   bytes [0, 16)   explicit block, always compared first
//   bytes [16, 32)  explicit block when count >= 32
//   remaining bytes overlapping tail, or a 16-byte loop from offset 32
// The cheap equality test (Bcmp) is used to find the differing block; the
// ordered comparison (Memcmp) only runs on that block.
static inline MemcmpReturnType inline_memcmp_aarch64_neon_gt16(CPtr p1, CPtr p2,
                                                               size_t count) {
  if (unlikely(count >= 128)) { // [128, ∞]
    if (auto value = generic::Memcmp<16>::block(p1, p2))
      return value;
    align_to_next_boundary<16, Arg::P1>(p1, p2, count);
    return generic::Memcmp<32>::loop_and_tail(p1, p2, count);
  }
  // The first 16 bytes must be compared before anything else: the tail and
  // loop calls below only cover bytes from `count - 16` / offset 16 onwards.
  if (generic::Bcmp<16>::block(p1, p2)) // [16, 16]
    return generic::Memcmp<16>::block(p1, p2);
  if (count < 32) // [17, 31]
    return generic::Memcmp<16>::tail(p1, p2, count);
  if (generic::Bcmp<16>::block(p1 + 16, p2 + 16)) // [32, 32]
    return generic::Memcmp<16>::block(p1 + 16, p2 + 16);
  if (count < 64) // [33, 63]
    return generic::Memcmp<32>::tail(p1, p2, count);
  // [64, 127]
  return generic::Memcmp<16>::loop_and_tail(p1 + 32, p2 + 32, count - 32);
}
#endif // defined(LLVM_LIBC_ARCH_AARCH64)

// Three-way comparison of `count` bytes at `p1` and `p2`. Counts up to 16 are
// handled here with generic building blocks; larger counts are forwarded to
// the best `*_gt16` helper for the features enabled at compile time.
static inline MemcmpReturnType inline_memcmp(CPtr p1, CPtr p2, size_t count) {
  switch (count) {
  case 0:
    return MemcmpReturnType::ZERO();
  case 1:
    return generic::Memcmp<1>::block(p1, p2);
  case 2:
    return generic::Memcmp<2>::block(p1, p2);
  case 3:
    return generic::Memcmp<3>::block(p1, p2);
  default:
    break;
  }
  // [4, 8] and [9, 16]: two overlapping blocks.
  if (count <= 8)
    return generic::Memcmp<4>::head_tail(p1, p2, count);
  if (count <= 16)
    return generic::Memcmp<8>::head_tail(p1, p2, count);
#if defined(LLVM_LIBC_ARCH_X86)
  /////////////////////////////////////////////////////////////////////////////
  // LLVM_LIBC_ARCH_X86
  /////////////////////////////////////////////////////////////////////////////
  if constexpr (x86::kAvx512BW) {
    return inline_memcmp_x86_avx512bw_gt16(p1, p2, count);
  } else if constexpr (x86::kAvx2) {
    return inline_memcmp_x86_avx2_gt16(p1, p2, count);
  } else if constexpr (x86::kSse2) {
    return inline_memcmp_x86_sse2_gt16(p1, p2, count);
  } else {
    return inline_memcmp_generic_gt16(p1, p2, count);
  }
#elif defined(LLVM_LIBC_ARCH_AARCH64)
  /////////////////////////////////////////////////////////////////////////////
  // LLVM_LIBC_ARCH_AARCH64
  /////////////////////////////////////////////////////////////////////////////
  if constexpr (aarch64::kNeon) {
    return inline_memcmp_aarch64_neon_gt16(p1, p2, count);
  } else {
    return inline_memcmp_generic_gt16(p1, p2, count);
  }
#else
  /////////////////////////////////////////////////////////////////////////////
  // Default
  /////////////////////////////////////////////////////////////////////////////
  return inline_memcmp_generic_gt16(p1, p2, count);
#endif
}

} // namespace __llvm_libc

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMCMP_IMPLEMENTATIONS_H
//===-- Implementation of memset and bzero --------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H

#include <stddef.h> // size_t

namespace __llvm_libc {

// A general purpose implementation assuming cheap unaligned writes for sizes:
// 1, 2, 4, 8, 16, 32 and 64 Bytes. Note that some architecture can't store 32
// or 64 Bytes at a time, the compiler will expand them as needed.
//
// This implementation is subject to change as we benchmark more processors. We
// may also want to customize it for processors with specialized instructions
// that performs better (e.g. `rep stosb`).
//
// A note on the apparent discrepancy in the use of 32 vs 64 Bytes writes.
// We want to balance two things here:
//  - The number of redundant writes (when using `SetBlockOverlap`),
//  - The number of conditionals for sizes <=128 (~90% of memset calls are for
//    such sizes).
//
// For the range 64-128:
//  - SetBlockOverlap<64> uses no conditionals but always writes 128 Bytes this
//  is wasteful near 65 but efficient toward 128.
//  - SetAlignedBlocks<32> would consume between 3 and 4 conditionals and write
//  96 or 128 Bytes.
//  - Another approach could be to use a hybrid approach copy<64>+Overlap<32>
//  for 65-96 and copy<96>+Overlap<32> for 97-128
//
// Benchmarks showed that redundant writes were cheap (for Intel X86) but
// conditionals were expensive, even on processors that do not support writing
// 64B at a time (pre-AVX512F). We also want to favor short functions that allow
// more hot code to fit in the iL1 cache.
//
// Above 128 we have to use conditionals since we don't know the upper bound in
// advance. SetAlignedBlocks<64> may waste up to 63 Bytes, SetAlignedBlocks<32>
// may waste up to 31 Bytes. Benchmarks showed that SetAlignedBlocks<64> was not
// superior for sizes that mattered.
// Sets `count` bytes at `dst` to `value`, dispatching on `count`.
// `kMaxSize` is the second template argument given to every generic::Memset
// block; it is chosen from the features available at compile time.
inline static void inline_memset(Ptr dst, uint8_t value, size_t count) {
#if defined(LLVM_LIBC_ARCH_X86)
  /////////////////////////////////////////////////////////////////////////////
  // LLVM_LIBC_ARCH_X86
  /////////////////////////////////////////////////////////////////////////////
  static constexpr size_t kMaxSize =
      x86::kAvx512F ? 64 : (x86::kAvx ? 32 : (x86::kSse2 ? 16 : 8));
  switch (count) {
  case 0:
    return;
  case 1:
    generic::Memset<1, kMaxSize>::block(dst, value);
    return;
  case 2:
    generic::Memset<2, kMaxSize>::block(dst, value);
    return;
  case 3:
    generic::Memset<3, kMaxSize>::block(dst, value);
    return;
  default:
    break;
  }
  // Up to 128 bytes: a single head_tail call per range.
  if (count <= 8) {
    generic::Memset<4, kMaxSize>::head_tail(dst, value, count);
    return;
  }
  if (count <= 16) {
    generic::Memset<8, kMaxSize>::head_tail(dst, value, count);
    return;
  }
  if (count <= 32) {
    generic::Memset<16, kMaxSize>::head_tail(dst, value, count);
    return;
  }
  if (count <= 64) {
    generic::Memset<32, kMaxSize>::head_tail(dst, value, count);
    return;
  }
  if (count <= 128) {
    generic::Memset<64, kMaxSize>::head_tail(dst, value, count);
    return;
  }
  // Aligned loop
  generic::Memset<32, kMaxSize>::block(dst, value);
  align_to_next_boundary<32>(dst, count);
  generic::Memset<32, kMaxSize>::loop_and_tail(dst, value, count);
#elif defined(LLVM_LIBC_ARCH_AARCH64)
  /////////////////////////////////////////////////////////////////////////////
  // LLVM_LIBC_ARCH_AARCH64
  /////////////////////////////////////////////////////////////////////////////
  static constexpr size_t kMaxSize = aarch64::kNeon ? 16 : 8;
  if (count == 0)
    return;
  if (count <= 3) {
    // One byte at the start, then (for 2 and 3) the last two bytes.
    generic::Memset<1, kMaxSize>::block(dst, value);
    if (count > 1)
      generic::Memset<2, kMaxSize>::tail(dst, value, count);
    return;
  }
  if (count <= 8) {
    generic::Memset<4, kMaxSize>::head_tail(dst, value, count);
    return;
  }
  if (count <= 16) {
    generic::Memset<8, kMaxSize>::head_tail(dst, value, count);
    return;
  }
  if (count <= 32) {
    generic::Memset<16, kMaxSize>::head_tail(dst, value, count);
    return;
  }
  if (count <= (32 + 64)) {
    // [33, 96]: one or two leading 32-byte blocks plus a 32-byte tail.
    generic::Memset<32, kMaxSize>::block(dst, value);
    if (count > 64)
      generic::Memset<32, kMaxSize>::block(dst + 32, value);
    generic::Memset<32, kMaxSize>::tail(dst, value, count);
    return;
  }
  // Large zero fills use the 'dc zva' based loop when it is available.
  if (count >= 448 && value == 0 && aarch64::neon::hasZva()) {
    generic::Memset<64, kMaxSize>::block(dst, 0);
    align_to_next_boundary<64>(dst, count);
    aarch64::neon::BzeroCacheLine<64>::loop_and_tail(dst, 0, count);
    return;
  }
  generic::Memset<16, kMaxSize>::block(dst, value);
  align_to_next_boundary<16>(dst, count);
  generic::Memset<64, kMaxSize>::loop_and_tail(dst, value, count);
#else
  /////////////////////////////////////////////////////////////////////////////
  // Default
  /////////////////////////////////////////////////////////////////////////////
  static constexpr size_t kMaxSize = 8;
  switch (count) {
  case 0:
    return;
  case 1:
    generic::Memset<1, kMaxSize>::block(dst, value);
    return;
  case 2:
    generic::Memset<2, kMaxSize>::block(dst, value);
    return;
  case 3:
    generic::Memset<3, kMaxSize>::block(dst, value);
    return;
  default:
    break;
  }
  // Up to 128 bytes: a single head_tail call per range.
  if (count <= 8) {
    generic::Memset<4, kMaxSize>::head_tail(dst, value, count);
    return;
  }
  if (count <= 16) {
    generic::Memset<8, kMaxSize>::head_tail(dst, value, count);
    return;
  }
  if (count <= 32) {
    generic::Memset<16, kMaxSize>::head_tail(dst, value, count);
    return;
  }
  if (count <= 64) {
    generic::Memset<32, kMaxSize>::head_tail(dst, value, count);
    return;
  }
  if (count <= 128) {
    generic::Memset<64, kMaxSize>::head_tail(dst, value, count);
    return;
  }
  // Aligned loop
  generic::Memset<32, kMaxSize>::block(dst, value);
  align_to_next_boundary<32>(dst, count);
  generic::Memset<32, kMaxSize>::loop_and_tail(dst, value, count);
#endif
}

} // namespace __llvm_libc

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H
//===-- Memcpy implementation -----------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_IMPLEMENTATIONS_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_IMPLEMENTATIONS_H

#include <stddef.h> // size_t

// Design rationale
// ================
//
// Using a profiler to observe size distributions for calls into libc
// functions, it was found most operations act on a small number of bytes.
// This makes it important to favor small sizes.
//
// The tests for `count` are in ascending order so the cost of branching is
// proportional to the cost of copying.
//
// The function is written in C++ for several reasons:
// - The compiler can __see__ the code; this is useful when performing Profile
//   Guided Optimization, as the optimized code can take advantage of branching
//   probabilities.
// - It also allows for easier customization and favors testing multiple
//   implementation parameters.
// - As compilers and processors get better, the generated code is improved
//   with little change on the code side.

namespace __llvm_libc {

// Copies `count` bytes from `src` to `dst`, selecting the copy routine from
// `count` alone. Both pointers are `__restrict`-qualified, so callers promise
// that the two buffers do not alias.
//
// Every architecture branch follows the same plan:
//   - counts 0 to 4 are served by nothing / an exact-size `block` call,
//   - counts below 128 (below 256 on x86 when `x86::kAvx` is set) are served
//     by a single `head_tail` call,
//   - anything larger copies one leading block, calls
//     `align_to_next_boundary`, then runs `loop_and_tail` (x86 may instead
//     delegate to `x86::Memcpy::repmovsb`).
// The size checks are kept in ascending order on purpose so that the number of
// branches taken grows with the amount of data to copy.
static inline void inline_memcpy(char *__restrict dst,
                                 const char *__restrict src, size_t count) {
  using namespace __llvm_libc::builtin;
#if defined(LLVM_LIBC_ARCH_X86)
  /////////////////////////////////////////////////////////////////////////////
  // LLVM_LIBC_ARCH_X86
  /////////////////////////////////////////////////////////////////////////////

  // Set when the build requests that every size go through rep;movsb.
  constexpr bool kUseOnlyRepMovsb =
      LLVM_LIBC_IS_DEFINED(LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB);

  // Upper bound (inclusive) on the counts served by the aligned loop once the
  // `head_tail` thresholds below have been passed; larger counts use
  // rep;movsb.
  //   kRepMovsBSize == -1 : wraps to the largest size_t, so the aligned loop
  //                         serves every large copy.
  //   kRepMovsBSize ==  0 : rep;movsb serves every large copy.
  //   otherwise           : aligned loop up to kRepMovsBSize, then rep;movsb.
#if defined(LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE)
  constexpr size_t kRepMovsBSize = LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
#else
  constexpr size_t kRepMovsBSize = static_cast<size_t>(-1);
#endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE

  // Block size of the aligned loop: 64 when `x86::kAvx` is set, 32 otherwise.
  static constexpr size_t kLoopBlockSize = x86::kAvx ? 64 : 32;

  if (kUseOnlyRepMovsb) {
    x86::Memcpy::repmovsb(dst, src, count);
    return;
  }
  if (count == 0)
    return;

  if (count == 1) {
    Memcpy<1>::block(dst, src);
  } else if (count == 2) {
    Memcpy<2>::block(dst, src);
  } else if (count == 3) {
    Memcpy<3>::block(dst, src);
  } else if (count == 4) {
    Memcpy<4>::block(dst, src);
  } else if (count < 8) {
    Memcpy<4>::head_tail(dst, src, count);
  } else if (count < 16) {
    Memcpy<8>::head_tail(dst, src, count);
  } else if (count < 32) {
    Memcpy<16>::head_tail(dst, src, count);
  } else if (count < 64) {
    Memcpy<32>::head_tail(dst, src, count);
  } else if (count < 128) {
    Memcpy<64>::head_tail(dst, src, count);
  } else if (x86::kAvx && count < 256) {
    Memcpy<128>::head_tail(dst, src, count);
  } else if (count <= kRepMovsBSize) {
    // Copy the first 32 bytes, advance to the next 32-byte boundary of `dst`,
    // then let the loop handle the remainder.
    Memcpy<32>::block(dst, src);
    align_to_next_boundary<32, Arg::Dst>(dst, src, count);
    Memcpy<kLoopBlockSize>::loop_and_tail(dst, src, count);
  } else {
    x86::Memcpy::repmovsb(dst, src, count);
  }
#elif defined(LLVM_LIBC_ARCH_AARCH64)
  /////////////////////////////////////////////////////////////////////////////
  // LLVM_LIBC_ARCH_AARCH64
  /////////////////////////////////////////////////////////////////////////////
  if (count == 0)
    return;

  if (count == 1) {
    Memcpy<1>::block(dst, src);
  } else if (count == 2) {
    Memcpy<2>::block(dst, src);
  } else if (count == 3) {
    Memcpy<3>::block(dst, src);
  } else if (count == 4) {
    Memcpy<4>::block(dst, src);
  } else if (count < 8) {
    Memcpy<4>::head_tail(dst, src, count);
  } else if (count < 16) {
    Memcpy<8>::head_tail(dst, src, count);
  } else if (count < 32) {
    Memcpy<16>::head_tail(dst, src, count);
  } else if (count < 64) {
    Memcpy<32>::head_tail(dst, src, count);
  } else if (count < 128) {
    Memcpy<64>::head_tail(dst, src, count);
  } else {
    // Copy the first 16 bytes, advance to the next 16-byte boundary of `src`,
    // then loop in 64-byte blocks.
    Memcpy<16>::block(dst, src);
    align_to_next_boundary<16, Arg::Src>(dst, src, count);
    Memcpy<64>::loop_and_tail(dst, src, count);
  }
#else
  /////////////////////////////////////////////////////////////////////////////
  // Default
  /////////////////////////////////////////////////////////////////////////////
  if (count == 0)
    return;

  if (count == 1) {
    Memcpy<1>::block(dst, src);
  } else if (count == 2) {
    Memcpy<2>::block(dst, src);
  } else if (count == 3) {
    Memcpy<3>::block(dst, src);
  } else if (count == 4) {
    Memcpy<4>::block(dst, src);
  } else if (count < 8) {
    Memcpy<4>::head_tail(dst, src, count);
  } else if (count < 16) {
    Memcpy<8>::head_tail(dst, src, count);
  } else if (count < 32) {
    Memcpy<16>::head_tail(dst, src, count);
  } else if (count < 64) {
    Memcpy<32>::head_tail(dst, src, count);
  } else if (count < 128) {
    Memcpy<64>::head_tail(dst, src, count);
  } else {
    // Copy the first 32 bytes, advance to the next 32-byte boundary of `src`,
    // then loop in 32-byte blocks.
    Memcpy<32>::block(dst, src);
    align_to_next_boundary<32, Arg::Src>(dst, src, count);
    Memcpy<32>::loop_and_tail(dst, src, count);
  }
#endif
}

} // namespace __llvm_libc

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_IMPLEMENTATIONS_H
//===-- Implementation of bzero -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BZERO_IMPLEMENTATIONS_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BZERO_IMPLEMENTATIONS_H

#include <stddef.h> // size_t

namespace __llvm_libc {

// Clears a buffer by forwarding to `inline_memset` with a fill value of 0.
//   dst:   start of the buffer to clear.
//   count: number of bytes passed on to `inline_memset`.
// All size-based dispatch is done inside `inline_memset`; this wrapper adds no
// logic of its own.
inline static void inline_bzero(char *dst, size_t count) {
  inline_memset(dst, 0, count);
}

} // namespace __llvm_libc

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BZERO_IMPLEMENTATIONS_H