[libc] Simplifies multi implementations

This is a roll forward of D101895 with two additional fixes (listed below).

Original Patch description:
> This is a follow up on D101524 which:
>
> - simplifies cpu features detection and usage,
> - flattens target dependent optimizations so it's obvious which implementations are generated,
> - provides an implementation targeting the host (march/mtune=native) for the mem* functions,
> - makes sure all implementations are unittested (provided the host can run them).

Additional fixes:
 - Fix uninitialized ALL_CPU_FEATURES
 - Use non-pseudo microarchitectures, as pseudo microarchitectures are only supported from Clang 12 onwards

Differential Revision: https://reviews.llvm.org/D102233
diff --git a/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake b/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake
index a44186e..b40b5d1 100644
--- a/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake
+++ b/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake
@@ -2,11 +2,19 @@
 # Cpu features definition and flags
 # ------------------------------------------------------------------------------
 
+# Initialize ALL_CPU_FEATURES as an empty list.
+set(ALL_CPU_FEATURES "")
+
 if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
-  set(ALL_CPU_FEATURES SSE SSE2 AVX AVX2 AVX512F)
-  list(SORT ALL_CPU_FEATURES)
+  set(ALL_CPU_FEATURES SSE2 SSE4_2 AVX2 AVX512F)
+  set(LIBC_COMPILE_OPTIONS_NATIVE -march=native)
+elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64})
+  set(LIBC_COMPILE_OPTIONS_NATIVE -mcpu=native)
 endif()
 
+# Making sure ALL_CPU_FEATURES is sorted.
+list(SORT ALL_CPU_FEATURES)
+
 # Function to check whether the target CPU supports the provided set of features.
 # Usage:
 # cpu_supports(
@@ -22,49 +30,6 @@
   endif()
 endfunction()
 
-# Function to compute the flags to pass down to the compiler.
-# Usage:
-# compute_flags(
-#   <output variable>
-#   MARCH <arch name or "native">
-#   REQUIRE <list of mandatory features to enable>
-#   REJECT <list of features to disable>
-# )
-function(compute_flags output_var)
-  cmake_parse_arguments(
-    "COMPUTE_FLAGS"
-    "" # Optional arguments
-    "MARCH" # Single value arguments
-    "REQUIRE;REJECT" # Multi value arguments
-    ${ARGN})
-  # Check that features are not required and rejected at the same time.
-  if(COMPUTE_FLAGS_REQUIRE AND COMPUTE_FLAGS_REJECT)
-    _intersection(var ${COMPUTE_FLAGS_REQUIRE} ${COMPUTE_FLAGS_REJECT})
-    if(var)
-      message(FATAL_ERROR "Cpu Features REQUIRE and REJECT ${var}")
-    endif()
-  endif()
-  # Generate the compiler flags in `current`.
-  if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang|GNU")
-    if(COMPUTE_FLAGS_MARCH)
-      list(APPEND current "-march=${COMPUTE_FLAGS_MARCH}")
-    endif()
-    foreach(feature IN LISTS COMPUTE_FLAGS_REQUIRE)
-      string(TOLOWER ${feature} lowercase_feature)
-      list(APPEND current "-m${lowercase_feature}")
-    endforeach()
-    foreach(feature IN LISTS COMPUTE_FLAGS_REJECT)
-      string(TOLOWER ${feature} lowercase_feature)
-      list(APPEND current "-mno-${lowercase_feature}")
-    endforeach()
-  else()
-    # In future, we can extend for other compilers.
-    message(FATAL_ERROR "Unkown compiler ${CMAKE_CXX_COMPILER_ID}.")
-  endif()
-  # Export the list of flags.
-  set(${output_var} "${current}" PARENT_SCOPE)
-endfunction()
-
 # ------------------------------------------------------------------------------
 # Internal helpers and utilities.
 # ------------------------------------------------------------------------------
@@ -94,39 +59,7 @@
 endfunction()
 _generate_check_code()
 
-# Compiles and runs the code generated above with the specified requirements.
-# This is helpful to infer which features a particular target supports or if
-# a specific features implies other features (e.g. BMI2 implies SSE2 and SSE).
-function(_check_defined_cpu_feature output_var)
-  cmake_parse_arguments(
-    "CHECK_DEFINED"
-    "" # Optional arguments
-    "MARCH" # Single value arguments
-    "REQUIRE;REJECT" # Multi value arguments
-    ${ARGN})
-  compute_flags(
-    flags
-    MARCH  ${CHECK_DEFINED_MARCH}
-    REQUIRE ${CHECK_DEFINED_REQUIRE}
-    REJECT  ${CHECK_DEFINED_REJECT})
-  try_run(
-    run_result compile_result "${CMAKE_CURRENT_BINARY_DIR}/check_${feature}"
-    "${CMAKE_CURRENT_BINARY_DIR}/cpu_features/check_cpu_features.cpp"
-    COMPILE_DEFINITIONS ${flags}
-    COMPILE_OUTPUT_VARIABLE compile_output
-    RUN_OUTPUT_VARIABLE run_output)
-  if("${run_result}" EQUAL 0)
-    set(${output_var}
-        "${run_output}"
-        PARENT_SCOPE)
-  elseif(NOT ${compile_result})
-    message(FATAL_ERROR "Failed to compile: ${compile_output}")
-  else()
-    message(FATAL_ERROR "Failed to run: ${run_output}")
-  endif()
-endfunction()
-
-set(LIBC_CPU_FEATURES "" CACHE PATH "supported CPU features")
+set(LIBC_CPU_FEATURES "" CACHE STRING "Host supported CPU features")
 
 if(CMAKE_CROSSCOMPILING)
   _intersection(cpu_features "${ALL_CPU_FEATURES}" "${LIBC_CPU_FEATURES}")
@@ -135,13 +68,20 @@
   endif()
   set(LIBC_CPU_FEATURES "${cpu_features}")
 else()
-  # Populates the LIBC_CPU_FEATURES list.
-  # Use -march=native only when the compiler supports it.
-  include(CheckCXXCompilerFlag)
-  CHECK_CXX_COMPILER_FLAG("-march=native" COMPILER_SUPPORTS_MARCH_NATIVE)
-  if(COMPILER_SUPPORTS_MARCH_NATIVE)
-    _check_defined_cpu_feature(LIBC_CPU_FEATURES MARCH native)
+  # Populates the LIBC_CPU_FEATURES list from the host: the generated
+  # check_cpu_features.cpp is compiled with LIBC_COMPILE_OPTIONS_NATIVE and run,
+  # and its output becomes the feature list. Any failure aborts configuration.
+  try_run(
+    run_result compile_result "${CMAKE_CURRENT_BINARY_DIR}/check_cpu_features"
+    "${CMAKE_CURRENT_BINARY_DIR}/cpu_features/check_cpu_features.cpp"
+    COMPILE_DEFINITIONS ${LIBC_COMPILE_OPTIONS_NATIVE}
+    COMPILE_OUTPUT_VARIABLE compile_output
+    RUN_OUTPUT_VARIABLE run_output)
+  if("${run_result}" EQUAL 0)
+    set(LIBC_CPU_FEATURES "${run_output}")
+  elseif(NOT ${compile_result})
+    message(FATAL_ERROR "Failed to compile: ${compile_output}")
   else()
-    _check_defined_cpu_feature(LIBC_CPU_FEATURES)
+    message(FATAL_ERROR "Failed to run: ${run_output}")
   endif()
 endif()
diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index 351e942..f7a0406 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -186,20 +186,15 @@
   cmake_parse_arguments(
     "ADD_IMPL"
     "" # Optional arguments
-    "MARCH" # Single value arguments
-    "REQUIRE;REJECT;SRCS;HDRS;DEPENDS;COMPILE_OPTIONS" # Multi value arguments
+    "" # Single value arguments
+    "REQUIRE;SRCS;HDRS;DEPENDS;COMPILE_OPTIONS" # Multi value arguments
     ${ARGN})
-  compute_flags(flags
-    MARCH ${ADD_IMPL_MARCH}
-    REQUIRE ${ADD_IMPL_REQUIRE}
-    REJECT ${ADD_IMPL_REJECT}
-  )
   add_entrypoint_object(${impl_name}
     NAME ${name}
     SRCS ${ADD_IMPL_SRCS}
     HDRS ${ADD_IMPL_HDRS}
     DEPENDS ${ADD_IMPL_DEPENDS}
-    COMPILE_OPTIONS ${ADD_IMPL_COMPILE_OPTIONS} ${flags} -O2
+    COMPILE_OPTIONS ${ADD_IMPL_COMPILE_OPTIONS}
   )
   get_fq_target_name(${impl_name} fq_target_name)
   set_target_properties(${fq_target_name} PROPERTIES REQUIRE_CPU_FEATURES "${ADD_IMPL_REQUIRE}")
@@ -210,17 +205,6 @@
 # memcpy
 # ------------------------------------------------------------------------------
 
-# include the relevant architecture specific implementations
-if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
-  set(MEMCPY_SRC ${LIBC_SOURCE_DIR}/src/string/${LIBC_TARGET_ARCHITECTURE}/memcpy.cpp)
-elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64})
-  set(MEMCPY_SRC ${LIBC_SOURCE_DIR}/src/string/${LIBC_TARGET_ARCHITECTURE}/memcpy.cpp)
-#Disable tail merging as it leads to lower performance
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mllvm --tail-merge-threshold=0")
-else()
-  set(MEMCPY_SRC ${LIBC_SOURCE_DIR}/src/string/memcpy.cpp)
-endif()
-
 function(add_memcpy memcpy_name)
   add_implementation(memcpy ${memcpy_name}
     SRCS ${MEMCPY_SRC}
@@ -235,8 +219,23 @@
 endfunction()
 
 if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
-  add_memcpy(memcpy MARCH native)
+  set(MEMCPY_SRC ${LIBC_SOURCE_DIR}/src/string/x86_64/memcpy.cpp)
+  add_memcpy(memcpy_x86_64_opt_sse2   COMPILE_OPTIONS -march=k8             REQUIRE SSE2)
+  add_memcpy(memcpy_x86_64_opt_sse4   COMPILE_OPTIONS -march=nehalem        REQUIRE SSE4_2)
+  add_memcpy(memcpy_x86_64_opt_avx2   COMPILE_OPTIONS -march=haswell        REQUIRE AVX2)
+  add_memcpy(memcpy_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F)
+  add_memcpy(memcpy_opt_host          COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
+  add_memcpy(memcpy)
+elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64})
+  set(MEMCPY_SRC ${LIBC_SOURCE_DIR}/src/string/aarch64/memcpy.cpp)
+  # Disable tail merging as it leads to lower performance.
+  # Note that '-mllvm' needs to be prefixed with 'SHELL:' to prevent CMake flag deduplication.
+  add_memcpy(memcpy_opt_host          COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}
+                                      COMPILE_OPTIONS "SHELL:-mllvm --tail-merge-threshold=0")
+  add_memcpy(memcpy                   COMPILE_OPTIONS "SHELL:-mllvm --tail-merge-threshold=0")
 else()
+  set(MEMCPY_SRC ${LIBC_SOURCE_DIR}/src/string/memcpy.cpp)
+  add_memcpy(memcpy_opt_host          COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
   add_memcpy(memcpy)
 endif()
 
@@ -258,8 +257,14 @@
 endfunction()
 
 if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
-  add_memset(memset MARCH native)
+  add_memset(memset_x86_64_opt_sse2   COMPILE_OPTIONS -march=k8             REQUIRE SSE2)
+  add_memset(memset_x86_64_opt_sse4   COMPILE_OPTIONS -march=nehalem        REQUIRE SSE4_2)
+  add_memset(memset_x86_64_opt_avx2   COMPILE_OPTIONS -march=haswell        REQUIRE AVX2)
+  add_memset(memset_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F)
+  add_memset(memset_opt_host          COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
+  add_memset(memset)
 else()
+  add_memset(memset_opt_host          COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
   add_memset(memset)
 endif()
 
@@ -282,15 +287,13 @@
 endfunction()
 
 if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
-  add_bzero(bzero MARCH native)
-else()
+  add_bzero(bzero_x86_64_opt_sse2   COMPILE_OPTIONS -march=k8             REQUIRE SSE2)
+  add_bzero(bzero_x86_64_opt_sse4   COMPILE_OPTIONS -march=nehalem        REQUIRE SSE4_2)
+  add_bzero(bzero_x86_64_opt_avx2   COMPILE_OPTIONS -march=haswell        REQUIRE AVX2)
+  add_bzero(bzero_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F)
+  add_bzero(bzero_opt_host          COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
   add_bzero(bzero)
-endif()
-
-# ------------------------------------------------------------------------------
-# Add all other relevant implementations for the native target.
-# ------------------------------------------------------------------------------
-
-if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_ARCHITECTURE})
-  include(${LIBC_TARGET_ARCHITECTURE}/CMakeLists.txt)
+else()
+  add_bzero(bzero_opt_host          COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
+  add_bzero(bzero)
 endif()
diff --git a/libc/src/string/aarch64/CMakeLists.txt b/libc/src/string/aarch64/CMakeLists.txt
deleted file mode 100644
index c673f5b..0000000
--- a/libc/src/string/aarch64/CMakeLists.txt
+++ /dev/null
@@ -1 +0,0 @@
-add_memcpy("memcpy_${LIBC_TARGET_ARCHITECTURE}")
diff --git a/libc/src/string/x86_64/CMakeLists.txt b/libc/src/string/x86_64/CMakeLists.txt
deleted file mode 100644
index 9d0dffa..0000000
--- a/libc/src/string/x86_64/CMakeLists.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-add_memcpy("memcpy_${LIBC_TARGET_ARCHITECTURE}_opt_none" REJECT "${ALL_CPU_FEATURES}")
-add_memcpy("memcpy_${LIBC_TARGET_ARCHITECTURE}_opt_sse" REQUIRE "SSE" REJECT "SSE2")
-add_memcpy("memcpy_${LIBC_TARGET_ARCHITECTURE}_opt_avx" REQUIRE "AVX" REJECT "AVX2")
-add_memcpy("memcpy_${LIBC_TARGET_ARCHITECTURE}_opt_avx512f" REQUIRE "AVX512F")
-
-add_memset("memset_${LIBC_TARGET_ARCHITECTURE}_opt_none" REJECT "${ALL_CPU_FEATURES}")
-add_memset("memset_${LIBC_TARGET_ARCHITECTURE}_opt_sse" REQUIRE "SSE" REJECT "SSE2")
-add_memset("memset_${LIBC_TARGET_ARCHITECTURE}_opt_avx" REQUIRE "AVX" REJECT "AVX2")
-add_memset("memset_${LIBC_TARGET_ARCHITECTURE}_opt_avx512f" REQUIRE "AVX512F")
-
-add_bzero("bzero_${LIBC_TARGET_ARCHITECTURE}_opt_none" REJECT "${ALL_CPU_FEATURES}")
-add_bzero("bzero_${LIBC_TARGET_ARCHITECTURE}_opt_sse" REQUIRE "SSE" REJECT "SSE2")
-add_bzero("bzero_${LIBC_TARGET_ARCHITECTURE}_opt_avx" REQUIRE "AVX" REJECT "AVX2")
-add_bzero("bzero_${LIBC_TARGET_ARCHITECTURE}_opt_avx512f" REQUIRE "AVX512F")
diff --git a/libc/test/src/string/CMakeLists.txt b/libc/test/src/string/CMakeLists.txt
index 511575f..722d85a 100644
--- a/libc/test/src/string/CMakeLists.txt
+++ b/libc/test/src/string/CMakeLists.txt
@@ -196,6 +196,8 @@
           libc_string_unittests
         DEPENDS
           ${fq_config_name}
+        COMPILE_OPTIONS
+          ${LIBC_COMPILE_OPTIONS_NATIVE}
         ${ARGN}
       )
     else()