Diffstat (limited to 'vendor/portable-atomic/src/imp/atomic128/x86_64.rs')
-rw-r--r--  vendor/portable-atomic/src/imp/atomic128/x86_64.rs  |  854
1 file changed, 854 insertions, 0 deletions
diff --git a/vendor/portable-atomic/src/imp/atomic128/x86_64.rs b/vendor/portable-atomic/src/imp/atomic128/x86_64.rs
new file mode 100644
index 0000000..3b9d141
--- /dev/null
+++ b/vendor/portable-atomic/src/imp/atomic128/x86_64.rs
@@ -0,0 +1,854 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+// Atomic{I,U}128 implementation on x86_64 using CMPXCHG16B (DWCAS).
+//
+// Note: On Miri and ThreadSanitizer, which do not support inline assembly, we do not use
+// this module and use intrinsics.rs instead.
+//
+// Refs:
+// - x86 and amd64 instruction reference https://www.felixcloutier.com/x86
+// - atomic-maybe-uninit https://github.com/taiki-e/atomic-maybe-uninit
+//
+// Generated asm:
+// - x86_64 (+cmpxchg16b) https://godbolt.org/z/55n54WeKr
+
+include!("macros.rs");
+
+#[cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))]
+#[path = "../fallback/outline_atomics.rs"]
+mod fallback;
+
+#[cfg(not(portable_atomic_no_outline_atomics))]
+#[cfg(not(target_env = "sgx"))]
+#[path = "detect/x86_64.rs"]
+mod detect;
+
+#[cfg(not(portable_atomic_no_asm))]
+use core::arch::asm;
+use core::sync::atomic::Ordering;
+
+use crate::utils::{Pair, U128};
+
+// Asserts that the function is called in the correct context.
+macro_rules! debug_assert_cmpxchg16b {
+ () => {
+ #[cfg(not(any(
+ target_feature = "cmpxchg16b",
+ portable_atomic_target_feature = "cmpxchg16b",
+ )))]
+ {
+ debug_assert!(detect::detect().has_cmpxchg16b());
+ }
+ };
+}
+#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
+#[cfg(target_feature = "sse")]
+macro_rules! debug_assert_vmovdqa_atomic {
+ () => {{
+ debug_assert_cmpxchg16b!();
+ debug_assert!(detect::detect().has_vmovdqa_atomic());
+ }};
+}
+
+#[allow(unused_macros)]
+#[cfg(target_pointer_width = "32")]
+macro_rules! ptr_modifier {
+ () => {
+ ":e"
+ };
+}
+#[allow(unused_macros)]
+#[cfg(target_pointer_width = "64")]
+macro_rules! ptr_modifier {
+ () => {
+ ""
+ };
+}
+
+#[cfg_attr(
+ not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")),
+ target_feature(enable = "cmpxchg16b")
+)]
+#[inline]
+unsafe fn cmpxchg16b(dst: *mut u128, old: u128, new: u128) -> (u128, bool) {
+ debug_assert!(dst as usize % 16 == 0);
+ debug_assert_cmpxchg16b!();
+
+ // SAFETY: the caller must guarantee that `dst` is valid for both writes and
+ // reads, 16-byte aligned (required by CMPXCHG16B), that there are no
+ // concurrent non-atomic operations, and that the CPU supports CMPXCHG16B.
+ //
+    // If the value at `dst` (the destination operand) and rdx:rax are equal, the
+    // 128-bit value in rcx:rbx is stored in `dst`; otherwise, the value at `dst`
+    // is loaded into rdx:rax.
+ //
+ // The ZF flag is set if the value at `dst` and rdx:rax are equal,
+ // otherwise it is cleared. Other flags are unaffected.
+ //
+ // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
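+    //
+    // For illustration only, the instruction's semantics roughly correspond to the
+    // following (non-atomic) Rust sketch, which also matches this function's return
+    // value of `(previous value, success)`:
+    //
+    //     let prev = *dst;
+    //     if prev == old { *dst = new; (prev, true) } else { (prev, false) }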
+ unsafe {
+ // cmpxchg16b is always SeqCst.
+ let r: u8;
+ let old = U128 { whole: old };
+ let new = U128 { whole: new };
+ let (prev_lo, prev_hi);
+ macro_rules! cmpxchg16b {
+ ($rdi:tt) => {
+ asm!(
+ // rbx is reserved by LLVM
+ "xchg {rbx_tmp}, rbx",
+ concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
+ "sete r8b",
+ "mov rbx, {rbx_tmp}", // restore rbx
+ rbx_tmp = inout(reg) new.pair.lo => _,
+ in("rcx") new.pair.hi,
+ inout("rax") old.pair.lo => prev_lo,
+ inout("rdx") old.pair.hi => prev_hi,
+ in($rdi) dst,
+ out("r8b") r,
+ // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
+ options(nostack),
+ )
+ };
+ }
+ #[cfg(target_pointer_width = "32")]
+ cmpxchg16b!("edi");
+ #[cfg(target_pointer_width = "64")]
+ cmpxchg16b!("rdi");
+ (U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole, r != 0)
+ }
+}
+
+// VMOVDQA is atomic on Intel and AMD CPUs with AVX.
+// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104688 for details.
+//
+// Refs: https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
+//
+// Do not use vector registers on targets such as x86_64-unknown-none unless SSE is explicitly enabled.
+// https://doc.rust-lang.org/nightly/rustc/platform-support/x86_64-unknown-none.html
+#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
+#[cfg(target_feature = "sse")]
+#[target_feature(enable = "avx")]
+#[inline]
+unsafe fn atomic_load_vmovdqa(src: *mut u128) -> u128 {
+ debug_assert!(src as usize % 16 == 0);
+ debug_assert_vmovdqa_atomic!();
+
+ // SAFETY: the caller must uphold the safety contract.
+ //
+ // atomic load by vmovdqa is always SeqCst.
+ unsafe {
+ let out: core::arch::x86_64::__m128;
+ asm!(
+ concat!("vmovdqa {out}, xmmword ptr [{src", ptr_modifier!(), "}]"),
+ src = in(reg) src,
+ out = out(xmm_reg) out,
+ options(nostack, preserves_flags),
+ );
+ core::mem::transmute(out)
+ }
+}
+#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
+#[cfg(target_feature = "sse")]
+#[target_feature(enable = "avx")]
+#[inline]
+unsafe fn atomic_store_vmovdqa(dst: *mut u128, val: u128, order: Ordering) {
+ debug_assert!(dst as usize % 16 == 0);
+ debug_assert_vmovdqa_atomic!();
+
+ // SAFETY: the caller must uphold the safety contract.
+ unsafe {
+ let val: core::arch::x86_64::__m128 = core::mem::transmute(val);
+ match order {
+ // Relaxed and Release stores are equivalent.
+ Ordering::Relaxed | Ordering::Release => {
+ asm!(
+ concat!("vmovdqa xmmword ptr [{dst", ptr_modifier!(), "}], {val}"),
+ dst = in(reg) dst,
+ val = in(xmm_reg) val,
+ options(nostack, preserves_flags),
+ );
+ }
+ Ordering::SeqCst => {
+ asm!(
+ concat!("vmovdqa xmmword ptr [{dst", ptr_modifier!(), "}], {val}"),
+ "mfence",
+ dst = in(reg) dst,
+ val = in(xmm_reg) val,
+ options(nostack, preserves_flags),
+ );
+ }
+ _ => unreachable!("{:?}", order),
+ }
+ }
+}
+
+#[cfg(not(all(
+ any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
+ any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
+)))]
+macro_rules! load_store_detect {
+ (
+ vmovdqa = $vmovdqa:ident
+ cmpxchg16b = $cmpxchg16b:ident
+ fallback = $fallback:ident
+ ) => {{
+ let cpuid = detect::detect();
+ #[cfg(not(any(
+ target_feature = "cmpxchg16b",
+ portable_atomic_target_feature = "cmpxchg16b",
+ )))]
+ {
+ // Check CMPXCHG16B first to prevent mixing atomic and non-atomic access.
+ if cpuid.has_cmpxchg16b() {
+ // We do not use vector registers on targets such as x86_64-unknown-none unless SSE is explicitly enabled.
+ #[cfg(target_feature = "sse")]
+ {
+ if cpuid.has_vmovdqa_atomic() {
+ $vmovdqa
+ } else {
+ $cmpxchg16b
+ }
+ }
+ #[cfg(not(target_feature = "sse"))]
+ {
+ $cmpxchg16b
+ }
+ } else {
+ fallback::$fallback
+ }
+ }
+ #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
+ {
+ if cpuid.has_vmovdqa_atomic() {
+ $vmovdqa
+ } else {
+ $cmpxchg16b
+ }
+ }
+ }};
+}
+
+#[inline]
+unsafe fn atomic_load(src: *mut u128, _order: Ordering) -> u128 {
+ // Do not use vector registers on targets such as x86_64-unknown-none unless SSE is explicitly enabled.
+ // https://doc.rust-lang.org/nightly/rustc/platform-support/x86_64-unknown-none.html
+ // SGX doesn't support CPUID.
+ #[cfg(all(
+ any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
+ any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
+ ))]
+ // SAFETY: the caller must uphold the safety contract.
+ // cfg guarantees that CMPXCHG16B is available at compile-time.
+ unsafe {
+ // cmpxchg16b is always SeqCst.
+ atomic_load_cmpxchg16b(src)
+ }
+ #[cfg(not(all(
+ any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
+ any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
+ )))]
+ // SAFETY: the caller must uphold the safety contract.
+ unsafe {
+ ifunc!(unsafe fn(src: *mut u128) -> u128 {
+ load_store_detect! {
+ vmovdqa = atomic_load_vmovdqa
+ cmpxchg16b = atomic_load_cmpxchg16b
+                // Use SeqCst because cmpxchg16b and atomic load by vmovdqa are always SeqCst.
+ fallback = atomic_load_seqcst
+ }
+ })
+ }
+}
+#[cfg_attr(
+ not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")),
+ target_feature(enable = "cmpxchg16b")
+)]
+#[inline]
+unsafe fn atomic_load_cmpxchg16b(src: *mut u128) -> u128 {
+ debug_assert!(src as usize % 16 == 0);
+ debug_assert_cmpxchg16b!();
+
+ // SAFETY: the caller must guarantee that `src` is valid for both writes and
+ // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
+ // cfg guarantees that the CPU supports CMPXCHG16B.
+ //
+ // See cmpxchg16b function for more.
+ //
+    // We could use a CAS loop via atomic_compare_exchange here, but using inline assembly allows
+    // omitting the storing of condition flags and avoids the use of xchg to handle rbx.
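+    //
+    // The load itself is performed as `cmpxchg16b(src, 0, 0)`: if `*src == 0` the compare
+    // succeeds and 0 is written back (no visible change); otherwise it fails and the current
+    // value is loaded into rdx:rax. Either way rdx:rax ends up holding the loaded value.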
+ unsafe {
+ // cmpxchg16b is always SeqCst.
+ let (out_lo, out_hi);
+ macro_rules! cmpxchg16b {
+ ($rdi:tt) => {
+ asm!(
+ // rbx is reserved by LLVM
+ "mov {rbx_tmp}, rbx",
+ "xor rbx, rbx", // zeroed rbx
+ concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
+ "mov rbx, {rbx_tmp}", // restore rbx
+                    // set old/new args of cmpxchg16b to 0 (rbx is zeroed after being saved to rbx_tmp, to avoid xchg)
+ rbx_tmp = out(reg) _,
+ in("rcx") 0_u64,
+ inout("rax") 0_u64 => out_lo,
+ inout("rdx") 0_u64 => out_hi,
+ in($rdi) src,
+ // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
+ options(nostack),
+ )
+ };
+ }
+ #[cfg(target_pointer_width = "32")]
+ cmpxchg16b!("edi");
+ #[cfg(target_pointer_width = "64")]
+ cmpxchg16b!("rdi");
+ U128 { pair: Pair { lo: out_lo, hi: out_hi } }.whole
+ }
+}
+
+#[inline]
+unsafe fn atomic_store(dst: *mut u128, val: u128, order: Ordering) {
+ // Do not use vector registers on targets such as x86_64-unknown-none unless SSE is explicitly enabled.
+ // https://doc.rust-lang.org/nightly/rustc/platform-support/x86_64-unknown-none.html
+ // SGX doesn't support CPUID.
+ #[cfg(all(
+ any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
+ any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
+ ))]
+ // SAFETY: the caller must uphold the safety contract.
+ // cfg guarantees that CMPXCHG16B is available at compile-time.
+ unsafe {
+ // cmpxchg16b is always SeqCst.
+ let _ = order;
+ atomic_store_cmpxchg16b(dst, val);
+ }
+ #[cfg(not(all(
+ any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
+ any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
+ )))]
+ // SAFETY: the caller must uphold the safety contract.
+ unsafe {
+ #[cfg(target_feature = "sse")]
+ fn_alias! {
+ #[target_feature(enable = "avx")]
+ unsafe fn(dst: *mut u128, val: u128);
+ // atomic store by vmovdqa has at least release semantics.
+ atomic_store_vmovdqa_non_seqcst = atomic_store_vmovdqa(Ordering::Release);
+ atomic_store_vmovdqa_seqcst = atomic_store_vmovdqa(Ordering::SeqCst);
+ }
+ match order {
+ // Relaxed and Release stores are equivalent in all implementations
+ // that may be called here (vmovdqa, asm-based cmpxchg16b, and fallback).
+            // core::arch's cmpxchg16b will never be called here.
+ Ordering::Relaxed | Ordering::Release => {
+ ifunc!(unsafe fn(dst: *mut u128, val: u128) {
+ load_store_detect! {
+ vmovdqa = atomic_store_vmovdqa_non_seqcst
+ cmpxchg16b = atomic_store_cmpxchg16b
+ fallback = atomic_store_non_seqcst
+ }
+ });
+ }
+ Ordering::SeqCst => {
+ ifunc!(unsafe fn(dst: *mut u128, val: u128) {
+ load_store_detect! {
+ vmovdqa = atomic_store_vmovdqa_seqcst
+ cmpxchg16b = atomic_store_cmpxchg16b
+ fallback = atomic_store_seqcst
+ }
+ });
+ }
+ _ => unreachable!("{:?}", order),
+ }
+ }
+}
+#[cfg_attr(
+ not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")),
+ target_feature(enable = "cmpxchg16b")
+)]
+unsafe fn atomic_store_cmpxchg16b(dst: *mut u128, val: u128) {
+ // SAFETY: the caller must uphold the safety contract.
+ unsafe {
+ // cmpxchg16b is always SeqCst.
+ atomic_swap_cmpxchg16b(dst, val, Ordering::SeqCst);
+ }
+}
+
+#[inline]
+unsafe fn atomic_compare_exchange(
+ dst: *mut u128,
+ old: u128,
+ new: u128,
+ _success: Ordering,
+ _failure: Ordering,
+) -> Result<u128, u128> {
+ #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
+    // SAFETY: the caller must guarantee that `dst` is valid for both writes and
+    // reads, 16-byte aligned, and that there are no concurrent non-atomic operations;
+    // cfg guarantees that CMPXCHG16B is available at compile-time.
+ let (prev, ok) = unsafe { cmpxchg16b(dst, old, new) };
+ #[cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))]
+ // SAFETY: the caller must guarantee that `dst` is valid for both writes and
+ // reads, 16-byte aligned, and that there are no different kinds of concurrent accesses.
+ let (prev, ok) = unsafe {
+ ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> (u128, bool) {
+ if detect::detect().has_cmpxchg16b() {
+ cmpxchg16b
+ } else {
+ // Use SeqCst because cmpxchg16b is always SeqCst.
+ fallback::atomic_compare_exchange_seqcst
+ }
+ })
+ };
+ if ok {
+ Ok(prev)
+ } else {
+ Err(prev)
+ }
+}
+
+// cmpxchg16b is always strong.
+use atomic_compare_exchange as atomic_compare_exchange_weak;
+
+#[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
+use atomic_swap_cmpxchg16b as atomic_swap;
+#[cfg_attr(
+ not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")),
+ target_feature(enable = "cmpxchg16b")
+)]
+#[inline]
+unsafe fn atomic_swap_cmpxchg16b(dst: *mut u128, val: u128, _order: Ordering) -> u128 {
+ debug_assert!(dst as usize % 16 == 0);
+ debug_assert_cmpxchg16b!();
+
+ // SAFETY: the caller must guarantee that `dst` is valid for both writes and
+ // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
+ // cfg guarantees that the CPU supports CMPXCHG16B.
+ //
+ // See cmpxchg16b function for more.
+ //
+    // We could use a CAS loop via atomic_compare_exchange here, but using inline assembly allows
+    // omitting the storing/comparing of condition flags and reduces the use of xchg/mov to handle rbx.
+ //
+ // Do not use atomic_rmw_cas_3 because it needs extra MOV to implement swap.
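+    //
+    // As a rough, non-asm sketch (assuming the `cmpxchg16b` helper above), the loop below
+    // behaves like:
+    //
+    //     let mut prev = plain_load(dst); // hypothetical helper: two plain 8-byte loads
+    //     loop {
+    //         let (p, ok) = cmpxchg16b(dst, prev, val);
+    //         if ok { return p } else { prev = p }
+    //     }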
+ unsafe {
+ // cmpxchg16b is always SeqCst.
+ let val = U128 { whole: val };
+ let (mut prev_lo, mut prev_hi);
+ macro_rules! cmpxchg16b {
+ ($rdi:tt) => {
+ asm!(
+ // rbx is reserved by LLVM
+ "xchg {rbx_tmp}, rbx",
+                    // These are not single-copy atomic reads, but that is okay because the
+                    // subsequent CAS will check for consistency.
+                    //
+                    // This is based on the code generated for the first load in DW RMWs by LLVM.
+                    //
+                    // Note that the C++20 memory model does not allow mixed-sized atomic access,
+                    // so we must use inline assembly to implement this.
+                    // (i.e., byte-wise atomic access based on the standard library's atomic types
+                    // cannot be used here).
+ concat!("mov rax, qword ptr [", $rdi, "]"),
+ concat!("mov rdx, qword ptr [", $rdi, " + 8]"),
+ "2:",
+ concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
+ "jne 2b",
+ "mov rbx, {rbx_tmp}", // restore rbx
+ rbx_tmp = inout(reg) val.pair.lo => _,
+ in("rcx") val.pair.hi,
+ out("rax") prev_lo,
+ out("rdx") prev_hi,
+ in($rdi) dst,
+ // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
+ options(nostack),
+ )
+ };
+ }
+ #[cfg(target_pointer_width = "32")]
+ cmpxchg16b!("edi");
+ #[cfg(target_pointer_width = "64")]
+ cmpxchg16b!("rdi");
+ U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
+ }
+}
+
+/// Atomic RMW by CAS loop (3 arguments)
+/// `unsafe fn(dst: *mut u128, val: u128, order: Ordering) -> u128;`
+///
+/// `$op` can use the following registers:
+/// - rsi/r8 pair: val argument (read-only for `$op`)
+/// - rax/rdx pair: previous value loaded (read-only for `$op`)
+/// - rbx/rcx pair: new value that will be stored
+// We could use a CAS loop via atomic_compare_exchange here, but using inline assembly allows
+// omitting the storing/comparing of condition flags and reduces the use of xchg/mov to handle rbx.
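+// For example, the `$op` passed for `atomic_add_cmpxchg16b` below copies rax/rdx into
+// rbx/rcx and then adds rsi/r8 with an add/adc pair, so rbx:rcx holds old + val.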
+macro_rules! atomic_rmw_cas_3 {
+ ($name:ident as $reexport_name:ident, $($op:tt)*) => {
+ #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
+ use $name as $reexport_name;
+ #[cfg_attr(
+ not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")),
+ target_feature(enable = "cmpxchg16b")
+ )]
+ #[inline]
+ unsafe fn $name(dst: *mut u128, val: u128, _order: Ordering) -> u128 {
+ debug_assert!(dst as usize % 16 == 0);
+ debug_assert_cmpxchg16b!();
+ // SAFETY: the caller must guarantee that `dst` is valid for both writes and
+ // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
+ // cfg guarantees that the CPU supports CMPXCHG16B.
+ //
+ // See cmpxchg16b function for more.
+ unsafe {
+ // cmpxchg16b is always SeqCst.
+ let val = U128 { whole: val };
+ let (mut prev_lo, mut prev_hi);
+ macro_rules! cmpxchg16b {
+ ($rdi:tt) => {
+ asm!(
+ // rbx is reserved by LLVM
+ "mov {rbx_tmp}, rbx",
+                            // These are not single-copy atomic reads, but that is okay because the
+                            // subsequent CAS will check for consistency.
+                            //
+                            // This is based on the code generated for the first load in DW RMWs by LLVM.
+                            //
+                            // Note that the C++20 memory model does not allow mixed-sized atomic access,
+                            // so we must use inline assembly to implement this.
+                            // (i.e., byte-wise atomic access based on the standard library's atomic types
+                            // cannot be used here).
+ concat!("mov rax, qword ptr [", $rdi, "]"),
+ concat!("mov rdx, qword ptr [", $rdi, " + 8]"),
+ "2:",
+ $($op)*
+ concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
+ "jne 2b",
+ "mov rbx, {rbx_tmp}", // restore rbx
+ rbx_tmp = out(reg) _,
+ out("rcx") _,
+ out("rax") prev_lo,
+ out("rdx") prev_hi,
+ in($rdi) dst,
+ in("rsi") val.pair.lo,
+ in("r8") val.pair.hi,
+ // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
+ options(nostack),
+ )
+ };
+ }
+ #[cfg(target_pointer_width = "32")]
+ cmpxchg16b!("edi");
+ #[cfg(target_pointer_width = "64")]
+ cmpxchg16b!("rdi");
+ U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
+ }
+ }
+ };
+}
+/// Atomic RMW by CAS loop (2 arguments)
+/// `unsafe fn(dst: *mut u128, order: Ordering) -> u128;`
+///
+/// `$op` can use the following registers:
+/// - rax/rdx pair: previous value loaded (read-only for `$op`)
+/// - rbx/rcx pair: new value that will be stored
+// We could use a CAS loop via atomic_compare_exchange here, but using inline assembly allows
+// omitting the storing of condition flags and avoids the use of xchg to handle rbx.
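+// For example, the `$op` passed for `atomic_not_cmpxchg16b` below copies rax/rdx into
+// rbx/rcx and inverts both halves, so rbx:rcx holds !old.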
+macro_rules! atomic_rmw_cas_2 {
+ ($name:ident as $reexport_name:ident, $($op:tt)*) => {
+ #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
+ use $name as $reexport_name;
+ #[cfg_attr(
+ not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")),
+ target_feature(enable = "cmpxchg16b")
+ )]
+ #[inline]
+ unsafe fn $name(dst: *mut u128, _order: Ordering) -> u128 {
+ debug_assert!(dst as usize % 16 == 0);
+ debug_assert_cmpxchg16b!();
+ // SAFETY: the caller must guarantee that `dst` is valid for both writes and
+ // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
+ // cfg guarantees that the CPU supports CMPXCHG16B.
+ //
+ // See cmpxchg16b function for more.
+ unsafe {
+ // cmpxchg16b is always SeqCst.
+ let (mut prev_lo, mut prev_hi);
+ macro_rules! cmpxchg16b {
+ ($rdi:tt) => {
+ asm!(
+ // rbx is reserved by LLVM
+ "mov {rbx_tmp}, rbx",
+                            // These are not single-copy atomic reads, but that is okay because the
+                            // subsequent CAS will check for consistency.
+                            //
+                            // This is based on the code generated for the first load in DW RMWs by LLVM.
+                            //
+                            // Note that the C++20 memory model does not allow mixed-sized atomic access,
+                            // so we must use inline assembly to implement this.
+                            // (i.e., byte-wise atomic access based on the standard library's atomic types
+                            // cannot be used here).
+ concat!("mov rax, qword ptr [", $rdi, "]"),
+ concat!("mov rdx, qword ptr [", $rdi, " + 8]"),
+ "2:",
+ $($op)*
+ concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
+ "jne 2b",
+ "mov rbx, {rbx_tmp}", // restore rbx
+ rbx_tmp = out(reg) _,
+ out("rcx") _,
+ out("rax") prev_lo,
+ out("rdx") prev_hi,
+ in($rdi) dst,
+ // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
+ options(nostack),
+ )
+ };
+ }
+ #[cfg(target_pointer_width = "32")]
+ cmpxchg16b!("edi");
+ #[cfg(target_pointer_width = "64")]
+ cmpxchg16b!("rdi");
+ U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
+ }
+ }
+ };
+}
+
+atomic_rmw_cas_3! {
+ atomic_add_cmpxchg16b as atomic_add,
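+    // new = old + val: add the low halves, then add the high halves with the carry (adc).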
+ "mov rbx, rax",
+ "add rbx, rsi",
+ "mov rcx, rdx",
+ "adc rcx, r8",
+}
+atomic_rmw_cas_3! {
+ atomic_sub_cmpxchg16b as atomic_sub,
+ "mov rbx, rax",
+ "sub rbx, rsi",
+ "mov rcx, rdx",
+ "sbb rcx, r8",
+}
+atomic_rmw_cas_3! {
+ atomic_and_cmpxchg16b as atomic_and,
+ "mov rbx, rax",
+ "and rbx, rsi",
+ "mov rcx, rdx",
+ "and rcx, r8",
+}
+atomic_rmw_cas_3! {
+ atomic_nand_cmpxchg16b as atomic_nand,
+ "mov rbx, rax",
+ "and rbx, rsi",
+ "not rbx",
+ "mov rcx, rdx",
+ "and rcx, r8",
+ "not rcx",
+}
+atomic_rmw_cas_3! {
+ atomic_or_cmpxchg16b as atomic_or,
+ "mov rbx, rax",
+ "or rbx, rsi",
+ "mov rcx, rdx",
+ "or rcx, r8",
+}
+atomic_rmw_cas_3! {
+ atomic_xor_cmpxchg16b as atomic_xor,
+ "mov rbx, rax",
+ "xor rbx, rsi",
+ "mov rcx, rdx",
+ "xor rcx, r8",
+}
+
+atomic_rmw_cas_2! {
+ atomic_not_cmpxchg16b as atomic_not,
+ "mov rbx, rax",
+ "not rbx",
+ "mov rcx, rdx",
+ "not rcx",
+}
+atomic_rmw_cas_2! {
+ atomic_neg_cmpxchg16b as atomic_neg,
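+    // new = -old (two's complement): negate the low half, then compute 0 - hi - borrow.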
+ "mov rbx, rax",
+ "neg rbx",
+ "mov rcx, 0",
+ "sbb rcx, rdx",
+}
+
+atomic_rmw_cas_3! {
+ atomic_max_cmpxchg16b as atomic_max,
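+    // Signed 128-bit max: cmp/sbb computes val - old across both halves so the flags
+    // reflect the signed 128-bit comparison; cmovl then selects old when val < old,
+    // leaving val otherwise, so rbx:rcx ends up holding max(old, val).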
+ "cmp rsi, rax",
+ "mov rcx, r8",
+ "sbb rcx, rdx",
+ "mov rcx, r8",
+ "cmovl rcx, rdx",
+ "mov rbx, rsi",
+ "cmovl rbx, rax",
+}
+atomic_rmw_cas_3! {
+ atomic_umax_cmpxchg16b as atomic_umax,
+ "cmp rsi, rax",
+ "mov rcx, r8",
+ "sbb rcx, rdx",
+ "mov rcx, r8",
+ "cmovb rcx, rdx",
+ "mov rbx, rsi",
+ "cmovb rbx, rax",
+}
+atomic_rmw_cas_3! {
+ atomic_min_cmpxchg16b as atomic_min,
+ "cmp rsi, rax",
+ "mov rcx, r8",
+ "sbb rcx, rdx",
+ "mov rcx, r8",
+ "cmovge rcx, rdx",
+ "mov rbx, rsi",
+ "cmovge rbx, rax",
+}
+atomic_rmw_cas_3! {
+ atomic_umin_cmpxchg16b as atomic_umin,
+ "cmp rsi, rax",
+ "mov rcx, r8",
+ "sbb rcx, rdx",
+ "mov rcx, r8",
+ "cmovae rcx, rdx",
+ "mov rbx, rsi",
+ "cmovae rbx, rax",
+}
+
+macro_rules! atomic_rmw_with_ifunc {
+ (
+ unsafe fn $name:ident($($arg:tt)*) $(-> $ret_ty:ty)?;
+ cmpxchg16b = $cmpxchg16b_fn:ident;
+ fallback = $seqcst_fallback_fn:ident;
+ ) => {
+ #[cfg(not(any(
+ target_feature = "cmpxchg16b",
+ portable_atomic_target_feature = "cmpxchg16b",
+ )))]
+ #[inline]
+ unsafe fn $name($($arg)*, _order: Ordering) $(-> $ret_ty)? {
+ fn_alias! {
+ #[cfg_attr(
+ not(any(
+ target_feature = "cmpxchg16b",
+ portable_atomic_target_feature = "cmpxchg16b",
+ )),
+ target_feature(enable = "cmpxchg16b")
+ )]
+ unsafe fn($($arg)*) $(-> $ret_ty)?;
+ // cmpxchg16b is always SeqCst.
+ cmpxchg16b_seqcst_fn = $cmpxchg16b_fn(Ordering::SeqCst);
+ }
+ // SAFETY: the caller must uphold the safety contract.
+        // We only call cmpxchg16b_fn if cmpxchg16b is available.
+ unsafe {
+ ifunc!(unsafe fn($($arg)*) $(-> $ret_ty)? {
+ if detect::detect().has_cmpxchg16b() {
+ cmpxchg16b_seqcst_fn
+ } else {
+ // Use SeqCst because cmpxchg16b is always SeqCst.
+ fallback::$seqcst_fallback_fn
+ }
+ })
+ }
+ }
+ };
+}
+
+atomic_rmw_with_ifunc! {
+ unsafe fn atomic_swap(dst: *mut u128, val: u128) -> u128;
+ cmpxchg16b = atomic_swap_cmpxchg16b;
+ fallback = atomic_swap_seqcst;
+}
+atomic_rmw_with_ifunc! {
+ unsafe fn atomic_add(dst: *mut u128, val: u128) -> u128;
+ cmpxchg16b = atomic_add_cmpxchg16b;
+ fallback = atomic_add_seqcst;
+}
+atomic_rmw_with_ifunc! {
+ unsafe fn atomic_sub(dst: *mut u128, val: u128) -> u128;
+ cmpxchg16b = atomic_sub_cmpxchg16b;
+ fallback = atomic_sub_seqcst;
+}
+atomic_rmw_with_ifunc! {
+ unsafe fn atomic_and(dst: *mut u128, val: u128) -> u128;
+ cmpxchg16b = atomic_and_cmpxchg16b;
+ fallback = atomic_and_seqcst;
+}
+atomic_rmw_with_ifunc! {
+ unsafe fn atomic_nand(dst: *mut u128, val: u128) -> u128;
+ cmpxchg16b = atomic_nand_cmpxchg16b;
+ fallback = atomic_nand_seqcst;
+}
+atomic_rmw_with_ifunc! {
+ unsafe fn atomic_or(dst: *mut u128, val: u128) -> u128;
+ cmpxchg16b = atomic_or_cmpxchg16b;
+ fallback = atomic_or_seqcst;
+}
+atomic_rmw_with_ifunc! {
+ unsafe fn atomic_xor(dst: *mut u128, val: u128) -> u128;
+ cmpxchg16b = atomic_xor_cmpxchg16b;
+ fallback = atomic_xor_seqcst;
+}
+atomic_rmw_with_ifunc! {
+ unsafe fn atomic_max(dst: *mut u128, val: u128) -> u128;
+ cmpxchg16b = atomic_max_cmpxchg16b;
+ fallback = atomic_max_seqcst;
+}
+atomic_rmw_with_ifunc! {
+ unsafe fn atomic_umax(dst: *mut u128, val: u128) -> u128;
+ cmpxchg16b = atomic_umax_cmpxchg16b;
+ fallback = atomic_umax_seqcst;
+}
+atomic_rmw_with_ifunc! {
+ unsafe fn atomic_min(dst: *mut u128, val: u128) -> u128;
+ cmpxchg16b = atomic_min_cmpxchg16b;
+ fallback = atomic_min_seqcst;
+}
+atomic_rmw_with_ifunc! {
+ unsafe fn atomic_umin(dst: *mut u128, val: u128) -> u128;
+ cmpxchg16b = atomic_umin_cmpxchg16b;
+ fallback = atomic_umin_seqcst;
+}
+atomic_rmw_with_ifunc! {
+ unsafe fn atomic_not(dst: *mut u128) -> u128;
+ cmpxchg16b = atomic_not_cmpxchg16b;
+ fallback = atomic_not_seqcst;
+}
+atomic_rmw_with_ifunc! {
+ unsafe fn atomic_neg(dst: *mut u128) -> u128;
+ cmpxchg16b = atomic_neg_cmpxchg16b;
+ fallback = atomic_neg_seqcst;
+}
+
+#[inline]
+fn is_lock_free() -> bool {
+ #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
+ {
+ // CMPXCHG16B is available at compile-time.
+ true
+ }
+ #[cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))]
+ {
+ detect::detect().has_cmpxchg16b()
+ }
+}
+const IS_ALWAYS_LOCK_FREE: bool =
+ cfg!(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"));
+
+atomic128!(AtomicI128, i128, atomic_max, atomic_min);
+atomic128!(AtomicU128, u128, atomic_umax, atomic_umin);
+
+#[allow(clippy::undocumented_unsafe_blocks, clippy::wildcard_imports)]
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ test_atomic_int!(i128);
+ test_atomic_int!(u128);
+
+ // load/store/swap implementation is not affected by signedness, so it is
+ // enough to test only unsigned types.
+ stress_test!(u128);
+}