@@ -3744,16 +3744,73 @@ static HWY_INLINE V VsxF2INormalizeSrcVals(V v) {
37443744#endif
37453745}
37463746
3747+ template <class VF32 >
3748+ static HWY_INLINE HWY_MAYBE_UNUSED VFromD<Repartition<int64_t , DFromV<VF32>>>
3749+ VsxXvcvspsxds (VF32 vf32) {
3750+ using VI64 = VFromD<Repartition<int64_t , DFromV<VF32>>>;
3751+ #if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1500) || \
3752+ HWY_HAS_BUILTIN (__builtin_vsx_xvcvspsxds)
3753+ // Use __builtin_vsx_xvcvspsxds if it is available (which is the case with
3754+ // GCC 4.8 through GCC 14 or Clang 13 or later on PPC8/PPC9/PPC10)
3755+ return VI64{__builtin_vsx_xvcvspsxds (vf32.raw )};
3756+ #elif HWY_COMPILER_GCC_ACTUAL >= 1500 && HWY_IS_LITTLE_ENDIAN
3757+ // On little-endian PPC8/PPC9/PPC10 with GCC 15 or later, use the F32->I64
3758+ // vec_signedo intrinsic as the __builtin_vsx_xvcvspsxds intrinsic has been
3759+ // removed from GCC in GCC 15 (callers put the source values in the odd lanes)
3760+ return VI64{vec_signedo (vf32.raw )};
3761+ #elif HWY_COMPILER_GCC_ACTUAL >= 1500 && HWY_IS_BIG_ENDIAN
3762+ // On big-endian PPC8/PPC9/PPC10 with GCC 15 or later, use the F32->I64
3763+ // vec_signede intrinsic as the __builtin_vsx_xvcvspsxds intrinsic has been
3764+ // removed from GCC in GCC 15 (callers put the source values in the even lanes)
3765+ return VI64{vec_signede (vf32.raw )};
3766+ #else
3767+ // Inline assembly fallback for older versions of Clang that do not have the
3768+ // __builtin_vsx_xvcvspsxds intrinsic; emits the xvcvspsxds instruction directly
3769+ __vector signed long long raw_result;
3770+ __asm__ (" xvcvspsxds %x0, %x1" : " =wa" (raw_result) : " wa" (vf32.raw ) :);
3771+ return VI64{raw_result};
3772+ #endif
3773+ }
3774+
3775+ template <class VF32 >
3776+ static HWY_INLINE HWY_MAYBE_UNUSED VFromD<Repartition<uint64_t , DFromV<VF32>>>
3777+ VsxXvcvspuxds (VF32 vf32) {
3778+ using VU64 = VFromD<Repartition<uint64_t , DFromV<VF32>>>;
3779+ #if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1500) || \
3780+ HWY_HAS_BUILTIN (__builtin_vsx_xvcvspuxds)
3781+ // Use __builtin_vsx_xvcvspuxds if it is available (which is the case with
3782+ // GCC 4.8 through GCC 14 or Clang 13 or later on PPC8/PPC9/PPC10)
3783+ return VU64{reinterpret_cast <__vector unsigned long long >(
3784+ __builtin_vsx_xvcvspuxds (vf32.raw ))};
3785+ #elif HWY_COMPILER_GCC_ACTUAL >= 1500 && HWY_IS_LITTLE_ENDIAN
3786+ // On little-endian PPC8/PPC9/PPC10 with GCC 15 or later, use the F32->U64
3787+ // vec_unsignedo intrinsic as the __builtin_vsx_xvcvspuxds intrinsic has been
3788+ // removed from GCC in GCC 15 (callers put the source values in the odd lanes)
3789+ return VU64{vec_unsignedo (vf32.raw )};
3790+ #elif HWY_COMPILER_GCC_ACTUAL >= 1500 && HWY_IS_BIG_ENDIAN
3791+ // On big-endian PPC8/PPC9/PPC10 with GCC 15 or later, use the F32->U64
3792+ // vec_unsignede intrinsic as the __builtin_vsx_xvcvspuxds intrinsic has been
3793+ // removed from GCC in GCC 15 (callers put the source values in the even lanes)
3794+ return VU64{vec_unsignede (vf32.raw )};
3795+ #else
3796+ // Inline assembly fallback for older versions of Clang that do not have the
3797+ // __builtin_vsx_xvcvspuxds intrinsic; emits the xvcvspuxds instruction directly
3798+ __vector unsigned long long raw_result;
3799+ __asm__ (" xvcvspuxds %x0, %x1" : " =wa" (raw_result) : " wa" (vf32.raw ) :);
3800+ return VU64{raw_result};
3801+ #endif
3802+ }
3803+
37473804} // namespace detail
37483805#endif // !HWY_S390X_HAVE_Z14
37493806
37503807template <class D , HWY_IF_I64_D(D)>
37513808HWY_API VFromD<D> PromoteTo (D di64, VFromD<Rebind<float , D>> v) {
3752- #if !HWY_S390X_HAVE_Z14 && \
3753- (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN (__builtin_vsx_xvcvspsxds))
3754- const __vector float raw_v =
3755- detail::VsxF2INormalizeSrcVals ( InterleaveLower (v, v)). raw ;
3756- return VFromD< decltype (di64)>{ __builtin_vsx_xvcvspsxds (raw_v)} ;
3809+ #if !HWY_S390X_HAVE_Z14
3810+ const Repartition< float , decltype (di64)> dt_f32;
3811+ const auto vt_f32 = ResizeBitCast (dt_f32, v);
3812+ return detail::VsxXvcvspsxds (
3813+ detail::VsxF2INormalizeSrcVals ( InterleaveLower (vt_f32, vt_f32))) ;
37573814#else
37583815 const RebindToFloat<decltype (di64)> df64;
37593816 return ConvertTo (di64, PromoteTo (df64, v));
@@ -3762,12 +3819,11 @@ HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
37623819
37633820template <class D , HWY_IF_U64_D(D)>
37643821HWY_API VFromD<D> PromoteTo (D du64, VFromD<Rebind<float , D>> v) {
3765- #if !HWY_S390X_HAVE_Z14 && \
3766- (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN (__builtin_vsx_xvcvspuxds))
3767- const __vector float raw_v =
3768- detail::VsxF2INormalizeSrcVals (InterleaveLower (v, v)).raw ;
3769- return VFromD<decltype (du64)>{reinterpret_cast <__vector unsigned long long >(
3770- __builtin_vsx_xvcvspuxds (raw_v))};
3822+ #if !HWY_S390X_HAVE_Z14
3823+ const Repartition<float , decltype (du64)> dt_f32;
3824+ const auto vt_f32 = ResizeBitCast (dt_f32, v);
3825+ return detail::VsxXvcvspuxds (
3826+ detail::VsxF2INormalizeSrcVals (InterleaveLower (vt_f32, vt_f32)));
37713827#else
37723828 const RebindToFloat<decltype (du64)> df64;
37733829 return ConvertTo (du64, PromoteTo (df64, v));
@@ -3876,12 +3932,10 @@ HWY_API VFromD<D> PromoteUpperTo(D df64, Vec128<uint32_t> v) {
38763932
38773933template <class D , HWY_IF_V_SIZE_D(D, 16 ), HWY_IF_I64_D(D)>
38783934HWY_API VFromD<D> PromoteUpperTo (D di64, Vec128<float > v) {
3879- #if !HWY_S390X_HAVE_Z14 && \
3880- (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN (__builtin_vsx_xvcvspsxds))
3881- const __vector float raw_v =
3882- detail::VsxF2INormalizeSrcVals (InterleaveUpper (Full128<float >(), v, v))
3883- .raw ;
3884- return VFromD<decltype (di64)>{__builtin_vsx_xvcvspsxds (raw_v)};
3935+ #if !HWY_S390X_HAVE_Z14
3936+ (void )di64;
3937+ return detail::VsxXvcvspsxds (
3938+ detail::VsxF2INormalizeSrcVals (InterleaveUpper (Full128<float >(), v, v)));
38853939#else
38863940 const RebindToFloat<decltype (di64)> df64;
38873941 return ConvertTo (di64, PromoteUpperTo (df64, v));
@@ -3890,13 +3944,10 @@ HWY_API VFromD<D> PromoteUpperTo(D di64, Vec128<float> v) {
38903944
38913945template <class D , HWY_IF_V_SIZE_D(D, 16 ), HWY_IF_U64_D(D)>
38923946HWY_API VFromD<D> PromoteUpperTo (D du64, Vec128<float > v) {
3893- #if !HWY_S390X_HAVE_Z14 && \
3894- (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN (__builtin_vsx_xvcvspuxds))
3895- const __vector float raw_v =
3896- detail::VsxF2INormalizeSrcVals (InterleaveUpper (Full128<float >(), v, v))
3897- .raw ;
3898- return VFromD<decltype (du64)>{reinterpret_cast <__vector unsigned long long >(
3899- __builtin_vsx_xvcvspuxds (raw_v))};
3947+ #if !HWY_S390X_HAVE_Z14
3948+ (void )du64;
3949+ return detail::VsxXvcvspuxds (
3950+ detail::VsxF2INormalizeSrcVals (InterleaveUpper (Full128<float >(), v, v)));
39003951#else
39013952 const RebindToFloat<decltype (du64)> df64;
39023953 return ConvertTo (du64, PromoteUpperTo (df64, v));
@@ -3984,20 +4035,18 @@ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::SignedTag /*to_type_tag*/,
39844035 hwy::SizeTag<8 > /* to_lane_size_tag*/ ,
39854036 hwy::FloatTag /* from_type_tag*/ , D d_to,
39864037 V v) {
3987- #if !HWY_S390X_HAVE_Z14 && \
3988- (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN (__builtin_vsx_xvcvspsxds))
4038+ #if !HWY_S390X_HAVE_Z14
39894039 (void )d_to;
39904040 const auto normalized_v = detail::VsxF2INormalizeSrcVals (v);
39914041#if HWY_IS_LITTLE_ENDIAN
3992- // __builtin_vsx_xvcvspsxds expects the source values to be in the odd lanes
3993- // on little-endian PPC, and the vec_sld operation below will shift the even
4042+ // VsxXvcvspsxds expects the source values to be in the odd lanes on
4043+ // little-endian PPC, and the Shuffle2103 operation below will shift the even
39944044 // lanes of normalized_v into the odd lanes.
3995- return VFromD<D>{
3996- __builtin_vsx_xvcvspsxds (vec_sld (normalized_v.raw , normalized_v.raw , 4 ))};
4045+ return VsxXvcvspsxds (Shuffle2103 (normalized_v));
39974046#else
3998- // __builtin_vsx_xvcvspsxds expects the source values to be in the even lanes
3999- // on big-endian PPC.
4000- return VFromD<D>{ __builtin_vsx_xvcvspsxds (normalized_v. raw )} ;
4047+ // VsxXvcvspsxds expects the source values to be in the even lanes on
4048+ // big-endian PPC.
4049+ return VsxXvcvspsxds (normalized_v) ;
40014050#endif
40024051#else
40034052 const RebindToFloat<decltype (d_to)> df64;
@@ -4012,22 +4061,18 @@ HWY_INLINE VFromD<D> PromoteEvenTo(hwy::UnsignedTag /*to_type_tag*/,
40124061 hwy::SizeTag<8 > /* to_lane_size_tag*/ ,
40134062 hwy::FloatTag /* from_type_tag*/ , D d_to,
40144063 V v) {
4015- #if !HWY_S390X_HAVE_Z14 && \
4016- (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN (__builtin_vsx_xvcvspuxds))
4064+ #if !HWY_S390X_HAVE_Z14
40174065 (void )d_to;
40184066 const auto normalized_v = detail::VsxF2INormalizeSrcVals (v);
40194067#if HWY_IS_LITTLE_ENDIAN
4020- // __builtin_vsx_xvcvspuxds expects the source values to be in the odd lanes
4021- // on little-endian PPC, and the vec_sld operation below will shift the even
4022- // lanes of normalized_v into the odd lanes.
4023- return VFromD<D>{
4024- reinterpret_cast <__vector unsigned long long >(__builtin_vsx_xvcvspuxds (
4025- vec_sld (normalized_v.raw , normalized_v.raw , 4 )))};
4068+ // VsxXvcvspuxds expects the source values to be in the odd lanes
4069+ // on little-endian PPC, and the Shuffle2103 operation below will shift the
4070+ // even lanes of normalized_v into the odd lanes.
4071+ return VsxXvcvspuxds (Shuffle2103 (normalized_v));
40264072#else
4027- // __builtin_vsx_xvcvspuxds expects the source values to be in the even lanes
4073+ // VsxXvcvspuxds expects the source values to be in the even lanes
40284074 // on big-endian PPC.
4029- return VFromD<D>{reinterpret_cast <__vector unsigned long long >(
4030- __builtin_vsx_xvcvspuxds (normalized_v.raw ))};
4075+ return VsxXvcvspuxds (normalized_v);
40314076#endif
40324077#else
40334078 const RebindToFloat<decltype (d_to)> df64;
@@ -4069,20 +4114,18 @@ HWY_INLINE VFromD<D> PromoteOddTo(hwy::SignedTag /*to_type_tag*/,
40694114 hwy::SizeTag<8 > /* to_lane_size_tag*/ ,
40704115 hwy::FloatTag /* from_type_tag*/ , D d_to,
40714116 V v) {
4072- #if !HWY_S390X_HAVE_Z14 && \
4073- (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN (__builtin_vsx_xvcvspsxds))
4117+ #if !HWY_S390X_HAVE_Z14
40744118 (void )d_to;
40754119 const auto normalized_v = detail::VsxF2INormalizeSrcVals (v);
40764120#if HWY_IS_LITTLE_ENDIAN
4077- // __builtin_vsx_xvcvspsxds expects the source values to be in the odd lanes
4121+ // VsxXvcvspsxds expects the source values to be in the odd lanes
40784122 // on little-endian PPC
4079- return VFromD<D>{ __builtin_vsx_xvcvspsxds (normalized_v. raw )} ;
4123+ return VsxXvcvspsxds (normalized_v) ;
40804124#else
4081- // __builtin_vsx_xvcvspsxds expects the source values to be in the even lanes
4082- // on big-endian PPC, and the vec_sld operation below will shift the odd lanes
4083- // of normalized_v into the even lanes.
4084- return VFromD<D>{
4085- __builtin_vsx_xvcvspsxds (vec_sld (normalized_v.raw , normalized_v.raw , 4 ))};
4125+ // VsxXvcvspsxds expects the source values to be in the even lanes
4126+ // on big-endian PPC, and the Shuffle0321 operation below will shift the odd
4127+ // lanes of normalized_v into the even lanes.
4128+ return VsxXvcvspsxds (Shuffle0321 (normalized_v));
40864129#endif
40874130#else
40884131 const RebindToFloat<decltype (d_to)> df64;
@@ -4097,22 +4140,18 @@ HWY_INLINE VFromD<D> PromoteOddTo(hwy::UnsignedTag /*to_type_tag*/,
40974140 hwy::SizeTag<8 > /* to_lane_size_tag*/ ,
40984141 hwy::FloatTag /* from_type_tag*/ , D d_to,
40994142 V v) {
4100- #if !HWY_S390X_HAVE_Z14 && \
4101- (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN (__builtin_vsx_xvcvspuxds))
4143+ #if !HWY_S390X_HAVE_Z14
41024144 (void )d_to;
41034145 const auto normalized_v = detail::VsxF2INormalizeSrcVals (v);
41044146#if HWY_IS_LITTLE_ENDIAN
4105- // __builtin_vsx_xvcvspuxds expects the source values to be in the odd lanes
4147+ // VsxXvcvspuxds expects the source values to be in the odd lanes
41064148 // on little-endian PPC
4107- return VFromD<D>{reinterpret_cast <__vector unsigned long long >(
4108- __builtin_vsx_xvcvspuxds (normalized_v.raw ))};
4149+ return VsxXvcvspuxds (normalized_v);
41094150#else
4110- // __builtin_vsx_xvcvspuxds expects the source values to be in the even lanes
4111- // on big-endian PPC, and the vec_sld operation below will shift the odd lanes
4112- // of normalized_v into the even lanes.
4113- return VFromD<D>{
4114- reinterpret_cast <__vector unsigned long long >(__builtin_vsx_xvcvspuxds (
4115- vec_sld (normalized_v.raw , normalized_v.raw , 4 )))};
4151+ // VsxXvcvspuxds expects the source values to be in the even lanes
4152+ // on big-endian PPC, and the Shuffle0321 operation below will shift the odd
4153+ // lanes of normalized_v into the even lanes.
4154+ return VsxXvcvspuxds (Shuffle0321 (normalized_v));
41164155#endif
41174156#else
41184157 const RebindToFloat<decltype (d_to)> df64;
0 commit comments