Ben Gamari pushed to branch wip/T26166 at Glasgow Haskell Compiler / GHC
Commits:
-
32224b2d
by Ben Gamari at 2025-10-14T19:11:29-04:00
-
d16c1769
by Ben Gamari at 2025-10-14T19:11:29-04:00
-
085f00fe
by Ben Gamari at 2025-10-14T19:11:29-04:00
-
95de617d
by Ben Gamari at 2025-10-14T19:14:05-04:00
7 changed files:
- compiler/GHC/Cmm/CLabel.hs
- compiler/GHC/CmmToC.hs
- rts/Printer.c
- rts/include/stg/Prim.h
- rts/posix/OSMem.c
- rts/prim/int64x2minmax.c
- rts/prim/vectorQuotRem.c
Changes:
| ... | ... | @@ -102,7 +102,7 @@ module GHC.Cmm.CLabel ( |
| 102 | 102 | needsCDecl,
|
| 103 | 103 | maybeLocalBlockLabel,
|
| 104 | 104 | externallyVisibleCLabel,
|
| 105 | - isMathFun,
|
|
| 105 | + isLibcFun,
|
|
| 106 | 106 | isCFunctionLabel,
|
| 107 | 107 | isGcPtrLabel,
|
| 108 | 108 | labelDynamic,
|
| ... | ... | @@ -1028,7 +1028,7 @@ needsCDecl (CmmLabel pkgId (NeedExternDecl external) _ _) |
| 1028 | 1028 | -- For other labels we inline one into the HC file directly.
|
| 1029 | 1029 | | otherwise = True
|
| 1030 | 1030 | |
| 1031 | -needsCDecl l@(ForeignLabel{}) = not (isMathFun l)
|
|
| 1031 | +needsCDecl l@(ForeignLabel{}) = not (isLibcFun l)
|
|
| 1032 | 1032 | needsCDecl (CC_Label _) = True
|
| 1033 | 1033 | needsCDecl (CCS_Label _) = True
|
| 1034 | 1034 | needsCDecl (IPE_Label {}) = True
|
| ... | ... | @@ -1055,15 +1055,19 @@ maybeLocalBlockLabel _ = Nothing |
| 1055 | 1055 | |
| 1056 | 1056 | |
| 1057 | 1057 | -- | Check whether a label corresponds to a C function that has
|
| 1058 | --- a prototype in a system header somewhere, or is built-in
|
|
| 1059 | --- to the C compiler. For these labels we avoid generating our
|
|
| 1060 | --- own C prototypes.
|
|
| 1061 | -isMathFun :: CLabel -> Bool
|
|
| 1062 | -isMathFun (ForeignLabel fs _ _) = fs `elementOfUniqSet` math_funs
|
|
| 1063 | -isMathFun _ = False
|
|
| 1064 | - |
|
| 1065 | -math_funs :: UniqSet FastString
|
|
| 1066 | -math_funs = mkUniqSet [
|
|
| 1058 | +-- a prototype in a system header somewhere, or is built-in
|
|
| 1059 | +-- to the C compiler. For these labels we avoid generating our
|
|
| 1060 | +-- own C prototypes.
|
|
| 1061 | +isLibcFun :: CLabel -> Bool
|
|
| 1062 | +isLibcFun (ForeignLabel fs _ _) = fs `elementOfUniqSet` libc_funs
|
|
| 1063 | +isLibcFun _ = False
|
|
| 1064 | + |
|
| 1065 | +libc_funs :: UniqSet FastString
|
|
| 1066 | +libc_funs = mkUniqSet [
|
|
| 1067 | + ---------------------
|
|
| 1068 | + -- Math functions
|
|
| 1069 | + ---------------------
|
|
| 1070 | + |
|
| 1067 | 1071 | -- _ISOC99_SOURCE
|
| 1068 | 1072 | (fsLit "acos"), (fsLit "acosf"), (fsLit "acosh"),
|
| 1069 | 1073 | (fsLit "acoshf"), (fsLit "acoshl"), (fsLit "acosl"),
|
| ... | ... | @@ -245,7 +245,7 @@ pprStmt platform stmt = |
| 245 | 245 | CmmLit (CmmLabel lbl)
|
| 246 | 246 | | CmmNeverReturns <- ret ->
|
| 247 | 247 | pprCall platform cast_fn cconv hresults hargs <> semi <> text "__builtin_unreachable();"
|
| 248 | - | not (isMathFun lbl) ->
|
|
| 248 | + | not (isLibcFun lbl) ->
|
|
| 249 | 249 | pprForeignCall platform (pprCLabel platform lbl) cconv hresults hargs
|
| 250 | 250 | _ ->
|
| 251 | 251 | pprCall platform cast_fn cconv hresults hargs <> semi
|
| ... | ... | @@ -1033,8 +1033,8 @@ findPtr(P_ p, int follow) |
| 1033 | 1033 | {
|
| 1034 | 1034 | uint32_t g, n;
|
| 1035 | 1035 | bdescr *bd;
|
| 1036 | - const int arr_size = 1024;
|
|
| 1037 | - StgPtr arr[arr_size];
|
|
| 1036 | +#define ARR_SIZE 1024
|
|
| 1037 | + StgPtr arr[ARR_SIZE];
|
|
| 1038 | 1038 | int i = 0;
|
| 1039 | 1039 | searched = 0;
|
| 1040 | 1040 | |
| ... | ... | @@ -1044,24 +1044,24 @@ findPtr(P_ p, int follow) |
| 1044 | 1044 | // just before a block is used.
|
| 1045 | 1045 | for (n = 0; n < getNumCapabilities(); n++) {
|
| 1046 | 1046 | bd = nurseries[i].blocks;
|
| 1047 | - i = findPtrBlocks(p,bd,arr,arr_size,i);
|
|
| 1048 | - if (i >= arr_size) return;
|
|
| 1047 | + i = findPtrBlocks(p,bd,arr,ARR_SIZE,i);
|
|
| 1048 | + if (i >= ARR_SIZE) return;
|
|
| 1049 | 1049 | }
|
| 1050 | 1050 | #endif
|
| 1051 | 1051 | |
| 1052 | 1052 | for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
|
| 1053 | 1053 | bd = generations[g].blocks;
|
| 1054 | - i = findPtrBlocks(p,bd,arr,arr_size,i);
|
|
| 1054 | + i = findPtrBlocks(p,bd,arr,ARR_SIZE,i);
|
|
| 1055 | 1055 | bd = generations[g].large_objects;
|
| 1056 | - i = findPtrBlocks(p,bd,arr,arr_size,i);
|
|
| 1057 | - if (i >= arr_size) return;
|
|
| 1056 | + i = findPtrBlocks(p,bd,arr,ARR_SIZE,i);
|
|
| 1057 | + if (i >= ARR_SIZE) return;
|
|
| 1058 | 1058 | for (n = 0; n < getNumCapabilities(); n++) {
|
| 1059 | 1059 | i = findPtrBlocks(p, gc_threads[n]->gens[g].part_list,
|
| 1060 | - arr, arr_size, i);
|
|
| 1060 | + arr, ARR_SIZE, i);
|
|
| 1061 | 1061 | i = findPtrBlocks(p, gc_threads[n]->gens[g].todo_bd,
|
| 1062 | - arr, arr_size, i);
|
|
| 1062 | + arr, ARR_SIZE, i);
|
|
| 1063 | 1063 | }
|
| 1064 | - if (i >= arr_size) return;
|
|
| 1064 | + if (i >= ARR_SIZE) return;
|
|
| 1065 | 1065 | }
|
| 1066 | 1066 | if (follow && i == 1) {
|
| 1067 | 1067 | debugBelch("-->\n");
|
| ... | ... | @@ -145,30 +145,28 @@ W_ hs_mulIntMayOflo(W_ a, W_ b); |
| 145 | 145 | |
| 146 | 146 | /* rts/prim/int64x2minmax and rts/prim/vectorQuotRem */
|
| 147 | 147 | #if defined(__SSE2__)
|
| 148 | -#include <emmintrin.h>
|
|
| 149 | - |
|
| 150 | -__m128i hs_minInt64X2(__m128i, __m128i);
|
|
| 151 | -__m128i hs_maxInt64X2(__m128i, __m128i);
|
|
| 152 | -__m128i hs_minWord64X2(__m128i, __m128i);
|
|
| 153 | -__m128i hs_maxWord64X2(__m128i, __m128i);
|
|
| 154 | - |
|
| 155 | -__m128i hs_quotInt8X16(__m128i, __m128i);
|
|
| 156 | -__m128i hs_quotInt16X8(__m128i, __m128i);
|
|
| 157 | -__m128i hs_quotInt32X4(__m128i, __m128i);
|
|
| 158 | -__m128i hs_quotInt64X2(__m128i, __m128i);
|
|
| 159 | -__m128i hs_quotWord8X16(__m128i, __m128i);
|
|
| 160 | -__m128i hs_quotWord16X8(__m128i, __m128i);
|
|
| 161 | -__m128i hs_quotWord32X4(__m128i, __m128i);
|
|
| 162 | -__m128i hs_quotWord64X2(__m128i, __m128i);
|
|
| 163 | -__m128i hs_remInt8X16(__m128i, __m128i);
|
|
| 164 | -__m128i hs_remInt16X8(__m128i, __m128i);
|
|
| 165 | -__m128i hs_remInt32X4(__m128i, __m128i);
|
|
| 166 | -__m128i hs_remInt64X2(__m128i, __m128i);
|
|
| 167 | -__m128i hs_remWord8X16(__m128i, __m128i);
|
|
| 168 | -__m128i hs_remWord16X8(__m128i, __m128i);
|
|
| 169 | -__m128i hs_remWord32X4(__m128i, __m128i);
|
|
| 170 | -__m128i hs_remWord64X2(__m128i, __m128i);
|
|
| 171 | - |
|
| 148 | +typedef char v128 __attribute__((vector_size(16)));
|
|
| 149 | +v128 hs_minInt64X2(v128, v128);
|
|
| 150 | +v128 hs_maxInt64X2(v128, v128);
|
|
| 151 | +v128 hs_minWord64X2(v128, v128);
|
|
| 152 | +v128 hs_maxWord64X2(v128, v128);
|
|
| 153 | + |
|
| 154 | +v128 hs_quotInt8X16(v128, v128);
|
|
| 155 | +v128 hs_quotInt16X8(v128, v128);
|
|
| 156 | +v128 hs_quotInt32X4(v128, v128);
|
|
| 157 | +v128 hs_quotInt64X2(v128, v128);
|
|
| 158 | +v128 hs_quotWord8X16(v128, v128);
|
|
| 159 | +v128 hs_quotWord16X8(v128, v128);
|
|
| 160 | +v128 hs_quotWord32X4(v128, v128);
|
|
| 161 | +v128 hs_quotWord64X2(v128, v128);
|
|
| 162 | +v128 hs_remInt8X16(v128, v128);
|
|
| 163 | +v128 hs_remInt16X8(v128, v128);
|
|
| 164 | +v128 hs_remInt32X4(v128, v128);
|
|
| 165 | +v128 hs_remInt64X2(v128, v128);
|
|
| 166 | +v128 hs_remWord8X16(v128, v128);
|
|
| 167 | +v128 hs_remWord16X8(v128, v128);
|
|
| 168 | +v128 hs_remWord32X4(v128, v128);
|
|
| 169 | +v128 hs_remWord64X2(v128, v128);
|
|
| 172 | 170 | #endif
|
| 173 | 171 | |
| 174 | 172 | /* bitcasts, instead of creating a new C file we static inline these here. We
|
| ... | ... | @@ -585,7 +585,7 @@ void *osReserveHeapMemory(void *startAddressPtr, W_ *len) |
| 585 | 585 | }
|
| 586 | 586 | #endif
|
| 587 | 587 | |
| 588 | - const int MAX_ATTEMPTS = 256;
|
|
| 588 | +#define MAX_ATTEMPTS 256
|
|
| 589 | 589 | void *bad_allocs[MAX_ATTEMPTS];
|
| 590 | 590 | size_t bad_alloc_lens[MAX_ATTEMPTS];
|
| 591 | 591 | memset(bad_allocs, 0, sizeof(void*) * MAX_ATTEMPTS);
|
| ... | ... | @@ -12,44 +12,44 @@ |
| 12 | 12 | // * enable SSE4.2, or
|
| 13 | 13 | // * implement min/max in NCG.
|
| 14 | 14 | |
| 15 | -__m128i hs_minInt64X2(__m128i xx, __m128i yy)
|
|
| 15 | +v128 hs_minInt64X2(v128 xx, v128 yy)
|
|
| 16 | 16 | {
|
| 17 | 17 | int64_t x[2], y[2];
|
| 18 | 18 | memcpy(x, &xx, 16);
|
| 19 | 19 | memcpy(y, &yy, 16);
|
| 20 | 20 | int64_t z0 = x[0] < y[0] ? x[0] : y[0];
|
| 21 | 21 | int64_t z1 = x[1] < y[1] ? x[1] : y[1];
|
| 22 | - return _mm_set_epi64x(z1, z0);
|
|
| 22 | + return (v128) _mm_set_epi64x(z1, z0);
|
|
| 23 | 23 | }
|
| 24 | 24 | |
| 25 | -__m128i hs_maxInt64X2(__m128i xx, __m128i yy)
|
|
| 25 | +v128 hs_maxInt64X2(v128 xx, v128 yy)
|
|
| 26 | 26 | {
|
| 27 | 27 | int64_t x[2], y[2];
|
| 28 | 28 | memcpy(x, &xx, 16);
|
| 29 | 29 | memcpy(y, &yy, 16);
|
| 30 | 30 | int64_t z0 = x[0] < y[0] ? y[0] : x[0];
|
| 31 | 31 | int64_t z1 = x[1] < y[1] ? y[1] : x[1];
|
| 32 | - return _mm_set_epi64x(z1, z0);
|
|
| 32 | + return (v128) _mm_set_epi64x(z1, z0);
|
|
| 33 | 33 | }
|
| 34 | 34 | |
| 35 | -__m128i hs_minWord64X2(__m128i xx, __m128i yy)
|
|
| 35 | +v128 hs_minWord64X2(v128 xx, v128 yy)
|
|
| 36 | 36 | {
|
| 37 | 37 | uint64_t x[2], y[2];
|
| 38 | 38 | memcpy(x, &xx, 16);
|
| 39 | 39 | memcpy(y, &yy, 16);
|
| 40 | 40 | uint64_t z0 = x[0] < y[0] ? x[0] : y[0];
|
| 41 | 41 | uint64_t z1 = x[1] < y[1] ? x[1] : y[1];
|
| 42 | - return _mm_set_epi64x(z1, z0);
|
|
| 42 | + return (v128) _mm_set_epi64x(z1, z0);
|
|
| 43 | 43 | }
|
| 44 | 44 | |
| 45 | -__m128i hs_maxWord64X2(__m128i xx, __m128i yy)
|
|
| 45 | +v128 hs_maxWord64X2(v128 xx, v128 yy)
|
|
| 46 | 46 | {
|
| 47 | 47 | uint64_t x[2], y[2];
|
| 48 | 48 | memcpy(x, &xx, 16);
|
| 49 | 49 | memcpy(y, &yy, 16);
|
| 50 | 50 | uint64_t z0 = x[0] < y[0] ? y[0] : x[0];
|
| 51 | 51 | uint64_t z1 = x[1] < y[1] ? y[1] : x[1];
|
| 52 | - return _mm_set_epi64x(z1, z0);
|
|
| 52 | + return (v128) _mm_set_epi64x(z1, z0);
|
|
| 53 | 53 | }
|
| 54 | 54 | |
| 55 | 55 | #endif |
| ... | ... | @@ -16,7 +16,7 @@ int8x16_t hs_quotInt8X16(int8x16_t x, int8x16_t y) |
| 16 | 16 | |
| 17 | 17 | */
|
| 18 | 18 | |
| 19 | -__m128i hs_quotInt8X16(__m128i xx, __m128i yy)
|
|
| 19 | +v128 hs_quotInt8X16(v128 xx, v128 yy)
|
|
| 20 | 20 | {
|
| 21 | 21 | int8_t x[16], y[16];
|
| 22 | 22 | memcpy(x, &xx, 16);
|
| ... | ... | @@ -37,10 +37,10 @@ __m128i hs_quotInt8X16(__m128i xx, __m128i yy) |
| 37 | 37 | int8_t z13 = x[13] / y[13];
|
| 38 | 38 | int8_t z14 = x[14] / y[14];
|
| 39 | 39 | int8_t z15 = x[15] / y[15];
|
| 40 | - return _mm_set_epi8(z15, z14, z13, z12, z11, z10, z9, z8, z7, z6, z5, z4, z3, z2, z1, z0);
|
|
| 40 | + return (v128) _mm_set_epi8(z15, z14, z13, z12, z11, z10, z9, z8, z7, z6, z5, z4, z3, z2, z1, z0);
|
|
| 41 | 41 | }
|
| 42 | 42 | |
| 43 | -__m128i hs_quotInt16X8(__m128i xx, __m128i yy)
|
|
| 43 | +v128 hs_quotInt16X8(v128 xx, v128 yy)
|
|
| 44 | 44 | {
|
| 45 | 45 | int16_t x[8], y[8];
|
| 46 | 46 | memcpy(x, &xx, 16);
|
| ... | ... | @@ -53,10 +53,10 @@ __m128i hs_quotInt16X8(__m128i xx, __m128i yy) |
| 53 | 53 | int16_t z5 = x[5] / y[5];
|
| 54 | 54 | int16_t z6 = x[6] / y[6];
|
| 55 | 55 | int16_t z7 = x[7] / y[7];
|
| 56 | - return _mm_set_epi16(z7, z6, z5, z4, z3, z2, z1, z0);
|
|
| 56 | + return (v128) _mm_set_epi16(z7, z6, z5, z4, z3, z2, z1, z0);
|
|
| 57 | 57 | }
|
| 58 | 58 | |
| 59 | -__m128i hs_quotInt32X4(__m128i xx, __m128i yy)
|
|
| 59 | +v128 hs_quotInt32X4(v128 xx, v128 yy)
|
|
| 60 | 60 | {
|
| 61 | 61 | int32_t x[4], y[4];
|
| 62 | 62 | memcpy(x, &xx, 16);
|
| ... | ... | @@ -65,20 +65,20 @@ __m128i hs_quotInt32X4(__m128i xx, __m128i yy) |
| 65 | 65 | int32_t z1 = x[1] / y[1];
|
| 66 | 66 | int32_t z2 = x[2] / y[2];
|
| 67 | 67 | int32_t z3 = x[3] / y[3];
|
| 68 | - return _mm_set_epi32(z3, z2, z1, z0);
|
|
| 68 | + return (v128) _mm_set_epi32(z3, z2, z1, z0);
|
|
| 69 | 69 | }
|
| 70 | 70 | |
| 71 | -__m128i hs_quotInt64X2(__m128i xx, __m128i yy)
|
|
| 71 | +v128 hs_quotInt64X2(v128 xx, v128 yy)
|
|
| 72 | 72 | {
|
| 73 | 73 | int64_t x[2], y[2];
|
| 74 | 74 | memcpy(x, &xx, 16);
|
| 75 | 75 | memcpy(y, &yy, 16);
|
| 76 | 76 | int64_t z0 = x[0] / y[0];
|
| 77 | 77 | int64_t z1 = x[1] / y[1];
|
| 78 | - return _mm_set_epi64x(z1, z0);
|
|
| 78 | + return (v128) _mm_set_epi64x(z1, z0);
|
|
| 79 | 79 | }
|
| 80 | 80 | |
| 81 | -__m128i hs_quotWord8X16(__m128i xx, __m128i yy)
|
|
| 81 | +v128 hs_quotWord8X16(v128 xx, v128 yy)
|
|
| 82 | 82 | {
|
| 83 | 83 | uint8_t x[16], y[16];
|
| 84 | 84 | memcpy(x, &xx, 16);
|
| ... | ... | @@ -99,10 +99,10 @@ __m128i hs_quotWord8X16(__m128i xx, __m128i yy) |
| 99 | 99 | uint8_t z13 = x[13] / y[13];
|
| 100 | 100 | uint8_t z14 = x[14] / y[14];
|
| 101 | 101 | uint8_t z15 = x[15] / y[15];
|
| 102 | - return _mm_set_epi8(z15, z14, z13, z12, z11, z10, z9, z8, z7, z6, z5, z4, z3, z2, z1, z0);
|
|
| 102 | + return (v128) _mm_set_epi8(z15, z14, z13, z12, z11, z10, z9, z8, z7, z6, z5, z4, z3, z2, z1, z0);
|
|
| 103 | 103 | }
|
| 104 | 104 | |
| 105 | -__m128i hs_quotWord16X8(__m128i xx, __m128i yy)
|
|
| 105 | +v128 hs_quotWord16X8(v128 xx, v128 yy)
|
|
| 106 | 106 | {
|
| 107 | 107 | uint16_t x[8], y[8];
|
| 108 | 108 | memcpy(x, &xx, 16);
|
| ... | ... | @@ -115,10 +115,10 @@ __m128i hs_quotWord16X8(__m128i xx, __m128i yy) |
| 115 | 115 | uint16_t z5 = x[5] / y[5];
|
| 116 | 116 | uint16_t z6 = x[6] / y[6];
|
| 117 | 117 | uint16_t z7 = x[7] / y[7];
|
| 118 | - return _mm_set_epi16(z7, z6, z5, z4, z3, z2, z1, z0);
|
|
| 118 | + return (v128) _mm_set_epi16(z7, z6, z5, z4, z3, z2, z1, z0);
|
|
| 119 | 119 | }
|
| 120 | 120 | |
| 121 | -__m128i hs_quotWord32X4(__m128i xx, __m128i yy)
|
|
| 121 | +v128 hs_quotWord32X4(v128 xx, v128 yy)
|
|
| 122 | 122 | {
|
| 123 | 123 | uint32_t x[4], y[4];
|
| 124 | 124 | memcpy(x, &xx, 16);
|
| ... | ... | @@ -127,20 +127,20 @@ __m128i hs_quotWord32X4(__m128i xx, __m128i yy) |
| 127 | 127 | uint32_t z1 = x[1] / y[1];
|
| 128 | 128 | uint32_t z2 = x[2] / y[2];
|
| 129 | 129 | uint32_t z3 = x[3] / y[3];
|
| 130 | - return _mm_set_epi32(z3, z2, z1, z0);
|
|
| 130 | + return (v128) _mm_set_epi32(z3, z2, z1, z0);
|
|
| 131 | 131 | }
|
| 132 | 132 | |
| 133 | -__m128i hs_quotWord64X2(__m128i xx, __m128i yy)
|
|
| 133 | +v128 hs_quotWord64X2(v128 xx, v128 yy)
|
|
| 134 | 134 | {
|
| 135 | 135 | uint64_t x[2], y[2];
|
| 136 | 136 | memcpy(x, &xx, 16);
|
| 137 | 137 | memcpy(y, &yy, 16);
|
| 138 | 138 | uint64_t z0 = x[0] / y[0];
|
| 139 | 139 | uint64_t z1 = x[1] / y[1];
|
| 140 | - return _mm_set_epi64x(z1, z0);
|
|
| 140 | + return (v128) _mm_set_epi64x(z1, z0);
|
|
| 141 | 141 | }
|
| 142 | 142 | |
| 143 | -__m128i hs_remInt8X16(__m128i xx, __m128i yy)
|
|
| 143 | +v128 hs_remInt8X16(v128 xx, v128 yy)
|
|
| 144 | 144 | {
|
| 145 | 145 | int8_t x[16], y[16];
|
| 146 | 146 | memcpy(x, &xx, 16);
|
| ... | ... | @@ -161,10 +161,10 @@ __m128i hs_remInt8X16(__m128i xx, __m128i yy) |
| 161 | 161 | int8_t z13 = x[13] % y[13];
|
| 162 | 162 | int8_t z14 = x[14] % y[14];
|
| 163 | 163 | int8_t z15 = x[15] % y[15];
|
| 164 | - return _mm_set_epi8(z15, z14, z13, z12, z11, z10, z9, z8, z7, z6, z5, z4, z3, z2, z1, z0);
|
|
| 164 | + return (v128) _mm_set_epi8(z15, z14, z13, z12, z11, z10, z9, z8, z7, z6, z5, z4, z3, z2, z1, z0);
|
|
| 165 | 165 | }
|
| 166 | 166 | |
| 167 | -__m128i hs_remInt16X8(__m128i xx, __m128i yy)
|
|
| 167 | +v128 hs_remInt16X8(v128 xx, v128 yy)
|
|
| 168 | 168 | {
|
| 169 | 169 | int16_t x[8], y[8];
|
| 170 | 170 | memcpy(x, &xx, 16);
|
| ... | ... | @@ -177,10 +177,10 @@ __m128i hs_remInt16X8(__m128i xx, __m128i yy) |
| 177 | 177 | int16_t z5 = x[5] % y[5];
|
| 178 | 178 | int16_t z6 = x[6] % y[6];
|
| 179 | 179 | int16_t z7 = x[7] % y[7];
|
| 180 | - return _mm_set_epi16(z7, z6, z5, z4, z3, z2, z1, z0);
|
|
| 180 | + return (v128) _mm_set_epi16(z7, z6, z5, z4, z3, z2, z1, z0);
|
|
| 181 | 181 | }
|
| 182 | 182 | |
| 183 | -__m128i hs_remInt32X4(__m128i xx, __m128i yy)
|
|
| 183 | +v128 hs_remInt32X4(v128 xx, v128 yy)
|
|
| 184 | 184 | {
|
| 185 | 185 | int32_t x[4], y[4];
|
| 186 | 186 | memcpy(x, &xx, 16);
|
| ... | ... | @@ -189,20 +189,20 @@ __m128i hs_remInt32X4(__m128i xx, __m128i yy) |
| 189 | 189 | int32_t z1 = x[1] % y[1];
|
| 190 | 190 | int32_t z2 = x[2] % y[2];
|
| 191 | 191 | int32_t z3 = x[3] % y[3];
|
| 192 | - return _mm_set_epi32(z3, z2, z1, z0);
|
|
| 192 | + return (v128) _mm_set_epi32(z3, z2, z1, z0);
|
|
| 193 | 193 | }
|
| 194 | 194 | |
| 195 | -__m128i hs_remInt64X2(__m128i xx, __m128i yy)
|
|
| 195 | +v128 hs_remInt64X2(v128 xx, v128 yy)
|
|
| 196 | 196 | {
|
| 197 | 197 | int64_t x[2], y[2];
|
| 198 | 198 | memcpy(x, &xx, 16);
|
| 199 | 199 | memcpy(y, &yy, 16);
|
| 200 | 200 | int64_t z0 = x[0] % y[0];
|
| 201 | 201 | int64_t z1 = x[1] % y[1];
|
| 202 | - return _mm_set_epi64x(z1, z0);
|
|
| 202 | + return (v128) _mm_set_epi64x(z1, z0);
|
|
| 203 | 203 | }
|
| 204 | 204 | |
| 205 | -__m128i hs_remWord8X16(__m128i xx, __m128i yy)
|
|
| 205 | +v128 hs_remWord8X16(v128 xx, v128 yy)
|
|
| 206 | 206 | {
|
| 207 | 207 | uint8_t x[16], y[16];
|
| 208 | 208 | memcpy(x, &xx, 16);
|
| ... | ... | @@ -223,10 +223,10 @@ __m128i hs_remWord8X16(__m128i xx, __m128i yy) |
| 223 | 223 | uint8_t z13 = x[13] % y[13];
|
| 224 | 224 | uint8_t z14 = x[14] % y[14];
|
| 225 | 225 | uint8_t z15 = x[15] % y[15];
|
| 226 | - return _mm_set_epi8(z15, z14, z13, z12, z11, z10, z9, z8, z7, z6, z5, z4, z3, z2, z1, z0);
|
|
| 226 | + return (v128) _mm_set_epi8(z15, z14, z13, z12, z11, z10, z9, z8, z7, z6, z5, z4, z3, z2, z1, z0);
|
|
| 227 | 227 | }
|
| 228 | 228 | |
| 229 | -__m128i hs_remWord16X8(__m128i xx, __m128i yy)
|
|
| 229 | +v128 hs_remWord16X8(v128 xx, v128 yy)
|
|
| 230 | 230 | {
|
| 231 | 231 | uint16_t x[8], y[8];
|
| 232 | 232 | memcpy(x, &xx, 16);
|
| ... | ... | @@ -239,10 +239,10 @@ __m128i hs_remWord16X8(__m128i xx, __m128i yy) |
| 239 | 239 | uint16_t z5 = x[5] % y[5];
|
| 240 | 240 | uint16_t z6 = x[6] % y[6];
|
| 241 | 241 | uint16_t z7 = x[7] % y[7];
|
| 242 | - return _mm_set_epi16(z7, z6, z5, z4, z3, z2, z1, z0);
|
|
| 242 | + return (v128) _mm_set_epi16(z7, z6, z5, z4, z3, z2, z1, z0);
|
|
| 243 | 243 | }
|
| 244 | 244 | |
| 245 | -__m128i hs_remWord32X4(__m128i xx, __m128i yy)
|
|
| 245 | +v128 hs_remWord32X4(v128 xx, v128 yy)
|
|
| 246 | 246 | {
|
| 247 | 247 | uint32_t x[4], y[4];
|
| 248 | 248 | memcpy(x, &xx, 16);
|
| ... | ... | @@ -251,17 +251,17 @@ __m128i hs_remWord32X4(__m128i xx, __m128i yy) |
| 251 | 251 | uint32_t z1 = x[1] % y[1];
|
| 252 | 252 | uint32_t z2 = x[2] % y[2];
|
| 253 | 253 | uint32_t z3 = x[3] % y[3];
|
| 254 | - return _mm_set_epi32(z3, z2, z1, z0);
|
|
| 254 | + return (v128) _mm_set_epi32(z3, z2, z1, z0);
|
|
| 255 | 255 | }
|
| 256 | 256 | |
| 257 | -__m128i hs_remWord64X2(__m128i xx, __m128i yy)
|
|
| 257 | +v128 hs_remWord64X2(v128 xx, v128 yy)
|
|
| 258 | 258 | {
|
| 259 | 259 | uint64_t x[2], y[2];
|
| 260 | 260 | memcpy(x, &xx, 16);
|
| 261 | 261 | memcpy(y, &yy, 16);
|
| 262 | 262 | uint64_t z0 = x[0] % y[0];
|
| 263 | 263 | uint64_t z1 = x[1] % y[1];
|
| 264 | - return _mm_set_epi64x(z1, z0);
|
|
| 264 | + return (v128) _mm_set_epi64x(z1, z0);
|
|
| 265 | 265 | }
|
| 266 | 266 | |
| 267 | 267 | #endif |