Ben Gamari pushed to branch wip/T26166 at Glasgow Haskell Compiler / GHC

Commits:

7 changed files:

Changes:

  • compiler/GHC/Cmm/CLabel.hs
    ... ... @@ -102,7 +102,7 @@ module GHC.Cmm.CLabel (
    102 102
             needsCDecl,
    
    103 103
             maybeLocalBlockLabel,
    
    104 104
             externallyVisibleCLabel,
    
    105
    -        isMathFun,
    
    105
    +        isLibcFun,
    
    106 106
             isCFunctionLabel,
    
    107 107
             isGcPtrLabel,
    
    108 108
             labelDynamic,
    
    ... ... @@ -1028,7 +1028,7 @@ needsCDecl (CmmLabel pkgId (NeedExternDecl external) _ _)
    1028 1028
             -- For other labels we inline one into the HC file directly.
    
    1029 1029
             | otherwise                     = True
    
    1030 1030
     
    
    1031
    -needsCDecl l@(ForeignLabel{})           = not (isMathFun l)
    
    1031
    +needsCDecl l@(ForeignLabel{})           = not (isLibcFun l)
    
    1032 1032
     needsCDecl (CC_Label _)                 = True
    
    1033 1033
     needsCDecl (CCS_Label _)                = True
    
    1034 1034
     needsCDecl (IPE_Label {})               = True
    
    ... ... @@ -1055,15 +1055,19 @@ maybeLocalBlockLabel _ = Nothing
    1055 1055
     
    
    1056 1056
     
    
    1057 1057
     -- | Check whether a label corresponds to a C function that has
    
    1058
    ---      a prototype in a system header somewhere, or is built-in
    
    1059
    ---      to the C compiler. For these labels we avoid generating our
    
    1060
    ---      own C prototypes.
    
    1061
    -isMathFun :: CLabel -> Bool
    
    1062
    -isMathFun (ForeignLabel fs _ _)       = fs `elementOfUniqSet` math_funs
    
    1063
    -isMathFun _ = False
    
    1064
    -
    
    1065
    -math_funs :: UniqSet FastString
    
    1066
    -math_funs = mkUniqSet [
    
    1058
    +-- a prototype in a system header somewhere, or is built-in
    
    1059
    +-- to the C compiler. For these labels we avoid generating our
    
    1060
    +-- own C prototypes.
    
    1061
    +isLibcFun :: CLabel -> Bool
    
    1062
    +isLibcFun (ForeignLabel fs _ _)  = fs `elementOfUniqSet` libc_funs
    
    1063
    +isLibcFun _ = False
    
    1064
    +
    
    1065
    +libc_funs :: UniqSet FastString
    
    1066
    +libc_funs = mkUniqSet [
    
    1067
    +        ---------------------
    
    1068
    +        -- Math functions
    
    1069
    +        ---------------------
    
    1070
    +
    
    1067 1071
             -- _ISOC99_SOURCE
    
    1068 1072
             (fsLit "acos"),         (fsLit "acosf"),        (fsLit "acosh"),
    
    1069 1073
             (fsLit "acoshf"),       (fsLit "acoshl"),       (fsLit "acosl"),
    

  • compiler/GHC/CmmToC.hs
    ... ... @@ -245,7 +245,7 @@ pprStmt platform stmt =
    245 245
                   CmmLit (CmmLabel lbl)
    
    246 246
                     | CmmNeverReturns <- ret ->
    
    247 247
                         pprCall platform cast_fn cconv hresults hargs <> semi <> text "__builtin_unreachable();"
    
    248
    -                | not (isMathFun lbl) ->
    
    248
    +                | not (isLibcFun lbl) ->
    
    249 249
                         pprForeignCall platform (pprCLabel platform lbl) cconv hresults hargs
    
    250 250
                   _ ->
    
    251 251
                         pprCall platform cast_fn cconv hresults hargs <> semi
    

  • rts/Printer.c
    ... ... @@ -1033,8 +1033,8 @@ findPtr(P_ p, int follow)
    1033 1033
     {
    
    1034 1034
       uint32_t g, n;
    
    1035 1035
       bdescr *bd;
    
    1036
    -  const int arr_size = 1024;
    
    1037
    -  StgPtr arr[arr_size];
    
    1036
    +#define ARR_SIZE 1024
    
    1037
    +  StgPtr arr[ARR_SIZE];
    
    1038 1038
       int i = 0;
    
    1039 1039
       searched = 0;
    
    1040 1040
     
    
    ... ... @@ -1044,24 +1044,24 @@ findPtr(P_ p, int follow)
    1044 1044
       // just before a block is used.
    
    1045 1045
       for (n = 0; n < getNumCapabilities(); n++) {
    
    1046 1046
           bd = nurseries[i].blocks;
    
    1047
    -      i = findPtrBlocks(p,bd,arr,arr_size,i);
    
    1048
    -      if (i >= arr_size) return;
    
    1047
    +      i = findPtrBlocks(p,bd,arr,ARR_SIZE,i);
    
    1048
    +      if (i >= ARR_SIZE) return;
    
    1049 1049
       }
    
    1050 1050
     #endif
    
    1051 1051
     
    
    1052 1052
       for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
    
    1053 1053
           bd = generations[g].blocks;
    
    1054
    -      i = findPtrBlocks(p,bd,arr,arr_size,i);
    
    1054
    +      i = findPtrBlocks(p,bd,arr,ARR_SIZE,i);
    
    1055 1055
           bd = generations[g].large_objects;
    
    1056
    -      i = findPtrBlocks(p,bd,arr,arr_size,i);
    
    1057
    -      if (i >= arr_size) return;
    
    1056
    +      i = findPtrBlocks(p,bd,arr,ARR_SIZE,i);
    
    1057
    +      if (i >= ARR_SIZE) return;
    
    1058 1058
           for (n = 0; n < getNumCapabilities(); n++) {
    
    1059 1059
               i = findPtrBlocks(p, gc_threads[n]->gens[g].part_list,
    
    1060
    -                            arr, arr_size, i);
    
    1060
    +                            arr, ARR_SIZE, i);
    
    1061 1061
               i = findPtrBlocks(p, gc_threads[n]->gens[g].todo_bd,
    
    1062
    -                            arr, arr_size, i);
    
    1062
    +                            arr, ARR_SIZE, i);
    
    1063 1063
           }
    
    1064
    -      if (i >= arr_size) return;
    
    1064
    +      if (i >= ARR_SIZE) return;
    
    1065 1065
       }
    
    1066 1066
       if (follow && i == 1) {
    
    1067 1067
           debugBelch("-->\n");
    

  • rts/include/stg/Prim.h
    ... ... @@ -145,30 +145,28 @@ W_ hs_mulIntMayOflo(W_ a, W_ b);
    145 145
     
    
    146 146
     /* rts/prim/int64x2minmax and rts/prim/vectorQuotRem */
    
    147 147
     #if defined(__SSE2__)
    
    148
    -#include <emmintrin.h>
    
    149
    -
    
    150
    -__m128i hs_minInt64X2(__m128i, __m128i);
    
    151
    -__m128i hs_maxInt64X2(__m128i, __m128i);
    
    152
    -__m128i hs_minWord64X2(__m128i, __m128i);
    
    153
    -__m128i hs_maxWord64X2(__m128i, __m128i);
    
    154
    -
    
    155
    -__m128i hs_quotInt8X16(__m128i, __m128i);
    
    156
    -__m128i hs_quotInt16X8(__m128i, __m128i);
    
    157
    -__m128i hs_quotInt32X4(__m128i, __m128i);
    
    158
    -__m128i hs_quotInt64X2(__m128i, __m128i);
    
    159
    -__m128i hs_quotWord8X16(__m128i, __m128i);
    
    160
    -__m128i hs_quotWord16X8(__m128i, __m128i);
    
    161
    -__m128i hs_quotWord32X4(__m128i, __m128i);
    
    162
    -__m128i hs_quotWord64X2(__m128i, __m128i);
    
    163
    -__m128i hs_remInt8X16(__m128i, __m128i);
    
    164
    -__m128i hs_remInt16X8(__m128i, __m128i);
    
    165
    -__m128i hs_remInt32X4(__m128i, __m128i);
    
    166
    -__m128i hs_remInt64X2(__m128i, __m128i);
    
    167
    -__m128i hs_remWord8X16(__m128i, __m128i);
    
    168
    -__m128i hs_remWord16X8(__m128i, __m128i);
    
    169
    -__m128i hs_remWord32X4(__m128i, __m128i);
    
    170
    -__m128i hs_remWord64X2(__m128i, __m128i);
    
    171
    -
    
    148
    +typedef char v128 __attribute__((vector_size(16)));
    
    149
    +v128 hs_minInt64X2(v128, v128);
    
    150
    +v128 hs_maxInt64X2(v128, v128);
    
    151
    +v128 hs_minWord64X2(v128, v128);
    
    152
    +v128 hs_maxWord64X2(v128, v128);
    
    153
    +
    
    154
    +v128 hs_quotInt8X16(v128, v128);
    
    155
    +v128 hs_quotInt16X8(v128, v128);
    
    156
    +v128 hs_quotInt32X4(v128, v128);
    
    157
    +v128 hs_quotInt64X2(v128, v128);
    
    158
    +v128 hs_quotWord8X16(v128, v128);
    
    159
    +v128 hs_quotWord16X8(v128, v128);
    
    160
    +v128 hs_quotWord32X4(v128, v128);
    
    161
    +v128 hs_quotWord64X2(v128, v128);
    
    162
    +v128 hs_remInt8X16(v128, v128);
    
    163
    +v128 hs_remInt16X8(v128, v128);
    
    164
    +v128 hs_remInt32X4(v128, v128);
    
    165
    +v128 hs_remInt64X2(v128, v128);
    
    166
    +v128 hs_remWord8X16(v128, v128);
    
    167
    +v128 hs_remWord16X8(v128, v128);
    
    168
    +v128 hs_remWord32X4(v128, v128);
    
    169
    +v128 hs_remWord64X2(v128, v128);
    
    172 170
     #endif
    
    173 171
     
    
    174 172
     /* bitcasts, instead of creating a new C file we static inline these here. We
    

  • rts/posix/OSMem.c
    ... ... @@ -585,7 +585,7 @@ void *osReserveHeapMemory(void *startAddressPtr, W_ *len)
    585 585
         }
    
    586 586
     #endif
    
    587 587
     
    
    588
    -    const int MAX_ATTEMPTS = 256;
    
    588
    +#define MAX_ATTEMPTS 256
    
    589 589
         void *bad_allocs[MAX_ATTEMPTS];
    
    590 590
         size_t bad_alloc_lens[MAX_ATTEMPTS];
    
    591 591
         memset(bad_allocs, 0, sizeof(void*) * MAX_ATTEMPTS);
    

  • rts/prim/int64x2minmax.c
    ... ... @@ -12,44 +12,44 @@
    12 12
     //   * enable SSE4.2, or
    
    13 13
     //   * implement min/max in NCG.
    
    14 14
     
    
    15
    -__m128i hs_minInt64X2(__m128i xx, __m128i yy)
    
    15
    +v128 hs_minInt64X2(v128 xx, v128 yy)
    
    16 16
     {
    
    17 17
       int64_t x[2], y[2];
    
    18 18
       memcpy(x, &xx, 16);
    
    19 19
       memcpy(y, &yy, 16);
    
    20 20
       int64_t z0 = x[0] < y[0] ? x[0] : y[0];
    
    21 21
       int64_t z1 = x[1] < y[1] ? x[1] : y[1];
    
    22
    -  return _mm_set_epi64x(z1, z0);
    
    22
    +  return (v128) _mm_set_epi64x(z1, z0);
    
    23 23
     }
    
    24 24
     
    
    25
    -__m128i hs_maxInt64X2(__m128i xx, __m128i yy)
    
    25
    +v128 hs_maxInt64X2(v128 xx, v128 yy)
    
    26 26
     {
    
    27 27
       int64_t x[2], y[2];
    
    28 28
       memcpy(x, &xx, 16);
    
    29 29
       memcpy(y, &yy, 16);
    
    30 30
       int64_t z0 = x[0] < y[0] ? y[0] : x[0];
    
    31 31
       int64_t z1 = x[1] < y[1] ? y[1] : x[1];
    
    32
    -  return _mm_set_epi64x(z1, z0);
    
    32
    +  return (v128) _mm_set_epi64x(z1, z0);
    
    33 33
     }
    
    34 34
     
    
    35
    -__m128i hs_minWord64X2(__m128i xx, __m128i yy)
    
    35
    +v128 hs_minWord64X2(v128 xx, v128 yy)
    
    36 36
     {
    
    37 37
       uint64_t x[2], y[2];
    
    38 38
       memcpy(x, &xx, 16);
    
    39 39
       memcpy(y, &yy, 16);
    
    40 40
       uint64_t z0 = x[0] < y[0] ? x[0] : y[0];
    
    41 41
       uint64_t z1 = x[1] < y[1] ? x[1] : y[1];
    
    42
    -  return _mm_set_epi64x(z1, z0);
    
    42
    +  return (v128) _mm_set_epi64x(z1, z0);
    
    43 43
     }
    
    44 44
     
    
    45
    -__m128i hs_maxWord64X2(__m128i xx, __m128i yy)
    
    45
    +v128 hs_maxWord64X2(v128 xx, v128 yy)
    
    46 46
     {
    
    47 47
       uint64_t x[2], y[2];
    
    48 48
       memcpy(x, &xx, 16);
    
    49 49
       memcpy(y, &yy, 16);
    
    50 50
       uint64_t z0 = x[0] < y[0] ? y[0] : x[0];
    
    51 51
       uint64_t z1 = x[1] < y[1] ? y[1] : x[1];
    
    52
    -  return _mm_set_epi64x(z1, z0);
    
    52
    +  return (v128) _mm_set_epi64x(z1, z0);
    
    53 53
     }
    
    54 54
     
    
    55 55
     #endif

  • rts/prim/vectorQuotRem.c
    ... ... @@ -16,7 +16,7 @@ int8x16_t hs_quotInt8X16(int8x16_t x, int8x16_t y)
    16 16
     
    
    17 17
     */
    
    18 18
     
    
    19
    -__m128i hs_quotInt8X16(__m128i xx, __m128i yy)
    
    19
    +v128 hs_quotInt8X16(v128 xx, v128 yy)
    
    20 20
     {
    
    21 21
       int8_t x[16], y[16];
    
    22 22
       memcpy(x, &xx, 16);
    
    ... ... @@ -37,10 +37,10 @@ __m128i hs_quotInt8X16(__m128i xx, __m128i yy)
    37 37
       int8_t z13 = x[13] / y[13];
    
    38 38
       int8_t z14 = x[14] / y[14];
    
    39 39
       int8_t z15 = x[15] / y[15];
    
    40
    -  return _mm_set_epi8(z15, z14, z13, z12, z11, z10, z9, z8, z7, z6, z5, z4, z3, z2, z1, z0);
    
    40
    +  return (v128) _mm_set_epi8(z15, z14, z13, z12, z11, z10, z9, z8, z7, z6, z5, z4, z3, z2, z1, z0);
    
    41 41
     }
    
    42 42
     
    
    43
    -__m128i hs_quotInt16X8(__m128i xx, __m128i yy)
    
    43
    +v128 hs_quotInt16X8(v128 xx, v128 yy)
    
    44 44
     {
    
    45 45
       int16_t x[8], y[8];
    
    46 46
       memcpy(x, &xx, 16);
    
    ... ... @@ -53,10 +53,10 @@ __m128i hs_quotInt16X8(__m128i xx, __m128i yy)
    53 53
       int16_t z5 = x[5] / y[5];
    
    54 54
       int16_t z6 = x[6] / y[6];
    
    55 55
       int16_t z7 = x[7] / y[7];
    
    56
    -  return _mm_set_epi16(z7, z6, z5, z4, z3, z2, z1, z0);
    
    56
    +  return (v128) _mm_set_epi16(z7, z6, z5, z4, z3, z2, z1, z0);
    
    57 57
     }
    
    58 58
     
    
    59
    -__m128i hs_quotInt32X4(__m128i xx, __m128i yy)
    
    59
    +v128 hs_quotInt32X4(v128 xx, v128 yy)
    
    60 60
     {
    
    61 61
       int32_t x[4], y[4];
    
    62 62
       memcpy(x, &xx, 16);
    
    ... ... @@ -65,20 +65,20 @@ __m128i hs_quotInt32X4(__m128i xx, __m128i yy)
    65 65
       int32_t z1 = x[1] / y[1];
    
    66 66
       int32_t z2 = x[2] / y[2];
    
    67 67
       int32_t z3 = x[3] / y[3];
    
    68
    -  return _mm_set_epi32(z3, z2, z1, z0);
    
    68
    +  return (v128) _mm_set_epi32(z3, z2, z1, z0);
    
    69 69
     }
    
    70 70
     
    
    71
    -__m128i hs_quotInt64X2(__m128i xx, __m128i yy)
    
    71
    +v128 hs_quotInt64X2(v128 xx, v128 yy)
    
    72 72
     {
    
    73 73
       int64_t x[2], y[2];
    
    74 74
       memcpy(x, &xx, 16);
    
    75 75
       memcpy(y, &yy, 16);
    
    76 76
       int64_t z0 = x[0] / y[0];
    
    77 77
       int64_t z1 = x[1] / y[1];
    
    78
    -  return _mm_set_epi64x(z1, z0);
    
    78
    +  return (v128) _mm_set_epi64x(z1, z0);
    
    79 79
     }
    
    80 80
     
    
    81
    -__m128i hs_quotWord8X16(__m128i xx, __m128i yy)
    
    81
    +v128 hs_quotWord8X16(v128 xx, v128 yy)
    
    82 82
     {
    
    83 83
       uint8_t x[16], y[16];
    
    84 84
       memcpy(x, &xx, 16);
    
    ... ... @@ -99,10 +99,10 @@ __m128i hs_quotWord8X16(__m128i xx, __m128i yy)
    99 99
       uint8_t z13 = x[13] / y[13];
    
    100 100
       uint8_t z14 = x[14] / y[14];
    
    101 101
       uint8_t z15 = x[15] / y[15];
    
    102
    -  return _mm_set_epi8(z15, z14, z13, z12, z11, z10, z9, z8, z7, z6, z5, z4, z3, z2, z1, z0);
    
    102
    +  return (v128) _mm_set_epi8(z15, z14, z13, z12, z11, z10, z9, z8, z7, z6, z5, z4, z3, z2, z1, z0);
    
    103 103
     }
    
    104 104
     
    
    105
    -__m128i hs_quotWord16X8(__m128i xx, __m128i yy)
    
    105
    +v128 hs_quotWord16X8(v128 xx, v128 yy)
    
    106 106
     {
    
    107 107
       uint16_t x[8], y[8];
    
    108 108
       memcpy(x, &xx, 16);
    
    ... ... @@ -115,10 +115,10 @@ __m128i hs_quotWord16X8(__m128i xx, __m128i yy)
    115 115
       uint16_t z5 = x[5] / y[5];
    
    116 116
       uint16_t z6 = x[6] / y[6];
    
    117 117
       uint16_t z7 = x[7] / y[7];
    
    118
    -  return _mm_set_epi16(z7, z6, z5, z4, z3, z2, z1, z0);
    
    118
    +  return (v128) _mm_set_epi16(z7, z6, z5, z4, z3, z2, z1, z0);
    
    119 119
     }
    
    120 120
     
    
    121
    -__m128i hs_quotWord32X4(__m128i xx, __m128i yy)
    
    121
    +v128 hs_quotWord32X4(v128 xx, v128 yy)
    
    122 122
     {
    
    123 123
       uint32_t x[4], y[4];
    
    124 124
       memcpy(x, &xx, 16);
    
    ... ... @@ -127,20 +127,20 @@ __m128i hs_quotWord32X4(__m128i xx, __m128i yy)
    127 127
       uint32_t z1 = x[1] / y[1];
    
    128 128
       uint32_t z2 = x[2] / y[2];
    
    129 129
       uint32_t z3 = x[3] / y[3];
    
    130
    -  return _mm_set_epi32(z3, z2, z1, z0);
    
    130
    +  return (v128) _mm_set_epi32(z3, z2, z1, z0);
    
    131 131
     }
    
    132 132
     
    
    133
    -__m128i hs_quotWord64X2(__m128i xx, __m128i yy)
    
    133
    +v128 hs_quotWord64X2(v128 xx, v128 yy)
    
    134 134
     {
    
    135 135
       uint64_t x[2], y[2];
    
    136 136
       memcpy(x, &xx, 16);
    
    137 137
       memcpy(y, &yy, 16);
    
    138 138
       uint64_t z0 = x[0] / y[0];
    
    139 139
       uint64_t z1 = x[1] / y[1];
    
    140
    -  return _mm_set_epi64x(z1, z0);
    
    140
    +  return (v128) _mm_set_epi64x(z1, z0);
    
    141 141
     }
    
    142 142
     
    
    143
    -__m128i hs_remInt8X16(__m128i xx, __m128i yy)
    
    143
    +v128 hs_remInt8X16(v128 xx, v128 yy)
    
    144 144
     {
    
    145 145
       int8_t x[16], y[16];
    
    146 146
       memcpy(x, &xx, 16);
    
    ... ... @@ -161,10 +161,10 @@ __m128i hs_remInt8X16(__m128i xx, __m128i yy)
    161 161
       int8_t z13 = x[13] % y[13];
    
    162 162
       int8_t z14 = x[14] % y[14];
    
    163 163
       int8_t z15 = x[15] % y[15];
    
    164
    -  return _mm_set_epi8(z15, z14, z13, z12, z11, z10, z9, z8, z7, z6, z5, z4, z3, z2, z1, z0);
    
    164
    +  return (v128) _mm_set_epi8(z15, z14, z13, z12, z11, z10, z9, z8, z7, z6, z5, z4, z3, z2, z1, z0);
    
    165 165
     }
    
    166 166
     
    
    167
    -__m128i hs_remInt16X8(__m128i xx, __m128i yy)
    
    167
    +v128 hs_remInt16X8(v128 xx, v128 yy)
    
    168 168
     {
    
    169 169
       int16_t x[8], y[8];
    
    170 170
       memcpy(x, &xx, 16);
    
    ... ... @@ -177,10 +177,10 @@ __m128i hs_remInt16X8(__m128i xx, __m128i yy)
    177 177
       int16_t z5 = x[5] % y[5];
    
    178 178
       int16_t z6 = x[6] % y[6];
    
    179 179
       int16_t z7 = x[7] % y[7];
    
    180
    -  return _mm_set_epi16(z7, z6, z5, z4, z3, z2, z1, z0);
    
    180
    +  return (v128) _mm_set_epi16(z7, z6, z5, z4, z3, z2, z1, z0);
    
    181 181
     }
    
    182 182
     
    
    183
    -__m128i hs_remInt32X4(__m128i xx, __m128i yy)
    
    183
    +v128 hs_remInt32X4(v128 xx, v128 yy)
    
    184 184
     {
    
    185 185
       int32_t x[4], y[4];
    
    186 186
       memcpy(x, &xx, 16);
    
    ... ... @@ -189,20 +189,20 @@ __m128i hs_remInt32X4(__m128i xx, __m128i yy)
    189 189
       int32_t z1 = x[1] % y[1];
    
    190 190
       int32_t z2 = x[2] % y[2];
    
    191 191
       int32_t z3 = x[3] % y[3];
    
    192
    -  return _mm_set_epi32(z3, z2, z1, z0);
    
    192
    +  return (v128) _mm_set_epi32(z3, z2, z1, z0);
    
    193 193
     }
    
    194 194
     
    
    195
    -__m128i hs_remInt64X2(__m128i xx, __m128i yy)
    
    195
    +v128 hs_remInt64X2(v128 xx, v128 yy)
    
    196 196
     {
    
    197 197
       int64_t x[2], y[2];
    
    198 198
       memcpy(x, &xx, 16);
    
    199 199
       memcpy(y, &yy, 16);
    
    200 200
       int64_t z0 = x[0] % y[0];
    
    201 201
       int64_t z1 = x[1] % y[1];
    
    202
    -  return _mm_set_epi64x(z1, z0);
    
    202
    +  return (v128) _mm_set_epi64x(z1, z0);
    
    203 203
     }
    
    204 204
     
    
    205
    -__m128i hs_remWord8X16(__m128i xx, __m128i yy)
    
    205
    +v128 hs_remWord8X16(v128 xx, v128 yy)
    
    206 206
     {
    
    207 207
       uint8_t x[16], y[16];
    
    208 208
       memcpy(x, &xx, 16);
    
    ... ... @@ -223,10 +223,10 @@ __m128i hs_remWord8X16(__m128i xx, __m128i yy)
    223 223
       uint8_t z13 = x[13] % y[13];
    
    224 224
       uint8_t z14 = x[14] % y[14];
    
    225 225
       uint8_t z15 = x[15] % y[15];
    
    226
    -  return _mm_set_epi8(z15, z14, z13, z12, z11, z10, z9, z8, z7, z6, z5, z4, z3, z2, z1, z0);
    
    226
    +  return (v128) _mm_set_epi8(z15, z14, z13, z12, z11, z10, z9, z8, z7, z6, z5, z4, z3, z2, z1, z0);
    
    227 227
     }
    
    228 228
     
    
    229
    -__m128i hs_remWord16X8(__m128i xx, __m128i yy)
    
    229
    +v128 hs_remWord16X8(v128 xx, v128 yy)
    
    230 230
     {
    
    231 231
       uint16_t x[8], y[8];
    
    232 232
       memcpy(x, &xx, 16);
    
    ... ... @@ -239,10 +239,10 @@ __m128i hs_remWord16X8(__m128i xx, __m128i yy)
    239 239
       uint16_t z5 = x[5] % y[5];
    
    240 240
       uint16_t z6 = x[6] % y[6];
    
    241 241
       uint16_t z7 = x[7] % y[7];
    
    242
    -  return _mm_set_epi16(z7, z6, z5, z4, z3, z2, z1, z0);
    
    242
    +  return (v128) _mm_set_epi16(z7, z6, z5, z4, z3, z2, z1, z0);
    
    243 243
     }
    
    244 244
     
    
    245
    -__m128i hs_remWord32X4(__m128i xx, __m128i yy)
    
    245
    +v128 hs_remWord32X4(v128 xx, v128 yy)
    
    246 246
     {
    
    247 247
       uint32_t x[4], y[4];
    
    248 248
       memcpy(x, &xx, 16);
    
    ... ... @@ -251,17 +251,17 @@ __m128i hs_remWord32X4(__m128i xx, __m128i yy)
    251 251
       uint32_t z1 = x[1] % y[1];
    
    252 252
       uint32_t z2 = x[2] % y[2];
    
    253 253
       uint32_t z3 = x[3] % y[3];
    
    254
    -  return _mm_set_epi32(z3, z2, z1, z0);
    
    254
    +  return (v128) _mm_set_epi32(z3, z2, z1, z0);
    
    255 255
     }
    
    256 256
     
    
    257
    -__m128i hs_remWord64X2(__m128i xx, __m128i yy)
    
    257
    +v128 hs_remWord64X2(v128 xx, v128 yy)
    
    258 258
     {
    
    259 259
       uint64_t x[2], y[2];
    
    260 260
       memcpy(x, &xx, 16);
    
    261 261
       memcpy(y, &yy, 16);
    
    262 262
       uint64_t z0 = x[0] % y[0];
    
    263 263
       uint64_t z1 = x[1] % y[1];
    
    264
    -  return _mm_set_epi64x(z1, z0);
    
    264
    +  return (v128) _mm_set_epi64x(z1, z0);
    
    265 265
     }
    
    266 266
     
    
    267 267
     #endif