| ... |
... |
@@ -1294,7 +1294,7 @@ getRegister' config plat expr = |
|
1294
|
1294
|
MO_V_Sub length w -> vecOp (intVecFormat length w) VSUB
|
|
1295
|
1295
|
MO_VF_Mul length w -> vecOp (floatVecFormat length w) VMUL
|
|
1296
|
1296
|
MO_V_Mul length w -> vecOp (intVecFormat length w) VMUL
|
|
1297
|
|
- MO_VF_Quot length w -> vecOp (floatVecFormat length w) VQUOT
|
|
|
1297
|
+ MO_VF_Quot length w -> vecOp (floatVecFormat length w) (VQUOT Nothing)
|
|
1298
|
1298
|
-- See https://godbolt.org/z/PvcWKMKoW
|
|
1299
|
1299
|
MO_VS_Min length w -> vecOp (intVecFormat length w) VSMIN
|
|
1300
|
1300
|
MO_VS_Max length w -> vecOp (intVecFormat length w) VSMAX
|
| ... |
... |
@@ -1302,6 +1302,66 @@ getRegister' config plat expr = |
|
1302
|
1302
|
MO_VU_Max length w -> vecOp (intVecFormat length w) VUMAX
|
|
1303
|
1303
|
MO_VF_Min length w -> vecOp (floatVecFormat length w) VFMIN
|
|
1304
|
1304
|
MO_VF_Max length w -> vecOp (floatVecFormat length w) VFMAX
|
|
|
1305
|
+ MO_V_Shuffle length w idxs -> do
|
|
|
1306
|
+ -- Our strategy:
|
|
|
1307
|
+ -- - Gather elemens of v1 on the right positions
|
|
|
1308
|
+ -- - Gather elemenrs of v2 of the right positions
|
|
|
1309
|
+ -- - Merge v1 and v2 with an adequate bitmask (v0)
|
|
|
1310
|
+ lbl_selVec_v1 <- getNewLabelNat
|
|
|
1311
|
+ lbl_selVec_v2 <- getNewLabelNat
|
|
|
1312
|
+
|
|
|
1313
|
+ (reg_x, format_x, code_x) <- getSomeReg x
|
|
|
1314
|
+ (reg_y, format_y, code_y) <- getSomeReg y
|
|
|
1315
|
+
|
|
|
1316
|
+ let (idxs_v1, idxs_v2) =
|
|
|
1317
|
+ mapTuple reverse
|
|
|
1318
|
+ $ foldl'
|
|
|
1319
|
+ ( \(acc1, acc2) i ->
|
|
|
1320
|
+ if i < length then (Just i : acc1, Nothing : acc2) else (Nothing : acc1, Just (i - length) : acc2)
|
|
|
1321
|
+ )
|
|
|
1322
|
+ ([], [])
|
|
|
1323
|
+ idxs
|
|
|
1324
|
+ selVecData_v1 = selVecData idxs_v1
|
|
|
1325
|
+ selVecData_v2 = selVecData idxs_v2
|
|
|
1326
|
+ selVecFormat = intVecFormat length W16
|
|
|
1327
|
+ dstFormat = intVecFormat length w
|
|
|
1328
|
+ addrFormat = intFormat W64
|
|
|
1329
|
+ sel_v1 <- getNewRegNat selVecFormat
|
|
|
1330
|
+ sel_v2 <- getNewRegNat selVecFormat
|
|
|
1331
|
+ sel_v1_addr <- getNewRegNat addrFormat
|
|
|
1332
|
+ sel_v2_addr <- getNewRegNat addrFormat
|
|
|
1333
|
+ gathered_x <- getNewRegNat format_x
|
|
|
1334
|
+ gathered_y <- getNewRegNat format_y
|
|
|
1335
|
+ pure $ Any dstFormat $ \dst ->
|
|
|
1336
|
+ toOL
|
|
|
1337
|
+ [ LDATA (Section ReadOnlyData lbl_selVec_v1) (CmmStaticsRaw lbl_selVec_v1 selVecData_v1),
|
|
|
1338
|
+ LDATA (Section ReadOnlyData lbl_selVec_v2) (CmmStaticsRaw lbl_selVec_v2 selVecData_v2)
|
|
|
1339
|
+ ]
|
|
|
1340
|
+ `appOL` code_x
|
|
|
1341
|
+ `appOL` code_y
|
|
|
1342
|
+ `appOL` toOL
|
|
|
1343
|
+ [ LDR addrFormat (OpReg addrFormat sel_v1_addr) (OpImm (ImmCLbl lbl_selVec_v1)),
|
|
|
1344
|
+ LDR addrFormat (OpReg addrFormat sel_v2_addr) (OpImm (ImmCLbl lbl_selVec_v2)),
|
|
|
1345
|
+ LDRU selVecFormat (OpReg selVecFormat sel_v1) (OpAddr (AddrReg sel_v1_addr)),
|
|
|
1346
|
+ LDRU selVecFormat (OpReg selVecFormat sel_v2) (OpAddr (AddrReg sel_v2_addr)),
|
|
|
1347
|
+ VRGATHER (OpReg format_x gathered_x) (OpReg format_x reg_x) (OpReg selVecFormat sel_v1),
|
|
|
1348
|
+ VRGATHER (OpReg format_y gathered_y) (OpReg format_y reg_y) (OpReg selVecFormat sel_v2),
|
|
|
1349
|
+ VMV (OpReg selVecFormat v0Reg) (OpReg selVecFormat sel_v1),
|
|
|
1350
|
+ VMERGE (OpReg dstFormat dst)(OpReg format_x gathered_x)(OpReg format_y gathered_y) (OpReg selVecFormat v0Reg)
|
|
|
1351
|
+ ]
|
|
|
1352
|
+ where
|
|
|
1353
|
+ mapTuple :: (a -> b) -> (a, a) -> (b, b)
|
|
|
1354
|
+ mapTuple f (x, y) = (f x, f y)
|
|
|
1355
|
+ selVecData :: [Maybe Int] -> [CmmStatic]
|
|
|
1356
|
+ selVecData idxs =
|
|
|
1357
|
+ (CmmStaticLit . (flip CmmInt) W16 . fromIntegral)
|
|
|
1358
|
+ `map` ( map
|
|
|
1359
|
+ ( \i -> case i of
|
|
|
1360
|
+ Just i' -> i'
|
|
|
1361
|
+ Nothing -> 0
|
|
|
1362
|
+ )
|
|
|
1363
|
+ idxs
|
|
|
1364
|
+ )
|
|
1305
|
1365
|
_e -> panic $ "Missing operation " ++ show expr
|
|
1306
|
1366
|
|
|
1307
|
1367
|
-- Generic ternary case.
|
| ... |
... |
@@ -1331,7 +1391,6 @@ getRegister' config plat expr = |
|
1331
|
1391
|
expr
|
|
1332
|
1392
|
(VMV (OpReg targetFormat dst) (OpReg format_x reg_x))
|
|
1333
|
1393
|
`snocOL` VFMA var (OpReg targetFormat dst) (OpReg format_y reg_y) (OpReg format_z reg_z)
|
|
1334
|
|
-
|
|
1335
|
1394
|
MO_VF_Insert length width -> vecInsert floatVecFormat length width
|
|
1336
|
1395
|
MO_V_Insert length width -> vecInsert intVecFormat length width
|
|
1337
|
1396
|
_ ->
|
| ... |
... |
@@ -1348,7 +1407,7 @@ getRegister' config plat expr = |
|
1348
|
1407
|
(reg_idx, format_idx, code_idx) <- getSomeReg z
|
|
1349
|
1408
|
let format = toFormat length width
|
|
1350
|
1409
|
format_mask = intVecFormat length W8 -- Actually, W1 (one bit) would be correct, but that does not exist.
|
|
1351
|
|
- format_vid = intVecFormat length vidWidth
|
|
|
1410
|
+ format_vid = intVecFormat length (vidWidth length)
|
|
1352
|
1411
|
vidReg <- getNewRegNat format_vid
|
|
1353
|
1412
|
tmp <- getNewRegNat format
|
|
1354
|
1413
|
pure $ Any format $ \dst ->
|
| ... |
... |
@@ -1373,18 +1432,20 @@ getRegister' config plat expr = |
|
1373
|
1432
|
`snocOL`
|
|
1374
|
1433
|
-- 4. Merge with mask -> set element at index
|
|
1375
|
1434
|
VMERGE (OpReg format dst) (OpReg format_v reg_v) (OpReg format tmp) (OpReg format_mask v0Reg)
|
|
|
1435
|
+
|
|
|
1436
|
+ -- Which element width do I need in my vector to store indexes in it?
|
|
|
1437
|
+ vidWidth :: Int -> Width
|
|
|
1438
|
+ vidWidth length = case bitWidthFixed (fromIntegral length :: Word) of
|
|
|
1439
|
+ x
|
|
|
1440
|
+ | x <= widthInBits W8 -> W8
|
|
|
1441
|
+ | x <= widthInBits W16 -> W16
|
|
|
1442
|
+ | x <= widthInBits W32 -> W32
|
|
|
1443
|
+ | x <= widthInBits W64 -> W64
|
|
|
1444
|
+ | x <= widthInBits W128 -> W128
|
|
|
1445
|
+ | x <= widthInBits W256 -> W256
|
|
|
1446
|
+ | x <= widthInBits W512 -> W512
|
|
|
1447
|
+ e -> panic $ "length " ++ show length ++ "not representable in a single element's Width (" ++ show e ++ ")"
|
|
1376
|
1448
|
where
|
|
1377
|
|
- -- Which element width do I need in my vector to store indexes in it?
|
|
1378
|
|
- vidWidth = case bitWidthFixed (fromIntegral length :: Word) of
|
|
1379
|
|
- x
|
|
1380
|
|
- | x <= widthInBits W8 -> W8
|
|
1381
|
|
- | x <= widthInBits W16 -> W16
|
|
1382
|
|
- | x <= widthInBits W32 -> W32
|
|
1383
|
|
- | x <= widthInBits W64 -> W64
|
|
1384
|
|
- | x <= widthInBits W128 -> W128
|
|
1385
|
|
- | x <= widthInBits W256 -> W256
|
|
1386
|
|
- | x <= widthInBits W512 -> W512
|
|
1387
|
|
- e -> panic $ "length " ++ show length ++ "not representable in a single element's Width (" ++ show e ++ ")"
|
|
1388
|
1449
|
bitWidthFixed :: Word -> Int
|
|
1389
|
1450
|
bitWidthFixed 0 = 1
|
|
1390
|
1451
|
bitWidthFixed n = finiteBitSize n - countLeadingZeros n
|
| ... |
... |
@@ -1489,14 +1550,6 @@ getRegister' config plat expr = |
|
1489
|
1550
|
)
|
|
1490
|
1551
|
|
|
1491
|
1552
|
-- TODO: Missing MachOps:
|
|
1492
|
|
--- - MO_V_Add
|
|
1493
|
|
--- - MO_V_Sub
|
|
1494
|
|
--- - MO_V_Mul
|
|
1495
|
|
--- - MO_VS_Quot
|
|
1496
|
|
--- - MO_VS_Rem
|
|
1497
|
|
--- - MO_VS_Neg
|
|
1498
|
|
--- - MO_VU_Quot
|
|
1499
|
|
--- - MO_VU_Rem
|
|
1500
|
1553
|
-- - MO_V_Shuffle
|
|
1501
|
1554
|
-- - MO_VF_Shuffle
|
|
1502
|
1555
|
|
| ... |
... |
@@ -2142,19 +2195,45 @@ genCCall (PrimTarget mop) dest_regs arg_regs = do |
|
2142
|
2195
|
MO_AddIntC _w -> unsupported mop
|
|
2143
|
2196
|
MO_SubIntC _w -> unsupported mop
|
|
2144
|
2197
|
MO_U_Mul2 _w -> unsupported mop
|
|
|
2198
|
+ MO_VS_Quot length w
|
|
|
2199
|
+ | [x, y] <- arg_regs,
|
|
|
2200
|
+ [dst_reg] <- dest_regs ->
|
|
|
2201
|
+ v3op mop (intVecFormat length w) dst_reg x y (VQUOT (Just Signed))
|
|
2145
|
2202
|
MO_VS_Quot {} -> unsupported mop
|
|
|
2203
|
+ MO_VU_Quot length w
|
|
|
2204
|
+ | [x, y] <- arg_regs,
|
|
|
2205
|
+ [dst_reg] <- dest_regs ->
|
|
|
2206
|
+ v3op mop (intVecFormat length w) dst_reg x y (VQUOT (Just Unsigned))
|
|
2146
|
2207
|
MO_VU_Quot {} -> unsupported mop
|
|
2147
|
2208
|
MO_VS_Rem length w
|
|
2148
|
2209
|
| [x, y] <- arg_regs,
|
|
2149
|
|
- [dst_reg] <- dest_regs -> vrem mop length w dst_reg x y Signed
|
|
|
2210
|
+ [dst_reg] <- dest_regs ->
|
|
|
2211
|
+ v3op mop (intVecFormat length w) dst_reg x y (VREM Signed)
|
|
2150
|
2212
|
MO_VS_Rem {} -> unsupported mop
|
|
2151
|
2213
|
MO_VU_Rem length w
|
|
2152
|
2214
|
| [x, y] <- arg_regs,
|
|
2153
|
|
- [dst_reg] <- dest_regs -> vrem mop length w dst_reg x y Unsigned
|
|
|
2215
|
+ [dst_reg] <- dest_regs ->
|
|
|
2216
|
+ v3op mop (intVecFormat length w) dst_reg x y (VREM Unsigned)
|
|
2154
|
2217
|
MO_VU_Rem {} -> unsupported mop
|
|
|
2218
|
+ MO_I64X2_Min
|
|
|
2219
|
+ | [x, y] <- arg_regs,
|
|
|
2220
|
+ [dst_reg] <- dest_regs ->
|
|
|
2221
|
+ v3op mop (intVecFormat 2 W64) dst_reg x y VSMIN
|
|
2155
|
2222
|
MO_I64X2_Min -> unsupported mop
|
|
|
2223
|
+ MO_I64X2_Max
|
|
|
2224
|
+ | [x, y] <- arg_regs,
|
|
|
2225
|
+ [dst_reg] <- dest_regs ->
|
|
|
2226
|
+ v3op mop (intVecFormat 2 W64) dst_reg x y VSMAX
|
|
2156
|
2227
|
MO_I64X2_Max -> unsupported mop
|
|
|
2228
|
+ MO_W64X2_Min
|
|
|
2229
|
+ | [x, y] <- arg_regs,
|
|
|
2230
|
+ [dst_reg] <- dest_regs ->
|
|
|
2231
|
+ v3op mop (intVecFormat 2 W64) dst_reg x y VUMIN
|
|
2157
|
2232
|
MO_W64X2_Min -> unsupported mop
|
|
|
2233
|
+ MO_W64X2_Max
|
|
|
2234
|
+ | [x, y] <- arg_regs,
|
|
|
2235
|
+ [dst_reg] <- dest_regs ->
|
|
|
2236
|
+ v3op mop (intVecFormat 2 W64) dst_reg x y VUMAX
|
|
2158
|
2237
|
MO_W64X2_Max -> unsupported mop
|
|
2159
|
2238
|
-- Memory Ordering
|
|
2160
|
2239
|
-- The related C functions are:
|
| ... |
... |
@@ -2275,24 +2354,23 @@ genCCall (PrimTarget mop) dest_regs arg_regs = do |
|
2275
|
2354
|
let code = code_fx `appOL` op (OpReg fmt dst) (OpReg format_x reg_fx)
|
|
2276
|
2355
|
pure code
|
|
2277
|
2356
|
|
|
2278
|
|
- vrem :: CallishMachOp -> Int -> Width -> LocalReg -> CmmExpr -> CmmExpr -> Signage -> NatM InstrBlock
|
|
2279
|
|
- vrem mop length w dst_reg x y s = do
|
|
2280
|
|
- platform <- getPlatform
|
|
2281
|
|
- let dst = getRegisterReg platform (CmmLocal dst_reg)
|
|
2282
|
|
- format = intVecFormat length w
|
|
2283
|
|
- moDescr = pprCallishMachOp mop
|
|
2284
|
|
- (reg_x, format_x, code_x) <- getSomeReg x
|
|
2285
|
|
- (reg_y, format_y, code_y) <- getSomeReg y
|
|
2286
|
|
- massertPpr (isVecFormat format_x && isVecFormat format_y)
|
|
2287
|
|
- $ text "vecOp: non-vector operand. operands: "
|
|
2288
|
|
- <+> ppr format_x
|
|
2289
|
|
- <+> ppr format_y
|
|
2290
|
|
- pure
|
|
2291
|
|
- $ code_x
|
|
2292
|
|
- `appOL` code_y
|
|
2293
|
|
- `snocOL`
|
|
2294
|
|
- ann moDescr
|
|
2295
|
|
- (VREM s (OpReg format dst) (OpReg format_x reg_x) (OpReg format_y reg_y))
|
|
|
2357
|
+ v3op :: CallishMachOp -> Format -> LocalReg -> CmmExpr -> CmmExpr -> (Operand -> Operand -> Operand -> Instr) -> NatM InstrBlock
|
|
|
2358
|
+ v3op mop dst_format dst_reg x y op = do
|
|
|
2359
|
+ platform <- getPlatform
|
|
|
2360
|
+ let dst = getRegisterReg platform (CmmLocal dst_reg)
|
|
|
2361
|
+ moDescr = pprCallishMachOp mop
|
|
|
2362
|
+ (reg_x, format_x, code_x) <- getSomeReg x
|
|
|
2363
|
+ (reg_y, format_y, code_y) <- getSomeReg y
|
|
|
2364
|
+ massertPpr (isVecFormat format_x && isVecFormat format_y)
|
|
|
2365
|
+ $ text "vecOp: non-vector operand. operands: "
|
|
|
2366
|
+ <+> ppr format_x
|
|
|
2367
|
+ <+> ppr format_y
|
|
|
2368
|
+ pure
|
|
|
2369
|
+ $ code_x
|
|
|
2370
|
+ `appOL` code_y
|
|
|
2371
|
+ `snocOL` ann
|
|
|
2372
|
+ moDescr
|
|
|
2373
|
+ (op (OpReg dst_format dst) (OpReg format_x reg_x) (OpReg format_y reg_y))
|
|
2296
|
2374
|
|
|
2297
|
2375
|
{- Note [RISCV64 far jumps]
|
|
2298
|
2376
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
| ... |
... |
@@ -2540,6 +2618,7 @@ makeFarBranches {- only used when debugging -} _platform statics basic_blocks = |
|
2540
|
2618
|
VUMAX {} -> 2
|
|
2541
|
2619
|
VFMIN {} -> 2
|
|
2542
|
2620
|
VFMAX {} -> 2
|
|
|
2621
|
+ VRGATHER {} -> 2
|
|
2543
|
2622
|
VFMA {} -> 3
|
|
2544
|
2623
|
-- estimate the subsituted size for jumps to lables
|
|
2545
|
2624
|
-- jumps to registers have size 1
|