GitLab

Sven Tennie pushed to branch wip/supersven/riscv-vectors at Glasgow Haskell Compiler / GHC

Commits:

eef2ec66
by Sven Tennie at 2025-06-22T13:41:41+02:00
```
Give up with vectors passed by reference
```
3e510f90
by Sven Tennie at 2025-06-22T16:11:22+02:00
```
Adjust test
```
6c85b16d
by Sven Tennie at 2025-06-22T16:11:40+02:00
```
Fix warnings
```
7f5ae460
by Sven Tennie at 2025-06-22T17:35:12+02:00
```
Use OrdList to insert vector configs
```

6 changed files:

compiler/GHC/CmmToAsm/RV64/CodeGen.hs
compiler/GHC/CmmToAsm/RV64/Instr.hs
compiler/GHC/CmmToAsm/RV64/Ppr.hs
testsuite/tests/simd/should_run/VectorCCallConv.hs
+ testsuite/tests/simd/should_run/VectorCCallConv.stdout
testsuite/tests/simd/should_run/VectorCCallConv_c.c

Changes:

compiler/GHC/CmmToAsm/RV64/CodeGen.hs

@@ -1934,6 +1934,27 @@ genCondBranch true false expr =
  -- -----------------------------------------------------------------------------
  --  Generating C calls
 +-- Note [RISC-V vector C calling convention]
 +-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 +--
 +-- In short:
 +--  1. The first 16 vector arguments are passed in registers v8 - v23
 +--  2. If there are free general registers, the pointers (references) to more
 +--     vectors are passed in them
 +--  3. Otherwise, the pointers are passed on stack
 +--
 +-- (1) is easy to accomplish. (2) and (3) require the vector register to be
 +-- stored with its full width. This width is unknown at compile time. So, the
 +-- natural way of storing it as temporary variable on the C stack conflicts
 +-- with GHC wanting to know the exact stack size at compile time.
 +--
 +-- One could consider to allocate space for the vector registers to be passed
 +-- by reference on the heap. However, this turned out to be very complex and is
 +-- left for later versions of this NCG.
 +--
 +-- For now, we expect that 16 vector arguments is probably sufficient for very
 +-- most function types.
++
  -- | Generate a call to a C function.
  --
  -- - Integer values are passed in GP registers a0-a7.
@@ -2033,27 +2054,6 @@ genCCall target@(ForeignTarget expr _cconv) dest_regs arg_regs = do
            `appOL` moveStackUp stackSpaceWords
    return code
    where
 -    -- TODO: Deallocate heap-allocated vectors after the main call
 -    allocVectorHeap :: (Reg, Format, ForeignHint, InstrBlock) -> NatM (Reg, Format, ForeignHint, InstrBlock)
 -    allocVectorHeap arg@(r, format, hint, code_r) | isVecFormat format = do
 -      platform <- getPlatform
 -      resRegUnq <- getUniqueM
 -      let resLocalReg = LocalReg resRegUnq b64
 -          resReg = getRegisterReg platform (CmmLocal resLocalReg)
 -      callCode <- mkCCall "malloc_vlen_vector" [resLocalReg] []
 -      let code = callCode `appOL` code_r `appOL` toOL [VS1R (OpReg format r) (OpAddr (AddrReg resReg))]
 -      pure (resReg, II64, hint, code)
 -    allocVectorHeap _ = panic "Unsupported general case"
+-
 -    mkCCall :: FastString ->   [CmmFormal] -> [CmmActual] -> NatM InstrBlock
 -    mkCCall name dest_regs arg_regs = do
 -      config <- getConfig
 -      target <-
 -        cmmMakeDynamicReference config CallReference
 -          $ mkForeignLabel name ForeignLabelInThisPackage IsFunction
 -      let cconv = ForeignConvention CCallConv [NoHint] [NoHint] CmmMayReturn
 -      genCCall (ForeignTarget target cconv) dest_regs arg_regs
+-
      -- Implementation of the RISCV ABI calling convention.
      -- https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/948463cd5dbebea7c1869e20146b17a2cc8fda2f/riscv-cc.adoc#integer-calling-convention
      passArguments :: [Reg] -> [Reg] -> [Reg] -> [(Reg, Format, ForeignHint, InstrBlock)] -> Int -> [Reg] -> InstrBlock -> NatM (Int, [Reg], InstrBlock)
@@ -2135,23 +2135,11 @@ genCCall target@(ForeignTarget expr _cconv) dest_regs arg_regs = do
                `snocOL` ann (text "Pass vector argument: " <> ppr r) mov
        passArguments gpRegs fpRegs vRegs args stackSpaceWords (vReg : accumRegs) accumCode'
 -    -- No more vector but free gp regs, and we want to pass a vector argument: Pass vector on heap and move its address to gp vector.
 -    passArguments (gpReg : gpRegs) fpRegs [] (arg@(r, format, _hint, code_r) : args) stackSpaceWords accumRegs accumCode
 -      | isVecFormat format = do
 -      (r', format', _hint, code_r') <- allocVectorHeap arg
 -      let code = code_r' `appOL` toOL [MOV (OpReg II64 gpReg) (OpReg II64 r')]
 -      passArguments gpRegs fpRegs [] args stackSpaceWords (gpReg : accumRegs) (accumCode `appOL` code)
+-
 -    -- No more vector and gp regs, and we want to pass a vector argument: Pass vector address on stack and the vector itself on heap.
 -    -- We need to put its address in the next slot
 -    -- In RISC-V terms we pass an "aggregate by reference"
 -    passArguments [] fpRegs [] (arg@(r, format, _hint, code_r) : args) stackSpaceWords accumRegs accumCode
 -      | isVecFormat format = do
 -      (r', format', _hint, code_r') <- allocVectorHeap arg
 -      let spOffet = 8 * stackSpaceWords
 -          str = STR format' (OpReg II64 r') (OpAddr (AddrRegImm spMachReg (ImmInt spOffet)))
 -          code = code_r' `snocOL` str
 -      passArguments [] fpRegs [] args (stackSpaceWords + 1) accumRegs (accumCode `appOL` code)
 +    -- No more free vector argument registers , and we want to pass a vector argument.
 +    -- See Note [RISC-V vector C calling convention]
 +    passArguments _gpRegs _fpRegs [] ((_r, format, _hint, _code_r) : _args) _stackSpaceWords _accumRegs _accumCode
 +      | isVecFormat format =
 +      panic "C call: no free vector argument registers. We only support 16 vector arguments (registers v8 - v23)."
      passArguments _ _ _ _ _ _ _ = pprPanic "passArguments" (text "invalid state")
@@ -2704,7 +2692,6 @@ makeFarBranches {- only used when debugging -} _platform statics basic_blocks =
        VFMIN {} -> 2
        VFMAX {} -> 2
        VRGATHER {} -> 2
 -      VS1R {} -> 1
        VFMA {} -> 3
        -- estimate the subsituted size for jumps to lables
        -- jumps to registers have size 1

compiler/GHC/CmmToAsm/RV64/Instr.hs

@@ -132,7 +132,6 @@ regUsageOfInstr platform instr = case instr of
    -- allocator doesn't use the src* registers as dst. (Otherwise, we end up
    -- with an illegal instruction.)
    VRGATHER dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst ++ regOp src1 ++ regOp src2)
 -  VS1R src dst -> usage (regOp src ++ regOp dst, [])
    FMA _ dst src1 src2 src3 ->
      usage (regOp src1 ++ regOp src2 ++ regOp src3, regOp dst)
    VFMA _ op1 op2 op3 ->
@@ -249,7 +248,6 @@ patchRegsOfInstr instr env = case instr of
    VFMIN o1 o2 o3 -> VFMIN (patchOp o1) (patchOp o2) (patchOp o3)
    VFMAX o1 o2 o3 -> VFMAX (patchOp o1) (patchOp o2) (patchOp o3)
    VRGATHER o1 o2 o3 -> VRGATHER (patchOp o1) (patchOp o2) (patchOp o3)
 -  VS1R o1 o2 -> VS1R (patchOp o1) (patchOp o2)
    FMA s o1 o2 o3 o4 ->
      FMA s (patchOp o1) (patchOp o2) (patchOp o3) (patchOp o4)
    VFMA s o1 o2 o3 ->
@@ -701,7 +699,6 @@ data Instr
    | VFMAX Operand Operand Operand
    | VFMA FMASign Operand Operand Operand
    | VRGATHER Operand Operand Operand
 -  | VS1R Operand Operand
  data Signage = Signed | Unsigned
    deriving (Eq, Show)
@@ -793,7 +790,6 @@ instrCon i =
      VFMIN {} -> "VFMIN"
      VFMAX {} -> "VFMAX"
      VRGATHER {} -> "VRGATHER"
 -    VS1R {} -> "VS1R"
      FMA variant _ _ _ _ ->
        case variant of
          FMAdd -> "FMADD"

compiler/GHC/CmmToAsm/RV64/Ppr.hs

@@ -22,6 +22,7 @@ import GHC.Types.Basic (Alignment, alignmentBytes, mkAlignment)
  import GHC.Types.Unique (getUnique, pprUniqueAlways)
  import GHC.Utils.Outputable
  import GHC.Utils.Panic
 +import GHC.Data.OrdList
  pprNatCmmDecl :: forall doc. (IsDoc doc) => NCGConfig -> NatCmmDecl RawCmmStatics Instr -> doc
  pprNatCmmDecl config (CmmData section dats) =
@@ -143,7 +144,7 @@ pprBasicBlock ::
  pprBasicBlock config info_env (BasicBlock blockid instrs) =
    maybe_infotable
      $ pprLabel platform asmLbl
 -    $$ vcat (map (pprInstr platform) (id {-detectTrivialDeadlock-} instrs'))
 +    $$ (vcat . fromOL) (mapOL (pprInstr platform) (id {-detectTrivialDeadlock-} instrs'))
      $$ ppWhen
        (ncgDwarfEnabled config)
        ( -- Emit both end labels since this may end up being a standalone
@@ -154,7 +155,7 @@ pprBasicBlock config info_env (BasicBlock blockid instrs) =
+           )
+       )
    where
 -    instrs' = injectVectorConfig optInstrs
 +    instrs' = injectVectorConfig (toOL optInstrs)
      -- TODO: Check if we can  filter more instructions here.
      -- TODO: Shouldn't this be a more general check on a higher level? And, is this still needed?
      -- Filter out identity moves. E.g. mov x18, x18 will be dropped.
@@ -163,34 +164,34 @@ pprBasicBlock config info_env (BasicBlock blockid instrs) =
          f (MOV o1 o2) | o1 == o2 = False
          f _ = True
 -    injectVectorConfig :: [Instr] -> [Instr]
 -    injectVectorConfig instrs = fst $ foldl injectVectorConfig' ([], Nothing) instrs
 +    injectVectorConfig :: OrdList Instr -> OrdList Instr
 +    injectVectorConfig instrs = fst $ foldlOL injectVectorConfig' (nilOL, Nothing) instrs
      -- TODO: Fuse this with optInstrs
      -- TODO: Check config and only run this when vectors are configured
      -- TODO: Check if vectorMinBits is sufficient for the vector config
 -    injectVectorConfig' :: ([Instr], Maybe Format) -> Instr -> ([Instr], Maybe Format)
 +    injectVectorConfig' :: (OrdList Instr, Maybe Format) -> Instr -> (OrdList Instr, Maybe Format)
      injectVectorConfig' (accInstr, configuredVecFmt) currInstr =
        let configuredVecFmt' Nothing = Nothing
            configuredVecFmt' (Just fmt') = if isJumpishInstr currInstr then Nothing else Just fmt'
         in case (configuredVecFmt, instrVecFormat platform currInstr) of
 -            (fmtA, Nothing) ->
 +            (_fmtA, Nothing) ->
                -- no vector instruction
                ( accInstr
 -                  -- TODO: The performance of this appending is probably horrible. Check OrdList.
 -                  ++ [ -- (MULTILINE_COMMENT (text "No vector instruction" <> colon <+> text (instrCon currInstr) <+> pprInstr platform currInstr <> dot <> text "Current context" <> colon <+> ppr fmtA <> comma <+> text "New context" <+> ppr (configuredVecFmt' configuredVecFmt))),
 -                       currInstr
 -                     ],
 +                  `appOL` toOL
 +                    [ -- (MULTILINE_COMMENT (text "No vector instruction" <> colon <+> text (instrCon currInstr) <+> pprInstr platform currInstr)),
 +                      currInstr
 +                    ],
                  configuredVecFmt' configuredVecFmt
+               )
              (Nothing, Just fmtB) ->
                -- vector instruction, but no active config
                ( accInstr
 -                  -- TODO: The performance of this appending is probably horrible. Check OrdList.
 -                  ++ [ COMMENT (text "No active vector config. Setting" <+> ppr fmtB),
 -                       (configVec fmtB),
 -                       currInstr
 -                     ],
 +                  `appOL` toOL
 +                    [ COMMENT (text "No active vector config. Setting" <+> ppr fmtB),
 +                      (configVec fmtB),
 +                      currInstr
 +                    ],
                  configuredVecFmt' (Just fmtB)
+               )
              (Just fmtA, Just fmtB) ->
@@ -198,15 +199,20 @@ pprBasicBlock config info_env (BasicBlock blockid instrs) =
                  then
                    -- vectors already correctly configured
                    ( accInstr
 -                      -- TODO: The performance of this appending is probably horrible. Check OrdList.
 -                      ++ [COMMENT (text "Active vector config. Keeping" <+> ppr fmtB), currInstr],
 +                      `appOL` toOL
 +                        [ COMMENT (text "Active vector config. Keeping" <+> ppr fmtB),
 +                          currInstr
 +                        ],
                      configuredVecFmt' (Just fmtA)
+                   )
                  else
                    -- re-configure
                    ( accInstr
 -                      -- TODO: The performance of this appending is probably horrible. Check OrdList.
 -                      ++ [(COMMENT (text "Wrong active vector config. Setting" <+> ppr fmtB)), (configVec fmtB), currInstr],
 +                      `appOL` toOL
 +                        [ (COMMENT (text "Wrong active vector config. Setting" <+> ppr fmtB)),
 +                          (configVec fmtB),
 +                          currInstr
 +                        ],
                      configuredVecFmt' (Just fmtB)
+                   )
@@ -876,8 +882,6 @@ pprInstr platform instr = case instr of
    VFMAX o1 o2 o3 -> pprPanic "RV64.pprInstr - VFMAX wrong operands." (pprOps platform [o1, o2, o3])
    VRGATHER o1 o2 o3 | allVectorRegOps [o1, o2, o3] -> op3 (text "\tvrgather.vv") o1 o2 o3
    VRGATHER o1 o2 o3 -> pprPanic "RV64.pprInstr - VRGATHER wrong operands." (pprOps platform [o1, o2, o3])
 -  VS1R o1 o2  | isVectorRegOp o1 -> op2 (text "\tvs1r.v") o1 o2
 -  VS1R o1 o2  -> pprPanic "RV64.pprInstr - VS1R wrong operands." (pprOps platform [o1, o2])
    instr -> panic $ "RV64.pprInstr - Unknown instruction: " ++ instrCon instr
    where
      op1 op o1 = line $ op <+> pprOp platform o1

testsuite/tests/simd/should_run/VectorCCallConv.hs

@@ -9,16 +9,10 @@ module Main where
  import Data.Int
  import GHC.Int
  import GHC.Prim
 +import System.IO
  foreign import ccall "printVecs_int64x2_c"
    printVecs_int64x2# ::
 -    Int64X2# -> -- v1
 -    Int64X2# -> -- v2
 -    Int64X2# -> -- v3
 -    Int64X2# -> -- v4
 -    Int64X2# -> -- v5
 -    Int64X2# -> -- v6
 -    Int64X2# -> -- v7
      Int64X2# -> -- v8
      Int64X2# -> -- v9
      Int64X2# -> -- v10
@@ -35,75 +29,38 @@ foreign import ccall "printVecs_int64x2_c"
      Int64X2# -> -- v21
      Int64X2# -> -- v22
      Int64X2# -> -- v23
 -    Int64X2# -> -- v24
 -    Int64X2# -> -- v25
 -    --    Int64X2# -> -- v26
 -    --    Int64X2# -> -- v27
 -    --    Int64X2# -> -- v28
 -    --    Int64X2# -> -- v29
 -    --    Int64X2# -> -- v30
 -    --    Int64X2# ->
 -    --    Int64X2# ->
 -    --    Int64X2# ->
 -    --    Int64X2# ->
 -    --    Int64X2# ->
 -    --    Int64X2# ->
      IO ()
 --- foreign import ccall "return_int64X2"
 ---   return_int64X2# :: (# #) -> Int64X2#
 ---
 --- unpackInt64X2 :: Int64X2# -> (Int64, Int64)
 --- unpackInt64X2 v = case unpackInt64X2# v of
 ---   (# x0, x1 #) -> (I64# x0, I64# x1)
 +foreign import ccall "return_int64X2"
 +  return_int64X2# :: (# #) -> Int64X2#
++
 +unpackInt64X2 :: Int64X2# -> (Int64, Int64)
 +unpackInt64X2 v = case unpackInt64X2# v of
 +  (# x0, x1 #) -> (I64# x0, I64# x1)
  main :: IO ()
  main = do
 -  let v1 = packInt64X2# (# 0#Int64, 1#Int64 #)
 -      v2 = packInt64X2# (# 2#Int64, 3#Int64 #)
 -      v3 = packInt64X2# (# 4#Int64, 5#Int64 #)
 -      v4 = packInt64X2# (# 6#Int64, 7#Int64 #)
 -      v5 = packInt64X2# (# 8#Int64, 9#Int64 #)
 -      v6 = packInt64X2# (# 10#Int64, 11#Int64 #)
 -      v7 = packInt64X2# (# 12#Int64, 13#Int64 #)
 -      v8 = packInt64X2# (# 14#Int64, 15#Int64 #)
 -      v9 = packInt64X2# (# 16#Int64, 17#Int64 #)
 -      v10 = packInt64X2# (# 18#Int64, 19#Int64 #)
 -      v11 = packInt64X2# (# 20#Int64, 21#Int64 #)
 -      v12 = packInt64X2# (# 22#Int64, 23#Int64 #)
 -      v13 = packInt64X2# (# 24#Int64, 25#Int64 #)
 -      v14 = packInt64X2# (# 26#Int64, 27#Int64 #)
 -      v15 = packInt64X2# (# 28#Int64, 29#Int64 #)
 -      v16 = packInt64X2# (# 30#Int64, 31#Int64 #)
 -      v17 = packInt64X2# (# 32#Int64, 33#Int64 #)
 -      v18 = packInt64X2# (# 34#Int64, 35#Int64 #)
 -      v19 = packInt64X2# (# 36#Int64, 37#Int64 #)
 -      v20 = packInt64X2# (# 38#Int64, 39#Int64 #)
 -      v21 = packInt64X2# (# 40#Int64, 41#Int64 #)
 -      v22 = packInt64X2# (# 42#Int64, 43#Int64 #)
 -      v23 = packInt64X2# (# 44#Int64, 45#Int64 #)
 -      v24 = packInt64X2# (# 46#Int64, 47#Int64 #)
 -      v25 = packInt64X2# (# 48#Int64, 49#Int64 #)
 -      v26 = packInt64X2# (# 50#Int64, 51#Int64 #)
 -      v27 = packInt64X2# (# 52#Int64, 53#Int64 #)
 -      v28 = packInt64X2# (# 54#Int64, 55#Int64 #)
 -      v29 = packInt64X2# (# 56#Int64, 57#Int64 #)
 -      v30 = packInt64X2# (# 58#Int64, 59#Int64 #)
 -  --      v31 = packInt64X2# (# 60#Int64, 61#Int64 #)
 -  --      v32 = packInt64X2# (# 62#Int64, 63#Int64 #)
 -  --      v33 = packInt64X2# (# 64#Int64, 65#Int64 #)
 -  --      v34 = packInt64X2# (# 66#Int64, 67#Int64 #)
 -  --      v35 = packInt64X2# (# 68#Int64, 69#Int64 #)
 -  --      v36 = packInt64X2# (# 70#Int64, 71#Int64 #)
 +  -- Use some negative values to fill more bits and discover possible overlaps.
 +  let v8 = packInt64X2# (# 0#Int64, -1#Int64 #)
 +      v9 = packInt64X2# (# -2#Int64, 3#Int64 #)
 +      v10 = packInt64X2# (# -4#Int64, 5#Int64 #)
 +      v11 = packInt64X2# (# -6#Int64, 7#Int64 #)
 +      v12 = packInt64X2# (# -8#Int64, 9#Int64 #)
 +      v13 = packInt64X2# (# -10#Int64, 11#Int64 #)
 +      v14 = packInt64X2# (# -12#Int64, 13#Int64 #)
 +      v15 = packInt64X2# (# -14#Int64, 15#Int64 #)
 +      v16 = packInt64X2# (# -16#Int64, 17#Int64 #)
 +      v17 = packInt64X2# (# -18#Int64, 19#Int64 #)
 +      v18 = packInt64X2# (# -20#Int64, 21#Int64 #)
 +      v19 = packInt64X2# (# -22#Int64, 23#Int64 #)
 +      v20 = packInt64X2# (# -24#Int64, 25#Int64 #)
 +      v21 = packInt64X2# (# -26#Int64, 27#Int64 #)
 +      v22 = packInt64X2# (# -28#Int64, 29#Int64 #)
 +      v23 = packInt64X2# (# -30#Int64, 31#Int64 #)
 +  print "Arguments"
 +  hFlush stdout
    printVecs_int64x2#
 -    v1
 -    v2
 -    v3
 -    v4
 -    v5
 -    v6
 -    v7
      v8
      v9
      v10
@@ -120,22 +77,7 @@ main = do
      v21
      v22
      v23
 -    v24
 -    v25
+-
 ---    v26
+-
 ---    v27
 ---    v28
 ---    v29
 ---    v30
+-
 ---    v31
 ---    v32
 ---    v33
 ---    v34
 ---    v35
 ---    v26
 ---    let v = return_int64X2#
 ---    print $ unpackInt64X2 v
 +  print "Return values"
 +  let v = return_int64X2# (# #)
 +  print $ unpackInt64X2 v

testsuite/tests/simd/should_run/VectorCCallConv.stdout

 +"Arguments"
 +[0, -1]
 +[-2, 3]
 +[-4, 5]
 +[-6, 7]
 +[-8, 9]
 +[-10, 11]
 +[-12, 13]
 +[-14, 15]
 +[-16, 17]
 +[-18, 19]
 +[-20, 21]
 +[-22, 23]
 +[-24, 25]
 +[-26, 27]
 +[-28, 29]
 +[-30, 31]
 +"Return values"
 +(-9223372036854775808,9223372036854775807)

testsuite/tests/simd/should_run/VectorCCallConv_c.c

@@ -19,10 +19,7 @@ void printVecs_int64x2_c(vint64m1_t v8, vint64m1_t v9, vint64m1_t v10,
                           vint64m1_t v14, vint64m1_t v15, vint64m1_t v16,
                           vint64m1_t v17, vint64m1_t v18, vint64m1_t v19,
                           vint64m1_t v20, vint64m1_t v21, vint64m1_t v22,
 -                         vint64m1_t v23, vint64m1_t a0, vint64m1_t a1,
 -                         vint64m1_t a2, vint64m1_t a3, vint64m1_t a4,
 -                         vint64m1_t a5, vint64m1_t a6, vint64m1_t a7,
 -                         vint64m1_t s0) {
 +                         vint64m1_t v23) {
    printVec_int64(v8, 2);
    printVec_int64(v9, 2);
    printVec_int64(v10, 2);
@@ -39,31 +36,11 @@ void printVecs_int64x2_c(vint64m1_t v8, vint64m1_t v9, vint64m1_t v10,
    printVec_int64(v21, 2);
    printVec_int64(v22, 2);
    printVec_int64(v23, 2);
 -  printVec_int64(a0, 2);
 -  printVec_int64(a1, 2);
 -  printVec_int64(a2, 2);
 -  printVec_int64(a3, 2);
 -  printVec_int64(a4, 2);
 -  printVec_int64(a5, 2);
 -  printVec_int64(a6, 2);
 -  printVec_int64(a7, 2);
 -  printVec_int64(s0, 2);
 -  // printVec_int64(v26, 2);
 -  //  printVec_int64(v27, 2);
 -  //  printVec_int64(v28, 2);
 -  //  printVec_int64(v29, 2);
 -  //  printVec_int64(v30, 2);
 -  //  printVec_int64(v31, 2);
 -  //  printVec_int64(v32, 2);
 -  //  printVec_int64(v33, 2);
 -  //  printVec_int64(v34, 2);
 -  //  printVec_int64(v35, 2);
 -  //  printVec_int64(v36, 2);
 -  //
++
    fflush(stdout);
+ }
 -// vint64m1_t return_int64X2() {
 -//   int64_t v[] = {INT64_MIN, INT64_MAX};
 -//   return __riscv_vle64_v_i64m1(v, 2);
 -// }
 +vint64m1_t return_int64X2() {
 +  int64_t v[] = {INT64_MIN, INT64_MAX};
 +  return __riscv_vle64_v_i64m1(v, 2);
 +}