Simon Peyton Jones pushed to branch wip/T23109a at Glasgow Haskell Compiler / GHC
Commits:
-
d4279916
by Simon Peyton Jones at 2025-04-29T11:25:18+01:00
-
2d087046
by Simon Peyton Jones at 2025-04-29T11:25:28+01:00
-
680f1b60
by Simon Peyton Jones at 2025-04-29T11:25:28+01:00
10 changed files:
- compiler/GHC/Core/Opt/OccurAnal.hs
- compiler/GHC/Core/Opt/Pipeline.hs
- compiler/GHC/Core/Opt/SetLevels.hs
- compiler/GHC/Core/Opt/Simplify/Env.hs
- compiler/GHC/Core/Opt/Simplify/Iteration.hs
- compiler/GHC/Core/Opt/Simplify/Utils.hs
- compiler/GHC/Core/Utils.hs
- libraries/ghc-internal/src/GHC/Internal/Base.hs
- libraries/ghc-internal/src/GHC/Internal/Float.hs
- testsuite/tests/simplCore/should_run/simplrun009.hs
Changes:
... | ... | @@ -55,7 +55,7 @@ import GHC.Types.Tickish |
55 | 55 | import GHC.Types.Var.Set
|
56 | 56 | import GHC.Types.Var.Env
|
57 | 57 | import GHC.Types.Var
|
58 | -import GHC.Types.Demand ( argOneShots, argsOneShots, isDeadEndSig )
|
|
58 | +import GHC.Types.Demand ( argOneShots, argsOneShots {- , isDeadEndSig -} )
|
|
59 | 59 | |
60 | 60 | import GHC.Utils.Outputable
|
61 | 61 | import GHC.Utils.Panic
|
... | ... | @@ -1096,14 +1096,14 @@ mkNonRecRhsCtxt lvl bndr unf |
1096 | 1096 | certainly_inline -- See Note [Cascading inlines]
|
1097 | 1097 | = -- mkNonRecRhsCtxt is only used for non-join points, so occAnalBind
|
1098 | 1098 | -- has set the OccInfo for this binder before calling occAnalNonRecRhs
|
1099 | + -- Distressing delicacy ... has to line up with preInlineUnconditionally
|
|
1099 | 1100 | case idOccInfo bndr of
|
1100 | 1101 | OneOcc { occ_in_lam = NotInsideLam, occ_n_br = 1 }
|
1101 | - -> active && not stable_unf && not top_bottoming
|
|
1102 | + -> active && not (isTopLevel lvl) && not stable_unf
|
|
1102 | 1103 | _ -> False
|
1103 | 1104 | |
1104 | 1105 | active = isAlwaysActive (idInlineActivation bndr)
|
1105 | 1106 | stable_unf = isStableUnfolding unf
|
1106 | - top_bottoming = isTopLevel lvl && isDeadEndId bndr
|
|
1107 | 1107 | |
1108 | 1108 | -----------------
|
1109 | 1109 | occAnalRecBind :: OccEnv -> TopLevelFlag -> ImpRuleEdges -> [(Var,CoreExpr)]
|
... | ... | @@ -2580,8 +2580,9 @@ occAnalArgs !env fun args !one_shots |
2580 | 2580 | |
2581 | 2581 | -- Make bottoming functions interesting
|
2582 | 2582 | -- See Note [Bottoming function calls]
|
2583 | - encl | Var f <- fun, isDeadEndSig (idDmdSig f) = OccScrut
|
|
2584 | - | otherwise = OccVanilla
|
|
2583 | +-- encl | Var f <- fun, isDeadEndSig (idDmdSig f) = OccScrut
|
|
2584 | +-- | otherwise = OccVanilla
|
|
2585 | + encl = OccVanilla
|
|
2585 | 2586 | |
2586 | 2587 | go uds fun [] _ = WUD uds fun
|
2587 | 2588 | go uds fun (arg:args) one_shots
|
... | ... | @@ -2606,7 +2607,8 @@ Consider |
2606 | 2607 | let x = (a,b) in
|
2607 | 2608 | case p of
|
2608 | 2609 | A -> ...(error x)..
|
2609 | - B -> ...(ertor x)...
|
|
2610 | + B -> ...(error x)...
|
|
2611 | + C -> ..blah...
|
|
2610 | 2612 | |
2611 | 2613 | postInlineUnconditionally may duplicate x's binding, but sometimes it
|
2612 | 2614 | does so only if the use site IsInteresting. Pushing allocation into error
|
... | ... | @@ -2616,6 +2618,9 @@ setting occ_encl = OccScrut for such calls. |
2616 | 2618 | The slightly-artificial test T21128 is a good example. It's probably
|
2617 | 2619 | not a huge deal.
|
2618 | 2620 | |
2621 | +ToDo!!! Fix comment. Now postinlineUnconditionally ignores intersting-ness for
|
|
2622 | +non-top-level things.
|
|
2623 | + |
|
2619 | 2624 | Note [Arguments of let-bound constructors]
|
2620 | 2625 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
2621 | 2626 | Consider
|
... | ... | @@ -217,7 +217,7 @@ getCoreToDo dflags hpt_rule_base extra_vars |
217 | 217 | if full_laziness then
|
218 | 218 | CoreDoFloatOutwards $ FloatOutSwitches
|
219 | 219 | { floatOutLambdas = Just 0
|
220 | - , floatOutConstants = True
|
|
220 | + , floatOutConstants = False -- Initially
|
|
221 | 221 | , floatOutOverSatApps = False
|
222 | 222 | , floatToTopLevelOnly = False
|
223 | 223 | , floatJoinsToTop = False -- Initially, don't float join points at all
|
... | ... | @@ -284,7 +284,7 @@ getCoreToDo dflags hpt_rule_base extra_vars |
284 | 284 | -- f_el22 (f_el21 r_midblock)
|
285 | 285 | runWhen full_laziness $ CoreDoFloatOutwards $ FloatOutSwitches
|
286 | 286 | { floatOutLambdas = floatLamArgs dflags
|
287 | - , floatOutConstants = True
|
|
287 | + , floatOutConstants = True -- For SpecConstr and CSE
|
|
288 | 288 | , floatOutOverSatApps = True
|
289 | 289 | , floatToTopLevelOnly = False
|
290 | 290 | , floatJoinsToTop = True },
|
... | ... | @@ -406,7 +406,7 @@ lvlApp env orig_expr ((_,AnnVar fn), args) |
406 | 406 | , arity < n_val_args
|
407 | 407 | , Nothing <- isClassOpId_maybe fn
|
408 | 408 | = do { rargs' <- mapM (lvlNonTailMFE env False) rargs
|
409 | - ; lapp' <- lvlNonTailMFE env False lapp
|
|
409 | + ; lapp' <- lvlNonTailMFE env True lapp
|
|
410 | 410 | ; return (foldl' App lapp' rargs') }
|
411 | 411 | |
412 | 412 | | otherwise
|
... | ... | @@ -482,14 +482,14 @@ Consider this: |
482 | 482 | f :: T Int -> blah
|
483 | 483 | f x vs = case x of { MkT y ->
|
484 | 484 | let f vs = ...(case y of I# w -> e)...f..
|
485 | - in f vs
|
|
485 | + in f vs }
|
|
486 | 486 | |
487 | 487 | Here we can float the (case y ...) out, because y is sure
|
488 | 488 | to be evaluated, to give
|
489 | 489 | f x vs = case x of { MkT y ->
|
490 | - case y of I# w ->
|
|
490 | + case y of { I# w ->
|
|
491 | 491 | let f vs = ...(e)...f..
|
492 | - in f vs
|
|
492 | + in f vs }}
|
|
493 | 493 | |
494 | 494 | That saves unboxing it every time round the loop. It's important in
|
495 | 495 | some DPH stuff where we really want to avoid that repeated unboxing in
|
... | ... | @@ -614,7 +614,8 @@ lvlMFE env strict_ctxt e@(_, AnnCase {}) |
614 | 614 | = lvlExpr env e -- See Note [Case MFEs]
|
615 | 615 | |
616 | 616 | lvlMFE env strict_ctxt ann_expr
|
617 | - | not float_me
|
|
617 | + | notWorthFloating expr abs_vars
|
|
618 | + || not float_me
|
|
618 | 619 | || floatTopLvlOnly env && not (isTopLvl dest_lvl)
|
619 | 620 | -- Only floating to the top level is allowed.
|
620 | 621 | || hasFreeJoin env fvs -- If there is a free join, don't float
|
... | ... | @@ -623,9 +624,6 @@ lvlMFE env strict_ctxt ann_expr |
623 | 624 | -- We can't let-bind an expression if we don't know
|
624 | 625 | -- how it will be represented at runtime.
|
625 | 626 | -- See Note [Representation polymorphism invariants] in GHC.Core
|
626 | - || notWorthFloating expr abs_vars
|
|
627 | - -- Test notWorhtFloating last;
|
|
628 | - -- See Note [Large free-variable sets]
|
|
629 | 627 | = -- Don't float it out
|
630 | 628 | lvlExpr env ann_expr
|
631 | 629 | |
... | ... | @@ -676,12 +674,11 @@ lvlMFE env strict_ctxt ann_expr |
676 | 674 | is_function = isFunction ann_expr
|
677 | 675 | mb_bot_str = exprBotStrictness_maybe expr
|
678 | 676 | -- See Note [Bottoming floats]
|
679 | - -- esp Bottoming floats (2)
|
|
677 | + -- esp Bottoming floats (BF2)
|
|
680 | 678 | expr_ok_for_spec = exprOkForSpeculation expr
|
681 | 679 | abs_vars = abstractVars dest_lvl env fvs
|
682 | 680 | dest_lvl = destLevel env fvs fvs_ty is_function is_bot_lam
|
683 | - -- NB: is_bot_lam not is_bot; see (3) in
|
|
684 | - -- Note [Bottoming floats]
|
|
681 | + -- NB: is_bot_lam not is_bot; see (BF2) in Note [Bottoming floats]
|
|
685 | 682 | |
686 | 683 | -- float_is_new_lam: the floated thing will be a new value lambda
|
687 | 684 | -- replacing, say (g (x+4)) by (lvl x). No work is saved, nor is
|
... | ... | @@ -698,20 +695,32 @@ lvlMFE env strict_ctxt ann_expr |
698 | 695 | |
699 | 696 | -- A decision to float entails let-binding this thing, and we only do
|
700 | 697 | -- that if we'll escape a value lambda, or will go to the top level.
|
698 | + -- Never float trivial expressions;
|
|
699 | + -- notably, save_work might be true of a lone evaluated variable.
|
|
701 | 700 | float_me = saves_work || saves_alloc || is_mk_static
|
702 | 701 | |
703 | 702 | -- See Note [Saving work]
|
704 | - saves_work = escapes_value_lam -- (a)
|
|
705 | - && not (exprIsHNF expr) -- (b)
|
|
706 | - && not float_is_new_lam -- (c)
|
|
703 | + is_hnf = exprIsHNF expr
|
|
704 | + saves_work = escapes_value_lam -- (SW-a)
|
|
705 | + && not is_hnf -- (SW-b)
|
|
706 | + && not float_is_new_lam -- (SW-c)
|
|
707 | 707 | escapes_value_lam = dest_lvl `ltMajLvl` (le_ctxt_lvl env)
|
708 | 708 | |
709 | - -- See Note [Saving allocation] and Note [Floating to the top]
|
|
710 | - saves_alloc = isTopLvl dest_lvl
|
|
711 | - && floatConsts env
|
|
712 | - && ( not strict_ctxt -- (a)
|
|
713 | - || exprIsHNF expr -- (b)
|
|
714 | - || (is_bot_lam && escapes_value_lam)) -- (c)
|
|
709 | + -- See Note [Floating to the top]
|
|
710 | +-- is_con_app = isSaturatedConApp expr -- True of literal strings too
|
|
711 | + saves_alloc = isTopLvl dest_lvl
|
|
712 | + && (escapes_value_lam || floatConsts env)
|
|
713 | + -- Always float allocation out of a value lambda
|
|
714 | + -- if it gets to top level
|
|
715 | + && (not strict_ctxt || is_hnf || is_bot_lam)
|
|
716 | + -- is_con_app: don't float PAPs to the top; they may well end
|
|
717 | + -- up getting eta-expanded and re-inlined
|
|
718 | + -- E.g. f = \x -> (++) ys
|
|
719 | + -- If we float, then eta-expand we get
|
|
720 | + -- lvl = (++) ys
|
|
721 | + -- f = \x \zs -> lvl zs
|
|
722 | + -- and now we'll inline lvl. Silly.
|
|
723 | + |
|
715 | 724 | |
716 | 725 | hasFreeJoin :: LevelEnv -> DVarSet -> Bool
|
717 | 726 | -- Has a free join point which is not being floated to top level.
|
... | ... | @@ -726,22 +735,22 @@ hasFreeJoin env fvs |
726 | 735 | The key idea in let-floating is to
|
727 | 736 | * float a redex out of a (value) lambda
|
728 | 737 | Doing so can save an unbounded amount of work.
|
729 | -But see also Note [Saving allocation].
|
|
738 | +But see also Note [Floating to the top].
|
|
730 | 739 | |
731 | 740 | So we definitely float an expression out if
|
732 | -(a) It will escape a value lambda (escapes_value_lam)
|
|
733 | -(b) The expression is not a head-normal form (exprIsHNF); see (SW1, SW2).
|
|
734 | -(c) Floating does not require wrapping it in value lambdas (float_is_new_lam).
|
|
741 | +(SW-a) It will escape a value lambda (escapes_value_lam)
|
|
742 | +(SW-b) The expression is not a head-normal form (exprIsHNF); see (SW1, SW2).
|
|
743 | +(SW-c) Floating does not require wrapping it in value lambdas (float_is_new_lam).
|
|
735 | 744 | See (SW3) below
|
736 | 745 | |
737 | 746 | Wrinkles:
|
738 | 747 | |
739 | -(SW1) Concerning (b) I experimented with using `exprIsCheap` rather than
|
|
748 | +(SW1) Concerning (SW-b) I experimented with using `exprIsCheap` rather than
|
|
740 | 749 | `exprIsHNF` but the latter seems better, according to nofib
|
741 | 750 | (`spectral/mate` got 10% worse with exprIsCheap). It's really a bit of a
|
742 | 751 | heuristic.
|
743 | 752 | |
744 | -(SW2) What about omitting (b), and hence floating HNFs as well? The danger of
|
|
753 | +(SW2) What about omitting (SW-b), and hence floating HNFs as well? The danger of
|
|
745 | 754 | doing so is that we end up floating out a HNF from a cold path (where it
|
746 | 755 | might never get allocated at all) and allocating it all the time
|
747 | 756 | regardless. Example
|
... | ... | @@ -760,7 +769,7 @@ Wrinkles: |
760 | 769 | - Occasionally decreases runtime allocation (T12996 -2.5%)
|
761 | 770 | - Slightly mixed effect on nofib: (puzzle -10%, mate -5%, cichelli +5%)
|
762 | 771 | but geometric mean is -0.09%.
|
763 | - Overall, a win.
|
|
772 | + Overall, a small win.
|
|
764 | 773 | |
765 | 774 | (SW3) Concerning (c), if we are wrapping the thing in extra value lambdas (in
|
766 | 775 | abs_vars), then nothing is saved. E.g.
|
... | ... | @@ -771,10 +780,12 @@ Wrinkles: |
771 | 780 | we have saved nothing: one pair will still be allocated for each
|
772 | 781 | call of `f`. Hence the (not float_is_new_lam) in saves_work.
|
773 | 782 | |
774 | -Note [Saving allocation]
|
|
775 | -~~~~~~~~~~~~~~~~~~~~~~~~
|
|
783 | +Note [Floating to the top]
|
|
784 | +~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
776 | 785 | Even if `saves_work` is false, we we may want to float even cheap/HNF
|
777 | -expressions out of value lambdas, for several reasons:
|
|
786 | +expressions out of value lambdas. Data suggests, however, that it is better
|
|
787 | +/only/ to do so, /if/ they can go to top level. If the expression goes to top
|
|
788 | +level we don't pay the cost of allocating cold-path thunks described in (SW2).
|
|
778 | 789 | |
779 | 790 | * Doing so may save allocation. Consider
|
780 | 791 | f = \x. .. (\y.e) ...
|
... | ... | @@ -782,6 +793,11 @@ expressions out of value lambdas, for several reasons: |
782 | 793 | (assuming e does not mention x). An example where this really makes a
|
783 | 794 | difference is simplrun009.
|
784 | 795 | |
796 | +* In principle this would be true even if the (\y.e) didn't go to top level; but
|
|
797 | + in practice we only float a HNF if it goes all way to the top. We don't pay
|
|
798 | + /any/ allocation cost for a top-level floated expression; it just becomes
|
|
799 | + static data.
|
|
800 | + |
|
785 | 801 | * It may allow SpecContr to fire on functions. Consider
|
786 | 802 | f = \x. ....(f (\y.e))....
|
787 | 803 | After floating we get
|
... | ... | @@ -793,21 +809,7 @@ expressions out of value lambdas, for several reasons: |
793 | 809 | a big difference for string literals and bottoming expressions: see Note
|
794 | 810 | [Floating to the top]
|
795 | 811 | |
796 | -Data suggests, however, that it is better /only/ to float HNFS, /if/ they can go
|
|
797 | -to top level. See (SW2) of Note [Saving work]. If the expression goes to top
|
|
798 | -level we don't pay the cost of allocating cold-path thunks described in (SW2).
|
|
799 | - |
|
800 | -Hence `isTopLvl dest_lvl` in `saves_alloc`.
|
|
801 | - |
|
802 | -Note [Floating to the top]
|
|
803 | -~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
804 | -Even though Note [Saving allocation] suggests that we should not, in
|
|
805 | -general, float HNFs, the balance change if it goes to the top:
|
|
806 | - |
|
807 | -* We don't pay an allocation cost for the floated expression; it
|
|
808 | - just becomes static data.
|
|
809 | - |
|
810 | -* Floating string literal is valuable -- no point in duplicating the
|
|
812 | +* Floating string literals is valuable -- no point in duplicating the
|
|
811 | 813 | at each call site!
|
812 | 814 | |
813 | 815 | * Floating bottoming expressions is valuable: they are always cold
|
... | ... | @@ -815,32 +817,32 @@ general, float HNFs, the balance change if it goes to the top: |
815 | 817 | can be quite big, inhibiting inlining. See Note [Bottoming floats]
|
816 | 818 | |
817 | 819 | So we float an expression to the top if:
|
818 | - (a) the context is lazy (so we get allocation), or
|
|
819 | - (b) the expression is a HNF (so we get allocation), or
|
|
820 | - (c) the expression is bottoming and floating would escape a
|
|
821 | - value lambda (NB: if the expression itself is a lambda, (b)
|
|
822 | - will apply; so this case only catches bottoming thunks)
|
|
820 | + (FT1) the context is lazy (so we get allocation), or
|
|
821 | + (FT2) the expression is a HNF (so we get allocation), or
|
|
822 | + (FT3) the expression is bottoming and floating would escape a
|
|
823 | + value lambda (NB: if the expression itself is a lambda, (b)
|
|
824 | + will apply; so this case only catches bottoming thunks)
|
|
823 | 825 | |
824 | 826 | Examples:
|
825 | 827 | |
826 | -* (a) Strict. Case scrutinee
|
|
828 | +* (FT1) Strict. Case scrutinee
|
|
827 | 829 | f = case g True of ....
|
828 | 830 | Don't float (g True) to top level; then we have the admin of a
|
829 | 831 | top-level thunk to worry about, with zero gain.
|
830 | 832 | |
831 | -* (a) Strict. Case alternative
|
|
833 | +* (FT1) Strict. Case alternative
|
|
832 | 834 | h = case y of
|
833 | 835 | True -> g True
|
834 | 836 | False -> False
|
835 | 837 | Don't float (g True) to the top level
|
836 | 838 | |
837 | -* (b) HNF
|
|
839 | +* (FT2) HNF
|
|
838 | 840 | f = case y of
|
839 | 841 | True -> p:q
|
840 | 842 | False -> blah
|
841 | 843 | We may as well float the (p:q) so it becomes a static data structure.
|
842 | 844 | |
843 | -* (c) Bottoming expressions; see also Note [Bottoming floats]
|
|
845 | +* (FT3) Bottoming expressions; see also Note [Bottoming floats]
|
|
844 | 846 | f x = case x of
|
845 | 847 | 0 -> error <big thing>
|
846 | 848 | _ -> x+1
|
... | ... | @@ -853,7 +855,7 @@ Examples: |
853 | 855 | 'foo' anyway. So float bottoming things only if they escape
|
854 | 856 | a lambda.
|
855 | 857 | |
856 | -* Arguments
|
|
858 | +* (FT4) Arguments
|
|
857 | 859 | t = f (g True)
|
858 | 860 | Prior to Apr 22 we didn't float (g True) to the top if f was strict.
|
859 | 861 | But (a) this only affected CAFs, because if it escapes a value lambda
|
... | ... | @@ -868,28 +870,6 @@ early loses opportunities for RULES which (needless to say) are |
868 | 870 | important in some nofib programs (gcd is an example). [SPJ note:
|
869 | 871 | I think this is obsolete; the flag seems always on.]
|
870 | 872 | |
871 | -Note [Large free-variable sets]
|
|
872 | -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
873 | -In #24471 we had something like
|
|
874 | - x1 = I# 1
|
|
875 | - ...
|
|
876 | - x1000 = I# 1000
|
|
877 | - foo = f x1 (f x2 (f x3 ....))
|
|
878 | -So every sub-expression in `foo` has lots and lots of free variables. But
|
|
879 | -none of these sub-expressions float anywhere; the entire float-out pass is a
|
|
880 | -no-op.
|
|
881 | - |
|
882 | -In lvlMFE, we want to find out quickly if the MFE is not-floatable; that is
|
|
883 | -the common case. In #24471 it turned out that we were testing `abs_vars` (a
|
|
884 | -relatively complicated calculation that takes at least O(n-free-vars) time to
|
|
885 | -compute) for every sub-expression.
|
|
886 | - |
|
887 | -Better instead to test `float_me` early. That still involves looking at
|
|
888 | -dest_lvl, which means looking at every free variable, but the constant factor
|
|
889 | -is a lot better.
|
|
890 | - |
|
891 | -ToDo: find a way to fix the bad asymptotic complexity.
|
|
892 | - |
|
893 | 873 | Note [Floating join point bindings]
|
894 | 874 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
895 | 875 | Mostly we don't float join points at all -- we want them to /stay/ join points.
|
... | ... | @@ -1053,30 +1033,36 @@ we'd like to float the call to error, to get |
1053 | 1033 | |
1054 | 1034 | But, as ever, we need to be careful:
|
1055 | 1035 | |
1056 | -(1) We want to float a bottoming
|
|
1036 | +(BF1) We want to float a bottoming
|
|
1057 | 1037 | expression even if it has free variables:
|
1058 | 1038 | f = \x. g (let v = h x in error ("urk" ++ v))
|
1059 | 1039 | Then we'd like to abstract over 'x', and float the whole arg of g:
|
1060 | 1040 | lvl = \x. let v = h x in error ("urk" ++ v)
|
1061 | 1041 | f = \x. g (lvl x)
|
1062 | - To achieve this we pass is_bot to destLevel
|
|
1063 | - |
|
1064 | -(2) We do not do this for lambdas that return
|
|
1065 | - bottom. Instead we treat the /body/ of such a function specially,
|
|
1066 | - via point (1). For example:
|
|
1042 | + To achieve this we pass `is_bot` to destLevel
|
|
1043 | + |
|
1044 | +(BF2) We do the same for /lambdas/ that return bottom.
|
|
1045 | + Suppose the original lambda had /no/ free vars:
|
|
1046 | + f = \x. ....(\y z. error (y++z))...
|
|
1047 | + then we'd like to float that whole lambda
|
|
1048 | + lvl = \y z. error (y++z)
|
|
1049 | + f = \x. ....lvl....
|
|
1050 | + If we just floated its bottom-valued body, we might abstract the arguments in
|
|
1051 | + the "wrong" order and end up with this bad result
|
|
1052 | + lvl = \z y. error (y++z)
|
|
1053 | + f = \x. ....(\y z. lvl z y)....
|
|
1054 | + |
|
1055 | + If the lambda does have free vars, this will happen:
|
|
1067 | 1056 | f = \x. ....(\y z. if x then error y else error z)....
|
1068 | - If we float the whole lambda thus
|
|
1057 | + We float the whole lambda thus
|
|
1069 | 1058 | lvl = \x. \y z. if x then error y else error z
|
1070 | 1059 | f = \x. ...(lvl x)...
|
1071 | - we may well end up eta-expanding that PAP to
|
|
1060 | + And we may well end up eta-expanding that PAP to
|
|
1061 | + lvl = \x. \y z. if b then error y else error z
|
|
1072 | 1062 | f = \x. ...(\y z. lvl x y z)...
|
1063 | + so we get a (small) closure. So be it.
|
|
1073 | 1064 | |
1074 | - ===>
|
|
1075 | - lvl = \x z y. if b then error y else error z
|
|
1076 | - f = \x. ...(\y z. lvl x z y)...
|
|
1077 | - (There is no guarantee that we'll choose the perfect argument order.)
|
|
1078 | - |
|
1079 | -(3) If we have a /binding/ that returns bottom, we want to float it to top
|
|
1065 | +(BF3) If we have a /binding/ that returns bottom, we want to float it to top
|
|
1080 | 1066 | level, even if it has free vars (point (1)), and even it has lambdas.
|
1081 | 1067 | Example:
|
1082 | 1068 | ... let { v = \y. error (show x ++ show y) } in ...
|
... | ... | @@ -1092,7 +1078,6 @@ But, as ever, we need to be careful: |
1092 | 1078 | join points (#24768), and floating to the top would abstract over those join
|
1093 | 1079 | points, which we should never do.
|
1094 | 1080 | |
1095 | - |
|
1096 | 1081 | See Maessen's paper 1999 "Bottom extraction: factoring error handling out
|
1097 | 1082 | of functional programs" (unpublished I think).
|
1098 | 1083 | |
... | ... | @@ -1135,7 +1120,6 @@ float the case (as advocated here) we won't float the (build ...y..) |
1135 | 1120 | either, so fusion will happen. It can be a big effect, esp in some
|
1136 | 1121 | artificial benchmarks (e.g. integer, queens), but there is no perfect
|
1137 | 1122 | answer.
|
1138 | - |
|
1139 | 1123 | -}
|
1140 | 1124 | |
1141 | 1125 | annotateBotStr :: Id -> Arity -> Maybe (Arity, DmdSig, CprSig) -> Id
|
... | ... | @@ -1152,69 +1136,124 @@ annotateBotStr id n_extra mb_bot_str |
1152 | 1136 | = id
|
1153 | 1137 | |
1154 | 1138 | notWorthFloating :: CoreExpr -> [Var] -> Bool
|
1155 | --- Returns True if the expression would be replaced by
|
|
1156 | --- something bigger than it is now. For example:
|
|
1157 | --- abs_vars = tvars only: return True if e is trivial,
|
|
1158 | --- but False for anything bigger
|
|
1159 | --- abs_vars = [x] (an Id): return True for trivial, or an application (f x)
|
|
1160 | --- but False for (f x x)
|
|
1161 | ---
|
|
1162 | --- One big goal is that floating should be idempotent. Eg if
|
|
1163 | --- we replace e with (lvl79 x y) and then run FloatOut again, don't want
|
|
1164 | --- to replace (lvl79 x y) with (lvl83 x y)!
|
|
1165 | - |
|
1139 | +-- See Note [notWorthFloating]
|
|
1166 | 1140 | notWorthFloating e abs_vars
|
1167 | - = go e (count isId abs_vars)
|
|
1141 | + = go e 0
|
|
1168 | 1142 | where
|
1169 | - go (Var {}) n = n >= 0
|
|
1170 | - go (Lit lit) n = assert (n==0) $
|
|
1171 | - litIsTrivial lit -- Note [Floating literals]
|
|
1172 | - go (Type {}) _ = True
|
|
1173 | - go (Coercion {}) _ = True
|
|
1143 | + n_abs_vars = count isId abs_vars -- See (NWF5)
|
|
1144 | + |
|
1145 | + go :: CoreExpr -> Int -> Bool
|
|
1146 | + -- (go e n) return True if (e x1 .. xn) is not worth floating
|
|
1147 | + -- where `e` has n trivial value arguments x1..xn
|
|
1148 | + -- See (NWF4)
|
|
1149 | + go (Lit lit) n = assert (n==0) $
|
|
1150 | + litIsTrivial lit -- See (NWF1)
|
|
1151 | + go (Type {}) _ = True
|
|
1152 | + go (Tick t e) n = not (tickishIsCode t) && go e n
|
|
1153 | + go (Cast e _) n = n==0 || go e n -- See (NWF3)
|
|
1154 | + go (Coercion {}) _ = True
|
|
1174 | 1155 | go (App e arg) n
|
1175 | - -- See Note [Floating applications to coercions]
|
|
1176 | - | not (isRuntimeArg arg) = go e n
|
|
1177 | - | n==0 = False
|
|
1178 | - | exprIsTrivial arg = go e (n-1) -- NB: exprIsTrivial arg = go arg 0
|
|
1179 | - | otherwise = False
|
|
1180 | - go (Tick t e) n = not (tickishIsCode t) && go e n
|
|
1181 | - go (Cast e _) n = go e n
|
|
1182 | - go (Case e b _ as) n
|
|
1156 | + | Type {} <- arg = go e n -- Just types, not coercions (NWF2)
|
|
1157 | + | exprIsTrivial arg = go e (n+1)
|
|
1158 | + | otherwise = False -- (f non-triv) is worth floating
|
|
1159 | + |
|
1160 | + go (Case e b _ as) _
|
|
1161 | + -- Do not float the `case` part of trivial cases (NWF3)
|
|
1162 | + -- We'll have a look at the RHS when we get there
|
|
1183 | 1163 | | null as
|
1184 | - = go e n -- See Note [Empty case is trivial]
|
|
1185 | - | Just rhs <- isUnsafeEqualityCase e b as
|
|
1186 | - = go rhs n -- See (U2) of Note [Implementing unsafeCoerce] in base:Unsafe.Coerce
|
|
1187 | - go _ _ = False
|
|
1164 | + = True -- See Note [Empty case is trivial]
|
|
1165 | + | Just {} <- isUnsafeEqualityCase e b as
|
|
1166 | + = True -- See (U2) of Note [Implementing unsafeCoerce] in base:Unsafe.Coerce
|
|
1167 | + | otherwise
|
|
1168 | + = False
|
|
1188 | 1169 | |
1189 | -{-
|
|
1190 | -Note [Floating literals]
|
|
1191 | -~~~~~~~~~~~~~~~~~~~~~~~~
|
|
1192 | -It's important to float Integer literals, so that they get shared,
|
|
1193 | -rather than being allocated every time round the loop.
|
|
1194 | -Hence the litIsTrivial.
|
|
1170 | + go (Var _) n
|
|
1171 | + | n==0 = True -- Naked variable
|
|
1172 | + | n <= n_abs_vars = True -- (f a b c) is not worth floating if
|
|
1173 | + | otherwise = False -- a,b,c are all abstracted; see (NWF5)
|
|
1195 | 1174 | |
1196 | -Ditto literal strings (LitString), which we'd like to float to top
|
|
1197 | -level, which is now possible.
|
|
1175 | + go _ _ = False -- Let etc is worth floating
|
|
1198 | 1176 | |
1199 | -Note [Floating applications to coercions]
|
|
1200 | -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
1201 | -We don’t float out variables applied only to type arguments, since the
|
|
1202 | -extra binding would be pointless: type arguments are completely erased.
|
|
1203 | -But *coercion* arguments aren’t (see Note [Coercion tokens] in
|
|
1204 | -"GHC.CoreToStg" and Note [inlineBoringOk] in"GHC.Core.Unfold"),
|
|
1205 | -so we still want to float out variables applied only to
|
|
1206 | -coercion arguments.
|
|
1177 | +{- Note [notWorthFloating]
|
|
1178 | +~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
1179 | +`notWorthFloating` returns True if the expression would be replaced by something
|
|
1180 | +bigger than it is now. One big goal is that floating should be idempotent. Eg
|
|
1181 | +if we replace e with (lvl79 x y) and then run FloatOut again, don't want to
|
|
1182 | +replace (lvl79 x y) with (lvl83 x y)!
|
|
1207 | 1183 | |
1184 | +For example:
|
|
1185 | + abs_vars = tvars only: return True if e is trivial,
|
|
1186 | + but False for anything bigger
|
|
1187 | + abs_vars = [x] (an Id): return True for trivial, or an application (f x)
|
|
1188 | + but False for (f x x)
|
|
1189 | + |
|
1190 | +(NWF1) It's important to float Integer literals, so that they get shared, rather
|
|
1191 | + than being allocated every time round the loop. Hence the litIsTrivial.
|
|
1192 | + |
|
1193 | + Ditto literal strings (LitString), which we'd like to float to top
|
|
1194 | + level, which is now possible.
|
|
1195 | + |
|
1196 | +(NWF2) We don’t float out variables applied only to type arguments, since the
|
|
1197 | + extra binding would be pointless: type arguments are completely erased.
|
|
1198 | + But *coercion* arguments aren’t (see Note [Coercion tokens] in
|
|
1199 | + "GHC.CoreToStg" and Note [inlineBoringOk] in"GHC.Core.Unfold"),
|
|
1200 | + so we still want to float out variables applied only to
|
|
1201 | + coercion arguments.
|
|
1202 | + |
|
1203 | +(NWF3) Some expressions have trivial wrappers:
|
|
1204 | + - Casts (e |> co)
|
|
1205 | + - Unary-class applications:
|
|
1206 | + - Dictionary applications (MkC meth)
|
|
1207 | + - Class-op applictions (op dict)
|
|
1208 | + - Case of empty alts
|
|
1209 | + - Unsafe-equality case
|
|
1210 | + In all these cases we say "not worth floating", and we do so /regardless/
|
|
1211 | + of the wrapped expression. The SetLevels stuff may subsequently float the
|
|
1212 | + components of the expression.
|
|
1213 | + |
|
1214 | + Example: is it worth floating (f x |> co)? No! If we did we'd get
|
|
1215 | + lvl = f x |> co
|
|
1216 | + ...lvl....
|
|
1217 | + Then we'd do cast worker/wrapper and end up with.
|
|
1218 | + lvl' = f x
|
|
1219 | + ...(lvl' |> co)...
|
|
1220 | + Silly! Better not to float it in the first place. If we say "no" here,
|
|
1221 | + we'll subsequently say "yes" for (f x) and get
|
|
1222 | + lvl = f x
|
|
1223 | + ....(lvl |> co)...
|
|
1224 | + which is what we want. In short: don't float trivial wrappers.
|
|
1225 | + |
|
1226 | +(NWF4) The only non-trivial expression that we say "not worth floating" for
|
|
1227 | + is an application
|
|
1228 | + f x y z
|
|
1229 | + where the number of value arguments is <= the number of abstracted Ids.
|
|
1230 | + This is what makes floating idempotent. Hence counting the number of
|
|
1231 | + value arguments in `go`
|
|
1232 | + |
|
1233 | +(NWF5) In #24471 we had something like
|
|
1234 | + x1 = I# 1
|
|
1235 | + ...
|
|
1236 | + x1000 = I# 1000
|
|
1237 | + foo = f x1 (f x2 (f x3 ....))
|
|
1238 | + So every sub-expression in `foo` has lots and lots of free variables. But
|
|
1239 | + none of these sub-expressions float anywhere; the entire float-out pass is a
|
|
1240 | + no-op.
|
|
1208 | 1241 | |
1209 | -************************************************************************
|
|
1210 | -* *
|
|
1211 | -\subsection{Bindings}
|
|
1212 | -* *
|
|
1213 | -************************************************************************
|
|
1242 | + So `notWorthFloating` tries to avoid evaluating `n_abs_vars`, in cases where
|
|
1243 | + it obviously /is/ worth floating. (In #24471 it turned out that we were
|
|
1244 | + testing `abs_vars` (a relatively complicated calculation that takes at least
|
|
1245 | + O(n-free-vars) time to compute) for every sub-expression.)
|
|
1214 | 1246 | |
1215 | -The binding stuff works for top level too.
|
|
1247 | + Hence testing `n_abs_vars only` at the very end.
|
|
1216 | 1248 | -}
|
1217 | 1249 | |
1250 | +{- *********************************************************************
|
|
1251 | +* *
|
|
1252 | + Bindings
|
|
1253 | + This binding stuff works for top level too.
|
|
1254 | +* *
|
|
1255 | +********************************************************************* -}
|
|
1256 | + |
|
1218 | 1257 | lvlBind :: LevelEnv
|
1219 | 1258 | -> CoreBindWithFVs
|
1220 | 1259 | -> LvlM (LevelledBind, LevelEnv)
|
... | ... | @@ -1261,7 +1300,7 @@ lvlBind env (AnnNonRec bndr rhs) |
1261 | 1300 | -- is_bot_lam: looks like (\xy. bot), maybe zero lams
|
1262 | 1301 | -- NB: not isBottomThunk!
|
1263 | 1302 | -- NB: not is_join: don't send bottoming join points to the top.
|
1264 | - -- See Note [Bottoming floats] point (3)
|
|
1303 | + -- See Note [Bottoming floats] (BF3)
|
|
1265 | 1304 | |
1266 | 1305 | is_top_bindable = exprIsTopLevelBindable deann_rhs bndr_ty
|
1267 | 1306 | n_extra = count isId abs_vars
|
... | ... | @@ -1552,9 +1591,8 @@ destLevel env fvs fvs_ty is_function is_bot |
1552 | 1591 | -- See Note [Floating join point bindings]
|
1553 | 1592 | = tOP_LEVEL
|
1554 | 1593 | |
1555 | - | is_bot -- Send bottoming bindings to the top
|
|
1556 | - = as_far_as_poss -- regardless; see Note [Bottoming floats]
|
|
1557 | - -- Esp Bottoming floats (1) and (3)
|
|
1594 | + | is_bot -- Send bottoming bindings to the top regardless;
|
|
1595 | + = as_far_as_poss -- see (BF1) and (BF2) in Note [Bottoming floats]
|
|
1558 | 1596 | |
1559 | 1597 | | Just n_args <- floatLams env
|
1560 | 1598 | , n_args > 0 -- n=0 case handled uniformly by the 'otherwise' case
|
... | ... | @@ -1568,8 +1606,13 @@ destLevel env fvs fvs_ty is_function is_bot |
1568 | 1606 | max_fv_id_level = maxFvLevel isId env fvs -- Max over Ids only; the
|
1569 | 1607 | -- tyvars will be abstracted
|
1570 | 1608 | |
1609 | + -- as_far_as_poss: destination level depends only on the free Ids (more
|
|
1610 | + -- precisely, free CoVars) of the /type/, not the free Ids of the /term/.
|
|
1611 | + -- Why worry about the free CoVars? See Note [Floating and kind casts]
|
|
1612 | + --
|
|
1613 | + -- There may be free Ids in the term, but then we'll just
|
|
1614 | + -- lambda-abstract over them
|
|
1571 | 1615 | as_far_as_poss = maxFvLevel' isId env fvs_ty
|
1572 | - -- See Note [Floating and kind casts]
|
|
1573 | 1616 | |
1574 | 1617 | {- Note [Floating and kind casts]
|
1575 | 1618 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
... | ... | @@ -1732,10 +1775,9 @@ maxFvLevel max_me env var_set |
1732 | 1775 | -- It's OK to use a non-deterministic fold here because maxIn commutes.
|
1733 | 1776 | |
1734 | 1777 | maxFvLevel' :: (Var -> Bool) -> LevelEnv -> TyCoVarSet -> Level
|
1735 | --- Same but for TyCoVarSet
|
|
1778 | +-- Precisely the same as `maxFvLevel` but for TyCoVarSet rather than DVarSet
|
|
1736 | 1779 | maxFvLevel' max_me env var_set
|
1737 | 1780 | = nonDetStrictFoldUniqSet (maxIn max_me env) tOP_LEVEL var_set
|
1738 | - -- It's OK to use a non-deterministic fold here because maxIn commutes.
|
|
1739 | 1781 | |
1740 | 1782 | maxIn :: (Var -> Bool) -> LevelEnv -> InVar -> Level -> Level
|
1741 | 1783 | maxIn max_me (LE { le_lvl_env = lvl_env, le_env = id_env }) in_var lvl
|
... | ... | @@ -11,7 +11,7 @@ module GHC.Core.Opt.Simplify.Env ( |
11 | 11 | SimplMode(..), updMode, smPlatform,
|
12 | 12 | |
13 | 13 | -- * Environments
|
14 | - SimplEnv(..), pprSimplEnv, -- Temp not abstract
|
|
14 | + SimplEnv(..), StaticEnv, pprSimplEnv,
|
|
15 | 15 | seArityOpts, seCaseCase, seCaseFolding, seCaseMerge, seCastSwizzle,
|
16 | 16 | seDoEtaReduction, seEtaExpand, seFloatEnable, seInline, seNames,
|
17 | 17 | seOptCoercionOpts, sePhase, sePlatform, sePreInline,
|
... | ... | @@ -170,6 +170,8 @@ coercion we don't apply optCoercion to it if seInlineDepth>0. |
170 | 170 | Reason: it has already been optimised once, no point in doing so again.
|
171 | 171 | -}
|
172 | 172 | |
173 | +type StaticEnv = SimplEnv -- Just the static part is relevant
|
|
174 | + |
|
173 | 175 | data SimplEnv
|
174 | 176 | = SimplEnv {
|
175 | 177 | ----------- Static part of the environment -----------
|
... | ... | @@ -407,7 +409,6 @@ data SimplSR |
407 | 409 | -- and ja = Just a <=> x is a join-point of arity a
|
408 | 410 | -- See Note [Join arity in SimplIdSubst]
|
409 | 411 | |
410 | - |
|
411 | 412 | | DoneId OutId
|
412 | 413 | -- If x :-> DoneId v is in the SimplIdSubst
|
413 | 414 | -- then replace occurrences of x by v
|
... | ... | @@ -778,7 +779,7 @@ emptyJoinFloats = nilOL |
778 | 779 | isEmptyJoinFloats :: JoinFloats -> Bool
|
779 | 780 | isEmptyJoinFloats = isNilOL
|
780 | 781 | |
781 | -unitLetFloat :: OutBind -> LetFloats
|
|
782 | +unitLetFloat :: HasDebugCallStack => OutBind -> LetFloats
|
|
782 | 783 | -- This key function constructs a singleton float with the right form
|
783 | 784 | unitLetFloat bind = assert (all (not . isJoinId) (bindersOf bind)) $
|
784 | 785 | LetFloats (unitOL bind) (flag bind)
|
... | ... | @@ -447,7 +447,7 @@ we want to do something very similar to worker/wrapper: |
447 | 447 | |
448 | 448 | We call this making a cast worker/wrapper in tryCastWorkerWrapper.
|
449 | 449 | |
450 | -The main motivaiton is that x can be inlined freely. There's a chance
|
|
450 | +The main motivation is that x can be inlined freely. There's a chance
|
|
451 | 451 | that e will be a constructor application or function, or something
|
452 | 452 | like that, so moving the coercion to the usage site may well cancel
|
453 | 453 | the coercions and lead to further optimisation. Example:
|
... | ... | @@ -576,11 +576,13 @@ Note [Concrete types] in GHC.Tc.Utils.Concrete. |
576 | 576 | -}
|
577 | 577 | |
578 | 578 | tryCastWorkerWrapper :: SimplEnv -> BindContext
|
579 | - -> InId -> OutId -> OutExpr
|
|
580 | - -> SimplM (SimplFloats, SimplEnv)
|
|
579 | + -> OutId -> OutExpr
|
|
580 | + -> SimplM (Maybe (LetFloats, OutId, OutExpr))
|
|
581 | 581 | -- See Note [Cast worker/wrapper]
|
582 | -tryCastWorkerWrapper env bind_cxt old_bndr bndr (Cast rhs co)
|
|
583 | - | BC_Let top_lvl is_rec <- bind_cxt -- Not join points
|
|
582 | +-- Given input x = rhs |> co, the result will be
|
|
583 | +-- (x' = rhs, x, x' |> co))
|
|
584 | +tryCastWorkerWrapper env bind_cxt bndr (Cast rhs co)
|
|
585 | + | BC_Let top_lvl _ <- bind_cxt -- Not join points
|
|
584 | 586 | , not (isDFunId bndr) -- nor DFuns; cast w/w is no help, and we can't transform
|
585 | 587 | -- a DFunUnfolding in mk_worker_unfolding
|
586 | 588 | , not (exprIsTrivial rhs) -- Not x = y |> co; Wrinkle 1
|
... | ... | @@ -588,38 +590,23 @@ tryCastWorkerWrapper env bind_cxt old_bndr bndr (Cast rhs co) |
588 | 590 | , typeHasFixedRuntimeRep work_ty -- Don't peel off a cast if doing so would
|
589 | 591 | -- lose the underlying runtime representation.
|
590 | 592 | -- See Note [Preserve RuntimeRep info in cast w/w]
|
591 | - , not (isOpaquePragma (idInlinePragma old_bndr)) -- Not for OPAQUE bindings
|
|
592 | - -- See Note [OPAQUE pragma]
|
|
593 | + , not (isOpaquePragma (idInlinePragma bndr)) -- Not for OPAQUE bindings
|
|
594 | + -- See Note [OPAQUE pragma]
|
|
593 | 595 | = do { uniq <- getUniqueM
|
594 | 596 | ; let work_name = mkSystemVarName uniq occ_fs
|
595 | 597 | work_id = mkLocalIdWithInfo work_name ManyTy work_ty work_info
|
596 | - is_strict = isStrictId bndr
|
|
597 | 598 | |
598 | - ; (rhs_floats, work_rhs) <- prepareBinding env top_lvl is_rec is_strict
|
|
599 | - work_id (emptyFloats env) rhs
|
|
600 | - |
|
601 | - ; work_unf <- mk_worker_unfolding top_lvl work_id work_rhs
|
|
599 | + ; work_unf <- mk_worker_unfolding top_lvl work_id rhs
|
|
602 | 600 | ; let work_id_w_unf = work_id `setIdUnfolding` work_unf
|
603 | - floats = rhs_floats `addLetFloats`
|
|
604 | - unitLetFloat (NonRec work_id_w_unf work_rhs)
|
|
605 | - |
|
606 | - triv_rhs = Cast (Var work_id_w_unf) co
|
|
607 | - |
|
608 | - ; if postInlineUnconditionally env bind_cxt old_bndr bndr triv_rhs
|
|
609 | - -- Almost always True, because the RHS is trivial
|
|
610 | - -- In that case we want to eliminate the binding fast
|
|
611 | - -- We conservatively use postInlineUnconditionally so that we
|
|
612 | - -- check all the right things
|
|
613 | - then do { tick (PostInlineUnconditionally bndr)
|
|
614 | - ; return ( floats
|
|
615 | - , extendIdSubst (setInScopeFromF env floats) old_bndr $
|
|
616 | - DoneEx triv_rhs NotJoinPoint ) }
|
|
617 | - |
|
618 | - else do { wrap_unf <- mkLetUnfolding env top_lvl VanillaSrc bndr False triv_rhs
|
|
619 | - ; let bndr' = bndr `setInlinePragma` mkCastWrapperInlinePrag (idInlinePragma bndr)
|
|
620 | - `setIdUnfolding` wrap_unf
|
|
621 | - floats' = floats `extendFloats` NonRec bndr' triv_rhs
|
|
622 | - ; return ( floats', setInScopeFromF env floats' ) } }
|
|
601 | + work_bind = NonRec work_id_w_unf rhs
|
|
602 | + triv_rhs = Cast (Var work_id_w_unf) co
|
|
603 | + |
|
604 | + ; wrap_unf <- mkLetUnfolding env top_lvl VanillaSrc bndr False triv_rhs
|
|
605 | + ; let wrap_prag = mkCastWrapperInlinePrag (inlinePragInfo info)
|
|
606 | + bndr' = bndr `setInlinePragma` wrap_prag
|
|
607 | + `setIdUnfolding` wrap_unf
|
|
608 | + |
|
609 | + ; return (Just (unitLetFloat work_bind, bndr', triv_rhs)) }
|
|
623 | 610 | where
|
624 | 611 | -- Force the occ_fs so that the old Id is not retained in the new Id.
|
625 | 612 | !occ_fs = getOccFS bndr
|
... | ... | @@ -647,10 +634,10 @@ tryCastWorkerWrapper env bind_cxt old_bndr bndr (Cast rhs co) |
647 | 634 | | isStableSource src -> return (unf { uf_tmpl = mkCast unf_rhs (mkSymCo co) })
|
648 | 635 | _ -> mkLetUnfolding env top_lvl VanillaSrc work_id False work_rhs
|
649 | 636 | |
650 | -tryCastWorkerWrapper env _ _ bndr rhs -- All other bindings
|
|
637 | +tryCastWorkerWrapper _ _ bndr rhs -- All other bindings
|
|
651 | 638 | = do { traceSmpl "tcww:no" (vcat [ text "bndr:" <+> ppr bndr
|
652 | 639 | , text "rhs:" <+> ppr rhs ])
|
653 | - ; return (mkFloatBind env (NonRec bndr rhs)) }
|
|
640 | + ; return Nothing }
|
|
654 | 641 | |
655 | 642 | mkCastWrapperInlinePrag :: InlinePragma -> InlinePragma
|
656 | 643 | -- See Note [Cast worker/wrapper]
|
... | ... | @@ -810,39 +797,40 @@ makeTrivial :: HasDebugCallStack |
810 | 797 | -- Binds the expression to a variable, if it's not trivial, returning the variable
|
811 | 798 | -- For the Demand argument, see Note [Keeping demand info in StrictArg Plan A]
|
812 | 799 | makeTrivial env top_lvl dmd occ_fs expr
|
813 | - | exprIsTrivial expr -- Already trivial
|
|
814 | - || not (bindingOk top_lvl expr expr_ty) -- Cannot trivialise
|
|
815 | - -- See Note [Cannot trivialise]
|
|
800 | + | exprIsTrivial expr -- Already trivial
|
|
816 | 801 | = return (emptyLetFloats, expr)
|
817 | 802 | |
818 | - | Cast expr' co <- expr
|
|
819 | - = do { (floats, triv_expr) <- makeTrivial env top_lvl dmd occ_fs expr'
|
|
820 | - ; return (floats, Cast triv_expr co) }
|
|
803 | + | not (bindingOk top_lvl expr expr_ty) -- Cannot trivialise
|
|
804 | + = return (emptyLetFloats, expr) -- See Note [Cannot trivialise]
|
|
821 | 805 | |
822 | - | otherwise -- 'expr' is not of form (Cast e co)
|
|
806 | + | otherwise
|
|
823 | 807 | = do { (floats, expr1) <- prepareRhs env top_lvl occ_fs expr
|
824 | 808 | ; uniq <- getUniqueM
|
825 | 809 | ; let name = mkSystemVarName uniq occ_fs
|
826 | - var = mkLocalIdWithInfo name ManyTy expr_ty id_info
|
|
810 | + bndr = mkLocalIdWithInfo name ManyTy expr_ty id_info
|
|
811 | + bind_ctxt = BC_Let top_lvl NonRecursive
|
|
827 | 812 | |
828 | 813 | -- Now something very like completeBind,
|
829 | 814 | -- but without the postInlineUnconditionally part
|
830 | - ; (arity_type, expr2) <- tryEtaExpandRhs env (BC_Let top_lvl NonRecursive) var expr1
|
|
815 | + ; (arity_type, expr2) <- tryEtaExpandRhs env bind_ctxt bndr expr1
|
|
831 | 816 | -- Technically we should extend the in-scope set in 'env' with
|
832 | 817 | -- the 'floats' from prepareRHS; but they are all fresh, so there is
|
833 | 818 | -- no danger of introducing name shadowing in eta expansion
|
834 | 819 | |
835 | - ; unf <- mkLetUnfolding env top_lvl VanillaSrc var False expr2
|
|
836 | - |
|
837 | - ; let final_id = addLetBndrInfo var arity_type unf
|
|
838 | - bind = NonRec final_id expr2
|
|
820 | + ; unf <- mkLetUnfolding env top_lvl VanillaSrc bndr False expr2
|
|
821 | + ; let bndr' = addLetBndrInfo bndr arity_type unf
|
|
822 | + anf_bind = NonRec bndr' expr2
|
|
839 | 823 | |
840 | - ; traceSmpl "makeTrivial" (vcat [text "final_id" <+> ppr final_id, text "rhs" <+> ppr expr2 ])
|
|
841 | - ; return ( floats `addLetFlts` unitLetFloat bind, Var final_id ) }
|
|
824 | + ; mb_cast_ww <- tryCastWorkerWrapper env bind_ctxt bndr' expr2
|
|
825 | + ; case mb_cast_ww of
|
|
826 | + Nothing -> return (floats `addLetFlts` unitLetFloat anf_bind, Var bndr')
|
|
827 | + Just (work_flts, _, triv_rhs)
|
|
828 | + -> return (floats `addLetFlts` work_flts, triv_rhs) }
|
|
842 | 829 | where
|
843 | 830 | id_info = vanillaIdInfo `setDemandInfo` dmd
|
844 | 831 | expr_ty = exprType expr
|
845 | 832 | |
833 | + |
|
846 | 834 | bindingOk :: TopLevelFlag -> CoreExpr -> Type -> Bool
|
847 | 835 | -- True iff we can have a binding of this expression at this level
|
848 | 836 | -- Precondition: the type is the type of the expression
|
... | ... | @@ -936,26 +924,50 @@ completeBind bind_cxt (old_bndr, unf_se) (new_bndr, new_rhs, env) |
936 | 924 | eta_rhs (idType new_bndr) new_arity old_unf
|
937 | 925 | |
938 | 926 | ; let new_bndr_w_info = addLetBndrInfo new_bndr new_arity new_unfolding
|
939 | - -- See Note [In-scope set as a substitution]
|
|
927 | + -- See Note [In-scope set as a substitution]
|
|
928 | + occ_anald_rhs = maybeUnfoldingTemplate new_unfolding `orElse` eta_rhs
|
|
929 | + -- occ_anald_rhs: see Note [Use occ-anald RHS in postInlineUnconditionally]
|
|
940 | 930 | |
931 | + -- Try postInlineUnconditionally for (x = rhs)
|
|
932 | + -- If that succeeds we don't want to do tryCastWorkerWrapper
|
|
941 | 933 | ; if postInlineUnconditionally env bind_cxt old_bndr new_bndr_w_info eta_rhs
|
942 | - |
|
943 | - then -- Inline and discard the binding
|
|
944 | - do { tick (PostInlineUnconditionally old_bndr)
|
|
945 | - ; let unf_rhs = maybeUnfoldingTemplate new_unfolding `orElse` eta_rhs
|
|
946 | - -- See Note [Use occ-anald RHS in postInlineUnconditionally]
|
|
947 | - ; simplTrace "PostInlineUnconditionally" (ppr new_bndr <+> ppr unf_rhs) $
|
|
948 | - return ( emptyFloats env
|
|
949 | - , extendIdSubst env old_bndr $
|
|
950 | - DoneEx unf_rhs (idJoinPointHood new_bndr)) }
|
|
934 | + then post_inline_it (emptyFloats env) occ_anald_rhs
|
|
935 | + else
|
|
936 | + |
|
937 | + do { -- Try cast worker-wrapper
|
|
938 | + mb_cast_ww <- tryCastWorkerWrapper env bind_cxt new_bndr_w_info eta_rhs
|
|
939 | + ; case mb_cast_ww of
|
|
940 | + Nothing -> no_post_inline (emptyFloats env) new_bndr_w_info eta_rhs
|
|
941 | + |
|
942 | + Just (cast_let_flts, new_bndr, new_rhs)
|
|
943 | + -- Try postInlineUnconditionally for (new_bndr = new_rhs)
|
|
944 | + -- Almost always fires, because `new_rhs` is small, but we conservatively
|
|
945 | + -- use `postInlineUnconditionally` so that we check all the right things
|
|
946 | + | postInlineUnconditionally env bind_cxt old_bndr new_bndr new_rhs
|
|
947 | + -> post_inline_it cast_floats new_rhs
|
|
948 | + -- new_rhs is (x |> co) so no need to occ-anal
|
|
949 | + | otherwise
|
|
950 | + -> no_post_inline cast_floats new_bndr new_rhs
|
|
951 | + where
|
|
952 | + cast_floats = emptyFloats env `addLetFloats` cast_let_flts
|
|
953 | + } }
|
|
954 | + where
|
|
955 | + no_post_inline floats new_bndr new_rhs
|
|
956 | + = do { let the_bind = NonRec new_bndr new_rhs
|
|
957 | + floats' = floats `extendFloats` the_bind
|
|
958 | + env' = env `setInScopeFromF` floats'
|
|
959 | + ; return (floats', env') }
|
|
960 | + |
|
961 | + post_inline_it floats rhs
|
|
962 | + = do { simplTrace "PostInlineUnconditionally" (ppr old_bndr <+> ppr rhs) $
|
|
963 | + tick (PostInlineUnconditionally old_bndr)
|
|
964 | + ; let env' = env `setInScopeFromF` floats
|
|
965 | + ; return ( floats
|
|
966 | + , extendIdSubst env' old_bndr $
|
|
967 | + DoneEx rhs (idJoinPointHood old_bndr)) }
|
|
951 | 968 | -- Use the substitution to make quite, quite sure that the
|
952 | 969 | -- substitution will happen, since we are going to discard the binding
|
953 | 970 | |
954 | - else -- Keep the binding; do cast worker/wrapper
|
|
955 | --- simplTrace "completeBind" (vcat [ text "bndrs" <+> ppr old_bndr <+> ppr new_bndr
|
|
956 | --- , text "eta_rhs" <+> ppr eta_rhs ]) $
|
|
957 | - tryCastWorkerWrapper env bind_cxt old_bndr new_bndr_w_info eta_rhs }
|
|
958 | - |
|
959 | 971 | addLetBndrInfo :: OutId -> ArityType -> Unfolding -> OutId
|
960 | 972 | addLetBndrInfo new_bndr new_arity_type new_unf
|
961 | 973 | = new_bndr `setIdInfo` info5
|
... | ... | @@ -3955,7 +3967,17 @@ mkDupableContWithDmds env dmds |
3955 | 3967 | ; (floats1, cont') <- mkDupableContWithDmds env cont_dmds cont
|
3956 | 3968 | ; let env' = env `setInScopeFromF` floats1
|
3957 | 3969 | ; (_, se', arg') <- simplLazyArg env' dup hole_ty Nothing se arg
|
3958 | - ; (let_floats2, arg'') <- makeTrivial env NotTopLevel dmd (fsLit "karg") arg'
|
|
3970 | + |
|
3971 | + -- Make the argument duplicable. Danger: if arg is small and we let-bind
|
|
3972 | + -- it, then postInlineUnconditionally will just inline it again, perhaps
|
|
3973 | + -- taking an extra Simplifier iteration (e.g. in test T21839c). So make
|
|
3974 | + -- a `let` only if `couldBeSmallEnoughToInline` says that it is big enough
|
|
3975 | + ; let uf_opts = seUnfoldingOpts env
|
|
3976 | + ; (let_floats2, arg'')
|
|
3977 | + <- if couldBeSmallEnoughToInline uf_opts (unfoldingUseThreshold uf_opts) arg'
|
|
3978 | + then return (emptyLetFloats, arg')
|
|
3979 | + else makeTrivial env NotTopLevel dmd (fsLit "karg") arg'
|
|
3980 | + |
|
3959 | 3981 | ; let all_floats = floats1 `addLetFloats` let_floats2
|
3960 | 3982 | ; return ( all_floats
|
3961 | 3983 | , ApplyToVal { sc_arg = arg''
|
... | ... | @@ -4592,7 +4614,8 @@ mkLetUnfolding :: SimplEnv -> TopLevelFlag -> UnfoldingSource |
4592 | 4614 | -> InId -> Bool -- True <=> this is a join point
|
4593 | 4615 | -> OutExpr -> SimplM Unfolding
|
4594 | 4616 | mkLetUnfolding env top_lvl src id is_join new_rhs
|
4595 | - = return (mkUnfolding uf_opts src is_top_lvl is_bottoming is_join new_rhs Nothing)
|
|
4617 | + = -- Monadic to force those where-bindings
|
|
4618 | + return (mkUnfolding uf_opts src is_top_lvl is_bottoming is_join new_rhs Nothing)
|
|
4596 | 4619 | -- We make an unfolding *even for loop-breakers*.
|
4597 | 4620 | -- Reason: (a) It might be useful to know that they are WHNF
|
4598 | 4621 | -- (b) In GHC.Iface.Tidy we currently assume that, if we want to
|
... | ... | @@ -216,8 +216,6 @@ data SimplCont |
216 | 216 | CoreTickish -- Tick tickish <hole>
|
217 | 217 | SimplCont
|
218 | 218 | |
219 | -type StaticEnv = SimplEnv -- Just the static part is relevant
|
|
220 | - |
|
221 | 219 | data FromWhat = FromLet | FromBeta Levity
|
222 | 220 | |
223 | 221 | -- See Note [DupFlag invariants]
|
... | ... | @@ -723,7 +721,6 @@ which it is on the LHS of a rule (see updModeForRules), then don't |
723 | 721 | make use of the strictness info for the function.
|
724 | 722 | -}
|
725 | 723 | |
726 | - |
|
727 | 724 | {-
|
728 | 725 | ************************************************************************
|
729 | 726 | * *
|
... | ... | @@ -1423,8 +1420,12 @@ preInlineUnconditionally for |
1423 | 1420 | Note [Top-level bottoming Ids]
|
1424 | 1421 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
1425 | 1422 | Don't inline top-level Ids that are bottoming, even if they are used just
|
1426 | -once, because FloatOut has gone to some trouble to extract them out.
|
|
1427 | -Inlining them won't make the program run faster!
|
|
1423 | +once, because FloatOut has gone to some trouble to extract them out. e.g.
|
|
1424 | + report x y = error (..lots of stuff...)
|
|
1425 | + f x y z = if z then report x y else ...blah...
|
|
1426 | +Here `f` might be small enough to inline; but if we put all the `report`
|
|
1427 | +stuff inside it, it'll look to big. In general we don't want to duplicate
|
|
1428 | +all the error-reporting goop.
|
|
1428 | 1429 | |
1429 | 1430 | Note [Do not inline CoVars unconditionally]
|
1430 | 1431 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
... | ... | @@ -1460,51 +1461,25 @@ preInlineUnconditionally env top_lvl bndr rhs rhs_env |
1460 | 1461 | extend_subst_with inl_rhs = extendIdSubst env bndr $! (mkContEx rhs_env inl_rhs)
|
1461 | 1462 | |
1462 | 1463 | one_occ IAmDead = True -- Happens in ((\x.1) v)
|
1463 | - one_occ OneOcc{ occ_n_br = 1
|
|
1464 | - , occ_in_lam = NotInsideLam } = isNotTopLevel top_lvl || early_phase
|
|
1464 | + one_occ OneOcc{ occ_n_br = 1
|
|
1465 | + , occ_in_lam = NotInsideLam
|
|
1466 | + , occ_int_cxt = int_cxt }
|
|
1467 | + = isNotTopLevel top_lvl -- Get rid of allocation
|
|
1468 | + || (int_cxt==IsInteresting) -- Function is applied
|
|
1469 | + -- || (early_phase && not (isConLikeUnfolding unf)) -- See early_phase
|
|
1465 | 1470 | one_occ OneOcc{ occ_n_br = 1
|
1466 | 1471 | , occ_in_lam = IsInsideLam
|
1467 | - , occ_int_cxt = IsInteresting } = canInlineInLam rhs
|
|
1468 | - one_occ _ = False
|
|
1472 | + , occ_int_cxt = IsInteresting }
|
|
1473 | + = canInlineInLam rhs
|
|
1474 | + one_occ _
|
|
1475 | + = False
|
|
1469 | 1476 | |
1470 | 1477 | pre_inline_unconditionally = sePreInline env
|
1471 | 1478 | active = isActive (sePhase env) (inlinePragmaActivation inline_prag)
|
1472 | 1479 | -- See Note [pre/postInlineUnconditionally in gentle mode]
|
1473 | 1480 | inline_prag = idInlinePragma bndr
|
1474 | 1481 | |
1475 | --- Be very careful before inlining inside a lambda, because (a) we must not
|
|
1476 | --- invalidate occurrence information, and (b) we want to avoid pushing a
|
|
1477 | --- single allocation (here) into multiple allocations (inside lambda).
|
|
1478 | --- Inlining a *function* with a single *saturated* call would be ok, mind you.
|
|
1479 | --- || (if is_cheap && not (canInlineInLam rhs) then pprTrace "preinline" (ppr bndr <+> ppr rhs) ok else ok)
|
|
1480 | --- where
|
|
1481 | --- is_cheap = exprIsCheap rhs
|
|
1482 | --- ok = is_cheap && int_cxt
|
|
1483 | - |
|
1484 | - -- int_cxt The context isn't totally boring
|
|
1485 | - -- E.g. let f = \ab.BIG in \y. map f xs
|
|
1486 | - -- Don't want to substitute for f, because then we allocate
|
|
1487 | - -- its closure every time the \y is called
|
|
1488 | - -- But: let f = \ab.BIG in \y. map (f y) xs
|
|
1489 | - -- Now we do want to substitute for f, even though it's not
|
|
1490 | - -- saturated, because we're going to allocate a closure for
|
|
1491 | - -- (f y) every time round the loop anyhow.
|
|
1492 | - |
|
1493 | - -- canInlineInLam => free vars of rhs are (Once in_lam) or Many,
|
|
1494 | - -- so substituting rhs inside a lambda doesn't change the occ info.
|
|
1495 | - -- Sadly, not quite the same as exprIsHNF.
|
|
1496 | - canInlineInLam (Lit _) = True
|
|
1497 | - canInlineInLam (Lam b e) = isRuntimeVar b || canInlineInLam e
|
|
1498 | - canInlineInLam (Tick t e) = not (tickishIsCode t) && canInlineInLam e
|
|
1499 | - canInlineInLam (Var v) = case idOccInfo v of
|
|
1500 | - OneOcc { occ_in_lam = IsInsideLam } -> True
|
|
1501 | - ManyOccs {} -> True
|
|
1502 | - _ -> False
|
|
1503 | - canInlineInLam _ = False
|
|
1504 | - -- not ticks. Counting ticks cannot be duplicated, and non-counting
|
|
1505 | - -- ticks around a Lam will disappear anyway.
|
|
1506 | - |
|
1507 | - early_phase = sePhase env /= FinalPhase
|
|
1482 | +-- early_phase = sePhase env /= FinalPhase
|
|
1508 | 1483 | -- If we don't have this early_phase test, consider
|
1509 | 1484 | -- x = length [1,2,3]
|
1510 | 1485 | -- The full laziness pass carefully floats all the cons cells to
|
... | ... | @@ -1532,6 +1507,52 @@ preInlineUnconditionally env top_lvl bndr rhs rhs_env |
1532 | 1507 | -- (Nor can we check for `exprIsExpandable rhs`, because that needs to look
|
1533 | 1508 | -- at the non-existent unfolding for the `I# 2#` which is also floated out.)
|
1534 | 1509 | |
1510 | + |
|
1511 | +-- Be very careful before inlining inside a lambda, because (a) we must not
|
|
1512 | +-- invalidate occurrence information, and (b) we want to avoid pushing a
|
|
1513 | +-- single allocation (here) into multiple allocations (inside lambda).
|
|
1514 | +-- Inlining a *function* with a single *saturated* call would be ok, mind you.
|
|
1515 | +-- || (if is_cheap && not (canInlineInLam rhs) then pprTrace "preinline" (ppr bndr <+> ppr rhs) ok else ok)
|
|
1516 | +-- where
|
|
1517 | +-- is_cheap = exprIsCheap rhs
|
|
1518 | +-- ok = is_cheap && int_cxt
|
|
1519 | + |
|
1520 | + -- int_cxt The context isn't totally boring
|
|
1521 | + -- E.g. let f = \ab.BIG in \y. map f xs
|
|
1522 | + -- Don't want to substitute for f, because then we allocate
|
|
1523 | + -- its closure every time the \y is called
|
|
1524 | + -- But: let f = \ab.BIG in \y. map (f y) xs
|
|
1525 | + -- Now we do want to substitute for f, even though it's not
|
|
1526 | + -- saturated, because we're going to allocate a closure for
|
|
1527 | + -- (f y) every time round the loop anyhow.
|
|
1528 | + |
|
1529 | + -- canInlineInLam => free vars of rhs are (Once in_lam) or Many,
|
|
1530 | + -- so substituting rhs inside a lambda doesn't change the occ info.
|
|
1531 | + -- Sadly, not quite the same as exprIsHNF.
|
|
1532 | +canInlineInLam ::CoreExpr -> Bool
|
|
1533 | +canInlineInLam e
|
|
1534 | + = go e
|
|
1535 | + where
|
|
1536 | + go (Lit _) = True
|
|
1537 | + go (Lam b e) = isRuntimeVar b || go e
|
|
1538 | + go (Cast e _) = go e
|
|
1539 | + go (Tick t e) = not (tickishIsCode t) && go e
|
|
1540 | + -- This matters only for:
|
|
1541 | + -- x = y -- or y|>co
|
|
1542 | + -- f = \p. ..x.. -- One occurrence of x
|
|
1543 | + -- ..y.. -- Multiple other occurrences of y
|
|
1544 | + -- Then it is safe to inline x unconditionally
|
|
1545 | + -- For postInlineUncondionally we have already tested exprIsTrivial
|
|
1546 | + -- so this Var case never arises
|
|
1547 | + go (Var v) = case idOccInfo v of
|
|
1548 | + OneOcc { occ_in_lam = IsInsideLam } -> True
|
|
1549 | + ManyOccs {} -> True
|
|
1550 | + _ -> False
|
|
1551 | + go _ = False
|
|
1552 | + -- not ticks. Counting ticks cannot be duplicated, and non-counting
|
|
1553 | + -- ticks around a Lam will disappear anyway.
|
|
1554 | + |
|
1555 | + |
|
1535 | 1556 | {-
|
1536 | 1557 | ************************************************************************
|
1537 | 1558 | * *
|
... | ... | @@ -1582,71 +1603,77 @@ postInlineUnconditionally |
1582 | 1603 | -- Reason: we don't want to inline single uses, or discard dead bindings,
|
1583 | 1604 | -- for unlifted, side-effect-ful bindings
|
1584 | 1605 | postInlineUnconditionally env bind_cxt old_bndr bndr rhs
|
1585 | - | not active = False
|
|
1586 | - | isWeakLoopBreaker occ_info = False -- If it's a loop-breaker of any kind, don't inline
|
|
1587 | - -- because it might be referred to "earlier"
|
|
1588 | - | isStableUnfolding unfolding = False -- Note [Stable unfoldings and postInlineUnconditionally]
|
|
1589 | - | isTopLevel (bindContextLevel bind_cxt)
|
|
1590 | - = False -- Note [Top level and postInlineUnconditionally]
|
|
1591 | - | exprIsTrivial rhs = True
|
|
1592 | - | BC_Join {} <- bind_cxt = False -- See point (1) of Note [Duplicating join points]
|
|
1606 | + | not active = False
|
|
1607 | + | isWeakLoopBreaker occ_info = False -- If it's a loop-breaker of any kind, don't inline
|
|
1608 | + -- because it might be referred to "earlier"
|
|
1609 | + | isStableUnfolding unfolding = False -- Note [Stable unfoldings and postInlineUnconditionally]
|
|
1610 | + | BC_Join {} <- bind_cxt = exprIsTrivial rhs
|
|
1611 | + -- See point (DJ1) of Note [Duplicating join points]
|
|
1593 | 1612 | -- in GHC.Core.Opt.Simplify.Iteration
|
1613 | + | is_top_lvl, isDeadEndId bndr = False -- Note [Top-level bottoming Ids]
|
|
1594 | 1614 | | otherwise
|
1595 | 1615 | = case occ_info of
|
1596 | - OneOcc { occ_in_lam = in_lam, occ_int_cxt = int_cxt, occ_n_br = n_br }
|
|
1597 | - -- See Note [Inline small things to avoid creating a thunk]
|
|
1616 | + IAmALoopBreaker {} -> False
|
|
1617 | + ManyOccs {} | is_top_lvl -> False -- Note [Top level and postInlineUnconditionally]
|
|
1618 | + | otherwise -> exprIsTrivial rhs
|
|
1598 | 1619 | |
1599 | - | n_br >= 100 -> False -- See #23627
|
|
1620 | + OneOcc { occ_in_lam = in_lam, occ_int_cxt = int_cxt, occ_n_br = n_br }
|
|
1621 | + | exprIsTrivial rhs -> True
|
|
1622 | + | otherwise -> check_one_occ in_lam int_cxt n_br
|
|
1600 | 1623 | |
1601 | - | n_br == 1, NotInsideLam <- in_lam -- One syntactic occurrence
|
|
1602 | - -> True -- See Note [Post-inline for single-use things]
|
|
1624 | + IAmDead -> True -- This happens; for example, the case_bndr during case of
|
|
1625 | + -- known constructor: case (a,b) of x { (p,q) -> ... }
|
|
1626 | + -- Here x isn't mentioned in the RHS, so we don't want to
|
|
1627 | + -- create the (dead) let-binding let x = (a,b) in ...
|
|
1628 | + where
|
|
1629 | + is_top_lvl = isTopLevel (bindContextLevel bind_cxt)
|
|
1630 | + is_demanded = isStrUsedDmd (idDemandInfo bndr)
|
|
1631 | + occ_info = idOccInfo old_bndr
|
|
1632 | + unfolding = idUnfolding bndr
|
|
1633 | + arity = idArity bndr
|
|
1634 | +-- is_cheap = isCheapUnfolding unfolding
|
|
1635 | + uf_opts = seUnfoldingOpts env
|
|
1636 | + phase = sePhase env
|
|
1637 | + active = isActive phase (idInlineActivation bndr)
|
|
1638 | + -- See Note [pre/postInlineUnconditionally in gentle mode]
|
|
1603 | 1639 | |
1640 | + -- Check for code-size blow-up from inlining in multiple places
|
|
1641 | + code_dup_ok n_br
|
|
1642 | + | n_br == 1 = True -- No duplication
|
|
1643 | + | n_br >= 100 = False -- See #23627
|
|
1644 | + | is_demanded = False -- Demanded => no allocation (it'll be a case expression
|
|
1645 | + -- in the end) so inlining duplicates code but nothing more
|
|
1646 | + | otherwise = smallEnoughToInline uf_opts unfolding
|
|
1647 | + |
|
1648 | + -- See Note [Post-inline for single-use things]
|
|
1649 | + check_one_occ NotInsideLam NotInteresting n_br = not is_top_lvl && code_dup_ok n_br
|
|
1650 | + check_one_occ NotInsideLam IsInteresting n_br = code_dup_ok n_br
|
|
1651 | + check_one_occ IsInsideLam NotInteresting _ = False
|
|
1652 | + check_one_occ IsInsideLam IsInteresting n_br = arity > 0 && code_dup_ok n_br
|
|
1653 | + -- IsInteresting: inlining inside a lambda only with good reason
|
|
1654 | + -- See the notes on int_cxt in preInlineUnconditionally
|
|
1655 | + -- arity>0: do not inline data strutures under lambdas, only functions
|
|
1656 | + |
|
1657 | +---------------
|
|
1658 | +-- A wrong bit of code, left here in case you are tempted to do this
|
|
1604 | 1659 | -- | is_unlifted -- Unlifted binding, hence ok-for-spec
|
1605 | 1660 | -- -> True -- hence cheap to inline probably just a primop
|
1606 | --- -- Not a big deal either way
|
|
1607 | 1661 | -- No, this is wrong. {v = p +# q; x = K v}.
|
1608 | 1662 | -- Don't inline v; it'll just get floated out again. Stupid.
|
1663 | +---------------
|
|
1609 | 1664 | |
1610 | - | is_demanded
|
|
1611 | - -> False -- No allocation (it'll be a case expression in the end)
|
|
1612 | - -- so inlining duplicates code but nothing more
|
|
1613 | 1665 | |
1614 | - | otherwise
|
|
1615 | - -> work_ok in_lam int_cxt && smallEnoughToInline uf_opts unfolding
|
|
1616 | - -- Multiple syntactic occurences; but lazy, and small enough to dup
|
|
1617 | - -- ToDo: consider discount on smallEnoughToInline if int_cxt is true
|
|
1618 | - |
|
1619 | - IAmDead -> True -- This happens; for example, the case_bndr during case of
|
|
1620 | - -- known constructor: case (a,b) of x { (p,q) -> ... }
|
|
1621 | - -- Here x isn't mentioned in the RHS, so we don't want to
|
|
1622 | - -- create the (dead) let-binding let x = (a,b) in ...
|
|
1623 | - |
|
1624 | - _ -> False
|
|
1625 | - |
|
1626 | - where
|
|
1627 | - work_ok NotInsideLam _ = True
|
|
1628 | - work_ok IsInsideLam IsInteresting = isCheapUnfolding unfolding
|
|
1629 | - work_ok IsInsideLam NotInteresting = False
|
|
1630 | - -- NotInsideLam: outside a lambda, we want to be reasonably aggressive
|
|
1631 | - -- about inlining into multiple branches of case
|
|
1666 | + -- NotInsideLam: outside a lambda, when not at top-level we want to be
|
|
1667 | + -- reasonably aggressive about inlining into multiple branches of case
|
|
1632 | 1668 | -- e.g. let x = <non-value>
|
1633 | 1669 | -- in case y of { C1 -> ..x..; C2 -> ..x..; C3 -> ... }
|
1634 | 1670 | -- Inlining can be a big win if C3 is the hot-spot, even if
|
1635 | 1671 | -- the uses in C1, C2 are not 'interesting'
|
1636 | 1672 | -- An example that gets worse if you add int_cxt here is 'clausify'
|
1637 | 1673 | |
1638 | - -- InsideLam: check for acceptable work duplication, using isCheapUnfoldign
|
|
1639 | - -- int_cxt to prevent us inlining inside a lambda without some
|
|
1640 | - -- good reason. See the notes on int_cxt in preInlineUnconditionally
|
|
1674 | + -- InsideLam:
|
|
1641 | 1675 | |
1642 | 1676 | -- is_unlifted = isUnliftedType (idType bndr)
|
1643 | - is_demanded = isStrUsedDmd (idDemandInfo bndr)
|
|
1644 | - occ_info = idOccInfo old_bndr
|
|
1645 | - unfolding = idUnfolding bndr
|
|
1646 | - uf_opts = seUnfoldingOpts env
|
|
1647 | - phase = sePhase env
|
|
1648 | - active = isActive phase (idInlineActivation bndr)
|
|
1649 | - -- See Note [pre/postInlineUnconditionally in gentle mode]
|
|
1650 | 1677 | |
1651 | 1678 | {- Note [Inline small things to avoid creating a thunk]
|
1652 | 1679 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
... | ... | @@ -1669,23 +1696,24 @@ where GHC.Driver.CmdLine.$wprocessArgs allocated hugely more. |
1669 | 1696 | Note [Post-inline for single-use things]
|
1670 | 1697 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
1671 | 1698 | If we have
|
1672 | - |
|
1673 | 1699 | let x = rhs in ...x...
|
1674 | - |
|
1675 | 1700 | and `x` is used exactly once, and not inside a lambda, then we will usually
|
1676 | 1701 | preInlineUnconditinally. But we can still get this situation in
|
1677 | 1702 | postInlineUnconditionally:
|
1678 | - |
|
1679 | 1703 | case K rhs of K x -> ...x....
|
1680 | - |
|
1681 | 1704 | Here we'll use `simplAuxBind` to bind `x` to (the already-simplified) `rhs`;
|
1682 | 1705 | and `x` is used exactly once. It's beneficial to inline right away; otherwise
|
1683 | 1706 | we risk creating
|
1684 | - |
|
1685 | 1707 | let x = rhs in ...x...
|
1708 | +which will take another iteration of the Simplifier to eliminate.
|
|
1686 | 1709 | |
1687 | -which will take another iteration of the Simplifier to eliminate. We do this in
|
|
1688 | -two places
|
|
1710 | +A similar, but less frequent, case is
|
|
1711 | + let f = \x.blah in ...(\y. ...(f e)...) ...
|
|
1712 | +Again `preInlineUnconditionally will usually inline `f`, but it can arise
|
|
1713 | +via `simplAuxBind` if we have something like
|
|
1714 | + (\f \y. ...(f e)..) (\x.blah)
|
|
1715 | + |
|
1716 | +We do unconditional post-inlining in two places:
|
|
1689 | 1717 | |
1690 | 1718 | 1. In the full `postInlineUnconditionally` look for the special case
|
1691 | 1719 | of "one occurrence, not under a lambda", and inline unconditionally then.
|
... | ... | @@ -1714,24 +1742,20 @@ Alas! |
1714 | 1742 | |
1715 | 1743 | Note [Top level and postInlineUnconditionally]
|
1716 | 1744 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
1717 | -We don't do postInlineUnconditionally for top-level things (even for
|
|
1718 | -ones that are trivial):
|
|
1745 | +We must take care when considering postInlineUnconditionally for top-level things
|
|
1719 | 1746 | |
1720 | - * Doing so will inline top-level error expressions that have been
|
|
1721 | - carefully floated out by FloatOut. More generally, it might
|
|
1722 | - replace static allocation with dynamic.
|
|
1747 | + * Don't inline top-level error expressions that have been carefully floated
|
|
1748 | + out by FloatOut. See Note [Top-level bottoming Ids].
|
|
1723 | 1749 | |
1724 | - * Even for trivial expressions there's a problem. Consider
|
|
1750 | + * Even for trivial expressions we need to take care: we must not
|
|
1751 | + postInlineUnconditionally a top-level ManyOccs binder, even if its
|
|
1752 | + RHS is trivial. Consider
|
|
1725 | 1753 | {-# RULE "foo" forall (xs::[T]). reverse xs = ruggle xs #-}
|
1726 | 1754 | blah xs = reverse xs
|
1727 | 1755 | ruggle = sort
|
1728 | - In one simplifier pass we might fire the rule, getting
|
|
1756 | + We must not postInlineUnconditionally `ruggle`, because in the same
|
|
1757 | + simplifier pass we might fire the rule, getting
|
|
1729 | 1758 | blah xs = ruggle xs
|
1730 | - but in *that* simplifier pass we must not do postInlineUnconditionally
|
|
1731 | - on 'ruggle' because then we'll have an unbound occurrence of 'ruggle'
|
|
1732 | - |
|
1733 | - If the rhs is trivial it'll be inlined by callSiteInline, and then
|
|
1734 | - the binding will be dead and discarded by the next use of OccurAnal
|
|
1735 | 1759 | |
1736 | 1760 | * There is less point, because the main goal is to get rid of local
|
1737 | 1761 | bindings used in multiple case branches.
|
... | ... | @@ -397,10 +397,12 @@ mkTicks :: [CoreTickish] -> CoreExpr -> CoreExpr |
397 | 397 | mkTicks ticks expr = foldr mkTick expr ticks
|
398 | 398 | |
399 | 399 | isSaturatedConApp :: CoreExpr -> Bool
|
400 | +-- Also includes literals
|
|
400 | 401 | isSaturatedConApp e = go e []
|
401 | 402 | where go (App f a) as = go f (a:as)
|
402 | 403 | go (Var fun) args
|
403 | 404 | = isConLikeId fun && idArity fun == valArgCount args
|
405 | + go (Lit {}) _ = True
|
|
404 | 406 | go (Cast f _) as = go f as
|
405 | 407 | go _ _ = False
|
406 | 408 |
... | ... | @@ -1809,7 +1809,12 @@ build g = g (:) [] |
1809 | 1809 | |
1810 | 1810 | augment :: forall a. (forall b. (a->b->b) -> b -> b) -> [a] -> [a]
|
1811 | 1811 | {-# INLINE [1] augment #-}
|
1812 | -augment g xs = g (:) xs
|
|
1812 | +-- Give it one argument so that it inlines with one arg
|
|
1813 | +-- But (crucially) the body is a lambda so that `g` is visibly applied
|
|
1814 | +-- to two args, and hence we know that in a call
|
|
1815 | +-- augment (\c n. blah)
|
|
1816 | +-- both c and n are OneShot
|
|
1817 | +augment g = \xs -> g (:) xs
|
|
1813 | 1818 | |
1814 | 1819 | {-# RULES
|
1815 | 1820 | "fold/build" forall k z (g::forall b. (a->b->b) -> b -> b) .
|
... | ... | @@ -1975,7 +1980,7 @@ The rules for map work like this. |
1975 | 1980 | "++/literal_utf8" forall x. (++) (unpackCStringUtf8# x) = unpackAppendCStringUtf8# x #-}
|
1976 | 1981 | |
1977 | 1982 | {-# RULES
|
1978 | -"++" [~1] forall xs ys. xs ++ ys = augment (\c n -> foldr c n xs) ys
|
|
1983 | +"++" [~1] forall xs. (++) xs = augment (\c n -> foldr c n xs)
|
|
1979 | 1984 | #-}
|
1980 | 1985 | |
1981 | 1986 |
... | ... | @@ -5,6 +5,7 @@ |
5 | 5 | , MagicHash
|
6 | 6 | , UnboxedTuples
|
7 | 7 | , UnliftedFFITypes
|
8 | + , TypeApplications
|
|
8 | 9 | #-}
|
9 | 10 | {-# LANGUAGE CApiFFI #-}
|
10 | 11 | -- We believe we could deorphan this module, by moving lots of things
|
... | ... | @@ -1696,6 +1697,16 @@ showSignedFloat showPos p x |
1696 | 1697 | = showParen (p > 6) (showChar '-' . showPos (-x))
|
1697 | 1698 | | otherwise = showPos x
|
1698 | 1699 | |
1700 | + |
|
1701 | +-- Speicialise showSignedFloat for (a) the type and (b) the argument function
|
|
1702 | +-- The particularly targets are the calls in `instance Show Float` and
|
|
1703 | +-- `instance Show Double`
|
|
1704 | +-- Specialising for both (a) and (b) is obviously more efficient; and if you
|
|
1705 | +-- don't you find that the `x` argument is strict, but boxed, and that can cause
|
|
1706 | +-- functions calling showSignedFloat to have box their argument.
|
|
1707 | +{-# SPECIALISE showSignedFloat @Float showFloat #-}
|
|
1708 | +{-# SPECIALISE showSignedFloat @Double showFloat #-}
|
|
1709 | + |
|
1699 | 1710 | {-
|
1700 | 1711 | We need to prevent over/underflow of the exponent in encodeFloat when
|
1701 | 1712 | called from scaleFloat, hence we clamp the scaling parameter.
|
... | ... | @@ -6,7 +6,10 @@ |
6 | 6 | -- It produces a nested unfold that should look something
|
7 | 7 | -- like the code below. Note the 'lvl1_shW'. It is BAD
|
8 | 8 | -- if this is a lambda instead; you get a lot more allocation
|
9 | --- See Note [Saving allocation] in GHC.Core.Opt.SetLevels
|
|
9 | +--
|
|
10 | +-- LATER (2025): But in the end it seems better NOT to float lambdas,
|
|
11 | +-- unless they go to top level.
|
|
12 | +-- See (SW2) in Note [Saving work] in GHC.Core.Opt.SetLevels
|
|
10 | 13 | |
11 | 14 | |
12 | 15 | {-
|