GitLab

Marge Bot pushed to branch master at Glasgow Haskell Compiler / GHC

Commits:

ce616f49

by Simon Peyton Jones at 2025-04-27T21:10:25+01:00

Fix infelicities in the Specialiser

On the way to #23109 (unary classes) I discovered some infelicities
(or maybe tiny bugs, I forget) in the type-class specialiser.

I also tripped over #25965, an outright bug in the rule matcher

Specifically:

* Refactor: I enhanced `wantCallsFor`, whih previously always said
`True`, to discard calls of class-ops, data constructors etc.  This is
a bit more efficient; and it means we don't need to worry about
filtering them out later.

* Fix: I tidied up some tricky logic that eliminated redundant
  specialisations.  It wasn't working correctly.  See the expanded
  Note [Specialisations already covered], and
  (MP3) in Note [Specialising polymorphic dictionaries].

  See also the new top-level `alreadyCovered`
  function, which now goes via `GHC.Core.Rules.ruleLhsIsMoreSpecific`

  I also added a useful Note [The (CI-KEY) invariant]

* Fix #25965: fixed a tricky bug in the `go_fam_fam` in
  `GHC.Core.Unify.uVarOrFam`, which allows matching to succeed
  without binding all type varibles.

  I enhanced Note [Apartness and type families] some more

* #25703. This ticket "just works" with -fpolymorphic-specialisation;
  but I was surprised that it worked! In this MR I added documentation
  to Note [Interesting dictionary arguments] to explain; and tests to
  ensure it stays fixed.

11 changed files:

compiler/GHC/Core/Opt/Specialise.hs
compiler/GHC/Core/Rules.hs
compiler/GHC/Core/Unify.hs
compiler/GHC/Tc/Solver/Equality.hs
compiler/GHC/Types/Basic.hs
+ testsuite/tests/simplCore/should_compile/T25703.hs
+ testsuite/tests/simplCore/should_compile/T25703.stderr
+ testsuite/tests/simplCore/should_compile/T25703a.hs
+ testsuite/tests/simplCore/should_compile/T25703a.stderr
+ testsuite/tests/simplCore/should_compile/T25965.hs
testsuite/tests/simplCore/should_compile/all.T

Changes:

compiler/GHC/Core/Opt/Specialise.hs

@@ -1243,14 +1243,15 @@ specExpr env (Let bind body)
  --     Note [Fire rules in the specialiser]
  fireRewriteRules :: SpecEnv -> InExpr -> [OutExpr] -> (InExpr, [OutExpr])
  fireRewriteRules env (Var f) args
 -  | Just (rule, expr) <- specLookupRule env f args InitialPhase (getRules (se_rules env) f)
 +  | let rules = getRules (se_rules env) f
 +  , Just (rule, expr) <- specLookupRule env f args activeInInitialPhase rules
    , let rest_args    = drop (ruleArity rule) args -- See Note [Extra args in the target]
          zapped_subst = Core.zapSubst (se_subst env)
          expr'        = simpleOptExprWith defaultSimpleOpts zapped_subst expr
                         -- simplOptExpr needed because lookupRule returns
                         --   (\x y. rhs) arg1 arg2
 -  , (fun, args) <- collectArgs expr'
 -  = fireRewriteRules env fun (args++rest_args)
 +  , (fun', args') <- collectArgs expr'
 +  = fireRewriteRules env fun' (args'++rest_args)
  fireRewriteRules _ fun args = (fun, args)
  --------------
@@ -1620,7 +1621,7 @@ specCalls :: Bool              -- True  =>  specialising imported fn
  -- This function checks existing rules, and does not create
  -- duplicate ones. So the caller does not need to do this filtering.
 --- See 'already_covered'
 +-- See `alreadyCovered`
  type SpecInfo = ( [CoreRule]       -- Specialisation rules
                  , [(Id,CoreExpr)]  -- Specialised definition
@@ -1644,15 +1645,13 @@ specCalls spec_imp env existing_rules calls_for_me fn rhs
    = -- pprTrace "specCalls: some" (vcat
      --   [ text "function" <+> ppr fn
 -    --   , text "calls:" <+> ppr calls_for_me
 -    --   , text "subst" <+> ppr (se_subst env) ]) $
 +    --    , text "calls:" <+> ppr calls_for_me
 +    --    , text "subst" <+> ppr (se_subst env) ]) $
      foldlM spec_call ([], [], emptyUDs) calls_for_me
    | otherwise   -- No calls or RHS doesn't fit our preconceptions
 -  = warnPprTrace (not (exprIsTrivial rhs) && notNull calls_for_me && not (isClassOpId fn))
 +  = warnPprTrace (not (exprIsTrivial rhs) && notNull calls_for_me)
            "Missed specialisation opportunity for" (ppr fn $$ trace_doc) $
 -          -- isClassOpId: class-op Ids never inline; we specialise them
 -          -- through fireRewriteRules. So don't complain about missed opportunities
            -- Note [Specialisation shape]
      -- pprTrace "specCalls: none" (ppr fn <+> ppr calls_for_me) $
      return ([], [], emptyUDs)
@@ -1664,6 +1663,10 @@ specCalls spec_imp env existing_rules calls_for_me fn rhs
      fn_unf    = realIdUnfolding fn  -- Ignore loop-breaker-ness here
      inl_prag  = idInlinePragma fn
      inl_act   = inlinePragmaActivation inl_prag
 +    is_active = isActive (beginPhase inl_act) :: Activation -> Bool
 +         -- is_active: inl_act is the activation we are going to put in the new
 +         --   SPEC rule; so we want to see if it is covered by another rule with
 +         --   that same activation.
      is_local  = isLocalId fn
      is_dfun   = isDFunId fn
      dflags    = se_dflags env
@@ -1674,16 +1677,6 @@ specCalls spec_imp env existing_rules calls_for_me fn rhs
      (rhs_bndrs, rhs_body) = collectBindersPushingCo rhs
                              -- See Note [Account for casts in binding]
 -    already_covered :: SpecEnv -> [CoreRule] -> [CoreExpr] -> Bool
 -    already_covered env new_rules args      -- Note [Specialisations already covered]
 -       = isJust (specLookupRule env fn args (beginPhase inl_act)
 -                                (new_rules ++ existing_rules))
 -         -- Rules: we look both in the new_rules (generated by this invocation
 -         --   of specCalls), and in existing_rules (passed in to specCalls)
 -         -- inl_act: is the activation we are going to put in the new SPEC
 -         --   rule; so we want to see if it is covered by another rule with
 -         --   that same activation.
+-
      ----------------------------------------------------------
          -- Specialise to one particular call pattern
      spec_call :: SpecInfo                         -- Accumulating parameter
@@ -1717,8 +1710,12 @@ specCalls spec_imp env existing_rules calls_for_me fn rhs
  --                , ppr dx_binds ]) $
  --             return ()
 +           ; let all_rules = rules_acc ++ existing_rules
 +                 -- all_rules: we look both in the rules_acc (generated by this invocation
 +                 --   of specCalls), and in existing_rules (passed in to specCalls)
             ; if not useful  -- No useful specialisation
 -                || already_covered rhs_env2 rules_acc rule_lhs_args
 +                || alreadyCovered rhs_env2 rule_bndrs fn rule_lhs_args is_active all_rules
 +                   -- See (SC1) in Note [Specialisations already covered]
               then return spec_acc
               else
          do { -- Run the specialiser on the specialised RHS
@@ -1780,7 +1777,7 @@ specCalls spec_imp env existing_rules calls_for_me fn rhs
                   spec_fn_details
                     = case idDetails fn of
                         JoinId join_arity _ -> JoinId (join_arity - join_arity_decr) Nothing
 -                       DFunId is_nt        -> DFunId is_nt
 +                       DFunId unary        -> DFunId unary
                         _                   -> VanillaId
             ; spec_fn <- newSpecIdSM (idName fn) spec_fn_ty spec_fn_details spec_fn_info
@@ -1804,6 +1801,8 @@ specCalls spec_imp env existing_rules calls_for_me fn rhs
                                         , ppr spec_fn  <+> dcolon <+> ppr spec_fn_ty
                                         , ppr rhs_bndrs, ppr call_args
                                         , ppr spec_rule
 +                                       , text "acc" <+> ppr rules_acc
 +                                       , text "existing" <+> ppr existing_rules
+                                        ]
             ; -- pprTrace "spec_call: rule" _rule_trace_doc
@@ -1812,19 +1811,35 @@ specCalls spec_imp env existing_rules calls_for_me fn rhs
                      , spec_uds           `thenUDs` uds_acc
                      ) } }
 +alreadyCovered :: SpecEnv
 +               -> [Var] -> Id -> [CoreExpr]   -- LHS of possible new rule
 +               -> (Activation -> Bool)        -- Which rules are active
 +               -> [CoreRule] -> Bool
 +-- Note [Specialisations already covered] esp (SC2)
 +alreadyCovered env bndrs fn args is_active rules
 +  = case specLookupRule env fn args is_active rules of
 +      Nothing             -> False
 +      Just (rule, _)
 +        | isAutoRule rule -> -- Discard identical rules
 +                             -- We know that (fn args) is an instance of RULE
 +                             -- Check if RULE is an instance of (fn args)
 +                             ruleLhsIsMoreSpecific in_scope bndrs args rule
 +        | otherwise       -> True  -- User rules dominate
 +  where
 +    in_scope = substInScopeSet (se_subst env)
++
  -- Convenience function for invoking lookupRule from Specialise
  -- The SpecEnv's InScopeSet should include all the Vars in the [CoreExpr]
  specLookupRule :: SpecEnv -> Id -> [CoreExpr]
 -               -> CompilerPhase  -- Look up rules as if we were in this phase
 +               -> (Activation -> Bool)  -- Which rules are active
                 -> [CoreRule] -> Maybe (CoreRule, CoreExpr)
 -specLookupRule env fn args phase rules
 +specLookupRule env fn args is_active rules
    = lookupRule ropts in_scope_env is_active fn args rules
    where
      dflags       = se_dflags env
      in_scope     = substInScopeSet (se_subst env)
      in_scope_env = ISE in_scope (whenActiveUnfoldingFun is_active)
      ropts        = initRuleOpts dflags
 -    is_active    = isActive phase
  {- Note [Specialising DFuns]
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -2323,21 +2338,24 @@ This plan is implemented in the Rec case of specBindItself.
  Note [Specialisations already covered]
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  We obviously don't want to generate two specialisations for the same
 -argument pattern.  There are two wrinkles
+-
 -1. We do the already-covered test in specDefn, not when we generate
 -the CallInfo in mkCallUDs.  We used to test in the latter place, but
 -we now iterate the specialiser somewhat, and the Id at the call site
 -might therefore not have all the RULES that we can see in specDefn
+-
 -2. What about two specialisations where the second is an *instance*
 -of the first?  If the more specific one shows up first, we'll generate
 -specialisations for both.  If the *less* specific one shows up first,
 -we *don't* currently generate a specialisation for the more specific
 -one.  (See the call to lookupRule in already_covered.)  Reasons:
 -  (a) lookupRule doesn't say which matches are exact (bad reason)
 -  (b) if the earlier specialisation is user-provided, it's
 -      far from clear that we should auto-specialise further
 +argument pattern.  Wrinkles
++
 +(SC1) We do the already-covered test in specDefn, not when we generate
 +    the CallInfo in mkCallUDs.  We used to test in the latter place, but
 +    we now iterate the specialiser somewhat, and the Id at the call site
 +    might therefore not have all the RULES that we can see in specDefn
++
 +(SC2) What about two specialisations where the second is an *instance*
 +   of the first?  It's a bit arbitrary, but here's what we do:
 +   * If the existing one is user-specified, via a SPECIALISE pragma, we
 +     suppress the further specialisation.
 +   * If the existing one is auto-generated, we generate a second RULE
 +     for the more specialised version.
 +   The latter is important because we don't want the accidental order
 +   of calls to determine what specialisations we generate.
++
 +(SC3) Annoyingly, we /also/ eliminate duplicates in `filterCalls`.
 +   See (MP3) in Note [Specialising polymorphic dictionaries]
  Note [Auto-specialisation and RULES]
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -2800,12 +2818,10 @@ non-dictionary bindings too.
  Note [Specialising polymorphic dictionaries]
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-
  Note June 2023: This has proved to be quite a tricky optimisation to get right
  see (#23469, #23109, #21229, #23445) so it is now guarded by a flag
  `-fpolymorphic-specialisation`.
+-
  Consider
      class M a where { foo :: a -> Int }
@@ -2845,11 +2861,26 @@ Here are the moving parts:
        function.
  (MP3) If we have f :: forall m. Monoid m => blah, and two calls
 -        (f @(Endo b)      (d :: Monoid (Endo b))
 -        (f @(Endo (c->c)) (d :: Monoid (Endo (c->c)))
 +        (f @(Endo b)      (d1 :: Monoid (Endo b))
 +        (f @(Endo (c->c)) (d2 :: Monoid (Endo (c->c)))
        we want to generate a specialisation only for the first.  The second
        is just a substitution instance of the first, with no greater specialisation.
 -      Hence the call to `remove_dups` in `filterCalls`.
 +      Hence the use of `removeDupCalls` in `filterCalls`.
++
 +      You might wonder if `d2` might be more specialised than `d1`; but no.
 +      This `removeDupCalls` thing is at the definition site of `f`, and both `d1`
 +      and `d2` are in scope. So `d1` is simply more polymorphic than `d2`, but
 +      is just as specialised.
++
 +      This distinction is sadly lost once we build a RULE, so `alreadyCovered`
 +      can't be so clever.  E.g if we have an existing RULE
 +            forall @a (d1:Ord Int) (d2: Eq a). f @a @Int d1 d2 = ...
 +      and a putative new rule
 +            forall (d1:Ord Int) (d2: Eq Int). f @Int @Int d1 d2 = ...
 +      we /don't/ want the existing rule to subsume the new one.
++
 +      So we sadly put up with having two rather different places where we
 +      eliminate duplicates: `alreadyCovered` and `removeDupCalls`.
  All this arose in #13873, in the unexpected form that a SPECIALISE
  pragma made the program slower!  The reason was that the specialised
@@ -2947,16 +2978,29 @@ data CallInfoSet = CIS Id (Bag CallInfo)
    -- The list of types and dictionaries is guaranteed to
    -- match the type of f
    -- The Bag may contain duplicate calls (i.e. f @T and another f @T)
 -  -- These dups are eliminated by already_covered in specCalls
 +  -- These dups are eliminated by alreadyCovered in specCalls
  data CallInfo
 -  = CI { ci_key  :: [SpecArg]   -- All arguments
 +  = CI { ci_key  :: [SpecArg]   -- Arguments of the call
 +                                -- See Note [The (CI-KEY) invariant]
++
         , ci_fvs  :: IdSet       -- Free Ids of the ci_key call
                                  -- /not/ including the main id itself, of course
                                  -- NB: excluding tyvars:
                                  --     See Note [Specialising polymorphic dictionaries]
+     }
 +{- Note [The (CI-KEY) invariant]
 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 +Invariant (CI-KEY):
 +   In the `ci_key :: [SpecArg]` field of `CallInfo`,
 +     * The list is non-empty
 +     * The least element is always a `SpecDict`
++
 +In this way the RULE has as few args as possible, which broadens its
 +applicability, since rules only fire when saturated.
 +-}
++
  type DictExpr = CoreExpr
  ciSetFilter :: (CallInfo -> Bool) -> CallInfoSet -> CallInfoSet
@@ -3045,10 +3089,7 @@ mkCallUDs' env f args
      ci_key :: [SpecArg]
      ci_key = dropWhileEndLE (not . isSpecDict) $
               zipWith mk_spec_arg args pis
 -             -- Drop trailing args until we get to a SpecDict
 -             -- In this way the RULE has as few args as possible,
 -             -- which broadens its applicability, since rules only
 -             -- fire when saturated
 +             -- Establish (CI-KEY): drop trailing args until we get to a SpecDict
      mk_spec_arg :: OutExpr -> PiTyBinder -> SpecArg
      mk_spec_arg arg (Named bndr)
@@ -3086,34 +3127,76 @@ site, so we only look through ticks that RULE matching looks through
  -}
  wantCallsFor :: SpecEnv -> Id -> Bool
 -wantCallsFor _env _f = True
 - -- We could reduce the size of the UsageDetails by being less eager
 - -- about collecting calls for LocalIds: there is no point for
 - -- ones that are lambda-bound.  We can't decide this by looking at
 - -- the (absence of an) unfolding, because unfoldings for local
 - -- functions are discarded by cloneBindSM, so no local binder will
 - -- have an unfolding at this stage.  We'd have to keep a candidate
 - -- set of let-binders.
 - --
 - -- Not many lambda-bound variables have dictionary arguments, so
 - -- this would make little difference anyway.
 - --
 - -- For imported Ids we could check for an unfolding, but we have to
 - -- do so anyway in canSpecImport, and it seems better to have it
 - -- all in one place.  So we simply collect usage info for imported
 - -- overloaded functions.
+-
 -{- Note [Interesting dictionary arguments]
 -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 -Consider this
 -         \a.\d:Eq a.  let f = ... in ...(f d)...
 -There really is not much point in specialising f wrt the dictionary d,
 -because the code for the specialised f is not improved at all, because
 -d is lambda-bound.  We simply get junk specialisations.
+-
 -What is "interesting"?  Just that it has *some* structure.  But what about
 -variables?  We look in the variable's /unfolding/.  And that means
 -that we must be careful to ensure that dictionaries have unfoldings,
 +-- See Note [wantCallsFor]
 +wantCallsFor _env f
 +  = case idDetails f of
 +      RecSelId {}      -> False
 +      DataConWorkId {} -> False
 +      DataConWrapId {} -> False
 +      ClassOpId {}     -> False
 +      PrimOpId {}      -> False
 +      FCallId {}       -> False
 +      TickBoxOpId {}   -> False
 +      CoVarId {}       -> False
++
 +      DFunId {}        -> True
 +      VanillaId {}     -> True
 +      JoinId {}        -> True
 +      WorkerLikeId {}  -> True
 +      RepPolyId {}     -> True
++
 +{- Note [wantCallsFor]
 +~~~~~~~~~~~~~~~~~~~~~~
 +`wantCallsFor env f` says whether the Specialiser should collect calls for
 +function `f`; other thing being equal, the fewer calls we collect the better. It
 +is False for things we can't specialise:
++
 +* ClassOpId: never inline and we don't have a defn to specialise; we specialise
 +  them through fireRewriteRules.
 +* PrimOpId: are never overloaded
 +* Data constructors: we never specialise them
++
 +We could reduce the size of the UsageDetails by being less eager about
 +collecting calls for some LocalIds: there is no point for ones that are
 +lambda-bound.  We can't decide this by looking at the (absence of an) unfolding,
 +because unfoldings for local functions are discarded by cloneBindSM, so no local
 +binder will have an unfolding at this stage.  We'd have to keep a candidate set
 +of let-binders.
++
 +Not many lambda-bound variables have dictionary arguments, so this would make
 +little difference anyway.
++
 +For imported Ids we could check for an unfolding, but we have to do so anyway in
 +canSpecImport, and it seems better to have it all in one place.  So we simply
 +collect usage info for imported overloaded functions.
++
 +Note [Interesting dictionary arguments]
 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 +In `mkCallUDs` we only use `SpecDict` for dictionaries of which
 +`interestingDict` holds.  Otherwise we use `UnspecArg`.  Two reasons:
++
 +* Consider this
 +       \a.\d:Eq a.  let f = ... in ...(f d)...
 +  There really is not much point in specialising f wrt the dictionary d,
 +  because the code for the specialised f is not improved at all, because
 +  d is lambda-bound.  We simply get junk specialisations.
++
 +* Consider this (#25703):
 +     f :: (Eq a, Show b) => a -> b -> INt
 +     goo :: forall x. (Eq x) => x -> blah
 +     goo @x (d:Eq x) (arg:x) = ...(f @x @Int d $fShowInt)...
 +  If we built a `ci_key` with a (SpecDict d) for `d`, we would end up
 +  discarding the call at the `\d`.  But if we use `UnspecArg` for that
 +  uninteresting `d`, we'll get a `ci_key` of
 +      f @x @Int UnspecArg (SpecDict $fShowInt)
 +  and /that/ can float out to f's definition and specialise nicely.
 +  Hooray.  (NB: the call can float only if `-fpolymorphic-specialisation`
 +  is on; otherwise it'll be trapped by the `\@x -> ...`.)(
++
 +What is "interesting"?  (See `interestingDict`.)  Just that it has *some*
 +structure.  But what about variables?  We look in the variable's /unfolding/.
 +And that means that we must be careful to ensure that dictionaries /have/
 +unfoldings,
  * cloneBndrSM discards non-Stable unfoldings
  * specBind updates the unfolding after specialisation
@@ -3159,7 +3242,7 @@ Now `f` turns into:
            meth @a dc ....
  When we specialise `f`, at a=Int say, that superclass selection can
 -nfire (via rewiteClassOps), but that info (that 'dc' is now a
 +fire (via rewiteClassOps), but that info (that 'dc' is now a
  particular dictionary `C`, of type `C Int`) must be available to
  the call `meth @a dc`, so that we can fire the `meth` class-op, and
  thence specialise `wombat`.
@@ -3286,7 +3369,11 @@ dumpUDs :: [CoreBndr] -> UsageDetails -> (UsageDetails, OrdList DictBind)
  -- Used at a lambda or case binder; just dump anything mentioning the binder
  dumpUDs bndrs uds@(MkUD { ud_binds = orig_dbs, ud_calls = orig_calls })
    | null bndrs = (uds, nilOL)  -- Common in case alternatives
 -  | otherwise  = -- pprTrace "dumpUDs" (ppr bndrs $$ ppr free_uds $$ ppr dump_dbs) $
 +  | otherwise  = -- pprTrace "dumpUDs" (vcat
 +                 --    [ text "bndrs" <+> ppr bndrs
 +                 --    , text "uds" <+> ppr uds
 +                 --    , text "free_uds" <+> ppr free_uds
 +                 --    , text "dump-dbs" <+> ppr dump_dbs ]) $
                   (free_uds, dump_dbs)
    where
      free_uds = uds { ud_binds = free_dbs, ud_calls = free_calls }
@@ -3325,20 +3412,17 @@ callsForMe fn uds@MkUD { ud_binds = orig_dbs, ud_calls = orig_calls }
      calls_for_me = case lookupDVarEnv orig_calls fn of
                          Nothing -> []
                          Just cis -> filterCalls cis orig_dbs
 -         -- filterCalls: drop calls that (directly or indirectly)
 -         -- refer to fn.  See Note [Avoiding loops (DFuns)]
  ----------------------
  filterCalls :: CallInfoSet -> FloatedDictBinds -> [CallInfo]
 --- Remove dominated calls (Note [Specialising polymorphic dictionaries])
 --- and loopy DFuns (Note [Avoiding loops (DFuns)])
 +-- Remove
 +--   (a) dominated calls: (MP3) in Note [Specialising polymorphic dictionaries]
 +--   (b) loopy DFuns: Note [Avoiding loops (DFuns)]
  filterCalls (CIS fn call_bag) (FDB { fdb_binds = dbs })
 -  | isDFunId fn  -- Note [Avoiding loops (DFuns)] applies only to DFuns
 -  = filter ok_call de_dupd_calls
 -  | otherwise         -- Do not apply it to non-DFuns
 -  = de_dupd_calls  -- See Note [Avoiding loops (non-DFuns)]
 +  | isDFunId fn  = filter ok_call de_dupd_calls  -- Deals with (b)
 +  | otherwise    = de_dupd_calls
    where
 -    de_dupd_calls = remove_dups call_bag
 +    de_dupd_calls = removeDupCalls call_bag -- Deals with (a)
      dump_set = foldl' go (unitVarSet fn) dbs
        -- This dump-set could also be computed by splitDictBinds
@@ -3352,10 +3436,10 @@ filterCalls (CIS fn call_bag) (FDB { fdb_binds = dbs })
      ok_call (CI { ci_fvs = fvs }) = fvs `disjointVarSet` dump_set
 -remove_dups :: Bag CallInfo -> [CallInfo]
 +removeDupCalls :: Bag CallInfo -> [CallInfo]
  -- Calls involving more generic instances beat more specific ones.
  -- See (MP3) in Note [Specialising polymorphic dictionaries]
 -remove_dups calls = foldr add [] calls
 +removeDupCalls calls = foldr add [] calls
    where
      add :: CallInfo -> [CallInfo] -> [CallInfo]
      add ci [] = [ci]
@@ -3364,12 +3448,20 @@ remove_dups calls = foldr add [] calls
                        | otherwise               = ci2 : add ci1 cis
  beats_or_same :: CallInfo -> CallInfo -> Bool
 +-- (beats_or_same ci1 ci2) is True if specialising on ci1 subsumes ci2
 +-- That is: ci1's types are less specialised than ci2
 +--          ci1   specialises on the same dict args as ci2
  beats_or_same (CI { ci_key = args1 }) (CI { ci_key = args2 })
    = go args1 args2
    where
 -    go [] _ = True
 +    go []           []           = True
      go (arg1:args1) (arg2:args2) = go_arg arg1 arg2 && go args1 args2
 -    go (_:_)        []           = False
++
 +    -- If one or the other runs dry, the other must still have a SpecDict
 +    -- because of the (CI-KEY) invariant.  So neither subsumes the other;
 +    -- one is more specialised (faster code) but the other is more generally
 +    -- applicable.
 +    go  _ _ = False
      go_arg (SpecType ty1) (SpecType ty2) = isJust (tcMatchTy ty1 ty2)
      go_arg UnspecType     UnspecType     = True

compiler/GHC/Core/Rules.hs

@@ -9,7 +9,7 @@
  -- The 'CoreRule' datatype itself is declared elsewhere.
  module GHC.Core.Rules (
          -- ** Looking up rules
 -        lookupRule, matchExprs,
 +        lookupRule, matchExprs, ruleLhsIsMoreSpecific,
          -- ** RuleBase, RuleEnv
          RuleBase, RuleEnv(..), mkRuleEnv, emptyRuleEnv,
@@ -587,8 +587,8 @@ findBest :: InScopeSet -> (Id, [CoreExpr])
  findBest _        _      (rule,ans)   [] = (rule,ans)
  findBest in_scope target (rule1,ans1) ((rule2,ans2):prs)
 -  | isMoreSpecific in_scope rule1 rule2 = findBest in_scope target (rule1,ans1) prs
 -  | isMoreSpecific in_scope rule2 rule1 = findBest in_scope target (rule2,ans2) prs
 +  | ruleIsMoreSpecific in_scope rule1 rule2 = findBest in_scope target (rule1,ans1) prs
 +  | ruleIsMoreSpecific in_scope rule2 rule1 = findBest in_scope target (rule2,ans2) prs
    | debugIsOn = let pp_rule rule
                        = ifPprDebug (ppr rule)
                                     (doubleQuotes (ftext (ruleName rule)))
@@ -603,15 +603,25 @@ findBest in_scope target (rule1,ans1) ((rule2,ans2):prs)
    where
      (fn,args) = target
 -isMoreSpecific :: InScopeSet -> CoreRule -> CoreRule -> Bool
 --- The call (rule1 `isMoreSpecific` rule2)
 +ruleIsMoreSpecific :: InScopeSet -> CoreRule -> CoreRule -> Bool
 +-- The call (rule1 `ruleIsMoreSpecific` rule2)
  -- sees if rule2 can be instantiated to look like rule1
 --- See Note [isMoreSpecific]
 -isMoreSpecific _        (BuiltinRule {}) _                = False
 -isMoreSpecific _        (Rule {})        (BuiltinRule {}) = True
 -isMoreSpecific in_scope (Rule { ru_bndrs = bndrs1, ru_args = args1 })
 -                        (Rule { ru_bndrs = bndrs2, ru_args = args2 })
 -  = isJust (matchExprs in_scope_env bndrs2 args2 args1)
 +-- See Note [ruleIsMoreSpecific]
 +ruleIsMoreSpecific in_scope rule1 rule2
 +  = case rule1 of
 +       BuiltinRule {} -> False
 +       Rule { ru_bndrs = bndrs1, ru_args = args1 }
 +                      -> ruleLhsIsMoreSpecific in_scope bndrs1 args1 rule2
++
 +ruleLhsIsMoreSpecific :: InScopeSet
 +                      -> [Var] -> [CoreExpr]  -- LHS of a possible new rule
 +                      -> CoreRule             -- An existing rule
 +                      -> Bool                 -- New one is more specific
 +ruleLhsIsMoreSpecific in_scope bndrs1 args1 rule2
 +  = case rule2 of
 +       BuiltinRule {} -> True
 +       Rule { ru_bndrs = bndrs2, ru_args = args2 }
 +                      -> isJust (matchExprs in_scope_env bndrs2 args2 args1)
    where
     full_in_scope = in_scope `extendInScopeSetList` bndrs1
     in_scope_env  = ISE full_in_scope noUnfoldingFun
@@ -620,9 +630,9 @@ isMoreSpecific in_scope (Rule { ru_bndrs = bndrs1, ru_args = args1 })
  noBlackList :: Activation -> Bool
  noBlackList _ = False           -- Nothing is black listed
 -{- Note [isMoreSpecific]
 +{- Note [ruleIsMoreSpecific]
  ~~~~~~~~~~~~~~~~~~~~~~~~
 -The call (rule1 `isMoreSpecific` rule2)
 +The call (rule1 `ruleIsMoreSpecific` rule2)
  sees if rule2 can be instantiated to look like rule1.
  Wrinkle:
@@ -825,7 +835,7 @@ bound on the LHS:
    The rule looks like
      forall (a::*) (d::Eq Char) (x :: Foo a Char).
 -         f (Foo a Char) d x = True
 +         f @(Foo a Char) d x = True
    Matching the rule won't bind 'a', and legitimately so.  We fudge by
    pretending that 'a' is bound to (Any :: *).

compiler/GHC/Core/Unify.hs

@@ -331,35 +331,57 @@ Wrinkles
     `DontBindMe`, the unifier must return `SurelyApart`, not `MaybeApart`.  See
     `go_fam` in `uVarOrFam`
 -(ATF6) You might think that when /matching/ the um_fam_env will always be empty,
 -   because type-class-instance and type-family-instance heads can't include type
 -   families.  E.g.   instance C (F a) where ...   -- Illegal
+-
 -   But you'd be wrong: when "improving" type family constraint we may have a
 -   type family on the LHS of a match. Consider
 +(ATF6) When /matching/ can we ever have a type-family application on the LHS, in
 +   the template?  You might think not, because type-class-instance and
 +   type-family-instance heads can't include type families.  E.g.
 +            instance C (F a) where ...  -- Illegal
++
 +   But you'd be wrong: even when matching, we can see type families in the LHS template:
 +   * In `checkValidClass`, in `check_dm` we check that the default method has the
 +      right type, using matching, both ways.  And that type may have type-family
 +      applications in it. Example in test CoOpt_Singletons.
++
 +   * In the specialiser: see the call to `tcMatchTy` in
 +     `GHC.Core.Opt.Specialise.beats_or_same`
++
 +   * With -fpolymorphic-specialsation, we might get a specialiation rule like
 +         RULE forall a (d :: Eq (Maybe (F a))) .
 +                 f @(Maybe (F a)) d = ...
 +     See #25965.
++
 +   * A user-written RULE could conceivably have a type-family application
 +     in the template.  It might not be a good rule, but I don't think we currently
 +     check for this.
++
 +    In all these cases we are only interested in finding a substitution /for
 +    type variables/ that makes the match work.  So we simply want to recurse into
 +    the arguments of the type family.  E.g.
 +       Template:   forall a.  Maybe (F a)
 +       Target:     Mabybe (F Int)
 +    We want to succeed with substitution [a :-> Int].  See (ATF9).
++
 +    Conclusion: where we enter via `tcMatchTy`, `tcMatchTys`, `tc_match_tys`,
 +    etc, we always end up in `tc_match_tys_x`.  There we invoke the unifier
 +    but we do not distinguish between `SurelyApart` and `MaybeApart`. So in
 +    these cases we can set `um_bind_fam_fun` to `neverBindFam`.
++
 +(ATF7) There is one other, very special case of matching where we /do/ want to
 +   bind type families in `um_fam_env`, namely in GHC.Tc.Solver.Equality, the call
 +   to `tcUnifyTyForInjectivity False` in `improve_injective_wanted_top`.
 +   Consider
 +   of a match. Consider
        type family G6 a = r | r -> a
        type instance G6 [a]  = [G a]
        type instance G6 Bool = Int
 -   and the Wanted constraint [W] G6 alpha ~ [Int].  We /match/ each type instance
 -   RHS against [Int]!  So we try
 -        [G a] ~ [Int]
 +   and suppose we haev a Wanted constraint
 +      [W] G6 alpha ~ [Int]
 +.  According to Section 5.2 of "Injective type families for Haskell", we /match/
 +   the RHS each type instance [Int].  So we try
 +        Template: [G a]    Target: [Int]
     and we want to succeed with MaybeApart, so that we can generate the improvement
 -   constraint  [W] alpha ~ [beta]  where beta is fresh.
 -   See Section 5.2 of "Injective type families for Haskell".
+-
 -   A second place that we match with type-fams on the LHS is in `checkValidClass`.
 -   In `check_dm` we check that the default method has the right type, using matching,
 -   both ways.  And that type may have type-family applications in it. Example in
 -   test CoOpt_Singletons.
+-
 -(ATF7) You might think that (ATF6) is a very special case, and in /other/ uses of
 -  matching, where we enter via `tc_match_tys_x` we will never see a type-family
 -  in the template. But actually we do see that case in the specialiser: see
 -  the call to `tcMatchTy` in `GHC.Core.Opt.Specialise.beats_or_same`
+-
 -  Also: a user-written RULE could conceivably have a type-family application
 -  in the template.  It might not be a good rule, but I don't think we currently
 -  check for this.
 +   constraint
 +        [W] alpha ~ [beta]
 +   where beta is fresh.  We do this by binding [G a :-> Int]
  (ATF8) The treatment of type families is governed by
           um_bind_fam_fun :: BindFamFun
@@ -399,6 +421,8 @@ Wrinkles
    Key point: when decomposing (F tys1 ~ F tys2), we should /also/ extend the
    type-family substitution.
 +  (ATF11-1) All this cleverness only matters when unifying, not when matching
++
  (ATF12) There is a horrid exception for the injectivity check. See (UR1) in
    in Note [Specification of unification].
@@ -595,7 +619,7 @@ tc_match_tys_x :: HasDebugCallStack
                 -> [Type]
                 -> Maybe Subst
  tc_match_tys_x bind_tv match_kis (Subst in_scope id_env tv_env cv_env) tys1 tys2
 -  = case tc_unify_tys alwaysBindFam  -- (ATF7) in Note [Apartness and type families]
 +  = case tc_unify_tys neverBindFam  -- (ATF7) in Note [Apartness and type families]
                        bind_tv
                        False  -- Matching, not unifying
                        False  -- Not an injectivity check
@@ -1857,6 +1881,7 @@ uVarOrFam env ty1 ty2 kco
        = go_fam_fam tc1 tys1 tys2 kco
        -- Now check if we can bind the (F tys) to the RHS
 +      -- This can happen even when matching: see (ATF7)
        | BindMe <- um_bind_fam_fun env tc1 tys1 rhs
        = -- ToDo: do we need an occurs check here?
          do { extendFamEnv tc1 tys1 rhs
@@ -1881,11 +1906,6 @@ uVarOrFam env ty1 ty2 kco
      -- go_fam_fam: LHS and RHS are both saturated type-family applications,
      --             for the same type-family F
      go_fam_fam tc tys1 tys2 kco
 -      | tcEqTyConAppArgs tys1 tys2
 -      -- Detect (F tys ~ F tys); otherwise we'd build an infinite substitution
 -      = return ()
+-
 -      | otherwise
         -- Decompose (F tys1 ~ F tys2): (ATF9)
         -- Use injectivity information of F: (ATF10)
         -- But first bind the type-fam if poss: (ATF11)
@@ -1902,13 +1922,19 @@ uVarOrFam env ty1 ty2 kco
         (inj_tys1, noninj_tys1) = partitionByList inj tys1
         (inj_tys2, noninj_tys2) = partitionByList inj tys2
 -       bind_fam_if_poss | BindMe <- um_bind_fam_fun env tc tys1 rhs1
 -                        = extendFamEnv tc tys1 rhs1
 -                        | um_unif env
 -                        , BindMe <- um_bind_fam_fun env tc tys2 rhs2
 -                        = extendFamEnv tc tys2 rhs2
 -                        | otherwise
 -                        = return ()
 +       bind_fam_if_poss
 +         | not (um_unif env)  -- Not when matching (ATF11-1)
 +         = return ()
 +         | tcEqTyConAppArgs tys1 tys2   -- Detect (F tys ~ F tys);
 +         = return ()                    -- otherwise we'd build an infinite substitution
 +         | BindMe <- um_bind_fam_fun env tc tys1 rhs1
 +         = extendFamEnv tc tys1 rhs1
 +         | um_unif env
 +         , BindMe <- um_bind_fam_fun env tc tys2 rhs2
 +         = extendFamEnv tc tys2 rhs2
 +         | otherwise
 +         = return ()
++
         rhs1 = mkTyConApp tc tys2 `mkCastTy` mkSymCo kco
         rhs2 = mkTyConApp tc tys1 `mkCastTy` kco
@@ -1993,7 +2019,7 @@ data UMState = UMState
    -- in um_foralls; i.e. variables bound by foralls inside the types being unified
    -- When /matching/ um_fam_env is usually empty; but not quite always.
 -  -- See (ATF6) and (ATF7) of Note [Apartness and type families]
 +  -- See (ATF7) of Note [Apartness and type families]
  newtype UM a
    = UM' { unUM :: UMState -> UnifyResultM (UMState, a) }

compiler/GHC/Tc/Solver/Equality.hs

@@ -3017,6 +3017,7 @@ improve_wanted_top_fun_eqs fam_tc lhs_tys rhs_ty
  improve_injective_wanted_top :: FamInstEnvs -> [Bool] -> TyCon -> [TcType] -> Xi -> TcS [TypeEqn]
  -- Interact with top-level instance declarations
 +-- See Section 5.2 in the Injective Type Families paper
  improve_injective_wanted_top fam_envs inj_args fam_tc lhs_tys rhs_ty
    = concatMapM do_one branches
    where
@@ -3035,6 +3036,7 @@ improve_injective_wanted_top fam_envs inj_args fam_tc lhs_tys rhs_ty
      do_one branch@(CoAxBranch { cab_tvs = branch_tvs, cab_lhs = branch_lhs_tys, cab_rhs = branch_rhs })
        | let in_scope1 = in_scope `extendInScopeSetList` branch_tvs
        , Just subst <- tcUnifyTyForInjectivity False in_scope1 branch_rhs rhs_ty
 +                      -- False: matching, not unifying
        = do { let inSubst tv = tv `elemVarEnv` getTvSubstEnv subst
                   unsubstTvs = filterOut inSubst branch_tvs
                   -- The order of unsubstTvs is important; it must be

compiler/GHC/Types/Basic.hs

@@ -87,7 +87,7 @@ module GHC.Types.Basic (
          CompilerPhase(..), PhaseNum, beginPhase, nextPhase, laterPhase,
          Activation(..), isActive, competesWith,
 -        isNeverActive, isAlwaysActive, activeInFinalPhase,
 +        isNeverActive, isAlwaysActive, activeInFinalPhase, activeInInitialPhase,
          activateAfterInitial, activateDuringFinal, activeAfter,
          RuleMatchInfo(..), isConLike, isFunLike,

testsuite/tests/simplCore/should_compile/T25703.hs

 +module T25703 where
++
 +f :: (Eq a, Show b) => a -> b -> Int
 +f x y = f x y
++
 +goo :: forall x. (Eq x) => x -> Int
 +goo arg = f arg (3::Int)

testsuite/tests/simplCore/should_compile/T25703.stderr

	1	+Rule fired: SPEC f @_ @Int (T25703)
	2	+Rule fired: SPEC f @_ @Int (T25703)

testsuite/tests/simplCore/should_compile/T25703a.hs

 +{-# LANGUAGE DataKinds #-}
 +{-# LANGUAGE GADTs #-}
++
 +{-# OPTIONS_GHC -O2 -fspecialise-aggressively #-}
++
 +-- This pragma is just here to pretend that the function body of 'foo' is huge
 +-- and should never be inlined.
 +{-# OPTIONS_GHC -funfolding-use-threshold=-200 #-}
++
 +module T25703a where
++
 +import Data.Kind
 +import Data.Type.Equality
 +import Data.Proxy
 +import GHC.TypeNats
++
 +-- Pretend this is some big dictionary that absolutely must get
 +-- specialised away for performance reasons.
 +type C :: Nat -> Constraint
 +class C i where
 +  meth :: Proxy i -> Double
 +instance C 0 where
 +  meth _ = 0.1
 +instance C 1 where
 +  meth _ = 1.1
 +instance C 2 where
 +  meth _ = 2.1
++
 +{-# INLINEABLE foo #-}
 +foo :: forall a (n :: Nat) (m :: Nat)
 +    .  ( Eq a, C n, C m )
 +    => a -> ( Proxy n, Proxy m ) -> Int -> Double
 +-- Pretend this is a big complicated function, too big to inline,
 +-- for which we absolutely must specialise away the 'C n', 'C m'
 +-- dictionaries for performance reasons.
 +foo a b c
 +  = if a == a
 +    then meth @n Proxy + fromIntegral c
 +    else 2 * meth @m Proxy
++
 +-- Runtime dispatch to a specialisation of 'foo'
 +foo_spec :: forall a (n :: Nat) (m :: Nat)
 +         .  ( Eq a, KnownNat n, KnownNat m )
 +         => a -> ( Proxy n, Proxy m ) -> Int -> Double
 +foo_spec a b c
 +  | Just Refl <- sameNat @n @0 Proxy Proxy
 +  , Just Refl <- sameNat @m @0 Proxy Proxy
 +  = foo @a @0 @0 a b c
 +  | Just Refl <- sameNat @n @0 Proxy Proxy
 +  , Just Refl <- sameNat @m @1 Proxy Proxy
 +  = foo @a @0 @1 a b c
 +  | Just Refl <- sameNat @n @1 Proxy Proxy
 +  , Just Refl <- sameNat @m @1 Proxy Proxy
 +  = foo @a @1 @1 a b c
 +  | Just Refl <- sameNat @n @0 Proxy Proxy
 +  , Just Refl <- sameNat @m @2 Proxy Proxy
 +  = foo @a @0 @2 a b c
 +  | Just Refl <- sameNat @n @1 Proxy Proxy
 +  , Just Refl <- sameNat @m @2 Proxy Proxy
 +  = foo @a @1 @2 a b c
 +  | Just Refl <- sameNat @n @2 Proxy Proxy
 +  , Just Refl <- sameNat @m @2 Proxy Proxy
 +  = foo @a @2 @2 a b c
 +  | otherwise
 +  = error $ unlines
 +      [ "f: no specialisation"
 +      , "n: " ++ show (natVal @n Proxy)
 +      , "m: " ++ show (natVal @m Proxy)
 +      ]

testsuite/tests/simplCore/should_compile/T25703a.stderr

 +Rule fired: SPEC foo @_ @2 @2 (T25703a)
 +Rule fired: SPEC foo @_ @1 @2 (T25703a)
 +Rule fired: SPEC foo @_ @0 @2 (T25703a)
 +Rule fired: SPEC foo @_ @1 @1 (T25703a)
 +Rule fired: SPEC foo @_ @0 @1 (T25703a)
 +Rule fired: SPEC foo @_ @0 @0 (T25703a)

testsuite/tests/simplCore/should_compile/T25965.hs

 +{-# LANGUAGE TypeFamilies #-}
 +{-# OPTIONS_GHC -O -fpolymorphic-specialisation #-}
++
 +module Foo where
++
 +type family F a
++
 +data T a = T1
++
 +instance Eq (T a) where { (==) x y = False }
++
 +foo :: Eq a => a -> Bool
 +foo x | x==x = True
 +      | otherwise = foo x
++
 +bar :: forall b. b -> T (F b) -> Bool
 +bar y x = foo x
++

testsuite/tests/simplCore/should_compile/all.T

@@ -543,3 +543,8 @@ test('T25883c', normal, compile_grep_core, [''])
  test('T25883d', [extra_files(['T25883d_import.hs'])], multimod_compile_filter, ['T25883d', '-O -ddump-simpl -dno-typeable-binds -dsuppress-all -dsuppress-uniques', r'grep -e "y ="'])
  test('T25976', [grep_errmsg('Dead Code')], compile, ['-O -ddump-simpl -dsuppress-uniques -dno-typeable-binds'])
++
 +test('T25965', normal, compile, ['-O'])
 +test('T25703',  [grep_errmsg(r'SPEC')], compile, ['-O -fpolymorphic-specialisation -ddump-rule-firings'])
 +test('T25703a', [grep_errmsg(r'SPEC')], compile, ['-O -fpolymorphic-specialisation -ddump-rule-firings'])
++