%
% (c) The University of Glasgow 2006
% (c) The AQUA Project, Glasgow University, 1994-1998
%
Core-syntax unfoldings
Unfoldings (which can travel across module boundaries) are in Core
syntax (namely @CoreExpr@s).
The type @Unfolding@ sits ``above'' simply-Core-expression
unfoldings, capturing ``higher-level'' things we know about a binding,
usually things that the simplifier found out (e.g., ``it's a
literal''). In the corner of a @CoreUnfolding@ unfolding, you will
find, unsurprisingly, a Core expression.
\begin{code}
module CoreUnfold (
Unfolding, UnfoldingGuidance,
noUnfolding, mkImplicitUnfolding,
mkUnfolding, mkCoreUnfolding,
mkTopUnfolding, mkSimpleUnfolding,
mkInlineUnfolding, mkInlinableUnfolding, mkWwInlineRule,
mkCompulsoryUnfolding, mkDFunUnfolding,
interestingArg, ArgSummary(..),
couldBeSmallEnoughToInline,
certainlyWillInline, smallEnoughToInline,
callSiteInline, CallCtxt(..),
exprIsConApp_maybe
) where
#include "HsVersions.h"
import StaticFlags
import DynFlags
import CoreSyn
import PprCore ()
import TcType ( tcSplitSigmaTy, tcSplitDFunHead )
import OccurAnal
import CoreSubst hiding( substTy )
import CoreFVs ( exprFreeVars )
import CoreArity ( manifestArity, exprBotStrictness_maybe )
import CoreUtils
import Id
import DataCon
import TyCon
import Literal
import PrimOp
import IdInfo
import BasicTypes ( Arity )
import TcType ( tcSplitDFunTy )
import Type
import Coercion
import PrelNames
import VarEnv ( mkInScopeSet )
import Bag
import Util
import FastTypes
import FastString
import Outputable
import Data.Maybe
\end{code}
%************************************************************************
%* *
\subsection{Making unfoldings}
%* *
%************************************************************************
\begin{code}
mkTopUnfolding :: Bool -> CoreExpr -> Unfolding
mkTopUnfolding = mkUnfolding InlineRhs True
mkImplicitUnfolding :: CoreExpr -> Unfolding
mkImplicitUnfolding expr = mkTopUnfolding False (simpleOptExpr expr)
mkSimpleUnfolding :: CoreExpr -> Unfolding
mkSimpleUnfolding = mkUnfolding InlineRhs False False
mkDFunUnfolding :: Type -> [CoreExpr] -> Unfolding
mkDFunUnfolding dfun_ty ops
= DFunUnfolding dfun_nargs data_con ops
where
(tvs, theta, head_ty) = tcSplitSigmaTy dfun_ty
(cls, _) = tcSplitDFunHead head_ty
dfun_nargs = length tvs + length theta
data_con = classDataCon cls
mkWwInlineRule :: Id -> CoreExpr -> Arity -> Unfolding
mkWwInlineRule id expr arity
= mkCoreUnfolding (InlineWrapper id) True
(simpleOptExpr expr) arity
(UnfWhen unSaturatedOk boringCxtNotOk)
mkCompulsoryUnfolding :: CoreExpr -> Unfolding
mkCompulsoryUnfolding expr
= mkCoreUnfolding InlineCompulsory True
expr 0
(UnfWhen unSaturatedOk boringCxtOk)
mkInlineUnfolding :: Maybe Arity -> CoreExpr -> Unfolding
mkInlineUnfolding mb_arity expr
= mkCoreUnfolding InlineStable
True
expr' arity
(UnfWhen unsat_ok boring_ok)
where
expr' = simpleOptExpr expr
(unsat_ok, arity) = case mb_arity of
Nothing -> (unSaturatedOk, manifestArity expr')
Just ar -> (needSaturated, ar)
boring_ok = case calcUnfoldingGuidance True
False
(arity+1) expr' of
(_, UnfWhen _ boring_ok) -> boring_ok
_other -> boringCxtNotOk
mkInlinableUnfolding :: CoreExpr -> Unfolding
mkInlinableUnfolding expr
= mkUnfolding InlineStable True is_bot expr'
where
expr' = simpleOptExpr expr
is_bot = isJust (exprBotStrictness_maybe expr')
\end{code}
Internal functions
\begin{code}
mkCoreUnfolding :: UnfoldingSource -> Bool -> CoreExpr
-> Arity -> UnfoldingGuidance -> Unfolding
mkCoreUnfolding src top_lvl expr arity guidance
= CoreUnfolding { uf_tmpl = occurAnalyseExpr expr,
uf_src = src,
uf_arity = arity,
uf_is_top = top_lvl,
uf_is_value = exprIsHNF expr,
uf_is_conlike = exprIsConLike expr,
uf_is_cheap = exprIsCheap expr,
uf_expandable = exprIsExpandable expr,
uf_guidance = guidance }
mkUnfolding :: UnfoldingSource -> Bool -> Bool -> CoreExpr -> Unfolding
mkUnfolding src top_lvl is_bottoming expr
= CoreUnfolding { uf_tmpl = occurAnalyseExpr expr,
uf_src = src,
uf_arity = arity,
uf_is_top = top_lvl,
uf_is_value = exprIsHNF expr,
uf_is_conlike = exprIsConLike expr,
uf_expandable = exprIsExpandable expr,
uf_is_cheap = is_cheap,
uf_guidance = guidance }
where
is_cheap = exprIsCheap expr
(arity, guidance) = calcUnfoldingGuidance is_cheap (top_lvl && is_bottoming)
opt_UF_CreationThreshold expr
\end{code}
%************************************************************************
%* *
\subsection{The UnfoldingGuidance type}
%* *
%************************************************************************
\begin{code}
calcUnfoldingGuidance
:: Bool           -- True <=> the RHS is cheap (expr_is_cheap)
-> Bool           -- True <=> top-level and bottoming
                  --   (see Note [Do not inline top-level bottoming functions])
-> Int            -- Bomb-out size: give up if the body gets bigger than this
-> CoreExpr       -- Expression to look at
-> (Arity, UnfoldingGuidance)
calcUnfoldingGuidance expr_is_cheap top_bot bOMB_OUT_SIZE expr
= case collectBinders expr of { (bndrs, body) ->
let
val_bndrs = filter isId bndrs
n_val_bndrs = length val_bndrs
guidance
= case (sizeExpr (iUnbox bOMB_OUT_SIZE) val_bndrs body) of
TooBig -> UnfNever
SizeIs size cased_bndrs scrut_discount
| uncondInline n_val_bndrs (iBox size)
, expr_is_cheap
-> UnfWhen unSaturatedOk boringCxtOk
| top_bot
-> UnfNever
| otherwise
-> UnfIfGoodArgs { ug_args = map (discount cased_bndrs) val_bndrs
, ug_size = iBox size
, ug_res = iBox scrut_discount }
discount cbs bndr
= foldlBag (\acc (b',n) -> if bndr==b' then acc+n else acc)
0 cbs
in
(n_val_bndrs, guidance) }
\end{code}
Note [Computing the size of an expression]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The basic idea of sizeExpr is obvious enough: count nodes. But getting the
heuristics right has taken a long time. Here's the basic strategy:
* Variables, literals: 0
(Exception for string literals, see litSize.)
* Function applications (f e1 .. en): 1 + #value args
* Constructor applications: 1, regardless of #args
* Let(rec): 1 + size of components
* Note, cast: 0
Examples
Size Term
0 42#
0 x
0 True
2 f x
1 Just x
4 f (g x)
Notice that 'x' counts 0, while (f x) counts 2. That's deliberate: there's
a function call to account for. Notice also that constructor applications
are very cheap, because exposing them to a caller is so valuable.
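For instance, the size 4 for (f (g x)) above arises as 2 for the outer
call of f (one node plus one value argument) plus 2 for the nested call
of g; each application costs 1 plus the number of value arguments.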
Note [Do not inline top-level bottoming functions]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The FloatOut pass has gone to some trouble to float out calls to 'error'
and similar friends. See Note [Bottoming floats] in SetLevels.
Do not re-inline them! But we *do* still inline if they are very small
(the uncondInline stuff).
Note [INLINE for small functions]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Consider
f x = Just x
g y = f y
Then f's RHS is no larger than its LHS, so we should inline it into
even the most boring context. In general, if the function is
sufficiently small that its body is as small as the call itself, then
inline unconditionally, regardless of how boring the context is.
Things to note:
* We inline *unconditionally* if the inlined thing is smaller (using sizeExpr)
  than the thing it's replacing. Notice that
      (f x) --> (g 3)       -- YES, unconditionally
      (f x) --> x : []      -- YES, even though the cons has two arguments
      x     --> g 3         -- NO
      x     --> Just v      -- NO
  It's very important not to unconditionally replace a variable by
  a non-atomic term.
* We do this even if the thing isn't saturated, else we end up with the
silly situation that
f x y = x
...map (f 3)...
doesn't inline. Even in a boring context, inlining without being
saturated will give a lambda instead of a PAP, and will be more
efficient at runtime.
* However, when the function's arity > 0, we do insist that it
has at least one value argument at the call site. Otherwise we find this:
f = /\a \x:a. x
d = /\b. MkD (f b)
If we inline f here we get
d = /\b. MkD (\x:b. x)
and then prepareRhs floats out the argument, abstracting the type
variables, so we end up with the original again!
\begin{code}
-- Inline unconditionally if there is no size increase:
-- the call being replaced costs about (arity + 1).
-- See Note [INLINE for small functions]
uncondInline :: Arity -> Int -> Bool
uncondInline arity size
  | arity == 0 = size == 0
  | otherwise  = size <= arity + 1
\end{code}
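Reading the definition off directly (an informal illustration, not part of
the module):
    uncondInline 0 0   =  True    -- a nullary binding must have size 0 (be trivial)
    uncondInline 2 3   =  True    -- body no bigger than the call (size <= arity+1)
    uncondInline 2 4   =  False   -- inlining would grow the code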
\begin{code}
sizeExpr :: FastInt         -- Bomb out if the size gets bigger than this
         -> [Id]            -- Arguments of the function being analysed; we record
                            --   discounts for those that get case'd or applied
         -> CoreExpr
         -> ExprSize
sizeExpr bOMB_OUT_SIZE top_args expr
= size_up expr
where
size_up (Cast e _) = size_up e
size_up (Note _ e) = size_up e
size_up (Type _) = sizeZero
size_up (Lit lit) = sizeN (litSize lit)
size_up (Var f) = size_up_call f []
size_up (App fun (Type _)) = size_up fun
size_up (App fun arg) = size_up arg `addSizeNSD`
size_up_app fun [arg]
size_up (Lam b e) | isId b = lamScrutDiscount (size_up e `addSizeN` 1)
| otherwise = size_up e
size_up (Let (NonRec binder rhs) body)
= size_up rhs `addSizeNSD`
size_up body `addSizeN`
(if isUnLiftedType (idType binder) then 0 else 1)
size_up (Let (Rec pairs) body)
= foldr (addSizeNSD . size_up . snd)
(size_up body `addSizeN` length pairs)
pairs
size_up (Case (Var v) _ _ alts)
| v `elem` top_args
= alts_size (foldr1 addAltSize alt_sizes)
(foldr1 maxSize alt_sizes)
where
alt_sizes = map size_up_alt alts
alts_size (SizeIs tot tot_disc tot_scrut)
(SizeIs max _ _)
= SizeIs tot (unitBag (v, iBox (_ILIT(2) +# tot -# max)) `unionBags` tot_disc) tot_scrut
alts_size tot_size _ = tot_size
size_up (Case e _ _ alts) = size_up e `addSizeNSD`
foldr (addAltSize . size_up_alt) sizeZero alts
size_up_app (App fun arg) args
| isTypeArg arg = size_up_app fun args
| otherwise = size_up arg `addSizeNSD`
size_up_app fun (arg:args)
size_up_app (Var fun) args = size_up_call fun args
size_up_app other args = size_up other `addSizeN` length args
size_up_call :: Id -> [CoreExpr] -> ExprSize
size_up_call fun val_args
= case idDetails fun of
FCallId _ -> sizeN opt_UF_DearOp
DataConWorkId dc -> conSize dc (length val_args)
PrimOpId op -> primOpSize op (length val_args)
ClassOpId _ -> classOpSize top_args val_args
_ -> funSize top_args fun (length val_args)
size_up_alt (_con, _bndrs, rhs) = size_up rhs `addSizeN` 1
addSizeN TooBig _ = TooBig
addSizeN (SizeIs n xs d) m = mkSizeIs bOMB_OUT_SIZE (n +# iUnbox m) xs d
addAltSize TooBig _ = TooBig
addAltSize _ TooBig = TooBig
addAltSize (SizeIs n1 xs d1) (SizeIs n2 ys d2)
= mkSizeIs bOMB_OUT_SIZE (n1 +# n2)
(xs `unionBags` ys)
(d1 +# d2)
-- addSizeNSD: add sizes, but drop the result discount of the LEFT
-- argument, because that argument is not the overall result
addSizeNSD TooBig _ = TooBig
addSizeNSD _ TooBig = TooBig
addSizeNSD (SizeIs n1 xs _) (SizeIs n2 ys d2)
= mkSizeIs bOMB_OUT_SIZE (n1 +# n2)
(xs `unionBags` ys)
d2
\end{code}
\begin{code}
litSize :: Literal -> Int
litSize (MachStr str) = 1 + ((lengthFS str + 3) `div` 4)
litSize _other = 0
classOpSize :: [Id] -> [CoreExpr] -> ExprSize
classOpSize _ []
= sizeZero
classOpSize top_args (arg1 : other_args)
= SizeIs (iUnbox size) arg_discount (_ILIT(0))
where
size = 2 + length other_args
arg_discount = case arg1 of
Var dict | dict `elem` top_args
-> unitBag (dict, opt_UF_DictDiscount)
_other -> emptyBag
funSize :: [Id] -> Id -> Int -> ExprSize
funSize top_args fun n_val_args
| fun `hasKey` buildIdKey = buildSize
| fun `hasKey` augmentIdKey = augmentSize
| otherwise = SizeIs (iUnbox size) arg_discount (iUnbox res_discount)
where
some_val_args = n_val_args > 0
arg_discount | some_val_args && fun `elem` top_args
= unitBag (fun, opt_UF_FunAppDiscount)
| otherwise = emptyBag
res_discount | idArity fun > n_val_args = opt_UF_FunAppDiscount
| otherwise = 0
size | some_val_args = 1 + n_val_args
| otherwise = 0
conSize :: DataCon -> Int -> ExprSize
conSize dc n_val_args
| n_val_args == 0 = SizeIs (_ILIT(0)) emptyBag (_ILIT(1))
| isUnboxedTupleCon dc = SizeIs (_ILIT(0)) emptyBag (iUnbox n_val_args +# _ILIT(1))
| otherwise = SizeIs (_ILIT(1)) emptyBag (iUnbox n_val_args +# _ILIT(1))
\end{code}
Note [Constructor size]
~~~~~~~~~~~~~~~~~~~~~~~
Treat a constructor application as size 1, regardless of how many
arguments it has; we are keen to expose them (and we charge separately
for their args). We can't treat them as size zero, else we find that
(Just x) has size 0, which is the same as a lone variable; and hence
'v' will always be replaced by (Just x), where v is bound to Just x.
However, unboxed tuples count as size zero. I found occasions where we had
f x y z = case op# x y z of { s -> (# s, () #) }
and f wasn't getting inlined.
Note [Unboxed tuple result discount]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
I tried giving unboxed tuples a *result discount* of zero (see the
commented-out line). Why? When returned as a result they do not
allocate, so maybe we don't want to charge so much for them. If you
have a non-zero discount here, we find that workers often get inlined
back into wrappers, because it looks like
f x = case $wf x of (# a,b #) -> (a,b)
and we are keener because of the case. However while this change
shrank binary sizes by 0.5% it also made spectral/boyer allocate 5%
more. All other changes were very small. So it's not a big deal but I
didn't adopt the idea.
\begin{code}
primOpSize :: PrimOp -> Int -> ExprSize
primOpSize op n_val_args
| not (primOpIsDupable op) = sizeN opt_UF_DearOp
| not (primOpOutOfLine op) = sizeN 1
| otherwise = sizeN n_val_args
buildSize :: ExprSize
buildSize = SizeIs (_ILIT(0)) emptyBag (_ILIT(4))
augmentSize :: ExprSize
augmentSize = SizeIs (_ILIT(0)) emptyBag (_ILIT(4))
lamScrutDiscount :: ExprSize -> ExprSize
lamScrutDiscount (SizeIs n vs _) = SizeIs n vs (iUnbox opt_UF_FunAppDiscount)
lamScrutDiscount TooBig = TooBig
\end{code}
Note [addAltSize result discounts]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
When adding the size of alternatives, we *add* the result discounts
too, rather than take the *maximum*. For a multibranch case, this
gives a discount for each branch that returns a constructor, making us
keener to inline. I did try using 'max' instead, but it makes nofib
'rewrite' and 'puzzle' allocate significantly more, and didn't make
binary sizes shrink significantly either.
Note [Discounts and thresholds]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Constants for discounts and thresholds are defined in main/StaticFlags,
all of the form opt_UF_xxxx. They are:
opt_UF_CreationThreshold (45)
At a definition site, if the unfolding is bigger than this, we
may discard it altogether
opt_UF_UseThreshold (6)
At a call site, if the unfolding, less discounts, is smaller than
this, then it's small enough to inline
opt_UF_KeenessFactor (1.5)
Factor by which the discounts are multiplied before
subtracting from size
opt_UF_DictDiscount (1)
The discount for each occurrence of a dictionary argument
as an argument of a class method. Should be pretty small
else big functions may get inlined
opt_UF_FunAppDiscount (6)
Discount for a function argument that is applied. Quite
large, because if we inline we avoid the higher-order call.
opt_UF_DearOp (4)
The size of a foreign call or not-dupable PrimOp
Note [Function applications]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In a function application (f a b)

  - If 'f' is an argument to the function being analysed,
    and there's at least one value arg, record a FunAppDiscount for f

  - If the application is a PAP (arity > 2 in this example)
    record a *result* discount (because inlining
    with "extra" args in the call may mean that we now
    get a saturated application)
Code for manipulating sizes
\begin{code}
data ExprSize = TooBig
              | SizeIs FastInt          -- Size found
                       (Bag (Id,Int))   -- Arguments cased herein, and the discount for each
                       FastInt          -- Size to subtract if the result is scrutinised
                                        --   by a case expression
instance Outputable ExprSize where
ppr TooBig = ptext (sLit "TooBig")
ppr (SizeIs a _ c) = brackets (int (iBox a) <+> int (iBox c))
mkSizeIs :: FastInt -> FastInt -> Bag (Id, Int) -> FastInt -> ExprSize
mkSizeIs max n xs d | (n -# d) ># max = TooBig
| otherwise = SizeIs n xs d
maxSize :: ExprSize -> ExprSize -> ExprSize
maxSize TooBig _ = TooBig
maxSize _ TooBig = TooBig
maxSize s1@(SizeIs n1 _ _) s2@(SizeIs n2 _ _) | n1 ># n2 = s1
| otherwise = s2
sizeZero :: ExprSize
sizeN :: Int -> ExprSize
sizeZero = SizeIs (_ILIT(0)) emptyBag (_ILIT(0))
sizeN n = SizeIs (iUnbox n) emptyBag (_ILIT(0))
\end{code}
%************************************************************************
%* *
\subsection[considerUnfolding]{Given all the info, do (not) do the unfolding}
%* *
%************************************************************************
We use 'couldBeSmallEnoughToInline' to avoid exporting inlinings that
we ``couldn't possibly use'' on the other side. Can be overridden w/
flaggery. Just the same as smallEnoughToInline, except that it has no
actual arguments.
\begin{code}
couldBeSmallEnoughToInline :: Int -> CoreExpr -> Bool
couldBeSmallEnoughToInline threshold rhs
= case sizeExpr (iUnbox threshold) [] body of
TooBig -> False
_ -> True
where
(_, body) = collectBinders rhs
smallEnoughToInline :: Unfolding -> Bool
smallEnoughToInline (CoreUnfolding {uf_guidance = UnfIfGoodArgs {ug_size = size}})
= size <= opt_UF_UseThreshold
smallEnoughToInline _
= False
certainlyWillInline :: Unfolding -> Bool
certainlyWillInline (CoreUnfolding { uf_is_cheap = is_cheap, uf_arity = n_vals, uf_guidance = guidance })
= case guidance of
UnfNever -> False
UnfWhen {} -> True
UnfIfGoodArgs { ug_size = size}
-> is_cheap && size - (n_vals +1) <= opt_UF_UseThreshold
certainlyWillInline _
= False
\end{code}
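For example, with the default opt_UF_UseThreshold of 6 (see Note [Discounts
and thresholds] above), a cheap unfolding with uf_arity = 3 and ug_size = 10
certainly will inline (10 - 4 <= 6), while one of size 11 will not.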
%************************************************************************
%* *
\subsection{callSiteInline}
%* *
%************************************************************************
This is the key function. It decides whether to inline a variable at a call site.

callSiteInline is used at call sites, so it is a bit more generous.
It's a very important function that embodies lots of heuristics.
A non-WHNF can be inlined if it doesn't occur inside a lambda,
and occurs exactly once or
occurs once in each branch of a case and is small
If the thing is in WHNF, there's no danger of duplicating work,
so we can inline if it occurs once, or is small
NOTE: we don't want to inline top-level functions that always diverge.
It just makes the code bigger. It turns out that the convenient way to prevent
them inlining is to give them a NOINLINE pragma, which we do in
StrictAnal.addStrictnessInfoToTopId
\begin{code}
callSiteInline :: DynFlags
               -> Id                -- The Id being considered
               -> Unfolding         -- Its unfolding
               -> Bool              -- True <=> the variable appears all alone, with no
                                    --   arguments at all (see Note [Lone variables])
               -> [ArgSummary]      -- One for each value argument at the call
               -> CallCtxt          -- Context of the call
               -> Maybe CoreExpr    -- The unfolding template, if we decide to inline
instance Outputable ArgSummary where
ppr TrivArg = ptext (sLit "TrivArg")
ppr NonTrivArg = ptext (sLit "NonTrivArg")
ppr ValueArg = ptext (sLit "ValueArg")
data CallCtxt = BoringCtxt      -- Nothing interesting about the context
              | ArgCtxt         -- We are in the argument of a function, or the RHS of
                  Bool          --   a let (see Note [Inlining in ArgCtxt], Note [RHS of lets])
              | ValAppCtxt      -- Applied to at least one value arg (see Note [Cast then apply])
              | CaseCtxt        -- Scrutinee of a case
instance Outputable CallCtxt where
ppr BoringCtxt = ptext (sLit "BoringCtxt")
ppr (ArgCtxt rules) = ptext (sLit "ArgCtxt") <+> ppr rules
ppr CaseCtxt = ptext (sLit "CaseCtxt")
ppr ValAppCtxt = ptext (sLit "ValAppCtxt")
callSiteInline dflags id unfolding lone_variable arg_infos cont_info
= case unfolding of {
NoUnfolding -> Nothing ;
OtherCon _ -> Nothing ;
DFunUnfolding {} -> Nothing ;
CoreUnfolding { uf_tmpl = unf_template, uf_is_top = is_top,
uf_is_cheap = is_cheap, uf_arity = uf_arity, uf_guidance = guidance } ->
let
n_val_args = length arg_infos
saturated = n_val_args >= uf_arity
result | yes_or_no = Just unf_template
| otherwise = Nothing
interesting_args = any nonTriv arg_infos
some_benefit
| not saturated = interesting_args
| n_val_args > uf_arity = True
| otherwise = interesting_args
|| interesting_saturated_call
interesting_saturated_call
= case cont_info of
BoringCtxt -> not is_top && uf_arity > 0
CaseCtxt -> not (lone_variable && is_cheap)
ArgCtxt {} -> uf_arity > 0
ValAppCtxt -> True
(yes_or_no, extra_doc)
= case guidance of
UnfNever -> (False, empty)
UnfWhen unsat_ok boring_ok
-> (enough_args && (boring_ok || some_benefit), empty )
where
enough_args = saturated || (unsat_ok && n_val_args > 0)
UnfIfGoodArgs { ug_args = arg_discounts, ug_res = res_discount, ug_size = size }
-> ( is_cheap && some_benefit && small_enough
, (text "discounted size =" <+> int discounted_size) )
where
discounted_size = size - discount
small_enough = discounted_size <= opt_UF_UseThreshold
discount = computeDiscount uf_arity arg_discounts
res_discount arg_infos cont_info
in
if (dopt Opt_D_dump_inlinings dflags && dopt Opt_D_verbose_core2core dflags) then
pprTrace ("Considering inlining: " ++ showSDoc (ppr id))
(vcat [text "arg infos" <+> ppr arg_infos,
text "uf arity" <+> ppr uf_arity,
text "interesting continuation" <+> ppr cont_info,
text "some_benefit" <+> ppr some_benefit,
text "is cheap:" <+> ppr is_cheap,
text "guidance" <+> ppr guidance,
extra_doc,
text "ANSWER =" <+> if yes_or_no then text "YES" else text "NO"])
result
else
result
}
\end{code}
Note [RHS of lets]
~~~~~~~~~~~~~~~~~~
Be a tiny bit keener to inline in the RHS of a let, because that might
lead to good things later
f y = (y,y,y)
g y = let x = f y in ...(case x of (a,b,c) -> ...) ...
We'd inline 'f' if the call was in a case context, and it kind-of is,
only we can't see it. So we treat the RHS of a let as not-totally-boring.
Note [Unsaturated applications]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
When a call is not saturated, we *still* inline if one of the
arguments has interesting structure. That's sometimes very important.
A good example is the Ord instance for Bool in Base:
Rec {
$fOrdBool = GHC.Classes.D:Ord
@ Bool
...
$cmin_ajX
$cmin_ajX [Occ=LoopBreaker] :: Bool -> Bool -> Bool
$cmin_ajX = GHC.Classes.$dmmin @ Bool $fOrdBool
}
But the defn of GHC.Classes.$dmmin is:
$dmmin :: forall a. GHC.Classes.Ord a => a -> a -> a
We *really* want to inline $dmmin, even though it has arity 3, in
order to unravel the recursion.
Note [Things to watch]
~~~~~~~~~~~~~~~~~~~~~~
* { y = I# 3; x = y `cast` co; ...case (x `cast` co) of ... }
Assume x is exported, so not inlined unconditionally.
Then we want x to inline unconditionally; no reason for it
not to, and doing so avoids an indirection.
* { x = I# 3; ....f x.... }
Make sure that x does not inline unconditionally!
Lest we get extra allocation.
Note [Inlining an InlineRule]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
An InlineRule is used for
(a) programmer INLINE pragmas
(b) inlinings from worker/wrapper
For (a) the RHS may be large, and our contract is that we *only* inline
when the function is applied to all the arguments on the LHS of the
source-code defn. (The uf_arity in the rule.)
However for worker/wrapper it may be worth inlining even if the
arity is not satisfied (as we do in the CoreUnfolding case) so we don't
require saturation.
Note [Nested functions]
~~~~~~~~~~~~~~~~~~~~~~~
If a function has a nested defn we also record some-benefit, on the
grounds that we are often able to eliminate the binding, and hence the
allocation, for the function altogether; this is good for join points.
But this only makes sense for *functions*; inlining a constructor
doesn't help allocation unless the result is scrutinised. UNLESS the
constructor occurs just once, albeit possibly in multiple case
branches. Then inlining it doesn't increase allocation, but it does
increase the chance that the constructor won't be allocated at all in
the branches that don't use it.
Note [Cast then apply]
~~~~~~~~~~~~~~~~~~~~~~
Consider
myIndex = __inline_me ( (/\a. <blah>) |> co )
co :: (forall a. a -> a) ~ (forall a. T a)
... /\a.\x. case ((myIndex a) |> sym co) x of { ... } ...
We need to inline myIndex to unravel this; but the actual call (myIndex a) has
no value arguments. The ValAppCtxt gives it enough incentive to inline.
Note [Inlining in ArgCtxt]
~~~~~~~~~~~~~~~~~~~~~~~~~~
The condition (arity > 0) here is very important, because otherwise
we end up inlining top-level stuff into useless places; eg
x = I# 3#
f = \y. g x
This can make a very big difference: it adds 16% to nofib 'integer' allocs,
and 20% to 'power'.
At one stage I replaced this condition by 'True' (leading to the above
slowdown). The motivation was test eyeball/inline1.hs; but that seems
to work ok now.
NOTE: arguably, we should inline in ArgCtxt only if the result of the
call is at least CONLIKE. At least for the cases where we use ArgCtxt
for the RHS of a 'let', we only profit from the inlining if we get a
CONLIKE thing (modulo lets).
Note [Lone variables] See also Note [Interaction of exprIsCheap and lone variables]
~~~~~~~~~~~~~~~~~~~~~ which appears below
The "lone-variable" case is important. I spent ages messing about
with unsatisfactory variants, but this is nice. The idea is that if a
variable appears all alone
as an arg of a lazy fn, or rhs    BoringCtxt
as scrutinee of a case CaseCtxt
as arg of a fn ArgCtxt
AND
it is bound to a cheap expression
then we should not inline it (unless there is some other reason,
e.g. it is the sole occurrence). That is what is happening at
the use of 'lone_variable' in 'interesting_saturated_call'.
Why? At least in the case-scrutinee situation, turning
let x = (a,b) in case x of y -> ...
into
let x = (a,b) in case (a,b) of y -> ...
and thence to
let x = (a,b) in let y = (a,b) in ...
is bad if the binding for x will remain.
Another example: I discovered that strings
were getting inlined straight back into applications of 'error'
because the latter is strict.
s = "foo"
f = \x -> ...(error s)...
Fundamentally such contexts should not encourage inlining, because the
context can ``see'' the unfolding of the variable (e.g. case or a
RULE) so there's no gain if the thing is bound to a value.
However, watch out:
* Consider this:
foo = _inline_ (\n. [n])
bar = _inline_ (foo 20)
baz = \n. case bar of { (m:_) -> m + n }
Here we really want to inline 'bar' so that we can inline 'foo'
and the whole thing unravels as it should obviously do. This is
important: in the NDP project, 'bar' generates a closure data
structure rather than a list.
So the non-inlining of lone_variables should only apply if the
unfolding is regarded as cheap; because that is when exprIsConApp_maybe
looks through the unfolding. Hence the "&& is_cheap" in the
InlineRule branch.
* Even a type application or coercion isn't a lone variable.
Consider
case $fMonadST @ RealWorld of { :DMonad a b c -> c }
We had better inline that sucker! The case won't see through it.
For now, I'm treating a variable applied to types
in a *lazy* context "lone". The motivating example was
f = /\a. \x. BIG
g = /\a. \y. h (f a)
There's no advantage in inlining f here, and perhaps
a significant disadvantage. Hence some_val_args in the Stop case
Note [Interaction of exprIsCheap and lone variables]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The lone-variable test says "don't inline if a case expression
scrutinises a lone variable whose unfolding is cheap". It's very
important that, under these circumstances, exprIsConApp_maybe
can spot a constructor application. So, for example, we don't
consider
let x = e in (x,x)
to be cheap, and that's good because exprIsConApp_maybe doesn't
think that expression is a constructor application.
I used to test is_value rather than is_cheap, which was utterly
wrong, because the above expression responds True to exprIsHNF.
This kind of thing can occur if you have
foo = let x = e in (x,x)
which Roman did.
\begin{code}
computeDiscount :: Int -> [Int] -> Int -> [ArgSummary] -> CallCtxt -> Int
computeDiscount n_vals_wanted arg_discounts res_discount arg_infos cont_info
  = 1           -- Discount of 1 because the result replaces the call,
                --   so we count 1 for the function itself
    + length (take n_vals_wanted arg_infos)
                -- Discount of 1 for each value arg supplied (up to the
                --   number wanted), because the result replaces the call
    + round (opt_UF_KeenessFactor *
             fromIntegral (arg_discount + res_discount'))
where
arg_discount = sum (zipWith mk_arg_discount arg_discounts arg_infos)
mk_arg_discount _ TrivArg = 0
mk_arg_discount _ NonTrivArg = 1
mk_arg_discount discount ValueArg = discount
res_discount' = case cont_info of
BoringCtxt -> 0
CaseCtxt -> res_discount
_other -> 4 `min` res_discount
\end{code}
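A worked instance of this calculation (illustrative only, using the default
opt_UF_KeenessFactor of 1.5 from Note [Discounts and thresholds]): suppose the
unfolding wants 2 value args with discounts [6,0], the call supplies a ValueArg
and a TrivArg, the result discount is 2, and the context is CaseCtxt. Then
    arg_discount  = 6 + 0 = 6      -- only the ValueArg earns its discount
    res_discount' = 2              -- CaseCtxt keeps the full result discount
    discount      = 1 + 2 + round (1.5 * (6 + 2)) = 15
so in callSiteInline the discounted size would be ug_size - 15.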
%************************************************************************
%* *
Interesting arguments
%* *
%************************************************************************
Note [Interesting arguments]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
An argument is interesting if it deserves a discount for unfoldings
with a discount in that argument position. The idea is to avoid
unfolding a function that is applied only to variables that have no
unfolding (i.e. they are probably lambda bound):  f x y z
There is little point in inlining f here.
Generally, *values* (like (C a b) and (\x.e)) deserve discounts. But
we must look through lets, eg (let x = e in C a b), because the let will
float, exposing the value, if we inline. That makes it different to
exprIsHNF.
Before 2009 we said it was interesting if the argument had *any* structure
at all; i.e. (hasSomeUnfolding v). But that does too much inlining; see Trac #3016.
But we don't regard (f x y) as interesting, unless f is unsaturated.
If it's saturated and f hasn't inlined, then it's probably not going
to now!
Note [Conlike is interesting]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Consider
f d = ...((*) d x y)...
... f (df d')...
where df is conlike. Then we'd really like to inline 'f' so that the
rule for (*) (df d) can fire. To do this
a) we give a discount for being an argument of a class-op (eg (*) d)
b) we say that a conlike argument (eg (df d)) is interesting
\begin{code}
data ArgSummary = TrivArg       -- Nothing interesting
                | NonTrivArg    -- Has some structure, but reveals no value
                | ValueArg      -- A value: constructor application, lambda,
                                --   literal, etc.
interestingArg :: CoreExpr -> ArgSummary
interestingArg e = go e 0
where
go (Lit {}) _ = ValueArg
go (Var v) n
| isConLikeId v = ValueArg
| idArity v > n = ValueArg
| n > 0 = NonTrivArg
| conlike_unfolding = ValueArg
| otherwise = TrivArg
where
conlike_unfolding = isConLikeUnfolding (idUnfolding v)
go (Type _) _ = TrivArg
go (App fn (Type _)) n = go fn n
go (App fn _) n = go fn (n+1)
go (Note _ a) n = go a n
go (Cast e _) n = go e n
go (Lam v e) n
| isTyCoVar v = go e n
| n>0 = go e (n-1)
| otherwise = ValueArg
go (Let _ e) n = case go e n of { ValueArg -> ValueArg; _ -> NonTrivArg }
go (Case {}) _ = NonTrivArg
nonTriv :: ArgSummary -> Bool
nonTriv TrivArg = False
nonTriv _ = True
\end{code}
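Some illustrative classifications (informal; assume f is a lambda-bound
variable with arity 0 and no unfolding):
    interestingArg (Lit 3)      -->  ValueArg
    interestingArg (\x -> e)    -->  ValueArg      -- a value lambda
    interestingArg (f x)        -->  NonTrivArg    -- applied, but reveals no value
    interestingArg f            -->  TrivArg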
%************************************************************************
%* *
exprIsConApp_maybe
%* *
%************************************************************************
Note [exprIsConApp_maybe]
~~~~~~~~~~~~~~~~~~~~~~~~~
exprIsConApp_maybe is a very important function. There are two principal
uses:
* case e of { .... }
* cls_op e, where cls_op is a class operation
In both cases you want to know if e is of form (C e1..en) where C is
a data constructor.
However e might not *look* as if it has that form: the constructor
application may be hidden behind a cast or a note, or e may be a dfun
application or a variable with an expandable unfolding, so we have to dig.
\begin{code}
exprIsConApp_maybe :: IdUnfoldingFun -> CoreExpr -> Maybe (DataCon, [Type], [CoreExpr])
exprIsConApp_maybe id_unf (Note note expr)
| notSccNote note
= exprIsConApp_maybe id_unf expr
exprIsConApp_maybe id_unf (Cast expr co)
=
case exprIsConApp_maybe id_unf expr of {
Nothing -> Nothing ;
Just (dc, _dc_univ_args, dc_args) ->
let (_from_ty, to_ty) = coercionKind co
dc_tc = dataConTyCon dc
in
case splitTyConApp_maybe to_ty of {
Nothing -> Nothing ;
Just (to_tc, to_tc_arg_tys)
| dc_tc /= to_tc -> Nothing
| otherwise ->
let
tc_arity = tyConArity dc_tc
dc_univ_tyvars = dataConUnivTyVars dc
dc_ex_tyvars = dataConExTyVars dc
arg_tys = dataConRepArgTys dc
dc_eqs :: [(Type,Type)]
dc_eqs = [(mkTyVarTy tv, ty) | (tv,ty) <- dataConEqSpec dc] ++
[getEqPredTys eq_pred | eq_pred <- dataConEqTheta dc]
(ex_args, rest1) = splitAtList dc_ex_tyvars dc_args
(co_args, val_args) = splitAtList dc_eqs rest1
gammas = decomposeCo tc_arity co
theta = zipOpenTvSubst (dc_univ_tyvars ++ dc_ex_tyvars)
(gammas ++ stripTypeArgs ex_args)
cast_co (ty1, ty2) (Type co)
= Type $ mkSymCoercion (substTy theta ty1)
`mkTransCoercion` co
`mkTransCoercion` (substTy theta ty2)
cast_co _ other_arg = pprPanic "cast_co" (ppr other_arg)
new_co_args = zipWith cast_co dc_eqs co_args
new_val_args = zipWith cast_arg arg_tys val_args
cast_arg arg_ty arg = mkCoerce (substTy theta arg_ty) arg
in
#ifdef DEBUG
let dump_doc = vcat [ppr dc, ppr dc_univ_tyvars, ppr dc_ex_tyvars,
ppr arg_tys, ppr dc_args, ppr _dc_univ_args,
ppr ex_args, ppr val_args]
in
ASSERT2( coreEqType _from_ty (mkTyConApp dc_tc _dc_univ_args), dump_doc )
ASSERT2( all isTypeArg (ex_args ++ co_args), dump_doc )
ASSERT2( equalLength val_args arg_tys, dump_doc )
#endif
Just (dc, to_tc_arg_tys, ex_args ++ new_co_args ++ new_val_args)
}}
exprIsConApp_maybe id_unf expr
= analyse expr []
where
analyse (App fun arg) args = analyse fun (arg:args)
analyse fun@(Lam {}) args = beta fun [] args
analyse (Var fun) args
| Just con <- isDataConWorkId_maybe fun
, count isValArg args == idArity fun
, let (univ_ty_args, rest_args) = splitAtList (dataConUnivTyVars con) args
= Just (con, stripTypeArgs univ_ty_args, rest_args)
| DFunUnfolding dfun_nargs con ops <- unfolding
, let sat = length args == dfun_nargs
in if sat then True else
pprTrace "Unsaturated dfun" (ppr fun <+> int dfun_nargs $$ ppr args) False
, let (dfun_tvs, _cls, dfun_res_tys) = tcSplitDFunTy (idType fun)
subst = zipOpenTvSubst dfun_tvs (stripTypeArgs (takeList dfun_tvs args))
= Just (con, substTys subst dfun_res_tys,
[mkApps op args | op <- ops])
| Just rhs <- expandUnfolding_maybe unfolding
=
analyse rhs args
where
unfolding = id_unf fun
analyse _ _ = Nothing
beta (Lam v body) pairs (arg : args)
| isTypeArg arg
= beta body ((v,arg):pairs) args
beta (Lam {}) _ _
= Nothing
beta fun pairs args
= analyse (substExpr (text "subst-expr-is-con-app") subst fun) args
where
subst = mkOpenSubst (mkInScopeSet (exprFreeVars fun)) pairs
stripTypeArgs :: [CoreExpr] -> [Type]
stripTypeArgs args = ASSERT2( all isTypeArg args, ppr args )
[ty | Type ty <- args]
\end{code}
Note [Unfolding DFuns]
~~~~~~~~~~~~~~~~~~~~~~
DFuns look like
df :: forall a b. (Eq a, Eq b) -> Eq (a,b)
df a b d_a d_b = MkEqD (a,b) ($c1 a b d_a d_b)
($c2 a b d_a d_b)
So to split it up we just need to apply the ops $c1, $c2 etc
to the very same args as the dfun. It takes a little more work
to compute the type arguments to the dictionary constructor.
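So, for the df above, if it were applied (saturated) at Int and Bool with
dictionaries d1 and d2 (hypothetical names, purely for illustration),
exprIsConApp_maybe would return roughly
    Just (MkEqD, [(Int,Bool)], [$c1 Int Bool d1 d2, $c2 Int Bool d1 d2])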
Note [DFun arity check]
~~~~~~~~~~~~~~~~~~~~~~~
Here we check that the total number of supplied arguments (including
type args) matches what the dfun is expecting. This may be *less*
than the ordinary arity of the dfun: see Note [DFun unfoldings] in CoreSyn