{-# LANGUAGE ViewPatterns #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE FlexibleContexts #-}

{-# OPTIONS_GHC -fno-warn-warnings-deprecations #-}

-- This module implements generalized code motion for assignments to
-- local registers, inlining and sinking when possible.  It also does
-- some amount of rewriting for stores to register slots, which are
-- effectively equivalent to local registers.
module CmmRewriteAssignments
  ( rewriteAssignments
  ) where

import StgCmmUtils -- XXX layering violation
import Cmm
import CmmUtils
import CmmOpt
import DynFlags
import UniqSupply
import UniqFM
import Unique
import BlockId

import Hoopl
import Compiler.Hoopl ((<*>), mkMiddle, mkLast)
import Data.Maybe
import Control.Monad
import Prelude hiding (succ, zip)

----------------------------------------------------------------
--- Main function

rewriteAssignments :: DynFlags -> CmmGraph -> UniqSM CmmGraph
rewriteAssignments dflags g = do
  -- Because we need to act on forwards and backwards information, we
  -- first perform usage analysis and bake this information into the
  -- graph (backwards transform), and then do a forwards transform
  -- to actually perform inlining and sinking.
  g'  <- annotateUsage dflags g
  g'' <- liftM fst $ dataflowPassFwd g' [(g_entry g, fact_bot assignmentLattice)] $
                     analRewFwd assignmentLattice
                                (assignmentTransfer dflags)
                                (assignmentRewrite dflags `thenFwdRw` machOpFoldRewrite dflags)
  return (modifyGraph eraseRegUsage g'')

----------------------------------------------------------------
--- Usage information

-- We decorate all register assignments with approximate usage
-- information, that is, the maximum number of times the register is
-- referenced while it is live along all outgoing control paths.
-- This analysis provides an upper bound for usage, so if a
-- register is never referenced, we can remove it, as that assignment is
-- dead.
--
-- This analysis is very similar to liveness analysis; we just keep a
-- little extra info.  (Maybe we should move it to CmmLive, and subsume
-- the old liveness analysis.)
--
-- There are a few subtleties here:
--
--  - If a register goes dead, and then becomes live again, the usages
--    of the disjoint live range don't count towards the original range.
--
--          a = 1; // used once
--          b = a;
--          a = 2; // used once
--          c = a;
--
--  - A register may be used multiple times, but these all reside in
--    different control paths, such that any given execution only uses
--    it once.  In that case, the usage count may still be 1.
--
--          a = 1; // used once
--          if (b) {
--              c = a + 3;
--          } else {
--              c = a + 1;
--          }
--
--    This policy corresponds to an inlining strategy that does not
--    duplicate computation but may increase binary size.
--
--  - If we naively implement a usage count, we have a counting to
--    infinity problem across joins.  Furthermore, knowing that
--    something is used 2 or more times in one runtime execution isn't
--    particularly useful for optimizations (inlining may be beneficial,
--    but there's no way of knowing that without register pressure
--    information.)
--
--          while (...) {
--              // first iteration, b used once
--              // second iteration, b used twice
--              // third iteration ...
--              a = b;
--          }
--          // b used zero times
--
--    There is an orthogonal question, which is that for every runtime
--    execution, the register may be used only once, but if we inline it
--    in every conditional path, the binary size might increase a lot.
--    But tracking this information would be tricky, because it violates
--    the finite lattice restriction Hoopl requires for termination;
--    we'd thus need to supply an alternate proof, which is probably
--    something we should defer until we actually have an optimization
--    that would take advantage of this.  (This might also interact
--    strangely with liveness information.)
--
--          a = ...;
--          // a is used one time, but in X different paths
--          case (b) of
--              1 -> ... a ...
--              2 -> ... a ...
--              3 -> ... a ...
--              ...
--
--  - Memory stores to local register slots (CmmStore (CmmStackSlot
--    (LocalReg _) 0) _) have similar behavior to local registers,
--    in that these locations are all disjoint from each other.  Thus,
--    we attempt to inline them too.  Note that because these are only
--    generated as part of the spilling process, most of the time this
--    will refer to a local register and the assignment will immediately
--    die on the subsequent call.  However, if we manage to replace that
--    local register with a memory location, it means that we've managed
--    to preserve a value on the stack without having to move it to
--    another memory location again!  We collect usage information just
--    to be safe in case extra computation is involved.

data RegUsage = SingleUse | ManyUse
    deriving (Ord, Eq, Show)
-- Absence in map = ZeroUse

{-
-- minBound is bottom, maxBound is top, least-upper-bound is max
-- ToDo: Put this in Hoopl.  Note that this isn't as useful as I
-- originally hoped, because you usually want to leave out the bottom
-- element when you have things like this put in maps.  Maybe f is
-- useful on its own as a combining function.
boundedOrdLattice :: (Bounded a, Ord a) => String -> DataflowLattice a
boundedOrdLattice n = DataflowLattice n minBound f
    where f _ (OldFact x) (NewFact y)
            | x >= y    = (NoChange,   x)
            | otherwise = (SomeChange, y)
-}

-- Custom node type we'll rewrite to.  CmmAssign nodes to local
-- registers are replaced with AssignLocal nodes.
data WithRegUsage n e x where
    -- Plain will not contain CmmAssign nodes immediately after
    -- transformation, but as we rewrite assignments, we may have
    -- assignments here: these are assignments that should not be
    -- rewritten!
    Plain       :: n e x -> WithRegUsage n e x
    AssignLocal :: LocalReg -> CmmExpr -> RegUsage -> WithRegUsage n O O

instance UserOfRegs LocalReg (n e x) => UserOfRegs LocalReg (WithRegUsage n e x) where
    foldRegsUsed dflags f z (Plain n) = foldRegsUsed dflags f z n
    foldRegsUsed dflags f z (AssignLocal _ e _) = foldRegsUsed dflags f z e

instance DefinerOfRegs LocalReg (n e x) => DefinerOfRegs LocalReg (WithRegUsage n e x) where
    foldRegsDefd dflags f z (Plain n) = foldRegsDefd dflags f z n
    foldRegsDefd dflags f z (AssignLocal r _ _) = foldRegsDefd dflags f z r

instance NonLocal n => NonLocal (WithRegUsage n) where
    entryLabel (Plain n) = entryLabel n
    successors (Plain n) = successors n

liftRegUsage :: Graph n e x -> Graph (WithRegUsage n) e x
liftRegUsage = mapGraph Plain

eraseRegUsage :: Graph (WithRegUsage CmmNode) e x -> Graph CmmNode e x
eraseRegUsage = mapGraph f
    where f :: WithRegUsage CmmNode e x -> CmmNode e x
          f (AssignLocal l e _) = CmmAssign (CmmLocal l) e
          f (Plain n) = n

type UsageMap = UniqFM RegUsage

usageLattice :: DataflowLattice UsageMap
usageLattice = DataflowLattice "usage counts for registers" emptyUFM (joinUFM f)
    where f _ (OldFact x) (NewFact y)
            | x >= y    = (NoChange,   x)
            | otherwise = (SomeChange, y)

-- We reuse the names 'gen' and 'kill', although we're doing something
-- slightly different from the Dragon Book
usageTransfer :: DynFlags -> BwdTransfer (WithRegUsage CmmNode) UsageMap
usageTransfer dflags = mkBTransfer3 first middle last
    where first _ f = f
          middle :: WithRegUsage CmmNode O O -> UsageMap -> UsageMap
          middle n f = gen_kill n f
          last :: WithRegUsage CmmNode O C -> FactBase UsageMap -> UsageMap
          -- Checking for CmmCall/CmmForeignCall is unnecessary, because
          -- spills/reloads have already occurred by the time we do this
          -- analysis.
          -- XXX Deprecated warning is puzzling: what label are we
          -- supposed to use?
          -- ToDo: With a bit more cleverness here, we can avoid
          -- disappointment and heartbreak associated with the inability
          -- to inline into CmmCall and CmmForeignCall by
          -- over-estimating the usage to be ManyUse.
          last n f = gen_kill n (joinOutFacts usageLattice n f)
          gen_kill :: WithRegUsage CmmNode e x -> UsageMap -> UsageMap
          gen_kill a = gen a . kill a
          gen :: WithRegUsage CmmNode e x -> UsageMap -> UsageMap
          gen  a f = foldLocalRegsUsed dflags increaseUsage f a
          kill :: WithRegUsage CmmNode e x -> UsageMap -> UsageMap
          kill a f = foldLocalRegsDefd dflags delFromUFM f a
          increaseUsage f r = addToUFM_C combine f r SingleUse
            where combine _ _ = ManyUse

usageRewrite :: BwdRewrite UniqSM (WithRegUsage CmmNode) UsageMap
usageRewrite = mkBRewrite3 first middle last
    where first  _ _ = return Nothing
          middle :: Monad m => WithRegUsage CmmNode O O -> UsageMap -> m (Maybe (Graph (WithRegUsage CmmNode) O O))
          middle (Plain (CmmAssign (CmmLocal l) e)) f
                     = return .
                       Just $ case lookupUFM f l of
                                Nothing    -> emptyGraph
                                Just usage -> mkMiddle (AssignLocal l e usage)
          middle _ _ = return Nothing
          last   _ _ = return Nothing

type CmmGraphWithRegUsage = GenCmmGraph (WithRegUsage CmmNode)
annotateUsage :: DynFlags -> CmmGraph -> UniqSM (CmmGraphWithRegUsage)
annotateUsage dflags vanilla_g =
    let g = modifyGraph liftRegUsage vanilla_g
    in liftM fst $ dataflowPassBwd g [(g_entry g, fact_bot usageLattice)] $
                   analRewBwd usageLattice (usageTransfer dflags) usageRewrite

----------------------------------------------------------------
--- Assignment tracking

-- The idea is to maintain a map of local registers to expressions,
-- such that the value of that register is the same as the value of that
-- expression at any given time.  We can then do several things,
-- as described by Assignment.

-- Assignment describes the various optimizations that are valid
-- at a given point in the program.
data Assignment =
-- This assignment can always be inlined.  It is cheap or single-use.
                  AlwaysInline CmmExpr
-- This assignment should be sunk down to its first use.  (This will
-- increase code size if the register is used in multiple control flow
-- paths, but won't increase execution time, and the reduction of
-- register pressure is worth it, I think.)
                | AlwaysSink CmmExpr
-- We cannot safely optimize occurrences of this local register. (This
-- corresponds to top in the lattice structure.)
                | NeverOptimize

-- Extract the expression that is being assigned to
xassign :: Assignment -> Maybe CmmExpr
xassign (AlwaysInline e) = Just e
xassign (AlwaysSink e)   = Just e
xassign NeverOptimize    = Nothing

-- Extracts the expression, but only if they're the same constructor
xassign2 :: (Assignment, Assignment) -> Maybe (CmmExpr, CmmExpr)
xassign2 (AlwaysInline e, AlwaysInline e') = Just (e, e')
xassign2 (AlwaysSink e, AlwaysSink e')     = Just (e, e')
xassign2 _ = Nothing

-- Note: We'd like to make decisions about "not optimizing" as soon as
-- possible, because this will make running the transfer function more
-- efficient.
type AssignmentMap = UniqFM Assignment

assignmentLattice :: DataflowLattice AssignmentMap
assignmentLattice = DataflowLattice "assignments for registers" emptyUFM (joinUFM add)
    where add _ (OldFact old) (NewFact new)
            = case (old, new) of
                (NeverOptimize, _) -> (NoChange,   NeverOptimize)
                (_, NeverOptimize) -> (SomeChange, NeverOptimize)
                (xassign2 -> Just (e, e'))
                    | e == e'   -> (NoChange, old)
                    | otherwise -> (SomeChange, NeverOptimize)
                _ -> (SomeChange, NeverOptimize)

-- Deletes sinks from assignment map, because /this/ is the place
-- where it will be sunk to.
deleteSinks :: UserOfRegs LocalReg n => DynFlags -> n -> AssignmentMap -> AssignmentMap
deleteSinks dflags n m = foldLocalRegsUsed dflags (adjustUFM f) m n
  where f (AlwaysSink _) = NeverOptimize
        f old = old

-- Invalidates any expressions that use a register.
invalidateUsersOf :: CmmReg -> AssignmentMap -> AssignmentMap
-- foldUFM_Directly :: (Unique -> elt -> a -> a) -> a -> UniqFM elt -> a
invalidateUsersOf reg m = foldUFM_Directly f m m -- [foldUFM performance]
    where f u (xassign -> Just e) m | reg `regUsedIn` e = addToUFM_Directly m u NeverOptimize
          f _ _ m = m

{- This requires the entire spine of the map to be continually rebuilt,
 - which causes crazy memory usage!
invalidateUsersOf reg = mapUFM (invalidateUsers' reg)
  where invalidateUsers' reg (xassign -> Just e) | reg `regUsedIn` e = NeverOptimize
        invalidateUsers' _ old = old
-}

-- Note [foldUFM performance]
-- These calls to fold UFM no longer leak memory, but they do cause
-- pretty killer amounts of allocation.  So they'll be something to
-- optimize; we need an algorithmic change to prevent us from having to
-- traverse the /entire/ map continually.

middleAssignment :: DynFlags -> WithRegUsage CmmNode O O -> AssignmentMap -> AssignmentMap

-- Algorithm for annotated assignments:
--  1. Delete any sinking assignments that were used by this instruction
--  2. Add the assignment to our list of valid local assignments with
--     the correct optimization policy.
--  3. Look for all assignments that reference that register and
--     invalidate them.
middleAssignment dflags n@(AssignLocal r e usage) assign
    = invalidateUsersOf (CmmLocal r) . add . deleteSinks dflags n $ assign
      where add m = addToUFM m r
                  $ case usage of
                        SingleUse -> AlwaysInline e
                        ManyUse   -> decide e
            decide CmmLit{}       = AlwaysInline e
            decide CmmReg{}       = AlwaysInline e
            decide CmmLoad{}      = AlwaysSink e
            decide CmmStackSlot{} = AlwaysSink e
            decide CmmMachOp{}    = AlwaysSink e
            -- We'll always inline simple operations on the global
            -- registers, to reduce register pressure: Sp - 4 or Hp - 8
            -- EZY: Justify this optimization more carefully.
            decide CmmRegOff{}    = AlwaysInline e

-- Algorithm for unannotated assignments of global registers:
-- 1. Delete any sinking assignments that were used by this instruction
-- 2. Look for all assignments that reference this register and
--    invalidate them.
middleAssignment dflags (Plain n@(CmmAssign reg@(CmmGlobal _) _)) assign
    = invalidateUsersOf reg . deleteSinks dflags n $ assign

-- Algorithm for unannotated assignments of *local* registers: do
-- nothing (it's a reload, so no state should have changed)
middleAssignment _ (Plain (CmmAssign (CmmLocal _) _)) assign = assign

-- Algorithm for stores:
--  1. Delete any sinking assignments that were used by this instruction
--  2. Look for all assignments that load from memory locations that
--     were clobbered by this store and invalidate them.
middleAssignment dflags (Plain n@(CmmStore lhs rhs)) assign
    = let m = deleteSinks dflags n assign
      in foldUFM_Directly f m m -- [foldUFM performance]
      where f u (xassign -> Just x) m | clobbers dflags (lhs, rhs) (u, x) = addToUFM_Directly m u NeverOptimize
            f _ _ m = m
{- Also leaky
    = mapUFM_Directly p . deleteSinks n $ assign
      -- ToDo: There's a missed opportunity here: even if a memory
      -- access we're attempting to sink gets clobbered at some
      -- location, it's still /better/ to sink it to right before the
      -- point where it gets clobbered.  How might we do this?
      -- Unfortunately, it's too late to change the assignment...
      where p r (xassign -> Just x) | (lhs, rhs) `clobbers` (r, x) = NeverOptimize
            p _ old = old
-}

-- Assumption: Unsafe foreign calls don't clobber memory
-- Since foreign calls clobber caller saved registers, we need to
-- invalidate any assignments that reference those global registers.
-- This is kind of expensive. (One way to optimize this might be to
-- store extra information about expressions that allow this and other
-- checks to be done cheaply.)
middleAssignment dflags (Plain n@(CmmUnsafeForeignCall{})) assign
    = deleteCallerSaves (foldLocalRegsDefd dflags (\m r -> addToUFM m r NeverOptimize) (deleteSinks dflags n assign) n)
      where deleteCallerSaves m = foldUFM_Directly f m m
            f u (xassign -> Just x) m | wrapRecExpf g x False = addToUFM_Directly m u NeverOptimize
            f _ _ m = m
            g (CmmReg (CmmGlobal r)) _      | callerSaves platform r = True
            g (CmmRegOff (CmmGlobal r) _) _ | callerSaves platform r = True
            g _ b = b
            platform = targetPlatform dflags

middleAssignment _ (Plain (CmmComment {})) assign = assign

-- Assumptions:
--  * Writes using Hp do not overlap with any other memory locations
--    (An important invariant being relied on here is that we only ever
--    use Hp to allocate values on the heap, which appears to be the
--    case given hpReg usage, and that our heap writing code doesn't
--    do anything stupid like overlapping writes.)
--  * Stack slots do not overlap with any other memory locations
--  * Stack slots for different areas do not overlap
--  * Stack slots within the same area and different offsets may
--    overlap; we need to do a size check (see 'overlaps').
--  * Register slots only overlap with themselves.  (But this shouldn't
--    happen in practice, because we'll fail to inline a reload across
--    the next spill.)
--  * Non stack-slot stores always conflict with each other.  (This is
--    not always the case; we could probably do something special for Hp)
clobbers :: DynFlags
         -> (CmmExpr, CmmExpr) -- (lhs, rhs) of clobbering CmmStore
         -> (Unique,  CmmExpr) -- (register, expression) that may be clobbered
         -> Bool
clobbers _ (CmmRegOff (CmmGlobal Hp) _, _) (_, _) = False
clobbers _ (CmmReg (CmmGlobal Hp), _) (_, _) = False
-- ToDo: Also catch MachOp case
clobbers _ (ss@CmmStackSlot{}, CmmReg (CmmLocal r)) (u, CmmLoad (ss'@CmmStackSlot{}) _)
    | getUnique r == u, ss == ss' = False -- No-op on the stack slot (XXX: Do we need this special case?)
clobbers dflags (CmmStackSlot a o, rhs) (_, expr) = f expr
    where f (CmmLoad (CmmStackSlot a' o') t)
            = (a, o, widthInBytes (cmmExprWidth dflags rhs)) `overlaps` (a', o', widthInBytes (typeWidth t))
          f (CmmLoad e _)    = containsStackSlot e
          f (CmmMachOp _ es) = or (map f es)
          f _                = False
          -- Maybe there's an invariant broken if this actually ever
          -- returns True
          containsStackSlot (CmmLoad{})      = True -- load of a load, all bets off
          containsStackSlot (CmmMachOp _ es) = or (map containsStackSlot es)
          containsStackSlot (CmmStackSlot{}) = True
          containsStackSlot _                = False
clobbers _ _ (_, e) = f e
    where f (CmmLoad (CmmStackSlot _ _) _) = False
          f (CmmLoad{})                    = True -- conservative
          f (CmmMachOp _ es)               = or (map f es)
          f _                              = False

-- Check for memory overlapping.
-- Diagram:
--      4            8     12
--      s    -w-     o
--      [   I32  ]
--      [       F64         ]
--      s'      -w'-        o'
type CallSubArea = (Area, Int, Int) -- area, offset, width
overlaps :: CallSubArea -> CallSubArea -> Bool
overlaps (a, _, _) (a', _, _) | a /= a' = False
overlaps (_, o, w) (_, o', w') =
    let s  = o - w
        s' = o' - w'
    in (s' < o) && (s < o) -- Not LTE, because [ I32 ][ I32 ] is OK

lastAssignment :: DynFlags -> WithRegUsage CmmNode O C -> AssignmentMap -> [(Label, AssignmentMap)]
lastAssignment _ (Plain (CmmCall _ (Just k) _ _ _ _)) assign = [(k, invalidateVolatile k assign)]
lastAssignment _ (Plain (CmmForeignCall {succ=k}))    assign = [(k, invalidateVolatile k assign)]
lastAssignment dflags l assign = map (\id -> (id, deleteSinks dflags l assign)) $ successors l

-- Invalidates any expressions that have volatile contents: essentially,
-- all terminals are volatile except for literals and loads of stack slots
-- that do not correspond to the call area for 'k' (the current call
-- area is volatile because overflow return parameters may be written
-- there.)
-- Note: mapUFM could be expensive, but hopefully block boundaries
-- aren't too common.  If it is a problem, replace with something more
-- clever.
invalidateVolatile :: BlockId -> AssignmentMap -> AssignmentMap
invalidateVolatile k m = mapUFM p m
  where p (AlwaysInline e) = if exp e then AlwaysInline e else NeverOptimize
            where exp CmmLit{} = True
                  exp (CmmLoad (CmmStackSlot (Young k') _) _)
                    | k' == k = False
                  exp (CmmLoad (CmmStackSlot _ _) _) = True
                  exp (CmmMachOp _ es) = and (map exp es)
                  exp _ = False
        p _ = NeverOptimize -- probably shouldn't happen with AlwaysSink

assignmentTransfer :: DynFlags
                   -> FwdTransfer (WithRegUsage CmmNode) AssignmentMap
assignmentTransfer dflags
    = mkFTransfer3 (flip const)
                   (middleAssignment dflags)
                   ((mkFactBase assignmentLattice .) . lastAssignment dflags)

-- Note [Soundness of inlining]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- In the Hoopl paper, the soundness condition on rewrite functions is
-- described as follows:
--
--      "If it replaces a node n by a replacement graph g, then g must
--      be observationally equivalent to n under the assumptions
--      expressed by the incoming dataflow fact f.  Moreover, analysis of
--      g must produce output fact(s) that are at least as informative
--      as the fact(s) produced by applying the transfer function to n."
--
-- We consider the second condition in more detail here.  It says that
-- given the rewrite R(n, f) = g, for any incoming fact f' consistent
-- with f (f' >= f), running the transfer function gives T(f', n) <= T(f', g).
-- For inlining this is not necessarily the case:
--
--  n = "x = a + 2"
--  f = f' = {a = y}
--  g = "x = y + 2"
--  T(f', n) = {x = a + 2, a = y}
--  T(f', g) = {x = y + 2, a = y}
--
-- y + 2 and a + 2 are not obviously comparable, and a naive
-- implementation of the lattice would say they are incomparable.
-- At best, this means we may be over-conservative, at worst, it means
-- we may not terminate.
--
-- However, in the original Lerner-Grove-Chambers paper, soundness and
-- termination are separated, and only equivalence of facts is required
-- for soundness.  Monotonicity of the transfer function is not required
-- for termination (as the calculation of least-upper-bound prevents
-- this from being a problem), but it means we won't necessarily find
-- the least-fixed point.

-- Note [Coherency of annotations]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- Is it possible for our usage annotations to become invalid after we
-- start performing transformations?
-- As the usage info only provides
-- an upper bound, we only need to consider cases where the usages of
-- a register may increase due to transformations--e.g. any reference
-- to a local register in an AlwaysInline or AlwaysSink instruction, whose
-- originating assignment was single use (we don't care about the
-- many use case, because it is the top of the lattice).  But such a
-- case is not possible, because we always inline any single use
-- register.  QED.
--
-- TODO: A useful lint option would be to check this invariant that
-- there is never a local register in the assignment map that is
-- single-use.

-- Note [Soundness of store rewriting]
-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- Its soundness depends on the invariant that no assignment is made to
-- the local register before its store is accessed.  This is clearly
-- true with unoptimized spill-reload code, and as the store will always
-- be rewritten first (if possible), there is no chance of it being
-- propagated down before getting written (possibly with incorrect
-- values from the assignment map, due to reassignment of the local
-- register.)  This is probably not locally sound.

assignmentRewrite :: DynFlags -> FwdRewrite UniqSM (WithRegUsage CmmNode) AssignmentMap
assignmentRewrite dflags = mkFRewrite3 first middle last
    where
        first _ _ = return Nothing
        middle :: WithRegUsage CmmNode O O -> AssignmentMap -> GenCmmReplGraph (WithRegUsage CmmNode) O O
        middle (Plain m) assign = return $ rewrite assign (precompute assign m) mkMiddle m
        middle (AssignLocal l e u) assign = return $ rewriteLocal assign (precompute assign (CmmAssign (CmmLocal l) e)) l e u
        last (Plain l) assign = return $ rewrite assign (precompute assign l) mkLast l
        -- Tuple is (inline?, reloads for sinks)
        precompute :: AssignmentMap -> CmmNode O x -> (Bool, [WithRegUsage CmmNode O O])
        precompute assign n = foldLocalRegsUsed dflags f (False, []) n -- duplicates are harmless
            where f (i, l) r = case lookupUFM assign r of
                                Just (AlwaysSink e)   -> (i, (Plain (CmmAssign (CmmLocal r) e)):l)
                                Just (AlwaysInline _) -> (True, l)
                                Just NeverOptimize    -> (i, l)
                                -- This case can show up when we have
                                -- limited optimization fuel.
                                Nothing -> (i, l)
        rewrite :: AssignmentMap
                -> (Bool, [WithRegUsage CmmNode O O])
                -> (WithRegUsage CmmNode O x -> Graph (WithRegUsage CmmNode) O x)
                -> CmmNode O x
                -> Maybe (Graph (WithRegUsage CmmNode) O x)
        rewrite _ (False, []) _ _ = Nothing
        -- Note [CmmCall Inline Hack]
        -- Conservative hack: don't do any inlining on what will
        -- be translated into an OldCmm CmmCalls, since the code
        -- produced here tends to be unproblematic and I need to write
        -- lint passes to ensure that we don't put anything in the
        -- arguments that could be construed as a global register by
        -- some later translation pass.  (For example, slots will turn
        -- into dereferences of Sp).  See [Register parameter passing].
        -- ToDo: Fix this up to only bug out if all inlines were for
        -- CmmExprs with global registers (we can't use the
        -- straightforward mapExpDeep call, in this case.) ToDo: We miss
        -- an opportunity here, where all possible inlinings should
        -- instead be sunk.
        rewrite _ (True, []) _ n | not (inlinable n) = Nothing -- see [CmmCall Inline Hack]
        rewrite assign (i, xs) mk n = Just $ mkMiddles xs <*> mk (Plain (inline i assign n))

        rewriteLocal :: AssignmentMap
                     -> (Bool, [WithRegUsage CmmNode O O])
                     -> LocalReg -> CmmExpr -> RegUsage
                     -> Maybe (Graph (WithRegUsage CmmNode) O O)
        rewriteLocal _ (False, []) _ _ _ = Nothing
        rewriteLocal assign (i, xs) l e u = Just $ mkMiddles xs <*> mkMiddle n'
            where n' = AssignLocal l e' u
                  e' = if i then wrapRecExp (inlineExp assign) e else e
            -- inlinable check omitted, since we can always inline into
            -- assignments.

        inline :: Bool -> AssignmentMap -> CmmNode e x -> CmmNode e x
        inline False _ n = n
        inline True  _ n | not (inlinable n) = n -- see [CmmCall Inline Hack]
        inline True  assign n = mapExpDeep (inlineExp assign) n

        inlineExp assign old@(CmmReg (CmmLocal r))
          = case lookupUFM assign r of
              Just (AlwaysInline x) -> x
              _ -> old
        inlineExp assign old@(CmmRegOff (CmmLocal r) i)
          = case lookupUFM assign r of
              Just (AlwaysInline x) ->
                case x of
                  (CmmRegOff r' i') -> CmmRegOff r' (i + i')
                  _ -> CmmMachOp (MO_Add rep) [x, CmmLit (CmmInt (fromIntegral i) rep)]
                          where rep = typeWidth (localRegType r)
              _ -> old
        -- See Note [Soundness of store rewriting]
        inlineExp _ old = old

        inlinable :: CmmNode e x -> Bool
        inlinable (CmmCall{}) = False
        inlinable (CmmForeignCall{}) = False
        inlinable (CmmUnsafeForeignCall{}) = False
        inlinable _ = True

-- Need to interleave this with inlining, because machop folding results
-- in literals, which we can inline more aggressively, and inlining
-- gives us opportunities for more folding.  However, we don't need any
-- facts to do MachOp folding.
machOpFoldRewrite :: DynFlags -> FwdRewrite UniqSM (WithRegUsage CmmNode) a
machOpFoldRewrite dflags = mkFRewrite3 first middle last
  where first _ _ = return Nothing
        middle :: WithRegUsage CmmNode O O -> a -> GenCmmReplGraph (WithRegUsage CmmNode) O O
        middle (Plain m) _ = return (fmap (mkMiddle . Plain) (foldNode m))
        middle (AssignLocal l e r) _ = return (fmap f (wrapRecExpM foldExp e))
            where f e' = mkMiddle (AssignLocal l e' r)
        last :: WithRegUsage CmmNode O C -> a -> GenCmmReplGraph (WithRegUsage CmmNode) O C
        last (Plain l) _ = return (fmap (mkLast . Plain) (foldNode l))
        foldNode :: CmmNode e x -> Maybe (CmmNode e x)
        foldNode n = mapExpDeepM foldExp n
        foldExp (CmmMachOp op args) = cmmMachOpFoldM dflags op args
        foldExp _ = Nothing

-- ToDo: Outputable instance for UsageMap and AssignmentMap
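
-- A minimal driver sketch (illustrative only, not part of this module's
-- API): in the Cmm pipeline this pass runs inside UniqSM, but for
-- experimentation it can be driven by hand.  'mkSplitUniqSupply' and
-- 'initUs_' come from UniqSupply (imported above); the DynFlags and
-- CmmGraph arguments are assumed to be supplied by the caller.
--
--   runRewriteAssignments :: DynFlags -> CmmGraph -> IO CmmGraph
--   runRewriteAssignments dflags g = do
--       us <- mkSplitUniqSupply 'r'   -- fresh unique supply for the pass
--       return (initUs_ us (rewriteAssignments dflags g))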