123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564 |
- --[[--------------------------------------------------------------------
- optparser.lua: does parser-based optimizations
- This file is part of LuaSrcDiet.
- Copyright (c) 2008 Kein-Hong Man <khman@users.sf.net>
- The COPYRIGHT file describes the conditions
- under which this software may be distributed.
- See the ChangeLog for more information.
- ----------------------------------------------------------------------]]
- --[[--------------------------------------------------------------------
- -- NOTES:
- -- * For more parser-based optimization ideas, see the TODO items or
- -- look at technotes.txt.
- -- * The processing load is quite significant, but since this is an
- -- off-line text processor, I believe we can wait a few seconds.
- -- * TODO: might process "local a,a,a" wrongly... need tests!
- -- * TODO: remove position handling if overlapped locals (rem < 0)
- -- needs more study, to check behaviour
- -- * TODO: there are probably better ways to do allocation, e.g. by
- -- choosing better methods to sort and pick locals...
- -- * TODO: we don't need 53*63 two-letter identifiers; we can make
- -- do with significantly less depending on how many that are really
- -- needed and improve entropy; e.g. 13 needed -> choose 4*4 instead
- ----------------------------------------------------------------------]]
- local base = _G
- local string = require "string"
- local table = require "table"
- module "optparser"
- ----------------------------------------------------------------------
- -- Letter frequencies for reducing symbol entropy (fixed version)
- -- * Might help a wee bit when the output file is compressed
- -- * See Wikipedia: http://en.wikipedia.org/wiki/Letter_frequencies
- -- * We use letter frequencies according to a Linotype keyboard, plus
- -- the underscore, and both lower case and upper case letters.
- -- * The arrangement below (LC, underscore, %d, UC) is arbitrary.
- -- * This is certainly not optimal, but is quick-and-dirty and the
- -- process has no significant overhead
- ----------------------------------------------------------------------
- local LETTERS = "etaoinshrdlucmfwypvbgkqjxz_ETAOINSHRDLUCMFWYPVBGKQJXZ"
- local ALPHANUM = "etaoinshrdlucmfwypvbgkqjxz_0123456789ETAOINSHRDLUCMFWYPVBGKQJXZ"
- -- names or identifiers that must be skipped
- -- * the first two lines are for keywords
- local SKIP_NAME = {}
- for v in string.gmatch([[
- and break do else elseif end false for function if in
- local nil not or repeat return then true until while
- self]], "%S+") do
- SKIP_NAME[v] = true
- end
- ------------------------------------------------------------------------
- -- variables and data structures
- ------------------------------------------------------------------------
- local toklist, seminfolist, -- token lists
- globalinfo, localinfo, -- variable information tables
- globaluniq, localuniq, -- unique name tables
- var_new, -- index of new variable names
- varlist -- list of output variables
- ----------------------------------------------------------------------
- -- preprocess information table to get lists of unique names
- ----------------------------------------------------------------------
- local function preprocess(infotable)
- local uniqtable = {}
- for i = 1, #infotable do -- enumerate info table
- local obj = infotable[i]
- local name = obj.name
- --------------------------------------------------------------------
- if not uniqtable[name] then -- not found, start an entry
- uniqtable[name] = {
- decl = 0, token = 0, size = 0,
- }
- end
- --------------------------------------------------------------------
- local uniq = uniqtable[name] -- count declarations, tokens, size
- uniq.decl = uniq.decl + 1
- local xref = obj.xref
- local xcount = #xref
- uniq.token = uniq.token + xcount
- uniq.size = uniq.size + xcount * #name
- --------------------------------------------------------------------
- if obj.decl then -- if local table, create first,last pairs
- obj.id = i
- obj.xcount = xcount
- if xcount > 1 then -- if ==1, means local never accessed
- obj.first = xref[2]
- obj.last = xref[xcount]
- end
- --------------------------------------------------------------------
- else -- if global table, add a back ref
- uniq.id = i
- end
- --------------------------------------------------------------------
- end--for
- return uniqtable
- end
- ----------------------------------------------------------------------
- -- calculate actual symbol frequencies, in order to reduce entropy
- -- * this may help further reduce the size of compressed sources
- -- * note that since parsing optimizations is put before lexing
- -- optimizations, the frequency table is not exact!
- -- * yes, this will miss --keep block comments too...
- ----------------------------------------------------------------------
- local function recalc_for_entropy(option)
- local byte = string.byte
- local char = string.char
- -- table of token classes to accept in calculating symbol frequency
- local ACCEPT = {
- TK_KEYWORD = true, TK_NAME = true, TK_NUMBER = true,
- TK_STRING = true, TK_LSTRING = true,
- }
- if not option["opt-comments"] then
- ACCEPT.TK_COMMENT = true
- ACCEPT.TK_LCOMMENT = true
- end
- --------------------------------------------------------------------
- -- create a new table and remove any original locals by filtering
- --------------------------------------------------------------------
- local filtered = {}
- for i = 1, #toklist do
- filtered[i] = seminfolist[i]
- end
- for i = 1, #localinfo do -- enumerate local info table
- local obj = localinfo[i]
- local xref = obj.xref
- for j = 1, obj.xcount do
- local p = xref[j]
- filtered[p] = "" -- remove locals
- end
- end
- --------------------------------------------------------------------
- local freq = {} -- reset symbol frequency table
- for i = 0, 255 do freq[i] = 0 end
- for i = 1, #toklist do -- gather symbol frequency
- local tok, info = toklist[i], filtered[i]
- if ACCEPT[tok] then
- for j = 1, #info do
- local c = byte(info, j)
- freq[c] = freq[c] + 1
- end
- end--if
- end--for
- --------------------------------------------------------------------
- -- function to re-sort symbols according to actual frequencies
- --------------------------------------------------------------------
- local function resort(symbols)
- local symlist = {}
- for i = 1, #symbols do -- prepare table to sort
- local c = byte(symbols, i)
- symlist[i] = { c = c, freq = freq[c], }
- end
- table.sort(symlist, -- sort selected symbols
- function(v1, v2)
- return v1.freq > v2.freq
- end
- )
- local charlist = {} -- reconstitute the string
- for i = 1, #symlist do
- charlist[i] = char(symlist[i].c)
- end
- return table.concat(charlist)
- end
- --------------------------------------------------------------------
- LETTERS = resort(LETTERS) -- change letter arrangement
- ALPHANUM = resort(ALPHANUM)
- end
- ----------------------------------------------------------------------
- -- returns a string containing a new local variable name to use, and
- -- a flag indicating whether it collides with a global variable
- -- * trapping keywords and other names like 'self' is done elsewhere
- ----------------------------------------------------------------------
- local function new_var_name()
- local var
- local cletters, calphanum = #LETTERS, #ALPHANUM
- local v = var_new
- if v < cletters then -- single char
- v = v + 1
- var = string.sub(LETTERS, v, v)
- else -- longer names
- local range, sz = cletters, 1 -- calculate # chars fit
- repeat
- v = v - range
- range = range * calphanum
- sz = sz + 1
- until range > v
- local n = v % cletters -- left side cycles faster
- v = (v - n) / cletters -- do first char first
- n = n + 1
- var = string.sub(LETTERS, n, n)
- while sz > 1 do
- local m = v % calphanum
- v = (v - m) / calphanum
- m = m + 1
- var = var..string.sub(ALPHANUM, m, m)
- sz = sz - 1
- end
- end
- var_new = var_new + 1
- return var, globaluniq[var] ~= nil
- end
- ----------------------------------------------------------------------
- -- calculate and print some statistics
- -- * probably better in main source, put here for now
- ----------------------------------------------------------------------
- local function stats_summary(globaluniq, localuniq, afteruniq, option)
- local print = print or base.print
- local fmt = string.format
- local opt_details = option.DETAILS
- local uniq_g , uniq_li, uniq_lo, uniq_ti, uniq_to, -- stats needed
- decl_g, decl_li, decl_lo, decl_ti, decl_to,
- token_g, token_li, token_lo, token_ti, token_to,
- size_g, size_li, size_lo, size_ti, size_to
- = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- local function avg(c, l) -- safe average function
- if c == 0 then return 0 end
- return l / c
- end
- --------------------------------------------------------------------
- -- collect statistics (note: globals do not have declarations!)
- --------------------------------------------------------------------
- for name, uniq in base.pairs(globaluniq) do
- uniq_g = uniq_g + 1
- token_g = token_g + uniq.token
- size_g = size_g + uniq.size
- end
- for name, uniq in base.pairs(localuniq) do
- uniq_li = uniq_li + 1
- decl_li = decl_li + uniq.decl
- token_li = token_li + uniq.token
- size_li = size_li + uniq.size
- end
- for name, uniq in base.pairs(afteruniq) do
- uniq_lo = uniq_lo + 1
- decl_lo = decl_lo + uniq.decl
- token_lo = token_lo + uniq.token
- size_lo = size_lo + uniq.size
- end
- uniq_ti = uniq_g + uniq_li
- decl_ti = decl_g + decl_li
- token_ti = token_g + token_li
- size_ti = size_g + size_li
- uniq_to = uniq_g + uniq_lo
- decl_to = decl_g + decl_lo
- token_to = token_g + token_lo
- size_to = size_g + size_lo
- --------------------------------------------------------------------
- -- detailed stats: global list
- --------------------------------------------------------------------
- if opt_details then
- local sorted = {} -- sort table of unique global names by size
- for name, uniq in base.pairs(globaluniq) do
- uniq.name = name
- sorted[#sorted + 1] = uniq
- end
- table.sort(sorted,
- function(v1, v2)
- return v1.size > v2.size
- end
- )
- local tabf1, tabf2 = "%8s%8s%10s %s", "%8d%8d%10.2f %s"
- local hl = string.rep("-", 44)
- print("*** global variable list (sorted by size) ***\n"..hl)
- print(fmt(tabf1, "Token", "Input", "Input", "Global"))
- print(fmt(tabf1, "Count", "Bytes", "Average", "Name"))
- print(hl)
- for i = 1, #sorted do
- local uniq = sorted[i]
- print(fmt(tabf2, uniq.token, uniq.size, avg(uniq.token, uniq.size), uniq.name))
- end
- print(hl)
- print(fmt(tabf2, token_g, size_g, avg(token_g, size_g), "TOTAL"))
- print(hl.."\n")
- --------------------------------------------------------------------
- -- detailed stats: local list
- --------------------------------------------------------------------
- local tabf1, tabf2 = "%8s%8s%8s%10s%8s%10s %s", "%8d%8d%8d%10.2f%8d%10.2f %s"
- local hl = string.rep("-", 70)
- print("*** local variable list (sorted by allocation order) ***\n"..hl)
- print(fmt(tabf1, "Decl.", "Token", "Input", "Input", "Output", "Output", "Global"))
- print(fmt(tabf1, "Count", "Count", "Bytes", "Average", "Bytes", "Average", "Name"))
- print(hl)
- for i = 1, #varlist do -- iterate according to order assigned
- local name = varlist[i]
- local uniq = afteruniq[name]
- local old_t, old_s = 0, 0
- for j = 1, #localinfo do -- find corresponding old names and calculate
- local obj = localinfo[j]
- if obj.name == name then
- old_t = old_t + obj.xcount
- old_s = old_s + obj.xcount * #obj.oldname
- end
- end
- print(fmt(tabf2, uniq.decl, uniq.token, old_s, avg(old_t, old_s),
- uniq.size, avg(uniq.token, uniq.size), name))
- end
- print(hl)
- print(fmt(tabf2, decl_lo, token_lo, size_li, avg(token_li, size_li),
- size_lo, avg(token_lo, size_lo), "TOTAL"))
- print(hl.."\n")
- end--if opt_details
- --------------------------------------------------------------------
- -- display output
- --------------------------------------------------------------------
- local tabf1, tabf2 = "%-16s%8s%8s%8s%8s%10s", "%-16s%8d%8d%8d%8d%10.2f"
- local hl = string.rep("-", 58)
- print("*** local variable optimization summary ***\n"..hl)
- print(fmt(tabf1, "Variable", "Unique", "Decl.", "Token", "Size", "Average"))
- print(fmt(tabf1, "Types", "Names", "Count", "Count", "Bytes", "Bytes"))
- print(hl)
- print(fmt(tabf2, "Global", uniq_g, decl_g, token_g, size_g, avg(token_g, size_g)))
- print(hl)
- print(fmt(tabf2, "Local (in)", uniq_li, decl_li, token_li, size_li, avg(token_li, size_li)))
- print(fmt(tabf2, "TOTAL (in)", uniq_ti, decl_ti, token_ti, size_ti, avg(token_ti, size_ti)))
- print(hl)
- print(fmt(tabf2, "Local (out)", uniq_lo, decl_lo, token_lo, size_lo, avg(token_lo, size_lo)))
- print(fmt(tabf2, "TOTAL (out)", uniq_to, decl_to, token_to, size_to, avg(token_to, size_to)))
- print(hl.."\n")
- end
- ----------------------------------------------------------------------
- -- main entry point
- -- * does only local variable optimization for now
- ----------------------------------------------------------------------
- function optimize(option, _toklist, _seminfolist, _globalinfo, _localinfo)
- -- set tables
- toklist, seminfolist, globalinfo, localinfo
- = _toklist, _seminfolist, _globalinfo, _localinfo
- var_new = 0 -- reset variable name allocator
- varlist = {}
- ------------------------------------------------------------------
- -- preprocess global/local tables, handle entropy reduction
- ------------------------------------------------------------------
- globaluniq = preprocess(globalinfo)
- localuniq = preprocess(localinfo)
- if option["opt-entropy"] then -- for entropy improvement
- recalc_for_entropy(option)
- end
- ------------------------------------------------------------------
- -- build initial declared object table, then sort according to
- -- token count, this might help assign more tokens to more common
- -- variable names such as 'e' thus possibly reducing entropy
- -- * an object knows its localinfo index via its 'id' field
- -- * special handling for "self" special local (parameter) here
- ------------------------------------------------------------------
- local object = {}
- for i = 1, #localinfo do
- object[i] = localinfo[i]
- end
- table.sort(object, -- sort largest first
- function(v1, v2)
- return v1.xcount > v2.xcount
- end
- )
- ------------------------------------------------------------------
- -- the special "self" function parameters must be preserved
- -- * the allocator below will never use "self", so it is safe to
- -- keep those implicit declarations as-is
- ------------------------------------------------------------------
- local temp, j, gotself = {}, 1, false
- for i = 1, #object do
- local obj = object[i]
- if not obj.isself then
- temp[j] = obj
- j = j + 1
- else
- gotself = true
- end
- end
- object = temp
- ------------------------------------------------------------------
- -- a simple first-come first-served heuristic name allocator,
- -- note that this is in no way optimal...
- -- * each object is a local variable declaration plus existence
- -- * the aim is to assign short names to as many tokens as possible,
- -- so the following tries to maximize name reuse
- -- * note that we preserve sort order
- ------------------------------------------------------------------
- local nobject = #object
- while nobject > 0 do
- local varname, gcollide
- repeat
- varname, gcollide = new_var_name() -- collect a variable name
- until not SKIP_NAME[varname] -- skip all special names
- varlist[#varlist + 1] = varname -- keep a list
- local oleft = nobject
- ------------------------------------------------------------------
- -- if variable name collides with an existing global, the name
- -- cannot be used by a local when the name is accessed as a global
- -- during which the local is alive (between 'act' to 'rem'), so
- -- we drop objects that collides with the corresponding global
- ------------------------------------------------------------------
- if gcollide then
- -- find the xref table of the global
- local gref = globalinfo[globaluniq[varname].id].xref
- local ngref = #gref
- -- enumerate for all current objects; all are valid at this point
- for i = 1, nobject do
- local obj = object[i]
- local act, rem = obj.act, obj.rem -- 'live' range of local
- -- if rem < 0, it is a -id to a local that had the same name
- -- so follow rem to extend it; does this make sense?
- while rem < 0 do
- rem = localinfo[-rem].rem
- end
- local drop
- for j = 1, ngref do
- local p = gref[j]
- if p >= act and p <= rem then drop = true end -- in range?
- end
- if drop then
- obj.skip = true
- oleft = oleft - 1
- end
- end--for
- end--if gcollide
- ------------------------------------------------------------------
- -- now the first unassigned local (since it's sorted) will be the
- -- one with the most tokens to rename, so we set this one and then
- -- eliminate all others that collides, then any locals that left
- -- can then reuse the same variable name; this is repeated until
- -- all local declaration that can use this name is assigned
- -- * the criteria for local-local reuse/collision is:
- -- A is the local with a name already assigned
- -- B is the unassigned local under consideration
- -- => anytime A is accessed, it cannot be when B is 'live'
- -- => to speed up things, we have first/last accesses noted
- ------------------------------------------------------------------
- while oleft > 0 do
- local i = 1
- while object[i].skip do -- scan for first object
- i = i + 1
- end
- ------------------------------------------------------------------
- -- first object is free for assignment of the variable name
- -- [first,last] gives the access range for collision checking
- ------------------------------------------------------------------
- oleft = oleft - 1
- local obja = object[i]
- i = i + 1
- obja.newname = varname
- obja.skip = true
- obja.done = true
- local first, last = obja.first, obja.last
- local xref = obja.xref
- ------------------------------------------------------------------
- -- then, scan all the rest and drop those colliding
- -- if A was never accessed then it'll never collide with anything
- -- otherwise trivial skip if:
- -- * B was activated after A's last access (last < act)
- -- * B was removed before A's first access (first > rem)
- -- if not, see detailed skip below...
- ------------------------------------------------------------------
- if first and oleft > 0 then -- must have at least 1 access
- local scanleft = oleft
- while scanleft > 0 do
- while object[i].skip do -- next valid object
- i = i + 1
- end
- scanleft = scanleft - 1
- local objb = object[i]
- i = i + 1
- local act, rem = objb.act, objb.rem -- live range of B
- -- if rem < 0, extend range of rem thru' following local
- while rem < 0 do
- rem = localinfo[-rem].rem
- end
- --------------------------------------------------------
- if not(last < act or first > rem) then -- possible collision
- --------------------------------------------------------
- -- B is activated later than A or at the same statement,
- -- this means for no collision, A cannot be accessed when B
- -- is alive, since B overrides A (or is a peer)
- --------------------------------------------------------
- if act >= obja.act then
- for j = 1, obja.xcount do -- ... then check every access
- local p = xref[j]
- if p >= act and p <= rem then -- A accessed when B live!
- oleft = oleft - 1
- objb.skip = true
- break
- end
- end--for
- --------------------------------------------------------
- -- A is activated later than B, this means for no collision,
- -- A's access is okay since it overrides B, but B's last
- -- access need to be earlier than A's activation time
- --------------------------------------------------------
- else
- if objb.last and objb.last >= obja.act then
- oleft = oleft - 1
- objb.skip = true
- end
- end
- end
- --------------------------------------------------------
- if oleft == 0 then break end
- end
- end--if first
- ------------------------------------------------------------------
- end--while
- ------------------------------------------------------------------
- -- after assigning all possible locals to one variable name, the
- -- unassigned locals/objects have the skip field reset and the table
- -- is compacted, to hopefully reduce iteration time
- ------------------------------------------------------------------
- local temp, j = {}, 1
- for i = 1, nobject do
- local obj = object[i]
- if not obj.done then
- obj.skip = false
- temp[j] = obj
- j = j + 1
- end
- end
- object = temp -- new compacted object table
- nobject = #object -- objects left to process
- ------------------------------------------------------------------
- end--while
- ------------------------------------------------------------------
- -- after assigning all locals with new variable names, we can
- -- patch in the new names, and reprocess to get 'after' stats
- ------------------------------------------------------------------
- for i = 1, #localinfo do -- enumerate all locals
- local obj = localinfo[i]
- local xref = obj.xref
- if obj.newname then -- if got new name, patch it in
- for j = 1, obj.xcount do
- local p = xref[j] -- xrefs indexes the token list
- seminfolist[p] = obj.newname
- end
- obj.name, obj.oldname -- adjust names
- = obj.newname, obj.name
- else
- obj.oldname = obj.name -- for cases like 'self'
- end
- end
- ------------------------------------------------------------------
- -- deal with statistics output
- ------------------------------------------------------------------
- if gotself then -- add 'self' to end of list
- varlist[#varlist + 1] = "self"
- end
- local afteruniq = preprocess(localinfo)
- stats_summary(globaluniq, localuniq, afteruniq, option)
- ------------------------------------------------------------------
- end
|