Bump versions pre-release

c-blake · Jun 28, 2023 · 20ab393 · 20ab393
1 parent 7d26bd6
commit 20ab393
Show file tree

Hide file tree

Showing 3 changed files with 63 additions and 37 deletions.
diff --git a/adix.nimble b/adix.nimble
@@ -1,12 +1,12 @@
 # Package
-version = "0.5.5"
+version = "0.5.6"
 author = "Charles Blake"
 description = "An Adaptive Index Library for Nim"
 license = "MIT/ISC"
 
 # Deps
 requires "nim >= 1.2.0"
-requires "cligen >= 1.6.6"
+requires "cligen >= 1.6.7"
 skipDirs = @[ "tests" ]
 
 bin = @[

diff --git a/tests/nim.cfg b/tests/nim.cfg
@@ -1,3 +1,3 @@
 path=".."
 path="../adix"
-path="../../cg"
+path="../../cg" # cligen
diff --git a/util/lfreq.nim b/util/lfreq.nim
@@ -1,43 +1,69 @@
 when not declared(stdin): import std/[syncio, formatfloat]
-import adix/lptabz {.all.}, cligen, cligen/[mslice, osUt], std/times
-var str: string # Big (NON-RELOCATABLE!) stack of string data
+import std/[hashes, times], cligen, cligen/[mslice, osUt], adix/oats
 
-proc lfreq(n=0, count=false,uniq=false,Norm=false, size=7, dSize=99, tm=false)=
+const bLen {.intdefine.} = 16 # <16K long; RT params better but less easy
+const bOff {.intdefine.} = 32 # <4G UNIQUE line data
+const bCnt {.intdefine.} = 32 # <4 GiCount
+type
+ Count {.packed.} = object # Dense-ish hash Count type
+ when defined hashCache: hc: uint32 # 10B|14B per cell
+ len {.bitsize: bLen.}: uint32 # Cmp goes hc, len, key
+ off {.bitsize: bOff.}: uint32
+ cnt {.bitsize: bCnt.}: uint32
+ Counts = object
+ dat: seq[Count]
+ nUsed: int
+
+var s: string; s.keyStack off,uint32, Count,MSlice
+proc key(c: Counts, i: int): MSlice = c.dat[i].key
+proc val(c: Counts, i: int): uint32 = c.dat[i].cnt
+proc used(c: Counts, i: int): bool = c.dat[i].cnt != 0
+when defined hashCache: # 2nd def triggers saving lpt behavior
+ proc hash(c: Counts, i: int): Hash = c.dat[i].hc.Hash
+ proc hash(c: var Counts, i: int, hc: uint32) {.used.} = c.dat[i].hc = hc
+else:
+ proc hash(c: Counts, i: int): Hash = c.dat[i].key.hash
+ proc hash(c: var Counts, i: int, hc: Void) {.used.} = discard
+Counts.useCountedCellSeq dat, nUsed
+
+proc incFailed(h: var Counts, ms: MSlice): bool =
+ if ms.len > (1 shl bLen) - 1: # Careful to not overflow
+ erru "skipping too long line: ", ($ms)[0..<128], "\n"
+ return # Cannot go on LOCALLY
+ h.getPut(i, ms, hc): # Found key @i:
+ if h.dat[i].cnt == (1 shl bCnt) - 1:
+ erru "counter overflow for: ",$ms,"\n" # no update XXX rate limit
+ else: h.dat[i].cnt.inc # bump
+ do: # Novel key->i:
+ h.dat[i].off = s.add(ms, (1 shl bOff) - 1):
+ erru "unique line data overflow at:",$ms,"\n" #XXX rate limit
+ return true # Cannot go on GLOBALLY
+ h.dat[i].len = ms.len.uint32# Init
+ h.dat[i].cnt = 1u32
+
+proc lfreq(n=10, count=false,Norm=false, size=9999,dSize=81920, tm=false) =
  ## Histogram `stdin` lines (read w/non-memory mapped IO to be pipe friendly).
- ## (Needs manual dSize tuning of non-movable string stack.)
- let t0 = epochTime()
+ ## Limits: <4 GiB unique data; <16 KiB lines; <4 GiCount.
+ let t0 = if tm: epochTime() else: 0.0
+ var h: Counts; h.setCap size # pre-size table & data
+ s.setLen dSize; s.setLen 0
  var nTot = 0
- str.setLen dSize; str.setLen 0
- var cnt = initLPTab[MSlice, int](size) # HCell 16+8+8=32B
- for (line, nLine) in stdin.getDelims:
- let ms = MSlice(mem: line, len: nLine - 1)
- inc nTot # Always bump `nTotal`
- cnt.getPut(i, ms) do: # Found key @i:
- cnt.cell(i).val.inc # bump
- do: # Novel key->i:
- let off = str.len # alloc, copy, init
- if off + ms.len+1 > dSize:raise newException(ValueError,"dSize too small")
- str.setLen off + ms.len+1 # a noInit would be nice
- copyMem str[off].addr, ms.mem, ms.len
- cnt.cell(i).key = MSlice(mem: str[off].addr, len: ms.len)
- cnt.cell(i).val = 1
- if count:
- echo cnt.len," unique ",nTot," total"
- template o =
- if not uniq:
- if Norm: stdout.urite c.float / nTot.float
- else: stdout.urite c
- stdout.urite " "
- stdout.urite k; stdout.urite "\n"
- if n == 0: (for (k, c) in pairs(cnt): o()) # unsorted
- elif n > 0: (for (k, c) in cnt.topByVal(n): o()) # top n
- if tm: stderr.write epochTime() - t0, " sec\n" # n<0 => only `c`/tm
+ block IO:
+ for (line, nLine) in stdin.getDelims:
+ let ms = MSlice(mem: line, len: nLine - 1)
+ inc nTot # Always bump `nTotal`
+ if h.incFailed(ms): break IO
+ if count: outu h.len," unique ",nTot," total ",s.len," B\n"
+ template output =
+ if Norm: outu c.float/nTot.float," ",k,"\n" else: outu c," ",k,"\n"
+ if n == 0: (for (k, c) in pairs(h): output())
+ elif n > 0: (for (k, c) in h.topByVal(n): output())
+ if tm: stderr.write epochTime() - t0, "\n"
 
 when isMainModule: dispatch lfreq, help={
- "n" : "only emit most frequent `n` lines (!=0=>sorted)",
+ "n" : "only emit most frequent `n` lines(!=0=>sorted)",
  "count": "only emit counts: unique & grand total",
- "uniq" : "only emit unique lines, not frequencies",
  "Norm" : "normalize frequencies by dividing by grand tot",
  "size" : "pre-size hash table for size unique entries",
- "dSize": "size string data area to this many bytes",
- "tm" : "emit elapsed wall time to stderr"}
+ "dSize": "pre-size str data area to this many bytes",
+ "tm" : "emit wall time of counting to stderr & quit"}