Skip to content

Commit

Permalink
Small changes to adapt old tests/util code to new adix/oats. Also add
Browse files Browse the repository at this point in the history
`tests/ucl.nim`.  All seems to work fine.  A new example program is in
the works to exercise an MFile-backed VOat.
  • Loading branch information
c-blake committed Dec 22, 2023
1 parent 70a2a3d commit fcdad32
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 23 deletions.
56 changes: 56 additions & 0 deletions tests/ucl.nim
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
when not declared(stdin): import std/[syncio, formatfloat]
import std/[hashes, times], cligen, cligen/[mslice, osUt], adix/oats

const
  bLen {.intdefine.} = 10   # bits in `len` => lines are <1024 long; run-time
                            # limits would be nicer but are harder
  bOff {.intdefine.} = 22   # bits in `off` => <4MiB of UNIQUE line data

type
  Count {.packed.} = object # Dense-ish hash cell; 4B, or 8B with hashCache
    when defined(hashCache):
      hc: uint32            # saved hash code (present only with hashCache)
    len {.bitsize: bLen.}: uint32   # key length, `bLen` bits wide
    off {.bitsize: bOff.}: uint32   # key data offset, `bOff` bits wide
  Counts = object           # The table itself
    dat: seq[Count]         # cells
    nUsed: int              # population counter

# `a` is the string behind the key stack.  It starts with one byte and `used`
# below treats `off == 0` as an empty slot, so presumably that first byte
# exists to keep offset 0 from ever naming a real key - TODO confirm in oats.
var a = " "
oatKStack a, Counts, Count, off,uint32, MSlice, MSlice
#proc key(c: var Counts, i: int, q: MSlice) = c.dat[i]=c.keyR(q) wrong&unneeded
proc key(c: Counts, i: int): MSlice =
  ## Key held by slot `i` of `c`.
  c.dat[i].key
proc used(c: Counts, i: int): bool =
  ## True when slot `i` of `c` has a non-zero data offset.
  c.dat[i].off != 0

when defined(hashCache):                # def auto-triggers use
  proc hash(ms: MSlice): Hash =
    ## Hash of `ms` narrowed to 32 bits, the width of the `hc` cell field.
    Hash(uint32(mslice.hash(ms)))
  proc hash(c: var Counts, i: int, hc: Hash) {.used.} =
    ## Store hash code `hc` in slot `i`.
    c.dat[i].hc = uint32(hc)
  proc hash(c: Counts, i: int): Hash =
    ## Fetch the hash code stored in slot `i`.
    Hash(c.dat[i].hc)

oatCounted c,Counts, c.nUsed            # make counted (via the `nUsed` field)
oatSeq Counts, dat                      # make resizable (via the `dat` seq)
when Counts is ROat[MSlice, MSlice]:    # compile-time type test only
  {.warning: "Counts is a ROat"}

proc incFailed(h: var Counts, r: MSlice): bool =
  ## Make sure line `r` is a key of `h`.  A line too long for the `len` field
  ## is reported via `erru` and skipped (result false).  The result is true
  ## only when adding `r` to the unique data area `a` fails, after which the
  ## caller is expected to stop.
  const lenLimit = 1 shl bLen           # smallest length `len` cannot hold
  if r.len + 1 > lenLimit:              # Careful to not overflow
    erru "skipping too long(", $r.len, ") line: ",$r,"\n"
    return false                        # Cannot go on LOCALLY
  h.upSert(r, i):                       # Found key @i:
    discard                             #   nothing to do
  do:                                   # Novel key->i:
    h.dat[i].off = a.add(r, (1 shl bOff) - 1):
      erru "unique word data overflow at:",$r,"\n" #XXX rate limit msgs
      return true                       # Cannot go on GLOBALLY
    h.dat[i].len = r.len.uint32         # Init

proc ucl(size=9999, dSize=81920, tm=false) =
  ## Count unique & total lines on `stdin`.  <1024B long; <4 MiB unique data
  ## (build-time defaults; change with `-d:bLen=N` / `-d:bOff=N`).
  let t0 = if tm: epochTime() else: 0.0
  var h: Counts; h.setCap size          # Pre-size table & data
  a.setLen dSize; a.setLen 1            # grow once, then shrink back to 1 byte
  var nTot = 0
  block IO:
    for (line, nLine) in stdin.getDelims:
      # Drop the delimiter only when it is really there: an unterminated final
      # line has none, and blindly using `nLine - 1` would eat its last byte.
      let n = if nLine > 0 and line[nLine - 1] == '\n': nLine - 1 else: nLine
      let ms = MSlice(mem: line, len: n)
      inc nTot                          # Always bump `nTotal`
      if h.incFailed(ms): break IO      # data area full: stop reading
  echo h.len," unique ",nTot," total ",a.len," B"
  if tm: stderr.write epochTime() - t0, "\n"

when isMainModule:
  # Build the command-line front end for `ucl`; keys are its parameter names.
  dispatch ucl, help={
    "size" : "pre-size hash table for size slots",
    "dSize": "pre-size str data area to this many bytes",
    "tm"   : "emit wall time of counting to stderr & quit"}
14 changes: 7 additions & 7 deletions tests/wfr.nim
Original file line number Diff line number Diff line change
Expand Up @@ -14,23 +14,23 @@ type
dat: seq[Count]
nUsed: int

var s: string; s.keyStack off,uint32, Count,MSlice
var s: string; oatKStack s, Counts, Count, off,uint32, MSlice, MSlice
proc key(c: Counts, i: int): MSlice = c.dat[i].key
proc val(c: var Counts, i: int, v: uint32) {.used.} = c.dat[i].cnt = v
proc val(c: Counts, i: int): uint32 = c.dat[i].cnt
proc used(c: Counts, i: int): bool = c.dat[i].len != 0
when defined hashCache: # 2nd def triggers saving lpt behavior
proc hash(ms: MSlice): Hash = mslice.hash(ms).uint32.Hash
proc hash(c: var Counts, i: int, hc: Hash) {.used.} = c.dat[i].hc = hc.uint32
proc hash(c: Counts, i: int): Hash = c.dat[i].hc.Hash
proc hash(c: var Counts, i: int, hc: uint32) {.used.} = c.dat[i].hc = hc
else:
proc hash(c: Counts, i: int): Hash = c.dat[i].key.hash
proc hash(c: var Counts, i: int, hc: Void) {.used.} = discard
Counts.useCountedCellSeq dat, nUsed
oatCounted c,Counts, c.nUsed; oatSeq Counts, dat # make counted & resizable
when Counts is VROat[MSlice, MSlice, uint32]: {.warning: "Counts is a VROat"}

proc incFailed(h: var Counts, ms: MSlice): bool =
if ms.len > (1 shl bLen) - 1: # Careful to not overflow
erru "skipping too long word: ",$ms,"\n"
return # Cannot go on LOCALLY
h.getPut(i, ms, hc): # Found key @i:
h.upSert(ms, i): # Found key @i:
if h.dat[i].cnt == (1 shl bCnt) - 1:
erru "counter overflow for: ",$ms,"\n" # no update XXX rate limit
else: h.dat[i].cnt.inc # bump
Expand Down
14 changes: 6 additions & 8 deletions tests/wu.nim
Original file line number Diff line number Diff line change
Expand Up @@ -12,23 +12,21 @@ type
dat: seq[Count]
nUsed: int

var s: string; s.keyStack off,uint32, Count,MSlice
var s: string; oatKStack s, Counts, Count, off,uint32, MSlice, MSlice
proc key(c: Counts, i: int): MSlice = c.dat[i].key
proc val(c: Counts, i: int): Void {.used.} = discard #NONE
proc used(c: Counts, i: int): bool = c.dat[i].len != 0
when defined hashCache: # 2nd def triggers saving lpt behavior
proc hash(ms: MSlice): Hash = mslice.hash(ms).uint32.Hash
proc hash(c: var Counts, i: int, hc: Hash) {.used.} = c.dat[i].hc = hc.uint32
proc hash(c: Counts, i: int): Hash = c.dat[i].hc.Hash
proc hash(c: var Counts, i: int, hc: uint32) {.used.} = c.dat[i].hc = hc
else:
proc hash(c: Counts, i: int): Hash = c.dat[i].key.hash
proc hash(c: var Counts, i: int, hc: Void) {.used.} = discard
Counts.useCountedCellSeq dat, nUsed
oatCounted c,Counts, c.nUsed; oatSeq Counts, dat # make counted & resizable
when Counts is ROat[MSlice, MSlice]: {.warning: "Counts is a ROat"}

proc incFailed(h: var Counts, ms: MSlice): bool =
if ms.len > (1 shl bLen) - 1: # Careful to not overflow
erru "skipping too long word: ",$ms,"\n"
return # Cannot go on LOCALLY
h.getPut(i, ms, hc): discard # Found key @i:
h.upSert(ms, i): discard # Found key @i:
do: # Novel key->i:
h.dat[i].off = s.add(ms, (1 shl bOff) - 1):
erru "unique word data overflow at:",$ms,"\n" #XXX rate limit
Expand Down
16 changes: 8 additions & 8 deletions util/lfreq.nim
Original file line number Diff line number Diff line change
Expand Up @@ -14,23 +14,23 @@ type
dat: seq[Count]
nUsed: int

var s: string; s.keyStack off,uint32, Count,MSlice
var s: string; oatKStack s, Counts, Count, off,uint32, MSlice, MSlice
proc key(c: Counts, i: int): MSlice = c.dat[i].key
proc val(c: var Counts, i: int, v: uint32) {.used.} = c.dat[i].cnt = v
proc val(c: Counts, i: int): uint32 = c.dat[i].cnt
proc used(c: Counts, i: int): bool = c.dat[i].cnt != 0
when defined hashCache: # 2nd def triggers saving lpt behavior
proc hash(ms: MSlice): Hash = mslice.hash(ms).uint32.Hash
proc hash(c: var Counts, i: int, hc: Hash) {.used.} = c.dat[i].hc = hc.uint32
proc hash(c: Counts, i: int): Hash = c.dat[i].hc.Hash
proc hash(c: var Counts, i: int, hc: uint32) {.used.} = c.dat[i].hc = hc
else:
proc hash(c: Counts, i: int): Hash = c.dat[i].key.hash
proc hash(c: var Counts, i: int, hc: Void) {.used.} = discard
Counts.useCountedCellSeq dat, nUsed
oatCounted c,Counts, c.nUsed; oatSeq Counts, dat # make counted & resizable
when Counts is VROat[MSlice, MSlice, uint32]: {.warning: "Counts is a VROat"}

proc incFailed(h: var Counts, ms: MSlice): bool =
if ms.len > (1 shl bLen) - 1: # Careful to not overflow
erru "skipping too long line: ", ($ms)[0..<128], "\n"
return # Cannot go on LOCALLY
h.getPut(i, ms, hc): # Found key @i:
return false # Cannot go on LOCALLY
h.upSert(ms, i): # Found key @i:
if h.dat[i].cnt == (1 shl bCnt) - 1:
erru "counter overflow for: ",$ms,"\n" # no update XXX rate limit
else: h.dat[i].cnt.inc # bump
Expand Down

0 comments on commit fcdad32

Please sign in to comment.