-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
63 additions
and
37 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
path=".." | ||
path="../adix" | ||
path="../../cg" | ||
path="../../cg" # cligen |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,43 +1,69 @@ | ||
when not declared(stdin): import std/[syncio, formatfloat] | ||
import adix/lptabz {.all.}, cligen, cligen/[mslice, osUt], std/times | ||
var str: string # Big (NON-RELOCATABLE!) stack of string data | ||
import std/[hashes, times], cligen, cligen/[mslice, osUt], adix/oats | ||
|
||
proc lfreq(n=0, count=false,uniq=false,Norm=false, size=7, dSize=99, tm=false)= | ||
# Bit widths of the packed `Count` fields; as `intdefine` consts they can be overridden at build time with -d:NAME=value. | ||
const bLen {.intdefine.} = 16 # <64K long (16-bit length field); RT params better but less easy | ||
const bOff {.intdefine.} = 32 # <4G UNIQUE line data | ||
const bCnt {.intdefine.} = 32 # <4 GiCount | ||
type | ||
Count {.packed.} = object # Dense-ish hash Count type | ||
when defined hashCache: hc: uint32 # cached hash, only with -d:hashCache; 10B|14B per cell | ||
len {.bitsize: bLen.}: uint32 # Cmp goes hc, len, key | ||
# `off` is assigned from `s.add` and `cnt` starts at 1 when `incFailed` inserts a new key. | ||
off {.bitsize: bOff.}: uint32 | ||
cnt {.bitsize: bCnt.}: uint32 | ||
# Table state: the cell storage plus a counter; both field names are passed to `useCountedCellSeq`. | ||
Counts = object | ||
dat: seq[Count] | ||
nUsed: int | ||
|
||
# `s` is the string that `incFailed` appends new line text to via `s.add`. | ||
# NOTE(review): `keyStack` is defined elsewhere; presumably it provides the `.key` lookup on `Count` used below - confirm. | ||
var s: string; s.keyStack off,uint32, Count,MSlice | ||
# Per-slot accessors over `Counts.dat`: key slice, count value, and occupancy (a slot counts as used when `cnt` is non-zero). | ||
proc key(c: Counts, i: int): MSlice = c.dat[i].key | ||
proc val(c: Counts, i: int): uint32 = c.dat[i].cnt | ||
proc used(c: Counts, i: int): bool = c.dat[i].cnt != 0 | ||
when defined hashCache: # 2nd def triggers saving lpt behavior | ||
# With -d:hashCache: the getter returns the stored `hc`, the setter writes it. | ||
proc hash(c: Counts, i: int): Hash = c.dat[i].hc.Hash | ||
proc hash(c: var Counts, i: int, hc: uint32) {.used.} = c.dat[i].hc = hc | ||
else: | ||
# Without it: the getter hashes the key on each call and the setter does nothing. | ||
proc hash(c: Counts, i: int): Hash = c.dat[i].key.hash | ||
proc hash(c: var Counts, i: int, hc: Void) {.used.} = discard | ||
# NOTE(review): `useCountedCellSeq` is defined elsewhere; it is given the `dat` and `nUsed` field names of `Counts`. | ||
Counts.useCountedCellSeq dat, nUsed | ||
|
||
proc incFailed(h: var Counts, ms: MSlice): bool = | ||
## Count one occurrence of line `ms` in `h`, reporting problems through `erru`. | ||
## Returns true only on the "unique line data overflow" path, which the caller uses to stop reading; | ||
## every other path, including a skipped over-long line, leaves the result false. | ||
if ms.len > (1 shl bLen) - 1: # Careful to not overflow | ||
erru "skipping too long line: ", ($ms)[0..<128], "\n" | ||
return # Cannot go on LOCALLY | ||
# `getPut` is an external helper taking two bodies; per the inline notes the first runs for an existing key, the second for a new one. | ||
h.getPut(i, ms, hc): # Found key @i: | ||
# A count already at its maximum value is reported and left unchanged rather than incremented. | ||
if h.dat[i].cnt == (1 shl bCnt) - 1: | ||
erru "counter overflow for: ",$ms,"\n" # no update XXX rate limit | ||
else: h.dat[i].cnt.inc # bump | ||
do: # Novel key->i: | ||
# NOTE(review): the body attached to `s.add` looks like its over-limit handler - confirm against its definition. | ||
h.dat[i].off = s.add(ms, (1 shl bOff) - 1): | ||
erru "unique line data overflow at:",$ms,"\n" #XXX rate limit | ||
return true # Cannot go on GLOBALLY | ||
h.dat[i].len = ms.len.uint32# Init | ||
h.dat[i].cnt = 1u32 | ||
|
||
proc lfreq(n=10, count=false,Norm=false, size=9999,dSize=81920, tm=false) = | ||
## Histogram `stdin` lines (read w/non-memory mapped IO to be pipe friendly). | ||
## (Needs manual dSize tuning of non-movable string stack.) | ||
let t0 = epochTime() | ||
## Limits: <4 GiB unique data; <64 KiB lines; <4 GiCount. | ||
let t0 = if tm: epochTime() else: 0.0 | ||
var h: Counts; h.setCap size # pre-size table & data | ||
s.setLen dSize; s.setLen 0 | ||
var nTot = 0 | ||
str.setLen dSize; str.setLen 0 | ||
var cnt = initLPTab[MSlice, int](size) # HCell 16+8+8=32B | ||
for (line, nLine) in stdin.getDelims: | ||
let ms = MSlice(mem: line, len: nLine - 1) | ||
inc nTot # Always bump `nTotal` | ||
cnt.getPut(i, ms) do: # Found key @i: | ||
cnt.cell(i).val.inc # bump | ||
do: # Novel key->i: | ||
let off = str.len # alloc, copy, init | ||
if off + ms.len+1 > dSize:raise newException(ValueError,"dSize too small") | ||
str.setLen off + ms.len+1 # a noInit would be nice | ||
copyMem str[off].addr, ms.mem, ms.len | ||
cnt.cell(i).key = MSlice(mem: str[off].addr, len: ms.len) | ||
cnt.cell(i).val = 1 | ||
if count: | ||
echo cnt.len," unique ",nTot," total" | ||
template o = | ||
if not uniq: | ||
if Norm: stdout.urite c.float / nTot.float | ||
else: stdout.urite c | ||
stdout.urite " " | ||
stdout.urite k; stdout.urite "\n" | ||
if n == 0: (for (k, c) in pairs(cnt): o()) # unsorted | ||
elif n > 0: (for (k, c) in cnt.topByVal(n): o()) # top n | ||
if tm: stderr.write epochTime() - t0, " sec\n" # n<0 => only `c`/tm | ||
block IO: | ||
for (line, nLine) in stdin.getDelims: | ||
let ms = MSlice(mem: line, len: nLine - 1) | ||
inc nTot # Always bump `nTotal` | ||
if h.incFailed(ms): break IO | ||
if count: outu h.len," unique ",nTot," total ",s.len," B\n" | ||
template output = | ||
if Norm: outu c.float/nTot.float," ",k,"\n" else: outu c," ",k,"\n" | ||
if n == 0: (for (k, c) in pairs(h): output()) | ||
elif n > 0: (for (k, c) in h.topByVal(n): output()) | ||
if tm: stderr.write epochTime() - t0, "\n" | ||
|
||
when isMainModule: dispatch lfreq, help={ | ||
"n" : "only emit most frequent `n` lines (!=0=>sorted)", | ||
"n" : "only emit most frequent `n` lines(!=0=>sorted)", | ||
"count": "only emit counts: unique & grand total", | ||
"uniq" : "only emit unique lines, not frequencies", | ||
"Norm" : "normalize frequencies by dividing by grand tot", | ||
"size" : "pre-size hash table for size unique entries", | ||
"dSize": "size string data area to this many bytes", | ||
"tm" : "emit elapsed wall time to stderr"} | ||
"dSize": "pre-size str data area to this many bytes", | ||
"tm" : "emit wall time of counting to stderr & quit"} |