Merge pull request #2308 from midichef/aggr_misc

[aggr-] fix handling of various special cases
saulpw · Mar 11, 2024 · 1987007 · 1987007
2 parents 1dd009a + 8a61e09
commit 1987007
Show file tree

Hide file tree

Showing 8 changed files with 68 additions and 21 deletions.
diff --git a/tests/golden/pr2308.json b/tests/golden/pr2308.json
@@ -0,0 +1,4 @@
+[
+{"Total": "Total", "count": 2, "A_keymax": [2.0]},
+{"Total": "Selected", "count": 0}
+]
diff --git a/tests/pr2308.vdj b/tests/pr2308.vdj
@@ -0,0 +1,5 @@
+#!vd -p
+{"sheet": null, "col": null, "row": null, "longname": "open-file", "input": "tests/small.json", "keystrokes": "o", "comment": null}
+{"sheet": "small", "col": "id", "row": "", "longname": "key-col", "input": "", "keystrokes": "!", "comment": "toggle current column as a key column"}
+{"sheet": "small", "col": "A", "row": "", "longname": "aggregate-col", "input": "keymax", "keystrokes": "+", "comment": "Add aggregator to current column"}
+{"sheet": "small", "col": "", "row": "", "longname": "freq-summary", "input": "", "keystrokes": "zShift+F", "comment": "open one-line summary for all rows and selected rows"}
diff --git a/tests/small.json b/tests/small.json
@@ -0,0 +1,4 @@
+[
+{"id": 1.0, "A": 1},
+{"id": 2.0, "A": 1}
+]
diff --git a/visidata/aggregators.py b/visidata/aggregators.py
@@ -5,7 +5,7 @@
 import statistics
 
 from visidata import Progress, Sheet, Column, ColumnsSheet, VisiData
-from visidata import vd, anytype, vlen, asyncthread, wrapply, AttrDict
+from visidata import vd, anytype, vlen, asyncthread, wrapply, AttrDict, date
 
 vd.help_aggregators = '''# Choose Aggregators
 Start typing an aggregator name or description.
@@ -51,7 +51,11 @@ def getValues(self, rows):
 
 def aggregators_get(col):
  'A space-separated names of aggregators on this column.'
- return list(vd.aggregators[k] for k in (col.aggstr or '').split())
+ found = []
+ for name in (col.aggstr or '').split():
+ entry = vd.aggregators[name]  # one name may map to a list of aggregators
+ found.extend(entry if isinstance(entry, list) else [entry])
+ return found
 
 def aggregators_set(col, aggs):
  if isinstance(aggs, str):
@@ -94,7 +98,7 @@ def _funcRows(col, rows): # wrap builtins so they can have a .type
  except Exception as e:
  if len(vals) == 0:
  return None
- return e
+ raise e
 
  vd.aggregators[name] = _defaggr(name, type, _funcRows, funcValues=funcValues, helpstr=helpstr) # accepts a srccol + list of rows
 
@@ -135,7 +139,9 @@ def _percentile(N, percent, key=lambda x:x):
 
 @functools.lru_cache(100)
 def percentile(pct, helpstr=''):
- return _defaggr('p%s'%pct, None, lambda col,rows,pct=pct: _percentile(sorted(col.getValues(rows)), pct/100), helpstr=helpstr)
+ as_timestamp = lambda d: d.timestamp()  # key used when col.type is date
+ unchanged = lambda x: x  # key used for every other column type
+ return _defaggr('p%s'%pct, None, lambda col,rows,pct=pct: _percentile(sorted(col.getValues(rows)), pct/100, key=as_timestamp if col.type is date else unchanged), helpstr=helpstr)
 
 def quantiles(q, helpstr):
  return [percentile(round(100*i/q), helpstr) for i in range(1, q)]
@@ -149,7 +155,7 @@ def quantiles(q, helpstr):
 vd.aggregator('sum', vsum, 'sum of values')
 vd.aggregator('distinct', set, 'distinct values', type=vlen)
 vd.aggregator('count', lambda values: sum(1 for v in values), 'number of values', type=int)
-vd.aggregator('list', list, 'list of values')
+vd.aggregator('list', list, 'list of values', type=anytype)
 vd.aggregator('stdev', statistics.stdev, 'standard deviation of values', type=float)
 
 vd.aggregators['q3'] = quantiles(3, 'tertiles (33/66th pctile)')
@@ -162,8 +168,24 @@ def quantiles(q, helpstr):
 for pct in (10, 20, 25, 30, 33, 40, 50, 60, 67, 70, 75, 80, 90, 95, 99):
  vd.aggregators[f'p{pct}'] = percentile(pct, f'{pct}th percentile')
 
-# returns keys of the row with the max value
-vd.aggregators['keymax'] = _defaggr('keymax', anytype, lambda col, rows: col.sheet.rowkey(max(col.getValueRows(rows))[1]), helpstr='key of the maximum value')
+def keyfunc(aggr_func):
+ '''Return an aggregator function that gives the key of the row picked by *aggr_func*.
+ The returned function gives None when there is no (value, row) pair to pick from.
+ *aggr_func* takes a list of (value, row) tuples, one for each row in the column,
+ excluding rows where the column holds null and error values.
+ *aggr_func* must also take the parameters *default* and *key*, as max() does:
+ https://docs.python.org/3/library/functions.html#max'''
+ def key_aggr_func(col, rows):
+ if not col.sheet.keyCols:
+ vd.error('key aggregator function requires one or more key columns')
+ return None
+ # convert dicts to lists because functions like max() can't compare dicts
+ sortkey = lambda t: (t[0], sorted(t[1].items())) if isinstance(t[1], dict) else t
+ row = aggr_func(col.getValueRows(rows), default=(None, None), key=sortkey)[1]
+ return col.sheet.rowkey(row) if row is not None else None  # only the default's None means "no row"; a falsy row such as {} is still a row
+ return key_aggr_func
+vd.aggregators['keymax'] = _defaggr('keymax', anytype, keyfunc(max), helpstr='key of the maximum value')
+vd.aggregators['keymin'] = _defaggr('keymin', anytype, keyfunc(min), helpstr='key of the minimum value')
 
 
 ColumnsSheet.columns += [
@@ -175,7 +197,7 @@ def quantiles(q, helpstr):
 
 @Sheet.api
 def addAggregators(sheet, cols, aggrnames):
- 'Add each aggregator in list of *aggrnames* to each of *cols*.'
+ 'Add each aggregator in list of *aggrnames* to each of *cols*. Ignores names that are not valid.'
  for aggrname in aggrnames:
  aggrs = vd.aggregators.get(aggrname)
  aggrs = aggrs if isinstance(aggrs, list) else [aggrs]
@@ -194,14 +216,19 @@ def aggname(col, agg):
 
 @Column.api
 @asyncthread
-def memo_aggregate(col, agg, rows):
+def memo_aggregate(col, agg_choices, rows):
  'Show aggregated value in status, and add to memory.'
- aggval = agg(col, rows)
- typedval = wrapply(agg.type or col.type, aggval)
- dispval = col.format(typedval)
- k = col.name+'_'+agg.name
- vd.status(f'{k}={dispval}')
- vd.memory[k] = typedval
+ for name in agg_choices:
+ found = vd.aggregators.get(name)
+ if not found: continue  # nothing registered under this name
+ expanded = found if isinstance(found, list) else [found]
+ for aggr in expanded:
+ result = aggr(col, rows)
+ typed = wrapply(aggr.type or col.type, result)
+ shown = col.format(typed)
+ memokey = col.name+'_'+aggr.name
+ vd.status(f'{memokey}={shown}')
+ vd.memory[memokey] = typed
 
 
 @VisiData.property
@@ -215,6 +242,7 @@ def aggregator_choices(vd):
 
 @VisiData.api
 def chooseAggregators(vd):
+ '''Return a list of aggregator name strings chosen or entered by the user. User-entered names may be invalid.'''
  prompt = 'choose aggregators: '
  def _fmt_aggr_summary(match, row, trigger_key):
  formatted_aggrname = match.formatted.get('key', row.key) if match else row.key
@@ -235,12 +263,15 @@ def _fmt_aggr_summary(match, row, trigger_key):
  multiple=True)
 
  aggrs = r.split()
+ valid_choices = vd.aggregators.keys()
  for aggr in aggrs:
  vd.usedInputs[aggr] += 1
+ if aggr not in valid_choices:
+ vd.warning(f'aggregator does not exist: {aggr}')
  return aggrs
 
 Sheet.addCommand('+', 'aggregate-col', 'addAggregators([cursorCol], chooseAggregators())', 'Add aggregator to current column')
-Sheet.addCommand('z+', 'memo-aggregate', 'for agg in chooseAggregators(): cursorCol.memo_aggregate(aggregators[agg], selectedRows or rows)', 'memo result of aggregator over values in selected rows for current column')
+Sheet.addCommand('z+', 'memo-aggregate', 'cursorCol.memo_aggregate(chooseAggregators(), selectedRows or rows)', 'memo result of aggregator over values in selected rows for current column')
 ColumnsSheet.addCommand('g+', 'aggregate-cols', 'addAggregators(selectedRows or source[0].nonKeyVisibleCols, chooseAggregators())', 'add aggregators to selected source columns')
 
 vd.addMenuItems('''

diff --git a/visidata/column.py b/visidata/column.py
@@ -381,7 +381,7 @@ def getCell(self, row):
  notecolor='color_warning')
  else:
  return DisplayWrapper(typedval.val, text=str(typedval.val),
- error='unknown',
+ error=['unknown'],
  note=options.note_type_exc,
  notecolor='color_warning')
 

diff --git a/visidata/freqtbl.py b/visidata/freqtbl.py
@@ -140,7 +140,7 @@ def rows(self):
 FreqTableSheet.addCommand('', 'open-preview', 'vd.push(FreqTablePreviewSheet(sheet.name, "preview", source=sheet, columns=source.columns), pane=2); vd.options.disp_splitwin_pct=50', 'open split preview of source rows at cursor')
 
 Sheet.addCommand('F', 'freq-col', 'vd.push(makeFreqTable(sheet, cursorCol))', 'open Frequency Table grouped on current column, with aggregations of other columns')
-Sheet.addCommand('gF', 'freq-keys', 'vd.push(makeFreqTable(sheet, *keyCols))', 'open Frequency Table grouped by all key columns on source sheet, with aggregations of other columns')
+Sheet.addCommand('gF', 'freq-keys', 'vd.push(makeFreqTable(sheet, *keyCols)) if keyCols else vd.fail("there are no key columns to group by")', 'open Frequency Table grouped by all key columns on source sheet, with aggregations of other columns')
 Sheet.addCommand('zF', 'freq-summary', 'vd.push(makeFreqTableSheetSummary(sheet, Column("Total", sheet=sheet, getter=lambda col, row: "Total")))', 'open one-line summary for all rows and selected rows')
 
 ColumnsSheet.addCommand(ENTER, 'freq-row', 'vd.push(makeFreqTable(source[0], cursorRow))', 'open a Frequency Table sheet grouped on column referenced in current row')

diff --git a/visidata/pivot.py b/visidata/pivot.py
@@ -291,8 +291,11 @@ def afterLoad(self):
 @PivotSheet.api
 def addcol_aggr(sheet, col):
  hasattr(col, 'origCol') or vd.fail('not an aggregation column')
- for agg in vd.chooseAggregators():
- sheet.addColumnAtCursor(makeAggrColumn(col.origCol, vd.aggregators[agg]))
+ for agg_choice in vd.chooseAggregators():
+ agg_or_list = vd.aggregators.get(agg_choice) or []  # chosen names may be invalid; skip those instead of raising KeyError
+ aggs = agg_or_list if isinstance(agg_or_list, list) else [agg_or_list]  # one name may map to a list of aggregators
+ for agg in aggs:
+ sheet.addColumnAtCursor(makeAggrColumn(col.origCol, agg))  # agg is already the aggregator object, not a registry name
 
 
 Sheet.addCommand('W', 'pivot', 'vd.push(makePivot(sheet, keyCols, [cursorCol]))', 'open Pivot Table: group rows by key column and summarize current column')

diff --git a/visidata/tests/test_commands.py b/visidata/tests/test_commands.py
@@ -93,7 +93,7 @@ def isTestableCommand(longname, cmdlist):
  'expand-cols-depth': '0',
  'save-cmdlog': 'test_commands.vdj',
  'aggregate-col': 'mean',
- 'memo-aggregate': 'mean',
+ 'memo-aggregate': 'count',
  'addcol-shell': '',
  'theme-input': 'light',
  'add-rows': '1',