miller/pkg/mlrval/mlrval_collections.go

789 lines
21 KiB
Go

// ================================================================
// ABOUT ARRAY/MAP INDEXING
//
// Arrays:
//
// * Within the Miller domain-specific language:
// o Arrays are indexed 1-up within the DSL
// o Negative-index aliases are supported: -1 means the last element
// of the array (if any), -2 means second-to-last, and in general
// -n..-1 are aliases for 1..n.
// * Within the Go implementation:
// o Array indices are of course 0-up.
//
// Maps:
//
// * Map keys are strings
//
// * They can also be indexed positionally (which is a linear search
// through the linked-hash-map struct). One of Miller's primary reasons for
// existing is to allow string-keyed access, but, sometimes people really
// want to access the nth field in a record. These are also 1-up.
//
// Why 1-up indices?
//
// * This is an odd choice in the broader programming-language context.
// There are a few languages such as Fortran, Julia, Matlab, and R which are
// 1-up but the overall trend is decidedly toward 0-up. This means that
// if Miller does 1-up it should do so for a good reason.
//
// * Reasons: so many other things are already 1-up, mostly inherited
// from AWK:
//
// o NR and FNR, the record-counter context variables start with 1.
// When we retain records, '@records[NR] = $*' is the natural thing
// to write, and the empty 0 slot would be a perpetual nuiscance.
//
// o Field indices $[1], ..., $[NF] match AWK as well as NIDX format.
// The 1-up indexing for NIDX format was in turn devised specifically
// to match behavior of Unix tools like cut, sort, and so on -- all
// of which are 1-up.
//
// o No zero index: AWK uses $0 like Miller uses $*, to refer
// to the entire record. In Miller, 0 is never a valid array index but one
// can use string(index) to get insertion-ordered hash maps:
// "0" is as valid a key as "1" or "100".
//
// Why integer indices at all?
//
// * AWK indices are always stringified and "arrays" are always associative,
// i.e. hashmaps. The Miller DSL is, essentially, a name-indexed AWK-ish
// processor. And, Miller versions up to Miller 5 stringified int indices
// awkishly.
//
// * But this is a new era; JSON is now widespread; people want arrays
// (per se) in their JSON files to be passed through as such.
//
// Naming conventions:
//
// * All userspace indexing is done in this file, ultimately through
// the UnaliasArrayIndex() function.
//
// * Outside of this file I simply say 'index'.
//
// * Inside this file I say 'zindex' for the 0-up Go indices and 'mindex'
// for 1-up Miller indices.
//
// ================================================================
package mlrval
import (
"errors"
"fmt"
"os"
"strconv"
"github.com/johnkerl/miller/v6/pkg/lib"
)
// ================================================================
// TODO: copy-reduction refactor
func (mv *Mlrval) ArrayGet(mindex *Mlrval) Mlrval {
if !mv.IsArray() {
return *FromNotArrayError("array [] base", mv)
}
if !mindex.IsInt() {
return *FromNotIntError("array [] index", mindex)
}
arrayval := mv.intf.([]*Mlrval)
value := arrayGetAliased(&arrayval, int(mindex.intf.(int64)))
if value == nil {
return *ABSENT
} else {
return *value
}
}
// ----------------------------------------------------------------
// TODO: make this return error so caller can do 'if err == nil { ... }'
func (mv *Mlrval) ArrayPut(mindex *Mlrval, value *Mlrval) {
if !mv.IsArray() {
fmt.Fprintf(
os.Stderr,
"mlr: expected array as indexed item in ArrayPut; got %s\n",
mv.GetTypeName(),
)
os.Exit(1)
}
if !mindex.IsInt() {
// TODO: need to be careful about semantics here.
// Silent no-ops are not good UX ...
fmt.Fprintf(
os.Stderr,
"mlr: expected int as index item ArrayPut; got %s\n",
mindex.GetTypeName(),
)
os.Exit(1)
}
arrayval := mv.intf.([]*Mlrval)
ok := arrayPutAliased(&arrayval, int(mindex.intf.(int64)), value)
if !ok {
fmt.Fprintf(
os.Stderr,
"mlr: array index %d out of bounds %d..%d\n",
mindex.intf.(int64), 1, len(arrayval),
)
os.Exit(1)
}
mv.intf = arrayval
}
// ----------------------------------------------------------------
func arrayGetAliased(array *[]*Mlrval, mindex int) *Mlrval {
zindex, ok := UnaliasArrayIndex(array, mindex)
if ok {
return (*array)[zindex]
} else {
return nil
}
}
func arrayPutAliased(array *[]*Mlrval, mindex int, value *Mlrval) bool {
zindex, ok := UnaliasArrayIndex(array, mindex)
if ok {
(*array)[zindex] = value.Copy()
return true
} else {
return false
}
}
func UnaliasArrayIndex(array *[]*Mlrval, mindex int) (int, bool) {
n := int(len(*array))
return UnaliasArrayLengthIndex(n, mindex)
}
// Input "mindex" is a Miller DSL array index. These are 1-up, so 1..n where n
// is the length of the array. Also, -n..-1 are aliases to 1..n. 0 is never a
// valid index.
//
// Output "zindex" is a Golang array index. These are 0-up, so 0..(n-1).
//
// The second return value indicates whether the Miller index is in-bounds.
// Even if it's out of bounds, while the second return value is false, the
// first return is correctly de-aliased. E.g. if the array has length 5 and the
// mindex is 8, zindex is 7 and valid=false. This is so in array-slice
// operations like 'v = myarray[2:8]' the callsite can hand back slots 2-5 of
// the array (which is the same way Python handles beyond-the-end indexing).
// Examples with n = 5:
//
// mindex zindex ok
// -7 -2 false
// -6 -1 false
// -5 0 true
// -4 1 true
// -3 2 true
// -2 3 true
// -1 4 true
// 0 -1 false
// 1 0 true
// 2 1 true
// 3 2 true
// 4 3 true
// 5 4 true
// 6 5 false
// 7 6 false
func UnaliasArrayLengthIndex(n int, mindex int) (int, bool) {
if 1 <= mindex {
zindex := mindex - 1
if mindex <= n { // in bounds
return zindex, true
} else { // out of bounds
return zindex, false
}
} else if mindex <= -1 {
zindex := mindex + n
if -n <= mindex { // in bounds
return zindex, true
} else { // out of bounds
return zindex, false
}
} else {
// mindex is 0
return -1, false
}
}
// ----------------------------------------------------------------
// TODO: thinking about capacity-resizing
func (mv *Mlrval) ArrayAppend(value *Mlrval) {
if !mv.IsArray() {
// TODO: need to be careful about semantics here.
// Silent no-ops are not good UX ...
return
}
mv.intf = append(mv.intf.([]*Mlrval), value)
}
// ================================================================
func (mv *Mlrval) MapGet(key *Mlrval) Mlrval {
if !mv.IsMap() {
return *FromNotMapError("map[]", mv)
}
mval, err := mv.intf.(*Mlrmap).GetWithMlrvalIndex(key)
if err != nil {
return *FromError(err)
}
if mval == nil {
return *ABSENT
}
// This returns a reference, not a (deep) copy. In general in Miller, we
// copy only on write/put.
return *mval
}
// ----------------------------------------------------------------
func (mv *Mlrval) MapPut(key *Mlrval, value *Mlrval) {
if !mv.IsMap() {
// TODO: need to be careful about semantics here.
// Silent no-ops are not good UX ...
return
}
if key.IsString() {
mv.intf.(*Mlrmap).PutCopy(key.printrep, value)
} else if key.IsInt() {
mv.intf.(*Mlrmap).PutCopy(key.String(), value)
}
// TODO: need to be careful about semantics here.
// Silent no-ops are not good UX ...
}
// ----------------------------------------------------------------
// This is a multi-level map/array put.
//
// E.g. '$name[1]["foo"] = "bar"' or '$*["foo"][1] = "bar"' In the former case
// the indices are ["name", 1, "foo"] and in the latter case the indices are
// ["foo", 1]. See also indexed-lvalues.md.
//
// There is auto-create for not-yet-populated levels, so for example if '@a' is
// already of type map, then on assignment of '@a["b"][2]["c"] = "d"' we'll
// create a map key "b", pointing to an array whose slot 2 is a map from "c" to
// "d".
//
// * If it's a map-type mlrval then:
//
// o Strings are map keys.
// o Integers are stringified, then interpreted as map keys.
//
// * If it's an array-type mlrval then:
//
// o Integers are array indices
// o We auto-create/auto-lengthen/auto-deepen.
// o If '@foo[]' has indices 0..3, then on '@foo[4] = "new"' we lengthen
// the array by one.
// o If '@foo[]' has indices 0..3, then on '@foo[6] = "new"'
// we lengthen the array and absent-fill the intervenings.
// o If '@foo["bar"]' does not exist, then on '@foo["bar"][0] = "new"'
// we create an array and populate the 0th slot.
// o If '@foo["bar"]' does not exist, then on '@foo["bar"][6] = "new"'
// we create an array and populate the 6th slot, and absent-fill
// the intervenings.
//
// * If this is a non-collection mlrval like string/int/float/etc. then
// it's non-indexable.
//
// See also indexed-lvalues.md.
func (mv *Mlrval) PutIndexed(indices []*Mlrval, rvalue *Mlrval) error {
lib.InternalCodingErrorIf(len(indices) < 1)
if mv.IsMap() {
return putIndexedOnMap(mv.intf.(*Mlrmap), indices, rvalue)
} else if mv.IsArray() {
arrayval := mv.intf.([]*Mlrval)
retval := putIndexedOnArray(&arrayval, indices, rvalue)
mv.intf = arrayval
return retval
} else {
baseIndex := indices[0]
if baseIndex.IsString() {
*mv = *FromEmptyMap()
return putIndexedOnMap(mv.intf.(*Mlrmap), indices, rvalue)
} else if baseIndex.IsInt() {
*mv = *FromEmptyArray()
arrayval := mv.intf.([]*Mlrval)
retval := putIndexedOnArray(&arrayval, indices, rvalue)
mv.intf = arrayval
return retval
} else {
return errors.New(
"mlr: only maps and arrays are indexable; got " + mv.GetTypeName(),
)
}
}
}
// ----------------------------------------------------------------
// Helper function for Mlrval.PutIndexed, for mlrvals of map type.
func putIndexedOnMap(baseMap *Mlrmap, indices []*Mlrval, rvalue *Mlrval) error {
numIndices := len(indices)
if numIndices == 0 {
// E.g. mlr put '$* = {"a":1, "b":2}'
if !rvalue.IsMap() {
return errors.New(
"Cannot assign non-map to existing map; got " +
rvalue.GetTypeName() +
".",
)
}
*baseMap = *rvalue.intf.(*Mlrmap).Copy()
return nil
}
baseIndex := indices[0]
// If last index, then assign.
if numIndices == 1 {
// E.g. mlr put '$*["a"] = 3'
return baseMap.PutCopyWithMlrvalIndex(baseIndex, rvalue)
}
// If not last index, then recurse.
if !baseIndex.IsString() && !baseIndex.IsInt() {
// Base is map, index is invalid type
return errors.New(
"mlr: map indices must be string, int, or array thereof; got " + baseIndex.GetTypeName(),
)
}
baseValue := baseMap.Get(baseIndex.String())
if baseValue == nil {
// Create a new level in order to recurse from
nextIndex := indices[1]
var err error = nil
baseValue, err = NewMlrvalForAutoDeepen(nextIndex.Type())
if err != nil {
return err
}
baseMap.PutReference(baseIndex.String(), baseValue)
}
return baseValue.PutIndexed(indices[1:], rvalue)
}
// ----------------------------------------------------------------
// Helper function for Mlrval.PutIndexed, for mlrvals of array type.
func putIndexedOnArray(
baseArray *[]*Mlrval,
indices []*Mlrval,
rvalue *Mlrval,
) error {
numIndices := len(indices)
lib.InternalCodingErrorIf(numIndices < 1)
mindex := indices[0]
if !mindex.IsInt() {
return errors.New(
"Array index must be int, but was " +
mindex.GetTypeName() +
".",
)
}
zindex, inBounds := UnaliasArrayIndex(baseArray, int(mindex.intf.(int64)))
if numIndices == 1 {
// If last index, then assign.
if inBounds {
(*baseArray)[zindex] = rvalue.Copy()
} else if mindex.intf.(int64) == 0 {
return errors.New("mlr: zero indices are not supported. Indices are 1-up")
} else if mindex.intf.(int64) < 0 {
return errors.New("mlr: Cannot use negative indices to auto-lengthen arrays")
} else {
// Array is [a,b,c] with mindices 1,2,3. Length is 3. Zindices are 0,1,2.
// Given mindex is 4.
LengthenMlrvalArray(baseArray, int(mindex.intf.(int64)))
zindex := mindex.intf.(int64) - 1
(*baseArray)[zindex] = rvalue.Copy()
}
return nil
} else {
// More indices remain; recurse
if inBounds {
nextIndex := indices[1]
// Overwrite what's in this slot if it's the wrong type
if nextIndex.IsString() {
if !(*baseArray)[zindex].IsMap() {
(*baseArray)[zindex] = FromEmptyMap()
}
} else if nextIndex.IsInt() {
if !(*baseArray)[zindex].IsArray() {
(*baseArray)[zindex] = FromEmptyArray()
}
} else {
return errors.New(
"mlr: indices must be string, int, or array thereof; got " + nextIndex.GetTypeName(),
)
}
return (*baseArray)[zindex].PutIndexed(indices[1:], rvalue)
} else if mindex.intf.(int64) == 0 {
return errors.New("mlr: zero indices are not supported. Indices are 1-up")
} else if mindex.intf.(int64) < 0 {
return errors.New("mlr: Cannot use negative indices to auto-lengthen arrays")
} else {
// Already allocated but needs to be longer
LengthenMlrvalArray(baseArray, int(mindex.intf.(int64)))
zindex := mindex.intf.(int64) - 1
return (*baseArray)[zindex].PutIndexed(indices[1:], rvalue)
}
}
}
// ----------------------------------------------------------------
func (mv *Mlrval) RemoveIndexed(indices []*Mlrval) error {
lib.InternalCodingErrorIf(len(indices) < 1)
if mv.IsMap() {
return removeIndexedOnMap(mv.intf.(*Mlrmap), indices)
} else if mv.IsArray() {
arrayval := mv.intf.([]*Mlrval)
retval := removeIndexedOnArray(&arrayval, indices)
mv.intf = arrayval
return retval
} else {
return errors.New(
"mlr: cannot unset index variable which is neither map nor array",
)
}
}
// ----------------------------------------------------------------
// Helper function for Mlrval.RemoveIndexed, for mlrvals of map type.
func removeIndexedOnMap(baseMap *Mlrmap, indices []*Mlrval) error {
numIndices := len(indices)
lib.InternalCodingErrorIf(numIndices < 1)
baseIndex := indices[0]
// If last index, then unset.
if numIndices == 1 {
if baseIndex.IsString() || baseIndex.IsInt() {
baseMap.Remove(baseIndex.String())
return nil
} else {
return errors.New(
"mlr: map indices must be string, int, or array thereof; got " +
baseIndex.GetTypeName(),
)
}
}
// If not last index, then recurse.
if baseIndex.IsString() || baseIndex.IsInt() {
// Base is map, index is string
baseValue := baseMap.Get(baseIndex.String())
if baseValue != nil {
return baseValue.RemoveIndexed(indices[1:])
}
} else {
// Base is map, index is invalid type
return errors.New(
"mlr: map indices must be string, int, or array thereof; got " + baseIndex.GetTypeName(),
)
}
return nil
}
// ----------------------------------------------------------------
// Helper function for Mlrval.PutIndexed, for mlrvals of array type.
func removeIndexedOnArray(
baseArray *[]*Mlrval,
indices []*Mlrval,
) error {
numIndices := len(indices)
lib.InternalCodingErrorIf(numIndices < 1)
mindex := indices[0]
if !mindex.IsInt() {
return errors.New(
"Array index must be int, but was " +
mindex.GetTypeName() +
".",
)
}
zindex, inBounds := UnaliasArrayIndex(baseArray, int(mindex.intf.(int64)))
// If last index, then unset.
if numIndices == 1 {
if inBounds {
leftSlice := (*baseArray)[0:zindex]
rightSlice := (*baseArray)[zindex+1 : len((*baseArray))]
*baseArray = append(leftSlice, rightSlice...)
} else if mindex.intf.(int64) == 0 {
return errors.New("mlr: zero indices are not supported. Indices are 1-up")
} else {
// TODO: improve wording
return errors.New("mlr: array index out of bounds for unset")
}
} else {
// More indices remain; recurse
if inBounds {
return (*baseArray)[zindex].RemoveIndexed(indices[1:])
} else if mindex.intf.(int64) == 0 {
return errors.New("mlr: zero indices are not supported. Indices are 1-up")
} else {
// TODO: improve wording
return errors.New("mlr: array index out of bounds for unset")
}
}
return nil
}
// ----------------------------------------------------------------
// Nominally for TopKeeper
type BsearchMlrvalArrayFunc func(
array *[]*Mlrval,
size int64, // maybe less than len(array)
value *Mlrval,
) int64
func BsearchMlrvalArrayForDescendingInsert(
array *[]*Mlrval,
size int64, // maybe less than len(array)
value *Mlrval,
) int64 {
lo := int64(0)
hi := size - 1
mid := (hi + lo) / 2
var newmid int64
if size == 0 {
return 0
}
if GreaterThan(value, (*array)[0]) {
return 0
}
if LessThan(value, (*array)[hi]) {
return size
}
for lo < hi {
middleElement := (*array)[mid]
if Equals(value, middleElement) {
return mid
} else if GreaterThan(value, middleElement) {
hi = mid
newmid = (hi + lo) / 2
} else {
lo = mid
newmid = (hi + lo) / 2
}
if mid == newmid {
if GreaterThanOrEquals(value, (*array)[lo]) {
return lo
} else if GreaterThanOrEquals(value, (*array)[hi]) {
return hi
} else {
return hi + 1
}
}
mid = newmid
}
return lo
}
func BsearchMlrvalArrayForAscendingInsert(
array *[]*Mlrval,
size int64, // maybe less than len(array)
value *Mlrval,
) int64 {
lo := int64(0)
hi := size - 1
mid := (hi + lo) / 2
var newmid int64
if size == 0 {
return 0
}
if LessThan(value, (*array)[0]) {
return 0
}
if GreaterThan(value, (*array)[hi]) {
return size
}
for lo < hi {
middleElement := (*array)[mid]
if Equals(value, middleElement) {
return mid
} else if LessThan(value, middleElement) {
hi = mid
newmid = (hi + lo) / 2
} else {
lo = mid
newmid = (hi + lo) / 2
}
if mid == newmid {
if LessThanOrEquals(value, (*array)[lo]) {
return lo
} else if LessThanOrEquals(value, (*array)[hi]) {
return hi
} else {
return hi + 1
}
}
mid = newmid
}
return lo
}
// NewMlrvalForAutoDeepen is for auto-deepen of nested maps in things like
//
// $foo[1]["a"][2]["b"] = 3
//
// Autocreated levels are maps. Array levels can be explicitly created e.g.
//
// $foo[1]["a"] ??= []
// $foo[1]["a"][2]["b"] = 3
func NewMlrvalForAutoDeepen(mvtype MVType) (*Mlrval, error) {
if mvtype == MT_STRING || mvtype == MT_INT {
empty := FromEmptyMap()
return empty, nil
} else {
return nil, errors.New(
"mlr: indices must be string, int, or array thereof; got " + GetTypeName(mvtype),
)
}
}
func (mv *Mlrval) Arrayify() *Mlrval {
if mv.IsMap() {
if mv.intf.(*Mlrmap).IsEmpty() {
return mv
}
convertible := true
i := 0
for pe := mv.intf.(*Mlrmap).Head; pe != nil; pe = pe.Next {
sval := strconv.Itoa(i + 1) // Miller user-space indices are 1-up
i++
if pe.Key != sval {
convertible = false
}
pe.Value = pe.Value.Arrayify()
}
if convertible {
arrayval := make([]*Mlrval, mv.intf.(*Mlrmap).FieldCount)
i := 0
for pe := mv.intf.(*Mlrmap).Head; pe != nil; pe = pe.Next {
arrayval[i] = pe.Value.Copy()
i++
}
return FromArray(arrayval)
} else {
return mv
}
} else if mv.IsArray() {
// TODO: comment (or rethink) that this modifies its inputs!!
output := mv.Copy()
arrayval := mv.intf.([]*Mlrval)
for i := range arrayval {
arrayval[i] = arrayval[i].Arrayify()
}
mv.intf = arrayval
return output
} else {
return mv
}
}
func LengthenMlrvalArray(array *[]*Mlrval, newLength64 int) {
newLength := int(newLength64)
lib.InternalCodingErrorIf(newLength <= len(*array))
if newLength <= cap(*array) {
newArray := (*array)[:newLength]
for zindex := len(*array); zindex < newLength; zindex++ {
// TODO: comment why not MT_ABSENT or MT_VOID
newArray[zindex] = NULL
}
*array = newArray
} else {
newArray := make([]*Mlrval, newLength, 2*newLength)
zindex := 0
for zindex = 0; zindex < len(*array); zindex++ {
newArray[zindex] = (*array)[zindex]
}
for zindex = len(*array); zindex < newLength; zindex++ {
// TODO: comment why not MT_ABSENT or MT_VOID
newArray[zindex] = NULL
}
*array = newArray
}
}
// ArrayFold reduces an array to a single value, with a user-supplied starting value and pairwise
// element-reducer function. Canonical example: start value is 0 and reducer f(a,b) is a+b: this
// will sum up the values in the array.
func ArrayFold(
a []*Mlrval,
initval *Mlrval,
f func(a, b *Mlrval) *Mlrval,
) *Mlrval {
acc := initval
for _, e := range a {
acc = f(acc, e)
}
return acc
}
// MapFold reduces a map's values to a single value, with a user-supplied starting value and
// pairwise element-reducer function. Canonical example: start value is 0 and reducer f(a,b) is a+b:
// this will sum up the values in the map. Nothing here accesses map keys.
func MapFold(
m *Mlrmap,
initval *Mlrval,
f func(a, b *Mlrval) *Mlrval,
) *Mlrval {
acc := initval
for pe := m.Head; pe != nil; pe = pe.Next {
acc = f(acc, pe.Value)
}
return acc
}
// CollectionFold multiplexes ArrayFold or MapFold. The panic here is not robust, but is done to
// avoid adding an error-return that would frictionalize the API. The idea is that the caller
// (internal/library functions, not directly user-facing) must have pre-validated that the argument
// is an array or map. The panic here is merely a fallback, not the primary check.
func CollectionFold(
c *Mlrval,
initval *Mlrval,
f func(a, b *Mlrval) *Mlrval,
) *Mlrval {
if c.IsArray() {
return ArrayFold(c.AcquireArrayValue(), initval, f)
} else if c.IsMap() {
return MapFold(c.AcquireMapValue(), initval, f)
} else {
panic("CollectionFold argument is neither array nor map")
}
}