miller/pkg/mlrval/mlrmap.go
John Kerl fb1f7f8421
Enable record-hashing by default (#1507)
* Enable record-hashing by default

* comments
2024-02-25 21:51:41 -05:00

155 lines
4.9 KiB
Go

// ================================================================
// ORDERED MAP FROM STRING TO MLRVAL
//
// This is an implementation of insertion-ordered key-value pairs for Miller's
// fundamental record data structure. It's also an ordered-map data structure,
// suitable for Miller JSON decode/encode.
//
// ----------------------------------------------------------------
// DESIGN
//
// * It keeps a doubly-linked list of key-value pairs.
//
// * With hash-records set to false, no hash functions are computed when the map
// is written to or read from.
//
// * Gets are implemented by sequential scan through the list: given a key,
// the key-value pairs are scanned through until a match is (or is not) found.
//
// * Performance improvement of 25% percent over hash-mapping from key to entry
// was found in the Go implementation. Test data was million-line CSV and
// DKVP, with a dozen columns or so.
//
// * However, with higher column-count (see https://github.com/johnkerl/miller/issues/1506
// and https://github.com/johnkerl/miller/pull/1507), non-hashing becomes
// a substantial penalty.
//
// Note however that an auxiliary constructor is provided which does use
// a key-to-entry hashmap in place of linear search for get/put/has/delete.
// This may be useful in certain contexts, even though it's not the default
// chosen for stream-records.
//
// ----------------------------------------------------------------
// MOTIVATION
//
// * The use case for records in Miller is that *all* fields are read from
// strings & written to strings (split/join), while only *some* fields are
// operated on.
//
// * Meanwhile there are few repeated accesses to a given record: the
// access-to-construct ratio is quite low for Miller data records. Miller
// instantiates thousands, millions, billions of records (depending on the
// input data) but accesses each record only once per transforming operation.
// (This is in contrast to accumulator hashmaps which are repeatedly accessed
// during a stats run.)
//
// * The hashed impl computes hashsums for *all* fields whether operated on or not,
// for the benefit of the *few* fields looked up during the transforming operation.
//
// * The hashless impl only keeps string pointers. Lookups are done at runtime
// doing prefix search on the key names. Assuming field names are distinct,
// this is just a few char-ptr accesses which (in experiments) turn out to
// offer about a 10-15% performance improvement.
//
// * Added benefit: the field-rename operation (preserving field order) becomes
// trivial.
// ================================================================
package mlrval
// As noted above, hashing has a minor penalty for low column count: computing
// hashmaps takes more time than is saved later on. But for higher column-count,
// non-hashing has a huge penalty. Therefore we default to on. And users can
// use `mlr --no-hash-records` or `mlr --hash-records` to flip the behavior.
var hashRecords = true
func HashRecords(onOff bool) {
hashRecords = onOff
}
// ----------------------------------------------------------------
type Mlrmap struct {
FieldCount int64
Head *MlrmapEntry
Tail *MlrmapEntry
// This can be nil if hashRecords is off.
keysToEntries map[string]*MlrmapEntry
}
type MlrmapEntry struct {
Key string
Value *Mlrval
Prev *MlrmapEntry
Next *MlrmapEntry
}
// MlrmapEntryForArray is for use by sorting routines where the Prev/Next pointers
// are irrelevant as well as ephemeral
type MlrmapEntryForArray struct {
Key string
Value *Mlrval
}
// Only used for sorting, map-to-pairs-array and pairs-array-to-map contexts.
type MlrmapPair struct {
Key string
Value *Mlrval
}
// ----------------------------------------------------------------
func NewMlrmapAsRecord() *Mlrmap {
if hashRecords {
return newMlrmapHashed()
} else {
return newMlrmapUnhashed()
}
}
func NewMlrmap() *Mlrmap {
return newMlrmapHashed()
}
// Faster on record-stream data as noted above.
func newMlrmapUnhashed() *Mlrmap {
return &Mlrmap{
FieldCount: 0,
Head: nil,
Tail: nil,
keysToEntries: nil,
}
}
// Intended for use in DSL expressions wherein the access-to-construct ratio
// might be higher (although this needs profiling over a variety of use-cases).
func newMlrmapHashed() *Mlrmap {
return &Mlrmap{
FieldCount: 0,
Head: nil,
Tail: nil,
keysToEntries: make(map[string]*MlrmapEntry),
}
}
func NewMlrmapMaybeHashed(wantHashing bool) *Mlrmap {
if wantHashing {
return newMlrmapHashed()
} else {
return newMlrmapUnhashed()
}
}
func (mlrmap *Mlrmap) isHashed() bool {
return mlrmap.keysToEntries != nil
}
// ----------------------------------------------------------------
// Value-copy is up to the caller -- PutReference and PutCopy
// are in the public Mlrmap API.
func newMlrmapEntry(key string, value *Mlrval) *MlrmapEntry {
return &MlrmapEntry{
key,
value,
nil,
nil,
}
}