// ================================================================
// ORDERED MAP FROM STRING TO MLRVAL
//
// This is an implementation of insertion-ordered key-value pairs for Miller's
// fundamental record data structure. It's also an ordered-map data structure,
// suitable for Miller JSON decode/encode.
//
// ----------------------------------------------------------------
// DESIGN
//
// * It keeps a doubly-linked list of key-value pairs.
//
// * With hash-records set to false, no hash functions are computed when the
//   map is written to or read from.
//
// * Gets are implemented by sequential scan through the list: given a key,
//   the key-value pairs are scanned until a match is (or is not) found.
//   (An illustrative sketch of both lookup paths appears below, after
//   isHashed.)
//
// * A performance improvement of about 25% over hash-mapping from key to
//   entry was found in the Go implementation. Test data was million-line CSV
//   and DKVP files with a dozen or so columns.
//
// * However, at higher column counts (see
//   https://github.com/johnkerl/miller/issues/1506 and
//   https://github.com/johnkerl/miller/pull/1507), non-hashing incurs a
//   substantial penalty.
//
// Note, however, that an auxiliary constructor is provided which does use a
// key-to-entry hashmap in place of linear search for get/put/has/delete.
// This may be useful in certain contexts, even though it's not the default
// chosen for stream records.
//
// ----------------------------------------------------------------
// MOTIVATION
//
// * The use case for records in Miller is that *all* fields are read from
//   strings and written to strings (split/join), while only *some* fields
//   are operated on.
//
// * Meanwhile, there are few repeated accesses to a given record: the
//   access-to-construct ratio is quite low for Miller data records. Miller
//   instantiates thousands, millions, or billions of records (depending on
//   the input data) but accesses each record only once per transforming
//   operation. (This is in contrast to accumulator hashmaps, which are
//   repeatedly accessed during a stats run.)
//
// * The hashed implementation computes hashsums for *all* fields, whether
//   operated on or not, for the benefit of the *few* fields looked up during
//   the transforming operation.
//
// * The hashless implementation keeps only string pointers. Lookups are done
//   at runtime by prefix search on the key names. Assuming field names are
//   distinct, this is just a few char-pointer accesses, which (in
//   experiments) turn out to offer about a 10-15% performance improvement.
//
// * Added benefit: the field-rename operation (preserving field order)
//   becomes trivial.
// ================================================================

package mlrval

// As noted above, hashing has a minor penalty at low column counts: computing
// hashmaps takes more time than is saved later on. But at higher column
// counts, non-hashing carries a much larger penalty. Therefore hashing
// defaults to on, and users can flip the behavior with
// `mlr --no-hash-records` or `mlr --hash-records`.
var hashRecords = true

func HashRecords(onOff bool) {
	hashRecords = onOff
}
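// exampleToggleHashing is an illustrative usage sketch only -- the name is
// hypothetical and this function is not part of Miller's API. It shows how
// flipping hashRecords changes which constructor NewMlrmapAsRecord (below)
// delegates to.
func exampleToggleHashing() {
	HashRecords(false)
	unhashed := NewMlrmapAsRecord() // keysToEntries is nil: linear-scan lookups

	HashRecords(true) // the default
	hashed := NewMlrmapAsRecord() // keysToEntries is a non-nil map

	_, _ = unhashed, hashed
}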
// ----------------------------------------------------------------
type Mlrmap struct {
	FieldCount int64
	Head       *MlrmapEntry
	Tail       *MlrmapEntry

	// This can be nil if hashRecords is off.
	keysToEntries map[string]*MlrmapEntry
}

type MlrmapEntry struct {
	Key   string
	Value *Mlrval
	Prev  *MlrmapEntry
	Next  *MlrmapEntry
}

// MlrmapEntryForArray is for use by sorting routines, where the Prev/Next
// pointers are irrelevant and would be ephemeral anyway.
type MlrmapEntryForArray struct {
	Key   string
	Value *Mlrval
}

// Only used in sorting, map-to-pairs-array, and pairs-array-to-map contexts.
type MlrmapPair struct {
	Key   string
	Value *Mlrval
}

// ----------------------------------------------------------------
func NewMlrmapAsRecord() *Mlrmap {
	if hashRecords {
		return newMlrmapHashed()
	} else {
		return newMlrmapUnhashed()
	}
}

func NewMlrmap() *Mlrmap {
	return newMlrmapHashed()
}

// Faster on record-stream data, as noted above.
func newMlrmapUnhashed() *Mlrmap {
	return &Mlrmap{
		FieldCount:    0,
		Head:          nil,
		Tail:          nil,
		keysToEntries: nil,
	}
}

// Intended for use in DSL expressions, wherein the access-to-construct ratio
// might be higher (although this needs profiling over a variety of use cases).
func newMlrmapHashed() *Mlrmap {
	return &Mlrmap{
		FieldCount:    0,
		Head:          nil,
		Tail:          nil,
		keysToEntries: make(map[string]*MlrmapEntry),
	}
}

func NewMlrmapMaybeHashed(wantHashing bool) *Mlrmap {
	if wantHashing {
		return newMlrmapHashed()
	} else {
		return newMlrmapUnhashed()
	}
}

func (mlrmap *Mlrmap) isHashed() bool {
	return mlrmap.keysToEntries != nil
}
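// sketchLookupEntry is an illustrative sketch -- the name is hypothetical,
// and this is not Miller's actual accessor code (the real get/put/has/delete
// operations are not shown in this file). It shows the two lookup strategies
// described in the DESIGN notes above: a single hashmap access when
// keysToEntries is present, and a sequential scan of the doubly-linked list
// when it is not.
func sketchLookupEntry(mlrmap *Mlrmap, key string) *MlrmapEntry {
	if mlrmap.isHashed() {
		// Hashed case: one map lookup, paid for by hashing every key when
		// the record was constructed.
		return mlrmap.keysToEntries[key]
	}
	// Unhashed case: walk the list until the key matches, or until the list
	// is exhausted.
	for pe := mlrmap.Head; pe != nil; pe = pe.Next {
		if pe.Key == key {
			return pe
		}
	}
	return nil
}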
// ----------------------------------------------------------------
// Value-copy is up to the caller -- PutReference and PutCopy
// are in the public Mlrmap API.
func newMlrmapEntry(key string, value *Mlrval) *MlrmapEntry {
	return &MlrmapEntry{
		Key:   key,
		Value: value,
		Prev:  nil,
		Next:  nil,
	}
}
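// sketchAppendEntry is likewise an illustrative sketch with a hypothetical
// name, not Miller's actual put code. It shows how a new entry would be
// linked onto the tail of the doubly-linked list, and how keysToEntries is
// maintained only when the map is hashed. For brevity it ignores the case
// where the key is already present in the map.
func sketchAppendEntry(mlrmap *Mlrmap, key string, value *Mlrval) {
	pe := newMlrmapEntry(key, value)
	if mlrmap.Tail == nil {
		// Empty map: the new entry is both head and tail.
		mlrmap.Head = pe
		mlrmap.Tail = pe
	} else {
		// Non-empty map: link the new entry after the current tail.
		pe.Prev = mlrmap.Tail
		mlrmap.Tail.Next = pe
		mlrmap.Tail = pe
	}
	if mlrmap.keysToEntries != nil { // hashed case: also index the new entry
		mlrmap.keysToEntries[key] = pe
	}
	mlrmap.FieldCount++
}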