Export library code in pkg/ (#1391)

* Export library code in `pkg/`

* new doc page
This commit is contained in:
John Kerl 2023-09-10 17:15:13 -04:00 committed by GitHub
parent 93b7c8eac0
commit 268a96d002
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
358 changed files with 1076 additions and 693 deletions

1
pkg/lib/README.md Normal file
View file

@ -0,0 +1 @@
These are basic library routines for Miller.

2
pkg/lib/doc.go Normal file
View file

@ -0,0 +1,2 @@
// Package lib contains basic library routines for Miller.
package lib

3
pkg/lib/docurl.go Normal file
View file

@ -0,0 +1,3 @@
package lib
const DOC_URL = "https://miller.readthedocs.io"

322
pkg/lib/file_readers.go Normal file
View file

@ -0,0 +1,322 @@
// ================================================================
// Wrapper for os.Open which maps string filename to *os.File, which in turn
// implements io.ReadCloser, and optional in turn wrapping that in a
// gzip/zlib/bunzip2 reader. Shared across record-readers for all the various
// input-file formats (CSV, JSON, XTAB, DKVP, NIDX, PPRINT) which Miller
// supports.
//
// There are two ways of handling compressed data in the Miller Go port:
//
// * A user-specified 'prepipe' command such as 'gunzip', where we popen a
// process, hand it the filename via '< filename', and read from that pipe;
//
// * An indication to use an in-process encoding reader (gzip or bzip2, etc).
//
// If a prepipe is specified, it is used; else if an encoding is specified, it
// is used; otherwise the file suffix (.bz2, .gz, .z) is consulted; otherwise
// the file is treated as text.
// ================================================================
package lib
import (
	"bytes"
	"compress/bzip2"
	"compress/gzip"
	"compress/zlib"
	"errors"
	"fmt"
	"io"
	"net/http"
	"os"
	"strings"

	"github.com/klauspost/compress/zstd"
)
// TFileInputEncoding enumerates the in-process decompression schemes Miller
// can apply to input files.
type TFileInputEncoding int

const (
	// FileInputEncodingDefault means no encoding was requested explicitly:
	// infer one from the filename suffix, else read the file as plain text.
	FileInputEncodingDefault TFileInputEncoding = iota
	FileInputEncodingBzip2
	FileInputEncodingGzip
	FileInputEncodingZlib
	FileInputEncodingZstd
)
// OpenFileForRead: if prepipe is non-empty, popens "{prepipe} < {filename}"
// and returns a handle to that, where prepipe is nominally something like
// "gunzip", "cat", etc. Otherwise, delegates to an in-process reader which
// can natively handle gzip/bzip2/zlib depending on the specified encoding. If
// the encoding isn't a compression encoding, this ends up being simply
// os.Open.
func OpenFileForRead(
	filename string,
	prepipe string,
	prepipeIsRaw bool,
	encoding TFileInputEncoding, // ignored if prepipe is non-empty
) (io.ReadCloser, error) {
	if prepipe != "" {
		return openPrepipedHandleForRead(filename, prepipe, prepipeIsRaw)
	}
	handle, err := PathToHandle(filename)
	if err != nil {
		return nil, err
	}
	return openEncodedHandleForRead(handle, encoding, filename)
}
// PathToHandle maps various back-ends to a stream. As of 2021-07-07, the
// following URI schemes are supported:
// * https://... and http://...
// * file://...
// * plain disk files
func PathToHandle(
path string,
) (io.ReadCloser, error) {
if strings.HasPrefix(path, "http://") || strings.HasPrefix(path, "https://") {
resp, err := http.Get(path)
if err != nil {
return nil, err
}
handle := resp.Body
return handle, err
} else if strings.HasPrefix(path, "file://") {
return os.Open(strings.Replace(path, "file://", "", 1))
} else {
return os.Open(path)
}
}
// OpenStdin: if prepipe is non-empty, popens "{prepipe}" and returns a handle
// to that, where prepipe is nominally something like "gunzip", "cat", etc.
// Otherwise, delegates to an in-process reader which can natively handle
// gzip/bzip2/zlib depending on the specified encoding. If the encoding isn't
// a compression encoding, this ends up being simply os.Stdin.
func OpenStdin(
	prepipe string,
	prepipeIsRaw bool,
	encoding TFileInputEncoding, // ignored if prepipe is non-empty
) (io.ReadCloser, error) {
	if prepipe == "" {
		return openEncodedHandleForRead(os.Stdin, encoding, "")
	}
	// Empty filename means the prepipe command reads from our stdin.
	return openPrepipedHandleForRead("", prepipe, prepipeIsRaw)
}
func openPrepipedHandleForRead(
filename string,
prepipe string,
prepipeIsRaw bool,
) (io.ReadCloser, error) {
escapedFilename := escapeFileNameForPopen(filename)
var command string
if filename == "" { // stdin
command = prepipe
} else {
if prepipeIsRaw {
command = prepipe + " " + escapedFilename
} else {
command = prepipe + " < " + escapedFilename
}
}
return OpenInboundHalfPipe(command)
}
// escapeFileNameForPopen reduces shell-injection exposure when a filename is
// interpolated into a popen command line: each single-quote or double-quote
// character in the filename is itself wrapped in single-quotes, and if any
// quote was found the entire result is additionally wrapped in single-quotes.
//
// Bug fix: foundQuote was declared but never set, so the outer single-quote
// wrapping never happened.
//
// TODO: test on Windows. Maybe needs move to pkg/platform.
func escapeFileNameForPopen(filename string) string {
	var buffer bytes.Buffer
	foundQuote := false
	for _, c := range filename {
		if c == '\'' || c == '"' {
			foundQuote = true
			buffer.WriteRune('\'')
			buffer.WriteRune(c)
			buffer.WriteRune('\'')
		} else {
			buffer.WriteRune(c)
		}
	}
	if foundQuote {
		return "'" + buffer.String() + "'"
	}
	return buffer.String()
}
// openEncodedHandleForRead wraps an already-opened handle in the decompressor
// selected by the encoding flag. Given the default encoding, it falls back to
// inference from the filename suffix; with no recognized suffix the handle is
// returned as-is (plain text).
func openEncodedHandleForRead(
	handle io.ReadCloser,
	encoding TFileInputEncoding,
	filename string,
) (io.ReadCloser, error) {
	switch encoding {
	case FileInputEncodingBzip2:
		return NewBZip2ReadCloser(handle), nil
	case FileInputEncodingGzip:
		return gzip.NewReader(handle)
	case FileInputEncodingZlib:
		return zlib.NewReader(handle)
	case FileInputEncodingZstd:
		return NewZstdReadCloser(handle)
	}

	// Reaching here with anything but the default encoding is a coding error.
	InternalCodingErrorIf(encoding != FileInputEncodingDefault)

	switch {
	case strings.HasSuffix(filename, ".bz2"):
		return NewBZip2ReadCloser(handle), nil
	case strings.HasSuffix(filename, ".gz"):
		return gzip.NewReader(handle)
	case strings.HasSuffix(filename, ".z"):
		return zlib.NewReader(handle)
	case strings.HasSuffix(filename, ".zst"):
		return NewZstdReadCloser(handle)
	}

	// Pass along os.Stdin or os.Open(filename)
	return handle, nil
}
// ----------------------------------------------------------------
// BZip2ReadCloser remedies the fact that bzip2.NewReader does not implement io.ReadCloser.
type BZip2ReadCloser struct {
originalHandle io.ReadCloser
bzip2Handle io.Reader
}
func NewBZip2ReadCloser(handle io.ReadCloser) *BZip2ReadCloser {
return &BZip2ReadCloser{
originalHandle: handle,
bzip2Handle: bzip2.NewReader(handle),
}
}
func (rc *BZip2ReadCloser) Read(p []byte) (n int, err error) {
return rc.bzip2Handle.Read(p)
}
func (rc *BZip2ReadCloser) Close() error {
return rc.originalHandle.Close()
}
// ----------------------------------------------------------------
// ZstdReadCloser adapts zstd.NewReader -- which does not implement
// io.ReadCloser -- so that reads decompress while Close releases the
// underlying handle.

type ZstdReadCloser struct {
	originalHandle io.ReadCloser
	zstdHandle     io.Reader
}

// NewZstdReadCloser wraps an open handle with a zstd decompressor; the error
// is whatever zstd.NewReader reports.
func NewZstdReadCloser(handle io.ReadCloser) (*ZstdReadCloser, error) {
	decoder, err := zstd.NewReader(handle)
	if err != nil {
		return nil, err
	}
	rc := &ZstdReadCloser{
		originalHandle: handle,
		zstdHandle:     decoder,
	}
	return rc, nil
}

// Read decompresses from the wrapped stream.
func (rc *ZstdReadCloser) Read(p []byte) (n int, err error) {
	return rc.zstdHandle.Read(p)
}

// Close closes the underlying (pre-decompression) handle.
func (rc *ZstdReadCloser) Close() error {
	return rc.originalHandle.Close()
}
// ----------------------------------------------------------------
// IsEOF handles the following problem: reading past end of files opened with
// os.Open returns the error which is io.EOF. Reading past close of pipes
// opened with popen (e.g. Miller's prepipe, where the file isn't 'foo.dat'
// but rather the process 'gunzip < foo.dat |') returns not io.EOF but an error
// with 'file already closed' within it. See also
// https://stackoverflow.com/questions/47486128/why-does-io-pipe-continue-to-block-even-when-eof-is-reached
func IsEOF(err error) bool {
if err == nil {
return false
} else if err == io.EOF {
return true
} else if strings.Contains(err.Error(), "file already closed") {
return true
} else {
return false
}
}
// ----------------------------------------------------------------
// Functions for in-place mode

// IsUpdateableInPlace tells if we can use the input with mlr -I: not for URLs,
// and not for prepipe commands (which we don't presume to know how to invert
// for output). Returns nil when in-place update is possible.
func IsUpdateableInPlace(
	filename string,
	prepipe string,
) error {
	for _, scheme := range []string{"http://", "https://", "file://"} {
		if strings.HasPrefix(filename, scheme) {
			return fmt.Errorf("http://, https://, and file:// URLs are not updateable in place.")
		}
	}
	if prepipe != "" {
		return fmt.Errorf("input with --prepipe or --prepipex is not updateable in place.")
	}
	return nil
}
// FindInputEncoding determines the input encoding (compression), whether from
// a flag like --gzin, or from filename suffix like ".gz". If the user did
// --gzin on the command line, TFileInputEncoding will be
// FileInputEncodingGzip. If they didn't, but the filename ends in ".gz", then
// we auto-infer FileInputEncodingGzip. Either way, this function tells if we
// will be using in-process decompression within the file-format-specific
// record reader.
func FindInputEncoding(
	filename string,
	inputFileInputEncoding TFileInputEncoding,
) TFileInputEncoding {
	// An explicit flag wins over suffix inference.
	if inputFileInputEncoding != FileInputEncodingDefault {
		return inputFileInputEncoding
	}
	switch {
	case strings.HasSuffix(filename, ".bz2"):
		return FileInputEncodingBzip2
	case strings.HasSuffix(filename, ".gz"):
		return FileInputEncodingGzip
	case strings.HasSuffix(filename, ".z"):
		return FileInputEncodingZlib
	case strings.HasSuffix(filename, ".zst"):
		// Fix: ".zst" was inferred by openEncodedHandleForRead but omitted
		// here, so zstd files were mis-detected as plain text by this path.
		return FileInputEncodingZstd
	}
	return FileInputEncodingDefault
}
// WrapOutputHandle wraps a file-write handle with a compressor matching the
// detected input encoding, so that mlr -I writes back in the same format it
// read. The first return value is the (possibly wrapped) handle. The second
// is true if the returned handle needs to be closed separately from the
// original. The third flags compression we cannot produce in-process: as of
// September 2021 the gzip and zlib libraries support write-closers, but the
// bzip2 library does not.
func WrapOutputHandle(
	fileWriteHandle io.WriteCloser,
	inputFileEncoding TFileInputEncoding,
) (io.WriteCloser, bool, error) {
	switch inputFileEncoding {
	case FileInputEncodingGzip:
		return gzip.NewWriter(fileWriteHandle), true, nil
	case FileInputEncodingZlib:
		return zlib.NewWriter(fileWriteHandle), true, nil
	case FileInputEncodingBzip2:
		return fileWriteHandle, false, fmt.Errorf("bzip2 is not currently supported for in-place mode.")
	default:
		// No compression: pass the original handle straight through.
		return fileWriteHandle, false, nil
	}
}

43
pkg/lib/getoptify.go Normal file
View file

@ -0,0 +1,43 @@
package lib
import (
"regexp"
"strings"
)
// Getoptify expands "-xyz" into "-x -y -z" while leaving "--xyz" intact. This
// is a keystroke-saver for the user.
//
// This is OK to do here globally since Miller is quite consistent (in main,
// verbs, auxents, and terminals) that multi-character options start with two
// dashes, e.g. "--csv". (The sole exception is the sort verb's -nf/-nr which
// are handled specially there.)
//
// Additionally, we split "--foo=bar" into "--foo" and "bar".
func Getoptify(inargs []string) []string {
	shortClusterRegex := regexp.MustCompile("^-[a-zA-Z0-9]+$")
	longEqualsRegex := regexp.MustCompile("^--[^=]+=.+$")
	negativeNumberRegex := regexp.MustCompile("^-[0-9]+$")
	outargs := make([]string, 0)

	for _, arg := range inargs {
		switch {
		case shortClusterRegex.MatchString(arg):
			if negativeNumberRegex.MatchString(arg) {
				// Don't expand things like '-12345' which are (likely!)
				// numeric arguments to verbs. Example:
				// 'mlr unsparsify --fill-with -99999'.
				outargs = append(outargs, arg)
				break
			}
			for _, c := range arg[1:] {
				outargs = append(outargs, "-"+string(c))
			}
		case longEqualsRegex.MatchString(arg):
			pair := strings.SplitN(arg, "=", 2)
			InternalCodingErrorIf(len(pair) != 2)
			outargs = append(outargs, pair[0], pair[1])
		default:
			outargs = append(outargs, arg)
		}
	}
	return outargs
}

88
pkg/lib/halfpipe.go Normal file
View file

@ -0,0 +1,88 @@
package lib
import (
"fmt"
"os"
"github.com/johnkerl/miller/pkg/platform"
)
// OpenOutboundHalfPipe returns a handle to a process. Writing to that handle
// writes to the process' stdin. The process' stdout and stderr are the current
// process' stdout and stderr.
//
// This is for pipe-output-redirection in the Miller put/filter DSL.
//
// Note I am not using os.exec.Cmd which is billed as being simpler than using
// os.StartProcess. It may indeed be simpler when you want to handle the
// subprocess' stdin/stdout/stderr all three within the parent process. Here I
// found it much easier to use os.StartProcess to let the stdout/stderr run
// free.
func OpenOutboundHalfPipe(commandString string) (*os.File, error) {
	readPipe, writePipe, err := os.Pipe()
	// Bug fix: the error from os.Pipe was previously ignored.
	if err != nil {
		return nil, err
	}

	// The child reads our pipe's read end as its stdin; its stdout/stderr are
	// passed through to ours.
	var procAttr os.ProcAttr
	procAttr.Files = []*os.File{
		readPipe,
		os.Stdout,
		os.Stderr,
	}

	// /bin/sh -c "..." or cmd /c "..."
	shellRunArray := platform.GetShellRunArray(commandString)

	process, err := os.StartProcess(shellRunArray[0], shellRunArray, &procAttr)
	if err != nil {
		// Bug fix: don't leak the pipe fds when the process can't be started.
		readPipe.Close()
		writePipe.Close()
		return nil, err
	}

	// Reap the child when it exits so it doesn't linger as a zombie.
	go process.Wait()
	return writePipe, nil
}
// OpenInboundHalfPipe returns a handle to a process. Reading from that handle
// reads from the process' stdout. The process' stdin and stderr are the
// current process' stdin and stderr.
//
// This is for the Miller prepipe feature.
//
// Note I am not using os.exec.Cmd which is billed as being simpler than using
// os.StartProcess. It may indeed be simpler when you want to handle the
// subprocess' stdin/stdout/stderr all three within the parent process. Here I
// found it much easier to use os.StartProcess to let the stdin/stderr run
// free.
func OpenInboundHalfPipe(commandString string) (*os.File, error) {
	readPipe, writePipe, err := os.Pipe()
	// Bug fix: the error from os.Pipe was previously ignored.
	if err != nil {
		return nil, err
	}

	// The child writes our pipe's write end as its stdout; its stdin/stderr
	// are passed through to ours.
	var procAttr os.ProcAttr
	procAttr.Files = []*os.File{
		os.Stdin,
		writePipe,
		os.Stderr,
	}

	// /bin/sh -c "..." or cmd /c "..."
	shellRunArray := platform.GetShellRunArray(commandString)

	process, err := os.StartProcess(shellRunArray[0], shellRunArray, &procAttr)
	if err != nil {
		// Bug fix: don't leak the pipe fds when the process can't be started.
		readPipe.Close()
		writePipe.Close()
		return nil, err
	}

	// Wait for the child, then close the read end so blocked readers unblock
	// instead of hanging forever. See
	// https://stackoverflow.com/questions/47486128/why-does-io-pipe-continue-to-block-even-when-eof-is-reached
	// NOTE(review): the parent's copy of writePipe is never closed here --
	// presumably the close-after-Wait below is the intended workaround;
	// confirm before changing.
	go func(process *os.Process, readPipe *os.File) {
		_, err := process.Wait()
		if err != nil {
			fmt.Fprintf(os.Stderr, "%s: %v\n", "mlr", err)
		}
		readPipe.Close()
	}(process, readPipe)
	return readPipe, nil
}

38
pkg/lib/latin1.go Normal file
View file

@ -0,0 +1,38 @@
package lib
import (
"bytes"
"fmt"
"unicode/utf8"
)
// TryLatin1ToUTF8 re-encodes a Latin-1 string as UTF-8. Latin-1 bytes
// 0x00-0xff coincide with Unicode code points U+0000-U+00FF, so each input
// byte maps directly to one rune. The error return is always nil; it is kept
// for signature symmetry with TryUTF8ToLatin1.
func TryLatin1ToUTF8(input string) (string, error) {
	var out bytes.Buffer
	for i := 0; i < len(input); i++ {
		out.WriteRune(rune(input[i]))
	}
	return out.String(), nil
}
// TryUTF8ToLatin1 re-encodes a UTF-8 string as Latin-1, failing on any code
// point above U+00FF (including invalid UTF-8, which decodes to U+FFFD).
func TryUTF8ToLatin1(input string) (string, error) {
	var out bytes.Buffer
	remaining := []byte(input)
	for len(remaining) > 0 {
		r, size := utf8.DecodeRune(remaining)
		// Code points U+0000-U+00FF are exactly the Latin-1 byte values.
		if r > 0x00ff {
			return "", fmt.Errorf("character 0x%08x (%v) is not encodable as Latin-1", int(r), r)
		}
		out.WriteByte(byte(r))
		remaining = remaining[size:]
	}
	return out.String(), nil
}

100
pkg/lib/latin1_test.go Normal file
View file

@ -0,0 +1,100 @@
// ================================================================
// Most Miller tests (thousands of them) are command-line-driven via
// mlr regtest. Here are some cases needing special focus.
// ================================================================
package lib
import (
"github.com/stretchr/testify/assert"
"testing"
)
// tDataForLatin1 holds one encode/decode test case.
type tDataForLatin1 struct {
	input string // value passed to the converter under test
	expectedOutput string // expected result; "" when an error is expected
	expectError bool // whether the conversion should return a non-nil error
}
// Latin-1 -> UTF-8 cases. This direction is total (every Latin-1 byte has a
// Unicode code point) so no case expects an error.
var dataForLatin1ToUTF8 = []tDataForLatin1{
	{
		input:          "",
		expectedOutput: "",
	},
	{
		input:          "The quick brown fox jumped over the lazy dogs.",
		expectedOutput: "The quick brown fox jumped over the lazy dogs.",
	},
	{
		// "aäoö" -- showing explicitly here "\u00e4" encodes as "\xc3\xa4"
		input:          "a\xe4o\xf6",
		expectedOutput: "a\xc3\xa4o\xc3\xb6",
	},
	{
		input:          "Victor jagt zw\xf6lf Boxk\xe4mpfer quer \xfcber den gro\xdfen Sylter Deich",
		expectedOutput: "Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich",
	},
}

// UTF-8 -> Latin-1 cases. Code points above U+00FF (e.g. Cyrillic) cannot be
// encoded, so the final case expects an error.
var dataForUTF8ToLatin1 = []tDataForLatin1{
	{
		input:          "",
		expectedOutput: "",
	},
	{
		input:          "The quick brown fox jumped over the lazy dogs.",
		expectedOutput: "The quick brown fox jumped over the lazy dogs.",
	},
	{
		// "aäoö" -- showing explicitly here "\u00e4" encodes as "\xc3\xa4"
		input:          "a\xc3\xa4o\xc3\xb6",
		expectedOutput: "a\xe4o\xf6",
	},
	{
		input:          "Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich",
		expectedOutput: "Victor jagt zw\xf6lf Boxk\xe4mpfer quer \xfcber den gro\xdfen Sylter Deich",
	},
	{
		input:          "Съешь же ещё этих мягких французских булок да выпей чаю",
		expectedOutput: "",
		expectError:    true,
	},
}
// TestLatin1ToUTF8 drives TryLatin1ToUTF8 over the table above.
func TestLatin1ToUTF8(t *testing.T) {
	for i, tc := range dataForLatin1ToUTF8 {
		got, err := TryLatin1ToUTF8(tc.input)
		if tc.expectError {
			assert.NotNil(t, err)
		} else {
			assert.Nil(t, err)
		}
		if got != tc.expectedOutput {
			t.Fatalf("case %d input \"%s\" expected \"%s\" got \"%s\"\n",
				i, tc.input, tc.expectedOutput, got,
			)
		}
	}
}
// TestUTF8ToLatin1 drives TryUTF8ToLatin1 over the table above.
func TestUTF8ToLatin1(t *testing.T) {
	for i, tc := range dataForUTF8ToLatin1 {
		got, err := TryUTF8ToLatin1(tc.input)
		if tc.expectError {
			assert.NotNil(t, err)
		} else {
			assert.Nil(t, err)
		}
		if got != tc.expectedOutput {
			t.Fatalf("case %d input \"%s\" expected \"%s\" got \"%s\"\n",
				i, tc.input, tc.expectedOutput, got,
			)
		}
	}
}

110
pkg/lib/logger.go Normal file
View file

@ -0,0 +1,110 @@
package lib
import (
"fmt"
"os"
"path"
"runtime"
)
// InternalCodingErrorIf is a lookalike for C's __FILE__ and __LINE__ printing,
// with exit 1 if the condition is true.
func InternalCodingErrorIf(condition bool) {
	if !condition {
		return
	}
	// runtime.Caller(1) reports our caller's file/line -- i.e. the site of
	// the failed invariant.
	_, fileName, fileLine, ok := runtime.Caller(1)
	if ok {
		// Base name rather than full path: the full path would break diffs on
		// regression-test actual-vs-expected stderr comparison on expect-fail
		// cases.
		fmt.Fprintf(os.Stderr, "Internal coding error detected at file %s line %d\n",
			path.Base(fileName), fileLine)
	} else {
		fmt.Fprintf(os.Stderr, "Internal coding error detected at file %s line %s\n",
			"(unknown)", "(unknown)")
	}
	// Set this env var and re-run to get a stack trace showing the call-tree
	// that led to the indicated file/line.
	if os.Getenv("MLR_PANIC_ON_INTERNAL_ERROR") != "" {
		panic("Here is the stack trace")
	}
	os.Exit(1)
}
// InternalCodingErrorWithMessageIf is a lookalike for C's __FILE__ and
// __LINE__ printing, with exit 1 if the condition is true.
func InternalCodingErrorWithMessageIf(condition bool, message string) {
	if !condition {
		return
	}
	// runtime.Caller(1) reports our caller's file/line -- the failing check.
	_, fileName, fileLine, ok := runtime.Caller(1)
	if ok {
		fmt.Fprintf(os.Stderr, "Internal coding error detected at file %s line %d: %s\n",
			path.Base(fileName), fileLine, message)
	} else {
		fmt.Fprintf(os.Stderr, "Internal coding error detected at file %s line %s: %s\n",
			"(unknown)", "(unknown)", message)
	}
	// Set this env var and re-run to get a stack trace showing the call-tree
	// that led to the indicated file/line.
	if os.Getenv("MLR_PANIC_ON_INTERNAL_ERROR") != "" {
		panic("Here is the stack trace")
	}
	os.Exit(1)
}
// InternalCodingErrorPanic is like InternalCodingErrorIf, except that it
// panics the process (for stack trace, which is usually not desired), and
// that it requires the if-test to be at the caller.
func InternalCodingErrorPanic(message string) {
	// runtime.Caller(1) reports our caller's file/line.
	_, fileName, fileLine, ok := runtime.Caller(1)
	var text string
	if ok {
		text = fmt.Sprintf(
			"Internal coding error detected at file %s line %d: %s\n",
			path.Base(fileName), fileLine, message,
		)
	} else {
		text = fmt.Sprintf(
			"Internal coding error detected at file %s line %s: %s\n",
			"(unknown)", "(unknown)", message,
		)
	}
	panic(text)
}
// WhereAreWe prints a stack trace (file and line per frame) from the current
// callsite to stdout.
func WhereAreWe() {
	// Start at depth 1, not 0, since this function itself is not of interest;
	// 20 frames is plenty.
	for depth := 1; depth < 20; depth++ {
		_, file, line, ok := runtime.Caller(depth)
		if !ok {
			break
		}
		fmt.Printf("  %s %d\n", file, line)
	}
}

430
pkg/lib/mlrmath.go Normal file
View file

@ -0,0 +1,430 @@
// ================================================================
// Non-mlrval math routines
// ================================================================
package lib
import (
"fmt"
"math"
"os"
)
// ----------------------------------------------------------------
// Some wrappers around things which aren't one-liners from math.*.

// Sgn returns +1.0, -1.0, or 0.0 according to the sign of a. NaN input falls
// through all three comparisons and maps to NaN.
func Sgn(a float64) float64 {
	switch {
	case a > 0:
		return 1.0
	case a < 0:
		return -1.0
	case a == 0:
		return 0.0
	}
	return math.NaN()
}
// Qnorm is the normal cumulative distribution function, expressed in terms of
// the erfc library function (which is awkward, but exists).
func Qnorm(x float64) float64 {
	z := -x / math.Sqrt2
	return 0.5 * math.Erfc(z)
}

// Invqnorm inverts Qnorm via a tangent-following method not unlike
// Newton-Raphson:
// * We can compute qnorm(y) = integral from -infinity to y of (1/sqrt(2pi)) exp(-t^2/2) dt.
// * We can compute derivative of qnorm(y) = (1/sqrt(2pi)) exp(-y^2/2).
// * We cannot explicitly compute invqnorm(y).
// * If dx/dy = (1/sqrt(2pi)) exp(-y^2/2) then dy/dx = sqrt(2pi) exp(y^2/2).
//
// This means we *can* compute the derivative of invqnorm even though we
// can't compute the function itself. So the essence of the method is to
// follow the tangent line to form successive approximations: we have known function input x
// and unknown function output y and initial guess y0. At each step we find the intersection
// of the tangent line at y_n with the vertical line at x, to find y_{n+1}. Specifically:
//
// * Even though we can't compute y = q^-1(x) we can compute x = q(y).
// * Start with initial guess for y (y0 = 0.0 or y0 = x both are OK).
// * Find x = q(y). Since q (and therefore q^-1) are 1-1, we're done if qnorm(invqnorm(x)) is small.
// * Else iterate: using point-slope form, (y_{n+1} - y_n) / (x_{n+1} - x_n) = m = sqrt(2pi) exp(y_n^2/2).
//   Here x_2 = x (the input) and x_1 = q(y_1).
// * Solve for y_{n+1} and repeat.

const INVQNORM_TOL float64 = 1e-9
const INVQNORM_MAXITER int = 30

func Invqnorm(x float64) float64 {
	// Domain guard: outside (0, 1) there is no finite answer; the contract
	// here is to return 0.
	if x <= 0.0 || x >= 1.0 {
		return 0.0
	}

	// Initial approximation is linear. Starting with y = 0.0 works just as well.
	y := x - 0.5
	for niter := 0; ; niter++ {
		backx := Qnorm(y)
		if math.Abs(x-backx) < INVQNORM_TOL {
			return y
		}
		if niter > INVQNORM_MAXITER {
			fmt.Fprintf(os.Stderr,
				"mlr: internal coding error: max iterations %d exceeded in invqnorm.\n",
				INVQNORM_MAXITER,
			)
			os.Exit(1)
		}
		// Tangent slope dy/dx = sqrt(2pi) exp(y^2/2); follow it to the
		// vertical line at x.
		slope := math.Sqrt2 * math.SqrtPi * math.Exp(y*y/2.0)
		y += slope * (x - backx)
	}
}
const JACOBI_TOLERANCE = 1e-12
const JACOBI_MAXITER = 20

// ----------------------------------------------------------------
// Jacobi real-symmetric eigensolver. Loosely adapted from Numerical Recipes.
//
// Note: this is coded for n=2 (to implement PCA linear regression on 2
// variables) but the algorithm is quite general. Changing from 2 to n is a
// matter of updating the top and bottom of the function: function signature to
// take double** matrix, double* eigenvector_1, double* eigenvector_2, and n;
// create copy-matrix and make-identity matrix functions; free temp matrices at
// the end; etc.

// GetRealSymmetricEigensystem diagonalizes the 2x2 input via repeated
// rotations: L is driven toward diagonal form (its diagonal then holds the
// eigenvalues) while V accumulates the rotations (its columns are the
// eigenvectors). Outputs are ordered so eigenvalue1 has the larger magnitude.
// Non-convergence after JACOBI_MAXITER sweeps is fatal (exit 1).
func GetRealSymmetricEigensystem(
	matrix [2][2]float64,
) (
	eigenvalue1 float64, // Output: dominant eigenvalue
	eigenvalue2 float64, // Output: less-dominant eigenvalue
	eigenvector1 [2]float64, // Output: corresponding to dominant eigenvalue
	eigenvector2 [2]float64, // Output: corresponding to less-dominant eigenvalue
) {
	// Working copy of the input; mutated in place by the rotations below.
	L := [2][2]float64{
		{matrix[0][0], matrix[0][1]},
		{matrix[1][0], matrix[1][1]},
	}
	// V starts as the identity and accumulates the product of all rotations.
	V := [2][2]float64{
		{1.0, 0.0},
		{0.0, 1.0},
	}
	var P, PT_A [2][2]float64
	n := 2
	found := false
	for iter := 0; iter < JACOBI_MAXITER; iter++ {
		// Convergence test: magnitude of the below-diagonal entries.
		sum := 0.0
		for i := 1; i < n; i++ {
			for j := 0; j < i; j++ {
				sum += math.Abs(L[i][j])
			}
		}
		if math.Abs(sum*sum) < JACOBI_TOLERANCE {
			found = true
			break
		}
		// One sweep: for each strictly-upper (p,q) pair, build a rotation P
		// chosen to annihilate the (p,q) off-diagonal entry, then apply it as
		// a similarity transform.
		for p := 0; p < n; p++ {
			for q := p + 1; q < n; q++ {
				numer := L[p][p] - L[q][q]
				denom := L[p][q] + L[q][p]
				// Off-diagonal already (near) zero: nothing to rotate away.
				if math.Abs(denom) < JACOBI_TOLERANCE {
					continue
				}
				theta := numer / denom
				signTheta := 1.0
				if theta < 0 {
					signTheta = -1.0
				}
				// t, c, s: tangent, cosine, sine of the rotation angle.
				t := signTheta / (math.Abs(theta) + math.Sqrt(theta*theta+1))
				c := 1.0 / math.Sqrt(t*t+1)
				s := t * c
				// P = identity except for the (p,q) rotation block.
				for pi := 0; pi < n; pi++ {
					for pj := 0; pj < n; pj++ {
						if pi == pj {
							P[pi][pj] = 1.0
						} else {
							P[pi][pj] = 0.0
						}
					}
				}
				P[p][p] = c
				P[p][q] = -s
				P[q][p] = s
				P[q][q] = c
				// L = P.transpose() * L * P
				// V = V * P
				matmul2t(&PT_A, &P, &L)
				matmul2(&L, &PT_A, &P)
				matmul2(&V, &V, &P)
			}
		}
	}
	if !found {
		fmt.Fprintf(os.Stderr,
			"%s: Jacobi eigensolver: max iterations (%d) exceeded. Non-symmetric input?\n",
			"mlr",
			JACOBI_MAXITER,
		)
		os.Exit(1)
	}
	// Diagonal of the (now nearly diagonal) L holds the eigenvalues.
	eigenvalue1 = L[0][0]
	eigenvalue2 = L[1][1]
	// Order outputs so eigenvalue1 is the one with larger magnitude; the
	// eigenvectors are the corresponding columns of V.
	abs1 := math.Abs(eigenvalue1)
	abs2 := math.Abs(eigenvalue2)
	if abs1 > abs2 {
		eigenvector1[0] = V[0][0] // Column 0 of V
		eigenvector1[1] = V[1][0]
		eigenvector2[0] = V[0][1] // Column 1 of V
		eigenvector2[1] = V[1][1]
	} else {
		eigenvalue1, eigenvalue2 = eigenvalue2, eigenvalue1
		eigenvector1[0] = V[0][1]
		eigenvector1[1] = V[1][1]
		eigenvector2[0] = V[0][0]
		eigenvector2[1] = V[1][0]
	}
	return eigenvalue1, eigenvalue2, eigenvector1, eigenvector2
}
// matmul2 computes C = A * B for 2x2 matrices. C may alias A and/or B.
func matmul2(
	C *[2][2]float64, // Output
	A *[2][2]float64, // Input
	B *[2][2]float64, // Input
) {
	// Accumulate into a temporary so C may share storage with A and/or B.
	var T [2][2]float64
	for i := range T {
		for j := range T[i] {
			for k := 0; k < 2; k++ {
				T[i][j] += A[i][k] * B[k][j]
			}
		}
	}
	*C = T
}
// matmul2t computes C = A^t * B for 2x2 matrices. C may alias A and/or B.
func matmul2t(
	C *[2][2]float64, // Output
	A *[2][2]float64, // Input
	B *[2][2]float64, // Input
) {
	// Accumulate into a temporary so C may share storage with A and/or B.
	var T [2][2]float64
	for i := range T {
		for j := range T[i] {
			for k := 0; k < 2; k++ {
				T[i][j] += A[k][i] * B[k][j]
			}
		}
	}
	*C = T
}
// ================================================================
// Logistic regression
//
// Real-valued x_0 .. x_{N-1}
// 0/1-valued y_0 .. y_{N-1}
// Model p(x_i == 1) as
// p(x, m, b) = 1 / (1 + exp(-m*x-b)
// which is the same as
// log(p/(1-p)) = m*x + b
// then
// p(x, m, b) = 1 / (1 + exp(-m*x-b)
// = exp(m*x+b) / (1 + exp(m*x+b)
// and
// 1-p = exp(-m*x-b) / (1 + exp(-m*x-b)
// = 1 / (1 + exp(m*x+b)
// Note for reference just below that
// dp/dm = -1 / [1 + exp(-m*x-b)]**2 * (-x) * exp(-m*x-b)
// = [x exp(-m*x-b)) ] / [1 + exp(-m*x-b)]**2
// = x * p * (1-p)
// and
// dp/db = -1 / [1 + exp(-m*x-b)]**2 * (-1) * exp(-m*x-b)
// = [exp(-m*x-b)) ] / [1 + exp(-m*x-b)]**2
// = p * (1-p)
// Write p_i for p(x_i, m, b)
//
// Maximum-likelihood equation:
// L(m, b) = prod_{i=0}^{N-1} [ p_i**y_i * (1-p_i)**(1-y_i) ]
//
// Log-likelihood equation:
// ell(m, b) = sum{i=0}^{N-1} [ y_i log(p_i) + (1-y_i) log(1-p_i) ]
// = sum{i=0}^{N-1} [ log(1-p_i) + y_i log(p_i/(1-p_i)) ]
// = sum{i=0}^{N-1} [ log(1-p_i) + y_i*(m*x_i+b) ]
// Differentiate with respect to parameters:
//
// d ell/dm = sum{i=0}^{N-1} [ -1/(1-p_i) dp_i/dm + x_i*y_i ]
// = sum{i=0}^{N-1} [ -1/(1-p_i) x_i*p_i*(1-p_i) + x_i*y_i ]
// = sum{i=0}^{N-1} [ x_i(y_i-p_i) ]
//
// d ell/db = sum{i=0}^{N-1} [ -1/(1-p_i) dp_i/db + y_i ]
// = sum{i=0}^{N-1} [ -1/(1-p_i) p_i*(1-p_i) + y_i ]
// = sum{i=0}^{N-1} [ y_i - p_i ]
//
//
// d2ell/dm2 = sum{i=0}^{N-1} [ -x_i dp_i/dm ]
// = sum{i=0}^{N-1} [ -x_i**2 * p_i * (1-p_i) ]
//
// d2ell/dmdb = sum{i=0}^{N-1} [ -x_i dp_i/db ]
// = sum{i=0}^{N-1} [ -x_i * p_i * (1-p_i) ]
//
// d2ell/dbdm = sum{i=0}^{N-1} [ -dp_i/dm ]
// = sum{i=0}^{N-1} [ -x_i * p_i * (1-p_i) ]
//
// d2ell/db2 = sum{i=0}^{N-1} [ -dp_i/db ]
// = sum{i=0}^{N-1} [ -p_i * (1-p_i) ]
//
// Newton-Raphson to minimize ell(m, b):
// * Pick m0, b0
// * [m_{j+1], b_{j+1}] = H^{-1} grad ell(m_j, b_j)
// * grad ell =
// [ d ell/dm ]
// [ d ell/db ]
// * H = Hessian of ell = Jacobian of grad ell =
// [ d2ell/dm2 d2ell/dmdb ]
// [ d2ell/dmdb d2ell/db2 ]
// lrp is p(x,m,b) for logistic regression: the modeled probability of label 1
// at feature value x with slope m and intercept b.
func lrp(x, m, b float64) float64 {
	e := math.Exp(-m*x - b)
	return 1.0 / (1.0 + e)
}
// lrq is 1 - p(x,m,b) for logistic regression.
func lrq(x, m, b float64) float64 {
	e := math.Exp(m*x + b)
	return 1.0 / (1.0 + e)
}
// LogisticRegression fits slope m and intercept b for the model
// p(x) = 1 / (1 + exp(-m*x-b)) given feature values xs and 0/1 labels ys,
// using Newton-Raphson with fixed starting point and stopping criteria.
func LogisticRegression(xs, ys []float64) (m, b float64) {
	const (
		initialM = -0.001
		initialB = 0.002
		tol      = 1e-9
		maxits   = 100
	)
	return logisticRegressionAux(xs, ys, initialM, initialB, tol, maxits)
}
// Supporting routine for mlr_logistic_regression():
//
// Newton-Raphson maximization of the log-likelihood ell(m, b) (see the long
// derivation comment above lrp), starting from (m0, b0). Iterates until the
// relative change in ell drops below tol; exceeding maxits is fatal (exit 1).
func logisticRegressionAux(
	xs, ys []float64,
	m0, b0, tol float64,
	maxits int,
) (m, b float64) {
	InternalCodingErrorIf(len(xs) != len(ys))
	n := len(xs)
	its := 0
	done := false
	m = m0
	b = b0
	for !done {
		// Compute derivatives
		dldm := 0.0
		dldb := 0.0
		d2ldm2 := 0.0
		d2ldmdb := 0.0
		d2ldb2 := 0.0
		ell0 := 0.0
		for i := 0; i < n; i++ {
			xi := xs[i]
			yi := ys[i]
			// p_i and 1-p_i at the current parameters (m0, b0).
			pi := lrp(xi, m0, b0)
			qi := lrq(xi, m0, b0)
			// Gradient terms: d ell/dm = sum x_i(y_i - p_i), d ell/db = sum (y_i - p_i).
			dldm += xi * (yi - pi)
			dldb += yi - pi
			// Hessian terms, built from p_i * (1-p_i).
			piqi := pi * qi
			xipiqi := xi * piqi
			xi2piqi := xi * xipiqi
			d2ldm2 -= xi2piqi
			d2ldmdb -= xipiqi
			d2ldb2 -= piqi
			// Log-likelihood at the current parameters, for the convergence test.
			ell0 += math.Log(qi) + yi*(m0*xi+b0)
		}
		// Form the Hessian
		ha := d2ldm2
		hb := d2ldmdb
		hc := d2ldmdb
		hd := d2ldb2
		// Invert the Hessian
		// NOTE(review): no guard against a singular Hessian (D == 0), which
		// would produce Inf/NaN updates -- confirm inputs preclude that.
		D := ha*hd - hb*hc
		Hinva := hd / D
		Hinvb := -hb / D
		Hinvc := -hc / D
		Hinvd := ha / D
		// Compute H^-1 times grad ell
		Hinvgradm := Hinva*dldm + Hinvb*dldb
		Hinvgradb := Hinvc*dldm + Hinvd*dldb
		// Update [m,b]
		m = m0 - Hinvgradm
		b = b0 - Hinvgradb
		// Log-likelihood at the updated parameters.
		ell := 0.0
		for i := 0; i < n; i++ {
			xi := xs[i]
			yi := ys[i]
			qi := lrq(xi, m, b)
			// NOTE(review): qi uses the updated (m, b) but the linear term
			// still uses (m0, b0); by the derivation above one would expect
			// yi*(m*xi+b) here. Possibly a latent bug -- confirm against the
			// original C implementation before changing.
			ell += math.Log(qi) + yi*(m0*xi+b0)
		}
		// Check for convergence
		dell := math.Max(ell, ell0)
		err := 0.0
		if dell != 0.0 {
			err = math.Abs(ell-ell0) / dell
		}
		if err < tol {
			done = true
		}
		its++
		if its > maxits {
			fmt.Fprintf(os.Stderr,
				"mlr_logistic_regression: Newton-Raphson convergence failed after %d iterations. m=%e, b=%e.\n",
				its, m, b)
			os.Exit(1)
		}
		// Next iteration linearizes around the point just reached.
		m0 = m
		b0 = b
	}
	return m, b
}

155
pkg/lib/ordered_map.go Normal file
View file

@ -0,0 +1,155 @@
// ================================================================
// ORDERED MAP FROM STRING TO INTERFACE{}
//
// Quite like types.OrderedMap but with string keys and interface{} values. See orderedMap.go for
// more information.
// ================================================================
package lib
// ----------------------------------------------------------------
type OrderedMap struct {
FieldCount int64
Head *orderedMapEntry
Tail *orderedMapEntry
keysToEntries map[string]*orderedMapEntry
}
type orderedMapEntry struct {
Key string
Value interface{}
Prev *orderedMapEntry
Next *orderedMapEntry
}
// ----------------------------------------------------------------
func NewOrderedMap() *OrderedMap {
return &OrderedMap{
FieldCount: 0,
Head: nil,
Tail: nil,
keysToEntries: make(map[string]*orderedMapEntry),
}
}
// ----------------------------------------------------------------
// Value-copy is up to the caller -- PutReference and PutCopy
// are in the public OrderedMap API.
func newOrderedMapEntry(key *string, value interface{}) *orderedMapEntry {
return &orderedMapEntry{
*key,
value,
nil,
nil,
}
}
// ----------------------------------------------------------------
func (omap *OrderedMap) IsEmpty() bool {
return omap.FieldCount == 0
}
func (omap *OrderedMap) Has(key string) bool {
return omap.findEntry(&key) != nil
}
func (omap *OrderedMap) findEntry(key *string) *orderedMapEntry {
if omap.keysToEntries != nil {
return omap.keysToEntries[*key]
} else {
for pe := omap.Head; pe != nil; pe = pe.Next {
if pe.Key == *key {
return pe
}
}
return nil
}
}
// ----------------------------------------------------------------
func (omap *OrderedMap) Put(key string, value interface{}) {
pe := omap.findEntry(&key)
if pe == nil {
pe = newOrderedMapEntry(&key, value)
if omap.Head == nil {
omap.Head = pe
omap.Tail = pe
} else {
pe.Prev = omap.Tail
pe.Next = nil
omap.Tail.Next = pe
omap.Tail = pe
}
if omap.keysToEntries != nil {
omap.keysToEntries[key] = pe
}
omap.FieldCount++
} else {
pe.Value = value
}
}
// ----------------------------------------------------------------
func (omap *OrderedMap) Get(key string) interface{} {
pe := omap.findEntry(&key)
if pe == nil {
return nil
} else {
return pe.Value
}
}
// The Get is sufficient for pointer values -- the caller can check if the
// return value is nil. For int/string values (which are non-nullable) we have
// this method.
// GetWithCheck returns the value for a key along with a presence flag, for
// callers storing non-nullable values where a nil from Get is ambiguous.
func (omap *OrderedMap) GetWithCheck(key string) (interface{}, bool) {
	if entry := omap.findEntry(&key); entry != nil {
		return entry.Value, true
	}
	return nil, false
}
// ----------------------------------------------------------------
// Clear removes all entries from the map.
func (omap *OrderedMap) Clear() {
	omap.FieldCount = 0
	omap.Head = nil
	omap.Tail = nil
	// Bug fix: the key-lookup index must be dropped too. Previously stale
	// entries survived here, so Has/Get kept returning pre-Clear values and
	// a Put of an old key updated an unlinked node, losing the new value.
	if omap.keysToEntries != nil {
		omap.keysToEntries = make(map[string]*orderedMapEntry)
	}
}
// ----------------------------------------------------------------
// Returns true if it was found and removed
// Remove deletes the entry for a key, returning true when it was present.
func (omap *OrderedMap) Remove(key string) bool {
	entry := omap.findEntry(&key)
	if entry == nil {
		return false
	}
	omap.unlink(entry)
	return true
}
// ----------------------------------------------------------------
// unlink detaches the given entry from the doubly-linked list, removes it
// from the key-lookup index (when present), and decrements the field count.
// The entry must currently be a member of this map's list.
func (omap *OrderedMap) unlink(pe *orderedMapEntry) {
	if pe == omap.Head {
		if pe == omap.Tail {
			// Sole entry: the list becomes empty.
			omap.Head = nil
			omap.Tail = nil
		} else {
			// Head but not tail: advance the head.
			omap.Head = pe.Next
			pe.Next.Prev = nil
		}
	} else {
		pe.Prev.Next = pe.Next
		if pe == omap.Tail {
			// Tail but not head: retreat the tail.
			omap.Tail = pe.Prev
		} else {
			// Interior entry: bridge the neighbors.
			pe.Next.Prev = pe.Prev
		}
	}
	if omap.keysToEntries != nil {
		delete(omap.keysToEntries, pe.Key)
	}
	omap.FieldCount--
}

71
pkg/lib/paragraph.go Normal file
View file

@ -0,0 +1,71 @@
package lib
import (
"bytes"
"fmt"
"strings"
)
// For online help contexts like printing all the built-in DSL functions, or
// the list of all verbs.
// PrintWordsAsParagraph prints the words to stdout, space-separated, wrapped
// at a nominal width of 80 columns.
func PrintWordsAsParagraph(words []string) {
	const separator = " "
	const maxlen = 80

	width := 0       // running width of the current output line
	wordsOnLine := 0 // words already printed on the current line
	for _, word := range words {
		width += len(separator) + len(word)
		if width >= maxlen {
			// Wrap: end the current line and start a fresh one.
			fmt.Printf("\n")
			width = len(separator) + len(word)
			wordsOnLine = 0
		}
		if wordsOnLine > 0 {
			fmt.Print(separator)
		}
		fmt.Print(word)
		wordsOnLine++
	}
	fmt.Printf("\n")
}
// For online help contexts like printing all the built-in DSL functions, or
// the list of all verbs. Max width is nominally 80.
// FormatAsParagraph splits the text into whitespace-delimited words and
// re-flows them into lines wrapped at the given maximum width, returning
// the lines (without trailing newlines).
func FormatAsParagraph(text string, maxWidth int) []string {
	const separator = " "
	words := strings.Fields(text)

	lines := make([]string, 0)
	var current strings.Builder
	width := 0       // running width of the line being built
	wordsOnLine := 0 // words already placed on the line being built
	for _, word := range words {
		width += len(separator) + len(word)
		if width >= maxWidth {
			// Wrap: flush the current line and start a fresh one.
			lines = append(lines, current.String())
			current.Reset()
			width = len(separator) + len(word)
			wordsOnLine = 0
		}
		if wordsOnLine > 0 {
			current.WriteString(separator)
		}
		current.WriteString(word)
		wordsOnLine++
	}
	if last := current.String(); last != "" {
		lines = append(lines, last)
	}
	return lines
}

42
pkg/lib/rand.go Normal file
View file

@ -0,0 +1,42 @@
// ================================================================
// Thinly wraps Go's rand library, with seed-function support
// ================================================================
package lib
import (
"math/rand"
"os"
"time"
)
// By default, Miller random numbers are different on every run.
// The seed mixes wall-clock nanoseconds with the PID so that two Miller
// processes started in the same instant still get different streams.
var defaultSeed = time.Now().UnixNano() ^ int64(os.Getpid())

// source and generator are package-level so SeedRandom can swap them out.
var source = rand.NewSource(defaultSeed)
var generator = rand.New(source)
// Users can request specific seeds if they want the same random-number
// sequence on each run.
func SeedRandom(seed int64) {
	// Replace both the source and the generator so that all subsequent
	// draws are reproducible from the given seed.
	src := rand.NewSource(seed)
	source = src
	generator = rand.New(src)
}
// RandFloat64 returns a pseudorandom float64 in [0.0, 1.0).
func RandFloat64() float64 {
	return generator.Float64()
}

// RandUint32 returns a pseudorandom uint32.
func RandUint32() uint32 {
	return generator.Uint32()
}

// RandInt63 returns a non-negative pseudorandom int64.
func RandInt63() int64 {
	return generator.Int63()
}
// RandRange returns a pseudorandom int64 in the half-open interval
// [lowInclusive, highExclusive), or lowInclusive itself when the interval
// is empty.
func RandRange(lowInclusive, highExclusive int64) int64 {
	span := highExclusive - lowInclusive
	if span == 0 {
		// Degenerate interval: avoid a modulo-by-zero below.
		return lowInclusive
	}
	return lowInclusive + generator.Int63()%span
}

90
pkg/lib/readfiles.go Normal file
View file

@ -0,0 +1,90 @@
// ================================================================
// Routines for loading strings from files. Nominally for the put/filter verbs
// to load DSL strings from .mlr files.
// ================================================================
package lib
import (
"io/ioutil"
"os"
"strings"
csv "github.com/johnkerl/miller/pkg/go-csv"
)
// LoadStringsFromFileOrDir calls LoadStringFromFile if path exists and is a
// file, or LoadStringsFromDir if path exists and is a directory. In the
// former case the extension is ignored; in the latter case it's used as a
// filter on the directory entries.
func LoadStringsFromFileOrDir(path string, extension string) ([]string, error) {
	info, err := os.Stat(path)
	if err != nil {
		return nil, err
	}
	if info.IsDir() {
		// Directory: load every file matching the extension filter.
		return LoadStringsFromDir(path, extension)
	}
	// Single file: the extension filter does not apply.
	dslString, err := LoadStringFromFile(path)
	if err != nil {
		return nil, err
	}
	return []string{dslString}, nil
}
// LoadStringFromFile is just a wrapper around ioutil.ReadFile,
// with a cast from []byte to string.
func LoadStringFromFile(filename string) (string, error) {
data, err := ioutil.ReadFile(filename)
if err != nil {
return "", err
}
return string(data), nil
}
// LoadStringsFromDir loads all file contents for files in the given directory
// having the given extension. E.g. LoadStringsFromDir("/u/myfiles", ".mlr")
// will load /u/myfiles/foo.mlr and /u/myfiles/bar.mlr but will skip over
// /u/myfiles/data.csv and /u/myfiles/todo.txt.
func LoadStringsFromDir(dirname string, extension string) ([]string, error) {
	entries, err := ioutil.ReadDir(dirname)
	if err != nil {
		return nil, err
	}
	dslStrings := make([]string, 0)
	for _, entry := range entries {
		name := entry.Name()
		if !strings.HasSuffix(name, extension) {
			// Skip directory entries not matching the extension filter.
			continue
		}
		dslString, err := LoadStringFromFile(dirname + "/" + name)
		if err != nil {
			return nil, err
		}
		dslStrings = append(dslStrings, dslString)
	}
	return dslStrings, nil
}
// ReadCSVHeader opens the named CSV file and returns its first record --
// the header line -- as a slice of field names.
func ReadCSVHeader(filename string) ([]string, error) {
	file, err := os.Open(filename)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	header, err := csv.NewReader(file).Read()
	if err != nil {
		return nil, err
	}
	return header, nil
}

386
pkg/lib/regex.go Normal file
View file

@ -0,0 +1,386 @@
// ================================================================
// Support for regexes in Miller.
//
// * By and large we use the Go library.
//
// * There is (for historical reasons) a DSL syntax "[a-z]"i (note the trailing i)
// for case-insensitive regular expressions which we map into Go syntax for
// regex-compilation.
//
// * Also for historical reasons, we allow things like
// if ($x =~ "(..)_(...)") {
// ... other lines of code ...
// $y = "\2:\1";
// }
// where the '=~' sets the captures and the "\2:\1" uses them. (Note that
// https://github.com/johnkerl/miller/issues/388 has a better suggestion
// which would make the captures explicit as variables, rather than implicit
// within CST state -- regardless, the current syntax will still be supported
// for backward compatibility and so is here to stay.) Here we make use of Go
// regexp-library functions to write to, and then later interpolate from, a
// captures array which is stored within CST state. (See the `runtime.State`
// object.)
//
// * "\0" is for a full match; "\1" .. "\9" are for submatch captures. E.g.
// if $x is "foobarbaz" and the regex is "foo(.)(..)baz", then "\0" is
// "foobarbaz", "\1" is "b", "\2" is "ar", and "\3".."\9" are "".
// ================================================================
package lib
import (
"bytes"
"fmt"
"os"
"regexp"
"strings"
)
// captureDetector is used to see if a string literal interpolates previous
// captures (like "\2:\1") or not (like "2:1"). Package-level so it is
// compiled exactly once.
var captureDetector = regexp.MustCompile(`\\[0-9]`)

// captureSplitter is used to precompute an offsets matrix for strings like
// "\2:\1" so they don't need to be recomputed on every record.
var captureSplitter = regexp.MustCompile(`(\\[0-9])`)
// CompileMillerRegex wraps Go regex-compile with some Miller-specific syntax
// which predate the port of Miller from C to Go. Miller regexes use a final
// 'i' to indicate case-insensitivity; Go regexes use an initial "(?i)".
//
// (See also mlr.bnf where we specify which things can be backslash-escaped
// without a syntax error at parse time.)
//
// * If the regex_string is of the form a.*b, compiles it case-sensitively.
// * If the regex_string is of the form "a.*b", compiles a.*b case-sensitively.
// * If the regex_string is of the form "a.*b"i, compiles a.*b case-insensitively.
func CompileMillerRegex(regexString string) (*regexp.Regexp, error) {
	n := len(regexString)
	if n < 2 {
		return regexp.Compile(regexString)
	}

	// TODO: rethink this. This will strip out things people have entered, e.g. "\"...\"".
	// The parser-to-AST will have stripped the outer and we'll strip the inner and the
	// user's intent will be lost.
	//
	// TODO: make separate functions for calling from parser-to-AST (string
	// literals) and from verbs (like cut -r or having-fields).

	// "a.*b" -> a.*b, case-sensitive.
	if strings.HasPrefix(regexString, "\"") && strings.HasSuffix(regexString, "\"") {
		return regexp.Compile(regexString[1 : n-1])
	}
	// /a.*b/ -> a.*b, case-sensitive.
	if strings.HasPrefix(regexString, "/") && strings.HasSuffix(regexString, "/") {
		return regexp.Compile(regexString[1 : n-1])
	}

	// The i-suffixed forms need an opening delimiter plus the two-character
	// closer, i.e. length >= 3. Bug fix: without this guard the two-character
	// inputs `"i` and `/i` sliced [1:0] and panicked.
	if n > 2 {
		// "a.*b"i -> a.*b, case-insensitive via Go's (?i) flag.
		if strings.HasPrefix(regexString, "\"") && strings.HasSuffix(regexString, "\"i") {
			return regexp.Compile("(?i)" + regexString[1:n-2])
		}
		// /a.*b/i -> a.*b, case-insensitive.
		if strings.HasPrefix(regexString, "/") && strings.HasSuffix(regexString, "/i") {
			return regexp.Compile("(?i)" + regexString[1:n-2])
		}
	}

	return regexp.Compile(regexString)
}
// CompileMillerRegexOrDie wraps CompileMillerRegex. Usually in Go we want to
// return a second error argument rather than fataling. However, if there's a
// malformed regex we really cannot continue so it's simpler to just fatal.
func CompileMillerRegexOrDie(regexString string) *regexp.Regexp {
	regex, err := CompileMillerRegex(regexString)
	if err == nil {
		return regex
	}
	// Malformed regex: we really cannot continue, so exit rather than
	// returning an error.
	fmt.Fprint(os.Stderr, err)
	os.Exit(1)
	return nil // not reached
}
// CompileMillerRegexesOrDie is a convenience looper over CompileMillerRegexOrDie.
func CompileMillerRegexesOrDie(regexStrings []string) []*regexp.Regexp {
	compiled := make([]*regexp.Regexp, len(regexStrings))
	for i := range regexStrings {
		compiled[i] = CompileMillerRegexOrDie(regexStrings[i])
	}
	return compiled
}
// In Go as in all languages I'm aware of with a string-split, "a,b,c" splits
// on "," to ["a", "b", "c"] and "a" splits to ["a"], both of which are fine --
// but "" splits to [""] when I wish it were []. This function does the latter.
func RegexSplitString(regex *regexp.Regexp, input string, n int) []string {
if input == "" {
return make([]string, 0)
} else {
return regex.Split(input, n)
}
}
// MakeEmptyRegexCaptures is for initial CST state at the start of executing
// the DSL expression for the current record. Even if '$x =~ "(..)_(...)" set
// "\1" and "\2" on the previous record, at start of processing for the current
// record we need to start with a clean slate.
// MakeEmptyRegexCaptures returns the no-captures-yet state, represented as
// a nil slice (interpolation treats nil as "nothing to substitute").
func MakeEmptyRegexCaptures() []string {
	return nil
}
// RegexReplacementHasCaptures is used by the CST builder to see if
// string-literal is like "foo bar" or "foo \1 bar" -- in the latter case it
// needs to retain the compiled offsets-matrix information.
func RegexReplacementHasCaptures(
	replacement string,
) (
	hasCaptures bool,
	matrix [][]int,
) {
	if !captureDetector.MatchString(replacement) {
		return false, nil
	}
	// Precompute the offsets of each "\0".."\9" within the replacement so
	// per-record interpolation need not re-scan the string.
	return true, captureSplitter.FindAllSubmatchIndex([]byte(replacement), -1)
}
// RegexMatches implements the =~ DSL operator. The captures are stored in DSL
// state and may be used by a DSL statement after the =~. For example, in
//
// sub($a, "(..)_(...)", "\1:\2")
//
// the replacement string is an argument to sub and therefore the captures are
// confined to the implementation of the sub function. Similarly for gsub. But
// for the match operator, people can do
//
// if ($x =~ "(..)_(...)") {
// ... other lines of code ...
// $y = "\2:\1"
// }
//
// and the =~ callsite doesn't know if captures will be used or not. So,
// RegexMatches always returns the captures array. It is stored within the CST
// state.
func RegexMatches(
	input string,
	sregex string,
) (
	matches bool,
	capturesOneUp []string,
) {
	// Compiles on every call; hot-loop callers should precompile and use
	// RegexMatchesCompiled instead.
	regex := CompileMillerRegexOrDie(sregex)
	return RegexMatchesCompiled(input, regex)
}
// RegexMatchesCompiled is the implementation for the =~ operator. Without
// Miller-style regex captures this would be a simple one-line
// regex.MatchString(input). However, we return the captures array for the
// benefit of subsequent references to "\0".."\9".
func RegexMatchesCompiled(
input string,
regex *regexp.Regexp,
) (bool, []string) {
matrix := regex.FindAllSubmatchIndex([]byte(input), -1)
if matrix == nil || len(matrix) == 0 {
// Set all captures to ""
return false, make([]string, 10)
}
// "\0" .. "\9"
captures := make([]string, 10)
// If there are multiple matches -- e.g. input is
//
// "...ab_cde...fg_hij..."
//
// with regex
//
// "(..)_(...)"
//
// -- then we only consider the first match: boolean return value is true
// (the input string matched the regex), and the captures array will map
// "\1" to "ab" and "\2" to "cde".
row := matrix[0]
n := len(row)
// Example return value from FindAllSubmatchIndex with input
// "...ab_cde...fg_hij..." and regex "(..)_(...)":
//
// Matrix is [][]int{
// []int{3, 9, 3, 5, 6, 9},
// []int{12, 18, 12, 14, 15, 18},
// }
//
// As noted above we look at only the first row.
//
// * 3-9 is for the entire match "ab_cde"
// * 3-5 is for the first capture "ab"
// * 6-9 is for the second capture "cde"
di := 0
for si := 0; si < n && di <= 9; si += 2 {
start := row[si]
end := row[si+1]
if start >= 0 && end >= 0 {
captures[di] = input[start:end]
}
di += 1
}
return true, captures
}
// InterpolateCaptures example:
// - Input $x is "ab_cde"
// - DSL expression
// if ($x =~ "(..)_(...)") {
// ... other lines of code ...
// $y = "\2:\1";
// }
// - InterpolateCaptures is used on the evaluation of "\2:\1"
// - replacementString is "\2:\1"
// - replacementMatrix contains precomputed/cached offsets for the "\2" and
// "\1" substrings within "\2:\1"
// - captures has slot 0 being "ab_cde" (for "\0"), slot 1 being "ab" (for "\1"),
// slot 2 being "cde" (for "\2"), and slots 3-9 being "".
func InterpolateCaptures(
replacementString string,
replacementMatrix [][]int,
captures []string,
) string {
if replacementMatrix == nil || captures == nil {
return replacementString
}
var buffer bytes.Buffer
nonMatchStartIndex := 0
for _, row := range replacementMatrix {
start := row[0]
buffer.WriteString(replacementString[nonMatchStartIndex:row[0]])
// Map "\0".."\9" to integer index 0..9
index := replacementString[start+1] - '0'
buffer.WriteString(captures[index])
nonMatchStartIndex = row[1]
}
buffer.WriteString(replacementString[nonMatchStartIndex:])
return buffer.String()
}
// RegexSub implements the sub DSL function.
func RegexSub(
	input string,
	sregex string,
	replacement string,
) string {
	// Compile-per-call convenience wrapper; see RegexSubCompiled for the
	// precompiled form.
	regex := CompileMillerRegexOrDie(sregex)
	_, captureMatrix := RegexReplacementHasCaptures(replacement)
	return RegexSubCompiled(input, regex, replacement, captureMatrix)
}
// RegexSubCompiled is the same as RegexSub but with compiled regex and
// replacement strings.
func RegexSubCompiled(
	input string,
	regex *regexp.Regexp,
	replacement string,
	replacementCaptureMatrix [][]int,
) string {
	// breakOnFirst=true: replace only the leftmost match.
	return regexSubGsubCompiled(input, regex, replacement, replacementCaptureMatrix, true)
}
// RegexGsub implements the gsub DSL function.
func RegexGsub(
input string,
sregex string,
replacement string,
) string {
regex := CompileMillerRegexOrDie(sregex)
_, replacementCaptureMatrix := RegexReplacementHasCaptures(replacement)
return regexSubGsubCompiled(input, regex, replacement, replacementCaptureMatrix, false)
}
// regexSubGsubCompiled is the implementation for sub/gsub with compiled regex
// and replacement strings.
func regexSubGsubCompiled(
	input string,
	regex *regexp.Regexp,
	replacement string,
	replacementCaptureMatrix [][]int,
	breakOnFirst bool,
) string {
	matrix := regex.FindAllSubmatchIndex([]byte(input), -1)
	if matrix == nil || len(matrix) == 0 {
		// No matches: the input passes through unchanged.
		return input
	}

	// Example return value from FindAllSubmatchIndex with input
	// "...ab_cde...fg_hij..." and regex "(..)_(...)":
	//
	// Matrix is [][]int{
	//   []int{3, 9, 3, 5, 6, 9},
	//   []int{12, 18, 12, 14, 15, 18},
	// }
	//
	// * 3-9 is for the entire match "ab_cde"
	// * 3-5 is for the first capture "ab"
	// * 6-9 is for the second capture "cde"
	//
	// * 12-18 is for the entire match "fg_hij"
	// * 12-14 is for the first capture "fg"
	// * 15-18 is for the second capture "hij"

	// Walk the matches left to right, copying non-matching text through
	// verbatim and emitting an interpolated replacement for each match.
	var buffer bytes.Buffer
	nonMatchStartIndex := 0

	for _, row := range matrix {
		buffer.WriteString(input[nonMatchStartIndex:row[0]])

		// "\0" .. "\9": captures for this particular match.
		captures := make([]string, 10)
		di := 0
		n := len(row)
		for si := 0; si < n && di <= 9; si += 2 {
			start := row[si]
			end := row[si+1]
			if start >= 0 && end >= 0 {
				captures[di] = input[start:end]
			}
			di += 1
		}

		// If the replacement had no captures, e.g. "xyz", we would insert it
		//
		//   "..."    -> "..."
		//   "ab_cde" -> "xyz" --- here
		//   "..."    -> "..."
		//   "fg_hij" -> "xyz" --- and here
		//   "..."    -> "..."
		//
		// using buffer.WriteString(replacement). However, this function exists
		// to handle the case when the replacement string has captures like
		// "\2:\1", so we need to produce
		//
		//   "..."    -> "..."
		//   "ab_cde" -> "cde:ab" --- here
		//   "..."    -> "..."
		//   "fg_hij" -> "hij:fg" --- and here
		//   "..."    -> "..."
		updatedReplacement := InterpolateCaptures(
			replacement,
			replacementCaptureMatrix,
			captures,
		)
		buffer.WriteString(updatedReplacement)

		nonMatchStartIndex = row[1]
		if breakOnFirst {
			// sub semantics: only the leftmost match is replaced.
			break
		}
	}

	// Copy through any text after the last (or only) replaced match.
	buffer.WriteString(input[nonMatchStartIndex:])
	return buffer.String()
}

190
pkg/lib/regex_test.go Normal file
View file

@ -0,0 +1,190 @@
// ================================================================
// Most Miller tests (thousands of them) are command-line-driven via
// mlr regtest. Here are some cases needing special focus.
// ================================================================
package lib
import (
"testing"
)
// ----------------------------------------------------------------
// tDataForHasCaptures is one expected input/output case for
// RegexReplacementHasCaptures.
type tDataForHasCaptures struct {
	replacement string
	expectedHasCaptures bool
	expectedMatrix [][]int
}

// tDataForSubGsub is one expected input/output case for RegexSub/RegexGsub.
type tDataForSubGsub struct {
	input string
	sregex string
	replacement string
	expectedOutput string
}

// tDataForMatches is one expected input/output case for RegexMatches.
type tDataForMatches struct {
	input string
	sregex string
	expectedOutput bool
	expectedCaptures []string
}
// ----------------------------------------------------------------
// Cases for RegexReplacementHasCaptures: the matrix holds start/end offsets
// of each "\N" escape within the replacement string.
var dataForHasCaptures = []tDataForHasCaptures{
	{"foo", false, nil},
	{"\\0", true, [][]int{{0, 2, 0, 2}}},
	{"\\3", true, [][]int{{0, 2, 0, 2}}},
	{"\\34", true, [][]int{{0, 2, 0, 2}}},
	{"abc\\1def\\2ghi", true, [][]int{{3, 5, 3, 5}, {8, 10, 8, 10}}},
}

// Cases for RegexSub: only the leftmost match is replaced.
var dataForSub = []tDataForSubGsub{
	{"abcde", "c", "X", "abXde"},
	{"abcde", "z", "X", "abcde"},
	{"abcde", "[a-z]", "X", "Xbcde"},
	{"abcde", "[A-Z]", "X", "abcde"},
	{"abcde", "c", "X", "abXde"},
	{"abcde", "z", "X", "abcde"},
	{"abcde", "[a-z]", "X", "Xbcde"},
	{"abcde", "[A-Z]", "X", "abcde"},
	{"ab_cde", "(..)_(...)", "\\2\\1", "cdeab"},
	{"ab_cde", "(..)_(...)", "\\2-\\1", "cde-ab"},
	{"ab_cde", "(..)_(...)", "X\\2Y\\1Z", "XcdeYabZ"},
	{"foofoofoo", "(f.o)", "b\\1r", "bfoorfoofoo"},
	{"foofoofoo", "(f.*o)", "b\\1r", "bfoofoofoor"},
	{"foofoofoo", "(f.o)", "b\\2r", "brfoofoo"},
	{"foofoofoo", "(f.*o)", "b\\2r", "br"},
}

// Cases for RegexGsub: every match is replaced.
var dataForGsub = []tDataForSubGsub{
	{"abcde", "c", "X", "abXde"},
	{"abcde", "z", "X", "abcde"},
	{"abcde", "[a-z]", "X", "XXXXX"},
	{"abcde", "[A-Z]", "X", "abcde"},
	{"abcde", "[c-d]", "X", "abXXe"},
	{"abcde", "c", "X", "abXde"},
	{"abcde", "z", "X", "abcde"},
	{"abcde", "[a-z]", "X", "XXXXX"},
	{"abcde", "[A-Z]", "X", "abcde"},
	{"abcde", "[c-d]", "X", "abXXe"},
	{"abacad", "a(.)", "<\\1>", "<b><c><d>"},
	{"abacad", "a(.)", "<\\2>", "<><><>"},
}

// Cases for RegexMatches: captures are ten slots for "\0".."\9", with
// unused slots expected to be "".
var dataForMatches = []tDataForMatches{
	{"abcde", "[A-Z]", false, []string{"", "", "", "", "", "", "", "", "", ""}},
	{"abcde", "[a-z]", true, []string{"a", "", "", "", "", "", "", "", "", ""}},
	{"...ab_cde...", "(..)_(...)", true, []string{"ab_cde", "ab", "cde", "", "", "", "", "", "", ""}},
	{"...ab_cde...fg_hij...", "(..)_(...)", true, []string{"ab_cde", "ab", "cde", "", "", "", "", "", "", ""}},
	{"foofoofoo", "(f.o)", true, []string{"foo", "foo", "", "", "", "", "", "", "", ""}},
	{"foofoofoo", "(f.*o)", true, []string{"foofoofoo", "foofoofoo", "", "", "", "", "", "", "", ""}},
}
// TestRegexReplacementHasCaptures drives RegexReplacementHasCaptures over the
// dataForHasCaptures table, checking both the boolean and the offsets matrix.
func TestRegexReplacementHasCaptures(t *testing.T) {
	for i, entry := range dataForHasCaptures {
		actualHasCaptures, actualMatrix := RegexReplacementHasCaptures(entry.replacement)
		if actualHasCaptures != entry.expectedHasCaptures {
			t.Fatalf("case %d replacement \"%s\" expected %v got %v\n",
				i, entry.replacement, entry.expectedHasCaptures, actualHasCaptures,
			)
		}
		if !compareMatrices(actualMatrix, entry.expectedMatrix) {
			t.Fatalf("case %d replacement \"%s\" expected matrix %#v got %#v\n",
				i, entry.replacement, entry.expectedMatrix, actualMatrix,
			)
		}
	}
}

// TestRegexSub drives RegexSub over the dataForSub table.
func TestRegexSub(t *testing.T) {
	for i, entry := range dataForSub {
		actualOutput := RegexSub(entry.input, entry.sregex, entry.replacement)
		if actualOutput != entry.expectedOutput {
			t.Fatalf("case %d input \"%s\" sregex \"%s\" replacement \"%s\" expected \"%s\" got \"%s\"\n",
				i, entry.input, entry.sregex, entry.replacement, entry.expectedOutput, actualOutput,
			)
		}
	}
}

// TestRegexGsub drives RegexGsub over the dataForGsub table.
func TestRegexGsub(t *testing.T) {
	for i, entry := range dataForGsub {
		actualOutput := RegexGsub(entry.input, entry.sregex, entry.replacement)
		if actualOutput != entry.expectedOutput {
			t.Fatalf("case %d input \"%s\" sregex \"%s\" replacement \"%s\" expected \"%s\" got \"%s\"\n",
				i, entry.input, entry.sregex, entry.replacement, entry.expectedOutput, actualOutput,
			)
		}
	}
}

// TestRegexMatches drives RegexMatches over the dataForMatches table,
// checking both the boolean and the ten-slot captures array.
func TestRegexMatches(t *testing.T) {
	for i, entry := range dataForMatches {
		actualOutput, actualCaptures := RegexMatches(entry.input, entry.sregex)
		if actualOutput != entry.expectedOutput {
			t.Fatalf("case %d input \"%s\" sregex \"%s\" expected %v got %v\n",
				i, entry.input, entry.sregex, entry.expectedOutput, actualOutput,
			)
		}
		if !compareCaptures(actualCaptures, entry.expectedCaptures) {
			t.Fatalf("case %d input \"%s\" sregex \"%s\" expected captures %#v got %#v\n",
				i, entry.input, entry.sregex, entry.expectedCaptures, actualCaptures,
			)
		}
	}
}
// compareMatrices reports whether two offset matrices are deeply equal;
// two nil matrices are equal, but nil and non-nil are not.
func compareMatrices(
	actualMatrix [][]int,
	expectedMatrix [][]int,
) bool {
	if actualMatrix == nil || expectedMatrix == nil {
		// Equal only when both are nil.
		return actualMatrix == nil && expectedMatrix == nil
	}
	if len(actualMatrix) != len(expectedMatrix) {
		return false
	}
	for i, expectedRow := range expectedMatrix {
		actualRow := actualMatrix[i]
		if len(actualRow) != len(expectedRow) {
			return false
		}
		for j, want := range expectedRow {
			if actualRow[j] != want {
				return false
			}
		}
	}
	return true
}
// compareCaptures reports whether two capture slices are element-wise equal;
// two nil slices are equal, but nil and non-nil are not.
func compareCaptures(
	actualCaptures []string,
	expectedCaptures []string,
) bool {
	if actualCaptures == nil || expectedCaptures == nil {
		// Equal only when both are nil.
		return actualCaptures == nil && expectedCaptures == nil
	}
	if len(actualCaptures) != len(expectedCaptures) {
		return false
	}
	for i, want := range expectedCaptures {
		if actualCaptures[i] != want {
			return false
		}
	}
	return true
}

278
pkg/lib/stats.go Normal file
View file

@ -0,0 +1,278 @@
// ================================================================
// These are intended for streaming (i.e. single-pass) applications. Otherwise
// the formulas look different (and are more intuitive).
// ================================================================
package lib
import (
"math"
)
// ----------------------------------------------------------------
// Univariate linear regression
// ----------------------------------------------------------------
// There are N (xi, yi) pairs.
//
// minimize E = sum (yi - m xi - b)^2
//
// Set the two partial derivatives to zero and solve for m and b:
//
// DE/Dm = sum 2 (yi - m xi - b) (-xi) = 0
// DE/Db = sum 2 (yi - m xi - b) (-1) = 0
//
// sum (yi - m xi - b) (xi) = 0
// sum (yi - m xi - b) = 0
//
// sum (xi yi - m xi^2 - b xi) = 0
// sum (yi - m xi - b) = 0
//
// m sum(xi^2) + b sum(xi) = sum(xi yi)
// m sum(xi) + b N = sum(yi)
//
// [ sum(xi^2) sum(xi) ] [ m ] = [ sum(xi yi) ]
// [ sum(xi) N ] [ b ] = [ sum(yi) ]
//
// [ m ] = [ sum(xi^2) sum(xi) ]^-1 [ sum(xi yi) ]
// [ b ] [ sum(xi) N ] [ sum(yi) ]
//
// = [ N -sum(xi) ] [ sum(xi yi) ] * 1/D
// [ -sum(xi) sum(xi^2)] [ sum(yi) ]
//
// where
//
// D = N sum(xi^2) - sum(xi)^2.
//
// So
//
// N sum(xi yi) - sum(xi) sum(yi)
// m = --------------------------------
// D
//
// -sum(xi)sum(xi yi) + sum(xi^2) sum(yi)
// b = ----------------------------------------
// D
//
// ----------------------------------------------------------------
// GetLinearRegressionOLS finalizes streamed accumulators (count, sum of x,
// sum of x^2, sum of x*y, sum of y) into the ordinary-least-squares slope m
// and intercept b, via Cramer's rule on the 2x2 normal equations derived
// above.
func GetLinearRegressionOLS(
	nint int64,
	sumx float64,
	sumx2 float64,
	sumxy float64,
	sumy float64,
) (m, b float64) {
	n := float64(nint)
	// det is the determinant D = N*sum(xi^2) - sum(xi)^2.
	det := n*sumx2 - sumx*sumx
	return (n*sumxy - sumx*sumy) / det, (-sumx*sumxy + sumx2*sumy) / det
}
// We would need a second pass through the data to compute the error-bars given
// the data and the m and the b.
//
// # Young 1962, pp. 122-124. Compute sample variance of linear
// # approximations, then variances of m and b.
// var_z = 0.0
// for i in range(0, N):
// var_z += (m * xs[i] + b - ys[i])**2
// var_z /= N
//
// var_m = (N * var_z) / D
// var_b = (var_z * sumx2) / D
//
// output = [m, b, math.sqrt(var_m), math.sqrt(var_b)]
// ----------------------------------------------------------------
// GetVar is the finalizing function for computing variance from streamed
// accumulator values.
func GetVar(
	nint int64,
	sumx float64,
	sumx2 float64,
) float64 {
	n := float64(nint)
	mean := sumx / n
	// Streaming form of sum((xi - mean)^2) = sumx2 - mean*(2*sumx - n*mean).
	sumSquaredDeviations := sumx2 - mean*(2.0*sumx-n*mean)
	if sumSquaredDeviations < 0.0 {
		// Clamp tiny negative values arising from round-off error.
		sumSquaredDeviations = 0.0
	}
	// Unbiased (n-1) denominator.
	return sumSquaredDeviations / (n - 1.0)
}
// ----------------------------------------------------------------
// Unbiased estimator:
// (1/n) sum{(xi-mean)**3}
// -----------------------------
// [(1/(n-1)) sum{(xi-mean)**2}]**1.5
// mean = sumx / n; n mean = sumx
// sum{(xi-mean)^3}
// = sum{xi^3 - 3 mean xi^2 + 3 mean^2 xi - mean^3}
// = sum{xi^3} - 3 mean sum{xi^2} + 3 mean^2 sum{xi} - n mean^3
// = sumx3 - 3 mean sumx2 + 3 mean^2 sumx - n mean^3
// = sumx3 - 3 mean sumx2 + 3n mean^3 - n mean^3
// = sumx3 - 3 mean sumx2 + 2n mean^3
// = sumx3 - mean*(3 sumx2 + 2n mean^2)
// sum{(xi-mean)^2}
// = sum{xi^2 - 2 mean xi + mean^2}
// = sum{xi^2} - 2 mean sum{xi} + n mean^2
// = sumx2 - 2 mean sumx + n mean^2
// = sumx2 - 2 n mean^2 + n mean^2
// = sumx2 - n mean^2
// ----------------------------------------------------------------
// GetSkewness is the finalizing function for computing skewness from streamed
// accumulator values.
func GetSkewness(
	nint int,
	sumx float64,
	sumx2 float64,
	sumx3 float64,
) float64 {
	n := float64(nint)
	mean := sumx / n
	// Streaming form of (1/n) sum((xi - mean)^3), per the derivation above.
	num := (sumx3 - mean*(3*sumx2-2*n*mean*mean)) / n
	// Unbiased variance, raised to the 3/2 power.
	den := math.Pow((sumx2-n*mean*mean)/(n-1), 1.5)
	return num / den
}
// ----------------------------------------------------------------
// Unbiased:
// (1/n) sum{(x-mean)**4}
// ----------------------- - 3
// [(1/n) sum{(x-mean)**2}]**2
// sum{(xi-mean)^4}
// = sum{xi^4 - 4 mean xi^3 + 6 mean^2 xi^2 - 4 mean^3 xi + mean^4}
// = sum{xi^4} - 4 mean sum{xi^3} + 6 mean^2 sum{xi^2} - 4 mean^3 sum{xi} + n mean^4
// = sum{xi^4} - 4 mean sum{xi^3} + 6 mean^2 sum{xi^2} - 4 n mean^4 + n mean^4
// = sum{xi^4} - 4 mean sum{xi^3} + 6 mean^2 sum{xi^2} - 3 n mean^4
// = sum{xi^4} - mean*(4 sum{xi^3} - 6 mean sum{xi^2} + 3 n mean^3)
// = sumx4 - mean*(4 sumx3 - 6 mean sumx2 + 3 n mean^3)
// = sumx4 - mean*(4 sumx3 - mean*(6 sumx2 - 3 n mean^2))
// GetKurtosis is the finalizing function for computing excess kurtosis from
// streamed accumulator values, per the derivation above.
func GetKurtosis(
	nint int,
	sumx float64,
	sumx2 float64,
	sumx3 float64,
	sumx4 float64,
) float64 {
	n := float64(nint)
	mean := sumx / n
	// Streaming form of (1/n) sum((xi - mean)^4).
	num := (sumx4 - mean*(4*sumx3-mean*(6*sumx2-3*n*mean*mean))) / n
	// Biased variance, squared.
	den := (sumx2 - n*mean*mean) / n
	den = den * den
	// Subtract 3 so a normal distribution scores 0 ("excess" kurtosis).
	return num/den - 3.0
}
// ----------------------------------------------------------------
// Non-streaming implementation:
//
// def find_sample_covariance(xs, ys):
// n = len(xs)
// mean_x = find_mean(xs)
// mean_y = find_mean(ys)
//
// sum = 0.0
// for k in range(0, n):
// sum += (xs[k] - mean_x) * (ys[k] - mean_y)
//
// return sum / (n-1.0)
// GetCov finalizes streamed accumulators into the sample covariance,
// matching the non-streaming formula in the comment above.
func GetCov(
	nint int64,
	sumx float64,
	sumy float64,
	sumxy float64,
) float64 {
	n := float64(nint)
	meanx := sumx / n
	meany := sumy / n
	// Streaming form of sum((xi - meanx)*(yi - meany)).
	num := sumxy - meanx*sumy - meany*sumx + n*meanx*meany
	// Unbiased (n-1) denominator.
	return num / (n - 1)
}
// ----------------------------------------------------------------
// GetCovMatrix finalizes streamed accumulators into the symmetric 2x2
// sample covariance matrix for (x, y).
func GetCovMatrix(
	nint int64,
	sumx float64,
	sumx2 float64,
	sumy float64,
	sumy2 float64,
	sumxy float64,
) (Q [2][2]float64) {
	n := float64(nint)
	// Unbiased (n-1) denominator throughout.
	d := n - 1
	Q[0][0] = (sumx2 - sumx*sumx/n) / d
	Q[0][1] = (sumxy - sumx*sumy/n) / d
	Q[1][0] = Q[0][1] // symmetric
	Q[1][1] = (sumy2 - sumy*sumy/n) / d
	return Q
}
// ----------------------------------------------------------------
// Principal component analysis can be used for linear regression:
//
// * Compute the covariance matrix for the x's and y's.
//
// * Find its eigenvalues and eigenvectors of the cov. (This is real-symmetric
// so Jacobi iteration is simple and fine.)
//
// * The principal eigenvector points in the direction of the fit.
//
// * The covariance matrix is computed on zero-mean data so the intercept
// is zero. The fit equation is of the form (y - nu) = m*(x - mu) where mu
// and nu are x and y means, respectively.
//
// * If the fit is perfect then the 2nd eigenvalue will be zero; if the fit is
// good then the 2nd eigenvalue will be smaller; if the fit is bad then
// they'll be about the same. I use 1 - |lambda2|/|lambda1| as an indication
// of quality of the fit.
//
// Standard ("ordinary least-squares") linear regression is appropriate when
// the errors are thought to be all in the y's. PCA ("total least-squares") is
// appropriate when the x's and the y's are thought to both have errors.
func GetLinearRegressionPCA(
	eigenvalue_1 float64,
	eigenvalue_2 float64,
	eigenvector_1 [2]float64,
	eigenvector_2 [2]float64,
	x_mean float64,
	y_mean float64,
) (m, b, quality float64) {
	// Fit quality is 1 - |lambda2|/|lambda1|: 1 for a perfect fit (second
	// eigenvalue zero), smaller as the eigenvalues approach each other.
	abs1 := math.Abs(eigenvalue_1)
	abs2 := math.Abs(eigenvalue_2)
	switch {
	case abs1 == 0.0:
		quality = 0.0
	case abs2 > 0.0:
		quality = 1.0 - abs2/abs1
	default:
		quality = 1.0
	}

	// The principal eigenvector points along the fit line; its slope is the
	// ratio of its components. The intercept comes from the fit passing
	// through the (x_mean, y_mean) centroid.
	m = eigenvector_1[1] / eigenvector_1[0]
	b = y_mean - m*x_mean
	return m, b, quality
}

187
pkg/lib/time.go Normal file
View file

@ -0,0 +1,187 @@
package lib
import (
"fmt"
"os"
"time"
)
// SetTZFromEnv applies the $TZ environment variable. This has three reasons:
// (1) On Windows (as of 2021-10-20), this is necessary to get $TZ into use.
// (2) On Linux/Mac, as of this writing it is not necessary for initial value
// of TZ at startup. However, an explicit check is helpful since if someone
// does 'export TZ=Something/Invalid', then runs Miller, and invalid TZ is
// simply *ignored* -- we want to surface that error to the user. (3) On any
// platform this is necessary for *changing* TZ mid-process: e.g. if a DSL
// statement does 'ENV["TZ"] = Asia/Istanbul'.
func SetTZFromEnv() error {
tzenv := os.Getenv("TZ")
location, err := time.LoadLocation(tzenv)
if err != nil {
return fmt.Errorf("TZ environment variable appears malformed: \"%s\"", tzenv)
}
time.Local = location
return nil
}
// Sec2GMT formats float seconds-since-epoch in UTC as
// "YYYY-MM-DDTHH:MM:SSZ", with the given number of decimal places on the
// seconds.
func Sec2GMT(epochSeconds float64, numDecimalPlaces int) string {
	return secToFormattedTime(epochSeconds, numDecimalPlaces, false, nil)
}

// Nsec2GMT is like Sec2GMT but takes integer nanoseconds-since-epoch.
func Nsec2GMT(epochNanoseconds int64, numDecimalPlaces int) string {
	return nsecToFormattedTime(epochNanoseconds, numDecimalPlaces, false, nil)
}

// Sec2LocalTime is like Sec2GMT but formats in the $TZ-specified local
// timezone.
func Sec2LocalTime(epochSeconds float64, numDecimalPlaces int) string {
	return secToFormattedTime(epochSeconds, numDecimalPlaces, true, nil)
}

// Nsec2LocalTime is like Nsec2GMT but formats in the $TZ-specified local
// timezone.
func Nsec2LocalTime(epochNanoseconds int64, numDecimalPlaces int) string {
	return nsecToFormattedTime(epochNanoseconds, numDecimalPlaces, true, nil)
}

// Sec2LocationTime is like Sec2LocalTime but with an explicitly supplied
// timezone location.
func Sec2LocationTime(epochSeconds float64, numDecimalPlaces int, location *time.Location) string {
	return secToFormattedTime(epochSeconds, numDecimalPlaces, true, location)
}

// Nsec2LocationTime is like Nsec2LocalTime but with an explicitly supplied
// timezone location.
func Nsec2LocationTime(epochNanoseconds int64, numDecimalPlaces int, location *time.Location) string {
	return nsecToFormattedTime(epochNanoseconds, numDecimalPlaces, true, location)
}
// secToFormattedTime is for DSL functions sec2gmt and sec2localtime. If doLocal is
// false, use UTC. Else if location is nil, use $TZ environment variable. Else
// use the specified location.
func secToFormattedTime(epochSeconds float64, numDecimalPlaces int, doLocal bool, location *time.Location) string {
intPart := int64(epochSeconds)
fractionalPart := epochSeconds - float64(intPart)
if fractionalPart < 0 {
intPart -= 1
fractionalPart += 1.0
}
t := time.Unix(intPart, int64(fractionalPart*1e9))
return goTimeToFormattedTime(t, numDecimalPlaces, doLocal, location)
}
// nsecToFormattedTime is for DSL functions nsec2gmt and nsec2localtime. If doLocal is
// false, use UTC. Else if location is nil, use $TZ environment variable. Else
// use the specified location.
func nsecToFormattedTime(epochNanoseconds int64, numDecimalPlaces int, doLocal bool, location *time.Location) string {
t := time.Unix(epochNanoseconds/1000000000, epochNanoseconds%1000000000)
return goTimeToFormattedTime(t, numDecimalPlaces, doLocal, location)
}
// This is how much to divide nanoseconds by to get a desired number of decimal places
var nsToFracDivisors = []int{
/* 0 */ 0, /* unused */
/* 1 */ 100000000,
/* 2 */ 10000000,
/* 3 */ 1000000,
/* 4 */ 100000,
/* 5 */ 10000,
/* 6 */ 1000,
/* 7 */ 100,
/* 8 */ 10,
/* 9 */ 1,
}
func goTimeToFormattedTime(t time.Time, numDecimalPlaces int, doLocal bool, location *time.Location) string {
if doLocal {
if location != nil {
t = t.In(location)
} else {
t = t.Local()
}
} else {
t = t.UTC()
}
YYYY := t.Year()
MM := int(t.Month())
DD := t.Day()
hh := t.Hour()
mm := t.Minute()
ss := t.Second()
if numDecimalPlaces < 0 {
numDecimalPlaces = 0
} else if numDecimalPlaces > 9 {
numDecimalPlaces = 9
}
if numDecimalPlaces == 0 {
if doLocal {
return fmt.Sprintf(
"%04d-%02d-%02d %02d:%02d:%02d",
YYYY, MM, DD, hh, mm, ss)
} else {
return fmt.Sprintf(
"%04d-%02d-%02dT%02d:%02d:%02dZ",
YYYY, MM, DD, hh, mm, ss)
}
} else {
fractionalPart := t.Nanosecond() / nsToFracDivisors[numDecimalPlaces]
if doLocal {
return fmt.Sprintf(
"%04d-%02d-%02d %02d:%02d:%02d.%0*d",
YYYY, MM, DD, hh, mm, ss, numDecimalPlaces, fractionalPart)
} else {
return fmt.Sprintf(
"%04d-%02d-%02dT%02d:%02d:%02d.%0*dZ",
YYYY, MM, DD, hh, mm, ss, numDecimalPlaces, fractionalPart)
}
}
}
// EpochSecondsToGMT converts fractional epoch seconds to a UTC time.Time.
func EpochSecondsToGMT(epochSeconds float64) time.Time {
	return epochSecondsToTime(epochSeconds, false, nil)
}

// EpochNanosecondsToGMT converts epoch nanoseconds to a UTC time.Time.
func EpochNanosecondsToGMT(epochNanoseconds int64) time.Time {
	return epochNanosecondsToTime(epochNanoseconds, false, nil)
}

// EpochSecondsToLocalTime converts fractional epoch seconds to a time.Time in
// the $TZ timezone.
func EpochSecondsToLocalTime(epochSeconds float64) time.Time {
	return epochSecondsToTime(epochSeconds, true, nil)
}

// EpochNanosecondsToLocalTime converts epoch nanoseconds to a time.Time in
// the $TZ timezone.
func EpochNanosecondsToLocalTime(epochNanoseconds int64) time.Time {
	return epochNanosecondsToTime(epochNanoseconds, true, nil)
}

// EpochSecondsToLocationTime converts fractional epoch seconds to a time.Time
// in the given timezone.
func EpochSecondsToLocationTime(epochSeconds float64, location *time.Location) time.Time {
	return epochSecondsToTime(epochSeconds, true, location)
}

// EpochNanosecondsToLocationTime converts epoch nanoseconds to a time.Time in
// the given timezone.
func EpochNanosecondsToLocationTime(epochNanoseconds int64, location *time.Location) time.Time {
	return epochNanosecondsToTime(epochNanoseconds, true, location)
}

// epochSecondsToTime splits fractional seconds into (sec, nsec) for
// time.Unix, then applies the requested timezone: UTC when doLocal is false;
// the $TZ timezone when location is nil; else the given location.
func epochSecondsToTime(epochSeconds float64, doLocal bool, location *time.Location) time.Time {
	seconds := int64(epochSeconds)
	nanos := int64((epochSeconds - float64(seconds)) * 1e9)
	t := time.Unix(seconds, nanos)
	if !doLocal {
		return t.UTC()
	}
	if location == nil {
		return t.Local()
	}
	return t.In(location)
}

// epochNanosecondsToTime splits nanoseconds into (sec, nsec) for time.Unix,
// with the same timezone selection as epochSecondsToTime.
func epochNanosecondsToTime(epochNanoseconds int64, doLocal bool, location *time.Location) time.Time {
	t := time.Unix(epochNanoseconds/1000000000, epochNanoseconds%1000000000)
	if !doLocal {
		return t.UTC()
	}
	if location == nil {
		return t.Local()
	}
	return t.In(location)
}

101
pkg/lib/time_test.go Normal file
View file

@ -0,0 +1,101 @@
// ================================================================
// Most Miller tests (thousands of them) are command-line-driven via
// mlr regtest. Here are some cases needing special focus.
// ================================================================
package lib
import (
"time"
"github.com/stretchr/testify/assert"
"testing"
)
// ----------------------------------------------------------------
type tDataForSec2GMT struct {
epochSeconds float64
numDecimalPlaces int
expectedOutput string
}
var dataForSec2GMT = []tDataForSec2GMT{
{0.0, 0, "1970-01-01T00:00:00Z"},
{0.0, 6, "1970-01-01T00:00:00.000000Z"},
{1.0, 6, "1970-01-01T00:00:01.000000Z"},
{123456789.25, 3, "1973-11-29T21:33:09.250Z"},
}
func TestSec2GMT(t *testing.T) {
for _, entry := range dataForSec2GMT {
assert.Equal(t, entry.expectedOutput, Sec2GMT(entry.epochSeconds, entry.numDecimalPlaces))
}
}
// ----------------------------------------------------------------
type tDataForNsec2GMT struct {
epochNanoseconds int64
numDecimalPlaces int
expectedOutput string
}
var dataForNsec2GMT = []tDataForNsec2GMT{
{0, 0, "1970-01-01T00:00:00Z"},
{0, 6, "1970-01-01T00:00:00.000000Z"},
{946684800123456789, 0, "2000-01-01T00:00:00Z"},
{946684800123456789, 1, "2000-01-01T00:00:00.1Z"},
{946684800123456789, 2, "2000-01-01T00:00:00.12Z"},
{946684800123456789, 3, "2000-01-01T00:00:00.123Z"},
{946684800123456789, 4, "2000-01-01T00:00:00.1234Z"},
{946684800123456789, 5, "2000-01-01T00:00:00.12345Z"},
{946684800123456789, 6, "2000-01-01T00:00:00.123456Z"},
{946684800123456789, 7, "2000-01-01T00:00:00.1234567Z"},
{946684800123456789, 8, "2000-01-01T00:00:00.12345678Z"},
{946684800123456789, 9, "2000-01-01T00:00:00.123456789Z"},
}
func TestNsec2GMT(t *testing.T) {
for _, entry := range dataForNsec2GMT {
actualOutput := Nsec2GMT(entry.epochNanoseconds, entry.numDecimalPlaces)
assert.Equal(t, entry.expectedOutput, actualOutput)
}
}
// ----------------------------------------------------------------
type tDataForEpochSecondsToGMT struct {
epochSeconds float64
expectedOutput time.Time
}
var dataForEpochSecondsToGMT = []tDataForEpochSecondsToGMT{
{0.0, time.Unix(0, 0).UTC()},
{1.25, time.Unix(1, 250000000).UTC()},
{123456789.25, time.Unix(123456789, 250000000).UTC()},
}
func TestEpochSecondsToGMT(t *testing.T) {
for _, entry := range dataForEpochSecondsToGMT {
assert.Equal(t, entry.expectedOutput, EpochSecondsToGMT(entry.epochSeconds))
}
}
// ----------------------------------------------------------------
type tDataForEpochNanosecondsToGMT struct {
epochNanoseconds int64
expectedOutput time.Time
}
var dataForEpochNanosecondsToGMT = []tDataForEpochNanosecondsToGMT{
{0, time.Unix(0, 0).UTC()},
{1000000000, time.Unix(1, 0).UTC()},
{1200000000, time.Unix(1, 200000000).UTC()},
{-1000000000, time.Unix(-1, 0).UTC()},
{-1200000000, time.Unix(-1, -200000000).UTC()},
{123456789250000047, time.Unix(123456789, 250000047).UTC()},
}
func TestEpochNanosecondsToGMT(t *testing.T) {
for _, entry := range dataForEpochNanosecondsToGMT {
assert.Equal(t, entry.expectedOutput, EpochNanosecondsToGMT(entry.epochNanoseconds))
}
}

67
pkg/lib/tsv_codec.go Normal file
View file

@ -0,0 +1,67 @@
package lib
import (
"bytes"
)
// * https://en.wikipedia.org/wiki/Tab-separated_values
// * https://www.iana.org/assignments/media-types/text/tab-separated-values
// \n for newline,
// \r for carriage return,
// \t for tab,
// \\ for backslash.
// TSVDecodeField is for the TSV record-reader. It expands the IANA TSV escape
// sequences \n, \r, \t, and \\; any other backslash sequence (including a
// trailing backslash) is passed through unmodified.
func TSVDecodeField(input string) string {
	var output bytes.Buffer
	i, n := 0, len(input)
	for i < n {
		c := input[i]
		// An ordinary byte, or a backslash with nothing after it.
		if c != '\\' || i+1 >= n {
			output.WriteByte(c)
			i++
			continue
		}
		switch input[i+1] {
		case '\\':
			output.WriteByte('\\')
			i += 2
		case 'n':
			output.WriteByte('\n')
			i += 2
		case 'r':
			output.WriteByte('\r')
			i += 2
		case 't':
			output.WriteByte('\t')
			i += 2
		default:
			// Unknown escape: keep the backslash and let the next byte be
			// handled on its own.
			output.WriteByte(c)
			i++
		}
	}
	return output.String()
}
// TSVEncodeField is for the TSV record-writer. It produces the IANA TSV
// escape sequences \n, \r, \t, and \\; all other runes pass through as-is.
func TSVEncodeField(input string) string {
	var output bytes.Buffer
	for _, r := range input {
		switch r {
		case '\\':
			output.WriteString(`\\`)
		case '\n':
			output.WriteString(`\n`)
		case '\r':
			output.WriteString(`\r`)
		case '\t':
			output.WriteString(`\t`)
		default:
			output.WriteRune(r)
		}
	}
	return output.String()
}

35
pkg/lib/tsv_codec_test.go Normal file
View file

@ -0,0 +1,35 @@
package lib
import (
"testing"
"github.com/stretchr/testify/assert"
)
// TestTSVDecodeField checks expansion of the TSV escape sequences, including
// pass-through of lone and doubled backslashes.
func TestTSVDecodeField(t *testing.T) {
	cases := []struct {
		input    string
		expected string
	}{
		{"", ""},
		{"a", "a"},
		{"abc", "abc"},
		{`\`, `\`},
		{`\n`, "\n"},
		{`\r`, "\r"},
		{`\t`, "\t"},
		{`\\`, "\\"},
		{`\\n`, `\n`},
		{`\\\n`, "\\\n"},
		{`abc\r\ndef\r\n`, "abc\r\ndef\r\n"},
	}
	for _, testCase := range cases {
		assert.Equal(t, testCase.expected, TSVDecodeField(testCase.input))
	}
}

// TestTSVEncodeField checks production of the TSV escape sequences.
func TestTSVEncodeField(t *testing.T) {
	cases := []struct {
		input    string
		expected string
	}{
		{"", ""},
		{"a", "a"},
		{"abc", "abc"},
		{`\`, `\\`},
		{"\n", `\n`},
		{"\r", `\r`},
		{"\t", `\t`},
		{"\\", `\\`},
		{"\\n", `\\n`},
		{"\\\n", `\\\n`},
		{"abc\r\ndef\r\n", `abc\r\ndef\r\n`},
	}
	for _, testCase := range cases {
		assert.Equal(t, testCase.expected, TSVEncodeField(testCase.input))
	}
}

246
pkg/lib/unbackslash.go Normal file
View file

@ -0,0 +1,246 @@
// ================================================================
// See cst.BuildStringLiteralNode for more context.
// ================================================================
package lib
import (
"bytes"
"strconv"
)
// unbackslashReplacements maps a byte following a backslash to its expansion.
// Note the Go-source backslashing here: at the Miller-user level, "\\" in a
// DSL string literal becomes a single backslash, "\'" becomes "'", etc.
var unbackslashReplacements = map[byte]string{
	'a':  "\a",
	'b':  "\b",
	'f':  "\f",
	'n':  "\n",
	'r':  "\r",
	't':  "\t",
	'v':  "\v",
	'\\': "\\",
	'\'': "'",
	'"':  "\"",
	'?':  "?",
}

// UnbackslashStringLiteral replaces "\t" with TAB, etc. for DSL expressions
// like '$foo = "a\tb"'. See also
// https://en.wikipedia.org/wiki/Escape_sequences_in_C
// (predates the port of Miller from C to Go).
//
// We don't simply strconv.Unquote the whole string: if one part of the string
// held a valid "\t" and another part held something Unquote rejects, the
// former would go unprocessed. A consequence of scanning by hand is that we
// require exactly four hex digits after \u and exactly eight after \U.
//
// Note that a CST-build pre-pass intentionally excludes regex literals (2nd
// argument to sub/gsub/regextract/etc) from being modified here.
//
// Note "\0" .. "\9" are used for regex captures within the DSL CST builder
// and are not touched here. (See also lib/regex.go.)
func UnbackslashStringLiteral(input string) string {
	var output bytes.Buffer
	n := len(input)
	i := 0
	for i < n {
		c := input[i]
		// Ordinary byte, or a trailing backslash with nothing after it.
		if c != '\\' || i == n-1 {
			output.WriteByte(c)
			i++
			continue
		}
		next := input[i+1]
		if replacement, ok := unbackslashReplacements[next]; ok {
			output.WriteString(replacement)
			i += 2
			continue
		}
		if ok, code := isBackslashOctal(input[i:]); ok {
			output.WriteByte(byte(code))
			i += 4
			continue
		}
		if ok, code := isBackslashHex(input[i:]); ok {
			output.WriteByte(byte(code))
			i += 4
			continue
		}
		if ok, s := isUnicode4(input[i:]); ok {
			output.WriteString(s)
			i += 6
			continue
		}
		if ok, s := isUnicode8(input[i:]); ok {
			output.WriteString(s)
			i += 10
			continue
		}
		// Unrecognized escape: keep it verbatim.
		output.WriteByte('\\')
		output.WriteByte(next)
		i += 2
	}
	return output.String()
}

// UnhexStringLiteral is like UnbackslashStringLiteral but only unhexes things
// like "\x1f". This is for IFS and IPS setup; see the cli package.
func UnhexStringLiteral(input string) string {
	var output bytes.Buffer
	n := len(input)
	i := 0
	for i < n {
		c := input[i]
		// Ordinary byte, or a trailing backslash with nothing after it.
		if c != '\\' || i == n-1 {
			output.WriteByte(c)
			i++
			continue
		}
		if ok, code := isBackslashHex(input[i:]); ok {
			output.WriteByte(byte(code))
			i += 4
		} else {
			output.WriteByte('\\')
			output.WriteByte(input[i+1])
			i += 2
		}
	}
	return output.String()
}

// isBackslashOctal reports whether the string starts with a backslash
// followed by three octal digits, and if so returns the decoded byte value.
// E.g. "\123" becomes 83 (in decimal).
func isBackslashOctal(input string) (bool, int) {
	if len(input) < 4 || input[0] != '\\' {
		return false, 0
	}
	code := 0
	for k := 1; k <= 3; k++ {
		ok, digit := isOctalDigit(input[k])
		if !ok {
			return false, 0
		}
		code = 8*code + int(digit)
	}
	return true, code
}

// isOctalDigit maps '0'..'7' to 0..7.
func isOctalDigit(b byte) (bool, byte) {
	if b < '0' || b > '7' {
		return false, 0
	}
	return true, b - '0'
}

// isBackslashHex reports whether the string starts with \x (or \X) followed
// by two hex digits, and if so returns the decoded byte value. E.g. "\xff"
// becomes 255 (in decimal).
func isBackslashHex(input string) (bool, int) {
	if len(input) < 4 || input[0] != '\\' {
		return false, 0
	}
	if input[1] != 'x' && input[1] != 'X' {
		return false, 0
	}
	okHi, hi := isHexDigit(input[2])
	okLo, lo := isHexDigit(input[3])
	if !okHi || !okLo {
		return false, 0
	}
	return true, 16*int(hi) + int(lo)
}

// isHexDigit maps '0'-'9', 'a'-'f', 'A'-'F' to 0..15.
func isHexDigit(b byte) (bool, byte) {
	switch {
	case '0' <= b && b <= '9':
		return true, b - '0'
	case 'a' <= b && b <= 'f':
		return true, b - 'a' + 10
	case 'A' <= b && b <= 'F':
		return true, b - 'A' + 10
	default:
		return false, 0
	}
}

// isUnicode4 tries to parse a four-hex-digit escape, e.g. "\u2766".
func isUnicode4(input string) (bool, string) {
	if len(input) < 6 || input[0:2] != `\u` {
		return false, ""
	}
	if s, err := strconv.Unquote(`"` + input[0:6] + `"`); err == nil {
		return true, s
	}
	return false, ""
}

// isUnicode8 tries to parse an eight-hex-digit escape, e.g. "\U00010877".
func isUnicode8(input string) (bool, string) {
	if len(input) < 10 || input[0:2] != `\U` {
		return false, ""
	}
	if s, err := strconv.Unquote(`"` + input[0:10] + `"`); err == nil {
		return true, s
	}
	return false, ""
}

View file

@ -0,0 +1,45 @@
// ================================================================
// Most Miller tests (thousands of them) are command-line-driven via
// mlr regtest. Here are some cases needing special focus.
// ================================================================
package lib
import (
"testing"
)
type tDataForUnbackslash struct {
	input          string
	expectedOutput string
}

// Note we are here dealing with Go's backslashing conventions.
// At the Miller user-space level this is simply "\t" -> TAB, etc.
var dataForUnbackslash = []tDataForUnbackslash{
	{"", ""},
	{"abcde", "abcde"},
	{`\1`, `\1`}, // regex-capture syntax is passed through untouched
	{`a\tb\tc`, "a\tb\tc"},
	{`a\fb\rc`, "a\fb\rc"},
	{`a"b"c`, `a"b"c`},
	{`a\"b\"c`, `a"b"c`},
	{`a\102c`, `aBc`},
	{`a\x42c`, `aBc`},
	{`[\101\102\103]`, `[ABC]`},
	{`[\x44\x45\x46]`, `[DEF]`},
	// The expected value was previously the empty string, which the decoder
	// can never produce for this input; `\u2766` decodes to U+2766.
	{`\u2766`, "\u2766"},
	{`\U00010877`, `𐡷`},
	{`a\u0062c`, `abc`},
}

func TestUnbackslash(t *testing.T) {
	for i, entry := range dataForUnbackslash {
		actualOutput := UnbackslashStringLiteral(entry.input)
		if actualOutput != entry.expectedOutput {
			t.Fatalf("case %d input \"%s\" expected \"%s\" got \"%s\"\n",
				i, entry.input, entry.expectedOutput, actualOutput,
			)
		}
	}
}

229
pkg/lib/util.go Normal file
View file

@ -0,0 +1,229 @@
package lib
import (
"fmt"
"io/ioutil"
"os"
"sort"
"strconv"
"strings"
"unicode/utf8"
)
// BooleanXOR returns the exclusive-or of its arguments.
func BooleanXOR(a, b bool) bool {
	return a != b
}

// BoolToInt maps false to 0 and true to 1.
func BoolToInt(b bool) int64 {
	// Direct test rather than 'b == false' per Go idiom.
	if b {
		return 1
	}
	return 0
}

// Plural returns the suffix for an English plural: "" for n == 1, else "s".
func Plural(n int) string {
	if n == 1 {
		return ""
	}
	return "s"
}
// SplitString is like strings.Split, except that splitting the empty string
// yields a zero-length slice rather than the one-element slice [""] that
// strings.Split produces.
func SplitString(input string, separator string) []string {
	if input == "" {
		return []string{}
	}
	return strings.Split(input, separator)
}
// StringListToSet converts a slice of strings to a membership map. A nil
// input yields a nil output.
func StringListToSet(stringList []string) map[string]bool {
	if stringList == nil {
		return nil
	}
	set := make(map[string]bool, len(stringList))
	for _, element := range stringList {
		set[element] = true
	}
	return set
}
// SortStrings sorts the given slice in place, ascending. (The parameter is
// named to avoid shadowing the "strings" package, which the previous name
// did; sort.Strings replaces the equivalent hand-rolled sort.Slice call.)
func SortStrings(slice []string) {
	sort.Strings(slice)
}

// ReverseStringList reverses the given slice in place.
func ReverseStringList(slice []string) {
	for i, j := 0, len(slice)-1; i < j; i, j = i+1, j-1 {
		slice[i], slice[j] = slice[j], slice[i]
	}
}

// SortedStrings returns an ascending-sorted copy, leaving the input
// unmodified. (The local is named to avoid shadowing the copy builtin, which
// the previous name did.)
func SortedStrings(slice []string) []string {
	output := make([]string, len(slice))
	copy(output, slice)
	sort.Strings(output)
	return output
}
// IntMin2 returns the smaller of its two arguments.
func IntMin2(a, b int64) int64 {
	if b < a {
		return b
	}
	return a
}
// TryIntFromString tries decimal, hex, octal, and binary.
func TryIntFromString(input string) (int64, bool) {
	// Go's strconv parses "1_2" as 12; not OK for Miller syntax. (Also not
	// valid JSON.)
	if strings.ContainsRune(input, '_') {
		return 0, false
	}
	// Following twos-complement formatting familiar from all manner of
	// languages, including C which was Miller's original implementation
	// language, we want to allow 0x00....00 through 0x7f....ff as positive
	// 64-bit integers and 0x80....00 through 0xff....ff as negative ones.
	// Go's signed-int parsing explicitly doesn't allow that, but we don't
	// want Go semantics to dictate Miller semantics. So: signed parsing
	// first (covers 0x00....00 through 0x7f....ff, plus positive/negative
	// decimal), then unsigned parsing (covers 0x80....00 through
	// 0xff....ff), reinterpreted as int64.
	if value, err := strconv.ParseInt(input, 0 /* infer base */, 64); err == nil {
		return value, true
	}
	if value, err := strconv.ParseUint(input, 0 /* infer base */, 64); err == nil {
		return int64(value), true
	}
	return 0, false
}
// TryIntFromStringWithBase allows the user to choose the base that's used,
// rather than inferring from 0x prefix, etc as TryIntFromString does.
func TryIntFromStringWithBase(input string, base int64) (int64, bool) {
	// Go's strconv parses "1_2" as 12; not OK for Miller syntax. (Also not
	// valid JSON.)
	if strings.ContainsRune(input, '_') {
		return 0, false
	}
	// Signed parse first, then unsigned reinterpreted as int64; see
	// TryIntFromString for the twos-complement rationale.
	if value, err := strconv.ParseInt(input, int(base), 64); err == nil {
		return value, true
	}
	if value, err := strconv.ParseUint(input, int(base), 64); err == nil {
		return int64(value), true
	}
	return 0, false
}
// TryFloatFromString attempts to parse a 64-bit float, returning the value
// and a success flag.
func TryFloatFromString(input string) (float64, bool) {
	// Go's strconv parses "1_2.3_4" as 12.34; not OK for Miller syntax.
	// (Also not valid JSON.)
	if strings.ContainsRune(input, '_') {
		return 0, false
	}
	if value, err := strconv.ParseFloat(input, 64); err == nil {
		return value, true
	}
	return 0, false
}

// TryBoolFromBoolString accepts exactly "true" or "false", returning the
// value and a success flag.
func TryBoolFromBoolString(input string) (bool, bool) {
	switch input {
	case "true":
		return true, true
	case "false":
		return false, true
	default:
		return false, false
	}
}
// GetArrayKeysSorted returns the map's keys in ascending sorted order, for
// the benefit of map-printers needing deterministic output. (Go maps don't
// preserve insertion order, and iteration order is unspecified.)
func GetArrayKeysSorted(input map[string]string) []string {
	keys := make([]string, 0, len(input))
	for key := range input {
		keys = append(keys, key)
	}
	sort.Strings(keys)
	return keys
}
// WriteTempFileOrDie places the contents string into a temp file, which the
// caller must remove. On any I/O failure it prints a diagnostic to stderr
// (not stdout, so data output is not polluted) and exits the process.
func WriteTempFileOrDie(contents string) string {
	// Use "" as first argument to ioutil.TempFile to use default directory.
	// Nominally "/tmp" or somesuch on all unix-like systems, but not for Windows.
	handle, err := ioutil.TempFile("", "mlr-temp")
	if err != nil {
		fmt.Fprintf(os.Stderr, "mlr: could not create temp file: %v\n", err)
		os.Exit(1)
	}
	if _, err := handle.WriteString(contents); err != nil {
		fmt.Fprintf(os.Stderr, "mlr: could not populate temp file: %v\n", err)
		os.Exit(1)
	}
	if err := handle.Close(); err != nil {
		fmt.Fprintf(os.Stderr, "mlr: could not finish write of temp file: %v\n", err)
		os.Exit(1)
	}
	return handle.Name()
}
// CopyStringArray returns a fresh copy of the input slice; mutating one does
// not affect the other.
func CopyStringArray(input []string) []string {
	output := make([]string, len(input))
	for i, element := range input {
		output[i] = element
	}
	return output
}

// StripEmpties returns a copy of the input with all empty-string elements
// removed, preserving the order of the rest.
func StripEmpties(input []string) []string {
	output := make([]string, 0, len(input))
	for _, element := range input {
		if element == "" {
			continue
		}
		output = append(output, element)
	}
	return output
}
// UTF8Strlen returns the length of the string in runes, not bytes.
func UTF8Strlen(s string) int64 {
	runeCount := utf8.RuneCountInString(s)
	return int64(runeCount)
}