Export library code in pkg/ (#1391)

* Export library code in `pkg/`

* new doc page
This commit is contained in:
John Kerl 2023-09-10 17:15:13 -04:00 committed by GitHub
parent 93b7c8eac0
commit 268a96d002
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
358 changed files with 1076 additions and 693 deletions

1
pkg/lib/README.md Normal file
View file

@ -0,0 +1 @@
These are basic library routines for Miller.

2
pkg/lib/doc.go Normal file
View file

@ -0,0 +1,2 @@
// Package lib contains basic library routines for Miller.
package lib

3
pkg/lib/docurl.go Normal file
View file

@ -0,0 +1,3 @@
package lib
const DOC_URL = "https://miller.readthedocs.io"

322
pkg/lib/file_readers.go Normal file
View file

@ -0,0 +1,322 @@
// ================================================================
// Wrapper for os.Open which maps string filename to *os.File, which in turn
// implements io.ReadCloser, and optional in turn wrapping that in a
// gzip/zlib/bunzip2 reader. Shared across record-readers for all the various
// input-file formats (CSV, JSON, XTAB, DKVP, NIDX, PPRINT) which Miller
// supports.
//
// There are two ways of handling compressed data in the Miller Go port:
//
// * A user-specified 'prepipe' command such as 'gunzip', where we popen a
// process, hand it the filename via '< filename', and read from that pipe;
//
// * An indication to use an in-process encoding reader (gzip or bzip2, etc).
//
// If a prepipe is specified, it is used; else if an encoding is specified, it
// is used; otherwise the file suffix (.bz2, .gz, .z) is consulted; otherwise
// the file is treated as text.
// ================================================================
package lib
import (
	"bytes"
	"compress/bzip2"
	"compress/gzip"
	"compress/zlib"
	"errors"
	"fmt"
	"io"
	"net/http"
	"os"
	"strings"

	"github.com/klauspost/compress/zstd"
)
// TFileInputEncoding enumerates the in-process decompression schemes Miller
// can apply to input files.
type TFileInputEncoding int

const (
	// FileInputEncodingDefault means no encoding was requested explicitly:
	// infer one from the filename suffix, else read the file as plain text.
	FileInputEncodingDefault TFileInputEncoding = iota
	FileInputEncodingBzip2
	FileInputEncodingGzip
	FileInputEncodingZlib
	FileInputEncodingZstd
)
// OpenFileForRead: if prepipe is non-empty, popens "{prepipe} < {filename}"
// and returns a handle to that, where prepipe is nominally something like
// "gunzip", "cat", etc. Otherwise, delegates to an in-process reader which
// can natively handle gzip/bzip2/zlib depending on the specified encoding. If
// the encoding isn't a compression encoding, this ends up being simply
// os.Open.
func OpenFileForRead(
	filename string,
	prepipe string,
	prepipeIsRaw bool,
	encoding TFileInputEncoding, // ignored if prepipe is non-empty
) (io.ReadCloser, error) {
	if prepipe != "" {
		return openPrepipedHandleForRead(filename, prepipe, prepipeIsRaw)
	}
	handle, err := PathToHandle(filename)
	if err != nil {
		return nil, err
	}
	return openEncodedHandleForRead(handle, encoding, filename)
}
// PathToHandle maps various back-ends to a stream. As of 2021-07-07, the
// following URI schemes are supported:
// * https://... and http://...
// * file://...
// * plain disk files
func PathToHandle(
path string,
) (io.ReadCloser, error) {
if strings.HasPrefix(path, "http://") || strings.HasPrefix(path, "https://") {
resp, err := http.Get(path)
if err != nil {
return nil, err
}
handle := resp.Body
return handle, err
} else if strings.HasPrefix(path, "file://") {
return os.Open(strings.Replace(path, "file://", "", 1))
} else {
return os.Open(path)
}
}
// OpenStdin: if prepipe is non-empty, popens "{prepipe}" and returns a handle
// to that, where prepipe is nominally something like "gunzip", "cat", etc.
// Otherwise, delegates to an in-process reader which can natively handle
// gzip/bzip2/zlib depending on the specified encoding. If the encoding isn't
// a compression encoding, this ends up being simply os.Stdin.
func OpenStdin(
	prepipe string,
	prepipeIsRaw bool,
	encoding TFileInputEncoding, // ignored if prepipe is non-empty
) (io.ReadCloser, error) {
	if prepipe == "" {
		return openEncodedHandleForRead(os.Stdin, encoding, "")
	}
	// Empty filename means the prepipe command reads from our stdin.
	return openPrepipedHandleForRead("", prepipe, prepipeIsRaw)
}
func openPrepipedHandleForRead(
filename string,
prepipe string,
prepipeIsRaw bool,
) (io.ReadCloser, error) {
escapedFilename := escapeFileNameForPopen(filename)
var command string
if filename == "" { // stdin
command = prepipe
} else {
if prepipeIsRaw {
command = prepipe + " " + escapedFilename
} else {
command = prepipe + " < " + escapedFilename
}
}
return OpenInboundHalfPipe(command)
}
// escapeFileNameForPopen reduces shell-injection exposure when a filename is
// interpolated into a popen command line: each single-quote or double-quote
// character in the filename is itself wrapped in single-quotes, and if any
// quote was found the entire result is additionally wrapped in single-quotes.
//
// Bug fix: foundQuote was declared but never set, so the outer single-quote
// wrapping never happened.
//
// TODO: test on Windows. Maybe needs move to pkg/platform.
func escapeFileNameForPopen(filename string) string {
	var buffer bytes.Buffer
	foundQuote := false
	for _, c := range filename {
		if c == '\'' || c == '"' {
			foundQuote = true
			buffer.WriteRune('\'')
			buffer.WriteRune(c)
			buffer.WriteRune('\'')
		} else {
			buffer.WriteRune(c)
		}
	}
	if foundQuote {
		return "'" + buffer.String() + "'"
	}
	return buffer.String()
}
// openEncodedHandleForRead wraps an already-opened handle in the decompressor
// selected by the encoding flag. Given the default encoding, it falls back to
// inference from the filename suffix; with no recognized suffix the handle is
// returned as-is (plain text).
func openEncodedHandleForRead(
	handle io.ReadCloser,
	encoding TFileInputEncoding,
	filename string,
) (io.ReadCloser, error) {
	switch encoding {
	case FileInputEncodingBzip2:
		return NewBZip2ReadCloser(handle), nil
	case FileInputEncodingGzip:
		return gzip.NewReader(handle)
	case FileInputEncodingZlib:
		return zlib.NewReader(handle)
	case FileInputEncodingZstd:
		return NewZstdReadCloser(handle)
	}

	// Reaching here with anything but the default encoding is a coding error.
	InternalCodingErrorIf(encoding != FileInputEncodingDefault)

	switch {
	case strings.HasSuffix(filename, ".bz2"):
		return NewBZip2ReadCloser(handle), nil
	case strings.HasSuffix(filename, ".gz"):
		return gzip.NewReader(handle)
	case strings.HasSuffix(filename, ".z"):
		return zlib.NewReader(handle)
	case strings.HasSuffix(filename, ".zst"):
		return NewZstdReadCloser(handle)
	}

	// Pass along os.Stdin or os.Open(filename)
	return handle, nil
}
// ----------------------------------------------------------------
// BZip2ReadCloser remedies the fact that bzip2.NewReader does not implement io.ReadCloser.
type BZip2ReadCloser struct {
originalHandle io.ReadCloser
bzip2Handle io.Reader
}
func NewBZip2ReadCloser(handle io.ReadCloser) *BZip2ReadCloser {
return &BZip2ReadCloser{
originalHandle: handle,
bzip2Handle: bzip2.NewReader(handle),
}
}
func (rc *BZip2ReadCloser) Read(p []byte) (n int, err error) {
return rc.bzip2Handle.Read(p)
}
func (rc *BZip2ReadCloser) Close() error {
return rc.originalHandle.Close()
}
// ----------------------------------------------------------------
// ZstdReadCloser adapts zstd.NewReader -- which does not implement
// io.ReadCloser -- so that reads decompress while Close releases the
// underlying handle.

type ZstdReadCloser struct {
	originalHandle io.ReadCloser
	zstdHandle     io.Reader
}

// NewZstdReadCloser wraps an open handle with a zstd decompressor; the error
// is whatever zstd.NewReader reports.
func NewZstdReadCloser(handle io.ReadCloser) (*ZstdReadCloser, error) {
	decoder, err := zstd.NewReader(handle)
	if err != nil {
		return nil, err
	}
	rc := &ZstdReadCloser{
		originalHandle: handle,
		zstdHandle:     decoder,
	}
	return rc, nil
}

// Read decompresses from the wrapped stream.
func (rc *ZstdReadCloser) Read(p []byte) (n int, err error) {
	return rc.zstdHandle.Read(p)
}

// Close closes the underlying (pre-decompression) handle.
func (rc *ZstdReadCloser) Close() error {
	return rc.originalHandle.Close()
}
// ----------------------------------------------------------------
// IsEOF handles the following problem: reading past end of files opened with
// os.Open returns the error which is io.EOF. Reading past close of pipes
// opened with popen (e.g. Miller's prepipe, where the file isn't 'foo.dat'
// but rather the process 'gunzip < foo.dat |') returns not io.EOF but an error
// with 'file already closed' within it. See also
// https://stackoverflow.com/questions/47486128/why-does-io-pipe-continue-to-block-even-when-eof-is-reached
func IsEOF(err error) bool {
if err == nil {
return false
} else if err == io.EOF {
return true
} else if strings.Contains(err.Error(), "file already closed") {
return true
} else {
return false
}
}
// ----------------------------------------------------------------
// Functions for in-place mode

// IsUpdateableInPlace tells if we can use the input with mlr -I: not for URLs,
// and not for prepipe commands (which we don't presume to know how to invert
// for output). Returns nil when in-place update is possible.
func IsUpdateableInPlace(
	filename string,
	prepipe string,
) error {
	for _, scheme := range []string{"http://", "https://", "file://"} {
		if strings.HasPrefix(filename, scheme) {
			return fmt.Errorf("http://, https://, and file:// URLs are not updateable in place.")
		}
	}
	if prepipe != "" {
		return fmt.Errorf("input with --prepipe or --prepipex is not updateable in place.")
	}
	return nil
}
// FindInputEncoding determines the input encoding (compression), whether from
// a flag like --gzin, or from filename suffix like ".gz". If the user did
// --gzin on the command line, TFileInputEncoding will be
// FileInputEncodingGzip. If they didn't, but the filename ends in ".gz", then
// we auto-infer FileInputEncodingGzip. Either way, this function tells if we
// will be using in-process decompression within the file-format-specific
// record reader.
func FindInputEncoding(
	filename string,
	inputFileInputEncoding TFileInputEncoding,
) TFileInputEncoding {
	// An explicit flag wins over suffix inference.
	if inputFileInputEncoding != FileInputEncodingDefault {
		return inputFileInputEncoding
	}
	switch {
	case strings.HasSuffix(filename, ".bz2"):
		return FileInputEncodingBzip2
	case strings.HasSuffix(filename, ".gz"):
		return FileInputEncodingGzip
	case strings.HasSuffix(filename, ".z"):
		return FileInputEncodingZlib
	case strings.HasSuffix(filename, ".zst"):
		// Fix: ".zst" was inferred by openEncodedHandleForRead but omitted
		// here, so zstd files were mis-detected as plain text by this path.
		return FileInputEncodingZstd
	}
	return FileInputEncodingDefault
}
// WrapOutputHandle wraps a file-write handle with a compressor matching the
// detected input encoding, so that mlr -I writes back in the same format it
// read. The first return value is the (possibly wrapped) handle. The second
// is true if the returned handle needs to be closed separately from the
// original. The third flags compression we cannot produce in-process: as of
// September 2021 the gzip and zlib libraries support write-closers, but the
// bzip2 library does not.
func WrapOutputHandle(
	fileWriteHandle io.WriteCloser,
	inputFileEncoding TFileInputEncoding,
) (io.WriteCloser, bool, error) {
	switch inputFileEncoding {
	case FileInputEncodingGzip:
		return gzip.NewWriter(fileWriteHandle), true, nil
	case FileInputEncodingZlib:
		return zlib.NewWriter(fileWriteHandle), true, nil
	case FileInputEncodingBzip2:
		return fileWriteHandle, false, fmt.Errorf("bzip2 is not currently supported for in-place mode.")
	default:
		// No compression: pass the original handle straight through.
		return fileWriteHandle, false, nil
	}
}

43
pkg/lib/getoptify.go Normal file
View file

@ -0,0 +1,43 @@
package lib
import (
"regexp"
"strings"
)
// Getoptify expands "-xyz" into "-x -y -z" while leaving "--xyz" intact. This
// is a keystroke-saver for the user.
//
// This is OK to do here globally since Miller is quite consistent (in main,
// verbs, auxents, and terminals) that multi-character options start with two
// dashes, e.g. "--csv". (The sole exception is the sort verb's -nf/-nr which
// are handled specially there.)
//
// Additionally, we split "--foo=bar" into "--foo" and "bar".
func Getoptify(inargs []string) []string {
	shortClusterRegex := regexp.MustCompile("^-[a-zA-Z0-9]+$")
	longEqualsRegex := regexp.MustCompile("^--[^=]+=.+$")
	negativeNumberRegex := regexp.MustCompile("^-[0-9]+$")
	outargs := make([]string, 0)

	for _, arg := range inargs {
		switch {
		case shortClusterRegex.MatchString(arg):
			if negativeNumberRegex.MatchString(arg) {
				// Don't expand things like '-12345' which are (likely!)
				// numeric arguments to verbs. Example:
				// 'mlr unsparsify --fill-with -99999'.
				outargs = append(outargs, arg)
				break
			}
			for _, c := range arg[1:] {
				outargs = append(outargs, "-"+string(c))
			}
		case longEqualsRegex.MatchString(arg):
			pair := strings.SplitN(arg, "=", 2)
			InternalCodingErrorIf(len(pair) != 2)
			outargs = append(outargs, pair[0], pair[1])
		default:
			outargs = append(outargs, arg)
		}
	}
	return outargs
}

88
pkg/lib/halfpipe.go Normal file
View file

@ -0,0 +1,88 @@
package lib
import (
"fmt"
"os"
"github.com/johnkerl/miller/pkg/platform"
)
// OpenOutboundHalfPipe returns a handle to a process. Writing to that handle
// writes to the process' stdin. The process' stdout and stderr are the current
// process' stdout and stderr.
//
// This is for pipe-output-redirection in the Miller put/filter DSL.
//
// Note I am not using os.exec.Cmd which is billed as being simpler than using
// os.StartProcess. It may indeed be simpler when you want to handle the
// subprocess' stdin/stdout/stderr all three within the parent process. Here I
// found it much easier to use os.StartProcess to let the stdout/stderr run
// free.
func OpenOutboundHalfPipe(commandString string) (*os.File, error) {
	readPipe, writePipe, err := os.Pipe()
	// Bug fix: the error from os.Pipe was previously ignored.
	if err != nil {
		return nil, err
	}

	// The child reads our pipe's read end as its stdin; its stdout/stderr are
	// passed through to ours.
	var procAttr os.ProcAttr
	procAttr.Files = []*os.File{
		readPipe,
		os.Stdout,
		os.Stderr,
	}

	// /bin/sh -c "..." or cmd /c "..."
	shellRunArray := platform.GetShellRunArray(commandString)

	process, err := os.StartProcess(shellRunArray[0], shellRunArray, &procAttr)
	if err != nil {
		// Bug fix: don't leak the pipe fds when the process can't be started.
		readPipe.Close()
		writePipe.Close()
		return nil, err
	}

	// Reap the child when it exits so it doesn't linger as a zombie.
	go process.Wait()
	return writePipe, nil
}
// OpenInboundHalfPipe returns a handle to a process. Reading from that handle
// reads from the process' stdout. The process' stdin and stderr are the
// current process' stdin and stderr.
//
// This is for the Miller prepipe feature.
//
// Note I am not using os.exec.Cmd which is billed as being simpler than using
// os.StartProcess. It may indeed be simpler when you want to handle the
// subprocess' stdin/stdout/stderr all three within the parent process. Here I
// found it much easier to use os.StartProcess to let the stdin/stderr run
// free.
func OpenInboundHalfPipe(commandString string) (*os.File, error) {
	readPipe, writePipe, err := os.Pipe()
	// Bug fix: the error from os.Pipe was previously ignored.
	if err != nil {
		return nil, err
	}

	// The child writes our pipe's write end as its stdout; its stdin/stderr
	// are passed through to ours.
	var procAttr os.ProcAttr
	procAttr.Files = []*os.File{
		os.Stdin,
		writePipe,
		os.Stderr,
	}

	// /bin/sh -c "..." or cmd /c "..."
	shellRunArray := platform.GetShellRunArray(commandString)

	process, err := os.StartProcess(shellRunArray[0], shellRunArray, &procAttr)
	if err != nil {
		// Bug fix: don't leak the pipe fds when the process can't be started.
		readPipe.Close()
		writePipe.Close()
		return nil, err
	}

	// Wait for the child, then close the read end so blocked readers unblock
	// instead of hanging forever. See
	// https://stackoverflow.com/questions/47486128/why-does-io-pipe-continue-to-block-even-when-eof-is-reached
	// NOTE(review): the parent's copy of writePipe is never closed here --
	// presumably the close-after-Wait below is the intended workaround;
	// confirm before changing.
	go func(process *os.Process, readPipe *os.File) {
		_, err := process.Wait()
		if err != nil {
			fmt.Fprintf(os.Stderr, "%s: %v\n", "mlr", err)
		}
		readPipe.Close()
	}(process, readPipe)
	return readPipe, nil
}

38
pkg/lib/latin1.go Normal file
View file

@ -0,0 +1,38 @@
package lib
import (
"bytes"
"fmt"
"unicode/utf8"
)
// TryLatin1ToUTF8 re-encodes a Latin-1 string as UTF-8. Latin-1 bytes
// 0x00-0xff coincide with Unicode code points U+0000-U+00FF, so each input
// byte maps directly to one rune. The error return is always nil; it is kept
// for signature symmetry with TryUTF8ToLatin1.
func TryLatin1ToUTF8(input string) (string, error) {
	var out bytes.Buffer
	for i := 0; i < len(input); i++ {
		out.WriteRune(rune(input[i]))
	}
	return out.String(), nil
}
// TryUTF8ToLatin1 re-encodes a UTF-8 string as Latin-1, failing on any code
// point above U+00FF (including invalid UTF-8, which decodes to U+FFFD).
func TryUTF8ToLatin1(input string) (string, error) {
	var out bytes.Buffer
	remaining := []byte(input)
	for len(remaining) > 0 {
		r, size := utf8.DecodeRune(remaining)
		// Code points U+0000-U+00FF are exactly the Latin-1 byte values.
		if r > 0x00ff {
			return "", fmt.Errorf("character 0x%08x (%v) is not encodable as Latin-1", int(r), r)
		}
		out.WriteByte(byte(r))
		remaining = remaining[size:]
	}
	return out.String(), nil
}

100
pkg/lib/latin1_test.go Normal file
View file

@ -0,0 +1,100 @@
// ================================================================
// Most Miller tests (thousands of them) are command-line-driven via
// mlr regtest. Here are some cases needing special focus.
// ================================================================
package lib
import (
"github.com/stretchr/testify/assert"
"testing"
)
// tDataForLatin1 holds one encode/decode test case.
type tDataForLatin1 struct {
	input string // value passed to the converter under test
	expectedOutput string // expected result; "" when an error is expected
	expectError bool // whether the conversion should return a non-nil error
}
// Latin-1 -> UTF-8 cases. This direction is total (every Latin-1 byte has a
// Unicode code point) so no case expects an error.
var dataForLatin1ToUTF8 = []tDataForLatin1{
	{
		input:          "",
		expectedOutput: "",
	},
	{
		input:          "The quick brown fox jumped over the lazy dogs.",
		expectedOutput: "The quick brown fox jumped over the lazy dogs.",
	},
	{
		// "aäoö" -- showing explicitly here "\u00e4" encodes as "\xc3\xa4"
		input:          "a\xe4o\xf6",
		expectedOutput: "a\xc3\xa4o\xc3\xb6",
	},
	{
		input:          "Victor jagt zw\xf6lf Boxk\xe4mpfer quer \xfcber den gro\xdfen Sylter Deich",
		expectedOutput: "Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich",
	},
}

// UTF-8 -> Latin-1 cases. Code points above U+00FF (e.g. Cyrillic) cannot be
// encoded, so the final case expects an error.
var dataForUTF8ToLatin1 = []tDataForLatin1{
	{
		input:          "",
		expectedOutput: "",
	},
	{
		input:          "The quick brown fox jumped over the lazy dogs.",
		expectedOutput: "The quick brown fox jumped over the lazy dogs.",
	},
	{
		// "aäoö" -- showing explicitly here "\u00e4" encodes as "\xc3\xa4"
		input:          "a\xc3\xa4o\xc3\xb6",
		expectedOutput: "a\xe4o\xf6",
	},
	{
		input:          "Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich",
		expectedOutput: "Victor jagt zw\xf6lf Boxk\xe4mpfer quer \xfcber den gro\xdfen Sylter Deich",
	},
	{
		input:          "Съешь же ещё этих мягких французских булок да выпей чаю",
		expectedOutput: "",
		expectError:    true,
	},
}
// TestLatin1ToUTF8 drives TryLatin1ToUTF8 over the table above.
func TestLatin1ToUTF8(t *testing.T) {
	for i, tc := range dataForLatin1ToUTF8 {
		got, err := TryLatin1ToUTF8(tc.input)
		if tc.expectError {
			assert.NotNil(t, err)
		} else {
			assert.Nil(t, err)
		}
		if got != tc.expectedOutput {
			t.Fatalf("case %d input \"%s\" expected \"%s\" got \"%s\"\n",
				i, tc.input, tc.expectedOutput, got,
			)
		}
	}
}
// TestUTF8ToLatin1 drives TryUTF8ToLatin1 over the table above.
func TestUTF8ToLatin1(t *testing.T) {
	for i, tc := range dataForUTF8ToLatin1 {
		got, err := TryUTF8ToLatin1(tc.input)
		if tc.expectError {
			assert.NotNil(t, err)
		} else {
			assert.Nil(t, err)
		}
		if got != tc.expectedOutput {
			t.Fatalf("case %d input \"%s\" expected \"%s\" got \"%s\"\n",
				i, tc.input, tc.expectedOutput, got,
			)
		}
	}
}

110
pkg/lib/logger.go Normal file
View file

@ -0,0 +1,110 @@
package lib
import (
"fmt"
"os"
"path"
"runtime"
)
// InternalCodingErrorIf is a lookalike for C's __FILE__ and __LINE__ printing,
// with exit 1 if the condition is true.
func InternalCodingErrorIf(condition bool) {
	if !condition {
		return
	}
	// runtime.Caller(1) reports our caller's file/line -- i.e. the site of
	// the failed invariant.
	_, fileName, fileLine, ok := runtime.Caller(1)
	if ok {
		// Base name rather than full path: the full path would break diffs on
		// regression-test actual-vs-expected stderr comparison on expect-fail
		// cases.
		fmt.Fprintf(os.Stderr, "Internal coding error detected at file %s line %d\n",
			path.Base(fileName), fileLine)
	} else {
		fmt.Fprintf(os.Stderr, "Internal coding error detected at file %s line %s\n",
			"(unknown)", "(unknown)")
	}
	// Set this env var and re-run to get a stack trace showing the call-tree
	// that led to the indicated file/line.
	if os.Getenv("MLR_PANIC_ON_INTERNAL_ERROR") != "" {
		panic("Here is the stack trace")
	}
	os.Exit(1)
}
// InternalCodingErrorWithMessageIf is a lookalike for C's __FILE__ and
// __LINE__ printing, with exit 1 if the condition is true.
func InternalCodingErrorWithMessageIf(condition bool, message string) {
	if !condition {
		return
	}
	// runtime.Caller(1) reports our caller's file/line -- the failing check.
	_, fileName, fileLine, ok := runtime.Caller(1)
	if ok {
		fmt.Fprintf(os.Stderr, "Internal coding error detected at file %s line %d: %s\n",
			path.Base(fileName), fileLine, message)
	} else {
		fmt.Fprintf(os.Stderr, "Internal coding error detected at file %s line %s: %s\n",
			"(unknown)", "(unknown)", message)
	}
	// Set this env var and re-run to get a stack trace showing the call-tree
	// that led to the indicated file/line.
	if os.Getenv("MLR_PANIC_ON_INTERNAL_ERROR") != "" {
		panic("Here is the stack trace")
	}
	os.Exit(1)
}
// InternalCodingErrorPanic is like InternalCodingErrorIf, except that it
// panics the process (for stack trace, which is usually not desired), and
// that it requires the if-test to be at the caller.
func InternalCodingErrorPanic(message string) {
	// runtime.Caller(1) reports our caller's file/line.
	_, fileName, fileLine, ok := runtime.Caller(1)
	var text string
	if ok {
		text = fmt.Sprintf(
			"Internal coding error detected at file %s line %d: %s\n",
			path.Base(fileName), fileLine, message,
		)
	} else {
		text = fmt.Sprintf(
			"Internal coding error detected at file %s line %s: %s\n",
			"(unknown)", "(unknown)", message,
		)
	}
	panic(text)
}
// WhereAreWe prints a stack trace (file and line per frame) from the current
// callsite to stdout.
func WhereAreWe() {
	// Start at depth 1, not 0, since this function itself is not of interest;
	// 20 frames is plenty.
	for depth := 1; depth < 20; depth++ {
		_, file, line, ok := runtime.Caller(depth)
		if !ok {
			break
		}
		fmt.Printf("  %s %d\n", file, line)
	}
}

430
pkg/lib/mlrmath.go Normal file
View file

@ -0,0 +1,430 @@
// ================================================================
// Non-mlrval math routines
// ================================================================
package lib
import (
"fmt"
"math"
"os"
)
// ----------------------------------------------------------------
// Some wrappers around things which aren't one-liners from math.*.

// Sgn returns +1.0, -1.0, or 0.0 according to the sign of a. NaN input falls
// through all three comparisons and maps to NaN.
func Sgn(a float64) float64 {
	switch {
	case a > 0:
		return 1.0
	case a < 0:
		return -1.0
	case a == 0:
		return 0.0
	}
	return math.NaN()
}
// Qnorm is the normal cumulative distribution function, expressed in terms of
// the erfc library function (which is awkward, but exists).
func Qnorm(x float64) float64 {
	z := -x / math.Sqrt2
	return 0.5 * math.Erfc(z)
}

// Invqnorm inverts Qnorm via a tangent-following method not unlike
// Newton-Raphson:
// * We can compute qnorm(y) = integral from -infinity to y of (1/sqrt(2pi)) exp(-t^2/2) dt.
// * We can compute derivative of qnorm(y) = (1/sqrt(2pi)) exp(-y^2/2).
// * We cannot explicitly compute invqnorm(y).
// * If dx/dy = (1/sqrt(2pi)) exp(-y^2/2) then dy/dx = sqrt(2pi) exp(y^2/2).
//
// This means we *can* compute the derivative of invqnorm even though we
// can't compute the function itself. So the essence of the method is to
// follow the tangent line to form successive approximations: we have known function input x
// and unknown function output y and initial guess y0. At each step we find the intersection
// of the tangent line at y_n with the vertical line at x, to find y_{n+1}. Specifically:
//
// * Even though we can't compute y = q^-1(x) we can compute x = q(y).
// * Start with initial guess for y (y0 = 0.0 or y0 = x both are OK).
// * Find x = q(y). Since q (and therefore q^-1) are 1-1, we're done if qnorm(invqnorm(x)) is small.
// * Else iterate: using point-slope form, (y_{n+1} - y_n) / (x_{n+1} - x_n) = m = sqrt(2pi) exp(y_n^2/2).
//   Here x_2 = x (the input) and x_1 = q(y_1).
// * Solve for y_{n+1} and repeat.

const INVQNORM_TOL float64 = 1e-9
const INVQNORM_MAXITER int = 30

func Invqnorm(x float64) float64 {
	// Domain guard: outside (0, 1) there is no finite answer; the contract
	// here is to return 0.
	if x <= 0.0 || x >= 1.0 {
		return 0.0
	}

	// Initial approximation is linear. Starting with y = 0.0 works just as well.
	y := x - 0.5
	for niter := 0; ; niter++ {
		backx := Qnorm(y)
		if math.Abs(x-backx) < INVQNORM_TOL {
			return y
		}
		if niter > INVQNORM_MAXITER {
			fmt.Fprintf(os.Stderr,
				"mlr: internal coding error: max iterations %d exceeded in invqnorm.\n",
				INVQNORM_MAXITER,
			)
			os.Exit(1)
		}
		// Tangent slope dy/dx = sqrt(2pi) exp(y^2/2); follow it to the
		// vertical line at x.
		slope := math.Sqrt2 * math.SqrtPi * math.Exp(y*y/2.0)
		y += slope * (x - backx)
	}
}
const JACOBI_TOLERANCE = 1e-12
const JACOBI_MAXITER = 20

// ----------------------------------------------------------------
// Jacobi real-symmetric eigensolver. Loosely adapted from Numerical Recipes.
//
// Note: this is coded for n=2 (to implement PCA linear regression on 2
// variables) but the algorithm is quite general. Changing from 2 to n is a
// matter of updating the top and bottom of the function: function signature to
// take double** matrix, double* eigenvector_1, double* eigenvector_2, and n;
// create copy-matrix and make-identity matrix functions; free temp matrices at
// the end; etc.

// GetRealSymmetricEigensystem diagonalizes the 2x2 input via repeated
// rotations: L is driven toward diagonal form (its diagonal then holds the
// eigenvalues) while V accumulates the rotations (its columns are the
// eigenvectors). Outputs are ordered so eigenvalue1 has the larger magnitude.
// Non-convergence after JACOBI_MAXITER sweeps is fatal (exit 1).
func GetRealSymmetricEigensystem(
	matrix [2][2]float64,
) (
	eigenvalue1 float64, // Output: dominant eigenvalue
	eigenvalue2 float64, // Output: less-dominant eigenvalue
	eigenvector1 [2]float64, // Output: corresponding to dominant eigenvalue
	eigenvector2 [2]float64, // Output: corresponding to less-dominant eigenvalue
) {
	// Working copy of the input; mutated in place by the rotations below.
	L := [2][2]float64{
		{matrix[0][0], matrix[0][1]},
		{matrix[1][0], matrix[1][1]},
	}
	// V starts as the identity and accumulates the product of all rotations.
	V := [2][2]float64{
		{1.0, 0.0},
		{0.0, 1.0},
	}
	var P, PT_A [2][2]float64
	n := 2
	found := false
	for iter := 0; iter < JACOBI_MAXITER; iter++ {
		// Convergence test: magnitude of the below-diagonal entries.
		sum := 0.0
		for i := 1; i < n; i++ {
			for j := 0; j < i; j++ {
				sum += math.Abs(L[i][j])
			}
		}
		if math.Abs(sum*sum) < JACOBI_TOLERANCE {
			found = true
			break
		}
		// One sweep: for each strictly-upper (p,q) pair, build a rotation P
		// chosen to annihilate the (p,q) off-diagonal entry, then apply it as
		// a similarity transform.
		for p := 0; p < n; p++ {
			for q := p + 1; q < n; q++ {
				numer := L[p][p] - L[q][q]
				denom := L[p][q] + L[q][p]
				// Off-diagonal already (near) zero: nothing to rotate away.
				if math.Abs(denom) < JACOBI_TOLERANCE {
					continue
				}
				theta := numer / denom
				signTheta := 1.0
				if theta < 0 {
					signTheta = -1.0
				}
				// t, c, s: tangent, cosine, sine of the rotation angle.
				t := signTheta / (math.Abs(theta) + math.Sqrt(theta*theta+1))
				c := 1.0 / math.Sqrt(t*t+1)
				s := t * c
				// P = identity except for the (p,q) rotation block.
				for pi := 0; pi < n; pi++ {
					for pj := 0; pj < n; pj++ {
						if pi == pj {
							P[pi][pj] = 1.0
						} else {
							P[pi][pj] = 0.0
						}
					}
				}
				P[p][p] = c
				P[p][q] = -s
				P[q][p] = s
				P[q][q] = c
				// L = P.transpose() * L * P
				// V = V * P
				matmul2t(&PT_A, &P, &L)
				matmul2(&L, &PT_A, &P)
				matmul2(&V, &V, &P)
			}
		}
	}
	if !found {
		fmt.Fprintf(os.Stderr,
			"%s: Jacobi eigensolver: max iterations (%d) exceeded. Non-symmetric input?\n",
			"mlr",
			JACOBI_MAXITER,
		)
		os.Exit(1)
	}
	// Diagonal of the (now nearly diagonal) L holds the eigenvalues.
	eigenvalue1 = L[0][0]
	eigenvalue2 = L[1][1]
	// Order outputs so eigenvalue1 is the one with larger magnitude; the
	// eigenvectors are the corresponding columns of V.
	abs1 := math.Abs(eigenvalue1)
	abs2 := math.Abs(eigenvalue2)
	if abs1 > abs2 {
		eigenvector1[0] = V[0][0] // Column 0 of V
		eigenvector1[1] = V[1][0]
		eigenvector2[0] = V[0][1] // Column 1 of V
		eigenvector2[1] = V[1][1]
	} else {
		eigenvalue1, eigenvalue2 = eigenvalue2, eigenvalue1
		eigenvector1[0] = V[0][1]
		eigenvector1[1] = V[1][1]
		eigenvector2[0] = V[0][0]
		eigenvector2[1] = V[1][0]
	}
	return eigenvalue1, eigenvalue2, eigenvector1, eigenvector2
}
// matmul2 computes C = A * B for 2x2 matrices. C may alias A and/or B.
func matmul2(
	C *[2][2]float64, // Output
	A *[2][2]float64, // Input
	B *[2][2]float64, // Input
) {
	// Accumulate into a temporary so C may share storage with A and/or B.
	var T [2][2]float64
	for i := range T {
		for j := range T[i] {
			for k := 0; k < 2; k++ {
				T[i][j] += A[i][k] * B[k][j]
			}
		}
	}
	*C = T
}
// matmul2t computes C = A^t * B for 2x2 matrices. C may alias A and/or B.
func matmul2t(
	C *[2][2]float64, // Output
	A *[2][2]float64, // Input
	B *[2][2]float64, // Input
) {
	// Accumulate into a temporary so C may share storage with A and/or B.
	var T [2][2]float64
	for i := range T {
		for j := range T[i] {
			for k := 0; k < 2; k++ {
				T[i][j] += A[k][i] * B[k][j]
			}
		}
	}
	*C = T
}
// ================================================================
// Logistic regression
//
// Real-valued x_0 .. x_{N-1}
// 0/1-valued y_0 .. y_{N-1}
// Model p(x_i == 1) as
// p(x, m, b) = 1 / (1 + exp(-m*x-b)
// which is the same as
// log(p/(1-p)) = m*x + b
// then
// p(x, m, b) = 1 / (1 + exp(-m*x-b)
// = exp(m*x+b) / (1 + exp(m*x+b)
// and
// 1-p = exp(-m*x-b) / (1 + exp(-m*x-b)
// = 1 / (1 + exp(m*x+b)
// Note for reference just below that
// dp/dm = -1 / [1 + exp(-m*x-b)]**2 * (-x) * exp(-m*x-b)
// = [x exp(-m*x-b)) ] / [1 + exp(-m*x-b)]**2
// = x * p * (1-p)
// and
// dp/db = -1 / [1 + exp(-m*x-b)]**2 * (-1) * exp(-m*x-b)
// = [exp(-m*x-b)) ] / [1 + exp(-m*x-b)]**2
// = p * (1-p)
// Write p_i for p(x_i, m, b)
//
// Maximum-likelihood equation:
// L(m, b) = prod_{i=0}^{N-1} [ p_i**y_i * (1-p_i)**(1-y_i) ]
//
// Log-likelihood equation:
// ell(m, b) = sum{i=0}^{N-1} [ y_i log(p_i) + (1-y_i) log(1-p_i) ]
// = sum{i=0}^{N-1} [ log(1-p_i) + y_i log(p_i/(1-p_i)) ]
// = sum{i=0}^{N-1} [ log(1-p_i) + y_i*(m*x_i+b) ]
// Differentiate with respect to parameters:
//
// d ell/dm = sum{i=0}^{N-1} [ -1/(1-p_i) dp_i/dm + x_i*y_i ]
// = sum{i=0}^{N-1} [ -1/(1-p_i) x_i*p_i*(1-p_i) + x_i*y_i ]
// = sum{i=0}^{N-1} [ x_i(y_i-p_i) ]
//
// d ell/db = sum{i=0}^{N-1} [ -1/(1-p_i) dp_i/db + y_i ]
// = sum{i=0}^{N-1} [ -1/(1-p_i) p_i*(1-p_i) + y_i ]
// = sum{i=0}^{N-1} [ y_i - p_i ]
//
//
// d2ell/dm2 = sum{i=0}^{N-1} [ -x_i dp_i/dm ]
// = sum{i=0}^{N-1} [ -x_i**2 * p_i * (1-p_i) ]
//
// d2ell/dmdb = sum{i=0}^{N-1} [ -x_i dp_i/db ]
// = sum{i=0}^{N-1} [ -x_i * p_i * (1-p_i) ]
//
// d2ell/dbdm = sum{i=0}^{N-1} [ -dp_i/dm ]
// = sum{i=0}^{N-1} [ -x_i * p_i * (1-p_i) ]
//
// d2ell/db2 = sum{i=0}^{N-1} [ -dp_i/db ]
// = sum{i=0}^{N-1} [ -p_i * (1-p_i) ]
//
// Newton-Raphson to minimize ell(m, b):
// * Pick m0, b0
// * [m_{j+1], b_{j+1}] = H^{-1} grad ell(m_j, b_j)
// * grad ell =
// [ d ell/dm ]
// [ d ell/db ]
// * H = Hessian of ell = Jacobian of grad ell =
// [ d2ell/dm2 d2ell/dmdb ]
// [ d2ell/dmdb d2ell/db2 ]
// lrp is p(x,m,b) for logistic regression: the modeled probability of label 1
// at feature value x with slope m and intercept b.
func lrp(x, m, b float64) float64 {
	e := math.Exp(-m*x - b)
	return 1.0 / (1.0 + e)
}
// lrq is 1 - p(x,m,b) for logistic regression.
func lrq(x, m, b float64) float64 {
	e := math.Exp(m*x + b)
	return 1.0 / (1.0 + e)
}
// LogisticRegression fits slope m and intercept b for the model
// p(x) = 1 / (1 + exp(-m*x-b)) given feature values xs and 0/1 labels ys,
// using Newton-Raphson with fixed starting point and stopping criteria.
func LogisticRegression(xs, ys []float64) (m, b float64) {
	const (
		initialM = -0.001
		initialB = 0.002
		tol      = 1e-9
		maxits   = 100
	)
	return logisticRegressionAux(xs, ys, initialM, initialB, tol, maxits)
}
// Supporting routine for mlr_logistic_regression():
//
// Newton-Raphson maximization of the log-likelihood ell(m, b) (see the long
// derivation comment above lrp), starting from (m0, b0). Iterates until the
// relative change in ell drops below tol; exceeding maxits is fatal (exit 1).
func logisticRegressionAux(
	xs, ys []float64,
	m0, b0, tol float64,
	maxits int,
) (m, b float64) {
	InternalCodingErrorIf(len(xs) != len(ys))
	n := len(xs)
	its := 0
	done := false
	m = m0
	b = b0
	for !done {
		// Compute derivatives
		dldm := 0.0
		dldb := 0.0
		d2ldm2 := 0.0
		d2ldmdb := 0.0
		d2ldb2 := 0.0
		ell0 := 0.0
		for i := 0; i < n; i++ {
			xi := xs[i]
			yi := ys[i]
			// p_i and 1-p_i at the current parameters (m0, b0).
			pi := lrp(xi, m0, b0)
			qi := lrq(xi, m0, b0)
			// Gradient terms: d ell/dm = sum x_i(y_i - p_i), d ell/db = sum (y_i - p_i).
			dldm += xi * (yi - pi)
			dldb += yi - pi
			// Hessian terms, built from p_i * (1-p_i).
			piqi := pi * qi
			xipiqi := xi * piqi
			xi2piqi := xi * xipiqi
			d2ldm2 -= xi2piqi
			d2ldmdb -= xipiqi
			d2ldb2 -= piqi
			// Log-likelihood at the current parameters, for the convergence test.
			ell0 += math.Log(qi) + yi*(m0*xi+b0)
		}
		// Form the Hessian
		ha := d2ldm2
		hb := d2ldmdb
		hc := d2ldmdb
		hd := d2ldb2
		// Invert the Hessian
		// NOTE(review): no guard against a singular Hessian (D == 0), which
		// would produce Inf/NaN updates -- confirm inputs preclude that.
		D := ha*hd - hb*hc
		Hinva := hd / D
		Hinvb := -hb / D
		Hinvc := -hc / D
		Hinvd := ha / D
		// Compute H^-1 times grad ell
		Hinvgradm := Hinva*dldm + Hinvb*dldb
		Hinvgradb := Hinvc*dldm + Hinvd*dldb
		// Update [m,b]
		m = m0 - Hinvgradm
		b = b0 - Hinvgradb
		// Log-likelihood at the updated parameters.
		ell := 0.0
		for i := 0; i < n; i++ {
			xi := xs[i]
			yi := ys[i]
			qi := lrq(xi, m, b)
			// NOTE(review): qi uses the updated (m, b) but the linear term
			// still uses (m0, b0); by the derivation above one would expect
			// yi*(m*xi+b) here. Possibly a latent bug -- confirm against the
			// original C implementation before changing.
			ell += math.Log(qi) + yi*(m0*xi+b0)
		}
		// Check for convergence
		dell := math.Max(ell, ell0)
		err := 0.0
		if dell != 0.0 {
			err = math.Abs(ell-ell0) / dell
		}
		if err < tol {
			done = true
		}
		its++
		if its > maxits {
			fmt.Fprintf(os.Stderr,
				"mlr_logistic_regression: Newton-Raphson convergence failed after %d iterations. m=%e, b=%e.\n",
				its, m, b)
			os.Exit(1)
		}
		// Next iteration linearizes around the point just reached.
		m0 = m
		b0 = b
	}
	return m, b
}

155
pkg/lib/ordered_map.go Normal file
View file

@ -0,0 +1,155 @@
// ================================================================
// ORDERED MAP FROM STRING TO INTERFACE{}
//
// Quite like types.OrderedMap but with string keys and interface{} values. See orderedMap.go for
// more information.
// ================================================================
package lib
// ----------------------------------------------------------------
type OrderedMap struct {
FieldCount int64
Head *orderedMapEntry
Tail *orderedMapEntry
keysToEntries map[string]*orderedMapEntry
}
type orderedMapEntry struct {
Key string
Value interface{}
Prev *orderedMapEntry
Next *orderedMapEntry
}
// ----------------------------------------------------------------
func NewOrderedMap() *OrderedMap {
return &OrderedMap{
FieldCount: 0,
Head: nil,
Tail: nil,
keysToEntries: make(map[string]*orderedMapEntry),
}
}
// ----------------------------------------------------------------
// Value-copy is up to the caller -- PutReference and PutCopy
// are in the public OrderedMap API.
func newOrderedMapEntry(key *string, value interface{}) *orderedMapEntry {
return &orderedMapEntry{
*key,
value,
nil,
nil,
}
}
// ----------------------------------------------------------------
func (omap *OrderedMap) IsEmpty() bool {
return omap.FieldCount == 0
}
func (omap *OrderedMap) Has(key string) bool {
return omap.findEntry(&key) != nil
}
func (omap *OrderedMap) findEntry(key *string) *orderedMapEntry {
if omap.keysToEntries != nil {
return omap.keysToEntries[*key]
} else {
for pe := omap.Head; pe != nil; pe = pe.Next {
if pe.Key == *key {
return pe
}
}
return nil
}
}
// ----------------------------------------------------------------
func (omap *OrderedMap) Put(key string, value interface{}) {
pe := omap.findEntry(&key)
if pe == nil {
pe = newOrderedMapEntry(&key, value)
if omap.Head == nil {
omap.Head = pe
omap.Tail = pe
} else {
pe.Prev = omap.Tail
pe.Next = nil
omap.Tail.Next = pe
omap.Tail = pe
}
if omap.keysToEntries != nil {
omap.keysToEntries[key] = pe
}
omap.FieldCount++
} else {
pe.Value = value
}
}
// ----------------------------------------------------------------
func (omap *OrderedMap) Get(key string) interface{} {
pe := omap.findEntry(&key)
if pe == nil {
return nil
} else {
return pe.Value
}
}
// The Get is sufficient for pointer values -- the caller can check if the
// return value is nil. For int/string values (which are non-nullable) we have
// this method.
// GetWithCheck returns the value for a key along with a presence flag, for
// callers storing non-nullable values where a nil from Get is ambiguous.
func (omap *OrderedMap) GetWithCheck(key string) (interface{}, bool) {
	if entry := omap.findEntry(&key); entry != nil {
		return entry.Value, true
	}
	return nil, false
}
// ----------------------------------------------------------------
// Clear removes all entries from the map.
func (omap *OrderedMap) Clear() {
	omap.FieldCount = 0
	omap.Head = nil
	omap.Tail = nil
	// Bug fix: the key-lookup index must be dropped too. Previously stale
	// entries survived here, so Has/Get kept returning pre-Clear values and
	// a Put of an old key updated an unlinked node, losing the new value.
	if omap.keysToEntries != nil {
		omap.keysToEntries = make(map[string]*orderedMapEntry)
	}
}
// ----------------------------------------------------------------
// Returns true if it was found and removed
// Remove deletes the entry for a key, returning true when it was present.
func (omap *OrderedMap) Remove(key string) bool {
	entry := omap.findEntry(&key)
	if entry == nil {
		return false
	}
	omap.unlink(entry)
	return true
}
// ----------------------------------------------------------------
// unlink detaches the given entry from the doubly-linked list, removes it
// from the key-lookup index (when present), and decrements the field count.
// The entry must currently be a member of this map's list.
func (omap *OrderedMap) unlink(pe *orderedMapEntry) {
	if pe == omap.Head {
		if pe == omap.Tail {
			// Sole entry: the list becomes empty.
			omap.Head = nil
			omap.Tail = nil
		} else {
			// Head but not tail: advance the head.
			omap.Head = pe.Next
			pe.Next.Prev = nil
		}
	} else {
		pe.Prev.Next = pe.Next
		if pe == omap.Tail {
			// Tail but not head: retreat the tail.
			omap.Tail = pe.Prev
		} else {
			// Interior entry: bridge the neighbors.
			pe.Next.Prev = pe.Prev
		}
	}
	if omap.keysToEntries != nil {
		delete(omap.keysToEntries, pe.Key)
	}
	omap.FieldCount--
}

71
pkg/lib/paragraph.go Normal file
View file

@ -0,0 +1,71 @@
package lib
import (
"bytes"
"fmt"
"strings"
)
// For online help contexts like printing all the built-in DSL functions, or
// the list of all verbs.
// PrintWordsAsParagraph prints the words to stdout, space-separated, wrapped
// at a nominal width of 80 columns.
func PrintWordsAsParagraph(words []string) {
	const separator = " "
	const maxlen = 80

	width := 0       // running width of the current output line
	wordsOnLine := 0 // words already printed on the current line
	for _, word := range words {
		width += len(separator) + len(word)
		if width >= maxlen {
			// Wrap: end the current line and start a fresh one.
			fmt.Printf("\n")
			width = len(separator) + len(word)
			wordsOnLine = 0
		}
		if wordsOnLine > 0 {
			fmt.Print(separator)
		}
		fmt.Print(word)
		wordsOnLine++
	}
	fmt.Printf("\n")
}
// For online help contexts like printing all the built-in DSL functions, or
// the list of all verbs. Max width is nominally 80.
// FormatAsParagraph splits the text into whitespace-delimited words and
// re-flows them into lines wrapped at the given maximum width, returning
// the lines (without trailing newlines).
func FormatAsParagraph(text string, maxWidth int) []string {
	const separator = " "
	words := strings.Fields(text)

	lines := make([]string, 0)
	var current strings.Builder
	width := 0       // running width of the line being built
	wordsOnLine := 0 // words already placed on the line being built
	for _, word := range words {
		width += len(separator) + len(word)
		if width >= maxWidth {
			// Wrap: flush the current line and start a fresh one.
			lines = append(lines, current.String())
			current.Reset()
			width = len(separator) + len(word)
			wordsOnLine = 0
		}
		if wordsOnLine > 0 {
			current.WriteString(separator)
		}
		current.WriteString(word)
		wordsOnLine++
	}
	if last := current.String(); last != "" {
		lines = append(lines, last)
	}
	return lines
}

42
pkg/lib/rand.go Normal file
View file

@ -0,0 +1,42 @@
// ================================================================
// Thinly wraps Go's rand library, with seed-function support
// ================================================================
package lib
import (
"math/rand"
"os"
"time"
)
// By default, Miller random numbers are different on every run.
// The seed mixes wall-clock nanoseconds with the PID so that two Miller
// processes started in the same instant still get different streams.
var defaultSeed = time.Now().UnixNano() ^ int64(os.Getpid())

// source and generator are package-level so SeedRandom can swap them out.
var source = rand.NewSource(defaultSeed)
var generator = rand.New(source)
// Users can request specific seeds if they want the same random-number
// sequence on each run.
func SeedRandom(seed int64) {
	// Replace both the source and the generator so that all subsequent
	// draws are reproducible from the given seed.
	src := rand.NewSource(seed)
	source = src
	generator = rand.New(src)
}
// RandFloat64 returns a pseudorandom float64 in [0.0, 1.0).
func RandFloat64() float64 {
	return generator.Float64()
}

// RandUint32 returns a pseudorandom uint32.
func RandUint32() uint32 {
	return generator.Uint32()
}

// RandInt63 returns a non-negative pseudorandom int64.
func RandInt63() int64 {
	return generator.Int63()
}
// RandRange returns a pseudorandom int64 in the half-open interval
// [lowInclusive, highExclusive), or lowInclusive itself when the interval
// is empty.
func RandRange(lowInclusive, highExclusive int64) int64 {
	span := highExclusive - lowInclusive
	if span == 0 {
		// Degenerate interval: avoid a modulo-by-zero below.
		return lowInclusive
	}
	return lowInclusive + generator.Int63()%span
}

90
pkg/lib/readfiles.go Normal file
View file

@ -0,0 +1,90 @@
// ================================================================
// Routines for loading strings from files. Nominally for the put/filter verbs
// to load DSL strings from .mlr files.
// ================================================================
package lib
import (
"io/ioutil"
"os"
"strings"
csv "github.com/johnkerl/miller/pkg/go-csv"
)
// LoadStringsFromFileOrDir calls LoadStringFromFile if path exists and is a
// file, or LoadStringsFromDir if path exists and is a directory. In the
// former case the extension is ignored; in the latter case it's used as a
// filter on the directory entries.
func LoadStringsFromFileOrDir(path string, extension string) ([]string, error) {
	info, err := os.Stat(path)
	if err != nil {
		return nil, err
	}
	if info.IsDir() {
		// Directory: load every file matching the extension filter.
		return LoadStringsFromDir(path, extension)
	}
	// Single file: the extension filter does not apply.
	dslString, err := LoadStringFromFile(path)
	if err != nil {
		return nil, err
	}
	return []string{dslString}, nil
}
// LoadStringFromFile is just a wrapper around ioutil.ReadFile,
// with a cast from []byte to string.
func LoadStringFromFile(filename string) (string, error) {
data, err := ioutil.ReadFile(filename)
if err != nil {
return "", err
}
return string(data), nil
}
// LoadStringsFromDir loads all file contents for files in the given directory
// having the given extension. E.g. LoadStringsFromDir("/u/myfiles", ".mlr")
// will load /u/myfiles/foo.mlr and /u/myfiles/bar.mlr but will skip over
// /u/myfiles/data.csv and /u/myfiles/todo.txt.
func LoadStringsFromDir(dirname string, extension string) ([]string, error) {
	entries, err := ioutil.ReadDir(dirname)
	if err != nil {
		return nil, err
	}
	dslStrings := make([]string, 0)
	for _, entry := range entries {
		name := entry.Name()
		if !strings.HasSuffix(name, extension) {
			// Skip directory entries not matching the extension filter.
			continue
		}
		dslString, err := LoadStringFromFile(dirname + "/" + name)
		if err != nil {
			return nil, err
		}
		dslStrings = append(dslStrings, dslString)
	}
	return dslStrings, nil
}
// ReadCSVHeader opens the named CSV file and returns its first record --
// the header line -- as a slice of field names.
func ReadCSVHeader(filename string) ([]string, error) {
	file, err := os.Open(filename)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	header, err := csv.NewReader(file).Read()
	if err != nil {
		return nil, err
	}
	return header, nil
}

386
pkg/lib/regex.go Normal file
View file

@ -0,0 +1,386 @@
// ================================================================
// Support for regexes in Miller.
//
// * By and large we use the Go library.
//
// * There is (for historical reasons) a DSL syntax "[a-z]"i (note the trailing i)
// for case-insensitive regular expressions which we map into Go syntax for
// regex-compilation.
//
// * Also for historical reasons, we allow things like
// if ($x =~ "(..)_(...)") {
// ... other lines of code ...
// $y = "\2:\1";
// }
// where the '=~' sets the captures and the "\2:\1" uses them. (Note that
// https://github.com/johnkerl/miller/issues/388 has a better suggestion
// which would make the captures explicit as variables, rather than implicit
// within CST state -- regardless, the current syntax will still be supported
// for backward compatibility and so is here to stay.) Here we make use of Go
// regexp-library functions to write to, and then later interpolate from, a
// captures array which is stored within CST state. (See the `runtime.State`
// object.)
//
// * "\0" is for a full match; "\1" .. "\9" are for submatch captures. E.g.
// if $x is "foobarbaz" and the regex is "foo(.)(..)baz", then "\0" is
// "foobarbaz", "\1" is "b", "\2" is "ar", and "\3".."\9" are "".
// ================================================================
package lib
import (
"bytes"
"fmt"
"os"
"regexp"
"strings"
)
// captureDetector is used to see if a string literal interpolates previous
// captures (like "\2:\1") or not (like "2:1"). Package-level so it is
// compiled exactly once.
var captureDetector = regexp.MustCompile(`\\[0-9]`)

// captureSplitter is used to precompute an offsets matrix for strings like
// "\2:\1" so they don't need to be recomputed on every record.
var captureSplitter = regexp.MustCompile(`(\\[0-9])`)
// CompileMillerRegex wraps Go regex-compile with some Miller-specific syntax
// which predate the port of Miller from C to Go. Miller regexes use a final
// 'i' to indicate case-insensitivity; Go regexes use an initial "(?i)".
//
// (See also mlr.bnf where we specify which things can be backslash-escaped
// without a syntax error at parse time.)
//
// * If the regex_string is of the form a.*b, compiles it case-sensitively.
// * If the regex_string is of the form "a.*b", compiles a.*b case-sensitively.
// * If the regex_string is of the form "a.*b"i, compiles a.*b case-insensitively.
func CompileMillerRegex(regexString string) (*regexp.Regexp, error) {
	n := len(regexString)
	if n < 2 {
		return regexp.Compile(regexString)
	}

	// TODO: rethink this. This will strip out things people have entered, e.g. "\"...\"".
	// The parser-to-AST will have stripped the outer and we'll strip the inner and the
	// user's intent will be lost.
	//
	// TODO: make separate functions for calling from parser-to-AST (string
	// literals) and from verbs (like cut -r or having-fields).

	// "a.*b" -> a.*b, case-sensitive.
	if strings.HasPrefix(regexString, "\"") && strings.HasSuffix(regexString, "\"") {
		return regexp.Compile(regexString[1 : n-1])
	}
	// /a.*b/ -> a.*b, case-sensitive.
	if strings.HasPrefix(regexString, "/") && strings.HasSuffix(regexString, "/") {
		return regexp.Compile(regexString[1 : n-1])
	}

	// The i-suffixed forms need an opening delimiter plus the two-character
	// closer, i.e. length >= 3. Bug fix: without this guard the two-character
	// inputs `"i` and `/i` sliced [1:0] and panicked.
	if n > 2 {
		// "a.*b"i -> a.*b, case-insensitive via Go's (?i) flag.
		if strings.HasPrefix(regexString, "\"") && strings.HasSuffix(regexString, "\"i") {
			return regexp.Compile("(?i)" + regexString[1:n-2])
		}
		// /a.*b/i -> a.*b, case-insensitive.
		if strings.HasPrefix(regexString, "/") && strings.HasSuffix(regexString, "/i") {
			return regexp.Compile("(?i)" + regexString[1:n-2])
		}
	}

	return regexp.Compile(regexString)
}
// CompileMillerRegexOrDie wraps CompileMillerRegex. Usually in Go we want to
// return a second error argument rather than fataling. However, if there's a
// malformed regex we really cannot continue so it's simpler to just fatal.
func CompileMillerRegexOrDie(regexString string) *regexp.Regexp {
	regex, err := CompileMillerRegex(regexString)
	if err == nil {
		return regex
	}
	// Malformed regex: we really cannot continue, so exit rather than
	// returning an error.
	fmt.Fprint(os.Stderr, err)
	os.Exit(1)
	return nil // not reached
}
// CompileMillerRegexesOrDie is a convenience looper over CompileMillerRegexOrDie.
func CompileMillerRegexesOrDie(regexStrings []string) []*regexp.Regexp {
	compiled := make([]*regexp.Regexp, len(regexStrings))
	for i := range regexStrings {
		compiled[i] = CompileMillerRegexOrDie(regexStrings[i])
	}
	return compiled
}
// In Go as in all languages I'm aware of with a string-split, "a,b,c" splits
// on "," to ["a", "b", "c"] and "a" splits to ["a"], both of which are fine --
// but "" splits to [""] when I wish it were []. This function does the latter.
func RegexSplitString(regex *regexp.Regexp, input string, n int) []string {
if input == "" {
return make([]string, 0)
} else {
return regex.Split(input, n)
}
}
// MakeEmptyRegexCaptures is for initial CST state at the start of executing
// the DSL expression for the current record. Even if '$x =~ "(..)_(...)" set
// "\1" and "\2" on the previous record, at start of processing for the current
// record we need to start with a clean slate.
// MakeEmptyRegexCaptures returns the no-captures-yet state, represented as
// a nil slice (interpolation treats nil as "nothing to substitute").
func MakeEmptyRegexCaptures() []string {
	return nil
}
// RegexReplacementHasCaptures is used by the CST builder to see if
// string-literal is like "foo bar" or "foo \1 bar" -- in the latter case it
// needs to retain the compiled offsets-matrix information.
func RegexReplacementHasCaptures(
	replacement string,
) (
	hasCaptures bool,
	matrix [][]int,
) {
	if !captureDetector.MatchString(replacement) {
		return false, nil
	}
	// Precompute the offsets of each "\0".."\9" within the replacement so
	// per-record interpolation need not re-scan the string.
	return true, captureSplitter.FindAllSubmatchIndex([]byte(replacement), -1)
}
// RegexMatches implements the =~ DSL operator. The captures are stored in DSL
// state and may be used by a DSL statement after the =~. For example, in
//
// sub($a, "(..)_(...)", "\1:\2")
//
// the replacement string is an argument to sub and therefore the captures are
// confined to the implementation of the sub function. Similarly for gsub. But
// for the match operator, people can do
//
// if ($x =~ "(..)_(...)") {
// ... other lines of code ...
// $y = "\2:\1"
// }
//
// and the =~ callsite doesn't know if captures will be used or not. So,
// RegexMatches always returns the captures array. It is stored within the CST
// state.
func RegexMatches(
	input string,
	sregex string,
) (
	matches bool,
	capturesOneUp []string,
) {
	// Compiles on every call; hot-loop callers should precompile and use
	// RegexMatchesCompiled instead.
	regex := CompileMillerRegexOrDie(sregex)
	return RegexMatchesCompiled(input, regex)
}
// RegexMatchesCompiled is the implementation for the =~ operator. Without
// Miller-style regex captures this would be a simple one-line
// regex.MatchString(input). However, we return the captures array for the
// benefit of subsequent references to "\0".."\9".
func RegexMatchesCompiled(
input string,
regex *regexp.Regexp,
) (bool, []string) {
matrix := regex.FindAllSubmatchIndex([]byte(input), -1)
if matrix == nil || len(matrix) == 0 {
// Set all captures to ""
return false, make([]string, 10)
}
// "\0" .. "\9"
captures := make([]string, 10)
// If there are multiple matches -- e.g. input is
//
// "...ab_cde...fg_hij..."
//
// with regex
//
// "(..)_(...)"
//
// -- then we only consider the first match: boolean return value is true
// (the input string matched the regex), and the captures array will map
// "\1" to "ab" and "\2" to "cde".
row := matrix[0]
n := len(row)
// Example return value from FindAllSubmatchIndex with input
// "...ab_cde...fg_hij..." and regex "(..)_(...)":
//
// Matrix is [][]int{
// []int{3, 9, 3, 5, 6, 9},
// []int{12, 18, 12, 14, 15, 18},
// }
//
// As noted above we look at only the first row.
//
// * 3-9 is for the entire match "ab_cde"
// * 3-5 is for the first capture "ab"
// * 6-9 is for the second capture "cde"
di := 0
for si := 0; si < n && di <= 9; si += 2 {
start := row[si]
end := row[si+1]
if start >= 0 && end >= 0 {
captures[di] = input[start:end]
}
di += 1
}
return true, captures
}
// InterpolateCaptures example:
// - Input $x is "ab_cde"
// - DSL expression
// if ($x =~ "(..)_(...)") {
// ... other lines of code ...
// $y = "\2:\1";
// }
// - InterpolateCaptures is used on the evaluation of "\2:\1"
// - replacementString is "\2:\1"
// - replacementMatrix contains precomputed/cached offsets for the "\2" and
// "\1" substrings within "\2:\1"
// - captures has slot 0 being "ab_cde" (for "\0"), slot 1 being "ab" (for "\1"),
// slot 2 being "cde" (for "\2"), and slots 3-9 being "".
func InterpolateCaptures(
replacementString string,
replacementMatrix [][]int,
captures []string,
) string {
if replacementMatrix == nil || captures == nil {
return replacementString
}
var buffer bytes.Buffer
nonMatchStartIndex := 0
for _, row := range replacementMatrix {
start := row[0]
buffer.WriteString(replacementString[nonMatchStartIndex:row[0]])
// Map "\0".."\9" to integer index 0..9
index := replacementString[start+1] - '0'
buffer.WriteString(captures[index])
nonMatchStartIndex = row[1]
}
buffer.WriteString(replacementString[nonMatchStartIndex:])
return buffer.String()
}
// RegexSub implements the sub DSL function.
func RegexSub(
	input string,
	sregex string,
	replacement string,
) string {
	// Compile-per-call convenience wrapper; see RegexSubCompiled for the
	// precompiled form.
	regex := CompileMillerRegexOrDie(sregex)
	_, captureMatrix := RegexReplacementHasCaptures(replacement)
	return RegexSubCompiled(input, regex, replacement, captureMatrix)
}
// RegexSubCompiled is the same as RegexSub but with compiled regex and
// replacement strings.
func RegexSubCompiled(
	input string,
	regex *regexp.Regexp,
	replacement string,
	replacementCaptureMatrix [][]int,
) string {
	// breakOnFirst=true: replace only the leftmost match.
	return regexSubGsubCompiled(input, regex, replacement, replacementCaptureMatrix, true)
}
// RegexGsub implements the gsub DSL function.
func RegexGsub(
input string,
sregex string,
replacement string,
) string {
regex := CompileMillerRegexOrDie(sregex)
_, replacementCaptureMatrix := RegexReplacementHasCaptures(replacement)
return regexSubGsubCompiled(input, regex, replacement, replacementCaptureMatrix, false)
}
// regexSubGsubCompiled is the implementation for sub/gsub with compiled regex
// and replacement strings.
func regexSubGsubCompiled(
	input string,
	regex *regexp.Regexp,
	replacement string,
	replacementCaptureMatrix [][]int,
	breakOnFirst bool,
) string {
	matrix := regex.FindAllSubmatchIndex([]byte(input), -1)
	if matrix == nil || len(matrix) == 0 {
		// No matches: the input passes through unchanged.
		return input
	}

	// Example return value from FindAllSubmatchIndex with input
	// "...ab_cde...fg_hij..." and regex "(..)_(...)":
	//
	// Matrix is [][]int{
	//   []int{3, 9, 3, 5, 6, 9},
	//   []int{12, 18, 12, 14, 15, 18},
	// }
	//
	// * 3-9 is for the entire match "ab_cde"
	// * 3-5 is for the first capture "ab"
	// * 6-9 is for the second capture "cde"
	//
	// * 12-18 is for the entire match "fg_hij"
	// * 12-14 is for the first capture "fg"
	// * 15-18 is for the second capture "hij"

	// Walk the matches left to right, copying non-matching text through
	// verbatim and emitting an interpolated replacement for each match.
	var buffer bytes.Buffer
	nonMatchStartIndex := 0

	for _, row := range matrix {
		buffer.WriteString(input[nonMatchStartIndex:row[0]])

		// "\0" .. "\9": captures for this particular match.
		captures := make([]string, 10)
		di := 0
		n := len(row)
		for si := 0; si < n && di <= 9; si += 2 {
			start := row[si]
			end := row[si+1]
			if start >= 0 && end >= 0 {
				captures[di] = input[start:end]
			}
			di += 1
		}

		// If the replacement had no captures, e.g. "xyz", we would insert it
		//
		//   "..."    -> "..."
		//   "ab_cde" -> "xyz" --- here
		//   "..."    -> "..."
		//   "fg_hij" -> "xyz" --- and here
		//   "..."    -> "..."
		//
		// using buffer.WriteString(replacement). However, this function exists
		// to handle the case when the replacement string has captures like
		// "\2:\1", so we need to produce
		//
		//   "..."    -> "..."
		//   "ab_cde" -> "cde:ab" --- here
		//   "..."    -> "..."
		//   "fg_hij" -> "hij:fg" --- and here
		//   "..."    -> "..."
		updatedReplacement := InterpolateCaptures(
			replacement,
			replacementCaptureMatrix,
			captures,
		)
		buffer.WriteString(updatedReplacement)

		nonMatchStartIndex = row[1]
		if breakOnFirst {
			// sub semantics: only the leftmost match is replaced.
			break
		}
	}

	// Copy through any text after the last (or only) replaced match.
	buffer.WriteString(input[nonMatchStartIndex:])
	return buffer.String()
}

190
pkg/lib/regex_test.go Normal file
View file

@ -0,0 +1,190 @@
// ================================================================
// Most Miller tests (thousands of them) are command-line-driven via
// mlr regtest. Here are some cases needing special focus.
// ================================================================
package lib
import (
"testing"
)
// ----------------------------------------------------------------
// tDataForHasCaptures is one expected input/output case for
// RegexReplacementHasCaptures.
type tDataForHasCaptures struct {
	replacement string
	expectedHasCaptures bool
	expectedMatrix [][]int
}

// tDataForSubGsub is one expected input/output case for RegexSub/RegexGsub.
type tDataForSubGsub struct {
	input string
	sregex string
	replacement string
	expectedOutput string
}

// tDataForMatches is one expected input/output case for RegexMatches.
type tDataForMatches struct {
	input string
	sregex string
	expectedOutput bool
	expectedCaptures []string
}
// ----------------------------------------------------------------
// Cases for RegexReplacementHasCaptures: the matrix holds start/end offsets
// of each "\N" escape within the replacement string.
var dataForHasCaptures = []tDataForHasCaptures{
	{"foo", false, nil},
	{"\\0", true, [][]int{{0, 2, 0, 2}}},
	{"\\3", true, [][]int{{0, 2, 0, 2}}},
	{"\\34", true, [][]int{{0, 2, 0, 2}}},
	{"abc\\1def\\2ghi", true, [][]int{{3, 5, 3, 5}, {8, 10, 8, 10}}},
}

// Cases for RegexSub: only the leftmost match is replaced.
var dataForSub = []tDataForSubGsub{
	{"abcde", "c", "X", "abXde"},
	{"abcde", "z", "X", "abcde"},
	{"abcde", "[a-z]", "X", "Xbcde"},
	{"abcde", "[A-Z]", "X", "abcde"},
	{"abcde", "c", "X", "abXde"},
	{"abcde", "z", "X", "abcde"},
	{"abcde", "[a-z]", "X", "Xbcde"},
	{"abcde", "[A-Z]", "X", "abcde"},
	{"ab_cde", "(..)_(...)", "\\2\\1", "cdeab"},
	{"ab_cde", "(..)_(...)", "\\2-\\1", "cde-ab"},
	{"ab_cde", "(..)_(...)", "X\\2Y\\1Z", "XcdeYabZ"},
	{"foofoofoo", "(f.o)", "b\\1r", "bfoorfoofoo"},
	{"foofoofoo", "(f.*o)", "b\\1r", "bfoofoofoor"},
	{"foofoofoo", "(f.o)", "b\\2r", "brfoofoo"},
	{"foofoofoo", "(f.*o)", "b\\2r", "br"},
}

// Cases for RegexGsub: every match is replaced.
var dataForGsub = []tDataForSubGsub{
	{"abcde", "c", "X", "abXde"},
	{"abcde", "z", "X", "abcde"},
	{"abcde", "[a-z]", "X", "XXXXX"},
	{"abcde", "[A-Z]", "X", "abcde"},
	{"abcde", "[c-d]", "X", "abXXe"},
	{"abcde", "c", "X", "abXde"},
	{"abcde", "z", "X", "abcde"},
	{"abcde", "[a-z]", "X", "XXXXX"},
	{"abcde", "[A-Z]", "X", "abcde"},
	{"abcde", "[c-d]", "X", "abXXe"},
	{"abacad", "a(.)", "<\\1>", "<b><c><d>"},
	{"abacad", "a(.)", "<\\2>", "<><><>"},
}

// Cases for RegexMatches: captures are ten slots for "\0".."\9", with
// unused slots expected to be "".
var dataForMatches = []tDataForMatches{
	{"abcde", "[A-Z]", false, []string{"", "", "", "", "", "", "", "", "", ""}},
	{"abcde", "[a-z]", true, []string{"a", "", "", "", "", "", "", "", "", ""}},
	{"...ab_cde...", "(..)_(...)", true, []string{"ab_cde", "ab", "cde", "", "", "", "", "", "", ""}},
	{"...ab_cde...fg_hij...", "(..)_(...)", true, []string{"ab_cde", "ab", "cde", "", "", "", "", "", "", ""}},
	{"foofoofoo", "(f.o)", true, []string{"foo", "foo", "", "", "", "", "", "", "", ""}},
	{"foofoofoo", "(f.*o)", true, []string{"foofoofoo", "foofoofoo", "", "", "", "", "", "", "", ""}},
}
// TestRegexReplacementHasCaptures drives RegexReplacementHasCaptures over the
// dataForHasCaptures table, checking both the boolean and the offsets matrix.
func TestRegexReplacementHasCaptures(t *testing.T) {
	for i, entry := range dataForHasCaptures {
		actualHasCaptures, actualMatrix := RegexReplacementHasCaptures(entry.replacement)
		if actualHasCaptures != entry.expectedHasCaptures {
			t.Fatalf("case %d replacement \"%s\" expected %v got %v\n",
				i, entry.replacement, entry.expectedHasCaptures, actualHasCaptures,
			)
		}
		if !compareMatrices(actualMatrix, entry.expectedMatrix) {
			t.Fatalf("case %d replacement \"%s\" expected matrix %#v got %#v\n",
				i, entry.replacement, entry.expectedMatrix, actualMatrix,
			)
		}
	}
}

// TestRegexSub drives RegexSub over the dataForSub table.
func TestRegexSub(t *testing.T) {
	for i, entry := range dataForSub {
		actualOutput := RegexSub(entry.input, entry.sregex, entry.replacement)
		if actualOutput != entry.expectedOutput {
			t.Fatalf("case %d input \"%s\" sregex \"%s\" replacement \"%s\" expected \"%s\" got \"%s\"\n",
				i, entry.input, entry.sregex, entry.replacement, entry.expectedOutput, actualOutput,
			)
		}
	}
}

// TestRegexGsub drives RegexGsub over the dataForGsub table.
func TestRegexGsub(t *testing.T) {
	for i, entry := range dataForGsub {
		actualOutput := RegexGsub(entry.input, entry.sregex, entry.replacement)
		if actualOutput != entry.expectedOutput {
			t.Fatalf("case %d input \"%s\" sregex \"%s\" replacement \"%s\" expected \"%s\" got \"%s\"\n",
				i, entry.input, entry.sregex, entry.replacement, entry.expectedOutput, actualOutput,
			)
		}
	}
}

// TestRegexMatches drives RegexMatches over the dataForMatches table,
// checking both the boolean and the ten-slot captures array.
func TestRegexMatches(t *testing.T) {
	for i, entry := range dataForMatches {
		actualOutput, actualCaptures := RegexMatches(entry.input, entry.sregex)
		if actualOutput != entry.expectedOutput {
			t.Fatalf("case %d input \"%s\" sregex \"%s\" expected %v got %v\n",
				i, entry.input, entry.sregex, entry.expectedOutput, actualOutput,
			)
		}
		if !compareCaptures(actualCaptures, entry.expectedCaptures) {
			t.Fatalf("case %d input \"%s\" sregex \"%s\" expected captures %#v got %#v\n",
				i, entry.input, entry.sregex, entry.expectedCaptures, actualCaptures,
			)
		}
	}
}
// compareMatrices reports whether two offset matrices are deeply equal;
// two nil matrices are equal, but nil and non-nil are not.
func compareMatrices(
	actualMatrix [][]int,
	expectedMatrix [][]int,
) bool {
	if actualMatrix == nil || expectedMatrix == nil {
		// Equal only when both are nil.
		return actualMatrix == nil && expectedMatrix == nil
	}
	if len(actualMatrix) != len(expectedMatrix) {
		return false
	}
	for i, expectedRow := range expectedMatrix {
		actualRow := actualMatrix[i]
		if len(actualRow) != len(expectedRow) {
			return false
		}
		for j, want := range expectedRow {
			if actualRow[j] != want {
				return false
			}
		}
	}
	return true
}
// compareCaptures reports whether two capture slices are element-wise equal;
// two nil slices are equal, but nil and non-nil are not.
func compareCaptures(
	actualCaptures []string,
	expectedCaptures []string,
) bool {
	if actualCaptures == nil || expectedCaptures == nil {
		// Equal only when both are nil.
		return actualCaptures == nil && expectedCaptures == nil
	}
	if len(actualCaptures) != len(expectedCaptures) {
		return false
	}
	for i, want := range expectedCaptures {
		if actualCaptures[i] != want {
			return false
		}
	}
	return true
}

278
pkg/lib/stats.go Normal file
View file

@ -0,0 +1,278 @@
// ================================================================
// These are intended for streaming (i.e. single-pass) applications. Otherwise
// the formulas look different (and are more intuitive).
// ================================================================
package lib
import (
"math"
)
// ----------------------------------------------------------------
// Univariate linear regression
// ----------------------------------------------------------------
// There are N (xi, yi) pairs.
//
// minimize E = sum (yi - m xi - b)^2
//
// Set the two partial derivatives to zero and solve for m and b:
//
// DE/Dm = sum 2 (yi - m xi - b) (-xi) = 0
// DE/Db = sum 2 (yi - m xi - b) (-1) = 0
//
// sum (yi - m xi - b) (xi) = 0
// sum (yi - m xi - b) = 0
//
// sum (xi yi - m xi^2 - b xi) = 0
// sum (yi - m xi - b) = 0
//
// m sum(xi^2) + b sum(xi) = sum(xi yi)
// m sum(xi) + b N = sum(yi)
//
// [ sum(xi^2) sum(xi) ] [ m ] = [ sum(xi yi) ]
// [ sum(xi) N ] [ b ] = [ sum(yi) ]
//
// [ m ] = [ sum(xi^2) sum(xi) ]^-1 [ sum(xi yi) ]
// [ b ] [ sum(xi) N ] [ sum(yi) ]
//
// = [ N -sum(xi) ] [ sum(xi yi) ] * 1/D
// [ -sum(xi) sum(xi^2)] [ sum(yi) ]
//
// where
//
// D = N sum(xi^2) - sum(xi)^2.
//
// So
//
// N sum(xi yi) - sum(xi) sum(yi)
// m = --------------------------------
// D
//
// -sum(xi)sum(xi yi) + sum(xi^2) sum(yi)
// b = ----------------------------------------
// D
//
// ----------------------------------------------------------------
// GetLinearRegressionOLS finalizes streamed accumulators (count, sum of x,
// sum of x^2, sum of x*y, sum of y) into the ordinary-least-squares slope m
// and intercept b, via Cramer's rule on the 2x2 normal equations derived
// above.
func GetLinearRegressionOLS(
	nint int64,
	sumx float64,
	sumx2 float64,
	sumxy float64,
	sumy float64,
) (m, b float64) {
	n := float64(nint)
	// det is the determinant D = N*sum(xi^2) - sum(xi)^2.
	det := n*sumx2 - sumx*sumx
	return (n*sumxy - sumx*sumy) / det, (-sumx*sumxy + sumx2*sumy) / det
}
// We would need a second pass through the data to compute the error-bars given
// the data and the m and the b.
//
// # Young 1962, pp. 122-124. Compute sample variance of linear
// # approximations, then variances of m and b.
// var_z = 0.0
// for i in range(0, N):
// var_z += (m * xs[i] + b - ys[i])**2
// var_z /= N
//
// var_m = (N * var_z) / D
// var_b = (var_z * sumx2) / D
//
// output = [m, b, math.sqrt(var_m), math.sqrt(var_b)]
// ----------------------------------------------------------------
// GetVar is the finalizing function for computing variance from streamed
// accumulator values.
func GetVar(
	nint int64,
	sumx float64,
	sumx2 float64,
) float64 {
	n := float64(nint)
	mean := sumx / n
	// Streaming form of sum((xi - mean)^2) = sumx2 - mean*(2*sumx - n*mean).
	sumSquaredDeviations := sumx2 - mean*(2.0*sumx-n*mean)
	if sumSquaredDeviations < 0.0 {
		// Clamp tiny negative values arising from round-off error.
		sumSquaredDeviations = 0.0
	}
	// Unbiased (n-1) denominator.
	return sumSquaredDeviations / (n - 1.0)
}
// ----------------------------------------------------------------
// Unbiased estimator:
// (1/n) sum{(xi-mean)**3}
// -----------------------------
// [(1/(n-1)) sum{(xi-mean)**2}]**1.5
// mean = sumx / n; n mean = sumx
// sum{(xi-mean)^3}
// = sum{xi^3 - 3 mean xi^2 + 3 mean^2 xi - mean^3}
// = sum{xi^3} - 3 mean sum{xi^2} + 3 mean^2 sum{xi} - n mean^3
// = sumx3 - 3 mean sumx2 + 3 mean^2 sumx - n mean^3
// = sumx3 - 3 mean sumx2 + 3n mean^3 - n mean^3
// = sumx3 - 3 mean sumx2 + 2n mean^3
// = sumx3 - mean*(3 sumx2 + 2n mean^2)
// sum{(xi-mean)^2}
// = sum{xi^2 - 2 mean xi + mean^2}
// = sum{xi^2} - 2 mean sum{xi} + n mean^2
// = sumx2 - 2 mean sumx + n mean^2
// = sumx2 - 2 n mean^2 + n mean^2
// = sumx2 - n mean^2
// ----------------------------------------------------------------
// GetSkewness is the finalizing function for computing skewness from streamed
// accumulator values.
func GetSkewness(
	nint int,
	sumx float64,
	sumx2 float64,
	sumx3 float64,
) float64 {
	n := float64(nint)
	mean := sumx / n
	// Streaming form of (1/n) sum((xi - mean)^3), per the derivation above.
	num := (sumx3 - mean*(3*sumx2-2*n*mean*mean)) / n
	// Unbiased variance, raised to the 3/2 power.
	den := math.Pow((sumx2-n*mean*mean)/(n-1), 1.5)
	return num / den
}
// ----------------------------------------------------------------
// Unbiased:
// (1/n) sum{(x-mean)**4}
// ----------------------- - 3
// [(1/n) sum{(x-mean)**2}]**2
// sum{(xi-mean)^4}
// = sum{xi^4 - 4 mean xi^3 + 6 mean^2 xi^2 - 4 mean^3 xi + mean^4}
// = sum{xi^4} - 4 mean sum{xi^3} + 6 mean^2 sum{xi^2} - 4 mean^3 sum{xi} + n mean^4
// = sum{xi^4} - 4 mean sum{xi^3} + 6 mean^2 sum{xi^2} - 4 n mean^4 + n mean^4
// = sum{xi^4} - 4 mean sum{xi^3} + 6 mean^2 sum{xi^2} - 3 n mean^4
// = sum{xi^4} - mean*(4 sum{xi^3} - 6 mean sum{xi^2} + 3 n mean^3)
// = sumx4 - mean*(4 sumx3 - 6 mean sumx2 + 3 n mean^3)
// = sumx4 - mean*(4 sumx3 - mean*(6 sumx2 - 3 n mean^2))
// GetKurtosis is the finalizing function for computing excess kurtosis from
// streamed accumulator values, per the derivation above.
func GetKurtosis(
	nint int,
	sumx float64,
	sumx2 float64,
	sumx3 float64,
	sumx4 float64,
) float64 {
	n := float64(nint)
	mean := sumx / n
	// Streaming form of (1/n) sum((xi - mean)^4).
	num := (sumx4 - mean*(4*sumx3-mean*(6*sumx2-3*n*mean*mean))) / n
	// Biased variance, squared.
	den := (sumx2 - n*mean*mean) / n
	den = den * den
	// Subtract 3 so a normal distribution scores 0 ("excess" kurtosis).
	return num/den - 3.0
}
// ----------------------------------------------------------------
// Non-streaming implementation:
//
// def find_sample_covariance(xs, ys):
// n = len(xs)
// mean_x = find_mean(xs)
// mean_y = find_mean(ys)
//
// sum = 0.0
// for k in range(0, n):
// sum += (xs[k] - mean_x) * (ys[k] - mean_y)
//
// return sum / (n-1.0)
// GetCov finalizes streamed accumulators into the sample covariance,
// matching the non-streaming formula in the comment above.
func GetCov(
	nint int64,
	sumx float64,
	sumy float64,
	sumxy float64,
) float64 {
	n := float64(nint)
	meanx := sumx / n
	meany := sumy / n
	// Streaming form of sum((xi - meanx)*(yi - meany)).
	num := sumxy - meanx*sumy - meany*sumx + n*meanx*meany
	// Unbiased (n-1) denominator.
	return num / (n - 1)
}
// ----------------------------------------------------------------
// GetCovMatrix finalizes streamed accumulators into the symmetric 2x2
// sample covariance matrix for (x, y).
func GetCovMatrix(
	nint int64,
	sumx float64,
	sumx2 float64,
	sumy float64,
	sumy2 float64,
	sumxy float64,
) (Q [2][2]float64) {
	n := float64(nint)
	// Unbiased (n-1) denominator throughout.
	d := n - 1
	Q[0][0] = (sumx2 - sumx*sumx/n) / d
	Q[0][1] = (sumxy - sumx*sumy/n) / d
	Q[1][0] = Q[0][1] // symmetric
	Q[1][1] = (sumy2 - sumy*sumy/n) / d
	return Q
}
// ----------------------------------------------------------------
// Principal component analysis can be used for linear regression:
//
// * Compute the covariance matrix for the x's and y's.
//
// * Find its eigenvalues and eigenvectors of the cov. (This is real-symmetric
// so Jacobi iteration is simple and fine.)
//
// * The principal eigenvector points in the direction of the fit.
//
// * The covariance matrix is computed on zero-mean data so the intercept
// is zero. The fit equation is of the form (y - nu) = m*(x - mu) where mu
// and nu are x and y means, respectively.
//
// * If the fit is perfect then the 2nd eigenvalue will be zero; if the fit is
// good then the 2nd eigenvalue will be smaller; if the fit is bad then
// they'll be about the same. I use 1 - |lambda2|/|lambda1| as an indication
// of quality of the fit.
//
// Standard ("ordinary least-squares") linear regression is appropriate when
// the errors are thought to be all in the y's. PCA ("total least-squares") is
// appropriate when the x's and the y's are thought to both have errors.
func GetLinearRegressionPCA(
	eigenvalue_1 float64,
	eigenvalue_2 float64,
	eigenvector_1 [2]float64,
	eigenvector_2 [2]float64,
	x_mean float64,
	y_mean float64,
) (m, b, quality float64) {
	// Fit quality is 1 - |lambda2|/|lambda1|: 1 for a perfect fit (second
	// eigenvalue zero), smaller as the eigenvalues approach each other.
	abs1 := math.Abs(eigenvalue_1)
	abs2 := math.Abs(eigenvalue_2)
	switch {
	case abs1 == 0.0:
		quality = 0.0
	case abs2 > 0.0:
		quality = 1.0 - abs2/abs1
	default:
		quality = 1.0
	}

	// The principal eigenvector points along the fit line; its slope is the
	// ratio of its components. The intercept comes from the fit passing
	// through the (x_mean, y_mean) centroid.
	m = eigenvector_1[1] / eigenvector_1[0]
	b = y_mean - m*x_mean
	return m, b, quality
}

187
pkg/lib/time.go Normal file
View file

@ -0,0 +1,187 @@
package lib
import (
"fmt"
"os"
"time"
)
// SetTZFromEnv applies the $TZ environment variable. This has three reasons:
// (1) On Windows (as of 2021-10-20), this is necessary to get $TZ into use.
// (2) On Linux/Mac, as of this writing it is not necessary for initial value
// of TZ at startup. However, an explicit check is helpful since if someone
// does 'export TZ=Something/Invalid', then runs Miller, and invalid TZ is
// simply *ignored* -- we want to surface that error to the user. (3) On any
// platform this is necessary for *changing* TZ mid-process: e.g. if a DSL
// statement does 'ENV["TZ"] = Asia/Istanbul'.
func SetTZFromEnv() error {
tzenv := os.Getenv("TZ")
location, err := time.LoadLocation(tzenv)
if err != nil {
return fmt.Errorf("TZ environment variable appears malformed: \"%s\"", tzenv)
}
time.Local = location
return nil
}
// Sec2GMT formats float seconds-since-epoch in UTC as
// "YYYY-MM-DDTHH:MM:SSZ", with the given number of decimal places on the
// seconds.
func Sec2GMT(epochSeconds float64, numDecimalPlaces int) string {
	return secToFormattedTime(epochSeconds, numDecimalPlaces, false, nil)
}

// Nsec2GMT is like Sec2GMT but takes integer nanoseconds-since-epoch.
func Nsec2GMT(epochNanoseconds int64, numDecimalPlaces int) string {
	return nsecToFormattedTime(epochNanoseconds, numDecimalPlaces, false, nil)
}

// Sec2LocalTime is like Sec2GMT but formats in the $TZ-specified local
// timezone.
func Sec2LocalTime(epochSeconds float64, numDecimalPlaces int) string {
	return secToFormattedTime(epochSeconds, numDecimalPlaces, true, nil)
}

// Nsec2LocalTime is like Nsec2GMT but formats in the $TZ-specified local
// timezone.
func Nsec2LocalTime(epochNanoseconds int64, numDecimalPlaces int) string {
	return nsecToFormattedTime(epochNanoseconds, numDecimalPlaces, true, nil)
}

// Sec2LocationTime is like Sec2LocalTime but with an explicitly supplied
// timezone location.
func Sec2LocationTime(epochSeconds float64, numDecimalPlaces int, location *time.Location) string {
	return secToFormattedTime(epochSeconds, numDecimalPlaces, true, location)
}

// Nsec2LocationTime is like Nsec2LocalTime but with an explicitly supplied
// timezone location.
func Nsec2LocationTime(epochNanoseconds int64, numDecimalPlaces int, location *time.Location) string {
	return nsecToFormattedTime(epochNanoseconds, numDecimalPlaces, true, location)
}
// secToFormattedTime is for DSL functions sec2gmt and sec2localtime. If doLocal is
// false, use UTC. Else if location is nil, use $TZ environment variable. Else
// use the specified location.
func secToFormattedTime(epochSeconds float64, numDecimalPlaces int, doLocal bool, location *time.Location) string {
intPart := int64(epochSeconds)
fractionalPart := epochSeconds - float64(intPart)
if fractionalPart < 0 {
intPart -= 1
fractionalPart += 1.0
}
t := time.Unix(intPart, int64(fractionalPart*1e9))
return goTimeToFormattedTime(t, numDecimalPlaces, doLocal, location)
}
// nsecToFormattedTime is for DSL functions nsec2gmt and nsec2localtime. If doLocal is
// false, use UTC. Else if location is nil, use $TZ environment variable. Else
// use the specified location.
func nsecToFormattedTime(epochNanoseconds int64, numDecimalPlaces int, doLocal bool, location *time.Location) string {
t := time.Unix(epochNanoseconds/1000000000, epochNanoseconds%1000000000)
return goTimeToFormattedTime(t, numDecimalPlaces, doLocal, location)
}
// This is how much to divide nanoseconds by to get a desired number of decimal places
var nsToFracDivisors = []int{
/* 0 */ 0, /* unused */
/* 1 */ 100000000,
/* 2 */ 10000000,
/* 3 */ 1000000,
/* 4 */ 100000,
/* 5 */ 10000,
/* 6 */ 1000,
/* 7 */ 100,
/* 8 */ 10,
/* 9 */ 1,
}
func goTimeToFormattedTime(t time.Time, numDecimalPlaces int, doLocal bool, location *time.Location) string {
if doLocal {
if location != nil {
t = t.In(location)
} else {
t = t.Local()
}
} else {
t = t.UTC()
}
YYYY := t.Year()
MM := int(t.Month())
DD := t.Day()
hh := t.Hour()
mm := t.Minute()
ss := t.Second()
if numDecimalPlaces < 0 {
numDecimalPlaces = 0
} else if numDecimalPlaces > 9 {
numDecimalPlaces = 9
}
if numDecimalPlaces == 0 {
if doLocal {
return fmt.Sprintf(
"%04d-%02d-%02d %02d:%02d:%02d",
YYYY, MM, DD, hh, mm, ss)
} else {
return fmt.Sprintf(
"%04d-%02d-%02dT%02d:%02d:%02dZ",
YYYY, MM, DD, hh, mm, ss)
}
} else {
fractionalPart := t.Nanosecond() / nsToFracDivisors[numDecimalPlaces]
if doLocal {
return fmt.Sprintf(
"%04d-%02d-%02d %02d:%02d:%02d.%0*d",
YYYY, MM, DD, hh, mm, ss, numDecimalPlaces, fractionalPart)
} else {
return fmt.Sprintf(
"%04d-%02d-%02dT%02d:%02d:%02d.%0*dZ",
YYYY, MM, DD, hh, mm, ss, numDecimalPlaces, fractionalPart)
}
}
}
// EpochSecondsToGMT converts fractional epoch seconds to a UTC time.Time.
func EpochSecondsToGMT(epochSeconds float64) time.Time {
	return epochSecondsToTime(epochSeconds, false, nil)
}

// EpochNanosecondsToGMT converts epoch nanoseconds to a UTC time.Time.
func EpochNanosecondsToGMT(epochNanoseconds int64) time.Time {
	return epochNanosecondsToTime(epochNanoseconds, false, nil)
}

// EpochSecondsToLocalTime converts fractional epoch seconds to a time.Time in
// the $TZ timezone.
func EpochSecondsToLocalTime(epochSeconds float64) time.Time {
	return epochSecondsToTime(epochSeconds, true, nil)
}

// EpochNanosecondsToLocalTime converts epoch nanoseconds to a time.Time in
// the $TZ timezone.
func EpochNanosecondsToLocalTime(epochNanoseconds int64) time.Time {
	return epochNanosecondsToTime(epochNanoseconds, true, nil)
}

// EpochSecondsToLocationTime converts fractional epoch seconds to a time.Time
// in the given timezone.
func EpochSecondsToLocationTime(epochSeconds float64, location *time.Location) time.Time {
	return epochSecondsToTime(epochSeconds, true, location)
}

// EpochNanosecondsToLocationTime converts epoch nanoseconds to a time.Time in
// the given timezone.
func EpochNanosecondsToLocationTime(epochNanoseconds int64, location *time.Location) time.Time {
	return epochNanosecondsToTime(epochNanoseconds, true, location)
}

// epochSecondsToTime splits fractional seconds into (sec, nsec) for
// time.Unix, then applies the requested timezone: UTC when doLocal is false;
// the $TZ timezone when location is nil; else the given location.
func epochSecondsToTime(epochSeconds float64, doLocal bool, location *time.Location) time.Time {
	seconds := int64(epochSeconds)
	nanos := int64((epochSeconds - float64(seconds)) * 1e9)
	t := time.Unix(seconds, nanos)
	if !doLocal {
		return t.UTC()
	}
	if location == nil {
		return t.Local()
	}
	return t.In(location)
}

// epochNanosecondsToTime splits nanoseconds into (sec, nsec) for time.Unix,
// with the same timezone selection as epochSecondsToTime.
func epochNanosecondsToTime(epochNanoseconds int64, doLocal bool, location *time.Location) time.Time {
	t := time.Unix(epochNanoseconds/1000000000, epochNanoseconds%1000000000)
	if !doLocal {
		return t.UTC()
	}
	if location == nil {
		return t.Local()
	}
	return t.In(location)
}

101
pkg/lib/time_test.go Normal file
View file

@ -0,0 +1,101 @@
// ================================================================
// Most Miller tests (thousands of them) are command-line-driven via
// mlr regtest. Here are some cases needing special focus.
// ================================================================
package lib
import (
"time"
"github.com/stretchr/testify/assert"
"testing"
)
// ----------------------------------------------------------------
type tDataForSec2GMT struct {
epochSeconds float64
numDecimalPlaces int
expectedOutput string
}
var dataForSec2GMT = []tDataForSec2GMT{
{0.0, 0, "1970-01-01T00:00:00Z"},
{0.0, 6, "1970-01-01T00:00:00.000000Z"},
{1.0, 6, "1970-01-01T00:00:01.000000Z"},
{123456789.25, 3, "1973-11-29T21:33:09.250Z"},
}
func TestSec2GMT(t *testing.T) {
for _, entry := range dataForSec2GMT {
assert.Equal(t, entry.expectedOutput, Sec2GMT(entry.epochSeconds, entry.numDecimalPlaces))
}
}
// ----------------------------------------------------------------
type tDataForNsec2GMT struct {
epochNanoseconds int64
numDecimalPlaces int
expectedOutput string
}
var dataForNsec2GMT = []tDataForNsec2GMT{
{0, 0, "1970-01-01T00:00:00Z"},
{0, 6, "1970-01-01T00:00:00.000000Z"},
{946684800123456789, 0, "2000-01-01T00:00:00Z"},
{946684800123456789, 1, "2000-01-01T00:00:00.1Z"},
{946684800123456789, 2, "2000-01-01T00:00:00.12Z"},
{946684800123456789, 3, "2000-01-01T00:00:00.123Z"},
{946684800123456789, 4, "2000-01-01T00:00:00.1234Z"},
{946684800123456789, 5, "2000-01-01T00:00:00.12345Z"},
{946684800123456789, 6, "2000-01-01T00:00:00.123456Z"},
{946684800123456789, 7, "2000-01-01T00:00:00.1234567Z"},
{946684800123456789, 8, "2000-01-01T00:00:00.12345678Z"},
{946684800123456789, 9, "2000-01-01T00:00:00.123456789Z"},
}
func TestNsec2GMT(t *testing.T) {
for _, entry := range dataForNsec2GMT {
actualOutput := Nsec2GMT(entry.epochNanoseconds, entry.numDecimalPlaces)
assert.Equal(t, entry.expectedOutput, actualOutput)
}
}
// ----------------------------------------------------------------
type tDataForEpochSecondsToGMT struct {
epochSeconds float64
expectedOutput time.Time
}
var dataForEpochSecondsToGMT = []tDataForEpochSecondsToGMT{
{0.0, time.Unix(0, 0).UTC()},
{1.25, time.Unix(1, 250000000).UTC()},
{123456789.25, time.Unix(123456789, 250000000).UTC()},
}
func TestEpochSecondsToGMT(t *testing.T) {
for _, entry := range dataForEpochSecondsToGMT {
assert.Equal(t, entry.expectedOutput, EpochSecondsToGMT(entry.epochSeconds))
}
}
// ----------------------------------------------------------------
type tDataForEpochNanosecondsToGMT struct {
epochNanoseconds int64
expectedOutput time.Time
}
var dataForEpochNanosecondsToGMT = []tDataForEpochNanosecondsToGMT{
{0, time.Unix(0, 0).UTC()},
{1000000000, time.Unix(1, 0).UTC()},
{1200000000, time.Unix(1, 200000000).UTC()},
{-1000000000, time.Unix(-1, 0).UTC()},
{-1200000000, time.Unix(-1, -200000000).UTC()},
{123456789250000047, time.Unix(123456789, 250000047).UTC()},
}
func TestEpochNanosecondsToGMT(t *testing.T) {
for _, entry := range dataForEpochNanosecondsToGMT {
assert.Equal(t, entry.expectedOutput, EpochNanosecondsToGMT(entry.epochNanoseconds))
}
}

67
pkg/lib/tsv_codec.go Normal file
View file

@ -0,0 +1,67 @@
package lib
import (
"bytes"
)
// * https://en.wikipedia.org/wiki/Tab-separated_values
// * https://www.iana.org/assignments/media-types/text/tab-separated-values
// \n for newline,
// \r for carriage return,
// \t for tab,
// \\ for backslash.
// TSVDecodeField is for the TSV record-reader. It expands the IANA TSV escape
// sequences \n, \r, \t, and \\; any other backslash sequence (including a
// trailing backslash) is passed through unmodified.
func TSVDecodeField(input string) string {
	var output bytes.Buffer
	i, n := 0, len(input)
	for i < n {
		c := input[i]
		// An ordinary byte, or a backslash with nothing after it.
		if c != '\\' || i+1 >= n {
			output.WriteByte(c)
			i++
			continue
		}
		switch input[i+1] {
		case '\\':
			output.WriteByte('\\')
			i += 2
		case 'n':
			output.WriteByte('\n')
			i += 2
		case 'r':
			output.WriteByte('\r')
			i += 2
		case 't':
			output.WriteByte('\t')
			i += 2
		default:
			// Unknown escape: keep the backslash and let the next byte be
			// handled on its own.
			output.WriteByte(c)
			i++
		}
	}
	return output.String()
}
// TSVEncodeField is for the TSV record-writer. It produces the IANA TSV
// escape sequences \n, \r, \t, and \\; all other runes pass through as-is.
func TSVEncodeField(input string) string {
	var output bytes.Buffer
	for _, r := range input {
		switch r {
		case '\\':
			output.WriteString(`\\`)
		case '\n':
			output.WriteString(`\n`)
		case '\r':
			output.WriteString(`\r`)
		case '\t':
			output.WriteString(`\t`)
		default:
			output.WriteRune(r)
		}
	}
	return output.String()
}

35
pkg/lib/tsv_codec_test.go Normal file
View file

@ -0,0 +1,35 @@
package lib
import (
"testing"
"github.com/stretchr/testify/assert"
)
// TestTSVDecodeField checks expansion of the TSV escape sequences, including
// pass-through of lone and doubled backslashes.
func TestTSVDecodeField(t *testing.T) {
	cases := []struct {
		input    string
		expected string
	}{
		{"", ""},
		{"a", "a"},
		{"abc", "abc"},
		{`\`, `\`},
		{`\n`, "\n"},
		{`\r`, "\r"},
		{`\t`, "\t"},
		{`\\`, "\\"},
		{`\\n`, `\n`},
		{`\\\n`, "\\\n"},
		{`abc\r\ndef\r\n`, "abc\r\ndef\r\n"},
	}
	for _, testCase := range cases {
		assert.Equal(t, testCase.expected, TSVDecodeField(testCase.input))
	}
}

// TestTSVEncodeField checks production of the TSV escape sequences.
func TestTSVEncodeField(t *testing.T) {
	cases := []struct {
		input    string
		expected string
	}{
		{"", ""},
		{"a", "a"},
		{"abc", "abc"},
		{`\`, `\\`},
		{"\n", `\n`},
		{"\r", `\r`},
		{"\t", `\t`},
		{"\\", `\\`},
		{"\\n", `\\n`},
		{"\\\n", `\\\n`},
		{"abc\r\ndef\r\n", `abc\r\ndef\r\n`},
	}
	for _, testCase := range cases {
		assert.Equal(t, testCase.expected, TSVEncodeField(testCase.input))
	}
}

246
pkg/lib/unbackslash.go Normal file
View file

@ -0,0 +1,246 @@
// ================================================================
// See cst.BuildStringLiteralNode for more context.
// ================================================================
package lib
import (
"bytes"
"strconv"
)
// unbackslashReplacements maps a byte following a backslash to its expansion.
// Note the Go-source backslashing here: at the Miller-user level, "\\" in a
// DSL string literal becomes a single backslash, "\'" becomes "'", etc.
var unbackslashReplacements = map[byte]string{
	'a':  "\a",
	'b':  "\b",
	'f':  "\f",
	'n':  "\n",
	'r':  "\r",
	't':  "\t",
	'v':  "\v",
	'\\': "\\",
	'\'': "'",
	'"':  "\"",
	'?':  "?",
}

// UnbackslashStringLiteral replaces "\t" with TAB, etc. for DSL expressions
// like '$foo = "a\tb"'. See also
// https://en.wikipedia.org/wiki/Escape_sequences_in_C
// (predates the port of Miller from C to Go).
//
// We don't simply strconv.Unquote the whole string: if one part of the string
// held a valid "\t" and another part held something Unquote rejects, the
// former would go unprocessed. A consequence of scanning by hand is that we
// require exactly four hex digits after \u and exactly eight after \U.
//
// Note that a CST-build pre-pass intentionally excludes regex literals (2nd
// argument to sub/gsub/regextract/etc) from being modified here.
//
// Note "\0" .. "\9" are used for regex captures within the DSL CST builder
// and are not touched here. (See also lib/regex.go.)
func UnbackslashStringLiteral(input string) string {
	var output bytes.Buffer
	n := len(input)
	i := 0
	for i < n {
		c := input[i]
		// Ordinary byte, or a trailing backslash with nothing after it.
		if c != '\\' || i == n-1 {
			output.WriteByte(c)
			i++
			continue
		}
		next := input[i+1]
		if replacement, ok := unbackslashReplacements[next]; ok {
			output.WriteString(replacement)
			i += 2
			continue
		}
		if ok, code := isBackslashOctal(input[i:]); ok {
			output.WriteByte(byte(code))
			i += 4
			continue
		}
		if ok, code := isBackslashHex(input[i:]); ok {
			output.WriteByte(byte(code))
			i += 4
			continue
		}
		if ok, s := isUnicode4(input[i:]); ok {
			output.WriteString(s)
			i += 6
			continue
		}
		if ok, s := isUnicode8(input[i:]); ok {
			output.WriteString(s)
			i += 10
			continue
		}
		// Unrecognized escape: keep it verbatim.
		output.WriteByte('\\')
		output.WriteByte(next)
		i += 2
	}
	return output.String()
}

// UnhexStringLiteral is like UnbackslashStringLiteral but only unhexes things
// like "\x1f". This is for IFS and IPS setup; see the cli package.
func UnhexStringLiteral(input string) string {
	var output bytes.Buffer
	n := len(input)
	i := 0
	for i < n {
		c := input[i]
		// Ordinary byte, or a trailing backslash with nothing after it.
		if c != '\\' || i == n-1 {
			output.WriteByte(c)
			i++
			continue
		}
		if ok, code := isBackslashHex(input[i:]); ok {
			output.WriteByte(byte(code))
			i += 4
		} else {
			output.WriteByte('\\')
			output.WriteByte(input[i+1])
			i += 2
		}
	}
	return output.String()
}

// isBackslashOctal reports whether the string starts with a backslash
// followed by three octal digits, and if so returns the decoded byte value.
// E.g. "\123" becomes 83 (in decimal).
func isBackslashOctal(input string) (bool, int) {
	if len(input) < 4 || input[0] != '\\' {
		return false, 0
	}
	code := 0
	for k := 1; k <= 3; k++ {
		ok, digit := isOctalDigit(input[k])
		if !ok {
			return false, 0
		}
		code = 8*code + int(digit)
	}
	return true, code
}

// isOctalDigit maps '0'..'7' to 0..7.
func isOctalDigit(b byte) (bool, byte) {
	if b < '0' || b > '7' {
		return false, 0
	}
	return true, b - '0'
}

// isBackslashHex reports whether the string starts with \x (or \X) followed
// by two hex digits, and if so returns the decoded byte value. E.g. "\xff"
// becomes 255 (in decimal).
func isBackslashHex(input string) (bool, int) {
	if len(input) < 4 || input[0] != '\\' {
		return false, 0
	}
	if input[1] != 'x' && input[1] != 'X' {
		return false, 0
	}
	okHi, hi := isHexDigit(input[2])
	okLo, lo := isHexDigit(input[3])
	if !okHi || !okLo {
		return false, 0
	}
	return true, 16*int(hi) + int(lo)
}

// isHexDigit maps '0'-'9', 'a'-'f', 'A'-'F' to 0..15.
func isHexDigit(b byte) (bool, byte) {
	switch {
	case '0' <= b && b <= '9':
		return true, b - '0'
	case 'a' <= b && b <= 'f':
		return true, b - 'a' + 10
	case 'A' <= b && b <= 'F':
		return true, b - 'A' + 10
	default:
		return false, 0
	}
}

// isUnicode4 tries to parse a four-hex-digit escape, e.g. "\u2766".
func isUnicode4(input string) (bool, string) {
	if len(input) < 6 || input[0:2] != `\u` {
		return false, ""
	}
	if s, err := strconv.Unquote(`"` + input[0:6] + `"`); err == nil {
		return true, s
	}
	return false, ""
}

// isUnicode8 tries to parse an eight-hex-digit escape, e.g. "\U00010877".
func isUnicode8(input string) (bool, string) {
	if len(input) < 10 || input[0:2] != `\U` {
		return false, ""
	}
	if s, err := strconv.Unquote(`"` + input[0:10] + `"`); err == nil {
		return true, s
	}
	return false, ""
}

View file

@ -0,0 +1,45 @@
// ================================================================
// Most Miller tests (thousands of them) are command-line-driven via
// mlr regtest. Here are some cases needing special focus.
// ================================================================
package lib
import (
"testing"
)
type tDataForUnbackslash struct {
	input          string
	expectedOutput string
}

// Note we are here dealing with Go's backslashing conventions.
// At the Miller user-space level this is simply "\t" -> TAB, etc.
var dataForUnbackslash = []tDataForUnbackslash{
	{"", ""},
	{"abcde", "abcde"},
	{`\1`, `\1`}, // regex-capture syntax is passed through untouched
	{`a\tb\tc`, "a\tb\tc"},
	{`a\fb\rc`, "a\fb\rc"},
	{`a"b"c`, `a"b"c`},
	{`a\"b\"c`, `a"b"c`},
	{`a\102c`, `aBc`},
	{`a\x42c`, `aBc`},
	{`[\101\102\103]`, `[ABC]`},
	{`[\x44\x45\x46]`, `[DEF]`},
	// The expected value was previously the empty string, which the decoder
	// can never produce for this input; `\u2766` decodes to U+2766.
	{`\u2766`, "\u2766"},
	{`\U00010877`, `𐡷`},
	{`a\u0062c`, `abc`},
}

func TestUnbackslash(t *testing.T) {
	for i, entry := range dataForUnbackslash {
		actualOutput := UnbackslashStringLiteral(entry.input)
		if actualOutput != entry.expectedOutput {
			t.Fatalf("case %d input \"%s\" expected \"%s\" got \"%s\"\n",
				i, entry.input, entry.expectedOutput, actualOutput,
			)
		}
	}
}

229
pkg/lib/util.go Normal file
View file

@ -0,0 +1,229 @@
package lib
import (
"fmt"
"io/ioutil"
"os"
"sort"
"strconv"
"strings"
"unicode/utf8"
)
// BooleanXOR returns the exclusive-or of its arguments.
func BooleanXOR(a, b bool) bool {
	return a != b
}

// BoolToInt maps false to 0 and true to 1.
func BoolToInt(b bool) int64 {
	// Direct test rather than 'b == false' per Go idiom.
	if b {
		return 1
	}
	return 0
}

// Plural returns the suffix for an English plural: "" for n == 1, else "s".
func Plural(n int) string {
	if n == 1 {
		return ""
	}
	return "s"
}
// SplitString is like strings.Split, except that splitting the empty string
// yields a zero-length slice rather than the one-element slice [""] that
// strings.Split produces.
func SplitString(input string, separator string) []string {
	if input == "" {
		return []string{}
	}
	return strings.Split(input, separator)
}
// StringListToSet converts a slice of strings to a membership map. A nil
// input yields a nil output.
func StringListToSet(stringList []string) map[string]bool {
	if stringList == nil {
		return nil
	}
	set := make(map[string]bool, len(stringList))
	for _, element := range stringList {
		set[element] = true
	}
	return set
}
// SortStrings sorts the given slice in place, ascending. (The parameter is
// named to avoid shadowing the "strings" package, which the previous name
// did; sort.Strings replaces the equivalent hand-rolled sort.Slice call.)
func SortStrings(slice []string) {
	sort.Strings(slice)
}

// ReverseStringList reverses the given slice in place.
func ReverseStringList(slice []string) {
	for i, j := 0, len(slice)-1; i < j; i, j = i+1, j-1 {
		slice[i], slice[j] = slice[j], slice[i]
	}
}

// SortedStrings returns an ascending-sorted copy, leaving the input
// unmodified. (The local is named to avoid shadowing the copy builtin, which
// the previous name did.)
func SortedStrings(slice []string) []string {
	output := make([]string, len(slice))
	copy(output, slice)
	sort.Strings(output)
	return output
}
// IntMin2 returns the smaller of its two arguments.
func IntMin2(a, b int64) int64 {
	if b < a {
		return b
	}
	return a
}
// TryIntFromString tries decimal, hex, octal, and binary.
func TryIntFromString(input string) (int64, bool) {
	// Go's strconv parses "1_2" as 12; not OK for Miller syntax. (Also not
	// valid JSON.)
	if strings.ContainsRune(input, '_') {
		return 0, false
	}
	// Following twos-complement formatting familiar from all manner of
	// languages, including C which was Miller's original implementation
	// language, we want to allow 0x00....00 through 0x7f....ff as positive
	// 64-bit integers and 0x80....00 through 0xff....ff as negative ones.
	// Go's signed-int parsing explicitly doesn't allow that, but we don't
	// want Go semantics to dictate Miller semantics. So: signed parsing
	// first (covers 0x00....00 through 0x7f....ff, plus positive/negative
	// decimal), then unsigned parsing (covers 0x80....00 through
	// 0xff....ff), reinterpreted as int64.
	if value, err := strconv.ParseInt(input, 0 /* infer base */, 64); err == nil {
		return value, true
	}
	if value, err := strconv.ParseUint(input, 0 /* infer base */, 64); err == nil {
		return int64(value), true
	}
	return 0, false
}
// TryIntFromStringWithBase allows the user to choose the base that's used,
// rather than inferring from 0x prefix, etc as TryIntFromString does.
func TryIntFromStringWithBase(input string, base int64) (int64, bool) {
	// Go's strconv parses "1_2" as 12; not OK for Miller syntax. (Also not
	// valid JSON.)
	if strings.ContainsRune(input, '_') {
		return 0, false
	}
	// Signed parse first, then unsigned reinterpreted as int64; see
	// TryIntFromString for the twos-complement rationale.
	if value, err := strconv.ParseInt(input, int(base), 64); err == nil {
		return value, true
	}
	if value, err := strconv.ParseUint(input, int(base), 64); err == nil {
		return int64(value), true
	}
	return 0, false
}
// TryFloatFromString attempts to parse a 64-bit float, returning the value
// and a success flag.
func TryFloatFromString(input string) (float64, bool) {
	// Go's strconv parses "1_2.3_4" as 12.34; not OK for Miller syntax.
	// (Also not valid JSON.)
	if strings.ContainsRune(input, '_') {
		return 0, false
	}
	if value, err := strconv.ParseFloat(input, 64); err == nil {
		return value, true
	}
	return 0, false
}

// TryBoolFromBoolString accepts exactly "true" or "false", returning the
// value and a success flag.
func TryBoolFromBoolString(input string) (bool, bool) {
	switch input {
	case "true":
		return true, true
	case "false":
		return false, true
	default:
		return false, false
	}
}
// GetArrayKeysSorted returns the map's keys in ascending sorted order, for
// the benefit of map-printers needing deterministic output. (Go maps don't
// preserve insertion order, and iteration order is unspecified.)
func GetArrayKeysSorted(input map[string]string) []string {
	keys := make([]string, 0, len(input))
	for key := range input {
		keys = append(keys, key)
	}
	sort.Strings(keys)
	return keys
}
// WriteTempFileOrDie places the contents string into a temp file, which the
// caller must remove. On any I/O failure it prints a diagnostic to stderr
// (not stdout, so data output is not polluted) and exits the process.
func WriteTempFileOrDie(contents string) string {
	// Use "" as first argument to ioutil.TempFile to use default directory.
	// Nominally "/tmp" or somesuch on all unix-like systems, but not for Windows.
	handle, err := ioutil.TempFile("", "mlr-temp")
	if err != nil {
		fmt.Fprintf(os.Stderr, "mlr: could not create temp file: %v\n", err)
		os.Exit(1)
	}
	if _, err := handle.WriteString(contents); err != nil {
		fmt.Fprintf(os.Stderr, "mlr: could not populate temp file: %v\n", err)
		os.Exit(1)
	}
	if err := handle.Close(); err != nil {
		fmt.Fprintf(os.Stderr, "mlr: could not finish write of temp file: %v\n", err)
		os.Exit(1)
	}
	return handle.Name()
}
// CopyStringArray returns a fresh copy of the input slice; mutating one does
// not affect the other.
func CopyStringArray(input []string) []string {
	output := make([]string, len(input))
	for i, element := range input {
		output[i] = element
	}
	return output
}

// StripEmpties returns a copy of the input with all empty-string elements
// removed, preserving the order of the rest.
func StripEmpties(input []string) []string {
	output := make([]string, 0, len(input))
	for _, element := range input {
		if element == "" {
			continue
		}
		output = append(output, element)
	}
	return output
}
// UTF8Strlen returns the length of the string in runes, not bytes.
func UTF8Strlen(s string) int64 {
	runeCount := utf8.RuneCountInString(s)
	return int64(runeCount)
}