mirror of
https://github.com/johnkerl/miller.git
synced 2026-01-23 02:14:13 +00:00
Export library code in pkg/ (#1391)
* Export library code in `pkg/` * new doc page
This commit is contained in:
parent
93b7c8eac0
commit
268a96d002
358 changed files with 1076 additions and 693 deletions
1
pkg/lib/README.md
Normal file
1
pkg/lib/README.md
Normal file
|
|
@ -0,0 +1 @@
|
|||
These are basic library routines for Miller.
|
||||
2
pkg/lib/doc.go
Normal file
2
pkg/lib/doc.go
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
// Package lib contains basic library routines for Miller.
|
||||
package lib
|
||||
3
pkg/lib/docurl.go
Normal file
3
pkg/lib/docurl.go
Normal file
|
|
@ -0,0 +1,3 @@
|
|||
package lib
|
||||
|
||||
const DOC_URL = "https://miller.readthedocs.io"
|
||||
322
pkg/lib/file_readers.go
Normal file
322
pkg/lib/file_readers.go
Normal file
|
|
@ -0,0 +1,322 @@
|
|||
// ================================================================
|
||||
// Wrapper for os.Open which maps string filename to *os.File, which in turn
|
||||
// implements io.ReadCloser, and optionally in turn wrapping that in a
|
||||
// gzip/zlib/bunzip2 reader. Shared across record-readers for all the various
|
||||
// input-file formats (CSV, JSON, XTAB, DKVP, NIDX, PPRINT) which Miller
|
||||
// supports.
|
||||
//
|
||||
// There are two ways of handling compressed data in the Miller Go port:
|
||||
//
|
||||
// * A user-specified 'prepipe' command such as 'gunzip', where we popen a
|
||||
// process, hand it the filename via '< filename', and read from that pipe;
|
||||
//
|
||||
// * An indication to use an in-process encoding reader (gzip or bzip2, etc).
|
||||
//
|
||||
// If a prepipe is specified, it is used; else if an encoding is specified, it
|
||||
// is used; otherwise the file suffix (.bz2, .gz, .z) is consulted; otherwise
|
||||
// the file is treated as text.
|
||||
// ================================================================
|
||||
|
||||
package lib
|
||||
|
||||
import (
	"bytes"
	"compress/bzip2"
	"compress/gzip"
	"compress/zlib"
	"fmt"
	"io"
	"net/http"
	"os"
	"runtime"
	"strings"

	"github.com/klauspost/compress/zstd"
)
|
||||
|
||||
// TFileInputEncoding identifies which in-process decompression, if any, is to
// be applied to an input stream.
type TFileInputEncoding int

const (
	// FileInputEncodingDefault means no encoding was explicitly requested.
	FileInputEncodingDefault TFileInputEncoding = iota
	// FileInputEncodingBzip2 selects bzip2.
	FileInputEncodingBzip2
	// FileInputEncodingGzip selects gzip.
	FileInputEncodingGzip
	// FileInputEncodingZlib selects zlib.
	FileInputEncodingZlib
	// FileInputEncodingZstd selects zstd.
	FileInputEncodingZstd
)
|
||||
|
||||
// OpenFileForRead: If prepipe is non-empty, popens "{prepipe} < {filename}"
|
||||
// and returns a handle to that where prepipe is nominally things like
|
||||
// "gunzip", "cat", etc. Otherwise, delegates to an in-process reader which
|
||||
// can natively handle gzip/bzip2/zlib depending on the specified encoding. If
|
||||
// the encoding isn't a compression encoding, this ends up being simply
|
||||
// os.Open.
|
||||
func OpenFileForRead(
|
||||
filename string,
|
||||
prepipe string,
|
||||
prepipeIsRaw bool,
|
||||
encoding TFileInputEncoding, // ignored if prepipe is non-empty
|
||||
) (io.ReadCloser, error) {
|
||||
if prepipe != "" {
|
||||
return openPrepipedHandleForRead(filename, prepipe, prepipeIsRaw)
|
||||
} else {
|
||||
handle, err := PathToHandle(filename)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return openEncodedHandleForRead(handle, encoding, filename)
|
||||
}
|
||||
}
|
||||
|
||||
// PathToHandle maps various back-ends to a stream. The following URI schemes
// are supported:
//   - https://... and http://...
//   - file://...
//   - plain disk files
//
// For HTTP(S), a response whose status is not 2xx is reported as an error
// rather than having its body (typically an error page) returned as if it were
// the requested data.
//
// On error the returned handle is a literal nil.
func PathToHandle(
	path string,
) (io.ReadCloser, error) {
	if strings.HasPrefix(path, "http://") || strings.HasPrefix(path, "https://") {
		resp, err := http.Get(path)
		if err != nil {
			return nil, err
		}
		if resp.StatusCode < 200 || resp.StatusCode > 299 {
			// Nothing useful to read; the status is the error to report.
			_ = resp.Body.Close()
			return nil, fmt.Errorf("GET %s: %s", path, resp.Status)
		}
		return resp.Body, nil
	}

	// TrimPrefix is a no-op for paths without the file:// scheme.
	file, err := os.Open(strings.TrimPrefix(path, "file://"))
	if err != nil {
		// Return a literal nil so that callers never see a non-nil interface
		// wrapping a nil *os.File.
		return nil, err
	}
	return file, nil
}
|
||||
|
||||
// OpenStdin: if prepipe is non-empty, popens "{prepipe}" and returns a handle
|
||||
// to that where prepipe is nominally things like "gunzip", "cat", etc.
|
||||
// Otherwise, delegates to an in-process reader which can natively handle
|
||||
// gzip/bzip2/zlib depending on the specified encoding. If the encoding isn't
|
||||
// a compression encoding, this ends up being simply os.Stdin.
|
||||
func OpenStdin(
|
||||
prepipe string,
|
||||
prepipeIsRaw bool,
|
||||
encoding TFileInputEncoding, // ignored if prepipe is non-empty
|
||||
) (io.ReadCloser, error) {
|
||||
if prepipe != "" {
|
||||
return openPrepipedHandleForRead("", prepipe, prepipeIsRaw)
|
||||
} else {
|
||||
return openEncodedHandleForRead(os.Stdin, encoding, "")
|
||||
}
|
||||
}
|
||||
|
||||
func openPrepipedHandleForRead(
|
||||
filename string,
|
||||
prepipe string,
|
||||
prepipeIsRaw bool,
|
||||
) (io.ReadCloser, error) {
|
||||
escapedFilename := escapeFileNameForPopen(filename)
|
||||
|
||||
var command string
|
||||
if filename == "" { // stdin
|
||||
command = prepipe
|
||||
} else {
|
||||
if prepipeIsRaw {
|
||||
command = prepipe + " " + escapedFilename
|
||||
} else {
|
||||
command = prepipe + " < " + escapedFilename
|
||||
}
|
||||
}
|
||||
|
||||
return OpenInboundHalfPipe(command)
|
||||
}
|
||||
|
||||
// escapeFileNameForPopen quotes filename for splicing into a shell command
// line, so that the shell sees it as exactly one word and interprets none of
// its characters.
//
// On non-Windows platforms: a name made up solely of characters that are
// never special to a POSIX shell is returned unchanged. Anything else is
// wrapped in single quotes, with each embedded single quote written as '\''
// (close the quote, add an escaped quote, reopen the quote), since nothing --
// not even backslash -- is special inside single quotes. The name is handled
// bytewise so that names which are not valid UTF-8 pass through intact.
//
// On Windows the previous escaping is retained unchanged, as single quotes are
// presumably not a quoting mechanism for the shell used there.
//
// TODO: test on Windows. Maybe needs move to pkg/platform.
func escapeFileNameForPopen(filename string) string {
	if runtime.GOOS == "windows" {
		return legacyEscapeFileNameForPopen(filename)
	}
	if isShellSafeFileName(filename) {
		return filename
	}

	var buffer bytes.Buffer
	buffer.WriteByte('\'')
	for i := 0; i < len(filename); i++ {
		if filename[i] == '\'' {
			buffer.WriteString(`'\''`)
		} else {
			buffer.WriteByte(filename[i])
		}
	}
	buffer.WriteByte('\'')
	return buffer.String()
}

// isShellSafeFileName reports whether filename consists only of ASCII
// letters, digits, and punctuation with no special meaning to a POSIX shell.
// The empty string counts as safe.
func isShellSafeFileName(filename string) bool {
	for i := 0; i < len(filename); i++ {
		b := filename[i]
		switch {
		case 'a' <= b && b <= 'z', 'A' <= b && b <= 'Z', '0' <= b && b <= '9':
		case strings.IndexByte("_@%+=:,./-", b) >= 0:
		default:
			return false
		}
	}
	return true
}

// legacyEscapeFileNameForPopen wraps each single or double quote in a pair of
// single quotes and leaves every other byte alone.
func legacyEscapeFileNameForPopen(filename string) string {
	var buffer bytes.Buffer
	for i := 0; i < len(filename); i++ {
		b := filename[i]
		if b == '\'' || b == '"' {
			buffer.WriteByte('\'')
			buffer.WriteByte(b)
			buffer.WriteByte('\'')
		} else {
			buffer.WriteByte(b)
		}
	}
	return buffer.String()
}
|
||||
|
||||
// TODO: comment
|
||||
func openEncodedHandleForRead(
|
||||
handle io.ReadCloser,
|
||||
encoding TFileInputEncoding,
|
||||
filename string,
|
||||
) (io.ReadCloser, error) {
|
||||
switch encoding {
|
||||
case FileInputEncodingBzip2:
|
||||
return NewBZip2ReadCloser(handle), nil
|
||||
case FileInputEncodingGzip:
|
||||
return gzip.NewReader(handle)
|
||||
case FileInputEncodingZlib:
|
||||
return zlib.NewReader(handle)
|
||||
case FileInputEncodingZstd:
|
||||
return NewZstdReadCloser(handle)
|
||||
}
|
||||
|
||||
InternalCodingErrorIf(encoding != FileInputEncodingDefault)
|
||||
|
||||
if strings.HasSuffix(filename, ".bz2") {
|
||||
return NewBZip2ReadCloser(handle), nil
|
||||
}
|
||||
if strings.HasSuffix(filename, ".gz") {
|
||||
return gzip.NewReader(handle)
|
||||
}
|
||||
if strings.HasSuffix(filename, ".z") {
|
||||
return zlib.NewReader(handle)
|
||||
}
|
||||
if strings.HasSuffix(filename, ".zst") {
|
||||
return NewZstdReadCloser(handle)
|
||||
}
|
||||
|
||||
// Pass along os.Stdin or os.Open(filename)
|
||||
return handle, nil
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// BZip2ReadCloser adapts the reader returned by bzip2.NewReader, which is only
// an io.Reader, to io.ReadCloser: reads are decompressed, and Close closes the
// handle the compressed data comes from.
type BZip2ReadCloser struct {
	originalHandle io.ReadCloser
	bzip2Handle    io.Reader
}

// NewBZip2ReadCloser returns a BZip2ReadCloser which decompresses from handle.
func NewBZip2ReadCloser(handle io.ReadCloser) *BZip2ReadCloser {
	rc := &BZip2ReadCloser{originalHandle: handle}
	rc.bzip2Handle = bzip2.NewReader(handle)
	return rc
}

// Read reads decompressed data into p.
func (rc *BZip2ReadCloser) Read(p []byte) (n int, err error) {
	n, err = rc.bzip2Handle.Read(p)
	return n, err
}

// Close closes the handle given to NewBZip2ReadCloser.
func (rc *BZip2ReadCloser) Close() error {
	underlying := rc.originalHandle
	return underlying.Close()
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// ZstdReadCloser remedies the fact that zstd.NewReader does not implement io.ReadCloser.
|
||||
type ZstdReadCloser struct {
|
||||
originalHandle io.ReadCloser
|
||||
zstdHandle io.Reader
|
||||
}
|
||||
|
||||
func NewZstdReadCloser(handle io.ReadCloser) (*ZstdReadCloser, error) {
|
||||
zstdHandle, err := zstd.NewReader(handle)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &ZstdReadCloser{
|
||||
originalHandle: handle,
|
||||
zstdHandle: zstdHandle,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (rc *ZstdReadCloser) Read(p []byte) (n int, err error) {
|
||||
return rc.zstdHandle.Read(p)
|
||||
}
|
||||
|
||||
func (rc *ZstdReadCloser) Close() error {
|
||||
return rc.originalHandle.Close()
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
|
||||
// IsEOF reports whether err signals end of input. Reading past the end of a
// file opened with os.Open yields io.EOF, but reading past the close of a pipe
// opened with popen (e.g. Miller's prepipe, where the input isn't 'foo.dat'
// but rather the process 'gunzip < foo.dat |') yields an error with 'file
// already closed' in its text; both count as EOF here. A nil error does not.
// See also
// https://stackoverflow.com/questions/47486128/why-does-io-pipe-continue-to-block-even-when-eof-is-reached
func IsEOF(err error) bool {
	switch {
	case err == nil:
		return false
	case err == io.EOF:
		return true
	default:
		return strings.Contains(err.Error(), "file already closed")
	}
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// Functions for in-place mode
|
||||
|
||||
// IsUpdateableInPlace tells if the input can be used with mlr -I. It returns
// an error for http://, https://, and file:// URLs, and for any input with a
// prepipe command (which we don't presume to know how to invert for output);
// otherwise it returns nil.
func IsUpdateableInPlace(
	filename string,
	prepipe string,
) error {
	for _, scheme := range []string{"http://", "https://", "file://"} {
		if strings.HasPrefix(filename, scheme) {
			return fmt.Errorf("http://, https://, and file:// URLs are not updateable in place.")
		}
	}
	if prepipe == "" {
		return nil
	}
	return fmt.Errorf("input with --prepipe or --prepipex is not updateable in place.")
}
|
||||
|
||||
// FindInputEncoding determines the input encoding (compression), whether from
|
||||
// a flag like --gzin, or from filename suffix like ".gz". If the user did
|
||||
// --gzin on the command line, TFileInputEncoding will be
|
||||
// FileInputEncodingGzip. If they didn't, but the filename ends in ".gz", then
|
||||
// we auto-infer FileInputEncodingGzip. Either way, this function tells if we
|
||||
// will be using in-process decompression within the file-format-specific
|
||||
// record reader.
|
||||
func FindInputEncoding(
|
||||
filename string,
|
||||
inputFileInputEncoding TFileInputEncoding,
|
||||
) TFileInputEncoding {
|
||||
if inputFileInputEncoding != FileInputEncodingDefault {
|
||||
return inputFileInputEncoding
|
||||
}
|
||||
if strings.HasSuffix(filename, ".bz2") {
|
||||
return FileInputEncodingBzip2
|
||||
}
|
||||
if strings.HasSuffix(filename, ".gz") {
|
||||
return FileInputEncodingGzip
|
||||
}
|
||||
if strings.HasSuffix(filename, ".z") {
|
||||
return FileInputEncodingZlib
|
||||
}
|
||||
return FileInputEncodingDefault
|
||||
}
|
||||
|
||||
// WrapOutputHandle wraps a file-write handle with a decompressor. The first
|
||||
// return value is the wrapped handle. The second is true if the returned
|
||||
// handle needs to be closed separately from the original. The third is for
|
||||
// in-process compression we can't undo: namely, as of September 2021 the gzip
|
||||
// and zlib libraries support write-closers, but the bzip2 library does not.
|
||||
func WrapOutputHandle(
|
||||
fileWriteHandle io.WriteCloser,
|
||||
inputFileEncoding TFileInputEncoding,
|
||||
) (io.WriteCloser, bool, error) {
|
||||
switch inputFileEncoding {
|
||||
case FileInputEncodingBzip2:
|
||||
return fileWriteHandle, false, fmt.Errorf("bzip2 is not currently supported for in-place mode.")
|
||||
case FileInputEncodingGzip:
|
||||
return gzip.NewWriter(fileWriteHandle), true, nil
|
||||
case FileInputEncodingZlib:
|
||||
return zlib.NewWriter(fileWriteHandle), true, nil
|
||||
default:
|
||||
return fileWriteHandle, false, nil
|
||||
}
|
||||
}
|
||||
43
pkg/lib/getoptify.go
Normal file
43
pkg/lib/getoptify.go
Normal file
|
|
@ -0,0 +1,43 @@
|
|||
package lib
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Getoptify expands "-xyz" into "-x -y -z" while leaving "--xyz" intact. This
|
||||
// is a keystroke-saver for the user.
|
||||
//
|
||||
// This is OK to do here globally since Miller is quite consistent (in main,
|
||||
// verbs, auxents, and terminals) that multi-character options start with two
|
||||
// dashes, e.g. "--csv". (The sole exception is the sort verb's -nf/-nr which
|
||||
// are handled specially there.)
|
||||
//
|
||||
// Additionally, we split "--foo=bar" into "--foo" and "bar".
|
||||
func Getoptify(inargs []string) []string {
|
||||
expandRegex := regexp.MustCompile("^-[a-zA-Z0-9]+$")
|
||||
splitRegex := regexp.MustCompile("^--[^=]+=.+$")
|
||||
numberRegex := regexp.MustCompile("^-[0-9]+$")
|
||||
outargs := make([]string, 0)
|
||||
for _, inarg := range inargs {
|
||||
if expandRegex.MatchString(inarg) {
|
||||
if numberRegex.MatchString(inarg) {
|
||||
// Don't expand things like '-12345' which are (likely!) numeric arguments to verbs.
|
||||
// Example: 'mlr unsparsify --fill-with -99999'.
|
||||
outargs = append(outargs, inarg)
|
||||
} else {
|
||||
for _, c := range inarg[1:] {
|
||||
outargs = append(outargs, "-"+string(c))
|
||||
}
|
||||
}
|
||||
} else if splitRegex.MatchString(inarg) {
|
||||
pair := strings.SplitN(inarg, "=", 2)
|
||||
InternalCodingErrorIf(len(pair) != 2)
|
||||
outargs = append(outargs, pair[0])
|
||||
outargs = append(outargs, pair[1])
|
||||
} else {
|
||||
outargs = append(outargs, inarg)
|
||||
}
|
||||
}
|
||||
return outargs
|
||||
}
|
||||
88
pkg/lib/halfpipe.go
Normal file
88
pkg/lib/halfpipe.go
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
package lib
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
|
||||
"github.com/johnkerl/miller/pkg/platform"
|
||||
)
|
||||
|
||||
// OpenOutboundHalfPipe returns a handle to a process. Writing to that handle
|
||||
// writes to the process' stdin. The process' stdout and stderr are the current
|
||||
// process' stdout and stderr.
|
||||
//
|
||||
// This is for pipe-output-redirection in the Miller put/filter DSL.
|
||||
//
|
||||
// Note I am not using os.exec.Cmd which is billed as being simpler than using
|
||||
// os.StartProcess. It may indeed be simpler when you want to handle the
|
||||
// subprocess' stdin/stdout/stderr all three within the parent process. Here I
|
||||
// found it much easier to use os.StartProcess to let the stdout/stderr run
|
||||
// free.
|
||||
|
||||
func OpenOutboundHalfPipe(commandString string) (*os.File, error) {
|
||||
readPipe, writePipe, err := os.Pipe()
|
||||
|
||||
var procAttr os.ProcAttr
|
||||
procAttr.Files = []*os.File{
|
||||
readPipe,
|
||||
os.Stdout,
|
||||
os.Stderr,
|
||||
}
|
||||
|
||||
// /bin/sh -c "..." or cmd /c "..."
|
||||
shellRunArray := platform.GetShellRunArray(commandString)
|
||||
|
||||
process, err := os.StartProcess(shellRunArray[0], shellRunArray, &procAttr)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
go process.Wait()
|
||||
|
||||
return writePipe, nil
|
||||
}
|
||||
|
||||
// OpenInboundHalfPipe returns a handle to a process. Reading from that handle
|
||||
// reads from the process' stdout. The process' stdin and stderr are the
|
||||
// current process' stdin and stderr.
|
||||
//
|
||||
// This is for the Miller prepipe feature.
|
||||
//
|
||||
// Note I am not using os.exec.Cmd which is billed as being simpler than using
|
||||
// os.StartProcess. It may indeed be simpler when you want to handle the
|
||||
// subprocess' stdin/stdout/stderr all three within the parent process. Here I
|
||||
// found it much easier to use os.StartProcess to let the stdin/stderr run
|
||||
// free.
|
||||
|
||||
func OpenInboundHalfPipe(commandString string) (*os.File, error) {
|
||||
readPipe, writePipe, err := os.Pipe()
|
||||
|
||||
var procAttr os.ProcAttr
|
||||
procAttr.Files = []*os.File{
|
||||
os.Stdin,
|
||||
writePipe,
|
||||
os.Stderr,
|
||||
}
|
||||
|
||||
// /bin/sh -c "..." or cmd /c "..."
|
||||
shellRunArray := platform.GetShellRunArray(commandString)
|
||||
|
||||
process, err := os.StartProcess(shellRunArray[0], shellRunArray, &procAttr)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// TODO comment somewhere
|
||||
// https://stackoverflow.com/questions/47486128/why-does-io-pipe-continue-to-block-even-when-eof-is-reached
|
||||
|
||||
// TODO comment
|
||||
go func(process *os.Process, readPipe *os.File) {
|
||||
_, err := process.Wait()
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "%s: %v\n", "mlr", err)
|
||||
}
|
||||
readPipe.Close()
|
||||
}(process, readPipe)
|
||||
|
||||
return readPipe, nil
|
||||
}
|
||||
38
pkg/lib/latin1.go
Normal file
38
pkg/lib/latin1.go
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
package lib
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// TryLatin1ToUTF8 treats each byte of input as a Latin-1 character and returns
// the UTF-8 encoding of the same text. The returned error is always nil.
func TryLatin1ToUTF8(input string) (string, error) {
	var buffer bytes.Buffer
	for i := 0; i < len(input); i++ {
		// Byte values 0x00-0xff are written as the runes 0x0000-0x00ff.
		buffer.WriteRune(rune(input[i]))
	}
	return buffer.String(), nil
}
|
||||
|
||||
// TryUTF8ToLatin1 decodes input as UTF-8 and returns the same text with one
// byte per character. It returns an empty string and an error on the first
// decoded rune above 0x00ff; bytes which do not decode as UTF-8 come back from
// the decoder as utf8.RuneError, which is above that limit.
func TryUTF8ToLatin1(input string) (string, error) {
	var buffer bytes.Buffer

	for offset := 0; offset < len(input); {
		r, size := utf8.DecodeRuneInString(input[offset:])
		if r > 0x00ff {
			return "", fmt.Errorf("character 0x%08x (%v) is not encodable as Latin-1", int(r), r)
		}
		buffer.WriteByte(byte(r))
		offset += size
	}

	return buffer.String(), nil
}
|
||||
100
pkg/lib/latin1_test.go
Normal file
100
pkg/lib/latin1_test.go
Normal file
|
|
@ -0,0 +1,100 @@
|
|||
// ================================================================
|
||||
// Most Miller tests (thousands of them) are command-line-driven via
|
||||
// mlr regtest. Here are some cases needing special focus.
|
||||
// ================================================================
|
||||
|
||||
package lib
|
||||
|
||||
import (
|
||||
"github.com/stretchr/testify/assert"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// tDataForLatin1 is one table-driven test case for the Latin-1/UTF-8
// converters.
type tDataForLatin1 struct {
	// input is what is given to the converter.
	input string
	// expectedOutput is what the converter is to return.
	expectedOutput string
	// expectError is true if the converter is to return a non-nil error.
	expectError bool
}
|
||||
|
||||
var dataForLatin1ToUTF8 = []tDataForLatin1{
|
||||
{
|
||||
"",
|
||||
"",
|
||||
false,
|
||||
},
|
||||
{
|
||||
"The quick brown fox jumped over the lazy dogs.",
|
||||
"The quick brown fox jumped over the lazy dogs.",
|
||||
false,
|
||||
},
|
||||
{
|
||||
"a\xe4o\xf6",
|
||||
"a\xc3\xa4o\xc3\xb6", // "aäoö" -- showing explicitly here "\u00e4" encodes as "\xc3\xa4"
|
||||
false,
|
||||
},
|
||||
{
|
||||
"Victor jagt zw\xf6lf Boxk\xe4mpfer quer \xfcber den gro\xdfen Sylter Deich",
|
||||
"Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich",
|
||||
false,
|
||||
},
|
||||
}
|
||||
|
||||
var dataForUTF8ToLatin1 = []tDataForLatin1{
|
||||
{
|
||||
"",
|
||||
"",
|
||||
false,
|
||||
},
|
||||
{
|
||||
"The quick brown fox jumped over the lazy dogs.",
|
||||
"The quick brown fox jumped over the lazy dogs.",
|
||||
false,
|
||||
},
|
||||
{
|
||||
"a\xc3\xa4o\xc3\xb6", // "aäoö" -- showing explicitly here "\u00e4" encodes as "\xc3\xa4"
|
||||
"a\xe4o\xf6",
|
||||
false,
|
||||
},
|
||||
{
|
||||
"Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich",
|
||||
"Victor jagt zw\xf6lf Boxk\xe4mpfer quer \xfcber den gro\xdfen Sylter Deich",
|
||||
false,
|
||||
},
|
||||
{
|
||||
"Съешь же ещё этих мягких французских булок да выпей чаю",
|
||||
"",
|
||||
true,
|
||||
},
|
||||
}
|
||||
|
||||
func TestLatin1ToUTF8(t *testing.T) {
|
||||
for i, entry := range dataForLatin1ToUTF8 {
|
||||
actualOutput, err := TryLatin1ToUTF8(entry.input)
|
||||
if entry.expectError {
|
||||
assert.NotNil(t, err)
|
||||
} else {
|
||||
assert.Nil(t, err)
|
||||
}
|
||||
if actualOutput != entry.expectedOutput {
|
||||
t.Fatalf("case %d input \"%s\" expected \"%s\" got \"%s\"\n",
|
||||
i, entry.input, entry.expectedOutput, actualOutput,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestUTF8ToLatin1(t *testing.T) {
|
||||
for i, entry := range dataForUTF8ToLatin1 {
|
||||
actualOutput, err := TryUTF8ToLatin1(entry.input)
|
||||
if entry.expectError {
|
||||
assert.NotNil(t, err)
|
||||
} else {
|
||||
assert.Nil(t, err)
|
||||
}
|
||||
if actualOutput != entry.expectedOutput {
|
||||
t.Fatalf("case %d input \"%s\" expected \"%s\" got \"%s\"\n",
|
||||
i, entry.input, entry.expectedOutput, actualOutput,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
110
pkg/lib/logger.go
Normal file
110
pkg/lib/logger.go
Normal file
|
|
@ -0,0 +1,110 @@
|
|||
package lib
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path"
|
||||
"runtime"
|
||||
)
|
||||
|
||||
// InternalCodingErrorIf does nothing if condition is false. If it is true, it
// prints the caller's file name and line number to stderr -- a lookalike for
// C's __FILE__ and __LINE__ -- and exits the process with status 1. If the
// MLR_PANIC_ON_INTERNAL_ERROR environment variable is non-empty, it panics
// instead of exiting.
func InternalCodingErrorIf(condition bool) {
	if !condition {
		return
	}

	_, callerFile, callerLine, known := runtime.Caller(1)
	if !known {
		fmt.Fprintf(
			os.Stderr,
			"Internal coding error detected at file %s line %s\n",
			"(unknown)",
			"(unknown)",
		)
	} else {
		// Full path preferred but breaks diffs on regression-test actual vs expected
		// stderr comparison on expect-fail cases.
		baseName := path.Base(callerFile)
		fmt.Fprintf(
			os.Stderr,
			"Internal coding error detected at file %s line %d\n",
			baseName,
			callerLine,
		)
	}

	// Set this and re-run to get a stack trace showing the call-tree that led
	// to the indicated file/line.
	wantPanic := os.Getenv("MLR_PANIC_ON_INTERNAL_ERROR") != ""
	if wantPanic {
		panic("Here is the stack trace")
	}
	os.Exit(1)
}
|
||||
|
||||
// InternalCodingErrorWithMessageIf does nothing if condition is false. If it
// is true, it prints the caller's file name and line number, followed by
// message, to stderr -- a lookalike for C's __FILE__ and __LINE__ -- and exits
// the process with status 1. If the MLR_PANIC_ON_INTERNAL_ERROR environment
// variable is non-empty, it panics instead of exiting.
func InternalCodingErrorWithMessageIf(condition bool, message string) {
	if !condition {
		return
	}

	_, callerFile, callerLine, known := runtime.Caller(1)
	if !known {
		fmt.Fprintf(
			os.Stderr,
			"Internal coding error detected at file %s line %s: %s\n",
			"(unknown)",
			"(unknown)",
			message,
		)
	} else {
		baseName := path.Base(callerFile)
		fmt.Fprintf(
			os.Stderr,
			"Internal coding error detected at file %s line %d: %s\n",
			baseName,
			callerLine,
			message,
		)
	}

	// Set this and re-run to get a stack trace showing the call-tree that led
	// to the indicated file/line.
	wantPanic := os.Getenv("MLR_PANIC_ON_INTERNAL_ERROR") != ""
	if wantPanic {
		panic("Here is the stack trace")
	}
	os.Exit(1)
}
|
||||
|
||||
// InternalCodingErrorPanic is like InternalCodingErrorIf, except that it
// panics the process (for stack trace, which is usually not desired), and that
// it requires the if-test to be at the caller. The panic value is a string
// with the caller's file name and line number, followed by message.
func InternalCodingErrorPanic(message string) {
	_, callerFile, callerLine, known := runtime.Caller(1)
	if !known {
		panic(
			fmt.Sprintf(
				"Internal coding error detected at file %s line %s: %s\n",
				"(unknown)",
				"(unknown)",
				message,
			),
		)
	}

	baseName := path.Base(callerFile)
	panic(
		fmt.Sprintf(
			"Internal coding error detected at file %s line %d: %s\n",
			baseName,
			callerLine,
			message,
		),
	)
}
|
||||
|
||||
// WhereAreWe prints to stdout the file name and line number of each of up to
// 19 stack frames, starting with the caller of this function.
func WhereAreWe() {
	// Depth 0 would be this function itself, which is not of interest.
	for depth := 1; depth < 20; depth++ {
		_, fileName, fileLine, ok := runtime.Caller(depth)
		if !ok {
			return
		}
		fmt.Printf(" %s %d\n", fileName, fileLine)
	}
}
|
||||
430
pkg/lib/mlrmath.go
Normal file
430
pkg/lib/mlrmath.go
Normal file
|
|
@ -0,0 +1,430 @@
|
|||
// ================================================================
|
||||
// Non-mlrval math routines
|
||||
// ================================================================
|
||||
|
||||
package lib
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
)
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// Some wrappers around things which aren't one-liners from math.*.
|
||||
|
||||
// Sgn returns 1 for positive a, -1 for negative a, 0 for zero, and NaN when a
// compares as none of those.
func Sgn(a float64) float64 {
	switch {
	case a > 0:
		return 1.0
	case a < 0:
		return -1.0
	case a == 0:
		return 0.0
	default:
		return math.NaN()
	}
}
|
||||
|
||||
// Qnorm is the normal cumulative distribution function, expressed in terms of
// the erfc library function (which is awkward, but exists).
func Qnorm(x float64) float64 {
	scaled := -x / math.Sqrt2
	return 0.5 * math.Erfc(scaled)
}
|
||||
|
||||
// This is a tangent-following method not unlike Newton-Raphson:
|
||||
// * We can compute qnorm(y) = integral from -infinity to y of (1/sqrt(2pi)) exp(-t^2/2) dt.
|
||||
// * We can compute derivative of qnorm(y) = (1/sqrt(2pi)) exp(-y^2/2).
|
||||
// * We cannot explicitly compute invqnorm(y).
|
||||
// * If dx/dy = (1/sqrt(2pi)) exp(-y^2/2) then dy/dx = sqrt(2pi) exp(y^2/2).
|
||||
//
|
||||
// This means we *can* compute the derivative of invqnorm even though we
|
||||
// can't compute the function itself. So the essence of the method is to
|
||||
// follow the tangent line to form successive approximations: we have known function input x
|
||||
// and unknown function output y and initial guess y0. At each step we find the intersection
|
||||
// of the tangent line at y_n with the vertical line at x, to find y_{n+1}. Specificall:
|
||||
//
|
||||
// * Even though we can't compute y = q^-1(x) we can compute x = q(y).
|
||||
// * Start with initial guess for y (y0 = 0.0 or y0 = x both are OK).
|
||||
// * Find x = q(y). Since q (and therefore q^-1) are 1-1, we're done if qnorm(invqnorm(x)) is small.
|
||||
// * Else iterate: using point-slope form, (y_{n+1} - y_n) / (x_{n+1} - x_n) = m = sqrt(2pi) exp(y_n^2/2).
|
||||
// Here x_2 = x (the input) and x_1 = q(y_1).
|
||||
// * Solve for y_{n+1} and repeat.
|
||||
|
||||
const INVQNORM_TOL float64 = 1e-9
|
||||
const INVQNORM_MAXITER int = 30
|
||||
|
||||
func Invqnorm(x float64) float64 {
|
||||
// Initial approximation is linear. Starting with y0 = 0.0 works just as well.
|
||||
y0 := x - 0.5
|
||||
if x <= 0.0 {
|
||||
return 0.0
|
||||
}
|
||||
if x >= 1.0 {
|
||||
return 0.0
|
||||
}
|
||||
|
||||
y := y0
|
||||
niter := 0
|
||||
|
||||
for {
|
||||
|
||||
backx := Qnorm(y)
|
||||
err := math.Abs(x - backx)
|
||||
if err < INVQNORM_TOL {
|
||||
break
|
||||
}
|
||||
if niter > INVQNORM_MAXITER {
|
||||
fmt.Fprintf(os.Stderr,
|
||||
"mlr: internal coding error: max iterations %d exceeded in invqnorm.\n",
|
||||
INVQNORM_MAXITER,
|
||||
)
|
||||
os.Exit(1)
|
||||
}
|
||||
m := math.Sqrt2 * math.SqrtPi * math.Exp(y*y/2.0)
|
||||
delta_y := m * (x - backx)
|
||||
y += delta_y
|
||||
niter++
|
||||
}
|
||||
|
||||
return y
|
||||
}
|
||||
|
||||
const JACOBI_TOLERANCE = 1e-12
|
||||
const JACOBI_MAXITER = 20
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// Jacobi real-symmetric eigensolver. Loosely adapted from Numerical Recipes.
|
||||
//
|
||||
// Note: this is coded for n=2 (to implement PCA linear regression on 2
|
||||
// variables) but the algorithm is quite general. Changing from 2 to n is a
|
||||
// matter of updating the top and bottom of the function: function signature to
|
||||
// take double** matrix, double* eigenvector_1, double* eigenvector_2, and n;
|
||||
// create copy-matrix and make-identity matrix functions; free temp matrices at
|
||||
// the end; etc.
|
||||
|
||||
func GetRealSymmetricEigensystem(
|
||||
matrix [2][2]float64,
|
||||
) (
|
||||
eigenvalue1 float64, // Output: dominant eigenvalue
|
||||
eigenvalue2 float64, // Output: less-dominant eigenvalue
|
||||
eigenvector1 [2]float64, // Output: corresponding to dominant eigenvalue
|
||||
eigenvector2 [2]float64, // Output: corresponding to less-dominant eigenvalue
|
||||
) {
|
||||
L := [2][2]float64{
|
||||
{matrix[0][0], matrix[0][1]},
|
||||
{matrix[1][0], matrix[1][1]},
|
||||
}
|
||||
V := [2][2]float64{
|
||||
{1.0, 0.0},
|
||||
{0.0, 1.0},
|
||||
}
|
||||
var P, PT_A [2][2]float64
|
||||
n := 2
|
||||
|
||||
found := false
|
||||
for iter := 0; iter < JACOBI_MAXITER; iter++ {
|
||||
sum := 0.0
|
||||
for i := 1; i < n; i++ {
|
||||
for j := 0; j < i; j++ {
|
||||
sum += math.Abs(L[i][j])
|
||||
}
|
||||
}
|
||||
if math.Abs(sum*sum) < JACOBI_TOLERANCE {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
|
||||
for p := 0; p < n; p++ {
|
||||
for q := p + 1; q < n; q++ {
|
||||
numer := L[p][p] - L[q][q]
|
||||
denom := L[p][q] + L[q][p]
|
||||
if math.Abs(denom) < JACOBI_TOLERANCE {
|
||||
continue
|
||||
}
|
||||
theta := numer / denom
|
||||
signTheta := 1.0
|
||||
if theta < 0 {
|
||||
signTheta = -1.0
|
||||
}
|
||||
t := signTheta / (math.Abs(theta) + math.Sqrt(theta*theta+1))
|
||||
c := 1.0 / math.Sqrt(t*t+1)
|
||||
s := t * c
|
||||
|
||||
for pi := 0; pi < n; pi++ {
|
||||
for pj := 0; pj < n; pj++ {
|
||||
if pi == pj {
|
||||
P[pi][pj] = 1.0
|
||||
} else {
|
||||
P[pi][pj] = 0.0
|
||||
}
|
||||
}
|
||||
}
|
||||
P[p][p] = c
|
||||
P[p][q] = -s
|
||||
P[q][p] = s
|
||||
P[q][q] = c
|
||||
|
||||
// L = P.transpose() * L * P
|
||||
// V = V * P
|
||||
matmul2t(&PT_A, &P, &L)
|
||||
matmul2(&L, &PT_A, &P)
|
||||
matmul2(&V, &V, &P)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !found {
|
||||
fmt.Fprintf(os.Stderr,
|
||||
"%s: Jacobi eigensolver: max iterations (%d) exceeded. Non-symmetric input?\n",
|
||||
"mlr",
|
||||
JACOBI_MAXITER,
|
||||
)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
eigenvalue1 = L[0][0]
|
||||
eigenvalue2 = L[1][1]
|
||||
abs1 := math.Abs(eigenvalue1)
|
||||
abs2 := math.Abs(eigenvalue2)
|
||||
if abs1 > abs2 {
|
||||
eigenvector1[0] = V[0][0] // Column 0 of V
|
||||
eigenvector1[1] = V[1][0]
|
||||
eigenvector2[0] = V[0][1] // Column 1 of V
|
||||
eigenvector2[1] = V[1][1]
|
||||
} else {
|
||||
eigenvalue1, eigenvalue2 = eigenvalue2, eigenvalue1
|
||||
eigenvector1[0] = V[0][1]
|
||||
eigenvector1[1] = V[1][1]
|
||||
eigenvector2[0] = V[0][0]
|
||||
eigenvector2[1] = V[1][0]
|
||||
}
|
||||
|
||||
return eigenvalue1, eigenvalue2, eigenvector1, eigenvector2
|
||||
}
|
||||
|
||||
// matmul2 computes C = A * B for 2x2 matrices. C may point to the same matrix
// as A and/or B.
func matmul2(
	C *[2][2]float64, // Output
	A *[2][2]float64, // Input
	B *[2][2]float64, // Input
) {
	// Accumulate into a scratch matrix and store at the end, in case C's
	// memory is the same as A and/or B.
	var product [2][2]float64
	for i := range product {
		for j := range product[i] {
			sum := 0.0
			for k := 0; k < 2; k++ {
				sum += A[i][k] * B[k][j]
			}
			product[i][j] = sum
		}
	}
	*C = product
}
|
||||
|
||||
// matmul2t computes C = A^t * B for 2x2 matrices, where A^t is the transpose
// of A. C may point to the same matrix as A and/or B.
func matmul2t(
	C *[2][2]float64, // Output
	A *[2][2]float64, // Input
	B *[2][2]float64, // Input
) {
	// Accumulate into a scratch matrix and store at the end, in case C's
	// memory is the same as A and/or B.
	var product [2][2]float64
	for i := range product {
		for j := range product[i] {
			sum := 0.0
			for k := 0; k < 2; k++ {
				sum += A[k][i] * B[k][j]
			}
			product[i][j] = sum
		}
	}
	*C = product
}
|
||||
|
||||
// ================================================================
|
||||
// Logistic regression
|
||||
//
|
||||
// Real-valued x_0 .. x_{N-1}
|
||||
// 0/1-valued y_0 .. y_{N-1}
|
||||
// Model p(y_i == 1) as
//   p(x, m, b) = 1 / (1 + exp(-m*x-b))
// which is the same as
//   log(p/(1-p)) = m*x + b
// then
//   p(x, m, b) = 1 / (1 + exp(-m*x-b))
//              = exp(m*x+b) / (1 + exp(m*x+b))
// and
//   1-p = exp(-m*x-b) / (1 + exp(-m*x-b))
//       = 1 / (1 + exp(m*x+b))
|
||||
// Note for reference just below that
|
||||
// dp/dm = -1 / [1 + exp(-m*x-b)]**2 * (-x) * exp(-m*x-b)
|
||||
// = [x exp(-m*x-b)) ] / [1 + exp(-m*x-b)]**2
|
||||
// = x * p * (1-p)
|
||||
// and
|
||||
// dp/db = -1 / [1 + exp(-m*x-b)]**2 * (-1) * exp(-m*x-b)
|
||||
// = [exp(-m*x-b)) ] / [1 + exp(-m*x-b)]**2
|
||||
// = p * (1-p)
|
||||
// Write p_i for p(x_i, m, b)
|
||||
//
|
||||
// Maximum-likelihood equation:
|
||||
// L(m, b) = prod_{i=0}^{N-1} [ p_i**y_i * (1-p_i)**(1-y_i) ]
|
||||
//
|
||||
// Log-likelihood equation:
|
||||
// ell(m, b) = sum{i=0}^{N-1} [ y_i log(p_i) + (1-y_i) log(1-p_i) ]
|
||||
// = sum{i=0}^{N-1} [ log(1-p_i) + y_i log(p_i/(1-p_i)) ]
|
||||
// = sum{i=0}^{N-1} [ log(1-p_i) + y_i*(m*x_i+b) ]
|
||||
// Differentiate with respect to parameters:
|
||||
//
|
||||
// d ell/dm = sum{i=0}^{N-1} [ -1/(1-p_i) dp_i/dm + x_i*y_i ]
|
||||
// = sum{i=0}^{N-1} [ -1/(1-p_i) x_i*p_i*(1-p_i) + x_i*y_i ]
|
||||
// = sum{i=0}^{N-1} [ x_i(y_i-p_i) ]
|
||||
//
|
||||
// d ell/db = sum{i=0}^{N-1} [ -1/(1-p_i) dp_i/db + y_i ]
|
||||
// = sum{i=0}^{N-1} [ -1/(1-p_i) p_i*(1-p_i) + y_i ]
|
||||
// = sum{i=0}^{N-1} [ y_i - p_i ]
|
||||
//
|
||||
//
|
||||
// d2ell/dm2 = sum{i=0}^{N-1} [ -x_i dp_i/dm ]
|
||||
// = sum{i=0}^{N-1} [ -x_i**2 * p_i * (1-p_i) ]
|
||||
//
|
||||
// d2ell/dmdb = sum{i=0}^{N-1} [ -x_i dp_i/db ]
|
||||
// = sum{i=0}^{N-1} [ -x_i * p_i * (1-p_i) ]
|
||||
//
|
||||
// d2ell/dbdm = sum{i=0}^{N-1} [ -dp_i/dm ]
|
||||
// = sum{i=0}^{N-1} [ -x_i * p_i * (1-p_i) ]
|
||||
//
|
||||
// d2ell/db2 = sum{i=0}^{N-1} [ -dp_i/db ]
|
||||
// = sum{i=0}^{N-1} [ -p_i * (1-p_i) ]
|
||||
//
|
||||
// Newton-Raphson to maximize ell(m, b):
// * Pick m0, b0
// * [m_{j+1}, b_{j+1}] = [m_j, b_j] - H^{-1} grad ell(m_j, b_j)
|
||||
// * grad ell =
|
||||
// [ d ell/dm ]
|
||||
// [ d ell/db ]
|
||||
// * H = Hessian of ell = Jacobian of grad ell =
|
||||
// [ d2ell/dm2 d2ell/dmdb ]
|
||||
// [ d2ell/dmdb d2ell/db2 ]
|
||||
|
||||
// lrp evaluates the logistic-regression model probability
// p(x, m, b) = 1 / (1 + exp(-m*x-b)).
func lrp(x, m, b float64) float64 {
	exponent := -m*x - b
	return 1.0 / (1.0 + math.Exp(exponent))
}
|
||||
|
||||
// lrq evaluates 1 - p(x, m, b) for the logistic-regression model, in the
// form 1 / (1 + exp(m*x+b)).
func lrq(x, m, b float64) float64 {
	exponent := m*x + b
	return 1.0 / (1.0 + math.Exp(exponent))
}
|
||||
|
||||
func LogisticRegression(xs, ys []float64) (m, b float64) {
|
||||
m0 := -0.001
|
||||
b0 := 0.002
|
||||
tol := 1e-9
|
||||
maxits := 100
|
||||
return logisticRegressionAux(xs, ys, m0, b0, tol, maxits)
|
||||
}
|
||||
|
||||
// Supporting routine for mlr_logistic_regression():
|
||||
func logisticRegressionAux(
|
||||
xs, ys []float64,
|
||||
m0, b0, tol float64,
|
||||
maxits int,
|
||||
) (m, b float64) {
|
||||
|
||||
InternalCodingErrorIf(len(xs) != len(ys))
|
||||
n := len(xs)
|
||||
|
||||
its := 0
|
||||
done := false
|
||||
m = m0
|
||||
b = b0
|
||||
|
||||
for !done {
|
||||
// Compute derivatives
|
||||
dldm := 0.0
|
||||
dldb := 0.0
|
||||
d2ldm2 := 0.0
|
||||
d2ldmdb := 0.0
|
||||
d2ldb2 := 0.0
|
||||
ell0 := 0.0
|
||||
|
||||
for i := 0; i < n; i++ {
|
||||
xi := xs[i]
|
||||
yi := ys[i]
|
||||
pi := lrp(xi, m0, b0)
|
||||
qi := lrq(xi, m0, b0)
|
||||
dldm += xi * (yi - pi)
|
||||
dldb += yi - pi
|
||||
piqi := pi * qi
|
||||
xipiqi := xi * piqi
|
||||
xi2piqi := xi * xipiqi
|
||||
d2ldm2 -= xi2piqi
|
||||
d2ldmdb -= xipiqi
|
||||
d2ldb2 -= piqi
|
||||
ell0 += math.Log(qi) + yi*(m0*xi+b0)
|
||||
}
|
||||
|
||||
// Form the Hessian
|
||||
ha := d2ldm2
|
||||
hb := d2ldmdb
|
||||
hc := d2ldmdb
|
||||
hd := d2ldb2
|
||||
|
||||
// Invert the Hessian
|
||||
D := ha*hd - hb*hc
|
||||
Hinva := hd / D
|
||||
Hinvb := -hb / D
|
||||
Hinvc := -hc / D
|
||||
Hinvd := ha / D
|
||||
|
||||
// Compute H^-1 times grad ell
|
||||
Hinvgradm := Hinva*dldm + Hinvb*dldb
|
||||
Hinvgradb := Hinvc*dldm + Hinvd*dldb
|
||||
|
||||
// Update [m,b]
|
||||
m = m0 - Hinvgradm
|
||||
b = b0 - Hinvgradb
|
||||
|
||||
ell := 0.0
|
||||
for i := 0; i < n; i++ {
|
||||
xi := xs[i]
|
||||
yi := ys[i]
|
||||
qi := lrq(xi, m, b)
|
||||
ell += math.Log(qi) + yi*(m0*xi+b0)
|
||||
}
|
||||
|
||||
// Check for convergence
|
||||
dell := math.Max(ell, ell0)
|
||||
err := 0.0
|
||||
if dell != 0.0 {
|
||||
err = math.Abs(ell-ell0) / dell
|
||||
}
|
||||
|
||||
if err < tol {
|
||||
done = true
|
||||
}
|
||||
its++
|
||||
if its > maxits {
|
||||
fmt.Fprintf(os.Stderr,
|
||||
"mlr_logistic_regression: Newton-Raphson convergence failed after %d iterations. m=%e, b=%e.\n",
|
||||
its, m, b)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
m0 = m
|
||||
b0 = b
|
||||
}
|
||||
|
||||
return m, b
|
||||
}
|
||||
155
pkg/lib/ordered_map.go
Normal file
155
pkg/lib/ordered_map.go
Normal file
|
|
@ -0,0 +1,155 @@
|
|||
// ================================================================
|
||||
// ORDERED MAP FROM STRING TO INTERFACE{}
|
||||
//
|
||||
// Quite like types.OrderedMap but with interface{} values. See orderedMap.go for
|
||||
// more information.
|
||||
// ================================================================
|
||||
|
||||
package lib
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// OrderedMap is an insertion-ordered map from string keys to arbitrary
// values. Entries are kept on a doubly linked list from Head to Tail, which
// defines the iteration order. keysToEntries, when non-nil, indexes the list
// entries by key; when nil, lookups fall back to a linear scan of the list.
type OrderedMap struct {
	FieldCount    int64
	Head          *orderedMapEntry
	Tail          *orderedMapEntry
	keysToEntries map[string]*orderedMapEntry
}

// orderedMapEntry is one key-value pair along with its list links.
type orderedMapEntry struct {
	Key   string
	Value interface{}
	Prev  *orderedMapEntry
	Next  *orderedMapEntry
}

// ----------------------------------------------------------------

// NewOrderedMap returns an empty map with the key index enabled.
func NewOrderedMap() *OrderedMap {
	return &OrderedMap{
		FieldCount:    0,
		Head:          nil,
		Tail:          nil,
		keysToEntries: make(map[string]*orderedMapEntry),
	}
}

// ----------------------------------------------------------------

// newOrderedMapEntry returns an unlinked entry holding the given key and
// value. The value is stored as given; no copy is made here.
func newOrderedMapEntry(key *string, value interface{}) *orderedMapEntry {
	return &orderedMapEntry{
		*key,
		value,
		nil,
		nil,
	}
}

// ----------------------------------------------------------------

// IsEmpty reports whether the map has no entries.
func (omap *OrderedMap) IsEmpty() bool {
	return omap.FieldCount == 0
}

// Has reports whether the key is present.
func (omap *OrderedMap) Has(key string) bool {
	return omap.findEntry(&key) != nil
}

// findEntry returns the entry for the key, or nil if there is none. It uses
// the key index when present and otherwise walks the list from Head.
func (omap *OrderedMap) findEntry(key *string) *orderedMapEntry {
	if omap.keysToEntries != nil {
		return omap.keysToEntries[*key]
	} else {
		for pe := omap.Head; pe != nil; pe = pe.Next {
			if pe.Key == *key {
				return pe
			}
		}
		return nil
	}
}

// ----------------------------------------------------------------

// Put sets the value for the key. A new key is appended at the tail of the
// list; an existing key keeps its position and has its value replaced.
func (omap *OrderedMap) Put(key string, value interface{}) {
	pe := omap.findEntry(&key)
	if pe == nil {
		pe = newOrderedMapEntry(&key, value)
		if omap.Head == nil {
			omap.Head = pe
			omap.Tail = pe
		} else {
			pe.Prev = omap.Tail
			pe.Next = nil
			omap.Tail.Next = pe
			omap.Tail = pe
		}
		if omap.keysToEntries != nil {
			omap.keysToEntries[key] = pe
		}
		omap.FieldCount++
	} else {
		pe.Value = value
	}
}

// ----------------------------------------------------------------

// Get returns the value for the key, or nil if the key is absent.
func (omap *OrderedMap) Get(key string) interface{} {
	pe := omap.findEntry(&key)
	if pe == nil {
		return nil
	} else {
		return pe.Value
	}
}

// The Get is sufficient for pointer values -- the caller can check if the
// return value is nil. For int/string values (which are non-nullable) we have
// this method.
func (omap *OrderedMap) GetWithCheck(key string) (interface{}, bool) {
	pe := omap.findEntry(&key)
	if pe == nil {
		return nil, false
	} else {
		return pe.Value, true
	}
}

// ----------------------------------------------------------------

// Clear removes all entries. The key index, when in use, is reset along with
// the list: otherwise keys from before the Clear would still be found by
// Has/Get/Put/Remove even though their entries are no longer on the list.
func (omap *OrderedMap) Clear() {
	omap.FieldCount = 0
	omap.Head = nil
	omap.Tail = nil
	if omap.keysToEntries != nil {
		omap.keysToEntries = make(map[string]*orderedMapEntry)
	}
}

// ----------------------------------------------------------------

// Returns true if it was found and removed
func (omap *OrderedMap) Remove(key string) bool {
	pe := omap.findEntry(&key)
	if pe == nil {
		return false
	} else {
		omap.unlink(pe)
		return true
	}
}

// ----------------------------------------------------------------

// unlink detaches the entry from the list, removes it from the key index if
// there is one, and decrements FieldCount. The entry must be on the list.
func (omap *OrderedMap) unlink(pe *orderedMapEntry) {
	if pe == omap.Head {
		if pe == omap.Tail {
			omap.Head = nil
			omap.Tail = nil
		} else {
			omap.Head = pe.Next
			pe.Next.Prev = nil
		}
	} else {
		pe.Prev.Next = pe.Next
		if pe == omap.Tail {
			omap.Tail = pe.Prev
		} else {
			pe.Next.Prev = pe.Prev
		}
	}
	if omap.keysToEntries != nil {
		delete(omap.keysToEntries, pe.Key)
	}
	omap.FieldCount--
}
|
||||
71
pkg/lib/paragraph.go
Normal file
71
pkg/lib/paragraph.go
Normal file
|
|
@ -0,0 +1,71 @@
|
|||
package lib
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// PrintWordsAsParagraph writes the words to stdout separated by single
// spaces, starting a new line whenever adding the next word would bring the
// running line length to 80 or more. Output always ends with a newline.
//
// For online help contexts like printing all the built-in DSL functions, or
// the list of all verbs.
func PrintWordsAsParagraph(words []string) {
	separator := " "
	maxlen := 80

	separatorlen := len(separator)
	linelen := 0
	j := 0 // number of words already written on the current line

	for _, word := range words {
		wordlen := len(word)
		linelen += separatorlen + wordlen
		// Break only if the current line already has a word on it. Otherwise
		// an over-long first word would be preceded by an empty line.
		if linelen >= maxlen && j > 0 {
			fmt.Printf("\n")
			linelen = separatorlen + wordlen
			j = 0
		}
		if j > 0 {
			fmt.Print(separator)
		}
		fmt.Print(word)
		j++
	}

	fmt.Printf("\n")
}
|
||||
|
||||
// FormatAsParagraph splits the text on whitespace and re-flows the words into
// lines, joining words with single spaces and starting a new line whenever
// adding the next word would bring the running line length to maxWidth or
// more. A single word longer than maxWidth is placed on a line of its own.
// The result is empty (but non-nil) if the text has no words.
//
// For online help contexts like printing all the built-in DSL functions, or
// the list of all verbs. Max width is nominally 80.
func FormatAsParagraph(text string, maxWidth int) []string {
	lines := make([]string, 0)
	words := strings.Fields(text)

	separator := " "
	separatorlen := len(separator)
	linelen := 0
	j := 0 // number of words already buffered for the current line

	var buffer bytes.Buffer
	for _, word := range words {
		wordlen := len(word)
		linelen += separatorlen + wordlen
		// Flush only if the current line already has a word on it. Otherwise
		// an over-long first word would emit a spurious empty first line.
		if linelen >= maxWidth && j > 0 {
			line := buffer.String()
			lines = append(lines, line)
			buffer.Reset()
			linelen = separatorlen + wordlen
			j = 0
		}
		if j > 0 {
			buffer.WriteString(separator)
		}
		buffer.WriteString(word)
		j++
	}
	line := buffer.String()
	if line != "" {
		lines = append(lines, line)
	}

	return lines
}
|
||||
42
pkg/lib/rand.go
Normal file
42
pkg/lib/rand.go
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
// ================================================================
|
||||
// Thinly wraps Go's rand library, with seed-function support
|
||||
// ================================================================
|
||||
|
||||
package lib
|
||||
|
||||
import (
|
||||
"math/rand"
|
||||
"os"
|
||||
"time"
|
||||
)
|
||||
|
||||
// By default, Miller random numbers are different on every run: the initial
// seed mixes the current time in nanoseconds with the process ID.
var (
	defaultSeed = time.Now().UnixNano() ^ int64(os.Getpid())
	source      = rand.NewSource(defaultSeed)
	generator   = rand.New(source)
)
|
||||
|
||||
// Users can request specific seeds if they want the same random-number
|
||||
// sequence on each run.
|
||||
func SeedRandom(seed int64) {
|
||||
source = rand.NewSource(seed)
|
||||
generator = rand.New(source)
|
||||
}
|
||||
|
||||
func RandFloat64() float64 {
|
||||
return generator.Float64()
|
||||
}
|
||||
func RandUint32() uint32 {
|
||||
return generator.Uint32()
|
||||
}
|
||||
func RandInt63() int64 {
|
||||
return generator.Int63()
|
||||
}
|
||||
func RandRange(lowInclusive, highExclusive int64) int64 {
|
||||
if lowInclusive == highExclusive {
|
||||
return lowInclusive
|
||||
} else {
|
||||
u := generator.Int63()
|
||||
// TODO: test divide-by-zero cases in UT
|
||||
return lowInclusive + (u % (highExclusive - lowInclusive))
|
||||
}
|
||||
}
|
||||
90
pkg/lib/readfiles.go
Normal file
90
pkg/lib/readfiles.go
Normal file
|
|
@ -0,0 +1,90 @@
|
|||
// ================================================================
|
||||
// Routines for loading strings from files. Nominally for the put/filter verbs
|
||||
// to load DSL strings from .mlr files.
|
||||
// ================================================================
|
||||
|
||||
package lib
|
||||
|
||||
import (
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
csv "github.com/johnkerl/miller/pkg/go-csv"
|
||||
)
|
||||
|
||||
// LoadStringsFromFileOrDir calls LoadStringFromFile if path exists and is a
|
||||
// file, or LoadStringsFromDir if path exists and is a directory. In the
|
||||
// former case the extension is ignored; in the latter case it's used as a
|
||||
// filter on the directory entries.
|
||||
func LoadStringsFromFileOrDir(path string, extension string) ([]string, error) {
|
||||
fileInfo, err := os.Stat(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if fileInfo.IsDir() {
|
||||
return LoadStringsFromDir(path, extension)
|
||||
} else {
|
||||
dslString, err := LoadStringFromFile(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
} else {
|
||||
return []string{dslString}, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// LoadStringFromFile reads the whole file via ioutil.ReadFile and returns its
// contents converted from []byte to string. On a read error it returns the
// empty string along with that error.
func LoadStringFromFile(filename string) (string, error) {
	contents, readErr := ioutil.ReadFile(filename)
	if readErr != nil {
		return "", readErr
	}
	return string(contents), nil
}
|
||||
|
||||
// LoadStringsFromDir loads all file contents for files in the given directory
|
||||
// having the given extension. E.g. LoadStringsFromDir("/u/myfiles", ".mlr")
|
||||
// will load /u/myfiles/foo.mlr and /u/myfiles/bar.mlr but will skip over
|
||||
// /u/myfiles/data.csv and /u/myfiles/todo.txt.
|
||||
func LoadStringsFromDir(dirname string, extension string) ([]string, error) {
|
||||
dslStrings := make([]string, 0)
|
||||
|
||||
entries, err := ioutil.ReadDir(dirname)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for i := range entries {
|
||||
entry := &entries[i]
|
||||
name := (*entry).Name()
|
||||
if !strings.HasSuffix(name, extension) {
|
||||
continue
|
||||
}
|
||||
|
||||
path := dirname + "/" + name
|
||||
dslString, err := LoadStringFromFile(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
dslStrings = append(dslStrings, dslString)
|
||||
}
|
||||
|
||||
return dslStrings, nil
|
||||
}
|
||||
|
||||
func ReadCSVHeader(filename string) ([]string, error) {
|
||||
handle, err := os.Open(filename)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer handle.Close()
|
||||
csvReader := csv.NewReader(handle)
|
||||
header, err := csvReader.Read()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return header, nil
|
||||
}
|
||||
386
pkg/lib/regex.go
Normal file
386
pkg/lib/regex.go
Normal file
|
|
@ -0,0 +1,386 @@
|
|||
// ================================================================
|
||||
// Support for regexes in Miller.
|
||||
//
|
||||
// * By and large we use the Go library.
|
||||
//
|
||||
// * There is (for historical reasons) a DSL syntax "[a-z]"i (note the trailing i)
|
||||
// for case-insensitive regular expressions which we map into Go syntax for
|
||||
// regex-compilation.
|
||||
//
|
||||
// * Also for historical reasons, we allow things like
|
||||
// if ($x =~ "(..)_(...)") {
|
||||
// ... other lines of code ...
|
||||
// $y = "\2:\1";
|
||||
// }
|
||||
// where the '=~' sets the captures and the "\2:\1" uses them. (Note that
|
||||
// https://github.com/johnkerl/miller/issues/388 has a better suggestion
|
||||
// which would make the captures explicit as variables, rather than implicit
|
||||
// within CST state -- regardless, the current syntax will still be supported
|
||||
// for backward compatibility and so is here to stay.) Here we make use of Go
|
||||
// regexp-library functions to write to, and then later interpolate from, a
|
||||
// captures array which is stored within CST state. (See the `runtime.State`
|
||||
// object.)
|
||||
//
|
||||
// * "\0" is for a full match; "\1" .. "\9" are for submatch captures. E.g.
|
||||
// if $x is "foobarbaz" and the regex is "foo(.)(..)baz", then "\0" is
|
||||
// "foobarbaz", "\1" is "b", "\2" is "ar", and "\3".."\9" are "".
|
||||
// ================================================================
|
||||
|
||||
package lib
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"os"
|
||||
"regexp"
|
||||
"strings"
|
||||
)
|
||||
|
||||
var (
	// captureDetector is used to see if a string literal interpolates
	// previous captures (like "\2:\1") or not (like "2:1").
	captureDetector = regexp.MustCompile(`\\[0-9]`)

	// captureSplitter is used to precompute an offsets matrix for strings
	// like "\2:\1" so they don't need to be recomputed on every record.
	captureSplitter = regexp.MustCompile(`(\\[0-9])`)
)
|
||||
|
||||
// CompileMillerRegex wraps Go regex-compile with some Miller-specific syntax
// which predates the port of Miller from C to Go. Miller regexes use a final
// 'i' to indicate case-insensitivity; Go regexes use an initial "(?i)".
//
// (See also mlr.bnf where we specify which things can be backslash-escaped
// without a syntax error at parse time.)
//
//   - If the regex_string is of the form a.*b, compiles it case-sensitively.
//   - If the regex_string is of the form "a.*b" or /a.*b/, compiles a.*b
//     case-sensitively.
//   - If the regex_string is of the form "a.*b"i or /a.*b/i, compiles a.*b
//     case-insensitively.
//
// Strings too short to hold both delimiters (such as a lone `"`, or the
// two-character `"i` and `/i`) are compiled as-is.
func CompileMillerRegex(regexString string) (*regexp.Regexp, error) {
	n := len(regexString)
	if n < 2 {
		return regexp.Compile(regexString)
	}

	// TODO: rethink this. This will strip out things people have entered, e.g. "\"...\"".
	// The parser-to-AST will have stripped the outer and we'll strip the inner and the
	// user's intent will be lost.
	//
	// TODO: make separate functions for calling from parser-to-AST (string
	// literals) and from verbs (like cut -r or having-fields).

	if first := regexString[0]; first == '"' || first == '/' {
		delimiter := regexString[:1]
		switch {
		case strings.HasSuffix(regexString, delimiter):
			return regexp.Compile(regexString[1 : n-1])
		case n >= 3 && strings.HasSuffix(regexString, delimiter+"i"):
			// The n >= 3 guard ensures the opening delimiter is not also
			// counted as the closing one: without it, `"i` and `/i` would
			// be sliced as [1:0], which panics.
			return regexp.Compile("(?i)" + regexString[1:n-2])
		}
	}

	return regexp.Compile(regexString)
}
|
||||
|
||||
// CompileMillerRegexOrDie wraps CompileMillerRegex. Usually in Go we want to
|
||||
// return a second error argument rather than fataling. However, if there's a
|
||||
// malformed regex we really cannot continue so it's simpler to just fatal.
|
||||
func CompileMillerRegexOrDie(regexString string) *regexp.Regexp {
|
||||
regex, err := CompileMillerRegex(regexString)
|
||||
if err != nil {
|
||||
fmt.Fprint(os.Stderr, err)
|
||||
os.Exit(1)
|
||||
}
|
||||
return regex
|
||||
}
|
||||
|
||||
// CompileMillerRegexesOrDie is a convenenience looper over CompileMillerRegexOrDie.
|
||||
func CompileMillerRegexesOrDie(regexStrings []string) []*regexp.Regexp {
|
||||
regexes := make([]*regexp.Regexp, len(regexStrings))
|
||||
|
||||
for i, regexString := range regexStrings {
|
||||
regexes[i] = CompileMillerRegexOrDie(regexString)
|
||||
}
|
||||
|
||||
return regexes
|
||||
}
|
||||
|
||||
// RegexSplitString splits the input on the regex via regex.Split(input, n),
// except that empty input yields an empty slice.
//
// In Go as in all languages I'm aware of with a string-split, "a,b,c" splits
// on "," to ["a", "b", "c"] and "a" splits to ["a"], both of which are fine --
// but "" splits to [""] when I wish it were []. This function does the latter.
func RegexSplitString(regex *regexp.Regexp, input string, n int) []string {
	if input != "" {
		return regex.Split(input, n)
	}
	return []string{}
}
|
||||
|
||||
// MakeEmptyRegexCaptures is for initial CST state at the start of executing
// the DSL expression for the current record. Even if '$x =~ "(..)_(...)"' set
// "\1" and "\2" on the previous record, at start of processing for the current
// record we need to start with a clean slate. The clean slate is a nil slice.
func MakeEmptyRegexCaptures() []string {
	var noCaptures []string
	return noCaptures
}
|
||||
|
||||
// RegexReplacementHasCaptures is used by the CST builder to see if
|
||||
// string-literal is like "foo bar" or "foo \1 bar" -- in the latter case it
|
||||
// needs to retain the compiled offsets-matrix information.
|
||||
func RegexReplacementHasCaptures(
|
||||
replacement string,
|
||||
) (
|
||||
hasCaptures bool,
|
||||
matrix [][]int,
|
||||
) {
|
||||
if captureDetector.MatchString(replacement) {
|
||||
return true, captureSplitter.FindAllSubmatchIndex([]byte(replacement), -1)
|
||||
} else {
|
||||
return false, nil
|
||||
}
|
||||
}
|
||||
|
||||
// RegexMatches implements the =~ DSL operator. The captures are stored in DSL
|
||||
// state and may be used by a DSL statement after the =~. For example, in
|
||||
//
|
||||
// sub($a, "(..)_(...)", "\1:\2")
|
||||
//
|
||||
// the replacement string is an argument to sub and therefore the captures are
|
||||
// confined to the implementation of the sub function. Similarly for gsub. But
|
||||
// for the match operator, people can do
|
||||
//
|
||||
// if ($x =~ "(..)_(...)") {
|
||||
// ... other lines of code ...
|
||||
// $y = "\2:\1"
|
||||
// }
|
||||
//
|
||||
// and the =~ callsite doesn't know if captures will be used or not. So,
|
||||
// RegexMatches always returns the captures array. It is stored within the CST
|
||||
// state.
|
||||
func RegexMatches(
|
||||
input string,
|
||||
sregex string,
|
||||
) (
|
||||
matches bool,
|
||||
capturesOneUp []string,
|
||||
) {
|
||||
regex := CompileMillerRegexOrDie(sregex)
|
||||
return RegexMatchesCompiled(input, regex)
|
||||
}
|
||||
|
||||
// RegexMatchesCompiled is the implementation for the =~ operator. Without
// Miller-style regex captures this would be a simple one-line
// regex.MatchString(input). However, we return the captures array for the
// benefit of subsequent references to "\0".."\9".
//
// The boolean is true if the input matches the regex. The returned slice
// always has length 10: slot 0 is the full match and slots 1-9 are the first
// nine capture groups. Slots for groups which are absent, or which did not
// participate in the match, are "". On no match all ten slots are "".
func RegexMatchesCompiled(
	input string,
	regex *regexp.Regexp,
) (bool, []string) {
	// "\0" .. "\9", all initially ""
	captures := make([]string, 10)

	// If there are multiple matches -- e.g. input is
	//
	//   "...ab_cde...fg_hij..."
	//
	// with regex
	//
	//   "(..)_(...)"
	//
	// -- then we only consider the first match: boolean return value is true
	// (the input string matched the regex), and the captures array will map
	// "\1" to "ab" and "\2" to "cde". So we ask only for the leftmost match
	// rather than scanning the input for all of them.
	//
	// For the example above the row is
	//
	//   []int{3, 9, 3, 5, 6, 9}
	//
	// * 3-9 is for the entire match "ab_cde"
	// * 3-5 is for the first capture "ab"
	// * 6-9 is for the second capture "cde"
	row := regex.FindStringSubmatchIndex(input)
	if row == nil {
		return false, captures
	}

	for di := 0; di < len(captures) && 2*di+1 < len(row); di++ {
		start, end := row[2*di], row[2*di+1]
		// Negative offsets mark a group which did not participate.
		if start >= 0 && end >= 0 {
			captures[di] = input[start:end]
		}
	}

	return true, captures
}
|
||||
|
||||
// InterpolateCaptures returns replacementString with each "\0".."\9" listed
// in replacementMatrix replaced by the corresponding element of captures. If
// either replacementMatrix or captures is nil, replacementString is returned
// unmodified.
//
// Example:
//   - Input $x is "ab_cde"
//   - DSL expression
//     if ($x =~ "(..)_(...)") {
//     ... other lines of code ...
//     $y = "\2:\1";
//     }
//   - InterpolateCaptures is used on the evaluation of "\2:\1"
//   - replacementString is "\2:\1"
//   - replacementMatrix contains precomputed/cached offsets for the "\2" and
//     "\1" substrings within "\2:\1"
//   - captures has slot 0 being "ab_cde" (for "\0"), slot 1 being "ab" (for "\1"),
//     slot 2 being "cde" (for "\2"), and slots 3-9 being "".
func InterpolateCaptures(
	replacementString string,
	replacementMatrix [][]int,
	captures []string,
) string {
	if replacementMatrix == nil || captures == nil {
		return replacementString
	}

	var out bytes.Buffer
	cursor := 0 // start of the literal text not yet copied to out

	for _, span := range replacementMatrix {
		begin, end := span[0], span[1]

		// Literal text preceding this backslash-digit.
		out.WriteString(replacementString[cursor:begin])

		// The byte after the backslash is the digit: map '0'..'9' to 0..9.
		slot := replacementString[begin+1] - '0'
		out.WriteString(captures[slot])

		cursor = end
	}

	// Literal text following the last backslash-digit.
	out.WriteString(replacementString[cursor:])
	return out.String()
}
|
||||
|
||||
// RegexSub implements the sub DSL function.
|
||||
func RegexSub(
|
||||
input string,
|
||||
sregex string,
|
||||
replacement string,
|
||||
) string {
|
||||
regex := CompileMillerRegexOrDie(sregex)
|
||||
_, replacementCaptureMatrix := RegexReplacementHasCaptures(replacement)
|
||||
return RegexSubCompiled(input, regex, replacement, replacementCaptureMatrix)
|
||||
}
|
||||
|
||||
// RegexSubCompiled is the same as RegexSub but with compiled regex and
|
||||
// replacement strings.
|
||||
func RegexSubCompiled(
|
||||
input string,
|
||||
regex *regexp.Regexp,
|
||||
replacement string,
|
||||
replacementCaptureMatrix [][]int,
|
||||
) string {
|
||||
return regexSubGsubCompiled(input, regex, replacement, replacementCaptureMatrix, true)
|
||||
}
|
||||
|
||||
// RegexGsub implements the gsub DSL function.
|
||||
func RegexGsub(
|
||||
input string,
|
||||
sregex string,
|
||||
replacement string,
|
||||
) string {
|
||||
regex := CompileMillerRegexOrDie(sregex)
|
||||
_, replacementCaptureMatrix := RegexReplacementHasCaptures(replacement)
|
||||
return regexSubGsubCompiled(input, regex, replacement, replacementCaptureMatrix, false)
|
||||
}
|
||||
|
||||
// regexSubGsubCompiled is the implementation for sub/gsub with compilex regex
|
||||
// and replacement strings.
|
||||
func regexSubGsubCompiled(
|
||||
input string,
|
||||
regex *regexp.Regexp,
|
||||
replacement string,
|
||||
replacementCaptureMatrix [][]int,
|
||||
breakOnFirst bool,
|
||||
) string {
|
||||
matrix := regex.FindAllSubmatchIndex([]byte(input), -1)
|
||||
if matrix == nil || len(matrix) == 0 {
|
||||
return input
|
||||
}
|
||||
|
||||
// Example return value from FindAllSubmatchIndex with input
|
||||
// "...ab_cde...fg_hij..." and regex "(..)_(...)":
|
||||
//
|
||||
// Matrix is [][]int{
|
||||
// []int{3, 9, 3, 5, 6, 9},
|
||||
// []int{12, 18, 12, 14, 15, 18},
|
||||
// }
|
||||
//
|
||||
// * 3-9 is for the entire match "ab_cde"
|
||||
// * 3-5 is for the first capture "ab"
|
||||
// * 6-9 is for the second capture "cde"
|
||||
//
|
||||
// * 12-18 is for the entire match "fg_hij"
|
||||
// * 12-14 is for the first capture "fg"
|
||||
// * 15-18 is for the second capture "hij"
|
||||
|
||||
var buffer bytes.Buffer
|
||||
nonMatchStartIndex := 0
|
||||
|
||||
for _, row := range matrix {
|
||||
buffer.WriteString(input[nonMatchStartIndex:row[0]])
|
||||
|
||||
// "\0" .. "\9"
|
||||
captures := make([]string, 10)
|
||||
di := 0
|
||||
n := len(row)
|
||||
for si := 0; si < n && di <= 9; si += 2 {
|
||||
start := row[si]
|
||||
end := row[si+1]
|
||||
if start >= 0 && end >= 0 {
|
||||
captures[di] = input[start:end]
|
||||
}
|
||||
di += 1
|
||||
}
|
||||
|
||||
// If the replacement had no captures, e.g. "xyz", we would insert it
|
||||
//
|
||||
// "..." -> "..."
|
||||
// "ab_cde" -> "xyz" --- here
|
||||
// "..." -> "..."
|
||||
// "fg_hij" -> "xyz" --- and here
|
||||
// "..." -> "..."
|
||||
//
|
||||
// using buffer.WriteString(replacement). However, this function exists
|
||||
// to handle the case when the replacement string has captures like
|
||||
// "\2:\1", so we need to produce
|
||||
//
|
||||
// "..." -> "..."
|
||||
// "ab_cde" -> "cde:ab" --- here
|
||||
// "..." -> "..."
|
||||
// "fg_hij" -> "hij:fg" --- and here
|
||||
// "..." -> "..."
|
||||
updatedReplacement := InterpolateCaptures(
|
||||
replacement,
|
||||
replacementCaptureMatrix,
|
||||
captures,
|
||||
)
|
||||
buffer.WriteString(updatedReplacement)
|
||||
|
||||
nonMatchStartIndex = row[1]
|
||||
if breakOnFirst {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
buffer.WriteString(input[nonMatchStartIndex:])
|
||||
return buffer.String()
|
||||
}
|
||||
190
pkg/lib/regex_test.go
Normal file
190
pkg/lib/regex_test.go
Normal file
|
|
@ -0,0 +1,190 @@
|
|||
// ================================================================
|
||||
// Most Miller tests (thousands of them) are command-line-driven via
|
||||
// mlr regtest. Here are some cases needing special focus.
|
||||
// ================================================================
|
||||
|
||||
package lib
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
type (
	// tDataForHasCaptures is one test case for RegexReplacementHasCaptures:
	// a replacement string with the expected return values.
	tDataForHasCaptures struct {
		replacement         string
		expectedHasCaptures bool
		expectedMatrix      [][]int
	}

	// tDataForSubGsub is one test case shared by the sub and gsub tests:
	// input, regex string, replacement, and the expected output string.
	tDataForSubGsub struct {
		input          string
		sregex         string
		replacement    string
		expectedOutput string
	}

	// tDataForMatches is one test case for the matches tests: input and
	// regex string with the expected boolean and captures.
	tDataForMatches struct {
		input            string
		sregex           string
		expectedOutput   bool
		expectedCaptures []string
	}
)
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
var dataForHasCaptures = []tDataForHasCaptures{
|
||||
{"foo", false, nil},
|
||||
{"\\0", true, [][]int{{0, 2, 0, 2}}},
|
||||
{"\\3", true, [][]int{{0, 2, 0, 2}}},
|
||||
{"\\34", true, [][]int{{0, 2, 0, 2}}},
|
||||
{"abc\\1def\\2ghi", true, [][]int{{3, 5, 3, 5}, {8, 10, 8, 10}}},
|
||||
}
|
||||
|
||||
var dataForSub = []tDataForSubGsub{
|
||||
{"abcde", "c", "X", "abXde"},
|
||||
{"abcde", "z", "X", "abcde"},
|
||||
{"abcde", "[a-z]", "X", "Xbcde"},
|
||||
{"abcde", "[A-Z]", "X", "abcde"},
|
||||
|
||||
{"abcde", "c", "X", "abXde"},
|
||||
{"abcde", "z", "X", "abcde"},
|
||||
{"abcde", "[a-z]", "X", "Xbcde"},
|
||||
{"abcde", "[A-Z]", "X", "abcde"},
|
||||
|
||||
{"ab_cde", "(..)_(...)", "\\2\\1", "cdeab"},
|
||||
{"ab_cde", "(..)_(...)", "\\2-\\1", "cde-ab"},
|
||||
{"ab_cde", "(..)_(...)", "X\\2Y\\1Z", "XcdeYabZ"},
|
||||
|
||||
{"foofoofoo", "(f.o)", "b\\1r", "bfoorfoofoo"},
|
||||
{"foofoofoo", "(f.*o)", "b\\1r", "bfoofoofoor"},
|
||||
{"foofoofoo", "(f.o)", "b\\2r", "brfoofoo"},
|
||||
{"foofoofoo", "(f.*o)", "b\\2r", "br"},
|
||||
}
|
||||
|
||||
var dataForGsub = []tDataForSubGsub{
|
||||
{"abcde", "c", "X", "abXde"},
|
||||
{"abcde", "z", "X", "abcde"},
|
||||
{"abcde", "[a-z]", "X", "XXXXX"},
|
||||
{"abcde", "[A-Z]", "X", "abcde"},
|
||||
{"abcde", "[c-d]", "X", "abXXe"},
|
||||
|
||||
{"abcde", "c", "X", "abXde"},
|
||||
{"abcde", "z", "X", "abcde"},
|
||||
{"abcde", "[a-z]", "X", "XXXXX"},
|
||||
{"abcde", "[A-Z]", "X", "abcde"},
|
||||
{"abcde", "[c-d]", "X", "abXXe"},
|
||||
|
||||
{"abacad", "a(.)", "<\\1>", "<b><c><d>"},
|
||||
{"abacad", "a(.)", "<\\2>", "<><><>"},
|
||||
}
|
||||
|
||||
var dataForMatches = []tDataForMatches{
|
||||
{"abcde", "[A-Z]", false, []string{"", "", "", "", "", "", "", "", "", ""}},
|
||||
{"abcde", "[a-z]", true, []string{"a", "", "", "", "", "", "", "", "", ""}},
|
||||
{"...ab_cde...", "(..)_(...)", true, []string{"ab_cde", "ab", "cde", "", "", "", "", "", "", ""}},
|
||||
{"...ab_cde...fg_hij...", "(..)_(...)", true, []string{"ab_cde", "ab", "cde", "", "", "", "", "", "", ""}},
|
||||
{"foofoofoo", "(f.o)", true, []string{"foo", "foo", "", "", "", "", "", "", "", ""}},
|
||||
{"foofoofoo", "(f.*o)", true, []string{"foofoofoo", "foofoofoo", "", "", "", "", "", "", "", ""}},
|
||||
}
|
||||
|
||||
func TestRegexReplacementHasCaptures(t *testing.T) {
|
||||
for i, entry := range dataForHasCaptures {
|
||||
actualHasCaptures, actualMatrix := RegexReplacementHasCaptures(entry.replacement)
|
||||
if actualHasCaptures != entry.expectedHasCaptures {
|
||||
t.Fatalf("case %d replacement \"%s\" expected %v got %v\n",
|
||||
i, entry.replacement, entry.expectedHasCaptures, actualHasCaptures,
|
||||
)
|
||||
}
|
||||
if !compareMatrices(actualMatrix, entry.expectedMatrix) {
|
||||
t.Fatalf("case %d replacement \"%s\" expected matrix %#v got %#v\n",
|
||||
i, entry.replacement, entry.expectedMatrix, actualMatrix,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestRegexSub(t *testing.T) {
|
||||
for i, entry := range dataForSub {
|
||||
actualOutput := RegexSub(entry.input, entry.sregex, entry.replacement)
|
||||
if actualOutput != entry.expectedOutput {
|
||||
t.Fatalf("case %d input \"%s\" sregex \"%s\" replacement \"%s\" expected \"%s\" got \"%s\"\n",
|
||||
i, entry.input, entry.sregex, entry.replacement, entry.expectedOutput, actualOutput,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestRegexGsub(t *testing.T) {
|
||||
for i, entry := range dataForGsub {
|
||||
actualOutput := RegexGsub(entry.input, entry.sregex, entry.replacement)
|
||||
if actualOutput != entry.expectedOutput {
|
||||
t.Fatalf("case %d input \"%s\" sregex \"%s\" replacement \"%s\" expected \"%s\" got \"%s\"\n",
|
||||
i, entry.input, entry.sregex, entry.replacement, entry.expectedOutput, actualOutput,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestRegexMatches(t *testing.T) {
|
||||
for i, entry := range dataForMatches {
|
||||
actualOutput, actualCaptures := RegexMatches(entry.input, entry.sregex)
|
||||
if actualOutput != entry.expectedOutput {
|
||||
t.Fatalf("case %d input \"%s\" sregex \"%s\" expected %v got %v\n",
|
||||
i, entry.input, entry.sregex, entry.expectedOutput, actualOutput,
|
||||
)
|
||||
}
|
||||
if !compareCaptures(actualCaptures, entry.expectedCaptures) {
|
||||
t.Fatalf("case %d input \"%s\" sregex \"%s\" expected captures %#v got %#v\n",
|
||||
i, entry.input, entry.sregex, entry.expectedCaptures, actualCaptures,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// compareMatrices reports whether two integer matrices are equal.
//
// Two nil matrices are equal; a nil matrix never equals a non-nil one, even
// an empty one. Otherwise the matrices must have the same number of rows,
// each pair of rows must have the same length, and every cell must agree.
// Individual rows are compared by length and content only, so a nil row and
// an empty row are treated alike.
func compareMatrices(
	actualMatrix [][]int,
	expectedMatrix [][]int,
) bool {
	actualIsNil, expectedIsNil := actualMatrix == nil, expectedMatrix == nil
	if actualIsNil || expectedIsNil {
		return actualIsNil && expectedIsNil
	}

	if len(actualMatrix) != len(expectedMatrix) {
		return false
	}

	for rowIndex, wantRow := range expectedMatrix {
		gotRow := actualMatrix[rowIndex]
		if len(gotRow) != len(wantRow) {
			return false
		}
		for colIndex, wantCell := range wantRow {
			if gotRow[colIndex] != wantCell {
				return false
			}
		}
	}
	return true
}
|
||||
|
||||
// compareCaptures reports whether two capture slices are equal.
//
// Two nil slices are equal; a nil slice never equals a non-nil one, even an
// empty one. Otherwise the slices must have the same length and the same
// string at every position.
func compareCaptures(
	actualCaptures []string,
	expectedCaptures []string,
) bool {
	actualIsNil, expectedIsNil := actualCaptures == nil, expectedCaptures == nil
	if actualIsNil || expectedIsNil {
		return actualIsNil && expectedIsNil
	}

	if len(actualCaptures) != len(expectedCaptures) {
		return false
	}

	for position, want := range expectedCaptures {
		if actualCaptures[position] != want {
			return false
		}
	}
	return true
}
|
||||
278
pkg/lib/stats.go
Normal file
278
pkg/lib/stats.go
Normal file
|
|
@ -0,0 +1,278 @@
|
|||
// ================================================================
|
||||
// These are intended for streaming (i.e. single-pass) applications. Otherwise
|
||||
// the formulas look different (and are more intuitive).
|
||||
// ================================================================
|
||||
|
||||
package lib
|
||||
|
||||
import (
|
||||
"math"
|
||||
)
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// Univariate linear regression
|
||||
// ----------------------------------------------------------------
|
||||
// There are N (xi, yi) pairs.
|
||||
//
|
||||
// minimize E = sum (yi - m xi - b)^2
|
||||
//
|
||||
// Set the two partial derivatives to zero and solve for m and b:
|
||||
//
|
||||
// DE/Dm = sum 2 (yi - m xi - b) (-xi) = 0
|
||||
// DE/Db = sum 2 (yi - m xi - b) (-1) = 0
|
||||
//
|
||||
// sum (yi - m xi - b) (xi) = 0
|
||||
// sum (yi - m xi - b) = 0
|
||||
//
|
||||
// sum (xi yi - m xi^2 - b xi) = 0
|
||||
// sum (yi - m xi - b) = 0
|
||||
//
|
||||
// m sum(xi^2) + b sum(xi) = sum(xi yi)
|
||||
// m sum(xi) + b N = sum(yi)
|
||||
//
|
||||
// [ sum(xi^2) sum(xi) ] [ m ] = [ sum(xi yi) ]
|
||||
// [ sum(xi) N ] [ b ] = [ sum(yi) ]
|
||||
//
|
||||
// [ m ] = [ sum(xi^2) sum(xi) ]^-1 [ sum(xi yi) ]
|
||||
// [ b ] [ sum(xi) N ] [ sum(yi) ]
|
||||
//
|
||||
// = [ N -sum(xi) ] [ sum(xi yi) ] * 1/D
|
||||
// [ -sum(xi) sum(xi^2)] [ sum(yi) ]
|
||||
//
|
||||
// where
|
||||
//
|
||||
// D = N sum(xi^2) - sum(xi)^2.
|
||||
//
|
||||
// So
|
||||
//
|
||||
// N sum(xi yi) - sum(xi) sum(yi)
|
||||
// m = --------------------------------
|
||||
// D
|
||||
//
|
||||
// -sum(xi)sum(xi yi) + sum(xi^2) sum(yi)
|
||||
// b = ----------------------------------------
|
||||
// D
|
||||
//
|
||||
// ----------------------------------------------------------------
|
||||
|
||||
// GetLinearRegressionOLS computes the ordinary-least-squares fit y = m*x + b
// from streamed accumulators: the point count nint, and the sums of x, x^2,
// x*y, and y. It returns the slope m and the intercept b.
//
// No guard is applied to the determinant n*sumx2 - sumx^2; when it is zero
// the results follow floating-point division-by-zero rules.
func GetLinearRegressionOLS(
	nint int64,
	sumx float64,
	sumx2 float64,
	sumxy float64,
	sumy float64,
) (m, b float64) {
	count := float64(nint)

	// Determinant of the 2x2 normal-equations matrix.
	determinant := count*sumx2 - sumx*sumx

	slope := (count*sumxy - sumx*sumy) / determinant
	intercept := (-sumx*sumxy + sumx2*sumy) / determinant
	return slope, intercept
}
|
||||
|
||||
// We would need a second pass through the data to compute the error-bars given
|
||||
// the data and the m and the b.
|
||||
//
|
||||
// # Young 1962, pp. 122-124. Compute sample variance of linear
|
||||
// # approximations, then variances of m and b.
|
||||
// var_z = 0.0
|
||||
// for i in range(0, N):
|
||||
// var_z += (m * xs[i] + b - ys[i])**2
|
||||
// var_z /= N
|
||||
//
|
||||
// var_m = (N * var_z) / D
|
||||
// var_b = (var_z * sumx2) / D
|
||||
//
|
||||
// output = [m, b, math.sqrt(var_m), math.sqrt(var_b)]
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
|
||||
// GetVar finalizes a streamed variance computation. Given the sample count
// nint, the sum of the samples sumx, and the sum of their squares sumx2, it
// returns the sum of squared deviations from the mean divided by n-1.
//
// No guard is applied for nint < 2; the result then follows floating-point
// division rules.
func GetVar(
	nint int64,
	sumx float64,
	sumx2 float64,
) float64 {
	count := float64(nint)
	average := sumx / count

	// sum{(xi-mean)^2} expressed in terms of the accumulators.
	sumSquaredDeviations := sumx2 - average*(2.0*sumx-count*average)

	// Round-off error can push a mathematically non-negative quantity
	// slightly below zero; clamp it.
	if sumSquaredDeviations < 0.0 {
		sumSquaredDeviations = 0.0
	}

	return sumSquaredDeviations / (count - 1.0)
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// Unbiased estimator:
|
||||
// (1/n) sum{(xi-mean)**3}
|
||||
// -----------------------------
|
||||
// [(1/(n-1)) sum{(xi-mean)**2}]**1.5
|
||||
|
||||
// mean = sumx / n; n mean = sumx
|
||||
|
||||
// sum{(xi-mean)^3}
|
||||
// = sum{xi^3 - 3 mean xi^2 + 3 mean^2 xi - mean^3}
|
||||
// = sum{xi^3} - 3 mean sum{xi^2} + 3 mean^2 sum{xi} - n mean^3
|
||||
// = sumx3 - 3 mean sumx2 + 3 mean^2 sumx - n mean^3
|
||||
// = sumx3 - 3 mean sumx2 + 3n mean^3 - n mean^3
|
||||
// = sumx3 - 3 mean sumx2 + 2n mean^3
|
||||
// = sumx3 - mean*(3 sumx2 - 2n mean^2)
|
||||
|
||||
// sum{(xi-mean)^2}
|
||||
// = sum{xi^2 - 2 mean xi + mean^2}
|
||||
// = sum{xi^2} - 2 mean sum{xi} + n mean^2
|
||||
// = sumx2 - 2 mean sumx + n mean^2
|
||||
// = sumx2 - 2 n mean^2 + n mean^2
|
||||
// = sumx2 - n mean^2
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
|
||||
// GetSkewness finalizes a streamed skewness computation. Given the sample
// count nint and the sums of x, x^2, and x^3, it returns
//
//	(1/n) sum{(xi-mean)^3} / [(1/(n-1)) sum{(xi-mean)^2}]^1.5
//
// No guard is applied for small nint or zero spread; the result then follows
// floating-point rules.
func GetSkewness(
	nint int,
	sumx float64,
	sumx2 float64,
	sumx3 float64,
) float64 {
	count := float64(nint)
	average := sumx / count

	// sum{(xi-mean)^3} in terms of the accumulators, then averaged over n.
	thirdCentralSum := sumx3 - average*(3*sumx2-2*count*average*average)
	thirdCentralMoment := thirdCentralSum / count

	// sum{(xi-mean)^2} / (n-1), raised to the 3/2 power.
	sampleVariance := (sumx2 - count*average*average) / (count - 1)
	scale := math.Pow(sampleVariance, 1.5)

	return thirdCentralMoment / scale
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// Unbiased:
|
||||
// (1/n) sum{(x-mean)**4}
|
||||
// ----------------------- - 3
|
||||
// [(1/n) sum{(x-mean)**2}]**2
|
||||
|
||||
// sum{(xi-mean)^4}
|
||||
// = sum{xi^4 - 4 mean xi^3 + 6 mean^2 xi^2 - 4 mean^3 xi + mean^4}
|
||||
// = sum{xi^4} - 4 mean sum{xi^3} + 6 mean^2 sum{xi^2} - 4 mean^3 sum{xi} + n mean^4
|
||||
// = sum{xi^4} - 4 mean sum{xi^3} + 6 mean^2 sum{xi^2} - 4 n mean^4 + n mean^4
|
||||
// = sum{xi^4} - 4 mean sum{xi^3} + 6 mean^2 sum{xi^2} - 3 n mean^4
|
||||
// = sum{xi^4} - mean*(4 sum{xi^3} - 6 mean sum{xi^2} + 3 n mean^3)
|
||||
// = sumx4 - mean*(4 sumx3 - 6 mean sumx2 + 3 n mean^3)
|
||||
// = sumx4 - mean*(4 sumx3 - mean*(6 sumx2 - 3 n mean^2))
|
||||
|
||||
// GetKurtosis finalizes a streamed excess-kurtosis computation. Given the
// sample count nint and the sums of x, x^2, x^3, and x^4, it returns
//
//	(1/n) sum{(xi-mean)^4} / [(1/n) sum{(xi-mean)^2}]^2 - 3
//
// No guard is applied for nint == 0 or zero spread; the result then follows
// floating-point rules.
func GetKurtosis(
	nint int,
	sumx float64,
	sumx2 float64,
	sumx3 float64,
	sumx4 float64,
) float64 {
	count := float64(nint)
	average := sumx / count

	// sum{(xi-mean)^4} in terms of the accumulators, then averaged over n.
	fourthCentralSum := sumx4 - average*(4*sumx3-average*(6*sumx2-3*count*average*average))
	fourthCentralMoment := fourthCentralSum / count

	// (1/n) sum{(xi-mean)^2}, squared.
	secondCentralMoment := (sumx2 - count*average*average) / count
	scale := secondCentralMoment * secondCentralMoment

	return fourthCentralMoment/scale - 3.0
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// Non-streaming implementation:
|
||||
//
|
||||
// def find_sample_covariance(xs, ys):
|
||||
// n = len(xs)
|
||||
// mean_x = find_mean(xs)
|
||||
// mean_y = find_mean(ys)
|
||||
//
|
||||
// sum = 0.0
|
||||
// for k in range(0, n):
|
||||
// sum += (xs[k] - mean_x) * (ys[k] - mean_y)
|
||||
//
|
||||
// return sum / (n-1.0)
|
||||
|
||||
// GetCov finalizes a streamed sample-covariance computation. Given the pair
// count nint and the sums of x, y, and x*y, it returns
// sum{(xi-meanx)(yi-meany)} / (n-1).
//
// No guard is applied for nint < 2; the result then follows floating-point
// division rules.
func GetCov(
	nint int64,
	sumx float64,
	sumy float64,
	sumxy float64,
) float64 {
	count := float64(nint)
	averageX := sumx / count
	averageY := sumy / count

	// Expansion of sum{(xi-meanx)(yi-meany)} in terms of the accumulators.
	sumCrossDeviations := sumxy - averageX*sumy - averageY*sumx + count*averageX*averageY

	return sumCrossDeviations / (count - 1)
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// GetCovMatrix finalizes a streamed computation of the 2x2 sample covariance
// matrix of paired x and y values. Given the pair count nint and the sums of
// x, x^2, y, y^2, and x*y, it returns
//
//	[ var(x)    cov(x,y) ]
//	[ cov(x,y)  var(y)   ]
//
// with each entry normalized by n-1. No guard is applied for nint < 2.
func GetCovMatrix(
	nint int64,
	sumx float64,
	sumx2 float64,
	sumy float64,
	sumy2 float64,
	sumxy float64,
) (Q [2][2]float64) {
	count := float64(nint)
	degreesOfFreedom := count - 1

	varianceX := (sumx2 - sumx*sumx/count) / degreesOfFreedom
	covarianceXY := (sumxy - sumx*sumy/count) / degreesOfFreedom
	varianceY := (sumy2 - sumy*sumy/count) / degreesOfFreedom

	// The matrix is symmetric: both off-diagonal entries share one value.
	return [2][2]float64{
		{varianceX, covarianceXY},
		{covarianceXY, varianceY},
	}
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// Principal component analysis can be used for linear regression:
|
||||
//
|
||||
// * Compute the covariance matrix for the x's and y's.
|
||||
//
|
||||
// * Find the eigenvalues and eigenvectors of that matrix. (It is real-symmetric
|
||||
// so Jacobi iteration is simple and fine.)
|
||||
//
|
||||
// * The principal eigenvector points in the direction of the fit.
|
||||
//
|
||||
// * The covariance matrix is computed on zero-mean data so the intercept
|
||||
// is zero. The fit equation is of the form (y - nu) = m*(x - mu) where mu
|
||||
// and nu are x and y means, respectively.
|
||||
//
|
||||
// * If the fit is perfect then the 2nd eigenvalue will be zero; if the fit is
|
||||
// good then the 2nd eigenvalue will be smaller; if the fit is bad then
|
||||
// they'll be about the same. I use 1 - |lambda2|/|lambda1| as an indication
|
||||
// of quality of the fit.
|
||||
//
|
||||
// Standard ("ordinary least-squares") linear regression is appropriate when
|
||||
// the errors are thought to be all in the y's. PCA ("total least-squares") is
|
||||
// appropriate when the x's and the y's are thought to both have errors.
|
||||
|
||||
// GetLinearRegressionPCA turns the eigen-decomposition of an x/y covariance
// matrix into a line fit y = m*x + b plus a fit-quality indicator.
//
// The slope m is the ratio of the second to the first component of
// eigenvector_1; the intercept b is chosen so the line passes through
// (x_mean, y_mean). The quality is 1 - |eigenvalue_2|/|eigenvalue_1|, except
// that it is 0 when eigenvalue_1 is zero and 1 when eigenvalue_2 is zero.
// eigenvector_2 is accepted but not used.
func GetLinearRegressionPCA(
	eigenvalue_1 float64,
	eigenvalue_2 float64,
	eigenvector_1 [2]float64,
	eigenvector_2 [2]float64,
	x_mean float64,
	y_mean float64,
) (m, b, quality float64) {
	magnitude1 := math.Abs(eigenvalue_1)
	magnitude2 := math.Abs(eigenvalue_2)

	switch {
	case magnitude1 == 0.0:
		quality = 0.0
	case magnitude2 > 0.0:
		quality = 1.0 - magnitude2/magnitude1
	default:
		quality = 1.0
	}

	// The principal eigenvector points along the fitted line.
	m = eigenvector_1[1] / eigenvector_1[0]
	b = y_mean - m*x_mean
	return m, b, quality
}
|
||||
187
pkg/lib/time.go
Normal file
187
pkg/lib/time.go
Normal file
|
|
@ -0,0 +1,187 @@
|
|||
package lib
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"time"
|
||||
)
|
||||
|
||||
// SetTZFromEnv reads the TZ environment variable, loads the named location,
// and installs it as time.Local. If the location cannot be loaded, an error
// naming the offending TZ value is returned and time.Local is left untouched.
//
// There are three reasons for doing this explicitly:
//
//  1. On Windows (as of 2021-10-20), this is necessary to get $TZ into use.
//  2. On Linux/Mac, it is not necessary for the initial value of TZ at
//     startup. However, if someone does 'export TZ=Something/Invalid' and
//     then runs Miller, the invalid TZ would otherwise be silently ignored;
//     we want to surface that error to the user.
//  3. On any platform this is necessary for changing TZ mid-process, e.g. if
//     a DSL statement does 'ENV["TZ"] = "Asia/Istanbul"'.
func SetTZFromEnv() error {
	zoneName := os.Getenv("TZ")

	zone, loadErr := time.LoadLocation(zoneName)
	if loadErr == nil {
		time.Local = zone
		return nil
	}

	return fmt.Errorf("TZ environment variable appears malformed: \"%s\"", zoneName)
}
|
||||
|
||||
func Sec2GMT(epochSeconds float64, numDecimalPlaces int) string {
|
||||
return secToFormattedTime(epochSeconds, numDecimalPlaces, false, nil)
|
||||
}
|
||||
|
||||
func Nsec2GMT(epochNanoseconds int64, numDecimalPlaces int) string {
|
||||
return nsecToFormattedTime(epochNanoseconds, numDecimalPlaces, false, nil)
|
||||
}
|
||||
|
||||
func Sec2LocalTime(epochSeconds float64, numDecimalPlaces int) string {
|
||||
return secToFormattedTime(epochSeconds, numDecimalPlaces, true, nil)
|
||||
}
|
||||
|
||||
func Nsec2LocalTime(epochNanoseconds int64, numDecimalPlaces int) string {
|
||||
return nsecToFormattedTime(epochNanoseconds, numDecimalPlaces, true, nil)
|
||||
}
|
||||
|
||||
func Sec2LocationTime(epochSeconds float64, numDecimalPlaces int, location *time.Location) string {
|
||||
return secToFormattedTime(epochSeconds, numDecimalPlaces, true, location)
|
||||
}
|
||||
|
||||
func Nsec2LocationTime(epochNanoseconds int64, numDecimalPlaces int, location *time.Location) string {
|
||||
return nsecToFormattedTime(epochNanoseconds, numDecimalPlaces, true, location)
|
||||
}
|
||||
|
||||
// secToFormattedTime is for DSL functions sec2gmt and sec2localtime. If doLocal is
|
||||
// false, use UTC. Else if location is nil, use $TZ environment variable. Else
|
||||
// use the specified location.
|
||||
func secToFormattedTime(epochSeconds float64, numDecimalPlaces int, doLocal bool, location *time.Location) string {
|
||||
intPart := int64(epochSeconds)
|
||||
fractionalPart := epochSeconds - float64(intPart)
|
||||
if fractionalPart < 0 {
|
||||
intPart -= 1
|
||||
fractionalPart += 1.0
|
||||
}
|
||||
|
||||
t := time.Unix(intPart, int64(fractionalPart*1e9))
|
||||
return goTimeToFormattedTime(t, numDecimalPlaces, doLocal, location)
|
||||
}
|
||||
|
||||
// nsecToFormattedTime is for DSL functions nsec2gmt and nsec2localtime. If doLocal is
|
||||
// false, use UTC. Else if location is nil, use $TZ environment variable. Else
|
||||
// use the specified location.
|
||||
func nsecToFormattedTime(epochNanoseconds int64, numDecimalPlaces int, doLocal bool, location *time.Location) string {
|
||||
t := time.Unix(epochNanoseconds/1000000000, epochNanoseconds%1000000000)
|
||||
return goTimeToFormattedTime(t, numDecimalPlaces, doLocal, location)
|
||||
}
|
||||
|
||||
// nsToFracDivisors is indexed by the desired number of decimal places (1-9)
// and holds the divisor that truncates a nanosecond count to that many
// digits. Slot 0 is a placeholder and is never used as a divisor.
var nsToFracDivisors = []int{
	0,         // 0 decimal places: unused
	100000000, // 1
	10000000,  // 2
	1000000,   // 3
	100000,    // 4
	10000,     // 5
	1000,      // 6
	100,       // 7
	10,        // 8
	1,         // 9
}

// goTimeToFormattedTime renders t as a timestamp string.
//
// Zone selection: if doLocal is false the time is shown in UTC; otherwise a
// non-nil location is used, and a nil location means the process-local zone.
//
// Layout: UTC output is "YYYY-MM-DDThh:mm:ssZ"; local/location output is
// "YYYY-MM-DD hh:mm:ss" with no zone suffix. numDecimalPlaces is clamped to
// the range 0-9; when it is non-zero, that many truncated (not rounded)
// fractional-second digits are appended after a decimal point.
func goTimeToFormattedTime(t time.Time, numDecimalPlaces int, doLocal bool, location *time.Location) string {
	switch {
	case !doLocal:
		t = t.UTC()
	case location != nil:
		t = t.In(location)
	default:
		t = t.Local()
	}

	year, month, day := t.Date()
	hour, minute, second := t.Clock()

	if numDecimalPlaces < 0 {
		numDecimalPlaces = 0
	} else if numDecimalPlaces > 9 {
		numDecimalPlaces = 9
	}

	// Whole seconds only: no divisor lookup (slot 0 of the table is unusable).
	if numDecimalPlaces == 0 {
		if doLocal {
			return fmt.Sprintf(
				"%04d-%02d-%02d %02d:%02d:%02d",
				year, int(month), day, hour, minute, second)
		}
		return fmt.Sprintf(
			"%04d-%02d-%02dT%02d:%02d:%02dZ",
			year, int(month), day, hour, minute, second)
	}

	// Integer division truncates the nanoseconds to the requested digits;
	// the %0*d verb then left-pads with zeros to exactly that width.
	fraction := t.Nanosecond() / nsToFracDivisors[numDecimalPlaces]
	if doLocal {
		return fmt.Sprintf(
			"%04d-%02d-%02d %02d:%02d:%02d.%0*d",
			year, int(month), day, hour, minute, second, numDecimalPlaces, fraction)
	}
	return fmt.Sprintf(
		"%04d-%02d-%02dT%02d:%02d:%02d.%0*dZ",
		year, int(month), day, hour, minute, second, numDecimalPlaces, fraction)
}
|
||||
|
||||
func EpochSecondsToGMT(epochSeconds float64) time.Time {
|
||||
return epochSecondsToTime(epochSeconds, false, nil)
|
||||
}
|
||||
|
||||
func EpochNanosecondsToGMT(epochNanoseconds int64) time.Time {
|
||||
return epochNanosecondsToTime(epochNanoseconds, false, nil)
|
||||
}
|
||||
|
||||
func EpochSecondsToLocalTime(epochSeconds float64) time.Time {
|
||||
return epochSecondsToTime(epochSeconds, true, nil)
|
||||
}
|
||||
|
||||
func EpochNanosecondsToLocalTime(epochNanoseconds int64) time.Time {
|
||||
return epochNanosecondsToTime(epochNanoseconds, true, nil)
|
||||
}
|
||||
|
||||
func EpochSecondsToLocationTime(epochSeconds float64, location *time.Location) time.Time {
|
||||
return epochSecondsToTime(epochSeconds, true, location)
|
||||
}
|
||||
|
||||
func EpochNanosecondsToLocationTime(epochNanoseconds int64, location *time.Location) time.Time {
|
||||
return epochNanosecondsToTime(epochNanoseconds, true, location)
|
||||
}
|
||||
|
||||
// epochSecondsToTime converts floating-point seconds since the Unix epoch to
// a time.Time. The value is split into whole seconds (truncated toward zero)
// and a nanosecond remainder, which may be negative for negative inputs;
// time.Unix accepts that.
//
// If doLocal is false the result is in UTC. Otherwise a nil location selects
// the process-local zone and a non-nil location is used as given.
func epochSecondsToTime(epochSeconds float64, doLocal bool, location *time.Location) time.Time {
	wholeSeconds := int64(epochSeconds)
	fraction := epochSeconds - float64(wholeSeconds)
	nanoseconds := int64(fraction * 1e9)

	instant := time.Unix(wholeSeconds, nanoseconds)
	switch {
	case !doLocal:
		return instant.UTC()
	case location == nil:
		return instant.Local()
	default:
		return instant.In(location)
	}
}
|
||||
|
||||
// epochNanosecondsToTime converts integer nanoseconds since the Unix epoch
// to a time.Time. The value is split into whole seconds and a nanosecond
// remainder, which is negative for negative inputs that are not whole
// seconds; time.Unix accepts that.
//
// If doLocal is false the result is in UTC. Otherwise a nil location selects
// the process-local zone and a non-nil location is used as given.
func epochNanosecondsToTime(epochNanoseconds int64, doLocal bool, location *time.Location) time.Time {
	const nanosPerSecond = 1000000000

	wholeSeconds := epochNanoseconds / nanosPerSecond
	remainder := epochNanoseconds % nanosPerSecond

	instant := time.Unix(wholeSeconds, remainder)
	switch {
	case !doLocal:
		return instant.UTC()
	case location == nil:
		return instant.Local()
	default:
		return instant.In(location)
	}
}
|
||||
101
pkg/lib/time_test.go
Normal file
101
pkg/lib/time_test.go
Normal file
|
|
@ -0,0 +1,101 @@
|
|||
// ================================================================
|
||||
// Most Miller tests (thousands of them) are command-line-driven via
|
||||
// mlr regtest. Here are some cases needing special focus.
|
||||
// ================================================================
|
||||
|
||||
package lib
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
type tDataForSec2GMT struct {
|
||||
epochSeconds float64
|
||||
numDecimalPlaces int
|
||||
expectedOutput string
|
||||
}
|
||||
|
||||
var dataForSec2GMT = []tDataForSec2GMT{
|
||||
{0.0, 0, "1970-01-01T00:00:00Z"},
|
||||
{0.0, 6, "1970-01-01T00:00:00.000000Z"},
|
||||
{1.0, 6, "1970-01-01T00:00:01.000000Z"},
|
||||
{123456789.25, 3, "1973-11-29T21:33:09.250Z"},
|
||||
}
|
||||
|
||||
func TestSec2GMT(t *testing.T) {
|
||||
for _, entry := range dataForSec2GMT {
|
||||
assert.Equal(t, entry.expectedOutput, Sec2GMT(entry.epochSeconds, entry.numDecimalPlaces))
|
||||
}
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
type tDataForNsec2GMT struct {
|
||||
epochNanoseconds int64
|
||||
numDecimalPlaces int
|
||||
expectedOutput string
|
||||
}
|
||||
|
||||
var dataForNsec2GMT = []tDataForNsec2GMT{
|
||||
{0, 0, "1970-01-01T00:00:00Z"},
|
||||
{0, 6, "1970-01-01T00:00:00.000000Z"},
|
||||
{946684800123456789, 0, "2000-01-01T00:00:00Z"},
|
||||
{946684800123456789, 1, "2000-01-01T00:00:00.1Z"},
|
||||
{946684800123456789, 2, "2000-01-01T00:00:00.12Z"},
|
||||
{946684800123456789, 3, "2000-01-01T00:00:00.123Z"},
|
||||
{946684800123456789, 4, "2000-01-01T00:00:00.1234Z"},
|
||||
{946684800123456789, 5, "2000-01-01T00:00:00.12345Z"},
|
||||
{946684800123456789, 6, "2000-01-01T00:00:00.123456Z"},
|
||||
{946684800123456789, 7, "2000-01-01T00:00:00.1234567Z"},
|
||||
{946684800123456789, 8, "2000-01-01T00:00:00.12345678Z"},
|
||||
{946684800123456789, 9, "2000-01-01T00:00:00.123456789Z"},
|
||||
}
|
||||
|
||||
func TestNsec2GMT(t *testing.T) {
|
||||
for _, entry := range dataForNsec2GMT {
|
||||
actualOutput := Nsec2GMT(entry.epochNanoseconds, entry.numDecimalPlaces)
|
||||
assert.Equal(t, entry.expectedOutput, actualOutput)
|
||||
}
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
type tDataForEpochSecondsToGMT struct {
|
||||
epochSeconds float64
|
||||
expectedOutput time.Time
|
||||
}
|
||||
|
||||
var dataForEpochSecondsToGMT = []tDataForEpochSecondsToGMT{
|
||||
{0.0, time.Unix(0, 0).UTC()},
|
||||
{1.25, time.Unix(1, 250000000).UTC()},
|
||||
{123456789.25, time.Unix(123456789, 250000000).UTC()},
|
||||
}
|
||||
|
||||
func TestEpochSecondsToGMT(t *testing.T) {
|
||||
for _, entry := range dataForEpochSecondsToGMT {
|
||||
assert.Equal(t, entry.expectedOutput, EpochSecondsToGMT(entry.epochSeconds))
|
||||
}
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
type tDataForEpochNanosecondsToGMT struct {
|
||||
epochNanoseconds int64
|
||||
expectedOutput time.Time
|
||||
}
|
||||
|
||||
var dataForEpochNanosecondsToGMT = []tDataForEpochNanosecondsToGMT{
|
||||
{0, time.Unix(0, 0).UTC()},
|
||||
{1000000000, time.Unix(1, 0).UTC()},
|
||||
{1200000000, time.Unix(1, 200000000).UTC()},
|
||||
{-1000000000, time.Unix(-1, 0).UTC()},
|
||||
{-1200000000, time.Unix(-1, -200000000).UTC()},
|
||||
{123456789250000047, time.Unix(123456789, 250000047).UTC()},
|
||||
}
|
||||
|
||||
func TestEpochNanosecondsToGMT(t *testing.T) {
|
||||
for _, entry := range dataForEpochNanosecondsToGMT {
|
||||
assert.Equal(t, entry.expectedOutput, EpochNanosecondsToGMT(entry.epochNanoseconds))
|
||||
}
|
||||
}
|
||||
67
pkg/lib/tsv_codec.go
Normal file
67
pkg/lib/tsv_codec.go
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
package lib
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
)
|
||||
|
||||
// * https://en.wikipedia.org/wiki/Tab-separated_values
|
||||
// * https://www.iana.org/assignments/media-types/text/tab-separated-values
|
||||
// \n for newline,
|
||||
// \r for carriage return,
|
||||
// \t for tab,
|
||||
// \\ for backslash.
|
||||
|
||||
// TSVDecodeField is for the TSV record-reader. It expands the four TSV
// escape sequences -- backslash-backslash, backslash-n, backslash-r, and
// backslash-t -- into a backslash, newline, carriage return, and tab
// respectively. A backslash followed by any other character, or a backslash
// at the very end of the input, is copied through unchanged.
func TSVDecodeField(input string) string {
	var decoded bytes.Buffer
	length := len(input)

	for pos := 0; pos < length; /* advanced in the body */ {
		current := input[pos]

		// Ordinary bytes, and a lone trailing backslash, pass straight through.
		if current != '\\' || pos >= length-1 {
			decoded.WriteByte(current)
			pos++
			continue
		}

		switch input[pos+1] {
		case '\\':
			decoded.WriteByte('\\')
			pos += 2
		case 'n':
			decoded.WriteByte('\n')
			pos += 2
		case 'r':
			decoded.WriteByte('\r')
			pos += 2
		case 't':
			decoded.WriteByte('\t')
			pos += 2
		default:
			// Not a recognized escape: keep the backslash and let the next
			// iteration handle the character that follows it.
			decoded.WriteByte(current)
			pos++
		}
	}

	return decoded.String()
}
|
||||
|
||||
// TSVEncodeField is for the TSV record-writer. It escapes the four
// characters that are special in TSV: backslash, newline, carriage return,
// and tab become backslash-backslash, backslash-n, backslash-r, and
// backslash-t respectively. Every other byte is copied through unchanged.
//
// The input is processed byte by byte rather than rune by rune. All four
// special characters are ASCII, and ASCII bytes never occur inside a
// multi-byte UTF-8 sequence, so valid UTF-8 is encoded exactly as before.
// Bytes that are not valid UTF-8 (e.g. Latin-1 data) are preserved as-is
// instead of being replaced by U+FFFD, which keeps this function the exact
// inverse of the byte-oriented TSVDecodeField.
func TSVEncodeField(input string) string {
	var buffer bytes.Buffer
	// The output is at least as long as the input.
	buffer.Grow(len(input))

	for i := 0; i < len(input); i++ {
		switch c := input[i]; c {
		case '\\':
			buffer.WriteString(`\\`)
		case '\n':
			buffer.WriteString(`\n`)
		case '\r':
			buffer.WriteString(`\r`)
		case '\t':
			buffer.WriteString(`\t`)
		default:
			buffer.WriteByte(c)
		}
	}

	return buffer.String()
}
|
||||
35
pkg/lib/tsv_codec_test.go
Normal file
35
pkg/lib/tsv_codec_test.go
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
package lib
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestTSVDecodeField(t *testing.T) {
|
||||
assert.Equal(t, "", TSVDecodeField(""))
|
||||
assert.Equal(t, "a", TSVDecodeField("a"))
|
||||
assert.Equal(t, "abc", TSVDecodeField("abc"))
|
||||
assert.Equal(t, `\`, TSVDecodeField(`\`))
|
||||
assert.Equal(t, "\n", TSVDecodeField(`\n`))
|
||||
assert.Equal(t, "\r", TSVDecodeField(`\r`))
|
||||
assert.Equal(t, "\t", TSVDecodeField(`\t`))
|
||||
assert.Equal(t, "\\", TSVDecodeField(`\\`))
|
||||
assert.Equal(t, `\n`, TSVDecodeField(`\\n`))
|
||||
assert.Equal(t, "\\\n", TSVDecodeField(`\\\n`))
|
||||
assert.Equal(t, "abc\r\ndef\r\n", TSVDecodeField(`abc\r\ndef\r\n`))
|
||||
}
|
||||
|
||||
func TestTSVEncodeField(t *testing.T) {
|
||||
assert.Equal(t, "", TSVEncodeField(""))
|
||||
assert.Equal(t, "a", TSVEncodeField("a"))
|
||||
assert.Equal(t, "abc", TSVEncodeField("abc"))
|
||||
assert.Equal(t, `\\`, TSVEncodeField(`\`))
|
||||
assert.Equal(t, `\n`, TSVEncodeField("\n"))
|
||||
assert.Equal(t, `\r`, TSVEncodeField("\r"))
|
||||
assert.Equal(t, `\t`, TSVEncodeField("\t"))
|
||||
assert.Equal(t, `\\`, TSVEncodeField("\\"))
|
||||
assert.Equal(t, `\\n`, TSVEncodeField("\\n"))
|
||||
assert.Equal(t, `\\\n`, TSVEncodeField("\\\n"))
|
||||
assert.Equal(t, `abc\r\ndef\r\n`, TSVEncodeField("abc\r\ndef\r\n"))
|
||||
}
|
||||
246
pkg/lib/unbackslash.go
Normal file
246
pkg/lib/unbackslash.go
Normal file
|
|
@ -0,0 +1,246 @@
|
|||
// ================================================================
|
||||
// See cst.BuildStringLiteralNode for more context.
|
||||
// ================================================================
|
||||
|
||||
package lib
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"strconv"
|
||||
)
|
||||
|
||||
// unbackslashReplacements maps the character following a backslash to the
// text that the two-character escape sequence stands for.
var unbackslashReplacements = map[byte]string{
	// Control characters.
	'a': "\a", // alert (bell)
	'b': "\b", // backspace
	't': "\t", // horizontal tab
	'n': "\n", // newline
	'v': "\v", // vertical tab
	'f': "\f", // form feed
	'r': "\r", // carriage return

	// At the Miller-user level this means "\\" becomes a single backslash
	// character. It looks less clear here because the Go source must follow
	// Go's own backslashing conventions as well.
	'\\': "\\",

	// Similarly, a backslash followed by a quote or a question mark becomes
	// that character on its own.
	'\'': "'",
	'"':  "\"",
	'?':  "?",
}
|
||||
|
||||
// UnbackslashStringLiteral replaces "\t" with TAB, etc. for DSL expressions
|
||||
// like '$foo = "a\tb"'. See also
|
||||
// https://en.wikipedia.org/wiki/Escape_sequences_in_C
|
||||
// (predates the port of Miller from C to Go).
|
||||
//
|
||||
// Note that a CST-build pre-pass intentionally excludes regex literals (2nd
|
||||
// argument to sub/gsub/regextract/etc) from being modified here.
|
||||
//
|
||||
// Note "\0" .. "\9" are used for regex captures within the DSL CST builder
|
||||
// and are not touched here. (See also lib/regex.go.)
|
||||
func UnbackslashStringLiteral(input string) string {
|
||||
|
||||
// We could just do this. However, if someone has a valid "\t" in one part of the string,
|
||||
// and something else strconv.Unquote doesn't handle in another part of the string,
|
||||
// we'd fail to unbackslash the former ...
|
||||
//
|
||||
// output, err := strconv.Unquote(`"` + input + `"`)
|
||||
// if err == nil {
|
||||
// return output
|
||||
// } else {
|
||||
// return input
|
||||
// }
|
||||
//
|
||||
// ... and, given that desire, we don't a priori know how many digits in Unicode
|
||||
// escape sequences -- so we *require* that people use four hex digits after \u
|
||||
// and eight hex digits after \U.
|
||||
|
||||
var buffer bytes.Buffer
|
||||
|
||||
n := len(input)
|
||||
|
||||
for i := 0; i < n; /* increment in loop */ {
|
||||
if input[i] != '\\' {
|
||||
buffer.WriteByte(input[i])
|
||||
i++
|
||||
continue
|
||||
}
|
||||
|
||||
if i == n-1 {
|
||||
buffer.WriteByte(input[i])
|
||||
i++
|
||||
continue
|
||||
}
|
||||
|
||||
next := input[i+1]
|
||||
replacement, ok := unbackslashReplacements[next]
|
||||
if ok {
|
||||
buffer.WriteString(replacement)
|
||||
i += 2
|
||||
} else if ok, code := isBackslashOctal(input[i:]); ok {
|
||||
buffer.WriteByte(byte(code))
|
||||
i += 4
|
||||
} else if ok, code := isBackslashHex(input[i:]); ok {
|
||||
buffer.WriteByte(byte(code))
|
||||
i += 4
|
||||
} else if ok, s := isUnicode4(input[i:]); ok {
|
||||
buffer.WriteString(s)
|
||||
i += 6
|
||||
} else if ok, s := isUnicode8(input[i:]); ok {
|
||||
buffer.WriteString(s)
|
||||
i += 10
|
||||
} else {
|
||||
buffer.WriteByte('\\')
|
||||
buffer.WriteByte(next)
|
||||
i += 2
|
||||
}
|
||||
}
|
||||
|
||||
return buffer.String()
|
||||
}
|
||||
|
||||
// UnhexStringLiteral is like UnbackslashStringLiteral but only unhexes things
|
||||
// like "\x1f". This is for IFS and IPS setup; see the cli package.
|
||||
func UnhexStringLiteral(input string) string {
|
||||
var buffer bytes.Buffer
|
||||
|
||||
n := len(input)
|
||||
|
||||
for i := 0; i < n; /* increment in loop */ {
|
||||
if input[i] != '\\' {
|
||||
buffer.WriteByte(input[i])
|
||||
i++
|
||||
continue
|
||||
}
|
||||
|
||||
if i == n-1 {
|
||||
buffer.WriteByte(input[i])
|
||||
i++
|
||||
continue
|
||||
}
|
||||
|
||||
next := input[i+1]
|
||||
if ok, code := isBackslashHex(input[i:]); ok {
|
||||
buffer.WriteByte(byte(code))
|
||||
i += 4
|
||||
} else {
|
||||
buffer.WriteByte('\\')
|
||||
buffer.WriteByte(next)
|
||||
i += 2
|
||||
}
|
||||
}
|
||||
|
||||
return buffer.String()
|
||||
}
|
||||
|
||||
// isBackslashOctal reports whether the input starts with a backslash followed
// by three octal digits and, if so, returns the value those digits encode.
// E.g. "\123" yields 83 (in decimal).
//
// Only values which fit in a single byte are accepted, i.e. "\000" through
// "\377". Callers convert the result with byte(code), so accepting "\400"
// through "\777" would silently truncate: "\400" would come out as a NUL
// byte. Such sequences are reported as non-matching instead, which leaves the
// caller to pass them through unmodified.
func isBackslashOctal(input string) (bool, int) {
	if len(input) < 4 || input[0] != '\\' {
		return false, 0
	}

	code := 0
	for i := 1; i <= 3; i++ {
		ok, digit := isOctalDigit(input[i])
		if !ok {
			return false, 0
		}
		code = 8*code + int(digit)
	}

	if code > 0xff {
		return false, 0
	}

	return true, code
}

// isOctalDigit reports whether b is one of the ASCII characters '0' through
// '7' and, if so, returns its numeric value.
func isOctalDigit(b byte) (bool, byte) {
	if '0' <= b && b <= '7' {
		return true, b - '0'
	}
	return false, 0
}
|
||||
|
||||
// isBackslashHex reports whether the input starts with a backslash, then "x"
// or "X", then two hex digits and, if so, returns the value of those two
// digits. E.g. "\xff" yields 255 (in decimal).
func isBackslashHex(input string) (bool, int) {
	if len(input) < 4 || input[0] != '\\' {
		return false, 0
	}
	if marker := input[1]; marker != 'x' && marker != 'X' {
		return false, 0
	}

	okHigh, high := isHexDigit(input[2])
	if !okHigh {
		return false, 0
	}
	okLow, low := isHexDigit(input[3])
	if !okLow {
		return false, 0
	}

	return true, int(high)<<4 | int(low)
}

// isHexDigit reports whether b is an ASCII hex digit ('0'-'9', 'a'-'f', or
// 'A'-'F') and, if so, returns its numeric value 0 through 15.
func isHexDigit(b byte) (bool, byte) {
	switch {
	case '0' <= b && b <= '9':
		return true, b - '0'
	case 'a' <= b && b <= 'f':
		return true, b - 'a' + 10
	case 'A' <= b && b <= 'F':
		return true, b - 'A' + 10
	default:
		return false, 0
	}
}
|
||||
|
||||
// isUnicode4 tries to parse a leading four-digit Unicode escape, e.g.
// "\u2766". On success it returns true along with the decoded text; anything
// after the first six bytes of the input is ignored.
func isUnicode4(input string) (bool, string) {
	const escapeLen = 6 // backslash, 'u', and four hex digits

	if len(input) < escapeLen || input[0] != '\\' || input[1] != 'u' {
		return false, ""
	}

	// Let strconv validate the digits and do the decoding.
	decoded, err := strconv.Unquote(`"` + input[:escapeLen] + `"`)
	if err != nil {
		return false, ""
	}
	return true, decoded
}
|
||||
|
||||
// isUnicode8 tries to parse a leading eight-digit Unicode escape, e.g.
// "\U00010877". On success it returns true along with the decoded text;
// anything after the first ten bytes of the input is ignored.
func isUnicode8(input string) (bool, string) {
	const escapeLen = 10 // backslash, 'U', and eight hex digits

	if len(input) < escapeLen || input[0] != '\\' || input[1] != 'U' {
		return false, ""
	}

	// Let strconv validate the digits and do the decoding.
	decoded, err := strconv.Unquote(`"` + input[:escapeLen] + `"`)
	if err != nil {
		return false, ""
	}
	return true, decoded
}
|
||||
45
pkg/lib/unbackslash_test.go
Normal file
45
pkg/lib/unbackslash_test.go
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
// ================================================================
|
||||
// Most Miller tests (thousands of them) are command-line-driven via
|
||||
// mlr regtest. Here are some cases needing special focus.
|
||||
// ================================================================
|
||||
|
||||
package lib
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
// tDataForUnbackslash is a single test case for UnbackslashStringLiteral: the
// string handed to the function, and the string it is expected to return.
type tDataForUnbackslash struct {
	input, expectedOutput string
}
|
||||
|
||||
// Note we are here dealing with Go's backslashing conventions.
|
||||
// At the Miller user-space level this is simply "\t" -> TAB, etc.
|
||||
var dataForUnbackslash = []tDataForUnbackslash{
|
||||
{"", ""},
|
||||
{"abcde", "abcde"},
|
||||
{`\1`, `\1`},
|
||||
{`a\tb\tc`, "a\tb\tc"},
|
||||
{`a\fb\rc`, "a\fb\rc"},
|
||||
{`a"b"c`, `a"b"c`},
|
||||
{`a\"b\"c`, `a"b"c`},
|
||||
{`a\102c`, `aBc`},
|
||||
{`a\x42c`, `aBc`},
|
||||
{`[\101\102\103]`, `[ABC]`},
|
||||
{`[\x44\x45\x46]`, `[DEF]`},
|
||||
{`\u2766`, `❦`},
|
||||
{`\U00010877`, `𐡷`},
|
||||
{`a\u0062c`, `abc`},
|
||||
}
|
||||
|
||||
func TestUnbackslash(t *testing.T) {
|
||||
for i, entry := range dataForUnbackslash {
|
||||
actualOutput := UnbackslashStringLiteral(entry.input)
|
||||
if actualOutput != entry.expectedOutput {
|
||||
t.Fatalf("case %d input \"%s\" expected \"%s\" got \"%s\"\n",
|
||||
i, entry.input, entry.expectedOutput, actualOutput,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
229
pkg/lib/util.go
Normal file
229
pkg/lib/util.go
Normal file
|
|
@ -0,0 +1,229 @@
|
|||
package lib
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// BooleanXOR returns true when exactly one of a and b is true.
func BooleanXOR(a, b bool) bool {
	if a {
		return !b
	}
	return b
}
|
||||
|
||||
// BoolToInt maps true to 1 and false to 0.
func BoolToInt(b bool) int64 {
	if b {
		return 1
	}
	return 0
}
|
||||
|
||||
// Plural returns the suffix to put after a noun which is being counted: the
// empty string when n is exactly 1, and "s" for every other value.
func Plural(n int) string {
	if n != 1 {
		return "s"
	}
	return ""
}
|
||||
|
||||
// SplitString is strings.Split except for its handling of empty input. In Go,
// as in all languages I'm aware of with a string-split, "a,b,c" splits on ","
// to ["a", "b", "c"] and "a" splits to ["a"], both of which are fine -- but ""
// splits to [""] when I wish it were []. This function returns an empty,
// non-nil slice for "".
func SplitString(input string, separator string) []string {
	if len(input) > 0 {
		return strings.Split(input, separator)
	}
	return []string{}
}
|
||||
|
||||
// StringListToSet returns a map having each element of stringList as a key,
// with value true. A nil list yields a nil map; an empty non-nil list yields
// an empty non-nil map.
func StringListToSet(stringList []string) map[string]bool {
	if stringList == nil {
		return nil
	}

	set := map[string]bool{}
	for i := range stringList {
		set[stringList[i]] = true
	}
	return set
}
|
||||
|
||||
// SortStrings sorts the given slice in place, in ascending byte-wise order.
func SortStrings(strings []string) {
	sort.Strings(strings)
}
|
||||
|
||||
// ReverseStringList reverses the order of the slice's elements in place.
func ReverseStringList(strings []string) {
	// Swap the outermost pair and walk inward until the indices meet.
	for lo, hi := 0, len(strings)-1; lo < hi; lo, hi = lo+1, hi-1 {
		strings[lo], strings[hi] = strings[hi], strings[lo]
	}
}
|
||||
|
||||
// SortedStrings returns a new slice holding the elements of the input in
// ascending byte-wise order. The input slice itself is left unmodified.
func SortedStrings(strings []string) []string {
	sorted := make([]string, len(strings))
	copy(sorted, strings)
	sort.Strings(sorted)
	return sorted
}
|
||||
|
||||
// IntMin2 returns the lesser of its two arguments.
func IntMin2(a, b int64) int64 {
	if b <= a {
		return b
	}
	return a
}
|
||||
|
||||
// TryIntFromString tries decimal, hex, octal, and binary, with the base
// inferred from the input's prefix ("0x", "0o", "0b", etc). It returns the
// parsed value and true on success, or 0 and false if the input is not an
// integer.
func TryIntFromString(input string) (int64, bool) {
	// Go's strconv parses "1_2" as 12; not OK for Miller syntax. (Also not
	// valid JSON.)
	for _, c := range []byte(input) {
		if c == '_' {
			return 0, false
		}
	}

	// Following twos-complement formatting familiar from all manner of
	// languages, including C which was Miller's original implementation
	// language, we want to allow 0x00....00 through 0x7f....ff as positive
	// 64-bit integers and 0x80....00 through 0xff....ff as negative ones. Go's
	// signed-int parsing explicitly doesn't allow that, but we don't want Go
	// semantics to dictate Miller semantics. So the signed parse handles
	// 0x00....00 through 0x7f....ff, as well as positive or negative decimal.
	if signed, err := strconv.ParseInt(input, 0 /* infer base */, 64); err == nil {
		return signed, true
	}

	// Failing that, the unsigned parse handles 0x80....00 through 0xff....ff,
	// reinterpreting the bits as a signed value.
	//
	// NOTE(review): this fallback is not restricted to hex input, so e.g.
	// decimal "18446744073709551615" also comes out as -1 -- confirm that is
	// intended.
	if unsigned, err := strconv.ParseUint(input, 0 /* infer base */, 64); err == nil {
		return int64(unsigned), true
	}

	return 0, false
}
|
||||
|
||||
// TryIntFromStringWithBase allows the user to choose the base that's used,
// rather than inferring from 0x prefix, etc as TryIntFromString does. It
// returns the parsed value and true on success, or 0 and false if the input
// is not an integer in that base.
func TryIntFromStringWithBase(input string, base int64) (int64, bool) {
	// Go's strconv parses "1_2" as 12; not OK for Miller syntax. (Also not
	// valid JSON.)
	for _, c := range []byte(input) {
		if c == '_' {
			return 0, false
		}
	}

	radix := int(base)

	if signed, err := strconv.ParseInt(input, radix, 64); err == nil {
		return signed, true
	}

	// Values which need all 64 bits are parsed as unsigned and then
	// reinterpreted as signed.
	if unsigned, err := strconv.ParseUint(input, radix, 64); err == nil {
		return int64(unsigned), true
	}

	return 0, false
}
|
||||
|
||||
// TryFloatFromString parses the input as a 64-bit float. It returns the
// parsed value and true on success, or 0 and false otherwise.
func TryFloatFromString(input string) (float64, bool) {
	// Go's strconv parses "1_2.3_4" as 12.34; not OK for Miller syntax. (Also
	// not valid JSON.)
	for _, c := range []byte(input) {
		if c == '_' {
			return 0, false
		}
	}

	parsed, err := strconv.ParseFloat(input, 64)
	if err != nil {
		return 0, false
	}
	return parsed, true
}
|
||||
|
||||
// TryBoolFromBoolString recognizes exactly the strings "true" and "false".
// The first return value is the boolean which was parsed; the second is
// whether the input was one of those two strings.
func TryBoolFromBoolString(input string) (bool, bool) {
	switch input {
	case "true":
		return true, true
	case "false":
		return false, true
	default:
		return false, false
	}
}
|
||||
|
||||
// GetArrayKeysSorted returns the keys of the input map in ascending order.
// Go doesn't preserve insertion order in its maps, and iteration order over
// them isn't stable, so this accessor gives map-printers a deterministic key
// order to work with.
func GetArrayKeysSorted(input map[string]string) []string {
	keys := make([]string, 0, len(input))
	for key := range input {
		keys = append(keys, key)
	}
	sort.Strings(keys)
	return keys
}
|
||||
|
||||
// WriteTempFileOrDie places the contents string into a newly created temp
// file and returns that file's name. The caller must remove the file.
//
// If the file cannot be created, populated, or closed, a message including
// the underlying error is written to standard error -- not standard output,
// which is where Miller's record output goes -- and the process exits with
// status 1. Any partially written temp file is removed first.
func WriteTempFileOrDie(contents string) string {
	// Use "" as first argument to ioutil.TempFile to use default directory.
	// Nominally "/tmp" or somesuch on all unix-like systems, but not for Windows.
	handle, err := ioutil.TempFile("", "mlr-temp")
	if err != nil {
		fmt.Fprintf(os.Stderr, "mlr: could not create temp file: %v\n", err)
		os.Exit(1)
	}
	name := handle.Name()

	if _, err = handle.WriteString(contents); err != nil {
		// Best-effort cleanup: we are about to exit on the write error, so
		// errors from closing and removing are deliberately not checked.
		handle.Close()
		os.Remove(name)
		fmt.Fprintf(os.Stderr, "mlr: could not populate temp file: %v\n", err)
		os.Exit(1)
	}

	if err = handle.Close(); err != nil {
		// Best-effort cleanup, as above.
		os.Remove(name)
		fmt.Fprintf(os.Stderr, "mlr: could not finish write of temp file: %v\n", err)
		os.Exit(1)
	}

	return name
}
|
||||
|
||||
// CopyStringArray returns a new slice with the same elements as the input.
// Later modifications to either slice's elements do not affect the other.
func CopyStringArray(input []string) []string {
	return append(make([]string, 0, len(input)), input...)
}
|
||||
|
||||
// StripEmpties returns a new slice holding the input's non-empty strings, in
// their original order. The input slice is left unmodified.
func StripEmpties(input []string) []string {
	kept := make([]string, 0, len(input))
	for i := range input {
		if input[i] == "" {
			continue
		}
		kept = append(kept, input[i])
	}
	return kept
}
|
||||
|
||||
// UTF8Strlen returns the length of the string in UTF-8 characters (runes),
// as opposed to len(s) which is its length in bytes.
func UTF8Strlen(s string) int64 {
	runeCount := utf8.RuneCountInString(s)
	return int64(runeCount)
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue