Support PPRINT barred input (#1472)

* Support PPRINT barred input

* regression-test files

* output from `make dev`

* doc updates
John Kerl 2024-01-20 12:59:12 -05:00 committed by GitHub
parent 76408f3358
commit 794a754c36
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
21 changed files with 565 additions and 57 deletions

View file

@ -366,7 +366,7 @@ Note that while Miller is a line-at-a-time processor and retains input lines in
See [Record Heterogeneity](record-heterogeneity.md) for how Miller handles changes of field names within a single data stream.
For output only (this isn't supported in the input-scanner as of 5.0.0) you can use `--barred` with pprint output format:
Since Miller 5.0.0, you can use `--barred` or `--barred-output` with pprint output format:
<pre class="pre-highlight-in-pair">
<b>mlr --opprint --barred cat data/small</b>
@ -383,6 +383,37 @@ For output only (this isn't supported in the input-scanner as of 5.0.0) you can
+-----+-----+---+----------+----------+
</pre>
Since Miller 6.11.0, you can use `--barred-input` with pprint output format:
<pre class="pre-highlight-in-pair">
<b>mlr -o pprint --barred cat data/small | mlr -i pprint --barred-input -o json filter '$b == "pan"'</b>
</pre>
<pre class="pre-non-highlight-in-pair">
[
{
"a": "pan",
"b": "pan",
"i": 1,
"x": 0.346791,
"y": 0.726802
},
{
"a": "eks",
"b": "pan",
"i": 2,
"x": 0.758679,
"y": 0.522151
},
{
"a": "wye",
"b": "pan",
"i": 5,
"x": 0.573288,
"y": 0.863624
}
]
</pre>
## Markdown tabular
Markdown format looks like this:

View file

@ -153,12 +153,18 @@ Note that while Miller is a line-at-a-time processor and retains input lines in
See [Record Heterogeneity](record-heterogeneity.md) for how Miller handles changes of field names within a single data stream.
For output only (this isn't supported in the input-scanner as of 5.0.0) you can use `--barred` with pprint output format:
Since Miller 5.0.0, you can use `--barred` or `--barred-output` with pprint output format:
GENMD-RUN-COMMAND
mlr --opprint --barred cat data/small
GENMD-EOF
Since Miller 6.11.0, you can use `--barred-input` with pprint output format:
GENMD-RUN-COMMAND
mlr -o pprint --barred cat data/small | mlr -i pprint --barred-input -o json filter '$b == "pan"'
GENMD-EOF
## Markdown tabular
Markdown format looks like this:

View file

@ -19,9 +19,7 @@ Quick links:
This is simply a copy of what you should see on running `man mlr` at a command prompt, once Miller is installed on your system.
<pre class="pre-non-highlight-non-pair">
MILLER(1) MILLER(1)
MILLER(1) MILLER(1)
NAME
Miller -- like awk, sed, cut, join, and sort for name-indexed data such
@ -697,8 +695,10 @@ MILLER(1) MILLER(1)
PPRINT-ONLY FLAGS
These are flags which are applicable to PPRINT format.
--barred Prints a border around PPRINT output (not available
for input).
--barred or --barred-output
Prints a border around PPRINT output.
--barred-input When used in conjunction with --pprint, accepts
barred input.
--right Right-justifies all fields for PPRINT output.
PROFILING FLAGS
@ -807,7 +807,7 @@ MILLER(1) MILLER(1)
markdown " " N/A "\n"
nidx " " N/A "\n"
pprint " " N/A "\n"
tsv " " N/A "\n"
tsv " " N/A "\n"
xtab "\n" " " "\n\n"
--fs {string} Specify FS for input and output.
@ -3687,7 +3687,5 @@ MILLER(1) MILLER(1)
MIME Type for Comma-Separated Values (CSV) Files, the Miller docsite
https://miller.readthedocs.io
2024-01-01 MILLER(1)
2024-01-20 MILLER(1)
</pre>

View file

@ -1,6 +1,4 @@
MILLER(1) MILLER(1)
MILLER(1) MILLER(1)
NAME
Miller -- like awk, sed, cut, join, and sort for name-indexed data such
@ -676,8 +674,10 @@ MILLER(1) MILLER(1)
PPRINT-ONLY FLAGS
These are flags which are applicable to PPRINT format.
--barred Prints a border around PPRINT output (not available
for input).
--barred or --barred-output
Prints a border around PPRINT output.
--barred-input When used in conjunction with --pprint, accepts
barred input.
--right Right-justifies all fields for PPRINT output.
PROFILING FLAGS
@ -786,7 +786,7 @@ MILLER(1) MILLER(1)
markdown " " N/A "\n"
nidx " " N/A "\n"
pprint " " N/A "\n"
tsv " " N/A "\n"
tsv " " N/A "\n"
xtab "\n" " " "\n\n"
--fs {string} Specify FS for input and output.
@ -3666,6 +3666,4 @@ MILLER(1) MILLER(1)
MIME Type for Comma-Separated Values (CSV) Files, the Miller docsite
https://miller.readthedocs.io
2024-01-01 MILLER(1)
2024-01-20 MILLER(1)

View file

@ -373,7 +373,8 @@ These are flags which are applicable to PPRINT format.
**Flags:**
* `--barred`: Prints a border around PPRINT output (not available for input).
* `--barred or --barred-output`: Prints a border around PPRINT output.
* `--barred-input`: When used in conjunction with --pprint, accepts barred input.
* `--right`: Right-justifies all fields for PPRINT output.
## Profiling flags

View file

@ -1,6 +1,4 @@
MILLER(1) MILLER(1)
MILLER(1) MILLER(1)
NAME
Miller -- like awk, sed, cut, join, and sort for name-indexed data such
@ -676,8 +674,10 @@ MILLER(1) MILLER(1)
PPRINT-ONLY FLAGS
These are flags which are applicable to PPRINT format.
--barred Prints a border around PPRINT output (not available
for input).
--barred or --barred-output
Prints a border around PPRINT output.
--barred-input When used in conjunction with --pprint, accepts
barred input.
--right Right-justifies all fields for PPRINT output.
PROFILING FLAGS
@ -786,7 +786,7 @@ MILLER(1) MILLER(1)
markdown " " N/A "\n"
nidx " " N/A "\n"
pprint " " N/A "\n"
tsv " " N/A "\n"
tsv " " N/A "\n"
xtab "\n" " " "\n\n"
--fs {string} Specify FS for input and output.
@ -3666,6 +3666,4 @@ MILLER(1) MILLER(1)
MIME Type for Comma-Separated Values (CSV) Files, the Miller docsite
https://miller.readthedocs.io
2024-01-01 MILLER(1)
2024-01-20 MILLER(1)

View file

@ -19,7 +19,7 @@ def main
# Live code-generation needs to be using mlr from *this* tree, not from
# somewhere else in the PATH.
unless File.executable?('../mlr')
$stderr.puts "#{$0}: Need ../../mlr to exist: please check 'make build' in ../.."
$stderr.puts "#{$0}: Need ../mlr to exist: please check 'make build' in ../.."
exit 1
end
`../mlr --version`

View file

@ -2,12 +2,12 @@
.\" Title: mlr
.\" Author: [see the "AUTHOR" section]
.\" Generator: ./mkman.rb
.\" Date: 2024-01-01
.\" Date: 2024-01-20
.\" Manual: \ \&
.\" Source: \ \&
.\" Language: English
.\"
.TH "MILLER" "1" "2024-01-01" "\ \&" "\ \&"
.TH "MILLER" "1" "2024-01-20" "\ \&" "\ \&"
.\" -----------------------------------------------------------------
.\" * Portability definitions
.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -811,8 +811,10 @@ those can be joined with a "-", like "red-bold", "bold-170", "bold-underline", e
.nf
These are flags which are applicable to PPRINT format.
--barred Prints a border around PPRINT output (not available
for input).
--barred or --barred-output
Prints a border around PPRINT output.
--barred-input When used in conjunction with --pprint, accepts
barred input.
--right Right-justifies all fields for PPRINT output.
.fi
.if n \{\

View file

@ -494,13 +494,24 @@ var PPRINTOnlyFlagSection = FlagSection{
},
{
name: "--barred",
help: "Prints a border around PPRINT output (not available for input).",
name: "--barred",
altNames: []string{"--barred-output"},
help: "Prints a border around PPRINT output.",
parser: func(args []string, argc int, pargi *int, options *TOptions) {
options.WriterOptions.BarredPprintOutput = true
*pargi += 1
},
},
{
name: "--barred-input",
help: "When used in conjunction with --pprint, accepts barred input.",
parser: func(args []string, argc int, pargi *int, options *TOptions) {
options.ReaderOptions.BarredPprintInput = true
options.ReaderOptions.IFS = "|"
*pargi += 1
},
},
},
}
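As a side note, each entry in this table-driven flag section pairs a flag name (plus any altNames) with a parser that sets the relevant option fields and advances the argument index. Below is a minimal, self-contained sketch of that pattern; it is illustrative only, with TOptions flattened for brevity, whereas the real entries above set fields under options.ReaderOptions and options.WriterOptions.

package main

import "fmt"

// Illustrative sketch only, not Miller's actual CLI front end.
type TOptions struct {
	BarredPprintInput bool
	IFS               string
}

type flagEntry struct {
	name   string
	parser func(args []string, argc int, pargi *int, options *TOptions)
}

var barredInputFlag = flagEntry{
	name: "--barred-input",
	parser: func(args []string, argc int, pargi *int, options *TOptions) {
		options.BarredPprintInput = true
		options.IFS = "|"
		*pargi += 1
	},
}

func main() {
	args := []string{"mlr", "--barred-input", "cat", "input.tbl"}
	options := &TOptions{}
	argi := 1 // args[0] is the program name
	if args[argi] == barredInputFlag.name {
		barredInputFlag.parser(args, len(args), &argi, options)
	}
	fmt.Printf("argi=%d options=%+v\n", argi, options)
	// Prints: argi=2 options=&{BarredPprintInput:true IFS:|}
}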

View file

@ -57,6 +57,7 @@ type TReaderOptions struct {
AllowRaggedCSVInput bool
CSVLazyQuotes bool
CSVTrimLeadingSpace bool
BarredPprintInput bool
CommentHandling TCommentHandling
CommentString string

View file

@ -78,26 +78,6 @@ func NewRecordReaderCSVLite(
return reader, nil
}
func NewRecordReaderPPRINT(
readerOptions *cli.TReaderOptions,
recordsPerBatch int64,
) (*RecordReaderCSVLite, error) {
reader := &RecordReaderCSVLite{
readerOptions: readerOptions,
recordsPerBatch: recordsPerBatch,
fieldSplitter: newFieldSplitter(readerOptions),
useVoidRep: true,
voidRep: "-",
}
if reader.readerOptions.UseImplicitCSVHeader {
reader.recordBatchGetter = getRecordBatchImplicitCSVHeader
} else {
reader.recordBatchGetter = getRecordBatchExplicitCSVHeader
}
return reader, nil
}
func (reader *RecordReaderCSVLite) Read(
filenames []string,
context types.Context,

View file

@ -0,0 +1,462 @@
package input
// Multi-file cases:
//
// a,a a,b c d
// -- FILE1: -- FILE1: -- FILE1: -- FILE1:
// a,b,c a,b,c a,b,c a,b,c
// 1,2,3 1,2,3 1,2,3 1,2,3
// 4,5,6 4,5,6 4,5,6 4,5,6
// -- FILE2: -- FILE2:
// a,b,c d,e,f,g a,b,c d,e,f
// 7,8,9 3,4,5,6 7,8,9 3,4,5
// --OUTPUT: --OUTPUT: --OUTPUT: --OUTPUT:
// a,b,c a,b,c a,b,c a,b,c
// 1,2,3 1,2,3 1,2,3 1,2,3
// 4,5,6 4,5,6 4,5,6 4,5,6
// 7,8,9 7,8,9
// d,e,f,g d,e,f
// 3,4,5,6 3,4,5
import (
"container/list"
"fmt"
"io"
"regexp"
"strconv"
"strings"
"github.com/johnkerl/miller/pkg/cli"
"github.com/johnkerl/miller/pkg/lib"
"github.com/johnkerl/miller/pkg/mlrval"
"github.com/johnkerl/miller/pkg/types"
)
func NewRecordReaderPPRINT(
readerOptions *cli.TReaderOptions,
recordsPerBatch int64,
) (IRecordReader, error) {
if readerOptions.BarredPprintInput {
// Implemented in this file
// XXX TEMP
readerOptions.IFS = "|"
readerOptions.AllowRepeatIFS = false
reader := &RecordReaderPprintBarred{
readerOptions: readerOptions,
recordsPerBatch: recordsPerBatch,
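// separatorMatcher recognizes border lines such as "+-----+-----+----+", which carry no data.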
separatorMatcher: regexp.MustCompile(`^\+[-+]*\+`),
fieldSplitter: newFieldSplitter(readerOptions),
}
if reader.readerOptions.UseImplicitCSVHeader {
reader.recordBatchGetter = getRecordBatchImplicitPprintHeader
} else {
reader.recordBatchGetter = getRecordBatchExplicitPprintHeader
}
return reader, nil
} else {
// Use the CSVLite record-reader, which is implemented in another file,
// with multiple spaces instead of commas
reader := &RecordReaderCSVLite{
readerOptions: readerOptions,
recordsPerBatch: recordsPerBatch,
fieldSplitter: newFieldSplitter(readerOptions),
useVoidRep: true,
voidRep: "-",
}
// XXX RENAME THERE
if reader.readerOptions.UseImplicitCSVHeader {
reader.recordBatchGetter = getRecordBatchImplicitCSVHeader
} else {
reader.recordBatchGetter = getRecordBatchExplicitCSVHeader
}
return reader, nil
}
}
type RecordReaderPprintBarred struct {
readerOptions *cli.TReaderOptions
recordsPerBatch int64 // distinct from readerOptions.RecordsPerBatch for join/repl
separatorMatcher *regexp.Regexp
fieldSplitter iFieldSplitter
recordBatchGetter recordBatchGetterPprint
inputLineNumber int64
headerStrings []string
}
// recordBatchGetterPprint points to either an explicit-PPRINT-header or
// implicit-PPRINT-header record-batch getter.
type recordBatchGetterPprint func(
reader *RecordReaderPprintBarred,
linesChannel <-chan *list.List,
filename string,
context *types.Context,
errorChannel chan error,
) (
recordsAndContexts *list.List,
eof bool,
)
func (reader *RecordReaderPprintBarred) Read(
filenames []string,
context types.Context,
readerChannel chan<- *list.List, // list of *types.RecordAndContext
errorChannel chan error,
downstreamDoneChannel <-chan bool, // for mlr head
) {
if filenames != nil { // nil for mlr -n
if len(filenames) == 0 { // read from stdin
handle, err := lib.OpenStdin(
reader.readerOptions.Prepipe,
reader.readerOptions.PrepipeIsRaw,
reader.readerOptions.FileInputEncoding,
)
if err != nil {
errorChannel <- err
return
}
reader.processHandle(
handle,
"(stdin)",
&context,
readerChannel,
errorChannel,
downstreamDoneChannel,
)
} else {
for _, filename := range filenames {
handle, err := lib.OpenFileForRead(
filename,
reader.readerOptions.Prepipe,
reader.readerOptions.PrepipeIsRaw,
reader.readerOptions.FileInputEncoding,
)
if err != nil {
errorChannel <- err
return
}
reader.processHandle(
handle,
filename,
&context,
readerChannel,
errorChannel,
downstreamDoneChannel,
)
handle.Close()
}
}
}
readerChannel <- types.NewEndOfStreamMarkerList(&context)
}
func (reader *RecordReaderPprintBarred) processHandle(
handle io.Reader,
filename string,
context *types.Context,
readerChannel chan<- *list.List, // list of *types.RecordAndContext
errorChannel chan error,
downstreamDoneChannel <-chan bool, // for mlr head
) {
context.UpdateForStartOfFile(filename)
reader.inputLineNumber = 0
reader.headerStrings = nil
recordsPerBatch := reader.recordsPerBatch
lineScanner := NewLineScanner(handle, reader.readerOptions.IRS)
linesChannel := make(chan *list.List, recordsPerBatch)
go channelizedLineScanner(lineScanner, linesChannel, downstreamDoneChannel, recordsPerBatch)
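// Lines arrive in batches of up to recordsPerBatch on linesChannel; each batch is converted to records by the recordBatchGetter below.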
for {
recordsAndContexts, eof := reader.recordBatchGetter(reader, linesChannel, filename, context, errorChannel)
if recordsAndContexts.Len() > 0 {
readerChannel <- recordsAndContexts
}
if eof {
break
}
}
}
func getRecordBatchExplicitPprintHeader(
reader *RecordReaderPprintBarred,
linesChannel <-chan *list.List,
filename string,
context *types.Context,
errorChannel chan error,
) (
recordsAndContexts *list.List,
eof bool,
) {
recordsAndContexts = list.New()
dedupeFieldNames := reader.readerOptions.DedupeFieldNames
lines, more := <-linesChannel
if !more {
return recordsAndContexts, true
}
for e := lines.Front(); e != nil; e = e.Next() {
line := e.Value.(string)
reader.inputLineNumber++
// Check for comments-in-data feature
// TODO: function-pointer this away
if reader.readerOptions.CommentHandling != cli.CommentsAreData {
if strings.HasPrefix(line, reader.readerOptions.CommentString) {
if reader.readerOptions.CommentHandling == cli.PassComments {
recordsAndContexts.PushBack(types.NewOutputString(line+"\n", context))
continue
} else if reader.readerOptions.CommentHandling == cli.SkipComments {
continue
}
// else comments are data
}
}
if line == "" {
// Reset to new schema
reader.headerStrings = nil
continue
}
// Example input:
// +-----+-----+----+---------------------+---------------------+
// | a   | b   | i  | x                   | y                   |
// +-----+-----+----+---------------------+---------------------+
// | pan | pan | 1  | 0.3467901443380824  | 0.7268028627434533  |
// | eks | pan | 2  | 0.7586799647899636  | 0.5221511083334797  |
// +-----+-----+----+---------------------+---------------------+
// Skip lines like
// +-----+-----+----+---------------------+---------------------+
if reader.separatorMatcher.MatchString(line) {
continue
}
// Skip the leading and trailing pipes
paddedFields := reader.fieldSplitter.Split(line)
npad := len(paddedFields)
fields := make([]string, npad-2)
for i, _ := range paddedFields {
if i == 0 || i == npad-1 {
continue
}
fields[i-1] = strings.TrimSpace(paddedFields[i])
}
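// Worked example using the first data line from the comment above:
//   line:         "| pan | pan | 1  | 0.3467901443380824  | 0.7268028627434533  |"
//   paddedFields: ["", " pan ", " pan ", " 1  ", " 0.3467901443380824  ", " 0.7268028627434533  ", ""]
//   fields:       ["pan", "pan", "1", "0.3467901443380824", "0.7268028627434533"]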
if reader.headerStrings == nil {
reader.headerStrings = fields
// Get data lines on subsequent loop iterations
} else {
if !reader.readerOptions.AllowRaggedCSVInput && len(reader.headerStrings) != len(fields) {
err := fmt.Errorf(
"mlr: PPRINT-barred header/data length mismatch %d != %d "+
"at filename %s line %d.\n",
len(reader.headerStrings), len(fields), filename, reader.inputLineNumber,
)
errorChannel <- err
return
}
record := mlrval.NewMlrmapAsRecord()
if !reader.readerOptions.AllowRaggedCSVInput {
for i, field := range fields {
value := mlrval.FromDeferredType(field)
_, err := record.PutReferenceMaybeDedupe(reader.headerStrings[i], value, dedupeFieldNames)
if err != nil {
errorChannel <- err
return
}
}
} else {
nh := int64(len(reader.headerStrings))
nd := int64(len(fields))
n := lib.IntMin2(nh, nd)
var i int64
for i = 0; i < n; i++ {
field := fields[i]
value := mlrval.FromDeferredType(field)
_, err := record.PutReferenceMaybeDedupe(reader.headerStrings[i], value, dedupeFieldNames)
if err != nil {
errorChannel <- err
return
}
}
if nh < nd {
// if header shorter than data: use 1-up itoa keys
for i = nh; i < nd; i++ {
key := strconv.FormatInt(i+1, 10)
value := mlrval.FromDeferredType(fields[i])
_, err := record.PutReferenceMaybeDedupe(key, value, dedupeFieldNames)
if err != nil {
errorChannel <- err
return
}
}
}
if nh > nd {
// if header longer than data: use "" values
for i = nd; i < nh; i++ {
record.PutCopy(reader.headerStrings[i], mlrval.VOID)
}
}
}
context.UpdateForInputRecord()
recordsAndContexts.PushBack(types.NewRecordAndContext(record, context))
}
}
return recordsAndContexts, false
}
func getRecordBatchImplicitPprintHeader(
reader *RecordReaderPprintBarred,
linesChannel <-chan *list.List,
filename string,
context *types.Context,
errorChannel chan error,
) (
recordsAndContexts *list.List,
eof bool,
) {
recordsAndContexts = list.New()
dedupeFieldNames := reader.readerOptions.DedupeFieldNames
lines, more := <-linesChannel
if !more {
return recordsAndContexts, true
}
for e := lines.Front(); e != nil; e = e.Next() {
line := e.Value.(string)
reader.inputLineNumber++
// Check for comments-in-data feature
// TODO: function-pointer this away
if reader.readerOptions.CommentHandling != cli.CommentsAreData {
if strings.HasPrefix(line, reader.readerOptions.CommentString) {
if reader.readerOptions.CommentHandling == cli.PassComments {
recordsAndContexts.PushBack(types.NewOutputString(line+"\n", context))
continue
} else if reader.readerOptions.CommentHandling == cli.SkipComments {
continue
}
// else comments are data
}
}
if line == "" {
// Reset to new schema
reader.headerStrings = nil
continue
}
// Example input:
// +-----+-----+----+---------------------+---------------------+
// | a   | b   | i  | x                   | y                   |
// +-----+-----+----+---------------------+---------------------+
// | pan | pan | 1  | 0.3467901443380824  | 0.7268028627434533  |
// | eks | pan | 2  | 0.7586799647899636  | 0.5221511083334797  |
// +-----+-----+----+---------------------+---------------------+
// Skip lines like
// +-----+-----+----+---------------------+---------------------+
if reader.separatorMatcher.MatchString(line) {
continue
}
// Skip the leading and trailing pipes
paddedFields := reader.fieldSplitter.Split(line)
npad := len(paddedFields)
fields := make([]string, npad-2)
for i, _ := range paddedFields {
if i == 0 || i == npad-1 {
continue
}
fields[i-1] = strings.TrimSpace(paddedFields[i])
}
if reader.headerStrings == nil {
n := len(fields)
reader.headerStrings = make([]string, n)
for i := 0; i < n; i++ {
reader.headerStrings[i] = strconv.Itoa(i + 1)
}
} else {
if !reader.readerOptions.AllowRaggedCSVInput && len(reader.headerStrings) != len(fields) {
err := fmt.Errorf(
"mlr: CSV header/data length mismatch %d != %d "+
"at filename %s line %d.\n",
len(reader.headerStrings), len(fields), filename, reader.inputLineNumber,
)
errorChannel <- err
return
}
}
record := mlrval.NewMlrmapAsRecord()
if !reader.readerOptions.AllowRaggedCSVInput {
for i, field := range fields {
value := mlrval.FromDeferredType(field)
_, err := record.PutReferenceMaybeDedupe(reader.headerStrings[i], value, dedupeFieldNames)
if err != nil {
errorChannel <- err
return
}
}
} else {
nh := int64(len(reader.headerStrings))
nd := int64(len(fields))
n := lib.IntMin2(nh, nd)
var i int64
for i = 0; i < n; i++ {
field := fields[i]
value := mlrval.FromDeferredType(field)
_, err := record.PutReferenceMaybeDedupe(reader.headerStrings[i], value, dedupeFieldNames)
if err != nil {
errorChannel <- err
return
}
}
if nh < nd {
// if header shorter than data: use 1-up itoa keys
for i = nh; i < nd; i++ {
key := strconv.FormatInt(i+1, 10)
value := mlrval.FromDeferredType(fields[i])
_, err := record.PutReferenceMaybeDedupe(key, value, dedupeFieldNames)
if err != nil {
errorChannel <- err
return
}
}
}
if nh > nd {
// if header longer than data: use "" values
for i = nd; i < nh; i++ {
_, err := record.PutReferenceMaybeDedupe(
reader.headerStrings[i],
mlrval.VOID.Copy(),
dedupeFieldNames,
)
if err != nil {
errorChannel <- err
return
}
}
}
}
context.UpdateForInputRecord()
recordsAndContexts.PushBack(types.NewRecordAndContext(record, context))
}
return recordsAndContexts, false
}

View file

@ -0,0 +1 @@
mlr --hi --i pprint --barred-input -o json test/input/abixy.tbl

View file

@ -0,0 +1,2 @@
mlr: option "--i" not recognized.
Please run "mlr --help" for usage information.

View file

@ -0,0 +1 @@
mlr --i pprint --barred-input -o json test/input/abixy.tbl

View file

@ -0,0 +1,2 @@
mlr: option "--i" not recognized.
Please run "mlr --help" for usage information.

test/input/abixy.tbl (new file, 14 lines)
View file

@ -0,0 +1,14 @@
+-----+-----+----+---------------------+---------------------+
| a   | b   | i  | x                   | y                   |
+-----+-----+----+---------------------+---------------------+
| pan | pan | 1  | 0.3467901443380824  | 0.7268028627434533  |
| eks | pan | 2  | 0.7586799647899636  | 0.5221511083334797  |
| wye | wye | 3  | 0.20460330576630303 | 0.33831852551664776 |
| eks | wye | 4  | 0.38139939387114097 | 0.13418874328430463 |
| wye | pan | 5  | 0.5732889198020006  | 0.8636244699032729  |
| zee | pan | 6  | 0.5271261600918548  | 0.49322128674835697 |
| eks | zee | 7  | 0.6117840605678454  | 0.1878849191181694  |
| zee | wye | 8  | 0.5985540091064224  | 0.976181385699006   |
| hat | wye | 9  | 0.03144187646093577 | 0.7495507603507059  |
| pan | wye | 10 | 0.5026260055412137  | 0.9526183602969864  |
+-----+-----+----+---------------------+---------------------+