mirror of
https://github.com/johnkerl/miller.git
synced 2026-01-23 10:15:36 +00:00
Fix in-place mode and recompression
This commit is contained in:
parent
d39796bbc5
commit
7a54bed754
17 changed files with 189 additions and 46 deletions
|
|
@ -88,5 +88,7 @@ A third way is to abort the process on first instance of bad data:
|
|||
<b>mlr --csv put '$reachable = asserting_string($reachable)' data/het-bool.csv</b>
|
||||
</pre>
|
||||
<pre class="pre-non-highlight-in-pair">
|
||||
name,reachable
|
||||
barney,false
|
||||
mlr: is_string type-assertion failed at NR=4 FNR=4 FILENAME=data/het-bool.csv
|
||||
</pre>
|
||||
|
|
|
|||
|
|
@ -676,7 +676,7 @@ AUXILIARY COMMANDS
|
|||
help
|
||||
regtest
|
||||
repl
|
||||
For more information, please invoke mlr {subcommand} --help.
|
||||
For more information, please invoke mlr {subcommand} --help.
|
||||
|
||||
MLRRC
|
||||
You can set up personal defaults via a $HOME/.mlrrc and/or ./.mlrrc.
|
||||
|
|
@ -2741,5 +2741,5 @@ SEE ALSO
|
|||
|
||||
|
||||
|
||||
2021-09-19 MILLER(1)
|
||||
2021-09-20 MILLER(1)
|
||||
</pre>
|
||||
|
|
|
|||
|
|
@ -655,7 +655,7 @@ AUXILIARY COMMANDS
|
|||
help
|
||||
regtest
|
||||
repl
|
||||
For more information, please invoke mlr {subcommand} --help.
|
||||
For more information, please invoke mlr {subcommand} --help.
|
||||
|
||||
MLRRC
|
||||
You can set up personal defaults via a $HOME/.mlrrc and/or ./.mlrrc.
|
||||
|
|
@ -2720,4 +2720,4 @@ SEE ALSO
|
|||
|
||||
|
||||
|
||||
2021-09-19 MILLER(1)
|
||||
2021-09-20 MILLER(1)
|
||||
|
|
|
|||
|
|
@ -61,6 +61,28 @@ exceptions. See [Miller on Windows](miller-on-windows.md) for more information.
|
|||
|
||||
Binaries are reliably available using GitHub Actions: see also [Installation](installation.md).
|
||||
|
||||
## Support for reading web URLs
|
||||
|
||||
You can read input with prefixes `https://`, `http://`, and `file://`:
|
||||
|
||||
<pre class="pre-highlight-in-pair">
|
||||
<b>mlr --csv sort -f shape \</b>
|
||||
<b> https://raw.githubusercontent.com/johnkerl/miller/main/docs6/src/gz-example.csv.gz</b>
|
||||
</pre>
|
||||
<pre class="pre-non-highlight-in-pair">
|
||||
color,shape,flag,k,index,quantity,rate
|
||||
red,circle,true,3,16,13.8103,2.9010
|
||||
yellow,circle,true,8,73,63.9785,4.2370
|
||||
yellow,circle,true,9,87,63.5058,8.3350
|
||||
red,square,true,2,15,79.2778,0.0130
|
||||
red,square,false,4,48,77.5542,7.4670
|
||||
red,square,false,6,64,77.1991,9.5310
|
||||
purple,square,false,10,91,72.3735,8.2430
|
||||
yellow,triangle,true,1,11,43.6498,9.8870
|
||||
purple,triangle,false,5,51,81.2290,8.5910
|
||||
purple,triangle,false,7,65,80.1405,5.8240
|
||||
</pre>
|
||||
|
||||
## In-process support for compressed input
|
||||
|
||||
In addition to `--prepipe gunzip`, you can now use the `--gzin` flag. In fact, if your files end in `.gz` you don't even need to do that -- Miller will autodetect by file extension and automatically uncompress `mlr --csv cat foo.csv.gz`. Similarly for `.z` and `.bz2` files. Please see the page on [Compressed data](reference-main-compressed-data.md) for more information.
|
||||
|
|
|
|||
|
|
@ -45,6 +45,15 @@ exceptions. See [Miller on Windows](miller-on-windows.md) for more information.
|
|||
|
||||
Binaries are reliably available using GitHub Actions: see also [Installation](installation.md).
|
||||
|
||||
## Support for reading web URLs
|
||||
|
||||
You can read input with prefixes `https://`, `http://`, and `file://`:
|
||||
|
||||
GENMD_RUN_COMMAND
|
||||
mlr --csv sort -f shape \
|
||||
https://raw.githubusercontent.com/johnkerl/miller/main/docs6/src/gz-example.csv.gz
|
||||
GENMD_EOF
|
||||
|
||||
## In-process support for compressed input
|
||||
|
||||
In addition to `--prepipe gunzip`, you can now use the `--gzin` flag. In fact, if your files end in `.gz` you don't even need to do that -- Miller will autodetect by file extension and automatically uncompress `mlr --csv cat foo.csv.gz`. Similarly for `.z` and `.bz2` files. Please see the page on [Compressed data](reference-main-compressed-data.md) for more information.
|
||||
|
|
|
|||
|
|
@ -8,6 +8,8 @@
|
|||
mv localhost:6060 miller6
|
||||
file:///Users/kerl/tmp/bar/miller6/pkg
|
||||
|
||||
! http reader -- new-in-miller-6
|
||||
|
||||
? twi-dm re all-contribs: all-contributors.org
|
||||
* nikos materials -> fold in
|
||||
|
||||
|
|
@ -15,14 +17,9 @@ c array+string slice UTs
|
|||
? nidx no output-coloring?
|
||||
|
||||
C! repifs !! https://pkg.go.dev/regexp#Regexp.Split 2-for-1 -- get regexp as well ?
|
||||
C fix -I + auto-decompress
|
||||
o disallow -I with --prepipe, verbosely
|
||||
o output-writer decoration:
|
||||
src/cli/option_parse.go: name: "--gzin",
|
||||
src/cli/option_parse.go: options.ReaderOptions.FileInputEncoding = lib.FileInputEncodingGzip
|
||||
...
|
||||
|
||||
* r-strings branch!
|
||||
C stats1 --fr
|
||||
|
||||
----------------------------------------------------------------
|
||||
* deduping pass!
|
||||
|
|
|
|||
|
|
@ -127,8 +127,6 @@ If you `mlr csv cat` this, you'll get an error message:
|
|||
<b>mlr --csv cat data/het/ragged.csv</b>
|
||||
</pre>
|
||||
<pre class="pre-non-highlight-in-pair">
|
||||
a,b,c
|
||||
1,2,3
|
||||
mlr : mlr: CSV header/data length mismatch 3 != 2 at filename data/het/ragged.csv row 3.
|
||||
|
||||
</pre>
|
||||
|
|
|
|||
|
|
@ -151,7 +151,5 @@ yellow,circle,true,8,73,63.9785,4.2370
|
|||
yellow,circle,true,9,87,63.5058,8.3350
|
||||
</pre>
|
||||
|
||||
* Using the [in-place flag](reference-main-in-place-processing.md) `-I`,
|
||||
as of August 2021 the overwritten file will _not_ be compressed as it was when it was read:
|
||||
e.g. `mlr -I --csv cat gz-example.csv.gz` will write `gz-example.csv.gz` which contains
|
||||
a plain, uncompressed CSV contents. This is a bug and will be fixed.
|
||||
* Using the [in-place flag](reference-main-in-place-processing.md) `-I`, the overwritten file will
|
||||
be compressed when possible. See the [page on in-place mode](reference-main-in-place-processing.md) for details.
|
||||
|
|
|
|||
|
|
@ -103,7 +103,5 @@ yellow,circle,true,8,73,63.9785,4.2370
|
|||
yellow,circle,true,9,87,63.5058,8.3350
|
||||
GENMD_EOF
|
||||
|
||||
* Using the [in-place flag](reference-main-in-place-processing.md) `-I`,
|
||||
as of August 2021 the overwritten file will _not_ be compressed as it was when it was read:
|
||||
e.g. `mlr -I --csv cat gz-example.csv.gz` will write `gz-example.csv.gz` which contains
|
||||
a plain, uncompressed CSV contents. This is a bug and will be fixed.
|
||||
* Using the [in-place flag](reference-main-in-place-processing.md) `-I`, the overwritten file will
|
||||
be compressed when possible. See the [page on in-place mode](reference-main-in-place-processing.md) for details.
|
||||
|
|
|
|||
|
|
@ -23,6 +23,12 @@ By default, Miller output goes to the screen (or you can redirect a file using `
|
|||
Since this replaces your data with modified data, it's often a good idea to back up your original files somewhere
|
||||
first, to protect against keystroking errors.
|
||||
|
||||
TODO: fix the combination of `-I` and compressed input.
|
||||
Situations in which the input can't be updated in place:
|
||||
|
||||
* If the input file is a URL of the form `http://...`, `https://...`, or `file://...`.
|
||||
* If a [`--prepipe` or `--prepipex` flag](reference-main-compressed-data.md#external-decompressors-on-input) is being used.
|
||||
* If [in-place compression](reference-main-compressed-data.md) is being used and the format is BZIP2. For technical reasons, this can't be recompressed in place. (GZIP and ZLIB, however, are recompressible in place.)
|
||||
|
||||
Additional note: `gzip` supports various compression levels, from 1 to 9. If you do `mlr -I ... yourfile.gz` then Miller will produce compressed output using GZIP, but it makes no attempt to determine or mimic the original compression level of the input.
|
||||
|
||||
Please see [Choices for printing to files](10min.md#choices-for-printing-to-files) for examples.
|
||||
|
|
|
|||
|
|
@ -7,6 +7,12 @@ By default, Miller output goes to the screen (or you can redirect a file using `
|
|||
Since this replaces your data with modified data, it's often a good idea to back up your original files somewhere
|
||||
first, to protect against keystroking errors.
|
||||
|
||||
TODO: fix the combination of `-I` and compressed input.
|
||||
Situations in which the input can't be updated in place:
|
||||
|
||||
* If the input file is a URL of the form `http://...`, `https://...`, or `file://...`.
|
||||
* If a [`--prepipe` or `--prepipex` flag](reference-main-compressed-data.md#external-decompressors-on-input) is being used.
|
||||
* If [in-place compression](reference-main-compressed-data.md) is being used and the format is BZIP2. For technical reasons, this can't be recompressed in place. (GZIP and ZLIB, however, are recompressible in place.)
|
||||
|
||||
Additional note: `gzip` supports various compression levels, from 1 to 9. If you do `mlr -I ... yourfile.gz` then Miller will produce compressed output using GZIP, but it makes no attempt to determine or mimic the original compression level of the input.
|
||||
|
||||
Please see [Choices for printing to files](10min.md#choices-for-printing-to-files) for examples.
|
||||
|
|
|
|||
|
|
@ -80,5 +80,5 @@ func ShowAuxEntries(o *os.File) {
|
|||
fmt.Fprintf(o, " %s\n", entry.name)
|
||||
}
|
||||
|
||||
fmt.Fprintf(o, "For more information, please invoke mlrt {subcommand} --help.\n")
|
||||
fmt.Fprintf(o, "For more information, please invoke mlr {subcommand} --help.\n")
|
||||
}
|
||||
|
|
|
|||
|
|
@ -11,12 +11,11 @@ import (
|
|||
"os"
|
||||
"path"
|
||||
|
||||
"mlr/src/platform"
|
||||
|
||||
"mlr/src/auxents"
|
||||
"mlr/src/cli"
|
||||
"mlr/src/climain"
|
||||
"mlr/src/lib"
|
||||
"mlr/src/platform"
|
||||
"mlr/src/stream"
|
||||
"mlr/src/transformers"
|
||||
)
|
||||
|
|
@ -110,38 +109,76 @@ func processInPlace(
|
|||
// reader, mappers, and writer individually for each file name. This
|
||||
// way CSV headers appear in each file, head -n 10 puts 10 rows for
|
||||
// each output file, and so on.
|
||||
|
||||
containingDirectory := path.Dir(fileName)
|
||||
// Names like ./mlr-in-place-2148227797 and ./mlr-in-place-1792078347,
|
||||
// as revealed by printing handle.Name().
|
||||
handle, err := ioutil.TempFile(containingDirectory, "mlr-in-place-")
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "%s: %v\n", "mlr", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
tempFileName := handle.Name()
|
||||
|
||||
options, recordTransformers, err := climain.ParseCommandLine(os.Args)
|
||||
if err != nil {
|
||||
fmt.Fprintln(os.Stderr, os.Args[0], ": ", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
err = stream.Stream([]string{fileName}, options, recordTransformers, handle, false)
|
||||
// We can't in-place update http://, https://, etc. Also, anything with
|
||||
// --prepipe or --prepipex, we won't try to guess how to invert that
|
||||
// command to produce re-compressed output.
|
||||
err = lib.IsUpdateableInPlace(fileName, options.ReaderOptions.Prepipe)
|
||||
if err != nil {
|
||||
fmt.Fprintln(os.Stderr, os.Args[0], ": ", err)
|
||||
fmt.Fprintf(os.Stderr, "mlr: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
containingDirectory := path.Dir(fileName)
|
||||
// Names like ./mlr-in-place-2148227797 and ./mlr-in-place-1792078347,
|
||||
// as revealed by printing handle.Name().
|
||||
handle, err := ioutil.TempFile(containingDirectory, "mlr-in-place-")
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "mlr: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
tempFileName := handle.Name()
|
||||
|
||||
// If the input file is compressed and we'll be doing in-process
|
||||
// decompression as we read the input file, try to do in-process
|
||||
// compression as we write the output.
|
||||
inputFileEncoding := lib.FindInputEncoding(fileName, options.ReaderOptions.FileInputEncoding)
|
||||
|
||||
// Get a handle with, perhaps, a recompression wrapper around it.
|
||||
wrappedHandle, isNew, err := lib.WrapOutputHandle(handle, inputFileEncoding)
|
||||
if err != nil {
|
||||
os.Remove(tempFileName)
|
||||
fmt.Fprintf(os.Stderr, "mlr: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// Run the Miller processing stream from the input file to the temp-output file.
|
||||
err = stream.Stream([]string{fileName}, options, recordTransformers, wrappedHandle, false)
|
||||
if err != nil {
|
||||
os.Remove(tempFileName)
|
||||
fmt.Fprintf(os.Stderr, "mlr: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// Close the recompressor handle, if any recompression is being applied.
|
||||
if isNew {
|
||||
err = wrappedHandle.Close()
|
||||
if err != nil {
|
||||
os.Remove(tempFileName)
|
||||
fmt.Fprintf(os.Stderr, "mlr: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
// Close the handle to the output file. This may force final writes, so
|
||||
// it must be error-checked.
|
||||
err = handle.Close()
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "%s: %v\n", "mlr", err)
|
||||
os.Remove(tempFileName)
|
||||
fmt.Fprintf(os.Stderr, "mlr: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// Rename the temp-output file on top of the input file.
|
||||
err = os.Rename(tempFileName, fileName)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "%s: %v\n", "mlr", err)
|
||||
os.Remove(tempFileName)
|
||||
fmt.Fprintf(os.Stderr, "mlr: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ import (
|
|||
"compress/bzip2"
|
||||
"compress/gzip"
|
||||
"compress/zlib"
|
||||
"errors"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
|
|
@ -221,3 +222,71 @@ func IsEOF(err error) bool {
|
|||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// Functions for in-place mode
|
||||
|
||||
// IsUpdateableInPlace reports whether the given input may be rewritten by
// mlr -I. It returns a non-nil error when the filename is an http://,
// https://, or file:// URL, or when a prepipe command is in effect (we don't
// presume to know how to invert such a command to produce the output);
// otherwise it returns nil.
func IsUpdateableInPlace(
	filename string,
	prepipe string,
) error {
	// The URL check comes first, so its message wins when a prepipe is also set.
	for _, urlPrefix := range []string{"http://", "https://", "file://"} {
		if strings.HasPrefix(filename, urlPrefix) {
			return errors.New("http://, https://, and file:// URLs are not updateable in place.")
		}
	}

	if len(prepipe) > 0 {
		return errors.New("input with --prepipe or --prepipex is not updateable in place.")
	}

	return nil
}
|
||||
|
||||
// FindInputEncoding determines the input encoding (compression), whether from
|
||||
// a flag like --gzin, or from filename suffix like ".gz". If the user did
|
||||
// --gzin on the command line, TFileInputEncoding will be
|
||||
// FileInputEncodingGzip. If they didn't, but the filename ends in ".gz", then
|
||||
// we auto-infer FileInputEncodingGzip. Either way, this function tells if we
|
||||
// will be using in-process decompression within the file-format-specific
|
||||
// record reader.
|
||||
func FindInputEncoding(
|
||||
filename string,
|
||||
inputFileInputEncoding TFileInputEncoding,
|
||||
) TFileInputEncoding {
|
||||
if inputFileInputEncoding != FileInputEncodingDefault {
|
||||
return inputFileInputEncoding
|
||||
}
|
||||
if strings.HasSuffix(filename, ".bz2") {
|
||||
return FileInputEncodingBzip2
|
||||
}
|
||||
if strings.HasSuffix(filename, ".gz") {
|
||||
return FileInputEncodingGzip
|
||||
}
|
||||
if strings.HasSuffix(filename, ".z") {
|
||||
return FileInputEncodingZlib
|
||||
}
|
||||
return FileInputEncodingDefault
|
||||
}
|
||||
|
||||
// WrapOutputHandle wraps a file-write handle with a decompressor. The first
|
||||
// return value is the wrapped handle. The second is true if the returned
|
||||
// handle needs to be closed separately from the original. The third is for
|
||||
// in-process compression we can't undo: namely, as of September 2021 the gzip
|
||||
// and zlib libraries support write-closers, but the bzip2 library does not.
|
||||
func WrapOutputHandle(
|
||||
fileWriteHandle io.WriteCloser,
|
||||
inputFileEncoding TFileInputEncoding,
|
||||
) (io.WriteCloser, bool, error) {
|
||||
switch inputFileEncoding {
|
||||
case FileInputEncodingBzip2:
|
||||
return fileWriteHandle, false, errors.New("bzip2 is not currently supported for in-place mode.")
|
||||
case FileInputEncodingGzip:
|
||||
return gzip.NewWriter(fileWriteHandle), true, nil
|
||||
case FileInputEncodingZlib:
|
||||
return zlib.NewWriter(fileWriteHandle), true, nil
|
||||
default:
|
||||
return fileWriteHandle, false, nil
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ package stream
|
|||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
|
||||
"mlr/src/cli"
|
||||
|
|
@ -33,7 +34,7 @@ func Stream(
|
|||
fileNames []string,
|
||||
options cli.TOptions,
|
||||
recordTransformers []transformers.IRecordTransformer,
|
||||
outputStream *os.File,
|
||||
outputStream io.WriteCloser,
|
||||
outputIsStdout bool,
|
||||
) error {
|
||||
|
||||
|
|
|
|||
|
|
@ -655,7 +655,7 @@ AUXILIARY COMMANDS
|
|||
help
|
||||
regtest
|
||||
repl
|
||||
For more information, please invoke mlr {subcommand} --help.
|
||||
For more information, please invoke mlr {subcommand} --help.
|
||||
|
||||
MLRRC
|
||||
You can set up personal defaults via a $HOME/.mlrrc and/or ./.mlrrc.
|
||||
|
|
@ -2720,4 +2720,4 @@ SEE ALSO
|
|||
|
||||
|
||||
|
||||
2021-09-19 MILLER(1)
|
||||
2021-09-20 MILLER(1)
|
||||
|
|
|
|||
|
|
@ -2,12 +2,12 @@
|
|||
.\" Title: mlr
|
||||
.\" Author: [see the "AUTHOR" section]
|
||||
.\" Generator: ./mkman.rb
|
||||
.\" Date: 2021-09-19
|
||||
.\" Date: 2021-09-20
|
||||
.\" Manual: \ \&
|
||||
.\" Source: \ \&
|
||||
.\" Language: English
|
||||
.\"
|
||||
.TH "MILLER" "1" "2021-09-19" "\ \&" "\ \&"
|
||||
.TH "MILLER" "1" "2021-09-20" "\ \&" "\ \&"
|
||||
.\" -----------------------------------------------------------------
|
||||
.\" * Portability definitions
|
||||
.\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
|
@ -804,7 +804,7 @@ Available subcommands:
|
|||
help
|
||||
regtest
|
||||
repl
|
||||
For more information, please invoke mlr {subcommand} --help.
|
||||
For more information, please invoke mlr {subcommand} --help.
|
||||
.fi
|
||||
.if n \{\
|
||||
.RE
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue