diff --git a/docs6/src/data-cleaning-examples.md b/docs6/src/data-cleaning-examples.md index e877e6ed2..42e9d0410 100644 --- a/docs6/src/data-cleaning-examples.md +++ b/docs6/src/data-cleaning-examples.md @@ -88,5 +88,7 @@ A third way is to abort the process on first instance of bad data: mlr --csv put '$reachable = asserting_string($reachable)' data/het-bool.csv
+name,reachable
+barney,false
 mlr: is_string type-assertion failed at NR=4 FNR=4 FILENAME=data/het-bool.csv
 
diff --git a/docs6/src/manpage.md b/docs6/src/manpage.md index dfb9ec16d..97b639c6f 100644 --- a/docs6/src/manpage.md +++ b/docs6/src/manpage.md @@ -676,7 +676,7 @@ AUXILIARY COMMANDS help regtest repl - For more information, please invoke mlr {subcommand} --help. + For more information, please invoke mlr {subcommand} --help. MLRRC You can set up personal defaults via a $HOME/.mlrrc and/or ./.mlrrc. @@ -2741,5 +2741,5 @@ SEE ALSO - 2021-09-19 MILLER(1) + 2021-09-20 MILLER(1) diff --git a/docs6/src/manpage.txt b/docs6/src/manpage.txt index f74f1d8e1..387f4079a 100644 --- a/docs6/src/manpage.txt +++ b/docs6/src/manpage.txt @@ -655,7 +655,7 @@ AUXILIARY COMMANDS help regtest repl - For more information, please invoke mlr {subcommand} --help. + For more information, please invoke mlr {subcommand} --help. MLRRC You can set up personal defaults via a $HOME/.mlrrc and/or ./.mlrrc. @@ -2720,4 +2720,4 @@ SEE ALSO - 2021-09-19 MILLER(1) + 2021-09-20 MILLER(1) diff --git a/docs6/src/new-in-miller-6.md b/docs6/src/new-in-miller-6.md index 881d77d83..90adf2235 100644 --- a/docs6/src/new-in-miller-6.md +++ b/docs6/src/new-in-miller-6.md @@ -61,6 +61,28 @@ exceptions. See [Miller on Windows](miller-on-windows.md) for more information. Binaries are reliably available using GitHub Actions: see also [Installation](installation.md). +## Support for reading web URLs + +You can read input with prefixes `https://`, `http://`, and `file://`: + +
+mlr --csv sort -f shape \
+  https://raw.githubusercontent.com/johnkerl/miller/main/docs6/src/gz-example.csv.gz
+
+
+color,shape,flag,k,index,quantity,rate
+red,circle,true,3,16,13.8103,2.9010
+yellow,circle,true,8,73,63.9785,4.2370
+yellow,circle,true,9,87,63.5058,8.3350
+red,square,true,2,15,79.2778,0.0130
+red,square,false,4,48,77.5542,7.4670
+red,square,false,6,64,77.1991,9.5310
+purple,square,false,10,91,72.3735,8.2430
+yellow,triangle,true,1,11,43.6498,9.8870
+purple,triangle,false,5,51,81.2290,8.5910
+purple,triangle,false,7,65,80.1405,5.8240
+
+ ## In-process support for compressed input In addition to `--prepipe gunzip`, you can now use the `--gzin` flag. In fact, if your files end in `.gz` you don't even need to do that -- Miller will autodetect by file extension and automatically uncompress `mlr --csv cat foo.csv.gz`. Similarly for `.z` and `.bz2` files. Please see the page on [Compressed data](reference-main-compressed-data.md) for more information. diff --git a/docs6/src/new-in-miller-6.md.in b/docs6/src/new-in-miller-6.md.in index e4473c2e2..b831300bb 100644 --- a/docs6/src/new-in-miller-6.md.in +++ b/docs6/src/new-in-miller-6.md.in @@ -45,6 +45,15 @@ exceptions. See [Miller on Windows](miller-on-windows.md) for more information. Binaries are reliably available using GitHub Actions: see also [Installation](installation.md). +## Support for reading web URLs + +You can read input with prefixes `https://`, `http://`, and `file://`: + +GENMD_RUN_COMMAND +mlr --csv sort -f shape \ + https://raw.githubusercontent.com/johnkerl/miller/main/docs6/src/gz-example.csv.gz +GENMD_EOF + ## In-process support for compressed input In addition to `--prepipe gunzip`, you can now use the `--gzin` flag. In fact, if your files end in `.gz` you don't even need to do that -- Miller will autodetect by file extension and automatically uncompress `mlr --csv cat foo.csv.gz`. Similarly for `.z` and `.bz2` files. Please see the page on [Compressed data](reference-main-compressed-data.md) for more information. diff --git a/docs6/src/proofreads.txt b/docs6/src/proofreads.txt index 41bc1723e..937bd833a 100644 --- a/docs6/src/proofreads.txt +++ b/docs6/src/proofreads.txt @@ -8,6 +8,8 @@ mv localhost:6060 miller6 file:///Users/kerl/tmp/bar/miller6/pkg +! http reader -- new-in-miller-6 + ? twi-dm re all-contribs: all-contributors.org * nikos materials -> fold in @@ -15,14 +17,9 @@ c array+string slice UTs ? nidx no output-coloring? C! repifs !! https://pkg.go.dev/regexp#Regexp.Split 2-for-1 -- get regexp as well ? 
-C fix -I + auto-decompress - o disallow -I with --prepipe, verbosely - o output-writer decoration: - src/cli/option_parse.go: name: "--gzin", - src/cli/option_parse.go: options.ReaderOptions.FileInputEncoding = lib.FileInputEncodingGzip - ... * r-strings branch! +C stats1 --fr ---------------------------------------------------------------- * deduping pass! diff --git a/docs6/src/record-heterogeneity.md b/docs6/src/record-heterogeneity.md index 5243782c8..f17e9287a 100644 --- a/docs6/src/record-heterogeneity.md +++ b/docs6/src/record-heterogeneity.md @@ -127,8 +127,6 @@ If you `mlr csv cat` this, you'll get an error message: mlr --csv cat data/het/ragged.csv
-a,b,c
-1,2,3
 mlr :  mlr: CSV header/data length mismatch 3 != 2 at filename data/het/ragged.csv row 3.
 
 
diff --git a/docs6/src/reference-main-compressed-data.md b/docs6/src/reference-main-compressed-data.md index 36871db4a..7a9a316d3 100644 --- a/docs6/src/reference-main-compressed-data.md +++ b/docs6/src/reference-main-compressed-data.md @@ -151,7 +151,5 @@ yellow,circle,true,8,73,63.9785,4.2370 yellow,circle,true,9,87,63.5058,8.3350 -* Using the [in-place flag](reference-main-in-place-processing.md) `-I`, -as of August 2021 the overwritten file will _not_ be compressed as it was when it was read: -e.g. `mlr -I --csv cat gz-example.csv.gz` will write `gz-example.csv.gz` which contains -a plain, uncompressed CSV contents. This is a bug and will be fixed. +* Using the [in-place flag](reference-main-in-place-processing.md) `-I`, the overwritten file will +be compressed when possible. See the [page on in-place mode](reference-main-in-place-processing.md) for details. diff --git a/docs6/src/reference-main-compressed-data.md.in b/docs6/src/reference-main-compressed-data.md.in index e04a4ba07..0ffb98f75 100644 --- a/docs6/src/reference-main-compressed-data.md.in +++ b/docs6/src/reference-main-compressed-data.md.in @@ -103,7 +103,5 @@ yellow,circle,true,8,73,63.9785,4.2370 yellow,circle,true,9,87,63.5058,8.3350 GENMD_EOF -* Using the [in-place flag](reference-main-in-place-processing.md) `-I`, -as of August 2021 the overwritten file will _not_ be compressed as it was when it was read: -e.g. `mlr -I --csv cat gz-example.csv.gz` will write `gz-example.csv.gz` which contains -a plain, uncompressed CSV contents. This is a bug and will be fixed. +* Using the [in-place flag](reference-main-in-place-processing.md) `-I`, the overwritten file will +be compressed when possible. See the [page on in-place mode](reference-main-in-place-processing.md) for details. 
diff --git a/docs6/src/reference-main-in-place-processing.md b/docs6/src/reference-main-in-place-processing.md index 5da98bc63..f340cf6f9 100644 --- a/docs6/src/reference-main-in-place-processing.md +++ b/docs6/src/reference-main-in-place-processing.md @@ -23,6 +23,12 @@ By default, Miller output goes to the screen (or you can redirect a file using ` Since this replaces your data with modified data, it's often a good idea to back up your original files somewhere first, to protect against keystroking errors. -TODO: fix the combination of `-I` and compressed input. +Situations in which the input can't be updated in place: + +* If the input file is a URL of the form `http://...`, `https://...`, or `file://...`. +* If a [`--prepipe` or `--prepipex` flag](reference-main-compressed-data.md#external-decompressors-on-input) is being used. +* If [in-place compression](reference-main-compressed-data.md) is being used and the format is BZIP2. For technical reasons, this can't be recompressed in place. (GZIP and ZLIB, however, are recompressable in place). + +Additional note: `gzip` supports various compression levels, from 1 to 9. If you do `mlr -I ... yourfile.gz` then Miller will produce compressed output using GZIP, but, it makes no attempt to determine, or mimic, the original compression level of the input. Please see [Choices for printing to files](10min.md#choices-for-printing-to-files) for examples. diff --git a/docs6/src/reference-main-in-place-processing.md.in b/docs6/src/reference-main-in-place-processing.md.in index 19d142198..1dc1efe65 100644 --- a/docs6/src/reference-main-in-place-processing.md.in +++ b/docs6/src/reference-main-in-place-processing.md.in @@ -7,6 +7,12 @@ By default, Miller output goes to the screen (or you can redirect a file using ` Since this replaces your data with modified data, it's often a good idea to back up your original files somewhere first, to protect against keystroking errors. -TODO: fix the combination of `-I` and compressed input. 
+Situations in which the input can't be updated in place: + +* If the input file is a URL of the form `http://...`, `https://...`, or `file://...`. +* If a [`--prepipe` or `--prepipex` flag](reference-main-compressed-data.md#external-decompressors-on-input) is being used. +* If [in-place compression](reference-main-compressed-data.md) is being used and the format is BZIP2. For technical reasons, this can't be recompressed in place. (GZIP and ZLIB, however, are recompressable in place). + +Additional note: `gzip` supports various compression levels, from 1 to 9. If you do `mlr -I ... yourfile.gz` then Miller will produce compressed output using GZIP, but, it makes no attempt to determine, or mimic, the original compression level of the input. Please see [Choices for printing to files](10min.md#choices-for-printing-to-files) for examples. diff --git a/go/src/auxents/auxents.go b/go/src/auxents/auxents.go index bc24f97e5..e97e29d1c 100644 --- a/go/src/auxents/auxents.go +++ b/go/src/auxents/auxents.go @@ -80,5 +80,5 @@ func ShowAuxEntries(o *os.File) { fmt.Fprintf(o, " %s\n", entry.name) } - fmt.Fprintf(o, "For more information, please invoke mlrt {subcommand} --help.\n") + fmt.Fprintf(o, "For more information, please invoke mlr {subcommand} --help.\n") } diff --git a/go/src/entrypoint/entrypoint.go b/go/src/entrypoint/entrypoint.go index ee8e893c3..5b0f87b1d 100644 --- a/go/src/entrypoint/entrypoint.go +++ b/go/src/entrypoint/entrypoint.go @@ -11,12 +11,11 @@ import ( "os" "path" - "mlr/src/platform" - "mlr/src/auxents" "mlr/src/cli" "mlr/src/climain" "mlr/src/lib" + "mlr/src/platform" "mlr/src/stream" "mlr/src/transformers" ) @@ -110,38 +109,76 @@ func processInPlace( // reader, mappers, and writer individually for each file name. This // way CSV headers appear in each file, head -n 10 puts 10 rows for // each output file, and so on. 
- - containingDirectory := path.Dir(fileName) - // Names like ./mlr-in-place-2148227797 and ./mlr-in-place-1792078347, - // as revealed by printing handle.Name(). - handle, err := ioutil.TempFile(containingDirectory, "mlr-in-place-") - if err != nil { - fmt.Fprintf(os.Stderr, "%s: %v\n", "mlr", err) - os.Exit(1) - } - tempFileName := handle.Name() - options, recordTransformers, err := climain.ParseCommandLine(os.Args) if err != nil { fmt.Fprintln(os.Stderr, os.Args[0], ": ", err) os.Exit(1) } - err = stream.Stream([]string{fileName}, options, recordTransformers, handle, false) + // We can't in-place update http://, https://, etc. Also, anything with + // --prepipe or --prepipex, we won't try to guess how to invert that + // command to produce re-compressed output. + err = lib.IsUpdateableInPlace(fileName, options.ReaderOptions.Prepipe) if err != nil { - fmt.Fprintln(os.Stderr, os.Args[0], ": ", err) + fmt.Fprintf(os.Stderr, "mlr: %v\n", err) os.Exit(1) } + containingDirectory := path.Dir(fileName) + // Names like ./mlr-in-place-2148227797 and ./mlr-in-place-1792078347, + // as revealed by printing handle.Name(). + handle, err := ioutil.TempFile(containingDirectory, "mlr-in-place-") + if err != nil { + fmt.Fprintf(os.Stderr, "mlr: %v\n", err) + os.Exit(1) + } + tempFileName := handle.Name() + + // If the input file is compressed and we'll be doing in-process + // decompression as we read the input file, try to do in-process + // compression as we write the output. + inputFileEncoding := lib.FindInputEncoding(fileName, options.ReaderOptions.FileInputEncoding) + + // Get a handle with, perhaps, a recompression wrapper around it. + wrappedHandle, isNew, err := lib.WrapOutputHandle(handle, inputFileEncoding) + if err != nil { + os.Remove(tempFileName) + fmt.Fprintf(os.Stderr, "mlr: %v\n", err) + os.Exit(1) + } + + // Run the Miller processing stream from the input file to the temp-output file. 
+ err = stream.Stream([]string{fileName}, options, recordTransformers, wrappedHandle, false) + if err != nil { + os.Remove(tempFileName) + fmt.Fprintf(os.Stderr, "mlr: %v\n", err) + os.Exit(1) + } + + // Close the recompressor handle, if any recompression is being applied. + if isNew { + err = wrappedHandle.Close() + if err != nil { + os.Remove(tempFileName) + fmt.Fprintf(os.Stderr, "mlr: %v\n", err) + os.Exit(1) + } + } + + // Close the handle to the output file. This may force final writes, so + // it must be error-checked. err = handle.Close() if err != nil { - fmt.Fprintf(os.Stderr, "%s: %v\n", "mlr", err) + os.Remove(tempFileName) + fmt.Fprintf(os.Stderr, "mlr: %v\n", err) os.Exit(1) } + // Rename the temp-output file on top of the input file. err = os.Rename(tempFileName, fileName) if err != nil { - fmt.Fprintf(os.Stderr, "%s: %v\n", "mlr", err) + os.Remove(tempFileName) + fmt.Fprintf(os.Stderr, "mlr: %v\n", err) os.Exit(1) } } diff --git a/go/src/lib/file-readers.go b/go/src/lib/file-readers.go index 00339d7cc..5efbe20ce 100644 --- a/go/src/lib/file-readers.go +++ b/go/src/lib/file-readers.go @@ -24,6 +24,7 @@ import ( "compress/bzip2" "compress/gzip" "compress/zlib" + "errors" "io" "net/http" "os" @@ -221,3 +222,71 @@ func IsEOF(err error) bool { return false } } + +// ---------------------------------------------------------------- +// Functions for in-place mode + +// IsUpdateableInPlace tells if we can use the input with mlr -I: not for URLs, +// and not for prepipe commands (which we don't presume to know how to invert +// for output). 
+func IsUpdateableInPlace( + filename string, + prepipe string, +) error { + if strings.HasPrefix(filename, "http://") || + strings.HasPrefix(filename, "https://") || + strings.HasPrefix(filename, "file://") { + return errors.New("http://, https://, and file:// URLs are not updateable in place.") + } + if prepipe != "" { + return errors.New("input with --prepipe or --prepipex is not updateable in place.") + } + return nil +} + +// FindInputEncoding determines the input encoding (compression), whether from +// a flag like --gzin, or from filename suffix like ".gz". If the user did +// --gzin on the command line, TFileInputEncoding will be +// FileInputEncodingGzip. If they didn't, but the filename ends in ".gz", then +// we auto-infer FileInputEncodingGzip. Either way, this function tells if we +// will be using in-process decompression within the file-format-specific +// record reader. +func FindInputEncoding( + filename string, + inputFileInputEncoding TFileInputEncoding, +) TFileInputEncoding { + if inputFileInputEncoding != FileInputEncodingDefault { + return inputFileInputEncoding + } + if strings.HasSuffix(filename, ".bz2") { + return FileInputEncodingBzip2 + } + if strings.HasSuffix(filename, ".gz") { + return FileInputEncodingGzip + } + if strings.HasSuffix(filename, ".z") { + return FileInputEncodingZlib + } + return FileInputEncodingDefault +} + +// WrapOutputHandle wraps a file-write handle with a compressor. The first +// return value is the wrapped handle. The second is true if the returned +// handle needs to be closed separately from the original. The third is an +// error for input encodings we can't re-compress: namely, as of September 2021 the gzip +// and zlib libraries support write-closers, but the bzip2 library does not.
+func WrapOutputHandle( + fileWriteHandle io.WriteCloser, + inputFileEncoding TFileInputEncoding, +) (io.WriteCloser, bool, error) { + switch inputFileEncoding { + case FileInputEncodingBzip2: + return fileWriteHandle, false, errors.New("bzip2 is not currently supported for in-place mode.") + case FileInputEncodingGzip: + return gzip.NewWriter(fileWriteHandle), true, nil + case FileInputEncodingZlib: + return zlib.NewWriter(fileWriteHandle), true, nil + default: + return fileWriteHandle, false, nil + } +} diff --git a/go/src/stream/stream.go b/go/src/stream/stream.go index b87b09ac2..db66b057f 100644 --- a/go/src/stream/stream.go +++ b/go/src/stream/stream.go @@ -3,6 +3,7 @@ package stream import ( "errors" "fmt" + "io" "os" "mlr/src/cli" @@ -33,7 +34,7 @@ func Stream( fileNames []string, options cli.TOptions, recordTransformers []transformers.IRecordTransformer, - outputStream *os.File, + outputStream io.WriteCloser, outputIsStdout bool, ) error { diff --git a/man6/manpage.txt b/man6/manpage.txt index f74f1d8e1..387f4079a 100644 --- a/man6/manpage.txt +++ b/man6/manpage.txt @@ -655,7 +655,7 @@ AUXILIARY COMMANDS help regtest repl - For more information, please invoke mlr {subcommand} --help. + For more information, please invoke mlr {subcommand} --help. MLRRC You can set up personal defaults via a $HOME/.mlrrc and/or ./.mlrrc.
@@ -2720,4 +2720,4 @@ SEE ALSO - 2021-09-19 MILLER(1) + 2021-09-20 MILLER(1) diff --git a/man6/mlr6.1 b/man6/mlr6.1 index 6374b1bbc..543d7c129 100644 --- a/man6/mlr6.1 +++ b/man6/mlr6.1 @@ -2,12 +2,12 @@ .\" Title: mlr .\" Author: [see the "AUTHOR" section] .\" Generator: ./mkman.rb -.\" Date: 2021-09-19 +.\" Date: 2021-09-20 .\" Manual: \ \& .\" Source: \ \& .\" Language: English .\" -.TH "MILLER" "1" "2021-09-19" "\ \&" "\ \&" +.TH "MILLER" "1" "2021-09-20" "\ \&" "\ \&" .\" ----------------------------------------------------------------- .\" * Portability definitions .\" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -804,7 +804,7 @@ Available subcommands: help regtest repl -For more information, please invoke mlr {subcommand} --help. +For more information, please invoke mlr {subcommand} --help. .fi .if n \{\ .RE