From 06e16ea3ee997a32b7c7590acfcdc60b62b92ece Mon Sep 17 00:00:00 2001 From: John Kerl Date: Wed, 13 Aug 2025 17:07:32 -0500 Subject: [PATCH] Don't parse CSV comments (#1859) * `mlr sort -b` feature * mlr regtest -p test/cases/cli-help && make dev * Don't parse CSV comments * Add tests for PR 1346 * Add tests for PR 1787 * Add test CSV files --- pkg/go-csv/csv_reader.go | 25 ++++++++--- pkg/input/record_reader_csv.go | 44 ++++++------------- test/cases/io-skip-pass-comments/pr-1346/cmd | 1 + .../io-skip-pass-comments/pr-1346/experr | 1 + .../io-skip-pass-comments/pr-1346/expout | 5 +++ .../io-skip-pass-comments/pr-1346/should-fail | 0 .../cases/io-skip-pass-comments/pr-1787-a/cmd | 1 + .../io-skip-pass-comments/pr-1787-a/experr | 1 + .../io-skip-pass-comments/pr-1787-a/expout | 2 + .../pr-1787-a/should-fail | 0 .../cases/io-skip-pass-comments/pr-1787-b/cmd | 1 + .../io-skip-pass-comments/pr-1787-b/experr | 0 .../io-skip-pass-comments/pr-1787-b/expout | 4 ++ .../cases/io-skip-pass-comments/pr-1787-c/cmd | 1 + .../io-skip-pass-comments/pr-1787-c/experr | 0 .../io-skip-pass-comments/pr-1787-c/expout | 3 ++ test/input/pr-1346.csv | 6 +++ test/input/pr-1787.csv | 4 ++ 18 files changed, 62 insertions(+), 37 deletions(-) create mode 100644 test/cases/io-skip-pass-comments/pr-1346/cmd create mode 100644 test/cases/io-skip-pass-comments/pr-1346/experr create mode 100644 test/cases/io-skip-pass-comments/pr-1346/expout create mode 100644 test/cases/io-skip-pass-comments/pr-1346/should-fail create mode 100644 test/cases/io-skip-pass-comments/pr-1787-a/cmd create mode 100644 test/cases/io-skip-pass-comments/pr-1787-a/experr create mode 100644 test/cases/io-skip-pass-comments/pr-1787-a/expout create mode 100644 test/cases/io-skip-pass-comments/pr-1787-a/should-fail create mode 100644 test/cases/io-skip-pass-comments/pr-1787-b/cmd create mode 100644 test/cases/io-skip-pass-comments/pr-1787-b/experr create mode 100644 test/cases/io-skip-pass-comments/pr-1787-b/expout create mode 100644 test/cases/io-skip-pass-comments/pr-1787-c/cmd create mode 100644 test/cases/io-skip-pass-comments/pr-1787-c/experr create mode 100644 test/cases/io-skip-pass-comments/pr-1787-c/expout create mode 100644 test/input/pr-1346.csv create mode 100644 test/input/pr-1787.csv diff --git a/pkg/go-csv/csv_reader.go b/pkg/go-csv/csv_reader.go index 507e9a94c..5a0820a01 100644 --- a/pkg/go-csv/csv_reader.go +++ b/pkg/go-csv/csv_reader.go @@ -311,15 +311,28 @@ func (r *Reader) readRecord(dst []string) ([]string, error) { var errRead error for errRead == nil { line, errRead = r.readLine() - if r.Comment != 0 && nextRune(line) == r.Comment { - line = nil - continue // Skip comment lines - } + + // MILLER-SPECIFIC UPDATE: DO NOT DO THIS + // if r.Comment != 0 && nextRune(line) == r.Comment { + // line = nil + // continue // Skip comment lines + // } + // MILLER-SPECIFIC UPDATE: DO NOT DO THIS // if errRead == nil && len(line) == lengthNL(line) { - // line = nil - // continue // Skip empty lines + // line = nil + // continue // Skip empty lines // } + + // MILLER-SPECIFIC UPDATE: If the line starts with the comment character, + // don't attempt to CSV-parse it -- just hand it back as a single field. + // This allows two things: + // * User comments get passed through as intended, without being reformatted; + // * Users can do things like `# a"b` in their comments without getting an + // imbalanced-double-quote error. + if r.Comment != 0 && nextRune(line) == r.Comment { + return []string{string(line)}, nil + } break } if errRead == io.EOF { diff --git a/pkg/input/record_reader_csv.go b/pkg/input/record_reader_csv.go index 6ed07250d..a154ac8ba 100644 --- a/pkg/input/record_reader_csv.go +++ b/pkg/input/record_reader_csv.go @@ -1,7 +1,6 @@ package input import ( - "bytes" "container/list" "fmt" "io" @@ -109,6 +108,14 @@ func (reader *RecordReaderCSV) processHandle( csvReader.Comma = rune(reader.ifs0) csvReader.LazyQuotes = reader.csvLazyQuotes csvReader.TrimLeadingSpace = reader.csvTrimLeadingSpace + + if reader.readerOptions.CommentHandling != cli.CommentsAreData { + if len(reader.readerOptions.CommentString) == 1 { + // Use our modified fork of the go-csv package + csvReader.Comment = rune(reader.readerOptions.CommentString[0]) + } + } + csvRecordsChannel := make(chan *list.List, recordsPerBatch) go channelizedCSVRecordScanner(csvReader, csvRecordsChannel, downstreamDoneChannel, errorChannel, recordsPerBatch) @@ -318,42 +325,17 @@ func (reader *RecordReaderCSV) maybeConsumeComment( // However, sadly, bytes.Buffer does not implement io.Writer because // its Write method has pointer receiver. So we have a WorkaroundBuffer // struct below which has non-pointer receiver. - buffer := NewWorkaroundBuffer() - csvWriter := csv.NewWriter(buffer) - csvWriter.Comma = rune(reader.ifs0) - csvWriter.Write(csvRecord) - csvWriter.Flush() - recordsAndContexts.PushBack(types.NewOutputString(buffer.String(), context)) + + // Contract with our fork of the go-csv CSV Reader + lib.InternalCodingErrorIf(len(csvRecord) != 1) + recordsAndContexts.PushBack(types.NewOutputString(csvRecord[0], context)) + } else /* reader.readerOptions.CommentHandling == cli.SkipComments */ { // discard entirely } return false } -// ---------------------------------------------------------------- -// As noted above: wraps a bytes.Buffer, whose Write method has pointer -// receiver, in a struct with non-pointer receiver so that it implements -// io.Writer. - -type WorkaroundBuffer struct { - pbuffer *bytes.Buffer -} - -func NewWorkaroundBuffer() WorkaroundBuffer { - var buffer bytes.Buffer - return WorkaroundBuffer{ - pbuffer: &buffer, - } -} - -func (wb WorkaroundBuffer) Write(p []byte) (n int, err error) { - return wb.pbuffer.Write(p) -} - -func (wb WorkaroundBuffer) String() string { - return wb.pbuffer.String() -} - // ---------------------------------------------------------------- // BOM-stripping // diff --git a/test/cases/io-skip-pass-comments/pr-1346/cmd b/test/cases/io-skip-pass-comments/pr-1346/cmd new file mode 100644 index 000000000..611187612 --- /dev/null +++ b/test/cases/io-skip-pass-comments/pr-1346/cmd @@ -0,0 +1 @@ +mlr --skip-comments --csv --pass-comments cat test/input/pr-1346.csv diff --git a/test/cases/io-skip-pass-comments/pr-1346/experr b/test/cases/io-skip-pass-comments/pr-1346/experr new file mode 100644 index 000000000..10864f8ab --- /dev/null +++ b/test/cases/io-skip-pass-comments/pr-1346/experr @@ -0,0 +1 @@ +mlr: mlr: CSV header/data length mismatch 2 != 1 at filename test/input/pr-1346.csv row 4. diff --git a/test/cases/io-skip-pass-comments/pr-1346/expout b/test/cases/io-skip-pass-comments/pr-1346/expout new file mode 100644 index 000000000..b7872a7a9 --- /dev/null +++ b/test/cases/io-skip-pass-comments/pr-1346/expout @@ -0,0 +1,5 @@ +field1,field2 +a,b +# that was the first record +c,d +# that was the second record, and there is no more data diff --git a/test/cases/io-skip-pass-comments/pr-1346/should-fail b/test/cases/io-skip-pass-comments/pr-1346/should-fail new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-skip-pass-comments/pr-1787-a/cmd b/test/cases/io-skip-pass-comments/pr-1787-a/cmd new file mode 100644 index 000000000..8ecdde63e --- /dev/null +++ b/test/cases/io-skip-pass-comments/pr-1787-a/cmd @@ -0,0 +1 @@ +mlr --csv cat test/input/pr-1787.csv diff --git a/test/cases/io-skip-pass-comments/pr-1787-a/experr b/test/cases/io-skip-pass-comments/pr-1787-a/experr new file mode 100644 index 000000000..9e02e68bc --- /dev/null +++ b/test/cases/io-skip-pass-comments/pr-1787-a/experr @@ -0,0 +1 @@ +mlr: parse error on line 3, column 4: bare " in non-quoted-field. diff --git a/test/cases/io-skip-pass-comments/pr-1787-a/expout b/test/cases/io-skip-pass-comments/pr-1787-a/expout new file mode 100644 index 000000000..bfde6bfa0 --- /dev/null +++ b/test/cases/io-skip-pass-comments/pr-1787-a/expout @@ -0,0 +1,2 @@ +a,b,c +1,2,3 diff --git a/test/cases/io-skip-pass-comments/pr-1787-a/should-fail b/test/cases/io-skip-pass-comments/pr-1787-a/should-fail new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-skip-pass-comments/pr-1787-b/cmd b/test/cases/io-skip-pass-comments/pr-1787-b/cmd new file mode 100644 index 000000000..c79588a16 --- /dev/null +++ b/test/cases/io-skip-pass-comments/pr-1787-b/cmd @@ -0,0 +1 @@ +mlr --csv --pass-comments cat test/input/pr-1787.csv diff --git a/test/cases/io-skip-pass-comments/pr-1787-b/experr b/test/cases/io-skip-pass-comments/pr-1787-b/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-skip-pass-comments/pr-1787-b/expout b/test/cases/io-skip-pass-comments/pr-1787-b/expout new file mode 100644 index 000000000..23b8c638c --- /dev/null +++ b/test/cases/io-skip-pass-comments/pr-1787-b/expout @@ -0,0 +1,4 @@ +a,b,c +1,2,3 +# x"y +4,5,6 diff --git a/test/cases/io-skip-pass-comments/pr-1787-c/cmd b/test/cases/io-skip-pass-comments/pr-1787-c/cmd new file mode 100644 index 000000000..8e17a1f3e --- /dev/null +++ b/test/cases/io-skip-pass-comments/pr-1787-c/cmd @@ -0,0 +1 @@ +mlr --csv --skip-comments cat test/input/pr-1787.csv diff --git a/test/cases/io-skip-pass-comments/pr-1787-c/experr b/test/cases/io-skip-pass-comments/pr-1787-c/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/io-skip-pass-comments/pr-1787-c/expout b/test/cases/io-skip-pass-comments/pr-1787-c/expout new file mode 100644 index 000000000..88700c714 --- /dev/null +++ b/test/cases/io-skip-pass-comments/pr-1787-c/expout @@ -0,0 +1,3 @@ +a,b,c +1,2,3 +4,5,6 diff --git a/test/input/pr-1346.csv b/test/input/pr-1346.csv new file mode 100644 index 000000000..6a46e0994 --- /dev/null +++ b/test/input/pr-1346.csv @@ -0,0 +1,6 @@ +field1,field2 +a,b +# that was the first record +c,d +# that was the second record, and there is no more data + diff --git a/test/input/pr-1787.csv b/test/input/pr-1787.csv new file mode 100644 index 000000000..23b8c638c --- /dev/null +++ b/test/input/pr-1787.csv @@ -0,0 +1,4 @@ +a,b,c +1,2,3 +# x"y +4,5,6