Don't parse CSV comments (#1859)

* `mlr sort -b` feature

* mlr regtest -p test/cases/cli-help && make dev

* Don't parse CSV comments

* Add tests for PR 1346

* Add tests for PR 1787

* Add test CSV files
This commit is contained in:
John Kerl 2025-08-13 17:07:32 -05:00 committed by GitHub
parent 369156b70d
commit 06e16ea3ee
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
18 changed files with 62 additions and 37 deletions

View file

@ -311,15 +311,28 @@ func (r *Reader) readRecord(dst []string) ([]string, error) {
var errRead error
for errRead == nil {
line, errRead = r.readLine()
if r.Comment != 0 && nextRune(line) == r.Comment {
line = nil
continue // Skip comment lines
}
// MILLER-SPECIFIC UPDATE: DO NOT DO THIS
// if r.Comment != 0 && nextRune(line) == r.Comment {
// line = nil
// continue // Skip comment lines
// }
// MILLER-SPECIFIC UPDATE: DO NOT DO THIS
// if errRead == nil && len(line) == lengthNL(line) {
// line = nil
// continue // Skip empty lines
// line = nil
// continue // Skip empty lines
// }
// MILLER-SPECIFIC UPDATE: If the line starts with the comment character,
// don't attempt to CSV-parse it -- just hand it back as a single field.
// This allows two things:
// * User comments get passed through as intended, without being reformatted;
// * Users can do things like `# a"b` in their comments without getting an
// imbalanced-double-quote error.
if r.Comment != 0 && nextRune(line) == r.Comment {
return []string{string(line)}, nil
}
break
}
if errRead == io.EOF {

View file

@ -1,7 +1,6 @@
package input
import (
"bytes"
"container/list"
"fmt"
"io"
@ -109,6 +108,14 @@ func (reader *RecordReaderCSV) processHandle(
csvReader.Comma = rune(reader.ifs0)
csvReader.LazyQuotes = reader.csvLazyQuotes
csvReader.TrimLeadingSpace = reader.csvTrimLeadingSpace
if reader.readerOptions.CommentHandling != cli.CommentsAreData {
if len(reader.readerOptions.CommentString) == 1 {
// Use our modified fork of the go-csv package
csvReader.Comment = rune(reader.readerOptions.CommentString[0])
}
}
csvRecordsChannel := make(chan *list.List, recordsPerBatch)
go channelizedCSVRecordScanner(csvReader, csvRecordsChannel, downstreamDoneChannel, errorChannel,
recordsPerBatch)
@ -318,42 +325,17 @@ func (reader *RecordReaderCSV) maybeConsumeComment(
// However, sadly, bytes.Buffer does not implement io.Writer because
// its Write method has a pointer receiver. So we have a WorkaroundBuffer
// struct below whose methods have a non-pointer receiver.
buffer := NewWorkaroundBuffer()
csvWriter := csv.NewWriter(buffer)
csvWriter.Comma = rune(reader.ifs0)
csvWriter.Write(csvRecord)
csvWriter.Flush()
recordsAndContexts.PushBack(types.NewOutputString(buffer.String(), context))
// Contract with our fork of the go-csv CSV Reader
lib.InternalCodingErrorIf(len(csvRecord) != 1)
recordsAndContexts.PushBack(types.NewOutputString(csvRecord[0], context))
} else /* reader.readerOptions.CommentHandling == cli.SkipComments */ {
// discard entirely
}
return false
}
// ----------------------------------------------------------------
// WorkaroundBuffer is a small value type that holds a pointer to a
// bytes.Buffer. The Write method of bytes.Buffer is declared on *bytes.Buffer,
// so a plain bytes.Buffer value does not satisfy io.Writer. The methods below
// are declared on the value type and forward to the pointed-to buffer, which
// lets a WorkaroundBuffer be passed by value wherever an io.Writer is wanted.
type WorkaroundBuffer struct {
	// pbuffer receives every write; copies of the struct share the same buffer.
	pbuffer *bytes.Buffer
}

// NewWorkaroundBuffer returns a WorkaroundBuffer backed by a new, empty
// bytes.Buffer.
func NewWorkaroundBuffer() WorkaroundBuffer {
	return WorkaroundBuffer{pbuffer: new(bytes.Buffer)}
}

// Write appends p to the underlying buffer and returns whatever
// bytes.Buffer.Write reports.
func (w WorkaroundBuffer) Write(p []byte) (n int, err error) {
	n, err = w.pbuffer.Write(p)
	return n, err
}

// String returns everything written so far as a string.
func (w WorkaroundBuffer) String() string {
	return w.pbuffer.String()
}
// ----------------------------------------------------------------
// BOM-stripping
//