Don't parse CSV comments (#1859)

* `mlr sort -b` feature
* mlr regtest -p test/cases/cli-help && make dev
* Don't parse CSV comments
* Add tests for PR 1346
* Add tests for PR 1787
* Add test CSV files
2026-01-23 02:14:13 +00:00 · 2025-08-13 17:07:32 -05:00 · 2025-08-13 17:07:32 -05:00 · 06e16ea3ee
commit 06e16ea3ee
parent 369156b70d
18 changed files with 62 additions and 37 deletions
--- a/pkg/go-csv/csv_reader.go
+++ b/pkg/go-csv/csv_reader.go
@@ -311,15 +311,28 @@ func (r *Reader) readRecord(dst []string) ([]string, error) {
 	var errRead error
 	for errRead == nil {
 		line, errRead = r.readLine()
-		if r.Comment != 0 && nextRune(line) == r.Comment {
-			line = nil
-			continue // Skip comment lines
-		}
+
+		// MILLER-SPECIFIC UPDATE: DO NOT DO THIS
+		// if r.Comment != 0 && nextRune(line) == r.Comment {
+		//   line = nil
+		//   continue // Skip comment lines
+		// }
+
 		// MILLER-SPECIFIC UPDATE: DO NOT DO THIS
 		// if errRead == nil && len(line) == lengthNL(line) {
-		// 	line = nil
-		// 	continue // Skip empty lines
+		//   line = nil
+		//   continue // Skip empty lines
 		// }
+
+		// MILLER-SPECIFIC UPDATE: If the line starts with the comment character,
+		// don't attempt to CSV-parse it -- just hand it back as a single field.
+		// This allows two things:
+		// * User comments get passed through as intended, without being reformatted;
+		// * Users can do things like `# a"b` in their comments without getting an
+		//   imbalanced-double-quote error.
+		if r.Comment != 0 && nextRune(line) == r.Comment {
+			return []string{string(line)}, nil
+		}
 		break
 	}
 	if errRead == io.EOF {
--- a/pkg/input/record_reader_csv.go
+++ b/pkg/input/record_reader_csv.go
@@ -1,7 +1,6 @@
 package input

 import (
-	"bytes"
 	"container/list"
 	"fmt"
 	"io"
@@ -109,6 +108,14 @@ func (reader *RecordReaderCSV) processHandle(
 	csvReader.Comma = rune(reader.ifs0)
 	csvReader.LazyQuotes = reader.csvLazyQuotes
 	csvReader.TrimLeadingSpace = reader.csvTrimLeadingSpace
+
+	if reader.readerOptions.CommentHandling != cli.CommentsAreData {
+		if len(reader.readerOptions.CommentString) == 1 {
+			// Use our modified fork of the go-csv package
+			csvReader.Comment = rune(reader.readerOptions.CommentString[0])
+		}
+	}
+
 	csvRecordsChannel := make(chan *list.List, recordsPerBatch)
 	go channelizedCSVRecordScanner(csvReader, csvRecordsChannel, downstreamDoneChannel, errorChannel,
 		recordsPerBatch)
@@ -318,42 +325,17 @@ func (reader *RecordReaderCSV) maybeConsumeComment(
 		// However, sadly, bytes.Buffer does not implement io.Writer because
 		// its Write method has pointer receiver. So we have a WorkaroundBuffer
 		// struct below which has non-pointer receiver.
-		buffer := NewWorkaroundBuffer()
-		csvWriter := csv.NewWriter(buffer)
-		csvWriter.Comma = rune(reader.ifs0)
-		csvWriter.Write(csvRecord)
-		csvWriter.Flush()
-		recordsAndContexts.PushBack(types.NewOutputString(buffer.String(), context))
+
+		// Contract with our fork of the go-csv CSV Reader
+		lib.InternalCodingErrorIf(len(csvRecord) != 1)
+		recordsAndContexts.PushBack(types.NewOutputString(csvRecord[0], context))
+
 	} else /* reader.readerOptions.CommentHandling == cli.SkipComments */ {
 		// discard entirely
 	}
 	return false
 }

-// ----------------------------------------------------------------
-// As noted above: wraps a bytes.Buffer, whose Write method has pointer
-// receiver, in a struct with non-pointer receiver so that it implements
-// io.Writer.
-
-type WorkaroundBuffer struct {
-	pbuffer *bytes.Buffer
-}
-
-func NewWorkaroundBuffer() WorkaroundBuffer {
-	var buffer bytes.Buffer
-	return WorkaroundBuffer{
-		pbuffer: &buffer,
-	}
-}
-
-func (wb WorkaroundBuffer) Write(p []byte) (n int, err error) {
-	return wb.pbuffer.Write(p)
-}
-
-func (wb WorkaroundBuffer) String() string {
-	return wb.pbuffer.String()
-}
-
 // ----------------------------------------------------------------
 // BOM-stripping
 //