remove mmap-readers, which were high-maintenance and not able to be used when most needed

This commit is contained in:
John Kerl 2020-01-26 10:21:31 -05:00
parent 740066fcc9
commit 2632ddc716
28 changed files with 93 additions and 4058 deletions

View file

@ -81,7 +81,6 @@ TEST_BYTE_READERS_SRCS = \
lib/string_builder.c \
input/string_byte_reader.c \
input/stdio_byte_reader.c \
input/mmap_byte_reader.c \
unit_test/test_byte_readers.c
TEST_LINE_READERS_SRCS = \
@ -137,18 +136,12 @@ TEST_LREC_SRCS = \
containers/sllmv.c \
containers/mlhmmv.c \
input/line_readers.c \
input/file_reader_mmap.c \
input/file_reader_stdio.c \
input/file_ingestor_stdio.c \
input/lrec_reader_mmap_csvlite.c \
input/lrec_reader_stdio_csvlite.c \
input/lrec_reader_mmap_dkvp.c \
input/lrec_reader_stdio_dkvp.c \
input/lrec_reader_mmap_nidx.c \
input/lrec_reader_stdio_nidx.c \
input/lrec_reader_mmap_xtab.c \
input/lrec_reader_stdio_xtab.c \
input/lrec_reader_mmap_json.c \
input/lrec_reader_stdio_json.c \
input/mlr_json_adapter.c \
input/json_parser.c \
@ -187,18 +180,12 @@ TEST_MULTIPLE_CONTAINERS_SRCS = \
containers/top_keeper.c \
containers/dheap.c \
input/line_readers.c \
input/file_reader_mmap.c \
input/file_reader_stdio.c \
input/file_ingestor_stdio.c \
input/lrec_reader_mmap_csvlite.c \
input/lrec_reader_stdio_csvlite.c \
input/lrec_reader_mmap_dkvp.c \
input/lrec_reader_stdio_dkvp.c \
input/lrec_reader_mmap_nidx.c \
input/lrec_reader_stdio_nidx.c \
input/lrec_reader_mmap_xtab.c \
input/lrec_reader_stdio_xtab.c \
input/lrec_reader_mmap_json.c \
input/lrec_reader_stdio_json.c \
input/mlr_json_adapter.c \
input/json_parser.c \
@ -358,27 +345,19 @@ TEST_JOIN_BUCKET_KEEPER_SRCS = \
containers/mixutil.c \
containers/header_keeper.c \
containers/join_bucket_keeper.c \
input/mmap_byte_reader.c \
input/stdio_byte_reader.c \
input/line_readers.c \
input/lrec_reader_gen.c \
input/lrec_reader_in_memory.c \
input/lrec_readers.c \
input/lrec_reader_mmap_csv.c \
input/lrec_reader_stdio_csv.c \
input/lrec_reader_mmap_csvlite.c \
input/lrec_reader_stdio_csvlite.c \
input/lrec_reader_mmap_dkvp.c \
input/lrec_reader_stdio_dkvp.c \
input/lrec_reader_mmap_nidx.c \
input/lrec_reader_stdio_nidx.c \
input/lrec_reader_mmap_xtab.c \
input/lrec_reader_stdio_xtab.c \
input/lrec_reader_mmap_json.c \
input/lrec_reader_stdio_json.c \
input/mlr_json_adapter.c \
input/json_parser.c \
input/file_reader_mmap.c \
input/file_reader_stdio.c \
input/file_ingestor_stdio.c \
input/peek_file_reader.c \
@ -398,7 +377,6 @@ EXPERIMENTAL_READER_SRCS = \
lib/string_array.c \
lib/string_builder.c \
input/stdio_byte_reader.c \
input/file_reader_mmap.c \
input/line_readers.c \
containers/parse_trie.c \
experimental/getlines.c
@ -492,7 +470,6 @@ unit-test: test-mlrutil test-mlrregex test-argparse test-line-readers test-byte-
reg-test:
./reg_test/run
./reg_test/run --no-mmap
# ----------------------------------------------------------------
# Run this after unit-test expected output has changed, and is verified to be

View file

@ -76,7 +76,6 @@ TEST_BYTE_READERS_SRCS = \
lib/string_builder.c \
input/string_byte_reader.c \
input/stdio_byte_reader.c \
input/mmap_byte_reader.c \
unit_test/test_byte_readers.c
TEST_LINE_READERS_SRCS = \
@ -125,18 +124,12 @@ TEST_LREC_SRCS = \
containers/sllmv.c \
containers/mlhmmv.c \
input/line_readers.c \
input/file_reader_mmap.c \
input/file_reader_stdio.c \
input/file_ingestor_stdio.c \
input/lrec_reader_mmap_csvlite.c \
input/lrec_reader_stdio_csvlite.c \
input/lrec_reader_mmap_dkvp.c \
input/lrec_reader_stdio_dkvp.c \
input/lrec_reader_mmap_nidx.c \
input/lrec_reader_stdio_nidx.c \
input/lrec_reader_mmap_xtab.c \
input/lrec_reader_stdio_xtab.c \
input/lrec_reader_mmap_json.c \
input/lrec_reader_stdio_json.c \
input/mlr_json_adapter.c \
input/json_parser.c \
@ -173,18 +166,12 @@ TEST_MULTIPLE_CONTAINERS_SRCS = \
containers/top_keeper.c \
containers/dheap.c \
input/line_readers.c \
input/file_reader_mmap.c \
input/file_reader_stdio.c \
input/file_ingestor_stdio.c \
input/lrec_reader_mmap_csvlite.c \
input/lrec_reader_stdio_csvlite.c \
input/lrec_reader_mmap_dkvp.c \
input/lrec_reader_stdio_dkvp.c \
input/lrec_reader_mmap_nidx.c \
input/lrec_reader_stdio_nidx.c \
input/lrec_reader_mmap_xtab.c \
input/lrec_reader_stdio_xtab.c \
input/lrec_reader_mmap_json.c \
input/lrec_reader_stdio_json.c \
input/mlr_json_adapter.c \
input/json_parser.c \
@ -325,26 +312,18 @@ TEST_JOIN_BUCKET_KEEPER_SRCS = \
containers/mixutil.c \
containers/header_keeper.c \
containers/join_bucket_keeper.c \
input/mmap_byte_reader.c \
input/stdio_byte_reader.c \
input/line_readers.c \
input/lrec_reader_in_memory.c \
input/lrec_readers.c \
input/lrec_reader_mmap_csv.c \
input/lrec_reader_stdio_csv.c \
input/lrec_reader_mmap_csvlite.c \
input/lrec_reader_stdio_csvlite.c \
input/lrec_reader_mmap_dkvp.c \
input/lrec_reader_stdio_dkvp.c \
input/lrec_reader_mmap_nidx.c \
input/lrec_reader_stdio_nidx.c \
input/lrec_reader_mmap_xtab.c \
input/lrec_reader_stdio_xtab.c \
input/lrec_reader_mmap_json.c \
input/lrec_reader_stdio_json.c \
input/mlr_json_adapter.c \
input/json_parser.c \
input/file_reader_mmap.c \
input/file_reader_stdio.c \
input/file_ingestor_stdio.c \
input/peek_file_reader.c \
@ -362,7 +341,6 @@ EXPERIMENTAL_READER_SRCS = \
lib/string_array.c \
lib/string_builder.c \
input/stdio_byte_reader.c \
input/file_reader_mmap.c \
input/line_readers.c \
containers/parse_trie.c \
experimental/getlines.c

View file

@ -33,7 +33,6 @@
#define DEFAULT_JSON_FLATTEN_SEPARATOR ":"
#define DEFAULT_OOSVAR_FLATTEN_SEPARATOR ":"
#define DEFAULT_COMMENT_STRING "#"
#define DEFAULT_MAX_FILE_SIZE_FOR_MMAP (4LL*1024LL*1024LL*1024LL)
// ASCII 1f and 1e
#define ASV_FS "\x1f"
@ -278,36 +277,9 @@ cli_opts_t* parse_command_line(int argc, char** argv, sllv_t** ppmapper_list) {
slls_append(popts->filenames, argv[argi], NO_FREE);
}
// Check for use of mmap. It's about 20% faster than stdio (due to fewer data copies
// -- lrecs can be pointer-backed by mmap memory) but we can't use it in all situations.
if (no_input) {
slls_free(popts->filenames);
popts->filenames = NULL;
} else if (popts->filenames->length == 0) {
// No filenames means read from standard input, and standard input cannot be mmapped.
popts->reader_opts.use_mmap_for_read = FALSE;
} else if (popts->filenames->length > 10) {
// https://github.com/johnkerl/miller/issues/256: too many small files is as bad as one big one
// (for which see immediately below).
popts->reader_opts.use_mmap_for_read = FALSE;
} else if (popts->reader_opts.use_mmap_for_read == TRUE) {
// https://github.com/johnkerl/miller/issues/160: don't use mmap for large files.
//
// If any input files don't exist, don't error out just yet ... it's possible that the user
// is doing some complex put-with-tee or somesuch which will create the input file by the
// time it's needed. In that case we of course can't know the size yet, so avoid mmap there
// to be safe.
int all_exist_and_are_small_enough = TRUE;
for (sllse_t* pe = popts->filenames->phead; pe != NULL; pe = pe->pnext) {
ssize_t file_size = get_file_size(pe->value);
if (file_size == (ssize_t)(-1) || file_size >= popts->reader_opts.max_file_size_for_mmap) {
all_exist_and_are_small_enough = FALSE;
break;
}
}
if (!all_exist_and_are_small_enough) {
popts->reader_opts.use_mmap_for_read = FALSE;
}
}
if (popts->do_in_place && (popts->filenames == NULL || popts->filenames->length == 0)) {
@ -842,14 +814,6 @@ static void main_usage_data_format_options(FILE* o, char* argv0) {
fprintf(o, "\n");
fprintf(o, " -p is a keystroke-saver for --nidx --fs space --repifs\n");
fprintf(o, "\n");
fprintf(o, " --mmap --no-mmap --mmap-below {n} Use mmap for files whenever possible, never, or\n");
fprintf(o, " for files less than n bytes in size. Default is for\n");
fprintf(o, " files less than %lld bytes in size.\n", DEFAULT_MAX_FILE_SIZE_FOR_MMAP);
fprintf(o, " 'Whenever possible' means always except for when reading\n");
fprintf(o, " standard input which is not mmappable. If you don't know\n");
fprintf(o, " what this means, don't worry about it -- it's a minor\n");
fprintf(o, " performance optimization.\n");
fprintf(o, "\n");
fprintf(o, " Examples: --csv for CSV-formatted input and output; --idkvp --opprint for\n");
fprintf(o, " DKVP-formatted input and pretty-printed output.\n");
fprintf(o, "\n");
@ -1139,14 +1103,11 @@ void cli_reader_opts_init(cli_reader_opts_t* preader_opts) {
preader_opts->allow_repeat_ips = NEITHER_TRUE_NOR_FALSE;
preader_opts->use_implicit_csv_header = NEITHER_TRUE_NOR_FALSE;
preader_opts->allow_ragged_csv_input = NEITHER_TRUE_NOR_FALSE;
preader_opts->use_mmap_for_read = NEITHER_TRUE_NOR_FALSE;
preader_opts->prepipe = NULL;
preader_opts->comment_handling = COMMENTS_ARE_DATA;
preader_opts->comment_string = NULL;
preader_opts->max_file_size_for_mmap = DEFAULT_MAX_FILE_SIZE_FOR_MMAP;
// xxx temp
preader_opts->generator_opts.field_name = "i";
preader_opts->generator_opts.start = 0LL;
@ -1198,13 +1159,6 @@ void cli_apply_reader_defaults(cli_reader_opts_t* preader_opts) {
if (preader_opts->allow_ragged_csv_input == NEITHER_TRUE_NOR_FALSE)
preader_opts->allow_ragged_csv_input = FALSE;
if (preader_opts->use_mmap_for_read == NEITHER_TRUE_NOR_FALSE)
#if MLR_ARCH_MMAP_ENABLED
preader_opts->use_mmap_for_read = TRUE;
#else
preader_opts->use_mmap_for_read = FALSE;
#endif
if (preader_opts->input_json_flatten_separator == NULL)
preader_opts->input_json_flatten_separator = DEFAULT_JSON_FLATTEN_SEPARATOR;
}
@ -1311,9 +1265,6 @@ void cli_merge_reader_opts(cli_reader_opts_t* pfunc_opts, cli_reader_opts_t* pma
if (pfunc_opts->allow_ragged_csv_input == NEITHER_TRUE_NOR_FALSE)
pfunc_opts->allow_ragged_csv_input = pmain_opts->allow_ragged_csv_input;
if (pfunc_opts->use_mmap_for_read == NEITHER_TRUE_NOR_FALSE)
pfunc_opts->use_mmap_for_read = pmain_opts->use_mmap_for_read;
if (pfunc_opts->input_json_flatten_separator == NULL)
pfunc_opts->input_json_flatten_separator = pmain_opts->input_json_flatten_separator;
}
@ -1642,28 +1593,18 @@ int cli_handle_reader_options(char** argv, int argc, int *pargi, cli_reader_opts
argi += 1;
} else if (streq(argv[argi], "--mmap")) {
preader_opts->use_mmap_for_read = TRUE;
// No-op as of 5.6.3 (mmap is being abandoned) but don't break
// the command-line user experience.
argi += 1;
} else if (streq(argv[argi], "--no-mmap")) {
preader_opts->use_mmap_for_read = FALSE;
// No-op as of 5.6.3 (mmap is being abandoned) but don't break
// the command-line user experience.
argi += 1;
} else if (streq(argv[argi], "--mmap-below")) {
check_arg_count(argv, argi, argc, 2);
preader_opts->use_mmap_for_read = TRUE;
long long llmax;
if (sscanf(argv[argi+1], "%lld", &llmax) != 1) {
fprintf(stderr, "%s: could not scan \"%s\".\n",
MLR_GLOBALS.bargv0, argv[argi+1]);
}
preader_opts->max_file_size_for_mmap = llmax;
argi += 2;
} else if (streq(argv[argi], "--prepipe")) {
check_arg_count(argv, argi, argc, 2);
preader_opts->prepipe = argv[argi+1];
preader_opts->use_mmap_for_read = FALSE;
argi += 2;
} else if (streq(argv[argi], "--skip-comments")) {

View file

@ -37,7 +37,6 @@ typedef struct _cli_reader_opts_t {
int allow_repeat_ips;
int use_implicit_csv_header;
int allow_ragged_csv_input;
int use_mmap_for_read;
// Command for popen on input, e.g. "zcat -cf <". Can be null in which case
// files are read directly rather than through a pipe.
@ -46,9 +45,6 @@ typedef struct _cli_reader_opts_t {
comment_handling_t comment_handling;
char* comment_string;
// https://github.com/johnkerl/miller/issues/160
ssize_t max_file_size_for_mmap;
// Fake internal-data-generator 'reader'
generator_opts_t generator_opts;

View file

@ -2,8 +2,6 @@ noinst_LTLIBRARIES= libinput.la
libinput_la_SOURCES= \
byte_reader.h \
byte_readers.h \
file_reader_mmap.c \
file_reader_mmap.h \
file_reader_stdio.c \
file_reader_stdio.h \
file_ingestor_stdio.c \
@ -17,12 +15,6 @@ libinput_la_SOURCES= \
lrec_reader.h \
lrec_reader_gen.c \
lrec_reader_in_memory.c \
lrec_reader_mmap_csv.c \
lrec_reader_mmap_csvlite.c \
lrec_reader_mmap_dkvp.c \
lrec_reader_mmap_json.c \
lrec_reader_mmap_nidx.c \
lrec_reader_mmap_xtab.c \
lrec_reader_stdio_csv.c \
lrec_reader_stdio_csvlite.c \
lrec_reader_stdio_dkvp.c \
@ -31,7 +23,6 @@ libinput_la_SOURCES= \
lrec_reader_stdio_xtab.c \
lrec_readers.c \
lrec_readers.h \
mmap_byte_reader.c \
peek_file_reader.c \
peek_file_reader.h \
stdio_byte_reader.c \

View file

@ -4,10 +4,8 @@
byte_reader_t* string_byte_reader_alloc();
byte_reader_t* stdio_byte_reader_alloc();
byte_reader_t* mmap_byte_reader_alloc();
void string_byte_reader_free(byte_reader_t* pbr);
void stdio_byte_reader_free(byte_reader_t* pbr);
void mmap_byte_reader_free(byte_reader_t* pbr);
#endif // BYTE_READERS_H

View file

@ -1,84 +0,0 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include "lib/mlr_arch.h"
#include "lib/mlrutil.h"
#include "lib/mlr_globals.h"
#include "file_reader_mmap.h"
#if MLR_ARCH_MMAP_ENABLED
static char empty_buf[1] = { 0 };
#endif
// ----------------------------------------------------------------
// Maps the entire contents of file_name into memory and returns a newly
// allocated handle whose sol/eof pointers delimit those contents.
//
// prepipe must be NULL: a non-null value is reported as a coding error.
// Every failure path (non-null prepipe, open, fstat, mmap, close) prints a
// diagnostic to stderr and exits the process with status 1, so this function
// returns only on success. On builds where MLR_ARCH_MMAP_ENABLED is false it
// always exits.
file_reader_mmap_state_t* file_reader_mmap_open(char* prepipe, char* file_name) {
#if !MLR_ARCH_MMAP_ENABLED
	fprintf(stderr, "%s: mmap is unsupported on this architecture.\n", MLR_GLOBALS.bargv0);
	exit(1);
	return NULL;
#else
	// A prepipe means popen, which belongs to stdio rather than mmap; callers
	// are expected never to route prepipe input here.
	if (prepipe != NULL) {
		fprintf(stderr, "%s: coding error detected in file %s at line %d.\n",
			MLR_GLOBALS.bargv0, __FILE__, __LINE__);
		exit(1);
	}

	file_reader_mmap_state_t* phandle = mlr_malloc_or_die(sizeof(file_reader_mmap_state_t));

	int fd = open(file_name, O_RDONLY);
	phandle->fd = fd;
	if (fd < 0) {
		perror("open");
		fprintf(stderr, "%s: could not open \"%s\"\n", MLR_GLOBALS.bargv0, file_name);
		exit(1);
	}

	struct stat file_info;
	if (fstat(fd, &file_info) < 0) {
		perror("fstat");
		fprintf(stderr, "%s: could not fstat \"%s\"\n", MLR_GLOBALS.bargv0, file_name);
		exit(1);
	}

	if (file_info.st_size != 0) {
		// Private, writable mapping: stores made through it are not written
		// back to the file.
		phandle->sol = mmap(NULL, (size_t)file_info.st_size, PROT_READ|PROT_WRITE,
			MAP_FILE|MAP_PRIVATE, fd, (off_t)0);
		if (phandle->sol == MAP_FAILED) {
			perror("mmap");
			fprintf(stderr, "%s: could not mmap \"%s\"\n", MLR_GLOBALS.bargv0, file_name);
			exit(1);
		}
	} else {
		// Zero-length files exist but cannot be mmapped; point at a static
		// one-byte buffer instead.
		phandle->sol = &empty_buf[0];
	}
	phandle->eof = phandle->sol + file_info.st_size;

	// POSIX semantics: the mapping holds its own reference to the file, so the
	// descriptor can be closed now while the mapping stays valid.
	if (close(fd) < 0) {
		perror("close");
		exit(1);
	}

	return phandle;
#endif
}
// ----------------------------------------------------------------
// Releases the handle only; the file contents are deliberately left mapped
// (no munmap).
//
// The lrec readers that use this handle build records whose keys and values
// point straight into the mmapped file contents, which saves data copies. Those
// records may outlive the input file -- for example, mlr sort over multiple
// files retains them -- so the mapping must stay in place after close.
void file_reader_mmap_close(file_reader_mmap_state_t* pstate, char* prepipe) {
	(void)prepipe; // not used by the mmap close path
	free(pstate);
}
// ----------------------------------------------------------------
// void*-typed wrapper around file_reader_mmap_open. The reader-state argument
// is not consulted.
void* file_reader_mmap_vopen(void* pvstate, char* prepipe, char* file_name) {
	(void)pvstate;
	void* pvhandle = file_reader_mmap_open(prepipe, file_name);
	return pvhandle;
}
// ----------------------------------------------------------------
// void*-typed wrapper around file_reader_mmap_close. The reader-state argument
// is not consulted; pvhandle is the handle to release.
void file_reader_mmap_vclose(void* pvstate, void* pvhandle, char* prepipe) {
	(void)pvstate;
	file_reader_mmap_close(pvhandle, prepipe);
}

View file

@ -1,20 +0,0 @@
// ================================================================
// Abstraction layer for mmapped file-read logic.
// ================================================================
#ifndef FILE_READER_MMAP_H
#define FILE_READER_MMAP_H
// Handle for one mmapped input file.
typedef struct _file_reader_mmap_state_t {
// Current read position within the file contents.
char* sol;
// One past the last byte of the file contents.
char* eof;
// File descriptor used to open the file.
int fd;
} file_reader_mmap_state_t;
file_reader_mmap_state_t* file_reader_mmap_open(char* prepipe, char* file_name);
void file_reader_mmap_close(file_reader_mmap_state_t* pstate, char* prepipe);
void* file_reader_mmap_vopen(void* pvstate, char* prepipe, char* file_name);
void file_reader_mmap_vclose(void* pvstate, void* pvhandle, char* prepipe);
#endif // FILE_READER_MMAP_H

View file

@ -4,7 +4,6 @@
#include <stdio.h>
#include "lib/context.h"
#include "containers/lrec.h"
#include "input/file_reader_mmap.h"
struct _lrec_reader_t; // forward reference for method declarations

View file

@ -2,7 +2,6 @@
#include <stdlib.h>
#include "lib/mlr_globals.h"
#include "lib/mlrutil.h"
#include "input/file_reader_mmap.h"
#include "input/lrec_readers.h"
typedef struct _lrec_reader_gen_state_t {

View file

@ -1,546 +0,0 @@
// ================================================================
// Note: there are multiple process methods with a lot of code duplication.
// This is intentional. Much of Miller's measured processing time is in the
// lrec-reader process methods. This is code which needs to execute on every
// byte of input and even moving a single runtime if-statement into a
// function-pointer assignment at alloc time can have noticeable effects on
// performance (5-10% in some cases).
// ================================================================
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include "cli/comment_handling.h"
#include "lib/mlr_globals.h"
#include "lib/mlrutil.h"
#include "lib/string_builder.h"
#include "input/file_reader_mmap.h"
#include "input/lrec_readers.h"
#include "input/peek_file_reader.h"
#include "containers/rslls.h"
#include "containers/lhmslv.h"
#include "containers/parse_trie.h"
// Idea of pheader_keepers: each header_keeper object retains the input-line backing
// and the slls_t for a CSV header line which is used by one or more CSV data
// lines. Meanwhile some mappers retain input records from the entire data
// stream, including header-schema changes in the input stream. This means we
// need to keep headers intact as long as any lrecs are pointing to them. One
// option is reference-counting which I experimented with; it was messy and
// error-prone. The approach used here is to keep a hash map from header-schema
// to header_keeper object. The current pheader_keeper is a pointer into one of
// those. Then when the reader is freed, all the header-keepers are freed.
// ----------------------------------------------------------------
#define STRING_BUILDER_INIT_SIZE 1024
#define IRS_TOKEN 0x2001
#define IFS_TOKEN 0x2002
#define DQUOTE_TOKEN 0x2003
#define DQUOTE_IRS_TOKEN 0x2004
#define DQUOTE_IRS2_TOKEN 0x2005 // alternate line-ending for autodetect LF/CRLF
#define DQUOTE_IFS_TOKEN 0x2006
#define DQUOTE_DQUOTE_TOKEN 0x2007
// ----------------------------------------------------------------
// Per-reader state for the mmap-backed CSV reader.
typedef struct _lrec_reader_mmap_csv_state_t {
// Input line number is not the same as the record-counter in context_t,
// which counts records.
long long ilno;
// Separator strings. dquote_irs2 is non-null only when line-ending
// autodetect is on, in which case it is the CRLF variant.
char* eof;
char* irs;
char* ifs_eof;
char* ifs;
char* dquote;
char* dquote_irs;
char* dquote_irs2;
char* dquote_ifs;
char* dquote_eof;
char* dquote_dquote;
// TRUE when the IRS was given as "auto".
int do_auto_line_term;
// Comment-line handling; comment_string may be NULL, in which case
// comment_string_length is 0.
comment_handling_t comment_handling;
char* comment_string;
int comment_string_length;
// strlen of dquote.
int dquotelen;
// Field list reused across lines; it is reset after each line is consumed.
rslls_t* pfields;
// Builds values of double-quoted fields that contain embedded "" escapes.
string_builder_t* psb;
// Token matchers: one for use outside double quotes (IRS, IFS, dquote) and
// one for use inside double quotes (dquote+IRS, dquote+IFS, dquote+dquote).
parse_trie_t* pno_dquote_parse_trie;
parse_trie_t* pdquote_parse_trie;
// TRUE when the next line read should be treated as a header line.
int expect_header_line_next;
int use_implicit_csv_header;
int allow_ragged_csv_input;
// Current header, and the cache of all headers seen, keyed by header fields.
header_keeper_t* pheader_keeper;
lhmslv_t* pheader_keepers;
} lrec_reader_mmap_csv_state_t;
static void lrec_reader_mmap_csv_free(lrec_reader_t* preader);
static void lrec_reader_mmap_csv_sof(void* pvstate, void* pvhandle);
static lrec_t* lrec_reader_mmap_csv_process(void* pvstate, void* pvhandle, context_t* pctx);
static int lrec_reader_mmap_csv_get_fields(lrec_reader_mmap_csv_state_t* pstate,
rslls_t* pfields, file_reader_mmap_state_t* phandle, context_t* pctx);
static lrec_t* paste_indices_and_data(lrec_reader_mmap_csv_state_t* pstate, rslls_t* pdata_fields, context_t* pctx);
static lrec_t* paste_header_and_data_ragged(lrec_reader_mmap_csv_state_t* pstate, rslls_t* pdata_fields,
context_t* pctx);
static lrec_t* paste_header_and_data_rectangular(lrec_reader_mmap_csv_state_t* pstate, rslls_t* pdata_fields,
context_t* pctx);
// ----------------------------------------------------------------
// Allocates an mmap-backed CSV record reader.
//
// irs: record separator, or "auto" to autodetect LF vs. CRLF line endings.
// ifs: field separator.
// use_implicit_csv_header: when true, no header line is expected in the input.
// allow_ragged_csv_input: selects the ragged header/data pasting routine.
// comment_handling, comment_string: comment-line treatment; comment_string may
//   be NULL.
//
// The returned reader is released through its pfree_func.
lrec_reader_t* lrec_reader_mmap_csv_alloc(char* irs, char* ifs, int use_implicit_csv_header,
	int allow_ragged_csv_input, comment_handling_t comment_handling, char* comment_string)
{
	lrec_reader_mmap_csv_state_t* pstate = mlr_malloc_or_die(sizeof(lrec_reader_mmap_csv_state_t));

	// With autodetect, LF is the working IRS; CRLF is handled as a variant.
	int autodetect = streq(irs, "auto");
	if (autodetect)
		irs = "\n";

	pstate->ilno = 0LL;
	pstate->do_auto_line_term = autodetect ? TRUE : FALSE;

	pstate->comment_handling = comment_handling;
	pstate->comment_string = comment_string;
	if (comment_string == NULL)
		pstate->comment_string_length = 0;
	else
		pstate->comment_string_length = strlen(comment_string);

	// Separator strings
	pstate->eof           = "\xff";
	pstate->irs           = irs;
	pstate->ifs           = ifs;
	pstate->ifs_eof       = mlr_paste_2_strings(pstate->ifs, "\xff");
	pstate->dquote        = "\"";
	pstate->dquote_ifs    = mlr_paste_2_strings("\"", pstate->ifs);
	pstate->dquote_eof    = "\"\xff";
	pstate->dquote_dquote = "\"\"";
	pstate->dquotelen     = strlen(pstate->dquote);

	// Matcher for use outside of double quotes
	parse_trie_t* punquoted_trie = parse_trie_alloc();
	parse_trie_add_string(punquoted_trie, pstate->irs, IRS_TOKEN);
	parse_trie_add_string(punquoted_trie, pstate->ifs, IFS_TOKEN);
	parse_trie_add_string(punquoted_trie, pstate->dquote, DQUOTE_TOKEN);
	pstate->pno_dquote_parse_trie = punquoted_trie;

	// Matcher for use inside of double quotes. The primary dquote+IRS string is
	// built from the working IRS, which is already "\n" under autodetect; the
	// CRLF alternate exists only under autodetect.
	parse_trie_t* pquoted_trie = parse_trie_alloc();
	pstate->pdquote_parse_trie = pquoted_trie;
	pstate->dquote_irs = mlr_paste_2_strings("\"", irs);
	pstate->dquote_irs2 = autodetect ? mlr_paste_2_strings("\"", "\r\n") : NULL;
	parse_trie_add_string(pquoted_trie, pstate->dquote_irs, DQUOTE_IRS_TOKEN);
	if (pstate->dquote_irs2 != NULL)
		parse_trie_add_string(pquoted_trie, pstate->dquote_irs2, DQUOTE_IRS2_TOKEN);
	parse_trie_add_string(pquoted_trie, pstate->dquote_ifs, DQUOTE_IFS_TOKEN);
	parse_trie_add_string(pquoted_trie, pstate->dquote_dquote, DQUOTE_DQUOTE_TOKEN);

	// Working storage
	pstate->pfields = rslls_alloc();
	pstate->psb = sb_alloc(STRING_BUILDER_INIT_SIZE);

	// Header handling
	if (use_implicit_csv_header)
		pstate->expect_header_line_next = FALSE;
	else
		pstate->expect_header_line_next = TRUE;
	pstate->use_implicit_csv_header = use_implicit_csv_header;
	pstate->allow_ragged_csv_input = allow_ragged_csv_input;
	pstate->pheader_keeper = NULL;
	pstate->pheader_keepers = lhmslv_alloc();

	// Method table
	lrec_reader_t* plrec_reader = mlr_malloc_or_die(sizeof(lrec_reader_t));
	plrec_reader->pvstate       = (void*)pstate;
	plrec_reader->popen_func    = file_reader_mmap_vopen;
	plrec_reader->pclose_func   = file_reader_mmap_vclose;
	plrec_reader->pprocess_func = lrec_reader_mmap_csv_process;
	plrec_reader->psof_func     = lrec_reader_mmap_csv_sof;
	plrec_reader->pfree_func    = lrec_reader_mmap_csv_free;

	return plrec_reader;
}
// ----------------------------------------------------------------
// Frees the reader and everything its state owns: every cached header keeper,
// the header cache itself, both parse tries, the field list, the string
// builder, and the separator strings that were heap-built at alloc time.
static void lrec_reader_mmap_csv_free(lrec_reader_t* preader) {
	lrec_reader_mmap_csv_state_t* pstate = preader->pvstate;

	// The cache does not free its values, so release each header keeper first.
	lhmslve_t* pentry = pstate->pheader_keepers->phead;
	while (pentry != NULL) {
		header_keeper_free(pentry->pvvalue);
		pentry = pentry->pnext;
	}
	lhmslv_free(pstate->pheader_keepers);

	parse_trie_free(pstate->pno_dquote_parse_trie);
	parse_trie_free(pstate->pdquote_parse_trie);
	rslls_free(pstate->pfields);
	sb_free(pstate->psb);

	// Only the pasted strings are heap-allocated; the other separators are
	// literals or caller-owned. dquote_irs2 may be NULL, which free accepts.
	free(pstate->dquote_ifs);
	free(pstate->dquote_irs2);
	free(pstate->dquote_irs);
	free(pstate->ifs_eof);

	free(pstate);
	free(preader);
}
// ----------------------------------------------------------------
// Start-of-file hook: resets the per-file line counter and header expectation,
// then steps the handle's read position past a leading UTF-8 byte-order mark
// if the file begins with one.
static void lrec_reader_mmap_csv_sof(void* pvstate, void* pvhandle) {
	lrec_reader_mmap_csv_state_t* pstate = pvstate;
	file_reader_mmap_state_t* phandle = pvhandle;

	pstate->ilno = 0LL;
	if (pstate->use_implicit_csv_header)
		pstate->expect_header_line_next = FALSE;
	else
		pstate->expect_header_line_next = TRUE;

	// Only inspect the first three bytes when the file has at least that many.
	if ((phandle->eof - phandle->sol) >= 3 && memcmp(phandle->sol, "\xef\xbb\xbf", 3) == 0)
		phandle->sol += 3;
}
// ----------------------------------------------------------------
// Returns the next record from the mmapped file, or NULL at end of file.
//
// If a header line is expected, field lists are read until one is found which
// is not a comment line; that one becomes the current header. An empty header
// field is a fatal error (message to stderr, exit 1). Headers are cached in
// pheader_keepers so that a repeated header schema re-uses its header keeper.
//
// Data lines are then read until one is found which is not a comment line, and
// it is pasted together with the header (or with positional indices when
// use_implicit_csv_header is set).
//
// A line counts as a comment when comment_string is non-null and the line's
// first field matches it per streqn over comment_string_length characters.
// Comment lines are skipped; with PASS_COMMENTS they are first written to
// stdout, with fields rejoined by the IFS and followed by the IRS (or by the
// autodetected line terminator).
static lrec_t* lrec_reader_mmap_csv_process(void* pvstate, void* pvhandle, context_t* pctx) {
lrec_reader_mmap_csv_state_t* pstate = pvstate;
file_reader_mmap_state_t* phandle = pvhandle;
// Ingest the next header line, if expected
if (pstate->expect_header_line_next) {
while (TRUE) {
// Here the line counter is bumped only after a successful read.
if (!lrec_reader_mmap_csv_get_fields(pstate, pstate->pfields, phandle, pctx))
return NULL;
pstate->ilno++;
// We check for comments here rather than within the parser since it's important
// for users to be able to comment out lines containing double-quoted newlines.
if (pstate->comment_string != NULL && pstate->pfields->phead != NULL) {
if (streqn(pstate->pfields->phead->value, pstate->comment_string, pstate->comment_string_length)) {
if (pstate->comment_handling == PASS_COMMENTS) {
int i = 0;
for (
rsllse_t* pe = pstate->pfields->phead;
i < pstate->pfields->length && pe != NULL;
pe = pe->pnext, i++)
{
if (i > 0)
fputs(pstate->ifs, stdout);
fputs(pe->value, stdout);
}
if (pstate->do_auto_line_term) {
fputs(pctx->auto_line_term, stdout);
} else {
fputs(pstate->irs, stdout);
}
}
rslls_reset(pstate->pfields);
continue;
}
}
slls_t* pheader_fields = slls_alloc();
int i = 0;
for (rsllse_t* pe = pstate->pfields->phead; i < pstate->pfields->length && pe != NULL; pe = pe->pnext, i++) {
if (*pe->value == 0) {
fprintf(stderr, "%s: unacceptable empty CSV key at file \"%s\" line %lld.\n",
MLR_GLOBALS.bargv0, pctx->filename, pstate->ilno);
exit(1);
}
// Transfer pointer-free responsibility from the rslls to the
// header fields in the header keeper
slls_append(pheader_fields, pe->value, pe->free_flag);
pe->free_flag = 0;
}
rslls_reset(pstate->pfields);
pstate->pheader_keeper = lhmslv_get(pstate->pheader_keepers, pheader_fields);
if (pstate->pheader_keeper == NULL) {
pstate->pheader_keeper = header_keeper_alloc(NULL, pheader_fields);
lhmslv_put(pstate->pheader_keepers, pheader_fields, pstate->pheader_keeper,
NO_FREE); // freed by header-keeper
} else { // Re-use the header-keeper in the header cache
slls_free(pheader_fields);
}
pstate->expect_header_line_next = FALSE;
break;
}
}
// Ingest the next data line, if expected
while (TRUE) {
// Unlike the header loop above, the line counter is bumped before the
// end-of-file check.
int rc = lrec_reader_mmap_csv_get_fields(pstate, pstate->pfields, phandle, pctx);
pstate->ilno++;
if (rc == FALSE) // EOF
return NULL;
// We check for comments here rather than within the parser since it's important
// for users to be able to comment out lines containing double-quoted newlines.
if (pstate->comment_string != NULL && pstate->pfields->phead != NULL) {
if (streqn(pstate->pfields->phead->value, pstate->comment_string, pstate->comment_string_length)) {
if (pstate->comment_handling == PASS_COMMENTS) {
int i = 0;
for (
rsllse_t* pe = pstate->pfields->phead;
i < pstate->pfields->length && pe != NULL;
pe = pe->pnext, i++)
{
if (i > 0)
fputs(pstate->ifs, stdout);
fputs(pe->value, stdout);
}
if (pstate->do_auto_line_term) {
fputs(pctx->auto_line_term, stdout);
} else {
fputs(pstate->irs, stdout);
}
}
rslls_reset(pstate->pfields);
continue;
}
}
// Choose the pasting routine: positional keys, ragged, or strictly rectangular.
lrec_t* prec = pstate->use_implicit_csv_header
? paste_indices_and_data(pstate, pstate->pfields, pctx)
: pstate->allow_ragged_csv_input
? paste_header_and_data_ragged(pstate, pstate->pfields, pctx)
: paste_header_and_data_rectangular(pstate, pstate->pfields, pctx);
rslls_reset(pstate->pfields);
return prec;
}
}
// Splits the next line of the mmapped file into fields, appending them to
// pfields, and advances phandle->sol past what was consumed.
//
// Returns FALSE, appending nothing, if phandle->sol is already at or beyond
// phandle->eof; returns TRUE otherwise.
//
// Wherever possible the fields point directly into the file buffer (NO_FREE):
// the first byte of the matched separator is overwritten with a null character
// to terminate the field in place. Two cases are heap-allocated instead
// (FREE_ENTRY_VALUE): a final unquoted field which runs to end of file with no
// line terminator, and a double-quoted field containing "" escapes.
//
// Fatal errors (message to stderr, exit 1): a double quote inside an unquoted
// field; a double-quoted field left unclosed at end of file; an unexpected
// token from either parse trie.
//
// NOTE(review): *e is read at the start of every field without first checking
// e < phandle->eof. If the input ends with an IFS, e could equal eof at that
// point -- confirm whether this read is safe.
static int lrec_reader_mmap_csv_get_fields(lrec_reader_mmap_csv_state_t* pstate,
rslls_t* pfields, file_reader_mmap_state_t* phandle, context_t* pctx)
{
int rc, token = 0, matchlen = 0, record_done = FALSE, field_done = FALSE;
string_builder_t* psb = pstate->psb;
if (phandle->sol >= phandle->eof)
return FALSE;
// p marks the start of the current field; e is the scan position.
char* p = phandle->sol;
char* e = p;
// loop over fields in record
record_done = FALSE;
while (!record_done) {
// Assumption is dquote is "\""
if (*e != pstate->dquote[0]) { // start of non-quoted field
// Loop over characters in field
field_done = FALSE;
while (!field_done) {
MLR_INTERNAL_CODING_ERROR_IF(e > phandle->eof);
rc = parse_trie_match(pstate->pno_dquote_parse_trie, e, phandle->eof, &token, &matchlen);
if (rc) {
switch(token) {
case IFS_TOKEN: // end of field
*e = 0;
rslls_append(pfields, p, NO_FREE, 0);
p = e + matchlen;
field_done = TRUE;
break;
case IRS_TOKEN: // end of record
*e = 0;
if (pstate->do_auto_line_term) {
// A carriage return just before the matched IRS means the line ended in CRLF.
if (e > p && e[-1] == '\r') {
e[-1] = 0;
context_set_autodetected_crlf(pctx);
} else {
context_set_autodetected_lf(pctx);
}
}
rslls_append(pfields, p, NO_FREE, 0);
p = e + matchlen;
field_done = TRUE;
record_done = TRUE;
break;
case DQUOTE_TOKEN: // CSV syntax error: fields containing quotes must be fully wrapped in quotes
fprintf(stderr, "%s: syntax error: unwrapped double quote at line %lld.\n",
MLR_GLOBALS.bargv0, pstate->ilno);
exit(1);
break;
default:
fprintf(stderr, "%s: internal coding error: unexpected token %d at line %lld.\n",
MLR_GLOBALS.bargv0, token, pstate->ilno);
exit(1);
break;
}
e += matchlen;
} else if (e >= phandle->eof) {
// We read to end of file without seeing end of line. We can't always zero-poke a null character to
// terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's
// our copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking
// at EOF is one byte past the page and that will segv us.
char* copy = mlr_alloc_string_from_char_range(p, phandle->eof - p);
rslls_append(pfields, copy, FREE_ENTRY_VALUE, 0);
p = e + matchlen;
field_done = TRUE;
record_done = TRUE;
break;
} else {
e++;
}
}
} else { // start of quoted field
// Step past the opening double quote.
e += pstate->dquotelen;
p = e;
// loop over characters in field
field_done = FALSE;
int contiguous = TRUE;
// If there are no embedded double-double quotes, then the field value is a contiguous
// array of bytes between the start and end double-quotes (non-inclusive). E.g. "ab,c"
// has contents ab,c. In that case we can point the rslls at that range of bytes
// with no data-copying. However, if there are embedded double-double quotes, then
// we use the string-build logic to build up a dynamically allocated string. E.g.
// "ab""c" becomes ab"c.
while (!field_done) {
if (e >= phandle->eof) {
fprintf(stderr, "%s: unmatched double quote at line %lld.\n",
MLR_GLOBALS.bargv0, pstate->ilno);
exit(1);
}
rc = parse_trie_match(pstate->pdquote_parse_trie, e, phandle->eof, &token, &matchlen);
if (rc) {
switch(token) {
case DQUOTE_IFS_TOKEN: // end of field
*e = 0;
if (contiguous)
rslls_append(pfields, p, NO_FREE, FIELD_QUOTED_ON_INPUT);
else
rslls_append(pfields, sb_finish(psb), FREE_ENTRY_VALUE, FIELD_QUOTED_ON_INPUT);
p = e + matchlen;
field_done = TRUE;
break;
case DQUOTE_IRS_TOKEN: // end of record
case DQUOTE_IRS2_TOKEN: // end of record
*e = 0;
if (pstate->do_auto_line_term) {
if (e > p && e[-1] == '\r') {
e[-1] = 0;
context_set_autodetected_crlf(pctx);
} else {
context_set_autodetected_lf(pctx);
}
}
if (contiguous)
rslls_append(pfields, p, NO_FREE, FIELD_QUOTED_ON_INPUT);
else
rslls_append(pfields, sb_finish(psb), FREE_ENTRY_VALUE, FIELD_QUOTED_ON_INPUT);
p = e + matchlen;
field_done = TRUE;
record_done = TRUE;
break;
case DQUOTE_DQUOTE_TOKEN: // RFC-4180 CSV: "" inside a dquoted field is an escape for "
if (contiguous) { // not anymore it isn't
sb_append_char_range(psb, p, e);
contiguous = FALSE;
} else {
sb_append_char(psb, pstate->dquote[0]);
}
break;
default:
fprintf(stderr, "%s: internal coding error: unexpected token %d at line %lld.\n",
MLR_GLOBALS.bargv0, token, pstate->ilno);
exit(1);
break;
}
e += matchlen;
} else {
// Ordinary character: it needs copying only once the field is being string-built.
if (!contiguous)
sb_append_char(psb, *e);
e++;
}
}
}
}
phandle->sol = e;
return TRUE;
}
// ----------------------------------------------------------------
// Builds a record for implicit-header mode: the keys are the 1-up field
// positions "1", "2", ... and the values are the data fields in order.
// At most pdata_fields->length entries are consumed, even if the underlying
// linked list is longer.
static lrec_t* paste_indices_and_data(lrec_reader_mmap_csv_state_t* pstate, rslls_t* pdata_fields,
	context_t* pctx)
{
	lrec_t* prec = lrec_unbacked_alloc();

	int position = 0;
	rsllse_t* pfield = pdata_fields->phead;
	while (position < pdata_fields->length && pfield != NULL) {
		position++;

		char key_flags = 0;
		char* key = low_int_to_string(position, &key_flags);

		// The record takes over responsibility for freeing the value, so the
		// field list's own free flag is cleared afterward.
		char value_flags = pfield->free_flag;
		lrec_put_ext(prec, key, pfield->value, key_flags | value_flags, pfield->quote_flag);
		pfield->free_flag = 0;

		pfield = pfield->pnext;
	}

	return prec;
}
// ----------------------------------------------------------------
// Builds a record from the current header and one line of data fields,
// tolerating a length mismatch between the two:
//   * fields present in both are paired header-name -> data-value;
//   * if the header is longer, the leftover header names get empty values;
//   * if the data line is longer, the leftover values are keyed by their
//     1-up position.
//
// pctx is not used here.
// Free responsibility for each data value moves from the rslls entry to the
// record (the entry's free_flag is cleared after the put).
static lrec_t* paste_header_and_data_ragged(lrec_reader_mmap_csv_state_t* pstate, rslls_t* pdata_fields,
	context_t* pctx)
{
	int num_names = pstate->pheader_keeper->pkeys->length;
	int num_values = pdata_fields->length;
	sllse_t* pname = pstate->pheader_keeper->pkeys->phead;
	rsllse_t* pvalue = pdata_fields->phead;
	lrec_t* prec = lrec_unbacked_alloc();
	int i = 0;

	// Common prefix. The rslls next-pointers can be non-null past the logical
	// end of the list (pointer reuse), so the length attributes bound the walk.
	while (i < num_names && i < num_values) {
		lrec_put_ext(prec, pname->value, pvalue->value, pvalue->free_flag, pvalue->quote_flag);
		pvalue->free_flag = 0;
		i++;
		pname = pname->pnext;
		pvalue = pvalue->pnext;
	}

	if (num_names > num_values) {
		// Header is longer: e.g. header a,b,c,d with data 1,2 yields c="", d="".
		while (i < num_names) {
			lrec_put_ext(prec, pname->value, "", NO_FREE, 0);
			i++;
			pname = pname->pnext;
		}
	} else {
		// Data is longer (or equal, in which case this loop does nothing):
		// label the extra values by position.
		while (i < num_values) {
			char key_flags = 0;
			char* pkey = low_int_to_string(i + 1, &key_flags);
			lrec_put_ext(prec, pkey, pvalue->value, key_flags | pvalue->free_flag, pvalue->quote_flag);
			pvalue->free_flag = 0;
			i++;
			pvalue = pvalue->pnext;
		}
	}

	return prec;
}
// ----------------------------------------------------------------
// Builds a record by pairing each header name with the data field in the same
// position. The header and the data line must have the same number of fields;
// otherwise an error naming the file and line is printed to stderr and the
// process exits with status 1.
//
// Free responsibility for each data value moves from the rslls entry to the
// record (the entry's free_flag is cleared after the put).
static lrec_t* paste_header_and_data_rectangular(lrec_reader_mmap_csv_state_t* pstate, rslls_t* pdata_fields,
	context_t* pctx)
{
	if (pstate->pheader_keeper->pkeys->length != pdata_fields->length) {
		fprintf(stderr, "%s: Header/data length mismatch (%llu != %llu) at file \"%s\" line %lld.\n",
			MLR_GLOBALS.bargv0, pstate->pheader_keeper->pkeys->length, pdata_fields->length,
			pctx->filename, pstate->ilno);
		exit(1);
	}

	lrec_t* prec = lrec_unbacked_alloc();
	sllse_t* pname = pstate->pheader_keeper->pkeys->phead;
	rsllse_t* pvalue = pdata_fields->phead;

	while (pname != NULL && pvalue != NULL) {
		lrec_put_ext(prec, pname->value, pvalue->value, pvalue->free_flag, pvalue->quote_flag);
		pvalue->free_flag = 0;
		pname = pname->pnext;
		pvalue = pvalue->pnext;
	}

	return prec;
}

View file

@ -1,876 +0,0 @@
// ================================================================
// Note: there are multiple process methods with a lot of code duplication.
// This is intentional. Much of Miller's measured processing time is in the
// lrec-reader process methods. This is code which needs to execute on every
// byte of input and even moving a single runtime if-statement into a
// function-pointer assignment at alloc time can have noticeable effects on
// performance (5-10% in some cases).
// ================================================================
#include <stdio.h>
#include <stdlib.h>
#include "cli/comment_handling.h"
#include "lib/mlr_globals.h"
#include "lib/mlrutil.h"
#include "containers/slls.h"
#include "containers/lhmslv.h"
#include "input/file_reader_mmap.h"
#include "input/lrec_readers.h"
// ----------------------------------------------------------------
// Multi-file cases:
//
// a,a                 a,b                 c                   d
// -- FILE1:           -- FILE1:           -- FILE1:           -- FILE1:
// a,b,c               a,b,c               a,b,c               a,b,c
// 1,2,3               1,2,3               1,2,3               1,2,3
// 4,5,6               4,5,6               4,5,6               4,5,6
// -- FILE2:           -- FILE2:
// a,b,c               d,e,f,g             a,b,c               d,e,f
// 7,8,9               3,4,5,6             7,8,9               3,4,5
// --OUTPUT:           --OUTPUT:           --OUTPUT:           --OUTPUT:
// a,b,c               a,b,c               a,b,c               a,b,c
// 1,2,3               1,2,3               1,2,3               1,2,3
// 4,5,6               4,5,6               4,5,6               4,5,6
// 7,8,9                                   7,8,9
//                     d,e,f,g                                 d,e,f
//                     3,4,5,6                                 3,4,5
// ----------------------------------------------------------------
// Per-reader state for the mmap-backed CSV-lite reader.
typedef struct _lrec_reader_mmap_csvlite_state_t {
// Zeroed at alloc and at each start of file; not otherwise touched in this file.
long long ifnr;
long long ilno; // Line-level, not record-level as in context_t
// Record and field separators. When the caller passes "auto" as the record
// separator, irs is replaced by "\n" and do_auto_line_term is set.
char* irs;
char* ifs;
// Lengths of irs and ifs; they select the single-character vs.
// multi-character parsing functions.
int irslen;
int ifslen;
// When true, a run of consecutive field separators is skipped as one.
int allow_repeat_ifs;
// When true, the single-separator parsers strip a '\r' preceding the line
// terminator and record LF vs. CRLF in the context.
int do_auto_line_term;
// When true, no header line is read; keys are 1-up positional integers.
int use_implicit_csv_header;
// When true, a header/data field-count mismatch is tolerated instead of fatal.
int allow_ragged_csv_input;
// Lines starting with comment_string are consumed; they are echoed to stdout
// when comment_handling is PASS_COMMENTS. A NULL comment_string disables
// comment handling.
comment_handling_t comment_handling;
char* comment_string;
int comment_string_length;
// True when the next non-blank, non-comment line is to be parsed as a header.
int expect_header_line_next;
// Header keeper for the current stanza, and the cache of all header keepers
// seen so far, looked up by header-field list.
header_keeper_t* pheader_keeper;
lhmslv_t* pheader_keepers;
} lrec_reader_mmap_csvlite_state_t;
static void lrec_reader_mmap_csvlite_free(lrec_reader_t* preader);
static void lrec_reader_mmap_csvlite_sof(void* pvstate, void* pvhandle);
static lrec_t* lrec_reader_mmap_csvlite_process_single_seps(void* pvstate, void* pvhandle, context_t* pctx);
static lrec_t* lrec_reader_mmap_csvlite_process_multi_seps(void* pvstate, void* pvhandle, context_t* pctx);
static slls_t* lrec_reader_mmap_csvlite_get_header_single_seps(file_reader_mmap_state_t* phandle,
lrec_reader_mmap_csvlite_state_t* pstate, context_t* pctx);
static slls_t* lrec_reader_mmap_csvlite_get_header_multi_seps(file_reader_mmap_state_t* phandle,
lrec_reader_mmap_csvlite_state_t* pstate);
static lrec_t* lrec_reader_mmap_csvlite_get_record_single_seps(file_reader_mmap_state_t* phandle,
lrec_reader_mmap_csvlite_state_t* pstate, context_t* pctx, header_keeper_t* pheader_keeper, int* pend_of_stanza);
static lrec_t* lrec_reader_mmap_csvlite_get_record_multi_seps(file_reader_mmap_state_t* phandle,
lrec_reader_mmap_csvlite_state_t* pstate, context_t* pctx, header_keeper_t* pheader_keeper, int* pend_of_stanza);
static lrec_t* lrec_reader_mmap_csvlite_get_record_single_seps_implicit_header(file_reader_mmap_state_t* phandle,
lrec_reader_mmap_csvlite_state_t* pstate, context_t* pctx, header_keeper_t* pheader_keeper, int* pend_of_stanza);
static lrec_t* lrec_reader_mmap_csvlite_get_record_multi_seps_implicit_header(file_reader_mmap_state_t* phandle,
lrec_reader_mmap_csvlite_state_t* pstate, context_t* pctx, header_keeper_t* pheader_keeper, int* pend_of_stanza);
static int handle_comment_line_single_irs(
file_reader_mmap_state_t* phandle,
lrec_reader_mmap_csvlite_state_t* pstate,
char irs);
static int handle_comment_line_multi_irs(
file_reader_mmap_state_t* phandle,
lrec_reader_mmap_csvlite_state_t* pstate);
// ----------------------------------------------------------------
// Allocates an mmap-backed CSV-lite record reader.
//
// irs, ifs: record and field separators. The pointers are retained, not
//   copied. irs may be the literal string "auto", meaning lines end in either
//   "\n" or "\r\n"; in that case "\n" is used as the separator and the
//   CR-before-LF check is enabled.
// allow_repeat_ifs: treat runs of consecutive field separators as one.
// use_implicit_csv_header: do not read a header line; key fields by position.
// allow_ragged_csv_input: tolerate header/data length mismatches.
// comment_handling, comment_string: comment-line treatment; comment_string may
//   be NULL to disable it.
//
// Returns the reader; release it through its pfree_func.
lrec_reader_t* lrec_reader_mmap_csvlite_alloc(char* irs, char* ifs, int allow_repeat_ifs, int use_implicit_csv_header,
	int allow_ragged_csv_input, comment_handling_t comment_handling, char* comment_string)
{
	lrec_reader_t* plrec_reader = mlr_malloc_or_die(sizeof(lrec_reader_t));

	lrec_reader_mmap_csvlite_state_t* pstate = mlr_malloc_or_die(sizeof(lrec_reader_mmap_csvlite_state_t));
	pstate->ifnr                    = 0LL;
	// Initialised here as well as in the start-of-file hook, so the line
	// counter never holds an indeterminate value.
	pstate->ilno                    = 0LL;
	pstate->irs                     = irs;
	pstate->ifs                     = ifs;
	pstate->irslen                  = strlen(irs);
	pstate->ifslen                  = strlen(ifs);
	pstate->allow_repeat_ifs        = allow_repeat_ifs;
	pstate->do_auto_line_term       = FALSE;
	pstate->use_implicit_csv_header = use_implicit_csv_header;
	pstate->allow_ragged_csv_input  = allow_ragged_csv_input;
	pstate->comment_handling        = comment_handling;
	pstate->comment_string          = comment_string;
	pstate->comment_string_length   = comment_string == NULL ? 0 : strlen(comment_string);
	pstate->expect_header_line_next = use_implicit_csv_header ? FALSE : TRUE;
	pstate->pheader_keeper          = NULL;
	pstate->pheader_keepers         = lhmslv_alloc();

	plrec_reader->pvstate     = (void*)pstate;
	plrec_reader->popen_func  = file_reader_mmap_vopen;
	plrec_reader->pclose_func = file_reader_mmap_vclose;

	if (streq(irs, "auto")) {
		// Auto means either lines end in "\n" or "\r\n" (LF or CRLF). In
		// either case the final character is "\n". Then for autodetect we
		// simply check if there's a character in the line before the '\n', and
		// if that is '\r'.
		pstate->do_auto_line_term = TRUE;
		pstate->irs = "\n";
		pstate->irslen = 1;
		plrec_reader->pprocess_func = (pstate->ifslen == 1)
			? lrec_reader_mmap_csvlite_process_single_seps
			: lrec_reader_mmap_csvlite_process_multi_seps;
	} else {
		plrec_reader->pprocess_func = (pstate->irslen == 1 && pstate->ifslen == 1)
			? lrec_reader_mmap_csvlite_process_single_seps
			: lrec_reader_mmap_csvlite_process_multi_seps;
	}

	plrec_reader->psof_func  = lrec_reader_mmap_csvlite_sof;
	plrec_reader->pfree_func = lrec_reader_mmap_csvlite_free;

	return plrec_reader;
}
// ----------------------------------------------------------------
// Releases a reader made by lrec_reader_mmap_csvlite_alloc: every cached
// header keeper, the cache itself, the state struct, and the reader.
static void lrec_reader_mmap_csvlite_free(lrec_reader_t* preader) {
	lrec_reader_mmap_csvlite_state_t* pstate = preader->pvstate;

	lhmslve_t* pentry = pstate->pheader_keepers->phead;
	while (pentry != NULL) {
		header_keeper_t* pkeeper = pentry->pvvalue;
		header_keeper_free(pkeeper);
		pentry = pentry->pnext;
	}
	lhmslv_free(pstate->pheader_keepers);

	free(pstate);
	free(preader);
}
// Start-of-file hook: zeroes the per-file counters and, unless an implicit
// header is in use, arranges for the next line read to be treated as a header.
// pvhandle is not used.
static void lrec_reader_mmap_csvlite_sof(void* pvstate, void* pvhandle) {
	lrec_reader_mmap_csvlite_state_t* pstate = pvstate;

	pstate->ilno = 0LL;
	pstate->ifnr = 0LL;
	if (pstate->use_implicit_csv_header) {
		pstate->expect_header_line_next = FALSE;
	} else {
		pstate->expect_header_line_next = TRUE;
	}
}
// ----------------------------------------------------------------
// Process function used when the separators are single characters. Returns
// the next record from the mapped file, or NULL when there are no more.
//
// When a header line is expected it is read first. A header containing an
// empty field name is fatal (message to stderr, exit status 1). Header keepers
// are cached by field list, so a header seen earlier reuses its keeper. A
// data-side "end of stanza" signal means the next line is a new header, and
// the loop goes around again.
static lrec_t* lrec_reader_mmap_csvlite_process_single_seps(void* pvstate, void* pvhandle, context_t* pctx) {
	lrec_reader_mmap_csvlite_state_t* pstate = pvstate;
	file_reader_mmap_state_t* phandle = pvhandle;

	for (;;) {
		if (pstate->expect_header_line_next) {
			slls_t* pnames = lrec_reader_mmap_csvlite_get_header_single_seps(phandle, pstate, pctx);
			if (pnames == NULL) { // EOF
				return NULL;
			}

			sllse_t* pname = pnames->phead;
			while (pname != NULL) {
				if (pname->value[0] == 0) {
					fprintf(stderr, "%s: unacceptable empty CSV key at file \"%s\" line %lld.\n",
						MLR_GLOBALS.bargv0, pctx->filename, pstate->ilno);
					exit(1);
				}
				pname = pname->pnext;
			}

			header_keeper_t* pkeeper = lhmslv_get(pstate->pheader_keepers, pnames);
			if (pkeeper != NULL) {
				// Already cached: the freshly parsed list is redundant.
				slls_free(pnames);
			} else {
				pkeeper = header_keeper_alloc(NULL, pnames);
				lhmslv_put(pstate->pheader_keepers, pnames, pkeeper,
					NO_FREE); // freed by header-keeper
			}
			pstate->pheader_keeper = pkeeper;
			pstate->expect_header_line_next = FALSE;
		}

		int end_of_stanza = FALSE;
		lrec_t* prec;
		if (pstate->use_implicit_csv_header) {
			prec = lrec_reader_mmap_csvlite_get_record_single_seps_implicit_header(phandle, pstate, pctx,
				pstate->pheader_keeper, &end_of_stanza);
		} else {
			prec = lrec_reader_mmap_csvlite_get_record_single_seps(phandle, pstate, pctx,
				pstate->pheader_keeper, &end_of_stanza);
		}

		if (!end_of_stanza) {
			// Either a record, or NULL at EOF.
			return prec;
		}
		pstate->expect_header_line_next = TRUE;
	}
}
// Process function used when a separator is longer than one character.
// Returns the next record from the mapped file, or NULL when there are no more.
//
// When a header line is expected it is read first. A header containing an
// empty field name is fatal (message to stderr, exit status 1). Header keepers
// are cached by field list, so a header seen earlier reuses its keeper. A
// data-side "end of stanza" signal means the next line is a new header, and
// the loop goes around again.
static lrec_t* lrec_reader_mmap_csvlite_process_multi_seps(void* pvstate, void* pvhandle, context_t* pctx) {
	lrec_reader_mmap_csvlite_state_t* pstate = pvstate;
	file_reader_mmap_state_t* phandle = pvhandle;

	for (;;) {
		if (pstate->expect_header_line_next) {
			slls_t* pnames = lrec_reader_mmap_csvlite_get_header_multi_seps(phandle, pstate);
			if (pnames == NULL) { // EOF
				return NULL;
			}

			sllse_t* pname = pnames->phead;
			while (pname != NULL) {
				if (pname->value[0] == 0) {
					fprintf(stderr, "%s: unacceptable empty CSV key at file \"%s\" line %lld.\n",
						MLR_GLOBALS.bargv0, pctx->filename, pstate->ilno);
					exit(1);
				}
				pname = pname->pnext;
			}

			header_keeper_t* pkeeper = lhmslv_get(pstate->pheader_keepers, pnames);
			if (pkeeper != NULL) {
				// Already cached: the freshly parsed list is redundant.
				slls_free(pnames);
			} else {
				pkeeper = header_keeper_alloc(NULL, pnames);
				lhmslv_put(pstate->pheader_keepers, pnames, pkeeper,
					NO_FREE); // freed by header-keeper
			}
			pstate->pheader_keeper = pkeeper;
			pstate->expect_header_line_next = FALSE;
		}

		int end_of_stanza = FALSE;
		lrec_t* prec;
		if (pstate->use_implicit_csv_header) {
			prec = lrec_reader_mmap_csvlite_get_record_multi_seps_implicit_header(phandle, pstate, pctx,
				pstate->pheader_keeper, &end_of_stanza);
		} else {
			prec = lrec_reader_mmap_csvlite_get_record_multi_seps(phandle, pstate, pctx,
				pstate->pheader_keeper, &end_of_stanza);
		}

		if (!end_of_stanza) {
			// Either a record, or NULL at EOF.
			return prec;
		}
		pstate->expect_header_line_next = TRUE;
	}
}
// ----------------------------------------------------------------
// Reads one header line (single-character IRS and IFS) from the mapped file
// and returns its field names as a newly allocated list.
//
// Leading blank lines and comment lines are skipped first, each bumping ilno.
// The line is split in place: every separator byte is overwritten with NUL and
// the list entries point into the file buffer. On finding the line terminator,
// phandle->sol is advanced past it and ilno is incremented.
//
// As written, this function always returns a list (possibly empty), never
// NULL.
//
// NOTE(review): the repeat-IFS skip loops and the `*header_name` test below
// are not bounded by phandle->eof.
// NOTE(review): if the final line has no IRS, nothing here NUL-terminates the
// last name and phandle->sol is not advanced; confirm how callers cope.
static slls_t* lrec_reader_mmap_csvlite_get_header_single_seps(file_reader_mmap_state_t* phandle,
lrec_reader_mmap_csvlite_state_t* pstate, context_t* pctx)
{
char irs = pstate->irs[0];
char ifs = pstate->ifs[0];
int allow_repeat_ifs = pstate->allow_repeat_ifs;
slls_t* pheader_names = slls_alloc();
// Skip blank/comment lines and seek to header line
while (TRUE) {
if (phandle->sol < phandle->eof && *phandle->sol == irs) {
phandle->sol++;
pstate->ilno++;
continue;
}
if (pstate->comment_string != NULL && handle_comment_line_single_irs(phandle, pstate, irs)) {
continue;
}
break;
}
char* p = phandle->sol;
// With repeated-IFS enabled, leading separators are not fields.
if (allow_repeat_ifs) {
while (*p == ifs)
p++;
}
// osol: where parsing started, used below to recognise "nothing consumed".
char* osol = p;
char* header_name = p;
for ( ; p < phandle->eof && *p; ) {
if (*p == irs) {
// End of header line: terminate the last name in place.
*p = 0;
if (pstate->do_auto_line_term) {
// A '\r' just before the '\n' means CRLF; strip it and record which
// line ending was seen.
if (p > phandle->sol && p[-1] == '\r') {
p[-1] = 0;
context_set_autodetected_crlf(pctx);
} else {
context_set_autodetected_lf(pctx);
}
}
phandle->sol = p+1;
pstate->ilno++;
break;
} else if (*p == ifs) {
// End of one name: terminate it in place and start the next.
*p = 0;
slls_append_no_free(pheader_names, header_name);
p++;
if (allow_repeat_ifs) {
while (*p == ifs)
p++;
}
header_name = p;
} else {
p++;
}
}
// The last name has not been appended yet. It is dropped when it is empty
// under repeated-IFS, or when nothing at all was consumed.
if (allow_repeat_ifs && *header_name == 0) {
// OK
} else if (p == osol) {
// OK
} else {
slls_append_no_free(pheader_names, header_name);
}
return pheader_names;
}
// Reads one header line (multi-character IRS and/or IFS) from the mapped file
// and returns its field names as a newly allocated list.
//
// Leading blank lines and comment lines are skipped first, each bumping ilno.
// The line is split in place: the first byte of every separator is overwritten
// with NUL and the list entries point into the file buffer. On finding the
// line terminator, phandle->sol is advanced past it and ilno is incremented.
//
// As written, this function always returns a list (possibly empty), never
// NULL.
//
// NOTE(review): unlike the single-separator variant, do_auto_line_term is not
// consulted here, so a '\r' before the terminator stays in the last name.
// NOTE(review): the streqn() checks inside and after the scan are not bounded
// by phandle->eof; confirm they cannot read past the mapping.
static slls_t* lrec_reader_mmap_csvlite_get_header_multi_seps(file_reader_mmap_state_t* phandle,
lrec_reader_mmap_csvlite_state_t* pstate)
{
char* irs = pstate->irs;
char* ifs = pstate->ifs;
int irslen = pstate->irslen;
int ifslen = pstate->ifslen;
int allow_repeat_ifs = pstate->allow_repeat_ifs;
// Skip blank/comment lines and seek to header line
while (TRUE) {
if ((phandle->eof - phandle->sol) >= irslen && streqn(phandle->sol, irs, irslen)) {
phandle->sol += irslen;
pstate->ilno++;
continue;
}
if (pstate->comment_string != NULL && handle_comment_line_multi_irs(phandle, pstate)) {
continue;
}
break;
}
slls_t* pheader_names = slls_alloc();
// Parse the header line
char* p = phandle->sol;
// With repeated-IFS enabled, leading separators are not fields.
if (allow_repeat_ifs) {
while (streqn(p, ifs, ifslen))
p += ifslen;
}
// osol: where parsing started, used below to recognise "nothing consumed".
char* osol = p;
char* header_name = p;
for ( ; p < phandle->eof && *p; ) {
if (streqn(p, irs, irslen)) {
// End of header line: terminate the last name in place.
*p = 0;
phandle->sol = p + irslen;
pstate->ilno++;
break;
} else if (streqn(p, ifs, ifslen)) {
// End of one name: terminate it in place and start the next.
*p = 0;
slls_append_no_free(pheader_names, header_name);
p += ifslen;
if (allow_repeat_ifs) {
while (streqn(p, ifs, ifslen))
p += ifslen;
}
header_name = p;
} else {
p++;
}
}
// The last name has not been appended yet. It is dropped when it is empty
// under repeated-IFS, or when nothing at all was consumed.
if (allow_repeat_ifs && *header_name == 0) {
// OK
} else if (p == osol) {
// OK
} else {
slls_append_no_free(pheader_names, header_name);
}
return pheader_names;
}
// ----------------------------------------------------------------
// Parses one data line (single-character IRS and IFS) into a record keyed by
// the names in pheader_keeper.
//
// Comment lines are skipped first. The line is split in place: separator
// bytes are overwritten with NUL and the record's values point into the file
// buffer, except for a final field that runs to end of file without a line
// terminator, which is copied.
//
// Returns:
//   * NULL with *pend_of_stanza set to TRUE when the line is empty (a blank
//     line separates stanzas; the caller should read a new header next);
//   * NULL with *pend_of_stanza untouched at end of file;
//   * otherwise the record.
//
// A header/data field-count mismatch is fatal (message to stderr, exit status
// 1) unless allow_ragged_csv_input is set. With ragged input, extra data
// fields are keyed by 1-up position and missing ones are filled with "".
static lrec_t* lrec_reader_mmap_csvlite_get_record_single_seps(file_reader_mmap_state_t* phandle,
	lrec_reader_mmap_csvlite_state_t* pstate, context_t* pctx, header_keeper_t* pheader_keeper, int* pend_of_stanza)
{
	char irs = pstate->irs[0];
	char ifs = pstate->ifs[0];
	int allow_repeat_ifs = pstate->allow_repeat_ifs;

	// Skip comment lines
	if (pstate->comment_string != NULL) {
		while (handle_comment_line_single_irs(phandle, pstate, irs))
			;
	}

	if (phandle->sol >= phandle->eof)
		return NULL;

	char* line = phandle->sol;
	lrec_t* prec = lrec_unbacked_alloc();
	sllse_t* pe = pheader_keeper->pkeys->phead;

	char* p = line;
	if (allow_repeat_ifs) {
		while (*p == ifs)
			p++;
	}
	char* key = NULL;
	char* value = p;
	int saw_rs = FALSE;
	int idx = 0;

	for ( ; p < phandle->eof && *p; ) {
		if (*p == irs) {
			if (p == line) {
				// Blank line: schema change.
				*pend_of_stanza = TRUE;
				lrec_free(prec);
				return NULL;
			}
			*p = 0;
			if (pstate->do_auto_line_term) {
				if (p > line && p[-1] == '\r') {
					p[-1] = 0;
					context_set_autodetected_crlf(pctx);
				} else {
					context_set_autodetected_lf(pctx);
				}
			}
			phandle->sol = p+1;
			pstate->ilno++;
			saw_rs = TRUE;
			break;
		} else if (*p == ifs) {
			*p = 0;
			idx++;
			if (pe == NULL) {
				// Data line has more fields than the header line did
				if (pstate->allow_ragged_csv_input) {
					char free_flags = NO_FREE;
					key = low_int_to_string(idx, &free_flags);
					lrec_put(prec, key, value, free_flags);
				} else {
					fprintf(stderr, "%s: Header-data length mismatch in file %s at line %lld.\n",
						MLR_GLOBALS.bargv0, pctx->filename, pstate->ilno);
					exit(1);
				}
			} else {
				key = pe->value;
				pe = pe->pnext;
				lrec_put(prec, key, value, NO_FREE);
			}
			p++;
			if (allow_repeat_ifs) {
				while (*p == ifs)
					p++;
			}
			value = p;
		} else {
			p++;
		}
	}
	if (p >= phandle->eof)
		phandle->sol = p+1;

	// With repeated-IFS, trailing separators do not produce an empty last field.
	if (allow_repeat_ifs && *value == 0)
		return prec;

	// Key for the final field. free_flags stays NO_FREE for a header-supplied
	// key. For a positional key it carries whatever low_int_to_string reports,
	// and is forwarded to lrec_put below just as in the in-loop ragged branch.
	char free_flags = NO_FREE;
	if (pe == NULL) {
		// Data line has more fields than the header line did
		if (pstate->allow_ragged_csv_input) {
			idx++;
			key = low_int_to_string(idx, &free_flags);
		} else {
			fprintf(stderr, "%s: Header-data length mismatch in file %s at line %lld.\n",
				MLR_GLOBALS.bargv0, pctx->filename, pstate->ilno);
			exit(1);
		}
	} else {
		key = pe->value;
	}

	if (saw_rs) {
		// Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate the
		// C string so it's OK to retain a pointer to that.
		lrec_put(prec, key, value, free_flags);
	} else {
		// Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character
		// to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our
		// copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one
		// byte past the page and that will segv us.
		char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value);
		lrec_put(prec, key, copy, free_flags | FREE_ENTRY_VALUE);
	}

	if (pe != NULL && pe->pnext != NULL) {
		// Header line has more fields than the data line did
		if (pstate->allow_ragged_csv_input) {
			for (pe = pe->pnext ; pe != NULL; pe = pe->pnext) {
				key = pe->value;
				lrec_put(prec, key, "", NO_FREE);
			}
		} else {
			fprintf(stderr, "%s: Header-data length mismatch in file %s at line %lld.\n",
				MLR_GLOBALS.bargv0, pctx->filename, pstate->ilno);
			exit(1);
		}
	}

	return prec;
}
// Parses one data line (multi-character IRS and/or IFS) into a record keyed by
// the names in pheader_keeper.
//
// Comment lines are skipped first. The line is split in place: the first byte
// of each separator is overwritten with NUL and the record's values point into
// the file buffer, except for a final field that runs to end of file without a
// line terminator, which is copied.
//
// Returns:
//   * NULL with *pend_of_stanza set to TRUE when the line is empty (a blank
//     line separates stanzas; the caller should read a new header next);
//   * NULL with *pend_of_stanza untouched at end of file;
//   * otherwise the record.
//
// A header/data field-count mismatch is fatal (message to stderr, exit status
// 1) unless allow_ragged_csv_input is set. With ragged input, extra data
// fields are keyed by 1-up position and missing ones are filled with "".
//
// NOTE(review): unlike the single-separator variant, do_auto_line_term is not
// consulted here.
static lrec_t* lrec_reader_mmap_csvlite_get_record_multi_seps(file_reader_mmap_state_t* phandle,
	lrec_reader_mmap_csvlite_state_t* pstate, context_t* pctx, header_keeper_t* pheader_keeper, int* pend_of_stanza)
{
	// Skip comment lines
	if (pstate->comment_string != NULL) {
		while (handle_comment_line_multi_irs(phandle, pstate))
			;
	}

	if (phandle->sol >= phandle->eof)
		return NULL;

	char* irs = pstate->irs;
	char* ifs = pstate->ifs;
	int irslen = pstate->irslen;
	int ifslen = pstate->ifslen;
	int allow_repeat_ifs = pstate->allow_repeat_ifs;

	lrec_t* prec = lrec_unbacked_alloc();
	char* line = phandle->sol;
	sllse_t* pe = pheader_keeper->pkeys->phead;

	char* p = line;
	if (allow_repeat_ifs) {
		while (streqn(p, ifs, ifslen))
			p += ifslen;
	}
	char* key = NULL;
	char* value = p;
	int saw_rs = FALSE;
	int idx = 0;

	for ( ; p < phandle->eof && *p; ) {
		if (streqn(p, irs, irslen)) {
			if (p == line) {
				// Blank line: schema change.
				*pend_of_stanza = TRUE;
				lrec_free(prec);
				return NULL;
			}
			*p = 0;
			phandle->sol = p + irslen;
			pstate->ilno++;
			saw_rs = TRUE;
			break;
		} else if (streqn(p, ifs, ifslen)) {
			*p = 0;
			idx++;
			if (pe == NULL) {
				// Data line has more fields than the header line did
				if (pstate->allow_ragged_csv_input) {
					char free_flags = NO_FREE;
					key = low_int_to_string(idx, &free_flags);
					lrec_put(prec, key, value, free_flags);
				} else {
					fprintf(stderr, "%s: Header-data length mismatch in file %s at line %lld.\n",
						MLR_GLOBALS.bargv0, pctx->filename, pstate->ilno);
					exit(1);
				}
			} else {
				// Only consult the header entry when there is one; pe is NULL
				// once the data line has outrun the header.
				key = pe->value;
				pe = pe->pnext;
				lrec_put(prec, key, value, NO_FREE);
			}
			p += ifslen;
			if (allow_repeat_ifs) {
				while (streqn(p, ifs, ifslen))
					p += ifslen;
			}
			value = p;
		} else {
			p++;
		}
	}
	if (p >= phandle->eof)
		phandle->sol = p+1;

	// With repeated-IFS, trailing separators do not produce an empty last field.
	if (allow_repeat_ifs && *value == 0)
		return prec;

	// Key for the final field. free_flags stays NO_FREE for a header-supplied
	// key. For a positional key it carries whatever low_int_to_string reports,
	// and is forwarded to lrec_put below just as in the in-loop ragged branch.
	char free_flags = NO_FREE;
	if (pe == NULL) {
		// Data line has more fields than the header line did
		if (pstate->allow_ragged_csv_input) {
			idx++;
			key = low_int_to_string(idx, &free_flags);
		} else {
			fprintf(stderr, "%s: Header-data length mismatch in file %s at line %lld.\n",
				MLR_GLOBALS.bargv0, pctx->filename, pstate->ilno);
			exit(1);
		}
	} else {
		key = pe->value;
	}

	if (saw_rs) {
		// Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate the
		// C string so it's OK to retain a pointer to that.
		lrec_put(prec, key, value, free_flags);
	} else {
		// Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character
		// to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our
		// copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one
		// byte past the page and that will segv us.
		char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value);
		lrec_put(prec, key, copy, free_flags | FREE_ENTRY_VALUE);
	}

	if (pe != NULL && pe->pnext != NULL) {
		// Header line has more fields than the data line did
		if (pstate->allow_ragged_csv_input) {
			for (pe = pe->pnext ; pe != NULL; pe = pe->pnext) {
				key = pe->value;
				lrec_put(prec, key, "", NO_FREE);
			}
		} else {
			fprintf(stderr, "%s: Header-data length mismatch in file %s at line %lld.\n",
				MLR_GLOBALS.bargv0, pctx->filename, pstate->ilno);
			exit(1);
		}
	}

	return prec;
}
// ----------------------------------------------------------------
// Parses one data line (single-character IRS and IFS) into a record whose keys
// are 1-up positional indices. pheader_keeper is not consulted.
//
// Comment lines are skipped first. The line is split in place: separator bytes
// are overwritten with NUL and values point into the file buffer, except for a
// final field that reaches end of file without a line terminator, which is
// copied.
//
// Returns NULL with *pend_of_stanza set for an empty line, NULL at end of
// file, otherwise the record.
static lrec_t* lrec_reader_mmap_csvlite_get_record_single_seps_implicit_header(file_reader_mmap_state_t* phandle,
	lrec_reader_mmap_csvlite_state_t* pstate, context_t* pctx, header_keeper_t* pheader_keeper, int* pend_of_stanza)
{
	char rs = pstate->irs[0];
	char fs = pstate->ifs[0];
	int skip_repeats = pstate->allow_repeat_ifs;

	// Skip comment lines
	if (pstate->comment_string != NULL) {
		while (handle_comment_line_single_irs(phandle, pstate, rs)) {
		}
	}

	if (phandle->sol >= phandle->eof)
		return NULL;

	lrec_t* prec = lrec_unbacked_alloc();
	char* line_start = phandle->sol;
	char* cursor = line_start;
	if (skip_repeats) {
		while (*cursor == fs)
			cursor++;
	}

	char* field = cursor;
	char* key = NULL;
	char key_flags = NO_FREE;
	int nfields = 0;
	int terminated = FALSE;

	while (cursor < phandle->eof && *cursor) {
		if (*cursor == rs) {
			if (cursor == line_start) {
				// Blank line: end of stanza.
				*pend_of_stanza = TRUE;
				lrec_free(prec);
				return NULL;
			}
			*cursor = 0;
			if (pstate->do_auto_line_term) {
				if (cursor > line_start && cursor[-1] == '\r') {
					cursor[-1] = 0;
					context_set_autodetected_crlf(pctx);
				} else {
					context_set_autodetected_lf(pctx);
				}
			}
			phandle->sol = cursor + 1;
			pstate->ilno++;
			terminated = TRUE;
			break;
		}

		if (*cursor != fs) {
			cursor++;
			continue;
		}

		// Field separator: close off the current field and start the next.
		*cursor = 0;
		nfields++;
		key = low_int_to_string(nfields, &key_flags);
		lrec_put(prec, key, field, key_flags);
		cursor++;
		if (skip_repeats) {
			while (*cursor == fs)
				cursor++;
		}
		field = cursor;
	}

	if (cursor >= phandle->eof)
		phandle->sol = cursor + 1;

	// With repeated-IFS, trailing separators do not produce an empty last field.
	if (skip_repeats && *field == 0)
		return prec;

	nfields++;
	key = low_int_to_string(nfields, &key_flags);
	if (!terminated) {
		// Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character
		// to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our
		// copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one
		// byte past the page and that will segv us.
		char* copy = mlr_alloc_string_from_char_range(field, phandle->eof - field);
		lrec_put(prec, key, copy, key_flags|FREE_ENTRY_VALUE);
	} else {
		// Easy and simple case: the line terminator was zero-poked, so the
		// in-buffer pointer is a valid C string.
		lrec_put(prec, key, field, key_flags);
	}

	return prec;
}
// Parses one data line (multi-character IRS and/or IFS) into a record whose
// keys are 1-up positional indices. pheader_keeper is not consulted.
//
// Comment lines are skipped first. The line is split in place: the first byte
// of each separator is overwritten with NUL and values point into the file
// buffer, except for a final field that reaches end of file without a line
// terminator, which is copied.
//
// Returns NULL with *pend_of_stanza set for an empty line, NULL at end of
// file, otherwise the record.
//
// NOTE(review): unlike the single-separator variant, do_auto_line_term is not
// consulted here.
static lrec_t* lrec_reader_mmap_csvlite_get_record_multi_seps_implicit_header(file_reader_mmap_state_t* phandle,
	lrec_reader_mmap_csvlite_state_t* pstate, context_t* pctx, header_keeper_t* pheader_keeper, int* pend_of_stanza)
{
	// Skip comment lines
	if (pstate->comment_string != NULL) {
		while (handle_comment_line_multi_irs(phandle, pstate))
			;
	}

	if (phandle->sol >= phandle->eof)
		return NULL;

	char* irs = pstate->irs;
	char* ifs = pstate->ifs;
	int irslen = pstate->irslen;
	int ifslen = pstate->ifslen;
	int allow_repeat_ifs = pstate->allow_repeat_ifs;

	lrec_t* prec = lrec_unbacked_alloc();
	char* line = phandle->sol;

	char* p = line;
	if (allow_repeat_ifs) {
		while (streqn(p, ifs, ifslen))
			p += ifslen;
	}
	char* key = NULL;
	char* value = p;
	// Initialised, matching the single-separator variant, so the flags never
	// hold an indeterminate value.
	char free_flags = NO_FREE;
	int idx = 0;
	int saw_rs = FALSE;

	for ( ; p < phandle->eof && *p; ) {
		if (streqn(p, irs, irslen)) {
			if (p == line) {
				// Blank line: end of stanza.
				*pend_of_stanza = TRUE;
				lrec_free(prec);
				return NULL;
			}
			*p = 0;
			phandle->sol = p + irslen;
			pstate->ilno++;
			saw_rs = TRUE;
			break;
		} else if (streqn(p, ifs, ifslen)) {
			*p = 0;
			key = low_int_to_string(++idx, &free_flags);
			lrec_put(prec, key, value, free_flags);
			p += ifslen;
			if (allow_repeat_ifs) {
				while (streqn(p, ifs, ifslen))
					p += ifslen;
			}
			value = p;
		} else {
			p++;
		}
	}
	if (p >= phandle->eof)
		phandle->sol = p+1;

	// With repeated-IFS, trailing separators do not produce an empty last field.
	if (allow_repeat_ifs && *value == 0)
		return prec;

	key = low_int_to_string(++idx, &free_flags);

	if (saw_rs) {
		// Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate the
		// C string so it's OK to retain a pointer to that.
		lrec_put(prec, key, value, free_flags);
	} else {
		// Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character
		// to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our
		// copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one
		// byte past the page and that will segv us.
		char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value);
		lrec_put(prec, key, copy, free_flags|FREE_ENTRY_VALUE);
	}

	return prec;
}
// ----------------------------------------------------------------
// If the current line starts with the comment string, consumes the whole line
// (through its terminating irs character, or to end of file if there is none),
// bumps ilno, and returns TRUE. When comment handling is PASS_COMMENTS, the
// consumed bytes are echoed to stdout. Otherwise nothing is consumed and FALSE
// is returned.
static int handle_comment_line_single_irs(
	file_reader_mmap_state_t* phandle,
	lrec_reader_mmap_csvlite_state_t* pstate,
	char irs)
{
	int prefix_len = pstate->comment_string_length;

	if ((phandle->eof - phandle->sol) < prefix_len
		|| !streqn(phandle->sol, pstate->comment_string, prefix_len))
	{
		return FALSE;
	}

	int echo = (pstate->comment_handling == PASS_COMMENTS);
	char* pc = phandle->sol;
	char* pprefix_end = pc + prefix_len;

	// The comment prefix itself.
	for ( ; pc < pprefix_end; pc++) {
		if (echo)
			fputc(*pc, stdout);
	}

	// The rest of the line, up to but not including the terminator.
	while (pc < phandle->eof && *pc != irs) {
		if (echo)
			fputc(*pc, stdout);
		pc++;
	}

	// The terminator, if the line has one.
	if (pc < phandle->eof) {
		if (echo)
			fputc(*pc, stdout);
		pc++;
	}

	phandle->sol = pc;
	pstate->ilno++;
	return TRUE;
}
// ----------------------------------------------------------------
// If the current line starts with the comment string, consumes the whole line
// (through its terminating IRS, or to end of file if there is none), bumps
// ilno, and returns TRUE. When comment handling is PASS_COMMENTS, the consumed
// bytes are echoed to stdout. Otherwise nothing is consumed and FALSE is
// returned.
static int handle_comment_line_multi_irs(
	file_reader_mmap_state_t* phandle,
	lrec_reader_mmap_csvlite_state_t* pstate)
{
	if ((phandle->eof - phandle->sol) < pstate->comment_string_length
		|| !streqn(phandle->sol, pstate->comment_string, pstate->comment_string_length))
	{
		return FALSE;
	}

	int pass = (pstate->comment_handling == PASS_COMMENTS);

	// The comment prefix itself.
	if (pass) {
		for (int i = 0; i < pstate->comment_string_length; i++)
			fputc(phandle->sol[i], stdout);
	}
	phandle->sol += pstate->comment_string_length;

	// The rest of the line. The scan runs all the way to end of file: when the
	// last line has no IRS, its final bytes (fewer than irslen of them) still
	// belong to the comment and must not be left behind to be parsed as data.
	while (phandle->sol < phandle->eof) {
		if ((phandle->eof - phandle->sol) >= pstate->irslen
			&& streqn(phandle->sol, pstate->irs, pstate->irslen))
		{
			if (pass) {
				for (int i = 0; i < pstate->irslen; i++)
					fputc(phandle->sol[i], stdout);
			}
			phandle->sol += pstate->irslen;
			break;
		}
		if (pass)
			fputc(*phandle->sol, stdout);
		phandle->sol++;
	}

	pstate->ilno++;
	return TRUE;
}

View file

@ -1,683 +0,0 @@
// ================================================================
// Note: there are multiple process methods with a lot of code duplication.
// This is intentional. Much of Miller's measured processing time is in the
// lrec-reader process methods. This is code which needs to execute on every
// byte of input and even moving a single runtime if-statement into a
// function-pointer assignment at alloc time can have noticeable effects on
// performance (5-10% in some cases).
// ================================================================
#include <stdio.h>
#include <stdlib.h>
#include "cli/comment_handling.h"
#include "lib/mlr_globals.h"
#include "lib/mlrutil.h"
#include "input/file_reader_mmap.h"
#include "input/lrec_readers.h"
// Per-reader state for the mmap-backed DKVP reader.
typedef struct _lrec_reader_mmap_dkvp_state_t {
// Separator strings as passed to the allocator. When the caller passes "auto"
// as irs, it is replaced by "\n" and do_auto_line_term is set.
// NOTE(review): presumably irs/ifs/ips are the record, field, and key-value
// pair separators -- confirm against the parse functions.
char* irs;
char* ifs;
char* ips;
// Lengths of the three separators; they select the single-character vs.
// multi-character process functions.
int irslen;
int ifslen;
int ipslen;
int allow_repeat_ifs;
// Set only when irs was given as "auto".
int do_auto_line_term;
// Comment-line treatment; comment_string may be NULL, in which case
// comment_string_length is 0.
comment_handling_t comment_handling;
char* comment_string;
int comment_string_length;
} lrec_reader_mmap_dkvp_state_t;
static void lrec_reader_mmap_dkvp_free(lrec_reader_t* preader);
static void lrec_reader_mmap_dkvp_sof(void* pvstate, void* pvhandle);
static lrec_t* lrec_reader_mmap_dkvp_process_single_irs_single_others(void* pvstate, void* pvhandle, context_t* pctx);
static lrec_t* lrec_reader_mmap_dkvp_process_single_irs_multi_others(void* pvstate, void* pvhandle, context_t* pctx);
static lrec_t* lrec_reader_mmap_dkvp_process_multi_irs_single_others(void* pvstate, void* pvhandle, context_t* pctx);
static lrec_t* lrec_reader_mmap_dkvp_process_multi_irs_multi_others(void* pvstate, void* pvhandle, context_t* pctx);
static lrec_t* lrec_parse_mmap_dkvp_single_irs_single_others(file_reader_mmap_state_t *phandle,
char irs, char ifs, char ips, lrec_reader_mmap_dkvp_state_t* pstate, context_t* pctx);
static lrec_t* lrec_parse_mmap_dkvp_single_irs_multi_others(file_reader_mmap_state_t *phandle,
char irs, lrec_reader_mmap_dkvp_state_t* pstate, context_t* pctx);
static lrec_t* lrec_parse_mmap_dkvp_multi_irs_single_others(file_reader_mmap_state_t *phandle,
char ifs, char ips, lrec_reader_mmap_dkvp_state_t* pstate, context_t* pctx);
static lrec_t* lrec_parse_mmap_dkvp_multi_irs_multi_others(file_reader_mmap_state_t *phandle,
lrec_reader_mmap_dkvp_state_t* pstate, context_t* pctx);
static void skip_over_comment_lines_single_irs(
file_reader_mmap_state_t *phandle,
lrec_reader_mmap_dkvp_state_t* pstate,
char irs);
static void skip_over_comment_lines_multi_irs(
file_reader_mmap_state_t *phandle,
lrec_reader_mmap_dkvp_state_t* pstate,
char* irs,
int irslen);
// ----------------------------------------------------------------
// Allocates an mmap-backed DKVP record reader.
//
// irs, ifs, ips: separator strings. The pointers are retained, not copied.
//   irs may be the literal string "auto", meaning lines end in either "\n" or
//   "\r\n"; in that case "\n" is used and the CR-before-LF check is enabled.
// allow_repeat_ifs, comment_handling, comment_string: stored in the reader
//   state; comment_string may be NULL.
//
// The process function is chosen once here, from the separator lengths.
// Returns the reader; release it through its pfree_func.
lrec_reader_t* lrec_reader_mmap_dkvp_alloc(char* irs, char* ifs, char* ips, int allow_repeat_ifs,
	comment_handling_t comment_handling, char* comment_string)
{
	lrec_reader_t* plrec_reader = mlr_malloc_or_die(sizeof(lrec_reader_t));
	lrec_reader_mmap_dkvp_state_t* pstate = mlr_malloc_or_die(sizeof(lrec_reader_mmap_dkvp_state_t));

	pstate->irs = irs;
	pstate->ifs = ifs;
	pstate->ips = ips;
	pstate->irslen = strlen(irs);
	pstate->ifslen = strlen(ifs);
	pstate->ipslen = strlen(ips);
	pstate->allow_repeat_ifs = allow_repeat_ifs;
	pstate->do_auto_line_term = FALSE;
	pstate->comment_handling = comment_handling;
	pstate->comment_string = comment_string;
	pstate->comment_string_length = comment_string == NULL ? 0 : strlen(comment_string);

	plrec_reader->pvstate = (void*)pstate;
	plrec_reader->popen_func = file_reader_mmap_vopen;
	plrec_reader->pclose_func = file_reader_mmap_vclose;

	if (streq(irs, "auto")) {
		// Auto means either lines end in "\n" or "\r\n" (LF or CRLF). In
		// either case the final character is "\n". Then for autodetect we
		// simply check if there's a character in the line before the '\n', and
		// if that is '\r'.
		pstate->do_auto_line_term = TRUE;
		pstate->irs = "\n";
		pstate->irslen = 1;
	}

	// In auto mode irslen is now 1, so that case takes the single-IRS branch.
	int single_others = (pstate->ifslen == 1 && pstate->ipslen == 1);
	if (pstate->irslen == 1) {
		if (single_others)
			plrec_reader->pprocess_func = lrec_reader_mmap_dkvp_process_single_irs_single_others;
		else
			plrec_reader->pprocess_func = lrec_reader_mmap_dkvp_process_single_irs_multi_others;
	} else {
		if (single_others)
			plrec_reader->pprocess_func = lrec_reader_mmap_dkvp_process_multi_irs_single_others;
		else
			plrec_reader->pprocess_func = lrec_reader_mmap_dkvp_process_multi_irs_multi_others;
	}

	plrec_reader->psof_func = lrec_reader_mmap_dkvp_sof;
	plrec_reader->pfree_func = lrec_reader_mmap_dkvp_free;

	return plrec_reader;
}
// Releases the reader and its private state. The separator and comment strings are not freed
// here: alloc stored the caller's pointers rather than copies.
static void lrec_reader_mmap_dkvp_free(lrec_reader_t* preader) {
	void* pvstate = preader->pvstate;
	free(pvstate);
	free(preader);
}
// Start-of-file hook. This reader is stateless across files, so there is nothing to do.
static void lrec_reader_mmap_dkvp_sof(void* pvstate, void* pvhandle) {
	(void)pvstate;
	(void)pvhandle;
}
// ----------------------------------------------------------------
// Process function for one-character IRS, IFS, and IPS. Returns the next record, or NULL once
// the start-of-line cursor has reached the end of the mapped file.
static lrec_t* lrec_reader_mmap_dkvp_process_single_irs_single_others(void* pvstate, void* pvhandle, context_t* pctx) {
	file_reader_mmap_state_t* pmmap = pvhandle;
	lrec_reader_mmap_dkvp_state_t* pdkvp = pvstate;

	if (pmmap->sol >= pmmap->eof)
		return NULL;

	char irs = pdkvp->irs[0];
	char ifs = pdkvp->ifs[0];
	char ips = pdkvp->ips[0];
	return lrec_parse_mmap_dkvp_single_irs_single_others(pmmap, irs, ifs, ips, pdkvp, pctx);
}
// Process function for a one-character IRS with a multi-character IFS and/or IPS. Returns the
// next record, or NULL once the start-of-line cursor has reached the end of the mapped file.
static lrec_t* lrec_reader_mmap_dkvp_process_single_irs_multi_others(void* pvstate, void* pvhandle, context_t* pctx) {
	file_reader_mmap_state_t* pmmap = pvhandle;
	lrec_reader_mmap_dkvp_state_t* pdkvp = pvstate;

	if (pmmap->sol >= pmmap->eof)
		return NULL;

	char irs = pdkvp->irs[0];
	return lrec_parse_mmap_dkvp_single_irs_multi_others(pmmap, irs, pdkvp, pctx);
}
// Process function for a multi-character IRS with one-character IFS and IPS. Returns the next
// record, or NULL once the start-of-line cursor has reached the end of the mapped file.
static lrec_t* lrec_reader_mmap_dkvp_process_multi_irs_single_others(void* pvstate, void* pvhandle, context_t* pctx) {
	file_reader_mmap_state_t* pmmap = pvhandle;
	lrec_reader_mmap_dkvp_state_t* pdkvp = pvstate;

	if (pmmap->sol >= pmmap->eof)
		return NULL;

	char ifs = pdkvp->ifs[0];
	char ips = pdkvp->ips[0];
	return lrec_parse_mmap_dkvp_multi_irs_single_others(pmmap, ifs, ips, pdkvp, pctx);
}
// Process function for multi-character IRS together with multi-character IFS and/or IPS.
// Returns the next record, or NULL once the start-of-line cursor has reached the end of the
// mapped file.
static lrec_t* lrec_reader_mmap_dkvp_process_multi_irs_multi_others(void* pvstate, void* pvhandle, context_t* pctx) {
	file_reader_mmap_state_t* pmmap = pvhandle;
	lrec_reader_mmap_dkvp_state_t* pdkvp = pvstate;

	if (pmmap->sol >= pmmap->eof)
		return NULL;

	return lrec_parse_mmap_dkvp_multi_irs_multi_others(pmmap, pdkvp, pctx);
}
// ----------------------------------------------------------------
// Parses one DKVP record starting at phandle->sol, for one-character IRS, IFS, and IPS.
//
// The file bytes are edited in place: each separator that ends a key or a value is overwritten
// with a NUL so that the record can point directly into the file data (NO_FREE) instead of
// copying. A field with no IPS, or with an empty key, is stored under its 1-up positional index.
// With do_auto_line_term set, a '\r' immediately before the IRS is stripped and the detected
// line ending (LF or CRLF) is reported through pctx.
//
// Returns NULL if nothing but comment lines remained; otherwise returns the record, with
// phandle->sol advanced past the consumed input.
//
// Nothing at or beyond phandle->eof is ever read or written: see the end-of-file notes below.
static lrec_t* lrec_parse_mmap_dkvp_single_irs_single_others(file_reader_mmap_state_t *phandle,
	char irs, char ifs, char ips, lrec_reader_mmap_dkvp_state_t* pstate, context_t* pctx)
{
	if (pstate->comment_string != NULL)
		skip_over_comment_lines_single_irs(phandle, pstate, irs);
	if (phandle->sol >= phandle->eof)
		return NULL;

	char* line = phandle->sol;
	char* eof = phandle->eof;

	lrec_t* prec = lrec_unbacked_alloc();

	int idx = 0;
	char* p = line;
	if (pstate->allow_repeat_ifs) {
		// Bounded by eof: the remaining input may consist solely of IFS characters.
		while (p < eof && *p == ifs)
			p++;
	}
	char* key = p;
	char* value = p;

	int saw_ps = FALSE;
	int saw_rs = FALSE;
	for ( ; p < eof && *p; ) {
		if (*p == irs) {
			*p = 0;
			if (pstate->do_auto_line_term) {
				if (p > line && p[-1] == '\r') {
					p[-1] = 0;
					context_set_autodetected_crlf(pctx);
				} else {
					context_set_autodetected_lf(pctx);
				}
			}
			phandle->sol = p+1;
			saw_rs = TRUE;
			break;
		} else if (*p == ifs) {
			saw_ps = FALSE;
			*p = 0;

			idx++;
			if (*key == 0 || value <= key) {
				// E.g the pair has no equals sign: "a" rather than "a=1" or
				// "a=".  Here we use the positional index as the key. This way
				// DKVP is a generalization of NIDX.
				char free_flags = NO_FREE;
				lrec_put(prec, low_int_to_string(idx, &free_flags), value, free_flags);
			}
			else {
				lrec_put(prec, key, value, NO_FREE);
			}

			p++;
			if (pstate->allow_repeat_ifs) {
				while (p < eof && *p == ifs)
					p++;
			}
			key = p;
			value = p;
		} else if (*p == ips && !saw_ps) {
			*p = 0;
			p++;
			value = p;
			saw_ps = TRUE;
		} else {
			p++;
		}
	}

	if (p >= eof) {
		phandle->sol = p+1;
	} else if (!saw_rs) {
		// The scan stopped at a NUL byte inside the file, short of eof and without a record
		// separator. The final value below is taken through eof, so mark the input as consumed.
		// Leaving sol where it was would return this same line again on every subsequent call.
		phandle->sol = eof;
	}
	idx++;

	// With repeated IFS allowed, a trailing IFS leaves an empty final field: drop it. A pointer
	// sitting at eof denotes an empty string and must not be dereferenced.
	if (pstate->allow_repeat_ifs
		&& (key >= eof || *key == 0)
		&& (value >= eof || *value == 0))
	{
		return prec;
	}

	// There are two ways out of that loop: saw IRS, or saw end of file.
	if (saw_rs) {
		// Easy and simple case: we read until end of line.  We zero-poked the irs to a null character to terminate the
		// C string so it's OK to retain a pointer to that.
		if (*key == 0 || value <= key) {
			char free_flags = NO_FREE;
			if (value >= eof)
				lrec_put(prec, low_int_to_string(idx, &free_flags), "", free_flags);
			else
				lrec_put(prec, low_int_to_string(idx, &free_flags), value, free_flags);
		}
		else {
			if (value >= eof)
				lrec_put(prec, key, "", NO_FREE);
			else
				lrec_put(prec, key, value, NO_FREE);
		}
	} else {
		// Messier case: we read to end of file without seeing end of line.  We can't always zero-poke a null character
		// to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our
		// copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one
		// byte past the page and that will segv us. So the final value is copied out by length instead.
		//
		// value <= key is tested first: when no IPS was seen, key may be sitting at eof and must not be
		// dereferenced. When an IPS was seen, key points at file data that the IPS zero-poke terminated.
		if (value <= key || *key == 0) {
			char free_flags = NO_FREE;
			if (value >= eof) {
				lrec_put(prec, low_int_to_string(idx, &free_flags), "", free_flags);
			} else {
				char* copy = mlr_alloc_string_from_char_range(value, eof - value);
				lrec_put(prec, low_int_to_string(idx, &free_flags), copy, free_flags | FREE_ENTRY_VALUE);
			}
		}
		else {
			if (value >= eof) {
				lrec_put(prec, key, "", NO_FREE);
			} else {
				char* copy = mlr_alloc_string_from_char_range(value, eof - value);
				lrec_put(prec, key, copy, FREE_ENTRY_VALUE);
			}
		}
	}

	return prec;
}
// Parses one DKVP record starting at phandle->sol, for a multi-character IRS (taken from pstate)
// with one-character IFS and IPS.
//
// The file bytes are edited in place: the first byte of each separator that ends a key or a value
// is overwritten with a NUL so that the record can point directly into the file data (NO_FREE)
// instead of copying. A field with no IPS, or with an empty key, is stored under its 1-up
// positional index.
//
// Returns NULL if nothing but comment lines remained; otherwise returns the record, with
// phandle->sol advanced past the consumed input. pctx is not used by this variant.
//
// Nothing at or beyond phandle->eof is ever read or written: see the end-of-file notes below.
static lrec_t* lrec_parse_mmap_dkvp_multi_irs_single_others(file_reader_mmap_state_t *phandle,
	char ifs, char ips, lrec_reader_mmap_dkvp_state_t* pstate, context_t* pctx)
{
	if (pstate->comment_string != NULL)
		skip_over_comment_lines_multi_irs(phandle, pstate, pstate->irs, pstate->irslen);
	if (phandle->sol >= phandle->eof)
		return NULL;

	char* line = phandle->sol;
	char* eof = phandle->eof;

	lrec_t* prec = lrec_unbacked_alloc();

	int idx = 0;
	char* p = line;
	if (pstate->allow_repeat_ifs) {
		// Bounded by eof: the remaining input may consist solely of IFS characters.
		while (p < eof && *p == ifs)
			p++;
	}
	char* key = p;
	char* value = p;

	int saw_ps = FALSE;
	int saw_rs = FALSE;
	for ( ; p < eof && *p; ) {
		// Only compare against the IRS when that many bytes remain, so that a partial IRS at the
		// very end of the file does not pull the comparison past eof.
		if ((eof - p) >= pstate->irslen && streqn(p, pstate->irs, pstate->irslen)) {
			*p = 0;
			phandle->sol = p + pstate->irslen;
			saw_rs = TRUE;
			break;
		} else if (*p == ifs) {
			saw_ps = FALSE;
			*p = 0;

			idx++;
			if (*key == 0 || value <= key) {
				// E.g the pair has no equals sign: "a" rather than "a=1" or
				// "a=".  Here we use the positional index as the key. This way
				// DKVP is a generalization of NIDX.
				char free_flags = NO_FREE;
				lrec_put(prec, low_int_to_string(idx, &free_flags), value, free_flags);
			}
			else {
				lrec_put(prec, key, value, NO_FREE);
			}

			p++;
			if (pstate->allow_repeat_ifs) {
				while (p < eof && *p == ifs)
					p++;
			}
			key = p;
			value = p;
		} else if (*p == ips && !saw_ps) {
			*p = 0;
			p++;
			value = p;
			saw_ps = TRUE;
		} else {
			p++;
		}
	}

	if (p >= eof) {
		phandle->sol = p+1;
	} else if (!saw_rs) {
		// The scan stopped at a NUL byte inside the file, short of eof and without a record
		// separator. The final value below is taken through eof, so mark the input as consumed.
		// Leaving sol where it was would return this same line again on every subsequent call.
		phandle->sol = eof;
	}
	idx++;

	// With repeated IFS allowed, a trailing IFS leaves an empty final field: drop it. A pointer
	// sitting at eof denotes an empty string and must not be dereferenced.
	if (pstate->allow_repeat_ifs
		&& (key >= eof || *key == 0)
		&& (value >= eof || *value == 0))
	{
		return prec;
	}

	// There are two ways out of that loop: saw IRS, or saw end of file.
	if (saw_rs) {
		// Easy and simple case: we read until end of line.  We zero-poked the irs to a null character to terminate the
		// C string so it's OK to retain a pointer to that.
		if (*key == 0 || value <= key) {
			char free_flags = NO_FREE;
			if (value >= eof)
				lrec_put(prec, low_int_to_string(idx, &free_flags), "", free_flags);
			else
				lrec_put(prec, low_int_to_string(idx, &free_flags), value, free_flags);
		}
		else {
			if (value >= eof)
				lrec_put(prec, key, "", NO_FREE);
			else
				lrec_put(prec, key, value, NO_FREE);
		}
	} else {
		// Messier case: we read to end of file without seeing end of line.  We can't always zero-poke a null character
		// to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our
		// copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one
		// byte past the page and that will segv us. So the final value is copied out by length instead.
		//
		// value <= key is tested first: when no IPS was seen, key may be sitting at eof and must not be
		// dereferenced. When an IPS was seen, key points at file data that the IPS zero-poke terminated.
		if (value <= key || *key == 0) {
			char free_flags = NO_FREE;
			if (value >= eof) {
				lrec_put(prec, low_int_to_string(idx, &free_flags), "", free_flags);
			} else {
				char* copy = mlr_alloc_string_from_char_range(value, eof - value);
				lrec_put(prec, low_int_to_string(idx, &free_flags), copy, free_flags | FREE_ENTRY_VALUE);
			}
		}
		else {
			if (value >= eof) {
				lrec_put(prec, key, "", NO_FREE);
			} else {
				char* copy = mlr_alloc_string_from_char_range(value, eof - value);
				lrec_put(prec, key, copy, FREE_ENTRY_VALUE);
			}
		}
	}

	return prec;
}
// Parses one DKVP record starting at phandle->sol, for a one-character IRS with multi-character
// IFS and/or IPS (both taken from pstate).
//
// The file bytes are edited in place: the first byte of each separator that ends a key or a value
// is overwritten with a NUL so that the record can point directly into the file data (NO_FREE)
// instead of copying. A field with no IPS, or with an empty key, is stored under its 1-up
// positional index. With do_auto_line_term set, a '\r' immediately before the IRS is stripped and
// the detected line ending (LF or CRLF) is reported through pctx.
//
// Returns NULL if nothing but comment lines remained; otherwise returns the record, with
// phandle->sol advanced past the consumed input.
//
// Nothing at or beyond phandle->eof is ever read or written: see the end-of-file notes below.
static lrec_t* lrec_parse_mmap_dkvp_single_irs_multi_others(file_reader_mmap_state_t *phandle, char irs,
	lrec_reader_mmap_dkvp_state_t* pstate, context_t* pctx)
{
	if (pstate->comment_string != NULL)
		skip_over_comment_lines_single_irs(phandle, pstate, irs);
	if (phandle->sol >= phandle->eof)
		return NULL;

	char* line = phandle->sol;
	char* eof = phandle->eof;

	lrec_t* prec = lrec_unbacked_alloc();

	int idx = 0;
	char* p = line;
	if (pstate->allow_repeat_ifs) {
		// Only compare against the IFS when that many bytes remain before eof.
		while ((eof - p) >= pstate->ifslen && streqn(p, pstate->ifs, pstate->ifslen))
			p += pstate->ifslen;
	}
	char* key = p;
	char* value = p;

	int saw_ps = FALSE;
	int saw_rs = FALSE;
	for ( ; p < eof && *p; ) {
		if (*p == irs) {
			*p = 0;
			if (pstate->do_auto_line_term) {
				if (p > line && p[-1] == '\r') {
					p[-1] = 0;
					context_set_autodetected_crlf(pctx);
				} else {
					context_set_autodetected_lf(pctx);
				}
			}
			phandle->sol = p+1;
			saw_rs = TRUE;
			break;
		} else if ((eof - p) >= pstate->ifslen && streqn(p, pstate->ifs, pstate->ifslen)) {
			saw_ps = FALSE;
			*p = 0;

			idx++;
			if (*key == 0 || value <= key) {
				// E.g the pair has no equals sign: "a" rather than "a=1" or
				// "a=".  Here we use the positional index as the key. This way
				// DKVP is a generalization of NIDX.
				char free_flags = NO_FREE;
				lrec_put(prec, low_int_to_string(idx, &free_flags), value, free_flags);
			}
			else {
				lrec_put(prec, key, value, NO_FREE);
			}

			p += pstate->ifslen;
			if (pstate->allow_repeat_ifs) {
				while ((eof - p) >= pstate->ifslen && streqn(p, pstate->ifs, pstate->ifslen))
					p += pstate->ifslen;
			}
			key = p;
			value = p;
		} else if ((eof - p) >= pstate->ipslen && streqn(p, pstate->ips, pstate->ipslen) && !saw_ps) {
			*p = 0;
			p += pstate->ipslen;
			value = p;
			saw_ps = TRUE;
		} else {
			p++;
		}
	}

	// Deliberately no "*p = 0" here. On the IRS exit the byte is already NUL; on the end-of-file
	// exit p is at eof, and writing there is exactly the out-of-page poke described under
	// "Messier case" below.
	if (p >= eof) {
		phandle->sol = p+1;
	} else if (!saw_rs) {
		// The scan stopped at a NUL byte inside the file, short of eof and without a record
		// separator. The final value below is taken through eof, so mark the input as consumed.
		// Leaving sol where it was would return this same line again on every subsequent call.
		phandle->sol = eof;
	}
	idx++;

	// With repeated IFS allowed, a trailing IFS leaves an empty final field: drop it. A pointer
	// sitting at eof denotes an empty string and must not be dereferenced.
	if (pstate->allow_repeat_ifs
		&& (key >= eof || *key == 0)
		&& (value >= eof || *value == 0))
	{
		return prec;
	}

	// There are two ways out of that loop: saw IRS, or saw end of file.
	if (saw_rs) {
		// Easy and simple case: we read until end of line.  We zero-poked the irs to a null character to terminate the
		// C string so it's OK to retain a pointer to that.
		if (*key == 0 || value <= key) {
			char free_flags = NO_FREE;
			if (value >= eof)
				lrec_put(prec, low_int_to_string(idx, &free_flags), "", free_flags);
			else
				lrec_put(prec, low_int_to_string(idx, &free_flags), value, free_flags);
		}
		else {
			if (value >= eof)
				lrec_put(prec, key, "", NO_FREE);
			else
				lrec_put(prec, key, value, NO_FREE);
		}
	} else {
		// Messier case: we read to end of file without seeing end of line.  We can't always zero-poke a null character
		// to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our
		// copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one
		// byte past the page and that will segv us. So the final value is copied out by length instead.
		//
		// value <= key is tested first: when no IPS was seen, key may be sitting at eof and must not be
		// dereferenced. When an IPS was seen, key points at file data that the IPS zero-poke terminated.
		if (value <= key || *key == 0) {
			char free_flags = NO_FREE;
			if (value >= eof) {
				lrec_put(prec, low_int_to_string(idx, &free_flags), "", free_flags);
			} else {
				char* copy = mlr_alloc_string_from_char_range(value, eof - value);
				lrec_put(prec, low_int_to_string(idx, &free_flags), copy, free_flags | FREE_ENTRY_VALUE);
			}
		}
		else {
			if (value >= eof) {
				lrec_put(prec, key, "", NO_FREE);
			} else {
				char* copy = mlr_alloc_string_from_char_range(value, eof - value);
				lrec_put(prec, key, copy, FREE_ENTRY_VALUE);
			}
		}
	}

	return prec;
}
// Parses one DKVP record starting at phandle->sol, for multi-character IRS together with
// multi-character IFS and/or IPS (all taken from pstate).
//
// The file bytes are edited in place: the first byte of each separator that ends a key or a value
// is overwritten with a NUL so that the record can point directly into the file data (NO_FREE)
// instead of copying. A field with no IPS, or with an empty key, is stored under its 1-up
// positional index.
//
// Returns NULL if nothing but comment lines remained; otherwise returns the record, with
// phandle->sol advanced past the consumed input. pctx is not used by this variant.
//
// Nothing at or beyond phandle->eof is ever read or written: see the end-of-file notes below.
static lrec_t* lrec_parse_mmap_dkvp_multi_irs_multi_others(file_reader_mmap_state_t *phandle,
	lrec_reader_mmap_dkvp_state_t* pstate, context_t* pctx)
{
	if (pstate->comment_string != NULL)
		skip_over_comment_lines_multi_irs(phandle, pstate, pstate->irs, pstate->irslen);
	if (phandle->sol >= phandle->eof)
		return NULL;

	char* line = phandle->sol;
	char* eof = phandle->eof;

	lrec_t* prec = lrec_unbacked_alloc();

	int idx = 0;
	char* p = line;
	if (pstate->allow_repeat_ifs) {
		// Only compare against the IFS when that many bytes remain before eof.
		while ((eof - p) >= pstate->ifslen && streqn(p, pstate->ifs, pstate->ifslen))
			p += pstate->ifslen;
	}
	char* key = p;
	char* value = p;

	int saw_ps = FALSE;
	int saw_rs = FALSE;
	for ( ; p < eof && *p; ) {
		// Each separator is compared only when that many bytes remain, so that a partial
		// separator at the very end of the file does not pull the comparison past eof.
		if ((eof - p) >= pstate->irslen && streqn(p, pstate->irs, pstate->irslen)) {
			*p = 0;
			phandle->sol = p + pstate->irslen;
			saw_rs = TRUE;
			break;
		} else if ((eof - p) >= pstate->ifslen && streqn(p, pstate->ifs, pstate->ifslen)) {
			saw_ps = FALSE;
			*p = 0;

			idx++;
			if (*key == 0 || value <= key) {
				// E.g the pair has no equals sign: "a" rather than "a=1" or
				// "a=".  Here we use the positional index as the key. This way
				// DKVP is a generalization of NIDX.
				char free_flags = NO_FREE;
				lrec_put(prec, low_int_to_string(idx, &free_flags), value, free_flags);
			}
			else {
				lrec_put(prec, key, value, NO_FREE);
			}

			p += pstate->ifslen;
			if (pstate->allow_repeat_ifs) {
				while ((eof - p) >= pstate->ifslen && streqn(p, pstate->ifs, pstate->ifslen))
					p += pstate->ifslen;
			}
			key = p;
			value = p;
		} else if ((eof - p) >= pstate->ipslen && streqn(p, pstate->ips, pstate->ipslen) && !saw_ps) {
			*p = 0;
			p += pstate->ipslen;
			value = p;
			saw_ps = TRUE;
		} else {
			p++;
		}
	}

	if (p >= eof) {
		phandle->sol = p+1;
	} else if (!saw_rs) {
		// The scan stopped at a NUL byte inside the file, short of eof and without a record
		// separator. The final value below is taken through eof, so mark the input as consumed.
		// Leaving sol where it was would return this same line again on every subsequent call.
		phandle->sol = eof;
	}
	idx++;

	// With repeated IFS allowed, a trailing IFS leaves an empty final field: drop it. A pointer
	// sitting at eof denotes an empty string and must not be dereferenced.
	if (pstate->allow_repeat_ifs
		&& (key >= eof || *key == 0)
		&& (value >= eof || *value == 0))
	{
		return prec;
	}

	// There are two ways out of that loop: saw IRS, or saw end of file.
	if (saw_rs) {
		// Easy and simple case: we read until end of line.  We zero-poked the irs to a null character to terminate the
		// C string so it's OK to retain a pointer to that.
		if (*key == 0 || value <= key) {
			char free_flags = NO_FREE;
			if (value >= eof)
				lrec_put(prec, low_int_to_string(idx, &free_flags), "", free_flags);
			else
				lrec_put(prec, low_int_to_string(idx, &free_flags), value, free_flags);
		}
		else {
			if (value >= eof)
				lrec_put(prec, key, "", NO_FREE);
			else
				lrec_put(prec, key, value, NO_FREE);
		}
	} else {
		// Messier case: we read to end of file without seeing end of line.  We can't always zero-poke a null character
		// to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our
		// copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one
		// byte past the page and that will segv us. So the final value is copied out by length instead.
		//
		// value <= key is tested first: when no IPS was seen, key may be sitting at eof and must not be
		// dereferenced. When an IPS was seen, key points at file data that the IPS zero-poke terminated.
		if (value <= key || *key == 0) {
			char free_flags = NO_FREE;
			if (value >= eof) {
				lrec_put(prec, low_int_to_string(idx, &free_flags), "", free_flags);
			} else {
				char* copy = mlr_alloc_string_from_char_range(value, eof - value);
				lrec_put(prec, low_int_to_string(idx, &free_flags), copy, free_flags | FREE_ENTRY_VALUE);
			}
		}
		else {
			if (value >= eof) {
				lrec_put(prec, key, "", NO_FREE);
			} else {
				char* copy = mlr_alloc_string_from_char_range(value, eof - value);
				lrec_put(prec, key, copy, FREE_ENTRY_VALUE);
			}
		}
	}

	return prec;
}
// ----------------------------------------------------------------
// Advances phandle->sol past every consecutive line that begins with pstate->comment_string,
// for a one-character record separator.
//
// When pstate->comment_handling is PASS_COMMENTS, each skipped line (prefix, body, and record
// separator if present) is echoed to stdout byte for byte. A final comment line with no
// terminating IRS is consumed through eof.
//
// The explicit sol < eof test matters only for a zero-length comment string: in that case the
// length test alone stays true at eof and the loop would never finish.
static void skip_over_comment_lines_single_irs(
	file_reader_mmap_state_t *phandle,
	lrec_reader_mmap_dkvp_state_t* pstate,
	char irs)
{
	while (phandle->sol < phandle->eof
		&& (phandle->eof - phandle->sol) >= pstate->comment_string_length
		&& streqn(phandle->sol, pstate->comment_string, pstate->comment_string_length))
	{
		// The comment prefix itself.
		if (pstate->comment_handling == PASS_COMMENTS)
			for (int i = 0; i < pstate->comment_string_length; i++)
				fputc(phandle->sol[i], stdout);
		phandle->sol += pstate->comment_string_length;

		// The rest of the comment line, up to but excluding the record separator.
		while (phandle->sol < phandle->eof && *phandle->sol != irs) {
			if (pstate->comment_handling == PASS_COMMENTS)
				fputc(*phandle->sol, stdout);
			phandle->sol++;
		}

		// The record separator, unless the file ended first.
		if (phandle->sol < phandle->eof && *phandle->sol == irs) {
			if (pstate->comment_handling == PASS_COMMENTS)
				fputc(*phandle->sol, stdout);
			phandle->sol++;
		}
	}
}
// Advances phandle->sol past every consecutive line that begins with pstate->comment_string,
// for a multi-character record separator irs of length irslen.
//
// When pstate->comment_handling is PASS_COMMENTS, each skipped line (prefix, body, and record
// separator if present) is echoed to stdout byte for byte.
//
// A final comment line with no terminating IRS is consumed through eof. The body scan therefore
// runs while any byte remains, not merely while irslen bytes remain: stopping early would leave
// the last irslen-1 bytes of the comment behind to be parsed as a data record.
//
// The explicit sol < eof test on the outer loop matters only for a zero-length comment string:
// in that case the length test alone stays true at eof and the loop would never finish.
static void skip_over_comment_lines_multi_irs(
	file_reader_mmap_state_t *phandle,
	lrec_reader_mmap_dkvp_state_t* pstate,
	char* irs,
	int irslen)
{
	while (phandle->sol < phandle->eof
		&& (phandle->eof - phandle->sol) >= pstate->comment_string_length
		&& streqn(phandle->sol, pstate->comment_string, pstate->comment_string_length))
	{
		// The comment prefix itself.
		if (pstate->comment_handling == PASS_COMMENTS)
			for (int i = 0; i < pstate->comment_string_length; i++)
				fputc(phandle->sol[i], stdout);
		phandle->sol += pstate->comment_string_length;

		// The rest of the comment line: stop at a complete IRS, else run to eof. The IRS is
		// compared only when irslen bytes remain, so the comparison never runs past eof.
		while (phandle->sol < phandle->eof
			&& !((phandle->eof - phandle->sol) >= irslen && streqn(phandle->sol, irs, irslen)))
		{
			if (pstate->comment_handling == PASS_COMMENTS)
				fputc(*phandle->sol, stdout);
			phandle->sol++;
		}

		// The record separator, unless the file ended first.
		if ((phandle->eof - phandle->sol) >= irslen && streqn(phandle->sol, irs, irslen)) {
			if (pstate->comment_handling == PASS_COMMENTS)
				for (int i = 0; i < irslen; i++)
					fputc(phandle->sol[i], stdout);
			phandle->sol += irslen;
		}
	}
}

View file

@ -1,220 +0,0 @@
// ================================================================
// Note: there are multiple process methods with a lot of code duplication.
// This is intentional. Much of Miller's measured processing time is in the
// lrec-reader process methods. This is code which needs to execute on every
// byte of input and even moving a single runtime if-statement into a
// function-pointer assignment at alloc time can have noticeable effects on
// performance (5-10% in some cases).
// ================================================================
// ================================================================
// Unlike other Miller record-readers, there is no streaming for JSON input: no
// records are processed until EOF is seen. See also
// https://github.com/johnkerl/miller/issues/99.
// ================================================================
#include <stdio.h>
#include <stdlib.h>
#include "cli/json_array_ingest.h"
#include "cli/comment_handling.h"
#include "lib/mlr_globals.h"
#include "lib/mlrutil.h"
#include "input/file_reader_mmap.h"
#include "input/lrec_readers.h"
#include "input/json_parser.h"
#include "input/mlr_json_adapter.h"
// Private state for the mmap-backed JSON record reader.
typedef struct _lrec_reader_mmap_json_state_t {
// The list of top-level JSON objects is backed by the file contents. The records are in turn
// backed by the top-level JSON objects. This means the latter should not be freed while
// the records are in use. (This is done to reduce data copies, for performance: we can
// manipulate pointers to strings rather than copying strings.)
//
// In particular, in the multifile-input case, we need to keep *all* parsed JSON (and
// not free one file's data when we proceed to the next) since records with pointers
// into the parsed JSON may still be in use -- e.g. mlr sort.
sllv_t* ptop_level_json_objects;
// Records built from the parsed JSON and not yet handed out by the process function.
sllv_t* precords;
// Passed through to reference_json_objects_as_lrecs when records are built.
char* input_json_flatten_separator;
// Passed through to reference_json_objects_as_lrecs when records are built.
json_array_ingest_t json_array_ingest;
// The line terminator as given to alloc; may be the literal "auto".
char* specified_line_term;
// True when the specified line terminator was "auto".
int do_auto_line_term;
// "\n" until a start-of-file scan finds a line ending in the input, then "\n" or "\r\n".
char* detected_line_term;
// Comment-line policy and comment prefix, used when stripping comments from the input.
comment_handling_t comment_handling;
char* comment_string;
} lrec_reader_mmap_json_state_t;
// Reader callbacks installed by lrec_reader_mmap_json_alloc: teardown, start-of-file ingest,
// and per-record delivery.
static void lrec_reader_mmap_json_free(lrec_reader_t* preader);
static void lrec_reader_mmap_json_sof(void* pvstate, void* pvhandle);
static lrec_t* lrec_reader_mmap_json_process(void* pvstate, void* pvhandle, context_t* pctx);
// ----------------------------------------------------------------
// Allocates an mmap-backed JSON record reader.
//
// input_json_flatten_separator, json_array_ingest: stored and later passed through when parsed
//   JSON is converted to records.
// line_term: line terminator used when stripping comments; "auto" enables LF/CRLF detection.
// comment_handling, comment_string: comment-line policy and prefix.
//
// All string arguments are stored by pointer, not copied.
lrec_reader_t* lrec_reader_mmap_json_alloc(char* input_json_flatten_separator, json_array_ingest_t json_array_ingest, char* line_term,
	comment_handling_t comment_handling, char* comment_string)
{
	lrec_reader_t* plrec_reader = mlr_malloc_or_die(sizeof(lrec_reader_t));
	lrec_reader_mmap_json_state_t* pstate = mlr_malloc_or_die(sizeof(lrec_reader_mmap_json_state_t));

	pstate->ptop_level_json_objects      = sllv_alloc();
	pstate->precords                     = sllv_alloc();
	pstate->input_json_flatten_separator = input_json_flatten_separator;
	pstate->json_array_ingest            = json_array_ingest;
	pstate->specified_line_term          = line_term;
	pstate->do_auto_line_term            = streq(line_term, "auto") ? TRUE : FALSE;
	pstate->detected_line_term           = "\n"; // xxx adapt to MLR_GLOBALS/ctx-const for Windows port
	pstate->comment_handling             = comment_handling;
	pstate->comment_string               = comment_string;

	plrec_reader->pvstate       = (void*)pstate;
	plrec_reader->popen_func    = file_reader_mmap_vopen;
	plrec_reader->pclose_func   = file_reader_mmap_vclose;
	plrec_reader->pprocess_func = lrec_reader_mmap_json_process;
	plrec_reader->psof_func     = lrec_reader_mmap_json_sof;
	plrec_reader->pfree_func    = lrec_reader_mmap_json_free;

	return plrec_reader;
}
// Tears down the reader: frees every retained top-level JSON value, every record that was never
// handed out, both lists, the private state, and finally the reader itself.
static void lrec_reader_mmap_json_free(lrec_reader_t* preader) {
	lrec_reader_mmap_json_state_t* pjson = preader->pvstate;

	sllve_t* pnode = pjson->ptop_level_json_objects->phead;
	while (pnode != NULL) {
		json_free_value(pnode->pvvalue);
		pnode = pnode->pnext;
	}
	sllv_free(pjson->ptop_level_json_objects);
	pjson->ptop_level_json_objects = NULL;

	pnode = pjson->precords->phead;
	while (pnode != NULL) {
		lrec_free(pnode->pvvalue);
		pnode = pnode->pnext;
	}
	sllv_free(pjson->precords);
	pjson->precords = NULL;

	free(pjson);
	free(preader);
}
// The mmap-JSON lrec-reader is non-streaming: we ingest all records here in the start-of-file hook.
// Then in the process method we pop one lrec off the list at a time, until they are all exhausted.
// This is in contrast to other Miller lrec-readers.
//
// Nothing is freed in this hook. Each file's parsed JSON is appended to the state's list of
// top-level objects and stays there until the reader's free method runs, because records that
// were already handed out may still point into it.
//
// Exits the process with status 1 if the input cannot be parsed as JSON or cannot be converted
// to records.
static void lrec_reader_mmap_json_sof(void* pvstate, void* pvhandle) {
lrec_reader_mmap_json_state_t* pstate = pvstate;
file_reader_mmap_state_t* phandle = pvhandle;
json_char* json_input = (json_char*)phandle->sol;
json_value_t* parsed_top_level_json;
json_char error_buf[JSON_ERROR_MAX];
// This enables us to handle input of the form
//
// { "a" : 1 }
// { "b" : 2 }
// { "c" : 3 }
//
// in addition to
//
// [
// { "a" : 1 }
// { "b" : 2 }
// { "c" : 3 }
// ]
//
// This is in line with what jq can handle. In this case, json_parse will return
// once for each top-level item and will give us back a pointer to the start of
// the rest of the input stream, so we can call json_parse on the rest until it is
// all exhausted.
json_char* item_start = json_input;
int length = phandle->eof - phandle->sol;
char* detected_line_term = NULL;
while (TRUE) {
// Find the first line-ending sequence (if any): LF or CRLF.
// The scan always covers the whole file from phandle->sol, and is repeated on each pass
// until a line ending has been found.
if (pstate->do_auto_line_term) {
if (detected_line_term == NULL) {
for (char* p = phandle->sol; p < phandle->eof; p++) {
if (p[0] == '\n') {
if (p > phandle->sol && p[-1] == '\r') {
detected_line_term = "\r\n";
} else {
detected_line_term = "\n";
}
break;
}
}
}
}
// Skip comments. For JSON, we ingest the entire blob, this is a matter of finding and iterating over lines.
// Miller data comments must be at start of line.
if (pstate->comment_handling != COMMENTS_ARE_DATA) {
// Use the detected terminator when autodetecting and one was found; otherwise use the
// terminator as specified at alloc time.
char* line_term = pstate->specified_line_term;
if (pstate->do_auto_line_term && detected_line_term != NULL)
line_term = detected_line_term;
mlr_json_strip_comments(item_start, item_start + length, pstate->comment_handling, pstate->comment_string,
line_term);
}
// Trim trailing whitespace.
char* item_end = item_start + length;
mlr_json_end_strip(item_start, &item_end);
length = item_end - item_start;
// Nothing left once trailing whitespace is removed: done with this file.
if (length == 0)
break;
// Parse one top-level item; item_start is updated to point at the rest of the input.
parsed_top_level_json = json_parse(item_start, length, error_buf, &item_start);
if (parsed_top_level_json == NULL) {
fprintf(stderr, "%s: Unable to parse JSON data: %s\n", MLR_GLOBALS.bargv0, error_buf);
exit(1);
}
sllv_append(pstate->ptop_level_json_objects, parsed_top_level_json);
// The lrecs have their string pointers pointing into the parsed-JSON objects (for
// efficiency) so it's important we not free the latter until our free method.
if (!reference_json_objects_as_lrecs(pstate->precords, parsed_top_level_json,
pstate->input_json_flatten_separator, pstate->json_array_ingest))
{
fprintf(stderr, "%s: Unable to parse JSON data.\n", MLR_GLOBALS.bargv0);
exit(1);
}
if (item_start == NULL)
break;
if (*item_start == 0)
break;
// Shrink the remaining length by the number of bytes the parse just consumed, and
// remember where the next item begins.
length -= (item_start - json_input);
json_input = item_start;
// json_parse goes up to the '\r' or '\n' (whichever is found first) on the first
// parse, then keeps going from there on the next. E.g. in the CRLF case it
// consumes the CR at the end of the first read and consumes the LF at the start
// of the second, and so on. After the very last parse, we need to here consume
// the final '\n' which is (by itself) a parse error.
if (length == 1 && *(char*)json_input == '\n') {
break;
}
}
// Publish the detected line ending, if any, for the process method to report.
if (detected_line_term != NULL) {
pstate->detected_line_term = detected_line_term;
}
}
// ----------------------------------------------------------------
// Hands out the next record queued by the start-of-file hook, or whatever sllv_pop yields once
// the queue is empty. When line-ending autodetection is on, the detected line ending is
// reported through pctx on every call.
static lrec_t* lrec_reader_mmap_json_process(void* pvstate, void* pvhandle, context_t* pctx) {
	lrec_reader_mmap_json_state_t* pjson = pvstate;

	if (pjson->do_auto_line_term)
		context_set_autodetected_line_term(pctx, pjson->detected_line_term);

	lrec_t* pnext = sllv_pop(pjson->precords);
	return pnext;
}

View file

@ -1,512 +0,0 @@
// ================================================================
// Note: there are multiple process methods with a lot of code duplication.
// This is intentional. Much of Miller's measured processing time is in the
// lrec-reader process methods. This is code which needs to execute on every
// byte of input and even moving a single runtime if-statement into a
// function-pointer assignment at alloc time can have noticeable effects on
// performance (5-10% in some cases).
// ================================================================
#include <stdlib.h>
#include "cli/comment_handling.h"
#include "lib/mlrutil.h"
#include "input/file_reader_mmap.h"
#include "input/lrec_readers.h"
// Private state for the mmap-backed NIDX record reader.
typedef struct _lrec_reader_mmap_nidx_state_t {
// Record and field separators. Stored by pointer; irs is replaced by "\n" when "auto" is given.
char* irs;
char* ifs;
// String lengths of irs and ifs, computed once at alloc time.
int irslen;
int ifslen;
// When true, a run of consecutive field separators is skipped as one.
int allow_repeat_ifs;
// True when irs was given as "auto": split on LF and strip a preceding CR per line.
int do_auto_line_term;
// Comment-line policy, comment prefix, and its length (0 when comment_string is NULL).
comment_handling_t comment_handling;
char* comment_string;
int comment_string_length;
} lrec_reader_mmap_nidx_state_t;
// Reader callbacks installed by lrec_reader_mmap_nidx_alloc.
static void lrec_reader_mmap_nidx_free(lrec_reader_t* preader);
static void lrec_reader_mmap_nidx_sof(void* pvstate, void* pvhandle);
// Process functions, one per combination of single-/multi-character IRS and IFS. Exactly one
// is selected at alloc time.
static lrec_t* lrec_reader_mmap_nidx_process_single_irs_single_ifs(void* pvstate, void* pvhandle, context_t* pctx);
static lrec_t* lrec_reader_mmap_nidx_process_single_irs_multi_ifs(void* pvstate, void* pvhandle, context_t* pctx);
static lrec_t* lrec_reader_mmap_nidx_process_multi_irs_single_ifs(void* pvstate, void* pvhandle, context_t* pctx);
static lrec_t* lrec_reader_mmap_nidx_process_multi_irs_multi_ifs(void* pvstate, void* pvhandle, context_t* pctx);
// Per-record parsers behind the process functions above. Only the single-IRS variants take a
// context.
static lrec_t* lrec_parse_mmap_nidx_single_irs_single_ifs(file_reader_mmap_state_t *phandle,
char irs, char ifs, lrec_reader_mmap_nidx_state_t* pstate, context_t* pctx);
static lrec_t* lrec_parse_mmap_nidx_single_irs_multi_ifs(file_reader_mmap_state_t *phandle,
char irs, lrec_reader_mmap_nidx_state_t* pstate, context_t* pctx);
static lrec_t* lrec_parse_mmap_nidx_multi_irs_single_ifs(file_reader_mmap_state_t *phandle,
char ifs, lrec_reader_mmap_nidx_state_t* pstate);
static lrec_t* lrec_parse_mmap_nidx_multi_irs_multi_ifs(file_reader_mmap_state_t *phandle,
lrec_reader_mmap_nidx_state_t* pstate);
// Comment-line skippers, called by the parsers before each record when a comment string is set.
static void skip_over_comment_lines_single_irs(
file_reader_mmap_state_t *phandle,
lrec_reader_mmap_nidx_state_t* pstate,
char irs);
static void skip_over_comment_lines_multi_irs(
file_reader_mmap_state_t *phandle,
lrec_reader_mmap_nidx_state_t* pstate,
char* irs,
int irslen);
// ----------------------------------------------------------------
// Allocates an mmap-backed NIDX record reader.
//
// irs, ifs:          record and field separators. The pointers are stored, not copied. An irs
//                    of "auto" selects LF-terminated lines with per-line CRLF detection.
// allow_repeat_ifs:  when true, runs of consecutive IFS are treated as one separator.
// comment_handling,
// comment_string:    comment-line policy; comment_string may be NULL.
//
// The separator lengths are inspected here, once, to choose the process function, so that the
// per-byte parsing loops need no runtime branching on separator width.
lrec_reader_t* lrec_reader_mmap_nidx_alloc(char* irs, char* ifs, int allow_repeat_ifs,
	comment_handling_t comment_handling, char* comment_string)
{
	lrec_reader_t* plrec_reader = mlr_malloc_or_die(sizeof(lrec_reader_t));
	lrec_reader_mmap_nidx_state_t* pstate = mlr_malloc_or_die(sizeof(lrec_reader_mmap_nidx_state_t));

	// Auto means either lines end in "\n" or "\r\n" (LF or CRLF). In either case the final
	// character is "\n", so the reader splits on "\n" and the parsers then look at the
	// preceding character to see whether it is '\r'.
	int is_auto = streq(irs, "auto");

	pstate->irs                   = is_auto ? "\n" : irs;
	pstate->ifs                   = ifs;
	pstate->irslen                = is_auto ? 1 : strlen(irs);
	pstate->ifslen                = strlen(ifs);
	pstate->allow_repeat_ifs      = allow_repeat_ifs;
	pstate->do_auto_line_term     = is_auto ? TRUE : FALSE;
	pstate->comment_handling      = comment_handling;
	pstate->comment_string        = comment_string;
	pstate->comment_string_length = comment_string == NULL ? 0 : strlen(comment_string);

	int single_irs = (pstate->irslen == 1);
	int single_ifs = (pstate->ifslen == 1);

	plrec_reader->pvstate     = (void*)pstate;
	plrec_reader->popen_func  = file_reader_mmap_vopen;
	plrec_reader->pclose_func = file_reader_mmap_vclose;
	if (single_irs) {
		plrec_reader->pprocess_func = single_ifs
			? lrec_reader_mmap_nidx_process_single_irs_single_ifs
			: lrec_reader_mmap_nidx_process_single_irs_multi_ifs;
	} else {
		plrec_reader->pprocess_func = single_ifs
			? lrec_reader_mmap_nidx_process_multi_irs_single_ifs
			: lrec_reader_mmap_nidx_process_multi_irs_multi_ifs;
	}
	plrec_reader->psof_func  = lrec_reader_mmap_nidx_sof;
	plrec_reader->pfree_func = lrec_reader_mmap_nidx_free;

	return plrec_reader;
}
// Releases the reader and its private state. The separator and comment strings are not freed
// here: alloc stored the caller's pointers rather than copies.
static void lrec_reader_mmap_nidx_free(lrec_reader_t* preader) {
	void* pvstate = preader->pvstate;
	free(pvstate);
	free(preader);
}
// Start-of-file hook. This reader is stateless across files, so there is nothing to do.
static void lrec_reader_mmap_nidx_sof(void* pvstate, void* pvhandle) {
	(void)pvstate;
	(void)pvhandle;
}
// ----------------------------------------------------------------
// Process function for one-character IRS and IFS. Returns the next record, or NULL once the
// start-of-line cursor has reached the end of the mapped file.
static lrec_t* lrec_reader_mmap_nidx_process_single_irs_single_ifs(void* pvstate, void* pvhandle, context_t* pctx) {
	file_reader_mmap_state_t* pmmap = pvhandle;
	lrec_reader_mmap_nidx_state_t* pnidx = pvstate;

	if (pmmap->sol >= pmmap->eof)
		return NULL;

	char irs = pnidx->irs[0];
	char ifs = pnidx->ifs[0];
	return lrec_parse_mmap_nidx_single_irs_single_ifs(pmmap, irs, ifs, pnidx, pctx);
}
// Process function for a one-character IRS with a multi-character IFS. Returns the next record,
// or NULL once the start-of-line cursor has reached the end of the mapped file.
static lrec_t* lrec_reader_mmap_nidx_process_single_irs_multi_ifs(void* pvstate, void* pvhandle, context_t* pctx) {
	file_reader_mmap_state_t* pmmap = pvhandle;
	lrec_reader_mmap_nidx_state_t* pnidx = pvstate;

	if (pmmap->sol >= pmmap->eof)
		return NULL;

	char irs = pnidx->irs[0];
	return lrec_parse_mmap_nidx_single_irs_multi_ifs(pmmap, irs, pnidx, pctx);
}
// Process function for a multi-character IRS with a one-character IFS. Returns the next record,
// or NULL once the start-of-line cursor has reached the end of the mapped file. The context is
// not forwarded: the multi-IRS parser does not take one.
static lrec_t* lrec_reader_mmap_nidx_process_multi_irs_single_ifs(void* pvstate, void* pvhandle, context_t* pctx) {
	file_reader_mmap_state_t* pmmap = pvhandle;
	lrec_reader_mmap_nidx_state_t* pnidx = pvstate;

	if (pmmap->sol >= pmmap->eof)
		return NULL;

	char ifs = pnidx->ifs[0];
	return lrec_parse_mmap_nidx_multi_irs_single_ifs(pmmap, ifs, pnidx);
}
// Process function for multi-character IRS and IFS. Returns the next record, or NULL once the
// start-of-line cursor has reached the end of the mapped file. The context is not forwarded:
// the multi-IRS parser does not take one.
static lrec_t* lrec_reader_mmap_nidx_process_multi_irs_multi_ifs(void* pvstate, void* pvhandle, context_t* pctx) {
	file_reader_mmap_state_t* pmmap = pvhandle;
	lrec_reader_mmap_nidx_state_t* pnidx = pvstate;

	if (pmmap->sol >= pmmap->eof)
		return NULL;

	return lrec_parse_mmap_nidx_multi_irs_multi_ifs(pmmap, pnidx);
}
// ----------------------------------------------------------------
// Parses one NIDX record starting at phandle->sol, for one-character IRS and IFS.
//
// Every field is stored under its 1-up positional index. The file bytes are edited in place:
// each separator that ends a field is overwritten with a NUL so that the record can point
// directly into the file data instead of copying. With do_auto_line_term set, a '\r'
// immediately before the IRS is stripped and the detected line ending (LF or CRLF) is reported
// through pctx.
//
// Returns NULL if nothing but comment lines remained; otherwise returns the record, with
// phandle->sol advanced past the consumed input.
//
// Nothing at or beyond phandle->eof is ever read or written: see the end-of-file notes below.
static lrec_t* lrec_parse_mmap_nidx_single_irs_single_ifs(file_reader_mmap_state_t *phandle,
	char irs, char ifs, lrec_reader_mmap_nidx_state_t* pstate, context_t* pctx)
{
	if (pstate->comment_string != NULL)
		skip_over_comment_lines_single_irs(phandle, pstate, irs);
	if (phandle->sol >= phandle->eof)
		return NULL;

	char* line = phandle->sol;
	char* eof = phandle->eof;

	lrec_t* prec = lrec_unbacked_alloc();

	int idx = 0;
	char free_flags = NO_FREE;

	char* p = line;
	if (pstate->allow_repeat_ifs) {
		// Bounded by eof: the remaining input may consist solely of IFS characters.
		while (p < eof && *p == ifs)
			p++;
	}
	char* key = NULL;
	char* value = p;
	int saw_rs = FALSE;
	for ( ; p < eof && *p; ) {
		if (*p == irs) {
			*p = 0;
			if (pstate->do_auto_line_term) {
				if (p > line && p[-1] == '\r') {
					p[-1] = 0;
					context_set_autodetected_crlf(pctx);
				} else {
					context_set_autodetected_lf(pctx);
				}
			}
			phandle->sol = p+1;
			saw_rs = TRUE;
			break;
		} else if (*p == ifs) {
			*p = 0;

			idx++;
			key = low_int_to_string(idx, &free_flags);
			lrec_put(prec, key, value, free_flags);

			p++;
			if (pstate->allow_repeat_ifs) {
				while (p < eof && *p == ifs)
					p++;
			}
			value = p;
		} else {
			p++;
		}
	}

	if (p >= eof) {
		phandle->sol = p+1;
	} else if (!saw_rs) {
		// The scan stopped at a NUL byte inside the file, short of eof and without a record
		// separator. The final value below is taken through eof, so mark the input as consumed.
		// Leaving sol where it was would return this same line again on every subsequent call.
		phandle->sol = eof;
	}
	idx++;

	// With repeated IFS allowed, a trailing IFS leaves an empty final field: drop it. A pointer
	// sitting at eof denotes an empty string and must not be dereferenced.
	if (pstate->allow_repeat_ifs && (value >= eof || *value == 0))
		return prec;

	key = low_int_to_string(idx, &free_flags);

	if (saw_rs) {
		// Easy and simple case: we read until end of line.  We zero-poked the irs to a null character to terminate the
		// C string so it's OK to retain a pointer to that.
		lrec_put(prec, key, value, free_flags);
	} else {
		// Messier case: we read to end of file without seeing end of line.  We can't always zero-poke a null character
		// to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our
		// copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one
		// byte past the page and that will segv us. So the final value is copied out by length instead.
		char* copy = mlr_alloc_string_from_char_range(value, eof - value);
		lrec_put(prec, key, copy, free_flags|FREE_ENTRY_VALUE);
	}

	return prec;
}
// Parses one NIDX record starting at phandle->sol, for a single-character IRS and a
// multi-character IFS (pstate->ifs / pstate->ifslen).
//
// * If a comment string is configured, leading comment lines are skipped first.
// * Fields are split in place: the first byte of each separator is overwritten with a NUL and
//   the record retains pointers into the buffer. Keys are the 1-up field positions.
// * With do_auto_line_term set, a '\r' immediately before the IRS is also zero-poked and the
//   CRLF-vs-LF autodetect result is recorded on pctx.
// * phandle->sol is advanced past the record terminator, or past eof when the last line is
//   unterminated.
//
// Returns NULL if nothing but comment lines remained (so a file ending in comments does not
// produce a spurious record); otherwise the record.
//
// Every separator comparison is guarded by the number of bytes left before phandle->eof, so
// nothing at or beyond eof is read.
static lrec_t* lrec_parse_mmap_nidx_single_irs_multi_ifs(file_reader_mmap_state_t *phandle,
	char irs, lrec_reader_mmap_nidx_state_t* pstate, context_t* pctx)
{
	if (pstate->comment_string != NULL)
		skip_over_comment_lines_single_irs(phandle, pstate, irs);
	// Comment skipping may have consumed the rest of the file.
	if (phandle->sol >= phandle->eof)
		return NULL;

	lrec_t* prec = lrec_unbacked_alloc();

	char* ifs = pstate->ifs;
	int ifslen = pstate->ifslen;

	char* line = phandle->sol;
	int idx = 0;
	char free_flags = NO_FREE;

	char* p = line;
	if (pstate->allow_repeat_ifs) {
		// Leading separators do not introduce empty fields.
		while ((phandle->eof - p) >= ifslen && streqn(p, ifs, ifslen))
			p += ifslen;
	}
	char* key = NULL;
	char* value = p;

	int saw_rs = FALSE;
	for ( ; p < phandle->eof && *p; ) {
		if (*p == irs) {
			*p = 0;
			if (pstate->do_auto_line_term) {
				if (p > line && p[-1] == '\r') {
					p[-1] = 0;
					context_set_autodetected_crlf(pctx);
				} else {
					context_set_autodetected_lf(pctx);
				}
			}
			phandle->sol = p+1;
			saw_rs = TRUE;
			break;
		} else if ((phandle->eof - p) >= ifslen && streqn(p, ifs, ifslen)) {
			*p = 0;
			idx++;
			key = low_int_to_string(idx, &free_flags);
			lrec_put(prec, key, value, free_flags);
			p += ifslen;
			if (pstate->allow_repeat_ifs) {
				while ((phandle->eof - p) >= ifslen && streqn(p, ifs, ifslen))
					p += ifslen;
			}
			value = p;
		} else {
			p++;
		}
	}
	if (p >= phandle->eof)
		phandle->sol = p+1;
	idx++;

	// With repeated-IFS handling, trailing separators do not produce a final empty field.
	// value can only sit at eof when no IRS was seen; treat that as empty without reading it.
	if (pstate->allow_repeat_ifs && (value >= phandle->eof || *value == 0))
		return prec;

	key = low_int_to_string(idx, &free_flags);
	if (saw_rs) {
		// Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate the
		// C string so it's OK to retain a pointer to that.
		lrec_put(prec, key, value, free_flags);
	} else {
		// Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character
		// to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our
		// copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one
		// byte past the page and that will segv us.
		char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value);
		lrec_put(prec, key, copy, free_flags|FREE_ENTRY_VALUE);
	}

	return prec;
}
// Parses one NIDX record starting at phandle->sol, for a multi-character IRS
// (pstate->irs / pstate->irslen) and a single-character IFS.
//
// * If a comment string is configured, leading comment lines are skipped first.
// * Fields are split in place: each IFS byte, and the first byte of the IRS, is overwritten
//   with a NUL and the record retains pointers into the buffer. Keys are the 1-up field
//   positions.
// * phandle->sol is advanced past the record terminator, or past eof when the last line is
//   unterminated.
//
// Returns NULL if nothing but comment lines remained (so a file ending in comments does not
// produce a spurious record); otherwise the record.
//
// Every look-ahead is guarded by phandle->eof, so nothing at or beyond eof is read.
static lrec_t* lrec_parse_mmap_nidx_multi_irs_single_ifs(file_reader_mmap_state_t *phandle,
	char ifs, lrec_reader_mmap_nidx_state_t* pstate)
{
	if (pstate->comment_string != NULL)
		skip_over_comment_lines_multi_irs(phandle, pstate, pstate->irs, pstate->irslen);
	// Comment skipping may have consumed the rest of the file.
	if (phandle->sol >= phandle->eof)
		return NULL;

	lrec_t* prec = lrec_unbacked_alloc();

	char* irs = pstate->irs;
	int irslen = pstate->irslen;

	char* line = phandle->sol;
	int idx = 0;
	char free_flags = NO_FREE;

	char* p = line;
	if (pstate->allow_repeat_ifs) {
		// Leading separators do not introduce empty fields.
		while (p < phandle->eof && *p == ifs)
			p++;
	}
	char* key = NULL;
	char* value = p;

	int saw_rs = FALSE;
	for ( ; p < phandle->eof && *p; ) {
		if ((phandle->eof - p) >= irslen && streqn(p, irs, irslen)) {
			*p = 0;
			phandle->sol = p + irslen;
			saw_rs = TRUE;
			break;
		} else if (*p == ifs) {
			*p = 0;
			idx++;
			key = low_int_to_string(idx, &free_flags);
			lrec_put(prec, key, value, free_flags);
			p++;
			if (pstate->allow_repeat_ifs) {
				while (p < phandle->eof && *p == ifs)
					p++;
			}
			value = p;
		} else {
			p++;
		}
	}
	if (p >= phandle->eof)
		phandle->sol = p+1;
	idx++;

	// With repeated-IFS handling, trailing separators do not produce a final empty field.
	// value can only sit at eof when no IRS was seen; treat that as empty without reading it.
	if (pstate->allow_repeat_ifs && (value >= phandle->eof || *value == 0))
		return prec;

	key = low_int_to_string(idx, &free_flags);
	if (saw_rs) {
		// Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate the
		// C string so it's OK to retain a pointer to that.
		lrec_put(prec, key, value, free_flags);
	} else {
		// Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character
		// to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our
		// copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one
		// byte past the page and that will segv us.
		char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value);
		lrec_put(prec, key, copy, free_flags|FREE_ENTRY_VALUE);
	}

	return prec;
}
// Parses one NIDX record starting at phandle->sol, for a multi-character IRS
// (pstate->irs / pstate->irslen) and a multi-character IFS (pstate->ifs / pstate->ifslen).
//
// * If a comment string is configured, leading comment lines are skipped first.
// * Fields are split in place: the first byte of each separator is overwritten with a NUL and
//   the record retains pointers into the buffer. Keys are the 1-up field positions.
// * phandle->sol is advanced past the record terminator, or past eof when the last line is
//   unterminated.
//
// Returns NULL if nothing but comment lines remained (so a file ending in comments does not
// produce a spurious record); otherwise the record.
//
// Every separator comparison is guarded by the number of bytes left before phandle->eof, so
// nothing at or beyond eof is read.
static lrec_t* lrec_parse_mmap_nidx_multi_irs_multi_ifs(file_reader_mmap_state_t *phandle,
	lrec_reader_mmap_nidx_state_t* pstate)
{
	if (pstate->comment_string != NULL)
		skip_over_comment_lines_multi_irs(phandle, pstate, pstate->irs, pstate->irslen);
	// Comment skipping may have consumed the rest of the file.
	if (phandle->sol >= phandle->eof)
		return NULL;

	lrec_t* prec = lrec_unbacked_alloc();

	char* line = phandle->sol;
	int idx = 0;
	char free_flags = NO_FREE;

	char* ifs = pstate->ifs;
	int ifslen = pstate->ifslen;
	char* irs = pstate->irs;
	int irslen = pstate->irslen;

	char* p = line;
	if (pstate->allow_repeat_ifs) {
		// Leading separators do not introduce empty fields.
		while ((phandle->eof - p) >= ifslen && streqn(p, ifs, ifslen))
			p += ifslen;
	}
	char* key = NULL;
	char* value = p;

	int saw_rs = FALSE;
	for ( ; p < phandle->eof && *p; ) {
		if ((phandle->eof - p) >= irslen && streqn(p, irs, irslen)) {
			*p = 0;
			phandle->sol = p + irslen;
			saw_rs = TRUE;
			break;
		} else if ((phandle->eof - p) >= ifslen && streqn(p, ifs, ifslen)) {
			*p = 0;
			idx++;
			key = low_int_to_string(idx, &free_flags);
			lrec_put(prec, key, value, free_flags);
			p += ifslen;
			if (pstate->allow_repeat_ifs) {
				while ((phandle->eof - p) >= ifslen && streqn(p, ifs, ifslen))
					p += ifslen;
			}
			value = p;
		} else {
			p++;
		}
	}
	if (p >= phandle->eof)
		phandle->sol = p+1;
	idx++;

	// With repeated-IFS handling, trailing separators do not produce a final empty field.
	// value can only sit at eof when no IRS was seen; treat that as empty without reading it.
	if (pstate->allow_repeat_ifs && (value >= phandle->eof || *value == 0))
		return prec;

	key = low_int_to_string(idx, &free_flags);
	if (saw_rs) {
		// Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate the
		// C string so it's OK to retain a pointer to that.
		lrec_put(prec, key, value, free_flags);
	} else {
		// Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character
		// to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our
		// copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one
		// byte past the page and that will segv us.
		char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value);
		lrec_put(prec, key, copy, free_flags|FREE_ENTRY_VALUE);
	}

	return prec;
}
// ----------------------------------------------------------------
// Advances phandle->sol past every consecutive line that begins with the configured comment
// string, for a single-character record separator. When comment handling is PASS_COMMENTS,
// each skipped byte (prefix, body, and terminating IRS if present) is echoed to stdout.
// A final comment line without a terminator is consumed up to eof.
static void skip_over_comment_lines_single_irs(
	file_reader_mmap_state_t *phandle,
	lrec_reader_mmap_nidx_state_t* pstate,
	char irs)
{
	for (;;) {
		// Stop at the first line that is too short to hold, or does not start with, the prefix.
		if ((phandle->eof - phandle->sol) < pstate->comment_string_length)
			return;
		if (!streqn(phandle->sol, pstate->comment_string, pstate->comment_string_length))
			return;

		int echo = (pstate->comment_handling == PASS_COMMENTS);

		// The comment prefix itself.
		if (echo) {
			for (int i = 0; i < pstate->comment_string_length; i++)
				fputc(phandle->sol[i], stdout);
		}
		phandle->sol += pstate->comment_string_length;

		// The remainder of the line, up to but not including the IRS.
		for ( ; phandle->sol < phandle->eof && *phandle->sol != irs; phandle->sol++) {
			if (echo)
				fputc(*phandle->sol, stdout);
		}

		// The IRS, unless the line ran into end of file.
		if (phandle->sol < phandle->eof && *phandle->sol == irs) {
			if (echo)
				fputc(*phandle->sol, stdout);
			phandle->sol++;
		}
	}
}
// Advances phandle->sol past every consecutive line that begins with the configured comment
// string, for a multi-character record separator (irs / irslen). When comment handling is
// PASS_COMMENTS, each skipped byte (prefix, body, and terminating IRS if present) is echoed
// to stdout.
//
// A final comment line without a terminator is consumed all the way to eof. The body scan must
// therefore run while any byte remains, not merely while at least irslen bytes remain:
// otherwise the last irslen-1 bytes of such a line would be neither echoed nor skipped, and
// the caller would parse them as record data.
static void skip_over_comment_lines_multi_irs(
	file_reader_mmap_state_t *phandle,
	lrec_reader_mmap_nidx_state_t* pstate,
	char* irs,
	int irslen)
{
	while ((phandle->eof - phandle->sol) >= pstate->comment_string_length
		&& streqn(phandle->sol, pstate->comment_string, pstate->comment_string_length))
	{
		int echo = (pstate->comment_handling == PASS_COMMENTS);

		// The comment prefix itself.
		if (echo) {
			for (int i = 0; i < pstate->comment_string_length; i++)
				fputc(phandle->sol[i], stdout);
		}
		phandle->sol += pstate->comment_string_length;

		// The remainder of the line: stop at the IRS if one is fully present, else at eof.
		int saw_irs = FALSE;
		while (phandle->sol < phandle->eof) {
			if ((phandle->eof - phandle->sol) >= irslen && streqn(phandle->sol, irs, irslen)) {
				saw_irs = TRUE;
				break;
			}
			if (echo)
				fputc(*phandle->sol, stdout);
			phandle->sol++;
		}

		// The IRS, unless the line ran into end of file.
		if (saw_irs) {
			if (echo) {
				for (int i = 0; i < irslen; i++)
					fputc(phandle->sol[i], stdout);
			}
			phandle->sol += irslen;
		}
	}
}

View file

@ -1,529 +0,0 @@
// ================================================================
// Note: there are multiple process methods with a lot of code duplication.
// This is intentional. Much of Miller's measured processing time is in the
// lrec-reader process methods. This is code which needs to execute on every
// byte of input and even moving a single runtime if-statement into a
// function-pointer assignment at alloc time can have noticeable effects on
// performance (5-10% in some cases).
// ================================================================
#include <stdio.h>
#include <stdlib.h>
#include "cli/comment_handling.h"
#include "lib/mlr_globals.h"
#include "lib/mlrutil.h"
#include "input/file_reader_mmap.h"
#include "input/lrec_readers.h"
// Per-reader state for the mmap-backed XTAB reader; filled in by lrec_reader_mmap_xtab_alloc.
typedef struct _lrec_reader_mmap_xtab_state_t {
	// Field (line) separator. Replaced by "\n" when the caller asks for "auto".
	char* ifs;
	// Key/value pair separator within a line.
	char* ips;
	// Byte lengths of ifs and ips; these select the single- vs multi-character code paths.
	int ifslen;
	int ipslen;
	// When true, a run of consecutive pair separators is treated as one.
	int allow_repeat_ips;
	// True only for "auto" IFS: lines end in LF or CRLF, and which one is seen is reported via the context.
	int do_auto_line_term;
} lrec_reader_mmap_xtab_state_t;
// Forward declarations. There is one process function and one parse function per combination
// of single-/multi-character IFS and IPS, so that the per-byte loops carry no runtime
// dispatch on separator length.
static void lrec_reader_mmap_xtab_free(lrec_reader_t* preader);
static void lrec_reader_mmap_xtab_sof(void* pvstate, void* pvhandle);
static lrec_t* lrec_reader_mmap_xtab_process_single_ifs_single_ips(void* pvstate, void* pvhandle, context_t* pctx);
static lrec_t* lrec_reader_mmap_xtab_process_single_ifs_multi_ips(void* pvstate, void* pvhandle, context_t* pctx);
static lrec_t* lrec_reader_mmap_xtab_process_multi_ifs_single_ips(void* pvstate, void* pvhandle, context_t* pctx);
static lrec_t* lrec_reader_mmap_xtab_process_multi_ifs_multi_ips(void* pvstate, void* pvhandle, context_t* pctx);
static lrec_t* lrec_parse_mmap_xtab_single_ifs_single_ips(file_reader_mmap_state_t* phandle, char ifs, char ips,
	lrec_reader_mmap_xtab_state_t* pstate, context_t* pctx);
static lrec_t* lrec_parse_mmap_xtab_single_ifs_multi_ips(file_reader_mmap_state_t* phandle, char ifs,
	lrec_reader_mmap_xtab_state_t* pstate, context_t* pctx);
static lrec_t* lrec_parse_mmap_xtab_multi_ifs_single_ips(file_reader_mmap_state_t* phandle, char ips,
	lrec_reader_mmap_xtab_state_t* pstate);
static lrec_t* lrec_parse_mmap_xtab_multi_ifs_multi_ips(file_reader_mmap_state_t* phandle,
	lrec_reader_mmap_xtab_state_t* pstate);
// ----------------------------------------------------------------
// Allocates an mmap-backed XTAB record reader.
//
//   ifs              field (line) separator, or "auto" for LF/CRLF autodetection
//   ips              key/value pair separator
//   allow_repeat_ips whether consecutive pair separators collapse into one
//   comment_handling unused here: this reader does not support comments
//   comment_string   must be NULL; a non-NULL value is an internal error and exits the process
//
// The process function is selected once here, from the separator lengths, so the per-byte
// parsing loops need no runtime dispatch. The caller releases the reader through its
// pfree_func.
lrec_reader_t* lrec_reader_mmap_xtab_alloc(char* ifs, char* ips, int allow_repeat_ips,
	comment_handling_t comment_handling, char* comment_string)
{
	// lrec_reader_alloc should have shunted away from us in this case.
	// (Interleaving blank-line handling, line-term autodetect, and comment-handling all in
	// the byte-at-a-time logic turned out to be a mess in this file. In the stdio implementation,
	// by contrast, it falls out rather easily.)
	if (comment_string != NULL) {
		fprintf(stderr, "%s: internal coding error detected in file %s at line %d.\n",
			MLR_GLOBALS.bargv0, __FILE__, __LINE__);
		exit(1);
	}

	lrec_reader_t* plrec_reader = mlr_malloc_or_die(sizeof(lrec_reader_t));
	lrec_reader_mmap_xtab_state_t* pstate = mlr_malloc_or_die(sizeof(lrec_reader_mmap_xtab_state_t));

	// Auto means either lines end in "\n" or "\r\n" (LF or CRLF). In either case the final
	// character is "\n", so the parser splits on "\n" and then checks whether the byte before
	// it is '\r'.
	int is_auto = streq(ifs, "auto");

	pstate->ifs               = is_auto ? "\n" : ifs;
	pstate->ips               = ips;
	pstate->ifslen            = strlen(pstate->ifs);
	pstate->ipslen            = strlen(pstate->ips);
	pstate->allow_repeat_ips  = allow_repeat_ips;
	pstate->do_auto_line_term = is_auto ? TRUE : FALSE;

	int single_ifs = (pstate->ifslen == 1);
	int single_ips = (pstate->ipslen == 1);

	plrec_reader->pvstate     = (void*)pstate;
	plrec_reader->popen_func  = file_reader_mmap_vopen;
	plrec_reader->pclose_func = file_reader_mmap_vclose;
	if (single_ifs) {
		plrec_reader->pprocess_func = single_ips
			? lrec_reader_mmap_xtab_process_single_ifs_single_ips
			: lrec_reader_mmap_xtab_process_single_ifs_multi_ips;
	} else {
		plrec_reader->pprocess_func = single_ips
			? lrec_reader_mmap_xtab_process_multi_ifs_single_ips
			: lrec_reader_mmap_xtab_process_multi_ifs_multi_ips;
	}
	plrec_reader->psof_func  = lrec_reader_mmap_xtab_sof;
	plrec_reader->pfree_func = lrec_reader_mmap_xtab_free;

	return plrec_reader;
}
// ----------------------------------------------------------------
// Releases the reader's state block and then the reader itself. The separator strings the
// state points at are not freed here.
static void lrec_reader_mmap_xtab_free(lrec_reader_t* preader) {
	void* pstate = preader->pvstate;
	free(pstate);
	free(preader);
}
// Start-of-file hook installed as the reader's psof_func. Intentionally a no-op.
static void lrec_reader_mmap_xtab_sof(void* pvstate, void* pvhandle) {
	(void)pvstate;
	(void)pvhandle;
}
// ----------------------------------------------------------------
// Process-function for single-character IFS and IPS. Returns NULL once the start-of-line
// cursor has reached end of file; otherwise parses the next record, passing the first (only)
// byte of each separator.
static lrec_t* lrec_reader_mmap_xtab_process_single_ifs_single_ips(void* pvstate, void* pvhandle, context_t* pctx) {
	lrec_reader_mmap_xtab_state_t* pxtab_state = pvstate;
	file_reader_mmap_state_t* pmmap = pvhandle;
	if (pmmap->sol < pmmap->eof) {
		char ifs = pxtab_state->ifs[0];
		char ips = pxtab_state->ips[0];
		return lrec_parse_mmap_xtab_single_ifs_single_ips(pmmap, ifs, ips, pxtab_state, pctx);
	}
	return NULL;
}
// Process-function for single-character IFS with multi-character IPS. Returns NULL once the
// start-of-line cursor has reached end of file; otherwise parses the next record.
static lrec_t* lrec_reader_mmap_xtab_process_single_ifs_multi_ips(void* pvstate, void* pvhandle, context_t* pctx) {
	lrec_reader_mmap_xtab_state_t* pxtab_state = pvstate;
	file_reader_mmap_state_t* pmmap = pvhandle;
	int at_eof = (pmmap->sol >= pmmap->eof);
	return at_eof
		? NULL
		: lrec_parse_mmap_xtab_single_ifs_multi_ips(pmmap, pxtab_state->ifs[0], pxtab_state, pctx);
}
// Process-function for multi-character IFS with single-character IPS. Returns NULL once the
// start-of-line cursor has reached end of file; otherwise parses the next record.
static lrec_t* lrec_reader_mmap_xtab_process_multi_ifs_single_ips(void* pvstate, void* pvhandle, context_t* pctx) {
	lrec_reader_mmap_xtab_state_t* pxtab_state = pvstate;
	file_reader_mmap_state_t* pmmap = pvhandle;
	int at_eof = (pmmap->sol >= pmmap->eof);
	return at_eof
		? NULL
		: lrec_parse_mmap_xtab_multi_ifs_single_ips(pmmap, pxtab_state->ips[0], pxtab_state);
}
// Process-function for multi-character IFS and IPS. Parses the next record while data
// remains; returns NULL once the start-of-line cursor has reached end of file.
static lrec_t* lrec_reader_mmap_xtab_process_multi_ifs_multi_ips(void* pvstate, void* pvhandle, context_t* pctx) {
	file_reader_mmap_state_t* pmmap = pvhandle;
	if (pmmap->sol < pmmap->eof) {
		lrec_reader_mmap_xtab_state_t* pxtab_state = pvstate;
		return lrec_parse_mmap_xtab_multi_ifs_multi_ips(pmmap, pxtab_state);
	}
	return NULL;
}
// ----------------------------------------------------------------
// Parses one XTAB record (a run of "key<ips>value" lines ended by a blank line or end of
// file) starting at phandle->sol, for single-character IFS and IPS.
//
// * Leading blank lines are skipped. In auto line-term mode a blank line is "\n" or "\r\n",
//   and the autodetect result is recorded on pctx; otherwise it is a lone IFS byte.
// * Each line is split in place at the first IPS: separator bytes are overwritten with NUL
//   and the record retains pointers into the buffer. A line with no IPS gets an empty value.
// * phandle->sol is advanced line by line; parsing stops, without consuming it, at the blank
//   line that ends the record.
//
// Returns NULL if only blank lines remained or no field was produced; otherwise the record.
//
// The last line of the file may lack a terminator. Nothing in it can be NUL-terminated in
// place, so whichever of key/value runs up to eof is copied out. The repeated-IPS skip is
// bounded by eof for the same reason.
static lrec_t* lrec_parse_mmap_xtab_single_ifs_single_ips(file_reader_mmap_state_t* phandle, char ifs, char ips,
	lrec_reader_mmap_xtab_state_t* pstate, context_t* pctx)
{
	if (pstate->do_auto_line_term) {
		// Skip over otherwise empty LF-only or CRLF-only lines.
		while (phandle->sol < phandle->eof) {
			if (*phandle->sol == '\n') {
				context_set_autodetected_lf(pctx);
				phandle->sol += 1;
			} else if (*phandle->sol == '\r') {
				char* q = phandle->sol + 1;
				if (q < phandle->eof && *q == '\n') {
					context_set_autodetected_crlf(pctx);
					phandle->sol += 2;
				} else {
					phandle->sol += 1;
				}
			} else {
				break;
			}
		}
	} else {
		// Skip over otherwise empty IFS-only lines
		while (phandle->sol < phandle->eof && *phandle->sol == ifs) {
			phandle->sol++;
		}
	}

	if (phandle->sol >= phandle->eof)
		return NULL;

	lrec_t* prec = lrec_unbacked_alloc();

	// Loop over fields, one per line
	while (TRUE) {
		if (phandle->sol >= phandle->eof)
			break;

		char* line  = phandle->sol;
		char* key   = line;
		char* value = "";
		char* p;
		int saw_ips_in_field = FALSE;

		// Construct one field
		int saw_eol = FALSE;
		for (p = line; p < phandle->eof && *p; ) {
			if (*p == ifs) {
				*p = 0;
				if (pstate->do_auto_line_term) {
					if (p > line && p[-1] == '\r') {
						p[-1] = 0;
						context_set_autodetected_crlf(pctx);
					} else {
						context_set_autodetected_lf(pctx);
					}
				}
				phandle->sol = p+1;
				saw_eol = TRUE;
				break;
			} else if (!saw_ips_in_field && *p == ips) {
				// Only the first IPS on a line splits key from value.
				saw_ips_in_field = TRUE;
				key = line;
				*p = 0;
				p++;
				if (pstate->allow_repeat_ips) {
					while (p < phandle->eof && *p == ips)
						p++;
				}
				value = p;
			} else {
				p++;
			}
		}
		if (p >= phandle->eof)
			phandle->sol = p+1;

		if (saw_eol) {
			// Easy and simple case: we read until end of line. We zero-poked the ifs to a null character to terminate
			// the C string so it's OK to retain a pointer to that.
			lrec_put(prec, key, value, NO_FREE);
		} else if (saw_ips_in_field) {
			// Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null
			// character to terminate the C string: if the file size is not a multiple of the OS page size it'll work
			// (it's our copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at
			// EOF is one byte past the page and that will segv us.
			char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value);
			lrec_put(prec, key, copy, FREE_ENTRY_VALUE);
		} else {
			// Key-only final line with no terminator: value is still the "" literal, which is not part of
			// the mapped buffer, so it must not enter pointer arithmetic against eof. It is the key that
			// runs up to eof unterminated, so copy the key instead.
			char* key_copy = mlr_alloc_string_from_char_range(line, phandle->eof - line);
			lrec_put(prec, key_copy, value, FREE_ENTRY_KEY);
		}

		if (phandle->sol >= phandle->eof)
			break;

		// A blank line ends the record; leave it for the next call to skip.
		if (pstate->do_auto_line_term) {
			char* s = phandle->sol;
			char* q = phandle->sol + 1;
			if (*s == '\n')
				break;
			if (q < phandle->eof && *s == '\r' && *q == '\n')
				break;
		} else {
			if (*phandle->sol == ifs)
				break;
		}
	}

	if (prec->field_count == 0) {
		lrec_free(prec);
		return NULL;
	} else {
		return prec;
	}
}
// Parses one XTAB record (a run of "key<ips>value" lines ended by a blank line or end of
// file) starting at phandle->sol, for a single-character IFS and a multi-character IPS
// (pstate->ips / pstate->ipslen).
//
// * Leading blank lines are skipped. In auto line-term mode a blank line is "\n" or "\r\n",
//   and the autodetect result is recorded on pctx; otherwise it is a lone IFS byte.
// * Each line is split in place at the first IPS: separator bytes are overwritten with NUL
//   and the record retains pointers into the buffer. A line with no IPS gets an empty value.
// * phandle->sol is advanced line by line; parsing stops, without consuming it, at the blank
//   line that ends the record. In auto line-term mode that blank line may be "\r\n" as well
//   as "\n", matching the blank-line skip at the top.
//
// Returns NULL if only blank lines remained or no field was produced; otherwise the record.
//
// The last line of the file may lack a terminator. Nothing in it can be NUL-terminated in
// place, so whichever of key/value runs up to eof is copied out. IPS comparisons are guarded
// by the number of bytes left before eof for the same reason.
static lrec_t* lrec_parse_mmap_xtab_single_ifs_multi_ips(file_reader_mmap_state_t* phandle, char ifs,
	lrec_reader_mmap_xtab_state_t* pstate, context_t* pctx)
{
	if (pstate->do_auto_line_term) {
		// Skip over otherwise empty LF-only or CRLF-only lines.
		while (phandle->sol < phandle->eof) {
			if (*phandle->sol == '\n') {
				context_set_autodetected_lf(pctx);
				phandle->sol += 1;
			} else if (*phandle->sol == '\r') {
				char* q = phandle->sol + 1;
				if (q < phandle->eof && *q == '\n') {
					context_set_autodetected_crlf(pctx);
					phandle->sol += 2;
				} else {
					phandle->sol += 1;
				}
			} else {
				break;
			}
		}
	} else {
		// Skip over otherwise empty IFS-only lines.
		while (phandle->sol < phandle->eof && *phandle->sol == ifs)
			phandle->sol++;
	}

	if (phandle->sol >= phandle->eof)
		return NULL;

	char* ips = pstate->ips;
	int ipslen = pstate->ipslen;

	lrec_t* prec = lrec_unbacked_alloc();

	// Loop over fields, one per line
	while (TRUE) {
		if (phandle->sol >= phandle->eof)
			break;

		char* line  = phandle->sol;
		char* key   = line;
		char* value = "";
		char* p;
		int saw_ips_in_field = FALSE;

		// Construct one field
		int saw_eol = FALSE;
		for (p = line; p < phandle->eof && *p; ) {
			if (*p == ifs) {
				*p = 0;
				if (pstate->do_auto_line_term) {
					if (p > line && p[-1] == '\r') {
						p[-1] = 0;
						context_set_autodetected_crlf(pctx);
					} else {
						context_set_autodetected_lf(pctx);
					}
				}
				phandle->sol = p+1;
				saw_eol = TRUE;
				break;
			} else if (!saw_ips_in_field && (phandle->eof - p) >= ipslen && streqn(p, ips, ipslen)) {
				// Only the first IPS on a line splits key from value.
				saw_ips_in_field = TRUE;
				key = line;
				*p = 0;
				p += ipslen;
				if (pstate->allow_repeat_ips) {
					while ((phandle->eof - p) >= ipslen && streqn(p, ips, ipslen))
						p += ipslen;
				}
				value = p;
			} else {
				p++;
			}
		}
		if (p >= phandle->eof)
			phandle->sol = p+1;

		if (saw_eol) {
			// Easy and simple case: we read until end of line. We zero-poked the ifs to a null character to terminate
			// the C string so it's OK to retain a pointer to that.
			lrec_put(prec, key, value, NO_FREE);
		} else if (saw_ips_in_field) {
			// Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null
			// character to terminate the C string: if the file size is not a multiple of the OS page size it'll work
			// (it's our copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at
			// EOF is one byte past the page and that will segv us.
			char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value);
			lrec_put(prec, key, copy, FREE_ENTRY_VALUE);
		} else {
			// Key-only final line with no terminator: value is still the "" literal, which is not part of
			// the mapped buffer, so it must not enter pointer arithmetic against eof. It is the key that
			// runs up to eof unterminated, so copy the key instead.
			char* key_copy = mlr_alloc_string_from_char_range(line, phandle->eof - line);
			lrec_put(prec, key_copy, value, FREE_ENTRY_KEY);
		}

		if (phandle->sol >= phandle->eof)
			break;

		// A blank line ends the record; leave it for the next call to skip.
		if (pstate->do_auto_line_term) {
			char* s = phandle->sol;
			char* q = phandle->sol + 1;
			if (*s == '\n')
				break;
			if (q < phandle->eof && *s == '\r' && *q == '\n')
				break;
		} else {
			if (*phandle->sol == ifs)
				break;
		}
	}

	if (prec->field_count == 0) {
		lrec_free(prec);
		return NULL;
	} else {
		return prec;
	}
}
// Parses one XTAB record (a run of "key<ips>value" lines ended by a blank line or end of
// file) starting at phandle->sol, for a multi-character IFS (pstate->ifs / pstate->ifslen)
// and a single-character IPS.
//
// * Leading blank lines (bare IFS sequences) are skipped.
// * Each line is split in place at the first IPS: the IPS byte and the first byte of the IFS
//   are overwritten with NUL and the record retains pointers into the buffer. A line with no
//   IPS gets an empty value.
// * phandle->sol is advanced line by line; parsing stops, without consuming it, at the blank
//   line that ends the record.
//
// Returns NULL if only blank lines remained or no field was produced; otherwise the record.
//
// The last line of the file may lack a terminator. Nothing in it can be NUL-terminated in
// place, so whichever of key/value runs up to eof is copied out. Separator look-aheads are
// guarded by eof for the same reason.
static lrec_t* lrec_parse_mmap_xtab_multi_ifs_single_ips(file_reader_mmap_state_t* phandle, char ips,
	lrec_reader_mmap_xtab_state_t* pstate)
{
	char* ifs = pstate->ifs;
	int ifslen = pstate->ifslen;

	// Skip blank lines
	while (phandle->eof - phandle->sol >= ifslen && streqn(phandle->sol, ifs, ifslen)) {
		phandle->sol += ifslen;
	}

	if (phandle->sol >= phandle->eof)
		return NULL;

	lrec_t* prec = lrec_unbacked_alloc();

	// Loop over fields, one per line
	while (TRUE) {
		if (phandle->sol >= phandle->eof)
			break;

		char* line  = phandle->sol;
		char* key   = line;
		char* value = "";
		char* p;
		int saw_ips_in_field = FALSE;

		// Construct one field
		int saw_eol = FALSE;
		for (p = line; p < phandle->eof && *p; ) {
			if ((phandle->eof - p) >= ifslen && streqn(p, ifs, ifslen)) {
				*p = 0;
				phandle->sol = p + ifslen;
				saw_eol = TRUE;
				break;
			} else if (!saw_ips_in_field && *p == ips) {
				// Only the first IPS on a line splits key from value.
				saw_ips_in_field = TRUE;
				key = line;
				*p = 0;
				p++;
				if (pstate->allow_repeat_ips) {
					while (p < phandle->eof && *p == ips)
						p++;
				}
				value = p;
			} else {
				p++;
			}
		}
		if (p >= phandle->eof)
			phandle->sol = p+1;

		if (saw_eol) {
			// Easy and simple case: we read until end of line. We zero-poked the ifs to a null character to terminate
			// the C string so it's OK to retain a pointer to that.
			lrec_put(prec, key, value, NO_FREE);
		} else if (saw_ips_in_field) {
			// Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null
			// character to terminate the C string: if the file size is not a multiple of the OS page size it'll work
			// (it's our copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at
			// EOF is one byte past the page and that will segv us.
			char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value);
			lrec_put(prec, key, copy, FREE_ENTRY_VALUE);
		} else {
			// Key-only final line with no terminator: value is still the "" literal, which is not part of
			// the mapped buffer, so it must not enter pointer arithmetic against eof. It is the key that
			// runs up to eof unterminated, so copy the key instead.
			char* key_copy = mlr_alloc_string_from_char_range(line, phandle->eof - line);
			lrec_put(prec, key_copy, value, FREE_ENTRY_KEY);
		}

		if (phandle->sol >= phandle->eof)
			break;
		// A blank line ends the record; leave it for the next call to skip.
		if ((phandle->eof - phandle->sol) >= ifslen && streqn(phandle->sol, ifs, ifslen))
			break;
	}

	if (prec->field_count == 0) {
		lrec_free(prec);
		return NULL;
	} else {
		return prec;
	}
}
// Parses one XTAB record (a run of "key<ips>value" lines ended by a blank line or end of
// file) starting at phandle->sol, for a multi-character IFS (pstate->ifs / pstate->ifslen)
// and a multi-character IPS (pstate->ips / pstate->ipslen).
//
// * Leading blank lines (bare IFS sequences) are skipped.
// * Each line is split in place at the first IPS: the first byte of each separator is
//   overwritten with NUL and the record retains pointers into the buffer. A line with no IPS
//   gets an empty value.
// * phandle->sol is advanced line by line; parsing stops, without consuming it, at the blank
//   line that ends the record.
//
// Returns NULL if only blank lines remained or no field was produced; otherwise the record.
//
// The last line of the file may lack a terminator. Nothing in it can be NUL-terminated in
// place, so whichever of key/value runs up to eof is copied out. Separator comparisons are
// guarded by the number of bytes left before eof for the same reason.
static lrec_t* lrec_parse_mmap_xtab_multi_ifs_multi_ips(file_reader_mmap_state_t* phandle,
	lrec_reader_mmap_xtab_state_t* pstate)
{
	char* ips = pstate->ips;
	int ipslen = pstate->ipslen;
	char* ifs = pstate->ifs;
	int ifslen = pstate->ifslen;

	// Skip blank lines
	while (phandle->eof - phandle->sol >= ifslen && streqn(phandle->sol, ifs, ifslen)) {
		phandle->sol += ifslen;
	}

	if (phandle->sol >= phandle->eof)
		return NULL;

	lrec_t* prec = lrec_unbacked_alloc();

	// Loop over fields, one per line
	while (TRUE) {
		if (phandle->sol >= phandle->eof)
			break;

		char* line  = phandle->sol;
		char* key   = line;
		char* value = "";
		char* p;
		int saw_ips_in_field = FALSE;

		// Construct one field
		int saw_eol = FALSE;
		for (p = line; p < phandle->eof && *p; ) {
			if ((phandle->eof - p) >= ifslen && streqn(p, ifs, ifslen)) {
				*p = 0;
				phandle->sol = p + ifslen;
				saw_eol = TRUE;
				break;
			} else if (!saw_ips_in_field && (phandle->eof - p) >= ipslen && streqn(p, ips, ipslen)) {
				// Only the first IPS on a line splits key from value.
				saw_ips_in_field = TRUE;
				key = line;
				*p = 0;
				p += ipslen;
				if (pstate->allow_repeat_ips) {
					while ((phandle->eof - p) >= ipslen && streqn(p, ips, ipslen))
						p += ipslen;
				}
				value = p;
			} else {
				p++;
			}
		}
		if (p >= phandle->eof)
			phandle->sol = p+1;

		if (saw_eol) {
			// Easy and simple case: we read until end of line. We zero-poked the ifs to a null character to terminate
			// the C string so it's OK to retain a pointer to that.
			lrec_put(prec, key, value, NO_FREE);
		} else if (saw_ips_in_field) {
			// Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null
			// character to terminate the C string: if the file size is not a multiple of the OS page size it'll work
			// (it's our copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at
			// EOF is one byte past the page and that will segv us.
			char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value);
			lrec_put(prec, key, copy, FREE_ENTRY_VALUE);
		} else {
			// Key-only final line with no terminator: value is still the "" literal, which is not part of
			// the mapped buffer, so it must not enter pointer arithmetic against eof. It is the key that
			// runs up to eof unterminated, so copy the key instead.
			char* key_copy = mlr_alloc_string_from_char_range(line, phandle->eof - line);
			lrec_put(prec, key_copy, value, FREE_ENTRY_KEY);
		}

		if (phandle->sol >= phandle->eof)
			break;
		// A blank line ends the record; leave it for the next call to skip.
		if ((phandle->eof - phandle->sol) >= ifslen && streqn(phandle->sol, ifs, ifslen))
			break;
	}

	if (prec->field_count == 0) {
		lrec_free(prec);
		return NULL;
	} else {
		return prec;
	}
}

View file

@ -348,10 +348,9 @@ static int lrec_reader_stdio_csv_get_fields(lrec_reader_stdio_csv_state_t* pstat
if (pfr_peek_char(pfr) == (char)EOF) // char defaults to unsigned on some platforms
return FALSE;
// Strip the UTF-8 BOM, if any. This is MUCH simpler for mmap, and for stdio on files. For mmap
// we can test the first 3 bytes, then skip past them or not. For stdio on files we can fread
// the first 3 bytes, then rewind the fp if they're not the UTF-8 BOM. But for stdio on stdin
// (which is the primary reason we support stdio in Miller), we cannot rewind: stdin is not
// Strip the UTF-8 BOM, if any. This is MUCH simpler for mmap, and for stdio on files. For mmap we can test the
// first 3 bytes, then skip past them or not. For stdio on files we can fread the first 3 bytes, then rewind the fp
// if they're not the UTF-8 BOM. But for stdio on stdin, we cannot rewind: stdin is not
// rewindable.
if (is_header) {
pfr_buffer_by(pfr, UTF8_BOM_LENGTH);

View file

@ -8,10 +8,11 @@
// ================================================================
// ================================================================
// This has at present a lot of code duplication with lrec_reader_mmap_json.
// This is because we read the entire input file into memory and get a pointer
// to it, which is a lot like mmap. At some future point we may implement a
// streaming JSON parser at which point the two files would diverge.
// Note: this is a non-streaming JSON reader which reads the entire input file
// into memory and gets a pointer to it. At some future point we may implement
// a streaming JSON parser at which point this would change dramatically.
//
// See also https://github.com/johnkerl/miller/issues/99
// ================================================================
#include <stdio.h>

View file

@ -9,50 +9,24 @@ lrec_reader_t* lrec_reader_alloc(cli_reader_opts_t* popts) {
generator_opts_t* pgopts = &popts->generator_opts;
return lrec_reader_gen_alloc(pgopts->field_name, pgopts->start, pgopts->stop, pgopts->step);
} else if (streq(popts->ifile_fmt, "dkvp")) {
if (popts->use_mmap_for_read)
return lrec_reader_mmap_dkvp_alloc(popts->irs, popts->ifs, popts->ips, popts->allow_repeat_ifs,
popts->comment_handling, popts->comment_string);
else
return lrec_reader_stdio_dkvp_alloc(popts->irs, popts->ifs, popts->ips, popts->allow_repeat_ifs,
popts->comment_handling, popts->comment_string);
return lrec_reader_stdio_dkvp_alloc(popts->irs, popts->ifs, popts->ips, popts->allow_repeat_ifs,
popts->comment_handling, popts->comment_string);
} else if (streq(popts->ifile_fmt, "csv")) {
if (popts->use_mmap_for_read)
return lrec_reader_mmap_csv_alloc(popts->irs, popts->ifs, popts->use_implicit_csv_header,
popts->allow_ragged_csv_input, popts->comment_handling, popts->comment_string);
else
return lrec_reader_stdio_csv_alloc(popts->irs, popts->ifs, popts->use_implicit_csv_header,
popts->allow_ragged_csv_input, popts->comment_handling, popts->comment_string);
return lrec_reader_stdio_csv_alloc(popts->irs, popts->ifs, popts->use_implicit_csv_header,
popts->allow_ragged_csv_input, popts->comment_handling, popts->comment_string);
} else if (streq(popts->ifile_fmt, "csvlite")) {
if (popts->use_mmap_for_read)
return lrec_reader_mmap_csvlite_alloc(popts->irs, popts->ifs, popts->allow_repeat_ifs,
popts->use_implicit_csv_header, popts->allow_ragged_csv_input, popts->comment_handling,
popts->comment_string);
else
return lrec_reader_stdio_csvlite_alloc(popts->irs, popts->ifs, popts->allow_repeat_ifs,
popts->use_implicit_csv_header, popts->allow_ragged_csv_input, popts->comment_handling,
popts->comment_string);
return lrec_reader_stdio_csvlite_alloc(popts->irs, popts->ifs, popts->allow_repeat_ifs,
popts->use_implicit_csv_header, popts->allow_ragged_csv_input, popts->comment_handling,
popts->comment_string);
} else if (streq(popts->ifile_fmt, "nidx")) {
if (popts->use_mmap_for_read)
return lrec_reader_mmap_nidx_alloc(popts->irs, popts->ifs, popts->allow_repeat_ifs,
popts->comment_handling, popts->comment_string);
else
return lrec_reader_stdio_nidx_alloc(popts->irs, popts->ifs, popts->allow_repeat_ifs,
popts->comment_handling, popts->comment_string);
return lrec_reader_stdio_nidx_alloc(popts->irs, popts->ifs, popts->allow_repeat_ifs,
popts->comment_handling, popts->comment_string);
} else if (streq(popts->ifile_fmt, "xtab")) {
// Use stdio-xtab for comment handling; not supported in the mmap-xtab reader.
if (popts->use_mmap_for_read && popts->comment_string == NULL)
return lrec_reader_mmap_xtab_alloc(popts->ifs, popts->ips, popts->allow_repeat_ips,
popts->comment_handling, popts->comment_string);
else
return lrec_reader_stdio_xtab_alloc(popts->ifs, popts->ips, popts->allow_repeat_ips,
popts->comment_handling, popts->comment_string);
return lrec_reader_stdio_xtab_alloc(popts->ifs, popts->ips, popts->allow_repeat_ips,
popts->comment_handling, popts->comment_string);
} else if (streq(popts->ifile_fmt, "json")) {
if (popts->use_mmap_for_read)
return lrec_reader_mmap_json_alloc(popts->input_json_flatten_separator,
popts->json_array_ingest, popts->irs, popts->comment_handling, popts->comment_string);
else
return lrec_reader_stdio_json_alloc(popts->input_json_flatten_separator,
popts->json_array_ingest, popts->irs, popts->comment_handling, popts->comment_string);
return lrec_reader_stdio_json_alloc(popts->input_json_flatten_separator,
popts->json_array_ingest, popts->irs, popts->comment_handling, popts->comment_string);
} else {
return NULL;
}

View file

@ -24,19 +24,6 @@ lrec_reader_t* lrec_reader_stdio_xtab_alloc(char* ifs, char* ips, int allow_repe
lrec_reader_t* lrec_reader_stdio_json_alloc(char* input_json_flatten_separator, json_array_ingest_t json_array_ingest, char* line_term,
comment_handling_t comment_handling, char* comment_string);
lrec_reader_t* lrec_reader_mmap_csv_alloc(char* irs, char* ifs, int use_implicit_csv_header,
int allow_ragged_csv_input, comment_handling_t comment_handling, char* comment_string);
lrec_reader_t* lrec_reader_mmap_csvlite_alloc(char* irs, char* ifs, int allow_repeat_ifs, int use_implicit_csv_header,
int allow_ragged_csv_input, comment_handling_t comment_handling, char* comment_string);
lrec_reader_t* lrec_reader_mmap_dkvp_alloc(char* irs, char* ifs, char* ips, int allow_repeat_ifs,
comment_handling_t comment_handling, char* comment_string);
lrec_reader_t* lrec_reader_mmap_nidx_alloc(char* irs, char* ifs, int allow_repeat_ifs,
comment_handling_t comment_handling, char* comment_string);
lrec_reader_t* lrec_reader_mmap_xtab_alloc(char* ifs, char* ips, int allow_repeat_ips,
comment_handling_t comment_handling, char* comment_string);
lrec_reader_t* lrec_reader_mmap_json_alloc(char* input_json_flatten_separator, json_array_ingest_t json_array_ingest, char* line_term,
comment_handling_t comment_handling, char* comment_string);
lrec_reader_t* lrec_reader_in_memory_alloc(sllv_t* precords);
// ----------------------------------------------------------------

View file

@ -273,7 +273,7 @@ static int populate_from_nested_array(lrec_t* prec, json_value_t* pjson_array, c
}
// ----------------------------------------------------------------
// * The buffer is an entire JSON blob, e.g. contents from stdio read or mmap; peof-psof is the file size so peof is one
// * The buffer is an entire JSON blob, e.g. contents from stdio read; peof-psof is the file size so peof is one
// byte *after* the last valid file byte.
// * The buffer is not assumed to be null-terminated.
// * Any lines beginning with comment_string are modified by poking space characters up to line_term.

View file

@ -16,7 +16,7 @@
int reference_json_objects_as_lrecs(sllv_t* precords, json_value_t* ptop_level_json, char* flatten_sep,
json_array_ingest_t json_array_ingest);
// * The buffer is an entire JSON blob, e.g. contents from stdio read or mmap; peof-psof is the file size so peof is one
// * The buffer is an entire JSON blob, e.g. contents from stdio read; peof-psof is the file size so peof is one
// byte *after* the last valid file byte.
// * The buffer is not assumed to be null-terminated.
// * Any lines beginning with comment_string are modified by poking space characters up to line_term.

View file

@ -1,112 +0,0 @@
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include "lib/mlr_arch.h"
#include "input/byte_readers.h"
#include "lib/mlr_globals.h"
#include "lib/mlrutil.h"
#if MLR_ARCH_MMAP_ENABLED
// Stand-in buffer used as the start-of-file pointer for zero-length files, which mmap cannot map.
static char empty_buf[1] = { 0 };
#endif
// State for one opened mmap byte reader; created by the open function and stored in the
// reader's pvstate.
typedef struct _mmap_byte_reader_state_t {
	// Heap copy of the path passed to open; released by mmap_byte_reader_free.
	char* filename;
	// Descriptor returned by open(2) on the file.
	int fd;
	// Start of the mapped region, or empty_buf for a zero-length file.
	char* sof;
	// Initialised to sof on open; presumably the current read position -- confirm against the read function.
	char* p;
	// sof plus the file size, i.e. one byte past the last valid byte.
	char* eof;
} mmap_byte_reader_state_t;
// Forward declarations of the open/read/close callbacks that mmap_byte_reader_alloc installs
// on the byte_reader_t.
static int mmap_byte_reader_open_func(struct _byte_reader_t* pbr, char* prepipe, char* filename);
static int mmap_byte_reader_read_func(struct _byte_reader_t* pbr);
static void mmap_byte_reader_close_func(struct _byte_reader_t* pbr, char* prepipe);
// ----------------------------------------------------------------
// Allocates a byte reader wired to the mmap-backed open/read/close callbacks.
// No file is opened here, so the state pointer starts out NULL; the open
// callback fills it in. Release the reader with mmap_byte_reader_free.
byte_reader_t* mmap_byte_reader_alloc() {
	byte_reader_t* preader = mlr_malloc_or_die(sizeof(*preader));

	preader->popen_func  = mmap_byte_reader_open_func;
	preader->pread_func  = mmap_byte_reader_read_func;
	preader->pclose_func = mmap_byte_reader_close_func;
	preader->pvstate     = NULL;

	return preader;
}
// Releases a reader obtained from mmap_byte_reader_alloc, together with any
// per-open state attached to it by the open callback.
//
// The state struct itself is freed here as well as its filename copy: nothing
// else in this file releases it, so freeing only the filename leaked the struct.
// This does not close the file descriptor; that is the close callback's job.
// A NULL pbr is accepted and ignored.
void mmap_byte_reader_free(byte_reader_t* pbr) {
	if (pbr == NULL) {
		return;
	}
	mmap_byte_reader_state_t* pstate = pbr->pvstate;
	if (pstate != NULL) {
		free(pstate->filename); // null-ok semantics
		free(pstate);
	}
	free(pbr);
}
// ----------------------------------------------------------------
// Opens filename read-only, maps its full contents into memory, and attaches a
// fresh state struct to pbr with the read cursor at the start of the file.
//
// * prepipe must be NULL: popen is a stdio construct and cannot be combined
//   with mmap; a non-NULL value is treated as a coding error.
// * Any open/fstat/mmap failure prints a diagnostic and exits the process, so
//   the function only ever returns TRUE.
// * A zero-length file is not mapped (mmap rejects zero-length mappings); the
//   state points at empty_buf instead, so the first read reports EOF.
// * If the reader was opened before, the memory of the previous state struct
//   is released here; previously it was overwritten and leaked on reuse.
// * When mmap support is compiled out, this prints an error and exits.
static int mmap_byte_reader_open_func(struct _byte_reader_t* pbr, char* prepipe, char* filename) {
#if MLR_ARCH_MMAP_ENABLED
	// popen is a stdio construct, not an mmap construct, and it can't be supported here.
	if (prepipe != NULL) {
		fprintf(stderr, "%s: coding error detected in file %s at line %d.\n",
			MLR_GLOBALS.bargv0, __FILE__, __LINE__);
		exit(1);
	}

	// Reuse of the same reader: drop the bookkeeping from the previous open
	// before replacing it. The allocator initializes pvstate to NULL, so this
	// is a no-op on first use.
	mmap_byte_reader_state_t* pprevious = pbr->pvstate;
	if (pprevious != NULL) {
		free(pprevious->filename);
		free(pprevious);
		pbr->pvstate = NULL;
	}

	mmap_byte_reader_state_t* pstate = mlr_malloc_or_die(sizeof(mmap_byte_reader_state_t));
	pstate->filename = mlr_strdup_or_die(filename);

	pstate->fd = open(filename, O_RDONLY);
	if (pstate->fd < 0) {
		perror("open");
		fprintf(stderr, "%s: Couldn't open \"%s\" for read.\n", MLR_GLOBALS.bargv0, filename);
		exit(1);
	}

	// Named statbuf rather than stat so the stat() function is not shadowed.
	struct stat statbuf;
	if (fstat(pstate->fd, &statbuf) < 0) {
		perror("fstat");
		fprintf(stderr, "%s: could not fstat \"%s\"\n", MLR_GLOBALS.bargv0, filename);
		exit(1);
	}

	if (statbuf.st_size == 0) {
		// mmap doesn't allow us to map zero-length files but zero-length files do exist.
		pstate->sof = &empty_buf[0];
	} else {
		pstate->sof = mmap(NULL, (size_t)statbuf.st_size, PROT_READ|PROT_WRITE, MAP_FILE|MAP_PRIVATE,
			pstate->fd, (off_t)0);
		if (pstate->sof == MAP_FAILED) {
			perror("mmap");
			fprintf(stderr, "%s: could not mmap \"%s\"\n", MLR_GLOBALS.bargv0, filename);
			exit(1);
		}
	}

	// eof is one byte past the last valid file byte.
	pstate->eof = pstate->sof + statbuf.st_size;
	pstate->p = pstate->sof;
	pbr->pvstate = pstate;
	return TRUE;
#else
	fprintf(stderr, "%s: mmap is unsupported on this architecture.\n", MLR_GLOBALS.bargv0);
	exit(1);
	return TRUE;
#endif
}
// Returns the next byte of the mapped file and advances the cursor, or EOF
// once the cursor has reached the end of the file. Repeated calls at end of
// file keep returning EOF.
//
// The byte is converted through unsigned char, the same convention getc()
// uses, so the result is always in 0..255. Reading through a plain char*
// would turn a 0xFF data byte into -1 on platforms where char is signed,
// which callers could not tell apart from EOF.
static int mmap_byte_reader_read_func(struct _byte_reader_t* pbr) {
	mmap_byte_reader_state_t* pstate = pbr->pvstate;
	if (pstate->p >= pstate->eof) {
		return EOF;
	}
	int c = (unsigned char)*pstate->p;
	pstate->p++;
	return c;
}
// Releases the OS resources acquired by the open callback: the memory mapping
// (if the file was non-empty) and the file descriptor. A failed munmap or
// close prints a diagnostic and exits the process.
//
// * prepipe is unused; the open callback rejects prepipes for this reader.
// * Calling this on a reader that was never opened (NULL state) is a no-op.
// * The mapping was previously never unmapped, so every opened file stayed
//   mapped for the life of the process. The read callback hands bytes back by
//   value, so nothing returned by this reader points into the mapping.
// * After unmapping, the cursor fields are pointed at empty_buf so that a
//   stray read after close reports EOF instead of touching unmapped memory.
// * The state struct and filename are left for mmap_byte_reader_free.
static void mmap_byte_reader_close_func(struct _byte_reader_t* pbr, char* prepipe) {
	mmap_byte_reader_state_t* pstate = pbr->pvstate;
	if (pstate == NULL) {
		return;
	}

#if MLR_ARCH_MMAP_ENABLED
	// Zero-length files were never mapped: sof points at empty_buf for those.
	if (pstate->sof != &empty_buf[0] && pstate->eof > pstate->sof) {
		if (munmap(pstate->sof, (size_t)(pstate->eof - pstate->sof)) < 0) {
			perror("munmap");
			fprintf(stderr, "%s: munmap error on file \"%s\".\n", MLR_GLOBALS.bargv0, pstate->filename);
			exit(1);
		}
	}
	pstate->sof = &empty_buf[0];
	pstate->p   = &empty_buf[0];
	pstate->eof = &empty_buf[0];
#endif

	if (close(pstate->fd) < 0) {
		perror("close");
		fprintf(stderr, "%s: close error on file \"%s\".\n", MLR_GLOBALS.bargv0, pstate->filename);
		exit(1);
	}
}

View file

@ -23,14 +23,6 @@
#define mlr_arch_getc(stream) getc_unlocked(stream)
#endif
// ----------------------------------------------------------------
#ifdef MLR_ON_MSYS2
#define MLR_ARCH_MMAP_ENABLED 0
#else
#define MLR_ARCH_MMAP_ENABLED 1
#include <sys/mman.h>
#endif
// ----------------------------------------------------------------
int mlr_arch_setenv(const char *name, const char *value);
int mlr_arch_unsetenv(const char *name);

View file

@ -107,8 +107,6 @@ static void mapper_join_usage(FILE* o, char* argv0, char* verb) {
fprintf(o, " --ips {pair-separator character}\n");
fprintf(o, " --repifs\n");
fprintf(o, " --repips\n");
fprintf(o, " --mmap\n");
fprintf(o, " --no-mmap\n");
fprintf(o, "Please use \"%s --usage-separator-options\" for information on specifying separators.\n",
argv0);
fprintf(o, "Please see http://johnkerl.org/miller/doc/reference.html for more information\n");
@ -237,10 +235,6 @@ static mapper_t* mapper_join_parse_cli(int* pargi, int argc, char** argv,
cli_merge_reader_opts(&popts->reader_opts, pmain_reader_opts);
// popen is a stdio construct, not an mmap construct, and it can't be supported here.
if (popts->prepipe != NULL)
popts->reader_opts.use_mmap_for_read = FALSE;
if (popts->left_file_name == NULL) {
fprintf(stderr, "%s %s: need left file name\n", MLR_GLOBALS.bargv0, verb);
mapper_join_usage(stderr, argv[0], verb);

View file

@ -47217,71 +47217,6 @@ a=1,b=2,c=3
a=4,b=5,c=6
================================================================
MMAP AT PAGE BOUNDARIES
mlr --dkvp tail -n 4 ./reg_test/input/page-aligned-final-ifs.dkvp
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=
mlr --dkvp tail -n 4 ./reg_test/input/page-aligned-final-irs.dkvp
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc
mlr --dkvp tail -n 4 ./reg_test/input/page-aligned-final-no-ifs.dkvp
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,3=z
mlr --dkvp tail -n 4 ./reg_test/input/page-aligned-no-final-irs.dkvp
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=ccccccccccccccccccccccccccccccccccccccccccccccccc
mlr --nidx tail -n 4 ./reg_test/input/page-aligned-no-final-irs.nidx
11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,3333333333333333333333333333333333333333333
11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,3333333333333333333333333333333333333333333
11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,3333333333333333333333333333333333333333333
11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,33333333333333333333333333333333333333333333
mlr --csvlite tail -n 4 ./reg_test/input/page-aligned-no-final-irs.csvl
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,ccccccccccccccccccccccccccccccccccccccccccc
11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,3333333333333333333333333333333333333333333
11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,3333333333333333333333333333333333333333333
11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,3333333333333333333333333333333333333333333
11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,33333333333333333333333333333333333333333333
mlr --csv --rs lf tail -n 4 ./reg_test/input/page-aligned-no-final-irs.csvl
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,ccccccccccccccccccccccccccccccccccccccccccc
11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,3333333333333333333333333333333333333333333
11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,3333333333333333333333333333333333333333333
11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,3333333333333333333333333333333333333333333
11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,33333333333333333333333333333333333333333333
mlr --xtab tail -n 4 ./reg_test/input/page-aligned-no-final-eol.xtab
aaaaaaaaaaaaaaaaaaaa 111111111111111111111
bbbbbbbbbbbbbbbbbbbb 22222222222222222222
cccccccccccccccccccc 33333333333333333333
aaaaaaaaaaaaaaaaaaaa 111111111111111111111
bbbbbbbbbbbbbbbbbbbb 22222222222222222222
cccccccccccccccccccc 33333333333333333333
aaaaaaaaaaaaaaaaaaaa 111111111111111111111
bbbbbbbbbbbbbbbbbbbb 22222222222222222222
cccccccccccccccccccc 33333333333333333333
aaaaaaaaaaaaaaaaaaaa 111111111111111111111
bbbbbbbbbbbbbbbbbbbb 22222222222222222222
cccccccccccccccccccc 3333333333333333333333
================================================================
INT64 I/O
@ -47675,54 +47610,54 @@ x,"y""yy",z
================================================================
RFC-CSV
mlr --mmap --csv cat ./reg_test/input/rfc-csv/simple.csv-crlf
mlr --csv cat ./reg_test/input/rfc-csv/simple.csv-crlf
a,b,c
1,x,3
4,5,6
x,"y""yy",z
mlr --mmap --csv cat ./reg_test/input/rfc-csv/simple-truncated.csv
mlr --csv cat ./reg_test/input/rfc-csv/simple-truncated.csv
a,b,c
1,x,3
4,5,6
mlr --mmap --csv cat ./reg_test/input/rfc-csv/narrow.csv
mlr --csv cat ./reg_test/input/rfc-csv/narrow.csv
a
1
2
3
4
mlr --mmap --csv cat ./reg_test/input/rfc-csv/narrow-truncated.csv
mlr --csv cat ./reg_test/input/rfc-csv/narrow-truncated.csv
a
1
2
3
4
mlr --mmap --csv cat ./reg_test/input/rfc-csv/quoted-comma.csv
mlr --csv cat ./reg_test/input/rfc-csv/quoted-comma.csv
a,b,c
1,"x,3",y
4,5,6
mlr --mmap --csv cat ./reg_test/input/rfc-csv/quoted-comma-truncated.csv
mlr --csv cat ./reg_test/input/rfc-csv/quoted-comma-truncated.csv
a,b,c
1,"x,3",y
4,5,6
mlr --mmap --csv cat ./reg_test/input/rfc-csv/quoted-crlf.csv
mlr --csv cat ./reg_test/input/rfc-csv/quoted-crlf.csv
a,b,c
1,"x
3",y
4,5,6
mlr --mmap --csv cat ./reg_test/input/rfc-csv/quoted-crlf-truncated.csv
mlr --csv cat ./reg_test/input/rfc-csv/quoted-crlf-truncated.csv
a,b,c
1,"x
3",y
4,5,6
mlr --mmap --csv cat ./reg_test/input/rfc-csv/simple-truncated.csv ./reg_test/input/rfc-csv/simple.csv-crlf
mlr --csv cat ./reg_test/input/rfc-csv/simple-truncated.csv ./reg_test/input/rfc-csv/simple.csv-crlf
a,b,c
1,x,3
4,5,6
@ -47730,7 +47665,7 @@ a,b,c
4,5,6
x,"y""yy",z
mlr --mmap --csv --ifs semicolon --ofs pipe --irs lf --ors lflf cut -x -f b ./reg_test/input/rfc-csv/modify-defaults.csv
mlr --csv --ifs semicolon --ofs pipe --irs lf --ors lflf cut -x -f b ./reg_test/input/rfc-csv/modify-defaults.csv
a|c
1|3
@ -47738,22 +47673,13 @@ a|c
4|6
mlr --mmap --csv --rs lf --quote-original cut -o -f c,b,a ./reg_test/input/quote-original.csv
mlr --csv --rs lf --quote-original cut -o -f c,b,a ./reg_test/input/quote-original.csv
c,b,a
3,2,1
6,"5",4
"9",8,"7"
mlr --mmap --icsv --oxtab cat ./reg_test/input/comma-at-eof.csv
a 1
b 2
c 3
a 4
b 5
c
mlr --no-mmap --icsv --oxtab cat ./reg_test/input/comma-at-eof.csv
mlr --icsv --oxtab cat ./reg_test/input/comma-at-eof.csv
a 1
b 2
c 3
@ -47818,7 +47744,7 @@ c i
================================================================
RAGGED NON-RFC CSV
mlr --mmap --icsv --oxtab --ragged cat ./reg_test/input/ragged.csv
mlr --icsv --oxtab --ragged cat ./reg_test/input/ragged.csv
a 1
b 2
c 3
@ -47832,35 +47758,7 @@ b 7
c 8
4 9
mlr --no-mmap --icsv --oxtab --ragged cat ./reg_test/input/ragged.csv
a 1
b 2
c 3
a 4
b 5
c
a 6
b 7
c 8
4 9
mlr --mmap --icsvlite --oxtab --ragged cat ./reg_test/input/ragged.csv
a 1
b 2
c 3
a 4
b 5
c
a 6
b 7
c 8
4 9
mlr --no-mmap --icsvlite --oxtab --ragged cat ./reg_test/input/ragged.csv
mlr --icsvlite --oxtab --ragged cat ./reg_test/input/ragged.csv
a 1
b 2
c 3
@ -48177,9 +48075,9 @@ i 4
================================================================
MULTI-CHARACTER SEPARATORS FOR XTAB
mlr --mmap --xtab --ifs crlf --ofs Z cut -x -f b ./reg_test/input/truncated.xtab-crlf
mlr --xtab --ifs crlf --ofs Z cut -x -f b ./reg_test/input/truncated.xtab-crlf
a 1Zc 3ZZd 4Ze 5Z
mlr --mmap --xtab --ips . --ops @ cut -x -f b ./reg_test/input/dots.xtab
mlr --xtab --ips . --ops @ cut -x -f b ./reg_test/input/dots.xtab
a@1
c@345
@ -48195,12 +48093,7 @@ sum@@@@3
================================================================
EMBEDDED IPS FOR XTAB
mlr --xtab --mmap cat ./reg_test/input/embedded-ips.xtab
a 1
b 2
c 3 4 5
mlr --xtab --no-mmap cat ./reg_test/input/embedded-ips.xtab
mlr --xtab cat ./reg_test/input/embedded-ips.xtab
a 1
b 2
c 3 4 5
@ -48374,7 +48267,7 @@ mlr --opprint --barred --right cat ./reg_test/input/abixy-het
================================================================
MULTI-CHARACTER IXS SPECIFIERS
mlr --oxtab --idkvp --mmap --irs lf --ifs , --ips = cut -o -f x,a,i ./reg_test/input/multi-sep.dkvp-crlf
mlr --oxtab --idkvp --irs lf --ifs , --ips = cut -o -f x,a,i ./reg_test/input/multi-sep.dkvp-crlf
x :0.641593543645736508/
a :wye/
i :0/
@ -48395,7 +48288,7 @@ x :0.676537984365847889/
a :zee/
i :4/
mlr --oxtab --idkvp --mmap --irs lf --ifs /, --ips =: cut -o -f x,a,i ./reg_test/input/multi-sep.dkvp-crlf
mlr --oxtab --idkvp --irs lf --ifs /, --ips =: cut -o -f x,a,i ./reg_test/input/multi-sep.dkvp-crlf
x 0.641593543645736508
a wye
i 0
@ -49746,7 +49639,7 @@ a=hat,b=wye,i=9,x=0.03144187646093577,y=0.7495507603507059
a=pan,b=wye,i=10,x=0.5026260055412137,y=0.9526183602969864
---------------------------------------------------------------- mmap nidx
---------------------------------------------------------------- nidx
mlr --irs auto --ors lf --nidx --fs comma cat ./reg_test/input/line-term-lf.dkvp
a=pan,b=pan,i=1,x=0.3467901443380824,y=0.7268028627434533
a=eks,b=pan,i=2,x=0.7586799647899636,y=0.5221511083334797
@ -49796,7 +49689,7 @@ a=hat,b=wye,i=9,x=0.03144187646093577,y=0.7495507603507059
a=pan,b=wye,i=10,x=0.5026260055412137,y=0.9526183602969864
---------------------------------------------------------------- mmap csvlite
---------------------------------------------------------------- csvlite
mlr --irs auto --ors lf --csvlite cat ./reg_test/input/line-term-lf.csv
a,b,i,x,y
pan,pan,1,0.3467901443380824,0.7268028627434533
@ -49850,7 +49743,7 @@ hat,wye,9,0.03144187646093577,0.7495507603507059
pan,wye,10,0.5026260055412137,0.9526183602969864
---------------------------------------------------------------- mmap pprint
---------------------------------------------------------------- pprint
mlr --irs auto --ors lf --pprint cat ./reg_test/input/line-term-lf.csv
a,b,i,x,y
pan,pan,1,0.3467901443380824,0.7268028627434533
@ -49904,7 +49797,7 @@ hat,wye,9,0.03144187646093577,0.7495507603507059
pan,wye,10,0.5026260055412137,0.9526183602969864
---------------------------------------------------------------- mmap xtab
---------------------------------------------------------------- xtab
mlr --ifs auto --xtab cat ./reg_test/input/line-term-lf.xtab
a pan
b pan
@ -50150,7 +50043,7 @@ x 0.5026260055412137
y 0.9526183602969864
---------------------------------------------------------------- mmap xtab
---------------------------------------------------------------- xtab
mlr --ifs auto --xtab cat ./reg_test/input/line-term-lf.xtab
a pan
b pan
@ -50396,7 +50289,7 @@ x 0.5026260055412137
y 0.9526183602969864
---------------------------------------------------------------- mmap csv
---------------------------------------------------------------- csv
mlr --irs auto --ors lf --csv cat ./reg_test/input/line-term-lf.csv
a,b,i,x,y
pan,pan,1,0.3467901443380824,0.7268028627434533
@ -50450,7 +50343,7 @@ hat,wye,9,0.03144187646093577,0.7495507603507059
pan,wye,10,0.5026260055412137,0.9526183602969864
---------------------------------------------------------------- mmap json nowrap nostack
---------------------------------------------------------------- json nowrap nostack
mlr --irs auto --ors lf --json cat ./reg_test/input/line-term-lf.json
{ "a": "pan", "b": "pan", "i": 1, "x": 0.3467901443380824, "y": 0.7268028627434533 }
{ "a": "eks", "b": "pan", "i": 2, "x": 0.7586799647899636, "y": 0.5221511083334797 }
@ -50500,7 +50393,7 @@ mlr --json cat ./reg_test/input/line-term-crlf.json
{ "a": "pan", "b": "wye", "i": 10, "x": 0.5026260055412137, "y": 0.9526183602969864 }
---------------------------------------------------------------- mmap json yeswrap nostack
---------------------------------------------------------------- json yeswrap nostack
mlr --irs auto --ors lf --jlistwrap --json cat ./reg_test/input/line-term-lf-wrap.json
[
{ "a": "pan", "b": "pan", "i": 1, "x": 0.3467901443380824, "y": 0.7268028627434533 }
@ -50558,7 +50451,7 @@ mlr --jlistwrap --json cat ./reg_test/input/line-term-crlf-wrap.json
]
---------------------------------------------------------------- mmap json nowrap yesstack
---------------------------------------------------------------- json nowrap yesstack
mlr --irs auto --json --jvstack cat ./reg_test/input/line-term-lf.json
{
"a": "pan",
@ -50848,7 +50741,7 @@ mlr --json --jvstack cat ./reg_test/input/line-term-crlf.json
}
---------------------------------------------------------------- mmap json yeswrap yesstack
---------------------------------------------------------------- json yeswrap yesstack
mlr --irs auto --ors lf --jlistwrap --json --jvstack cat ./reg_test/input/line-term-lf-wrap.json
[
{
@ -51146,7 +51039,7 @@ mlr --jlistwrap --json --jvstack cat ./reg_test/input/line-term-crlf-wrap.json
]
---------------------------------------------------------------- mmap json nowrap nostack
---------------------------------------------------------------- json nowrap nostack
mlr --irs auto --ors lf --json cat ./reg_test/input/line-term-lf.json
{ "a": "pan", "b": "pan", "i": 1, "x": 0.3467901443380824, "y": 0.7268028627434533 }
{ "a": "eks", "b": "pan", "i": 2, "x": 0.7586799647899636, "y": 0.5221511083334797 }
@ -51196,7 +51089,7 @@ mlr --json cat ./reg_test/input/line-term-crlf.json
{ "a": "pan", "b": "wye", "i": 10, "x": 0.5026260055412137, "y": 0.9526183602969864 }
---------------------------------------------------------------- mmap json yeswrap nostack
---------------------------------------------------------------- json yeswrap nostack
mlr --irs auto --ors lf --jlistwrap --json cat ./reg_test/input/line-term-lf-wrap.json
[
{ "a": "pan", "b": "pan", "i": 1, "x": 0.3467901443380824, "y": 0.7268028627434533 }
@ -51254,7 +51147,7 @@ mlr --jlistwrap --json cat ./reg_test/input/line-term-crlf-wrap.json
]
---------------------------------------------------------------- mmap json nowrap yesstack
---------------------------------------------------------------- json nowrap yesstack
mlr --irs auto --ors lf --json --jvstack cat ./reg_test/input/line-term-lf.json
{
"a": "pan",
@ -51544,7 +51437,7 @@ mlr --json --jvstack cat ./reg_test/input/line-term-crlf.json
}
---------------------------------------------------------------- mmap json yeswrap yesstack
---------------------------------------------------------------- json yeswrap yesstack
mlr --irs auto --ors lf --jlistwrap --json --jvstack cat ./reg_test/input/line-term-lf-wrap.json
[
{

View file

@ -44,12 +44,6 @@ if [ "$1" = "--valgrind" ]; then
# ../tools/clean-valg can be used to filter the output.
path_to_mlr="valgrind --leak-check=full ${path_to_mlr}g"
path_to_mlr_for_auxents="$path_to_mlr"
elif [ "$1" = "--no-mmap" ]; then
path_to_mlr_for_auxents="${path_to_mlr}"
path_to_mlr="${path_to_mlr} --no-mmap"
elif [ "$1" = "--valgrind-no-mmap" ]; then
path_to_mlr="valgrind --leak-check=full ${path_to_mlr}g --no-mmap"
path_to_mlr_for_auxents="valgrind --leak-check=full ${path_to_mlr}g"
fi
echo Using mlr executable $path_to_mlr
@ -5755,18 +5749,6 @@ mention pass comments1-crlf.csv
run_mlr --pass-comments --icsv --odkvp cat < $outdir/comments1-crlf.csv
run_mlr --pass-comments --icsv --odkvp cat $outdir/comments1-crlf.csv
# ----------------------------------------------------------------
announce MMAP AT PAGE BOUNDARIES
run_mlr --dkvp tail -n 4 $indir/page-aligned-final-ifs.dkvp
run_mlr --dkvp tail -n 4 $indir/page-aligned-final-irs.dkvp
run_mlr --dkvp tail -n 4 $indir/page-aligned-final-no-ifs.dkvp
run_mlr --dkvp tail -n 4 $indir/page-aligned-no-final-irs.dkvp
run_mlr --nidx tail -n 4 $indir/page-aligned-no-final-irs.nidx
run_mlr --csvlite tail -n 4 $indir/page-aligned-no-final-irs.csvl
run_mlr --csv --rs lf tail -n 4 $indir/page-aligned-no-final-irs.csvl
run_mlr --xtab tail -n 4 $indir/page-aligned-no-final-eol.xtab
# ----------------------------------------------------------------
announce INT64 I/O
@ -5797,20 +5779,19 @@ run_mlr --csv cat < $indir/rfc-csv/simple.csv-crlf
# ----------------------------------------------------------------
announce RFC-CSV
run_mlr --mmap --csv cat $indir/rfc-csv/simple.csv-crlf
run_mlr --mmap --csv cat $indir/rfc-csv/simple-truncated.csv
run_mlr --mmap --csv cat $indir/rfc-csv/narrow.csv
run_mlr --mmap --csv cat $indir/rfc-csv/narrow-truncated.csv
run_mlr --mmap --csv cat $indir/rfc-csv/quoted-comma.csv
run_mlr --mmap --csv cat $indir/rfc-csv/quoted-comma-truncated.csv
run_mlr --mmap --csv cat $indir/rfc-csv/quoted-crlf.csv
run_mlr --mmap --csv cat $indir/rfc-csv/quoted-crlf-truncated.csv
run_mlr --mmap --csv cat $indir/rfc-csv/simple-truncated.csv $indir/rfc-csv/simple.csv-crlf
run_mlr --mmap --csv --ifs semicolon --ofs pipe --irs lf --ors lflf cut -x -f b $indir/rfc-csv/modify-defaults.csv
run_mlr --mmap --csv --rs lf --quote-original cut -o -f c,b,a $indir/quote-original.csv
run_mlr --csv cat $indir/rfc-csv/simple.csv-crlf
run_mlr --csv cat $indir/rfc-csv/simple-truncated.csv
run_mlr --csv cat $indir/rfc-csv/narrow.csv
run_mlr --csv cat $indir/rfc-csv/narrow-truncated.csv
run_mlr --csv cat $indir/rfc-csv/quoted-comma.csv
run_mlr --csv cat $indir/rfc-csv/quoted-comma-truncated.csv
run_mlr --csv cat $indir/rfc-csv/quoted-crlf.csv
run_mlr --csv cat $indir/rfc-csv/quoted-crlf-truncated.csv
run_mlr --csv cat $indir/rfc-csv/simple-truncated.csv $indir/rfc-csv/simple.csv-crlf
run_mlr --csv --ifs semicolon --ofs pipe --irs lf --ors lflf cut -x -f b $indir/rfc-csv/modify-defaults.csv
run_mlr --csv --rs lf --quote-original cut -o -f c,b,a $indir/quote-original.csv
run_mlr --mmap --icsv --oxtab cat $indir/comma-at-eof.csv
run_mlr --no-mmap --icsv --oxtab cat $indir/comma-at-eof.csv
run_mlr --icsv --oxtab cat $indir/comma-at-eof.csv
run_mlr --csv --quote-all cat $indir/rfc-csv/simple.csv-crlf
run_mlr --csv --quote-original cat $indir/rfc-csv/simple.csv-crlf
@ -5822,10 +5803,8 @@ run_mlr --iusv --oxtab cat $indir/example.usv
# ----------------------------------------------------------------
announce RAGGED NON-RFC CSV
run_mlr --mmap --icsv --oxtab --ragged cat $indir/ragged.csv
run_mlr --no-mmap --icsv --oxtab --ragged cat $indir/ragged.csv
run_mlr --mmap --icsvlite --oxtab --ragged cat $indir/ragged.csv
run_mlr --no-mmap --icsvlite --oxtab --ragged cat $indir/ragged.csv
run_mlr --icsv --oxtab --ragged cat $indir/ragged.csv
run_mlr --icsvlite --oxtab --ragged cat $indir/ragged.csv
# ----------------------------------------------------------------
announce MARKDOWN OUTPUT
@ -5866,15 +5845,14 @@ run_mlr --oxtab --icsvlite --irs crlf --ifs /, cut -o -f x,a,i $indir/multi-s
# ----------------------------------------------------------------
announce MULTI-CHARACTER SEPARATORS FOR XTAB
run_mlr --mmap --xtab --ifs crlf --ofs Z cut -x -f b $indir/truncated.xtab-crlf
run_mlr --mmap --xtab --ips . --ops @ cut -x -f b $indir/dots.xtab
run_mlr --xtab --ifs crlf --ofs Z cut -x -f b $indir/truncated.xtab-crlf
run_mlr --xtab --ips . --ops @ cut -x -f b $indir/dots.xtab
run_mlr --xtab --ips ": " --ops '@@@@' put '$sum=int($a+$b)' $indir/multi-ips.dkvp
# ----------------------------------------------------------------
announce EMBEDDED IPS FOR XTAB
run_mlr --xtab --mmap cat $indir/embedded-ips.xtab
run_mlr --xtab --no-mmap cat $indir/embedded-ips.xtab
run_mlr --xtab cat $indir/embedded-ips.xtab
# ----------------------------------------------------------------
announce MULTI-CHARACTER IRS FOR PPRINT
@ -5893,8 +5871,8 @@ run_mlr --opprint --barred --right cat $indir/abixy-het
# ----------------------------------------------------------------
announce MULTI-CHARACTER IXS SPECIFIERS
run_mlr --oxtab --idkvp --mmap --irs lf --ifs '\x2c' --ips '\075' cut -o -f x,a,i $indir/multi-sep.dkvp-crlf
run_mlr --oxtab --idkvp --mmap --irs lf --ifs /, --ips '\x3d\x3a' cut -o -f x,a,i $indir/multi-sep.dkvp-crlf
run_mlr --oxtab --idkvp --irs lf --ifs '\x2c' --ips '\075' cut -o -f x,a,i $indir/multi-sep.dkvp-crlf
run_mlr --oxtab --idkvp --irs lf --ifs /, --ips '\x3d\x3a' cut -o -f x,a,i $indir/multi-sep.dkvp-crlf
# ----------------------------------------------------------------
announce JSON I/O
@ -6010,96 +5988,96 @@ run_mlr --irs auto --ors lf cat $indir/line-term-crlf.dkvp
run_mlr cat $indir/line-term-lf.dkvp
run_mlr cat $indir/line-term-crlf.dkvp
mention mmap nidx
mention nidx
run_mlr --irs auto --ors lf --nidx --fs comma cat $indir/line-term-lf.dkvp
run_mlr --irs auto --ors lf --nidx --fs comma cat $indir/line-term-crlf.dkvp
run_mlr --nidx --fs comma cat $indir/line-term-lf.dkvp
run_mlr --nidx --fs comma cat $indir/line-term-crlf.dkvp
mention mmap csvlite
mention csvlite
run_mlr --irs auto --ors lf --csvlite cat $indir/line-term-lf.csv
run_mlr --irs auto --ors lf --csvlite cat $indir/line-term-crlf.csv
run_mlr --csvlite cat $indir/line-term-lf.csv
run_mlr --csvlite cat $indir/line-term-crlf.csv
mention mmap pprint
mention pprint
run_mlr --irs auto --ors lf --pprint cat $indir/line-term-lf.csv
run_mlr --irs auto --ors lf --pprint cat $indir/line-term-crlf.csv
run_mlr --pprint cat $indir/line-term-lf.csv
run_mlr --pprint cat $indir/line-term-crlf.csv
mention mmap xtab
mention xtab
run_mlr --ifs auto --xtab cat $indir/line-term-lf.xtab
run_mlr --ifs auto --xtab cat $indir/line-term-crlf.xtab
run_mlr --fs auto --xtab cat $indir/line-term-lf.xtab
run_mlr --fs auto --xtab cat $indir/line-term-crlf.xtab
mention mmap xtab
mention xtab
run_mlr --ifs auto --xtab cat $indir/line-term-lf.xtab
run_mlr --ifs auto --xtab cat $indir/line-term-crlf.xtab
run_mlr --fs auto --xtab cat $indir/line-term-lf.xtab
run_mlr --fs auto --xtab cat $indir/line-term-crlf.xtab
mention mmap csv
mention csv
run_mlr --irs auto --ors lf --csv cat $indir/line-term-lf.csv
run_mlr --irs auto --ors lf --csv cat $indir/line-term-crlf.csv
run_mlr --csv cat $indir/line-term-lf.csv
run_mlr --csv cat $indir/line-term-crlf.csv
mention mmap json nowrap nostack
mention json nowrap nostack
run_mlr --irs auto --ors lf --json cat $indir/line-term-lf.json
run_mlr --irs auto --ors lf --json cat $indir/line-term-crlf.json
run_mlr --json cat $indir/line-term-lf.json
run_mlr --json cat $indir/line-term-crlf.json
mention mmap json yeswrap nostack
mention json yeswrap nostack
run_mlr --irs auto --ors lf --jlistwrap --json cat $indir/line-term-lf-wrap.json
run_mlr --irs auto --ors lf --jlistwrap --json cat $indir/line-term-crlf-wrap.json
run_mlr --jlistwrap --json cat $indir/line-term-lf-wrap.json
run_mlr --jlistwrap --json cat $indir/line-term-crlf-wrap.json
mention mmap json nowrap yesstack
mention json nowrap yesstack
run_mlr --irs auto --json --jvstack cat $indir/line-term-lf.json
run_mlr --irs auto --ors lf --json --jvstack cat $indir/line-term-crlf.json
run_mlr --json --jvstack cat $indir/line-term-lf.json
run_mlr --json --jvstack cat $indir/line-term-crlf.json
mention mmap json yeswrap yesstack
mention json yeswrap yesstack
run_mlr --irs auto --ors lf --jlistwrap --json --jvstack cat $indir/line-term-lf-wrap.json
run_mlr --irs auto --ors lf --jlistwrap --json --jvstack cat $indir/line-term-crlf-wrap.json
run_mlr --jlistwrap --json --jvstack cat $indir/line-term-lf-wrap.json
run_mlr --jlistwrap --json --jvstack cat $indir/line-term-crlf-wrap.json
mention mmap json nowrap nostack
mention json nowrap nostack
run_mlr --irs auto --ors lf --json cat $indir/line-term-lf.json
run_mlr --irs auto --ors lf --json cat $indir/line-term-crlf.json
run_mlr --json cat $indir/line-term-lf.json
run_mlr --json cat $indir/line-term-crlf.json
mention mmap json yeswrap nostack
mention json yeswrap nostack
run_mlr --irs auto --ors lf --jlistwrap --json cat $indir/line-term-lf-wrap.json
run_mlr --irs auto --ors lf --jlistwrap --json cat $indir/line-term-crlf-wrap.json
run_mlr --jlistwrap --json cat $indir/line-term-lf-wrap.json
run_mlr --jlistwrap --json cat $indir/line-term-crlf-wrap.json
mention mmap json nowrap yesstack
mention json nowrap yesstack
run_mlr --irs auto --ors lf --json --jvstack cat $indir/line-term-lf.json
run_mlr --irs auto --ors lf --json --jvstack cat $indir/line-term-crlf.json
run_mlr --json --jvstack cat $indir/line-term-lf.json
run_mlr --json --jvstack cat $indir/line-term-crlf.json
mention mmap json yeswrap yesstack
mention json yeswrap yesstack
run_mlr --irs auto --ors lf --jlistwrap --json --jvstack cat $indir/line-term-lf-wrap.json
run_mlr --irs auto --ors lf --jlistwrap --json --jvstack cat $indir/line-term-crlf-wrap.json
run_mlr --jlistwrap --json --jvstack cat $indir/line-term-lf-wrap.json

View file

@ -116,92 +116,12 @@ static char* test_stdio_byte_reader_reuse() {
return NULL;
}
// ----------------------------------------------------------------
// Checks the mmap byte reader on a zero-length file: the open succeeds and
// every read, including repeated ones, reports EOF.
//
// The return statement sits outside the #if so the function still returns a
// value (NULL, i.e. "no failure") when mmap support is compiled out; with it
// inside the #if, the function fell off its end in that configuration.
// The reader is closed before the temp file is removed so the descriptor is
// not left open.
static char* test_mmap_byte_reader_1() {
#if MLR_ARCH_MMAP_ENABLED
	byte_reader_t* pbr = mmap_byte_reader_alloc();

	char* contents = "";
	char* path = write_temp_file_or_die(contents);

	int ok = pbr->popen_func(pbr, NULL, path);
	mu_assert_lf(ok == TRUE);

	mu_assert_lf(pbr->pread_func(pbr) == EOF);
	mu_assert_lf(pbr->pread_func(pbr) == EOF);
	mu_assert_lf(pbr->pread_func(pbr) == EOF);

	pbr->pclose_func(pbr, NULL);
	unlink_file_or_die(path);
#endif
	return NULL;
}
// ----------------------------------------------------------------
// Checks the mmap byte reader on a small non-empty file: the bytes come back
// in order, followed by EOF on every subsequent read.
//
// The return statement sits outside the #if so the function still returns a
// value (NULL, i.e. "no failure") when mmap support is compiled out; with it
// inside the #if, the function fell off its end in that configuration.
// The reader is closed before the temp file is removed so the descriptor is
// not left open.
static char* test_mmap_byte_reader_2() {
#if MLR_ARCH_MMAP_ENABLED
	byte_reader_t* pbr = mmap_byte_reader_alloc();

	char* contents = "abcdefg";
	char* path = write_temp_file_or_die(contents);

	int ok = pbr->popen_func(pbr, NULL, path);
	mu_assert_lf(ok == TRUE);

	mu_assert_lf(pbr->pread_func(pbr) == 'a');
	mu_assert_lf(pbr->pread_func(pbr) == 'b');
	mu_assert_lf(pbr->pread_func(pbr) == 'c');
	mu_assert_lf(pbr->pread_func(pbr) == 'd');
	mu_assert_lf(pbr->pread_func(pbr) == 'e');
	mu_assert_lf(pbr->pread_func(pbr) == 'f');
	mu_assert_lf(pbr->pread_func(pbr) == 'g');
	mu_assert_lf(pbr->pread_func(pbr) == EOF);
	mu_assert_lf(pbr->pread_func(pbr) == EOF);
	mu_assert_lf(pbr->pread_func(pbr) == EOF);

	pbr->pclose_func(pbr, NULL);
	unlink_file_or_die(path);
#endif
	return NULL;
}
// ----------------------------------------------------------------
// Checks that one mmap byte reader can be opened on a second file after the
// first has been read to EOF: each file's bytes come back in order, followed
// by repeated EOF.
//
// The return statement sits outside the #if so the function still returns a
// value (NULL, i.e. "no failure") when mmap support is compiled out; with it
// inside the #if, the function fell off its end in that configuration.
// The reader is closed after each file so no descriptor is left open.
static char* test_mmap_byte_reader_reuse() {
#if MLR_ARCH_MMAP_ENABLED
	byte_reader_t* pbr = mmap_byte_reader_alloc();

	// First file.
	char* contents = "abc";
	char* path = write_temp_file_or_die(contents);

	int ok = pbr->popen_func(pbr, NULL, path);
	mu_assert_lf(ok == TRUE);

	mu_assert_lf(pbr->pread_func(pbr) == 'a');
	mu_assert_lf(pbr->pread_func(pbr) == 'b');
	mu_assert_lf(pbr->pread_func(pbr) == 'c');
	mu_assert_lf(pbr->pread_func(pbr) == EOF);
	mu_assert_lf(pbr->pread_func(pbr) == EOF);
	mu_assert_lf(pbr->pread_func(pbr) == EOF);

	pbr->pclose_func(pbr, NULL);
	unlink_file_or_die(path);

	// Second file, same reader.
	contents = "defg";
	path = write_temp_file_or_die(contents);

	ok = pbr->popen_func(pbr, NULL, path);
	mu_assert_lf(ok == TRUE);

	mu_assert_lf(pbr->pread_func(pbr) == 'd');
	mu_assert_lf(pbr->pread_func(pbr) == 'e');
	mu_assert_lf(pbr->pread_func(pbr) == 'f');
	mu_assert_lf(pbr->pread_func(pbr) == 'g');
	mu_assert_lf(pbr->pread_func(pbr) == EOF);
	mu_assert_lf(pbr->pread_func(pbr) == EOF);
	mu_assert_lf(pbr->pread_func(pbr) == EOF);

	pbr->pclose_func(pbr, NULL);
	unlink_file_or_die(path);
#endif
	return NULL;
}
// ================================================================
// Runs the byte-reader unit tests in a fixed order: the string-backed reader,
// then the stdio-backed reader (empty file, non-empty file, reuse), then the
// mmap-backed reader (empty file, non-empty file, reuse).
// NOTE(review): mu_run_test is defined outside this file; presumably it returns
// the failure message from this function as soon as a test fails -- confirm
// against the unit-test header.
static char * run_all_tests() {
mu_run_test(test_string_byte_reader);
mu_run_test(test_stdio_byte_reader_1);
mu_run_test(test_stdio_byte_reader_2);
mu_run_test(test_stdio_byte_reader_reuse);
mu_run_test(test_mmap_byte_reader_1);
mu_run_test(test_mmap_byte_reader_2);
mu_run_test(test_mmap_byte_reader_reuse);
// Falling through to here returns 0 (a null message).
return 0;
}