From 2632ddc71669129192b70a7c28db6a60a0164d8a Mon Sep 17 00:00:00 2001 From: John Kerl Date: Sun, 26 Jan 2020 10:21:31 -0500 Subject: [PATCH] remove mmap-readers, which were high-maintenance and not able to be used when most needed --- c/Makefile.no-autoconfig | 23 - c/Makefile.windows | 22 - c/cli/mlrcli.c | 67 +-- c/cli/mlrcli.h | 4 - c/input/Makefile.am | 9 - c/input/byte_readers.h | 2 - c/input/file_reader_mmap.c | 84 --- c/input/file_reader_mmap.h | 20 - c/input/lrec_reader.h | 1 - c/input/lrec_reader_gen.c | 1 - c/input/lrec_reader_mmap_csv.c | 546 ------------------ c/input/lrec_reader_mmap_csvlite.c | 876 ----------------------------- c/input/lrec_reader_mmap_dkvp.c | 683 ---------------------- c/input/lrec_reader_mmap_json.c | 220 -------- c/input/lrec_reader_mmap_nidx.c | 512 ----------------- c/input/lrec_reader_mmap_xtab.c | 529 ----------------- c/input/lrec_reader_stdio_csv.c | 7 +- c/input/lrec_reader_stdio_json.c | 9 +- c/input/lrec_readers.c | 52 +- c/input/lrec_readers.h | 13 - c/input/mlr_json_adapter.c | 2 +- c/input/mlr_json_adapter.h | 2 +- c/input/mmap_byte_reader.c | 112 ---- c/lib/mlr_arch.h | 8 - c/mapping/mapper_join.c | 6 - c/reg_test/expected/out | 173 ++---- c/reg_test/run | 88 ++- c/unit_test/test_byte_readers.c | 80 --- 28 files changed, 93 insertions(+), 4058 deletions(-) delete mode 100644 c/input/file_reader_mmap.c delete mode 100644 c/input/file_reader_mmap.h delete mode 100644 c/input/lrec_reader_mmap_csv.c delete mode 100644 c/input/lrec_reader_mmap_csvlite.c delete mode 100644 c/input/lrec_reader_mmap_dkvp.c delete mode 100644 c/input/lrec_reader_mmap_json.c delete mode 100644 c/input/lrec_reader_mmap_nidx.c delete mode 100644 c/input/lrec_reader_mmap_xtab.c delete mode 100644 c/input/mmap_byte_reader.c diff --git a/c/Makefile.no-autoconfig b/c/Makefile.no-autoconfig index 9072c5f2a..747cd22d3 100644 --- a/c/Makefile.no-autoconfig +++ b/c/Makefile.no-autoconfig @@ -81,7 +81,6 @@ TEST_BYTE_READERS_SRCS = \ lib/string_builder.c \ input/string_byte_reader.c \ input/stdio_byte_reader.c \ - input/mmap_byte_reader.c \ unit_test/test_byte_readers.c TEST_LINE_READERS_SRCS = \ @@ -137,18 +136,12 @@ TEST_LREC_SRCS = \ containers/sllmv.c \ containers/mlhmmv.c \ input/line_readers.c \ - input/file_reader_mmap.c \ input/file_reader_stdio.c \ input/file_ingestor_stdio.c \ - input/lrec_reader_mmap_csvlite.c \ input/lrec_reader_stdio_csvlite.c \ - input/lrec_reader_mmap_dkvp.c \ input/lrec_reader_stdio_dkvp.c \ - input/lrec_reader_mmap_nidx.c \ input/lrec_reader_stdio_nidx.c \ - input/lrec_reader_mmap_xtab.c \ input/lrec_reader_stdio_xtab.c \ - input/lrec_reader_mmap_json.c \ input/lrec_reader_stdio_json.c \ input/mlr_json_adapter.c \ input/json_parser.c \ @@ -187,18 +180,12 @@ TEST_MULTIPLE_CONTAINERS_SRCS = \ containers/top_keeper.c \ containers/dheap.c \ input/line_readers.c \ - input/file_reader_mmap.c \ input/file_reader_stdio.c \ input/file_ingestor_stdio.c \ - input/lrec_reader_mmap_csvlite.c \ input/lrec_reader_stdio_csvlite.c \ - input/lrec_reader_mmap_dkvp.c \ input/lrec_reader_stdio_dkvp.c \ - input/lrec_reader_mmap_nidx.c \ input/lrec_reader_stdio_nidx.c \ - input/lrec_reader_mmap_xtab.c \ input/lrec_reader_stdio_xtab.c \ - input/lrec_reader_mmap_json.c \ input/lrec_reader_stdio_json.c \ input/mlr_json_adapter.c \ input/json_parser.c \ @@ -358,27 +345,19 @@ TEST_JOIN_BUCKET_KEEPER_SRCS = \ containers/mixutil.c \ containers/header_keeper.c \ containers/join_bucket_keeper.c \ - input/mmap_byte_reader.c \ input/stdio_byte_reader.c \ input/line_readers.c \ input/lrec_reader_gen.c \ input/lrec_reader_in_memory.c \ input/lrec_readers.c \ - input/lrec_reader_mmap_csv.c \ input/lrec_reader_stdio_csv.c \ - input/lrec_reader_mmap_csvlite.c \ input/lrec_reader_stdio_csvlite.c \ - input/lrec_reader_mmap_dkvp.c \ input/lrec_reader_stdio_dkvp.c \ - input/lrec_reader_mmap_nidx.c \ input/lrec_reader_stdio_nidx.c \ - input/lrec_reader_mmap_xtab.c \ input/lrec_reader_stdio_xtab.c \ - input/lrec_reader_mmap_json.c \ input/lrec_reader_stdio_json.c \ input/mlr_json_adapter.c \ input/json_parser.c \ - input/file_reader_mmap.c \ input/file_reader_stdio.c \ input/file_ingestor_stdio.c \ input/peek_file_reader.c \ @@ -398,7 +377,6 @@ EXPERIMENTAL_READER_SRCS = \ lib/string_array.c \ lib/string_builder.c \ input/stdio_byte_reader.c \ - input/file_reader_mmap.c \ input/line_readers.c \ containers/parse_trie.c \ experimental/getlines.c @@ -492,7 +470,6 @@ unit-test: test-mlrutil test-mlrregex test-argparse test-line-readers test-byte- reg-test: ./reg_test/run - ./reg_test/run --no-mmap # ---------------------------------------------------------------- # Run this after unit-test expected output has changed, and is verified to be diff --git a/c/Makefile.windows b/c/Makefile.windows index d437a7722..6cbccc621 100644 --- a/c/Makefile.windows +++ b/c/Makefile.windows @@ -76,7 +76,6 @@ TEST_BYTE_READERS_SRCS = \ lib/string_builder.c \ input/string_byte_reader.c \ input/stdio_byte_reader.c \ - input/mmap_byte_reader.c \ unit_test/test_byte_readers.c TEST_LINE_READERS_SRCS = \ @@ -125,18 +124,12 @@ TEST_LREC_SRCS = \ containers/sllmv.c \ containers/mlhmmv.c \ input/line_readers.c \ - input/file_reader_mmap.c \ input/file_reader_stdio.c \ input/file_ingestor_stdio.c \ - input/lrec_reader_mmap_csvlite.c \ input/lrec_reader_stdio_csvlite.c \ - input/lrec_reader_mmap_dkvp.c \ input/lrec_reader_stdio_dkvp.c \ - input/lrec_reader_mmap_nidx.c \ input/lrec_reader_stdio_nidx.c \ - input/lrec_reader_mmap_xtab.c \ input/lrec_reader_stdio_xtab.c \ - input/lrec_reader_mmap_json.c \ input/lrec_reader_stdio_json.c \ input/mlr_json_adapter.c \ input/json_parser.c \ @@ -173,18 +166,12 @@ TEST_MULTIPLE_CONTAINERS_SRCS = \ containers/top_keeper.c \ containers/dheap.c \ input/line_readers.c \ - input/file_reader_mmap.c \ input/file_reader_stdio.c \ input/file_ingestor_stdio.c \ - input/lrec_reader_mmap_csvlite.c \ input/lrec_reader_stdio_csvlite.c \ - input/lrec_reader_mmap_dkvp.c \ input/lrec_reader_stdio_dkvp.c \ - input/lrec_reader_mmap_nidx.c \ input/lrec_reader_stdio_nidx.c \ - input/lrec_reader_mmap_xtab.c \ input/lrec_reader_stdio_xtab.c \ - input/lrec_reader_mmap_json.c \ input/lrec_reader_stdio_json.c \ input/mlr_json_adapter.c \ input/json_parser.c \ @@ -325,26 +312,18 @@ TEST_JOIN_BUCKET_KEEPER_SRCS = \ containers/mixutil.c \ containers/header_keeper.c \ containers/join_bucket_keeper.c \ - input/mmap_byte_reader.c \ input/stdio_byte_reader.c \ input/line_readers.c \ input/lrec_reader_in_memory.c \ input/lrec_readers.c \ - input/lrec_reader_mmap_csv.c \ input/lrec_reader_stdio_csv.c \ - input/lrec_reader_mmap_csvlite.c \ input/lrec_reader_stdio_csvlite.c \ - input/lrec_reader_mmap_dkvp.c \ input/lrec_reader_stdio_dkvp.c \ - input/lrec_reader_mmap_nidx.c \ input/lrec_reader_stdio_nidx.c \ - input/lrec_reader_mmap_xtab.c \ input/lrec_reader_stdio_xtab.c \ - input/lrec_reader_mmap_json.c \ input/lrec_reader_stdio_json.c \ input/mlr_json_adapter.c \ input/json_parser.c \ - input/file_reader_mmap.c \ input/file_reader_stdio.c \ input/file_ingestor_stdio.c \ input/peek_file_reader.c \ @@ -362,7 +341,6 @@ EXPERIMENTAL_READER_SRCS = \ lib/string_array.c \ lib/string_builder.c \ input/stdio_byte_reader.c \ - input/file_reader_mmap.c \ input/line_readers.c \ containers/parse_trie.c \ experimental/getlines.c diff --git a/c/cli/mlrcli.c b/c/cli/mlrcli.c index f9e4b233a..c5d1ef1aa 100644 --- a/c/cli/mlrcli.c +++ b/c/cli/mlrcli.c @@ -33,7 +33,6 @@ #define DEFAULT_JSON_FLATTEN_SEPARATOR ":" #define DEFAULT_OOSVAR_FLATTEN_SEPARATOR ":" #define DEFAULT_COMMENT_STRING "#" -#define DEFAULT_MAX_FILE_SIZE_FOR_MMAP (4LL*1024LL*1024LL*1024LL) // ASCII 1f and 1e #define ASV_FS "\x1f" @@ -278,36 +277,9 @@ cli_opts_t* parse_command_line(int argc, char** argv, sllv_t** ppmapper_list) { slls_append(popts->filenames, argv[argi], NO_FREE); } - // Check for use of mmap. It's about 20% faster than stdio (due to fewer data copies - // -- lrecs can be pointer-backed by mmap memory) but we can't use it in all situations. if (no_input) { slls_free(popts->filenames); popts->filenames = NULL; - } else if (popts->filenames->length == 0) { - // No filenames means read from standard input, and standard input cannot be mmapped. - popts->reader_opts.use_mmap_for_read = FALSE; - } else if (popts->filenames->length > 10) { - // https://github.com/johnkerl/miller/issues/256: too many small files is as bad as one big one - // (for which see immediately below). - popts->reader_opts.use_mmap_for_read = FALSE; - } else if (popts->reader_opts.use_mmap_for_read == TRUE) { - // https://github.com/johnkerl/miller/issues/160: don't use mmap for large files. - // - // If any input files don't exist, don't error out just yet ... it's possible that the user - // is doing some complex put-with-tee or somesuch which will create the input file by the - // time it's needed. In that case we of course can't know the size yet, so avoid mmap there - // to be safe. - int all_exist_and_are_small_enough = TRUE; - for (sllse_t* pe = popts->filenames->phead; pe != NULL; pe = pe->pnext) { - ssize_t file_size = get_file_size(pe->value); - if (file_size == (ssize_t)(-1) || file_size >= popts->reader_opts.max_file_size_for_mmap) { - all_exist_and_are_small_enough = FALSE; - break; - } - } - if (!all_exist_and_are_small_enough) { - popts->reader_opts.use_mmap_for_read = FALSE; - } } if (popts->do_in_place && (popts->filenames == NULL || popts->filenames->length == 0)) { @@ -842,14 +814,6 @@ static void main_usage_data_format_options(FILE* o, char* argv0) { fprintf(o, "\n"); fprintf(o, " -p is a keystroke-saver for --nidx --fs space --repifs\n"); fprintf(o, "\n"); - fprintf(o, " --mmap --no-mmap --mmap-below {n} Use mmap for files whenever possible, never, or\n"); - fprintf(o, " for files less than n bytes in size. Default is for\n"); - fprintf(o, " files less than %lld bytes in size.\n", DEFAULT_MAX_FILE_SIZE_FOR_MMAP); - fprintf(o, " 'Whenever possible' means always except for when reading\n"); - fprintf(o, " standard input which is not mmappable. If you don't know\n"); - fprintf(o, " what this means, don't worry about it -- it's a minor\n"); - fprintf(o, " performance optimization.\n"); - fprintf(o, "\n"); fprintf(o, " Examples: --csv for CSV-formatted input and output; --idkvp --opprint for\n"); fprintf(o, " DKVP-formatted input and pretty-printed output.\n"); fprintf(o, "\n"); @@ -1139,14 +1103,11 @@ void cli_reader_opts_init(cli_reader_opts_t* preader_opts) { preader_opts->allow_repeat_ips = NEITHER_TRUE_NOR_FALSE; preader_opts->use_implicit_csv_header = NEITHER_TRUE_NOR_FALSE; preader_opts->allow_ragged_csv_input = NEITHER_TRUE_NOR_FALSE; - preader_opts->use_mmap_for_read = NEITHER_TRUE_NOR_FALSE; preader_opts->prepipe = NULL; preader_opts->comment_handling = COMMENTS_ARE_DATA; preader_opts->comment_string = NULL; - preader_opts->max_file_size_for_mmap = DEFAULT_MAX_FILE_SIZE_FOR_MMAP; - // xxx temp preader_opts->generator_opts.field_name = "i"; preader_opts->generator_opts.start = 0LL; @@ -1198,13 +1159,6 @@ void cli_apply_reader_defaults(cli_reader_opts_t* preader_opts) { if (preader_opts->allow_ragged_csv_input == NEITHER_TRUE_NOR_FALSE) preader_opts->allow_ragged_csv_input = FALSE; - if (preader_opts->use_mmap_for_read == NEITHER_TRUE_NOR_FALSE) -#if MLR_ARCH_MMAP_ENABLED - preader_opts->use_mmap_for_read = TRUE; -#else - preader_opts->use_mmap_for_read = FALSE; -#endif - if (preader_opts->input_json_flatten_separator == NULL) preader_opts->input_json_flatten_separator = DEFAULT_JSON_FLATTEN_SEPARATOR; } @@ -1311,9 +1265,6 @@ void cli_merge_reader_opts(cli_reader_opts_t* pfunc_opts, cli_reader_opts_t* pma if (pfunc_opts->allow_ragged_csv_input == NEITHER_TRUE_NOR_FALSE) pfunc_opts->allow_ragged_csv_input = pmain_opts->allow_ragged_csv_input; - if (pfunc_opts->use_mmap_for_read == NEITHER_TRUE_NOR_FALSE) - pfunc_opts->use_mmap_for_read = pmain_opts->use_mmap_for_read; - if (pfunc_opts->input_json_flatten_separator == NULL) pfunc_opts->input_json_flatten_separator = pmain_opts->input_json_flatten_separator; } @@ -1642,28 +1593,18 @@ int cli_handle_reader_options(char** argv, int argc, int *pargi, cli_reader_opts argi += 1; } else if (streq(argv[argi], "--mmap")) { - preader_opts->use_mmap_for_read = TRUE; + // No-op as of 5.6.3 (mmap is being abandoned) but don't break + // the command-line user experience. argi += 1; } else if (streq(argv[argi], "--no-mmap")) { - preader_opts->use_mmap_for_read = FALSE; + // No-op as of 5.6.3 (mmap is being abandoned) but don't break + // the command-line user experience. argi += 1; - } else if (streq(argv[argi], "--mmap-below")) { - check_arg_count(argv, argi, argc, 2); - preader_opts->use_mmap_for_read = TRUE; - long long llmax; - if (sscanf(argv[argi+1], "%lld", &llmax) != 1) { - fprintf(stderr, "%s: could not scan \"%s\".\n", - MLR_GLOBALS.bargv0, argv[argi+1]); - } - preader_opts->max_file_size_for_mmap = llmax; - argi += 2; - } else if (streq(argv[argi], "--prepipe")) { check_arg_count(argv, argi, argc, 2); preader_opts->prepipe = argv[argi+1]; - preader_opts->use_mmap_for_read = FALSE; argi += 2; } else if (streq(argv[argi], "--skip-comments")) { diff --git a/c/cli/mlrcli.h b/c/cli/mlrcli.h index 1ab7edbff..521ecdf4d 100644 --- a/c/cli/mlrcli.h +++ b/c/cli/mlrcli.h @@ -37,7 +37,6 @@ typedef struct _cli_reader_opts_t { int allow_repeat_ips; int use_implicit_csv_header; int allow_ragged_csv_input; - int use_mmap_for_read; // Command for popen on input, e.g. "zcat -cf <". Can be null in which case // files are read directly rather than through a pipe. @@ -46,9 +45,6 @@ typedef struct _cli_reader_opts_t { comment_handling_t comment_handling; char* comment_string; - // https://github.com/johnkerl/miller/issues/160 - ssize_t max_file_size_for_mmap; - // Fake internal-data-generator 'reader' generator_opts_t generator_opts; diff --git a/c/input/Makefile.am b/c/input/Makefile.am index ce058b8e0..dc17cb846 100644 --- a/c/input/Makefile.am +++ b/c/input/Makefile.am @@ -2,8 +2,6 @@ noinst_LTLIBRARIES= libinput.la libinput_la_SOURCES= \ byte_reader.h \ byte_readers.h \ - file_reader_mmap.c \ - file_reader_mmap.h \ file_reader_stdio.c \ file_reader_stdio.h \ file_ingestor_stdio.c \ @@ -17,12 +15,6 @@ libinput_la_SOURCES= \ lrec_reader.h \ lrec_reader_gen.c \ lrec_reader_in_memory.c \ - lrec_reader_mmap_csv.c \ - lrec_reader_mmap_csvlite.c \ - lrec_reader_mmap_dkvp.c \ - lrec_reader_mmap_json.c \ - lrec_reader_mmap_nidx.c \ - lrec_reader_mmap_xtab.c \ lrec_reader_stdio_csv.c \ lrec_reader_stdio_csvlite.c \ lrec_reader_stdio_dkvp.c \ @@ -31,7 +23,6 @@ libinput_la_SOURCES= \ lrec_reader_stdio_xtab.c \ lrec_readers.c \ lrec_readers.h \ - mmap_byte_reader.c \ peek_file_reader.c \ peek_file_reader.h \ stdio_byte_reader.c \ diff --git a/c/input/byte_readers.h b/c/input/byte_readers.h index 1a0fd7cf9..72669ff26 100644 --- a/c/input/byte_readers.h +++ b/c/input/byte_readers.h @@ -4,10 +4,8 @@ byte_reader_t* string_byte_reader_alloc(); byte_reader_t* stdio_byte_reader_alloc(); -byte_reader_t* mmap_byte_reader_alloc(); void string_byte_reader_free(byte_reader_t* pbr); void stdio_byte_reader_free(byte_reader_t* pbr); -void mmap_byte_reader_free(byte_reader_t* pbr); #endif // BYTE_READERS_H diff --git a/c/input/file_reader_mmap.c b/c/input/file_reader_mmap.c deleted file mode 100644 index be5fa9089..000000000 --- a/c/input/file_reader_mmap.c +++ /dev/null @@ -1,84 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "lib/mlr_arch.h" -#include "lib/mlrutil.h" -#include "lib/mlr_globals.h" -#include "file_reader_mmap.h" - -#if MLR_ARCH_MMAP_ENABLED -static char empty_buf[1] = { 0 }; -#endif - -// ---------------------------------------------------------------- -file_reader_mmap_state_t* file_reader_mmap_open(char* prepipe, char* file_name) { -#if MLR_ARCH_MMAP_ENABLED - // popen is a stdio construct, not an mmap construct, and it can't be supported here. - if (prepipe != NULL) { - fprintf(stderr, "%s: coding error detected in file %s at line %d.\n", - MLR_GLOBALS.bargv0, __FILE__, __LINE__); - exit(1); - } - - file_reader_mmap_state_t* pstate = mlr_malloc_or_die(sizeof(file_reader_mmap_state_t)); - pstate->fd = open(file_name, O_RDONLY); - if (pstate->fd < 0) { - perror("open"); - fprintf(stderr, "%s: could not open \"%s\"\n", MLR_GLOBALS.bargv0, file_name); - exit(1); - } - struct stat stat; - if (fstat(pstate->fd, &stat) < 0) { - perror("fstat"); - fprintf(stderr, "%s: could not fstat \"%s\"\n", MLR_GLOBALS.bargv0, file_name); - exit(1); - } - if (stat.st_size == 0) { - // mmap doesn't allow us to map zero-length files but zero-length files do exist. - pstate->sol = &empty_buf[0]; - } else { - pstate->sol = mmap(NULL, (size_t)stat.st_size, PROT_READ|PROT_WRITE, MAP_FILE|MAP_PRIVATE, pstate->fd, (off_t)0); - if (pstate->sol == MAP_FAILED) { - perror("mmap"); - fprintf(stderr, "%s: could not mmap \"%s\"\n", MLR_GLOBALS.bargv0, file_name); - exit(1); - } - } - pstate->eof = pstate->sol + stat.st_size; - // POSIX semantics: the mmap itself increments a reference count to the file, in addition to the - // open. We close the file but keep the mmap reference until a subsequent munmap. - if (close(pstate->fd) < 0) { - perror("close"); - exit(1); - } - return pstate; -#else - fprintf(stderr, "%s: mmap is unsupported on this architecture.\n", MLR_GLOBALS.bargv0); - exit(1); - return NULL; -#endif -} - -// ---------------------------------------------------------------- -// Here we intentionally do not munmap. -// -// This method is used by various lrec readers, where lrecs are instantiated with keys/values -// pointing into mmapped file-contents buffers. This is done for the sake of performance, to reduce -// data-copies. But it also means we can't unmap files after ingesting lrecs, since the lrecs in -// question might be retained after the input-file closes. Example: mlr sort on multiple files. -void file_reader_mmap_close(file_reader_mmap_state_t* pstate, char* prepipe) { - free(pstate); -} - -// ---------------------------------------------------------------- -void* file_reader_mmap_vopen(void* pvstate, char* prepipe, char* file_name) { - return file_reader_mmap_open(prepipe, file_name); -} - -// ---------------------------------------------------------------- -void file_reader_mmap_vclose(void* pvstate, void* pvhandle, char* prepipe) { - file_reader_mmap_close(pvhandle, prepipe); -} diff --git a/c/input/file_reader_mmap.h b/c/input/file_reader_mmap.h deleted file mode 100644 index 08e09ff03..000000000 --- a/c/input/file_reader_mmap.h +++ /dev/null @@ -1,20 +0,0 @@ -// ================================================================ -// Abstraction layer for mmapped file-read logic. -// ================================================================ - -#ifndef FILE_READER_MMAP_H -#define FILE_READER_MMAP_H - -typedef struct _file_reader_mmap_state_t { - char* sol; - char* eof; - int fd; -} file_reader_mmap_state_t; - -file_reader_mmap_state_t* file_reader_mmap_open(char* prepipe, char* file_name); -void file_reader_mmap_close(file_reader_mmap_state_t* pstate, char* prepipe); - -void* file_reader_mmap_vopen(void* pvstate, char* prepipe, char* file_name); -void file_reader_mmap_vclose(void* pvstate, void* pvhandle, char* prepipe); - -#endif // FILE_READER_MMAP_H diff --git a/c/input/lrec_reader.h b/c/input/lrec_reader.h index 7a8bfddc7..b4e050cb9 100644 --- a/c/input/lrec_reader.h +++ b/c/input/lrec_reader.h @@ -4,7 +4,6 @@ #include #include "lib/context.h" #include "containers/lrec.h" -#include "input/file_reader_mmap.h" struct _lrec_reader_t; // forward reference for method declarations diff --git a/c/input/lrec_reader_gen.c b/c/input/lrec_reader_gen.c index d57fed9a6..4335c5830 100644 --- a/c/input/lrec_reader_gen.c +++ b/c/input/lrec_reader_gen.c @@ -2,7 +2,6 @@ #include #include "lib/mlr_globals.h" #include "lib/mlrutil.h" -#include "input/file_reader_mmap.h" #include "input/lrec_readers.h" typedef struct _lrec_reader_gen_state_t { diff --git a/c/input/lrec_reader_mmap_csv.c b/c/input/lrec_reader_mmap_csv.c deleted file mode 100644 index aa4e999a6..000000000 --- a/c/input/lrec_reader_mmap_csv.c +++ /dev/null @@ -1,546 +0,0 @@ -// ================================================================ -// Note: there are multiple process methods with a lot of code duplication. -// This is intentional. Much of Miller's measured processing time is in the -// lrec-reader process methods. This is code which needs to execute on every -// byte of input and even moving a single runtime if-statement into a -// function-pointer assignment at alloc time can have noticeable effects on -// performance (5-10% in some cases). -// ================================================================ - -#include -#include -#include -#include "cli/comment_handling.h" -#include "lib/mlr_globals.h" -#include "lib/mlrutil.h" -#include "lib/string_builder.h" -#include "input/file_reader_mmap.h" -#include "input/lrec_readers.h" -#include "input/peek_file_reader.h" -#include "containers/rslls.h" -#include "containers/lhmslv.h" -#include "containers/parse_trie.h" - -// Idea of pheader_keepers: each header_keeper object retains the input-line backing -// and the slls_t for a CSV header line which is used by one or more CSV data -// lines. Meanwhile some mappers retain input records from the entire data -// stream, including header-schema changes in the input stream. This means we -// need to keep headers intact as long as any lrecs are pointing to them. One -// option is reference-counting which I experimented with; it was messy and -// error-prone. The approach used here is to keep a hash map from header-schema -// to header_keeper object. The current pheader_keeper is a pointer into one of -// those. Then when the reader is freed, all the header-keepers are freed. - -// ---------------------------------------------------------------- -#define STRING_BUILDER_INIT_SIZE 1024 - -#define IRS_TOKEN 0x2001 -#define IFS_TOKEN 0x2002 -#define DQUOTE_TOKEN 0x2003 -#define DQUOTE_IRS_TOKEN 0x2004 -#define DQUOTE_IRS2_TOKEN 0x2005 // alternate line-ending for autodetect LF/CRLF -#define DQUOTE_IFS_TOKEN 0x2006 -#define DQUOTE_DQUOTE_TOKEN 0x2007 - -// ---------------------------------------------------------------- -typedef struct _lrec_reader_mmap_csv_state_t { - // Input line number is not the same as the record-counter in context_t, - // which counts records. - long long ilno; - - char* eof; - char* irs; - char* ifs_eof; - char* ifs; - char* dquote; - char* dquote_irs; - char* dquote_irs2; - char* dquote_ifs; - char* dquote_eof; - char* dquote_dquote; - int do_auto_line_term; - comment_handling_t comment_handling; - char* comment_string; - int comment_string_length; - - int dquotelen; - - rslls_t* pfields; - string_builder_t* psb; - - parse_trie_t* pno_dquote_parse_trie; - parse_trie_t* pdquote_parse_trie; - - int expect_header_line_next; - int use_implicit_csv_header; - int allow_ragged_csv_input; - header_keeper_t* pheader_keeper; - lhmslv_t* pheader_keepers; - -} lrec_reader_mmap_csv_state_t; - -static void lrec_reader_mmap_csv_free(lrec_reader_t* preader); -static void lrec_reader_mmap_csv_sof(void* pvstate, void* pvhandle); -static lrec_t* lrec_reader_mmap_csv_process(void* pvstate, void* pvhandle, context_t* pctx); -static int lrec_reader_mmap_csv_get_fields(lrec_reader_mmap_csv_state_t* pstate, - rslls_t* pfields, file_reader_mmap_state_t* phandle, context_t* pctx); -static lrec_t* paste_indices_and_data(lrec_reader_mmap_csv_state_t* pstate, rslls_t* pdata_fields, context_t* pctx); -static lrec_t* paste_header_and_data_ragged(lrec_reader_mmap_csv_state_t* pstate, rslls_t* pdata_fields, - context_t* pctx); -static lrec_t* paste_header_and_data_rectangular(lrec_reader_mmap_csv_state_t* pstate, rslls_t* pdata_fields, - context_t* pctx); - -// ---------------------------------------------------------------- -lrec_reader_t* lrec_reader_mmap_csv_alloc(char* irs, char* ifs, int use_implicit_csv_header, - int allow_ragged_csv_input, comment_handling_t comment_handling, char* comment_string) -{ - lrec_reader_t* plrec_reader = mlr_malloc_or_die(sizeof(lrec_reader_t)); - - lrec_reader_mmap_csv_state_t* pstate = mlr_malloc_or_die(sizeof(lrec_reader_mmap_csv_state_t)); - pstate->ilno = 0LL; - - pstate->do_auto_line_term = FALSE; - if (streq(irs, "auto")) { - pstate->do_auto_line_term = TRUE; - irs = "\n"; - } - - pstate->comment_handling = comment_handling; - pstate->comment_string = comment_string; - pstate->comment_string_length = comment_string == NULL ? 0 : strlen(comment_string); - - pstate->eof = "\xff"; - pstate->irs = irs; - pstate->ifs = ifs; - pstate->ifs_eof = mlr_paste_2_strings(pstate->ifs, "\xff"); - pstate->dquote = "\""; - - pstate->dquote_ifs = mlr_paste_2_strings("\"", pstate->ifs); - pstate->dquote_eof = "\"\xff"; - pstate->dquote_dquote = "\"\""; - - pstate->dquotelen = strlen(pstate->dquote); - - pstate->pno_dquote_parse_trie = parse_trie_alloc(); - parse_trie_add_string(pstate->pno_dquote_parse_trie, pstate->irs, IRS_TOKEN); - parse_trie_add_string(pstate->pno_dquote_parse_trie, pstate->ifs, IFS_TOKEN); - parse_trie_add_string(pstate->pno_dquote_parse_trie, pstate->dquote, DQUOTE_TOKEN); - - pstate->pdquote_parse_trie = parse_trie_alloc(); - if (pstate->do_auto_line_term) { - pstate->dquote_irs = mlr_paste_2_strings("\"", "\n"); - pstate->dquote_irs2 = mlr_paste_2_strings("\"", "\r\n"); - parse_trie_add_string(pstate->pdquote_parse_trie, pstate->dquote_irs, DQUOTE_IRS_TOKEN); - parse_trie_add_string(pstate->pdquote_parse_trie, pstate->dquote_irs2, DQUOTE_IRS2_TOKEN); - } else { - pstate->dquote_irs = mlr_paste_2_strings("\"", pstate->irs); - pstate->dquote_irs2 = NULL; - parse_trie_add_string(pstate->pdquote_parse_trie, pstate->dquote_irs, DQUOTE_IRS_TOKEN); - } - parse_trie_add_string(pstate->pdquote_parse_trie, pstate->dquote_ifs, DQUOTE_IFS_TOKEN); - parse_trie_add_string(pstate->pdquote_parse_trie, pstate->dquote_dquote, DQUOTE_DQUOTE_TOKEN); - - pstate->pfields = rslls_alloc(); - pstate->psb = sb_alloc(STRING_BUILDER_INIT_SIZE); - - pstate->expect_header_line_next = use_implicit_csv_header ? FALSE : TRUE; - pstate->use_implicit_csv_header = use_implicit_csv_header; - pstate->allow_ragged_csv_input = allow_ragged_csv_input; - pstate->pheader_keeper = NULL; - pstate->pheader_keepers = lhmslv_alloc(); - - plrec_reader->pvstate = (void*)pstate; - plrec_reader->popen_func = file_reader_mmap_vopen; - plrec_reader->pclose_func = file_reader_mmap_vclose; - plrec_reader->pprocess_func = lrec_reader_mmap_csv_process; - plrec_reader->psof_func = lrec_reader_mmap_csv_sof; - plrec_reader->pfree_func = lrec_reader_mmap_csv_free; - - return plrec_reader; -} - -// ---------------------------------------------------------------- -static void lrec_reader_mmap_csv_free(lrec_reader_t* preader) { - lrec_reader_mmap_csv_state_t* pstate = preader->pvstate; - for (lhmslve_t* pe = pstate->pheader_keepers->phead; pe != NULL; pe = pe->pnext) { - header_keeper_t* pheader_keeper = pe->pvvalue; - header_keeper_free(pheader_keeper); - } - lhmslv_free(pstate->pheader_keepers); - parse_trie_free(pstate->pno_dquote_parse_trie); - parse_trie_free(pstate->pdquote_parse_trie); - rslls_free(pstate->pfields); - sb_free(pstate->psb); - free(pstate->ifs_eof); - free(pstate->dquote_irs); - free(pstate->dquote_irs2); - free(pstate->dquote_ifs); - free(pstate); - free(preader); -} - -// ---------------------------------------------------------------- -static void lrec_reader_mmap_csv_sof(void* pvstate, void* pvhandle) { - lrec_reader_mmap_csv_state_t* pstate = pvstate; - pstate->ilno = 0LL; - pstate->expect_header_line_next = pstate->use_implicit_csv_header ? FALSE : TRUE; - - // Strip UTF-8 BOM if any - file_reader_mmap_state_t* phandle = pvhandle; - if ((phandle->eof - phandle->sol) >= 3) { - if (memcmp(phandle->sol, "\xef\xbb\xbf", 3) == 0) { - phandle->sol += 3; - } - } -} - -// ---------------------------------------------------------------- -static lrec_t* lrec_reader_mmap_csv_process(void* pvstate, void* pvhandle, context_t* pctx) { - lrec_reader_mmap_csv_state_t* pstate = pvstate; - file_reader_mmap_state_t* phandle = pvhandle; - - // Ingest the next header line, if expected - if (pstate->expect_header_line_next) { - while (TRUE) { - if (!lrec_reader_mmap_csv_get_fields(pstate, pstate->pfields, phandle, pctx)) - return NULL; - pstate->ilno++; - - // We check for comments here rather than within the parser since it's important - // for users to be able to comment out lines containing double-quoted newlines. - if (pstate->comment_string != NULL && pstate->pfields->phead != NULL) { - if (streqn(pstate->pfields->phead->value, pstate->comment_string, pstate->comment_string_length)) { - if (pstate->comment_handling == PASS_COMMENTS) { - int i = 0; - for ( - rsllse_t* pe = pstate->pfields->phead; - i < pstate->pfields->length && pe != NULL; - pe = pe->pnext, i++) - { - if (i > 0) - fputs(pstate->ifs, stdout); - fputs(pe->value, stdout); - } - if (pstate->do_auto_line_term) { - fputs(pctx->auto_line_term, stdout); - } else { - fputs(pstate->irs, stdout); - } - } - rslls_reset(pstate->pfields); - continue; - } - } - - slls_t* pheader_fields = slls_alloc(); - int i = 0; - for (rsllse_t* pe = pstate->pfields->phead; i < pstate->pfields->length && pe != NULL; pe = pe->pnext, i++) { - if (*pe->value == 0) { - fprintf(stderr, "%s: unacceptable empty CSV key at file \"%s\" line %lld.\n", - MLR_GLOBALS.bargv0, pctx->filename, pstate->ilno); - exit(1); - } - // Transfer pointer-free responsibility from the rslls to the - // header fields in the header keeper - slls_append(pheader_fields, pe->value, pe->free_flag); - pe->free_flag = 0; - } - rslls_reset(pstate->pfields); - - pstate->pheader_keeper = lhmslv_get(pstate->pheader_keepers, pheader_fields); - if (pstate->pheader_keeper == NULL) { - pstate->pheader_keeper = header_keeper_alloc(NULL, pheader_fields); - lhmslv_put(pstate->pheader_keepers, pheader_fields, pstate->pheader_keeper, - NO_FREE); // freed by header-keeper - } else { // Re-use the header-keeper in the header cache - slls_free(pheader_fields); - } - - pstate->expect_header_line_next = FALSE; - break; - } - } - - // Ingest the next data line, if expected - while (TRUE) { - int rc = lrec_reader_mmap_csv_get_fields(pstate, pstate->pfields, phandle, pctx); - pstate->ilno++; - if (rc == FALSE) // EOF - return NULL; - - // We check for comments here rather than within the parser since it's important - // for users to be able to comment out lines containing double-quoted newlines. - if (pstate->comment_string != NULL && pstate->pfields->phead != NULL) { - if (streqn(pstate->pfields->phead->value, pstate->comment_string, pstate->comment_string_length)) { - if (pstate->comment_handling == PASS_COMMENTS) { - int i = 0; - for ( - rsllse_t* pe = pstate->pfields->phead; - i < pstate->pfields->length && pe != NULL; - pe = pe->pnext, i++) - { - if (i > 0) - fputs(pstate->ifs, stdout); - fputs(pe->value, stdout); - } - if (pstate->do_auto_line_term) { - fputs(pctx->auto_line_term, stdout); - } else { - fputs(pstate->irs, stdout); - } - } - rslls_reset(pstate->pfields); - continue; - } - } - - lrec_t* prec = pstate->use_implicit_csv_header - ? paste_indices_and_data(pstate, pstate->pfields, pctx) - : pstate->allow_ragged_csv_input - ? paste_header_and_data_ragged(pstate, pstate->pfields, pctx) - : paste_header_and_data_rectangular(pstate, pstate->pfields, pctx); - rslls_reset(pstate->pfields); - return prec; - } -} - -static int lrec_reader_mmap_csv_get_fields(lrec_reader_mmap_csv_state_t* pstate, - rslls_t* pfields, file_reader_mmap_state_t* phandle, context_t* pctx) -{ - int rc, token = 0, matchlen = 0, record_done = FALSE, field_done = FALSE; - string_builder_t* psb = pstate->psb; - - if (phandle->sol >= phandle->eof) - return FALSE; - - char* p = phandle->sol; - char* e = p; - - // loop over fields in record - record_done = FALSE; - while (!record_done) { - // Assumption is dquote is "\"" - if (*e != pstate->dquote[0]) { // start of non-quoted field - - // Loop over characters in field - field_done = FALSE; - while (!field_done) { - MLR_INTERNAL_CODING_ERROR_IF(e > phandle->eof); - rc = parse_trie_match(pstate->pno_dquote_parse_trie, e, phandle->eof, &token, &matchlen); - if (rc) { - switch(token) { - case IFS_TOKEN: // end of field - *e = 0; - rslls_append(pfields, p, NO_FREE, 0); - p = e + matchlen; - field_done = TRUE; - break; - case IRS_TOKEN: // end of record - *e = 0; - - if (pstate->do_auto_line_term) { - if (e > p && e[-1] == '\r') { - e[-1] = 0; - context_set_autodetected_crlf(pctx); - } else { - context_set_autodetected_lf(pctx); - } - } - - rslls_append(pfields, p, NO_FREE, 0); - p = e + matchlen; - field_done = TRUE; - record_done = TRUE; - break; - case DQUOTE_TOKEN: // CSV syntax error: fields containing quotes must be fully wrapped in quotes - fprintf(stderr, "%s: syntax error: unwrapped double quote at line %lld.\n", - MLR_GLOBALS.bargv0, pstate->ilno); - exit(1); - break; - default: - fprintf(stderr, "%s: internal coding error: unexpected token %d at line %lld.\n", - MLR_GLOBALS.bargv0, token, pstate->ilno); - exit(1); - break; - } - e += matchlen; - } else if (e >= phandle->eof) { - // We read to end of file without seeing end of line. We can't always zero-poke a null character to - // terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's - // our copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking - // at EOF is one byte past the page and that will segv us. - char* copy = mlr_alloc_string_from_char_range(p, phandle->eof - p); - rslls_append(pfields, copy, FREE_ENTRY_VALUE, 0); - p = e + matchlen; - field_done = TRUE; - record_done = TRUE; - break; - } else { - e++; - } - } - - } else { // start of quoted field - e += pstate->dquotelen; - p = e; - - // loop over characters in field - field_done = FALSE; - int contiguous = TRUE; - // If there are no embedded double-double quotes, then the field value is a contiguous - // array of bytes between the start and end double-quotes (non-inclusive). E.g. "ab,c" - // has contents ab,c. In that case we can point the rslls at that range of bytes - // with no data-copying. However, if there are embedded double-double quotes, then - // we use the string-build logic to build up a dynamically allocated string. E.g. - // "ab""c" becomes ab"c. - while (!field_done) { - if (e >= phandle->eof) { - fprintf(stderr, "%s: unmatched double quote at line %lld.\n", - MLR_GLOBALS.bargv0, pstate->ilno); - exit(1); - } - - rc = parse_trie_match(pstate->pdquote_parse_trie, e, phandle->eof, &token, &matchlen); - - if (rc) { - switch(token) { - case DQUOTE_IFS_TOKEN: // end of field - *e = 0; - if (contiguous) - rslls_append(pfields, p, NO_FREE, FIELD_QUOTED_ON_INPUT); - else - rslls_append(pfields, sb_finish(psb), FREE_ENTRY_VALUE, FIELD_QUOTED_ON_INPUT); - p = e + matchlen; - field_done = TRUE; - break; - case DQUOTE_IRS_TOKEN: // end of record - case DQUOTE_IRS2_TOKEN: // end of record - *e = 0; - - if (pstate->do_auto_line_term) { - if (e > p && e[-1] == '\r') { - e[-1] = 0; - context_set_autodetected_crlf(pctx); - } else { - context_set_autodetected_lf(pctx); - } - } - - if (contiguous) - rslls_append(pfields, p, NO_FREE, FIELD_QUOTED_ON_INPUT); - else - rslls_append(pfields, sb_finish(psb), FREE_ENTRY_VALUE, FIELD_QUOTED_ON_INPUT); - p = e + matchlen; - field_done = TRUE; - record_done = TRUE; - break; - case DQUOTE_DQUOTE_TOKEN: // RFC-4180 CSV: "" inside a dquoted field is an escape for " - if (contiguous) { // not anymore it isn't - sb_append_char_range(psb, p, e); - contiguous = FALSE; - } else { - sb_append_char(psb, pstate->dquote[0]); - } - break; - default: - fprintf(stderr, "%s: internal coding error: unexpected token %d at line %lld.\n", - MLR_GLOBALS.bargv0, token, pstate->ilno); - exit(1); - break; - } - e += matchlen; - } else { - if (!contiguous) - sb_append_char(psb, *e); - e++; - } - } - } - } - phandle->sol = e; - - return TRUE; -} - -// ---------------------------------------------------------------- -static lrec_t* paste_indices_and_data(lrec_reader_mmap_csv_state_t* pstate, rslls_t* pdata_fields, - context_t* pctx) -{ - int idx = 0; - lrec_t* prec = lrec_unbacked_alloc(); - for (rsllse_t* pd = pdata_fields->phead; idx < pdata_fields->length && pd != NULL; pd = pd->pnext) { - idx++; - char key_free_flags = 0; - char* key = low_int_to_string(idx, &key_free_flags); - char value_free_flags = pd->free_flag; - // Transfer pointer-free responsibility from the rslls to the lrec object - lrec_put_ext(prec, key, pd->value, key_free_flags | value_free_flags, pd->quote_flag); - pd->free_flag = 0; - } - return prec; -} - -// ---------------------------------------------------------------- -static lrec_t* paste_header_and_data_ragged(lrec_reader_mmap_csv_state_t* pstate, rslls_t* pdata_fields, - context_t* pctx) -{ - lrec_t* prec = lrec_unbacked_alloc(); - sllse_t* ph = pstate->pheader_keeper->pkeys->phead; - rsllse_t* pd = pdata_fields->phead; - int idx = 0; - int hlen = pstate->pheader_keeper->pkeys->length; - int dlen = pdata_fields->length; - - // Process fields up to minimum of header length and data length - // Note that pd->pnext can be non-null due to pointer-reuse semantics of rslls, - // so use list-length attributes for end-of-list check. - for (idx = 0; idx < hlen && idx < dlen; idx++, ph = ph->pnext, pd = pd->pnext) { - // Transfer pointer-free responsibility from the rslls to the lrec object - lrec_put_ext(prec, ph->value, pd->value, pd->free_flag, pd->quote_flag); - pd->free_flag = 0; - } - - if (hlen > dlen) { - // Header is longer. Empty-fill the remaining data fields. - // E.g. if the input looks like - // a,b,c,d <-- header - // 1,2 <-- data - // then put c="", d="". - for ( ; idx < hlen; idx++, ph = ph->pnext) { - lrec_put_ext(prec, ph->value, "", NO_FREE, 0); - } - } else { - // Data is longer. Use positional indices to label the remaining data fields. - for ( ; idx < dlen; idx++, pd = pd->pnext) { - char key_free_flags = 0; - char* key = low_int_to_string(idx+1, &key_free_flags); - char value_free_flags = pd->free_flag; - // Transfer pointer-free responsibility from the rslls to the lrec object - lrec_put_ext(prec, key, pd->value, key_free_flags | value_free_flags, pd->quote_flag); - pd->free_flag = 0; - } - } - - return prec; -} - -// ---------------------------------------------------------------- -static lrec_t* paste_header_and_data_rectangular(lrec_reader_mmap_csv_state_t* pstate, rslls_t* pdata_fields, - context_t* pctx) -{ - if (pstate->pheader_keeper->pkeys->length != pdata_fields->length) { - fprintf(stderr, "%s: Header/data length mismatch (%llu != %llu) at file \"%s\" line %lld.\n", - MLR_GLOBALS.bargv0, pstate->pheader_keeper->pkeys->length, pdata_fields->length, - pctx->filename, pstate->ilno); - exit(1); - } - lrec_t* prec = lrec_unbacked_alloc(); - sllse_t* ph = pstate->pheader_keeper->pkeys->phead; - rsllse_t* pd = pdata_fields->phead; - for ( ; ph != NULL && pd != NULL; ph = ph->pnext, pd = pd->pnext) { - // Transfer pointer-free responsibility from the rslls to the lrec object - lrec_put_ext(prec, ph->value, pd->value, pd->free_flag, pd->quote_flag); - pd->free_flag = 0; - } - return prec; -} diff --git a/c/input/lrec_reader_mmap_csvlite.c b/c/input/lrec_reader_mmap_csvlite.c deleted file mode 100644 index 92110c24c..000000000 --- a/c/input/lrec_reader_mmap_csvlite.c +++ /dev/null @@ -1,876 +0,0 @@ -// ================================================================ -// Note: there are multiple process methods with a lot of code duplication. -// This is intentional. Much of Miller's measured processing time is in the -// lrec-reader process methods. This is code which needs to execute on every -// byte of input and even moving a single runtime if-statement into a -// function-pointer assignment at alloc time can have noticeable effects on -// performance (5-10% in some cases). -// ================================================================ - -#include -#include -#include "cli/comment_handling.h" -#include "lib/mlr_globals.h" -#include "lib/mlrutil.h" -#include "containers/slls.h" -#include "containers/lhmslv.h" -#include "input/file_reader_mmap.h" -#include "input/lrec_readers.h" - -// ---------------------------------------------------------------- -// Multi-file cases: -// -// a,a a,b c d -// -- FILE1: -- FILE1: -- FILE1: -- FILE1: -// a,b,c a,b,c a,b,c a,b,c -// 1,2,3 1,2,3 1,2,3 1,2,3 -// 4,5,6 4,5,6 4,5,6 4,5,6 -// -- FILE2: -- FILE2: -// a,b,c d,e,f,g a,b,c d,e,f -// 7,8,9 3,4,5,6 7,8,9 3,4,5 -// --OUTPUT: --OUTPUT: --OUTPUT: --OUTPUT: -// a,b,c a,b,c a,b,c a,b,c -// 1,2,3 1,2,3 1,2,3 1,2,3 -// 4,5,6 4,5,6 4,5,6 4,5,6 -// 7,8,9 7,8,9 -// d,e,f,g d,e,f -// 3,4,5,6 3,4,5 -// ---------------------------------------------------------------- - -typedef struct _lrec_reader_mmap_csvlite_state_t { - long long ifnr; - long long ilno; // Line-level, not record-level as in context_t - char* irs; - char* ifs; - int irslen; - int ifslen; - int allow_repeat_ifs; - int do_auto_line_term; - int use_implicit_csv_header; - int allow_ragged_csv_input; - comment_handling_t comment_handling; - char* comment_string; - int comment_string_length; - - int expect_header_line_next; - header_keeper_t* pheader_keeper; - lhmslv_t* pheader_keepers; -} lrec_reader_mmap_csvlite_state_t; - -static void lrec_reader_mmap_csvlite_free(lrec_reader_t* preader); -static void lrec_reader_mmap_csvlite_sof(void* pvstate, void* pvhandle); -static lrec_t* lrec_reader_mmap_csvlite_process_single_seps(void* pvstate, void* pvhandle, context_t* pctx); -static lrec_t* lrec_reader_mmap_csvlite_process_multi_seps(void* pvstate, void* pvhandle, context_t* pctx); - -static slls_t* lrec_reader_mmap_csvlite_get_header_single_seps(file_reader_mmap_state_t* phandle, - lrec_reader_mmap_csvlite_state_t* pstate, context_t* pctx); - -static slls_t* lrec_reader_mmap_csvlite_get_header_multi_seps(file_reader_mmap_state_t* phandle, - lrec_reader_mmap_csvlite_state_t* pstate); - -static lrec_t* lrec_reader_mmap_csvlite_get_record_single_seps(file_reader_mmap_state_t* phandle, - lrec_reader_mmap_csvlite_state_t* pstate, context_t* pctx, header_keeper_t* pheader_keeper, int* pend_of_stanza); - -static lrec_t* lrec_reader_mmap_csvlite_get_record_multi_seps(file_reader_mmap_state_t* phandle, - lrec_reader_mmap_csvlite_state_t* pstate, context_t* pctx, header_keeper_t* pheader_keeper, int* pend_of_stanza); - -static lrec_t* lrec_reader_mmap_csvlite_get_record_single_seps_implicit_header(file_reader_mmap_state_t* phandle, - lrec_reader_mmap_csvlite_state_t* pstate, context_t* pctx, header_keeper_t* pheader_keeper, int* pend_of_stanza); - -static lrec_t* lrec_reader_mmap_csvlite_get_record_multi_seps_implicit_header(file_reader_mmap_state_t* phandle, - lrec_reader_mmap_csvlite_state_t* pstate, context_t* pctx, header_keeper_t* pheader_keeper, int* pend_of_stanza); - -static int handle_comment_line_single_irs( - file_reader_mmap_state_t* phandle, - lrec_reader_mmap_csvlite_state_t* pstate, - char irs); - -static int handle_comment_line_multi_irs( - file_reader_mmap_state_t* phandle, - lrec_reader_mmap_csvlite_state_t* pstate); - -// ---------------------------------------------------------------- -lrec_reader_t* lrec_reader_mmap_csvlite_alloc(char* irs, char* ifs, int allow_repeat_ifs, int use_implicit_csv_header, - int allow_ragged_csv_input, comment_handling_t comment_handling, char* comment_string) -{ - lrec_reader_t* plrec_reader = mlr_malloc_or_die(sizeof(lrec_reader_t)); - - lrec_reader_mmap_csvlite_state_t* pstate = mlr_malloc_or_die(sizeof(lrec_reader_mmap_csvlite_state_t)); - pstate->ifnr = 0LL; - pstate->irs = irs; - pstate->ifs = ifs; - pstate->irslen = strlen(irs); - pstate->ifslen = strlen(ifs); - pstate->allow_repeat_ifs = allow_repeat_ifs; - pstate->do_auto_line_term = FALSE; - pstate->use_implicit_csv_header = use_implicit_csv_header; - pstate->allow_ragged_csv_input = allow_ragged_csv_input; - pstate->comment_handling = comment_handling; - pstate->comment_string = comment_string; - pstate->comment_string_length = comment_string == NULL ? 0 : strlen(comment_string); - - pstate->expect_header_line_next = use_implicit_csv_header ? FALSE : TRUE; - pstate->pheader_keeper = NULL; - pstate->pheader_keepers = lhmslv_alloc(); - - plrec_reader->pvstate = (void*)pstate; - plrec_reader->popen_func = file_reader_mmap_vopen; - plrec_reader->pclose_func = file_reader_mmap_vclose; - - if (streq(irs, "auto")) { - // Auto means either lines end in "\n" or "\r\n" (LF or CRLF). In - // either case the final character is "\n". Then for autodetect we - // simply check if there's a character in the line before the '\n', and - // if that is '\r'. - pstate->do_auto_line_term = TRUE; - pstate->irs = "\n"; - pstate->irslen = 1; - plrec_reader->pprocess_func = (pstate->ifslen == 1) - ? lrec_reader_mmap_csvlite_process_single_seps - : lrec_reader_mmap_csvlite_process_multi_seps; - } else { - plrec_reader->pprocess_func = (pstate->irslen == 1 && pstate->ifslen == 1) - ? lrec_reader_mmap_csvlite_process_single_seps - : lrec_reader_mmap_csvlite_process_multi_seps; - } - - plrec_reader->psof_func = lrec_reader_mmap_csvlite_sof; - plrec_reader->pfree_func = lrec_reader_mmap_csvlite_free; - - return plrec_reader; -} - -// ---------------------------------------------------------------- -static void lrec_reader_mmap_csvlite_free(lrec_reader_t* preader) { - lrec_reader_mmap_csvlite_state_t* pstate = preader->pvstate; - for (lhmslve_t* pe = pstate->pheader_keepers->phead; pe != NULL; pe = pe->pnext) { - header_keeper_t* pheader_keeper = pe->pvvalue; - header_keeper_free(pheader_keeper); - } - lhmslv_free(pstate->pheader_keepers); - free(pstate); - free(preader); -} - -static void lrec_reader_mmap_csvlite_sof(void* pvstate, void* pvhandle) { - lrec_reader_mmap_csvlite_state_t* pstate = pvstate; - pstate->ifnr = 0LL; - pstate->ilno = 0LL; - pstate->expect_header_line_next = pstate->use_implicit_csv_header ? FALSE : TRUE; -} - -// ---------------------------------------------------------------- -static lrec_t* lrec_reader_mmap_csvlite_process_single_seps(void* pvstate, void* pvhandle, context_t* pctx) { - file_reader_mmap_state_t* phandle = pvhandle; - lrec_reader_mmap_csvlite_state_t* pstate = pvstate; - - while (TRUE) { - if (pstate->expect_header_line_next) { - - slls_t* pheader_fields = lrec_reader_mmap_csvlite_get_header_single_seps(phandle, pstate, pctx); - if (pheader_fields == NULL) { // EOF - return NULL; - } - - for (sllse_t* pe = pheader_fields->phead; pe != NULL; pe = pe->pnext) { - if (*pe->value == 0) { - fprintf(stderr, "%s: unacceptable empty CSV key at file \"%s\" line %lld.\n", - MLR_GLOBALS.bargv0, pctx->filename, pstate->ilno); - exit(1); - } - } - - pstate->pheader_keeper = lhmslv_get(pstate->pheader_keepers, pheader_fields); - if (pstate->pheader_keeper == NULL) { - pstate->pheader_keeper = header_keeper_alloc(NULL, pheader_fields); - lhmslv_put(pstate->pheader_keepers, pheader_fields, pstate->pheader_keeper, - NO_FREE); // freed by header-keeper - } else { // Re-use the header-keeper in the header cache - slls_free(pheader_fields); - } - pstate->expect_header_line_next = FALSE; - } - - int end_of_stanza = FALSE; - lrec_t* prec = pstate->use_implicit_csv_header - ? lrec_reader_mmap_csvlite_get_record_single_seps_implicit_header(phandle, pstate, pctx, - pstate->pheader_keeper, &end_of_stanza) - : lrec_reader_mmap_csvlite_get_record_single_seps(phandle, pstate, pctx, - pstate->pheader_keeper, &end_of_stanza); - if (end_of_stanza) { - pstate->expect_header_line_next = TRUE; - } else if (prec == NULL) { // EOF - return NULL; - } else { - return prec; - } - } -} - -static lrec_t* lrec_reader_mmap_csvlite_process_multi_seps(void* pvstate, void* pvhandle, context_t* pctx) { - file_reader_mmap_state_t* phandle = pvhandle; - lrec_reader_mmap_csvlite_state_t* pstate = pvstate; - - while (TRUE) { - if (pstate->expect_header_line_next) { - - slls_t* pheader_fields = lrec_reader_mmap_csvlite_get_header_multi_seps(phandle, pstate); - if (pheader_fields == NULL) // EOF - return NULL; - - for (sllse_t* pe = pheader_fields->phead; pe != NULL; pe = pe->pnext) { - if (*pe->value == 0) { - fprintf(stderr, "%s: unacceptable empty CSV key at file \"%s\" line %lld.\n", - MLR_GLOBALS.bargv0, pctx->filename, pstate->ilno); - exit(1); - } - } - - pstate->pheader_keeper = lhmslv_get(pstate->pheader_keepers, pheader_fields); - if (pstate->pheader_keeper == NULL) { - pstate->pheader_keeper = header_keeper_alloc(NULL, pheader_fields); - lhmslv_put(pstate->pheader_keepers, pheader_fields, pstate->pheader_keeper, - NO_FREE); // freed by header-keeper - } else { // Re-use the header-keeper in the header cache - slls_free(pheader_fields); - } - pstate->expect_header_line_next = FALSE; - } - - int end_of_stanza = FALSE; - lrec_t* prec = pstate->use_implicit_csv_header - ? lrec_reader_mmap_csvlite_get_record_multi_seps_implicit_header(phandle, pstate, pctx, - pstate->pheader_keeper, &end_of_stanza) - : lrec_reader_mmap_csvlite_get_record_multi_seps(phandle, pstate, pctx, - pstate->pheader_keeper, &end_of_stanza); - if (end_of_stanza) { - pstate->expect_header_line_next = TRUE; - } else if (prec == NULL) { // EOF - return NULL; - } else { - return prec; - } - } -} - -// ---------------------------------------------------------------- -static slls_t* lrec_reader_mmap_csvlite_get_header_single_seps(file_reader_mmap_state_t* phandle, - lrec_reader_mmap_csvlite_state_t* pstate, context_t* pctx) -{ - char irs = pstate->irs[0]; - char ifs = pstate->ifs[0]; - int allow_repeat_ifs = pstate->allow_repeat_ifs; - - slls_t* pheader_names = slls_alloc(); - - // Skip blank/comment lines and seek to header line - while (TRUE) { - if (phandle->sol < phandle->eof && *phandle->sol == irs) { - phandle->sol++; - pstate->ilno++; - continue; - } - if (pstate->comment_string != NULL && handle_comment_line_single_irs(phandle, pstate, irs)) { - continue; - } - break; - } - - char* p = phandle->sol; - if (allow_repeat_ifs) { - while (*p == ifs) - p++; - } - char* osol = p; - char* header_name = p; - - for ( ; p < phandle->eof && *p; ) { - if (*p == irs) { - *p = 0; - - if (pstate->do_auto_line_term) { - if (p > phandle->sol && p[-1] == '\r') { - p[-1] = 0; - context_set_autodetected_crlf(pctx); - } else { - context_set_autodetected_lf(pctx); - } - } - - phandle->sol = p+1; - pstate->ilno++; - break; - } else if (*p == ifs) { - *p = 0; - - slls_append_no_free(pheader_names, header_name); - - p++; - if (allow_repeat_ifs) { - while (*p == ifs) - p++; - } - header_name = p; - } else { - p++; - } - } - if (allow_repeat_ifs && *header_name == 0) { - // OK - } else if (p == osol) { - // OK - } else { - slls_append_no_free(pheader_names, header_name); - } - - return pheader_names; -} - -static slls_t* lrec_reader_mmap_csvlite_get_header_multi_seps(file_reader_mmap_state_t* phandle, - lrec_reader_mmap_csvlite_state_t* pstate) -{ - char* irs = pstate->irs; - char* ifs = pstate->ifs; - int irslen = pstate->irslen; - int ifslen = pstate->ifslen; - int allow_repeat_ifs = pstate->allow_repeat_ifs; - - // Skip blank/comment lines and seek to header line - while (TRUE) { - if ((phandle->eof - phandle->sol) >= irslen && streqn(phandle->sol, irs, irslen)) { - phandle->sol += irslen; - pstate->ilno++; - continue; - } - if (pstate->comment_string != NULL && handle_comment_line_multi_irs(phandle, pstate)) { - continue; - } - break; - } - - slls_t* pheader_names = slls_alloc(); - - // Parse the header line - char* p = phandle->sol; - if (allow_repeat_ifs) { - while (streqn(p, ifs, ifslen)) - p += ifslen; - } - char* osol = p; - char* header_name = p; - - for ( ; p < phandle->eof && *p; ) { - if (streqn(p, irs, irslen)) { - *p = 0; - phandle->sol = p + irslen; - pstate->ilno++; - break; - } else if (streqn(p, ifs, ifslen)) { - *p = 0; - - slls_append_no_free(pheader_names, header_name); - - p += ifslen; - if (allow_repeat_ifs) { - while (streqn(p, ifs, ifslen)) - p += ifslen; - } - header_name = p; - } else { - p++; - } - } - if (allow_repeat_ifs && *header_name == 0) { - // OK - } else if (p == osol) { - // OK - } else { - slls_append_no_free(pheader_names, header_name); - } - - return pheader_names; -} - -// ---------------------------------------------------------------- -static lrec_t* lrec_reader_mmap_csvlite_get_record_single_seps(file_reader_mmap_state_t* phandle, - lrec_reader_mmap_csvlite_state_t* pstate, context_t* pctx, header_keeper_t* pheader_keeper, int* pend_of_stanza) -{ - char irs = pstate->irs[0]; - char ifs = pstate->ifs[0]; - int allow_repeat_ifs = pstate->allow_repeat_ifs; - - // Skip comment lines - if (pstate->comment_string != NULL) { - while (handle_comment_line_single_irs(phandle, pstate, irs)) - ; - } - - if (phandle->sol >= phandle->eof) - return NULL; - - char* line = phandle->sol; - lrec_t* prec = lrec_unbacked_alloc(); - - sllse_t* pe = pheader_keeper->pkeys->phead; - char* p = line; - if (allow_repeat_ifs) { - while (*p == ifs) - p++; - } - char* key = NULL; - char* value = p; - int saw_rs = FALSE; - int idx = 0; - for ( ; p < phandle->eof && *p; ) { - if (*p == irs) { - if (p == line) { - *pend_of_stanza = TRUE; - lrec_free(prec); - return NULL; - } - *p = 0; - - if (pstate->do_auto_line_term) { - if (p > line && p[-1] == '\r') { - p[-1] = 0; - context_set_autodetected_crlf(pctx); - } else { - context_set_autodetected_lf(pctx); - } - } - - phandle->sol = p+1; - pstate->ilno++; - saw_rs = TRUE; - break; - } else if (*p == ifs) { - *p = 0; - idx++; - if (pe == NULL) { - // Data line has more fields than the header line did - if (pstate->allow_ragged_csv_input) { - char free_flags = NO_FREE; - key = low_int_to_string(idx, &free_flags); - lrec_put(prec, key, value, free_flags); - } else { - fprintf(stderr, "%s: Header-data length mismatch in file %s at line %lld.\n", - MLR_GLOBALS.bargv0, pctx->filename, pstate->ilno); - exit(1); - } - } else { - key = pe->value; - pe = pe->pnext; - lrec_put(prec, key, value, NO_FREE); - } - p++; - if (allow_repeat_ifs) { - while (*p == ifs) - p++; - } - value = p; - } else { - p++; - } - } - if (p >= phandle->eof) - phandle->sol = p+1; - - if (allow_repeat_ifs && *value == 0) - return prec; - - char free_flags = NO_FREE; - if (pe == NULL) { - // Data line has more fields than the header line did - if (pstate->allow_ragged_csv_input) { - idx++; - key = low_int_to_string(idx, &free_flags); - } else { - fprintf(stderr, "%s: Header-data length mismatch in file %s at line %lld.\n", - MLR_GLOBALS.bargv0, pctx->filename, pstate->ilno); - exit(1); - } - } else { - key = pe->value; - } - - if (saw_rs) { - // Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate the - // C string so it's OK to retain a pointer to that. - lrec_put(prec, key, value, NO_FREE); - } else { - // Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character - // to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our - // copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one - // byte past the page and that will segv us. - char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value); - lrec_put(prec, key, copy, FREE_ENTRY_VALUE); - } - - if (pe != NULL && pe->pnext != NULL) { - // Header line has more fields than the data line did - if (pstate->allow_ragged_csv_input) { - for (pe = pe->pnext ; pe != NULL; pe = pe->pnext) { - key = pe->value; - lrec_put(prec, key, "", NO_FREE); - } - } else { - fprintf(stderr, "%s: Header-data length mismatch in file %s at line %lld.\n", - MLR_GLOBALS.bargv0, pctx->filename, pstate->ilno); - exit(1); - } - } - - return prec; -} - -static lrec_t* lrec_reader_mmap_csvlite_get_record_multi_seps(file_reader_mmap_state_t* phandle, - lrec_reader_mmap_csvlite_state_t* pstate, context_t* pctx, header_keeper_t* pheader_keeper, int* pend_of_stanza) -{ - // Skip comment lines - if (pstate->comment_string != NULL) { - while (handle_comment_line_multi_irs(phandle, pstate)) - ; - } - if (phandle->sol >= phandle->eof) - return NULL; - - char* irs = pstate->irs; - char* ifs = pstate->ifs; - int irslen = pstate->irslen; - int ifslen = pstate->ifslen; - int allow_repeat_ifs = pstate->allow_repeat_ifs; - - lrec_t* prec = lrec_unbacked_alloc(); - char* line = phandle->sol; - - sllse_t* pe = pheader_keeper->pkeys->phead; - char* p = line; - if (allow_repeat_ifs) { - while (streqn(p, ifs, ifslen)) - p += ifslen; - } - char* key = NULL; - char* value = p; - int saw_rs = FALSE; - int idx = 0; - for ( ; p < phandle->eof && *p; ) { - if (streqn(p, irs, irslen)) { - if (p == line) { - *pend_of_stanza = TRUE; - lrec_free(prec); - return NULL; - } - *p = 0; - phandle->sol = p + irslen; - pstate->ilno++; - saw_rs = TRUE; - break; - } else if (streqn(p, ifs, ifslen)) { - *p = 0; - idx++; - if (pe == NULL) { - // Data line has more fields than the header line did - if (pstate->allow_ragged_csv_input) { - char free_flags = NO_FREE; - key = low_int_to_string(idx, &free_flags); - lrec_put(prec, key, value, free_flags); - } else { - fprintf(stderr, "%s: Header-data length mismatch in file %s at line %lld.\n", - MLR_GLOBALS.bargv0, pctx->filename, pstate->ilno); - exit(1); - } - } - key = pe->value; - pe = pe->pnext; - lrec_put(prec, key, value, NO_FREE); - - p += ifslen; - if (allow_repeat_ifs) { - while (streqn(p, ifs, ifslen)) - p += ifslen; - } - value = p; - } else { - p++; - } - } - if (p >= phandle->eof) - phandle->sol = p+1; - - if (allow_repeat_ifs && *value == 0) - return prec; - - char free_flags = NO_FREE; - if (pe == NULL) { - // Data line has more fields than the header line did - if (pstate->allow_ragged_csv_input) { - idx++; - key = low_int_to_string(idx, &free_flags); - } else { - fprintf(stderr, "%s: Header-data length mismatch in file %s at line %lld.\n", - MLR_GLOBALS.bargv0, pctx->filename, pstate->ilno); - exit(1); - } - } else { - key = pe->value; - } - - if (saw_rs) { - // Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate the - // C string so it's OK to retain a pointer to that. - lrec_put(prec, key, value, NO_FREE); - } else { - // Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character - // to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our - // copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one - // byte past the page and that will segv us. - char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value); - lrec_put(prec, key, copy, FREE_ENTRY_VALUE); - } - - if (pe != NULL && pe->pnext != NULL) { - // Header line has more fields than the data line did - if (pstate->allow_ragged_csv_input) { - for (pe = pe->pnext ; pe != NULL; pe = pe->pnext) { - key = pe->value; - lrec_put(prec, key, "", NO_FREE); - } - } else { - fprintf(stderr, "%s: Header-data length mismatch in file %s at line %lld.\n", - MLR_GLOBALS.bargv0, pctx->filename, pstate->ilno); - exit(1); - } - } - - return prec; -} - -// ---------------------------------------------------------------- -static lrec_t* lrec_reader_mmap_csvlite_get_record_single_seps_implicit_header(file_reader_mmap_state_t* phandle, - lrec_reader_mmap_csvlite_state_t* pstate, context_t* pctx, header_keeper_t* pheader_keeper, int* pend_of_stanza) -{ - char irs = pstate->irs[0]; - char ifs = pstate->ifs[0]; - int allow_repeat_ifs = pstate->allow_repeat_ifs; - - // Skip comment lines - if (pstate->comment_string != NULL) { - while (handle_comment_line_single_irs(phandle, pstate, irs)) - ; - } - if (phandle->sol >= phandle->eof) - return NULL; - - lrec_t* prec = lrec_unbacked_alloc(); - char* line = phandle->sol; - - char* p = line; - if (allow_repeat_ifs) { - while (*p == ifs) - p++; - } - char* key = NULL; - char* value = p; - char free_flags = NO_FREE; - int idx = 0; - int saw_rs = FALSE; - for ( ; p < phandle->eof && *p; ) { - if (*p == irs) { - if (p == line) { - *pend_of_stanza = TRUE; - lrec_free(prec); - return NULL; - } - *p = 0; - - if (pstate->do_auto_line_term) { - if (p > line && p[-1] == '\r') { - p[-1] = 0; - context_set_autodetected_crlf(pctx); - } else { - context_set_autodetected_lf(pctx); - } - } - - phandle->sol = p+1; - pstate->ilno++; - saw_rs = TRUE; - break; - } else if (*p == ifs) { - *p = 0; - key = low_int_to_string(++idx, &free_flags); - lrec_put(prec, key, value, free_flags); - p++; - if (allow_repeat_ifs) { - while (*p == ifs) - p++; - } - value = p; - } else { - p++; - } - } - if (p >= phandle->eof) - phandle->sol = p+1; - - if (allow_repeat_ifs && *value == 0) - return prec; - - key = low_int_to_string(++idx, &free_flags); - - if (saw_rs) { - // Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate the - // C string so it's OK to retain a pointer to that. - lrec_put(prec, key, value, free_flags); - } else { - // Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character - // to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our - // copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one - // byte past the page and that will segv us. - char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value); - lrec_put(prec, key, copy, free_flags|FREE_ENTRY_VALUE); - } - - return prec; -} - -static lrec_t* lrec_reader_mmap_csvlite_get_record_multi_seps_implicit_header(file_reader_mmap_state_t* phandle, - lrec_reader_mmap_csvlite_state_t* pstate, context_t* pctx, header_keeper_t* pheader_keeper, int* pend_of_stanza) -{ - // Skip comment lines - if (pstate->comment_string != NULL) { - while (handle_comment_line_multi_irs(phandle, pstate)) - ; - } - if (phandle->sol >= phandle->eof) - return NULL; - - char* irs = pstate->irs; - char* ifs = pstate->ifs; - int irslen = pstate->irslen; - int ifslen = pstate->ifslen; - int allow_repeat_ifs = pstate->allow_repeat_ifs; - - lrec_t* prec = lrec_unbacked_alloc(); - char* line = phandle->sol; - - char* p = line; - if (allow_repeat_ifs) { - while (streqn(p, ifs, ifslen)) - p += ifslen; - } - char* key = NULL; - char* value = p; - char free_flags; - int idx = 0; - int saw_rs = FALSE; - for ( ; p < phandle->eof && *p; ) { - if (streqn(p, irs, irslen)) { - if (p == line) { - *pend_of_stanza = TRUE; - lrec_free(prec); - return NULL; - } - *p = 0; - phandle->sol = p + irslen; - pstate->ilno++; - saw_rs = TRUE; - break; - } else if (streqn(p, ifs, ifslen)) { - *p = 0; - key = low_int_to_string(++idx, &free_flags); - lrec_put(prec, key, value, free_flags); - - p += ifslen; - if (allow_repeat_ifs) { - while (streqn(p, ifs, ifslen)) - p += ifslen; - } - value = p; - } else { - p++; - } - } - if (p >= phandle->eof) - phandle->sol = p+1; - - if (allow_repeat_ifs && *value == 0) - return prec; - - key = low_int_to_string(++idx, &free_flags); - - if (saw_rs) { - // Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate the - // C string so it's OK to retain a pointer to that. - lrec_put(prec, key, value, free_flags); - } else { - // Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character - // to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our - // copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one - // byte past the page and that will segv us. - char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value); - lrec_put(prec, key, copy, free_flags|FREE_ENTRY_VALUE); - } - - return prec; -} - -// ---------------------------------------------------------------- -static int handle_comment_line_single_irs( - file_reader_mmap_state_t* phandle, - lrec_reader_mmap_csvlite_state_t* pstate, - char irs) -{ - if ((phandle->eof - phandle->sol) >= pstate->comment_string_length - && streqn(phandle->sol, pstate->comment_string, pstate->comment_string_length)) - { - if (pstate->comment_handling == PASS_COMMENTS) - for (int i = 0; i < pstate->comment_string_length; i++) - fputc(phandle->sol[i], stdout); - phandle->sol += pstate->comment_string_length; - while (phandle->sol < phandle->eof && *phandle->sol != irs) { - if (pstate->comment_handling == PASS_COMMENTS) - fputc(*phandle->sol, stdout); - phandle->sol++; - } - if (phandle->sol < phandle->eof && *phandle->sol == irs) { - if (pstate->comment_handling == PASS_COMMENTS) - fputc(*phandle->sol, stdout); - phandle->sol++; - } - pstate->ilno++; - return TRUE; - } else { - return FALSE; - } -} - -// ---------------------------------------------------------------- -static int handle_comment_line_multi_irs( - file_reader_mmap_state_t* phandle, - lrec_reader_mmap_csvlite_state_t* pstate) -{ - if ((phandle->eof - phandle->sol) >= pstate->comment_string_length - && streqn(phandle->sol, pstate->comment_string, pstate->comment_string_length)) - { - if (pstate->comment_handling == PASS_COMMENTS) - for (int i = 0; i < pstate->comment_string_length; i++) - fputc(phandle->sol[i], stdout); - phandle->sol += pstate->comment_string_length; - while ((phandle->eof - phandle->sol >= pstate->irslen) && !streqn(phandle->sol, pstate->irs, pstate->irslen)) { - if (pstate->comment_handling == PASS_COMMENTS) - fputc(*phandle->sol, stdout); - phandle->sol++; - } - if ((phandle->eof - phandle->sol >= pstate->irslen) && streqn(phandle->sol, pstate->irs, pstate->irslen)) { - if (pstate->comment_handling == PASS_COMMENTS) - for (int i = 0; i < pstate->irslen; i++) - fputc(phandle->sol[i], stdout); - phandle->sol += pstate->irslen; - } - pstate->ilno++; - return TRUE; - } else { - return FALSE; - } -} diff --git a/c/input/lrec_reader_mmap_dkvp.c b/c/input/lrec_reader_mmap_dkvp.c deleted file mode 100644 index 1ecd0f624..000000000 --- a/c/input/lrec_reader_mmap_dkvp.c +++ /dev/null @@ -1,683 +0,0 @@ -// ================================================================ -// Note: there are multiple process methods with a lot of code duplication. -// This is intentional. Much of Miller's measured processing time is in the -// lrec-reader process methods. This is code which needs to execute on every -// byte of input and even moving a single runtime if-statement into a -// function-pointer assignment at alloc time can have noticeable effects on -// performance (5-10% in some cases). -// ================================================================ - -#include -#include -#include "cli/comment_handling.h" -#include "lib/mlr_globals.h" -#include "lib/mlrutil.h" -#include "input/file_reader_mmap.h" -#include "input/lrec_readers.h" - -typedef struct _lrec_reader_mmap_dkvp_state_t { - char* irs; - char* ifs; - char* ips; - int irslen; - int ifslen; - int ipslen; - int allow_repeat_ifs; - int do_auto_line_term; - comment_handling_t comment_handling; - char* comment_string; - int comment_string_length; -} lrec_reader_mmap_dkvp_state_t; - -static void lrec_reader_mmap_dkvp_free(lrec_reader_t* preader); -static void lrec_reader_mmap_dkvp_sof(void* pvstate, void* pvhandle); -static lrec_t* lrec_reader_mmap_dkvp_process_single_irs_single_others(void* pvstate, void* pvhandle, context_t* pctx); -static lrec_t* lrec_reader_mmap_dkvp_process_single_irs_multi_others(void* pvstate, void* pvhandle, context_t* pctx); -static lrec_t* lrec_reader_mmap_dkvp_process_multi_irs_single_others(void* pvstate, void* pvhandle, context_t* pctx); -static lrec_t* lrec_reader_mmap_dkvp_process_multi_irs_multi_others(void* pvstate, void* pvhandle, context_t* pctx); - -static lrec_t* lrec_parse_mmap_dkvp_single_irs_single_others(file_reader_mmap_state_t *phandle, - char irs, char ifs, char ips, lrec_reader_mmap_dkvp_state_t* pstate, context_t* pctx); - -static lrec_t* lrec_parse_mmap_dkvp_single_irs_multi_others(file_reader_mmap_state_t *phandle, - char irs, lrec_reader_mmap_dkvp_state_t* pstate, context_t* pctx); - -static lrec_t* lrec_parse_mmap_dkvp_multi_irs_single_others(file_reader_mmap_state_t *phandle, - char ifs, char ips, lrec_reader_mmap_dkvp_state_t* pstate, context_t* pctx); - -static lrec_t* lrec_parse_mmap_dkvp_multi_irs_multi_others(file_reader_mmap_state_t *phandle, - lrec_reader_mmap_dkvp_state_t* pstate, context_t* pctx); - -static void skip_over_comment_lines_single_irs( - file_reader_mmap_state_t *phandle, - lrec_reader_mmap_dkvp_state_t* pstate, - char irs); - -static void skip_over_comment_lines_multi_irs( - file_reader_mmap_state_t *phandle, - lrec_reader_mmap_dkvp_state_t* pstate, - char* irs, - int irslen); - -// ---------------------------------------------------------------- -lrec_reader_t* lrec_reader_mmap_dkvp_alloc(char* irs, char* ifs, char* ips, int allow_repeat_ifs, - comment_handling_t comment_handling, char* comment_string) -{ - lrec_reader_t* plrec_reader = mlr_malloc_or_die(sizeof(lrec_reader_t)); - - lrec_reader_mmap_dkvp_state_t* pstate = mlr_malloc_or_die(sizeof(lrec_reader_mmap_dkvp_state_t)); - pstate->irs = irs; - pstate->ifs = ifs; - pstate->ips = ips; - pstate->irslen = strlen(irs); - pstate->ifslen = strlen(ifs); - pstate->ipslen = strlen(ips); - pstate->allow_repeat_ifs = allow_repeat_ifs; - pstate->do_auto_line_term = FALSE; - pstate->comment_handling = comment_handling; - pstate->comment_string = comment_string; - pstate->comment_string_length = comment_string == NULL ? 0 : strlen(comment_string); - - plrec_reader->pvstate = (void*)pstate; - plrec_reader->popen_func = file_reader_mmap_vopen; - plrec_reader->pclose_func = file_reader_mmap_vclose; - if (streq(irs, "auto")) { - // Auto means either lines end in "\n" or "\r\n" (LF or CRLF). In - // either case the final character is "\n". Then for autodetect we - // simply check if there's a character in the line before the '\n', and - // if that is '\r'. - pstate->do_auto_line_term = TRUE; - pstate->irs = "\n"; - pstate->irslen = 1; - plrec_reader->pprocess_func = (pstate->ifslen == 1 && pstate->ipslen == 1) - ? lrec_reader_mmap_dkvp_process_single_irs_single_others - : lrec_reader_mmap_dkvp_process_single_irs_multi_others; - } else if (pstate->irslen == 1) { - plrec_reader->pprocess_func = (pstate->ifslen == 1 && pstate->ipslen == 1) - ? lrec_reader_mmap_dkvp_process_single_irs_single_others - : lrec_reader_mmap_dkvp_process_single_irs_multi_others; - } else { - plrec_reader->pprocess_func = (pstate->ifslen == 1 && pstate->ipslen == 1) - ? lrec_reader_mmap_dkvp_process_multi_irs_single_others - : lrec_reader_mmap_dkvp_process_multi_irs_multi_others; - } - plrec_reader->psof_func = lrec_reader_mmap_dkvp_sof; - plrec_reader->pfree_func = lrec_reader_mmap_dkvp_free; - - return plrec_reader; -} - -static void lrec_reader_mmap_dkvp_free(lrec_reader_t* preader) { - free(preader->pvstate); - free(preader); -} - -// No-op for stateless readers such as this one. -static void lrec_reader_mmap_dkvp_sof(void* pvstate, void* pvhandle) { -} - -// ---------------------------------------------------------------- -static lrec_t* lrec_reader_mmap_dkvp_process_single_irs_single_others(void* pvstate, void* pvhandle, context_t* pctx) { - file_reader_mmap_state_t* phandle = pvhandle; - lrec_reader_mmap_dkvp_state_t* pstate = pvstate; - if (phandle->sol >= phandle->eof) - return NULL; - else - return lrec_parse_mmap_dkvp_single_irs_single_others(phandle, pstate->irs[0], pstate->ifs[0], pstate->ips[0], - pstate, pctx); -} - -static lrec_t* lrec_reader_mmap_dkvp_process_single_irs_multi_others(void* pvstate, void* pvhandle, context_t* pctx) { - file_reader_mmap_state_t* phandle = pvhandle; - lrec_reader_mmap_dkvp_state_t* pstate = pvstate; - if (phandle->sol >= phandle->eof) - return NULL; - else - return lrec_parse_mmap_dkvp_single_irs_multi_others(phandle, pstate->irs[0], pstate, pctx); -} - -static lrec_t* lrec_reader_mmap_dkvp_process_multi_irs_single_others(void* pvstate, void* pvhandle, context_t* pctx) { - file_reader_mmap_state_t* phandle = pvhandle; - lrec_reader_mmap_dkvp_state_t* pstate = pvstate; - if (phandle->sol >= phandle->eof) - return NULL; - else - return lrec_parse_mmap_dkvp_multi_irs_single_others(phandle, pstate->ifs[0], pstate->ips[0], - pstate, pctx); -} - -static lrec_t* lrec_reader_mmap_dkvp_process_multi_irs_multi_others(void* pvstate, void* pvhandle, context_t* pctx) { - file_reader_mmap_state_t* phandle = pvhandle; - lrec_reader_mmap_dkvp_state_t* pstate = pvstate; - if (phandle->sol >= phandle->eof) - return NULL; - else - return lrec_parse_mmap_dkvp_multi_irs_multi_others(phandle, pstate, pctx); -} - -// ---------------------------------------------------------------- -static lrec_t* lrec_parse_mmap_dkvp_single_irs_single_others(file_reader_mmap_state_t *phandle, - char irs, char ifs, char ips, lrec_reader_mmap_dkvp_state_t* pstate, context_t* pctx) -{ - if (pstate->comment_string != NULL) - skip_over_comment_lines_single_irs(phandle, pstate, irs); - - if (phandle->sol >= phandle->eof) - return NULL; - - char* line = phandle->sol; - lrec_t* prec = lrec_unbacked_alloc(); - - int idx = 0; - char* p = line; - if (pstate->allow_repeat_ifs) { - while (*p == ifs) - p++; - } - char* key = p; - char* value = p; - - int saw_ps = FALSE; - int saw_rs = FALSE; - - for ( ; p < phandle->eof && *p; ) { - if (*p == irs) { - *p = 0; - - if (pstate->do_auto_line_term) { - if (p > line && p[-1] == '\r') { - p[-1] = 0; - context_set_autodetected_crlf(pctx); - } else { - context_set_autodetected_lf(pctx); - } - } - - phandle->sol = p+1; - saw_rs = TRUE; - break; - } else if (*p == ifs) { - saw_ps = FALSE; - *p = 0; - - idx++; - if (*key == 0 || value <= key) { - // E.g the pair has no equals sign: "a" rather than "a=1" or - // "a=". Here we use the positional index as the key. This way - // DKVP is a generalization of NIDX. - char free_flags = NO_FREE; - lrec_put(prec, low_int_to_string(idx, &free_flags), value, free_flags); - } - else { - lrec_put(prec, key, value, NO_FREE); - } - - p++; - if (pstate->allow_repeat_ifs) { - while (*p == ifs) - p++; - } - key = p; - value = p; - } else if (*p == ips && !saw_ps) { - *p = 0; - p++; - value = p; - saw_ps = TRUE; - } else { - p++; - } - } - if (p >= phandle->eof) - phandle->sol = p+1; - idx++; - - if (pstate->allow_repeat_ifs && *key == 0 && *value == 0) - return prec; - - // There are two ways out of that loop: saw IRS, or saw end of file. - if (saw_rs) { - // Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate the - // C string so it's OK to retain a pointer to that. - if (*key == 0 || value <= key) { - char free_flags = NO_FREE; - if (value >= phandle->eof) - lrec_put(prec, low_int_to_string(idx, &free_flags), "", free_flags); - else - lrec_put(prec, low_int_to_string(idx, &free_flags), value, free_flags); - } - else { - if (value >= phandle->eof) - lrec_put(prec, key, "", NO_FREE); - else - lrec_put(prec, key, value, NO_FREE); - } - } else { - // Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character - // to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our - // copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one - // byte past the page and that will segv us. - if (*key == 0 || value <= key) { - char free_flags = NO_FREE; - if (value >= phandle->eof) { - lrec_put(prec, low_int_to_string(idx, &free_flags), "", free_flags); - } else { - char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value); - lrec_put(prec, low_int_to_string(idx, &free_flags), copy, free_flags | FREE_ENTRY_VALUE); - } - } - else { - if (value >= phandle->eof) { - lrec_put(prec, key, "", NO_FREE); - } else { - char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value); - lrec_put(prec, key, copy, FREE_ENTRY_VALUE); - } - } - } - - return prec; -} - -static lrec_t* lrec_parse_mmap_dkvp_multi_irs_single_others(file_reader_mmap_state_t *phandle, - char ifs, char ips, lrec_reader_mmap_dkvp_state_t* pstate, context_t* pctx) -{ - if (pstate->comment_string != NULL) - skip_over_comment_lines_multi_irs(phandle, pstate, pstate->irs, pstate->irslen); - - if (phandle->sol >= phandle->eof) - return NULL; - - char* line = phandle->sol; - lrec_t* prec = lrec_unbacked_alloc(); - - int idx = 0; - char* p = line; - if (pstate->allow_repeat_ifs) { - while (*p == ifs) - p++; - } - char* key = p; - char* value = p; - - int saw_ps = FALSE; - int saw_rs = FALSE; - - for ( ; p < phandle->eof && *p; ) { - if (streqn(p, pstate->irs, pstate->irslen)) { - *p = 0; - phandle->sol = p + pstate->irslen; - saw_rs = TRUE; - break; - } else if (*p == ifs) { - saw_ps = FALSE; - *p = 0; - - idx++; - if (*key == 0 || value <= key) { - // E.g the pair has no equals sign: "a" rather than "a=1" or - // "a=". Here we use the positional index as the key. This way - // DKVP is a generalization of NIDX. - char free_flags = NO_FREE; - lrec_put(prec, low_int_to_string(idx, &free_flags), value, free_flags); - } - else { - lrec_put(prec, key, value, NO_FREE); - } - - p++; - if (pstate->allow_repeat_ifs) { - while (*p == ifs) - p++; - } - key = p; - value = p; - } else if (*p == ips && !saw_ps) { - *p = 0; - p++; - value = p; - saw_ps = TRUE; - } else { - p++; - } - } - if (p >= phandle->eof) - phandle->sol = p+1; - idx++; - - if (pstate->allow_repeat_ifs && *key == 0 && *value == 0) - return prec; - - // There are two ways out of that loop: saw IRS, or saw end of file. - if (saw_rs) { - // Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate the - // C string so it's OK to retain a pointer to that. - if (*key == 0 || value <= key) { - char free_flags = NO_FREE; - if (value >= phandle->eof) - lrec_put(prec, low_int_to_string(idx, &free_flags), "", free_flags); - else - lrec_put(prec, low_int_to_string(idx, &free_flags), value, free_flags); - } - else { - if (value >= phandle->eof) - lrec_put(prec, key, "", NO_FREE); - else - lrec_put(prec, key, value, NO_FREE); - } - } else { - // Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character - // to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our - // copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one - // byte past the page and that will segv us. - if (*key == 0 || value <= key) { - char free_flags = NO_FREE; - if (value >= phandle->eof) { - lrec_put(prec, low_int_to_string(idx, &free_flags), "", free_flags); - } else { - char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value); - lrec_put(prec, low_int_to_string(idx, &free_flags), copy, free_flags | FREE_ENTRY_VALUE); - } - } - else { - if (value >= phandle->eof) { - lrec_put(prec, key, "", NO_FREE); - } else { - char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value); - lrec_put(prec, key, copy, FREE_ENTRY_VALUE); - } - } - } - - return prec; -} - -static lrec_t* lrec_parse_mmap_dkvp_single_irs_multi_others(file_reader_mmap_state_t *phandle, char irs, - lrec_reader_mmap_dkvp_state_t* pstate, context_t* pctx) -{ - if (pstate->comment_string != NULL) - skip_over_comment_lines_single_irs(phandle, pstate, irs); - - if (phandle->sol >= phandle->eof) - return NULL; - - char* line = phandle->sol; - lrec_t* prec = lrec_unbacked_alloc(); - - int idx = 0; - char* p = line; - if (pstate->allow_repeat_ifs) { - while (streqn(p, pstate->ifs, pstate->ifslen)) - p += pstate->ifslen; - } - char* key = p; - char* value = p; - - int saw_ps = FALSE; - int saw_rs = FALSE; - - for ( ; p < phandle->eof && *p; ) { - if (*p == irs) { - *p = 0; - - if (pstate->do_auto_line_term) { - if (p > line && p[-1] == '\r') { - p[-1] = 0; - context_set_autodetected_crlf(pctx); - } else { - context_set_autodetected_lf(pctx); - } - } - - phandle->sol = p+1; - saw_rs = TRUE; - break; - } else if (streqn(p, pstate->ifs, pstate->ifslen)) { - saw_ps = FALSE; - *p = 0; - - idx++; - if (*key == 0 || value <= key) { - // E.g the pair has no equals sign: "a" rather than "a=1" or - // "a=". Here we use the positional index as the key. This way - // DKVP is a generalization of NIDX. - char free_flags = NO_FREE; - lrec_put(prec, low_int_to_string(idx, &free_flags), value, free_flags); - } - else { - lrec_put(prec, key, value, NO_FREE); - } - - p += pstate->ifslen; - if (pstate->allow_repeat_ifs) { - while (streqn(p, pstate->ifs, pstate->ifslen)) - p += pstate->ifslen; - } - key = p; - value = p; - } else if (streqn(p, pstate->ips, pstate->ipslen) && !saw_ps) { - *p = 0; - p += pstate->ipslen; - value = p; - saw_ps = TRUE; - } else { - p++; - } - } - *p = 0; - if (p >= phandle->eof) - phandle->sol = p+1; - idx++; - - if (pstate->allow_repeat_ifs && *key == 0 && *value == 0) - return prec; - - // There are two ways out of that loop: saw IRS, or saw end of file. - if (saw_rs) { - // Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate the - // C string so it's OK to retain a pointer to that. - if (*key == 0 || value <= key) { - char free_flags = NO_FREE; - if (value >= phandle->eof) - lrec_put(prec, low_int_to_string(idx, &free_flags), "", free_flags); - else - lrec_put(prec, low_int_to_string(idx, &free_flags), value, free_flags); - } - else { - if (value >= phandle->eof) - lrec_put(prec, key, "", NO_FREE); - else - lrec_put(prec, key, value, NO_FREE); - } - } else { - // Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character - // to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our - // copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one - // byte past the page and that will segv us. - if (*key == 0 || value <= key) { - char free_flags = NO_FREE; - if (value >= phandle->eof) { - lrec_put(prec, low_int_to_string(idx, &free_flags), "", free_flags); - } else { - char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value); - lrec_put(prec, low_int_to_string(idx, &free_flags), copy, free_flags | FREE_ENTRY_VALUE); - } - } - else { - if (value >= phandle->eof) { - lrec_put(prec, key, "", NO_FREE); - } else { - char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value); - lrec_put(prec, key, copy, FREE_ENTRY_VALUE); - } - } - } - - return prec; -} - -static lrec_t* lrec_parse_mmap_dkvp_multi_irs_multi_others(file_reader_mmap_state_t *phandle, - lrec_reader_mmap_dkvp_state_t* pstate, context_t* pctx) -{ - if (pstate->comment_string != NULL) - skip_over_comment_lines_multi_irs(phandle, pstate, pstate->irs, pstate->irslen); - - if (phandle->sol >= phandle->eof) - return NULL; - - char* line = phandle->sol; - lrec_t* prec = lrec_unbacked_alloc(); - - int idx = 0; - char* p = line; - if (pstate->allow_repeat_ifs) { - while (streqn(p, pstate->ifs, pstate->ifslen)) - p += pstate->ifslen; - } - char* key = p; - char* value = p; - - int saw_ps = FALSE; - int saw_rs = FALSE; - - for ( ; p < phandle->eof && *p; ) { - if (streqn(p, pstate->irs, pstate->irslen)) { - *p = 0; - phandle->sol = p + pstate->irslen; - saw_rs = TRUE; - break; - } else if (streqn(p, pstate->ifs, pstate->ifslen)) { - saw_ps = FALSE; - *p = 0; - - idx++; - if (*key == 0 || value <= key) { - // E.g the pair has no equals sign: "a" rather than "a=1" or - // "a=". Here we use the positional index as the key. This way - // DKVP is a generalization of NIDX. - char free_flags = NO_FREE; - lrec_put(prec, low_int_to_string(idx, &free_flags), value, free_flags); - } - else { - lrec_put(prec, key, value, NO_FREE); - } - - p += pstate->ifslen; - if (pstate->allow_repeat_ifs) { - while (streqn(p, pstate->ifs, pstate->ifslen)) - p += pstate->ifslen; - } - key = p; - value = p; - } else if (streqn(p, pstate->ips, pstate->ipslen) && !saw_ps) { - *p = 0; - p += pstate->ipslen; - value = p; - saw_ps = TRUE; - } else { - p++; - } - } - if (p >= phandle->eof) - phandle->sol = p+1; - idx++; - - if (pstate->allow_repeat_ifs && *key == 0 && *value == 0) - return prec; - - // There are two ways out of that loop: saw IRS, or saw end of file. - if (saw_rs) { - // Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate the - // C string so it's OK to retain a pointer to that. - if (*key == 0 || value <= key) { - char free_flags = NO_FREE; - if (value >= phandle->eof) - lrec_put(prec, low_int_to_string(idx, &free_flags), "", free_flags); - else - lrec_put(prec, low_int_to_string(idx, &free_flags), value, free_flags); - } - else { - if (value >= phandle->eof) - lrec_put(prec, key, "", NO_FREE); - else - lrec_put(prec, key, value, NO_FREE); - } - } else { - // Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character - // to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our - // copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one - // byte past the page and that will segv us. - if (*key == 0 || value <= key) { - char free_flags = NO_FREE; - if (value >= phandle->eof) { - lrec_put(prec, low_int_to_string(idx, &free_flags), "", free_flags); - } else { - char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value); - lrec_put(prec, low_int_to_string(idx, &free_flags), copy, free_flags | FREE_ENTRY_VALUE); - } - } - else { - if (value >= phandle->eof) { - lrec_put(prec, key, "", NO_FREE); - } else { - char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value); - lrec_put(prec, key, copy, FREE_ENTRY_VALUE); - } - } - } - - return prec; -} - -// ---------------------------------------------------------------- -static void skip_over_comment_lines_single_irs( - file_reader_mmap_state_t *phandle, - lrec_reader_mmap_dkvp_state_t* pstate, - char irs) -{ - while ((phandle->eof - phandle->sol) >= pstate->comment_string_length - && streqn(phandle->sol, pstate->comment_string, pstate->comment_string_length)) - { - if (pstate->comment_handling == PASS_COMMENTS) - for (int i = 0; i < pstate->comment_string_length; i++) - fputc(phandle->sol[i], stdout); - phandle->sol += pstate->comment_string_length; - while (phandle->sol < phandle->eof && *phandle->sol != irs) { - if (pstate->comment_handling == PASS_COMMENTS) - fputc(*phandle->sol, stdout); - phandle->sol++; - } - if (phandle->sol < phandle->eof && *phandle->sol == irs) { - if (pstate->comment_handling == PASS_COMMENTS) - fputc(*phandle->sol, stdout); - phandle->sol++; - } - } -} - -static void skip_over_comment_lines_multi_irs( - file_reader_mmap_state_t *phandle, - lrec_reader_mmap_dkvp_state_t* pstate, - char* irs, - int irslen) -{ - while ((phandle->eof - phandle->sol) >= pstate->comment_string_length - && streqn(phandle->sol, pstate->comment_string, pstate->comment_string_length)) - { - if (pstate->comment_handling == PASS_COMMENTS) - for (int i = 0; i < pstate->comment_string_length; i++) - fputc(phandle->sol[i], stdout); - phandle->sol += pstate->comment_string_length; - while ((phandle->eof - phandle->sol) >= irslen && !streqn(phandle->sol, irs, irslen)) { - if (pstate->comment_handling == PASS_COMMENTS) - fputc(*phandle->sol, stdout); - phandle->sol++; - } - if ((phandle->eof - phandle->sol) >= irslen && streqn(phandle->sol, irs, irslen)) { - if (pstate->comment_handling == PASS_COMMENTS) - for (int i = 0; i < irslen; i++) - fputc(phandle->sol[i], stdout); - phandle->sol += irslen; - } - } -} diff --git a/c/input/lrec_reader_mmap_json.c b/c/input/lrec_reader_mmap_json.c deleted file mode 100644 index aa4cd48e8..000000000 --- a/c/input/lrec_reader_mmap_json.c +++ /dev/null @@ -1,220 +0,0 @@ -// ================================================================ -// Note: there are multiple process methods with a lot of code duplication. -// This is intentional. Much of Miller's measured processing time is in the -// lrec-reader process methods. This is code which needs to execute on every -// byte of input and even moving a single runtime if-statement into a -// function-pointer assignment at alloc time can have noticeable effects on -// performance (5-10% in some cases). -// ================================================================ - -// ================================================================ -// Unlike other Miller record-readers, there is no streaming for JSON input: no -// records are processed until EOF is seen. See also -// https://github.com/johnkerl/miller/issues/99. -// ================================================================ - -#include -#include -#include "cli/json_array_ingest.h" -#include "cli/comment_handling.h" -#include "lib/mlr_globals.h" -#include "lib/mlrutil.h" -#include "input/file_reader_mmap.h" -#include "input/lrec_readers.h" -#include "input/json_parser.h" -#include "input/mlr_json_adapter.h" - -typedef struct _lrec_reader_mmap_json_state_t { - // The list of top-level JSON objects is backed by the file contents. The records are in turn - // backed by the top-level JSON objects. This means the latter should not be freed while - // the records are in used. (This is done to reduce data copies, for performance: we can - // manipulate pointers to strings rather than copying strings.) - // - // In particular, in the multifile-input case, we need to keep *all* parsed JSON (and - // not free one file's data when we proceed to the next) since records with pointers - // into the parsed JSON may still be in use -- e.g. mlr sort. - sllv_t* ptop_level_json_objects; - sllv_t* precords; - char* input_json_flatten_separator; - json_array_ingest_t json_array_ingest; - char* specified_line_term; - int do_auto_line_term; - char* detected_line_term; - comment_handling_t comment_handling; - char* comment_string; -} lrec_reader_mmap_json_state_t; - -static void lrec_reader_mmap_json_free(lrec_reader_t* preader); -static void lrec_reader_mmap_json_sof(void* pvstate, void* pvhandle); -static lrec_t* lrec_reader_mmap_json_process(void* pvstate, void* pvhandle, context_t* pctx); - -// ---------------------------------------------------------------- -lrec_reader_t* lrec_reader_mmap_json_alloc(char* input_json_flatten_separator, json_array_ingest_t json_array_ingest, char* line_term, - comment_handling_t comment_handling, char* comment_string) -{ - lrec_reader_t* plrec_reader = mlr_malloc_or_die(sizeof(lrec_reader_t)); - - lrec_reader_mmap_json_state_t* pstate = mlr_malloc_or_die(sizeof(lrec_reader_mmap_json_state_t)); - pstate->ptop_level_json_objects = sllv_alloc(); - pstate->precords = sllv_alloc(); - pstate->input_json_flatten_separator = input_json_flatten_separator; - pstate->json_array_ingest = json_array_ingest; - pstate->specified_line_term = line_term; - pstate->do_auto_line_term = FALSE; - pstate->detected_line_term = "\n"; // xxx adapt to MLR_GLOBALS/ctx-const for Windows port - pstate->comment_handling = comment_handling; - pstate->comment_string = comment_string; - - if (streq(line_term, "auto")) { - pstate->do_auto_line_term = TRUE; - } - - plrec_reader->pvstate = (void*)pstate; - plrec_reader->popen_func = file_reader_mmap_vopen; - plrec_reader->pclose_func = file_reader_mmap_vclose; - plrec_reader->pprocess_func = lrec_reader_mmap_json_process; - plrec_reader->psof_func = lrec_reader_mmap_json_sof; - plrec_reader->pfree_func = lrec_reader_mmap_json_free; - - return plrec_reader; -} - -static void lrec_reader_mmap_json_free(lrec_reader_t* preader) { - lrec_reader_mmap_json_state_t* pstate = preader->pvstate; - - for (sllve_t* pe = pstate->ptop_level_json_objects->phead; pe != NULL; pe = pe->pnext) { - json_value_t* top_level_json_object = pe->pvvalue; - json_free_value(top_level_json_object); - } - sllv_free(pstate->ptop_level_json_objects); - pstate->ptop_level_json_objects = NULL; - for (sllve_t* pf = pstate->precords->phead; pf != NULL; pf = pf->pnext) { - lrec_t* prec = pf->pvvalue; - lrec_free(prec); - } - sllv_free(pstate->precords); - pstate->precords = NULL; - - free(pstate); - free(preader); -} - -// The mmap-JSON lrec-reader is non-streaming: we ingest all records here in the start-of-file hook. -// Then in the process method we pop one lrec off the list at a time, until they are all exhausted. -// This is in contrast to other Miller lrec-readers. -// -// It would be possible to extend the streaming framework to also have an end-of-file hook -// which we could use here to free parsed-JSON data. However, we simply leverage the start-of-file -// hook for the *next* file (if any) or the free method (if not): these free parsed-JSON structures -// from the previous file (if any). -static void lrec_reader_mmap_json_sof(void* pvstate, void* pvhandle) { - lrec_reader_mmap_json_state_t* pstate = pvstate; - file_reader_mmap_state_t* phandle = pvhandle; - json_char* json_input = (json_char*)phandle->sol; - json_value_t* parsed_top_level_json; - json_char error_buf[JSON_ERROR_MAX]; - - // This enables us to handle input of the form - // - // { "a" : 1 } - // { "b" : 2 } - // { "c" : 3 } - // - // in addition to - // - // [ - // { "a" : 1 } - // { "b" : 2 } - // { "c" : 3 } - // ] - // - // This is in line with what jq can handle. In this case, json_parse will return - // once for each top-level item and will give us back a pointer to the start of - // the rest of the input stream, so we can call json_parse on the rest until it is - // all exhausted. - - json_char* item_start = json_input; - int length = phandle->eof - phandle->sol; - char* detected_line_term = NULL; - - while (TRUE) { - - // Find the first line-ending sequence (if any): LF or CRLF. - if (pstate->do_auto_line_term) { - if (detected_line_term == NULL) { - for (char* p = phandle->sol; p < phandle->eof; p++) { - if (p[0] == '\n') { - if (p > phandle->sol && p[-1] == '\r') { - detected_line_term = "\r\n"; - } else { - detected_line_term = "\n"; - } - break; - } - } - } - } - - // Skip comments. For JSON, we ingest the entire blob, this is a matter of finding and iterating over lines. - // Miller data comments must be at start of line. - if (pstate->comment_handling != COMMENTS_ARE_DATA) { - char* line_term = pstate->specified_line_term; - if (pstate->do_auto_line_term && detected_line_term != NULL) - line_term = detected_line_term; - mlr_json_strip_comments(item_start, item_start + length, pstate->comment_handling, pstate->comment_string, - line_term); - } - - // Trim trailing whitespace. - char* item_end = item_start + length; - mlr_json_end_strip(item_start, &item_end); - length = item_end - item_start; - - if (length == 0) - break; - - parsed_top_level_json = json_parse(item_start, length, error_buf, &item_start); - if (parsed_top_level_json == NULL) { - fprintf(stderr, "%s: Unable to parse JSON data: %s\n", MLR_GLOBALS.bargv0, error_buf); - exit(1); - } - - sllv_append(pstate->ptop_level_json_objects, parsed_top_level_json); - - // The lrecs have their string pointers pointing into the parsed-JSON objects (for - // efficiency) so it's important we not free the latter until our free method. - if (!reference_json_objects_as_lrecs(pstate->precords, parsed_top_level_json, - pstate->input_json_flatten_separator, pstate->json_array_ingest)) - { - fprintf(stderr, "%s: Unable to parse JSON data.\n", MLR_GLOBALS.bargv0); - exit(1); - } - - if (item_start == NULL) - break; - if (*item_start == 0) - break; - length -= (item_start - json_input); - json_input = item_start; - // json_parse goes up to the '\r' or '\n' (whichever is found first) on the first - // parse, then keeps going from there on the next. E.g. in the CRLF case it - // consumes the CR at the end of the first read and consumes the LF at the start - // of the second, and so on. After the very last parse, we need to here consume - // the final '\n' which is (by itself) a parse error. - if (length == 1 && *(char*)json_input == '\n') { - break; - } - } - if (detected_line_term != NULL) { - pstate->detected_line_term = detected_line_term; - } -} - -// ---------------------------------------------------------------- -static lrec_t* lrec_reader_mmap_json_process(void* pvstate, void* pvhandle, context_t* pctx) { - lrec_reader_mmap_json_state_t* pstate = pvstate; - if (pstate->do_auto_line_term) { - context_set_autodetected_line_term(pctx, pstate->detected_line_term); - } - return sllv_pop(pstate->precords); -} diff --git a/c/input/lrec_reader_mmap_nidx.c b/c/input/lrec_reader_mmap_nidx.c deleted file mode 100644 index 3cfbeca45..000000000 --- a/c/input/lrec_reader_mmap_nidx.c +++ /dev/null @@ -1,512 +0,0 @@ -// ================================================================ -// Note: there are multiple process methods with a lot of code duplication. -// This is intentional. Much of Miller's measured processing time is in the -// lrec-reader process methods. This is code which needs to execute on every -// byte of input and even moving a single runtime if-statement into a -// function-pointer assignment at alloc time can have noticeable effects on -// performance (5-10% in some cases). -// ================================================================ - -#include -#include "cli/comment_handling.h" -#include "lib/mlrutil.h" -#include "input/file_reader_mmap.h" -#include "input/lrec_readers.h" - -typedef struct _lrec_reader_mmap_nidx_state_t { - char* irs; - char* ifs; - int irslen; - int ifslen; - int allow_repeat_ifs; - int do_auto_line_term; - comment_handling_t comment_handling; - char* comment_string; - int comment_string_length; -} lrec_reader_mmap_nidx_state_t; - -static void lrec_reader_mmap_nidx_free(lrec_reader_t* preader); -static void lrec_reader_mmap_nidx_sof(void* pvstate, void* pvhandle); -static lrec_t* lrec_reader_mmap_nidx_process_single_irs_single_ifs(void* pvstate, void* pvhandle, context_t* pctx); -static lrec_t* lrec_reader_mmap_nidx_process_single_irs_multi_ifs(void* pvstate, void* pvhandle, context_t* pctx); -static lrec_t* lrec_reader_mmap_nidx_process_multi_irs_single_ifs(void* pvstate, void* pvhandle, context_t* pctx); -static lrec_t* lrec_reader_mmap_nidx_process_multi_irs_multi_ifs(void* pvstate, void* pvhandle, context_t* pctx); - -static lrec_t* lrec_parse_mmap_nidx_single_irs_single_ifs(file_reader_mmap_state_t *phandle, - char irs, char ifs, lrec_reader_mmap_nidx_state_t* pstate, context_t* pctx); - -static lrec_t* lrec_parse_mmap_nidx_single_irs_multi_ifs(file_reader_mmap_state_t *phandle, - char irs, lrec_reader_mmap_nidx_state_t* pstate, context_t* pctx); - -static lrec_t* lrec_parse_mmap_nidx_multi_irs_single_ifs(file_reader_mmap_state_t *phandle, - char ifs, lrec_reader_mmap_nidx_state_t* pstate); - -static lrec_t* lrec_parse_mmap_nidx_multi_irs_multi_ifs(file_reader_mmap_state_t *phandle, - lrec_reader_mmap_nidx_state_t* pstate); - -static void skip_over_comment_lines_single_irs( - file_reader_mmap_state_t *phandle, - lrec_reader_mmap_nidx_state_t* pstate, - char irs); - -static void skip_over_comment_lines_multi_irs( - file_reader_mmap_state_t *phandle, - lrec_reader_mmap_nidx_state_t* pstate, - char* irs, - int irslen); - -// ---------------------------------------------------------------- -lrec_reader_t* lrec_reader_mmap_nidx_alloc(char* irs, char* ifs, int allow_repeat_ifs, - comment_handling_t comment_handling, char* comment_string) -{ - lrec_reader_t* plrec_reader = mlr_malloc_or_die(sizeof(lrec_reader_t)); - - lrec_reader_mmap_nidx_state_t* pstate = mlr_malloc_or_die(sizeof(lrec_reader_mmap_nidx_state_t)); - pstate->irs = irs; - pstate->ifs = ifs; - pstate->irslen = strlen(pstate->irs); - pstate->ifslen = strlen(pstate->ifs); - pstate->allow_repeat_ifs = allow_repeat_ifs; - pstate->do_auto_line_term = FALSE; - pstate->comment_handling = comment_handling; - pstate->comment_string = comment_string; - pstate->comment_string_length = comment_string == NULL ? 0 : strlen(comment_string); - - plrec_reader->pvstate = (void*)pstate; - plrec_reader->popen_func = file_reader_mmap_vopen; - plrec_reader->pclose_func = file_reader_mmap_vclose; - - if (streq(irs, "auto")) { - // Auto means either lines end in "\n" or "\r\n" (LF or CRLF). In - // either case the final character is "\n". Then for autodetect we - // simply check if there's a character in the line before the '\n', and - // if that is '\r'. - pstate->do_auto_line_term = TRUE; - pstate->irs = "\n"; - pstate->irslen = 1; - plrec_reader->pprocess_func = (pstate->ifslen == 1) - ? lrec_reader_mmap_nidx_process_single_irs_single_ifs - : lrec_reader_mmap_nidx_process_single_irs_multi_ifs; - } else if (pstate->irslen == 1) { - plrec_reader->pprocess_func = (pstate->ifslen == 1) - ? lrec_reader_mmap_nidx_process_single_irs_single_ifs - : lrec_reader_mmap_nidx_process_single_irs_multi_ifs; - } else { - plrec_reader->pprocess_func = (pstate->ifslen == 1) - ? lrec_reader_mmap_nidx_process_multi_irs_single_ifs - : lrec_reader_mmap_nidx_process_multi_irs_multi_ifs; - } - - plrec_reader->psof_func = lrec_reader_mmap_nidx_sof; - plrec_reader->pfree_func = lrec_reader_mmap_nidx_free; - - return plrec_reader; -} - -static void lrec_reader_mmap_nidx_free(lrec_reader_t* preader) { - free(preader->pvstate); - free(preader); -} - -// No-op for stateless readers such as this one. -static void lrec_reader_mmap_nidx_sof(void* pvstate, void* pvhandle) { -} - -// ---------------------------------------------------------------- -static lrec_t* lrec_reader_mmap_nidx_process_single_irs_single_ifs(void* pvstate, void* pvhandle, context_t* pctx) { - file_reader_mmap_state_t* phandle = pvhandle; - lrec_reader_mmap_nidx_state_t* pstate = pvstate; - if (phandle->sol >= phandle->eof) - return NULL; - else - return lrec_parse_mmap_nidx_single_irs_single_ifs(phandle, pstate->irs[0], pstate->ifs[0], pstate, pctx); -} - -static lrec_t* lrec_reader_mmap_nidx_process_single_irs_multi_ifs(void* pvstate, void* pvhandle, context_t* pctx) { - file_reader_mmap_state_t* phandle = pvhandle; - lrec_reader_mmap_nidx_state_t* pstate = pvstate; - if (phandle->sol >= phandle->eof) - return NULL; - else - return lrec_parse_mmap_nidx_single_irs_multi_ifs(phandle, pstate->irs[0], pstate, pctx); -} - -static lrec_t* lrec_reader_mmap_nidx_process_multi_irs_single_ifs(void* pvstate, void* pvhandle, context_t* pctx) { - file_reader_mmap_state_t* phandle = pvhandle; - lrec_reader_mmap_nidx_state_t* pstate = pvstate; - if (phandle->sol >= phandle->eof) - return NULL; - else - return lrec_parse_mmap_nidx_multi_irs_single_ifs(phandle, pstate->ifs[0], pstate); -} - -static lrec_t* lrec_reader_mmap_nidx_process_multi_irs_multi_ifs(void* pvstate, void* pvhandle, context_t* pctx) { - file_reader_mmap_state_t* phandle = pvhandle; - lrec_reader_mmap_nidx_state_t* pstate = pvstate; - if (phandle->sol >= phandle->eof) - return NULL; - else - return lrec_parse_mmap_nidx_multi_irs_multi_ifs(phandle, pstate); -} - -// ---------------------------------------------------------------- -static lrec_t* lrec_parse_mmap_nidx_single_irs_single_ifs(file_reader_mmap_state_t *phandle, - char irs, char ifs, lrec_reader_mmap_nidx_state_t* pstate, context_t* pctx) -{ - if (pstate->comment_string != NULL) - skip_over_comment_lines_single_irs(phandle, pstate, irs); - - if (phandle->sol >= phandle->eof) - return NULL; - - char* line = phandle->sol; - lrec_t* prec = lrec_unbacked_alloc(); - - int idx = 0; - char free_flags = NO_FREE; - - char* p = line; - if (pstate->allow_repeat_ifs) { - while (*p == ifs) - p++; - } - char* key = NULL; - char* value = p; - int saw_rs = FALSE; - for ( ; p < phandle->eof && *p; ) { - if (*p == irs) { - *p = 0; - - if (pstate->do_auto_line_term) { - if (p > line && p[-1] == '\r') { - p[-1] = 0; - context_set_autodetected_crlf(pctx); - } else { - context_set_autodetected_lf(pctx); - } - } - - phandle->sol = p+1; - saw_rs = TRUE; - break; - } else if (*p == ifs) { - *p = 0; - - idx++; - key = low_int_to_string(idx, &free_flags); - lrec_put(prec, key, value, free_flags); - - p++; - if (pstate->allow_repeat_ifs) { - while (*p == ifs) - p++; - } - value = p; - } else { - p++; - } - } - if (p >= phandle->eof) - phandle->sol = p+1; - idx++; - - if (pstate->allow_repeat_ifs && *value == 0) - return prec; - - key = low_int_to_string(idx, &free_flags); - - if (saw_rs) { - // Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate the - // C string so it's OK to retain a pointer to that. - lrec_put(prec, key, value, free_flags); - } else { - // Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character - // to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our - // copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one - // byte past the page and that will segv us. - char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value); - lrec_put(prec, key, copy, free_flags|FREE_ENTRY_VALUE); - } - - return prec; -} - -static lrec_t* lrec_parse_mmap_nidx_single_irs_multi_ifs(file_reader_mmap_state_t *phandle, - char irs, lrec_reader_mmap_nidx_state_t* pstate, context_t* pctx) -{ - if (pstate->comment_string != NULL) - skip_over_comment_lines_single_irs(phandle, pstate, irs); - - lrec_t* prec = lrec_unbacked_alloc(); - - char* ifs = pstate->ifs; - int ifslen = pstate->ifslen; - - char* line = phandle->sol; - int idx = 0; - char free_flags = NO_FREE; - - char* p = line; - if (pstate->allow_repeat_ifs) { - while (streqn(p, ifs, ifslen)) - p += ifslen; - } - char* key = NULL; - char* value = p; - int saw_rs = FALSE; - - for ( ; p < phandle->eof && *p; ) { - if (*p == irs) { - *p = 0; - - if (pstate->do_auto_line_term) { - if (p > line && p[-1] == '\r') { - p[-1] = 0; - context_set_autodetected_crlf(pctx); - } else { - context_set_autodetected_lf(pctx); - } - } - - phandle->sol = p+1; - saw_rs = TRUE; - break; - } else if (streqn(p, ifs, ifslen)) { - *p = 0; - - idx++; - key = low_int_to_string(idx, &free_flags); - lrec_put(prec, key, value, free_flags); - - p += ifslen; - if (pstate->allow_repeat_ifs) { - while (streqn(p, ifs, ifslen)) - p += ifslen; - } - value = p; - } else { - p++; - } - } - if (p >= phandle->eof) - phandle->sol = p+1; - idx++; - - if (pstate->allow_repeat_ifs && *value == 0) - return prec; - - key = low_int_to_string(idx, &free_flags); - - if (saw_rs) { - // Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate the - // C string so it's OK to retain a pointer to that. - lrec_put(prec, key, value, free_flags); - } else { - // Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character - // to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our - // copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one - // byte past the page and that will segv us. - char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value); - lrec_put(prec, key, copy, free_flags|FREE_ENTRY_VALUE); - } - - return prec; -} - -static lrec_t* lrec_parse_mmap_nidx_multi_irs_single_ifs(file_reader_mmap_state_t *phandle, - char ifs, lrec_reader_mmap_nidx_state_t* pstate) -{ - if (pstate->comment_string != NULL) - skip_over_comment_lines_multi_irs(phandle, pstate, pstate->irs, pstate->irslen); - - lrec_t* prec = lrec_unbacked_alloc(); - - char* line = phandle->sol; - int idx = 0; - char free_flags = NO_FREE; - - char* p = line; - if (pstate->allow_repeat_ifs) { - while (*p == ifs) - p++; - } - char* key = NULL; - char* value = p; - int saw_rs = FALSE; - - char* irs = pstate->irs; - int irslen = pstate->irslen; - - for ( ; p < phandle->eof && *p; ) { - if (streqn(p, irs, irslen)) { - *p = 0; - phandle->sol = p + irslen; - saw_rs = TRUE; - break; - } else if (*p == ifs) { - *p = 0; - - idx++; - key = low_int_to_string(idx, &free_flags); - lrec_put(prec, key, value, free_flags); - - p++; - if (pstate->allow_repeat_ifs) { - while (*p == ifs) - p++; - } - value = p; - } else { - p++; - } - } - if (p >= phandle->eof) - phandle->sol = p+1; - idx++; - - if (pstate->allow_repeat_ifs && *value == 0) - return prec; - - key = low_int_to_string(idx, &free_flags); - - if (saw_rs) { - // Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate the - // C string so it's OK to retain a pointer to that. - lrec_put(prec, key, value, free_flags); - } else { - // Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character - // to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our - // copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one - // byte past the page and that will segv us. - char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value); - lrec_put(prec, key, copy, free_flags|FREE_ENTRY_VALUE); - } - - return prec; -} - -static lrec_t* lrec_parse_mmap_nidx_multi_irs_multi_ifs(file_reader_mmap_state_t *phandle, - lrec_reader_mmap_nidx_state_t* pstate) -{ - if (pstate->comment_string != NULL) - skip_over_comment_lines_multi_irs(phandle, pstate, pstate->irs, pstate->irslen); - - lrec_t* prec = lrec_unbacked_alloc(); - - char* line = phandle->sol; - int idx = 0; - char free_flags = NO_FREE; - - char* ifs = pstate->ifs; - int ifslen = pstate->ifslen; - char* irs = pstate->irs; - int irslen = pstate->irslen; - - char* p = line; - if (pstate->allow_repeat_ifs) { - while (streqn(p, ifs, ifslen)) - p += ifslen; - } - char* key = NULL; - char* value = p; - int saw_rs = FALSE; - for ( ; p < phandle->eof && *p; ) { - if (streqn(p, irs, irslen)) { - *p = 0; - phandle->sol = p + irslen; - saw_rs = TRUE; - break; - } else if (streqn(p, ifs, ifslen)) { - *p = 0; - - idx++; - key = low_int_to_string(idx, &free_flags); - lrec_put(prec, key, value, free_flags); - - p += ifslen; - if (pstate->allow_repeat_ifs) { - while (streqn(p, ifs, ifslen)) - p += ifslen; - } - value = p; - } else { - p++; - } - } - if (p >= phandle->eof) - phandle->sol = p+1; - idx++; - - if (pstate->allow_repeat_ifs && *value == 0) - return prec; - - key = low_int_to_string(idx, &free_flags); - - if (saw_rs) { - // Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate the - // C string so it's OK to retain a pointer to that. - lrec_put(prec, key, value, free_flags); - } else { - // Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character - // to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our - // copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one - // byte past the page and that will segv us. - char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value); - lrec_put(prec, key, copy, free_flags|FREE_ENTRY_VALUE); - } - - return prec; -} - -// ---------------------------------------------------------------- -static void skip_over_comment_lines_single_irs( - file_reader_mmap_state_t *phandle, - lrec_reader_mmap_nidx_state_t* pstate, - char irs) -{ - while ((phandle->eof - phandle->sol) >= pstate->comment_string_length - && streqn(phandle->sol, pstate->comment_string, pstate->comment_string_length)) - { - if (pstate->comment_handling == PASS_COMMENTS) - for (int i = 0; i < pstate->comment_string_length; i++) - fputc(phandle->sol[i], stdout); - phandle->sol += pstate->comment_string_length; - while (phandle->sol < phandle->eof && *phandle->sol != irs) { - if (pstate->comment_handling == PASS_COMMENTS) - fputc(*phandle->sol, stdout); - phandle->sol++; - } - if (phandle->sol < phandle->eof && *phandle->sol == irs) { - if (pstate->comment_handling == PASS_COMMENTS) - fputc(*phandle->sol, stdout); - phandle->sol++; - } - } -} - -static void skip_over_comment_lines_multi_irs( - file_reader_mmap_state_t *phandle, - lrec_reader_mmap_nidx_state_t* pstate, - char* irs, - int irslen) -{ - while ((phandle->eof - phandle->sol) >= pstate->comment_string_length - && streqn(phandle->sol, pstate->comment_string, pstate->comment_string_length)) - { - if (pstate->comment_handling == PASS_COMMENTS) - for (int i = 0; i < pstate->comment_string_length; i++) - fputc(phandle->sol[i], stdout); - phandle->sol += pstate->comment_string_length; - while ((phandle->eof - phandle->sol) >= irslen && !streqn(phandle->sol, irs, irslen)) { - if (pstate->comment_handling == PASS_COMMENTS) - fputc(*phandle->sol, stdout); - phandle->sol++; - } - if ((phandle->eof - phandle->sol) >= irslen && streqn(phandle->sol, irs, irslen)) { - if (pstate->comment_handling == PASS_COMMENTS) - for (int i = 0; i < irslen; i++) - fputc(phandle->sol[i], stdout); - phandle->sol += irslen; - } - } -} diff --git a/c/input/lrec_reader_mmap_xtab.c b/c/input/lrec_reader_mmap_xtab.c deleted file mode 100644 index 8f6ef5ddc..000000000 --- a/c/input/lrec_reader_mmap_xtab.c +++ /dev/null @@ -1,529 +0,0 @@ -// ================================================================ -// Note: there are multiple process methods with a lot of code duplication. -// This is intentional. Much of Miller's measured processing time is in the -// lrec-reader process methods. This is code which needs to execute on every -// byte of input and even moving a single runtime if-statement into a -// function-pointer assignment at alloc time can have noticeable effects on -// performance (5-10% in some cases). -// ================================================================ - -#include -#include -#include "cli/comment_handling.h" -#include "lib/mlr_globals.h" -#include "lib/mlrutil.h" -#include "input/file_reader_mmap.h" -#include "input/lrec_readers.h" - -typedef struct _lrec_reader_mmap_xtab_state_t { - char* ifs; - char* ips; - int ifslen; - int ipslen; - int allow_repeat_ips; - int do_auto_line_term; -} lrec_reader_mmap_xtab_state_t; - -static void lrec_reader_mmap_xtab_free(lrec_reader_t* preader); -static void lrec_reader_mmap_xtab_sof(void* pvstate, void* pvhandle); -static lrec_t* lrec_reader_mmap_xtab_process_single_ifs_single_ips(void* pvstate, void* pvhandle, context_t* pctx); -static lrec_t* lrec_reader_mmap_xtab_process_single_ifs_multi_ips(void* pvstate, void* pvhandle, context_t* pctx); -static lrec_t* lrec_reader_mmap_xtab_process_multi_ifs_single_ips(void* pvstate, void* pvhandle, context_t* pctx); -static lrec_t* lrec_reader_mmap_xtab_process_multi_ifs_multi_ips(void* pvstate, void* pvhandle, context_t* pctx); - -static lrec_t* lrec_parse_mmap_xtab_single_ifs_single_ips(file_reader_mmap_state_t* phandle, char ifs, char ips, - lrec_reader_mmap_xtab_state_t* pstate, context_t* pctx); - -static lrec_t* lrec_parse_mmap_xtab_single_ifs_multi_ips(file_reader_mmap_state_t* phandle, char ifs, - lrec_reader_mmap_xtab_state_t* pstate, context_t* pctx); - -static lrec_t* lrec_parse_mmap_xtab_multi_ifs_single_ips(file_reader_mmap_state_t* phandle, char ips, - lrec_reader_mmap_xtab_state_t* pstate); - -static lrec_t* lrec_parse_mmap_xtab_multi_ifs_multi_ips(file_reader_mmap_state_t* phandle, - lrec_reader_mmap_xtab_state_t* pstate); - -// ---------------------------------------------------------------- -lrec_reader_t* lrec_reader_mmap_xtab_alloc(char* ifs, char* ips, int allow_repeat_ips, - comment_handling_t comment_handling, char* comment_string) -{ - // lrec_reader_alloc should have shunted away from us in this case. - // (Interleaving blank-line handling, line-term autodetect, and comment-handling all in - // the byte-at-a-time logic turned out to be a mess in this file. In the stdio implementation, - // by constrast, it falls out rather easily.) - if (comment_string != NULL) { - fprintf(stderr, "%s: internal coding error detected in file %s at line %d.\n", - MLR_GLOBALS.bargv0, __FILE__, __LINE__); - exit(1); - } - - lrec_reader_t* plrec_reader = mlr_malloc_or_die(sizeof(lrec_reader_t)); - - lrec_reader_mmap_xtab_state_t* pstate = mlr_malloc_or_die(sizeof(lrec_reader_mmap_xtab_state_t)); - pstate->ifs = ifs; - pstate->ips = ips; - pstate->ifslen = strlen(pstate->ifs); - pstate->ipslen = strlen(pstate->ips); - pstate->allow_repeat_ips = allow_repeat_ips; - pstate->do_auto_line_term = FALSE; - - plrec_reader->pvstate = (void*)pstate; - plrec_reader->popen_func = file_reader_mmap_vopen; - plrec_reader->pclose_func = file_reader_mmap_vclose; - - if (streq(ifs, "auto")) { - // Auto means either lines end in "\n" or "\r\n" (LF or CRLF). In - // either case the final character is "\n". Then for autodetect we - // simply check if there's a character in the line before the '\n', and - // if that is '\r'. - pstate->do_auto_line_term = TRUE; - pstate->ifs = "\n"; - pstate->ifslen = 1; - plrec_reader->pprocess_func = (pstate->ipslen == 1) - ? lrec_reader_mmap_xtab_process_single_ifs_single_ips - : lrec_reader_mmap_xtab_process_single_ifs_multi_ips; - } else if (pstate->ifslen == 1) { - plrec_reader->pprocess_func = (pstate->ipslen == 1) - ? lrec_reader_mmap_xtab_process_single_ifs_single_ips - : lrec_reader_mmap_xtab_process_single_ifs_multi_ips; - } else { - plrec_reader->pprocess_func = (pstate->ipslen == 1) - ? lrec_reader_mmap_xtab_process_multi_ifs_single_ips - : lrec_reader_mmap_xtab_process_multi_ifs_multi_ips; - } - - plrec_reader->psof_func = lrec_reader_mmap_xtab_sof; - plrec_reader->pfree_func = lrec_reader_mmap_xtab_free; - - return plrec_reader; -} - -// ---------------------------------------------------------------- -static void lrec_reader_mmap_xtab_free(lrec_reader_t* preader) { - free(preader->pvstate); - free(preader); -} - -static void lrec_reader_mmap_xtab_sof(void* pvstate, void* pvhandle) { -} - -// ---------------------------------------------------------------- -static lrec_t* lrec_reader_mmap_xtab_process_single_ifs_single_ips(void* pvstate, void* pvhandle, context_t* pctx) { - file_reader_mmap_state_t* phandle = pvhandle; - lrec_reader_mmap_xtab_state_t* pstate = pvstate; - if (phandle->sol >= phandle->eof) - return NULL; - else - return lrec_parse_mmap_xtab_single_ifs_single_ips(phandle, pstate->ifs[0], pstate->ips[0], - pstate, pctx); -} - -static lrec_t* lrec_reader_mmap_xtab_process_single_ifs_multi_ips(void* pvstate, void* pvhandle, context_t* pctx) { - file_reader_mmap_state_t* phandle = pvhandle; - lrec_reader_mmap_xtab_state_t* pstate = pvstate; - if (phandle->sol >= phandle->eof) - return NULL; - else - return lrec_parse_mmap_xtab_single_ifs_multi_ips(phandle, pstate->ifs[0], pstate, pctx); -} - -static lrec_t* lrec_reader_mmap_xtab_process_multi_ifs_single_ips(void* pvstate, void* pvhandle, context_t* pctx) { - file_reader_mmap_state_t* phandle = pvhandle; - lrec_reader_mmap_xtab_state_t* pstate = pvstate; - if (phandle->sol >= phandle->eof) - return NULL; - else - return lrec_parse_mmap_xtab_multi_ifs_single_ips(phandle, pstate->ips[0], pstate); -} - -static lrec_t* lrec_reader_mmap_xtab_process_multi_ifs_multi_ips(void* pvstate, void* pvhandle, context_t* pctx) { - file_reader_mmap_state_t* phandle = pvhandle; - lrec_reader_mmap_xtab_state_t* pstate = pvstate; - if (phandle->sol >= phandle->eof) - return NULL; - else - return lrec_parse_mmap_xtab_multi_ifs_multi_ips(phandle, pstate); -} - -// ---------------------------------------------------------------- -static lrec_t* lrec_parse_mmap_xtab_single_ifs_single_ips(file_reader_mmap_state_t* phandle, char ifs, char ips, - lrec_reader_mmap_xtab_state_t* pstate, context_t* pctx) -{ - if (pstate->do_auto_line_term) { - // Skip over otherwise empty LF-only or CRLF-only lines. - while (phandle->sol < phandle->eof) { - if (*phandle->sol == '\n') { - context_set_autodetected_lf(pctx); - phandle->sol += 1; - } else if (*phandle->sol == '\r') { - char* q = phandle->sol + 1; - if (q < phandle->eof && *q == '\n') { - context_set_autodetected_crlf(pctx); - phandle->sol += 2; - } else { - phandle->sol += 1; - } - } else { - break; - } - } - } else { - // Skip over otherwise empty IFS-only lines - while (phandle->sol < phandle->eof && *phandle->sol == ifs) { - phandle->sol++; - } - } - - if (phandle->sol >= phandle->eof) - return NULL; - - lrec_t* prec = lrec_unbacked_alloc(); - - // Loop over fields, one per line - while (TRUE) { - if (phandle->sol >= phandle->eof) - break; - - char* line = phandle->sol; - char* key = line; - char* value = ""; - char* p; - int saw_ips_in_field = FALSE; - - // Construct one field - int saw_eol = FALSE; - for (p = line; p < phandle->eof && *p; ) { - if (*p == ifs) { - saw_ips_in_field = FALSE; - *p = 0; - - if (pstate->do_auto_line_term) { - if (p > line && p[-1] == '\r') { - p[-1] = 0; - context_set_autodetected_crlf(pctx); - } else { - context_set_autodetected_lf(pctx); - } - } - - phandle->sol = p+1; - saw_eol = TRUE; - break; - } else if (!saw_ips_in_field && *p == ips) { - saw_ips_in_field = TRUE; - key = line; - *p = 0; - - p++; - if (pstate->allow_repeat_ips) { - while (*p == ips) - p++; - } - value = p; - } else { - p++; - } - } - if (p >= phandle->eof) - phandle->sol = p+1; - - if (saw_eol) { - // Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate - // the C string so it's OK to retain a pointer to that. - lrec_put(prec, key, value, NO_FREE); - } else { - // Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null - // character to terminate the C string: if the file size is not a multiple of the OS page size it'll work - // (it's our copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at - // EOF is one byte past the page and that will segv us. - char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value); - lrec_put(prec, key, copy, FREE_ENTRY_VALUE); - } - - if (phandle->sol >= phandle->eof) - break; - - if (pstate->do_auto_line_term) { - char* p = phandle->sol; - char* q = phandle->sol + 1; - if (*p == '\n') - break; - if (q < phandle->eof && *p == '\r' && *q == '\n') - break; - } else { - if (*phandle->sol == ifs) - break; - } - } - if (prec->field_count == 0) { - lrec_free(prec); - return NULL; - } else { - return prec; - } -} - -static lrec_t* lrec_parse_mmap_xtab_single_ifs_multi_ips(file_reader_mmap_state_t* phandle, char ifs, - lrec_reader_mmap_xtab_state_t* pstate, context_t* pctx) -{ - if (pstate->do_auto_line_term) { - // Skip over otherwise empty LF-only or CRLF-only lines. - while (phandle->sol < phandle->eof) { - if (*phandle->sol == '\n') { - context_set_autodetected_lf(pctx); - phandle->sol += 1; - } else if (*phandle->sol == '\r') { - char* q = phandle->sol + 1; - if (q < phandle->eof && *q == '\n') { - context_set_autodetected_crlf(pctx); - phandle->sol += 2; - } else { - phandle->sol += 1; - } - } else { - break; - } - } - } else { - // Skip over otherwise empty IFS-only lines. - while (phandle->sol < phandle->eof && *phandle->sol == ifs) - phandle->sol++; - } - - if (phandle->sol >= phandle->eof) - return NULL; - - char* ips = pstate->ips; - int ipslen = pstate->ipslen; - - lrec_t* prec = lrec_unbacked_alloc(); - - // Loop over fields, one per line - while (TRUE) { - if (phandle->sol >= phandle->eof) - break; - - char* line = phandle->sol; - char* key = line; - char* value = ""; - char* p; - int saw_ips_in_field = FALSE; - - // Construct one field - int saw_eol = FALSE; - for (p = line; p < phandle->eof && *p; ) { - if (*p == ifs) { - saw_ips_in_field = FALSE; - *p = 0; - - if (pstate->do_auto_line_term) { - if (p > line && p[-1] == '\r') { - p[-1] = 0; - context_set_autodetected_crlf(pctx); - } else { - context_set_autodetected_lf(pctx); - } - } - - phandle->sol = p+1; - saw_eol = TRUE; - break; - } else if (!saw_ips_in_field && streqn(p, ips, ipslen)) { - saw_ips_in_field = TRUE; - key = line; - *p = 0; - - p += ipslen; - if (pstate->allow_repeat_ips) { - while (streqn(p, ips, ipslen)) - p += ipslen; - } - value = p; - } else { - p++; - } - } - if (p >= phandle->eof) - phandle->sol = p+1; - - if (saw_eol) { - // Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate - // the C string so it's OK to retain a pointer to that. - lrec_put(prec, key, value, NO_FREE); - } else { - // Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null - // character to terminate the C string: if the file size is not a multiple of the OS page size it'll work - // (it's our copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at - // EOF is one byte past the page and that will segv us. - char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value); - lrec_put(prec, key, copy, FREE_ENTRY_VALUE); - } - - if (phandle->sol >= phandle->eof || *phandle->sol == ifs) - break; - } - if (prec->field_count == 0) { - lrec_free(prec); - return NULL; - } else { - return prec; - } -} - -static lrec_t* lrec_parse_mmap_xtab_multi_ifs_single_ips(file_reader_mmap_state_t* phandle, char ips, - lrec_reader_mmap_xtab_state_t* pstate) -{ - char* ifs = pstate->ifs; - int ifslen = pstate->ifslen; - - // Skip blank lines - while (phandle->eof - phandle->sol >= ifslen && streqn(phandle->sol, ifs, ifslen)) { - phandle->sol += ifslen; - } - - if (phandle->sol >= phandle->eof) - return NULL; - - lrec_t* prec = lrec_unbacked_alloc(); - - // Loop over fields, one per line - while (TRUE) { - if (phandle->sol >= phandle->eof) - break; - - char* line = phandle->sol; - char* key = line; - char* value = ""; - char* p; - int saw_ips_in_field = FALSE; - - // Construct one field - int saw_eol = FALSE; - for (p = line; p < phandle->eof && *p; ) { - if (streqn(p, ifs, ifslen)) { - saw_ips_in_field = FALSE; - *p = 0; - phandle->sol = p + ifslen; - saw_eol = TRUE; - break; - } else if (!saw_ips_in_field && *p == ips) { - saw_ips_in_field = TRUE; - key = line; - *p = 0; - - p++; - if (pstate->allow_repeat_ips) { - while (*p == ips) - p++; - } - value = p; - } else { - p++; - } - } - if (p >= phandle->eof) - phandle->sol = p+1; - - if (saw_eol) { - // Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate - // the C string so it's OK to retain a pointer to that. - lrec_put(prec, key, value, NO_FREE); - } else { - // Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null - // character to terminate the C string: if the file size is not a multiple of the OS page size it'll work - // (it's our copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at - // EOF is one byte past the page and that will segv us. - char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value); - lrec_put(prec, key, copy, FREE_ENTRY_VALUE); - } - - if (phandle->sol >= phandle->eof || streqn(phandle->sol, ifs, ifslen)) - break; - } - if (prec->field_count == 0) { - lrec_free(prec); - return NULL; - } else { - return prec; - } -} - -static lrec_t* lrec_parse_mmap_xtab_multi_ifs_multi_ips(file_reader_mmap_state_t* phandle, - lrec_reader_mmap_xtab_state_t* pstate) -{ - char* ips = pstate->ips; - int ipslen = pstate->ipslen; - char* ifs = pstate->ifs; - int ifslen = pstate->ifslen; - - // Skip blank lines - while (phandle->eof - phandle->sol >= ifslen && streqn(phandle->sol, ifs, ifslen)) { - phandle->sol += ifslen; - } - - if (phandle->sol >= phandle->eof) - return NULL; - - lrec_t* prec = lrec_unbacked_alloc(); - - // Loop over fields, one per line - while (TRUE) { - if (phandle->sol >= phandle->eof) - break; - - char* line = phandle->sol; - char* key = line; - char* value = ""; - char* p; - int saw_ips_in_field = FALSE; - - // Construct one field - int saw_eol = FALSE; - for (p = line; p < phandle->eof && *p; ) { - if (streqn(p, ifs, ifslen)) { - saw_ips_in_field = FALSE; - *p = 0; - phandle->sol = p + ifslen; - saw_eol = TRUE; - break; - } else if (!saw_ips_in_field && streqn(p, ips, ipslen)) { - saw_ips_in_field = TRUE; - key = line; - *p = 0; - - p += ipslen; - if (pstate->allow_repeat_ips) { - while (streqn(p, ips, ipslen)) - p += ipslen; - } - value = p; - } else { - p++; - } - } - if (p >= phandle->eof) - phandle->sol = p+1; - - if (saw_eol) { - // Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate - // the C string so it's OK to retain a pointer to that. - lrec_put(prec, key, value, NO_FREE); - } else { - // Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null - // character to terminate the C string: if the file size is not a multiple of the OS page size it'll work - // (it's our copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at - // EOF is one byte past the page and that will segv us. - char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value); - lrec_put(prec, key, copy, FREE_ENTRY_VALUE); - } - - if (phandle->sol >= phandle->eof || streqn(phandle->sol, ifs, ifslen)) - break; - } - if (prec->field_count == 0) { - lrec_free(prec); - return NULL; - } else { - return prec; - } -} diff --git a/c/input/lrec_reader_stdio_csv.c b/c/input/lrec_reader_stdio_csv.c index 5a0fb9996..fba897405 100644 --- a/c/input/lrec_reader_stdio_csv.c +++ b/c/input/lrec_reader_stdio_csv.c @@ -348,10 +348,9 @@ static int lrec_reader_stdio_csv_get_fields(lrec_reader_stdio_csv_state_t* pstat if (pfr_peek_char(pfr) == (char)EOF) // char defaults to unsigned on some platforms return FALSE; - // Strip the UTF-8 BOM, if any. This is MUCH simpler for mmap, and for stdio on files. For mmap - // we can test the first 3 bytes, then skip past them or not. For stdio on files we can fread - // the first 3 bytes, then rewind the fp if they're not the UTF-8 BOM. But for stdio on stdin - // (which is the primary reason we support stdio in Miller), we cannot rewind: stdin is not + // Strip the UTF-8 BOM, if any. This is MUCH simpler for mmap, and for stdio on files. For mmap we can test the + // first 3 bytes, then skip past them or not. For stdio on files we can fread the first 3 bytes, then rewind the fp + // if they're not the UTF-8 BOM. But for stdio on stdin, we cannot rewind: stdin is not // rewindable. if (is_header) { pfr_buffer_by(pfr, UTF8_BOM_LENGTH); diff --git a/c/input/lrec_reader_stdio_json.c b/c/input/lrec_reader_stdio_json.c index 427cd54e6..9b21e987a 100644 --- a/c/input/lrec_reader_stdio_json.c +++ b/c/input/lrec_reader_stdio_json.c @@ -8,10 +8,11 @@ // ================================================================ // ================================================================ -// This has at present a lot of code duplication with lrec_reader_mmap_json. -// This is because we read the entire input file into memory and get a pointer -// to it, which is a lot like mmap. At some future point we may implement a -// streaming JSON parser at which point the two files would diverge. +// Note: this is a non-streaming JSON reader which reads the entire input file +// into memory and gets a pointer to it. At some future point we may implement +// a streaming JSON parser at which point this would change dramatically. +// +// See also https://github.com/johnkerl/miller/issues/99 // ================================================================ #include diff --git a/c/input/lrec_readers.c b/c/input/lrec_readers.c index 43cd4002f..d58682e94 100644 --- a/c/input/lrec_readers.c +++ b/c/input/lrec_readers.c @@ -9,50 +9,24 @@ lrec_reader_t* lrec_reader_alloc(cli_reader_opts_t* popts) { generator_opts_t* pgopts = &popts->generator_opts; return lrec_reader_gen_alloc(pgopts->field_name, pgopts->start, pgopts->stop, pgopts->step); } else if (streq(popts->ifile_fmt, "dkvp")) { - if (popts->use_mmap_for_read) - return lrec_reader_mmap_dkvp_alloc(popts->irs, popts->ifs, popts->ips, popts->allow_repeat_ifs, - popts->comment_handling, popts->comment_string); - else - return lrec_reader_stdio_dkvp_alloc(popts->irs, popts->ifs, popts->ips, popts->allow_repeat_ifs, - popts->comment_handling, popts->comment_string); + return lrec_reader_stdio_dkvp_alloc(popts->irs, popts->ifs, popts->ips, popts->allow_repeat_ifs, + popts->comment_handling, popts->comment_string); } else if (streq(popts->ifile_fmt, "csv")) { - if (popts->use_mmap_for_read) - return lrec_reader_mmap_csv_alloc(popts->irs, popts->ifs, popts->use_implicit_csv_header, - popts->allow_ragged_csv_input, popts->comment_handling, popts->comment_string); - else - return lrec_reader_stdio_csv_alloc(popts->irs, popts->ifs, popts->use_implicit_csv_header, - popts->allow_ragged_csv_input, popts->comment_handling, popts->comment_string); + return lrec_reader_stdio_csv_alloc(popts->irs, popts->ifs, popts->use_implicit_csv_header, + popts->allow_ragged_csv_input, popts->comment_handling, popts->comment_string); } else if (streq(popts->ifile_fmt, "csvlite")) { - if (popts->use_mmap_for_read) - return lrec_reader_mmap_csvlite_alloc(popts->irs, popts->ifs, popts->allow_repeat_ifs, - popts->use_implicit_csv_header, popts->allow_ragged_csv_input, popts->comment_handling, - popts->comment_string); - else - return lrec_reader_stdio_csvlite_alloc(popts->irs, popts->ifs, popts->allow_repeat_ifs, - popts->use_implicit_csv_header, popts->allow_ragged_csv_input, popts->comment_handling, - popts->comment_string); + return lrec_reader_stdio_csvlite_alloc(popts->irs, popts->ifs, popts->allow_repeat_ifs, + popts->use_implicit_csv_header, popts->allow_ragged_csv_input, popts->comment_handling, + popts->comment_string); } else if (streq(popts->ifile_fmt, "nidx")) { - if (popts->use_mmap_for_read) - return lrec_reader_mmap_nidx_alloc(popts->irs, popts->ifs, popts->allow_repeat_ifs, - popts->comment_handling, popts->comment_string); - else - return lrec_reader_stdio_nidx_alloc(popts->irs, popts->ifs, popts->allow_repeat_ifs, - popts->comment_handling, popts->comment_string); + return lrec_reader_stdio_nidx_alloc(popts->irs, popts->ifs, popts->allow_repeat_ifs, + popts->comment_handling, popts->comment_string); } else if (streq(popts->ifile_fmt, "xtab")) { - // Use stdio-xtab for comment handling; not supported in the mmap-xtab reader. - if (popts->use_mmap_for_read && popts->comment_string == NULL) - return lrec_reader_mmap_xtab_alloc(popts->ifs, popts->ips, popts->allow_repeat_ips, - popts->comment_handling, popts->comment_string); - else - return lrec_reader_stdio_xtab_alloc(popts->ifs, popts->ips, popts->allow_repeat_ips, - popts->comment_handling, popts->comment_string); + return lrec_reader_stdio_xtab_alloc(popts->ifs, popts->ips, popts->allow_repeat_ips, + popts->comment_handling, popts->comment_string); } else if (streq(popts->ifile_fmt, "json")) { - if (popts->use_mmap_for_read) - return lrec_reader_mmap_json_alloc(popts->input_json_flatten_separator, - popts->json_array_ingest, popts->irs, popts->comment_handling, popts->comment_string); - else - return lrec_reader_stdio_json_alloc(popts->input_json_flatten_separator, - popts->json_array_ingest, popts->irs, popts->comment_handling, popts->comment_string); + return lrec_reader_stdio_json_alloc(popts->input_json_flatten_separator, + popts->json_array_ingest, popts->irs, popts->comment_handling, popts->comment_string); } else { return NULL; } diff --git a/c/input/lrec_readers.h b/c/input/lrec_readers.h index 480772e25..935f1ff5d 100644 --- a/c/input/lrec_readers.h +++ b/c/input/lrec_readers.h @@ -24,19 +24,6 @@ lrec_reader_t* lrec_reader_stdio_xtab_alloc(char* ifs, char* ips, int allow_repe lrec_reader_t* lrec_reader_stdio_json_alloc(char* input_json_flatten_separator, json_array_ingest_t json_array_ingest, char* line_term, comment_handling_t comment_handling, char* comment_string); -lrec_reader_t* lrec_reader_mmap_csv_alloc(char* irs, char* ifs, int use_implicit_csv_header, - int allow_ragged_csv_input, comment_handling_t comment_handling, char* comment_string); -lrec_reader_t* lrec_reader_mmap_csvlite_alloc(char* irs, char* ifs, int allow_repeat_ifs, int use_implicit_csv_header, - int allow_ragged_csv_input, comment_handling_t comment_handling, char* comment_string); -lrec_reader_t* lrec_reader_mmap_dkvp_alloc(char* irs, char* ifs, char* ips, int allow_repeat_ifs, - comment_handling_t comment_handling, char* comment_string); -lrec_reader_t* lrec_reader_mmap_nidx_alloc(char* irs, char* ifs, int allow_repeat_ifs, - comment_handling_t comment_handling, char* comment_string); -lrec_reader_t* lrec_reader_mmap_xtab_alloc(char* ifs, char* ips, int allow_repeat_ips, - comment_handling_t comment_handling, char* comment_string); -lrec_reader_t* lrec_reader_mmap_json_alloc(char* input_json_flatten_separator, json_array_ingest_t json_array_ingest, char* line_term, - comment_handling_t comment_handling, char* comment_string); - lrec_reader_t* lrec_reader_in_memory_alloc(sllv_t* precords); // ---------------------------------------------------------------- diff --git a/c/input/mlr_json_adapter.c b/c/input/mlr_json_adapter.c index 695615594..1439d7d32 100644 --- a/c/input/mlr_json_adapter.c +++ b/c/input/mlr_json_adapter.c @@ -273,7 +273,7 @@ static int populate_from_nested_array(lrec_t* prec, json_value_t* pjson_array, c } // ---------------------------------------------------------------- -// * The buffer is an entire JSON blob, e.g. contents from stdio read or mmap; peof-psof is the file size so peof is one +// * The buffer is an entire JSON blob, e.g. contents from stdio read; peof-psof is the file size so peof is one // byte *after* the last valid file byte. // * The buffer is not assumed to be null-terminated. // * Any lines beginning with comment_string are modified by poking space characters up to line_term. diff --git a/c/input/mlr_json_adapter.h b/c/input/mlr_json_adapter.h index 8253be3e1..60ebdd511 100644 --- a/c/input/mlr_json_adapter.h +++ b/c/input/mlr_json_adapter.h @@ -16,7 +16,7 @@ int reference_json_objects_as_lrecs(sllv_t* precords, json_value_t* ptop_level_json, char* flatten_sep, json_array_ingest_t json_array_ingest); -// * The buffer is an entire JSON blob, e.g. contents from stdio read or mmap; peof-psof is the file size so peof is one +// * The buffer is an entire JSON blob, e.g. contents from stdio read; peof-psof is the file size so peof is one // byte *after* the last valid file byte. // * The buffer is not assumed to be null-terminated. // * Any lines beginning with comment_string are modified by poking space characters up to line_term. diff --git a/c/input/mmap_byte_reader.c b/c/input/mmap_byte_reader.c deleted file mode 100644 index 0946ccc43..000000000 --- a/c/input/mmap_byte_reader.c +++ /dev/null @@ -1,112 +0,0 @@ -#include -#include -#include -#include -#include "lib/mlr_arch.h" -#include "input/byte_readers.h" -#include "lib/mlr_globals.h" -#include "lib/mlrutil.h" - -#if MLR_ARCH_MMAP_ENABLED -static char empty_buf[1] = { 0 }; -#endif - -typedef struct _mmap_byte_reader_state_t { - char* filename; - int fd; - char* sof; - char* p; - char* eof; -} mmap_byte_reader_state_t; - -static int mmap_byte_reader_open_func(struct _byte_reader_t* pbr, char* prepipe, char* filename); -static int mmap_byte_reader_read_func(struct _byte_reader_t* pbr); -static void mmap_byte_reader_close_func(struct _byte_reader_t* pbr, char* prepipe); - -// ---------------------------------------------------------------- -byte_reader_t* mmap_byte_reader_alloc() { - byte_reader_t* pbr = mlr_malloc_or_die(sizeof(byte_reader_t)); - - pbr->pvstate = NULL; - pbr->popen_func = mmap_byte_reader_open_func; - pbr->pread_func = mmap_byte_reader_read_func; - pbr->pclose_func = mmap_byte_reader_close_func; - - return pbr; -} - -void mmap_byte_reader_free(byte_reader_t* pbr) { - mmap_byte_reader_state_t* pstate = pbr->pvstate; - if (pstate != NULL) { - free(pstate->filename); // null-ok semantics - } - free(pbr); -} - -// ---------------------------------------------------------------- -static int mmap_byte_reader_open_func(struct _byte_reader_t* pbr, char* prepipe, char* filename) { -#if MLR_ARCH_MMAP_ENABLED - // popen is a stdio construct, not an mmap construct, and it can't be supported here. - if (prepipe != NULL) { - fprintf(stderr, "%s: coding error detected in file %s at line %d.\n", - MLR_GLOBALS.bargv0, __FILE__, __LINE__); - exit(1); - } - - mmap_byte_reader_state_t* pstate = mlr_malloc_or_die(sizeof(mmap_byte_reader_state_t)); - pstate->filename = mlr_strdup_or_die(filename); - pstate->fd = open(filename, O_RDONLY); - if (pstate->fd < 0) { - perror("open"); - fprintf(stderr, "%s: Couldn't open \"%s\" for read.\n", MLR_GLOBALS.bargv0, filename); - exit(1); - } - - struct stat stat; - if (fstat(pstate->fd, &stat) < 0) { - perror("fstat"); - fprintf(stderr, "%s: could not fstat \"%s\"\n", MLR_GLOBALS.bargv0, filename); - exit(1); - } - if (stat.st_size == 0) { - // mmap doesn't allow us to map zero-length files but zero-length files do exist. - pstate->sof = &empty_buf[0]; - } else { - pstate->sof = mmap(NULL, (size_t)stat.st_size, PROT_READ|PROT_WRITE, MAP_FILE|MAP_PRIVATE, - pstate->fd, (off_t)0); - if (pstate->sof == MAP_FAILED) { - perror("mmap"); - fprintf(stderr, "%s: could not mmap \"%s\"\n", MLR_GLOBALS.bargv0, filename); - exit(1); - } - } - pstate->eof = pstate->sof + stat.st_size; - pstate->p = pstate->sof; - pbr->pvstate = pstate; - return TRUE; -#else - fprintf(stderr, "%s: mmap is unsupported on this architecture.\n", MLR_GLOBALS.bargv0); - exit(1); - return TRUE; -#endif -} - -static int mmap_byte_reader_read_func(struct _byte_reader_t* pbr) { - mmap_byte_reader_state_t* pstate = pbr->pvstate; - if (pstate->p >= pstate->eof) { - return EOF; - } else { - int c = *pstate->p; - pstate->p++; - return c; - } -} - -static void mmap_byte_reader_close_func(struct _byte_reader_t* pbr, char* prepipe) { - mmap_byte_reader_state_t* pstate = pbr->pvstate; - if (close(pstate->fd) < 0) { - perror("close"); - fprintf(stderr, "%s: close error on file \"%s\".\n", MLR_GLOBALS.bargv0, pstate->filename); - exit(1); - } -} diff --git a/c/lib/mlr_arch.h b/c/lib/mlr_arch.h index 28d79bbfb..18ebcd966 100644 --- a/c/lib/mlr_arch.h +++ b/c/lib/mlr_arch.h @@ -23,14 +23,6 @@ #define mlr_arch_getc(stream) getc_unlocked(stream) #endif -// ---------------------------------------------------------------- -#ifdef MLR_ON_MSYS2 -#define MLR_ARCH_MMAP_ENABLED 0 -#else -#define MLR_ARCH_MMAP_ENABLED 1 -#include -#endif - // ---------------------------------------------------------------- int mlr_arch_setenv(const char *name, const char *value); int mlr_arch_unsetenv(const char *name); diff --git a/c/mapping/mapper_join.c b/c/mapping/mapper_join.c index 416f21cc2..c237443fa 100644 --- a/c/mapping/mapper_join.c +++ b/c/mapping/mapper_join.c @@ -107,8 +107,6 @@ static void mapper_join_usage(FILE* o, char* argv0, char* verb) { fprintf(o, " --ips {pair-separator character}\n"); fprintf(o, " --repifs\n"); fprintf(o, " --repips\n"); - fprintf(o, " --mmap\n"); - fprintf(o, " --no-mmap\n"); fprintf(o, "Please use \"%s --usage-separator-options\" for information on specifying separators.\n", argv0); fprintf(o, "Please see http://johnkerl.org/miller/doc/reference.html for more information\n"); @@ -237,10 +235,6 @@ static mapper_t* mapper_join_parse_cli(int* pargi, int argc, char** argv, cli_merge_reader_opts(&popts->reader_opts, pmain_reader_opts); - // popen is a stdio construct, not an mmap construct, and it can't be supported here. - if (popts->prepipe != NULL) - popts->reader_opts.use_mmap_for_read = FALSE; - if (popts->left_file_name == NULL) { fprintf(stderr, "%s %s: need left file name\n", MLR_GLOBALS.bargv0, verb); mapper_join_usage(stderr, argv[0], verb); diff --git a/c/reg_test/expected/out b/c/reg_test/expected/out index 882a81f37..f480581ec 100644 --- a/c/reg_test/expected/out +++ b/c/reg_test/expected/out @@ -47217,71 +47217,6 @@ a=1,b=2,c=3 a=4,b=5,c=6 -================================================================ -MMAP AT PAGE BOUNDARIES - -mlr --dkvp tail -n 4 ./reg_test/input/page-aligned-final-ifs.dkvp -x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc -x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc -x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc -x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z= - -mlr --dkvp tail -n 4 ./reg_test/input/page-aligned-final-irs.dkvp -x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc -x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc -x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc -x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc - -mlr --dkvp tail -n 4 ./reg_test/input/page-aligned-final-no-ifs.dkvp -x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc -x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc -x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc -x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,3=z - -mlr --dkvp tail -n 4 ./reg_test/input/page-aligned-no-final-irs.dkvp -x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc -x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc -x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc -x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=ccccccccccccccccccccccccccccccccccccccccccccccccc - -mlr --nidx tail -n 4 ./reg_test/input/page-aligned-no-final-irs.nidx -11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,3333333333333333333333333333333333333333333 -11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,3333333333333333333333333333333333333333333 -11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,3333333333333333333333333333333333333333333 -11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,33333333333333333333333333333333333333333333 - -mlr --csvlite tail -n 4 ./reg_test/input/page-aligned-no-final-irs.csvl -aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,ccccccccccccccccccccccccccccccccccccccccccc -11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,3333333333333333333333333333333333333333333 -11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,3333333333333333333333333333333333333333333 -11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,3333333333333333333333333333333333333333333 -11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,33333333333333333333333333333333333333333333 - -mlr --csv --rs lf tail -n 4 ./reg_test/input/page-aligned-no-final-irs.csvl -aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,ccccccccccccccccccccccccccccccccccccccccccc -11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,3333333333333333333333333333333333333333333 -11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,3333333333333333333333333333333333333333333 -11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,3333333333333333333333333333333333333333333 -11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,33333333333333333333333333333333333333333333 - -mlr --xtab tail -n 4 ./reg_test/input/page-aligned-no-final-eol.xtab -aaaaaaaaaaaaaaaaaaaa 111111111111111111111 -bbbbbbbbbbbbbbbbbbbb 22222222222222222222 -cccccccccccccccccccc 33333333333333333333 - -aaaaaaaaaaaaaaaaaaaa 111111111111111111111 -bbbbbbbbbbbbbbbbbbbb 22222222222222222222 -cccccccccccccccccccc 33333333333333333333 - -aaaaaaaaaaaaaaaaaaaa 111111111111111111111 -bbbbbbbbbbbbbbbbbbbb 22222222222222222222 -cccccccccccccccccccc 33333333333333333333 - -aaaaaaaaaaaaaaaaaaaa 111111111111111111111 -bbbbbbbbbbbbbbbbbbbb 22222222222222222222 -cccccccccccccccccccc 3333333333333333333333 - - ================================================================ INT64 I/O @@ -47675,54 +47610,54 @@ x,"y""yy",z ================================================================ RFC-CSV -mlr --mmap --csv cat ./reg_test/input/rfc-csv/simple.csv-crlf +mlr --csv cat ./reg_test/input/rfc-csv/simple.csv-crlf a,b,c 1,x,3 4,5,6 x,"y""yy",z -mlr --mmap --csv cat ./reg_test/input/rfc-csv/simple-truncated.csv +mlr --csv cat ./reg_test/input/rfc-csv/simple-truncated.csv a,b,c 1,x,3 4,5,6 -mlr --mmap --csv cat ./reg_test/input/rfc-csv/narrow.csv +mlr --csv cat ./reg_test/input/rfc-csv/narrow.csv a 1 2 3 4 -mlr --mmap --csv cat ./reg_test/input/rfc-csv/narrow-truncated.csv +mlr --csv cat ./reg_test/input/rfc-csv/narrow-truncated.csv a 1 2 3 4 -mlr --mmap --csv cat ./reg_test/input/rfc-csv/quoted-comma.csv +mlr --csv cat ./reg_test/input/rfc-csv/quoted-comma.csv a,b,c 1,"x,3",y 4,5,6 -mlr --mmap --csv cat ./reg_test/input/rfc-csv/quoted-comma-truncated.csv +mlr --csv cat ./reg_test/input/rfc-csv/quoted-comma-truncated.csv a,b,c 1,"x,3",y 4,5,6 -mlr --mmap --csv cat ./reg_test/input/rfc-csv/quoted-crlf.csv +mlr --csv cat ./reg_test/input/rfc-csv/quoted-crlf.csv a,b,c 1,"x 3",y 4,5,6 -mlr --mmap --csv cat ./reg_test/input/rfc-csv/quoted-crlf-truncated.csv +mlr --csv cat ./reg_test/input/rfc-csv/quoted-crlf-truncated.csv a,b,c 1,"x 3",y 4,5,6 -mlr --mmap --csv cat ./reg_test/input/rfc-csv/simple-truncated.csv ./reg_test/input/rfc-csv/simple.csv-crlf +mlr --csv cat ./reg_test/input/rfc-csv/simple-truncated.csv ./reg_test/input/rfc-csv/simple.csv-crlf a,b,c 1,x,3 4,5,6 @@ -47730,7 +47665,7 @@ a,b,c 4,5,6 x,"y""yy",z -mlr --mmap --csv --ifs semicolon --ofs pipe --irs lf --ors lflf cut -x -f b ./reg_test/input/rfc-csv/modify-defaults.csv +mlr --csv --ifs semicolon --ofs pipe --irs lf --ors lflf cut -x -f b ./reg_test/input/rfc-csv/modify-defaults.csv a|c 1|3 @@ -47738,22 +47673,13 @@ a|c 4|6 -mlr --mmap --csv --rs lf --quote-original cut -o -f c,b,a ./reg_test/input/quote-original.csv +mlr --csv --rs lf --quote-original cut -o -f c,b,a ./reg_test/input/quote-original.csv c,b,a 3,2,1 6,"5",4 "9",8,"7" -mlr --mmap --icsv --oxtab cat ./reg_test/input/comma-at-eof.csv -a 1 -b 2 -c 3 - -a 4 -b 5 -c - -mlr --no-mmap --icsv --oxtab cat ./reg_test/input/comma-at-eof.csv +mlr --icsv --oxtab cat ./reg_test/input/comma-at-eof.csv a 1 b 2 c 3 @@ -47818,7 +47744,7 @@ c i ================================================================ RAGGED NON-RFC CSV -mlr --mmap --icsv --oxtab --ragged cat ./reg_test/input/ragged.csv +mlr --icsv --oxtab --ragged cat ./reg_test/input/ragged.csv a 1 b 2 c 3 @@ -47832,35 +47758,7 @@ b 7 c 8 4 9 -mlr --no-mmap --icsv --oxtab --ragged cat ./reg_test/input/ragged.csv -a 1 -b 2 -c 3 - -a 4 -b 5 -c - -a 6 -b 7 -c 8 -4 9 - -mlr --mmap --icsvlite --oxtab --ragged cat ./reg_test/input/ragged.csv -a 1 -b 2 -c 3 - -a 4 -b 5 -c - -a 6 -b 7 -c 8 -4 9 - -mlr --no-mmap --icsvlite --oxtab --ragged cat ./reg_test/input/ragged.csv +mlr --icsvlite --oxtab --ragged cat ./reg_test/input/ragged.csv a 1 b 2 c 3 @@ -48177,9 +48075,9 @@ i 4 ================================================================ MULTI-CHARACTER SEPARATORS FOR XTAB -mlr --mmap --xtab --ifs crlf --ofs Z cut -x -f b ./reg_test/input/truncated.xtab-crlf +mlr --xtab --ifs crlf --ofs Z cut -x -f b ./reg_test/input/truncated.xtab-crlf a 1Zc 3ZZd 4Ze 5Z -mlr --mmap --xtab --ips . --ops @ cut -x -f b ./reg_test/input/dots.xtab +mlr --xtab --ips . --ops @ cut -x -f b ./reg_test/input/dots.xtab a@1 c@345 @@ -48195,12 +48093,7 @@ sum@@@@3 ================================================================ EMBEDDED IPS FOR XTAB -mlr --xtab --mmap cat ./reg_test/input/embedded-ips.xtab -a 1 -b 2 -c 3 4 5 - -mlr --xtab --no-mmap cat ./reg_test/input/embedded-ips.xtab +mlr --xtab cat ./reg_test/input/embedded-ips.xtab a 1 b 2 c 3 4 5 @@ -48374,7 +48267,7 @@ mlr --opprint --barred --right cat ./reg_test/input/abixy-het ================================================================ MULTI-CHARACTER IXS SPECIFIERS -mlr --oxtab --idkvp --mmap --irs lf --ifs , --ips = cut -o -f x,a,i ./reg_test/input/multi-sep.dkvp-crlf +mlr --oxtab --idkvp --irs lf --ifs , --ips = cut -o -f x,a,i ./reg_test/input/multi-sep.dkvp-crlf x :0.641593543645736508/ a :wye/ i :0/ @@ -48395,7 +48288,7 @@ x :0.676537984365847889/ a :zee/ i :4/ -mlr --oxtab --idkvp --mmap --irs lf --ifs /, --ips =: cut -o -f x,a,i ./reg_test/input/multi-sep.dkvp-crlf +mlr --oxtab --idkvp --irs lf --ifs /, --ips =: cut -o -f x,a,i ./reg_test/input/multi-sep.dkvp-crlf x 0.641593543645736508 a wye i 0 @@ -49746,7 +49639,7 @@ a=hat,b=wye,i=9,x=0.03144187646093577,y=0.7495507603507059 a=pan,b=wye,i=10,x=0.5026260055412137,y=0.9526183602969864 ----------------------------------------------------------------- mmap nidx +---------------------------------------------------------------- nidx mlr --irs auto --ors lf --nidx --fs comma cat ./reg_test/input/line-term-lf.dkvp a=pan,b=pan,i=1,x=0.3467901443380824,y=0.7268028627434533 a=eks,b=pan,i=2,x=0.7586799647899636,y=0.5221511083334797 @@ -49796,7 +49689,7 @@ a=hat,b=wye,i=9,x=0.03144187646093577,y=0.7495507603507059 a=pan,b=wye,i=10,x=0.5026260055412137,y=0.9526183602969864 ----------------------------------------------------------------- mmap csvlite +---------------------------------------------------------------- csvlite mlr --irs auto --ors lf --csvlite cat ./reg_test/input/line-term-lf.csv a,b,i,x,y pan,pan,1,0.3467901443380824,0.7268028627434533 @@ -49850,7 +49743,7 @@ hat,wye,9,0.03144187646093577,0.7495507603507059 pan,wye,10,0.5026260055412137,0.9526183602969864 ----------------------------------------------------------------- mmap pprint +---------------------------------------------------------------- pprint mlr --irs auto --ors lf --pprint cat ./reg_test/input/line-term-lf.csv a,b,i,x,y pan,pan,1,0.3467901443380824,0.7268028627434533 @@ -49904,7 +49797,7 @@ hat,wye,9,0.03144187646093577,0.7495507603507059 pan,wye,10,0.5026260055412137,0.9526183602969864 ----------------------------------------------------------------- mmap xtab +---------------------------------------------------------------- xtab mlr --ifs auto --xtab cat ./reg_test/input/line-term-lf.xtab a pan b pan @@ -50150,7 +50043,7 @@ x 0.5026260055412137 y 0.9526183602969864 ----------------------------------------------------------------- mmap xtab +---------------------------------------------------------------- xtab mlr --ifs auto --xtab cat ./reg_test/input/line-term-lf.xtab a pan b pan @@ -50396,7 +50289,7 @@ x 0.5026260055412137 y 0.9526183602969864 ----------------------------------------------------------------- mmap csv +---------------------------------------------------------------- csv mlr --irs auto --ors lf --csv cat ./reg_test/input/line-term-lf.csv a,b,i,x,y pan,pan,1,0.3467901443380824,0.7268028627434533 @@ -50450,7 +50343,7 @@ hat,wye,9,0.03144187646093577,0.7495507603507059 pan,wye,10,0.5026260055412137,0.9526183602969864 ----------------------------------------------------------------- mmap json nowrap nostack +---------------------------------------------------------------- json nowrap nostack mlr --irs auto --ors lf --json cat ./reg_test/input/line-term-lf.json { "a": "pan", "b": "pan", "i": 1, "x": 0.3467901443380824, "y": 0.7268028627434533 } { "a": "eks", "b": "pan", "i": 2, "x": 0.7586799647899636, "y": 0.5221511083334797 } @@ -50500,7 +50393,7 @@ mlr --json cat ./reg_test/input/line-term-crlf.json { "a": "pan", "b": "wye", "i": 10, "x": 0.5026260055412137, "y": 0.9526183602969864 } ----------------------------------------------------------------- mmap json yeswrap nostack +---------------------------------------------------------------- json yeswrap nostack mlr --irs auto --ors lf --jlistwrap --json cat ./reg_test/input/line-term-lf-wrap.json [ { "a": "pan", "b": "pan", "i": 1, "x": 0.3467901443380824, "y": 0.7268028627434533 } @@ -50558,7 +50451,7 @@ mlr --jlistwrap --json cat ./reg_test/input/line-term-crlf-wrap.json ] ----------------------------------------------------------------- mmap json nowrap yesstack +---------------------------------------------------------------- json nowrap yesstack mlr --irs auto --json --jvstack cat ./reg_test/input/line-term-lf.json { "a": "pan", @@ -50848,7 +50741,7 @@ mlr --json --jvstack cat ./reg_test/input/line-term-crlf.json } ----------------------------------------------------------------- mmap json yeswrap yesstack +---------------------------------------------------------------- json yeswrap yesstack mlr --irs auto --ors lf --jlistwrap --json --jvstack cat ./reg_test/input/line-term-lf-wrap.json [ { @@ -51146,7 +51039,7 @@ mlr --jlistwrap --json --jvstack cat ./reg_test/input/line-term-crlf-wrap.json ] ----------------------------------------------------------------- mmap json nowrap nostack +---------------------------------------------------------------- json nowrap nostack mlr --irs auto --ors lf --json cat ./reg_test/input/line-term-lf.json { "a": "pan", "b": "pan", "i": 1, "x": 0.3467901443380824, "y": 0.7268028627434533 } { "a": "eks", "b": "pan", "i": 2, "x": 0.7586799647899636, "y": 0.5221511083334797 } @@ -51196,7 +51089,7 @@ mlr --json cat ./reg_test/input/line-term-crlf.json { "a": "pan", "b": "wye", "i": 10, "x": 0.5026260055412137, "y": 0.9526183602969864 } ----------------------------------------------------------------- mmap json yeswrap nostack +---------------------------------------------------------------- json yeswrap nostack mlr --irs auto --ors lf --jlistwrap --json cat ./reg_test/input/line-term-lf-wrap.json [ { "a": "pan", "b": "pan", "i": 1, "x": 0.3467901443380824, "y": 0.7268028627434533 } @@ -51254,7 +51147,7 @@ mlr --jlistwrap --json cat ./reg_test/input/line-term-crlf-wrap.json ] ----------------------------------------------------------------- mmap json nowrap yesstack +---------------------------------------------------------------- json nowrap yesstack mlr --irs auto --ors lf --json --jvstack cat ./reg_test/input/line-term-lf.json { "a": "pan", @@ -51544,7 +51437,7 @@ mlr --json --jvstack cat ./reg_test/input/line-term-crlf.json } ----------------------------------------------------------------- mmap json yeswrap yesstack +---------------------------------------------------------------- json yeswrap yesstack mlr --irs auto --ors lf --jlistwrap --json --jvstack cat ./reg_test/input/line-term-lf-wrap.json [ { diff --git a/c/reg_test/run b/c/reg_test/run index 26a16008c..1102cf625 100755 --- a/c/reg_test/run +++ b/c/reg_test/run @@ -44,12 +44,6 @@ if [ "$1" = "--valgrind" ]; then # ../tools/clean-valg can be used to filter the output. path_to_mlr="valgrind --leak-check=full ${path_to_mlr}g" path_to_mlr_for_auxents="$path_to_mlr" -elif [ "$1" = "--no-mmap" ]; then - path_to_mlr_for_auxents="${path_to_mlr}" - path_to_mlr="${path_to_mlr} --no-mmap" -elif [ "$1" = "--valgrind-no-mmap" ]; then - path_to_mlr="valgrind --leak-check=full ${path_to_mlr}g --no-mmap" - path_to_mlr_for_auxents="valgrind --leak-check=full ${path_to_mlr}g" fi echo Using mlr executable $path_to_mlr @@ -5755,18 +5749,6 @@ mention pass comments1-crlf.csv run_mlr --pass-comments --icsv --odkvp cat < $outdir/comments1-crlf.csv run_mlr --pass-comments --icsv --odkvp cat $outdir/comments1-crlf.csv -# ---------------------------------------------------------------- -announce MMAP AT PAGE BOUNDARIES - -run_mlr --dkvp tail -n 4 $indir/page-aligned-final-ifs.dkvp -run_mlr --dkvp tail -n 4 $indir/page-aligned-final-irs.dkvp -run_mlr --dkvp tail -n 4 $indir/page-aligned-final-no-ifs.dkvp -run_mlr --dkvp tail -n 4 $indir/page-aligned-no-final-irs.dkvp -run_mlr --nidx tail -n 4 $indir/page-aligned-no-final-irs.nidx -run_mlr --csvlite tail -n 4 $indir/page-aligned-no-final-irs.csvl -run_mlr --csv --rs lf tail -n 4 $indir/page-aligned-no-final-irs.csvl -run_mlr --xtab tail -n 4 $indir/page-aligned-no-final-eol.xtab - # ---------------------------------------------------------------- announce INT64 I/O @@ -5797,20 +5779,19 @@ run_mlr --csv cat < $indir/rfc-csv/simple.csv-crlf # ---------------------------------------------------------------- announce RFC-CSV -run_mlr --mmap --csv cat $indir/rfc-csv/simple.csv-crlf -run_mlr --mmap --csv cat $indir/rfc-csv/simple-truncated.csv -run_mlr --mmap --csv cat $indir/rfc-csv/narrow.csv -run_mlr --mmap --csv cat $indir/rfc-csv/narrow-truncated.csv -run_mlr --mmap --csv cat $indir/rfc-csv/quoted-comma.csv -run_mlr --mmap --csv cat $indir/rfc-csv/quoted-comma-truncated.csv -run_mlr --mmap --csv cat $indir/rfc-csv/quoted-crlf.csv -run_mlr --mmap --csv cat $indir/rfc-csv/quoted-crlf-truncated.csv -run_mlr --mmap --csv cat $indir/rfc-csv/simple-truncated.csv $indir/rfc-csv/simple.csv-crlf -run_mlr --mmap --csv --ifs semicolon --ofs pipe --irs lf --ors lflf cut -x -f b $indir/rfc-csv/modify-defaults.csv -run_mlr --mmap --csv --rs lf --quote-original cut -o -f c,b,a $indir/quote-original.csv +run_mlr --csv cat $indir/rfc-csv/simple.csv-crlf +run_mlr --csv cat $indir/rfc-csv/simple-truncated.csv +run_mlr --csv cat $indir/rfc-csv/narrow.csv +run_mlr --csv cat $indir/rfc-csv/narrow-truncated.csv +run_mlr --csv cat $indir/rfc-csv/quoted-comma.csv +run_mlr --csv cat $indir/rfc-csv/quoted-comma-truncated.csv +run_mlr --csv cat $indir/rfc-csv/quoted-crlf.csv +run_mlr --csv cat $indir/rfc-csv/quoted-crlf-truncated.csv +run_mlr --csv cat $indir/rfc-csv/simple-truncated.csv $indir/rfc-csv/simple.csv-crlf +run_mlr --csv --ifs semicolon --ofs pipe --irs lf --ors lflf cut -x -f b $indir/rfc-csv/modify-defaults.csv +run_mlr --csv --rs lf --quote-original cut -o -f c,b,a $indir/quote-original.csv -run_mlr --mmap --icsv --oxtab cat $indir/comma-at-eof.csv -run_mlr --no-mmap --icsv --oxtab cat $indir/comma-at-eof.csv +run_mlr --icsv --oxtab cat $indir/comma-at-eof.csv run_mlr --csv --quote-all cat $indir/rfc-csv/simple.csv-crlf run_mlr --csv --quote-original cat $indir/rfc-csv/simple.csv-crlf @@ -5822,10 +5803,8 @@ run_mlr --iusv --oxtab cat $indir/example.usv # ---------------------------------------------------------------- announce RAGGED NON-RFC CSV -run_mlr --mmap --icsv --oxtab --ragged cat $indir/ragged.csv -run_mlr --no-mmap --icsv --oxtab --ragged cat $indir/ragged.csv -run_mlr --mmap --icsvlite --oxtab --ragged cat $indir/ragged.csv -run_mlr --no-mmap --icsvlite --oxtab --ragged cat $indir/ragged.csv +run_mlr --icsv --oxtab --ragged cat $indir/ragged.csv +run_mlr --icsvlite --oxtab --ragged cat $indir/ragged.csv # ---------------------------------------------------------------- announce MARKDOWN OUTPUT @@ -5866,15 +5845,14 @@ run_mlr --oxtab --icsvlite --irs crlf --ifs /, cut -o -f x,a,i $indir/multi-s # ---------------------------------------------------------------- announce MULTI-CHARACTER SEPARATORS FOR XTAB -run_mlr --mmap --xtab --ifs crlf --ofs Z cut -x -f b $indir/truncated.xtab-crlf -run_mlr --mmap --xtab --ips . --ops @ cut -x -f b $indir/dots.xtab +run_mlr --xtab --ifs crlf --ofs Z cut -x -f b $indir/truncated.xtab-crlf +run_mlr --xtab --ips . --ops @ cut -x -f b $indir/dots.xtab run_mlr --xtab --ips ": " --ops '@@@@' put '$sum=int($a+$b)' $indir/multi-ips.dkvp # ---------------------------------------------------------------- announce EMBEDDED IPS FOR XTAB -run_mlr --xtab --mmap cat $indir/embedded-ips.xtab -run_mlr --xtab --no-mmap cat $indir/embedded-ips.xtab +run_mlr --xtab cat $indir/embedded-ips.xtab # ---------------------------------------------------------------- announce MULTI-CHARACTER IRS FOR PPRINT @@ -5893,8 +5871,8 @@ run_mlr --opprint --barred --right cat $indir/abixy-het # ---------------------------------------------------------------- announce MULTI-CHARACTER IXS SPECIFIERS -run_mlr --oxtab --idkvp --mmap --irs lf --ifs '\x2c' --ips '\075' cut -o -f x,a,i $indir/multi-sep.dkvp-crlf -run_mlr --oxtab --idkvp --mmap --irs lf --ifs /, --ips '\x3d\x3a' cut -o -f x,a,i $indir/multi-sep.dkvp-crlf +run_mlr --oxtab --idkvp --irs lf --ifs '\x2c' --ips '\075' cut -o -f x,a,i $indir/multi-sep.dkvp-crlf +run_mlr --oxtab --idkvp --irs lf --ifs /, --ips '\x3d\x3a' cut -o -f x,a,i $indir/multi-sep.dkvp-crlf # ---------------------------------------------------------------- announce JSON I/O @@ -6010,96 +5988,96 @@ run_mlr --irs auto --ors lf cat $indir/line-term-crlf.dkvp run_mlr cat $indir/line-term-lf.dkvp run_mlr cat $indir/line-term-crlf.dkvp -mention mmap nidx +mention nidx run_mlr --irs auto --ors lf --nidx --fs comma cat $indir/line-term-lf.dkvp run_mlr --irs auto --ors lf --nidx --fs comma cat $indir/line-term-crlf.dkvp run_mlr --nidx --fs comma cat $indir/line-term-lf.dkvp run_mlr --nidx --fs comma cat $indir/line-term-crlf.dkvp -mention mmap csvlite +mention csvlite run_mlr --irs auto --ors lf --csvlite cat $indir/line-term-lf.csv run_mlr --irs auto --ors lf --csvlite cat $indir/line-term-crlf.csv run_mlr --csvlite cat $indir/line-term-lf.csv run_mlr --csvlite cat $indir/line-term-crlf.csv -mention mmap pprint +mention pprint run_mlr --irs auto --ors lf --pprint cat $indir/line-term-lf.csv run_mlr --irs auto --ors lf --pprint cat $indir/line-term-crlf.csv run_mlr --pprint cat $indir/line-term-lf.csv run_mlr --pprint cat $indir/line-term-crlf.csv -mention mmap xtab +mention xtab run_mlr --ifs auto --xtab cat $indir/line-term-lf.xtab run_mlr --ifs auto --xtab cat $indir/line-term-crlf.xtab run_mlr --fs auto --xtab cat $indir/line-term-lf.xtab run_mlr --fs auto --xtab cat $indir/line-term-crlf.xtab -mention mmap xtab +mention xtab run_mlr --ifs auto --xtab cat $indir/line-term-lf.xtab run_mlr --ifs auto --xtab cat $indir/line-term-crlf.xtab run_mlr --fs auto --xtab cat $indir/line-term-lf.xtab run_mlr --fs auto --xtab cat $indir/line-term-crlf.xtab -mention mmap csv +mention csv run_mlr --irs auto --ors lf --csv cat $indir/line-term-lf.csv run_mlr --irs auto --ors lf --csv cat $indir/line-term-crlf.csv run_mlr --csv cat $indir/line-term-lf.csv run_mlr --csv cat $indir/line-term-crlf.csv -mention mmap json nowrap nostack +mention json nowrap nostack run_mlr --irs auto --ors lf --json cat $indir/line-term-lf.json run_mlr --irs auto --ors lf --json cat $indir/line-term-crlf.json run_mlr --json cat $indir/line-term-lf.json run_mlr --json cat $indir/line-term-crlf.json -mention mmap json yeswrap nostack +mention json yeswrap nostack run_mlr --irs auto --ors lf --jlistwrap --json cat $indir/line-term-lf-wrap.json run_mlr --irs auto --ors lf --jlistwrap --json cat $indir/line-term-crlf-wrap.json run_mlr --jlistwrap --json cat $indir/line-term-lf-wrap.json run_mlr --jlistwrap --json cat $indir/line-term-crlf-wrap.json -mention mmap json nowrap yesstack +mention json nowrap yesstack run_mlr --irs auto --json --jvstack cat $indir/line-term-lf.json run_mlr --irs auto --ors lf --json --jvstack cat $indir/line-term-crlf.json run_mlr --json --jvstack cat $indir/line-term-lf.json run_mlr --json --jvstack cat $indir/line-term-crlf.json -mention mmap json yeswrap yesstack +mention json yeswrap yesstack run_mlr --irs auto --ors lf --jlistwrap --json --jvstack cat $indir/line-term-lf-wrap.json run_mlr --irs auto --ors lf --jlistwrap --json --jvstack cat $indir/line-term-crlf-wrap.json run_mlr --jlistwrap --json --jvstack cat $indir/line-term-lf-wrap.json run_mlr --jlistwrap --json --jvstack cat $indir/line-term-crlf-wrap.json -mention mmap json nowrap nostack +mention json nowrap nostack run_mlr --irs auto --ors lf --json cat $indir/line-term-lf.json run_mlr --irs auto --ors lf --json cat $indir/line-term-crlf.json run_mlr --json cat $indir/line-term-lf.json run_mlr --json cat $indir/line-term-crlf.json -mention mmap json yeswrap nostack +mention json yeswrap nostack run_mlr --irs auto --ors lf --jlistwrap --json cat $indir/line-term-lf-wrap.json run_mlr --irs auto --ors lf --jlistwrap --json cat $indir/line-term-crlf-wrap.json run_mlr --jlistwrap --json cat $indir/line-term-lf-wrap.json run_mlr --jlistwrap --json cat $indir/line-term-crlf-wrap.json -mention mmap json nowrap yesstack +mention json nowrap yesstack run_mlr --irs auto --ors lf --json --jvstack cat $indir/line-term-lf.json run_mlr --irs auto --ors lf --json --jvstack cat $indir/line-term-crlf.json run_mlr --json --jvstack cat $indir/line-term-lf.json run_mlr --json --jvstack cat $indir/line-term-crlf.json -mention mmap json yeswrap yesstack +mention json yeswrap yesstack run_mlr --irs auto --ors lf --jlistwrap --json --jvstack cat $indir/line-term-lf-wrap.json run_mlr --irs auto --ors lf --jlistwrap --json --jvstack cat $indir/line-term-crlf-wrap.json run_mlr --jlistwrap --json --jvstack cat $indir/line-term-lf-wrap.json diff --git a/c/unit_test/test_byte_readers.c b/c/unit_test/test_byte_readers.c index d1fbb45ce..079ee72d3 100644 --- a/c/unit_test/test_byte_readers.c +++ b/c/unit_test/test_byte_readers.c @@ -116,92 +116,12 @@ static char* test_stdio_byte_reader_reuse() { return NULL; } -// ---------------------------------------------------------------- -static char* test_mmap_byte_reader_1() { -#if MLR_ARCH_MMAP_ENABLED - byte_reader_t* pbr = mmap_byte_reader_alloc(); - - char* contents = ""; - char* path = write_temp_file_or_die(contents); - int ok = pbr->popen_func(pbr, NULL, path); - mu_assert_lf(ok == TRUE); - mu_assert_lf(pbr->pread_func(pbr) == EOF); - mu_assert_lf(pbr->pread_func(pbr) == EOF); - mu_assert_lf(pbr->pread_func(pbr) == EOF); - unlink_file_or_die(path); - - return NULL; -#endif -} - -// ---------------------------------------------------------------- -static char* test_mmap_byte_reader_2() { -#if MLR_ARCH_MMAP_ENABLED - byte_reader_t* pbr = mmap_byte_reader_alloc(); - - char* contents = "abcdefg"; - char* path = write_temp_file_or_die(contents); - int ok = pbr->popen_func(pbr, NULL, path); - mu_assert_lf(ok == TRUE); - mu_assert_lf(pbr->pread_func(pbr) == 'a'); - mu_assert_lf(pbr->pread_func(pbr) == 'b'); - mu_assert_lf(pbr->pread_func(pbr) == 'c'); - mu_assert_lf(pbr->pread_func(pbr) == 'd'); - mu_assert_lf(pbr->pread_func(pbr) == 'e'); - mu_assert_lf(pbr->pread_func(pbr) == 'f'); - mu_assert_lf(pbr->pread_func(pbr) == 'g'); - mu_assert_lf(pbr->pread_func(pbr) == EOF); - mu_assert_lf(pbr->pread_func(pbr) == EOF); - mu_assert_lf(pbr->pread_func(pbr) == EOF); - unlink_file_or_die(path); - - return NULL; -#endif -} - -// ---------------------------------------------------------------- -static char* test_mmap_byte_reader_reuse() { -#if MLR_ARCH_MMAP_ENABLED - byte_reader_t* pbr = mmap_byte_reader_alloc(); - - char* contents = "abc"; - char* path = write_temp_file_or_die(contents); - int ok = pbr->popen_func(pbr, NULL, path); - mu_assert_lf(ok == TRUE); - mu_assert_lf(pbr->pread_func(pbr) == 'a'); - mu_assert_lf(pbr->pread_func(pbr) == 'b'); - mu_assert_lf(pbr->pread_func(pbr) == 'c'); - mu_assert_lf(pbr->pread_func(pbr) == EOF); - mu_assert_lf(pbr->pread_func(pbr) == EOF); - mu_assert_lf(pbr->pread_func(pbr) == EOF); - unlink_file_or_die(path); - - contents = "defg"; - path = write_temp_file_or_die(contents); - ok = pbr->popen_func(pbr, NULL, path); - mu_assert_lf(ok == TRUE); - mu_assert_lf(pbr->pread_func(pbr) == 'd'); - mu_assert_lf(pbr->pread_func(pbr) == 'e'); - mu_assert_lf(pbr->pread_func(pbr) == 'f'); - mu_assert_lf(pbr->pread_func(pbr) == 'g'); - mu_assert_lf(pbr->pread_func(pbr) == EOF); - mu_assert_lf(pbr->pread_func(pbr) == EOF); - mu_assert_lf(pbr->pread_func(pbr) == EOF); - unlink_file_or_die(path); - - return NULL; -#endif -} - // ================================================================ static char * run_all_tests() { mu_run_test(test_string_byte_reader); mu_run_test(test_stdio_byte_reader_1); mu_run_test(test_stdio_byte_reader_2); mu_run_test(test_stdio_byte_reader_reuse); - mu_run_test(test_mmap_byte_reader_1); - mu_run_test(test_mmap_byte_reader_2); - mu_run_test(test_mmap_byte_reader_reuse); return 0; }