mirror of
https://github.com/johnkerl/miller.git
synced 2026-01-23 02:14:13 +00:00
remove mmap-readers, which were high-maintenance and not able to be used when most needed
This commit is contained in:
parent
740066fcc9
commit
2632ddc716
28 changed files with 93 additions and 4058 deletions
|
|
@ -81,7 +81,6 @@ TEST_BYTE_READERS_SRCS = \
|
|||
lib/string_builder.c \
|
||||
input/string_byte_reader.c \
|
||||
input/stdio_byte_reader.c \
|
||||
input/mmap_byte_reader.c \
|
||||
unit_test/test_byte_readers.c
|
||||
|
||||
TEST_LINE_READERS_SRCS = \
|
||||
|
|
@ -137,18 +136,12 @@ TEST_LREC_SRCS = \
|
|||
containers/sllmv.c \
|
||||
containers/mlhmmv.c \
|
||||
input/line_readers.c \
|
||||
input/file_reader_mmap.c \
|
||||
input/file_reader_stdio.c \
|
||||
input/file_ingestor_stdio.c \
|
||||
input/lrec_reader_mmap_csvlite.c \
|
||||
input/lrec_reader_stdio_csvlite.c \
|
||||
input/lrec_reader_mmap_dkvp.c \
|
||||
input/lrec_reader_stdio_dkvp.c \
|
||||
input/lrec_reader_mmap_nidx.c \
|
||||
input/lrec_reader_stdio_nidx.c \
|
||||
input/lrec_reader_mmap_xtab.c \
|
||||
input/lrec_reader_stdio_xtab.c \
|
||||
input/lrec_reader_mmap_json.c \
|
||||
input/lrec_reader_stdio_json.c \
|
||||
input/mlr_json_adapter.c \
|
||||
input/json_parser.c \
|
||||
|
|
@ -187,18 +180,12 @@ TEST_MULTIPLE_CONTAINERS_SRCS = \
|
|||
containers/top_keeper.c \
|
||||
containers/dheap.c \
|
||||
input/line_readers.c \
|
||||
input/file_reader_mmap.c \
|
||||
input/file_reader_stdio.c \
|
||||
input/file_ingestor_stdio.c \
|
||||
input/lrec_reader_mmap_csvlite.c \
|
||||
input/lrec_reader_stdio_csvlite.c \
|
||||
input/lrec_reader_mmap_dkvp.c \
|
||||
input/lrec_reader_stdio_dkvp.c \
|
||||
input/lrec_reader_mmap_nidx.c \
|
||||
input/lrec_reader_stdio_nidx.c \
|
||||
input/lrec_reader_mmap_xtab.c \
|
||||
input/lrec_reader_stdio_xtab.c \
|
||||
input/lrec_reader_mmap_json.c \
|
||||
input/lrec_reader_stdio_json.c \
|
||||
input/mlr_json_adapter.c \
|
||||
input/json_parser.c \
|
||||
|
|
@ -358,27 +345,19 @@ TEST_JOIN_BUCKET_KEEPER_SRCS = \
|
|||
containers/mixutil.c \
|
||||
containers/header_keeper.c \
|
||||
containers/join_bucket_keeper.c \
|
||||
input/mmap_byte_reader.c \
|
||||
input/stdio_byte_reader.c \
|
||||
input/line_readers.c \
|
||||
input/lrec_reader_gen.c \
|
||||
input/lrec_reader_in_memory.c \
|
||||
input/lrec_readers.c \
|
||||
input/lrec_reader_mmap_csv.c \
|
||||
input/lrec_reader_stdio_csv.c \
|
||||
input/lrec_reader_mmap_csvlite.c \
|
||||
input/lrec_reader_stdio_csvlite.c \
|
||||
input/lrec_reader_mmap_dkvp.c \
|
||||
input/lrec_reader_stdio_dkvp.c \
|
||||
input/lrec_reader_mmap_nidx.c \
|
||||
input/lrec_reader_stdio_nidx.c \
|
||||
input/lrec_reader_mmap_xtab.c \
|
||||
input/lrec_reader_stdio_xtab.c \
|
||||
input/lrec_reader_mmap_json.c \
|
||||
input/lrec_reader_stdio_json.c \
|
||||
input/mlr_json_adapter.c \
|
||||
input/json_parser.c \
|
||||
input/file_reader_mmap.c \
|
||||
input/file_reader_stdio.c \
|
||||
input/file_ingestor_stdio.c \
|
||||
input/peek_file_reader.c \
|
||||
|
|
@ -398,7 +377,6 @@ EXPERIMENTAL_READER_SRCS = \
|
|||
lib/string_array.c \
|
||||
lib/string_builder.c \
|
||||
input/stdio_byte_reader.c \
|
||||
input/file_reader_mmap.c \
|
||||
input/line_readers.c \
|
||||
containers/parse_trie.c \
|
||||
experimental/getlines.c
|
||||
|
|
@ -492,7 +470,6 @@ unit-test: test-mlrutil test-mlrregex test-argparse test-line-readers test-byte-
|
|||
|
||||
reg-test:
|
||||
./reg_test/run
|
||||
./reg_test/run --no-mmap
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
# Run this after unit-test expected output has changed, and is verified to be
|
||||
|
|
|
|||
|
|
@ -76,7 +76,6 @@ TEST_BYTE_READERS_SRCS = \
|
|||
lib/string_builder.c \
|
||||
input/string_byte_reader.c \
|
||||
input/stdio_byte_reader.c \
|
||||
input/mmap_byte_reader.c \
|
||||
unit_test/test_byte_readers.c
|
||||
|
||||
TEST_LINE_READERS_SRCS = \
|
||||
|
|
@ -125,18 +124,12 @@ TEST_LREC_SRCS = \
|
|||
containers/sllmv.c \
|
||||
containers/mlhmmv.c \
|
||||
input/line_readers.c \
|
||||
input/file_reader_mmap.c \
|
||||
input/file_reader_stdio.c \
|
||||
input/file_ingestor_stdio.c \
|
||||
input/lrec_reader_mmap_csvlite.c \
|
||||
input/lrec_reader_stdio_csvlite.c \
|
||||
input/lrec_reader_mmap_dkvp.c \
|
||||
input/lrec_reader_stdio_dkvp.c \
|
||||
input/lrec_reader_mmap_nidx.c \
|
||||
input/lrec_reader_stdio_nidx.c \
|
||||
input/lrec_reader_mmap_xtab.c \
|
||||
input/lrec_reader_stdio_xtab.c \
|
||||
input/lrec_reader_mmap_json.c \
|
||||
input/lrec_reader_stdio_json.c \
|
||||
input/mlr_json_adapter.c \
|
||||
input/json_parser.c \
|
||||
|
|
@ -173,18 +166,12 @@ TEST_MULTIPLE_CONTAINERS_SRCS = \
|
|||
containers/top_keeper.c \
|
||||
containers/dheap.c \
|
||||
input/line_readers.c \
|
||||
input/file_reader_mmap.c \
|
||||
input/file_reader_stdio.c \
|
||||
input/file_ingestor_stdio.c \
|
||||
input/lrec_reader_mmap_csvlite.c \
|
||||
input/lrec_reader_stdio_csvlite.c \
|
||||
input/lrec_reader_mmap_dkvp.c \
|
||||
input/lrec_reader_stdio_dkvp.c \
|
||||
input/lrec_reader_mmap_nidx.c \
|
||||
input/lrec_reader_stdio_nidx.c \
|
||||
input/lrec_reader_mmap_xtab.c \
|
||||
input/lrec_reader_stdio_xtab.c \
|
||||
input/lrec_reader_mmap_json.c \
|
||||
input/lrec_reader_stdio_json.c \
|
||||
input/mlr_json_adapter.c \
|
||||
input/json_parser.c \
|
||||
|
|
@ -325,26 +312,18 @@ TEST_JOIN_BUCKET_KEEPER_SRCS = \
|
|||
containers/mixutil.c \
|
||||
containers/header_keeper.c \
|
||||
containers/join_bucket_keeper.c \
|
||||
input/mmap_byte_reader.c \
|
||||
input/stdio_byte_reader.c \
|
||||
input/line_readers.c \
|
||||
input/lrec_reader_in_memory.c \
|
||||
input/lrec_readers.c \
|
||||
input/lrec_reader_mmap_csv.c \
|
||||
input/lrec_reader_stdio_csv.c \
|
||||
input/lrec_reader_mmap_csvlite.c \
|
||||
input/lrec_reader_stdio_csvlite.c \
|
||||
input/lrec_reader_mmap_dkvp.c \
|
||||
input/lrec_reader_stdio_dkvp.c \
|
||||
input/lrec_reader_mmap_nidx.c \
|
||||
input/lrec_reader_stdio_nidx.c \
|
||||
input/lrec_reader_mmap_xtab.c \
|
||||
input/lrec_reader_stdio_xtab.c \
|
||||
input/lrec_reader_mmap_json.c \
|
||||
input/lrec_reader_stdio_json.c \
|
||||
input/mlr_json_adapter.c \
|
||||
input/json_parser.c \
|
||||
input/file_reader_mmap.c \
|
||||
input/file_reader_stdio.c \
|
||||
input/file_ingestor_stdio.c \
|
||||
input/peek_file_reader.c \
|
||||
|
|
@ -362,7 +341,6 @@ EXPERIMENTAL_READER_SRCS = \
|
|||
lib/string_array.c \
|
||||
lib/string_builder.c \
|
||||
input/stdio_byte_reader.c \
|
||||
input/file_reader_mmap.c \
|
||||
input/line_readers.c \
|
||||
containers/parse_trie.c \
|
||||
experimental/getlines.c
|
||||
|
|
|
|||
|
|
@ -33,7 +33,6 @@
|
|||
#define DEFAULT_JSON_FLATTEN_SEPARATOR ":"
|
||||
#define DEFAULT_OOSVAR_FLATTEN_SEPARATOR ":"
|
||||
#define DEFAULT_COMMENT_STRING "#"
|
||||
#define DEFAULT_MAX_FILE_SIZE_FOR_MMAP (4LL*1024LL*1024LL*1024LL)
|
||||
|
||||
// ASCII 1f and 1e
|
||||
#define ASV_FS "\x1f"
|
||||
|
|
@ -278,36 +277,9 @@ cli_opts_t* parse_command_line(int argc, char** argv, sllv_t** ppmapper_list) {
|
|||
slls_append(popts->filenames, argv[argi], NO_FREE);
|
||||
}
|
||||
|
||||
// Check for use of mmap. It's about 20% faster than stdio (due to fewer data copies
|
||||
// -- lrecs can be pointer-backed by mmap memory) but we can't use it in all situations.
|
||||
if (no_input) {
|
||||
slls_free(popts->filenames);
|
||||
popts->filenames = NULL;
|
||||
} else if (popts->filenames->length == 0) {
|
||||
// No filenames means read from standard input, and standard input cannot be mmapped.
|
||||
popts->reader_opts.use_mmap_for_read = FALSE;
|
||||
} else if (popts->filenames->length > 10) {
|
||||
// https://github.com/johnkerl/miller/issues/256: too many small files is as bad as one big one
|
||||
// (for which see immediately below).
|
||||
popts->reader_opts.use_mmap_for_read = FALSE;
|
||||
} else if (popts->reader_opts.use_mmap_for_read == TRUE) {
|
||||
// https://github.com/johnkerl/miller/issues/160: don't use mmap for large files.
|
||||
//
|
||||
// If any input files don't exist, don't error out just yet ... it's possible that the user
|
||||
// is doing some complex put-with-tee or somesuch which will create the input file by the
|
||||
// time it's needed. In that case we of course can't know the size yet, so avoid mmap there
|
||||
// to be safe.
|
||||
int all_exist_and_are_small_enough = TRUE;
|
||||
for (sllse_t* pe = popts->filenames->phead; pe != NULL; pe = pe->pnext) {
|
||||
ssize_t file_size = get_file_size(pe->value);
|
||||
if (file_size == (ssize_t)(-1) || file_size >= popts->reader_opts.max_file_size_for_mmap) {
|
||||
all_exist_and_are_small_enough = FALSE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!all_exist_and_are_small_enough) {
|
||||
popts->reader_opts.use_mmap_for_read = FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
if (popts->do_in_place && (popts->filenames == NULL || popts->filenames->length == 0)) {
|
||||
|
|
@ -842,14 +814,6 @@ static void main_usage_data_format_options(FILE* o, char* argv0) {
|
|||
fprintf(o, "\n");
|
||||
fprintf(o, " -p is a keystroke-saver for --nidx --fs space --repifs\n");
|
||||
fprintf(o, "\n");
|
||||
fprintf(o, " --mmap --no-mmap --mmap-below {n} Use mmap for files whenever possible, never, or\n");
|
||||
fprintf(o, " for files less than n bytes in size. Default is for\n");
|
||||
fprintf(o, " files less than %lld bytes in size.\n", DEFAULT_MAX_FILE_SIZE_FOR_MMAP);
|
||||
fprintf(o, " 'Whenever possible' means always except for when reading\n");
|
||||
fprintf(o, " standard input which is not mmappable. If you don't know\n");
|
||||
fprintf(o, " what this means, don't worry about it -- it's a minor\n");
|
||||
fprintf(o, " performance optimization.\n");
|
||||
fprintf(o, "\n");
|
||||
fprintf(o, " Examples: --csv for CSV-formatted input and output; --idkvp --opprint for\n");
|
||||
fprintf(o, " DKVP-formatted input and pretty-printed output.\n");
|
||||
fprintf(o, "\n");
|
||||
|
|
@ -1139,14 +1103,11 @@ void cli_reader_opts_init(cli_reader_opts_t* preader_opts) {
|
|||
preader_opts->allow_repeat_ips = NEITHER_TRUE_NOR_FALSE;
|
||||
preader_opts->use_implicit_csv_header = NEITHER_TRUE_NOR_FALSE;
|
||||
preader_opts->allow_ragged_csv_input = NEITHER_TRUE_NOR_FALSE;
|
||||
preader_opts->use_mmap_for_read = NEITHER_TRUE_NOR_FALSE;
|
||||
|
||||
preader_opts->prepipe = NULL;
|
||||
preader_opts->comment_handling = COMMENTS_ARE_DATA;
|
||||
preader_opts->comment_string = NULL;
|
||||
|
||||
preader_opts->max_file_size_for_mmap = DEFAULT_MAX_FILE_SIZE_FOR_MMAP;
|
||||
|
||||
// xxx temp
|
||||
preader_opts->generator_opts.field_name = "i";
|
||||
preader_opts->generator_opts.start = 0LL;
|
||||
|
|
@ -1198,13 +1159,6 @@ void cli_apply_reader_defaults(cli_reader_opts_t* preader_opts) {
|
|||
if (preader_opts->allow_ragged_csv_input == NEITHER_TRUE_NOR_FALSE)
|
||||
preader_opts->allow_ragged_csv_input = FALSE;
|
||||
|
||||
if (preader_opts->use_mmap_for_read == NEITHER_TRUE_NOR_FALSE)
|
||||
#if MLR_ARCH_MMAP_ENABLED
|
||||
preader_opts->use_mmap_for_read = TRUE;
|
||||
#else
|
||||
preader_opts->use_mmap_for_read = FALSE;
|
||||
#endif
|
||||
|
||||
if (preader_opts->input_json_flatten_separator == NULL)
|
||||
preader_opts->input_json_flatten_separator = DEFAULT_JSON_FLATTEN_SEPARATOR;
|
||||
}
|
||||
|
|
@ -1311,9 +1265,6 @@ void cli_merge_reader_opts(cli_reader_opts_t* pfunc_opts, cli_reader_opts_t* pma
|
|||
if (pfunc_opts->allow_ragged_csv_input == NEITHER_TRUE_NOR_FALSE)
|
||||
pfunc_opts->allow_ragged_csv_input = pmain_opts->allow_ragged_csv_input;
|
||||
|
||||
if (pfunc_opts->use_mmap_for_read == NEITHER_TRUE_NOR_FALSE)
|
||||
pfunc_opts->use_mmap_for_read = pmain_opts->use_mmap_for_read;
|
||||
|
||||
if (pfunc_opts->input_json_flatten_separator == NULL)
|
||||
pfunc_opts->input_json_flatten_separator = pmain_opts->input_json_flatten_separator;
|
||||
}
|
||||
|
|
@ -1642,28 +1593,18 @@ int cli_handle_reader_options(char** argv, int argc, int *pargi, cli_reader_opts
|
|||
argi += 1;
|
||||
|
||||
} else if (streq(argv[argi], "--mmap")) {
|
||||
preader_opts->use_mmap_for_read = TRUE;
|
||||
// No-op as of 5.6.3 (mmap is being abandoned) but don't break
|
||||
// the command-line user experience.
|
||||
argi += 1;
|
||||
|
||||
} else if (streq(argv[argi], "--no-mmap")) {
|
||||
preader_opts->use_mmap_for_read = FALSE;
|
||||
// No-op as of 5.6.3 (mmap is being abandoned) but don't break
|
||||
// the command-line user experience.
|
||||
argi += 1;
|
||||
|
||||
} else if (streq(argv[argi], "--mmap-below")) {
|
||||
check_arg_count(argv, argi, argc, 2);
|
||||
preader_opts->use_mmap_for_read = TRUE;
|
||||
long long llmax;
|
||||
if (sscanf(argv[argi+1], "%lld", &llmax) != 1) {
|
||||
fprintf(stderr, "%s: could not scan \"%s\".\n",
|
||||
MLR_GLOBALS.bargv0, argv[argi+1]);
|
||||
}
|
||||
preader_opts->max_file_size_for_mmap = llmax;
|
||||
argi += 2;
|
||||
|
||||
} else if (streq(argv[argi], "--prepipe")) {
|
||||
check_arg_count(argv, argi, argc, 2);
|
||||
preader_opts->prepipe = argv[argi+1];
|
||||
preader_opts->use_mmap_for_read = FALSE;
|
||||
argi += 2;
|
||||
|
||||
} else if (streq(argv[argi], "--skip-comments")) {
|
||||
|
|
|
|||
|
|
@ -37,7 +37,6 @@ typedef struct _cli_reader_opts_t {
|
|||
int allow_repeat_ips;
|
||||
int use_implicit_csv_header;
|
||||
int allow_ragged_csv_input;
|
||||
int use_mmap_for_read;
|
||||
|
||||
// Command for popen on input, e.g. "zcat -cf <". Can be null in which case
|
||||
// files are read directly rather than through a pipe.
|
||||
|
|
@ -46,9 +45,6 @@ typedef struct _cli_reader_opts_t {
|
|||
comment_handling_t comment_handling;
|
||||
char* comment_string;
|
||||
|
||||
// https://github.com/johnkerl/miller/issues/160
|
||||
ssize_t max_file_size_for_mmap;
|
||||
|
||||
// Fake internal-data-generator 'reader'
|
||||
generator_opts_t generator_opts;
|
||||
|
||||
|
|
|
|||
|
|
@ -2,8 +2,6 @@ noinst_LTLIBRARIES= libinput.la
|
|||
libinput_la_SOURCES= \
|
||||
byte_reader.h \
|
||||
byte_readers.h \
|
||||
file_reader_mmap.c \
|
||||
file_reader_mmap.h \
|
||||
file_reader_stdio.c \
|
||||
file_reader_stdio.h \
|
||||
file_ingestor_stdio.c \
|
||||
|
|
@ -17,12 +15,6 @@ libinput_la_SOURCES= \
|
|||
lrec_reader.h \
|
||||
lrec_reader_gen.c \
|
||||
lrec_reader_in_memory.c \
|
||||
lrec_reader_mmap_csv.c \
|
||||
lrec_reader_mmap_csvlite.c \
|
||||
lrec_reader_mmap_dkvp.c \
|
||||
lrec_reader_mmap_json.c \
|
||||
lrec_reader_mmap_nidx.c \
|
||||
lrec_reader_mmap_xtab.c \
|
||||
lrec_reader_stdio_csv.c \
|
||||
lrec_reader_stdio_csvlite.c \
|
||||
lrec_reader_stdio_dkvp.c \
|
||||
|
|
@ -31,7 +23,6 @@ libinput_la_SOURCES= \
|
|||
lrec_reader_stdio_xtab.c \
|
||||
lrec_readers.c \
|
||||
lrec_readers.h \
|
||||
mmap_byte_reader.c \
|
||||
peek_file_reader.c \
|
||||
peek_file_reader.h \
|
||||
stdio_byte_reader.c \
|
||||
|
|
|
|||
|
|
@ -4,10 +4,8 @@
|
|||
|
||||
byte_reader_t* string_byte_reader_alloc();
|
||||
byte_reader_t* stdio_byte_reader_alloc();
|
||||
byte_reader_t* mmap_byte_reader_alloc();
|
||||
|
||||
void string_byte_reader_free(byte_reader_t* pbr);
|
||||
void stdio_byte_reader_free(byte_reader_t* pbr);
|
||||
void mmap_byte_reader_free(byte_reader_t* pbr);
|
||||
|
||||
#endif // BYTE_READERS_H
|
||||
|
|
|
|||
|
|
@ -1,84 +0,0 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/stat.h>
|
||||
#include "lib/mlr_arch.h"
|
||||
#include "lib/mlrutil.h"
|
||||
#include "lib/mlr_globals.h"
|
||||
#include "file_reader_mmap.h"
|
||||
|
||||
#if MLR_ARCH_MMAP_ENABLED
|
||||
static char empty_buf[1] = { 0 };
|
||||
#endif
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
file_reader_mmap_state_t* file_reader_mmap_open(char* prepipe, char* file_name) {
|
||||
#if MLR_ARCH_MMAP_ENABLED
|
||||
// popen is a stdio construct, not an mmap construct, and it can't be supported here.
|
||||
if (prepipe != NULL) {
|
||||
fprintf(stderr, "%s: coding error detected in file %s at line %d.\n",
|
||||
MLR_GLOBALS.bargv0, __FILE__, __LINE__);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
file_reader_mmap_state_t* pstate = mlr_malloc_or_die(sizeof(file_reader_mmap_state_t));
|
||||
pstate->fd = open(file_name, O_RDONLY);
|
||||
if (pstate->fd < 0) {
|
||||
perror("open");
|
||||
fprintf(stderr, "%s: could not open \"%s\"\n", MLR_GLOBALS.bargv0, file_name);
|
||||
exit(1);
|
||||
}
|
||||
struct stat stat;
|
||||
if (fstat(pstate->fd, &stat) < 0) {
|
||||
perror("fstat");
|
||||
fprintf(stderr, "%s: could not fstat \"%s\"\n", MLR_GLOBALS.bargv0, file_name);
|
||||
exit(1);
|
||||
}
|
||||
if (stat.st_size == 0) {
|
||||
// mmap doesn't allow us to map zero-length files but zero-length files do exist.
|
||||
pstate->sol = &empty_buf[0];
|
||||
} else {
|
||||
pstate->sol = mmap(NULL, (size_t)stat.st_size, PROT_READ|PROT_WRITE, MAP_FILE|MAP_PRIVATE, pstate->fd, (off_t)0);
|
||||
if (pstate->sol == MAP_FAILED) {
|
||||
perror("mmap");
|
||||
fprintf(stderr, "%s: could not mmap \"%s\"\n", MLR_GLOBALS.bargv0, file_name);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
pstate->eof = pstate->sol + stat.st_size;
|
||||
// POSIX semantics: the mmap itself increments a reference count to the file, in addition to the
|
||||
// open. We close the file but keep the mmap reference until a subsequent munmap.
|
||||
if (close(pstate->fd) < 0) {
|
||||
perror("close");
|
||||
exit(1);
|
||||
}
|
||||
return pstate;
|
||||
#else
|
||||
fprintf(stderr, "%s: mmap is unsupported on this architecture.\n", MLR_GLOBALS.bargv0);
|
||||
exit(1);
|
||||
return NULL;
|
||||
#endif
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// Here we intentionally do not munmap.
|
||||
//
|
||||
// This method is used by various lrec readers, where lrecs are instantiated with keys/values
|
||||
// pointing into mmapped file-contents buffers. This is done for the sake of performance, to reduce
|
||||
// data-copies. But it also means we can't unmap files after ingesting lrecs, since the lrecs in
|
||||
// question might be retained after the input-file closes. Example: mlr sort on multiple files.
|
||||
void file_reader_mmap_close(file_reader_mmap_state_t* pstate, char* prepipe) {
|
||||
free(pstate);
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
void* file_reader_mmap_vopen(void* pvstate, char* prepipe, char* file_name) {
|
||||
return file_reader_mmap_open(prepipe, file_name);
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
void file_reader_mmap_vclose(void* pvstate, void* pvhandle, char* prepipe) {
|
||||
file_reader_mmap_close(pvhandle, prepipe);
|
||||
}
|
||||
|
|
@ -1,20 +0,0 @@
|
|||
// ================================================================
|
||||
// Abstraction layer for mmapped file-read logic.
|
||||
// ================================================================
|
||||
|
||||
#ifndef FILE_READER_MMAP_H
|
||||
#define FILE_READER_MMAP_H
|
||||
|
||||
// Handle for one mmapped input file.
typedef struct _file_reader_mmap_state_t {
	char *sol; // current start position within the file data
	char *eof; // one past the last byte of the file data
	int   fd;  // file descriptor used to open the file
} file_reader_mmap_state_t;
|
||||
|
||||
file_reader_mmap_state_t* file_reader_mmap_open(char* prepipe, char* file_name);
|
||||
void file_reader_mmap_close(file_reader_mmap_state_t* pstate, char* prepipe);
|
||||
|
||||
void* file_reader_mmap_vopen(void* pvstate, char* prepipe, char* file_name);
|
||||
void file_reader_mmap_vclose(void* pvstate, void* pvhandle, char* prepipe);
|
||||
|
||||
#endif // FILE_READER_MMAP_H
|
||||
|
|
@ -4,7 +4,6 @@
|
|||
#include <stdio.h>
|
||||
#include "lib/context.h"
|
||||
#include "containers/lrec.h"
|
||||
#include "input/file_reader_mmap.h"
|
||||
|
||||
struct _lrec_reader_t; // forward reference for method declarations
|
||||
|
||||
|
|
|
|||
|
|
@ -2,7 +2,6 @@
|
|||
#include <stdlib.h>
|
||||
#include "lib/mlr_globals.h"
|
||||
#include "lib/mlrutil.h"
|
||||
#include "input/file_reader_mmap.h"
|
||||
#include "input/lrec_readers.h"
|
||||
|
||||
typedef struct _lrec_reader_gen_state_t {
|
||||
|
|
|
|||
|
|
@ -1,546 +0,0 @@
|
|||
// ================================================================
|
||||
// Note: there are multiple process methods with a lot of code duplication.
|
||||
// This is intentional. Much of Miller's measured processing time is in the
|
||||
// lrec-reader process methods. This is code which needs to execute on every
|
||||
// byte of input and even moving a single runtime if-statement into a
|
||||
// function-pointer assignment at alloc time can have noticeable effects on
|
||||
// performance (5-10% in some cases).
|
||||
// ================================================================
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
#include "cli/comment_handling.h"
|
||||
#include "lib/mlr_globals.h"
|
||||
#include "lib/mlrutil.h"
|
||||
#include "lib/string_builder.h"
|
||||
#include "input/file_reader_mmap.h"
|
||||
#include "input/lrec_readers.h"
|
||||
#include "input/peek_file_reader.h"
|
||||
#include "containers/rslls.h"
|
||||
#include "containers/lhmslv.h"
|
||||
#include "containers/parse_trie.h"
|
||||
|
||||
// Idea of pheader_keepers: each header_keeper object retains the input-line backing
|
||||
// and the slls_t for a CSV header line which is used by one or more CSV data
|
||||
// lines. Meanwhile some mappers retain input records from the entire data
|
||||
// stream, including header-schema changes in the input stream. This means we
|
||||
// need to keep headers intact as long as any lrecs are pointing to them. One
|
||||
// option is reference-counting which I experimented with; it was messy and
|
||||
// error-prone. The approach used here is to keep a hash map from header-schema
|
||||
// to header_keeper object. The current pheader_keeper is a pointer into one of
|
||||
// those. Then when the reader is freed, all the header-keepers are freed.
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
#define STRING_BUILDER_INIT_SIZE 1024
|
||||
|
||||
#define IRS_TOKEN 0x2001
|
||||
#define IFS_TOKEN 0x2002
|
||||
#define DQUOTE_TOKEN 0x2003
|
||||
#define DQUOTE_IRS_TOKEN 0x2004
|
||||
#define DQUOTE_IRS2_TOKEN 0x2005 // alternate line-ending for autodetect LF/CRLF
|
||||
#define DQUOTE_IFS_TOKEN 0x2006
|
||||
#define DQUOTE_DQUOTE_TOKEN 0x2007
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
typedef struct _lrec_reader_mmap_csv_state_t {
|
||||
// Input line number is not the same as the record-counter in context_t,
|
||||
// which counts records.
|
||||
long long ilno;
|
||||
|
||||
char* eof;
|
||||
char* irs;
|
||||
char* ifs_eof;
|
||||
char* ifs;
|
||||
char* dquote;
|
||||
char* dquote_irs;
|
||||
char* dquote_irs2;
|
||||
char* dquote_ifs;
|
||||
char* dquote_eof;
|
||||
char* dquote_dquote;
|
||||
int do_auto_line_term;
|
||||
comment_handling_t comment_handling;
|
||||
char* comment_string;
|
||||
int comment_string_length;
|
||||
|
||||
int dquotelen;
|
||||
|
||||
rslls_t* pfields;
|
||||
string_builder_t* psb;
|
||||
|
||||
parse_trie_t* pno_dquote_parse_trie;
|
||||
parse_trie_t* pdquote_parse_trie;
|
||||
|
||||
int expect_header_line_next;
|
||||
int use_implicit_csv_header;
|
||||
int allow_ragged_csv_input;
|
||||
header_keeper_t* pheader_keeper;
|
||||
lhmslv_t* pheader_keepers;
|
||||
|
||||
} lrec_reader_mmap_csv_state_t;
|
||||
|
||||
static void lrec_reader_mmap_csv_free(lrec_reader_t* preader);
|
||||
static void lrec_reader_mmap_csv_sof(void* pvstate, void* pvhandle);
|
||||
static lrec_t* lrec_reader_mmap_csv_process(void* pvstate, void* pvhandle, context_t* pctx);
|
||||
static int lrec_reader_mmap_csv_get_fields(lrec_reader_mmap_csv_state_t* pstate,
|
||||
rslls_t* pfields, file_reader_mmap_state_t* phandle, context_t* pctx);
|
||||
static lrec_t* paste_indices_and_data(lrec_reader_mmap_csv_state_t* pstate, rslls_t* pdata_fields, context_t* pctx);
|
||||
static lrec_t* paste_header_and_data_ragged(lrec_reader_mmap_csv_state_t* pstate, rslls_t* pdata_fields,
|
||||
context_t* pctx);
|
||||
static lrec_t* paste_header_and_data_rectangular(lrec_reader_mmap_csv_state_t* pstate, rslls_t* pdata_fields,
|
||||
context_t* pctx);
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// Builds an mmap-backed CSV record reader.
//
// irs: record separator; the special value "auto" enables line-ending autodetection, in
//      which case "\n" is used as the separator and "\"\r\n" is additionally accepted
//      as a quoted-field line ending.
// ifs: field separator.
// use_implicit_csv_header: when true, no header line is expected in the data.
// allow_ragged_csv_input:  selects ragged rather than rectangular header/data pasting.
// comment_handling, comment_string: comment-line policy; comment_string may be NULL.
//
// The returned reader and its state are released via the reader's pfree_func.
lrec_reader_t* lrec_reader_mmap_csv_alloc(char* irs, char* ifs, int use_implicit_csv_header,
	int allow_ragged_csv_input, comment_handling_t comment_handling, char* comment_string)
{
	lrec_reader_t* reader = mlr_malloc_or_die(sizeof(lrec_reader_t));
	lrec_reader_mmap_csv_state_t* state = mlr_malloc_or_die(sizeof(lrec_reader_mmap_csv_state_t));

	state->ilno = 0LL;

	// "auto" is a request for autodetection, not a literal separator.
	state->do_auto_line_term = streq(irs, "auto") ? TRUE : FALSE;
	if (state->do_auto_line_term)
		irs = "\n";

	state->comment_handling = comment_handling;
	state->comment_string = comment_string;
	if (comment_string == NULL)
		state->comment_string_length = 0;
	else
		state->comment_string_length = strlen(comment_string);

	// Delimiter strings
	state->eof = "\xff";
	state->irs = irs;
	state->ifs = ifs;
	state->ifs_eof = mlr_paste_2_strings(state->ifs, "\xff");
	state->dquote = "\"";
	state->dquote_ifs = mlr_paste_2_strings("\"", state->ifs);
	state->dquote_eof = "\"\xff";
	state->dquote_dquote = "\"\"";
	state->dquotelen = strlen(state->dquote);

	// Trie for text outside double quotes
	parse_trie_t* plain_trie = parse_trie_alloc();
	state->pno_dquote_parse_trie = plain_trie;
	parse_trie_add_string(plain_trie, state->irs, IRS_TOKEN);
	parse_trie_add_string(plain_trie, state->ifs, IFS_TOKEN);
	parse_trie_add_string(plain_trie, state->dquote, DQUOTE_TOKEN);

	// Trie for text inside double quotes. In auto mode irs is already "\n" here, so the
	// primary quoted line ending is quote-plus-irs in both modes; only auto mode adds the
	// CRLF alternate.
	parse_trie_t* quoted_trie = parse_trie_alloc();
	state->pdquote_parse_trie = quoted_trie;
	state->dquote_irs = mlr_paste_2_strings("\"", state->irs);
	state->dquote_irs2 = state->do_auto_line_term ? mlr_paste_2_strings("\"", "\r\n") : NULL;
	parse_trie_add_string(quoted_trie, state->dquote_irs, DQUOTE_IRS_TOKEN);
	if (state->do_auto_line_term)
		parse_trie_add_string(quoted_trie, state->dquote_irs2, DQUOTE_IRS2_TOKEN);
	parse_trie_add_string(quoted_trie, state->dquote_ifs, DQUOTE_IFS_TOKEN);
	parse_trie_add_string(quoted_trie, state->dquote_dquote, DQUOTE_DQUOTE_TOKEN);

	// Scratch storage
	state->pfields = rslls_alloc();
	state->psb = sb_alloc(STRING_BUILDER_INIT_SIZE);

	// Header bookkeeping
	if (use_implicit_csv_header)
		state->expect_header_line_next = FALSE;
	else
		state->expect_header_line_next = TRUE;
	state->use_implicit_csv_header = use_implicit_csv_header;
	state->allow_ragged_csv_input = allow_ragged_csv_input;
	state->pheader_keeper = NULL;
	state->pheader_keepers = lhmslv_alloc();

	// Method table
	reader->pvstate = state;
	reader->popen_func = file_reader_mmap_vopen;
	reader->pclose_func = file_reader_mmap_vclose;
	reader->pprocess_func = lrec_reader_mmap_csv_process;
	reader->psof_func = lrec_reader_mmap_csv_sof;
	reader->pfree_func = lrec_reader_mmap_csv_free;

	return reader;
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// Destroys a reader made by lrec_reader_mmap_csv_alloc: every cached header keeper, the
// header-keeper map, both parse tries, the scratch field list and string builder, the
// heap-allocated delimiter strings, the state struct, and finally the reader itself.
static void lrec_reader_mmap_csv_free(lrec_reader_t* preader) {
	lrec_reader_mmap_csv_state_t* state = preader->pvstate;

	// The map does not own its values, so release each header keeper by hand first.
	lhmslve_t* entry = state->pheader_keepers->phead;
	while (entry != NULL) {
		header_keeper_free(entry->pvvalue);
		entry = entry->pnext;
	}
	lhmslv_free(state->pheader_keepers);

	parse_trie_free(state->pno_dquote_parse_trie);
	parse_trie_free(state->pdquote_parse_trie);

	rslls_free(state->pfields);
	sb_free(state->psb);

	// Only the pasted-together strings were heap-allocated; dquote_irs2 may be NULL.
	free(state->ifs_eof);
	free(state->dquote_irs);
	free(state->dquote_irs2);
	free(state->dquote_ifs);

	free(state);
	free(preader);
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
static void lrec_reader_mmap_csv_sof(void* pvstate, void* pvhandle) {
|
||||
lrec_reader_mmap_csv_state_t* pstate = pvstate;
|
||||
pstate->ilno = 0LL;
|
||||
pstate->expect_header_line_next = pstate->use_implicit_csv_header ? FALSE : TRUE;
|
||||
|
||||
// Strip UTF-8 BOM if any
|
||||
file_reader_mmap_state_t* phandle = pvhandle;
|
||||
if ((phandle->eof - phandle->sol) >= 3) {
|
||||
if (memcmp(phandle->sol, "\xef\xbb\xbf", 3) == 0) {
|
||||
phandle->sol += 3;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// Returns the next record read from the mmapped CSV data, or NULL when there is no more input.
//
// If a header line is expected, it is read first (skipping comment lines) and looked up in,
// or added to, the header-keeper cache. Then the next non-comment data line is read and
// pasted against either positional indices (implicit header) or the current header, in
// ragged or rectangular mode.
//
// Lines whose first field starts with the comment string are dropped; when comment handling
// is PASS_COMMENTS they are first echoed to stdout with fields rejoined by ifs.
//
// The comment-echo code appears twice on purpose: the note at the top of this file says
// duplication in the process methods is intentional for performance.
static lrec_t* lrec_reader_mmap_csv_process(void* pvstate, void* pvhandle, context_t* pctx) {
	lrec_reader_mmap_csv_state_t* state = pvstate;
	file_reader_mmap_state_t* handle = pvhandle;

	// ---- Header line, if one is due
	if (state->expect_header_line_next) {
		for (;;) {
			if (!lrec_reader_mmap_csv_get_fields(state, state->pfields, handle, pctx))
				return NULL;
			state->ilno++;

			// Comments are checked here rather than inside the parser, so that users can
			// comment out lines containing double-quoted newlines.
			if (state->comment_string != NULL
				&& state->pfields->phead != NULL
				&& streqn(state->pfields->phead->value, state->comment_string, state->comment_string_length))
			{
				if (state->comment_handling == PASS_COMMENTS) {
					int nechoed = 0;
					rsllse_t* pecho = state->pfields->phead;
					while (nechoed < state->pfields->length && pecho != NULL) {
						if (nechoed > 0)
							fputs(state->ifs, stdout);
						fputs(pecho->value, stdout);
						pecho = pecho->pnext;
						nechoed++;
					}
					fputs(state->do_auto_line_term ? pctx->auto_line_term : state->irs, stdout);
				}
				rslls_reset(state->pfields);
				continue;
			}

			slls_t* header_names = slls_alloc();
			int nnames = 0;
			rsllse_t* pname = state->pfields->phead;
			while (nnames < state->pfields->length && pname != NULL) {
				if (*pname->value == 0) {
					fprintf(stderr, "%s: unacceptable empty CSV key at file \"%s\" line %lld.\n",
						MLR_GLOBALS.bargv0, pctx->filename, state->ilno);
					exit(1);
				}
				// Hand pointer-free responsibility from the field list over to the
				// header fields kept by the header keeper.
				slls_append(header_names, pname->value, pname->free_flag);
				pname->free_flag = 0;
				pname = pname->pnext;
				nnames++;
			}
			rslls_reset(state->pfields);

			header_keeper_t* keeper = lhmslv_get(state->pheader_keepers, header_names);
			if (keeper != NULL) {
				// Already cached: reuse it and drop the duplicate name list.
				slls_free(header_names);
			} else {
				keeper = header_keeper_alloc(NULL, header_names);
				lhmslv_put(state->pheader_keepers, header_names, keeper,
					NO_FREE); // freed by header-keeper
			}
			state->pheader_keeper = keeper;

			state->expect_header_line_next = FALSE;
			break;
		}
	}

	// ---- Data line
	for (;;) {
		int got_fields = lrec_reader_mmap_csv_get_fields(state, state->pfields, handle, pctx);
		state->ilno++;
		if (got_fields == FALSE) // EOF
			return NULL;

		// Same comment policy as for header lines.
		if (state->comment_string != NULL
			&& state->pfields->phead != NULL
			&& streqn(state->pfields->phead->value, state->comment_string, state->comment_string_length))
		{
			if (state->comment_handling == PASS_COMMENTS) {
				int nechoed = 0;
				rsllse_t* pecho = state->pfields->phead;
				while (nechoed < state->pfields->length && pecho != NULL) {
					if (nechoed > 0)
						fputs(state->ifs, stdout);
					fputs(pecho->value, stdout);
					pecho = pecho->pnext;
					nechoed++;
				}
				fputs(state->do_auto_line_term ? pctx->auto_line_term : state->irs, stdout);
			}
			rslls_reset(state->pfields);
			continue;
		}

		lrec_t* prec;
		if (state->use_implicit_csv_header)
			prec = paste_indices_and_data(state, state->pfields, pctx);
		else if (state->allow_ragged_csv_input)
			prec = paste_header_and_data_ragged(state, state->pfields, pctx);
		else
			prec = paste_header_and_data_rectangular(state, state->pfields, pctx);

		rslls_reset(state->pfields);
		return prec;
	}
}
|
||||
|
||||
static int lrec_reader_mmap_csv_get_fields(lrec_reader_mmap_csv_state_t* pstate,
|
||||
rslls_t* pfields, file_reader_mmap_state_t* phandle, context_t* pctx)
|
||||
{
|
||||
int rc, token = 0, matchlen = 0, record_done = FALSE, field_done = FALSE;
|
||||
string_builder_t* psb = pstate->psb;
|
||||
|
||||
if (phandle->sol >= phandle->eof)
|
||||
return FALSE;
|
||||
|
||||
char* p = phandle->sol;
|
||||
char* e = p;
|
||||
|
||||
// loop over fields in record
|
||||
record_done = FALSE;
|
||||
while (!record_done) {
|
||||
// Assumption is dquote is "\""
|
||||
if (*e != pstate->dquote[0]) { // start of non-quoted field
|
||||
|
||||
// Loop over characters in field
|
||||
field_done = FALSE;
|
||||
while (!field_done) {
|
||||
MLR_INTERNAL_CODING_ERROR_IF(e > phandle->eof);
|
||||
rc = parse_trie_match(pstate->pno_dquote_parse_trie, e, phandle->eof, &token, &matchlen);
|
||||
if (rc) {
|
||||
switch(token) {
|
||||
case IFS_TOKEN: // end of field
|
||||
*e = 0;
|
||||
rslls_append(pfields, p, NO_FREE, 0);
|
||||
p = e + matchlen;
|
||||
field_done = TRUE;
|
||||
break;
|
||||
case IRS_TOKEN: // end of record
|
||||
*e = 0;
|
||||
|
||||
if (pstate->do_auto_line_term) {
|
||||
if (e > p && e[-1] == '\r') {
|
||||
e[-1] = 0;
|
||||
context_set_autodetected_crlf(pctx);
|
||||
} else {
|
||||
context_set_autodetected_lf(pctx);
|
||||
}
|
||||
}
|
||||
|
||||
rslls_append(pfields, p, NO_FREE, 0);
|
||||
p = e + matchlen;
|
||||
field_done = TRUE;
|
||||
record_done = TRUE;
|
||||
break;
|
||||
case DQUOTE_TOKEN: // CSV syntax error: fields containing quotes must be fully wrapped in quotes
|
||||
fprintf(stderr, "%s: syntax error: unwrapped double quote at line %lld.\n",
|
||||
MLR_GLOBALS.bargv0, pstate->ilno);
|
||||
exit(1);
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "%s: internal coding error: unexpected token %d at line %lld.\n",
|
||||
MLR_GLOBALS.bargv0, token, pstate->ilno);
|
||||
exit(1);
|
||||
break;
|
||||
}
|
||||
e += matchlen;
|
||||
} else if (e >= phandle->eof) {
|
||||
// We read to end of file without seeing end of line. We can't always zero-poke a null character to
|
||||
// terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's
|
||||
// our copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking
|
||||
// at EOF is one byte past the page and that will segv us.
|
||||
char* copy = mlr_alloc_string_from_char_range(p, phandle->eof - p);
|
||||
rslls_append(pfields, copy, FREE_ENTRY_VALUE, 0);
|
||||
p = e + matchlen;
|
||||
field_done = TRUE;
|
||||
record_done = TRUE;
|
||||
break;
|
||||
} else {
|
||||
e++;
|
||||
}
|
||||
}
|
||||
|
||||
} else { // start of quoted field
|
||||
e += pstate->dquotelen;
|
||||
p = e;
|
||||
|
||||
// loop over characters in field
|
||||
field_done = FALSE;
|
||||
int contiguous = TRUE;
|
||||
// If there are no embedded double-double quotes, then the field value is a contiguous
|
||||
// array of bytes between the start and end double-quotes (non-inclusive). E.g. "ab,c"
|
||||
// has contents ab,c. In that case we can point the rslls at that range of bytes
|
||||
// with no data-copying. However, if there are embedded double-double quotes, then
|
||||
// we use the string-build logic to build up a dynamically allocated string. E.g.
|
||||
// "ab""c" becomes ab"c.
|
||||
while (!field_done) {
|
||||
if (e >= phandle->eof) {
|
||||
fprintf(stderr, "%s: unmatched double quote at line %lld.\n",
|
||||
MLR_GLOBALS.bargv0, pstate->ilno);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
rc = parse_trie_match(pstate->pdquote_parse_trie, e, phandle->eof, &token, &matchlen);
|
||||
|
||||
if (rc) {
|
||||
switch(token) {
|
||||
case DQUOTE_IFS_TOKEN: // end of field
|
||||
*e = 0;
|
||||
if (contiguous)
|
||||
rslls_append(pfields, p, NO_FREE, FIELD_QUOTED_ON_INPUT);
|
||||
else
|
||||
rslls_append(pfields, sb_finish(psb), FREE_ENTRY_VALUE, FIELD_QUOTED_ON_INPUT);
|
||||
p = e + matchlen;
|
||||
field_done = TRUE;
|
||||
break;
|
||||
case DQUOTE_IRS_TOKEN: // end of record
|
||||
case DQUOTE_IRS2_TOKEN: // end of record
|
||||
*e = 0;
|
||||
|
||||
if (pstate->do_auto_line_term) {
|
||||
if (e > p && e[-1] == '\r') {
|
||||
e[-1] = 0;
|
||||
context_set_autodetected_crlf(pctx);
|
||||
} else {
|
||||
context_set_autodetected_lf(pctx);
|
||||
}
|
||||
}
|
||||
|
||||
if (contiguous)
|
||||
rslls_append(pfields, p, NO_FREE, FIELD_QUOTED_ON_INPUT);
|
||||
else
|
||||
rslls_append(pfields, sb_finish(psb), FREE_ENTRY_VALUE, FIELD_QUOTED_ON_INPUT);
|
||||
p = e + matchlen;
|
||||
field_done = TRUE;
|
||||
record_done = TRUE;
|
||||
break;
|
||||
case DQUOTE_DQUOTE_TOKEN: // RFC-4180 CSV: "" inside a dquoted field is an escape for "
|
||||
if (contiguous) { // not anymore it isn't
|
||||
sb_append_char_range(psb, p, e);
|
||||
contiguous = FALSE;
|
||||
} else {
|
||||
sb_append_char(psb, pstate->dquote[0]);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "%s: internal coding error: unexpected token %d at line %lld.\n",
|
||||
MLR_GLOBALS.bargv0, token, pstate->ilno);
|
||||
exit(1);
|
||||
break;
|
||||
}
|
||||
e += matchlen;
|
||||
} else {
|
||||
if (!contiguous)
|
||||
sb_append_char(psb, *e);
|
||||
e++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
phandle->sol = e;
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
static lrec_t* paste_indices_and_data(lrec_reader_mmap_csv_state_t* pstate, rslls_t* pdata_fields,
|
||||
context_t* pctx)
|
||||
{
|
||||
int idx = 0;
|
||||
lrec_t* prec = lrec_unbacked_alloc();
|
||||
for (rsllse_t* pd = pdata_fields->phead; idx < pdata_fields->length && pd != NULL; pd = pd->pnext) {
|
||||
idx++;
|
||||
char key_free_flags = 0;
|
||||
char* key = low_int_to_string(idx, &key_free_flags);
|
||||
char value_free_flags = pd->free_flag;
|
||||
// Transfer pointer-free responsibility from the rslls to the lrec object
|
||||
lrec_put_ext(prec, key, pd->value, key_free_flags | value_free_flags, pd->quote_flag);
|
||||
pd->free_flag = 0;
|
||||
}
|
||||
return prec;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
static lrec_t* paste_header_and_data_ragged(lrec_reader_mmap_csv_state_t* pstate, rslls_t* pdata_fields,
|
||||
context_t* pctx)
|
||||
{
|
||||
lrec_t* prec = lrec_unbacked_alloc();
|
||||
sllse_t* ph = pstate->pheader_keeper->pkeys->phead;
|
||||
rsllse_t* pd = pdata_fields->phead;
|
||||
int idx = 0;
|
||||
int hlen = pstate->pheader_keeper->pkeys->length;
|
||||
int dlen = pdata_fields->length;
|
||||
|
||||
// Process fields up to minimum of header length and data length
|
||||
// Note that pd->pnext can be non-null due to pointer-reuse semantics of rslls,
|
||||
// so use list-length attributes for end-of-list check.
|
||||
for (idx = 0; idx < hlen && idx < dlen; idx++, ph = ph->pnext, pd = pd->pnext) {
|
||||
// Transfer pointer-free responsibility from the rslls to the lrec object
|
||||
lrec_put_ext(prec, ph->value, pd->value, pd->free_flag, pd->quote_flag);
|
||||
pd->free_flag = 0;
|
||||
}
|
||||
|
||||
if (hlen > dlen) {
|
||||
// Header is longer. Empty-fill the remaining data fields.
|
||||
// E.g. if the input looks like
|
||||
// a,b,c,d <-- header
|
||||
// 1,2 <-- data
|
||||
// then put c="", d="".
|
||||
for ( ; idx < hlen; idx++, ph = ph->pnext) {
|
||||
lrec_put_ext(prec, ph->value, "", NO_FREE, 0);
|
||||
}
|
||||
} else {
|
||||
// Data is longer. Use positional indices to label the remaining data fields.
|
||||
for ( ; idx < dlen; idx++, pd = pd->pnext) {
|
||||
char key_free_flags = 0;
|
||||
char* key = low_int_to_string(idx+1, &key_free_flags);
|
||||
char value_free_flags = pd->free_flag;
|
||||
// Transfer pointer-free responsibility from the rslls to the lrec object
|
||||
lrec_put_ext(prec, key, pd->value, key_free_flags | value_free_flags, pd->quote_flag);
|
||||
pd->free_flag = 0;
|
||||
}
|
||||
}
|
||||
|
||||
return prec;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
static lrec_t* paste_header_and_data_rectangular(lrec_reader_mmap_csv_state_t* pstate, rslls_t* pdata_fields,
|
||||
context_t* pctx)
|
||||
{
|
||||
if (pstate->pheader_keeper->pkeys->length != pdata_fields->length) {
|
||||
fprintf(stderr, "%s: Header/data length mismatch (%llu != %llu) at file \"%s\" line %lld.\n",
|
||||
MLR_GLOBALS.bargv0, pstate->pheader_keeper->pkeys->length, pdata_fields->length,
|
||||
pctx->filename, pstate->ilno);
|
||||
exit(1);
|
||||
}
|
||||
lrec_t* prec = lrec_unbacked_alloc();
|
||||
sllse_t* ph = pstate->pheader_keeper->pkeys->phead;
|
||||
rsllse_t* pd = pdata_fields->phead;
|
||||
for ( ; ph != NULL && pd != NULL; ph = ph->pnext, pd = pd->pnext) {
|
||||
// Transfer pointer-free responsibility from the rslls to the lrec object
|
||||
lrec_put_ext(prec, ph->value, pd->value, pd->free_flag, pd->quote_flag);
|
||||
pd->free_flag = 0;
|
||||
}
|
||||
return prec;
|
||||
}
|
||||
|
|
@ -1,876 +0,0 @@
|
|||
// ================================================================
|
||||
// Note: there are multiple process methods with a lot of code duplication.
|
||||
// This is intentional. Much of Miller's measured processing time is in the
|
||||
// lrec-reader process methods. This is code which needs to execute on every
|
||||
// byte of input and even moving a single runtime if-statement into a
|
||||
// function-pointer assignment at alloc time can have noticeable effects on
|
||||
// performance (5-10% in some cases).
|
||||
// ================================================================
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "cli/comment_handling.h"
|
||||
#include "lib/mlr_globals.h"
|
||||
#include "lib/mlrutil.h"
|
||||
#include "containers/slls.h"
|
||||
#include "containers/lhmslv.h"
|
||||
#include "input/file_reader_mmap.h"
|
||||
#include "input/lrec_readers.h"
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// Multi-file cases:
|
||||
//
|
||||
// a,a a,b c d
|
||||
// -- FILE1: -- FILE1: -- FILE1: -- FILE1:
|
||||
// a,b,c a,b,c a,b,c a,b,c
|
||||
// 1,2,3 1,2,3 1,2,3 1,2,3
|
||||
// 4,5,6 4,5,6 4,5,6 4,5,6
|
||||
// -- FILE2: -- FILE2:
|
||||
// a,b,c d,e,f,g a,b,c d,e,f
|
||||
// 7,8,9 3,4,5,6 7,8,9 3,4,5
|
||||
// --OUTPUT: --OUTPUT: --OUTPUT: --OUTPUT:
|
||||
// a,b,c a,b,c a,b,c a,b,c
|
||||
// 1,2,3 1,2,3 1,2,3 1,2,3
|
||||
// 4,5,6 4,5,6 4,5,6 4,5,6
|
||||
// 7,8,9 7,8,9
|
||||
// d,e,f,g d,e,f
|
||||
// 3,4,5,6 3,4,5
|
||||
// ----------------------------------------------------------------
|
||||
|
||||
// Parsing state for one memory-mapped CSV-lite reader; allocated by
// lrec_reader_mmap_csvlite_alloc and released by lrec_reader_mmap_csvlite_free.
typedef struct _lrec_reader_mmap_csvlite_state_t {
|
||||
// Zeroed at allocation and again at the start of each file.
long long ifnr;
|
||||
long long ilno; // Line-level, not record-level as in context_t
|
||||
// Record separator. When the caller passes "auto" this is replaced by "\n".
char* irs;
|
||||
// Field separator.
char* ifs;
|
||||
// Lengths of irs and ifs; both equal to 1 selects the single-character code paths.
int irslen;
|
||||
int ifslen;
|
||||
// When set, a run of consecutive field separators is skipped as if it were one.
int allow_repeat_ifs;
|
||||
// Set when the caller passed "auto" as irs: a '\r' just before the '\n' is stripped.
int do_auto_line_term;
|
||||
// When set, no header line is expected at start of file and the implicit-header
// record readers are used.
int use_implicit_csv_header;
|
||||
// When set, a header/data field-count mismatch is tolerated rather than fatal.
int allow_ragged_csv_input;
|
||||
comment_handling_t comment_handling;
|
||||
// NULL disables comment-line handling.
char* comment_string;
|
||||
// strlen(comment_string), or 0 when comment_string is NULL.
int comment_string_length;
|
||||
|
||||
// TRUE when the next line to be read should be parsed as a header line.
int expect_header_line_next;
|
||||
// Header currently in effect.
header_keeper_t* pheader_keeper;
|
||||
// Cache of header keepers, looked up by header field list; every entry is freed when the
// reader is freed.
lhmslv_t* pheader_keepers;
|
||||
} lrec_reader_mmap_csvlite_state_t;
|
||||
|
||||
// Forward declarations for the file-local functions of this reader.

// Functions installed into the lrec_reader_t by lrec_reader_mmap_csvlite_alloc.
static void lrec_reader_mmap_csvlite_free(lrec_reader_t* preader);
|
||||
static void lrec_reader_mmap_csvlite_sof(void* pvstate, void* pvhandle);
|
||||
static lrec_t* lrec_reader_mmap_csvlite_process_single_seps(void* pvstate, void* pvhandle, context_t* pctx);
|
||||
static lrec_t* lrec_reader_mmap_csvlite_process_multi_seps(void* pvstate, void* pvhandle, context_t* pctx);
|
||||
|
||||
// Header-line parsers: single-character and multi-character separator variants.
static slls_t* lrec_reader_mmap_csvlite_get_header_single_seps(file_reader_mmap_state_t* phandle,
|
||||
lrec_reader_mmap_csvlite_state_t* pstate, context_t* pctx);
|
||||
|
||||
static slls_t* lrec_reader_mmap_csvlite_get_header_multi_seps(file_reader_mmap_state_t* phandle,
|
||||
lrec_reader_mmap_csvlite_state_t* pstate);
|
||||
|
||||
// Data-line parsers which key the values by the current header.
static lrec_t* lrec_reader_mmap_csvlite_get_record_single_seps(file_reader_mmap_state_t* phandle,
|
||||
lrec_reader_mmap_csvlite_state_t* pstate, context_t* pctx, header_keeper_t* pheader_keeper, int* pend_of_stanza);
|
||||
|
||||
static lrec_t* lrec_reader_mmap_csvlite_get_record_multi_seps(file_reader_mmap_state_t* phandle,
|
||||
lrec_reader_mmap_csvlite_state_t* pstate, context_t* pctx, header_keeper_t* pheader_keeper, int* pend_of_stanza);
|
||||
|
||||
// Data-line parsers selected when use_implicit_csv_header is set.
static lrec_t* lrec_reader_mmap_csvlite_get_record_single_seps_implicit_header(file_reader_mmap_state_t* phandle,
|
||||
lrec_reader_mmap_csvlite_state_t* pstate, context_t* pctx, header_keeper_t* pheader_keeper, int* pend_of_stanza);
|
||||
|
||||
static lrec_t* lrec_reader_mmap_csvlite_get_record_multi_seps_implicit_header(file_reader_mmap_state_t* phandle,
|
||||
lrec_reader_mmap_csvlite_state_t* pstate, context_t* pctx, header_keeper_t* pheader_keeper, int* pend_of_stanza);
|
||||
|
||||
// Comment-line helpers; callers treat a nonzero return as "a line was consumed, try again".
static int handle_comment_line_single_irs(
|
||||
file_reader_mmap_state_t* phandle,
|
||||
lrec_reader_mmap_csvlite_state_t* pstate,
|
||||
char irs);
|
||||
|
||||
static int handle_comment_line_multi_irs(
|
||||
file_reader_mmap_state_t* phandle,
|
||||
lrec_reader_mmap_csvlite_state_t* pstate);
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// Allocates a memory-mapped CSV-lite record reader.
//
// irs:                     record separator, or "auto" to accept either LF or CRLF line endings.
// ifs:                     field separator.
// allow_repeat_ifs:        if true, runs of consecutive field separators are skipped as one.
// use_implicit_csv_header: if true, no header line is expected at start of file.
// allow_ragged_csv_input:  if true, header/data field-count mismatches are tolerated.
// comment_handling:        stored in the reader state for the comment-line helpers.
// comment_string:          comment prefix, or NULL for no comment handling.
//
// The irs, ifs, and comment_string pointers are retained, not copied. The returned reader is
// released through its pfree_func.
lrec_reader_t* lrec_reader_mmap_csvlite_alloc(char* irs, char* ifs, int allow_repeat_ifs, int use_implicit_csv_header,
	int allow_ragged_csv_input, comment_handling_t comment_handling, char* comment_string)
{
	lrec_reader_t* plrec_reader = mlr_malloc_or_die(sizeof(lrec_reader_t));

	lrec_reader_mmap_csvlite_state_t* pstate = mlr_malloc_or_die(sizeof(lrec_reader_mmap_csvlite_state_t));
	pstate->ifnr                     = 0LL;
	// The start-of-file hook also zeroes ilno, but the line number appears in error messages, so
	// don't leave it indeterminate in case a process function runs before that hook does.
	pstate->ilno                     = 0LL;
	pstate->irs                      = irs;
	pstate->ifs                      = ifs;
	pstate->irslen                   = strlen(irs);
	pstate->ifslen                   = strlen(ifs);
	pstate->allow_repeat_ifs         = allow_repeat_ifs;
	pstate->do_auto_line_term        = FALSE;
	pstate->use_implicit_csv_header  = use_implicit_csv_header;
	pstate->allow_ragged_csv_input   = allow_ragged_csv_input;
	pstate->comment_handling         = comment_handling;
	pstate->comment_string           = comment_string;
	pstate->comment_string_length    = comment_string == NULL ? 0 : strlen(comment_string);

	pstate->expect_header_line_next  = use_implicit_csv_header ? FALSE : TRUE;
	pstate->pheader_keeper           = NULL;
	pstate->pheader_keepers          = lhmslv_alloc();

	plrec_reader->pvstate       = (void*)pstate;
	plrec_reader->popen_func    = file_reader_mmap_vopen;
	plrec_reader->pclose_func   = file_reader_mmap_vclose;

	if (streq(irs, "auto")) {
		// Auto means either lines end in "\n" or "\r\n" (LF or CRLF). In
		// either case the final character is "\n". Then for autodetect we
		// simply check if there's a character in the line before the '\n', and
		// if that is '\r'.
		pstate->do_auto_line_term = TRUE;
		pstate->irs = "\n";
		pstate->irslen = 1;
	}

	// The single-character fast path applies only when both separators are one byte long. In
	// the auto case irslen has just been set to 1, so only ifslen decides there.
	plrec_reader->pprocess_func = (pstate->irslen == 1 && pstate->ifslen == 1)
		? lrec_reader_mmap_csvlite_process_single_seps
		: lrec_reader_mmap_csvlite_process_multi_seps;

	plrec_reader->psof_func     = lrec_reader_mmap_csvlite_sof;
	plrec_reader->pfree_func    = lrec_reader_mmap_csvlite_free;

	return plrec_reader;
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// Releases the reader: every cached header keeper, then the cache itself, then the state
// struct, then the reader struct.
static void lrec_reader_mmap_csvlite_free(lrec_reader_t* preader) {
	lrec_reader_mmap_csvlite_state_t* pstate = preader->pvstate;

	lhmslve_t* pentry = pstate->pheader_keepers->phead;
	while (pentry != NULL) {
		header_keeper_t* pkeeper = pentry->pvvalue;
		header_keeper_free(pkeeper);
		pentry = pentry->pnext;
	}

	lhmslv_free(pstate->pheader_keepers);
	free(pstate);
	free(preader);
}
|
||||
|
||||
static void lrec_reader_mmap_csvlite_sof(void* pvstate, void* pvhandle) {
|
||||
lrec_reader_mmap_csvlite_state_t* pstate = pvstate;
|
||||
pstate->ifnr = 0LL;
|
||||
pstate->ilno = 0LL;
|
||||
pstate->expect_header_line_next = pstate->use_implicit_csv_header ? FALSE : TRUE;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// Returns the next record, or NULL at end of input, for single-character IRS and IFS.
//
// When a header line is due, it is read and checked for empty keys (fatal), and a header keeper
// is looked up in, or added to, the header cache. A data line is then read. If the record
// reader reports end of stanza, a new header line is expected and the loop goes around again.
static lrec_t* lrec_reader_mmap_csvlite_process_single_seps(void* pvstate, void* pvhandle, context_t* pctx) {
	lrec_reader_mmap_csvlite_state_t* pstate = pvstate;
	file_reader_mmap_state_t* phandle = pvhandle;

	for (;;) {
		if (pstate->expect_header_line_next) {
			slls_t* pnames = lrec_reader_mmap_csvlite_get_header_single_seps(phandle, pstate, pctx);
			if (pnames == NULL) // EOF
				return NULL;

			sllse_t* pname = pnames->phead;
			while (pname != NULL) {
				if (pname->value[0] == 0) {
					fprintf(stderr, "%s: unacceptable empty CSV key at file \"%s\" line %lld.\n",
						MLR_GLOBALS.bargv0, pctx->filename, pstate->ilno);
					exit(1);
				}
				pname = pname->pnext;
			}

			header_keeper_t* pkeeper = lhmslv_get(pstate->pheader_keepers, pnames);
			if (pkeeper != NULL) {
				// This header is already cached: keep the cached keeper and drop the new list.
				pstate->pheader_keeper = pkeeper;
				slls_free(pnames);
			} else {
				pstate->pheader_keeper = header_keeper_alloc(NULL, pnames);
				lhmslv_put(pstate->pheader_keepers, pnames, pstate->pheader_keeper,
					NO_FREE); // freed by header-keeper
			}
			pstate->expect_header_line_next = FALSE;
		}

		int end_of_stanza = FALSE;
		lrec_t* prec;
		if (pstate->use_implicit_csv_header) {
			prec = lrec_reader_mmap_csvlite_get_record_single_seps_implicit_header(phandle, pstate, pctx,
				pstate->pheader_keeper, &end_of_stanza);
		} else {
			prec = lrec_reader_mmap_csvlite_get_record_single_seps(phandle, pstate, pctx,
				pstate->pheader_keeper, &end_of_stanza);
		}

		// Either a record, or NULL for EOF.
		if (!end_of_stanza)
			return prec;

		pstate->expect_header_line_next = TRUE;
	}
}
|
||||
|
||||
// Returns the next record, or NULL at end of input, for multi-character IRS and/or IFS.
//
// When a header line is due, it is read and checked for empty keys (fatal), and a header keeper
// is looked up in, or added to, the header cache. A data line is then read. If the record
// reader reports end of stanza, a new header line is expected and the loop goes around again.
static lrec_t* lrec_reader_mmap_csvlite_process_multi_seps(void* pvstate, void* pvhandle, context_t* pctx) {
	lrec_reader_mmap_csvlite_state_t* pstate = pvstate;
	file_reader_mmap_state_t* phandle = pvhandle;

	for (;;) {
		if (pstate->expect_header_line_next) {
			slls_t* pnames = lrec_reader_mmap_csvlite_get_header_multi_seps(phandle, pstate);
			if (pnames == NULL) // EOF
				return NULL;

			sllse_t* pname = pnames->phead;
			while (pname != NULL) {
				if (pname->value[0] == 0) {
					fprintf(stderr, "%s: unacceptable empty CSV key at file \"%s\" line %lld.\n",
						MLR_GLOBALS.bargv0, pctx->filename, pstate->ilno);
					exit(1);
				}
				pname = pname->pnext;
			}

			header_keeper_t* pkeeper = lhmslv_get(pstate->pheader_keepers, pnames);
			if (pkeeper != NULL) {
				// This header is already cached: keep the cached keeper and drop the new list.
				pstate->pheader_keeper = pkeeper;
				slls_free(pnames);
			} else {
				pstate->pheader_keeper = header_keeper_alloc(NULL, pnames);
				lhmslv_put(pstate->pheader_keepers, pnames, pstate->pheader_keeper,
					NO_FREE); // freed by header-keeper
			}
			pstate->expect_header_line_next = FALSE;
		}

		int end_of_stanza = FALSE;
		lrec_t* prec;
		if (pstate->use_implicit_csv_header) {
			prec = lrec_reader_mmap_csvlite_get_record_multi_seps_implicit_header(phandle, pstate, pctx,
				pstate->pheader_keeper, &end_of_stanza);
		} else {
			prec = lrec_reader_mmap_csvlite_get_record_multi_seps(phandle, pstate, pctx,
				pstate->pheader_keeper, &end_of_stanza);
		}

		// Either a record, or NULL for EOF.
		if (!end_of_stanza)
			return prec;

		pstate->expect_header_line_next = TRUE;
	}
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
static slls_t* lrec_reader_mmap_csvlite_get_header_single_seps(file_reader_mmap_state_t* phandle,
|
||||
lrec_reader_mmap_csvlite_state_t* pstate, context_t* pctx)
|
||||
{
|
||||
char irs = pstate->irs[0];
|
||||
char ifs = pstate->ifs[0];
|
||||
int allow_repeat_ifs = pstate->allow_repeat_ifs;
|
||||
|
||||
slls_t* pheader_names = slls_alloc();
|
||||
|
||||
// Skip blank/comment lines and seek to header line
|
||||
while (TRUE) {
|
||||
if (phandle->sol < phandle->eof && *phandle->sol == irs) {
|
||||
phandle->sol++;
|
||||
pstate->ilno++;
|
||||
continue;
|
||||
}
|
||||
if (pstate->comment_string != NULL && handle_comment_line_single_irs(phandle, pstate, irs)) {
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
char* p = phandle->sol;
|
||||
if (allow_repeat_ifs) {
|
||||
while (*p == ifs)
|
||||
p++;
|
||||
}
|
||||
char* osol = p;
|
||||
char* header_name = p;
|
||||
|
||||
for ( ; p < phandle->eof && *p; ) {
|
||||
if (*p == irs) {
|
||||
*p = 0;
|
||||
|
||||
if (pstate->do_auto_line_term) {
|
||||
if (p > phandle->sol && p[-1] == '\r') {
|
||||
p[-1] = 0;
|
||||
context_set_autodetected_crlf(pctx);
|
||||
} else {
|
||||
context_set_autodetected_lf(pctx);
|
||||
}
|
||||
}
|
||||
|
||||
phandle->sol = p+1;
|
||||
pstate->ilno++;
|
||||
break;
|
||||
} else if (*p == ifs) {
|
||||
*p = 0;
|
||||
|
||||
slls_append_no_free(pheader_names, header_name);
|
||||
|
||||
p++;
|
||||
if (allow_repeat_ifs) {
|
||||
while (*p == ifs)
|
||||
p++;
|
||||
}
|
||||
header_name = p;
|
||||
} else {
|
||||
p++;
|
||||
}
|
||||
}
|
||||
if (allow_repeat_ifs && *header_name == 0) {
|
||||
// OK
|
||||
} else if (p == osol) {
|
||||
// OK
|
||||
} else {
|
||||
slls_append_no_free(pheader_names, header_name);
|
||||
}
|
||||
|
||||
return pheader_names;
|
||||
}
|
||||
|
||||
static slls_t* lrec_reader_mmap_csvlite_get_header_multi_seps(file_reader_mmap_state_t* phandle,
|
||||
lrec_reader_mmap_csvlite_state_t* pstate)
|
||||
{
|
||||
char* irs = pstate->irs;
|
||||
char* ifs = pstate->ifs;
|
||||
int irslen = pstate->irslen;
|
||||
int ifslen = pstate->ifslen;
|
||||
int allow_repeat_ifs = pstate->allow_repeat_ifs;
|
||||
|
||||
// Skip blank/comment lines and seek to header line
|
||||
while (TRUE) {
|
||||
if ((phandle->eof - phandle->sol) >= irslen && streqn(phandle->sol, irs, irslen)) {
|
||||
phandle->sol += irslen;
|
||||
pstate->ilno++;
|
||||
continue;
|
||||
}
|
||||
if (pstate->comment_string != NULL && handle_comment_line_multi_irs(phandle, pstate)) {
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
slls_t* pheader_names = slls_alloc();
|
||||
|
||||
// Parse the header line
|
||||
char* p = phandle->sol;
|
||||
if (allow_repeat_ifs) {
|
||||
while (streqn(p, ifs, ifslen))
|
||||
p += ifslen;
|
||||
}
|
||||
char* osol = p;
|
||||
char* header_name = p;
|
||||
|
||||
for ( ; p < phandle->eof && *p; ) {
|
||||
if (streqn(p, irs, irslen)) {
|
||||
*p = 0;
|
||||
phandle->sol = p + irslen;
|
||||
pstate->ilno++;
|
||||
break;
|
||||
} else if (streqn(p, ifs, ifslen)) {
|
||||
*p = 0;
|
||||
|
||||
slls_append_no_free(pheader_names, header_name);
|
||||
|
||||
p += ifslen;
|
||||
if (allow_repeat_ifs) {
|
||||
while (streqn(p, ifs, ifslen))
|
||||
p += ifslen;
|
||||
}
|
||||
header_name = p;
|
||||
} else {
|
||||
p++;
|
||||
}
|
||||
}
|
||||
if (allow_repeat_ifs && *header_name == 0) {
|
||||
// OK
|
||||
} else if (p == osol) {
|
||||
// OK
|
||||
} else {
|
||||
slls_append_no_free(pheader_names, header_name);
|
||||
}
|
||||
|
||||
return pheader_names;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
static lrec_t* lrec_reader_mmap_csvlite_get_record_single_seps(file_reader_mmap_state_t* phandle,
|
||||
lrec_reader_mmap_csvlite_state_t* pstate, context_t* pctx, header_keeper_t* pheader_keeper, int* pend_of_stanza)
|
||||
{
|
||||
char irs = pstate->irs[0];
|
||||
char ifs = pstate->ifs[0];
|
||||
int allow_repeat_ifs = pstate->allow_repeat_ifs;
|
||||
|
||||
// Skip comment lines
|
||||
if (pstate->comment_string != NULL) {
|
||||
while (handle_comment_line_single_irs(phandle, pstate, irs))
|
||||
;
|
||||
}
|
||||
|
||||
if (phandle->sol >= phandle->eof)
|
||||
return NULL;
|
||||
|
||||
char* line = phandle->sol;
|
||||
lrec_t* prec = lrec_unbacked_alloc();
|
||||
|
||||
sllse_t* pe = pheader_keeper->pkeys->phead;
|
||||
char* p = line;
|
||||
if (allow_repeat_ifs) {
|
||||
while (*p == ifs)
|
||||
p++;
|
||||
}
|
||||
char* key = NULL;
|
||||
char* value = p;
|
||||
int saw_rs = FALSE;
|
||||
int idx = 0;
|
||||
for ( ; p < phandle->eof && *p; ) {
|
||||
if (*p == irs) {
|
||||
if (p == line) {
|
||||
*pend_of_stanza = TRUE;
|
||||
lrec_free(prec);
|
||||
return NULL;
|
||||
}
|
||||
*p = 0;
|
||||
|
||||
if (pstate->do_auto_line_term) {
|
||||
if (p > line && p[-1] == '\r') {
|
||||
p[-1] = 0;
|
||||
context_set_autodetected_crlf(pctx);
|
||||
} else {
|
||||
context_set_autodetected_lf(pctx);
|
||||
}
|
||||
}
|
||||
|
||||
phandle->sol = p+1;
|
||||
pstate->ilno++;
|
||||
saw_rs = TRUE;
|
||||
break;
|
||||
} else if (*p == ifs) {
|
||||
*p = 0;
|
||||
idx++;
|
||||
if (pe == NULL) {
|
||||
// Data line has more fields than the header line did
|
||||
if (pstate->allow_ragged_csv_input) {
|
||||
char free_flags = NO_FREE;
|
||||
key = low_int_to_string(idx, &free_flags);
|
||||
lrec_put(prec, key, value, free_flags);
|
||||
} else {
|
||||
fprintf(stderr, "%s: Header-data length mismatch in file %s at line %lld.\n",
|
||||
MLR_GLOBALS.bargv0, pctx->filename, pstate->ilno);
|
||||
exit(1);
|
||||
}
|
||||
} else {
|
||||
key = pe->value;
|
||||
pe = pe->pnext;
|
||||
lrec_put(prec, key, value, NO_FREE);
|
||||
}
|
||||
p++;
|
||||
if (allow_repeat_ifs) {
|
||||
while (*p == ifs)
|
||||
p++;
|
||||
}
|
||||
value = p;
|
||||
} else {
|
||||
p++;
|
||||
}
|
||||
}
|
||||
if (p >= phandle->eof)
|
||||
phandle->sol = p+1;
|
||||
|
||||
if (allow_repeat_ifs && *value == 0)
|
||||
return prec;
|
||||
|
||||
char free_flags = NO_FREE;
|
||||
if (pe == NULL) {
|
||||
// Data line has more fields than the header line did
|
||||
if (pstate->allow_ragged_csv_input) {
|
||||
idx++;
|
||||
key = low_int_to_string(idx, &free_flags);
|
||||
} else {
|
||||
fprintf(stderr, "%s: Header-data length mismatch in file %s at line %lld.\n",
|
||||
MLR_GLOBALS.bargv0, pctx->filename, pstate->ilno);
|
||||
exit(1);
|
||||
}
|
||||
} else {
|
||||
key = pe->value;
|
||||
}
|
||||
|
||||
if (saw_rs) {
|
||||
// Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate the
|
||||
// C string so it's OK to retain a pointer to that.
|
||||
lrec_put(prec, key, value, NO_FREE);
|
||||
} else {
|
||||
// Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character
|
||||
// to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our
|
||||
// copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one
|
||||
// byte past the page and that will segv us.
|
||||
char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value);
|
||||
lrec_put(prec, key, copy, FREE_ENTRY_VALUE);
|
||||
}
|
||||
|
||||
if (pe != NULL && pe->pnext != NULL) {
|
||||
// Header line has more fields than the data line did
|
||||
if (pstate->allow_ragged_csv_input) {
|
||||
for (pe = pe->pnext ; pe != NULL; pe = pe->pnext) {
|
||||
key = pe->value;
|
||||
lrec_put(prec, key, "", NO_FREE);
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "%s: Header-data length mismatch in file %s at line %lld.\n",
|
||||
MLR_GLOBALS.bargv0, pctx->filename, pstate->ilno);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
return prec;
|
||||
}
|
||||
|
||||
static lrec_t* lrec_reader_mmap_csvlite_get_record_multi_seps(file_reader_mmap_state_t* phandle,
|
||||
lrec_reader_mmap_csvlite_state_t* pstate, context_t* pctx, header_keeper_t* pheader_keeper, int* pend_of_stanza)
|
||||
{
|
||||
// Skip comment lines
|
||||
if (pstate->comment_string != NULL) {
|
||||
while (handle_comment_line_multi_irs(phandle, pstate))
|
||||
;
|
||||
}
|
||||
if (phandle->sol >= phandle->eof)
|
||||
return NULL;
|
||||
|
||||
char* irs = pstate->irs;
|
||||
char* ifs = pstate->ifs;
|
||||
int irslen = pstate->irslen;
|
||||
int ifslen = pstate->ifslen;
|
||||
int allow_repeat_ifs = pstate->allow_repeat_ifs;
|
||||
|
||||
lrec_t* prec = lrec_unbacked_alloc();
|
||||
char* line = phandle->sol;
|
||||
|
||||
sllse_t* pe = pheader_keeper->pkeys->phead;
|
||||
char* p = line;
|
||||
if (allow_repeat_ifs) {
|
||||
while (streqn(p, ifs, ifslen))
|
||||
p += ifslen;
|
||||
}
|
||||
char* key = NULL;
|
||||
char* value = p;
|
||||
int saw_rs = FALSE;
|
||||
int idx = 0;
|
||||
for ( ; p < phandle->eof && *p; ) {
|
||||
if (streqn(p, irs, irslen)) {
|
||||
if (p == line) {
|
||||
*pend_of_stanza = TRUE;
|
||||
lrec_free(prec);
|
||||
return NULL;
|
||||
}
|
||||
*p = 0;
|
||||
phandle->sol = p + irslen;
|
||||
pstate->ilno++;
|
||||
saw_rs = TRUE;
|
||||
break;
|
||||
} else if (streqn(p, ifs, ifslen)) {
|
||||
*p = 0;
|
||||
idx++;
|
||||
if (pe == NULL) {
|
||||
// Data line has more fields than the header line did
|
||||
if (pstate->allow_ragged_csv_input) {
|
||||
char free_flags = NO_FREE;
|
||||
key = low_int_to_string(idx, &free_flags);
|
||||
lrec_put(prec, key, value, free_flags);
|
||||
} else {
|
||||
fprintf(stderr, "%s: Header-data length mismatch in file %s at line %lld.\n",
|
||||
MLR_GLOBALS.bargv0, pctx->filename, pstate->ilno);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
key = pe->value;
|
||||
pe = pe->pnext;
|
||||
lrec_put(prec, key, value, NO_FREE);
|
||||
|
||||
p += ifslen;
|
||||
if (allow_repeat_ifs) {
|
||||
while (streqn(p, ifs, ifslen))
|
||||
p += ifslen;
|
||||
}
|
||||
value = p;
|
||||
} else {
|
||||
p++;
|
||||
}
|
||||
}
|
||||
if (p >= phandle->eof)
|
||||
phandle->sol = p+1;
|
||||
|
||||
if (allow_repeat_ifs && *value == 0)
|
||||
return prec;
|
||||
|
||||
char free_flags = NO_FREE;
|
||||
if (pe == NULL) {
|
||||
// Data line has more fields than the header line did
|
||||
if (pstate->allow_ragged_csv_input) {
|
||||
idx++;
|
||||
key = low_int_to_string(idx, &free_flags);
|
||||
} else {
|
||||
fprintf(stderr, "%s: Header-data length mismatch in file %s at line %lld.\n",
|
||||
MLR_GLOBALS.bargv0, pctx->filename, pstate->ilno);
|
||||
exit(1);
|
||||
}
|
||||
} else {
|
||||
key = pe->value;
|
||||
}
|
||||
|
||||
if (saw_rs) {
|
||||
// Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate the
|
||||
// C string so it's OK to retain a pointer to that.
|
||||
lrec_put(prec, key, value, NO_FREE);
|
||||
} else {
|
||||
// Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character
|
||||
// to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our
|
||||
// copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one
|
||||
// byte past the page and that will segv us.
|
||||
char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value);
|
||||
lrec_put(prec, key, copy, FREE_ENTRY_VALUE);
|
||||
}
|
||||
|
||||
if (pe != NULL && pe->pnext != NULL) {
|
||||
// Header line has more fields than the data line did
|
||||
if (pstate->allow_ragged_csv_input) {
|
||||
for (pe = pe->pnext ; pe != NULL; pe = pe->pnext) {
|
||||
key = pe->value;
|
||||
lrec_put(prec, key, "", NO_FREE);
|
||||
}
|
||||
} else {
|
||||
fprintf(stderr, "%s: Header-data length mismatch in file %s at line %lld.\n",
|
||||
MLR_GLOBALS.bargv0, pctx->filename, pstate->ilno);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
return prec;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
static lrec_t* lrec_reader_mmap_csvlite_get_record_single_seps_implicit_header(file_reader_mmap_state_t* phandle,
|
||||
lrec_reader_mmap_csvlite_state_t* pstate, context_t* pctx, header_keeper_t* pheader_keeper, int* pend_of_stanza)
|
||||
{
|
||||
char irs = pstate->irs[0];
|
||||
char ifs = pstate->ifs[0];
|
||||
int allow_repeat_ifs = pstate->allow_repeat_ifs;
|
||||
|
||||
// Skip comment lines
|
||||
if (pstate->comment_string != NULL) {
|
||||
while (handle_comment_line_single_irs(phandle, pstate, irs))
|
||||
;
|
||||
}
|
||||
if (phandle->sol >= phandle->eof)
|
||||
return NULL;
|
||||
|
||||
lrec_t* prec = lrec_unbacked_alloc();
|
||||
char* line = phandle->sol;
|
||||
|
||||
char* p = line;
|
||||
if (allow_repeat_ifs) {
|
||||
while (*p == ifs)
|
||||
p++;
|
||||
}
|
||||
char* key = NULL;
|
||||
char* value = p;
|
||||
char free_flags = NO_FREE;
|
||||
int idx = 0;
|
||||
int saw_rs = FALSE;
|
||||
for ( ; p < phandle->eof && *p; ) {
|
||||
if (*p == irs) {
|
||||
if (p == line) {
|
||||
*pend_of_stanza = TRUE;
|
||||
lrec_free(prec);
|
||||
return NULL;
|
||||
}
|
||||
*p = 0;
|
||||
|
||||
if (pstate->do_auto_line_term) {
|
||||
if (p > line && p[-1] == '\r') {
|
||||
p[-1] = 0;
|
||||
context_set_autodetected_crlf(pctx);
|
||||
} else {
|
||||
context_set_autodetected_lf(pctx);
|
||||
}
|
||||
}
|
||||
|
||||
phandle->sol = p+1;
|
||||
pstate->ilno++;
|
||||
saw_rs = TRUE;
|
||||
break;
|
||||
} else if (*p == ifs) {
|
||||
*p = 0;
|
||||
key = low_int_to_string(++idx, &free_flags);
|
||||
lrec_put(prec, key, value, free_flags);
|
||||
p++;
|
||||
if (allow_repeat_ifs) {
|
||||
while (*p == ifs)
|
||||
p++;
|
||||
}
|
||||
value = p;
|
||||
} else {
|
||||
p++;
|
||||
}
|
||||
}
|
||||
if (p >= phandle->eof)
|
||||
phandle->sol = p+1;
|
||||
|
||||
if (allow_repeat_ifs && *value == 0)
|
||||
return prec;
|
||||
|
||||
key = low_int_to_string(++idx, &free_flags);
|
||||
|
||||
if (saw_rs) {
|
||||
// Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate the
|
||||
// C string so it's OK to retain a pointer to that.
|
||||
lrec_put(prec, key, value, free_flags);
|
||||
} else {
|
||||
// Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character
|
||||
// to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our
|
||||
// copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one
|
||||
// byte past the page and that will segv us.
|
||||
char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value);
|
||||
lrec_put(prec, key, copy, free_flags|FREE_ENTRY_VALUE);
|
||||
}
|
||||
|
||||
return prec;
|
||||
}
|
||||
|
||||
static lrec_t* lrec_reader_mmap_csvlite_get_record_multi_seps_implicit_header(file_reader_mmap_state_t* phandle,
|
||||
lrec_reader_mmap_csvlite_state_t* pstate, context_t* pctx, header_keeper_t* pheader_keeper, int* pend_of_stanza)
|
||||
{
|
||||
// Skip comment lines
|
||||
if (pstate->comment_string != NULL) {
|
||||
while (handle_comment_line_multi_irs(phandle, pstate))
|
||||
;
|
||||
}
|
||||
if (phandle->sol >= phandle->eof)
|
||||
return NULL;
|
||||
|
||||
char* irs = pstate->irs;
|
||||
char* ifs = pstate->ifs;
|
||||
int irslen = pstate->irslen;
|
||||
int ifslen = pstate->ifslen;
|
||||
int allow_repeat_ifs = pstate->allow_repeat_ifs;
|
||||
|
||||
lrec_t* prec = lrec_unbacked_alloc();
|
||||
char* line = phandle->sol;
|
||||
|
||||
char* p = line;
|
||||
if (allow_repeat_ifs) {
|
||||
while (streqn(p, ifs, ifslen))
|
||||
p += ifslen;
|
||||
}
|
||||
char* key = NULL;
|
||||
char* value = p;
|
||||
char free_flags;
|
||||
int idx = 0;
|
||||
int saw_rs = FALSE;
|
||||
for ( ; p < phandle->eof && *p; ) {
|
||||
if (streqn(p, irs, irslen)) {
|
||||
if (p == line) {
|
||||
*pend_of_stanza = TRUE;
|
||||
lrec_free(prec);
|
||||
return NULL;
|
||||
}
|
||||
*p = 0;
|
||||
phandle->sol = p + irslen;
|
||||
pstate->ilno++;
|
||||
saw_rs = TRUE;
|
||||
break;
|
||||
} else if (streqn(p, ifs, ifslen)) {
|
||||
*p = 0;
|
||||
key = low_int_to_string(++idx, &free_flags);
|
||||
lrec_put(prec, key, value, free_flags);
|
||||
|
||||
p += ifslen;
|
||||
if (allow_repeat_ifs) {
|
||||
while (streqn(p, ifs, ifslen))
|
||||
p += ifslen;
|
||||
}
|
||||
value = p;
|
||||
} else {
|
||||
p++;
|
||||
}
|
||||
}
|
||||
if (p >= phandle->eof)
|
||||
phandle->sol = p+1;
|
||||
|
||||
if (allow_repeat_ifs && *value == 0)
|
||||
return prec;
|
||||
|
||||
key = low_int_to_string(++idx, &free_flags);
|
||||
|
||||
if (saw_rs) {
|
||||
// Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate the
|
||||
// C string so it's OK to retain a pointer to that.
|
||||
lrec_put(prec, key, value, free_flags);
|
||||
} else {
|
||||
// Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character
|
||||
// to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our
|
||||
// copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one
|
||||
// byte past the page and that will segv us.
|
||||
char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value);
|
||||
lrec_put(prec, key, copy, free_flags|FREE_ENTRY_VALUE);
|
||||
}
|
||||
|
||||
return prec;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
static int handle_comment_line_single_irs(
|
||||
file_reader_mmap_state_t* phandle,
|
||||
lrec_reader_mmap_csvlite_state_t* pstate,
|
||||
char irs)
|
||||
{
|
||||
if ((phandle->eof - phandle->sol) >= pstate->comment_string_length
|
||||
&& streqn(phandle->sol, pstate->comment_string, pstate->comment_string_length))
|
||||
{
|
||||
if (pstate->comment_handling == PASS_COMMENTS)
|
||||
for (int i = 0; i < pstate->comment_string_length; i++)
|
||||
fputc(phandle->sol[i], stdout);
|
||||
phandle->sol += pstate->comment_string_length;
|
||||
while (phandle->sol < phandle->eof && *phandle->sol != irs) {
|
||||
if (pstate->comment_handling == PASS_COMMENTS)
|
||||
fputc(*phandle->sol, stdout);
|
||||
phandle->sol++;
|
||||
}
|
||||
if (phandle->sol < phandle->eof && *phandle->sol == irs) {
|
||||
if (pstate->comment_handling == PASS_COMMENTS)
|
||||
fputc(*phandle->sol, stdout);
|
||||
phandle->sol++;
|
||||
}
|
||||
pstate->ilno++;
|
||||
return TRUE;
|
||||
} else {
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
static int handle_comment_line_multi_irs(
|
||||
file_reader_mmap_state_t* phandle,
|
||||
lrec_reader_mmap_csvlite_state_t* pstate)
|
||||
{
|
||||
if ((phandle->eof - phandle->sol) >= pstate->comment_string_length
|
||||
&& streqn(phandle->sol, pstate->comment_string, pstate->comment_string_length))
|
||||
{
|
||||
if (pstate->comment_handling == PASS_COMMENTS)
|
||||
for (int i = 0; i < pstate->comment_string_length; i++)
|
||||
fputc(phandle->sol[i], stdout);
|
||||
phandle->sol += pstate->comment_string_length;
|
||||
while ((phandle->eof - phandle->sol >= pstate->irslen) && !streqn(phandle->sol, pstate->irs, pstate->irslen)) {
|
||||
if (pstate->comment_handling == PASS_COMMENTS)
|
||||
fputc(*phandle->sol, stdout);
|
||||
phandle->sol++;
|
||||
}
|
||||
if ((phandle->eof - phandle->sol >= pstate->irslen) && streqn(phandle->sol, pstate->irs, pstate->irslen)) {
|
||||
if (pstate->comment_handling == PASS_COMMENTS)
|
||||
for (int i = 0; i < pstate->irslen; i++)
|
||||
fputc(phandle->sol[i], stdout);
|
||||
phandle->sol += pstate->irslen;
|
||||
}
|
||||
pstate->ilno++;
|
||||
return TRUE;
|
||||
} else {
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,683 +0,0 @@
|
|||
// ================================================================
|
||||
// Note: there are multiple process methods with a lot of code duplication.
|
||||
// This is intentional. Much of Miller's measured processing time is in the
|
||||
// lrec-reader process methods. This is code which needs to execute on every
|
||||
// byte of input and even moving a single runtime if-statement into a
|
||||
// function-pointer assignment at alloc time can have noticeable effects on
|
||||
// performance (5-10% in some cases).
|
||||
// ================================================================
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "cli/comment_handling.h"
|
||||
#include "lib/mlr_globals.h"
|
||||
#include "lib/mlrutil.h"
|
||||
#include "input/file_reader_mmap.h"
|
||||
#include "input/lrec_readers.h"
|
||||
|
||||
typedef struct _lrec_reader_mmap_dkvp_state_t {
|
||||
char* irs;
|
||||
char* ifs;
|
||||
char* ips;
|
||||
int irslen;
|
||||
int ifslen;
|
||||
int ipslen;
|
||||
int allow_repeat_ifs;
|
||||
int do_auto_line_term;
|
||||
comment_handling_t comment_handling;
|
||||
char* comment_string;
|
||||
int comment_string_length;
|
||||
} lrec_reader_mmap_dkvp_state_t;
|
||||
|
||||
static void lrec_reader_mmap_dkvp_free(lrec_reader_t* preader);
|
||||
static void lrec_reader_mmap_dkvp_sof(void* pvstate, void* pvhandle);
|
||||
static lrec_t* lrec_reader_mmap_dkvp_process_single_irs_single_others(void* pvstate, void* pvhandle, context_t* pctx);
|
||||
static lrec_t* lrec_reader_mmap_dkvp_process_single_irs_multi_others(void* pvstate, void* pvhandle, context_t* pctx);
|
||||
static lrec_t* lrec_reader_mmap_dkvp_process_multi_irs_single_others(void* pvstate, void* pvhandle, context_t* pctx);
|
||||
static lrec_t* lrec_reader_mmap_dkvp_process_multi_irs_multi_others(void* pvstate, void* pvhandle, context_t* pctx);
|
||||
|
||||
static lrec_t* lrec_parse_mmap_dkvp_single_irs_single_others(file_reader_mmap_state_t *phandle,
|
||||
char irs, char ifs, char ips, lrec_reader_mmap_dkvp_state_t* pstate, context_t* pctx);
|
||||
|
||||
static lrec_t* lrec_parse_mmap_dkvp_single_irs_multi_others(file_reader_mmap_state_t *phandle,
|
||||
char irs, lrec_reader_mmap_dkvp_state_t* pstate, context_t* pctx);
|
||||
|
||||
static lrec_t* lrec_parse_mmap_dkvp_multi_irs_single_others(file_reader_mmap_state_t *phandle,
|
||||
char ifs, char ips, lrec_reader_mmap_dkvp_state_t* pstate, context_t* pctx);
|
||||
|
||||
static lrec_t* lrec_parse_mmap_dkvp_multi_irs_multi_others(file_reader_mmap_state_t *phandle,
|
||||
lrec_reader_mmap_dkvp_state_t* pstate, context_t* pctx);
|
||||
|
||||
static void skip_over_comment_lines_single_irs(
|
||||
file_reader_mmap_state_t *phandle,
|
||||
lrec_reader_mmap_dkvp_state_t* pstate,
|
||||
char irs);
|
||||
|
||||
static void skip_over_comment_lines_multi_irs(
|
||||
file_reader_mmap_state_t *phandle,
|
||||
lrec_reader_mmap_dkvp_state_t* pstate,
|
||||
char* irs,
|
||||
int irslen);
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
lrec_reader_t* lrec_reader_mmap_dkvp_alloc(char* irs, char* ifs, char* ips, int allow_repeat_ifs,
|
||||
comment_handling_t comment_handling, char* comment_string)
|
||||
{
|
||||
lrec_reader_t* plrec_reader = mlr_malloc_or_die(sizeof(lrec_reader_t));
|
||||
|
||||
lrec_reader_mmap_dkvp_state_t* pstate = mlr_malloc_or_die(sizeof(lrec_reader_mmap_dkvp_state_t));
|
||||
pstate->irs = irs;
|
||||
pstate->ifs = ifs;
|
||||
pstate->ips = ips;
|
||||
pstate->irslen = strlen(irs);
|
||||
pstate->ifslen = strlen(ifs);
|
||||
pstate->ipslen = strlen(ips);
|
||||
pstate->allow_repeat_ifs = allow_repeat_ifs;
|
||||
pstate->do_auto_line_term = FALSE;
|
||||
pstate->comment_handling = comment_handling;
|
||||
pstate->comment_string = comment_string;
|
||||
pstate->comment_string_length = comment_string == NULL ? 0 : strlen(comment_string);
|
||||
|
||||
plrec_reader->pvstate = (void*)pstate;
|
||||
plrec_reader->popen_func = file_reader_mmap_vopen;
|
||||
plrec_reader->pclose_func = file_reader_mmap_vclose;
|
||||
if (streq(irs, "auto")) {
|
||||
// Auto means either lines end in "\n" or "\r\n" (LF or CRLF). In
|
||||
// either case the final character is "\n". Then for autodetect we
|
||||
// simply check if there's a character in the line before the '\n', and
|
||||
// if that is '\r'.
|
||||
pstate->do_auto_line_term = TRUE;
|
||||
pstate->irs = "\n";
|
||||
pstate->irslen = 1;
|
||||
plrec_reader->pprocess_func = (pstate->ifslen == 1 && pstate->ipslen == 1)
|
||||
? lrec_reader_mmap_dkvp_process_single_irs_single_others
|
||||
: lrec_reader_mmap_dkvp_process_single_irs_multi_others;
|
||||
} else if (pstate->irslen == 1) {
|
||||
plrec_reader->pprocess_func = (pstate->ifslen == 1 && pstate->ipslen == 1)
|
||||
? lrec_reader_mmap_dkvp_process_single_irs_single_others
|
||||
: lrec_reader_mmap_dkvp_process_single_irs_multi_others;
|
||||
} else {
|
||||
plrec_reader->pprocess_func = (pstate->ifslen == 1 && pstate->ipslen == 1)
|
||||
? lrec_reader_mmap_dkvp_process_multi_irs_single_others
|
||||
: lrec_reader_mmap_dkvp_process_multi_irs_multi_others;
|
||||
}
|
||||
plrec_reader->psof_func = lrec_reader_mmap_dkvp_sof;
|
||||
plrec_reader->pfree_func = lrec_reader_mmap_dkvp_free;
|
||||
|
||||
return plrec_reader;
|
||||
}
|
||||
|
||||
// Releases the reader and its state struct. The separator and comment strings the state points at are not
// freed here.
static void lrec_reader_mmap_dkvp_free(lrec_reader_t* preader) {
	lrec_reader_mmap_dkvp_state_t* pstate = preader->pvstate;
	free(pstate);
	free(preader);
}
|
||||
|
||||
// Start-of-file hook. This reader carries no per-file state, so there is nothing to reset.
static void lrec_reader_mmap_dkvp_sof(void* pvstate, void* pvhandle) {
	(void)pvstate;
	(void)pvhandle;
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// Process function for single-character IRS, IFS, and IPS: returns the next record, or NULL once the cursor
// has reached end of file.
static lrec_t* lrec_reader_mmap_dkvp_process_single_irs_single_others(void* pvstate, void* pvhandle, context_t* pctx) {
	lrec_reader_mmap_dkvp_state_t* pstate = pvstate;
	file_reader_mmap_state_t* phandle = pvhandle;

	if (phandle->sol >= phandle->eof)
		return NULL;

	char irs = pstate->irs[0];
	char ifs = pstate->ifs[0];
	char ips = pstate->ips[0];
	return lrec_parse_mmap_dkvp_single_irs_single_others(phandle, irs, ifs, ips, pstate, pctx);
}
|
||||
|
||||
// Process function for single-character IRS with multi-character IFS and/or IPS: returns the next record, or
// NULL once the cursor has reached end of file.
static lrec_t* lrec_reader_mmap_dkvp_process_single_irs_multi_others(void* pvstate, void* pvhandle, context_t* pctx) {
	lrec_reader_mmap_dkvp_state_t* pstate = pvstate;
	file_reader_mmap_state_t* phandle = pvhandle;

	if (phandle->sol >= phandle->eof)
		return NULL;

	char irs = pstate->irs[0];
	return lrec_parse_mmap_dkvp_single_irs_multi_others(phandle, irs, pstate, pctx);
}
|
||||
|
||||
// Process function for multi-character IRS with single-character IFS and IPS: returns the next record, or
// NULL once the cursor has reached end of file.
static lrec_t* lrec_reader_mmap_dkvp_process_multi_irs_single_others(void* pvstate, void* pvhandle, context_t* pctx) {
	lrec_reader_mmap_dkvp_state_t* pstate = pvstate;
	file_reader_mmap_state_t* phandle = pvhandle;

	if (phandle->sol >= phandle->eof)
		return NULL;

	char ifs = pstate->ifs[0];
	char ips = pstate->ips[0];
	return lrec_parse_mmap_dkvp_multi_irs_single_others(phandle, ifs, ips, pstate, pctx);
}
|
||||
|
||||
// Process function for multi-character IRS with multi-character IFS and/or IPS: returns the next record, or
// NULL once the cursor has reached end of file.
static lrec_t* lrec_reader_mmap_dkvp_process_multi_irs_multi_others(void* pvstate, void* pvhandle, context_t* pctx) {
	lrec_reader_mmap_dkvp_state_t* pstate = pvstate;
	file_reader_mmap_state_t* phandle = pvhandle;

	if (phandle->sol >= phandle->eof)
		return NULL;

	return lrec_parse_mmap_dkvp_multi_irs_multi_others(phandle, pstate, pctx);
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// Parses one DKVP record starting at phandle->sol, for single-character IRS, IFS, and IPS.
//
// Comment lines are skipped first when a comment string is configured. The line is then split in place:
// separators are overwritten with NUL and the record stores pointers into the mapping. A field with no IPS
// (or with an empty key) is stored under its 1-up positional index, which makes DKVP a generalization of NIDX.
//
// Returns NULL if only comment lines remained before end of file; otherwise the new record. phandle->sol is
// advanced past the IRS, or past eof when the last line has no IRS.
//
// Notes on this implementation:
// * The positional key is obtained from low_int_to_string() in its own statement before lrec_put() is
//   called. That call writes free_flags through a pointer, and C leaves the evaluation order of function
//   arguments unspecified, so passing both in one argument list could hand lrec_put() a stale flag.
// * All probes of the mapping are bounded by phandle->eof: the byte at eof is not ours to read when the file
//   size is a multiple of the page size.
static lrec_t* lrec_parse_mmap_dkvp_single_irs_single_others(file_reader_mmap_state_t *phandle,
	char irs, char ifs, char ips, lrec_reader_mmap_dkvp_state_t* pstate, context_t* pctx)
{
	if (pstate->comment_string != NULL)
		skip_over_comment_lines_single_irs(phandle, pstate, irs);

	if (phandle->sol >= phandle->eof)
		return NULL;

	char* line = phandle->sol;
	lrec_t* prec = lrec_unbacked_alloc();

	int idx = 0;
	char* p = line;
	if (pstate->allow_repeat_ifs) {
		while (p < phandle->eof && *p == ifs)
			p++;
	}
	char* key = p;
	char* value = p;

	int saw_ps = FALSE;
	int saw_rs = FALSE;

	while (p < phandle->eof && *p) {
		if (*p == irs) {
			*p = 0;

			if (pstate->do_auto_line_term) {
				// IRS is LF here; a preceding CR means the line ended in CRLF, so strip it as well.
				if (p > line && p[-1] == '\r') {
					p[-1] = 0;
					context_set_autodetected_crlf(pctx);
				} else {
					context_set_autodetected_lf(pctx);
				}
			}

			phandle->sol = p+1;
			saw_rs = TRUE;
			break;
		} else if (*p == ifs) {
			saw_ps = FALSE;
			*p = 0;

			idx++;
			// "value <= key" is tested first: it is true whenever no IPS was seen, which is the only case
			// in which key might not be safe to dereference.
			if (value <= key || *key == 0) {
				// E.g the pair has no equals sign: "a" rather than "a=1" or
				// "a=". Here we use the positional index as the key. This way
				// DKVP is a generalization of NIDX.
				char free_flags = NO_FREE;
				char* positional_key = low_int_to_string(idx, &free_flags);
				lrec_put(prec, positional_key, value, free_flags);
			} else {
				lrec_put(prec, key, value, NO_FREE);
			}

			p++;
			if (pstate->allow_repeat_ifs) {
				while (p < phandle->eof && *p == ifs)
					p++;
			}
			key = p;
			value = p;
		} else if (*p == ips && !saw_ps) {
			// Only the first IPS in a field splits key from value; later ones belong to the value.
			*p = 0;
			p++;
			value = p;
			saw_ps = TRUE;
		} else {
			p++;
		}
	}
	if (p >= phandle->eof)
		phandle->sol = p+1;
	idx++;

	// With repeated IFS allowed, an entirely empty trailing field is dropped. Pointers at eof denote empty
	// strings and are not dereferenced.
	if (pstate->allow_repeat_ifs
		&& (key >= phandle->eof || *key == 0)
		&& (value >= phandle->eof || *value == 0))
	{
		return prec;
	}

	// Final field of the line. There are two ways out of the loop above: saw IRS, or saw end of file.
	char free_flags = NO_FREE;
	int positional = (value <= key || *key == 0);
	char* final_key = positional ? low_int_to_string(idx, &free_flags) : key;

	if (value >= phandle->eof) {
		// Nothing after the last separator: the value is empty.
		lrec_put(prec, final_key, "", free_flags);
	} else if (saw_rs) {
		// Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate the
		// C string so it's OK to retain a pointer to that.
		lrec_put(prec, final_key, value, free_flags);
	} else {
		// Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character
		// to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our
		// copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one
		// byte past the page and that will segv us.
		char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value);
		lrec_put(prec, final_key, copy, positional ? (free_flags | FREE_ENTRY_VALUE) : FREE_ENTRY_VALUE);
	}

	return prec;
}
|
||||
|
||||
// Parses one DKVP record starting at phandle->sol, for a multi-character IRS with single-character IFS and IPS.
//
// Comment lines are skipped first when a comment string is configured. The line is then split in place:
// separators are overwritten with NUL and the record stores pointers into the mapping. A field with no IPS
// (or with an empty key) is stored under its 1-up positional index, which makes DKVP a generalization of NIDX.
//
// Returns NULL if only comment lines remained before end of file; otherwise the new record. phandle->sol is
// advanced past the IRS, or past eof when the last line has no IRS.
//
// Notes on this implementation:
// * The positional key is obtained from low_int_to_string() in its own statement before lrec_put() is
//   called. That call writes free_flags through a pointer, and C leaves the evaluation order of function
//   arguments unspecified, so passing both in one argument list could hand lrec_put() a stale flag.
// * All probes of the mapping are bounded by phandle->eof, including the multi-byte IRS comparison: the
//   bytes at and after eof are not ours to read when the file size is a multiple of the page size.
static lrec_t* lrec_parse_mmap_dkvp_multi_irs_single_others(file_reader_mmap_state_t *phandle,
	char ifs, char ips, lrec_reader_mmap_dkvp_state_t* pstate, context_t* pctx)
{
	if (pstate->comment_string != NULL)
		skip_over_comment_lines_multi_irs(phandle, pstate, pstate->irs, pstate->irslen);

	if (phandle->sol >= phandle->eof)
		return NULL;

	char* line = phandle->sol;
	lrec_t* prec = lrec_unbacked_alloc();

	int idx = 0;
	char* p = line;
	if (pstate->allow_repeat_ifs) {
		while (p < phandle->eof && *p == ifs)
			p++;
	}
	char* key = p;
	char* value = p;

	int saw_ps = FALSE;
	int saw_rs = FALSE;

	while (p < phandle->eof && *p) {
		if ((phandle->eof - p) >= pstate->irslen && streqn(p, pstate->irs, pstate->irslen)) {
			*p = 0;
			phandle->sol = p + pstate->irslen;
			saw_rs = TRUE;
			break;
		} else if (*p == ifs) {
			saw_ps = FALSE;
			*p = 0;

			idx++;
			// "value <= key" is tested first: it is true whenever no IPS was seen, which is the only case
			// in which key might not be safe to dereference.
			if (value <= key || *key == 0) {
				// E.g the pair has no equals sign: "a" rather than "a=1" or
				// "a=". Here we use the positional index as the key. This way
				// DKVP is a generalization of NIDX.
				char free_flags = NO_FREE;
				char* positional_key = low_int_to_string(idx, &free_flags);
				lrec_put(prec, positional_key, value, free_flags);
			} else {
				lrec_put(prec, key, value, NO_FREE);
			}

			p++;
			if (pstate->allow_repeat_ifs) {
				while (p < phandle->eof && *p == ifs)
					p++;
			}
			key = p;
			value = p;
		} else if (*p == ips && !saw_ps) {
			// Only the first IPS in a field splits key from value; later ones belong to the value.
			*p = 0;
			p++;
			value = p;
			saw_ps = TRUE;
		} else {
			p++;
		}
	}
	if (p >= phandle->eof)
		phandle->sol = p+1;
	idx++;

	// With repeated IFS allowed, an entirely empty trailing field is dropped. Pointers at eof denote empty
	// strings and are not dereferenced.
	if (pstate->allow_repeat_ifs
		&& (key >= phandle->eof || *key == 0)
		&& (value >= phandle->eof || *value == 0))
	{
		return prec;
	}

	// Final field of the line. There are two ways out of the loop above: saw IRS, or saw end of file.
	char free_flags = NO_FREE;
	int positional = (value <= key || *key == 0);
	char* final_key = positional ? low_int_to_string(idx, &free_flags) : key;

	if (value >= phandle->eof) {
		// Nothing after the last separator: the value is empty.
		lrec_put(prec, final_key, "", free_flags);
	} else if (saw_rs) {
		// Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate the
		// C string so it's OK to retain a pointer to that.
		lrec_put(prec, final_key, value, free_flags);
	} else {
		// Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character
		// to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our
		// copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one
		// byte past the page and that will segv us.
		char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value);
		lrec_put(prec, final_key, copy, positional ? (free_flags | FREE_ENTRY_VALUE) : FREE_ENTRY_VALUE);
	}

	return prec;
}
|
||||
|
||||
static lrec_t* lrec_parse_mmap_dkvp_single_irs_multi_others(file_reader_mmap_state_t *phandle, char irs,
|
||||
lrec_reader_mmap_dkvp_state_t* pstate, context_t* pctx)
|
||||
{
|
||||
if (pstate->comment_string != NULL)
|
||||
skip_over_comment_lines_single_irs(phandle, pstate, irs);
|
||||
|
||||
if (phandle->sol >= phandle->eof)
|
||||
return NULL;
|
||||
|
||||
char* line = phandle->sol;
|
||||
lrec_t* prec = lrec_unbacked_alloc();
|
||||
|
||||
int idx = 0;
|
||||
char* p = line;
|
||||
if (pstate->allow_repeat_ifs) {
|
||||
while (streqn(p, pstate->ifs, pstate->ifslen))
|
||||
p += pstate->ifslen;
|
||||
}
|
||||
char* key = p;
|
||||
char* value = p;
|
||||
|
||||
int saw_ps = FALSE;
|
||||
int saw_rs = FALSE;
|
||||
|
||||
for ( ; p < phandle->eof && *p; ) {
|
||||
if (*p == irs) {
|
||||
*p = 0;
|
||||
|
||||
if (pstate->do_auto_line_term) {
|
||||
if (p > line && p[-1] == '\r') {
|
||||
p[-1] = 0;
|
||||
context_set_autodetected_crlf(pctx);
|
||||
} else {
|
||||
context_set_autodetected_lf(pctx);
|
||||
}
|
||||
}
|
||||
|
||||
phandle->sol = p+1;
|
||||
saw_rs = TRUE;
|
||||
break;
|
||||
} else if (streqn(p, pstate->ifs, pstate->ifslen)) {
|
||||
saw_ps = FALSE;
|
||||
*p = 0;
|
||||
|
||||
idx++;
|
||||
if (*key == 0 || value <= key) {
|
||||
// E.g the pair has no equals sign: "a" rather than "a=1" or
|
||||
// "a=". Here we use the positional index as the key. This way
|
||||
// DKVP is a generalization of NIDX.
|
||||
char free_flags = NO_FREE;
|
||||
lrec_put(prec, low_int_to_string(idx, &free_flags), value, free_flags);
|
||||
}
|
||||
else {
|
||||
lrec_put(prec, key, value, NO_FREE);
|
||||
}
|
||||
|
||||
p += pstate->ifslen;
|
||||
if (pstate->allow_repeat_ifs) {
|
||||
while (streqn(p, pstate->ifs, pstate->ifslen))
|
||||
p += pstate->ifslen;
|
||||
}
|
||||
key = p;
|
||||
value = p;
|
||||
} else if (streqn(p, pstate->ips, pstate->ipslen) && !saw_ps) {
|
||||
*p = 0;
|
||||
p += pstate->ipslen;
|
||||
value = p;
|
||||
saw_ps = TRUE;
|
||||
} else {
|
||||
p++;
|
||||
}
|
||||
}
|
||||
*p = 0;
|
||||
if (p >= phandle->eof)
|
||||
phandle->sol = p+1;
|
||||
idx++;
|
||||
|
||||
if (pstate->allow_repeat_ifs && *key == 0 && *value == 0)
|
||||
return prec;
|
||||
|
||||
// There are two ways out of that loop: saw IRS, or saw end of file.
|
||||
if (saw_rs) {
|
||||
// Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate the
|
||||
// C string so it's OK to retain a pointer to that.
|
||||
if (*key == 0 || value <= key) {
|
||||
char free_flags = NO_FREE;
|
||||
if (value >= phandle->eof)
|
||||
lrec_put(prec, low_int_to_string(idx, &free_flags), "", free_flags);
|
||||
else
|
||||
lrec_put(prec, low_int_to_string(idx, &free_flags), value, free_flags);
|
||||
}
|
||||
else {
|
||||
if (value >= phandle->eof)
|
||||
lrec_put(prec, key, "", NO_FREE);
|
||||
else
|
||||
lrec_put(prec, key, value, NO_FREE);
|
||||
}
|
||||
} else {
|
||||
// Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character
|
||||
// to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our
|
||||
// copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one
|
||||
// byte past the page and that will segv us.
|
||||
if (*key == 0 || value <= key) {
|
||||
char free_flags = NO_FREE;
|
||||
if (value >= phandle->eof) {
|
||||
lrec_put(prec, low_int_to_string(idx, &free_flags), "", free_flags);
|
||||
} else {
|
||||
char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value);
|
||||
lrec_put(prec, low_int_to_string(idx, &free_flags), copy, free_flags | FREE_ENTRY_VALUE);
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (value >= phandle->eof) {
|
||||
lrec_put(prec, key, "", NO_FREE);
|
||||
} else {
|
||||
char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value);
|
||||
lrec_put(prec, key, copy, FREE_ENTRY_VALUE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return prec;
|
||||
}
|
||||
|
||||
static lrec_t* lrec_parse_mmap_dkvp_multi_irs_multi_others(file_reader_mmap_state_t *phandle,
|
||||
lrec_reader_mmap_dkvp_state_t* pstate, context_t* pctx)
|
||||
{
|
||||
if (pstate->comment_string != NULL)
|
||||
skip_over_comment_lines_multi_irs(phandle, pstate, pstate->irs, pstate->irslen);
|
||||
|
||||
if (phandle->sol >= phandle->eof)
|
||||
return NULL;
|
||||
|
||||
char* line = phandle->sol;
|
||||
lrec_t* prec = lrec_unbacked_alloc();
|
||||
|
||||
int idx = 0;
|
||||
char* p = line;
|
||||
if (pstate->allow_repeat_ifs) {
|
||||
while (streqn(p, pstate->ifs, pstate->ifslen))
|
||||
p += pstate->ifslen;
|
||||
}
|
||||
char* key = p;
|
||||
char* value = p;
|
||||
|
||||
int saw_ps = FALSE;
|
||||
int saw_rs = FALSE;
|
||||
|
||||
for ( ; p < phandle->eof && *p; ) {
|
||||
if (streqn(p, pstate->irs, pstate->irslen)) {
|
||||
*p = 0;
|
||||
phandle->sol = p + pstate->irslen;
|
||||
saw_rs = TRUE;
|
||||
break;
|
||||
} else if (streqn(p, pstate->ifs, pstate->ifslen)) {
|
||||
saw_ps = FALSE;
|
||||
*p = 0;
|
||||
|
||||
idx++;
|
||||
if (*key == 0 || value <= key) {
|
||||
// E.g the pair has no equals sign: "a" rather than "a=1" or
|
||||
// "a=". Here we use the positional index as the key. This way
|
||||
// DKVP is a generalization of NIDX.
|
||||
char free_flags = NO_FREE;
|
||||
lrec_put(prec, low_int_to_string(idx, &free_flags), value, free_flags);
|
||||
}
|
||||
else {
|
||||
lrec_put(prec, key, value, NO_FREE);
|
||||
}
|
||||
|
||||
p += pstate->ifslen;
|
||||
if (pstate->allow_repeat_ifs) {
|
||||
while (streqn(p, pstate->ifs, pstate->ifslen))
|
||||
p += pstate->ifslen;
|
||||
}
|
||||
key = p;
|
||||
value = p;
|
||||
} else if (streqn(p, pstate->ips, pstate->ipslen) && !saw_ps) {
|
||||
*p = 0;
|
||||
p += pstate->ipslen;
|
||||
value = p;
|
||||
saw_ps = TRUE;
|
||||
} else {
|
||||
p++;
|
||||
}
|
||||
}
|
||||
if (p >= phandle->eof)
|
||||
phandle->sol = p+1;
|
||||
idx++;
|
||||
|
||||
if (pstate->allow_repeat_ifs && *key == 0 && *value == 0)
|
||||
return prec;
|
||||
|
||||
// There are two ways out of that loop: saw IRS, or saw end of file.
|
||||
if (saw_rs) {
|
||||
// Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate the
|
||||
// C string so it's OK to retain a pointer to that.
|
||||
if (*key == 0 || value <= key) {
|
||||
char free_flags = NO_FREE;
|
||||
if (value >= phandle->eof)
|
||||
lrec_put(prec, low_int_to_string(idx, &free_flags), "", free_flags);
|
||||
else
|
||||
lrec_put(prec, low_int_to_string(idx, &free_flags), value, free_flags);
|
||||
}
|
||||
else {
|
||||
if (value >= phandle->eof)
|
||||
lrec_put(prec, key, "", NO_FREE);
|
||||
else
|
||||
lrec_put(prec, key, value, NO_FREE);
|
||||
}
|
||||
} else {
|
||||
// Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character
|
||||
// to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our
|
||||
// copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one
|
||||
// byte past the page and that will segv us.
|
||||
if (*key == 0 || value <= key) {
|
||||
char free_flags = NO_FREE;
|
||||
if (value >= phandle->eof) {
|
||||
lrec_put(prec, low_int_to_string(idx, &free_flags), "", free_flags);
|
||||
} else {
|
||||
char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value);
|
||||
lrec_put(prec, low_int_to_string(idx, &free_flags), copy, free_flags | FREE_ENTRY_VALUE);
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (value >= phandle->eof) {
|
||||
lrec_put(prec, key, "", NO_FREE);
|
||||
} else {
|
||||
char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value);
|
||||
lrec_put(prec, key, copy, FREE_ENTRY_VALUE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return prec;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
static void skip_over_comment_lines_single_irs(
|
||||
file_reader_mmap_state_t *phandle,
|
||||
lrec_reader_mmap_dkvp_state_t* pstate,
|
||||
char irs)
|
||||
{
|
||||
while ((phandle->eof - phandle->sol) >= pstate->comment_string_length
|
||||
&& streqn(phandle->sol, pstate->comment_string, pstate->comment_string_length))
|
||||
{
|
||||
if (pstate->comment_handling == PASS_COMMENTS)
|
||||
for (int i = 0; i < pstate->comment_string_length; i++)
|
||||
fputc(phandle->sol[i], stdout);
|
||||
phandle->sol += pstate->comment_string_length;
|
||||
while (phandle->sol < phandle->eof && *phandle->sol != irs) {
|
||||
if (pstate->comment_handling == PASS_COMMENTS)
|
||||
fputc(*phandle->sol, stdout);
|
||||
phandle->sol++;
|
||||
}
|
||||
if (phandle->sol < phandle->eof && *phandle->sol == irs) {
|
||||
if (pstate->comment_handling == PASS_COMMENTS)
|
||||
fputc(*phandle->sol, stdout);
|
||||
phandle->sol++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void skip_over_comment_lines_multi_irs(
|
||||
file_reader_mmap_state_t *phandle,
|
||||
lrec_reader_mmap_dkvp_state_t* pstate,
|
||||
char* irs,
|
||||
int irslen)
|
||||
{
|
||||
while ((phandle->eof - phandle->sol) >= pstate->comment_string_length
|
||||
&& streqn(phandle->sol, pstate->comment_string, pstate->comment_string_length))
|
||||
{
|
||||
if (pstate->comment_handling == PASS_COMMENTS)
|
||||
for (int i = 0; i < pstate->comment_string_length; i++)
|
||||
fputc(phandle->sol[i], stdout);
|
||||
phandle->sol += pstate->comment_string_length;
|
||||
while ((phandle->eof - phandle->sol) >= irslen && !streqn(phandle->sol, irs, irslen)) {
|
||||
if (pstate->comment_handling == PASS_COMMENTS)
|
||||
fputc(*phandle->sol, stdout);
|
||||
phandle->sol++;
|
||||
}
|
||||
if ((phandle->eof - phandle->sol) >= irslen && streqn(phandle->sol, irs, irslen)) {
|
||||
if (pstate->comment_handling == PASS_COMMENTS)
|
||||
for (int i = 0; i < irslen; i++)
|
||||
fputc(phandle->sol[i], stdout);
|
||||
phandle->sol += irslen;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,220 +0,0 @@
|
|||
// ================================================================
|
||||
// Note: there are multiple process methods with a lot of code duplication.
|
||||
// This is intentional. Much of Miller's measured processing time is in the
|
||||
// lrec-reader process methods. This is code which needs to execute on every
|
||||
// byte of input and even moving a single runtime if-statement into a
|
||||
// function-pointer assignment at alloc time can have noticeable effects on
|
||||
// performance (5-10% in some cases).
|
||||
// ================================================================
|
||||
|
||||
// ================================================================
|
||||
// Unlike other Miller record-readers, there is no streaming for JSON input: no
|
||||
// records are processed until EOF is seen. See also
|
||||
// https://github.com/johnkerl/miller/issues/99.
|
||||
// ================================================================
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "cli/json_array_ingest.h"
|
||||
#include "cli/comment_handling.h"
|
||||
#include "lib/mlr_globals.h"
|
||||
#include "lib/mlrutil.h"
|
||||
#include "input/file_reader_mmap.h"
|
||||
#include "input/lrec_readers.h"
|
||||
#include "input/json_parser.h"
|
||||
#include "input/mlr_json_adapter.h"
|
||||
|
||||
typedef struct _lrec_reader_mmap_json_state_t {
|
||||
// The list of top-level JSON objects is backed by the file contents. The records are in turn
|
||||
// backed by the top-level JSON objects. This means the latter should not be freed while
|
||||
// the records are in used. (This is done to reduce data copies, for performance: we can
|
||||
// manipulate pointers to strings rather than copying strings.)
|
||||
//
|
||||
// In particular, in the multifile-input case, we need to keep *all* parsed JSON (and
|
||||
// not free one file's data when we proceed to the next) since records with pointers
|
||||
// into the parsed JSON may still be in use -- e.g. mlr sort.
|
||||
sllv_t* ptop_level_json_objects;
|
||||
sllv_t* precords;
|
||||
char* input_json_flatten_separator;
|
||||
json_array_ingest_t json_array_ingest;
|
||||
char* specified_line_term;
|
||||
int do_auto_line_term;
|
||||
char* detected_line_term;
|
||||
comment_handling_t comment_handling;
|
||||
char* comment_string;
|
||||
} lrec_reader_mmap_json_state_t;
|
||||
|
||||
static void lrec_reader_mmap_json_free(lrec_reader_t* preader);
|
||||
static void lrec_reader_mmap_json_sof(void* pvstate, void* pvhandle);
|
||||
static lrec_t* lrec_reader_mmap_json_process(void* pvstate, void* pvhandle, context_t* pctx);
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
lrec_reader_t* lrec_reader_mmap_json_alloc(char* input_json_flatten_separator, json_array_ingest_t json_array_ingest, char* line_term,
|
||||
comment_handling_t comment_handling, char* comment_string)
|
||||
{
|
||||
lrec_reader_t* plrec_reader = mlr_malloc_or_die(sizeof(lrec_reader_t));
|
||||
|
||||
lrec_reader_mmap_json_state_t* pstate = mlr_malloc_or_die(sizeof(lrec_reader_mmap_json_state_t));
|
||||
pstate->ptop_level_json_objects = sllv_alloc();
|
||||
pstate->precords = sllv_alloc();
|
||||
pstate->input_json_flatten_separator = input_json_flatten_separator;
|
||||
pstate->json_array_ingest = json_array_ingest;
|
||||
pstate->specified_line_term = line_term;
|
||||
pstate->do_auto_line_term = FALSE;
|
||||
pstate->detected_line_term = "\n"; // xxx adapt to MLR_GLOBALS/ctx-const for Windows port
|
||||
pstate->comment_handling = comment_handling;
|
||||
pstate->comment_string = comment_string;
|
||||
|
||||
if (streq(line_term, "auto")) {
|
||||
pstate->do_auto_line_term = TRUE;
|
||||
}
|
||||
|
||||
plrec_reader->pvstate = (void*)pstate;
|
||||
plrec_reader->popen_func = file_reader_mmap_vopen;
|
||||
plrec_reader->pclose_func = file_reader_mmap_vclose;
|
||||
plrec_reader->pprocess_func = lrec_reader_mmap_json_process;
|
||||
plrec_reader->psof_func = lrec_reader_mmap_json_sof;
|
||||
plrec_reader->pfree_func = lrec_reader_mmap_json_free;
|
||||
|
||||
return plrec_reader;
|
||||
}
|
||||
|
||||
// Releases everything the reader owns: each retained top-level parsed JSON
// value, each record not yet handed out, both lists, the state, and finally
// the reader itself.
static void lrec_reader_mmap_json_free(lrec_reader_t* preader) {
	lrec_reader_mmap_json_state_t* pstate = preader->pvstate;
	sllve_t* pnode = NULL;

	// Parsed JSON values, then the list which held them.
	pnode = pstate->ptop_level_json_objects->phead;
	while (pnode != NULL) {
		json_value_t* pjson = pnode->pvvalue;
		json_free_value(pjson);
		pnode = pnode->pnext;
	}
	sllv_free(pstate->ptop_level_json_objects);
	pstate->ptop_level_json_objects = NULL;

	// Unconsumed records, then the list which held them.
	pnode = pstate->precords->phead;
	while (pnode != NULL) {
		lrec_t* punread = pnode->pvvalue;
		lrec_free(punread);
		pnode = pnode->pnext;
	}
	sllv_free(pstate->precords);
	pstate->precords = NULL;

	free(pstate);
	free(preader);
}
|
||||
|
||||
// The mmap-JSON lrec-reader is non-streaming: we ingest all records here in the start-of-file hook.
|
||||
// Then in the process method we pop one lrec off the list at a time, until they are all exhausted.
|
||||
// This is in contrast to other Miller lrec-readers.
|
||||
//
|
||||
// It would be possible to extend the streaming framework to also have an end-of-file hook
|
||||
// which we could use here to free parsed-JSON data. However, we simply leverage the start-of-file
|
||||
// hook for the *next* file (if any) or the free method (if not): these free parsed-JSON structures
|
||||
// from the previous file (if any).
|
||||
static void lrec_reader_mmap_json_sof(void* pvstate, void* pvhandle) {
|
||||
lrec_reader_mmap_json_state_t* pstate = pvstate;
|
||||
file_reader_mmap_state_t* phandle = pvhandle;
|
||||
json_char* json_input = (json_char*)phandle->sol;
|
||||
json_value_t* parsed_top_level_json;
|
||||
json_char error_buf[JSON_ERROR_MAX];
|
||||
|
||||
// This enables us to handle input of the form
|
||||
//
|
||||
// { "a" : 1 }
|
||||
// { "b" : 2 }
|
||||
// { "c" : 3 }
|
||||
//
|
||||
// in addition to
|
||||
//
|
||||
// [
|
||||
// { "a" : 1 }
|
||||
// { "b" : 2 }
|
||||
// { "c" : 3 }
|
||||
// ]
|
||||
//
|
||||
// This is in line with what jq can handle. In this case, json_parse will return
|
||||
// once for each top-level item and will give us back a pointer to the start of
|
||||
// the rest of the input stream, so we can call json_parse on the rest until it is
|
||||
// all exhausted.
|
||||
|
||||
json_char* item_start = json_input;
|
||||
int length = phandle->eof - phandle->sol;
|
||||
char* detected_line_term = NULL;
|
||||
|
||||
while (TRUE) {
|
||||
|
||||
// Find the first line-ending sequence (if any): LF or CRLF.
|
||||
if (pstate->do_auto_line_term) {
|
||||
if (detected_line_term == NULL) {
|
||||
for (char* p = phandle->sol; p < phandle->eof; p++) {
|
||||
if (p[0] == '\n') {
|
||||
if (p > phandle->sol && p[-1] == '\r') {
|
||||
detected_line_term = "\r\n";
|
||||
} else {
|
||||
detected_line_term = "\n";
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Skip comments. For JSON, we ingest the entire blob, this is a matter of finding and iterating over lines.
|
||||
// Miller data comments must be at start of line.
|
||||
if (pstate->comment_handling != COMMENTS_ARE_DATA) {
|
||||
char* line_term = pstate->specified_line_term;
|
||||
if (pstate->do_auto_line_term && detected_line_term != NULL)
|
||||
line_term = detected_line_term;
|
||||
mlr_json_strip_comments(item_start, item_start + length, pstate->comment_handling, pstate->comment_string,
|
||||
line_term);
|
||||
}
|
||||
|
||||
// Trim trailing whitespace.
|
||||
char* item_end = item_start + length;
|
||||
mlr_json_end_strip(item_start, &item_end);
|
||||
length = item_end - item_start;
|
||||
|
||||
if (length == 0)
|
||||
break;
|
||||
|
||||
parsed_top_level_json = json_parse(item_start, length, error_buf, &item_start);
|
||||
if (parsed_top_level_json == NULL) {
|
||||
fprintf(stderr, "%s: Unable to parse JSON data: %s\n", MLR_GLOBALS.bargv0, error_buf);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
sllv_append(pstate->ptop_level_json_objects, parsed_top_level_json);
|
||||
|
||||
// The lrecs have their string pointers pointing into the parsed-JSON objects (for
|
||||
// efficiency) so it's important we not free the latter until our free method.
|
||||
if (!reference_json_objects_as_lrecs(pstate->precords, parsed_top_level_json,
|
||||
pstate->input_json_flatten_separator, pstate->json_array_ingest))
|
||||
{
|
||||
fprintf(stderr, "%s: Unable to parse JSON data.\n", MLR_GLOBALS.bargv0);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (item_start == NULL)
|
||||
break;
|
||||
if (*item_start == 0)
|
||||
break;
|
||||
length -= (item_start - json_input);
|
||||
json_input = item_start;
|
||||
// json_parse goes up to the '\r' or '\n' (whichever is found first) on the first
|
||||
// parse, then keeps going from there on the next. E.g. in the CRLF case it
|
||||
// consumes the CR at the end of the first read and consumes the LF at the start
|
||||
// of the second, and so on. After the very last parse, we need to here consume
|
||||
// the final '\n' which is (by itself) a parse error.
|
||||
if (length == 1 && *(char*)json_input == '\n') {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (detected_line_term != NULL) {
|
||||
pstate->detected_line_term = detected_line_term;
|
||||
}
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// Returns the result of popping the next record from the queue built by the
// start-of-file hook. When line-terminator autodetection is on, the terminator
// currently held in the state is first recorded in the context. pvhandle is
// unused.
static lrec_t* lrec_reader_mmap_json_process(void* pvstate, void* pvhandle, context_t* pctx) {
	lrec_reader_mmap_json_state_t* pstate = pvstate;

	if (pstate->do_auto_line_term)
		context_set_autodetected_line_term(pctx, pstate->detected_line_term);

	lrec_t* pnext = sllv_pop(pstate->precords);
	return pnext;
}
|
||||
|
|
@ -1,512 +0,0 @@
|
|||
// ================================================================
|
||||
// Note: there are multiple process methods with a lot of code duplication.
|
||||
// This is intentional. Much of Miller's measured processing time is in the
|
||||
// lrec-reader process methods. This is code which needs to execute on every
|
||||
// byte of input and even moving a single runtime if-statement into a
|
||||
// function-pointer assignment at alloc time can have noticeable effects on
|
||||
// performance (5-10% in some cases).
|
||||
// ================================================================
|
||||
|
||||
#include <stdlib.h>
|
||||
#include "cli/comment_handling.h"
|
||||
#include "lib/mlrutil.h"
|
||||
#include "input/file_reader_mmap.h"
|
||||
#include "input/lrec_readers.h"
|
||||
|
||||
typedef struct _lrec_reader_mmap_nidx_state_t {
|
||||
char* irs;
|
||||
char* ifs;
|
||||
int irslen;
|
||||
int ifslen;
|
||||
int allow_repeat_ifs;
|
||||
int do_auto_line_term;
|
||||
comment_handling_t comment_handling;
|
||||
char* comment_string;
|
||||
int comment_string_length;
|
||||
} lrec_reader_mmap_nidx_state_t;
|
||||
|
||||
static void lrec_reader_mmap_nidx_free(lrec_reader_t* preader);
|
||||
static void lrec_reader_mmap_nidx_sof(void* pvstate, void* pvhandle);
|
||||
static lrec_t* lrec_reader_mmap_nidx_process_single_irs_single_ifs(void* pvstate, void* pvhandle, context_t* pctx);
|
||||
static lrec_t* lrec_reader_mmap_nidx_process_single_irs_multi_ifs(void* pvstate, void* pvhandle, context_t* pctx);
|
||||
static lrec_t* lrec_reader_mmap_nidx_process_multi_irs_single_ifs(void* pvstate, void* pvhandle, context_t* pctx);
|
||||
static lrec_t* lrec_reader_mmap_nidx_process_multi_irs_multi_ifs(void* pvstate, void* pvhandle, context_t* pctx);
|
||||
|
||||
static lrec_t* lrec_parse_mmap_nidx_single_irs_single_ifs(file_reader_mmap_state_t *phandle,
|
||||
char irs, char ifs, lrec_reader_mmap_nidx_state_t* pstate, context_t* pctx);
|
||||
|
||||
static lrec_t* lrec_parse_mmap_nidx_single_irs_multi_ifs(file_reader_mmap_state_t *phandle,
|
||||
char irs, lrec_reader_mmap_nidx_state_t* pstate, context_t* pctx);
|
||||
|
||||
static lrec_t* lrec_parse_mmap_nidx_multi_irs_single_ifs(file_reader_mmap_state_t *phandle,
|
||||
char ifs, lrec_reader_mmap_nidx_state_t* pstate);
|
||||
|
||||
static lrec_t* lrec_parse_mmap_nidx_multi_irs_multi_ifs(file_reader_mmap_state_t *phandle,
|
||||
lrec_reader_mmap_nidx_state_t* pstate);
|
||||
|
||||
static void skip_over_comment_lines_single_irs(
|
||||
file_reader_mmap_state_t *phandle,
|
||||
lrec_reader_mmap_nidx_state_t* pstate,
|
||||
char irs);
|
||||
|
||||
static void skip_over_comment_lines_multi_irs(
|
||||
file_reader_mmap_state_t *phandle,
|
||||
lrec_reader_mmap_nidx_state_t* pstate,
|
||||
char* irs,
|
||||
int irslen);
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
lrec_reader_t* lrec_reader_mmap_nidx_alloc(char* irs, char* ifs, int allow_repeat_ifs,
|
||||
comment_handling_t comment_handling, char* comment_string)
|
||||
{
|
||||
lrec_reader_t* plrec_reader = mlr_malloc_or_die(sizeof(lrec_reader_t));
|
||||
|
||||
lrec_reader_mmap_nidx_state_t* pstate = mlr_malloc_or_die(sizeof(lrec_reader_mmap_nidx_state_t));
|
||||
pstate->irs = irs;
|
||||
pstate->ifs = ifs;
|
||||
pstate->irslen = strlen(pstate->irs);
|
||||
pstate->ifslen = strlen(pstate->ifs);
|
||||
pstate->allow_repeat_ifs = allow_repeat_ifs;
|
||||
pstate->do_auto_line_term = FALSE;
|
||||
pstate->comment_handling = comment_handling;
|
||||
pstate->comment_string = comment_string;
|
||||
pstate->comment_string_length = comment_string == NULL ? 0 : strlen(comment_string);
|
||||
|
||||
plrec_reader->pvstate = (void*)pstate;
|
||||
plrec_reader->popen_func = file_reader_mmap_vopen;
|
||||
plrec_reader->pclose_func = file_reader_mmap_vclose;
|
||||
|
||||
if (streq(irs, "auto")) {
|
||||
// Auto means either lines end in "\n" or "\r\n" (LF or CRLF). In
|
||||
// either case the final character is "\n". Then for autodetect we
|
||||
// simply check if there's a character in the line before the '\n', and
|
||||
// if that is '\r'.
|
||||
pstate->do_auto_line_term = TRUE;
|
||||
pstate->irs = "\n";
|
||||
pstate->irslen = 1;
|
||||
plrec_reader->pprocess_func = (pstate->ifslen == 1)
|
||||
? lrec_reader_mmap_nidx_process_single_irs_single_ifs
|
||||
: lrec_reader_mmap_nidx_process_single_irs_multi_ifs;
|
||||
} else if (pstate->irslen == 1) {
|
||||
plrec_reader->pprocess_func = (pstate->ifslen == 1)
|
||||
? lrec_reader_mmap_nidx_process_single_irs_single_ifs
|
||||
: lrec_reader_mmap_nidx_process_single_irs_multi_ifs;
|
||||
} else {
|
||||
plrec_reader->pprocess_func = (pstate->ifslen == 1)
|
||||
? lrec_reader_mmap_nidx_process_multi_irs_single_ifs
|
||||
: lrec_reader_mmap_nidx_process_multi_irs_multi_ifs;
|
||||
}
|
||||
|
||||
plrec_reader->psof_func = lrec_reader_mmap_nidx_sof;
|
||||
plrec_reader->pfree_func = lrec_reader_mmap_nidx_free;
|
||||
|
||||
return plrec_reader;
|
||||
}
|
||||
|
||||
// Frees the reader's state and then the reader itself. The separator and
// comment strings referenced by the state are not freed here.
static void lrec_reader_mmap_nidx_free(lrec_reader_t* preader) {
	void* pvstate = preader->pvstate;
	free(pvstate);
	free(preader);
}
|
||||
|
||||
// Start-of-file hook. This reader keeps no per-file state, so there is
// nothing to do; both arguments are ignored.
static void lrec_reader_mmap_nidx_sof(void* pvstate, void* pvhandle) {
	(void)pvstate;
	(void)pvhandle;
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// Process method for single-character IRS and single-character IFS. Returns
// NULL once the read position has reached end of file; otherwise returns the
// parser's result for the next line.
static lrec_t* lrec_reader_mmap_nidx_process_single_irs_single_ifs(void* pvstate, void* pvhandle, context_t* pctx) {
	lrec_reader_mmap_nidx_state_t* pstate = pvstate;
	file_reader_mmap_state_t* phandle = pvhandle;

	if (phandle->sol < phandle->eof)
		return lrec_parse_mmap_nidx_single_irs_single_ifs(phandle, pstate->irs[0], pstate->ifs[0], pstate, pctx);
	return NULL;
}
|
||||
|
||||
// Process method for single-character IRS and multi-character IFS. Returns
// NULL once the read position has reached end of file; otherwise returns the
// parser's result for the next line.
static lrec_t* lrec_reader_mmap_nidx_process_single_irs_multi_ifs(void* pvstate, void* pvhandle, context_t* pctx) {
	lrec_reader_mmap_nidx_state_t* pstate = pvstate;
	file_reader_mmap_state_t* phandle = pvhandle;

	if (phandle->sol < phandle->eof)
		return lrec_parse_mmap_nidx_single_irs_multi_ifs(phandle, pstate->irs[0], pstate, pctx);
	return NULL;
}
|
||||
|
||||
// Process method for multi-character IRS and single-character IFS. Returns
// NULL once the read position has reached end of file; otherwise returns the
// parser's result for the next line. pctx is unused.
static lrec_t* lrec_reader_mmap_nidx_process_multi_irs_single_ifs(void* pvstate, void* pvhandle, context_t* pctx) {
	lrec_reader_mmap_nidx_state_t* pstate = pvstate;
	file_reader_mmap_state_t* phandle = pvhandle;

	if (phandle->sol < phandle->eof)
		return lrec_parse_mmap_nidx_multi_irs_single_ifs(phandle, pstate->ifs[0], pstate);
	return NULL;
}
|
||||
|
||||
// Process method for multi-character IRS and multi-character IFS. Returns
// NULL once the read position has reached end of file; otherwise returns the
// parser's result for the next line. pctx is unused.
static lrec_t* lrec_reader_mmap_nidx_process_multi_irs_multi_ifs(void* pvstate, void* pvhandle, context_t* pctx) {
	lrec_reader_mmap_nidx_state_t* pstate = pvstate;
	file_reader_mmap_state_t* phandle = pvhandle;

	if (phandle->sol < phandle->eof)
		return lrec_parse_mmap_nidx_multi_irs_multi_ifs(phandle, pstate);
	return NULL;
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// Parses one NIDX line starting at phandle->sol, for a single-character record
// separator (irs) and a single-character field separator (ifs), and advances
// phandle->sol past it.
//
// Fields are keyed by their 1-up position ("1", "2", ...). Values point
// directly into the file bytes, with separators overwritten by NUL in place;
// the exception is the last field of a final line lacking a record separator,
// which is heap-copied and flagged for freeing with the record.
//
// When do_auto_line_term is set, a '\r' directly before the record separator is
// stripped and the context is told whether this line ended in CRLF or LF.
//
// Returns NULL if, after skipping comment lines, nothing remains in the file.
//
// All reads are kept strictly below phandle->eof: the repeated-separator skips
// are bounded and the byte at eof is never examined. Also, whenever no record
// separator was seen, phandle->sol is set to eof, so that input containing an
// embedded NUL (which ends the scan early) cannot cause the same bytes to be
// returned again on every subsequent call.
static lrec_t* lrec_parse_mmap_nidx_single_irs_single_ifs(file_reader_mmap_state_t *phandle,
	char irs, char ifs, lrec_reader_mmap_nidx_state_t* pstate, context_t* pctx)
{
	if (pstate->comment_string != NULL)
		skip_over_comment_lines_single_irs(phandle, pstate, irs);

	if (phandle->sol >= phandle->eof)
		return NULL;

	char* line = phandle->sol;
	char* eof = phandle->eof;
	lrec_t* prec = lrec_unbacked_alloc();

	int idx = 0;
	char free_flags = NO_FREE;

	char* p = line;
	if (pstate->allow_repeat_ifs) {
		while (p < eof && *p == ifs)
			p++;
	}
	char* key = NULL;
	char* value = p;
	int saw_rs = FALSE;
	for ( ; p < eof && *p; ) {
		if (*p == irs) {
			*p = 0;

			if (pstate->do_auto_line_term) {
				if (p > line && p[-1] == '\r') {
					p[-1] = 0;
					context_set_autodetected_crlf(pctx);
				} else {
					context_set_autodetected_lf(pctx);
				}
			}

			phandle->sol = p+1;
			saw_rs = TRUE;
			break;
		} else if (*p == ifs) {
			*p = 0;

			idx++;
			key = low_int_to_string(idx, &free_flags);
			lrec_put(prec, key, value, free_flags);

			p++;
			if (pstate->allow_repeat_ifs) {
				while (p < eof && *p == ifs)
					p++;
			}
			value = p;
		} else {
			p++;
		}
	}
	// Without a record separator this was the last line of the file, whether
	// the scan ran to EOF or stopped at an embedded NUL.
	if (!saw_rs)
		phandle->sol = eof;
	idx++;

	// With repeated separators allowed, an empty trailing field is not a field.
	if (pstate->allow_repeat_ifs && (value >= eof || *value == 0))
		return prec;

	key = low_int_to_string(idx, &free_flags);

	if (saw_rs) {
		// Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate the
		// C string so it's OK to retain a pointer to that.
		lrec_put(prec, key, value, free_flags);
	} else {
		// Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character
		// to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our
		// copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one
		// byte past the page and that will segv us.
		char* copy = mlr_alloc_string_from_char_range(value, eof - value);
		lrec_put(prec, key, copy, free_flags|FREE_ENTRY_VALUE);
	}

	return prec;
}
|
||||
|
||||
// Parses one NIDX line starting at phandle->sol, for a single-character record
// separator (irs) and a multi-character field separator (pstate->ifs), and
// advances phandle->sol past it.
//
// Fields are keyed by their 1-up position ("1", "2", ...). Values point
// directly into the file bytes, with separators NUL-poked in place; the
// exception is the last field of a final line lacking a record separator,
// which is heap-copied and flagged for freeing with the record.
//
// When do_auto_line_term is set, a '\r' directly before the record separator is
// stripped and the context is told whether this line ended in CRLF or LF.
//
// Returns NULL if, after skipping comment lines, nothing remains in the file.
// This matches the single-IFS variant; without the check, a file ending in
// comment lines yielded a spurious extra record.
//
// All reads are kept strictly below phandle->eof: a field separator is only
// compared when it fits entirely before eof, and the byte at eof is never
// examined. Also, whenever no record separator was seen, phandle->sol is set
// to eof, so that input containing an embedded NUL (which ends the scan early)
// cannot cause the same bytes to be returned again on every subsequent call.
static lrec_t* lrec_parse_mmap_nidx_single_irs_multi_ifs(file_reader_mmap_state_t *phandle,
	char irs, lrec_reader_mmap_nidx_state_t* pstate, context_t* pctx)
{
	if (pstate->comment_string != NULL)
		skip_over_comment_lines_single_irs(phandle, pstate, irs);

	if (phandle->sol >= phandle->eof)
		return NULL;

	lrec_t* prec = lrec_unbacked_alloc();

	char* ifs = pstate->ifs;
	int ifslen = pstate->ifslen;

	char* line = phandle->sol;
	char* eof = phandle->eof;
	int idx = 0;
	char free_flags = NO_FREE;

	char* p = line;
	if (pstate->allow_repeat_ifs) {
		while ((eof - p) >= ifslen && streqn(p, ifs, ifslen))
			p += ifslen;
	}
	char* key = NULL;
	char* value = p;
	int saw_rs = FALSE;

	for ( ; p < eof && *p; ) {
		if (*p == irs) {
			*p = 0;

			if (pstate->do_auto_line_term) {
				if (p > line && p[-1] == '\r') {
					p[-1] = 0;
					context_set_autodetected_crlf(pctx);
				} else {
					context_set_autodetected_lf(pctx);
				}
			}

			phandle->sol = p+1;
			saw_rs = TRUE;
			break;
		} else if ((eof - p) >= ifslen && streqn(p, ifs, ifslen)) {
			*p = 0;

			idx++;
			key = low_int_to_string(idx, &free_flags);
			lrec_put(prec, key, value, free_flags);

			p += ifslen;
			if (pstate->allow_repeat_ifs) {
				while ((eof - p) >= ifslen && streqn(p, ifs, ifslen))
					p += ifslen;
			}
			value = p;
		} else {
			p++;
		}
	}
	// Without a record separator this was the last line of the file, whether
	// the scan ran to EOF or stopped at an embedded NUL.
	if (!saw_rs)
		phandle->sol = eof;
	idx++;

	// With repeated separators allowed, an empty trailing field is not a field.
	if (pstate->allow_repeat_ifs && (value >= eof || *value == 0))
		return prec;

	key = low_int_to_string(idx, &free_flags);

	if (saw_rs) {
		// Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate the
		// C string so it's OK to retain a pointer to that.
		lrec_put(prec, key, value, free_flags);
	} else {
		// Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character
		// to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our
		// copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one
		// byte past the page and that will segv us.
		char* copy = mlr_alloc_string_from_char_range(value, eof - value);
		lrec_put(prec, key, copy, free_flags|FREE_ENTRY_VALUE);
	}

	return prec;
}
|
||||
|
||||
// Parses one NIDX line starting at phandle->sol, for a multi-character record
// separator (pstate->irs) and a single-character field separator (ifs), and
// advances phandle->sol past it.
//
// Fields are keyed by their 1-up position ("1", "2", ...). Values point
// directly into the file bytes, with separators NUL-poked in place; the
// exception is the last field of a final line lacking a record separator,
// which is heap-copied and flagged for freeing with the record.
//
// Returns NULL if, after skipping comment lines, nothing remains in the file.
// This matches the single-IRS/single-IFS variant; without the check, a file
// ending in comment lines yielded a spurious extra record.
//
// All reads are kept strictly below phandle->eof: the record separator is only
// compared when it fits entirely before eof, the repeated-separator skips are
// bounded, and the byte at eof is never examined. Also, whenever no record
// separator was seen, phandle->sol is set to eof, so that input containing an
// embedded NUL (which ends the scan early) cannot cause the same bytes to be
// returned again on every subsequent call.
static lrec_t* lrec_parse_mmap_nidx_multi_irs_single_ifs(file_reader_mmap_state_t *phandle,
	char ifs, lrec_reader_mmap_nidx_state_t* pstate)
{
	if (pstate->comment_string != NULL)
		skip_over_comment_lines_multi_irs(phandle, pstate, pstate->irs, pstate->irslen);

	if (phandle->sol >= phandle->eof)
		return NULL;

	lrec_t* prec = lrec_unbacked_alloc();

	char* line = phandle->sol;
	char* eof = phandle->eof;
	int idx = 0;
	char free_flags = NO_FREE;

	char* p = line;
	if (pstate->allow_repeat_ifs) {
		while (p < eof && *p == ifs)
			p++;
	}
	char* key = NULL;
	char* value = p;
	int saw_rs = FALSE;

	char* irs = pstate->irs;
	int irslen = pstate->irslen;

	for ( ; p < eof && *p; ) {
		if ((eof - p) >= irslen && streqn(p, irs, irslen)) {
			*p = 0;
			phandle->sol = p + irslen;
			saw_rs = TRUE;
			break;
		} else if (*p == ifs) {
			*p = 0;

			idx++;
			key = low_int_to_string(idx, &free_flags);
			lrec_put(prec, key, value, free_flags);

			p++;
			if (pstate->allow_repeat_ifs) {
				while (p < eof && *p == ifs)
					p++;
			}
			value = p;
		} else {
			p++;
		}
	}
	// Without a record separator this was the last line of the file, whether
	// the scan ran to EOF or stopped at an embedded NUL.
	if (!saw_rs)
		phandle->sol = eof;
	idx++;

	// With repeated separators allowed, an empty trailing field is not a field.
	if (pstate->allow_repeat_ifs && (value >= eof || *value == 0))
		return prec;

	key = low_int_to_string(idx, &free_flags);

	if (saw_rs) {
		// Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate the
		// C string so it's OK to retain a pointer to that.
		lrec_put(prec, key, value, free_flags);
	} else {
		// Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character
		// to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our
		// copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one
		// byte past the page and that will segv us.
		char* copy = mlr_alloc_string_from_char_range(value, eof - value);
		lrec_put(prec, key, copy, free_flags|FREE_ENTRY_VALUE);
	}

	return prec;
}
|
||||
|
||||
static lrec_t* lrec_parse_mmap_nidx_multi_irs_multi_ifs(file_reader_mmap_state_t *phandle,
|
||||
lrec_reader_mmap_nidx_state_t* pstate)
|
||||
{
|
||||
if (pstate->comment_string != NULL)
|
||||
skip_over_comment_lines_multi_irs(phandle, pstate, pstate->irs, pstate->irslen);
|
||||
|
||||
lrec_t* prec = lrec_unbacked_alloc();
|
||||
|
||||
char* line = phandle->sol;
|
||||
int idx = 0;
|
||||
char free_flags = NO_FREE;
|
||||
|
||||
char* ifs = pstate->ifs;
|
||||
int ifslen = pstate->ifslen;
|
||||
char* irs = pstate->irs;
|
||||
int irslen = pstate->irslen;
|
||||
|
||||
char* p = line;
|
||||
if (pstate->allow_repeat_ifs) {
|
||||
while (streqn(p, ifs, ifslen))
|
||||
p += ifslen;
|
||||
}
|
||||
char* key = NULL;
|
||||
char* value = p;
|
||||
int saw_rs = FALSE;
|
||||
for ( ; p < phandle->eof && *p; ) {
|
||||
if (streqn(p, irs, irslen)) {
|
||||
*p = 0;
|
||||
phandle->sol = p + irslen;
|
||||
saw_rs = TRUE;
|
||||
break;
|
||||
} else if (streqn(p, ifs, ifslen)) {
|
||||
*p = 0;
|
||||
|
||||
idx++;
|
||||
key = low_int_to_string(idx, &free_flags);
|
||||
lrec_put(prec, key, value, free_flags);
|
||||
|
||||
p += ifslen;
|
||||
if (pstate->allow_repeat_ifs) {
|
||||
while (streqn(p, ifs, ifslen))
|
||||
p += ifslen;
|
||||
}
|
||||
value = p;
|
||||
} else {
|
||||
p++;
|
||||
}
|
||||
}
|
||||
if (p >= phandle->eof)
|
||||
phandle->sol = p+1;
|
||||
idx++;
|
||||
|
||||
if (pstate->allow_repeat_ifs && *value == 0)
|
||||
return prec;
|
||||
|
||||
key = low_int_to_string(idx, &free_flags);
|
||||
|
||||
if (saw_rs) {
|
||||
// Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate the
|
||||
// C string so it's OK to retain a pointer to that.
|
||||
lrec_put(prec, key, value, free_flags);
|
||||
} else {
|
||||
// Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null character
|
||||
// to terminate the C string: if the file size is not a multiple of the OS page size it'll work (it's our
|
||||
// copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at EOF is one
|
||||
// byte past the page and that will segv us.
|
||||
char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value);
|
||||
lrec_put(prec, key, copy, free_flags|FREE_ENTRY_VALUE);
|
||||
}
|
||||
|
||||
return prec;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
static void skip_over_comment_lines_single_irs(
|
||||
file_reader_mmap_state_t *phandle,
|
||||
lrec_reader_mmap_nidx_state_t* pstate,
|
||||
char irs)
|
||||
{
|
||||
while ((phandle->eof - phandle->sol) >= pstate->comment_string_length
|
||||
&& streqn(phandle->sol, pstate->comment_string, pstate->comment_string_length))
|
||||
{
|
||||
if (pstate->comment_handling == PASS_COMMENTS)
|
||||
for (int i = 0; i < pstate->comment_string_length; i++)
|
||||
fputc(phandle->sol[i], stdout);
|
||||
phandle->sol += pstate->comment_string_length;
|
||||
while (phandle->sol < phandle->eof && *phandle->sol != irs) {
|
||||
if (pstate->comment_handling == PASS_COMMENTS)
|
||||
fputc(*phandle->sol, stdout);
|
||||
phandle->sol++;
|
||||
}
|
||||
if (phandle->sol < phandle->eof && *phandle->sol == irs) {
|
||||
if (pstate->comment_handling == PASS_COMMENTS)
|
||||
fputc(*phandle->sol, stdout);
|
||||
phandle->sol++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void skip_over_comment_lines_multi_irs(
|
||||
file_reader_mmap_state_t *phandle,
|
||||
lrec_reader_mmap_nidx_state_t* pstate,
|
||||
char* irs,
|
||||
int irslen)
|
||||
{
|
||||
while ((phandle->eof - phandle->sol) >= pstate->comment_string_length
|
||||
&& streqn(phandle->sol, pstate->comment_string, pstate->comment_string_length))
|
||||
{
|
||||
if (pstate->comment_handling == PASS_COMMENTS)
|
||||
for (int i = 0; i < pstate->comment_string_length; i++)
|
||||
fputc(phandle->sol[i], stdout);
|
||||
phandle->sol += pstate->comment_string_length;
|
||||
while ((phandle->eof - phandle->sol) >= irslen && !streqn(phandle->sol, irs, irslen)) {
|
||||
if (pstate->comment_handling == PASS_COMMENTS)
|
||||
fputc(*phandle->sol, stdout);
|
||||
phandle->sol++;
|
||||
}
|
||||
if ((phandle->eof - phandle->sol) >= irslen && streqn(phandle->sol, irs, irslen)) {
|
||||
if (pstate->comment_handling == PASS_COMMENTS)
|
||||
for (int i = 0; i < irslen; i++)
|
||||
fputc(phandle->sol[i], stdout);
|
||||
phandle->sol += irslen;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,529 +0,0 @@
|
|||
// ================================================================
|
||||
// Note: there are multiple process methods with a lot of code duplication.
|
||||
// This is intentional. Much of Miller's measured processing time is in the
|
||||
// lrec-reader process methods. This is code which needs to execute on every
|
||||
// byte of input and even moving a single runtime if-statement into a
|
||||
// function-pointer assignment at alloc time can have noticeable effects on
|
||||
// performance (5-10% in some cases).
|
||||
// ================================================================
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "cli/comment_handling.h"
|
||||
#include "lib/mlr_globals.h"
|
||||
#include "lib/mlrutil.h"
|
||||
#include "input/file_reader_mmap.h"
|
||||
#include "input/lrec_readers.h"
|
||||
|
||||
// Per-reader configuration for the mmap-backed XTAB record reader.
typedef struct _lrec_reader_mmap_xtab_state_t {
	char* ifs;               // Field separator, i.e. what ends each key-value line
	char* ips;               // Pair separator, i.e. what splits key from value within a line
	int   ifslen;            // Length of ifs in bytes
	int   ipslen;            // Length of ips in bytes
	int   allow_repeat_ips;  // If true, a run of consecutive IPS is treated as a single one
	int   do_auto_line_term; // If true, autodetect LF vs. CRLF line endings
} lrec_reader_mmap_xtab_state_t;
|
||||
|
||||
static void lrec_reader_mmap_xtab_free(lrec_reader_t* preader);
|
||||
static void lrec_reader_mmap_xtab_sof(void* pvstate, void* pvhandle);
|
||||
static lrec_t* lrec_reader_mmap_xtab_process_single_ifs_single_ips(void* pvstate, void* pvhandle, context_t* pctx);
|
||||
static lrec_t* lrec_reader_mmap_xtab_process_single_ifs_multi_ips(void* pvstate, void* pvhandle, context_t* pctx);
|
||||
static lrec_t* lrec_reader_mmap_xtab_process_multi_ifs_single_ips(void* pvstate, void* pvhandle, context_t* pctx);
|
||||
static lrec_t* lrec_reader_mmap_xtab_process_multi_ifs_multi_ips(void* pvstate, void* pvhandle, context_t* pctx);
|
||||
|
||||
static lrec_t* lrec_parse_mmap_xtab_single_ifs_single_ips(file_reader_mmap_state_t* phandle, char ifs, char ips,
|
||||
lrec_reader_mmap_xtab_state_t* pstate, context_t* pctx);
|
||||
|
||||
static lrec_t* lrec_parse_mmap_xtab_single_ifs_multi_ips(file_reader_mmap_state_t* phandle, char ifs,
|
||||
lrec_reader_mmap_xtab_state_t* pstate, context_t* pctx);
|
||||
|
||||
static lrec_t* lrec_parse_mmap_xtab_multi_ifs_single_ips(file_reader_mmap_state_t* phandle, char ips,
|
||||
lrec_reader_mmap_xtab_state_t* pstate);
|
||||
|
||||
static lrec_t* lrec_parse_mmap_xtab_multi_ifs_multi_ips(file_reader_mmap_state_t* phandle,
|
||||
lrec_reader_mmap_xtab_state_t* pstate);
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
lrec_reader_t* lrec_reader_mmap_xtab_alloc(char* ifs, char* ips, int allow_repeat_ips,
|
||||
comment_handling_t comment_handling, char* comment_string)
|
||||
{
|
||||
// lrec_reader_alloc should have shunted away from us in this case.
|
||||
// (Interleaving blank-line handling, line-term autodetect, and comment-handling all in
|
||||
// the byte-at-a-time logic turned out to be a mess in this file. In the stdio implementation,
|
||||
// by constrast, it falls out rather easily.)
|
||||
if (comment_string != NULL) {
|
||||
fprintf(stderr, "%s: internal coding error detected in file %s at line %d.\n",
|
||||
MLR_GLOBALS.bargv0, __FILE__, __LINE__);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
lrec_reader_t* plrec_reader = mlr_malloc_or_die(sizeof(lrec_reader_t));
|
||||
|
||||
lrec_reader_mmap_xtab_state_t* pstate = mlr_malloc_or_die(sizeof(lrec_reader_mmap_xtab_state_t));
|
||||
pstate->ifs = ifs;
|
||||
pstate->ips = ips;
|
||||
pstate->ifslen = strlen(pstate->ifs);
|
||||
pstate->ipslen = strlen(pstate->ips);
|
||||
pstate->allow_repeat_ips = allow_repeat_ips;
|
||||
pstate->do_auto_line_term = FALSE;
|
||||
|
||||
plrec_reader->pvstate = (void*)pstate;
|
||||
plrec_reader->popen_func = file_reader_mmap_vopen;
|
||||
plrec_reader->pclose_func = file_reader_mmap_vclose;
|
||||
|
||||
if (streq(ifs, "auto")) {
|
||||
// Auto means either lines end in "\n" or "\r\n" (LF or CRLF). In
|
||||
// either case the final character is "\n". Then for autodetect we
|
||||
// simply check if there's a character in the line before the '\n', and
|
||||
// if that is '\r'.
|
||||
pstate->do_auto_line_term = TRUE;
|
||||
pstate->ifs = "\n";
|
||||
pstate->ifslen = 1;
|
||||
plrec_reader->pprocess_func = (pstate->ipslen == 1)
|
||||
? lrec_reader_mmap_xtab_process_single_ifs_single_ips
|
||||
: lrec_reader_mmap_xtab_process_single_ifs_multi_ips;
|
||||
} else if (pstate->ifslen == 1) {
|
||||
plrec_reader->pprocess_func = (pstate->ipslen == 1)
|
||||
? lrec_reader_mmap_xtab_process_single_ifs_single_ips
|
||||
: lrec_reader_mmap_xtab_process_single_ifs_multi_ips;
|
||||
} else {
|
||||
plrec_reader->pprocess_func = (pstate->ipslen == 1)
|
||||
? lrec_reader_mmap_xtab_process_multi_ifs_single_ips
|
||||
: lrec_reader_mmap_xtab_process_multi_ifs_multi_ips;
|
||||
}
|
||||
|
||||
plrec_reader->psof_func = lrec_reader_mmap_xtab_sof;
|
||||
plrec_reader->pfree_func = lrec_reader_mmap_xtab_free;
|
||||
|
||||
return plrec_reader;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// Releases the reader along with its private state. The separator strings
// referenced by the state are not freed here.
static void lrec_reader_mmap_xtab_free(lrec_reader_t* preader) {
	void* pvstate = preader->pvstate;
	free(pvstate);
	free(preader);
}
|
||||
|
||||
// Start-of-file hook. This reader has nothing to reset between files.
static void lrec_reader_mmap_xtab_sof(void* pvstate, void* pvhandle) {
	(void)pvstate;
	(void)pvhandle;
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// Process function for single-character IFS and single-character IPS.
// Returns NULL when no bytes remain in the mapped file; otherwise hands off
// to the parser for the next record.
static lrec_t* lrec_reader_mmap_xtab_process_single_ifs_single_ips(void* pvstate, void* pvhandle, context_t* pctx) {
	lrec_reader_mmap_xtab_state_t* pxtab_state = pvstate;
	file_reader_mmap_state_t* pfile = pvhandle;

	if (pfile->sol >= pfile->eof)
		return NULL;

	char ifs = pxtab_state->ifs[0];
	char ips = pxtab_state->ips[0];
	return lrec_parse_mmap_xtab_single_ifs_single_ips(pfile, ifs, ips, pxtab_state, pctx);
}
|
||||
|
||||
// Process function for single-character IFS and multi-character IPS.
// Returns NULL when no bytes remain in the mapped file; otherwise hands off
// to the parser for the next record.
static lrec_t* lrec_reader_mmap_xtab_process_single_ifs_multi_ips(void* pvstate, void* pvhandle, context_t* pctx) {
	lrec_reader_mmap_xtab_state_t* pxtab_state = pvstate;
	file_reader_mmap_state_t* pfile = pvhandle;

	if (pfile->sol >= pfile->eof)
		return NULL;

	char ifs = pxtab_state->ifs[0];
	return lrec_parse_mmap_xtab_single_ifs_multi_ips(pfile, ifs, pxtab_state, pctx);
}
|
||||
|
||||
// Process function for multi-character IFS and single-character IPS.
// Returns NULL when no bytes remain in the mapped file; otherwise hands off
// to the parser for the next record. The context is not used on this path.
static lrec_t* lrec_reader_mmap_xtab_process_multi_ifs_single_ips(void* pvstate, void* pvhandle, context_t* pctx) {
	lrec_reader_mmap_xtab_state_t* pxtab_state = pvstate;
	file_reader_mmap_state_t* pfile = pvhandle;

	if (pfile->sol >= pfile->eof)
		return NULL;

	char ips = pxtab_state->ips[0];
	return lrec_parse_mmap_xtab_multi_ifs_single_ips(pfile, ips, pxtab_state);
}
|
||||
|
||||
// Process function for multi-character IFS and multi-character IPS.
// Returns NULL when no bytes remain in the mapped file; otherwise hands off
// to the parser for the next record. The context is not used on this path.
static lrec_t* lrec_reader_mmap_xtab_process_multi_ifs_multi_ips(void* pvstate, void* pvhandle, context_t* pctx) {
	lrec_reader_mmap_xtab_state_t* pxtab_state = pvstate;
	file_reader_mmap_state_t* pfile = pvhandle;

	if (pfile->sol >= pfile->eof)
		return NULL;

	return lrec_parse_mmap_xtab_multi_ifs_multi_ips(pfile, pxtab_state);
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
static lrec_t* lrec_parse_mmap_xtab_single_ifs_single_ips(file_reader_mmap_state_t* phandle, char ifs, char ips,
|
||||
lrec_reader_mmap_xtab_state_t* pstate, context_t* pctx)
|
||||
{
|
||||
if (pstate->do_auto_line_term) {
|
||||
// Skip over otherwise empty LF-only or CRLF-only lines.
|
||||
while (phandle->sol < phandle->eof) {
|
||||
if (*phandle->sol == '\n') {
|
||||
context_set_autodetected_lf(pctx);
|
||||
phandle->sol += 1;
|
||||
} else if (*phandle->sol == '\r') {
|
||||
char* q = phandle->sol + 1;
|
||||
if (q < phandle->eof && *q == '\n') {
|
||||
context_set_autodetected_crlf(pctx);
|
||||
phandle->sol += 2;
|
||||
} else {
|
||||
phandle->sol += 1;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Skip over otherwise empty IFS-only lines
|
||||
while (phandle->sol < phandle->eof && *phandle->sol == ifs) {
|
||||
phandle->sol++;
|
||||
}
|
||||
}
|
||||
|
||||
if (phandle->sol >= phandle->eof)
|
||||
return NULL;
|
||||
|
||||
lrec_t* prec = lrec_unbacked_alloc();
|
||||
|
||||
// Loop over fields, one per line
|
||||
while (TRUE) {
|
||||
if (phandle->sol >= phandle->eof)
|
||||
break;
|
||||
|
||||
char* line = phandle->sol;
|
||||
char* key = line;
|
||||
char* value = "";
|
||||
char* p;
|
||||
int saw_ips_in_field = FALSE;
|
||||
|
||||
// Construct one field
|
||||
int saw_eol = FALSE;
|
||||
for (p = line; p < phandle->eof && *p; ) {
|
||||
if (*p == ifs) {
|
||||
saw_ips_in_field = FALSE;
|
||||
*p = 0;
|
||||
|
||||
if (pstate->do_auto_line_term) {
|
||||
if (p > line && p[-1] == '\r') {
|
||||
p[-1] = 0;
|
||||
context_set_autodetected_crlf(pctx);
|
||||
} else {
|
||||
context_set_autodetected_lf(pctx);
|
||||
}
|
||||
}
|
||||
|
||||
phandle->sol = p+1;
|
||||
saw_eol = TRUE;
|
||||
break;
|
||||
} else if (!saw_ips_in_field && *p == ips) {
|
||||
saw_ips_in_field = TRUE;
|
||||
key = line;
|
||||
*p = 0;
|
||||
|
||||
p++;
|
||||
if (pstate->allow_repeat_ips) {
|
||||
while (*p == ips)
|
||||
p++;
|
||||
}
|
||||
value = p;
|
||||
} else {
|
||||
p++;
|
||||
}
|
||||
}
|
||||
if (p >= phandle->eof)
|
||||
phandle->sol = p+1;
|
||||
|
||||
if (saw_eol) {
|
||||
// Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate
|
||||
// the C string so it's OK to retain a pointer to that.
|
||||
lrec_put(prec, key, value, NO_FREE);
|
||||
} else {
|
||||
// Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null
|
||||
// character to terminate the C string: if the file size is not a multiple of the OS page size it'll work
|
||||
// (it's our copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at
|
||||
// EOF is one byte past the page and that will segv us.
|
||||
char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value);
|
||||
lrec_put(prec, key, copy, FREE_ENTRY_VALUE);
|
||||
}
|
||||
|
||||
if (phandle->sol >= phandle->eof)
|
||||
break;
|
||||
|
||||
if (pstate->do_auto_line_term) {
|
||||
char* p = phandle->sol;
|
||||
char* q = phandle->sol + 1;
|
||||
if (*p == '\n')
|
||||
break;
|
||||
if (q < phandle->eof && *p == '\r' && *q == '\n')
|
||||
break;
|
||||
} else {
|
||||
if (*phandle->sol == ifs)
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (prec->field_count == 0) {
|
||||
lrec_free(prec);
|
||||
return NULL;
|
||||
} else {
|
||||
return prec;
|
||||
}
|
||||
}
|
||||
|
||||
static lrec_t* lrec_parse_mmap_xtab_single_ifs_multi_ips(file_reader_mmap_state_t* phandle, char ifs,
|
||||
lrec_reader_mmap_xtab_state_t* pstate, context_t* pctx)
|
||||
{
|
||||
if (pstate->do_auto_line_term) {
|
||||
// Skip over otherwise empty LF-only or CRLF-only lines.
|
||||
while (phandle->sol < phandle->eof) {
|
||||
if (*phandle->sol == '\n') {
|
||||
context_set_autodetected_lf(pctx);
|
||||
phandle->sol += 1;
|
||||
} else if (*phandle->sol == '\r') {
|
||||
char* q = phandle->sol + 1;
|
||||
if (q < phandle->eof && *q == '\n') {
|
||||
context_set_autodetected_crlf(pctx);
|
||||
phandle->sol += 2;
|
||||
} else {
|
||||
phandle->sol += 1;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Skip over otherwise empty IFS-only lines.
|
||||
while (phandle->sol < phandle->eof && *phandle->sol == ifs)
|
||||
phandle->sol++;
|
||||
}
|
||||
|
||||
if (phandle->sol >= phandle->eof)
|
||||
return NULL;
|
||||
|
||||
char* ips = pstate->ips;
|
||||
int ipslen = pstate->ipslen;
|
||||
|
||||
lrec_t* prec = lrec_unbacked_alloc();
|
||||
|
||||
// Loop over fields, one per line
|
||||
while (TRUE) {
|
||||
if (phandle->sol >= phandle->eof)
|
||||
break;
|
||||
|
||||
char* line = phandle->sol;
|
||||
char* key = line;
|
||||
char* value = "";
|
||||
char* p;
|
||||
int saw_ips_in_field = FALSE;
|
||||
|
||||
// Construct one field
|
||||
int saw_eol = FALSE;
|
||||
for (p = line; p < phandle->eof && *p; ) {
|
||||
if (*p == ifs) {
|
||||
saw_ips_in_field = FALSE;
|
||||
*p = 0;
|
||||
|
||||
if (pstate->do_auto_line_term) {
|
||||
if (p > line && p[-1] == '\r') {
|
||||
p[-1] = 0;
|
||||
context_set_autodetected_crlf(pctx);
|
||||
} else {
|
||||
context_set_autodetected_lf(pctx);
|
||||
}
|
||||
}
|
||||
|
||||
phandle->sol = p+1;
|
||||
saw_eol = TRUE;
|
||||
break;
|
||||
} else if (!saw_ips_in_field && streqn(p, ips, ipslen)) {
|
||||
saw_ips_in_field = TRUE;
|
||||
key = line;
|
||||
*p = 0;
|
||||
|
||||
p += ipslen;
|
||||
if (pstate->allow_repeat_ips) {
|
||||
while (streqn(p, ips, ipslen))
|
||||
p += ipslen;
|
||||
}
|
||||
value = p;
|
||||
} else {
|
||||
p++;
|
||||
}
|
||||
}
|
||||
if (p >= phandle->eof)
|
||||
phandle->sol = p+1;
|
||||
|
||||
if (saw_eol) {
|
||||
// Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate
|
||||
// the C string so it's OK to retain a pointer to that.
|
||||
lrec_put(prec, key, value, NO_FREE);
|
||||
} else {
|
||||
// Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null
|
||||
// character to terminate the C string: if the file size is not a multiple of the OS page size it'll work
|
||||
// (it's our copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at
|
||||
// EOF is one byte past the page and that will segv us.
|
||||
char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value);
|
||||
lrec_put(prec, key, copy, FREE_ENTRY_VALUE);
|
||||
}
|
||||
|
||||
if (phandle->sol >= phandle->eof || *phandle->sol == ifs)
|
||||
break;
|
||||
}
|
||||
if (prec->field_count == 0) {
|
||||
lrec_free(prec);
|
||||
return NULL;
|
||||
} else {
|
||||
return prec;
|
||||
}
|
||||
}
|
||||
|
||||
static lrec_t* lrec_parse_mmap_xtab_multi_ifs_single_ips(file_reader_mmap_state_t* phandle, char ips,
|
||||
lrec_reader_mmap_xtab_state_t* pstate)
|
||||
{
|
||||
char* ifs = pstate->ifs;
|
||||
int ifslen = pstate->ifslen;
|
||||
|
||||
// Skip blank lines
|
||||
while (phandle->eof - phandle->sol >= ifslen && streqn(phandle->sol, ifs, ifslen)) {
|
||||
phandle->sol += ifslen;
|
||||
}
|
||||
|
||||
if (phandle->sol >= phandle->eof)
|
||||
return NULL;
|
||||
|
||||
lrec_t* prec = lrec_unbacked_alloc();
|
||||
|
||||
// Loop over fields, one per line
|
||||
while (TRUE) {
|
||||
if (phandle->sol >= phandle->eof)
|
||||
break;
|
||||
|
||||
char* line = phandle->sol;
|
||||
char* key = line;
|
||||
char* value = "";
|
||||
char* p;
|
||||
int saw_ips_in_field = FALSE;
|
||||
|
||||
// Construct one field
|
||||
int saw_eol = FALSE;
|
||||
for (p = line; p < phandle->eof && *p; ) {
|
||||
if (streqn(p, ifs, ifslen)) {
|
||||
saw_ips_in_field = FALSE;
|
||||
*p = 0;
|
||||
phandle->sol = p + ifslen;
|
||||
saw_eol = TRUE;
|
||||
break;
|
||||
} else if (!saw_ips_in_field && *p == ips) {
|
||||
saw_ips_in_field = TRUE;
|
||||
key = line;
|
||||
*p = 0;
|
||||
|
||||
p++;
|
||||
if (pstate->allow_repeat_ips) {
|
||||
while (*p == ips)
|
||||
p++;
|
||||
}
|
||||
value = p;
|
||||
} else {
|
||||
p++;
|
||||
}
|
||||
}
|
||||
if (p >= phandle->eof)
|
||||
phandle->sol = p+1;
|
||||
|
||||
if (saw_eol) {
|
||||
// Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate
|
||||
// the C string so it's OK to retain a pointer to that.
|
||||
lrec_put(prec, key, value, NO_FREE);
|
||||
} else {
|
||||
// Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null
|
||||
// character to terminate the C string: if the file size is not a multiple of the OS page size it'll work
|
||||
// (it's our copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at
|
||||
// EOF is one byte past the page and that will segv us.
|
||||
char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value);
|
||||
lrec_put(prec, key, copy, FREE_ENTRY_VALUE);
|
||||
}
|
||||
|
||||
if (phandle->sol >= phandle->eof || streqn(phandle->sol, ifs, ifslen))
|
||||
break;
|
||||
}
|
||||
if (prec->field_count == 0) {
|
||||
lrec_free(prec);
|
||||
return NULL;
|
||||
} else {
|
||||
return prec;
|
||||
}
|
||||
}
|
||||
|
||||
static lrec_t* lrec_parse_mmap_xtab_multi_ifs_multi_ips(file_reader_mmap_state_t* phandle,
|
||||
lrec_reader_mmap_xtab_state_t* pstate)
|
||||
{
|
||||
char* ips = pstate->ips;
|
||||
int ipslen = pstate->ipslen;
|
||||
char* ifs = pstate->ifs;
|
||||
int ifslen = pstate->ifslen;
|
||||
|
||||
// Skip blank lines
|
||||
while (phandle->eof - phandle->sol >= ifslen && streqn(phandle->sol, ifs, ifslen)) {
|
||||
phandle->sol += ifslen;
|
||||
}
|
||||
|
||||
if (phandle->sol >= phandle->eof)
|
||||
return NULL;
|
||||
|
||||
lrec_t* prec = lrec_unbacked_alloc();
|
||||
|
||||
// Loop over fields, one per line
|
||||
while (TRUE) {
|
||||
if (phandle->sol >= phandle->eof)
|
||||
break;
|
||||
|
||||
char* line = phandle->sol;
|
||||
char* key = line;
|
||||
char* value = "";
|
||||
char* p;
|
||||
int saw_ips_in_field = FALSE;
|
||||
|
||||
// Construct one field
|
||||
int saw_eol = FALSE;
|
||||
for (p = line; p < phandle->eof && *p; ) {
|
||||
if (streqn(p, ifs, ifslen)) {
|
||||
saw_ips_in_field = FALSE;
|
||||
*p = 0;
|
||||
phandle->sol = p + ifslen;
|
||||
saw_eol = TRUE;
|
||||
break;
|
||||
} else if (!saw_ips_in_field && streqn(p, ips, ipslen)) {
|
||||
saw_ips_in_field = TRUE;
|
||||
key = line;
|
||||
*p = 0;
|
||||
|
||||
p += ipslen;
|
||||
if (pstate->allow_repeat_ips) {
|
||||
while (streqn(p, ips, ipslen))
|
||||
p += ipslen;
|
||||
}
|
||||
value = p;
|
||||
} else {
|
||||
p++;
|
||||
}
|
||||
}
|
||||
if (p >= phandle->eof)
|
||||
phandle->sol = p+1;
|
||||
|
||||
if (saw_eol) {
|
||||
// Easy and simple case: we read until end of line. We zero-poked the irs to a null character to terminate
|
||||
// the C string so it's OK to retain a pointer to that.
|
||||
lrec_put(prec, key, value, NO_FREE);
|
||||
} else {
|
||||
// Messier case: we read to end of file without seeing end of line. We can't always zero-poke a null
|
||||
// character to terminate the C string: if the file size is not a multiple of the OS page size it'll work
|
||||
// (it's our copy-on-write memory). But if the file size is a multiple of the page size, then zero-poking at
|
||||
// EOF is one byte past the page and that will segv us.
|
||||
char* copy = mlr_alloc_string_from_char_range(value, phandle->eof - value);
|
||||
lrec_put(prec, key, copy, FREE_ENTRY_VALUE);
|
||||
}
|
||||
|
||||
if (phandle->sol >= phandle->eof || streqn(phandle->sol, ifs, ifslen))
|
||||
break;
|
||||
}
|
||||
if (prec->field_count == 0) {
|
||||
lrec_free(prec);
|
||||
return NULL;
|
||||
} else {
|
||||
return prec;
|
||||
}
|
||||
}
|
||||
|
|
@ -348,10 +348,9 @@ static int lrec_reader_stdio_csv_get_fields(lrec_reader_stdio_csv_state_t* pstat
|
|||
if (pfr_peek_char(pfr) == (char)EOF) // char defaults to unsigned on some platforms
|
||||
return FALSE;
|
||||
|
||||
// Strip the UTF-8 BOM, if any. This is MUCH simpler for mmap, and for stdio on files. For mmap
|
||||
// we can test the first 3 bytes, then skip past them or not. For stdio on files we can fread
|
||||
// the first 3 bytes, then rewind the fp if they're not the UTF-8 BOM. But for stdio on stdin
|
||||
// (which is the primary reason we support stdio in Miller), we cannot rewind: stdin is not
|
||||
// Strip the UTF-8 BOM, if any. This is MUCH simpler for mmap, and for stdio on files. For mmap we can test the
|
||||
// first 3 bytes, then skip past them or not. For stdio on files we can fread the first 3 bytes, then rewind the fp
|
||||
// if they're not the UTF-8 BOM. But for stdio on stdin, we cannot rewind: stdin is not
|
||||
// rewindable.
|
||||
if (is_header) {
|
||||
pfr_buffer_by(pfr, UTF8_BOM_LENGTH);
|
||||
|
|
|
|||
|
|
@ -8,10 +8,11 @@
|
|||
// ================================================================
|
||||
|
||||
// ================================================================
|
||||
// This has at present a lot of code duplication with lrec_reader_mmap_json.
|
||||
// This is because we read the entire input file into memory and get a pointer
|
||||
// to it, which is a lot like mmap. At some future point we may implement a
|
||||
// streaming JSON parser at which point the two files would diverge.
|
||||
// Note: this is a non-streaming JSON reader which reads the entire input file
|
||||
// into memory and gets a pointer to it. At some future point we may implement
|
||||
// a streaming JSON parser at which point this would change dramatically.
|
||||
//
|
||||
// See also https://github.com/johnkerl/miller/issues/99
|
||||
// ================================================================
|
||||
|
||||
#include <stdio.h>
|
||||
|
|
|
|||
|
|
@ -9,50 +9,24 @@ lrec_reader_t* lrec_reader_alloc(cli_reader_opts_t* popts) {
|
|||
generator_opts_t* pgopts = &popts->generator_opts;
|
||||
return lrec_reader_gen_alloc(pgopts->field_name, pgopts->start, pgopts->stop, pgopts->step);
|
||||
} else if (streq(popts->ifile_fmt, "dkvp")) {
|
||||
if (popts->use_mmap_for_read)
|
||||
return lrec_reader_mmap_dkvp_alloc(popts->irs, popts->ifs, popts->ips, popts->allow_repeat_ifs,
|
||||
popts->comment_handling, popts->comment_string);
|
||||
else
|
||||
return lrec_reader_stdio_dkvp_alloc(popts->irs, popts->ifs, popts->ips, popts->allow_repeat_ifs,
|
||||
popts->comment_handling, popts->comment_string);
|
||||
return lrec_reader_stdio_dkvp_alloc(popts->irs, popts->ifs, popts->ips, popts->allow_repeat_ifs,
|
||||
popts->comment_handling, popts->comment_string);
|
||||
} else if (streq(popts->ifile_fmt, "csv")) {
|
||||
if (popts->use_mmap_for_read)
|
||||
return lrec_reader_mmap_csv_alloc(popts->irs, popts->ifs, popts->use_implicit_csv_header,
|
||||
popts->allow_ragged_csv_input, popts->comment_handling, popts->comment_string);
|
||||
else
|
||||
return lrec_reader_stdio_csv_alloc(popts->irs, popts->ifs, popts->use_implicit_csv_header,
|
||||
popts->allow_ragged_csv_input, popts->comment_handling, popts->comment_string);
|
||||
return lrec_reader_stdio_csv_alloc(popts->irs, popts->ifs, popts->use_implicit_csv_header,
|
||||
popts->allow_ragged_csv_input, popts->comment_handling, popts->comment_string);
|
||||
} else if (streq(popts->ifile_fmt, "csvlite")) {
|
||||
if (popts->use_mmap_for_read)
|
||||
return lrec_reader_mmap_csvlite_alloc(popts->irs, popts->ifs, popts->allow_repeat_ifs,
|
||||
popts->use_implicit_csv_header, popts->allow_ragged_csv_input, popts->comment_handling,
|
||||
popts->comment_string);
|
||||
else
|
||||
return lrec_reader_stdio_csvlite_alloc(popts->irs, popts->ifs, popts->allow_repeat_ifs,
|
||||
popts->use_implicit_csv_header, popts->allow_ragged_csv_input, popts->comment_handling,
|
||||
popts->comment_string);
|
||||
return lrec_reader_stdio_csvlite_alloc(popts->irs, popts->ifs, popts->allow_repeat_ifs,
|
||||
popts->use_implicit_csv_header, popts->allow_ragged_csv_input, popts->comment_handling,
|
||||
popts->comment_string);
|
||||
} else if (streq(popts->ifile_fmt, "nidx")) {
|
||||
if (popts->use_mmap_for_read)
|
||||
return lrec_reader_mmap_nidx_alloc(popts->irs, popts->ifs, popts->allow_repeat_ifs,
|
||||
popts->comment_handling, popts->comment_string);
|
||||
else
|
||||
return lrec_reader_stdio_nidx_alloc(popts->irs, popts->ifs, popts->allow_repeat_ifs,
|
||||
popts->comment_handling, popts->comment_string);
|
||||
return lrec_reader_stdio_nidx_alloc(popts->irs, popts->ifs, popts->allow_repeat_ifs,
|
||||
popts->comment_handling, popts->comment_string);
|
||||
} else if (streq(popts->ifile_fmt, "xtab")) {
|
||||
// Use stdio-xtab for comment handling; not supported in the mmap-xtab reader.
|
||||
if (popts->use_mmap_for_read && popts->comment_string == NULL)
|
||||
return lrec_reader_mmap_xtab_alloc(popts->ifs, popts->ips, popts->allow_repeat_ips,
|
||||
popts->comment_handling, popts->comment_string);
|
||||
else
|
||||
return lrec_reader_stdio_xtab_alloc(popts->ifs, popts->ips, popts->allow_repeat_ips,
|
||||
popts->comment_handling, popts->comment_string);
|
||||
return lrec_reader_stdio_xtab_alloc(popts->ifs, popts->ips, popts->allow_repeat_ips,
|
||||
popts->comment_handling, popts->comment_string);
|
||||
} else if (streq(popts->ifile_fmt, "json")) {
|
||||
if (popts->use_mmap_for_read)
|
||||
return lrec_reader_mmap_json_alloc(popts->input_json_flatten_separator,
|
||||
popts->json_array_ingest, popts->irs, popts->comment_handling, popts->comment_string);
|
||||
else
|
||||
return lrec_reader_stdio_json_alloc(popts->input_json_flatten_separator,
|
||||
popts->json_array_ingest, popts->irs, popts->comment_handling, popts->comment_string);
|
||||
return lrec_reader_stdio_json_alloc(popts->input_json_flatten_separator,
|
||||
popts->json_array_ingest, popts->irs, popts->comment_handling, popts->comment_string);
|
||||
} else {
|
||||
return NULL;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -24,19 +24,6 @@ lrec_reader_t* lrec_reader_stdio_xtab_alloc(char* ifs, char* ips, int allow_repe
|
|||
lrec_reader_t* lrec_reader_stdio_json_alloc(char* input_json_flatten_separator, json_array_ingest_t json_array_ingest, char* line_term,
|
||||
comment_handling_t comment_handling, char* comment_string);
|
||||
|
||||
lrec_reader_t* lrec_reader_mmap_csv_alloc(char* irs, char* ifs, int use_implicit_csv_header,
|
||||
int allow_ragged_csv_input, comment_handling_t comment_handling, char* comment_string);
|
||||
lrec_reader_t* lrec_reader_mmap_csvlite_alloc(char* irs, char* ifs, int allow_repeat_ifs, int use_implicit_csv_header,
|
||||
int allow_ragged_csv_input, comment_handling_t comment_handling, char* comment_string);
|
||||
lrec_reader_t* lrec_reader_mmap_dkvp_alloc(char* irs, char* ifs, char* ips, int allow_repeat_ifs,
|
||||
comment_handling_t comment_handling, char* comment_string);
|
||||
lrec_reader_t* lrec_reader_mmap_nidx_alloc(char* irs, char* ifs, int allow_repeat_ifs,
|
||||
comment_handling_t comment_handling, char* comment_string);
|
||||
lrec_reader_t* lrec_reader_mmap_xtab_alloc(char* ifs, char* ips, int allow_repeat_ips,
|
||||
comment_handling_t comment_handling, char* comment_string);
|
||||
lrec_reader_t* lrec_reader_mmap_json_alloc(char* input_json_flatten_separator, json_array_ingest_t json_array_ingest, char* line_term,
|
||||
comment_handling_t comment_handling, char* comment_string);
|
||||
|
||||
lrec_reader_t* lrec_reader_in_memory_alloc(sllv_t* precords);
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -273,7 +273,7 @@ static int populate_from_nested_array(lrec_t* prec, json_value_t* pjson_array, c
|
|||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// * The buffer is an entire JSON blob, e.g. contents from stdio read or mmap; peof-psof is the file size so peof is one
|
||||
// * The buffer is an entire JSON blob, e.g. contents from stdio read; peof-psof is the file size so peof is one
|
||||
// byte *after* the last valid file byte.
|
||||
// * The buffer is not assumed to be null-terminated.
|
||||
// * Any lines beginning with comment_string are modified by poking space characters up to line_term.
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@
|
|||
int reference_json_objects_as_lrecs(sllv_t* precords, json_value_t* ptop_level_json, char* flatten_sep,
|
||||
json_array_ingest_t json_array_ingest);
|
||||
|
||||
// * The buffer is an entire JSON blob, e.g. contents from stdio read or mmap; peof-psof is the file size so peof is one
|
||||
// * The buffer is an entire JSON blob, e.g. contents from stdio read; peof-psof is the file size so peof is one
|
||||
// byte *after* the last valid file byte.
|
||||
// * The buffer is not assumed to be null-terminated.
|
||||
// * Any lines beginning with comment_string are modified by poking space characters up to line_term.
|
||||
|
|
|
|||
|
|
@ -1,112 +0,0 @@
|
|||
#include <string.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/stat.h>
|
||||
#include "lib/mlr_arch.h"
|
||||
#include "input/byte_readers.h"
|
||||
#include "lib/mlr_globals.h"
|
||||
#include "lib/mlrutil.h"
|
||||
|
||||
#if MLR_ARCH_MMAP_ENABLED
|
||||
static char empty_buf[1] = { 0 };
|
||||
#endif
|
||||
|
||||
// Private state for the mmap-backed byte reader, populated by the open function.
typedef struct _mmap_byte_reader_state_t {
	char* filename; // Heap-allocated copy of the file name, used in error messages
	int   fd;       // File descriptor returned by open()
	char* sof;      // Start of the file contents in memory
	char* p;        // Next byte to be returned by the read function
	char* eof;      // One byte past the last byte of the file contents
} mmap_byte_reader_state_t;
|
||||
|
||||
static int mmap_byte_reader_open_func(struct _byte_reader_t* pbr, char* prepipe, char* filename);
|
||||
static int mmap_byte_reader_read_func(struct _byte_reader_t* pbr);
|
||||
static void mmap_byte_reader_close_func(struct _byte_reader_t* pbr, char* prepipe);
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
byte_reader_t* mmap_byte_reader_alloc() {
|
||||
byte_reader_t* pbr = mlr_malloc_or_die(sizeof(byte_reader_t));
|
||||
|
||||
pbr->pvstate = NULL;
|
||||
pbr->popen_func = mmap_byte_reader_open_func;
|
||||
pbr->pread_func = mmap_byte_reader_read_func;
|
||||
pbr->pclose_func = mmap_byte_reader_close_func;
|
||||
|
||||
return pbr;
|
||||
}
|
||||
|
||||
// Frees the byte reader and, if a file was ever opened on it, the private
// state allocated by the open function (the state struct itself as well as
// its copy of the file name).
void mmap_byte_reader_free(byte_reader_t* pbr) {
	mmap_byte_reader_state_t* pstate = pbr->pvstate;
	if (pstate != NULL) {
		free(pstate->filename); // null-ok semantics
		free(pstate);           // allocated in the open function
	}
	free(pbr);
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
static int mmap_byte_reader_open_func(struct _byte_reader_t* pbr, char* prepipe, char* filename) {
|
||||
#if MLR_ARCH_MMAP_ENABLED
|
||||
// popen is a stdio construct, not an mmap construct, and it can't be supported here.
|
||||
if (prepipe != NULL) {
|
||||
fprintf(stderr, "%s: coding error detected in file %s at line %d.\n",
|
||||
MLR_GLOBALS.bargv0, __FILE__, __LINE__);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
mmap_byte_reader_state_t* pstate = mlr_malloc_or_die(sizeof(mmap_byte_reader_state_t));
|
||||
pstate->filename = mlr_strdup_or_die(filename);
|
||||
pstate->fd = open(filename, O_RDONLY);
|
||||
if (pstate->fd < 0) {
|
||||
perror("open");
|
||||
fprintf(stderr, "%s: Couldn't open \"%s\" for read.\n", MLR_GLOBALS.bargv0, filename);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
struct stat stat;
|
||||
if (fstat(pstate->fd, &stat) < 0) {
|
||||
perror("fstat");
|
||||
fprintf(stderr, "%s: could not fstat \"%s\"\n", MLR_GLOBALS.bargv0, filename);
|
||||
exit(1);
|
||||
}
|
||||
if (stat.st_size == 0) {
|
||||
// mmap doesn't allow us to map zero-length files but zero-length files do exist.
|
||||
pstate->sof = &empty_buf[0];
|
||||
} else {
|
||||
pstate->sof = mmap(NULL, (size_t)stat.st_size, PROT_READ|PROT_WRITE, MAP_FILE|MAP_PRIVATE,
|
||||
pstate->fd, (off_t)0);
|
||||
if (pstate->sof == MAP_FAILED) {
|
||||
perror("mmap");
|
||||
fprintf(stderr, "%s: could not mmap \"%s\"\n", MLR_GLOBALS.bargv0, filename);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
pstate->eof = pstate->sof + stat.st_size;
|
||||
pstate->p = pstate->sof;
|
||||
pbr->pvstate = pstate;
|
||||
return TRUE;
|
||||
#else
|
||||
fprintf(stderr, "%s: mmap is unsupported on this architecture.\n", MLR_GLOBALS.bargv0);
|
||||
exit(1);
|
||||
return TRUE;
|
||||
#endif
|
||||
}
|
||||
|
||||
static int mmap_byte_reader_read_func(struct _byte_reader_t* pbr) {
|
||||
mmap_byte_reader_state_t* pstate = pbr->pvstate;
|
||||
if (pstate->p >= pstate->eof) {
|
||||
return EOF;
|
||||
} else {
|
||||
int c = *pstate->p;
|
||||
pstate->p++;
|
||||
return c;
|
||||
}
|
||||
}
|
||||
|
||||
static void mmap_byte_reader_close_func(struct _byte_reader_t* pbr, char* prepipe) {
|
||||
mmap_byte_reader_state_t* pstate = pbr->pvstate;
|
||||
if (close(pstate->fd) < 0) {
|
||||
perror("close");
|
||||
fprintf(stderr, "%s: close error on file \"%s\".\n", MLR_GLOBALS.bargv0, pstate->filename);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
|
@ -23,14 +23,6 @@
|
|||
#define mlr_arch_getc(stream) getc_unlocked(stream)
|
||||
#endif
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
#ifdef MLR_ON_MSYS2
|
||||
#define MLR_ARCH_MMAP_ENABLED 0
|
||||
#else
|
||||
#define MLR_ARCH_MMAP_ENABLED 1
|
||||
#include <sys/mman.h>
|
||||
#endif
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
int mlr_arch_setenv(const char *name, const char *value);
|
||||
int mlr_arch_unsetenv(const char *name);
|
||||
|
|
|
|||
|
|
@ -107,8 +107,6 @@ static void mapper_join_usage(FILE* o, char* argv0, char* verb) {
|
|||
fprintf(o, " --ips {pair-separator character}\n");
|
||||
fprintf(o, " --repifs\n");
|
||||
fprintf(o, " --repips\n");
|
||||
fprintf(o, " --mmap\n");
|
||||
fprintf(o, " --no-mmap\n");
|
||||
fprintf(o, "Please use \"%s --usage-separator-options\" for information on specifying separators.\n",
|
||||
argv0);
|
||||
fprintf(o, "Please see http://johnkerl.org/miller/doc/reference.html for more information\n");
|
||||
|
|
@ -237,10 +235,6 @@ static mapper_t* mapper_join_parse_cli(int* pargi, int argc, char** argv,
|
|||
|
||||
cli_merge_reader_opts(&popts->reader_opts, pmain_reader_opts);
|
||||
|
||||
// popen is a stdio construct, not an mmap construct, and it can't be supported here.
|
||||
if (popts->prepipe != NULL)
|
||||
popts->reader_opts.use_mmap_for_read = FALSE;
|
||||
|
||||
if (popts->left_file_name == NULL) {
|
||||
fprintf(stderr, "%s %s: need left file name\n", MLR_GLOBALS.bargv0, verb);
|
||||
mapper_join_usage(stderr, argv[0], verb);
|
||||
|
|
|
|||
|
|
@ -47217,71 +47217,6 @@ a=1,b=2,c=3
|
|||
a=4,b=5,c=6
|
||||
|
||||
|
||||
================================================================
|
||||
MMAP AT PAGE BOUNDARIES
|
||||
|
||||
mlr --dkvp tail -n 4 ./reg_test/input/page-aligned-final-ifs.dkvp
|
||||
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc
|
||||
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc
|
||||
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc
|
||||
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=
|
||||
|
||||
mlr --dkvp tail -n 4 ./reg_test/input/page-aligned-final-irs.dkvp
|
||||
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc
|
||||
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc
|
||||
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc
|
||||
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc
|
||||
|
||||
mlr --dkvp tail -n 4 ./reg_test/input/page-aligned-final-no-ifs.dkvp
|
||||
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc
|
||||
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc
|
||||
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc
|
||||
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,3=z
|
||||
|
||||
mlr --dkvp tail -n 4 ./reg_test/input/page-aligned-no-final-irs.dkvp
|
||||
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc
|
||||
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc
|
||||
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=cccccccccccccccccccccccccccccccccccccccccccccccc
|
||||
x=aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,y=bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,z=ccccccccccccccccccccccccccccccccccccccccccccccccc
|
||||
|
||||
mlr --nidx tail -n 4 ./reg_test/input/page-aligned-no-final-irs.nidx
|
||||
11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,3333333333333333333333333333333333333333333
|
||||
11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,3333333333333333333333333333333333333333333
|
||||
11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,3333333333333333333333333333333333333333333
|
||||
11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,33333333333333333333333333333333333333333333
|
||||
|
||||
mlr --csvlite tail -n 4 ./reg_test/input/page-aligned-no-final-irs.csvl
|
||||
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,ccccccccccccccccccccccccccccccccccccccccccc
|
||||
11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,3333333333333333333333333333333333333333333
|
||||
11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,3333333333333333333333333333333333333333333
|
||||
11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,3333333333333333333333333333333333333333333
|
||||
11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,33333333333333333333333333333333333333333333
|
||||
|
||||
mlr --csv --rs lf tail -n 4 ./reg_test/input/page-aligned-no-final-irs.csvl
|
||||
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa,bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb,ccccccccccccccccccccccccccccccccccccccccccc
|
||||
11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,3333333333333333333333333333333333333333333
|
||||
11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,3333333333333333333333333333333333333333333
|
||||
11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,3333333333333333333333333333333333333333333
|
||||
11111111111111111111111111111111111111,22222222222222222222222222222222222222222222,33333333333333333333333333333333333333333333
|
||||
|
||||
mlr --xtab tail -n 4 ./reg_test/input/page-aligned-no-final-eol.xtab
|
||||
aaaaaaaaaaaaaaaaaaaa 111111111111111111111
|
||||
bbbbbbbbbbbbbbbbbbbb 22222222222222222222
|
||||
cccccccccccccccccccc 33333333333333333333
|
||||
|
||||
aaaaaaaaaaaaaaaaaaaa 111111111111111111111
|
||||
bbbbbbbbbbbbbbbbbbbb 22222222222222222222
|
||||
cccccccccccccccccccc 33333333333333333333
|
||||
|
||||
aaaaaaaaaaaaaaaaaaaa 111111111111111111111
|
||||
bbbbbbbbbbbbbbbbbbbb 22222222222222222222
|
||||
cccccccccccccccccccc 33333333333333333333
|
||||
|
||||
aaaaaaaaaaaaaaaaaaaa 111111111111111111111
|
||||
bbbbbbbbbbbbbbbbbbbb 22222222222222222222
|
||||
cccccccccccccccccccc 3333333333333333333333
|
||||
|
||||
|
||||
================================================================
|
||||
INT64 I/O
|
||||
|
||||
|
|
@ -47675,54 +47610,54 @@ x,"y""yy",z
|
|||
================================================================
|
||||
RFC-CSV
|
||||
|
||||
mlr --mmap --csv cat ./reg_test/input/rfc-csv/simple.csv-crlf
|
||||
mlr --csv cat ./reg_test/input/rfc-csv/simple.csv-crlf
|
||||
a,b,c
|
||||
1,x,3
|
||||
4,5,6
|
||||
x,"y""yy",z
|
||||
|
||||
mlr --mmap --csv cat ./reg_test/input/rfc-csv/simple-truncated.csv
|
||||
mlr --csv cat ./reg_test/input/rfc-csv/simple-truncated.csv
|
||||
a,b,c
|
||||
1,x,3
|
||||
4,5,6
|
||||
|
||||
mlr --mmap --csv cat ./reg_test/input/rfc-csv/narrow.csv
|
||||
mlr --csv cat ./reg_test/input/rfc-csv/narrow.csv
|
||||
a
|
||||
1
|
||||
2
|
||||
3
|
||||
4
|
||||
|
||||
mlr --mmap --csv cat ./reg_test/input/rfc-csv/narrow-truncated.csv
|
||||
mlr --csv cat ./reg_test/input/rfc-csv/narrow-truncated.csv
|
||||
a
|
||||
1
|
||||
2
|
||||
3
|
||||
4
|
||||
|
||||
mlr --mmap --csv cat ./reg_test/input/rfc-csv/quoted-comma.csv
|
||||
mlr --csv cat ./reg_test/input/rfc-csv/quoted-comma.csv
|
||||
a,b,c
|
||||
1,"x,3",y
|
||||
4,5,6
|
||||
|
||||
mlr --mmap --csv cat ./reg_test/input/rfc-csv/quoted-comma-truncated.csv
|
||||
mlr --csv cat ./reg_test/input/rfc-csv/quoted-comma-truncated.csv
|
||||
a,b,c
|
||||
1,"x,3",y
|
||||
4,5,6
|
||||
|
||||
mlr --mmap --csv cat ./reg_test/input/rfc-csv/quoted-crlf.csv
|
||||
mlr --csv cat ./reg_test/input/rfc-csv/quoted-crlf.csv
|
||||
a,b,c
|
||||
1,"x
|
||||
3",y
|
||||
4,5,6
|
||||
|
||||
mlr --mmap --csv cat ./reg_test/input/rfc-csv/quoted-crlf-truncated.csv
|
||||
mlr --csv cat ./reg_test/input/rfc-csv/quoted-crlf-truncated.csv
|
||||
a,b,c
|
||||
1,"x
|
||||
3",y
|
||||
4,5,6
|
||||
|
||||
mlr --mmap --csv cat ./reg_test/input/rfc-csv/simple-truncated.csv ./reg_test/input/rfc-csv/simple.csv-crlf
|
||||
mlr --csv cat ./reg_test/input/rfc-csv/simple-truncated.csv ./reg_test/input/rfc-csv/simple.csv-crlf
|
||||
a,b,c
|
||||
1,x,3
|
||||
4,5,6
|
||||
|
|
@ -47730,7 +47665,7 @@ a,b,c
|
|||
4,5,6
|
||||
x,"y""yy",z
|
||||
|
||||
mlr --mmap --csv --ifs semicolon --ofs pipe --irs lf --ors lflf cut -x -f b ./reg_test/input/rfc-csv/modify-defaults.csv
|
||||
mlr --csv --ifs semicolon --ofs pipe --irs lf --ors lflf cut -x -f b ./reg_test/input/rfc-csv/modify-defaults.csv
|
||||
a|c
|
||||
|
||||
1|3
|
||||
|
|
@ -47738,22 +47673,13 @@ a|c
|
|||
4|6
|
||||
|
||||
|
||||
mlr --mmap --csv --rs lf --quote-original cut -o -f c,b,a ./reg_test/input/quote-original.csv
|
||||
mlr --csv --rs lf --quote-original cut -o -f c,b,a ./reg_test/input/quote-original.csv
|
||||
c,b,a
|
||||
3,2,1
|
||||
6,"5",4
|
||||
"9",8,"7"
|
||||
|
||||
mlr --mmap --icsv --oxtab cat ./reg_test/input/comma-at-eof.csv
|
||||
a 1
|
||||
b 2
|
||||
c 3
|
||||
|
||||
a 4
|
||||
b 5
|
||||
c
|
||||
|
||||
mlr --no-mmap --icsv --oxtab cat ./reg_test/input/comma-at-eof.csv
|
||||
mlr --icsv --oxtab cat ./reg_test/input/comma-at-eof.csv
|
||||
a 1
|
||||
b 2
|
||||
c 3
|
||||
|
|
@ -47818,7 +47744,7 @@ c i
|
|||
================================================================
|
||||
RAGGED NON-RFC CSV
|
||||
|
||||
mlr --mmap --icsv --oxtab --ragged cat ./reg_test/input/ragged.csv
|
||||
mlr --icsv --oxtab --ragged cat ./reg_test/input/ragged.csv
|
||||
a 1
|
||||
b 2
|
||||
c 3
|
||||
|
|
@ -47832,35 +47758,7 @@ b 7
|
|||
c 8
|
||||
4 9
|
||||
|
||||
mlr --no-mmap --icsv --oxtab --ragged cat ./reg_test/input/ragged.csv
|
||||
a 1
|
||||
b 2
|
||||
c 3
|
||||
|
||||
a 4
|
||||
b 5
|
||||
c
|
||||
|
||||
a 6
|
||||
b 7
|
||||
c 8
|
||||
4 9
|
||||
|
||||
mlr --mmap --icsvlite --oxtab --ragged cat ./reg_test/input/ragged.csv
|
||||
a 1
|
||||
b 2
|
||||
c 3
|
||||
|
||||
a 4
|
||||
b 5
|
||||
c
|
||||
|
||||
a 6
|
||||
b 7
|
||||
c 8
|
||||
4 9
|
||||
|
||||
mlr --no-mmap --icsvlite --oxtab --ragged cat ./reg_test/input/ragged.csv
|
||||
mlr --icsvlite --oxtab --ragged cat ./reg_test/input/ragged.csv
|
||||
a 1
|
||||
b 2
|
||||
c 3
|
||||
|
|
@ -48177,9 +48075,9 @@ i 4
|
|||
================================================================
|
||||
MULTI-CHARACTER SEPARATORS FOR XTAB
|
||||
|
||||
mlr --mmap --xtab --ifs crlf --ofs Z cut -x -f b ./reg_test/input/truncated.xtab-crlf
|
||||
mlr --xtab --ifs crlf --ofs Z cut -x -f b ./reg_test/input/truncated.xtab-crlf
|
||||
a 1Zc 3ZZd 4Ze 5Z
|
||||
mlr --mmap --xtab --ips . --ops @ cut -x -f b ./reg_test/input/dots.xtab
|
||||
mlr --xtab --ips . --ops @ cut -x -f b ./reg_test/input/dots.xtab
|
||||
a@1
|
||||
c@345
|
||||
|
||||
|
|
@ -48195,12 +48093,7 @@ sum@@@@3
|
|||
================================================================
|
||||
EMBEDDED IPS FOR XTAB
|
||||
|
||||
mlr --xtab --mmap cat ./reg_test/input/embedded-ips.xtab
|
||||
a 1
|
||||
b 2
|
||||
c 3 4 5
|
||||
|
||||
mlr --xtab --no-mmap cat ./reg_test/input/embedded-ips.xtab
|
||||
mlr --xtab cat ./reg_test/input/embedded-ips.xtab
|
||||
a 1
|
||||
b 2
|
||||
c 3 4 5
|
||||
|
|
@ -48374,7 +48267,7 @@ mlr --opprint --barred --right cat ./reg_test/input/abixy-het
|
|||
================================================================
|
||||
MULTI-CHARACTER IXS SPECIFIERS
|
||||
|
||||
mlr --oxtab --idkvp --mmap --irs lf --ifs , --ips = cut -o -f x,a,i ./reg_test/input/multi-sep.dkvp-crlf
|
||||
mlr --oxtab --idkvp --irs lf --ifs , --ips = cut -o -f x,a,i ./reg_test/input/multi-sep.dkvp-crlf
|
||||
x :0.641593543645736508/
|
||||
a :wye/
|
||||
i :0/
|
||||
|
|
@ -48395,7 +48288,7 @@ x :0.676537984365847889/
|
|||
a :zee/
|
||||
i :4/
|
||||
|
||||
mlr --oxtab --idkvp --mmap --irs lf --ifs /, --ips =: cut -o -f x,a,i ./reg_test/input/multi-sep.dkvp-crlf
|
||||
mlr --oxtab --idkvp --irs lf --ifs /, --ips =: cut -o -f x,a,i ./reg_test/input/multi-sep.dkvp-crlf
|
||||
x 0.641593543645736508
|
||||
a wye
|
||||
i 0
|
||||
|
|
@ -49746,7 +49639,7 @@ a=hat,b=wye,i=9,x=0.03144187646093577,y=0.7495507603507059
|
|||
a=pan,b=wye,i=10,x=0.5026260055412137,y=0.9526183602969864
|
||||
|
||||
|
||||
---------------------------------------------------------------- mmap nidx
|
||||
---------------------------------------------------------------- nidx
|
||||
mlr --irs auto --ors lf --nidx --fs comma cat ./reg_test/input/line-term-lf.dkvp
|
||||
a=pan,b=pan,i=1,x=0.3467901443380824,y=0.7268028627434533
|
||||
a=eks,b=pan,i=2,x=0.7586799647899636,y=0.5221511083334797
|
||||
|
|
@ -49796,7 +49689,7 @@ a=hat,b=wye,i=9,x=0.03144187646093577,y=0.7495507603507059
|
|||
a=pan,b=wye,i=10,x=0.5026260055412137,y=0.9526183602969864
|
||||
|
||||
|
||||
---------------------------------------------------------------- mmap csvlite
|
||||
---------------------------------------------------------------- csvlite
|
||||
mlr --irs auto --ors lf --csvlite cat ./reg_test/input/line-term-lf.csv
|
||||
a,b,i,x,y
|
||||
pan,pan,1,0.3467901443380824,0.7268028627434533
|
||||
|
|
@ -49850,7 +49743,7 @@ hat,wye,9,0.03144187646093577,0.7495507603507059
|
|||
pan,wye,10,0.5026260055412137,0.9526183602969864
|
||||
|
||||
|
||||
---------------------------------------------------------------- mmap pprint
|
||||
---------------------------------------------------------------- pprint
|
||||
mlr --irs auto --ors lf --pprint cat ./reg_test/input/line-term-lf.csv
|
||||
a,b,i,x,y
|
||||
pan,pan,1,0.3467901443380824,0.7268028627434533
|
||||
|
|
@ -49904,7 +49797,7 @@ hat,wye,9,0.03144187646093577,0.7495507603507059
|
|||
pan,wye,10,0.5026260055412137,0.9526183602969864
|
||||
|
||||
|
||||
---------------------------------------------------------------- mmap xtab
|
||||
---------------------------------------------------------------- xtab
|
||||
mlr --ifs auto --xtab cat ./reg_test/input/line-term-lf.xtab
|
||||
a pan
|
||||
b pan
|
||||
|
|
@ -50150,7 +50043,7 @@ x 0.5026260055412137
|
|||
y 0.9526183602969864
|
||||
|
||||
|
||||
---------------------------------------------------------------- mmap xtab
|
||||
---------------------------------------------------------------- xtab
|
||||
mlr --ifs auto --xtab cat ./reg_test/input/line-term-lf.xtab
|
||||
a pan
|
||||
b pan
|
||||
|
|
@ -50396,7 +50289,7 @@ x 0.5026260055412137
|
|||
y 0.9526183602969864
|
||||
|
||||
|
||||
---------------------------------------------------------------- mmap csv
|
||||
---------------------------------------------------------------- csv
|
||||
mlr --irs auto --ors lf --csv cat ./reg_test/input/line-term-lf.csv
|
||||
a,b,i,x,y
|
||||
pan,pan,1,0.3467901443380824,0.7268028627434533
|
||||
|
|
@ -50450,7 +50343,7 @@ hat,wye,9,0.03144187646093577,0.7495507603507059
|
|||
pan,wye,10,0.5026260055412137,0.9526183602969864
|
||||
|
||||
|
||||
---------------------------------------------------------------- mmap json nowrap nostack
|
||||
---------------------------------------------------------------- json nowrap nostack
|
||||
mlr --irs auto --ors lf --json cat ./reg_test/input/line-term-lf.json
|
||||
{ "a": "pan", "b": "pan", "i": 1, "x": 0.3467901443380824, "y": 0.7268028627434533 }
|
||||
{ "a": "eks", "b": "pan", "i": 2, "x": 0.7586799647899636, "y": 0.5221511083334797 }
|
||||
|
|
@ -50500,7 +50393,7 @@ mlr --json cat ./reg_test/input/line-term-crlf.json
|
|||
{ "a": "pan", "b": "wye", "i": 10, "x": 0.5026260055412137, "y": 0.9526183602969864 }
|
||||
|
||||
|
||||
---------------------------------------------------------------- mmap json yeswrap nostack
|
||||
---------------------------------------------------------------- json yeswrap nostack
|
||||
mlr --irs auto --ors lf --jlistwrap --json cat ./reg_test/input/line-term-lf-wrap.json
|
||||
[
|
||||
{ "a": "pan", "b": "pan", "i": 1, "x": 0.3467901443380824, "y": 0.7268028627434533 }
|
||||
|
|
@ -50558,7 +50451,7 @@ mlr --jlistwrap --json cat ./reg_test/input/line-term-crlf-wrap.json
|
|||
]
|
||||
|
||||
|
||||
---------------------------------------------------------------- mmap json nowrap yesstack
|
||||
---------------------------------------------------------------- json nowrap yesstack
|
||||
mlr --irs auto --json --jvstack cat ./reg_test/input/line-term-lf.json
|
||||
{
|
||||
"a": "pan",
|
||||
|
|
@ -50848,7 +50741,7 @@ mlr --json --jvstack cat ./reg_test/input/line-term-crlf.json
|
|||
}
|
||||
|
||||
|
||||
---------------------------------------------------------------- mmap json yeswrap yesstack
|
||||
---------------------------------------------------------------- json yeswrap yesstack
|
||||
mlr --irs auto --ors lf --jlistwrap --json --jvstack cat ./reg_test/input/line-term-lf-wrap.json
|
||||
[
|
||||
{
|
||||
|
|
@ -51146,7 +51039,7 @@ mlr --jlistwrap --json --jvstack cat ./reg_test/input/line-term-crlf-wrap.json
|
|||
]
|
||||
|
||||
|
||||
---------------------------------------------------------------- mmap json nowrap nostack
|
||||
---------------------------------------------------------------- json nowrap nostack
|
||||
mlr --irs auto --ors lf --json cat ./reg_test/input/line-term-lf.json
|
||||
{ "a": "pan", "b": "pan", "i": 1, "x": 0.3467901443380824, "y": 0.7268028627434533 }
|
||||
{ "a": "eks", "b": "pan", "i": 2, "x": 0.7586799647899636, "y": 0.5221511083334797 }
|
||||
|
|
@ -51196,7 +51089,7 @@ mlr --json cat ./reg_test/input/line-term-crlf.json
|
|||
{ "a": "pan", "b": "wye", "i": 10, "x": 0.5026260055412137, "y": 0.9526183602969864 }
|
||||
|
||||
|
||||
---------------------------------------------------------------- mmap json yeswrap nostack
|
||||
---------------------------------------------------------------- json yeswrap nostack
|
||||
mlr --irs auto --ors lf --jlistwrap --json cat ./reg_test/input/line-term-lf-wrap.json
|
||||
[
|
||||
{ "a": "pan", "b": "pan", "i": 1, "x": 0.3467901443380824, "y": 0.7268028627434533 }
|
||||
|
|
@ -51254,7 +51147,7 @@ mlr --jlistwrap --json cat ./reg_test/input/line-term-crlf-wrap.json
|
|||
]
|
||||
|
||||
|
||||
---------------------------------------------------------------- mmap json nowrap yesstack
|
||||
---------------------------------------------------------------- json nowrap yesstack
|
||||
mlr --irs auto --ors lf --json --jvstack cat ./reg_test/input/line-term-lf.json
|
||||
{
|
||||
"a": "pan",
|
||||
|
|
@ -51544,7 +51437,7 @@ mlr --json --jvstack cat ./reg_test/input/line-term-crlf.json
|
|||
}
|
||||
|
||||
|
||||
---------------------------------------------------------------- mmap json yeswrap yesstack
|
||||
---------------------------------------------------------------- json yeswrap yesstack
|
||||
mlr --irs auto --ors lf --jlistwrap --json --jvstack cat ./reg_test/input/line-term-lf-wrap.json
|
||||
[
|
||||
{
|
||||
|
|
|
|||
|
|
@ -44,12 +44,6 @@ if [ "$1" = "--valgrind" ]; then
|
|||
# ../tools/clean-valg can be used to filter the output.
|
||||
path_to_mlr="valgrind --leak-check=full ${path_to_mlr}g"
|
||||
path_to_mlr_for_auxents="$path_to_mlr"
|
||||
elif [ "$1" = "--no-mmap" ]; then
|
||||
path_to_mlr_for_auxents="${path_to_mlr}"
|
||||
path_to_mlr="${path_to_mlr} --no-mmap"
|
||||
elif [ "$1" = "--valgrind-no-mmap" ]; then
|
||||
path_to_mlr="valgrind --leak-check=full ${path_to_mlr}g --no-mmap"
|
||||
path_to_mlr_for_auxents="valgrind --leak-check=full ${path_to_mlr}g"
|
||||
fi
|
||||
echo Using mlr executable $path_to_mlr
|
||||
|
||||
|
|
@ -5755,18 +5749,6 @@ mention pass comments1-crlf.csv
|
|||
run_mlr --pass-comments --icsv --odkvp cat < $outdir/comments1-crlf.csv
|
||||
run_mlr --pass-comments --icsv --odkvp cat $outdir/comments1-crlf.csv
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
announce MMAP AT PAGE BOUNDARIES
|
||||
|
||||
run_mlr --dkvp tail -n 4 $indir/page-aligned-final-ifs.dkvp
|
||||
run_mlr --dkvp tail -n 4 $indir/page-aligned-final-irs.dkvp
|
||||
run_mlr --dkvp tail -n 4 $indir/page-aligned-final-no-ifs.dkvp
|
||||
run_mlr --dkvp tail -n 4 $indir/page-aligned-no-final-irs.dkvp
|
||||
run_mlr --nidx tail -n 4 $indir/page-aligned-no-final-irs.nidx
|
||||
run_mlr --csvlite tail -n 4 $indir/page-aligned-no-final-irs.csvl
|
||||
run_mlr --csv --rs lf tail -n 4 $indir/page-aligned-no-final-irs.csvl
|
||||
run_mlr --xtab tail -n 4 $indir/page-aligned-no-final-eol.xtab
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
announce INT64 I/O
|
||||
|
||||
|
|
@ -5797,20 +5779,19 @@ run_mlr --csv cat < $indir/rfc-csv/simple.csv-crlf
|
|||
# ----------------------------------------------------------------
|
||||
announce RFC-CSV
|
||||
|
||||
run_mlr --mmap --csv cat $indir/rfc-csv/simple.csv-crlf
|
||||
run_mlr --mmap --csv cat $indir/rfc-csv/simple-truncated.csv
|
||||
run_mlr --mmap --csv cat $indir/rfc-csv/narrow.csv
|
||||
run_mlr --mmap --csv cat $indir/rfc-csv/narrow-truncated.csv
|
||||
run_mlr --mmap --csv cat $indir/rfc-csv/quoted-comma.csv
|
||||
run_mlr --mmap --csv cat $indir/rfc-csv/quoted-comma-truncated.csv
|
||||
run_mlr --mmap --csv cat $indir/rfc-csv/quoted-crlf.csv
|
||||
run_mlr --mmap --csv cat $indir/rfc-csv/quoted-crlf-truncated.csv
|
||||
run_mlr --mmap --csv cat $indir/rfc-csv/simple-truncated.csv $indir/rfc-csv/simple.csv-crlf
|
||||
run_mlr --mmap --csv --ifs semicolon --ofs pipe --irs lf --ors lflf cut -x -f b $indir/rfc-csv/modify-defaults.csv
|
||||
run_mlr --mmap --csv --rs lf --quote-original cut -o -f c,b,a $indir/quote-original.csv
|
||||
run_mlr --csv cat $indir/rfc-csv/simple.csv-crlf
|
||||
run_mlr --csv cat $indir/rfc-csv/simple-truncated.csv
|
||||
run_mlr --csv cat $indir/rfc-csv/narrow.csv
|
||||
run_mlr --csv cat $indir/rfc-csv/narrow-truncated.csv
|
||||
run_mlr --csv cat $indir/rfc-csv/quoted-comma.csv
|
||||
run_mlr --csv cat $indir/rfc-csv/quoted-comma-truncated.csv
|
||||
run_mlr --csv cat $indir/rfc-csv/quoted-crlf.csv
|
||||
run_mlr --csv cat $indir/rfc-csv/quoted-crlf-truncated.csv
|
||||
run_mlr --csv cat $indir/rfc-csv/simple-truncated.csv $indir/rfc-csv/simple.csv-crlf
|
||||
run_mlr --csv --ifs semicolon --ofs pipe --irs lf --ors lflf cut -x -f b $indir/rfc-csv/modify-defaults.csv
|
||||
run_mlr --csv --rs lf --quote-original cut -o -f c,b,a $indir/quote-original.csv
|
||||
|
||||
run_mlr --mmap --icsv --oxtab cat $indir/comma-at-eof.csv
|
||||
run_mlr --no-mmap --icsv --oxtab cat $indir/comma-at-eof.csv
|
||||
run_mlr --icsv --oxtab cat $indir/comma-at-eof.csv
|
||||
|
||||
run_mlr --csv --quote-all cat $indir/rfc-csv/simple.csv-crlf
|
||||
run_mlr --csv --quote-original cat $indir/rfc-csv/simple.csv-crlf
|
||||
|
|
@ -5822,10 +5803,8 @@ run_mlr --iusv --oxtab cat $indir/example.usv
|
|||
# ----------------------------------------------------------------
|
||||
announce RAGGED NON-RFC CSV
|
||||
|
||||
run_mlr --mmap --icsv --oxtab --ragged cat $indir/ragged.csv
|
||||
run_mlr --no-mmap --icsv --oxtab --ragged cat $indir/ragged.csv
|
||||
run_mlr --mmap --icsvlite --oxtab --ragged cat $indir/ragged.csv
|
||||
run_mlr --no-mmap --icsvlite --oxtab --ragged cat $indir/ragged.csv
|
||||
run_mlr --icsv --oxtab --ragged cat $indir/ragged.csv
|
||||
run_mlr --icsvlite --oxtab --ragged cat $indir/ragged.csv
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
announce MARKDOWN OUTPUT
|
||||
|
|
@ -5866,15 +5845,14 @@ run_mlr --oxtab --icsvlite --irs crlf --ifs /, cut -o -f x,a,i $indir/multi-s
|
|||
# ----------------------------------------------------------------
|
||||
announce MULTI-CHARACTER SEPARATORS FOR XTAB
|
||||
|
||||
run_mlr --mmap --xtab --ifs crlf --ofs Z cut -x -f b $indir/truncated.xtab-crlf
|
||||
run_mlr --mmap --xtab --ips . --ops @ cut -x -f b $indir/dots.xtab
|
||||
run_mlr --xtab --ifs crlf --ofs Z cut -x -f b $indir/truncated.xtab-crlf
|
||||
run_mlr --xtab --ips . --ops @ cut -x -f b $indir/dots.xtab
|
||||
run_mlr --xtab --ips ": " --ops '@@@@' put '$sum=int($a+$b)' $indir/multi-ips.dkvp
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
announce EMBEDDED IPS FOR XTAB
|
||||
|
||||
run_mlr --xtab --mmap cat $indir/embedded-ips.xtab
|
||||
run_mlr --xtab --no-mmap cat $indir/embedded-ips.xtab
|
||||
run_mlr --xtab cat $indir/embedded-ips.xtab
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
announce MULTI-CHARACTER IRS FOR PPRINT
|
||||
|
|
@ -5893,8 +5871,8 @@ run_mlr --opprint --barred --right cat $indir/abixy-het
|
|||
# ----------------------------------------------------------------
|
||||
announce MULTI-CHARACTER IXS SPECIFIERS
|
||||
|
||||
run_mlr --oxtab --idkvp --mmap --irs lf --ifs '\x2c' --ips '\075' cut -o -f x,a,i $indir/multi-sep.dkvp-crlf
|
||||
run_mlr --oxtab --idkvp --mmap --irs lf --ifs /, --ips '\x3d\x3a' cut -o -f x,a,i $indir/multi-sep.dkvp-crlf
|
||||
run_mlr --oxtab --idkvp --irs lf --ifs '\x2c' --ips '\075' cut -o -f x,a,i $indir/multi-sep.dkvp-crlf
|
||||
run_mlr --oxtab --idkvp --irs lf --ifs /, --ips '\x3d\x3a' cut -o -f x,a,i $indir/multi-sep.dkvp-crlf
|
||||
|
||||
# ----------------------------------------------------------------
|
||||
announce JSON I/O
|
||||
|
|
@ -6010,96 +5988,96 @@ run_mlr --irs auto --ors lf cat $indir/line-term-crlf.dkvp
|
|||
run_mlr cat $indir/line-term-lf.dkvp
|
||||
run_mlr cat $indir/line-term-crlf.dkvp
|
||||
|
||||
mention mmap nidx
|
||||
mention nidx
|
||||
run_mlr --irs auto --ors lf --nidx --fs comma cat $indir/line-term-lf.dkvp
|
||||
run_mlr --irs auto --ors lf --nidx --fs comma cat $indir/line-term-crlf.dkvp
|
||||
run_mlr --nidx --fs comma cat $indir/line-term-lf.dkvp
|
||||
run_mlr --nidx --fs comma cat $indir/line-term-crlf.dkvp
|
||||
|
||||
|
||||
mention mmap csvlite
|
||||
mention csvlite
|
||||
run_mlr --irs auto --ors lf --csvlite cat $indir/line-term-lf.csv
|
||||
run_mlr --irs auto --ors lf --csvlite cat $indir/line-term-crlf.csv
|
||||
run_mlr --csvlite cat $indir/line-term-lf.csv
|
||||
run_mlr --csvlite cat $indir/line-term-crlf.csv
|
||||
|
||||
|
||||
mention mmap pprint
|
||||
mention pprint
|
||||
run_mlr --irs auto --ors lf --pprint cat $indir/line-term-lf.csv
|
||||
run_mlr --irs auto --ors lf --pprint cat $indir/line-term-crlf.csv
|
||||
run_mlr --pprint cat $indir/line-term-lf.csv
|
||||
run_mlr --pprint cat $indir/line-term-crlf.csv
|
||||
|
||||
|
||||
mention mmap xtab
|
||||
mention xtab
|
||||
run_mlr --ifs auto --xtab cat $indir/line-term-lf.xtab
|
||||
run_mlr --ifs auto --xtab cat $indir/line-term-crlf.xtab
|
||||
run_mlr --fs auto --xtab cat $indir/line-term-lf.xtab
|
||||
run_mlr --fs auto --xtab cat $indir/line-term-crlf.xtab
|
||||
|
||||
mention mmap xtab
|
||||
mention xtab
|
||||
run_mlr --ifs auto --xtab cat $indir/line-term-lf.xtab
|
||||
run_mlr --ifs auto --xtab cat $indir/line-term-crlf.xtab
|
||||
run_mlr --fs auto --xtab cat $indir/line-term-lf.xtab
|
||||
run_mlr --fs auto --xtab cat $indir/line-term-crlf.xtab
|
||||
|
||||
|
||||
mention mmap csv
|
||||
mention csv
|
||||
run_mlr --irs auto --ors lf --csv cat $indir/line-term-lf.csv
|
||||
run_mlr --irs auto --ors lf --csv cat $indir/line-term-crlf.csv
|
||||
run_mlr --csv cat $indir/line-term-lf.csv
|
||||
run_mlr --csv cat $indir/line-term-crlf.csv
|
||||
|
||||
|
||||
mention mmap json nowrap nostack
|
||||
mention json nowrap nostack
|
||||
run_mlr --irs auto --ors lf --json cat $indir/line-term-lf.json
|
||||
run_mlr --irs auto --ors lf --json cat $indir/line-term-crlf.json
|
||||
run_mlr --json cat $indir/line-term-lf.json
|
||||
run_mlr --json cat $indir/line-term-crlf.json
|
||||
|
||||
|
||||
mention mmap json yeswrap nostack
|
||||
mention json yeswrap nostack
|
||||
run_mlr --irs auto --ors lf --jlistwrap --json cat $indir/line-term-lf-wrap.json
|
||||
run_mlr --irs auto --ors lf --jlistwrap --json cat $indir/line-term-crlf-wrap.json
|
||||
run_mlr --jlistwrap --json cat $indir/line-term-lf-wrap.json
|
||||
run_mlr --jlistwrap --json cat $indir/line-term-crlf-wrap.json
|
||||
|
||||
|
||||
mention mmap json nowrap yesstack
|
||||
mention json nowrap yesstack
|
||||
run_mlr --irs auto --json --jvstack cat $indir/line-term-lf.json
|
||||
run_mlr --irs auto --ors lf --json --jvstack cat $indir/line-term-crlf.json
|
||||
run_mlr --json --jvstack cat $indir/line-term-lf.json
|
||||
run_mlr --json --jvstack cat $indir/line-term-crlf.json
|
||||
|
||||
|
||||
mention mmap json yeswrap yesstack
|
||||
mention json yeswrap yesstack
|
||||
run_mlr --irs auto --ors lf --jlistwrap --json --jvstack cat $indir/line-term-lf-wrap.json
|
||||
run_mlr --irs auto --ors lf --jlistwrap --json --jvstack cat $indir/line-term-crlf-wrap.json
|
||||
run_mlr --jlistwrap --json --jvstack cat $indir/line-term-lf-wrap.json
|
||||
run_mlr --jlistwrap --json --jvstack cat $indir/line-term-crlf-wrap.json
|
||||
|
||||
mention mmap json nowrap nostack
|
||||
mention json nowrap nostack
|
||||
run_mlr --irs auto --ors lf --json cat $indir/line-term-lf.json
|
||||
run_mlr --irs auto --ors lf --json cat $indir/line-term-crlf.json
|
||||
run_mlr --json cat $indir/line-term-lf.json
|
||||
run_mlr --json cat $indir/line-term-crlf.json
|
||||
|
||||
|
||||
mention mmap json yeswrap nostack
|
||||
mention json yeswrap nostack
|
||||
run_mlr --irs auto --ors lf --jlistwrap --json cat $indir/line-term-lf-wrap.json
|
||||
run_mlr --irs auto --ors lf --jlistwrap --json cat $indir/line-term-crlf-wrap.json
|
||||
run_mlr --jlistwrap --json cat $indir/line-term-lf-wrap.json
|
||||
run_mlr --jlistwrap --json cat $indir/line-term-crlf-wrap.json
|
||||
|
||||
|
||||
mention mmap json nowrap yesstack
|
||||
mention json nowrap yesstack
|
||||
run_mlr --irs auto --ors lf --json --jvstack cat $indir/line-term-lf.json
|
||||
run_mlr --irs auto --ors lf --json --jvstack cat $indir/line-term-crlf.json
|
||||
run_mlr --json --jvstack cat $indir/line-term-lf.json
|
||||
run_mlr --json --jvstack cat $indir/line-term-crlf.json
|
||||
|
||||
|
||||
mention mmap json yeswrap yesstack
|
||||
mention json yeswrap yesstack
|
||||
run_mlr --irs auto --ors lf --jlistwrap --json --jvstack cat $indir/line-term-lf-wrap.json
|
||||
run_mlr --irs auto --ors lf --jlistwrap --json --jvstack cat $indir/line-term-crlf-wrap.json
|
||||
run_mlr --jlistwrap --json --jvstack cat $indir/line-term-lf-wrap.json
|
||||
|
|
|
|||
|
|
@ -116,92 +116,12 @@ static char* test_stdio_byte_reader_reuse() {
|
|||
return NULL;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// Checks that an mmap byte reader opened on an empty file reports EOF on
// the first read and on every read after it. The body is compiled only
// when mmap support is enabled for the target architecture.
static char* test_mmap_byte_reader_1() {
#if MLR_ARCH_MMAP_ENABLED
	byte_reader_t* pbr = mmap_byte_reader_alloc();

	char* contents = "";
	char* path = write_temp_file_or_die(contents);
	int ok = pbr->popen_func(pbr, NULL, path);
	mu_assert_lf(ok == TRUE);
	mu_assert_lf(pbr->pread_func(pbr) == EOF);
	mu_assert_lf(pbr->pread_func(pbr) == EOF);
	mu_assert_lf(pbr->pread_func(pbr) == EOF);
	unlink_file_or_die(path);
#endif
	// The return must sit outside the conditional: otherwise, with mmap
	// disabled, control would fall off the end of a non-void function.
	return NULL;
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// Checks that an mmap byte reader opened on a seven-byte file returns each
// byte in order and then reports EOF on every subsequent read. The body is
// compiled only when mmap support is enabled for the target architecture.
static char* test_mmap_byte_reader_2() {
#if MLR_ARCH_MMAP_ENABLED
	byte_reader_t* pbr = mmap_byte_reader_alloc();

	char* contents = "abcdefg";
	char* path = write_temp_file_or_die(contents);
	int ok = pbr->popen_func(pbr, NULL, path);
	mu_assert_lf(ok == TRUE);
	mu_assert_lf(pbr->pread_func(pbr) == 'a');
	mu_assert_lf(pbr->pread_func(pbr) == 'b');
	mu_assert_lf(pbr->pread_func(pbr) == 'c');
	mu_assert_lf(pbr->pread_func(pbr) == 'd');
	mu_assert_lf(pbr->pread_func(pbr) == 'e');
	mu_assert_lf(pbr->pread_func(pbr) == 'f');
	mu_assert_lf(pbr->pread_func(pbr) == 'g');
	mu_assert_lf(pbr->pread_func(pbr) == EOF);
	mu_assert_lf(pbr->pread_func(pbr) == EOF);
	mu_assert_lf(pbr->pread_func(pbr) == EOF);
	unlink_file_or_die(path);
#endif
	// The return must sit outside the conditional: otherwise, with mmap
	// disabled, control would fall off the end of a non-void function.
	return NULL;
}
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
// Checks that a single mmap byte reader can be opened a second time on a
// different file: it first reads "abc" through to EOF, then is re-opened on a
// file written from "defg" and must return exactly those bytes followed by EOF.
// The test body is compiled only when MLR_ARCH_MMAP_ENABLED is nonzero;
// otherwise the function does nothing.
// Returns NULL when it runs to completion. NOTE(review): mu_assert_lf is
// defined elsewhere; presumably it returns a failure message early -- confirm.
static char* test_mmap_byte_reader_reuse() {
#if MLR_ARCH_MMAP_ENABLED
	byte_reader_t* pbr = mmap_byte_reader_alloc();

	// First use of the reader.
	char* contents = "abc";
	char* path = write_temp_file_or_die(contents);
	int ok = pbr->popen_func(pbr, NULL, path);
	mu_assert_lf(ok == TRUE);
	mu_assert_lf(pbr->pread_func(pbr) == 'a');
	mu_assert_lf(pbr->pread_func(pbr) == 'b');
	mu_assert_lf(pbr->pread_func(pbr) == 'c');
	mu_assert_lf(pbr->pread_func(pbr) == EOF);
	mu_assert_lf(pbr->pread_func(pbr) == EOF);
	mu_assert_lf(pbr->pread_func(pbr) == EOF);
	unlink_file_or_die(path);

	// Second use: the same reader object is opened on a new file.
	contents = "defg";
	path = write_temp_file_or_die(contents);
	ok = pbr->popen_func(pbr, NULL, path);
	mu_assert_lf(ok == TRUE);
	mu_assert_lf(pbr->pread_func(pbr) == 'd');
	mu_assert_lf(pbr->pread_func(pbr) == 'e');
	mu_assert_lf(pbr->pread_func(pbr) == 'f');
	mu_assert_lf(pbr->pread_func(pbr) == 'g');
	mu_assert_lf(pbr->pread_func(pbr) == EOF);
	mu_assert_lf(pbr->pread_func(pbr) == EOF);
	mu_assert_lf(pbr->pread_func(pbr) == EOF);
	unlink_file_or_die(path);
#endif

	// Kept outside the #if so that a build without mmap support still returns
	// a value instead of falling off the end of a non-void function.
	return NULL;
}
|
||||
|
||||
// ================================================================
|
||||
static char * run_all_tests() {
|
||||
mu_run_test(test_string_byte_reader);
|
||||
mu_run_test(test_stdio_byte_reader_1);
|
||||
mu_run_test(test_stdio_byte_reader_2);
|
||||
mu_run_test(test_stdio_byte_reader_reuse);
|
||||
mu_run_test(test_mmap_byte_reader_1);
|
||||
mu_run_test(test_mmap_byte_reader_2);
|
||||
mu_run_test(test_mmap_byte_reader_reuse);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue