mirror of
https://github.com/johnkerl/miller.git
synced 2026-01-24 02:36:15 +00:00
160 lines
5.4 KiB
C
160 lines
5.4 KiB
C
#include <stdio.h>
|
|
#include "cli/argparse.h"
|
|
#include "lib/mlrutil.h"
|
|
#include "lib/mtrand.h"
|
|
#include "containers/sllv.h"
|
|
#include "mapping/mappers.h"
|
|
|
|
// Sentinel for the requested output count: emit as many records as were read.
// Parenthesized so the negative literal cannot fuse with neighboring tokens
// wherever the macro is expanded.
#define NOUT_EQUALS_NIN (-1)
|
|
typedef struct _mapper_bootstrap_state_t {
|
|
ap_state_t* pargp;
|
|
int nout;
|
|
sllv_t* records;
|
|
} mapper_bootstrap_state_t;
|
|
|
|
static void mapper_bootstrap_usage(FILE* o, char* argv0, char* verb);
|
|
static mapper_t* mapper_bootstrap_parse_cli(int* pargi, int argc, char** argv,
|
|
cli_reader_opts_t* _, cli_writer_opts_t* __);
|
|
static mapper_t* mapper_bootstrap_alloc(int nout, ap_state_t* pargp);
|
|
static void mapper_bootstrap_free(mapper_t* pmapper, context_t* _);
|
|
static sllv_t* mapper_bootstrap_process(lrec_t* pinrec, context_t* pctx, void* pvstate);
|
|
|
|
// ----------------------------------------------------------------
|
|
mapper_setup_t mapper_bootstrap_setup = {
|
|
.verb = "bootstrap",
|
|
.pusage_func = mapper_bootstrap_usage,
|
|
.pparse_func = mapper_bootstrap_parse_cli,
|
|
.ignores_input = FALSE,
|
|
};
|
|
|
|
// ----------------------------------------------------------------
|
|
// Writes the help text for the bootstrap verb to the stream o.
//
// o:     stream that receives the help text.
// argv0: program name as it should appear in the help text.
// verb:  verb name as it should appear in the usage line.
static void mapper_bootstrap_usage(FILE* o, char* argv0, char* verb) {
	fprintf(o, "Usage: %s %s [options]\n", argv0, verb);

	// These lines take no substitutions, so they are written verbatim in one call.
	fputs(
		"Emits an n-sample, with replacement, of the input records.\n"
		"Options:\n"
		"-n {number} Number of samples to output. Defaults to number of input records.\n"
		" Must be non-negative.\n",
		o);

	fprintf(o, "See also %s sample and %s shuffle.\n", argv0, argv0);
}
|
|
|
|
static mapper_t* mapper_bootstrap_parse_cli(int* pargi, int argc, char** argv,
|
|
cli_reader_opts_t* _, cli_writer_opts_t* __)
|
|
{
|
|
int nout = NOUT_EQUALS_NIN;
|
|
if ((argc - *pargi) < 1) {
|
|
mapper_bootstrap_usage(stderr, argv[0], argv[*pargi]);
|
|
return NULL;
|
|
}
|
|
|
|
char* verb = argv[*pargi];
|
|
*pargi += 1;
|
|
|
|
ap_state_t* pstate = ap_alloc();
|
|
ap_define_int_flag(pstate, "-n", &nout);
|
|
|
|
if (!ap_parse(pstate, verb, pargi, argc, argv)) {
|
|
mapper_bootstrap_usage(stderr, argv[0], verb);
|
|
return NULL;
|
|
}
|
|
|
|
if (nout != NOUT_EQUALS_NIN && nout < 0) {
|
|
mapper_bootstrap_usage(stderr, argv[0], verb);
|
|
return NULL;
|
|
}
|
|
|
|
mapper_t* pmapper = mapper_bootstrap_alloc(nout, pstate);
|
|
return pmapper;
|
|
}
|
|
|
|
// ----------------------------------------------------------------
|
|
// Builds a bootstrap mapper.
//
// nout:  number of records to emit, or NOUT_EQUALS_NIN to emit as many as are read.
// pargp: argument-parser state; stored in the mapper state and released by
//        mapper_bootstrap_free.
//
// Returns the newly allocated mapper with its state, process function, and
// free function installed.
static mapper_t* mapper_bootstrap_alloc(int nout, ap_state_t* pargp) {
	// Private state first ...
	mapper_bootstrap_state_t* pnew_state = mlr_malloc_or_die(sizeof(mapper_bootstrap_state_t));
	pnew_state->pargp   = pargp;
	pnew_state->nout    = nout;
	pnew_state->records = sllv_alloc();

	// ... then the mapper that wraps it.
	mapper_t* pnew_mapper = mlr_malloc_or_die(sizeof(mapper_t));
	pnew_mapper->pfree_func    = mapper_bootstrap_free;
	pnew_mapper->pprocess_func = mapper_bootstrap_process;
	pnew_mapper->pvstate       = pnew_state;

	return pnew_mapper;
}
|
|
|
|
// Releases a bootstrap mapper along with everything it holds.
//
// pmapper: the mapper to destroy; its pvstate is a mapper_bootstrap_state_t.
// _:       unused.
static void mapper_bootstrap_free(mapper_t* pmapper, context_t* _) {
	mapper_bootstrap_state_t* pbootstrap_state = pmapper->pvstate;

	// The list container of retained records.
	sllv_free(pbootstrap_state->records);
	// The argument-parser state handed over at allocation time.
	ap_free(pbootstrap_state->pargp);

	// Finally the state struct and the mapper itself.
	free(pbootstrap_state);
	free(pmapper);
}
|
|
|
|
// ----------------------------------------------------------------
|
|
// Retains every input record until end of stream, then emits a sample, drawn
// with replacement, of the retained records.
//
// pinrec:  the next input record, or NULL at end of the input stream.
// pctx:    unused.
// pvstate: the mapper_bootstrap_state_t belonging to this mapper.
//
// Returns NULL while records are still being accumulated. At end of stream,
// returns a newly allocated list of output records terminated by a NULL entry.
static sllv_t* mapper_bootstrap_process(lrec_t* pinrec, context_t* pctx, void* pvstate) {
	mapper_bootstrap_state_t* pstate = pvstate;
	if (pinrec != NULL) { // Not end of input stream: consume an input record.
		// Nothing is emitted yet: the record is retained until end of stream.
		sllv_append(pstate->records, pinrec);
		return NULL;
	}

	// This is entirely straightforward except for memory management. The
	// Miller contract to respect is that each routine frees records or passes
	// them on. This means lrec-writers will free lrecs they receive, after
	// writing them. This also means we must free lrecs we don't pass on.
	//
	// Given nin input records, we produce nout output records, but sampling with replacement.
	// The memory-management criteria above mean:
	// * If an input lrec is not output at all, we must free it.
	// * If an input lrec is output once, we pass it on through and let the writer free it.
	// * If an input lrec is output more than once, it would get double-freed (which is not OK)
	//   so for all repetitions past the first we must make a copy.
	// A used_flags[] array allows us to handle all of this.

	sllv_t* poutrecs = sllv_alloc();
	int nin = pstate->records->length;
	int nout = (pstate->nout == NOUT_EQUALS_NIN) ? nin : pstate->nout;
	if (nin == 0) {
		// Nothing to sample from: emit only the end-of-stream marker.
		sllv_append(poutrecs, NULL);
		return poutrecs;
	}

	// Make an array of pointers into the input list. Mark each lrec as not yet output.
	// Element sizes are taken from the pointers themselves so they cannot drift
	// from the declared types.
	lrec_t** record_array = mlr_malloc_or_die(nin * sizeof(*record_array));
	char* used_flags = mlr_malloc_or_die(nin * sizeof(*used_flags));
	sllve_t* pe = pstate->records->phead;
	for (int i = 0; i < nin; i++, pe = pe->pnext) {
		record_array[i] = pe->pvvalue;
		used_flags[i] = FALSE;
	}

	// Do the sample-with-replacement, reading from random indices in the input
	// array and appending to the output list.
	for (int i = 0; i < nout; i++) {
		int index = nin * get_mtrand_double();
		// Clamp in case the scaled random value lands on nin itself.
		if (index >= nin) {
			index = nin - 1;
		}
		lrec_t* prec = record_array[index];
		if (used_flags[index]) { // Copy for repeated output.
			prec = lrec_copy(prec);
		} else { // First output of this record; remember it.
			used_flags[index] = TRUE;
		}
		sllv_append(poutrecs, prec);
	}

	// Free the input records that were never selected for output.
	for (int i = 0; i < nin; i++) {
		if (!used_flags[i]) {
			lrec_free(record_array[i]);
		}
	}

	// Free the temporary arrays.
	free(used_flags);
	free(record_array);

	// Null-terminate the output list to signify end of stream.
	sllv_append(poutrecs, NULL);
	return poutrecs;
}
|