iterative stats2 feature

This commit is contained in:
John Kerl 2015-10-10 07:41:51 -07:00
parent 4b81bb0cdd
commit 5a529dda10
6 changed files with 86 additions and 20 deletions

View file

@ -44,14 +44,14 @@ typedef struct _mapper_stats1_state_t {
// ----------------------------------------------------------------
static void mapper_stats1_usage(FILE* o, char* argv0, char* verb);
static mapper_t* mapper_stats1_parse_cli(int* pargi, int argc, char** argv);
static mapper_t* mapper_stats1_alloc(slls_t* paccumulator_names, slls_t* pvalue_field_names, slls_t* pgroup_by_field_names,
int do_iterative_stats);
static mapper_t* mapper_stats1_alloc(slls_t* paccumulator_names, slls_t* pvalue_field_names,
slls_t* pgroup_by_field_names, int do_iterative_stats);
static void mapper_stats1_free(void* pvstate);
static sllv_t* mapper_stats1_process(lrec_t* pinrec, context_t* pctx, void* pvstate);
static lrec_t* mapper_stats1_ingest(lrec_t* pinrec, mapper_stats1_state_t* pstate);
static sllv_t* mapper_stats1_emit_all(mapper_stats1_state_t* pstate);
static lrec_t* mapper_stats1_emit(mapper_stats1_state_t* pstate, lrec_t* poutrec, char* value_field_name, char* stats1_name,
lhmsv_t* acc_field_to_acc_state);
static lrec_t* mapper_stats1_emit(mapper_stats1_state_t* pstate, lrec_t* poutrec,
char* value_field_name, char* stats1_name, lhmsv_t* acc_field_to_acc_state);
static stats1_t* stats1_count_alloc(char* value_field_name, char* stats1_name);
static stats1_t* stats1_mode_alloc(char* value_field_name, char* stats1_name);
@ -133,7 +133,7 @@ static mapper_t* mapper_stats1_parse_cli(int* pargi, int argc, char** argv) {
ap_define_string_list_flag(pstate, "-a", &paccumulator_names);
ap_define_string_list_flag(pstate, "-f", &pvalue_field_names);
ap_define_string_list_flag(pstate, "-g", &pgroup_by_field_names);
ap_define_true_flag(pstate, "-s", &do_iterative_stats);
ap_define_true_flag(pstate, "-s", &do_iterative_stats);
if (!ap_parse(pstate, verb, pargi, argc, argv)) {
mapper_stats1_usage(stderr, argv[0], verb);
@ -145,7 +145,8 @@ static mapper_t* mapper_stats1_parse_cli(int* pargi, int argc, char** argv) {
return NULL;
}
return mapper_stats1_alloc(paccumulator_names, pvalue_field_names, pgroup_by_field_names, do_iterative_stats);
return mapper_stats1_alloc(paccumulator_names, pvalue_field_names, pgroup_by_field_names,
do_iterative_stats);
}
// ----------------------------------------------------------------
@ -404,7 +405,6 @@ static sllv_t* mapper_stats1_emit_all(mapper_stats1_state_t* pstate) {
static lrec_t* mapper_stats1_emit(mapper_stats1_state_t* pstate, lrec_t* poutrec,
char* value_field_name, char* stats1_name, lhmsv_t* acc_field_to_acc_state)
{
// Add in fields such as x_sum=#, y_count=#, etc.:
for (sllse_t* pe = pstate->paccumulator_names->phead; pe != NULL; pe = pe->pnext) {
char* stats1_name = pe->value;

View file

@ -49,8 +49,10 @@ static mapper_t* mapper_stats2_alloc(slls_t* paccumulator_names, slls_t* pvalue_
slls_t* pgroup_by_field_names, int do_verbose, int do_iterative_stats);
static void mapper_stats2_free(void* pvstate);
static sllv_t* mapper_stats2_process(lrec_t* pinrec, context_t* pctx, void* pvstate);
static void mapper_stats2_ingest(lrec_t* pinrec, context_t* pctx, mapper_stats2_state_t* pstate);
static sllv_t* mapper_stats2_emit(mapper_stats2_state_t* pstate);
static lrec_t* mapper_stats2_ingest(lrec_t* pinrec, context_t* pctx, mapper_stats2_state_t* pstate);
static sllv_t* mapper_stats2_emit_all(mapper_stats2_state_t* pstate);
static void mapper_stats2_emit(mapper_stats2_state_t* pstate, lrec_t* pinrec,
char* value_field_name_1, char* value_field_name_2, lhmsv_t* acc_fields_to_acc_state);
static stats2_t* make_stats2(char* value_field_name_1, char* value_field_name_2, char* stats2_name, int do_verbose);
static stats2_t* stats2_linreg_ols_alloc(char* value_field_name_1, char* value_field_name_2, char* stats2_name, int do_verbose);
@ -95,6 +97,9 @@ static void mapper_stats2_usage(FILE* o, char* argv0, char* verb) {
fprintf(o, " There must be an even number of names.\n");
fprintf(o, "-g {e,f,g} Optional group-by-field names.\n");
fprintf(o, "-v Print additional output for linreg-pca.\n");
fprintf(o, "-s Print iterative stats. Useful in tail -f contexts (in which\n");
fprintf(o, " case please avoid pprint-format output since end of input\n");
fprintf(o, " stream will never be seen).\n");
fprintf(o, "Example: %s %s -a linreg-pca -f x,y\n", argv0, verb);
fprintf(o, "Example: %s %s -a linreg-ols,r2 -f x,y -g size,shape\n", argv0, verb);
fprintf(o, "Example: %s %s -a corr -f x,y\n", argv0, verb);
@ -114,6 +119,7 @@ static mapper_t* mapper_stats2_parse_cli(int* pargi, int argc, char** argv) {
ap_define_string_list_flag(pstate, "-f", &pvalue_field_names);
ap_define_string_list_flag(pstate, "-g", &pgroup_by_field_names);
ap_define_true_flag(pstate, "-v", &do_verbose);
ap_define_true_flag(pstate, "-s", &do_iterative_stats);
if (!ap_parse(pstate, verb, pargi, argc, argv)) {
mapper_stats2_usage(stderr, argv[0], verb);
@ -165,26 +171,32 @@ static mapper_t* mapper_stats2_parse_cli(int* pargi, int argc, char** argv) {
// }
// ================================================================
// ----------------------------------------------------------------
// In the iterative case, add to the current record its current group's stats fields.
// In the non-iterative case, produce output only at end of input stream.
static sllv_t* mapper_stats2_process(lrec_t* pinrec, context_t* pctx, void* pvstate) {
mapper_stats2_state_t* pstate = pvstate;
if (pinrec != NULL) {
mapper_stats2_ingest(pinrec, pctx, pstate);
lrec_free(pinrec);
lrec_t* poutrec = mapper_stats2_ingest(pinrec, pctx, pstate);
if (poutrec == NULL) {
lrec_free(pinrec);
return NULL;
} else {
return sllv_single(poutrec);
}
} else if (!pstate->do_iterative_stats) {
return mapper_stats2_emit_all(pstate);
} else {
return NULL;
}
else {
return mapper_stats2_emit(pstate);
}
}
// ----------------------------------------------------------------
static void mapper_stats2_ingest(lrec_t* pinrec, context_t* pctx, mapper_stats2_state_t* pstate) {
static lrec_t* mapper_stats2_ingest(lrec_t* pinrec, context_t* pctx, mapper_stats2_state_t* pstate) {
// ["s", "t"]
slls_t* pgroup_by_field_values = mlr_selected_values_from_record(pinrec, pstate->pgroup_by_field_names);
if (pgroup_by_field_values->length != pstate->pgroup_by_field_names->length) {
slls_free(pgroup_by_field_values);
return;
return pstate->do_iterative_stats ? pinrec : NULL;
}
lhms2v_t* group_to_acc_field = lhmslv_get(pstate->groups, pgroup_by_field_values);
@ -231,13 +243,18 @@ static void mapper_stats2_ingest(lrec_t* pinrec, context_t* pctx, mapper_stats2_
double dval2 = mlr_double_from_string_or_die(sval2);
pstats2->pingest_func(pstats2->pvstate, dval1, dval2);
}
if (pstate->do_iterative_stats) {
mapper_stats2_emit(pstate, pinrec, value_field_name_1, value_field_name_2,
acc_fields_to_acc_state);
}
}
slls_free(pgroup_by_field_values);
return pstate->do_iterative_stats ? pinrec : NULL;
}
// ----------------------------------------------------------------
static sllv_t* mapper_stats2_emit(mapper_stats2_state_t* pstate) {
static sllv_t* mapper_stats2_emit_all(mapper_stats2_state_t* pstate) {
sllv_t* poutrecs = sllv_alloc();
for (lhmslve_t* pa = pstate->groups->phead; pa != NULL; pa = pa->pnext) {
@ -260,6 +277,9 @@ static sllv_t* mapper_stats2_emit(mapper_stats2_state_t* pstate) {
char* value_field_name_2 = pd->key2;
lhmsv_t* acc_fields_to_acc_state = pd->pvvalue;
mapper_stats2_emit(pstate, poutrec, value_field_name_1, value_field_name_2,
acc_fields_to_acc_state);
// For "corr", "linreg"
for (lhmsve_t* pe = acc_fields_to_acc_state->phead; pe != NULL; pe = pe->pnext) {
stats2_t* pstats2 = pe->pvvalue;
@ -273,6 +293,16 @@ static sllv_t* mapper_stats2_emit(mapper_stats2_state_t* pstate) {
return poutrecs;
}
// Writes the current results of every accumulator kept for one pair of value
// fields into poutrec.
//
// pstate                  mapper state; not consulted by this function.
// poutrec                 record handed to each accumulator's emit callback.
// value_field_name_1/2    names of the two value fields, passed through to
//                         each emit callback unchanged.
// acc_fields_to_acc_state map whose entries each hold a stats2_t* as their
//                         value (e.g. "corr", "linreg"); must be non-NULL.
static void mapper_stats2_emit(mapper_stats2_state_t* pstate, lrec_t* poutrec,
	char* value_field_name_1, char* value_field_name_2, lhmsv_t* acc_fields_to_acc_state)
{
	lhmsve_t* pentry = acc_fields_to_acc_state->phead;
	while (pentry != NULL) {
		// Each accumulator supplies its own emit callback and private state.
		stats2_t* pacc = pentry->pvvalue;
		pacc->pemit_func(pacc->pvstate, value_field_name_1, value_field_name_2, poutrec);
		pentry = pentry->pnext;
	}
}
// ----------------------------------------------------------------
static mapper_t* mapper_stats2_alloc(slls_t* paccumulator_names, slls_t* pvalue_field_name_pairs,
slls_t* pgroup_by_field_names, int do_verbose, int do_iterative_stats)
@ -285,6 +315,7 @@ static mapper_t* mapper_stats2_alloc(slls_t* paccumulator_names, slls_t* pvalue_
pstate->pgroup_by_field_names = pgroup_by_field_names;
pstate->groups = lhmslv_alloc();
pstate->do_verbose = do_verbose;
pstate->do_iterative_stats = do_iterative_stats;
pmapper->pvstate = pstate;
pmapper->pprocess_func = mapper_stats2_process;

View file

@ -4,6 +4,7 @@ EXTRA_DIST= \
a.pprint \
abixy \
abixy-wide \
abixy-wide-short \
b.csv \
b.pprint \
c.csv \

View file

@ -0,0 +1,20 @@
a=cat,b=pan,i=1,x=0.5117389009583777,y=0.08295224980036853,x2=0.2618767027540883,xy=0.0424498931448654,y2=0.006881075746942741
a=pan,b=wye,i=2,x=0.5225940442098578,y=0.511678736087022,x2=0.27310453504361476,xy=0.2674002600279053,y2=0.26181512896361225
a=wye,b=cat,i=3,x=0.8150401717873625,y=0.07989551500795256,x2=0.6642904816271734,xy=0.06511805427712146,y2=0.006383293318385972
a=dog,b=hat,i=4,x=0.4488733555675044,y=0.5730530513123552,x2=0.20148728933843124,xy=0.25722824606077416,y2=0.32838979961840076
a=dog,b=pan,i=5,x=0.2946557960430134,y=0.6850437256584863,x2=0.08682203814174191,xy=0.20185210430817294,y2=0.46928490606405937
a=wye,b=cat,i=6,x=0.048709182664292916,y=0.5851879044762575,x2=0.0023725844758234536,xy=0.02850402453206882,y2=0.34244488354531344
a=dog,b=hat,i=7,x=0.8500003149528544,y=0.2984098741712895,x2=0.7225005354199517,xy=0.25364848703063775,y2=0.08904845300292483
a=pan,b=pan,i=8,x=0.616507208914765,y=0.25924335982487057,x2=0.38008113864387366,xy=0.15982540019531707,y2=0.06720711961328732
a=hat,b=hat,i=9,x=0.33786884067769307,y=0.6036735617015514,x2=0.11415535350088835,xy=0.203962486439877,y2=0.3644217690974368
a=wye,b=hat,i=10,x=0.3834648944206174,y=0.4999709279216641,x2=0.14704532525301522,xy=0.19172129908885902,y2=0.24997092876684981
a=pan,b=hat,i=11,x=0.025474999754416028,y=0.7861954915044592,x2=0.0006489756124874967,xy=0.020028329952999087,y2=0.6181033508619382
a=cat,b=hat,i=12,x=0.6335445699880142,y=0.15467178563525052,x2=0.4013787221612979,xy=0.0979914699195631,y2=0.02392336127159689
a=hat,b=wye,i=13,x=0.35922068401384877,y=0.8502678133887914,x2=0.1290394998233774,xy=0.30543378552048117,y2=0.7229553544849566
a=dog,b=dog,i=14,x=0.5440047442770544,y=0.933608851612059,x2=0.2959411617959433,xy=0.5078876445760125,y2=0.8716254878083876
a=wye,b=dog,i=15,x=0.4689175303764642,y=0.09048353045392021,x2=0.21988365029436224,xy=0.04242931364019586,y2=0.008187269283405506
a=pan,b=pan,i=16,x=0.3959177828066379,y=0.6339858483805666,x2=0.15675089074252413,xy=0.25100627142161924,y2=0.4019380559468268
a=dog,b=hat,i=17,x=0.34033844788864975,y=0.8845934733681523,x2=0.11583025911125516,xy=0.3010611697385466,y2=0.782505613125532
a=wye,b=wye,i=18,x=0.6770613653962891,y=0.896307226056897,x2=0.4584120925122874,xy=0.6068549942886431,y2=0.8033666434818095
a=dog,b=wye,i=19,x=0.4865373244199632,y=0.44117766146315884,x2=0.23671856805373653,xy=0.2146493990021416,y2=0.1946377289741016
a=dog,b=dog,i=20,x=0.3223311725542929,y=0.08115611029827985,x2=0.10389738480022534,xy=0.026159144192390068,y2=0.006586314238746564

View file

@ -232,10 +232,14 @@ run_mlr --opprint stats1 -a min,p10,p50,mode,p90,max -f i,x,y -g a,b $indi
run_mlr --opprint stats1 -a mean,meaneb,stddev -f i,x,y -g a,b $indir/abixy
run_mlr --opprint stats1 -s -a mean,sum,count,min,max,mode -f i,x,y -g a,b $indir/abixy
run_mlr --opprint stats2 -a linreg-ols,linreg-pca,r2,corr,cov -f x,y,xy,y2,x2,x2 $indir/abixy-wide
run_mlr --opprint stats2 -a linreg-ols,linreg-pca,r2,corr,cov -f x,y,xy,y2,x2,x2 -g a,b $indir/abixy-wide
run_mlr --opprint stats2 -a linreg-ols,linreg-pca,r2,corr,cov -f x,y,xy,y2,x2,x2 $indir/abixy-wide
run_mlr --opprint stats2 -a linreg-ols,linreg-pca,r2,corr,cov -f x,y,xy,y2,x2,x2 -g a,b $indir/abixy-wide
run_mlr --oxtab stats2 -s -a linreg-ols,linreg-pca,r2,corr,cov -f x,y,xy,y2,x2,x2 $indir/abixy-wide-short
run_mlr --oxtab stats2 -s -a linreg-ols,linreg-pca,r2,corr,cov -f x,y,xy,y2,x2,x2 -g a,b $indir/abixy-wide-short
run_mlr --opprint step -a rsum,delta,counter -f x,y $indir/abixy
run_mlr --opprint step -a rsum,delta,counter -f x,y -g a $indir/abixy
run_mlr --opprint histogram -f x,y --lo 0 --hi 1 --nbins 20 $indir/small
# ----------------------------------------------------------------

View file

@ -0,0 +1,10 @@
#!/usr/bin/env ruby
# Endless source of random records for experimenting with stats1/2 -s.
# Each line carries x, y and their product xy; roughly ten lines are
# produced per second until the process is interrupted.

# Put stdout in sync mode so each record is written out without buffering.
$stdout.sync = true

loop do
  x = rand
  y = rand
  xy = x * y
  puts "x=#{x},y=#{y},xy=#{xy}"
  sleep 0.1
end