From df73ad8ec05cbda0dde84b98fcd083c0d7f0aaac Mon Sep 17 00:00:00 2001 From: "Christian G. Warden" Date: Thu, 15 May 2025 17:17:08 -0500 Subject: [PATCH] Add surv Verb to Estimate a Survival Curve (#1788) Add a surv verb to estimate a survival curve using Kaplan-Meier. It requires duration and status (event or censored) columns, and outputs each distinct duration and corresponding probability of survival. --- go.mod | 11 +- go.sum | 14 +- pkg/transformers/aaa_transformer_table.go | 1 + pkg/transformers/surv.go | 173 ++++++++++++++++++++++ test/cases/cli-help/0001/expout | 10 ++ test/cases/verb-surv/0001/cmd | 1 + test/cases/verb-surv/0001/experr | 0 test/cases/verb-surv/0001/expout | 4 + test/input/surv.csv | 6 + 9 files changed, 216 insertions(+), 4 deletions(-) create mode 100644 pkg/transformers/surv.go create mode 100644 test/cases/verb-surv/0001/cmd create mode 100644 test/cases/verb-surv/0001/experr create mode 100644 test/cases/verb-surv/0001/expout create mode 100644 test/input/surv.csv diff --git a/go.mod b/go.mod index 8d843cc61..1868fb964 100644 --- a/go.mod +++ b/go.mod @@ -14,13 +14,16 @@ module github.com/johnkerl/miller/v6 // Local development: // replace github.com/johnkerl/lumin => /Users/kerl/git/johnkerl/lumin -go 1.21 +go 1.23.0 + +toolchain go1.24.2 require ( github.com/facette/natsort v0.0.0-20181210072756-2cd4dd1e2dcb github.com/johnkerl/lumin v1.0.0 github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 github.com/klauspost/compress v1.17.11 + github.com/kshedden/statmodel v0.0.0-20210519035403-ee97d3e48df1 github.com/lestrrat-go/strftime v1.1.0 github.com/mattn/go-isatty v0.0.20 github.com/nine-lives-later/go-windows-terminal-sequences v1.0.4 @@ -28,13 +31,17 @@ require ( github.com/stretchr/testify v1.10.0 golang.org/x/sys v0.30.0 golang.org/x/term v0.29.0 - golang.org/x/text v0.22.0 + golang.org/x/text v0.23.0 ) require ( github.com/davecgh/go-spew v1.1.1 // indirect github.com/felixge/fgprof v0.9.3 // indirect + 
github.com/golang/snappy v1.0.0 // indirect github.com/google/pprof v0.0.0-20211214055906-6f57359322fd // indirect + github.com/kshedden/dstream v0.0.0-20190512025041-c4c410631beb // indirect github.com/pmezard/go-difflib v1.0.0 // indirect + golang.org/x/tools v0.26.0 // indirect + gonum.org/v1/gonum v0.16.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index 32aae654a..49ded388f 100644 --- a/go.sum +++ b/go.sum @@ -8,6 +8,8 @@ github.com/facette/natsort v0.0.0-20181210072756-2cd4dd1e2dcb h1:IT4JYU7k4ikYg1S github.com/facette/natsort v0.0.0-20181210072756-2cd4dd1e2dcb/go.mod h1:bH6Xx7IW64qjjJq8M2u4dxNaBiDfKK+z/3eGDpXEQhc= github.com/felixge/fgprof v0.9.3 h1:VvyZxILNuCiUCSXtPtYmmtGvb65nqXh2QFWc0Wpf2/g= github.com/felixge/fgprof v0.9.3/go.mod h1:RdbpDgzqYVh/T9fPELJyV7EYJuHB55UTEULNun8eiPw= +github.com/golang/snappy v1.0.0 h1:Oy607GVXHs7RtbggtPBnr2RmDArIsAefDwvrdWvRhGs= +github.com/golang/snappy v1.0.0/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/google/pprof v0.0.0-20211214055906-6f57359322fd h1:1FjCyPC+syAzJ5/2S8fqdZK1R22vvA0J7JZKcuOIQ7Y= github.com/google/pprof v0.0.0-20211214055906-6f57359322fd/go.mod h1:KgnwoLYCZ8IQu3XUZ8Nc/bM9CCZFOyjUNOSygVozoDg= github.com/ianlancetaylor/demangle v0.0.0-20210905161508-09a460cdf81d/go.mod h1:aYm2/VgdVmcIU8iMfdMvDMsRAQjcfZSKFby6HOFvi/w= @@ -17,6 +19,10 @@ github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 h1:Z9n2FFNU github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51/go.mod h1:CzGEWj7cYgsdH8dAjBGEr58BoE7ScuLd+fwFZ44+/x8= github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IXrJmUc= github.com/klauspost/compress v1.17.11/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0= +github.com/kshedden/dstream v0.0.0-20190512025041-c4c410631beb h1:Z5BVHFk/DLOIUAd2NycF0mLtKfhl7ynm4Uy5+AFhT48= +github.com/kshedden/dstream v0.0.0-20190512025041-c4c410631beb/go.mod h1:+U+6yzfITr4/teU2YhxWhdyw6YzednT/16/UBMjlDrU= 
+github.com/kshedden/statmodel v0.0.0-20210519035403-ee97d3e48df1 h1:UyIQ1VTQq/0CS/wLYjf3DV6uRKTd1xcsng3BccM4XCY= +github.com/kshedden/statmodel v0.0.0-20210519035403-ee97d3e48df1/go.mod h1:uvVFnikBpVz7S1pdsyUI+BBRlz64vmU6Q+kviiB+fpU= github.com/lestrrat-go/envload v0.0.0-20180220234015-a3eb8ddeffcc h1:RKf14vYWi2ttpEmkA4aQ3j4u9dStX2t4M8UM6qqNsG8= github.com/lestrrat-go/envload v0.0.0-20180220234015-a3eb8ddeffcc/go.mod h1:kopuH9ugFRkIXf3YoqHKyrJ9YfUFsckUU9S7B+XP+is= github.com/lestrrat-go/strftime v1.1.0 h1:gMESpZy44/4pXLO/m+sL0yBd1W6LjgjrrD4a68Gapyg= @@ -41,8 +47,12 @@ golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc= golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.29.0 h1:L6pJp37ocefwRRtYPKSWOWzOtWSxVajvz2ldH/xi3iU= golang.org/x/term v0.29.0/go.mod h1:6bl4lRlvVuDgSf3179VpIxBF0o10JUpXWOnI7nErv7s= -golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM= -golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY= +golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY= +golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4= +golang.org/x/tools v0.26.0 h1:v/60pFQmzmT9ExmjDv2gGIfi3OqfKoEP6I5+umXlbnQ= +golang.org/x/tools v0.26.0/go.mod h1:TPVVj70c7JJ3WCazhD8OdXcZg/og+b9+tH/KxylGwH0= +gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= +gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/pkg/transformers/aaa_transformer_table.go b/pkg/transformers/aaa_transformer_table.go index 1f201fa29..b1a41ae17 100644 --- 
a/pkg/transformers/aaa_transformer_table.go
+++ b/pkg/transformers/aaa_transformer_table.go
@@ -70,6 +70,7 @@ var TRANSFORMER_LOOKUP_TABLE = []TransformerSetup{
 	StepSetup,
 	SubSetup,
 	SummarySetup,
+	SurvSetup,
 	TacSetup,
 	TailSetup,
 	TeeSetup,
diff --git a/pkg/transformers/surv.go b/pkg/transformers/surv.go
new file mode 100644
index 000000000..6d4b38f09
--- /dev/null
+++ b/pkg/transformers/surv.go
@@ -0,0 +1,173 @@
+package transformers
+
+import (
+	"container/list"
+	"fmt"
+	"os"
+	"strings"
+
+	"github.com/johnkerl/miller/v6/pkg/cli"
+	"github.com/johnkerl/miller/v6/pkg/mlrval"
+	"github.com/johnkerl/miller/v6/pkg/types"
+	"github.com/kshedden/statmodel/duration"
+	"github.com/kshedden/statmodel/statmodel"
+)
+
+// ----------------------------------------------------------------
+const verbNameSurv = "surv"
+
+// SurvSetup registers the surv verb, which estimates a Kaplan-Meier survival curve.
+var SurvSetup = TransformerSetup{
+	Verb:         verbNameSurv,
+	UsageFunc:    transformerSurvUsage,
+	ParseCLIFunc: transformerSurvParseCLI,
+	IgnoresInput: false,
+}
+
+// transformerSurvUsage writes the surv verb's help text to o.
+func transformerSurvUsage(o *os.File) {
+	fmt.Fprintf(o, "Usage: %s %s -d {duration-field} -s {status-field}\n", "mlr", verbNameSurv)
+	fmt.Fprint(o, `
+Estimate Kaplan-Meier survival curve (right-censored).
+Options:
+ -d {field} Name of duration field (time-to-event or censoring).
+ -s {field} Name of status field (0=censored, 1=event).
+ -h, --help Show this message.
+`)
+}
+
+// transformerSurvParseCLI parses the surv verb's flags, starting at the verb
+// name in args[*pargi], and advances *pargi past everything it consumed. It
+// returns nil when doConstruct is false; otherwise it exits with an error unless
+// both -d and -s were supplied, and returns the configured transformer.
+func transformerSurvParseCLI(
+	pargi *int,
+	argc int,
+	args []string,
+	_ *cli.TOptions,
+	doConstruct bool,
+) IRecordTransformer {
+	argi := *pargi
+	verb := args[argi]
+	argi++
+
+	var durationField, statusField string
+
+	for argi < argc {
+		opt := args[argi]
+		if !strings.HasPrefix(opt, "-") || opt == "--" {
+			break // End of this verb's flags; a bare "--" also stops flag parsing here.
+		}
+		switch opt {
+		case "-h", "--help":
+			transformerSurvUsage(os.Stdout)
+			os.Exit(0)
+		case "-d", "-s":
+			if argi+1 >= argc {
+				fmt.Fprintf(os.Stderr, "mlr %s: %s requires an argument\n", verb, opt)
+				os.Exit(1)
+			}
+			if opt == "-d" {
+				durationField = args[argi+1]
+			} else {
+				statusField = args[argi+1]
+			}
+			argi += 2
+		default:
+			// Fail fast on an unrecognized flag instead of ending flag parsing silently.
+			fmt.Fprintf(os.Stderr, "mlr %s: option '%s' not recognized.\n", verb, opt)
+			fmt.Fprintf(os.Stderr, "Please see 'mlr %s --help' for more information.\n", verb)
+			os.Exit(1)
+		}
+	}
+	*pargi = argi
+	if !doConstruct {
+		return nil
+	}
+	required := []struct{ flag, value string }{{"-d", durationField}, {"-s", statusField}}
+	for _, r := range required {
+		if r.value == "" {
+			fmt.Fprintf(os.Stderr, "mlr %s: %s option is required.\n", verbNameSurv, r.flag)
+			fmt.Fprintf(os.Stderr, "Please see 'mlr %s --help' for more information.\n", verbNameSurv)
+			os.Exit(1)
+		}
+	}
+	return NewTransformerSurv(durationField, statusField)
+}
+
+// TransformerSurv buffers per-record durations and event flags for the surv verb.
+type TransformerSurv struct {
+	durationField string    // name of the input field holding each duration
+	statusField   string    // name of the input field holding each status
+	times         []float64 // durations collected so far, one per record
+	events        []bool    // event flags collected so far, parallel to times
+}
+
+// NewTransformerSurv returns a surv transformer reading the given duration and status fields.
+func NewTransformerSurv(durationField, statusField string) IRecordTransformer {
+	return &TransformerSurv{
+		durationField: durationField,
+		statusField:   statusField,
+	}
+}
+
+// Transform buffers each record's duration and status, then emits the fitted curve at end-of-stream.
+func (tr *TransformerSurv) Transform(
+	inrecAndContext *types.RecordAndContext,
+	outputRecordsAndContexts *list.List,
+	inputDownstreamDoneChannel <-chan bool,
+	outputDownstreamDoneChannel chan<- bool,
+) {
+	HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel)
+	if !inrecAndContext.EndOfStream {
+		// Mid-stream: only buffer this record's duration and event flag.
+		record := inrecAndContext.Record
+		durationValue := record.Get(tr.durationField)
+		if durationValue == nil {
+			fmt.Fprintf(os.Stderr, "mlr surv: duration field '%s' not found\n", tr.durationField)
+			os.Exit(1)
+		}
+		elapsed := durationValue.GetNumericToFloatValueOrDie()
+		statusValue := record.Get(tr.statusField)
+		if statusValue == nil {
+			fmt.Fprintf(os.Stderr, "mlr surv: status field '%s' not found\n", tr.statusField)
+			os.Exit(1)
+		}
+		isEvent := statusValue.GetNumericToFloatValueOrDie() != 0
+		tr.times = append(tr.times, elapsed)
+		tr.events = append(tr.events, isEvent)
+		return
+	}
+
+	// End of stream with nothing buffered: forward only the end-of-stream marker.
+	if len(tr.times) == 0 {
+		outputRecordsAndContexts.PushBack(inrecAndContext)
+		return
+	}
+	// The dataset takes float columns, so encode each event flag as 1 (true) or 0 (false).
+	indicators := make([]float64, len(tr.times))
+	for i := range tr.events {
+		if tr.events[i] {
+			indicators[i] = 1.0
+		}
+	}
+	columns := [][]float64{tr.times, indicators}
+	columnNames := []string{tr.durationField, tr.statusField}
+	dataset := statmodel.NewDataset(columns, columnNames)
+	curve, err := duration.NewSurvfuncRight(dataset, tr.durationField, tr.statusField, &duration.SurvfuncRightConfig{})
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "mlr surv: %v\n", err)
+		os.Exit(1)
+	}
+	curve.Fit()
+	// Emit one record per time reported by the fit, then the end-of-stream marker.
+	times := curve.Time()
+	probabilities := curve.SurvProb()
+	for i, t := range times {
+		outrec := mlrval.NewMlrmapAsRecord()
+		outrec.PutCopy("time", mlrval.FromFloat(t))
+		outrec.PutCopy("survival", mlrval.FromFloat(probabilities[i]))
+		outputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, &inrecAndContext.Context))
+	}
+	outputRecordsAndContexts.PushBack(inrecAndContext)
+}
diff --git a/test/cases/cli-help/0001/expout
b/test/cases/cli-help/0001/expout index c8d0af1bc..e7eed5d74 100644 --- a/test/cases/cli-help/0001/expout +++ b/test/cases/cli-help/0001/expout @@ -1261,6 +1261,16 @@ Options: --transpose Show output with field names as column names.. -h|--help Show this message. +================================================================ +surv +Usage: mlr surv -d {duration-field} -s {status-field} + +Estimate Kaplan-Meier survival curve (right-censored). +Options: + -d {field} Name of duration field (time-to-event or censoring). + -s {field} Name of status field (0=censored, 1=event). + -h, --help Show this message. + ================================================================ tac Usage: mlr tac [options] diff --git a/test/cases/verb-surv/0001/cmd b/test/cases/verb-surv/0001/cmd new file mode 100644 index 000000000..d50e07397 --- /dev/null +++ b/test/cases/verb-surv/0001/cmd @@ -0,0 +1 @@ +mlr --csv --from test/input/surv.csv surv -d duration -s status \ No newline at end of file diff --git a/test/cases/verb-surv/0001/experr b/test/cases/verb-surv/0001/experr new file mode 100644 index 000000000..e69de29bb diff --git a/test/cases/verb-surv/0001/expout b/test/cases/verb-surv/0001/expout new file mode 100644 index 000000000..9b23a50f5 --- /dev/null +++ b/test/cases/verb-surv/0001/expout @@ -0,0 +1,4 @@ +time,survival +1.00000000,0.80000000 +3.00000000,0.53333333 +5.00000000,0.00000000 diff --git a/test/input/surv.csv b/test/input/surv.csv new file mode 100644 index 000000000..f025f6a5c --- /dev/null +++ b/test/input/surv.csv @@ -0,0 +1,6 @@ +duration,status +1,1 +2,0 +3,1 +4,0 +5,1 \ No newline at end of file