Add surv Verb to Estimate a Survival Curve (#1788)

Add a surv verb to estimate a survival curve using Kaplan-Meier.  It
requires duration and status (event or censored) columns, and outputs
each distinct duration and corresponding probability of survival.
This commit is contained in:
Christian G. Warden 2025-05-15 17:17:08 -05:00 committed by GitHub
parent 35c7eeb977
commit df73ad8ec0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 216 additions and 4 deletions

11
go.mod
View file

@ -14,13 +14,16 @@ module github.com/johnkerl/miller/v6
// Local development:
// replace github.com/johnkerl/lumin => /Users/kerl/git/johnkerl/lumin
go 1.21
go 1.23.0
toolchain go1.24.2
require (
github.com/facette/natsort v0.0.0-20181210072756-2cd4dd1e2dcb
github.com/johnkerl/lumin v1.0.0
github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51
github.com/klauspost/compress v1.17.11
github.com/kshedden/statmodel v0.0.0-20210519035403-ee97d3e48df1
github.com/lestrrat-go/strftime v1.1.0
github.com/mattn/go-isatty v0.0.20
github.com/nine-lives-later/go-windows-terminal-sequences v1.0.4
@ -28,13 +31,17 @@ require (
github.com/stretchr/testify v1.10.0
golang.org/x/sys v0.30.0
golang.org/x/term v0.29.0
golang.org/x/text v0.22.0
golang.org/x/text v0.23.0
)
require (
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/felixge/fgprof v0.9.3 // indirect
github.com/golang/snappy v1.0.0 // indirect
github.com/google/pprof v0.0.0-20211214055906-6f57359322fd // indirect
github.com/kshedden/dstream v0.0.0-20190512025041-c4c410631beb // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
golang.org/x/tools v0.26.0 // indirect
gonum.org/v1/gonum v0.16.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)

14
go.sum
View file

@ -8,6 +8,8 @@ github.com/facette/natsort v0.0.0-20181210072756-2cd4dd1e2dcb h1:IT4JYU7k4ikYg1S
github.com/facette/natsort v0.0.0-20181210072756-2cd4dd1e2dcb/go.mod h1:bH6Xx7IW64qjjJq8M2u4dxNaBiDfKK+z/3eGDpXEQhc=
github.com/felixge/fgprof v0.9.3 h1:VvyZxILNuCiUCSXtPtYmmtGvb65nqXh2QFWc0Wpf2/g=
github.com/felixge/fgprof v0.9.3/go.mod h1:RdbpDgzqYVh/T9fPELJyV7EYJuHB55UTEULNun8eiPw=
github.com/golang/snappy v1.0.0 h1:Oy607GVXHs7RtbggtPBnr2RmDArIsAefDwvrdWvRhGs=
github.com/golang/snappy v1.0.0/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/google/pprof v0.0.0-20211214055906-6f57359322fd h1:1FjCyPC+syAzJ5/2S8fqdZK1R22vvA0J7JZKcuOIQ7Y=
github.com/google/pprof v0.0.0-20211214055906-6f57359322fd/go.mod h1:KgnwoLYCZ8IQu3XUZ8Nc/bM9CCZFOyjUNOSygVozoDg=
github.com/ianlancetaylor/demangle v0.0.0-20210905161508-09a460cdf81d/go.mod h1:aYm2/VgdVmcIU8iMfdMvDMsRAQjcfZSKFby6HOFvi/w=
@ -17,6 +19,10 @@ github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 h1:Z9n2FFNU
github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51/go.mod h1:CzGEWj7cYgsdH8dAjBGEr58BoE7ScuLd+fwFZ44+/x8=
github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IXrJmUc=
github.com/klauspost/compress v1.17.11/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0=
github.com/kshedden/dstream v0.0.0-20190512025041-c4c410631beb h1:Z5BVHFk/DLOIUAd2NycF0mLtKfhl7ynm4Uy5+AFhT48=
github.com/kshedden/dstream v0.0.0-20190512025041-c4c410631beb/go.mod h1:+U+6yzfITr4/teU2YhxWhdyw6YzednT/16/UBMjlDrU=
github.com/kshedden/statmodel v0.0.0-20210519035403-ee97d3e48df1 h1:UyIQ1VTQq/0CS/wLYjf3DV6uRKTd1xcsng3BccM4XCY=
github.com/kshedden/statmodel v0.0.0-20210519035403-ee97d3e48df1/go.mod h1:uvVFnikBpVz7S1pdsyUI+BBRlz64vmU6Q+kviiB+fpU=
github.com/lestrrat-go/envload v0.0.0-20180220234015-a3eb8ddeffcc h1:RKf14vYWi2ttpEmkA4aQ3j4u9dStX2t4M8UM6qqNsG8=
github.com/lestrrat-go/envload v0.0.0-20180220234015-a3eb8ddeffcc/go.mod h1:kopuH9ugFRkIXf3YoqHKyrJ9YfUFsckUU9S7B+XP+is=
github.com/lestrrat-go/strftime v1.1.0 h1:gMESpZy44/4pXLO/m+sL0yBd1W6LjgjrrD4a68Gapyg=
@ -41,8 +47,12 @@ golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc=
golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.29.0 h1:L6pJp37ocefwRRtYPKSWOWzOtWSxVajvz2ldH/xi3iU=
golang.org/x/term v0.29.0/go.mod h1:6bl4lRlvVuDgSf3179VpIxBF0o10JUpXWOnI7nErv7s=
golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM=
golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY=
golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY=
golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4=
golang.org/x/tools v0.26.0 h1:v/60pFQmzmT9ExmjDv2gGIfi3OqfKoEP6I5+umXlbnQ=
golang.org/x/tools v0.26.0/go.mod h1:TPVVj70c7JJ3WCazhD8OdXcZg/og+b9+tH/KxylGwH0=
gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk=
gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

View file

@ -70,6 +70,7 @@ var TRANSFORMER_LOOKUP_TABLE = []TransformerSetup{
StepSetup,
SubSetup,
SummarySetup,
SurvSetup,
TacSetup,
TailSetup,
TeeSetup,

173
pkg/transformers/surv.go Normal file
View file

@ -0,0 +1,173 @@
package transformers
import (
"container/list"
"fmt"
"os"
"strings"
"github.com/johnkerl/miller/v6/pkg/cli"
"github.com/johnkerl/miller/v6/pkg/mlrval"
"github.com/johnkerl/miller/v6/pkg/types"
"github.com/kshedden/statmodel/duration"
"github.com/kshedden/statmodel/statmodel"
)
// ----------------------------------------------------------------
// verbNameSurv is the name of the surv verb as typed on the command line. It
// is used in the verb's setup entry and in its usage and error messages.
const verbNameSurv = "surv"
// SurvSetup defines the surv verb: Kaplan-Meier survival curve.
// It bundles the verb's name, its usage printer, and its command-line parser
// into a TransformerSetup. IgnoresInput is false because the verb reads its
// duration and status values from the input records.
var SurvSetup = TransformerSetup{
Verb: verbNameSurv,
UsageFunc: transformerSurvUsage,
ParseCLIFunc: transformerSurvParseCLI,
IgnoresInput: false,
}
// transformerSurvUsage writes the help text for the surv verb to o: a one-line
// usage synopsis, followed by a short description and the list of options.
func transformerSurvUsage(o *os.File) {
	// Everything after the synopsis line; the leading newline is part of the text.
	const details = `
Estimate Kaplan-Meier survival curve (right-censored).
Options:
-d {field} Name of duration field (time-to-event or censoring).
-s {field} Name of status field (0=censored, 1=event).
-h, --help Show this message.
`

	fmt.Fprintf(o, "Usage: %s %s -d {duration-field} -s {status-field}\n", "mlr", verbNameSurv)
	fmt.Fprint(o, details)
}
// transformerSurvParseCLI parses the command-line arguments of the surv verb.
//
// On entry args[*pargi] is the verb name. Flags are consumed until the first
// argument that does not start with "-" (a lone "-" or "--" also ends flag
// parsing), and *pargi is advanced past everything consumed.
//
// Recognized flags:
//
//	-d {field}   name of the duration field (required)
//	-s {field}   name of the status field (required)
//	-h, --help   print usage to stdout and exit 0
//
// Any other flag, or a -d/-s with no following argument, is reported on stderr
// and the process exits with status 1. Unrecognized flags used to end flag
// parsing silently, leaving the bad flag for the caller to interpret as a
// non-option argument.
//
// When doConstruct is false the function only advances *pargi and returns nil.
// Otherwise it checks that both -d and -s were given (exiting with status 1 if
// not) and returns a new surv transformer.
func transformerSurvParseCLI(
	pargi *int,
	argc int,
	args []string,
	_ *cli.TOptions,
	doConstruct bool,
) IRecordTransformer {
	// Step over the verb name itself.
	argi := *pargi
	verb := args[argi]
	argi++

	// flagValueOrDie returns the argument following the flag at args[argi] and
	// moves argi past both; it exits if the flag is the last argument.
	flagValueOrDie := func(opt string) string {
		if argi+1 >= argc {
			fmt.Fprintf(os.Stderr, "mlr %s: %s requires an argument\n", verb, opt)
			os.Exit(1)
		}
		value := args[argi+1]
		argi += 2
		return value
	}

	var durationField, statusField string

	for argi < argc {
		opt := args[argi]
		// Non-flags, and the bare "-" / "--" tokens, are not ours to consume:
		// stop here and leave them for the caller, exactly as before.
		if !strings.HasPrefix(opt, "-") || opt == "-" || opt == "--" {
			break
		}

		switch opt {
		case "-h", "--help":
			transformerSurvUsage(os.Stdout)
			os.Exit(0)
		case "-d":
			durationField = flagValueOrDie(opt)
		case "-s":
			statusField = flagValueOrDie(opt)
		default:
			// Fail loudly rather than handing a mistyped flag to the caller.
			fmt.Fprintf(os.Stderr, "mlr %s: option \"%s\" not recognized.\n", verb, opt)
			fmt.Fprintf(os.Stderr, "Please see 'mlr %s --help' for more information.\n", verb)
			os.Exit(1)
		}
	}

	*pargi = argi
	if !doConstruct {
		return nil
	}

	// Both field names are mandatory; there are no defaults.
	if durationField == "" {
		fmt.Fprintf(os.Stderr, "mlr %s: -d option is required.\n", verbNameSurv)
		fmt.Fprintf(os.Stderr, "Please see 'mlr %s --help' for more information.\n", verbNameSurv)
		os.Exit(1)
	}
	if statusField == "" {
		fmt.Fprintf(os.Stderr, "mlr %s: -s option is required.\n", verbNameSurv)
		fmt.Fprintf(os.Stderr, "Please see 'mlr %s --help' for more information.\n", verbNameSurv)
		os.Exit(1)
	}

	return NewTransformerSurv(durationField, statusField)
}
// TransformerSurv holds fields for surv verb.
// It remembers which input fields carry the duration and the status, and
// accumulates one (time, event) pair per input record until end of stream.
type TransformerSurv struct {
// Name of the input field holding each record's duration.
durationField string
// Name of the input field holding each record's status.
statusField string
// Durations collected so far, one per input record, converted to float.
times []float64
// Event indicators parallel to times: true when the record's status was nonzero.
events []bool
}
// NewTransformerSurv builds a surv transformer that reads durations from
// durationField and statuses from statusField. Its accumulators start out
// empty but non-nil.
func NewTransformerSurv(durationField, statusField string) IRecordTransformer {
	tr := new(TransformerSurv)
	tr.durationField = durationField
	tr.statusField = statusField
	tr.times = []float64{}
	tr.events = []bool{}
	return tr
}
// Transform accumulates input records and, at end of stream, emits the
// estimated survival curve.
//
// For every record that is not the end-of-stream marker, the duration and
// status fields are looked up and converted to floats with
// GetNumericToFloatValueOrDie; the duration and a "status is nonzero" flag are
// stored, and nothing is emitted. If either field is absent from the record,
// an error is written to stderr and the process exits with status 1.
//
// At end of stream the stored values are handed to the statmodel duration
// package (NewSurvfuncRight, then Fit). One output record is emitted per time
// returned by the fit, with fields "time" and "survival", followed by the
// end-of-stream marker. If no records were stored, only the end-of-stream
// marker is emitted.
func (tr *TransformerSurv) Transform(
	inrecAndContext *types.RecordAndContext,
	outputRecordsAndContexts *list.List,
	inputDownstreamDoneChannel <-chan bool,
	outputDownstreamDoneChannel chan<- bool,
) {
	HandleDefaultDownstreamDone(inputDownstreamDoneChannel, outputDownstreamDoneChannel)

	// Ordinary record: remember its (time, event) pair and emit nothing yet.
	if !inrecAndContext.EndOfStream {
		record := inrecAndContext.Record

		durationValue := record.Get(tr.durationField)
		if durationValue == nil {
			fmt.Fprintf(os.Stderr, "mlr surv: duration field '%s' not found\n", tr.durationField)
			os.Exit(1)
		}
		// Named timeToEvent so the imported duration package is not shadowed.
		timeToEvent := durationValue.GetNumericToFloatValueOrDie()

		statusValue := record.Get(tr.statusField)
		if statusValue == nil {
			fmt.Fprintf(os.Stderr, "mlr surv: status field '%s' not found\n", tr.statusField)
			os.Exit(1)
		}
		// Any nonzero status counts as an event; zero means censored.
		isEvent := statusValue.GetNumericToFloatValueOrDie() != 0

		tr.times = append(tr.times, timeToEvent)
		tr.events = append(tr.events, isEvent)
		return
	}

	// End of stream with nothing accumulated: just pass the marker along.
	count := len(tr.times)
	if count == 0 {
		outputRecordsAndContexts.PushBack(inrecAndContext)
		return
	}

	// The dataset wants float columns, so encode the event flags as 1/0.
	// make() zero-fills, so only the events need to be written.
	statusColumn := make([]float64, count)
	for idx, isEvent := range tr.events {
		if isEvent {
			statusColumn[idx] = 1.0
		}
	}

	// Compute survival using kshedden/statmodel.
	dataset := statmodel.NewDataset(
		[][]float64{tr.times, statusColumn},
		[]string{tr.durationField, tr.statusField},
	)
	survfunc, err := duration.NewSurvfuncRight(
		dataset,
		tr.durationField,
		tr.statusField,
		&duration.SurvfuncRightConfig{},
	)
	if err != nil {
		fmt.Fprintf(os.Stderr, "mlr surv: %v\n", err)
		os.Exit(1)
	}
	survfunc.Fit()

	// One output record per time reported by the fit.
	probabilities := survfunc.SurvProb()
	for idx, when := range survfunc.Time() {
		outrec := mlrval.NewMlrmapAsRecord()
		outrec.PutCopy("time", mlrval.FromFloat(when))
		outrec.PutCopy("survival", mlrval.FromFloat(probabilities[idx]))
		outputRecordsAndContexts.PushBack(types.NewRecordAndContext(outrec, &inrecAndContext.Context))
	}

	// Finally forward the end-of-stream marker.
	outputRecordsAndContexts.PushBack(inrecAndContext)
}

View file

@ -1261,6 +1261,16 @@ Options:
--transpose Show output with field names as column names..
-h|--help Show this message.
================================================================
surv
Usage: mlr surv -d {duration-field} -s {status-field}
Estimate Kaplan-Meier survival curve (right-censored).
Options:
-d {field} Name of duration field (time-to-event or censoring).
-s {field} Name of status field (0=censored, 1=event).
-h, --help Show this message.
================================================================
tac
Usage: mlr tac [options]

View file

@ -0,0 +1 @@
mlr --csv --from test/input/surv.csv surv -d duration -s status

View file

View file

@ -0,0 +1,4 @@
time,survival
1.00000000,0.80000000
3.00000000,0.53333333
5.00000000,0.00000000

6
test/input/surv.csv Normal file
View file

@ -0,0 +1,6 @@
duration,status
1,1
2,0
3,1
4,0
5,1
1 duration status
2 1 1
3 2 0
4 3 1
5 4 0
6 5 1