Re-use whitespace regexp in clean_whitespace (#994)

* DSL strict mode

* Re-use whitespace regexp in clean_whitespace
This commit is contained in:
John Kerl 2022-03-20 12:17:51 -04:00 committed by GitHub
parent ccf9d0fdc5
commit 4191c35b7c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
16 changed files with 130 additions and 42 deletions

View file

@ -88,6 +88,7 @@ func ReplMain(args []string) int {
showPrompts := true
astPrintMode := ASTPrintNone
doWarnings := false
strictMode := false
options := cli.DefaultOptions()
for argi < argc /* variable increment: 1 or 2 depending on flag */ {
@ -117,6 +118,10 @@ func ReplMain(args []string) int {
doWarnings = true
argi++
} else if args[argi] == "-z" {
strictMode = true
argi++
} else if args[argi] == "--load" {
if argc-argi < 2 {
replUsage(replName, os.Stderr, 1)
@ -162,6 +167,7 @@ func ReplMain(args []string) int {
showPrompts,
astPrintMode,
doWarnings,
strictMode,
options,
recordOutputFileName,
recordOutputStream,

View file

@ -43,6 +43,7 @@ func NewRepl(
showPrompts bool,
astPrintMode ASTPrintMode,
doWarnings bool,
strictMode bool,
options *cli.TOptions,
recordOutputFileName string,
recordOutputStream *os.File,
@ -63,7 +64,7 @@ func NewRepl(
// NR is 0, etc until/unless the user opens a file and reads records from it.
context := types.NewContext()
runtimeState := runtime.NewEmptyState(options)
runtimeState := runtime.NewEmptyState(options, strictMode)
runtimeState.Update(inrec, context)
// The filter expression for the main Miller DSL is any non-assignment
// statement like 'true' or '$x > 0.5' etc. For the REPL, we re-use this for
@ -78,7 +79,9 @@ func NewRepl(
signal.Notify(sysToSignalHandlerChannel, os.Interrupt, syscall.SIGTERM)
go controlCHandler(sysToSignalHandlerChannel, appSignalNotificationChannel)
cstRootNode := cst.NewEmptyRoot(&options.WriterOptions, cst.DSLInstanceTypeREPL).WithRedefinableUDFUDS()
cstRootNode := cst.NewEmptyRoot(
&options.WriterOptions, cst.DSLInstanceTypeREPL,
).WithRedefinableUDFUDS().WithStrictMode(strictMode)
// TODO

View file

@ -182,7 +182,7 @@ func BIF_strip(input1 *mlrval.Mlrval) *mlrval.Mlrval {
// ----------------------------------------------------------------
func BIF_collapse_whitespace(input1 *mlrval.Mlrval) *mlrval.Mlrval {
return BIF_collapse_whitespace_regexp(input1, WhitespaceRegexp())
return BIF_collapse_whitespace_regexp(input1, _whitespace_regexp)
}
func BIF_collapse_whitespace_regexp(input1 *mlrval.Mlrval, whitespaceRegexp *regexp.Regexp) *mlrval.Mlrval {
@ -193,9 +193,7 @@ func BIF_collapse_whitespace_regexp(input1 *mlrval.Mlrval, whitespaceRegexp *reg
}
}
// _whitespace_regexp matches a run of one or more whitespace characters. It is
// compiled exactly once, at package initialization, and shared by every
// caller so that no per-call regexp compilation is needed.
var _whitespace_regexp = regexp.MustCompile(`\s+`)

// WhitespaceRegexp returns the shared, precompiled whitespace regexp. It is
// retained for callers which use the accessor form; it no longer compiles a
// new regexp on each invocation. The returned value is shared, so callers
// must not invoke configuration methods such as Longest on it.
func WhitespaceRegexp() *regexp.Regexp {
	return _whitespace_regexp
}
// ================================================================
func BIF_toupper(input1 *mlrval.Mlrval) *mlrval.Mlrval {
@ -239,7 +237,7 @@ func BIF_capitalize(input1 *mlrval.Mlrval) *mlrval.Mlrval {
func BIF_clean_whitespace(input1 *mlrval.Mlrval) *mlrval.Mlrval {
return BIF_strip(
BIF_collapse_whitespace_regexp(
input1, WhitespaceRegexp(),
input1, _whitespace_regexp,
),
)
}

View file

@ -498,6 +498,7 @@ func (root *RootNode) BuildDotCallsiteNode(
func (node *DotCallsiteNode) Evaluate(
state *runtime.State,
) *mlrval.Mlrval {
// For strict mode, absence should be detected on the node.evaluable1 evaluator.
value1 := node.evaluable1.Evaluate(state)
mapvalue1 := value1.GetMap()
@ -506,7 +507,7 @@ func (node *DotCallsiteNode) Evaluate(
// Case 1: map.attribute as shorthand for map["attribute"]
value2 := mapvalue1.Get(node.string2)
if value2 == nil {
return mlrval.ABSENT
return mlrval.ABSENT.StrictModeCheck(state.StrictMode, "map access [" + node.string2 + "]")
} else {
return value2
}

View file

@ -52,7 +52,7 @@ func (node *ArrayLiteralNode) Evaluate(
}
// ----------------------------------------------------------------
type CollectionIndexAccessNode struct {
type ArrayOrMapIndexAccessNode struct {
baseEvaluable IEvaluable
indexEvaluable IEvaluable
}
@ -75,13 +75,13 @@ func (node *RootNode) BuildArrayOrMapIndexAccessNode(
return nil, err
}
return &CollectionIndexAccessNode{
return &ArrayOrMapIndexAccessNode{
baseEvaluable: baseEvaluable,
indexEvaluable: indexEvaluable,
}, nil
}
func (node *CollectionIndexAccessNode) Evaluate(
func (node *ArrayOrMapIndexAccessNode) Evaluate(
state *runtime.State,
) *mlrval.Mlrval {
baseMlrval := node.baseEvaluable.Evaluate(state)
@ -109,6 +109,7 @@ func (node *CollectionIndexAccessNode) Evaluate(
return mlrval.FromString(string(runes[zindex]))
} else if baseMlrval.IsAbsent() {
// For strict mode, absence should be detected on the baseMlrval and indexMlrval evaluators.
return mlrval.ABSENT
} else {
return mlrval.ERROR
@ -162,6 +163,7 @@ func (node *ArraySliceAccessNode) Evaluate(
upperIndexMlrval := node.upperIndexEvaluable.Evaluate(state)
if baseMlrval.IsAbsent() {
// For strict mode, absence should be detected on the baseMlrval and indexMlrval evaluators.
return mlrval.ABSENT
}
if baseMlrval.IsString() {
@ -229,7 +231,7 @@ func (node *PositionalFieldNameNode) Evaluate(
) *mlrval.Mlrval {
indexMlrval := node.indexEvaluable.Evaluate(state)
if indexMlrval.IsAbsent() {
return mlrval.ABSENT
return mlrval.ABSENT.StrictModeCheck(state.StrictMode, "$[[(absent)]]")
}
index, ok := indexMlrval.GetIntValue()
@ -239,7 +241,7 @@ func (node *PositionalFieldNameNode) Evaluate(
name, ok := state.Inrec.GetNameAtPositionalIndex(index)
if !ok {
return mlrval.ABSENT
return mlrval.ABSENT.StrictModeCheck(state.StrictMode, "$[["+indexMlrval.String()+"]]")
}
return mlrval.FromString(name)
@ -275,7 +277,7 @@ func (node *PositionalFieldValueNode) Evaluate(
) *mlrval.Mlrval {
indexMlrval := node.indexEvaluable.Evaluate(state)
if indexMlrval.IsAbsent() {
return mlrval.ABSENT
return mlrval.ABSENT.StrictModeCheck(state.StrictMode, "$[[[(absent)]]]")
}
index, ok := indexMlrval.GetIntValue()
@ -285,7 +287,7 @@ func (node *PositionalFieldValueNode) Evaluate(
retval := state.Inrec.GetWithPositionalIndex(index)
if retval == nil {
return mlrval.ABSENT
return mlrval.ABSENT.StrictModeCheck(state.StrictMode, "$[[["+indexMlrval.String()+"]]]")
}
return retval
@ -330,6 +332,7 @@ func (node *ArrayOrMapPositionalNameAccessNode) Evaluate(
indexMlrval := node.indexEvaluable.Evaluate(state)
if indexMlrval.IsAbsent() {
// For strict mode, absence should be detected on the baseMlrval and indexMlrval evaluators.
return mlrval.ABSENT
}
@ -356,6 +359,7 @@ func (node *ArrayOrMapPositionalNameAccessNode) Evaluate(
}
} else if baseMlrval.IsAbsent() {
// For strict mode, absence should be detected on the baseMlrval and indexMlrval evaluators.
return mlrval.ABSENT
} else {
@ -402,6 +406,7 @@ func (node *ArrayOrMapPositionalValueAccessNode) Evaluate(
indexMlrval := node.indexEvaluable.Evaluate(state)
if indexMlrval.IsAbsent() {
// For strict mode, absence should be detected on the baseMlrval and indexMlrval evaluators.
return mlrval.ABSENT
}
@ -418,12 +423,14 @@ func (node *ArrayOrMapPositionalValueAccessNode) Evaluate(
} else if baseMlrval.IsMap() {
value := baseMlrval.GetMap().GetWithPositionalIndex(index)
if value == nil {
// For strict mode, absence should be detected on the baseMlrval and indexMlrval evaluators.
return mlrval.ABSENT
}
return value
} else if baseMlrval.IsAbsent() {
// For strict mode, absence should be detected on the baseMlrval and indexMlrval evaluators.
return mlrval.ABSENT
} else {

View file

@ -38,7 +38,7 @@ func (node *EnvironmentVariableNode) Evaluate(
) *mlrval.Mlrval {
name := node.nameEvaluable.Evaluate(state)
if name.IsAbsent() {
return mlrval.ABSENT
return mlrval.ABSENT.StrictModeCheck(state.StrictMode, "ENV[(absent)]")
}
if !name.IsString() {
return mlrval.ERROR

View file

@ -109,7 +109,7 @@ func (node *IndirectFieldValueNode) Evaluate(
) *mlrval.Mlrval { // TODO: err
fieldName := node.fieldNameEvaluable.Evaluate(state)
if fieldName.IsAbsent() {
return mlrval.ABSENT
return mlrval.ABSENT.StrictModeCheck(state.StrictMode, "$[(absent)]")
}
// For normal DSL use the CST validator will prohibit this from being
@ -118,7 +118,7 @@ func (node *IndirectFieldValueNode) Evaluate(
// print inrec attributes. Also, a UDF/UDS invoked from begin/end could try
// to access the inrec, and that would get past the validator.
if state.Inrec == nil {
return mlrval.ABSENT
return mlrval.ABSENT.StrictModeCheck(state.StrictMode, "$*")
}
value, err := state.Inrec.GetWithMlrvalIndex(fieldName)
@ -129,7 +129,7 @@ func (node *IndirectFieldValueNode) Evaluate(
os.Exit(1)
}
if value == nil {
return mlrval.ABSENT
return mlrval.ABSENT.StrictModeCheck(state.StrictMode, "$[" + fieldName.String() + "]")
}
return value
}
@ -159,12 +159,12 @@ func (node *IndirectOosvarValueNode) Evaluate(
) *mlrval.Mlrval { // TODO: err
oosvarName := node.oosvarNameEvaluable.Evaluate(state)
if oosvarName.IsAbsent() {
return mlrval.ABSENT
return mlrval.ABSENT.StrictModeCheck(state.StrictMode, "@[(absent)]")
}
value := state.Oosvars.Get(oosvarName.String())
if value == nil {
return mlrval.ABSENT
return mlrval.ABSENT.StrictModeCheck(state.StrictMode, "@[" + oosvarName.String() + "]")
}
return value

View file

@ -102,11 +102,11 @@ func (node *DirectFieldRvalueNode) Evaluate(
// print inrec attributes. Also, a UDF/UDS invoked from begin/end could try
// to access the inrec, and that would get past the validator.
if state.Inrec == nil {
return mlrval.ABSENT
return mlrval.ABSENT.StrictModeCheck(state.StrictMode, "$*")
}
value := state.Inrec.Get(node.fieldName)
if value == nil {
return mlrval.ABSENT
return mlrval.ABSENT.StrictModeCheck(state.StrictMode, "$"+node.fieldName)
} else {
return value
}
@ -128,7 +128,7 @@ func (node *FullSrecRvalueNode) Evaluate(
// print inrec attributes. Also, a UDF/UDS invoked from begin/end could try
// to access the inrec, and that would get past the validator.
if state.Inrec == nil {
return mlrval.ABSENT
return mlrval.ABSENT.StrictModeCheck(state.StrictMode, "$*")
} else {
return mlrval.FromMap(state.Inrec)
}
@ -149,7 +149,7 @@ func (node *DirectOosvarRvalueNode) Evaluate(
) *mlrval.Mlrval {
value := state.Oosvars.Get(node.variableName)
if value == nil {
return mlrval.ABSENT
return mlrval.ABSENT.StrictModeCheck(state.StrictMode, "@"+node.variableName)
} else {
return value
}
@ -206,7 +206,7 @@ func (node *LocalVariableNode) Evaluate(
// prerequisite since UDFs and BIFs are managed in quite different
// structures.
return mlrval.ABSENT
return mlrval.ABSENT.StrictModeCheck(state.StrictMode, "local variable "+node.stackVariable.GetName())
}
// ----------------------------------------------------------------

View file

@ -779,10 +779,19 @@ type LocalVariableLvalueNode struct {
func (root *RootNode) BuildLocalVariableLvalueNode(astNode *dsl.ASTNode) (IAssignable, error) {
lib.InternalCodingErrorIf(astNode.Type != dsl.NodeTypeLocalVariable)
// TODO require type mask in strict mode
variableName := string(astNode.Token.Lit)
typeName := "any"
defineTypedAtScope := false
if astNode.Children != nil { // typed, like 'num x = 3'
if astNode.Children == nil { // untyped, like 'x = 3'
if root.strictMode {
return nil, fmt.Errorf(
"mlr: need typedecl such as \"var\", \"str\", \"num\", etc. for variable \"%s\" in strict mode",
variableName,
)
}
} else { // typed, like 'num x = 3'
typeNode := astNode.Children[0]
lib.InternalCodingErrorIf(typeNode.Type != dsl.NodeTypeTypedecl)
typeName = string(typeNode.Token.Lit)

View file

@ -52,6 +52,12 @@ func (root *RootNode) WithRedefinableUDFUDS() *RootNode {
return root
}
// WithStrictMode stores the strict-mode setting on the root node and returns
// the same node, so the call can be chained onto NewEmptyRoot and the other
// With... methods. The flag is consulted while building the CST: for example,
// BuildLocalVariableLvalueNode rejects untyped local-variable assignments
// when it is set.
func (root *RootNode) WithStrictMode(strictMode bool) *RootNode {
root.strictMode = strictMode
return root
}
// ----------------------------------------------------------------
// ASTBuildVisitorFunc is a callback, used by RootNode's Build method, which

View file

@ -49,6 +49,7 @@ type RootNode struct {
outputHandlerManagers *list.List
recordWriterOptions *cli.TWriterOptions
dslInstanceType DSLInstanceType // put, filter, repl
strictMode bool
}
// ----------------------------------------------------------------

View file

@ -265,7 +265,10 @@ func (site *UDFCallsite) EvaluateWithArguments(
fmt.Fprint(os.Stderr, err)
os.Exit(1)
}
return mlrval.ABSENT
return mlrval.ABSENT.StrictModeCheck(
state.StrictMode,
"function "+udf.signature.funcOrSubrName+" implicit return value",
)
}
// TODO: should be an internal coding error. This would be break or
@ -277,7 +280,10 @@ func (site *UDFCallsite) EvaluateWithArguments(
fmt.Fprint(os.Stderr, err)
os.Exit(1)
}
return mlrval.ABSENT
return mlrval.ABSENT.StrictModeCheck(
state.StrictMode,
"function "+udf.signature.funcOrSubrName+" abnormal exit",
)
}
// Definitely a Miller internal coding error if the user put 'return x' in
@ -290,6 +296,12 @@ func (site *UDFCallsite) EvaluateWithArguments(
fmt.Fprint(os.Stderr, err)
os.Exit(1)
}
blockExitPayload.blockReturnValue.StrictModeCheck(
state.StrictMode,
"function "+udf.signature.funcOrSubrName+" return value",
)
return blockExitPayload.blockReturnValue.Copy()
}

View file

@ -146,3 +146,11 @@ func (mv *Mlrval) GetNumericToFloatValueOrDie() (floatValue float64) {
// AssertNumeric calls GetNumericToFloatValueOrDie on the receiver and
// discards the float it returns.
// NOTE(review): going by the callee's name, a non-numeric receiver presumably
// ends the program there -- confirm against that method's definition.
func (mv *Mlrval) AssertNumeric() {
	mv.GetNumericToFloatValueOrDie()
}
// StrictModeCheck is a pass-through guard for sites which may produce an
// absent value. If strictMode is true and the receiver is absent, it writes a
// message containing description to standard error and exits the process with
// status 1. In every other case it returns the receiver unchanged, which lets
// callers wrap a return expression with it.
func (mv *Mlrval) StrictModeCheck(strictMode bool, description string) *Mlrval {
	if strictMode && mv.IsAbsent() {
		fmt.Fprintf(os.Stderr, "mlr: %s is absent and strict mode was requested.\n", description)
		os.Exit(1)
	}
	return mv
}

View file

@ -27,9 +27,12 @@ type State struct {
// '$x =~ "(..)_(...)"', and interpolated via things like '$y = "\2:\1"'.
RegexCaptures []string
Options *cli.TOptions
// StrictMode allows for runtime handling of absent-reads and untyped assignments.
StrictMode bool
}
func NewEmptyState(options *cli.TOptions) *State {
func NewEmptyState(options *cli.TOptions, strictMode bool) *State {
oosvars := mlrval.NewMlrmap()
return &State{
Inrec: nil,
@ -43,6 +46,8 @@ func NewEmptyState(options *cli.TOptions) *State {
// See lib.MakeEmptyRegexCaptures for context.
RegexCaptures: lib.MakeEmptyRegexCaptures(),
Options: options,
StrictMode: strictMode,
}
}

View file

@ -206,6 +206,7 @@ func transformerPutOrFilterParseCLI(
exitAfterParse := false
doWarnings := false
warningsAreFatal := false
strictMode := false
invertFilter := false
suppressOutputRecord := false
presets := make([]string, 0)
@ -291,6 +292,11 @@ func transformerPutOrFilterParseCLI(
} else if opt == "-w" {
doWarnings = true
warningsAreFatal = false
} else if opt == "-z" {
// TODO: perhaps doWarnings and warningsAreFatal as well.
// But first I want to see what can be caught at runtime
// without static analysis.
strictMode = true
} else if opt == "-W" {
doWarnings = true
warningsAreFatal = true
@ -355,6 +361,7 @@ func transformerPutOrFilterParseCLI(
exitAfterParse,
doWarnings,
warningsAreFatal,
strictMode,
invertFilter,
suppressOutputRecord,
options,
@ -388,12 +395,13 @@ func NewTransformerPut(
exitAfterParse bool,
doWarnings bool,
warningsAreFatal bool,
strictMode bool,
invertFilter bool,
suppressOutputRecord bool,
options *cli.TOptions,
) (*TransformerPut, error) {
cstRootNode := cst.NewEmptyRoot(&options.WriterOptions, dslInstanceType)
cstRootNode := cst.NewEmptyRoot(&options.WriterOptions, dslInstanceType).WithStrictMode(strictMode)
err := cstRootNode.Build(
dslStrings,
@ -434,7 +442,7 @@ func NewTransformerPut(
return nil, err
}
runtimeState := runtime.NewEmptyState(options)
runtimeState := runtime.NewEmptyState(options, strictMode)
// E.g.
// mlr put -s sum=0

View file

@ -25,6 +25,40 @@ RELEASES
================================================================
FEATURES
----------------------------------------------------------------
STRICT MODE
i theme is handling of 'absent'
? what about handling of 'error' ?
* improve wording:
mlr: couldn't assign variable int function return value from value absent (absent)
* need $?x and @?x in the grammar & CST
* flags:
o mlr -z and mlr put -z
o note put has -w (warn) and -W (fatal)
- then strict mode includes -W?
* tests:
mlr --csv --from $exv put -z 'x = 1'
mlr --csv --from $exv put -z 'var x = a'
mlr --csv --from $exv put -z 'var x = $nonesuch'
mlr --csv --from $exv put -z 'var x = $["asdf"]'
mlr --csv --from $exv put -z 'var x = $[nonesuch]'
mlr --csv --from $exv put -z 'var x = $[[999]]'
mlr --csv --from $exv put -z 'var x = $[[[999]]]'
mlr --csv --from $exv put -z 'begin { var m = $* }'
mlr --csv --from $exv put -z 'var x = @nonesuch'
mlr --csv --from $exv put -z 'var x = @["nonesuch"]'
mlr --csv --from $exv put -z 'func f(): int {}; $x = f()'
mlr --csv --from $exv put -z 'func f() {}; $x = f()'
mlr --csv --from $exv put -z 'func f() {return nonesuch}; $x = f()'
mlr --csv --from $exv put -z '$env = ENV[nonesuch]'
mlr --csv --from $exv put -z '$env = ENV["nonesuch"]'
----------------------------------------------------------------
EXTENDED FIELD ACCESSORS
@ -94,16 +128,6 @@ inference:
o webdocs as in #933 description
* for data files: --symbol-true yes --symbol-false off --symbol-infinity inf --symbol-not-available N/A
----------------------------------------------------------------
strict-mode ideas
* localvar:
o LHS: just require typedecl (even just var)
o RHS: like put -w, but with turning warnings into errors
* oosvar:
o abend unless @?x -- ?
* srec:
o abend unless $?x -- ?
----------------------------------------------------------------
! sysdate, sysdate_local; datediff ...