mirror of
https://github.com/johnkerl/miller.git
synced 2026-01-23 18:25:45 +00:00
475 lines
16 KiB
HTML
475 lines
16 KiB
HTML
<!DOCTYPE html>
|
|
<html lang="en">
|
|
|
|
<!-- PAGE GENERATED FROM template.html and content-for-data-sharing.html BY poki. -->
|
|
<!-- PLEASE MAKE CHANGES THERE AND THEN RE-RUN poki. -->
|
|
<head>
|
|
<meta http-equiv="Content-type" content="text/html;charset=UTF-8"/>
|
|
<meta name="description" content="Miller documentation"/>
|
|
<meta name="viewport" content="width=device-width, initial-scale=1.0"/> <!-- mobile-friendly -->
|
|
<meta name="keywords"
|
|
content="John Kerl, Kerl, Miller, miller, mlr, OLAP, data analysis software, regression, correlation, variance, data tools" />
|
|
|
|
<title> Mixing with other languages </title>
|
|
<link rel="stylesheet" type="text/css" href="css/miller.css"/>
|
|
<link rel="stylesheet" type="text/css" href="css/poki-callbacks.css"/>
|
|
</head>
|
|
|
|
<!-- ================================================================ -->
|
|
<script type="text/javascript">
|
|
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
|
|
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
|
|
</script>
|
|
<script type="text/javascript">
|
|
try {
|
|
var pageTracker = _gat._getTracker("UA-15651652-1");
|
|
pageTracker._trackPageview();
|
|
} catch(err) {}
|
|
</script>
|
|
<!-- ================================================================ -->
|
|
|
|
<body bgcolor="#ffffff">
|
|
|
|
<!-- ================================================================ -->
|
|
|
|
<!-- navbar -->
|
|
<div class="pokinav">
|
|
<center><titleinbody>Miller</titleinbody></center>
|
|
|
|
<!-- NAVBAR GENERATED FROM template.html BY poki -->
|
|
<br/>
|
|
<a class="poki-navbar-element" href="index.html">Overview</a>
|
|
|
|
<a class="poki-navbar-element" href="faq.html"><b>Using</b></a>
|
|
|
|
<a class="poki-navbar-element" href="reference.html">Reference</a>
|
|
|
|
<a class="poki-navbar-element" href="why.html">Background</a>
|
|
|
|
<a class="poki-navbar-element" href="contact.html">Repository</a>
|
|
|
|
<br/>
|
|
<br/><a href="faq.html">FAQ</a>
|
|
<br/><a href="data-sharing.html"><b>Mixing with other languages</b></a>
|
|
<br/><a href="cookbook.html">Cookbook part 1</a>
|
|
<br/><a href="cookbook2.html">Cookbook part 2</a>
|
|
<br/><a href="cookbook3.html">Cookbook part 3</a>
|
|
<br/><a href="data-examples.html">Data-diving examples</a>
|
|
</div>
|
|
|
|
<!-- page body -->
|
|
<p/>
|
|
|
|
<!-- BODY COPIED FROM content-for-data-sharing.html BY poki -->
|
|
<div class="pokitoc">
|
|
<center><titleinbody>Mixing with other languages</titleinbody></center>
|
|
• <a href="#DKVP_I/O_in_Python">DKVP I/O in Python</a><br/>
|
|
• <a href="#DKVP_I/O_in_Ruby">DKVP I/O in Ruby</a><br/>
|
|
• <a href="#SQL-output_examples">SQL-output examples</a><br/>
|
|
• <a href="#SQL-input_examples">SQL-input examples</a><br/>
|
|
• <a href="#Running_shell_commands">Running shell commands</a><br/>
|
|
</div>
|
|
<p/>
|
|
|
|
<p/>
|
|
<button type="button" style="font-weight:bold;color:maroon;border:0" onclick="bodyToggler.expandAll();">Expand all sections</button>
|
|
<button type="button" style="font-weight:bold;color:maroon;border:0" onclick="bodyToggler.collapseAll();">Collapse all sections</button>
|
|
|
|
<p/> As discussed in the section on
|
|
<a href="file-formats.html">File formats</a>, Miller supports several
|
|
different file formats. Different tools are good at different things, so
|
|
it’s important to be able to move data into and out of other languages.
|
|
<b>CSV</b> and <b>JSON</b> are well-known, of course; here are some examples
|
|
using <b>DKVP</b> format, with <b>Ruby</b> and <b>Python</b>. Last, we show how
|
|
to use arbitrary <b>shell commands</b> to extend functionality beyond Miller’s
|
|
domain-specific language.
|
|
|
|
<a id="DKVP_I/O_in_Python"></a><h1>DKVP I/O in Python</h1>
|
|
<button type="button" style="font-weight:bold;color:maroon;border:0" onclick="bodyToggler.toggle('body_section_toggle_dkvp_python');">Toggle section visibility</button>
|
|
<div id="body_section_toggle_dkvp_python" style="display: block">
|
|
|
|
<p/>
|
|
Here are the I/O routines:
|
|
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
#!/usr/bin/env python
|
|
|
|
# ================================================================
|
|
# Example of DKVP I/O using Python.
|
|
#
|
|
# Key point: Use Miller for what it's good at; pass data into/out of tools in
|
|
# other languages to do what they're good at.
|
|
#
|
|
# bash$ python -i dkvp_io.py
|
|
#
|
|
# # READ
|
|
# >>> map = dkvpline2map('x=1,y=2', '=', ',')
|
|
# >>> map
|
|
# OrderedDict([('x', 1), ('y', 2)])
|
|
#
|
|
# # MODIFY
|
|
# >>> map['z'] = map['x'] + map['y']
|
|
# >>> map
|
|
# OrderedDict([('x', 1), ('y', 2), ('z', 3)])
|
|
#
|
|
# # WRITE
|
|
# >>> line = map2dkvpline(map, '=', ',')
|
|
# >>> line
|
|
# 'x=1,y=2,z=3'
|
|
#
|
|
# ================================================================
|
|
|
|
import re
|
|
import collections
|
|
|
|
# ----------------------------------------------------------------
|
|
# ips and ifs (input pair separator and input field separator) are nominally '=' and ','.
|
|
def dkvpline2map(line, ips, ifs):
|
|
pairs = re.split(ifs, line)
|
|
map = collections.OrderedDict()
|
|
for pair in pairs:
|
|
key, value = re.split(ips, pair, 1)
|
|
|
|
# Type inference:
|
|
try:
|
|
value = int(value)
|
|
except:
|
|
try:
|
|
value = float(value)
|
|
except:
|
|
pass
|
|
|
|
map[key] = value
|
|
return map
|
|
|
|
# ----------------------------------------------------------------
|
|
# ops and ofs (output pair separator and output field separator) are nominally '=' and ','.
|
|
def map2dkvpline(map , ops, ofs):
|
|
line = ''
|
|
pairs = []
|
|
for key in map:
|
|
pairs.append(str(key) + ops + str(map[key]))
|
|
return str.join(ofs, pairs)
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
|
|
And here is an example using them:
|
|
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ cat polyglot-dkvp-io/example.py
|
|
#!/usr/bin/env python
|
|
|
|
import sys
|
|
import re
|
|
import copy
|
|
import dkvp_io
|
|
|
|
while True:
|
|
# Read the original record:
|
|
line = sys.stdin.readline().strip()
|
|
if line == '':
|
|
break
|
|
map = dkvp_io.dkvpline2map(line, '=', ',')
|
|
|
|
# Drop a field:
|
|
map.pop('x')
|
|
|
|
# Compute some new fields:
|
|
map['ab'] = map['a'] + map['b']
|
|
map['iy'] = map['i'] + map['y']
|
|
|
|
# Add new fields which show type of each already-existing field:
|
|
omap = copy.copy(map) # since otherwise the for-loop will modify what it loops over
|
|
keys = omap.keys()
|
|
for key in keys:
|
|
# Convert "&lt;type 'int'&gt;" to just "int", etc.:
|
|
type_string = str(map[key].__class__)
|
|
type_string = re.sub("&lt;type '", "", type_string) # python2
|
|
type_string = re.sub("&lt;class '", "", type_string) # python3
|
|
type_string = re.sub("'>", "", type_string)
|
|
map['t'+key] = type_string
|
|
|
|
# Write the modified record:
|
|
print(dkvp_io.map2dkvpline(map, '=', ','))
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
|
|
Run as-is:
|
|
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ python polyglot-dkvp-io/example.py &lt; data/small
|
|
a=pan,b=pan,i=1,y=0.726802862743,ab=panpan,iy=1.72680286274,ta=str,tb=str,ti=int,ty=float,tab=str,tiy=float
|
|
a=eks,b=pan,i=2,y=0.522151108333,ab=ekspan,iy=2.52215110833,ta=str,tb=str,ti=int,ty=float,tab=str,tiy=float
|
|
a=wye,b=wye,i=3,y=0.338318525517,ab=wyewye,iy=3.33831852552,ta=str,tb=str,ti=int,ty=float,tab=str,tiy=float
|
|
a=eks,b=wye,i=4,y=0.134188743284,ab=ekswye,iy=4.13418874328,ta=str,tb=str,ti=int,ty=float,tab=str,tiy=float
|
|
a=wye,b=pan,i=5,y=0.863624469903,ab=wyepan,iy=5.8636244699,ta=str,tb=str,ti=int,ty=float,tab=str,tiy=float
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
|
|
Run as-is, then pipe to Miller for pretty-printing:
|
|
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ python polyglot-dkvp-io/example.py &lt; data/small | mlr --opprint cat
|
|
a b i y ab iy ta tb ti ty tab tiy
|
|
pan pan 1 0.726802862743 panpan 1.72680286274 str str int float str float
|
|
eks pan 2 0.522151108333 ekspan 2.52215110833 str str int float str float
|
|
wye wye 3 0.338318525517 wyewye 3.33831852552 str str int float str float
|
|
eks wye 4 0.134188743284 ekswye 4.13418874328 str str int float str float
|
|
wye pan 5 0.863624469903 wyepan 5.8636244699 str str int float str float
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
|
|
</div>
|
|
<a id="DKVP_I/O_in_Ruby"></a><h1>DKVP I/O in Ruby</h1>
|
|
<button type="button" style="font-weight:bold;color:maroon;border:0" onclick="bodyToggler.toggle('body_section_toggle_dkvp_ruby');">Toggle section visibility</button>
|
|
<div id="body_section_toggle_dkvp_ruby" style="display: block">
|
|
|
|
<p/>
|
|
Here are the I/O routines:
|
|
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
#!/usr/bin/env ruby
|
|
|
|
# ================================================================
|
|
# Example of DKVP I/O using Ruby.
|
|
#
|
|
# Key point: Use Miller for what it's good at; pass data into/out of tools in
|
|
# other languages to do what they're good at.
|
|
#
|
|
# bash$ irb -I. -r dkvp_io.rb
|
|
#
|
|
# # READ
|
|
# irb(main):001:0> map = dkvpline2map('x=1,y=2', '=', ',')
|
|
# => {"x"=>1, "y"=>2}
|
|
#
|
|
# # MODIFY
|
|
# irb(main):001:0> map['z'] = map['x'] + map['y']
|
|
# => 3
|
|
#
|
|
# # WRITE
|
|
# irb(main):002:0> line = map2dkvpline(map, '=', ',')
|
|
# => "x=1,y=2,z=3"
|
|
#
|
|
# ================================================================
|
|
|
|
# ----------------------------------------------------------------
|
|
# ips and ifs (input pair separator and input field separator) are nominally '=' and ','.
|
|
def dkvpline2map(line, ips, ifs)
|
|
map = {}
|
|
line.split(ifs).each do |pair|
|
|
(k, v) = pair.split(ips, 2)
|
|
|
|
# Type inference:
|
|
begin
|
|
v = Integer(v)
|
|
rescue ArgumentError
|
|
begin
|
|
v = Float(v)
|
|
rescue ArgumentError
|
|
# Leave as string
|
|
end
|
|
end
|
|
|
|
map[k] = v
|
|
end
|
|
map
|
|
end
|
|
|
|
# ----------------------------------------------------------------
|
|
# ops and ofs (output pair separator and output field separator) are nominally '=' and ','.
|
|
def map2dkvpline(map, ops, ofs)
|
|
map.collect{|k,v| k.to_s + ops + v.to_s}.join(ofs)
|
|
end
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
|
|
And here is an example using them:
|
|
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ cat polyglot-dkvp-io/example.rb
|
|
#!/usr/bin/env ruby
|
|
|
|
require 'dkvp_io'
|
|
|
|
ARGF.each do |line|
|
|
# Read the original record:
|
|
map = dkvpline2map(line.chomp, '=', ',')
|
|
|
|
# Drop a field:
|
|
map.delete('x')
|
|
|
|
# Compute some new fields:
|
|
map['ab'] = map['a'] + map['b']
|
|
map['iy'] = map['i'] + map['y']
|
|
|
|
# Add new fields which show type of each already-existing field:
|
|
keys = map.keys
|
|
keys.each do |key|
|
|
map['t'+key] = map[key].class
|
|
end
|
|
|
|
# Write the modified record:
|
|
puts map2dkvpline(map, '=', ',')
|
|
end
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
|
|
Run as-is:
|
|
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ ruby -I./polyglot-dkvp-io polyglot-dkvp-io/example.rb data/small
|
|
a=pan,b=pan,i=1,y=0.7268028627434533,ab=panpan,iy=1.7268028627434533,ta=String,tb=String,ti=Integer,ty=Float,tab=String,tiy=Float
|
|
a=eks,b=pan,i=2,y=0.5221511083334797,ab=ekspan,iy=2.5221511083334796,ta=String,tb=String,ti=Integer,ty=Float,tab=String,tiy=Float
|
|
a=wye,b=wye,i=3,y=0.33831852551664776,ab=wyewye,iy=3.3383185255166477,ta=String,tb=String,ti=Integer,ty=Float,tab=String,tiy=Float
|
|
a=eks,b=wye,i=4,y=0.13418874328430463,ab=ekswye,iy=4.134188743284304,ta=String,tb=String,ti=Integer,ty=Float,tab=String,tiy=Float
|
|
a=wye,b=pan,i=5,y=0.8636244699032729,ab=wyepan,iy=5.863624469903273,ta=String,tb=String,ti=Integer,ty=Float,tab=String,tiy=Float
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
|
|
Run as-is, then pipe to Miller for pretty-printing:
|
|
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ ruby -I./polyglot-dkvp-io polyglot-dkvp-io/example.rb data/small | mlr --opprint cat
|
|
a b i y ab iy ta tb ti ty tab tiy
|
|
pan pan 1 0.7268028627434533 panpan 1.7268028627434533 String String Integer Float String Float
|
|
eks pan 2 0.5221511083334797 ekspan 2.5221511083334796 String String Integer Float String Float
|
|
wye wye 3 0.33831852551664776 wyewye 3.3383185255166477 String String Integer Float String Float
|
|
eks wye 4 0.13418874328430463 ekswye 4.134188743284304 String String Integer Float String Float
|
|
wye pan 5 0.8636244699032729 wyepan 5.863624469903273 String String Integer Float String Float
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
|
|
</div>
|
|
|
|
<a id="SQL-output_examples"></a><h1>SQL-output examples</h1>
|
|
<button type="button" style="font-weight:bold;color:maroon;border:0" onclick="bodyToggler.toggle('body_section_toggle_sql_output_examples');">Toggle section visibility</button>
|
|
<div id="body_section_toggle_sql_output_examples" style="display: block">
|
|
|
|
<p/>Please see the <a href="10-min.html#SQL-output_examples">SQL-output examples</a> on the <code>10-min.html</code> page.
|
|
|
|
</div>
|
|
|
|
<a id="SQL-input_examples"></a><h1>SQL-input examples</h1>
|
|
<button type="button" style="font-weight:bold;color:maroon;border:0" onclick="bodyToggler.toggle('body_section_toggle_sql_input_examples');">Toggle section visibility</button>
|
|
<div id="body_section_toggle_sql_input_examples" style="display: block">
|
|
|
|
<p/>Please see the <a href="10-min.html#SQL-input_examples">SQL-input examples</a> on the <code>10-min.html</code> page.
|
|
|
|
</div>
|
|
<a id="Running_shell_commands"></a><h1>Running shell commands</h1>
|
|
<button type="button" style="font-weight:bold;color:maroon;border:0" onclick="bodyToggler.toggle('body_section_toggle_shell_commands');">Toggle section visibility</button>
|
|
<div id="body_section_toggle_shell_commands" style="display: block">
|
|
|
|
<p/>The <a href="reference-dsl.html#system">system</a> DSL function allows you to run a specific shell command and put its output — minus the final newline — into a record field. The command itself is any string, either a literal string, or a concatenation of strings, perhaps including other field values or what have you.
|
|
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr --opprint put '$o = system("echo hello world")' data/small
|
|
a b i x y o
|
|
pan pan 1 0.3467901443380824 0.7268028627434533 hello world
|
|
eks pan 2 0.7586799647899636 0.5221511083334797 hello world
|
|
wye wye 3 0.20460330576630303 0.33831852551664776 hello world
|
|
eks wye 4 0.38139939387114097 0.13418874328430463 hello world
|
|
wye pan 5 0.5732889198020006 0.8636244699032729 hello world
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr --opprint put '$o = system("echo {" . NR . "}")' data/small
|
|
a b i x y o
|
|
pan pan 1 0.3467901443380824 0.7268028627434533 {1}
|
|
eks pan 2 0.7586799647899636 0.5221511083334797 {2}
|
|
wye wye 3 0.20460330576630303 0.33831852551664776 {3}
|
|
eks wye 4 0.38139939387114097 0.13418874328430463 {4}
|
|
wye pan 5 0.5732889198020006 0.8636244699032729 {5}
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr --opprint put '$o = system("echo -n ".$a."| sha1sum")' data/small
|
|
a b i x y o
|
|
pan pan 1 0.3467901443380824 0.7268028627434533 bd2bd8216b9cb4aa5a12daa6cfc98eef2ee20e56 -
|
|
eks pan 2 0.7586799647899636 0.5221511083334797 16191338e81a46c7d127f5c8899f5c92e3cd38e3 -
|
|
wye wye 3 0.20460330576630303 0.33831852551664776 14ba3c3e96a2474ab6dc7409ebf9d6b9cc3d84f0 -
|
|
eks wye 4 0.38139939387114097 0.13418874328430463 16191338e81a46c7d127f5c8899f5c92e3cd38e3 -
|
|
wye pan 5 0.5732889198020006 0.8636244699032729 14ba3c3e96a2474ab6dc7409ebf9d6b9cc3d84f0 -
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
|
|
<p/>Note that running a subprocess on every record takes a non-trivial amount of time. Compare asking the system <code>date</code> command for the current time in nanoseconds with computing it in-process:
|
|
|
|
<!-- hard-coded since %N doesn't exist on all platforms -->
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr --opprint put '$t=system("date +%s.%N")' then step -a delta -f t data/small
|
|
a b i x y t t_delta
|
|
pan pan 1 0.3467901443380824 0.7268028627434533 1568774318.513903817 0
|
|
eks pan 2 0.7586799647899636 0.5221511083334797 1568774318.514722876 0.000819
|
|
wye wye 3 0.20460330576630303 0.33831852551664776 1568774318.515618046 0.000895
|
|
eks wye 4 0.38139939387114097 0.13418874328430463 1568774318.516547441 0.000929
|
|
wye pan 5 0.5732889198020006 0.8636244699032729 1568774318.517518828 0.000971
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
<p/>
|
|
<div class="pokipanel">
|
|
<pre>
|
|
$ mlr --opprint put '$t=systime()' then step -a delta -f t data/small
|
|
a b i x y t t_delta
|
|
pan pan 1 0.3467901443380824 0.7268028627434533 1568774318.518699 0
|
|
eks pan 2 0.7586799647899636 0.5221511083334797 1568774318.518717 0.000018
|
|
wye wye 3 0.20460330576630303 0.33831852551664776 1568774318.518723 0.000006
|
|
eks wye 4 0.38139939387114097 0.13418874328430463 1568774318.518727 0.000004
|
|
wye pan 5 0.5732889198020006 0.8636244699032729 1568774318.518730 0.000003
|
|
</pre>
|
|
</div>
|
|
<p/>
|
|
|
|
</div>
|
|
|
|
<!-- ================================================================ -->
|
|
<script type="text/javascript" src="js/miller-doc-toggler.js"></script>
|
|
<!-- wtf -->
|
|
<script type="text/javascript">
|
|
// Put this at the bottom of the page since its constructor scans the
|
|
// document's div tags to find the toggleables.
|
|
const bodyToggler = new MillerDocToggler(
|
|
"body_section_toggle_",
|
|
'maroon',
|
|
'maroon',
|
|
);
|
|
</script>
|
|
|
|
</body>
|
|
</html>
|