mirror of
https://github.com/johnkerl/miller.git
synced 2026-01-23 18:25:45 +00:00
125 lines
3.7 KiB
Ruby
Executable file
125 lines
3.7 KiB
Ruby
Executable file
#!/usr/bin/ruby
|
|
|
|
# ================================================================
|
|
# Produces a variety of categorized pseudo-random data in support
|
|
# of Miller documentation.
|
|
# ================================================================
|
|
#
|
|
# Sample output:
|
|
#
|
|
# ./datagen/mkdat2 6 | mlr --opprint cat
|
|
# color shape flag i u v w x
|
|
# purple circle 0 1 0.5637512757306459 0.4981743566291943 0.36884254478967105 4.480962906000271
|
|
# orange square 1 2 0.5232158098904274 0.33717333156510765 0.44646982455699713 5.807558719657881
|
|
# red circle 1 3 0.5084894411433544 0.7025118761232125 0.672558508897624 5.094127602621387
|
|
# blue square 0 4 0.3145642940519666 0.3040179460903778 0.6388947856924174 7.517194060345607
|
|
# yellow triangle 1 5 0.06776212921515201 0.8517576443958519 0.4319941923080997 4.955913436917771
|
|
# red square 0 6 0.24407904404655156 0.4297654986740608 0.6681496181121647 4.702469482713694
|
|
# ================================================================
|
|
|
|
# Vary repeat counts to obtain non-uniform distribution on colors:
# a value is picked by uniform random index, so each word's repeat
# count is its relative weight.
$colors = %w[
  red red red red red red red red red red red red red red red red
  green green green green
  blue blue blue blue blue
  orange
  yellow yellow yellow yellow yellow
  purple purple purple purple
].freeze

# Vary repeat counts to obtain non-uniform distribution on shapes
# (same weighting scheme as for colors).
$shapes = %w[
  circle circle circle
  square square square square square
  triangle triangle triangle triangle
].freeze

# Vary per-color probabilities of flag==1.
# Every distinct entry of $colors has a key here.
$color_flag_ps = {
  'blue'   => 0.6,
  'green'  => 0.2,
  'orange' => 0.5,
  'purple' => 0.1,
  'red'    => 0.3,
  'yellow' => 0.9,
}.freeze

# For autocorrelation of time series: the weight given to the previous
# value at each step (handed to HistoryTracker.new by main).
$eta = 0.99
|
|
|
|
# ================================================================
|
|
# Writes n pseudo-random records to stdout, one per line, as comma-separated
# key=value pairs with the fields color, shape, flag, i, u, v, w, x.
#
# n defaults to 100000. When exactly one command-line argument is present it
# is converted with Integer() (which raises ArgumentError for non-integer
# text) and used as the record count instead.
def main
  n = 100000
  n = Integer(ARGV[0]) if ARGV.length == 1

  # One autocorrelated series per distinct color/shape combination.
  history_keys = $colors.uniq.product($shapes.uniq).map do |color, shape|
    color + '-' + shape
  end

  ht = HistoryTracker.new(history_keys, $eta)

  n.times do |i|
    # The arrays contain repeated entries, so a uniform index pick gives a
    # non-uniform distribution over the distinct values.
    color = $colors[rand($colors.length)]
    shape = $shapes[rand($shapes.length)]
    flag = bernoulli($color_flag_ps[color])

    # u: plain unit-interval uniform
    # v: similar, except for red circles, where v is u plus a uniform offset
    #    in [-0.1, 0.1) so that u and v are pairwise-correlated (v may then
    #    fall slightly outside the unit interval).
    u = rand
    v = rand
    v = u + 0.2 * (rand - 0.5) if color == 'red' && shape == 'circle'

    # w: autocorrelated time series per color/shape combination. If you
    # follow one combination you'll see small deltas from one record to the
    # next.
    # NOTE(review): an earlier comment said the overall stats of w look
    # roughly uniform; HistoryTracker#emit applies no rescaling, so each
    # series' long-run spread is narrower than a unit uniform -- confirm
    # which behavior is intended.
    w = ht.emit(color + '-' + shape)

    # x: gaussian (boring)
    x = gaussian

    puts "color=#{color},shape=#{shape},flag=#{flag},i=#{i+1},u=#{u},v=#{v},w=#{w},x=#{x}"
  end
end
|
|
|
|
# ================================================================
|
|
# Roughly normal between 0 and 10, mean 5, stddev about 1.2.
#
# Averages six independent unit-interval uniform draws and scales the
# average by 10. The draws are added strictly left to right so the result is
# bit-for-bit the same as writing out rand+rand+rand+rand+rand+rand.
def gaussian
  total = Array.new(6) { rand }.reduce(:+)
  10 * total / 6
end
|
|
|
|
# Single Bernoulli trial: returns 1 with probability p, else 0.
#
# p: success probability, compared against one unit-interval uniform draw.
#    Exactly one rand call is consumed per invocation.
def bernoulli(p)
  rand < p ? 1 : 0
end
|
|
|
|
# Autocorrelated time series, one independent series per key.
# For reference please see http://johnkerl.org/rcm/eta.pdf.
#
# Each series starts at a unit-interval uniform draw. Every emit replaces the
# series' value with
#   eta * previous + (1 - eta) * rand
# so for 0 <= eta <= 1 each value is a weighted average of the previous value
# and a fresh uniform draw, and an eta close to 1 gives small step-to-step
# changes.
class HistoryTracker
  # keys: names of the series to track; each is seeded with its own rand
  #       draw, in the order given.
  # eta:  weight applied to the previous value at every step.
  def initialize(keys, eta)
    @eta = eta
    @etac = 1.0 - eta
    @prevs = {}
    keys.each do |key|
      @prevs[key] = rand
    end
  end

  # Advances the series named by key one step and returns its new value.
  #
  # Raises KeyError if key was not among the keys passed to the constructor
  # (rather than failing later with a nil-coercion TypeError).
  def emit(key)
    @prevs[key] = @eta * @prevs.fetch(key) + @etac * rand
  end
end
|
|
|
|
# ================================================================
|
|
# Script entry point: generate the records, and exit quietly with status 0
# if the reader of our stdout goes away early.
begin
  main()
rescue Errno::EPIPE # E.g. we're piped to head
  exit 0
end
|