miller/docs6/docs/data/mkdat2

125 lines
3.7 KiB
Ruby
Executable file

#!/usr/bin/ruby
# ================================================================
# Produces a variety of categorized pseudo-random data in support
# of Miller documentation.
# ================================================================
#
# Sample output:
#
# ./datagen/mkdat2 6 | mlr --opprint cat
# color shape flag i u v w x
# purple circle 0 1 0.5637512757306459 0.4981743566291943 0.36884254478967105 4.480962906000271
# orange square 1 2 0.5232158098904274 0.33717333156510765 0.44646982455699713 5.807558719657881
# red circle 1 3 0.5084894411433544 0.7025118761232125 0.672558508897624 5.094127602621387
# blue square 0 4 0.3145642940519666 0.3040179460903778 0.6388947856924174 7.517194060345607
# yellow triangle 1 5 0.06776212921515201 0.8517576443958519 0.4319941923080997 4.955913436917771
# red square 0 6 0.24407904404655156 0.4297654986740608 0.6681496181121647 4.702469482713694
# ================================================================
# Weighted pools: main picks a uniformly random element from each pool, so a
# value that appears k times in a pool is k times as likely to be drawn.

# Colors and how many copies of each go into the pool (non-uniform on purpose).
$colors = [
  ['red', 16],
  ['green', 4],
  ['blue', 5],
  ['orange', 1],
  ['yellow', 5],
  ['purple', 4]
].flat_map { |name, copies| [name] * copies }

# Shapes and how many copies of each go into the pool (non-uniform on purpose).
$shapes = [
  ['circle', 3],
  ['square', 5],
  ['triangle', 4]
].flat_map { |name, copies| [name] * copies }

# Probability that flag == 1, looked up by color.
$color_flag_ps = {
  'blue'   => 0.6,
  'green'  => 0.2,
  'orange' => 0.5,
  'purple' => 0.1,
  'red'    => 0.3,
  'yellow' => 0.9
}

# Weight of the previous value in each autocorrelated w series
# (passed to HistoryTracker by main).
$eta = 0.99
# ================================================================
# Writes pseudo-random records to stdout, one per line, as comma-separated
# key=value pairs with fields color, shape, flag, i, u, v, w, x.
#
# The record count defaults to 100000; when exactly one command-line argument
# is given it is parsed with Integer() and used instead.
def main
  record_count = ARGV.length == 1 ? Integer(ARGV[0]) : 100000

  # One tracked series per distinct color/shape combination.
  series_keys = $colors.uniq.product($shapes.uniq).map { |c, s| c + '-' + s }
  tracker = HistoryTracker.new(series_keys, $eta)

  1.upto(record_count) do |seq|
    color = $colors[rand($colors.length)]
    shape = $shapes[rand($shapes.length)]
    flag = bernoulli($color_flag_ps[color])

    # u: plain unit-interval uniform.
    # v: also a plain uniform draw, except for red circles, where v is u plus
    #    a small uniform offset in [-0.1, 0.1) so that u and v are correlated.
    u = rand
    independent_v = rand
    v = color == 'red' && shape == 'circle' ? u + 0.2 * (rand - 0.5) : independent_v

    # w: autocorrelated series kept separately for each color/shape
    #    combination; consecutive values for the same combination differ only
    #    by small deltas.
    w = tracker.emit("#{color}-#{shape}")

    # x: roughly gaussian (see gaussian).
    x = gaussian

    puts "color=#{color},shape=#{shape},flag=#{flag},i=#{seq},u=#{u},v=#{v},w=#{w},x=#{x}"
  end
end
# ================================================================
# Approximates a bell-shaped value in [0, 10) by averaging six uniform draws
# and scaling by 10: mean 5, standard deviation about 1.2.
def gaussian
  # Accumulate left to right so the float result matches rand+rand+...+rand.
  total = 6.times.reduce(0.0) { |acc, _| acc + rand }
  10 * total / 6
end
# Draws a single 0/1 value: returns 1 when a fresh uniform draw falls strictly
# below p, otherwise 0.
def bernoulli(p)
  rand < p ? 1 : 0
end
# Keeps one autocorrelated series per key. Each emitted value is a weighted
# blend of that key's previous value (weight eta) and a fresh uniform draw
# (weight 1 - eta).
# For reference please see http://johnkerl.org/rcm/eta.pdf.
class HistoryTracker
  # keys: identifiers of the series to track; each starts at its own random
  #       value.
  # eta:  weight applied to the previous value on every update.
  def initialize(keys, eta)
    @keys = keys
    @prevs = keys.each_with_object({}) { |key, seeds| seeds[key] = rand }
    @eta = eta
    @etac = 1.0 - eta
    # NOTE(review): @s, @a and @b are computed here but never read by emit.
    # Presumably they were meant to rescale the noise term; confirm against
    # the reference before relying on or removing them.
    @s = Math.sqrt((1 + eta) / (1 - eta))
    @a = 0.5 * (1 - @s)
    @b = 0.5 * (1 + @s)
  end

  # Advances the series for key by one step, stores the new value as that
  # key's previous value, and returns it.
  def emit(key)
    blended = @eta * @prevs[key] + @etac * rand
    @prevs[key] = blended
  end
end
# ================================================================
# Script entry point: generate the data and exit quietly when the reader
# closes the pipe early.
begin
  main
rescue Errno::EPIPE
  # E.g. we're piped to head, which stops reading after a few lines.
  exit(0)
end