#!/usr/bin/ruby
# frozen_string_literal: true

# ================================================================
# Produces a variety of categorized pseudo-random data in support
# of Miller documentation.
# ================================================================
#
# Usage: mkdat2 [record count]    (default 100000 records)
#
# Sample output:
#
# ./datagen/mkdat2 6 | mlr --opprint cat
# color  shape    flag i u                   v                   w                   x
# purple circle   0    1 0.5637512757306459  0.4981743566291943  0.36884254478967105 4.480962906000271
# orange square   1    2 0.5232158098904274  0.33717333156510765 0.44646982455699713 5.807558719657881
# red    circle   1    3 0.5084894411433544  0.7025118761232125  0.672558508897624   5.094127602621387
# blue   square   0    4 0.3145642940519666  0.3040179460903778  0.6388947856924174  7.517194060345607
# yellow triangle 1    5 0.06776212921515201 0.8517576443958519  0.4319941923080997  4.955913436917771
# red    square   0    6 0.24407904404655156 0.4297654986740608  0.6681496181121647  4.702469482713694
# ================================================================

# Number of records emitted when no count is given on the command line.
DEFAULT_COUNT = 100_000

# Relative weights used to obtain a non-uniform distribution on colors.
COLOR_WEIGHTS = {
  'red' => 16,
  'green' => 4,
  'blue' => 5,
  'orange' => 1,
  'yellow' => 5,
  'purple' => 4
}.freeze

# Relative weights used to obtain a non-uniform distribution on shapes.
SHAPE_WEIGHTS = {
  'circle' => 3,
  'square' => 5,
  'triangle' => 4
}.freeze

# Each name repeated by its weight, so a uniform pick from the list is a
# weighted pick of the name.
COLORS = COLOR_WEIGHTS.flat_map { |name, weight| [name] * weight }.freeze
SHAPES = SHAPE_WEIGHTS.flat_map { |name, weight| [name] * weight }.freeze

# Per-color probabilities of flag == 1.
COLOR_FLAG_PS = {
  'blue' => 0.6,
  'green' => 0.2,
  'orange' => 0.5,
  'purple' => 0.1,
  'red' => 0.3,
  'yellow' => 0.9
}.freeze

# Weight given to the previous value in each autocorrelated time series.
ETA = 0.99

# ================================================================
# Determines how many records to emit.
#
# argv: array of command-line argument strings.
# Returns the integer parsed from the sole argument when exactly one is
# given, else DEFAULT_COUNT. Prints a usage message to stderr and exits
# with status 1 when the sole argument is not an integer.
def record_count(argv)
  return DEFAULT_COUNT unless argv.length == 1

  Integer(argv[0])
rescue ArgumentError
  abort "Usage: #{$PROGRAM_NAME} [record count]"
end

# Writes the requested number of comma-separated key=value records to stdout.
def main
  n = record_count(ARGV)

  # One time series per distinct color/shape combination.
  history_keys = COLORS.uniq.product(SHAPES.uniq).map do |color, shape|
    "#{color}-#{shape}"
  end
  tracker = HistoryTracker.new(history_keys, ETA)

  n.times do |i|
    color = COLORS.sample
    shape = SHAPES.sample
    flag = bernoulli(COLOR_FLAG_PS[color])

    # u: plain unit-interval uniform.
    # v: similar, except that for red circles v is pairwise-correlated with
    # u: it is u plus a uniform offset in [-0.1, 0.1), so it may fall
    # slightly outside the unit interval.
    u = rand
    v = rand
    v = u + 0.2 * (rand - 0.5) if color == 'red' && shape == 'circle'

    # w: autocorrelated time series per color/shape combination. If you
    # follow a single combination you'll see small deltas from one record
    # to the next.
    # NOTE(review): an earlier comment claimed w is roughly uniform overall;
    # each series is repeatedly averaged with fresh draws, so confirm that
    # claim before relying on it.
    w = tracker.emit("#{color}-#{shape}")

    # x: gaussian (boring)
    x = gaussian

    puts "color=#{color},shape=#{shape},flag=#{flag},i=#{i + 1},u=#{u},v=#{v},w=#{w},x=#{x}"
  end
end

# ================================================================
# Roughly normal between 0 and 10, mean 5, stddev about 1.2:
# ten times the average of six unit-interval uniform draws.
def gaussian
  10 * (rand + rand + rand + rand + rand + rand) / 6
end

# Returns 1 with probability p, else 0.
def bernoulli(p)
  rand < p ? 1 : 0
end

# Autocorrelated time series, one per key.
# For reference please see http://johnkerl.org/rcm/eta.pdf.
#
# NOTE(review): this class previously also computed sqrt((1+eta)/(1-eta)) and
# two bounds derived from it, but never used them; presumably a rescaling of
# the random term was intended -- confirm against the reference above.
class HistoryTracker
  # keys: the series names that emit will accept.
  # eta:  weight of the previous value in each update; the fresh random draw
  #       gets weight 1 - eta.
  def initialize(keys, eta)
    @eta = eta
    @etac = 1.0 - eta
    # Every series starts at an independent unit-interval uniform value.
    @prevs = keys.each_with_object({}) { |key, prevs| prevs[key] = rand }
  end

  # Advances the series for key and returns its new value: a weighted average
  # of the previous value and a fresh unit-interval uniform draw.
  # Raises KeyError if key was not passed to the constructor.
  def emit(key)
    @prevs[key] = @eta * @prevs.fetch(key) + @etac * rand
  end
end

# ================================================================
begin
  main
rescue Errno::EPIPE
  # E.g. we're piped to head
  exit 0
end