Add: pareto and normal distribution

ylacancellera · ylacancellera · commit 79bae061f2c5 · 2026-05-22T19:16:14.000+02:00
diff --git a/README.md b/README.md
@@ -480,17 +480,21 @@ General:
 - [ ] helpers to get schema (generate pgdump/mysqldump commands, get index stats, ...)
 - [x] protect against foreign key cycles. Both explicits and implicits (avoid generating implicits that would end up causing loops)
 - [x] detect selfpointing foreign keys 
-- [ ] have some graph to show --coin-flip-percent with --bulk-size
 - [x] using --values-freq-map to make query parameters work
 
+Sampling:
+- [x] normal distribution through the Box-Muller transform, e.g. `select sqrt(-2*log(random()))*sin(2*pi()*random());`
+- [x] Pareto (Zipf) distribution
+- [ ] have some graph to show --coin-flip-percent with --bulk-size
+
 Stepping stones to fully reproduce cardinalities:
 - [x] incorporating arbitrary values with fixed frequency into the bulk inserts
 - [x] table-per-table override for --rows, --null-frequency
 - [ ] coin-flip-percent per relationship basis. Current thought: adding it to --binomial this way --binomial="parent=child:70" to set the coinflip to 70 for this link
 - [ ] parse col/index stats (cardinality + most_common_elems + most_common_freqs for postgres, cardinalities for MySQL)
 
 Without clear plan:
-- [ ] More random algorithms (as of now, no good implementations has been found for pareto that wouldn't provoke huge runtime and/or huge memory consumption, unless implemented fields are restricted to integers)
+- [x] More random algorithms (as of now, no good implementation has been found for pareto that wouldn't provoke huge runtime and/or huge memory consumption, unless implemented fields are restricted to integers)
 - [ ] guessing joins on subqueries/cte. Joins wouldn't be based on columns, but on expressions
 - [ ] be able to "suplement" existing foreign keys with additional columns ?
 
diff --git a/db/db.go b/db/db.go
@@ -28,6 +28,8 @@ type Engine interface {
 	SetTableMetadata(*Table, string, string)
 	BinomialWhereClause(float64) string
 	ErrShouldRetryTx(error) bool
+	FilterOnRowNumberFromClause([]Field, string, string) string
+	FilterOnRowNumberVarClause() string
 }
 
 var ErrFieldsNotFound = errors.New("fields not found")
@@ -77,3 +79,11 @@ func BinomialWhereClause(freqPercent float64) string {
 func ErrShouldRetryTx(err error) bool {
 	return engine.ErrShouldRetryTx(err)
 }
+
+func FilterOnRowNumberFromClause(fields []Field, table, schema string) string {
+	return engine.FilterOnRowNumberFromClause(fields, table, schema) // forwards to engine: FROM clause exposing a row number
+}
+
+func FilterOnRowNumberVarClause() string {
+	return engine.FilterOnRowNumberVarClause() // forwards to engine: SQL expression yielding the row number to filter on
+}
diff --git a/db/mysql.go b/db/mysql.go
@@ -220,3 +220,11 @@ func (_ MySQL) BinomialWhereClause(freqPercent float64) string {
 func (_ MySQL) ErrShouldRetryTx(err error) bool {
 	return strings.Contains(err.Error(), "Duplicate entry")
 }
+
+func (_ MySQL) FilterOnRowNumberFromClause(_ []Field, table, schema string) string {
+	return fmt.Sprintf("%s.%s, (SELECT @rownumber := 0) f", Escape(schema), Escape(table)) // table joined with a derived table that resets @rownumber to 0; the fields argument is unused
+}
+
+func (_ MySQL) FilterOnRowNumberVarClause() string {
+	return "(@rownumber := @rownumber + 1)" // increments @rownumber each time the expression is evaluated, so the first value is 1
+}
diff --git a/db/pg.go b/db/pg.go
@@ -164,3 +164,12 @@ func (_ Postgres) BinomialWhereClause(freqPercent float64) string {
 func (_ Postgres) ErrShouldRetryTx(err error) bool {
 	return strings.Contains(err.Error(), "duplicate key value violates unique constraint")
 }
+
+func (_ Postgres) FilterOnRowNumberFromClause(fields []Field, table, schema string) string {
+	escapedFields := EscapedNamesListFromFields(fields) // reused for both the projection and the ORDER BY
+	return fmt.Sprintf("(SELECT %s, ROW_NUMBER() OVER (ORDER BY %s) as rownumber FROM %s.%s ) f", escapedFields, escapedFields, Escape(schema), Escape(table)) // subquery aliased f, rows numbered in column "rownumber"
+}
+
+func (_ Postgres) FilterOnRowNumberVarClause() string {
+	return "rownumber" // column alias produced by Postgres.FilterOnRowNumberFromClause
+}
diff --git a/generate/generate.go b/generate/generate.go
@@ -17,33 +17,40 @@ import (
 )
 
 type Insert struct {
-	table        *db.Table
-	writer       io.Writer
-	NotifyChan   chan int64
-	fklinks      ForeignKeyLinks
-	workersCount int
-	insertMutex  sync.Mutex
-	maxTextSize  int64
-	uuidVersion  int
-	maxRetries   int
-	frequencies  frequency.ColumnFrequency
+	table             *db.Table
+	writer            io.Writer
+	NotifyChan        chan int64
+	fklinks           ForeignKeyLinks
+	workersCount      int
+	insertMutex       sync.Mutex
+	maxTextSize       int64
+	uuidVersion       int
+	maxRetries        int
+	frequencies       frequency.ColumnFrequency
+	expectedTableSize int64 // set by run() to the requested row count, then handed to the sampler builders
 }
 
 type ForeignKeyLinks struct {
-	DefaultRelationship string            `name:"default-relationship" help:"Will define the default foreign-key relationship to apply. Possible values: ${BinomialFlag},${SequentialFlag}. The default relation can be overriden with other parameters --${BinomialFlag} or --${SequentialFlag}" enum:"${BinomialFlag},${SequentialFlag}" default:"${BinomialFlag}"`
+	DefaultRelationship string            `name:"default-relationship" help:"Will define the default foreign-key relationship to apply. Possible values: ${BinomialFlag},${SequentialFlag},${NormalFlag},${ParetoFlag}. The default relation can be overriden with other parameters --${BinomialFlag}, --${SequentialFlag}, --${NormalFlag} or --${ParetoFlag}" enum:"${BinomialFlag},${SequentialFlag},${NormalFlag},${ParetoFlag}" default:"${BinomialFlag}"`
 	Binomial            map[string]string ` help:"Defines a 1-N foreign key relationships using repeated coin flips. Postgres' tablesamples Bernouilli or mysql RAND() < 0.1 (can be tuned with --coin-flip-percent). Format should be \"parent_table=child_table\" E.g: --${BinomialFlag}=\"customers=orders;orders=items\""`
 	Sequential          map[string]string `name:"sequential" help:"Defines a sequential foreign key links relationships, using SELECT ... LIMIT x OFFET y. Format should be \"parent_table=child_table\" E.g: --${SequentialFlag}=\"citizens=ssns\""`
 	CoinFlipPercent     float64           `name:"coin-flip-percent" help:"When used with ${BinomialFlag}, it will set the likeliness of each rows to be sampled or not. 10 would mean each rows have only 10%% chance to be selected when sampling a parent table. Using large values will favor hot rows: the coin flips are done with a table full scan, with a limit set at --bulk-size, so with a large percent chance most of the time the first rows will be selected. No effects when used with --${SequentialFlag}. Lower value (e.g 0.001) will also slow down the sampling speed" default:"1"`
+	Normal              map[string]string `help:"Defines a 1-N foreign key relationships using box-muller transformation to provide normal distribution"` // key: parent table, value: child table (see relationship)
+	Pareto              map[string]string `help:"Defines a 1-N foreign key relationships using zipf (pareto) distribution"` // key: parent table, value: child table (see relationship)
 }
 
 const (
 	SequentialFlag = "sequential"
 	BinomialFlag   = "binomial"
+	NormalFlag     = "normal" // maps to NewBoxMullerSample in fkLinkToSamplerCreator
+	ParetoFlag     = "pareto" // maps to NewZipfSample in fkLinkToSamplerCreator
 )
 
 var fkLinkToSamplerCreator = map[string]SamplerBuilder{
 	SequentialFlag: NewUniformSample,
 	BinomialFlag:   NewDBRandomSample,
+	NormalFlag:     NewBoxMullerSample, // normal distribution over row numbers
+	ParetoFlag:     NewZipfSample,      // Zipf distribution over row numbers
 }
 
 func (r ForeignKeyLinks) relationship(parent, child string) SamplerBuilder {
@@ -53,6 +60,12 @@ func (r ForeignKeyLinks) relationship(parent, child string) SamplerBuilder {
 	if r.Binomial[parent] == child {
 		return fkLinkToSamplerCreator[BinomialFlag]
 	}
+	if r.Normal[parent] == child { // explicit normal link declared for this parent/child pair
+		return fkLinkToSamplerCreator[NormalFlag]
+	}
+	if r.Pareto[parent] == child { // explicit pareto link: must resolve to the Zipf sampler, not the normal one
+		return fkLinkToSamplerCreator[ParetoFlag]
+	}
 	return fkLinkToSamplerCreator[r.DefaultRelationship]
 }
 
@@ -115,6 +128,7 @@ func (in *Insert) run(count int64, bulksize int64, dryRun bool) error {
 	completeInserts := count / bulksize
 	remainder := count - completeInserts*bulksize
 	numJobs := completeInserts + 1 // + remainder
+	in.expectedTableSize = count
 
 	bulksizeJobs := make(chan int64, numJobs)
 	errChan := make(chan error, numJobs)
@@ -354,7 +368,7 @@ func (in *Insert) sampleConstraints(constraints db.Constraints, values [][]Gette
 		}
 
 		samplerInit := in.fklinks.relationship(constraint.ReferencedTableName, in.table.Name)
-		sampler := samplerInit(constraint.ReferencedFields, constraint.ReferencedTableSchema, constraint.ReferencedTableName, constraint.ConstraintName, subSlice, in.fklinks.CoinFlipPercent)
+		sampler := samplerInit(constraint.ReferencedFields, constraint.ReferencedTableSchema, constraint.ReferencedTableName, constraint.ConstraintName, subSlice, in.fklinks.CoinFlipPercent, in.expectedTableSize)
 		err = sampler.Sample()
 		if err != nil {
 			return errors.Wrap(err, "sampleFieldsTable")
diff --git a/generate/samples.go b/generate/samples.go
@@ -2,7 +2,12 @@ package generate
 
 import (
 	"fmt"
+	"math"
+	"math/rand"
+	"strconv"
+	"strings"
 	"sync"
+	"time"
 
 	"github.com/pkg/errors"
 	"github.com/rs/zerolog/log"
@@ -13,7 +18,7 @@ type Sampler interface {
 	Sample() error
 }
 
-type SamplerBuilder func([]db.Field, string, string, string, [][]Getter, float64) Sampler
+type SamplerBuilder func([]db.Field, string, string, string, [][]Getter, float64, int64) Sampler // trailing int64: expected table size passed by sampleConstraints
 
 type sampleCommon struct {
 	schema string
@@ -113,7 +118,7 @@ func (s *UniformSample) Sample() error {
 var storedUniformSamples = map[string]*UniformSample{}
 var storedUniformSamplesMutex = sync.Mutex{}
 
-func NewUniformSample(fields []db.Field, schema, tablename, constraintName string, values [][]Getter, _ float64) Sampler {
+func NewUniformSample(fields []db.Field, schema, tablename, constraintName string, values [][]Getter, _ float64, tableSize int64) Sampler {
 	storedUniformSamplesMutex.Lock()
 	defer storedUniformSamplesMutex.Unlock()
 	if s, ok := storedUniformSamples[tablename+constraintName]; ok {
@@ -143,7 +148,7 @@ func (s *DBRandomSample) Sample() error {
 	return s.query(query, s.values)
 }
 
-func NewDBRandomSample(fields []db.Field, schema, name, _ string, values [][]Getter, samplePercent float64) Sampler {
+func NewDBRandomSample(fields []db.Field, schema, name, _ string, values [][]Getter, samplePercent float64, _ int64) Sampler {
 	s := &DBRandomSample{}
 	s.table = name
 	s.schema = schema
@@ -153,3 +158,90 @@ func NewDBRandomSample(fields []db.Field, schema, name, _ string, values [][]Get
 	s.fields = fields
 	return s
 }
+
+type BoxMullerSample struct { // sampler selecting rows by normally distributed row number
+	sampleCommon
+	stddev    float64 // scale applied to the Box-Muller draw
+	mean      float64 // offset added to the scaled draw
+	tableSize int64   // largest row number Sample accepts
+}
+
+// Sample draws s.limit row numbers from a normal law (Box-Muller) and selects those rows.
+// Row numbers are 1-based: a draw outside [1, tableSize] is redrawn from fresh uniforms.
+// Author's note: a probability spike was seen near the 25th percentile; cause unconfirmed.
+func (s *BoxMullerSample) Sample() error {
+	if s.tableSize < 1 {
+		return errors.New("box-muller sampling needs a positive table size")
+	}
+	rowNumbers := make([]string, s.limit)
+	for i := range rowNumbers {
+		var rowID int64
+		for rowID < 1 || rowID > s.tableSize {
+			// 1-rand.Float64() lies in (0,1], so math.Log never receives 0 (which would give -Inf)
+			u1, u2 := 1-rand.Float64(), rand.Float64()
+			rowID = int64(math.Round(s.mean + s.stddev*math.Sqrt(-2*math.Log(u1))*math.Cos(2*math.Pi*u2)))
+		}
+		rowNumbers[i] = strconv.FormatInt(rowID, 10)
+	}
+	escapedFields := db.EscapedNamesListFromFields(s.fields)
+	query := fmt.Sprintf("SELECT %s FROM %s WHERE %s IN (%s) AND %s LIMIT %d",
+		escapedFields,
+		db.FilterOnRowNumberFromClause(s.fields, s.table, s.schema),
+		db.FilterOnRowNumberVarClause(),
+		strings.Join(rowNumbers, ","),
+		db.EscapedFieldsIsNotNull(s.fields),
+		s.limit,
+	)
+	return s.query(query, s.values)
+}
+
+// NewBoxMullerSample builds a BoxMullerSample with mean tableSize/2 and stddev len(values).
+func NewBoxMullerSample(fields []db.Field, schema, name, _ string, values [][]Getter, _ float64, tableSize int64) Sampler {
+	sampleCount := len(values)
+	s := &BoxMullerSample{
+		tableSize: tableSize,
+		// TODO: stddev and mean are placeholders, not yet configurable
+		stddev: float64(sampleCount),
+		mean:   float64(tableSize) / 2,
+	}
+	s.fields, s.values, s.limit = fields, values, sampleCount
+	s.schema, s.table = schema, name
+	return s
+}
+
+type ZipfSample struct { // sampler selecting rows by Zipf-distributed row number
+	sampleCommon
+	zipfRand *rand.Zipf // left nil by NewZipfSample when the table size is below 1
+}
+
+// Sample selects s.limit rows by row number; rand.Zipf yields [0, imax] but row numbers are 1-based, so each draw is shifted by one.
+func (s *ZipfSample) Sample() error {
+	if s.zipfRand == nil {
+		return errors.New("zipf sampling needs a positive table size")
+	}
+	rowNumbers := make([]string, s.limit)
+	for i := range rowNumbers {
+		rowNumbers[i] = strconv.FormatUint(s.zipfRand.Uint64()+1, 10)
+	}
+	escapedFields := db.EscapedNamesListFromFields(s.fields)
+	query := fmt.Sprintf("SELECT %s FROM %s WHERE %s IN (%s) AND %s LIMIT %d",
+		escapedFields,
+		db.FilterOnRowNumberFromClause(s.fields, s.table, s.schema),
+		db.FilterOnRowNumberVarClause(),
+		strings.Join(rowNumbers, ","),
+		db.EscapedFieldsIsNotNull(s.fields),
+		s.limit,
+	)
+	return s.query(query, s.values)
+}
+
+// NewZipfSample builds a ZipfSample (s=1.1, v=1.0) whose shifted draws fall in [1, tableSize].
+func NewZipfSample(fields []db.Field, schema, name, _ string, values [][]Getter, _ float64, tableSize int64) Sampler {
+	s := &ZipfSample{}
+	s.table, s.schema, s.fields = name, schema, fields
+	s.values, s.limit = values, len(values)
+	if tableSize >= 1 { // imax is tableSize-1 because Sample adds one to every draw
+		s.zipfRand = rand.NewZipf(rand.New(rand.NewSource(time.Now().UnixNano())), 1.1, 1.0, uint64(tableSize-1))
+	}
+	return s
+}
diff --git a/main.go b/main.go
@@ -51,6 +51,8 @@ func main() {
 			"version":        buildInfo,
 			"SequentialFlag": generate.SequentialFlag,
 			"BinomialFlag":   generate.BinomialFlag,
+			"NormalFlag":     generate.NormalFlag,
+			"ParetoFlag":     generate.ParetoFlag,
 		},
 		kong.ConfigureHelp(kong.HelpOptions{
 			Compact: false,
diff --git a/main_test.go b/main_test.go
@@ -182,6 +182,18 @@ func TestRun(t *testing.T) {
 			engines:    []string{"pg", "mysql"},
 			cmds:       [][]string{[]string{"--rows=100", "--table=t1"}, []string{"--rows=100", "--table=t2", "--default-relationship=binomial", "--coin-flip-percent=60"}},
 		},
+		{
+			name:       "fk_pareto",
+			checkQuery: "select count(distinct t1.id) between 1 and 99 from t1 join t2 on t1.id = t2.t1_id;",
+			engines:    []string{"pg", "mysql"},
+			cmds:       [][]string{[]string{"--rows=100", "--table=t1"}, []string{"--rows=100", "--table=t2", "--default-relationship=pareto"}},
+		},
+		{
+			name:       "fk_normal",
+			checkQuery: "select count(distinct t1.id) between 1 and 99 from t1 join t2 on t1.id = t2.t1_id;",
+			engines:    []string{"pg", "mysql"},
+			cmds:       [][]string{[]string{"--rows=100", "--table=t1"}, []string{"--rows=100", "--table=t2", "--default-relationship=normal", "--bulk-size=10"}},
+		},
 
 		// 5% of 1000 will end up being 50, but we need 100 samples per chunks and t1_id has NOT NULL so it has to loop to get more samples
 		{
diff --git a/tests/mysql/fk_normal b/tests/mysql/fk_normal
@@ -0,0 +1 @@
+fk_binomial
diff --git a/tests/mysql/fk_pareto b/tests/mysql/fk_pareto
@@ -0,0 +1 @@
+fk_binomial
diff --git a/tests/pg/fk_normal b/tests/pg/fk_normal
@@ -0,0 +1 @@
+fk_binomial
diff --git a/tests/pg/fk_pareto b/tests/pg/fk_pareto
@@ -0,0 +1 @@
+fk_binomial