Add: pareto+normal slope tuning

ylacancellera · ylacancellera · commit 800e31e6121a · 2026-05-26T10:43:26.000+02:00
diff --git a/README.md b/README.md
@@ -65,6 +65,12 @@ Valuable types currently not implemented:
 |--binomial|Defines a 1-N foreign key relationships using repeated coin flips. Postgres' tablesamples Bernouilli or mysql RAND() < 0.1 (can be tuned with --coin-flip-percent). Format should be "parent_table=child_table". E.g: --binomial="customers=orders;orders=items"|
 |--coin-flip-percent|When used with --binomial, it will set the likeliness of each rows to be sampled or not. 10 would mean each rows have only 10% chance to be selected when sampling a parent table. Using large values will favor hot rows: the coin flips are done with a table full scan, with a limit set at --bulk-size, so with a large percent chance most of the time the first rows will be selected. No effects when used with --sequential (Default: 1)|
 |--sequential|Defines a sequential foreign key links relationships. Format should be "parent_table=child_table". E.g: --sequential="citizens=ssns"|
+|--normal|Defines 1-N foreign key relationships using the Box-Muller transform to produce a normal distribution. Slow method: it needs a full table scan for each sample.|
+|--normal-stddev|Standard deviation of the normal distribution. Defaults to 1/10 of the table size (--rows/10)|
+|--normal-mean|Mean of the normal distribution. Defaults to the middle of the table (--rows/2)|
+|--pareto|Defines 1-N foreign key relationships using a Zipf (Pareto) distribution. Slow method: it needs a full table scan for each sample|
+|--pareto-s|Zipf slope parameter (s). Must be greater than 1. Higher values mean faster decay, so the first rows will be hotter|
+|--pareto-v|Must be >=1. Maps directly to the v parameter of rand.NewZipf, see https://pkg.go.dev/math/rand#Zipf.|
 |--add-fk|Add foreign keys, if they are not explicitely created in the table schema. It can complement the foreign keys guessed from the --query, or be used to manually define foreign keys when using --no-fk-guess too. Format: --add-fk="parent_table.col1[,col2...]=child_table.colx[,coly...][; additional fk ]". Example: --add-fk="customers.id,created_at=purchases.customer_id,created_at;purchases.id=items.purchase_id"|
 |--no-fk-guess|Do not try to guess foreign keys from the --query missing in the schema. When a query is provided, it will analyze the expected JOINs and try to respect dependencies even when foreign keys are not explicitely created in the database objects. This flag will make the tool stick to the constraints defined in the database only, unless you add foreign keys manually with --add-foreign-keys.|
 |--no-skip-fields|Disable field whitelist system. When using a --query, it will get the list of fields being used as a whitelist in order to generate the minimal sets of fields required, unless --no-skip-fields is being used or any * has been found.|
diff --git a/cmd/run.go b/cmd/run.go
@@ -49,10 +49,23 @@ func (cmd *RunCmd) Run() error {
 		return err
 	}
 
-	if (float64(cmd.Rows) * cmd.CoinFlipPercent) < (float64(cmd.BulkSize) / 2) {
+	if (cmd.DefaultRelationship == generate.BinomialFlag || len(cmd.Binomial) > 0) && (float64(cmd.Rows)*cmd.CoinFlipPercent) < (float64(cmd.BulkSize)/2) {
 		cmd.CoinFlipPercent = float64(cmd.BulkSize) / float64(cmd.Rows) / 2
 		log.Info().Msgf("Increasing --coin-flip-percent to %.10f due to low --rows to ensure we can at least sample and get half of --bulk-size at a time", cmd.CoinFlipPercent)
 	}
+	if cmd.DefaultRelationship == generate.NormalFlag || len(cmd.Normal) > 0 {
+		if cmd.NormalStddev == 0 {
+			cmd.NormalStddev = float64(cmd.Rows / 10)
+			log.Info().Msgf("Setting --normal-stddev to %.2f (--rows/10) by default", cmd.NormalStddev)
+		}
+		if cmd.NormalMean == 0 {
+			cmd.NormalMean = float64(cmd.Rows / 2)
+			log.Info().Msgf("Setting --normal-mean to %.2f (--rows/2) by default", cmd.NormalMean)
+		}
+	}
+	if (cmd.DefaultRelationship == generate.ParetoFlag || len(cmd.Pareto) > 0) && (cmd.ParetoS <= 1.0 || cmd.ParetoV < 1) {
+		return errors.New("--pareto-s needs to be >1, --pareto-v needs to be >=1")
+	}
 
 	tablesNames := map[string]struct{}{}
 	identifiers := map[string]struct{}{}
diff --git a/generate/generate.go b/generate/generate.go
@@ -35,8 +35,12 @@ type ForeignKeyLinks struct {
 	Binomial            map[string]string ` help:"Defines a 1-N foreign key relationships using repeated coin flips. Postgres' tablesamples Bernouilli or mysql RAND() < 0.1 (can be tuned with --coin-flip-percent). Format should be \"parent_table=child_table\" E.g: --${BinomialFlag}=\"customers=orders;orders=items\""`
 	Sequential          map[string]string `name:"sequential" help:"Defines a sequential foreign key links relationships, using SELECT ... LIMIT x OFFET y. Format should be \"parent_table=child_table\" E.g: --${SequentialFlag}=\"citizens=ssns\""`
 	CoinFlipPercent     float64           `name:"coin-flip-percent" help:"When used with ${BinomialFlag}, it will set the likeliness of each rows to be sampled or not. 10 would mean each rows have only 10%% chance to be selected when sampling a parent table. Using large values will favor hot rows: the coin flips are done with a table full scan, with a limit set at --bulk-size, so with a large percent chance most of the time the first rows will be selected. No effects when used with --${SequentialFlag}. Lower value (e.g 0.001) will also slow down the sampling speed" default:"1"`
-	Normal              map[string]string `help:"Defines a 1-N foreign key relationships using box-muller transformation to provide normal distribution"`
-	Pareto              map[string]string `help:"Defines a 1-N foreign key relationships using zipf (pareto) distribution"`
+	Normal              map[string]string `help:"Defines a 1-N foreign key relationships using box-muller transformation to provide normal distribution. Slow method needing full table scans for each samples."`
+	NormalStddev        float64           `help:"Standard deviation to the normal law. Will default to 1/10 of the table size"`
+	NormalMean          float64           `help:"Mean of the normal law. Will default to the middle of the table, --rows/2"`
+	Pareto              map[string]string `help:"Defines a 1-N foreign key relationships using zipf (pareto) distribution. Slow method needing full table scans for each samples"`
+	ParetoS             float64           `help:"Zipf slope parameter. Must be above 1. Higher value will mean faster decay, so first rows will be hotter" default:"1.1"`
+	ParetoV             float64           `help:"Must be >=1. Directly map to V, https://pkg.go.dev/math/rand#Zipf." default:"1.0"`
 }
 
 const (
@@ -64,7 +68,7 @@ func (r ForeignKeyLinks) relationship(parent, child string) SamplerBuilder {
 		return fkLinkToSamplerCreator[NormalFlag]
 	}
 	if r.Pareto[parent] == child {
-		return fkLinkToSamplerCreator[NormalFlag]
+		return fkLinkToSamplerCreator[ParetoFlag]
 	}
 	return fkLinkToSamplerCreator[r.DefaultRelationship]
 }
@@ -368,7 +372,7 @@ func (in *Insert) sampleConstraints(constraints db.Constraints, values [][]Gette
 		}
 
 		samplerInit := in.fklinks.relationship(constraint.ReferencedTableName, in.table.Name)
-		sampler := samplerInit(constraint.ReferencedFields, constraint.ReferencedTableSchema, constraint.ReferencedTableName, constraint.ConstraintName, subSlice, in.fklinks.CoinFlipPercent, in.expectedTableSize)
+		sampler := samplerInit(constraint.ReferencedFields, constraint.ReferencedTableSchema, constraint.ReferencedTableName, constraint.ConstraintName, subSlice, in.expectedTableSize, &in.fklinks)
 		err = sampler.Sample()
 		if err != nil {
 			return errors.Wrap(err, "sampleFieldsTable")
diff --git a/generate/samples.go b/generate/samples.go
@@ -18,14 +18,17 @@ type Sampler interface {
 	Sample() error
 }
 
-type SamplerBuilder func([]db.Field, string, string, string, [][]Getter, float64, int64) Sampler
+type SamplerBuilder func([]db.Field, string, string, string, [][]Getter, int64, *ForeignKeyLinks) Sampler
 
 type sampleCommon struct {
-	schema string
-	table  string
-	fields []db.Field
-	values [][]Getter
-	limit  int
+	schema         string
+	table          string
+	constraintName string
+	fields         []db.Field
+	values         [][]Getter
+	limit          int
+	tableSize      int64
+	fkCli          *ForeignKeyLinks
 }
 
 func (s *sampleCommon) query(query string, values [][]Getter) error {
@@ -96,6 +99,18 @@ func (s *sampleCommon) getterFromField(f db.Field) ScannerGetter {
 	return nil
 }
 
+func (s *sampleCommon) Init(fields []db.Field, schema, tablename, constraintName string, values [][]Getter, tableSize int64, fkCli *ForeignKeyLinks) {
+
+	s.table = tablename
+	s.schema = schema
+	s.constraintName = constraintName
+	s.limit = len(values)
+	s.values = values
+	s.fields = fields
+	s.tableSize = tableSize
+	s.fkCli = fkCli
+}
+
 type UniformSample struct {
 	sampleCommon
 	lastOffset int // paging by offset is bad, but it will work with compound pk, lack of pk, or complex pk types
@@ -118,52 +133,43 @@ func (s *UniformSample) Sample() error {
 var storedUniformSamples = map[string]*UniformSample{}
 var storedUniformSamplesMutex = sync.Mutex{}
 
-func NewUniformSample(fields []db.Field, schema, tablename, constraintName string, values [][]Getter, _ float64, tableSize int64) Sampler {
+func NewUniformSample(fields []db.Field, schema, tablename, constraintName string, values [][]Getter, tableSize int64, fkCli *ForeignKeyLinks) Sampler {
 	storedUniformSamplesMutex.Lock()
 	defer storedUniformSamplesMutex.Unlock()
 	if s, ok := storedUniformSamples[tablename+constraintName]; ok {
 		s.values = values
 		return s
 	}
 	s := &UniformSample{}
-	s.table = tablename
-	s.schema = schema
-	s.limit = len(values)
-	s.values = values
-	s.fields = fields
+	s.Init(fields, schema, tablename, constraintName, values, tableSize, fkCli)
 	storedUniformSamples[tablename+constraintName] = s
 	return s
 }
 
 type DBRandomSample struct {
 	sampleCommon
-	samplePercent float64
+	coinFlipPercent float64
 }
 
 func (s *DBRandomSample) Sample() error {
 
 	query := fmt.Sprintf("SELECT %s FROM %s.%s %s AND %s ORDER BY 1 LIMIT %d",
-		db.EscapedNamesListFromFields(s.fields), db.Escape(s.schema), db.Escape(s.table), db.BinomialWhereClause(s.samplePercent), db.EscapedFieldsIsNotNull(s.fields), s.limit)
+		db.EscapedNamesListFromFields(s.fields), db.Escape(s.schema), db.Escape(s.table), db.BinomialWhereClause(s.coinFlipPercent), db.EscapedFieldsIsNotNull(s.fields), s.limit)
 
 	return s.query(query, s.values)
 }
 
-func NewDBRandomSample(fields []db.Field, schema, name, _ string, values [][]Getter, samplePercent float64, _ int64) Sampler {
+func NewDBRandomSample(fields []db.Field, schema, tablename, constraintName string, values [][]Getter, tableSize int64, fkCli *ForeignKeyLinks) Sampler {
 	s := &DBRandomSample{}
-	s.table = name
-	s.schema = schema
-	s.samplePercent = samplePercent
-	s.limit = len(values)
-	s.values = values
-	s.fields = fields
+	s.Init(fields, schema, tablename, constraintName, values, tableSize, fkCli)
+	s.coinFlipPercent = fkCli.CoinFlipPercent
 	return s
 }
 
 type BoxMullerSample struct {
 	sampleCommon
-	stddev    float64
-	mean      float64
-	tableSize int64
+	stddev float64
+	mean   float64
 }
 
 // box muller
@@ -195,17 +201,12 @@ func (s *BoxMullerSample) Sample() error {
 	return s.query(query, s.values)
 }
 
-func NewBoxMullerSample(fields []db.Field, schema, name, _ string, values [][]Getter, _ float64, tableSize int64) Sampler {
+func NewBoxMullerSample(fields []db.Field, schema, tablename, constraintName string, values [][]Getter, tableSize int64, fkCli *ForeignKeyLinks) Sampler {
 	s := &BoxMullerSample{}
-	s.table = name
-	s.schema = schema
-	s.limit = len(values)
-	s.values = values
-	s.fields = fields
-	s.tableSize = tableSize
-	// TODO
-	s.stddev = float64(s.limit)
-	s.mean = float64(tableSize) / 2
+	s.Init(fields, schema, tablename, constraintName, values, tableSize, fkCli)
+
+	s.stddev = fkCli.NormalStddev
+	s.mean = fkCli.NormalMean
 	return s
 }
 
@@ -233,15 +234,10 @@ func (s *ZipfSample) Sample() error {
 	return s.query(query, s.values)
 }
 
-func NewZipfSample(fields []db.Field, schema, name, _ string, values [][]Getter, _ float64, tableSize int64) Sampler {
+func NewZipfSample(fields []db.Field, schema, tablename, constraintName string, values [][]Getter, tableSize int64, fkCli *ForeignKeyLinks) Sampler {
 	s := &ZipfSample{}
-	s.table = name
-	s.schema = schema
-	s.limit = len(values)
-	s.values = values
-	s.fields = fields
-
-	s.zipfRand = rand.NewZipf(rand.New(rand.NewSource(time.Now().UnixNano())), 1.1, 1.0, uint64(tableSize))
+	s.Init(fields, schema, tablename, constraintName, values, tableSize, fkCli)
+	s.zipfRand = rand.NewZipf(rand.New(rand.NewSource(time.Now().UnixNano())), fkCli.ParetoS, fkCli.ParetoV, uint64(tableSize))
 
 	return s
 }
diff --git a/main_test.go b/main_test.go
@@ -192,7 +192,7 @@ func TestRun(t *testing.T) {
 			name:       "fk_normal",
 			checkQuery: "select count(distinct t1.id) between 1 and 99 from t1 join t2 on t1.id = t2.t1_id;",
 			engines:    []string{"pg", "mysql"},
-			cmds:       [][]string{[]string{"--rows=100", "--table=t1"}, []string{"--rows=100", "--table=t2", "--default-relationship=normal", "--bulk-size=10"}},
+			cmds:       [][]string{[]string{"--rows=100", "--table=t1"}, []string{"--rows=100", "--table=t2", "--default-relationship=normal"}},
 		},
 
 		// 5% of 1000 will end up being 50, but we need 100 samples per chunks and t1_id has NOT NULL so it has to loop to get more samples

Original file line number	Diff line number	Diff line change
`@@ -35,8 +35,12 @@ type ForeignKeyLinks struct {`
`35`	`35`	Binomial map[string]string ` help:"Defines a 1-N foreign key relationships using repeated coin flips. Postgres' tablesamples Bernouilli or mysql RAND() < 0.1 (can be tuned with --coin-flip-percent). Format should be \"parent_table=child_table\" E.g: --${BinomialFlag}=\"customers=orders;orders=items\""`
`36`	`36`	Sequential map[string]string `name:"sequential" help:"Defines a sequential foreign key links relationships, using SELECT ... LIMIT x OFFET y. Format should be \"parent_table=child_table\" E.g: --${SequentialFlag}=\"citizens=ssns\""`
`37`	`37`	CoinFlipPercent float64 `name:"coin-flip-percent" help:"When used with ${BinomialFlag}, it will set the likeliness of each rows to be sampled or not. 10 would mean each rows have only 10%% chance to be selected when sampling a parent table. Using large values will favor hot rows: the coin flips are done with a table full scan, with a limit set at --bulk-size, so with a large percent chance most of the time the first rows will be selected. No effects when used with --${SequentialFlag}. Lower value (e.g 0.001) will also slow down the sampling speed" default:"1"`
`38`		- Normal map[string]string `help:"Defines a 1-N foreign key relationships using box-muller transformation to provide normal distribution"`
`39`		- Pareto map[string]string `help:"Defines a 1-N foreign key relationships using zipf (pareto) distribution"`
	`38`	+ Normal map[string]string `help:"Defines a 1-N foreign key relationships using box-muller transformation to provide normal distribution. Slow method needing full table scans for each samples."`
	`39`	+ NormalStddev float64 `help:"Standard deviation to the normal law. Will default to 1/10 of the table size"`
	`40`	+ NormalMean float64 `help:"Mean of the normal law. Will default to the middle of the table, --rows/2"`
	`41`	+ Pareto map[string]string `help:"Defines a 1-N foreign key relationships using zipf (pareto) distribution. Slow method needing full table scans for each samples"`
	`42`	+ ParetoS float64 `help:"Zipf slope parameter. Must be above 1. Higher value will mean faster decay, so first rows will be hotter" default:"1.1"`
	`43`	+ ParetoV float64 `help:"Must be >=1. Directly map to V, https://pkg.go.dev/math/rand#Zipf." default:"1.0"`
`40`	`44`	`}`
`41`	`45`
`42`	`46`	`const (`
`@@ -64,7 +68,7 @@ func (r ForeignKeyLinks) relationship(parent, child string) SamplerBuilder {`
`64`	`68`	`return fkLinkToSamplerCreator[NormalFlag]`
`65`	`69`	`}`
`66`	`70`	`if r.Pareto[parent] == child {`
`67`		`- return fkLinkToSamplerCreator[NormalFlag]`
	`71`	`+ return fkLinkToSamplerCreator[ParetoFlag]`
`68`	`72`	`}`
`69`	`73`	`return fkLinkToSamplerCreator[r.DefaultRelationship]`
`70`	`74`	`}`
`@@ -368,7 +372,7 @@ func (in *Insert) sampleConstraints(constraints db.Constraints, values [][]Gette`
`368`	`372`	`}`
`369`	`373`
`370`	`374`	`samplerInit := in.fklinks.relationship(constraint.ReferencedTableName, in.table.Name)`
`371`		`- sampler := samplerInit(constraint.ReferencedFields, constraint.ReferencedTableSchema, constraint.ReferencedTableName, constraint.ConstraintName, subSlice, in.fklinks.CoinFlipPercent, in.expectedTableSize)`
	`375`	`+ sampler := samplerInit(constraint.ReferencedFields, constraint.ReferencedTableSchema, constraint.ReferencedTableName, constraint.ConstraintName, subSlice, in.expectedTableSize, &in.fklinks)`
`372`	`376`	`err = sampler.Sample()`
`373`	`377`	`if err != nil {`
`374`	`378`	`return errors.Wrap(err, "sampleFieldsTable")`