Skip to content

Commit 800e31e

Browse files
committed
Add: pareto+normal slope tuning
1 parent 79bae06 commit 800e31e

5 files changed

Lines changed: 67 additions & 48 deletions

File tree

README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,12 @@ Valuable types currently not implemented:
6565
|--binomial|Defines 1-N foreign key relationships using repeated coin flips: Postgres' tablesample Bernoulli or MySQL RAND() < 0.1 (can be tuned with --coin-flip-percent). Format should be "parent_table=child_table". E.g: --binomial="customers=orders;orders=items"|
6666
|--coin-flip-percent|When used with --binomial, it sets the likelihood of each row being sampled. 10 would mean each row has only a 10% chance of being selected when sampling a parent table. Using large values will favor hot rows: the coin flips are done with a full table scan, with a limit set at --bulk-size, so with a large percent chance the first rows will be selected most of the time. No effect when used with --sequential (Default: 1)|
6767
|--sequential|Defines a sequential foreign key links relationships. Format should be "parent_table=child_table". E.g: --sequential="citizens=ssns"|
68+
|--normal|Defines 1-N foreign key relationships using the Box-Muller transform to provide a normal distribution. Slow method needing a full table scan for each sample.|
69+
|--normal-stddev|Standard deviation of the normal distribution. Defaults to 1/10 of the table size (--rows/10)|
70+
|--normal-mean|Mean of the normal law. Will default to the middle of the table, --rows/2|
71+
|--pareto|Defines 1-N foreign key relationships using a Zipf (Pareto) distribution. Slow method needing a full table scan for each sample|
72+
|--pareto-s|Zipf slope parameter. Must be above 1. Higher value will mean faster decay, so first rows will be hotter|
73+
|--pareto-v|Must be >=1. Maps directly to the v parameter of Go's rand.NewZipf, see https://pkg.go.dev/math/rand#Zipf.|
6874
|--add-fk|Add foreign keys, if they are not explicitly created in the table schema. It can complement the foreign keys guessed from the --query, or be used to manually define foreign keys when using --no-fk-guess too. Format: --add-fk="parent_table.col1[,col2...]=child_table.colx[,coly...][; additional fk ]". Example: --add-fk="customers.id,created_at=purchases.customer_id,created_at;purchases.id=items.purchase_id"|
6975
|--no-fk-guess|Do not try to guess foreign keys from the --query that are missing in the schema. When a query is provided, it will analyze the expected JOINs and try to respect dependencies even when foreign keys are not explicitly created in the database objects. This flag will make the tool stick to the constraints defined in the database only, unless you add foreign keys manually with --add-fk.|
7076
|--no-skip-fields|Disable field whitelist system. When using a --query, it will get the list of fields being used as a whitelist in order to generate the minimal sets of fields required, unless --no-skip-fields is being used or any * has been found.|

cmd/run.go

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,10 +49,23 @@ func (cmd *RunCmd) Run() error {
4949
return err
5050
}
5151

52-
if (float64(cmd.Rows) * cmd.CoinFlipPercent) < (float64(cmd.BulkSize) / 2) {
52+
if (cmd.DefaultRelationship == generate.BinomialFlag || len(cmd.Binomial) > 0) && (float64(cmd.Rows)*cmd.CoinFlipPercent) < (float64(cmd.BulkSize)/2) {
5353
cmd.CoinFlipPercent = float64(cmd.BulkSize) / float64(cmd.Rows) / 2
5454
log.Info().Msgf("Increasing --coin-flip-percent to %.10f due to low --rows to ensure we can at least sample and get half of --bulk-size at a time", cmd.CoinFlipPercent)
5555
}
56+
if cmd.DefaultRelationship == generate.NormalFlag || len(cmd.Normal) > 0 {
57+
if cmd.NormalStddev == 0 {
58+
cmd.NormalStddev = float64(cmd.Rows / 10)
59+
log.Info().Msgf("Setting --normal-stddev to %.2f (--rows/10) by default", cmd.NormalStddev)
60+
}
61+
if cmd.NormalMean == 0 {
62+
cmd.NormalMean = float64(cmd.Rows / 2)
63+
log.Info().Msgf("Setting --normal-mean to %.2f (--rows/2) by default", cmd.NormalMean)
64+
}
65+
}
66+
if (cmd.DefaultRelationship == generate.ParetoFlag || len(cmd.Pareto) > 0) && (cmd.ParetoS <= 1.0 || cmd.ParetoV < 1) {
67+
return errors.New("--pareto-s needs to be >1, --pareto-v needs to be >=1")
68+
}
5669

5770
tablesNames := map[string]struct{}{}
5871
identifiers := map[string]struct{}{}

generate/generate.go

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,12 @@ type ForeignKeyLinks struct {
3535
Binomial map[string]string ` help:"Defines a 1-N foreign key relationships using repeated coin flips. Postgres' tablesamples Bernouilli or mysql RAND() < 0.1 (can be tuned with --coin-flip-percent). Format should be \"parent_table=child_table\" E.g: --${BinomialFlag}=\"customers=orders;orders=items\""`
3636
Sequential map[string]string `name:"sequential" help:"Defines a sequential foreign key links relationships, using SELECT ... LIMIT x OFFET y. Format should be \"parent_table=child_table\" E.g: --${SequentialFlag}=\"citizens=ssns\""`
3737
CoinFlipPercent float64 `name:"coin-flip-percent" help:"When used with ${BinomialFlag}, it will set the likeliness of each rows to be sampled or not. 10 would mean each rows have only 10%% chance to be selected when sampling a parent table. Using large values will favor hot rows: the coin flips are done with a table full scan, with a limit set at --bulk-size, so with a large percent chance most of the time the first rows will be selected. No effects when used with --${SequentialFlag}. Lower value (e.g 0.001) will also slow down the sampling speed" default:"1"`
38-
Normal map[string]string `help:"Defines a 1-N foreign key relationships using box-muller transformation to provide normal distribution"`
39-
Pareto map[string]string `help:"Defines a 1-N foreign key relationships using zipf (pareto) distribution"`
38+
Normal map[string]string `help:"Defines a 1-N foreign key relationships using box-muller transformation to provide normal distribution. Slow method needing full table scans for each samples."`
39+
NormalStddev float64 `help:"Standard deviation to the normal law. Will default to 1/10 of the table size"`
40+
NormalMean float64 `help:"Mean of the normal law. Will default to the middle of the table, --rows/2"`
41+
Pareto map[string]string `help:"Defines a 1-N foreign key relationships using zipf (pareto) distribution. Slow method needing full table scans for each samples"`
42+
ParetoS float64 `help:"Zipf slope parameter. Must be above 1. Higher value will mean faster decay, so first rows will be hotter" default:"1.1"`
43+
ParetoV float64 `help:"Must be >=1. Directly map to V, https://pkg.go.dev/math/rand#Zipf." default:"1.0"`
4044
}
4145

4246
const (
@@ -64,7 +68,7 @@ func (r ForeignKeyLinks) relationship(parent, child string) SamplerBuilder {
6468
return fkLinkToSamplerCreator[NormalFlag]
6569
}
6670
if r.Pareto[parent] == child {
67-
return fkLinkToSamplerCreator[NormalFlag]
71+
return fkLinkToSamplerCreator[ParetoFlag]
6872
}
6973
return fkLinkToSamplerCreator[r.DefaultRelationship]
7074
}
@@ -368,7 +372,7 @@ func (in *Insert) sampleConstraints(constraints db.Constraints, values [][]Gette
368372
}
369373

370374
samplerInit := in.fklinks.relationship(constraint.ReferencedTableName, in.table.Name)
371-
sampler := samplerInit(constraint.ReferencedFields, constraint.ReferencedTableSchema, constraint.ReferencedTableName, constraint.ConstraintName, subSlice, in.fklinks.CoinFlipPercent, in.expectedTableSize)
375+
sampler := samplerInit(constraint.ReferencedFields, constraint.ReferencedTableSchema, constraint.ReferencedTableName, constraint.ConstraintName, subSlice, in.expectedTableSize, &in.fklinks)
372376
err = sampler.Sample()
373377
if err != nil {
374378
return errors.Wrap(err, "sampleFieldsTable")

generate/samples.go

Lines changed: 38 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,17 @@ type Sampler interface {
1818
Sample() error
1919
}
2020

21-
type SamplerBuilder func([]db.Field, string, string, string, [][]Getter, float64, int64) Sampler
21+
type SamplerBuilder func([]db.Field, string, string, string, [][]Getter, int64, *ForeignKeyLinks) Sampler
2222

2323
type sampleCommon struct {
24-
schema string
25-
table string
26-
fields []db.Field
27-
values [][]Getter
28-
limit int
24+
schema string
25+
table string
26+
constraintName string
27+
fields []db.Field
28+
values [][]Getter
29+
limit int
30+
tableSize int64
31+
fkCli *ForeignKeyLinks
2932
}
3033

3134
func (s *sampleCommon) query(query string, values [][]Getter) error {
@@ -96,6 +99,18 @@ func (s *sampleCommon) getterFromField(f db.Field) ScannerGetter {
9699
return nil
97100
}
98101

102+
func (s *sampleCommon) Init(fields []db.Field, schema, tablename, constraintName string, values [][]Getter, tableSize int64, fkCli *ForeignKeyLinks) {
103+
104+
s.table = tablename
105+
s.schema = schema
106+
s.constraintName = constraintName
107+
s.limit = len(values)
108+
s.values = values
109+
s.fields = fields
110+
s.tableSize = tableSize
111+
s.fkCli = fkCli
112+
}
113+
99114
type UniformSample struct {
100115
sampleCommon
101116
lastOffset int // paging by offset is bad, but it will work with compound pk, lack of pk, or complex pk types
@@ -118,52 +133,43 @@ func (s *UniformSample) Sample() error {
118133
var storedUniformSamples = map[string]*UniformSample{}
119134
var storedUniformSamplesMutex = sync.Mutex{}
120135

121-
func NewUniformSample(fields []db.Field, schema, tablename, constraintName string, values [][]Getter, _ float64, tableSize int64) Sampler {
136+
func NewUniformSample(fields []db.Field, schema, tablename, constraintName string, values [][]Getter, tableSize int64, fkCli *ForeignKeyLinks) Sampler {
122137
storedUniformSamplesMutex.Lock()
123138
defer storedUniformSamplesMutex.Unlock()
124139
if s, ok := storedUniformSamples[tablename+constraintName]; ok {
125140
s.values = values
126141
return s
127142
}
128143
s := &UniformSample{}
129-
s.table = tablename
130-
s.schema = schema
131-
s.limit = len(values)
132-
s.values = values
133-
s.fields = fields
144+
s.Init(fields, schema, tablename, constraintName, values, tableSize, fkCli)
134145
storedUniformSamples[tablename+constraintName] = s
135146
return s
136147
}
137148

138149
type DBRandomSample struct {
139150
sampleCommon
140-
samplePercent float64
151+
coinFlipPercent float64
141152
}
142153

143154
func (s *DBRandomSample) Sample() error {
144155

145156
query := fmt.Sprintf("SELECT %s FROM %s.%s %s AND %s ORDER BY 1 LIMIT %d",
146-
db.EscapedNamesListFromFields(s.fields), db.Escape(s.schema), db.Escape(s.table), db.BinomialWhereClause(s.samplePercent), db.EscapedFieldsIsNotNull(s.fields), s.limit)
157+
db.EscapedNamesListFromFields(s.fields), db.Escape(s.schema), db.Escape(s.table), db.BinomialWhereClause(s.coinFlipPercent), db.EscapedFieldsIsNotNull(s.fields), s.limit)
147158

148159
return s.query(query, s.values)
149160
}
150161

151-
func NewDBRandomSample(fields []db.Field, schema, name, _ string, values [][]Getter, samplePercent float64, _ int64) Sampler {
162+
func NewDBRandomSample(fields []db.Field, schema, tablename, constraintName string, values [][]Getter, tableSize int64, fkCli *ForeignKeyLinks) Sampler {
152163
s := &DBRandomSample{}
153-
s.table = name
154-
s.schema = schema
155-
s.samplePercent = samplePercent
156-
s.limit = len(values)
157-
s.values = values
158-
s.fields = fields
164+
s.Init(fields, schema, tablename, constraintName, values, tableSize, fkCli)
165+
s.coinFlipPercent = fkCli.CoinFlipPercent
159166
return s
160167
}
161168

162169
type BoxMullerSample struct {
163170
sampleCommon
164-
stddev float64
165-
mean float64
166-
tableSize int64
171+
stddev float64
172+
mean float64
167173
}
168174

169175
// box muller
@@ -195,17 +201,12 @@ func (s *BoxMullerSample) Sample() error {
195201
return s.query(query, s.values)
196202
}
197203

198-
func NewBoxMullerSample(fields []db.Field, schema, name, _ string, values [][]Getter, _ float64, tableSize int64) Sampler {
204+
func NewBoxMullerSample(fields []db.Field, schema, tablename, constraintName string, values [][]Getter, tableSize int64, fkCli *ForeignKeyLinks) Sampler {
199205
s := &BoxMullerSample{}
200-
s.table = name
201-
s.schema = schema
202-
s.limit = len(values)
203-
s.values = values
204-
s.fields = fields
205-
s.tableSize = tableSize
206-
// TODO
207-
s.stddev = float64(s.limit)
208-
s.mean = float64(tableSize) / 2
206+
s.Init(fields, schema, tablename, constraintName, values, tableSize, fkCli)
207+
208+
s.stddev = fkCli.NormalStddev
209+
s.mean = fkCli.NormalMean
209210
return s
210211
}
211212

@@ -233,15 +234,10 @@ func (s *ZipfSample) Sample() error {
233234
return s.query(query, s.values)
234235
}
235236

236-
func NewZipfSample(fields []db.Field, schema, name, _ string, values [][]Getter, _ float64, tableSize int64) Sampler {
237+
func NewZipfSample(fields []db.Field, schema, tablename, constraintName string, values [][]Getter, tableSize int64, fkCli *ForeignKeyLinks) Sampler {
237238
s := &ZipfSample{}
238-
s.table = name
239-
s.schema = schema
240-
s.limit = len(values)
241-
s.values = values
242-
s.fields = fields
243-
244-
s.zipfRand = rand.NewZipf(rand.New(rand.NewSource(time.Now().UnixNano())), 1.1, 1.0, uint64(tableSize))
239+
s.Init(fields, schema, tablename, constraintName, values, tableSize, fkCli)
240+
s.zipfRand = rand.NewZipf(rand.New(rand.NewSource(time.Now().UnixNano())), fkCli.ParetoS, fkCli.ParetoV, uint64(tableSize))
245241

246242
return s
247243
}

main_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ func TestRun(t *testing.T) {
192192
name: "fk_normal",
193193
checkQuery: "select count(distinct t1.id) between 1 and 99 from t1 join t2 on t1.id = t2.t1_id;",
194194
engines: []string{"pg", "mysql"},
195-
cmds: [][]string{[]string{"--rows=100", "--table=t1"}, []string{"--rows=100", "--table=t2", "--default-relationship=normal", "--bulk-size=10"}},
195+
cmds: [][]string{[]string{"--rows=100", "--table=t1"}, []string{"--rows=100", "--table=t2", "--default-relationship=normal"}},
196196
},
197197

198198
// 5% of 1000 will end up being 50, but we need 100 samples per chunks and t1_id has NOT NULL so it has to loop to get more samples

0 commit comments

Comments
 (0)