Skip to content

Commit 79bae06

Browse files
committed
Add: pareto and normal distribution
1 parent 43fa30c commit 79bae06

12 files changed

Lines changed: 172 additions & 17 deletions

File tree

README.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -480,17 +480,21 @@ General:
480480
- [ ] helpers to get schema (generate pgdump/mysqldump commands, get index stats, ...)
481481
- [x] protect against foreign key cycles, both explicit and implicit (avoid generating implicit foreign keys that would end up causing loops)
482482
- [x] detect selfpointing foreign keys
483-
- [ ] have some graph to show --coin-flip-percent with --bulk-size
484483
- [x] using --values-freq-map to make query parameters work
485484

485+
Sampling:
486+
- [x] normal law through box-muller, select sqrt(-2*log(random()))*sin(2*pi()*random());
487+
- [x] pareto laws
488+
- [ ] have some graph to show --coin-flip-percent with --bulk-size
489+
486490
Stepping stones to fully reproduce cardinalities:
487491
- [x] incorporating arbitrary values with fixed frequency into the bulk inserts
488492
- [x] table-per-table override for --rows, --null-frequency
489493
- [ ] coin-flip-percent per relationship basis. Current thought: adding it to --binomial this way --binomial="parent=child:70" to set the coinflip to 70 for this link
490494
- [ ] parse col/index stats (cardinality + most_common_elems + most_common_freqs for postgres, cardinalities for MySQL)
491495

492496
Without clear plan:
493-
- [ ] More random algorithms (as of now, no good implementations has been found for pareto that wouldn't provoke huge runtime and/or huge memory consumption, unless implemented fields are restricted to integers)
497+
- [x] More random algorithms (as of now, no good implementation has been found for pareto that wouldn't provoke huge runtime and/or huge memory consumption, unless the implemented fields are restricted to integers)
494498
- [ ] guessing joins on subqueries/cte. Joins wouldn't be based on columns, but on expressions
495499
- [ ] be able to "supplement" existing foreign keys with additional columns ?
496500

db/db.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ type Engine interface {
2828
SetTableMetadata(*Table, string, string)
2929
BinomialWhereClause(float64) string
3030
ErrShouldRetryTx(error) bool
31+
FilterOnRowNumberFromClause([]Field, string, string) string
32+
FilterOnRowNumberVarClause() string
3133
}
3234

3335
var ErrFieldsNotFound = errors.New("fields not found")
@@ -77,3 +79,11 @@ func BinomialWhereClause(freqPercent float64) string {
7779
func ErrShouldRetryTx(err error) bool {
7880
return engine.ErrShouldRetryTx(err)
7981
}
82+
83+
func FilterOnRowNumberFromClause(fields []Field, table, schema string) string {
84+
return engine.FilterOnRowNumberFromClause(fields, table, schema)
85+
}
86+
87+
func FilterOnRowNumberVarClause() string {
88+
return engine.FilterOnRowNumberVarClause()
89+
}

db/mysql.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,3 +220,11 @@ func (_ MySQL) BinomialWhereClause(freqPercent float64) string {
220220
// ErrShouldRetryTx reports whether the message of err contains
// "Duplicate entry".
func (MySQL) ErrShouldRetryTx(err error) bool {
	msg := err.Error()
	return strings.Contains(msg, "Duplicate entry")
}
223+
224+
func (_ MySQL) FilterOnRowNumberFromClause(_ []Field, table, schema string) string {
225+
return fmt.Sprintf("%s.%s, (SELECT @rownumber := 0) f", Escape(schema), Escape(table))
226+
}
227+
228+
func (_ MySQL) FilterOnRowNumberVarClause() string {
229+
return "(@rownumber := @rownumber + 1)"
230+
}

db/pg.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,3 +164,12 @@ func (_ Postgres) BinomialWhereClause(freqPercent float64) string {
164164
// ErrShouldRetryTx reports whether the message of err contains
// "duplicate key value violates unique constraint".
func (Postgres) ErrShouldRetryTx(err error) bool {
	msg := err.Error()
	return strings.Contains(msg, "duplicate key value violates unique constraint")
}
167+
168+
func (_ Postgres) FilterOnRowNumberFromClause(fields []Field, table, schema string) string {
169+
escapedFields := EscapedNamesListFromFields(fields)
170+
return fmt.Sprintf("(SELECT %s, ROW_NUMBER() OVER (ORDER BY %s) as rownumber FROM %s.%s ) f", escapedFields, escapedFields, Escape(schema), Escape(table))
171+
}
172+
173+
func (_ Postgres) FilterOnRowNumberVarClause() string {
174+
return "rownumber"
175+
}

generate/generate.go

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -17,33 +17,40 @@ import (
1717
)
1818

1919
type Insert struct {
20-
table *db.Table
21-
writer io.Writer
22-
NotifyChan chan int64
23-
fklinks ForeignKeyLinks
24-
workersCount int
25-
insertMutex sync.Mutex
26-
maxTextSize int64
27-
uuidVersion int
28-
maxRetries int
29-
frequencies frequency.ColumnFrequency
20+
table *db.Table
21+
writer io.Writer
22+
NotifyChan chan int64
23+
fklinks ForeignKeyLinks
24+
workersCount int
25+
insertMutex sync.Mutex
26+
maxTextSize int64
27+
uuidVersion int
28+
maxRetries int
29+
frequencies frequency.ColumnFrequency
30+
expectedTableSize int64
3031
}
3132

3233
// ForeignKeyLinks holds the command-line options that pick which sampling
// strategy is used for each parent/child foreign-key link.
//
// The per-strategy maps (Binomial, Sequential, Normal, Pareto) are keyed by
// parent table name with the child table name as value; DefaultRelationship
// is the fallback used by relationship() when no map matches.
type ForeignKeyLinks struct {
	// The help text lists every value accepted by the enum tag so that the
	// generated --help output stays in sync with what is actually allowed.
	DefaultRelationship string            `name:"default-relationship" help:"Will define the default foreign-key relationship to apply. Possible values: ${BinomialFlag},${SequentialFlag},${NormalFlag},${ParetoFlag}. The default relation can be overridden with other parameters --${BinomialFlag}, --${SequentialFlag}, --${NormalFlag} or --${ParetoFlag}" enum:"${BinomialFlag},${SequentialFlag},${NormalFlag},${ParetoFlag}" default:"${BinomialFlag}"`
	Binomial            map[string]string ` help:"Defines a 1-N foreign key relationships using repeated coin flips. Postgres' tablesamples Bernouilli or mysql RAND() < 0.1 (can be tuned with --coin-flip-percent). Format should be \"parent_table=child_table\" E.g: --${BinomialFlag}=\"customers=orders;orders=items\""`
	Sequential          map[string]string `name:"sequential" help:"Defines a sequential foreign key links relationships, using SELECT ... LIMIT x OFFET y. Format should be \"parent_table=child_table\" E.g: --${SequentialFlag}=\"citizens=ssns\""`
	CoinFlipPercent     float64           `name:"coin-flip-percent" help:"When used with ${BinomialFlag}, it will set the likeliness of each rows to be sampled or not. 10 would mean each rows have only 10%% chance to be selected when sampling a parent table. Using large values will favor hot rows: the coin flips are done with a table full scan, with a limit set at --bulk-size, so with a large percent chance most of the time the first rows will be selected. No effects when used with --${SequentialFlag}. Lower value (e.g 0.001) will also slow down the sampling speed" default:"1"`
	Normal              map[string]string `help:"Defines a 1-N foreign key relationships using box-muller transformation to provide normal distribution"`
	Pareto              map[string]string `help:"Defines a 1-N foreign key relationships using zipf (pareto) distribution"`
}
3841

3942
// Names of the foreign-key relationship kinds. They are used as keys of
// fkLinkToSamplerCreator.

// BinomialFlag names the binomial relationship.
const BinomialFlag = "binomial"

// NormalFlag names the normal relationship.
const NormalFlag = "normal"

// ParetoFlag names the pareto relationship.
const ParetoFlag = "pareto"

// SequentialFlag names the sequential relationship.
const SequentialFlag = "sequential"
4348

4449
var fkLinkToSamplerCreator = map[string]SamplerBuilder{
4550
SequentialFlag: NewUniformSample,
4651
BinomialFlag: NewDBRandomSample,
52+
NormalFlag: NewBoxMullerSample,
53+
ParetoFlag: NewZipfSample,
4754
}
4855

4956
func (r ForeignKeyLinks) relationship(parent, child string) SamplerBuilder {
@@ -53,6 +60,12 @@ func (r ForeignKeyLinks) relationship(parent, child string) SamplerBuilder {
5360
if r.Binomial[parent] == child {
5461
return fkLinkToSamplerCreator[BinomialFlag]
5562
}
63+
if r.Normal[parent] == child {
64+
return fkLinkToSamplerCreator[NormalFlag]
65+
}
66+
// A pareto link must resolve to the sampler registered under ParetoFlag;
// looking it up under NormalFlag would silently hand back the normal sampler.
if r.Pareto[parent] == child {
	return fkLinkToSamplerCreator[ParetoFlag]
}
5669
return fkLinkToSamplerCreator[r.DefaultRelationship]
5770
}
5871

@@ -115,6 +128,7 @@ func (in *Insert) run(count int64, bulksize int64, dryRun bool) error {
115128
completeInserts := count / bulksize
116129
remainder := count - completeInserts*bulksize
117130
numJobs := completeInserts + 1 // + remainder
131+
in.expectedTableSize = count
118132

119133
bulksizeJobs := make(chan int64, numJobs)
120134
errChan := make(chan error, numJobs)
@@ -354,7 +368,7 @@ func (in *Insert) sampleConstraints(constraints db.Constraints, values [][]Gette
354368
}
355369

356370
samplerInit := in.fklinks.relationship(constraint.ReferencedTableName, in.table.Name)
357-
sampler := samplerInit(constraint.ReferencedFields, constraint.ReferencedTableSchema, constraint.ReferencedTableName, constraint.ConstraintName, subSlice, in.fklinks.CoinFlipPercent)
371+
sampler := samplerInit(constraint.ReferencedFields, constraint.ReferencedTableSchema, constraint.ReferencedTableName, constraint.ConstraintName, subSlice, in.fklinks.CoinFlipPercent, in.expectedTableSize)
358372
err = sampler.Sample()
359373
if err != nil {
360374
return errors.Wrap(err, "sampleFieldsTable")

generate/samples.go

Lines changed: 95 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,12 @@ package generate
22

33
import (
44
"fmt"
5+
"math"
6+
"math/rand"
7+
"strconv"
8+
"strings"
59
"sync"
10+
"time"
611

712
"github.com/pkg/errors"
813
"github.com/rs/zerolog/log"
@@ -13,7 +18,7 @@ type Sampler interface {
1318
Sample() error
1419
}
1520

16-
type SamplerBuilder func([]db.Field, string, string, string, [][]Getter, float64) Sampler
21+
type SamplerBuilder func([]db.Field, string, string, string, [][]Getter, float64, int64) Sampler
1722

1823
type sampleCommon struct {
1924
schema string
@@ -113,7 +118,7 @@ func (s *UniformSample) Sample() error {
113118
var storedUniformSamples = map[string]*UniformSample{}
114119
var storedUniformSamplesMutex = sync.Mutex{}
115120

116-
func NewUniformSample(fields []db.Field, schema, tablename, constraintName string, values [][]Getter, _ float64) Sampler {
121+
func NewUniformSample(fields []db.Field, schema, tablename, constraintName string, values [][]Getter, _ float64, tableSize int64) Sampler {
117122
storedUniformSamplesMutex.Lock()
118123
defer storedUniformSamplesMutex.Unlock()
119124
if s, ok := storedUniformSamples[tablename+constraintName]; ok {
@@ -143,7 +148,7 @@ func (s *DBRandomSample) Sample() error {
143148
return s.query(query, s.values)
144149
}
145150

146-
func NewDBRandomSample(fields []db.Field, schema, name, _ string, values [][]Getter, samplePercent float64) Sampler {
151+
func NewDBRandomSample(fields []db.Field, schema, name, _ string, values [][]Getter, samplePercent float64, _ int64) Sampler {
147152
s := &DBRandomSample{}
148153
s.table = name
149154
s.schema = schema
@@ -153,3 +158,90 @@ func NewDBRandomSample(fields []db.Field, schema, name, _ string, values [][]Get
153158
s.fields = fields
154159
return s
155160
}
161+
162+
type BoxMullerSample struct {
163+
sampleCommon
164+
stddev float64
165+
mean float64
166+
tableSize int64
167+
}
168+
169+
// box muller
170+
// currently has a "distribution" bug I cannot figure out, there's a spike of probability around what should have been the 25 quartile
171+
// maybe it's tied to the fact boxmuller expects [0.0,1.0] for u1 u2, but golang can only provide [0.0,1.0[
172+
// stddev/mean does not affect it, it does not look like a float related issues but it most probably is
173+
func (s *BoxMullerSample) Sample() error {
174+
175+
rowNumbers := make([]string, s.limit)
176+
for i := range rowNumbers {
177+
var cosId int64 = -1
178+
x1, x2 := rand.Float64(), rand.Float64()
179+
for cosId < 0 || cosId > s.tableSize {
180+
cosId = int64(math.Round(s.mean + s.stddev*math.Sqrt(-2*math.Log(x1))*math.Cos(2*math.Pi*x2)))
181+
}
182+
rowNumbers[i] = strconv.FormatInt(cosId, 10)
183+
}
184+
185+
escapedFields := db.EscapedNamesListFromFields(s.fields)
186+
query := fmt.Sprintf("SELECT %s FROM %s WHERE %s IN (%s) AND %s LIMIT %d",
187+
escapedFields,
188+
db.FilterOnRowNumberFromClause(s.fields, s.table, s.schema),
189+
db.FilterOnRowNumberVarClause(),
190+
strings.Join(rowNumbers, ","),
191+
db.EscapedFieldsIsNotNull(s.fields),
192+
s.limit,
193+
)
194+
195+
return s.query(query, s.values)
196+
}
197+
198+
func NewBoxMullerSample(fields []db.Field, schema, name, _ string, values [][]Getter, _ float64, tableSize int64) Sampler {
199+
s := &BoxMullerSample{}
200+
s.table = name
201+
s.schema = schema
202+
s.limit = len(values)
203+
s.values = values
204+
s.fields = fields
205+
s.tableSize = tableSize
206+
// TODO
207+
s.stddev = float64(s.limit)
208+
s.mean = float64(tableSize) / 2
209+
return s
210+
}
211+
212+
type ZipfSample struct {
213+
sampleCommon
214+
zipfRand *rand.Zipf
215+
}
216+
217+
func (s *ZipfSample) Sample() error {
218+
219+
rowNumbers := make([]string, s.limit)
220+
for i := range rowNumbers {
221+
rowNumbers[i] = strconv.Itoa(int(s.zipfRand.Uint64()))
222+
}
223+
escapedFields := db.EscapedNamesListFromFields(s.fields)
224+
query := fmt.Sprintf("SELECT %s FROM %s WHERE %s IN (%s) AND %s LIMIT %d",
225+
escapedFields,
226+
db.FilterOnRowNumberFromClause(s.fields, s.table, s.schema),
227+
db.FilterOnRowNumberVarClause(),
228+
strings.Join(rowNumbers, ","),
229+
db.EscapedFieldsIsNotNull(s.fields),
230+
s.limit,
231+
)
232+
233+
return s.query(query, s.values)
234+
}
235+
236+
func NewZipfSample(fields []db.Field, schema, name, _ string, values [][]Getter, _ float64, tableSize int64) Sampler {
237+
s := &ZipfSample{}
238+
s.table = name
239+
s.schema = schema
240+
s.limit = len(values)
241+
s.values = values
242+
s.fields = fields
243+
244+
s.zipfRand = rand.NewZipf(rand.New(rand.NewSource(time.Now().UnixNano())), 1.1, 1.0, uint64(tableSize))
245+
246+
return s
247+
}

main.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ func main() {
5151
"version": buildInfo,
5252
"SequentialFlag": generate.SequentialFlag,
5353
"BinomialFlag": generate.BinomialFlag,
54+
"NormalFlag": generate.NormalFlag,
55+
"ParetoFlag": generate.ParetoFlag,
5456
},
5557
kong.ConfigureHelp(kong.HelpOptions{
5658
Compact: false,

main_test.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,18 @@ func TestRun(t *testing.T) {
182182
engines: []string{"pg", "mysql"},
183183
cmds: [][]string{[]string{"--rows=100", "--table=t1"}, []string{"--rows=100", "--table=t2", "--default-relationship=binomial", "--coin-flip-percent=60"}},
184184
},
185+
{
186+
name: "fk_pareto",
187+
checkQuery: "select count(distinct t1.id) between 1 and 99 from t1 join t2 on t1.id = t2.t1_id;",
188+
engines: []string{"pg", "mysql"},
189+
cmds: [][]string{[]string{"--rows=100", "--table=t1"}, []string{"--rows=100", "--table=t2", "--default-relationship=pareto"}},
190+
},
191+
{
192+
name: "fk_normal",
193+
checkQuery: "select count(distinct t1.id) between 1 and 99 from t1 join t2 on t1.id = t2.t1_id;",
194+
engines: []string{"pg", "mysql"},
195+
cmds: [][]string{[]string{"--rows=100", "--table=t1"}, []string{"--rows=100", "--table=t2", "--default-relationship=normal", "--bulk-size=10"}},
196+
},
185197

186198
// 5% of 1000 will end up being 50, but we need 100 samples per chunks and t1_id has NOT NULL so it has to loop to get more samples
187199
{

tests/mysql/fk_normal

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
fk_binomial

tests/mysql/fk_pareto

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
fk_binomial

0 commit comments

Comments
 (0)