Skip to content

Commit ef0fe92

Browse files
committed
Add: timestamp --min/max-generated-time
1 parent 800e31e commit ef0fe92

8 files changed

Lines changed: 79 additions & 180 deletions

File tree

README.md

Lines changed: 25 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,9 @@ This is early stage
2727
|double|0 ~ 1000|
2828
|char(n)|up to n random chars|
2929
|varchar(n)|up to n random chars|
30-
|date|NOW() - 1 year ~ NOW()|
31-
|datetime|NOW() - 1 year ~ NOW()|
32-
|timestamp|NOW() - 1 year ~ NOW()|
30+
|date|between --min-generated-time and --max-generated-time|
31+
|datetime|between --min-generated-time and --max-generated-time|
32+
|timestamp|between --min-generated-time and --max-generated-time|
3333
|time|00:00:00 ~ 23:59:59|
3434
|year|Current year - 1 ~ current year|
3535
|tinyblob|up to 100 chars random paragraph|
@@ -49,18 +49,38 @@ Valuable types currently not implemented:
4949
- Vectors
5050

5151
## Options
52+
53+
Common options:
54+
5255
|Option|Description|
5356
|------|-----------|
5457
|--engine|mysql/pg|
5558
|--host|Host name/ip|
5659
|--user|Username|
5760
|--password|Password|
5861
|--port|Port number|
62+
|--quiet|Do not print progress bar|
63+
|--dry-run|Print queries to the standard output instead of inserting them into the db|
64+
|--debug|Show some debug information|
65+
|--pprof|Generate pprof trace at --cpu-prof-path. Also opens port 6060 for pprof go tool|
66+
|--version|Show version and exit|
5967
|--rows-per-table|Number of rows to insert per-table. Will have priority over --rows|
6068
|--bulk-size|Number of rows per INSERT statement (Default: 1000)|
6169
|--workers|how many workers to spawn. Only the random generation and sampling are parallelized. Insert queries are executed one at a time (Default: 3)|
6270
|--table|Table to insert to. When using --query, --table will be used to restrict the tables to insert to.|
6371
|--query|Providing a query will analyze its schema usage, insert recursively into tables, and identify implicit joins|
72+
|--no-skip-fields|Disable field whitelist system. When using a --query, it will get the list of fields being used as a whitelist in order to generate the minimal sets of fields required, unless --no-skip-fields is being used or any * has been found.|
73+
|--null-freq|Define how frequent nullable fields should be NULL|
74+
|--null-freq-map|Define how frequent nullable fields should be NULL for a given column. Will have priority over --null-freq. The format is "--null-freq-map=t1.c1=73;t1.c2=4" to set 73% or 4% of NULL for the respective columns|
75+
|--values-freq-map|Inject arbitrary values at fixed frequencies. The format is "--values-freq-map=t1.c1=val1:0.75,val2:0.23;t1.c2=10:0.99" so that val1 will be on 75% of rows and val2 on 23% for column c1|
76+
|--min-generated-time|Generated timestamps will be after this date. Format is RFC3339. Will default to --max-generated-time - 1 year|
77+
|--max-generated-time|Generated timestamps will be before this date. Format is RFC3339. Will default to now()|
78+
79+
Foreign key sampling options:
80+
|Option|Description|
81+
|------|-----------|
82+
|--add-fk|Add foreign keys, if they are not explicitly created in the table schema. It can complement the foreign keys guessed from the --query, or be used to manually define foreign keys when using --no-fk-guess too. Format: --add-fk="parent_table.col1[,col2...]=child_table.colx[,coly...][; additional fk ]". Example: --add-fk="customers.id,created_at=purchases.customer_id,created_at;purchases.id=items.purchase_id"|
83+
|--no-fk-guess|Do not try to guess foreign keys from the --query missing in the schema. When a query is provided, it will analyze the expected JOINs and try to respect dependencies even when foreign keys are not explicitly created in the database objects. This flag will make the tool stick to the constraints defined in the database only, unless you add foreign keys manually with --add-fk.|
6484
|--default-relationship|Will define the default foreign-key relationship to apply. Possible values: binomial,sequential. The default relation can be overridden with other parameters --binomial or --sequential|
6585
|--binomial|Defines 1-N foreign key relationships using repeated coin flips. Uses Postgres' TABLESAMPLE BERNOULLI or MySQL RAND() < 0.1 (can be tuned with --coin-flip-percent). Format should be "parent_table=child_table". E.g: --binomial="customers=orders;orders=items"|
6686
|--coin-flip-percent|When used with --binomial, it will set the likelihood of each row being sampled. 10 would mean each row has only a 10% chance of being selected when sampling a parent table. Using large values will favor hot rows: the coin flips are done with a full table scan, with a limit set at --bulk-size, so with a large percentage the first rows will be selected most of the time. No effect when used with --sequential (Default: 1)|
@@ -71,17 +91,6 @@ Valuable types currently not implemented:
7191
|--pareto|Defines a 1-N foreign key relationships using zipf (pareto) distribution. Slow method needing full table scans for each samples|
7292
|--pareto-s|Zipf slope parameter. Must be above 1. Higher value will mean faster decay, so first rows will be hotter|
7393
|--pareto-v|Must be >=1. Directly map to V, https://pkg.go.dev/math/rand#Zipf.|
74-
|--add-fk|Add foreign keys, if they are not explicitely created in the table schema. It can complement the foreign keys guessed from the --query, or be used to manually define foreign keys when using --no-fk-guess too. Format: --add-fk="parent_table.col1[,col2...]=child_table.colx[,coly...][; additional fk ]". Example: --add-fk="customers.id,created_at=purchases.customer_id,created_at;purchases.id=items.purchase_id"|
75-
|--no-fk-guess|Do not try to guess foreign keys from the --query missing in the schema. When a query is provided, it will analyze the expected JOINs and try to respect dependencies even when foreign keys are not explicitely created in the database objects. This flag will make the tool stick to the constraints defined in the database only, unless you add foreign keys manually with --add-foreign-keys.|
76-
|--no-skip-fields|Disable field whitelist system. When using a --query, it will get the list of fields being used as a whitelist in order to generate the minimal sets of fields required, unless --no-skip-fields is being used or any * has been found.|
77-
|--null-freq|Define how frequent nullable fields should be NULL|
78-
|--null-freq-map|Define how frequent nullable fields should be NULL for a given column. Will have priority over --null-freq. The format is \"--null-freq-map=t1.c1=73;t1.c2=4\" to set 73% or 4% of NULL for respective columns|
79-
|--values-freq-map|Inject arbitrary values at fixed frequencies. The format is "--values-freq-map=t1.c1=val1:0.75,val2:0.23;t1.c2=10:0.99" so that val1 will be on 75% of rows and val2 on 23% for column c1|
80-
|--quiet|Do not print progress bar|
81-
|--dry-run|Print queries to the standard output instead of inserting them into the db|
82-
|--debug|Show some debug information|
83-
|--pprof|Generate pprof trace at --cpu-prof-path. Also opens port 6060 for pprof go tool|
84-
|--version|Show version and exit|
8594

8695
## Foreign keys support
8796
If a field has Foreign Keys constraints, `random-data-load` will get samples from the referenced tables in order to insert valid values for the field.
@@ -329,72 +338,6 @@ postgres=# select oi.product_no, count(*) from order_items oi group by 1 order b
329338
330339
```
331340

332-
333-
## Options
334-
|Option|Description|
335-
|------|-----------|
336-
|--engine|mysql/pg|
337-
|--host|Host name/ip|
338-
|--user|Username|
339-
|--password|Password|
340-
|--port|Port number|
341-
|--bulk-size|Number of rows per INSERT statement (Default: 1000)|
342-
|--workers|how many workers to spawn. Only the random generation and sampling are parallelized. Insert queries are executed one at a time (Default: 3)|
343-
|--table|Table to insert to. When using --query, --table will be used to restrict the tables to insert to.|
344-
|--query|Providing a query will analyze its schema usage, insert recursively into tables, and identify implicit joins|
345-
|--default-relationship|Will define the default foreign-key relationship to apply. Possible values: binomial,sequential. The default relation can be overriden with other parameters --binomial or --sequential|
346-
|--binomial|Defines a 1-N foreign key relationships using repeated coin flips. Postgres' tablesamples Bernouilli or mysql RAND() < 0.1 (can be tuned with --coin-flip-percent). Format should be "parent_table=child_table". E.g: --binomial="customers=orders;orders=items"|
347-
|--coin-flip-percent|When used with --binomial, it will set the likeliness of each rows to be sampled or not. 10 would mean each rows have only 10%% chance to be selected when sampling a parent table. Using large values will favor hot rows: the coin flips are done with a table full scan, with a limit set at --bulk-size, so with a large percent chance most of the time the first rows will be selected. No effects when used with --sequential (Default: 1)|
348-
|--sequential|Defines a sequential foreign key links relationships. Format should be "parent_table=child_table". E.g: --sequential="citizens=ssns"|
349-
|--add-foreign-keys|Add foreign keys, if they are not explicitely created in the table schema. The format must be parent_table.col1=child_table.col2. It can complement the foreign keys guessed from the --query, or be used to manually define foreign keys when using --no-fk-guess too. Example --add-foreign-keys="customers.id=purchases.customer_id;purchases.id=items.purchase_id"|
350-
|--no-fk-guess|Do not try to guess foreign keys from the --query missing in the schema. When a query is provided, it will analyze the expected JOINs and try to respect dependencies even when foreign keys are not explicitely created in the database objects. This flag will make the tool stick to the constraints defined in the database only, unless you add foreign keys manually with --add-foreign-keys.|
351-
|--no-skip-fields|Disable field whitelist system. When using a --query, it will get the list of fields being used as a whitelist in order to generate the minimal sets of fields required, unless --no-skip-fields is being used or any * has been found.|
352-
|--null-freq|Define how frequent nullable fields should be NULL by default|
353-
|--values-freq-map|Define how frequent nullable fields should be NULL for a given column. Will have priority over --null-freq. The format is "--null-freq-map=t1.c1=73;t1.c2=4" to set 73%% or 4%% of NULL for respective columns
354-
|--query-param-freq|Frequency at which to insert arbitrary values guessed from the query parameters. = and IN operators are handled. Can be disabled when set to 0.0.|
355-
|--quiet|Do not print progress bar|
356-
|--dry-run|Print queries to the standard output instead of inserting them into the db|
357-
|--debug|Show some debug information|
358-
|--pprof|Generate pprof trace at --cpu-prof-path. Also opens port 6060 for pprof go tool|
359-
|--version|Show version and exit|
360-
361-
362-
## Supported fields:
363-
|Field type|Generated values|
364-
|----------|----------------|
365-
|bool|false ~ true|
366-
|tinyint|0 ~ 0xFF|
367-
|smallint|0 ~ 0XFFFF|
368-
|mediumint|0 ~ 0xFFFFFF|
369-
|int - integer|0 ~ 0xFFFFFFFF|
370-
|bigint|0 ~ 0xFFFFFFFFFFFFFFFF|
371-
|float|0 ~ 1e8|
372-
|decimal(m,n)|0 ~ 10^(m-n)|
373-
|double|0 ~ 1000|
374-
|char(n)|up to n random chars|
375-
|varchar(n)|up to n random chars|
376-
|date|NOW() - 1 year ~ NOW()|
377-
|datetime|NOW() - 1 year ~ NOW()|
378-
|timestamp|NOW() - 1 year ~ NOW()|
379-
|time|00:00:00 ~ 23:59:59|
380-
|year|Current year - 1 ~ current year|
381-
|tinyblob|up to 100 chars random paragraph|
382-
|tinytext|up to 100 chars random paragraph|
383-
|blob|up to --max-text-size chars random paragraph|
384-
|text|up to --max-text-size chars random paragraph|
385-
|mediumblob|up to --max-text-size chars random paragraph|
386-
|mediumtext|up to --max-text-size chars random paragraph|
387-
|longblob|up to --max-text-size chars random paragraph|
388-
|longtext|up to --max-text-size chars random paragraph|
389-
|enum|A random item from the valid items list|
390-
|set|A random item from the valid items list|
391-
392-
Valuable types currently not implemented:
393-
- JSONs
394-
- Geospatial
395-
- Vectors
396-
397-
398341
## Foreign keys support
399342
If a field has Foreign Keys constraints, `random-data-load` will get samples from the referenced tables in order to insert valid values for the field.
400343
To enforce ordering, an arbitrary 'ORDER BY 1' is applied. This lets --sequential create 1-1 relationships, and gives better control over the resulting distribution of --binomial.
@@ -481,7 +424,7 @@ https://github.com/Percona-Lab/random-data-load/releases
481424

482425
## To do
483426
General:
484-
- [ ] better datetime random generation. It should be flexible over its range
427+
- [x] better datetime random generation. It should be flexible over its range
485428
- [x] use more gofakeit generators with regexes to generate "legit" data when possible
486429
- [ ] helpers to get schema (generate pgdump/mysqldump commands, get index stats, ...)
487430
- [x] protect against foreign key cycles, both explicit and implicit (avoid generating implicit foreign keys that would end up causing loops)
@@ -498,6 +441,7 @@ Stepping stones to fully reproduce cardinalities:
498441
- [x] table-per-table override for --rows, --null-frequency
499442
- [ ] coin-flip-percent per relationship basis. Current thought: adding it to --binomial this way --binomial="parent=child:70" to set the coinflip to 70 for this link
500443
- [ ] parse col/index stats (cardinality + most_common_elems + most_common_freqs for postgres, cardinalities for MySQL)
444+
- [ ] estimate/decide sampling method+tuning based on stats
501445

502446
Without clear plan:
503447
- [x] More random algorithms (as of now, no good implementation has been found for pareto that wouldn't cause huge runtime and/or huge memory consumption, unless implemented fields are restricted to integers)

cmd/run.go

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"regexp"
77
"slices"
88
"strings"
9+
"time"
910

1011
"github.com/apoorvam/goterminal"
1112
"github.com/pkg/errors"
@@ -19,16 +20,18 @@ import (
1920
type RunCmd struct {
2021
DB db.Config `embed:""`
2122

22-
Table string `help:"Table to insert to. When using --query, --table will be used to restrict the tables to insert to."`
23-
Rows int64 `name:"rows" required:"true" help:"Number of rows to insert"`
24-
RowsPerTable map[string]int64 `name:"rows-per-table" help:"Number of rows to insert per-table. Will have priority over --rows. Format is \"{table}=X\"" default:""`
25-
BulkSize int64 `name:"bulk-size" help:"Number of rows per insert statement" default:"1000"`
26-
DryRun bool `name:"dry-run" help:"Print queries to the standard output instead of inserting them into the db"`
27-
Quiet bool `name:"quiet" help:"Do not print progress bar"`
28-
WorkersCount int `name:"workers" help:"How many workers to spawn. Only the random generation and sampling are parallelized. Insert queries are executed one at a time" default:"3"`
29-
MaxTextSize int64 `help:"Limit the maximum size of long text, varchar and blob fields." default:"65535"`
30-
UUIDVersion int `name:"uuid-version" help:"UUID v4 or v7 for uuid datatypes" default:"4" enum:"4,7"`
31-
Query string `help:"Providing a query will enable to automatically discover the schema, insert recursively into tables, enforce implicit joins."`
23+
Table string `help:"Table to insert to. When using --query, --table will be used to restrict the tables to insert to."`
24+
Rows int64 `name:"rows" required:"true" help:"Number of rows to insert"`
25+
RowsPerTable map[string]int64 `name:"rows-per-table" help:"Number of rows to insert per-table. Will have priority over --rows. Format is \"{table}=X\"" default:""`
26+
BulkSize int64 `name:"bulk-size" help:"Number of rows per insert statement" default:"1000"`
27+
DryRun bool `name:"dry-run" help:"Print queries to the standard output instead of inserting them into the db"`
28+
Quiet bool `name:"quiet" help:"Do not print progress bar"`
29+
WorkersCount int `name:"workers" help:"How many workers to spawn. Only the random generation and sampling are parallelized. Insert queries are executed one at a time" default:"3"`
30+
MaxTextSize int64 `help:"Limit the maximum size of long text, varchar and blob fields." default:"65535"`
31+
UUIDVersion int `name:"uuid-version" help:"UUID v4 or v7 for uuid datatypes" default:"4" enum:"4,7"`
32+
MinGeneratedTime time.Time `help:"Generated timestamps will be after this date. Format is RFC3339. Will default to --max-generated-time - 1 year"`
33+
MaxGeneratedTime time.Time `help:"Generated timestamps will be before this date. Format is RFC3339. Will default to now()"`
34+
Query string `help:"Providing a query will enable to automatically discover the schema, insert recursively into tables, enforce implicit joins."`
3235

3336
generate.ForeignKeyLinks
3437
AddForeignKeys query.VirtualJoins `name:"add-fk" help:"Add foreign keys, if they are not explicitely created in the table schema. It can complement the foreign keys guessed from the --query, or be used to manually define foreign keys when using --no-fk-guess too. Format: --add-fk=\"parent_table.col1[,col2...]=child_table.colx[,coly...][; additional fk ]\". Example: --add-fk=\"customers.id,created_at=purchases.customer_id,created_at;purchases.id=items.purchase_id\""`
@@ -49,6 +52,13 @@ func (cmd *RunCmd) Run() error {
4952
return err
5053
}
5154

55+
if cmd.MaxGeneratedTime.IsZero() {
56+
cmd.MaxGeneratedTime = time.Now()
57+
}
58+
if cmd.MinGeneratedTime.IsZero() {
59+
cmd.MinGeneratedTime = cmd.MaxGeneratedTime.Add(-1 * time.Duration(24*365) * time.Hour)
60+
}
61+
5262
if (cmd.DefaultRelationship == generate.BinomialFlag || len(cmd.Binomial) > 0) && (float64(cmd.Rows)*cmd.CoinFlipPercent) < (float64(cmd.BulkSize)/2) {
5363
cmd.CoinFlipPercent = float64(cmd.BulkSize) / float64(cmd.Rows) / 2
5464
log.Info().Msgf("Increasing --coin-flip-percent to %.10f due to low --rows to ensure we can at least sample and get half of --bulk-size at a time", cmd.CoinFlipPercent)
@@ -163,7 +173,7 @@ func (cmd *RunCmd) Run() error {
163173
func (cmd *RunCmd) run(table *db.Table) error {
164174
rows := valueForTable(cmd.Rows, cmd.RowsPerTable, table.Name)
165175
colNullFreqs := frequency.SharedTableFrequency[table.Name]
166-
ins := generate.New(table, cmd.ForeignKeyLinks, cmd.WorkersCount, cmd.MaxTextSize, cmd.UUIDVersion, colNullFreqs)
176+
ins := generate.New(table, cmd.ForeignKeyLinks, cmd.WorkersCount, cmd.MaxTextSize, cmd.UUIDVersion, colNullFreqs, &cmd.MinGeneratedTime, &cmd.MaxGeneratedTime)
167177

168178
if !cmd.Quiet && !cmd.DryRun {
169179
go startProgressBar(table.Name, rows, ins.NotifyChan)

generate/date.go

Lines changed: 3 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
package generate
22

33
import (
4-
"math/rand"
54
"time"
65
)
76

@@ -17,43 +16,7 @@ func (r *RandomDate) IsQuotable() bool {
1716
return true
1817
}
1918

20-
func NewRandomDate() *RandomDate {
21-
// TODO allownull
22-
var randomSeconds time.Duration
23-
for i := 0; i < 10 && randomSeconds != 0; i++ {
24-
randomSeconds = time.Duration(rand.Int63n(int64(oneYear)) + rand.Int63n(100))
25-
// TODO: configurable date range
26-
//for i := 0; i < 10 && randomSeconds == 0; i++ {
27-
// randomSeconds += time.Duration((rand.Int63n(4*int64(oneYear)) + rand.Int63n(100)) * 1000000000)
28-
}
29-
return &RandomDate{time.Now().Add(-1 * randomSeconds)}
19+
func NewRandomDate(minGeneratedTime, maxGeneratedTime *time.Time) *RandomDate {
20+
t := time.Unix(NewRandomIntRange(minGeneratedTime.Unix(), maxGeneratedTime.Unix()).value, 0)
21+
return &RandomDate{t}
3022
}
31-
32-
//type RandomDateInRange struct {
33-
// value time.Time
34-
//}
35-
//
36-
//func (r *RandomDateInRange) Value() interface{} {
37-
// return r.value
38-
//}
39-
//
40-
//func (r *RandomDateInRange) String() string {
41-
// d := r.Value().(time.Time)
42-
// return d.Format("2006-01-02 15:03:04")
43-
//}
44-
//
45-
//func (r *RandomDateInRange) Quote() string {
46-
// d := r.Value().(time.Time)
47-
// return fmt.Sprintf("'%s'", d.Format("2006-01-02 15:03:04"))
48-
//}
49-
//
50-
//func NewRandomDateInRange(name string, min, max string, allowNull bool) *RandomDateInRange {
51-
// if min == "" {
52-
// t := time.Now().Add(-1 * time.Duration(oneYear) * time.Second)
53-
// min = t.Format("2006-01-02")
54-
// }
55-
// var randomSeconds int64
56-
// randomSeconds = rand.Int63n(oneYear) + rand.Int63n(100)
57-
// d := time.Now().Add(-1 * time.Duration(randomSeconds) * time.Second)
58-
// return &RandomDateInRange{name, min, max, allowNull}
59-
//}

0 commit comments

Comments
 (0)