Skip to content

Commit 530a1f4

Browse files
authored
Upgrade pyarrow to fix S3 segfaults (#7)
* Bump arrow all the way to latest * Add extra module support - closes #4 * Version bump
1 parent d6cafb7 commit 530a1f4

6 files changed

Lines changed: 145 additions & 74 deletions

File tree

faker_cli/cli.py

Lines changed: 49 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,32 @@
1-
from faker import Faker
2-
import click
31
import sys
4-
from faker_cli.templates import CloudFrontWriter, S3AccessLogs, S3AccessWriter, CloudTrailLogs, CloudFrontLogs
5-
6-
from faker_cli.writer import CSVWriter, JSONWriter, ParquetWriter, DeltaLakeWriter
72
from typing import List
83

4+
import click
5+
from faker import Faker
6+
7+
from faker_cli.templates import (
8+
CloudFrontLogs,
9+
CloudFrontWriter,
10+
S3AccessLogs,
11+
S3AccessWriter,
12+
)
13+
from faker_cli.writer import CSVWriter, JSONWriter
14+
15+
916
def infer_column_names(col_names, col_types: str) -> List[str]:
    """
    Work out the column names to use for the output.

    A non-empty ``col_names`` string takes priority and is split on commas.
    Otherwise ``col_types`` is split on commas and its entries double as
    the column names.
    """
    # No smarter parsing yet - both inputs are treated as plain comma-separated lists.
    chosen = col_names if col_names else col_types
    return chosen.split(",")
1825

26+
1927
KLAS_MAPPER = {
2028
"csv": CSVWriter,
2129
"json": JSONWriter,
22-
"parquet": ParquetWriter,
23-
"deltalake": DeltaLakeWriter
2430
}
2531

2632
TEMPLATE_MAPPER = {
@@ -32,9 +38,16 @@ def infer_column_names(col_names, col_types: str) -> List[str]:
3238
fake.add_provider(S3AccessLogs)
3339
fake.add_provider(CloudFrontLogs)
3440

41+
3542
@click.command()
3643
@click.option("--num-rows", "-n", default=1, help="Number of rows")
37-
@click.option("--format", "-f", type=click.Choice(["csv", "json", "parquet", "deltalake"]), default="csv", help="Format of the output")
44+
@click.option(
45+
"--format",
46+
"-f",
47+
type=click.Choice(["csv", "json", "parquet", "deltalake"]),
48+
default="csv",
49+
help="Format of the output",
50+
)
3851
@click.option("--output", "-o", type=click.Path(writable=True))
3952
@click.option("--columns", "-c", help="Column names", default=None, required=False)
4053
@click.option("--template", "-t", help="Template to use", type=click.Choice(["s3access", "cloudfront"]), default=None)
@@ -53,16 +66,37 @@ def main(num_rows, format, output, columns, template, column_types):
5366
ctx = click.get_current_context()
5467
click.echo(ctx.get_help())
5568
ctx.exit()
56-
raise click.BadArgumentUsage(
57-
"either --template or a list of Faker property names must be provided."
58-
)
69+
raise click.BadArgumentUsage("either --template or a list of Faker property names must be provided.")
5970

6071
# Parquet and Delta Lake output both require a filename
6172
if format in ["parquet", "deltalake"] and output is None:
6273
raise click.BadArgumentUsage("parquet | deltalake formats requires --output/-o filename parameter.")
6374
if output is not None and format not in ["parquet", "deltalake"]:
6475
raise click.BadArgumentUsage("output files not supported for csv/json yet.")
65-
76+
77+
# Optionally load additional features
78+
if format == "parquet":
79+
try:
80+
from faker_cli.writers.parquet import ParquetWriter
81+
82+
KLAS_MAPPER["parquet"] = ParquetWriter
83+
except ImportError:
84+
raise click.ClickException(
85+
"Using Parquet writer, but the 'pyarrow' package is not installed. "
86+
"Make sure to install faker-cli using `pip install faker-cli[parquet]`."
87+
)
88+
89+
if format == "deltalake":
90+
try:
91+
from faker_cli.writers.delta import DeltaLakeWriter
92+
93+
KLAS_MAPPER["deltalake"] = DeltaLakeWriter
94+
except ImportError:
95+
raise click.ClickException(
96+
"Using Delta writer, but the 'deltalake' package is not installed. "
97+
"Make sure to install faker-cli using `pip install faker-cli[delta]`."
98+
)
99+
66100
# If the user provides a template, we use that provider and writer and exit.
67101
# We assume a template has a custom writer that may be different than CSV or JSON
68102
if template:
@@ -72,13 +106,13 @@ def main(num_rows, format, output, columns, template, column_types):
72106
row = fake.format(log_entry)
73107
writer.write(row)
74108
return
75-
109+
76110
# Now, if a template hasn't been provided, generate some fake data!
77111
col_types = column_types.split(",")
78112
headers = infer_column_names(columns, column_types)
79113
writer = KLAS_MAPPER.get(format)(sys.stdout, headers, output)
80114
for i in range(num_rows):
81115
# TODO: Handle args
82-
row = [ fake.format(ctype) for ctype in col_types ]
116+
row = [fake.format(ctype) for ctype in col_types]
83117
writer.write(row)
84118
writer.close()

faker_cli/writer.py

Lines changed: 0 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,6 @@
11
import csv
22
import json
33
from typing import Optional
4-
import pyarrow as pa
5-
import pyarrow.parquet as pq
6-
import deltalake
74

85

96
class Writer:
@@ -38,27 +35,3 @@ def write(self, row):
3835
jsonl = json.dumps(dict(zip(self.headers, row)), default=str)
3936
self.writer.write(jsonl)
4037
self.writer.write("\n")
41-
42-
43-
class ParquetWriter(Writer):
44-
def __init__(self, output, headers, filename):
45-
super().__init__(output, headers)
46-
self.filename = filename
47-
self.table: pa.Table = None
48-
49-
def write(self, row):
50-
ini_dict = [{k: [v]} for k, v in list(zip(self.headers, row))]
51-
tbl = {k: v for d in ini_dict for k, v in d.items()}
52-
table = pa.table(tbl)
53-
if self.table is None:
54-
self.table = table
55-
else:
56-
self.table = pa.concat_tables([self.table, table])
57-
58-
def close(self):
59-
pq.write_table(self.table, self.filename)
60-
61-
62-
class DeltaLakeWriter(ParquetWriter):
63-
def close(self):
64-
deltalake.write_deltalake(table_or_uri=self.filename, data=self.table)

faker_cli/writers/delta.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import deltalake
2+
3+
from faker_cli.writers.parquet import ParquetWriter
4+
5+
6+
class DeltaLakeWriter(ParquetWriter):
    """Variant of ``ParquetWriter`` whose ``close`` writes a Delta Lake table.

    Only ``close`` is overridden; everything else is inherited from
    ``ParquetWriter``.
    """

    def close(self):
        """Pass ``self.table`` to ``deltalake.write_deltalake``.

        ``self.filename`` is used as the ``table_or_uri`` destination.
        """
        destination = self.filename
        deltalake.write_deltalake(table_or_uri=destination, data=self.table)

faker_cli/writers/parquet.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
2+
import pyarrow as pa
3+
import pyarrow.parquet as pq
4+
5+
from faker_cli.writer import Writer
6+
7+
class ParquetWriter(Writer):
    """Writer that buffers rows in a pyarrow table and saves it as Parquet.

    Rows accumulate in memory in ``self.table``; nothing is written to
    ``self.filename`` until ``close`` is called.
    """

    def __init__(self, output, headers, filename):
        """Set up the writer.

        Args:
            output: Passed through unchanged to ``Writer.__init__``.
            headers: Column names, paired positionally with each row's values.
            filename: Destination handed to ``pyarrow.parquet.write_table``.
        """
        super().__init__(output, headers)
        self.filename = filename
        # Remains None until the first row is written.
        self.table: pa.Table = None

    def write(self, row):
        """Append one row to the in-memory table.

        Args:
            row: Values for a single record, in the same order as the headers.
        """
        # Build a one-row table: each header maps to a single-element column.
        single_row = pa.table({name: [value] for name, value in zip(self.headers, row)})
        if self.table is None:
            self.table = single_row
        else:
            self.table = pa.concat_tables([self.table, single_row])

    def close(self):
        """Write the buffered table to ``self.filename`` as a Parquet file.

        If no rows were written, an empty table with one string-typed column
        per header is written instead of passing ``None`` to pyarrow.
        """
        if self.table is None:
            # With zero rows there is no data to infer column types from, so
            # fall back to empty string columns to still produce a valid file.
            self.table = pa.table({name: pa.array([], type=pa.string()) for name in self.headers})
        pq.write_table(self.table, self.filename)

poetry.lock

Lines changed: 58 additions & 29 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)