Commit 8972f12
Json uploads (#647)

* Upload json or json-list files, in addition to CSV and SQLite
* Improvements to user uploads: refactoring, test coverage, better UI, bug fixes, logging
* Clicking a field name in the schema explorer copies it to the clipboard
* Limit charts to 10 numerical series for performance (past 10 it's incomprehensible anyway)
* Limit sampling of uploaded files to 5k rows for the purposes of type inference

1 parent 4b31630 commit 8972f12

27 files changed: 1433 additions & 250 deletions

docs/features.rst

Lines changed: 11 additions & 12 deletions

```diff
@@ -1,22 +1,20 @@
 Features
 ========
 
-Easy to get started
--------------------
-- Built on Django's ORM, so works with MySQL, Postgres, Oracle,
-  SQLite, Snowflake, MS SQL Server, RedShift, and MariaDB.
-- If you want to use Snowflake or SQL Server, you will need to install the relevant package
-  (e.g. https://pypi.org/project/django-snowflake/, https://github.com/microsoft/mssql-django)
-- Small number of dependencies.
-- MIT licensed (except for functionality in the /ee/ directory,
-  which is still free for commercial use, but can't be resold).
-
 SQL Assistant
 -------------
 - Built-in integration with OpenAI (or the LLM of your choosing)
   to quickly get help with your query, with relevant schema
   automatically injected into the prompt. Simple, effective.
 
+Database Support
+----------------
+- Supports MySQL, Postgres (and, by extension, pg-connection-compatible DBs like Redshift), SQLite,
+  Oracle, MS SQL Server, MariaDB, and Snowflake.
+- Note: for Snowflake or SQL Server, you will need to install the relevant Django connection package
+  (e.g. https://pypi.org/project/django-snowflake/, https://github.com/microsoft/mssql-django).
+- Also supports ad-hoc data sources by uploading JSON, CSV, or SQLite files directly.
+
 Snapshots
 ---------
 - Tick the 'snapshot' box on a query, and Explorer will upload a
@@ -120,7 +118,8 @@ Displaying query results as charts
 ----------------------------------
 
 If the results table has numeric columns, they can be displayed in a bar chart. The first column will always be used
-as the x-axis labels. This is quite basic, but can be useful for quick visualization.
+as the x-axis labels. This is quite basic, but can be useful for quick visualization. Charts (if enabled) will render
+for query results with ten or fewer numeric columns. With more series than that, the charts become a hot mess quickly.
 
 To enable this feature, set the ``EXPLORER_CHARTS_ENABLED`` setting to ``True`` and install the plotting library
 ``matplotlib`` with:
@@ -169,7 +168,7 @@ Multiple Connections
   way. See connections.py for more documentation on
   multi-connection setup.
 - SQL Explorer also supports user-provided connections in the form
-  of standard database connection details, or uploading CSV or SQLite
+  of standard database connection details, or uploading CSV, JSON, or SQLite
   files. See the 'User uploads' section of :doc:`settings`.
 
 Power tips
```
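As a quick illustration of the docs change above: enabling charts is a one-line settings change plus an optional dependency. This is a hedged sketch of a Django settings.py, not part of the commit; only ``EXPLORER_CHARTS_ENABLED`` is named in the docs.

```python
# settings.py (sketch, not from this commit)
# Enables bar/line chart rendering for query results, per the docs above.
# matplotlib must also be installed, e.g.: pip install matplotlib
EXPLORER_CHARTS_ENABLED = True
```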

explorer/charts.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -25,7 +25,8 @@ def get_chart(result: QueryResult, chart_type: str) -> Optional[str]:
         c for c in range(1, len(result.data[0]))
         if all([isinstance(col[c], (int, float)) or col[c] is None for col in result.data])
     ]
-    if len(numeric_columns) < 1:
+    # Don't create charts for > 10 series. This is a lightweight visualization.
+    if len(numeric_columns) < 1 or len(numeric_columns) > 10:
        return None
     labels = [row[0] for row in result.data]
     fig, ax = plt.subplots(figsize=(10, 3.8))
```
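For readers skimming the diff, here is a small self-contained sketch of what the changed guard does, run against invented rows shaped like Explorer's `result.data` (the data and variable names below are hypothetical, not commit code):

```python
# Sketch: which 1-indexed columns would chart, given rows shaped like
# Explorer's result.data, where column 0 holds the x-axis labels?
data = [
    ["jan", 10, 3.5, "north"],
    ["feb", 12, None, "south"],
]

numeric_columns = [
    c for c in range(1, len(data[0]))
    if all(isinstance(row[c], (int, float)) or row[c] is None for row in data)
]
print(numeric_columns)  # [1, 2] -- column 3 holds strings, so it is skipped

# The new guard: no chart with fewer than 1 or more than 10 numeric series.
will_chart = 1 <= len(numeric_columns) <= 10
print(will_chart)  # True
```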
Lines changed: 24 additions & 0 deletions

@@ -0,0 +1,24 @@

```python
import os
from io import BytesIO

from explorer.ee.db_connections.type_infer import get_parser
from explorer.ee.db_connections.utils import pandas_to_sqlite


def parse_to_sqlite(file) -> (BytesIO, str):
    f_name = file.name
    f_bytes = file.read()
    df_parser = get_parser(file)
    if df_parser:
        df = df_parser(f_bytes)
        try:
            f_bytes = pandas_to_sqlite(df, local_path=f"{f_name}_tmp_local.db")
        except Exception as e:  # noqa
            raise ValueError(f"Error while parsing {f_name}: {e}") from e
        # replace the previous extension with .db, as it is now a sqlite file
        name, _ = os.path.splitext(f_name)
        f_name = f"{name}.db"
    else:
        return BytesIO(f_bytes), f_name  # if it's a SQLite file already, simply cough it up as a BytesIO object
    return f_bytes, f_name
```
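A hedged usage sketch for `parse_to_sqlite` (not part of the commit): `SimpleUploadedFile` is Django's in-memory upload object, and the file name and CSV bytes are invented. A CSV or JSON upload comes back as an in-memory SQLite database with a `.db` name; a SQLite upload passes through unchanged.

```python
# Hedged sketch: how an uploaded CSV flows through parse_to_sqlite.
from django.core.files.uploadedfile import SimpleUploadedFile

# parse_to_sqlite comes from the new module shown above
# (its path is not visible in this capture).

upload = SimpleUploadedFile(
    "orders.csv",
    b"id,total\n1,100\n2,250\n",
    content_type="text/csv",
)

db_bytes, db_name = parse_to_sqlite(upload)
print(db_name)  # orders.db -- the extension is rewritten once it's SQLite
# db_bytes is a BytesIO holding an in-memory SQLite database,
# ready for the caller to persist.
```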
explorer/ee/db_connections/mime.py

Lines changed: 54 additions & 0 deletions

@@ -0,0 +1,54 @@

```python
import csv
import json

# These are 'shallow' checks. They are just to understand if the upload appears valid at surface-level.
# A deeper check will happen when pandas tries to parse the file.
# This is designed to be quick, and simply assigns the right (full) parsing function to the uploaded file.


def is_csv(file):
    if file.content_type != "text/csv":
        return False
    try:
        # Check if the file content can be read as a CSV
        file.seek(0)
        sample = file.read(1024).decode("utf-8")
        csv.Sniffer().sniff(sample)
        file.seek(0)
        return True
    except csv.Error:
        return False


def is_json(file):
    if file.content_type != "application/json":
        return False
    if not file.name.lower().endswith(".json"):
        return False
    return True


def is_json_list(file):
    if not file.name.lower().endswith(".json"):
        return False
    file.seek(0)
    first_line = file.readline()
    file.seek(0)
    try:
        json.loads(first_line.decode("utf-8"))
        return True
    except ValueError:
        return False


def is_sqlite(file):
    if file.content_type != "application/x-sqlite3":
        return False
    try:
        # Check if the file starts with the SQLite file header
        file.seek(0)
        header = file.read(16)
        file.seek(0)
        return header == b"SQLite format 3\x00"
    except Exception as e:  # noqa
        return False
```
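To illustrate the shallow checks (again a sketch with invented payloads, not commit code): a newline-delimited JSON file is detected by whether its first line parses as JSON on its own, while a pretty-printed JSON document fails that test and falls through to `is_json`.

```python
# Sketch: exercising the shallow checks, assuming Django is available and
# the functions above are in scope.
from django.core.files.uploadedfile import SimpleUploadedFile

jsonl = SimpleUploadedFile(
    "events.json",
    b'{"a": 1}\n{"a": 2}\n',
    content_type="application/json",
)
print(is_json_list(jsonl))   # True -- the first line parses as JSON by itself

pretty = SimpleUploadedFile(
    "events.json",
    b'[\n  {"a": 1},\n  {"a": 2}\n]\n',
    content_type="application/json",
)
print(is_json_list(pretty))  # False -- "[" alone is not valid JSON
print(is_json(pretty))       # True -- content type and extension both match
```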
explorer/ee/db_connections/type_infer.py

Lines changed: 132 additions & 0 deletions

@@ -0,0 +1,132 @@

```python
import io
import json

from explorer.ee.db_connections.mime import is_csv, is_json, is_sqlite, is_json_list


MAX_TYPING_SAMPLE_SIZE = 5000
SHORTEST_PLAUSIBLE_DATE_STRING = 5


def get_parser(file):
    if is_csv(file):
        return csv_to_typed_df
    if is_json_list(file):
        return json_list_to_typed_df
    if is_json(file):
        return json_to_typed_df
    if is_sqlite(file):
        return None
    raise ValueError(f"File {file.content_type} not supported.")


def csv_to_typed_df(csv_bytes, delimiter=",", has_headers=True):
    import pandas as pd
    csv_file = io.BytesIO(csv_bytes)
    df = pd.read_csv(csv_file, sep=delimiter, header=0 if has_headers else None)
    return df_to_typed_df(df)


def json_list_to_typed_df(json_bytes):
    import pandas as pd
    data = []
    for line in io.BytesIO(json_bytes).readlines():
        data.append(json.loads(line.decode("utf-8")))

    df = pd.json_normalize(data)
    return df_to_typed_df(df)


def json_to_typed_df(json_bytes):
    import pandas as pd
    json_file = io.BytesIO(json_bytes)
    json_content = json.load(json_file)
    df = pd.json_normalize(json_content)
    return df_to_typed_df(df)


def atof_custom(value):
    # Remove any thousands separators and convert the decimal point
    if "," in value and "." in value:
        if value.index(",") < value.index("."):
            # 0,000.00 format
            value = value.replace(",", "")
        else:
            # 0.000,00 format
            value = value.replace(".", "").replace(",", ".")
    elif "," in value:
        # No decimal point, only thousands separator
        value = value.replace(",", "")
    return float(value)


def df_to_typed_df(df):  # noqa
    import pandas as pd
    from dateutil import parser
    try:

        for column in df.columns:

            # If we somehow have an array within a field (e.g. from a json object) then convert it to a string
            df[column] = df[column].apply(lambda x: str(x) if isinstance(x, list) else x)

            values = df[column].dropna().unique()
            if len(values) > MAX_TYPING_SAMPLE_SIZE:
                values = pd.Series(values).sample(MAX_TYPING_SAMPLE_SIZE, random_state=42).to_numpy()

            is_date = False
            is_integer = True
            is_float = True

            for value in values:
                try:
                    float_val = atof_custom(str(value))
                    if float_val == int(float_val):
                        continue  # This is effectively an integer
                    else:
                        is_integer = False
                except ValueError:
                    is_integer = False
                    is_float = False
                    break

            if is_integer:
                is_float = False

            if not is_integer and not is_float:
                is_date = True

                # The dateutil parser is very aggressive and will interpret many short strings as dates.
                # For example "12a" will be interpreted as 12:00 AM on the current date.
                # That is not the behavior anyone wants. The shortest plausible date string is e.g. 1-1-23
                try_parse = [v for v in values if len(str(v)) > SHORTEST_PLAUSIBLE_DATE_STRING]
                if len(try_parse) > 0:
                    for value in try_parse:
                        try:
                            parser.parse(str(value))
                        except (ValueError, TypeError, OverflowError):
                            is_date = False
                            break
                else:
                    is_date = False

            if is_date:
                df[column] = pd.to_datetime(df[column], errors="coerce", utc=True)
            elif is_integer:
                df[column] = df[column].apply(lambda x: int(atof_custom(str(x))) if pd.notna(x) else x)
                # If there are NaN / blank values, the column will be converted to float
                # Convert it back to integer
                df[column] = df[column].astype("Int64")
            elif is_float:
                df[column] = df[column].apply(lambda x: atof_custom(str(x)) if pd.notna(x) else x)
            else:
                inferred_type = pd.api.types.infer_dtype(values)
                if inferred_type == "integer":
                    df[column] = pd.to_numeric(df[column], errors="coerce", downcast="integer")
                elif inferred_type == "floating":
                    df[column] = pd.to_numeric(df[column], errors="coerce")

        return df

    except pd.errors.ParserError as e:
        return str(e)
```
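A worked, hedged example of the whole inference pass (the CSV bytes are invented; requires pandas and python-dateutil): whole numbers survive as nullable `Int64`, thousands-separated decimals become floats via `atof_custom`, and date-like strings longer than `SHORTEST_PLAUSIBLE_DATE_STRING` become UTC datetimes.

```python
# Hedged example: run the CSV parser over invented bytes and inspect dtypes.
from explorer.ee.db_connections.type_infer import csv_to_typed_df

csv_bytes = (
    b"id,amount,signup\n"
    b'1,"1,200.50",2023-01-15\n'
    b"2,300,2023-02-20\n"
)

df = csv_to_typed_df(csv_bytes)
print(df.dtypes)
# id                      Int64  (whole numbers, NaN-safe)
# amount                float64  ("1,200.50" de-separated by atof_custom)
# signup    datetime64[ns, UTC]  (strings long enough for dateutil to parse)
```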

explorer/ee/db_connections/utils.py

Lines changed: 0 additions & 93 deletions

```diff
@@ -108,96 +108,3 @@ def pandas_to_sqlite(df, local_path="local_database.db"):
     # Delete the local SQLite database file
     # Finally block to ensure we don't litter files around
     os.remove(local_path)
-
-
-MAX_TYPING_SAMPLE_SIZE = 10000
-SHORTEST_PLAUSIBLE_DATE_STRING = 5
-
-
-def atof_custom(value):
-    # Remove any thousands separators and convert the decimal point
-    if "," in value and "." in value:
-        if value.index(",") < value.index("."):
-            # 0,000.00 format
-            value = value.replace(",", "")
-        else:
-            # 0.000,00 format
-            value = value.replace(".", "").replace(",", ".")
-    elif "," in value:
-        # No decimal point, only thousands separator
-        value = value.replace(",", "")
-    return float(value)
-
-def csv_to_typed_df(csv_bytes, delimiter=",", has_headers=True):  # noqa
-    import pandas as pd
-    from dateutil import parser
-    try:
-
-        csv_file = io.BytesIO(csv_bytes)
-        df = pd.read_csv(csv_file, sep=delimiter, header=0 if has_headers else None)
-
-        for column in df.columns:
-            values = df[column].dropna().unique()
-            if len(values) > MAX_TYPING_SAMPLE_SIZE:
-                values = pd.Series(values).sample(MAX_TYPING_SAMPLE_SIZE, random_state=42).to_numpy()
-
-            is_date = False
-            is_integer = True
-            is_float = True
-
-            for value in values:
-                try:
-                    float_val = atof_custom(str(value))
-                    if float_val == int(float_val):
-                        continue  # This is effectively an integer
-                    else:
-                        is_integer = False
-                except ValueError:
-                    is_integer = False
-                    is_float = False
-                    break
-
-            if is_integer:
-                is_float = False
-
-            if not is_integer and not is_float:
-                is_date = True
-
-                # The dateutil parser is very aggressive and will interpret many short strings as dates.
-                # For example "12a" will be interpreted as 12:00 AM on the current date.
-                # That is not the behavior anyone wants. The shortest plausible date string is e.g. 1-1-23
-                try_parse = [v for v in values if len(str(v)) > SHORTEST_PLAUSIBLE_DATE_STRING]
-                if len(try_parse) > 0:
-                    for value in try_parse:
-                        try:
-                            parser.parse(str(value))
-                        except (ValueError, TypeError, OverflowError):
-                            is_date = False
-                            break
-                else:
-                    is_date = False
-
-            if is_date:
-                df[column] = pd.to_datetime(df[column], errors="coerce", utc=True)
-            elif is_integer:
-                df[column] = df[column].apply(lambda x: int(atof_custom(str(x))) if pd.notna(x) else x)
-                # If there are NaN / blank values, the column will be converted to float
-                # Convert it back to integer
-                df[column] = df[column].astype("Int64")
-            elif is_float:
-                df[column] = df[column].apply(lambda x: atof_custom(str(x)) if pd.notna(x) else x)
-            else:
-                inferred_type = pd.api.types.infer_dtype(values)
-                if inferred_type == "integer":
-                    df[column] = pd.to_numeric(df[column], errors="coerce", downcast="integer")
-                elif inferred_type == "floating":
-                    df[column] = pd.to_numeric(df[column], errors="coerce")
-
-        return df
-
-    except pd.errors.ParserError as e:
-        return str(e)
-
-
-def is_csv(file):
-    return file.content_type == "text/csv"
```
