Skip to content

Commit cf95c76

Browse files
Add new -parquet option
1 parent 08be2e1 commit cf95c76

5 files changed

Lines changed: 253 additions & 18 deletions

File tree

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,5 @@ dist/
88
htmlcov/
99
.tox/
1010
docs/_build/
11-
*.egg-info
11+
*.egg-info
12+
*.parquet

README.md

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ ip2location-csv-converter [-range | -cidr | -hex] [-replace | -append] INPUT_FIL
4747
| -hex | IP numbers will be converted into hexadecimal format. (auto padding) |
4848
| -hex4 | IP numbers will be converted into hexadecimal format. (pad IPv4) |
4949
| -hex6 | IP numbers will be converted into hexadecimal format. (pad IPv6) |
50+
| -parquet | Convert IP2Location/IP2Proxy CSV file to a custom parquet file. |
5051
| -replace | The IP numbers in will be replaced to the selected format. |
5152
| -append | The converted format will be appended after the IP numbers field. |
5253

@@ -214,6 +215,47 @@ Output:
214215

215216
You can use this converter for a custom input file provided the input is in CSV format, with the first and second field contain the **ip from** and **ip to** information in numeric format.
216217

218+
## Parquet conversion
219+
220+
You can convert any IP2Location or IP2Proxy CSV file to a parquet file using this converter. The command will be:
221+
222+
```
223+
ip2location-csv-converter -parquet <database_type> <input_csv_filename> <output_parquet_filename>
224+
```
225+
226+
You can find the database type of your IP2Location or IP2Proxy CSV file at the links below:
227+
- [https://www.ip2location.com/database/ip2location](https://www.ip2location.com/database/ip2location): Between DB1 to DB26.
228+
- [https://www.ip2location.com/database/ip2proxy](https://www.ip2location.com/database/ip2proxy): Between PX1 to PX12.
229+
230+
For IPv6, due to the current limitation of the Decimal data type in parquet, the converter encodes each IPv6 number as a zero-padded, 32-character lowercase hex string and stores it as varchar. Hence, you will need to convert the IPv6 address into the same hex form before querying.
231+
232+
Below is an example demonstrating the pre-conversion of an IPv6 address before a query:
233+
234+
```python
235+
import ipaddress
236+
237+
# Example IPv6 address
238+
ipv6_addr = "2001:db8::1"
239+
240+
# Convert to integer
241+
ipv6_int = int(ipaddress.IPv6Address(ipv6_addr))
242+
243+
# Convert to zero-padded 32-character lowercase hex string
244+
ipv6_hex = format(ipv6_int, "032x")
245+
246+
```
247+
248+
To query using the hex IPv6 address, the code can look like this:
249+
250+
```python
251+
import duckdb
252+
253+
result = duckdb.query(f"""
254+
SELECT * FROM '<ipv6_parquet_filename>'
255+
    WHERE '{ipv6_hex}' BETWEEN ip_from AND ip_to
256+
""").to_df()
257+
```
258+
217259
## Support
218260

219261
URL: [https://www.ip2location.com](https://www.ip2location.com/)

ip2location_csv_converter/commandline.py

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
import os, re, sys, time
2-
from ip2location_csv_converter.ip2location_csv_converter import convert_to_csv, check_data_validity
2+
from ip2location_csv_converter.ip2location_csv_converter import convert_to_csv, check_data_validity, csv_to_parquet
33

44
regex1 = r"^\-(range|cidr|hex)$"
55
regex2 = r"^\-(replace|append)$"
66
regex3 = r"^\-(help)$"
77

88
def print_usage():
99
print(
10-
"Usage: ip2location-csv-converter [-range | -cidr | -hex] [-replace | -append] INPUT_FILE OUTPUT_FILE\n"
10+
"Usage: ip2location-csv-converter [-range | -cidr | -hex | -parquet] [-replace | -append] [database_type] INPUT_FILE OUTPUT_FILE\n"
1111
"\n"
1212
" -range\n"
1313
" IP numbers will be converted into the first IP address and last IP address in the range.\n"
@@ -27,8 +27,8 @@ def print_usage():
2727
" -replace\n"
2828
" The IP numbers in will be replaced to the selected format.\n"
2929
"\n"
30-
" -append\n"
31-
" The converted format will be appended after the IP numbers field.\n"
30+
" -parquet\n"
31+
" Convert a IP2Location/IP2Proxy CSV to a parquet file.\n"
3232
"\n"
3333
" -help\n"
3434
" Display this guide.\n"
@@ -62,15 +62,24 @@ def main():
6262
if (check_data_validity(input_file) is False):
6363
print ("Please make sure the columns had comma as separator.")
6464
sys.exit(1)
65-
if (re.search(regex1, param1) != None):
66-
conversion_mode = re.findall(regex1, param1)[0]
67-
elif (re.search(regex2, param1) != None):
68-
write_mode = re.findall(regex2, param1)[0]
69-
if (re.search(regex1, param2) != None):
70-
conversion_mode = re.findall(regex1, param2)[0]
71-
elif (re.search(regex2, param2) != None):
72-
write_mode = re.findall(regex2, param2)[0]
73-
convert_to_csv(input_file, output_file, conversion_mode, write_mode)
65+
66+
if param1 == '-parquet':
67+
if param2 == '':
68+
print ("Please provide the database type of the CSV file.")
69+
sys.exit(1)
70+
print(f'Converting {input_file} to {output_file} now...')
71+
csv_to_parquet(input_file, output_file, param2)
72+
print(f'Conversion done.')
73+
else:
74+
if (re.search(regex1, param1) != None):
75+
conversion_mode = re.findall(regex1, param1)[0]
76+
elif (re.search(regex2, param1) != None):
77+
write_mode = re.findall(regex2, param1)[0]
78+
if (re.search(regex1, param2) != None):
79+
conversion_mode = re.findall(regex1, param2)[0]
80+
elif (re.search(regex2, param2) != None):
81+
write_mode = re.findall(regex2, param2)[0]
82+
convert_to_csv(input_file, output_file, conversion_mode, write_mode)
7483

7584
elif ((len(sys.argv) == 2) and (re.search(regex3, sys.argv[1]) != None)):
7685
print_usage()

ip2location_csv_converter/ip2location_csv_converter.py

Lines changed: 185 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
import os, sys, re, csv, socket, struct, ipaddress, binascii
22
import time
3-
from io import open
3+
from io import open, StringIO
4+
import pandas as pd
5+
import pyarrow as pa
6+
import pyarrow.parquet as pq
7+
from decimal import Decimal
8+
49

510
conversion_mode = 'range'
611
write_mode = 'replace'
@@ -179,3 +184,182 @@ def convert_to_csv(input_file, output_file, conversion_mode, write_mode):
179184
# Stop the loop if there are no more rows
180185
if not chunk:
181186
break
187+
188+
def get_last_row(file_path):
    """Return the last non-blank line of a file as text.

    The file is read backwards in fixed-size blocks, so only the tail of a
    large CSV is touched. Unlike a byte-by-byte backward seek, this also
    works for empty files, files holding a single line, and files whose
    final line has no terminator.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The last line that contains something other than line terminators,
        decoded with the default codec (UTF-8) and including its own
        terminator when the file has one. An empty string is returned when
        the file is empty or holds only blank lines.
    """
    block_size = 4096
    with open(file_path, 'rb') as f:
        f.seek(0, 2)  # whence=2: position at end of file
        position = f.tell()
        tail = b''
        while position > 0:
            step = min(block_size, position)
            position -= step
            f.seek(position)
            tail = f.read(step) + tail
            # Ignore trailing terminators so that the newline we look for is
            # the one *preceding* the last line that has real content.
            content_end = len(tail.rstrip(b'\r\n'))
            newline_at = tail.rfind(b'\n', 0, content_end)
            if newline_at != -1:
                tail = tail[newline_at + 1:]
                break

    if not tail.strip(b'\r\n'):
        return ''

    # Keep only the first line of what is left (drops trailing blank lines).
    first_newline = tail.find(b'\n')
    if first_newline != -1:
        tail = tail[:first_newline + 1]
    return tail.decode()
195+
196+
def detect_ip_version_from_number(ip_num):
197+
try:
198+
ip = ipaddress.ip_address(int(ip_num))
199+
return 'IPv4' if ip.version == 4 else 'IPv6'
200+
except ValueError:
201+
return 'Invalid'
202+
203+
def detect_versions_from_chunk(chunk):
    """Collect the IP versions present in the first two columns of a chunk.

    Args:
        chunk: A pandas DataFrame whose columns 0 and 1 hold the start and
            end IP numbers of each range.

    Returns:
        A set containing 4 and/or 6. Cells that cannot be interpreted as an
        IP number (missing values, text, out-of-range numbers) are skipped,
        so the set may be empty.
    """
    versions = set()

    # Combine both start and end IP columns (col 0 and col 1)
    all_ips = pd.concat([chunk.iloc[:, 0], chunk.iloc[:, 1]], ignore_index=True)

    for ip_num in all_ips:
        try:
            versions.add(ipaddress.ip_address(int(ip_num)).version)
        except (TypeError, ValueError, OverflowError):
            # Skip anything int() or ip_address() rejects, including None
            # (TypeError) and infinity (OverflowError), not just bad numbers.
            continue
        if len(versions) > 1:
            break  # early stop once both v4 and v6 have been seen

    return versions
219+
220+
# Scan the file in chunks
221+
def check_ip_versions(csv_path):
    """Abort the program when a CSV file mixes IPv4 and IPv6 ranges.

    Only the first two columns are read, 50,000 rows at a time, and the
    scan stops as soon as more than one IP version has been seen. When both
    versions are present a two-line notice is printed and the process exits
    with status 1; otherwise the function returns normally.

    Args:
        csv_path: Path of the header-less CSV file to scan.
    """
    found = set()
    reader = pd.read_csv(csv_path, chunksize=50_000, header=None, usecols=[0, 1])
    for block in reader:
        found.update(detect_versions_from_chunk(block))
        if len(found) > 1:
            break  # both versions found, no need to read further

    if found != {4, 6}:
        return

    print(f'Your csv file {csv_path} contains mixture of IPv4 and IPv6 addresses, which will causing issue when converting to parquet file.')
    print('It is advisable to separate IPv4 and IPv6 addresses into two identical csv file.')
    sys.exit(1)
232+
233+
# Convert CSV to Parquet
234+
# Column layout of every supported IP2Location (DB*) / IP2Proxy (PX*) CSV.
_PARQUET_COLUMNS = {
    'DB1': ["ip_from", "ip_to", "country_code", "country_name"],
    'DB2': ["ip_from", "ip_to", "country_code", "country_name", "isp"],
    'DB3': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name"],
    'DB4': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "isp"],
    'DB5': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude"],
    'DB6': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "isp"],
    'DB7': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "isp", "domain"],
    'DB8': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "isp", "domain"],
    'DB9': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "zip_code"],
    'DB10': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "zip_code", "isp", "domain"],
    'DB11': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "zip_code", "time_zone"],
    'DB12': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "zip_code", "time_zone", "isp", "domain"],
    'DB13': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "time_zone", "net_speed"],
    'DB14': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "zip_code", "time_zone", "isp", "domain", "net_speed"],
    'DB15': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "zip_code", "time_zone", "idd_code", "area_code"],
    'DB16': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "zip_code", "time_zone", "isp", "domain", "net_speed", "idd_code", "area_code"],
    'DB17': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "time_zone", "net_speed", "weather_station_code", "weather_station_name"],
    'DB18': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "zip_code", "time_zone", "isp", "domain", "net_speed", "idd_code", "area_code", "weather_station_code", "weather_station_name"],
    'DB19': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "isp", "domain", "mcc", "mnc", "mobile_brand"],
    'DB20': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "zip_code", "time_zone", "isp", "domain", "net_speed", "idd_code", "area_code", "weather_station_code", "weather_station_name", "mcc", "mnc", "mobile_brand"],
    'DB21': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "zip_code", "time_zone", "idd_code", "area_code", "elevation"],
    'DB22': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "zip_code", "time_zone", "isp", "domain", "net_speed", "idd_code", "area_code", "weather_station_code", "weather_station_name", "mcc", "mnc", "mobile_brand", "elevation"],
    'DB23': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "isp", "domain", "mcc", "mnc", "mobile_brand", "usage_type"],
    'DB24': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "zip_code", "time_zone", "isp", "domain", "net_speed", "idd_code", "area_code", "weather_station_code", "weather_station_name", "mcc", "mnc", "mobile_brand", "elevation", "usage_type"],
    'DB25': [
        "ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name",
        "latitude", "longitude", "zip_code", "time_zone", "isp", "domain", "net_speed",
        "idd_code", "area_code", "weather_station_code", "weather_station_name",
        "mcc", "mnc", "mobile_brand", "elevation", "usage_type", "address_type",
        "category"
    ],
    'DB26': [
        "ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name",
        "latitude", "longitude", "zip_code", "time_zone", "isp", "domain", "net_speed",
        "idd_code", "area_code", "weather_station_code", "weather_station_name",
        "mcc", "mnc", "mobile_brand", "elevation", "usage_type", "address_type",
        "category", "district", "asn", "as_name"
    ],
    'PX1': ['ip_from', 'ip_to', 'country_code', 'country_name'],
    'PX2': ['ip_from', 'ip_to', 'proxy_type', 'country_code', 'country_name'],
    'PX3': ['ip_from', 'ip_to', 'proxy_type', 'country_code', 'country_name', 'region_name', 'city_name'],
    'PX4': ['ip_from', 'ip_to', 'proxy_type', 'country_code', 'country_name', 'region_name', 'city_name', 'isp'],
    'PX5': ['ip_from', 'ip_to', 'proxy_type', 'country_code', 'country_name', 'region_name', 'city_name', 'isp', 'domain'],
    'PX6': ['ip_from', 'ip_to', 'proxy_type', 'country_code', 'country_name', 'region_name', 'city_name', 'isp', 'domain', 'usage_type'],
    'PX7': ['ip_from', 'ip_to', 'proxy_type', 'country_code', 'country_name', 'region_name', 'city_name', 'isp', 'domain', 'usage_type', 'asn', 'as'],
    'PX8': ['ip_from', 'ip_to', 'proxy_type', 'country_code', 'country_name', 'region_name', 'city_name', 'isp', 'domain', 'usage_type', 'asn', 'as', 'last_seen'],
    'PX9': ['ip_from', 'ip_to', 'proxy_type', 'country_code', 'country_name', 'region_name', 'city_name', 'isp', 'domain', 'usage_type', 'asn', 'as', 'last_seen', 'threat'],
    'PX10': ['ip_from', 'ip_to', 'proxy_type', 'country_code', 'country_name', 'region_name', 'city_name', 'isp', 'domain', 'usage_type', 'asn', 'as', 'last_seen', 'threat'],
    'PX11': ['ip_from', 'ip_to', 'proxy_type', 'country_code', 'country_name', 'region_name', 'city_name', 'isp', 'domain', 'usage_type', 'asn', 'as', 'last_seen', 'threat', 'provider'],
    'PX12': ['ip_from', 'ip_to', 'proxy_type', 'country_code', 'country_name', 'region_name', 'city_name', 'isp', 'domain', 'usage_type', 'asn', 'as', 'last_seen', 'threat', 'provider', 'fraud_score'],
}

# Columns that are written with a non-string Arrow type.
_PARQUET_IP_COLUMNS = ("ip_from", "ip_to")
_PARQUET_FLOAT_COLUMNS = ("latitude", "longitude")
_PARQUET_INT_COLUMNS = ("last_seen", "fraud_score", "elevation")


def _build_parquet_schema(column_names, ip_ver):
    """Build the Arrow schema for the given columns and IP version."""
    # IPv4 numbers fit in uint32; IPv6 numbers are stored as hex strings.
    ip_type = pa.uint32() if ip_ver == 'IPv4' else pa.string()
    fields = []
    for column in column_names:
        if column in _PARQUET_IP_COLUMNS:
            arrow_type = ip_type
        elif column in _PARQUET_FLOAT_COLUMNS:
            arrow_type = pa.float64()
        elif column in _PARQUET_INT_COLUMNS:
            arrow_type = pa.int32()
        else:
            arrow_type = pa.string()
        fields.append(pa.field(column, arrow_type))
    return pa.schema(fields)


def _convert_parquet_chunk(chunk, ip_ver):
    """Cast the all-string columns of one CSV chunk to their target types, in place."""
    for column in _PARQUET_IP_COLUMNS:
        if ip_ver == 'IPv4':
            chunk[column] = pd.to_numeric(chunk[column], errors="coerce").astype("uint32")
        else:
            # 128-bit numbers do not fit any Arrow integer type: store them as
            # zero-padded 32-character lowercase hex so string order == numeric order.
            chunk[column] = chunk[column].apply(lambda value: format(int(value), '032x'))
    for column in _PARQUET_FLOAT_COLUMNS:
        if column in chunk.columns:
            chunk[column] = pd.to_numeric(chunk[column], errors="coerce").astype("float64")
    for column in _PARQUET_INT_COLUMNS:
        if column in chunk.columns:
            # The schema declares these as int32, so the strings read from the
            # CSV must be made numeric first; unparsable cells become missing.
            chunk[column] = pd.to_numeric(chunk[column], errors="coerce")


def csv_to_parquet(input_file, output_file, db_type):
    """Convert an IP2Location / IP2Proxy CSV file into a parquet file.

    The IP version is taken from the first field of the last row of the
    CSV. IPv4 ranges are written as uint32 columns; IPv6 ranges are written
    as 32-character lowercase hex strings. The CSV is processed in chunks
    of 50,000 rows so large databases do not have to fit in memory.

    Args:
        input_file: Path of the header-less source CSV.
        output_file: Path of the parquet file to create.
        db_type: Database package name, 'DB1'..'DB26' or 'PX1'..'PX12'
            (matched case-insensitively); selects the column layout.

    Exits:
        Prints a message and calls ``sys.exit(1)`` when ``db_type`` is
        empty or unknown, when the IP version cannot be determined, or
        when any error occurs while reading or writing.
    """
    column_names = _PARQUET_COLUMNS.get(str(db_type).strip().upper())
    if column_names is None:
        print(f'Invalid db_type value found, the valid value should range from DB1 to DB26 or PX1 to PX12. Your input: {db_type}.')
        sys.exit(1)

    parquet_chunksize = 50_000
    parquet_writer = None
    try:
        # Determine ipv4 or ipv6 based on the last row of the file.
        # dtype=str keeps 128-bit numbers exact instead of letting pandas guess.
        last_line = get_last_row(input_file)
        df_last = pd.read_csv(StringIO(last_line), header=None, dtype=str)
        ip_ver = detect_ip_version_from_number(df_last.iloc[0, 0])
        if ip_ver not in ('IPv4', 'IPv6'):
            # Without this check the ip_from/ip_to columns would be left out
            # of the schema and silently missing from the output.
            raise ValueError(f'Unable to detect the IP version from the last row of {input_file}.')

        schema = _build_parquet_schema(column_names, ip_ver)
        for chunk in pd.read_csv(
            input_file,
            names=column_names,
            header=None,
            chunksize=parquet_chunksize,
            low_memory=True,
            dtype=str  # initially read all as string to control parsing
        ):
            _convert_parquet_chunk(chunk, ip_ver)
            table = pa.Table.from_pandas(chunk, schema=schema, preserve_index=False)

            if parquet_writer is None:
                parquet_writer = pq.ParquetWriter(output_file, table.schema)

            parquet_writer.write_table(table)
    except Exception as e:
        print(f'Unexcepted error occured, will abort now...')
        print(str(e))
        sys.exit(1)
    finally:
        # Always release the output file handle, including on the error path.
        if parquet_writer is not None:
            parquet_writer.close()
365+

setup.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
setuptools.setup(
1212
name="ip2location-python-csv-converter",
13-
version="1.2.7",
13+
version="1.3.0",
1414
description="Python script to converts IP2Location CSV database into IP range or CIDR format.",
1515
long_description_content_type="text/markdown",
1616
long_description=long_description,
@@ -23,7 +23,7 @@
2323
project_urls={
2424
'Official Website': 'https://www.ip2location.com',
2525
},
26-
python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*',
26+
python_requires='>=3.5, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*',
2727
entry_points = {
2828
'console_scripts': ['ip2location-csv-converter=ip2location_csv_converter.commandline:main'],
2929
},
@@ -32,7 +32,6 @@
3232
"Development Status :: 5 - Production/Stable",
3333
"Intended Audience :: Developers",
3434
"Topic :: Utilities",
35-
'Programming Language :: Python :: 2.7',
3635
"Programming Language :: Python :: 3.5",
3736
"Programming Language :: Python :: 3.6",
3837
"Programming Language :: Python :: 3.7",

0 commit comments

Comments
 (0)