Add new -parquet option

ip2location-com · ip2location-com · commit cf95c76d7ff6 · 2025-06-20T08:32:52.000+08:00
diff --git a/.gitignore b/.gitignore
@@ -8,4 +8,5 @@ dist/
 htmlcov/
 .tox/
 docs/_build/
-*.egg-info
+*.egg-info
+*.parquet
diff --git a/README.md b/README.md
@@ -47,6 +47,7 @@ ip2location-csv-converter [-range | -cidr | -hex] [-replace | -append] INPUT_FIL
 | -hex      | IP numbers will be converted into hexadecimal format. (auto padding)        |
 | -hex4     | IP numbers will be converted into hexadecimal format. (pad IPv4)        |
 | -hex6     | IP numbers will be converted into hexadecimal format. (pad IPv6)        |
+| -parquet  | Convert IP2Location/IP2Proxy CSV file to a custom parquet file. |
 | -replace  | The IP numbers in will be replaced to the selected format.   |
 | -append   | The converted format will be appended after the IP numbers field. |
 
@@ -214,6 +215,47 @@ Output:
 
 You can use this converter for a custom input file provided the input is in CSV format, with the first and second field contain the **ip from** and **ip to** information in numeric format.
 
+## Parquet conversion
+
+You can convert any IP2Location or IP2Proxy CSV file to a parquet file using this converter. The command will be:
+
+```
+ip2location-csv-converter -parquet <database_type> <input_csv_filename> <output_parquet_filename>
+```
+
+You can get the database type of the IP2Location or IP2Proxy CSV file from the links below:
+- [https://www.ip2location.com/database/ip2location](https://www.ip2location.com/database/ip2location): From DB1 to DB26.
+- [https://www.ip2location.com/database/ip2proxy](https://www.ip2location.com/database/ip2proxy): From PX1 to PX12.
+
+For IPv6, due to the current limitation of the Decimal data type in parquet, the converter encodes each IPv6 number as a zero-padded, 32-character lowercase hex string and stores it as varchar. Hence, you will need to convert the IPv6 address the same way at query time.
+
+Below is an example demonstrating the pre-conversion of an IPv6 address before querying:
+
+```python
+import ipaddress
+
+# Example IPv6 address
+ipv6_addr = "2001:db8::1"
+
+# Convert to integer
+ipv6_int = int(ipaddress.IPv6Address(ipv6_addr))
+
+# Convert to zero-padded 32-character lowercase hex string
+ipv6_hex = format(ipv6_int, "032x")
+
+```
+
+To query using the hex IPv6 address, the code can look like this:
+
+```python
+import duckdb
+
+result = duckdb.query(f"""
+    SELECT * FROM '<ipv6_parquet_filename>'
+    WHERE ip_from <= '{ipv6_hex}' AND ip_to >= '{ipv6_hex}'
+""").to_df()
+```
+
 ## Support
 
 URL: [https://www.ip2location.com](https://www.ip2location.com/)
diff --git a/ip2location_csv_converter/commandline.py b/ip2location_csv_converter/commandline.py
@@ -1,13 +1,13 @@
 import os, re, sys, time
-from ip2location_csv_converter.ip2location_csv_converter import convert_to_csv, check_data_validity
+from ip2location_csv_converter.ip2location_csv_converter import convert_to_csv, check_data_validity, csv_to_parquet
 
 regex1 = r"^\-(range|cidr|hex)$"
 regex2 = r"^\-(replace|append)$"
 regex3 = r"^\-(help)$"
 
 def print_usage():
     print(
-"Usage: ip2location-csv-converter [-range | -cidr | -hex] [-replace | -append] INPUT_FILE OUTPUT_FILE\n"
+"Usage: ip2location-csv-converter [-range | -cidr | -hex | -parquet] [-replace | -append] [database_type] INPUT_FILE OUTPUT_FILE\n"
 "\n"
 "  -range\n"
 "  IP numbers will be converted into the first IP address and last IP address in the range.\n"
@@ -27,8 +27,8 @@ def print_usage():
 "  -replace\n"
 "  The IP numbers in will be replaced to the selected format.\n"
 "\n"
-"  -append\n"
-"  The converted format will be appended after the IP numbers field.\n"
+"  -parquet\n"
+"  Convert a IP2Location/IP2Proxy CSV to a parquet file.\n"
 "\n"
 "  -help\n"
 "  Display this guide.\n"
@@ -62,15 +62,24 @@ def main():
             if (check_data_validity(input_file)  is False):
                 print ("Please make sure the columns had comma as separator.")
                 sys.exit(1)
-        if (re.search(regex1, param1) != None):
-            conversion_mode = re.findall(regex1, param1)[0]
-        elif (re.search(regex2, param1) != None):
-            write_mode = re.findall(regex2, param1)[0]
-        if (re.search(regex1, param2) != None):
-            conversion_mode = re.findall(regex1, param2)[0]
-        elif (re.search(regex2, param2) != None):
-            write_mode = re.findall(regex2, param2)[0]
-        convert_to_csv(input_file, output_file, conversion_mode, write_mode)
+        
+        if param1 == '-parquet':
+            if param2 == '':
+                print ("Please provide the database type of the CSV file.")
+                sys.exit(1)
+            print(f'Converting {input_file} to {output_file} now...')
+            csv_to_parquet(input_file, output_file, param2)
+            print(f'Conversion done.')
+        else:
+            if (re.search(regex1, param1) != None):
+                conversion_mode = re.findall(regex1, param1)[0]
+            elif (re.search(regex2, param1) != None):
+                write_mode = re.findall(regex2, param1)[0]
+            if (re.search(regex1, param2) != None):
+                conversion_mode = re.findall(regex1, param2)[0]
+            elif (re.search(regex2, param2) != None):
+                write_mode = re.findall(regex2, param2)[0]
+            convert_to_csv(input_file, output_file, conversion_mode, write_mode)
 
     elif ((len(sys.argv) == 2) and (re.search(regex3, sys.argv[1]) != None)):
         print_usage()
diff --git a/ip2location_csv_converter/ip2location_csv_converter.py b/ip2location_csv_converter/ip2location_csv_converter.py
@@ -1,6 +1,11 @@
 import os, sys, re, csv, socket, struct, ipaddress, binascii
 import time
-from io import open
+from io import open, StringIO
+import pandas as pd
+import pyarrow as pa
+import pyarrow.parquet as pq
+from decimal import Decimal
+
 
 conversion_mode = 'range'
 write_mode = 'replace'
@@ -179,3 +184,182 @@ def convert_to_csv(input_file, output_file, conversion_mode, write_mode):
             # Stop the loop if there are no more rows
             if not chunk:
                 break
+
+def get_last_row(file_path):
+    """Return the last non-blank line of a file, decoded as text.
+
+    The file is read backwards in fixed-size blocks, so only its tail is
+    loaded regardless of the file size. Unlike a byte-by-byte backward scan,
+    this also works for empty files, single-line files and files that end
+    with one or more blank lines.
+
+    Args:
+        file_path: Path of the file to inspect.
+
+    Returns:
+        The last non-blank line including its own line terminator (when it
+        has one), or an empty string if the file is empty or contains only
+        line breaks.
+    """
+    block_size = 4096
+    tail = b''
+    with open(file_path, 'rb') as f:
+        f.seek(0, 2)  # jump to the end to learn the file size
+        position = f.tell()
+        while position > 0:
+            step = min(block_size, position)
+            position -= step
+            f.seek(position)
+            tail = f.read(step) + tail
+            # Stop once the tail holds a newline that precedes the last row;
+            # trailing line breaks are ignored so blank last lines are skipped.
+            if b'\n' in tail.rstrip(b'\r\n'):
+                break
+    content = tail.rstrip(b'\r\n')
+    if not content:
+        return ''
+    start = content.rfind(b'\n') + 1
+    # Keep the row's own terminator ("\r\n", "\n" or "\r"), as readline() did.
+    rest = tail[len(content):]
+    terminator = b'\r\n' if rest.startswith(b'\r\n') else rest[:1]
+    return (content[start:] + terminator).decode()
+
+def detect_ip_version_from_number(ip_num):
+    """Classify an IP number as IPv4 or IPv6.
+
+    Args:
+        ip_num: The IP address in numeric form; anything accepted by int().
+
+    Returns:
+        'IPv4' or 'IPv6' according to how the ipaddress module interprets the
+        integer, or 'Invalid' when the conversion raises ValueError.
+    """
+    try:
+        parsed = ipaddress.ip_address(int(ip_num))
+    except ValueError:
+        return 'Invalid'
+    if parsed.version == 4:
+        return 'IPv4'
+    return 'IPv6'
+
+def detect_versions_from_chunk(chunk):
+    """Collect the IP versions present in the first two columns of a chunk.
+
+    Args:
+        chunk: A pandas DataFrame whose columns 0 and 1 hold the start and
+            end IP numbers of each range.
+
+    Returns:
+        A set containing 4 and/or 6. Values that cannot be interpreted as an
+        IP number are ignored, and scanning stops as soon as both versions
+        have been seen.
+    """
+    # Stack the start column on top of the end column so one pass covers both.
+    candidates = pd.concat([chunk.iloc[:, 0], chunk.iloc[:, 1]], ignore_index=True)
+
+    found = set()
+    for raw_value in candidates:
+        try:
+            found.add(ipaddress.ip_address(int(raw_value)).version)
+        except ValueError:
+            # Not a usable IP number; move on to the next value.
+            continue
+        if len(found) > 1:
+            # Both IPv4 and IPv6 detected, nothing more to learn.
+            break
+
+    return found
+
+# Scan the file in chunks
+def check_ip_versions(csv_path):
+    """Abort the program when a CSV file mixes IPv4 and IPv6 numbers.
+
+    Only the first two columns are read, 50,000 rows at a time, and reading
+    stops as soon as both IP versions have been found.
+
+    Args:
+        csv_path: Path of the header-less CSV file to inspect.
+
+    Raises:
+        SystemExit: With status 1 when both IPv4 and IPv6 numbers are found.
+    """
+    reader = pd.read_csv(csv_path, chunksize=50_000, header=None, usecols=[0, 1])
+    versions_found = set()
+    for part in reader:
+        versions_found |= detect_versions_from_chunk(part)
+        if len(versions_found) > 1:
+            break  # both versions already seen, no need to read further
+
+    if versions_found != {4, 6}:
+        return
+
+    print(f'Your csv file {csv_path} contains mixture of IPv4 and IPv6 addresses, which will causing issue when converting to parquet file.')
+    print(f'It is advisable to separate IPv4 and IPv6 addresses into two identical csv file.')
+    sys.exit(1)
+
+# Convert CSV to Parquet
+
+# Column layout of every supported IP2Location (DBn) and IP2Proxy (PXn) CSV
+# package, keyed by the upper-case database type.
+_PARQUET_COLUMN_NAMES = {
+    'DB1': ["ip_from", "ip_to", "country_code", "country_name"],
+    'DB2': ["ip_from", "ip_to", "country_code", "country_name", "isp"],
+    'DB3': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name"],
+    'DB4': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "isp"],
+    'DB5': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude"],
+    'DB6': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "isp"],
+    'DB7': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "isp", "domain"],
+    'DB8': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "isp", "domain"],
+    'DB9': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "zip_code"],
+    'DB10': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "zip_code", "isp", "domain"],
+    'DB11': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "zip_code", "time_zone"],
+    'DB12': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "zip_code", "time_zone", "isp", "domain"],
+    'DB13': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "time_zone", "net_speed"],
+    'DB14': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "zip_code", "time_zone", "isp", "domain", "net_speed"],
+    'DB15': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "zip_code", "time_zone", "idd_code", "area_code"],
+    'DB16': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "zip_code", "time_zone", "isp", "domain", "net_speed", "idd_code", "area_code"],
+    'DB17': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "time_zone", "net_speed", "weather_station_code", "weather_station_name"],
+    'DB18': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "zip_code", "time_zone", "isp", "domain", "net_speed", "idd_code", "area_code", "weather_station_code", "weather_station_name"],
+    'DB19': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "isp", "domain", "mcc", "mnc", "mobile_brand"],
+    'DB20': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "zip_code", "time_zone", "isp", "domain", "net_speed", "idd_code", "area_code", "weather_station_code", "weather_station_name", "mcc", "mnc", "mobile_brand"],
+    'DB21': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "zip_code", "time_zone", "idd_code", "area_code", "elevation"],
+    'DB22': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "zip_code", "time_zone", "isp", "domain", "net_speed", "idd_code", "area_code", "weather_station_code", "weather_station_name", "mcc", "mnc", "mobile_brand", "elevation"],
+    'DB23': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "isp", "domain", "mcc", "mnc", "mobile_brand", "usage_type"],
+    'DB24': ["ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name", "latitude", "longitude", "zip_code", "time_zone", "isp", "domain", "net_speed", "idd_code", "area_code", "weather_station_code", "weather_station_name", "mcc", "mnc", "mobile_brand", "elevation", "usage_type"],
+    'DB25': [
+        "ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name",
+        "latitude", "longitude", "zip_code", "time_zone", "isp", "domain", "net_speed",
+        "idd_code", "area_code", "weather_station_code", "weather_station_name",
+        "mcc", "mnc", "mobile_brand", "elevation", "usage_type", "address_type",
+        "category",
+    ],
+    'DB26': [
+        "ip_from", "ip_to", "country_code", "country_name", "region_name", "city_name",
+        "latitude", "longitude", "zip_code", "time_zone", "isp", "domain", "net_speed",
+        "idd_code", "area_code", "weather_station_code", "weather_station_name",
+        "mcc", "mnc", "mobile_brand", "elevation", "usage_type", "address_type",
+        "category", "district", "asn", "as_name",
+    ],
+    'PX1': ['ip_from', 'ip_to', 'country_code', 'country_name'],
+    'PX2': ['ip_from', 'ip_to', 'proxy_type', 'country_code', 'country_name'],
+    'PX3': ['ip_from', 'ip_to', 'proxy_type', 'country_code', 'country_name', 'region_name', 'city_name'],
+    'PX4': ['ip_from', 'ip_to', 'proxy_type', 'country_code', 'country_name', 'region_name', 'city_name', 'isp'],
+    'PX5': ['ip_from', 'ip_to', 'proxy_type', 'country_code', 'country_name', 'region_name', 'city_name', 'isp', 'domain'],
+    'PX6': ['ip_from', 'ip_to', 'proxy_type', 'country_code', 'country_name', 'region_name', 'city_name', 'isp', 'domain', 'usage_type'],
+    'PX7': ['ip_from', 'ip_to', 'proxy_type', 'country_code', 'country_name', 'region_name', 'city_name', 'isp', 'domain', 'usage_type', 'asn', 'as'],
+    'PX8': ['ip_from', 'ip_to', 'proxy_type', 'country_code', 'country_name', 'region_name', 'city_name', 'isp', 'domain', 'usage_type', 'asn', 'as', 'last_seen'],
+    'PX9': ['ip_from', 'ip_to', 'proxy_type', 'country_code', 'country_name', 'region_name', 'city_name', 'isp', 'domain', 'usage_type', 'asn', 'as', 'last_seen', 'threat'],
+    'PX10': ['ip_from', 'ip_to', 'proxy_type', 'country_code', 'country_name', 'region_name', 'city_name', 'isp', 'domain', 'usage_type', 'asn', 'as', 'last_seen', 'threat'],
+    'PX11': ['ip_from', 'ip_to', 'proxy_type', 'country_code', 'country_name', 'region_name', 'city_name', 'isp', 'domain', 'usage_type', 'asn', 'as', 'last_seen', 'threat', 'provider'],
+    'PX12': ['ip_from', 'ip_to', 'proxy_type', 'country_code', 'country_name', 'region_name', 'city_name', 'isp', 'domain', 'usage_type', 'asn', 'as', 'last_seen', 'threat', 'provider', 'fraud_score'],
+}
+
+# Columns written as numbers; every other non-IP column is written as string.
+_PARQUET_IP_COLUMNS = ("ip_from", "ip_to")
+_PARQUET_FLOAT_COLUMNS = ("latitude", "longitude")
+_PARQUET_INT_COLUMNS = ("last_seen", "fraud_score", "elevation")
+
+
+def _build_parquet_schema(column_names, ip_ver):
+    """Return the pyarrow schema for the given columns and IP version."""
+    # IPv4 numbers fit in uint32; IPv6 numbers are stored as hex strings.
+    ip_type = pa.uint32() if ip_ver == 'IPv4' else pa.string()
+    fields = []
+    for column in column_names:
+        if column in _PARQUET_IP_COLUMNS:
+            fields.append(pa.field(column, ip_type))
+        elif column in _PARQUET_FLOAT_COLUMNS:
+            fields.append(pa.field(column, pa.float64()))
+        elif column in _PARQUET_INT_COLUMNS:
+            fields.append(pa.field(column, pa.int32()))
+        else:
+            fields.append(pa.field(column, pa.string()))
+    return pa.schema(fields)
+
+
+def _convert_parquet_chunk(chunk, ip_ver):
+    """Cast the all-string columns of one CSV chunk to their parquet types, in place."""
+    for column in _PARQUET_IP_COLUMNS:
+        if ip_ver == 'IPv4':
+            chunk[column] = pd.to_numeric(chunk[column], errors="coerce").astype("uint32")
+        else:
+            # Zero-padded 32-character lowercase hex keeps string ordering
+            # identical to numeric ordering.
+            chunk[column] = chunk[column].apply(lambda value: format(int(value), '032x'))
+    for column in _PARQUET_FLOAT_COLUMNS:
+        if column in chunk.columns:
+            chunk[column] = pd.to_numeric(chunk[column], errors="coerce").astype("float64")
+    for column in _PARQUET_INT_COLUMNS:
+        if column in chunk.columns:
+            # The schema declares these as int32, so they must not stay strings.
+            chunk[column] = pd.to_numeric(chunk[column], errors="coerce")
+
+
+def csv_to_parquet(input_file, output_file, db_type):
+    """Convert an IP2Location/IP2Proxy CSV file to a parquet file.
+
+    The IP version is taken from the first field of the last row of the CSV.
+    IPv4 numbers are written as uint32; IPv6 numbers are written as
+    zero-padded 32-character lowercase hex strings. The CSV is processed in
+    chunks of 50,000 rows.
+
+    Args:
+        input_file: Path of the header-less source CSV file.
+        output_file: Path of the parquet file to write.
+        db_type: Database type of the CSV, DB1..DB26 or PX1..PX12
+            (case-insensitive).
+
+    Raises:
+        SystemExit: With status 1 when db_type is unknown or the conversion
+            fails; the reason is printed first.
+    """
+    column_names = _PARQUET_COLUMN_NAMES.get(str(db_type).strip().upper())
+    if column_names is None:
+        print(f'Invalid db_type value found, the valid value should be in the range DB1 to DB26 or PX1 to PX12. Your input: {db_type}.')
+        sys.exit(1)
+
+    parquet_chunksize = 50_000
+    parquet_writer = None
+    try:
+        # Determine ipv4 or ipv6 based on the last row of the file
+        last_line = get_last_row(input_file)
+        if not last_line.strip():
+            raise ValueError(f'{input_file} does not contain any data.')
+        # Read as text so large IPv6 numbers are not rounded by type inference.
+        ip_value = pd.read_csv(StringIO(last_line), header=None, dtype=str).iloc[0, 0]
+        ip_ver = detect_ip_version_from_number(ip_value)
+        if ip_ver == 'Invalid':
+            raise ValueError(f'Unable to detect the IP version from the last row value: {ip_value}.')
+
+        schema = _build_parquet_schema(column_names, ip_ver)
+        for chunk in pd.read_csv(
+            input_file,
+            names=column_names,
+            header=None,
+            chunksize=parquet_chunksize,
+            low_memory=True,
+            dtype=str,  # initially read all as string to control parsing
+            keep_default_na=False,  # keep literal values such as country code "NA"
+        ):
+            _convert_parquet_chunk(chunk, ip_ver)
+            table = pa.Table.from_pandas(chunk, schema=schema, preserve_index=False)
+
+            if parquet_writer is None:
+                parquet_writer = pq.ParquetWriter(output_file, table.schema)
+
+            parquet_writer.write_table(table)
+    except Exception as e:
+        print(f'Unexcepted error occured, will abort now...')
+        print(str(e))
+        sys.exit(1)
+    finally:
+        # Always release the output file handle, including on failure.
+        if parquet_writer is not None:
+            parquet_writer.close()
+
diff --git a/setup.py b/setup.py
@@ -10,7 +10,7 @@
 
 setuptools.setup(
     name="ip2location-python-csv-converter",
-    version="1.2.7",
+    version="1.3.0",
     description="Python script to converts IP2Location CSV database into IP range or CIDR format.",
     long_description_content_type="text/markdown",
     long_description=long_description,
@@ -23,7 +23,7 @@
     project_urls={
         'Official Website': 'https://www.ip2location.com',
     },
-    python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*',
+    python_requires='>=3.5, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*',
     entry_points = {
         'console_scripts': ['ip2location-csv-converter=ip2location_csv_converter.commandline:main'],
     },
@@ -32,7 +32,6 @@
         "Development Status :: 5 - Production/Stable",
         "Intended Audience :: Developers",
         "Topic :: Utilities",
-        'Programming Language :: Python :: 2.7',
         "Programming Language :: Python :: 3.5",
         "Programming Language :: Python :: 3.6",
         "Programming Language :: Python :: 3.7",