-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparquet_duplicated_rows.py
More file actions
45 lines (37 loc) · 1.13 KB
/
parquet_duplicated_rows.py
File metadata and controls
45 lines (37 loc) · 1.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import sys
import importlib
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
# Third-party packages this script needs at runtime.
REQUIRED_LIBRARIES = ['pandas', 'pyarrow']


def check_libraries(libraries):
    """Return the first name in *libraries* that cannot be imported.

    Returns ``None`` when every library imports successfully.

    NOTE: pandas and pyarrow are also imported unconditionally at the top of
    this file, so a missing package will normally surface there as an
    ImportError before this check ever runs. The check is kept so the
    friendly message is still produced if those imports are ever made lazy.
    """
    for lib in libraries:
        try:
            importlib.import_module(lib)
        except ImportError:
            return lib
    return None


def load_parquet(filename):
    """Read the parquet file *filename* and return it as a pandas DataFrame.

    The file is opened through a memory map and converted to pandas while
    the map is still open.
    """
    with pa.memory_map(filename, 'r') as source:
        return pq.read_table(source).to_pandas()


def find_duplicate_rows(frames):
    """Concatenate *frames* and return the rows that repeat an earlier row.

    Rows are compared across all columns. With the default behaviour of
    ``DataFrame.duplicated`` the first occurrence of a row is not reported,
    only its later repetitions. The returned index refers to positions in
    the concatenated frame (``ignore_index=True``).
    """
    combined_df = pd.concat(frames, ignore_index=True)
    return combined_df[combined_df.duplicated()]


def main(argv=None):
    """Run the duplicate-row check and return the process exit status.

    *argv* defaults to ``sys.argv``; element 0 is the program name and the
    remaining elements are parquet file paths. Returns 0 on a completed
    check (whether or not duplicates were found) and 1 on a usage error, a
    missing library, or a file that cannot be read.
    """
    if argv is None:
        argv = sys.argv

    # Validate the command line: at least one parquet file is required.
    if len(argv) < 2:
        print("[*] Run:\n\tscript.py <filename1.parquet> [filename2.parquet ...]")
        return 1

    # Verify the required libraries are importable.
    lib = check_libraries(REQUIRED_LIBRARIES)
    if lib is not None:
        print(f"{lib} no está instalada. Por favor, instálala antes de ejecutar este script.")
        return 1

    # Load every parquet file into its own DataFrame.
    dfs = []
    for filename in argv[1:]:
        try:
            dfs.append(load_parquet(filename))
        except (OSError, pa.ArrowException) as exc:
            # Report which file failed instead of dumping a raw traceback.
            print(f"No se pudo leer '{filename}': {exc}", file=sys.stderr)
            return 1

    # Check for duplicates using all columns of the combined data.
    duplicates = find_duplicate_rows(dfs)

    # Report the result, listing the duplicated rows when there are any.
    if not duplicates.empty:
        print("Filas duplicadas encontradas:")
        print(duplicates)
    else:
        print("No se encontraron filas duplicadas.")
    return 0


if __name__ == "__main__":
    sys.exit(main())