TinyML-for-Fault-Detection-in-Photovoltaic-Systems/Recoleccion.py at main · ComitNetLab/TinyML-for-Fault-Detection-in-Photovoltaic-Systems · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# Fusionador de datasets para TinyML

import os, glob
import pandas as pd


# Input root, output root, and the folder that receives the merged dataset.
BASE_DIR = "Datasets"
BASE_DIROU = "DatasetsFinal"
OUT_DIR = os.path.join(BASE_DIROU, "processed")
os.makedirs(OUT_DIR, exist_ok=True)

# Collect every CSV/Excel file below BASE_DIR, at any depth, one pattern
# after the other.
patterns = ["**/*.csv", "**/*.xlsx", "**/*.xls"]
file_list = [
    found
    for pattern in patterns
    for found in glob.glob(os.path.join(BASE_DIR, pattern), recursive=True)
]

# Nothing to merge: stop here with an explicit error.
if not file_list:
    raise FileNotFoundError(f"No se encontraron archivos CSV/XLSX/XLS en {BASE_DIR}")

print(f"Encontrados {len(file_list)} archivos para fusionar.")

#  Utilidades de limpieza
def to_numeric_safely(s):
    """Coerce a Series to numeric, tolerating decimal commas and stray symbols.

    A Series that already has a numeric (or boolean) dtype is returned
    unchanged: sending it through text adds nothing, and it would turn
    booleans into NaN because "True"/"False" are stripped down to "e".

    Any other Series is converted to text, commas are replaced by dots, and
    every character that is not a digit, dot, sign or exponent marker is
    removed before parsing. Values that still cannot be parsed become NaN.

    Known limits of this cleaning:
      * a comma used as thousands separator ("1,234.56") ends up with two
        dots and is therefore parsed as NaN;
      * the letters "e"/"E" are kept for scientific notation, so text that
        contains them (e.g. "10 percent") may fail to parse and become NaN.

    Args:
        s: pandas Series to convert.

    Returns:
        The same Series if it is already numeric, otherwise a new numeric
        Series with NaN where a value could not be parsed.
    """
    if pd.api.types.is_numeric_dtype(s):
        return s

    cleaned = (
        s.astype(str)
        .str.replace(",", ".", regex=False)  # decimal comma -> decimal point
        .str.replace(r"[^\d\.\-eE+]", "", regex=True)  # drop units/symbols
    )
    return pd.to_numeric(cleaned, errors="coerce")

def read_any(path):
    """Load one CSV/Excel file and locate (or create) its time column.

    Column names are normalised to stripped, lower-case strings. The first
    column (in file order) named one of "measured_on", "timestamp", "time",
    "date" or "datetime" is parsed as datetime, with unparseable values
    becoming NaT. If no such column exists, an all-NaT "timestamp" column is
    inserted at position 0. A "__source_file__" column holding the file's
    base name is appended.

    Args:
        path: path to a .csv file; any other extension is read as Excel.

    Returns:
        A tuple (df, tcol): the loaded DataFrame and the name of its time
        column.
    """
    ext = os.path.splitext(path)[1].lower()
    if ext == ".csv":
        # Plain read; datetime parsing of the time column happens below.
        df = pd.read_csv(path)
    else:
        # .xlsx is read with openpyxl; for other extensions pandas picks
        # the engine itself.
        df = pd.read_excel(path, engine="openpyxl" if ext == ".xlsx" else None)
    # Labels are not guaranteed to be strings (e.g. a numeric header cell in
    # a spreadsheet), so cast before normalising instead of failing on
    # .strip().
    df.columns = [str(c).strip().lower() for c in df.columns]
    # Pick the most likely time column among the known names.
    time_candidates = [c for c in df.columns if c in ("measured_on","timestamp","time","date","datetime")]
    if time_candidates:
        tcol = time_candidates[0]
        df[tcol] = pd.to_datetime(df[tcol], errors="coerce", utc=False)
    else:
        # No time column: create an empty one up front (it can be filled in
        # manually later).
        tcol = "timestamp"
        df[tcol] = pd.NaT
        df.insert(0, tcol, df.pop(tcol))
    # Record which file each row came from.
    df["__source_file__"] = os.path.basename(path)
    return df, tcol

# Two-pass merge onto a union schema: pass 1 loads every file and records the
# columns it contributes; pass 2 aligns each frame to the combined column list.
dfs = []
tcol_name = None  # canonical time column: the one reported for the first file

# Pass 1: read the files in sorted order and learn the full schema.
# A dict is used as an ordered set so columns keep first-seen order; building
# the list from a plain set can change the output column order between runs.
seen_cols = {}
tmp_dfs = []
for fp in sorted(file_list):
    df, tcol = read_any(fp)
    if tcol_name is None:
        tcol_name = tcol
    if tcol_name not in df.columns:
        # This file names its time column differently (or received the
        # placeholder): fold it into the canonical column so its timestamps
        # are not sent through the numeric coercion below, which would turn
        # them into NaN.
        df = df.rename(columns={tcol: tcol_name})
    elif tcol != tcol_name:
        # The canonical column exists here but read_any parsed a different
        # candidate; parse this one too so the merged column stays datetime.
        df[tcol_name] = pd.to_datetime(df[tcol_name], errors="coerce")
    tmp_dfs.append((df, tcol_name))
    seen_cols.update(dict.fromkeys(df.columns))

# Pass 2: reindex every frame to the union schema (missing columns become
# NaN) and coerce all non-time, non-metadata columns to numeric.
all_cols = list(seen_cols)
for df, _ in tmp_dfs:
    df = df.reindex(columns=all_cols)
    for c in df.columns:
        if c in (tcol_name, "__source_file__"):
            continue
        # Values that cannot be parsed as numbers become NaN. If the
        # conversion itself fails, keep the column as it is but say so
        # instead of hiding the problem.
        try:
            df[c] = to_numeric_safely(df[c])
        except Exception as exc:
            print(f"Aviso: no se pudo convertir la columna {c!r} a numérico ({exc}); se deja sin cambios.")
    dfs.append(df)

# Stack all aligned frames into one table and drop rows that are exact copies.
full = pd.concat(dfs, axis=0, ignore_index=True).drop_duplicates()

# Defensive: make sure the time column is present before it is used.
if tcol_name not in full.columns:
    full[tcol_name] = pd.NaT

# Sort chronologically, but only when at least one timestamp is valid.
all_times_missing = full[tcol_name].isna().all()
if not all_times_missing:
    full = full.sort_values(by=tcol_name)

# Write the merged dataset (without the index) and report where it went.
out_csv = os.path.join(OUT_DIR, "pv_merged.csv")
full.to_csv(out_csv, index=False)
print(f"\nArchivo fusionado guardado en:\n  {out_csv}")


# Summary of the merged dataset.
n_rows, n_cols = full.shape
# Time span covered by the data, or placeholders when no timestamp is valid.
if full[tcol_name].notna().any():
    time_coverage = (full[tcol_name].min(), full[tcol_name].max())
else:
    time_coverage = ("N/A", "N/A")

print("\n====== RESUMEN DEL DATASET FUSIONADO ======")
print(f"Filas: {n_rows:,} | Columnas: {n_cols}")
print(f"Columna temporal: {tcol_name}")
print(f"Cobertura temporal: {time_coverage[0]}  →  {time_coverage[1]}")
print("\nPrimeras columnas:", list(full.columns[:10]))
print("Últimas columnas:", list(full.columns[-10:]))

# Show up to 5 random rows (fixed seed, so the sample is repeatable).
# `display` is only available inside IPython/Jupyter; when this file runs as
# a plain script, fall back to print instead of ending in a NameError.
sample_rows = full.sample(min(5, len(full)), random_state=42)
try:
    show = display
except NameError:
    show = print
show(sample_rows)