Skip to content

Commit cf18f04

Browse files
Fixing simulation results
1 parent: 8207a80 · commit: cf18f04

1 file changed

Lines changed: 85 additions & 50 deletions

File tree

results/deltas_Real_WWW_networkit.ipynb

Lines changed: 85 additions & 50 deletions
Original file line number | Diff line number | Diff line change
@@ -8,12 +8,12 @@
88
},
99
"outputs": [],
1010
"source": [
11-
"# Google Drive Folder-Level PageRank Analysis\n",
12-
"# Processes all CSV pairs in a mounted Google Drive folder\n",
11+
"# Google Drive Folder-Level PageRank Analysis with REAL WWW Data\n",
12+
"# Uses FineWeb dataset CSV with FROM and TO columns as WWW graph\n",
1313
"# Calculates overall averages across all files in the strategy\n",
1414
"\n",
1515
"# === INSTALLATION CELL (Run first) ===\n",
16-
"# !pip install networkit pandas numpy\n",
16+
"!pip install networkit pandas numpy\n",
1717
"\n",
1818
"# === MOUNT GOOGLE DRIVE ===\n",
1919
"from google.colab import drive\n",
@@ -33,12 +33,16 @@
3333
"# USER CONFIGURATION\n",
3434
"# ============================================\n",
3535
"BASELINE_PATH = \"/content/drive/MyDrive/WebKnoGraph/results/link_graph_edges.csv\"\n",
36-
"COMPARISON_FOLDER = \"/content/drive/MyDrive/WebKnoGraph/results/expert_led/low_batches/\"\n",
37-
"NUM_SIMULATIONS = 100\n",
36+
"COMPARISON_FOLDER = (\n",
37+
" \"/content/drive/MyDrive/WebKnoGraph/results/automatic_led/high_batches/\"\n",
38+
")\n",
39+
"\n",
40+
"# NEW: Path to FineWeb WWW graph CSV\n",
41+
"FINEWEB_WWW_PATH = \"/content/drive/MyDrive/WebKnoGraph/results/fineweb_500k_pages.csv\"\n",
42+
"\n",
43+
"NUM_SIMULATIONS = 10\n",
3844
"\n",
3945
"# Simulation Parameters\n",
40-
"TOTAL_NODES_WWW = 100000\n",
41-
"EDGES_PER_NEW_NODE = 2\n",
4246
"MIN_CONNECTIONS = 5\n",
4347
"MAX_CONNECTIONS = 50\n",
4448
"PAGERANK_TOLERANCE = 1e-6\n",
@@ -50,15 +54,17 @@
5054
"_www_graph_cache = None\n",
5155
"\n",
5256
"\n",
53-
"def load_graph_from_csv_networkit(file_path):\n",
57+
"def load_graph_from_csv_networkit(file_path, graph_name=\"graph\"):\n",
5458
" \"\"\"Load graph from CSV file.\"\"\"\n",
5559
" try:\n",
60+
" print(f\" Loading {graph_name} from {os.path.basename(file_path)}...\")\n",
5661
" df = pd.read_csv(file_path, usecols=[\"FROM\", \"TO\"])\n",
5762
" df = df.dropna()\n",
5863
" df[\"FROM\"] = df[\"FROM\"].astype(str)\n",
5964
" df[\"TO\"] = df[\"TO\"].astype(str)\n",
6065
"\n",
6166
" if len(df) == 0:\n",
67+
" print(f\" ERROR: No valid edges found in {file_path}\")\n",
6268
" return None, None, None\n",
6369
"\n",
6470
" from_urls = df[\"FROM\"].values\n",
@@ -70,77 +76,88 @@
7076
" for src_url, tgt_url in zip(from_urls, to_urls):\n",
7177
" g.addEdge(url_to_idx[src_url], url_to_idx[tgt_url])\n",
7278
"\n",
79+
" print(f\" Loaded: {len(all_urls):,} nodes, {len(df):,} edges\")\n",
7380
" return g, all_urls, url_to_idx\n",
7481
" except Exception as e:\n",
75-
" print(f\" Error loading {file_path}: {str(e)}\")\n",
82+
" print(f\" ERROR loading {file_path}: {str(e)}\")\n",
7683
" return None, None, None\n",
7784
"\n",
7885
"\n",
79-
"def create_www_graph_networkit(n_nodes, m_edges, seed=42):\n",
80-
" \"\"\"Create WWW graph with caching.\"\"\"\n",
86+
"def load_www_graph_networkit(www_csv_path):\n",
87+
" \"\"\"Load REAL WWW graph from FineWeb dataset with caching.\"\"\"\n",
8188
" global _www_graph_cache\n",
8289
"\n",
83-
" cache_key = (n_nodes, m_edges, seed)\n",
84-
" if _www_graph_cache is not None and _www_graph_cache[0] == cache_key:\n",
90+
" if _www_graph_cache is not None and _www_graph_cache[0] == www_csv_path:\n",
91+
" print(\" Using cached WWW graph\")\n",
8592
" cached_graph = _www_graph_cache[1]\n",
8693
" new_graph = nk.Graph(\n",
8794
" n=cached_graph.numberOfNodes(), weighted=False, directed=True\n",
8895
" )\n",
8996
" for u, v in cached_graph.iterEdges():\n",
9097
" new_graph.addEdge(u, v)\n",
91-
" return new_graph\n",
98+
" return new_graph, _www_graph_cache[2]\n",
9299
"\n",
93-
" nk.setSeed(seed, False)\n",
94-
" generator = nk.generators.BarabasiAlbertGenerator(\n",
95-
" k=m_edges, nMax=n_nodes, n0=m_edges\n",
100+
" print(\"\\nLoading REAL WWW graph from FineWeb dataset...\")\n",
101+
" www_graph, www_nodes, www_url_mapping = load_graph_from_csv_networkit(\n",
102+
" www_csv_path, graph_name=\"WWW graph\"\n",
96103
" )\n",
97-
" www_graph = generator.generate()\n",
98104
"\n",
105+
" if www_graph is None:\n",
106+
" raise ValueError(f\"Failed to load WWW graph from {www_csv_path}\")\n",
107+
"\n",
108+
" # Cache the graph\n",
99109
" cached_graph = nk.Graph(n=www_graph.numberOfNodes(), weighted=False, directed=True)\n",
100110
" for u, v in www_graph.iterEdges():\n",
101111
" cached_graph.addEdge(u, v)\n",
102-
" _www_graph_cache = (cache_key, cached_graph)\n",
103-
" return www_graph\n",
112+
" _www_graph_cache = (www_csv_path, cached_graph, www_nodes)\n",
113+
"\n",
114+
" print(f\" WWW graph cached successfully\")\n",
115+
" return www_graph, www_nodes\n",
104116
"\n",
105117
"\n",
106118
"def process_configuration_networkit(\n",
107-
" www_graph, kalicube_edges, kalicube_nodes, kalicube_url_mapping\n",
119+
" www_graph, www_nodes, kalicube_edges, kalicube_nodes, kalicube_url_mapping\n",
108120
"):\n",
109121
" \"\"\"Process configuration and calculate PageRank.\"\"\"\n",
110122
" kalicube_offset = www_graph.numberOfNodes()\n",
111123
" n_kalicube = len(kalicube_nodes)\n",
124+
" n_www = www_graph.numberOfNodes()\n",
112125
"\n",
113-
" merged_graph = nk.Graph(n=www_graph.numberOfNodes(), weighted=False, directed=True)\n",
126+
" # Create merged graph\n",
127+
" merged_graph = nk.Graph(n=n_www, weighted=False, directed=True)\n",
114128
" for u, v in www_graph.iterEdges():\n",
115129
" merged_graph.addEdge(u, v)\n",
116130
"\n",
131+
" # Add Kalicube nodes\n",
117132
" for _ in range(n_kalicube):\n",
118133
" merged_graph.addNode()\n",
119134
"\n",
135+
" # Add Kalicube internal edges\n",
120136
" if kalicube_edges:\n",
121137
" for src, tgt in kalicube_edges:\n",
122138
" merged_graph.addEdge(src + kalicube_offset, tgt + kalicube_offset)\n",
123139
"\n",
124-
" n_www_sample = min(MIN_CONNECTIONS, TOTAL_NODES_WWW)\n",
125-
" n_kalicube_sample = min(MIN_CONNECTIONS, len(kalicube_nodes))\n",
140+
" # Connect WWW to Kalicube\n",
141+
" n_www_sample = min(MAX_CONNECTIONS, n_www)\n",
142+
" n_kalicube_sample = min(MAX_CONNECTIONS, n_kalicube)\n",
126143
"\n",
127-
" www_nodes_sample = np.random.choice(\n",
128-
" TOTAL_NODES_WWW, size=n_www_sample, replace=False\n",
129-
" )\n",
144+
" www_nodes_sample = np.random.choice(n_www, size=n_www_sample, replace=False)\n",
130145
" kalicube_indices = np.random.choice(\n",
131-
" len(kalicube_nodes), size=n_kalicube_sample, replace=False\n",
146+
" n_kalicube, size=n_kalicube_sample, replace=False\n",
132147
" )\n",
133148
"\n",
134149
" for www_node_id, kalicube_idx in zip(www_nodes_sample, kalicube_indices):\n",
135150
" kalicube_node_id = kalicube_idx + kalicube_offset\n",
136151
" merged_graph.addEdge(www_node_id, kalicube_node_id)\n",
137152
"\n",
153+
" # Calculate PageRank\n",
138154
" pagerank_algo = nk.centrality.PageRank(\n",
139155
" merged_graph, damp=0.85, tol=PAGERANK_TOLERANCE\n",
140156
" )\n",
141157
" pagerank_algo.run()\n",
142158
" pagerank_scores = pagerank_algo.scores()\n",
143159
"\n",
160+
" # Extract Kalicube PageRank scores\n",
144161
" pagerank_dict = {}\n",
145162
" for i, url in enumerate(kalicube_nodes):\n",
146163
" vertex_id = i + kalicube_offset\n",
@@ -151,6 +168,8 @@
151168
"\n",
152169
"def run_single_simulation_networkit(\n",
153170
" simulation_id,\n",
171+
" www_graph,\n",
172+
" www_nodes,\n",
154173
" kalicube_old_edges,\n",
155174
" kalicube_new_edges,\n",
156175
" kalicube_nodes_old,\n",
@@ -163,18 +182,25 @@
163182
" np.random.seed(sim_seed)\n",
164183
" random.seed(sim_seed)\n",
165184
"\n",
166-
" www_graph = create_www_graph_networkit(\n",
167-
" TOTAL_NODES_WWW, EDGES_PER_NEW_NODE, sim_seed\n",
168-
" )\n",
169-
"\n",
185+
" # Calculate PageRank for old configuration\n",
170186
" pagerank_old_dict = process_configuration_networkit(\n",
171-
" www_graph, kalicube_old_edges, kalicube_nodes_old, kalicube_url_mapping_old\n",
187+
" www_graph,\n",
188+
" www_nodes,\n",
189+
" kalicube_old_edges,\n",
190+
" kalicube_nodes_old,\n",
191+
" kalicube_url_mapping_old,\n",
172192
" )\n",
173193
"\n",
194+
" # Calculate PageRank for new configuration\n",
174195
" pagerank_new_dict = process_configuration_networkit(\n",
175-
" www_graph, kalicube_new_edges, kalicube_nodes_new, kalicube_url_mapping_new\n",
196+
" www_graph,\n",
197+
" www_nodes,\n",
198+
" kalicube_new_edges,\n",
199+
" kalicube_nodes_new,\n",
200+
" kalicube_url_mapping_new,\n",
176201
" )\n",
177202
"\n",
203+
" # Compare results\n",
178204
" old_urls = set(pagerank_old_dict.keys())\n",
179205
" new_urls = set(pagerank_new_dict.keys())\n",
180206
" common_urls = old_urls & new_urls\n",
@@ -200,19 +226,19 @@
200226
" }\n",
201227
"\n",
202228
"\n",
203-
"def analyze_csv_pair(old_csv_path, new_csv_path):\n",
229+
"def analyze_csv_pair(www_graph, www_nodes, old_csv_path, new_csv_path):\n",
204230
" \"\"\"Analyze a pair of CSV files.\"\"\"\n",
205231
" print(f\"\\nAnalyzing: {os.path.basename(new_csv_path)}\")\n",
206232
"\n",
207233
" kalicube_graph_old, kalicube_nodes_old, kalicube_url_mapping_old = (\n",
208-
" load_graph_from_csv_networkit(old_csv_path)\n",
234+
" load_graph_from_csv_networkit(old_csv_path, \"baseline Kalicube\")\n",
209235
" )\n",
210236
" if kalicube_graph_old is None:\n",
211237
" print(f\" Failed to load old graph\")\n",
212238
" return None\n",
213239
"\n",
214240
" kalicube_graph_new, kalicube_nodes_new, kalicube_url_mapping_new = (\n",
215-
" load_graph_from_csv_networkit(new_csv_path)\n",
241+
" load_graph_from_csv_networkit(new_csv_path, \"comparison Kalicube\")\n",
216242
" )\n",
217243
" if kalicube_graph_new is None:\n",
218244
" print(f\" Failed to load new graph\")\n",
@@ -230,6 +256,8 @@
230256
" for sim_id in range(NUM_SIMULATIONS):\n",
231257
" result = run_single_simulation_networkit(\n",
232258
" sim_id,\n",
259+
" www_graph,\n",
260+
" www_nodes,\n",
233261
" kalicube_old_edges,\n",
234262
" kalicube_new_edges,\n",
235263
" kalicube_nodes_old,\n",
@@ -256,7 +284,7 @@
256284
" \"avg_mean_delta_pct\": avg_mean,\n",
257285
" \"avg_min_delta_pct\": avg_min,\n",
258286
" \"avg_max_delta_pct\": avg_max,\n",
259-
" \"sim_results\": sim_results, # Store individual simulation results\n",
287+
" \"sim_results\": sim_results,\n",
260288
" }\n",
261289
"\n",
262290
"\n",
@@ -283,42 +311,49 @@
283311
"\n",
284312
"if __name__ == \"__main__\":\n",
285313
" print(\"=\" * 70)\n",
286-
" print(\"FOLDER-LEVEL PAGERANK ANALYSIS\")\n",
314+
" print(\"PAGERANK ANALYSIS WITH REAL WWW DATA (FineWeb)\")\n",
287315
" print(\"=\" * 70)\n",
288316
"\n",
317+
" # Validate paths\n",
289318
" if not os.path.exists(BASELINE_PATH):\n",
290319
" print(f\"\\nERROR: Baseline file not found: {BASELINE_PATH}\")\n",
291320
" exit(1)\n",
292321
"\n",
293-
" print(f\"\\nBaseline: {os.path.basename(BASELINE_PATH)}\")\n",
322+
" if not os.path.exists(FINEWEB_WWW_PATH):\n",
323+
" print(f\"\\nERROR: FineWeb WWW file not found: {FINEWEB_WWW_PATH}\")\n",
324+
" exit(1)\n",
294325
"\n",
295326
" if not os.path.exists(COMPARISON_FOLDER):\n",
296327
" print(f\"\\nERROR: Comparison folder not found: {COMPARISON_FOLDER}\")\n",
297328
" exit(1)\n",
298329
"\n",
330+
" print(f\"\\nWWW Graph Source: {os.path.basename(FINEWEB_WWW_PATH)}\")\n",
331+
" print(f\"Baseline: {os.path.basename(BASELINE_PATH)}\")\n",
332+
"\n",
333+
" # Load REAL WWW graph (only once, then cached)\n",
334+
" www_graph, www_nodes = load_www_graph_networkit(FINEWEB_WWW_PATH)\n",
335+
"\n",
336+
" # Find comparison files\n",
299337
" csv_files = sorted([f for f in os.listdir(COMPARISON_FOLDER) if f.endswith(\".csv\")])\n",
300338
"\n",
301339
" if len(csv_files) == 0:\n",
302340
" print(f\"\\nERROR: No CSV files found in {COMPARISON_FOLDER}\")\n",
303341
" exit(1)\n",
304342
"\n",
305-
" print(f\"Found {len(csv_files)} CSV files in comparison folder\")\n",
306-
" print(f\"Analyzing {len(csv_files)} comparison files\")\n",
343+
" print(f\"\\nFound {len(csv_files)} CSV files in comparison folder\")\n",
307344
" print(\"=\" * 70)\n",
308345
"\n",
309346
" results = []\n",
310-
" all_simulation_results = [] # Collect all simulation results across all files\n",
347+
" all_simulation_results = []\n",
311348
"\n",
312349
" for new_csv_filename in csv_files:\n",
313350
" new_csv_path = os.path.join(COMPARISON_FOLDER, new_csv_filename)\n",
314-
" result = analyze_csv_pair(BASELINE_PATH, new_csv_path)\n",
351+
" result = analyze_csv_pair(www_graph, www_nodes, BASELINE_PATH, new_csv_path)\n",
315352
"\n",
316353
" if result is not None and validate_results(result):\n",
317354
" results.append(result)\n",
318-
" all_simulation_results.extend(\n",
319-
" result[\"sim_results\"]\n",
320-
" ) # Aggregate simulation results\n",
321-
" print(f\" Valid results obtained\")\n",
355+
" all_simulation_results.extend(result[\"sim_results\"])\n",
356+
" print(f\" ✓ Valid results obtained\")\n",
322357
"\n",
323358
" print(\"\\n\" + \"=\" * 70)\n",
324359
" print(\"INDIVIDUAL FILE RESULTS\")\n",
@@ -341,7 +376,7 @@
341376
" print(\"-\" * 90)\n",
342377
" print(f\"\\nSuccessfully analyzed {len(results)}/{len(csv_files)} files\")\n",
343378
"\n",
344-
" # Calculate overall averages across all simulations\n",
379+
" # Calculate overall averages\n",
345380
" if len(all_simulation_results) > 0:\n",
346381
" print(\"\\n\" + \"=\" * 70)\n",
347382
" print(\"OVERALL AVERAGES\")\n",
@@ -384,4 +419,4 @@
384419
},
385420
"nbformat": 4,
386421
"nbformat_minor": 0
387-
}
422+
}

0 commit comments

Comments
 (0)