88 },
99 "outputs" : [],
1010 "source" : [
11- " # Google Drive Folder-Level PageRank Analysis\n " ,
12- " # Processes all CSV pairs in a mounted Google Drive folder \n " ,
11+ " # Google Drive Folder-Level PageRank Analysis with REAL WWW Data \n " ,
12+ " # Uses FineWeb dataset CSV with FROM and TO columns as WWW graph \n " ,
1313 " # Calculates overall averages across all files in the strategy\n " ,
1414 " \n " ,
1515 " # === INSTALLATION CELL (Run first) ===\n " ,
16- " # !pip install networkit pandas numpy\n " ,
16+ " !pip install networkit pandas numpy\n " ,
1717 " \n " ,
1818 " # === MOUNT GOOGLE DRIVE ===\n " ,
1919 " from google.colab import drive\n " ,
3333 " # USER CONFIGURATION\n " ,
3434 " # ============================================\n " ,
3535 " BASELINE_PATH = \" /content/drive/MyDrive/WebKnoGraph/results/link_graph_edges.csv\"\n " ,
36- " COMPARISON_FOLDER = \" /content/drive/MyDrive/WebKnoGraph/results/expert_led/low_batches/\"\n " ,
37- " NUM_SIMULATIONS = 100\n " ,
36+ " COMPARISON_FOLDER = (\n " ,
37+ " \" /content/drive/MyDrive/WebKnoGraph/results/automatic_led/high_batches/\"\n " ,
38+ " )\n " ,
39+ " \n " ,
40+ " # NEW: Path to FineWeb WWW graph CSV\n " ,
41+ " FINEWEB_WWW_PATH = \" /content/drive/MyDrive/WebKnoGraph/results/fineweb_500k_pages.csv\"\n " ,
42+ " \n " ,
43+ " NUM_SIMULATIONS = 10\n " ,
3844 " \n " ,
3945 " # Simulation Parameters\n " ,
40- " TOTAL_NODES_WWW = 100000\n " ,
41- " EDGES_PER_NEW_NODE = 2\n " ,
4246 " MIN_CONNECTIONS = 5\n " ,
4347 " MAX_CONNECTIONS = 50\n " ,
4448 " PAGERANK_TOLERANCE = 1e-6\n " ,
5054 " _www_graph_cache = None\n " ,
5155 " \n " ,
5256 " \n " ,
53- " def load_graph_from_csv_networkit(file_path):\n " ,
57+ " def load_graph_from_csv_networkit(file_path, graph_name= \" graph \" ):\n " ,
5458 " \"\"\" Load graph from CSV file.\"\"\"\n " ,
5559 " try:\n " ,
60+ " print(f\" Loading {graph_name} from {os.path.basename(file_path)}...\" )\n " ,
5661 " df = pd.read_csv(file_path, usecols=[\" FROM\" , \" TO\" ])\n " ,
5762 " df = df.dropna()\n " ,
5863 " df[\" FROM\" ] = df[\" FROM\" ].astype(str)\n " ,
5964 " df[\" TO\" ] = df[\" TO\" ].astype(str)\n " ,
6065 " \n " ,
6166 " if len(df) == 0:\n " ,
67+ " print(f\" ERROR: No valid edges found in {file_path}\" )\n " ,
6268 " return None, None, None\n " ,
6369 " \n " ,
6470 " from_urls = df[\" FROM\" ].values\n " ,
7076 " for src_url, tgt_url in zip(from_urls, to_urls):\n " ,
7177 " g.addEdge(url_to_idx[src_url], url_to_idx[tgt_url])\n " ,
7278 " \n " ,
79+ " print(f\" Loaded: {len(all_urls):,} nodes, {len(df):,} edges\" )\n " ,
7380 " return g, all_urls, url_to_idx\n " ,
7481 " except Exception as e:\n " ,
75- " print(f\" Error loading {file_path}: {str(e)}\" )\n " ,
82+ " print(f\" ERROR loading {file_path}: {str(e)}\" )\n " ,
7683 " return None, None, None\n " ,
7784 " \n " ,
7885 " \n " ,
79- " def create_www_graph_networkit(n_nodes, m_edges, seed=42 ):\n " ,
80- " \"\"\" Create WWW graph with caching.\"\"\"\n " ,
86+ " def load_www_graph_networkit(www_csv_path ):\n " ,
87+ " \"\"\" Load REAL WWW graph from FineWeb dataset with caching.\"\"\"\n " ,
8188 " global _www_graph_cache\n " ,
8289 " \n " ,
83- " cache_key = (n_nodes, m_edges, seed) \n " ,
84- " if _www_graph_cache is not None and _www_graph_cache[0] == cache_key: \n " ,
90+ " if _www_graph_cache is not None and _www_graph_cache[0] == www_csv_path: \n " ,
91+ " print( \" Using cached WWW graph \" ) \n " ,
8592 " cached_graph = _www_graph_cache[1]\n " ,
8693 " new_graph = nk.Graph(\n " ,
8794 " n=cached_graph.numberOfNodes(), weighted=False, directed=True\n " ,
8895 " )\n " ,
8996 " for u, v in cached_graph.iterEdges():\n " ,
9097 " new_graph.addEdge(u, v)\n " ,
91- " return new_graph\n " ,
98+ " return new_graph, _www_graph_cache[2] \n " ,
9299 " \n " ,
93- " nk.setSeed(seed, False )\n " ,
94- " generator = nk.generators.BarabasiAlbertGenerator (\n " ,
95- " k=m_edges, nMax=n_nodes, n0=m_edges \n " ,
100+ " print( \"\\ nLoading REAL WWW graph from FineWeb dataset... \" )\n " ,
101+ " www_graph, www_nodes, www_url_mapping = load_graph_from_csv_networkit (\n " ,
102+ " www_csv_path, graph_name= \" WWW graph \" \n" ,
96103 " )\n " ,
97- " www_graph = generator.generate()\n " ,
98104 " \n " ,
105+ " if www_graph is None:\n " ,
106+ " raise ValueError(f\" Failed to load WWW graph from {www_csv_path}\" )\n " ,
107+ " \n " ,
108+ " # Cache the graph\n " ,
99109 " cached_graph = nk.Graph(n=www_graph.numberOfNodes(), weighted=False, directed=True)\n " ,
100110 " for u, v in www_graph.iterEdges():\n " ,
101111 " cached_graph.addEdge(u, v)\n " ,
102- " _www_graph_cache = (cache_key, cached_graph)\n " ,
103- " return www_graph\n " ,
112+ " _www_graph_cache = (www_csv_path, cached_graph, www_nodes)\n " ,
113+ " \n " ,
114+ " print(f\" WWW graph cached successfully\" )\n " ,
115+ " return www_graph, www_nodes\n " ,
104116 " \n " ,
105117 " \n " ,
106118 " def process_configuration_networkit(\n " ,
107- " www_graph, kalicube_edges, kalicube_nodes, kalicube_url_mapping\n " ,
119+ " www_graph, www_nodes, kalicube_edges, kalicube_nodes, kalicube_url_mapping\n " ,
108120 " ):\n " ,
109121 " \"\"\" Process configuration and calculate PageRank.\"\"\"\n " ,
110122 " kalicube_offset = www_graph.numberOfNodes()\n " ,
111123 " n_kalicube = len(kalicube_nodes)\n " ,
124+ " n_www = www_graph.numberOfNodes()\n " ,
112125 " \n " ,
113- " merged_graph = nk.Graph(n=www_graph.numberOfNodes(), weighted=False, directed=True)\n " ,
126+ " # Create merged graph\n " ,
127+ " merged_graph = nk.Graph(n=n_www, weighted=False, directed=True)\n " ,
114128 " for u, v in www_graph.iterEdges():\n " ,
115129 " merged_graph.addEdge(u, v)\n " ,
116130 " \n " ,
131+ " # Add Kalicube nodes\n " ,
117132 " for _ in range(n_kalicube):\n " ,
118133 " merged_graph.addNode()\n " ,
119134 " \n " ,
135+ " # Add Kalicube internal edges\n " ,
120136 " if kalicube_edges:\n " ,
121137 " for src, tgt in kalicube_edges:\n " ,
122138 " merged_graph.addEdge(src + kalicube_offset, tgt + kalicube_offset)\n " ,
123139 " \n " ,
124- " n_www_sample = min(MIN_CONNECTIONS, TOTAL_NODES_WWW)\n " ,
125- " n_kalicube_sample = min(MIN_CONNECTIONS, len(kalicube_nodes))\n " ,
140+ " # Connect WWW to Kalicube\n " ,
141+ " n_www_sample = min(MAX_CONNECTIONS, n_www)\n " ,
142+ " n_kalicube_sample = min(MAX_CONNECTIONS, n_kalicube)\n " ,
126143 " \n " ,
127- " www_nodes_sample = np.random.choice(\n " ,
128- " TOTAL_NODES_WWW, size=n_www_sample, replace=False\n " ,
129- " )\n " ,
144+ " www_nodes_sample = np.random.choice(n_www, size=n_www_sample, replace=False)\n " ,
130145 " kalicube_indices = np.random.choice(\n " ,
131- " len(kalicube_nodes) , size=n_kalicube_sample, replace=False\n " ,
146+ " n_kalicube , size=n_kalicube_sample, replace=False\n " ,
132147 " )\n " ,
133148 " \n " ,
134149 " for www_node_id, kalicube_idx in zip(www_nodes_sample, kalicube_indices):\n " ,
135150 " kalicube_node_id = kalicube_idx + kalicube_offset\n " ,
136151 " merged_graph.addEdge(www_node_id, kalicube_node_id)\n " ,
137152 " \n " ,
153+ " # Calculate PageRank\n " ,
138154 " pagerank_algo = nk.centrality.PageRank(\n " ,
139155 " merged_graph, damp=0.85, tol=PAGERANK_TOLERANCE\n " ,
140156 " )\n " ,
141157 " pagerank_algo.run()\n " ,
142158 " pagerank_scores = pagerank_algo.scores()\n " ,
143159 " \n " ,
160+ " # Extract Kalicube PageRank scores\n " ,
144161 " pagerank_dict = {}\n " ,
145162 " for i, url in enumerate(kalicube_nodes):\n " ,
146163 " vertex_id = i + kalicube_offset\n " ,
151168 " \n " ,
152169 " def run_single_simulation_networkit(\n " ,
153170 " simulation_id,\n " ,
171+ " www_graph,\n " ,
172+ " www_nodes,\n " ,
154173 " kalicube_old_edges,\n " ,
155174 " kalicube_new_edges,\n " ,
156175 " kalicube_nodes_old,\n " ,
163182 " np.random.seed(sim_seed)\n " ,
164183 " random.seed(sim_seed)\n " ,
165184 " \n " ,
166- " www_graph = create_www_graph_networkit(\n " ,
167- " TOTAL_NODES_WWW, EDGES_PER_NEW_NODE, sim_seed\n " ,
168- " )\n " ,
169- " \n " ,
185+ " # Calculate PageRank for old configuration\n " ,
170186 " pagerank_old_dict = process_configuration_networkit(\n " ,
171- " www_graph, kalicube_old_edges, kalicube_nodes_old, kalicube_url_mapping_old\n " ,
187+ " www_graph,\n " ,
188+ " www_nodes,\n " ,
189+ " kalicube_old_edges,\n " ,
190+ " kalicube_nodes_old,\n " ,
191+ " kalicube_url_mapping_old,\n " ,
172192 " )\n " ,
173193 " \n " ,
194+ " # Calculate PageRank for new configuration\n " ,
174195 " pagerank_new_dict = process_configuration_networkit(\n " ,
175- " www_graph, kalicube_new_edges, kalicube_nodes_new, kalicube_url_mapping_new\n " ,
196+ " www_graph,\n " ,
197+ " www_nodes,\n " ,
198+ " kalicube_new_edges,\n " ,
199+ " kalicube_nodes_new,\n " ,
200+ " kalicube_url_mapping_new,\n " ,
176201 " )\n " ,
177202 " \n " ,
203+ " # Compare results\n " ,
178204 " old_urls = set(pagerank_old_dict.keys())\n " ,
179205 " new_urls = set(pagerank_new_dict.keys())\n " ,
180206 " common_urls = old_urls & new_urls\n " ,
200226 " }\n " ,
201227 " \n " ,
202228 " \n " ,
203- " def analyze_csv_pair(old_csv_path, new_csv_path):\n " ,
229+ " def analyze_csv_pair(www_graph, www_nodes, old_csv_path, new_csv_path):\n " ,
204230 " \"\"\" Analyze a pair of CSV files.\"\"\"\n " ,
205231 " print(f\"\\ nAnalyzing: {os.path.basename(new_csv_path)}\" )\n " ,
206232 " \n " ,
207233 " kalicube_graph_old, kalicube_nodes_old, kalicube_url_mapping_old = (\n " ,
208- " load_graph_from_csv_networkit(old_csv_path)\n " ,
234+ " load_graph_from_csv_networkit(old_csv_path, \" baseline Kalicube \" )\n " ,
209235 " )\n " ,
210236 " if kalicube_graph_old is None:\n " ,
211237 " print(f\" Failed to load old graph\" )\n " ,
212238 " return None\n " ,
213239 " \n " ,
214240 " kalicube_graph_new, kalicube_nodes_new, kalicube_url_mapping_new = (\n " ,
215- " load_graph_from_csv_networkit(new_csv_path)\n " ,
241+ " load_graph_from_csv_networkit(new_csv_path, \" comparison Kalicube \" )\n " ,
216242 " )\n " ,
217243 " if kalicube_graph_new is None:\n " ,
218244 " print(f\" Failed to load new graph\" )\n " ,
230256 " for sim_id in range(NUM_SIMULATIONS):\n " ,
231257 " result = run_single_simulation_networkit(\n " ,
232258 " sim_id,\n " ,
259+ " www_graph,\n " ,
260+ " www_nodes,\n " ,
233261 " kalicube_old_edges,\n " ,
234262 " kalicube_new_edges,\n " ,
235263 " kalicube_nodes_old,\n " ,
256284 " \" avg_mean_delta_pct\" : avg_mean,\n " ,
257285 " \" avg_min_delta_pct\" : avg_min,\n " ,
258286 " \" avg_max_delta_pct\" : avg_max,\n " ,
259- " \" sim_results\" : sim_results, # Store individual simulation results \n " ,
287+ " \" sim_results\" : sim_results,\n " ,
260288 " }\n " ,
261289 " \n " ,
262290 " \n " ,
283311 " \n " ,
284312 " if __name__ == \" __main__\" :\n " ,
285313 " print(\" =\" * 70)\n " ,
286- " print(\" FOLDER-LEVEL PAGERANK ANALYSIS\" )\n " ,
314+ " print(\" PAGERANK ANALYSIS WITH REAL WWW DATA (FineWeb) \" )\n " ,
287315 " print(\" =\" * 70)\n " ,
288316 " \n " ,
317+ " # Validate paths\n " ,
289318 " if not os.path.exists(BASELINE_PATH):\n " ,
290319 " print(f\"\\ nERROR: Baseline file not found: {BASELINE_PATH}\" )\n " ,
291320 " exit(1)\n " ,
292321 " \n " ,
293- " print(f\"\\ nBaseline: {os.path.basename(BASELINE_PATH)}\" )\n " ,
322+ " if not os.path.exists(FINEWEB_WWW_PATH):\n " ,
323+ " print(f\"\\ nERROR: FineWeb WWW file not found: {FINEWEB_WWW_PATH}\" )\n " ,
324+ " exit(1)\n " ,
294325 " \n " ,
295326 " if not os.path.exists(COMPARISON_FOLDER):\n " ,
296327 " print(f\"\\ nERROR: Comparison folder not found: {COMPARISON_FOLDER}\" )\n " ,
297328 " exit(1)\n " ,
298329 " \n " ,
330+ " print(f\"\\ nWWW Graph Source: {os.path.basename(FINEWEB_WWW_PATH)}\" )\n " ,
331+ " print(f\" Baseline: {os.path.basename(BASELINE_PATH)}\" )\n " ,
332+ " \n " ,
333+ " # Load REAL WWW graph (only once, then cached)\n " ,
334+ " www_graph, www_nodes = load_www_graph_networkit(FINEWEB_WWW_PATH)\n " ,
335+ " \n " ,
336+ " # Find comparison files\n " ,
299337 " csv_files = sorted([f for f in os.listdir(COMPARISON_FOLDER) if f.endswith(\" .csv\" )])\n " ,
300338 " \n " ,
301339 " if len(csv_files) == 0:\n " ,
302340 " print(f\"\\ nERROR: No CSV files found in {COMPARISON_FOLDER}\" )\n " ,
303341 " exit(1)\n " ,
304342 " \n " ,
305- " print(f\" Found {len(csv_files)} CSV files in comparison folder\" )\n " ,
306- " print(f\" Analyzing {len(csv_files)} comparison files\" )\n " ,
343+ " print(f\"\\ nFound {len(csv_files)} CSV files in comparison folder\" )\n " ,
307344 " print(\" =\" * 70)\n " ,
308345 " \n " ,
309346 " results = []\n " ,
310- " all_simulation_results = [] # Collect all simulation results across all files \n " ,
347+ " all_simulation_results = []\n " ,
311348 " \n " ,
312349 " for new_csv_filename in csv_files:\n " ,
313350 " new_csv_path = os.path.join(COMPARISON_FOLDER, new_csv_filename)\n " ,
314- " result = analyze_csv_pair(BASELINE_PATH, new_csv_path)\n " ,
351+ " result = analyze_csv_pair(www_graph, www_nodes, BASELINE_PATH, new_csv_path)\n " ,
315352 " \n " ,
316353 " if result is not None and validate_results(result):\n " ,
317354 " results.append(result)\n " ,
318- " all_simulation_results.extend(\n " ,
319- " result[\" sim_results\" ]\n " ,
320- " ) # Aggregate simulation results\n " ,
321- " print(f\" Valid results obtained\" )\n " ,
355+ " all_simulation_results.extend(result[\" sim_results\" ])\n " ,
356+ " print(f\" ✓ Valid results obtained\" )\n " ,
322357 " \n " ,
323358 " print(\"\\ n\" + \" =\" * 70)\n " ,
324359 " print(\" INDIVIDUAL FILE RESULTS\" )\n " ,
341376 " print(\" -\" * 90)\n " ,
342377 " print(f\"\\ nSuccessfully analyzed {len(results)}/{len(csv_files)} files\" )\n " ,
343378 " \n " ,
344- " # Calculate overall averages across all simulations \n " ,
379+ " # Calculate overall averages\n " ,
345380 " if len(all_simulation_results) > 0:\n " ,
346381 " print(\"\\ n\" + \" =\" * 70)\n " ,
347382 " print(\" OVERALL AVERAGES\" )\n " ,
384419 },
385420 "nbformat" : 4 ,
386421 "nbformat_minor" : 0
387- }
422+ }
0 commit comments