|
349 | 349 | "else:\n", |
350 | 350 | " print(\"No best PageRank URLs found based on the provided criteria.\")" |
351 | 351 | ] |
| 352 | + }, |
| 353 | + { |
| 354 | + "cell_type": "markdown", |
| 355 | + "metadata": { |
| 356 | + "id": "c0huYdotnVlL" |
| 357 | + }, |
| 358 | + "source": [ |
| 359 | + "# FINDING BEST PERFORMERS WITHIN A FOLDER DEPTH INTERVAL" |
| 360 | + ] |
| 361 | + }, |
| 362 | + { |
| 363 | + "cell_type": "code", |
| 364 | + "execution_count": null, |
| 365 | + "metadata": { |
| 366 | + "id": "xn5xIY1QnVWz" |
| 367 | + }, |
| 368 | + "outputs": [], |
| 369 | + "source": [ |
| 370 | + "from google.colab import drive\n", |
| 371 | + "import pandas as pd\n", |
| 372 | + "import os\n", |
| 373 | + "\n", |
| 374 | + "# Mount Google Drive\n", |
| 375 | + "drive.mount(\"/content/drive\")\n", |
| 376 | + "\n", |
| 377 | + "# Define path to CSV file\n", |
| 378 | + "base_path = \"/content/drive/My Drive/WebKnoGraph/data\"\n", |
| 379 | + "file_path = os.path.join(base_path, \"url_analysis_results.csv\")\n", |
| 380 | + "\n", |
| 381 | + "\n", |
| 382 | + "def get_overall_top_performers_in_range(file_path, min_depth, max_depth, n_total_top):\n", |
| 383 | + " \"\"\"\n", |
| 384 | + " Retrieves the overall top N performing URLs (based on PageRank) within a defined range of folder depth levels.\n", |
| 385 | + "\n", |
| 386 | + " Args:\n", |
| 387 | + " file_path (str): The path to the CSV file containing URL analysis results.\n", |
| 388 | + " min_depth (int): The minimum folder depth level in the range (inclusive).\n", |
| 389 | + " max_depth (int): The maximum folder depth level in the range (inclusive).\n", |
| 390 | + " n_total_top (int): The total number of top performing URLs to retrieve across the entire range.\n", |
| 391 | + "\n", |
| 392 | + " Returns:\n", |
| 393 | + " pandas.DataFrame: A DataFrame containing the 'URL' column of the overall top performers.\n", |
| 394 | + " Returns an empty DataFrame if the file is not found or no rows fall within the depth range; raises ValueError if a required column is missing.\n", |
| 395 | + " \"\"\"\n", |
| 396 | + " try:\n", |
| 397 | + " df = pd.read_csv(file_path)\n", |
| 398 | + " except FileNotFoundError:\n", |
| 399 | + " print(f\"Error: File not found at {file_path}\")\n", |
| 400 | + " return pd.DataFrame() # Return empty DataFrame on error\n", |
| 401 | + "\n", |
| 402 | + " required_cols = {\"URL\", \"Folder_Depth\", \"PageRank\"}\n", |
| 403 | + " if not required_cols.issubset(df.columns):\n", |
| 404 | + " raise ValueError(\n", |
| 405 | + " f\"Missing one or more required columns: {required_cols - set(df.columns)}\"\n", |
| 406 | + " )\n", |
| 407 | + "\n", |
| 408 | + " # Filter by the defined folder depth range\n", |
| 409 | + " filtered_df = df[\n", |
| 410 | + " (df[\"Folder_Depth\"] >= min_depth) & (df[\"Folder_Depth\"] <= max_depth)\n", |
| 411 | + " ]\n", |
| 412 | + "\n", |
| 413 | + " if filtered_df.empty:\n", |
| 414 | + " print(f\"No data found for folder depths between {min_depth} and {max_depth}.\")\n", |
| 415 | + " return pd.DataFrame()\n", |
| 416 | + "\n", |
| 417 | + " # Sort the entire filtered DataFrame by 'PageRank' in descending order\n", |
| 418 | + " sorted_df = filtered_df.sort_values(by=\"PageRank\", ascending=False)\n", |
| 419 | + "\n", |
| 420 | + " # Get the overall top N performers\n", |
| 421 | + " overall_top_performers = sorted_df.head(n_total_top)\n", |
| 422 | + "\n", |
| 423 | + " return overall_top_performers[[\"URL\"]]\n", |
| 424 | + "\n", |
| 425 | + "\n", |
| 426 | + "# --- User Inputs ---\n", |
| 427 | + "try:\n", |
| 428 | + " min_depth_input = int(input(\"Enter minimum folder depth level (integer): \"))\n", |
| 429 | + " max_depth_input = int(input(\"Enter maximum folder depth level (integer): \"))\n", |
| 430 | + " n_total_top_input = int(input(\"Enter total number of top performers to retrieve: \"))\n", |
| 431 | + "\n", |
| 432 | + " if min_depth_input > max_depth_input:\n", |
| 433 | + " print(\"Error: Minimum depth cannot be greater than maximum depth.\")\n", |
| 434 | + " exit()\n", |
| 435 | + "\n", |
| 436 | + "except ValueError:\n", |
| 437 | + " print(\"Invalid input. Please enter integers for depth range and count.\")\n", |
| 438 | + " exit() # Exit if inputs are not valid integers\n", |
| 439 | + "\n", |
| 440 | + "# --- Run Analysis ---\n", |
| 441 | + "print(\n", |
| 442 | + " f\"\\nRetrieving the overall top {n_total_top_input} performers from folder depths {min_depth_input} to {max_depth_input}:\"\n", |
| 443 | + ")\n", |
| 444 | + "overall_top_pages = get_overall_top_performers_in_range(\n", |
| 445 | + " file_path, min_depth_input, max_depth_input, n_total_top_input\n", |
| 446 | + ")\n", |
| 447 | + "\n", |
| 448 | + "if not overall_top_pages.empty:\n", |
| 449 | + " for url in overall_top_pages[\"URL\"]:\n", |
| 450 | + " print(url)\n", |
| 451 | + "else:\n", |
| 452 | + " print(\"No overall top performers found based on the provided criteria.\")" |
| 453 | + ] |
352 | 454 | } |
353 | 455 | ], |
354 | 456 | "metadata": { |
|
0 commit comments