|
231 | 231 | "else:\n", |
232 | 232 | " print(\"No random URLs found based on the provided criteria.\")" |
233 | 233 | ] |
| 234 | + }, |
| 235 | + { |
| 236 | + "cell_type": "markdown", |
| 237 | + "metadata": { |
| 238 | + "id": "q5SXzww0ujhd" |
| 239 | + }, |
| 240 | + "source": [ |
| 241 | + "# FILTERING URLS BY FOLDER/SUBPATH" |
| 242 | + ] |
| 243 | + }, |
| 244 | + { |
| 245 | + "cell_type": "code", |
| 246 | + "execution_count": null, |
| 247 | + "metadata": { |
| 248 | + "id": "kgfiA5y8ujzB" |
| 249 | + }, |
| 250 | + "outputs": [], |
| 251 | + "source": [ |
| 252 | + "from google.colab import drive\n", |
| 253 | + "import pandas as pd\n", |
| 254 | + "import os\n", |
| 255 | + "\n", |
| 256 | + "# Mount Google Drive\n", |
| 257 | + "drive.mount(\"/content/drive\")\n", |
| 258 | + "\n", |
| 259 | + "# Define path to CSV file\n", |
| 260 | + "base_path = \"/content/drive/My Drive/WebKnoGraph/data\"\n", |
| 261 | + "file_path = os.path.join(base_path, \"url_analysis_results.csv\")\n", |
| 262 | + "\n", |
| 263 | + "\n", |
| 264 | + "def get_best_pagerank_urls_in_folder(file_path, n_best, url_subpath_filter=None):\n", |
| 265 | + " \"\"\"\n", |
| 266 | + " Retrieves the top N best performing URLs (based on PageRank) from a specific folder/URL subpath.\n", |
| 267 | + "\n", |
| 268 | + " Args:\n", |
| 269 | + " file_path (str): The path to the CSV file containing URL analysis results.\n", |
| 270 | + " n_best (int): The number of best PageRank URLs to retrieve.\n", |
| 271 | + " url_subpath_filter (str, optional): A URL subpath to filter by (e.g., '/learning-spaces/').\n", |
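| 272 | + " Only URLs whose text matches this value are kept; it is passed to pandas str.contains, which treats it as a regular expression by default. Defaults to None (no filtering).\n", |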
| 273 | + "\n", |
| 274 | + " Returns:\n", |
| 275 | + " pandas.DataFrame: A DataFrame containing only the 'URL' column of the best PageRank URLs.\n", |
| 276 | + " Returns an empty DataFrame if the file is not found, the CSV has no rows, or no URLs match the filter; raises ValueError if a required column is missing.\n", |
| 277 | + " \"\"\"\n", |
| 278 | + " try:\n", |
| 279 | + " df = pd.read_csv(file_path)\n", |
| 280 | + " except FileNotFoundError:\n", |
| 281 | + " print(f\"Error: File not found at {file_path}\")\n", |
| 282 | + " return pd.DataFrame() # Return empty DataFrame on error\n", |
| 283 | + "\n", |
| 284 | + " required_cols = {\"URL\", \"PageRank\"} # PageRank is still required for sorting\n", |
| 285 | + " if not required_cols.issubset(df.columns):\n", |
| 286 | + " raise ValueError(\n", |
| 287 | + " f\"Missing one or more required columns: {required_cols - set(df.columns)}\"\n", |
| 288 | + " )\n", |
| 289 | + "\n", |
| 290 | + " if df.empty:\n", |
| 291 | + " print(\"The CSV file is empty.\")\n", |
| 292 | + " return pd.DataFrame()\n", |
| 293 | + "\n", |
| 294 | + " # Apply URL subpath filter if provided\n", |
| 295 | + " filtered_df = df\n", |
| 296 | + " if url_subpath_filter:\n", |
| 297 | + " filtered_df = df[\n", |
| 298 | + " df[\"URL\"].astype(str).str.contains(url_subpath_filter, na=False)\n", |
| 299 | + " ]\n", |
| 300 | + " if filtered_df.empty:\n", |
| 301 | + " print(f\"No URLs found matching the subpath filter: '{url_subpath_filter}'\")\n", |
| 302 | + " return pd.DataFrame()\n", |
| 303 | + "\n", |
| 304 | + " # Sort by 'PageRank' in descending order (highest PageRank = best performer)\n", |
| 305 | + " sorted_df = filtered_df.sort_values(by=\"PageRank\", ascending=False)\n", |
| 306 | + "\n", |
| 307 | + " # Ensure n_best does not exceed the number of available URLs after filtering\n", |
| 308 | + " if n_best > len(sorted_df):\n", |
| 309 | + " print(\n", |
| 310 | + " f\"Warning: Requested {n_best} best URLs, but only {len(sorted_df)} are available after filtering. Returning all available URLs.\"\n", |
| 311 | + " )\n", |
| 312 | + " n_best = len(sorted_df)\n", |
| 313 | + "\n", |
| 314 | + " # Get the top N best PageRank URLs, only selecting the 'URL' column\n", |
| 315 | + " best_pagerank_urls = sorted_df.head(n_best)[[\"URL\"]]\n", |
| 316 | + "\n", |
| 317 | + " return best_pagerank_urls\n", |
| 318 | + "\n", |
| 319 | + "\n", |
| 320 | + "# --- User Inputs ---\n", |
| 321 | + "try:\n", |
| 322 | + " n_best_input = int(input(\"Enter number of best PageRank URLs to retrieve: \"))\n", |
| 323 | + " # Prompt for the subpath filter\n", |
| 324 | + " url_subpath_input = input(\n", |
| 325 | + " \"Enter URL subpath to filter by (e.g., /learning-spaces/, leave empty for no filter): \"\n", |
| 326 | + " )\n", |
| 327 | + " # Set to None if user leaves it empty\n", |
| 328 | + " if not url_subpath_input.strip():\n", |
| 329 | + " url_subpath_input = None\n", |
| 330 | + "except ValueError:\n", |
| 331 | + " print(\"Invalid input. Please enter an integer for the count.\")\n", |
| 332 | + " exit() # Exit if input is not a valid integer\n", |
| 333 | + "\n", |
| 334 | + "# --- Run Analysis ---\n", |
| 335 | + "if url_subpath_input:\n", |
| 336 | + " print(\n", |
| 337 | + " f\"\\nRetrieving {n_best_input} best PageRank URLs from subpath '{url_subpath_input}':\"\n", |
| 338 | + " )\n", |
| 339 | + "else:\n", |
| 340 | + " print(f\"\\nRetrieving {n_best_input} best PageRank URLs (no subpath filter):\")\n", |
| 341 | + "\n", |
| 342 | + "best_pages = get_best_pagerank_urls_in_folder(\n", |
| 343 | + " file_path, n_best_input, url_subpath_input\n", |
| 344 | + ")\n", |
| 345 | + "\n", |
| 346 | + "if not best_pages.empty:\n", |
| 347 | + " for url in best_pages[\"URL\"]: # Iterate through the 'URL' column of the DataFrame\n", |
| 348 | + " print(url)\n", |
| 349 | + "else:\n", |
| 350 | + " print(\"No best PageRank URLs found based on the provided criteria.\")" |
| 351 | + ] |
234 | 352 | } |
235 | 353 | ], |
236 | 354 | "metadata": { |
|
0 commit comments