|
140 | 140 | "else:\n", |
141 | 141 | " print(\"No top performers found based on the provided criteria.\")" |
142 | 142 | ] |
| 143 | + }, |
| 144 | + { |
| 145 | + "cell_type": "markdown", |
| 146 | + "metadata": { |
| 147 | + "id": "YbuR_a9ij-Hk" |
| 148 | + }, |
| 149 | + "source": [ |
| 150 | + "# FINDING RANDOM URLS" |
| 151 | + ] |
| 152 | + }, |
| 153 | + { |
| 154 | + "cell_type": "code", |
| 155 | + "execution_count": null, |
| 156 | + "metadata": { |
| 157 | + "id": "fSs-MIP2j8pX" |
| 158 | + }, |
| 159 | + "outputs": [], |
| 160 | + "source": [ |
| 161 | + "from google.colab import drive\n", |
| 162 | + "import pandas as pd\n", |
| 163 | + "import os\n", |
| 164 | + "import random\n", |
| 165 | + "\n", |
# Location of the URL analysis results inside the mounted Drive folder.
base_path = "/content/drive/My Drive/WebKnoGraph/data"
file_path = os.path.join(base_path, "url_analysis_results.csv")

# Make Google Drive available under /content/drive so the CSV can be read.
drive.mount("/content/drive")
| 172 | + "\n", |
| 173 | + "\n", |
def get_random_urls(file_path, n_random):
    """
    Return a random sample of URLs from the URL analysis CSV.

    Rows are drawn from the whole file, regardless of folder depth, and a
    different sample is produced on every call (no fixed random seed).

    Args:
        file_path (str): The path to the CSV file containing URL analysis results.
        n_random (int): The number of random URLs to retrieve. A value larger
            than the number of rows is capped to the row count, with a warning.

    Returns:
        pandas.DataFrame: A DataFrame containing only the 'URL' column of the
                          sampled rows. An empty DataFrame is returned when the
                          file is missing or holds no data rows.

    Raises:
        ValueError: If the CSV has no 'URL' column, or if n_random is negative.
    """
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return pd.DataFrame()  # Return empty DataFrame on error
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header line to parse, so read_csv raises
        # instead of returning an empty frame. Report it the same way as a
        # header-only file.
        print("The CSV file is empty.")
        return pd.DataFrame()

    required_cols = {"URL"}  # Only 'URL' is strictly required for this function
    missing_cols = required_cols - set(df.columns)
    if missing_cols:
        raise ValueError(f"Missing one or more required columns: {missing_cols}")

    if df.empty:
        print("The CSV file is empty.")
        return pd.DataFrame()

    if n_random < 0:
        # Fail with a message that names the offending argument rather than
        # letting DataFrame.sample reject the value further down.
        raise ValueError(f"n_random must be a non-negative integer, got {n_random}")

    # Ensure n_random does not exceed the number of available URLs
    available = len(df)
    if n_random > available:
        print(
            f"Warning: Requested {n_random} random URLs, but only {available} are available. Returning all available URLs."
        )
        n_random = available

    # Sample only the column that is returned; leaving random_state unset
    # gives a fresh sample on every run.
    return df[["URL"]].sample(n=n_random)
| 215 | + "\n", |
| 216 | + "\n", |
# --- User Inputs ---
# Invalid input is recorded as None instead of calling exit(): inside a
# notebook, exit() shuts down the kernel, which discards the Drive mount and
# every variable defined so far.
try:
    n_random_input = int(input("Enter number of random URLs to retrieve: "))
except ValueError:
    print("Invalid input. Please enter an integer for the count.")
    n_random_input = None

# --- Run Analysis ---
if n_random_input is not None:
    print(f"\nRetrieving {n_random_input} random URLs:")
    try:
        random_pages = get_random_urls(file_path, n_random_input)
    except ValueError as err:
        # get_random_urls raises ValueError when the CSV has no 'URL' column;
        # report it instead of ending the cell with a traceback.
        print(f"Error: {err}")
    else:
        if not random_pages.empty:
            for url in random_pages["URL"]:
                print(url)
        else:
            print("No random URLs found based on the provided criteria.")
| 233 | + ] |
143 | 234 | } |
144 | 235 | ], |
145 | 236 | "metadata": { |
|
0 commit comments