Skip to content

Commit 1da2646

Browse files
Adding random url filtering to filtering url connections csv
1 parent bab04aa commit 1da2646

1 file changed

Lines changed: 91 additions & 0 deletions

File tree

src/backend/services/filering_url_connections.ipynb

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,97 @@
140140
"else:\n",
141141
" print(\"No top performers found based on the provided criteria.\")"
142142
]
143+
},
144+
{
145+
"cell_type": "markdown",
146+
"metadata": {
147+
"id": "YbuR_a9ij-Hk"
148+
},
149+
"source": [
150+
"# FINDING RANDOM URLS"
151+
]
152+
},
153+
{
154+
"cell_type": "code",
155+
"execution_count": null,
156+
"metadata": {
157+
"id": "fSs-MIP2j8pX"
158+
},
159+
"outputs": [],
160+
"source": [
161+
"from google.colab import drive\n",
162+
"import pandas as pd\n",
163+
"import os\n",
164+
"import random\n",
165+
"\n",
166+
"# Mount Google Drive\n",
167+
"drive.mount(\"/content/drive\")\n",
168+
"\n",
169+
"# Define path to CSV file\n",
170+
"base_path = \"/content/drive/My Drive/WebKnoGraph/data\"\n",
171+
"file_path = os.path.join(base_path, \"url_analysis_results.csv\")\n",
172+
"\n",
173+
"\n",
174+
"def get_random_urls(file_path, n_random):\n",
175+
" \"\"\"\n",
176+
" Retrieves a random sample of N URLs from the entire dataset, regardless of folder depth.\n",
177+
"\n",
178+
" Args:\n",
179+
" file_path (str): The path to the CSV file containing URL analysis results.\n",
180+
" n_random (int): The number of random URLs to retrieve.\n",
181+
"\n",
182+
" Returns:\n",
183+
" pandas.DataFrame: A DataFrame containing the 'URL' column of the random URLs.\n",
184+
" Returns an empty DataFrame if the file is not found or the CSV has no rows; raises ValueError if the 'URL' column is missing.\n",
185+
" \"\"\"\n",
186+
" try:\n",
187+
" df = pd.read_csv(file_path)\n",
188+
" except FileNotFoundError:\n",
189+
" print(f\"Error: File not found at {file_path}\")\n",
190+
" return pd.DataFrame() # Return empty DataFrame on error\n",
191+
"\n",
192+
" required_cols = {\"URL\"} # Only 'URL' is strictly required for this function\n",
193+
" if not required_cols.issubset(df.columns):\n",
194+
" raise ValueError(\n",
195+
" f\"Missing one or more required columns: {required_cols - set(df.columns)}\"\n",
196+
" )\n",
197+
"\n",
198+
" if df.empty:\n",
199+
" print(\"The CSV file is empty.\")\n",
200+
" return pd.DataFrame()\n",
201+
"\n",
202+
" # Ensure n_random does not exceed the number of available URLs\n",
203+
" if n_random > len(df):\n",
204+
" print(\n",
205+
" f\"Warning: Requested {n_random} random URLs, but only {len(df)} are available. Returning all available URLs.\"\n",
206+
" )\n",
207+
" n_random = len(df)\n",
208+
"\n",
209+
" # Get a random sample of URLs\n",
210+
" random_urls = df.sample(\n",
211+
" n=n_random, random_state=None\n",
212+
" ) # random_state=None leaves the sampler unseeded, so each run draws a different pseudo-random sample (not reproducible)\n",
213+
"\n",
214+
" return random_urls[[\"URL\"]]\n",
215+
"\n",
216+
"\n",
217+
"# --- User Inputs ---\n",
218+
"try:\n",
219+
" n_random_input = int(input(\"Enter number of random URLs to retrieve: \"))\n",
220+
"except ValueError:\n",
221+
" print(\"Invalid input. Please enter an integer for the count.\")\n",
222+
" exit() # Exit if input is not a valid integer\n",
223+
"\n",
224+
"# --- Run Analysis ---\n",
225+
"print(f\"\\nRetrieving {n_random_input} random URLs:\")\n",
226+
"random_pages = get_random_urls(file_path, n_random_input)\n",
227+
"\n",
228+
"if not random_pages.empty:\n",
229+
" for url in random_pages[\"URL\"]:\n",
230+
" print(url)\n",
231+
"else:\n",
232+
" print(\"No random URLs found based on the provided criteria.\")"
233+
]
143234
}
144235
],
145236
"metadata": {

0 commit comments

Comments
 (0)