Skip to content

Commit 5143925

Browse files
Adding URL filtering logic to the Jupyter notebook
1 parent bd3d806 commit 5143925

1 file changed

Lines changed: 118 additions & 0 deletions

File tree

src/backend/services/filering_url_connections.ipynb

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,124 @@
231231
"else:\n",
232232
" print(\"No random URLs found based on the provided criteria.\")"
233233
]
234+
},
235+
{
236+
"cell_type": "markdown",
237+
"metadata": {
238+
"id": "q5SXzww0ujhd"
239+
},
240+
"source": [
241+
"# FILTERING URLS BY FOLDER/SUBPATH"
242+
]
243+
},
244+
{
245+
"cell_type": "code",
246+
"execution_count": null,
247+
"metadata": {
248+
"id": "kgfiA5y8ujzB"
249+
},
250+
"outputs": [],
251+
"source": [
252+
"from google.colab import drive\n",
253+
"import pandas as pd\n",
254+
"import os\n",
255+
"\n",
# Make the user's Google Drive visible to this Colab runtime.
drive.mount("/content/drive")

# Location of the URL analysis results inside the mounted Drive.
base_path = "/content/drive/My Drive/WebKnoGraph/data"
file_path = os.path.join(
    base_path,
    "url_analysis_results.csv",
)
262+
"\n",
263+
"\n",
def get_best_pagerank_urls_in_folder(file_path, n_best, url_subpath_filter=None):
    """
    Retrieve the top N URLs by PageRank, optionally restricted to a URL subpath.

    Args:
        file_path (str): Path to the CSV file containing URL analysis results.
            The file must provide 'URL' and 'PageRank' columns.
        n_best (int): Maximum number of URLs to return. Values <= 0 yield an
            empty result; values larger than the number of matching rows are
            clamped (with a warning printed).
        url_subpath_filter (str, optional): Literal substring that a URL must
            contain to be considered (e.g., '/learning-spaces/'). It is matched
            as plain text, not as a regular expression. Defaults to None
            (no filtering).

    Returns:
        pandas.DataFrame: A DataFrame with a single 'URL' column holding the
        selected URLs, ordered from highest to lowest PageRank. The frame is
        empty (but still has the 'URL' column) when the file is missing or
        empty, when nothing matches the filter, or when n_best <= 0.

    Raises:
        ValueError: If the CSV lacks the 'URL' or 'PageRank' column.
    """

    def _no_results():
        # Fresh empty frame that keeps the documented one-column schema, so
        # callers can always access result["URL"].
        return pd.DataFrame(columns=["URL"])

    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return _no_results()
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header to parse; report it like an empty CSV
        # instead of letting the parser error escape.
        print("The CSV file is empty.")
        return _no_results()

    required_cols = {"URL", "PageRank"}  # PageRank is required for sorting
    if not required_cols.issubset(df.columns):
        raise ValueError(
            f"Missing one or more required columns: {required_cols - set(df.columns)}"
        )

    if df.empty:
        print("The CSV file is empty.")
        return _no_results()

    if n_best <= 0:
        # head() with a negative count returns "all but the last k rows",
        # which is never what a top-N request means.
        print(f"Requested {n_best} best URLs; nothing to return.")
        return _no_results()

    # Apply URL subpath filter if provided
    filtered_df = df
    if url_subpath_filter:
        # Missing URLs become "" (not the literal text "nan") so they can never
        # match. regex=False makes the filter a literal substring test: URL
        # characters such as '.', '?', '+' and '(' are regex metacharacters
        # and would otherwise mis-match or raise.
        url_text = df["URL"].fillna("").astype(str)
        filtered_df = df[url_text.str.contains(url_subpath_filter, regex=False)]
        if filtered_df.empty:
            print(f"No URLs found matching the subpath filter: '{url_subpath_filter}'")
            return _no_results()

    # Highest PageRank first. A stable sort keeps the file order among ties so
    # repeated runs return the same rows.
    sorted_df = filtered_df.sort_values(by="PageRank", ascending=False, kind="stable")

    # Ensure n_best does not exceed the number of available URLs after filtering
    if n_best > len(sorted_df):
        print(
            f"Warning: Requested {n_best} best URLs, but only {len(sorted_df)} are available after filtering. Returning all available URLs."
        )
        n_best = len(sorted_df)

    # Keep only the 'URL' column of the top N rows
    return sorted_df.head(n_best)[["URL"]]
318+
"\n",
319+
"\n",
# --- User Inputs ---
# Both inputs start as None so the analysis below can simply be skipped when
# the count is invalid. Calling exit() inside a notebook cell can shut down the
# kernel and discard the whole session's state, so it is avoided here.
n_best_input = None
url_subpath_input = None

try:
    n_best_input = int(input("Enter number of best PageRank URLs to retrieve: "))
except ValueError:
    print("Invalid input. Please enter an integer for the count.")

if n_best_input is not None and n_best_input < 1:
    # Zero or negative counts cannot describe a "top N" request.
    print("Invalid input. The count must be a positive integer.")
    n_best_input = None

if n_best_input is not None:
    # Prompt for the subpath filter; blank input means "no filter".
    # Surrounding whitespace is stripped because it cannot be part of a URL
    # and would silently make the filter match nothing.
    url_subpath_input = (
        input(
            "Enter URL subpath to filter by (e.g., /learning-spaces/, leave empty for no filter): "
        ).strip()
        or None
    )

    # --- Run Analysis ---
    if url_subpath_input:
        print(
            f"\nRetrieving {n_best_input} best PageRank URLs from subpath '{url_subpath_input}':"
        )
    else:
        print(f"\nRetrieving {n_best_input} best PageRank URLs (no subpath filter):")

    best_pages = get_best_pagerank_urls_in_folder(
        file_path, n_best_input, url_subpath_input
    )

    if not best_pages.empty:
        # One URL per line, already ordered from highest to lowest PageRank
        for url in best_pages["URL"]:
            print(url)
    else:
        print("No best PageRank URLs found based on the provided criteria.")
351+
]
234352
}
235353
],
236354
"metadata": {

0 commit comments

Comments
 (0)