Skip to content

Commit 770f14e

Browse files
Adding new logic to filtering connections ipynb
1 parent 768ad33 commit 770f14e

1 file changed

Lines changed: 102 additions & 0 deletions

File tree

src/backend/services/filering_url_connections.ipynb

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,108 @@
349349
"else:\n",
350350
" print(\"No best PageRank URLs found based on the provided criteria.\")"
351351
]
352+
},
353+
{
354+
"cell_type": "markdown",
355+
"metadata": {
356+
"id": "c0huYdotnVlL"
357+
},
358+
"source": [
359+
"# FINDING BEST PERFORMERS ON FOLDER DEPTH INTERVAL"
360+
]
361+
},
362+
{
363+
"cell_type": "code",
364+
"execution_count": null,
365+
"metadata": {
366+
"id": "xn5xIY1QnVWz"
367+
},
368+
"outputs": [],
369+
"source": [
370+
"from google.colab import drive\n",
371+
"import pandas as pd\n",
372+
"import os\n",
373+
"\n",
374+
"# Mount Google Drive\n",
375+
"drive.mount(\"/content/drive\")\n",
376+
"\n",
377+
"# Define path to CSV file\n",
378+
"base_path = \"/content/drive/My Drive/WebKnoGraph/data\"\n",
379+
"file_path = os.path.join(base_path, \"url_analysis_results.csv\")\n",
380+
"\n",
381+
"\n",
382+
"def get_overall_top_performers_in_range(file_path, min_depth, max_depth, n_total_top):\n",
383+
" \"\"\"\n",
384+
" Retrieves the overall top N performing URLs (based on PageRank) within a defined range of folder depth levels.\n",
385+
"\n",
386+
" Args:\n",
387+
" file_path (str): The path to the CSV file containing URL analysis results.\n",
388+
" min_depth (int): The minimum folder depth level in the range (inclusive).\n",
389+
" max_depth (int): The maximum folder depth level in the range (inclusive).\n",
390+
" n_total_top (int): The total number of top performing URLs to retrieve across the entire range.\n",
391+
"\n",
392+
" Returns:\n",
393+
" pandas.DataFrame: A DataFrame containing the 'URL' column of the overall top performers.\n",
394+
" Returns an empty DataFrame if no matching data or missing columns.\n",
395+
" \"\"\"\n",
396+
" try:\n",
397+
" df = pd.read_csv(file_path)\n",
398+
" except FileNotFoundError:\n",
399+
" print(f\"Error: File not found at {file_path}\")\n",
400+
" return pd.DataFrame() # Return empty DataFrame on error\n",
401+
"\n",
402+
" required_cols = {\"URL\", \"Folder_Depth\", \"PageRank\"}\n",
403+
" if not required_cols.issubset(df.columns):\n",
404+
" raise ValueError(\n",
405+
" f\"Missing one or more required columns: {required_cols - set(df.columns)}\"\n",
406+
" )\n",
407+
"\n",
408+
" # Filter by the defined folder depth range\n",
409+
" filtered_df = df[\n",
410+
" (df[\"Folder_Depth\"] >= min_depth) & (df[\"Folder_Depth\"] <= max_depth)\n",
411+
" ]\n",
412+
"\n",
413+
" if filtered_df.empty:\n",
414+
" print(f\"No data found for folder depths between {min_depth} and {max_depth}.\")\n",
415+
" return pd.DataFrame()\n",
416+
"\n",
417+
" # Sort the entire filtered DataFrame by 'PageRank' in descending order\n",
418+
" sorted_df = filtered_df.sort_values(by=\"PageRank\", ascending=False)\n",
419+
"\n",
420+
" # Get the overall top N performers\n",
421+
" overall_top_performers = sorted_df.head(n_total_top)\n",
422+
"\n",
423+
" return overall_top_performers[[\"URL\"]]\n",
424+
"\n",
425+
"\n",
426+
"# --- User Inputs ---\n",
427+
"try:\n",
428+
" min_depth_input = int(input(\"Enter minimum folder depth level (integer): \"))\n",
429+
" max_depth_input = int(input(\"Enter maximum folder depth level (integer): \"))\n",
430+
" n_total_top_input = int(input(\"Enter total number of top performers to retrieve: \"))\n",
431+
"\n",
432+
" if min_depth_input > max_depth_input:\n",
433+
" print(\"Error: Minimum depth cannot be greater than maximum depth.\")\n",
434+
" exit()\n",
435+
"\n",
436+
"except ValueError:\n",
437+
" print(\"Invalid input. Please enter integers for depth range and count.\")\n",
438+
" exit() # Exit if inputs are not valid integers\n",
439+
"\n",
440+
"# --- Run Analysis ---\n",
441+
"print(\n",
442+
" f\"\\nRetrieving the overall top {n_total_top_input} performers from folder depths {min_depth_input} to {max_depth_input}:\"\n",
443+
")\n",
444+
"overall_top_pages = get_overall_top_performers_in_range(\n",
445+
" file_path, min_depth_input, max_depth_input, n_total_top_input\n",
446+
")\n",
447+
"\n",
448+
"if not overall_top_pages.empty:\n",
449+
" for url in overall_top_pages[\"URL\"]:\n",
450+
" print(url)\n",
451+
"else:\n",
452+
" print(\"No overall top performers found based on the provided criteria.\")"
453+
]
352454
}
353455
],
354456
"metadata": {

0 commit comments

Comments
 (0)