Adding URL filtering logic to the Jupyter notebook

martech-engineer · martech-engineer · commit 51439250631a · 2025-07-10T11:37:36.000+02:00
diff --git a/src/backend/services/filering_url_connections.ipynb b/src/backend/services/filering_url_connections.ipynb
@@ -231,6 +231,124 @@
     "else:\n",
     "    print(\"No random URLs found based on the provided criteria.\")"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "q5SXzww0ujhd"
+   },
+   "source": [
+    "# FILTERING URLS BY FOLDER/SUBPATH"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "kgfiA5y8ujzB"
+   },
+   "outputs": [],
+   "source": [
+    "from google.colab import drive\n",
+    "import pandas as pd\n",
+    "import os\n",
+    "\n",
+    "# Mount Google Drive at /content/drive so the data folder below is reachable\n",
+    "drive.mount(\"/content/drive\")\n",
+    "\n",
+    "# Path to the URL analysis CSV that get_best_pagerank_urls_in_folder reads;\n",
+    "# that function requires the file to provide 'URL' and 'PageRank' columns\n",
+    "base_path = \"/content/drive/My Drive/WebKnoGraph/data\"\n",
+    "file_path = os.path.join(base_path, \"url_analysis_results.csv\")\n",
+    "\n",
+    "\n",
+    "def get_best_pagerank_urls_in_folder(file_path, n_best, url_subpath_filter=None):\n",
+    "    \"\"\"\n",
+    "    Retrieves the top N URLs, ranked by PageRank, optionally restricted to a URL subpath.\n",
+    "\n",
+    "    Args:\n",
+    "        file_path (str): The path to the CSV file containing URL analysis results.\n",
+    "            The file must provide 'URL' and 'PageRank' columns.\n",
+    "        n_best (int): The number of best PageRank URLs to retrieve. Values larger than\n",
+    "            the number of matching rows are clamped (with a warning); negative values\n",
+    "            are treated as 0.\n",
+    "        url_subpath_filter (str, optional): A URL subpath to filter by (e.g., '/learning-spaces/').\n",
+    "            Matched as a literal substring (not a regular expression); only URLs\n",
+    "            containing it are considered. Defaults to None (no filtering).\n",
+    "\n",
+    "    Returns:\n",
+    "        pandas.DataFrame: A DataFrame containing only the 'URL' column of the best PageRank\n",
+    "            URLs, highest PageRank first. An empty DataFrame is returned when the file is\n",
+    "            missing or empty, or when no URL matches the filter.\n",
+    "\n",
+    "    Raises:\n",
+    "        ValueError: If the CSV lacks the 'URL' or 'PageRank' column.\n",
+    "    \"\"\"\n",
+    "    try:\n",
+    "        df = pd.read_csv(file_path)\n",
+    "    except FileNotFoundError:\n",
+    "        print(f\"Error: File not found at {file_path}\")\n",
+    "        return pd.DataFrame()  # Return empty DataFrame on error\n",
+    "    except pd.errors.EmptyDataError:\n",
+    "        # A zero-byte file has no header row, so read_csv raises instead of\n",
+    "        # returning an empty frame; report it like any other empty CSV.\n",
+    "        print(\"The CSV file is empty.\")\n",
+    "        return pd.DataFrame()\n",
+    "\n",
+    "    required_cols = {\"URL\", \"PageRank\"}  # PageRank is still required for sorting\n",
+    "    missing_cols = required_cols - set(df.columns)\n",
+    "    if missing_cols:\n",
+    "        raise ValueError(f\"Missing one or more required columns: {missing_cols}\")\n",
+    "\n",
+    "    if df.empty:\n",
+    "        print(\"The CSV file is empty.\")\n",
+    "        return pd.DataFrame()\n",
+    "\n",
+    "    # Apply URL subpath filter if provided\n",
+    "    filtered_df = df\n",
+    "    if url_subpath_filter:\n",
+    "        # regex=False: the filter is a plain path fragment, so characters such as\n",
+    "        # '.', '?' or '(' must match literally instead of acting as regex syntax.\n",
+    "        matches = (\n",
+    "            df[\"URL\"].astype(str).str.contains(url_subpath_filter, regex=False, na=False)\n",
+    "        )\n",
+    "        filtered_df = df[matches]\n",
+    "        if filtered_df.empty:\n",
+    "            print(f\"No URLs found matching the subpath filter: '{url_subpath_filter}'\")\n",
+    "            return pd.DataFrame()\n",
+    "\n",
+    "    # Sort by 'PageRank' in descending order (highest PageRank = best performer)\n",
+    "    sorted_df = filtered_df.sort_values(by=\"PageRank\", ascending=False)\n",
+    "\n",
+    "    if n_best < 0:\n",
+    "        # head() with a negative count returns all rows except the last |n|, which\n",
+    "        # would silently hand back the wrong URLs; treat it as a request for none.\n",
+    "        print(\n",
+    "            f\"Warning: Requested a negative number of URLs ({n_best}). Returning no URLs.\"\n",
+    "        )\n",
+    "        n_best = 0\n",
+    "    elif n_best > len(sorted_df):\n",
+    "        # Ensure n_best does not exceed the number of available URLs after filtering\n",
+    "        print(\n",
+    "            f\"Warning: Requested {n_best} best URLs, but only {len(sorted_df)} are available after filtering. Returning all available URLs.\"\n",
+    "        )\n",
+    "        n_best = len(sorted_df)\n",
+    "\n",
+    "    # Get the top N best PageRank URLs, only selecting the 'URL' column\n",
+    "    return sorted_df.head(n_best)[[\"URL\"]]\n",
+    "\n",
+    "\n",
+    "# --- User Inputs ---\n",
+    "try:\n",
+    "    n_best_input = int(input(\"Enter number of best PageRank URLs to retrieve: \"))\n",
+    "except ValueError:\n",
+    "    # Do not call exit() here: in a notebook kernel it requests a kernel shutdown\n",
+    "    # (dropping the Drive mount and all variables) and does not stop the rest of\n",
+    "    # this cell from running first. Skip the analysis below instead.\n",
+    "    n_best_input = None\n",
+    "    print(\"Invalid input. Please enter an integer for the count.\")\n",
+    "\n",
+    "if n_best_input is not None:\n",
+    "    # Prompt for the subpath filter; surrounding whitespace is dropped and a blank\n",
+    "    # answer means \"no filter\" (None)\n",
+    "    url_subpath_input = (\n",
+    "        input(\n",
+    "            \"Enter URL subpath to filter by (e.g., /learning-spaces/, leave empty for no filter): \"\n",
+    "        ).strip()\n",
+    "        or None\n",
+    "    )\n",
+    "\n",
+    "    # --- Run Analysis ---\n",
+    "    if url_subpath_input:\n",
+    "        print(\n",
+    "            f\"\\nRetrieving {n_best_input} best PageRank URLs from subpath '{url_subpath_input}':\"\n",
+    "        )\n",
+    "    else:\n",
+    "        print(f\"\\nRetrieving {n_best_input} best PageRank URLs (no subpath filter):\")\n",
+    "\n",
+    "    best_pages = get_best_pagerank_urls_in_folder(\n",
+    "        file_path, n_best_input, url_subpath_input\n",
+    "    )\n",
+    "\n",
+    "    if not best_pages.empty:\n",
+    "        # Print one URL per line, best PageRank first\n",
+    "        for url in best_pages[\"URL\"]:\n",
+    "            print(url)\n",
+    "    else:\n",
+    "        print(\"No best PageRank URLs found based on the provided criteria.\")"
+   ]
   }
  ],
  "metadata": {