Add random URL filtering to the URL connections CSV filtering notebook

martech-engineer · martech-engineer · commit 1da264698816 · 2025-07-09T12:04:04.000+02:00
diff --git a/src/backend/services/filering_url_connections.ipynb b/src/backend/services/filering_url_connections.ipynb
@@ -140,6 +140,97 @@
     "else:\n",
     "    print(\"No top performers found based on the provided criteria.\")"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "YbuR_a9ij-Hk"
+   },
+   "source": [
+    "# FINDING RANDOM URLS"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "fSs-MIP2j8pX"
+   },
+   "outputs": [],
+   "source": [
+    "from google.colab import drive\n",
+    "import pandas as pd\n",
+    "import os\n",
+    "import random\n",
+    "\n",
+    "# Mount Google Drive at /content/drive so files under it can be read by path\n",
+    "drive.mount(\"/content/drive\")\n",
+    "\n",
+    "# Define path to CSV file: base_path is the data folder inside the mounted Drive,\n",
+    "# file_path is the url_analysis_results.csv file within that folder\n",
+    "base_path = \"/content/drive/My Drive/WebKnoGraph/data\"\n",
+    "file_path = os.path.join(base_path, \"url_analysis_results.csv\")\n",
+    "\n",
+    "\n",
+    "def get_random_urls(file_path, n_random, random_state=None):\n",
+    "    \"\"\"\n",
+    "    Retrieves a random sample of N URLs from the entire dataset, regardless of folder depth.\n",
+    "\n",
+    "    Args:\n",
+    "        file_path (str): The path to the CSV file containing URL analysis results.\n",
+    "        n_random (int): The number of random URLs to retrieve. A value larger than\n",
+    "                        the number of rows is reduced to the number of rows.\n",
+    "        random_state (int, optional): Seed passed to DataFrame.sample. None (the default)\n",
+    "                        draws a different sample on each call; an int makes it repeatable.\n",
+    "\n",
+    "    Returns:\n",
+    "        pandas.DataFrame: A DataFrame containing the 'URL' column of the random URLs.\n",
+    "                          Returns an empty DataFrame if the file is missing or has no data.\n",
+    "\n",
+    "    Raises:\n",
+    "        ValueError: If the 'URL' column is missing or n_random is negative.\n",
+    "    \"\"\"\n",
+    "    try:\n",
+    "        df = pd.read_csv(file_path)\n",
+    "    except FileNotFoundError:\n",
+    "        print(f\"Error: File not found at {file_path}\")\n",
+    "        return pd.DataFrame()  # Return empty DataFrame on error\n",
+    "    except pd.errors.EmptyDataError:\n",
+    "        # A file with no content has no header row to parse; treat it as an empty dataset\n",
+    "        print(\"The CSV file is empty.\")\n",
+    "        return pd.DataFrame()\n",
+    "\n",
+    "    required_cols = {\"URL\"}  # Only 'URL' is strictly required for this function\n",
+    "    if not required_cols.issubset(df.columns):\n",
+    "        raise ValueError(\n",
+    "            f\"Missing one or more required columns: {required_cols - set(df.columns)}\"\n",
+    "        )\n",
+    "\n",
+    "    if df.empty:\n",
+    "        print(\"The CSV file is empty.\")\n",
+    "        return pd.DataFrame()\n",
+    "\n",
+    "    # Reject a negative count here so the error names the offending argument\n",
+    "    if n_random < 0:\n",
+    "        raise ValueError(f\"n_random must be non-negative, got {n_random}\")\n",
+    "\n",
+    "    # Ensure n_random does not exceed the number of available URLs\n",
+    "    if n_random > len(df):\n",
+    "        print(\n",
+    "            f\"Warning: Requested {n_random} random URLs, but only {len(df)} are available. Returning all available URLs.\"\n",
+    "        )\n",
+    "        n_random = len(df)\n",
+    "\n",
+    "    # Only the 'URL' column is returned, so sample that column alone\n",
+    "    return df[[\"URL\"]].sample(n=n_random, random_state=random_state)\n",
+    "\n",
+    "\n",
+    "# --- User Inputs ---\n",
+    "# Bad input must not call exit(): in a notebook that asks the kernel to shut down.\n",
+    "# An unusable count is recorded as None and the analysis below is skipped instead.\n",
+    "try:\n",
+    "    n_random_input = int(input(\"Enter number of random URLs to retrieve: \"))\n",
+    "except ValueError:\n",
+    "    print(\"Invalid input. Please enter an integer for the count.\")\n",
+    "    n_random_input = None\n",
+    "\n",
+    "# A negative sample size cannot be satisfied, so reject it before sampling\n",
+    "if n_random_input is not None and n_random_input < 0:\n",
+    "    print(\"Invalid input. The count cannot be negative.\")\n",
+    "    n_random_input = None\n",
+    "\n",
+    "# --- Run Analysis ---\n",
+    "if n_random_input is not None:\n",
+    "    print(f\"\\nRetrieving {n_random_input} random URLs:\")\n",
+    "    random_pages = get_random_urls(file_path, n_random_input)\n",
+    "\n",
+    "    if not random_pages.empty:\n",
+    "        for url in random_pages[\"URL\"]:\n",
+    "            print(url)\n",
+    "    else:\n",
+    "        print(\"No random URLs found based on the provided criteria.\")"
+   ]
   }
  ],
  "metadata": {