Skip to content
8 changes: 6 additions & 2 deletions catalog/management/commands/clean_archives.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,22 @@
from typing import Any

import logging

from django.core.management.base import BaseCommand
from django.db.models import Count

from catalog.models import Course

logger = logging.getLogger(__name__)


class Command(BaseCommand):
    """Management command that removes archived courses with no documents."""

    def handle(self, *args: Any, **options: Any) -> None:
        """Delete every archived course whose ``document`` count is zero.

        Progress is reported through the module logger rather than ``print``
        so that output follows the project's logging configuration.

        Args:
            *args: Positional arguments forwarded by Django; unused.
            **options: Parsed command-line options; unused.
        """
        logger.info("Cleaning archives")
        empty_archived_courses = (
            Course.objects.filter(is_archive=True)
            .annotate(num_doc=Count("document"))
            .filter(num_doc=0)
        )
        # count() runs a COUNT query in the database; len() would fetch and
        # instantiate every matching row only to report how many there are.
        logger.info("Deleting %s empty courses", empty_archived_courses.count())
        empty_archived_courses.delete()
11 changes: 7 additions & 4 deletions catalog/management/commands/crawl_uv.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import csv
import logging

from django.core.management.base import BaseCommand

import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)


class Command(BaseCommand):
def handle(self, *args, **options):
Expand All @@ -17,17 +20,17 @@ def handle(self, *args, **options):

courses = []
fails = []
print(f"Found {len(options)} options")
logger.info("Found %s options", len(options)) # debug car verbeux
for option in options:
print(f"..{option.text}")
logger.debug("..%s", option.text)
value = option["value"]
response = requests.get(
f"https://uv.ulb.ac.be/course/index.php?categoryid={value}&browse=courses&perpage=1000&page=0"
)
soup = BeautifulSoup(response.content, "html.parser")

course_divs = soup.find_all("div", {"class": "coursebox"})
print(f"Found {len(courses)} in {option.text}")
logger.info("Found %s in %s", len(courses), option.text)

for course in course_divs:
try:
Expand All @@ -37,7 +40,7 @@ def handle(self, *args, **options):
except: # noqa
fails.append(course.text)

print(f"Found {len(courses)} and failed to parse {len(fails)}")
logger.info("Found %s and failed to parse %s", len(courses), len(fails))
with open("csv/uv_courses.csv", "w") as fd:
writer = csv.writer(fd)
for course in courses:
Expand Down
55 changes: 29 additions & 26 deletions catalog/management/commands/download_program_contents.py
Original file line number Diff line number Diff line change
@@ -1,76 +1,76 @@
from typing import Any

import json
import logging
from urllib.parse import quote

from django.core.management import BaseCommand

import requests
from rich import print
from rich.progress import MofNCompleteColumn, Progress, SpinnerColumn

logger = logging.getLogger(__name__)


class Command(BaseCommand):
help = ""
help = "Download course contents for all programs from ULB API"

def handle(self, *args: Any, **options: Any) -> None:
with open("csv/programs.json") as f:
programs: list[dict] = json.load(f)
print("\n[bold blue]Listing the course content of all programs...[/]\n")
logger.info("Listing the course content of all programs...")

failed: list = []
program_content: dict[str, dict[str, dict]] = {}

# programs = [p for p in programs if p["slug"] in ["BA-GEOG"]]

with Progress(
SpinnerColumn(),
*Progress.get_default_columns(),
MofNCompleteColumn(),
) as progress:
task1 = progress.add_task(
"Listing the course content of all programs...", total=len(programs)
)
task1 = progress.add_task("Processing programs...", total=len(programs))

for progam in programs:
slug_upper = progam["slug"].upper()
progress.update(
task1,
advance=1,
description=f"Listing the course content of {progam['slug'].upper()}...",
description=f"Listing content of {slug_upper}...",
)

if "parent" in progam:
qs = f"/ksup/programme?gen=prod&anet={progam['parent'].upper()}&option={progam['slug'].upper()}&lang=fr"
qs = f"/ksup/programme?gen=prod&anet={progam['parent'].upper()}&option={slug_upper}&lang=fr"
else:
qs = f"/ksup/programme?gen=prod&anet={progam['slug'].upper()}&lang=fr"
qs = f"/ksup/programme?gen=prod&anet={slug_upper}&lang=fr"

URL = f"https://www.ulb.be/api/formation?path={quote(qs)}"
try:
response = requests.get(URL)
if not response.ok:
if "parent" in progam:
print(
f"[yellow]Skip:[/] [magenta]{progam['slug'].upper()}[/] with bogus parent {progam['parent'].upper()}"
# Use logger.warning for skips/retries
logger.warning(
"Skip: %s with bogus parent %s. Retrying...",
slug_upper,
progam["parent"].upper(),
)

print("Retry")
qs = f"/ksup/programme?gen=prod&anet={progam['slug'].upper()}&lang=fr"
qs = f"/ksup/programme?gen=prod&anet={slug_upper}&lang=fr"
URL = f"https://www.ulb.be/api/formation?path={quote(qs)}"
response = requests.get(URL)
if not response.ok:
print("Retry failed")
logger.error("Retry failed for %s", slug_upper)
continue

else:
print(
f"[red]Error:[/] [magenta]{progam['slug'].upper()}[/] failed with {response.status_code}"
logger.error(
"%s failed with %s. URL: %s",
slug_upper,
response.status_code,
URL,
)
print(" ", URL)
continue

except Exception:
print(f"[red]Error:[/] Failed to GET {progam['slug'].upper()}")
print(" URL", URL)
progress.console.print_exception()
logger.exception("Failed to GET %s. URL: %s", slug_upper, URL)
continue

try:
Expand All @@ -92,8 +92,11 @@ def handle(self, *args: Any, **options: Any) -> None:
}
except Exception:
failed.append(progam["slug"])
print(f"Error while listing content of {progam['slug']}")
progress.console.print_exception()
logger.exception(
"Error while listing content of %s", progam["slug"]
)

with open("csv/courses.json", "w+") as all_courses_json:
json.dump(program_content, all_courses_json, indent=2)

logger.info("Course content download complete. Saved to csv/courses.json")
37 changes: 19 additions & 18 deletions catalog/management/commands/download_programs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,29 @@
from typing import Any

import json
import logging
import re

from django.core.management import BaseCommand

import requests
from bs4 import BeautifulSoup
from rich import print
from rich.progress import MofNCompleteColumn, Progress, SpinnerColumn

logger = logging.getLogger(__name__)


class Command(BaseCommand):
help = ""
help = "Download the list of available programs from ULB"

PAGE_SIZE = 20

URL = f"https://www.ulb.be/servlet/search?beanKey=beanKeyRechercheFormation&types=formation&natureFormation=ulb&s=FACULTE_ASC&limit={PAGE_SIZE}"

def handle(self, *args: Any, **options: Any) -> None:
programs: list[dict] = []

parent_programs: set[str] = set()
print("[bold blue]Gathering the list of available programs...[/]\n")

logger.info("Gathering the list of available programs...")

with Progress(
SpinnerColumn(),
Expand All @@ -36,9 +37,11 @@ def handle(self, *args: Any, **options: Any) -> None:
task1 = progress.add_task(
"Listing available programs...", total=result_count
)
progress.console.print(

logger.info(
"Querying ULB a first time to count the number of programs available..."
)

while page < last_page:
response = requests.get(self.URL + f"&page={page}")
soup = BeautifulSoup(response.content, "html.parser")
Expand All @@ -53,14 +56,15 @@ def handle(self, *args: Any, **options: Any) -> None:
r"a( +)donné( +)(?P<count>\d+)( +)résultats", result_count_text
):
result_count = int(match.group("count"))

else:
raise Exception(
f"Could not parse result count ({result_count_text})"
)

last_page = int(result_count / self.PAGE_SIZE) + 1
progress.console.print(
f"Found {result_count} programs on {last_page} pages..."

logger.info(
"Found %s programs on %s pages...", result_count, last_page
)
progress.update(task1, total=result_count)

Expand All @@ -73,7 +77,6 @@ def handle(self, *args: Any, **options: Any) -> None:
program_name = mnemonic_span.find_previous(
"strong", {"class": "search-result__structure-intitule"}
).text

faculties: list = []
for elem in fac:
children = elem.findChildren()
Expand All @@ -89,7 +92,6 @@ def handle(self, *args: Any, **options: Any) -> None:
"name": program_name,
"faculty": faculties,
}

if option_div := mnemonic_span.find_previous(
"div", {"class": "search-result__resultat--fille"}
):
Expand All @@ -104,18 +106,17 @@ def handle(self, *args: Any, **options: Any) -> None:

programs.append(p)
else:
progress.console.print(
f"Skipping already seen [magenta]{mnemonic_span.text}"
)
logger.debug("Skipping already seen %s", mnemonic_span.text)
progress.update(task1, completed=self.PAGE_SIZE * page)
page += 1

print(
f"Found {len(parent_programs)} programs containing options, ignoring those..."
logger.info(
"Found %s programs containing options, ignoring those...",
len(parent_programs),
)
print(parent_programs)
logger.debug("Ignored programs: %s", parent_programs)
programs = [p for p in programs if p["slug"] not in parent_programs]

print(f"Found {len(programs)} distinct programs, dumping to json...")
logger.info("Found %s distinct programs, dumping to json...", len(programs))
with open("csv/programs.json", "w") as f:
json.dump(programs, f, indent=4)
10 changes: 7 additions & 3 deletions catalog/management/commands/find_orphans.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import csv
import json
import logging

from django.core.management.base import BaseCommand
from django.db.models import Count

from catalog.models import Course
from catalog.slug import normalize_slug

logger = logging.getLogger(__name__)


class Command(BaseCommand):
def handle(self, *args, **options):
Expand All @@ -26,10 +29,11 @@ def handle(self, *args, **options):
empty_orphans = orphans.filter(num_docs=0)

orphans_to_fix = orphans.exclude(num_docs=0)
print(
f"{empty_orphans.count()} empty orphans and {orphans_to_fix.count()} orphans with documents"
logger.info(
"%s empty orphans and %s orphans with documents",
empty_orphans.count(),
orphans_to_fix.count(),
)

with open("csv/orphans.csv", "w") as fd:
writer = csv.writer(fd)
for course in orphans_to_fix:
Expand Down
7 changes: 5 additions & 2 deletions catalog/management/commands/load_courses.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import json
import logging

from django.core.management.base import BaseCommand
from django.db import transaction

from catalog.models import Category, Course
from catalog.slug import normalize_slug

logger = logging.getLogger(__name__)


def get_category(slug, name=None, parent=None, type=None):
cat, created = Category.objects.get_or_create(
Expand All @@ -22,13 +25,13 @@ def handle(self, *args, **options):
programs = json.load(f)

with transaction.atomic():
print("Temporarily set all courses as archived")
logger.info("Temporarily set all courses as archived")
for course in Course.objects.all():
course.is_archive = True
course.save()

for program_slug, courses in programs.items():
print(f"Inserting {len(courses)} courses from {program_slug}")
logger.info("Inserting %s courses from %s", len(courses), program_slug)
category = get_category(program_slug)
for course in courses.values():
bloc = course["bloc"]
Expand Down
Loading