Source code for routinepy.lib.scraper.parsers.html.routine_page

from typing import Optional
from urllib.parse import parse_qs, urlparse

from bs4 import BeautifulSoup, Tag
from loguru import logger

from ...scraper_utils import normalize_url


class RoutinePageParser:
    """
    A parser for extracting routine information from university web pages.

    This class contains CSS selectors used to locate and extract different types of
    routines (regular class, exams, supplementary exams) from the
    `university's webpage <https://bubt.edu.bd/home/routines>`_
    """

    # Container selectors
    ROUTINE_CONTAINER_SELECTOR = "div#accordion.panel-group"
    """CSS Selector of the main container element holding all routine sections"""

    CLASS_ACCORDION_SELECTOR = "div#accordionClassRoutine"
    """CSS Selector of the container specifically for class routine table"""

    EXAM_ACCORDION_SELECTOR = "div#Exam_Routine"
    """CSS Selector of the container specifically for exam routine table"""

    SUP_EXAM_ACCORDION_SELECTOR = "div#Sup_Exam_Routine"
    """CSS Selector of the container specifically for supplementary exam routine table"""

    # Cell selectors
    EXAM_NAME_CELL_SELECTOR = "td:nth-child(1)"
    """
    CSS selector for exam name cells in tables

    Targets the first column of exam routine tables for name
    """

    CLASS_LINK_SELECTOR = "a.btn.btn-primary"
    """
    CSS selector for class routine links

    Targets primary button-style link for getting `View` link
    """

    def __init__(self, html: str):
        """
        Initialize the :class:`RoutinePageParser` with HTML content to be parsed.

        :param html: Raw HTML string containing the routine page content.
        :type html: str
        :raises ValueError: If the provided HTML is ``None``, empty, or
            whitespace-only.
        """
        # Check falsiness first so that ``None`` produces the documented
        # ValueError instead of an AttributeError from ``None.strip()``.
        if not html or not html.strip():
            raise ValueError("HTML content cannot be empty")
        self.soup = BeautifulSoup(html, "lxml")

    @staticmethod
    def _extract_prog_and_sem_no(query_params: dict) -> dict[str, str]:
        """
        Extract program and semester codes from URL query parameters.

        :param query_params: Parsed query parameters from a URL (from parse_qs)
        :type query_params: dict
        :return: A dictionary with the keys ``program_code`` (first value of the
            ``p`` parameter) and ``semester_code`` (first value of the ``semNo``
            parameter). A missing parameter yields an empty string.
        :rtype: dict[str, str]

        **Example**:

        .. code-block:: python

            >>> self._extract_prog_and_sem_no({'p': ['006'], 'semNo': ['610,064']})
            {'program_code': '006', 'semester_code': '610,064'}
        """
        # parse_qs maps every key to a list of values; only the first is used.
        program_code = query_params.get("p", [""])[0]
        semester_code = query_params.get("semNo", [""])[0]
        return {"program_code": program_code, "semester_code": semester_code}

    @staticmethod
    def _extract_title(container: Tag) -> str:
        """
        Extract the title of a routine table.

        :param container: A BeautifulSoup Tag object containing the table.
        :type container: Tag
        :return: The stripped text of the first ``h2`` tag within the container,
            or ``"Untitled"`` if there is none.
        :rtype: str
        """
        title_tag = container.select_one("h2")
        return title_tag.get_text(strip=True) if title_tag else "Untitled"

    @logger.catch()
    def _parse_routine_table(
        self,
        container_id: str,
        is_class: bool = False,
    ) -> Optional[dict[str, object]]:
        """
        Parse a routine table to extract its title and links.

        :param container_id: CSS selector for the table container
            (e.g., `div#accordionClassRoutine`).
        :type container_id: str
        :param is_class: If True, parse as class routine table; otherwise, parse
            as exam routine table. Defaults to False.
        :type is_class: bool
        :return: A dictionary containing ``title`` (str, the title of the routine
            table) and ``links`` (list of link data dictionaries whose structure
            depends on ``is_class``).
        :rtype: dict or None
        :raises ValueError: If the specified container is not found in the HTML.

        .. note::
            The method is wrapped in ``@logger.catch()``. With loguru's default
            ``reraise=False`` the ``ValueError`` above (and any other exception
            raised while parsing) is logged and ``None`` is returned instead of
            the exception reaching the caller.
        """
        container = self.soup.select_one(container_id)
        if not container:
            raise ValueError(f"Routine container {container_id} not found")

        title = self._extract_title(container)
        links = self._parse_links(container, is_class=is_class)
        return {"title": title, "links": links}

    def _parse_links(
        self,
        container: Tag,
        is_class: bool = False,
    ) -> list[dict]:
        """
        Extract link data from every body row of a routine table.

        :param container: A BeautifulSoup Tag object containing the table.
        :type container: Tag
        :param is_class: If True, parse rows as class routines; otherwise, parse
            them as exam routines. Defaults to False.
        :type is_class: bool
        :return: A list of dictionaries containing link data. For class routines
            each dictionary has ``program_code``, ``semester_code`` and ``link``.
            For exam routines each dictionary maps an exam name to a list of
            name-URL pairs. Rows for which the row parser returns nothing are
            skipped.
        :rtype: list[dict]
        """
        # The row parser does not change between rows, so pick it once.
        parse_row = (
            self._parse_class_row_link if is_class else self._parse_exam_row_link
        )
        return [
            link
            for row in container.select("table > tbody > tr")
            if (link := parse_row(row))
        ]

    def _parse_class_row_link(self, row: Tag) -> Optional[dict[str, str]]:
        """
        Parse a class routine table row to extract link data.

        :param row: A BeautifulSoup Tag object representing a table row.
        :type row: Tag
        :return: A dictionary with the keys ``program_code`` (the ``p`` query
            parameter), ``semester_code`` (the ``semNo`` query parameter) and
            ``link`` (the ``href`` value exactly as found in the row), or
            ``None`` if the row lacks a link or the link has no query parameters.
        :rtype: dict[str, str] or None
        """
        link_tag = row.select_one(self.CLASS_LINK_SELECTOR)
        if not link_tag or not (link := link_tag.get("href")):
            return None

        parsed_url = urlparse(link)
        if not parsed_url.query:
            return None

        # parse_qs drops blank values, so a non-empty query string can still
        # produce an empty mapping (e.g. "p=").
        query_params = parse_qs(parsed_url.query)
        if not query_params:
            return None

        meta = self._extract_prog_and_sem_no(query_params)
        meta["link"] = link
        return meta

    def _parse_exam_row_link(
        self,
        row: Tag,
    ) -> Optional[dict[str, list[dict[str, str]]]]:
        """
        Parse an exam routine table row to extract link data.

        :param row: A BeautifulSoup Tag object representing a table row.
        :type row: Tag
        :return: A dictionary mapping the exam name (text of the row's first
            cell) to a list of single-entry dictionaries, each mapping a link's
            whitespace-collapsed text to the result of ``normalize_url`` for its
            ``href``. ``None`` if the row has no name cell.
        :rtype: dict[str, list[dict[str, str]]] or None
        """
        name_cell = row.select_one(self.EXAM_NAME_CELL_SELECTOR)
        if not name_cell:
            return None

        exam_name = name_cell.get_text(strip=True)

        urls = []
        for anchor in row.find_all("a"):
            href = anchor.get("href")
            if not href:
                # Anchors without a target carry no routine link.
                continue
            # Collapse internal runs of whitespace/newlines in the link text.
            label = " ".join(anchor.get_text(strip=True).split())
            urls.append({label: normalize_url(href)})

        return {exam_name: urls}