Source code for routinepy.lib.scraper.parsers.html.class_routine

from bs4 import BeautifulSoup

from routinepy.lib.scraper.models import RawClassRoutineTable


class ClassRoutineParser:
    """
    Parser for extracting class routine information from university HTML pages.

    This parser extracts three main components from the HTML structure:

    1. Metadata table (containing routine header information)
    2. Main routine table (containing schedule data)
    3. Faculty table (containing instructor information)
    """

    TABLE_META_SELECTOR = "table#HdtableRtn"
    """CSS selector for the metadata table (contains semester/program info)"""

    def __init__(self, html: str) -> None:
        """
        Initialize the :class:`ClassRoutineParser` with HTML content to be parsed.

        :param html: Raw HTML string containing the routine page content.
        :type html: str
        :raises ValueError: If the provided HTML is ``None``, empty, or
            consists only of whitespace.
        """
        # Check falsiness first so that ``None`` produces the documented
        # ValueError rather than an AttributeError from ``None.strip()``.
        if not html or not html.strip():
            raise ValueError("HTML content cannot be empty")
        self.soup = BeautifulSoup(html, "lxml")

    def extract_routine_tables(self) -> list[RawClassRoutineTable]:
        """
        Extract and package all routine table parts from the HTML.

        For every table matching :attr:`TABLE_META_SELECTOR`, the next
        ``<table>`` in document order is taken as the main routine table and
        the ``<table>`` after that one as the faculty table:

        - Metadata table: Contains semester, program and section information.
        - Main routine table: Contains the class schedule.
        - Faculty table: Contains teacher and course details.

        A metadata table that is not followed by any other table is skipped
        with a logged warning instead of aborting the whole extraction. A
        missing faculty table is passed through as ``None``.

        :return: A list of :class:`RawClassRoutineTable` objects, one per
            metadata table that has a routine table after it. The list is
            empty when the page contains no metadata tables.
        :rtype: list[RawClassRoutineTable]
        """
        # Imported locally so the block does not depend on a module-level
        # import that the rest of the file may not have.
        import logging

        logger = logging.getLogger(__name__)
        extracted_tables = []

        for meta in self.soup.select(self.TABLE_META_SELECTOR):
            # NOTE(review): find_next() walks the document in order, so a
            # table nested *inside* the metadata table would be matched here
            # first. This assumes routine pages do not nest tables.
            routine = meta.find_next("table")
            if routine is None:
                # Without this guard the chained find_next() below raised
                # AttributeError on None and lost every other routine too.
                logger.warning(
                    "Skipping metadata table matching %r: no routine table follows it",
                    self.TABLE_META_SELECTOR,
                )
                continue

            extracted_tables.append(
                RawClassRoutineTable(
                    meta=meta,
                    routine=routine,
                    # May be None when no further table exists on the page.
                    faculty_table=routine.find_next("table"),
                )
            )

        return extracted_tables