#!/usr/bin/env python3
# SPDX-License-Identifier: CC0-1.0
#
# Copyright (C) 2025 Wojtek Kosior

"""Extract threat-group data from a saved HTML report and dump it as YAML.

Usage: script.py SAVED_PAGE.html > groups.yaml

The input is a single HTML document in which each group's "page" starts
with an ``<a name="...">`` anchor directly under ``<body>``, followed by
labelled sections ("Names", "Country", "Motivation", "Sectors:",
"Countries:", terminated by "Information").  The scraped groups are
written to stdout as YAML.
"""

from dataclasses import dataclass
from pathlib import Path
import sys
import time
import re

from bs4 import BeautifulSoup
import requests
import yaml


@dataclass
class Group:
    """Structured data scraped for a single threat group."""
    name: str
    aliases: list[str]
    origin: str
    motives: list[str]
    sectors: list[str]
    countries: list[str]


def sanitize(text):
    """Strip non-breaking spaces, outer whitespace and a dangling "("."""
    text = text.replace("\xa0", "").strip()
    if text and text[-1] == "(":
        text = text[:-1].strip()
    return text


def heavy_sanitize(text):
    """Lowercase *text* and drop spaces, dots, parentheses and "mostly"."""
    # sanitize() is re-applied on every pass so that whitespace exposed by
    # a removal (e.g. "finance .") is cleaned before the next replacement.
    for bad in [" ", ".", "mostly", "(", ")"]:
        text = sanitize(text).lower().replace(bad, "")
    return text


# Splits an enumeration such as "Finance, Energy and Government." into items.
# BUGFIX: the previous pattern ("[.]|;|,|[ \xa0]and|and[ \xa0]") also matched
# "and" *inside* a word when a space followed or preceded it — e.g.
# "Thailand " was split into "Thail" + "".  The \b word boundaries restrict
# matches to the standalone word "and".  Compiled once instead of being
# re-parsed for every group's sector and country lists.
LIST_SPLIT_RE = re.compile(r"[.;,]|\band\b")


def split_list_text(joined):
    """Split a comma/"and"-separated enumeration into sanitized items."""
    items = []
    for chunk in LIST_SPLIT_RE.split(joined):
        item = heavy_sanitize(chunk)
        if item:
            items.append(item)
    return items


path = Path(sys.argv[1])
soup = BeautifulSoup(path.read_text(), features="lxml")

# Every group's page starts with a named anchor directly under <body>.
all_pages = soup.select("body > a[name]")

# Maps the group name AND every alias to its Group, to detect duplicates.
groups_found = {}


def page_to_group(page):
    """Scrape one group page.

    Returns a Group, or None when the page is a duplicate of an already
    registered group or its data is incomplete (logged to stderr).
    """
    node = page

    # Advance to the "Names" heading; bail out if this page has none.
    while node.text != "\nNames\xa0":
        node = node.next_sibling
        if node is None:
            return None

    name_text = node.previous_sibling.previous_sibling.text
    name = sanitize(name_text.split(",")[0])
    if name in groups_found:
        return None

    def incomplete_data_abort(what):
        # Logs the problem; callers `return` its (None) result to skip
        # the page.
        print(f"Incomplete data for group {name} ({what}).", file=sys.stderr)

    # Collect aliases (italicised nodes) until the "Country" heading.
    aliases = []
    while True:
        node = node.next_sibling
        if node.name == "i":
            alias = sanitize(node.previous_sibling.text)
            if alias in groups_found:
                print(f"Alias {alias} of group {name} already registered.",
                      file=sys.stderr)
            elif alias and alias != name:
                aliases.append(alias)
        elif node.text == ("\nInformation\xa0"):
            return incomplete_data_abort("no country")
        elif node.text == "\nCountry\xa0":
            break

    origin = sanitize(node.next_sibling.next_sibling)

    # Advance to the "Motivation" heading.
    while node.text != "\nMotivation\xa0":
        if node.text == ("\nInformation\xa0"):
            return incomplete_data_abort("no motivation")
        node = node.next_sibling

    motives = []
    new_motives = node.next_sibling.next_sibling.split(",")
    new_motives = [heavy_sanitize(text) for text in new_motives]
    motives.extend(filter(None, new_motives))

    # Advance to the "Sectors:" text (may be absent; "Information" ends
    # the page).
    while (node.text != ("\nInformation\xa0")
           and not node.text.startswith("\nSectors:")):
        node = node.next_sibling

    # Concatenate the bare text nodes that make up the sectors list.
    sectors_string = ""
    while True:
        if node.name:  # a tag, not a text node — step over it
            node = node.next_sibling
            continue
        if (node.text == "\nInformation\xa0"
                or "Countries:" in node.text
                or ("\n" in node.text
                    and not node.text.startswith("\nSectors:"))):
            break
        new_sectors_text = node.text
        if "Sectors:" in new_sectors_text:
            new_sectors_text = new_sectors_text.split("Sectors:")[1]
        sectors_string += new_sectors_text
        node = node.next_sibling

    sectors = split_list_text(sectors_string)

    # Advance to the "Countries:" text and concatenate it the same way.
    while (node.text != ("\nInformation\xa0")
           and "Countries:" not in node.text):
        node = node.next_sibling

    countries_string = ""
    while True:
        if node.name:
            node = node.next_sibling
            continue
        if node.text == "\nInformation\xa0" or "\n" in node.text:
            break
        new_countries_text = node.text
        if "Countries:" in new_countries_text:
            new_countries_text = new_countries_text.split("Countries:")[1]
        countries_string += new_countries_text
        node = node.next_sibling

    countries = split_list_text(countries_string)

    return Group(
        name=name,
        aliases=aliases,
        origin=origin,
        motives=motives,
        sectors=sectors,
        countries=countries
    )


for page in all_pages:
    group = page_to_group(page)
    if group:
        # Register the group under its name and every alias so that later
        # pages describing the same actor are recognised as duplicates.
        for key in [group.name] + group.aliases:
            groups_found[key] = group

all_names = sorted(set(g.name for g in groups_found.values()))
yaml.safe_dump({"groups": [groups_found[name].__dict__ for name in all_names]},
               sys.stdout)