1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
|
#!/usr/bin/env python3
# SPDX-License-Identifier: CC0-1.0
#
# Copyright (C) 2025 Wojtek Kosior <koszko@koszko.org>
from dataclasses import dataclass
from pathlib import Path
import sys
import time
import re
from bs4 import BeautifulSoup
import requests
import yaml
@dataclass
class Group:
    """One threat group extracted from the scraped document.

    All string fields are run through sanitize()/heavy_sanitize(), so the
    list fields hold lowercased tokens with spaces and punctuation removed.
    """
    # Primary (canonical) group name, first entry of the "Names" field.
    name: str
    # Alternative names found in the same "Names" section (never equal to name).
    aliases: list[str]
    # Country of origin, taken from the "Country" field.
    origin: str
    # Heavily-sanitized motivation tokens (e.g. "sabotage", "espionage").
    motives: list[str]
    # Heavily-sanitized targeted sector tokens.
    sectors: list[str]
    # Heavily-sanitized targeted country tokens.
    countries: list[str]
def sanitize(text):
    """Normalize a scraped text fragment.

    Drops non-breaking spaces, trims surrounding whitespace and removes a
    single trailing "(" left over when a name is followed by a parenthesised
    remark in the source document.
    """
    cleaned = text.replace("\xa0", "").strip()
    # A lone trailing "(" is an artifact of the extraction; drop it and
    # re-trim whatever whitespace preceded it.
    if cleaned.endswith("("):
        cleaned = cleaned[:-1].strip()
    return cleaned
def heavy_sanitize(text):
    """Aggressively normalize *text* for use as a comparison token.

    Applies sanitize(), lowercases, then strips spaces, dots, the filler
    word "mostly" and parentheses, e.g. "Financial crime (mostly)" ->
    "financialcrime".
    """
    # Fix: sanitize()/lower() used to be re-applied on every loop iteration,
    # which was redundant — after the first pass the text has no "\xa0" and,
    # once " " and "(" are removed, the repeated strip()/trailing-"(" logic
    # can never change the result. Hoisting it out of the loop is equivalent.
    text = sanitize(text).lower()
    for bad in [" ", ".", "mostly", "(", ")"]:
        text = text.replace(bad, "")
    return text
# Input document is given as the first CLI argument (an HTML file —
# presumably a PDF-to-HTML export; verify against how the file is produced).
path = Path(sys.argv[1])
soup = BeautifulSoup(path.read_text(), features="lxml")
# Each group "page" is marked by a named anchor directly under <body>.
all_pages = soup.select("body > a[name]")
# Maps every group name AND alias to its Group record; also used while
# parsing to skip duplicates.
groups_found = {}
def page_to_group(page):
    """Parse one group's page, starting at its anchor node.

    Walks the flat sibling sequence of the converted document, locating the
    "Names", "Country", "Motivation", "Sectors:" and "Countries:" markers in
    order, and returns a Group, or None when the page is a duplicate or is
    missing required fields.

    NOTE(review): this relies on the exact node sequence of the converted
    document (label nodes like "\nNames\xa0", data two siblings later) —
    confirm against the actual input file before changing anything here.
    """
    node = page
    # Advance to the "Names" label; give up if the document ends first.
    while node.text != "\nNames\xa0":
        node = node.next_sibling
        if node is None:
            return None
    # The group's primary name sits two siblings BEFORE the label; only its
    # first comma-separated component is the canonical name.
    name_text = node.previous_sibling.previous_sibling.text
    name = sanitize(name_text.split(",")[0])
    if name in groups_found:
        # Already registered (possibly via an alias) — skip this page.
        return None
    def incomplete_data_abort(what):
        # Report and implicitly return None, so callers can
        # `return incomplete_data_abort(...)`.
        print(f"Incomplete data for group {name} ({what}).", file=sys.stderr)
    aliases = []
    # Collect aliases until the "Country" label; hitting the next page's
    # "Information" label first means the country field is missing.
    while True:
        node = node.next_sibling
        if node.name == "i":
            # Alias text precedes each <i> node.
            alias = sanitize(node.previous_sibling.text)
            if alias in groups_found:
                print(f"Alias {alias} of group {name} already registered.",
                      file=sys.stderr)
            elif alias and alias != name:
                aliases.append(alias)
        elif node.text == ("\nInformation\xa0"):
            return incomplete_data_abort("no country")
        elif node.text == "\nCountry\xa0":
            break
    # Country value sits two siblings after the "Country" label.
    origin = sanitize(node.next_sibling.next_sibling)
    # Advance to "Motivation", bailing out if the page ends first.
    while node.text != "\nMotivation\xa0":
        if node.text == ("\nInformation\xa0"):
            return incomplete_data_abort("no motivation")
        node = node.next_sibling
    motives = []
    # Motivation value is a comma-separated list two siblings after the label.
    new_motives = node.next_sibling.next_sibling.split(",")
    new_motives = [heavy_sanitize(text) for text in new_motives]
    motives.extend(filter(None, new_motives))
    # Advance to the "Sectors:" run (may be absent; then we stop at
    # "Information" and collect nothing below).
    while (node.text != ("\nInformation\xa0")
           and not node.text.startswith("\nSectors:")):
        node = node.next_sibling
    sectors_string = ""
    # Concatenate the bare text nodes (no tag name) that make up the
    # possibly multi-node sectors list.
    while True:
        if node.name:
            # Tag nodes (formatting) are skipped, not collected.
            node = node.next_sibling
            continue
        # Stop at the next page, at the countries list, or at any other
        # labelled line that is not the sectors continuation.
        if (node.text == "\nInformation\xa0"
                or "Countries:" in node.text
                or ("\n" in node.text and not node.text.startswith("\nSectors:"))):
            break
        new_sectors_text = node.text
        if "Sectors:" in new_sectors_text:
            # Drop the "Sectors:" prefix, keep only the payload.
            new_sectors_text = new_sectors_text.split("Sectors:")[1]
        sectors_string += new_sectors_text
        node = node.next_sibling
    sectors = []
    # Split on sentence/list separators, including "and" glued to
    # regular or non-breaking spaces.
    for text in re.split("[.]|;|,|[ \xa0]and|and[ \xa0]", sectors_string):
        sector = heavy_sanitize(text)
        if sector:
            sectors.append(sector)
    # Advance to the "Countries:" run (same scheme as sectors).
    while (node.text != ("\nInformation\xa0")
           and "Countries:" not in node.text):
        node = node.next_sibling
    countries_string = ""
    while True:
        if node.name:
            node = node.next_sibling
            continue
        if node.text == "\nInformation\xa0" or "\n" in node.text:
            break
        new_countries_text = node.text
        if "Countries:" in new_countries_text:
            new_countries_text = new_countries_text.split("Countries:")[1]
        countries_string += new_countries_text
        node = node.next_sibling
    countries = []
    for text in re.split("[.]|;|,|[ \xa0]and|and[ \xa0]", countries_string):
        country = heavy_sanitize(text)
        if country:
            countries.append(country)
    return Group(
        name=name,
        aliases=aliases,
        origin=origin,
        motives=motives,
        sectors=sectors,
        countries=countries
    )
# Parse every page; register each successfully-parsed group under its
# canonical name and all of its aliases so later pages can detect duplicates.
for page in all_pages:
    group = page_to_group(page)
    if group:
        for key in [group.name] + group.aliases:
            groups_found[key] = group
# Deduplicate (aliases map to the same Group object) and sort by
# canonical name for stable output.
all_names = sorted(set(g.name for g in groups_found.values()))
# Emit the groups as plain dicts; safe_dump keeps the output free of
# Python-specific YAML tags.
yaml.safe_dump({"groups": [groups_found[name].__dict__ for name in all_names]},
               sys.stdout)
|