summaryrefslogtreecommitdiff
path: root/scrape_blackobird_groups_info.py
blob: 082c349524ad488befd18b1614bd712f83f8eb83 (about) (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#!/usr/bin/env python3

# SPDX-License-Identifier: CC0-1.0
#
# Copyright (C) 2025 Wojtek Kosior <koszko@koszko.org>

from dataclasses import dataclass
from pathlib import Path
import sys
import time
import re

from bs4 import BeautifulSoup
import requests
import yaml

@dataclass
class Group:
    """Record of one group as scraped from the input HTML document.

    Instances are built by page_to_group() and dumped to YAML through
    their __dict__, so the field names below are also the output keys.
    """
    # Primary name: the text found right before the "Names" marker, cut at
    # the first comma.
    name: str
    # Other names listed for the group (never contains `name` itself).
    aliases: list[str]
    # Text found after the "Country" marker.
    origin: str
    # Comma-separated items found after the "Motivation" marker, normalized
    # with heavy_sanitize().
    motives: list[str]
    # Items of the "Sectors:" enumeration, normalized with heavy_sanitize().
    sectors: list[str]
    # Items of the "Countries:" enumeration, normalized with heavy_sanitize().
    countries: list[str]

def sanitize(text):
    """Return `text` without non-breaking spaces and surrounding whitespace.

    A single trailing "(" (left over when the text was cut right before a
    parenthesized remark) is dropped as well, together with the whitespace
    that preceded it.
    """
    cleaned = text.replace("\xa0", "").strip()
    # endswith() is False for the empty string, so no emptiness check needed.
    return cleaned[:-1].strip() if cleaned.endswith("(") else cleaned

def heavy_sanitize(text):
    """Normalize `text` into a lowercase token without filler fragments.

    For each fragment, in order (space, ".", "mostly", "(", ")"), the current
    value is passed through sanitize(), lowercased, and stripped of every
    occurrence of that fragment.
    """
    result = text
    for fragment in (" ", ".", "mostly", "(", ")"):
        # sanitize() is deliberately re-applied on every pass: removing one
        # fragment can expose new leading/trailing whitespace or a trailing
        # "(" for the next pass.
        lowered = sanitize(result).lower()
        result = lowered.replace(fragment, "")
    return result

# The HTML document to scrape is given as the first command-line argument.
path = Path(sys.argv[1])
# NOTE(review): read_text() is called without `encoding`, so the locale's
# default encoding is used — confirm this matches the input file.
soup = BeautifulSoup(path.read_text(), features="lxml")

# Every <a name=...> element that is a direct child of <body>; each one is
# handed to page_to_group() as the starting point of one page.
all_pages = soup.select("body > a[name]")

# Maps every registered group name and alias to its Group object.
groups_found = {}

# Separators between the items of a free-text enumeration: ".", ";", "," and
# the word "and" next to a (possibly non-breaking) space.  The "and<space>"
# form must not directly follow a word character; otherwise names that merely
# end in "and" ("Finland ", "Poland ") would be cut apart.
LIST_SEPARATOR = re.compile(r"[.;,]|[ \xa0]and|(?<!\w)and[ \xa0]")

def _split_listing(listing):
    """Split a free-text enumeration into heavily sanitized, non-empty items."""
    items = (heavy_sanitize(piece) for piece in LIST_SEPARATOR.split(listing))
    return [item for item in items if item]

def _skip_until(node, is_target):
    """Return the first of `node` and its following siblings accepted by
    `is_target`, or None if the siblings run out first."""
    while node is not None and not is_target(node):
        node = node.next_sibling
    return node

def _collect_listing(node, label, is_end):
    """Concatenate the text nodes that make up one labelled enumeration.

    Walks `node` and its following siblings.  Tags are skipped without looking
    at their content; text nodes are appended until `is_end(text)` is true or
    the siblings run out.  From a text node containing `label` only the part
    after the label (up to a second occurrence, if any) is kept.

    Returns a pair: the collected text and the node at which the walk stopped
    (None when the siblings ran out).
    """
    collected = []
    while node is not None:
        if not node.name:
            text = node.text
            if is_end(text):
                break
            if label in text:
                text = text.split(label)[1]
            collected.append(text)
        node = node.next_sibling
    return "".join(collected), node

def page_to_group(page):
    """Extract one Group from the sibling nodes that follow `page`.

    `page` is a node of the parsed document; its following siblings are
    scanned for the "Names", "Country", "Motivation", "Sectors:" and
    "Countries:" markers, with the "Information" marker treated as the end of
    the group's data.
    NOTE(review): the marker comparisons and the direct str-method calls on
    nodes assume the markers and the values next to them are text nodes laid
    out as "marker, separator tag, value" — confirm against the input HTML.

    Returns the Group, or None when
    - no "Names" marker follows `page`,
    - the group's name is already a key of `groups_found`, or
    - the country or motivation is missing (a message is then printed to
      stderr).
    Sectors and countries are optional and default to empty lists.
    """
    node = page
    while node.text != "\nNames\xa0":
        node = node.next_sibling
        if node is None:
            return None

    # The name sits two siblings before the marker (a separator node lies in
    # between); anything after the first comma is dropped.
    name_text = node.previous_sibling.previous_sibling.text
    name = sanitize(name_text.split(",")[0])
    if name in groups_found:
        return None

    def incomplete_data_abort(what):
        # Returns None implicitly, so callers can `return` its result.
        print(f"Incomplete data for group {name} ({what}).", file=sys.stderr)

    aliases = []

    while True:
        node = node.next_sibling
        if node is None:
            # Ran out of siblings before the "Country" marker showed up.
            return incomplete_data_abort("no country")

        if node.name == "i":
            # Each alias is the text right before an <i> element.
            alias = sanitize(node.previous_sibling.text)
            if alias in groups_found:
                print(f"Alias {alias} of group {name} already registered.",
                      file=sys.stderr)
            elif alias and alias != name:
                aliases.append(alias)

        elif node.text == "\nInformation\xa0":
            return incomplete_data_abort("no country")

        elif node.text == "\nCountry\xa0":
            break

    # The value is two siblings after the marker and is used as a string
    # directly.
    origin = sanitize(node.next_sibling.next_sibling)

    while node.text != "\nMotivation\xa0":
        if node.text == "\nInformation\xa0":
            return incomplete_data_abort("no motivation")
        node = node.next_sibling
        if node is None:
            return incomplete_data_abort("no motivation")

    raw_motives = node.next_sibling.next_sibling.split(",")
    motives = [motive for motive in map(heavy_sanitize, raw_motives) if motive]

    def sectors_end(text):
        # The enumeration continues only through text nodes without a line
        # break; the node that opens it is the one exception.
        return (text == "\nInformation\xa0"
                or "Countries:" in text
                or ("\n" in text and not text.startswith("\nSectors:")))

    def countries_end(text):
        return text == "\nInformation\xa0" or "\n" in text

    node = _skip_until(
        node,
        lambda n: (n.text == "\nInformation\xa0"
                   or n.text.startswith("\nSectors:"))
    )
    sectors_string, node = _collect_listing(node, "Sectors:", sectors_end)

    node = _skip_until(
        node,
        lambda n: n.text == "\nInformation\xa0" or "Countries:" in n.text
    )
    countries_string, node = _collect_listing(node, "Countries:",
                                              countries_end)

    return Group(
        name=name,
        aliases=aliases,
        origin=origin,
        motives=motives,
        sectors=_split_listing(sectors_string),
        countries=_split_listing(countries_string)
    )

# Parse every page; a successfully parsed group is registered under its
# primary name first and then under each of its aliases.
for page in all_pages:
    group = page_to_group(page)
    if not group:
        continue

    for key in (group.name, *group.aliases):
        groups_found[key] = group

# groups_found holds one entry per name *and* alias, so reduce it to the
# distinct primary names before dumping each group exactly once.
unique_names = {g.name for g in groups_found.values()}
all_names = sorted(unique_names)

document = {"groups": [vars(groups_found[name]) for name in all_names]}
yaml.safe_dump(document, sys.stdout)