summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author W. Kosior <koszko@koszko.org> 2025-01-09 01:18:35 +0100
committer W. Kosior <koszko@koszko.org> 2025-01-09 01:18:35 +0100
commit cc93455733b0ee24080e78d1a78b5fe624dc1709 (patch)
tree e76a306efd0eff7d778ebe8d38ae47073466d28b
parent d63572395f027b7776d57e62d0019800e3c4657d (diff)
download AGH-threat-intel-course-cc93455733b0ee24080e78d1a78b5fe624dc1709.tar.gz
AGH-threat-intel-course-cc93455733b0ee24080e78d1a78b5fe624dc1709.zip
remove trailing whitespace
-rwxr-xr-x scrape_blackobird_groups_info.py 58
1 files changed, 29 insertions, 29 deletions
diff --git a/scrape_blackobird_groups_info.py b/scrape_blackobird_groups_info.py
index 43d91ca..2e99959 100755
--- a/scrape_blackobird_groups_info.py
+++ b/scrape_blackobird_groups_info.py
@@ -25,10 +25,10 @@ class Group:
def sanitize(text):
text = text.replace("\xa0", "").strip()
-
+
if text and text[-1] == "(":
text = text[:-1].strip()
-
+
return text
def heavy_sanitize(text):
@@ -49,20 +49,20 @@ def page_to_group(page):
node = node.next_sibling
if node is None:
return None
-
+
name_text = node.previous_sibling.previous_sibling.text
name = sanitize(name_text.split(",")[0])
if name in groups_found:
return None
-
+
def incomplete_data_abort(what):
print(f"Incomplete data for group {name} ({what}).", file=sys.stderr)
-
+
aliases = []
-
+
while True:
node = node.next_sibling
-
+
if node.name == "i":
alias = sanitize(node.previous_sibling.text)
if alias in groups_found:
@@ -70,78 +70,78 @@ def page_to_group(page):
file=sys.stderr)
elif alias and alias != name:
aliases.append(alias)
-
+
elif node.text == ("\nInformation\xa0"):
return incomplete_data_abort("no country")
-
+
elif node.text == "\nCountry\xa0":
break
-
+
origin = sanitize(node.next_sibling.next_sibling)
-
+
while node.text != "\nMotivation\xa0":
if node.text == ("\nInformation\xa0"):
return incomplete_data_abort("no motivation")
node = node.next_sibling
-
+
motives = []
-
+
new_motives = node.next_sibling.next_sibling.split(",")
new_motives = [heavy_sanitize(text) for text in new_motives]
motives.extend(filter(None, new_motives))
-
+
while (node.text != ("\nInformation\xa0")
and not node.text.startswith("\nSectors:")):
node = node.next_sibling
-
+
sectors = []
-
+
while True:
if node.name:
node = node.next_sibling
continue
-
+
if (node.text == "\nInformation\xa0"
or "Countries:" in node.text
or ("\n" in node.text and not node.text.startswith("\nSectors:"))):
break
-
+
sectors_text = node.text
if "Sectors:" in sectors_text:
sectors_text = sectors_text.split("Sectors:")[1]
-
+
for text in re.split("[.]|;|,|[ \xa0]and|and[ \xa0]", sectors_text):
sector = heavy_sanitize(text)
if sector:
sectors.append(sector)
-
+
node = node.next_sibling
-
+
while (node.text != ("\nInformation\xa0")
and "Countries:" not in node.text):
node = node.next_sibling
-
+
countries = []
-
+
while True:
if node.name:
node = node.next_sibling
continue
-
+
if node.text == "\nInformation\xa0" or "\n" in node.text:
break
-
+
countries_text = node.text
if "Countries:" in countries_text:
countries_text = countries_text.split("Countries:")[1]
-
+
for text in re.split("[.]|;|,|[ \xa0]and|and[ \xa0]", countries_text):
country = heavy_sanitize(text)
if country:
countries.append(country)
-
+
node = node.next_sibling
-
+
return Group(
name=name,
aliases=aliases,
@@ -153,7 +153,7 @@ def page_to_group(page):
for page in all_pages:
group = page_to_group(page)
-
+
if group:
for key in [group.name] + group.aliases:
groups_found[key] = group