author	W. Kosior <koszko@koszko.org>	2025-01-08 19:14:43 +0100
committer	W. Kosior <koszko@koszko.org>	2025-01-08 19:14:43 +0100
commit	a602cb0280251264b63648e9687f15040a16f4f1 (patch)
tree	e2ddaaee50c112d7b94e10f4e7aee24bda716adc
parent	eaebabe439bd653cfa22701cb64697775d5ac38b (diff)
download	AGH-threat-intel-course-a602cb0280251264b63648e9687f15040a16f4f1.tar.gz
	AGH-threat-intel-course-a602cb0280251264b63648e9687f15040a16f4f1.zip
scrape PDF from blackobird's GitHub
-rw-r--r--	Makefile	26
-rwxr-xr-x	scrape_blackobird_groups_info.py	173
2 files changed, 195 insertions(+), 4 deletions(-)
diff --git a/Makefile b/Makefile
index be3cc06..a830b0d 100644
--- a/Makefile
+++ b/Makefile
@@ -2,13 +2,14 @@
#
# Copyright (C) 2024 Wojtek Kosior <koszko@koszko.org>
-# Make sure you have Pandoc, a LaTeX distribution, Python as well as Python
-# packages `BeautifulSoup4', `pyyaml' (YAML parser library) and `requests'
-# installed.
+# Make sure you have Pandoc, a LaTeX distribution, poppler, Python as well as
+# Python packages `BeautifulSoup4', `pyyaml' (YAML parser library) and
+# `requests' installed.
PYTHON=python3
PANDOC=pandoc
LATEXMK=latexmk
+PDFTOHTML=pdftohtml
DEFAULT_TARGETS = \
	profiles_with_scraped_info.yaml \
@@ -18,7 +19,7 @@ DEFAULT_TARGETS = \
all: $(DEFAULT_TARGETS)
.PHONY: all
-.SUFFIXES: .pdf .md
+.SUFFIXES: .pdf .md ss.html
.md.pdf:
	$(PANDOC) \
@@ -29,12 +30,29 @@ all: $(DEFAULT_TARGETS)
		--columns=1 \
		-o $@ $<
+scraping:
+	mkdir $@
+
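+# The "|" below marks the directory as an order-only prerequisite: it must
+# exist before the recipe runs, but its timestamp never forces a rebuild.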
+scraping/Threat_Group_Cards: | scraping
+	mkdir $@
+
+scraping/Threat_Group_Cards/Threat_Group_Cards_v2.0.pdf: | \
+		scraping/Threat_Group_Cards
+	wget -O $@ "https://github.com/blackorbird/APT_REPORT/raw/master/Threat_Group_Cards_v2.0.pdf"
+
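+# Double-suffix rule (using the ss.html suffix declared in .SUFFIXES above):
+# converts X.pdf into Xss.html with pdftohtml.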
+.pdfss.html:
+	$(PDFTOHTML) $< $@
+
tables.md: threats_by_sector_table.py profiles.yaml
	$(PYTHON) $^ > $@
profiles_with_scraped_info.yaml: scrape_mitre_groups_info.py profiles.yaml
	$(PYTHON) $^ > $@
+blackobird_scraped_profiles.yaml: scrape_blackobird_groups_info.py \
+		scraping/Threat_Group_Cards/Threat_Group_Cards_v2.0ss.html
+	$(PYTHON) $^ > $@
+
techniques_table.tex: techniques_table.py profiles_with_scraped_info.yaml
	$(PYTHON) $^ > $@
diff --git a/scrape_blackobird_groups_info.py b/scrape_blackobird_groups_info.py
new file mode 100755
index 0000000..6f3bae6
--- /dev/null
+++ b/scrape_blackobird_groups_info.py
@@ -0,0 +1,173 @@
+#!/usr/bin/env python3
+
+# SPDX-License-Identifier: CC0-1.0
+#
+# Copyright (C) 2025 Wojtek Kosior <koszko@koszko.org>
+
+from dataclasses import dataclass
+from pathlib import Path
+import sys
+import re
+
+from bs4 import BeautifulSoup
+import yaml
+
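+# One group card scraped from the "Threat Group Cards" PDF.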
+@dataclass
+class Group:
+    name: str
+    aliases: list[str]
+    origin: str
+    motives: list[str]
+    sectors: list[str]
+    countries: list[str]
+
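+# Strip non-breaking spaces and, apparently, a trailing "(" left over when a
+# parenthesized note gets cut off at a line break in the PDF conversion.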
+def sanitize(text):
+    text = text.replace("\xa0", "").strip()
+
+    if text and text[-1] == "(":
+        text = text[:-1].strip()
+
+    return text
+
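+# Reduce a scraped phrase to a compact lowercase key, e.g.
+# "Mostly the United States." becomes "theunitedstates".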
+def heavy_sanitize(text):
+    for bad in [" ", ".", "mostly", "(", ")"]:
+        text = sanitize(text).lower().replace(bad, "")
+    return text
+
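+# The sole command-line argument is the ...ss.html file that pdftohtml
+# produces (see the Makefile rule).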
+path = Path(sys.argv[1])
+soup = BeautifulSoup(path.read_text(), features="lxml")
+
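+# pdftohtml emits an <a name="..."> anchor for every PDF page; each is a
+# candidate starting point for a group card.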
+all_pages = soup.select("body > a[name]")
+
+groups_found = {}
+
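+# Walk the flat sequence of sibling nodes after a page anchor, keying off the
+# "Names", "Country", "Motivation", "Sectors:" and "Countries:" labels that
+# structure each group card.  Returns None if the page holds no new card.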
+def page_to_group(page):
+    node = page
+    while node.text != "\nNames\xa0":
+        node = node.next_sibling
+        if node is None:
+            return None
+
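+    # The group's name sits two siblings before the "Names" label; only the
+    # part before the first comma is the primary name.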
+    name_text = node.previous_sibling.previous_sibling.text
+    name = sanitize(name_text.split(",")[0])
+    if name in groups_found:
+        return None
+
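+    # Implicitly returns None, so "return incomplete_data_abort(...)" logs
+    # the problem and abandons the current card in one step.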
+    def incomplete_data_abort(what):
+        print(f"Incomplete data for group {name} ({what}).", file=sys.stderr)
+
+    aliases = []
+
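+    # Each alias is the text node just before an <i> element (seemingly the
+    # italicized attribution of whoever coined it); collection stops at the
+    # "Country" label.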
+    while True:
+        node = node.next_sibling
+
+        if node.name == "i":
+            alias = sanitize(node.previous_sibling.text)
+            if alias in groups_found:
+                print(f"Alias {alias} of group {name} already registered.",
+                      file=sys.stderr)
+            elif alias and alias != name:
+                aliases.append(alias)
+
+        elif node.text == "\nInformation\xa0":
+            return incomplete_data_abort("no country")
+
+        elif node.text == "\nCountry\xa0":
+            break
+
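+    # The country of origin follows two siblings after the "Country" label.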
+    origin = sanitize(node.next_sibling.next_sibling)
+
+    while node.text != "\nMotivation\xa0":
+        if node.text == "\nInformation\xa0":
+            return incomplete_data_abort("no motivation")
+        node = node.next_sibling
+
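+    # Collect comma-separated motivations, skipping markup nodes, until the
+    # "Information" label or a line break ends the list.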
+    motives = []
+
+    while True:
+        node = node.next_sibling
+
+        if node.name:
+            continue
+
+        if node.text == "\nInformation\xa0" or "\n" in node.text:
+            break
+
+        new_motives = node.next_sibling.next_sibling.split(",")
+        new_motives = [heavy_sanitize(text) for text in new_motives]
+        motives.extend(filter(None, new_motives))
+
+    while (node.text != "\nInformation\xa0"
+           and not node.text.startswith("\nSectors:")):
+        node = node.next_sibling
+
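+    # Targeted sectors run from "Sectors:" up to "Countries:"; entries are
+    # separated by ".", ";", "," or the word "and".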
+    sectors = []
+
+    while True:
+        if node.name:
+            node = node.next_sibling
+            continue
+
+        if (node.text == "\nInformation\xa0"
+            or "Countries:" in node.text
+            or ("\n" in node.text and not node.text.startswith("\nSectors:"))):
+            break
+
+        sectors_text = node.text
+        if "Sectors:" in sectors_text:
+            sectors_text = sectors_text.split("Sectors:")[1]
+
+        for text in re.split("[.]|;|,|[ \xa0]and|and[ \xa0]", sectors_text):
+            sector = heavy_sanitize(text)
+            if sector:
+                sectors.append(sector)
+
+        node = node.next_sibling
+
+    while (node.text != "\nInformation\xa0"
+           and "Countries:" not in node.text):
+        node = node.next_sibling
+
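+    # Targeted countries are parsed the same way as sectors.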
+    countries = []
+
+    while True:
+        if node.name:
+            node = node.next_sibling
+            continue
+
+        if node.text == "\nInformation\xa0" or "\n" in node.text:
+            break
+
+        countries_text = node.text
+        if "Countries:" in countries_text:
+            countries_text = countries_text.split("Countries:")[1]
+
+        for text in re.split("[.]|;|,|[ \xa0]and|and[ \xa0]", countries_text):
+            country = heavy_sanitize(text)
+            if country:
+                countries.append(country)
+
+        node = node.next_sibling
+
+    return Group(
+        name=name,
+        aliases=aliases,
+        origin=origin,
+        motives=motives,
+        sectors=sectors,
+        countries=countries
+    )
+
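+# Register each card under its name and all aliases so that later pages
+# repeating a known name or alias are skipped.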
+for page in all_pages:
+    group = page_to_group(page)
+
+    if group:
+        for key in [group.name] + group.aliases:
+            groups_found[key] = group
+
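+# Deduplicate via primary names (aliases map to the same Group object) and
+# dump everything as YAML to stdout.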
+all_names = sorted(set(g.name for g in groups_found.values()))
+
+yaml.safe_dump({"groups": [groups_found[name].__dict__ for name in all_names]},
+               sys.stdout)