diff options
author | W. Kosior <koszko@koszko.org> | 2025-01-08 19:14:43 +0100 |
---|---|---|
committer | W. Kosior <koszko@koszko.org> | 2025-01-08 19:14:43 +0100 |
commit | a602cb0280251264b63648e9687f15040a16f4f1 (patch) | |
tree | e2ddaaee50c112d7b94e10f4e7aee24bda716adc | |
parent | eaebabe439bd653cfa22701cb64697775d5ac38b (diff) | |
download | AGH-threat-intel-course-a602cb0280251264b63648e9687f15040a16f4f1.tar.gz AGH-threat-intel-course-a602cb0280251264b63648e9687f15040a16f4f1.zip |
scrape PDF from blackobird's GitHub
-rw-r--r-- | Makefile | 26 |
-rwxr-xr-x | scrape_blackobird_groups_info.py | 173 |
2 files changed, 195 insertions, 4 deletions
@@ -2,13 +2,14 @@
 #
 # Copyright (C) 2024 Wojtek Kosior <koszko@koszko.org>
 
-# Make sure you have Pandoc, a LaTeX distribution, Python as well as Python
-# packages `BeautifulSoup4', `pyyaml' (YAML parser library) and `requests'
-# installed.
+# Make sure you have Pandoc, a LaTeX distribution, poppler, Python as well as
+# Python packages `BeautifulSoup4', `pyyaml' (YAML parser library) and
+# `requests' installed.
 
 PYTHON=python3
 PANDOC=pandoc
 LATEXMK=latexmk
+PDFTOHTML=pdftohtml
 
 DEFAULT_TARGETS = \
 	profiles_with_scraped_info.yaml \
@@ -18,7 +19,7 @@ DEFAULT_TARGETS = \
 all: $(DEFAULT_TARGETS)
 .PHONY: all
 
-.SUFFIXES: .pdf .md
+.SUFFIXES: .pdf .md ss.html
 
 .md.pdf:
 	$(PANDOC) \
@@ -29,12 +30,29 @@ all: $(DEFAULT_TARGETS)
 	--columns=1 \
 	-o $@ $<
 
+scraping:
+	mkdir $@
+
+scraping/Threat_Group_Cards: | scraping
+	mkdir $@
+
+scraping/Threat_Group_Cards/Threat_Group_Cards_v2.0.pdf: | \
+  scraping/Threat_Group_Cards
+	wget -O $@ "https://github.com/blackorbird/APT_REPORT/raw/master/Threat_Group_Cards_v2.0.pdf"
+
+.pdfss.html:
+	$(PDFTOHTML) $< $@
+
 tables.md: threats_by_sector_table.py profiles.yaml
 	$(PYTHON) $^ > $@
 
 profiles_with_scraped_info.yaml: scrape_mitre_groups_info.py profiles.yaml
 	$(PYTHON) $^ > $@
 
+blackobird_scraped_profiles.yaml: scrape_blackobird_groups_info.py \
+  scraping/Threat_Group_Cards/Threat_Group_Cards_v2.0ss.html
+	$(PYTHON) $^ > $@
+
 techniques_table.tex: techniques_table.py profiles_with_scraped_info.yaml
 	$(PYTHON) $^ > $@
diff --git a/scrape_blackobird_groups_info.py b/scrape_blackobird_groups_info.py
new file mode 100755
index 0000000..6f3bae6
--- /dev/null
+++ b/scrape_blackobird_groups_info.py
@@ -0,0 +1,173 @@
+#!/usr/bin/env python3
+
+# SPDX-License-Identifier: CC0-1.0
+#
+# Copyright (C) 2025 Wojtek Kosior <koszko@koszko.org>
+
+from dataclasses import dataclass
+from pathlib import Path
+import sys
+import time
+import re
+
+from bs4 import BeautifulSoup
+import requests
+import yaml
+
+@dataclass
+class Group:
+    name: str
+    aliases: list[str]
+    origin: str
+    motives: list[str]
+    sectors: list[str]
+    countries: list[str]
+
+def sanitize(text):
+    text = text.replace("\xa0", "").strip()
+
+    if text and text[-1] == "(":
+        text = text[:-1].strip()
+
+    return text
+
+def heavy_sanitize(text):
+    for bad in [" ", ".", "mostly", "(", ")"]:
+        text = sanitize(text).lower().replace(bad, "")
+    return text
+
+path = Path(sys.argv[1])
+soup = BeautifulSoup(path.read_text(), features="lxml")
+
+all_pages = soup.select("body > a[name]")
+
+groups_found = {}
+
+def page_to_group(page):
+    node = page
+    while node.text != "\nNames\xa0":
+        node = node.next_sibling
+        if node is None:
+            return None
+
+    name_text = node.previous_sibling.previous_sibling.text
+    name = sanitize(name_text.split(",")[0])
+    if name in groups_found:
+        return None
+
+    def incomplete_data_abort(what):
+        print(f"Incomplete data for group {name} ({what}).", file=sys.stderr)
+
+    aliases = []
+
+    while True:
+        node = node.next_sibling
+
+        if node.name == "i":
+            alias = sanitize(node.previous_sibling.text)
+            if alias in groups_found:
+                print(f"Alias {alias} of group {name} already registered.",
+                      file=sys.stderr)
+            elif alias and alias != name:
+                aliases.append(alias)
+
+        elif node.text == ("\nInformation\xa0"):
+            return incomplete_data_abort("no country")
+
+        elif node.text == "\nCountry\xa0":
+            break
+
+    origin = sanitize(node.next_sibling.next_sibling)
+
+    while node.text != "\nMotivation\xa0":
+        if node.text == ("\nInformation\xa0"):
+            return incomplete_data_abort("no motivation")
+        node = node.next_sibling
+
+    motives = []
+
+    while True:
+        node = node.next_sibling
+
+        if node.name:
+            continue
+
+        if node.text == ("\nInformation\xa0") or "\n" in node.text:
+            break
+
+        new_motives = node.next_sibling.next_sibling.split(",")
+        new_motives = [heavy_sanitize(text) for text in new_motives]
+        motives.extend(filter(None, new_motives))
+
+    while (node.text != ("\nInformation\xa0")
+           and not node.text.startswith("\nSectors:")):
+        node = node.next_sibling
+
+    sectors = []
+
+    while True:
+        if node.name:
+            node = node.next_sibling
+            continue
+
+        if (node.text == "\nInformation\xa0"
+            or "Countries:" in node.text
+            or ("\n" in node.text and not node.text.startswith("\nSectors:"))):
+            break
+
+        sectors_text = node.text
+        if "Sectors:" in sectors_text:
+            sectors_text = sectors_text.split("Sectors:")[1]
+
+        for text in re.split("[.]|;|,|[ \xa0]and|and[ \xa0]", sectors_text):
+            sector = heavy_sanitize(text)
+            if sector:
+                sectors.append(sector)
+
+        node = node.next_sibling
+
+    while (node.text != ("\nInformation\xa0")
+           and "Countries:" not in node.text):
+        node = node.next_sibling
+
+    countries = []
+
+    while True:
+        if node.name:
+            node = node.next_sibling
+            continue
+
+        if node.text == "\nInformation\xa0" or "\n" in node.text:
+            break
+
+        countries_text = node.text
+        if "Countries:" in countries_text:
+            countries_text = countries_text.split("Countries:")[1]
+
+        for text in re.split("[.]|;|,|[ \xa0]and|and[ \xa0]", countries_text):
+            country = heavy_sanitize(text)
+            if country:
+                countries.append(country)
+
+        node = node.next_sibling
+
+    return Group(
+        name=name,
+        aliases=aliases,
+        origin=origin,
+        motives=motives,
+        sectors=sectors,
+        countries=countries
+    )
+
+for page in all_pages:
+    group = page_to_group(page)
+
+    if group:
+        for key in [group.name] + group.aliases:
+            groups_found[key] = group
+
+all_names = sorted(set(g.name for g in groups_found.values()))
+
+yaml.safe_dump({"groups": [groups_found[name].__dict__ for name in all_names]},
+               sys.stdout)