#!/usr/bin/env python3
# SPDX-License-Identifier: CC0-1.0
#
# Copyright (C) 2024 Wojtek Kosior

"""Enrich threat-group profiles with data scraped from attack.mitre.org.

Reads a YAML document with a top-level ``groups`` list (from the file named
by the first command-line argument, or from stdin), looks every listed group
up on the MITRE ATT&CK groups page, and writes the YAML back to stdout with
each found group's MITRE id, aliases and technique ids merged in, plus a
top-level ``techniques`` list.  Downloaded pages are cached on disk.
"""

from dataclasses import asdict, dataclass
from pathlib import Path
import sys
import time

from bs4 import BeautifulSoup
import requests
import yaml

# Directory under which downloaded pages are cached as ``.html`` files.
mitre_pages_path = Path(".") / "scraping" / "attack.mitre.org"

mitre_request_delay_time = 5  # seconds
# Upper bound for a single HTTP request so a stalled connection cannot hang
# the script forever (requests applies no timeout by default).
mitre_request_timeout = 30  # seconds

# time.time() value recorded right after the most recent download finished.
last_mitre_download_time = 0


def mitre_page_download(path):
    """Download ``https://attack.mitre.org/<path>`` and return its text.

    Sleeps first, if needed, so that at least ``mitre_request_delay_time``
    seconds pass between the end of the previous download and this request.

    Raises ``requests.HTTPError`` on a non-success status (via
    ``raise_for_status``) and a ``requests`` timeout exception if the server
    does not respond within ``mitre_request_timeout`` seconds.
    """
    global last_mitre_download_time

    now = time.time()
    time_to_wait = last_mitre_download_time + mitre_request_delay_time - now
    if time_to_wait > 0:
        time.sleep(time_to_wait)

    URI = 'https://attack.mitre.org/' + path
    print(f"Fetching `{URI}'.", file=sys.stderr)
    response = requests.get(URI, timeout=mitre_request_timeout)
    response.raise_for_status()

    last_mitre_download_time = time.time()

    return response.text


def mitre_page_get(path):
    """Return the text of a MITRE page, using the on-disk cache if present.

    The cache file is ``mitre_pages_path / path`` with its suffix replaced by
    ``.html``.  On a cache miss the page is downloaded and stored there.
    """
    page_path = (mitre_pages_path / path).with_suffix(".html")

    if page_path.exists():
        # Explicit encoding: the cache must not depend on the current locale.
        return page_path.read_text(encoding="utf-8")

    page_path.parent.mkdir(parents=True, exist_ok=True)
    page_text = mitre_page_download(path)
    page_path.write_text(page_text, encoding="utf-8")
    return page_text


@dataclass
class Group:
    # Group name as shown in the second column of the groups table.
    name: str
    # Identifier taken from the first column of the groups table.
    mitre_id: str
    # Non-empty, stripped entries of the third column, split on ", ".
    aliases: list[str]
    # Ids of the techniques collected from the group's own page.
    technique_ids: list[str]


@dataclass
class Technique:
    name: str
    mitre_id: str


def _select_text(node, selector):
    """Return stripped text of the first ``selector`` match under ``node``.

    Returns an empty string when ``node`` is ``None`` or nothing matches, so
    callers can treat missing markup the same way as empty text.
    """
    if node is None:
        return ""
    match = node.select_one(selector)
    if match is None:
        return ""
    return match.get_text(strip=True)


def get_group_techniques(gid):
    """Scrape the techniques listed on the page of group ``gid``.

    Returns a dict mapping technique id to ``Technique``.  Rows whose
    ``class`` attribute contains ``sub`` are skipped.  Rows lacking an id or
    a name are reported on stderr and left out.
    """
    techniques = {}

    soup = BeautifulSoup(mitre_page_get(f"groups/{gid}/"), features="lxml")

    for row in soup.select("table.techniques-used tbody tr"):
        if "sub" in row.attrs.get("class", []):
            continue

        tid_cell = row.select_one("td:nth-child(2)")
        tid = _select_text(tid_cell, "a")

        name = ""
        if tid_cell is not None:
            # The name cell's position is derived from the id cell's colspan.
            name_cell_idx = 1 + int(tid_cell.attrs.get("colspan", 1))
            name = _select_text(
                row, f"td:nth-child({name_cell_idx}) a:first-child"
            )

        if not name or not tid:
            print("Incomplete data for group's technique.", file=sys.stderr)
        else:
            techniques[tid] = Technique(name, tid)

    return techniques


def get_groups_and_techniques(relevant_names):
    """Scrape the MITRE groups listing for the groups in ``relevant_names``.

    ``relevant_names`` is any container supporting ``in`` on group names.

    Returns ``(groups, all_techniques)``: a dict mapping group name to
    ``Group`` and a dict mapping technique id to ``Technique`` for every
    technique used by at least one returned group.  Skipped rows (incomplete,
    not relevant, or duplicate name) are reported on stderr.
    """
    groups = {}
    all_techniques = {}

    soup = BeautifulSoup(mitre_page_get("groups/"), features="lxml")

    for row in soup.select("tbody tr"):
        gid = _select_text(row, "td:nth-child(1) a")
        name = _select_text(row, "td:nth-child(2) a")

        if not name or not gid:
            print("Incomplete data for group.", file=sys.stderr)
        elif name not in relevant_names:
            print(f"Ignoring group `{name}'", file=sys.stderr)
        elif name in groups:
            print(f"Double definition of group `{name}'", file=sys.stderr)
        else:
            aliases_cell = row.select_one("td:nth-child(3)")
            aliases_text = (
                aliases_cell.get_text() if aliases_cell is not None else ""
            )
            stripped = (alias.strip() for alias in aliases_text.split(", "))
            aliases = [alias for alias in stripped if alias]

            techniques = get_group_techniques(gid)
            all_techniques.update(techniques)

            groups[name] = Group(
                name,
                gid,
                aliases,
                [t.mitre_id for t in techniques.values()],
            )

    return groups, all_techniques


def get_profiles_data(profiles_path):
    """Load the profiles YAML from ``profiles_path``, or stdin if it is falsy."""
    if profiles_path:
        with open(profiles_path, encoding="utf-8") as inp:
            return yaml.safe_load(inp)

    return yaml.safe_load(sys.stdin)


def main():
    """Merge scraped MITRE data into the profiles and dump YAML to stdout."""
    profiles_path = sys.argv[1] if len(sys.argv) >= 2 else None
    profiles_data = get_profiles_data(profiles_path)

    group_profiles = {g["name"]: g for g in profiles_data["groups"]}
    groups, techniques = get_groups_and_techniques(group_profiles)

    missing_names = set(group_profiles).difference(groups)
    if missing_names:
        print(f"No data found for group(s): {', '.join(sorted(missing_names))}",
              file=sys.stderr)

    # Scraped fields overwrite same-named keys of the input profile.
    for name, group in groups.items():
        group_profiles[name].update(asdict(group))

    profiles_data["groups"] = list(group_profiles.values())
    profiles_data["techniques"] = [asdict(t) for t in techniques.values()]

    yaml.safe_dump(profiles_data, sys.stdout)


if __name__ == "__main__":
    main()