#!/usr/bin/env python3

# SPDX-License-Identifier: CC0-1.0
#
# Copyright (C) 2024 Wojtek Kosior

"""Look up threat groups on the MITRE ATT&CK group list page.

Group names are read from the "groups" list of a YAML document (a file given
as the first command-line argument, or standard input when no argument is
given).  For each of those names found on the "groups/" page of
attack.mitre.org, the group's name, MITRE ID and aliases are written to
standard output as a YAML list.  Downloaded pages are cached on disk under
``mitre_pages_path``.
"""

from dataclasses import dataclass
from html.parser import HTMLParser
from pathlib import Path
import sys

import requests
import yaml

mitre_pages_path = Path(".") / "scraping" / "attack.mitre.org"

profiles_path = Path('./profiles.yaml')


def mitre_page_download(path):
    """Download ``https://attack.mitre.org/<path>`` and return its text.

    Raises ``requests.HTTPError`` for a non-success HTTP status and other
    ``requests`` exceptions (including a timeout) on connection problems.
    """
    # Without an explicit timeout `requests` waits forever on a stalled
    # connection, which would hang the whole script.
    response = requests.get('https://attack.mitre.org/' + path, timeout=60)
    response.raise_for_status()
    return response.text


def mitre_page_get(path):
    """Return the text of a MITRE page, using the on-disk cache if possible.

    The page is looked up under ``mitre_pages_path / path``.  When the cached
    file is missing, the page is downloaded, stored there and returned.
    """
    page_path = mitre_pages_path / path

    # The encoding is pinned so that the cache does not depend on the locale:
    # with a non-UTF-8 default encoding, writing a page that contains
    # characters outside of that encoding would raise UnicodeEncodeError.
    if page_path.exists():
        return page_path.read_text(encoding="utf-8")

    page_path.parent.mkdir(parents=True, exist_ok=True)
    page_text = mitre_page_download(path)
    page_path.write_text(page_text, encoding="utf-8")
    return page_text


@dataclass
class Group:
    """A single group as listed on the MITRE group list page."""
    name: str
    mitre_id: str
    aliases: list[str]


class GroupListPageParser(HTMLParser):
    """Collect data of selected groups from table rows of an HTML page.

    For every ``<tr>`` the parser takes the text of an ``<a>`` in the first
    ``<td>`` as the MITRE ID, the text of an ``<a>`` in the second ``<td>`` as
    the group name and the direct text of the third ``<td>``, split on
    ``", "``, as the list of aliases.  Rows whose group name is contained in
    ``relevant_groups`` end up in ``collected_groups``, a dict that maps group
    names to ``Group`` objects.
    """

    def __init__(self, relevant_groups):
        super().__init__()
        self.relevant_groups = relevant_groups
        # Stack of column indices, one entry per currently open <tr>; the
        # bottom entry serves <td> tags that appear outside of any row.
        self.col_numbers = [-1]
        # Stack of currently open tags.
        # NOTE(review): html.parser reports void elements such as <br> with a
        # start tag but no end tag (unless written self-closing), so stale
        # entries may accumulate here; only the top entry is ever consulted
        # -- confirm this is acceptable for the pages being parsed.
        self.current_tags = ["*TOP*"]
        self.collected_groups = {}
        self.collecting_new_group()

    def collecting_new_group(self):
        """Forget all data gathered for the row processed so far."""
        self.current_group_mitre_id = None
        self.current_group_name = None
        self.current_group_aliases = None

    def handle_starttag(self, tag, attrs):
        """Track the open tag and the current table row/column position."""
        self.current_tags.append(tag)

        if tag == "tr":
            self.col_numbers.append(-1)
        elif tag == "td":
            self.col_numbers[-1] += 1

    def handle_data(self, data):
        """Record text found in one of the three relevant columns."""
        innermost_tag = self.current_tags[-1]
        column = self.col_numbers[-1]

        if innermost_tag == "a" and column == 0:
            self.current_group_mitre_id = data.strip()
        elif innermost_tag == "a" and column == 1:
            self.current_group_name = data.strip()
        elif innermost_tag == "td" and column == 2:
            aliases_text = data.strip()
            # A blank cell means "no aliases", as opposed to None, which
            # means that no aliases cell has been seen in this row at all.
            self.current_group_aliases = \
                aliases_text.split(", ") if aliases_text else []

    def handle_endtag(self, tag):
        """Pop the tag stack and, at the end of a row, store its group."""
        self.current_tags.pop()

        if tag != "tr":
            return

        self.col_numbers.pop()
        self._store_current_group()
        # Reset unconditionally.  If a skipped row (incomplete, ignored or
        # duplicate) left its data behind, a later row lacking some cell
        # would silently inherit values that belong to another group.
        self.collecting_new_group()

    def _store_current_group(self):
        """Add the group gathered from the finished row, if it qualifies."""
        name = self.current_group_name
        mitre_id = self.current_group_mitre_id
        aliases = self.current_group_aliases

        if name is None or mitre_id is None or aliases is None:
            print("Incomplete data for group.", file=sys.stderr)
            return

        if name not in self.relevant_groups:
            print(f"Ignoring group `{name}'", file=sys.stderr)
            return

        if name in self.collected_groups:
            print(f"Double definition of group `{name}'", file=sys.stderr)
            return

        self.collected_groups[name] = Group(name, mitre_id, aliases)


def get_groups(names):
    """Return a dict mapping each listed name found on the page to a Group.

    The "groups/" page is obtained through ``mitre_page_get()``.  Names that
    do not occur on the page are absent from the result.
    """
    parser = GroupListPageParser(names)
    parser.feed(mitre_page_get("groups/"))
    return parser.collected_groups


def get_group_names(profiles_path):
    """Return the set of group names listed in a YAML profiles document.

    The document is read from the file at ``profiles_path`` or, when that
    argument is falsy, from standard input.  It must be a mapping with a
    "groups" list whose items each have a "name" key.
    """
    def group_names(inp):
        return {group["name"] for group in yaml.safe_load(inp)["groups"]}

    if profiles_path:
        # Explicit encoding, so that non-ASCII group names are read the same
        # way regardless of the locale.
        with open(profiles_path, encoding="utf-8") as inp:
            return group_names(inp)

    return group_names(sys.stdin)


def main():
    """Print the data of the groups named in the profiles as YAML."""
    profiles_arg = sys.argv[1] if len(sys.argv) >= 2 else None
    group_names = get_group_names(profiles_arg)
    groups = get_groups(group_names)

    yaml.safe_dump([vars(group) for group in groups.values()], sys.stdout)


if __name__ == "__main__":
    main()