1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
|
#!/usr/bin/env python3
# SPDX-License-Identifier: CC0-1.0
#
# Copyright (C) 2024 Wojtek Kosior <koszko@koszko.org>
from dataclasses import dataclass
from pathlib import Path
import sys
import time
from bs4 import BeautifulSoup
import requests
import yaml
# Local directory under which downloaded pages are cached; cached files are
# placed at the same relative path the page has on attack.mitre.org.
mitre_pages_path = Path(".", "scraping", "attack.mitre.org")

# Minimum spacing, in seconds, between two consecutive downloads.
mitre_request_delay_time = 5

# time.time() value stored after the most recent download; 0 means that no
# page has been downloaded yet during this run.
last_mitre_download_time = 0
def mitre_page_download(path, timeout=30):
    """Fetch a page from attack.mitre.org and return its body as text.

    Downloads are spaced at least ``mitre_request_delay_time`` seconds apart,
    counted from the end of the previous request; when called earlier than
    that, the function sleeps for the remaining time first.

    Args:
        path: Location of the page relative to the site root, e.g.
            ``"groups/"``.
        timeout: Seconds to wait for the server before giving up.  Without
            a timeout ``requests`` waits indefinitely on a stalled
            connection.

    Returns:
        The response body as ``str``.

    Raises:
        requests.HTTPError: The server answered with an error status.
        requests.Timeout: The server did not answer within ``timeout``.
    """
    global last_mitre_download_time

    time_to_wait = (last_mitre_download_time + mitre_request_delay_time
                    - time.time())
    if time_to_wait > 0:
        time.sleep(time_to_wait)

    uri = 'https://attack.mitre.org/' + path
    print(f"Fetching `{uri}'.", file=sys.stderr)

    try:
        response = requests.get(uri, timeout=timeout)
        response.raise_for_status()
    finally:
        # A failed request has reached the server as well, so it counts
        # towards the rate limit just like a successful one.
        last_mitre_download_time = time.time()

    return response.text
def mitre_page_get(path):
    """Return the HTML of an attack.mitre.org page, using the on-disk cache.

    The cached copy lives under ``mitre_pages_path`` at the page's relative
    path with an ``.html`` suffix.  When no copy exists, the page is
    downloaded with ``mitre_page_download`` and stored before being returned.

    Args:
        path: Location of the page relative to the site root, e.g.
            ``"groups/"``.

    Returns:
        The page's HTML as ``str``.
    """
    page_path = (mitre_pages_path / path).with_suffix(".html")

    # The encoding is given explicitly so that the cache does not depend on
    # the locale's default encoding, which may be unable to represent every
    # character of the page.
    if page_path.exists():
        return page_path.read_text(encoding="utf-8")

    page_path.parent.mkdir(parents=True, exist_ok=True)
    page_text = mitre_page_download(path)
    page_path.write_text(page_text, encoding="utf-8")
    return page_text
@dataclass
class Group:
    """A threat group as listed in the attack.mitre.org groups table."""
    # Group name (link text of the table row's second cell).
    name: str
    # Group identifier (link text of the table row's first cell).
    mitre_id: str
    # Alternative names parsed from the table row's third cell.
    aliases: list[str]
    # Identifiers of the techniques collected from the group's own page.
    technique_ids: list[str]
@dataclass
class Technique:
    """A technique scraped from a group's ``techniques-used`` table."""
    # Technique name (text of the first link in the row's name cell).
    name: str
    # Technique identifier (link text of the row's second cell).
    mitre_id: str
def get_group_techniques(gid):
    """Collect the techniques listed on a group's attack.mitre.org page.

    Rows of the ``techniques-used`` table that carry the ``sub`` class are
    skipped.  A row whose identifier or name cannot be extracted is reported
    on stderr and skipped instead of aborting the whole run.

    Args:
        gid: Group identifier used to build the page path
            ``groups/<gid>/``.

    Returns:
        A dict mapping technique identifier to ``Technique``.
    """
    techniques = {}
    soup = BeautifulSoup(mitre_page_get(f"groups/{gid}/"), features="lxml")

    for row in soup.select("table.techniques-used tbody tr"):
        if "sub" in row.attrs.get("class", []):
            continue

        tid = ""
        name = ""

        # select_one() yields None when nothing matches, so every lookup is
        # checked before its text is read.
        tid_cell = row.select_one("td:nth-child(2)")
        if tid_cell is not None:
            tid_link = tid_cell.select_one("a")
            if tid_link is not None:
                tid = tid_link.get_text(strip=True)

            # The name cell's position is derived from the identifier
            # cell's colspan.
            # NOTE(review): with the default colspan of 1 this selects the
            # identifier cell itself - confirm against the page layout.
            name_cell_idx = 1 + int(tid_cell.attrs.get("colspan", 1))
            name_link = row.select_one(
                f"td:nth-child({name_cell_idx}) a:first-child"
            )
            if name_link is not None:
                name = name_link.get_text(strip=True)

        if not name or not tid:
            print("Incomplete data for group's technique.", file=sys.stderr)
        else:
            techniques[tid] = Technique(name, tid)

    return techniques
def get_groups_and_techniques(relevant_names):
    """Scrape the groups listing and the techniques of the relevant groups.

    Every row of the groups page is examined; a diagnostic is printed to
    stderr for rows that are incomplete, not requested, or duplicated.  For
    each remaining group its own page is read to collect its techniques.

    Args:
        relevant_names: Container of group names to keep; only membership
            tests (``name in relevant_names``) are performed on it.

    Returns:
        A ``(groups, all_techniques)`` tuple where ``groups`` maps group
        name to ``Group`` and ``all_techniques`` maps technique identifier
        to ``Technique`` across all kept groups.
    """
    groups = {}
    all_techniques = {}
    soup = BeautifulSoup(mitre_page_get("groups/"), features="lxml")

    for row in soup.select("tbody tr"):
        # select_one() yields None when nothing matches; treat that the
        # same as an empty cell so the row is reported rather than crashing.
        gid_link = row.select_one("td:nth-child(1) a")
        name_link = row.select_one("td:nth-child(2) a")
        gid = gid_link.get_text(strip=True) if gid_link is not None else ""
        name = name_link.get_text(strip=True) if name_link is not None else ""

        if not name or not gid:
            print("Incomplete data for group.", file=sys.stderr)
            continue
        if name not in relevant_names:
            print(f"Ignoring group `{name}'", file=sys.stderr)
            continue
        if name in groups:
            print(f"Double definition of group `{name}'", file=sys.stderr)
            continue

        aliases_cell = row.select_one("td:nth-child(3)")
        aliases_text = aliases_cell.get_text() if aliases_cell is not None else ""
        stripped_aliases = (alias.strip() for alias in aliases_text.split(", "))
        aliases = [alias for alias in stripped_aliases if alias]

        techniques = get_group_techniques(gid)
        all_techniques.update(techniques)

        groups[name] = Group(name, gid, aliases,
                             [t.mitre_id for t in techniques.values()])

    return groups, all_techniques
def get_profiles_data(profiles_path):
    """Load the profiles YAML document from a file or from standard input.

    Args:
        profiles_path: Path of the YAML file to read.  When falsy (``None``
            or an empty string), the document is read from ``sys.stdin``.

    Returns:
        Whatever ``yaml.safe_load`` produces for the document.
    """
    if not profiles_path:
        return yaml.safe_load(sys.stdin)

    # Decode explicitly as UTF-8 so the result does not depend on the
    # locale's default encoding.
    with open(profiles_path, encoding="utf-8") as inp:
        return yaml.safe_load(inp)
if __name__ == "__main__":
    # Profiles are read from the file named by the first command-line
    # argument or, when there is none, from standard input.
    profiles_data = get_profiles_data(
        sys.argv[1] if len(sys.argv) >= 2 else None
    )

    # Index the profile entries by group name; the same mapping also tells
    # the scraper which groups are wanted.
    group_profiles = {g["name"]: g for g in profiles_data["groups"]}
    groups, techniques = get_groups_and_techniques(group_profiles)

    missing_names = group_profiles.keys() - groups.keys()
    if missing_names:
        print(f"No data found for group(s): {', '.join(sorted(missing_names))}",
              file=sys.stderr)

    # Merge the scraped fields into the matching profile entries.
    for name, group in groups.items():
        group_profiles[name].update(vars(group))

    profiles_data["groups"] = list(group_profiles.values())
    profiles_data["techniques"] = [vars(t) for t in techniques.values()]

    yaml.safe_dump(profiles_data, sys.stdout)
|