diff options
author | W. Kosior <koszko@koszko.org> | 2025-01-09 01:18:35 +0100 |
---|---|---|
committer | W. Kosior <koszko@koszko.org> | 2025-01-09 01:18:35 +0100 |
commit | cc93455733b0ee24080e78d1a78b5fe624dc1709 (patch) | |
tree | e76a306efd0eff7d778ebe8d38ae47073466d28b | |
parent | d63572395f027b7776d57e62d0019800e3c4657d (diff) | |
download | AGH-threat-intel-course-cc93455733b0ee24080e78d1a78b5fe624dc1709.tar.gz AGH-threat-intel-course-cc93455733b0ee24080e78d1a78b5fe624dc1709.zip |
remove trailing whitespace
-rwxr-xr-x | scrape_blackobird_groups_info.py | 58 |
1 files changed, 29 insertions, 29 deletions
```diff
diff --git a/scrape_blackobird_groups_info.py b/scrape_blackobird_groups_info.py
index 43d91ca..2e99959 100755
--- a/scrape_blackobird_groups_info.py
+++ b/scrape_blackobird_groups_info.py
@@ -25,10 +25,10 @@ class Group:
 
 def sanitize(text):
     text = text.replace("\xa0", "").strip()
-    
+
     if text and text[-1] == "(":
         text = text[:-1].strip()
-    
+
     return text
 
 def heavy_sanitize(text):
@@ -49,20 +49,20 @@ def page_to_group(page):
         node = node.next_sibling
         if node is None:
             return None
-    
+
     name_text = node.previous_sibling.previous_sibling.text
     name = sanitize(name_text.split(",")[0])
     if name in groups_found:
         return None
-    
+
     def incomplete_data_abort(what):
         print(f"Incomplete data for group {name} ({what}).", file=sys.stderr)
-    
+
     aliases = []
-    
+
     while True:
         node = node.next_sibling
-        
+
         if node.name == "i":
             alias = sanitize(node.previous_sibling.text)
             if alias in groups_found:
@@ -70,78 +70,78 @@ def page_to_group(page):
                       file=sys.stderr)
             elif alias and alias != name:
                 aliases.append(alias)
-        
+
         elif node.text == ("\nInformation\xa0"):
             return incomplete_data_abort("no country")
-        
+
         elif node.text == "\nCountry\xa0":
             break
-    
+
     origin = sanitize(node.next_sibling.next_sibling)
-    
+
     while node.text != "\nMotivation\xa0":
         if node.text == ("\nInformation\xa0"):
             return incomplete_data_abort("no motivation")
         node = node.next_sibling
-    
+
     motives = []
-    
+
     new_motives = node.next_sibling.next_sibling.split(",")
     new_motives = [heavy_sanitize(text) for text in new_motives]
     motives.extend(filter(None, new_motives))
-    
+
     while (node.text != ("\nInformation\xa0") and
            not node.text.startswith("\nSectors:")):
         node = node.next_sibling
-    
+
     sectors = []
-    
+
     while True:
         if node.name:
             node = node.next_sibling
             continue
-        
+
         if (node.text == "\nInformation\xa0" or
             "Countries:" in node.text or
             ("\n" in node.text and not node.text.startswith("\nSectors:"))):
             break
-        
+
         sectors_text = node.text
         if "Sectors:" in sectors_text:
             sectors_text = sectors_text.split("Sectors:")[1]
-        
+
         for text in re.split("[.]|;|,|[ \xa0]and|and[ \xa0]", sectors_text):
             sector = heavy_sanitize(text)
             if sector:
                 sectors.append(sector)
-        
+
         node = node.next_sibling
-    
+
     while (node.text != ("\nInformation\xa0") and
            "Countries:" not in node.text):
         node = node.next_sibling
-    
+
     countries = []
-    
+
     while True:
         if node.name:
             node = node.next_sibling
             continue
-        
+
         if node.text == "\nInformation\xa0" or "\n" in node.text:
             break
-        
+
         countries_text = node.text
         if "Countries:" in countries_text:
             countries_text = countries_text.split("Countries:")[1]
-        
+
         for text in re.split("[.]|;|,|[ \xa0]and|and[ \xa0]", countries_text):
             country = heavy_sanitize(text)
             if country:
                 countries.append(country)
-        
+
         node = node.next_sibling
-    
+
     return Group(
         name=name,
         aliases=aliases,
@@ -153,7 +153,7 @@ def page_to_group(page):
 
 for page in all_pages:
     group = page_to_group(page)
-    
+
     if group:
         for key in [group.name] + group.aliases:
             groups_found[key] = group
```