diff options
author | W. Kosior <koszko@koszko.org> | 2025-01-09 19:59:36 +0100 |
---|---|---|
committer | W. Kosior <koszko@koszko.org> | 2025-01-09 19:59:36 +0100 |
commit | 87276929e0ec1464626143e3d5212464fda8d61c (patch) | |
tree | 7a2ce6a55e3707b08daca0b855203a75f0f48013 | |
parent | 640517a91c588e259357c739e30edb91f02e523a (diff) | |
download | AGH-threat-intel-course-87276929e0ec1464626143e3d5212464fda8d61c.tar.gz AGH-threat-intel-course-87276929e0ec1464626143e3d5212464fda8d61c.zip |
fix handling of newlines in scraper
-rwxr-xr-x | scrape_blackobird_groups_info.py | 40 |
1 file changed, 24 insertions, 16 deletions
diff --git a/scrape_blackobird_groups_info.py b/scrape_blackobird_groups_info.py index 2e99959..082c349 100755 --- a/scrape_blackobird_groups_info.py +++ b/scrape_blackobird_groups_info.py @@ -94,7 +94,7 @@ def page_to_group(page): and not node.text.startswith("\nSectors:")): node = node.next_sibling - sectors = [] + sectors_string = "" while True: if node.name: @@ -106,22 +106,26 @@ def page_to_group(page): or ("\n" in node.text and not node.text.startswith("\nSectors:"))): break - sectors_text = node.text - if "Sectors:" in sectors_text: - sectors_text = sectors_text.split("Sectors:")[1] + new_sectors_text = node.text + if "Sectors:" in new_sectors_text: + new_sectors_text = new_sectors_text.split("Sectors:")[1] - for text in re.split("[.]|;|,|[ \xa0]and|and[ \xa0]", sectors_text): - sector = heavy_sanitize(text) - if sector: - sectors.append(sector) + sectors_string += new_sectors_text node = node.next_sibling + sectors = [] + + for text in re.split("[.]|;|,|[ \xa0]and|and[ \xa0]", sectors_string): + sector = heavy_sanitize(text) + if sector: + sectors.append(sector) + while (node.text != ("\nInformation\xa0") and "Countries:" not in node.text): node = node.next_sibling - countries = [] + countries_string = "" while True: if node.name: @@ -131,17 +135,21 @@ def page_to_group(page): if node.text == "\nInformation\xa0" or "\n" in node.text: break - countries_text = node.text - if "Countries:" in countries_text: - countries_text = countries_text.split("Countries:")[1] + new_countries_text = node.text + if "Countries:" in new_countries_text: + new_countries_text = new_countries_text.split("Countries:")[1] - for text in re.split("[.]|;|,|[ \xa0]and|and[ \xa0]", countries_text): - country = heavy_sanitize(text) - if country: - countries.append(country) + countries_string += new_countries_text node = node.next_sibling + countries = [] + + for text in re.split("[.]|;|,|[ \xa0]and|and[ \xa0]", countries_string): + country = heavy_sanitize(text) + if country: + 
countries.append(country) + return Group( name=name, aliases=aliases, |