summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author W. Kosior <koszko@koszko.org> 2025-01-09 19:59:36 +0100
committer W. Kosior <koszko@koszko.org> 2025-01-09 19:59:36 +0100
commit 87276929e0ec1464626143e3d5212464fda8d61c (patch)
tree 7a2ce6a55e3707b08daca0b855203a75f0f48013
parent 640517a91c588e259357c739e30edb91f02e523a (diff)
download AGH-threat-intel-course-87276929e0ec1464626143e3d5212464fda8d61c.tar.gz
AGH-threat-intel-course-87276929e0ec1464626143e3d5212464fda8d61c.zip
fix handling of newlines in scraper
-rwxr-xr-x scrape_blackobird_groups_info.py 40
1 files changed, 24 insertions, 16 deletions
diff --git a/scrape_blackobird_groups_info.py b/scrape_blackobird_groups_info.py
index 2e99959..082c349 100755
--- a/scrape_blackobird_groups_info.py
+++ b/scrape_blackobird_groups_info.py
@@ -94,7 +94,7 @@ def page_to_group(page):
and not node.text.startswith("\nSectors:")):
node = node.next_sibling
- sectors = []
+ sectors_string = ""
while True:
if node.name:
@@ -106,22 +106,26 @@ def page_to_group(page):
or ("\n" in node.text and not node.text.startswith("\nSectors:"))):
break
- sectors_text = node.text
- if "Sectors:" in sectors_text:
- sectors_text = sectors_text.split("Sectors:")[1]
+ new_sectors_text = node.text
+ if "Sectors:" in new_sectors_text:
+ new_sectors_text = new_sectors_text.split("Sectors:")[1]
- for text in re.split("[.]|;|,|[ \xa0]and|and[ \xa0]", sectors_text):
- sector = heavy_sanitize(text)
- if sector:
- sectors.append(sector)
+ sectors_string += new_sectors_text
node = node.next_sibling
+ sectors = []
+
+ for text in re.split("[.]|;|,|[ \xa0]and|and[ \xa0]", sectors_string):
+ sector = heavy_sanitize(text)
+ if sector:
+ sectors.append(sector)
+
while (node.text != ("\nInformation\xa0")
and "Countries:" not in node.text):
node = node.next_sibling
- countries = []
+ countries_string = ""
while True:
if node.name:
@@ -131,17 +135,21 @@ def page_to_group(page):
if node.text == "\nInformation\xa0" or "\n" in node.text:
break
- countries_text = node.text
- if "Countries:" in countries_text:
- countries_text = countries_text.split("Countries:")[1]
+ new_countries_text = node.text
+ if "Countries:" in new_countries_text:
+ new_countries_text = new_countries_text.split("Countries:")[1]
- for text in re.split("[.]|;|,|[ \xa0]and|and[ \xa0]", countries_text):
- country = heavy_sanitize(text)
- if country:
- countries.append(country)
+ countries_string += new_countries_text
node = node.next_sibling
+ countries = []
+
+ for text in re.split("[.]|;|,|[ \xa0]and|and[ \xa0]", countries_string):
+ country = heavy_sanitize(text)
+ if country:
+ countries.append(country)
+
return Group(
name=name,
aliases=aliases,