fix handling of newlines in scraper

author: W. Kosior <koszko@koszko.org> 2025-01-09 19:59:36 +0100
committer: W. Kosior <koszko@koszko.org> 2025-01-09 19:59:36 +0100
commit: 87276929e0ec1464626143e3d5212464fda8d61c (patch)
tree: 7a2ce6a55e3707b08daca0b855203a75f0f48013
parent: 640517a91c588e259357c739e30edb91f02e523a (diff)
download: AGH-threat-intel-course-87276929e0ec1464626143e3d5212464fda8d61c.tar.gz AGH-threat-intel-course-87276929e0ec1464626143e3d5212464fda8d61c.zip
1 file changed, 24 insertions, 16 deletions
diff --git a/scrape_blackobird_groups_info.py b/scrape_blackobird_groups_info.py
index 2e99959..082c349 100755
--- a/scrape_blackobird_groups_info.py
+++ b/scrape_blackobird_groups_info.py
@@ -94,7 +94,7 @@ def page_to_group(page):
            and not node.text.startswith("\nSectors:")):
         node = node.next_sibling
 
-    sectors = []
+    sectors_string = ""
 
     while True:
         if node.name:
@@ -106,22 +106,26 @@ def page_to_group(page):
             or ("\n" in node.text and not node.text.startswith("\nSectors:"))):
             break
 
-        sectors_text = node.text
-        if "Sectors:" in sectors_text:
-            sectors_text = sectors_text.split("Sectors:")[1]
+        new_sectors_text = node.text
+        if "Sectors:" in new_sectors_text:
+            new_sectors_text = new_sectors_text.split("Sectors:")[1]
 
-        for text in re.split("[.]|;|,|[ \xa0]and|and[ \xa0]", sectors_text):
-            sector = heavy_sanitize(text)
-            if sector:
-                sectors.append(sector)
+        sectors_string += new_sectors_text
 
         node = node.next_sibling
 
+    sectors = []
+
+    for text in re.split("[.]|;|,|[ \xa0]and|and[ \xa0]", sectors_string):
+        sector = heavy_sanitize(text)
+        if sector:
+            sectors.append(sector)
+
     while (node.text != ("\nInformation\xa0")
            and "Countries:" not in node.text):
         node = node.next_sibling
 
-    countries = []
+    countries_string = ""
 
     while True:
         if node.name:
@@ -131,17 +135,21 @@ def page_to_group(page):
         if node.text == "\nInformation\xa0" or "\n" in node.text:
             break
 
-        countries_text = node.text
-        if "Countries:" in countries_text:
-            countries_text = countries_text.split("Countries:")[1]
+        new_countries_text = node.text
+        if "Countries:" in new_countries_text:
+            new_countries_text = new_countries_text.split("Countries:")[1]
 
-        for text in re.split("[.]|;|,|[ \xa0]and|and[ \xa0]", countries_text):
-            country = heavy_sanitize(text)
-            if country:
-                countries.append(country)
+        countries_string += new_countries_text
 
         node = node.next_sibling
 
+    countries = []
+
+    for text in re.split("[.]|;|,|[ \xa0]and|and[ \xa0]", countries_string):
+        country = heavy_sanitize(text)
+        if country:
+            countries.append(country)
+
     return Group(
         name=name,
         aliases=aliases,
author	W. Kosior <koszko@koszko.org>	2025-01-09 19:59:36 +0100
committer	W. Kosior <koszko@koszko.org>	2025-01-09 19:59:36 +0100
commit	87276929e0ec1464626143e3d5212464fda8d61c (patch)
tree	7a2ce6a55e3707b08daca0b855203a75f0f48013
parent	640517a91c588e259357c739e30edb91f02e523a (diff)
download	AGH-threat-intel-course-87276929e0ec1464626143e3d5212464fda8d61c.tar.gz AGH-threat-intel-course-87276929e0ec1464626143e3d5212464fda8d61c.zip