Skip to content

Commit 0441d92

Browse files
committed
MAINT: Rename Twitter parser to IoC Extractor
Twitter collector is removed (does not work anymore) but the Twitter parser is still usable, so rename it with a more generic name
1 parent 4d50c35 commit 0441d92

File tree

11 files changed

+247
-186
lines changed

11 files changed

+247
-186
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
- Log the downloaded size in bytes to ease troubleshooting (PR#2554 by Sebastian Wagner).
3030
- Fix import for Timeout exception preventing another exception (fixes #2555, PR#2556 by Sebastian Wagner).
3131
- Remove `intelmq.bots.collectors.twitter` as it uses an unmaintained library and does not work any more (fixes #2346, #2441, PR#2568 by Sebastian Wagner).
32+
- Renamed `intelmq.bots.parsers.twitter` to `intelmq.bots.parsers.ioc_extractor` (PR#2568 by Sebastian Wagner).
33+
- Added `intelmq.bots.parsers.twitter` as a stub to load the IoC Extractor parser (PR#2568 by Sebastian Wagner).
3234

3335
#### Parsers
3436
- `intelmq.bots.parsers.shadowserver._config`:

docs/user/bots.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2268,11 +2268,12 @@ No additional parameters.
22682268

22692269
---
22702270

2271-
### Twitter <div id="intelmq.bots.parsers.twitter.parser" />
2271+
### IoC Extractor (ex: Twitter) <div id="intelmq.bots.parsers.twitter.parser" /><div id="intelmq.bots.parsers.ioc_extractor.parser" />
22722272

22732273
Extracts URLs from text, fuzzy, aimed at parsing tweets.
22742274

2275-
**Module:** `intelmq.bots.parsers.twitter.parser`
2275+
**Module:** `intelmq.bots.parsers.ioc_extractor.parser`<br>
2276+
previously: `intelmq.bots.parsers.twitter.parser`
22762277

22772278
**Parameters:**
22782279

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
# SPDX-FileCopyrightText: 2018 Karel
2+
#
3+
# SPDX-License-Identifier: AGPL-3.0-or-later
4+
5+
r"""
6+
7+
Parser of text intended to obtain IOCs from tweets.
8+
First substitutions are performed and then words in the text are compared with
9+
'(/|^)([a-z0-9.-]+\.[a-z0-9]+?)([/:]|$)'
10+
In the case of a match it is checked whether this can be a valid domain using get_tld
11+
There is also a whitelist for filtering out good domains.
12+
13+
14+
Parameters:
15+
domain_whitelist : domains that will be ignored in parsing
16+
17+
substitutions : semicolon separated list of pairs
18+
substitutions that will be made in the text,
19+
for example " .com;.com" enables parsing of one fuzzy format, "[.];." enables the parsing of another fuzzy format
20+
21+
classification_type : string with a valid classificationtype
22+
"""
23+
import re
24+
import pkg_resources
25+
26+
from typing import Optional, Iterable
27+
28+
from intelmq.lib.bot import ParserBot, utils
29+
from intelmq.lib.exceptions import InvalidArgument
30+
from intelmq.lib.harmonization import ClassificationType
31+
from intelmq.lib.exceptions import MissingDependencyError
32+
33+
try:
34+
from url_normalize import url_normalize
35+
except ImportError:
36+
url_normalize = None
37+
38+
try:
39+
import tld.exceptions
40+
from tld import get_tld
41+
from tld.utils import update_tld_names
42+
except ImportError:
43+
get_tld = None
44+
update_tld_names = None
45+
46+
47+
class IocExtractorParserBot(ParserBot):
    """Parse tweets and extract IoC data. Currently only URLs are supported, a whitelist of safe domains can be provided"""
    default_scheme: Optional[str] = None
    domain_whitelist: str = 't.co'  # comma-separated; configurable
    _domain_whitelist: Iterable[str] = []
    substitutions: str = ".net;[.]net"  # semicolon-separated flat list of (from, to) pairs
    _substitutions: Iterable[str] = []
    classification_type: str = "blacklist"

    def init(self):
        """
        Validate optional dependencies and parse the string parameters into
        internal lists.

        Raises:
            MissingDependencyError: if url-normalize or tld is not installed.
            ValueError: if default_scheme is set but url-normalize is too old.
            InvalidArgument: if `substitutions` has an odd number of items.
        """
        if url_normalize is None:
            raise MissingDependencyError("url-normalize")
        # NOTE(review): pkg_resources is deprecated upstream; consider
        # importlib.metadata when the minimum Python version allows it.
        url_version = pkg_resources.get_distribution("url-normalize").version
        if tuple(int(v) for v in url_version.split('.')) < (1, 4, 1) and self.default_scheme is not None:
            raise ValueError("Parameter 'default_scheme' given but 'url-normalize' version %r does not support it. "
                             "Get at least version '1.4.1'." % url_version)
        if get_tld is None:
            raise MissingDependencyError("tld")
        try:
            update_tld_names()
        except tld.exceptions.TldIOError:
            # Best-effort refresh of the TLD list; a stale cache is acceptable.
            self.logger.info("Could not update TLD names cache.")
        # Assign fresh instance-level lists instead of extending the
        # class-level defaults: mutating those would leak configuration
        # between instances and accumulate entries on repeated init() calls.
        self._domain_whitelist = []
        if self.domain_whitelist != '':
            self._domain_whitelist.extend(self.domain_whitelist.split(','))
        self._substitutions = []
        if self.substitutions != '':
            temp = self.substitutions.split(';')
            if len(temp) % 2 != 0:
                raise InvalidArgument(
                    'substitutions',
                    got=self.substitutions,
                    expected="even number of ; separated strings")
            # Consecutive items form (search, replace) pairs.
            for i in range(0, len(temp), 2):
                self._substitutions.append([temp[i], temp[i + 1]])
        if not ClassificationType.is_valid(self.classification_type):
            self.classification_type = 'unknown'

        if self.default_scheme is not None:
            self.url_kwargs = {'default_scheme': self.default_scheme}
        else:
            self.url_kwargs = {}

    def get_domain(self, address):
        """
        Extract and normalize a domain/URL from a single token.

        Returns the normalized URL if the token contains a plausible,
        non-whitelisted domain with a valid TLD, otherwise None.
        """
        try:
            dom = re.search(r'(//|^)([a-z0-9.-]*[a-z]\.[a-z][a-z-]*?(?:[/:].*|$))', address).group(2)
            if not self.in_whitelist(dom):
                # Normalize once and reuse; the original normalized twice.
                normalized = url_normalize(dom, **self.url_kwargs)
                if get_tld(normalized, fail_silently=True):
                    return normalized
            return None
        except AttributeError:
            # re.search returned None: the token contains no domain-like part.
            return None
        except UnicodeError:  # url_normalize's error, happens when something weird matches regex
            self.logger.info("Caught UnicodeError on %r.", address)
            return None

    def in_whitelist(self, domain: str) -> bool:
        """Return True if `domain` matches any whitelist pattern (regex search)."""
        for dom_clean in self._domain_whitelist:
            if re.search(dom_clean, domain):
                return True
        return False

    def get_data_from_text(self, text) -> list:
        """Apply the configured substitutions, then collect all extractable URLs from the text."""
        data = []
        for sub in self._substitutions:  # allows for custom matches
            text = text.replace(sub[0], sub[1])
        tweet_text = text.split()
        tweet_text = [x.strip().lower() for x in tweet_text]
        for part in tweet_text:
            domain = self.get_domain(part)
            if domain:
                data.append(domain)
        return data

    def process(self):
        """Decode the report, extract URLs and emit one event per URL found."""
        report = self.receive_message()
        text = utils.base64_decode(report["raw"])
        data = self.get_data_from_text(text)
        for url in data:
            event = self.new_event(report)
            event.add('raw', text)
            event.add('source.url', url)
            event.add('classification.type', self.classification_type)
            self.send_message(event)
        self.acknowledge_message()


BOT = IocExtractorParserBot
Lines changed: 12 additions & 117 deletions
Original file line numberDiff line numberDiff line change
@@ -1,132 +1,27 @@
1-
# SPDX-FileCopyrightText: 2018 Karel
1+
# SPDX-FileCopyrightText: 2025 Institute for Common Good Technology, Sebastian Wagner
22
#
33
# SPDX-License-Identifier: AGPL-3.0-or-later
44

5-
r"""
6-
7-
Parser of text intended to obtain IOCs from tweets.
8-
First substitutions are performed and then words in the text are compared with
9-
'(/|^)([a-z0-9.-]+\.[a-z0-9]+?)([/:]|$)'
10-
In the case of a match it is checked whether this can be a valid domain using get_tld
11-
There is also a whitelist for filtering out good domains.
12-
13-
14-
Parameters:
15-
domain_whitelist : domains that will be ignored in parsing
16-
17-
substitutions : semicolon separated list of pairs
18-
substitutions that will be made in the text,
19-
for example " .com,.com" enables parsing of one fuzzy format "[.];." enables the parsing of another fuzzy format
20-
21-
classification_type : string with a valid classificationtype
225
"""
23-
import re
24-
import pkg_resources
25-
26-
from typing import Optional, Iterable
6+
A stub for backwards-compatibility
7+
"""
8+
from typing import Optional, List
279

28-
from intelmq.lib.bot import ParserBot, utils
29-
from intelmq.lib.exceptions import InvalidArgument
30-
from intelmq.lib.harmonization import ClassificationType
31-
from intelmq.lib.exceptions import MissingDependencyError
10+
from intelmq.bots.parsers.ioc_extractor.parser import IocExtractorParserBot
3211

33-
try:
34-
from url_normalize import url_normalize
35-
except ImportError:
36-
url_normalize = None
3712

38-
try:
39-
import tld.exceptions
40-
from tld import get_tld
41-
from tld.utils import update_tld_names
42-
except ImportError:
43-
get_tld = None
44-
update_tld_names = None
13+
# Deprecation notice, emitted both at runtime (init) and by the check command.
DEPRECATION_WARNING = "This bot is deprecated and will be removed in version 4.0. Use the 'IoC Extractor' bot instead."


# Backwards-compatible stub: loads the renamed IoC Extractor parser under the
# old Twitter parser module path and warns about the deprecation.
class TwitterParserBot(IocExtractorParserBot):

    def init(self):
        # Logger.warn is a deprecated alias of Logger.warning; use the
        # canonical method name.
        self.logger.warning(DEPRECATION_WARNING)
        super().init()

    @staticmethod
    def check(parameters: dict) -> Optional[List[List[str]]]:
        # Surface the deprecation to `intelmqctl check` as a warning entry.
        return [["warning", DEPRECATION_WARNING]]


BOT = TwitterParserBot
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
# SPDX-FileCopyrightText: 2018 Karel
2+
#
3+
# SPDX-License-Identifier: AGPL-3.0-or-later
4+
5+
# -*- coding: utf-8 -*-
6+
7+
import os
8+
import sys
9+
import unittest
10+
11+
import intelmq.lib.test as test
12+
import intelmq.lib.utils as utils
13+
from intelmq.bots.parsers.ioc_extractor.parser import IocExtractorParserBot
14+
15+
with open(os.path.join(os.path.dirname(__file__), 'tweet.txt')) as handle:
    EXAMPLE_FILE = handle.read()

# Encoded once and shared by the report and every expected event.
_RAW = utils.base64_encode(EXAMPLE_FILE)

REPORT = {
    '__type': 'Report',
    'feed.name': 'Twitter',
    'feed.url': 'https://twitter.com/El_carlos/1234456',
    'raw': _RAW,
    'time.observation': '2015-09-14T12:00:00+02:00',
}

# The expected events are identical except for the extracted URL.
_EXPECTED_URLS = (
    'http://testweb.com/sales-invoice/',
    'http://cc.pro/images.html',
    'http://karlos.net/',
    'http://block.de.com/malware.exe',
    'http://ghzz.com/',
)

EVENTS = [
    {
        '__type': 'Event',
        'feed.name': 'Twitter',
        'classification.type': 'blacklist',
        'feed.url': 'https://twitter.com/El_carlos/1234456',
        'time.observation': '2015-09-14T12:00:00+02:00',
        'raw': _RAW,
        'source.url': url,
    }
    for url in _EXPECTED_URLS
]
60+
61+
62+
@test.skip_exotic()
class TestIocExtractorParserBot(test.BotTestCase, unittest.TestCase):
    """
    A TestCase for IocExtractorParserBot.
    """

    @classmethod
    def set_bot(cls):
        cls.bot_reference = IocExtractorParserBot
        cls.sysconfig = {
            "substitutions": " .net;.net;[.];.;,;.",
            "classification_type": "blacklist",
            # url-normalize 1.4.1 supporting this parameter is only available for 3.6
            "default_scheme": "http",
        }

    def test_parse(self):
        self.input_message = REPORT
        self.run_bot()
        # One queued event expected per entry in EVENTS, in order.
        for position, expected in enumerate(EVENTS):
            self.assertMessageEqual(position, expected)


if __name__ == '__main__':  # pragma: no cover
    unittest.main()

0 commit comments

Comments
 (0)