# SPDX-FileCopyrightText: 2018 Karel
#
# SPDX-License-Identifier: AGPL-3.0-or-later

| 5 | +r""" |
| 6 | +
|
| 7 | +Parser of text intended to obtain IOCs from tweets. |
| 8 | +First substitutions are performed and then words in the text are compared with |
| 9 | +'(/|^)([a-z0-9.-]+\.[a-z0-9]+?)([/:]|$)' |
| 10 | +In the case of a match it is checked whether this can be a valid domain using get_tld |
| 11 | +There is also a whitelist for filtering out good domains. |
| 12 | +
|
| 13 | +
|
| 14 | +Parameters: |
| 15 | + domain_whitelist : domains that will be ignored in parsing |
| 16 | +
|
| 17 | + substitutions : semicolon separated list of pairs |
| 18 | + substitutions that will be made in the text, |
| 19 | + for example " .com,.com" enables parsing of one fuzzy format "[.];." enables the parsing of another fuzzy format |
| 20 | +
|
| 21 | + classification_type : string with a valid classificationtype |
| 22 | +""" |
import re
import pkg_resources

from typing import Optional, Iterable

from intelmq.lib.bot import ParserBot, utils
from intelmq.lib.exceptions import InvalidArgument, MissingDependencyError
from intelmq.lib.harmonization import ClassificationType

try:
    from url_normalize import url_normalize
except ImportError:
    url_normalize = None

try:
    import tld.exceptions
    from tld import get_tld
    from tld.utils import update_tld_names
except ImportError:
    get_tld = None
    update_tld_names = None


class IocExtractorParserBot(ParserBot):
    """Parse tweets and extract IoC data. Currently only URLs are supported; a whitelist of safe domains can be provided."""
    default_scheme: Optional[str] = None
    domain_whitelist: str = 't.co'
    _domain_whitelist: Iterable[str] = []
    # Each substitution pair replaces its first element with its second, i.e.
    # the defanged form with the plain one ("[.]net" -> ".net").
    substitutions: str = "[.]net;.net"
    _substitutions: Iterable[str] = []
    classification_type: str = "blacklist"

    def init(self):
        if url_normalize is None:
            raise MissingDependencyError("url-normalize")
        url_version = pkg_resources.get_distribution("url-normalize").version
        if tuple(int(v) for v in url_version.split('.')) < (1, 4, 1) and self.default_scheme is not None:
            raise ValueError("Parameter 'default_scheme' given but 'url-normalize' version %r does not support it. "
                             "Get at least version '1.4.1'." % url_version)
        if get_tld is None:
            raise MissingDependencyError("tld")
        try:
            update_tld_names()
        except tld.exceptions.TldIOError:
            self.logger.info("Could not update TLD names cache.")
        if self.domain_whitelist != '':
            self._domain_whitelist.extend(self.domain_whitelist.split(','))
        if self.substitutions != '':
            temp = self.substitutions.split(';')
            if len(temp) % 2 != 0:
                raise InvalidArgument(
                    'substitutions',
                    got=self.substitutions,
                    expected="even number of ; separated strings")
            for i in range(len(temp) // 2):
                self._substitutions.append([temp[2 * i], temp[2 * i + 1]])
        if not ClassificationType.is_valid(self.classification_type):
            self.classification_type = 'unknown'

        if self.default_scheme is not None:
            self.url_kwargs = {'default_scheme': self.default_scheme}
        else:
            self.url_kwargs = {}

    def get_domain(self, address):
        try:
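            # group(2) captures the candidate host together with any trailing
            # path ('/...') or port (':...'); group(1) anchors the match at a
            # '//' (as in 'http://') or at the start of the token.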
            dom = re.search(r'(//|^)([a-z0-9.-]*[a-z]\.[a-z][a-z-]*?(?:[/:].*|$))', address).group(2)
            if not self.in_whitelist(dom):
                normalized = url_normalize(dom, **self.url_kwargs)
                if get_tld(normalized, fail_silently=True):
                    return normalized
            return None
        except AttributeError:
            return None
        except UnicodeError:  # url_normalize's error; happens when something odd matches the regex
            self.logger.info("Caught UnicodeError on %r.", address)
            return None

    def in_whitelist(self, domain: str) -> bool:
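        # Note: whitelist entries are applied with re.search(), so they are
        # regular expressions, not literal strings: 't.co' also matches,
        # e.g., 'at.com'.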
        for dom_clean in self._domain_whitelist:
            if re.search(dom_clean, domain):
                return True
        return False

    def get_data_from_text(self, text) -> list:
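        # The text is first refanged via the configured substitutions, then
        # tokenized on whitespace; each lowercased token is checked for a
        # domain independently.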
        data = []
        for sub in self._substitutions:  # allows for custom matches
            text = text.replace(sub[0], sub[1])
        tweet_text = [part.strip().lower() for part in text.split()]
        for part in tweet_text:
            domain = self.get_domain(part)
            if domain:
                data.append(domain)
        return data

    def process(self):
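        # One event is emitted per extracted URL; the full decoded tweet text
        # is re-attached as 'raw' to every event for traceability.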
        report = self.receive_message()
        text = utils.base64_decode(report["raw"])
        data = self.get_data_from_text(text)
        for url in data:
            event = self.new_event(report)
            event.add('raw', text)
            event.add('source.url', url)
            event.add('classification.type', self.classification_type)
            self.send_message(event)
        self.acknowledge_message()


BOT = IocExtractorParserBot
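

# A minimal, self-contained sketch (illustration only, not part of the bot's
# API) of what the substitution + regex pipeline above extracts from a
# defanged tweet. The get_tld() validation, URL normalization and domain
# whitelisting that the bot also performs are omitted for brevity; the
# substitution pairs here are examples, not the bot's defaults.
def _demo_extract(text: str) -> list:
    for old, new in (('[.]', '.'), ('hxxp', 'http')):
        text = text.replace(old, new)
    found = []
    for word in (token.strip().lower() for token in text.split()):
        match = re.search(r'(//|^)([a-z0-9.-]*[a-z]\.[a-z][a-z-]*?(?:[/:].*|$))', word)
        if match:
            found.append(match.group(2))
    return found

# Example: _demo_extract("C2 at evil[.]example hxxp://bad.example/path")
# returns ['evil.example', 'bad.example/path']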