Skip to content

Commit 0441d92

Browse files
committed
MAINT: Rename Twitter parser to IoC Extractor
Twitter collector is removed (does not work anymore) but the Twitter parser is still usable, so rename it with a more generic name
1 parent 4d50c35 commit 0441d92

File tree

11 files changed

+247
-186
lines changed

11 files changed

+247
-186
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
- Log the downloaded size in bytes to ease troubleshooting (PR#2554 by Sebastian Wagner).
3030
- Fix import for Timeout exception preventing another exception (fixes #2555, PR#2556 by Sebastian Wagner).
3131
- Remove `intelmq.bots.collectors.twitter` as it uses an unmaintained library and does not work any more (fixes #2346, #2441, PR#2568 by Sebastian Wagner).
32+
- Renamed `intelmq.bots.parsers.twitter` to `intelmq.bots.parsers.ioc_extractor` (PR#2568 by Sebastian Wagner).
33+
- Added `intelmq.bots.parsers.twitter` as a stub to load the IoC Extractor parser (PR#2568 by Sebastian Wagner).
3234

3335
#### Parsers
3436
- `intelmq.bots.parsers.shadowserver._config`:

docs/user/bots.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2268,11 +2268,12 @@ No additional parameters.
22682268

22692269
---
22702270

2271-
### Twitter <div id="intelmq.bots.parsers.twitter.parser" />
2271+
### IoC Extractor (ex: Twitter) <div id="intelmq.bots.parsers.twitter.parser" /><div id="intelmq.bots.parsers.ioc_extractor.parser" />
22722272

22732273
Extracts URLs from text, fuzzy, aimed at parsing tweets.
22742274

2275-
**Module:** `intelmq.bots.parsers.twitter.parser`
2275+
**Module:** `intelmq.bots.parsers.ioc_extractor.parser`<br>
2276+
previously: `intelmq.bots.parsers.twitter.parser`
22762277

22772278
**Parameters:**
22782279

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
# SPDX-FileCopyrightText: 2018 Karel
2+
#
3+
# SPDX-License-Identifier: AGPL-3.0-or-later
4+
5+
r"""
6+
7+
Parser of text intended to obtain IOCs from tweets.
8+
First substitutions are performed and then words in the text are compared with
9+
'(/|^)([a-z0-9.-]+\.[a-z0-9]+?)([/:]|$)'
10+
In the case of a match it is checked whether this can be a valid domain using get_tld
11+
There is also a whitelist for filtering out good domains.
12+
13+
14+
Parameters:
15+
domain_whitelist : domains that will be ignored in parsing
16+
17+
substitutions : semicolon separated list of pairs
18+
substitutions that will be made in the text,
19+
for example " .com;.com" enables parsing of one fuzzy format, "[.];." enables the parsing of another fuzzy format
20+
21+
classification_type : string with a valid classificationtype
22+
"""
23+
import re
24+
import pkg_resources
25+
26+
from typing import Optional, Iterable
27+
28+
from intelmq.lib.bot import ParserBot, utils
29+
from intelmq.lib.exceptions import InvalidArgument
30+
from intelmq.lib.harmonization import ClassificationType
31+
from intelmq.lib.exceptions import MissingDependencyError
32+
33+
try:
34+
from url_normalize import url_normalize
35+
except ImportError:
36+
url_normalize = None
37+
38+
try:
39+
import tld.exceptions
40+
from tld import get_tld
41+
from tld.utils import update_tld_names
42+
except ImportError:
43+
get_tld = None
44+
update_tld_names = None
45+
46+
47+
class IocExtractorParserBot(ParserBot):
    """Parse tweets and extract IoC data. Currently only URLs are supported, a whitelist of safe domains can be provided"""
    default_scheme: Optional[str] = None
    domain_whitelist: str = 't.co'  # comma-separated; configurable
    _domain_whitelist: Iterable[str] = []
    substitutions: str = ".net;[.]net"  # semicolon-separated flat list of (from, to) pairs
    _substitutions: Iterable[str] = []
    classification_type: str = "blacklist"

    def init(self):
        """
        Validate optional dependencies and parse the string parameters into
        internal lists.

        Raises:
            MissingDependencyError: if url-normalize or tld is not installed.
            ValueError: if default_scheme is set but url-normalize is too old.
            InvalidArgument: if `substitutions` has an odd number of items.
        """
        if url_normalize is None:
            raise MissingDependencyError("url-normalize")
        # NOTE(review): pkg_resources is deprecated upstream; consider
        # importlib.metadata when the minimum Python version allows it.
        url_version = pkg_resources.get_distribution("url-normalize").version
        if tuple(int(v) for v in url_version.split('.')) < (1, 4, 1) and self.default_scheme is not None:
            raise ValueError("Parameter 'default_scheme' given but 'url-normalize' version %r does not support it. "
                             "Get at least version '1.4.1'." % url_version)
        if get_tld is None:
            raise MissingDependencyError("tld")
        try:
            update_tld_names()
        except tld.exceptions.TldIOError:
            # Best-effort refresh of the TLD list; a stale cache is acceptable.
            self.logger.info("Could not update TLD names cache.")
        # Assign fresh instance-level lists instead of extending the
        # class-level defaults: mutating those would leak configuration
        # between instances and accumulate entries on repeated init() calls.
        self._domain_whitelist = []
        if self.domain_whitelist != '':
            self._domain_whitelist.extend(self.domain_whitelist.split(','))
        self._substitutions = []
        if self.substitutions != '':
            temp = self.substitutions.split(';')
            if len(temp) % 2 != 0:
                raise InvalidArgument(
                    'substitutions',
                    got=self.substitutions,
                    expected="even number of ; separated strings")
            # Consecutive items form (search, replace) pairs.
            for i in range(0, len(temp), 2):
                self._substitutions.append([temp[i], temp[i + 1]])
        if not ClassificationType.is_valid(self.classification_type):
            self.classification_type = 'unknown'

        if self.default_scheme is not None:
            self.url_kwargs = {'default_scheme': self.default_scheme}
        else:
            self.url_kwargs = {}

    def get_domain(self, address):
        """
        Extract and normalize a domain/URL from a single token.

        Returns the normalized URL if the token contains a plausible,
        non-whitelisted domain with a valid TLD, otherwise None.
        """
        try:
            dom = re.search(r'(//|^)([a-z0-9.-]*[a-z]\.[a-z][a-z-]*?(?:[/:].*|$))', address).group(2)
            if not self.in_whitelist(dom):
                # Normalize once and reuse; the original normalized twice.
                normalized = url_normalize(dom, **self.url_kwargs)
                if get_tld(normalized, fail_silently=True):
                    return normalized
            return None
        except AttributeError:
            # re.search returned None: the token contains no domain-like part.
            return None
        except UnicodeError:  # url_normalize's error, happens when something weird matches regex
            self.logger.info("Caught UnicodeError on %r.", address)
            return None

    def in_whitelist(self, domain: str) -> bool:
        """Return True if `domain` matches any whitelist pattern (regex search)."""
        for dom_clean in self._domain_whitelist:
            if re.search(dom_clean, domain):
                return True
        return False

    def get_data_from_text(self, text) -> list:
        """Apply the configured substitutions, then collect all extractable URLs from the text."""
        data = []
        for sub in self._substitutions:  # allows for custom matches
            text = text.replace(sub[0], sub[1])
        tweet_text = text.split()
        tweet_text = [x.strip().lower() for x in tweet_text]
        for part in tweet_text:
            domain = self.get_domain(part)
            if domain:
                data.append(domain)
        return data

    def process(self):
        """Decode the report, extract URLs and emit one event per URL found."""
        report = self.receive_message()
        text = utils.base64_decode(report["raw"])
        data = self.get_data_from_text(text)
        for url in data:
            event = self.new_event(report)
            event.add('raw', text)
            event.add('source.url', url)
            event.add('classification.type', self.classification_type)
            self.send_message(event)
        self.acknowledge_message()


BOT = IocExtractorParserBot
Lines changed: 12 additions & 117 deletions
Original file line numberDiff line numberDiff line change
@@ -1,132 +1,27 @@
1-
# SPDX-FileCopyrightText: 2018 Karel
1+
# SPDX-FileCopyrightText: 2025 Institute for Common Good Technology, Sebastian Wagner
22
#
33
# SPDX-License-Identifier: AGPL-3.0-or-later
44

5-
r"""
6-
7-
Parser of text intended to obtain IOCs from tweets.
8-
First substitutions are performed and then words in the text are compared with
9-
'(/|^)([a-z0-9.-]+\.[a-z0-9]+?)([/:]|$)'
10-
In the case of a match it is checked whether this can be a valid domain using get_tld
11-
There is also a whitelist for filtering out good domains.
12-
13-
14-
Parameters:
15-
domain_whitelist : domains that will be ignored in parsing
16-
17-
substitutions : semicolon separated list of pairs
18-
substitutions that will be made in the text,
19-
for example " .com,.com" enables parsing of one fuzzy format "[.];." enables the parsing of another fuzzy format
20-
21-
classification_type : string with a valid classificationtype
225
"""
23-
import re
24-
import pkg_resources
25-
26-
from typing import Optional, Iterable
6+
A stub for backwards-compatibility
7+
"""
8+
from typing import Optional, List
279

28-
from intelmq.lib.bot import ParserBot, utils
29-
from intelmq.lib.exceptions import InvalidArgument
30-
from intelmq.lib.harmonization import ClassificationType
31-
from intelmq.lib.exceptions import MissingDependencyError
10+
from intelmq.bots.parsers.ioc_extractor.parser import IocExtractorParserBot
3211

33-
try:
34-
from url_normalize import url_normalize
35-
except ImportError:
36-
url_normalize = None
3712

38-
try:
39-
import tld.exceptions
40-
from tld import get_tld
41-
from tld.utils import update_tld_names
42-
except ImportError:
43-
get_tld = None
44-
update_tld_names = None
13+
# Deprecation notice, emitted both at runtime (init) and by the check command.
DEPRECATION_WARNING = "This bot is deprecated and will be removed in version 4.0. Use the 'IoC Extractor' bot instead."


# Backwards-compatible stub: loads the renamed IoC Extractor parser under the
# old Twitter parser module path and warns about the deprecation.
class TwitterParserBot(IocExtractorParserBot):

    def init(self):
        # Logger.warn is a deprecated alias of Logger.warning; use the
        # canonical method name.
        self.logger.warning(DEPRECATION_WARNING)
        super().init()

    @staticmethod
    def check(parameters: dict) -> Optional[List[List[str]]]:
        # Surface the deprecation to `intelmqctl check` as a warning entry.
        return [["warning", DEPRECATION_WARNING]]


BOT = TwitterParserBot
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
# SPDX-FileCopyrightText: 2018 Karel
2+
#
3+
# SPDX-License-Identifier: AGPL-3.0-or-later
4+
5+
# -*- coding: utf-8 -*-
6+
7+
import os
8+
import sys
9+
import unittest
10+
11+
import intelmq.lib.test as test
12+
import intelmq.lib.utils as utils
13+
from intelmq.bots.parsers.ioc_extractor.parser import IocExtractorParserBot
14+
15+
with open(os.path.join(os.path.dirname(__file__), 'tweet.txt')) as handle:
    EXAMPLE_FILE = handle.read()

# Encoded once and shared by the report and every expected event.
_RAW = utils.base64_encode(EXAMPLE_FILE)

REPORT = {
    '__type': 'Report',
    'feed.name': 'Twitter',
    'feed.url': 'https://twitter.com/El_carlos/1234456',
    'raw': _RAW,
    'time.observation': '2015-09-14T12:00:00+02:00',
}

# The expected events are identical except for the extracted URL.
_EXPECTED_URLS = (
    'http://testweb.com/sales-invoice/',
    'http://cc.pro/images.html',
    'http://karlos.net/',
    'http://block.de.com/malware.exe',
    'http://ghzz.com/',
)

EVENTS = [
    {
        '__type': 'Event',
        'feed.name': 'Twitter',
        'classification.type': 'blacklist',
        'feed.url': 'https://twitter.com/El_carlos/1234456',
        'time.observation': '2015-09-14T12:00:00+02:00',
        'raw': _RAW,
        'source.url': url,
    }
    for url in _EXPECTED_URLS
]
60+
61+
62+
@test.skip_exotic()
class TestIocExtractorParserBot(test.BotTestCase, unittest.TestCase):
    """
    A TestCase for IocExtractorParserBot.
    """

    @classmethod
    def set_bot(cls):
        cls.bot_reference = IocExtractorParserBot
        cls.sysconfig = {
            "substitutions": " .net;.net;[.];.;,;.",
            "classification_type": "blacklist",
            # url-normalize 1.4.1 supporting this parameter is only available for 3.6
            "default_scheme": "http",
        }

    def test_parse(self):
        self.input_message = REPORT
        self.run_bot()
        # One queued event expected per entry in EVENTS, in order.
        for position, expected in enumerate(EVENTS):
            self.assertMessageEqual(position, expected)


if __name__ == '__main__':  # pragma: no cover
    unittest.main()

0 commit comments

Comments
 (0)