
Commit 4ff15de

sacca97 authored and copernico committed
prepared new rules, cleaned Git and raw_commit code, optimized NLP code, set up real filtering of commits, optimized version_to_tag matches, added various security keywords, deleted some old code
1 parent 279a999 commit 4ff15de

File tree

15 files changed: +408 -668 lines changed


prospector/datamodel/advisory.py

Lines changed: 8 additions & 12 deletions
@@ -64,6 +64,7 @@ def __init__(
         versions: Dict[str, List[str]] = None,
         files: Set[str] = None,
         keywords: Set[str] = None,
+        files_extensions: Set[str] = None,
     ):
         self.cve_id = cve_id
         self.description = description
@@ -75,6 +76,7 @@ def __init__(
         self.versions = versions or dict()
         self.files = files or set()
         self.keywords = keywords or set()
+        self.files_extension = files_extensions or set()

     def analyze(
         self,
@@ -89,8 +91,10 @@ def analyze(
         self.affected_products.extend(extract_products(self.description))
         self.affected_products = list(set(self.affected_products))

+        files, extension = extract_affected_filenames(self.description)
+        self.files_extension = extension
         # TODO: this could be done on the words extracted from the description
-        self.files.update(extract_affected_filenames(self.description))
+        self.files.update(files)

         self.keywords.update(set(extract_words_from_text(self.description)))

@@ -135,14 +139,6 @@ def parse_advisory(self, data):
         ]
         self.versions["fixed"] = [v for v in self.versions["fixed"] if v is not None]

-        # [
-        #     (
-        #         item.get("versionEndIncluding"),  # item.get("versionStartExcluding")
-        #         item.get("versionEndExcluding"),  # , item.get("versionEndIncluding")
-        #     )
-        #     for item in data["configurations"][0]["nodes"][0]["cpeMatch"]
-        # ]
-

 def get_from_nvd(cve_id: str):
     """Get an advisory from the NVD dtabase"""
@@ -182,7 +178,7 @@ def build_advisory_record(
     fetch_references: bool = False,
     use_nvd: bool = True,
     publication_date: Optional[str] = None,
-    advisory_keywords: Optional[str] = None,
+    advisory_keywords: Set[str] = set(),
     modified_files: Optional[str] = None,
 ) -> AdvisoryRecord:

@@ -206,8 +202,8 @@ def build_advisory_record(
             isoparse(publication_date).timestamp()
         )

-    if advisory_keywords and len(advisory_keywords) > 0:
-        advisory_record.keywords.update(set(advisory_keywords.split(",")))
+    if len(advisory_keywords) > 0:
+        advisory_record.keywords = advisory_keywords

     if modified_files and len(modified_files) > 0:
         advisory_record.files.update(set(modified_files.split(",")))
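
With this change, advisory_keywords arrives as an already-built Set[str] and replaces the record's keywords outright, while modified_files is still a comma-separated string. A minimal sketch of the new keyword handling; apply_keywords is a hypothetical stand-alone helper, not part of the commit:

    from typing import Set

    def apply_keywords(current: Set[str], advisory_keywords: Set[str]) -> Set[str]:
        # New behaviour: the caller passes a ready-made set, which replaces the
        # record's keywords; the old code split an "a,b,c" string and update()d the set.
        if len(advisory_keywords) > 0:
            return advisory_keywords
        return current

    print(apply_keywords(set(), {"mod_proxy_http", "dereference"}))
    # -> a set containing 'mod_proxy_http' and 'dereference'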

prospector/datamodel/commit.py

Lines changed: 6 additions & 8 deletions
@@ -8,7 +8,7 @@
     extract_jira_references,
 )
 from git.raw_commit import RawCommit
-from util.lsh import decode_minhash, encode_minhash
+from util.lsh import decode_minhash, encode_minhash, get_encoded_minhash


 class Commit(BaseModel):
@@ -62,11 +62,11 @@ def add_match(self, rule: Dict[str, Any]):
     def has_twin(self):
         return len(self.twins) > 0

-    def has_tag(self):
-        return self.tags[0] != ""
+    def has_tag(self, tag: str) -> bool:
+        return tag in self.tags

     def get_tag(self):
-        return self.tags[0]
+        return self.tags[0] if len(self.tags) else "no-tag"

     def compute_relevance(self):
         self.relevance = sum([rule.get("relevance") for rule in self.matched_rules])
@@ -128,18 +128,16 @@ def make_from_raw_commit(raw: RawCommit) -> Commit:
         timestamp=raw.get_timestamp(),
         changed_files=raw.get_changed_files(),
         message=raw.get_msg(),
-        twins=raw.get_twins(),
-        minhash=raw.get_minhash(),
     )

     # NOTE: all attributes that do not depend on a particular query
     # (e.g. do not depend on a particular Advisory Record)
     # should be computed here so that they can be stored in the db.
     # Space-efficiency is important.
+    commit.minhash = get_encoded_minhash(raw.get_msg(50))

     commit.diff, commit.hunks = raw.get_diff()
-    commit.tags.append(raw.get_tag())
-    # commit.tags = raw.get_tags()
+    commit.tags = raw.find_tags()
     commit.jira_refs = extract_jira_references(commit.repository, commit.message)
     commit.ghissue_refs = extract_ghissue_references(commit.repository, commit.message)
     commit.cve_refs = extract_cve_references(commit.message)
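
The tag helpers move from "first element" semantics to a membership test plus a safe fallback. A minimal stand-alone sketch of the new behaviour (illustrative class, not the full pydantic Commit model; the tag value is made up):

    from typing import List

    class TagsSketch:
        def __init__(self, tags: List[str]):
            self.tags = tags

        def has_tag(self, tag: str) -> bool:
            # membership test instead of comparing tags[0] against ""
            return tag in self.tags

        def get_tag(self) -> str:
            # safe fallback for commits that carry no tag at all
            return self.tags[0] if len(self.tags) else "no-tag"

    print(TagsSketch(["v2.4.46"]).has_tag("v2.4.46"))  # True
    print(TagsSketch([]).get_tag())                     # no-tag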

prospector/datamodel/nlp.py

Lines changed: 79 additions & 83 deletions
@@ -1,20 +1,40 @@
-import os
 import re
-from typing import Dict, List, Set
+from typing import Dict, List, Set, Tuple

-import requests
-
-# from util.http import extract_from_webpage, fetch_url, get_from_xml
 from spacy import load

 from datamodel.constants import RELEVANT_EXTENSIONS
-from util.http import get_from_xml
+from util.http import fetch_url, get_from_xml

-JIRA_ISSUE_URL = "https://issues.apache.org/jira/browse/"
-GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
+nlp = load("en_core_web_sm")


-nlp = load("en_core_web_sm")
+def get_names(text: str, exclude: str) -> List[str]:
+    """
+    Extract names from text
+    """
+    return [
+        token.text
+        for token in nlp(text)
+        if token.pos_ in ("PROPN", "NOUN")
+        and token.text.casefold() not in exclude
+        and token.is_alpha
+    ]
+
+
+def clean_string(text: str) -> str:
+    """
+    Remove all non-alphanumeric characters from a string
+    """
+    return " ".join(
+        set(
+            [
+                token.lemma_
+                for token in nlp(text)
+                if not token.is_punct and len(token.lemma_) > 2
+            ]
+        )
+    )


 def extract_words_from_text(text: str) -> List[str]:
@@ -23,7 +43,9 @@ def extract_words_from_text(text: str) -> List[str]:
     return [
         token.lemma_.casefold()
         for token in nlp(text)
-        if token.pos_ in ("NOUN", "VERB", "PROPN") and len(token.lemma_) > 3
+        if token.pos_ in ("NOUN", "VERB", "PROPN")
+        and len(token.lemma_) > 3
+        and token.lemma_.isalnum()
     ]


@@ -63,15 +85,19 @@ def extract_products(text: str) -> List[str]:

 def extract_affected_filenames(
     text: str, extensions: List[str] = RELEVANT_EXTENSIONS
-) -> Set[str]:
+) -> Tuple[Set[str], Set[str]]:
     files = set()
+    extension = set()
     for word in text.split():
-        res = word.strip("_,.:;-+!?()[]'\"")
-        res = extract_filename_from_path(res)
-        res = extract_filename(res, extensions)
-        if res:
-            files.add(res)
-    return files
+        res = re.sub(r"^[^a-z0-9]+|[^a-z0-9]+$", "", word, flags=re.IGNORECASE)
+        res = re.split(r"[\\\/]", res)[-1]
+        res, ext = extract_filename(res, extensions)
+        if len(res) > 0:
+            files.update(res)
+        if ext is not None:
+            extension.add(ext)
+
+    return files, extension


 # TODO: enhanche this
@@ -80,34 +106,23 @@ def extract_filename_from_path(text: str) -> str:
     return text.split("/")[-1]


-def extract_filename(text: str, relevant_extensions: List[str]) -> str:
+def extract_filename(text: str, relevant_extensions: List[str]) -> List[str]:
     # Covers cases file.extension if extension is relevant, extensions come from CLI parameter
-    extensions_regex = r"^(?:^|\s?)([\w\-]{2,}\.(?:%s))(?:$|\s|\.|,|:)" % "|".join(
-        relevant_extensions
-    )
-
-    res = re.search(extensions_regex, text)
-    if res:
-        return res.group(1)
-
-    # Covers cases like: class::method, class.method,
-    # TODO: in nebula is getting the e from e.g.
-    res = re.search(
-        r"^(\w{2,})(?:\.|:{2})(\w+)$", text
-    )  # ^(\w{2,})(?:\.|:{2})(\w{2,})$
-    # Check if it is not a number
-    if res and not bool(re.match(r"^\d+$", res.group(1))):
-        return res.group(1)
-
-    # className or class_name (normal string with underscore)
-    # TODO: ShenYu and words
-    # like this should be excluded...
-    # TODO: filter for not present in url
-    #
-    if bool(re.search(r"[a-z]{2,}[A-Z]+[a-z]*", text)) or "_" in text:
-        return text
-
-    return None
+    res = re.search(r"(?:(\w{2,})\.)+(\w+)", text, flags=re.IGNORECASE)
+    if res is not None:
+        if res.group(2) in relevant_extensions:
+            return [res.group(1)], res.group(2)
+        elif not res.group(2).isdigit():
+            return [res.group(2), res.group(1)], None
+
+    # This regex covers cases with various camelcase filenames and underscore, dash names
+    if bool(
+        re.search(
+            r"(?:[a-z]|[A-Z])[a-zA-Z]+[A-Z]\w*|(?:[a-zA-Z]{2,}[_-])+[a-zA-Z]{2,}", text
+        )
+    ):
+        return [text], None
+    return [], None


 def extract_ghissue_references(repository: str, text: str) -> Dict[str, str]:
@@ -116,21 +131,25 @@ def extract_ghissue_references(repository: str, text: str) -> Dict[str, str]:
     """
     refs = dict()

-    # /repos/{owner}/{repo}/issues/{issue_number}
-    headers = {
-        "Accept": "application/vnd.github+json",
-    }
-    if GITHUB_TOKEN:
-        headers.update({"Authorization": f"Bearer {GITHUB_TOKEN}"})
-
     for result in re.finditer(r"(?:#|gh-)(\d+)", text):
         id = result.group(1)
-        owner, repo = repository.split("/")[-2:]
-        url = f"https://api.github.com/repos/{owner}/{repo}/issues/{id}"
-        r = requests.get(url, headers=headers)
-        if r.status_code == 200:
-            data = r.json()
-            refs[id] = f"{data['title']} {data['body']}"
+        url = f"{repository}/issues/{id}"
+        content = fetch_url(url=url, extract_text=False)
+        gh_ref_data = content.find_all(
+            attrs={
+                "class": ["comment-body", "markdown-title"],
+            }
+        )
+        gh_ref_data.extend(
+            content.find_all(
+                attrs={
+                    "id": re.compile(r"ref-issue|ref-pullrequest|ref-commit"),
+                }
+            )
+        )
+        refs[id] = " ".join(
+            [" ".join(block.get_text().split()) for block in gh_ref_data]
+        )

     return refs

@@ -146,12 +165,9 @@ def extract_jira_references(repository: str, text: str) -> Dict[str, str]:

     for result in re.finditer(r"[A-Z]+-\d+", text):
         id = result.group()
-        issue_content = get_from_xml(id)
-        refs[id] = (
-            " ".join(re.findall(r"\w{3,}", issue_content))
-            if len(issue_content) > 0
-            else ""
-        )
+        if id.startswith("CVE-"):
+            continue
+        refs[id] = get_from_xml(id)

     return refs

@@ -172,23 +188,3 @@ def extract_references_keywords(text: str) -> List[str]:
         for result in re.finditer(r"[A-Z]{2,}-\d+|github\.com\/(?:\w+|\/)*", text)
         if "CVE" not in result.group(0)
     ]
-
-
-# def extract_special_terms(description: str) -> Set[str]:
-#     """
-#     Extract all words (space delimited) which presumably cannot be part of an natural language sentence.
-#     These are usually code fragments and names of code entities, or paths.
-#     """
-
-#     return set()
-#     # TODO replace this with NLP implementation
-#     # see, https://github.com/SAP/project-kb/issues/256#issuecomment-927639866
-#     # noinspection PyUnreachableCode
-#     result = []
-#     for word in description.split():
-#         no_punctation_word = word.rstrip(").,;:?!\"'").lstrip("(")
-#         contains_non_word_char = re.search(r"\W", no_punctation_word)
-#         contains_non_initial_upper_case = re.search(r"\B[A-Z]", no_punctation_word)
-#         if contains_non_initial_upper_case or contains_non_word_char:
-#             result.append(word)
-#         return tuple(result)
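
extract_affected_filenames now returns a (files, extensions) tuple instead of a single set, and AdvisoryRecord.analyze unpacks it (see the advisory.py diff above). A hedged usage example, assuming "java" is listed in RELEVANT_EXTENSIONS; the advisory text is made up:

    from datamodel.nlp import extract_affected_filenames

    files, extensions = extract_affected_filenames(
        "The flaw is in MultiPartParser.java and in mod_proxy_http"
    )
    # files      -> {"MultiPartParser", "mod_proxy_http"}
    # extensions -> {"java"}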

prospector/datamodel/nlp_test.py

Lines changed: 9 additions & 2 deletions
@@ -25,6 +25,8 @@ def test_extract_similar_words():

 ADVISORY_TEXT_5 = """A vulnerability in the JsonMapObjectReaderWriter of Apache CXF allows an attacker to submit malformed JSON to a web service, which results in the thread getting stuck in an infinite loop, consuming CPU indefinitely. This issue affects Apache CXF versions prior to 3.4.4; Apache CXF versions prior to 3.3.11."""

+ADVISORY_TEXT_6 = """Apache HTTP Server versions 2.4.41 to 2.4.46 mod_proxy_http can be made to crash (NULL pointer dereference) with specially crafted requests using both Content-Length and Transfer-Encoding headers, leading to a Denial of Service"""
+

 def test_extract_affected_filenames():
     result1 = extract_affected_filenames(ADVISORY_TEXT_1)
@@ -74,6 +76,11 @@ def test_extract_jira_references():


 def test_extract_gh_issues():
-    d = extract_ghissue_references("https://github.com/slackhq/nebula", "#310")
+    d = extract_ghissue_references("https://github.com/apache/commons-text", "#341")
     print(d)
-    pass
+    raise NotImplementedError
+
+
+def test_extract_filenames_single():
+    d = extract_affected_filenames(ADVISORY_TEXT_6)
+    raise Exception(d)

0 commit comments
