Skip to content

Commit 49bfe5d

Browse files
sacca97copernico
authored and committed
small fixes and improvement in nlp code
1 parent 72e5ff0 commit 49bfe5d

File tree

1 file changed

+43
-34
lines changed

1 file changed

+43
-34
lines changed

prospector/datamodel/nlp.py

Lines changed: 43 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -17,36 +17,14 @@
1717
nlp = load("en_core_web_sm")
1818

1919

20-
def extract_special_terms(description: str) -> Set[str]:
21-
"""
22-
Extract all words (space delimited) which presumably cannot be part of an natural language sentence.
23-
These are usually code fragments and names of code entities, or paths.
24-
"""
25-
26-
return set()
27-
# TODO replace this with NLP implementation
28-
# see, https://github.com/SAP/project-kb/issues/256#issuecomment-927639866
29-
# noinspection PyUnreachableCode
30-
result = []
31-
for word in description.split():
32-
no_punctation_word = word.rstrip(").,;:?!\"'").lstrip("(")
33-
contains_non_word_char = re.search(r"\W", no_punctation_word)
34-
contains_non_initial_upper_case = re.search(r"\B[A-Z]", no_punctation_word)
35-
if contains_non_initial_upper_case or contains_non_word_char:
36-
result.append(word)
37-
return tuple(result)
38-
39-
40-
def extract_words_from_text(text: str) -> Set[str]:
20+
def extract_words_from_text(text: str) -> List[str]:
4121
"""Use spacy to extract "relevant words" from text"""
4222
# Lemmatization
43-
return set(
44-
[
45-
token.lemma_.casefold()
46-
for token in nlp(text)
47-
if token.pos_ in ("NOUN", "VERB", "PROPN") and len(token.lemma_) > 3
48-
]
49-
)
23+
return [
24+
token.lemma_.casefold()
25+
for token in nlp(text)
26+
if token.pos_ in ("NOUN", "VERB", "PROPN") and len(token.lemma_) > 3
27+
]
5028

5129

5230
def find_similar_words(adv_words: Set[str], commit_msg: str, exclude: str) -> Set[str]:
@@ -70,10 +48,17 @@ def extract_products(text: str) -> List[str]:
7048
"""
7149
Extract product names from advisory text
7250
"""
73-
# TODO implement this properly
74-
regex = r"([A-Z]+[a-z\b]+)"
75-
result = set(re.findall(regex, text))
76-
return [p for p in result if len(p) > 2]
51+
return list(
52+
set(
53+
[
54+
token.text
55+
for token in nlp(text)
56+
if token.pos_ in ("PROPN")
57+
and token.text.isalpha()
58+
and len(token.text) > 2
59+
]
60+
) # "NOUN",
61+
)
7762

7863

7964
def extract_affected_filenames(
@@ -107,14 +92,18 @@ def extract_filename(text: str, relevant_extensions: List[str]) -> str:
10792

10893
# Covers cases like: class::method, class.method,
10994
# TODO: in nebula is getting the e from e.g.
110-
res = re.search(r"^(\w+)(?:\.|:{2})(\w+)$", text) # ^(\w{2,})(?:\.|:{2})(\w{2,})$
95+
res = re.search(
96+
r"^(\w{2,})(?:\.|:{2})(\w+)$", text
97+
) # ^(\w{2,})(?:\.|:{2})(\w{2,})$
11198
# Check if it is not a number
11299
if res and not bool(re.match(r"^\d+$", res.group(1))):
113-
return res.group(1) if len(res.group(1)) > 3 else None
100+
return res.group(1)
114101

115102
# className or class_name (normal string with underscore)
116103
# TODO: ShenYu and words
117104
# like this should be excluded...
105+
# TODO: filter for not present in url
106+
#
118107
if bool(re.search(r"[a-z]{2,}[A-Z]+[a-z]*", text)) or "_" in text:
119108
return text
120109

@@ -183,3 +172,23 @@ def extract_references_keywords(text: str) -> List[str]:
183172
for result in re.finditer(r"[A-Z]{2,}-\d+|github\.com\/(?:\w+|\/)*", text)
184173
if "CVE" not in result.group(0)
185174
]
175+
176+
177+
# def extract_special_terms(description: str) -> Set[str]:
178+
# """
179+
# Extract all words (space delimited) which presumably cannot be part of an natural language sentence.
180+
# These are usually code fragments and names of code entities, or paths.
181+
# """
182+
183+
# return set()
184+
# # TODO replace this with NLP implementation
185+
# # see, https://github.com/SAP/project-kb/issues/256#issuecomment-927639866
186+
# # noinspection PyUnreachableCode
187+
# result = []
188+
# for word in description.split():
189+
# no_punctation_word = word.rstrip(").,;:?!\"'").lstrip("(")
190+
# contains_non_word_char = re.search(r"\W", no_punctation_word)
191+
# contains_non_initial_upper_case = re.search(r"\B[A-Z]", no_punctation_word)
192+
# if contains_non_initial_upper_case or contains_non_word_char:
193+
# result.append(word)
194+
# return tuple(result)

0 commit comments

Comments (0)