17
17
# Shared spaCy pipeline used by the extraction helpers below.
# NOTE(review): `load` is presumably `spacy.load` — confirm against this
# file's import block (not visible in this chunk).
nlp = load("en_core_web_sm")
18
18
19
19
20
def extract_words_from_text(text: str) -> List[str]:
    """Use spacy to extract "relevant words" from text"""
    # Lemmatization
    relevant_pos = ("NOUN", "VERB", "PROPN")
    words = []
    for token in nlp(text):
        lemma = token.lemma_
        # Length filter applies to the raw lemma (before casefolding),
        # matching the original behavior.
        if token.pos_ in relevant_pos and len(lemma) > 3:
            words.append(lemma.casefold())
    return words
50
28
51
29
52
30
def find_similar_words (adv_words : Set [str ], commit_msg : str , exclude : str ) -> Set [str ]:
def extract_products(text: str) -> List[str]:
    """
    Extract product names from advisory text.

    Runs the shared spaCy pipeline over *text* and keeps tokens tagged as
    proper nouns (PROPN) that are purely alphabetic and longer than two
    characters. Duplicates are removed; result order is unspecified
    (set-derived).
    """
    # BUGFIX: the previous code used `token.pos_ in ("PROPN")`, which is a
    # substring test against the string "PROPN" (parentheses alone do not
    # make a tuple) — e.g. an empty pos_ tag would spuriously match.
    # An explicit equality check expresses the intent.
    # TODO(review): consider whether "NOUN" should also be included,
    # as the original trailing comment suggested.
    return list(
        {
            token.text
            for token in nlp(text)
            if token.pos_ == "PROPN"
            and token.text.isalpha()
            and len(token.text) > 2
        }
    )
77
62
78
63
79
64
def extract_affected_filenames (
@@ -107,14 +92,18 @@ def extract_filename(text: str, relevant_extensions: List[str]) -> str:
107
92
108
93
# Covers cases like: class::method, class.method,
109
94
# TODO: in nebula is getting the e from e.g.
110
- res = re .search (r"^(\w+)(?:\.|:{2})(\w+)$" , text ) # ^(\w{2,})(?:\.|:{2})(\w{2,})$
95
+ res = re .search (
96
+ r"^(\w{2,})(?:\.|:{2})(\w+)$" , text
97
+ ) # ^(\w{2,})(?:\.|:{2})(\w{2,})$
111
98
# Check if it is not a number
112
99
if res and not bool (re .match (r"^\d+$" , res .group (1 ))):
113
- return res .group (1 ) if len ( res . group ( 1 )) > 3 else None
100
+ return res .group (1 )
114
101
115
102
# className or class_name (normal string with underscore)
116
103
# TODO: ShenYu and words
117
104
# like this should be excluded...
105
+ # TODO: filter for not present in url
106
+ #
118
107
if bool (re .search (r"[a-z]{2,}[A-Z]+[a-z]*" , text )) or "_" in text :
119
108
return text
120
109
@@ -183,3 +172,23 @@ def extract_references_keywords(text: str) -> List[str]:
183
172
for result in re .finditer (r"[A-Z]{2,}-\d+|github\.com\/(?:\w+|\/)*" , text )
184
173
if "CVE" not in result .group (0 )
185
174
]
175
+
176
+
177
+ # def extract_special_terms(description: str) -> Set[str]:
178
+ # """
179
+ # Extract all words (space delimited) which presumably cannot be part of a natural language sentence.
180
+ # These are usually code fragments and names of code entities, or paths.
181
+ # """
182
+
183
+ # return set()
184
+ # # TODO replace this with NLP implementation
185
+ # # see, https://github.com/SAP/project-kb/issues/256#issuecomment-927639866
186
+ # # noinspection PyUnreachableCode
187
+ # result = []
188
+ # for word in description.split():
189
+ # no_punctation_word = word.rstrip(").,;:?!\"'").lstrip("(")
190
+ # contains_non_word_char = re.search(r"\W", no_punctation_word)
191
+ # contains_non_initial_upper_case = re.search(r"\B[A-Z]", no_punctation_word)
192
+ # if contains_non_initial_upper_case or contains_non_word_char:
193
+ # result.append(word)
194
+ # return tuple(result)
0 commit comments