@@ -1,20 +1,40 @@
-import os
 import re
-from typing import Dict, List, Set
+from typing import Dict, List, Set, Tuple
 
-import requests
-
-# from util.http import extract_from_webpage, fetch_url, get_from_xml
 from spacy import load
 
 from datamodel.constants import RELEVANT_EXTENSIONS
-from util.http import get_from_xml
+from util.http import fetch_url, get_from_xml
 
-JIRA_ISSUE_URL = "https://issues.apache.org/jira/browse/"
-GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
+nlp = load("en_core_web_sm")
 
 
-nlp = load("en_core_web_sm")
+def get_names(text: str, exclude: str) -> List[str]:
+    """
+    Extract names from text
+    """
+    return [
+        token.text
+        for token in nlp(text)
+        if token.pos_ in ("PROPN", "NOUN")
+        and token.text.casefold() not in exclude
+        and token.is_alpha
+    ]
+
+
+def clean_string(text: str) -> str:
+    """
+    Remove all non-alphanumeric characters from a string
+    """
+    return " ".join(
+        set(
+            [
+                token.lemma_
+                for token in nlp(text)
+                if not token.is_punct and len(token.lemma_) > 2
+            ]
+        )
+    )
 
 
 def extract_words_from_text(text: str) -> List[str]:
@@ -23,7 +43,9 @@ def extract_words_from_text(text: str) -> List[str]:
     return [
         token.lemma_.casefold()
         for token in nlp(text)
-        if token.pos_ in ("NOUN", "VERB", "PROPN") and len(token.lemma_) > 3
+        if token.pos_ in ("NOUN", "VERB", "PROPN")
+        and len(token.lemma_) > 3
+        and token.lemma_.isalnum()
     ]
 
 
@@ -63,15 +85,19 @@ def extract_products(text: str) -> List[str]:
 
 def extract_affected_filenames(
     text: str, extensions: List[str] = RELEVANT_EXTENSIONS
-) -> Set[str]:
+) -> Tuple[Set[str], Set[str]]:
     files = set()
+    extension = set()
     for word in text.split():
-        res = word.strip("_,.:;-+!?()[]'\" ")
-        res = extract_filename_from_path(res)
-        res = extract_filename(res, extensions)
-        if res:
-            files.add(res)
-    return files
+        res = re.sub(r"^[^a-z0-9]+|[^a-z0-9]+$", "", word, flags=re.IGNORECASE)
+        res = re.split(r"[\\\/]", res)[-1]
+        res, ext = extract_filename(res, extensions)
+        if len(res) > 0:
+            files.update(res)
+        if ext is not None:
+            extension.add(ext)
+
+    return files, extension
 
 
 # TODO: enhanche this
@@ -80,34 +106,23 @@ def extract_filename_from_path(text: str) -> str:
     return text.split("/")[-1]
 
 
-def extract_filename(text: str, relevant_extensions: List[str]) -> str:
+def extract_filename(text: str, relevant_extensions: List[str]) -> List[str]:
     # Covers cases file.extension if extension is relevant, extensions come from CLI parameter
-    extensions_regex = r"^(?:^|\s?)([\w\-]{2,}\.(?:%s))(?:$|\s|\.|,|:)" % "|".join(
-        relevant_extensions
-    )
-
-    res = re.search(extensions_regex, text)
-    if res:
-        return res.group(1)
-
-    # Covers cases like: class::method, class.method,
-    # TODO: in nebula is getting the e from e.g.
-    res = re.search(
-        r"^(\w{2,})(?:\.|:{2})(\w+)$", text
-    )  # ^(\w{2,})(?:\.|:{2})(\w{2,})$
-    # Check if it is not a number
-    if res and not bool(re.match(r"^\d+$", res.group(1))):
-        return res.group(1)
-
-    # className or class_name (normal string with underscore)
-    # TODO: ShenYu and words
-    # like this should be excluded...
-    # TODO: filter for not present in url
-    #
-    if bool(re.search(r"[a-z]{2,}[A-Z]+[a-z]*", text)) or "_" in text:
-        return text
-
-    return None
+    res = re.search(r"(?:(\w{2,})\.)+(\w+)", text, flags=re.IGNORECASE)
+    if res is not None:
+        if res.group(2) in relevant_extensions:
+            return [res.group(1)], res.group(2)
+        elif not res.group(2).isdigit():
+            return [res.group(2), res.group(1)], None
+
+    # This regex covers cases with various camelcase filenames and underscore, dash names
+    if bool(
+        re.search(
+            r"(?:[a-z]|[A-Z])[a-zA-Z]+[A-Z]\w*|(?:[a-zA-Z]{2,}[_-])+[a-zA-Z]{2,}", text
+        )
+    ):
+        return [text], None
+    return [], None
 
 
 def extract_ghissue_references(repository: str, text: str) -> Dict[str, str]:
@@ -116,21 +131,25 @@ def extract_ghissue_references(repository: str, text: str) -> Dict[str, str]:
     """
     refs = dict()
 
-    # /repos/{owner}/{repo}/issues/{issue_number}
-    headers = {
-        "Accept": "application/vnd.github+json",
-    }
-    if GITHUB_TOKEN:
-        headers.update({"Authorization": f"Bearer {GITHUB_TOKEN}"})
-
     for result in re.finditer(r"(?:#|gh-)(\d+)", text):
         id = result.group(1)
-        owner, repo = repository.split("/")[-2:]
-        url = f"https://api.github.com/repos/{owner}/{repo}/issues/{id}"
-        r = requests.get(url, headers=headers)
-        if r.status_code == 200:
-            data = r.json()
-            refs[id] = f"{data['title']} {data['body']}"
+        url = f"{repository}/issues/{id}"
+        content = fetch_url(url=url, extract_text=False)
+        gh_ref_data = content.find_all(
+            attrs={
+                "class": ["comment-body", "markdown-title"],
+            }
+        )
+        gh_ref_data.extend(
+            content.find_all(
+                attrs={
+                    "id": re.compile(r"ref-issue|ref-pullrequest|ref-commit"),
+                }
+            )
+        )
+        refs[id] = " ".join(
+            [" ".join(block.get_text().split()) for block in gh_ref_data]
+        )
 
     return refs
 
@@ -146,12 +165,9 @@ def extract_jira_references(repository: str, text: str) -> Dict[str, str]:
 
     for result in re.finditer(r"[A-Z]+-\d+", text):
         id = result.group()
-        issue_content = get_from_xml(id)
-        refs[id] = (
-            " ".join(re.findall(r"\w{3,}", issue_content))
-            if len(issue_content) > 0
-            else ""
-        )
+        if id.startswith("CVE-"):
+            continue
+        refs[id] = get_from_xml(id)
 
     return refs
 
@@ -172,23 +188,3 @@ def extract_references_keywords(text: str) -> List[str]:
         for result in re.finditer(r"[A-Z]{2,}-\d+|github\.com\/(?:\w+|\/)*", text)
         if "CVE" not in result.group(0)
     ]
-
-
-# def extract_special_terms(description: str) -> Set[str]:
-#     """
-#     Extract all words (space delimited) which presumably cannot be part of an natural language sentence.
-#     These are usually code fragments and names of code entities, or paths.
-#     """
-
-#     return set()
-#     # TODO replace this with NLP implementation
-#     # see, https://github.com/SAP/project-kb/issues/256#issuecomment-927639866
-#     # noinspection PyUnreachableCode
-#     result = []
-#     for word in description.split():
-#         no_punctation_word = word.rstrip(").,;:?!\"'").lstrip("(")
-#         contains_non_word_char = re.search(r"\W", no_punctation_word)
-#         contains_non_initial_upper_case = re.search(r"\B[A-Z]", no_punctation_word)
-#         if contains_non_initial_upper_case or contains_non_word_char:
-#             result.append(word)
-#     return tuple(result)
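
Below is a minimal usage sketch (not part of the diff) of the reworked filename helpers; the util.nlp import path and the sample strings are assumptions made only for illustration.

# Usage sketch only -- the import path "util.nlp" and the sample text are assumptions.
from util.nlp import extract_affected_filenames, extract_filename

text = "Fix crash in FileUploadHandler.java and in the parse_config helper"

# extract_affected_filenames() now returns a (filenames, extensions) pair of sets.
files, extensions = extract_affected_filenames(text, extensions=["java", "py"])
print(files)       # e.g. {"FileUploadHandler", "parse_config"}
print(extensions)  # e.g. {"java"}

# extract_filename() now returns a (candidate names, extension or None) pair.
print(extract_filename("FileUploadHandler.java", ["java", "py"]))  # (["FileUploadHandler"], "java")
print(extract_filename("parse_config", ["java", "py"]))            # (["parse_config"], None)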
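
A similarly hypothetical sketch of the updated reference extractors: extract_ghissue_references now fetches and scrapes the issue web page via util.http.fetch_url instead of calling the GitHub REST API, and extract_jira_references skips CVE-style matches. The repository URL, issue ids, and import path below are illustrative assumptions.

# Usage sketch only -- repository URL, ids, and import path are illustrative assumptions.
from util.nlp import extract_ghissue_references, extract_jira_references

msg = "Fix XXE in the parser (#1234); see also PDFBOX-5342 and CVE-2021-27807"

# The issue page HTML is fetched and scraped, so the repository argument is the
# repository's web URL rather than an API endpoint.
gh_refs = extract_ghissue_references("https://github.com/apache/pdfbox", msg)
# -> {"1234": "<issue title, comment bodies and cross-referenced items>"}

# CVE-style matches are skipped; remaining ids map to the raw get_from_xml() text.
jira_refs = extract_jira_references("https://github.com/apache/pdfbox", msg)
# -> {"PDFBOX-5342": "<issue content from get_from_xml>"}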