
Commit 4ff15de

sacca97 authored and copernico committed
prepared new rules, cleaned Git and raw_commit code, optimized NLP code, set up real filtering of commits, optimized version_to_tag matches, added various security keywords, deleted some old code
1 parent 279a999 commit 4ff15de

File tree

15 files changed: +408 -668 lines changed


prospector/datamodel/advisory.py

Lines changed: 8 additions & 12 deletions
@@ -64,6 +64,7 @@ def __init__(
         versions: Dict[str, List[str]] = None,
         files: Set[str] = None,
         keywords: Set[str] = None,
+        files_extensions: Set[str] = None,
     ):
         self.cve_id = cve_id
         self.description = description
@@ -75,6 +76,7 @@ def __init__(
         self.versions = versions or dict()
         self.files = files or set()
         self.keywords = keywords or set()
+        self.files_extension = files_extensions or set()

     def analyze(
         self,
@@ -89,8 +91,10 @@ def analyze(
         self.affected_products.extend(extract_products(self.description))
         self.affected_products = list(set(self.affected_products))

+        files, extension = extract_affected_filenames(self.description)
+        self.files_extension = extension
         # TODO: this could be done on the words extracted from the description
-        self.files.update(extract_affected_filenames(self.description))
+        self.files.update(files)

         self.keywords.update(set(extract_words_from_text(self.description)))

@@ -135,14 +139,6 @@ def parse_advisory(self, data):
         ]
         self.versions["fixed"] = [v for v in self.versions["fixed"] if v is not None]

-        # [
-        #     (
-        #         item.get("versionEndIncluding"),  # item.get("versionStartExcluding")
-        #         item.get("versionEndExcluding"),  # , item.get("versionEndIncluding")
-        #     )
-        #     for item in data["configurations"][0]["nodes"][0]["cpeMatch"]
-        # ]
-

 def get_from_nvd(cve_id: str):
     """Get an advisory from the NVD dtabase"""
@@ -182,7 +178,7 @@ def build_advisory_record(
     fetch_references: bool = False,
     use_nvd: bool = True,
     publication_date: Optional[str] = None,
-    advisory_keywords: Optional[str] = None,
+    advisory_keywords: Set[str] = set(),
     modified_files: Optional[str] = None,
 ) -> AdvisoryRecord:

@@ -206,8 +202,8 @@ def build_advisory_record(
             isoparse(publication_date).timestamp()
         )

-    if advisory_keywords and len(advisory_keywords) > 0:
-        advisory_record.keywords.update(set(advisory_keywords.split(",")))
+    if len(advisory_keywords) > 0:
+        advisory_record.keywords = advisory_keywords

     if modified_files and len(modified_files) > 0:
         advisory_record.files.update(set(modified_files.split(",")))
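
With this change, advisory_keywords arrives as an already-built Set[str] and replaces the record's keywords outright, while modified_files is still a comma-separated string. A minimal sketch of the new keyword handling; apply_keywords is a hypothetical stand-alone helper, not part of the commit:

    from typing import Set

    def apply_keywords(current: Set[str], advisory_keywords: Set[str]) -> Set[str]:
        # New behaviour: the caller passes a ready-made set, which replaces the
        # record's keywords; the old code split an "a,b,c" string and update()d the set.
        if len(advisory_keywords) > 0:
            return advisory_keywords
        return current

    print(apply_keywords(set(), {"mod_proxy_http", "dereference"}))
    # -> a set containing 'mod_proxy_http' and 'dereference'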

prospector/datamodel/commit.py

Lines changed: 6 additions & 8 deletions
@@ -8,7 +8,7 @@
     extract_jira_references,
 )
 from git.raw_commit import RawCommit
-from util.lsh import decode_minhash, encode_minhash
+from util.lsh import decode_minhash, encode_minhash, get_encoded_minhash


 class Commit(BaseModel):
@@ -62,11 +62,11 @@ def add_match(self, rule: Dict[str, Any]):
     def has_twin(self):
         return len(self.twins) > 0

-    def has_tag(self):
-        return self.tags[0] != ""
+    def has_tag(self, tag: str) -> bool:
+        return tag in self.tags

     def get_tag(self):
-        return self.tags[0]
+        return self.tags[0] if len(self.tags) else "no-tag"

     def compute_relevance(self):
         self.relevance = sum([rule.get("relevance") for rule in self.matched_rules])
@@ -128,18 +128,16 @@ def make_from_raw_commit(raw: RawCommit) -> Commit:
         timestamp=raw.get_timestamp(),
         changed_files=raw.get_changed_files(),
         message=raw.get_msg(),
-        twins=raw.get_twins(),
-        minhash=raw.get_minhash(),
     )

     # NOTE: all attributes that do not depend on a particular query
     # (e.g. do not depend on a particular Advisory Record)
     # should be computed here so that they can be stored in the db.
     # Space-efficiency is important.
+    commit.minhash = get_encoded_minhash(raw.get_msg(50))

     commit.diff, commit.hunks = raw.get_diff()
-    commit.tags.append(raw.get_tag())
-    # commit.tags = raw.get_tags()
+    commit.tags = raw.find_tags()
     commit.jira_refs = extract_jira_references(commit.repository, commit.message)
     commit.ghissue_refs = extract_ghissue_references(commit.repository, commit.message)
     commit.cve_refs = extract_cve_references(commit.message)
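
The tag helpers move from "first element" semantics to a membership test plus a safe fallback. A minimal stand-alone sketch of the new behaviour (illustrative class, not the full pydantic Commit model; the tag value is made up):

    from typing import List

    class TagsSketch:
        def __init__(self, tags: List[str]):
            self.tags = tags

        def has_tag(self, tag: str) -> bool:
            # membership test instead of comparing tags[0] against ""
            return tag in self.tags

        def get_tag(self) -> str:
            # safe fallback for commits that carry no tag at all
            return self.tags[0] if len(self.tags) else "no-tag"

    print(TagsSketch(["v2.4.46"]).has_tag("v2.4.46"))  # True
    print(TagsSketch([]).get_tag())                     # no-tag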

prospector/datamodel/nlp.py

Lines changed: 79 additions & 83 deletions
@@ -1,20 +1,40 @@
-import os
 import re
-from typing import Dict, List, Set
+from typing import Dict, List, Set, Tuple

-import requests
-
-# from util.http import extract_from_webpage, fetch_url, get_from_xml
 from spacy import load

 from datamodel.constants import RELEVANT_EXTENSIONS
-from util.http import get_from_xml
+from util.http import fetch_url, get_from_xml

-JIRA_ISSUE_URL = "https://issues.apache.org/jira/browse/"
-GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
+nlp = load("en_core_web_sm")


-nlp = load("en_core_web_sm")
+def get_names(text: str, exclude: str) -> List[str]:
+    """
+    Extract names from text
+    """
+    return [
+        token.text
+        for token in nlp(text)
+        if token.pos_ in ("PROPN", "NOUN")
+        and token.text.casefold() not in exclude
+        and token.is_alpha
+    ]
+
+
+def clean_string(text: str) -> str:
+    """
+    Remove all non-alphanumeric characters from a string
+    """
+    return " ".join(
+        set(
+            [
+                token.lemma_
+                for token in nlp(text)
+                if not token.is_punct and len(token.lemma_) > 2
+            ]
+        )
+    )


 def extract_words_from_text(text: str) -> List[str]:
@@ -23,7 +43,9 @@ def extract_words_from_text(text: str) -> List[str]:
     return [
         token.lemma_.casefold()
         for token in nlp(text)
-        if token.pos_ in ("NOUN", "VERB", "PROPN") and len(token.lemma_) > 3
+        if token.pos_ in ("NOUN", "VERB", "PROPN")
+        and len(token.lemma_) > 3
+        and token.lemma_.isalnum()
     ]


@@ -63,15 +85,19 @@ def extract_products(text: str) -> List[str]:

 def extract_affected_filenames(
     text: str, extensions: List[str] = RELEVANT_EXTENSIONS
-) -> Set[str]:
+) -> Tuple[Set[str], Set[str]]:
     files = set()
+    extension = set()
     for word in text.split():
-        res = word.strip("_,.:;-+!?()[]'\"")
-        res = extract_filename_from_path(res)
-        res = extract_filename(res, extensions)
-        if res:
-            files.add(res)
-    return files
+        res = re.sub(r"^[^a-z0-9]+|[^a-z0-9]+$", "", word, flags=re.IGNORECASE)
+        res = re.split(r"[\\\/]", res)[-1]
+        res, ext = extract_filename(res, extensions)
+        if len(res) > 0:
+            files.update(res)
+        if ext is not None:
+            extension.add(ext)
+
+    return files, extension


 # TODO: enhanche this
@@ -80,34 +106,23 @@ def extract_filename_from_path(text: str) -> str:
     return text.split("/")[-1]


-def extract_filename(text: str, relevant_extensions: List[str]) -> str:
+def extract_filename(text: str, relevant_extensions: List[str]) -> List[str]:
     # Covers cases file.extension if extension is relevant, extensions come from CLI parameter
-    extensions_regex = r"^(?:^|\s?)([\w\-]{2,}\.(?:%s))(?:$|\s|\.|,|:)" % "|".join(
-        relevant_extensions
-    )
-
-    res = re.search(extensions_regex, text)
-    if res:
-        return res.group(1)
-
-    # Covers cases like: class::method, class.method,
-    # TODO: in nebula is getting the e from e.g.
-    res = re.search(
-        r"^(\w{2,})(?:\.|:{2})(\w+)$", text
-    )  # ^(\w{2,})(?:\.|:{2})(\w{2,})$
-    # Check if it is not a number
-    if res and not bool(re.match(r"^\d+$", res.group(1))):
-        return res.group(1)
-
-    # className or class_name (normal string with underscore)
-    # TODO: ShenYu and words
-    # like this should be excluded...
-    # TODO: filter for not present in url
-    #
-    if bool(re.search(r"[a-z]{2,}[A-Z]+[a-z]*", text)) or "_" in text:
-        return text
-
-    return None
+    res = re.search(r"(?:(\w{2,})\.)+(\w+)", text, flags=re.IGNORECASE)
+    if res is not None:
+        if res.group(2) in relevant_extensions:
+            return [res.group(1)], res.group(2)
+        elif not res.group(2).isdigit():
+            return [res.group(2), res.group(1)], None
+
+    # This regex covers cases with various camelcase filenames and underscore, dash names
+    if bool(
+        re.search(
+            r"(?:[a-z]|[A-Z])[a-zA-Z]+[A-Z]\w*|(?:[a-zA-Z]{2,}[_-])+[a-zA-Z]{2,}", text
+        )
+    ):
+        return [text], None
+    return [], None


 def extract_ghissue_references(repository: str, text: str) -> Dict[str, str]:
@@ -116,21 +131,25 @@ def extract_ghissue_references(repository: str, text: str) -> Dict[str, str]:
     """
     refs = dict()

-    # /repos/{owner}/{repo}/issues/{issue_number}
-    headers = {
-        "Accept": "application/vnd.github+json",
-    }
-    if GITHUB_TOKEN:
-        headers.update({"Authorization": f"Bearer {GITHUB_TOKEN}"})
-
     for result in re.finditer(r"(?:#|gh-)(\d+)", text):
         id = result.group(1)
-        owner, repo = repository.split("/")[-2:]
-        url = f"https://api.github.com/repos/{owner}/{repo}/issues/{id}"
-        r = requests.get(url, headers=headers)
-        if r.status_code == 200:
-            data = r.json()
-            refs[id] = f"{data['title']} {data['body']}"
+        url = f"{repository}/issues/{id}"
+        content = fetch_url(url=url, extract_text=False)
+        gh_ref_data = content.find_all(
+            attrs={
+                "class": ["comment-body", "markdown-title"],
+            }
+        )
+        gh_ref_data.extend(
+            content.find_all(
+                attrs={
+                    "id": re.compile(r"ref-issue|ref-pullrequest|ref-commit"),
+                }
+            )
+        )
+        refs[id] = " ".join(
+            [" ".join(block.get_text().split()) for block in gh_ref_data]
+        )

     return refs

@@ -146,12 +165,9 @@ def extract_jira_references(repository: str, text: str) -> Dict[str, str]:

     for result in re.finditer(r"[A-Z]+-\d+", text):
         id = result.group()
-        issue_content = get_from_xml(id)
-        refs[id] = (
-            " ".join(re.findall(r"\w{3,}", issue_content))
-            if len(issue_content) > 0
-            else ""
-        )
+        if id.startswith("CVE-"):
+            continue
+        refs[id] = get_from_xml(id)

     return refs

@@ -172,23 +188,3 @@ def extract_references_keywords(text: str) -> List[str]:
         for result in re.finditer(r"[A-Z]{2,}-\d+|github\.com\/(?:\w+|\/)*", text)
         if "CVE" not in result.group(0)
     ]
-
-
-# def extract_special_terms(description: str) -> Set[str]:
-#     """
-#     Extract all words (space delimited) which presumably cannot be part of an natural language sentence.
-#     These are usually code fragments and names of code entities, or paths.
-#     """
-
-#     return set()
-#     # TODO replace this with NLP implementation
-#     # see, https://github.com/SAP/project-kb/issues/256#issuecomment-927639866
-#     # noinspection PyUnreachableCode
-#     result = []
-#     for word in description.split():
-#         no_punctation_word = word.rstrip(").,;:?!\"'").lstrip("(")
-#         contains_non_word_char = re.search(r"\W", no_punctation_word)
-#         contains_non_initial_upper_case = re.search(r"\B[A-Z]", no_punctation_word)
-#         if contains_non_initial_upper_case or contains_non_word_char:
-#             result.append(word)
-#         return tuple(result)
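
extract_affected_filenames now returns a (files, extensions) tuple instead of a single set, and AdvisoryRecord.analyze unpacks it (see the advisory.py diff above). A hedged usage example, assuming "java" is listed in RELEVANT_EXTENSIONS; the advisory text is made up:

    from datamodel.nlp import extract_affected_filenames

    files, extensions = extract_affected_filenames(
        "The flaw is in MultiPartParser.java and in mod_proxy_http"
    )
    # files      -> {"MultiPartParser", "mod_proxy_http"}
    # extensions -> {"java"}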

prospector/datamodel/nlp_test.py

Lines changed: 9 additions & 2 deletions
@@ -25,6 +25,8 @@ def test_extract_similar_words():

 ADVISORY_TEXT_5 = """A vulnerability in the JsonMapObjectReaderWriter of Apache CXF allows an attacker to submit malformed JSON to a web service, which results in the thread getting stuck in an infinite loop, consuming CPU indefinitely. This issue affects Apache CXF versions prior to 3.4.4; Apache CXF versions prior to 3.3.11."""

+ADVISORY_TEXT_6 = """Apache HTTP Server versions 2.4.41 to 2.4.46 mod_proxy_http can be made to crash (NULL pointer dereference) with specially crafted requests using both Content-Length and Transfer-Encoding headers, leading to a Denial of Service"""
+

 def test_extract_affected_filenames():
     result1 = extract_affected_filenames(ADVISORY_TEXT_1)
@@ -74,6 +76,11 @@ def test_extract_jira_references():


 def test_extract_gh_issues():
-    d = extract_ghissue_references("https://github.com/slackhq/nebula", "#310")
+    d = extract_ghissue_references("https://github.com/apache/commons-text", "#341")
     print(d)
-    pass
+    raise NotImplementedError
+
+
+def test_extract_filenames_single():
+    d = extract_affected_filenames(ADVISORY_TEXT_6)
+    raise Exception(d)

0 commit comments
