Skip to content

Commit 6f07393

Browse files
committed
Merge branch 'lauraschauer-evaluation-scripts'
2 parents 5bf5c71 + 09b92d9 commit 6f07393

30 files changed

+11560
-62
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@ prospector/test_report.json
6262
prospector/.idea/*
6363
prospector/*.html
6464
prospector/*.json
65-
prospector/evaluation
65+
prospector/evaluation/data/input/*
66+
prospector/evaluation/data/reports/*
67+
prospector/evaluation/config.yaml
6668
.DS_Store
6769
prospector/pipeline/reports/*

prospector/datamodel/advisory.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ def analyze(self):
108108

109109
# for k, v in self.references.items():
110110
# print(k, v)
111-
logger.debug("References: " + str(self.references))
111+
# logger.debug("References: " + str(self.references))
112112

113113
# TODO: I should extract interesting stuff from the references immediately and maintain them just for a fast lookup
114114
logger.debug(f"Relevant references: {len(self.references)}")
@@ -210,6 +210,7 @@ def get_commits_in_advisory_references(self) -> List[str]:
210210
}
211211
limit += 1
212212

213+
# Filter out references that are not commit hashes, e.g. commit::master
213214
return [
214215
ref.split("::")[1]
215216
for ref in self.references

prospector/datamodel/nlp.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -50,10 +50,14 @@ def extract_words_from_text(text: str) -> List[str]:
5050
]
5151

5252

53-
def find_similar_words(adv_words: Set[str], commit_msg: str, exclude: str) -> Set[str]:
53+
def find_similar_words(
54+
adv_words: Set[str], commit_msg: str, exclude: str
55+
) -> Set[str]:
5456
"""Extract nouns from commit message that appears in the advisory text"""
5557
commit_words = {
56-
word for word in extract_words_from_text(commit_msg) if word not in exclude
58+
word
59+
for word in extract_words_from_text(commit_msg)
60+
if word not in exclude
5761
}
5862
return commit_words.intersection(adv_words)
5963
# return [word for word in extract_words_from_text(commit_msg) if word in adv_words]
@@ -63,7 +67,9 @@ def extract_versions(text: str) -> List[str]:
6367
"""
6468
Extract all versions mentioned in the text
6569
"""
66-
return list(set(re.findall(r"(\d+(?:\.\d+)+)", text))) # Should be more accurate
70+
return list(
71+
set(re.findall(r"(\d+(?:\.\d+)+)", text))
72+
) # Should be more accurate
6773
# return re.findall(r"[0-9]+\.[0-9]+[0-9a-z.]*", text)
6874

6975

@@ -134,7 +140,8 @@ def extract_filename(text: str, relevant_extensions: List[str]) -> List[str]:
134140
# This regex covers cases with various camelcase filenames and underscore, dash names
135141
if bool(
136142
re.search(
137-
r"(?:[a-z]|[A-Z])[a-zA-Z]+[A-Z]\w*|(?:[a-zA-Z]{2,}[_-])+[a-zA-Z]{2,}", text
143+
r"(?:[a-z]|[A-Z])[a-zA-Z]+[A-Z]\w*|(?:[a-zA-Z]{2,}[_-])+[a-zA-Z]{2,}",
144+
text,
138145
)
139146
):
140147
return [text], None
@@ -195,7 +202,12 @@ def extract_cve_references(text: str) -> List[str]:
195202
Extract CVE identifiers
196203
"""
197204
return list(
198-
set([result.group(0) for result in re.finditer(r"CVE-\d{4}-\d{4,8}", text)])
205+
set(
206+
[
207+
result.group(0)
208+
for result in re.finditer(r"CVE-\d{4}-\d{4,8}", text)
209+
]
210+
)
199211
)
200212

201213

prospector/docker-compose.yml

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,20 +16,16 @@ services:
1616
GIT_CACHE: /tmp/gitcache
1717
CVE_DATA_PATH: /app/cve_data
1818
REDIS_URL: redis://redis:6379/0
19-
#POSTGRES_HOST: db
20-
#POSTGRES_PORT: 5432
21-
#POSTGRES_USER: postgres
22-
#POSTGRES_PASSWORD: example
23-
#POSTGRES_DBNAME: postgres
24-
#NVD_API_KEY: ${NVD_API_KEY}
2519

2620
worker:
2721
build:
2822
context: .
2923
dockerfile: docker/worker/Dockerfile
3024
volumes:
3125
- ./:/app
32-
# - ./pipeline/reports:/app/pipeline/reports
26+
- ./data_sources/reports:/app/data_sources/reports
27+
- ./evaluation/data/reports/:/app/evaluation/data/reports
28+
- ./../../../data/gitcache:/tmp/gitcache
3329
depends_on:
3430
- redis
3531
environment:

prospector/docker/Dockerfile

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,15 @@
11
FROM python:3.10-slim
22

3-
COPY . /app
3+
RUN mkdir -p /app
4+
COPY ./requirements.txt /app/
45
WORKDIR /app
6+
# Create log files with permissions for host user
7+
RUN touch evaluation.log
8+
RUN touch prospector.log
9+
RUN chown ${UID}:${GID} evaluation.log
10+
RUN chown ${UID}:${GID} prospector.log
11+
12+
# Install dependencies with pip
513
RUN pip install --upgrade pip
614
RUN apt update && apt install -y --no-install-recommends gcc g++ libffi-dev python3-dev libpq-dev git curl
715
RUN pip install --no-cache-dir -r requirements.txt

prospector/docker/worker/etc_supervisor_confd_rqworker.conf.j2

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ command=/usr/local/bin/python3 /usr/local/bin/rq worker {{env['RQ_QUEUE']}} -u r
88
process_name=%(program_name)s%(process_num)01d
99

1010
; If you want to run more than one worker instance, increase this
11-
numprocs=2
11+
numprocs=10
1212
redirect_stderr=true
1313

1414
; This is the directory from which RQ is run. Be sure to point this to the

prospector/evaluation/README.md

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# Evaluate Prospector
2+
3+
This folder contains the scripts used for evaluating Prospector's reports and data needed for it (created and used in Summer 2024). The folder is structured as follows:
4+
5+
1. **Data** folder: contains input data, Prospector reports and results of the analysis of the Prospector reports.
6+
2. **Scripts**: The scripts used for running Prospector on a batch of CVEs, and for analysing the created reports.
7+
8+
Prospector is run in the following way in this evaluation:
9+
10+
First, the five docker containers must be started with `make docker-setup` or manually with `docker` commands. Once they are running, `docker ps` should show the following:
11+
12+
```bash
13+
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
14+
c73aed108475 prospector_backend "python ./service/ma…" 47 minutes ago Up 47 minutes 0.0.0.0:8000->8000/tcp, :::8000->8000/tcp prospector_backend_1
15+
2e9da86b09a8 prospector_worker "/usr/local/bin/star…" 47 minutes ago Up 47 minutes prospector_worker_1
16+
b219fd6219ed adminer "entrypoint.sh php -…" 47 minutes ago Up 47 minutes 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp prospector_adminer_1
17+
9aacdc04f7c5 postgres "docker-entrypoint.s…" 47 minutes ago Up 47 minutes 0.0.0.0:5432->5432/tcp, :::5432->5432/tcp db
18+
7c540450ab76 redis:alpine "docker-entrypoint.s…" 47 minutes ago Up 47 minutes 0.0.0.0:6379->6379/tcp, :::6379->6379/tcp prospector_redis_1
19+
```
20+
21+
[`dispatch_jobs.py`](#running-prospector-on-multiple-cves-dispatch_jobspy) creates jobs with the `prospector()` function in them and enqueues
22+
them in a Redis Queue, from which the `prospector_worker` container fetches jobs and executes them. To visualise what is going on, run
23+
`docker attach prospector_worker_1` to see the usual console output. In order to change something inside the container, run `docker exec -it prospector_worker_1 bash` to open an interactive bash shell.
24+
25+
You can set the number of workers in `docker/worker/etc_supervisor_confd_rqworker.conf.j2`.
26+
27+
## Configuration File
28+
29+
The configuration file has two parts to it: a main part and a Prospector settings part, which is a copy of a part of the original Prospector `config.yaml` file.
30+
31+
The main part at the top allows you to set the path to where the input data can be found, where Prospector reports should be saved to and where analysis results should be saved to.
32+
33+
The Prospector part allows you to set the settings for Prospector (independent from the Prospector settings used when running Prospector with `./run_prospector`). **Watch out**: Since the `prospector_worker` container is created in the beginning with the current state of the `config.yaml`, simply saving any changes in `config.yaml` and dispatching new jobs will still run them with the old configuration. For new configuration parameters to take effect, either destroy the containers with `make docker-clean` and rebuild them with `make docker-setup` or open an interactive shell to the container and make your changes to the code in there.
34+
35+
## Script Files explained
36+
37+
### Running Prospector on multiple CVEs (`dispatch_jobs.py`)
38+
39+
The code for running Prospector is in `dispatch_jobs.py`. It extracts CVE IDs from the data given in the path constructed as: `input_data_path` + the `-i` command-line parameter. It then dispatches a job for each CVE ID to the queue, from where these jobs get executed. The path to the input file is split into two components (`input_data_path` in `config.yaml` and the `-i` parameter) because you might have one folder in which you have several different input data files of the same format. This keeps you from typing the full path, but still allows you to switch between the files between different runs.
40+
41+
The reports are generated in the worker container, and saved to `prospector_reports_path_container`. This folder is mounted into the container, so you can see any newly generated reports in the same folder on the host.
42+
43+
Do not confuse this parameter with `prospector_reports_path_host`, which sets the path to a batch of reports on the host (used for analysis). Your workflow should be as follows:
44+
45+
1. Dispatch reports
46+
2. When the report generation has finished, move the reports to any other folder (preferably outside of the `prospector/` folder to keep the build context for the container from getting too big).
47+
3. Analyse the reports by setting the `prospector_reports_path_host` to the folder where you moved the reports to.
48+
49+
### Analysing the generated reports (`analyse.py`)
50+
51+
Start an analysis with
52+
53+
```bash
54+
python3 evaluation/main.py -i <your_input_data_csv_file> -a
55+
```
56+
57+
This will start the `analyse_prospector_reports()` function in `analyse.py`, which re-creates the summary execution table from [AssureMOSS D6.3](https://assuremoss.eu/en/resources/Deliverables/D6.3):
58+
![D6.3 Summary Execution Table](images/summary_execution_table.png)
59+
60+
It also creates a detailed JSON file (listing CVEs in each category) in `data/results/summary_execution/` to inspect which CVEs are in which category.

prospector/evaluation/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)