Skip to content

Commit 7284e34

Browse files
authored
Search twin commits and misc other improvements (#342)
- Implemented twin commits, closes #147 - Using BaseModel again (pydantic) - Fixed the logger, improved log aesthetics and readability - GitHub and Jira issues are collected using their respective APIs (GitHub token required via .env file) - Better word extraction and filtering based on tests conducted on the small dataset. - Report handlers now create nested folders - Updated to Python 3.10 - Skip GitHub when fetching references - Refactored git and raw_commit modules: all commit IDs, timestamps, parents, messages, and changed files are collected with a single call to the git executable - Refactored logging - Fixes: #339 #341 #334 #331 #326 #336
1 parent 881dd45 commit 7284e34

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

69 files changed

+2140
-2111
lines changed

.github/workflows/python.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,11 @@ jobs:
4141
# Maps tcp port 5432 on service container to the host
4242
- 5432:5432
4343
steps:
44-
- uses: actions/checkout@v2
45-
- name: Set up Python 3.8
46-
uses: actions/setup-python@v2
44+
- uses: actions/checkout@v3
45+
- name: Set up Python 3.10
46+
uses: actions/setup-python@v4
4747
with:
48-
python-version: 3.8
48+
python-version: 3.10.6
4949
- name: Setup virtual environment
5050
run: |
5151
python -m pip install --upgrade pip

.gitignore

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,10 @@ kaybee/pkged.go
3030
kaybeeconf.yaml
3131
prospector/.env
3232
prospector/workspace.code-workspace
33-
prospector/.env
3433
prospector/disabled_tests/skip_test-commits.db
3534
prospector/disabled_tests/skip_test-vulnerabilities.db
35+
prospector/results
36+
prospector/*.py
3637
prospector/.vscode/launch.json
3738
prospector/.vscode/settings.json
3839
prospector/install_fastext.sh
@@ -45,7 +46,8 @@ prospector/client/cli/cov_html/*
4546
prospector/client/web/node-app/node_modules
4647
prospector/.coverage.*
4748
prospector/.coverage
48-
**/cov_html/*
49+
**/cov_html
50+
prospector/cov_html
4951
.coverage
5052
prospector/prospector.code-workspace
5153
prospector/requests-cache.sqlite

prospector/.flake8

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[flake8]
2-
ignore = E203, E501, W503,F401,F403
2+
ignore = E203, E501, W503,F401,F403,W605
33
exclude =
44
# No need to traverse our git directory
55
.git,

prospector/Makefile

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ test:
1313

1414
setup: requirements.txt
1515
@echo "$(PROGRESS) Installing requirements"
16-
pip install -r requirements.txt
16+
@pip install -r requirements.txt
1717
@echo "$(DONE) Installed requirements"
1818
@echo "$(PROGRESS) Installing pre-commit and other modules"
1919
@pre-commit install
@@ -26,7 +26,7 @@ dev-setup: setup requirements-dev.txt
2626
@mkdir -p $(CVE_DATA_PATH)
2727
@echo "$(DONE) Created directory $(CVE_DATA_PATH)"
2828
@echo "$(PROGRESS) Installing development requirements"
29-
pip install -r requirements-dev.txt
29+
@pip install -r requirements-dev.txt
3030
@echo "$(DONE) Installed development requirements"
3131

3232
docker-setup:
@@ -56,7 +56,11 @@ select-run:
5656
python client/cli/main.py $(cve) --repository $(repository) --use-nvd
5757

5858
clean:
59-
rm prospector-report.html
60-
rm -f all.log* error.log*
61-
rm -rf $(GIT_CACHE)/*
62-
rm -rf __pycache__
59+
@rm -f prospector.log
60+
@rm -rf $(GIT_CACHE)/*
61+
@rm -rf __pycache__
62+
@rm -rf */__pycache__
63+
@rm -rf */*/__pycache__
64+
@rm -rf *report.html
65+
@rm -rf *.json
66+
@rm -rf requests-cache.sqlite

prospector/api/__init__.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +0,0 @@
1-
import os
2-
3-
DB_CONNECT_STRING = "postgresql://{}:{}@{}:{}/{}".format(
4-
os.environ["POSTGRES_USER"],
5-
os.environ["POSTGRES_PASSWORD"],
6-
os.environ["POSTGRES_HOST"],
7-
os.environ["POSTGRES_PORT"],
8-
os.environ["POSTGRES_DBNAME"],
9-
).lower()

prospector/api/api_test.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
from fastapi.testclient import TestClient
2-
import pytest
32

43
from api.main import app
54
from datamodel.commit import Commit
@@ -22,13 +21,13 @@ def test_status():
2221
def test_post_preprocessed_commits():
2322
commit_1 = Commit(
2423
repository="https://github.com/apache/dubbo", commit_id="yyy"
25-
).__dict__
24+
).as_dict()
2625
commit_2 = Commit(
2726
repository="https://github.com/apache/dubbo", commit_id="zzz"
28-
).__dict__
27+
).as_dict()
2928
commit_3 = Commit(
3029
repository="https://github.com/apache/struts", commit_id="bbb"
31-
).__dict__
30+
).as_dict()
3231
commits = [commit_1, commit_2, commit_3]
3332
response = client.post("/commits/", json=commits)
3433
assert response.status_code == 200
@@ -43,7 +42,7 @@ def test_get_specific_commit():
4342
assert response.json()[0]["commit_id"] == commit_id
4443

4544

46-
@pytest.mark.skip(reason="will raise exception")
45+
# @pytest.mark.skip(reason="will raise exception")
4746
def test_get_commits_by_repository():
4847
repository = "https://github.com/apache/dubbo"
4948
response = client.get("/commits/" + repository)

prospector/api/main.py

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,12 @@
1-
# import os
2-
31
import uvicorn
42
from fastapi import FastAPI
53

6-
# from fastapi import Depends
74
from fastapi.middleware.cors import CORSMiddleware
85
from fastapi.responses import HTMLResponse, RedirectResponse
96

107
# from .dependencies import oauth2_scheme
118
from api.routers import jobs, nvd, preprocessed, users
129

13-
# from commitdb.postgres import PostgresCommitDB
14-
15-
# from pprint import pprint
16-
17-
18-
# db = PostgresCommitDB()
19-
# db.connect(DB_CONNECT_STRING)
20-
2110
api_metadata = [
2211
{"name": "data", "description": "Operations with data used to train ML models."},
2312
{
@@ -72,4 +61,9 @@ async def get_status():
7261

7362

7463
if __name__ == "__main__":
75-
uvicorn.run(app, host="0.0.0.0", port=80)
64+
65+
uvicorn.run(
66+
app,
67+
host="0.0.0.0",
68+
port=80,
69+
)

prospector/api/routers/jobs.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,10 @@
55
from rq import Connection, Queue
66
from rq.job import Job
77

8-
import log.util
8+
from log.logger import logger
99
from api.routers.nvd_feed_update import main
1010
from git.git import do_clone
1111

12-
_logger = log.util.init_local_logger()
1312

1413
redis_url = os.environ["REDIS_URL"]
1514

@@ -57,7 +56,7 @@ async def get_job(job_id):
5756
queue = Queue()
5857
job = queue.fetch_job(job_id)
5958
if job:
60-
_logger.info("job {} result: {}".format(job.get_id(), job.result))
59+
logger.info("job {} result: {}".format(job.get_id(), job.result))
6160
response_object = {
6261
"job_data": {
6362
"job_id": job.get_id(),

prospector/api/routers/nvd.py

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,7 @@
66
from fastapi import APIRouter, HTTPException
77
from fastapi.responses import JSONResponse
88

9-
import log.util
10-
11-
_logger = log.util.init_local_logger()
9+
from log.logger import logger
1210

1311

1412
router = APIRouter(
@@ -25,37 +23,37 @@
2523

2624
@router.get("/vulnerabilities/by-year/{year}")
2725
async def get_vuln_list_by_year(year: str):
28-
_logger.debug("Requested list of vulnerabilities for " + year)
26+
logger.debug("Requested list of vulnerabilities for " + year)
2927

3028
if len(year) != 4 or not year.isdigit():
3129
return JSONResponse([])
3230

3331
data_dir = os.path.join(DATA_PATH, year)
3432
if not os.path.isdir(data_dir):
35-
_logger.info("No data found for year " + year)
33+
logger.info("No data found for year " + year)
3634
raise HTTPException(
3735
status_code=404, detail="No vulnerabilities found for " + year
3836
)
3937

40-
_logger.debug("Serving data for year " + year)
38+
logger.debug("Serving data for year " + year)
4139
vuln_ids = [vid.rstrip(".json") for vid in os.listdir(data_dir)]
4240
results = {"count": len(vuln_ids), "data": vuln_ids}
4341
return JSONResponse(results)
4442

4543

4644
@router.get("/vulnerabilities/{vuln_id}")
4745
async def get_vuln_data(vuln_id):
48-
_logger.debug("Requested data for vulnerability " + vuln_id)
46+
logger.debug("Requested data for vulnerability " + vuln_id)
4947

5048
year = vuln_id.split("-")[1]
5149
json_file = os.path.join(DATA_PATH, year, vuln_id.upper() + ".json")
5250
if not os.path.isfile(json_file):
53-
_logger.info("No file found: " + json_file)
51+
logger.info("No file found: " + json_file)
5452
raise HTTPException(
5553
status_code=404, detail=json_file
5654
) # detail="Vulnerability data not found")
5755

58-
_logger.debug("Serving file: " + json_file)
56+
logger.debug("Serving file: " + json_file)
5957
with open(json_file) as f:
6058
data = json.loads(f.read())
6159

@@ -64,7 +62,7 @@ async def get_vuln_data(vuln_id):
6462

6563
@router.get("/status")
6664
async def status():
67-
_logger.debug("Serving status page")
65+
logger.debug("Serving status page")
6866
out = dict()
6967
metadata_file = os.path.join(DATA_PATH, "metadata.json")
7068
if os.path.isfile(metadata_file):

prospector/api/routers/nvd_feed_update.py

Lines changed: 21 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,10 @@
2424
import requests
2525
from tqdm import tqdm
2626

27-
import log.util
27+
from log.logger import logger
2828

29-
_logger = log.util.init_local_logger()
29+
30+
NVD_API_KEY = os.getenv("NVD_API_KEY", "")
3031

3132
# note: The NVD has not data older than 2002
3233
START_FROM_YEAR = os.getenv("CVE_DATA_AS_OF_YEAR", "2002")
@@ -41,22 +42,20 @@ def do_update(quiet=False):
4142
with open(os.path.join(DATA_PATH, "metadata.json"), "r") as f:
4243
last_fetch_metadata = json.load(f)
4344
if not quiet:
44-
_logger.info("last fetch: " + last_fetch_metadata["sha256"])
45+
logger.info("last fetch: " + last_fetch_metadata["sha256"])
4546
except Exception:
4647
last_fetch_metadata["sha256"] = ""
47-
_logger.info(
48+
logger.info(
4849
"Could not read metadata about previous fetches"
4950
" (this might be the first time we fetch data).",
5051
exc_info=True,
5152
)
5253

5354
# read metadata of new data from the NVD site
54-
url = "https://nvd.nist.gov/feeds/json/cve/{}/nvdcve-{}-modified.meta".format(
55-
FEED_SCHEMA_VERSION, FEED_SCHEMA_VERSION
56-
)
57-
r = requests.get(url)
55+
url = "https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-modified.meta"
56+
r = requests.get(url, params={"apiKey": NVD_API_KEY})
5857
if r.status_code != 200:
59-
_logger.error(
58+
logger.error(
6059
"Received status code {} when contacting {}.".format(r.status_code, url)
6160
)
6261
return False
@@ -67,12 +66,12 @@ def do_update(quiet=False):
6766
d_split = d.split(":", 1)
6867
metadata_dict[d_split[0]] = d_split[1].strip()
6968
if not quiet:
70-
_logger.info("current: " + metadata_dict["sha256"])
69+
logger.info("current: " + metadata_dict["sha256"])
7170

7271
# check if the new data is actually new
7372
if last_fetch_metadata["sha256"] == metadata_dict["sha256"]:
7473
if not quiet:
75-
_logger.info("We already have this update, no new data to fetch.")
74+
logger.info("We already have this update, no new data to fetch.")
7675
return False
7776

7877
do_fetch("modified")
@@ -86,30 +85,28 @@ def do_fetch_full(start_from_year=START_FROM_YEAR, quiet=False):
8685
y for y in range(int(start_from_year), int(time.strftime("%Y")) + 1)
8786
]
8887
if not quiet:
89-
_logger.info("Fetching feeds: " + str(years_to_fetch))
88+
logger.info("Fetching feeds: " + str(years_to_fetch))
9089

9190
for y in years_to_fetch:
9291
if not do_fetch(y):
93-
_logger.error("Could not fetch data for year " + str(y))
92+
logger.error("Could not fetch data for year " + str(y))
9493

9594

9695
def do_fetch(what, quiet=True):
9796
"""
9897
the 'what' parameter can be a year or 'recent' or 'modified'
9998
"""
100-
url = "https://nvd.nist.gov/feeds/json/cve/{}/nvdcve-{}-{}.json.zip".format(
101-
FEED_SCHEMA_VERSION, FEED_SCHEMA_VERSION, what
102-
)
103-
r = requests.get(url)
99+
url = f"https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-{what}.json.zip"
100+
r = requests.get(url, params={"apiKey": NVD_API_KEY})
104101
if r.status_code != 200:
105-
_logger.error(
102+
logger.error(
106103
"Received status code {} when contacting {}.".format(r.status_code, url)
107104
)
108105
return False
109106

110107
with closing(r), zipfile.ZipFile(io.BytesIO(r.content)) as archive:
111108
for f in archive.infolist():
112-
_logger.info(f.filename)
109+
logger.info(f.filename)
113110
data = json.loads(archive.read(f).decode())
114111

115112
if not quiet:
@@ -135,17 +132,17 @@ def need_full(quiet=False):
135132
if os.path.exists(DATA_PATH) and os.path.isdir(DATA_PATH):
136133
if not os.listdir(DATA_PATH):
137134
if not quiet:
138-
_logger.info("Data folder {} is empty".format(DATA_PATH))
135+
logger.info("Data folder {} is empty".format(DATA_PATH))
139136
return True
140137

141138
# Directory exists and is not empty
142139
if not quiet:
143-
_logger.info("Data folder found at " + DATA_PATH)
140+
logger.info("Data folder found at " + DATA_PATH)
144141
return False
145142

146143
# Directory doesn't exist
147144
if not quiet:
148-
_logger.info("Data folder {} does not exist".format(DATA_PATH))
145+
logger.info("Data folder {} does not exist".format(DATA_PATH))
149146
return True
150147

151148

@@ -162,5 +159,5 @@ def main(force, quiet):
162159
do_update(quiet=quiet)
163160

164161

165-
if __name__ == "__main__":
166-
plac.call(main)
162+
# if __name__ == "__main__":
163+
# plac.call(main)

0 commit comments

Comments
 (0)