Skip to content

Commit ab169b3

Browse files
committed
Try a simple test via GitHub Actions
1 parent c4e2f5a commit ab169b3

File tree

9 files changed

+103
-15
lines changed

9 files changed

+103
-15
lines changed

.github/workflows/ci.yml

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
# CI workflow: skip duplicate runs, then build the test DB, train, and evaluate.
name: CI
on: [push, pull_request]
jobs:
  # Gate job: detects duplicate/concurrent runs with identical content so
  # the expensive build job can be skipped.
  pre_job:
    name: Check duplicate
    runs-on: ubuntu-latest
    outputs:
      should_skip: ${{ steps.skip_check.outputs.should_skip }}
    steps:
      - id: skip_check
        # Pinned to a full commit SHA for supply-chain safety.
        uses: fkirc/skip-duplicate-actions@12aca0a884f6137d619d6a8a09fcc3406ced5281
        with:
          cancel_others: true
          concurrent_skipping: same_content
          do_not_skip: '["pull_request", "schedule", "workflow_dispatch"]'

  build:
    name: Test
    needs: pre_job
    if: ${{ needs.pre_job.outputs.should_skip != 'true' }}
    runs-on: ubuntu-latest
    steps:
      # Normalize line endings before checkout so text files match expectations.
      - run: git config --global core.autocrlf input
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
      - uses: actions/setup-python@v4
        with:
          python-version: '3.11'
          # NOTE(review): setup-python's pipenv cache generally expects pipenv
          # to be installed before this step; pipenv is installed below —
          # confirm the cache actually saves/restores.
          cache: 'pipenv'
      - name: Install pipenv
        run: curl https://raw.githubusercontent.com/pypa/pipenv/master/get-pipenv.py | python
      - run: pipenv install
      # End-to-end smoke test: compile the pattern DB, generate tiny fixtures,
      # preprocess, train, then evaluate expecting perfect accuracy.
      - name: Compile database
        run: pipenv run python compile_db.py --output test/hs.db < test/patterns.json > test/patterns_final.json
      - name: Generate test data
        run: pipenv run python gen_test_data.py
      - name: Preprocess training data
        run: pipenv run python preprocess.py train --database test/hs.db --sherlock-path test/ --output-dir test/
      - name: Train the model
        run: pipenv run python train.py --sherlock-path test/ --input-dir test/ --output-dir test/
      - name: Preprocess test data
        run: pipenv run python preprocess.py test --database test/hs.db --sherlock-path test/ --output-dir test/
      - name: Evaluate the model
        run: pipenv run python test.py --sherlock-path test/ --input-dir test/ | grep "weighted avg 1.00"

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,5 @@ patterns.json
1010
*.json
1111
*.h5
1212
*.png
13+
*.parquet
1314
regex101/

compile_db.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,15 @@
1+
import argparse
12
import ast
23
import hyperscan
34
import json
45
import pickle
56
import sys
67

78

9+
parser = argparse.ArgumentParser()
10+
parser.add_argument("-o", "--output", default="hs.db")
11+
args = parser.parse_args()
12+
813
sys.stderr.write("Collecting patterns...\n")
914
regexes = set()
1015
for line in sys.stdin:
@@ -34,5 +39,5 @@
3439
# Compile the final database and save to file
3540
sys.stderr.write("Compiling %d patterns...\n" % len(patterns))
3641
db.compile(expressions=patterns, ids=ids, flags=flags)
37-
with open("hs.db", "wb") as f:
42+
with open(args.output, "wb") as f:
3843
pickle.dump([len(patterns), hyperscan.dumpb(db)], f)

gen_test_data.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
import pandas as pd
2+
3+
4+
data = ["['a', 'b', 'c']", "['1', '2', '3']"]
5+
df = pd.DataFrame(data, columns=['values'])
6+
df.to_parquet('test/test_values.parquet', index=True)
7+
8+
df = pd.DataFrame(data * 100, columns=['values'])
9+
df.to_parquet('test/train_values.parquet', index=True)
10+
11+
labels = ["alpha", "numeric"]
12+
df = pd.DataFrame(labels, columns=['type'])
13+
df.to_parquet('test/test_labels.parquet', index=True)
14+
15+
df = pd.DataFrame(labels * 100, columns=['type'])
16+
df.to_parquet('test/train_labels.parquet', index=True)

preprocess.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,16 @@
2323

2424
parser = argparse.ArgumentParser()
2525
parser.add_argument("dataset", choices=["train", "test"])
26+
parser.add_argument("--database", default="hs.db")
27+
parser.add_argument("--sherlock-path", default="../sherlock-project/data/data/raw")
28+
parser.add_argument("--output-dir", default=".")
2629
args = parser.parse_args()
2730

28-
output_file = f"preprocessed_{args.dataset}.txt"
31+
output_file = os.path.join(args.output_dir, f"preprocessed_{args.dataset}.txt")
2932

3033
# Load the precompiled regular expression database
3134
sys.stderr.write("Loading regexes from file…\n")
32-
with open("hs.db", "rb") as f:
35+
with open(args.database, "rb") as f:
3336
[num_patterns, bdb] = pickle.load(f)
3437
db = hyperscan.loadb(bdb)
3538
# Scratch is not correctly initialized for deserialized databases
@@ -43,7 +46,7 @@ def on_match(match_id, from_idx, to_idx, flags, context):
4346

4447

4548
# Load the values
46-
pq_values = ParquetFile(f"../sherlock-project/data/data/raw/{args.dataset}_values.parquet")
49+
pq_values = ParquetFile(os.path.join(args.sherlock_path, f"{args.dataset}_values.parquet"))
4750

4851
# Remove the output if it exists
4952
if os.path.exists(output_file):

test.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import argparse
2+
import os
13
import sys
24

35
import numpy as np
@@ -10,22 +12,27 @@
1012

1113
BATCH_SIZE = 1000
1214

15+
parser = argparse.ArgumentParser()
16+
parser.add_argument("--sherlock-path", default="../sherlock-project/data/data/raw")
17+
parser.add_argument("--input-dir", default=".")
18+
args = parser.parse_args()
19+
1320
sys.stderr.write("Loading labels...\n")
14-
pq_labels = ParquetFile("../sherlock-project/data/data/raw/test_labels.parquet")
21+
pq_labels = ParquetFile(os.path.join(args.sherlock_path, "test_labels.parquet"))
1522
labels = pd.DataFrame(
1623
{"type": pd.Categorical(pq_labels.read(columns=["type"]).columns[0].to_numpy())}
1724
)
1825
le = LabelEncoder()
19-
le.classes_ = np.load("classes.npy", allow_pickle=True)
26+
le.classes_ = np.load(os.path.join(args.input_dir, "classes.npy"), allow_pickle=True)
2027
# labels = le.transform(labels.values.ravel())
2128
num_examples = len(labels)
2229

23-
model = model_from_json(open("nn_model_sherlock.json", "r").read())
24-
model.load_weights("nn_model_weights_sherlock.h5")
30+
model = model_from_json(open(os.path.join(args.input_dir, "nn_model_sherlock.json"), "r").read())
31+
model.load_weights(os.path.join(args.input_dir, "nn_model_weights_sherlock.h5"))
2532

2633
sys.stderr.write("Evaluating...\n")
2734
labels_pred = [""] * len(labels)
28-
preprocessed = open("preprocessed_test.txt", "r")
35+
preprocessed = open(os.path.join(args.input_dir, "preprocessed_test.txt"), "r")
2936
batch = 0
3037
with tqdm(total=len(labels)) as pbar:
3138
while True:

test/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
!patterns.json

test/patterns.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
"^[A-Za-z]$"
2+
"^[0-9]$"

train.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import argparse
2+
import os
13
import sys
24

35
import numpy as np
@@ -17,8 +19,14 @@
1719

1820
BATCH_SIZE = 1000
1921

22+
parser = argparse.ArgumentParser()
23+
parser.add_argument("--sherlock-path", default="../sherlock-project/data/data/raw")
24+
parser.add_argument("--input-dir", default=".")
25+
parser.add_argument("--output-dir", default=".")
26+
args = parser.parse_args()
27+
2028
sys.stderr.write("Loading labels...\n")
21-
pq_labels = ParquetFile("../sherlock-project/data/data/raw/train_labels.parquet")
29+
pq_labels = ParquetFile(os.path.join(args.sherlock_path, "train_labels.parquet"))
2230
labels = pd.DataFrame(
2331
{
2432
"type": pd.Categorical(
@@ -31,10 +39,10 @@
3139
# Encode the labels as integers
3240
le = LabelEncoder().fit(labels.values.ravel())
3341
labels = le.transform(labels.values.ravel())
34-
np.save("classes.npy", le.classes_)
42+
np.save(os.path.join(args.output_dir, "classes.npy"), le.classes_)
3543

3644
# Load one row just to get the shape of the input
37-
preprocessed = open("preprocessed_train.txt", "r")
45+
preprocessed = open(os.path.join(args.input_dir, "preprocessed_train.txt"), "r")
3846
matrix = np.loadtxt(preprocessed, max_rows=1)
3947
regex_shape = matrix.shape[0]
4048

@@ -78,9 +86,9 @@
7886
loss="categorical_crossentropy",
7987
metrics=["categorical_accuracy"],
8088
)
81-
open("nn_model_sherlock.json", "w").write(model.to_json())
89+
open(os.path.join(args.output_dir, "nn_model_sherlock.json"), "w").write(model.to_json())
8290

83-
preprocessed = open("preprocessed_train.txt", "r")
91+
preprocessed = open(os.path.join(args.input_dir, "preprocessed_train.txt"), "r")
8492
i = 0
8593
with tqdm(total=len(labels)) as pbar:
8694
while True:
@@ -105,4 +113,4 @@
105113
pbar.update(len(matrix))
106114

107115
# Save the trained model weights
108-
model.save_weights("nn_model_weights_sherlock.h5")
116+
model.save_weights(os.path.join(args.output_dir, "nn_model_weights_sherlock.h5"))

0 commit comments

Comments
 (0)