diff --git a/.gitignore b/.gitignore
index 23ecf4a..c045983 100644
--- a/.gitignore
+++ b/.gitignore
@@ -162,4 +162,5 @@
 cython_debug/
 **/checkpoints/*
 .vscode/
-models/
\ No newline at end of file
+models/
+data/*/
\ No newline at end of file
diff --git a/benchmark/pyod_.py b/benchmark/pyod_.py
index 0f75759..3a54228 100644
--- a/benchmark/pyod_.py
+++ b/benchmark/pyod_.py
@@ -23,10 +23,6 @@
                          KNN, LMDD, LOF, MCD, OCSVM, PCA,
                          FeatureBagging, IForest)
 
-# TODO: add sklearnex to accelerate sklearn
-# from sklearnex import patch_sklearn
-# patch_sklearn()
-
 warnings.filterwarnings("ignore")
diff --git a/docs/Makefile b/docs/Makefile
index 70e71f9..d4bb2cb 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -6,7 +6,7 @@
 SPHINXOPTS    ?=
 SPHINXBUILD   ?= sphinx-build
 SOURCEDIR     = .
-BUILDDIR      = .. #_build
+BUILDDIR      = _build
 
 # Put it first so that "make" without argument is like "make help".
 help:
diff --git a/docs/_static/custom.css b/docs/_static/custom.css
index 33a972c..544c81f 100644
--- a/docs/_static/custom.css
+++ b/docs/_static/custom.css
@@ -3,4 +3,11 @@
     /* or any size you want */
     height: auto;
     /* keep aspect ratio */
+}
+
+.wy-nav-content {
+    padding: 1.618em 3.236em;
+    height: 100%;
+    max-width: 1600px;
+    margin: auto;
 }
\ No newline at end of file
diff --git a/docs/_static/flowbench.png b/docs/_static/flowbench.png
new file mode 100644
index 0000000..3746ba7
Binary files /dev/null and b/docs/_static/flowbench.png differ
diff --git a/docs/conf.py b/docs/conf.py
index bc4fb22..5f704e0 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -18,6 +18,7 @@
     'sphinx.ext.autodoc',
     'sphinx.ext.napoleon',
     'sphinx.ext.mathjax',
+    'sphinx.ext.doctest',
 ]
 
 templates_path = ['_templates']
diff --git a/docs/examples.rst b/docs/examples.rst
index ce68152..8681559 100644
--- a/docs/examples.rst
+++ b/docs/examples.rst
@@ -1,3 +1,159 @@
 Examples
 ========
 
+Load Dataset
+------------
+
+- Load data as graphs in ``pytorch_geometric`` format:
+
+  .. code-block:: python
+
+      from flowbench.dataset import FlowDataset
+      dataset = FlowDataset(root="./", name="montage")
+      data = dataset[0]
+
+  The ``data`` object exposes the graph structure via ``data.edge_index`` and the node features via ``data.x``.
+
+- Load data as tabular data in ``pytorch`` format:
+
+  .. code-block:: python
+
+      from flowbench.dataset import FlowDataset
+      dataset = FlowDataset(root="./", name="montage")
+      data = dataset[0]
+      Xs = data.x
+      ys = data.y
+
+  Unlike the graph ``pyg.data`` usage above, only the node features and labels are used here.
+
+- Load data as tabular data in ``numpy`` format:
+
+  .. code-block:: python
+
+      from flowbench.dataset import FlowDataset
+      dataset = FlowDataset(root="./", name="montage")
+      data = dataset[0]
+      Xs = data.x.numpy()
+      ys = data.y.numpy()
+
+  This is the same as the previous example, except that the data is in ``numpy`` format, which is what models from ``sklearn`` and ``xgboost`` typically expect.
+
+- Load text data with the ``huggingface`` interface.
+  We have uploaded our parsed text data to the ``huggingface`` hub. You can load it with the following code:
+
+  .. code-block:: python
+
+      from datasets import load_dataset
+      dataset = load_dataset("cshjin/poseidon", "1000genome")
+
+  The dataset is a ``dict`` with keys ``train``, ``test``, and ``validation``.
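+
+  A quick way to inspect the loaded splits (a sketch, assuming the standard ``datasets.DatasetDict`` interface returned by ``load_dataset``):
+
+  .. code-block:: python
+
+      train_split = dataset["train"]
+      print(train_split.num_rows)      # number of parsed samples in the split
+      print(train_split.column_names)  # fields available per sample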
+
+PyOD Models
+-----------
+
+=================== ================ ====================================================================================================== ===== ===================================================
+Type                Abbr             Algorithm                                                                                              Year  Class
+=================== ================ ====================================================================================================== ===== ===================================================
+Probabilistic       ABOD             Angle-Based Outlier Detection                                                                          2008  :class:`flowbench.unsupervised.pyod.ABOD`
+Probabilistic       KDE              Outlier Detection with Kernel Density Functions                                                        2007  :class:`flowbench.unsupervised.pyod.KDE`
+Probabilistic       GMM              Probabilistic Mixture Modeling for Outlier Analysis                                                          :class:`flowbench.unsupervised.pyod.GMM`
+Linear Model        PCA              Principal Component Analysis (the sum of weighted projected distances to the eigenvector hyperplanes)  2003  :class:`flowbench.unsupervised.pyod.PCA`
+Linear Model        OCSVM            One-Class Support Vector Machines                                                                      2001  :class:`flowbench.unsupervised.pyod.OCSVM`
+Linear Model        LMDD             Deviation-based Outlier Detection (LMDD)                                                               1996  :class:`flowbench.unsupervised.pyod.LMDD`
+Proximity-Based     LOF              Local Outlier Factor                                                                                   2000  :class:`flowbench.unsupervised.pyod.LOF`
+Proximity-Based     CBLOF            Clustering-Based Local Outlier Factor                                                                  2003  :class:`flowbench.unsupervised.pyod.CBLOF`
+Proximity-Based     kNN              k Nearest Neighbors (use the distance to the kth nearest neighbor as the outlier score)                2000  :class:`flowbench.unsupervised.pyod.KNN`
+Outlier Ensembles   IForest          Isolation Forest                                                                                       2008  :class:`flowbench.unsupervised.pyod.IForest`
+Outlier Ensembles   INNE             Isolation-based Anomaly Detection Using Nearest-Neighbor Ensembles                                     2018  :class:`flowbench.unsupervised.pyod.INNE`
+Outlier Ensembles   LSCP             LSCP: Locally Selective Combination of Parallel Outlier Ensembles                                      2019  :class:`flowbench.unsupervised.pyod.LSCP`
+=================== ================ ====================================================================================================== ===== ===================================================
+
+- Example of using `GMM` (reading the outlier scores is sketched below):
+
+  .. code-block:: python
+
+      from flowbench.unsupervised.pyod import GMM
+      from flowbench.dataset import FlowDataset
+      dataset = FlowDataset(root="./", name="1000genome")
+      data = dataset[0]
+      Xs = data.x.numpy()
+      clf = GMM()
+      clf.fit(Xs)
+      y_pred = clf.predict(Xs)
+
+  - Detailed example in ``example/demo_pyod.py``
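+
+  Since these classes wrap the ``pyod`` detectors, the fitted model should also expose the raw outlier scores (a sketch, assuming the wrappers inherit the standard ``pyod`` ``BaseDetector`` attributes):
+
+  .. code-block:: python
+
+      # raw outlier scores on the training data (higher = more anomalous)
+      scores = clf.decision_scores_
+      # outlier scores for (new) samples
+      new_scores = clf.decision_function(Xs)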
+
+PyGOD Models
+------------
+
+=========== ================== ===== ==============================================
+Type        Abbr               Year  Class
+=========== ================== ===== ==============================================
+Clustering  SCAN               2007  :class:`flowbench.unsupervised.pygod.SCAN`
+GNN+AE      GAE                2016  :class:`flowbench.unsupervised.pygod.GAE`
+MF          Radar              2017  :class:`flowbench.unsupervised.pygod.Radar`
+MF          ANOMALOUS          2018  :class:`flowbench.unsupervised.pygod.ANOMALOUS`
+MF          ONE                2019  :class:`flowbench.unsupervised.pygod.ONE`
+GNN+AE      DOMINANT           2019  :class:`flowbench.unsupervised.pygod.DOMINANT`
+MLP+AE      DONE               2020  :class:`flowbench.unsupervised.pygod.DONE`
+MLP+AE      AdONE              2020  :class:`flowbench.unsupervised.pygod.AdONE`
+GNN+AE      AnomalyDAE         2020  :class:`flowbench.unsupervised.pygod.AnomalyDAE`
+GAN         GAAN               2020  :class:`flowbench.unsupervised.pygod.GAAN`
+GNN+AE      DMGD               2020  :class:`flowbench.unsupervised.pygod.DMGD`
+GNN         OCGNN              2021  :class:`flowbench.unsupervised.pygod.OCGNN`
+GNN+AE+SSL  CoLA               2021  :class:`flowbench.unsupervised.pygod.CoLA`
+GNN+AE      GUIDE              2021  :class:`flowbench.unsupervised.pygod.GUIDE`
+GNN+AE+SSL  CONAD              2022  :class:`flowbench.unsupervised.pygod.CONAD`
+GNN+AE      GADNR              2024  :class:`flowbench.unsupervised.pygod.GADNR`
+=========== ================== ===== ==============================================
+
+- Example of using `GAE`:
+
+  .. code-block:: python
+
+      from flowbench.unsupervised.pygod import GAE
+      from flowbench.dataset import FlowDataset
+      dataset = FlowDataset(root="./", name="1000genome")
+      data = dataset[0]
+      clf = GAE()
+      clf.fit(data)
+
+  - Detailed example in ``example/demo_pygod.py``
+
+
+Supervised Models
+-----------------
+
+- Example of using `MLP`:
+
+  .. code-block:: python
+
+      from flowbench.supervised.mlp import MLPClassifier
+      from flowbench.dataset import FlowDataset
+      dataset = FlowDataset(root="./", name="1000genome")
+      data = dataset[0]
+      clf = MLPClassifier()
+      clf.fit(data)
+
+  - Detailed example in ``example/demo_supervised.py``
+
+Supervised fine-tuned LLMs
+--------------------------
+
+- Example of using LoRA (Low-Rank Adaptation) for supervised fine-tuning of LLMs (the construction of ``peft_model`` is sketched below):
+
+  .. code-block:: python
+
+      from datasets import load_dataset
+      from peft import LoraConfig, TaskType
+      from transformers import Trainer, TrainingArguments
+
+      dataset = load_dataset("cshjin/poseidon", "1000genome")
+      # data processing
+      ...
+      # LoRA config
+      peft_config = LoraConfig(task_type=TaskType.SEQ_CLS,
+                               inference_mode=False,
+                               r=8, lora_alpha=32, lora_dropout=0.1)
+      training_args = TrainingArguments(...)
+      # LoRA trainer
+      trainer = Trainer(peft_model, ...)
+      trainer.train()
+      ...
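+
+  The snippet above elides how ``peft_model`` is built. A minimal sketch, assuming a BERT-style sequence-classification backbone and the standard ``peft`` API (the model name is illustrative):
+
+  .. code-block:: python
+
+      from peft import get_peft_model
+      from transformers import AutoModelForSequenceClassification
+
+      # wrap the base model with the LoRA adapters defined in peft_config
+      base_model = AutoModelForSequenceClassification.from_pretrained(
+          "bert-base-uncased", num_labels=2)
+      peft_model = get_peft_model(base_model, peft_config)
+      peft_model.print_trainable_parameters()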
+
+  - Detailed example in ``example/demo_sft_lora.py``
\ No newline at end of file
diff --git a/docs/index.rst b/docs/index.rst
index 030f15e..dc8264a 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -10,7 +10,13 @@
 Flow-Bench is a benchmark dataset for anomaly detection techniques in computational workflows.
 Flow-Bench contains workflow execution traces, executed on distributed infrastructure, that include systematically injected anomalies (labeled), and offers both the raw execution logs and a more compact parsed version.
 In this GitHub repository, apart from the logs and traces, you will find sample code to load and process the parsed data using pytorch, as well as, the code used to parse the raw logs and events.
 
+.. figure:: _static/flowbench.png
+   :alt: FlowBench Outline
+   :align: center
+   :scale: 50%
+
+   Figure: FlowBench - An Anomaly Detection Benchmark Dataset
+
 .. toctree::
    :maxdepth: 2
    :caption: Contents:
@@ -25,6 +31,7 @@
    flowbench.nlp
 
    license
+
 
 Indices and tables
 ==================
diff --git a/flowbench/supervised/gnn.py b/flowbench/supervised/gnn.py
index f0d7db4..845c753 100644
--- a/flowbench/supervised/gnn.py
+++ b/flowbench/supervised/gnn.py
@@ -86,92 +86,92 @@ def configure_optimizers(self):
         return torch.optim.Adam(self.parameters(), lr=self.lr)
 
 
-class GNN_v2(L.LightningModule):
-    r""" GNN model for node classification
-    NOTE: the version used in WORKS'22 paper
-    """
-
-    def __init__(self, num_features, num_classes, **kwargs):
-        super(GNN_v2, self).__init__()
-        self.hidden_dim = kwargs.get('hidden_dim', 128)
-        self.num_layers = kwargs.get('num_layers', 3)
-        self.lr = kwargs.get('lr', 1e-4)
-        self.dropout = kwargs.get('dropout', 0.5)
-
-        # add the ability to add one or more conv layers
-        conv_blocks = [
-            GCNConv(num_features, self.hidden_dim),
-            ReLU(),
-        ]
-
-        # ability to add one or more conv blocks
-        for _ in range(self.num_layers - 1):
-            conv_blocks += [
-                GCNConv(self.hidden_dim, self.hidden_dim),
-                ReLU(),
-                GCNConv(self.hidden_dim, self.hidden_dim),
-                ReLU(),
-            ]
-
-        # group all the conv layers
-        self.conv_layers = ModuleList(conv_blocks)
-
-        # add the linear layers for flattening the output from MPNN
-        self.flatten = Sequential(
-            Linear(self.hidden_dim, self.hidden_dim),
-            ReLU(),
-            Linear(self.hidden_dim, num_classes))
-
-        self.acc = torchmetrics.Accuracy(task='binary')
-        self.auroc = torchmetrics.AUROC(task='binary')
-
-    def forward(self, data):
-        # process the layers
-        x, edge_index = data.x, data.edge_index
-        for idx, layer in enumerate(self.conv_layers):
-            if isinstance(layer, GCNConv):
-                x = layer(x, edge_index)
-            else:
-                x = layer(x)
-            x = F.dropout(x, p=self.dropout, training=self.training)
-        # pass the output to the linear output layer
-        out = self.flatten(x)
-
-        # return the output
-        return F.log_softmax(out, dim=1)
-
-    def training_step(self, batch, batch_idx):
-        x = self.forward(batch)
-        loss = torch.nn.functional.cross_entropy(x, batch.y)
-        # self.log('train_loss', loss)
-        # acc = self.acc(x.argmax(dim=1), batch.y)
-        # self.log('train_acc', acc, on_epoch=False, prog_bar=False, on_step=False)
-        # auc = self.auroc(x.argmax(dim=1), batch.y)
-        # self.log('train_auc', auc, on_epoch=False)
-        return loss
-
-    def validation_step(self, batch, batch_idx):
-        x = self.forward(batch)
-        loss = torch.nn.functional.cross_entropy(x, batch.y)
-        self.log('val_loss', loss)
-        acc = self.acc(x.argmax(dim=1), batch.y)
-        self.log('val_acc', acc, on_epoch=True, prog_bar=True, on_step=False)
-        auc = self.auroc(x.argmax(dim=1), batch.y)
-        self.log('val_auc', auc, on_epoch=True)
-        return loss
-
-    def test_step(self, batch, batch_idx):
-        x = self.forward(batch)
-        loss = torch.nn.functional.cross_entropy(x, batch.y)
-        self.log('test_loss', loss)
-        acc = self.acc(x.argmax(dim=1), batch.y)
-        self.log('test_acc', acc, on_epoch=True)
-        auc = self.auroc(x.argmax(dim=1), batch.y)
-        self.log('test_auc', auc, on_epoch=True)
-        return loss
-
-    def configure_optimizers(self):
-        return torch.optim.Adam(self.parameters(), lr=self.lr)
+# class GNN_v2(L.LightningModule):
+#     r""" GNN model for node classification
+#     NOTE: the version used in WORKS'22 paper
+#     """
+
+#     def __init__(self, num_features, num_classes, **kwargs):
+#         super(GNN_v2, self).__init__()
+#         self.hidden_dim = kwargs.get('hidden_dim', 128)
+#         self.num_layers = kwargs.get('num_layers', 3)
+#         self.lr = kwargs.get('lr', 1e-4)
+#         self.dropout = kwargs.get('dropout', 0.5)
+
+#         # add the ability to add one or more conv layers
+#         conv_blocks = [
+#             GCNConv(num_features, self.hidden_dim),
+#             ReLU(),
+#         ]
+
+#         # ability to add one or more conv blocks
+#         for _ in range(self.num_layers - 1):
+#             conv_blocks += [
+#                 GCNConv(self.hidden_dim, self.hidden_dim),
+#                 ReLU(),
+#                 GCNConv(self.hidden_dim, self.hidden_dim),
+#                 ReLU(),
+#             ]
+
+#         # group all the conv layers
+#         self.conv_layers = ModuleList(conv_blocks)
+
+#         # add the linear layers for flattening the output from MPNN
+#         self.flatten = Sequential(
+#             Linear(self.hidden_dim, self.hidden_dim),
+#             ReLU(),
+#             Linear(self.hidden_dim, num_classes))
+
+#         self.acc = torchmetrics.Accuracy(task='binary')
+#         self.auroc = torchmetrics.AUROC(task='binary')
+
+#     def forward(self, data):
+#         # process the layers
+#         x, edge_index = data.x, data.edge_index
+#         for idx, layer in enumerate(self.conv_layers):
+#             if isinstance(layer, GCNConv):
+#                 x = layer(x, edge_index)
+#             else:
+#                 x = layer(x)
+#             x = F.dropout(x, p=self.dropout, training=self.training)
+#         # pass the output to the linear output layer
+#         out = self.flatten(x)
+
+#         # return the output
+#         return F.log_softmax(out, dim=1)
+
+#     def training_step(self, batch, batch_idx):
+#         x = self.forward(batch)
+#         loss = torch.nn.functional.cross_entropy(x, batch.y)
+#         # self.log('train_loss', loss)
+#         # acc = self.acc(x.argmax(dim=1), batch.y)
+#         # self.log('train_acc', acc, on_epoch=False, prog_bar=False, on_step=False)
+#         # auc = self.auroc(x.argmax(dim=1), batch.y)
+#         # self.log('train_auc', auc, on_epoch=False)
+#         return loss
+
+#     def validation_step(self, batch, batch_idx):
+#         x = self.forward(batch)
+#         loss = torch.nn.functional.cross_entropy(x, batch.y)
+#         self.log('val_loss', loss)
+#         acc = self.acc(x.argmax(dim=1), batch.y)
+#         self.log('val_acc', acc, on_epoch=True, prog_bar=True, on_step=False)
+#         auc = self.auroc(x.argmax(dim=1), batch.y)
+#         self.log('val_auc', auc, on_epoch=True)
+#         return loss
+
+#     def test_step(self, batch, batch_idx):
+#         x = self.forward(batch)
+#         loss = torch.nn.functional.cross_entropy(x, batch.y)
+#         self.log('test_loss', loss)
+#         acc = self.acc(x.argmax(dim=1), batch.y)
+#         self.log('test_acc', acc, on_epoch=True)
+#         auc = self.auroc(x.argmax(dim=1), batch.y)
+#         self.log('test_auc', auc, on_epoch=True)
+#         return loss
+
+#     def configure_optimizers(self):
+#         return torch.optim.Adam(self.parameters(), lr=self.lr)
 
 
 class PyG_GNN(L.LightningModule):
diff --git a/flowbench/supervised/utils.py b/flowbench/supervised/utils.py
index 4018e24..a3673a7 100644
--- a/flowbench/supervised/utils.py
+++ b/flowbench/supervised/utils.py
@@ -8,6 +8,7 @@
 
 def early_stopping_callback(minitor='val_loss', patience=5, mode='min'):
     r""" Early stopping callback.
+
     Args:
         minitor (str): The metric to monitor.
         patience (int): Number of epochs with no improvement after which training will be stopped.
diff --git a/flowbench/unsupervised/pygod.py b/flowbench/unsupervised/pygod.py
index 1ed74a3..49840c1 100644
--- a/flowbench/unsupervised/pygod.py
+++ b/flowbench/unsupervised/pygod.py
@@ -7,14 +7,14 @@
 of `pyod.detector`.
 
 Citation:
-@article{liu2022bond,
-  title={Bond: Benchmarking unsupervised outlier node detection on static attributed graphs},
-  author={Liu, Kay and Dou, Yingtong and Zhao, Yue and Ding, Xueying and Hu, Xiyang and Zhang, Ruitong and Ding, Kaize and Chen, Canyu and Peng, Hao and Shu, Kai and Sun, Lichao and Li, Jundong and Chen, George H. and Jia, Zhihao and Yu, Philip S.},
-  journal={Advances in Neural Information Processing Systems},
-  volume={35},
-  pages={27021--27035},
-  year={2022}
-}
+    @article{liu2022bond,
+      title={Bond: Benchmarking unsupervised outlier node detection on static attributed graphs},
+      author={Liu, Kay and Dou, Yingtong and Zhao, Yue and Ding, Xueying and Hu, Xiyang and Zhang, Ruitong and Ding, Kaize and Chen, Canyu and Peng, Hao and Shu, Kai and Sun, Lichao and Li, Jundong and Chen, George H. and Jia, Zhihao and Yu, Philip S.},
+      journal={Advances in Neural Information Processing Systems},
+      volume={35},
+      pages={27021--27035},
+      year={2022}
+    }
 
 For more information, please refer to https://docs.pygod.org/en/latest/.
diff --git a/flowbench/unsupervised/pyod.py b/flowbench/unsupervised/pyod.py
index e0f8218..a09dbbe 100644
--- a/flowbench/unsupervised/pyod.py
+++ b/flowbench/unsupervised/pyod.py
@@ -7,17 +7,17 @@
 of `pyod.models`.
 
 Citation:
-@article{PyOD2019,
-  author = {Zhao, Yue},
-  title = {PyOD: A Python Toolbox for Scalable Outlier Detection},
-  year = {2019},
-  publisher = {Journal of Machine Learning Research},
-  journal = {JMLR},
-  volume = {20},
-  number = {96},
-  pages = {1-7},
-  url = {http://jmlr.org/papers/v20/19-011.html},
-}
+    @article{PyOD2019,
+      author = {Zhao, Yue},
+      title = {PyOD: A Python Toolbox for Scalable Outlier Detection},
+      year = {2019},
+      publisher = {Journal of Machine Learning Research},
+      journal = {JMLR},
+      volume = {20},
+      number = {96},
+      pages = {1-7},
+      url = {http://jmlr.org/papers/v20/19-011.html},
+    }
 
 For more information, please refer to https://pyod.readthedocs.io/.