Skip to content

Commit d62feb0

Browse files
authored
Polishing (#13)
* chore: isort/black * doc: update README * chore: name
1 parent 0cf2790 commit d62feb0

File tree

9 files changed

+22
-15
lines changed

9 files changed

+22
-15
lines changed

README.md

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# Regression Transformer
22
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
3+
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
4+
[![Gradio demo](https://img.shields.io/website-up-down-green-red/https/hf.space/gradioiframe/GT4SD/regression_transformer/+.svg?label=demo%20status)](https://huggingface.co/spaces/GT4SD/regression_transformer)
35

46
A multitask Transformer that reformulates regression as a conditional sequence modeling task.
57
This yields a dichotomous language model that seamlessly integrates regression with property-driven conditional generation.
@@ -9,7 +11,7 @@ This yields a dichotomous language model that seamlessly integrates regression w
911
This repo contains the development code.
1012

1113
## Demo with UI
12-
🤗 A gradio demo with a simple UI is available at: https://huggingface.co/spaces/jannisborn/regression_transformer
14+
🤗 A gradio demo with a simple UI is available on [HuggingFace spaces](https://huggingface.co/spaces/GT4SD/regression_transformer)
1315
![Summary](assets/gradio_demo.png)
1416

1517

@@ -123,10 +125,10 @@ At this point the folder containing the vocabulary file can be used to load a to
123125
If you use the regression transformer, please cite:
124126
```bib
125127
@article{born2022regression,
126-
title={Regression Transformer: Concurrent Conditional Generation and Regression by Blending Numerical and Textual Tokens},
128+
title={Regression Transformer enables concurrent sequence regression and generation for molecular language modeling},
127129
author={Born, Jannis and Manica, Matteo},
128-
journal={arXiv preprint arXiv:2202.01338},
129-
note={Spotlight talk at ICLR workshop on Machine Learning for Drug Discovery},
130-
year={2022}
130+
journal={Nature Machine Intelligence},
131+
note={Article in press. arXiv preprint arXiv:2202.01338},
132+
year={2023}
131133
}
132134
```

terminator/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
"""Utilities for transformer-based conditional molecule generation."""
22
__version__ = "0.0.1"
3+
__name__ = "terminator"

terminator/collators.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
from dataclasses import dataclass
22
from typing import Dict, Iterable, List, Optional, Tuple, Union
3-
import transformers
3+
44
import torch
5+
import transformers
56
from transformers import DataCollatorForPermutationLanguageModeling
67
from transformers.tokenization_utils import PreTrainedTokenizer
78
from transformers.tokenization_utils_base import BatchEncoding

terminator/datasets.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@ def get_dataset(
88
line_by_line: bool = True,
99
):
1010
if line_by_line:
11-
return LineByLineTextDataset(tokenizer=tokenizer, file_path=filepath, block_size=block_size)
11+
return LineByLineTextDataset(
12+
tokenizer=tokenizer, file_path=filepath, block_size=block_size
13+
)
1214
else:
1315
return TextDataset(
1416
tokenizer=tokenizer,

terminator/functional_groups.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
# which is included in the file license.txt, found at the root
66
# of the RDKit source tree.
77

8+
from collections import namedtuple
9+
810
#
911
#
1012
# Richard hall 2017
@@ -13,7 +15,6 @@
1315
# refine output function
1416
# astex_ifg: identify functional groups a la Ertl, J. Cheminform (2017) 9:36
1517
from rdkit import Chem
16-
from collections import namedtuple
1718

1819

1920
def merge(mol, marked, aset):

terminator/nlp.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66

77
def parse_humicroedit(
8-
dataset, expression_separator: str = '{', expression_end: str = '}'
8+
dataset, expression_separator: str = "{", expression_end: str = "}"
99
) -> List[str]:
1010
"""
1111
Parse the humicroedit dataset in an appropriate format.

terminator/numerical_encodings.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def get_float_encoding(
4040
else:
4141
digit = int(token[1])
4242
order = int(token.split("_")[-2])
43-
val = digit * 10 ** order
43+
val = digit * 10**order
4444

4545
for i in range(0, embedding_size, 2):
4646
vals[i] = val / (i + 1)
@@ -72,7 +72,7 @@ def get_int_encoding(token: str, embedding_size: int) -> torch.Tensor:
7272
else:
7373
digit = int(token[1])
7474
order = int(token.split("_")[-2])
75-
val = digit * 10 ** order
75+
val = digit * 10**order
7676

7777
if order < 0:
7878
raise ValueError(

terminator/tokenization.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -399,9 +399,9 @@ class XLNetRTTokenizer(XLNetTokenizer):
399399
def set_property_tokenizer(
400400
self,
401401
tokenizer: PropertyTokenizer,
402-
expression_separator: str = '{',
403-
expression_end: str = '}',
404-
property_token: str = '[funny]',
402+
expression_separator: str = "{",
403+
expression_end: str = "}",
404+
property_token: str = "[funny]",
405405
):
406406
"""
407407
Set the property tokenizer to be used by the main tokenizer.

terminator/trainer_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ def get_trainer_dict(dictionary: Dict[str, Any]) -> Dict[str, Any]:
4949

5050

5151
def nested_new_like(arrays, num_samples, padding_index=-100):
52-
""" Create the same nested structure as `arrays` with a first dimension always at `num_samples`."""
52+
"""Create the same nested structure as `arrays` with a first dimension always at `num_samples`."""
5353
if isinstance(arrays, (list, tuple)):
5454
return type(arrays)(nested_new_like(x, num_samples) for x in arrays)
5555
return np.full_like(arrays, padding_index, shape=(num_samples, *arrays.shape[1:]))

0 commit comments

Comments
 (0)