
Commit dbe27f6

initial commit (0 parents)

File tree

9 files changed: +1388 -0 lines changed


.gitignore

Lines changed: 5 additions & 0 deletions

env
.ruff_cache
dist
*.egg-info
**/__pycache__

LICENSE

Lines changed: 21 additions & 0 deletions

MIT License

Copyright (c) 2023 Matt Rickard

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

README.md

Lines changed: 114 additions & 0 deletions

# ReLLM

Regular Expressions for Language Model Completions.

Get exact structure out of any language model completion with regular expressions.

Return specific syntactic structure (e.g. JSON or XML), specific semantic structure (e.g. a date or a number), or even complete templates (e.g. a sentence with a blank to fill in).

How does it work? For each token, ReLLM tests every possible completion against a partial regex. It masks the logits of any candidate that cannot match the pattern, so the language model never generates those tokens.
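The partial-match test at the heart of this can be sketched in a few lines. This is a toy illustration, not the library's actual code: the tiny vocabulary, the `allowed_token_ids` helper, and the example pattern are all hypothetical. It relies on the `regex` package's `partial=True` matching to check whether a string could still grow into a full match:

```python
import regex

# Toy stand-in for a real tokenizer vocabulary: token id -> token text.
vocab = {0: "a", 1: "b", 2: '"', 3: "[", 4: "]", 5: ", "}

pattern = regex.compile(r'\["[a-z]", "[a-z]"\]')

def allowed_token_ids(generated_so_far: str) -> set:
    """Return the token ids whose text keeps the output a viable
    prefix of the pattern; regex's partial matching reports whether
    a string could still be extended into a full match."""
    allowed = set()
    for token_id, token_text in vocab.items():
        candidate = generated_so_far + token_text
        if pattern.fullmatch(candidate, partial=True):
            allowed.add(token_id)
    return allowed

print(allowed_token_ids(""))         # only '[' is viable at the start
print(allowed_token_ids('["a", "'))  # only the lowercase letters are viable
```

In the real decoding loop, the logits of every token id outside the allowed set would be masked before sampling.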
10+
### Installation

```
pip install rellm
```

The preliminary results are interesting: even for small models, constraining the token space with ReLLM can improve the quality of the completions, and the output becomes much easier to parse programmatically. Take a look at some of the examples below (you can run them with [example.py](example.py)).

```python
import regex
from transformers import AutoModelForCausalLM, AutoTokenizer

from rellm import complete_re

model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

prompt = "ReLLM, the best way to get structured data out of LLMs, is an acronym for "
pattern = regex.compile(r'Re[a-z]+ L[a-z]+ L[a-z]+ M[a-z]+')
output = complete_re(tokenizer=tokenizer,
                     model=model,
                     prompt=prompt,
                     pattern=pattern,
                     do_sample=True,
                     max_new_tokens=80)
print(output)
```

```
> Realized Logistic Logistics Model
```

## Examples using GPT2 (124 million parameters)

**Prompt**: Return the first three letters of the alphabet in a json array:

**Pattern**: \[\"[a-z]\", \"[a-z]\", \"[a-z]\"\]

**ReLLM**: ["a", "b", "c"]

**Without ReLLM**: { "index": 0, "id":"1", "description":"", "text": "[{ "id": 0, "name":

#

**Prompt**: Fill in the sentence with an interesting story about the dentist:

**Pattern**: Today I\'m going to the [a-z]+ to [a-z]+ because ([a-z]+ )*\.

**ReLLM**: Today I'm going to the dentist to see because it is a very important day for me

**Without ReLLM**: 'My family bought me an appointment with a dentist when I was 15. The dentist gave me one a year and then I was told on

#

**Prompt**: Is this a good demo?

**Pattern**: (Yes|No)

**ReLLM**: No.

**Without ReLLM**: I don't know, but this is amazing! Even more amazing is how the design can take place on a small stage that uses LEDs.
As

#

**Prompt**: Convert the date May 4, 2023 to the format mm/dd/yyyy:

**Pattern**: [0-9]{2}/[0-9]{2}/[0-9]{4}

**ReLLM**: 00/00/0045

**Without ReLLM**: mm:ss

A-Z, Z-A, W-H (0-9:9:19)

Z-R

#

**Prompt**: Jeff Dean is a

**Pattern**: (Programmer|Computer Scientist|AGI)

**ReLLM**: Computer Scientist

**Without ReLLM**: former national basketball champion and a former professional basketball player. He currently serves as general counsel for the NCAA Office of the Vice President for Academic Affairs.

#

**Prompt**: I can eat

**Pattern**: [0-9]{1,10} [a-z]* of [a-z]*

**ReLLM**: 800 calories of coffee

**Without ReLLM**: iced coffee here on the west side and do this, so can you?"

"Why, I don't understand. What did you mean by

#

**Prompt**: ReLLM, the best way to get structured data out of LLMs, is an acronym for

**Pattern**: Re[a-z]+ L[a-z]+ L[a-z]+ M[a-z]+

**ReLLM**: Realized Logistic Logistics Model

**Without ReLLM**: Largest Largest Address Space (MELSP), which has its roots in the Internet network, at least when compared
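The first example above also shows why constrained output is easier to consume programmatically: the ReLLM completion is valid JSON, while the unconstrained one is not. A quick check with the standard library, using the two completions quoted in that example:

```python
import json

# Completions taken verbatim from the first GPT2 example above.
rellm_output = '["a", "b", "c"]'
free_output = '{ "index": 0, "id":"1", "description":"", "text": "[{ "id": 0, "name":'

print(json.loads(rellm_output))  # parses cleanly into a Python list

try:
    json.loads(free_output)
except json.JSONDecodeError:
    print("unconstrained output is not valid JSON")
```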

example.py

Lines changed: 58 additions & 0 deletions

```python
import regex
from transformers import AutoModelForCausalLM, AutoTokenizer

from rellm import complete_re

model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

examples = [
    {
        "prompt": "Return the first three letters of the alphabet in a json array:",
        "pattern": regex.compile(r'\[\"[a-z]\", \"[a-z]\", \"[a-z]\"\]'),
        "max_new_tokens": 10,
    },
    {
        "prompt": "Fill in the sentence with an interesting story about the dentist:",
        "pattern": regex.compile(r'Today I\'m going to the [a-z]+ to [a-z]+ because ([a-z]+ )*\.'),
        "max_new_tokens": 20,
    },
    {
        "prompt": "Is this a good demo?",
        "pattern": regex.compile(r'(Yes|No)\.'),
        "max_new_tokens": 2,
    },
    {
        "prompt": "Convert the date May 4, 2023 to the format mm/dd/yyyy:",
        "pattern": regex.compile(r'[0-9]{2}/[0-9]{2}/[0-9]{4}'),
        "max_new_tokens": 20,
    },
    {
        "prompt": "Jeff Dean is a ",
        "pattern": regex.compile(r'(Programmer|Computer Scientist|AGI)'),
        "max_new_tokens": 10,
    },
    {
        "prompt": "I can eat ",
        "pattern": regex.compile(r'[0-9]{1,10} [a-z]* of [a-z]*'),
        "max_new_tokens": 10,
        "do_sample": True,
    },
    {
        "prompt": "ReLLM, the best way to get structured data out of LLMs, is an acronym for ",
        "pattern": regex.compile(r'Re[a-z]+ L[a-z]+ L[a-z]+ M[a-z]+'),
        "max_new_tokens": 10,
        "do_sample": True,
    },
]

for example in examples:
    print("\n===Prompt===\n", example["prompt"])
    # complete_re takes prompt, pattern, and sampling kwargs straight from the example dict.
    output = complete_re(tokenizer=tokenizer, model=model, **example)
    print("\n===ReLLM===\n", output)
    # Unconstrained generation with the same model, as a baseline for comparison.
    vanilla_output_ids = model.generate(tokenizer.encode(example["prompt"], return_tensors="pt"),
                                        max_new_tokens=30,
                                        pad_token_id=tokenizer.eos_token_id,
                                        do_sample=True)
    print("\n===Without ReLLM===\n", tokenizer.decode(vanilla_output_ids[0])[len(example["prompt"]):])
```
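Once the set of viable tokens is known, constraining generation comes down to zeroing out the rest of the distribution before sampling. A minimal stdlib sketch of that masking step (the function names here are hypothetical, not part of rellm's API):

```python
import math

def mask_logits(logits, allowed_ids):
    """Send the logits of disallowed token ids to -inf so that
    softmax assigns them exactly zero probability."""
    return [x if i in allowed_ids else float("-inf")
            for i, x in enumerate(logits)]

def softmax(xs):
    # Subtracting the max is the standard numerical-stability trick;
    # math.exp(-inf) evaluates to 0.0, so masked entries vanish.
    m = max(xs)
    exps = [math.exp(x - m) for x in xs]
    total = sum(exps)
    return [e / total for e in exps]

logits = [2.0, 1.0, 0.5, 3.0]  # raw scores for a 4-token vocabulary
probs = softmax(mask_logits(logits, allowed_ids={0, 3}))
# Tokens 1 and 2 get probability 0; tokens 0 and 3 renormalize to sum to 1.
```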
