
Commit ccb264c

Merge pull request #20 from vadim0x60/vadim

Multiple quality of life improvements

2 parents: 037c0f2 + 30792da

File tree

5 files changed: +99 −18 lines changed

pyproject.toml
seidr/dev.py
seidr/eval.py
seidr/github.py
seidr/llm.py


pyproject.toml

Lines changed: 7 additions & 5 deletions

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "seidr"
-version = "3.1.1"
+version = "3.4.0"
 description = "Synthesize Execute Instruct Debug Rank"
 authors = ["Vadim Liventsev <v.liventsev@tue.nl>", "Anastasia Grishina <anastasiia@simula.no>"]
 license = "MIT"
@@ -16,17 +16,19 @@ python = "^3.9"
 psb2 = ">=1.1.1"
 openai = "<1.0.0"
 more-itertools = ">=8.0.0,<9.0.0"
-programlib = ">=9.0.2,<10.0.0"
+programlib = ">=12.0.4"
 wandb = "<1.0.0"
 gitpython = ">=3.0.0,<4.0.0"
 tenacity = ">=8.0.0,<9.0.0"
 pandas = ">=1.0.0,<2.0.0"
 fire = "<1.0.0"
 jsonlines = "^4.0.0"
-jupyterlab = "^4.0.7"
 black = "^23.10.1"
-langchain = "^0.0.326"
-pytest-codeblocks = "^0.17.0"
+langchain = "~=0.1"
+langchain-community = "~=0.2"
+langchain-anthropic = "~=0.1"
+pytest-codeblocks = "~=0.17"
+anthropic = "~=0.29"
 
 [build-system]
 requires = ["poetry-core"]
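
Most of the new constraints use the PEP 440 compatible-release operator (~=) instead of Poetry's caret (^). As a quick illustration of what ~= admits, here is a small sketch using the packaging library (not a seidr dependency; the version strings are invented for the example):

from packaging.specifiers import SpecifierSet

# "~=0.29" is shorthand for ">=0.29, ==0.*": it pins the prefix before
# the last given component and lets the remaining components float.
spec = SpecifierSet("~=0.29")
print("0.29.2" in spec)  # True: patch update within 0.x
print("0.30.0" in spec)  # True: only the leading "0" is pinned
print("1.0.0" in spec)   # False: a major bump is excluded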

seidr/dev.py

Lines changed: 9 additions & 7 deletions

@@ -3,8 +3,9 @@
 from programlib import Program, Language
 from typing import Callable, Optional, Iterable, Tuple, List, Generator
 import random
+import time
 
-from seidr.llm import explore_llm
+from seidr.llm import explore_llm, default_batch_size
 from seidr.eval import Evaluation
 
 
@@ -124,7 +125,8 @@ def __init__(self,
                  log_llm_call: Callable = lambda **kwargs: print(kwargs),
                  max_programs: Optional[int] = None,
                  batch_size: Optional[int] = None,
-                 ollama_url: Optional[str] = None) -> None:
+                 ollama_url: Optional[str] = None,
+                 delay: int = 0) -> None:
         self.task_name = task_name
         self.task_description = task_description
         self.critics = critics
@@ -141,13 +143,11 @@ def __init__(self,
         self.log_llm_call = log_llm_call
         self.max_programs = max_programs
         self.ollama_url = ollama_url
+        self.delay = delay
 
         if not batch_size:
-            if 'gpt' in model_name:
-                self.batch_size = 10
-            else:
-                # Because Ollama doesn't support batch inference
-                self.batch_size = 1
+            batch_size = default_batch_size(model_name)
+        self.batch_size = batch_size
 
     def draft(self, start_code: str = '') -> Iterable[str]:
         """Create a draft solution with the "generate" prompt template
@@ -286,4 +286,6 @@ def have_kids(
             if self.max_programs is not None and (idx == self.max_programs - 1):
                 break
 
+            time.sleep(self.delay)
+
         return best_code
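
Two behavioral changes here: a configurable delay (seconds slept after each candidate program, useful for staying under API rate limits) and a batch-size default that now comes from default_batch_size in seidr/llm.py (shown further down). A minimal sketch of the resulting defaults, assuming the package is installed:

from seidr.llm import default_batch_size

# OpenAI-style and Anthropic endpoints accept batched generations;
# Ollama does not, so it falls back to one program per request.
print(default_batch_size("gpt-4"))          # 10
print(default_batch_size("claude-3-opus"))  # 10
print(default_batch_size("codellama"))      # 1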

seidr/eval.py

Lines changed: 54 additions & 1 deletion

@@ -10,7 +10,7 @@ class Evaluation(ABC):
     Produces a binary pass/fail result, a float score, and a text report
     """
 
-    def __init__(self, SUT: Program, passing_score: float = 1.):
+    def __init__(self, SUT, passing_score: float = 1.):
         """
         SUT: System Under Test
         passing_score: float score required to pass the evaluation
@@ -97,3 +97,56 @@ def pen_report(self) -> str:
         else:
             self.output = "\n".join(self.output) if type(self.output) == list else self.output
         return self.output
+
+class Gymnasium(Evaluation):
+    def __init__(self, env, code, language, passing_score, error_reward=-1000):
+        self.action_mode = type(env.action_space).__name__.lower()
+        program = Program(code, language=language)
+        super().__init__(program, passing_score)
+
+        self.env = env
+        self.tot_reward = 0
+        self.tot_txt = ''
+        self.done = False
+        self.error_reward = error_reward
+
+    def play(self):
+        if self.done:
+            return
+
+        self.tot_reward = 0
+        self.tot_txt = ''
+        agent = self.SUT.spawn(action_mode=self.action_mode)
+
+        try:
+            observation, info = self.env.reset()
+            self.tot_txt += info.get('memos', '')
+            terminated = False
+            truncated = False
+
+            while not (terminated or truncated):
+                if 'ascii' in self.env.metadata.get('render.modes', []):
+                    ascii_render = self.env.render(mode='ascii')
+                    self.tot_txt += ascii_render
+
+                action, _ = agent.predict(observation, deterministic=True)
+
+                observation, reward, terminated, truncated, info = self.env.step(action)
+                self.tot_reward += reward
+                self.tot_txt += info.get('memos', '')
+        except RuntimeError as e:
+            self.tot_reward = self.error_reward
+            self.tot_txt += f'FATAL {e}'
+        finally:
+            agent.close()
+
+        self.done = True
+
+    def score(self):
+        self.play()
+        return self.tot_reward
+
+    def pen_report(self):
+        self.play()
+        self.tot_txt += f'\nFinal reward: {self.tot_reward}'
+        return self.tot_txt
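
The new Gymnasium evaluation plays one episode of a candidate agent program in a Gymnasium environment: it accumulates reward, collects any 'memos' text the environment puts in info, and substitutes error_reward if the program dies with a RuntimeError. A hypothetical usage sketch (the environment id, candidate code, and passing score are invented for illustration; spawn/predict are the programlib agent interface this class assumes):

import gymnasium as gym
from seidr.eval import Gymnasium

env = gym.make("CartPole-v1")
candidate_code = "..."  # source code of a candidate agent, e.g. a SEIDR draft

# error_reward is returned whenever the program under test crashes
evaluation = Gymnasium(env, code=candidate_code, language="Python",
                       passing_score=475, error_reward=-1000)

print(evaluation.score())       # total episode reward
print(evaluation.pen_report())  # transcript plus "Final reward: ..."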

seidr/github.py

Lines changed: 1 addition & 1 deletion

@@ -52,7 +52,7 @@ def ensure_repo(remote: str, path: pathlib.Path | str, branch: str = None) -> Repo:
         if branch:
             repo.git.checkout(branch)
     except GitError as e:
-        logging.info(f'Git error in ensure repo {e}. \n{traceback.print_stack()}')
+        logging.info(f'Git error in ensure repo {e}.')
         shutil.rmtree(path, ignore_errors=True)
         repo = Repo.clone_from(remote, path)
 
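
This one-liner fixes a logging quirk rather than merely shortening the message: traceback.print_stack() writes directly to stderr and returns None, so interpolating it into an f-string always appended the literal text "None" to the log record. A quick demonstration:

import traceback

# print_stack() prints to stderr as a side effect and returns None,
# so the f-string below always ends in the string "None".
message = f"Git error in ensure repo. \n{traceback.print_stack()}"
print(message)  # -> "Git error in ensure repo. \nNone"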

seidr/llm.py

Lines changed: 28 additions & 4 deletions

@@ -3,6 +3,7 @@
 
 from langchain.chains import LLMChain
 from langchain.chat_models import ChatOpenAI, ChatOllama
+from langchain_anthropic import ChatAnthropic
 from collections.abc import Iterable
 from typing import Callable, Optional
 import re
@@ -22,9 +23,9 @@ def extract_codes(
         language: Language | str
 ) -> str:
     """Extract code out of a message and (if Python) format it with black"""
+
     try:
         code_blocks = list(extract_from_buffer(StringIO(message_content)))
-        code_blocks = [code for code in code_blocks if not bool(code)]
     except RuntimeError as e:
         code_blocks = []
 
@@ -46,6 +47,20 @@ def run_black(code: str) -> str:
         logging.info(e)
         return code
 
+def which_api(model_name):
+    model_name = model_name.lower()
+    if "gpt" in model_name or "deepseek" in model_name:
+        return ChatOpenAI
+    elif "claude" in model_name:
+        return ChatAnthropic
+    else:
+        return ChatOllama
+
+def default_batch_size(model_name):
+    if which_api(model_name) == ChatOllama:
+        return 1
+    else:
+        return 10
 
 def create_chain(
     temperature: float = 0.,
@@ -55,14 +70,23 @@
 ) -> LLMChain:
     """Set up a LangChain LLMChain"""
     chat_prompt_template = create_chat_prompt_template(mode)
-    if "gpt" in model_name.lower():
+    api = which_api(model_name)
+
+    if api == ChatOpenAI:
        chat_model = ChatOpenAI(
            model=model_name,
            temperature=temperature,
+           openai_api_base=os.getenv("OPENAI_API_BASE"),
            openai_api_key=os.getenv("OPENAI_API_KEY"),
            openai_organization=os.getenv("OPENAI_ORG")
        )
-    elif "llama" in model_name.lower():
+    elif api == ChatAnthropic:
+        chat_model = ChatAnthropic(
+            model_name=model_name,
+            temperature=temperature,
+            anthropic_api_key=os.getenv('ANTHROPIC_API_KEY')
+        )
+    elif api == ChatOllama:
        chat_model = ChatOllama(
            base_url=base_url,
            model=model_name,
@@ -90,7 +114,7 @@ def query_llm(
     # Assistants are trained to respond with one message.
     # it is theoretically possible to get more than one message, but it is very unlikely.
     assert all(len(r) == 1 for r in result.generations), "The models are expected to respond with one message"
-    result = [r[0].message.content for r in result.generations if r[0].message.content]
+    result = [r[0].message.content for r in result.generations]
 
     if mode == "repair":
         logging.info(f"Generating repair candidates for bug summary: \n{kwargs['bug_summary']}\n")
