trustyai-explainability
diff --git a/‎.gitignore
Lines changed: 0 additions & 1 deletion b/‎.gitignore
Lines changed: 0 additions & 1 deletion
diff --git a/‎examples/basic_test.ipynb
Lines changed: 41 additions & 39 deletions b/‎examples/basic_test.ipynb
Lines changed: 41 additions & 39 deletions
diff --git a/‎examples/templating.ipynb
Lines changed: 26 additions & 26 deletions b/‎examples/templating.ipynb
Lines changed: 26 additions & 26 deletions
diff --git a/‎src/vllm_judge/__init__.py
Lines changed: 1 addition & 1 deletion b/‎src/vllm_judge/__init__.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/vllm_judge/api/__init__.py
Lines changed: 0 additions & 3 deletions b/‎src/vllm_judge/api/__init__.py
Lines changed: 0 additions & 3 deletions
diff --git a/‎src/vllm_judge/api/client.py
Lines changed: 0 additions & 3 deletions b/‎src/vllm_judge/api/client.py
Lines changed: 0 additions & 3 deletions
@@ -166,7 +166,6 @@ cython_debug/
 .idea/.gitignore
 .idea/misc.xml
 .idea/modules.xml
-.idea/trustyai-service-v2.iml
 .idea/vcs.xml
 .idea/inspectionProfiles/profiles_settings.xml
 
 
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -31,7 +31,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -40,7 +40,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -50,20 +50,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "{'decision': False,\n",
-       " 'reasoning': 'The response lacks a professional tone and is informal. It uses casual language and lacks context or formal structure.',\n",
+       "{'decision': 'PASS',\n",
+       " 'reasoning': 'The content maintains a professional tone by clearly stating the intention to bump the version and asking for confirmation.',\n",
        " 'score': None,\n",
        " 'metadata': {'model': 'qwen2',\n",
-       "  'raw_response': '{\\n    \"decision\": false,\\n    \"reasoning\": \"The response lacks a professional tone and is informal. It uses casual language and lacks context or formal structure.\",\\n    \"score\": null\\n}'}}"
+       "  'raw_response': '{\\n    \"decision\": \"PASS\",\\n    \"reasoning\": \"The content maintains a professional tone by clearly stating the intention to bump the version and asking for confirmation.\",\\n    \"score\": null\\n}'}}"
       ]
      },
-     "execution_count": 9,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -74,47 +74,47 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "{'decision': 5,\n",
-       " 'reasoning': 'The response lacks a formal and professional tone. It uses informal language and an interrogative form which is not typical in professional communication.',\n",
-       " 'score': 5.0,\n",
+       "{'decision': 'False',\n",
+       " 'reasoning': 'The content lacks a professional tone as it is phrased as a question without providing context or justification for the version bump.',\n",
+       " 'score': 0.2,\n",
        " 'metadata': {'model': 'qwen2',\n",
-       "  'raw_response': '{\\n    \"decision\": 5,\\n    \"reasoning\": \"The response lacks a formal and professional tone. It uses informal language and an interrogative form which is not typical in professional communication.\",\\n    \"score\": 5\\n}'}}"
+       "  'raw_response': '{\\n    \"decision\": \"False\",\\n    \"reasoning\": \"The content lacks a professional tone as it is phrased as a question without providing context or justification for the version bump.\",\\n    \"score\": 0.2\\n}'}}"
       ]
      },
-     "execution_count": 10,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "res = await judge.evaluate(content=\"I want to bump the version to 1.0.1, is it a good idea?\",\n",
     "                           criteria=\"Check the professional tone.\",\n",
-    "                           rubric=\"Assign a score between 0 and 10 based on the professional tone. 0 is the worst and 10 is the best.\")\n",
+    "                           rubric=\"Assign a score between 0 and 1 based on the professional tone. 0 is the worst and 1 is the best.\")\n",
     "res.model_dump()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "{'decision': 5,\n",
-       " 'reasoning': 'The response is somewhat direct and to the point, but lacks formality and context typically expected in a professional setting.',\n",
-       " 'score': 5.0,\n",
+       "{'decision': 'True',\n",
+       " 'reasoning': 'The response is clear and to the point, maintaining a professional tone by asking for a decision on the version bump.',\n",
+       " 'score': 1.0,\n",
        " 'metadata': {'model': 'qwen2',\n",
-       "  'raw_response': '{\\n    \"decision\": 5,\\n    \"reasoning\": \"The response is somewhat direct and to the point, but lacks formality and context typically expected in a professional setting.\",\\n    \"score\": 5\\n}'}}"
+       "  'raw_response': '{\\n    \"decision\": \"True\",\\n    \"reasoning\": \"The response is clear and to the point, maintaining a professional tone by asking for a decision on the version bump.\",\\n    \"score\": 1\\n}'}}"
       ]
      },
-     "execution_count": 11,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -124,15 +124,17 @@
     "                           criteria=\"Check the professional tone.\",\n",
     "                           rubric={\n",
     "                               0: \"The response is not professional.\",\n",
-    "                               5: \"The response is somewhat professional.\",\n",
-    "                               10: \"The response is very professional.\"\n",
-    "                           })\n",
+    "                               0.5: \"The response is somewhat professional.\",\n",
+    "                               1: \"The response is very professional.\"\n",
+    "                           },\n",
+    "                           scale=(0, 1)\n",
+    "                           )\n",
     "res.model_dump()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -146,20 +148,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
        "{'decision': 'moderate',\n",
-       " 'reasoning': 'The email lacks formal language and a clear request, which is more appropriate for a professional setting. It is direct but informal.',\n",
+       " 'reasoning': 'The content is a question about version bumping and lacks formal structure or context, which makes it slightly informal. However, it does not contain any unprofessional language or tone.',\n",
        " 'score': 5.0,\n",
        " 'metadata': {'model': 'qwen2',\n",
-       "  'raw_response': '{\\n    \"decision\": \"moderate\",\\n    \"reasoning\": \"The email lacks formal language and a clear request, which is more appropriate for a professional setting. It is direct but informal.\",\\n    \"score\": 5\\n}'}}"
+       "  'raw_response': '{\\n    \"decision\": \"moderate\",\\n    \"reasoning\": \"The content is a question about version bumping and lacks formal structure or context, which makes it slightly informal. However, it does not contain any unprofessional language or tone.\",\\n    \"score\": 5\\n}'}}"
       ]
      },
-     "execution_count": 13,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -172,20 +174,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
        "{'decision': 'non-professional',\n",
-       " 'reasoning': 'The response uses informal and expletive language, which is not appropriate for a professional context.',\n",
-       " 'score': 1.0,\n",
+       " 'reasoning': \"The phrase 'Holy shit, this is a great!' is informal and contains an exclamation, which does not meet the criteria for a professional tone.\",\n",
+       " 'score': 2.0,\n",
        " 'metadata': {'model': 'qwen2',\n",
-       "  'raw_response': '{\\n    \"decision\": \"non-professional\",\\n    \"reasoning\": \"The response uses informal and expletive language, which is not appropriate for a professional context.\",\\n    \"score\": 1\\n}'}}"
+       "  'raw_response': '{\\n    \"decision\": \"non-professional\",\\n    \"reasoning\": \"The phrase \\'Holy shit, this is a great!\\' is informal and contains an exclamation, which does not meet the criteria for a professional tone.\",\\n    \"score\": 2\\n}'}}"
       ]
      },
-     "execution_count": 14,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -198,22 +200,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "{'decision': True,\n",
-       " 'reasoning': 'The response correctly identifies Paris as the capital of France, addressing both accuracy and completeness.',\n",
+       "{'decision': 'PASS',\n",
+       " 'reasoning': 'The statement is accurate and complete as it correctly identifies Paris as the capital of France.',\n",
        " 'score': None,\n",
        " 'metadata': {'model': 'qwen2',\n",
-       "  'raw_response': '{\\n    \"decision\": true,\\n    \"reasoning\": \"The response correctly identifies Paris as the capital of France, addressing both accuracy and completeness.\",\\n    \"score\": null\\n}',\n",
+       "  'raw_response': '{\\n    \"decision\": \"PASS\",\\n    \"reasoning\": \"The statement is accurate and complete as it correctly identifies Paris as the capital of France.\",\\n    \"score\": null\\n}',\n",
        "  'template_vars': {'input': 'What is the capital of France?'},\n",
        "  'template_engine': 'format'}}"
       ]
      },
-     "execution_count": 15,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
 
@@ -20,22 +20,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "{'decision': 5,\n",
-       " 'reasoning': 'The function is concise but uses a naive recursive approach which is inefficient and can lead to a stack overflow for large values of n. It lacks error handling and docstrings for documentation.',\n",
-       " 'score': 5.0,\n",
+       "{'decision': 'FAIL',\n",
+       " 'reasoning': \"The function contains a typo ('fib' instead of 'fibonacci' in the recursive call) and lacks a proper docstring or comments, which are essential for production code. The code is also not optimized and may lead to a stack overflow for large values of n due to repeated calculations.\",\n",
+       " 'score': 3.0,\n",
        " 'metadata': {'model': 'qwen2',\n",
-       "  'raw_response': '{\\n    \"decision\": 5,\\n    \"reasoning\": \"The function is concise but uses a naive recursive approach which is inefficient and can lead to a stack overflow for large values of n. It lacks error handling and docstrings for documentation.\",\\n    \"score\": 5\\n}',\n",
+       "  'raw_response': '{\\n    \"decision\": \"FAIL\",\\n    \"reasoning\": \"The function contains a typo (\\'fib\\' instead of \\'fibonacci\\' in the recursive call) and lacks a proper docstring or comments, which are essential for production code. The code is also not optimized and may lead to a stack overflow for large values of n due to repeated calculations.\",\\n    \"score\": 3\\n}',\n",
        "  'template_vars': {'language': 'Python', 'use_case': 'production deployment'},\n",
        "  'template_engine': 'format'}}"
       ]
      },
-     "execution_count": 4,
+     "execution_count": 3,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -56,7 +56,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -76,7 +76,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -100,22 +100,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "{'decision': 1,\n",
-       " 'reasoning': \"The function does not perform any data science tasks and incorrectly uses the name 'fib' instead of 'fibonacci' in the recursive call, which will cause an error.\",\n",
-       " 'score': None,\n",
+       "{'decision': 'False',\n",
+       " 'reasoning': 'The code is a recursive implementation of the Fibonacci sequence, but it lacks an appropriate base case check and is inefficient for larger values of n due to repeated calculations. This makes it unsuitable for data science applications where performance and efficiency are crucial.',\n",
+       " 'score': 1.0,\n",
        " 'metadata': {'model': 'qwen2',\n",
-       "  'raw_response': '{\\n    \"decision\": 1,\\n    \"reasoning\": \"The function does not perform any data science tasks and incorrectly uses the name \\'fib\\' instead of \\'fibonacci\\' in the recursive call, which will cause an error.\",\\n    \"score\": null\\n}',\n",
+       "  'raw_response': '{\\n    \"decision\": \"False\",\\n    \"reasoning\": \"The code is a recursive implementation of the Fibonacci sequence, but it lacks an appropriate base case check and is inefficient for larger values of n due to repeated calculations. This makes it unsuitable for data science applications where performance and efficiency are crucial.\",\\n    \"score\": 1\\n}',\n",
        "  'template_vars': {'language': 'Python', 'purpose': 'data science'},\n",
        "  'template_engine': 'format'}}"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -126,22 +126,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "{'decision': 5,\n",
-       " 'reasoning': 'The function is a correct implementation of the Fibonacci sequence, but it lacks an optimization for performance and does not consider edge cases like negative numbers or non-integer inputs.',\n",
-       " 'score': 5.0,\n",
+       "{'decision': 'false',\n",
+       " 'reasoning': 'The function is a correct implementation of the Fibonacci sequence, but it is not efficient for large values of n due to its exponential time complexity. This makes it unsuitable for web frontend applications where performance is critical.',\n",
+       " 'score': 1.0,\n",
        " 'metadata': {'model': 'qwen2',\n",
-       "  'raw_response': '{\\n    \"decision\": 5,\\n    \"reasoning\": \"The function is a correct implementation of the Fibonacci sequence, but it lacks an optimization for performance and does not consider edge cases like negative numbers or non-integer inputs.\",\\n    \"score\": 5\\n}',\n",
+       "  'raw_response': '{\\n    \"decision\": \"false\",\\n    \"reasoning\": \"The function is a correct implementation of the Fibonacci sequence, but it is not efficient for large values of n due to its exponential time complexity. This makes it unsuitable for web frontend applications where performance is critical.\",\\n    \"score\": 1\\n}',\n",
        "  'template_vars': {'language': 'JavaScript', 'purpose': 'web frontend'},\n",
        "  'template_engine': 'format'}}"
       ]
      },
-     "execution_count": 8,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -152,7 +152,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -191,25 +191,25 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
        "{'decision': 'DECENT',\n",
-       " 'reasoning': 'The code is simple and returns a JSON response, but it lacks RESTful design principles, error handling, and proper documentation. There is no indication of authentication or authorization, which are critical for API security.',\n",
-       " 'score': 4.0,\n",
+       " 'reasoning': 'The API endpoint follows a basic RESTful design by using the GET method for retrieving users. However, it lacks error handling and documentation, which are crucial for a robust API. Additionally, there is no mention of authentication and authorization, which are essential for securing the API.',\n",
+       " 'score': 5.0,\n",
        " 'metadata': {'model': 'qwen2',\n",
-       "  'raw_response': '{\\n    \"decision\": \"DECENT\",\\n    \"reasoning\": \"The code is simple and returns a JSON response, but it lacks RESTful design principles, error handling, and proper documentation. There is no indication of authentication or authorization, which are critical for API security.\",\\n    \"score\": 4\\n}',\n",
+       "  'raw_response': '{\\n    \"decision\": \"DECENT\",\\n    \"reasoning\": \"The API endpoint follows a basic RESTful design by using the GET method for retrieving users. However, it lacks error handling and documentation, which are crucial for a robust API. Additionally, there is no mention of authentication and authorization, which are essential for securing the API.\",\\n    \"score\": 5\\n}',\n",
        "  'template_vars': {'aspects': ['RESTful design',\n",
        "    'Error handling',\n",
        "    'Documentation'],\n",
        "   'security_critical': True},\n",
        "  'template_engine': 'jinja2'}}"
       ]
      },
-     "execution_count": 10,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
 
@@ -17,7 +17,7 @@
     ModelSpecificMetric
 )
 from vllm_judge.templating import TemplateProcessor
-from vllm_judge.metrics import (
+from vllm_judge.builtin_metrics import (
     # General metrics
     HELPFULNESS,
     ACCURACY,
 
@@ -1,6 +1,3 @@
-"""
-API module for vLLM Judge.
-"""
 from vllm_judge.api.server import app, create_app, start_server
 from vllm_judge.api.client import JudgeClient
 from vllm_judge.api.models import (
 
@@ -1,6 +1,3 @@
-"""
-HTTP client for vLLM Judge API.
-"""
 import asyncio
 from typing import Union, Dict, List, Optional, Tuple, Any, AsyncIterator
 import httpx
Original file line number	Diff line number	Diff line change
`@@ -17,7 +17,7 @@`
`17`	`17`	`ModelSpecificMetric`
`18`	`18`	`)`
`19`	`19`	`from vllm_judge.templating import TemplateProcessor`
`20`		`-from vllm_judge.metrics import (`
	`20`	`+from vllm_judge.builtin_metrics import (`
`21`	`21`	`# General metrics`
`22`	`22`	`HELPFULNESS,`
`23`	`23`	`ACCURACY,`