Skip to content

Commit a78a1a5

Browse files
Merge pull request #2 from DashyDashOrg/v0.0.5
v0.0.5
2 parents f8911d1 + da7b253 commit a78a1a5

File tree

7 files changed

+368
-364
lines changed

7 files changed

+368
-364
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ data = [('John Doe', 25, 50),
3838
('Olivia Jackson', 29, 55)]
3939
df = pd.DataFrame(data, columns=['name', 'age', 'donation'])
4040

41-
conv_df = PandasLLM(data=df, llm_api_key = os.environ.get("OPENAI_API_KEY"))
41+
conv_df = PandasLLM(data=df, llm_api_key = os.environ.get("OPENAI_API_KEY"), verbose=True)
4242
result = conv_df.prompt("What is the average donation of people older than 30 who donated more than $50?")
4343

4444
print(f"Result ({type(result)}):\n {result}")

pandas_llm/__init__.py

Lines changed: 359 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,359 @@
1-
from pandas_llm import PandasLLM
1+
import pandas as pd
2+
import datetime
3+
import numpy as np
4+
import openai
5+
import os
6+
import re
7+
import json
8+
9+
# sandbox.py
10+
from RestrictedPython import compile_restricted
11+
from RestrictedPython.Guards import safe_builtins,guarded_iter_unpack_sequence
12+
from RestrictedPython.Eval import default_guarded_getattr, default_guarded_getitem, default_guarded_getiter
13+
import pandas as pd
14+
15+
class Sandbox:
    """Execute untrusted Python code under RestrictedPython.

    Only explicitly whitelisted modules (see :meth:`allow_import`) and a
    curated set of safe builtins are visible to the sandboxed code.
    """

    def __init__(self):
        # Maps module_name -> imported module object. Only these modules
        # are injected into the sandboxed globals.
        self._allowed_imports = {}

    def allow_import(self, module_name):
        """Whitelist *module_name* for use inside the sandbox.

        Modules that fail to import are silently skipped (best effort);
        the sandbox simply won't expose them.
        """
        try:
            module = __import__(module_name)
            self._allowed_imports[module_name] = module
        except ImportError:
            pass

    def execute(self, code, local_vars=None):
        """Compile *code* with RestrictedPython and execute it.

        Args:
            code: Python source to run inside the sandbox.
            local_vars: optional dict of pre-bound locals (e.g. {"df": df}).
                Names assigned by the executed code are written back into it.

        Returns:
            The ``local_vars`` dict after execution.
        """
        # BUG FIX: the original signature used a mutable default argument
        # (local_vars={}), which is shared across calls and would leak
        # results from one execution into the next. Create a fresh dict.
        if local_vars is None:
            local_vars = {}

        # BUG FIX: the original updated RestrictedPython's module-level
        # safe_builtins dict in place, permanently widening the builtins
        # for every other user of RestrictedPython in the process.
        # Work on a per-call copy instead.
        allowed_builtins = dict(safe_builtins)

        # Add __builtins__, __import__, and allowed imports to the globals
        restricted_globals = {"__builtins__": allowed_builtins}
        restricted_globals.update(self._allowed_imports)

        builtin_mappings = {
            "__import__": __import__,
            "_getattr_": default_guarded_getattr,
            "_getitem_": default_guarded_getitem,
            "_getiter_": default_guarded_getiter,
            "_iter_unpack_sequence_": guarded_iter_unpack_sequence,
            "list": list,
            "set": set,
            "pd": pd,
        }

        # Expose a curated subset of pandas Series methods as bare names so
        # generated code can call them without attribute access guards.
        series_methods = [
            "sum", "mean", "any", "argmax", "argmin", "count", "cumsum", "cumprod", "diff",
            "dropna", "fillna", "head", "idxmax", "idxmin", "last", "max", "min", "notna",
            "prod", "quantile", "rename", "round", "tail", "to_frame", "to_list", "to_numpy",
            "to_string", "unique", "sort_index", "sort_values", "aggregate",
        ]
        builtin_mappings.update({method: getattr(pd.Series, method) for method in series_methods})

        allowed_builtins.update(builtin_mappings)

        byte_code = compile_restricted(source=code, filename='<inline>', mode='exec')

        # Execute the restricted code
        exec(byte_code, restricted_globals, local_vars)

        return local_vars
61+
62+
63+
class PandasLLM(pd.DataFrame):
    """
    PandasLLM is a subclass of the Pandas DataFrame class. It is designed to provide a
    wrapper around the OpenAI API: natural-language questions are turned into
    Python/pandas code by the LLM and executed against the DataFrame.
    """

    # Regexes tried in order to extract a code block from the LLM reply.
    code_blocks = [r'```python(.*?)```', r'```(.*?)```']

    llm_default_model = "gpt-3.5-turbo"
    llm_default_temperature = 0.2
    llm_engine = "openai"
    llm_default_params = {"model": llm_default_model,
                          "temperature": llm_default_temperature}
    llm_api_key = None

    prompt_override = False
    custom_prompt = ""
    data_privacy = True
    path = None
    verbose = False
    code_block = ""      # last raw LLM completion, kept for debugging
    force_sandbox = False

    def __init__(self,
                 data,
                 llm_engine: str = "openai", llm_params=llm_default_params,
                 prompt_override: bool = False,
                 custom_prompt: str = "",
                 path: str = None,
                 verbose: bool = False,
                 data_privacy: bool = True,
                 llm_api_key: str = None,
                 force_sandbox: bool = False,
                 *args, **kwargs):
        """
        Construct a PandasLLM wrapper around *data*.

        The constructor also calls the parent (DataFrame) constructor.

        Args:
            data (pandas dataframe, mandatory): dataset to query. It can be a Pandas
                DataFrame, a list of lists, a list of tuples, a list of dictionaries,
                a dictionary, a string, or a list.
            llm_engine (str, optional): LLM engine, currently only OpenAI is supported. Defaults to "openai".
            llm_params (dict, optional): LLM engine parameters. Defaults to model=gpt-3.5-turbo and temperature=0.2.
            prompt_override (bool, optional): if True, the custom prompt is mandatory and it will become the main prompt. Defaults to False.
            custom_prompt (str, optional): if prompt_override is False, the custom prompt will be added to the default pandas_llm prompt. Defaults to "".
            path (str, optional): the path where the files containing debug data will be saved. Defaults to None.
            verbose (bool, optional): if True debugging info will be printed. Defaults to False.
            data_privacy (bool, optional): if True, the function will not send the data content to OpenAI. Defaults to True.
            llm_api_key (str, optional): the OpenAI API key. Defaults to None
                (falls back to the OPENAI_API_KEY environment variable).
            force_sandbox (bool, optional): if False and the sandbox fails, it will retry using eval (less safe). Defaults to False.
        """

        super().__init__(data, *args, **kwargs)

        # Set up OpenAI API key from the argument or the environment.
        self.llm_api_key = llm_api_key or os.environ.get("OPENAI_API_KEY")

        self.llm_engine = llm_engine
        # NOTE: the original assigned llm_params twice; assign once.
        self.llm_params = llm_params or {}
        self.model = self.llm_params.get("model", self.llm_default_model)
        self.temperature = self.llm_params.get("temperature", self.llm_default_temperature)

        self.prompt_override = prompt_override
        self.custom_prompt = custom_prompt

        self.data_privacy = data_privacy
        self.path = path
        self.verbose = verbose
        self.force_sandbox = force_sandbox

    def _buildPromptForRole(self):
        """Return the system-role prompt describing the dataset schema."""
        prompt_role = f"""
I want you to act as a data scientist and Python coder. I want you code for me.
I have a dataset of {len(self)} rows and {len(self.columns)} columns.
Columns and their type are the following:
"""

        for col in self.columns:
            col_type = self.dtypes[col]
            prompt_role += f"{col} ({col_type})\n"

        return prompt_role

    def _buildPromptForProblemSolving(self, request):
        """Return the user-role prompt asking the LLM to write code for *request*.

        If prompt_override is set, the custom prompt replaces everything.
        """
        if self.prompt_override:
            return self.custom_prompt

        columns = ""
        for col in self.columns:
            col_type = self.dtypes[col]
            columns += f"{col} ({col_type})\n"

        prompt_problem = f"""
Given a DataFrame named 'df' of {len(self)} rows and {len(self.columns)} columns,
Its columns are the following:

{columns}

I want you to solve the following problem:
write a Python code snippet that addresses the following request:
{request}

While crafting the code, please follow these guidelines:
1. When comparing or searching for strings, use lower case letters, ignore case sensitivity, and apply a "contains" search.
2. Ensure that the answer is a single line of code without explanations, comments, or additional details.
3. If a single line solution is not possible, multiline solutions or functions are acceptable, but the code must end with an assignment to the variable 'result'.
4. Assign the resulting code to the variable 'result'.
5. Avoid importing any additional libraries than pandas and numpy.

"""
        # Append the user's extra instructions, when provided.
        if self.custom_prompt:
            prompt_problem += f"""
Also:
{self.custom_prompt}
"""

        return prompt_problem

    def _extractPythonCode(self, text: str, regexp: str) -> str:
        """Extract the first code block matching *regexp* from *text*.

        Returns the code without the fence markers, or "" when no match.
        """
        # Search for the pattern in the input text (DOTALL so the block
        # may span multiple lines).
        match = re.search(regexp, text, re.DOTALL)

        # If a match is found, return the extracted code (without the markers).
        if match:
            return match.group(1).strip()

        # If no match is found, return an empty string.
        return ""

    def _print(self, *args, **kwargs):
        """print() that is a no-op unless verbose mode is on."""
        if self.verbose:
            print(*args, **kwargs)

    def _save(self, name, value):
        """Best-effort write of *value* to <path>/<name> for debugging.

        Silently does nothing when no debug path is configured; errors are
        only printed in verbose mode.
        """
        if self.path is None or self.path == "":
            return
        try:
            with open(f"{self.path}/{name}", 'w') as file:
                file.write(value)
        except Exception as e:
            self._print(f"error {e}")
        return

    def _execInSandbox(self, df, generated_code: str):
        """Run *generated_code* in a RestrictedPython sandbox with df bound.

        Returns whatever the generated code assigned to the variable 'result'
        (None when the code never assigned it).
        """
        # Create a Sandbox instance and allow pandas/numpy to be imported.
        sandbox = Sandbox()
        sandbox.allow_import("pandas")
        sandbox.allow_import("numpy")

        # Define the initial code to set up the DataFrame environment.
        initial_code = f"""
import pandas as pd
import datetime
from pandas import Timestamp
import numpy as np

"""

        # Combine the initial code and the generated code.
        full_code = initial_code + "\n" + generated_code

        self._save("temp/prompt_code.py", full_code)
        # Execute the combined code in the Sandbox.
        sandbox_result = sandbox.execute(full_code, {"df": df})

        # Get the result from the local_vars dictionary.
        result = sandbox_result.get("result")
        return result

    def prompt(self, request: str):
        """
        Ask the LLM to solve *request* against this DataFrame.

        Args:
            request (str): prompt containing the request. It must be expressed
                as a question or a problem to solve.

        Returns:
            Any: contains the result or solution of the problem. Typically the
            result data type is a DataFrame, a Series or a float.
        """
        # Set up OpenAI API key.
        openai.api_key = self.llm_api_key

        messages = [
            {"role": "system",
             "content": self._buildPromptForRole()},
            {"role": "user",
             "content": self._buildPromptForProblemSolving(request)
             }
        ]

        # Up to 3 attempts against transient API errors.
        response = None
        for _ in range(3):
            try:
                response = openai.ChatCompletion.create(
                    model=self.model,
                    temperature=self.temperature,
                    messages=messages
                )
                break
            except Exception as e:
                self._print(f"error {e}")
                continue

        if response is None:
            return "Please try later"

        self._save("temp/prompt_cmd.json", json.dumps(messages, indent=4))

        generated_code = response.choices[0].message.content
        if not generated_code:
            self.code_block = ""
            return None

        self.code_block = generated_code

        # Candidate snippets: each extracted fenced block, then the raw
        # completion as a last resort.
        results = []
        for regexp in self.code_blocks:
            cleaned_code = self._extractPythonCode(generated_code, regexp)
            if not cleaned_code:
                continue
            results.append(cleaned_code)
        results.append(generated_code)

        if len(results) == 0:
            return None

        result = None
        for cleaned_code in results:
            try:
                # BUG FIX: the original called self.execInSandbox (missing
                # underscore), which always raised AttributeError, so the
                # sandbox was never used and execution silently fell through
                # to the eval() fallback below.
                result = self._execInSandbox(self, cleaned_code)
            except Exception as e:
                self._print(f"error {e}")
                if not self.force_sandbox:
                    try:
                        # SECURITY: eval() of LLM-generated code is unsafe;
                        # this path is only taken when force_sandbox is False.
                        expression = re.sub(r"^\s*result\s*=", "", cleaned_code).strip()
                        result = eval(expression, {'df': self, 'pd': pd, 'np': np, 'datetime': datetime, 'result': result})
                    except Exception as e:
                        self._print(f"error {e}")

            if result is not None and str(result) != "":
                break

        if self.data_privacy:
            # Non formatted result.
            return result

        # Currently the privacy option is not needed.
        # In the future, we can choose to send data to LLM if privacy is set to false.
        return result
358+
359+

pandas_llm/example-chatbot.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def main():
6363
Please note that these names, ages, and donations are randomly generated and do not correspond to real individuals or their donations.
6464
6565
You can ask questions like:
66-
- show me the name of donors
66+
- show me the list of names
6767
- What is the average age of people who donated?
6868
- What is the average donation amount?
6969
- What is the average donation of people older than 30?

pandas_llm/example.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
import os
22
import pandas as pd
3+
4+
import sys
5+
from pathlib import Path
6+
sys.path.append(str(Path(__file__).resolve().parent.parent))
37
from pandas_llm import PandasLLM
48

59
# Data
@@ -17,7 +21,7 @@
1721
('Olivia Jackson', 29, 55)]
1822
df = pd.DataFrame(data, columns=['name', 'age', 'donation'])
1923

20-
conv_df = PandasLLM(data=df, llm_api_key = os.environ.get("OPENAI_API_KEY"))
24+
conv_df = PandasLLM(data=df, llm_api_key = os.environ.get("OPENAI_API_KEY"), verbose=True)
2125
result = conv_df.prompt("What is the average donation of people older than 30 who donated more than $50?")
2226

2327
print(f"Result ({type(result)}):\n {result}")

0 commit comments

Comments
 (0)