import pandas as pd
import datetime
import numpy as np
import openai
import os
import re
import json

# sandbox.py
from RestrictedPython import compile_restricted
from RestrictedPython.Guards import safe_builtins, guarded_iter_unpack_sequence
from RestrictedPython.Eval import default_guarded_getattr, default_guarded_getitem, default_guarded_getiter
import pandas as pd


class Sandbox:
    def __init__(self):
        self._allowed_imports = {}

    def allow_import(self, module_name):
        try:
            module = __import__(module_name)
            self._allowed_imports[module_name] = module
        except ImportError:
            pass

    def execute(self, code, local_vars=None):
        # Avoid a mutable default argument; fall back to a fresh dict per call
        local_vars = local_vars if local_vars is not None else {}

        allowed_builtins = safe_builtins

        # Add __builtins__, __import__, and the allowed imports to the globals
        restricted_globals = {"__builtins__": allowed_builtins}
        restricted_globals.update(self._allowed_imports)

        builtin_mappings = {
            "__import__": __import__,
            "_getattr_": default_guarded_getattr,
            "_getitem_": default_guarded_getitem,
            "_getiter_": default_guarded_getiter,
            "_iter_unpack_sequence_": guarded_iter_unpack_sequence,
            "list": list,
            "set": set,
            "pd": pd,
        }

        # Expose a whitelist of pandas Series methods inside the sandbox
        series_methods = [
            "sum", "mean", "any", "argmax", "argmin", "count", "cumsum", "cumprod", "diff",
            "dropna", "fillna", "head", "idxmax", "idxmin", "last", "max", "min", "notna",
            "prod", "quantile", "rename", "round", "tail", "to_frame", "to_list", "to_numpy",
            "to_string", "unique", "sort_index", "sort_values", "aggregate",
        ]

        builtin_mappings.update({method: getattr(pd.Series, method) for method in series_methods})

        restricted_globals["__builtins__"].update(builtin_mappings)

        # Compile and execute the restricted code
        byte_code = compile_restricted(source=code, filename='<inline>', mode='exec')
        exec(byte_code, restricted_globals, local_vars)

        return local_vars
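

# A minimal sketch of how the Sandbox above could be exercised on its own. The function
# name and the toy DataFrame are illustrative assumptions, not part of the library.
def _sandbox_usage_example():
    sandbox = Sandbox()
    sandbox.allow_import("pandas")

    # The snippet must assign to a name we can read back from the returned local_vars
    snippet = "result = df['price'].sum()"
    local_vars = sandbox.execute(snippet, {"df": pd.DataFrame({"price": [1.5, 2.5, 3.0]})})
    return local_vars.get("result")  # expected: 7.0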


class PandasLLM(pd.DataFrame):
    """
    PandasLLM is a subclass of the Pandas DataFrame class. It is designed to provide a
    wrapper around the OpenAI API.
    """

    code_blocks = [r'```python(.*?)```', r'```(.*?)```']

    llm_default_model = "gpt-3.5-turbo"
    llm_default_temperature = 0.2
    llm_engine = "openai"
    llm_default_params = {"model": llm_default_model,
                          "temperature": llm_default_temperature}
    llm_api_key = None

    prompt_override = False
    custom_prompt = ""
    data_privacy = True
    path = None
    verbose = False
    code_block = ""
    force_sandbox = False

    def __init__(self,
                 data,
                 llm_engine: str = "openai", llm_params=llm_default_params,
                 prompt_override: bool = False,
                 custom_prompt: str = "",
                 path: str = None,
                 verbose: bool = False,
                 data_privacy: bool = True,
                 llm_api_key: str = None,
                 force_sandbox: bool = False,
                 *args, **kwargs):
        """
        Constructor for the PandasLLM class. Any extra positional and keyword arguments
        are forwarded to the parent DataFrame constructor.

        Args:
            data (mandatory): dataset to query. It can be a Pandas DataFrame, a list of lists,
                a list of tuples, a list of dictionaries, a dictionary, a string, or a list.
            llm_engine (str, optional): LLM engine; currently only OpenAI is supported. Defaults to "openai".
            llm_params (dict, optional): LLM engine parameters. Defaults to model="gpt-3.5-turbo" and temperature=0.2.
            prompt_override (bool, optional): if True, the custom prompt is mandatory and becomes the main prompt. Defaults to False.
            custom_prompt (str, optional): if prompt_override is False, the custom prompt is appended to the default pandas_llm prompt. Defaults to "".
            path (str, optional): the path where files containing debug data will be saved. Defaults to None.
            verbose (bool, optional): if True, debugging info is printed. Defaults to False.
            data_privacy (bool, optional): if True, the data content is not sent to OpenAI. Defaults to True.
            llm_api_key (str, optional): the OpenAI API key. Defaults to None, in which case the
                OPENAI_API_KEY environment variable is used.
            force_sandbox (bool, optional): if False and the sandbox fails, execution is retried with eval (less safe). Defaults to False.
        """

        super().__init__(data, *args, **kwargs)

        # Set up the OpenAI API key from the argument or the environment
        self.llm_api_key = llm_api_key or os.environ.get("OPENAI_API_KEY")

        self.llm_engine = llm_engine
        self.llm_params = llm_params or {}
        self.model = self.llm_params.get("model", self.llm_default_model)
        self.temperature = self.llm_params.get("temperature", self.llm_default_temperature)

        self.prompt_override = prompt_override
        self.custom_prompt = custom_prompt

        self.data_privacy = data_privacy
        self.path = path
        self.verbose = verbose
        self.force_sandbox = force_sandbox

    def _buildPromptForRole(self):
        prompt_role = f"""
        I want you to act as a data scientist and Python coder. I want you to code for me.
        I have a dataset of {len(self)} rows and {len(self.columns)} columns.
        Columns and their types are the following:
        """

        for col in self.columns:
            col_type = self.dtypes[col]
            prompt_role += f"{col} ({col_type})\n"

        return prompt_role

    def _buildPromptForProblemSolving(self, request):

        if self.prompt_override:
            return self.custom_prompt

        columns = ""
        for col in self.columns:
            col_type = self.dtypes[col]
            columns += f"{col} ({col_type})\n"

        prompt_problem = f"""
        Given a DataFrame named 'df' of {len(self)} rows and {len(self.columns)} columns,
        its columns are the following:

        {columns}

        I want you to solve the following problem:
        write a Python code snippet that addresses the following request:
        {request}

        While crafting the code, please follow these guidelines:
        1. When comparing or searching for strings, use lower case letters, ignore case sensitivity, and apply a "contains" search.
        2. Ensure that the answer is a single line of code without explanations, comments, or additional details.
        3. If a single-line solution is not possible, multiline solutions or functions are acceptable, but the code must end with an assignment to the variable 'result'.
        4. Assign the resulting code to the variable 'result'.
        5. Avoid importing any libraries other than pandas and numpy.

        """
        if self.custom_prompt is not None and len(self.custom_prompt) > 0:
            prompt_problem += f"""
            Also:
            {self.custom_prompt}
            """

        return prompt_problem

    def _extractPythonCode(self, text: str, regexp: str) -> str:
        # Define the regular expression pattern for the Python code block
        pattern = regexp

        # Search for the pattern in the input text
        match = re.search(pattern, text, re.DOTALL)

        # If a match is found, return the extracted code (without the markers)
        if match:
            return match.group(1).strip()

        # If no match is found, return an empty string
        return ""
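
    # For example, given a reply such as "```python\nresult = df['age'].mean()\n```",
    # the first pattern in `code_blocks` captures "result = df['age'].mean()", while the
    # second, language-agnostic pattern serves as a fallback for replies whose fenced
    # block omits the "python" tag. (The column name 'age' is purely illustrative.)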

    def _print(self, *args, **kwargs):
        if self.verbose:
            print(*args, **kwargs)

    # def _variable_to_string(self, variable):
    #     if variable is None: return None
    #     try:
    #         if isinstance(variable, pd.Series):
    #             # convert to dataframe
    #             variable = variable.to_frame()
    #
    #         if isinstance(variable, pd.DataFrame):
    #             variable = variable.drop_duplicates()
    #             if len(variable) == 0: return None
    #             return str(variable)
    #
    #         elif isinstance(variable, np.ndarray):
    #             if len(variable) == 0: return None
    #             return np.array2string(variable)
    #         else:
    #             # Convert the variable to a string
    #             return str(variable)
    #     except Exception as e:
    #         return str(variable)

    def _save(self, name, value):
        if self.path is None or self.path == "":
            return
        try:
            with open(f"{self.path}/{name}", 'w') as file:
                file.write(value)
        except Exception as e:
            self._print(f"error {e}")
            return

    def _execInSandbox(self, df, generated_code: str):

        # Create a Sandbox instance and allow pandas and numpy to be imported
        sandbox = Sandbox()
        sandbox.allow_import("pandas")
        sandbox.allow_import("numpy")

        # Define the initial code to set up the DataFrame.
        # The string is left unindented so that exec() does not raise an IndentationError.
        initial_code = f"""
import pandas as pd
import datetime
from pandas import Timestamp
import numpy as np

"""

        # Combine the initial code and the generated code
        full_code = initial_code + "\n" + generated_code

        self._save("temp/prompt_code.py", full_code)

        # Execute the combined code in the Sandbox
        sandbox_result = sandbox.execute(full_code, {"df": df})

        # Get the result from the local_vars dictionary
        result = sandbox_result.get("result")
        return result

    def prompt(self, request: str):
        """
        Args:
            request (str): prompt containing the request. It must be expressed as a question or a problem to solve.

        Returns:
            Any: the result or solution of the problem. Typically the result data type is a DataFrame, a Series, or a float.
        """

        # Set up the OpenAI API key
        openai.api_key = self.llm_api_key

        messages = [
            {"role": "system",
             "content": self._buildPromptForRole()},
            {"role": "user",
             "content": self._buildPromptForProblemSolving(request)}
        ]

        # Retry the API call up to three times before giving up
        response = None
        for times in range(0, 3):
            try:
                response = openai.ChatCompletion.create(
                    model=self.model,
                    temperature=self.temperature,
                    messages=messages
                )
                break
            except Exception as e:
                self._print(f"error {e}")
                continue

        if response is None:
            return "Please try later"

        self._save("temp/prompt_cmd.json", json.dumps(messages, indent=4))

        generated_code = response.choices[0].message.content
        if generated_code == "" or generated_code is None:
            self.code_block = ""
            return None

        self.code_block = generated_code

        # Collect candidate snippets: code extracted from fenced blocks first,
        # then the raw reply as a last-resort fallback
        results = []
        for regexp in self.code_blocks:
            cleaned_code = self._extractPythonCode(generated_code, regexp)
            if cleaned_code == "" or cleaned_code is None:
                continue
            results.append(cleaned_code)
        results.append(generated_code)

        if len(results) == 0:
            return None

        result = None
        for cleaned_code in results:

            try:
                result = self._execInSandbox(self, cleaned_code)
            except Exception as e:
                self._print(f"error {e}")
                if not self.force_sandbox:
                    # Fall back to eval (less safe): strip the leading "result ="
                    # and evaluate the remaining expression directly
                    try:
                        expression = re.sub(r"^\s*result\s*=", "", cleaned_code).strip()
                        result = eval(expression, {'df': self, 'pd': pd, 'np': np, 'datetime': datetime, 'result': result})
                    except Exception as e:
                        self._print(f"error {e}")

            if result is not None and str(result) != "":
                break

        if self.data_privacy:
            # non formatted result
            return result

        # currently the privacy option is not needed.
        # in the future, we can choose to send data to the LLM if privacy is set to False

        return result
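

# A minimal usage sketch. Assumptions: the OPENAI_API_KEY environment variable is set, and
# the column names and the question below are made-up placeholders, not part of the library.
if __name__ == "__main__":
    people = PandasLLM(
        data={"name": ["Alice", "Bob", "Carol"], "age": [31, 45, 27]},
        verbose=True,
    )
    answer = people.prompt("What is the average age?")
    print(answer)  # typically a float, a Series, or a DataFrame, depending on the request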